# YoutubeVideoTranslater / gpu_processor.py
import yt_dlp
import sys
import torch
import os
from whisper.transcribe import transcribe
from whisper import load_model
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import scipy
from scipy.io import wavfile
import ffmpeg
import soundfile as sf
from io import BytesIO
import struct
from scipy.io.wavfile import write
from pyannote.audio import Pipeline
import numpy as np
from scipy.signal import resample
import json
from collections import defaultdict
import time
import requests
import subprocess
import hashlib
import re

from datetime import datetime
import pymysql as mysql
import socket
import threading
from langdetect import detect
import openai
from functions import *
import logging

'''os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from bark.generation import (
    generate_text_semantic,
    preload_models,
)
semantic_path = "semantic_output/pytorch_model.bin"  # set to None if you don't want to use finetuned semantic
coarse_path = "coarse_output/pytorch_model.bin"  # set to None if you don't want to use finetuned coarse
fine_path = "fine_output/pytorch_model.bin"  # set to None if you don't want to use finetuned fine

is_half = True
preload_models(
    text_use_gpu=True,
    text_use_small=False,
    text_model_path=semantic_path,
    coarse_use_gpu=True,
    coarse_use_small=False,
    coarse_model_path=coarse_path,
    fine_use_gpu=True,
    fine_use_small=False,
    fine_model_path=fine_path,
    codec_use_gpu=True,
    force_reload=False,
    path="models"
)
from bark import generate_audio, SAMPLE_RATE'''


environment = 'production'  # Default to 'production' if not set on the command line
if len(sys.argv) > 1 and sys.argv[1] is not None:
    environment = sys.argv[1]

home_dir = "/home/cyril/dev/VideoTranslator"
temp_dir = home_dir + '/temp'
downloads_dir = home_dir + '/downloads'
output_dir = home_dir + '/output'
IPs = (
    '162.199.220.174',
    'translatizer.com'
)
BANDWIDTH_PORT = 2212
GPU_PORT = 2213
pw = 'H0ur1!'
host = '127.0.0.1'
if environment == 'production':
    logging.basicConfig(filename='gpu_processor.log', level=logging.INFO)
    BANDWIDTH_PORT = 62212
    GPU_PORT = 62213
    pw = 'Cyr1lH0ur1!'
    host = 'translatizer.com'
    output_dir = home_dir + '/s3-bucket'
else:
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s - %(levelname)s - %(message)s')

# MySQL configuration
db_config = {
    'user': 'appuser',
    'password': pw,
    'host': host,  # or the IP address of your MySQL server
    'database': 'videotranslator',
}


# Initialize models and devices for voice cloning
device = 'cuda'  # or 'cpu'


def get_destination_ip():
    """Return the peer to forward to: each host forwards to the other one."""
    retValue = ''
    if environment == 'debug':
        retValue = '127.0.0.1'
    else:
        if os.environ.get('PUBLIC_IP') == IPs[0]:
            retValue = IPs[1]
        else:
            retValue = IPs[0]
    return retValue
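# Example (hypothetical environment): on the box whose PUBLIC_IP is
# 162.199.220.174, get_destination_ip() returns 'translatizer.com'; on any
# other box it returns the raw IP, so the two hosts forward to each other.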
def merge_audio_files(original_file, translated_file, output_file):
    """
    Merge two mono audio files into one stereo file using FFmpeg.

    Parameters:
    - original_file (str): Path to the first mono audio file.
    - translated_file (str): Path to the second mono audio file.
    - output_file (str): Path to the output stereo audio file.
    """
    input_args = ["-i", original_file, "-i", translated_file]
    filter_args = [
        "-filter_complex",
        "[0:a]volume=0.2,pan=mono|c0=c0[a1]; [1:a]volume=0.8,pan=mono|c0=c0[a2]; [a1][a2]amerge=inputs=2[aout]",
        "-ac", "2",
        "-map", "[aout]"
    ]
    output_args = [output_file]

    cmd = ["ffmpeg"] + input_args + filter_args + output_args

    # ffmpeg.run() from ffmpeg-python expects a stream object, not an argv
    # list, so invoke the ffmpeg binary directly.
    subprocess.run(cmd)

# Example usage:
# merge_audio_files("original_trunc.wav", "translated.wav", "merged.wav")


def createAudioOfSegment(translated_text, processor, model, voice_preset):
    # voice_preset = '/home/cyril/tools/bark-with-voice-clone/bark/assets/prompts/Tucker.npz'
    # voice_preset = "v2/fr_speaker_5"
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    inputs = processor(translated_text, voice_preset=voice_preset)
    inputs.to(device)

    audio_array = model.generate(**inputs)
    # audio_array.to(device)
    audio_array = audio_array.cpu().numpy().squeeze()
    sample_rate = model.generation_config.sample_rate
    return sample_rate, audio_array


def split_elements_for_caption(input_list):
    output_list = []

    for element in input_list:
        words = element['text'].split()
        total_words = len(words)
        total_duration = element['end'] - element['start']

        # Calculate the duration per word
        duration_per_word = total_duration / total_words

        # Split the words into chunks of 10 or fewer
        for i in range(0, total_words, 10):
            chunk_words = words[i:i + 10]
            chunk_start = element['start'] + i * duration_per_word
            chunk_end = chunk_start + len(chunk_words) * duration_per_word

            output_list.append({
                'start': chunk_start,
                'end': chunk_end,
                'text': ' '.join(chunk_words)
            })

    return output_list


def convert_to_transcript(result):
    retValue = []
    for i in range(0, len(result['segments'])):
        current_part = {
            'start': result['segments'][i]['start'],
            'end': result['segments'][i]['end'],
            'speaker_id': None,
            'text': result['segments'][i]['text']
        }
        retValue.append(current_part)
    return retValue
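# Sketch of the shape convert_to_transcript() produces from a Whisper result
# (timings are illustrative, not real output):
# [{'start': 0.0, 'end': 4.2, 'speaker_id': None, 'text': ' Hello everyone.'},
#  {'start': 4.2, 'end': 9.7, 'speaker_id': None, 'text': ' Welcome back.'}]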
def split_text(result):
    words = []
    segment_id = 0
    word_id = 0
    for segment in result['segments']:
        segment_id += 1
        for word in segment['words']:
            word_id += 1
            thisWord = {
                'start': word['start'],
                'end': word['end'],
                'speaker_id': None,
                'word': word['word']
            }
            words.append(thisWord)
    parts = []
    current_part_text = []
    words_so_far = 0
    word_count = 0
    for word in words:
        current_part_text.append(word['word'])
        word_count += 1

        # Close a part once it holds at least 35 words and either reaches 60
        # words or ends on sentence-final punctuation
        if word_count >= 35:
            if word_count >= 60 or word['word'][-1] == '.' or word['word'][-1] == '?':
                current_part_text_str = ''.join(current_part_text)
                current_part = {
                    'start': words[words_so_far]['start'],
                    'end': words[words_so_far + word_count - 1]['end'],
                    'speaker_id': None,
                    'text': current_part_text_str
                }
                parts.append(current_part)
                current_part_text = []
                words_so_far += word_count
                word_count = 0

    # Add any remaining words to the parts list
    if current_part_text:
        current_part_text_str = ''.join(current_part_text)
        current_part = {
            'start': words[words_so_far]['start'],
            'end': words[words_so_far + word_count - 1]['end'],
            'speaker_id': None,
            'text': current_part_text_str,
        }
        parts.append(current_part)
    # retValue = split_elements_for_caption(parts)
    return parts


def translate_string(text, nllb_target_code, model, tokenizer):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    tgt_lang_id = tokenizer.lang_code_to_id[nllb_target_code]
    model_inputs = tokenizer(text, return_tensors='pt', padding='longest')
    model_inputs.to(device)
    gen_tokens = model.generate(**model_inputs, forced_bos_token_id=tgt_lang_id, max_new_tokens=300)
    translated_part = tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)
    return translated_part[0]


def translate_captions(parts, nllb_target_code, model, tokenizer):
    translatedParts = []
    for part in parts:
        translated_text = translate_string(part['text'], nllb_target_code, model, tokenizer)
        current_part = {
            'start': part['start'],
            'end': part['end'],
            'speaker_id': part['speaker_id'],
            'text': translated_text
        }
        translatedParts.append(current_part)
    # translator = pipeline('translation', model=model, tokenizer=tokenizer, src_lang="eng_Latn", tgt_lang='fra_Latn', max_length = 400)
    return translatedParts
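# Usage sketch with the module-level NLLB model (the FLORES-200 code
# 'fra_Latn' matches the commented-out pipeline above; any valid target
# code should work the same way):
# french = translate_string("Good morning.", 'fra_Latn', nllbModel, nllbTokenizer)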
def index_exists(index_name, cursor):
    # Parameterized so the index name is not interpolated into the SQL string
    query = ("SELECT COUNT(1) AS IndexIsThere FROM INFORMATION_SCHEMA.STATISTICS "
             "WHERE table_schema=DATABASE() AND index_name=%s")
    cursor.execute(query, (index_name,))

    # Fetch the result
    result = cursor.fetchone()
    return result[0]


def has_translation_been_revised_already(url_hash, language_code):
    retValue = False
    conn = mysql.connect(**db_config)
    cursor = conn.cursor()
    sql_statement = ("SELECT A.id FROM transcripts A, languages B, videos C "
                     "WHERE A.video_id = C.id AND C.url_hash = %s "
                     "AND A.language_id = B.id AND B.code = %s AND B.revised = 1 LIMIT 1")
    cursor.execute(sql_statement, (url_hash, language_code,))
    cursor.fetchone()
    if cursor.rowcount > 0:
        retValue = True
    return retValue


def get_video_object():
    video = {
        'id': None,
        'url': None,
        'url_hash': None,
        'thumbnail_url': None,
        'language_id': None,
        'width': None,
        'height': None,
        'metadata': []
    }
    return video


def get_metedata_object():
    metadata = {
        'language_id': None,
        'title': None,
        'description': None,
        'tags': None,
    }
    return metadata


def download_video(url):
    # The SHA-256 of the URL doubles as a stable on-disk identifier
    url_hash = hashlib.sha256(url.encode()).hexdigest()
    # Define the output template for the downloaded file
    output_template = os.path.join(downloads_dir, url_hash + '.%(ext)s')

    options = {
        'format': 'bestvideo+bestaudio/best',
        'outtmpl': output_template,
        'writeinfojson': True,
        'quiet': True,
        'nocheckcertificate': True,
    }

    with yt_dlp.YoutubeDL(options) as ydl:
        # Extract video information and download the file
        info_dict = ydl.extract_info(url, download=True)
        fileNameWithPath = ydl.prepare_filename(info_dict)
    video = {
        'id': None,
        'url': url,
        'url_hash': url_hash,
        'thumbnail_url': info_dict.get('thumbnail', ''),
        'language_id': None,
        'metadata': [],
        'width': info_dict['width'],
        'height': info_dict['height']
    }
    metadata = {
        'language_id': None,
        'title': info_dict.get('title', ''),
        'description': info_dict.get('description', ''),
        'tags': ', '.join(info_dict.get('tags', [])),
    }
    video['metadata'].append(metadata)
    outputName = os.path.basename(fileNameWithPath)
    return video, outputName


def get_audio_codec(video_path):
    cmd = [
        'ffprobe',
        '-v', 'quiet',
        '-print_format', 'json',
        '-show_streams',
        video_path
    ]

    result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    output = result.stdout.decode('utf-8')
    data = json.loads(output)

    for stream in data['streams']:
        if stream['codec_type'] == 'audio':
            return stream['codec_name']

    return None


def extractAudioFromVideo(filename, desired_samplerate=24000):
    # Decode the file's audio track to mono WAV bytes at the requested rate
    out, _ = (
        ffmpeg.input(downloads_dir + '/' + filename)
        .output('pipe:', format='wav', ac=1, ar=desired_samplerate)
        .run(capture_stdout=True, capture_stderr=True)
    )
    return out


def normalize_text(t):
    # drawtext chokes on a literal '%', so substitute it
    return t.replace("%", "o/o")


def format_drawtext_filter(captions, video):
    """Generate a drawtext filter string for ffmpeg."""
    filters = []
    font_size = int(video['width'] / 20)
    for caption in captions:
        text = caption['text']
        modified_text = str(text).replace("'", "\u2019")
        modified_text = normalize_text(modified_text)
        # Earlier variant: white text at bottom-left, fontsize 24
        filter_str = (
            f"drawtext=text='{modified_text}':x=(w-text_w)/2:y=h-th-50:fontsize="
            + str(font_size)
            + ":fontcolor=yellow:box=1:boxcolor=black@0.5:"
            f"enable='between(t,{caption['start']},{caption['end']})'"
        )
        filters.append(filter_str)
    return ",".join(filters)
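# For a 1280px-wide video and a caption {'start': 1.0, 'end': 3.5, 'text': 'Hello'},
# format_drawtext_filter() yields (illustrative, wrapped for readability):
#   drawtext=text='Hello':x=(w-text_w)/2:y=h-th-50:fontsize=64:
#       fontcolor=yellow:box=1:boxcolor=black@0.5:enable='between(t,1.0,3.5)'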
def split_captions_for_display(captions):
    result = []

    # Despite the name, this is a character budget per caption line
    number_of_words_per_caption = 50

    def split_text(text, start, speaker_id, end):
        words = text.split()
        sub_items = []
        current_text = ""
        for word in words:
            if len(current_text) + len(word) + 1 > number_of_words_per_caption:
                sub_items.append(current_text.strip())
                current_text = ""
            current_text += word + " "
        if current_text:
            sub_items.append(current_text.strip())

        # Distribute the original duration across the chunks in proportion
        # to their character counts
        duration = end - start
        total_chars = sum(len(item) for item in sub_items)
        current_start = start
        for item in sub_items:
            current_end = current_start + (float(len(item)) / float(total_chars)) * duration
            result_text = format_text(item)
            result.append({
                'start': current_start,
                'end': current_end,
                'speaker_id': speaker_id,
                'text': result_text
            })
            current_start = current_end

    def format_text(text):
        if len(text) <= int(number_of_words_per_caption / 2):
            return text

        # Find the nearest space after the midpoint and break the line there
        for i in range(int(number_of_words_per_caption / 2), len(text)):
            if text[i] == ' ':
                first_part = text[:i]
                second_part = text[i + 1:]
                break
        else:
            first_part = text
            second_part = ""

        spaces_needed = (len(first_part) - len(second_part)) // 2
        centered_second_part = ' ' * spaces_needed + second_part
        return first_part + '\n' + centered_second_part

    for caption in captions:
        split_text(caption['text'], caption['start'], caption['speaker_id'], caption['end'])

    return result


def get_video_bitrate(filename):
    try:
        # Run FFmpeg to get video details; stream info is printed on stderr
        result = subprocess.run(['ffmpeg', '-i', filename], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

        # Extract the overall bitrate from the FFmpeg output using a regex
        overall_bitrate_match = re.search(r'bitrate: (\d+ kb/s)', result.stderr)
        if overall_bitrate_match:
            overall_bitrate = int(overall_bitrate_match.group(1).split(' ')[0])
        else:
            overall_bitrate = None

        return overall_bitrate
    except Exception as e:
        print(f"Error: {e}")
        return None


def split_in_smaller_chunks(subtitles, chunk_duration):
    retValue = []

    temp_subtitles = []
    last_timestamp = 0
    for i in range(0, len(subtitles)):
        if subtitles[i]['end'] > last_timestamp + chunk_duration:
            retValue.append(temp_subtitles)
            temp_subtitles = []
            last_timestamp = subtitles[i]['end']
        temp_subtitles.append(subtitles[i])
    retValue.append(temp_subtitles)
    return retValue


def add_subtitles_to_video(subtitles, language, video):
    filename = get_filename_from_hash(video['url_hash'])
    # bitrate = get_video_bitrate(filename)

    subtitles2 = split_captions_for_display(subtitles)
    # Render in 600-second chunks so each drawtext filter stays manageable
    subtitles_list = split_in_smaller_chunks(subtitles2, 600)
    part_start_time = 0
    for i in range(0, len(subtitles_list)):
        if len(subtitles_list) == 1:
            output_video = output_dir + '/' + video['url_hash'] + "_" + language + ".mp4"
        else:
            output_video = temp_dir + '/' + video['url_hash'] + "_" + language + "_" + str(i) + ".mp4"
        if os.path.exists(output_video):
            os.remove(output_video)
        drawtext_filter = format_drawtext_filter(subtitles_list[i], video)

        video_duration = subtitles_list[i][len(subtitles_list[i]) - 1]['end'] - part_start_time
        '''cmd = [
            'ffmpeg',  # specify the exact path to your FFmpeg binary
            '-hwaccel', 'cuda',
            '-hwaccel_device', '0',
            '-i', downloads_dir + '/' + filename,
            '-ss', str(part_start_time),
            '-t', str(video_duration),
            '-vf', drawtext_filter,
            '-c:v', 'h264_nvenc',
            output_video
        ]'''

        cmd = [
            'ffmpeg',  # specify the exact path to your FFmpeg binary
            '-i', downloads_dir + '/' + filename,
            '-ss', str(part_start_time),
            '-t', str(video_duration),
            '-vf', drawtext_filter,
            '-c:v', 'h264_nvenc',
            output_video
        ]

        result = subprocess.run(cmd)
        part_start_time += video_duration
    if len(subtitles_list) > 1:
        str1 = ''
        str2 = ""
        for i in range(0, len(subtitles_list)):
            str1 += " -i " + temp_dir + '/' + video['url_hash'] + "_" + language + "_" + str(i) + ".mp4 "
            str2 += "[" + str(i) + ":v:0][" + str(i) + ":a:0]"
        str2 += "concat=n=" + str(len(subtitles_list)) + ":v=1:a=1[outv][outa]"
        print(str1)
        print(str2)

        cmd = ("ffmpeg -hwaccel cuda -hwaccel_device 0 -y " + str1
               + " -filter_complex " + str2
               + " -map [outv] -map [outa] -c:v h264_nvenc "
               + output_dir + '/' + video['url_hash'] + "_" + language + ".mp4")
        os.system(cmd)
        for i in range(0, len(subtitles_list)):
            fileToDelete = temp_dir + '/' + video['url_hash'] + "_" + language + "_" + str(i) + ".mp4"
            try:
                os.remove(fileToDelete)
            except Exception as e:
                print(e)


def overlay_captions2(input_video, subtitles, language):
    # Derive the output name from the input path (the original referenced an
    # undefined 'filename' here)
    parts = input_video.split("/")
    fileNameWithExtension = parts[len(parts) - 1]
    parts = fileNameWithExtension.split(".")
    fileName_without_extension = parts[0]
    directory = input_video[:input_video.index(fileNameWithExtension)]
    output_video = directory + fileName_without_extension + "_" + language + ".mp4"
    if os.path.exists(directory + "temp.srt"):
        os.remove(directory + "temp.srt")
    if os.path.exists(output_video):
        os.remove(output_video)
    # Generate SRT content
    srt_content = generate_srt(subtitles)
    with open(directory + "temp.srt", "w", encoding="utf-8") as f:
        f.write(srt_content)

    # Use ffmpeg to overlay the subtitles on the video
    video = ffmpeg.input(input_video)
    audio = video.audio
    ffmpeg.concat(
        video.filter('subtitles', directory + 'temp.srt', force_style="OutlineColour=&H40000000,BorderStyle=3"),
        audio, v=1, a=1
    ).output(output_video).run(quiet=True, overwrite_output=True)

    # Clean up the temporary SRT file
    os.remove(directory + "temp.srt")


def generate_srt(subtitles):
    """Generate SRT content from a list of (start, end, text) subtitles."""
    srt_content = ""
    for i, (start, end, text) in enumerate(subtitles, 1):
        srt_content += f"{i}\n"
        srt_content += f"{format_timestamp(start)} --> {format_timestamp(end)}\n"
        srt_content += f"{text}\n\n"
    return srt_content
def get_wav_duration(wav_bytes, sample_rate):
    # Subtract the 44-byte RIFF header to get the size of the audio data
    audio_data_size = len(wav_bytes) - 44
    # Two bytes per sample (16-bit mono)
    num_samples = audio_data_size / 2
    # Calculate the duration in seconds
    duration = num_samples / sample_rate
    return duration


def extract_subportion(wav_bytes, start_sec, end_sec, sample_rate):
    # Constants for WAV format
    HEADER_SIZE = 44
    BYTES_PER_SAMPLE = 2  # Assuming 16-bit samples

    # Convert start and end seconds to sample indices
    start_sample = int(start_sec * sample_rate)
    end_sample = int(end_sec * sample_rate)

    # Calculate start and end byte positions in the audio data
    start_byte = HEADER_SIZE + start_sample * BYTES_PER_SAMPLE
    end_byte = HEADER_SIZE + end_sample * BYTES_PER_SAMPLE

    # Extract the audio data for the sub-portion
    sub_audio_data = wav_bytes[start_byte:end_byte]

    # Create a new WAV header for the sub-portion
    num_channels = 1  # Assuming mono audio
    byte_rate = sample_rate * num_channels * BYTES_PER_SAMPLE
    block_align = num_channels * BYTES_PER_SAMPLE
    bits_per_sample = 16
    sub_data_size = len(sub_audio_data)
    sub_chunk_size = 16
    chunk_size = 36 + sub_data_size

    header = struct.pack(
        '<4sI4s4sIHHIIHH4sI',
        b'RIFF', chunk_size, b'WAVE', b'fmt ', sub_chunk_size, 1,
        num_channels, sample_rate, byte_rate, block_align,
        bits_per_sample, b'data', sub_data_size
    )

    # Combine the new header and the sub audio data
    sub_wav_bytes = header + sub_audio_data

    return sub_wav_bytes


def bytes_to_np_array(audio_bytes):
    audio_data, _ = sf.read(BytesIO(audio_bytes))
    return audio_data.astype(np.float32)  # Convert to float32


def save_wav_to_disk(wav_array, sampling_rate, output_file):
    wav_int16 = (wav_array * np.iinfo(np.int16).max).astype(np.int16)
    write(output_file, sampling_rate, wav_int16)


def write_wav_files(wav_arrays, sampling_rate, output_dir, prefix="wav"):
    file_paths = []
    for idx, wav_array in enumerate(wav_arrays):
        # Convert the wav array to int16 format
        wav_int16 = (wav_array * np.iinfo(np.int16).max).astype(np.int16)

        # Define the output file path
        file_path = f"{output_dir}/{prefix}_{idx + 1}.wav"
        file_paths.append(file_path)

        # Write the wav array to a WAV file
        write(file_path, sampling_rate, wav_int16)

    return file_paths


def WavFileToTensorFormat(filename):
    # Read the WAV file
    sample_rate, audio_data = wavfile.read(filename)

    # Ensure the audio data is in np.int16 format
    if audio_data.dtype != np.int16:
        # Normalize audio data to the range [-1, 1]
        audio_data_normalized = np.interp(audio_data, (audio_data.min(), audio_data.max()), (-1, 1))

        # Convert normalized data to np.int16 format
        audio_data = np.int16(audio_data_normalized * 32767)

    # Convert to a PyTorch tensor
    audio_tensor = torch.tensor(audio_data).float()

    return audio_tensor


def transcribe_with_word_timestamps(model, audio, **kwargs):
    # Ensure word_timestamps is set to True
    kwargs["word_timestamps"] = True
    return transcribe(model, audio, **kwargs)


def Transcribe(audio_data):
    # Load the Whisper model
    model_name = "large"  # Replace with the appropriate model name
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    whisperModel = load_model(model_name)
    whisperModel.to(device)
    audio_tensor = torch.tensor(audio_data).to(device)
    result = transcribe_with_word_timestamps(whisperModel, audio_tensor)
    return result


def resample_wav(data, old_samplerate, new_samplerate):
    # Calculate the new length of the data
    new_length = int(len(data) * new_samplerate / old_samplerate)

    # Resample the data
    resampled_data = resample(data, new_length)

    return resampled_data
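# Example: downsampling one second of 24 kHz audio (the extractAudioFromVideo
# default) to the 16 kHz used elsewhere in this file; values illustrative:
# audio_16k = resample_wav(audio_24k, 24000, 16000)  # 24000 samples -> 16000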
from pyannote.audio.pipelines import SpeakerDiarization


def segment_audio_by_speakers(wav_array, sr, SampleRate=16000):
    # Load the pre-trained model
    # pipeline = SpeakerDiarization(segmentation="pyannote/segmentation", embedding="pyannote/embedding")
    # device = 0
    # Ensure a GPU is available
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        raise ValueError("GPU is not available.")
    pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization@2.1",
                                        use_auth_token="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")

    pipeline = pipeline.to(device)
    waveform_tensor = torch.tensor(wav_array).unsqueeze(0)  # Add channel dimension

    audio_data = {
        "waveform": waveform_tensor,
        "sample_rate": sr
    }
    # Apply the diarization pipeline
    diarization = pipeline(audio_data)
    print(diarization.labels())
    # Count the number of unique speakers
    num_speakers = len(diarization.labels())
    # Extract segments for each speaker
    segments = {}
    i = 0
    for turn, _, speaker in diarization.itertracks(yield_label=True):
        start_time, end_time = turn.start, turn.end
        start_sample, end_sample = int(start_time * SampleRate), int(end_time * SampleRate)
        segment = wav_array[start_sample:end_sample]
        segments[i] = {'speaker': speaker, 'start': start_time, 'end': end_time, 'segment': segment}
        i += 1

        # segments.append((speaker, start_time, end_time, segment))

    return segments


def get_segment_from_timestamp(segments, timestamp_in_s):
    i = 0
    retValue = segments[0]
    while i < len(segments):
        if segments[i]['start'] > timestamp_in_s:
            break
        if segments[i]['end'] > timestamp_in_s:
            retValue = segments[i]
        i += 1
    return retValue


def format_timestamp(seconds_str):
    """Convert a string or number of seconds to 'HH:MM:SS,mmm' format."""
    total_seconds = float(seconds_str)
    hours = int(total_seconds // 3600)
    total_seconds %= 3600
    minutes = int(total_seconds // 60)
    seconds = int(total_seconds % 60)
    milliseconds = int((total_seconds - int(total_seconds)) * 1000)
    return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}"
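# Example: format_timestamp(3661.5) returns "01:01:01,500".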
def createCaptions(translatedFullText, result, font_size):
    # Unfinished helper kept for reference: the original looped forever and
    # returned nothing. This sketch packs words into one caption tuple and
    # leaves the start/end times at 0, as in the original draft.
    captions = []
    max_chars_per_line = font_size * 120
    parts = str.split(translatedFullText, ' ')
    i = 0
    tempCaption = ''
    tempsStart = 0
    tempEnd = 0
    while i < len(parts) and len(tempCaption) < max_chars_per_line:
        tempCaption += parts[i] + ' '
        i += 1

    caption = (tempsStart, tempEnd, tempCaption)
    captions.append(caption)
    return captions


from transformers import T5Tokenizer, T5ForConditionalGeneration


def transform_sentence_in_french(sentence):
    # Load the pre-trained T5 model and tokenizer
    model = T5ForConditionalGeneration.from_pretrained("t5-large")
    tokenizer = T5Tokenizer.from_pretrained("t5-large")

    # Use the modified prompt
    input_text = "correct the following French sentence: " + sentence
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids

    # Generate output
    output = model.generate(input_ids)
    transformed_sentence = tokenizer.decode(output[0], skip_special_tokens=True)

    # Refine post-processing: remove only the prefix of the output that matches the input
    if transformed_sentence.startswith(sentence):
        transformed_sentence = transformed_sentence[len(sentence):].strip()

    return transformed_sentence


def format_transcript_for_chatgpt(segments):
    # Check unique speaker IDs in the segments
    unique_speaker_ids = set([segment['speaker_id'] for segment in segments])

    formatted_text = []
    prev_speaker_id = None

    for segment in segments:
        current_text = segment['text']
        current_speaker_id = segment['speaker_id']
        start = round(segment['start'], 2)
        end = round(segment['end'], 2)
        if prev_speaker_id is None or prev_speaker_id != current_speaker_id:
            formatted_text.append(f"\n[{start},{end}] /* {current_speaker_id} */: {current_text}")
        else:
            formatted_text.append(f"[{start},{end}] : {current_text}")

        prev_speaker_id = current_speaker_id

    return "".join(formatted_text)


def format_transcript_for_google(segments):
    # Identical to format_transcript_for_chatgpt except that start/end are
    # coerced to float first (DB rows may hold them as strings)
    unique_speaker_ids = set([segment['speaker_id'] for segment in segments])

    formatted_text = []
    prev_speaker_id = None

    for segment in segments:
        current_text = segment['text']
        current_speaker_id = segment['speaker_id']
        start = round(float(segment['start']), 2)
        end = round(float(segment['end']), 2)
        if prev_speaker_id is None or prev_speaker_id != current_speaker_id:
            formatted_text.append(f"\n[{start},{end}] /* {current_speaker_id} */: {current_text}")
        else:
            formatted_text.append(f"[{start},{end}] : {current_text}")

        prev_speaker_id = current_speaker_id

    return "".join(formatted_text)


def format_transcript_for_google_old(segments):
    # Check unique speaker IDs in the segments
    unique_speaker_ids = set([segment['speaker_id'] for segment in segments])

    # If there's only one speaker, concatenate the 'text' fields
    if len(unique_speaker_ids) == 1:
        return "[0] ".join([segment['text'] for segment in segments])

    # If there are multiple speakers, format the text accordingly
    else:
        formatted_text = []
        prev_speaker_id = None

        for segment in segments:
            current_text = segment['text']
            current_speaker_id = segment['speaker_id']

            # If the speaker ID changed from the previous segment or it's the start
            if current_speaker_id != prev_speaker_id:
                if formatted_text:
                    formatted_text.append("\n")  # New line between different speakers
                formatted_text.append(f"[{current_speaker_id}]: {current_text}")
            else:
                formatted_text.append(f" {current_text}")

            prev_speaker_id = current_speaker_id

        return "".join(formatted_text)
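# The bracketed format fed to Google/ChatGPT looks like this (times in
# seconds, speaker id in a /* */ comment when it changes; values illustrative):
#
# [0.0,4.2] /* 0 */: Hello everyone.
# [4.2,6.1] : Welcome back.
# [6.1,9.8] /* 1 */: Thanks for having me.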
# model = nllb-200-1.3B
# nllb-200-distilled-600M
nllbModelName = "facebook/nllb-200-distilled-600M"
nllbModel = AutoModelForSeq2SeqLM.from_pretrained(nllbModelName)
nllbModel.to(device)
nllbTokenizer = AutoTokenizer.from_pretrained(nllbModelName)


def query_chatgpt(myText):
    retValue = ''
    # max_tokens = 3 * len(myText.split())
    openai.api_key = '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'

    response = openai.ChatCompletion.create(
        model="gpt-4",  # Replace with the correct chat model name
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": myText},
        ]
    )

    correct_translation = response.choices[0].message['content']

    return correct_translation


def break_text_into_captions(text, captions):
    result = []
    parts = text.split('|')

    for i in range(0, len(parts)):
        result.append({
            'start': captions[i]['start'],
            'end': captions[i]['end'],
            'text': parts[i].strip(),
            'speaker_id': captions[i]['speaker_id']
        })

    return result


def set_unknown_speaker_ids(captions, default_speaker_id):
    for i in range(0, len(captions)):
        if captions[i]['speaker_id'] is None:
            captions[i]['speaker_id'] = default_speaker_id


def set_speaker_in_captions(captions, segments):
    max_speaker_id = 0
    for i in range(0, len(captions)):
        # time_at_middle_part = int(captions[i]['start']) + (int(captions[i]['end']) - int(captions[i]['start'])) / 2
        start = float(captions[i]['start'])
        end = float(captions[i]['end'])
        speaker_id = get_speaker_id_at_timestamp(segments, start, end)
        if speaker_id is not None:
            if speaker_id > max_speaker_id:
                max_speaker_id = speaker_id
        captions[i]['speaker_id'] = speaker_id
    set_unknown_speaker_ids(captions, max_speaker_id)


def get_speaker_id_at_timestamp(segments, start, end):
    # Total duration each speaker is active within the [start, end] interval
    value_durations = defaultdict(int)

    for i in range(0, len(segments)):
        # Check if the entry overlaps with the interval [start, end]
        overlap_start = max(segments[i]['start'], start)
        overlap_end = min(segments[i]['end'], end)

        if overlap_start < overlap_end:
            duration = overlap_end - overlap_start
            value_durations[segments[i]['speaker']] += duration

    # Find the speaker with the maximum duration within the interval
    max_value = None
    max_duration = 0
    for value, duration in value_durations.items():
        if duration > max_duration:
            max_duration = duration
            max_value = value
    speaker_id = None
    if max_value is not None:
        # pyannote labels look like 'SPEAKER_00'; keep the numeric part
        strsplit = str.split(max_value, '_')
        speaker_id = int(strsplit[1])
    return speaker_id


def translate_with_nllb(transcript, video, target_language_code):
    target_language_id, target_nllb_code, target_language_name = fetch_language_values(db_config, target_language_code)
    captions = translate_captions(transcript, target_nllb_code, nllbModel, nllbTokenizer)
    # drop_local_model_request_in_queue(transcript, video, target_language_code)
    title = translate_string(str(video['metadata'][0]['title']), target_nllb_code, nllbModel, nllbTokenizer)
    description = translate_string(str(video['metadata'][0]['description']), target_nllb_code, nllbModel,
                                   nllbTokenizer)
    tags = translate_string(str(video['metadata'][0]['tags']), target_nllb_code, nllbModel, nllbTokenizer)
    metadata = {
        'language_id': target_language_id,
        'title': title,
        'description': description,
        'tags': tags
    }
    video['metadata'].append(metadata)

    save_to_database(db_config, video)

    insert_captions_in_db(db_config, video, captions, target_language_code, nllbModelName)

    return captions
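# Usage sketch (hypothetical language code; fetch_language_values is expected
# to map it to its NLLB/FLORES-200 code):
# french_captions = translate_with_nllb(transcript, video, 'fr')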
def enhance_with_chatgpt(transcript, video, target_language_code):
    target_language_id, target_nllb_code, target_language_name = fetch_language_values(db_config, target_language_code)
    source_text = format_transcript_for_google(transcript)
    chunks = break_text(source_text)
    chatgpt_output = ""
    for i in range(0, len(chunks)):
        chatgpt_text = ("This is a translation in " + target_language_name + ". Make it make sense. "
                        "Just provide the output without comments: '" + chunks[i] + "'")
        chatgpt_output += query_chatgpt(chatgpt_text)

    good_segments = split_by_speaker(chatgpt_output)
    if get_transcripts_total_textLength(good_segments) < 1.2 * len(chatgpt_output):
        good_segments = split_by_sentence(chatgpt_output)
    good_segments[0]['start'] = transcript[0]['start']
    # Align the last segment's end with the end of the original transcript
    # (the original indexed transcript by len(good_segments), an off-by-array bug)
    good_segments[len(good_segments) - 1]['end'] = transcript[len(transcript) - 1]['end']
    match_translations(transcript, good_segments)
    distribute_times(good_segments)
    insert_captions_in_db(db_config, video, good_segments, target_language_code, 'chatgpt')
    return good_segments


def translate_with_google(transcript, video, target_language_code):
    bodyText = format_transcript_for_google(transcript)
    fullText = ""
    if video['metadata'][0]['title'] is not None:
        fullText = video['metadata'][0]['title'] + '\n\n' + bodyText
    else:
        fullText = bodyText
    sample = transcript[0]['text'] + " " + transcript[1]['text']
    # original_language_code = query_google_detect_language(sample)
    language_id = video['language_id']
    original_language_code, nllb_code, name = fetch_language_values_from_id(db_config, language_id)
    target_language_id, target_nllb_code, target_name = fetch_language_values(db_config, target_language_code)
    translation_text = query_google_translate(original_language_code, target_language_code, fullText)
    video_title, body = split_text_at_number(translation_text)
    description = translate_string(str(video['metadata'][0]['description']), target_nllb_code, nllbModel,
                                   nllbTokenizer)
    tags = translate_string(str(video['metadata'][0]['tags']), target_nllb_code, nllbModel, nllbTokenizer)
    metadata_for_target_language_exists = False
    for i in range(0, len(video['metadata'])):
        if video['metadata'][i]['language_id'] == target_language_id:
            video['metadata'][i]['title'] = video_title
            video['metadata'][i]['description'] = description
            video['metadata'][i]['tags'] = tags
            metadata_for_target_language_exists = True
    if metadata_for_target_language_exists == False:
        metadata = {
            'language_id': target_language_id,
            'title': video_title,
            'description': description,
            'tags': tags
        }
        video['metadata'].append(metadata)
    good_segments = split_by_speaker(body)

    return good_segments


def void():
    return None
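# process_audio() below is the main worker routine. Rough flow, as implemented:
#   1. decode the incoming WAV bytes and load the video/queue/language rows
#   2. transcribe with Whisper (plus pyannote diarization) unless a stored
#      'whisper' transcript already exists
#   3. for YouTube sources, translate via Google and backfill missing segments
#      with NLLB; otherwise translate with NLLB directly
#   4. store the captions and mark the queue entry as processed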
def process_audio(queue_id, video_id, original_audio_array):
    SampleRate = 16000
    clipDuration = get_wav_duration(original_audio_array, SampleRate)

    wav_header_length = 44
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    audio_data = bytes_to_np_array(original_audio_array)
    speakerDiarization = True

    # wav_int16 = (audio_data * np.iinfo(np.int16).max).astype(np.int16)
    # write(output_dir + '/originalWav.wav', SampleRate, wav_int16)

    # Ensure audio_data is a 1D array
    if len(audio_data.shape) > 1:
        audio_data = audio_data[:, 0]

    print_with_current_milli_time(logging, "Transcribing...")
    transcript = []
    video = fetch_video_in_db_from_id(db_config, video_id)
    url, target_language_code = fetch_queue_in_db_from_id(db_config, queue_id)
    target_language_id, target_nllb_code, target_language_name = fetch_language_values(db_config, target_language_code)
    original_language_id = video['language_id']
    original_language_code, original_nllb_code, original_language_name = fetch_language_values_from_id(db_config,
                                                                                                       original_language_id)
    transcript = get_transcript_from_db(db_config, video, model_name='whisper')
    if len(transcript) == 0:
        print_with_current_milli_time(logging, "Transcribing queue_id=" + str(queue_id))
        result = Transcribe(audio_data)
        original_language_code = result['language']
        original_language_id, original_nllb_code, original_language_name = fetch_language_values(db_config, original_language_code)
        video['language_id'] = original_language_id
        update_video_to_database(db_config, video)
        if speakerDiarization == True:
            print_with_current_milli_time(logging, "Speaker diarization for queue_id=" + str(queue_id))
            segments = segment_audio_by_speakers(audio_data, SampleRate)
            transcript = convert_to_transcript(result)
            set_speaker_in_captions(transcript, segments)
            insert_captions_in_db(db_config, video, transcript, original_language_code, 'whisper')
        print(result['text'])
    else:
        original_language_id = video['language_id']
        original_language_code, original_nllb_code, original_language_name = fetch_language_values_from_id(db_config,
                                                                                                           original_language_id)
    thread1 = None
    thread2 = None
    transcript = group_by_sentence(transcript)

    if "youtube" in video['url'] or "youtu.be" in video['url']:
        try:
            translated_segments = get_transcript_from_db(db_config, video, target_language_code, model_name='google')
            if translated_segments is None or len(translated_segments) == 0:
                print_with_current_milli_time(logging, "Google translating for queue_id=" + str(queue_id))
                translated_segments = translate_with_google(transcript, video, target_language_code)
                update_video_to_database(db_config, video)
                # Backfill any source segment the Google pass did not return,
                # translating it individually with NLLB
                for i in range(0, len(transcript)):
                    found = False
                    j = 0
                    while found == False:
                        testSegment = translated_segments[j]
                        if float(transcript[i]['start']) == float(testSegment['start']):
                            found = True
                        elif float(testSegment['start']) > float(transcript[i]['start']) and found == False:
                            break
                        else:
                            j += 1
                    if found == False:
                        translationOfString = translate_string(transcript[i]['text'], target_nllb_code,
                                                               nllbModel,
                                                               nllbTokenizer)
                        missing_segment = {
                            'speaker_id': transcript[i]['speaker_id'],
                            'start': transcript[i]['start'],
                            'end': transcript[i]['end'],
                            'text': translationOfString
                        }
                        translated_segments.insert(j, missing_segment)

                insert_captions_in_db(db_config, video, translated_segments, target_language_code, 'google')
                '''if float(translated_segments[len(translated_segments) - 1]['end']) < 600:
                    good_segments = get_transcript_from_db(db_config, video, target_language_code, model_name='chatgpt')
                    if good_segments is None or len(good_segments) == 0:
                        good_segments = enhance_with_chatgpt(translated_segments, video, target_language_code)'''
        except Exception as e:
            # Fall back to NLLB if the Google pass fails
            print(f"Error: {e}")
            captions = get_transcript_from_db(db_config, video, target_language_code, nllbModelName)
            if captions is None or len(captions) == 0:
                print_with_current_milli_time(logging, "Translating with NLLB for queue_id=" + str(queue_id))
                translate_with_nllb(transcript, video, target_language_code)

    else:
        captions = get_transcript_from_db(db_config, video, target_language_code, nllbModelName)
        if captions is None or len(captions) == 0:
            print_with_current_milli_time(logging, "Translating with NLLB for queue_id=" + str(queue_id))
            translate_with_nllb(transcript, video, target_language_code)
        '''if len(captions) == 0:
            captions = translate_with_local_model(transcript, video, target_language_code)
            # drop_local_model_request_in_queue(transcript, video, target_language_code)'''
    revised_translation_str = ""

    '''annotate_speaker_switch(captions, good_segments)
    match_translations(captions, good_segments)
    reset_inconsistent_timestamps(good_segments)
    # If the start of the first item is None, set it to 0
    if good_segments[0]['start'] is None:
        good_segments[0]['start'] = captions[0]['start']

    if good_segments[len(good_segments) - 1]['end'] is None:
        good_segments[len(good_segments) - 1]['end'] = captions[len(captions) - 1]['end']
    good_segments4 = distribute_times(good_segments)'''

    total_segment = ''
    silence = np.zeros(int(0.25 * 24000))  # quarter second of silence

    wav_arrays = []
    sampling_rates = []

    # voice_path = '/home/cyril/tools/bark-with-voice-clone/bark/assets/prompts/' + voice_name + '.npz'

    pieces = []

    # result = assign_timestamps(captions, corrected_transcript[0]['text'])
    '''subtitles2 = split_captions_for_display(captions)
    subtitles_list = split_in_smaller_chunks(subtitles2, 600)
    drawtext_filter = format_drawtext_filter(subtitles_list[0], video)
    print(drawtext_filter)'''

    set_content_processed_in_db(queue_id)
    print_with_current_milli_time(logging, "queue_id=" + str(queue_id) + " fully processed")
    torch.cuda.empty_cache()

    '''hasTranslationBeenRevised = has_translation_been_revised_already(video['url_hash'], target_language_code)
    if useChatGPT == True and hasTranslationBeenRevised == False:
        chatGPTQuery = 'Here is a conversation between multiple speakers. It can be hard to understand at times. Try to correct this exchange in a way that is easy to translate: "' + chatGPTText + '"'
        fullText = correct_translation_with_chatgpt(chatGPTQuery)
        print(fullText)
        # createCaptions(translatedFullText, result, 24)
        revisedCaptions = break_text_into_captions(fullText, transcript)
        if speakerDiarization:
            set_speaker_in_captions(revisedCaptions, segments)
        insert_captions_in_db(video['url_hash'], revisedCaptions, target_language_code, 1)
        add_subtitles_to_video(filename, revisedCaptions, target_language_code, video)
    else:'''

    # overlay_captions(filename, captions, language_code)

    # translated_audio_array = np.concatenate(pieces)
    # audio_array1_ndarray = np.frombuffer(original_audio_array, dtype=np.int16)
    '''if len(audio_array1_ndarray) > len(translated_audio_array):
        translated_audio_array = np.pad(translated_audio_array, (0, len(audio_array1_ndarray) - len(translated_audio_array)))
    elif len(audio_array1_ndarray) < len(translated_audio_array):
        audio_array1_ndarray = np.pad(audio_array1_ndarray, (0, len(translated_audio_array) - len(audio_array1_ndarray)))

    # Mix the audio arrays with the specified volume ratios
    interleaved_mono_audio = np.vstack((audio_array1_ndarray, translated_audio_array)).T.ravel()'''


import queue


def get_transcript_temp(db_config, video):
    caption = None
    conn = mysql.connect(**db_config)
    cursor = conn.cursor()
    sql_text = "select start, end, text from transcripts_temp where video_id = %s"
    cursor.execute(sql_text, (int(video['id']),))
    row = cursor.fetchone()
    if row is not None:
        caption = {
            'start': float(row[0]),
            'end': float(row[1]),
            'speaker_id': None,
            'text': row[2],
        }
    return caption


# Initialize Translation client

def split_text_at_number(text):
    # Split at the first '[' so the translated title (everything before the
    # first timestamped segment) can be separated from the body
    match = re.search(r'\[', text)
    if match:
        before = text[:match.start()]
        after = text[match.start():]
        return (before, after)
    else:
        return (text, "")


def break_text(text):
    words = text.split()
    chunks = []
    start = 0

    while start < len(words):
        end = start + 3000
        if end >= len(words):
            chunks.append(' '.join(words[start:]))
            break

        # First, look for numbers in brackets after the 3000th word up to the 3500th word
        found_bracket = False
        for i in range(end, min(end + 500, len(words))):
            if re.match(r'\[\d+\]', words[i]):
                end = i
                found_bracket = True
                break

        # If we didn't find a number in brackets by the 3500th word, look for the end of a sentence
        if not found_bracket:
            for i in range(end + 500, len(words)):
                if '.' in words[i]:
                    end = i + 1
                    break

        chunks.append(' '.join(words[start:end]))
        start = end

    return chunks
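# break_text() appears intended to keep each translation request to a
# manageable size: chunks of roughly 3000 words, cut preferably at a bracketed
# marker like "[12]" within the next 500 words, otherwise at the next sentence
# end. E.g. a 7000-word transcript becomes roughly 3000/3000/1000-word chunks.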
def query_google_detect_language(source_text):
    """Detect the language of source_text with the Google Translation API, using gcloud authentication."""
    project_id = "ambient-inquiry-264521"
    location = "global"
    # Get the access token using gcloud
    access_token = subprocess.getoutput("/home/cyril/google-cloud-sdk/bin/gcloud auth print-access-token")

    endpoint = "https://translation.googleapis.com/language/translate/v2/detect"

    headers = {
        "Authorization": f"Bearer {access_token}",
        "x-goog-user-project": project_id,
        "Content-Type": "application/json; charset=utf-8"
    }

    data = {
        "q": source_text
    }

    response = requests.post(endpoint, headers=headers, data=json.dumps(data))
    response_json = response.json()

    # Extract and return the detected language
    return response_json["data"]["detections"][0][0]["language"]


def query_google_translate(source_language, target_language, source_text):
    """Translate text using the Google Cloud Translation API v3 with gcloud authentication."""
    retValue = ""
    project_id = "ambient-inquiry-264521"
    location = "global"
    # Get the access token using gcloud
    access_token = subprocess.getoutput("/home/cyril/google-cloud-sdk/bin/gcloud auth print-access-token")
    endpoint = f"https://translation.googleapis.com/v3/projects/{project_id}/locations/{location}:translateText"

    headers = {
        "Authorization": f"Bearer {access_token}",
        "x-goog-user-project": project_id,
        "Content-Type": "application/json; charset=utf-8"
    }
    chunks = break_text(source_text)

    for i in range(0, len(chunks)):
        body = {
            "source_language_code": source_language,
            "target_language_code": target_language,
            "contents": chunks[i],
            "mime_type": "text/plain",
            "transliteration_config": {
                "enable_transliteration": False
            }
        }

        response = requests.post(endpoint, headers=headers, data=json.dumps(body))
        response_json = response.json()
        retValue += response_json["translations"][0]["translatedText"] + " "

    # Return the concatenated translated chunks
    return retValue


def insert_into_temp(db_config, video, caption):
    conn = mysql.connect(**db_config)
    cursor = conn.cursor()

    start = caption['start']
    end = caption['end']
    text = caption['text']
    cursor.execute(
        "INSERT IGNORE INTO transcripts_temp (video_id, start, end, text) VALUES (%s, %s, %s, %s)",
        (video['id'], start, end, text))

    conn.commit()
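# The v3 translateText response is parsed minimally above; the only shape
# relied on is (illustrative):
# {"translations": [{"translatedText": "Bonjour tout le monde."}]}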
def set_content_processed_in_db(queue_id):
    conn = mysql.connect(**db_config)
    cursor = conn.cursor()
    now = datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
    cursor.execute("UPDATE queue SET time_content_processed = %s WHERE id = %s", (now, queue_id))
    conn.commit()


# Configuration
HOST = '0.0.0.0'
HEADER_SIZE = 18


def receive_data(client_socket):
    # First receive the fixed-size header: "TYPE,queue_id,video_id" padded to HEADER_SIZE bytes
    header = client_socket.recv(HEADER_SIZE).decode('utf-8')
    data_type, queue_id, video_id = header.split(",")[:3]
    data_type = data_type.strip()
    queue_id, video_id = int(queue_id), int(video_id)
    # Then read the payload until the peer closes the connection
    data = bytearray()
    chunk = client_socket.recv(4096)
    while chunk:
        data.extend(chunk)
        chunk = client_socket.recv(4096)

    if data_type == "JSON" and len(data) > 0:
        data = json.loads(data.decode('utf-8'))

    return data_type, queue_id, video_id, data


def requestsWorker(queue_id, video_id, data):
    print_with_current_milli_time(logging, "Processing audio for queue_id=" + str(queue_id))
    process_audio(queue_id, video_id, data)
    '''print("requestQueue started")
    while True:
        # Get item from queue
        item = requestQueue.get()
        print("requestQueue item")
        if item is not None:
            queue_id, video_id, data = item
            print_with_current_milli_time(logging, "Processing audio for queue_id=" + str(queue_id))
            process_audio(queue_id, video_id, data)
        time.sleep(1)'''


def server_listen():
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.bind((HOST, GPU_PORT))
    s.listen()
    print('Server listening for a connection...')
    while True:
        conn, addr = s.accept()
        print_with_current_milli_time(logging, f'Connected by {addr}')
        data_type, queue_id, video_id, data = receive_data(conn)
        conn.close()

        if data_type == "AUDIO":
            # Process each request on its own thread so the listener keeps accepting
            t1 = threading.Thread(target=requestsWorker, args=(queue_id, video_id, data))
            t1.start()
        else:
            # Note: an unknown data type ends the listener loop
            print("Unknown data type received")
            return None


if __name__ == '__main__':
    # Start the server thread and keep the main thread alive
    server_thread = threading.Thread(target=server_listen)
    server_thread.start()
    while True:
        time.sleep(1)
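# Wire format expected by receive_data(), reconstructed from the code above
# (the header example is illustrative): an 18-byte comma-separated header,
#   b'AUDIO,123,456     '
# followed by the raw payload (WAV bytes for AUDIO, UTF-8 JSON for JSON),
# terminated by the sender closing the socket.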