gpu_processor.py
   1  import yt_dlp
   2  import sys
   3  import torch
   4  import os
   5  from whisper.transcribe import transcribe
   6  from whisper import load_model
   7  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
   8  import scipy
   9  from scipy.io import wavfile
  10  import ffmpeg
  11  import soundfile as sf
  12  from io import BytesIO
  13  import struct
  14  from scipy.io.wavfile import write
  15  from pyannote.audio import Pipeline
  16  import numpy as np
  17  from scipy.signal import resample
  18  import json
  19  from collections import defaultdict
  20  import time
  21  import requests
  22  
  23  from datetime import datetime
  24  import pymysql as mysql
  25  import socket
  26  import threading
  27  from langdetect import detect
  28  import openai
  29  from functions import *
  30  import logging
  31  
  32  '''os.environ["CUDA_VISIBLE_DEVICES"] = "0"
  33  from bark.generation import (
  34      generate_text_semantic,
  35      preload_models,
  36  )
  37  semantic_path = "semantic_output/pytorch_model.bin" # set to None if you don't want to use finetuned semantic
  38  coarse_path = "coarse_output/pytorch_model.bin" # set to None if you don't want to use finetuned coarse
  39  fine_path = "fine_output/pytorch_model.bin" # set to None if you don't want to use finetuned fine
  40  
  41  is_half=True
  42  preload_models(
  43      text_use_gpu=True,
  44      text_use_small=False,
  45      text_model_path=semantic_path,
  46      coarse_use_gpu=True,
  47      coarse_use_small=False,
  48      coarse_model_path=coarse_path,
  49      fine_use_gpu=True,
  50      fine_use_small=False,
  51      fine_model_path=fine_path,
  52      codec_use_gpu=True,
  53      force_reload=False,
  54      path="models"
  55  )
  56  from bark import generate_audio, SAMPLE_RATE'''
  57  
  58  
  59  environment = 'production'  # default to 'production' if no argument is given
  60  if len(sys.argv) > 1:
  61      environment = sys.argv[1]
  62  
  63  home_dir = "/home/cyril/dev/VideoTranslator"
  64  temp_dir = home_dir + '/temp'
  65  downloads_dir = home_dir + '/downloads'
  66  output_dir = home_dir + '/output'
  67  IPs = (
  68      '162.199.220.174',
  69      'translatizer.com'
  70  )
  71  BANDWIDTH_PORT = 2212
  72  GPU_PORT = 2213
  73  pw = 'H0ur1!'
  74  host = '127.0.0.1'
  75  if environment == 'production':
  76      logging.basicConfig(filename='gpu_processor.log', level=logging.INFO)
  77      BANDWIDTH_PORT = 62212
  78      GPU_PORT = 62213
  79      pw = 'Cyr1lH0ur1!'
  80      host = 'translatizer.com'
  81      output_dir = home_dir + '/s3-bucket'
  82  else:
  83      logging.basicConfig(level=logging.INFO,
  84                          format='%(asctime)s - %(levelname)s - %(message)s')
  85  # MySQL Configuration
  86  db_config = {
  87      'user': 'appuser',
  88      'password': pw,
  89      'host': host,  # or the IP address of your MySQL server
  90      'database': 'videotranslator',
  91  }
  92  
  93  
  94  # Initialize models and devices for voice cloning
  95  device = 'cuda' if torch.cuda.is_available() else 'cpu'
  96  def get_destination_ip():
  97      retValue = ''
  98      if environment == 'debug':
  99          retValue = '127.0.0.1'
 100      else:
 101          if os.environ.get('PUBLIC_IP') == IPs[0]:
 102              retValue = IPs[1]
 103          else:
 104              retValue = IPs[0]
 105      return retValue
 106  
 107  
 108  
 109  
 110  import subprocess
 111  
 112  
 113  def merge_audio_files(original_file, translated_file, output_file):
 114      """
 115      Merge two mono audio files into one stereo file using FFmpeg.
 116  
 117      Parameters:
 118      - original_file (str): Path to the first mono audio file.
 119      - translated_file (str): Path to the second mono audio file.
 120      - output_file (str): Path to the output stereo audio file.
 121      """
 122  
 123      input_args = ["-i", original_file, "-i", translated_file]
 124      filter_args = [
 125          "-filter_complex",
 126          "[0:a]volume=0.2,pan=mono|c0=c0[a1]; [1:a]volume=0.8,pan=mono|c0=c0[a2]; [a1][a2]amerge=inputs=2[aout]",
 127          "-ac", "2",
 128          "-map", "[aout]"
 129      ]
 130      output_args = [output_file]
 131  
 132      cmd = ["ffmpeg"] + input_args + filter_args + output_args
 133  
 134      # The raw argv list is executed with subprocess; ffmpeg-python's run()
 135      # expects a stream graph rather than a command list.
 136      subprocess.run(cmd, check=True)
 137  
 138      # Example usage:
 139      # merge_audio_files("original_trunc.wav", "translated.wav", "merged.wav")
 148  
 149  
 150  def createAudioOfSegment(translated_text, processor, model, voice_preset):
 151      # voice_preset = '/home/cyril/tools/bark-with-voice-clone/bark/assets/prompts/Tucker.npz'
 152      # voice_preset = "v2/fr_speaker_5"
 153      device = 'cuda' if torch.cuda.is_available() else 'cpu'
 154      inputs = processor(translated_text, voice_preset=voice_preset)
 155      inputs.to(device)
 156  
 157      audio_array = model.generate(**inputs)
 158      # audio_array.to(device)
 159      audio_array = audio_array.cpu().numpy().squeeze()
 160      sample_rate = model.generation_config.sample_rate
 161      return sample_rate, audio_array
 162  
 163  
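      # Illustrative sketch (not wired into the pipeline): driving
      # createAudioOfSegment with the Hugging Face Bark checkpoint. The
      # "suno/bark" model name and the "v2/fr_speaker_5" preset are assumptions,
      # not values this module configures.
      def _example_create_audio_of_segment():
          from transformers import AutoProcessor, BarkModel
          processor = AutoProcessor.from_pretrained("suno/bark")
          model = BarkModel.from_pretrained("suno/bark")
          model.to('cuda' if torch.cuda.is_available() else 'cpu')
          sample_rate, audio = createAudioOfSegment("Bonjour tout le monde.", processor, model, "v2/fr_speaker_5")
          save_wav_to_disk(audio, sample_rate, temp_dir + '/segment_preview.wav')
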
 164  def split_elements_for_caption(input_list):
 165      output_list = []
 166  
 167      for element in input_list:
 168          words = element['text'].split()
 169          total_words = len(words)
 170          total_duration = element['end'] - element['start']
 171          if total_words == 0:
 172              continue  # skip empty captions to avoid dividing by zero
 173          duration_per_word = total_duration / total_words
 174  
 175          # Split the words into chunks of 10 or fewer
 176          for i in range(0, total_words, 10):
 177              chunk_words = words[i:i + 10]
 178              chunk_start = element['start'] + i * duration_per_word
 179              chunk_end = chunk_start + len(chunk_words) * duration_per_word
 180  
 181              output_list.append({
 182                  'start': chunk_start,
 183                  'end': chunk_end,
 184                  'text': ' '.join(chunk_words)
 185              })
 186  
 187      return output_list
 188  
 189  def convert_to_transcript(result):
 190      retValue = []
 191      for i in range(0,len(result['segments']) ):
 192          current_part = {
 193              'start': result['segments'][i]['start'],
 194              'end': result['segments'][i]['end'],
 195              'speaker_id': None,
 196              'text': result['segments'][i]['text']
 197  
 198          }
 199          retValue.append(current_part)
 200      return retValue
 201  
 202  def split_text(result):
 203      words = []
 204      segment_id = 0
 205      word_id = 0
 206      for segment in result['segments']:
 207          segment_id += 1
 208          for word in segment['words']:
 209              word_id += 1
 210              thisWord = {
 211                  'start': word['start'],
 212                  'end': word['end'],
 213                  'speaker_id': None,
 214                  'word': word['word']
 215  
 216              }
 217              words.append(thisWord)
 218      parts = []
 219      current_part_text = []
 220      words_so_far = 0
 221      word_count = 0
 222      for word in words:
 223          current_part_text.append(word['word'])
 224          word_count += 1
 225  
 226          # Close the part at a sentence end once it has 35+ words, or force a break at 60 words
 227          if word_count >= 35:
 228              if word_count >= 60 or word['word'][-1] == '.' or word['word'][-1] == '?':
 229                  current_part_text_str = ''.join(current_part_text)
 230                  current_part = {
 231                      'start': words[words_so_far]['start'],
 232                      'end': words[words_so_far + word_count - 1]['end'],
 233                      'speaker_id': None,
 234                      'text': current_part_text_str
 235  
 236                  }
 237                  parts.append(current_part)
 238                  current_part_text = []
 239                  words_so_far += word_count
 240                  word_count = 0
 241  
 242      # Add any remaining words to the parts list
 243      if current_part_text:
 244          current_part_text_str = ''.join(current_part_text)
 245          current_part = {
 246              'start': words[words_so_far]['start'],
 247              'end': words[words_so_far + word_count - 1]['end'],
 248              'speaker_id': None,
 249              'text': current_part_text_str,
 250  
 251          }
 252          parts.append(current_part)
 253      # retValue = split_elements_for_caption(parts)
 254      return parts
 255  
 256  def translate_string(text, nllb_target_code, model, tokenizer):
 257      device = 'cuda' if torch.cuda.is_available() else 'cpu'
 258      tgt_lang_id = tokenizer.lang_code_to_id[nllb_target_code]
 259      model_inputs = tokenizer(text, return_tensors='pt', padding='longest')
 260      model_inputs.to(device)
 261      gen_tokens = model.generate(**model_inputs, forced_bos_token_id=tgt_lang_id, max_new_tokens=300)
 262      translated_part = tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)
 263      return translated_part[0]
 264  
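      # Minimal usage sketch for translate_string, relying on the module-level
      # NLLB model/tokenizer loaded further below; 'fra_Latn' is the NLLB-200
      # code for French.
      def _example_translate_string():
          return translate_string("Hello world", 'fra_Latn', nllbModel, nllbTokenizer)
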
 265  def translate_captions(parts, nllb_target_code, model, tokenizer):
 266      translatedParts = []
 267      for part in parts:
 268          translated_text = translate_string(part['text'], nllb_target_code, model, tokenizer)
 269          current_part = {
 270              'start': part['start'],
 271              'end': part['end'],
 272              'speaker_id': part['speaker_id'],
 273              'text': translated_text
 274  
 275          }
 276          translatedParts.append(current_part)
 277          # translator = pipeline('translation', model=model, tokenizer=tokenizer, src_lang="eng_Latn", tgt_lang='fra_Latn', max_length = 400)
 278      return translatedParts
 279  
 280  
 281  def index_exists(index_name, cursor):
 282      query = "SELECT COUNT(1) AS IndexIsThere FROM INFORMATION_SCHEMA.STATISTICS WHERE table_schema=DATABASE() AND index_name=%s"
 283      cursor.execute(query, (index_name,))
 284  
 285      # Fetch the result
 286      result = cursor.fetchone()
 287      return result[0]
 288  
 289  
 290  
 291  
 292  
 293  def has_translation_been_revised_already(url_hash, language_code):
 294      retValue = False
 295      conn = mysql.connect(**db_config)
 296      cursor = conn.cursor()
 297      sql_statement = "SELECT A.id  FROM transcripts A, languages B, videos C WHERE A.video_id = C.id and C.url_hash = %s and A.language_id = B.id and B.code = %s and B.revised = 1 limit 1"
 298      cursor.execute(sql_statement, (url_hash, language_code,))
 299      row = cursor.fetchone()
 300      if row is not None:
 301          retValue = True
 302      return retValue
 303  
 304  
 305  import hashlib
 306  
 307  def get_video_object():
 308      video = {
 309          'id': None,
 310          'url': None,
 311          'url_hash': None,
 312          'thumbnail_url': None,
 313          'language_id': None,
 314          'width': None,
 315          'height': None,
 316          'metadata': []
 317      }
 318      return video
 319  
 320  def get_metadata_object():
 321      metadata = {
 322          'language_id': None,
 323          'title': None,
 324          'description': None,
 325          'tags': None,
 326      }
 327      return metadata
 328  
 329  
 330  def download_video(url):
 331  
 332  
 333      # The URL's SHA-256 hash serves as a stable local file name
 337      url_hash = hashlib.sha256(url.encode()).hexdigest()
 338      # Define the output template for the downloaded file
 339      output_template = os.path.join(downloads_dir, url_hash + '.%(ext)s')
 340  
 341      options = {
 342          'format': 'bestvideo+bestaudio/best',
 343          'outtmpl': output_template,
 344          'writeinfojson': True,
 345          'quiet': True,
 346          'nocheckcertificate': True,
 347      }
 348  
 349      with yt_dlp.YoutubeDL(options) as ydl:
 350          # Download the video and collect its metadata
 351          info_dict = ydl.extract_info(url, download=True)
 352          fileNameWithPath = ydl.prepare_filename(info_dict)
 353          video = {
 354              'id': None,
 355              'url': url,
 356              'url_hash': url_hash,
 357              'thumbnail_url': info_dict.get('thumbnail', ''),
 358              'language_id': None,
 359              'metadata': [],
 360              'width': info_dict['width'],
 361              'height': info_dict['height']
 362          }
 363          metadata = {
 364              'language_id': None,
 365              'title': info_dict.get('title', ''),
 366              'description': info_dict.get('description', ''),
 367              'tags': ', '.join(info_dict.get('tags', [])),
 368          }
 369          video['metadata'].append(metadata)
 372      outputName = os.path.basename(fileNameWithPath)
 374      return video, outputName
 375  
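      # Example usage of download_video (placeholder URL; this performs a real
      # download into downloads_dir and returns the metadata dict plus the bare
      # file name):
      # video, name = download_video("https://www.youtube.com/watch?v=<VIDEO_ID>")
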
 376  
 377  def get_audio_codec(video_path):
 378      cmd = [
 379          'ffprobe',
 380          '-v', 'quiet',
 381          '-print_format', 'json',
 382          '-show_streams',
 383          video_path
 384      ]
 385  
 386  
 387      result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
 388      output = result.stdout.decode('utf-8')
 389      data = json.loads(output)
 390  
 391      for stream in data['streams']:
 392          if stream['codec_type'] == 'audio':
 393              return stream['codec_name']
 394  
 395      return None
 396  
 397  
 398  def extractAudioFromVideo(filename, desired_samplerate=24000):
 399      out, _ = (
 400          ffmpeg.input(downloads_dir + '/' + filename)
 401          .output('pipe:', format='wav', ac=1, ar=desired_samplerate)
 402          .run(capture_stdout=True, capture_stderr=True)
 403      )
 404  
 405  
 406      return out
 407  
 408  
 409  def normalize_text(t):
 410      return t.replace("%", "o/o")
 411  
 412  
 413  def format_drawtext_filter(captions, video):
 414      """Generate drawtext filter string for ffmpeg."""
 415  
 416  
 417      filters = []
 418      font_size = int(video['width'] / 20)
 419      for caption in captions:
 420          text = caption['text']
 421          modified_text = str(text).replace("'", "\u2019")
 422          modified_text = normalize_text(modified_text)
 423          filter_str = (
 424              f"drawtext=text='{modified_text}':x=(w-text_w)/2:y=h-th-50:"
 425              f"fontsize={font_size}:fontcolor=yellow:box=1:boxcolor=black@0.5:"
 426              f"enable='between(t,{caption['start']},{caption['end']})'"
 427          )
 429          filters.append(filter_str)
 430      return ",".join(filters)
 431  
 433  
 434  
 435  def split_captions_for_display(captions):
 436      result = []
 437  
 438  
 439      max_chars_per_caption = 50  # the cap is in characters, not words
 440  
 441  
 442      def split_caption(text, start, speaker_id, end):
 443  
 444  
 445          words = text.split()
 446          sub_items = []
 447          current_text = ""
 448          for word in words:
 449              if len(current_text) + len(word) + 1 > max_chars_per_caption:
 450                  sub_items.append(current_text.strip())
 451                  current_text = ""
 452              current_text += word + " "
 453          if current_text:
 454              sub_items.append(current_text.strip())
 455  
 456          duration = end - start
 457          total_chars = sum(len(item) for item in sub_items)
 458          current_start = start
 459          for item in sub_items:
 460              current_end = current_start + (float(len(item)) / float(total_chars)) * duration
 461              result_text = format_text(item)
 462              result.append({
 463                  'start': current_start,
 464                  'end': current_end,
 465                  'speaker_id': speaker_id,
 466                  'text': result_text
 467              })
 468              current_start = current_end
 469  
 470  
 471      def format_text(text):
 472          if len(text) <= int(max_chars_per_caption / 2):
 473              return text
 474  
 475  
 476          # Find the nearest space after the midpoint to break the text into two lines
 477          for i in range(int(max_chars_per_caption / 2), len(text)):
 478              if text[i] == ' ':
 479                  first_part = text[:i]
 480                  second_part = text[i + 1:]
 481                  break
 482          else:
 483              first_part = text
 484              second_part = ""
 485  
 486          spaces_needed = (len(first_part) - len(second_part)) // 2
 487          centered_second_part = ' ' * spaces_needed + second_part
 488          return first_part + '\n' + centered_second_part
 489  
 490      for caption in captions:
 491          split_caption(caption['text'], caption['start'], caption['speaker_id'], caption['end'])
 492  
 493      return result
 494  
 495  
 496  def get_video_bitrate(filename):
 497      try:
 498  
 499  
 500          # Run FFmpeg command to get video details
 501          result = subprocess.run(['ffmpeg', '-i', filename], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
 502  
 503          # Extract overall bitrate from FFmpeg output using regex
 504          overall_bitrate_match = re.search(r'bitrate: (\d+ kb/s)', result.stderr)
 505          if overall_bitrate_match:
 506              overall_bitrate = int(overall_bitrate_match.group(1).split(' ')[0])
 507          else:
 508              overall_bitrate = None
 509  
 510          return overall_bitrate
 511      except Exception as e:
 512          print(f"Error: {e}")
 513          return None
 514  
 515  
 516  def split_in_smaller_chunks(subtitles, chunk_duration):
 517      retValue = []
 518  
 519  
 520      temp_subtitles = []
 521      last_timestamp = 0
 522      for i in range(0, len(subtitles)):
 523          if subtitles[i]['end'] > last_timestamp + chunk_duration:
 524              if temp_subtitles:  # avoid emitting an empty leading chunk
 525                  retValue.append(temp_subtitles)
 526              temp_subtitles = []
 527              last_timestamp = subtitles[i]['end']
 528          temp_subtitles.append(subtitles[i])
 529      retValue.append(temp_subtitles)
 530      return retValue
 531  
 532  def add_subtitles_to_video(subtitles, language, video):
 533  
 534  
 535      filename = get_filename_from_hash(video['url_hash'])
 536      #bitrate = get_video_bitrate(filename)
 537  
 538      subtitles2 = split_captions_for_display(subtitles)
 539      subtitles_list = split_in_smaller_chunks(subtitles2, 600)
 540      part_start_time = 0
 541      for i in range(0, len(subtitles_list)):
 542          if len(subtitles_list) == 1:
 543              output_video = output_dir + '/' + video['url_hash'] + "_" + language + ".mp4"
 544          else:
 545              output_video = temp_dir + '/' + video['url_hash'] + "_" + language + "_" + str(i) + ".mp4"
 546          if os.path.exists(output_video):
 547              os.remove(output_video)
 548          drawtext_filter = format_drawtext_filter(subtitles_list[i], video)
 549  
 551          video_duration = subtitles_list[i][len(subtitles_list[i]) - 1]['end'] - part_start_time
 552          '''cmd = [
 553              'ffmpeg',  # specify the exact path to your FFmpeg binary
 554              '-hwaccel', 'cuda',
 555              '-hwaccel_device', '0',
 556              '-i', downloads_dir + '/' + filename,
 557              '-ss', str(part_start_time),
 558              '-t', str(video_duration),
 559              '-vf', drawtext_filter,
 560              '-c:v', 'h264_nvenc',
 561              output_video
 562          ]'''
 563  
 564          cmd = [
 565              'ffmpeg',  # specify the exact path to your FFmpeg binary
 566              '-i', downloads_dir + '/' + filename,
 567              '-ss', str(part_start_time),
 568              '-t', str(video_duration),
 569              '-vf', drawtext_filter,
 570              '-c:v', 'h264_nvenc',
 571              output_video
 572          ]
 573  
 574          subprocess.run(cmd)
 575          part_start_time += video_duration
 576      if len(subtitles_list) > 1:
 577          str1 = ''
 578          str2 = ""
 579          for i in range(0, len(subtitles_list)):
 580              str1 += " -i " + temp_dir + '/' + video['url_hash'] + "_" + language + "_" + str(i) + ".mp4 "
 581              str2 += "[" + str(i) + ":v:0][" + str(i) + ":a:0]"
 582          str2 += "concat=n=" + str(len(subtitles_list)) + ":v=1:a=1[outv][outa]"
 583          print(str1)
 584          print(str2)
 585  
 586          cmd = "ffmpeg -hwaccel cuda -hwaccel_device 0 -y " + str1 + " -filter_complex " + str2 + " -map [outv] -map [outa] -c:v h264_nvenc " + output_dir + '/' + video['url_hash'] + "_" + language + ".mp4"
 587          os.system(cmd)
 588          for i in range(0, len(subtitles_list)):
 589              fileToDelete = temp_dir + '/' + video['url_hash'] + "_" + language + "_" + str(i) + ".mp4"
 590              try:
 591                  os.remove(fileToDelete)
 592              except Exception as e:
 593                  print(e)
 594  
 595  def overlay_captions2(input_video, subtitles, language):
 596      parts = input_video.split("/")
 597      fileNameWithExtension = parts[len(parts) - 1]
 598      parts = fileNameWithExtension.split(".")
 599      fileName_without_extension = parts[0]
 600      directory = input_video[:input_video.index(fileNameWithExtension)]
 601      output_video = directory + fileName_without_extension + "_" + language + ".mp4"
 602      if os.path.exists(directory + "temp.srt"):
 603          os.remove(directory + "temp.srt")
 604      if os.path.exists(output_video):
 605          os.remove(output_video)
 606      # Generate SRT content
 607      srt_content = generate_srt(subtitles)
 608      with open(directory + "temp.srt", "w", encoding="utf-8") as f:
 609          f.write(srt_content)
 610  
 611      # Use ffmpeg to overlay the subtitles on the video
 612      video = ffmpeg.input(input_video)
 613      audio = video.audio
 614      ffmpeg.concat(
 615          video.filter('subtitles', directory + 'temp.srt', force_style="OutlineColour=&H40000000,BorderStyle=3"),
 616          audio, v=1, a=1
 617      ).output(output_video).run(quiet=True, overwrite_output=True)
 618  
 619      # Clean up temporary SRT file
 620      os.remove(directory + "temp.srt")
 621  
 622  
 623  def generate_srt(subtitles):
 624      """Generate SRT content from a list of (start, end, text) tuples."""
 625      srt_content = ""
 626      for i, (start, end, text) in enumerate(subtitles, 1):
 627          srt_content += f"{i}\n"
 628          srt_content += f"{format_timestamp(start)} --> {format_timestamp(end)}\n"
 629          srt_content += f"{text}\n\n"
 630      return srt_content
 631  
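      # Example: generate_srt([(0.0, 1.5, "Hello"), (1.5, 3.0, "World")]) yields
      #   1
      #   00:00:00,000 --> 00:00:01,500
      #   Hello
      # followed by the second numbered cue.
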
 632  
 633  def get_wav_duration(wav_bytes, sample_rate):
 634      # Subtract the header size to get the size of the audio data
 635      audio_data_size = len(wav_bytes) - 44
 636      # Calculate the number of samples, assuming 16-bit mono PCM (2 bytes per sample)
 637      num_samples = audio_data_size / 2
 638      # Calculate the duration
 639      duration = num_samples / sample_rate
 640      return duration
 641  
 642  
 643  def extract_subportion(wav_bytes, start_sec, end_sec, sample_rate):
 644      # Constants for WAV format
 645      HEADER_SIZE = 44
 646      BYTES_PER_SAMPLE = 2  # Assuming 16-bit samples
 647  
 648      # Convert start and end seconds to sample indices
 649      start_sample = int(start_sec * sample_rate)
 650      end_sample = int(end_sec * sample_rate)
 651  
 652      # Calculate start and end byte positions in the audio data
 653      start_byte = HEADER_SIZE + start_sample * BYTES_PER_SAMPLE
 654      end_byte = HEADER_SIZE + end_sample * BYTES_PER_SAMPLE
 655  
 656      # Extract the audio data for the sub-portion
 657      sub_audio_data = wav_bytes[start_byte:end_byte]
 658  
 659      # Create a new WAV header for the sub-portion
 660      num_channels = 1  # Assuming mono audio
 661      byte_rate = sample_rate * num_channels * BYTES_PER_SAMPLE
 662      block_align = num_channels * BYTES_PER_SAMPLE
 663      bits_per_sample = 16
 664      sub_data_size = len(sub_audio_data)
 665      sub_chunk_size = 16
 666      chunk_size = 36 + sub_data_size
 667  
 668      header = struct.pack(
 669          '<4sI4s4sIHHIIHH4sI',
 670          b'RIFF', chunk_size, b'WAVE', b'fmt ', sub_chunk_size, 1,
 671          num_channels, sample_rate, byte_rate, block_align,
 672          bits_per_sample, b'data', sub_data_size
 673      )
 674  
 675      # Combine the new header and the sub audio data
 676      sub_wav_bytes = header + sub_audio_data
 677  
 678      return sub_wav_bytes
 679  
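      # Sanity-check sketch for the two WAV helpers above, on synthetic 16-bit
      # mono PCM: a 10 s clip at 16 kHz is 44 header bytes plus 10 * 16000 * 2
      # data bytes.
      def _example_wav_helpers():
          sample_rate = 16000
          wav_bytes = b'\x00' * (44 + 10 * sample_rate * 2)  # 10 s of silence
          assert get_wav_duration(wav_bytes, sample_rate) == 10.0
          sub = extract_subportion(wav_bytes, 1.0, 2.0, sample_rate)
          assert get_wav_duration(sub, sample_rate) == 1.0
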
 680  
 681  def bytes_to_np_array(audio_bytes):
 682      audio_data, _ = sf.read(BytesIO(audio_bytes))
 683      return audio_data.astype(np.float32)  # Convert to float32
 684  
 685  
 686  def save_wav_to_disk(wav_array, sampling_rate, output_file):
 687      wav_int16 = (wav_array * np.iinfo(np.int16).max).astype(np.int16)
 688      write(output_file, sampling_rate, wav_int16)
 689  
 690  
 691  def write_wav_files(wav_arrays, sampling_rate, output_dir, prefix="wav"):
 692      file_paths = []
 693      for idx, wav_array in enumerate(wav_arrays):
 694          # Convert the wav array to int16 format
 695          wav_int16 = (wav_array * np.iinfo(np.int16).max).astype(np.int16)
 696  
 697          # Define the output file path
 698          file_path = f"{output_dir}/{prefix}_{idx + 1}.wav"
 699          file_paths.append(file_path)
 700  
 701          # Write the wav array to a WAV file
 702          write(file_path, sampling_rate, wav_int16)
 703  
 704      return file_paths
 705  
 706  
 707  def WavFileToTensorFormat(filename):
 708      # Read the WAV file
 709      sample_rate, audio_data = wavfile.read(filename)
 710  
 711      # Ensure the audio data is in np.int16 format
 712      if audio_data.dtype != np.int16:
 713          # Normalize audio data to the range [-1, 1]
 714          audio_data_normalized = np.interp(audio_data, (audio_data.min(), audio_data.max()), (-1, 1))
 715  
 716          # Convert normalized data to np.int16 format
 717          audio_data = np.int16(audio_data_normalized * 32767)
 718  
 719      # Convert to PyTorch tensor
 720      audio_tensor = torch.tensor(audio_data).float()
 721  
 722      return audio_tensor
 723  
 724  
 725  def transcribe_with_word_timestamps(model, audio, **kwargs):
 726      # Ensure word_timestamps is set to True
 727      kwargs["word_timestamps"] = True
 728      return transcribe(model, audio, **kwargs)
 729  
 730  
 731  def Transcribe(audio_data):
 732      # Load the Whisper model
 733      model_name = "large"  # Replace with the appropriate model name
 734      device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 735      whisperModel = load_model(model_name)
 736      whisperModel.to(device)
 737      audio_tensor = torch.tensor(audio_data).to(device)
 738      result = transcribe_with_word_timestamps(whisperModel, audio_tensor)
 739      return result
 740  
 741  
 742  def resample_wav(data, old_samplerate, new_samplerate):
 743      # Calculate the new length of the data
 744      new_length = int(len(data) * new_samplerate / old_samplerate)
 745  
 746      # Resample the data
 747      resampled_data = resample(data, new_length)
 748  
 749      return resampled_data
 750  
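      # Quick check for resample_wav: halving the sample rate halves the sample count.
      def _example_resample_wav():
          one_second = np.zeros(48000, dtype=np.float32)  # 1 s at 48 kHz
          assert len(resample_wav(one_second, 48000, 24000)) == 24000
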
 751  from pyannote.audio.pipelines import SpeakerDiarization
 752  def segment_audio_by_speakers(wav_array, sr, SampleRate=16000):
 753      # Load the pre-trained model
 754      # pipeline = SpeakerDiarization(segmentation="pyannote/segmentation", embedding="pyannote/embedding")
 755      # device = 0
 756      # Ensure GPU is available
 757      if torch.cuda.is_available():
 758          device = torch.device("cuda")
 759      else:
 760          raise ValueError("GPU is not available.")
 761      # The Hugging Face token comes from the environment (HF_TOKEN) instead of being hardcoded
 762      pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization@2.1",
 763                                          use_auth_token=os.environ.get("HF_TOKEN"))
 764      pipeline = pipeline.to(device)
 765      waveform_tensor = torch.tensor(wav_array).unsqueeze(0)  # Add channel dimension
 766  
 767      audio_data = {
 768          "waveform": waveform_tensor,
 769          "sample_rate": sr
 770      }
 771      # Apply the diarization pipeline
 772      diarization = pipeline(audio_data)
 773      print(diarization.labels())
 774      # Count the number of unique speakers
 775      num_speakers = len(diarization.labels())
 776      # Extract segments for each speaker
 777      segments = {
 778      }
 779      i = 0
 780      for turn, _, speaker in diarization.itertracks(yield_label=True):
 781          start_time, end_time = turn.start, turn.end
 782          start_sample, end_sample = int(start_time * SampleRate), int(end_time * SampleRate)
 783          segment = wav_array[start_sample:end_sample]
 784          segments[i] = {'speaker': speaker, 'start': start_time, 'end': end_time, 'segment': segment}
 785          i += 1
 786  
 787          # segments.append((speaker,start_time,end_time, segment))
 788  
 789      return segments
 790  
 791  
 792  def get_segment_from_timestamp(segments, timestamp_in_s):
 793      i = 0
 794      retValue = segments[0]
 795      while (i < len(segments)):
 796          if (segments[i]['start'] > timestamp_in_s):
 797              break
 798          if (segments[i]['end'] > timestamp_in_s):
 799              retValue = segments[i]
 800          i += 1
 801      return retValue
 802  
 803  
 804  def format_timestamp(seconds_str):
 805      """Convert string representation of seconds to 'HH:MM:SS,mmm' format."""
 806      total_seconds = float(seconds_str)
 807      hours = int(total_seconds // 3600)
 808      total_seconds %= 3600
 809      minutes = int(total_seconds // 60)
 810      seconds = int(total_seconds % 60)
 811      milliseconds = int((total_seconds - int(total_seconds)) * 1000)
 812      return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}"
 813  
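      # Doctest-style check for format_timestamp: 3661.5 s is 1 h, 1 min, 1 s, 500 ms.
      def _example_format_timestamp():
          assert format_timestamp("3661.5") == "01:01:01,500"
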
 814  
 815  def createCaptions(translatedFullText, result, font_size):
 816      captions = []
 817      max_chars_per_line = font_size * 120
 818      parts = translatedFullText.split(' ')
 819      i = 0
 820      tempCaption = ''
 821      tempsStart = 0
 822      tempEnd = 0
 823      while i < len(parts) and len(tempCaption) < max_chars_per_line:
 824          tempCaption += parts[i] + ' '
 825          i += 1
 826  
 827      caption = (tempsStart, tempEnd, tempCaption.strip())
 828      captions.append(caption)
 829      return captions

 830  from transformers import T5Tokenizer, T5ForConditionalGeneration
 831  
 832  
 833  def transform_sentence_in_french(sentence):
 834      # Load pre-trained T5 model and tokenizer
 835      model = T5ForConditionalGeneration.from_pretrained("t5-large")
 836      tokenizer = T5Tokenizer.from_pretrained("t5-large")
 837  
 838      # Use the modified prompt
 839      input_text = "correct the following French sentence: " + sentence
 840      input_ids = tokenizer(input_text, return_tensors="pt").input_ids
 841  
 842      # Generate output
 843      output = model.generate(input_ids)
 844      transformed_sentence = tokenizer.decode(output[0], skip_special_tokens=True)
 845  
 846      # Refine post-processing: Remove only the prefix of the output that matches the input
 847      if transformed_sentence.startswith(sentence):
 848          transformed_sentence = transformed_sentence[len(sentence):].strip()
 849  
 850      return transformed_sentence
 851  
 852  
 853  def format_transcript_for_chatgpt(segments):
 854      # Check unique speaker IDs in the segments
 855      unique_speaker_ids = set([segment['speaker_id'] for segment in segments])
 856  
 857      formatted_text = []
 858      prev_speaker_id = None
 859  
 860      for segment in segments:
 861          current_text = segment['text']
 862          current_speaker_id = segment['speaker_id']
 863          start = round(segment['start'], 2)
 864          end = round(segment['end'], 2)
 865          if prev_speaker_id is None or prev_speaker_id != current_speaker_id:
 866              formatted_text.append(f"\n[{start},{end}] /* {current_speaker_id} */: {current_text}")
 867          else:
 868              formatted_text.append(f"[{start},{end}] : {current_text}")
 869  
 870          prev_speaker_id = current_speaker_id
 871  
 872      return "".join(formatted_text)
 873  
 874  def format_transcript_for_google(segments):
 875      # Check unique speaker IDs in the segments
 876      unique_speaker_ids = set([segment['speaker_id'] for segment in segments])
 877  
 878      formatted_text = []
 879      prev_speaker_id = None
 880  
 881      for segment in segments:
 882          current_text = segment['text']
 883          current_speaker_id = segment['speaker_id']
 884          start = round(float(segment['start']), 2)
 885          end = round(float(segment['end']), 2)
 886          if prev_speaker_id is None or prev_speaker_id != current_speaker_id:
 887              formatted_text.append(f"\n[{start},{end}] /* {current_speaker_id} */: {current_text}")
 888          else:
 889              formatted_text.append(f"[{start},{end}] : {current_text}")
 890  
 891          prev_speaker_id = current_speaker_id
 892  
 893      return "".join(formatted_text)
 894  
 895  
 896  def format_transcript_for_google_old(segments):
 897      # Check unique speaker IDs in the segments
 898      unique_speaker_ids = set([segment['speaker_id'] for segment in segments])
 899  
 900      # If there's only one speaker, concatenate the 'text' fields
 901      if len(unique_speaker_ids) == 1:
 902          return "[0] ".join([segment['text'] for segment in segments])
 903  
 904      # If there are multiple speakers, format the text accordingly
 905      else:
 906          formatted_text = []
 907          prev_speaker_id = None
 908  
 909          for segment in segments:
 910              current_text = segment['text']
 911              current_speaker_id = segment['speaker_id']
 912  
 913              # If the speaker ID changed from the previous segment or it's the start
 914              if current_speaker_id != prev_speaker_id:
 915                  if formatted_text:
 916                      formatted_text.append("\n")  # New line between different speakers
 917                  formatted_text.append(f"[{current_speaker_id}]: {current_text}")
 918              else:
 919                  formatted_text.append(f" {current_text}")
 920  
 921              prev_speaker_id = current_speaker_id
 922  
 923          return "".join(formatted_text)
 924  
 925  #model=nllb-200-1.3B
 926  #nllb-200-distilled-600M
 927  nllbModelName = "facebook/nllb-200-distilled-600M"
 928  nllbModel = AutoModelForSeq2SeqLM.from_pretrained(nllbModelName)
 929  nllbModel.to(device)
 930  nllbTokenizer = AutoTokenizer.from_pretrained(nllbModelName)
 931  
 932  
 933  
 934  
 935  
 936  def query_chatgpt(myText):
 937      # max_tokens = 3 * len(myText.split())
 938      # The API key is read from the environment (OPENAI_API_KEY) instead of being hardcoded
 939      openai.api_key = os.environ.get('OPENAI_API_KEY')
 940  
 941      response = openai.ChatCompletion.create(
 942          model="gpt-4",
 943          messages=[
 944              {"role": "system", "content": "You are a helpful assistant."},
 945              {"role": "user", "content": myText},
 946          ]
 947      )
 948  
 949      correct_translation = response.choices[0].message['content']
 950  
 951      return correct_translation
 952  
 953  
 954  
 955  
 956  import re
 957  
 958  
 959  def break_text_into_captions(text, captions):
 960      result = []
 961      parts = str.split(text, '|')
 962  
 963      for i in range(0, len(parts)):
 964          result.append({
 965              'start': captions[i]['start'],
 966              'end': captions[i]['end'],
 967              'text': parts[i].strip(),
 968              'speaker_id': captions[i]['speaker_id']
 969          })
 970  
 971      return result
 972  
 973  def set_unknown_speaker_ids(captions, default_speaker_id):
 974      for i in range(0, len(captions)):
 975          if captions[i]['speaker_id'] is None:
 976              captions[i]['speaker_id'] = default_speaker_id
 977  
 978  def set_speaker_in_captions(captions, segments):
 979      max_speaker_id = 0
 980      for i in range(0, len(captions)):
 981          # time_at_middle_part = int(captions[i]['start']) + (int(captions[i]['end']) - int(captions[i]['start'])) / 2
 982          start = float(captions[i]['start'])
 983          end = float(captions[i]['end'])
 984          speaker_id = get_speaker_id_at_timestamp(segments, start, end)
 985          if speaker_id is not None and speaker_id > max_speaker_id:
 986              max_speaker_id = speaker_id
 987          captions[i]['speaker_id'] = speaker_id
 989      set_unknown_speaker_ids(captions, max_speaker_id)
 990  
 991  
 992  def get_speaker_id_at_timestamp(segments, start, end):
 993      # Dictionary to store the total duration each value appears within the interval
 994      value_durations = defaultdict(float)
 995  
 996      for i in range(0, len(segments)):
 997          # Check if the entry overlaps with the interval [start, end]
 998          overlap_start = max(segments[i]['start'], start)
 999          overlap_end = min(segments[i]['end'], end)
1000  
1001          if overlap_start < overlap_end:
1002              duration = overlap_end - overlap_start
1003              value_durations[segments[i]['speaker']] += duration
1004  
1005      # Find the value with the maximum duration within the interval
1006      max_value = None
1007      max_duration = 0
1008      for value, duration in value_durations.items():
1009          if duration > max_duration:
1010              max_duration = duration
1011              max_value = value
1012      speaker_id = None
1013      if max_value is not None:
1014          strsplit = str.split(max_value, '_')
1015          speaker_id = int(strsplit[1])
1016      return speaker_id
1017  
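      # Illustrative lookup: one diarization turn labelled 'SPEAKER_00' covering
      # 0-5 s resolves a caption spanning 1-2 s to numeric speaker id 0.
      def _example_speaker_lookup():
          segments = {0: {'speaker': 'SPEAKER_00', 'start': 0.0, 'end': 5.0, 'segment': None}}
          assert get_speaker_id_at_timestamp(segments, 1.0, 2.0) == 0
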
1018  def translate_with_nllb(transcript, video, target_language_code):
1019      target_language_id, target_nllb_code, target_language_name = fetch_language_values(db_config, target_language_code)
1020      captions = translate_captions(transcript, target_nllb_code, nllbModel, nllbTokenizer)
1021      # drop_local_model_request_in_queue(transcript, video, target_language_code)
1022      title = translate_string(str(video['metadata'][0]['title']), target_nllb_code, nllbModel, nllbTokenizer)
1023      description = translate_string(str(video['metadata'][0]['description']), target_nllb_code, nllbModel,
1024                                     nllbTokenizer)
1025      tags = translate_string(str(video['metadata'][0]['tags']), target_nllb_code, nllbModel, nllbTokenizer)
1026      metadata = {
1027          'language_id': target_language_id,
1028          'title': title,
1029          'description': description,
1030          'tags': tags
1031      }
1032      video['metadata'].append(metadata)
1033  
1034      save_to_database(db_config, video)
1035  
1036      insert_captions_in_db(db_config, video, captions, target_language_code, nllbModelName)
1037  
1038      return captions
1039  
1040  def enhance_with_chatgpt(transcript, video, target_language_code):
1041      target_language_id, target_nllb_code, target_language_name = fetch_language_values(db_config, target_language_code)
1042      source_text = format_transcript_for_google(transcript)
1043      chunks = break_text(source_text)
1044      chatgpt_output = ""
1045      for i in range(0, len(chunks)):
1046          chatgpt_text = "This is a translation in " + target_language_name + ". Make it make sense. Just provide the output without comments: '" + chunks[i] + "'"
1047          chatgpt_output += query_chatgpt(chatgpt_text)
1048  
1049      good_segments = split_by_speaker(chatgpt_output)
1050      if get_transcripts_total_textLength(good_segments) < 1.2 * len(chatgpt_output):
1051          good_segments = split_by_sentence(chatgpt_output)
1052          good_segments[0]['start'] = transcript[0]['start']
1053          good_segments[len(good_segments) - 1]['end'] = transcript[len(transcript) - 1]['end']
1054          match_translations(transcript, good_segments)
1055          distribute_times(good_segments)
1056      insert_captions_in_db(db_config, video, good_segments, target_language_code, 'chatgpt')
1057      return good_segments
1058  def translate_with_google(transcript, video, target_language_code):
1059      bodyText = format_transcript_for_google(transcript)
1060      fullText = ""
1061      if video['metadata'][0]['title'] is not None:
1062          fullText = video['metadata'][0]['title'] + '\n\n' + bodyText
1063      else:
1064          fullText = bodyText
1065      # sample = transcript[0]['text'] + " " + transcript[1]['text']
1066      # original_language_code = query_google_detect_language(sample)
1067      language_id = video['language_id']
1068      original_language_code, nllb_code, name = fetch_language_values_from_id(db_config, language_id)
1069      target_language_id, target_nllb_code, target_name = fetch_language_values(db_config, target_language_code)
1070      translation_text = query_google_translate(original_language_code, target_language_code, fullText)
1071      video_title, body = split_text_at_number(translation_text)
1072      description = translate_string(str(video['metadata'][0]['description']), target_nllb_code, nllbModel,
1073                                     nllbTokenizer)
1074      tags = translate_string(str(video['metadata'][0]['tags']), target_nllb_code, nllbModel, nllbTokenizer)
1075      metadata_for_target_language_exists = False
1076      for i in range(0, len(video['metadata'])):
1077          if video['metadata'][i]['language_id'] == target_language_id:
1078              video['metadata'][i]['title'] = video_title
1079              video['metadata'][i]['description'] = description
1080              video['metadata'][i]['tags'] = tags
1081              metadata_for_target_language_exists = True
1082      if metadata_for_target_language_exists == False:
1083          metadata = {
1084              'language_id': target_language_id,
1085              'title': video_title,
1086              'description': description,
1087              'tags': tags
1088          }
1089          video['metadata'].append(metadata)
1090      good_segments = split_by_speaker(body)
1091  
1092      return good_segments
1093  
1096  def process_audio(queue_id, video_id, original_audio_array):
1097      SampleRate = 16000
1098      clipDuration = get_wav_duration(original_audio_array, SampleRate)
1099  
1101      device = 'cuda' if torch.cuda.is_available() else 'cpu'
1102      audio_data = bytes_to_np_array(original_audio_array)
1103      speakerDiarization = True
1104  
1105      # wav_int16 = (audio_data * np.iinfo(np.int16).max).astype(np.int16)
1106      # write(output_dir + '/originalWav.wav', SampleRate, wav_int16)
1107  
1108      # Ensure audio_data is a 1D array
1109      if len(audio_data.shape) > 1:
1110          audio_data = audio_data[:, 0]
1111  
1112      print_with_current_milli_time(logging, "Transcribing...")
1113      transcript = []
1114      video = fetch_video_in_db_from_id(db_config, video_id)
1115      url, target_language_code = fetch_queue_in_db_from_id(db_config, queue_id)
1116      target_language_id, target_nllb_code, target_language_name = fetch_language_values(db_config, target_language_code)
1117      original_language_id = video['language_id']
1118      original_language_code, original_nllb_code, original_language_name = fetch_language_values_from_id(db_config,
1119                                                                                                      original_language_id)
1120      transcript = get_transcript_from_db(db_config, video, model_name='whisper')
1121      if len(transcript) == 0:
1122          print_with_current_milli_time(logging, "Transcribing queue_id=" + str(queue_id))
1123          result = Transcribe(audio_data)
1124          original_language_code = result['language']
1125          original_language_id, original_nllb_code, original_language_name = fetch_language_values(db_config, original_language_code)
1126          video['language_id'] = original_language_id
1127          update_video_to_database(db_config, video)
1128          if speakerDiarization:
1129              print_with_current_milli_time(logging, "Speaker diarization for queue_id=" + str(queue_id))
1130              segments = segment_audio_by_speakers(audio_data, SampleRate)
1131              transcript = convert_to_transcript(result)
1132              set_speaker_in_captions(transcript, segments)
1133          insert_captions_in_db(db_config, video, transcript, original_language_code, 'whisper')
1134          print(result['text'])
1135      else:
1136          original_language_id = video['language_id']
1137          original_language_code, original_nllb_code, original_language_name = fetch_language_values_from_id(db_config,
1138                                                                                                   original_language_id)
1141      transcript = group_by_sentence(transcript)
1142  
1143      if "youtube" in video['url'] or "youtu.be" in video['url']:
1144          try:
1145              translated_segments = get_transcript_from_db(db_config, video, target_language_code, model_name='google')
1146              if translated_segments is None or len(translated_segments) == 0:
1147                  print_with_current_milli_time(logging, "Google translating for queue_id=" + str(queue_id))
1148                  translated_segments = translate_with_google(transcript, video, target_language_code)
1149                  update_video_to_database(db_config, video)
1150                  for i in range (0, len(transcript)):
1151                      found = False
1152                      j = 0
1153                      while not found and j < len(translated_segments):
1154                          testSegment = translated_segments[j]
1155                          if float(transcript[i]['start']) == float(testSegment['start']):
1156                              found = True
1157                          elif float(testSegment['start']) > float(transcript[i]['start']):
1158                              break
1159                          else:
1160                              j += 1
1161                      if found == False:
1162                          translationOfString = translate_string(transcript[i]['text'], target_nllb_code,
1163                                                         nllbModel,
1164                                                         nllbTokenizer)
1165                          missing_segment = {
1166                              'speaker_id': transcript[i]['speaker_id'],
1167                              'start': transcript[i]['start'],
1168                              'end': transcript[i]['end'],
1169                              'text': translationOfString
1170                          }
1171                          translated_segments.insert(j, missing_segment)
1172  
1173  
1174  
1175  
1176                  insert_captions_in_db(db_config, video, translated_segments, target_language_code, 'google')
1177              '''if float(translated_segments[len(translated_segments) - 1]['end']) < 600:
1178                  good_segments = get_transcript_from_db(db_config, video, target_language_code, model_name='chatgpt')
1179                  if good_segments is None or len(good_segments) == 0:
1180                      good_segments = enhance_with_chatgpt(translated_segments, video, target_language_code)'''
1181          except Exception as e:
1182              print(f"Error: {e}")
1183              captions = get_transcript_from_db(db_config, video, target_language_code, nllbModelName)
1184              if captions is None or len(captions) == 0:
1185                  print_with_current_milli_time(logging, "Translating with NLLB for queue_id=" + str(queue_id))
1186                  translate_with_nllb(transcript, video, target_language_code)
1187  
1188      else:
1189          captions = get_transcript_from_db(db_config, video, target_language_code, nllbModelName)
1190          if captions is None or len(captions) == 0:
1191              translate_with_nllb(transcript, video, target_language_code)
1192              print_with_current_milli_time(logging, "Translating with NLLB for queue_id=" + str(queue_id))
1193      '''if len(captions) == 0:
1194          captions = translate_with_local_model(transcript, video, target_language_code)
1195          #drop_local_model_request_in_queue(transcript, video, target_language_code)'''
1196      revised_translation_str = ""
1197  
1198      '''annotate_speaker_switch(captions, good_segments)
1199      match_translations(captions, good_segments)
1200      reset_inconsistent_timestamps(good_segments)
1201      # If the start of the first item is None, set it to 0
1202      if good_segments[0]['start'] is None:
1203          good_segments[0]['start'] = captions[0]['start']
1204  
1205      if good_segments[len(good_segments) - 1]['end'] is None:
1206          good_segments[len(good_segments) - 1]['end'] = captions[len(captions) - 1]['end']
1207      good_segments4 = distribute_times(good_segments)'''
1208  
1209  
1210      total_segment = ''
1211      silence = np.zeros(int(0.25 * 24000))  # quarter second of silence
1212  
1213      wav_arrays = []
1214      sampling_rates = []
1215  
1216      # voice_path = '/home/cyril/tools/bark-with-voice-clone/bark/assets/prompts/' + voice_name + '.npz'
1217  
1218      pieces = []
1219  
1220  
1221  
1222  
1223  
1224  
1225      #result = assign_timestamps(captions, corrected_transcript[0]['text'])
1226      '''subtitles2 = split_captions_for_display(captions)
1227      subtitles_list = split_in_smaller_chunks(subtitles2, 600)
1228      drawtext_filter = format_drawtext_filter(subtitles_list[0], video)
1229      print(drawtext_filter)'''
1230  
1231  
1232      set_content_processed_in_db(queue_id)
1233      print_with_current_milli_time(logging, "queue_id=" + str(queue_id) + " fully processed")
1234      torch.cuda.empty_cache()
1235  
1236      '''hasTranslationBeenRevised = has_translation_been_revised_already(video['url_hash'], target_language_code)
1237      if useChatGPT == True and hasTranslationBeenRevised == False:
1238          chatGPTQuery = 'Here is a conversation between multiple speakers. It can be hard to understand at times. Try to correct this exchange in a way that is easy to translate:"' + '"' + chatGPTText + '"'
1239          fullText = correct_translation_with_chatgpt(chatGPTQuery)
1240          print(fullText)
1241          # createCaptions(translatedFullText,result, 24)
1242          revisedCaptions = break_text_into_captions(fullText, transcript)
1243          if (speakerDiarization):
1244              set_speaker_in_captions(revisedCaptions, segments)
1245          insert_captions_in_db(video['url_hash'], revisedCaptions, target_language_code, 1)
1246          add_subtitles_to_video(filename, revisedCaptions, target_language_code, video)
1247      else:'''
1248  
1249      # overlay_captions(filename,captions,language_code)
1250  
1251      # translated_audio_array = np.concatenate(pieces)
1252      # audio_array1_ndarray = np.frombuffer(original_audio_array, dtype=np.int16)
1253      '''if len(audio_array1_ndarray) > len(translated_audio_array):
1254          translated_audio_array = np.pad(translated_audio_array, (0, len(audio_array1_ndarray) - len(translated_audio_array)))
1255      elif len(audio_array1_ndarray) < len(translated_audio_array):
1256          audio_array1_ndarray = np.pad(audio_array1_ndarray, (0, len(translated_audio_array) - len(audio_array1_ndarray)))
1257  
1258      # Mix the audio arrays with the specified volume ratios
1259      interleaved_mono_audio = np.vstack((audio_array1_ndarray, translated_audio_array)).T.ravel()'''
1260  
1262  
1263  
1264  def get_transcript_temp(db_config, video):
1265      caption = None
1266      conn = mysql.connect(**db_config)
1267      cursor = conn.cursor()
1268      sql_text = "select `start`, `end`, `text` from transcripts_temp where video_id = %s"
1269      cursor.execute(sql_text, (int(video['id']),))
1270      row = cursor.fetchone()
1271      if row is not None:
1272          caption = {
1273              'start': float(row[0]),
1274              'end': float(row[1]),
1275              'speaker_id': None,
1276              'text': row[2],
1277  
1278          }
1279      return caption
1280  
1281  
1282  
1283  # Initialize Translation client
1284  
1285  def split_text_at_number(text):
1286      match = re.search(r'\[', text)
1287      if match:
1288          before = text[:match.start()]
1289          after = text[match.start():]
1290          return (before, after)
1291      else:
1292          return (text, "")
1293  
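      # Example: the translated title precedes the first bracketed timestamp, so
      # split_text_at_number("Mon titre [0.0,4.2] : Bonjour")
      # returns ("Mon titre ", "[0.0,4.2] : Bonjour").
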
1294  def break_text(text):
1295      words = text.split()
1296      chunks = []
1297      start = 0
1298  
1299      while start < len(words):
1300          end = start + 3000
1301          if end >= len(words):
1302              chunks.append(' '.join(words[start:]))
1303              break
1304  
1305          # First, look for numbers in brackets after the 3000th word up to the 3500th word
1306          found_bracket = False
1307          for i in range(end, min(end + 500, len(words))):
1308              if re.match(r'\[\d+\]', words[i]):
1309                  end = i
1310                  found_bracket = True
1311                  break
1312  
1313          # If we didn't find a number in brackets by the 3500th word, look for the end of a sentence
1314          if not found_bracket:
1315              for i in range(end + 500, len(words)):
1316                  if '.' in words[i]:
1317                      end = i + 1
1318                      break
1319  
1320          chunks.append(' '.join(words[start:end]))
1321          start = end
1322  
1323      return chunks
1324  
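      # Short inputs come back as a single chunk; chunking only starts past
      # roughly 3000 words, preferring a bracketed marker such as [0] as the
      # split point.
      def _example_break_text():
          assert break_text("[0]: hello world.") == ["[0]: hello world."]
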
1325  def query_google_detect_language(source_text):
1326      """Detects the language of source_text with the Google Cloud Translation API, authenticating via gcloud."""
1327      project_id = "ambient-inquiry-264521"
1328      # Get the access token using gcloud
1329      access_token = subprocess.getoutput("/home/cyril/google-cloud-sdk/bin/gcloud auth print-access-token")
1333  
1334      endpoint = "https://translation.googleapis.com/language/translate/v2/detect"
1335  
1336      headers = {
1337          "Authorization": f"Bearer {access_token}",
1338          "x-goog-user-project": project_id,
1339          "Content-Type": "application/json; charset=utf-8"
1340      }
1341  
1342      data = {
1343          "q": source_text
1344      }
1345  
1346      response = requests.post(endpoint, headers=headers, data=json.dumps(data))
1347      response_json = response.json()
1348  
1349      # Extract and return the detected language
1350      return response_json["data"]["detections"][0][0]["language"]
1351  
1352  
1353  def query_google_translate(source_language, target_language, source_text):
1354      """Translates text using the Google Cloud Translation API v3, authenticating via gcloud."""
1355      retValue = ""
1356      project_id = "ambient-inquiry-264521"
1357      location = "global"
1358      # Get the access token using gcloud
1359      access_token = subprocess.getoutput("/home/cyril/google-cloud-sdk/bin/gcloud auth print-access-token")
1362      endpoint = f"https://translation.googleapis.com/v3/projects/{project_id}/locations/{location}:translateText"
1363  
1364      headers = {
1365          "Authorization": f"Bearer {access_token}",
1366          "x-goog-user-project": project_id,
1367          "Content-Type": "application/json; charset=utf-8"
1368      }
1369      chunks = break_text(source_text)
1370  
1371      for i in range (0, len(chunks)):
1372          body = {
1373              "source_language_code": source_language,
1374              "target_language_code": target_language,
1375              "contents": [chunks[i]],
1376              "mime_type": "text/plain",
1377              "transliteration_config": {
1378                  "enable_transliteration": False
1379              }
1380          }
1381  
1382          response = requests.post(endpoint, headers=headers, data=json.dumps(body))
1383          response_json = response.json()
1384          retValue += response_json["translations"][0]["translatedText"] + " "
1385  
1386      # Extract and return the translated text
1387      return retValue
1388  
1389  
1390  def insert_into_temp(db_config, video, caption):
1391      conn = mysql.connect(**db_config)
1392      cursor = conn.cursor()
1393  
1394      # Fetch the ID of the given last name
1395  
1396      start = caption['start']
1397      end = caption['end']
1398      text = caption['text']
1399      cursor.execute(
1400          "INSERT IGNORE INTO transcripts_temp (video_id, `start`, `end`, `text`) VALUES (%s, %s, %s, %s)",
1401          (video['id'], start, end, text))
1402  
1403      conn.commit()
1404  
1405  
1406  
1407  
1408  
1409  
1410  def set_content_processed_in_db(queue_id):
1411      # Prepare the header
1412      conn = mysql.connect(**db_config)
1413      cursor = conn.cursor()
1414      now = datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
1415      cursor.execute("UPDATE queue SET time_content_processed = %s WHERE id = %s", (now, queue_id))
1416      conn.commit()
1417  
1418  
1419  
1420  
1421  
1422  
1423  
1424  
1425  
1426  
1427  # Configuration
1428  HOST = '0.0.0.0'
1429  HEADER_SIZE = 18
1430  
1431  def receive_data(client_socket):
1432      # First receive the header
1433      header = client_socket.recv(HEADER_SIZE).decode('utf-8')
1434      data_type, queue_id, video_id = header.split(",")[:3]
1435      data_type = data_type.strip()
1436      queue_id, video_id = int(queue_id), int(video_id)
1437      data = bytearray()
1438      chunk = client_socket.recv(4096)
1439      while chunk:
1440          data.extend(chunk)
1441          chunk = client_socket.recv(4096)
1442  
1443      if data_type == "JSON" and len(data) > 0:
1444          data = json.loads(data.decode('utf-8'))
1445  
1446      return data_type, queue_id, video_id, data
1447  
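      # Illustrative client for the wire format receive_data expects: a header
      # "TYPE,queue_id,video_id" padded to HEADER_SIZE bytes (assuming the ids
      # are short enough to fit), followed by the raw payload; closing the
      # socket marks end-of-stream. Host and port are whatever reaches this
      # server, e.g. ('127.0.0.1', GPU_PORT).
      def _example_send_audio(host, port, queue_id, video_id, wav_bytes):
          header = f"AUDIO,{queue_id},{video_id}".ljust(HEADER_SIZE).encode('utf-8')
          with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
              s.connect((host, port))
              s.sendall(header + wav_bytes)
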
1448  def requestsWorker(queue_id, video_id, data ):
1449  
1450      print_with_current_milli_time(logging, "Processing audio for queue_id=" + str(queue_id))
1451      process_audio(queue_id, video_id, data)
1452      '''print("requestQueue started")
1453      while True:
1454          # Get item from queue
1455          item = requestQueue.get()
1456          print("requestQueue item")
1457          if item is not None:
1458              queue_id, video_id, data = item
1459              print_with_current_milli_time(logging, "Processing audio for queue_id=" + str(queue_id))
1460              process_audio(queue_id, video_id, data)
1461              time.sleep(1)'''
1462  
1463  
1464  
1465  
1466  def server_listen():
1467      s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
1468      s.bind((HOST, GPU_PORT))
1469      s.listen()
1470      print('Server listening for a connection...')
1471      while True:
1472          conn, addr = s.accept()
1473          print_with_current_milli_time(logging, f'Connected by {addr}')
1474          data_type, queue_id, video_id, data = receive_data(conn)
1475          conn.close()
1476  
1477          if data_type == "AUDIO":
1478              t1 = threading.Thread(target=requestsWorker, args=(queue_id, video_id, data))
1479              t1.start()
1480          else:
1481              print("Unknown data type received")
1482              continue  # keep the server loop alive instead of exiting
1483  
1484  
1485  
1486  
1487  
1488  
1489  if __name__ == '__main__':
1490      # Start the server thread
1491  
1492      server_thread = threading.Thread(target=server_listen)
1493      server_thread.start()
1494      server_thread.join()
1496