# restai/audio/workers/crisperwhisper_beta.py
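"""Beta audio transcription worker built on nyrahealth/CrisperWhisper.

Runs the model through the Hugging Face ASR pipeline with word-level
timestamps, then redistributes inter-word pauses so that each word's
timestamps hug the actual speech. ``get_python_executable`` locates the
interpreter of the dedicated CrisperWhisper virtualenv.
"""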
import os


def get_python_executable():
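    """Return the path to the Python interpreter of the dedicated
    CrisperWhisper virtualenv, resolved relative to the project root."""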
    current_file_path = os.path.abspath(__file__)
    # Walk four levels up: workers/ -> audio/ -> restai/ -> project root.
    project_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(current_file_path))))

    return os.path.join(project_path, ".venvs", ".venv-crisperwhisper", "bin", "python")


def adjust_pauses_for_hf_pipeline_output(pipeline_output, split_threshold=0.12):
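    """Redistribute inter-word pauses between adjacent word chunks.

    CrisperWhisper stops a word's timestamp at the end of the spoken word,
    leaving real gaps between words. Pauses of up to ``split_threshold``
    seconds are split evenly between the two neighbouring words; longer
    pauses donate only ``split_threshold / 2`` to each side, so most of a
    long pause remains unassigned.

    Example, with the default threshold of 0.12 s: a 0.5 s pause between
    words at (1.0, 2.0) and (2.5, 3.0) yields (1.0, 2.06) and (2.44, 3.0),
    leaving a 0.38 s gap.
    """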
    adjusted_chunks = pipeline_output["chunks"].copy()

    for i in range(len(adjusted_chunks) - 1):
        current_chunk = adjusted_chunks[i]
        next_chunk = adjusted_chunks[i + 1]

        current_start, current_end = current_chunk["timestamp"]
        next_start, next_end = next_chunk["timestamp"]

        # The pipeline can emit None timestamps at chunk boundaries; skip
        # those pairs instead of raising a TypeError on the subtraction.
        if current_end is None or next_start is None:
            continue

        pause_duration = next_start - current_end

        if pause_duration > 0:
            if pause_duration > split_threshold:
                distribute = split_threshold / 2
            else:
                distribute = pause_duration / 2

            # Adjust current chunk end time
            adjusted_chunks[i]["timestamp"] = (current_start, current_end + distribute)

            # Adjust next chunk start time
            adjusted_chunks[i + 1]["timestamp"] = (next_start - distribute, next_end)

    pipeline_output["chunks"] = adjusted_chunks

    return pipeline_output


def worker(prompt, sharedmem):
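    """Transcribe the audio file at ``sharedmem["file_path"]`` with
    CrisperWhisper and print the pause-adjusted result to stdout.

    ``prompt`` and ``sharedmem["filename"]`` are currently unused. torch and
    transformers are imported inside the function because they are expected
    to be available only in the dedicated CrisperWhisper virtualenv.
    """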
    import torch
    from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

    file_path = sharedmem["file_path"]
    filename = sharedmem["filename"]

    # Prefer GPU with half precision; fall back to CPU with full precision.
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

    model_id = "nyrahealth/CrisperWhisper"

    # use_safetensors avoids pickle-based weight loading, low_cpu_mem_usage
    # keeps peak RAM down during init, use_cache=False disables the decoder
    # key/value cache.
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True, use_cache=False
    )
    model.to(device)
    processor = AutoProcessor.from_pretrained(model_id)

    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        chunk_length_s=30,  # split long audio into 30 s chunks
        batch_size=16,      # chunks decoded per forward pass
        return_timestamps="word",
        torch_dtype=torch_dtype,
        device=device,
    )

    hf_pipeline_output = pipe(file_path)
    crisper_whisper_result = adjust_pauses_for_hf_pipeline_output(hf_pipeline_output)

    # The result goes to stdout, presumably so the parent process that
    # spawned this worker can capture it.
    print(crisper_whisper_result)
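
# Illustrative output shape from the word-level pipeline (values made up):
# {"text": " Hello world", "chunks": [{"text": " Hello", "timestamp": (0.0, 0.42)},
#                                     {"text": " world", "timestamp": (0.5, 0.9)}]}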