# transcription_models/transcription_core.py
  1  """
  2  Main module for video and audio transcription
  3  ----------------------------------------------------------------
  4  This module integrates the functionalities of other transcription modules
  5  to provide a unified API for transcribing multimedia content.
  6  """
  7  
  8  import os
  9  import json
 10  import logging
 11  import time
 12  import traceback
 13  from pathlib import Path
 14  from typing import Dict, List, Any, Optional, Callable, Tuple, Union
 15  from tempfile import NamedTemporaryFile
 16  
 17  # Import specialized modules
 18  from .audio_extraction import extract_audio, cleanup_audio_file
 19  from .whisper_utils import transcribe_audio, format_whisper_result, cleanup_whisper_model
 20  from .diarization import diarize_audio, assign_speakers, format_diarized_transcription
 21  
 22  # Logging configuration
 23  logger = logging.getLogger("transcription.core")
 24  
# Directory where transcription output files are stored.
# NOTE: created eagerly at import time (side effect of importing this module).
# NOTE(review): RESULTS_DIR is not referenced elsewhere in this module —
# presumably consumed by importers; confirm before removing.
RESULTS_DIR = Path("results/transcriptions")
RESULTS_DIR.mkdir(parents=True, exist_ok=True)
 28  
 29  def process_monologue(
 30      video_path: str, 
 31      output_txt: Optional[str] = None, 
 32      model_size: Optional[str] = None, 
 33      progress: Optional[Callable] = None
 34  ) -> Dict[str, Any]:
 35      """
 36      Transcribes a video in monologue mode (without speaker identification)
 37      
 38      Args:
 39          video_path: Path to the video file
 40          output_txt: Output path for the text file (optional)
 41          model_size: Size of the Whisper model to use
 42          progress: Progress tracking function (optional)
 43          
 44      Returns:
 45          Dictionary containing the complete transcription and segments
 46          
 47      Raises:
 48          Exception: If an error occurs during processing
 49      """
 50      try:
 51          # Extract audio
 52          if progress:
 53              progress(0.1, desc="Extracting audio...")
 54          
 55          audio_path = extract_audio(video_path, progress=progress)
 56          audio_extracted = True
 57          
 58          # Transcribe audio
 59          if progress:
 60              progress(0.3, desc="Transcription in progress...")
 61          
 62          result = transcribe_audio(audio_path, model_size, progress=progress)
 63          
 64          # Save transcription if requested
 65          if output_txt:
 66              with open(output_txt, "w", encoding="utf-8") as f:
 67                  formatted_text = format_whisper_result(result)
 68                  f.write(formatted_text)
 69              
 70              if progress:
 71                  progress(0.95, desc=f"Transcription saved to {output_txt}")
 72          
 73          # Cleanup
 74          if progress:
 75              progress(1.0, desc="Transcription completed")
 76          
 77          if audio_extracted:
 78              cleanup_audio_file(audio_path)
 79          
 80          # Unified output format
 81          return {
 82              "transcription": result["text"],
 83              "segments": result["segments"],
 84              "language": result.get("language", ""),
 85              "duration": result.get("duration", 0)
 86          }
 87          
 88      except Exception as e:
 89          error_msg = f"Error during transcription: {str(e)}"
 90          logger.error(error_msg)
 91          logger.error(traceback.format_exc())
 92          
 93          # Cleanup in case of error
 94          if 'audio_path' in locals() and 'audio_extracted' in locals() and audio_extracted:
 95              cleanup_audio_file(audio_path)
 96          
 97          raise Exception(error_msg)
 98  
 99  def process_multiple_speakers(
100      video_path: str, 
101      output_txt: Optional[str] = None, 
102      model_size: Optional[str] = None, 
103      huggingface_token: Optional[str] = None, 
104      progress: Optional[Callable] = None
105  ) -> Dict[str, Any]:
106      """
107      Transcribes a video with speaker identification
108      
109      Args:
110          video_path: Path to the video file
111          output_txt: Output path for the text file (optional)
112          model_size: Size of the Whisper model to use
113          huggingface_token: Hugging Face token for access to the diarization model
114          progress: Progress tracking function (optional)
115          
116      Returns:
117          Dictionary containing the transcription with speaker identification
118          
119      Raises:
120          Exception: If an error occurs during processing
121      """
122      try:
123          # Extract audio
124          if progress:
125              progress(0.1, desc="Extracting audio...")
126          
127          audio_path = extract_audio(video_path, progress=progress)
128          audio_extracted = True
129          
130          # Transcribe audio
131          if progress:
132              progress(0.3, desc="Transcription in progress...")
133          
134          result = transcribe_audio(audio_path, model_size, progress=progress)
135          
136          # Identify speakers
137          if progress:
138              progress(0.5, desc="Speaker identification in progress...")
139          
140          diarization = diarize_audio(audio_path, huggingface_token, progress=progress)
141          
142          # Associate speakers with the transcription
143          final_transcription = assign_speakers(result, diarization)
144          
145          # Save the result if requested
146          if output_txt:
147              with open(output_txt, "w", encoding="utf-8") as f:
148                  formatted_text = format_diarized_transcription(final_transcription)
149                  f.write(formatted_text)
150              
151              if progress:
152                  progress(0.95, desc=f"Transcription saved to {output_txt}")
153          
154          # Cleanup
155          if progress:
156              progress(1.0, desc="Transcription completed")
157          
158          if audio_extracted:
159              cleanup_audio_file(audio_path)
160          
161          # Unified output format
162          return {
163              "transcription": format_diarized_transcription(final_transcription, include_timestamps=False),
164              "segments": final_transcription,
165              "language": result.get("language", ""),
166              "duration": result.get("duration", 0),
167              "speakers": list(set(segment["speaker"] for segment in final_transcription))
168          }
169          
170      except Exception as e:
171          error_msg = f"Error during transcription with speaker identification: {str(e)}"
172          logger.error(error_msg)
173          logger.error(traceback.format_exc())
174          
175          # Cleanup in case of error
176          if 'audio_path' in locals() and 'audio_extracted' in locals() and audio_extracted:
177              cleanup_audio_file(audio_path)
178          
179          # Free model memory
180          cleanup_whisper_model()
181          
182          raise Exception(error_msg)
183  
def transcribe_external_audio(
    audio_path: str,
    model_size: Optional[str] = None,
    output_txt: Optional[str] = None,
    progress: Optional[Callable] = None
) -> Dict[str, Any]:
    """
    Transcribes an existing audio file without the extraction step.

    Unlike the video entry points, the caller owns the audio file: it is
    not deleted after transcription.

    Args:
        audio_path: Path to the audio file
        model_size: Size of the Whisper model to use
        output_txt: Output path for the text file (optional)
        progress: Progress tracking function (optional)

    Returns:
        Dictionary containing the complete transcription and segments
        (keys: "transcription", "segments", "language", "duration")

    Raises:
        Exception: If an error occurs during processing (original error chained)
    """
    try:
        # Transcribe the audio with Whisper
        if progress:
            progress(0.2, desc="Audio transcription in progress...")
        result = transcribe_audio(audio_path, model_size, progress=progress)

        # Save the formatted transcription if a path was requested
        if output_txt:
            with open(output_txt, "w", encoding="utf-8") as f:
                f.write(format_whisper_result(result))
            if progress:
                progress(0.9, desc=f"Transcription saved to {output_txt}")

        if progress:
            progress(1.0, desc="Transcription completed")

        # Unified output format shared by all transcription entry points
        return {
            "transcription": result["text"],
            "segments": result["segments"],
            "language": result.get("language", ""),
            "duration": result.get("duration", 0)
        }

    except Exception as e:
        error_msg = f"Error during audio transcription: {str(e)}"
        logger.error(error_msg)
        logger.error(traceback.format_exc())
        # Chain the original exception so the root cause is preserved.
        raise Exception(error_msg) from e
237  
def get_available_models() -> Dict[str, Dict[str, Any]]:
    """
    Returns information about the available transcription models.

    Returns:
        Dictionary keyed by model family ("whisper", "diarization"),
        each mapping to per-model metadata.
    """
    # Imported lazily to avoid loading Whisper machinery unless requested.
    from .whisper_utils import get_available_whisper_models

    diarization_models: Dict[str, Any] = {
        "pyannote": {
            "description": "Diarization model for speaker identification",
            "requires_token": True,
            "source": "https://huggingface.co/pyannote/speaker-diarization-3.1",
        },
    }

    return {
        "whisper": get_available_whisper_models(),
        "diarization": diarization_models,
    }
259  
# Transcript analysis helper (appended at the end of the file)
def analyze_transcript(transcription: str, language: Optional[str] = None) -> str:
    """
    Analyze a transcription text to extract key points and insights.

    Args:
        transcription: Text transcription to analyze
        language: Language of the transcription
            NOTE(review): currently unused — the LLM call ignores it;
            confirm whether it should be forwarded to the model.

    Returns:
        Analysis text, or an error message string if no LLM is available
        or the analysis fails (this function never raises).
    """
    # Imported lazily to avoid a hard dependency at module import time.
    from model_manager import ModelManager

    try:
        llm = ModelManager.get_instance().get_model("llm")
        if not llm:
            return "Error: No LLM model available for analysis"

        # Delegate the analysis to the configured LLM.
        return llm.generate(transcription, max_tokens=1024)
    except Exception as e:
        logger.error(f"Error during transcription analysis: {str(e)}")
        logger.error(traceback.format_exc())
        return f"Error analyzing transcription: {str(e)}"