# transcription_models/transcription_core.py
"""
Main module for video and audio transcription
----------------------------------------------------------------
This module integrates the functionalities of other transcription modules
to provide a unified API for transcribing multimedia content.
"""

import os
import json
import logging
import time
import traceback
from pathlib import Path
from typing import Dict, List, Any, Optional, Callable, Tuple, Union
from tempfile import NamedTemporaryFile

# Import specialized modules
from .audio_extraction import extract_audio, cleanup_audio_file
from .whisper_utils import transcribe_audio, format_whisper_result, cleanup_whisper_model
from .diarization import diarize_audio, assign_speakers, format_diarized_transcription

# Logging configuration
logger = logging.getLogger("transcription.core")

# Output directory for saved transcriptions (created eagerly at import time)
RESULTS_DIR = Path("results/transcriptions")
RESULTS_DIR.mkdir(parents=True, exist_ok=True)


def process_monologue(
    video_path: str,
    output_txt: Optional[str] = None,
    model_size: Optional[str] = None,
    progress: Optional[Callable] = None
) -> Dict[str, Any]:
    """
    Transcribes a video in monologue mode (without speaker identification)

    Args:
        video_path: Path to the video file
        output_txt: Output path for the text file (optional)
        model_size: Size of the Whisper model to use
        progress: Progress tracking function (optional)

    Returns:
        Dictionary containing the complete transcription and segments

    Raises:
        Exception: If an error occurs during processing
    """
    # Initialized up front so the except-block cleanup never has to probe
    # locals() to know whether audio extraction succeeded.
    audio_path: Optional[str] = None
    audio_extracted = False

    try:
        # Extract audio
        if progress:
            progress(0.1, desc="Extracting audio...")

        audio_path = extract_audio(video_path, progress=progress)
        audio_extracted = True

        # Transcribe audio
        if progress:
            progress(0.3, desc="Transcription in progress...")

        result = transcribe_audio(audio_path, model_size, progress=progress)

        # Save transcription if requested
        if output_txt:
            with open(output_txt, "w", encoding="utf-8") as f:
                formatted_text = format_whisper_result(result)
                f.write(formatted_text)

            if progress:
                progress(0.95, desc=f"Transcription saved to {output_txt}")

        # Cleanup
        if progress:
            progress(1.0, desc="Transcription completed")

        if audio_extracted:
            cleanup_audio_file(audio_path)

        # Unified output format
        return {
            "transcription": result["text"],
            "segments": result["segments"],
            "language": result.get("language", ""),
            "duration": result.get("duration", 0)
        }

    except Exception as e:
        error_msg = f"Error during transcription: {str(e)}"
        logger.error(error_msg)
        logger.error(traceback.format_exc())

        # Cleanup in case of error
        if audio_extracted and audio_path is not None:
            cleanup_audio_file(audio_path)

        # Free model memory (consistent with process_multiple_speakers)
        cleanup_whisper_model()

        # Chain the original exception so the root cause stays visible
        raise Exception(error_msg) from e


def process_multiple_speakers(
    video_path: str,
    output_txt: Optional[str] = None,
    model_size: Optional[str] = None,
    huggingface_token: Optional[str] = None,
    progress: Optional[Callable] = None
) -> Dict[str, Any]:
    """
    Transcribes a video with speaker identification

    Args:
        video_path: Path to the video file
        output_txt: Output path for the text file (optional)
        model_size: Size of the Whisper model to use
        huggingface_token: Hugging Face token for access to the diarization model
        progress: Progress tracking function (optional)

    Returns:
        Dictionary containing the transcription with speaker identification

    Raises:
        Exception: If an error occurs during processing
    """
    # Initialized up front so the except-block cleanup never has to probe
    # locals() to know whether audio extraction succeeded.
    audio_path: Optional[str] = None
    audio_extracted = False

    try:
        # Extract audio
        if progress:
            progress(0.1, desc="Extracting audio...")

        audio_path = extract_audio(video_path, progress=progress)
        audio_extracted = True

        # Transcribe audio
        if progress:
            progress(0.3, desc="Transcription in progress...")

        result = transcribe_audio(audio_path, model_size, progress=progress)

        # Identify speakers
        if progress:
            progress(0.5, desc="Speaker identification in progress...")

        diarization = diarize_audio(audio_path, huggingface_token, progress=progress)

        # Associate speakers with the transcription
        final_transcription = assign_speakers(result, diarization)

        # Save the result if requested
        if output_txt:
            with open(output_txt, "w", encoding="utf-8") as f:
                formatted_text = format_diarized_transcription(final_transcription)
                f.write(formatted_text)

            if progress:
                progress(0.95, desc=f"Transcription saved to {output_txt}")

        # Cleanup
        if progress:
            progress(1.0, desc="Transcription completed")

        if audio_extracted:
            cleanup_audio_file(audio_path)

        # Unified output format
        return {
            "transcription": format_diarized_transcription(final_transcription, include_timestamps=False),
            "segments": final_transcription,
            "language": result.get("language", ""),
            "duration": result.get("duration", 0),
            "speakers": list(set(segment["speaker"] for segment in final_transcription))
        }

    except Exception as e:
        error_msg = f"Error during transcription with speaker identification: {str(e)}"
        logger.error(error_msg)
        logger.error(traceback.format_exc())

        # Cleanup in case of error
        if audio_extracted and audio_path is not None:
            cleanup_audio_file(audio_path)

        # Free model memory
        cleanup_whisper_model()

        # Chain the original exception so the root cause stays visible
        raise Exception(error_msg) from e


def transcribe_external_audio(
    audio_path: str,
    model_size: Optional[str] = None,
    output_txt: Optional[str] = None,
    progress: Optional[Callable] = None
) -> Dict[str, Any]:
    """
    Transcribes an existing audio file without extraction

    Args:
        audio_path: Path to the audio file
        model_size: Size of the Whisper model to use
        output_txt: Output path for the text file (optional)
        progress: Progress tracking function (optional)

    Returns:
        Dictionary containing the complete transcription and segments

    Raises:
        Exception: If an error occurs during processing
    """
    try:
        # Transcribe audio (no extraction step: the caller supplies audio directly)
        if progress:
            progress(0.2, desc="Audio transcription in progress...")

        result = transcribe_audio(audio_path, model_size, progress=progress)

        # Save transcription if requested
        if output_txt:
            with open(output_txt, "w", encoding="utf-8") as f:
                formatted_text = format_whisper_result(result)
                f.write(formatted_text)

            if progress:
                progress(0.9, desc=f"Transcription saved to {output_txt}")

        if progress:
            progress(1.0, desc="Transcription completed")

        # Unified output format
        return {
            "transcription": result["text"],
            "segments": result["segments"],
            "language": result.get("language", ""),
            "duration": result.get("duration", 0)
        }

    except Exception as e:
        error_msg = f"Error during audio transcription: {str(e)}"
        logger.error(error_msg)
        logger.error(traceback.format_exc())
        # Chain the original exception so the root cause stays visible
        raise Exception(error_msg) from e


def get_available_models() -> Dict[str, Dict[str, Any]]:
    """
    Returns information about available transcription models

    Returns:
        Dictionary with information about the models
    """
    # Imported locally to avoid paying the whisper_utils import cost unless needed
    from .whisper_utils import get_available_whisper_models

    models = {
        "whisper": get_available_whisper_models(),
        "diarization": {
            "pyannote": {
                "description": "Diarization model for speaker identification",
                "requires_token": True,
                "source": "https://huggingface.co/pyannote/speaker-diarization-3.1"
            }
        }
    }

    return models


def analyze_transcript(transcription: str, language: Optional[str] = None) -> str:
    """
    Analyze a transcription text to extract key points and insights

    Args:
        transcription: Text transcription to analyze
        language: Language of the transcription (currently unused by this
            implementation; kept for interface compatibility)

    Returns:
        Analysis text, or an error message string if analysis fails
    """
    # Imported locally: model_manager is a heavyweight project dependency
    from model_manager import ModelManager

    try:
        # Get LLM model
        model = ModelManager.get_instance().get_model("llm")
        if not model:
            return "Error: No LLM model available for analysis"

        # Generate analysis
        result = model.generate(transcription, max_tokens=1024)
        return result
    except Exception as e:
        logger.error(f"Error during transcription analysis: {str(e)}")
        logger.error(traceback.format_exc())
        return f"Error analyzing transcription: {str(e)}"