transcription_tools.py
1 #!/usr/bin/env python3 2 """ 3 Transcription Tools Module 4 5 Provides speech-to-text transcription with six providers: 6 7 - **local** (default, free) — faster-whisper running locally, no API key needed. 8 Auto-downloads the model (~150 MB for ``base``) on first use. 9 - **groq** (free tier) — Groq Whisper API, requires ``GROQ_API_KEY``. 10 - **openai** (paid) — OpenAI Whisper API, requires ``VOICE_TOOLS_OPENAI_KEY``. 11 - **mistral** — Mistral Voxtral Transcribe API, requires ``MISTRAL_API_KEY``. 12 - **xai** — xAI Grok STT API, requires ``XAI_API_KEY``. High accuracy, 13 Inverse Text Normalization, diarization, 21 languages. 14 15 Used by the messaging gateway to automatically transcribe voice messages 16 sent by users on Telegram, Discord, WhatsApp, Slack, and Signal. 17 18 Supported input formats: mp3, mp4, mpeg, mpga, m4a, wav, webm, ogg, aac 19 20 Usage:: 21 22 from tools.transcription_tools import transcribe_audio 23 24 result = transcribe_audio("/path/to/audio.ogg") 25 if result["success"]: 26 print(result["transcript"]) 27 """ 28 29 import logging 30 import os 31 import shlex 32 import shutil 33 import subprocess 34 import tempfile 35 from pathlib import Path 36 from typing import Optional, Dict, Any 37 from urllib.parse import urljoin 38 39 from utils import is_truthy_value 40 from tools.managed_tool_gateway import resolve_managed_tool_gateway 41 from tools.tool_backend_helpers import managed_nous_tools_enabled, resolve_openai_audio_api_key 42 43 logger = logging.getLogger(__name__) 44 45 def get_env_value(name, default=None): 46 """Read env values through the live config module. 47 48 Tests may monkeypatch and later restore ``hermes_cli.config.get_env_value`` 49 before this module is imported. Resolve the helper at call time so STT does 50 not keep a stale imported function for the rest of the test process. 51 """ 52 try: 53 from hermes_cli.config import get_env_value as _get_env_value 54 except ImportError: 55 return os.getenv(name, default) 56 value = _get_env_value(name) 57 return default if value is None else value 58 59 # --------------------------------------------------------------------------- 60 # Optional imports — graceful degradation 61 # --------------------------------------------------------------------------- 62 63 import importlib.util as _ilu 64 65 66 def _safe_find_spec(module_name: str) -> bool: 67 try: 68 return _ilu.find_spec(module_name) is not None 69 except (ImportError, ValueError): 70 return module_name in globals() or module_name in os.sys.modules 71 72 73 _HAS_FASTER_WHISPER = _safe_find_spec("faster_whisper") 74 _HAS_OPENAI = _safe_find_spec("openai") 75 _HAS_MISTRAL = _safe_find_spec("mistralai") 76 77 # --------------------------------------------------------------------------- 78 # Constants 79 # --------------------------------------------------------------------------- 80 81 DEFAULT_PROVIDER = "local" 82 DEFAULT_LOCAL_MODEL = "base" 83 DEFAULT_LOCAL_STT_LANGUAGE = "en" 84 DEFAULT_STT_MODEL = os.getenv("STT_OPENAI_MODEL", "whisper-1") 85 DEFAULT_GROQ_STT_MODEL = os.getenv("STT_GROQ_MODEL", "whisper-large-v3-turbo") 86 DEFAULT_MISTRAL_STT_MODEL = os.getenv("STT_MISTRAL_MODEL", "voxtral-mini-latest") 87 LOCAL_STT_COMMAND_ENV = "HERMES_LOCAL_STT_COMMAND" 88 LOCAL_STT_LANGUAGE_ENV = "HERMES_LOCAL_STT_LANGUAGE" 89 COMMON_LOCAL_BIN_DIRS = ("/opt/homebrew/bin", "/usr/local/bin") 90 91 GROQ_BASE_URL = os.getenv("GROQ_BASE_URL", "https://api.groq.com/openai/v1") 92 OPENAI_BASE_URL = os.getenv("STT_OPENAI_BASE_URL", "https://api.openai.com/v1") 93 XAI_STT_BASE_URL = os.getenv("XAI_STT_BASE_URL", "https://api.x.ai/v1") 94 95 SUPPORTED_FORMATS = {".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm", ".ogg", ".aac", ".flac"} 96 LOCAL_NATIVE_AUDIO_FORMATS = {".wav", ".aiff", ".aif"} 97 MAX_FILE_SIZE = 25 * 1024 * 1024 # 25 MB 98 99 # Known model sets for auto-correction 100 OPENAI_MODELS = {"whisper-1", "gpt-4o-mini-transcribe", "gpt-4o-transcribe"} 101 GROQ_MODELS = {"whisper-large-v3", "whisper-large-v3-turbo", "distil-whisper-large-v3-en"} 102 103 # Singleton for the local model — loaded once, reused across calls 104 _local_model: Optional[object] = None 105 _local_model_name: Optional[str] = None 106 107 # --------------------------------------------------------------------------- 108 # Config helpers 109 # --------------------------------------------------------------------------- 110 111 112 113 def _load_stt_config() -> dict: 114 """Load the ``stt`` section from user config, falling back to defaults.""" 115 try: 116 from hermes_cli.config import load_config 117 return load_config().get("stt", {}) 118 except Exception: 119 return {} 120 121 122 def is_stt_enabled(stt_config: Optional[dict] = None) -> bool: 123 """Return whether STT is enabled in config.""" 124 if stt_config is None: 125 stt_config = _load_stt_config() 126 enabled = stt_config.get("enabled", True) 127 return is_truthy_value(enabled, default=True) 128 129 130 def _has_openai_audio_backend() -> bool: 131 """Return True when OpenAI audio can use config credentials, env credentials, or the managed gateway.""" 132 try: 133 _resolve_openai_audio_client_config() 134 return True 135 except ValueError: 136 return False 137 138 139 def _find_binary(binary_name: str) -> Optional[str]: 140 """Find a local binary, checking common Homebrew/local prefixes as well as PATH.""" 141 for directory in COMMON_LOCAL_BIN_DIRS: 142 candidate = Path(directory) / binary_name 143 if candidate.exists() and os.access(candidate, os.X_OK): 144 return str(candidate) 145 return shutil.which(binary_name) 146 147 148 def _find_ffmpeg_binary() -> Optional[str]: 149 return _find_binary("ffmpeg") 150 151 152 def _find_whisper_binary() -> Optional[str]: 153 return _find_binary("whisper") 154 155 156 def _get_local_command_template() -> Optional[str]: 157 configured = os.getenv(LOCAL_STT_COMMAND_ENV, "").strip() 158 if configured: 159 return configured 160 161 whisper_binary = _find_whisper_binary() 162 if whisper_binary: 163 quoted_binary = shlex.quote(whisper_binary) 164 return ( 165 f"{quoted_binary} {{input_path}} --model {{model}} --output_format txt " 166 "--output_dir {output_dir} --language {language}" 167 ) 168 return None 169 170 171 def _has_local_command() -> bool: 172 return _get_local_command_template() is not None 173 174 175 def _normalize_local_model(model_name: Optional[str]) -> str: 176 """Return a valid faster-whisper model size, mapping cloud-only names to the default. 177 178 Cloud providers like OpenAI use names such as ``whisper-1`` which are not 179 valid for faster-whisper (which expects ``tiny``, ``base``, ``small``, 180 ``medium``, or ``large-v*``). When such a name is detected we fall back to 181 the default local model and emit a warning so the user knows what happened. 182 """ 183 if not model_name or model_name in OPENAI_MODELS or model_name in GROQ_MODELS: 184 if model_name and (model_name in OPENAI_MODELS or model_name in GROQ_MODELS): 185 logger.warning( 186 "STT model '%s' is a cloud-only name and cannot be used with the local " 187 "provider. Falling back to '%s'. Set stt.local.model to a valid " 188 "faster-whisper size (tiny, base, small, medium, large-v3).", 189 model_name, 190 DEFAULT_LOCAL_MODEL, 191 ) 192 return DEFAULT_LOCAL_MODEL 193 return model_name 194 195 196 def _normalize_local_command_model(model_name: Optional[str]) -> str: 197 return _normalize_local_model(model_name) 198 199 200 def _get_provider(stt_config: dict) -> str: 201 """Determine which STT provider to use. 202 203 When ``stt.provider`` is explicitly set in config, that choice is 204 honoured — no silent cloud fallback. When no provider is configured, 205 auto-detect tries: local > groq (free) > openai (paid). 206 """ 207 if not is_stt_enabled(stt_config): 208 return "none" 209 210 explicit = "provider" in stt_config 211 provider = stt_config.get("provider", DEFAULT_PROVIDER) 212 213 # --- Explicit provider: respect the user's choice ---------------------- 214 215 if explicit: 216 if provider == "local": 217 if _HAS_FASTER_WHISPER: 218 return "local" 219 if _has_local_command(): 220 return "local_command" 221 logger.warning( 222 "STT provider 'local' configured but unavailable " 223 "(install faster-whisper or set HERMES_LOCAL_STT_COMMAND)" 224 ) 225 return "none" 226 227 if provider == "local_command": 228 if _has_local_command(): 229 return "local_command" 230 if _HAS_FASTER_WHISPER: 231 logger.info("Local STT command unavailable, using local faster-whisper") 232 return "local" 233 logger.warning( 234 "STT provider 'local_command' configured but unavailable" 235 ) 236 return "none" 237 238 if provider == "groq": 239 if _HAS_OPENAI and get_env_value("GROQ_API_KEY"): 240 return "groq" 241 logger.warning( 242 "STT provider 'groq' configured but GROQ_API_KEY not set" 243 ) 244 return "none" 245 246 if provider == "openai": 247 if _HAS_OPENAI and _has_openai_audio_backend(): 248 return "openai" 249 logger.warning( 250 "STT provider 'openai' configured but no API key available" 251 ) 252 return "none" 253 254 if provider == "mistral": 255 if _HAS_MISTRAL and get_env_value("MISTRAL_API_KEY"): 256 return "mistral" 257 logger.warning( 258 "STT provider 'mistral' configured but mistralai package " 259 "not installed or MISTRAL_API_KEY not set" 260 ) 261 return "none" 262 263 if provider == "xai": 264 if get_env_value("XAI_API_KEY"): 265 return "xai" 266 logger.warning( 267 "STT provider 'xai' configured but XAI_API_KEY not set" 268 ) 269 return "none" 270 271 return provider # Unknown — let it fail downstream 272 273 # --- Auto-detect (no explicit provider): local > groq > openai > mistral > xai - 274 275 if _HAS_FASTER_WHISPER: 276 return "local" 277 if _has_local_command(): 278 return "local_command" 279 if _HAS_OPENAI and get_env_value("GROQ_API_KEY"): 280 logger.info("No local STT available, using Groq Whisper API") 281 return "groq" 282 if _HAS_OPENAI and _has_openai_audio_backend(): 283 logger.info("No local STT available, using OpenAI Whisper API") 284 return "openai" 285 if _HAS_MISTRAL and get_env_value("MISTRAL_API_KEY"): 286 logger.info("No local STT available, using Mistral Voxtral Transcribe API") 287 return "mistral" 288 if get_env_value("XAI_API_KEY"): 289 logger.info("No local STT available, using xAI Grok STT API") 290 return "xai" 291 return "none" 292 293 # --------------------------------------------------------------------------- 294 # Shared validation 295 # --------------------------------------------------------------------------- 296 297 298 def _validate_audio_file(file_path: str) -> Optional[Dict[str, Any]]: 299 """Validate the audio file. Returns an error dict or None if OK.""" 300 audio_path = Path(file_path) 301 302 if not audio_path.exists(): 303 return {"success": False, "transcript": "", "error": f"Audio file not found: {file_path}"} 304 if not audio_path.is_file(): 305 return {"success": False, "transcript": "", "error": f"Path is not a file: {file_path}"} 306 if audio_path.suffix.lower() not in SUPPORTED_FORMATS: 307 return { 308 "success": False, 309 "transcript": "", 310 "error": f"Unsupported format: {audio_path.suffix}. Supported: {', '.join(sorted(SUPPORTED_FORMATS))}", 311 } 312 try: 313 file_size = audio_path.stat().st_size 314 if file_size > MAX_FILE_SIZE: 315 return { 316 "success": False, 317 "transcript": "", 318 "error": f"File too large: {file_size / (1024*1024):.1f}MB (max {MAX_FILE_SIZE / (1024*1024):.0f}MB)", 319 } 320 except OSError as e: 321 return {"success": False, "transcript": "", "error": f"Failed to access file: {e}"} 322 323 return None 324 325 # --------------------------------------------------------------------------- 326 # Provider: local (faster-whisper) 327 # --------------------------------------------------------------------------- 328 329 330 # Substrings that identify a missing/unloadable CUDA runtime library. When 331 # ctranslate2 (the backend for faster-whisper) cannot dlopen one of these, the 332 # "auto" device picker has already committed to CUDA and the model can no 333 # longer be used — we fall back to CPU and reload. 334 # 335 # Deliberately narrow: we match on library-name tokens and dlopen phrasing so 336 # we DO NOT accidentally catch legitimate runtime failures like "CUDA out of 337 # memory" — those should surface to the user, not silently fall back to CPU 338 # (a 32GB audio clip on CPU at int8 isn't useful either). 339 _CUDA_LIB_ERROR_MARKERS = ( 340 "libcublas", 341 "libcudnn", 342 "libcudart", 343 "cannot be loaded", 344 "cannot open shared object", 345 "no kernel image is available", 346 "no CUDA-capable device", 347 "CUDA driver version is insufficient", 348 ) 349 350 351 def _looks_like_cuda_lib_error(exc: BaseException) -> bool: 352 """Heuristic: is this exception a missing/broken CUDA runtime library? 353 354 ctranslate2 raises plain RuntimeError with messages like 355 ``Library libcublas.so.12 is not found or cannot be loaded``. We want to 356 catch missing/unloadable shared libs and driver-mismatch errors, NOT 357 legitimate runtime failures ("CUDA out of memory", model bugs, etc.). 358 """ 359 msg = str(exc) 360 return any(marker in msg for marker in _CUDA_LIB_ERROR_MARKERS) 361 362 363 def _load_local_whisper_model(model_name: str): 364 """Load faster-whisper with graceful CUDA → CPU fallback. 365 366 faster-whisper's ``device="auto"`` picks CUDA when the ctranslate2 wheel 367 ships CUDA shared libs, even on hosts where the NVIDIA runtime 368 (``libcublas.so.12`` / ``libcudnn*``) isn't installed — common on WSL2 369 without CUDA-on-WSL, headless servers, and CPU-only developer machines. 370 On those hosts the load itself sometimes succeeds and the dlopen failure 371 only surfaces at first ``transcribe()`` call. 372 373 We try ``auto`` first (fast CUDA path when it works), and on any CUDA 374 library load failure fall back to CPU + int8. 375 """ 376 from faster_whisper import WhisperModel 377 try: 378 return WhisperModel(model_name, device="auto", compute_type="auto") 379 except Exception as exc: 380 if not _looks_like_cuda_lib_error(exc): 381 raise 382 logger.warning( 383 "faster-whisper CUDA load failed (%s) — falling back to CPU (int8). " 384 "Install the NVIDIA CUDA runtime (libcublas/libcudnn) to use GPU.", 385 exc, 386 ) 387 return WhisperModel(model_name, device="cpu", compute_type="int8") 388 389 390 def _transcribe_local(file_path: str, model_name: str) -> Dict[str, Any]: 391 """Transcribe using faster-whisper (local, free).""" 392 global _local_model, _local_model_name 393 394 if not _HAS_FASTER_WHISPER: 395 return {"success": False, "transcript": "", "error": "faster-whisper not installed"} 396 397 try: 398 # Lazy-load the model (downloads on first use, ~150 MB for 'base') 399 if _local_model is None or _local_model_name != model_name: 400 logger.info("Loading faster-whisper model '%s' (first load downloads the model)...", model_name) 401 _local_model = _load_local_whisper_model(model_name) 402 _local_model_name = model_name 403 404 # Language: config.yaml (stt.local.language) > env var > auto-detect. 405 _forced_lang = ( 406 _load_stt_config().get("local", {}).get("language") 407 or os.getenv(LOCAL_STT_LANGUAGE_ENV) 408 or None 409 ) 410 transcribe_kwargs = {"beam_size": 5} 411 if _forced_lang: 412 transcribe_kwargs["language"] = _forced_lang 413 414 try: 415 segments, info = _local_model.transcribe(file_path, **transcribe_kwargs) 416 transcript = " ".join(segment.text.strip() for segment in segments) 417 except Exception as exc: 418 # CUDA runtime libs sometimes only fail at dlopen-on-first-use, 419 # AFTER the model loaded successfully. Evict the broken cached 420 # model, reload on CPU, retry once. Without this the module- 421 # global `_local_model` is poisoned and every subsequent voice 422 # message on this process fails identically until restart. 423 if not _looks_like_cuda_lib_error(exc): 424 raise 425 logger.warning( 426 "faster-whisper CUDA runtime failed mid-transcribe (%s) — " 427 "evicting cached model and retrying on CPU (int8).", 428 exc, 429 ) 430 _local_model = None 431 _local_model_name = None 432 from faster_whisper import WhisperModel 433 _local_model = WhisperModel(model_name, device="cpu", compute_type="int8") 434 _local_model_name = model_name 435 segments, info = _local_model.transcribe(file_path, **transcribe_kwargs) 436 transcript = " ".join(segment.text.strip() for segment in segments) 437 438 logger.info( 439 "Transcribed %s via local whisper (%s, lang=%s, %.1fs audio)", 440 Path(file_path).name, model_name, info.language, info.duration, 441 ) 442 443 return {"success": True, "transcript": transcript, "provider": "local"} 444 445 except Exception as e: 446 logger.error("Local transcription failed: %s", e, exc_info=True) 447 return {"success": False, "transcript": "", "error": f"Local transcription failed: {e}"} 448 449 450 def _prepare_local_audio(file_path: str, work_dir: str) -> tuple[Optional[str], Optional[str]]: 451 """Normalize audio for local CLI STT when needed.""" 452 audio_path = Path(file_path) 453 if audio_path.suffix.lower() in LOCAL_NATIVE_AUDIO_FORMATS: 454 return file_path, None 455 456 ffmpeg = _find_ffmpeg_binary() 457 if not ffmpeg: 458 return None, "Local STT fallback requires ffmpeg for non-WAV inputs, but ffmpeg was not found" 459 460 converted_path = os.path.join(work_dir, f"{audio_path.stem}.wav") 461 command = [ffmpeg, "-y", "-i", file_path, converted_path] 462 463 try: 464 subprocess.run(command, check=True, capture_output=True, text=True) 465 return converted_path, None 466 except subprocess.CalledProcessError as e: 467 details = e.stderr.strip() or e.stdout.strip() or str(e) 468 logger.error("ffmpeg conversion failed for %s: %s", file_path, details) 469 return None, f"Failed to convert audio for local STT: {details}" 470 471 472 def _transcribe_local_command(file_path: str, model_name: str) -> Dict[str, Any]: 473 """Run the configured local STT command template and read back a .txt transcript.""" 474 command_template = _get_local_command_template() 475 if not command_template: 476 return { 477 "success": False, 478 "transcript": "", 479 "error": ( 480 f"{LOCAL_STT_COMMAND_ENV} not configured and no local whisper binary was found" 481 ), 482 } 483 484 # Language: config.yaml (stt.local.language) > env var > "en" default. 485 language = ( 486 _load_stt_config().get("local", {}).get("language") 487 or os.getenv(LOCAL_STT_LANGUAGE_ENV) 488 or DEFAULT_LOCAL_STT_LANGUAGE 489 ) 490 normalized_model = _normalize_local_command_model(model_name) 491 492 try: 493 with tempfile.TemporaryDirectory(prefix="hermes-local-stt-") as output_dir: 494 prepared_input, prep_error = _prepare_local_audio(file_path, output_dir) 495 if prep_error: 496 return {"success": False, "transcript": "", "error": prep_error} 497 498 command = command_template.format( 499 input_path=shlex.quote(prepared_input), 500 output_dir=shlex.quote(output_dir), 501 language=shlex.quote(language), 502 model=shlex.quote(normalized_model), 503 ) 504 subprocess.run(command, shell=True, check=True, capture_output=True, text=True) 505 506 txt_files = sorted(Path(output_dir).glob("*.txt")) 507 if not txt_files: 508 return { 509 "success": False, 510 "transcript": "", 511 "error": "Local STT command completed but did not produce a .txt transcript", 512 } 513 514 transcript_text = txt_files[0].read_text(encoding="utf-8").strip() 515 logger.info( 516 "Transcribed %s via local STT command (%s, %d chars)", 517 Path(file_path).name, 518 normalized_model, 519 len(transcript_text), 520 ) 521 return {"success": True, "transcript": transcript_text, "provider": "local_command"} 522 523 except KeyError as e: 524 return { 525 "success": False, 526 "transcript": "", 527 "error": f"Invalid {LOCAL_STT_COMMAND_ENV} template, missing placeholder: {e}", 528 } 529 except subprocess.CalledProcessError as e: 530 details = e.stderr.strip() or e.stdout.strip() or str(e) 531 logger.error("Local STT command failed for %s: %s", file_path, details) 532 return {"success": False, "transcript": "", "error": f"Local STT failed: {details}"} 533 except Exception as e: 534 logger.error("Unexpected error during local command transcription: %s", e, exc_info=True) 535 return {"success": False, "transcript": "", "error": f"Local transcription failed: {e}"} 536 537 # --------------------------------------------------------------------------- 538 # Provider: groq (Whisper API — free tier) 539 # --------------------------------------------------------------------------- 540 541 542 def _transcribe_groq(file_path: str, model_name: str) -> Dict[str, Any]: 543 """Transcribe using Groq Whisper API (free tier available).""" 544 api_key = get_env_value("GROQ_API_KEY") 545 if not api_key: 546 return {"success": False, "transcript": "", "error": "GROQ_API_KEY not set"} 547 548 if not _HAS_OPENAI: 549 return {"success": False, "transcript": "", "error": "openai package not installed"} 550 551 # Auto-correct model if caller passed an OpenAI-only model 552 if model_name in OPENAI_MODELS: 553 logger.info("Model %s not available on Groq, using %s", model_name, DEFAULT_GROQ_STT_MODEL) 554 model_name = DEFAULT_GROQ_STT_MODEL 555 556 try: 557 from openai import OpenAI, APIError, APIConnectionError, APITimeoutError 558 client = OpenAI(api_key=api_key, base_url=GROQ_BASE_URL, timeout=30, max_retries=0) 559 try: 560 with open(file_path, "rb") as audio_file: 561 transcription = client.audio.transcriptions.create( 562 model=model_name, 563 file=audio_file, 564 response_format="text", 565 ) 566 567 transcript_text = str(transcription).strip() 568 logger.info("Transcribed %s via Groq API (%s, %d chars)", 569 Path(file_path).name, model_name, len(transcript_text)) 570 571 return {"success": True, "transcript": transcript_text, "provider": "groq"} 572 finally: 573 close = getattr(client, "close", None) 574 if callable(close): 575 close() 576 577 except PermissionError: 578 return {"success": False, "transcript": "", "error": f"Permission denied: {file_path}"} 579 except APIConnectionError as e: 580 return {"success": False, "transcript": "", "error": f"Connection error: {e}"} 581 except APITimeoutError as e: 582 return {"success": False, "transcript": "", "error": f"Request timeout: {e}"} 583 except APIError as e: 584 return {"success": False, "transcript": "", "error": f"API error: {e}"} 585 except Exception as e: 586 logger.error("Groq transcription failed: %s", e, exc_info=True) 587 return {"success": False, "transcript": "", "error": f"Transcription failed: {e}"} 588 589 # --------------------------------------------------------------------------- 590 # Provider: openai (Whisper API) 591 # --------------------------------------------------------------------------- 592 593 594 def _transcribe_openai(file_path: str, model_name: str) -> Dict[str, Any]: 595 """Transcribe using OpenAI Whisper API (paid).""" 596 try: 597 api_key, base_url = _resolve_openai_audio_client_config() 598 except ValueError as exc: 599 return { 600 "success": False, 601 "transcript": "", 602 "error": str(exc), 603 } 604 605 if not _HAS_OPENAI: 606 return {"success": False, "transcript": "", "error": "openai package not installed"} 607 608 # Auto-correct model if caller passed a Groq-only model 609 if model_name in GROQ_MODELS: 610 logger.info("Model %s not available on OpenAI, using %s", model_name, DEFAULT_STT_MODEL) 611 model_name = DEFAULT_STT_MODEL 612 613 try: 614 from openai import OpenAI, APIError, APIConnectionError, APITimeoutError 615 client = OpenAI(api_key=api_key, base_url=base_url, timeout=30, max_retries=0) 616 try: 617 with open(file_path, "rb") as audio_file: 618 transcription = client.audio.transcriptions.create( 619 model=model_name, 620 file=audio_file, 621 response_format="text" if model_name == "whisper-1" else "json", 622 ) 623 624 transcript_text = _extract_transcript_text(transcription) 625 logger.info("Transcribed %s via OpenAI API (%s, %d chars)", 626 Path(file_path).name, model_name, len(transcript_text)) 627 628 return {"success": True, "transcript": transcript_text, "provider": "openai"} 629 finally: 630 close = getattr(client, "close", None) 631 if callable(close): 632 close() 633 634 except PermissionError: 635 return {"success": False, "transcript": "", "error": f"Permission denied: {file_path}"} 636 except APIConnectionError as e: 637 return {"success": False, "transcript": "", "error": f"Connection error: {e}"} 638 except APITimeoutError as e: 639 return {"success": False, "transcript": "", "error": f"Request timeout: {e}"} 640 except APIError as e: 641 return {"success": False, "transcript": "", "error": f"API error: {e}"} 642 except Exception as e: 643 logger.error("OpenAI transcription failed: %s", e, exc_info=True) 644 return {"success": False, "transcript": "", "error": f"Transcription failed: {e}"} 645 646 # --------------------------------------------------------------------------- 647 # Provider: mistral (Voxtral Transcribe API) 648 # --------------------------------------------------------------------------- 649 650 651 def _transcribe_mistral(file_path: str, model_name: str) -> Dict[str, Any]: 652 """Transcribe using Mistral Voxtral Transcribe API. 653 654 Uses the ``mistralai`` Python SDK to call ``/v1/audio/transcriptions``. 655 Requires ``MISTRAL_API_KEY`` environment variable. 656 """ 657 api_key = get_env_value("MISTRAL_API_KEY") 658 if not api_key: 659 return {"success": False, "transcript": "", "error": "MISTRAL_API_KEY not set"} 660 661 try: 662 from mistralai.client import Mistral 663 664 with Mistral(api_key=api_key) as client: 665 with open(file_path, "rb") as audio_file: 666 result = client.audio.transcriptions.complete( 667 model=model_name, 668 file={"content": audio_file, "file_name": Path(file_path).name}, 669 ) 670 671 transcript_text = _extract_transcript_text(result) 672 logger.info( 673 "Transcribed %s via Mistral API (%s, %d chars)", 674 Path(file_path).name, model_name, len(transcript_text), 675 ) 676 return {"success": True, "transcript": transcript_text, "provider": "mistral"} 677 678 except PermissionError: 679 return {"success": False, "transcript": "", "error": f"Permission denied: {file_path}"} 680 except Exception as e: 681 logger.error("Mistral transcription failed: %s", e, exc_info=True) 682 return {"success": False, "transcript": "", "error": f"Mistral transcription failed: {type(e).__name__}"} 683 684 685 # --------------------------------------------------------------------------- 686 # Provider: xAI (Grok STT API) 687 # --------------------------------------------------------------------------- 688 689 690 def _transcribe_xai(file_path: str, model_name: str) -> Dict[str, Any]: 691 """Transcribe using xAI Grok STT API. 692 693 Uses the ``POST /v1/stt`` REST endpoint with multipart/form-data. 694 Supports Inverse Text Normalization, diarization, and word-level timestamps. 695 Requires ``XAI_API_KEY`` environment variable. 696 """ 697 api_key = get_env_value("XAI_API_KEY") 698 if not api_key: 699 return {"success": False, "transcript": "", "error": "XAI_API_KEY not set"} 700 701 stt_config = _load_stt_config() 702 xai_config = stt_config.get("xai", {}) 703 base_url = str( 704 xai_config.get("base_url") 705 or get_env_value("XAI_STT_BASE_URL") 706 or XAI_STT_BASE_URL 707 ).strip().rstrip("/") 708 language = str( 709 xai_config.get("language") 710 or os.getenv("HERMES_LOCAL_STT_LANGUAGE") 711 or DEFAULT_LOCAL_STT_LANGUAGE 712 ).strip() 713 # .get("format", True) already defaults to True when the key is absent; 714 # is_truthy_value only normalizes truthy/falsy strings from config. 715 use_format = is_truthy_value(xai_config.get("format", True)) 716 use_diarize = is_truthy_value(xai_config.get("diarize", False)) 717 718 try: 719 import requests 720 from tools.xai_http import hermes_xai_user_agent 721 722 data: Dict[str, str] = {} 723 if language: 724 data["language"] = language 725 if use_format: 726 data["format"] = "true" 727 if use_diarize: 728 data["diarize"] = "true" 729 730 with open(file_path, "rb") as audio_file: 731 response = requests.post( 732 f"{base_url}/stt", 733 headers={ 734 "Authorization": f"Bearer {api_key}", 735 "User-Agent": hermes_xai_user_agent(), 736 }, 737 files={ 738 "file": (Path(file_path).name, audio_file), 739 }, 740 data=data, 741 timeout=120, 742 ) 743 744 if response.status_code != 200: 745 detail = "" 746 try: 747 err_body = response.json() 748 detail = err_body.get("error", {}).get("message", "") or response.text[:300] 749 except Exception: 750 detail = response.text[:300] 751 return { 752 "success": False, 753 "transcript": "", 754 "error": f"xAI STT API error (HTTP {response.status_code}): {detail}", 755 } 756 757 result = response.json() 758 transcript_text = result.get("text", "").strip() 759 760 if not transcript_text: 761 return { 762 "success": False, 763 "transcript": "", 764 "error": "xAI STT returned empty transcript", 765 } 766 767 logger.info( 768 "Transcribed %s via xAI Grok STT (lang=%s, %.1fs audio, %d chars)", 769 Path(file_path).name, 770 result.get("language", language), 771 result.get("duration", 0), 772 len(transcript_text), 773 ) 774 775 return {"success": True, "transcript": transcript_text, "provider": "xai"} 776 777 except PermissionError: 778 return {"success": False, "transcript": "", "error": f"Permission denied: {file_path}"} 779 except Exception as e: 780 logger.error("xAI STT transcription failed: %s", e, exc_info=True) 781 return {"success": False, "transcript": "", "error": f"xAI STT transcription failed: {e}"} 782 783 784 # --------------------------------------------------------------------------- 785 # Public API 786 # --------------------------------------------------------------------------- 787 788 789 def transcribe_audio(file_path: str, model: Optional[str] = None) -> Dict[str, Any]: 790 """ 791 Transcribe an audio file using the configured STT provider. 792 793 Provider priority: 794 1. User config (``stt.provider`` in config.yaml) 795 2. Auto-detect: local faster-whisper (free) > Groq (free tier) > OpenAI (paid) 796 797 Args: 798 file_path: Absolute path to the audio file to transcribe. 799 model: Override the model. If None, uses config or provider default. 800 801 Returns: 802 dict with keys: 803 - "success" (bool): Whether transcription succeeded 804 - "transcript" (str): The transcribed text (empty on failure) 805 - "error" (str, optional): Error message if success is False 806 - "provider" (str, optional): Which provider was used 807 """ 808 # Validate input 809 error = _validate_audio_file(file_path) 810 if error: 811 return error 812 813 # Load config and determine provider 814 stt_config = _load_stt_config() 815 if not is_stt_enabled(stt_config): 816 return { 817 "success": False, 818 "transcript": "", 819 "error": "STT is disabled in config.yaml (stt.enabled: false).", 820 } 821 822 provider = _get_provider(stt_config) 823 824 if provider == "local": 825 local_cfg = stt_config.get("local", {}) 826 model_name = _normalize_local_model( 827 model or local_cfg.get("model", DEFAULT_LOCAL_MODEL) 828 ) 829 return _transcribe_local(file_path, model_name) 830 831 if provider == "local_command": 832 local_cfg = stt_config.get("local", {}) 833 model_name = _normalize_local_command_model( 834 model or local_cfg.get("model", DEFAULT_LOCAL_MODEL) 835 ) 836 return _transcribe_local_command(file_path, model_name) 837 838 if provider == "groq": 839 model_name = model or DEFAULT_GROQ_STT_MODEL 840 return _transcribe_groq(file_path, model_name) 841 842 if provider == "openai": 843 openai_cfg = stt_config.get("openai", {}) 844 model_name = model or openai_cfg.get("model", DEFAULT_STT_MODEL) 845 return _transcribe_openai(file_path, model_name) 846 847 if provider == "mistral": 848 mistral_cfg = stt_config.get("mistral", {}) 849 model_name = model or mistral_cfg.get("model", DEFAULT_MISTRAL_STT_MODEL) 850 return _transcribe_mistral(file_path, model_name) 851 852 if provider == "xai": 853 # xAI Grok STT doesn't use a model parameter — pass through for logging 854 model_name = model or "grok-stt" 855 return _transcribe_xai(file_path, model_name) 856 857 # No provider available 858 return { 859 "success": False, 860 "transcript": "", 861 "error": ( 862 "No STT provider available. Install faster-whisper for free local " 863 f"transcription, configure {LOCAL_STT_COMMAND_ENV} or install a local whisper CLI, " 864 "set GROQ_API_KEY for free Groq Whisper, set MISTRAL_API_KEY for Mistral " 865 "Voxtral Transcribe, set XAI_API_KEY for xAI Grok STT, or set VOICE_TOOLS_OPENAI_KEY " 866 "or OPENAI_API_KEY for the OpenAI Whisper API." 867 ), 868 } 869 870 871 def _resolve_openai_audio_client_config() -> tuple[str, str]: 872 """Return direct OpenAI audio config or a managed gateway fallback.""" 873 stt_config = _load_stt_config() 874 openai_cfg = stt_config.get("openai", {}) 875 cfg_api_key = openai_cfg.get("api_key", "") 876 cfg_base_url = openai_cfg.get("base_url", "") 877 if cfg_api_key: 878 return cfg_api_key, (cfg_base_url or OPENAI_BASE_URL) 879 880 direct_api_key = resolve_openai_audio_api_key() 881 if direct_api_key: 882 return direct_api_key, OPENAI_BASE_URL 883 884 managed_gateway = resolve_managed_tool_gateway("openai-audio") 885 if managed_gateway is None: 886 message = "Neither stt.openai.api_key in config nor VOICE_TOOLS_OPENAI_KEY/OPENAI_API_KEY is set" 887 if managed_nous_tools_enabled(): 888 message += ", and the managed OpenAI audio gateway is unavailable" 889 raise ValueError(message) 890 891 return managed_gateway.nous_user_token, urljoin( 892 f"{managed_gateway.gateway_origin.rstrip('/')}/", "v1" 893 ) 894 895 896 def _extract_transcript_text(transcription: Any) -> str: 897 """Normalize text and JSON transcription responses to a plain string.""" 898 if isinstance(transcription, str): 899 return transcription.strip() 900 901 if hasattr(transcription, "text"): 902 value = getattr(transcription, "text") 903 if isinstance(value, str): 904 return value.strip() 905 906 if isinstance(transcription, dict): 907 value = transcription.get("text") 908 if isinstance(value, str): 909 return value.strip() 910 911 return str(transcription).strip()