Cradicle Explorer

/ tools / transcription_tools.py
transcription_tools.py
  1  #!/usr/bin/env python3
  2  """
  3  Transcription Tools Module
  4  
  5  Provides speech-to-text transcription with six providers:
  6  
  7    - **local** (default, free) — faster-whisper running locally, no API key needed.
  8      Auto-downloads the model (~150 MB for ``base``) on first use.
  9    - **groq** (free tier) — Groq Whisper API, requires ``GROQ_API_KEY``.
 10    - **openai** (paid) — OpenAI Whisper API, requires ``VOICE_TOOLS_OPENAI_KEY``.
 11    - **mistral** — Mistral Voxtral Transcribe API, requires ``MISTRAL_API_KEY``.
 12    - **xai** — xAI Grok STT API, requires ``XAI_API_KEY``. High accuracy,
 13      Inverse Text Normalization, diarization, 21 languages.
 14  
 15  Used by the messaging gateway to automatically transcribe voice messages
 16  sent by users on Telegram, Discord, WhatsApp, Slack, and Signal.
 17  
 18  Supported input formats: mp3, mp4, mpeg, mpga, m4a, wav, webm, ogg, aac
 19  
 20  Usage::
 21  
 22      from tools.transcription_tools import transcribe_audio
 23  
 24      result = transcribe_audio("/path/to/audio.ogg")
 25      if result["success"]:
 26          print(result["transcript"])
 27  """
 28  
 29  import logging
 30  import os
 31  import shlex
 32  import shutil
 33  import subprocess
 34  import tempfile
 35  from pathlib import Path
 36  from typing import Optional, Dict, Any
 37  from urllib.parse import urljoin
 38  
 39  from utils import is_truthy_value
 40  from tools.managed_tool_gateway import resolve_managed_tool_gateway
 41  from tools.tool_backend_helpers import managed_nous_tools_enabled, resolve_openai_audio_api_key
 42  
 43  logger = logging.getLogger(__name__)
 44  
 45  def get_env_value(name, default=None):
 46      """Read env values through the live config module.
 47  
 48      Tests may monkeypatch and later restore ``hermes_cli.config.get_env_value``
 49      before this module is imported. Resolve the helper at call time so STT does
 50      not keep a stale imported function for the rest of the test process.
 51      """
 52      try:
 53          from hermes_cli.config import get_env_value as _get_env_value
 54      except ImportError:
 55          return os.getenv(name, default)
 56      value = _get_env_value(name)
 57      return default if value is None else value
 58  
 59  # ---------------------------------------------------------------------------
 60  # Optional imports — graceful degradation
 61  # ---------------------------------------------------------------------------
 62  
 63  import importlib.util as _ilu
 64  
 65  
 66  def _safe_find_spec(module_name: str) -> bool:
 67      try:
 68          return _ilu.find_spec(module_name) is not None
 69      except (ImportError, ValueError):
 70          return module_name in globals() or module_name in os.sys.modules
 71  
 72  
 73  _HAS_FASTER_WHISPER = _safe_find_spec("faster_whisper")
 74  _HAS_OPENAI = _safe_find_spec("openai")
 75  _HAS_MISTRAL = _safe_find_spec("mistralai")
 76  
 77  # ---------------------------------------------------------------------------
 78  # Constants
 79  # ---------------------------------------------------------------------------
 80  
 81  DEFAULT_PROVIDER = "local"
 82  DEFAULT_LOCAL_MODEL = "base"
 83  DEFAULT_LOCAL_STT_LANGUAGE = "en"
 84  DEFAULT_STT_MODEL = os.getenv("STT_OPENAI_MODEL", "whisper-1")
 85  DEFAULT_GROQ_STT_MODEL = os.getenv("STT_GROQ_MODEL", "whisper-large-v3-turbo")
 86  DEFAULT_MISTRAL_STT_MODEL = os.getenv("STT_MISTRAL_MODEL", "voxtral-mini-latest")
 87  LOCAL_STT_COMMAND_ENV = "HERMES_LOCAL_STT_COMMAND"
 88  LOCAL_STT_LANGUAGE_ENV = "HERMES_LOCAL_STT_LANGUAGE"
 89  COMMON_LOCAL_BIN_DIRS = ("/opt/homebrew/bin", "/usr/local/bin")
 90  
 91  GROQ_BASE_URL = os.getenv("GROQ_BASE_URL", "https://api.groq.com/openai/v1")
 92  OPENAI_BASE_URL = os.getenv("STT_OPENAI_BASE_URL", "https://api.openai.com/v1")
 93  XAI_STT_BASE_URL = os.getenv("XAI_STT_BASE_URL", "https://api.x.ai/v1")
 94  
 95  SUPPORTED_FORMATS = {".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm", ".ogg", ".aac", ".flac"}
 96  LOCAL_NATIVE_AUDIO_FORMATS = {".wav", ".aiff", ".aif"}
 97  MAX_FILE_SIZE = 25 * 1024 * 1024  # 25 MB
 98  
 99  # Known model sets for auto-correction
100  OPENAI_MODELS = {"whisper-1", "gpt-4o-mini-transcribe", "gpt-4o-transcribe"}
101  GROQ_MODELS = {"whisper-large-v3", "whisper-large-v3-turbo", "distil-whisper-large-v3-en"}
102  
103  # Singleton for the local model — loaded once, reused across calls
104  _local_model: Optional[object] = None
105  _local_model_name: Optional[str] = None
106  
107  # ---------------------------------------------------------------------------
108  # Config helpers
109  # ---------------------------------------------------------------------------
110  
111  
112  
113  def _load_stt_config() -> dict:
114      """Load the ``stt`` section from user config, falling back to defaults."""
115      try:
116          from hermes_cli.config import load_config
117          return load_config().get("stt", {})
118      except Exception:
119          return {}
120  
121  
122  def is_stt_enabled(stt_config: Optional[dict] = None) -> bool:
123      """Return whether STT is enabled in config."""
124      if stt_config is None:
125          stt_config = _load_stt_config()
126      enabled = stt_config.get("enabled", True)
127      return is_truthy_value(enabled, default=True)
128  
129  
130  def _has_openai_audio_backend() -> bool:
131      """Return True when OpenAI audio can use config credentials, env credentials, or the managed gateway."""
132      try:
133          _resolve_openai_audio_client_config()
134          return True
135      except ValueError:
136          return False
137  
138  
139  def _find_binary(binary_name: str) -> Optional[str]:
140      """Find a local binary, checking common Homebrew/local prefixes as well as PATH."""
141      for directory in COMMON_LOCAL_BIN_DIRS:
142          candidate = Path(directory) / binary_name
143          if candidate.exists() and os.access(candidate, os.X_OK):
144              return str(candidate)
145      return shutil.which(binary_name)
146  
147  
148  def _find_ffmpeg_binary() -> Optional[str]:
149      return _find_binary("ffmpeg")
150  
151  
152  def _find_whisper_binary() -> Optional[str]:
153      return _find_binary("whisper")
154  
155  
156  def _get_local_command_template() -> Optional[str]:
157      configured = os.getenv(LOCAL_STT_COMMAND_ENV, "").strip()
158      if configured:
159          return configured
160  
161      whisper_binary = _find_whisper_binary()
162      if whisper_binary:
163          quoted_binary = shlex.quote(whisper_binary)
164          return (
165              f"{quoted_binary} {{input_path}} --model {{model}} --output_format txt "
166              "--output_dir {output_dir} --language {language}"
167          )
168      return None
169  
170  
171  def _has_local_command() -> bool:
172      return _get_local_command_template() is not None
173  
174  
175  def _normalize_local_model(model_name: Optional[str]) -> str:
176      """Return a valid faster-whisper model size, mapping cloud-only names to the default.
177  
178      Cloud providers like OpenAI use names such as ``whisper-1`` which are not
179      valid for faster-whisper (which expects ``tiny``, ``base``, ``small``,
180      ``medium``, or ``large-v*``).  When such a name is detected we fall back to
181      the default local model and emit a warning so the user knows what happened.
182      """
183      if not model_name or model_name in OPENAI_MODELS or model_name in GROQ_MODELS:
184          if model_name and (model_name in OPENAI_MODELS or model_name in GROQ_MODELS):
185              logger.warning(
186                  "STT model '%s' is a cloud-only name and cannot be used with the local "
187                  "provider. Falling back to '%s'. Set stt.local.model to a valid "
188                  "faster-whisper size (tiny, base, small, medium, large-v3).",
189                  model_name,
190                  DEFAULT_LOCAL_MODEL,
191              )
192          return DEFAULT_LOCAL_MODEL
193      return model_name
194  
195  
196  def _normalize_local_command_model(model_name: Optional[str]) -> str:
197      return _normalize_local_model(model_name)
198  
199  
200  def _get_provider(stt_config: dict) -> str:
201      """Determine which STT provider to use.
202  
203      When ``stt.provider`` is explicitly set in config, that choice is
204      honoured — no silent cloud fallback.  When no provider is configured,
205      auto-detect tries: local > groq (free) > openai (paid).
206      """
207      if not is_stt_enabled(stt_config):
208          return "none"
209  
210      explicit = "provider" in stt_config
211      provider = stt_config.get("provider", DEFAULT_PROVIDER)
212  
213      # --- Explicit provider: respect the user's choice ----------------------
214  
215      if explicit:
216          if provider == "local":
217              if _HAS_FASTER_WHISPER:
218                  return "local"
219              if _has_local_command():
220                  return "local_command"
221              logger.warning(
222                  "STT provider 'local' configured but unavailable "
223                  "(install faster-whisper or set HERMES_LOCAL_STT_COMMAND)"
224              )
225              return "none"
226  
227          if provider == "local_command":
228              if _has_local_command():
229                  return "local_command"
230              if _HAS_FASTER_WHISPER:
231                  logger.info("Local STT command unavailable, using local faster-whisper")
232                  return "local"
233              logger.warning(
234                  "STT provider 'local_command' configured but unavailable"
235              )
236              return "none"
237  
238          if provider == "groq":
239              if _HAS_OPENAI and get_env_value("GROQ_API_KEY"):
240                  return "groq"
241              logger.warning(
242                  "STT provider 'groq' configured but GROQ_API_KEY not set"
243              )
244              return "none"
245  
246          if provider == "openai":
247              if _HAS_OPENAI and _has_openai_audio_backend():
248                  return "openai"
249              logger.warning(
250                  "STT provider 'openai' configured but no API key available"
251              )
252              return "none"
253  
254          if provider == "mistral":
255              if _HAS_MISTRAL and get_env_value("MISTRAL_API_KEY"):
256                  return "mistral"
257              logger.warning(
258                  "STT provider 'mistral' configured but mistralai package "
259                  "not installed or MISTRAL_API_KEY not set"
260              )
261              return "none"
262  
263          if provider == "xai":
264              if get_env_value("XAI_API_KEY"):
265                  return "xai"
266              logger.warning(
267                  "STT provider 'xai' configured but XAI_API_KEY not set"
268              )
269              return "none"
270  
271          return provider  # Unknown — let it fail downstream
272  
273      # --- Auto-detect (no explicit provider): local > groq > openai > mistral > xai -
274  
275      if _HAS_FASTER_WHISPER:
276          return "local"
277      if _has_local_command():
278          return "local_command"
279      if _HAS_OPENAI and get_env_value("GROQ_API_KEY"):
280          logger.info("No local STT available, using Groq Whisper API")
281          return "groq"
282      if _HAS_OPENAI and _has_openai_audio_backend():
283          logger.info("No local STT available, using OpenAI Whisper API")
284          return "openai"
285      if _HAS_MISTRAL and get_env_value("MISTRAL_API_KEY"):
286          logger.info("No local STT available, using Mistral Voxtral Transcribe API")
287          return "mistral"
288      if get_env_value("XAI_API_KEY"):
289          logger.info("No local STT available, using xAI Grok STT API")
290          return "xai"
291      return "none"
292  
293  # ---------------------------------------------------------------------------
294  # Shared validation
295  # ---------------------------------------------------------------------------
296  
297  
298  def _validate_audio_file(file_path: str) -> Optional[Dict[str, Any]]:
299      """Validate the audio file.  Returns an error dict or None if OK."""
300      audio_path = Path(file_path)
301  
302      if not audio_path.exists():
303          return {"success": False, "transcript": "", "error": f"Audio file not found: {file_path}"}
304      if not audio_path.is_file():
305          return {"success": False, "transcript": "", "error": f"Path is not a file: {file_path}"}
306      if audio_path.suffix.lower() not in SUPPORTED_FORMATS:
307          return {
308              "success": False,
309              "transcript": "",
310              "error": f"Unsupported format: {audio_path.suffix}. Supported: {', '.join(sorted(SUPPORTED_FORMATS))}",
311          }
312      try:
313          file_size = audio_path.stat().st_size
314          if file_size > MAX_FILE_SIZE:
315              return {
316                  "success": False,
317                  "transcript": "",
318                  "error": f"File too large: {file_size / (1024*1024):.1f}MB (max {MAX_FILE_SIZE / (1024*1024):.0f}MB)",
319              }
320      except OSError as e:
321          return {"success": False, "transcript": "", "error": f"Failed to access file: {e}"}
322  
323      return None
324  
325  # ---------------------------------------------------------------------------
326  # Provider: local (faster-whisper)
327  # ---------------------------------------------------------------------------
328  
329  
330  # Substrings that identify a missing/unloadable CUDA runtime library.  When
331  # ctranslate2 (the backend for faster-whisper) cannot dlopen one of these, the
332  # "auto" device picker has already committed to CUDA and the model can no
333  # longer be used — we fall back to CPU and reload.
334  #
335  # Deliberately narrow: we match on library-name tokens and dlopen phrasing so
336  # we DO NOT accidentally catch legitimate runtime failures like "CUDA out of
337  # memory" — those should surface to the user, not silently fall back to CPU
338  # (a 32GB audio clip on CPU at int8 isn't useful either).
339  _CUDA_LIB_ERROR_MARKERS = (
340      "libcublas",
341      "libcudnn",
342      "libcudart",
343      "cannot be loaded",
344      "cannot open shared object",
345      "no kernel image is available",
346      "no CUDA-capable device",
347      "CUDA driver version is insufficient",
348  )
349  
350  
351  def _looks_like_cuda_lib_error(exc: BaseException) -> bool:
352      """Heuristic: is this exception a missing/broken CUDA runtime library?
353  
354      ctranslate2 raises plain RuntimeError with messages like
355      ``Library libcublas.so.12 is not found or cannot be loaded``.  We want to
356      catch missing/unloadable shared libs and driver-mismatch errors, NOT
357      legitimate runtime failures ("CUDA out of memory", model bugs, etc.).
358      """
359      msg = str(exc)
360      return any(marker in msg for marker in _CUDA_LIB_ERROR_MARKERS)
361  
362  
363  def _load_local_whisper_model(model_name: str):
364      """Load faster-whisper with graceful CUDA → CPU fallback.
365  
366      faster-whisper's ``device="auto"`` picks CUDA when the ctranslate2 wheel
367      ships CUDA shared libs, even on hosts where the NVIDIA runtime
368      (``libcublas.so.12`` / ``libcudnn*``) isn't installed — common on WSL2
369      without CUDA-on-WSL, headless servers, and CPU-only developer machines.
370      On those hosts the load itself sometimes succeeds and the dlopen failure
371      only surfaces at first ``transcribe()`` call.
372  
373      We try ``auto`` first (fast CUDA path when it works), and on any CUDA
374      library load failure fall back to CPU + int8.
375      """
376      from faster_whisper import WhisperModel
377      try:
378          return WhisperModel(model_name, device="auto", compute_type="auto")
379      except Exception as exc:
380          if not _looks_like_cuda_lib_error(exc):
381              raise
382          logger.warning(
383              "faster-whisper CUDA load failed (%s) — falling back to CPU (int8). "
384              "Install the NVIDIA CUDA runtime (libcublas/libcudnn) to use GPU.",
385              exc,
386          )
387          return WhisperModel(model_name, device="cpu", compute_type="int8")
388  
389  
390  def _transcribe_local(file_path: str, model_name: str) -> Dict[str, Any]:
391      """Transcribe using faster-whisper (local, free)."""
392      global _local_model, _local_model_name
393  
394      if not _HAS_FASTER_WHISPER:
395          return {"success": False, "transcript": "", "error": "faster-whisper not installed"}
396  
397      try:
398          # Lazy-load the model (downloads on first use, ~150 MB for 'base')
399          if _local_model is None or _local_model_name != model_name:
400              logger.info("Loading faster-whisper model '%s' (first load downloads the model)...", model_name)
401              _local_model = _load_local_whisper_model(model_name)
402              _local_model_name = model_name
403  
404          # Language: config.yaml (stt.local.language) > env var > auto-detect.
405          _forced_lang = (
406              _load_stt_config().get("local", {}).get("language")
407              or os.getenv(LOCAL_STT_LANGUAGE_ENV)
408              or None
409          )
410          transcribe_kwargs = {"beam_size": 5}
411          if _forced_lang:
412              transcribe_kwargs["language"] = _forced_lang
413  
414          try:
415              segments, info = _local_model.transcribe(file_path, **transcribe_kwargs)
416              transcript = " ".join(segment.text.strip() for segment in segments)
417          except Exception as exc:
418              # CUDA runtime libs sometimes only fail at dlopen-on-first-use,
419              # AFTER the model loaded successfully.  Evict the broken cached
420              # model, reload on CPU, retry once.  Without this the module-
421              # global `_local_model` is poisoned and every subsequent voice
422              # message on this process fails identically until restart.
423              if not _looks_like_cuda_lib_error(exc):
424                  raise
425              logger.warning(
426                  "faster-whisper CUDA runtime failed mid-transcribe (%s) — "
427                  "evicting cached model and retrying on CPU (int8).",
428                  exc,
429              )
430              _local_model = None
431              _local_model_name = None
432              from faster_whisper import WhisperModel
433              _local_model = WhisperModel(model_name, device="cpu", compute_type="int8")
434              _local_model_name = model_name
435              segments, info = _local_model.transcribe(file_path, **transcribe_kwargs)
436              transcript = " ".join(segment.text.strip() for segment in segments)
437  
438          logger.info(
439              "Transcribed %s via local whisper (%s, lang=%s, %.1fs audio)",
440              Path(file_path).name, model_name, info.language, info.duration,
441          )
442  
443          return {"success": True, "transcript": transcript, "provider": "local"}
444  
445      except Exception as e:
446          logger.error("Local transcription failed: %s", e, exc_info=True)
447          return {"success": False, "transcript": "", "error": f"Local transcription failed: {e}"}
448  
449  
450  def _prepare_local_audio(file_path: str, work_dir: str) -> tuple[Optional[str], Optional[str]]:
451      """Normalize audio for local CLI STT when needed."""
452      audio_path = Path(file_path)
453      if audio_path.suffix.lower() in LOCAL_NATIVE_AUDIO_FORMATS:
454          return file_path, None
455  
456      ffmpeg = _find_ffmpeg_binary()
457      if not ffmpeg:
458          return None, "Local STT fallback requires ffmpeg for non-WAV inputs, but ffmpeg was not found"
459  
460      converted_path = os.path.join(work_dir, f"{audio_path.stem}.wav")
461      command = [ffmpeg, "-y", "-i", file_path, converted_path]
462  
463      try:
464          subprocess.run(command, check=True, capture_output=True, text=True)
465          return converted_path, None
466      except subprocess.CalledProcessError as e:
467          details = e.stderr.strip() or e.stdout.strip() or str(e)
468          logger.error("ffmpeg conversion failed for %s: %s", file_path, details)
469          return None, f"Failed to convert audio for local STT: {details}"
470  
471  
472  def _transcribe_local_command(file_path: str, model_name: str) -> Dict[str, Any]:
473      """Run the configured local STT command template and read back a .txt transcript."""
474      command_template = _get_local_command_template()
475      if not command_template:
476          return {
477              "success": False,
478              "transcript": "",
479              "error": (
480                  f"{LOCAL_STT_COMMAND_ENV} not configured and no local whisper binary was found"
481              ),
482          }
483  
484      # Language: config.yaml (stt.local.language) > env var > "en" default.
485      language = (
486          _load_stt_config().get("local", {}).get("language")
487          or os.getenv(LOCAL_STT_LANGUAGE_ENV)
488          or DEFAULT_LOCAL_STT_LANGUAGE
489      )
490      normalized_model = _normalize_local_command_model(model_name)
491  
492      try:
493          with tempfile.TemporaryDirectory(prefix="hermes-local-stt-") as output_dir:
494              prepared_input, prep_error = _prepare_local_audio(file_path, output_dir)
495              if prep_error:
496                  return {"success": False, "transcript": "", "error": prep_error}
497  
498              command = command_template.format(
499                  input_path=shlex.quote(prepared_input),
500                  output_dir=shlex.quote(output_dir),
501                  language=shlex.quote(language),
502                  model=shlex.quote(normalized_model),
503              )
504              subprocess.run(command, shell=True, check=True, capture_output=True, text=True)
505  
506              txt_files = sorted(Path(output_dir).glob("*.txt"))
507              if not txt_files:
508                  return {
509                      "success": False,
510                      "transcript": "",
511                      "error": "Local STT command completed but did not produce a .txt transcript",
512                  }
513  
514              transcript_text = txt_files[0].read_text(encoding="utf-8").strip()
515              logger.info(
516                  "Transcribed %s via local STT command (%s, %d chars)",
517                  Path(file_path).name,
518                  normalized_model,
519                  len(transcript_text),
520              )
521              return {"success": True, "transcript": transcript_text, "provider": "local_command"}
522  
523      except KeyError as e:
524          return {
525              "success": False,
526              "transcript": "",
527              "error": f"Invalid {LOCAL_STT_COMMAND_ENV} template, missing placeholder: {e}",
528          }
529      except subprocess.CalledProcessError as e:
530          details = e.stderr.strip() or e.stdout.strip() or str(e)
531          logger.error("Local STT command failed for %s: %s", file_path, details)
532          return {"success": False, "transcript": "", "error": f"Local STT failed: {details}"}
533      except Exception as e:
534          logger.error("Unexpected error during local command transcription: %s", e, exc_info=True)
535          return {"success": False, "transcript": "", "error": f"Local transcription failed: {e}"}
536  
537  # ---------------------------------------------------------------------------
538  # Provider: groq (Whisper API — free tier)
539  # ---------------------------------------------------------------------------
540  
541  
542  def _transcribe_groq(file_path: str, model_name: str) -> Dict[str, Any]:
543      """Transcribe using Groq Whisper API (free tier available)."""
544      api_key = get_env_value("GROQ_API_KEY")
545      if not api_key:
546          return {"success": False, "transcript": "", "error": "GROQ_API_KEY not set"}
547  
548      if not _HAS_OPENAI:
549          return {"success": False, "transcript": "", "error": "openai package not installed"}
550  
551      # Auto-correct model if caller passed an OpenAI-only model
552      if model_name in OPENAI_MODELS:
553          logger.info("Model %s not available on Groq, using %s", model_name, DEFAULT_GROQ_STT_MODEL)
554          model_name = DEFAULT_GROQ_STT_MODEL
555  
556      try:
557          from openai import OpenAI, APIError, APIConnectionError, APITimeoutError
558          client = OpenAI(api_key=api_key, base_url=GROQ_BASE_URL, timeout=30, max_retries=0)
559          try:
560              with open(file_path, "rb") as audio_file:
561                  transcription = client.audio.transcriptions.create(
562                      model=model_name,
563                      file=audio_file,
564                      response_format="text",
565                  )
566  
567              transcript_text = str(transcription).strip()
568              logger.info("Transcribed %s via Groq API (%s, %d chars)",
569                           Path(file_path).name, model_name, len(transcript_text))
570  
571              return {"success": True, "transcript": transcript_text, "provider": "groq"}
572          finally:
573              close = getattr(client, "close", None)
574              if callable(close):
575                  close()
576  
577      except PermissionError:
578          return {"success": False, "transcript": "", "error": f"Permission denied: {file_path}"}
579      except APIConnectionError as e:
580          return {"success": False, "transcript": "", "error": f"Connection error: {e}"}
581      except APITimeoutError as e:
582          return {"success": False, "transcript": "", "error": f"Request timeout: {e}"}
583      except APIError as e:
584          return {"success": False, "transcript": "", "error": f"API error: {e}"}
585      except Exception as e:
586          logger.error("Groq transcription failed: %s", e, exc_info=True)
587          return {"success": False, "transcript": "", "error": f"Transcription failed: {e}"}
588  
589  # ---------------------------------------------------------------------------
590  # Provider: openai (Whisper API)
591  # ---------------------------------------------------------------------------
592  
593  
594  def _transcribe_openai(file_path: str, model_name: str) -> Dict[str, Any]:
595      """Transcribe using OpenAI Whisper API (paid)."""
596      try:
597          api_key, base_url = _resolve_openai_audio_client_config()
598      except ValueError as exc:
599          return {
600              "success": False,
601              "transcript": "",
602              "error": str(exc),
603          }
604  
605      if not _HAS_OPENAI:
606          return {"success": False, "transcript": "", "error": "openai package not installed"}
607  
608      # Auto-correct model if caller passed a Groq-only model
609      if model_name in GROQ_MODELS:
610          logger.info("Model %s not available on OpenAI, using %s", model_name, DEFAULT_STT_MODEL)
611          model_name = DEFAULT_STT_MODEL
612  
613      try:
614          from openai import OpenAI, APIError, APIConnectionError, APITimeoutError
615          client = OpenAI(api_key=api_key, base_url=base_url, timeout=30, max_retries=0)
616          try:
617              with open(file_path, "rb") as audio_file:
618                  transcription = client.audio.transcriptions.create(
619                      model=model_name,
620                      file=audio_file,
621                      response_format="text" if model_name == "whisper-1" else "json",
622                  )
623  
624              transcript_text = _extract_transcript_text(transcription)
625              logger.info("Transcribed %s via OpenAI API (%s, %d chars)",
626                           Path(file_path).name, model_name, len(transcript_text))
627  
628              return {"success": True, "transcript": transcript_text, "provider": "openai"}
629          finally:
630              close = getattr(client, "close", None)
631              if callable(close):
632                  close()
633  
634      except PermissionError:
635          return {"success": False, "transcript": "", "error": f"Permission denied: {file_path}"}
636      except APIConnectionError as e:
637          return {"success": False, "transcript": "", "error": f"Connection error: {e}"}
638      except APITimeoutError as e:
639          return {"success": False, "transcript": "", "error": f"Request timeout: {e}"}
640      except APIError as e:
641          return {"success": False, "transcript": "", "error": f"API error: {e}"}
642      except Exception as e:
643          logger.error("OpenAI transcription failed: %s", e, exc_info=True)
644          return {"success": False, "transcript": "", "error": f"Transcription failed: {e}"}
645  
646  # ---------------------------------------------------------------------------
647  # Provider: mistral (Voxtral Transcribe API)
648  # ---------------------------------------------------------------------------
649  
650  
651  def _transcribe_mistral(file_path: str, model_name: str) -> Dict[str, Any]:
652      """Transcribe using Mistral Voxtral Transcribe API.
653  
654      Uses the ``mistralai`` Python SDK to call ``/v1/audio/transcriptions``.
655      Requires ``MISTRAL_API_KEY`` environment variable.
656      """
657      api_key = get_env_value("MISTRAL_API_KEY")
658      if not api_key:
659          return {"success": False, "transcript": "", "error": "MISTRAL_API_KEY not set"}
660  
661      try:
662          from mistralai.client import Mistral
663  
664          with Mistral(api_key=api_key) as client:
665              with open(file_path, "rb") as audio_file:
666                  result = client.audio.transcriptions.complete(
667                      model=model_name,
668                      file={"content": audio_file, "file_name": Path(file_path).name},
669                  )
670  
671              transcript_text = _extract_transcript_text(result)
672              logger.info(
673                  "Transcribed %s via Mistral API (%s, %d chars)",
674                  Path(file_path).name, model_name, len(transcript_text),
675              )
676              return {"success": True, "transcript": transcript_text, "provider": "mistral"}
677  
678      except PermissionError:
679          return {"success": False, "transcript": "", "error": f"Permission denied: {file_path}"}
680      except Exception as e:
681          logger.error("Mistral transcription failed: %s", e, exc_info=True)
682          return {"success": False, "transcript": "", "error": f"Mistral transcription failed: {type(e).__name__}"}
683  
684  
685  # ---------------------------------------------------------------------------
686  # Provider: xAI (Grok STT API)
687  # ---------------------------------------------------------------------------
688  
689  
690  def _transcribe_xai(file_path: str, model_name: str) -> Dict[str, Any]:
691      """Transcribe using xAI Grok STT API.
692  
693      Uses the ``POST /v1/stt`` REST endpoint with multipart/form-data.
694      Supports Inverse Text Normalization, diarization, and word-level timestamps.
695      Requires ``XAI_API_KEY`` environment variable.
696      """
697      api_key = get_env_value("XAI_API_KEY")
698      if not api_key:
699          return {"success": False, "transcript": "", "error": "XAI_API_KEY not set"}
700  
701      stt_config = _load_stt_config()
702      xai_config = stt_config.get("xai", {})
703      base_url = str(
704          xai_config.get("base_url")
705          or get_env_value("XAI_STT_BASE_URL")
706          or XAI_STT_BASE_URL
707      ).strip().rstrip("/")
708      language = str(
709          xai_config.get("language")
710          or os.getenv("HERMES_LOCAL_STT_LANGUAGE")
711          or DEFAULT_LOCAL_STT_LANGUAGE
712      ).strip()
713      # .get("format", True) already defaults to True when the key is absent;
714      # is_truthy_value only normalizes truthy/falsy strings from config.
715      use_format = is_truthy_value(xai_config.get("format", True))
716      use_diarize = is_truthy_value(xai_config.get("diarize", False))
717  
718      try:
719          import requests
720          from tools.xai_http import hermes_xai_user_agent
721  
722          data: Dict[str, str] = {}
723          if language:
724              data["language"] = language
725          if use_format:
726              data["format"] = "true"
727          if use_diarize:
728              data["diarize"] = "true"
729  
730          with open(file_path, "rb") as audio_file:
731              response = requests.post(
732                  f"{base_url}/stt",
733                  headers={
734                      "Authorization": f"Bearer {api_key}",
735                      "User-Agent": hermes_xai_user_agent(),
736                  },
737                  files={
738                      "file": (Path(file_path).name, audio_file),
739                  },
740                  data=data,
741                  timeout=120,
742              )
743  
744          if response.status_code != 200:
745              detail = ""
746              try:
747                  err_body = response.json()
748                  detail = err_body.get("error", {}).get("message", "") or response.text[:300]
749              except Exception:
750                  detail = response.text[:300]
751              return {
752                  "success": False,
753                  "transcript": "",
754                  "error": f"xAI STT API error (HTTP {response.status_code}): {detail}",
755              }
756  
757          result = response.json()
758          transcript_text = result.get("text", "").strip()
759  
760          if not transcript_text:
761              return {
762                  "success": False,
763                  "transcript": "",
764                  "error": "xAI STT returned empty transcript",
765              }
766  
767          logger.info(
768              "Transcribed %s via xAI Grok STT (lang=%s, %.1fs audio, %d chars)",
769              Path(file_path).name,
770              result.get("language", language),
771              result.get("duration", 0),
772              len(transcript_text),
773          )
774  
775          return {"success": True, "transcript": transcript_text, "provider": "xai"}
776  
777      except PermissionError:
778          return {"success": False, "transcript": "", "error": f"Permission denied: {file_path}"}
779      except Exception as e:
780          logger.error("xAI STT transcription failed: %s", e, exc_info=True)
781          return {"success": False, "transcript": "", "error": f"xAI STT transcription failed: {e}"}
782  
783  
784  # ---------------------------------------------------------------------------
785  # Public API
786  # ---------------------------------------------------------------------------
787  
788  
789  def transcribe_audio(file_path: str, model: Optional[str] = None) -> Dict[str, Any]:
790      """
791      Transcribe an audio file using the configured STT provider.
792  
793      Provider priority:
794        1. User config (``stt.provider`` in config.yaml)
795        2. Auto-detect: local faster-whisper (free) > Groq (free tier) > OpenAI (paid)
796  
797      Args:
798          file_path: Absolute path to the audio file to transcribe.
799          model:     Override the model. If None, uses config or provider default.
800  
801      Returns:
802          dict with keys:
803            - "success" (bool): Whether transcription succeeded
804            - "transcript" (str): The transcribed text (empty on failure)
805            - "error" (str, optional): Error message if success is False
806            - "provider" (str, optional): Which provider was used
807      """
808      # Validate input
809      error = _validate_audio_file(file_path)
810      if error:
811          return error
812  
813      # Load config and determine provider
814      stt_config = _load_stt_config()
815      if not is_stt_enabled(stt_config):
816          return {
817              "success": False,
818              "transcript": "",
819              "error": "STT is disabled in config.yaml (stt.enabled: false).",
820          }
821  
822      provider = _get_provider(stt_config)
823  
824      if provider == "local":
825          local_cfg = stt_config.get("local", {})
826          model_name = _normalize_local_model(
827              model or local_cfg.get("model", DEFAULT_LOCAL_MODEL)
828          )
829          return _transcribe_local(file_path, model_name)
830  
831      if provider == "local_command":
832          local_cfg = stt_config.get("local", {})
833          model_name = _normalize_local_command_model(
834              model or local_cfg.get("model", DEFAULT_LOCAL_MODEL)
835          )
836          return _transcribe_local_command(file_path, model_name)
837  
838      if provider == "groq":
839          model_name = model or DEFAULT_GROQ_STT_MODEL
840          return _transcribe_groq(file_path, model_name)
841  
842      if provider == "openai":
843          openai_cfg = stt_config.get("openai", {})
844          model_name = model or openai_cfg.get("model", DEFAULT_STT_MODEL)
845          return _transcribe_openai(file_path, model_name)
846  
847      if provider == "mistral":
848          mistral_cfg = stt_config.get("mistral", {})
849          model_name = model or mistral_cfg.get("model", DEFAULT_MISTRAL_STT_MODEL)
850          return _transcribe_mistral(file_path, model_name)
851  
852      if provider == "xai":
853          # xAI Grok STT doesn't use a model parameter — pass through for logging
854          model_name = model or "grok-stt"
855          return _transcribe_xai(file_path, model_name)
856  
857      # No provider available
858      return {
859          "success": False,
860          "transcript": "",
861          "error": (
862              "No STT provider available. Install faster-whisper for free local "
863              f"transcription, configure {LOCAL_STT_COMMAND_ENV} or install a local whisper CLI, "
864              "set GROQ_API_KEY for free Groq Whisper, set MISTRAL_API_KEY for Mistral "
865              "Voxtral Transcribe, set XAI_API_KEY for xAI Grok STT, or set VOICE_TOOLS_OPENAI_KEY "
866              "or OPENAI_API_KEY for the OpenAI Whisper API."
867          ),
868      }
869  
870  
871  def _resolve_openai_audio_client_config() -> tuple[str, str]:
872      """Return direct OpenAI audio config or a managed gateway fallback."""
873      stt_config = _load_stt_config()
874      openai_cfg = stt_config.get("openai", {})
875      cfg_api_key = openai_cfg.get("api_key", "")
876      cfg_base_url = openai_cfg.get("base_url", "")
877      if cfg_api_key:
878          return cfg_api_key, (cfg_base_url or OPENAI_BASE_URL)
879  
880      direct_api_key = resolve_openai_audio_api_key()
881      if direct_api_key:
882          return direct_api_key, OPENAI_BASE_URL
883  
884      managed_gateway = resolve_managed_tool_gateway("openai-audio")
885      if managed_gateway is None:
886          message = "Neither stt.openai.api_key in config nor VOICE_TOOLS_OPENAI_KEY/OPENAI_API_KEY is set"
887          if managed_nous_tools_enabled():
888              message += ", and the managed OpenAI audio gateway is unavailable"
889          raise ValueError(message)
890  
891      return managed_gateway.nous_user_token, urljoin(
892          f"{managed_gateway.gateway_origin.rstrip('/')}/", "v1"
893      )
894  
895  
896  def _extract_transcript_text(transcription: Any) -> str:
897      """Normalize text and JSON transcription responses to a plain string."""
898      if isinstance(transcription, str):
899          return transcription.strip()
900  
901      if hasattr(transcription, "text"):
902          value = getattr(transcription, "text")
903          if isinstance(value, str):
904              return value.strip()
905  
906      if isinstance(transcription, dict):
907          value = transcription.get("text")
908          if isinstance(value, str):
909              return value.strip()
910  
911      return str(transcription).strip()