Cradicle Explorer

/ tools / voice_mode.py
voice_mode.py
   1  """Voice Mode -- Push-to-talk audio recording and playback for the CLI.
   2  
   3  Provides audio capture via sounddevice, WAV encoding via stdlib wave,
   4  STT dispatch via tools.transcription_tools, and TTS playback via
   5  sounddevice or system audio players.
   6  
   7  Dependencies (optional):
   8      pip install sounddevice numpy
   9      or: pip install hermes-agent[voice]
  10  """
  11  
  12  import logging
  13  import os
  14  import platform
  15  import re
  16  import shutil
  17  import subprocess
  18  import sys
  19  import tempfile
  20  import threading
  21  import time
  22  import wave
  23  from typing import Any, Dict, List, Optional
  24  
  25  logger = logging.getLogger(__name__)
  26  
  27  # ---------------------------------------------------------------------------
  28  # Lazy audio imports -- never imported at module level to avoid crashing
  29  # in headless environments (SSH, Docker, WSL, no PortAudio).
  30  # ---------------------------------------------------------------------------
  31  
  32  def _import_audio():
  33      """Lazy-import sounddevice and numpy.  Returns (sd, np).
  34  
  35      Raises ImportError or OSError if the libraries are not available
  36      (e.g. PortAudio missing on headless servers).
  37      """
  38      import sounddevice as sd
  39      import numpy as np
  40      return sd, np
  41  
  42  
  43  def _audio_available() -> bool:
  44      """Return True if audio libraries can be imported."""
  45      try:
  46          _import_audio()
  47          return True
  48      except (ImportError, OSError):
  49          return False
  50  
  51  
  52  from hermes_constants import is_termux as _is_termux_environment
  53  
  54  
  55  def _voice_capture_install_hint() -> str:
  56      if _is_termux_environment():
  57          return "pkg install python-numpy portaudio && python -m pip install sounddevice"
  58      return "pip install sounddevice numpy"
  59  
  60  
  61  def _termux_microphone_command() -> Optional[str]:
  62      if not _is_termux_environment():
  63          return None
  64      return shutil.which("termux-microphone-record")
  65  
  66  
  67  
  68  def _termux_api_app_installed() -> bool:
  69      if not _is_termux_environment():
  70          return False
  71      try:
  72          result = subprocess.run(
  73              ["pm", "list", "packages", "com.termux.api"],
  74              capture_output=True,
  75              text=True,
  76              timeout=5,
  77              check=False,
  78          )
  79          return "package:com.termux.api" in (result.stdout or "")
  80      except Exception:
  81          return False
  82  
  83  
  84  def _termux_voice_capture_available() -> bool:
  85      return _termux_microphone_command() is not None and _termux_api_app_installed()
  86  
  87  
  88  def detect_audio_environment() -> dict:
  89      """Detect if the current environment supports audio I/O.
  90  
  91      Returns dict with 'available' (bool), 'warnings' (list of hard-fail
  92      reasons that block voice mode), and 'notices' (list of informational
  93      messages that do NOT block voice mode).
  94      """
  95      warnings = []   # hard-fail: these block voice mode
  96      notices = []     # informational: logged but don't block
  97      termux_mic_cmd = _termux_microphone_command()
  98      termux_app_installed = _termux_api_app_installed()
  99      termux_capture = bool(termux_mic_cmd and termux_app_installed)
 100  
 101      # SSH detection
 102      if any(os.environ.get(v) for v in ('SSH_CLIENT', 'SSH_TTY', 'SSH_CONNECTION')):
 103          warnings.append("Running over SSH -- no audio devices available")
 104  
 105      # Docker/Podman container detection
 106      # Containerized environments are not automatically voice-incompatible:
 107      # some users pass host audio devices through and voice works fine.
 108      # So container presence alone is only a notice; we hard-fail later
 109      # only if device detection or audio initialization also shows no path.
 110      from hermes_constants import is_container
 111      container_env = is_container()
 112      if container_env:
 113          notices.append("Running inside Docker container")
 114  
 115      # WSL detection — PulseAudio bridge makes audio work in WSL.
 116      # Only block if PULSE_SERVER is not configured.
 117      try:
 118          with open('/proc/version', 'r') as f:
 119              if 'microsoft' in f.read().lower():
 120                  if os.environ.get('PULSE_SERVER'):
 121                      notices.append("Running in WSL with PulseAudio bridge")
 122                  else:
 123                      warnings.append(
 124                          "Running in WSL -- audio requires PulseAudio bridge.\n"
 125                          "  1. Set PULSE_SERVER=unix:/mnt/wslg/PulseServer\n"
 126                          "  2. Create ~/.asoundrc pointing ALSA at PulseAudio\n"
 127                          "  3. Verify with: arecord -d 3 /tmp/test.wav && aplay /tmp/test.wav"
 128                      )
 129      except (FileNotFoundError, PermissionError, OSError):
 130          pass
 131  
 132      # Check audio libraries
 133      try:
 134          sd, _ = _import_audio()
 135          try:
 136              devices = sd.query_devices()
 137              if not devices:
 138                  if termux_capture:
 139                      notices.append("No PortAudio devices detected, but Termux:API microphone capture is available")
 140                  elif container_env:
 141                      warnings.append("Running inside Docker container -- no audio devices")
 142                  else:
 143                      warnings.append("No audio input/output devices detected")
 144          except Exception:
 145              # In WSL with PulseAudio, device queries can fail even though
 146              # recording/playback works fine. Don't block if PULSE_SERVER is set.
 147              if os.environ.get('PULSE_SERVER'):
 148                  notices.append("Audio device query failed but PULSE_SERVER is set -- continuing")
 149              elif termux_capture:
 150                  notices.append("PortAudio device query failed, but Termux:API microphone capture is available")
 151              elif container_env:
 152                  warnings.append("Running inside Docker container -- no audio devices")
 153              else:
 154                  warnings.append("PortAudio device query failed (audio subsystem error)")
 155      except ImportError:
 156          if termux_capture:
 157              notices.append("Termux:API microphone recording available (sounddevice not required)")
 158          elif termux_mic_cmd and not termux_app_installed:
 159              warnings.append(
 160                  "Termux:API Android app is not installed. Install/update the Termux:API app to use termux-microphone-record."
 161              )
 162          else:
 163              warnings.append(f"Audio libraries not installed ({_voice_capture_install_hint()})")
 164      except OSError:
 165          if termux_capture:
 166              notices.append("Termux:API microphone recording available (PortAudio not required)")
 167          elif termux_mic_cmd and not termux_app_installed:
 168              warnings.append(
 169                  "Termux:API Android app is not installed. Install/update the Termux:API app to use termux-microphone-record."
 170              )
 171          elif _is_termux_environment():
 172              warnings.append(
 173                  "PortAudio system library not found -- install it first:\n"
 174                  "  Termux: pkg install portaudio\n"
 175                  "Then retry /voice on."
 176              )
 177          else:
 178              warnings.append(
 179                  "PortAudio system library not found -- install it first:\n"
 180                  "  Linux:  sudo apt-get install libportaudio2\n"
 181                  "  macOS:  brew install portaudio\n"
 182                  "Then retry /voice on."
 183              )
 184  
 185      return {
 186          "available": not warnings,
 187          "warnings": warnings,
 188          "notices": notices,
 189      }
 190  
 191  # ---------------------------------------------------------------------------
 192  # Recording parameters
 193  # ---------------------------------------------------------------------------
# Preferred capture format.  AudioRecorder tries SAMPLE_RATE first and may
# fall back to the input device's default rate if that one is rejected.
SAMPLE_RATE = 16000  # Hz; Whisper native rate
CHANNELS = 1  # Mono
DTYPE = "int16"  # 16-bit PCM
SAMPLE_WIDTH = 2  # bytes per sample (int16)

# Silence detection defaults
SILENCE_RMS_THRESHOLD = 200  # RMS below this = silence (int16 range 0-32767)
SILENCE_DURATION_SECONDS = 3.0  # Seconds of continuous silence before auto-stop

# Temp directory for voice recordings (created on demand by the recorders)
_TEMP_DIR = os.path.join(tempfile.gettempdir(), "hermes_voice")
 205  
 206  
 207  # ============================================================================
 208  # Audio cues (beep tones)
 209  # ============================================================================
 210  def play_beep(frequency: int = 880, duration: float = 0.12, count: int = 1) -> None:
 211      """Play a short beep tone using numpy + sounddevice.
 212  
 213      Args:
 214          frequency: Tone frequency in Hz (default 880 = A5).
 215          duration: Duration of each beep in seconds.
 216          count: Number of beeps to play (with short gap between).
 217      """
 218      try:
 219          sd, np = _import_audio()
 220      except (ImportError, OSError):
 221          return
 222      try:
 223          gap = 0.06  # seconds between beeps
 224          samples_per_beep = int(SAMPLE_RATE * duration)
 225          samples_per_gap = int(SAMPLE_RATE * gap)
 226  
 227          parts = []
 228          for i in range(count):
 229              t = np.linspace(0, duration, samples_per_beep, endpoint=False)
 230              # Apply fade in/out to avoid click artifacts
 231              tone = np.sin(2 * np.pi * frequency * t)
 232              fade_len = min(int(SAMPLE_RATE * 0.01), samples_per_beep // 4)
 233              tone[:fade_len] *= np.linspace(0, 1, fade_len)
 234              tone[-fade_len:] *= np.linspace(1, 0, fade_len)
 235              parts.append((tone * 0.3 * 32767).astype(np.int16))
 236              if i < count - 1:
 237                  parts.append(np.zeros(samples_per_gap, dtype=np.int16))
 238  
 239          audio = np.concatenate(parts)
 240          sd.play(audio, samplerate=SAMPLE_RATE)
 241          # sd.wait() calls Event.wait() without timeout — hangs forever if the
 242          # audio device stalls.  Poll with a 2s ceiling and force-stop.
 243          deadline = time.monotonic() + 2.0
 244          while sd.get_stream() and sd.get_stream().active and time.monotonic() < deadline:
 245              time.sleep(0.01)
 246          sd.stop()
 247      except Exception as e:
 248          logger.debug("Beep playback failed: %s", e)
 249  
 250  
 251  # ============================================================================
 252  # Termux Audio Recorder
 253  # ============================================================================
 254  class TermuxAudioRecorder:
 255      """Recorder backend that uses Termux:API microphone capture commands."""
 256  
 257      supports_silence_autostop = False
 258  
 259      def __init__(self) -> None:
 260          self._lock = threading.Lock()
 261          self._recording = False
 262          self._start_time = 0.0
 263          self._recording_path: Optional[str] = None
 264          self._current_rms = 0
 265  
 266      @property
 267      def is_recording(self) -> bool:
 268          return self._recording
 269  
 270      @property
 271      def elapsed_seconds(self) -> float:
 272          if not self._recording:
 273              return 0.0
 274          return time.monotonic() - self._start_time
 275  
 276      @property
 277      def current_rms(self) -> int:
 278          return self._current_rms
 279  
 280      def start(self, on_silence_stop=None) -> None:
 281          del on_silence_stop  # Termux:API does not expose live silence callbacks.
 282          mic_cmd = _termux_microphone_command()
 283          if not mic_cmd:
 284              raise RuntimeError(
 285                  "Termux voice capture requires the termux-api package and app.\n"
 286                  "Install with: pkg install termux-api\n"
 287                  "Then install/update the Termux:API Android app."
 288              )
 289          if not _termux_api_app_installed():
 290              raise RuntimeError(
 291                  "Termux voice capture requires the Termux:API Android app.\n"
 292                  "Install/update the Termux:API app, then retry /voice on."
 293              )
 294  
 295          with self._lock:
 296              if self._recording:
 297                  return
 298              os.makedirs(_TEMP_DIR, exist_ok=True)
 299              timestamp = time.strftime("%Y%m%d_%H%M%S")
 300              self._recording_path = os.path.join(_TEMP_DIR, f"recording_{timestamp}.aac")
 301  
 302          command = [
 303              mic_cmd,
 304              "-f", self._recording_path,
 305              "-l", "0",
 306              "-e", "aac",
 307              "-r", str(SAMPLE_RATE),
 308              "-c", str(CHANNELS),
 309          ]
 310          try:
 311              subprocess.run(command, capture_output=True, text=True, timeout=15, check=True)
 312          except subprocess.CalledProcessError as e:
 313              details = (e.stderr or e.stdout or str(e)).strip()
 314              raise RuntimeError(f"Termux microphone start failed: {details}") from e
 315          except Exception as e:
 316              raise RuntimeError(f"Termux microphone start failed: {e}") from e
 317  
 318          with self._lock:
 319              self._start_time = time.monotonic()
 320              self._recording = True
 321              self._current_rms = 0
 322          logger.info("Termux voice recording started")
 323  
 324      def _stop_termux_recording(self) -> None:
 325          mic_cmd = _termux_microphone_command()
 326          if not mic_cmd:
 327              return
 328          subprocess.run([mic_cmd, "-q"], capture_output=True, text=True, timeout=15, check=False)
 329  
 330      def stop(self) -> Optional[str]:
 331          with self._lock:
 332              if not self._recording:
 333                  return None
 334              self._recording = False
 335              path = self._recording_path
 336              self._recording_path = None
 337              started_at = self._start_time
 338              self._current_rms = 0
 339  
 340          self._stop_termux_recording()
 341          if not path or not os.path.isfile(path):
 342              return None
 343          if time.monotonic() - started_at < 0.3:
 344              try:
 345                  os.unlink(path)
 346              except OSError:
 347                  pass
 348              return None
 349          if os.path.getsize(path) <= 0:
 350              try:
 351                  os.unlink(path)
 352              except OSError:
 353                  pass
 354              return None
 355          logger.info("Termux voice recording stopped: %s", path)
 356          return path
 357  
 358      def cancel(self) -> None:
 359          with self._lock:
 360              path = self._recording_path
 361              self._recording = False
 362              self._recording_path = None
 363              self._current_rms = 0
 364          try:
 365              self._stop_termux_recording()
 366          except Exception:
 367              pass
 368          if path and os.path.isfile(path):
 369              try:
 370                  os.unlink(path)
 371              except OSError:
 372                  pass
 373          logger.info("Termux voice recording cancelled")
 374  
 375      def shutdown(self) -> None:
 376          self.cancel()
 377  
 378  
 379  # ============================================================================
 380  # AudioRecorder
 381  # ============================================================================
 382  class AudioRecorder:
 383      """Thread-safe audio recorder using sounddevice.InputStream.
 384  
 385      Usage::
 386  
 387          recorder = AudioRecorder()
 388          recorder.start(on_silence_stop=my_callback)
 389          # ... user speaks ...
 390          wav_path = recorder.stop()   # returns path to WAV file
 391          # or
 392          recorder.cancel()            # discard without saving
 393  
 394      If ``on_silence_stop`` is provided, recording automatically stops when
 395      the user is silent for ``silence_duration`` seconds and calls the callback.
 396      """
 397  
 398      supports_silence_autostop = True
 399  
 400      def __init__(self) -> None:
 401          self._lock = threading.Lock()
 402          self._stream: Any = None
 403          self._frames: List[Any] = []
 404          self._recording = False
 405          self._start_time: float = 0.0
 406          # Silence detection state
 407          self._has_spoken = False
 408          self._speech_start: float = 0.0  # When speech attempt began
 409          self._dip_start: float = 0.0  # When current below-threshold dip began
 410          self._min_speech_duration: float = 0.3  # Seconds of speech needed to confirm
 411          self._max_dip_tolerance: float = 0.3  # Max dip duration before resetting speech
 412          self._silence_start: float = 0.0
 413          self._resume_start: float = 0.0  # Tracks sustained speech after silence starts
 414          self._resume_dip_start: float = 0.0  # Dip tolerance tracker for resume detection
 415          self._on_silence_stop = None
 416          self._silence_threshold: int = SILENCE_RMS_THRESHOLD
 417          self._silence_duration: float = SILENCE_DURATION_SECONDS
 418          self._max_wait: float = 15.0  # Max seconds to wait for speech before auto-stop
 419          # Peak RMS seen during recording (for speech presence check in stop())
 420          self._peak_rms: int = 0
 421          # Live audio level (read by UI for visual feedback)
 422          self._current_rms: int = 0
 423          self._sample_rate: int = SAMPLE_RATE
 424  
 425      # -- public properties ---------------------------------------------------
 426  
 427      @property
 428      def elapsed_seconds(self) -> float:
 429          if not self._recording:
 430              return 0.0
 431          return time.monotonic() - self._start_time
 432  
 433      @property
 434      def current_rms(self) -> int:
 435          """Current audio input RMS level (0-32767). Updated each audio chunk."""
 436          return self._current_rms
 437  
 438      @property
 439      def is_recording(self) -> bool:
 440          """Whether audio recording is currently active."""
 441          return self._recording
 442  
 443      # -- public methods ------------------------------------------------------
 444  
 445      def _ensure_stream(self) -> None:
 446          """Create the audio InputStream once and keep it alive.
 447  
 448          The stream stays open for the lifetime of the recorder.  Between
 449          recordings the callback simply discards audio chunks (``_recording``
 450          is ``False``).  This avoids the CoreAudio bug where closing and
 451          re-opening an ``InputStream`` hangs indefinitely on macOS.
 452          """
 453          if self._stream is not None:
 454              return  # already alive
 455  
 456          sd, np = _import_audio()
 457  
 458          def _callback(indata, frames, time_info, status):  # noqa: ARG001
 459              if status:
 460                  logger.debug("sounddevice status: %s", status)
 461              # When not recording the stream is idle — discard audio.
 462              if not self._recording:
 463                  return
 464              self._frames.append(indata.copy())
 465  
 466              # Compute RMS for level display and silence detection
 467              rms = int(np.sqrt(np.mean(indata.astype(np.float64) ** 2)))
 468              self._current_rms = rms
 469              if rms > self._peak_rms:
 470                  self._peak_rms = rms
 471  
 472              # Silence detection
 473              if self._on_silence_stop is not None:
 474                  now = time.monotonic()
 475                  elapsed = now - self._start_time
 476  
 477                  if rms > self._silence_threshold:
 478                      # Audio is above threshold -- this is speech (or noise).
 479                      self._dip_start = 0.0  # Reset dip tracker
 480                      if self._speech_start == 0.0:
 481                          self._speech_start = now
 482                      elif not self._has_spoken and now - self._speech_start >= self._min_speech_duration:
 483                          self._has_spoken = True
 484                          logger.debug("Speech confirmed (%.2fs above threshold)",
 485                                       now - self._speech_start)
 486                      # After speech is confirmed, only reset silence timer if
 487                      # speech is sustained (>0.3s above threshold).  Brief
 488                      # spikes from ambient noise should NOT reset the timer.
 489                      if not self._has_spoken:
 490                          self._silence_start = 0.0
 491                      else:
 492                          # Track resumed speech with dip tolerance.
 493                          # Brief dips below threshold are normal during speech,
 494                          # so we mirror the initial speech detection pattern:
 495                          # start tracking, tolerate short dips, confirm after 0.3s.
 496                          self._resume_dip_start = 0.0  # Above threshold — no dip
 497                          if self._resume_start == 0.0:
 498                              self._resume_start = now
 499                          elif now - self._resume_start >= self._min_speech_duration:
 500                              self._silence_start = 0.0
 501                              self._resume_start = 0.0
 502                  elif self._has_spoken:
 503                      # Below threshold after speech confirmed.
 504                      # Use dip tolerance before resetting resume tracker —
 505                      # natural speech has brief dips below threshold.
 506                      if self._resume_start > 0:
 507                          if self._resume_dip_start == 0.0:
 508                              self._resume_dip_start = now
 509                          elif now - self._resume_dip_start >= self._max_dip_tolerance:
 510                              # Sustained dip — user actually stopped speaking
 511                              self._resume_start = 0.0
 512                              self._resume_dip_start = 0.0
 513                  elif self._speech_start > 0:
 514                      # We were in a speech attempt but RMS dipped.
 515                      # Tolerate brief dips (micro-pauses between syllables).
 516                      if self._dip_start == 0.0:
 517                          self._dip_start = now
 518                      elif now - self._dip_start >= self._max_dip_tolerance:
 519                          # Dip lasted too long -- genuine silence, reset
 520                          logger.debug("Speech attempt reset (dip lasted %.2fs)",
 521                                       now - self._dip_start)
 522                          self._speech_start = 0.0
 523                          self._dip_start = 0.0
 524  
 525                  # Fire silence callback when:
 526                  # 1. User spoke then went silent for silence_duration, OR
 527                  # 2. No speech detected at all for max_wait seconds
 528                  should_fire = False
 529                  if self._has_spoken and rms <= self._silence_threshold:
 530                      # User was speaking and now is silent
 531                      if self._silence_start == 0.0:
 532                          self._silence_start = now
 533                      elif now - self._silence_start >= self._silence_duration:
 534                          logger.info("Silence detected (%.1fs), auto-stopping",
 535                                      self._silence_duration)
 536                          should_fire = True
 537                  elif not self._has_spoken and elapsed >= self._max_wait:
 538                      logger.info("No speech within %.0fs, auto-stopping",
 539                                  self._max_wait)
 540                      should_fire = True
 541  
 542                  if should_fire:
 543                      with self._lock:
 544                          cb = self._on_silence_stop
 545                          self._on_silence_stop = None  # fire only once
 546                      if cb:
 547                          def _safe_cb():
 548                              try:
 549                                  cb()
 550                              except Exception as e:
 551                                  logger.error("Silence callback failed: %s", e, exc_info=True)
 552                          threading.Thread(target=_safe_cb, daemon=True).start()
 553  
 554          def _default_input_sample_rate() -> int:
 555              """Return the default input device sample rate, or SAMPLE_RATE fallback."""
 556              try:
 557                  default_device = getattr(sd.default, "device", None)
 558                  if isinstance(default_device, (list, tuple)):
 559                      input_idx = default_device[0]
 560                  else:
 561                      input_idx = default_device
 562                  if input_idx is None or int(input_idx) < 0:
 563                      return SAMPLE_RATE
 564                  info = sd.query_devices(int(input_idx))
 565                  default_rate = info.get("default_samplerate") if isinstance(info, dict) else None
 566                  if not isinstance(default_rate, (int, float)):
 567                      return SAMPLE_RATE
 568                  detected = int(default_rate)
 569                  return detected if detected > 0 else SAMPLE_RATE
 570              except Exception:
 571                  return SAMPLE_RATE
 572  
 573          # Create stream — may block on CoreAudio (first call only).
 574          # Prefer Whisper-native 16kHz, but fall back to the device default if
 575          # PortAudio/ALSA rejects that rate.
 576          stream = None
 577          sample_rate_candidates = [SAMPLE_RATE]
 578          detected_rate = _default_input_sample_rate()
 579          if detected_rate not in sample_rate_candidates:
 580              sample_rate_candidates.append(detected_rate)
 581  
 582          last_error = None
 583          for candidate_rate in sample_rate_candidates:
 584              try:
 585                  stream = sd.InputStream(
 586                      samplerate=candidate_rate,
 587                      channels=CHANNELS,
 588                      dtype=DTYPE,
 589                      callback=_callback,
 590                  )
 591                  stream.start()
 592                  self._sample_rate = int(candidate_rate)
 593                  if candidate_rate != SAMPLE_RATE:
 594                      logger.info(
 595                          "Voice recorder falling back to input device sample rate %d Hz (preferred %d Hz unsupported)",
 596                          candidate_rate,
 597                          SAMPLE_RATE,
 598                      )
 599                  break
 600              except Exception as e:
 601                  last_error = e
 602                  if stream is not None:
 603                      try:
 604                          stream.close()
 605                      except Exception:
 606                          pass
 607                      stream = None
 608  
 609          if stream is None:
 610              raise RuntimeError(
 611                  f"Failed to open audio input stream: {last_error}. "
 612                  "Check that a microphone is connected and accessible."
 613              ) from last_error
 614          self._stream = stream
 615  
 616      def start(self, on_silence_stop=None) -> None:
 617          """Start capturing audio from the default input device.
 618  
 619          The underlying InputStream is created once and kept alive across
 620          recordings.  Subsequent calls simply reset detection state and
 621          toggle frame collection via ``_recording``.
 622  
 623          Args:
 624              on_silence_stop: Optional callback invoked (in a daemon thread) when
 625                  silence is detected after speech. The callback receives no arguments.
 626                  Use this to auto-stop recording and trigger transcription.
 627  
 628          Raises ``RuntimeError`` if sounddevice/numpy are not installed
 629          or if a recording is already in progress.
 630          """
 631          try:
 632              _import_audio()
 633          except (ImportError, OSError) as e:
 634              raise RuntimeError(
 635                  "Voice mode requires sounddevice and numpy.\n"
 636                  f"Install with: {sys.executable} -m pip install sounddevice numpy"
 637              ) from e
 638  
 639          with self._lock:
 640              if self._recording:
 641                  return  # already recording
 642  
 643              self._frames = []
 644              self._start_time = time.monotonic()
 645              self._has_spoken = False
 646              self._speech_start = 0.0
 647              self._dip_start = 0.0
 648              self._silence_start = 0.0
 649              self._resume_start = 0.0
 650              self._resume_dip_start = 0.0
 651              self._peak_rms = 0
 652              self._current_rms = 0
 653              self._on_silence_stop = on_silence_stop
 654  
 655          # Ensure the persistent stream is alive (no-op after first call).
 656          self._ensure_stream()
 657  
 658          with self._lock:
 659              self._recording = True
 660          logger.info("Voice recording started (rate=%d, channels=%d)", self._sample_rate, CHANNELS)
 661  
 662      def _close_stream_with_timeout(self, timeout: float = 3.0) -> None:
 663          """Close the audio stream with a timeout to prevent CoreAudio hangs."""
 664          if self._stream is None:
 665              return
 666  
 667          stream = self._stream
 668          self._stream = None
 669  
 670          def _do_close():
 671              try:
 672                  stream.stop()
 673                  stream.close()
 674              except Exception:
 675                  pass
 676  
 677          t = threading.Thread(target=_do_close, daemon=True)
 678          t.start()
 679          # Poll in short intervals so Ctrl+C is not blocked
 680          deadline = __import__("time").monotonic() + timeout
 681          while t.is_alive() and __import__("time").monotonic() < deadline:
 682              t.join(timeout=0.1)
 683          if t.is_alive():
 684              logger.warning("Audio stream close timed out after %.1fs — forcing ahead", timeout)
 685  
 686      def stop(self) -> Optional[str]:
 687          """Stop recording and write captured audio to a WAV file.
 688  
 689          The underlying stream is kept alive for reuse — only frame
 690          collection is stopped.
 691  
 692          Returns:
 693              Path to the WAV file, or ``None`` if no audio was captured.
 694          """
 695          with self._lock:
 696              if not self._recording:
 697                  return None
 698  
 699              self._recording = False
 700              self._current_rms = 0
 701              # Stream stays alive — no close needed.
 702  
 703              if not self._frames:
 704                  return None
 705  
 706              # Concatenate frames and write WAV
 707              _, np = _import_audio()
 708              audio_data = np.concatenate(self._frames, axis=0)
 709              self._frames = []
 710  
 711              elapsed = time.monotonic() - self._start_time
 712              logger.info("Voice recording stopped (%.1fs, %d samples)", elapsed, len(audio_data))
 713  
 714              # Skip very short recordings (< 0.3s of audio)
 715              min_samples = int(self._sample_rate * 0.3)
 716              if len(audio_data) < min_samples:
 717                  logger.debug("Recording too short (%d samples), discarding", len(audio_data))
 718                  return None
 719  
 720              # Skip silent recordings using peak RMS (not overall average, which
 721              # gets diluted by silence at the end of the recording).
 722              if self._peak_rms < SILENCE_RMS_THRESHOLD:
 723                  logger.info("Recording too quiet (peak RMS=%d < %d), discarding",
 724                              self._peak_rms, SILENCE_RMS_THRESHOLD)
 725                  return None
 726  
 727              return self._write_wav(audio_data, self._sample_rate)
 728  
 729      def cancel(self) -> None:
 730          """Stop recording and discard all captured audio.
 731  
 732          The underlying stream is kept alive for reuse.
 733          """
 734          with self._lock:
 735              self._recording = False
 736              self._frames = []
 737              self._on_silence_stop = None
 738              self._current_rms = 0
 739          logger.info("Voice recording cancelled")
 740  
 741      def shutdown(self) -> None:
 742          """Release the audio stream.  Call when voice mode is disabled."""
 743          with self._lock:
 744              self._recording = False
 745              self._frames = []
 746              self._on_silence_stop = None
 747          # Close stream OUTSIDE the lock to avoid deadlock with audio callback
 748          self._close_stream_with_timeout()
 749          logger.info("AudioRecorder shut down")
 750  
 751      # -- private helpers -----------------------------------------------------
 752  
 753      @staticmethod
 754      def _write_wav(audio_data, sample_rate: int) -> str:
 755          """Write numpy int16 audio data to a WAV file.
 756  
 757          Returns the file path.
 758          """
 759          os.makedirs(_TEMP_DIR, exist_ok=True)
 760          timestamp = time.strftime("%Y%m%d_%H%M%S")
 761          wav_path = os.path.join(_TEMP_DIR, f"recording_{timestamp}.wav")
 762  
 763          with wave.open(wav_path, "wb") as wf:
 764              wf.setnchannels(CHANNELS)
 765              wf.setsampwidth(SAMPLE_WIDTH)
 766              wf.setframerate(sample_rate)
 767              wf.writeframes(audio_data.tobytes())
 768  
 769          file_size = os.path.getsize(wav_path)
 770          logger.info("WAV written: %s (%d bytes)", wav_path, file_size)
 771          return wav_path
 772  
 773  
 774  def create_audio_recorder() -> AudioRecorder | TermuxAudioRecorder:
 775      """Return the best recorder backend for the current environment."""
 776      if _termux_voice_capture_available():
 777          return TermuxAudioRecorder()
 778      return AudioRecorder()
 779  
 780  
 781  # ============================================================================
 782  # Whisper hallucination filter
 783  # ============================================================================
 784  # Whisper commonly hallucinates these phrases on silent/near-silent audio.
 785  WHISPER_HALLUCINATIONS = {
 786      "thank you.",
 787      "thank you",
 788      "thanks for watching.",
 789      "thanks for watching",
 790      "subscribe to my channel.",
 791      "subscribe to my channel",
 792      "like and subscribe.",
 793      "like and subscribe",
 794      "please subscribe.",
 795      "please subscribe",
 796      "thank you for watching.",
 797      "thank you for watching",
 798      "bye.",
 799      "bye",
 800      "you",
 801      "the end.",
 802      "the end",
 803      # Non-English hallucinations (common on silence)
 804      "продолжение следует",
 805      "продолжение следует...",
 806      "sous-titres",
 807      "sous-titres réalisés par la communauté d'amara.org",
 808      "sottotitoli creati dalla comunità amara.org",
 809      "untertitel von stephanie geiges",
 810      "amara.org",
 811      "www.mooji.org",
 812      "ご視聴ありがとうございました",
 813  }
 814  
 815  # Regex patterns for repetitive hallucinations (e.g. "Thank you. Thank you. Thank you.")
 816  _HALLUCINATION_REPEAT_RE = re.compile(
 817      r'^(?:thank you|thanks|bye|you|ok|okay|the end|\.|\s|,|!)+$',
 818      flags=re.IGNORECASE,
 819  )
 820  
 821  
 822  def is_whisper_hallucination(transcript: str) -> bool:
 823      """Check if a transcript is a known Whisper hallucination on silence."""
 824      cleaned = transcript.strip().lower()
 825      if not cleaned:
 826          return True
 827      # Exact match against known phrases
 828      if cleaned.rstrip('.!') in WHISPER_HALLUCINATIONS or cleaned in WHISPER_HALLUCINATIONS:
 829          return True
 830      # Repetitive patterns (e.g. "Thank you. Thank you. Thank you. you")
 831      if _HALLUCINATION_REPEAT_RE.match(cleaned):
 832          return True
 833      return False
 834  
 835  
 836  # ============================================================================
 837  # STT dispatch
 838  # ============================================================================
 839  def transcribe_recording(wav_path: str, model: Optional[str] = None) -> Dict[str, Any]:
 840      """Transcribe a WAV recording using the existing Whisper pipeline.
 841  
 842      Delegates to ``tools.transcription_tools.transcribe_audio()``.
 843      Filters out known Whisper hallucinations on silent audio.
 844  
 845      Args:
 846          wav_path: Path to the WAV file.
 847          model: Whisper model name (default: from config or ``whisper-1``).
 848  
 849      Returns:
 850          Dict with ``success``, ``transcript``, and optionally ``error``.
 851      """
 852      from tools.transcription_tools import transcribe_audio
 853  
 854      result = transcribe_audio(wav_path, model=model)
 855  
 856      # Filter out Whisper hallucinations (common on silent/near-silent audio)
 857      if result.get("success") and is_whisper_hallucination(result.get("transcript", "")):
 858          logger.info("Filtered Whisper hallucination: %r", result["transcript"])
 859          return {"success": True, "transcript": "", "filtered": True}
 860  
 861      return result
 862  
 863  
 864  # ============================================================================
 865  # Audio playback (interruptable)
 866  # ============================================================================
 867  
# Playback state shared by play_audio_file() and stop_playback().
#   _active_playback: the system-player subprocess currently running, if any,
#       kept so that it can be interrupted.
#   _playback_stop_requested: set by stop_playback() when it interrupts a
#       running player; read and cleared by play_audio_file().
#   _playback_lock: guards reads and writes of the two variables above.
_active_playback: Optional[subprocess.Popen] = None
_playback_stop_requested = False
_playback_lock = threading.Lock()
 872  
 873  
 874  def stop_playback() -> None:
 875      """Interrupt the currently playing audio (if any)."""
 876      global _active_playback, _playback_stop_requested
 877      with _playback_lock:
 878          proc = _active_playback
 879          if proc and proc.poll() is None:
 880              _playback_stop_requested = True
 881          _active_playback = None
 882      if proc and proc.poll() is None:
 883          try:
 884              proc.terminate()
 885              logger.info("Audio playback interrupted")
 886          except Exception:
 887              pass
 888      # Also stop sounddevice playback if active
 889      try:
 890          sd, _ = _import_audio()
 891          sd.stop()
 892      except Exception:
 893          pass
 894  
 895  
 896  def play_audio_file(file_path: str) -> bool:
 897      """Play an audio file through the default output device.
 898  
 899      Strategy:
 900      1. WAV files via ``sounddevice.play()`` when available.
 901      2. System commands: ``afplay`` (macOS), ``ffplay`` (cross-platform),
 902         ``aplay`` (Linux ALSA).
 903  
 904      Playback can be interrupted by calling ``stop_playback()``.
 905  
 906      Returns:
 907          ``True`` if playback succeeded, ``False`` otherwise.
 908      """
 909      global _active_playback, _playback_stop_requested
 910  
 911      if not os.path.isfile(file_path):
 912          logger.warning("Audio file not found: %s", file_path)
 913          return False
 914  
 915      # Try sounddevice for WAV files
 916      if file_path.endswith(".wav"):
 917          try:
 918              sd, np = _import_audio()
 919              with wave.open(file_path, "rb") as wf:
 920                  frames = wf.readframes(wf.getnframes())
 921                  audio_data = np.frombuffer(frames, dtype=np.int16)
 922                  sample_rate = wf.getframerate()
 923  
 924              sd.play(audio_data, samplerate=sample_rate)
 925              # sd.wait() calls Event.wait() without timeout — hangs forever if
 926              # the audio device stalls.  Poll with a ceiling and force-stop.
 927              duration_secs = len(audio_data) / sample_rate
 928              deadline = time.monotonic() + duration_secs + 2.0
 929              while sd.get_stream() and sd.get_stream().active and time.monotonic() < deadline:
 930                  time.sleep(0.01)
 931              sd.stop()
 932              return True
 933          except (ImportError, OSError):
 934              pass  # audio libs not available, fall through to system players
 935          except Exception as e:
 936              logger.debug("sounddevice playback failed: %s", e)
 937  
 938      # Fall back to system audio players (using Popen for interruptability)
 939      system = platform.system()
 940      players = []
 941  
 942      if system == "Darwin":
 943          players.append(["afplay", file_path])
 944          players.append(["ffplay", "-nodisp", "-autoexit", "-loglevel", "quiet", file_path])
 945      elif system == "Linux":
 946          # Prefer PulseAudio/PipeWire-native playback on Linux.
 947          # On this host ffplay is installed but fails to open the PipeWire stream,
 948          # while paplay succeeds. If ffplay runs first, CLI TTS can fail silently.
 949          players.append(["paplay", file_path])
 950          players.append(["ffplay", "-nodisp", "-autoexit", "-loglevel", "quiet", file_path])
 951          players.append(["aplay", "-q", file_path])
 952      else:
 953          players.append(["ffplay", "-nodisp", "-autoexit", "-loglevel", "quiet", file_path])
 954  
 955      for cmd in players:
 956          exe = shutil.which(cmd[0])
 957          if exe:
 958              proc = None
 959              try:
 960                  proc = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
 961                  with _playback_lock:
 962                      _active_playback = proc
 963                      _playback_stop_requested = False
 964                  exit_code = proc.wait(timeout=300)
 965                  with _playback_lock:
 966                      interrupted = _playback_stop_requested
 967                      if _active_playback is proc:
 968                          _active_playback = None
 969                      if interrupted:
 970                          _playback_stop_requested = False
 971                  if interrupted:
 972                      logger.info("System player %s interrupted by user", cmd[0])
 973                      return False
 974                  if exit_code == 0:
 975                      return True
 976                  logger.debug("System player %s exited with code %s", cmd[0], exit_code)
 977              except subprocess.TimeoutExpired:
 978                  logger.warning("System player %s timed out, killing process", cmd[0])
 979                  if proc is not None:
 980                      proc.kill()
 981                      proc.wait()
 982                  with _playback_lock:
 983                      if _active_playback is proc:
 984                          _active_playback = None
 985              except Exception as e:
 986                  logger.debug("System player %s failed: %s", cmd[0], e)
 987                  with _playback_lock:
 988                      if proc is not None and _active_playback is proc:
 989                          _active_playback = None
 990  
 991      logger.warning("No audio player available for %s", file_path)
 992      return False
 993  
 994  
 995  # ============================================================================
 996  # Requirements check
 997  # ============================================================================
 998  def check_voice_requirements() -> Dict[str, Any]:
 999      """Check if all voice mode requirements are met.
1000  
1001      Returns:
1002          Dict with ``available``, ``audio_available``, ``stt_available``,
1003          ``missing_packages``, and ``details``.
1004      """
1005      # Determine STT provider availability
1006      from tools.transcription_tools import _get_provider, _load_stt_config, is_stt_enabled
1007      stt_config = _load_stt_config()
1008      stt_enabled = is_stt_enabled(stt_config)
1009      stt_provider = _get_provider(stt_config)
1010      stt_available = stt_enabled and stt_provider != "none"
1011  
1012      missing: List[str] = []
1013      termux_capture = _termux_voice_capture_available()
1014      has_audio = _audio_available() or termux_capture
1015  
1016      if not has_audio:
1017          missing.extend(["sounddevice", "numpy"])
1018  
1019      # Environment detection
1020      env_check = detect_audio_environment()
1021  
1022      available = has_audio and stt_available and env_check["available"]
1023      details_parts = []
1024  
1025      if termux_capture:
1026          details_parts.append("Audio capture: OK (Termux:API microphone)")
1027      elif has_audio:
1028          details_parts.append("Audio capture: OK")
1029      else:
1030          details_parts.append(f"Audio capture: MISSING ({_voice_capture_install_hint()})")
1031  
1032      if not stt_enabled:
1033          details_parts.append("STT provider: DISABLED in config (stt.enabled: false)")
1034      elif stt_provider == "local":
1035          details_parts.append("STT provider: OK (local faster-whisper)")
1036      elif stt_provider == "groq":
1037          details_parts.append("STT provider: OK (Groq)")
1038      elif stt_provider == "openai":
1039          details_parts.append("STT provider: OK (OpenAI)")
1040      else:
1041          details_parts.append(
1042              "STT provider: MISSING (pip install faster-whisper, "
1043              "or set GROQ_API_KEY / VOICE_TOOLS_OPENAI_KEY)"
1044          )
1045  
1046      for warning in env_check["warnings"]:
1047          details_parts.append(f"Environment: {warning}")
1048      for notice in env_check.get("notices", []):
1049          details_parts.append(f"Environment: {notice}")
1050  
1051      return {
1052          "available": available,
1053          "audio_available": has_audio,
1054          "stt_available": stt_available,
1055          "missing_packages": missing,
1056          "details": "\n".join(details_parts),
1057          "environment": env_check,
1058      }
1059  
1060  
1061  # ============================================================================
1062  # Temp file cleanup
1063  # ============================================================================
def cleanup_temp_recordings(max_age_seconds: int = 3600) -> int:
    """Remove old temporary voice recording files.

    Only regular files named ``recording_*.wav`` directly inside the voice
    temp directory are considered.  Cleanup is best effort: files that
    cannot be inspected or removed are skipped.

    Args:
        max_age_seconds: Delete files older than this (default: 1 hour).

    Returns:
        Number of files deleted.
    """
    try:
        scanner = os.scandir(_TEMP_DIR)
    except OSError:
        # Directory is missing, is not a directory, or cannot be read:
        # there is nothing to clean up.
        return 0

    deleted = 0
    now = time.time()

    # The context manager closes the directory handle even when iteration
    # ends early because of an exception.
    with scanner as entries:
        for entry in entries:
            name = entry.name
            if not (name.startswith("recording_") and name.endswith(".wav")):
                continue
            try:
                if entry.is_file() and now - entry.stat().st_mtime > max_age_seconds:
                    os.unlink(entry.path)
                    deleted += 1
            except OSError:
                # File vanished or is inaccessible; skip it.
                pass

    if deleted:
        logger.debug("Cleaned up %d old voice recordings", deleted)
    return deleted