voice_mode.py
1 """Voice Mode -- Push-to-talk audio recording and playback for the CLI. 2 3 Provides audio capture via sounddevice, WAV encoding via stdlib wave, 4 STT dispatch via tools.transcription_tools, and TTS playback via 5 sounddevice or system audio players. 6 7 Dependencies (optional): 8 pip install sounddevice numpy 9 or: pip install hermes-agent[voice] 10 """ 11 12 import logging 13 import os 14 import platform 15 import re 16 import shutil 17 import subprocess 18 import sys 19 import tempfile 20 import threading 21 import time 22 import wave 23 from typing import Any, Dict, List, Optional 24 25 logger = logging.getLogger(__name__) 26 27 # --------------------------------------------------------------------------- 28 # Lazy audio imports -- never imported at module level to avoid crashing 29 # in headless environments (SSH, Docker, WSL, no PortAudio). 30 # --------------------------------------------------------------------------- 31 32 def _import_audio(): 33 """Lazy-import sounddevice and numpy. Returns (sd, np). 34 35 Raises ImportError or OSError if the libraries are not available 36 (e.g. PortAudio missing on headless servers). 
37 """ 38 import sounddevice as sd 39 import numpy as np 40 return sd, np 41 42 43 def _audio_available() -> bool: 44 """Return True if audio libraries can be imported.""" 45 try: 46 _import_audio() 47 return True 48 except (ImportError, OSError): 49 return False 50 51 52 from hermes_constants import is_termux as _is_termux_environment 53 54 55 def _voice_capture_install_hint() -> str: 56 if _is_termux_environment(): 57 return "pkg install python-numpy portaudio && python -m pip install sounddevice" 58 return "pip install sounddevice numpy" 59 60 61 def _termux_microphone_command() -> Optional[str]: 62 if not _is_termux_environment(): 63 return None 64 return shutil.which("termux-microphone-record") 65 66 67 68 def _termux_api_app_installed() -> bool: 69 if not _is_termux_environment(): 70 return False 71 try: 72 result = subprocess.run( 73 ["pm", "list", "packages", "com.termux.api"], 74 capture_output=True, 75 text=True, 76 timeout=5, 77 check=False, 78 ) 79 return "package:com.termux.api" in (result.stdout or "") 80 except Exception: 81 return False 82 83 84 def _termux_voice_capture_available() -> bool: 85 return _termux_microphone_command() is not None and _termux_api_app_installed() 86 87 88 def detect_audio_environment() -> dict: 89 """Detect if the current environment supports audio I/O. 90 91 Returns dict with 'available' (bool), 'warnings' (list of hard-fail 92 reasons that block voice mode), and 'notices' (list of informational 93 messages that do NOT block voice mode). 
94 """ 95 warnings = [] # hard-fail: these block voice mode 96 notices = [] # informational: logged but don't block 97 termux_mic_cmd = _termux_microphone_command() 98 termux_app_installed = _termux_api_app_installed() 99 termux_capture = bool(termux_mic_cmd and termux_app_installed) 100 101 # SSH detection 102 if any(os.environ.get(v) for v in ('SSH_CLIENT', 'SSH_TTY', 'SSH_CONNECTION')): 103 warnings.append("Running over SSH -- no audio devices available") 104 105 # Docker/Podman container detection 106 # Containerized environments are not automatically voice-incompatible: 107 # some users pass host audio devices through and voice works fine. 108 # So container presence alone is only a notice; we hard-fail later 109 # only if device detection or audio initialization also shows no path. 110 from hermes_constants import is_container 111 container_env = is_container() 112 if container_env: 113 notices.append("Running inside Docker container") 114 115 # WSL detection — PulseAudio bridge makes audio work in WSL. 116 # Only block if PULSE_SERVER is not configured. 117 try: 118 with open('/proc/version', 'r') as f: 119 if 'microsoft' in f.read().lower(): 120 if os.environ.get('PULSE_SERVER'): 121 notices.append("Running in WSL with PulseAudio bridge") 122 else: 123 warnings.append( 124 "Running in WSL -- audio requires PulseAudio bridge.\n" 125 " 1. Set PULSE_SERVER=unix:/mnt/wslg/PulseServer\n" 126 " 2. Create ~/.asoundrc pointing ALSA at PulseAudio\n" 127 " 3. 
Verify with: arecord -d 3 /tmp/test.wav && aplay /tmp/test.wav" 128 ) 129 except (FileNotFoundError, PermissionError, OSError): 130 pass 131 132 # Check audio libraries 133 try: 134 sd, _ = _import_audio() 135 try: 136 devices = sd.query_devices() 137 if not devices: 138 if termux_capture: 139 notices.append("No PortAudio devices detected, but Termux:API microphone capture is available") 140 elif container_env: 141 warnings.append("Running inside Docker container -- no audio devices") 142 else: 143 warnings.append("No audio input/output devices detected") 144 except Exception: 145 # In WSL with PulseAudio, device queries can fail even though 146 # recording/playback works fine. Don't block if PULSE_SERVER is set. 147 if os.environ.get('PULSE_SERVER'): 148 notices.append("Audio device query failed but PULSE_SERVER is set -- continuing") 149 elif termux_capture: 150 notices.append("PortAudio device query failed, but Termux:API microphone capture is available") 151 elif container_env: 152 warnings.append("Running inside Docker container -- no audio devices") 153 else: 154 warnings.append("PortAudio device query failed (audio subsystem error)") 155 except ImportError: 156 if termux_capture: 157 notices.append("Termux:API microphone recording available (sounddevice not required)") 158 elif termux_mic_cmd and not termux_app_installed: 159 warnings.append( 160 "Termux:API Android app is not installed. Install/update the Termux:API app to use termux-microphone-record." 161 ) 162 else: 163 warnings.append(f"Audio libraries not installed ({_voice_capture_install_hint()})") 164 except OSError: 165 if termux_capture: 166 notices.append("Termux:API microphone recording available (PortAudio not required)") 167 elif termux_mic_cmd and not termux_app_installed: 168 warnings.append( 169 "Termux:API Android app is not installed. Install/update the Termux:API app to use termux-microphone-record." 
170 ) 171 elif _is_termux_environment(): 172 warnings.append( 173 "PortAudio system library not found -- install it first:\n" 174 " Termux: pkg install portaudio\n" 175 "Then retry /voice on." 176 ) 177 else: 178 warnings.append( 179 "PortAudio system library not found -- install it first:\n" 180 " Linux: sudo apt-get install libportaudio2\n" 181 " macOS: brew install portaudio\n" 182 "Then retry /voice on." 183 ) 184 185 return { 186 "available": not warnings, 187 "warnings": warnings, 188 "notices": notices, 189 } 190 191 # --------------------------------------------------------------------------- 192 # Recording parameters 193 # --------------------------------------------------------------------------- 194 SAMPLE_RATE = 16000 # Whisper native rate 195 CHANNELS = 1 # Mono 196 DTYPE = "int16" # 16-bit PCM 197 SAMPLE_WIDTH = 2 # bytes per sample (int16) 198 199 # Silence detection defaults 200 SILENCE_RMS_THRESHOLD = 200 # RMS below this = silence (int16 range 0-32767) 201 SILENCE_DURATION_SECONDS = 3.0 # Seconds of continuous silence before auto-stop 202 203 # Temp directory for voice recordings 204 _TEMP_DIR = os.path.join(tempfile.gettempdir(), "hermes_voice") 205 206 207 # ============================================================================ 208 # Audio cues (beep tones) 209 # ============================================================================ 210 def play_beep(frequency: int = 880, duration: float = 0.12, count: int = 1) -> None: 211 """Play a short beep tone using numpy + sounddevice. 212 213 Args: 214 frequency: Tone frequency in Hz (default 880 = A5). 215 duration: Duration of each beep in seconds. 216 count: Number of beeps to play (with short gap between). 
217 """ 218 try: 219 sd, np = _import_audio() 220 except (ImportError, OSError): 221 return 222 try: 223 gap = 0.06 # seconds between beeps 224 samples_per_beep = int(SAMPLE_RATE * duration) 225 samples_per_gap = int(SAMPLE_RATE * gap) 226 227 parts = [] 228 for i in range(count): 229 t = np.linspace(0, duration, samples_per_beep, endpoint=False) 230 # Apply fade in/out to avoid click artifacts 231 tone = np.sin(2 * np.pi * frequency * t) 232 fade_len = min(int(SAMPLE_RATE * 0.01), samples_per_beep // 4) 233 tone[:fade_len] *= np.linspace(0, 1, fade_len) 234 tone[-fade_len:] *= np.linspace(1, 0, fade_len) 235 parts.append((tone * 0.3 * 32767).astype(np.int16)) 236 if i < count - 1: 237 parts.append(np.zeros(samples_per_gap, dtype=np.int16)) 238 239 audio = np.concatenate(parts) 240 sd.play(audio, samplerate=SAMPLE_RATE) 241 # sd.wait() calls Event.wait() without timeout — hangs forever if the 242 # audio device stalls. Poll with a 2s ceiling and force-stop. 243 deadline = time.monotonic() + 2.0 244 while sd.get_stream() and sd.get_stream().active and time.monotonic() < deadline: 245 time.sleep(0.01) 246 sd.stop() 247 except Exception as e: 248 logger.debug("Beep playback failed: %s", e) 249 250 251 # ============================================================================ 252 # Termux Audio Recorder 253 # ============================================================================ 254 class TermuxAudioRecorder: 255 """Recorder backend that uses Termux:API microphone capture commands.""" 256 257 supports_silence_autostop = False 258 259 def __init__(self) -> None: 260 self._lock = threading.Lock() 261 self._recording = False 262 self._start_time = 0.0 263 self._recording_path: Optional[str] = None 264 self._current_rms = 0 265 266 @property 267 def is_recording(self) -> bool: 268 return self._recording 269 270 @property 271 def elapsed_seconds(self) -> float: 272 if not self._recording: 273 return 0.0 274 return time.monotonic() - self._start_time 275 276 
@property 277 def current_rms(self) -> int: 278 return self._current_rms 279 280 def start(self, on_silence_stop=None) -> None: 281 del on_silence_stop # Termux:API does not expose live silence callbacks. 282 mic_cmd = _termux_microphone_command() 283 if not mic_cmd: 284 raise RuntimeError( 285 "Termux voice capture requires the termux-api package and app.\n" 286 "Install with: pkg install termux-api\n" 287 "Then install/update the Termux:API Android app." 288 ) 289 if not _termux_api_app_installed(): 290 raise RuntimeError( 291 "Termux voice capture requires the Termux:API Android app.\n" 292 "Install/update the Termux:API app, then retry /voice on." 293 ) 294 295 with self._lock: 296 if self._recording: 297 return 298 os.makedirs(_TEMP_DIR, exist_ok=True) 299 timestamp = time.strftime("%Y%m%d_%H%M%S") 300 self._recording_path = os.path.join(_TEMP_DIR, f"recording_{timestamp}.aac") 301 302 command = [ 303 mic_cmd, 304 "-f", self._recording_path, 305 "-l", "0", 306 "-e", "aac", 307 "-r", str(SAMPLE_RATE), 308 "-c", str(CHANNELS), 309 ] 310 try: 311 subprocess.run(command, capture_output=True, text=True, timeout=15, check=True) 312 except subprocess.CalledProcessError as e: 313 details = (e.stderr or e.stdout or str(e)).strip() 314 raise RuntimeError(f"Termux microphone start failed: {details}") from e 315 except Exception as e: 316 raise RuntimeError(f"Termux microphone start failed: {e}") from e 317 318 with self._lock: 319 self._start_time = time.monotonic() 320 self._recording = True 321 self._current_rms = 0 322 logger.info("Termux voice recording started") 323 324 def _stop_termux_recording(self) -> None: 325 mic_cmd = _termux_microphone_command() 326 if not mic_cmd: 327 return 328 subprocess.run([mic_cmd, "-q"], capture_output=True, text=True, timeout=15, check=False) 329 330 def stop(self) -> Optional[str]: 331 with self._lock: 332 if not self._recording: 333 return None 334 self._recording = False 335 path = self._recording_path 336 self._recording_path = 
None 337 started_at = self._start_time 338 self._current_rms = 0 339 340 self._stop_termux_recording() 341 if not path or not os.path.isfile(path): 342 return None 343 if time.monotonic() - started_at < 0.3: 344 try: 345 os.unlink(path) 346 except OSError: 347 pass 348 return None 349 if os.path.getsize(path) <= 0: 350 try: 351 os.unlink(path) 352 except OSError: 353 pass 354 return None 355 logger.info("Termux voice recording stopped: %s", path) 356 return path 357 358 def cancel(self) -> None: 359 with self._lock: 360 path = self._recording_path 361 self._recording = False 362 self._recording_path = None 363 self._current_rms = 0 364 try: 365 self._stop_termux_recording() 366 except Exception: 367 pass 368 if path and os.path.isfile(path): 369 try: 370 os.unlink(path) 371 except OSError: 372 pass 373 logger.info("Termux voice recording cancelled") 374 375 def shutdown(self) -> None: 376 self.cancel() 377 378 379 # ============================================================================ 380 # AudioRecorder 381 # ============================================================================ 382 class AudioRecorder: 383 """Thread-safe audio recorder using sounddevice.InputStream. 384 385 Usage:: 386 387 recorder = AudioRecorder() 388 recorder.start(on_silence_stop=my_callback) 389 # ... user speaks ... 390 wav_path = recorder.stop() # returns path to WAV file 391 # or 392 recorder.cancel() # discard without saving 393 394 If ``on_silence_stop`` is provided, recording automatically stops when 395 the user is silent for ``silence_duration`` seconds and calls the callback. 
396 """ 397 398 supports_silence_autostop = True 399 400 def __init__(self) -> None: 401 self._lock = threading.Lock() 402 self._stream: Any = None 403 self._frames: List[Any] = [] 404 self._recording = False 405 self._start_time: float = 0.0 406 # Silence detection state 407 self._has_spoken = False 408 self._speech_start: float = 0.0 # When speech attempt began 409 self._dip_start: float = 0.0 # When current below-threshold dip began 410 self._min_speech_duration: float = 0.3 # Seconds of speech needed to confirm 411 self._max_dip_tolerance: float = 0.3 # Max dip duration before resetting speech 412 self._silence_start: float = 0.0 413 self._resume_start: float = 0.0 # Tracks sustained speech after silence starts 414 self._resume_dip_start: float = 0.0 # Dip tolerance tracker for resume detection 415 self._on_silence_stop = None 416 self._silence_threshold: int = SILENCE_RMS_THRESHOLD 417 self._silence_duration: float = SILENCE_DURATION_SECONDS 418 self._max_wait: float = 15.0 # Max seconds to wait for speech before auto-stop 419 # Peak RMS seen during recording (for speech presence check in stop()) 420 self._peak_rms: int = 0 421 # Live audio level (read by UI for visual feedback) 422 self._current_rms: int = 0 423 self._sample_rate: int = SAMPLE_RATE 424 425 # -- public properties --------------------------------------------------- 426 427 @property 428 def elapsed_seconds(self) -> float: 429 if not self._recording: 430 return 0.0 431 return time.monotonic() - self._start_time 432 433 @property 434 def current_rms(self) -> int: 435 """Current audio input RMS level (0-32767). Updated each audio chunk.""" 436 return self._current_rms 437 438 @property 439 def is_recording(self) -> bool: 440 """Whether audio recording is currently active.""" 441 return self._recording 442 443 # -- public methods ------------------------------------------------------ 444 445 def _ensure_stream(self) -> None: 446 """Create the audio InputStream once and keep it alive. 
447 448 The stream stays open for the lifetime of the recorder. Between 449 recordings the callback simply discards audio chunks (``_recording`` 450 is ``False``). This avoids the CoreAudio bug where closing and 451 re-opening an ``InputStream`` hangs indefinitely on macOS. 452 """ 453 if self._stream is not None: 454 return # already alive 455 456 sd, np = _import_audio() 457 458 def _callback(indata, frames, time_info, status): # noqa: ARG001 459 if status: 460 logger.debug("sounddevice status: %s", status) 461 # When not recording the stream is idle — discard audio. 462 if not self._recording: 463 return 464 self._frames.append(indata.copy()) 465 466 # Compute RMS for level display and silence detection 467 rms = int(np.sqrt(np.mean(indata.astype(np.float64) ** 2))) 468 self._current_rms = rms 469 if rms > self._peak_rms: 470 self._peak_rms = rms 471 472 # Silence detection 473 if self._on_silence_stop is not None: 474 now = time.monotonic() 475 elapsed = now - self._start_time 476 477 if rms > self._silence_threshold: 478 # Audio is above threshold -- this is speech (or noise). 479 self._dip_start = 0.0 # Reset dip tracker 480 if self._speech_start == 0.0: 481 self._speech_start = now 482 elif not self._has_spoken and now - self._speech_start >= self._min_speech_duration: 483 self._has_spoken = True 484 logger.debug("Speech confirmed (%.2fs above threshold)", 485 now - self._speech_start) 486 # After speech is confirmed, only reset silence timer if 487 # speech is sustained (>0.3s above threshold). Brief 488 # spikes from ambient noise should NOT reset the timer. 489 if not self._has_spoken: 490 self._silence_start = 0.0 491 else: 492 # Track resumed speech with dip tolerance. 493 # Brief dips below threshold are normal during speech, 494 # so we mirror the initial speech detection pattern: 495 # start tracking, tolerate short dips, confirm after 0.3s. 
496 self._resume_dip_start = 0.0 # Above threshold — no dip 497 if self._resume_start == 0.0: 498 self._resume_start = now 499 elif now - self._resume_start >= self._min_speech_duration: 500 self._silence_start = 0.0 501 self._resume_start = 0.0 502 elif self._has_spoken: 503 # Below threshold after speech confirmed. 504 # Use dip tolerance before resetting resume tracker — 505 # natural speech has brief dips below threshold. 506 if self._resume_start > 0: 507 if self._resume_dip_start == 0.0: 508 self._resume_dip_start = now 509 elif now - self._resume_dip_start >= self._max_dip_tolerance: 510 # Sustained dip — user actually stopped speaking 511 self._resume_start = 0.0 512 self._resume_dip_start = 0.0 513 elif self._speech_start > 0: 514 # We were in a speech attempt but RMS dipped. 515 # Tolerate brief dips (micro-pauses between syllables). 516 if self._dip_start == 0.0: 517 self._dip_start = now 518 elif now - self._dip_start >= self._max_dip_tolerance: 519 # Dip lasted too long -- genuine silence, reset 520 logger.debug("Speech attempt reset (dip lasted %.2fs)", 521 now - self._dip_start) 522 self._speech_start = 0.0 523 self._dip_start = 0.0 524 525 # Fire silence callback when: 526 # 1. User spoke then went silent for silence_duration, OR 527 # 2. 
No speech detected at all for max_wait seconds 528 should_fire = False 529 if self._has_spoken and rms <= self._silence_threshold: 530 # User was speaking and now is silent 531 if self._silence_start == 0.0: 532 self._silence_start = now 533 elif now - self._silence_start >= self._silence_duration: 534 logger.info("Silence detected (%.1fs), auto-stopping", 535 self._silence_duration) 536 should_fire = True 537 elif not self._has_spoken and elapsed >= self._max_wait: 538 logger.info("No speech within %.0fs, auto-stopping", 539 self._max_wait) 540 should_fire = True 541 542 if should_fire: 543 with self._lock: 544 cb = self._on_silence_stop 545 self._on_silence_stop = None # fire only once 546 if cb: 547 def _safe_cb(): 548 try: 549 cb() 550 except Exception as e: 551 logger.error("Silence callback failed: %s", e, exc_info=True) 552 threading.Thread(target=_safe_cb, daemon=True).start() 553 554 def _default_input_sample_rate() -> int: 555 """Return the default input device sample rate, or SAMPLE_RATE fallback.""" 556 try: 557 default_device = getattr(sd.default, "device", None) 558 if isinstance(default_device, (list, tuple)): 559 input_idx = default_device[0] 560 else: 561 input_idx = default_device 562 if input_idx is None or int(input_idx) < 0: 563 return SAMPLE_RATE 564 info = sd.query_devices(int(input_idx)) 565 default_rate = info.get("default_samplerate") if isinstance(info, dict) else None 566 if not isinstance(default_rate, (int, float)): 567 return SAMPLE_RATE 568 detected = int(default_rate) 569 return detected if detected > 0 else SAMPLE_RATE 570 except Exception: 571 return SAMPLE_RATE 572 573 # Create stream — may block on CoreAudio (first call only). 574 # Prefer Whisper-native 16kHz, but fall back to the device default if 575 # PortAudio/ALSA rejects that rate. 
576 stream = None 577 sample_rate_candidates = [SAMPLE_RATE] 578 detected_rate = _default_input_sample_rate() 579 if detected_rate not in sample_rate_candidates: 580 sample_rate_candidates.append(detected_rate) 581 582 last_error = None 583 for candidate_rate in sample_rate_candidates: 584 try: 585 stream = sd.InputStream( 586 samplerate=candidate_rate, 587 channels=CHANNELS, 588 dtype=DTYPE, 589 callback=_callback, 590 ) 591 stream.start() 592 self._sample_rate = int(candidate_rate) 593 if candidate_rate != SAMPLE_RATE: 594 logger.info( 595 "Voice recorder falling back to input device sample rate %d Hz (preferred %d Hz unsupported)", 596 candidate_rate, 597 SAMPLE_RATE, 598 ) 599 break 600 except Exception as e: 601 last_error = e 602 if stream is not None: 603 try: 604 stream.close() 605 except Exception: 606 pass 607 stream = None 608 609 if stream is None: 610 raise RuntimeError( 611 f"Failed to open audio input stream: {last_error}. " 612 "Check that a microphone is connected and accessible." 613 ) from last_error 614 self._stream = stream 615 616 def start(self, on_silence_stop=None) -> None: 617 """Start capturing audio from the default input device. 618 619 The underlying InputStream is created once and kept alive across 620 recordings. Subsequent calls simply reset detection state and 621 toggle frame collection via ``_recording``. 622 623 Args: 624 on_silence_stop: Optional callback invoked (in a daemon thread) when 625 silence is detected after speech. The callback receives no arguments. 626 Use this to auto-stop recording and trigger transcription. 627 628 Raises ``RuntimeError`` if sounddevice/numpy are not installed 629 or if a recording is already in progress. 
630 """ 631 try: 632 _import_audio() 633 except (ImportError, OSError) as e: 634 raise RuntimeError( 635 "Voice mode requires sounddevice and numpy.\n" 636 f"Install with: {sys.executable} -m pip install sounddevice numpy" 637 ) from e 638 639 with self._lock: 640 if self._recording: 641 return # already recording 642 643 self._frames = [] 644 self._start_time = time.monotonic() 645 self._has_spoken = False 646 self._speech_start = 0.0 647 self._dip_start = 0.0 648 self._silence_start = 0.0 649 self._resume_start = 0.0 650 self._resume_dip_start = 0.0 651 self._peak_rms = 0 652 self._current_rms = 0 653 self._on_silence_stop = on_silence_stop 654 655 # Ensure the persistent stream is alive (no-op after first call). 656 self._ensure_stream() 657 658 with self._lock: 659 self._recording = True 660 logger.info("Voice recording started (rate=%d, channels=%d)", self._sample_rate, CHANNELS) 661 662 def _close_stream_with_timeout(self, timeout: float = 3.0) -> None: 663 """Close the audio stream with a timeout to prevent CoreAudio hangs.""" 664 if self._stream is None: 665 return 666 667 stream = self._stream 668 self._stream = None 669 670 def _do_close(): 671 try: 672 stream.stop() 673 stream.close() 674 except Exception: 675 pass 676 677 t = threading.Thread(target=_do_close, daemon=True) 678 t.start() 679 # Poll in short intervals so Ctrl+C is not blocked 680 deadline = __import__("time").monotonic() + timeout 681 while t.is_alive() and __import__("time").monotonic() < deadline: 682 t.join(timeout=0.1) 683 if t.is_alive(): 684 logger.warning("Audio stream close timed out after %.1fs — forcing ahead", timeout) 685 686 def stop(self) -> Optional[str]: 687 """Stop recording and write captured audio to a WAV file. 688 689 The underlying stream is kept alive for reuse — only frame 690 collection is stopped. 691 692 Returns: 693 Path to the WAV file, or ``None`` if no audio was captured. 
694 """ 695 with self._lock: 696 if not self._recording: 697 return None 698 699 self._recording = False 700 self._current_rms = 0 701 # Stream stays alive — no close needed. 702 703 if not self._frames: 704 return None 705 706 # Concatenate frames and write WAV 707 _, np = _import_audio() 708 audio_data = np.concatenate(self._frames, axis=0) 709 self._frames = [] 710 711 elapsed = time.monotonic() - self._start_time 712 logger.info("Voice recording stopped (%.1fs, %d samples)", elapsed, len(audio_data)) 713 714 # Skip very short recordings (< 0.3s of audio) 715 min_samples = int(self._sample_rate * 0.3) 716 if len(audio_data) < min_samples: 717 logger.debug("Recording too short (%d samples), discarding", len(audio_data)) 718 return None 719 720 # Skip silent recordings using peak RMS (not overall average, which 721 # gets diluted by silence at the end of the recording). 722 if self._peak_rms < SILENCE_RMS_THRESHOLD: 723 logger.info("Recording too quiet (peak RMS=%d < %d), discarding", 724 self._peak_rms, SILENCE_RMS_THRESHOLD) 725 return None 726 727 return self._write_wav(audio_data, self._sample_rate) 728 729 def cancel(self) -> None: 730 """Stop recording and discard all captured audio. 731 732 The underlying stream is kept alive for reuse. 733 """ 734 with self._lock: 735 self._recording = False 736 self._frames = [] 737 self._on_silence_stop = None 738 self._current_rms = 0 739 logger.info("Voice recording cancelled") 740 741 def shutdown(self) -> None: 742 """Release the audio stream. 
Call when voice mode is disabled.""" 743 with self._lock: 744 self._recording = False 745 self._frames = [] 746 self._on_silence_stop = None 747 # Close stream OUTSIDE the lock to avoid deadlock with audio callback 748 self._close_stream_with_timeout() 749 logger.info("AudioRecorder shut down") 750 751 # -- private helpers ----------------------------------------------------- 752 753 @staticmethod 754 def _write_wav(audio_data, sample_rate: int) -> str: 755 """Write numpy int16 audio data to a WAV file. 756 757 Returns the file path. 758 """ 759 os.makedirs(_TEMP_DIR, exist_ok=True) 760 timestamp = time.strftime("%Y%m%d_%H%M%S") 761 wav_path = os.path.join(_TEMP_DIR, f"recording_{timestamp}.wav") 762 763 with wave.open(wav_path, "wb") as wf: 764 wf.setnchannels(CHANNELS) 765 wf.setsampwidth(SAMPLE_WIDTH) 766 wf.setframerate(sample_rate) 767 wf.writeframes(audio_data.tobytes()) 768 769 file_size = os.path.getsize(wav_path) 770 logger.info("WAV written: %s (%d bytes)", wav_path, file_size) 771 return wav_path 772 773 774 def create_audio_recorder() -> AudioRecorder | TermuxAudioRecorder: 775 """Return the best recorder backend for the current environment.""" 776 if _termux_voice_capture_available(): 777 return TermuxAudioRecorder() 778 return AudioRecorder() 779 780 781 # ============================================================================ 782 # Whisper hallucination filter 783 # ============================================================================ 784 # Whisper commonly hallucinates these phrases on silent/near-silent audio. 
785 WHISPER_HALLUCINATIONS = { 786 "thank you.", 787 "thank you", 788 "thanks for watching.", 789 "thanks for watching", 790 "subscribe to my channel.", 791 "subscribe to my channel", 792 "like and subscribe.", 793 "like and subscribe", 794 "please subscribe.", 795 "please subscribe", 796 "thank you for watching.", 797 "thank you for watching", 798 "bye.", 799 "bye", 800 "you", 801 "the end.", 802 "the end", 803 # Non-English hallucinations (common on silence) 804 "продолжение следует", 805 "продолжение следует...", 806 "sous-titres", 807 "sous-titres réalisés par la communauté d'amara.org", 808 "sottotitoli creati dalla comunità amara.org", 809 "untertitel von stephanie geiges", 810 "amara.org", 811 "www.mooji.org", 812 "ご視聴ありがとうございました", 813 } 814 815 # Regex patterns for repetitive hallucinations (e.g. "Thank you. Thank you. Thank you.") 816 _HALLUCINATION_REPEAT_RE = re.compile( 817 r'^(?:thank you|thanks|bye|you|ok|okay|the end|\.|\s|,|!)+$', 818 flags=re.IGNORECASE, 819 ) 820 821 822 def is_whisper_hallucination(transcript: str) -> bool: 823 """Check if a transcript is a known Whisper hallucination on silence.""" 824 cleaned = transcript.strip().lower() 825 if not cleaned: 826 return True 827 # Exact match against known phrases 828 if cleaned.rstrip('.!') in WHISPER_HALLUCINATIONS or cleaned in WHISPER_HALLUCINATIONS: 829 return True 830 # Repetitive patterns (e.g. "Thank you. Thank you. Thank you. you") 831 if _HALLUCINATION_REPEAT_RE.match(cleaned): 832 return True 833 return False 834 835 836 # ============================================================================ 837 # STT dispatch 838 # ============================================================================ 839 def transcribe_recording(wav_path: str, model: Optional[str] = None) -> Dict[str, Any]: 840 """Transcribe a WAV recording using the existing Whisper pipeline. 841 842 Delegates to ``tools.transcription_tools.transcribe_audio()``. 
843 Filters out known Whisper hallucinations on silent audio. 844 845 Args: 846 wav_path: Path to the WAV file. 847 model: Whisper model name (default: from config or ``whisper-1``). 848 849 Returns: 850 Dict with ``success``, ``transcript``, and optionally ``error``. 851 """ 852 from tools.transcription_tools import transcribe_audio 853 854 result = transcribe_audio(wav_path, model=model) 855 856 # Filter out Whisper hallucinations (common on silent/near-silent audio) 857 if result.get("success") and is_whisper_hallucination(result.get("transcript", "")): 858 logger.info("Filtered Whisper hallucination: %r", result["transcript"]) 859 return {"success": True, "transcript": "", "filtered": True} 860 861 return result 862 863 864 # ============================================================================ 865 # Audio playback (interruptable) 866 # ============================================================================ 867 868 # Global reference to the active playback process so it can be interrupted. 869 _active_playback: Optional[subprocess.Popen] = None 870 _playback_stop_requested = False 871 _playback_lock = threading.Lock() 872 873 874 def stop_playback() -> None: 875 """Interrupt the currently playing audio (if any).""" 876 global _active_playback, _playback_stop_requested 877 with _playback_lock: 878 proc = _active_playback 879 if proc and proc.poll() is None: 880 _playback_stop_requested = True 881 _active_playback = None 882 if proc and proc.poll() is None: 883 try: 884 proc.terminate() 885 logger.info("Audio playback interrupted") 886 except Exception: 887 pass 888 # Also stop sounddevice playback if active 889 try: 890 sd, _ = _import_audio() 891 sd.stop() 892 except Exception: 893 pass 894 895 896 def play_audio_file(file_path: str) -> bool: 897 """Play an audio file through the default output device. 898 899 Strategy: 900 1. WAV files via ``sounddevice.play()`` when available. 901 2. 
def play_audio_file(file_path: str) -> bool:
    """Play an audio file through the default output device.

    Strategy:
    1. WAV files via ``sounddevice.play()`` when available.
    2. System commands: ``afplay`` (macOS), ``paplay`` (Linux PulseAudio/
       PipeWire), ``ffplay`` (cross-platform), ``aplay`` (Linux ALSA).

    Playback can be interrupted by calling ``stop_playback()``.

    Args:
        file_path: Path of the audio file to play.

    Returns:
        ``True`` if playback succeeded, ``False`` otherwise (file missing,
        playback interrupted, or every candidate player failed).
    """
    global _active_playback, _playback_stop_requested

    if not os.path.isfile(file_path):
        logger.warning("Audio file not found: %s", file_path)
        return False

    # Try sounddevice for WAV files (extension match is case-insensitive).
    if file_path.lower().endswith(".wav"):
        try:
            sd, np = _import_audio()
            with wave.open(file_path, "rb") as wf:
                channels = wf.getnchannels()
                sample_width = wf.getsampwidth()
                sample_rate = wf.getframerate()
                frames = wf.readframes(wf.getnframes())

            # Decode with the file's real sample width. Widths without a
            # matching dtype (e.g. 24-bit) raise and fall through to the
            # system players below instead of being played back as noise.
            dtype = {1: np.uint8, 2: np.int16, 4: np.int32}.get(sample_width)
            if dtype is None:
                raise ValueError(f"unsupported WAV sample width: {sample_width} bytes")
            audio_data = np.frombuffer(frames, dtype=dtype)
            # WAV frames are interleaved; sounddevice reads the columns of a
            # 2-D array as channels and treats a 1-D array as mono.
            if channels > 1:
                audio_data = audio_data.reshape(-1, channels)

            sd.play(audio_data, samplerate=sample_rate)
            # sd.wait() calls Event.wait() without timeout — hangs forever if
            # the audio device stalls. Poll with a ceiling and force-stop.
            duration_secs = len(audio_data) / sample_rate
            deadline = time.monotonic() + duration_secs + 2.0
            while sd.get_stream() and sd.get_stream().active and time.monotonic() < deadline:
                time.sleep(0.01)
            sd.stop()
            return True
        except (ImportError, OSError):
            pass  # audio libs not available, fall through to system players
        except Exception as e:
            logger.debug("sounddevice playback failed: %s", e)

    # Fall back to system audio players (using Popen for interruptability)
    system = platform.system()
    ffplay_cmd = ["ffplay", "-nodisp", "-autoexit", "-loglevel", "quiet", file_path]
    players = []

    if system == "Darwin":
        players.append(["afplay", file_path])
        players.append(ffplay_cmd)
    elif system == "Linux":
        # Prefer PulseAudio/PipeWire-native playback on Linux.
        # ffplay can be installed yet fail to open the PipeWire stream while
        # paplay succeeds. If ffplay runs first, CLI TTS can fail silently.
        players.append(["paplay", file_path])
        players.append(ffplay_cmd)
        players.append(["aplay", "-q", file_path])
    else:
        players.append(ffplay_cmd)

    for cmd in players:
        if not shutil.which(cmd[0]):
            continue
        proc = None
        try:
            proc = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
            # Publish the process so stop_playback() can terminate it.
            with _playback_lock:
                _active_playback = proc
                _playback_stop_requested = False
            exit_code = proc.wait(timeout=300)
            with _playback_lock:
                interrupted = _playback_stop_requested
                if _active_playback is proc:
                    _active_playback = None
                if interrupted:
                    _playback_stop_requested = False
            if interrupted:
                logger.info("System player %s interrupted by user", cmd[0])
                return False
            if exit_code == 0:
                return True
            logger.debug("System player %s exited with code %s", cmd[0], exit_code)
        except subprocess.TimeoutExpired:
            logger.warning("System player %s timed out, killing process", cmd[0])
            if proc is not None:
                proc.kill()
                proc.wait()
            with _playback_lock:
                if _active_playback is proc:
                    _active_playback = None
        except Exception as e:
            logger.debug("System player %s failed: %s", cmd[0], e)
            with _playback_lock:
                if proc is not None and _active_playback is proc:
                    _active_playback = None

    logger.warning("No audio player available for %s", file_path)
    return False
def check_voice_requirements() -> Dict[str, Any]:
    """Check if all voice mode requirements are met.

    Returns:
        Dict with ``available``, ``audio_available``, ``stt_available``,
        ``missing_packages``, ``details`` (one human-readable line per
        check), and ``environment`` (the dict returned by
        ``detect_audio_environment()``).
    """
    # Determine STT provider availability
    from tools.transcription_tools import _get_provider, _load_stt_config, is_stt_enabled

    stt_config = _load_stt_config()
    stt_enabled = is_stt_enabled(stt_config)
    stt_provider = _get_provider(stt_config)
    stt_available = bool(stt_enabled) and stt_provider != "none"

    missing: List[str] = []
    termux_capture = _termux_voice_capture_available()
    has_audio = bool(_audio_available() or termux_capture)

    if not has_audio:
        missing.extend(["sounddevice", "numpy"])

    # Environment detection
    env_check = detect_audio_environment()

    available = bool(has_audio and stt_available and env_check["available"])
    details_parts: List[str] = []

    if termux_capture:
        details_parts.append("Audio capture: OK (Termux:API microphone)")
    elif has_audio:
        details_parts.append("Audio capture: OK")
    else:
        details_parts.append(f"Audio capture: MISSING ({_voice_capture_install_hint()})")

    # Friendly names for the providers this report knows about; any other
    # provider is shown under its raw name.
    provider_labels = {
        "local": "local faster-whisper",
        "groq": "Groq",
        "openai": "OpenAI",
    }
    if not stt_enabled:
        details_parts.append("STT provider: DISABLED in config (stt.enabled: false)")
    elif stt_provider == "none":
        details_parts.append(
            "STT provider: MISSING (pip install faster-whisper, "
            "or set GROQ_API_KEY / VOICE_TOOLS_OPENAI_KEY)"
        )
    else:
        # Mirrors ``stt_available``: every provider other than "none" counts
        # as usable, so the report must not call it MISSING.
        label = provider_labels.get(stt_provider, stt_provider)
        details_parts.append(f"STT provider: OK ({label})")

    for warning in env_check["warnings"]:
        details_parts.append(f"Environment: {warning}")
    for notice in env_check.get("notices", []):
        details_parts.append(f"Environment: {notice}")

    return {
        "available": available,
        "audio_available": has_audio,
        "stt_available": stt_available,
        "missing_packages": missing,
        "details": "\n".join(details_parts),
        "environment": env_check,
    }
# ============================================================================
# Temp file cleanup
# ============================================================================
def cleanup_temp_recordings(max_age_seconds: int = 3600) -> int:
    """Remove old temporary voice recording files.

    Only regular files in ``_TEMP_DIR`` named ``recording_*.wav`` are
    considered. Files that cannot be inspected or removed are skipped.

    Args:
        max_age_seconds: Delete files older than this (default: 1 hour).

    Returns:
        Number of files deleted.
    """
    if not os.path.isdir(_TEMP_DIR):
        return 0

    deleted = 0
    now = time.time()

    try:
        # The context manager closes the directory handle even if the loop
        # exits early.
        with os.scandir(_TEMP_DIR) as entries:
            for entry in entries:
                name = entry.name
                if not (name.startswith("recording_") and name.endswith(".wav")):
                    continue
                try:
                    # is_file() and stat() can both raise OSError, so both
                    # checks live inside the per-file guard.
                    if not entry.is_file():
                        continue
                    if now - entry.stat().st_mtime > max_age_seconds:
                        os.unlink(entry.path)
                        deleted += 1
                except OSError:
                    continue  # vanished or not removable, leave it alone
    except OSError as exc:
        # Directory disappeared or became unreadable after the isdir() check.
        logger.debug("Could not scan voice recording directory: %s", exc)

    if deleted:
        logger.debug("Cleaned up %d old voice recordings", deleted)
    return deleted