// src/hooks/useVoice.ts
   1  // React hook for hold-to-talk voice input using Anthropic voice_stream STT.
   2  //
   3  // Hold the keybinding to record; release to stop and submit.  Auto-repeat
   4  // key events reset an internal timer — when no keypress arrives within
   5  // RELEASE_TIMEOUT_MS the recording stops automatically.  Uses the native
   6  // audio module (macOS) or SoX for recording, and Anthropic's voice_stream
   7  // endpoint (conversation_engine) for STT.
   8  
   9  import { useCallback, useEffect, useRef, useState } from 'react'
  10  import { useSetVoiceState } from '../context/voice.js'
  11  import { useTerminalFocus } from '../ink/hooks/use-terminal-focus.js'
  12  import {
  13    type AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  14    logEvent,
  15  } from '../services/analytics/index.js'
  16  import { getVoiceKeyterms } from '../services/voiceKeyterms.js'
  17  import {
  18    connectVoiceStream,
  19    type FinalizeSource,
  20    isVoiceStreamAvailable,
  21    type VoiceStreamConnection,
  22  } from '../services/voiceStreamSTT.js'
  23  import { logForDebugging } from '../utils/debug.js'
  24  import { toError } from '../utils/errors.js'
  25  import { getSystemLocaleLanguage } from '../utils/intl.js'
  26  import { logError } from '../utils/log.js'
  27  import { getInitialSettings } from '../utils/settings/settings.js'
  28  import { sleep } from '../utils/sleep.js'
  29  
  30  // ─── Language normalization ─────────────────────────────────────────────
  31  
  32  const DEFAULT_STT_LANGUAGE = 'en'
  33  
  34  // Maps language names (English and native) to BCP-47 codes supported by
  35  // the voice_stream Deepgram backend.  Keys must be lowercase.
  36  //
  37  // This list must be a SUBSET of the server-side supported_language_codes
  38  // allowlist (GrowthBook: speech_to_text_voice_stream_config).
  39  // If the CLI sends a code the server rejects, the WebSocket closes with
  40  // 1008 "Unsupported language" and voice breaks.  Unsupported languages
  41  // fall back to DEFAULT_STT_LANGUAGE so recording still works.
  42  const LANGUAGE_NAME_TO_CODE: Record<string, string> = {
  43    english: 'en',
  44    spanish: 'es',
  45    español: 'es',
  46    espanol: 'es',
  47    french: 'fr',
  48    français: 'fr',
  49    francais: 'fr',
  50    japanese: 'ja',
  51    日本語: 'ja',
  52    german: 'de',
  53    deutsch: 'de',
  54    portuguese: 'pt',
  55    português: 'pt',
  56    portugues: 'pt',
  57    italian: 'it',
  58    italiano: 'it',
  59    korean: 'ko',
  60    한국어: 'ko',
  61    hindi: 'hi',
  62    हिन्दी: 'hi',
  63    हिंदी: 'hi',
  64    indonesian: 'id',
  65    'bahasa indonesia': 'id',
  66    bahasa: 'id',
  67    russian: 'ru',
  68    русский: 'ru',
  69    polish: 'pl',
  70    polski: 'pl',
  71    turkish: 'tr',
  72    türkçe: 'tr',
  73    turkce: 'tr',
  74    dutch: 'nl',
  75    nederlands: 'nl',
  76    ukrainian: 'uk',
  77    українська: 'uk',
  78    greek: 'el',
  79    ελληνικά: 'el',
  80    czech: 'cs',
  81    čeština: 'cs',
  82    cestina: 'cs',
  83    danish: 'da',
  84    dansk: 'da',
  85    swedish: 'sv',
  86    svenska: 'sv',
  87    norwegian: 'no',
  88    norsk: 'no',
  89  }
  90  
  91  // Subset of the GrowthBook speech_to_text_voice_stream_config allowlist.
  92  // Sending a code not in the server allowlist closes the connection.
  93  const SUPPORTED_LANGUAGE_CODES = new Set([
  94    'en',
  95    'es',
  96    'fr',
  97    'ja',
  98    'de',
  99    'pt',
 100    'it',
 101    'ko',
 102    'hi',
 103    'id',
 104    'ru',
 105    'pl',
 106    'tr',
 107    'nl',
 108    'uk',
 109    'el',
 110    'cs',
 111    'da',
 112    'sv',
 113    'no',
 114  ])
 115  
 116  // Normalize a language preference string (from settings.language) to a
 117  // BCP-47 code supported by the voice_stream endpoint.  Returns the
 118  // default language if the input cannot be resolved.  When the input is
 119  // non-empty but unsupported, fellBackFrom is set to the original input so
 120  // callers can surface a warning.
 121  export function normalizeLanguageForSTT(language: string | undefined): {
 122    code: string
 123    fellBackFrom?: string
 124  } {
 125    if (!language) return { code: DEFAULT_STT_LANGUAGE }
 126    const lower = language.toLowerCase().trim()
 127    if (!lower) return { code: DEFAULT_STT_LANGUAGE }
 128    if (SUPPORTED_LANGUAGE_CODES.has(lower)) return { code: lower }
 129    const fromName = LANGUAGE_NAME_TO_CODE[lower]
 130    if (fromName) return { code: fromName }
 131    const base = lower.split('-')[0]
 132    if (base && SUPPORTED_LANGUAGE_CODES.has(base)) return { code: base }
 133    return { code: DEFAULT_STT_LANGUAGE, fellBackFrom: language }
 134  }
 135  
// Lazy-loaded voice module. We defer importing voice.ts (and its native
// audio-capture-napi dependency) until voice input is actually activated.
// On macOS, loading the native audio module can trigger a TCC microphone
// permission prompt — we must avoid that until voice input is actually enabled.
type VoiceModule = typeof import('../services/voice.js')
// Module-level cache: shared by every useVoice instance in this process,
// populated by the lazy-import effect and the focus-driven effect below.
let voiceModule: VoiceModule | null = null

// Session lifecycle: idle → recording (capture active) → processing
// (finalizing the transcript over the WebSocket) → idle.
type VoiceState = 'idle' | 'recording' | 'processing'

type UseVoiceOptions = {
  // Receives the final assembled transcript when a session completes.
  onTranscript: (text: string) => void
  // Optional sink for user-facing failure messages (mic access, network, etc).
  onError?: (message: string) => void
  // Gates the lazy module preload and focus-mode recording.
  enabled: boolean
  // When true, recording follows terminal focus instead of key hold.
  focusMode: boolean
}

type UseVoiceReturn = {
  state: VoiceState
  // Feed hold-to-talk key events here; fallbackMs tunes release-timer
  // arming — see REPEAT_FALLBACK_MS / FIRST_PRESS_FALLBACK_MS below.
  handleKeyEvent: (fallbackMs?: number) => void
}

// Gap (ms) between auto-repeat key events that signals key release.
// Terminal auto-repeat typically fires every 30-80ms; 200ms comfortably
// covers jitter while still feeling responsive.
const RELEASE_TIMEOUT_MS = 200

// Fallback (ms) to arm the release timer if no auto-repeat is seen.
// macOS default key repeat delay is ~500ms; 600ms gives headroom.
// If the user tapped and released before auto-repeat started, this
// ensures the release timer gets armed and recording stops.
//
// For modifier-combo first-press activation (handleKeyEvent called at
// t=0, before any auto-repeat), callers should pass FIRST_PRESS_FALLBACK_MS
// instead — the gap to the next keypress is the OS initial repeat *delay*
// (up to ~2s on macOS with slider at "Long"), not the repeat *rate*.
const REPEAT_FALLBACK_MS = 600
export const FIRST_PRESS_FALLBACK_MS = 2000

// How long (ms) to keep a focus-mode session alive without any speech
// before tearing it down to free the WebSocket connection. Re-arms on
// the next focus cycle (blur → refocus).
const FOCUS_SILENCE_TIMEOUT_MS = 5_000

// Number of bars shown in the recording waveform visualizer.
const AUDIO_LEVEL_BARS = 16
 181  
 182  // Compute RMS amplitude from a 16-bit signed PCM buffer and return a
 183  // normalized 0-1 value. A sqrt curve spreads quieter levels across more
 184  // of the visual range so the waveform uses the full set of block heights.
 185  export function computeLevel(chunk: Buffer): number {
 186    const samples = chunk.length >> 1 // 16-bit = 2 bytes per sample
 187    if (samples === 0) return 0
 188    let sumSq = 0
 189    for (let i = 0; i < chunk.length - 1; i += 2) {
 190      // Read 16-bit signed little-endian
 191      const sample = ((chunk[i]! | (chunk[i + 1]! << 8)) << 16) >> 16
 192      sumSq += sample * sample
 193    }
 194    const rms = Math.sqrt(sumSq / samples)
 195    const normalized = Math.min(rms / 2000, 1)
 196    return Math.sqrt(normalized)
 197  }
 198  
/**
 * Hold-to-talk (and focus-driven) voice input hook. Streams microphone
 * audio to the voice_stream STT endpoint and delivers the assembled
 * transcript via `onTranscript`; user-facing failures go to `onError`.
 * Returns the current voice state plus a key-event handler
 * (see `UseVoiceReturn`).
 */
export function useVoice({
  onTranscript,
  onError,
  enabled,
  focusMode,
}: UseVoiceOptions): UseVoiceReturn {
  const [state, setState] = useState<VoiceState>('idle')
  // Mirrors `state` for synchronous reads inside timers and audio callbacks.
  const stateRef = useRef<VoiceState>('idle')
  // Active voice_stream WebSocket connection, or null while (re)connecting.
  const connectionRef = useRef<VoiceStreamConnection | null>(null)
  // Final transcript fragments accumulated for the current session.
  const accumulatedRef = useRef('')
  const onTranscriptRef = useRef(onTranscript)
  const onErrorRef = useRef(onError)
  const cleanupTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null)
  const releaseTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null)
  // True once we've seen a second keypress (auto-repeat) while recording.
  // The OS key repeat delay (~500ms on macOS) means the first keypress is
  // solo — arming the release timer before auto-repeat starts would cause
  // a false release.
  const seenRepeatRef = useRef(false)
  const repeatFallbackTimerRef = useRef<ReturnType<typeof setTimeout> | null>(
    null,
  )
  // True when the current recording session was started by terminal focus
  // (not by a keypress). Focus-driven sessions end on blur, not key release.
  const focusTriggeredRef = useRef(false)
  // Timer that tears down the session after prolonged silence in focus mode.
  const focusSilenceTimerRef = useRef<ReturnType<typeof setTimeout> | null>(
    null,
  )
  // Set when a focus-mode session is torn down due to silence. Prevents
  // the focus effect from immediately restarting. Cleared on blur so the
  // next focus cycle re-arms recording.
  const silenceTimedOutRef = useRef(false)
  // Epoch-ms timestamp of when the current recording started.
  const recordingStartRef = useRef(0)
  // Incremented on each startRecordingSession(). Callbacks capture their
  // generation and bail if a newer session has started — prevents a zombie
  // slow-connecting WS from an abandoned session from overwriting
  // connectionRef mid-way through the next session.
  const sessionGenRef = useRef(0)
  // True if the early-error retry fired during this session.
  // Tracked for the tengu_voice_recording_completed analytics event.
  const retryUsedRef = useRef(false)
  // Full audio captured this session, kept for silent-drop replay. ~1% of
  // sessions get a sticky-broken CE pod that accepts audio but returns zero
  // transcripts (anthropics/anthropic#287008 session-sticky variant); when
  // finalize() resolves via no_data_timeout with hadAudioSignal=true, we
  // replay the buffer on a fresh WS once. Bounded: 32KB/s × ~60s max ≈ 2MB.
  const fullAudioRef = useRef<Buffer[]>([])
  // True once the silent-drop replay has been attempted this session.
  const silentDropRetriedRef = useRef(false)
  // Bumped when the early-error retry is scheduled. Captured per
  // attemptConnect — onError swallows stale-gen events (conn 1's
  // trailing close-error) but surfaces current-gen ones (conn 2's
  // genuine failure). Same shape as sessionGenRef, one level down.
  const attemptGenRef = useRef(0)
  // Running total of chars flushed in focus mode (each final transcript is
  // injected immediately and accumulatedRef reset). Added to transcriptChars
  // in the completed event so focus-mode sessions don't false-positive as
  // silent-drops (transcriptChars=0 despite successful transcription).
  const focusFlushedCharsRef = useRef(0)
  // True if at least one audio chunk with non-trivial signal was received.
  // Used to distinguish "microphone is silent/inaccessible" from "speech not detected".
  const hasAudioSignalRef = useRef(false)
  // True once onReady fired for the current session. Unlike connectionRef
  // (which cleanup() nulls), this survives effect-order races where Effect 3
  // cleanup runs before Effect 2's finishRecording() — e.g. /voice toggled
  // off mid-recording in focus mode. Used for the wsConnected analytics
  // dimension and error-message branching. Reset in startRecordingSession.
  const everConnectedRef = useRef(false)
  // Rolling window of recent audio levels for the waveform visualizer.
  const audioLevelsRef = useRef<number[]>([])
  const isFocused = useTerminalFocus()
  const setVoiceState = useSetVoiceState()

  // Keep callback refs current without triggering re-renders
  onTranscriptRef.current = onTranscript
  onErrorRef.current = onError
 274  
 275    function updateState(newState: VoiceState): void {
 276      stateRef.current = newState
 277      setState(newState)
 278      setVoiceState(prev => {
 279        if (prev.voiceState === newState) return prev
 280        return { ...prev, voiceState: newState }
 281      })
 282    }
 283  
 284    const cleanup = useCallback((): void => {
 285      // Stale any in-flight session (main connection isStale(), replay
 286      // isStale(), finishRecording continuation). Without this, disabling
 287      // voice during the replay window lets the stale replay open a WS,
 288      // accumulate transcript, and inject it after voice was torn down.
 289      sessionGenRef.current++
 290      if (cleanupTimerRef.current) {
 291        clearTimeout(cleanupTimerRef.current)
 292        cleanupTimerRef.current = null
 293      }
 294      if (releaseTimerRef.current) {
 295        clearTimeout(releaseTimerRef.current)
 296        releaseTimerRef.current = null
 297      }
 298      if (repeatFallbackTimerRef.current) {
 299        clearTimeout(repeatFallbackTimerRef.current)
 300        repeatFallbackTimerRef.current = null
 301      }
 302      if (focusSilenceTimerRef.current) {
 303        clearTimeout(focusSilenceTimerRef.current)
 304        focusSilenceTimerRef.current = null
 305      }
 306      silenceTimedOutRef.current = false
 307      voiceModule?.stopRecording()
 308      if (connectionRef.current) {
 309        connectionRef.current.close()
 310        connectionRef.current = null
 311      }
 312      accumulatedRef.current = ''
 313      audioLevelsRef.current = []
 314      fullAudioRef.current = []
 315      setVoiceState(prev => {
 316        if (prev.voiceInterimTranscript === '' && !prev.voiceAudioLevels.length)
 317          return prev
 318        return { ...prev, voiceInterimTranscript: '', voiceAudioLevels: [] }
 319      })
 320    }, [setVoiceState])
 321  
  // Ends the current recording session. Phases: (1) synchronously capture
  // all ref-backed session facts and stop capture; (2) finalize the WS
  // stream; (3) if the silent-drop signature is detected, replay the
  // buffered audio once on a fresh connection; (4) emit analytics and
  // either inject the transcript or surface a user-facing error.
  function finishRecording(): void {
    logForDebugging(
      '[voice] finishRecording: stopping recording, transitioning to processing',
    )
    // Session ending — stale any in-flight attempt so its late onError
    // (conn 2 responding after user released key) doesn't double-fire on
    // top of the "check network" message below.
    attemptGenRef.current++
    // Capture focusTriggered BEFORE clearing it — needed as an event dimension
    // so BigQuery can filter out passive focus-mode auto-recordings (user focused
    // terminal without speaking → ambient noise sets hadAudioSignal=true → false
    // silent-drop signature). focusFlushedCharsRef fixes transcriptChars accuracy
    // for sessions WITH speech; focusTriggered enables filtering sessions WITHOUT.
    const focusTriggered = focusTriggeredRef.current
    focusTriggeredRef.current = false
    updateState('processing')
    voiceModule?.stopRecording()
    // Capture duration BEFORE the finalize round-trip so that the WebSocket
    // wait time is not included (otherwise a quick tap looks like > 2s).
    // All ref-backed values are captured here, BEFORE the async boundary —
    // a keypress during the finalize wait can start a new session and reset
    // these refs (e.g. focusFlushedCharsRef = 0 in startRecordingSession),
    // reproducing the silent-drop false-positive this ref exists to prevent.
    const recordingDurationMs = Date.now() - recordingStartRef.current
    const hadAudioSignal = hasAudioSignalRef.current
    const retried = retryUsedRef.current
    const focusFlushedChars = focusFlushedCharsRef.current
    // wsConnected distinguishes "backend received audio but dropped it" (the
    // bug backend PR #287008 fixes) from "WS handshake never completed" —
    // in the latter case audio is still in audioBuffer, never reached the
    // server, but hasAudioSignalRef is already true from ambient noise.
    const wsConnected = everConnectedRef.current
    // Capture generation BEFORE the .then() — if a new session starts during
    // the finalize wait, sessionGenRef has already advanced by the time the
    // continuation runs, so capturing inside the .then() would yield the new
    // session's gen and every staleness check would be a no-op.
    const myGen = sessionGenRef.current
    const isStale = () => sessionGenRef.current !== myGen
    logForDebugging('[voice] Recording stopped')

    // Send finalize and wait for the WebSocket to close before reading the
    // accumulated transcript.  The close handler promotes any unreported
    // interim text to final, so we must wait for it to fire.
    const finalizePromise: Promise<FinalizeSource | undefined> =
      connectionRef.current
        ? connectionRef.current.finalize()
        : Promise.resolve(undefined)

    void finalizePromise
      .then(async finalizeSource => {
        if (isStale()) return
        // Silent-drop replay: when the server accepted audio (wsConnected),
        // the mic captured real signal (hadAudioSignal), but finalize timed
        // out with zero transcript — the ~1% session-sticky CE-pod bug.
        // Replay the buffered audio on a fresh connection once. A 250ms
        // backoff clears the same-pod rapid-reconnect race (same gap as the
        // early-error retry path below).
        if (
          finalizeSource === 'no_data_timeout' &&
          hadAudioSignal &&
          wsConnected &&
          !focusTriggered &&
          focusFlushedChars === 0 &&
          accumulatedRef.current.trim() === '' &&
          !silentDropRetriedRef.current &&
          fullAudioRef.current.length > 0
        ) {
          silentDropRetriedRef.current = true
          logForDebugging(
            `[voice] Silent-drop detected (no_data_timeout, ${String(fullAudioRef.current.length)} chunks); replaying on fresh connection`,
          )
          logEvent('tengu_voice_silent_drop_replay', {
            recordingDurationMs,
            chunkCount: fullAudioRef.current.length,
          })
          if (connectionRef.current) {
            connectionRef.current.close()
            connectionRef.current = null
          }
          const replayBuffer = fullAudioRef.current
          await sleep(250)
          if (isStale()) return
          const stt = normalizeLanguageForSTT(getInitialSettings().language)
          const keyterms = await getVoiceKeyterms()
          if (isStale()) return
          // One-shot replay connection: resolves when the replayed audio has
          // been finalized, or on any connection error/failure.
          await new Promise<void>(resolve => {
            void connectVoiceStream(
              {
                onTranscript: (t, isFinal) => {
                  if (isStale()) return
                  if (isFinal && t.trim()) {
                    if (accumulatedRef.current) accumulatedRef.current += ' '
                    accumulatedRef.current += t.trim()
                  }
                },
                onError: () => resolve(),
                // no-op: resolution is driven by finalize()/onError above.
                onClose: () => {},
                onReady: conn => {
                  if (isStale()) {
                    conn.close()
                    resolve()
                    return
                  }
                  connectionRef.current = conn
                  // Batch the replayed chunks into ≤32KB sends.
                  const SLICE = 32_000
                  let slice: Buffer[] = []
                  let bytes = 0
                  for (const c of replayBuffer) {
                    if (bytes > 0 && bytes + c.length > SLICE) {
                      conn.send(Buffer.concat(slice))
                      slice = []
                      bytes = 0
                    }
                    slice.push(c)
                    bytes += c.length
                  }
                  if (slice.length) conn.send(Buffer.concat(slice))
                  void conn.finalize().then(() => {
                    conn.close()
                    resolve()
                  })
                },
              },
              { language: stt.code, keyterms },
            ).then(
              c => {
                if (!c) resolve()
              },
              () => resolve(),
            )
          })
          if (isStale()) return
        }
        fullAudioRef.current = []

        const text = accumulatedRef.current.trim()
        logForDebugging(
          `[voice] Final transcript assembled (${String(text.length)} chars): "${text.slice(0, 200)}"`,
        )

        // Tracks silent-drop rate: transcriptChars=0 + hadAudioSignal=true
        // + recordingDurationMs>2000 = the bug backend PR #287008 fixes.
        // focusFlushedCharsRef makes transcriptChars accurate for focus mode
        // (where each final is injected immediately and accumulatedRef reset).
        //
        // NOTE: this fires only on the finishRecording() path. The onError
        // fallthrough and !conn (no-OAuth) paths bypass this → don't compute
        // COUNT(completed)/COUNT(started) as a success rate; the silent-drop
        // denominator (completed events only) is internally consistent.
        logEvent('tengu_voice_recording_completed', {
          transcriptChars: text.length + focusFlushedChars,
          recordingDurationMs,
          hadAudioSignal,
          retried,
          silentDropRetried: silentDropRetriedRef.current,
          wsConnected,
          focusTriggered,
        })

        if (connectionRef.current) {
          connectionRef.current.close()
          connectionRef.current = null
        }

        if (text) {
          logForDebugging(
            `[voice] Injecting transcript (${String(text.length)} chars)`,
          )
          onTranscriptRef.current(text)
        } else if (focusFlushedChars === 0 && recordingDurationMs > 2000) {
          // Only warn about empty transcript if nothing was flushed in focus
          // mode either, and recording was > 2s (short recordings = accidental
          // taps → silently return to idle).
          if (!wsConnected) {
            // WS never connected → audio never reached backend. Not a silent
            // drop; a connection failure (slow OAuth refresh, network, etc).
            onErrorRef.current?.(
              'Voice connection failed. Check your network and try again.',
            )
          } else if (!hadAudioSignal) {
            // Distinguish silent mic (capture issue) from speech not recognized.
            onErrorRef.current?.(
              'No audio detected from microphone. Check that the correct input device is selected and that Claude Code has microphone access.',
            )
          } else {
            onErrorRef.current?.('No speech detected.')
          }
        }

        accumulatedRef.current = ''
        setVoiceState(prev => {
          if (prev.voiceInterimTranscript === '') return prev
          return { ...prev, voiceInterimTranscript: '' }
        })
        updateState('idle')
      })
      .catch(err => {
        logError(toError(err))
        if (!isStale()) updateState('idle')
      })
  }
 523  
 524    // When voice is enabled, lazy-import voice.ts so checkRecordingAvailability
 525    // et al. are ready when the user presses the voice key. Do NOT preload the
 526    // native module — require('audio-capture.node') is a synchronous dlopen of
 527    // CoreAudio/AudioUnit that blocks the event loop for ~1s (warm) to ~8s
 528    // (cold coreaudiod). setImmediate doesn't help: it yields one tick, then the
 529    // dlopen still blocks. The first voice keypress pays the dlopen cost instead.
 530    useEffect(() => {
 531      if (enabled && !voiceModule) {
 532        void import('../services/voice.js').then(mod => {
 533          voiceModule = mod
 534        })
 535      }
 536    }, [enabled])
 537  
 538    // ── Focus silence timer ────────────────────────────────────────────
 539    // Arms (or resets) a timer that tears down the focus-mode session
 540    // after FOCUS_SILENCE_TIMEOUT_MS of no speech. Called when a session
 541    // starts and after each flushed transcript.
 542    function armFocusSilenceTimer(): void {
 543      if (focusSilenceTimerRef.current) {
 544        clearTimeout(focusSilenceTimerRef.current)
 545      }
 546      focusSilenceTimerRef.current = setTimeout(
 547        (
 548          focusSilenceTimerRef,
 549          stateRef,
 550          focusTriggeredRef,
 551          silenceTimedOutRef,
 552          finishRecording,
 553        ) => {
 554          focusSilenceTimerRef.current = null
 555          if (stateRef.current === 'recording' && focusTriggeredRef.current) {
 556            logForDebugging(
 557              '[voice] Focus silence timeout — tearing down session',
 558            )
 559            silenceTimedOutRef.current = true
 560            finishRecording()
 561          }
 562        },
 563        FOCUS_SILENCE_TIMEOUT_MS,
 564        focusSilenceTimerRef,
 565        stateRef,
 566        focusTriggeredRef,
 567        silenceTimedOutRef,
 568        finishRecording,
 569      )
 570    }
 571  
  // ── Focus-driven recording ──────────────────────────────────────────
  // In focus mode, start recording when the terminal gains focus and
  // stop when it loses focus. This enables a "multi-clauding army"
  // workflow where voice input follows window focus.
  useEffect(() => {
    if (!enabled || !focusMode) {
      // Focus mode was disabled while a focus-driven recording was active —
      // stop the recording so it doesn't linger until the silence timer fires.
      if (focusTriggeredRef.current && stateRef.current === 'recording') {
        logForDebugging(
          '[voice] Focus mode disabled during recording, finishing',
        )
        finishRecording()
      }
      return
    }
    // Set by the effect cleanup; guards the async module-load path below
    // against starting a session after the effect re-ran or deps changed.
    let cancelled = false
    if (
      isFocused &&
      stateRef.current === 'idle' &&
      !silenceTimedOutRef.current
    ) {
      const beginFocusRecording = (): void => {
        // Re-check conditions — state or enabled/focusMode may have changed
        // during the await (effect cleanup sets cancelled).
        if (
          cancelled ||
          stateRef.current !== 'idle' ||
          silenceTimedOutRef.current
        )
          return
        logForDebugging('[voice] Focus gained, starting recording session')
        focusTriggeredRef.current = true
        void startRecordingSession()
        armFocusSilenceTimer()
      }
      if (voiceModule) {
        beginFocusRecording()
      } else {
        // Voice module is loading (async import resolves from cache as a
        // microtask). Wait for it before starting the recording session.
        void import('../services/voice.js').then(mod => {
          voiceModule = mod
          beginFocusRecording()
        })
      }
    } else if (!isFocused) {
      // Clear the silence timeout flag on blur so the next focus
      // cycle re-arms recording.
      silenceTimedOutRef.current = false
      if (stateRef.current === 'recording') {
        logForDebugging('[voice] Focus lost, finishing recording')
        finishRecording()
      }
    }
    return () => {
      cancelled = true
    }
  }, [enabled, focusMode, isFocused])
 631  
 632    // ── Start a new recording session (voice_stream connect + audio) ──
 633    async function startRecordingSession(): Promise<void> {
 634      if (!voiceModule) {
 635        onErrorRef.current?.(
 636          'Voice module not loaded yet. Try again in a moment.',
 637        )
 638        return
 639      }
 640  
 641      // Transition to 'recording' synchronously, BEFORE any await. Callers
 642      // read state synchronously right after `void startRecordingSession()`:
 643      // - useVoiceIntegration.tsx space-hold guard reads voiceState from the
 644      //   store immediately — if it sees 'idle' it clears isSpaceHoldActiveRef
 645      //   and space auto-repeat leaks into the text input (100% repro)
 646      // - handleKeyEvent's `currentState === 'idle'` re-entry check below
 647      // If an await runs first, both see stale 'idle'. See PR #20873 review.
 648      updateState('recording')
 649      recordingStartRef.current = Date.now()
 650      accumulatedRef.current = ''
 651      seenRepeatRef.current = false
 652      hasAudioSignalRef.current = false
 653      retryUsedRef.current = false
 654      silentDropRetriedRef.current = false
 655      fullAudioRef.current = []
 656      focusFlushedCharsRef.current = 0
 657      everConnectedRef.current = false
 658      const myGen = ++sessionGenRef.current
 659  
 660      // ── Pre-check: can we actually record audio? ──────────────
 661      const availability = await voiceModule.checkRecordingAvailability()
 662      if (!availability.available) {
 663        logForDebugging(
 664          `[voice] Recording not available: ${availability.reason ?? 'unknown'}`,
 665        )
 666        onErrorRef.current?.(
 667          availability.reason ?? 'Audio recording is not available.',
 668        )
 669        cleanup()
 670        updateState('idle')
 671        return
 672      }
 673  
 674      logForDebugging(
 675        '[voice] Starting recording session, connecting voice stream',
 676      )
 677      // Clear any previous error
 678      setVoiceState(prev => {
 679        if (!prev.voiceError) return prev
 680        return { ...prev, voiceError: null }
 681      })
 682  
 683      // Buffer audio chunks while the WebSocket connects. Once the connection
 684      // is ready (onReady fires), buffered chunks are flushed and subsequent
 685      // chunks are sent directly.
 686      const audioBuffer: Buffer[] = []
 687  
 688      // Start recording IMMEDIATELY — audio is buffered until the WebSocket
 689      // opens, eliminating the 1-2s latency from waiting for OAuth + WS connect.
 690      logForDebugging(
 691        '[voice] startRecording: buffering audio while WebSocket connects',
 692      )
 693      audioLevelsRef.current = []
 694      const started = await voiceModule.startRecording(
 695        (chunk: Buffer) => {
 696          // Copy for fullAudioRef replay buffer. send() in voiceStreamSTT
 697          // copies again defensively — acceptable overhead at audio rates.
 698          // Skip buffering in focus mode — replay is gated on !focusTriggered
 699          // so the buffer is dead weight (up to ~20MB for a 10min session).
 700          const owned = Buffer.from(chunk)
 701          if (!focusTriggeredRef.current) {
 702            fullAudioRef.current.push(owned)
 703          }
 704          if (connectionRef.current) {
 705            connectionRef.current.send(owned)
 706          } else {
 707            audioBuffer.push(owned)
 708          }
 709          // Update audio level histogram for the recording visualizer
 710          const level = computeLevel(chunk)
 711          if (!hasAudioSignalRef.current && level > 0.01) {
 712            hasAudioSignalRef.current = true
 713          }
 714          const levels = audioLevelsRef.current
 715          if (levels.length >= AUDIO_LEVEL_BARS) {
 716            levels.shift()
 717          }
 718          levels.push(level)
 719          // Copy the array so React sees a new reference
 720          const snapshot = [...levels]
 721          audioLevelsRef.current = snapshot
 722          setVoiceState(prev => ({ ...prev, voiceAudioLevels: snapshot }))
 723        },
 724        () => {
 725          // External end (e.g. device error) - treat as stop
 726          if (stateRef.current === 'recording') {
 727            finishRecording()
 728          }
 729        },
 730        { silenceDetection: false },
 731      )
 732  
 733      if (!started) {
 734        logError(new Error('[voice] Recording failed — no audio tool found'))
 735        onErrorRef.current?.(
 736          'Failed to start audio capture. Check that your microphone is accessible.',
 737        )
 738        cleanup()
 739        updateState('idle')
 740        setVoiceState(prev => ({
 741          ...prev,
 742          voiceError: 'Recording failed — no audio tool found',
 743        }))
 744        return
 745      }
 746  
 747      const rawLanguage = getInitialSettings().language
 748      const stt = normalizeLanguageForSTT(rawLanguage)
 749      logEvent('tengu_voice_recording_started', {
 750        focusTriggered: focusTriggeredRef.current,
 751        sttLanguage:
 752          stt.code as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
 753        sttLanguageIsDefault: !rawLanguage?.trim(),
 754        sttLanguageFellBack: stt.fellBackFrom !== undefined,
 755        // ISO 639 subtag from Intl (bounded set, never user text). undefined if
 756        // Intl failed — omitted from the payload, no retry cost (cached).
 757        systemLocaleLanguage:
 758          getSystemLocaleLanguage() as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
 759      })
 760  
 761      // Retry once if the connection errors before delivering any transcript.
 762      // The conversation-engine proxy can reject rapid reconnects (~1/N_pods
 763      // same-pod collision) or CE's Deepgram upstream can fail during its own
 764      // teardown window (anthropics/anthropic#287008 surfaces this as
 765      // TranscriptError instead of silent-drop). A 250ms backoff clears both.
 766      // Audio captured during the retry window routes to audioBuffer (via the
 767      // connectionRef.current null check in the recording callback above) and
 768      // is flushed by the second onReady.
 769      let sawTranscript = false
 770  
 771      // Connect WebSocket in parallel with audio recording.
 772      // Gather keyterms first (async but fast — no model calls), then connect.
 773      // Bail from callbacks if a newer session has started. Prevents a
 774      // slow-connecting zombie WS (e.g. user released, pressed again, first
 775      // WS still handshaking) from firing onReady/onError into the new
 776      // session and corrupting its connectionRef / triggering a bogus retry.
 777      const isStale = () => sessionGenRef.current !== myGen
 778  
    // One WebSocket connection attempt. Invoked with the keyterm list once
    // getVoiceKeyterms() resolves; may be re-invoked exactly once by the
    // early-failure retry path inside onError below.
    const attemptConnect = (keyterms: string[]): void => {
      // Snapshot the attempt generation — callbacks compare against this to
      // detect that a newer attempt (retry) has superseded this connection.
      const myAttemptGen = attemptGenRef.current
      void connectVoiceStream(
        {
          // Streaming transcript callback: interim results drive the live
          // preview; final results are flushed (focus mode) or accumulated
          // (hold-to-talk).
          onTranscript: (text: string, isFinal: boolean) => {
            if (isStale()) return
            sawTranscript = true
            logForDebugging(
              `[voice] onTranscript: isFinal=${String(isFinal)} text="${text}"`,
            )
            if (isFinal && text.trim()) {
              if (focusTriggeredRef.current) {
                // Focus mode: flush each final transcript immediately and
                // keep recording. This gives continuous transcription while
                // the terminal is focused.
                logForDebugging(
                  `[voice] Focus mode: flushing final transcript immediately: "${text.trim()}"`,
                )
                onTranscriptRef.current(text.trim())
                focusFlushedCharsRef.current += text.trim().length
                setVoiceState(prev => {
                  if (prev.voiceInterimTranscript === '') return prev
                  return { ...prev, voiceInterimTranscript: '' }
                })
                accumulatedRef.current = ''
                // User is actively speaking — reset the silence timer.
                armFocusSilenceTimer()
              } else {
                // Hold-to-talk: accumulate final transcripts separated by spaces
                if (accumulatedRef.current) {
                  accumulatedRef.current += ' '
                }
                accumulatedRef.current += text.trim()
                logForDebugging(
                  `[voice] Accumulated final transcript: "${accumulatedRef.current}"`,
                )
                // Show the accumulated finals as the preview — the stale
                // interim is superseded by the final transcript.
                setVoiceState(prev => {
                  const preview = accumulatedRef.current
                  if (prev.voiceInterimTranscript === preview) return prev
                  return { ...prev, voiceInterimTranscript: preview }
                })
              }
            } else if (!isFinal) {
              // Active interim speech resets the focus silence timer.
              // Nova 3 disables auto-finalize so isFinal is never true
              // mid-stream — without this, the 5s timer fires during
              // active speech and tears down the session.
              if (focusTriggeredRef.current) {
                armFocusSilenceTimer()
              }
              // Show accumulated finals + current interim as live preview
              const interim = text.trim()
              const preview = accumulatedRef.current
                ? accumulatedRef.current + (interim ? ' ' + interim : '')
                : interim
              setVoiceState(prev => {
                if (prev.voiceInterimTranscript === preview) return prev
                return { ...prev, voiceInterimTranscript: preview }
              })
            }
          },
          onError: (error: string, opts?: { fatal?: boolean }) => {
            if (isStale()) {
              logForDebugging(
                `[voice] ignoring onError from stale session: ${error}`,
              )
              return
            }
            // Swallow errors from superseded attempts. Covers conn 1's
            // trailing close after retry is scheduled, AND the current
            // conn's ws close event after its ws error already surfaced
            // below (gen bumped at surface).
            if (attemptGenRef.current !== myAttemptGen) {
              logForDebugging(
                `[voice] ignoring stale onError from superseded attempt: ${error}`,
              )
              return
            }
            // Early-failure retry: server error before any transcript =
            // likely a transient upstream race (CE rejection, Deepgram
            // not ready). Clear connectionRef so audio re-buffers, back
            // off, reconnect. Skip if the user has already released the
            // key (state left 'recording') — no point retrying a session
            // they've ended. Fatal errors (Cloudflare bot challenge, auth
            // rejection) are the same failure on every retry attempt, so
            // fall through to surface the message.
            if (
              !opts?.fatal &&
              !sawTranscript &&
              stateRef.current === 'recording'
            ) {
              if (!retryUsedRef.current) {
                retryUsedRef.current = true
                logForDebugging(
                  `[voice] early voice_stream error (pre-transcript), retrying once: ${error}`,
                )
                logEvent('tengu_voice_stream_early_retry', {})
                connectionRef.current = null
                attemptGenRef.current++
                // 250ms backoff, then reconnect only if the user is still
                // holding the key (session still 'recording').
                setTimeout(
                  (stateRef, attemptConnect, keyterms) => {
                    if (stateRef.current === 'recording') {
                      attemptConnect(keyterms)
                    }
                  },
                  250,
                  stateRef,
                  attemptConnect,
                  keyterms,
                )
                return
              }
            }
            // Surfacing — bump gen so this conn's trailing close-error
            // (ws fires error then close 1006) is swallowed above.
            attemptGenRef.current++
            logError(new Error(`[voice] voice_stream error: ${error}`))
            onErrorRef.current?.(`Voice stream error: ${error}`)
            // Clear the audio buffer on error to avoid memory leaks
            audioBuffer.length = 0
            focusTriggeredRef.current = false
            cleanup()
            updateState('idle')
          },
          onClose: () => {
            // no-op; lifecycle handled by cleanup()
          },
          onReady: conn => {
            // Only proceed if we're still in recording state AND this is
            // still the current session. A zombie late-connecting WS from
            // an abandoned session can pass the 'recording' check if the
            // user has since started a new session.
            if (isStale() || stateRef.current !== 'recording') {
              conn.close()
              return
            }

            // The WebSocket is now truly open — assign connectionRef so
            // subsequent audio callbacks send directly instead of buffering.
            connectionRef.current = conn
            everConnectedRef.current = true

            // Flush all audio chunks that were buffered while the WebSocket
            // was connecting.  This is safe because onReady fires from the
            // WebSocket 'open' event, guaranteeing send() will not be dropped.
            //
            // Coalesce into ~1s slices rather than one ws.send per chunk
            // — fewer WS frames means less overhead on both ends.
            const SLICE_TARGET_BYTES = 32_000 // ~1s at 16kHz/16-bit/mono
            if (audioBuffer.length > 0) {
              let totalBytes = 0
              for (const c of audioBuffer) totalBytes += c.length
              // Greedy partition: start a new slice whenever adding the next
              // chunk would exceed the target (never splits a chunk).
              const slices: Buffer[][] = [[]]
              let sliceBytes = 0
              for (const chunk of audioBuffer) {
                if (
                  sliceBytes > 0 &&
                  sliceBytes + chunk.length > SLICE_TARGET_BYTES
                ) {
                  slices.push([])
                  sliceBytes = 0
                }
                slices[slices.length - 1]!.push(chunk)
                sliceBytes += chunk.length
              }
              logForDebugging(
                `[voice] onReady: flushing ${String(audioBuffer.length)} buffered chunks (${String(totalBytes)} bytes) as ${String(slices.length)} coalesced frame(s)`,
              )
              for (const slice of slices) {
                conn.send(Buffer.concat(slice))
              }
            }
            audioBuffer.length = 0

            // Reset the release timer now that the WebSocket is ready.
            // Only arm it if auto-repeat has been seen — otherwise the OS
            // key repeat delay (~500ms) hasn't elapsed yet and the timer
            // would fire prematurely.
            if (releaseTimerRef.current) {
              clearTimeout(releaseTimerRef.current)
            }
            if (seenRepeatRef.current) {
              releaseTimerRef.current = setTimeout(
                (releaseTimerRef, stateRef, finishRecording) => {
                  releaseTimerRef.current = null
                  if (stateRef.current === 'recording') {
                    finishRecording()
                  }
                },
                RELEASE_TIMEOUT_MS,
                releaseTimerRef,
                stateRef,
                finishRecording,
              )
            }
          },
        },
        {
          language: stt.code,
          keyterms,
        },
      ).then(conn => {
        if (isStale()) {
          conn?.close()
          return
        }
        if (!conn) {
          logForDebugging(
            '[voice] Failed to connect to voice_stream (no OAuth token?)',
          )
          onErrorRef.current?.(
            'Voice mode requires a Claude.ai account. Please run /login to sign in.',
          )
          // Clear the audio buffer on failure
          audioBuffer.length = 0
          // NOTE(review): unlike the onError path above, focusTriggeredRef is
          // not reset here — confirm focus mode recovers after a failed connect.
          cleanup()
          updateState('idle')
          return
        }

        // Safety check: if the user released the key before connectVoiceStream
        // resolved (but after onReady already ran), close the connection.
        if (stateRef.current !== 'recording') {
          audioBuffer.length = 0
          conn.close()
          return
        }
      })
    }
1009  
1010      void getVoiceKeyterms().then(attemptConnect)
1011    }
1012  
1013    // ── Hold-to-talk handler ────────────────────────────────────────────
1014    // Called on every keypress (including terminal auto-repeats while
1015    // the key is held).  A gap longer than RELEASE_TIMEOUT_MS between
1016    // events is interpreted as key release.
1017    //
1018    // Recording starts immediately on the first keypress to eliminate
1019    // startup delay.  The release timer is only armed after auto-repeat
1020    // is detected (to avoid false releases during the OS key repeat
1021    // delay of ~500ms on macOS).
1022    const handleKeyEvent = useCallback(
1023      (fallbackMs = REPEAT_FALLBACK_MS): void => {
1024        if (!enabled || !isVoiceStreamAvailable()) {
1025          return
1026        }
1027  
1028        // In focus mode, recording is driven by terminal focus, not keypresses.
1029        if (focusTriggeredRef.current) {
1030          // Active focus recording — ignore key events (session ends on blur).
1031          return
1032        }
1033        if (focusMode && silenceTimedOutRef.current) {
1034          // Focus session timed out due to silence — keypress re-arms it.
1035          logForDebugging(
1036            '[voice] Re-arming focus recording after silence timeout',
1037          )
1038          silenceTimedOutRef.current = false
1039          focusTriggeredRef.current = true
1040          void startRecordingSession()
1041          armFocusSilenceTimer()
1042          return
1043        }
1044  
1045        const currentState = stateRef.current
1046  
1047        // Ignore keypresses while processing
1048        if (currentState === 'processing') {
1049          return
1050        }
1051  
1052        if (currentState === 'idle') {
1053          logForDebugging(
1054            '[voice] handleKeyEvent: idle, starting recording session immediately',
1055          )
1056          void startRecordingSession()
1057          // Fallback: if no auto-repeat arrives within REPEAT_FALLBACK_MS,
1058          // arm the release timer anyway (the user likely tapped and released).
1059          repeatFallbackTimerRef.current = setTimeout(
1060            (
1061              repeatFallbackTimerRef,
1062              stateRef,
1063              seenRepeatRef,
1064              releaseTimerRef,
1065              finishRecording,
1066            ) => {
1067              repeatFallbackTimerRef.current = null
1068              if (stateRef.current === 'recording' && !seenRepeatRef.current) {
1069                logForDebugging(
1070                  '[voice] No auto-repeat seen, arming release timer via fallback',
1071                )
1072                seenRepeatRef.current = true
1073                releaseTimerRef.current = setTimeout(
1074                  (releaseTimerRef, stateRef, finishRecording) => {
1075                    releaseTimerRef.current = null
1076                    if (stateRef.current === 'recording') {
1077                      finishRecording()
1078                    }
1079                  },
1080                  RELEASE_TIMEOUT_MS,
1081                  releaseTimerRef,
1082                  stateRef,
1083                  finishRecording,
1084                )
1085              }
1086            },
1087            fallbackMs,
1088            repeatFallbackTimerRef,
1089            stateRef,
1090            seenRepeatRef,
1091            releaseTimerRef,
1092            finishRecording,
1093          )
1094        } else if (currentState === 'recording') {
1095          // Second+ keypress while recording — auto-repeat has started.
1096          seenRepeatRef.current = true
1097          if (repeatFallbackTimerRef.current) {
1098            clearTimeout(repeatFallbackTimerRef.current)
1099            repeatFallbackTimerRef.current = null
1100          }
1101        }
1102  
1103        // Reset the release timer on every keypress (including auto-repeats)
1104        if (releaseTimerRef.current) {
1105          clearTimeout(releaseTimerRef.current)
1106        }
1107  
1108        // Only arm the release timer once auto-repeat has been seen.
1109        // The OS key repeat delay is ~500ms on macOS; without this gate
1110        // the 200ms timer fires before repeat starts, causing a false release.
1111        if (stateRef.current === 'recording' && seenRepeatRef.current) {
1112          releaseTimerRef.current = setTimeout(
1113            (releaseTimerRef, stateRef, finishRecording) => {
1114              releaseTimerRef.current = null
1115              if (stateRef.current === 'recording') {
1116                finishRecording()
1117              }
1118            },
1119            RELEASE_TIMEOUT_MS,
1120            releaseTimerRef,
1121            stateRef,
1122            finishRecording,
1123          )
1124        }
1125      },
1126      [enabled, focusMode, cleanup],
1127    )
1128  
1129    // Cleanup only when disabled or unmounted - NOT on state changes
1130    useEffect(() => {
1131      if (!enabled && stateRef.current !== 'idle') {
1132        cleanup()
1133        updateState('idle')
1134      }
1135      return () => {
1136        cleanup()
1137      }
1138    }, [enabled, cleanup])
1139  
1140    return {
1141      state,
1142      handleKeyEvent,
1143    }
1144  }