// utils/gracefulShutdown.ts
  1  import chalk from 'chalk'
  2  import { writeSync } from 'fs'
  3  import memoize from 'lodash-es/memoize.js'
  4  import { onExit } from 'signal-exit'
  5  import type { ExitReason } from 'src/entrypoints/agentSdkTypes.js'
  6  import {
  7    getIsInteractive,
  8    getIsScrollDraining,
  9    getLastMainRequestId,
 10    getSessionId,
 11    isSessionPersistenceDisabled,
 12  } from '../bootstrap/state.js'
 13  import instances from '../ink/instances.js'
 14  import {
 15    DISABLE_KITTY_KEYBOARD,
 16    DISABLE_MODIFY_OTHER_KEYS,
 17  } from '../ink/termio/csi.js'
 18  import {
 19    DBP,
 20    DFE,
 21    DISABLE_MOUSE_TRACKING,
 22    EXIT_ALT_SCREEN,
 23    SHOW_CURSOR,
 24  } from '../ink/termio/dec.js'
 25  import {
 26    CLEAR_ITERM2_PROGRESS,
 27    CLEAR_TAB_STATUS,
 28    CLEAR_TERMINAL_TITLE,
 29    supportsTabStatus,
 30    wrapForMultiplexer,
 31  } from '../ink/termio/osc.js'
 32  import { shutdownDatadog } from '../services/analytics/datadog.js'
 33  import { shutdown1PEventLogging } from '../services/analytics/firstPartyEventLogger.js'
 34  import {
 35    type AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
 36    logEvent,
 37  } from '../services/analytics/index.js'
 38  import type { AppState } from '../state/AppState.js'
 39  import { runCleanupFunctions } from './cleanupRegistry.js'
 40  import { logForDebugging } from './debug.js'
 41  import { logForDiagnosticsNoPII } from './diagLogs.js'
 42  import { isEnvTruthy } from './envUtils.js'
 43  import { getCurrentSessionTitle, sessionIdExists } from './sessionStorage.js'
 44  import { sleep } from './sleep.js'
 45  import { profileReport } from './startupProfiler.js'
 46  
 47  /**
 48   * Clean up terminal modes synchronously before process exit.
 49   * This ensures terminal escape sequences (Kitty keyboard, focus reporting, etc.)
 50   * are properly disabled even if React's componentWillUnmount doesn't run in time.
 51   * Uses writeSync to ensure writes complete before exit.
 52   *
 53   * We unconditionally send all disable sequences because:
 54   * 1. Terminal detection may not always work correctly (e.g., in tmux, screen)
 55   * 2. These sequences are no-ops on terminals that don't support them
 56   * 3. Failing to disable leaves the terminal in a broken state
 57   */
 58  /* eslint-disable custom-rules/no-sync-fs -- must be sync to flush before process.exit */
 59  function cleanupTerminalModes(): void {
 60    if (!process.stdout.isTTY) {
 61      return
 62    }
 63  
 64    try {
 65      // Disable mouse tracking FIRST, before the React unmount tree-walk.
 66      // The terminal needs a round-trip to process this and stop sending
 67      // events; doing it now (not after unmount) gives that time while
 68      // we're busy unmounting. Otherwise events arrive during cooked-mode
 69      // cleanup and either echo to the screen or leak to the shell.
 70      writeSync(1, DISABLE_MOUSE_TRACKING)
 71      // Exit alt screen FIRST so printResumeHint() (and all sequences below)
 72      // land on the main buffer.
 73      //
 74      // Unmount Ink directly rather than writing EXIT_ALT_SCREEN ourselves.
 75      // Ink registered its unmount with signal-exit, so it will otherwise run
 76      // AGAIN inside forceExit() → process.exit(). Two problems with letting
 77      // that happen:
 78      //   1. If we write 1049l here and unmount writes it again later, the
 79      //      second one triggers another DECRC — the cursor jumps back over
 80      //      the resume hint and the shell prompt lands on the wrong line.
 81      //   2. unmount()'s onRender() must run with altScreenActive=true (alt-
 82      //      screen cursor math) AND on the alt buffer. Exiting alt-screen
 83      //      here first makes onRender() scribble a REPL frame onto main.
 84      // Calling unmount() now does the final render on the alt buffer,
 85      // unsubscribes from signal-exit, and writes 1049l exactly once.
 86      const inst = instances.get(process.stdout)
 87      if (inst?.isAltScreenActive) {
 88        try {
 89          inst.unmount()
 90        } catch {
 91          // Reconciler/render threw — fall back to manual alt-screen exit
 92          // so printResumeHint still hits the main buffer.
 93          writeSync(1, EXIT_ALT_SCREEN)
 94        }
 95      }
 96      // Catches events that arrived during the unmount tree-walk.
 97      // detachForShutdown() below also drains.
 98      inst?.drainStdin()
 99      // Mark the Ink instance unmounted so signal-exit's deferred ink.unmount()
100      // early-returns instead of sending redundant EXIT_ALT_SCREEN sequences
101      // (from its writeSync cleanup block + AlternateScreen's unmount cleanup).
102      // Those redundant sequences land AFTER printResumeHint() and clobber the
103      // resume hint on tmux (and possibly other terminals) by restoring the
104      // saved cursor position. Safe to skip full unmount: this function already
105      // sends all the terminal-reset sequences, and the process is exiting.
106      inst?.detachForShutdown()
107      // Disable extended key reporting — always send both since terminals
108      // silently ignore whichever they don't implement
109      writeSync(1, DISABLE_MODIFY_OTHER_KEYS)
110      writeSync(1, DISABLE_KITTY_KEYBOARD)
111      // Disable focus events (DECSET 1004)
112      writeSync(1, DFE)
113      // Disable bracketed paste mode
114      writeSync(1, DBP)
115      // Show cursor
116      writeSync(1, SHOW_CURSOR)
117      // Clear iTerm2 progress bar - prevents lingering progress indicator
118      // that can cause bell sounds when returning to the terminal tab
119      writeSync(1, CLEAR_ITERM2_PROGRESS)
120      // Clear tab status (OSC 21337) so a stale dot doesn't linger
121      if (supportsTabStatus()) writeSync(1, wrapForMultiplexer(CLEAR_TAB_STATUS))
122      // Clear terminal title so the tab doesn't show stale session info.
123      // Respect CLAUDE_CODE_DISABLE_TERMINAL_TITLE — if the user opted out of
124      // title changes, don't clear their existing title on exit either.
125      if (!isEnvTruthy(process.env.CLAUDE_CODE_DISABLE_TERMINAL_TITLE)) {
126        if (process.platform === 'win32') {
127          process.title = ''
128        } else {
129          writeSync(1, CLEAR_TERMINAL_TITLE)
130        }
131      }
132    } catch {
133      // Terminal may already be gone (e.g., SIGHUP after terminal close).
134      // Ignore write errors since we're exiting anyway.
135    }
136  }
137  
/** Set once the resume hint has been written, so it is never printed twice. */
let resumeHintPrinted = false

/**
 * Print a hint about how to resume the session.
 *
 * Only shown when stdout is a TTY, the session is interactive, persistence is
 * enabled, and a session file exists for the current session id. The hint is
 * printed at most once per process (the failsafe timer may call this again
 * after the normal shutdown path already printed it).
 *
 * When the session has a custom title it is shown inside double quotes with
 * every character that stays special inside shell double quotes escaped, so
 * the printed command can be pasted verbatim. Write errors are ignored.
 */
function printResumeHint(): void {
  // Only print once (failsafe timer may call this again after normal shutdown)
  if (resumeHintPrinted) {
    return
  }
  // Only show with TTY, interactive sessions, and persistence
  if (
    !process.stdout.isTTY ||
    !getIsInteractive() ||
    isSessionPersistenceDisabled()
  ) {
    return
  }

  try {
    const sessionId = getSessionId()
    // Don't show resume hint if no session file exists (e.g., subcommands like `claude update`)
    if (!sessionIdExists(sessionId)) {
      return
    }
    const customTitle = getCurrentSessionTitle(sessionId)

    // Use custom title if available, otherwise fall back to session ID.
    // Inside double quotes a POSIX shell still interprets backslash, double
    // quote, dollar sign and backtick, so all four are backslash-escaped.
    // Leaving `$` or a backtick bare would let the pasted command expand a
    // variable or run a command substitution taken from the title.
    const resumeArg = customTitle
      ? `"${customTitle.replace(/[\\"$`]/g, '\\$&')}"`
      : sessionId

    writeSync(
      1,
      chalk.dim(`\nResume this session with:\nclaude --resume ${resumeArg}\n`),
    )
    resumeHintPrinted = true
  } catch {
    // Ignore write errors
  }
}
185  /* eslint-enable custom-rules/no-sync-fs */
186  
/**
 * Terminate the process immediately, coping with a terminal that is gone.
 *
 * When the terminal/PTY has been closed (e.g., SIGHUP), process.exit() can
 * throw EIO because Bun tries to flush stdout to a dead file descriptor; in
 * that case the process is killed with SIGKILL instead, which flushes nothing.
 *
 * @param exitCode - Exit status handed to process.exit().
 * @returns Never returns in production. Under NODE_ENV === 'test' a mocked
 *   process.exit may return (this function then returns too) or throw (the
 *   error is re-thrown so the test can observe it).
 */
function forceExit(exitCode: number): never {
  const runningUnderTest = (): boolean =>
    (process.env.NODE_ENV as string) === 'test'

  // We are exiting for real now, so the failsafe is no longer needed.
  const armedFailsafe = failsafeTimer
  if (armedFailsafe !== undefined) {
    failsafeTimer = undefined
    clearTimeout(armedFailsafe)
  }

  // Drain stdin as the very last step before exiting. cleanupTerminalModes()
  // sent DISABLE_MOUSE_TRACKING early, but with the terminal round-trip and
  // events already in flight, bytes can still arrive during the seconds of
  // async cleanup that ran in between. Go through the Ink instance method
  // rather than the standalone drainStdin(): when process.stdin is piped,
  // getStdinOverride() opens /dev/tty as the real input stream and only the
  // instance knows about it, whereas the standalone function defaults to
  // process.stdin and would early-return on isTTY=false.
  try {
    const ink = instances.get(process.stdout)
    ink?.drainStdin()
  } catch {
    // Terminal may be gone (SIGHUP). Ignore — we are about to exit.
  }

  try {
    process.exit(exitCode)
  } catch (exitError) {
    // process.exit() threw. Tests mock it to throw, so surface that to them.
    if (runningUnderTest()) {
      throw exitError
    }
    // In production this is most likely EIO from a dead terminal; SIGKILL
    // does not try to flush anything.
    process.kill(process.pid, 'SIGKILL')
  }

  // Only a test mock of process.exit that returns normally can get us here.
  if (runningUnderTest()) {
    // TypeScript trick: the signature promises `never`, but the mock returned.
    return undefined as never
  }
  throw new Error('unreachable')
}
233  
/**
 * Install the process-wide handlers that drive graceful shutdown and crash
 * reporting:
 * - SIGINT / SIGTERM / SIGHUP start gracefulShutdown() with exit codes
 *   0 / 143 / 129 (SIGHUP is not registered on win32).
 * - On non-Windows platforms with a TTY stdin, a 30s interval detects a
 *   revoked terminal and shuts down as if SIGHUP had been received.
 * - uncaughtException / unhandledRejection are written to the diagnostics log
 *   and reported as analytics events (error name only).
 *
 * Wrapped in lodash `memoize`, so repeated calls reuse the first invocation
 * instead of registering the handlers again.
 */
export const setupGracefulShutdown = memoize(() => {
  // Work around a Bun bug where process.removeListener(sig, fn) resets the
  // kernel sigaction for that signal even when other JS listeners remain —
  // the signal then falls back to its default action (terminate) and our
  // process.on('SIGTERM') handler never runs.
  //
  // Trigger: any short-lived signal-exit v4 subscriber (e.g. execa per child
  // process, or an Ink instance that unmounts). When its unsubscribe runs and
  // it was the last v4 subscriber, v4.unload() calls removeListener on every
  // signal in its list (SIGTERM, SIGINT, SIGHUP, …), tripping the Bun bug and
  // nuking our handlers at the kernel level.
  //
  // Fix: pin signal-exit v4 loaded by registering a no-op onExit callback that
  // is never unsubscribed. This keeps v4's internal emitter count > 0 so
  // unload() never runs and removeListener is never called. Harmless under
  // Node.js — the pin also ensures signal-exit's process.exit hook stays
  // active for Ink cleanup.
  onExit(() => {})

  process.on('SIGINT', () => {
    // In print mode, print.ts registers its own SIGINT handler that aborts
    // the in-flight query and calls gracefulShutdown(0); skip here to
    // avoid racing with it. Only check print mode — other non-interactive
    // sessions (--sdk-url, --init-only, non-TTY) don't register their own
    // SIGINT handler and need gracefulShutdown to run.
    if (process.argv.includes('-p') || process.argv.includes('--print')) {
      return
    }
    logForDiagnosticsNoPII('info', 'shutdown_signal', { signal: 'SIGINT' })
    void gracefulShutdown(0)
  })
  process.on('SIGTERM', () => {
    logForDiagnosticsNoPII('info', 'shutdown_signal', { signal: 'SIGTERM' })
    void gracefulShutdown(143) // Exit code 143 (128 + 15) for SIGTERM
  })
  if (process.platform !== 'win32') {
    process.on('SIGHUP', () => {
      logForDiagnosticsNoPII('info', 'shutdown_signal', { signal: 'SIGHUP' })
      void gracefulShutdown(129) // Exit code 129 (128 + 1) for SIGHUP
    })

    // Detect orphaned process when terminal closes without delivering SIGHUP.
    // macOS revokes TTY file descriptors instead of signaling, leaving the
    // process alive but unable to read/write. Periodically check stdin validity.
    if (process.stdin.isTTY) {
      orphanCheckInterval = setInterval(() => {
        // Skip during scroll drain — even a cheap check consumes an event
        // loop tick that scroll frames need. 30s interval → missing one is fine.
        if (getIsScrollDraining()) return
        // process.stdout.writable becomes false when the TTY is revoked
        if (!process.stdout.writable || !process.stdin.readable) {
          clearInterval(orphanCheckInterval)
          logForDiagnosticsNoPII('info', 'shutdown_signal', {
            signal: 'orphan_detected',
          })
          void gracefulShutdown(129)
        }
      }, 30_000) // Check every 30 seconds
      orphanCheckInterval.unref() // Don't keep process alive just for this check
    }
  }

  // Log uncaught exceptions for container observability and analytics.
  // Error names (e.g., "TypeError") are not sensitive - safe to log.
  //
  // JavaScript can throw any value (`throw 'oops'`, `throw undefined`), so the
  // argument is treated as unknown and normalised the same way as in the
  // unhandledRejection handler below. Reading `.message.slice` off a non-Error
  // would itself throw from inside this handler.
  process.on('uncaughtException', (error: unknown) => {
    const errorName =
      error instanceof Error
        ? error.name
        : typeof error === 'string'
          ? 'string'
          : 'unknown'
    const errorMessage = error instanceof Error ? error.message : String(error)
    logForDiagnosticsNoPII('error', 'uncaught_exception', {
      error_name: errorName,
      error_message: errorMessage.slice(0, 2000),
    })
    logEvent('tengu_uncaught_exception', {
      error_name:
        errorName as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
    })
  })

  // Log unhandled promise rejections for container observability and analytics
  process.on('unhandledRejection', reason => {
    const errorName =
      reason instanceof Error
        ? reason.name
        : typeof reason === 'string'
          ? 'string'
          : 'unknown'
    const errorInfo =
      reason instanceof Error
        ? {
            error_name: reason.name,
            error_message: reason.message.slice(0, 2000),
            error_stack: reason.stack?.slice(0, 4000),
          }
        : { error_message: String(reason).slice(0, 2000) }
    logForDiagnosticsNoPII('error', 'unhandled_rejection', errorInfo)
    logEvent('tengu_unhandled_rejection', {
      error_name:
        errorName as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
    })
  })
})
335  
/**
 * Fire-and-forget entry point for callers that cannot await: starts
 * gracefulShutdown() and returns immediately.
 *
 * The in-flight promise is kept in `pendingShutdown` (tests can retrieve it
 * via getPendingShutdownForTesting()). If the async shutdown rejects, the
 * failure is logged and the process is still brought down: terminal modes are
 * reset, the resume hint is printed, and forceExit() is called.
 *
 * @param exitCode - Exit status; also written to `process.exitCode` up front.
 * @param reason - Forwarded to gracefulShutdown().
 * @param options - Forwarded to gracefulShutdown().
 */
export function gracefulShutdownSync(
  exitCode = 0,
  reason: ExitReason = 'other',
  options?: {
    getAppState?: () => AppState
    setAppState?: (f: (prev: AppState) => AppState) => void
  },
): void {
  // Assigned here as well as in the async version so that callers can tell
  // gracefulShutdownSync ran simply by inspecting process.exitCode; it is also
  // the code used if the process later exits on its own.
  process.exitCode = exitCode

  const exitAfterFailure = (error: unknown): void => {
    logForDebugging(`Graceful shutdown failed: ${error}`, { level: 'error' })
    cleanupTerminalModes()
    printResumeHint()
    forceExit(exitCode)
  }
  // forceExit re-throws in test mode; without a second handler that throw
  // would leave exitAfterFailure as a fresh unhandled rejection.
  const ignoreRethrow = (): void => {}

  pendingShutdown = gracefulShutdown(exitCode, reason, options)
    .catch(exitAfterFailure)
    .catch(ignoreRethrow)
}
360  
// Module-level shutdown bookkeeping, shared by the functions in this file.

/** Promise stored by gracefulShutdownSync() for the shutdown it kicked off. */
let pendingShutdown: Promise<void> | undefined
/** Handle of the interval that polls for a revoked TTY, once it is started. */
let orphanCheckInterval: ReturnType<typeof setInterval> | undefined
/** Timer that forces exit if cleanup hangs; cleared again by forceExit(). */
let failsafeTimer: ReturnType<typeof setTimeout> | undefined
/** Latched by gracefulShutdown() so the sequence runs at most once. */
let shutdownInProgress = false
365  
/**
 * Report whether gracefulShutdown() has already been started.
 *
 * @returns `true` from the moment gracefulShutdown() latches the flag until
 *   resetShutdownState() clears it; `false` otherwise.
 */
export function isShuttingDown(): boolean {
  const inProgress = shutdownInProgress
  return inProgress
}
370  
/**
 * Test-only helper: return the shutdown bookkeeping to its initial values —
 * cancels an armed failsafe timer and clears the in-progress flag, the
 * resume-hint flag and the stored shutdown promise.
 */
export function resetShutdownState(): void {
  const armedFailsafe = failsafeTimer
  if (armedFailsafe !== undefined) {
    clearTimeout(armedFailsafe)
  }
  failsafeTimer = undefined
  pendingShutdown = undefined
  resumeHintPrinted = false
  shutdownInProgress = false
}
381  
/**
 * Test-only accessor for the shutdown promise stored by
 * gracefulShutdownSync(), so a test can await completion before restoring
 * its mocks.
 *
 * @returns The stored promise, or `undefined` when none has been stored (or
 *   it was cleared by resetShutdownState()).
 */
export function getPendingShutdownForTesting(): Promise<void> | undefined {
  const inFlight = pendingShutdown
  return inFlight
}
389  
/**
 * Run the full asynchronous shutdown sequence and then terminate the process.
 *
 * Steps, in order:
 * 1. Latch `shutdownInProgress`; any later call returns immediately.
 * 2. Load the SessionEnd hook helpers and arm a failsafe timer that resets
 *    the terminal and forces exit if anything below hangs.
 * 3. Reset terminal modes and print the resume hint.
 * 4. Run the registered cleanup functions (capped at 2s).
 * 5. Run SessionEnd hooks (capped at the SessionEnd hook budget).
 * 6. Report the startup profile, emit the cache-eviction hint and flush
 *    analytics (capped at 500ms).
 * 7. Write `options.finalMessage` to stderr, then forceExit().
 *
 * Failures in individual steps are swallowed so the sequence always reaches
 * forceExit().
 *
 * @param exitCode - Exit status; also written to `process.exitCode`.
 * @param reason - Passed to the SessionEnd hooks.
 * @param options - Spread into the SessionEnd hook call; `finalMessage` is
 *   additionally printed to stderr right before exit.
 * @returns Resolves immediately if a shutdown is already in progress.
 *   Otherwise the call ends in forceExit(), which only returns where
 *   process.exit is mocked (tests).
 */
export async function gracefulShutdown(
  exitCode = 0,
  reason: ExitReason = 'other',
  options?: {
    getAppState?: () => AppState
    setAppState?: (f: (prev: AppState) => AppState) => void
    /** Printed to stderr after alt-screen exit, before forceExit. */
    finalMessage?: string
  },
): Promise<void> {
  if (shutdownInProgress) {
    return
  }
  shutdownInProgress = true

  // Resolve the SessionEnd hook budget before arming the failsafe so the
  // failsafe can scale with it. Without this, a user-configured 10s hook
  // budget is silently truncated by the 5s failsafe (gh-32712 follow-up).
  //
  // The load is guarded: shutdownInProgress is already latched and the
  // failsafe is not armed yet, so letting an error escape here would reject
  // this promise and leave the process with no remaining way to shut down
  // (every later call returns early above). On failure the hooks are skipped
  // and the rest of the sequence still runs.
  const sessionEndHooks = await (async () => {
    try {
      const hooksModule = await import('./hooks.js')
      return {
        execute: hooksModule.executeSessionEndHooks,
        timeoutMs: hooksModule.getSessionEndHookTimeoutMs(),
      }
    } catch (error) {
      logForDebugging(
        `Failed to load SessionEnd hooks during shutdown: ${error}`,
        { level: 'error' },
      )
      return undefined
    }
  })()
  const sessionEndTimeoutMs = sessionEndHooks?.timeoutMs ?? 0

  // Failsafe: guarantee process exits even if cleanup hangs (e.g., MCP connections).
  // Runs cleanupTerminalModes first so a hung cleanup doesn't leave the terminal dirty.
  // Budget = max(5s, hook budget + 3.5s headroom for cleanup + analytics flush).
  failsafeTimer = setTimeout(
    code => {
      cleanupTerminalModes()
      printResumeHint()
      forceExit(code)
    },
    Math.max(5000, sessionEndTimeoutMs + 3500),
    exitCode,
  )
  failsafeTimer.unref()

  // Set the exit code that will be used when process naturally exits
  process.exitCode = exitCode

  // Exit alt screen and print the resume hint before the slow cleanup work.
  // This ensures the hint is visible even if the process is killed during
  // cleanup (e.g., SIGKILL during macOS reboot). Without this, the resume
  // hint would only appear after cleanup functions, hooks, and analytics
  // flush — which can take several seconds.
  cleanupTerminalModes()
  printResumeHint()

  // Flush session data first — this is the most critical cleanup. If the
  // terminal is dead (SIGHUP, SSH disconnect), hooks and analytics may hang
  // on I/O to a dead TTY or unreachable network, eating into the
  // failsafe budget. Session persistence must complete before anything else.
  let cleanupTimeoutId: ReturnType<typeof setTimeout> | undefined
  try {
    const cleanupPromise = (async () => {
      try {
        await runCleanupFunctions()
      } catch {
        // Silently ignore cleanup errors
      }
    })()

    await Promise.race([
      cleanupPromise,
      new Promise((_, reject) => {
        cleanupTimeoutId = setTimeout(
          rej => rej(new CleanupTimeoutError()),
          2000,
          reject,
        )
      }),
    ])
  } catch {
    // Silently handle timeout and other errors
  } finally {
    // Cancel the 2s timer on every path (clearTimeout(undefined) is a no-op).
    clearTimeout(cleanupTimeoutId)
  }

  // Execute SessionEnd hooks. Bound both the per-hook default timeout and the
  // overall execution via a single budget (CLAUDE_CODE_SESSIONEND_HOOKS_TIMEOUT_MS,
  // default 1.5s). hook.timeout in settings is respected up to this cap.
  if (sessionEndHooks) {
    const executeSessionEndHooks = sessionEndHooks.execute
    try {
      await executeSessionEndHooks(reason, {
        ...options,
        signal: AbortSignal.timeout(sessionEndTimeoutMs),
        timeoutMs: sessionEndTimeoutMs,
      })
    } catch {
      // Ignore SessionEnd hook exceptions (including AbortError on timeout)
    }
  }

  // Log startup perf before analytics shutdown flushes/cancels timers
  try {
    profileReport()
  } catch {
    // Ignore profiling errors during shutdown
  }

  // Signal to inference that this session's cache can be evicted.
  // Fires before analytics flush so the event makes it to the pipeline.
  const lastRequestId = getLastMainRequestId()
  if (lastRequestId) {
    logEvent('tengu_cache_eviction_hint', {
      scope:
        'session_end' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
      last_request_id:
        lastRequestId as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
    })
  }

  // Flush analytics — capped at 500ms. Previously unbounded: the 1P exporter
  // awaits all pending axios POSTs (10s each), eating the full failsafe budget.
  // Lost analytics on slow networks are acceptable; a hanging exit is not.
  try {
    await Promise.race([
      Promise.all([shutdown1PEventLogging(), shutdownDatadog()]),
      sleep(500),
    ])
  } catch {
    // Ignore analytics shutdown errors
  }

  if (options?.finalMessage) {
    try {
      // eslint-disable-next-line custom-rules/no-sync-fs -- must flush before forceExit
      writeSync(2, options.finalMessage + '\n')
    } catch {
      // stderr may be closed (e.g., SSH disconnect). Ignore write errors.
    }
  }

  forceExit(exitCode)
}
524  
/**
 * Rejection value used when the registered cleanup functions do not finish
 * within the cleanup time limit during graceful shutdown.
 */
class CleanupTimeoutError extends Error {
  constructor() {
    super('Cleanup timeout')
    // Without this the instance reports the generic name "Error" in logs and
    // stack traces.
    this.name = 'CleanupTimeoutError'
  }
}