// bridge/bridgeDebug.ts
  1  import { logForDebugging } from '../utils/debug.js'
  2  import { BridgeFatalError } from './bridgeApi.js'
  3  import type { BridgeApiClient } from './types.js'
  4  
  5  /**
  6   * Ant-only fault injection for manually testing bridge recovery paths.
  7   *
  8   * Real failure modes this targets (BQ 2026-03-12, 7-day window):
  9   *   poll 404 not_found_error   — 147K sessions/week, dead onEnvironmentLost gate
 10   *   ws_closed 1002/1006        —  22K sessions/week, zombie poll after close
 11   *   register transient failure —  residual: network blips during doReconnect
 12   *
 13   * Usage: /bridge-kick <subcommand> from the REPL while Remote Control is
 14   * connected, then tail debug.log to watch the recovery machinery react.
 15   *
 16   * Module-level state is intentional here: one bridge per REPL process, the
 17   * /bridge-kick slash command has no other way to reach into initBridgeCore's
 18   * closures, and teardown clears the slot.
 19   */
 20  
 21  /** One-shot fault to inject on the next matching api call. */
 22  type BridgeFault = {
 23    method:
 24      | 'pollForWork'
 25      | 'registerBridgeEnvironment'
 26      | 'reconnectSession'
 27      | 'heartbeatWork'
 28    /** Fatal errors go through handleErrorStatus → BridgeFatalError. Transient
 29     *  errors surface as plain axios rejections (5xx / network). Recovery code
 30     *  distinguishes the two: fatal → teardown, transient → retry/backoff. */
 31    kind: 'fatal' | 'transient'
 32    status: number
 33    errorType?: string
 34    /** Remaining injections. Decremented on consume; removed at 0. */
 35    count: number
 36  }
 37  
 38  export type BridgeDebugHandle = {
 39    /** Invoke the transport's permanent-close handler directly. Tests the
 40     *  ws_closed → reconnectEnvironmentWithSession escalation (#22148). */
 41    fireClose: (code: number) => void
 42    /** Call reconnectEnvironmentWithSession() — same as SIGUSR2 but
 43     *  reachable from the slash command. */
 44    forceReconnect: () => void
 45    /** Queue a fault for the next N calls to the named api method. */
 46    injectFault: (fault: BridgeFault) => void
 47    /** Abort the at-capacity sleep so an injected poll fault lands
 48     *  immediately instead of up to 10min later. */
 49    wakePollLoop: () => void
 50    /** env/session IDs for the debug.log grep. */
 51    describe: () => string
 52  }
 53  
 54  let debugHandle: BridgeDebugHandle | null = null
 55  const faultQueue: BridgeFault[] = []
 56  
 57  export function registerBridgeDebugHandle(h: BridgeDebugHandle): void {
 58    debugHandle = h
 59  }
 60  
 61  export function clearBridgeDebugHandle(): void {
 62    debugHandle = null
 63    faultQueue.length = 0
 64  }
 65  
 66  export function getBridgeDebugHandle(): BridgeDebugHandle | null {
 67    return debugHandle
 68  }
 69  
 70  export function injectBridgeFault(fault: BridgeFault): void {
 71    faultQueue.push(fault)
 72    logForDebugging(
 73      `[bridge:debug] Queued fault: ${fault.method} ${fault.kind}/${fault.status}${fault.errorType ? `/${fault.errorType}` : ''} ×${fault.count}`,
 74    )
 75  }
 76  
 77  /**
 78   * Wrap a BridgeApiClient so each call first checks the fault queue. If a
 79   * matching fault is queued, throw the specified error instead of calling
 80   * through. Delegates everything else to the real client.
 81   *
 82   * Only called when USER_TYPE === 'ant' — zero overhead in external builds.
 83   */
 84  export function wrapApiForFaultInjection(
 85    api: BridgeApiClient,
 86  ): BridgeApiClient {
 87    function consume(method: BridgeFault['method']): BridgeFault | null {
 88      const idx = faultQueue.findIndex(f => f.method === method)
 89      if (idx === -1) return null
 90      const fault = faultQueue[idx]!
 91      fault.count--
 92      if (fault.count <= 0) faultQueue.splice(idx, 1)
 93      return fault
 94    }
 95  
 96    function throwFault(fault: BridgeFault, context: string): never {
 97      logForDebugging(
 98        `[bridge:debug] Injecting ${fault.kind} fault into ${context}: status=${fault.status} errorType=${fault.errorType ?? 'none'}`,
 99      )
100      if (fault.kind === 'fatal') {
101        throw new BridgeFatalError(
102          `[injected] ${context} ${fault.status}`,
103          fault.status,
104          fault.errorType,
105        )
106      }
107      // Transient: mimic an axios rejection (5xx / network). No .status on
108      // the error itself — that's how the catch blocks distinguish.
109      throw new Error(`[injected transient] ${context} ${fault.status}`)
110    }
111  
112    return {
113      ...api,
114      async pollForWork(envId, secret, signal, reclaimMs) {
115        const f = consume('pollForWork')
116        if (f) throwFault(f, 'Poll')
117        return api.pollForWork(envId, secret, signal, reclaimMs)
118      },
119      async registerBridgeEnvironment(config) {
120        const f = consume('registerBridgeEnvironment')
121        if (f) throwFault(f, 'Registration')
122        return api.registerBridgeEnvironment(config)
123      },
124      async reconnectSession(envId, sessionId) {
125        const f = consume('reconnectSession')
126        if (f) throwFault(f, 'ReconnectSession')
127        return api.reconnectSession(envId, sessionId)
128      },
129      async heartbeatWork(envId, workId, token) {
130        const f = consume('heartbeatWork')
131        if (f) throwFault(f, 'Heartbeat')
132        return api.heartbeatWork(envId, workId, token)
133      },
134    }
135  }