/ commands / bridge-kick.ts
bridge-kick.ts
  1  import { getBridgeDebugHandle } from '../bridge/bridgeDebug.js'
  2  import type { Command } from '../commands.js'
  3  import type { LocalCommandCall } from '../types/command.js'
  4  
  5  /**
  6   * Ant-only: inject bridge failure states to manually test recovery paths.
  7   *
  8   *   /bridge-kick close 1002            — fire ws_closed with code 1002
  9   *   /bridge-kick close 1006            — fire ws_closed with code 1006
 10   *   /bridge-kick poll 404              — next poll throws 404/not_found_error
 11   *   /bridge-kick poll 404 <type>       — next poll throws 404 with error_type
 12   *   /bridge-kick poll 401              — next poll throws 401 (auth)
 13   *   /bridge-kick poll transient        — next poll throws axios-style rejection
 14   *   /bridge-kick register fail         — next register (inside doReconnect) transient-fails
 15   *   /bridge-kick register fail 3       — next 3 registers transient-fail
 16   *   /bridge-kick register fatal        — next register 403s (terminal)
 17   *   /bridge-kick reconnect-session fail — POST /bridge/reconnect fails (→ Strategy 2)
 18   *   /bridge-kick heartbeat 401         — next heartbeat 401s (JWT expired)
 19   *   /bridge-kick reconnect             — call doReconnect directly (= SIGUSR2)
 20   *   /bridge-kick status                — print current bridge state
 21   *
 22   * Workflow: connect Remote Control, run a subcommand, `tail -f debug.log`
 23   * and watch [bridge:repl] / [bridge:debug] lines for the recovery reaction.
 24   *
 25   * Composite sequences — the failure modes in the BQ data are chains, not
 26   * single events. Queue faults then fire the trigger:
 27   *
 28   *   # #22148 residual: ws_closed → register transient-blips → teardown?
 29   *   /bridge-kick register fail 2
 30   *   /bridge-kick close 1002
 31   *   → expect: doReconnect tries register, fails, returns false → teardown
 32   *     (demonstrates the retry gap that needs fixing)
 33   *
 34   *   # Dead gate: poll 404/not_found_error → does onEnvironmentLost fire?
 35   *   /bridge-kick poll 404
 36   *   → expect: tengu_bridge_repl_fatal_error (gate is dead — 147K/wk)
 37   *     after fix: tengu_bridge_repl_env_lost → doReconnect
 38   */
 39  
 40  const USAGE = `/bridge-kick <subcommand>
 41    close <code>              fire ws_closed with the given code (e.g. 1002)
 42    poll <status> [type]      next poll throws BridgeFatalError(status, type)
 43    poll transient            next poll throws axios-style rejection (5xx/net)
 44    register fail [N]         next N registers transient-fail (default 1)
 45    register fatal            next register 403s (terminal)
 46    reconnect-session fail    next POST /bridge/reconnect fails
 47    heartbeat <status>        next heartbeat throws BridgeFatalError(status)
 48    reconnect                 call reconnectEnvironmentWithSession directly
 49    status                    print bridge state`
 50  
 51  const call: LocalCommandCall = async args => {
 52    const h = getBridgeDebugHandle()
 53    if (!h) {
 54      return {
 55        type: 'text',
 56        value:
 57          'No bridge debug handle registered. Remote Control must be connected (USER_TYPE=ant).',
 58      }
 59    }
 60  
 61    const [sub, a, b] = args.trim().split(/\s+/)
 62  
 63    switch (sub) {
 64      case 'close': {
 65        const code = Number(a)
 66        if (!Number.isFinite(code)) {
 67          return { type: 'text', value: `close: need a numeric code\n${USAGE}` }
 68        }
 69        h.fireClose(code)
 70        return {
 71          type: 'text',
 72          value: `Fired transport close(${code}). Watch debug.log for [bridge:repl] recovery.`,
 73        }
 74      }
 75  
 76      case 'poll': {
 77        if (a === 'transient') {
 78          h.injectFault({
 79            method: 'pollForWork',
 80            kind: 'transient',
 81            status: 503,
 82            count: 1,
 83          })
 84          h.wakePollLoop()
 85          return {
 86            type: 'text',
 87            value:
 88              'Next poll will throw a transient (axios rejection). Poll loop woken.',
 89          }
 90        }
 91        const status = Number(a)
 92        if (!Number.isFinite(status)) {
 93          return {
 94            type: 'text',
 95            value: `poll: need 'transient' or a status code\n${USAGE}`,
 96          }
 97        }
 98        // Default to what the server ACTUALLY sends for 404 (BQ-verified),
 99        // so `/bridge-kick poll 404` reproduces the real 147K/week state.
100        const errorType =
101          b ?? (status === 404 ? 'not_found_error' : 'authentication_error')
102        h.injectFault({
103          method: 'pollForWork',
104          kind: 'fatal',
105          status,
106          errorType,
107          count: 1,
108        })
109        h.wakePollLoop()
110        return {
111          type: 'text',
112          value: `Next poll will throw BridgeFatalError(${status}, ${errorType}). Poll loop woken.`,
113        }
114      }
115  
116      case 'register': {
117        if (a === 'fatal') {
118          h.injectFault({
119            method: 'registerBridgeEnvironment',
120            kind: 'fatal',
121            status: 403,
122            errorType: 'permission_error',
123            count: 1,
124          })
125          return {
126            type: 'text',
127            value:
128              'Next registerBridgeEnvironment will 403. Trigger with close/reconnect.',
129          }
130        }
131        const n = Number(b) || 1
132        h.injectFault({
133          method: 'registerBridgeEnvironment',
134          kind: 'transient',
135          status: 503,
136          count: n,
137        })
138        return {
139          type: 'text',
140          value: `Next ${n} registerBridgeEnvironment call(s) will transient-fail. Trigger with close/reconnect.`,
141        }
142      }
143  
144      case 'reconnect-session': {
145        h.injectFault({
146          method: 'reconnectSession',
147          kind: 'fatal',
148          status: 404,
149          errorType: 'not_found_error',
150          count: 2,
151        })
152        return {
153          type: 'text',
154          value:
155            'Next 2 POST /bridge/reconnect calls will 404. doReconnect Strategy 1 falls through to Strategy 2.',
156        }
157      }
158  
159      case 'heartbeat': {
160        const status = Number(a) || 401
161        h.injectFault({
162          method: 'heartbeatWork',
163          kind: 'fatal',
164          status,
165          errorType: status === 401 ? 'authentication_error' : 'not_found_error',
166          count: 1,
167        })
168        return {
169          type: 'text',
170          value: `Next heartbeat will ${status}. Watch for onHeartbeatFatal → work-state teardown.`,
171        }
172      }
173  
174      case 'reconnect': {
175        h.forceReconnect()
176        return {
177          type: 'text',
178          value: 'Called reconnectEnvironmentWithSession(). Watch debug.log.',
179        }
180      }
181  
182      case 'status': {
183        return { type: 'text', value: h.describe() }
184      }
185  
186      default:
187        return { type: 'text', value: USAGE }
188    }
189  }
190  
/**
 * Command descriptor for `/bridge-kick`.
 *
 * The command is a local one, is only enabled when the `USER_TYPE`
 * environment variable equals `ant`, and is not offered in non-interactive
 * mode. `load` resolves to the module object carrying the `call` handler.
 */
const bridgeKick = {
  type: 'local',
  name: 'bridge-kick',
  description: 'Inject bridge failure states for manual recovery testing',
  isEnabled() {
    const userType = process.env.USER_TYPE
    return userType === 'ant'
  },
  supportsNonInteractive: false,
  async load() {
    return { call }
  },
} satisfies Command

export default bridgeKick