/ services / api / withRetry.ts
withRetry.ts
  1  import { feature } from 'bun:bundle'
  2  import type Anthropic from '@anthropic-ai/sdk'
  3  import {
  4    APIConnectionError,
  5    APIError,
  6    APIUserAbortError,
  7  } from '@anthropic-ai/sdk'
  8  import type { QuerySource } from 'src/constants/querySource.js'
  9  import type { SystemAPIErrorMessage } from 'src/types/message.js'
 10  import { isAwsCredentialsProviderError } from 'src/utils/aws.js'
 11  import { logForDebugging } from 'src/utils/debug.js'
 12  import { logError } from 'src/utils/log.js'
 13  import { createSystemAPIErrorMessage } from 'src/utils/messages.js'
 14  import { getAPIProviderForStatsig } from 'src/utils/model/providers.js'
 15  import {
 16    clearApiKeyHelperCache,
 17    clearAwsCredentialsCache,
 18    clearGcpCredentialsCache,
 19    getClaudeAIOAuthTokens,
 20    handleOAuth401Error,
 21    isClaudeAISubscriber,
 22    isEnterpriseSubscriber,
 23  } from '../../utils/auth.js'
 24  import { isEnvTruthy } from '../../utils/envUtils.js'
 25  import { errorMessage } from '../../utils/errors.js'
 26  import {
 27    type CooldownReason,
 28    handleFastModeOverageRejection,
 29    handleFastModeRejectedByAPI,
 30    isFastModeCooldown,
 31    isFastModeEnabled,
 32    triggerFastModeCooldown,
 33  } from '../../utils/fastMode.js'
 34  import { isNonCustomOpusModel } from '../../utils/model/model.js'
 35  import { disableKeepAlive } from '../../utils/proxy.js'
 36  import { sleep } from '../../utils/sleep.js'
 37  import type { ThinkingConfig } from '../../utils/thinking.js'
 38  import { getFeatureValue_CACHED_MAY_BE_STALE } from '../analytics/growthbook.js'
 39  import {
 40    type AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
 41    logEvent,
 42  } from '../analytics/index.js'
 43  import {
 44    checkMockRateLimitError,
 45    isMockRateLimitError,
 46  } from '../rateLimitMocking.js'
 47  import { REPEATED_529_ERROR_MESSAGE } from './errors.js'
 48  import { extractConnectionErrorDetails } from './errorUtils.js'
 49  
// Factory passed to `sleep` as its `abortError` option; builds the SDK's
// user-abort error. Presumably `sleep` throws this when the signal aborts
// mid-wait — confirm against utils/sleep.
const abortError = () => new APIUserAbortError()

// Retry budget used when neither options.maxRetries nor the
// CLAUDE_CODE_MAX_RETRIES environment variable provides one.
const DEFAULT_MAX_RETRIES = 10
// Minimum output-token headroom required when withRetry shrinks max_tokens
// after a context-overflow 400; with less room than this the error is rethrown.
const FLOOR_OUTPUT_TOKENS = 3000
// Number of counted 529 responses at which withRetry triggers the model
// fallback (or gives up with REPEATED_529_ERROR_MESSAGE for external users).
const MAX_529_RETRIES = 3
// Backoff for the first retry; getRetryDelay doubles it per attempt.
export const BASE_DELAY_MS = 500
 56  
// Foreground query sources where the user IS blocking on the result — these
// retry on 529. Everything else (summaries, titles, suggestions, classifiers)
// bails immediately: during a capacity cascade each retry is 3-10× gateway
// amplification, and the user never sees those fail anyway. New sources
// default to no-retry — add here only if the user is waiting on the result.
// Membership is consulted by shouldRetry529(); the "no query source" case is
// decided there, not by this set.
const FOREGROUND_529_RETRY_SOURCES = new Set<QuerySource>([
  'repl_main_thread',
  'repl_main_thread:outputStyle:custom',
  'repl_main_thread:outputStyle:Explanatory',
  'repl_main_thread:outputStyle:Learning',
  'sdk',
  'agent:custom',
  'agent:default',
  'agent:builtin',
  'compact',
  'hook_agent',
  'hook_prompt',
  'verification_agent',
  'side_question',
  // Security classifiers — must complete for auto-mode correctness.
  // yoloClassifier.ts uses 'auto_mode' (not 'yolo_classifier' — that's
  // type-only). bash_classifier is ant-only; feature-gate so the string
  // tree-shakes out of external builds (excluded-strings.txt).
  'auto_mode',
  ...(feature('BASH_CLASSIFIER') ? (['bash_classifier'] as const) : []),
])
 83  
 84  function shouldRetry529(querySource: QuerySource | undefined): boolean {
 85    // undefined → retry (conservative for untagged call paths)
 86    return (
 87      querySource === undefined || FOREGROUND_529_RETRY_SOURCES.has(querySource)
 88    )
 89  }
 90  
// CLAUDE_CODE_UNATTENDED_RETRY: for unattended sessions (ant-only). Retries 429/529
// indefinitely with higher backoff and periodic keep-alive yields so the host
// environment does not mark the session idle mid-wait.
// TODO(ANT-344): the keep-alive via SystemAPIErrorMessage yields is a stopgap
// until there's a dedicated keep-alive channel.

// Ceiling passed to getRetryDelay as maxDelayMs for persistent-mode backoff (5 minutes).
const PERSISTENT_MAX_BACKOFF_MS = 5 * 60 * 1000
// Upper bound on any single persistent-mode wait, including waits derived
// from Retry-After or the rate-limit reset header (6 hours).
const PERSISTENT_RESET_CAP_MS = 6 * 60 * 60 * 1000
// Maximum length of one sleep chunk in persistent mode; a message is yielded
// before each chunk (30 seconds).
const HEARTBEAT_INTERVAL_MS = 30_000
 99  
/**
 * Report whether persistent (unattended) retry mode is on.
 *
 * Requires both the UNATTENDED_RETRY build feature and a truthy
 * CLAUDE_CODE_UNATTENDED_RETRY environment variable.
 */
function isPersistentRetryEnabled(): boolean {
  // Keep feature() directly in the condition so the bundler can fold it.
  if (feature('UNATTENDED_RETRY')) {
    return isEnvTruthy(process.env.CLAUDE_CODE_UNATTENDED_RETRY)
  }
  return false
}
105  
/**
 * True when the error is a capacity signal: a 529/overloaded error or an
 * APIError with HTTP status 429.
 */
function isTransientCapacityError(error: unknown): boolean {
  if (is529Error(error)) {
    return true
  }
  return error instanceof APIError && error.status === 429
}
111  
/**
 * True when the error is an SDK connection error whose extracted code is
 * ECONNRESET or EPIPE.
 */
function isStaleConnectionError(error: unknown): boolean {
  if (error instanceof APIConnectionError) {
    const code = extractConnectionErrorDetails(error)?.code
    return code === 'ECONNRESET' || code === 'EPIPE'
  }
  return false
}
119  
/**
 * Mutable per-call state that withRetry passes to `operation` on every
 * attempt and attaches to CannotRetryError.
 */
export interface RetryContext {
  /**
   * Replacement max_tokens value; set by withRetry after a context-overflow
   * 400 so the next attempt fits within the context limit.
   */
  maxTokensOverride?: number
  /** Model name, initialised from RetryOptions.model. */
  model: string
  /**
   * Thinking configuration; when its type is 'enabled', its budgetTokens is
   * used to compute the minimum max_tokens after an overflow.
   */
  thinkingConfig: ThinkingConfig
  /**
   * Whether fast mode is requested. Only populated when fast mode is enabled;
   * withRetry sets it to false when falling back to standard speed.
   */
  fastMode?: boolean
}
126  
/** Configuration accepted by withRetry. */
interface RetryOptions {
  /** Retries allowed after the first attempt; defaults to getDefaultMaxRetries(). */
  maxRetries?: number
  /** Primary model; copied into the RetryContext and reported on fallback. */
  model: string
  /**
   * When set, withRetry throws FallbackTriggeredError naming this model once
   * MAX_529_RETRIES counted 529s have been seen.
   */
  fallbackModel?: string
  /** Copied into the RetryContext handed to the operation. */
  thinkingConfig: ThinkingConfig
  /** Requested fast-mode state; only honoured when fast mode is enabled. */
  fastMode?: boolean
  /** Checked before each attempt and passed to every sleep between attempts. */
  signal?: AbortSignal
  /** Tag for the calling feature; decides whether 529s are retried at all. */
  querySource?: QuerySource
  /**
   * Pre-seed the consecutive 529 counter. Used when this retry loop is a
   * non-streaming fallback after a streaming 529 — the streaming 529 should
   * count toward MAX_529_RETRIES so total 529s-before-fallback is consistent
   * regardless of which request mode hit the overload.
   */
  initialConsecutive529Errors?: number
}
143  
/**
 * Thrown by withRetry when it gives up on a request. Wraps the error that
 * ended the retry loop together with the retry context at that moment.
 *
 * The message is derived from the wrapped error, and `name` is 'RetryError'.
 */
export class CannotRetryError extends Error {
  constructor(
    public readonly originalError: unknown,
    public readonly retryContext: RetryContext,
  ) {
    super(errorMessage(originalError))
    this.name = 'RetryError'

    // Reuse the wrapped error's stack (when it has one) so traces point at
    // the real failure site rather than at this wrapper.
    const wrappedStack =
      originalError instanceof Error ? originalError.stack : undefined
    if (wrappedStack) {
      this.stack = wrappedStack
    }
  }
}
159  
/**
 * Signals that the retry loop wants the caller to switch from
 * `originalModel` to `fallbackModel`.
 */
export class FallbackTriggeredError extends Error {
  constructor(
    public readonly originalModel: string,
    public readonly fallbackModel: string,
  ) {
    const description =
      'Model fallback triggered: ' + originalModel + ' -> ' + fallbackModel
    super(description)
    this.name = 'FallbackTriggeredError'
  }
}
169  
/**
 * Run `operation` with retry, backoff, credential refresh, and fast-mode /
 * model-fallback handling.
 *
 * The generator yields a SystemAPIErrorMessage before each retry wait that
 * was caused by an APIError (in persistent mode: once per heartbeat chunk of
 * the wait) and returns the operation's result on success.
 *
 * @param getClient Produces the Anthropic client. Called before the first
 *   attempt and again after a 401, a revoked-token 403, a Bedrock/Vertex auth
 *   error, or a stale-connection error (ECONNRESET/EPIPE).
 * @param operation The request to run; receives the client, the 1-based
 *   attempt number, and the mutable retry context.
 * @param options Retry configuration (model, retry budget, abort signal, …).
 * @returns The value resolved by `operation`.
 * @throws APIUserAbortError when `options.signal` is already aborted at the
 *   start of an attempt or between persistent-mode sleep chunks.
 * @throws FallbackTriggeredError once MAX_529_RETRIES counted 529s have been
 *   seen and `options.fallbackModel` is set.
 * @throws CannotRetryError when the error is not retryable, the retry budget
 *   is exhausted, or a non-foreground query source receives a 529.
 * @throws The original APIError (unwrapped) when a context-overflow error
 *   leaves less than FLOOR_OUTPUT_TOKENS of room.
 */
export async function* withRetry<T>(
  getClient: () => Promise<Anthropic>,
  operation: (
    client: Anthropic,
    attempt: number,
    context: RetryContext,
  ) => Promise<T>,
  options: RetryOptions,
): AsyncGenerator<SystemAPIErrorMessage, T> {
  const maxRetries = getMaxRetries(options)
  // fastMode is only carried in the context when fast mode is enabled at all.
  const retryContext: RetryContext = {
    model: options.model,
    thinkingConfig: options.thinkingConfig,
    ...(isFastModeEnabled() && { fastMode: options.fastMode }),
  }
  let client: Anthropic | null = null
  // NOTE(review): this counter is only ever incremented; a non-529 error
  // between two 529s does not reset it — confirm "consecutive" is intended.
  let consecutive529Errors = options.initialConsecutive529Errors ?? 0
  let lastError: unknown
  // Separate counter for persistent mode, where `attempt` is clamped below.
  let persistentAttempt = 0
  for (let attempt = 1; attempt <= maxRetries + 1; attempt++) {
    if (options.signal?.aborted) {
      throw new APIUserAbortError()
    }

    // Capture whether fast mode is active before this attempt
    // (fallback may change the state mid-loop)
    const wasFastModeActive = isFastModeEnabled()
      ? retryContext.fastMode && !isFastModeCooldown()
      : false

    try {
      // Check for mock rate limits (used by /mock-limits command for Ant employees)
      if (process.env.USER_TYPE === 'ant') {
        const mockError = checkMockRateLimitError(
          retryContext.model,
          wasFastModeActive,
        )
        if (mockError) {
          throw mockError
        }
      }

      // Get a fresh client instance on first attempt or after authentication errors
      // - 401 for first-party API authentication failures
      // - 403 "OAuth token has been revoked" (another process refreshed the token)
      // - Bedrock-specific auth errors (403 or CredentialsProviderError)
      // - Vertex-specific auth errors (credential refresh failures, 401)
      // - ECONNRESET/EPIPE: stale keep-alive socket; disable pooling and reconnect
      const isStaleConnection = isStaleConnectionError(lastError)
      if (
        isStaleConnection &&
        getFeatureValue_CACHED_MAY_BE_STALE(
          'tengu_disable_keepalive_on_econnreset',
          false,
        )
      ) {
        logForDebugging(
          'Stale connection (ECONNRESET/EPIPE) — disabling keep-alive for retry',
        )
        disableKeepAlive()
      }

      if (
        client === null ||
        (lastError instanceof APIError && lastError.status === 401) ||
        isOAuthTokenRevokedError(lastError) ||
        isBedrockAuthError(lastError) ||
        isVertexAuthError(lastError) ||
        isStaleConnection
      ) {
        // On 401 "token expired" or 403 "token revoked", force a token refresh
        if (
          (lastError instanceof APIError && lastError.status === 401) ||
          isOAuthTokenRevokedError(lastError)
        ) {
          const failedAccessToken = getClaudeAIOAuthTokens()?.accessToken
          if (failedAccessToken) {
            await handleOAuth401Error(failedAccessToken)
          }
        }
        client = await getClient()
      }

      return await operation(client, attempt, retryContext)
    } catch (error) {
      lastError = error
      logForDebugging(
        `API error (attempt ${attempt}/${maxRetries + 1}): ${error instanceof APIError ? `${error.status} ${error.message}` : errorMessage(error)}`,
        { level: 'error' },
      )

      // Fast mode fallback: on 429/529, either wait and retry (short delays)
      // or fall back to standard speed (long delays) to avoid cache thrashing.
      // Skip in persistent mode: the short-retry path below loops with fast
      // mode still active, so its `continue` never reaches the attempt clamp
      // and the for-loop terminates. Persistent sessions want the chunked
      // keep-alive path instead of fast-mode cache-preservation anyway.
      if (
        wasFastModeActive &&
        !isPersistentRetryEnabled() &&
        error instanceof APIError &&
        (error.status === 429 || is529Error(error))
      ) {
        // If the 429 is specifically because extra usage (overage) is not
        // available, permanently disable fast mode with a specific message.
        const overageReason = error.headers?.get(
          'anthropic-ratelimit-unified-overage-disabled-reason',
        )
        if (overageReason !== null && overageReason !== undefined) {
          handleFastModeOverageRejection(overageReason)
          retryContext.fastMode = false
          continue
        }

        const retryAfterMs = getRetryAfterMs(error)
        if (retryAfterMs !== null && retryAfterMs < SHORT_RETRY_THRESHOLD_MS) {
          // Short retry-after: wait and retry with fast mode still active
          // to preserve prompt cache (same model name on retry).
          await sleep(retryAfterMs, options.signal, { abortError })
          continue
        }
        // Long or unknown retry-after: enter cooldown (switches to standard
        // speed model), with a minimum floor to avoid flip-flopping.
        const cooldownMs = Math.max(
          retryAfterMs ?? DEFAULT_FAST_MODE_FALLBACK_HOLD_MS,
          MIN_COOLDOWN_MS,
        )
        const cooldownReason: CooldownReason = is529Error(error)
          ? 'overloaded'
          : 'rate_limit'
        triggerFastModeCooldown(Date.now() + cooldownMs, cooldownReason)
        if (isFastModeEnabled()) {
          retryContext.fastMode = false
        }
        continue
      }

      // Fast mode fallback: if the API rejects the fast mode parameter
      // (e.g., org doesn't have fast mode enabled), permanently disable fast
      // mode and retry at standard speed.
      if (wasFastModeActive && isFastModeNotEnabledError(error)) {
        handleFastModeRejectedByAPI()
        retryContext.fastMode = false
        continue
      }

      // Non-foreground sources bail immediately on 529 — no retry amplification
      // during capacity cascades. User never sees these fail.
      if (is529Error(error) && !shouldRetry529(options.querySource)) {
        logEvent('tengu_api_529_background_dropped', {
          query_source:
            options.querySource as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
        })
        throw new CannotRetryError(error, retryContext)
      }

      // Track consecutive 529 errors
      // NOTE(review): FALLBACK_FOR_ALL_PRIMARY_MODELS is tested for plain
      // truthiness, so any non-empty value (even "false") enables it, unlike
      // the isEnvTruthy checks used elsewhere in this file — confirm intended.
      if (
        is529Error(error) &&
        // If FALLBACK_FOR_ALL_PRIMARY_MODELS is not set, fall through only if the primary model is a non-custom Opus model.
        // TODO: Revisit if the isNonCustomOpusModel check should still exist, or if isNonCustomOpusModel is a stale artifact of when Claude Code was hardcoded on Opus.
        (process.env.FALLBACK_FOR_ALL_PRIMARY_MODELS ||
          (!isClaudeAISubscriber() && isNonCustomOpusModel(options.model)))
      ) {
        consecutive529Errors++
        if (consecutive529Errors >= MAX_529_RETRIES) {
          // Check if fallback model is specified
          if (options.fallbackModel) {
            logEvent('tengu_api_opus_fallback_triggered', {
              original_model:
                options.model as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
              fallback_model:
                options.fallbackModel as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
              provider: getAPIProviderForStatsig(),
            })

            // Throw special error to indicate fallback was triggered
            throw new FallbackTriggeredError(
              options.model,
              options.fallbackModel,
            )
          }

          // No fallback model: external, non-sandbox, non-persistent sessions
          // stop here with a dedicated "repeated 529" message.
          if (
            process.env.USER_TYPE === 'external' &&
            !process.env.IS_SANDBOX &&
            !isPersistentRetryEnabled()
          ) {
            logEvent('tengu_api_custom_529_overloaded_error', {})
            throw new CannotRetryError(
              new Error(REPEATED_529_ERROR_MESSAGE),
              retryContext,
            )
          }
        }
      }

      // Only retry if the error indicates we should
      // (persistent mode exempts 429/529 from the retry budget)
      const persistent =
        isPersistentRetryEnabled() && isTransientCapacityError(error)
      if (attempt > maxRetries && !persistent) {
        throw new CannotRetryError(error, retryContext)
      }

      // AWS/GCP errors aren't always APIError, but can be retried
      // Both handlers clear their credential cache as a side effect when they match.
      const handledCloudAuthError =
        handleAwsCredentialError(error) || handleGcpCredentialError(error)
      if (
        !handledCloudAuthError &&
        (!(error instanceof APIError) || !shouldRetry(error))
      ) {
        throw new CannotRetryError(error, retryContext)
      }

      // Handle max tokens context overflow errors by adjusting max_tokens for the next attempt
      // NOTE: With extended-context-window beta, this 400 error should not occur.
      // The API now returns 'model_context_window_exceeded' stop_reason instead.
      // Keeping for backward compatibility.
      if (error instanceof APIError) {
        const overflowData = parseMaxTokensContextOverflowError(error)
        if (overflowData) {
          const { inputTokens, contextLimit } = overflowData

          const safetyBuffer = 1000
          const availableContext = Math.max(
            0,
            contextLimit - inputTokens - safetyBuffer,
          )
          if (availableContext < FLOOR_OUTPUT_TOKENS) {
            logError(
              new Error(
                `availableContext ${availableContext} is less than FLOOR_OUTPUT_TOKENS ${FLOOR_OUTPUT_TOKENS}`,
              ),
            )
            // NOTE(review): thrown unwrapped, unlike the other terminal paths
            // which wrap in CannotRetryError — confirm callers expect this.
            throw error
          }
          // Ensure we have enough tokens for thinking + at least 1 output token
          const minRequired =
            (retryContext.thinkingConfig.type === 'enabled'
              ? retryContext.thinkingConfig.budgetTokens
              : 0) + 1
          const adjustedMaxTokens = Math.max(
            FLOOR_OUTPUT_TOKENS,
            availableContext,
            minRequired,
          )
          retryContext.maxTokensOverride = adjustedMaxTokens

          logEvent('tengu_max_tokens_context_overflow_adjustment', {
            inputTokens,
            contextLimit,
            adjustedMaxTokens,
            attempt,
          })

          // Retry immediately with the reduced max_tokens; no backoff wait.
          continue
        }
      }

      // For other errors, proceed with normal retry logic
      // Get retry-after header if available
      const retryAfter = getRetryAfter(error)
      let delayMs: number
      if (persistent && error instanceof APIError && error.status === 429) {
        persistentAttempt++
        // Window-based limits (e.g. 5hr Max/Pro) include a reset timestamp.
        // Wait until reset rather than polling every 5 min uselessly.
        const resetDelay = getRateLimitResetDelayMs(error)
        delayMs =
          resetDelay ??
          Math.min(
            getRetryDelay(
              persistentAttempt,
              retryAfter,
              PERSISTENT_MAX_BACKOFF_MS,
            ),
            PERSISTENT_RESET_CAP_MS,
          )
      } else if (persistent) {
        persistentAttempt++
        // Retry-After is a server directive and bypasses maxDelayMs inside
        // getRetryDelay (intentional — honoring it is correct). Cap at the
        // 6hr reset-cap here so a pathological header can't wait unbounded.
        delayMs = Math.min(
          getRetryDelay(
            persistentAttempt,
            retryAfter,
            PERSISTENT_MAX_BACKOFF_MS,
          ),
          PERSISTENT_RESET_CAP_MS,
        )
      } else {
        delayMs = getRetryDelay(attempt, retryAfter)
      }

      // In persistent mode the for-loop `attempt` is clamped at maxRetries+1;
      // use persistentAttempt for telemetry/yields so they show the true count.
      const reportedAttempt = persistent ? persistentAttempt : attempt
      logEvent('tengu_api_retry', {
        attempt: reportedAttempt,
        delayMs: delayMs,
        error: (error as APIError)
          .message as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
        status: (error as APIError).status,
        provider: getAPIProviderForStatsig(),
      })

      if (persistent) {
        if (delayMs > 60_000) {
          logEvent('tengu_api_persistent_retry_wait', {
            status: (error as APIError).status,
            delayMs,
            attempt: reportedAttempt,
            provider: getAPIProviderForStatsig(),
          })
        }
        // Chunk long sleeps so the host sees periodic stdout activity and
        // does not mark the session idle. Each yield surfaces as
        // {type:'system', subtype:'api_retry'} on stdout via QueryEngine.
        let remaining = delayMs
        while (remaining > 0) {
          if (options.signal?.aborted) throw new APIUserAbortError()
          if (error instanceof APIError) {
            yield createSystemAPIErrorMessage(
              error,
              remaining,
              reportedAttempt,
              maxRetries,
            )
          }
          const chunk = Math.min(remaining, HEARTBEAT_INTERVAL_MS)
          await sleep(chunk, options.signal, { abortError })
          remaining -= chunk
        }
        // Clamp so the for-loop never terminates. Backoff uses the separate
        // persistentAttempt counter which keeps growing to the 5-min cap.
        if (attempt >= maxRetries) attempt = maxRetries
      } else {
        if (error instanceof APIError) {
          yield createSystemAPIErrorMessage(error, delayMs, attempt, maxRetries)
        }
        await sleep(delayMs, options.signal, { abortError })
      }
    }
  }

  // Reached when the loop runs out of attempts via one of the `continue` paths.
  throw new CannotRetryError(lastError, retryContext)
}
518  
/**
 * Read the `retry-after` header from an error, whatever shape its headers
 * take.
 *
 * @param error Any thrown value that may carry a `headers` member.
 * @returns The header value, or null when it is absent.
 */
function getRetryAfter(error: unknown): string | null {
  // Shape 1: headers stored as a plain object keyed by header name.
  const fromPlainObject = (
    error as { headers?: { 'retry-after'?: string } }
  ).headers?.['retry-after']
  if (fromPlainObject) {
    return fromPlainObject
  }

  // Shape 2: headers exposed through a Headers-style `get` accessor.
  // eslint-disable-next-line eslint-plugin-n/no-unsupported-features/node-builtins
  const fromAccessor = ((error as APIError).headers as Headers)?.get?.(
    'retry-after',
  )
  return fromAccessor ?? null
}
529  
/**
 * Compute how long to wait before the next retry attempt.
 *
 * A usable `Retry-After` header takes precedence over local backoff and is
 * NOT capped by `maxDelayMs`; callers that need an upper bound clamp the
 * result themselves. Both header forms defined by RFC 9110 are accepted:
 *   - delay-seconds, e.g. "120"
 *   - HTTP-date, e.g. "Wed, 21 Oct 2015 07:28:00 GMT" (a past date yields 0)
 * A negative or unparseable header is ignored and the backoff path is used.
 *
 * Backoff is BASE_DELAY_MS * 2^(attempt - 1), capped at `maxDelayMs`, plus up
 * to 25% random jitter.
 *
 * @param attempt 1-based retry attempt number.
 * @param retryAfterHeader Raw `Retry-After` header value, if any.
 * @param maxDelayMs Cap applied to the exponential backoff (before jitter).
 * @returns Delay in milliseconds; never negative.
 */
export function getRetryDelay(
  attempt: number,
  retryAfterHeader?: string | null,
  maxDelayMs = 32000,
): number {
  if (retryAfterHeader) {
    const seconds = parseInt(retryAfterHeader, 10)
    if (!Number.isNaN(seconds)) {
      // A negative delay is not a valid directive; fall through to backoff
      // instead of returning a negative wait.
      if (seconds >= 0) {
        return seconds * 1000
      }
    } else {
      // Not numeric: try the HTTP-date form.
      const retryAtMs = Date.parse(retryAfterHeader)
      if (!Number.isNaN(retryAtMs)) {
        return Math.max(0, retryAtMs - Date.now())
      }
    }
  }

  const baseDelay = Math.min(
    BASE_DELAY_MS * Math.pow(2, attempt - 1),
    maxDelayMs,
  )
  const jitter = Math.random() * 0.25 * baseDelay
  return baseDelay + jitter
}
549  
/**
 * Extract the token counts from a "context limit exceeded" 400 error.
 *
 * Expected message format:
 *   "input length and `max_tokens` exceed context limit: 188059 + 20000 > 200000"
 *
 * @param error The API error to inspect.
 * @returns The three parsed numbers, or undefined when the error is not a
 *   400, has no message, or does not match the expected format.
 */
export function parseMaxTokensContextOverflowError(error: APIError):
  | {
      inputTokens: number
      maxTokens: number
      contextLimit: number
    }
  | undefined {
  const { status, message } = error
  const looksLikeOverflow =
    status === 400 &&
    !!message &&
    message.includes('input length and `max_tokens` exceed context limit')
  if (!looksLikeOverflow) {
    return undefined
  }

  const overflowPattern =
    /input length and `max_tokens` exceed context limit: (\d+) \+ (\d+) > (\d+)/
  const captured = overflowPattern.exec(message)
  if (captured === null || captured.length !== 4) {
    return undefined
  }

  const [, rawInput, rawMax, rawLimit] = captured
  if (!rawInput || !rawMax || !rawLimit) {
    logError(
      new Error(
        'Unable to parse max_tokens from max_tokens exceed context limit error message',
      ),
    )
    return undefined
  }

  const inputTokens = parseInt(rawInput, 10)
  const maxTokens = parseInt(rawMax, 10)
  const contextLimit = parseInt(rawLimit, 10)
  const anyInvalid = [inputTokens, maxTokens, contextLimit].some(value =>
    isNaN(value),
  )
  return anyInvalid ? undefined : { inputTokens, maxTokens, contextLimit }
}
596  
// TODO: Replace with a response header check once the API adds a dedicated
// header for fast-mode rejection (e.g., x-fast-mode-rejected). String-matching
// the error message is fragile and will break if the API wording changes.
/**
 * True when the API rejected the request because fast mode is not enabled:
 * a 400 APIError whose message contains "Fast mode is not enabled".
 */
function isFastModeNotEnabledError(error: unknown): boolean {
  if (!(error instanceof APIError) || error.status !== 400) {
    return false
  }
  return error.message?.includes('Fast mode is not enabled') ?? false
}
609  
/**
 * True when the error is an APIError that represents an overloaded (529)
 * response.
 *
 * The status code alone is not sufficient: the SDK sometimes fails to pass
 * the 529 status through during streaming, so the serialized
 * `"type":"overloaded_error"` marker in the message is accepted as well.
 */
export function is529Error(error: unknown): boolean {
  if (error instanceof APIError) {
    if (error.status === 529) {
      return true
    }
    return error.message?.includes('"type":"overloaded_error"') ?? false
  }
  return false
}
622  
/**
 * True for a 403 APIError whose message contains
 * "OAuth token has been revoked".
 */
function isOAuthTokenRevokedError(error: unknown): boolean {
  if (!(error instanceof APIError) || error.status !== 403) {
    return false
  }
  return error.message?.includes('OAuth token has been revoked') ?? false
}
630  
/**
 * True when Bedrock is in use (CLAUDE_CODE_USE_BEDROCK is truthy) and the
 * error looks like an AWS authentication failure.
 */
function isBedrockAuthError(error: unknown): boolean {
  if (!isEnvTruthy(process.env.CLAUDE_CODE_USE_BEDROCK)) {
    return false
  }
  // Two flavours of failure:
  //  - AWS libs reject without an API call if .aws holds a past Expiration value
  //  - otherwise, API calls that receive expired tokens give a generic 403
  //    "The security token included in the request is invalid"
  return (
    isAwsCredentialsProviderError(error) ||
    (error instanceof APIError && error.status === 403)
  )
}
645  
/**
 * Clear the AWS credentials cache when the error is a Bedrock auth error.
 * @returns true if the cache was cleared, false if the error was unrelated.
 */
function handleAwsCredentialError(error: unknown): boolean {
  const isAuthFailure = isBedrockAuthError(error)
  if (isAuthFailure) {
    clearAwsCredentialsCache()
  }
  return isAuthFailure
}
657  
// google-auth-library throws plain Error (no typed name like AWS's
// CredentialsProviderError), so detection relies on the message text of
// common SDK-level credential failures.
function isGoogleAuthLibraryCredentialError(error: unknown): boolean {
  if (!(error instanceof Error)) {
    return false
  }
  const { message } = error
  const credentialFailureMarkers = [
    'Could not load the default credentials',
    'Could not refresh access token',
    'invalid_grant',
  ]
  return credentialFailureMarkers.some(marker => message.includes(marker))
}
669  
/**
 * True when Vertex is in use (CLAUDE_CODE_USE_VERTEX is truthy) and the error
 * looks like a GCP authentication failure.
 */
function isVertexAuthError(error: unknown): boolean {
  if (!isEnvTruthy(process.env.CLAUDE_CODE_USE_VERTEX)) {
    return false
  }
  // SDK-level: google-auth-library fails in prepareOptions() before the HTTP call.
  // Server-side: Vertex returns 401 for expired/invalid tokens.
  return (
    isGoogleAuthLibraryCredentialError(error) ||
    (error instanceof APIError && error.status === 401)
  )
}
683  
/**
 * Clear the GCP credentials cache when the error is a Vertex auth error.
 * @returns true if the cache was cleared, false if the error was unrelated.
 */
function handleGcpCredentialError(error: unknown): boolean {
  const isAuthFailure = isVertexAuthError(error)
  if (isAuthFailure) {
    clearGcpCredentialsCache()
  }
  return isAuthFailure
}
695  
/**
 * Decide whether an APIError is worth retrying.
 *
 * Checks run in a fixed order; the first one that matches decides the result.
 * Side effect: a 401 that reaches the status checks clears the API key helper
 * cache before reporting the error as retryable.
 *
 * @param error The API error from the failed attempt.
 * @returns true when the request should be attempted again.
 */
function shouldRetry(error: APIError): boolean {
  // Mock errors come from the /mock-limits testing command and are never retried.
  if (isMockRateLimitError(error)) {
    return false
  }

  // Persistent mode: 429/529 are always retryable, regardless of subscriber
  // gates or the x-should-retry header.
  if (isPersistentRetryEnabled() && isTransientCapacityError(error)) {
    return true
  }

  const { status } = error

  // CCR mode: auth is via infrastructure-provided JWTs, so a 401/403 is a
  // transient blip (auth service flap, network hiccup) rather than bad
  // credentials. This bypasses x-should-retry:false — the server assumes we'd
  // retry the same bad key, but our key is fine.
  if (
    isEnvTruthy(process.env.CLAUDE_CODE_REMOTE) &&
    (status === 401 || status === 403)
  ) {
    return true
  }

  // Overloaded errors are recognised by message content because the SDK
  // sometimes fails to pass the 529 status code through during streaming.
  if (error.message?.includes('"type":"overloaded_error"')) {
    return true
  }

  // Context-overflow errors are retryable: the retry loop can shrink max_tokens.
  if (parseMaxTokensContextOverflowError(error)) {
    return true
  }

  // Evaluated lazily so subscriber state is only queried when it matters.
  // Max and Pro limits reset in several hours, so retrying is pointless;
  // Enterprise users typically use PAYG instead of rate limits.
  const mayRetryRateLimited = (): boolean =>
    !isClaudeAISubscriber() || isEnterpriseSubscriber()

  // Note this is not a standard header.
  const serverDirective = error.headers?.get('x-should-retry')

  // Explicit "retry" from the server is obeyed, subject to the subscriber gate.
  if (serverDirective === 'true' && mayRetryRateLimited()) {
    return true
  }

  // Explicit "don't retry" is obeyed, except that ants may still retry 5xx
  // server errors. For other status codes (401, 403, 400, 429, etc.) the
  // header is respected.
  if (serverDirective === 'false') {
    const isServerError = status !== undefined && status >= 500
    if (process.env.USER_TYPE !== 'ant' || !isServerError) {
      return false
    }
  }

  if (error instanceof APIConnectionError) {
    return true
  }

  if (!status) {
    return false
  }

  switch (status) {
    case 408: // request timeout
    case 409: // lock timeout
      return true
    case 429: // rate limited
      return mayRetryRateLimited()
    case 401:
      // Clear API key cache and allow retry. OAuth token handling is done in
      // the main retry loop via handleOAuth401Error.
      clearApiKeyHelperCache()
      return true
  }

  // 403 "token revoked" follows the same refresh logic as 401.
  if (isOAuthTokenRevokedError(error)) {
    return true
  }

  // Remaining retryable case: internal server errors.
  return status >= 500
}
788  
/**
 * Resolve the default retry budget for API calls.
 *
 * `CLAUDE_CODE_MAX_RETRIES` overrides the built-in default, but only when it
 * parses to a non-negative integer. An empty, non-numeric, or negative value
 * falls back to DEFAULT_MAX_RETRIES: a NaN or negative budget would make the
 * retry loop's `attempt <= maxRetries + 1` check fail immediately, so the
 * request would never be attempted at all.
 *
 * @returns Number of retries allowed after the initial attempt.
 */
export function getDefaultMaxRetries(): number {
  const override = process.env.CLAUDE_CODE_MAX_RETRIES
  if (override) {
    const parsed = parseInt(override, 10)
    if (!Number.isNaN(parsed) && parsed >= 0) {
      return parsed
    }
  }
  return DEFAULT_MAX_RETRIES
}
/**
 * Pick the retry budget for one call: the caller's `maxRetries` when
 * provided, otherwise the environment/default value.
 */
function getMaxRetries(options: RetryOptions): number {
  const { maxRetries } = options
  if (maxRetries != null) {
    return maxRetries
  }
  return getDefaultMaxRetries()
}
798  
// Fast-mode cooldown length used when a 429/529 carries no usable retry-after.
const DEFAULT_FAST_MODE_FALLBACK_HOLD_MS = 30 * 60 * 1000 // 30 minutes
// A retry-after below this is waited out with fast mode still active;
// anything longer (or unknown) triggers the cooldown instead.
const SHORT_RETRY_THRESHOLD_MS = 20 * 1000 // 20 seconds
// Lower bound applied to every fast-mode cooldown to avoid flip-flopping.
const MIN_COOLDOWN_MS = 10 * 60 * 1000 // 10 minutes
802  
/**
 * Parse the error's `retry-after` header as whole seconds.
 *
 * @returns The wait in milliseconds, or null when the header is missing or
 *   does not start with an integer.
 */
function getRetryAfterMs(error: APIError): number | null {
  const headerValue = getRetryAfter(error)
  if (!headerValue) {
    return null
  }
  const seconds = parseInt(headerValue, 10)
  return isNaN(seconds) ? null : seconds * 1000
}
813  
/**
 * Work out how long to wait until the rate-limit window resets, based on the
 * `anthropic-ratelimit-unified-reset` header (a Unix timestamp in seconds).
 *
 * @returns Milliseconds until the reset, capped at PERSISTENT_RESET_CAP_MS,
 *   or null when the header is missing, not a finite number, or not in the
 *   future.
 */
function getRateLimitResetDelayMs(error: APIError): number | null {
  const rawReset = error.headers?.get?.('anthropic-ratelimit-unified-reset')
  if (!rawReset) {
    return null
  }

  const resetEpochSeconds = Number(rawReset)
  if (!Number.isFinite(resetEpochSeconds)) {
    return null
  }

  const msUntilReset = resetEpochSeconds * 1000 - Date.now()
  return msUntilReset > 0
    ? Math.min(msUntilReset, PERSISTENT_RESET_CAP_MS)
    : null
}