utils/context.ts
// biome-ignore-all assist/source/organizeImports: ANT-ONLY import markers must not be reordered
import { CONTEXT_1M_BETA_HEADER } from '../constants/betas.js'
import { getGlobalConfig } from './config.js'
import { isEnvTruthy } from './envUtils.js'
import { getCanonicalName } from './model/model.js'
import { getModelCapability } from './model/modelCapabilities.js'
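// resolveAntModel is used below but had no import; the module path here is assumed
import { resolveAntModel } from './model/antModel.js'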

// Model context window size (200k tokens for all models right now)
export const MODEL_CONTEXT_WINDOW_DEFAULT = 200_000

// Maximum output tokens for compact operations
export const COMPACT_MAX_OUTPUT_TOKENS = 20_000

// Default and upper limit for max output tokens
const MAX_OUTPUT_TOKENS_DEFAULT = 32_000
const MAX_OUTPUT_TOKENS_UPPER_LIMIT = 64_000

// Capped default for slot-reservation optimization. BQ p99 output = 4,911
// tokens, so 32k/64k defaults over-reserve 8-16× slot capacity. With the cap
// enabled, <1% of requests hit the limit; those get one clean retry at 64k
// (see query.ts max_output_tokens_escalate). Cap is applied in
// claude.ts:getMaxOutputTokensForModel to avoid the growthbook→betas→context
// import cycle.
export const CAPPED_DEFAULT_MAX_TOKENS = 8_000
export const ESCALATED_MAX_TOKENS = 64_000

/**
 * Check if 1M context is disabled via environment variable.
 * Used by C4E admins to disable 1M context for HIPAA compliance.
 */
export function is1mContextDisabled(): boolean {
  return isEnvTruthy(process.env.CLAUDE_CODE_DISABLE_1M_CONTEXT)
}

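/**
 * Check whether the model id carries an explicit [1m] suffix (client-side
 * opt-in to 1M context). Returns false when 1M context is disabled via
 * environment variable.
 */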
export function has1mContext(model: string): boolean {
  if (is1mContextDisabled()) {
    return false
  }
  return /\[1m\]/i.test(model)
}

// @[MODEL LAUNCH]: Update this pattern if the new model supports 1M context
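/**
 * Check whether the model family can serve a 1M-token context window (without
 * requiring an explicit [1m] suffix). Returns false when 1M context is
 * disabled via environment variable.
 */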
export function modelSupports1M(model: string): boolean {
  if (is1mContextDisabled()) {
    return false
  }
  const canonical = getCanonicalName(model)
  return canonical.includes('claude-sonnet-4') || canonical.includes('opus-4-6')
}

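/**
 * Resolve the effective context window (in tokens) for a model, in order of
 * precedence: ant-only env override, explicit [1m] suffix, capability
 * metadata, 1M beta header, Sonnet 1M experiment treatment, ant model
 * resolution, and finally the 200k default.
 */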
export function getContextWindowForModel(
  model: string,
  betas?: string[],
): number {
  // Allow override via environment variable (ant-only)
  // This takes precedence over all other context window resolution, including 1M detection,
  // so users can cap the effective context window for local decisions (auto-compact, etc.)
  // while still using a 1M-capable endpoint.
  if (
    process.env.USER_TYPE === 'ant' &&
    process.env.CLAUDE_CODE_MAX_CONTEXT_TOKENS
  ) {
    const override = parseInt(process.env.CLAUDE_CODE_MAX_CONTEXT_TOKENS, 10)
    if (!isNaN(override) && override > 0) {
      return override
    }
  }

  // [1m] suffix: explicit client-side opt-in, respected over all detection
  if (has1mContext(model)) {
    return 1_000_000
  }

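  // Capability metadata, when it reports a plausible input-token limit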
  const cap = getModelCapability(model)
  if (cap?.max_input_tokens && cap.max_input_tokens >= 100_000) {
    if (
      cap.max_input_tokens > MODEL_CONTEXT_WINDOW_DEFAULT &&
      is1mContextDisabled()
    ) {
      return MODEL_CONTEXT_WINDOW_DEFAULT
    }
    return cap.max_input_tokens
  }

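  // 1M context via the beta header or the Sonnet experiment treatment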
  if (betas?.includes(CONTEXT_1M_BETA_HEADER) && modelSupports1M(model)) {
    return 1_000_000
  }
  if (getSonnet1mExpTreatmentEnabled(model)) {
    return 1_000_000
  }
  if (process.env.USER_TYPE === 'ant') {
    const antModel = resolveAntModel(model)
    if (antModel?.contextWindow) {
      return antModel.contextWindow
    }
  }
  return MODEL_CONTEXT_WINDOW_DEFAULT
}

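/**
 * Check whether the Sonnet 4.6 1M-context experiment treatment
 * (coral_reef_sonnet) is enabled for this client. Only applies when the model
 * has no explicit [1m] suffix and 1M context is not disabled.
 */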
export function getSonnet1mExpTreatmentEnabled(model: string): boolean {
  if (is1mContextDisabled()) {
    return false
  }
  // Only applies to sonnet 4.6 without an explicit [1m] suffix
  if (has1mContext(model)) {
    return false
  }
  if (!getCanonicalName(model).includes('sonnet-4-6')) {
    return false
  }
  return getGlobalConfig().clientDataCache?.['coral_reef_sonnet'] === 'true'
}

/**
 * Calculate context window usage percentage from token usage data.
 * Returns used and remaining percentages, or null values if no usage data.
 */
export function calculateContextPercentages(
  currentUsage: {
    input_tokens: number
    cache_creation_input_tokens: number
    cache_read_input_tokens: number
  } | null,
  contextWindowSize: number,
): { used: number | null; remaining: number | null } {
  if (!currentUsage) {
    return { used: null, remaining: null }
  }

  const totalInputTokens =
    currentUsage.input_tokens +
    currentUsage.cache_creation_input_tokens +
    currentUsage.cache_read_input_tokens

  const usedPercentage = Math.round(
    (totalInputTokens / contextWindowSize) * 100,
  )
  const clampedUsed = Math.min(100, Math.max(0, usedPercentage))

  return {
    used: clampedUsed,
    remaining: 100 - clampedUsed,
  }
}

/**
 * Returns the model's default and upper limit for max output tokens.
 */
export function getModelMaxOutputTokens(model: string): {
  default: number
  upperLimit: number
} {
  let defaultTokens: number
  let upperLimit: number

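  // Ant-internal models may define their own default and upper token limits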
  if (process.env.USER_TYPE === 'ant') {
    const antModel = resolveAntModel(model.toLowerCase())
    if (antModel) {
      defaultTokens = antModel.defaultMaxTokens ?? MAX_OUTPUT_TOKENS_DEFAULT
      upperLimit = antModel.upperMaxTokensLimit ?? MAX_OUTPUT_TOKENS_UPPER_LIMIT
      return { default: defaultTokens, upperLimit }
    }
  }

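  // Hard-coded limits keyed off the canonical model name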
  const m = getCanonicalName(model)

  if (m.includes('opus-4-6')) {
    defaultTokens = 64_000
    upperLimit = 128_000
  } else if (m.includes('sonnet-4-6')) {
    defaultTokens = 32_000
    upperLimit = 128_000
  } else if (
    m.includes('opus-4-5') ||
    m.includes('sonnet-4') ||
    m.includes('haiku-4')
  ) {
    defaultTokens = 32_000
    upperLimit = 64_000
  } else if (m.includes('opus-4-1') || m.includes('opus-4')) {
    defaultTokens = 32_000
    upperLimit = 32_000
  } else if (m.includes('claude-3-opus')) {
    defaultTokens = 4_096
    upperLimit = 4_096
  } else if (m.includes('claude-3-sonnet')) {
    defaultTokens = 8_192
    upperLimit = 8_192
  } else if (m.includes('claude-3-haiku')) {
    defaultTokens = 4_096
    upperLimit = 4_096
  } else if (m.includes('3-5-sonnet') || m.includes('3-5-haiku')) {
    defaultTokens = 8_192
    upperLimit = 8_192
  } else if (m.includes('3-7-sonnet')) {
    defaultTokens = 32_000
    upperLimit = 64_000
  } else {
    defaultTokens = MAX_OUTPUT_TOKENS_DEFAULT
    upperLimit = MAX_OUTPUT_TOKENS_UPPER_LIMIT
  }

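  // Capability metadata overrides the upper limit; clamp the default to it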
  const cap = getModelCapability(model)
  if (cap?.max_tokens && cap.max_tokens >= 4_096) {
    upperLimit = cap.max_tokens
    defaultTokens = Math.min(defaultTokens, upperLimit)
  }

  return { default: defaultTokens, upperLimit }
}

/**
 * Returns the max thinking budget tokens for a given model. The max
 * thinking tokens should be strictly less than the max output tokens.
 *
 * @deprecated Newer models use adaptive thinking rather than a strict
 * thinking token budget.
 */
export function getMaxThinkingTokensForModel(model: string): number {
  return getModelMaxOutputTokens(model).upperLimit - 1
}