// services/tokenEstimation.ts
import type { Anthropic } from '@anthropic-ai/sdk'
import type { BetaMessageParam as MessageParam } from '@anthropic-ai/sdk/resources/beta/messages/messages.mjs'
// @aws-sdk/client-bedrock-runtime is imported dynamically in countTokensWithBedrock()
// to defer ~279KB of AWS SDK code until a Bedrock call is actually made
import type { CountTokensCommandInput } from '@aws-sdk/client-bedrock-runtime'
import { getAPIProvider } from 'src/utils/model/providers.js'
import { VERTEX_COUNT_TOKENS_ALLOWED_BETAS } from '../constants/betas.js'
import type { Attachment } from '../utils/attachments.js'
import { getModelBetas } from '../utils/betas.js'
import { getVertexRegionForModel, isEnvTruthy } from '../utils/envUtils.js'
import { logError } from '../utils/log.js'
import { normalizeAttachmentForAPI } from '../utils/messages.js'
import {
  createBedrockRuntimeClient,
  getInferenceProfileBackingModel,
  isFoundationModel,
} from '../utils/model/bedrock.js'
import {
  getDefaultSonnetModel,
  getMainLoopModel,
  getSmallFastModel,
  normalizeModelStringForAPI,
} from '../utils/model/model.js'
import { jsonStringify } from '../utils/slowOperations.js'
import { isToolReferenceBlock } from '../utils/toolSearch.js'
import { getAPIMetadata, getExtraBodyParams } from './api/claude.js'
import { getAnthropicClient } from './api/client.js'
import { withTokenCountVCR } from './vcr.js'

// Minimal values for token counting with thinking enabled
// API constraint: max_tokens must be greater than thinking.budget_tokens
const TOKEN_COUNT_THINKING_BUDGET = 1024
const TOKEN_COUNT_MAX_TOKENS = 2048

/**
 * Check if messages contain thinking blocks
 */
function hasThinkingBlocks(
  messages: Anthropic.Beta.Messages.BetaMessageParam[],
): boolean {
  for (const message of messages) {
    if (message.role === 'assistant' && Array.isArray(message.content)) {
      for (const block of message.content) {
        if (
          typeof block === 'object' &&
          block !== null &&
          'type' in block &&
          (block.type === 'thinking' || block.type === 'redacted_thinking')
        ) {
          return true
        }
      }
    }
  }
  return false
}

/**
 * Strip tool search-specific fields from messages before sending them for token counting.
 * This removes 'caller' from tool_use blocks and 'tool_reference' blocks from tool_result content.
 * These fields are only valid with the tool search beta and will cause errors otherwise.
 *
 * Note: We use type casts because the SDK types don't include tool search beta fields,
 * but at runtime these fields may exist from API responses when tool search was enabled.
 */
function stripToolSearchFieldsFromMessages(
  messages: Anthropic.Beta.Messages.BetaMessageParam[],
): Anthropic.Beta.Messages.BetaMessageParam[] {
  return messages.map(message => {
    if (!Array.isArray(message.content)) {
      return message
    }

    const normalizedContent = message.content.map(block => {
      // Strip 'caller' from tool_use blocks (assistant messages)
      if (block.type === 'tool_use') {
        // Destructure to exclude any extra fields like 'caller'
        const toolUse =
          block as Anthropic.Beta.Messages.BetaToolUseBlockParam & {
            caller?: unknown
          }
        return {
          type: 'tool_use' as const,
          id: toolUse.id,
          name: toolUse.name,
          input: toolUse.input,
        }
      }

      // Strip tool_reference blocks from tool_result content (user messages)
      if (block.type === 'tool_result') {
        const toolResult =
          block as Anthropic.Beta.Messages.BetaToolResultBlockParam
        if (Array.isArray(toolResult.content)) {
          const filteredContent = (toolResult.content as unknown[]).filter(
            c => !isToolReferenceBlock(c),
          ) as typeof toolResult.content

          if (filteredContent.length === 0) {
            return {
              ...toolResult,
              content: [{ type: 'text' as const, text: '[tool references]' }],
            }
          }
          if (filteredContent.length !== toolResult.content.length) {
            return {
              ...toolResult,
              content: filteredContent,
            }
          }
        }
      }

      return block
    })

    return {
      ...message,
      content: normalizedContent,
    }
  })
}
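
/**
 * Count tokens for a plain string by wrapping it in a single user message and
 * delegating to {@link countMessagesTokensWithAPI}. Returns 0 for empty content
 * (the API rejects empty messages) and null if counting fails.
 */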
export async function countTokensWithAPI(
  content: string,
): Promise<number | null> {
  // Special case for empty content - API doesn't accept empty messages
  if (!content) {
    return 0
  }

  const message: Anthropic.Beta.Messages.BetaMessageParam = {
    role: 'user',
    content: content,
  }

  return countMessagesTokensWithAPI([message], [])
}
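
/**
 * Count tokens for a set of messages and tools using the provider's token-counting API.
 * Bedrock is routed to {@link countTokensWithBedrock}; other providers use the SDK's
 * beta countTokens endpoint. Returns null when counting fails or no numeric count comes back.
 */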
export async function countMessagesTokensWithAPI(
  messages: Anthropic.Beta.Messages.BetaMessageParam[],
  tools: Anthropic.Beta.Messages.BetaToolUnion[],
): Promise<number | null> {
  return withTokenCountVCR(messages, tools, async () => {
    try {
      const model = getMainLoopModel()
      const betas = getModelBetas(model)
      const containsThinking = hasThinkingBlocks(messages)

      if (getAPIProvider() === 'bedrock') {
        // @anthropic-ai/bedrock-sdk doesn't currently support countTokens
        return countTokensWithBedrock({
          model: normalizeModelStringForAPI(model),
          messages,
          tools,
          betas,
          containsThinking,
        })
      }

      const anthropic = await getAnthropicClient({
        maxRetries: 1,
        model,
        source: 'count_tokens',
      })

      const filteredBetas =
        getAPIProvider() === 'vertex'
          ? betas.filter(b => VERTEX_COUNT_TOKENS_ALLOWED_BETAS.has(b))
          : betas

      const response = await anthropic.beta.messages.countTokens({
        model: normalizeModelStringForAPI(model),
        messages:
          // When we pass tools and no messages, we need to pass a dummy message
          // to get an accurate tool token count.
          messages.length > 0 ? messages : [{ role: 'user', content: 'foo' }],
        tools,
        ...(filteredBetas.length > 0 && { betas: filteredBetas }),
        // Enable thinking if messages contain thinking blocks
        ...(containsThinking && {
          thinking: {
            type: 'enabled',
            budget_tokens: TOKEN_COUNT_THINKING_BUDGET,
          },
        }),
      })

      if (typeof response.input_tokens !== 'number') {
        // When countTokens isn't supported, the Vertex client throws, while the Bedrock
        // client succeeds with { Output: { __type: 'com.amazon.coral.service#UnknownOperationException' }, Version: '1.0' }
        // instead of returning a count.
        return null
      }

      return response.input_tokens
    } catch (error) {
      logError(error)
      return null
    }
  })
}
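
/**
 * Cheap character-based token estimate: content length divided by an assumed
 * bytes-per-token ratio (4 by default).
 *
 * @example
 * // 400 characters at the default 4 bytes per token ≈ 100 tokens
 * roughTokenCountEstimation('x'.repeat(400)) // => 100
 */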
export function roughTokenCountEstimation(
  content: string,
  bytesPerToken: number = 4,
): number {
  return Math.round(content.length / bytesPerToken)
}

/**
 * Returns an estimated bytes-per-token ratio for a given file extension.
 * Dense JSON has many single-character tokens (`{`, `}`, `:`, `,`, `"`),
 * which makes the real ratio closer to 2 than to the default 4.
 */
export function bytesPerTokenForFileType(fileExtension: string): number {
  switch (fileExtension) {
    case 'json':
    case 'jsonl':
    case 'jsonc':
      return 2
    default:
      return 4
  }
}

/**
 * Like {@link roughTokenCountEstimation} but uses a more accurate
 * bytes-per-token ratio when the file type is known.
 *
 * This matters when the API-based token count is unavailable (e.g. on
 * Bedrock) and we fall back to the rough estimate — an underestimate can
 * let an oversized tool result slip into the conversation.
 */
export function roughTokenCountEstimationForFileType(
  content: string,
  fileExtension: string,
): number {
  return roughTokenCountEstimation(
    content,
    bytesPerTokenForFileType(fileExtension),
  )
}

/**
 * Estimates token count for a Message object by extracting and analyzing its text content.
 * This provides a more reliable estimate than getTokenUsage for messages that may have been compacted.
 * Uses Haiku for token counting (Haiku 4.5 supports thinking blocks), except:
 * - Vertex global region: uses Sonnet (Haiku is not available there)
 * - Bedrock or Vertex with thinking blocks: uses Sonnet (Haiku 3.5 doesn't support thinking)
 */
export async function countTokensViaHaikuFallback(
  messages: Anthropic.Beta.Messages.BetaMessageParam[],
  tools: Anthropic.Beta.Messages.BetaToolUnion[],
): Promise<number | null> {
  // Check if messages contain thinking blocks
  const containsThinking = hasThinkingBlocks(messages)

  // If we're on Vertex and using the global region, always use Sonnet since Haiku is not available there.
  const isVertexGlobalEndpoint =
    isEnvTruthy(process.env.CLAUDE_CODE_USE_VERTEX) &&
    getVertexRegionForModel(getSmallFastModel()) === 'global'
  // If we're on Bedrock with thinking blocks, use Sonnet since Haiku 3.5 doesn't support thinking
  const isBedrockWithThinking =
    isEnvTruthy(process.env.CLAUDE_CODE_USE_BEDROCK) && containsThinking
  // If we're on Vertex with thinking blocks, use Sonnet since Haiku 3.5 doesn't support thinking
  const isVertexWithThinking =
    isEnvTruthy(process.env.CLAUDE_CODE_USE_VERTEX) && containsThinking
  // Otherwise always use Haiku - Haiku 4.5 supports thinking blocks.
  // WARNING: if you change this to use a non-Haiku model, this request will fail in 1P unless it uses getCLISyspromptPrefix.
  // Note: We don't need Sonnet for tool_reference blocks because we strip them via
  // stripToolSearchFieldsFromMessages() before sending.
  // Use getSmallFastModel() to respect the ANTHROPIC_SMALL_FAST_MODEL env var for Bedrock users
  // with global inference profiles (see issue #10883).
  const model =
    isVertexGlobalEndpoint || isBedrockWithThinking || isVertexWithThinking
      ? getDefaultSonnetModel()
      : getSmallFastModel()
  const anthropic = await getAnthropicClient({
    maxRetries: 1,
    model,
    source: 'count_tokens',
  })

  // Strip tool search-specific fields (caller, tool_reference) before sending;
  // these fields are only valid with the tool search beta header
  const normalizedMessages = stripToolSearchFieldsFromMessages(messages)

  const messagesToSend: MessageParam[] =
    normalizedMessages.length > 0
      ? (normalizedMessages as MessageParam[])
      : [{ role: 'user', content: 'count' }]

  const betas = getModelBetas(model)
  // Filter betas for Vertex - some betas (like web-search) cause 400 errors
  // on certain Vertex endpoints. See issue #10789.
  const filteredBetas =
    getAPIProvider() === 'vertex'
      ? betas.filter(b => VERTEX_COUNT_TOKENS_ALLOWED_BETAS.has(b))
      : betas

  // biome-ignore lint/plugin: token counting needs specialized parameters (thinking, betas) that sideQuery doesn't support
  const response = await anthropic.beta.messages.create({
    model: normalizeModelStringForAPI(model),
    max_tokens: containsThinking ? TOKEN_COUNT_MAX_TOKENS : 1,
    messages: messagesToSend,
    tools: tools.length > 0 ? tools : undefined,
    ...(filteredBetas.length > 0 && { betas: filteredBetas }),
    metadata: getAPIMetadata(),
    ...getExtraBodyParams(),
    // Enable thinking if messages contain thinking blocks
    ...(containsThinking && {
      thinking: {
        type: 'enabled',
        budget_tokens: TOKEN_COUNT_THINKING_BUDGET,
      },
    }),
  })

  const usage = response.usage
  const inputTokens = usage.input_tokens
  const cacheCreationTokens = usage.cache_creation_input_tokens || 0
  const cacheReadTokens = usage.cache_read_input_tokens || 0

  return inputTokens + cacheCreationTokens + cacheReadTokens
}
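
/**
 * Sums {@link roughTokenCountEstimationForMessage} over a list of messages.
 */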
export function roughTokenCountEstimationForMessages(
  messages: readonly {
    type: string
    message?: { content?: unknown }
    attachment?: Attachment
  }[],
): number {
  let totalTokens = 0
  for (const message of messages) {
    totalTokens += roughTokenCountEstimationForMessage(message)
  }
  return totalTokens
}
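
/**
 * Rough token estimate for a single conversation entry: user/assistant messages
 * are estimated from their content; attachments are normalized to API messages
 * first and estimated message by message. Other entry types count as 0.
 */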
export function roughTokenCountEstimationForMessage(message: {
  type: string
  message?: { content?: unknown }
  attachment?: Attachment
}): number {
  if (
    (message.type === 'assistant' || message.type === 'user') &&
    message.message?.content
  ) {
    return roughTokenCountEstimationForContent(
      message.message?.content as
        | string
        | Array<Anthropic.ContentBlock>
        | Array<Anthropic.ContentBlockParam>
        | undefined,
    )
  }

  if (message.type === 'attachment' && message.attachment) {
    const userMessages = normalizeAttachmentForAPI(message.attachment)
    let total = 0
    for (const userMsg of userMessages) {
      total += roughTokenCountEstimationForContent(userMsg.message.content)
    }
    return total
  }

  return 0
}
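
/**
 * Rough token estimate for message content: strings use the character-based
 * estimate; block arrays are summed block by block.
 */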
function roughTokenCountEstimationForContent(
  content:
    | string
    | Array<Anthropic.ContentBlock>
    | Array<Anthropic.ContentBlockParam>
    | undefined,
): number {
  if (!content) {
    return 0
  }
  if (typeof content === 'string') {
    return roughTokenCountEstimation(content)
  }
  let totalTokens = 0
  for (const block of content) {
    totalTokens += roughTokenCountEstimationForBlock(block)
  }
  return totalTokens
}
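
/**
 * Rough per-block token estimate. Text-like blocks are estimated from their
 * character length, image/document blocks use a flat 2000-token figure, and
 * unrecognized block types fall back to the length of their JSON serialization.
 */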
function roughTokenCountEstimationForBlock(
  block: string | Anthropic.ContentBlock | Anthropic.ContentBlockParam,
): number {
  if (typeof block === 'string') {
    return roughTokenCountEstimation(block)
  }
  if (block.type === 'text') {
    return roughTokenCountEstimation(block.text)
  }
  if (block.type === 'image' || block.type === 'document') {
    // https://platform.claude.com/docs/en/build-with-claude/vision#calculate-image-costs
    // tokens = (width px * height px) / 750
    // Images are resized to max 2000x2000 (5333 tokens). Use a conservative
    // estimate that matches microCompact's IMAGE_MAX_TOKEN_SIZE to avoid
    // underestimating and triggering auto-compact too late.
    //
    // document: base64 PDF in source.data. It must NOT reach the
    // jsonStringify catch-all — a 1MB PDF is ~1.33M base64 chars →
    // ~325k estimated tokens, vs the ~2000 the API actually charges.
    // Same constant as microCompact's calculateToolResultTokens.
    return 2000
  }
  if (block.type === 'tool_result') {
    return roughTokenCountEstimationForContent(block.content)
  }
  if (block.type === 'tool_use') {
    // input is the JSON the model generated — arbitrarily large (bash
    // commands, Edit diffs, file contents). Stringify once for the
    // char count; the API re-serializes it anyway, so this is what it sees.
    return roughTokenCountEstimation(
      block.name + jsonStringify(block.input ?? {}),
    )
  }
  if (block.type === 'thinking') {
    return roughTokenCountEstimation(block.thinking)
  }
  if (block.type === 'redacted_thinking') {
    return roughTokenCountEstimation(block.data)
  }
  // server_tool_use, web_search_tool_result, mcp_tool_use, etc. —
  // text-like payloads (tool inputs, search results, no base64).
  // The stringified length tracks the serialized form the API sees; the
  // key/bracket overhead is single-digit percent on real blocks.
  return roughTokenCountEstimation(jsonStringify(block))
}
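
/**
 * Counts tokens via Bedrock's CountTokens API. Builds the same Anthropic-style
 * request body an InvokeModel call would send, resolves inference profiles to
 * their backing foundation model (CountTokens requires a model ID), and returns
 * null if no backing model can be determined or the call fails.
 */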
async function countTokensWithBedrock({
  model,
  messages,
  tools,
  betas,
  containsThinking,
}: {
  model: string
  messages: Anthropic.Beta.Messages.BetaMessageParam[]
  tools: Anthropic.Beta.Messages.BetaToolUnion[]
  betas: string[]
  containsThinking: boolean
}): Promise<number | null> {
  try {
    const client = await createBedrockRuntimeClient()
    // Bedrock CountTokens requires a model ID, not an inference profile / ARN
    const modelId = isFoundationModel(model)
      ? model
      : await getInferenceProfileBackingModel(model)
    if (!modelId) {
      return null
    }

    const requestBody = {
      anthropic_version: 'bedrock-2023-05-31',
      // When we pass tools and no messages, we need to pass a dummy message
      // to get an accurate tool token count.
      messages:
        messages.length > 0 ? messages : [{ role: 'user', content: 'foo' }],
      max_tokens: containsThinking ? TOKEN_COUNT_MAX_TOKENS : 1,
      ...(tools.length > 0 && { tools }),
      ...(betas.length > 0 && { anthropic_beta: betas }),
      ...(containsThinking && {
        thinking: {
          type: 'enabled',
          budget_tokens: TOKEN_COUNT_THINKING_BUDGET,
        },
      }),
    }

    const { CountTokensCommand } = await import(
      '@aws-sdk/client-bedrock-runtime'
    )
    const input: CountTokensCommandInput = {
      modelId,
      input: {
        invokeModel: {
          body: new TextEncoder().encode(jsonStringify(requestBody)),
        },
      },
    }
    const response = await client.send(new CountTokensCommand(input))
    const tokenCount = response.inputTokens ?? null
    return tokenCount
  } catch (error) {
    logError(error)
    return null
  }
}