// src/utils/tokens.ts
import type { BetaUsage as Usage } from '@anthropic-ai/sdk/resources/beta/messages/messages.mjs'
import { roughTokenCountEstimationForMessages } from '../services/tokenEstimation.js'
import type { AssistantMessage, Message } from '../types/message.js'
import { SYNTHETIC_MESSAGES, SYNTHETIC_MODEL } from './messages.js'
import { jsonStringify } from './slowOperations.js'

export function getTokenUsage(message: Message): Usage | undefined {
  if (
    message?.type === 'assistant' &&
    'usage' in message.message &&
    !(
      message.message.content[0]?.type === 'text' &&
      SYNTHETIC_MESSAGES.has(message.message.content[0].text)
    ) &&
    message.message.model !== SYNTHETIC_MODEL
  ) {
    return message.message.usage
  }
  return undefined
}
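
// Usage sketch (hypothetical transcript value): synthetic placeholder
// messages are filtered out here, so they never feed the token math below.
//   const usage = getTokenUsage(messages.at(-1)!)
//   if (usage) console.log(usage.input_tokens, usage.output_tokens)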

/**
 * Get the API response id for a non-synthetic assistant message.
 * Used to identify split assistant records that came from the same API response —
 * when parallel tool calls are streamed, each content block becomes a separate
 * AssistantMessage record, but they all share the same message.id.
 */
function getAssistantMessageId(message: Message): string | undefined {
  if (
    message?.type === 'assistant' &&
    'id' in message.message &&
    message.message.model !== SYNTHETIC_MODEL
  ) {
    return message.message.id
  }
  return undefined
}

/**
 * Calculate total context window tokens from an API response's usage data.
 * Includes input_tokens + cache tokens + output_tokens.
 *
 * This represents the full context size at the time of that API call.
 * Use tokenCountWithEstimation() when you need context size from messages.
 */
export function getTokenCountFromUsage(usage: Usage): number {
  return (
    usage.input_tokens +
    (usage.cache_creation_input_tokens ?? 0) +
    (usage.cache_read_input_tokens ?? 0) +
    usage.output_tokens
  )
}
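
// Worked example (hypothetical numbers): for
//   usage = { input_tokens: 1_200, cache_creation_input_tokens: 40_000,
//             cache_read_input_tokens: 110_000, output_tokens: 800 }
// this returns 1_200 + 40_000 + 110_000 + 800 = 152_000 tokens of context
// at the time of that call.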

export function tokenCountFromLastAPIResponse(messages: Message[]): number {
  let i = messages.length - 1
  while (i >= 0) {
    const message = messages[i]
    const usage = message ? getTokenUsage(message) : undefined
    if (usage) {
      return getTokenCountFromUsage(usage)
    }
    i--
  }
  return 0
}

/**
 * Final context window size from the last API response's usage.iterations[-1].
 * Used for task_budget.remaining computation across compaction boundaries —
 * the server's budget countdown is context-based, so remaining decrements by
 * the pre-compact final window, not billing spend. See monorepo
 * api/api/sampling/prompt/renderer.py:292 for the server-side computation.
 *
 * Falls back to top-level input_tokens + output_tokens when iterations is
 * absent (no server-side tool loops, so top-level usage IS the final window).
 * Both paths exclude cache tokens to match #304930's formula.
 */
export function finalContextTokensFromLastResponse(
  messages: Message[],
): number {
  let i = messages.length - 1
  while (i >= 0) {
    const message = messages[i]
    const usage = message ? getTokenUsage(message) : undefined
    if (usage) {
      // Stainless types don't include iterations yet — cast like advisor.ts:43
      const iterations = (
        usage as {
          iterations?: Array<{
            input_tokens: number
            output_tokens: number
          }> | null
        }
      ).iterations
      if (iterations && iterations.length > 0) {
        const last = iterations.at(-1)!
        return last.input_tokens + last.output_tokens
      }
      // No iterations → no server tool loop → top-level usage IS the final
      // window. Match the iterations path's formula (input + output, no cache)
      // rather than getTokenCountFromUsage — #304930 defines final window as
      // non-cache input + output. Whether the server's budget countdown
      // (renderer.py:292 calculate_context_tokens) counts cache the same way
      // is an open question; aligning with the iterations path keeps the two
      // branches consistent until that's resolved.
      return usage.input_tokens + usage.output_tokens
    }
    i--
  }
  return 0
}
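
// Shape sketch (hypothetical usage object): with server-side tool loops the
// last iteration wins and cache tokens are ignored:
//   { input_tokens: 900, output_tokens: 150, cache_read_input_tokens: 80_000,
//     iterations: [
//       { input_tokens: 900, output_tokens: 150 },
//       { input_tokens: 2_100, output_tokens: 300 },
//     ] }
// Here finalContextTokensFromLastResponse yields 2_100 + 300 = 2_400, where
// getTokenCountFromUsage on the same usage would have yielded 81_050.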

/**
 * Get only the output_tokens from the last API response.
 * This excludes input context (system prompt, tools, prior messages).
 *
 * WARNING: Do NOT use this for threshold comparisons (autocompact, session memory).
 * Use tokenCountWithEstimation() instead, which measures full context size.
 * This function is only useful for measuring how many tokens Claude generated
 * in a single response, not how full the context window is.
 */
export function messageTokenCountFromLastAPIResponse(
  messages: Message[],
): number {
  let i = messages.length - 1
  while (i >= 0) {
    const message = messages[i]
    const usage = message ? getTokenUsage(message) : undefined
    if (usage) {
      return usage.output_tokens
    }
    i--
  }
  return 0
}
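
// Contrast sketch (hypothetical numbers): for
//   usage = { input_tokens: 50_000, output_tokens: 700, ... }
// this returns 700, while tokenCountWithEstimation would report at least
// 50_700; that gap is why only the latter is safe for autocompact-style
// threshold checks.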

export function getCurrentUsage(messages: Message[]): {
  input_tokens: number
  output_tokens: number
  cache_creation_input_tokens: number
  cache_read_input_tokens: number
} | null {
  for (let i = messages.length - 1; i >= 0; i--) {
    const message = messages[i]
    const usage = message ? getTokenUsage(message) : undefined
    if (usage) {
      return {
        input_tokens: usage.input_tokens,
        output_tokens: usage.output_tokens,
        cache_creation_input_tokens: usage.cache_creation_input_tokens ?? 0,
        cache_read_input_tokens: usage.cache_read_input_tokens ?? 0,
      }
    }
  }
  return null
}

export function doesMostRecentAssistantMessageExceed200k(
  messages: Message[],
): boolean {
  const THRESHOLD = 200_000

  const lastAsst = messages.findLast(m => m.type === 'assistant')
  if (!lastAsst) return false
  const usage = getTokenUsage(lastAsst)
  return usage ? getTokenCountFromUsage(usage) > THRESHOLD : false
}

/**
 * Calculate the character content length of an assistant message.
 * Used for spinner token estimation (characters / 4 ≈ tokens) when subagent
 * streaming events are filtered out and we need to count content from
 * completed messages instead.
 *
 * Counts the same content that handleMessageFromStream would count via deltas:
 * - text (text_delta)
 * - thinking (thinking_delta)
 * - redacted_thinking data
 * - tool_use input (input_json_delta)
 * Note: signature_delta is excluded from streaming counts (not model output).
 */
export function getAssistantMessageContentLength(
  message: AssistantMessage,
): number {
  let contentLength = 0
  for (const block of message.message.content) {
    if (block.type === 'text') {
      contentLength += block.text.length
    } else if (block.type === 'thinking') {
      contentLength += block.thinking.length
    } else if (block.type === 'redacted_thinking') {
      contentLength += block.data.length
    } else if (block.type === 'tool_use') {
      contentLength += jsonStringify(block.input).length
    }
  }
  return contentLength
}
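
// Worked example (hypothetical content): a message with text "All done."
// (9 chars) plus a tool_use whose input serializes to '{"path":"a.ts"}'
// (15 chars) has a content length of 24, i.e. roughly 24 / 4 = 6 tokens
// for the spinner.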

/**
 * Get the current context window size in tokens.
 *
 * This is the CANONICAL function for measuring context size when checking
 * thresholds (autocompact, session memory init, etc.). Uses the last API
 * response's token count (input + output + cache) plus estimates for any
 * messages added since.
 *
 * Always use this instead of:
 * - Cumulative token counting (which double-counts as context grows)
 * - messageTokenCountFromLastAPIResponse (which only counts output_tokens)
 * - tokenCountFromLastAPIResponse (which doesn't estimate new messages)
 *
 * Implementation note on parallel tool calls: when the model makes multiple
 * tool calls in one response, the streaming code emits a SEPARATE assistant
 * record per content block (all sharing the same message.id and usage), and
 * the query loop interleaves each tool_result immediately after its tool_use.
 * So the messages array looks like:
 *   [..., assistant(id=A), user(result), assistant(id=A), user(result), ...]
 * If we stop at the LAST assistant record, we only estimate the one tool_result
 * after it and miss all the earlier interleaved tool_results — which will ALL
 * be in the next API request. To avoid undercounting, after finding a usage-
 * bearing record we walk back to the FIRST sibling with the same message.id
 * so every interleaved tool_result is included in the rough estimate.
 */
export function tokenCountWithEstimation(messages: readonly Message[]): number {
  let i = messages.length - 1
  while (i >= 0) {
    const message = messages[i]
    const usage = message ? getTokenUsage(message) : undefined
    if (message && usage) {
      // Walk back past any earlier sibling records split from the same API
      // response (same message.id) so interleaved tool_results between them
      // are included in the estimation slice.
      const responseId = getAssistantMessageId(message)
      if (responseId) {
        let j = i - 1
        while (j >= 0) {
          const prior = messages[j]
          const priorId = prior ? getAssistantMessageId(prior) : undefined
          if (priorId === responseId) {
            // Earlier split of the same API response — anchor here instead.
            i = j
          } else if (priorId !== undefined) {
            // Hit a different API response — stop walking.
            break
          }
          // priorId === undefined: a user/tool_result/attachment message,
          // possibly interleaved between splits — keep walking.
          j--
        }
      }
      return (
        getTokenCountFromUsage(usage) +
        roughTokenCountEstimationForMessages(messages.slice(i + 1))
      )
    }
    i--
  }
  return roughTokenCountEstimationForMessages(messages)
}
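
// Trace sketch (hypothetical transcript): given
//   [user, assistant(id=A, usage), user(tool_result #1),
//    assistant(id=A, usage), user(tool_result #2)]
// the scan finds usage on the trailing assistant(id=A), walks back to the
// first id=A record, and estimates everything after it, so both tool_results
// (not just #2) are added on top of getTokenCountFromUsage(usage).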