Cradicle Explorer

/ utils / toolResultStorage.ts
toolResultStorage.ts
   1  /**
   2   * Utility for persisting large tool results to disk instead of truncating them.
   3   */
   4  
   5  import type { ToolResultBlockParam } from '@anthropic-ai/sdk/resources/index.mjs'
   6  import { mkdir, writeFile } from 'fs/promises'
   7  import { join } from 'path'
   8  import { getOriginalCwd, getSessionId } from '../bootstrap/state.js'
   9  import {
  10    BYTES_PER_TOKEN,
  11    DEFAULT_MAX_RESULT_SIZE_CHARS,
  12    MAX_TOOL_RESULT_BYTES,
  13    MAX_TOOL_RESULTS_PER_MESSAGE_CHARS,
  14  } from '../constants/toolLimits.js'
  15  import { getFeatureValue_CACHED_MAY_BE_STALE } from '../services/analytics/growthbook.js'
  16  import { logEvent } from '../services/analytics/index.js'
  17  import { sanitizeToolNameForAnalytics } from '../services/analytics/metadata.js'
  18  import type { Message } from '../types/message.js'
  19  import { logForDebugging } from './debug.js'
  20  import { getErrnoCode, toError } from './errors.js'
  21  import { formatFileSize } from './format.js'
  22  import { logError } from './log.js'
  23  import { getProjectDir } from './sessionStorage.js'
  24  import { jsonStringify } from './slowOperations.js'
  25  
// Subdirectory name for tool results within a session. Joined onto the
// session directory by getToolResultsDir().
export const TOOL_RESULTS_SUBDIR = 'tool-results'

// XML-style tags used to wrap persisted output messages. The opening tag is
// the first thing buildLargeToolResultMessage emits, and
// isContentAlreadyCompacted() detects replaced content by checking for it at
// the start of the string.
export const PERSISTED_OUTPUT_TAG = '<persisted-output>'
export const PERSISTED_OUTPUT_CLOSING_TAG = '</persisted-output>'

// Message used when tool result content was cleared without persisting to file
export const TOOL_RESULT_CLEARED_MESSAGE = '[Old tool result content cleared]'
  35  
/**
 * Name of the GrowthBook flag holding the override map:
 * tool name -> persistence threshold (chars).
 * When a tool name is present in this map, that value is used directly as the
 * effective threshold, bypassing the Math.min() clamp against the global
 * default (DEFAULT_MAX_RESULT_SIZE_CHARS).
 * Tools absent from the map use the hardcoded fallback.
 * Flag default is {} (no overrides == behavior unchanged).
 * Read by getPersistenceThreshold().
 */
const PERSIST_THRESHOLD_OVERRIDE_FLAG = 'tengu_satin_quoll'
  44  
  45  /**
  46   * Resolve the effective persistence threshold for a tool.
  47   * GrowthBook override wins when present; otherwise falls back to the declared
  48   * per-tool cap clamped by the global default.
  49   *
  50   * Defensive: GrowthBook's cache returns `cached !== undefined ? cached : default`,
  51   * so a flag served as `null` leaks through. We guard with optional chaining and a
  52   * typeof check so any non-object flag value (null, string, number) falls through
  53   * to the hardcoded default instead of throwing on index or returning 0.
  54   */
  55  export function getPersistenceThreshold(
  56    toolName: string,
  57    declaredMaxResultSizeChars: number,
  58  ): number {
  59    // Infinity = hard opt-out. Read self-bounds via maxTokens; persisting its
  60    // output to a file the model reads back with Read is circular. Checked
  61    // before the GB override so tengu_satin_quoll can't force it back on.
  62    if (!Number.isFinite(declaredMaxResultSizeChars)) {
  63      return declaredMaxResultSizeChars
  64    }
  65    const overrides = getFeatureValue_CACHED_MAY_BE_STALE<Record<
  66      string,
  67      number
  68    > | null>(PERSIST_THRESHOLD_OVERRIDE_FLAG, {})
  69    const override = overrides?.[toolName]
  70    if (
  71      typeof override === 'number' &&
  72      Number.isFinite(override) &&
  73      override > 0
  74    ) {
  75      return override
  76    }
  77    return Math.min(declaredMaxResultSizeChars, DEFAULT_MAX_RESULT_SIZE_CHARS)
  78  }
  79  
// Result of persisting a tool result to disk (the success shape returned by
// persistToolResult).
export type PersistedToolResult = {
  // Path of the persisted file, as computed by getToolResultPath()
  filepath: string
  // Length of the serialized content string (String.length, not encoded bytes)
  originalSize: number
  // True when the content was an array of blocks and was serialized as JSON
  isJson: boolean
  // Leading portion of the serialized content, produced by generatePreview()
  preview: string
  // True when the preview was truncated (content is longer than the preview)
  hasMore: boolean
}
  88  
// Error result when persistence fails (the failure shape returned by
// persistToolResult). Distinguished from PersistedToolResult by the presence
// of the `error` key — see isPersistError().
export type PersistToolResultError = {
  // Human-readable description of why the result could not be persisted
  error: string
}
  93  
  94  /**
  95   * Get the session directory (projectDir/sessionId)
  96   */
  97  function getSessionDir(): string {
  98    return join(getProjectDir(getOriginalCwd()), getSessionId())
  99  }
 100  
 101  /**
 102   * Get the tool results directory for this session (projectDir/sessionId/tool-results)
 103   */
 104  export function getToolResultsDir(): string {
 105    return join(getSessionDir(), TOOL_RESULTS_SUBDIR)
 106  }
 107  
// Preview size for the reference message. Passed to generatePreview() as its
// limit and formatted into the "Preview (first …)" label by
// buildLargeToolResultMessage(). Note: generatePreview() compares this value
// against String.length, so it is effectively a character count rather than
// an encoded byte count.
export const PREVIEW_SIZE_BYTES = 2000
 110  
 111  /**
 112   * Get the filepath where a tool result would be persisted.
 113   */
 114  export function getToolResultPath(id: string, isJson: boolean): string {
 115    const ext = isJson ? 'json' : 'txt'
 116    return join(getToolResultsDir(), `${id}.${ext}`)
 117  }
 118  
 119  /**
 120   * Ensure the session-specific tool results directory exists
 121   */
 122  export async function ensureToolResultsDir(): Promise<void> {
 123    try {
 124      await mkdir(getToolResultsDir(), { recursive: true })
 125    } catch {
 126      // Directory may already exist
 127    }
 128  }
 129  
 130  /**
 131   * Persist a tool result to disk and return information about the persisted file
 132   *
 133   * @param content - The tool result content to persist (string or array of content blocks)
 134   * @param toolUseId - The ID of the tool use that produced the result
 135   * @returns Information about the persisted file including filepath and preview
 136   */
 137  export async function persistToolResult(
 138    content: NonNullable<ToolResultBlockParam['content']>,
 139    toolUseId: string,
 140  ): Promise<PersistedToolResult | PersistToolResultError> {
 141    const isJson = Array.isArray(content)
 142  
 143    // Check for non-text content - we can only persist text blocks
 144    if (isJson) {
 145      const hasNonTextContent = content.some(block => block.type !== 'text')
 146      if (hasNonTextContent) {
 147        return {
 148          error: 'Cannot persist tool results containing non-text content',
 149        }
 150      }
 151    }
 152  
 153    await ensureToolResultsDir()
 154    const filepath = getToolResultPath(toolUseId, isJson)
 155    const contentStr = isJson ? jsonStringify(content, null, 2) : content
 156  
 157    // tool_use_id is unique per invocation and content is deterministic for a
 158    // given id, so skip if the file already exists. This prevents re-writing
 159    // the same content on every API turn when microcompact replays the
 160    // original messages. Use 'wx' instead of a stat-then-write race.
 161    try {
 162      await writeFile(filepath, contentStr, { encoding: 'utf-8', flag: 'wx' })
 163      logForDebugging(
 164        `Persisted tool result to ${filepath} (${formatFileSize(contentStr.length)})`,
 165      )
 166    } catch (error) {
 167      if (getErrnoCode(error) !== 'EEXIST') {
 168        logError(toError(error))
 169        return { error: getFileSystemErrorMessage(toError(error)) }
 170      }
 171      // EEXIST: already persisted on a prior turn, fall through to preview
 172    }
 173  
 174    // Generate a preview
 175    const { preview, hasMore } = generatePreview(contentStr, PREVIEW_SIZE_BYTES)
 176  
 177    return {
 178      filepath,
 179      originalSize: contentStr.length,
 180      isJson,
 181      preview,
 182      hasMore,
 183    }
 184  }
 185  
 186  /**
 187   * Build a message for large tool results with preview
 188   */
 189  export function buildLargeToolResultMessage(
 190    result: PersistedToolResult,
 191  ): string {
 192    let message = `${PERSISTED_OUTPUT_TAG}\n`
 193    message += `Output too large (${formatFileSize(result.originalSize)}). Full output saved to: ${result.filepath}\n\n`
 194    message += `Preview (first ${formatFileSize(PREVIEW_SIZE_BYTES)}):\n`
 195    message += result.preview
 196    message += result.hasMore ? '\n...\n' : '\n'
 197    message += PERSISTED_OUTPUT_CLOSING_TAG
 198    return message
 199  }
 200  
 201  /**
 202   * Process a tool result for inclusion in a message.
 203   * Maps the result to the API format and persists large results to disk.
 204   */
 205  export async function processToolResultBlock<T>(
 206    tool: {
 207      name: string
 208      maxResultSizeChars: number
 209      mapToolResultToToolResultBlockParam: (
 210        result: T,
 211        toolUseID: string,
 212      ) => ToolResultBlockParam
 213    },
 214    toolUseResult: T,
 215    toolUseID: string,
 216  ): Promise<ToolResultBlockParam> {
 217    const toolResultBlock = tool.mapToolResultToToolResultBlockParam(
 218      toolUseResult,
 219      toolUseID,
 220    )
 221    return maybePersistLargeToolResult(
 222      toolResultBlock,
 223      tool.name,
 224      getPersistenceThreshold(tool.name, tool.maxResultSizeChars),
 225    )
 226  }
 227  
 228  /**
 229   * Process a pre-mapped tool result block. Applies persistence for large results
 230   * without re-calling mapToolResultToToolResultBlockParam.
 231   */
 232  export async function processPreMappedToolResultBlock(
 233    toolResultBlock: ToolResultBlockParam,
 234    toolName: string,
 235    maxResultSizeChars: number,
 236  ): Promise<ToolResultBlockParam> {
 237    return maybePersistLargeToolResult(
 238      toolResultBlock,
 239      toolName,
 240      getPersistenceThreshold(toolName, maxResultSizeChars),
 241    )
 242  }
 243  
 244  /**
 245   * True when a tool_result's content is empty or effectively empty. Covers:
 246   * undefined/null/'', whitespace-only strings, empty arrays, and arrays whose
 247   * only blocks are text blocks with empty/whitespace text. Non-text blocks
 248   * (images, tool_reference) are treated as non-empty.
 249   */
 250  export function isToolResultContentEmpty(
 251    content: ToolResultBlockParam['content'],
 252  ): boolean {
 253    if (!content) return true
 254    if (typeof content === 'string') return content.trim() === ''
 255    if (!Array.isArray(content)) return false
 256    if (content.length === 0) return true
 257    return content.every(
 258      block =>
 259        typeof block === 'object' &&
 260        'type' in block &&
 261        block.type === 'text' &&
 262        'text' in block &&
 263        (typeof block.text !== 'string' || block.text.trim() === ''),
 264    )
 265  }
 266  
 267  /**
 268   * Handle large tool results by persisting to disk instead of truncating.
 269   * Returns the original block if no persistence needed, or a modified block
 270   * with the content replaced by a reference to the persisted file.
 271   */
 272  async function maybePersistLargeToolResult(
 273    toolResultBlock: ToolResultBlockParam,
 274    toolName: string,
 275    persistenceThreshold?: number,
 276  ): Promise<ToolResultBlockParam> {
 277    // Check size first before doing any async work - most tool results are small
 278    const content = toolResultBlock.content
 279  
 280    // inc-4586: Empty tool_result content at the prompt tail causes some models
 281    // (notably capybara) to emit the \n\nHuman: stop sequence and end their turn
 282    // with zero output. The server renderer inserts no \n\nAssistant: marker after
 283    // tool results, so a bare </function_results>\n\n pattern-matches to a turn
 284    // boundary. Several tools can legitimately produce empty output (silent-success
 285    // shell commands, MCP servers returning content:[], REPL statements, etc.).
 286    // Inject a short marker so the model always has something to react to.
 287    if (isToolResultContentEmpty(content)) {
 288      logEvent('tengu_tool_empty_result', {
 289        toolName: sanitizeToolNameForAnalytics(toolName),
 290      })
 291      return {
 292        ...toolResultBlock,
 293        content: `(${toolName} completed with no output)`,
 294      }
 295    }
 296    // Narrow after the emptiness guard — content is non-nullish past this point.
 297    if (!content) {
 298      return toolResultBlock
 299    }
 300  
 301    // Skip persistence for image content blocks - they need to be sent as-is to Claude
 302    if (hasImageBlock(content)) {
 303      return toolResultBlock
 304    }
 305  
 306    const size = contentSize(content)
 307  
 308    // Use tool-specific threshold if provided, otherwise fall back to global limit
 309    const threshold = persistenceThreshold ?? MAX_TOOL_RESULT_BYTES
 310    if (size <= threshold) {
 311      return toolResultBlock
 312    }
 313  
 314    // Persist the entire content as a unit
 315    const result = await persistToolResult(content, toolResultBlock.tool_use_id)
 316    if (isPersistError(result)) {
 317      // If persistence failed, return the original block unchanged
 318      return toolResultBlock
 319    }
 320  
 321    const message = buildLargeToolResultMessage(result)
 322  
 323    // Log analytics
 324    logEvent('tengu_tool_result_persisted', {
 325      toolName: sanitizeToolNameForAnalytics(toolName),
 326      originalSizeBytes: result.originalSize,
 327      persistedSizeBytes: message.length,
 328      estimatedOriginalTokens: Math.ceil(result.originalSize / BYTES_PER_TOKEN),
 329      estimatedPersistedTokens: Math.ceil(message.length / BYTES_PER_TOKEN),
 330      thresholdUsed: threshold,
 331    })
 332  
 333    return { ...toolResultBlock, content: message }
 334  }
 335  
 336  /**
 337   * Generate a preview of content, truncating at a newline boundary when possible.
 338   */
 339  export function generatePreview(
 340    content: string,
 341    maxBytes: number,
 342  ): { preview: string; hasMore: boolean } {
 343    if (content.length <= maxBytes) {
 344      return { preview: content, hasMore: false }
 345    }
 346  
 347    // Find the last newline within the limit to avoid cutting mid-line
 348    const truncated = content.slice(0, maxBytes)
 349    const lastNewline = truncated.lastIndexOf('\n')
 350  
 351    // If we found a newline reasonably close to the limit, use it
 352    // Otherwise fall back to the exact limit
 353    const cutPoint = lastNewline > maxBytes * 0.5 ? lastNewline : maxBytes
 354  
 355    return { preview: content.slice(0, cutPoint), hasMore: true }
 356  }
 357  
 358  /**
 359   * Type guard to check if persist result is an error
 360   */
 361  export function isPersistError(
 362    result: PersistedToolResult | PersistToolResultError,
 363  ): result is PersistToolResultError {
 364    return 'error' in result
 365  }
 366  
 367  // --- Message-level aggregate tool result budget ---
 368  //
 369  // Tracks replacement state across turns so enforceToolResultBudget makes the
 370  // same choices every time (preserves prompt cache prefix).
 371  
/**
 * Per-conversation-thread state for the aggregate tool result budget.
 * State must be stable to preserve prompt cache:
 *   - seenIds: results that have passed through the budget check (replaced
 *     or not). Once seen, a result's fate is frozen for the conversation.
 *   - replacements: subset of seenIds that were persisted to disk and
 *     replaced with previews, mapped to the exact preview string shown to
 *     the model. Re-application is a Map lookup — no file I/O, guaranteed
 *     byte-identical, cannot fail.
 *
 * Lifecycle: one instance per conversation thread, carried on ToolUseContext.
 * Main thread: REPL provisions once, never resets — stale entries after
 * /clear, rewind, resume, or compact are never looked up (tool_use_ids are
 * UUIDs) so they're harmless. Subagents: createSubagentContext clones the
 * parent's state by default (cache-sharing forks like agentSummary need
 * identical decisions), or resumeAgentBackground threads one reconstructed
 * from sidechain records.
 *
 * Both collections are keyed by tool_use_id. Instances are created by
 * createContentReplacementState() and copied by
 * cloneContentReplacementState().
 */
export type ContentReplacementState = {
  // tool_use_ids whose replace/keep decision has already been made
  seenIds: Set<string>
  // tool_use_id -> exact replacement text previously shown to the model
  replacements: Map<string, string>
}
 394  
 395  export function createContentReplacementState(): ContentReplacementState {
 396    return { seenIds: new Set(), replacements: new Map() }
 397  }
 398  
 399  /**
 400   * Clone replacement state for a cache-sharing fork (e.g. agentSummary).
 401   * The fork needs state identical to the source at fork time so
 402   * enforceToolResultBudget makes the same choices → same wire prefix →
 403   * prompt cache hit. Mutating the clone does not affect the source.
 404   */
 405  export function cloneContentReplacementState(
 406    source: ContentReplacementState,
 407  ): ContentReplacementState {
 408    return {
 409      seenIds: new Set(source.seenIds),
 410      replacements: new Map(source.replacements),
 411    }
 412  }
 413  
 414  /**
 415   * Resolve the per-message aggregate budget limit. GrowthBook override
 416   * (tengu_hawthorn_window) wins when present and a finite positive number;
 417   * otherwise falls back to the hardcoded constant. Defensive typeof/finite
 418   * check: GrowthBook's cache returns `cached !== undefined ? cached : default`,
 419   * so a flag served as null/string/NaN leaks through.
 420   */
 421  export function getPerMessageBudgetLimit(): number {
 422    const override = getFeatureValue_CACHED_MAY_BE_STALE<number | null>(
 423      'tengu_hawthorn_window',
 424      null,
 425    )
 426    if (
 427      typeof override === 'number' &&
 428      Number.isFinite(override) &&
 429      override > 0
 430    ) {
 431      return override
 432    }
 433    return MAX_TOOL_RESULTS_PER_MESSAGE_CHARS
 434  }
 435  
 436  /**
 437   * Provision replacement state for a new conversation thread.
 438   *
 439   * Encapsulates the feature-flag gate + reconstruct-vs-fresh choice:
 440   *   - Flag off → undefined (query.ts skips enforcement entirely)
 441   *   - No initialMessages (cold start) → fresh
 442   *   - initialMessages present → reconstruct (freeze all candidate IDs so the
 443   *     budget never replaces content the model already saw unreplaced). Empty
 444   *     or absent records freeze everything; non-empty records additionally
 445   *     populate the replacements Map for byte-identical re-apply.
 446   */
 447  export function provisionContentReplacementState(
 448    initialMessages?: Message[],
 449    initialContentReplacements?: ContentReplacementRecord[],
 450  ): ContentReplacementState | undefined {
 451    const enabled = getFeatureValue_CACHED_MAY_BE_STALE(
 452      'tengu_hawthorn_steeple',
 453      false,
 454    )
 455    if (!enabled) return undefined
 456    if (initialMessages) {
 457      return reconstructContentReplacementState(
 458        initialMessages,
 459        initialContentReplacements ?? [],
 460      )
 461    }
 462    return createContentReplacementState()
 463  }
 464  
/**
 * Serializable record of one content-replacement decision. Written to the
 * transcript as a ContentReplacementEntry so decisions survive resume.
 * Discriminated by `kind` so future replacement mechanisms (user text,
 * offloaded images) can share the same transcript entry type.
 *
 * `replacement` is the exact string the model saw — stored rather than
 * derived on resume so code changes to the preview template, size formatting,
 * or path layout can't silently break prompt cache.
 */
export type ContentReplacementRecord = {
  // Discriminator; 'tool-result' is the only variant declared here
  kind: 'tool-result'
  // tool_use_id of the tool_result whose content was replaced
  toolUseId: string
  // Exact replacement text that was shown to the model
  replacement: string
}
 480  
// The 'tool-result' variant of ContentReplacementRecord. With only one
// variant declared today this is the same shape; extracting by `kind` keeps
// it pinned to tool results if further variants are added to the union.
export type ToolResultReplacementRecord = Extract<
  ContentReplacementRecord,
  { kind: 'tool-result' }
>
 485  
// One tool_result block eligible for budget enforcement, as produced by
// collectCandidatesFromMessage().
type ToolResultCandidate = {
  // The block's tool_use_id
  toolUseId: string
  // The block's content (string or array of content blocks)
  content: NonNullable<ToolResultBlockParam['content']>
  // contentSize(content): string length, or summed text-block lengths
  size: number
}
 491  
// Candidates split by their prior budget decision, as produced by
// partitionByPriorDecision().
type CandidatePartition = {
  // Has a cached replacement in state.replacements; carries that text along
  mustReapply: Array<ToolResultCandidate & { replacement: string }>
  // In state.seenIds without a replacement: seen before and left as-is
  frozen: ToolResultCandidate[]
  // Not in state at all: never seen before
  fresh: ToolResultCandidate[]
}
 497  
 498  function isContentAlreadyCompacted(
 499    content: ToolResultBlockParam['content'],
 500  ): boolean {
 501    // All budget-produced content starts with the tag (buildLargeToolResultMessage).
 502    // `.startsWith()` avoids false-positives when the tag appears anywhere else
 503    // in the content (e.g., reading this source file).
 504    return typeof content === 'string' && content.startsWith(PERSISTED_OUTPUT_TAG)
 505  }
 506  
 507  function hasImageBlock(
 508    content: NonNullable<ToolResultBlockParam['content']>,
 509  ): boolean {
 510    return (
 511      Array.isArray(content) &&
 512      content.some(
 513        b => typeof b === 'object' && 'type' in b && b.type === 'image',
 514      )
 515    )
 516  }
 517  
 518  function contentSize(
 519    content: NonNullable<ToolResultBlockParam['content']>,
 520  ): number {
 521    if (typeof content === 'string') return content.length
 522    // Sum text-block lengths directly. Slightly under-counts vs serialized
 523    // (no JSON framing), but the budget is a rough token heuristic anyway.
 524    // Avoids allocating a content-sized string every enforcement pass.
 525    return content.reduce(
 526      (sum, b) => sum + (b.type === 'text' ? b.text.length : 0),
 527      0,
 528    )
 529  }
 530  
 531  /**
 532   * Walk messages and build tool_use_id → tool_name from assistant tool_use
 533   * blocks. tool_use always precedes its tool_result (model calls, then result
 534   * arrives), so by the time budget enforcement sees a result, its name is known.
 535   */
 536  function buildToolNameMap(messages: Message[]): Map<string, string> {
 537    const map = new Map<string, string>()
 538    for (const message of messages) {
 539      if (message.type !== 'assistant') continue
 540      const content = message.message.content
 541      if (!Array.isArray(content)) continue
 542      for (const block of content) {
 543        if (block.type === 'tool_use') {
 544          map.set(block.id, block.name)
 545        }
 546      }
 547    }
 548    return map
 549  }
 550  
 551  /**
 552   * Extract candidate tool_result blocks from a single user message: blocks
 553   * that are non-empty, non-image, and not already compacted by tag (i.e. by
 554   * the per-tool limit, or an earlier iteration of this same query call).
 555   * Returns [] for messages with no eligible blocks.
 556   */
 557  function collectCandidatesFromMessage(message: Message): ToolResultCandidate[] {
 558    if (message.type !== 'user' || !Array.isArray(message.message.content)) {
 559      return []
 560    }
 561    return message.message.content.flatMap(block => {
 562      if (block.type !== 'tool_result' || !block.content) return []
 563      if (isContentAlreadyCompacted(block.content)) return []
 564      if (hasImageBlock(block.content)) return []
 565      return [
 566        {
 567          toolUseId: block.tool_use_id,
 568          content: block.content,
 569          size: contentSize(block.content),
 570        },
 571      ]
 572    })
 573  }
 574  
 575  /**
 576   * Extract candidate tool_result blocks grouped by API-level user message.
 577   *
 578   * normalizeMessagesForAPI merges consecutive user messages into one
 579   * (Bedrock compat; 1P does the same server-side), so parallel tool
 580   * results that arrive as N separate user messages in our state become
 581   * ONE user message on the wire. The budget must group the same way or
 582   * it would see N under-budget messages instead of one over-budget
 583   * message and fail to enforce exactly when it matters most.
 584   *
 585   * A "group" is a maximal run of user messages NOT separated by an
 586   * assistant message. Only assistant messages create wire-level
 587   * boundaries — normalizeMessagesForAPI filters out progress entirely
 588   * and merges attachment / system(local_command) INTO adjacent user
 589   * blocks, so those types do NOT break groups here either.
 590   *
 591   * This matters for abort-during-parallel-tools paths: agent_progress
 592   * messages (non-ephemeral, persisted in REPL state) can interleave
 593   * between fresh tool_result messages. If we flushed on progress, those
 594   * tool_results would split into under-budget groups, slip through
 595   * unreplaced, get frozen, then be merged by normalizeMessagesForAPI
 596   * into one over-budget wire message — defeating the feature.
 597   *
 598   * Only groups with at least one eligible candidate are returned.
 599   */
 600  function collectCandidatesByMessage(
 601    messages: Message[],
 602  ): ToolResultCandidate[][] {
 603    const groups: ToolResultCandidate[][] = []
 604    let current: ToolResultCandidate[] = []
 605  
 606    const flush = () => {
 607      if (current.length > 0) groups.push(current)
 608      current = []
 609    }
 610  
 611    // Track all assistant message.ids seen so far — same-ID fragments are
 612    // merged by normalizeMessagesForAPI (messages.ts ~2126 walks back PAST
 613    // different-ID assistants via `continue`), so any re-appearance of a
 614    // previously-seen ID must NOT create a group boundary. Two scenarios:
 615    //   • Consecutive: streamingToolExecution yields one AssistantMessage per
 616    //     content_block_stop (same id); a fast tool drains between blocks;
 617    //     abort/hook-stop leaves [asst(X), user(trA), asst(X), user(trB)].
 618    //   • Interleaved: coordinator/teammate streams mix different responses
 619    //     so [asst(X), user(trA), asst(Y), user(trB), asst(X), user(trC)].
 620    // In both, normalizeMessagesForAPI merges the X fragments into one wire
 621    // assistant, and their following tool_results merge into one wire user
 622    // message — so the budget must see them as one group too.
 623    const seenAsstIds = new Set<string>()
 624    for (const message of messages) {
 625      if (message.type === 'user') {
 626        current.push(...collectCandidatesFromMessage(message))
 627      } else if (message.type === 'assistant') {
 628        if (!seenAsstIds.has(message.message.id)) {
 629          flush()
 630          seenAsstIds.add(message.message.id)
 631        }
 632      }
 633      // progress / attachment / system are filtered or merged by
 634      // normalizeMessagesForAPI — they don't create wire boundaries.
 635    }
 636    flush()
 637  
 638    return groups
 639  }
 640  
 641  /**
 642   * Partition candidates by their prior decision state:
 643   *  - mustReapply: previously replaced → re-apply the cached replacement for
 644   *    prefix stability
 645   *  - frozen: previously seen and left unreplaced → off-limits (replacing
 646   *    now would change a prefix that was already cached)
 647   *  - fresh: never seen → eligible for new replacement decisions
 648   */
 649  function partitionByPriorDecision(
 650    candidates: ToolResultCandidate[],
 651    state: ContentReplacementState,
 652  ): CandidatePartition {
 653    return candidates.reduce<CandidatePartition>(
 654      (acc, c) => {
 655        const replacement = state.replacements.get(c.toolUseId)
 656        if (replacement !== undefined) {
 657          acc.mustReapply.push({ ...c, replacement })
 658        } else if (state.seenIds.has(c.toolUseId)) {
 659          acc.frozen.push(c)
 660        } else {
 661          acc.fresh.push(c)
 662        }
 663        return acc
 664      },
 665      { mustReapply: [], frozen: [], fresh: [] },
 666    )
 667  }
 668  
 669  /**
 670   * Pick the largest fresh results to replace until the model-visible total
 671   * (frozen + remaining fresh) is at or under budget, or fresh is exhausted.
 672   * If frozen results alone exceed budget we accept the overage — microcompact
 673   * will eventually clear them.
 674   */
 675  function selectFreshToReplace(
 676    fresh: ToolResultCandidate[],
 677    frozenSize: number,
 678    limit: number,
 679  ): ToolResultCandidate[] {
 680    const sorted = [...fresh].sort((a, b) => b.size - a.size)
 681    const selected: ToolResultCandidate[] = []
 682    let remaining = frozenSize + fresh.reduce((sum, c) => sum + c.size, 0)
 683    for (const c of sorted) {
 684      if (remaining <= limit) break
 685      selected.push(c)
 686      // We don't know the replacement size until after persist, but previews
 687      // are ~2K and results hitting this path are much larger, so subtracting
 688      // the full size is a close approximation for selection purposes.
 689      remaining -= c.size
 690    }
 691    return selected
 692  }
 693  
 694  /**
 695   * Return a new Message[] where each tool_result block whose id appears in
 696   * replacementMap has its content replaced. Messages and blocks with no
 697   * replacements are passed through by reference.
 698   */
 699  function replaceToolResultContents(
 700    messages: Message[],
 701    replacementMap: Map<string, string>,
 702  ): Message[] {
 703    return messages.map(message => {
 704      if (message.type !== 'user' || !Array.isArray(message.message.content)) {
 705        return message
 706      }
 707      const content = message.message.content
 708      const needsReplace = content.some(
 709        b => b.type === 'tool_result' && replacementMap.has(b.tool_use_id),
 710      )
 711      if (!needsReplace) return message
 712      return {
 713        ...message,
 714        message: {
 715          ...message.message,
 716          content: content.map(block => {
 717            if (block.type !== 'tool_result') return block
 718            const replacement = replacementMap.get(block.tool_use_id)
 719            return replacement === undefined
 720              ? block
 721              : { ...block, content: replacement }
 722          }),
 723        },
 724      }
 725    })
 726  }
 727  
 728  async function buildReplacement(
 729    candidate: ToolResultCandidate,
 730  ): Promise<{ content: string; originalSize: number } | null> {
 731    const result = await persistToolResult(candidate.content, candidate.toolUseId)
 732    if (isPersistError(result)) return null
 733    return {
 734      content: buildLargeToolResultMessage(result),
 735      originalSize: result.originalSize,
 736    }
 737  }
 738  
 739  /**
 740   * Enforce the per-message budget on aggregate tool result size.
 741   *
 742   * For each user message whose tool_result blocks together exceed the
 743   * per-message limit (see getPerMessageBudgetLimit), the largest FRESH
 744   * (never-before-seen) results in THAT message are persisted to disk and
 745   * replaced with previews.
 746   * Messages are evaluated independently — a 150K result in one message and
 747   * a 150K result in another are both under budget and untouched.
 748   *
 749   * State is tracked by tool_use_id in `state`. Once a result is seen its
 750   * fate is frozen: previously-replaced results get the same replacement
 751   * re-applied every turn from the cached preview string (zero I/O,
 752   * byte-identical), and previously-unreplaced results are never replaced
 753   * later (would break prompt cache).
 754   *
 755   * Each turn adds at most one new user message with tool_result blocks,
 756   * so the per-message loop typically does the budget check at most once;
 757   * all prior messages just re-apply cached replacements.
 758   *
 759   * @param state — MUTATED: seenIds and replacements are updated in place
 760   *   to record choices made this call. The caller holds a stable reference
 761   *   across turns; returning a new object would require error-prone ref
 762   *   updates after every query.
 763   *
 764   * Returns `{ messages, newlyReplaced }`:
 765   *   - messages: same array instance when no replacement is needed
 766   *   - newlyReplaced: replacements made THIS call (not re-applies).
 767   *     Caller persists these to the transcript for resume reconstruction.
 768   */
 769  export async function enforceToolResultBudget(
 770    messages: Message[],
 771    state: ContentReplacementState,
 772    skipToolNames: ReadonlySet<string> = new Set(),
 773  ): Promise<{
 774    messages: Message[]
 775    newlyReplaced: ToolResultReplacementRecord[]
 776  }> {
 777    const candidatesByMessage = collectCandidatesByMessage(messages)
 778    const nameByToolUseId =
 779      skipToolNames.size > 0 ? buildToolNameMap(messages) : undefined
 780    const shouldSkip = (id: string): boolean =>
 781      nameByToolUseId !== undefined &&
 782      skipToolNames.has(nameByToolUseId.get(id) ?? '')
 783    // Resolve once per call. A mid-session flag change only affects FRESH
 784    // messages (prior decisions are frozen via seenIds/replacements), so
 785    // prompt cache for already-seen content is preserved regardless.
 786    const limit = getPerMessageBudgetLimit()
 787  
 788    // Walk each API-level message group independently. For previously-processed messages
 789    // (all IDs in seenIds) this just re-applies cached replacements. For the
 790    // single new message this turn added, it runs the budget check.
 791    const replacementMap = new Map<string, string>()
 792    const toPersist: ToolResultCandidate[] = []
 793    let reappliedCount = 0
 794    let messagesOverBudget = 0
 795  
 796    for (const candidates of candidatesByMessage) {
 797      const { mustReapply, frozen, fresh } = partitionByPriorDecision(
 798        candidates,
 799        state,
 800      )
 801  
 802      // Re-apply: pure Map lookups. No file I/O, byte-identical, cannot fail.
 803      mustReapply.forEach(c => replacementMap.set(c.toolUseId, c.replacement))
 804      reappliedCount += mustReapply.length
 805  
 806      // Fresh means this is a new message. Check its per-message budget.
 807      // (A previously-processed message has fresh.length === 0 because all
 808      // its IDs were added to seenIds when first seen.)
 809      if (fresh.length === 0) {
 810        // mustReapply/frozen are already in seenIds from their first pass —
 811        // re-adding is a no-op but keeps the invariant explicit.
 812        candidates.forEach(c => state.seenIds.add(c.toolUseId))
 813        continue
 814      }
 815  
 816      // Tools with maxResultSizeChars: Infinity (Read) — never persist.
 817      // Mark as seen (frozen) so the decision sticks across turns. They don't
 818      // count toward freshSize; if that lets the group slip under budget and
 819      // the wire message is still large, that's the contract — Read's own
 820      // maxTokens is the bound, not this wrapper.
 821      const skipped = fresh.filter(c => shouldSkip(c.toolUseId))
 822      skipped.forEach(c => state.seenIds.add(c.toolUseId))
 823      const eligible = fresh.filter(c => !shouldSkip(c.toolUseId))
 824  
 825      const frozenSize = frozen.reduce((sum, c) => sum + c.size, 0)
 826      const freshSize = eligible.reduce((sum, c) => sum + c.size, 0)
 827  
 828      const selected =
 829        frozenSize + freshSize > limit
 830          ? selectFreshToReplace(eligible, frozenSize, limit)
 831          : []
 832  
 833      // Mark non-persisting candidates as seen NOW (synchronously). IDs
 834      // selected for persist are marked seen AFTER the await, alongside
 835      // replacements.set — keeps the pair atomic under observation so no
 836      // concurrent reader (once subagents share state) ever sees X∈seenIds
 837      // but X∉replacements, which would misclassify X as frozen and send
 838      // full content while the main thread sends the preview → cache miss.
 839      const selectedIds = new Set(selected.map(c => c.toolUseId))
 840      candidates
 841        .filter(c => !selectedIds.has(c.toolUseId))
 842        .forEach(c => state.seenIds.add(c.toolUseId))
 843  
 844      if (selected.length === 0) continue
 845      messagesOverBudget++
 846      toPersist.push(...selected)
 847    }
 848  
 849    if (replacementMap.size === 0 && toPersist.length === 0) {
 850      return { messages, newlyReplaced: [] }
 851    }
 852  
 853    // Fresh: concurrent persist for all selected candidates across all
 854    // messages. In practice toPersist comes from a single message per turn.
 855    const freshReplacements = await Promise.all(
 856      toPersist.map(async c => [c, await buildReplacement(c)] as const),
 857    )
 858    const newlyReplaced: ToolResultReplacementRecord[] = []
 859    let replacedSize = 0
 860    for (const [candidate, replacement] of freshReplacements) {
 861      // Mark seen HERE, post-await, atomically with replacements.set for
 862      // success cases. For persist failures (replacement === null) the ID
 863      // is seen-but-unreplaced — the original content was sent to the
 864      // model, so treating it as frozen going forward is correct.
 865      state.seenIds.add(candidate.toolUseId)
 866      if (replacement === null) continue
 867      replacedSize += candidate.size
 868      replacementMap.set(candidate.toolUseId, replacement.content)
 869      state.replacements.set(candidate.toolUseId, replacement.content)
 870      newlyReplaced.push({
 871        kind: 'tool-result',
 872        toolUseId: candidate.toolUseId,
 873        replacement: replacement.content,
 874      })
 875      logEvent('tengu_tool_result_persisted_message_budget', {
 876        originalSizeBytes: replacement.originalSize,
 877        persistedSizeBytes: replacement.content.length,
 878        estimatedOriginalTokens: Math.ceil(
 879          replacement.originalSize / BYTES_PER_TOKEN,
 880        ),
 881        estimatedPersistedTokens: Math.ceil(
 882          replacement.content.length / BYTES_PER_TOKEN,
 883        ),
 884      })
 885    }
 886  
 887    if (replacementMap.size === 0) {
 888      return { messages, newlyReplaced: [] }
 889    }
 890  
 891    if (newlyReplaced.length > 0) {
 892      logForDebugging(
 893        `Per-message budget: persisted ${newlyReplaced.length} tool results ` +
 894          `across ${messagesOverBudget} over-budget message(s), ` +
 895          `shed ~${formatFileSize(replacedSize)}, ${reappliedCount} re-applied`,
 896      )
 897      logEvent('tengu_message_level_tool_result_budget_enforced', {
 898        resultsPersisted: newlyReplaced.length,
 899        messagesOverBudget,
 900        replacedSizeBytes: replacedSize,
 901        reapplied: reappliedCount,
 902      })
 903    }
 904  
 905    return {
 906      messages: replaceToolResultContents(messages, replacementMap),
 907      newlyReplaced,
 908    }
 909  }
 910  
 911  /**
 912   * Query-loop integration point for the aggregate budget.
 913   *
 914   * Gates on `state` (undefined means feature disabled → no-op return),
 915   * applies enforcement, and fires an optional transcript-write callback
 916   * for new replacements. The caller (query.ts) owns the persistence gate
 917   * — it passes a callback only for querySources that read records back on
 918   * resume (repl_main_thread*, agent:*); ephemeral runForkedAgent callers
 919   * (agentSummary, sessionMemory, /btw, compact) pass undefined.
 920   *
 921   * @returns messages with replacements applied, or the input array unchanged
 922   *   when the feature is off or no replacement occurred.
 923   */
 924  export async function applyToolResultBudget(
 925    messages: Message[],
 926    state: ContentReplacementState | undefined,
 927    writeToTranscript?: (records: ToolResultReplacementRecord[]) => void,
 928    skipToolNames?: ReadonlySet<string>,
 929  ): Promise<Message[]> {
 930    if (!state) return messages
 931    const result = await enforceToolResultBudget(messages, state, skipToolNames)
 932    if (result.newlyReplaced.length > 0) {
 933      writeToTranscript?.(result.newlyReplaced)
 934    }
 935    return result.messages
 936  }
 937  
 938  /**
 939   * Reconstruct replacement state from content-replacement records loaded from
 940   * the transcript. Used on resume so the budget makes the same choices it
 941   * made in the original session (prompt cache stability).
 942   *
 943   * Accepts the full ContentReplacementRecord[] from LogOption (may include
 944   * future non-tool-result kinds); only tool-result records are applied here.
 945   *
 946   *   - replacements: populated directly from the stored replacement strings.
 947   *     Records for IDs not in messages (e.g. after compact) are skipped —
 948   *     they're inert anyway.
 949   *   - seenIds: every candidate tool_use_id in the loaded messages. A result
 950   *     being in the transcript means it was sent to the model, so it was seen.
 951   *     This freezes unreplaced results against future replacement.
 952   *   - inheritedReplacements: gap-fill for fork-subagent resume. A fork's
 953   *     original run applies parent-inherited replacements via mustReapply
 954   *     (never persisted — not newlyReplaced). On resume the sidechain has
 955   *     the original content but no record, so records alone would classify
 956   *     it as frozen. The parent's live state still has the mapping; copy
 957   *     it for IDs in messages that records don't cover. No-op for non-fork
 958   *     resumes (parent IDs aren't in the subagent's messages).
 959   */
 960  export function reconstructContentReplacementState(
 961    messages: Message[],
 962    records: ContentReplacementRecord[],
 963    inheritedReplacements?: ReadonlyMap<string, string>,
 964  ): ContentReplacementState {
 965    const state = createContentReplacementState()
 966    const candidateIds = new Set(
 967      collectCandidatesByMessage(messages)
 968        .flat()
 969        .map(c => c.toolUseId),
 970    )
 971  
 972    for (const id of candidateIds) {
 973      state.seenIds.add(id)
 974    }
 975    for (const r of records) {
 976      if (r.kind === 'tool-result' && candidateIds.has(r.toolUseId)) {
 977        state.replacements.set(r.toolUseId, r.replacement)
 978      }
 979    }
 980    if (inheritedReplacements) {
 981      for (const [id, replacement] of inheritedReplacements) {
 982        if (candidateIds.has(id) && !state.replacements.has(id)) {
 983          state.replacements.set(id, replacement)
 984        }
 985      }
 986    }
 987    return state
 988  }
 989  
 990  /**
 991   * AgentTool-resume variant: encapsulates the feature-flag gate + parent
 992   * gap-fill so both AgentTool.call and resumeAgentBackground share one
 993   * implementation. Returns undefined when parentState is undefined (feature
 994   * off); otherwise reconstructs from sidechain records with parent's live
 995   * replacements filling gaps for fork-inherited mustReapply entries.
 996   *
 997   * Kept out of AgentTool.tsx — that file is at the feature() DCE complexity
 998   * cliff and cannot tolerate even +1 net source line without silently
 999   * breaking feature('TRANSCRIPT_CLASSIFIER') eval in tests.
1000   */
1001  export function reconstructForSubagentResume(
1002    parentState: ContentReplacementState | undefined,
1003    resumedMessages: Message[],
1004    sidechainRecords: ContentReplacementRecord[],
1005  ): ContentReplacementState | undefined {
1006    if (!parentState) return undefined
1007    return reconstructContentReplacementState(
1008      resumedMessages,
1009      sidechainRecords,
1010      parentState.replacements,
1011    )
1012  }
1013  
/**
 * Turn a filesystem error into a short, human-readable message.
 *
 * A fixed set of errno codes maps to dedicated messages; the ones about a
 * specific location include the error's `path`, or 'unknown path' when it
 * is absent. Any other code yields "<code>: <message>". The code is not
 * prepended a second time when the message already starts with "<code>:",
 * which is how Node formats the messages of its own system errors.
 *
 * @param error — the error to describe; it may or may not carry an errno
 *   `code`.
 * @returns the description, or `error.message` when the error has no code.
 */
function getFileSystemErrorMessage(error: Error): string {
  // Node.js filesystem errors have a 'code' property
  // eslint-disable-next-line no-restricted-syntax -- uses .path, not just .code
  const nodeError = error as NodeJS.ErrnoException
  const code = nodeError.code
  if (!code) {
    return error.message
  }

  const location = nodeError.path ?? 'unknown path'
  switch (code) {
    case 'ENOENT':
      return `Directory not found: ${location}`
    case 'EACCES':
      return `Permission denied: ${location}`
    case 'ENOSPC':
      return 'No space left on device'
    case 'EROFS':
      return 'Read-only file system'
    case 'EMFILE':
      return 'Too many open files'
    case 'EEXIST':
      return `File already exists: ${location}`
    default: {
      // Messages such as "EPERM: operation not permitted, open '/x'" already
      // carry the code; prefixing again would give "EPERM: EPERM: ...".
      const message = nodeError.message
      return message.startsWith(`${code}:`) ? message : `${code}: ${message}`
    }
  }
}