/ utils / mcpOutputStorage.ts
mcpOutputStorage.ts
  1  import { writeFile } from 'fs/promises'
  2  import { join } from 'path'
  3  import {
  4    type AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  5    logEvent,
  6  } from '../services/analytics/index.js'
  7  import type { MCPResultType } from '../services/mcp/client.js'
  8  import { toError } from './errors.js'
  9  import { formatFileSize } from './format.js'
 10  import { logError } from './log.js'
 11  import { ensureToolResultsDir, getToolResultsDir } from './toolResultStorage.js'
 12  
 13  /**
 14   * Generates a format description string based on the MCP result type and schema.
 15   */
 16  export function getFormatDescription(
 17    type: MCPResultType,
 18    schema?: unknown,
 19  ): string {
 20    switch (type) {
 21      case 'toolResult':
 22        return 'Plain text'
 23      case 'structuredContent':
 24        return schema ? `JSON with schema: ${schema}` : 'JSON'
 25      case 'contentArray':
 26        return schema ? `JSON array with schema: ${schema}` : 'JSON array'
 27    }
 28  }
 29  
 30  /**
 31   * Generates instruction text for Claude to read from a saved output file.
 32   *
 33   * @param rawOutputPath - Path to the saved output file
 34   * @param contentLength - Length of the content in characters
 35   * @param formatDescription - Description of the content format
 36   * @param maxReadLength - Optional max chars for Read tool (for Bash output context)
 37   * @returns Instruction text to include in the tool result
 38   */
 39  export function getLargeOutputInstructions(
 40    rawOutputPath: string,
 41    contentLength: number,
 42    formatDescription: string,
 43    maxReadLength?: number,
 44  ): string {
 45    const baseInstructions =
 46      `Error: result (${contentLength.toLocaleString()} characters) exceeds maximum allowed tokens. Output has been saved to ${rawOutputPath}.\n` +
 47      `Format: ${formatDescription}\n` +
 48      `Use offset and limit parameters to read specific portions of the file, search within it for specific content, and jq to make structured queries.\n` +
 49      `REQUIREMENTS FOR SUMMARIZATION/ANALYSIS/REVIEW:\n` +
 50      `- You MUST read the content from the file at ${rawOutputPath} in sequential chunks until 100% of the content has been read.\n`
 51  
 52    const truncationWarning = maxReadLength
 53      ? `- If you receive truncation warnings when reading the file ("[N lines truncated]"), reduce the chunk size until you have read 100% of the content without truncation ***DO NOT PROCEED UNTIL YOU HAVE DONE THIS***. Bash output is limited to ${maxReadLength.toLocaleString()} chars.\n`
 54      : `- If you receive truncation warnings when reading the file, reduce the chunk size until you have read 100% of the content without truncation.\n`
 55  
 56    const completionRequirement = `- Before producing ANY summary or analysis, you MUST explicitly describe what portion of the content you have read. ***If you did not read the entire content, you MUST explicitly state this.***\n`
 57  
 58    return baseInstructions + truncationWarning + completionRequirement
 59  }
 60  
 61  /**
 62   * Map a mime type to a file extension. Conservative: known types get their
 63   * proper extension; unknown types get 'bin'. The extension matters because
 64   * the Read tool dispatches on it (PDFs, images, etc. need the right ext).
 65   */
 66  export function extensionForMimeType(mimeType: string | undefined): string {
 67    if (!mimeType) return 'bin'
 68    // Strip any charset/boundary parameter
 69    const mt = (mimeType.split(';')[0] ?? '').trim().toLowerCase()
 70    switch (mt) {
 71      case 'application/pdf':
 72        return 'pdf'
 73      case 'application/json':
 74        return 'json'
 75      case 'text/csv':
 76        return 'csv'
 77      case 'text/plain':
 78        return 'txt'
 79      case 'text/html':
 80        return 'html'
 81      case 'text/markdown':
 82        return 'md'
 83      case 'application/zip':
 84        return 'zip'
 85      case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
 86        return 'docx'
 87      case 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet':
 88        return 'xlsx'
 89      case 'application/vnd.openxmlformats-officedocument.presentationml.presentation':
 90        return 'pptx'
 91      case 'application/msword':
 92        return 'doc'
 93      case 'application/vnd.ms-excel':
 94        return 'xls'
 95      case 'audio/mpeg':
 96        return 'mp3'
 97      case 'audio/wav':
 98        return 'wav'
 99      case 'audio/ogg':
100        return 'ogg'
101      case 'video/mp4':
102        return 'mp4'
103      case 'video/webm':
104        return 'webm'
105      case 'image/png':
106        return 'png'
107      case 'image/jpeg':
108        return 'jpg'
109      case 'image/gif':
110        return 'gif'
111      case 'image/webp':
112        return 'webp'
113      case 'image/svg+xml':
114        return 'svg'
115      default:
116        return 'bin'
117    }
118  }
119  
/**
 * Decides from a content-type header whether the payload should be treated as
 * binary (saved to disk) instead of being placed into the model context.
 * An empty content type is treated as non-binary, as are text-ish types
 * (text/*, JSON, XML, JavaScript, URL-encoded form data).
 *
 * @param contentType - Content-type header value; parameters after ';' are ignored
 * @returns true when the type is not recognized as textual
 */
export function isBinaryContentType(contentType: string): boolean {
  if (!contentType) return false

  const [head = ''] = contentType.split(';')
  const essence = head.trim().toLowerCase()

  // JSON and XML are matched by exact type or structured-syntax suffix, never
  // by substring, so 'openxmlformats' types (docx/xlsx) remain binary.
  const looksTextual =
    essence.startsWith('text/') ||
    essence === 'application/json' ||
    essence.endsWith('+json') ||
    essence === 'application/xml' ||
    essence.endsWith('+xml') ||
    essence.startsWith('application/javascript') ||
    essence === 'application/x-www-form-urlencoded'

  return !looksTextual
}
137  
/**
 * Outcome of persisting binary content: either a description of the file that
 * was written, or the message of the error that prevented the write.
 */
export type PersistBinaryResult =
  | {
      /** Path of the file the bytes were written to. */
      filepath: string
      /** Number of bytes written. */
      size: number
      /** Extension used for the file name, without a leading dot. */
      ext: string
    }
  | {
      /** Message of the error that caused persistence to fail. */
      error: string
    }
141  
/**
 * Write raw binary bytes to the tool-results directory with a mime-derived
 * extension. Unlike persistToolResult (which stringifies), this writes the
 * bytes as-is so the resulting file can be opened with native tools (Read
 * for PDFs, pandas for xlsx, etc.).
 *
 * Failures are reported through the returned value rather than by rejecting:
 * any error raised while preparing the directory, resolving the path, or
 * writing the file is logged and returned as `{ error }`.
 *
 * @param bytes - Raw content to write
 * @param mimeType - Mime type used to choose the file extension
 * @param persistId - Base name of the file (the extension is appended)
 * @returns The written file's path, size, and extension, or an error message
 */
export async function persistBinaryContent(
  bytes: Buffer,
  mimeType: string | undefined,
  persistId: string,
): Promise<PersistBinaryResult> {
  const ext = extensionForMimeType(mimeType)
  let filepath: string

  try {
    // Directory setup is inside the try so that its failures are surfaced as
    // { error } just like write failures, instead of rejecting the promise.
    await ensureToolResultsDir()
    filepath = join(getToolResultsDir(), `${persistId}.${ext}`)
    await writeFile(filepath, bytes)
  } catch (error) {
    const err = toError(error)
    logError(err)
    return { error: err.message }
  }

  // ext comes from extensionForMimeType's fixed vocabulary. mimeType is
  // forwarded as received.
  // NOTE(review): mimeType is caller-supplied — confirm upstream that it
  // cannot carry paths or code before relying on the cast below.
  logEvent('tengu_binary_content_persisted', {
    mimeType: (mimeType ??
      'unknown') as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
    sizeBytes: bytes.length,
    ext: ext as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  })

  return { filepath, size: bytes.length, ext }
}
175  
/**
 * Produces the one-line note telling Claude where binary content was written.
 * It only reports the location, type, and size — no advice on what to do with
 * the file, since what the model can actually do with it depends on
 * provider/tooling.
 *
 * @param filepath - Where the content was saved
 * @param mimeType - Mime type of the content; shown as 'unknown type' when missing or empty
 * @param size - Size passed to formatFileSize for display
 * @param sourceDescription - Prefix placed directly before the message (no separator is added)
 * @returns The message text
 */
export function getBinaryBlobSavedMessage(
  filepath: string,
  mimeType: string | undefined,
  size: number,
  sourceDescription: string,
): string {
  const typeLabel = mimeType ? mimeType : 'unknown type'
  const details = [typeLabel, formatFileSize(size)].join(', ')
  return sourceDescription + `Binary content (${details}) saved to ${filepath}`
}