mcpOutputStorage.ts
import { writeFile } from 'fs/promises'
import { join } from 'path'
import {
  type AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  logEvent,
} from '../services/analytics/index.js'
import type { MCPResultType } from '../services/mcp/client.js'
import { toError } from './errors.js'
import { formatFileSize } from './format.js'
import { logError } from './log.js'
import { ensureToolResultsDir, getToolResultsDir } from './toolResultStorage.js'

/**
 * Renders a schema value as text for inclusion in a format description.
 *
 * Strings are passed through untouched. Anything else is JSON-serialised so
 * that object schemas do not collapse to "[object Object]". Values that JSON
 * cannot represent (circular structures, bigint, functions, symbols) fall
 * back to `String()`.
 */
function describeSchema(schema: unknown): string {
  if (typeof schema === 'string') return schema
  try {
    // JSON.stringify yields undefined at runtime for functions/symbols.
    return JSON.stringify(schema) ?? String(schema)
  } catch {
    return String(schema)
  }
}

/**
 * Extracts the bare media type from a mime-type / content-type value:
 * drops any `;`-separated parameters (charset, boundary, ...), trims
 * surrounding whitespace, and lower-cases the result.
 */
function normalizeMediaType(value: string): string {
  return (value.split(';')[0] ?? '').trim().toLowerCase()
}

/**
 * Generates a format description string based on the MCP result type and schema.
 *
 * @param type - Kind of MCP result being described
 * @param schema - Optional schema; when truthy it is appended to the
 *   description for the JSON-based result types. String schemas are used
 *   verbatim, other values are JSON-serialised.
 * @returns Human-readable description of the content format
 */
export function getFormatDescription(
  type: MCPResultType,
  schema?: unknown,
): string {
  switch (type) {
    case 'toolResult':
      return 'Plain text'
    case 'structuredContent':
      return schema ? `JSON with schema: ${describeSchema(schema)}` : 'JSON'
    case 'contentArray':
      return schema
        ? `JSON array with schema: ${describeSchema(schema)}`
        : 'JSON array'
  }
}

/**
 * Generates instruction text for Claude to read from a saved output file.
 *
 * The text is assembled from three parts: the base instructions (where the
 * output was saved, its format, and how to read it), a truncation warning
 * whose wording depends on whether `maxReadLength` is provided, and a final
 * requirement to state how much of the content was actually read.
 *
 * @param rawOutputPath - Path to the saved output file
 * @param contentLength - Length of the content in characters
 * @param formatDescription - Description of the content format
 * @param maxReadLength - Optional max chars for Read tool (for Bash output context).
 *   Checked by truthiness, so `0` selects the generic warning.
 * @returns Instruction text to include in the tool result
 */
export function getLargeOutputInstructions(
  rawOutputPath: string,
  contentLength: number,
  formatDescription: string,
  maxReadLength?: number,
): string {
  const baseInstructions =
    `Error: result (${contentLength.toLocaleString()} characters) exceeds maximum allowed tokens. Output has been saved to ${rawOutputPath}.\n` +
    `Format: ${formatDescription}\n` +
    `Use offset and limit parameters to read specific portions of the file, search within it for specific content, and jq to make structured queries.\n` +
    `REQUIREMENTS FOR SUMMARIZATION/ANALYSIS/REVIEW:\n` +
    `- You MUST read the content from the file at ${rawOutputPath} in sequential chunks until 100% of the content has been read.\n`

  const truncationWarning = maxReadLength
    ? `- If you receive truncation warnings when reading the file ("[N lines truncated]"), reduce the chunk size until you have read 100% of the content without truncation ***DO NOT PROCEED UNTIL YOU HAVE DONE THIS***. Bash output is limited to ${maxReadLength.toLocaleString()} chars.\n`
    : `- If you receive truncation warnings when reading the file, reduce the chunk size until you have read 100% of the content without truncation.\n`

  const completionRequirement = `- Before producing ANY summary or analysis, you MUST explicitly describe what portion of the content you have read. ***If you did not read the entire content, you MUST explicitly state this.***\n`

  return baseInstructions + truncationWarning + completionRequirement
}

/**
 * Known media types and the file extension each one is saved with.
 * A Map (rather than a plain object) keeps inherited keys such as
 * "constructor" from ever matching.
 */
const EXTENSION_BY_MEDIA_TYPE: ReadonlyMap<string, string> = new Map([
  ['application/pdf', 'pdf'],
  ['application/json', 'json'],
  ['text/csv', 'csv'],
  ['text/plain', 'txt'],
  ['text/html', 'html'],
  ['text/markdown', 'md'],
  ['application/zip', 'zip'],
  [
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    'docx',
  ],
  ['application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'xlsx'],
  [
    'application/vnd.openxmlformats-officedocument.presentationml.presentation',
    'pptx',
  ],
  ['application/msword', 'doc'],
  ['application/vnd.ms-excel', 'xls'],
  ['audio/mpeg', 'mp3'],
  ['audio/wav', 'wav'],
  ['audio/ogg', 'ogg'],
  ['video/mp4', 'mp4'],
  ['video/webm', 'webm'],
  ['image/png', 'png'],
  ['image/jpeg', 'jpg'],
  ['image/gif', 'gif'],
  ['image/webp', 'webp'],
  ['image/svg+xml', 'svg'],
])

/**
 * Map a mime type to a file extension. Conservative: known types get their
 * proper extension; unknown types get 'bin'. The extension matters because
 * the Read tool dispatches on it (PDFs, images, etc. need the right ext).
 *
 * @param mimeType - Mime type, optionally carrying parameters such as
 *   `; charset=utf-8`; matching is case-insensitive
 * @returns Extension without a leading dot; 'bin' when the type is missing
 *   or not recognised
 */
export function extensionForMimeType(mimeType: string | undefined): string {
  if (!mimeType) return 'bin'
  return EXTENSION_BY_MEDIA_TYPE.get(normalizeMediaType(mimeType)) ?? 'bin'
}

/**
 * Heuristic for whether a content-type header indicates binary content that
 * should be saved to disk rather than put into the model context.
 * Text-ish types (text/*, json, xml, javascript, urlencoded form data) are
 * treated as non-binary.
 *
 * @param contentType - Raw content-type header value (parameters allowed)
 * @returns `true` when the content should be treated as binary; `false` for
 *   text-like types and when no media type is present
 */
export function isBinaryContentType(contentType: string): boolean {
  if (!contentType) return false
  const mt = normalizeMediaType(contentType)
  // A header with parameters but no media type (or only whitespace) carries
  // no type information; treat it like a missing header.
  if (!mt) return false
  if (mt.startsWith('text/')) return false
  // Structured text formats delivered with an application/ type. Use suffix
  // or exact match rather than substring so 'openxmlformats' (docx/xlsx) stays binary.
  if (mt.endsWith('+json') || mt === 'application/json') return false
  if (mt.endsWith('+xml') || mt === 'application/xml') return false
  if (mt.startsWith('application/javascript')) return false
  if (mt === 'application/x-www-form-urlencoded') return false
  return true
}

/** Outcome of persisting binary content: the saved file's details, or an error message. */
export type PersistBinaryResult =
  | { filepath: string; size: number; ext: string }
  | { error: string }

/**
 * Write raw binary bytes to the tool-results directory with a mime-derived
 * extension. Unlike persistToolResult (which stringifies), this writes the
 * bytes as-is so the resulting file can be opened with native tools (Read
 * for PDFs, pandas for xlsx, etc.).
 *
 * Failures while preparing the directory or writing the file are logged and
 * reported through the `{ error }` variant instead of rejecting.
 *
 * @param bytes - Raw content to write
 * @param mimeType - Mime type used to pick the file extension
 * @param persistId - Base name (without extension) for the saved file
 * @returns The saved file's path, size in bytes and extension, or an error message
 */
export async function persistBinaryContent(
  bytes: Buffer,
  mimeType: string | undefined,
  persistId: string,
): Promise<PersistBinaryResult> {
  const ext = extensionForMimeType(mimeType)
  let filepath: string

  try {
    // Directory preparation lives inside the try so that its failures are
    // surfaced the same way as write failures.
    await ensureToolResultsDir()
    filepath = join(getToolResultsDir(), `${persistId}.${ext}`)
    await writeFile(filepath, bytes)
  } catch (error) {
    const err = toError(error)
    logError(err)
    return { error: err.message }
  }

  // mime type and extension are safe fixed-vocabulary strings (not paths/code)
  logEvent('tengu_binary_content_persisted', {
    mimeType: (mimeType ??
      'unknown') as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
    sizeBytes: bytes.length,
    ext: ext as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  })

  return { filepath, size: bytes.length, ext }
}

/**
 * Build a short message telling Claude where binary content was saved.
 * Just states the path — no prescriptive hint, since what the model can
 * actually do with the file depends on provider/tooling.
 *
 * @param filepath - Where the content was written
 * @param mimeType - Mime type of the content; shown as 'unknown type' when
 *   missing or empty
 * @param size - Content size, rendered via formatFileSize
 * @param sourceDescription - Text placed directly before the message with no
 *   separator added, so callers supply any trailing punctuation or spacing
 * @returns The assembled one-line message
 */
export function getBinaryBlobSavedMessage(
  filepath: string,
  mimeType: string | undefined,
  size: number,
  sourceDescription: string,
): string {
  const mt = mimeType || 'unknown type'
  return `${sourceDescription}Binary content (${mt}, ${formatFileSize(size)}) saved to ${filepath}`
}