// document-utils.ts
1 import fs from 'fs' 2 import path from 'path' 3 import { spawnSync } from 'child_process' 4 import * as cheerio from 'cheerio' 5 import { dedup } from '@/lib/shared-utils' 6 import { findBinaryOnPath } from './session-tools/context' 7 import { safeJsonParse } from './json-utils' 8 9 const TEXT_EXTENSIONS = new Set([ 10 '.txt', '.md', '.markdown', '.json', '.jsonl', '.csv', '.tsv', 11 '', '.tsx', '.js', '.jsx', '.mjs', '.cjs', '.py', '.go', '.rs', 12 '.java', '.yaml', '.yml', '.sql', '.xml', '.css', '.scss', '.html', '.htm', 13 ]) 14 const IMAGE_EXTENSIONS = new Set(['.png', '.jpg', '.jpeg', '.webp', '.gif', '.bmp', '.tif', '.tiff']) 15 16 export interface StructuredTable { 17 name: string 18 headers: string[] 19 rows: Array<Record<string, unknown>> 20 rowCount: number 21 } 22 23 export interface DocumentArtifact { 24 filePath: string 25 fileName: string 26 ext: string 27 method: string 28 text: string 29 metadata: Record<string, unknown> 30 tables: StructuredTable[] 31 } 32 33 function trimText(text: string, maxChars = 200_000): string { 34 const normalized = text.replace(/\r\n/g, '\n').replace(/\u0000/g, '').trim() 35 if (normalized.length <= maxChars) return normalized 36 return `${normalized.slice(0, maxChars)}\n... 
[truncated]` 37 } 38 39 function normalizeScalar(value: unknown): unknown { 40 if (value === undefined) return null 41 if (value === null) return null 42 if (typeof value === 'number' || typeof value === 'boolean' || typeof value === 'string') return value 43 if (value instanceof Date) return value.toISOString() 44 return String(value) 45 } 46 47 function parseDelimitedText(input: string, delimiter: string): string[][] { 48 const rows: string[][] = [] 49 let row: string[] = [] 50 let field = '' 51 let inQuotes = false 52 53 for (let index = 0; index < input.length; index += 1) { 54 const char = input[index] 55 const next = input[index + 1] 56 57 if (inQuotes) { 58 if (char === '"' && next === '"') { 59 field += '"' 60 index += 1 61 continue 62 } 63 if (char === '"') { 64 inQuotes = false 65 continue 66 } 67 field += char 68 continue 69 } 70 71 if (char === '"') { 72 inQuotes = true 73 continue 74 } 75 if (char === delimiter) { 76 row.push(field) 77 field = '' 78 continue 79 } 80 if (char === '\n') { 81 row.push(field) 82 rows.push(row) 83 row = [] 84 field = '' 85 continue 86 } 87 if (char === '\r') continue 88 field += char 89 } 90 91 if (field.length > 0 || row.length > 0) { 92 row.push(field) 93 rows.push(row) 94 } 95 96 return rows.filter((cells) => cells.some((cell) => cell.trim().length > 0)) 97 } 98 99 function matrixToTable(name: string, matrix: string[][]): StructuredTable { 100 if (matrix.length === 0) return { name, headers: [], rows: [], rowCount: 0 } 101 const headerRow = matrix[0].map((cell, index) => cell.trim() || `column_${index + 1}`) 102 const rows = matrix.slice(1).map((cells) => { 103 const row: Record<string, unknown> = {} 104 for (let index = 0; index < headerRow.length; index += 1) { 105 row[headerRow[index]] = cells[index] ?? 
'' 106 } 107 return row 108 }) 109 return { 110 name, 111 headers: headerRow, 112 rows, 113 rowCount: rows.length, 114 } 115 } 116 117 function objectsToTable(name: string, rows: Array<Record<string, unknown>>): StructuredTable { 118 const headers = dedup(rows.flatMap((row) => Object.keys(row))) 119 const normalizedRows = rows.map((row) => { 120 const out: Record<string, unknown> = {} 121 for (const header of headers) out[header] = normalizeScalar(row[header]) 122 return out 123 }) 124 return { 125 name, 126 headers, 127 rows: normalizedRows, 128 rowCount: normalizedRows.length, 129 } 130 } 131 132 function tablesToText(tables: StructuredTable[]): string { 133 return tables 134 .map((table) => { 135 const header = table.headers.join('\t') 136 const body = table.rows.slice(0, 100).map((row) => table.headers.map((key) => String(row[key] ?? '')).join('\t')).join('\n') 137 return `${table.name}\n${header}${body ? `\n${body}` : ''}` 138 }) 139 .join('\n\n') 140 } 141 142 function worksheetRowToArray(values: unknown): unknown[] { 143 if (Array.isArray(values)) return values.slice(1) 144 if (values && typeof values === 'object') { 145 return Object.entries(values as Record<string, unknown>) 146 .filter(([key]) => Number.isFinite(Number(key)) && Number(key) >= 1) 147 .sort((left, right) => Number(left[0]) - Number(right[0])) 148 .map(([, value]) => value) 149 } 150 return [] 151 } 152 153 function listZipEntries(filePath: string): { entries: string[]; method: string } { 154 const unzip = findBinaryOnPath('unzip') || findBinaryOnPath('zipinfo') 155 if (!unzip) throw new Error('ZIP listing requires `unzip` or `zipinfo` on PATH.') 156 const args = path.basename(unzip).includes('zipinfo') ? ['-1', filePath] : ['-Z1', filePath] 157 const out = spawnSync(unzip, args, { 158 encoding: 'utf-8', 159 maxBuffer: 10 * 1024 * 1024, 160 timeout: 20_000, 161 }) 162 if ((out.status ?? 
1) !== 0) { 163 throw new Error(`Failed to inspect ZIP: ${(out.stderr || out.stdout || '').trim() || 'unknown error'}`) 164 } 165 const entries = (out.stdout || '').split(/\r?\n/).map((line) => line.trim()).filter(Boolean) 166 return { entries, method: path.basename(unzip) } 167 } 168 169 async function extractPdfText(filePath: string): Promise<{ text: string; method: string }> { 170 try { 171 const pdfMod = await import(/* webpackIgnore: true */ 'pdf-parse') 172 const pdfParse = ((pdfMod as Record<string, unknown>).default ?? pdfMod) as (buf: Buffer) => Promise<{ text: string }> 173 const result = await pdfParse(fs.readFileSync(filePath)) 174 if ((result.text || '').trim()) { 175 return { text: result.text, method: 'pdf-parse' } 176 } 177 } catch { 178 // fall through to pdftotext 179 } 180 181 const pdftotext = findBinaryOnPath('pdftotext') 182 if (!pdftotext) throw new Error('PDF extraction requires `pdf-parse` or `pdftotext`.') 183 const out = spawnSync(pdftotext, ['-layout', '-nopgbrk', '-q', filePath, '-'], { 184 encoding: 'utf-8', 185 maxBuffer: 25 * 1024 * 1024, 186 timeout: 20_000, 187 }) 188 if ((out.status ?? 1) !== 0) { 189 throw new Error(`pdftotext failed: ${(out.stderr || out.stdout || '').trim() || 'unknown error'}`) 190 } 191 return { text: out.stdout || '', method: 'pdftotext' } 192 } 193 194 function extractImageText(filePath: string): { text: string; method: string } { 195 const tesseract = findBinaryOnPath('tesseract') 196 if (!tesseract) { 197 throw new Error('Image OCR requires `tesseract` on PATH.') 198 } 199 const out = spawnSync(tesseract, [filePath, 'stdout', '--psm', '6'], { 200 encoding: 'utf-8', 201 maxBuffer: 25 * 1024 * 1024, 202 timeout: 30_000, 203 }) 204 if ((out.status ?? 
1) !== 0) { 205 throw new Error(`tesseract failed: ${(out.stderr || out.stdout || '').trim() || 'unknown error'}`) 206 } 207 return { text: out.stdout || '', method: 'tesseract' } 208 } 209 210 function extractRichText(filePath: string): { text: string; method: string } { 211 const textutil = findBinaryOnPath('textutil') 212 if (!textutil) throw new Error('DOC/DOCX/RTF extraction requires `textutil` on PATH.') 213 const out = spawnSync(textutil, ['-convert', 'txt', '-stdout', filePath], { 214 encoding: 'utf-8', 215 maxBuffer: 25 * 1024 * 1024, 216 timeout: 20_000, 217 }) 218 if ((out.status ?? 1) !== 0 || !(out.stdout || '').trim()) { 219 throw new Error(`textutil failed: ${(out.stderr || out.stdout || '').trim() || 'unknown error'}`) 220 } 221 return { text: out.stdout || '', method: 'textutil' } 222 } 223 224 export async function extractDocumentArtifact(filePath: string, options?: { maxChars?: number; preferOcr?: boolean }): Promise<DocumentArtifact> { 225 const resolved = path.resolve(filePath) 226 if (!fs.existsSync(resolved)) throw new Error(`File not found: ${filePath}`) 227 const stat = fs.statSync(resolved) 228 if (!stat.isFile()) throw new Error(`Expected a file: ${filePath}`) 229 230 const ext = path.extname(resolved).toLowerCase() 231 const metadata: Record<string, unknown> = { 232 sizeBytes: stat.size, 233 modifiedAt: stat.mtimeMs, 234 } 235 const maxChars = options?.maxChars || 200_000 236 let text = '' 237 let method = 'utf8' 238 let tables: StructuredTable[] = [] 239 240 if (ext === '.pdf') { 241 const pdf = await extractPdfText(resolved) 242 text = pdf.text 243 method = pdf.method 244 } else if (ext === '.csv' || ext === '.tsv') { 245 const delimiter = ext === '.tsv' ? '\t' : ',' 246 const raw = fs.readFileSync(resolved, 'utf-8') 247 const table = matrixToTable(path.basename(resolved), parseDelimitedText(raw, delimiter)) 248 tables = [table] 249 text = tablesToText(tables) 250 method = ext === '.tsv' ? 
'tsv' : 'csv' 251 } else if (ext === '.xlsx' || ext === '.xlsm') { 252 const ExcelJS = await import('exceljs') 253 const workbook = new ExcelJS.Workbook() 254 await workbook.xlsx.readFile(resolved) 255 tables = workbook.worksheets.map((worksheet) => { 256 const matrix: string[][] = [] 257 worksheet.eachRow((row) => { 258 matrix.push(worksheetRowToArray(row.values).map((cell) => String(normalizeScalar(cell) ?? ''))) 259 }) 260 return matrixToTable(worksheet.name, matrix) 261 }).filter((table) => table.headers.length > 0 || table.rowCount > 0) 262 text = tablesToText(tables) 263 method = 'exceljs' 264 metadata.sheetNames = workbook.worksheets.map((sheet) => sheet.name) 265 } else if (ext === '.json') { 266 const raw = fs.readFileSync(resolved, 'utf-8') 267 text = raw 268 method = 'json' 269 const parsed = safeJsonParse<unknown>(raw, null) 270 if (Array.isArray(parsed) && parsed.every((row) => row && typeof row === 'object' && !Array.isArray(row))) { 271 tables = [objectsToTable(path.basename(resolved), parsed as Array<Record<string, unknown>>)] 272 } 273 } else if (ext === '.html' || ext === '.htm') { 274 const html = fs.readFileSync(resolved, 'utf-8') 275 const $ = cheerio.load(html) 276 $('script, style, noscript').remove() 277 text = $('body').text() || $.text() 278 method = 'html-strip' 279 } else if (ext === '.zip') { 280 const zip = listZipEntries(resolved) 281 text = zip.entries.join('\n') 282 method = zip.method 283 metadata.entries = zip.entries 284 } else if (ext === '.doc' || ext === '.docx' || ext === '.rtf') { 285 const rich = extractRichText(resolved) 286 text = rich.text 287 method = rich.method 288 } else if (IMAGE_EXTENSIONS.has(ext) || options?.preferOcr === true) { 289 const image = extractImageText(resolved) 290 text = image.text 291 method = image.method 292 } else if (TEXT_EXTENSIONS.has(ext) || !ext) { 293 text = fs.readFileSync(resolved, 'utf-8') 294 method = 'utf8' 295 } else { 296 text = fs.readFileSync(resolved, 'utf-8') 297 method = 
'utf8-fallback' 298 } 299 300 return { 301 filePath: resolved, 302 fileName: path.basename(resolved), 303 ext, 304 method, 305 text: trimText(text, maxChars), 306 metadata, 307 tables, 308 } 309 } 310 311 export async function loadTabularFile(filePath: string, options?: { sheetName?: string }): Promise<StructuredTable> { 312 const resolved = path.resolve(filePath) 313 const ext = path.extname(resolved).toLowerCase() 314 if (ext === '.csv' || ext === '.tsv') { 315 const delimiter = ext === '.tsv' ? '\t' : ',' 316 return matrixToTable(path.basename(resolved), parseDelimitedText(fs.readFileSync(resolved, 'utf-8'), delimiter)) 317 } 318 if (ext === '.json') { 319 const parsed = JSON.parse(fs.readFileSync(resolved, 'utf-8')) 320 if (!Array.isArray(parsed) || !parsed.every((row) => row && typeof row === 'object' && !Array.isArray(row))) { 321 throw new Error('JSON table inputs must be an array of objects.') 322 } 323 return objectsToTable(path.basename(resolved), parsed as Array<Record<string, unknown>>) 324 } 325 if (ext === '.xlsx' || ext === '.xlsm') { 326 const ExcelJS = await import('exceljs') 327 const workbook = new ExcelJS.Workbook() 328 await workbook.xlsx.readFile(resolved) 329 const target = options?.sheetName 330 ? workbook.getWorksheet(options.sheetName) 331 : workbook.worksheets[0] 332 if (!target) throw new Error(`Worksheet not found: ${options?.sheetName || '(first worksheet)'}`) 333 const matrix: string[][] = [] 334 target.eachRow((row) => { 335 matrix.push(worksheetRowToArray(row.values).map((cell) => String(normalizeScalar(cell) ?? 
''))) 336 }) 337 return matrixToTable(target.name, matrix) 338 } 339 throw new Error(`Unsupported tabular file: ${ext || '(no extension)'}`) 340 } 341 342 export function normalizeInlineRows(value: unknown): StructuredTable { 343 if (!Array.isArray(value)) throw new Error('rows must be an array.') 344 if (value.length === 0) return { name: 'rows', headers: [], rows: [], rowCount: 0 } 345 if (value.every((row) => Array.isArray(row))) { 346 return matrixToTable('rows', value.map((row) => (row as unknown[]).map((cell) => String(normalizeScalar(cell) ?? '')))) 347 } 348 if (value.every((row) => row && typeof row === 'object' && !Array.isArray(row))) { 349 return objectsToTable('rows', value as Array<Record<string, unknown>>) 350 } 351 throw new Error('rows must be an array of objects or arrays.') 352 } 353 354 function escapeDelimitedCell(value: unknown, delimiter: string): string { 355 const raw = String(normalizeScalar(value) ?? '') 356 if (raw.includes('"') || raw.includes('\n') || raw.includes(delimiter)) { 357 return `"${raw.replace(/"/g, '""')}"` 358 } 359 return raw 360 } 361 362 export function serializeTable(table: StructuredTable, delimiter = ','): string { 363 const header = table.headers.map((cell) => escapeDelimitedCell(cell, delimiter)).join(delimiter) 364 const rows = table.rows.map((row) => table.headers.map((headerCell) => escapeDelimitedCell(row[headerCell], delimiter)).join(delimiter)) 365 return [header, ...rows].join('\n') 366 } 367 368 export async function writeStructuredTable(filePath: string, table: StructuredTable): Promise<{ filePath: string; format: string }> { 369 const resolved = path.resolve(filePath) 370 const ext = path.extname(resolved).toLowerCase() 371 fs.mkdirSync(path.dirname(resolved), { recursive: true }) 372 373 if (ext === '.json') { 374 fs.writeFileSync(resolved, JSON.stringify(table.rows, null, 2), 'utf-8') 375 return { filePath: resolved, format: 'json' } 376 } 377 if (ext === '.tsv') { 378 fs.writeFileSync(resolved, 
serializeTable(table, '\t'), 'utf-8') 379 return { filePath: resolved, format: 'tsv' } 380 } 381 if (ext === '.xlsx') { 382 const ExcelJS = await import('exceljs') 383 const workbook = new ExcelJS.Workbook() 384 const worksheet = workbook.addWorksheet(table.name || 'Sheet1') 385 worksheet.addRow(table.headers) 386 for (const row of table.rows) { 387 worksheet.addRow(table.headers.map((header) => row[header] ?? null)) 388 } 389 await workbook.xlsx.writeFile(resolved) 390 return { filePath: resolved, format: 'xlsx' } 391 } 392 393 fs.writeFileSync(resolved, serializeTable(table, ','), 'utf-8') 394 return { filePath: resolved, format: 'csv' } 395 }