// src/lib/server/document-utils.ts
  1  import fs from 'fs'
  2  import path from 'path'
  3  import { spawnSync } from 'child_process'
  4  import * as cheerio from 'cheerio'
  5  import { dedup } from '@/lib/shared-utils'
  6  import { findBinaryOnPath } from './session-tools/context'
  7  import { safeJsonParse } from './json-utils'
  8  
  9  const TEXT_EXTENSIONS = new Set([
 10    '.txt', '.md', '.markdown', '.json', '.jsonl', '.csv', '.tsv',
 11    '', '.tsx', '.js', '.jsx', '.mjs', '.cjs', '.py', '.go', '.rs',
 12    '.java', '.yaml', '.yml', '.sql', '.xml', '.css', '.scss', '.html', '.htm',
 13  ])
 14  const IMAGE_EXTENSIONS = new Set(['.png', '.jpg', '.jpeg', '.webp', '.gif', '.bmp', '.tif', '.tiff'])
 15  
/** A parsed table: ordered column headers plus one object per data row. */
export interface StructuredTable {
  name: string
  headers: string[]
  // Each row is keyed by header name; values are normalized scalars.
  rows: Array<Record<string, unknown>>
  rowCount: number
}

/** Everything extracted from one file: text, tables, and file metadata. */
export interface DocumentArtifact {
  filePath: string
  fileName: string
  ext: string
  // Which extraction path produced the text (e.g. 'pdf-parse', 'exceljs', 'utf8').
  method: string
  text: string
  metadata: Record<string, unknown>
  tables: StructuredTable[]
}
 32  
 33  function trimText(text: string, maxChars = 200_000): string {
 34    const normalized = text.replace(/\r\n/g, '\n').replace(/\u0000/g, '').trim()
 35    if (normalized.length <= maxChars) return normalized
 36    return `${normalized.slice(0, maxChars)}\n... [truncated]`
 37  }
 38  
 39  function normalizeScalar(value: unknown): unknown {
 40    if (value === undefined) return null
 41    if (value === null) return null
 42    if (typeof value === 'number' || typeof value === 'boolean' || typeof value === 'string') return value
 43    if (value instanceof Date) return value.toISOString()
 44    return String(value)
 45  }
 46  
 47  function parseDelimitedText(input: string, delimiter: string): string[][] {
 48    const rows: string[][] = []
 49    let row: string[] = []
 50    let field = ''
 51    let inQuotes = false
 52  
 53    for (let index = 0; index < input.length; index += 1) {
 54      const char = input[index]
 55      const next = input[index + 1]
 56  
 57      if (inQuotes) {
 58        if (char === '"' && next === '"') {
 59          field += '"'
 60          index += 1
 61          continue
 62        }
 63        if (char === '"') {
 64          inQuotes = false
 65          continue
 66        }
 67        field += char
 68        continue
 69      }
 70  
 71      if (char === '"') {
 72        inQuotes = true
 73        continue
 74      }
 75      if (char === delimiter) {
 76        row.push(field)
 77        field = ''
 78        continue
 79      }
 80      if (char === '\n') {
 81        row.push(field)
 82        rows.push(row)
 83        row = []
 84        field = ''
 85        continue
 86      }
 87      if (char === '\r') continue
 88      field += char
 89    }
 90  
 91    if (field.length > 0 || row.length > 0) {
 92      row.push(field)
 93      rows.push(row)
 94    }
 95  
 96    return rows.filter((cells) => cells.some((cell) => cell.trim().length > 0))
 97  }
 98  
 99  function matrixToTable(name: string, matrix: string[][]): StructuredTable {
100    if (matrix.length === 0) return { name, headers: [], rows: [], rowCount: 0 }
101    const headerRow = matrix[0].map((cell, index) => cell.trim() || `column_${index + 1}`)
102    const rows = matrix.slice(1).map((cells) => {
103      const row: Record<string, unknown> = {}
104      for (let index = 0; index < headerRow.length; index += 1) {
105        row[headerRow[index]] = cells[index] ?? ''
106      }
107      return row
108    })
109    return {
110      name,
111      headers: headerRow,
112      rows,
113      rowCount: rows.length,
114    }
115  }
116  
117  function objectsToTable(name: string, rows: Array<Record<string, unknown>>): StructuredTable {
118    const headers = dedup(rows.flatMap((row) => Object.keys(row)))
119    const normalizedRows = rows.map((row) => {
120      const out: Record<string, unknown> = {}
121      for (const header of headers) out[header] = normalizeScalar(row[header])
122      return out
123    })
124    return {
125      name,
126      headers,
127      rows: normalizedRows,
128      rowCount: normalizedRows.length,
129    }
130  }
131  
132  function tablesToText(tables: StructuredTable[]): string {
133    return tables
134      .map((table) => {
135        const header = table.headers.join('\t')
136        const body = table.rows.slice(0, 100).map((row) => table.headers.map((key) => String(row[key] ?? '')).join('\t')).join('\n')
137        return `${table.name}\n${header}${body ? `\n${body}` : ''}`
138      })
139      .join('\n\n')
140  }
141  
142  function worksheetRowToArray(values: unknown): unknown[] {
143    if (Array.isArray(values)) return values.slice(1)
144    if (values && typeof values === 'object') {
145      return Object.entries(values as Record<string, unknown>)
146        .filter(([key]) => Number.isFinite(Number(key)) && Number(key) >= 1)
147        .sort((left, right) => Number(left[0]) - Number(right[0]))
148        .map(([, value]) => value)
149    }
150    return []
151  }
152  
153  function listZipEntries(filePath: string): { entries: string[]; method: string } {
154    const unzip = findBinaryOnPath('unzip') || findBinaryOnPath('zipinfo')
155    if (!unzip) throw new Error('ZIP listing requires `unzip` or `zipinfo` on PATH.')
156    const args = path.basename(unzip).includes('zipinfo') ? ['-1', filePath] : ['-Z1', filePath]
157    const out = spawnSync(unzip, args, {
158      encoding: 'utf-8',
159      maxBuffer: 10 * 1024 * 1024,
160      timeout: 20_000,
161    })
162    if ((out.status ?? 1) !== 0) {
163      throw new Error(`Failed to inspect ZIP: ${(out.stderr || out.stdout || '').trim() || 'unknown error'}`)
164    }
165    const entries = (out.stdout || '').split(/\r?\n/).map((line) => line.trim()).filter(Boolean)
166    return { entries, method: path.basename(unzip) }
167  }
168  
169  async function extractPdfText(filePath: string): Promise<{ text: string; method: string }> {
170    try {
171      const pdfMod = await import(/* webpackIgnore: true */ 'pdf-parse')
172      const pdfParse = ((pdfMod as Record<string, unknown>).default ?? pdfMod) as (buf: Buffer) => Promise<{ text: string }>
173      const result = await pdfParse(fs.readFileSync(filePath))
174      if ((result.text || '').trim()) {
175        return { text: result.text, method: 'pdf-parse' }
176      }
177    } catch {
178      // fall through to pdftotext
179    }
180  
181    const pdftotext = findBinaryOnPath('pdftotext')
182    if (!pdftotext) throw new Error('PDF extraction requires `pdf-parse` or `pdftotext`.')
183    const out = spawnSync(pdftotext, ['-layout', '-nopgbrk', '-q', filePath, '-'], {
184      encoding: 'utf-8',
185      maxBuffer: 25 * 1024 * 1024,
186      timeout: 20_000,
187    })
188    if ((out.status ?? 1) !== 0) {
189      throw new Error(`pdftotext failed: ${(out.stderr || out.stdout || '').trim() || 'unknown error'}`)
190    }
191    return { text: out.stdout || '', method: 'pdftotext' }
192  }
193  
194  function extractImageText(filePath: string): { text: string; method: string } {
195    const tesseract = findBinaryOnPath('tesseract')
196    if (!tesseract) {
197      throw new Error('Image OCR requires `tesseract` on PATH.')
198    }
199    const out = spawnSync(tesseract, [filePath, 'stdout', '--psm', '6'], {
200      encoding: 'utf-8',
201      maxBuffer: 25 * 1024 * 1024,
202      timeout: 30_000,
203    })
204    if ((out.status ?? 1) !== 0) {
205      throw new Error(`tesseract failed: ${(out.stderr || out.stdout || '').trim() || 'unknown error'}`)
206    }
207    return { text: out.stdout || '', method: 'tesseract' }
208  }
209  
210  function extractRichText(filePath: string): { text: string; method: string } {
211    const textutil = findBinaryOnPath('textutil')
212    if (!textutil) throw new Error('DOC/DOCX/RTF extraction requires `textutil` on PATH.')
213    const out = spawnSync(textutil, ['-convert', 'txt', '-stdout', filePath], {
214      encoding: 'utf-8',
215      maxBuffer: 25 * 1024 * 1024,
216      timeout: 20_000,
217    })
218    if ((out.status ?? 1) !== 0 || !(out.stdout || '').trim()) {
219      throw new Error(`textutil failed: ${(out.stderr || out.stdout || '').trim() || 'unknown error'}`)
220    }
221    return { text: out.stdout || '', method: 'textutil' }
222  }
223  
/**
 * Extracts text, any structured tables, and basic metadata from one file,
 * dispatching on the lowercased file extension.
 *
 * Extraction paths, in branch order: .pdf → extractPdfText; .csv/.tsv →
 * delimited parser; .xlsx/.xlsm → ExcelJS; .json → raw text (plus a table
 * when the payload is an array of objects); .html/.htm → cheerio tag-strip;
 * .zip → entry listing; .doc/.docx/.rtf → textutil; image extensions (or
 * options.preferOcr) → tesseract OCR; known text extensions or no
 * extension → UTF-8 read; anything else → best-effort UTF-8 read.
 *
 * @param filePath path to the file (resolved to an absolute path).
 * @param options.maxChars cap applied to the returned text (default 200_000).
 * @param options.preferOcr when true, files that fall through every
 *   format-specific branch above are OCR'd instead of read as text.
 * @returns the artifact; `method` records which extraction path ran.
 * @throws if the path is missing, is not a regular file, or a required
 *   external tool/package for the detected format is unavailable.
 */
export async function extractDocumentArtifact(filePath: string, options?: { maxChars?: number; preferOcr?: boolean }): Promise<DocumentArtifact> {
  const resolved = path.resolve(filePath)
  if (!fs.existsSync(resolved)) throw new Error(`File not found: ${filePath}`)
  const stat = fs.statSync(resolved)
  if (!stat.isFile()) throw new Error(`Expected a file: ${filePath}`)

  const ext = path.extname(resolved).toLowerCase()
  const metadata: Record<string, unknown> = {
    sizeBytes: stat.size,
    modifiedAt: stat.mtimeMs, // epoch milliseconds
  }
  // NOTE(review): `||` (not `??`) means a caller-supplied maxChars of 0 silently falls back to the default.
  const maxChars = options?.maxChars || 200_000
  let text = ''
  let method = 'utf8'
  let tables: StructuredTable[] = []

  if (ext === '.pdf') {
    const pdf = await extractPdfText(resolved)
    text = pdf.text
    method = pdf.method
  } else if (ext === '.csv' || ext === '.tsv') {
    const delimiter = ext === '.tsv' ? '\t' : ','
    const raw = fs.readFileSync(resolved, 'utf-8')
    const table = matrixToTable(path.basename(resolved), parseDelimitedText(raw, delimiter))
    tables = [table]
    text = tablesToText(tables)
    method = ext === '.tsv' ? 'tsv' : 'csv'
  } else if (ext === '.xlsx' || ext === '.xlsm') {
    const ExcelJS = await import('exceljs')
    const workbook = new ExcelJS.Workbook()
    await workbook.xlsx.readFile(resolved)
    // One StructuredTable per worksheet; sheets that produced nothing are dropped.
    tables = workbook.worksheets.map((worksheet) => {
      const matrix: string[][] = []
      worksheet.eachRow((row) => {
        matrix.push(worksheetRowToArray(row.values).map((cell) => String(normalizeScalar(cell) ?? '')))
      })
      return matrixToTable(worksheet.name, matrix)
    }).filter((table) => table.headers.length > 0 || table.rowCount > 0)
    text = tablesToText(tables)
    method = 'exceljs'
    metadata.sheetNames = workbook.worksheets.map((sheet) => sheet.name)
  } else if (ext === '.json') {
    const raw = fs.readFileSync(resolved, 'utf-8')
    text = raw
    method = 'json'
    // Only array-of-objects JSON becomes a table; any other shape stays text-only.
    const parsed = safeJsonParse<unknown>(raw, null)
    if (Array.isArray(parsed) && parsed.every((row) => row && typeof row === 'object' && !Array.isArray(row))) {
      tables = [objectsToTable(path.basename(resolved), parsed as Array<Record<string, unknown>>)]
    }
  } else if (ext === '.html' || ext === '.htm') {
    const html = fs.readFileSync(resolved, 'utf-8')
    const $ = cheerio.load(html)
    // Strip non-content elements before taking visible text.
    $('script, style, noscript').remove()
    text = $('body').text() || $.text()
    method = 'html-strip'
  } else if (ext === '.zip') {
    const zip = listZipEntries(resolved)
    text = zip.entries.join('\n')
    method = zip.method
    metadata.entries = zip.entries
  } else if (ext === '.doc' || ext === '.docx' || ext === '.rtf') {
    const rich = extractRichText(resolved)
    text = rich.text
    method = rich.method
  } else if (IMAGE_EXTENSIONS.has(ext) || options?.preferOcr === true) {
    const image = extractImageText(resolved)
    text = image.text
    method = image.method
  } else if (TEXT_EXTENSIONS.has(ext) || !ext) {
    text = fs.readFileSync(resolved, 'utf-8')
    method = 'utf8'
  } else {
    // Unknown extension: best-effort UTF-8 read (NUL bytes are stripped by trimText).
    text = fs.readFileSync(resolved, 'utf-8')
    method = 'utf8-fallback'
  }

  return {
    filePath: resolved,
    fileName: path.basename(resolved),
    ext,
    method,
    text: trimText(text, maxChars),
    metadata,
    tables,
  }
}
310  
311  export async function loadTabularFile(filePath: string, options?: { sheetName?: string }): Promise<StructuredTable> {
312    const resolved = path.resolve(filePath)
313    const ext = path.extname(resolved).toLowerCase()
314    if (ext === '.csv' || ext === '.tsv') {
315      const delimiter = ext === '.tsv' ? '\t' : ','
316      return matrixToTable(path.basename(resolved), parseDelimitedText(fs.readFileSync(resolved, 'utf-8'), delimiter))
317    }
318    if (ext === '.json') {
319      const parsed = JSON.parse(fs.readFileSync(resolved, 'utf-8'))
320      if (!Array.isArray(parsed) || !parsed.every((row) => row && typeof row === 'object' && !Array.isArray(row))) {
321        throw new Error('JSON table inputs must be an array of objects.')
322      }
323      return objectsToTable(path.basename(resolved), parsed as Array<Record<string, unknown>>)
324    }
325    if (ext === '.xlsx' || ext === '.xlsm') {
326      const ExcelJS = await import('exceljs')
327      const workbook = new ExcelJS.Workbook()
328      await workbook.xlsx.readFile(resolved)
329      const target = options?.sheetName
330        ? workbook.getWorksheet(options.sheetName)
331        : workbook.worksheets[0]
332      if (!target) throw new Error(`Worksheet not found: ${options?.sheetName || '(first worksheet)'}`)
333      const matrix: string[][] = []
334      target.eachRow((row) => {
335        matrix.push(worksheetRowToArray(row.values).map((cell) => String(normalizeScalar(cell) ?? '')))
336      })
337      return matrixToTable(target.name, matrix)
338    }
339    throw new Error(`Unsupported tabular file: ${ext || '(no extension)'}`)
340  }
341  
342  export function normalizeInlineRows(value: unknown): StructuredTable {
343    if (!Array.isArray(value)) throw new Error('rows must be an array.')
344    if (value.length === 0) return { name: 'rows', headers: [], rows: [], rowCount: 0 }
345    if (value.every((row) => Array.isArray(row))) {
346      return matrixToTable('rows', value.map((row) => (row as unknown[]).map((cell) => String(normalizeScalar(cell) ?? ''))))
347    }
348    if (value.every((row) => row && typeof row === 'object' && !Array.isArray(row))) {
349      return objectsToTable('rows', value as Array<Record<string, unknown>>)
350    }
351    throw new Error('rows must be an array of objects or arrays.')
352  }
353  
354  function escapeDelimitedCell(value: unknown, delimiter: string): string {
355    const raw = String(normalizeScalar(value) ?? '')
356    if (raw.includes('"') || raw.includes('\n') || raw.includes(delimiter)) {
357      return `"${raw.replace(/"/g, '""')}"`
358    }
359    return raw
360  }
361  
362  export function serializeTable(table: StructuredTable, delimiter = ','): string {
363    const header = table.headers.map((cell) => escapeDelimitedCell(cell, delimiter)).join(delimiter)
364    const rows = table.rows.map((row) => table.headers.map((headerCell) => escapeDelimitedCell(row[headerCell], delimiter)).join(delimiter))
365    return [header, ...rows].join('\n')
366  }
367  
368  export async function writeStructuredTable(filePath: string, table: StructuredTable): Promise<{ filePath: string; format: string }> {
369    const resolved = path.resolve(filePath)
370    const ext = path.extname(resolved).toLowerCase()
371    fs.mkdirSync(path.dirname(resolved), { recursive: true })
372  
373    if (ext === '.json') {
374      fs.writeFileSync(resolved, JSON.stringify(table.rows, null, 2), 'utf-8')
375      return { filePath: resolved, format: 'json' }
376    }
377    if (ext === '.tsv') {
378      fs.writeFileSync(resolved, serializeTable(table, '\t'), 'utf-8')
379      return { filePath: resolved, format: 'tsv' }
380    }
381    if (ext === '.xlsx') {
382      const ExcelJS = await import('exceljs')
383      const workbook = new ExcelJS.Workbook()
384      const worksheet = workbook.addWorksheet(table.name || 'Sheet1')
385      worksheet.addRow(table.headers)
386      for (const row of table.rows) {
387        worksheet.addRow(table.headers.map((header) => row[header] ?? null))
388      }
389      await workbook.xlsx.writeFile(resolved)
390      return { filePath: resolved, format: 'xlsx' }
391    }
392  
393    fs.writeFileSync(resolved, serializeTable(table, ','), 'utf-8')
394    return { filePath: resolved, format: 'csv' }
395  }