Cradicle Explorer

/ src / lib / server / knowledge-import.ts
knowledge-import.ts
  1  import fs from 'fs'
  2  import path from 'path'
  3  import * as cheerio from 'cheerio'
  4  
  5  const TEXT_EXTS = new Set([
  6    '.txt', '.md', '.markdown', '.csv', '.tsv', '.json', '.jsonl',
  7    '.html', '.htm', '.xml', '.yaml', '.yml', '.toml', '.ini', '.cfg',
  8    '.js', '.ts', '.tsx', '.jsx', '.py', '.go', '.rs', '.java', '.c', '.cpp', '.h',
  9    '.rb', '.php', '.sh', '.bash', '.zsh', '.sql', '.r', '.swift', '.kt',
 10    '.env', '.log', '.conf', '.properties', '.gitignore', '.dockerignore',
 11  ])
 12  
 13  export const MAX_KNOWLEDGE_IMPORT_BYTES = 10 * 1024 * 1024
 14  export const MAX_KNOWLEDGE_CONTENT_CHARS = 500_000
 15  
 16  export function isKnowledgeTextFile(filename: string): boolean {
 17    const ext = path.extname(filename).toLowerCase()
 18    return TEXT_EXTS.has(ext) || ext === ''
 19  }
 20  
 21  export function deriveKnowledgeTitle(filename: string): string {
 22    const name = path.basename(filename, path.extname(filename))
 23    return name
 24      .replace(/[-_]+/g, ' ')
 25      .replace(/([a-z])([A-Z])/g, '$1 $2')
 26      .replace(/\b\w/g, (char) => char.toUpperCase())
 27      .trim() || 'Knowledge Source'
 28  }
 29  
 30  function normalizeKnowledgeContent(content: string): string {
 31    const normalized = String(content || '')
 32      .replace(/^\uFEFF/, '')
 33      .replace(/\r\n/g, '\n')
 34      .trim()
 35  
 36    if (normalized.length <= MAX_KNOWLEDGE_CONTENT_CHARS) return normalized
 37    return `${normalized.slice(0, MAX_KNOWLEDGE_CONTENT_CHARS)}\n\n[... truncated at 500k characters]`
 38  }
 39  
 40  async function extractPdfText(buffer: Buffer, filePathHint?: string): Promise<string> {
 41    try {
 42      const pdfParseModule = await import('pdf-parse') as unknown as {
 43        default?: (input: Buffer) => Promise<{ text?: string }>
 44      }
 45      const pdfParse = pdfParseModule.default
 46      if (typeof pdfParse !== 'function') throw new Error('pdf-parse loader unavailable')
 47      const result = await pdfParse(buffer)
 48      return normalizeKnowledgeContent(result.text || '')
 49    } catch {
 50      return normalizeKnowledgeContent(
 51        `[PDF document]\n\nUnable to extract text automatically.${filePathHint ? `\n\nSaved at: ${filePathHint}` : ''}`,
 52      )
 53    }
 54  }
 55  
 56  function htmlToReadableText(html: string): { title: string | null; content: string } {
 57    const $ = cheerio.load(html)
 58    $('script, style, noscript, svg, nav, footer, header').remove()
 59  
 60    const title = $('title').first().text().trim() || null
 61    const root = $('main').first().length
 62      ? $('main').first()
 63      : $('article').first().length
 64        ? $('article').first()
 65        : $('body').first().length
 66          ? $('body').first()
 67          : $('html').first()
 68  
 69    const text = root
 70      .text()
 71      .replace(/\u00a0/g, ' ')
 72      .split('\n')
 73      .map((line) => line.trim())
 74      .filter(Boolean)
 75      .join('\n\n')
 76  
 77    return {
 78      title,
 79      content: normalizeKnowledgeContent(text),
 80    }
 81  }
 82  
 83  export async function extractKnowledgeTextFromBuffer(
 84    buffer: Buffer,
 85    filename: string,
 86    filePathHint?: string,
 87  ): Promise<string> {
 88    if (buffer.length === 0) return ''
 89    if (buffer.length > MAX_KNOWLEDGE_IMPORT_BYTES) {
 90      throw new Error('File too large. Maximum 10MB.')
 91    }
 92  
 93    const ext = path.extname(filename).toLowerCase()
 94    if (ext === '.pdf') {
 95      return extractPdfText(buffer, filePathHint)
 96    }
 97  
 98    if (isKnowledgeTextFile(filename)) {
 99      return normalizeKnowledgeContent(buffer.toString('utf-8'))
100    }
101  
102    return normalizeKnowledgeContent(
103      `[Binary file: ${filename}]${filePathHint ? `\n\nSaved at: ${filePathHint}` : ''}`,
104    )
105  }
106  
/**
 * Reads a file from disk and converts it into normalized knowledge text.
 *
 * The file size is checked with `stat` before reading, so an oversized file
 * is rejected without first loading it into memory.
 *
 * @param filePath - Path of the file to read.
 * @param filename - Optional display name used for type detection; defaults
 *   to the base name of `filePath` (also when an empty string is passed).
 * @returns The normalized text extracted from the file.
 * @throws Error when the file exceeds `MAX_KNOWLEDGE_IMPORT_BYTES`; file
 *   system errors from `stat` / `readFile` propagate unchanged.
 */
export async function extractKnowledgeTextFromFile(filePath: string, filename?: string): Promise<string> {
  const { size } = await fs.promises.stat(filePath)
  if (size > MAX_KNOWLEDGE_IMPORT_BYTES) {
    // Same error the buffer-based path raises, just raised before the read.
    throw new Error('File too large. Maximum 10MB.')
  }

  const buffer = await fs.promises.readFile(filePath)
  return extractKnowledgeTextFromBuffer(buffer, filename || path.basename(filePath), filePath)
}
111  
/** Cancels an unread response body, ignoring failures, so it is not left pending. */
async function discardKnowledgeResponseBody(response: Response): Promise<void> {
  try {
    await response.body?.cancel()
  } catch {
    // Best effort only: the caller is already on an error path.
  }
}

/**
 * Reads a response body while enforcing `MAX_KNOWLEDGE_IMPORT_BYTES` on the
 * bytes actually received, so a missing or wrong `content-length` header
 * cannot cause an unbounded download.
 */
async function readKnowledgeResponseBody(response: Response): Promise<Buffer> {
  if (!response.body) {
    const whole = Buffer.from(await response.arrayBuffer())
    if (whole.length > MAX_KNOWLEDGE_IMPORT_BYTES) {
      throw new Error('Remote document is too large. Maximum 10MB.')
    }
    return whole
  }

  const reader = response.body.getReader()
  const chunks: Uint8Array[] = []
  let received = 0

  for (;;) {
    const { done, value } = await reader.read()
    if (done) break
    received += value.byteLength
    if (received > MAX_KNOWLEDGE_IMPORT_BYTES) {
      await reader.cancel().catch(() => undefined)
      throw new Error('Remote document is too large. Maximum 10MB.')
    }
    chunks.push(value)
  }

  return Buffer.concat(chunks)
}

/** True when the URL's path (ignoring query/fragment) or the raw string ends in `.pdf`. */
function knowledgeUrlLooksLikePdf(sourceUrl: string): boolean {
  const raw = sourceUrl.toLowerCase()
  if (raw.endsWith('.pdf')) return true
  try {
    return new URL(sourceUrl).pathname.toLowerCase().endsWith('.pdf')
  } catch {
    return false
  }
}

/**
 * Downloads a remote document and converts it into normalized knowledge text.
 *
 * PDFs (by content type or URL) go through PDF extraction; HTML (by content
 * type or by sniffing for `<html>` / `<body>`) is reduced to readable text;
 * anything else is treated as plain text decoded as UTF-8.
 *
 * NOTE(review): `sourceUrl` is fetched as given, with no host restriction and
 * no timeout. If callers pass user-supplied URLs, confirm that internal
 * addresses are filtered upstream.
 *
 * @param sourceUrl - URL of the document to fetch.
 * @returns The extracted title (HTML only, otherwise `null`), the normalized
 *   content, and the response's `content-type` header (or `null`).
 * @throws Error when the response status is not OK or the document exceeds
 *   `MAX_KNOWLEDGE_IMPORT_BYTES`; network errors from `fetch` propagate.
 */
export async function extractKnowledgeTextFromUrl(sourceUrl: string): Promise<{
  title: string | null
  content: string
  contentType: string | null
}> {
  const response = await fetch(sourceUrl, {
    headers: {
      'user-agent': 'SwarmClaw/knowledge-import',
      accept: 'text/html, text/plain, application/json, application/pdf, */*',
    },
  })

  if (!response.ok) {
    await discardKnowledgeResponseBody(response)
    throw new Error(`URL fetch failed (${response.status})`)
  }

  const contentType = response.headers.get('content-type')

  // Fast rejection when the server declares an oversized body up front.
  const contentLength = Number.parseInt(response.headers.get('content-length') || '', 10)
  if (Number.isFinite(contentLength) && contentLength > MAX_KNOWLEDGE_IMPORT_BYTES) {
    await discardKnowledgeResponseBody(response)
    throw new Error('Remote document is too large. Maximum 10MB.')
  }

  // The header is optional and untrusted, so the limit is enforced again on
  // the bytes actually received.
  const body = await readKnowledgeResponseBody(response)

  // Media types are case-insensitive.
  const mediaType = (contentType || '').toLowerCase()

  if (mediaType.includes('application/pdf') || knowledgeUrlLooksLikePdf(sourceUrl)) {
    return {
      title: null,
      content: await extractPdfText(body, sourceUrl),
      contentType,
    }
  }

  // Decoded as UTF-8, matching what `response.text()` does.
  const text = new TextDecoder().decode(body)
  const looksLikeHtml = mediaType.includes('text/html') || /<html[\s>]|<body[\s>]/i.test(text)
  if (looksLikeHtml) {
    const parsed = htmlToReadableText(text)
    return {
      title: parsed.title,
      content: parsed.content,
      contentType,
    }
  }

  return {
    title: null,
    content: normalizeKnowledgeContent(text),
    contentType,
  }
}