knowledge-import.ts
import fs from 'fs'
import path from 'path'
import * as cheerio from 'cheerio'

/**
 * Lower-cased, dot-prefixed extensions whose contents are decoded as UTF-8 text.
 */
const TEXT_EXTS = new Set([
  '.txt', '.md', '.markdown', '.csv', '.tsv', '.json', '.jsonl',
  '.html', '.htm', '.xml', '.yaml', '.yml', '.toml', '.ini', '.cfg',
  '.js', '.ts', '.tsx', '.jsx', '.py', '.go', '.rs', '.java', '.c', '.cpp', '.h',
  '.rb', '.php', '.sh', '.bash', '.zsh', '.sql', '.r', '.swift', '.kt',
  '.env', '.log', '.conf', '.properties', '.gitignore', '.dockerignore',
])

/**
 * Elements after which a line break is inserted before text extraction, so
 * that adjacent blocks in minified markup do not fuse into a single word.
 */
const BLOCK_LEVEL_SELECTOR =
  'p, div, section, article, li, h1, h2, h3, h4, h5, h6, tr, td, th, blockquote, pre'

/** Largest file or download, in bytes, accepted for import (10 * 1024 * 1024). */
export const MAX_KNOWLEDGE_IMPORT_BYTES = 10 * 1024 * 1024

/** Maximum length (in UTF-16 code units) of extracted content before truncation. */
export const MAX_KNOWLEDGE_CONTENT_CHARS = 500_000

/** Result type of the global `fetch`, named without relying on extra globals. */
type FetchResponse = Awaited<ReturnType<typeof fetch>>

/**
 * Reports whether a file should be decoded as text, judged by its extension.
 *
 * Files for which `path.extname` yields an empty string are also treated as
 * text; this includes extension-less names and dotfiles such as `.gitignore`.
 *
 * @param filename - File name or path; only its extension is inspected.
 * @returns `true` when the extension is in {@link TEXT_EXTS} or is empty.
 */
export function isKnowledgeTextFile(filename: string): boolean {
  const ext = path.extname(filename).toLowerCase()
  return TEXT_EXTS.has(ext) || ext === ''
}

/**
 * Builds a human-readable title from a file name.
 *
 * The extension is dropped, runs of `-`/`_` become single spaces, camelCase
 * boundaries are split, and the first character of every word is upper-cased.
 *
 * @param filename - File name or path to derive the title from.
 * @returns The derived title, or `'Knowledge Source'` when nothing remains.
 */
export function deriveKnowledgeTitle(filename: string): string {
  const name = path.basename(filename, path.extname(filename))
  const title = name
    .replace(/[-_]+/g, ' ')
    .replace(/([a-z])([A-Z])/g, '$1 $2')
    .replace(/\b\w/g, (char) => char.toUpperCase())
    .trim()
  return title || 'Knowledge Source'
}

/**
 * Canonicalises extracted text: strips a leading BOM, converts CRLF to LF,
 * trims surrounding whitespace and caps the length at
 * {@link MAX_KNOWLEDGE_CONTENT_CHARS}, appending a truncation marker.
 *
 * @param content - Raw text; falsy values are treated as the empty string.
 * @returns The normalised (and possibly truncated) text.
 */
function normalizeKnowledgeContent(content: string): string {
  const normalized = String(content || '')
    .replace(/^\uFEFF/, '')
    .replace(/\r\n/g, '\n')
    .trim()

  if (normalized.length <= MAX_KNOWLEDGE_CONTENT_CHARS) return normalized

  // Never cut between the two halves of a surrogate pair: a lone high
  // surrogate is invalid UTF-16 and turns into U+FFFD when encoded later.
  let cut = MAX_KNOWLEDGE_CONTENT_CHARS
  const lastUnit = normalized.charCodeAt(cut - 1)
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) cut -= 1

  return `${normalized.slice(0, cut)}\n\n[... truncated at 500k characters]`
}

/**
 * Extracts text from a PDF using the optional `pdf-parse` package.
 *
 * This is deliberately best-effort: if the package cannot be loaded or parsing
 * fails, a placeholder description is returned instead of throwing.
 *
 * @param buffer - Raw PDF bytes.
 * @param filePathHint - Optional location included in the placeholder text.
 * @returns Normalised extracted text, or the placeholder on failure.
 */
async function extractPdfText(buffer: Buffer, filePathHint?: string): Promise<string> {
  try {
    const pdfParseModule = await import('pdf-parse') as unknown as {
      default?: (input: Buffer) => Promise<{ text?: string }>
    }
    const pdfParse = pdfParseModule.default
    if (typeof pdfParse !== 'function') throw new Error('pdf-parse loader unavailable')
    const result = await pdfParse(buffer)
    return normalizeKnowledgeContent(result.text || '')
  } catch {
    // Intentional fallback: an unreadable PDF should not abort the import.
    return normalizeKnowledgeContent(
      `[PDF document]\n\nUnable to extract text automatically.${filePathHint ? `\n\nSaved at: ${filePathHint}` : ''}`,
    )
  }
}

/**
 * Converts an HTML document into plain readable text.
 *
 * Non-content elements (scripts, styles, navigation, header/footer, ...) are
 * removed. Text is taken from the first `<main>`, else the first `<article>`,
 * else `<body>`, else `<html>`. Each non-empty line becomes its own paragraph.
 *
 * @param html - HTML markup.
 * @returns The `<title>` text (or `null` when empty) and the normalised body text.
 */
function htmlToReadableText(html: string): { title: string | null; content: string } {
  const $ = cheerio.load(html)
  $('script, style, noscript, svg, nav, footer, header').remove()

  const title = $('title').first().text().trim() || null

  // `.text()` joins text nodes with no separator, so make line breaks and
  // block boundaries explicit first; blank lines are filtered out below.
  $('br').replaceWith('\n')
  $(BLOCK_LEVEL_SELECTOR).append('\n')

  let root = $('html').first()
  for (const selector of ['main', 'article', 'body']) {
    const candidate = $(selector).first()
    if (candidate.length) {
      root = candidate
      break
    }
  }

  const text = root
    .text()
    .replace(/\u00a0/g, ' ')
    .split('\n')
    .map((line) => line.trim())
    .filter(Boolean)
    .join('\n\n')

  return {
    title,
    content: normalizeKnowledgeContent(text),
  }
}

/**
 * Extracts normalised text from an in-memory file.
 *
 * `.pdf` files go through PDF extraction, recognised text files are decoded as
 * UTF-8, and anything else yields a short `[Binary file: ...]` placeholder.
 *
 * @param buffer - File contents.
 * @param filename - Name used to pick the extraction strategy.
 * @param filePathHint - Optional location mentioned in placeholder output.
 * @returns The extracted text; the empty string for an empty buffer.
 * @throws Error when the buffer exceeds {@link MAX_KNOWLEDGE_IMPORT_BYTES}.
 */
export async function extractKnowledgeTextFromBuffer(
  buffer: Buffer,
  filename: string,
  filePathHint?: string,
): Promise<string> {
  if (buffer.length === 0) return ''
  if (buffer.length > MAX_KNOWLEDGE_IMPORT_BYTES) {
    throw new Error('File too large. Maximum 10MB.')
  }

  const ext = path.extname(filename).toLowerCase()
  if (ext === '.pdf') {
    return extractPdfText(buffer, filePathHint)
  }

  if (isKnowledgeTextFile(filename)) {
    return normalizeKnowledgeContent(buffer.toString('utf-8'))
  }

  return normalizeKnowledgeContent(
    `[Binary file: ${filename}]${filePathHint ? `\n\nSaved at: ${filePathHint}` : ''}`,
  )
}

/**
 * Reads a file from disk and extracts its text.
 *
 * @param filePath - Path of the file to read.
 * @param filename - Optional display name; defaults to the base name of `filePath`.
 * @returns The extracted text.
 * @throws Error when the file exceeds {@link MAX_KNOWLEDGE_IMPORT_BYTES}, plus
 *   any error raised by the underlying file-system calls.
 */
export async function extractKnowledgeTextFromFile(filePath: string, filename?: string): Promise<string> {
  // Reject oversized files before loading them into memory.
  const stats = await fs.promises.stat(filePath)
  if (stats.size > MAX_KNOWLEDGE_IMPORT_BYTES) {
    throw new Error('File too large. Maximum 10MB.')
  }

  const buffer = await fs.promises.readFile(filePath)
  return extractKnowledgeTextFromBuffer(buffer, filename || path.basename(filePath), filePath)
}

/**
 * Reads a response body while enforcing a hard byte limit.
 *
 * The body is consumed chunk by chunk so that a response without (or with a
 * misleading) `content-length` header cannot exhaust memory.
 *
 * @param response - Response whose body has not been consumed yet.
 * @param maxBytes - Maximum number of bytes accepted.
 * @returns The complete body.
 * @throws Error when the body is larger than `maxBytes`.
 */
async function readBodyWithLimit(response: FetchResponse, maxBytes: number): Promise<Buffer> {
  const stream = response.body
  if (!stream) {
    const whole = Buffer.from(await response.arrayBuffer())
    if (whole.length > maxBytes) {
      throw new Error('Remote document is too large. Maximum 10MB.')
    }
    return whole
  }

  const reader = stream.getReader()
  const chunks: Uint8Array[] = []
  let total = 0
  for (;;) {
    const { done, value } = await reader.read()
    if (done) break
    total += value.byteLength
    if (total > maxBytes) {
      // Stop the download; a failure to cancel must not mask the size error.
      await reader.cancel().catch(() => undefined)
      throw new Error('Remote document is too large. Maximum 10MB.')
    }
    chunks.push(value)
  }
  return Buffer.concat(chunks, total)
}

/**
 * Reports whether a URL appears to point at a PDF, judged by the raw string or
 * by its path component (so `doc.pdf?download=1` and `doc.pdf#page=2` match).
 *
 * @param sourceUrl - URL as supplied by the caller.
 */
function urlLooksLikePdf(sourceUrl: string): boolean {
  if (sourceUrl.toLowerCase().endsWith('.pdf')) return true
  try {
    return new URL(sourceUrl).pathname.toLowerCase().endsWith('.pdf')
  } catch {
    return false
  }
}

/**
 * Downloads a document and extracts its text.
 *
 * PDFs (detected by content type or URL) go through PDF extraction, HTML is
 * reduced to readable text, and everything else is treated as plain text.
 *
 * NOTE(review): the URL is fetched as given with no host or scheme filtering
 * in this function; confirm that callers validate untrusted URLs.
 *
 * @param sourceUrl - URL to fetch.
 * @returns The page title (HTML only, otherwise `null`), the extracted content
 *   and the raw `content-type` header value (or `null`).
 * @throws Error when the response status is not OK or the document exceeds
 *   {@link MAX_KNOWLEDGE_IMPORT_BYTES}.
 */
export async function extractKnowledgeTextFromUrl(sourceUrl: string): Promise<{
  title: string | null
  content: string
  contentType: string | null
}> {
  const response = await fetch(sourceUrl, {
    headers: {
      'user-agent': 'SwarmClaw/knowledge-import',
      accept: 'text/html, text/plain, application/json, application/pdf, */*',
    },
  })

  if (!response.ok) {
    throw new Error(`URL fetch failed (${response.status})`)
  }

  const contentType = response.headers.get('content-type')
  const mime = (contentType || '').toLowerCase()

  // Fast rejection when the server declares the size up front.
  const contentLength = Number.parseInt(response.headers.get('content-length') || '', 10)
  if (Number.isFinite(contentLength) && contentLength > MAX_KNOWLEDGE_IMPORT_BYTES) {
    throw new Error('Remote document is too large. Maximum 10MB.')
  }

  // The header may be absent or wrong, so the limit is enforced on the
  // actual bytes as well.
  const buffer = await readBodyWithLimit(response, MAX_KNOWLEDGE_IMPORT_BYTES)

  if (mime.includes('application/pdf') || urlLooksLikePdf(sourceUrl)) {
    return {
      title: null,
      content: await extractPdfText(buffer, sourceUrl),
      contentType,
    }
  }

  // Decode as UTF-8 and drop a leading BOM, as `response.text()` would.
  const text = buffer.toString('utf-8').replace(/^\uFEFF/, '')
  const looksLikeHtml = mime.includes('text/html') || /<html[\s>]|<body[\s>]/i.test(text)
  if (looksLikeHtml) {
    const parsed = htmlToReadableText(text)
    return {
      title: parsed.title,
      content: parsed.content,
      contentType,
    }
  }

  return {
    title: null,
    content: normalizeKnowledgeContent(text),
    contentType,
  }
}