// document-parser.ts
import path from "path";
import { promises as fs } from "fs";
import type { DocumentParseResult } from "../types/index.js";

// Lazy-loaded parsers (imported only when needed). They stay loosely typed
// because the shape of each module depends on the installed package version
// and on ESM/CommonJS interop.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
let pdfParse: any = null;
// True when `pdfParse` must be instantiated with `new` (class-style API).
let pdfParseIsClass = false;
// eslint-disable-next-line @typescript-eslint/no-explicit-any
let mammoth: any = null;
// eslint-disable-next-line @typescript-eslint/no-explicit-any
let officeParser: any = null;

/** Largest file size accepted by {@link parseDocument}, in bytes. */
const MAX_DOCUMENT_BYTES = 50 * 1024 * 1024; // 50MB

const DOCUMENT_EXTENSIONS = [
  ".pdf",
  ".docx",
  ".pptx",
  ".xlsx",
  ".odt",
  ".odp",
  ".ods",
] as const;

/**
 * Checks if file is a supported document format.
 *
 * Only the (case-insensitive) file extension is inspected; the file itself
 * is never opened.
 *
 * @param filePath - Path or file name to test.
 * @returns `true` when the extension is one of `DOCUMENT_EXTENSIONS`.
 */
export function isDocumentFile(filePath: string): boolean {
  const ext = path.extname(filePath).toLowerCase();
  const supported: readonly string[] = DOCUMENT_EXTENSIONS;
  return supported.includes(ext);
}

/**
 * Parse document with automatic format detection and fallback strategy.
 *
 * - `.pdf` is parsed with pdf-parse; its errors propagate unchanged.
 * - `.docx` is parsed with mammoth and, if that fails, with officeparser.
 * - `.pptx`, `.xlsx`, `.odt`, `.odp`, `.ods` are parsed with officeparser.
 *
 * @param filePath - Path of the document to parse.
 * @returns The extracted text together with format metadata.
 * @throws Error when the file cannot be stat'ed, exceeds the size limit,
 *   is a legacy `.doc`, or has an unsupported extension.
 * @throws DocumentParseError when an Office/OpenDocument file cannot be
 *   parsed by any applicable parser.
 */
export async function parseDocument(
  filePath: string
): Promise<DocumentParseResult> {
  const ext = path.extname(filePath).toLowerCase();
  const stats = await fs.stat(filePath);

  // File size validation
  if (stats.size > MAX_DOCUMENT_BYTES) {
    throw new Error(
      `Document too large (${(stats.size / 1024 / 1024).toFixed(1)}MB). ` +
        `Maximum: ${MAX_DOCUMENT_BYTES / 1024 / 1024}MB`
    );
  }

  // Check for legacy .doc format
  if (ext === ".doc") {
    throw new Error(
      "Legacy .doc format not supported. Convert to .docx:\n" +
        "- Microsoft Word: File > Save As > Word Document (.docx)\n" +
        "- LibreOffice: File > Save As > Word 2007-365 (.docx)\n" +
        "- Online: https://cloudconvert.com/doc-to-docx"
    );
  }

  switch (ext) {
    case ".pdf":
      // No fallback parser exists for PDF, so failures surface as-is.
      return await parsePDF(filePath);

    case ".docx":
      return await parseDocxWithFallback(filePath, ext);

    case ".pptx":
    case ".xlsx":
    case ".odt":
    case ".odp":
    case ".ods":
      // officeparser is already the primary parser for these formats, so
      // retrying it after a failure would only repeat the same work.
      try {
        return await parseOfficeDocument(filePath, ext);
      } catch (error) {
        throw createUserFriendlyError(filePath, ext, toError(error));
      }

    default:
      throw new Error(`Unsupported document format: ${ext}`);
  }
}

/**
 * Parses a DOCX with mammoth and falls back to officeparser when mammoth
 * fails. If both fail, the reported error is built from the primary
 * (mammoth) failure.
 */
async function parseDocxWithFallback(
  filePath: string,
  ext: string
): Promise<DocumentParseResult> {
  try {
    return await parseDOCX(filePath);
  } catch (primaryError) {
    try {
      return await parseOfficeDocument(filePath, ext);
    } catch {
      throw createUserFriendlyError(filePath, ext, toError(primaryError));
    }
  }
}

/** Normalizes an arbitrary thrown value into an `Error` instance. */
function toError(value: unknown): Error {
  return value instanceof Error ? value : new Error(String(value));
}

/**
 * Reports whether `value` is a function declared with `class` syntax.
 *
 * A `prototype.constructor` check cannot be used for this: every ordinary
 * (non-arrow, non-async) function has one too.
 */
function isClassConstructor(value: unknown): boolean {
  return (
    typeof value === "function" &&
    /^class[\s{]/.test(Function.prototype.toString.call(value))
  );
}

/**
 * Parse PDF using pdf-parse.
 *
 * Supports both the class-style API (`new PDFParse({ data }).getText()`)
 * and the function-style API (`pdfParse(buffer)`).
 *
 * @param filePath - Path of the PDF file.
 * @returns Extracted text plus page count, author and title when available.
 * @throws Error when pdf-parse exposes no usable export, or whatever the
 *   parser itself throws.
 */
async function parsePDF(filePath: string): Promise<DocumentParseResult> {
  // Lazy load pdf-parse
  if (!pdfParse) {
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const mod: any = await import("pdf-parse");
    // Handle both ESM (PDFParse) and CommonJS (default function) exports
    if (mod.PDFParse) {
      pdfParse = mod.PDFParse;
      pdfParseIsClass = true;
    } else {
      pdfParse = mod.default;
      pdfParseIsClass = isClassConstructor(mod.default);
    }
  }

  if (typeof pdfParse !== "function") {
    throw new Error("pdf-parse did not expose a usable parser export");
  }

  const buffer = await fs.readFile(filePath);

  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  let data: any;
  if (pdfParseIsClass) {
    // Class-style API
    const parser = new pdfParse({ data: buffer });
    try {
      data = await parser.getText();
    } finally {
      // Release parser resources when the installed version offers destroy().
      if (typeof parser.destroy === "function") {
        await parser.destroy();
      }
    }
  } else {
    // Function-style API
    data = await pdfParse(buffer);
  }

  return {
    text: data.text,
    metadata: {
      // NOTE(review): `data.total` is a fallback for versions that presumably
      // report the page count at the top level of the text result — confirm.
      pages: data.numpages || data.info?.total || data.total,
      author: data.info?.Author,
      title: data.info?.Title,
      format: "PDF",
    },
    parser: "pdf-parse",
  };
}

/**
 * Parse DOCX using mammoth (primary) - WITH MARKDOWN FORMATTING.
 *
 * @param filePath - Path of the DOCX file.
 * @returns The document converted to Markdown text.
 */
async function parseDOCX(filePath: string): Promise<DocumentParseResult> {
  // Lazy load mammoth
  if (!mammoth) {
    mammoth = await import("mammoth");
  }

  // The API may sit on the default export or directly on the namespace,
  // depending on module interop.
  const api = mammoth.default ?? mammoth;

  // Use convertToMarkdown to preserve document structure
  const result = await api.convertToMarkdown({
    path: filePath,
  });

  return {
    text: result.value,
    metadata: {
      format: "DOCX (Markdown)",
    },
    parser: "mammoth",
  };
}

/**
 * Parse Office documents using officeparser (fallback + other formats).
 *
 * @param filePath - Path of the document.
 * @param ext - Lower-case extension including the leading dot; used only to
 *   label the result's `format` (e.g. ".pptx" becomes "PPTX").
 * @returns The plain text extracted by officeparser.
 * @throws Error when officeparser exposes no `parseOfficeAsync`, or whatever
 *   the parser itself throws.
 */
async function parseOfficeDocument(
  filePath: string,
  ext: string
): Promise<DocumentParseResult> {
  // Lazy load officeparser
  if (!officeParser) {
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const mod: any = await import("officeparser");
    officeParser = mod.parseOfficeAsync ?? mod.default?.parseOfficeAsync;
  }

  if (typeof officeParser !== "function") {
    throw new Error("officeparser did not expose parseOfficeAsync");
  }

  const config = {
    outputErrorToConsole: false,
    newlineDelimiter: "\n",
    ignoreNotes: false,
    putNotesAtLast: false,
  };

  const text = await officeParser(filePath, config);

  return {
    text,
    metadata: {
      format: ext.toUpperCase().slice(1),
    },
    parser: "officeparser",
  };
}

/**
 * Custom error class for document parsing errors.
 *
 * Carries the path and extension of the offending file and, when available,
 * the underlying parser error in `originalError`.
 */
export class DocumentParseError extends Error {
  constructor(
    public filePath: string,
    public fileType: string,
    message: string,
    public originalError?: Error
  ) {
    super(message);
    this.name = "DocumentParseError";
  }
}

/**
 * Enhanced error messages for better user experience.
 *
 * Classifies the failure by keywords in the parser's message, matched
 * case-insensitively so that e.g. "Encrypted" and "encrypted" are treated
 * alike.
 *
 * @param filePath - Path of the file that failed to parse.
 * @param ext - File extension including the leading dot.
 * @param error - The underlying parser error.
 * @returns A `DocumentParseError` wrapping `error`.
 */
function createUserFriendlyError(
  filePath: string,
  ext: string,
  error: Error
): DocumentParseError {
  const reason = error.message.toLowerCase();
  const label = ext.toUpperCase();

  if (reason.includes("encrypted") || reason.includes("password")) {
    return new DocumentParseError(
      filePath,
      ext,
      `Password-protected ${label} files are not supported. ` +
        `Please remove password protection and try again.`,
      error
    );
  }

  if (reason.includes("corrupted") || reason.includes("invalid")) {
    return new DocumentParseError(
      filePath,
      ext,
      `File appears to be corrupted or is not a valid ${label} document.`,
      error
    );
  }

  return new DocumentParseError(
    filePath,
    ext,
    `Failed to parse ${label} document: ${error.message}`,
    error
  );
}