// src/utils/document-parser.ts
  1  import path from "path";
  2  import { promises as fs } from "fs";
  3  import type { DocumentParseResult } from "../types/index.js";
  4  
// Lazy-loaded parsers (imported only when needed).
// Each binding starts as null and is filled in by the first parse call that
// needs it, so a library is only loaded when its format is actually parsed.
let pdfParse: any = null; // pdf-parse export: the PDFParse class or the default function
let mammoth: any = null; // module object returned by import("mammoth")
let officeParser: any = null; // officeparser's parseOfficeAsync function
  9  
 10  const DOCUMENT_EXTENSIONS = [
 11    ".pdf",
 12    ".docx",
 13    ".pptx",
 14    ".xlsx",
 15    ".odt",
 16    ".odp",
 17    ".ods",
 18  ] as const;
 19  
 20  /**
 21   * Checks if file is a supported document format
 22   */
 23  export function isDocumentFile(filePath: string): boolean {
 24    const ext = path.extname(filePath).toLowerCase();
 25    return DOCUMENT_EXTENSIONS.includes(ext as any);
 26  }
 27  
 28  /**
 29   * Parse document with automatic format detection and fallback strategy
 30   */
 31  export async function parseDocument(
 32    filePath: string
 33  ): Promise<DocumentParseResult> {
 34    const ext = path.extname(filePath).toLowerCase();
 35    const stats = await fs.stat(filePath);
 36  
 37    // File size validation
 38    const MAX_SIZE = 50 * 1024 * 1024; // 50MB
 39    if (stats.size > MAX_SIZE) {
 40      throw new Error(
 41        `Document too large (${(stats.size / 1024 / 1024).toFixed(1)}MB). ` +
 42          `Maximum: 50MB`
 43      );
 44    }
 45  
 46    // Check for legacy .doc format
 47    if (ext === ".doc") {
 48      throw new Error(
 49        "Legacy .doc format not supported. Convert to .docx:\n" +
 50          "- Microsoft Word: File > Save As > Word Document (.docx)\n" +
 51          "- LibreOffice: File > Save As > Word 2007-365 (.docx)\n" +
 52          "- Online: https://cloudconvert.com/doc-to-docx"
 53      );
 54    }
 55  
 56    try {
 57      switch (ext) {
 58        case ".pdf":
 59          return await parsePDF(filePath);
 60  
 61        case ".docx":
 62          return await parseDOCX(filePath);
 63  
 64        case ".pptx":
 65        case ".xlsx":
 66        case ".odt":
 67        case ".odp":
 68        case ".ods":
 69          return await parseOfficeDocument(filePath, ext);
 70  
 71        default:
 72          throw new Error(`Unsupported document format: ${ext}`);
 73      }
 74    } catch (error) {
 75      // Fallback to officeparser for Office formats only (NOT PDF)
 76      if ([".docx", ".pptx", ".xlsx", ".odt", ".odp", ".ods"].includes(ext)) {
 77        try {
 78          return await parseOfficeDocument(filePath, ext);
 79        } catch (fallbackError) {
 80          throw createUserFriendlyError(
 81            filePath,
 82            ext,
 83            error instanceof Error ? error : new Error(String(error))
 84          );
 85        }
 86      }
 87      throw error;
 88    }
 89  }
 90  
 91  /**
 92   * Parse PDF using pdf-parse
 93   */
 94  async function parsePDF(filePath: string): Promise<DocumentParseResult> {
 95    // Lazy load pdf-parse
 96    if (!pdfParse) {
 97      const module = await import("pdf-parse");
 98      // Handle both ESM (PDFParse) and CommonJS (default function) exports
 99      pdfParse = (module as any).PDFParse || (module as any).default;
100    }
101  
102    const buffer = await fs.readFile(filePath);
103  
104    // Check if it's a class or a function
105    const data =
106      typeof pdfParse === "function" && !pdfParse.prototype?.constructor
107        ? await pdfParse(buffer) // Function-style API
108        : await new pdfParse({ data: buffer }).getText(); // Class-style API
109  
110    return {
111      text: data.text,
112      metadata: {
113        pages: data.numpages || data.info?.total,
114        author: data.info?.Author,
115        title: data.info?.Title,
116        format: "PDF",
117      },
118      parser: "pdf-parse",
119    };
120  }
121  
/**
 * Parse a DOCX file with mammoth (the primary DOCX parser), producing
 * Markdown so that headings, lists and emphasis survive extraction.
 *
 * The mammoth module is imported on first use and cached in the module-level
 * `mammoth` binding.
 *
 * @param filePath - Path of the `.docx` file.
 * @returns The Markdown text with format metadata and the parser name.
 * @throws Whatever mammoth throws for unreadable or malformed files.
 */
async function parseDOCX(filePath: string): Promise<DocumentParseResult> {
  // Lazy load mammoth
  if (!mammoth) {
    mammoth = await import("mammoth");
  }

  // Depending on how the CommonJS package is loaded, its API is either on
  // `.default` or directly on the module object; accept both so a missing
  // `default` does not surface as an opaque TypeError.
  const api = mammoth.default ?? mammoth;

  // Use convertToMarkdown to preserve document structure.
  // NOTE(review): mammoth's README describes Markdown output as deprecated in
  // favour of HTML plus a separate converter — confirm before upgrading.
  const result = await api.convertToMarkdown({
    path: filePath,
  });

  return {
    text: result.value,
    metadata: {
      format: "DOCX (Markdown)",
    },
    parser: "mammoth",
  };
}
144  
/**
 * Extract plain text from an Office or OpenDocument file via officeparser.
 *
 * Serves both as the primary parser for presentation, spreadsheet and
 * OpenDocument formats and as the fallback for DOCX. The `parseOfficeAsync`
 * function is imported on first use and cached in `officeParser`.
 *
 * @param filePath - Path of the document to read.
 * @param ext - Extension including the leading dot; it becomes the `format`
 *   metadata value after upper-casing and dropping the first character.
 * @returns The extracted text with format metadata and the parser name.
 */
async function parseOfficeDocument(
  filePath: string,
  ext: string
): Promise<DocumentParseResult> {
  // Resolve the parser function once and reuse it on later calls
  if (!officeParser) {
    officeParser = (await import("officeparser")).parseOfficeAsync;
  }

  // Keep notes in place, separate lines with "\n", and keep the library
  // from logging errors to the console
  const extractedText = await officeParser(filePath, {
    outputErrorToConsole: false,
    newlineDelimiter: "\n",
    ignoreNotes: false,
    putNotesAtLast: false,
  });

  const formatLabel = ext.toUpperCase().slice(1);

  return {
    text: extractedText,
    metadata: { format: formatLabel },
    parser: "officeparser",
  };
}
175  
/**
 * Error raised when a document cannot be parsed.
 *
 * Carries the path and type of the offending file, plus the underlying
 * parser error when one is available, so callers can report or inspect the
 * root cause.
 */
export class DocumentParseError extends Error {
  /**
   * @param filePath - Path of the document that failed to parse.
   * @param fileType - File type identifier; `createUserFriendlyError` passes
   *   the extension it was given.
   * @param message - Human-readable description of the failure.
   * @param originalError - Underlying parser error, if any.
   */
  constructor(
    public filePath: string,
    public fileType: string,
    message: string,
    public originalError?: Error
  ) {
    super(message);
    this.name = "DocumentParseError";
  }
}

/**
 * Translate a raw parser error into a `DocumentParseError` with a message
 * aimed at end users.
 *
 * The original message is classified by keyword:
 * - "encrypted" / "password" produce the password-protection message;
 * - "corrupted" / "invalid" produce the corrupted-file message;
 * - anything else produces a generic message that embeds the original text.
 *
 * Keywords are matched case-insensitively, so capitalised variants such as
 * "Corrupted zip" or "Password required" are classified as well.
 *
 * @param filePath - Path of the document that failed to parse.
 * @param ext - File extension; shown upper-cased in the message and stored
 *   unchanged as `fileType`.
 * @param error - The error raised by the parser; kept as `originalError`.
 * @returns A `DocumentParseError` wrapping `error`.
 */
function createUserFriendlyError(
  filePath: string,
  ext: string,
  error: Error
): DocumentParseError {
  // Match on a lower-cased copy; the generic branch still reports the
  // message exactly as the parser produced it.
  const reason = error.message.toLowerCase();
  const label = ext.toUpperCase();
  const mentions = (...keywords: string[]): boolean =>
    keywords.some((keyword) => reason.includes(keyword));

  let message: string;
  if (mentions("encrypted", "password")) {
    message =
      `Password-protected ${label} files are not supported. ` +
      `Please remove password protection and try again.`;
  } else if (mentions("corrupted", "invalid")) {
    message = `File appears to be corrupted or is not a valid ${label} document.`;
  } else {
    message = `Failed to parse ${label} document: ${error.message}`;
  }

  return new DocumentParseError(filePath, ext, message, error);
}