// src/tests/document-parser.test.ts
  1  import { describe, test, expect, beforeAll, afterAll } from "@jest/globals";
  2  import { promises as fs } from "fs";
  3  import path from "path";
  4  import os from "os";
  5  import {
  6    isDocumentFile,
  7    parseDocument,
  8    DocumentParseError,
  9  } from "../utils/document-parser.js";
 10  
 11  const TEST_FIXTURES_DIR = path.join(__dirname, "fixtures");
 12  const TEST_WORKSPACE = path.join(os.tmpdir(), `vulcan-test-doc-parser-${Date.now()}-${Math.random().toString(36).substr(2, 9)}`);
 13  const FIXTURES_DIR = path.join(TEST_WORKSPACE, "fixtures");
 14  
 15  // Helper to create test fixtures
 16  async function createTestFixtures() {
 17    try {
 18      // Ensure the test workspace directory exists
 19      await fs.mkdir(TEST_WORKSPACE, { recursive: true });
 20      await fs.mkdir(FIXTURES_DIR, { recursive: true });
 21  
 22      // Create oversized file (>50MB)
 23      const largeContent = Buffer.alloc(51 * 1024 * 1024, "x"); // 51MB
 24      await fs.writeFile(path.join(FIXTURES_DIR, "huge-file.pdf"), largeContent);
 25  
 26      // Create legacy .doc placeholder
 27      await fs.writeFile(path.join(FIXTURES_DIR, "legacy.doc"), "placeholder");
 28  
 29      // Create regular text file
 30      await fs.writeFile(
 31        path.join(FIXTURES_DIR, "text.txt"),
 32        "Plain text content"
 33      );
 34    } catch (error) {
 35      console.error("Failed to create test fixtures:", error);
 36      throw error;
 37    }
 38  }
 39  
 40  async function cleanupTestFixtures() {
 41    try {
 42      await fs.rm(TEST_WORKSPACE, { recursive: true, force: true });
 43    } catch (error) {
 44      // Ignore cleanup errors
 45    }
 46  }
 47  
 48  describe("Document Parser", () => {
 49    beforeAll(async () => {
 50      await createTestFixtures();
 51    });
 52  
 53    afterAll(async () => {
 54      await cleanupTestFixtures();
 55    });
 56  
 57    describe("isDocumentFile", () => {
 58      test("detects PDF files", () => {
 59        expect(isDocumentFile("document.pdf")).toBe(true);
 60        expect(isDocumentFile("DOCUMENT.PDF")).toBe(true);
 61        expect(isDocumentFile("/path/to/file.pdf")).toBe(true);
 62      });
 63  
 64      test("detects DOCX files", () => {
 65        expect(isDocumentFile("doc.docx")).toBe(true);
 66        expect(isDocumentFile("DOC.DOCX")).toBe(true);
 67      });
 68  
 69      test("detects Office files", () => {
 70        expect(isDocumentFile("slides.pptx")).toBe(true);
 71        expect(isDocumentFile("data.xlsx")).toBe(true);
 72        expect(isDocumentFile("document.odt")).toBe(true);
 73        expect(isDocumentFile("presentation.odp")).toBe(true);
 74        expect(isDocumentFile("spreadsheet.ods")).toBe(true);
 75      });
 76  
 77      test("rejects text files", () => {
 78        expect(isDocumentFile("file.txt")).toBe(false);
 79        expect(isDocumentFile("code.js")).toBe(false);
 80        expect(isDocumentFile("style.css")).toBe(false);
 81        expect(isDocumentFile("README.md")).toBe(false);
 82      });
 83  
 84      test("rejects legacy .doc format", () => {
 85        expect(isDocumentFile("legacy.doc")).toBe(false);
 86      });
 87  
 88      test("handles files without extensions", () => {
 89        expect(isDocumentFile("README")).toBe(false);
 90        expect(isDocumentFile("Makefile")).toBe(false);
 91      });
 92    });
 93  
 94    describe("parseDocument", () => {
 95      test("parses PDF with pdf2json or pdf-parse fallback", async () => {
 96        const result = await parseDocument(
 97          path.join(TEST_FIXTURES_DIR, "sample.pdf")
 98        );
 99  
100        expect(result.text).toBeDefined();
101        expect(result.text.length).toBeGreaterThan(0);
102        // Can use either pdf2json (primary) or pdf-parse (fallback)
103        expect(["pdf2json", "pdf-parse"]).toContain(result.parser);
104        expect(result.metadata?.format).toBe("PDF");
105      }, 10000); // 10 second timeout for PDF parsing
106  
107      test("rejects oversized files", async () => {
108        await expect(
109          parseDocument(path.join(FIXTURES_DIR, "huge-file.pdf"))
110        ).rejects.toThrow("too large");
111  
112        await expect(
113          parseDocument(path.join(FIXTURES_DIR, "huge-file.pdf"))
114        ).rejects.toThrow("Maximum: 50MB");
115      });
116  
117      test("rejects legacy .doc format with helpful message", async () => {
118        await expect(
119          parseDocument(path.join(FIXTURES_DIR, "legacy.doc"))
120        ).rejects.toThrow("Legacy .doc format not supported");
121  
122        await expect(
123          parseDocument(path.join(FIXTURES_DIR, "legacy.doc"))
124        ).rejects.toThrow("Convert to .docx");
125      });
126  
127      test("rejects non-existent files", async () => {
128        await expect(
129          parseDocument(path.join(FIXTURES_DIR, "nonexistent.pdf"))
130        ).rejects.toThrow();
131      });
132  
133      test("rejects unsupported document formats", async () => {
134        await expect(
135          parseDocument(path.join(FIXTURES_DIR, "text.txt"))
136        ).rejects.toThrow("Unsupported document format");
137      });
138    });
139  
140    describe("DocumentParseError", () => {
141      test("creates error with correct properties", () => {
142        const originalError = new Error("Original error message");
143        const docError = new DocumentParseError(
144          "/path/to/file.pdf",
145          ".pdf",
146          "Failed to parse PDF",
147          originalError
148        );
149  
150        expect(docError.name).toBe("DocumentParseError");
151        expect(docError.message).toBe("Failed to parse PDF");
152        expect(docError.filePath).toBe("/path/to/file.pdf");
153        expect(docError.fileType).toBe(".pdf");
154        expect(docError.originalError).toBe(originalError);
155      });
156  
157      test("works without original error", () => {
158        const docError = new DocumentParseError(
159          "/path/to/file.pdf",
160          ".pdf",
161          "Failed to parse PDF"
162        );
163  
164        expect(docError.originalError).toBeUndefined();
165      });
166    });
167  });