document-parser.test.ts
import { describe, test, expect, beforeAll, afterAll } from "@jest/globals";
import { promises as fs } from "fs";
import path from "path";
import os from "os";
import {
  isDocumentFile,
  parseDocument,
  DocumentParseError,
} from "../utils/document-parser.js";

/** Checked-in fixtures that sit next to this test file (used for sample.pdf). */
const TEST_FIXTURES_DIR = path.join(__dirname, "fixtures");

/**
 * Per-run scratch directory under the OS temp dir. The timestamp plus a
 * 9-character random base-36 suffix keeps concurrent runs from colliding.
 */
const TEST_WORKSPACE = path.join(
  os.tmpdir(),
  `vulcan-test-doc-parser-${Date.now()}-${Math.random().toString(36).slice(2, 11)}`
);

/** Directory holding the fixtures generated at runtime by createTestFixtures. */
const FIXTURES_DIR = path.join(TEST_WORKSPACE, "fixtures");

/**
 * Size of the generated "huge" fixture: 51MB, i.e. just above the 50MB
 * maximum that the oversized-file test expects the parser to report.
 */
const OVERSIZED_FILE_BYTES = 51 * 1024 * 1024;

/**
 * Creates the generated fixtures used by the parseDocument tests:
 * an oversized PDF, a legacy .doc placeholder and a plain text file.
 *
 * @throws Re-throws any filesystem error after logging it, so the suite
 *         fails in beforeAll instead of producing misleading test failures.
 */
async function createTestFixtures(): Promise<void> {
  try {
    // `recursive: true` also creates TEST_WORKSPACE, the parent directory.
    await fs.mkdir(FIXTURES_DIR, { recursive: true });

    // Create oversized file (>50MB)
    const largeContent = Buffer.alloc(OVERSIZED_FILE_BYTES, "x");
    await fs.writeFile(path.join(FIXTURES_DIR, "huge-file.pdf"), largeContent);

    // Create legacy .doc placeholder
    await fs.writeFile(path.join(FIXTURES_DIR, "legacy.doc"), "placeholder");

    // Create regular text file
    await fs.writeFile(
      path.join(FIXTURES_DIR, "text.txt"),
      "Plain text content"
    );
  } catch (error) {
    console.error("Failed to create test fixtures:", error);
    throw error;
  }
}

/**
 * Removes the per-run scratch directory and everything in it.
 *
 * Cleanup is best-effort: a failure must not fail the suite. `force: true`
 * already ignores a missing directory, so anything caught here is a real
 * problem (e.g. permissions or a locked file) and is reported as a warning
 * rather than silently discarded.
 */
async function cleanupTestFixtures(): Promise<void> {
  try {
    await fs.rm(TEST_WORKSPACE, { recursive: true, force: true });
  } catch (error: unknown) {
    console.warn(
      `Could not remove test workspace ${TEST_WORKSPACE}:`,
      error
    );
  }
}

describe("Document Parser", () => {
  beforeAll(async () => {
    await createTestFixtures();
  });

  afterAll(async () => {
    await cleanupTestFixtures();
  });

  // Extension-based detection only: these paths never touch the filesystem.
  describe("isDocumentFile", () => {
    test("detects PDF files", () => {
      expect(isDocumentFile("document.pdf")).toBe(true);
      expect(isDocumentFile("DOCUMENT.PDF")).toBe(true);
      expect(isDocumentFile("/path/to/file.pdf")).toBe(true);
    });

    test("detects DOCX files", () => {
      expect(isDocumentFile("doc.docx")).toBe(true);
      expect(isDocumentFile("DOC.DOCX")).toBe(true);
    });

    test("detects Office files", () => {
      expect(isDocumentFile("slides.pptx")).toBe(true);
      expect(isDocumentFile("data.xlsx")).toBe(true);
      expect(isDocumentFile("document.odt")).toBe(true);
      expect(isDocumentFile("presentation.odp")).toBe(true);
      expect(isDocumentFile("spreadsheet.ods")).toBe(true);
    });

    test("rejects text files", () => {
      expect(isDocumentFile("file.txt")).toBe(false);
      expect(isDocumentFile("code.js")).toBe(false);
      expect(isDocumentFile("style.css")).toBe(false);
      expect(isDocumentFile("README.md")).toBe(false);
    });

    test("rejects legacy .doc format", () => {
      expect(isDocumentFile("legacy.doc")).toBe(false);
    });

    test("handles files without extensions", () => {
      expect(isDocumentFile("README")).toBe(false);
      expect(isDocumentFile("Makefile")).toBe(false);
    });
  });

  describe("parseDocument", () => {
    test("parses PDF with pdf2json or pdf-parse fallback", async () => {
      // sample.pdf is a checked-in fixture, not one generated in beforeAll.
      const result = await parseDocument(
        path.join(TEST_FIXTURES_DIR, "sample.pdf")
      );

      expect(result.text).toBeDefined();
      expect(result.text.length).toBeGreaterThan(0);
      // Can use either pdf2json (primary) or pdf-parse (fallback)
      expect(["pdf2json", "pdf-parse"]).toContain(result.parser);
      expect(result.metadata?.format).toBe("PDF");
    }, 10000); // 10 second timeout for PDF parsing

    test("rejects oversized files", async () => {
      const hugeFile = path.join(FIXTURES_DIR, "huge-file.pdf");

      await expect(parseDocument(hugeFile)).rejects.toThrow("too large");

      await expect(parseDocument(hugeFile)).rejects.toThrow("Maximum: 50MB");
    });

    test("rejects legacy .doc format with helpful message", async () => {
      const legacyFile = path.join(FIXTURES_DIR, "legacy.doc");

      await expect(parseDocument(legacyFile)).rejects.toThrow(
        "Legacy .doc format not supported"
      );

      await expect(parseDocument(legacyFile)).rejects.toThrow(
        "Convert to .docx"
      );
    });

    test("rejects non-existent files", async () => {
      await expect(
        parseDocument(path.join(FIXTURES_DIR, "nonexistent.pdf"))
      ).rejects.toThrow();
    });

    test("rejects unsupported document formats", async () => {
      await expect(
        parseDocument(path.join(FIXTURES_DIR, "text.txt"))
      ).rejects.toThrow("Unsupported document format");
    });
  });

  describe("DocumentParseError", () => {
    test("creates error with correct properties", () => {
      const originalError = new Error("Original error message");
      const docError = new DocumentParseError(
        "/path/to/file.pdf",
        ".pdf",
        "Failed to parse PDF",
        originalError
      );

      expect(docError.name).toBe("DocumentParseError");
      expect(docError.message).toBe("Failed to parse PDF");
      expect(docError.filePath).toBe("/path/to/file.pdf");
      expect(docError.fileType).toBe(".pdf");
      expect(docError.originalError).toBe(originalError);
    });

    test("works without original error", () => {
      const docError = new DocumentParseError(
        "/path/to/file.pdf",
        ".pdf",
        "Failed to parse PDF"
      );

      expect(docError.originalError).toBeUndefined();
    });
  });
});