Cradicle Explorer

/ src / tests / html-conversion.test.ts
html-conversion.test.ts
  1  /**
  2   * Tests for HTML-to-Document conversion utilities
  3   */
  4  
  5  import { describe, test, expect } from "@jest/globals";
  6  import {
  7    isHTMLContent,
  8    convertHTMLToPDF,
  9    convertHTMLToDOCX,
 10  } from "../utils/html-to-document.js";
 11  
 12  describe("HTML Content Detection", () => {
 13    test("detects simple HTML with html tag", () => {
 14      const content = "<html><body><p>Test</p></body></html>";
 15      expect(isHTMLContent(content)).toBe(true);
 16    });
 17  
 18    test("detects HTML with body tag only", () => {
 19      const content = "<body><p>Test</p></body>";
 20      expect(isHTMLContent(content)).toBe(true);
 21    });
 22  
 23    test("detects HTML with heading tags", () => {
 24      expect(isHTMLContent("<h1>Title</h1>")).toBe(true);
 25      expect(isHTMLContent("<h2>Subtitle</h2>")).toBe(true);
 26      expect(isHTMLContent("<h3>Section</h3>")).toBe(true);
 27    });
 28  
 29    test("detects HTML with table tags", () => {
 30      expect(isHTMLContent("<table><tr><td>Test</td></tr></table>")).toBe(true);
 31    });
 32  
 33    test("detects HTML with list tags", () => {
 34      expect(isHTMLContent("<ul><li>Item</li></ul>")).toBe(true);
 35      expect(isHTMLContent("<ol><li>Item</li></ol>")).toBe(true);
 36    });
 37  
 38    test("detects HTML with common tags", () => {
 39      expect(isHTMLContent("<p>Paragraph</p>")).toBe(true);
 40      expect(isHTMLContent("<div>Division</div>")).toBe(true);
 41      expect(isHTMLContent("<table><tr><td>Cell</td></tr></table>")).toBe(true);
 42      expect(isHTMLContent("<ul><li>Item</li></ul>")).toBe(true);
 43      expect(isHTMLContent("<ol><li>Item</li></ol>")).toBe(true);
 44    });
 45  
 46    test("detects HTML with formatting tags", () => {
 47      expect(isHTMLContent("<strong>Bold</strong>")).toBe(true);
 48      expect(isHTMLContent("<em>Italic</em>")).toBe(true);
 49      expect(isHTMLContent("<span>Text</span>")).toBe(true);
 50    });
 51  
 52    test("detects HTML with self-closing tags", () => {
 53      expect(isHTMLContent("Line 1<br>Line 2")).toBe(true);
 54      expect(isHTMLContent("Line 1<br/>Line 2")).toBe(true);
 55    });
 56  
 57    test("does not detect plain text as HTML", () => {
 58      expect(isHTMLContent("This is plain text")).toBe(false);
 59      expect(isHTMLContent("Line 1\nLine 2\nLine 3")).toBe(false);
 60      expect(isHTMLContent("Some text with numbers 123")).toBe(false);
 61    });
 62  
 63    test("handles empty or whitespace content", () => {
 64      expect(isHTMLContent("")).toBe(false);
 65      expect(isHTMLContent("   ")).toBe(false);
 66      expect(isHTMLContent("\n\n")).toBe(false);
 67    });
 68  
 69    test("handles mixed content with HTML-like text", () => {
 70      // Text that looks like HTML but isn't really (no actual tags)
 71      expect(isHTMLContent("Price < 100 and > 50")).toBe(false);
 72      expect(isHTMLContent("Use <brackets> for grouping")).toBe(false);
 73    });
 74  
 75    test("case insensitive detection", () => {
 76      expect(isHTMLContent("<HTML><BODY>Test</BODY></HTML>")).toBe(true);
 77      expect(isHTMLContent("<Html><Body>Test</Body></Html>")).toBe(true);
 78    });
 79  });
 80  
 81  describe("HTML to PDF Conversion", () => {
 82    test("converts simple HTML to PDF buffer", async () => {
 83      const html =
 84        "<html><body><h1>Test Document</h1><p>This is a test.</p></body></html>";
 85      const buffer = await convertHTMLToPDF(html);
 86  
 87      expect(buffer).toBeInstanceOf(Buffer);
 88      expect(buffer.length).toBeGreaterThan(0);
 89  
 90      // Check PDF magic number (starts with %PDF)
 91      const header = buffer.toString("ascii", 0, 4);
 92      expect(header).toBe("%PDF");
 93    }, 10000);
 94  
 95    test("converts HTML with table to PDF", async () => {
 96      const html = `
 97        <html>
 98          <body>
 99            <h2>Data Table</h2>
100            <table>
101              <thead>
102                <tr>
103                  <th>Name</th>
104                  <th>Value</th>
105                </tr>
106              </thead>
107              <tbody>
108                <tr>
109                  <td>Item 1</td>
110                  <td>100</td>
111                </tr>
112                <tr>
113                  <td>Item 2</td>
114                  <td>200</td>
115                </tr>
116              </tbody>
117            </table>
118          </body>
119        </html>
120      `;
121  
122      const buffer = await convertHTMLToPDF(html);
123      expect(buffer).toBeInstanceOf(Buffer);
124      expect(buffer.length).toBeGreaterThan(0);
125    }, 10000);
126  
127    test("converts HTML with styled elements", async () => {
128      const html = `
129        <html>
130          <body>
131            <h1 style="color: #2c3e50;">Styled Title</h1>
132            <p style="text-align: center;">Centered paragraph</p>
133            <p><strong>Bold text</strong> and <em>italic text</em></p>
134            <ul>
135              <li>List item 1</li>
136              <li>List item 2</li>
137            </ul>
138          </body>
139        </html>
140      `;
141  
142      const buffer = await convertHTMLToPDF(html);
143      expect(buffer).toBeInstanceOf(Buffer);
144      expect(buffer.length).toBeGreaterThan(0);
145    }, 10000);
146  
147    test("includes metadata in PDF", async () => {
148      const html = "<html><body><h1>Test</h1></body></html>";
149      const buffer = await convertHTMLToPDF(html, {
150        title: "Test Document",
151        author: "Test Author",
152        subject: "Test Subject",
153        keywords: ["test", "pdf"],
154      });
155  
156      expect(buffer).toBeInstanceOf(Buffer);
157      expect(buffer.length).toBeGreaterThan(0);
158    }, 10000);
159  
160    test("handles complex HTML structure", async () => {
161      const html = `
162        <html>
163          <head>
164            <style>
165              body { font-family: Arial, sans-serif; }
166              .header { color: #2c3e50; }
167              .content { margin: 20px; }
168            </style>
169          </head>
170          <body>
171            <div class="header">
172              <h1>Invoice #12345</h1>
173              <p>Date: 2025-10-21</p>
174            </div>
175            <div class="content">
176              <h2>Items</h2>
177              <table style="border-collapse: collapse;">
178                <tr style="background-color: #ecf0f1;">
179                  <th style="border: 1px solid #bdc3c7; padding: 8px;">Item</th>
180                  <th style="border: 1px solid #bdc3c7; padding: 8px;">Price</th>
181                </tr>
182                <tr>
183                  <td style="border: 1px solid #bdc3c7; padding: 8px;">Widget A</td>
184                  <td style="border: 1px solid #bdc3c7; padding: 8px;">$20.00</td>
185                </tr>
186              </table>
187              <p style="text-align: right; font-weight: bold;">Total: $20.00</p>
188            </div>
189          </body>
190        </html>
191      `;
192  
193      const buffer = await convertHTMLToPDF(html);
194      expect(buffer).toBeInstanceOf(Buffer);
195      expect(buffer.length).toBeGreaterThan(1000); // Should be substantial
196    }, 10000);
197  
198    test("throws error for invalid HTML that causes conversion failure", async () => {
199      // Note: html-to-pdfmake is quite forgiving, so we'd need truly broken HTML
200      // For now, just verify that error handling exists
201      try {
202        await convertHTMLToPDF("<html><body>");
203        // If it doesn't throw, that's fine - the library is forgiving
204        expect(true).toBe(true);
205      } catch (error) {
206        expect(error).toBeInstanceOf(Error);
207        expect((error as Error).message).toContain(
208          "Failed to convert HTML to PDF"
209        );
210      }
211    }, 10000);
212  });
213  
/**
 * Verifies `convertHTMLToDOCX`: every successful conversion must resolve to a
 * non-empty Buffer that begins with the ZIP magic number, since a DOCX file is
 * a ZIP archive.
 */
describe("HTML to DOCX Conversion", () => {
  /** Per-test timeout in milliseconds, passed as the third argument to `test`. */
  const TIMEOUT_MS = 10000;

  /**
   * Asserts that `buffer` is a non-empty Buffer starting with the ZIP magic
   * number (the ASCII bytes "PK").
   */
  function expectDocxBuffer(buffer: Buffer): void {
    expect(buffer).toBeInstanceOf(Buffer);
    expect(buffer.length).toBeGreaterThan(0);
    expect(buffer.toString("ascii", 0, 2)).toBe("PK");
  }

  test("converts simple HTML to DOCX buffer", async () => {
    const html =
      "<html><body><h1>Test Document</h1><p>This is a test.</p></body></html>";

    expectDocxBuffer(await convertHTMLToDOCX(html));
  }, TIMEOUT_MS);

  test("converts HTML with table to DOCX", async () => {
    const html = `
      <html>
        <body>
          <h2>Data Table</h2>
          <table>
            <thead>
              <tr>
                <th>Name</th>
                <th>Value</th>
              </tr>
            </thead>
            <tbody>
              <tr>
                <td>Item 1</td>
                <td>100</td>
              </tr>
              <tr>
                <td>Item 2</td>
                <td>200</td>
              </tr>
            </tbody>
          </table>
        </body>
      </html>
    `;

    expectDocxBuffer(await convertHTMLToDOCX(html));
  }, TIMEOUT_MS);

  test("converts HTML with formatting to DOCX", async () => {
    const html = `
      <html>
        <body>
          <h1>Formatted Document</h1>
          <p><strong>Bold text</strong> and <em>italic text</em></p>
          <p><u>Underlined text</u> and <s>strikethrough text</s></p>
          <ul>
            <li>Bullet point 1</li>
            <li>Bullet point 2</li>
          </ul>
          <ol>
            <li>Numbered item 1</li>
            <li>Numbered item 2</li>
          </ol>
        </body>
      </html>
    `;

    expectDocxBuffer(await convertHTMLToDOCX(html));
  }, TIMEOUT_MS);

  test("includes metadata in DOCX", async () => {
    const html = "<html><body><h1>Test</h1></body></html>";
    const buffer = await convertHTMLToDOCX(html, {
      title: "Test Document",
      author: "Test Author",
      subject: "Test Subject",
      keywords: ["test", "docx"],
    });

    // NOTE(review): this only proves that passing metadata options does not
    // break conversion; the document properties themselves are not inspected.
    expectDocxBuffer(buffer);
  }, TIMEOUT_MS);

  test("supports landscape orientation", async () => {
    const html = "<html><body><h1>Landscape Document</h1></body></html>";
    const buffer = await convertHTMLToDOCX(html, {
      orientation: "landscape",
    });

    // NOTE(review): the resulting page orientation is not inspected, only that
    // the option is accepted and a DOCX archive is produced.
    expectDocxBuffer(buffer);
  }, TIMEOUT_MS);

  test("handles complex document structure", async () => {
    const html = `
      <html>
        <body>
          <h1 style="color: #2c3e50;">Business Report</h1>
          <p style="color: #7f8c8d;">Generated on October 21, 2025</p>

          <h2>Executive Summary</h2>
          <p>This is the executive summary with important information.</p>

          <h2>Financial Data</h2>
          <table>
            <tr style="background-color: #ecf0f1;">
              <th>Quarter</th>
              <th>Revenue</th>
            </tr>
            <tr>
              <td>Q1</td>
              <td>$100,000</td>
            </tr>
            <tr>
              <td>Q2</td>
              <td>$150,000</td>
            </tr>
          </table>

          <h2>Conclusions</h2>
          <ul>
            <li>Revenue is growing</li>
            <li>Market share is expanding</li>
            <li>Future outlook is positive</li>
          </ul>
        </body>
      </html>
    `;

    const buffer = await convertHTMLToDOCX(html);
    expectDocxBuffer(buffer);
    expect(buffer.length).toBeGreaterThan(2000); // Should be substantial
  }, TIMEOUT_MS);

  test("handles special Unicode characters in DOCX", async () => {
    // The typography line uses curly quotes so that every sample on it is a
    // non-ASCII code point; plain ASCII quotes would not exercise Unicode
    // handling at all.
    const html = `
      <html>
        <body>
          <h1>Special Characters Test</h1>
          <p>Arrows: ↑ ↓ → ← ↔</p>
          <p>Math symbols: × ÷ ±</p>
          <p>Typography: — – “ ” ‘ ’ …</p>
          <p>Other symbols: ° © ® ™ • §</p>
        </body>
      </html>
    `;

    expectDocxBuffer(await convertHTMLToDOCX(html));
  }, TIMEOUT_MS);
});
370  
/**
 * Verifies that degenerate input (empty or whitespace-only HTML) still yields
 * a well-formed document rather than an error or an empty buffer.
 */
describe("Error Handling", () => {
  /** Per-test timeout in milliseconds, passed as the third argument to `test`. */
  const TIMEOUT_MS = 10000;

  /** Leading ASCII bytes of a PDF file. */
  const PDF_MAGIC = "%PDF";

  /** Leading ASCII bytes of a ZIP archive, which is the container format of DOCX. */
  const DOCX_MAGIC = "PK";

  /**
   * Asserts that `buffer` is a Buffer longer than `magic` whose leading ASCII
   * bytes equal `magic`.
   */
  function expectDocument(buffer: Buffer, magic: string): void {
    expect(buffer).toBeInstanceOf(Buffer);
    expect(buffer.length).toBeGreaterThan(magic.length);
    expect(buffer.toString("ascii", 0, magic.length)).toBe(magic);
  }

  test("handles empty HTML gracefully for PDF", async () => {
    // Empty HTML should still produce a valid PDF
    expectDocument(await convertHTMLToPDF(""), PDF_MAGIC);
  }, TIMEOUT_MS);

  test("handles empty HTML gracefully for DOCX", async () => {
    // Empty HTML should still produce a valid DOCX
    expectDocument(await convertHTMLToDOCX(""), DOCX_MAGIC);
  }, TIMEOUT_MS);

  test("handles whitespace-only HTML for PDF", async () => {
    expectDocument(await convertHTMLToPDF("   \n   \n   "), PDF_MAGIC);
  }, TIMEOUT_MS);

  test("handles whitespace-only HTML for DOCX", async () => {
    expectDocument(await convertHTMLToDOCX("   \n   \n   "), DOCX_MAGIC);
  }, TIMEOUT_MS);
});