html-conversion.test.ts
/**
 * Tests for HTML-to-Document conversion utilities
 *
 * Covers three exports of ../utils/html-to-document.js:
 *   - isHTMLContent:     heuristic detection of HTML markup in a string
 *   - convertHTMLToPDF:  HTML string (plus optional metadata) -> PDF buffer
 *   - convertHTMLToDOCX: HTML string (plus optional metadata/orientation) -> DOCX buffer
 */

import { describe, test, expect } from "@jest/globals";
import {
  isHTMLContent,
  convertHTMLToPDF,
  convertHTMLToDOCX,
} from "../utils/html-to-document.js";

/** Timeout in milliseconds passed to every conversion test. */
const CONVERSION_TIMEOUT_MS = 10000;

/** Leading bytes of a PDF file. */
const PDF_SIGNATURE = "%PDF";

/** Leading bytes of a DOCX file (PK zip file format). */
const DOCX_SIGNATURE = "PK";

/** Minimal document shared by the "simple HTML" PDF and DOCX tests. */
const SIMPLE_HTML =
  "<html><body><h1>Test Document</h1><p>This is a test.</p></body></html>";

/** Table fixture shared by the PDF and DOCX table tests. */
const TABLE_HTML = `
  <html>
    <body>
      <h2>Data Table</h2>
      <table>
        <thead>
          <tr>
            <th>Name</th>
            <th>Value</th>
          </tr>
        </thead>
        <tbody>
          <tr>
            <td>Item 1</td>
            <td>100</td>
          </tr>
          <tr>
            <td>Item 2</td>
            <td>200</td>
          </tr>
        </tbody>
      </table>
    </body>
  </html>
`;

/**
 * Asserts that a conversion result is a Buffer holding a document of the
 * expected format.
 *
 * @param buffer    value returned by one of the converters
 * @param signature ASCII magic bytes the buffer must start with
 * @param minLength the buffer must be strictly longer than this many bytes
 */
function expectDocument(
  buffer: Buffer,
  signature: string,
  minLength = 0
): void {
  expect(buffer).toBeInstanceOf(Buffer);
  expect(buffer.length).toBeGreaterThan(minLength);
  expect(buffer.toString("ascii", 0, signature.length)).toBe(signature);
}

describe("HTML Content Detection", () => {
  test("detects simple HTML with html tag", () => {
    const content = "<html><body><p>Test</p></body></html>";
    expect(isHTMLContent(content)).toBe(true);
  });

  test("detects HTML with body tag only", () => {
    const content = "<body><p>Test</p></body>";
    expect(isHTMLContent(content)).toBe(true);
  });

  test("detects HTML with heading tags", () => {
    expect(isHTMLContent("<h1>Title</h1>")).toBe(true);
    expect(isHTMLContent("<h2>Subtitle</h2>")).toBe(true);
    expect(isHTMLContent("<h3>Section</h3>")).toBe(true);
  });

  test("detects HTML with table tags", () => {
    expect(isHTMLContent("<table><tr><td>Test</td></tr></table>")).toBe(true);
  });

  test("detects HTML with list tags", () => {
    expect(isHTMLContent("<ul><li>Item</li></ul>")).toBe(true);
    expect(isHTMLContent("<ol><li>Item</li></ol>")).toBe(true);
  });

  test("detects HTML with common tags", () => {
    // Tables and lists are covered by their own tests in this suite.
    expect(isHTMLContent("<p>Paragraph</p>")).toBe(true);
    expect(isHTMLContent("<div>Division</div>")).toBe(true);
  });

  test("detects HTML with formatting tags", () => {
    expect(isHTMLContent("<strong>Bold</strong>")).toBe(true);
    expect(isHTMLContent("<em>Italic</em>")).toBe(true);
    expect(isHTMLContent("<span>Text</span>")).toBe(true);
  });

  test("detects HTML with self-closing tags", () => {
    expect(isHTMLContent("Line 1<br>Line 2")).toBe(true);
    expect(isHTMLContent("Line 1<br/>Line 2")).toBe(true);
  });

  test("does not detect plain text as HTML", () => {
    expect(isHTMLContent("This is plain text")).toBe(false);
    expect(isHTMLContent("Line 1\nLine 2\nLine 3")).toBe(false);
    expect(isHTMLContent("Some text with numbers 123")).toBe(false);
  });

  test("handles empty or whitespace content", () => {
    expect(isHTMLContent("")).toBe(false);
    expect(isHTMLContent(" ")).toBe(false);
    expect(isHTMLContent("\n\n")).toBe(false);
  });

  test("handles mixed content with HTML-like text", () => {
    // Text that looks like HTML but isn't really (no actual tags)
    expect(isHTMLContent("Price < 100 and > 50")).toBe(false);
    expect(isHTMLContent("Use <brackets> for grouping")).toBe(false);
  });

  test("case insensitive detection", () => {
    expect(isHTMLContent("<HTML><BODY>Test</BODY></HTML>")).toBe(true);
    expect(isHTMLContent("<Html><Body>Test</Body></Html>")).toBe(true);
  });
});

describe("HTML to PDF Conversion", () => {
  test("converts simple HTML to PDF buffer", async () => {
    const buffer = await convertHTMLToPDF(SIMPLE_HTML);

    // Includes the PDF magic number check (starts with %PDF)
    expectDocument(buffer, PDF_SIGNATURE);
  }, CONVERSION_TIMEOUT_MS);

  test("converts HTML with table to PDF", async () => {
    const buffer = await convertHTMLToPDF(TABLE_HTML);
    expectDocument(buffer, PDF_SIGNATURE);
  }, CONVERSION_TIMEOUT_MS);

  test("converts HTML with styled elements", async () => {
    const html = `
      <html>
        <body>
          <h1 style="color: #2c3e50;">Styled Title</h1>
          <p style="text-align: center;">Centered paragraph</p>
          <p><strong>Bold text</strong> and <em>italic text</em></p>
          <ul>
            <li>List item 1</li>
            <li>List item 2</li>
          </ul>
        </body>
      </html>
    `;

    const buffer = await convertHTMLToPDF(html);
    expectDocument(buffer, PDF_SIGNATURE);
  }, CONVERSION_TIMEOUT_MS);

  test("includes metadata in PDF", async () => {
    const html = "<html><body><h1>Test</h1></body></html>";
    const buffer = await convertHTMLToPDF(html, {
      title: "Test Document",
      author: "Test Author",
      subject: "Test Subject",
      keywords: ["test", "pdf"],
    });

    // Only checks that conversion succeeds when metadata is supplied; the
    // metadata fields themselves are not read back from the PDF.
    expectDocument(buffer, PDF_SIGNATURE);
  }, CONVERSION_TIMEOUT_MS);

  test("handles complex HTML structure", async () => {
    const html = `
      <html>
        <head>
          <style>
            body { font-family: Arial, sans-serif; }
            .header { color: #2c3e50; }
            .content { margin: 20px; }
          </style>
        </head>
        <body>
          <div class="header">
            <h1>Invoice #12345</h1>
            <p>Date: 2025-10-21</p>
          </div>
          <div class="content">
            <h2>Items</h2>
            <table style="border-collapse: collapse;">
              <tr style="background-color: #ecf0f1;">
                <th style="border: 1px solid #bdc3c7; padding: 8px;">Item</th>
                <th style="border: 1px solid #bdc3c7; padding: 8px;">Price</th>
              </tr>
              <tr>
                <td style="border: 1px solid #bdc3c7; padding: 8px;">Widget A</td>
                <td style="border: 1px solid #bdc3c7; padding: 8px;">$20.00</td>
              </tr>
            </table>
            <p style="text-align: right; font-weight: bold;">Total: $20.00</p>
          </div>
        </body>
      </html>
    `;

    const buffer = await convertHTMLToPDF(html);
    expectDocument(buffer, PDF_SIGNATURE, 1000); // Should be substantial
  }, CONVERSION_TIMEOUT_MS);

  test("throws error for invalid HTML that causes conversion failure", async () => {
    // Note: html-to-pdfmake is quite forgiving, so we'd need truly broken HTML
    // to force a failure. Both outcomes are accepted, but each is verified:
    //   - a rejection must be an Error carrying the wrapped failure message;
    //   - a success must yield a real PDF.
    let buffer: Buffer;
    try {
      buffer = await convertHTMLToPDF("<html><body>");
    } catch (error) {
      expect(error).toBeInstanceOf(Error);
      expect((error as Error).message).toContain(
        "Failed to convert HTML to PDF"
      );
      return;
    }

    // Deliberately outside the try block: an assertion failure here must
    // surface as-is instead of being caught and re-checked as a conversion
    // error.
    expectDocument(buffer, PDF_SIGNATURE);
  }, CONVERSION_TIMEOUT_MS);
});

describe("HTML to DOCX Conversion", () => {
  test("converts simple HTML to DOCX buffer", async () => {
    const buffer = await convertHTMLToDOCX(SIMPLE_HTML);

    // Includes the DOCX magic number check (PK zip file format)
    expectDocument(buffer, DOCX_SIGNATURE);
  }, CONVERSION_TIMEOUT_MS);

  test("converts HTML with table to DOCX", async () => {
    const buffer = await convertHTMLToDOCX(TABLE_HTML);
    expectDocument(buffer, DOCX_SIGNATURE);
  }, CONVERSION_TIMEOUT_MS);

  test("converts HTML with formatting to DOCX", async () => {
    const html = `
      <html>
        <body>
          <h1>Formatted Document</h1>
          <p><strong>Bold text</strong> and <em>italic text</em></p>
          <p><u>Underlined text</u> and <s>strikethrough text</s></p>
          <ul>
            <li>Bullet point 1</li>
            <li>Bullet point 2</li>
          </ul>
          <ol>
            <li>Numbered item 1</li>
            <li>Numbered item 2</li>
          </ol>
        </body>
      </html>
    `;

    const buffer = await convertHTMLToDOCX(html);
    expectDocument(buffer, DOCX_SIGNATURE);
  }, CONVERSION_TIMEOUT_MS);

  test("includes metadata in DOCX", async () => {
    const html = "<html><body><h1>Test</h1></body></html>";
    const buffer = await convertHTMLToDOCX(html, {
      title: "Test Document",
      author: "Test Author",
      subject: "Test Subject",
      keywords: ["test", "docx"],
    });

    // Only checks that conversion succeeds when metadata is supplied; the
    // metadata fields themselves are not read back from the DOCX.
    expectDocument(buffer, DOCX_SIGNATURE);
  }, CONVERSION_TIMEOUT_MS);

  test("supports landscape orientation", async () => {
    const html = "<html><body><h1>Landscape Document</h1></body></html>";
    const buffer = await convertHTMLToDOCX(html, {
      orientation: "landscape",
    });

    expectDocument(buffer, DOCX_SIGNATURE);
  }, CONVERSION_TIMEOUT_MS);

  test("handles complex document structure", async () => {
    const html = `
      <html>
        <body>
          <h1 style="color: #2c3e50;">Business Report</h1>
          <p style="color: #7f8c8d;">Generated on October 21, 2025</p>

          <h2>Executive Summary</h2>
          <p>This is the executive summary with important information.</p>

          <h2>Financial Data</h2>
          <table>
            <tr style="background-color: #ecf0f1;">
              <th>Quarter</th>
              <th>Revenue</th>
            </tr>
            <tr>
              <td>Q1</td>
              <td>$100,000</td>
            </tr>
            <tr>
              <td>Q2</td>
              <td>$150,000</td>
            </tr>
          </table>

          <h2>Conclusions</h2>
          <ul>
            <li>Revenue is growing</li>
            <li>Market share is expanding</li>
            <li>Future outlook is positive</li>
          </ul>
        </body>
      </html>
    `;

    const buffer = await convertHTMLToDOCX(html);
    expectDocument(buffer, DOCX_SIGNATURE, 2000); // Should be substantial
  }, CONVERSION_TIMEOUT_MS);

  test("handles special Unicode characters in DOCX", async () => {
    const html = `
      <html>
        <body>
          <h1>Special Characters Test</h1>
          <p>Arrows: ↑ ↓ → ← ↔</p>
          <p>Math symbols: × ÷ ±</p>
          <p>Typography: — – " " ' ' …</p>
          <p>Other symbols: ° © ® ™ • §</p>
        </body>
      </html>
    `;

    const buffer = await convertHTMLToDOCX(html);

    // Verify it's a valid DOCX (PK zip header)
    expectDocument(buffer, DOCX_SIGNATURE);
  }, CONVERSION_TIMEOUT_MS);
});

describe("Error Handling", () => {
  test("handles empty HTML gracefully for PDF", async () => {
    const buffer = await convertHTMLToPDF("");
    // Empty HTML should still produce a valid PDF
    expectDocument(buffer, PDF_SIGNATURE);
  }, CONVERSION_TIMEOUT_MS);

  test("handles empty HTML gracefully for DOCX", async () => {
    const buffer = await convertHTMLToDOCX("");
    // Empty HTML should still produce a valid DOCX
    expectDocument(buffer, DOCX_SIGNATURE);
  }, CONVERSION_TIMEOUT_MS);

  test("handles whitespace-only HTML for PDF", async () => {
    const buffer = await convertHTMLToPDF(" \n \n ");
    expect(buffer).toBeInstanceOf(Buffer);
  }, CONVERSION_TIMEOUT_MS);

  test("handles whitespace-only HTML for DOCX", async () => {
    const buffer = await convertHTMLToDOCX(" \n \n ");
    expect(buffer).toBeInstanceOf(Buffer);
  }, CONVERSION_TIMEOUT_MS);
});