// html-to-document.ts
/**
 * HTML-to-Document Conversion Utilities
 *
 * Provides HTML to PDF and HTML to DOCX conversion capabilities
 * using lightweight libraries (no browser/Chromium required).
 *
 * Libraries used:
 * - html-to-pdfmake + pdfmake: HTML → PDF conversion
 * - @turbodocx/html-to-docx: HTML → DOCX conversion
 * - jsdom: DOM emulation for Node.js
 */

import { promises as fs } from "fs";

// Lazy-loaded libraries (imported only when needed, then cached at module level).
// They are typed `any` because the modules are resolved dynamically and their
// export shape (default vs. namespace) is probed at runtime.
let pdfMake: any = null;
let pdfFonts: any = null;
let htmlToPdfmake: any = null;
let HTMLtoDOCX: any = null;
let jsdom: any = null; // holds the JSDOM constructor once loaded

/** Document metadata accepted by the PDF conversion functions. */
export interface HTMLToPDFOptions {
  title?: string;
  author?: string;
  subject?: string;
  keywords?: string[];
}

/** Document metadata and layout accepted by the DOCX conversion functions. */
export interface HTMLToDOCXOptions extends HTMLToPDFOptions {
  orientation?: "portrait" | "landscape";
}

/** Markup substituted for empty / whitespace-only input by the convert* wrappers. */
const EMPTY_DOCUMENT_HTML = "<html><body><p></p></body></html>";

/**
 * Patterns whose presence marks a string as HTML.
 * Element tags are matched with or without attributes
 * (e.g. both `<p>` and `<p class="x">`).
 */
const HTML_PATTERNS: readonly RegExp[] = [
  /<html/i,
  /<body/i,
  /<head/i,
  /<(?:h[1-6]|p|div|table|ul|ol|li|span|strong|em)(?:\s[^>]*)?>/i,
  /<br\s*\/?>/i,
];

/**
 * Characters rewritten by sanitizeHTMLForDOCX, written as escapes so the
 * source file itself stays pure ASCII:
 * - U+2190–U+2194  arrows (left, up, right, down, left-right)
 * - U+00D7 U+00F7 U+00B1  multiplication, division, plus-minus
 * - U+2014 U+2013  em dash, en dash
 * - U+201C U+201D U+2018 U+2019  curly double / single quotes
 * - U+2026  ellipsis
 * - U+00B0 U+00A9 U+00AE U+2122  degree, copyright, registered, trademark
 * - U+2022 U+00A7  bullet, section sign
 */
const DOCX_PROBLEM_CHARS =
  /[\u2190-\u2194\u00D7\u00F7\u00B1\u2014\u2013\u201C\u201D\u2018\u2019\u2026\u00B0\u00A9\u00AE\u2122\u2022\u00A7]/g;

/** Default pdfmake styles applied per HTML tag by html-to-pdfmake. */
const PDF_DEFAULT_STYLES = {
  // Headings with colors
  h1: { fontSize: 24, bold: true, marginBottom: 10, color: "#2c3e50" },
  h2: { fontSize: 20, bold: true, marginBottom: 8, color: "#34495e" },
  h3: { fontSize: 18, bold: true, marginBottom: 6, color: "#34495e" },
  h4: { fontSize: 16, bold: true, marginBottom: 5 },
  h5: { fontSize: 14, bold: true, marginBottom: 5 },
  h6: { fontSize: 12, bold: true, marginBottom: 5 },
  // Paragraphs with spacing
  p: { margin: [0, 5, 0, 10] },
  // Tables with spacing
  table: { marginBottom: 10 },
  // Table headers with background
  th: { bold: true, fillColor: "#ecf0f1", color: "#2c3e50" },
  // Text formatting
  strong: { bold: true },
  b: { bold: true },
  em: { italics: true },
  i: { italics: true },
  u: { decoration: "underline" },
  s: { decoration: "lineThrough" },
  del: { decoration: "lineThrough" },
  // Lists
  ul: { marginBottom: 5 },
  ol: { marginBottom: 5 },
  li: { marginBottom: 3 },
};

/** Extract a human-readable message from an arbitrary thrown value. */
function describeError(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Detect if content is HTML.
 *
 * Returns true when the string contains any of a fixed set of common HTML
 * tags (document structure, headings, paragraphs, divs, tables, lists,
 * inline formatting, line breaks), with or without attributes.
 *
 * @param content - Text to inspect
 * @returns true if at least one known HTML tag is found
 */
export function isHTMLContent(content: string): boolean {
  const trimmed = content.trim();
  return HTML_PATTERNS.some((pattern) => pattern.test(trimmed));
}

/**
 * Sanitize HTML for DOCX conversion.
 *
 * Replaces a fixed set of non-ASCII typographic characters (arrows, math
 * symbols, dashes, curly quotes, ellipsis, degree/copyright/trademark signs,
 * bullet, section sign) with numeric HTML character references such as
 * `&#8594;`. ASCII quotes are deliberately left untouched so that attribute
 * values (`style="..."`) remain valid markup.
 *
 * @param html - HTML string to sanitize
 * @returns The HTML with the listed characters replaced by entities
 */
function sanitizeHTMLForDOCX(html: string): string {
  // All targeted characters are in the BMP, so charCodeAt yields the full code point.
  return html.replace(DOCX_PROBLEM_CHARS, (ch) => `&#${ch.charCodeAt(0)};`);
}

/**
 * Convert HTML to PDF buffer using html-to-pdfmake + pdfmake.
 *
 * This provides lightweight PDF generation without requiring Chromium.
 * Supports rich formatting including:
 * - Headings, paragraphs, text formatting
 * - Tables with borders and styling
 * - Lists (ordered and unordered)
 * - Colors, fonts, alignment
 * - Page breaks
 *
 * @param htmlContent - HTML string to convert
 * @param options - PDF generation options (document metadata)
 * @returns Buffer containing the PDF
 */
export async function htmlToPDF(
  htmlContent: string,
  options: HTMLToPDFOptions = {}
): Promise<Buffer> {
  // Lazy load dependencies; each module may expose its API as the default
  // export or as the namespace object itself, so both are accepted.
  if (!pdfMake) {
    const pdfmakeModule = await import("pdfmake/build/pdfmake.js");
    pdfMake = (pdfmakeModule as any).default || pdfmakeModule;
  }
  if (!pdfFonts) {
    const fontsModule = await import("pdfmake/build/vfs_fonts.js");
    pdfFonts = (fontsModule as any).default || fontsModule;
  }
  if (!htmlToPdfmake) {
    const htmlToPdfmakeModule = await import("html-to-pdfmake");
    htmlToPdfmake = (htmlToPdfmakeModule as any).default || htmlToPdfmakeModule;
  }
  if (!jsdom) {
    const jsdomModule = await import("jsdom");
    jsdom = jsdomModule.JSDOM;
  }

  // Initialize pdfmake's virtual font file system once. The fonts module's
  // structure varies, so try the known locations in order and fall back to
  // the module object itself.
  if (!pdfMake.vfs) {
    if (pdfFonts.pdfMake && pdfFonts.pdfMake.vfs) {
      pdfMake.vfs = pdfFonts.pdfMake.vfs;
    } else if (pdfFonts.vfs) {
      pdfMake.vfs = pdfFonts.vfs;
    } else {
      pdfMake.vfs = pdfFonts;
    }
  }

  // html-to-pdfmake needs a DOM window when running outside a browser.
  const { window } = new jsdom("");

  // Convert HTML to pdfmake content with the module's default styling.
  const converted = htmlToPdfmake(htmlContent, {
    window,
    defaultStyles: PDF_DEFAULT_STYLES,
    tableAutoSize: true, // Auto-calculate table dimensions
    removeExtraBlanks: true, // Clean up whitespace
  });

  // Create PDF document definition
  const docDefinition = {
    content: converted,
    info: {
      title: options.title || "Document",
      author: options.author || "vulcan-file-ops",
      subject: options.subject || "",
      keywords: options.keywords?.join(", ") || "",
      creator: "Vulcan File Ops MCP Server",
      producer: "pdfmake + html-to-pdfmake",
    },
    // Default page settings
    pageSize: "A4",
    pageMargins: [40, 60, 40, 60],
  };

  // Generate the PDF and hand the resulting Buffer to the caller.
  // Only errors thrown synchronously by createPdf/getBuffer are turned into
  // a rejection here.
  return new Promise<Buffer>((resolve, reject) => {
    try {
      const pdfDoc = pdfMake.createPdf(docDefinition);
      pdfDoc.getBuffer((buffer: Buffer) => {
        resolve(buffer);
      });
    } catch (error) {
      reject(error);
    }
  });
}

/**
 * Convert HTML to DOCX buffer using @turbodocx/html-to-docx.
 *
 * Creates Word-compatible DOCX files with formatting.
 * Supports:
 * - Headings, paragraphs, text formatting
 * - Tables
 * - Lists
 * - Images (Base64)
 * - Page breaks
 * - Headers and footers (via options)
 *
 * The input is first passed through sanitizeHTMLForDOCX.
 *
 * @param htmlContent - HTML string to convert
 * @param options - DOCX generation options (metadata and page orientation)
 * @returns Buffer containing the DOCX
 */
export async function htmlToDOCX(
  htmlContent: string,
  options: HTMLToDOCXOptions = {}
): Promise<Buffer> {
  // Lazy load @turbodocx/html-to-docx (maintained fork with better Word compatibility)
  if (!HTMLtoDOCX) {
    const docxModule = await import("@turbodocx/html-to-docx");
    HTMLtoDOCX = (docxModule as any).default || docxModule;
  }

  // Sanitize HTML to handle problematic Unicode characters
  const sanitizedHTML = sanitizeHTMLForDOCX(htmlContent);

  // DOCX generation options
  const docxOptions = {
    title: options.title || "Document",
    creator: options.author || "vulcan-file-ops",
    subject: options.subject || "",
    keywords: options.keywords || [],
    description: options.subject || "",
    orientation: options.orientation || "portrait",
    margins: {
      top: 1440, // 1 inch in TWIP units
      right: 1800, // 1.25 inches
      bottom: 1440, // 1 inch
      left: 1800, // 1.25 inches
    },
    font: "Arial",
    fontSize: 22, // 22 half-points = 11pt
    // Page numbers, footer and header are all disabled
    pageNumber: false,
    footer: false,
    header: false,
  };

  // Convert HTML to DOCX (second argument is the header HTML; none is used)
  const buffer = await HTMLtoDOCX(sanitizedHTML, null, docxOptions);

  return buffer;
}

/**
 * HTML-to-PDF conversion with empty-input handling and error wrapping.
 *
 * Empty or whitespace-only input is replaced by a minimal document holding a
 * single empty paragraph. Any failure is rethrown as an Error whose message
 * is prefixed with "Failed to convert HTML to PDF: ".
 *
 * @param htmlContent - HTML string to convert
 * @param options - Conversion options
 * @returns PDF buffer
 * @throws Error if loading the libraries or the conversion fails
 */
export async function convertHTMLToPDF(
  htmlContent: string,
  options: HTMLToPDFOptions = {}
): Promise<Buffer> {
  try {
    const html = htmlContent.trim() ? htmlContent : EMPTY_DOCUMENT_HTML;
    return await htmlToPDF(html, options);
  } catch (error) {
    throw new Error(`Failed to convert HTML to PDF: ${describeError(error)}`);
  }
}

/**
 * HTML-to-DOCX conversion with empty-input handling and error wrapping.
 *
 * Empty or whitespace-only input is replaced by a minimal document holding a
 * single empty paragraph. Any failure is rethrown as an Error whose message
 * is prefixed with "Failed to convert HTML to DOCX: ".
 *
 * @param htmlContent - HTML string to convert
 * @param options - Conversion options
 * @returns DOCX buffer
 * @throws Error if loading the library or the conversion fails
 */
export async function convertHTMLToDOCX(
  htmlContent: string,
  options: HTMLToDOCXOptions = {}
): Promise<Buffer> {
  try {
    const html = htmlContent.trim() ? htmlContent : EMPTY_DOCUMENT_HTML;
    return await htmlToDOCX(html, options);
  } catch (error) {
    throw new Error(`Failed to convert HTML to DOCX: ${describeError(error)}`);
  }
}