Cradicle Explorer

/ src / utils / html-to-document.ts
html-to-document.ts
  1  /**
  2   * HTML-to-Document Conversion Utilities
  3   *
  4   * Provides HTML to PDF and HTML to DOCX conversion capabilities
  5   * using lightweight libraries (no browser/Chromium required).
  6   *
  7   * Libraries used:
  8   * - html-to-pdfmake + pdfmake: HTML → PDF conversion
  9   * - @turbodocx/html-to-docx: HTML → DOCX conversion
 10   * - jsdom: DOM emulation for Node.js
 11   */
 12  
 13  import { promises as fs } from "fs";
 14  
 15  // Lazy-loaded libraries (imported only when needed)
 16  let pdfMake: any = null;
 17  let pdfFonts: any = null;
 18  let htmlToPdfmake: any = null;
 19  let HTMLtoDOCX: any = null;
 20  let jsdom: any = null;
 21  
 22  /**
 23   * Detect if content is HTML
 24   * Checks for common HTML tags and patterns
 25   */
 26  export function isHTMLContent(content: string): boolean {
 27    // Trim whitespace for accurate detection
 28    const trimmed = content.trim();
 29  
 30    // Check for common HTML patterns
 31    const htmlPatterns = [
 32      /<html/i,
 33      /<body/i,
 34      /<head/i,
 35      /<h[1-6]>/i,
 36      /<p>/i,
 37      /<div>/i,
 38      /<table>/i,
 39      /<ul>/i,
 40      /<ol>/i,
 41      /<li>/i,
 42      /<span>/i,
 43      /<strong>/i,
 44      /<em>/i,
 45      /<br\s*\/?>/i,
 46    ];
 47  
 48    return htmlPatterns.some((pattern) => pattern.test(trimmed));
 49  }
 50  
 51  /**
 52   * Sanitize HTML for DOCX conversion
 53   * Converts problematic Unicode characters to HTML entities or safe alternatives
 54   */
 55  function sanitizeHTMLForDOCX(html: string): string {
 56    // Replace common problematic Unicode characters with HTML entities or text alternatives
 57    return (
 58      html
 59        // Arrows
 60        .replace(/↑/g, "&uarr;") // Up arrow
 61        .replace(/↓/g, "&darr;") // Down arrow
 62        .replace(/→/g, "&rarr;") // Right arrow
 63        .replace(/←/g, "&larr;") // Left arrow
 64        .replace(/↔/g, "&harr;") // Left-right arrow
 65        // Math symbols
 66        .replace(/×/g, "&times;") // Multiplication
 67        .replace(/÷/g, "&divide;") // Division
 68        .replace(/±/g, "&plusmn;") // Plus-minus
 69        // Common typographic characters
 70        .replace(/—/g, "&mdash;") // Em dash
 71        .replace(/–/g, "&ndash;") // En dash
 72        .replace(/"/g, "&ldquo;") // Left double quote
 73        .replace(/"/g, "&rdquo;") // Right double quote
 74        .replace(/'/g, "&lsquo;") // Left single quote
 75        .replace(/'/g, "&rsquo;") // Right single quote
 76        .replace(/…/g, "&hellip;") // Ellipsis
 77        // Degree and other symbols
 78        .replace(/°/g, "&deg;") // Degree
 79        .replace(/©/g, "&copy;") // Copyright
 80        .replace(/®/g, "&reg;") // Registered
 81        .replace(/™/g, "&trade;") // Trademark
 82        // Bullets and special punctuation
 83        .replace(/•/g, "&bull;") // Bullet
 84        .replace(/§/g, "&sect;")
 85    ); // Section
 86  }
 87  
 88  /**
 89   * Convert HTML to PDF buffer using html-to-pdfmake + pdfmake
 90   *
 91   * This provides lightweight PDF generation without requiring Chromium.
 92   * Supports rich formatting including:
 93   * - Headings, paragraphs, text formatting
 94   * - Tables with borders and styling
 95   * - Lists (ordered and unordered)
 96   * - Colors, fonts, alignment
 97   * - Page breaks
 98   *
 99   * @param htmlContent - HTML string to convert
100   * @param options - PDF generation options
101   * @returns Buffer containing the PDF
102   */
103  export async function htmlToPDF(
104    htmlContent: string,
105    options: {
106      title?: string;
107      author?: string;
108      subject?: string;
109      keywords?: string[];
110    } = {}
111  ): Promise<Buffer> {
112    // Lazy load dependencies
113    if (!pdfMake) {
114      const pdfmakeModule = await import("pdfmake/build/pdfmake.js");
115      pdfMake = (pdfmakeModule as any).default || pdfmakeModule;
116    }
117    if (!pdfFonts) {
118      const fontsModule = await import("pdfmake/build/vfs_fonts.js");
119      pdfFonts = (fontsModule as any).default || fontsModule;
120    }
121    if (!htmlToPdfmake) {
122      const htmlToPdfmakeModule = await import("html-to-pdfmake");
123      htmlToPdfmake = (htmlToPdfmakeModule as any).default || htmlToPdfmakeModule;
124    }
125    if (!jsdom) {
126      const jsdomModule = await import("jsdom");
127      jsdom = jsdomModule.JSDOM;
128    }
129  
130    // Initialize PDFMake fonts - handle different module structures
131    if (!pdfMake.vfs) {
132      // Try different ways to access the fonts
133      if (pdfFonts.pdfMake && pdfFonts.pdfMake.vfs) {
134        pdfMake.vfs = pdfFonts.pdfMake.vfs;
135      } else if (pdfFonts.vfs) {
136        pdfMake.vfs = pdfFonts.vfs;
137      } else {
138        // Last resort: assign the entire fonts object
139        pdfMake.vfs = pdfFonts;
140      }
141    }
142  
143    // Create DOM window for html-to-pdfmake
144    const { window } = new jsdom("");
145  
146    // Convert HTML to PDFMake format with styling
147    const converted = htmlToPdfmake(htmlContent, {
148      window,
149      defaultStyles: {
150        // Headings with colors
151        h1: {
152          fontSize: 24,
153          bold: true,
154          marginBottom: 10,
155          color: "#2c3e50",
156        },
157        h2: {
158          fontSize: 20,
159          bold: true,
160          marginBottom: 8,
161          color: "#34495e",
162        },
163        h3: {
164          fontSize: 18,
165          bold: true,
166          marginBottom: 6,
167          color: "#34495e",
168        },
169        h4: {
170          fontSize: 16,
171          bold: true,
172          marginBottom: 5,
173        },
174        h5: {
175          fontSize: 14,
176          bold: true,
177          marginBottom: 5,
178        },
179        h6: {
180          fontSize: 12,
181          bold: true,
182          marginBottom: 5,
183        },
184        // Paragraphs with spacing
185        p: {
186          margin: [0, 5, 0, 10],
187        },
188        // Tables with spacing
189        table: {
190          marginBottom: 10,
191        },
192        // Table headers with background
193        th: {
194          bold: true,
195          fillColor: "#ecf0f1",
196          color: "#2c3e50",
197        },
198        // Text formatting
199        strong: {
200          bold: true,
201        },
202        b: {
203          bold: true,
204        },
205        em: {
206          italics: true,
207        },
208        i: {
209          italics: true,
210        },
211        u: {
212          decoration: "underline",
213        },
214        s: {
215          decoration: "lineThrough",
216        },
217        del: {
218          decoration: "lineThrough",
219        },
220        // Lists
221        ul: {
222          marginBottom: 5,
223        },
224        ol: {
225          marginBottom: 5,
226        },
227        li: {
228          marginBottom: 3,
229        },
230      },
231      tableAutoSize: true, // Auto-calculate table dimensions
232      removeExtraBlanks: true, // Clean up whitespace
233    });
234  
235    // Create PDF document definition
236    const docDefinition = {
237      content: converted,
238      info: {
239        title: options.title || "Document",
240        author: options.author || "vulcan-file-ops",
241        subject: options.subject || "",
242        keywords: options.keywords?.join(", ") || "",
243        creator: "Vulcan File Ops MCP Server",
244        producer: "pdfmake + html-to-pdfmake",
245      },
246      // Default page settings
247      pageSize: "A4",
248      pageMargins: [40, 60, 40, 60],
249    };
250  
251    // Generate PDF and return as Buffer
252    return new Promise((resolve, reject) => {
253      try {
254        const pdfDoc = pdfMake.createPdf(docDefinition);
255        pdfDoc.getBuffer((buffer: Buffer) => {
256          resolve(buffer);
257        });
258      } catch (error) {
259        reject(error);
260      }
261    });
262  }
263  
/**
 * Render an HTML string to a Word document, returned as a Buffer.
 *
 * Conversion is done by @turbodocx/html-to-docx, which is imported on first
 * use and cached in a module-level variable. Before conversion the HTML is
 * passed through sanitizeHTMLForDOCX.
 *
 * The document is produced with Arial 11pt, margins of 1 inch top/bottom and
 * 1.25 inches left/right, and with header, footer and page numbers turned off.
 *
 * @param htmlContent - HTML string to convert
 * @param options - Document metadata and page orientation (default "portrait")
 * @returns Buffer containing the DOCX
 */
export async function htmlToDOCX(
  htmlContent: string,
  options: {
    title?: string;
    author?: string;
    subject?: string;
    keywords?: string[];
    orientation?: "portrait" | "landscape";
  } = {}
): Promise<Buffer> {
  // Load the converter on first use; the export may be wrapped in `.default`.
  if (!HTMLtoDOCX) {
    const loaded = await import("@turbodocx/html-to-docx");
    HTMLtoDOCX = (loaded as any).default || loaded;
  }

  const { title, author, subject, keywords, orientation } = options;

  // Swap problematic Unicode characters for HTML entities first.
  const safeHTML = sanitizeHTMLForDOCX(htmlContent);

  // Second argument is the header HTML; none is used here.
  return await HTMLtoDOCX(safeHTML, null, {
    title: title || "Document",
    creator: author || "vulcan-file-ops",
    subject: subject || "",
    keywords: keywords || [],
    // The subject doubles as the document description.
    description: subject || "",
    orientation: orientation || "portrait",
    // Margins are in TWIP units (1440 per inch).
    margins: {
      top: 1440, // 1 inch
      right: 1800, // 1.25 inches
      bottom: 1440, // 1 inch
      left: 1800, // 1.25 inches
    },
    font: "Arial",
    fontSize: 22, // 22 HIP = 11pt
    // Page numbers, footer and header are all disabled.
    pageNumber: false,
    footer: false,
    header: false,
  });
}
328  
/**
 * Convert HTML to PDF, with an empty-input fallback and uniform errors.
 *
 * Empty or whitespace-only input is replaced by a minimal document holding a
 * single empty paragraph. Any failure during conversion is rethrown as an
 * Error whose message starts with "Failed to convert HTML to PDF: ".
 *
 * @param htmlContent - HTML string to convert
 * @param options - Document metadata forwarded to htmlToPDF
 * @returns PDF buffer
 * @throws Error when the conversion fails
 */
export async function convertHTMLToPDF(
  htmlContent: string,
  options: {
    title?: string;
    author?: string;
    subject?: string;
    keywords?: string[];
  } = {}
): Promise<Buffer> {
  try {
    // Blank input still yields a PDF with at least one paragraph.
    const source = htmlContent.trim()
      ? htmlContent
      : "<html><body><p></p></body></html>";

    return await htmlToPDF(source, options);
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`Failed to convert HTML to PDF: ${reason}`);
  }
}
362  
/**
 * Convert HTML to DOCX, with an empty-input fallback and uniform errors.
 *
 * Empty or whitespace-only input is replaced by a minimal document holding a
 * single empty paragraph. Any failure during conversion is rethrown as an
 * Error whose message starts with "Failed to convert HTML to DOCX: ".
 *
 * @param htmlContent - HTML string to convert
 * @param options - Document metadata and orientation forwarded to htmlToDOCX
 * @returns DOCX buffer
 * @throws Error when the conversion fails
 */
export async function convertHTMLToDOCX(
  htmlContent: string,
  options: {
    title?: string;
    author?: string;
    subject?: string;
    keywords?: string[];
    orientation?: "portrait" | "landscape";
  } = {}
): Promise<Buffer> {
  try {
    // Blank input still yields a DOCX with at least one paragraph.
    const source = htmlContent.trim()
      ? htmlContent
      : "<html><body><p></p></body></html>";

    return await htmlToDOCX(source, options);
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`Failed to convert HTML to DOCX: ${reason}`);
  }
}