// src/utils/html-storage.js
  1  /**
  2   * Filesystem-backed HTML storage for sites.
  3   *
  4   * Stores html_dom and key_pages_html in a single JSON file per site:
  5   *   data/html/{site_id}.json
  6   *
  7   * Schema: { "html_dom": "...", "key_pages_html": { url: html, ... } }
  8   *
  9   * This reduces DB size by ~7 GB and dramatically improves query performance
 10   * on the sites table. Consistent layout with 2Step's data/html/{id}.json.
 11   */
 12  
import {
  existsSync,
  mkdirSync,
  readFileSync,
  renameSync,
  unlinkSync,
  writeFileSync,
} from 'fs';
import { dirname, join } from 'path';
 15  
/**
 * Directory that holds the per-site JSON files (`{site_id}.json`).
 * Resolved against the process working directory when this module is loaded.
 */
const DATA_DIR = join(process.cwd(), 'data', 'html');
 17  
 18  /** Get file path for a site's HTML JSON */
 19  function sitePath(siteId) {
 20    const id = Number(siteId);
 21    if (!Number.isInteger(id) || id <= 0) {
 22      throw new Error(`Invalid siteId: ${siteId}`);
 23    }
 24    return join(DATA_DIR, `${id}.json`);
 25  }
 26  
 27  /** Read the full JSON envelope, or return empty object if missing */
 28  function readEnvelope(siteId) {
 29    try {
 30      return JSON.parse(readFileSync(sitePath(siteId), 'utf8'));
 31    } catch {
 32      return {};
 33    }
 34  }
 35  
 36  /** Write the JSON envelope back to disk */
 37  function writeEnvelope(siteId, data) {
 38    try {
 39      const filePath = sitePath(siteId);
 40      mkdirSync(dirname(filePath), { recursive: true });
 41      writeFileSync(filePath, JSON.stringify(data), 'utf8');
 42    } catch (err) {
 43      throw new Error(`Failed to write html storage for site ${siteId}: ${err.message}`);
 44    }
 45  }
 46  
 47  // --- html_dom ---
 48  
 49  /**
 50   * Write html_dom to filesystem.
 51   * @param {number} siteId
 52   * @param {string} html - Raw HTML content
 53   */
 54  function writeHtmlDom(siteId, html) {
 55    if (!html) return;
 56    const envelope = readEnvelope(siteId);
 57    envelope.html_dom = html;
 58    writeEnvelope(siteId, envelope);
 59  }
 60  
 61  /**
 62   * Read html_dom from filesystem.
 63   * @param {number} siteId
 64   * @returns {string|null} HTML content or null if not found
 65   */
 66  function readHtmlDom(siteId) {
 67    return readEnvelope(siteId).html_dom || null;
 68  }
 69  
 70  /**
 71   * Check if html_dom exists on filesystem.
 72   * @param {number} siteId
 73   * @returns {boolean}
 74   */
 75  function hasHtmlDom(siteId) {
 76    if (!existsSync(sitePath(siteId))) return false;
 77    return !!readEnvelope(siteId).html_dom;
 78  }
 79  
 80  /**
 81   * Delete html_dom from the envelope (keeps key_pages_html if present).
 82   * Removes the file entirely if nothing remains.
 83   * @param {number} siteId
 84   * @returns {boolean} true if html_dom was present and removed
 85   */
 86  function deleteHtmlDom(siteId) {
 87    const envelope = readEnvelope(siteId);
 88    if (!envelope.html_dom) return false;
 89    delete envelope.html_dom;
 90    if (Object.keys(envelope).length === 0) {
 91      deleteFile(siteId);
 92    } else {
 93      writeEnvelope(siteId, envelope);
 94    }
 95    return true;
 96  }
 97  
 98  // --- key_pages_html ---
 99  
/**
 * Store a site's key_pages_html map in its JSON envelope on disk.
 *
 * The existing envelope is read first and written back with `key_pages_html`
 * replaced, so other keys stored in it are carried over. A missing map or one
 * without any keys is ignored and nothing is written.
 *
 * @param {number} siteId
 * @param {object} keyPages - { url: html } map
 */
function writeKeyPagesHtml(siteId, keyPages) {
  const hasPages = Boolean(keyPages) && Object.keys(keyPages).length > 0;
  if (hasPages) {
    const stored = readEnvelope(siteId);
    stored.key_pages_html = keyPages;
    writeEnvelope(siteId, stored);
  }
}
111  
/**
 * Fetch a site's stored key_pages_html map.
 *
 * @param {number} siteId
 * @returns {object|null} The stored { url: html } map, or null when the
 *   envelope has no key_pages_html (or its value is falsy)
 */
function readKeyPagesHtml(siteId) {
  const pages = readEnvelope(siteId).key_pages_html;
  return pages ? pages : null;
}
120  
/**
 * Drop key_pages_html from a site's envelope, leaving any other stored keys
 * (such as html_dom) untouched.
 *
 * When key_pages_html was the only key, the site's file is removed instead
 * of being rewritten as an empty envelope.
 *
 * @param {number} siteId
 * @returns {boolean} true if a truthy key_pages_html was present and has been
 *   dropped, false if there was nothing to delete
 */
function deleteKeyPagesHtml(siteId) {
  const stored = readEnvelope(siteId);
  if (!stored.key_pages_html) {
    return false;
  }

  delete stored.key_pages_html;
  const hasOtherEntries = Object.keys(stored).length > 0;
  if (hasOtherEntries) {
    writeEnvelope(siteId, stored);
  } else {
    deleteFile(siteId);
  }
  return true;
}
138  
/**
 * Delete the entire HTML file for a site (html_dom and key_pages_html together).
 *
 * Delegates to deleteFile, which ignores any error raised while resolving or
 * removing the file, so this call does not report whether a file was removed.
 *
 * @param {number} siteId
 */
function deleteAllHtml(siteId) {
  deleteFile(siteId);
}
146  
/**
 * Remove the JSON file for a site, best-effort.
 *
 * Every error is swallowed: not only "file already gone", but also an invalid
 * siteId or a failed unlink (e.g. permissions). Callers get no signal either way.
 *
 * @param {number} siteId
 */
function deleteFile(siteId) {
  try {
    const target = sitePath(siteId);
    unlinkSync(target);
  } catch {
    // Intentionally ignored; the usual case is that the file is already gone.
  }
}
155  
// Public API: html_dom and key_pages_html accessors, whole-file deletion,
// and the storage directory constant.
export {
  writeHtmlDom,
  readHtmlDom,
  hasHtmlDom,
  deleteHtmlDom,
  writeKeyPagesHtml,
  readKeyPagesHtml,
  deleteKeyPagesHtml,
  deleteAllHtml,
  DATA_DIR,
};