// html-storage.js
/**
 * Filesystem-backed HTML storage for sites.
 *
 * Stores html_dom and key_pages_html in a single JSON file per site:
 *   data/html/{site_id}.json
 *
 * Schema: { "html_dom": "...", "key_pages_html": { url: html, ... } }
 *
 * This reduces DB size by ~7 GB and dramatically improves query performance
 * on the sites table. Consistent layout with 2Step's data/html/{id}.json.
 */

import { readFileSync, writeFileSync, mkdirSync, unlinkSync, existsSync } from 'fs';
import { join, dirname } from 'path';

/**
 * Directory holding one `{site_id}.json` file per site.
 * Resolved against the process working directory once, when this module loads.
 */
const DATA_DIR = join(process.cwd(), 'data', 'html');

/**
 * Build the path of a site's HTML JSON file.
 *
 * The id is coerced with Number() and must be a positive safe integer, so the
 * resulting file name is always purely numeric and distinct ids can never
 * collapse onto the same file through floating-point rounding.
 *
 * @param {number|string} siteId
 * @returns {string} Absolute path `<DATA_DIR>/<id>.json`
 * @throws {Error} `Invalid siteId: …` when the id is not a positive safe integer
 */
function sitePath(siteId) {
  const id = Number(siteId);
  if (!Number.isSafeInteger(id) || id <= 0) {
    throw new Error(`Invalid siteId: ${siteId}`);
  }
  return join(DATA_DIR, `${id}.json`);
}

/**
 * Read the full JSON envelope for a site.
 *
 * Best-effort by design: a missing or unreadable file, invalid JSON, or an
 * invalid siteId all yield an empty envelope instead of throwing.
 *
 * @param {number|string} siteId
 * @returns {object} The parsed envelope, or `{}` when nothing usable is stored
 */
function readEnvelope(siteId) {
  let parsed;
  try {
    parsed = JSON.parse(readFileSync(sitePath(siteId), 'utf8'));
  } catch {
    return {};
  }
  // Valid JSON is not necessarily an object: `null` or a string would make
  // property access/assignment throw, and properties set on an array are
  // silently dropped by JSON.stringify. Treat all of those as "no envelope".
  const isPlainObject =
    parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed);
  return isPlainObject ? parsed : {};
}

/**
 * Write the JSON envelope back to disk, creating the data directory if needed.
 *
 * @param {number|string} siteId
 * @param {object} data - Envelope to serialize
 * @throws {Error} `Failed to write html storage for site …`; the underlying
 *   error is attached as `cause`
 */
function writeEnvelope(siteId, data) {
  try {
    const filePath = sitePath(siteId);
    mkdirSync(dirname(filePath), { recursive: true });
    writeFileSync(filePath, JSON.stringify(data), 'utf8');
  } catch (err) {
    throw new Error(
      `Failed to write html storage for site ${siteId}: ${err.message}`,
      { cause: err },
    );
  }
}

/**
 * Remove the JSON file for a site.
 * Best-effort: any failure (typically the file already being gone) is ignored.
 *
 * @param {number|string} siteId
 */
function deleteFile(siteId) {
  try {
    unlinkSync(sitePath(siteId));
  } catch {
    // Already gone
  }
}

/**
 * Drop one field from a site's envelope, keeping the other field if present.
 * Removes the file entirely when nothing remains.
 *
 * @param {number|string} siteId
 * @param {string} field - Envelope key to remove
 * @returns {boolean} true if the field held a truthy value and was removed
 */
function removeField(siteId, field) {
  const envelope = readEnvelope(siteId);
  if (!envelope[field]) return false;

  delete envelope[field];
  if (Object.keys(envelope).length === 0) {
    deleteFile(siteId);
  } else {
    writeEnvelope(siteId, envelope);
  }
  return true;
}

// --- html_dom ---

/**
 * Write html_dom to filesystem. A falsy `html` is a no-op.
 * Any key_pages_html already stored for the site is preserved.
 *
 * @param {number} siteId
 * @param {string} html - Raw HTML content
 * @throws {Error} When the envelope cannot be written (including invalid siteId)
 */
function writeHtmlDom(siteId, html) {
  if (!html) return;
  const envelope = readEnvelope(siteId);
  envelope.html_dom = html;
  writeEnvelope(siteId, envelope);
}

/**
 * Read html_dom from filesystem.
 *
 * @param {number} siteId
 * @returns {string|null} HTML content or null if not found
 */
function readHtmlDom(siteId) {
  return readEnvelope(siteId).html_dom || null;
}

/**
 * Check if html_dom exists on filesystem.
 *
 * @param {number} siteId
 * @returns {boolean}
 * @throws {Error} `Invalid siteId: …` — unlike the readers, the path is
 *   resolved here outside of any try/catch
 */
function hasHtmlDom(siteId) {
  if (!existsSync(sitePath(siteId))) return false;
  return !!readEnvelope(siteId).html_dom;
}

/**
 * Delete html_dom from the envelope (keeps key_pages_html if present).
 * Removes the file entirely if nothing remains.
 *
 * @param {number} siteId
 * @returns {boolean} true if html_dom was present and removed
 */
function deleteHtmlDom(siteId) {
  return removeField(siteId, 'html_dom');
}

// --- key_pages_html ---

/**
 * Write key_pages_html to filesystem. A missing or empty map is a no-op.
 * Any html_dom already stored for the site is preserved.
 *
 * @param {number} siteId
 * @param {object} keyPages - { url: html } map
 * @throws {Error} When the envelope cannot be written (including invalid siteId)
 */
function writeKeyPagesHtml(siteId, keyPages) {
  if (!keyPages || Object.keys(keyPages).length === 0) return;
  const envelope = readEnvelope(siteId);
  envelope.key_pages_html = keyPages;
  writeEnvelope(siteId, envelope);
}

/**
 * Read key_pages_html from filesystem.
 *
 * @param {number} siteId
 * @returns {object|null} { url: html } map or null
 */
function readKeyPagesHtml(siteId) {
  return readEnvelope(siteId).key_pages_html || null;
}

/**
 * Delete key_pages_html from the envelope (keeps html_dom if present).
 * Removes the file entirely if nothing remains.
 *
 * @param {number} siteId
 * @returns {boolean} true if key_pages_html was present and removed
 */
function deleteKeyPagesHtml(siteId) {
  return removeField(siteId, 'key_pages_html');
}

/**
 * Delete the entire HTML file for a site (dom + key-pages).
 *
 * @param {number} siteId
 */
function deleteAllHtml(siteId) {
  deleteFile(siteId);
}

export {
  writeHtmlDom,
  readHtmlDom,
  hasHtmlDom,
  deleteHtmlDom,
  writeKeyPagesHtml,
  readKeyPagesHtml,
  deleteKeyPagesHtml,
  deleteAllHtml,
  DATA_DIR,
};