html-storage.test.js
/**
 * Tests for src/utils/html-storage.js
 *
 * Covers every export:
 *   - writeHtmlDom / readHtmlDom / hasHtmlDom / deleteHtmlDom
 *   - writeKeyPagesHtml / readKeyPagesHtml / deleteKeyPagesHtml
 *   - deleteAllHtml
 *   - DATA_DIR
 *
 * and these behaviors:
 *   - Invalid siteId validation
 *   - Envelope isolation (html_dom and key_pages_html coexist)
 *   - Idempotent deletes (returns false when already absent)
 *
 * Isolation strategy: html-storage.js does not read an env var for its
 * storage path; DATA_DIR is computed as join(process.cwd(), 'data', 'html')
 * at module load time. To avoid touching any real data directory we create a
 * temp directory, chdir into it BEFORE the dynamic import so the module picks
 * up TEMP_CWD/data/html, and restore the original cwd right after the import.
 */

import { describe, test, before, after, beforeEach } from 'node:test';
import assert from 'node:assert/strict';
import { mkdtempSync, rmSync, existsSync, readFileSync, writeFileSync, mkdirSync } from 'node:fs';
import { join } from 'node:path';
import { tmpdir } from 'node:os';

// ─── Temp working directory setup ────────────────────────────────────────────

const TEMP_CWD = mkdtempSync(join(tmpdir(), 'html-storage-test-'));
const HTML_DIR = join(TEMP_CWD, 'data', 'html');
mkdirSync(HTML_DIR, { recursive: true });

// Redirect cwd so DATA_DIR = TEMP_CWD/data/html
const ORIGINAL_CWD = process.cwd();
process.chdir(TEMP_CWD);

let storage;
try {
  storage = await import('../../src/utils/html-storage.js');
} finally {
  // Restore cwd even when the import rejects, so a load failure cannot leave
  // the process stranded inside the temp directory.
  process.chdir(ORIGINAL_CWD);
}

const {
  writeHtmlDom,
  readHtmlDom,
  hasHtmlDom,
  deleteHtmlDom,
  writeKeyPagesHtml,
  readKeyPagesHtml,
  deleteKeyPagesHtml,
  deleteAllHtml,
  DATA_DIR,
} = storage;

// ─── Helpers ─────────────────────────────────────────────────────────────────

const SITE_ID = 99801;

/**
 * Path of the JSON file the tests expect for a site inside the temp dir.
 * @param {number} [id] - Site id; defaults to SITE_ID.
 * @returns {string} Absolute path to `<HTML_DIR>/<id>.json`.
 */
function filePath(id = SITE_ID) {
  return join(HTML_DIR, `${id}.json`);
}

/**
 * Best-effort removal of a site's file so each test starts from a clean slate.
 * @param {number} [id] - Site id; defaults to SITE_ID.
 */
function cleanup(id = SITE_ID) {
  try {
    rmSync(filePath(id), { force: true });
  } catch {
    /* ignore — cleanup must never fail a test */
  }
}

// ─── Suite ───────────────────────────────────────────────────────────────────

describe('html-storage', () => {
  after(() => {
    try {
      rmSync(TEMP_CWD, { recursive: true, force: true });
    } catch {
      /* ignore — temp dir removal is best-effort */
    }
  });

  beforeEach(() => cleanup());

  // ─── DATA_DIR ───────────────────────────────────────────────────────────────

  describe('DATA_DIR', () => {
    test('is a non-empty string', () => {
      assert.ok(typeof DATA_DIR === 'string' && DATA_DIR.length > 0);
    });

    test('ends with path segment "html"', () => {
      assert.ok(DATA_DIR.endsWith('html'), `Expected DATA_DIR to end with 'html', got: ${DATA_DIR}`);
    });
  });

  // ─── Invalid siteId ────────────────────────────────────────────────────────

  describe('siteId validation', () => {
    test('writeHtmlDom throws for zero siteId', () => {
      assert.throws(() => writeHtmlDom(0, '<html>'), /Invalid siteId/);
    });

    test('writeHtmlDom throws for negative siteId', () => {
      assert.throws(() => writeHtmlDom(-1, '<html>'), /Invalid siteId/);
    });

    test('writeHtmlDom throws for non-numeric string', () => {
      assert.throws(() => writeHtmlDom('abc', '<html>'), /Invalid siteId/);
    });

    test('writeHtmlDom throws for float siteId', () => {
      assert.throws(() => writeHtmlDom(1.5, '<html>'), /Invalid siteId/);
    });

    test('readHtmlDom returns null for invalid siteId (catches error)', () => {
      // Unlike the write path, the read path does not surface the validation
      // error: it reports "nothing stored" instead.
      // NOTE(review): presumably the module's internal read helper swallows
      // the error — confirm against html-storage.js.
      const result = readHtmlDom(0);
      assert.equal(result, null);
    });

    test('hasHtmlDom throws for invalid siteId', () => {
      // hasHtmlDom does not swallow the validation error the way readHtmlDom
      // does, so an invalid id propagates as an exception rather than `false`.
      assert.throws(() => hasHtmlDom(0), /Invalid siteId/);
    });

    test('accepts numeric string siteId (coerced to integer)', () => {
      assert.doesNotThrow(() => writeHtmlDom(String(SITE_ID), '<html>test</html>'));
      cleanup();
    });

    test('accepts valid positive integer', () => {
      assert.doesNotThrow(() => writeHtmlDom(SITE_ID, '<html>valid</html>'));
      cleanup();
    });
  });

  // ─── writeHtmlDom / readHtmlDom ─────────────────────────────────────────────

  describe('writeHtmlDom / readHtmlDom', () => {
    test('writes html and reads it back', () => {
      writeHtmlDom(SITE_ID, '<html><body>Hello</body></html>');
      const result = readHtmlDom(SITE_ID);
      assert.equal(result, '<html><body>Hello</body></html>');
    });

    test('creates the file on disk', () => {
      writeHtmlDom(SITE_ID, '<p>test</p>');
      assert.ok(existsSync(filePath()), 'file should exist after write');
    });

    test('returns null when file does not exist', () => {
      assert.equal(readHtmlDom(SITE_ID), null);
    });

    test('overwrites previous html_dom content', () => {
      writeHtmlDom(SITE_ID, '<p>first</p>');
      writeHtmlDom(SITE_ID, '<p>second</p>');
      assert.equal(readHtmlDom(SITE_ID), '<p>second</p>');
    });

    test('is a no-op when html is falsy', () => {
      writeHtmlDom(SITE_ID, null);
      assert.ok(!existsSync(filePath()), 'file should not be created for null html');
      writeHtmlDom(SITE_ID, '');
      assert.ok(!existsSync(filePath()), 'file should not be created for empty string');
    });

    test('stores large HTML without truncation', () => {
      const bigHtml = `<p>${'x'.repeat(50000)}</p>`;
      writeHtmlDom(SITE_ID, bigHtml);
      const result = readHtmlDom(SITE_ID);
      assert.equal(result.length, bigHtml.length);
      // Length alone would miss same-length corruption; compare content too.
      // assert.ok keeps the failure message short for a 50 KB string.
      assert.ok(result === bigHtml, 'large HTML should round-trip unchanged');
    });
  });

  // ─── hasHtmlDom ─────────────────────────────────────────────────────────────

  describe('hasHtmlDom', () => {
    test('returns true when html_dom exists', () => {
      writeHtmlDom(SITE_ID, '<p>exists</p>');
      assert.equal(hasHtmlDom(SITE_ID), true);
    });

    test('returns false when file does not exist', () => {
      assert.equal(hasHtmlDom(SITE_ID), false);
    });

    test('returns false when file exists but html_dom is absent', () => {
      // Write a file with only key_pages_html
      writeFileSync(filePath(), JSON.stringify({ key_pages_html: { 'https://x.com': '<p>page</p>' } }), 'utf8');
      assert.equal(hasHtmlDom(SITE_ID), false);
    });
  });

  // ─── deleteHtmlDom ──────────────────────────────────────────────────────────

  describe('deleteHtmlDom', () => {
    test('returns true and removes html_dom', () => {
      writeHtmlDom(SITE_ID, '<p>delete me</p>');
      const result = deleteHtmlDom(SITE_ID);
      assert.equal(result, true);
      assert.equal(readHtmlDom(SITE_ID), null);
    });

    test('removes entire file when nothing else remains', () => {
      writeHtmlDom(SITE_ID, '<p>only html</p>');
      deleteHtmlDom(SITE_ID);
      assert.ok(!existsSync(filePath()), 'file should be removed when envelope is empty');
    });

    test('keeps key_pages_html when deleting html_dom', () => {
      writeHtmlDom(SITE_ID, '<p>dom</p>');
      writeKeyPagesHtml(SITE_ID, { 'https://example.com/about': '<p>about</p>' });
      deleteHtmlDom(SITE_ID);
      assert.ok(existsSync(filePath()), 'file should still exist (key_pages_html remains)');
      assert.equal(readHtmlDom(SITE_ID), null);
      assert.ok(readKeyPagesHtml(SITE_ID) !== null, 'key_pages_html should be preserved');
    });

    test('returns false when html_dom was not present', () => {
      assert.equal(deleteHtmlDom(SITE_ID), false);
    });

    test('is idempotent — second delete returns false', () => {
      writeHtmlDom(SITE_ID, '<p>dom</p>');
      deleteHtmlDom(SITE_ID);
      assert.equal(deleteHtmlDom(SITE_ID), false);
    });
  });

  // ─── writeKeyPagesHtml / readKeyPagesHtml ────────────────────────────────────

  describe('writeKeyPagesHtml / readKeyPagesHtml', () => {
    test('writes and reads key pages map', () => {
      const pages = {
        'https://example.com/': '<html>home</html>',
        'https://example.com/about': '<html>about</html>',
      };
      writeKeyPagesHtml(SITE_ID, pages);
      const result = readKeyPagesHtml(SITE_ID);
      assert.deepEqual(result, pages);
    });

    test('creates the file on disk', () => {
      writeKeyPagesHtml(SITE_ID, { 'https://x.com': '<p>x</p>' });
      assert.ok(existsSync(filePath()));
    });

    test('returns null when file does not exist', () => {
      assert.equal(readKeyPagesHtml(SITE_ID), null);
    });

    test('overwrites previous key_pages_html', () => {
      writeKeyPagesHtml(SITE_ID, { 'https://a.com': '<p>a</p>' });
      writeKeyPagesHtml(SITE_ID, { 'https://b.com': '<p>b</p>' });
      const result = readKeyPagesHtml(SITE_ID);
      assert.ok('https://b.com' in result);
      assert.ok(!('https://a.com' in result));
    });

    test('is a no-op for null or empty map', () => {
      writeKeyPagesHtml(SITE_ID, null);
      assert.ok(!existsSync(filePath()), 'file should not be created for null');
      writeKeyPagesHtml(SITE_ID, {});
      assert.ok(!existsSync(filePath()), 'file should not be created for empty object');
    });

    test('preserves existing html_dom when writing key_pages_html', () => {
      writeHtmlDom(SITE_ID, '<p>dom</p>');
      writeKeyPagesHtml(SITE_ID, { 'https://x.com': '<p>page</p>' });
      assert.equal(readHtmlDom(SITE_ID), '<p>dom</p>');
      assert.ok(readKeyPagesHtml(SITE_ID) !== null);
    });
  });

  // ─── deleteKeyPagesHtml ──────────────────────────────────────────────────────

  describe('deleteKeyPagesHtml', () => {
    test('returns true and removes key_pages_html', () => {
      writeKeyPagesHtml(SITE_ID, { 'https://x.com': '<p>x</p>' });
      const result = deleteKeyPagesHtml(SITE_ID);
      assert.equal(result, true);
      assert.equal(readKeyPagesHtml(SITE_ID), null);
    });

    test('removes entire file when nothing else remains', () => {
      writeKeyPagesHtml(SITE_ID, { 'https://x.com': '<p>x</p>' });
      deleteKeyPagesHtml(SITE_ID);
      assert.ok(!existsSync(filePath()), 'file should be removed when envelope is empty');
    });

    test('preserves html_dom when deleting key_pages_html', () => {
      writeHtmlDom(SITE_ID, '<p>dom</p>');
      writeKeyPagesHtml(SITE_ID, { 'https://x.com': '<p>x</p>' });
      deleteKeyPagesHtml(SITE_ID);
      assert.ok(existsSync(filePath()), 'file should remain (html_dom still present)');
      assert.equal(readHtmlDom(SITE_ID), '<p>dom</p>');
      assert.equal(readKeyPagesHtml(SITE_ID), null);
    });

    test('returns false when key_pages_html was not present', () => {
      assert.equal(deleteKeyPagesHtml(SITE_ID), false);
    });

    test('is idempotent — second delete returns false', () => {
      writeKeyPagesHtml(SITE_ID, { 'https://x.com': '<p>x</p>' });
      deleteKeyPagesHtml(SITE_ID);
      assert.equal(deleteKeyPagesHtml(SITE_ID), false);
    });
  });

  // ─── deleteAllHtml ──────────────────────────────────────────────────────────

  describe('deleteAllHtml', () => {
    test('removes the entire file', () => {
      writeHtmlDom(SITE_ID, '<p>dom</p>');
      writeKeyPagesHtml(SITE_ID, { 'https://x.com': '<p>page</p>' });
      deleteAllHtml(SITE_ID);
      assert.ok(!existsSync(filePath()), 'file should be gone after deleteAllHtml');
    });

    test('does not throw when file does not exist', () => {
      assert.doesNotThrow(() => deleteAllHtml(SITE_ID));
    });

    test('after deleteAllHtml, readHtmlDom returns null', () => {
      writeHtmlDom(SITE_ID, '<p>dom</p>');
      deleteAllHtml(SITE_ID);
      assert.equal(readHtmlDom(SITE_ID), null);
    });

    test('after deleteAllHtml, readKeyPagesHtml returns null', () => {
      writeKeyPagesHtml(SITE_ID, { 'https://x.com': '<p>x</p>' });
      deleteAllHtml(SITE_ID);
      assert.equal(readKeyPagesHtml(SITE_ID), null);
    });
  });

  // ─── Envelope coexistence ───────────────────────────────────────────────────

  describe('envelope isolation — html_dom and key_pages_html coexist', () => {
    test('both fields survive independent writes', () => {
      writeHtmlDom(SITE_ID, '<p>dom content</p>');
      writeKeyPagesHtml(SITE_ID, { 'https://example.com/': '<p>home</p>' });

      const rawEnvelope = JSON.parse(readFileSync(filePath(), 'utf8'));
      assert.ok('html_dom' in rawEnvelope, 'html_dom should be present');
      assert.ok('key_pages_html' in rawEnvelope, 'key_pages_html should be present');
    });

    test('envelope JSON is valid and minimal', () => {
      writeHtmlDom(SITE_ID, '<p>dom</p>');
      const raw = readFileSync(filePath(), 'utf8');
      const parsed = JSON.parse(raw);
      assert.equal(Object.keys(parsed).length, 1);
      assert.equal(parsed.html_dom, '<p>dom</p>');
    });

    test('different site IDs have independent files', () => {
      const OTHER_ID = 99802;
      try {
        writeHtmlDom(SITE_ID, '<p>site 1</p>');
        writeHtmlDom(OTHER_ID, '<p>site 2</p>');
        assert.equal(readHtmlDom(SITE_ID), '<p>site 1</p>');
        assert.equal(readHtmlDom(OTHER_ID), '<p>site 2</p>');
      } finally {
        // beforeEach only cleans SITE_ID, so the second site is removed here.
        cleanup(OTHER_ID);
      }
    });
  });
});