/ tests / utils / html-storage.test.js
html-storage.test.js
  1  /**
  2   * Tests for src/utils/html-storage.js
  3   *
  4   * Tests all exports:
  5   *   - writeHtmlDom / readHtmlDom / hasHtmlDom / deleteHtmlDom
  6   *   - writeKeyPagesHtml / readKeyPagesHtml / deleteKeyPagesHtml
  7   *   - deleteAllHtml
  8   *   - DATA_DIR export
  9   *   - Invalid siteId validation
 10   *   - Envelope isolation (html_dom and key_pages_html coexist)
 11   *   - Idempotent deletes (returns false when already absent)
 12   *
 * Uses a dedicated temp directory to avoid touching any real data directory.
 *
 * html-storage.js does not read an environment variable for its path:
 * DATA_DIR is computed as join(process.cwd(), 'data', 'html') at module load
 * time. We therefore create a temp directory with the same data/html
 * structure, chdir into it BEFORE the dynamic import so the module picks up
 * our temp directory, and restore the original cwd right after the import.
 23   */
 24  
 25  import { describe, test, before, after, beforeEach } from 'node:test';
 26  import assert from 'node:assert/strict';
 27  import { mkdtempSync, rmSync, existsSync, readFileSync, writeFileSync, mkdirSync } from 'fs';
 28  import { join } from 'path';
 29  import { tmpdir } from 'os';
 30  
 31  // ─── Temp working directory setup ────────────────────────────────────────────
 32  
// Scratch tree for this test file: TEMP_CWD is a fresh, uniquely named
// directory under the OS temp dir, and HTML_DIR (TEMP_CWD/data/html) is where
// the tests below expect the per-site JSON files to appear.
const TEMP_CWD = mkdtempSync(join(tmpdir(), 'html-storage-test-'));
const HTML_DIR = join(TEMP_CWD, 'data', 'html');
mkdirSync(HTML_DIR, { recursive: true });

// Redirect cwd so DATA_DIR = TEMP_CWD/data/html
// (assumes html-storage.js derives DATA_DIR from process.cwd() at load time,
// as the file header states — TODO confirm against the module).
const ORIGINAL_CWD = process.cwd();
process.chdir(TEMP_CWD);

// Dynamic import on purpose: static imports are evaluated before any
// statement in this file runs, so only an awaited import() is guaranteed to
// load the module after the chdir above.
const {
  writeHtmlDom,
  readHtmlDom,
  hasHtmlDom,
  deleteHtmlDom,
  writeKeyPagesHtml,
  readKeyPagesHtml,
  deleteKeyPagesHtml,
  deleteAllHtml,
  DATA_DIR,
} = await import('../../src/utils/html-storage.js');

// Restore cwd after import
process.chdir(ORIGINAL_CWD);
 55  
 56  // ─── Helpers ─────────────────────────────────────────────────────────────────
 57  
// Site ID used by default in filePath() and cleanup() and passed to the
// storage functions throughout the suite.
const SITE_ID = 99801;
 59  
 60  function filePath(id = SITE_ID) {
 61    return join(HTML_DIR, `${id}.json`);
 62  }
 63  
 64  function cleanup(id = SITE_ID) {
 65    try { rmSync(filePath(id), { force: true }); } catch { /* ignore */ }
 66  }
 67  
 68  // ─── Suite ───────────────────────────────────────────────────────────────────
 69  
 70  describe('html-storage', () => {
 71    after(() => {
 72      try { rmSync(TEMP_CWD, { recursive: true, force: true }); } catch { /* ignore */ }
 73    });
 74  
 75    beforeEach(() => cleanup());
 76  
 77    // ─── DATA_DIR ───────────────────────────────────────────────────────────────
 78  
 79    describe('DATA_DIR', () => {
 80      test('is a non-empty string', () => {
 81        assert.ok(typeof DATA_DIR === 'string' && DATA_DIR.length > 0);
 82      });
 83  
 84      test('ends with path segment "html"', () => {
 85        assert.ok(DATA_DIR.endsWith('html'), `Expected DATA_DIR to end with 'html', got: ${DATA_DIR}`);
 86      });
 87    });
 88  
 89    // ─── Invalid siteId ────────────────────────────────────────────────────────
 90  
 91    describe('siteId validation', () => {
 92      test('writeHtmlDom throws for zero siteId', () => {
 93        assert.throws(() => writeHtmlDom(0, '<html>'), /Invalid siteId/);
 94      });
 95  
 96      test('writeHtmlDom throws for negative siteId', () => {
 97        assert.throws(() => writeHtmlDom(-1, '<html>'), /Invalid siteId/);
 98      });
 99  
100      test('writeHtmlDom throws for non-numeric string', () => {
101        assert.throws(() => writeHtmlDom('abc', '<html>'), /Invalid siteId/);
102      });
103  
104      test('writeHtmlDom throws for float siteId', () => {
105        assert.throws(() => writeHtmlDom(1.5, '<html>'), /Invalid siteId/);
106      });
107  
108      test('readHtmlDom returns null for invalid siteId (catches error)', () => {
109        // readEnvelope swallows the error and returns {}; html_dom is undefined → null
110        const result = readHtmlDom(0);
111        assert.equal(result, null);
112      });
113  
114      test('hasHtmlDom returns false for invalid siteId (file won\'t exist)', () => {
115        // sitePath throws, existsSync catches → returns false
116        // hasHtmlDom calls existsSync(sitePath(siteId)) — but sitePath throws first
117        // The function will throw because sitePath throws before existsSync is called
118        assert.throws(() => hasHtmlDom(0), /Invalid siteId/);
119      });
120  
121      test('accepts numeric string siteId (coerced to integer)', () => {
122        assert.doesNotThrow(() => writeHtmlDom(String(SITE_ID), '<html>test</html>'));
123        cleanup();
124      });
125  
126      test('accepts valid positive integer', () => {
127        assert.doesNotThrow(() => writeHtmlDom(SITE_ID, '<html>valid</html>'));
128        cleanup();
129      });
130    });
131  
132    // ─── writeHtmlDom / readHtmlDom ─────────────────────────────────────────────
133  
134    describe('writeHtmlDom / readHtmlDom', () => {
135      test('writes html and reads it back', () => {
136        writeHtmlDom(SITE_ID, '<html><body>Hello</body></html>');
137        const result = readHtmlDom(SITE_ID);
138        assert.equal(result, '<html><body>Hello</body></html>');
139      });
140  
141      test('creates the file on disk', () => {
142        writeHtmlDom(SITE_ID, '<p>test</p>');
143        assert.ok(existsSync(filePath()), 'file should exist after write');
144      });
145  
146      test('returns null when file does not exist', () => {
147        assert.equal(readHtmlDom(SITE_ID), null);
148      });
149  
150      test('overwrites previous html_dom content', () => {
151        writeHtmlDom(SITE_ID, '<p>first</p>');
152        writeHtmlDom(SITE_ID, '<p>second</p>');
153        assert.equal(readHtmlDom(SITE_ID), '<p>second</p>');
154      });
155  
156      test('is a no-op when html is falsy', () => {
157        writeHtmlDom(SITE_ID, null);
158        assert.ok(!existsSync(filePath()), 'file should not be created for null html');
159        writeHtmlDom(SITE_ID, '');
160        assert.ok(!existsSync(filePath()), 'file should not be created for empty string');
161      });
162  
163      test('stores large HTML without truncation', () => {
164        const bigHtml = `<p>${  'x'.repeat(50000)  }</p>`;
165        writeHtmlDom(SITE_ID, bigHtml);
166        const result = readHtmlDom(SITE_ID);
167        assert.equal(result.length, bigHtml.length);
168      });
169    });
170  
171    // ─── hasHtmlDom ─────────────────────────────────────────────────────────────
172  
173    describe('hasHtmlDom', () => {
174      test('returns true when html_dom exists', () => {
175        writeHtmlDom(SITE_ID, '<p>exists</p>');
176        assert.equal(hasHtmlDom(SITE_ID), true);
177      });
178  
179      test('returns false when file does not exist', () => {
180        assert.equal(hasHtmlDom(SITE_ID), false);
181      });
182  
183      test('returns false when file exists but html_dom is absent', () => {
184        // Write a file with only key_pages_html
185        writeFileSync(filePath(), JSON.stringify({ key_pages_html: { 'https://x.com': '<p>page</p>' } }), 'utf8');
186        assert.equal(hasHtmlDom(SITE_ID), false);
187      });
188    });
189  
190    // ─── deleteHtmlDom ──────────────────────────────────────────────────────────
191  
192    describe('deleteHtmlDom', () => {
193      test('returns true and removes html_dom', () => {
194        writeHtmlDom(SITE_ID, '<p>delete me</p>');
195        const result = deleteHtmlDom(SITE_ID);
196        assert.equal(result, true);
197        assert.equal(readHtmlDom(SITE_ID), null);
198      });
199  
200      test('removes entire file when nothing else remains', () => {
201        writeHtmlDom(SITE_ID, '<p>only html</p>');
202        deleteHtmlDom(SITE_ID);
203        assert.ok(!existsSync(filePath()), 'file should be removed when envelope is empty');
204      });
205  
206      test('keeps key_pages_html when deleting html_dom', () => {
207        writeHtmlDom(SITE_ID, '<p>dom</p>');
208        writeKeyPagesHtml(SITE_ID, { 'https://example.com/about': '<p>about</p>' });
209        deleteHtmlDom(SITE_ID);
210        assert.ok(existsSync(filePath()), 'file should still exist (key_pages_html remains)');
211        assert.equal(readHtmlDom(SITE_ID), null);
212        assert.ok(readKeyPagesHtml(SITE_ID) !== null, 'key_pages_html should be preserved');
213      });
214  
215      test('returns false when html_dom was not present', () => {
216        assert.equal(deleteHtmlDom(SITE_ID), false);
217      });
218  
219      test('is idempotent — second delete returns false', () => {
220        writeHtmlDom(SITE_ID, '<p>dom</p>');
221        deleteHtmlDom(SITE_ID);
222        assert.equal(deleteHtmlDom(SITE_ID), false);
223      });
224    });
225  
226    // ─── writeKeyPagesHtml / readKeyPagesHtml ────────────────────────────────────
227  
228    describe('writeKeyPagesHtml / readKeyPagesHtml', () => {
229      test('writes and reads key pages map', () => {
230        const pages = {
231          'https://example.com/': '<html>home</html>',
232          'https://example.com/about': '<html>about</html>',
233        };
234        writeKeyPagesHtml(SITE_ID, pages);
235        const result = readKeyPagesHtml(SITE_ID);
236        assert.deepEqual(result, pages);
237      });
238  
239      test('creates the file on disk', () => {
240        writeKeyPagesHtml(SITE_ID, { 'https://x.com': '<p>x</p>' });
241        assert.ok(existsSync(filePath()));
242      });
243  
244      test('returns null when file does not exist', () => {
245        assert.equal(readKeyPagesHtml(SITE_ID), null);
246      });
247  
248      test('overwrites previous key_pages_html', () => {
249        writeKeyPagesHtml(SITE_ID, { 'https://a.com': '<p>a</p>' });
250        writeKeyPagesHtml(SITE_ID, { 'https://b.com': '<p>b</p>' });
251        const result = readKeyPagesHtml(SITE_ID);
252        assert.ok('https://b.com' in result);
253        assert.ok(!('https://a.com' in result));
254      });
255  
256      test('is a no-op for null or empty map', () => {
257        writeKeyPagesHtml(SITE_ID, null);
258        assert.ok(!existsSync(filePath()), 'file should not be created for null');
259        writeKeyPagesHtml(SITE_ID, {});
260        assert.ok(!existsSync(filePath()), 'file should not be created for empty object');
261      });
262  
263      test('preserves existing html_dom when writing key_pages_html', () => {
264        writeHtmlDom(SITE_ID, '<p>dom</p>');
265        writeKeyPagesHtml(SITE_ID, { 'https://x.com': '<p>page</p>' });
266        assert.equal(readHtmlDom(SITE_ID), '<p>dom</p>');
267        assert.ok(readKeyPagesHtml(SITE_ID) !== null);
268      });
269    });
270  
271    // ─── deleteKeyPagesHtml ──────────────────────────────────────────────────────
272  
273    describe('deleteKeyPagesHtml', () => {
274      test('returns true and removes key_pages_html', () => {
275        writeKeyPagesHtml(SITE_ID, { 'https://x.com': '<p>x</p>' });
276        const result = deleteKeyPagesHtml(SITE_ID);
277        assert.equal(result, true);
278        assert.equal(readKeyPagesHtml(SITE_ID), null);
279      });
280  
281      test('removes entire file when nothing else remains', () => {
282        writeKeyPagesHtml(SITE_ID, { 'https://x.com': '<p>x</p>' });
283        deleteKeyPagesHtml(SITE_ID);
284        assert.ok(!existsSync(filePath()), 'file should be removed when envelope is empty');
285      });
286  
287      test('preserves html_dom when deleting key_pages_html', () => {
288        writeHtmlDom(SITE_ID, '<p>dom</p>');
289        writeKeyPagesHtml(SITE_ID, { 'https://x.com': '<p>x</p>' });
290        deleteKeyPagesHtml(SITE_ID);
291        assert.ok(existsSync(filePath()), 'file should remain (html_dom still present)');
292        assert.equal(readHtmlDom(SITE_ID), '<p>dom</p>');
293        assert.equal(readKeyPagesHtml(SITE_ID), null);
294      });
295  
296      test('returns false when key_pages_html was not present', () => {
297        assert.equal(deleteKeyPagesHtml(SITE_ID), false);
298      });
299  
300      test('is idempotent — second delete returns false', () => {
301        writeKeyPagesHtml(SITE_ID, { 'https://x.com': '<p>x</p>' });
302        deleteKeyPagesHtml(SITE_ID);
303        assert.equal(deleteKeyPagesHtml(SITE_ID), false);
304      });
305    });
306  
307    // ─── deleteAllHtml ──────────────────────────────────────────────────────────
308  
309    describe('deleteAllHtml', () => {
310      test('removes the entire file', () => {
311        writeHtmlDom(SITE_ID, '<p>dom</p>');
312        writeKeyPagesHtml(SITE_ID, { 'https://x.com': '<p>page</p>' });
313        deleteAllHtml(SITE_ID);
314        assert.ok(!existsSync(filePath()), 'file should be gone after deleteAllHtml');
315      });
316  
317      test('does not throw when file does not exist', () => {
318        assert.doesNotThrow(() => deleteAllHtml(SITE_ID));
319      });
320  
321      test('after deleteAllHtml, readHtmlDom returns null', () => {
322        writeHtmlDom(SITE_ID, '<p>dom</p>');
323        deleteAllHtml(SITE_ID);
324        assert.equal(readHtmlDom(SITE_ID), null);
325      });
326  
327      test('after deleteAllHtml, readKeyPagesHtml returns null', () => {
328        writeKeyPagesHtml(SITE_ID, { 'https://x.com': '<p>x</p>' });
329        deleteAllHtml(SITE_ID);
330        assert.equal(readKeyPagesHtml(SITE_ID), null);
331      });
332    });
333  
334    // ─── Envelope coexistence ───────────────────────────────────────────────────
335  
336    describe('envelope isolation — html_dom and key_pages_html coexist', () => {
337      test('both fields survive independent writes', () => {
338        writeHtmlDom(SITE_ID, '<p>dom content</p>');
339        writeKeyPagesHtml(SITE_ID, { 'https://example.com/': '<p>home</p>' });
340  
341        const rawEnvelope = JSON.parse(readFileSync(filePath(), 'utf8'));
342        assert.ok('html_dom' in rawEnvelope, 'html_dom should be present');
343        assert.ok('key_pages_html' in rawEnvelope, 'key_pages_html should be present');
344      });
345  
346      test('envelope JSON is valid and minimal', () => {
347        writeHtmlDom(SITE_ID, '<p>dom</p>');
348        const raw = readFileSync(filePath(), 'utf8');
349        const parsed = JSON.parse(raw);
350        assert.equal(Object.keys(parsed).length, 1);
351        assert.equal(parsed.html_dom, '<p>dom</p>');
352      });
353  
354      test('different site IDs have independent files', () => {
355        const OTHER_ID = 99802;
356        try {
357          writeHtmlDom(SITE_ID, '<p>site 1</p>');
358          writeHtmlDom(OTHER_ID, '<p>site 2</p>');
359          assert.equal(readHtmlDom(SITE_ID), '<p>site 1</p>');
360          assert.equal(readHtmlDom(OTHER_ID), '<p>site 2</p>');
361        } finally {
362          cleanup(OTHER_ID);
363        }
364      });
365    });
366  });