/ tests / cron / cleanup-html-dom.test.js
cleanup-html-dom.test.js
  1  /**
  2   * Tests for cleanup-html-dom cron job
  3   *
 * cleanup-html-dom opens the DB at module load and closes it in runCleanup().
 * Because of that, runCleanup() can only be called once per process. All test
 * data is set up first, runCleanup() is called exactly once, and every test
 * then asserts against the outcome of that single run.
  7   */
  8  
  9  import { test, describe, before } from 'node:test';
 10  import assert from 'node:assert/strict';
 11  import Database from 'better-sqlite3';
 12  import { createPgMock } from '../helpers/pg-mock.js';
 13  import { mock } from 'node:test';
 14  
 15  // ─── Create in-memory test DB ─────────────────────────────────────────────────
 16  
 17  const db = new Database(':memory:');
 18  
 19  db.exec(`
 20    CREATE TABLE sites (
 21      id INTEGER PRIMARY KEY AUTOINCREMENT,
 22      domain TEXT NOT NULL DEFAULT 'example.com',
 23      landing_page_url TEXT,
 24      status TEXT NOT NULL DEFAULT 'found',
 25      grade TEXT,
 26      keyword TEXT,
 27      html_dom TEXT,
 28      contacts_json TEXT,
 29      score REAL,
 30      error_message TEXT,
 31      updated_at DATETIME DEFAULT CURRENT_TIMESTAMP,
 32      rescored_at DATETIME
 33    );
 34  `);
 35  
 36  // ─── Insert test data BEFORE mocking/importing ────────────────────────────────
 37  
 38  // 1. Ignored site — should have html_dom cleared
 39  db.prepare(
 40    `INSERT INTO sites (id, domain, landing_page_url, status, keyword, html_dom)
 41     VALUES (1, 'ignored.com', 'https://ignored.com', 'ignored', 'kw', '<html>ignored</html>')`
 42  ).run();
 43  
 44  // 2. High-scoring site (A) — should have html_dom cleared
 45  db.prepare(
 46    `INSERT INTO sites (id, domain, landing_page_url, status, grade, keyword, html_dom)
 47     VALUES (2, 'high.com', 'https://high.com', 'prog_scored', 'A', 'kw', '<html>high</html>')`
 48  ).run();
 49  
 50  // 3. High-scoring site (B+) — should have html_dom cleared
 51  db.prepare(
 52    `INSERT INTO sites (id, domain, landing_page_url, status, grade, keyword, html_dom)
 53     VALUES (3, 'bplus.com', 'https://bplus.com', 'prog_scored', 'B+', 'kw', '<html>bplus</html>')`
 54  ).run();
 55  
 56  // 4. Low-scoring site (B-) — should KEEP html_dom
 57  db.prepare(
 58    `INSERT INTO sites (id, domain, landing_page_url, status, grade, keyword, html_dom)
 59     VALUES (4, 'low.com', 'https://low.com', 'prog_scored', 'B-', 'kw', '<html>low</html>')`
 60  ).run();
 61  
 62  // 5. Post-enriched site — should have html_dom cleared
 63  db.prepare(
 64    `INSERT INTO sites (id, domain, landing_page_url, status, keyword, html_dom)
 65     VALUES (5, 'enriched.com', 'https://enriched.com', 'enriched', 'kw', '<html>enriched</html>')`
 66  ).run();
 67  
 68  // 6. Gov domain — should be marked ignore + html_dom cleared
 69  db.prepare(
 70    `INSERT INTO sites (id, domain, landing_page_url, status, keyword, html_dom)
 71     VALUES (6, 'example.gov', 'https://example.gov', 'found', 'kw', '<html>gov</html>')`
 72  ).run();
 73  
 74  // 7. Site with already-null html_dom — no-op
 75  db.prepare(
 76    `INSERT INTO sites (id, domain, landing_page_url, status, keyword, html_dom)
 77     VALUES (7, 'nonull.com', 'https://nonull.com', 'ignored', 'kw', NULL)`
 78  ).run();
 79  
 80  // 8. Rescored site — should KEEP html_dom (enrich still needs it)
 81  db.prepare(
 82    `INSERT INTO sites (id, domain, landing_page_url, status, keyword, html_dom)
 83     VALUES (8, 'rescored.com', 'https://rescored.com', 'semantic_scored', 'kw', '<html>rescored</html>')`
 84  ).run();
 85  
 86  // ─── Mock db.js BEFORE importing cleanup-html-dom.js ─────────────────────────
 87  
 88  mock.module('../../src/utils/db.js', {
 89    namedExports: createPgMock(db),
 90  });
 91  
 92  mock.module('../../src/utils/logger.js', {
 93    defaultExport: class {
 94      info() {}
 95      warn() {}
 96      error() {}
 97      success() {}
 98      debug() {}
 99    },
100  });
101  
102  mock.module('../../src/utils/html-storage.js', {
103    namedExports: {
104      deleteHtmlDom: () => {},
105      hasHtmlDom: () => false,
106    },
107  });
108  
109  mock.module('../../src/utils/contacts-storage.js', {
110    namedExports: {
111      getContactsDataWithFallback: () => null,
112    },
113  });
114  
115  // ─── Import AFTER mock.module ─────────────────────────────────────────────────
116  
// Dynamic import: the module under test is loaded only at this point in the
// file, after the mock.module() registrations have run.
const cleanupModule = await import('../../src/cron/cleanup-html-dom.js');
const runCleanup = cleanupModule.runCleanup;

// ─── Run cleanup ONCE and capture result ─────────────────────────────────────

// Result of the single runCleanup() call; populated by the root-level hook
// below and read by every test in the suite.
let cleanupResult;

before(async function runCleanupOnce() {
  cleanupResult = await runCleanup();
});
126  
127  // ─── Tests ───────────────────────────────────────────────────────────────────
128  
// All tests assert against the single `cleanupResult` captured by the
// root-level before() hook and against the rows left in the in-memory `db`.
describe('runCleanup (cleanup-html-dom)', () => {
  test('returns success:true with correct shape', () => {
    const numericFields = [
      'govEduCleaned',
      'ignoredCleaned',
      'highScoreCleaned',
      'postEnrichedCleaned',
      'totalCleaned',
    ];

    assert.strictEqual(cleanupResult.success, true);
    for (const field of numericFields) {
      assert.ok(typeof cleanupResult[field] === 'number');
    }
    assert.ok('savedFiles' in cleanupResult);
  });

  test('clears html_dom from ignored sites', () => {
    const { html_dom: htmlDom } = db.prepare('SELECT html_dom FROM sites WHERE id=1').get();

    assert.strictEqual(htmlDom, null, 'Ignored site should have html_dom cleared to NULL');
    assert.ok(cleanupResult.ignoredCleaned >= 1);
  });

  test('clears html_dom from high-scoring sites (A, B+)', () => {
    const { html_dom: gradeAHtml } = db.prepare('SELECT html_dom FROM sites WHERE id=2').get();
    const { html_dom: gradeBPlusHtml } = db.prepare('SELECT html_dom FROM sites WHERE id=3').get();

    assert.strictEqual(gradeAHtml, null, 'Grade A site should have html_dom cleared to NULL');
    assert.strictEqual(gradeBPlusHtml, null, 'Grade B+ site should have html_dom cleared to NULL');
    assert.ok(cleanupResult.highScoreCleaned >= 2);
  });

  test('preserves html_dom for low-scoring sites (B-)', () => {
    const { html_dom: htmlDom } = db.prepare('SELECT html_dom FROM sites WHERE id=4').get();

    assert.notStrictEqual(htmlDom, null, 'B- site should keep html_dom');
  });

  test('clears html_dom from post-enriched sites (but not rescored)', () => {
    const { html_dom: htmlDom } = db.prepare('SELECT html_dom FROM sites WHERE id=5').get();

    // Per the original author's note: the enriched site has no contacts_json
    // (NULL), so a ">= 3 usable contacts" guard is expected to prevent cleanup
    // and html_dom should be retained for re-enrichment.
    // NOTE(review): the title says "clears", yet this check only rules out the
    // sentinel string and passes for both the original HTML and NULL — confirm
    // the intended behaviour and tighten the assertion accordingly.
    assert.notStrictEqual(
      htmlDom,
      'HTML removed after scoring',
      'Enriched site with no contacts should retain html_dom for re-enrichment'
    );
  });

  test('preserves html_dom for rescored sites (enrich needs it)', () => {
    const { html_dom: htmlDom } = db.prepare('SELECT html_dom FROM sites WHERE id=8').get();

    assert.notStrictEqual(htmlDom, null, 'Rescored site should keep html_dom for enrich');
  });

  test('marks gov domains as ignore and clears html_dom', () => {
    const { status, html_dom: htmlDom } = db
      .prepare('SELECT status, html_dom FROM sites WHERE id=6')
      .get();

    assert.strictEqual(status, 'ignored', 'Gov site should be marked ignored');
    assert.strictEqual(htmlDom, null, 'Gov site should have html_dom cleared to NULL');
    assert.ok(cleanupResult.govEduCleaned >= 1);
  });

  test('totalCleaned equals sum of all categories', () => {
    const {
      govEduCleaned,
      ignoredCleaned,
      highScoreCleaned,
      postEnrichedCleaned,
      totalCleaned,
    } = cleanupResult;

    const categorySum = govEduCleaned + ignoredCleaned + highScoreCleaned + postEnrichedCleaned;
    assert.strictEqual(totalCleaned, categorySum);
  });

  test('savedFiles is non-negative', () => {
    const { savedFiles } = cleanupResult;

    assert.ok(typeof savedFiles === 'number', 'savedFiles should be a number');
    assert.ok(savedFiles >= 0, `savedFiles should be >= 0, got ${savedFiles}`);
  });
});