cleanup-html-dom.test.js
/**
 * Tests for cleanup-html-dom cron job
 *
 * cleanup-html-dom opens the DB at module load and closes it in runCleanup().
 * This means it can only be called once per process. We set up all test data
 * before calling runCleanup() once, then verify all assertions.
 *
 * Layout of this file (order matters):
 *   1. build an in-memory SQLite DB and insert every fixture row,
 *   2. register module mocks (db, logger, html-storage, contacts-storage),
 *   3. dynamically import the module under test so it picks up the mocks,
 *   4. run the cleanup once in a `before` hook and assert on the DB state.
 */

import { test, describe, before, mock } from 'node:test';
import assert from 'node:assert/strict';
import Database from 'better-sqlite3';
import { createPgMock } from '../helpers/pg-mock.js';

// ─── Create in-memory test DB ─────────────────────────────────────────────────

const db = new Database(':memory:');

db.exec(`
  CREATE TABLE sites (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    domain TEXT NOT NULL DEFAULT 'example.com',
    landing_page_url TEXT,
    status TEXT NOT NULL DEFAULT 'found',
    grade TEXT,
    keyword TEXT,
    html_dom TEXT,
    contacts_json TEXT,
    score REAL,
    error_message TEXT,
    updated_at DATETIME DEFAULT CURRENT_TIMESTAMP,
    rescored_at DATETIME
  );
`);

// ─── Insert test data BEFORE mocking/importing ────────────────────────────────

// 1. Ignored site — should have html_dom cleared
db.prepare(
  `INSERT INTO sites (id, domain, landing_page_url, status, keyword, html_dom)
   VALUES (1, 'ignored.com', 'https://ignored.com', 'ignored', 'kw', '<html>ignored</html>')`
).run();

// 2. High-scoring site (A) — should have html_dom cleared
db.prepare(
  `INSERT INTO sites (id, domain, landing_page_url, status, grade, keyword, html_dom)
   VALUES (2, 'high.com', 'https://high.com', 'prog_scored', 'A', 'kw', '<html>high</html>')`
).run();

// 3. High-scoring site (B+) — should have html_dom cleared
db.prepare(
  `INSERT INTO sites (id, domain, landing_page_url, status, grade, keyword, html_dom)
   VALUES (3, 'bplus.com', 'https://bplus.com', 'prog_scored', 'B+', 'kw', '<html>bplus</html>')`
).run();

// 4. Low-scoring site (B-) — should KEEP html_dom
db.prepare(
  `INSERT INTO sites (id, domain, landing_page_url, status, grade, keyword, html_dom)
   VALUES (4, 'low.com', 'https://low.com', 'prog_scored', 'B-', 'kw', '<html>low</html>')`
).run();

// 5. Post-enriched site with no contacts_json — per the post-enriched test
//    below, html_dom is expected to be retained for re-enrichment.
db.prepare(
  `INSERT INTO sites (id, domain, landing_page_url, status, keyword, html_dom)
   VALUES (5, 'enriched.com', 'https://enriched.com', 'enriched', 'kw', '<html>enriched</html>')`
).run();

// 6. Gov domain — should be marked ignored + html_dom cleared
db.prepare(
  `INSERT INTO sites (id, domain, landing_page_url, status, keyword, html_dom)
   VALUES (6, 'example.gov', 'https://example.gov', 'found', 'kw', '<html>gov</html>')`
).run();

// 7. Site with already-null html_dom — no-op
db.prepare(
  `INSERT INTO sites (id, domain, landing_page_url, status, keyword, html_dom)
   VALUES (7, 'nonull.com', 'https://nonull.com', 'ignored', 'kw', NULL)`
).run();

// 8. Rescored site — should KEEP html_dom (enrich still needs it)
db.prepare(
  `INSERT INTO sites (id, domain, landing_page_url, status, keyword, html_dom)
   VALUES (8, 'rescored.com', 'https://rescored.com', 'semantic_scored', 'kw', '<html>rescored</html>')`
).run();

// ─── Mock db.js BEFORE importing cleanup-html-dom.js ─────────────────────────

mock.module('../../src/utils/db.js', {
  namedExports: createPgMock(db),
});

// Silent logger so test output is not polluted by the cron job's logging.
mock.module('../../src/utils/logger.js', {
  defaultExport: class {
    info() {}
    warn() {}
    error() {}
    success() {}
    debug() {}
  },
});

mock.module('../../src/utils/html-storage.js', {
  namedExports: {
    deleteHtmlDom: () => {},
    hasHtmlDom: () => false,
  },
});

mock.module('../../src/utils/contacts-storage.js', {
  namedExports: {
    getContactsDataWithFallback: () => null,
  },
});

// ─── Import AFTER mock.module ─────────────────────────────────────────────────

const { runCleanup } = await import('../../src/cron/cleanup-html-dom.js');

// ─── Run cleanup ONCE and capture result ─────────────────────────────────────

let cleanupResult;

before(async () => {
  cleanupResult = await runCleanup();
});

// ─── Tests ───────────────────────────────────────────────────────────────────

/** Fetches the columns the assertions care about for a single fixture row. */
const getSite = (id) => db.prepare('SELECT status, html_dom FROM sites WHERE id = ?').get(id);

describe('runCleanup (cleanup-html-dom)', () => {
  test('returns success:true with correct shape', () => {
    assert.strictEqual(cleanupResult.success, true);

    const numericCounters = [
      'govEduCleaned',
      'ignoredCleaned',
      'highScoreCleaned',
      'postEnrichedCleaned',
      'totalCleaned',
    ];
    for (const key of numericCounters) {
      // strictEqual on the typeof string reports the actual type on failure,
      // unlike assert.ok(typeof x === 'number').
      assert.strictEqual(typeof cleanupResult[key], 'number', `${key} should be a number`);
    }
    assert.ok('savedFiles' in cleanupResult);
  });

  test('clears html_dom from ignored sites', () => {
    const site = getSite(1);
    assert.strictEqual(site.html_dom, null, 'Ignored site should have html_dom cleared to NULL');
    assert.ok(cleanupResult.ignoredCleaned >= 1);
  });

  test('leaves already-NULL html_dom untouched', () => {
    const site = getSite(7);
    assert.strictEqual(site.html_dom, null, 'html_dom that was NULL should stay NULL');
    assert.strictEqual(site.status, 'ignored', 'Status of the no-op row should be unchanged');
  });

  test('clears html_dom from high-scoring sites (A, B+)', () => {
    const high = getSite(2);
    const bplus = getSite(3);
    assert.strictEqual(high.html_dom, null, 'Grade A site should have html_dom cleared to NULL');
    assert.strictEqual(bplus.html_dom, null, 'Grade B+ site should have html_dom cleared to NULL');
    assert.ok(cleanupResult.highScoreCleaned >= 2);
  });

  test('preserves html_dom for low-scoring sites (B-)', () => {
    const low = getSite(4);
    assert.notStrictEqual(low.html_dom, null, 'B- site should keep html_dom');
  });

  test('leaves post-enriched html_dom intact or NULL, never a placeholder', () => {
    const enriched = getSite(5);
    // The enriched site has no contacts_json (NULL), so the >= 3 usable contacts guard
    // is expected to prevent cleanup — html_dom should be retained for re-enrichment.
    // The test tolerates either outcome (original HTML or NULL) but rejects any
    // other value, including the legacy sentinel string.
    assert.notStrictEqual(
      enriched.html_dom,
      'HTML removed after scoring',
      'Enriched site with no contacts should retain html_dom for re-enrichment'
    );
    assert.ok(
      enriched.html_dom === null || enriched.html_dom === '<html>enriched</html>',
      `Enriched site html_dom should be the original HTML or NULL, got ${JSON.stringify(enriched.html_dom)}`
    );
  });

  test('preserves html_dom for rescored sites (enrich needs it)', () => {
    const rescored = getSite(8);
    assert.notStrictEqual(rescored.html_dom, null, 'Rescored site should keep html_dom for enrich');
  });

  test('marks gov domains as ignored and clears html_dom', () => {
    const gov = getSite(6);
    assert.strictEqual(gov.status, 'ignored', 'Gov site should be marked ignored');
    assert.strictEqual(gov.html_dom, null, 'Gov site should have html_dom cleared to NULL');
    assert.ok(cleanupResult.govEduCleaned >= 1);
  });

  test('totalCleaned equals sum of all categories', () => {
    const expected =
      cleanupResult.govEduCleaned +
      cleanupResult.ignoredCleaned +
      cleanupResult.highScoreCleaned +
      cleanupResult.postEnrichedCleaned;
    assert.strictEqual(cleanupResult.totalCleaned, expected);
  });

  test('savedFiles is non-negative', () => {
    assert.strictEqual(typeof cleanupResult.savedFiles, 'number', 'savedFiles should be a number');
    assert.ok(cleanupResult.savedFiles >= 0, `savedFiles should be >= 0, got ${cleanupResult.savedFiles}`);
  });
});