/**
 * dedupe-domains.test.js
 *
 * Tests for src/utils/dedupe-domains.js
 *
 * Covers:
 * - dedupeDomains (no duplicates, with duplicates, dry-run, multiple groups,
 *   and rows whose status excludes them from duplicate detection)
 */

import { test, describe, mock } from 'node:test';
import assert from 'node:assert/strict';
import Database from 'better-sqlite3';
import { createPgMock } from '../helpers/pg-mock.js';

// dedupe-domains.js receives db as a parameter — mock db.js for any transitive imports.
const _sharedDb = new Database(':memory:');
mock.module('../../src/utils/db.js', { namedExports: createPgMock(_sharedDb) });

// Imported dynamically so the module mock above is registered before the
// module under test (and anything it imports) is evaluated.
const { dedupeDomains } = await import('../../src/utils/dedupe-domains.js');

/**
 * Create a fresh in-memory SQLite database containing the `keywords` and
 * `sites` tables used by these tests.
 *
 * @returns {import('better-sqlite3').Database} A new, empty database handle. The caller must close it.
 */
function createDb() {
  const db = new Database(':memory:');
  db.exec(`
    CREATE TABLE IF NOT EXISTS keywords (
      keyword TEXT,
      country_code TEXT,
      search_volume INTEGER DEFAULT 0,
      PRIMARY KEY (keyword, country_code)
    );

    CREATE TABLE IF NOT EXISTS sites (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      domain TEXT NOT NULL,
      landing_page_url TEXT,
      keyword TEXT,
      country_code TEXT DEFAULT 'AU',
      status TEXT DEFAULT 'enriched',
      error_message TEXT,
      score REAL,
      grade TEXT,
      rescored_at DATETIME
    );
  `);
  return db;
}

/**
 * Insert one row into `sites`. The landing page URL is derived from the
 * domain as `https://<domain>`.
 *
 * @param {import('better-sqlite3').Database} db - Database created by createDb().
 * @param {object} site
 * @param {string} site.domain - Domain of the site.
 * @param {string} [site.keyword='plumber'] - Keyword the site is associated with.
 * @param {string} [site.status='enriched'] - Value for the status column.
 * @param {string} [site.countryCode='AU'] - Value for the country_code column.
 */
function insertSite(db, { domain, keyword = 'plumber', status = 'enriched', countryCode = 'AU' }) {
  db.prepare(
    'INSERT INTO sites (domain, landing_page_url, keyword, country_code, status) VALUES (?, ?, ?, ?, ?)'
  ).run(domain, `https://${domain}`, keyword, countryCode, status);
}

/**
 * Insert one row into `keywords`. Uses INSERT OR IGNORE, so a row that
 * collides with an existing (keyword, country_code) primary key is skipped.
 *
 * @param {import('better-sqlite3').Database} db - Database created by createDb().
 * @param {object} entry
 * @param {string} entry.keyword - Keyword text.
 * @param {string} [entry.countryCode='AU'] - Value for the country_code column.
 * @param {number} [entry.searchVolume=1000] - Value for the search_volume column.
 */
function insertKeyword(db, { keyword, countryCode = 'AU', searchVolume = 1000 }) {
  db.prepare(
    'INSERT OR IGNORE INTO keywords (keyword, country_code, search_volume) VALUES (?, ?, ?)'
  ).run(keyword, countryCode, searchVolume);
}

// ─── dedupeDomains ────────────────────────────────────────────────────────────

describe('dedupeDomains', () => {
  test('returns zeroed stats when no duplicates exist', (t) => {
    const db = createDb();
    // Close via an after-hook so the handle is released even when an assertion throws.
    t.after(() => db.close());

    insertSite(db, { domain: 'alpha.com', keyword: 'plumber' });
    insertSite(db, { domain: 'beta.com', keyword: 'electrician' });

    const stats = dedupeDomains(db);

    assert.equal(stats.duplicateDomains, 0);
    assert.equal(stats.sitesMarkedIgnored, 0);
    assert.equal(stats.sitesKept, 0);
  });

  test('marks lower-volume duplicates as ignored', (t) => {
    const db = createDb();
    t.after(() => db.close());

    // 'plumber' has the higher search volume, so its row is the one expected to survive.
    insertKeyword(db, { keyword: 'plumber', searchVolume: 2000 });
    insertKeyword(db, { keyword: 'electrician', searchVolume: 500 });

    insertSite(db, { domain: 'example.com', keyword: 'plumber' });
    insertSite(db, { domain: 'example.com', keyword: 'electrician' });

    const stats = dedupeDomains(db);

    assert.equal(stats.duplicateDomains, 1);
    assert.equal(stats.sitesKept, 1);
    assert.equal(stats.sitesMarkedIgnored, 1);

    const kept = db
      .prepare("SELECT * FROM sites WHERE domain='example.com' AND status != 'ignored'")
      .get();
    assert.ok(kept, 'should have a kept site');
    assert.equal(kept.keyword, 'plumber');

    const ignored = db
      .prepare("SELECT * FROM sites WHERE domain='example.com' AND status = 'ignored'")
      .get();
    assert.ok(ignored, 'should have an ignored site');
    assert.equal(ignored.keyword, 'electrician');
  });

  test('dry run does not modify the database', (t) => {
    const db = createDb();
    t.after(() => db.close());

    insertSite(db, { domain: 'dup.com', keyword: 'plumber' });
    insertSite(db, { domain: 'dup.com', keyword: 'electrician' });

    const countBefore = db
      .prepare("SELECT COUNT(*) as n FROM sites WHERE status='enriched'")
      .get().n;

    const stats = dedupeDomains(db, { dryRun: true });

    const countAfter = db
      .prepare("SELECT COUNT(*) as n FROM sites WHERE status='enriched'")
      .get().n;

    assert.equal(countBefore, countAfter, 'dry run should not change the DB');
    // The duplicate group is still reported, but nothing is marked as ignored.
    assert.equal(stats.duplicateDomains, 1);
    assert.equal(stats.sitesMarkedIgnored, 0);
  });

  test('handles multiple duplicate groups', (t) => {
    const db = createDb();
    t.after(() => db.close());

    insertSite(db, { domain: 'site1.com', keyword: 'plumber' });
    insertSite(db, { domain: 'site1.com', keyword: 'electrician' });
    insertSite(db, { domain: 'site2.com', keyword: 'painter' });
    insertSite(db, { domain: 'site2.com', keyword: 'carpenter' });
    insertSite(db, { domain: 'unique.com', keyword: 'plumber' });

    const stats = dedupeDomains(db);

    assert.equal(stats.duplicateDomains, 2);
    assert.equal(stats.sitesKept, 2);
    assert.equal(stats.sitesMarkedIgnored, 2);
  });

  test('skips sites already marked as ignored', (t) => {
    const db = createDb();
    t.after(() => db.close());

    insertSite(db, { domain: 'ignored.com', keyword: 'plumber', status: 'ignored' });
    insertSite(db, { domain: 'ignored.com', keyword: 'electrician' });

    const stats = dedupeDomains(db);

    assert.equal(stats.duplicateDomains, 0);
  });

  test('skips sites with status outreach_sent', (t) => {
    const db = createDb();
    t.after(() => db.close());

    insertSite(db, { domain: 'sent.com', keyword: 'plumber', status: 'outreach_sent' });
    insertSite(db, { domain: 'sent.com', keyword: 'electrician' });

    const stats = dedupeDomains(db);

    assert.equal(stats.duplicateDomains, 0);
  });
});