dedupe-domains.test.js
1 /** 2 * Tests for domain deduplication utility 3 */ 4 5 import { test, mock } from 'node:test'; 6 import assert from 'node:assert'; 7 import Database from 'better-sqlite3'; 8 import { createPgMock } from '../helpers/pg-mock.js'; 9 10 // ── Shared schema SQL ──────────────────────────────────────────────────────── 11 12 const SCHEMA_SQL = ` 13 CREATE TABLE sites ( 14 id INTEGER PRIMARY KEY, 15 domain TEXT, 16 keyword TEXT, 17 status TEXT, 18 error_message TEXT, 19 country_code TEXT DEFAULT 'AU', 20 rescored_at DATETIME 21 ); 22 23 CREATE TABLE keywords ( 24 id INTEGER PRIMARY KEY, 25 keyword TEXT, 26 country_code TEXT, 27 search_volume INTEGER 28 ); 29 `; 30 31 // ── Register db.js mock once at top level (dedupeDomains takes db as arg, 32 // so this mock is only needed for transitive imports) ───────────────────── 33 const _sharedDb = new Database(':memory:'); 34 mock.module('../../src/utils/db.js', { namedExports: createPgMock(_sharedDb) }); 35 36 import { dedupeDomains } from '../../src/utils/dedupe-domains.js'; 37 38 test('dedupeDomains - no duplicates', () => { 39 const db = new Database(':memory:'); 40 db.exec(SCHEMA_SQL); 41 42 // Insert single site per domain 43 db.prepare('INSERT INTO sites (domain, keyword, status) VALUES (?, ?, ?)').run( 44 'example.com', 45 'plumber', 46 'found' 47 ); 48 db.prepare('INSERT INTO sites (domain, keyword, status) VALUES (?, ?, ?)').run( 49 'test.com', 50 'electrician', 51 'found' 52 ); 53 54 const stats = dedupeDomains(db); 55 56 assert.strictEqual(stats.duplicateDomains, 0); 57 assert.strictEqual(stats.sitesMarkedIgnored, 0); 58 assert.strictEqual(stats.sitesKept, 0); 59 60 db.close(); 61 }); 62 63 test('dedupeDomains - keeps highest search volume', () => { 64 const db = new Database(':memory:'); 65 db.exec(SCHEMA_SQL); 66 67 // Insert keywords with search volumes 68 db.prepare('INSERT INTO keywords (keyword, country_code, search_volume) VALUES (?, ?, ?)').run( 69 'plumber sydney', 70 'AU', 71 1000 72 ); 73 db.prepare('INSERT INTO keywords (keyword, country_code, search_volume) VALUES (?, ?, ?)').run( 74 'emergency plumber', 75 'AU', 76 500 77 ); 78 db.prepare('INSERT INTO keywords (keyword, country_code, search_volume) VALUES (?, ?, ?)').run( 79 'local plumber', 80 'AU', 81 200 82 ); 83 84 // Insert duplicate sites for same domain 85 db.prepare('INSERT INTO sites (domain, keyword, status, country_code) VALUES (?, ?, ?, ?)').run( 86 'example.com', 87 'plumber sydney', 88 'found', 89 'AU' 90 ); 91 db.prepare('INSERT INTO sites (domain, keyword, status, country_code) VALUES (?, ?, ?, ?)').run( 92 'example.com', 93 'emergency plumber', 94 'found', 95 'AU' 96 ); 97 db.prepare('INSERT INTO sites (domain, keyword, status, country_code) VALUES (?, ?, ?, ?)').run( 98 'example.com', 99 'local plumber', 100 'found', 101 'AU' 102 ); 103 104 const stats = dedupeDomains(db); 105 106 assert.strictEqual(stats.duplicateDomains, 1); 107 assert.strictEqual(stats.sitesMarkedIgnored, 2); // 2 out of 3 marked as ignored 108 assert.strictEqual(stats.sitesKept, 1); 109 110 // Check that the highest search volume keyword was kept 111 const keptSite = db.prepare('SELECT * FROM sites WHERE status != ?').get('ignored'); 112 assert.strictEqual(keptSite.keyword, 'plumber sydney'); 113 assert.strictEqual(keptSite.domain, 'example.com'); 114 115 // Check that others were marked as ignored 116 const ignoredSites = db.prepare('SELECT * FROM sites WHERE status = ?').all('ignored'); 117 assert.strictEqual(ignoredSites.length, 2); 118 assert.ok(ignoredSites.every(s => s.error_message.includes('Duplicate domain'))); 119 120 db.close(); 121 }); 122 123 test('dedupeDomains - handles missing search volumes', () => { 124 const db = new Database(':memory:'); 125 db.exec(SCHEMA_SQL); 126 127 // Insert only one keyword (others will have NULL search_volume) 128 db.prepare('INSERT INTO keywords (keyword, country_code, search_volume) VALUES (?, ?, ?)').run( 129 'plumber sydney', 130 'AU', 131 1000 132 ); 133 134 // Insert duplicate sites, some without search volume data 135 db.prepare('INSERT INTO sites (domain, keyword, status, country_code) VALUES (?, ?, ?, ?)').run( 136 'example.com', 137 'plumber sydney', 138 'found', 139 'AU' 140 ); 141 db.prepare('INSERT INTO sites (domain, keyword, status, country_code) VALUES (?, ?, ?, ?)').run( 142 'example.com', 143 'unknown keyword', 144 'found', 145 'AU' 146 ); 147 148 const stats = dedupeDomains(db); 149 150 assert.strictEqual(stats.duplicateDomains, 1); 151 assert.strictEqual(stats.sitesMarkedIgnored, 1); 152 assert.strictEqual(stats.sitesKept, 1); 153 154 // Check that the keyword WITH search volume was kept 155 const keptSite = db.prepare('SELECT * FROM sites WHERE status != ?').get('ignored'); 156 assert.strictEqual(keptSite.keyword, 'plumber sydney'); 157 158 db.close(); 159 }); 160 161 test('dedupeDomains - dry run mode', () => { 162 const db = new Database(':memory:'); 163 db.exec(SCHEMA_SQL); 164 165 // Insert keywords 166 db.prepare('INSERT INTO keywords (keyword, country_code, search_volume) VALUES (?, ?, ?)').run( 167 'plumber', 168 'AU', 169 1000 170 ); 171 db.prepare('INSERT INTO keywords (keyword, country_code, search_volume) VALUES (?, ?, ?)').run( 172 'electrician', 173 'AU', 174 500 175 ); 176 177 // Insert duplicates 178 db.prepare('INSERT INTO sites (domain, keyword, status, country_code) VALUES (?, ?, ?, ?)').run( 179 'example.com', 180 'plumber', 181 'found', 182 'AU' 183 ); 184 db.prepare('INSERT INTO sites (domain, keyword, status, country_code) VALUES (?, ?, ?, ?)').run( 185 'example.com', 186 'electrician', 187 'found', 188 'AU' 189 ); 190 191 const stats = dedupeDomains(db, { dryRun: true }); 192 193 assert.strictEqual(stats.duplicateDomains, 1); 194 assert.strictEqual(stats.sitesMarkedIgnored, 0); // Dry run shouldn't change anything 195 assert.strictEqual(stats.sitesKept, 1); 196 197 // Verify no changes were made 198 const allSites = db.prepare('SELECT * FROM sites').all(); 199 assert.strictEqual(allSites.length, 2); 200 assert.ok(allSites.every(s => s.status === 'found')); 201 202 db.close(); 203 }); 204 205 test('dedupeDomains - ignores already ignored sites', () => { 206 const db = new Database(':memory:'); 207 db.exec(SCHEMA_SQL); 208 209 // Insert keywords 210 db.prepare('INSERT INTO keywords (keyword, country_code, search_volume) VALUES (?, ?, ?)').run( 211 'plumber', 212 'AU', 213 1000 214 ); 215 216 // Insert sites with different statuses 217 db.prepare('INSERT INTO sites (domain, keyword, status, country_code) VALUES (?, ?, ?, ?)').run( 218 'example.com', 219 'plumber', 220 'found', 221 'AU' 222 ); 223 db.prepare('INSERT INTO sites (domain, keyword, status, country_code) VALUES (?, ?, ?, ?)').run( 224 'example.com', 225 'plumber', 226 'ignored', 227 'AU' 228 ); // Already ignored 229 db.prepare('INSERT INTO sites (domain, keyword, status, country_code) VALUES (?, ?, ?, ?)').run( 230 'example.com', 231 'plumber', 232 'outreach_sent', 233 'AU' 234 ); // Already processed 235 236 const stats = dedupeDomains(db); 237 238 // Should not count already ignored/processed sites as duplicates 239 assert.strictEqual(stats.duplicateDomains, 0); 240 assert.strictEqual(stats.sitesMarkedIgnored, 0); 241 242 db.close(); 243 }); 244 245 test('dedupeDomains - multiple duplicate domains', () => { 246 const db = new Database(':memory:'); 247 db.exec(SCHEMA_SQL); 248 249 // Insert keywords 250 db.prepare('INSERT INTO keywords (keyword, country_code, search_volume) VALUES (?, ?, ?)').run( 251 'keyword1', 252 'AU', 253 1000 254 ); 255 db.prepare('INSERT INTO keywords (keyword, country_code, search_volume) VALUES (?, ?, ?)').run( 256 'keyword2', 257 'AU', 258 500 259 ); 260 db.prepare('INSERT INTO keywords (keyword, country_code, search_volume) VALUES (?, ?, ?)').run( 261 'keyword3', 262 'AU', 263 800 264 ); 265 266 // Insert duplicates for domain1 267 db.prepare('INSERT INTO sites (domain, keyword, status, country_code) VALUES (?, ?, ?, ?)').run( 268 'domain1.com', 269 'keyword1', 270 'found', 271 'AU' 272 ); 273 db.prepare('INSERT INTO sites (domain, keyword, status, country_code) VALUES (?, ?, ?, ?)').run( 274 'domain1.com', 275 'keyword2', 276 'found', 277 'AU' 278 ); 279 280 // Insert duplicates for domain2 281 db.prepare('INSERT INTO sites (domain, keyword, status, country_code) VALUES (?, ?, ?, ?)').run( 282 'domain2.com', 283 'keyword2', 284 'found', 285 'AU' 286 ); 287 db.prepare('INSERT INTO sites (domain, keyword, status, country_code) VALUES (?, ?, ?, ?)').run( 288 'domain2.com', 289 'keyword3', 290 'found', 291 'AU' 292 ); 293 294 const stats = dedupeDomains(db); 295 296 assert.strictEqual(stats.duplicateDomains, 2); 297 assert.strictEqual(stats.sitesMarkedIgnored, 2); // 1 per domain 298 assert.strictEqual(stats.sitesKept, 2); // 1 per domain 299 300 // Check domain1 kept highest volume (keyword1) 301 const domain1Kept = db 302 .prepare('SELECT * FROM sites WHERE domain = ? AND status != ?') 303 .get('domain1.com', 'ignored'); 304 assert.strictEqual(domain1Kept.keyword, 'keyword1'); 305 306 // Check domain2 kept highest volume (keyword3) 307 const domain2Kept = db 308 .prepare('SELECT * FROM sites WHERE domain = ? AND status != ?') 309 .get('domain2.com', 'ignored'); 310 assert.strictEqual(domain2Kept.keyword, 'keyword3'); 311 312 db.close(); 313 });