dedupe-locale-aware.test.js
1 import { test, mock } from 'node:test'; 2 import assert from 'node:assert/strict'; 3 import Database from 'better-sqlite3'; 4 import { createPgMock } from '../helpers/pg-mock.js'; 5 6 // Register db.js mock once at top level before any import of the module under test 7 const _sharedDb = new Database(':memory:'); 8 mock.module('../../src/utils/db.js', { namedExports: createPgMock(_sharedDb) }); 9 10 import { deduplicateSites } from '../../src/utils/dedupe-locale-aware.js'; 11 12 // ── Schema helper ──────────────────────────────────────────────────────────── 13 14 function createTestDb() { 15 const db = new Database(':memory:'); 16 17 db.exec(` 18 CREATE TABLE sites ( 19 id INTEGER PRIMARY KEY AUTOINCREMENT, 20 domain TEXT, 21 keyword TEXT, 22 country_code TEXT, 23 google_domain TEXT, 24 status TEXT DEFAULT 'found', 25 error_message TEXT, 26 rescored_at DATETIME 27 ); 28 29 CREATE TABLE keywords ( 30 keyword TEXT PRIMARY KEY, 31 search_volume INTEGER 32 ); 33 `); 34 35 return db; 36 } 37 38 test('deduplicateSites - keeps exact match over higher search volume', () => { 39 const db = createTestDb(); 40 41 // Insert test data: .com.au site found on both google.com.au and google.co.nz 42 db.prepare( 43 'INSERT INTO sites (domain, keyword, country_code, google_domain) VALUES (?, ?, ?, ?)' 44 ).run('example.com.au', 'plumber sydney', 'AU', 'google.com.au'); 45 46 db.prepare( 47 'INSERT INTO sites (domain, keyword, country_code, google_domain) VALUES (?, ?, ?, ?)' 48 ).run('example.com.au', 'plumber auckland', 'AU', 'google.co.nz'); 49 50 // Set search volumes (NZ keyword has higher volume) 51 db.prepare('INSERT INTO keywords (keyword, search_volume) VALUES (?, ?)').run( 52 'plumber sydney', 53 100 54 ); 55 db.prepare('INSERT INTO keywords (keyword, search_volume) VALUES (?, ?)').run( 56 'plumber auckland', 57 500 58 ); 59 60 const stats = deduplicateSites(db); 61 62 // Should keep google.com.au entry (exact match) despite lower search volume 63 const kept = db.prepare("SELECT * FROM sites WHERE status != 'ignored'").get(); 64 const ignored = db.prepare("SELECT * FROM sites WHERE status = 'ignored'").get(); 65 66 assert.strictEqual(kept.google_domain, 'google.com.au'); 67 assert.strictEqual(ignored.google_domain, 'google.co.nz'); 68 assert.strictEqual(stats.sitesIgnored, 1); 69 assert.strictEqual(stats.crossBorder, 1); 70 }); 71 72 test('deduplicateSites - handles .com sites with no TLD detection', () => { 73 const db = createTestDb(); 74 75 // .com site found on multiple google domains - both exact matches, so keeps higher search volume 76 db.prepare( 77 'INSERT INTO sites (domain, keyword, country_code, google_domain) VALUES (?, ?, ?, ?)' 78 ).run('example.com', 'plumber sydney', 'AU', 'google.com.au'); 79 80 db.prepare( 81 'INSERT INTO sites (domain, keyword, country_code, google_domain) VALUES (?, ?, ?, ?)' 82 ).run('example.com', 'plumber london', 'UK', 'google.co.uk'); 83 84 db.prepare('INSERT INTO keywords (keyword, search_volume) VALUES (?, ?)').run( 85 'plumber sydney', 86 200 87 ); 88 db.prepare('INSERT INTO keywords (keyword, search_volume) VALUES (?, ?)').run( 89 'plumber london', 90 300 91 ); 92 93 const stats = deduplicateSites(db); 94 95 // Both are exact matches (AU-AU and UK-UK), so keeps higher search volume (UK=300 > AU=200) 96 const kept = db.prepare("SELECT * FROM sites WHERE status != 'ignored'").get(); 97 assert.strictEqual(kept.google_domain, 'google.co.uk'); 98 assert.strictEqual(stats.sitesIgnored, 1); 99 }); 100 101 test('deduplicateSites - no duplicates returns zero stats', () => { 102 const db = createTestDb(); 103 104 db.prepare( 105 'INSERT INTO sites (domain, keyword, country_code, google_domain) VALUES (?, ?, ?, ?)' 106 ).run('example1.com.au', 'plumber sydney', 'AU', 'google.com.au'); 107 108 db.prepare( 109 'INSERT INTO sites (domain, keyword, country_code, google_domain) VALUES (?, ?, ?, ?)' 110 ).run('example2.com.au', 'plumber melbourne', 'AU', 'google.com.au'); 111 112 const stats = deduplicateSites(db); 113 114 assert.strictEqual(stats.duplicateDomains, 0); 115 assert.strictEqual(stats.sitesIgnored, 0); 116 }); 117 118 test('deduplicateSites - dry run does not modify database', () => { 119 const db = createTestDb(); 120 121 db.prepare( 122 'INSERT INTO sites (domain, keyword, country_code, google_domain) VALUES (?, ?, ?, ?)' 123 ).run('example.com.au', 'plumber sydney', 'AU', 'google.com.au'); 124 125 db.prepare( 126 'INSERT INTO sites (domain, keyword, country_code, google_domain) VALUES (?, ?, ?, ?)' 127 ).run('example.com.au', 'plumber auckland', 'AU', 'google.co.nz'); 128 129 const statsBefore = db 130 .prepare("SELECT COUNT(*) as count FROM sites WHERE status = 'ignored'") 131 .get(); 132 deduplicateSites(db, true); // Dry run 133 const statsAfter = db 134 .prepare("SELECT COUNT(*) as count FROM sites WHERE status = 'ignored'") 135 .get(); 136 137 assert.strictEqual(statsBefore.count, statsAfter.count); 138 }); 139 140 test('deduplicateSites - prefers search volume for same locale', () => { 141 const db = createTestDb(); 142 143 // Two AU sites found on google.com.au (both exact matches) 144 db.prepare( 145 'INSERT INTO sites (domain, keyword, country_code, google_domain) VALUES (?, ?, ?, ?)' 146 ).run('example.com.au', 'plumber sydney', 'AU', 'google.com.au'); 147 148 db.prepare( 149 'INSERT INTO sites (domain, keyword, country_code, google_domain) VALUES (?, ?, ?, ?)' 150 ).run('example.com.au', 'electrician sydney', 'AU', 'google.com.au'); 151 152 db.prepare('INSERT INTO keywords (keyword, search_volume) VALUES (?, ?)').run( 153 'plumber sydney', 154 500 155 ); 156 db.prepare('INSERT INTO keywords (keyword, search_volume) VALUES (?, ?)').run( 157 'electrician sydney', 158 200 159 ); 160 161 deduplicateSites(db); 162 163 // Should keep the one with higher search volume 164 const kept = db.prepare("SELECT * FROM sites WHERE status != 'ignored'").get(); 165 assert.strictEqual(kept.keyword, 'plumber sydney'); 166 });