/ tests / pipeline / dedupe-locale-aware.test.js
dedupe-locale-aware.test.js
  1  import { test, mock } from 'node:test';
  2  import assert from 'node:assert/strict';
  3  import Database from 'better-sqlite3';
  4  import { createPgMock } from '../helpers/pg-mock.js';
  5  
  6  // Register db.js mock once at top level before any import of the module under test
  7  const _sharedDb = new Database(':memory:');
  8  mock.module('../../src/utils/db.js', { namedExports: createPgMock(_sharedDb) });
  9  
 10  import { deduplicateSites } from '../../src/utils/dedupe-locale-aware.js';
 11  
 12  // ── Schema helper ────────────────────────────────────────────────────────────
 13  
 14  function createTestDb() {
 15    const db = new Database(':memory:');
 16  
 17    db.exec(`
 18      CREATE TABLE sites (
 19        id INTEGER PRIMARY KEY AUTOINCREMENT,
 20        domain TEXT,
 21        keyword TEXT,
 22        country_code TEXT,
 23        google_domain TEXT,
 24        status TEXT DEFAULT 'found',
 25        error_message TEXT,
 26        rescored_at DATETIME
 27      );
 28  
 29      CREATE TABLE keywords (
 30        keyword TEXT PRIMARY KEY,
 31        search_volume INTEGER
 32      );
 33    `);
 34  
 35    return db;
 36  }
 37  
 38  test('deduplicateSites - keeps exact match over higher search volume', () => {
 39    const db = createTestDb();
 40  
 41    // Insert test data: .com.au site found on both google.com.au and google.co.nz
 42    db.prepare(
 43      'INSERT INTO sites (domain, keyword, country_code, google_domain) VALUES (?, ?, ?, ?)'
 44    ).run('example.com.au', 'plumber sydney', 'AU', 'google.com.au');
 45  
 46    db.prepare(
 47      'INSERT INTO sites (domain, keyword, country_code, google_domain) VALUES (?, ?, ?, ?)'
 48    ).run('example.com.au', 'plumber auckland', 'AU', 'google.co.nz');
 49  
 50    // Set search volumes (NZ keyword has higher volume)
 51    db.prepare('INSERT INTO keywords (keyword, search_volume) VALUES (?, ?)').run(
 52      'plumber sydney',
 53      100
 54    );
 55    db.prepare('INSERT INTO keywords (keyword, search_volume) VALUES (?, ?)').run(
 56      'plumber auckland',
 57      500
 58    );
 59  
 60    const stats = deduplicateSites(db);
 61  
 62    // Should keep google.com.au entry (exact match) despite lower search volume
 63    const kept = db.prepare("SELECT * FROM sites WHERE status != 'ignored'").get();
 64    const ignored = db.prepare("SELECT * FROM sites WHERE status = 'ignored'").get();
 65  
 66    assert.strictEqual(kept.google_domain, 'google.com.au');
 67    assert.strictEqual(ignored.google_domain, 'google.co.nz');
 68    assert.strictEqual(stats.sitesIgnored, 1);
 69    assert.strictEqual(stats.crossBorder, 1);
 70  });
 71  
 72  test('deduplicateSites - handles .com sites with no TLD detection', () => {
 73    const db = createTestDb();
 74  
 75    // .com site found on multiple google domains - both exact matches, so keeps higher search volume
 76    db.prepare(
 77      'INSERT INTO sites (domain, keyword, country_code, google_domain) VALUES (?, ?, ?, ?)'
 78    ).run('example.com', 'plumber sydney', 'AU', 'google.com.au');
 79  
 80    db.prepare(
 81      'INSERT INTO sites (domain, keyword, country_code, google_domain) VALUES (?, ?, ?, ?)'
 82    ).run('example.com', 'plumber london', 'UK', 'google.co.uk');
 83  
 84    db.prepare('INSERT INTO keywords (keyword, search_volume) VALUES (?, ?)').run(
 85      'plumber sydney',
 86      200
 87    );
 88    db.prepare('INSERT INTO keywords (keyword, search_volume) VALUES (?, ?)').run(
 89      'plumber london',
 90      300
 91    );
 92  
 93    const stats = deduplicateSites(db);
 94  
 95    // Both are exact matches (AU-AU and UK-UK), so keeps higher search volume (UK=300 > AU=200)
 96    const kept = db.prepare("SELECT * FROM sites WHERE status != 'ignored'").get();
 97    assert.strictEqual(kept.google_domain, 'google.co.uk');
 98    assert.strictEqual(stats.sitesIgnored, 1);
 99  });
100  
101  test('deduplicateSites - no duplicates returns zero stats', () => {
102    const db = createTestDb();
103  
104    db.prepare(
105      'INSERT INTO sites (domain, keyword, country_code, google_domain) VALUES (?, ?, ?, ?)'
106    ).run('example1.com.au', 'plumber sydney', 'AU', 'google.com.au');
107  
108    db.prepare(
109      'INSERT INTO sites (domain, keyword, country_code, google_domain) VALUES (?, ?, ?, ?)'
110    ).run('example2.com.au', 'plumber melbourne', 'AU', 'google.com.au');
111  
112    const stats = deduplicateSites(db);
113  
114    assert.strictEqual(stats.duplicateDomains, 0);
115    assert.strictEqual(stats.sitesIgnored, 0);
116  });
117  
118  test('deduplicateSites - dry run does not modify database', () => {
119    const db = createTestDb();
120  
121    db.prepare(
122      'INSERT INTO sites (domain, keyword, country_code, google_domain) VALUES (?, ?, ?, ?)'
123    ).run('example.com.au', 'plumber sydney', 'AU', 'google.com.au');
124  
125    db.prepare(
126      'INSERT INTO sites (domain, keyword, country_code, google_domain) VALUES (?, ?, ?, ?)'
127    ).run('example.com.au', 'plumber auckland', 'AU', 'google.co.nz');
128  
129    const statsBefore = db
130      .prepare("SELECT COUNT(*) as count FROM sites WHERE status = 'ignored'")
131      .get();
132    deduplicateSites(db, true); // Dry run
133    const statsAfter = db
134      .prepare("SELECT COUNT(*) as count FROM sites WHERE status = 'ignored'")
135      .get();
136  
137    assert.strictEqual(statsBefore.count, statsAfter.count);
138  });
139  
140  test('deduplicateSites - prefers search volume for same locale', () => {
141    const db = createTestDb();
142  
143    // Two AU sites found on google.com.au (both exact matches)
144    db.prepare(
145      'INSERT INTO sites (domain, keyword, country_code, google_domain) VALUES (?, ?, ?, ?)'
146    ).run('example.com.au', 'plumber sydney', 'AU', 'google.com.au');
147  
148    db.prepare(
149      'INSERT INTO sites (domain, keyword, country_code, google_domain) VALUES (?, ?, ?, ?)'
150    ).run('example.com.au', 'electrician sydney', 'AU', 'google.com.au');
151  
152    db.prepare('INSERT INTO keywords (keyword, search_volume) VALUES (?, ?)').run(
153      'plumber sydney',
154      500
155    );
156    db.prepare('INSERT INTO keywords (keyword, search_volume) VALUES (?, ?)').run(
157      'electrician sydney',
158      200
159    );
160  
161    deduplicateSites(db);
162  
163    // Should keep the one with higher search volume
164    const kept = db.prepare("SELECT * FROM sites WHERE status != 'ignored'").get();
165    assert.strictEqual(kept.keyword, 'plumber sydney');
166  });