/ tests / pipeline / dedupe-domains.test.js
dedupe-domains.test.js
  1  /**
  2   * Tests for domain deduplication utility
  3   */
  4  
  5  import { test, mock } from 'node:test';
  6  import assert from 'node:assert';
  7  import Database from 'better-sqlite3';
  8  import { createPgMock } from '../helpers/pg-mock.js';
  9  
 10  // ── Shared schema SQL ────────────────────────────────────────────────────────
 11  
 12  const SCHEMA_SQL = `
 13    CREATE TABLE sites (
 14      id INTEGER PRIMARY KEY,
 15      domain TEXT,
 16      keyword TEXT,
 17      status TEXT,
 18      error_message TEXT,
 19      country_code TEXT DEFAULT 'AU',
 20      rescored_at DATETIME
 21    );
 22  
 23    CREATE TABLE keywords (
 24      id INTEGER PRIMARY KEY,
 25      keyword TEXT,
 26      country_code TEXT,
 27      search_volume INTEGER
 28    );
 29  `;
 30  
 31  // ── Register db.js mock once at top level (dedupeDomains takes db as arg,
 32  //    so this mock is only needed for transitive imports) ─────────────────────
 33  const _sharedDb = new Database(':memory:');
 34  mock.module('../../src/utils/db.js', { namedExports: createPgMock(_sharedDb) });
 35  
 36  import { dedupeDomains } from '../../src/utils/dedupe-domains.js';
 37  
 38  test('dedupeDomains - no duplicates', () => {
 39    const db = new Database(':memory:');
 40    db.exec(SCHEMA_SQL);
 41  
 42    // Insert single site per domain
 43    db.prepare('INSERT INTO sites (domain, keyword, status) VALUES (?, ?, ?)').run(
 44      'example.com',
 45      'plumber',
 46      'found'
 47    );
 48    db.prepare('INSERT INTO sites (domain, keyword, status) VALUES (?, ?, ?)').run(
 49      'test.com',
 50      'electrician',
 51      'found'
 52    );
 53  
 54    const stats = dedupeDomains(db);
 55  
 56    assert.strictEqual(stats.duplicateDomains, 0);
 57    assert.strictEqual(stats.sitesMarkedIgnored, 0);
 58    assert.strictEqual(stats.sitesKept, 0);
 59  
 60    db.close();
 61  });
 62  
 63  test('dedupeDomains - keeps highest search volume', () => {
 64    const db = new Database(':memory:');
 65    db.exec(SCHEMA_SQL);
 66  
 67    // Insert keywords with search volumes
 68    db.prepare('INSERT INTO keywords (keyword, country_code, search_volume) VALUES (?, ?, ?)').run(
 69      'plumber sydney',
 70      'AU',
 71      1000
 72    );
 73    db.prepare('INSERT INTO keywords (keyword, country_code, search_volume) VALUES (?, ?, ?)').run(
 74      'emergency plumber',
 75      'AU',
 76      500
 77    );
 78    db.prepare('INSERT INTO keywords (keyword, country_code, search_volume) VALUES (?, ?, ?)').run(
 79      'local plumber',
 80      'AU',
 81      200
 82    );
 83  
 84    // Insert duplicate sites for same domain
 85    db.prepare('INSERT INTO sites (domain, keyword, status, country_code) VALUES (?, ?, ?, ?)').run(
 86      'example.com',
 87      'plumber sydney',
 88      'found',
 89      'AU'
 90    );
 91    db.prepare('INSERT INTO sites (domain, keyword, status, country_code) VALUES (?, ?, ?, ?)').run(
 92      'example.com',
 93      'emergency plumber',
 94      'found',
 95      'AU'
 96    );
 97    db.prepare('INSERT INTO sites (domain, keyword, status, country_code) VALUES (?, ?, ?, ?)').run(
 98      'example.com',
 99      'local plumber',
100      'found',
101      'AU'
102    );
103  
104    const stats = dedupeDomains(db);
105  
106    assert.strictEqual(stats.duplicateDomains, 1);
107    assert.strictEqual(stats.sitesMarkedIgnored, 2); // 2 out of 3 marked as ignored
108    assert.strictEqual(stats.sitesKept, 1);
109  
110    // Check that the highest search volume keyword was kept
111    const keptSite = db.prepare('SELECT * FROM sites WHERE status != ?').get('ignored');
112    assert.strictEqual(keptSite.keyword, 'plumber sydney');
113    assert.strictEqual(keptSite.domain, 'example.com');
114  
115    // Check that others were marked as ignored
116    const ignoredSites = db.prepare('SELECT * FROM sites WHERE status = ?').all('ignored');
117    assert.strictEqual(ignoredSites.length, 2);
118    assert.ok(ignoredSites.every(s => s.error_message.includes('Duplicate domain')));
119  
120    db.close();
121  });
122  
123  test('dedupeDomains - handles missing search volumes', () => {
124    const db = new Database(':memory:');
125    db.exec(SCHEMA_SQL);
126  
127    // Insert only one keyword (others will have NULL search_volume)
128    db.prepare('INSERT INTO keywords (keyword, country_code, search_volume) VALUES (?, ?, ?)').run(
129      'plumber sydney',
130      'AU',
131      1000
132    );
133  
134    // Insert duplicate sites, some without search volume data
135    db.prepare('INSERT INTO sites (domain, keyword, status, country_code) VALUES (?, ?, ?, ?)').run(
136      'example.com',
137      'plumber sydney',
138      'found',
139      'AU'
140    );
141    db.prepare('INSERT INTO sites (domain, keyword, status, country_code) VALUES (?, ?, ?, ?)').run(
142      'example.com',
143      'unknown keyword',
144      'found',
145      'AU'
146    );
147  
148    const stats = dedupeDomains(db);
149  
150    assert.strictEqual(stats.duplicateDomains, 1);
151    assert.strictEqual(stats.sitesMarkedIgnored, 1);
152    assert.strictEqual(stats.sitesKept, 1);
153  
154    // Check that the keyword WITH search volume was kept
155    const keptSite = db.prepare('SELECT * FROM sites WHERE status != ?').get('ignored');
156    assert.strictEqual(keptSite.keyword, 'plumber sydney');
157  
158    db.close();
159  });
160  
161  test('dedupeDomains - dry run mode', () => {
162    const db = new Database(':memory:');
163    db.exec(SCHEMA_SQL);
164  
165    // Insert keywords
166    db.prepare('INSERT INTO keywords (keyword, country_code, search_volume) VALUES (?, ?, ?)').run(
167      'plumber',
168      'AU',
169      1000
170    );
171    db.prepare('INSERT INTO keywords (keyword, country_code, search_volume) VALUES (?, ?, ?)').run(
172      'electrician',
173      'AU',
174      500
175    );
176  
177    // Insert duplicates
178    db.prepare('INSERT INTO sites (domain, keyword, status, country_code) VALUES (?, ?, ?, ?)').run(
179      'example.com',
180      'plumber',
181      'found',
182      'AU'
183    );
184    db.prepare('INSERT INTO sites (domain, keyword, status, country_code) VALUES (?, ?, ?, ?)').run(
185      'example.com',
186      'electrician',
187      'found',
188      'AU'
189    );
190  
191    const stats = dedupeDomains(db, { dryRun: true });
192  
193    assert.strictEqual(stats.duplicateDomains, 1);
194    assert.strictEqual(stats.sitesMarkedIgnored, 0); // Dry run shouldn't change anything
195    assert.strictEqual(stats.sitesKept, 1);
196  
197    // Verify no changes were made
198    const allSites = db.prepare('SELECT * FROM sites').all();
199    assert.strictEqual(allSites.length, 2);
200    assert.ok(allSites.every(s => s.status === 'found'));
201  
202    db.close();
203  });
204  
205  test('dedupeDomains - ignores already ignored sites', () => {
206    const db = new Database(':memory:');
207    db.exec(SCHEMA_SQL);
208  
209    // Insert keywords
210    db.prepare('INSERT INTO keywords (keyword, country_code, search_volume) VALUES (?, ?, ?)').run(
211      'plumber',
212      'AU',
213      1000
214    );
215  
216    // Insert sites with different statuses
217    db.prepare('INSERT INTO sites (domain, keyword, status, country_code) VALUES (?, ?, ?, ?)').run(
218      'example.com',
219      'plumber',
220      'found',
221      'AU'
222    );
223    db.prepare('INSERT INTO sites (domain, keyword, status, country_code) VALUES (?, ?, ?, ?)').run(
224      'example.com',
225      'plumber',
226      'ignored',
227      'AU'
228    ); // Already ignored
229    db.prepare('INSERT INTO sites (domain, keyword, status, country_code) VALUES (?, ?, ?, ?)').run(
230      'example.com',
231      'plumber',
232      'outreach_sent',
233      'AU'
234    ); // Already processed
235  
236    const stats = dedupeDomains(db);
237  
238    // Should not count already ignored/processed sites as duplicates
239    assert.strictEqual(stats.duplicateDomains, 0);
240    assert.strictEqual(stats.sitesMarkedIgnored, 0);
241  
242    db.close();
243  });
244  
245  test('dedupeDomains - multiple duplicate domains', () => {
246    const db = new Database(':memory:');
247    db.exec(SCHEMA_SQL);
248  
249    // Insert keywords
250    db.prepare('INSERT INTO keywords (keyword, country_code, search_volume) VALUES (?, ?, ?)').run(
251      'keyword1',
252      'AU',
253      1000
254    );
255    db.prepare('INSERT INTO keywords (keyword, country_code, search_volume) VALUES (?, ?, ?)').run(
256      'keyword2',
257      'AU',
258      500
259    );
260    db.prepare('INSERT INTO keywords (keyword, country_code, search_volume) VALUES (?, ?, ?)').run(
261      'keyword3',
262      'AU',
263      800
264    );
265  
266    // Insert duplicates for domain1
267    db.prepare('INSERT INTO sites (domain, keyword, status, country_code) VALUES (?, ?, ?, ?)').run(
268      'domain1.com',
269      'keyword1',
270      'found',
271      'AU'
272    );
273    db.prepare('INSERT INTO sites (domain, keyword, status, country_code) VALUES (?, ?, ?, ?)').run(
274      'domain1.com',
275      'keyword2',
276      'found',
277      'AU'
278    );
279  
280    // Insert duplicates for domain2
281    db.prepare('INSERT INTO sites (domain, keyword, status, country_code) VALUES (?, ?, ?, ?)').run(
282      'domain2.com',
283      'keyword2',
284      'found',
285      'AU'
286    );
287    db.prepare('INSERT INTO sites (domain, keyword, status, country_code) VALUES (?, ?, ?, ?)').run(
288      'domain2.com',
289      'keyword3',
290      'found',
291      'AU'
292    );
293  
294    const stats = dedupeDomains(db);
295  
296    assert.strictEqual(stats.duplicateDomains, 2);
297    assert.strictEqual(stats.sitesMarkedIgnored, 2); // 1 per domain
298    assert.strictEqual(stats.sitesKept, 2); // 1 per domain
299  
300    // Check domain1 kept highest volume (keyword1)
301    const domain1Kept = db
302      .prepare('SELECT * FROM sites WHERE domain = ? AND status != ?')
303      .get('domain1.com', 'ignored');
304    assert.strictEqual(domain1Kept.keyword, 'keyword1');
305  
306    // Check domain2 kept highest volume (keyword3)
307    const domain2Kept = db
308      .prepare('SELECT * FROM sites WHERE domain = ? AND status != ?')
309      .get('domain2.com', 'ignored');
310    assert.strictEqual(domain2Kept.keyword, 'keyword3');
311  
312    db.close();
313  });