/ tests / utils / dedupe-domains.test.js
dedupe-domains.test.js
  1  /**
  2   * Tests for src/utils/dedupe-domains.js
  3   *
  4   * Covers:
 * - dedupeDomains (no duplicates, with duplicates, dry-run, multiple groups,
 *   rows already marked ignored, rows with status outreach_sent)
  6   */
  7  
  8  import { test, describe, mock } from 'node:test';
  9  import assert from 'node:assert/strict';
 10  import Database from 'better-sqlite3';
 11  import { createPgMock } from '../helpers/pg-mock.js';
 12  
// dedupe-domains.js receives its db handle as a parameter, so the tests below
// pass their own database in. src/utils/db.js is still replaced with a mock so
// that any transitive import of it resolves to the named exports produced by
// createPgMock (built around a separate in-memory database) instead of the
// real module.
const _sharedDb = new Database(':memory:');
mock.module('../../src/utils/db.js', { namedExports: createPgMock(_sharedDb) });

// Loaded via dynamic import so that it happens after the mock.module() call
// above has been registered.
const { dedupeDomains } = await import('../../src/utils/dedupe-domains.js');
 18  
 19  function createDb() {
 20    const db = new Database(':memory:');
 21    db.exec(`
 22      CREATE TABLE IF NOT EXISTS keywords (
 23        keyword TEXT,
 24        country_code TEXT,
 25        search_volume INTEGER DEFAULT 0,
 26        PRIMARY KEY (keyword, country_code)
 27      );
 28  
 29      CREATE TABLE IF NOT EXISTS sites (
 30        id INTEGER PRIMARY KEY AUTOINCREMENT,
 31        domain TEXT NOT NULL,
 32        landing_page_url TEXT,
 33        keyword TEXT,
 34        country_code TEXT DEFAULT 'AU',
 35        status TEXT DEFAULT 'enriched',
 36        error_message TEXT,
 37        score REAL,
 38        grade TEXT,
 39        rescored_at DATETIME
 40      );
 41    `);
 42    return db;
 43  }
 44  
 45  function insertSite(db, { domain, keyword = 'plumber', status = 'enriched', countryCode = 'AU' }) {
 46    db.prepare(
 47      'INSERT INTO sites (domain, landing_page_url, keyword, country_code, status) VALUES (?, ?, ?, ?, ?)'
 48    ).run(domain, `https://${domain}`, keyword, countryCode, status);
 49  }
 50  
 51  function insertKeyword(db, { keyword, countryCode = 'AU', searchVolume = 1000 }) {
 52    db.prepare(
 53      'INSERT OR IGNORE INTO keywords (keyword, country_code, search_volume) VALUES (?, ?, ?)'
 54    ).run(keyword, countryCode, searchVolume);
 55  }
 56  
 57  // ─── dedupeDomains ────────────────────────────────────────────────────────────
 58  
 59  describe('dedupeDomains', () => {
 60    test('returns zeroed stats when no duplicates exist', () => {
 61      const db = createDb();
 62      insertSite(db, { domain: 'alpha.com', keyword: 'plumber' });
 63      insertSite(db, { domain: 'beta.com', keyword: 'electrician' });
 64  
 65      const stats = dedupeDomains(db);
 66  
 67      assert.equal(stats.duplicateDomains, 0);
 68      assert.equal(stats.sitesMarkedIgnored, 0);
 69      assert.equal(stats.sitesKept, 0);
 70      db.close();
 71    });
 72  
 73    test('marks lower-volume duplicates as ignored', () => {
 74      const db = createDb();
 75      insertKeyword(db, { keyword: 'plumber', searchVolume: 2000 });
 76      insertKeyword(db, { keyword: 'electrician', searchVolume: 500 });
 77  
 78      insertSite(db, { domain: 'example.com', keyword: 'plumber' });
 79      insertSite(db, { domain: 'example.com', keyword: 'electrician' });
 80  
 81      const stats = dedupeDomains(db);
 82  
 83      assert.equal(stats.duplicateDomains, 1);
 84      assert.equal(stats.sitesKept, 1);
 85      assert.equal(stats.sitesMarkedIgnored, 1);
 86  
 87      const kept = db
 88        .prepare("SELECT * FROM sites WHERE domain='example.com' AND status != 'ignored'")
 89        .get();
 90      assert.ok(kept, 'should have a kept site');
 91      assert.equal(kept.keyword, 'plumber');
 92  
 93      const ignored = db
 94        .prepare("SELECT * FROM sites WHERE domain='example.com' AND status = 'ignored'")
 95        .get();
 96      assert.ok(ignored, 'should have an ignored site');
 97      assert.equal(ignored.keyword, 'electrician');
 98      db.close();
 99    });
100  
101    test('dry run does not modify the database', () => {
102      const db = createDb();
103      insertSite(db, { domain: 'dup.com', keyword: 'plumber' });
104      insertSite(db, { domain: 'dup.com', keyword: 'electrician' });
105  
106      const countBefore = db
107        .prepare("SELECT COUNT(*) as n FROM sites WHERE status='enriched'")
108        .get().n;
109  
110      const stats = dedupeDomains(db, { dryRun: true });
111  
112      const countAfter = db
113        .prepare("SELECT COUNT(*) as n FROM sites WHERE status='enriched'")
114        .get().n;
115  
116      assert.equal(countBefore, countAfter, 'dry run should not change the DB');
117      assert.equal(stats.duplicateDomains, 1);
118      assert.equal(stats.sitesMarkedIgnored, 0);
119      db.close();
120    });
121  
122    test('handles multiple duplicate groups', () => {
123      const db = createDb();
124      insertSite(db, { domain: 'site1.com', keyword: 'plumber' });
125      insertSite(db, { domain: 'site1.com', keyword: 'electrician' });
126      insertSite(db, { domain: 'site2.com', keyword: 'painter' });
127      insertSite(db, { domain: 'site2.com', keyword: 'carpenter' });
128      insertSite(db, { domain: 'unique.com', keyword: 'plumber' });
129  
130      const stats = dedupeDomains(db);
131  
132      assert.equal(stats.duplicateDomains, 2);
133      assert.equal(stats.sitesKept, 2);
134      assert.equal(stats.sitesMarkedIgnored, 2);
135      db.close();
136    });
137  
138    test('skips sites already marked as ignore', () => {
139      const db = createDb();
140      insertSite(db, { domain: 'ignored.com', keyword: 'plumber', status: 'ignored' });
141      insertSite(db, { domain: 'ignored.com', keyword: 'electrician' });
142  
143      const stats = dedupeDomains(db);
144  
145      assert.equal(stats.duplicateDomains, 0);
146      db.close();
147    });
148  
149    test('skips sites with status outreach_sent', () => {
150      const db = createDb();
151      insertSite(db, { domain: 'sent.com', keyword: 'plumber', status: 'outreach_sent' });
152      insertSite(db, { domain: 'sent.com', keyword: 'electrician' });
153  
154      const stats = dedupeDomains(db);
155  
156      assert.equal(stats.duplicateDomains, 0);
157      db.close();
158    });
159  });