/ src / utils / dedupe-domains.js
dedupe-domains.js
  1  /**
  2   * Domain Deduplication Utility
  3   * Removes duplicate domains from sites table, keeping only the entry
  4   * with the keyword that has the highest search volume
  5   *
  6   * DEPRECATED: Not called by the active pipeline (pipeline-service.js, src/stages/).
  7   * Only referenced by scripts/dedupe-domains.js (a standalone CLI script) and its
  8   * test files. Still uses the SQLite .prepare() API — do not use for new code.
  9   */
 10  
 11  import Logger from './logger.js';
 12  
 13  const logger = new Logger('Dedupe');
 14  
 15  /**
 16   * Deduplicate domains in sites table
 17   * For each domain with multiple entries, keeps the one with highest search volume
 18   * and marks others as status='ignored' with error_message='Duplicate domain'
 19   *
 20   * @param {Database} db - Database connection
 21   * @param {Object} options - Options
 22   * @param {boolean} options.dryRun - If true, only report what would be deduped
 23   * @returns {Object} Deduplication stats
 24   */
 25  export function dedupeDomains(db, options = {}) {
 26    const { dryRun = false } = options;
 27  
 28    // Find all domains that appear multiple times
 29    const duplicateDomains = db
 30      .prepare(
 31        `
 32      SELECT domain, COUNT(*) as count
 33      FROM sites
 34      WHERE status NOT IN ('ignored', 'outreach_partial', 'outreach_sent')
 35      GROUP BY domain
 36      HAVING COUNT(*) > 1
 37      ORDER BY COUNT(*) DESC
 38    `
 39      )
 40      .all();
 41  
 42    if (duplicateDomains.length === 0) {
 43      logger.info('No duplicate domains found');
 44      return {
 45        duplicateDomains: 0,
 46        sitesMarkedIgnored: 0,
 47        sitesKept: 0,
 48      };
 49    }
 50  
 51    logger.info(`Found ${duplicateDomains.length} domains with multiple entries`);
 52  
 53    const stats = {
 54      duplicateDomains: duplicateDomains.length,
 55      sitesMarkedIgnored: 0,
 56      sitesKept: 0,
 57    };
 58  
 59    // For each duplicate domain, find the best entry to keep
 60    for (const { domain } of duplicateDomains) {
 61      // Get all site entries for this domain with keyword search volumes
 62      const sites = db
 63        .prepare(
 64          `
 65        SELECT
 66          s.id,
 67          s.keyword,
 68          s.status,
 69          COALESCE(k.search_volume, 0) as search_volume
 70        FROM sites s
 71        LEFT JOIN keywords k ON s.keyword = k.keyword AND s.country_code = k.country_code
 72        WHERE s.domain = ?
 73          AND s.status NOT IN ('ignored', 'outreach_sent')
 74        ORDER BY k.search_volume DESC, s.id ASC
 75      `
 76        )
 77        .all(domain);
 78  
 79      if (sites.length <= 1) {
 80        continue; // Skip if only one non-ignored entry
 81      }
 82  
 83      // Keep the first one (highest search volume)
 84      const keepSite = sites[0];
 85      const duplicateSites = sites.slice(1);
 86  
 87      if (dryRun) {
 88        logger.info(
 89          `  Would keep: ${domain} (keyword: "${keepSite.keyword}", search_volume: ${keepSite.search_volume})`
 90        );
 91        logger.info(
 92          `  Would ignore: ${duplicateSites.length} duplicate(s) for keywords: ${duplicateSites.map(s => `"${s.keyword}"`).join(', ')}`
 93        );
 94      } else {
 95        // Mark duplicates as ignored
 96        const markIgnored = db.prepare(`
 97          UPDATE sites
 98          SET status = 'ignored',
 99              error_message = 'Duplicate domain (kept highest search volume keyword)'
100          WHERE id = ?
101        `);
102  
103        for (const site of duplicateSites) {
104          markIgnored.run(site.id);
105          stats.sitesMarkedIgnored++;
106        }
107  
108        logger.info(
109          `  Kept: ${domain} (keyword: "${keepSite.keyword}", search_volume: ${keepSite.search_volume})`
110        );
111        logger.info(
112          `  Marked ${duplicateSites.length} duplicate(s) as ignored for keywords: ${duplicateSites.map(s => `"${s.keyword}"`).join(', ')}`
113        );
114      }
115  
116      stats.sitesKept++;
117    }
118  
119    if (dryRun) {
120      logger.info('\nDry run complete - no changes made');
121    } else {
122      logger.success(
123        `\nDeduplication complete: ${stats.sitesKept} domains kept, ${stats.sitesMarkedIgnored} duplicates marked as ignored`
124      );
125    }
126  
127    return stats;
128  }