/ src / utils / dedupe-locale-aware.js
dedupe-locale-aware.js
  1  /**
  2   * Locale-Aware Domain Deduplication
  3   * Deduplicates sites based on both domain AND locale matching
  4   * Keeps sites where country_code matches google_domain's country (exact locale match)
  5   * Ignores cross-border duplicates (e.g., .com.au site found on google.co.nz)
  6   *
  7   * MIGRATION REQUIRED: deduplicateSites() still uses the SQLite .prepare() API
  8   * (synchronous). src/stages/assets.js calls it with getPool() as the db argument,
  9   * which causes it to attempt SQLite methods on a pg.Pool — this will fail at runtime
 10   * once the pool no longer has a .prepare() shim. Convert to use async getAll/run
 11   * from db.js and update the call sites in assets.js to await the result.
 12   */
 13  
 14  import { parseCountryFromGoogleDomain } from './tld-detector.js';
 15  import Logger from './logger.js';
 16  
 17  const logger = new Logger('DedupeLocale');
 18  
 19  /**
 20   * Deduplicate sites with locale awareness
 21   * Priority: Exact locale match > Search volume > First found
 22   * @param {Object} db - Database connection
 23   * @param {boolean} dryRun - If true, only log what would be done
 24   * @returns {Object} - Stats about deduplication
 25   */
 26  export function deduplicateSites(db, dryRun = false) {
 27    // Find domains with multiple entries
 28    const duplicates = db
 29      .prepare(
 30        `
 31      SELECT domain, COUNT(*) as count
 32      FROM sites
 33      WHERE status NOT IN ('ignored')
 34      GROUP BY domain
 35      HAVING count > 1
 36    `
 37      )
 38      .all();
 39  
 40    if (duplicates.length === 0) {
 41      logger.info('No duplicate domains found');
 42      return {
 43        duplicateDomains: 0,
 44        sitesIgnored: 0,
 45        exactMatches: 0,
 46        crossBorder: 0,
 47      };
 48    }
 49  
 50    logger.info(`Found ${duplicates.length} domains with multiple entries`);
 51  
 52    let sitesIgnored = 0;
 53    let exactMatches = 0;
 54    let crossBorder = 0;
 55  
 56    for (const { domain } of duplicates) {
 57      // Get all sites for this domain
 58      const sites = db
 59        .prepare(
 60          `
 61        SELECT id, country_code, google_domain, keyword
 62        FROM sites
 63        WHERE domain = ? AND status NOT IN ('ignored')
 64        ORDER BY id
 65      `
 66        )
 67        .all(domain);
 68  
 69      // Calculate score for each site
 70      // Score = (exact_match ? 1000000 : 0) + search_volume
 71      const sitesWithScores = sites.map(site => {
 72        const googleCountry = parseCountryFromGoogleDomain(site.google_domain);
 73        const isExactMatch = site.country_code === googleCountry;
 74  
 75        // Get search volume from keywords table
 76        const keywordData = db
 77          .prepare(
 78            `
 79          SELECT search_volume FROM keywords WHERE keyword = ? LIMIT 1
 80        `
 81          )
 82          .get(site.keyword);
 83  
 84        const searchVolume = keywordData?.search_volume || 0;
 85  
 86        // Exact match heavily weighted (1 million points)
 87        const score = (isExactMatch ? 1 : 0) * 1000000 + searchVolume;
 88  
 89        return {
 90          ...site,
 91          googleCountry,
 92          isExactMatch,
 93          searchVolume,
 94          score,
 95        };
 96      });
 97  
 98      // Sort by score (descending) and keep the best one
 99      sitesWithScores.sort((a, b) => b.score - a.score);
100      const bestSite = sitesWithScores[0];
101      const sitesToIgnore = sitesWithScores.slice(1);
102  
103      if (dryRun) {
104        logger.info(`\nDomain: ${domain}`);
105        logger.info(`  KEEP: Site #${bestSite.id} (${bestSite.google_domain})`);
106        logger.info(`    country_code: ${bestSite.country_code}`);
107        logger.info(`    google_domain country: ${bestSite.googleCountry}`);
108        logger.info(`    exact_match: ${bestSite.isExactMatch}`);
109        logger.info(`    search_volume: ${bestSite.searchVolume}`);
110  
111        for (const site of sitesToIgnore) {
112          logger.warn(`  IGNORE: Site #${site.id} (${site.google_domain})`);
113          logger.warn(`    country_code: ${site.country_code}`);
114          logger.warn(`    google_domain country: ${site.googleCountry}`);
115          logger.warn(`    exact_match: ${site.isExactMatch}`);
116          logger.warn(`    search_volume: ${site.searchVolume}`);
117        }
118      } else {
119        // Mark others as ignored
120        for (const site of sitesToIgnore) {
121          db.prepare(
122            `
123            UPDATE sites
124            SET status = 'ignored',
125                error_message = 'Cross-border duplicate (kept ' || ? || ' google_domain entry)'
126            WHERE id = ?
127          `
128          ).run(bestSite.google_domain, site.id);
129  
130          sitesIgnored++;
131          if (site.isExactMatch && bestSite.isExactMatch) {
132            exactMatches++;
133          } else {
134            crossBorder++;
135          }
136        }
137  
138        logger.info(
139          `  ${domain}: Kept ${bestSite.google_domain} entry, ignored ${sitesToIgnore.length} duplicate(s)`
140        );
141      }
142    }
143  
144    const stats = {
145      duplicateDomains: duplicates.length,
146      sitesIgnored,
147      exactMatches,
148      crossBorder,
149    };
150  
151    if (dryRun) {
152      logger.info('\n=== DRY RUN - No changes made ===');
153    }
154  
155    logger.info(`\nDeduplication complete:`);
156    logger.info(`  Duplicate domains: ${stats.duplicateDomains}`);
157    logger.info(`  Sites ignored: ${stats.sitesIgnored}`);
158    logger.info(`  Exact match duplicates: ${stats.exactMatches}`);
159    logger.info(`  Cross-border duplicates: ${stats.crossBorder}`);
160  
161    return stats;
162  }