/ scripts / dedupe-locale-aware.js
dedupe-locale-aware.js
  1  #!/usr/bin/env node
  2  /**
  3   * Locale-Aware Domain Deduplication Script
  4   *
  5   * For domains found in multiple regions (google.com.au, google.co.nz, etc.),
  6   * this script keeps the entry where country_code matches google_domain's country
  7   * (exact match) and marks cross-border duplicates with status 'ignored'.
  8   *
  9   * Usage:
 10   *   npm run dedupe:locale           # Execute deduplication
 11   *   npm run dedupe:locale:dry-run   # Preview without changes
 12   */
 13  
 14  import { createDatabaseConnection } from '../src/utils/db.js';
 15  import { deduplicateSites } from '../src/utils/dedupe-locale-aware.js';
 16  
 17  const dbPath = process.env.DATABASE_PATH || './db/sites.db';
 18  const isDryRun = process.argv.includes('--dry-run');
 19  
 20  console.log(`\n${'='.repeat(60)}`);
 21  console.log(`Locale-Aware Domain Deduplication ${isDryRun ? '(DRY RUN)' : ''}`);
 22  console.log(`${'='.repeat(60)}\n`);
 23  
 24  const db = createDatabaseConnection(dbPath);
 25  
 26  try {
 27    // Count duplicates before deduplication
 28    const beforeStats = db
 29      .prepare(
 30        `
 31      SELECT COUNT(DISTINCT domain) as duplicate_domains
 32      FROM (
 33        SELECT domain, COUNT(*) as count
 34        FROM sites
 35        WHERE status != 'ignored'
 36        GROUP BY domain
 37        HAVING count > 1
 38      )
 39    `
 40      )
 41      .get();
 42  
 43    console.log(`Duplicate domains before: ${beforeStats.duplicate_domains}`);
 44  
 45    // Run deduplication
 46    const stats = deduplicateSites(db, isDryRun);
 47  
 48    // Display results
 49    console.log(`\nDeduplication Results:`);
 50    console.log(`  Duplicate domains processed: ${stats.duplicateDomains}`);
 51    console.log(`  Sites marked as ignored: ${stats.sitesIgnored}`);
 52    console.log(`  Cross-border duplicates: ${stats.crossBorder}`);
 53  
 54    if (isDryRun) {
 55      console.log(`\n⚠️  DRY RUN - No changes made to database`);
 56      console.log(`Run without --dry-run to apply changes`);
 57    } else {
 58      console.log(`\n✅ Deduplication complete`);
 59    }
 60  
 61    // Count duplicates after deduplication
 62    const afterStats = db
 63      .prepare(
 64        `
 65      SELECT COUNT(DISTINCT domain) as duplicate_domains
 66      FROM (
 67        SELECT domain, COUNT(*) as count
 68        FROM sites
 69        WHERE status != 'ignored'
 70        GROUP BY domain
 71        HAVING count > 1
 72      )
 73    `
 74      )
 75      .get();
 76  
 77    console.log(`\nDuplicate domains after: ${afterStats.duplicate_domains}`);
 78  
 79    // Show some examples of deduplicated sites
 80    if (!isDryRun && stats.sitesIgnored > 0) {
 81      console.log(`\nExample cross-border duplicates ignored:`);
 82      const examples = db
 83        .prepare(
 84          `
 85        SELECT domain, google_domain, country_code, error_message
 86        FROM sites
 87        WHERE status = 'ignored'
 88          AND error_message LIKE 'Cross-border duplicate%'
 89        LIMIT 5
 90      `
 91        )
 92        .all();
 93  
 94      for (const site of examples) {
 95        console.log(`  ${site.domain}`);
 96        console.log(`    Google domain: ${site.google_domain}`);
 97        console.log(`    Country code: ${site.country_code}`);
 98        console.log(`    Reason: ${site.error_message}`);
 99      }
100    }
101  } catch (err) {
102    console.error(`\n❌ Error: ${err.message}`);
103    process.exit(1);
104  } finally {
105    db.close();
106  }
107  
108  console.log(`\n${'='.repeat(60)}\n`);