dedupe-locale-aware.js
#!/usr/bin/env node
/**
 * Locale-Aware Domain Deduplication Script
 *
 * For domains found in multiple regions (google.com.au, google.co.nz, etc.),
 * this script keeps the entry where country_code matches google_domain's country
 * (exact match) and marks cross-border duplicates as 'ignored' (the status value
 * this script filters on when counting and listing results).
 *
 * The actual deduplication is delegated to `deduplicateSites`; this file only
 * reports duplicate counts before and after the run and prints a few examples.
 *
 * Usage:
 *   npm run dedupe:locale          # Execute deduplication
 *   npm run dedupe:locale:dry-run  # Preview without changes
 *
 * Environment:
 *   DATABASE_PATH  Path to the sites database (default: ./db/sites.db)
 *
 * Exit status: 1 if any step inside the reporting/deduplication run throws.
 */

import { createDatabaseConnection } from '../src/utils/db.js';
import { deduplicateSites } from '../src/utils/dedupe-locale-aware.js';

const dbPath = process.env.DATABASE_PATH || './db/sites.db';
const isDryRun = process.argv.includes('--dry-run');

// Horizontal rule used to frame the script's console output.
const separator = '='.repeat(60);

/**
 * Counts the domains that still have more than one non-ignored row in `sites`.
 *
 * @param {object} db - Handle returned by `createDatabaseConnection`; only
 *   `db.prepare(sql).get()` is used here.
 * @returns {number} Value of the `duplicate_domains` column from the count query.
 */
function countDuplicateDomains(db) {
  const row = db
    .prepare(
      `
      SELECT COUNT(DISTINCT domain) as duplicate_domains
      FROM (
        SELECT domain, COUNT(*) as count
        FROM sites
        WHERE status != 'ignored'
        GROUP BY domain
        HAVING count > 1
      )
    `
    )
    .get();

  return row.duplicate_domains;
}

console.log(`\n${separator}`);
console.log(`Locale-Aware Domain Deduplication ${isDryRun ? '(DRY RUN)' : ''}`);
console.log(`${separator}\n`);

const db = createDatabaseConnection(dbPath);

// Tracks whether the run failed so the closing separator is only printed on success.
let failed = false;

try {
  // Count duplicates before deduplication
  console.log(`Duplicate domains before: ${countDuplicateDomains(db)}`);

  // Run deduplication (the second argument requests a dry run)
  const stats = deduplicateSites(db, isDryRun);

  // Display results
  console.log(`\nDeduplication Results:`);
  console.log(` Duplicate domains processed: ${stats.duplicateDomains}`);
  console.log(` Sites marked as ignored: ${stats.sitesIgnored}`);
  console.log(` Cross-border duplicates: ${stats.crossBorder}`);

  if (isDryRun) {
    console.log(`\n⚠️ DRY RUN - No changes made to database`);
    console.log(`Run without --dry-run to apply changes`);
  } else {
    console.log(`\n✅ Deduplication complete`);
  }

  // Count duplicates after deduplication
  console.log(`\nDuplicate domains after: ${countDuplicateDomains(db)}`);

  // Show some examples of deduplicated sites
  if (!isDryRun && stats.sitesIgnored > 0) {
    console.log(`\nExample cross-border duplicates ignored:`);
    const examples = db
      .prepare(
        `
        SELECT domain, google_domain, country_code, error_message
        FROM sites
        WHERE status = 'ignored'
          AND error_message LIKE 'Cross-border duplicate%'
        LIMIT 5
      `
      )
      .all();

    for (const site of examples) {
      console.log(` ${site.domain}`);
      console.log(` Google domain: ${site.google_domain}`);
      console.log(` Country code: ${site.country_code}`);
      console.log(` Reason: ${site.error_message}`);
    }
  }
} catch (err) {
  console.error(`\n❌ Error: ${err.message}`);
  // Set the exit code instead of calling process.exit(1): process.exit() ends the
  // process immediately, which would skip the `finally` block and leave the
  // database connection unclosed.
  failed = true;
  process.exitCode = 1;
} finally {
  db.close();
}

if (!failed) {
  console.log(`\n${separator}\n`);
}