dedupe-locale-aware.js
1 /** 2 * Locale-Aware Domain Deduplication 3 * Deduplicates sites based on both domain AND locale matching 4 * Keeps sites where country_code matches google_domain's country (exact locale match) 5 * Ignores cross-border duplicates (e.g., .com.au site found on google.co.nz) 6 * 7 * MIGRATION REQUIRED: deduplicateSites() still uses the SQLite .prepare() API 8 * (synchronous). src/stages/assets.js calls it with getPool() as the db argument, 9 * which causes it to attempt SQLite methods on a pg.Pool — this will fail at runtime 10 * once the pool no longer has a .prepare() shim. Convert to use async getAll/run 11 * from db.js and update the call sites in assets.js to await the result. 12 */ 13 14 import { parseCountryFromGoogleDomain } from './tld-detector.js'; 15 import Logger from './logger.js'; 16 17 const logger = new Logger('DedupeLocale'); 18 19 /** 20 * Deduplicate sites with locale awareness 21 * Priority: Exact locale match > Search volume > First found 22 * @param {Object} db - Database connection 23 * @param {boolean} dryRun - If true, only log what would be done 24 * @returns {Object} - Stats about deduplication 25 */ 26 export function deduplicateSites(db, dryRun = false) { 27 // Find domains with multiple entries 28 const duplicates = db 29 .prepare( 30 ` 31 SELECT domain, COUNT(*) as count 32 FROM sites 33 WHERE status NOT IN ('ignored') 34 GROUP BY domain 35 HAVING count > 1 36 ` 37 ) 38 .all(); 39 40 if (duplicates.length === 0) { 41 logger.info('No duplicate domains found'); 42 return { 43 duplicateDomains: 0, 44 sitesIgnored: 0, 45 exactMatches: 0, 46 crossBorder: 0, 47 }; 48 } 49 50 logger.info(`Found ${duplicates.length} domains with multiple entries`); 51 52 let sitesIgnored = 0; 53 let exactMatches = 0; 54 let crossBorder = 0; 55 56 for (const { domain } of duplicates) { 57 // Get all sites for this domain 58 const sites = db 59 .prepare( 60 ` 61 SELECT id, country_code, google_domain, keyword 62 FROM sites 63 WHERE domain = ? AND status NOT IN ('ignored') 64 ORDER BY id 65 ` 66 ) 67 .all(domain); 68 69 // Calculate score for each site 70 // Score = (exact_match ? 1000000 : 0) + search_volume 71 const sitesWithScores = sites.map(site => { 72 const googleCountry = parseCountryFromGoogleDomain(site.google_domain); 73 const isExactMatch = site.country_code === googleCountry; 74 75 // Get search volume from keywords table 76 const keywordData = db 77 .prepare( 78 ` 79 SELECT search_volume FROM keywords WHERE keyword = ? LIMIT 1 80 ` 81 ) 82 .get(site.keyword); 83 84 const searchVolume = keywordData?.search_volume || 0; 85 86 // Exact match heavily weighted (1 million points) 87 const score = (isExactMatch ? 1 : 0) * 1000000 + searchVolume; 88 89 return { 90 ...site, 91 googleCountry, 92 isExactMatch, 93 searchVolume, 94 score, 95 }; 96 }); 97 98 // Sort by score (descending) and keep the best one 99 sitesWithScores.sort((a, b) => b.score - a.score); 100 const bestSite = sitesWithScores[0]; 101 const sitesToIgnore = sitesWithScores.slice(1); 102 103 if (dryRun) { 104 logger.info(`\nDomain: ${domain}`); 105 logger.info(` KEEP: Site #${bestSite.id} (${bestSite.google_domain})`); 106 logger.info(` country_code: ${bestSite.country_code}`); 107 logger.info(` google_domain country: ${bestSite.googleCountry}`); 108 logger.info(` exact_match: ${bestSite.isExactMatch}`); 109 logger.info(` search_volume: ${bestSite.searchVolume}`); 110 111 for (const site of sitesToIgnore) { 112 logger.warn(` IGNORE: Site #${site.id} (${site.google_domain})`); 113 logger.warn(` country_code: ${site.country_code}`); 114 logger.warn(` google_domain country: ${site.googleCountry}`); 115 logger.warn(` exact_match: ${site.isExactMatch}`); 116 logger.warn(` search_volume: ${site.searchVolume}`); 117 } 118 } else { 119 // Mark others as ignored 120 for (const site of sitesToIgnore) { 121 db.prepare( 122 ` 123 UPDATE sites 124 SET status = 'ignored', 125 error_message = 'Cross-border duplicate (kept ' || ? || ' google_domain entry)' 126 WHERE id = ? 127 ` 128 ).run(bestSite.google_domain, site.id); 129 130 sitesIgnored++; 131 if (site.isExactMatch && bestSite.isExactMatch) { 132 exactMatches++; 133 } else { 134 crossBorder++; 135 } 136 } 137 138 logger.info( 139 ` ${domain}: Kept ${bestSite.google_domain} entry, ignored ${sitesToIgnore.length} duplicate(s)` 140 ); 141 } 142 } 143 144 const stats = { 145 duplicateDomains: duplicates.length, 146 sitesIgnored, 147 exactMatches, 148 crossBorder, 149 }; 150 151 if (dryRun) { 152 logger.info('\n=== DRY RUN - No changes made ==='); 153 } 154 155 logger.info(`\nDeduplication complete:`); 156 logger.info(` Duplicate domains: ${stats.duplicateDomains}`); 157 logger.info(` Sites ignored: ${stats.sitesIgnored}`); 158 logger.info(` Exact match duplicates: ${stats.exactMatches}`); 159 logger.info(` Cross-border duplicates: ${stats.crossBorder}`); 160 161 return stats; 162 }