dedupe-domains.js
1 /** 2 * Domain Deduplication Utility 3 * Removes duplicate domains from sites table, keeping only the entry 4 * with the keyword that has the highest search volume 5 * 6 * DEPRECATED: Not called by the active pipeline (pipeline-service.js, src/stages/). 7 * Only referenced by scripts/dedupe-domains.js (a standalone CLI script) and its 8 * test files. Still uses the SQLite .prepare() API — do not use for new code. 9 */ 10 11 import Logger from './logger.js'; 12 13 const logger = new Logger('Dedupe'); 14 15 /** 16 * Deduplicate domains in sites table 17 * For each domain with multiple entries, keeps the one with highest search volume 18 * and marks others as status='ignored' with error_message='Duplicate domain' 19 * 20 * @param {Database} db - Database connection 21 * @param {Object} options - Options 22 * @param {boolean} options.dryRun - If true, only report what would be deduped 23 * @returns {Object} Deduplication stats 24 */ 25 export function dedupeDomains(db, options = {}) { 26 const { dryRun = false } = options; 27 28 // Find all domains that appear multiple times 29 const duplicateDomains = db 30 .prepare( 31 ` 32 SELECT domain, COUNT(*) as count 33 FROM sites 34 WHERE status NOT IN ('ignored', 'outreach_partial', 'outreach_sent') 35 GROUP BY domain 36 HAVING COUNT(*) > 1 37 ORDER BY COUNT(*) DESC 38 ` 39 ) 40 .all(); 41 42 if (duplicateDomains.length === 0) { 43 logger.info('No duplicate domains found'); 44 return { 45 duplicateDomains: 0, 46 sitesMarkedIgnored: 0, 47 sitesKept: 0, 48 }; 49 } 50 51 logger.info(`Found ${duplicateDomains.length} domains with multiple entries`); 52 53 const stats = { 54 duplicateDomains: duplicateDomains.length, 55 sitesMarkedIgnored: 0, 56 sitesKept: 0, 57 }; 58 59 // For each duplicate domain, find the best entry to keep 60 for (const { domain } of duplicateDomains) { 61 // Get all site entries for this domain with keyword search volumes 62 const sites = db 63 .prepare( 64 ` 65 SELECT 66 s.id, 67 s.keyword, 68 s.status, 69 COALESCE(k.search_volume, 0) as search_volume 70 FROM sites s 71 LEFT JOIN keywords k ON s.keyword = k.keyword AND s.country_code = k.country_code 72 WHERE s.domain = ? 73 AND s.status NOT IN ('ignored', 'outreach_sent') 74 ORDER BY k.search_volume DESC, s.id ASC 75 ` 76 ) 77 .all(domain); 78 79 if (sites.length <= 1) { 80 continue; // Skip if only one non-ignored entry 81 } 82 83 // Keep the first one (highest search volume) 84 const keepSite = sites[0]; 85 const duplicateSites = sites.slice(1); 86 87 if (dryRun) { 88 logger.info( 89 ` Would keep: ${domain} (keyword: "${keepSite.keyword}", search_volume: ${keepSite.search_volume})` 90 ); 91 logger.info( 92 ` Would ignore: ${duplicateSites.length} duplicate(s) for keywords: ${duplicateSites.map(s => `"${s.keyword}"`).join(', ')}` 93 ); 94 } else { 95 // Mark duplicates as ignored 96 const markIgnored = db.prepare(` 97 UPDATE sites 98 SET status = 'ignored', 99 error_message = 'Duplicate domain (kept highest search volume keyword)' 100 WHERE id = ? 101 `); 102 103 for (const site of duplicateSites) { 104 markIgnored.run(site.id); 105 stats.sitesMarkedIgnored++; 106 } 107 108 logger.info( 109 ` Kept: ${domain} (keyword: "${keepSite.keyword}", search_volume: ${keepSite.search_volume})` 110 ); 111 logger.info( 112 ` Marked ${duplicateSites.length} duplicate(s) as ignored for keywords: ${duplicateSites.map(s => `"${s.keyword}"`).join(', ')}` 113 ); 114 } 115 116 stats.sitesKept++; 117 } 118 119 if (dryRun) { 120 logger.info('\nDry run complete - no changes made'); 121 } else { 122 logger.success( 123 `\nDeduplication complete: ${stats.sitesKept} domains kept, ${stats.sitesMarkedIgnored} duplicates marked as ignored` 124 ); 125 } 126 127 return stats; 128 }