dedupe-outreaches.js
1 #!/usr/bin/env node 2 3 /** 4 * Deduplicate Outreaches 5 * 6 * Removes duplicate outreach records that have the same (site_id, contact_method, contact_uri). 7 * Keeps the oldest outreach for each unique combination. 8 * 9 * Usage: 10 * npm run dedupe:outreaches -- --dry-run (preview only) 11 * npm run dedupe:outreaches (execute) 12 */ 13 14 import { createDatabaseConnection } from '../src/utils/db.js'; 15 import { join, dirname } from 'path'; 16 import { fileURLToPath } from 'url'; 17 import dotenv from 'dotenv'; 18 19 dotenv.config(); 20 21 const __filename = fileURLToPath(import.meta.url); 22 const __dirname = dirname(__filename); 23 const projectRoot = join(__dirname, '../..'); 24 25 const dbPath = process.env.DATABASE_PATH || join(projectRoot, 'db/sites.db'); 26 const dryRun = process.argv.includes('--dry-run'); 27 28 /** 29 * Find and remove duplicate outreaches 30 */ 31 function deduplicateOutreaches() { 32 const db = createDatabaseConnection(dbPath); 33 34 try { 35 // Disable foreign keys temporarily for deduplication 36 // (No conversations reference duplicate outreaches, but we'll be safe) 37 db.pragma('foreign_keys = OFF'); 38 39 console.log('🔍 Checking for duplicate outreaches...\n'); 40 41 // Find duplicates (keep oldest by id) 42 const duplicates = db 43 .prepare( 44 ` 45 SELECT 46 o1.id, 47 o1.site_id, 48 o1.contact_method, 49 o1.contact_uri, 50 o1.created_at, 51 s.domain 52 FROM outreaches o1 53 INNER JOIN outreaches o2 54 ON o1.site_id = o2.site_id 55 AND o1.contact_method = o2.contact_method 56 AND o1.contact_uri = o2.contact_uri 57 AND o1.id > o2.id -- Keep the one with lower id (older) 58 INNER JOIN sites s ON s.id = o1.site_id 59 ORDER BY o1.site_id, o1.contact_method, o1.contact_uri, o1.id 60 ` 61 ) 62 .all(); 63 64 if (duplicates.length === 0) { 65 console.log('✅ No duplicate outreaches found!'); 66 return; 67 } 68 69 console.log(`Found ${duplicates.length} duplicate outreach records:\n`); 70 71 // Group duplicates by site for display 72 const duplicatesBySite = {}; 73 for (const dup of duplicates) { 74 if (!duplicatesBySite[dup.site_id]) { 75 duplicatesBySite[dup.site_id] = { 76 domain: dup.domain, 77 contacts: {}, 78 }; 79 } 80 const key = `${dup.contact_method}:${dup.contact_uri}`; 81 if (!duplicatesBySite[dup.site_id].contacts[key]) { 82 duplicatesBySite[dup.site_id].contacts[key] = 0; 83 } 84 duplicatesBySite[dup.site_id].contacts[key]++; 85 } 86 87 // Display summary 88 for (const [siteId, info] of Object.entries(duplicatesBySite)) { 89 console.log(` Site ${siteId} (${info.domain}):`); 90 for (const [contact, count] of Object.entries(info.contacts)) { 91 console.log(` - ${contact}: ${count + 1} copies (removing ${count})`); 92 } 93 } 94 95 console.log(); 96 97 if (dryRun) { 98 console.log('🔍 DRY RUN - No changes made.'); 99 console.log(` Run without --dry-run to delete ${duplicates.length} duplicate records.`); 100 return; 101 } 102 103 // Delete duplicates 104 console.log(`🗑️ Deleting ${duplicates.length} duplicate outreach records...`); 105 106 const duplicateIds = duplicates.map(d => d.id); 107 const placeholders = duplicateIds.map(() => '?').join(','); 108 109 db.prepare(`DELETE FROM outreaches WHERE id IN (${placeholders})`).run(...duplicateIds); 110 111 console.log('✅ Duplicates removed successfully!'); 112 113 // Verify 114 const remaining = db 115 .prepare( 116 ` 117 SELECT COUNT(*) as count 118 FROM outreaches o1 119 INNER JOIN outreaches o2 120 ON o1.site_id = o2.site_id 121 AND o1.contact_method = o2.contact_method 122 AND o1.contact_uri = o2.contact_uri 123 AND o1.id != o2.id 124 ` 125 ) 126 .get(); 127 128 if (remaining.count === 0) { 129 console.log('✅ Verification passed - no duplicates remain!'); 130 } else { 131 console.warn(`⚠️ Warning: ${remaining.count} duplicates still exist (should not happen)`); 132 } 133 } finally { 134 db.close(); 135 } 136 } 137 138 // Run deduplication 139 try { 140 deduplicateOutreaches(); 141 } catch (error) { 142 console.error('❌ Error:', error.message); 143 process.exit(1); 144 }