/ scripts / dedupe-outreaches.js
dedupe-outreaches.js
  1  #!/usr/bin/env node
  2  
  3  /**
  4   * Deduplicate Outreaches
  5   *
  6   * Removes duplicate outreach records that have the same (site_id, contact_method, contact_uri).
  7   * Keeps the oldest outreach for each unique combination.
  8   *
  9   * Usage:
 10   *   npm run dedupe:outreaches -- --dry-run  (preview only)
 11   *   npm run dedupe:outreaches                (execute)
 12   */
 13  
 14  import { createDatabaseConnection } from '../src/utils/db.js';
 15  import { join, dirname } from 'path';
 16  import { fileURLToPath } from 'url';
 17  import dotenv from 'dotenv';
 18  
 19  dotenv.config();
 20  
 21  const __filename = fileURLToPath(import.meta.url);
 22  const __dirname = dirname(__filename);
 23  const projectRoot = join(__dirname, '../..');
 24  
 25  const dbPath = process.env.DATABASE_PATH || join(projectRoot, 'db/sites.db');
 26  const dryRun = process.argv.includes('--dry-run');
 27  
 28  /**
 29   * Find and remove duplicate outreaches
 30   */
 31  function deduplicateOutreaches() {
 32    const db = createDatabaseConnection(dbPath);
 33  
 34    try {
 35      // Disable foreign keys temporarily for deduplication
 36      // (No conversations reference duplicate outreaches, but we'll be safe)
 37      db.pragma('foreign_keys = OFF');
 38  
 39      console.log('🔍 Checking for duplicate outreaches...\n');
 40  
 41      // Find duplicates (keep oldest by id)
 42      const duplicates = db
 43        .prepare(
 44          `
 45          SELECT
 46            o1.id,
 47            o1.site_id,
 48            o1.contact_method,
 49            o1.contact_uri,
 50            o1.created_at,
 51            s.domain
 52          FROM outreaches o1
 53          INNER JOIN outreaches o2
 54            ON o1.site_id = o2.site_id
 55            AND o1.contact_method = o2.contact_method
 56            AND o1.contact_uri = o2.contact_uri
 57            AND o1.id > o2.id  -- Keep the one with lower id (older)
 58          INNER JOIN sites s ON s.id = o1.site_id
 59          ORDER BY o1.site_id, o1.contact_method, o1.contact_uri, o1.id
 60        `
 61        )
 62        .all();
 63  
 64      if (duplicates.length === 0) {
 65        console.log('✅ No duplicate outreaches found!');
 66        return;
 67      }
 68  
 69      console.log(`Found ${duplicates.length} duplicate outreach records:\n`);
 70  
 71      // Group duplicates by site for display
 72      const duplicatesBySite = {};
 73      for (const dup of duplicates) {
 74        if (!duplicatesBySite[dup.site_id]) {
 75          duplicatesBySite[dup.site_id] = {
 76            domain: dup.domain,
 77            contacts: {},
 78          };
 79        }
 80        const key = `${dup.contact_method}:${dup.contact_uri}`;
 81        if (!duplicatesBySite[dup.site_id].contacts[key]) {
 82          duplicatesBySite[dup.site_id].contacts[key] = 0;
 83        }
 84        duplicatesBySite[dup.site_id].contacts[key]++;
 85      }
 86  
 87      // Display summary
 88      for (const [siteId, info] of Object.entries(duplicatesBySite)) {
 89        console.log(`  Site ${siteId} (${info.domain}):`);
 90        for (const [contact, count] of Object.entries(info.contacts)) {
 91          console.log(`    - ${contact}: ${count + 1} copies (removing ${count})`);
 92        }
 93      }
 94  
 95      console.log();
 96  
 97      if (dryRun) {
 98        console.log('🔍 DRY RUN - No changes made.');
 99        console.log(`   Run without --dry-run to delete ${duplicates.length} duplicate records.`);
100        return;
101      }
102  
103      // Delete duplicates
104      console.log(`🗑️  Deleting ${duplicates.length} duplicate outreach records...`);
105  
106      const duplicateIds = duplicates.map(d => d.id);
107      const placeholders = duplicateIds.map(() => '?').join(',');
108  
109      db.prepare(`DELETE FROM outreaches WHERE id IN (${placeholders})`).run(...duplicateIds);
110  
111      console.log('✅ Duplicates removed successfully!');
112  
113      // Verify
114      const remaining = db
115        .prepare(
116          `
117          SELECT COUNT(*) as count
118          FROM outreaches o1
119          INNER JOIN outreaches o2
120            ON o1.site_id = o2.site_id
121            AND o1.contact_method = o2.contact_method
122            AND o1.contact_uri = o2.contact_uri
123            AND o1.id != o2.id
124        `
125        )
126        .get();
127  
128      if (remaining.count === 0) {
129        console.log('✅ Verification passed - no duplicates remain!');
130      } else {
131        console.warn(`⚠️  Warning: ${remaining.count} duplicates still exist (should not happen)`);
132      }
133    } finally {
134      db.close();
135    }
136  }
137  
138  // Run deduplication
139  try {
140    deduplicateOutreaches();
141  } catch (error) {
142    console.error('❌ Error:', error.message);
143    process.exit(1);
144  }