/ scripts / dedupe-domains.js
dedupe-domains.js
 1  #!/usr/bin/env node
 2  
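/**
 * Manual domain deduplication script.
 *
 * Removes duplicate domains from the sites table, keeping only the entry
 * whose keyword has the highest search volume.
 *
 * Pass --dry-run to report duplicates without changing the database.
 * The database file is read from the DATABASE_PATH environment variable and
 * defaults to ./db/sites.db.
 */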
 8  
 9  import { createDatabaseConnection } from '../src/utils/db.js';
10  import { dedupeDomains } from '../src/utils/dedupe-domains.js';
11  import Logger from '../src/utils/logger.js';
12  
// Every message emitted by this script goes through a logger tagged 'Dedupe'.
const logger = new Logger('Dedupe');

// Database location: DATABASE_PATH wins when it is set to a non-empty value,
// otherwise the default file is used.
const { DATABASE_PATH } = process.env;
const dbPath = DATABASE_PATH || './db/sites.db';

// Command line flags: drop the node binary and the script path, keep the rest.
const [, , ...args] = process.argv;
const dryRun = args.some((arg) => arg === '--dry-run');
20  
/**
 * Runs one deduplication pass against the database at `dbPath` and reports
 * the outcome through the module-level `logger`.
 *
 * The module-level `dryRun` flag is forwarded to `dedupeDomains`.
 *
 * The exit status is signalled through `process.exitCode` instead of
 * `process.exit()`: `process.exit()` terminates the process immediately, so
 * the `finally` block would be skipped and the connection never closed.
 *
 * @returns {Promise<void>} Settles once the result has been reported and the
 *   connection has been closed. Errors thrown inside the `try` block are
 *   logged and turned into exit code 1 rather than a rejection.
 */
async function main() {
  const db = createDatabaseConnection(dbPath);

  try {
    if (dryRun) {
      logger.info('Running in dry-run mode (no changes will be made)');
    }

    const stats = dedupeDomains(db, { dryRun });

    if (stats.duplicateDomains === 0) {
      logger.info('\nNo duplicate domains found - database is clean!');
      // Plain return: the default exit code is already 0, and returning lets
      // the finally block close the connection.
      return;
    }

    if (dryRun) {
      logger.info(
        `\n${stats.duplicateDomains} duplicate domains found. Run without --dry-run to deduplicate.`
      );
    } else {
      logger.success(
        `\n✓ Successfully deduplicated ${stats.duplicateDomains} domains (${stats.sitesMarkedIgnored} duplicates marked as ignored)`
      );
    }
  } catch (err) {
    logger.error(`Deduplication failed: ${err.message}`);
    // Record the failure without killing the process before cleanup runs.
    process.exitCode = 1;
  } finally {
    db.close();
  }
}
52  
// Surface anything that escapes main() (for example a failure to open or
// close the database, both of which happen outside its try block) as a logged
// error and a non-zero exit status instead of an unhandled promise rejection.
main().catch((err) => {
  logger.error(`Deduplication failed: ${err.message}`);
  process.exitCode = 1;
});