/ scripts / deduplicate-regions.js
deduplicate-regions.js
  1  #!/usr/bin/env node
  2  
  3  import fs from 'fs';
  4  import path from 'path';
  5  import { fileURLToPath } from 'url';
  6  
  7  const __dirname = path.dirname(fileURLToPath(import.meta.url));
  8  const dataDir = path.join(__dirname, '..', 'data');
  9  
 10  const countries = [
 11    'at',
 12    'au',
 13    'be',
 14    'ca',
 15    'ch',
 16    'cn',
 17    'de',
 18    'dk',
 19    'es',
 20    'fr',
 21    'id',
 22    'ie',
 23    'in',
 24    'it',
 25    'jp',
 26    'kr',
 27    'mx',
 28    'nl',
 29    'no',
 30    'nz',
 31    'pl',
 32    'se',
 33    'sg',
 34    'uk',
 35    'us',
 36  ];
 37  
 38  function deduplicateRegions(filePath) {
 39    const content = fs.readFileSync(filePath, 'utf-8');
 40    const lines = content.split('\n');
 41  
 42    const seen = new Set();
 43    const deduplicated = [];
 44    let duplicatesCount = 0;
 45  
 46    for (const line of lines) {
 47      // Preserve comments and blank lines
 48      if (line.trim().startsWith('#') || line.trim() === '') {
 49        deduplicated.push(line);
 50        continue;
 51      }
 52  
 53      // Check for duplicates (case-insensitive)
 54      const normalized = line.trim().toLowerCase();
 55      if (normalized === '') {
 56        deduplicated.push(line);
 57        continue;
 58      }
 59  
 60      if (seen.has(normalized)) {
 61        duplicatesCount++;
 62        continue; // Skip duplicate
 63      }
 64  
 65      seen.add(normalized);
 66      deduplicated.push(line);
 67    }
 68  
 69    return { deduplicated: deduplicated.join('\n'), duplicatesCount };
 70  }
 71  
 72  console.log('Deduplicating regions.txt files...\n');
 73  
 74  const results = [];
 75  
 76  for (const country of countries) {
 77    const filePath = path.join(dataDir, country, 'regions.txt');
 78  
 79    if (!fs.existsSync(filePath)) {
 80      console.log(`⚠️  ${country.toUpperCase()}: File not found`);
 81      continue;
 82    }
 83  
 84    const { deduplicated, duplicatesCount } = deduplicateRegions(filePath);
 85  
 86    if (duplicatesCount > 0) {
 87      fs.writeFileSync(filePath, deduplicated, 'utf-8');
 88      console.log(`✅ ${country.toUpperCase()}: Removed ${duplicatesCount} duplicate(s)`);
 89    } else {
 90      console.log(`✓  ${country.toUpperCase()}: No duplicates found`);
 91    }
 92  
 93    results.push({ country, duplicatesCount });
 94  }
 95  
 96  console.log(`\n${'='.repeat(50)}`);
 97  console.log('SUMMARY');
 98  console.log('='.repeat(50));
 99  
100  const totalDuplicates = results.reduce((sum, r) => sum + r.duplicatesCount, 0);
101  const countriesWithDuplicates = results.filter(r => r.duplicatesCount > 0);
102  
103  if (countriesWithDuplicates.length > 0) {
104    console.log('\nCountries with duplicates removed:');
105    countriesWithDuplicates.forEach(r => {
106      console.log(`  ${r.country.toUpperCase()}: ${r.duplicatesCount} duplicate(s)`);
107    });
108  }
109  
110  console.log(`\nTotal duplicates removed: ${totalDuplicates}`);
111  console.log(`Files processed: ${results.length}`);
112  console.log(`Files with duplicates: ${countriesWithDuplicates.length}`);