/ scripts / filter-long-keywords.js
filter-long-keywords.js
  1  #!/usr/bin/env node
  2  
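/**
 * Remove keywords longer than 3 words from the businesses- and
 * regions-final-filtered.csv files of every two-letter country directory
 * under ./data. Files are rewritten in place unless --dry-run is given.
 *
 * Usage:
 *   node scripts/filter-long-keywords.js [--dry-run] [--country=XX]
 */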
  9  
 10  import fs from 'fs';
 11  import path from 'path';
 12  import { parse } from 'csv-parse/sync';
 13  import { stringify } from 'csv-stringify/sync';
 14  
 15  const args = process.argv.slice(2);
 16  const dryRun = args.includes('--dry-run');
 17  const countryArg = args.find(a => a.startsWith('--country='));
 18  const targetCountry = countryArg ? countryArg.split('=')[1].toLowerCase() : null;
 19  
 20  const dataDir = './data';
 21  const MAX_WORDS = 3;
 22  
 23  // Get all country directories
 24  let countries = fs.readdirSync(dataDir).filter(f => {
 25    const fullPath = path.join(dataDir, f);
 26    return fs.statSync(fullPath).isDirectory() && f.length === 2;
 27  });
 28  
 29  if (targetCountry) {
 30    countries = countries.filter(c => c === targetCountry);
 31  }
 32  
 33  console.log(`Filtering keywords to max ${MAX_WORDS} words...`);
 34  if (dryRun) console.log('DRY RUN MODE - no files will be modified\n');
 35  console.log('');
 36  
 37  let totalProcessed = 0;
 38  let totalRemoved = 0;
 39  let totalKept = 0;
 40  
 41  for (const country of countries) {
 42    const countryPath = path.join(dataDir, country);
 43  
 44    // Process both businesses and regions
 45    for (const type of ['businesses', 'regions']) {
 46      const csvPath = path.join(countryPath, `${type}-final-filtered.csv`);
 47  
 48      if (!fs.existsSync(csvPath)) continue;
 49  
 50      const content = fs.readFileSync(csvPath, 'utf-8');
 51      const rows = parse(content, { columns: true });
 52  
 53      const kept = [];
 54      const removed = [];
 55  
 56      for (const row of rows) {
 57        const { keyword } = row;
 58        const wordCount = keyword.trim().split(/\s+/).length;
 59  
 60        if (wordCount <= MAX_WORDS) {
 61          kept.push(row);
 62        } else {
 63          removed.push({ keyword, wordCount, sv: row.search_volume });
 64        }
 65      }
 66  
 67      if (removed.length > 0) {
 68        console.log(`${country.toUpperCase()} ${type}:`);
 69        console.log(`  Kept: ${kept.length}, Removed: ${removed.length}`);
 70  
 71        // Show examples of removed keywords (top 5 by search volume)
 72        const examples = removed.sort((a, b) => parseInt(b.sv) - parseInt(a.sv)).slice(0, 5);
 73        examples.forEach(ex => {
 74          console.log(`    - "${ex.keyword}" (${ex.wordCount} words, ${ex.sv} SV)`);
 75        });
 76        if (removed.length > 5) {
 77          console.log(`    ... and ${removed.length - 5} more`);
 78        }
 79        console.log('');
 80  
 81        // Write filtered CSV
 82        if (!dryRun) {
 83          const newContent = stringify(kept, { header: true, columns: Object.keys(kept[0]) });
 84          fs.writeFileSync(csvPath, newContent);
 85        }
 86  
 87        totalRemoved += removed.length;
 88        totalKept += kept.length;
 89        totalProcessed++;
 90      }
 91    }
 92  }
 93  
 94  console.log('='.repeat(60));
 95  if (dryRun) {
 96    console.log('DRY RUN COMPLETE');
 97    console.log(`Would remove ${totalRemoved} keywords from ${totalProcessed} files`);
 98    console.log(`Would keep ${totalKept} keywords`);
 99    console.log('');
100    console.log('Run without --dry-run to apply changes');
101  } else {
102    console.log(`✓ Filtered ${totalProcessed} files`);
103    console.log(`✓ Removed ${totalRemoved} keywords`);
104    console.log(`✓ Kept ${totalKept} keywords`);
105  }