#!/usr/bin/env node

/**
 * filter-long-keywords.js
 *
 * Filter out keywords with more than 3 words from all *-final-filtered.csv files
 *
 * For every two-letter directory under ./data, the `businesses` and `regions`
 * CSV files are parsed, rows whose `keyword` column contains more than
 * MAX_WORDS whitespace-separated words are dropped, and the file is rewritten
 * in place (unless --dry-run is given). Files with nothing to remove are left
 * untouched and are not included in the summary totals.
 *
 * Usage:
 *   node scripts/filter-long-keywords.js [--dry-run] [--country XX]
 *   node scripts/filter-long-keywords.js [--dry-run] [--country=XX]
 */

import fs from 'fs';
import path from 'path';
import { parse } from 'csv-parse/sync';
import { stringify } from 'csv-stringify/sync';

const dataDir = './data';
const MAX_WORDS = 3;
const MAX_EXAMPLES = 5;

/**
 * Extract the optional country filter from the command-line arguments.
 * Accepts both `--country=XX` and `--country XX`.
 *
 * @param {string[]} argv - Arguments after the script name.
 * @returns {string|null} Lower-cased country code, or null when no filter was given.
 * @throws {Error} When the flag is present but has no value.
 */
function parseCountryArg(argv) {
  const inline = argv.find((a) => a.startsWith('--country='));
  const flagIndex = argv.indexOf('--country');
  if (inline === undefined && flagIndex === -1) return null;

  const value =
    inline !== undefined ? inline.slice('--country='.length) : argv[flagIndex + 1];

  // Refuse to guess: falling back to "all countries" would rewrite far more
  // files than the caller asked for.
  if (!value || value.startsWith('--')) {
    throw new Error('--country requires a country code (e.g. --country us)');
  }
  return value.toLowerCase();
}

/**
 * Count whitespace-separated words in a keyword.
 * A missing or empty keyword counts as one word, so such rows are kept.
 *
 * @param {unknown} keyword - Raw cell value from the CSV row.
 * @returns {number} Number of words.
 */
function countWords(keyword) {
  return String(keyword ?? '').trim().split(/\s+/).length;
}

/**
 * Parse a search-volume cell for sorting purposes.
 *
 * @param {unknown} raw - Raw cell value from the CSV row.
 * @returns {number} Parsed base-10 integer, or 0 when the value is not numeric.
 */
function toSearchVolume(raw) {
  const parsed = Number.parseInt(raw, 10);
  return Number.isNaN(parsed) ? 0 : parsed;
}

/**
 * Split parsed CSV rows into those within the word limit and those over it.
 *
 * @param {Array<Record<string, string>>} rows - Rows parsed with `columns: true`.
 * @returns {{ kept: Array<Record<string, string>>,
 *             removed: Array<{ keyword: string, wordCount: number, sv: string }> }}
 */
function partitionRows(rows) {
  const kept = [];
  const removed = [];

  for (const row of rows) {
    const wordCount = countWords(row.keyword);
    if (wordCount <= MAX_WORDS) {
      kept.push(row);
    } else {
      removed.push({ keyword: row.keyword, wordCount, sv: row.search_volume });
    }
  }

  return { kept, removed };
}

const args = process.argv.slice(2);
const dryRun = args.includes('--dry-run');

let targetCountry = null;
try {
  targetCountry = parseCountryArg(args);
} catch (err) {
  console.error(err.message);
  process.exit(1);
}

// Get all country directories (two-character directory names under dataDir)
let countries = fs.readdirSync(dataDir).filter((f) => {
  const fullPath = path.join(dataDir, f);
  return fs.statSync(fullPath).isDirectory() && f.length === 2;
});

if (targetCountry) {
  countries = countries.filter((c) => c === targetCountry);
  if (countries.length === 0) {
    console.warn(`No country directory "${targetCountry}" found in ${dataDir}`);
  }
}

console.log(`Filtering keywords to max ${MAX_WORDS} words...`);
if (dryRun) console.log('DRY RUN MODE - no files will be modified\n');
console.log('');

// Totals only cover files from which at least one keyword was removed.
let totalProcessed = 0;
let totalRemoved = 0;
let totalKept = 0;

for (const country of countries) {
  const countryPath = path.join(dataDir, country);

  // Process both businesses and regions
  for (const type of ['businesses', 'regions']) {
    const csvPath = path.join(countryPath, `${type}-final-filtered.csv`);

    if (!fs.existsSync(csvPath)) continue;

    const content = fs.readFileSync(csvPath, 'utf-8');
    const rows = parse(content, { columns: true });
    const { kept, removed } = partitionRows(rows);

    // Nothing to remove: leave the file alone and out of the totals.
    if (removed.length === 0) continue;

    console.log(`${country.toUpperCase()} ${type}:`);
    console.log(` Kept: ${kept.length}, Removed: ${removed.length}`);

    // Show examples of removed keywords (top 5 by search volume)
    const examples = [...removed]
      .sort((a, b) => toSearchVolume(b.sv) - toSearchVolume(a.sv))
      .slice(0, MAX_EXAMPLES);
    for (const ex of examples) {
      console.log(` - "${ex.keyword}" (${ex.wordCount} words, ${ex.sv} SV)`);
    }
    if (removed.length > MAX_EXAMPLES) {
      console.log(` ... and ${removed.length - MAX_EXAMPLES} more`);
    }
    console.log('');

    // Write filtered CSV
    if (!dryRun) {
      // Take the column list from the parsed input rather than from kept[0]:
      // `rows` is non-empty here, while `kept` is empty when every keyword in
      // the file exceeded the limit. The header row is still written then.
      const columns = Object.keys(rows[0]);
      const newContent = stringify(kept, { header: true, columns });
      fs.writeFileSync(csvPath, newContent);
    }

    totalRemoved += removed.length;
    totalKept += kept.length;
    totalProcessed++;
  }
}

console.log('='.repeat(60));
if (dryRun) {
  console.log('DRY RUN COMPLETE');
  console.log(`Would remove ${totalRemoved} keywords from ${totalProcessed} files`);
  console.log(`Would keep ${totalKept} keywords`);
  console.log('');
  console.log('Run without --dry-run to apply changes');
} else {
  console.log(`✓ Filtered ${totalProcessed} files`);
  console.log(`✓ Removed ${totalRemoved} keywords`);
  console.log(`✓ Kept ${totalKept} keywords`);
}