#!/usr/bin/env node

/**
 * llm-regions-cleanup.js
 *
 * LLM-Based Regions Cleanup
 *
 * Uses Claude Sonnet to filter regions CSVs to keep ONLY place names.
 * Removes: weather queries, sports teams, universities, tourist attractions,
 * directions ("X to Y"), hotels, etc.
 *
 * Processes ALL countries' regions-final-filtered.csv files.
 *
 * CLI flags:
 *   --dry-run        Run the LLM filter and report results, but never write files.
 *   --country=<cc>   Restrict processing to a single two-letter data directory.
 */

import fs from 'fs';
import path from 'path';
import { parse } from 'csv-parse/sync';
import { stringify } from 'csv-stringify/sync';
import Logger from '../src/utils/logger.js';
import { callLLM } from '../src/utils/llm-provider.js';

const logger = new Logger('LLMRegionsCleanup');

/** Number of keywords sent to the LLM per request. */
const KEYWORDS_PER_BATCH = 100;

/** Pause between consecutive LLM requests (the provider is rate limited). */
const BATCH_DELAY_MS = 1000;

/**
 * Batch keywords for LLM processing (to avoid token limits).
 *
 * @param {Array} keywords - Items to split; the input array is not mutated.
 * @param {number} [batchSize=100] - Maximum items per batch; must be a positive integer.
 * @returns {Array<Array>} Consecutive slices of `keywords`, each at most `batchSize` long.
 * @throws {RangeError} If `batchSize` is not a positive integer (it would otherwise loop forever).
 */
function batchKeywords(keywords, batchSize = 100) {
  if (!Number.isInteger(batchSize) || batchSize <= 0) {
    throw new RangeError(`batchSize must be a positive integer, got ${batchSize}`);
  }

  const batches = [];
  for (let i = 0; i < keywords.length; i += batchSize) {
    batches.push(keywords.slice(i, i + batchSize));
  }
  return batches;
}

/**
 * Pull the JSON object text out of a raw LLM reply.
 *
 * Tolerates an optional Markdown code fence (with or without a `json` tag,
 * LF or CRLF line endings) and stray prose before/after the object.
 *
 * @param {unknown} raw - The reply text.
 * @returns {string} Text that should be parseable by `JSON.parse`.
 */
function extractJsonPayload(raw) {
  const text = String(raw ?? '').trim();
  const fenced = text.match(/^```(?:json)?\s*([\s\S]*?)\s*```\s*$/i);
  const candidate = (fenced ? fenced[1] : text).trim();

  // If the model wrapped the object in prose, keep only the outermost {...}.
  const start = candidate.indexOf('{');
  const end = candidate.lastIndexOf('}');
  return start !== -1 && end > start ? candidate.slice(start, end + 1) : candidate;
}

/**
 * Validate and normalize the parsed LLM reply into the shape callers rely on.
 *
 * @param {unknown} parsed - Result of `JSON.parse` on the reply.
 * @returns {{keep: string[], remove: Array<{keyword: string, reason: string}>}}
 * @throws {Error} If there is no `keep` array (treated by the caller as an LLM failure).
 */
function normalizeFilterResult(parsed) {
  if (parsed === null || typeof parsed !== 'object' || !Array.isArray(parsed.keep)) {
    throw new Error('LLM response is missing a "keep" array');
  }

  const keep = parsed.keep.filter((keyword) => typeof keyword === 'string');
  const rawRemovals = Array.isArray(parsed.remove) ? parsed.remove : [];
  const remove = rawRemovals
    .filter((entry) => entry !== null && typeof entry === 'object' && typeof entry.keyword === 'string')
    .map((entry) => ({
      keyword: entry.keyword,
      reason: typeof entry.reason === 'string' && entry.reason !== '' ? entry.reason : 'other',
    }));

  return { keep, remove };
}

/**
 * Use LLM to filter keywords to keep only place names.
 *
 * On any failure (request error, unparseable or malformed reply) the error is
 * logged and ALL input keywords are returned as kept, so a failed call never
 * deletes data.
 *
 * @param {string[]} keywords - Keywords to classify.
 * @param {string} [language='en'] - Language hint embedded in the prompt.
 * @param {?string} [countryCode=null] - Country hint embedded in the prompt ('unknown' when falsy).
 * @returns {Promise<{keep: string[], remove: Array<{keyword: string, reason: string}>}>}
 */
async function filterPlaceNames(keywords, language = 'en', countryCode = null) {
  const systemPrompt = `You are a precise keyword filter for geographic place names. You have access to web search and can look up unfamiliar terms.

Your task: From the provided list, keep ONLY keywords that are actual place names (cities, neighborhoods, districts, regions, states, provinces).

REMOVE these categories:
- Weather queries: "delhi weather", "temperature", "climate"
- Sports teams/brands: "lucknow super giants", "chennai super kings"
- Universities/schools: "delhi university", "iit bombay"
- Directions/routes: "mumbai to pune", "delhi to agra"
- Tourist attractions: "taj mahal", "gateway of india"
- Hotels/venues: "marriott delhi", "phoenix mall"
- Transit infrastructure: "railway station", "bus stand", "junction", "train station", "metro station"
- State abbreviations: "nagpur mh", "bangalore ka" (MH=Maharashtra, KA=Karnataka)
- Abbreviations: "bsk bangalore", "hsk bangalore" (unless you confirm these are neighborhoods)
- Social media markers: "#secunderabad", "hyderabad #location"
- Questions: "how to reach", "distance from"
- Modifiers: "best places in", "things to do in"
- Events: "festival", "concert"
- ANY query that is not a pure place name

KEEP only:
- City names: "delhi", "mumbai", "bangalore"
- Neighborhood/district names: "koramangala", "andheri", "whitefield"
- Region names: "karnataka", "maharashtra"
- State/province names
- Village/town names

Important rules:
- If you're unsure whether something is a place name, you can search the web to verify
- Be VERY aggressive: when in doubt, REMOVE it
- Only keep if you're confident it's a real place name
- Remove anything with "station", "stand", "junction", "#", state abbreviations
- Preserve exact spelling/casing from input for keywords you keep

Output only JSON with format: {"keep": ["place1", "place2", ...], "remove": [{"keyword": "...", "reason": "transit|abbreviation|weather|sports|university|direction|attraction|social_media|other"}]}`;

  // JSON.stringify escapes quotes/backslashes so the embedded object stays valid
  // JSON; for ordinary values the text is identical to plain interpolation.
  const userPrompt = `Input JSON:
{ "language": ${JSON.stringify(String(language))}, "country_code": ${JSON.stringify(String(countryCode || 'unknown'))}, "keywords": ${JSON.stringify(keywords)} }

Output JSON schema:
{ "keep": [...], "remove": [{"keyword":"...","reason":"..."}] }

Output only the JSON object.`;

  try {
    const response = await callLLM({
      model: process.env.CLAUDE_SONNET_MODEL || 'anthropic/claude-sonnet-4-6',
      messages: [
        { role: 'system', content: systemPrompt },
        { role: 'user', content: userPrompt },
      ],
      temperature: 0.0,
      max_tokens: 4000,
    });

    return normalizeFilterResult(JSON.parse(extractJsonPayload(response.content)));
  } catch (error) {
    logger.error(`LLM filter failed: ${error.message}`);
    // Fallback: keep all keywords if LLM fails
    return { keep: keywords, remove: [] };
  }
}

/**
 * Add every count in `source` onto the matching key of `target` (mutates `target`).
 *
 * @param {Object<string, number>} target - Running totals.
 * @param {Object<string, number>} source - Counts to add.
 */
function mergeReasons(target, source) {
  for (const [reason, count] of Object.entries(source)) {
    target[reason] = (target[reason] || 0) + count;
  }
}

/**
 * Log one line per removal reason, most frequent first.
 *
 * @param {Object<string, number>} reasons - Reason -> count.
 */
function logReasonCounts(reasons) {
  const ordered = Object.entries(reasons).sort((a, b) => b[1] - a[1]);
  for (const [reason, count] of ordered) {
    logger.info(` ${reason}: ${count}`);
  }
}

/**
 * Process a single regions CSV file.
 *
 * Reads the CSV, classifies its `keyword` column in batches via the LLM, and
 * rewrites the file in place with only the rows whose keyword was kept
 * (matched case-insensitively). Rows the LLM neither kept nor listed as removed
 * are dropped with reason "other". The file is left untouched when nothing
 * would be kept, or when `dryRun` is set.
 *
 * @param {string} csvPath - Path of the CSV to filter in place.
 * @param {string} countryCode - Label used in logs and passed to the LLM as a hint.
 * @param {{dryRun?: boolean}} [options] - `dryRun: true` reports without writing.
 * @returns {Promise<{kept: number, removed: number, reasons: Object<string, number>}>}
 *   Row counts and a per-reason tally of the removed rows.
 */
async function processRegionsFile(csvPath, countryCode, { dryRun = false } = {}) {
  if (!fs.existsSync(csvPath)) {
    logger.warn(`File not found: ${csvPath}`);
    return { kept: 0, removed: 0, reasons: {} };
  }

  logger.info(`Processing ${countryCode} regions...`);

  const content = fs.readFileSync(csvPath, 'utf-8');
  const rows = parse(content, { columns: true });

  if (rows.length === 0) {
    logger.warn(`Empty CSV: ${csvPath}`);
    return { kept: 0, removed: 0, reasons: {} };
  }

  if (!('keyword' in rows[0])) {
    logger.warn(`No "keyword" column in ${csvPath}; skipping.`);
    return { kept: 0, removed: 0, reasons: {} };
  }

  const allKeywords = rows.map((row) => row.keyword);
  const batches = batchKeywords(allKeywords, KEYWORDS_PER_BATCH);

  const keptKeywords = new Set();
  const removedMap = new Map();

  logger.info(` Processing ${batches.length} batches (${allKeywords.length} keywords)...`);

  for (const [index, batch] of batches.entries()) {
    logger.info(` Batch ${index + 1}/${batches.length} (${batch.length} keywords)...`);

    const result = await filterPlaceNames(batch, 'mixed', countryCode);

    for (const keyword of result.keep) {
      keptKeywords.add(keyword.toLowerCase());
    }
    for (const removal of result.remove) {
      removedMap.set(removal.keyword.toLowerCase(), removal.reason);
    }

    // Rate limit delay (OpenRouter has limits); not needed after the last batch.
    if (index < batches.length - 1) {
      await new Promise((resolve) => setTimeout(resolve, BATCH_DELAY_MS));
    }
  }

  // Filter original rows based on LLM results. Reasons are tallied per removed
  // row so the per-reason counts always add up to the reported "removed" total.
  const keptRows = [];
  const reasons = {};
  let removedCount = 0;

  for (const row of rows) {
    const keyword = row.keyword.toLowerCase();
    if (keptKeywords.has(keyword)) {
      keptRows.push(row);
    } else {
      const reason = removedMap.get(keyword) || 'other';
      reasons[reason] = (reasons[reason] || 0) + 1;
      removedCount += 1;
    }
  }

  if (keptRows.length === 0) {
    logger.warn(` ⚠ All keywords removed! Not overwriting file.`);
  } else if (dryRun) {
    logger.info(` [dry-run] Would keep ${keptRows.length}, remove ${removedCount} (file not modified)`);
  } else {
    const newContent = stringify(keptRows, { header: true, columns: Object.keys(keptRows[0]) });
    fs.writeFileSync(csvPath, newContent);
    logger.success(` ✓ Kept ${keptRows.length}, removed ${removedCount}`);
  }

  if (Object.keys(reasons).length > 0) {
    logger.info(` Removal reasons:`);
    logReasonCounts(reasons);
  }

  return { kept: keptRows.length, removed: removedCount, reasons };
}

/**
 * CLI entry point: discovers two-letter country directories under ./data,
 * filters each one's regions CSV (plus the Hindi regions file when `in` is
 * selected), and logs aggregate totals. Exits with status 1 when no country
 * directory matches.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const args = process.argv.slice(2);
  const dryRun = args.includes('--dry-run');
  const countryFilter = args.find((arg) => arg.startsWith('--country='))?.split('=')[1];

  if (dryRun) {
    logger.info('DRY RUN MODE - no files will be modified\n');
  }

  logger.info('Starting LLM-based regions cleanup (place names only)...\n');

  // Get all countries
  const allCountries = fs
    .readdirSync('./data')
    .filter((entry) => entry.length === 2 && fs.statSync(path.join('./data', entry)).isDirectory());

  const selected = countryFilter
    ? allCountries.filter((code) => code.toLowerCase() === countryFilter.toLowerCase())
    : allCountries;

  if (selected.length === 0) {
    logger.error(`No countries found matching filter: ${countryFilter ?? '(none)'}`);
    process.exit(1);
  }

  // Sort a copy once instead of sorting in place at every use.
  const countries = [...selected].sort();

  logger.info(`Processing ${countries.length} countries: ${countries.join(', ')}\n`);

  let totalKept = 0;
  let totalRemoved = 0;
  const allReasons = {};

  const accumulate = (result) => {
    totalKept += result.kept;
    totalRemoved += result.removed;
    mergeReasons(allReasons, result.reasons);
    logger.info('');
  };

  // Process Hindi regions file first (if exists). It lives under data/in, so it
  // is only in scope when India is among the selected countries.
  const hindiRegPath = './data/in/regions-hindi-final-filtered.csv';
  const indiaSelected = countries.some((code) => code.toLowerCase() === 'in');
  if (indiaSelected && fs.existsSync(hindiRegPath)) {
    logger.info('Processing Hindi regions file...');
    accumulate(await processRegionsFile(hindiRegPath, 'in-hindi', { dryRun }));
  }

  // Process standard regions files for each country
  for (const country of countries) {
    const regPath = path.join('./data', country, 'regions-final-filtered.csv');

    if (!fs.existsSync(regPath)) {
      logger.warn(`${country}: No regions file found`);
      continue;
    }

    accumulate(await processRegionsFile(regPath, country, { dryRun }));
  }

  logger.info('='.repeat(60));
  logger.success(`LLM Regions Cleanup Complete:`);
  logger.info(` Kept: ${totalKept} keywords`);
  logger.info(` Removed: ${totalRemoved} keywords`);

  if (Object.keys(allReasons).length > 0) {
    logger.info(`\nRemoval reasons (across all countries):`);
    logReasonCounts(allReasons);
  }

  logger.info(`\n${'='.repeat(60)}`);
  logger.success(`✓ Done! Processed ${countries.length} countries.`);
}

main().catch((err) => {
  logger.error(`Fatal error: ${err.message}`);
  console.error(err);
  process.exit(1);
});