/ scripts / llm-regions-cleanup.js
llm-regions-cleanup.js
  1  #!/usr/bin/env node
  2  
  3  /**
  4   * LLM-Based Regions Cleanup
  5   *
  6   * Uses Claude Sonnet to filter regions CSVs to keep ONLY place names.
  7   * Removes: weather queries, sports teams, universities, tourist attractions,
  8   * directions ("X to Y"), hotels, etc.
  9   *
 10   * Processes ALL countries' regions-final-filtered.csv files.
 11   */
 12  
 13  import fs from 'fs';
 14  import path from 'path';
 15  import { parse } from 'csv-parse/sync';
 16  import { stringify } from 'csv-stringify/sync';
 17  import Logger from '../src/utils/logger.js';
 18  import { callLLM } from '../src/utils/llm-provider.js';
 19  
 20  const logger = new Logger('LLMRegionsCleanup');
 21  
 22  /**
 23   * Batch keywords for LLM processing (to avoid token limits)
 24   */
 25  function batchKeywords(keywords, batchSize = 100) {
 26    const batches = [];
 27    for (let i = 0; i < keywords.length; i += batchSize) {
 28      batches.push(keywords.slice(i, i + batchSize));
 29    }
 30    return batches;
 31  }
 32  
 33  /**
 34   * Use LLM to filter keywords to keep only place names
 35   */
 36  async function filterPlaceNames(keywords, language = 'en', countryCode = null) {
 37    const systemPrompt = `You are a precise keyword filter for geographic place names. You have access to web search and can look up unfamiliar terms.
 38  
 39  Your task: From the provided list, keep ONLY keywords that are actual place names (cities, neighborhoods, districts, regions, states, provinces).
 40  
 41  REMOVE these categories:
 42  - Weather queries: "delhi weather", "temperature", "climate"
 43  - Sports teams/brands: "lucknow super giants", "chennai super kings"
 44  - Universities/schools: "delhi university", "iit bombay"
 45  - Directions/routes: "mumbai to pune", "delhi to agra"
 46  - Tourist attractions: "taj mahal", "gateway of india"
 47  - Hotels/venues: "marriott delhi", "phoenix mall"
 48  - Transit infrastructure: "railway station", "bus stand", "junction", "train station", "metro station"
 49  - State abbreviations: "nagpur mh", "bangalore ka" (MH=Maharashtra, KA=Karnataka)
 50  - Abbreviations: "bsk bangalore", "hsk bangalore" (unless you confirm these are neighborhoods)
 51  - Social media markers: "#secunderabad", "hyderabad #location"
 52  - Questions: "how to reach", "distance from"
 53  - Modifiers: "best places in", "things to do in"
 54  - Events: "festival", "concert"
 55  - ANY query that is not a pure place name
 56  
 57  KEEP only:
 58  - City names: "delhi", "mumbai", "bangalore"
 59  - Neighborhood/district names: "koramangala", "andheri", "whitefield"
 60  - Region names: "karnataka", "maharashtra"
 61  - State/province names
 62  - Village/town names
 63  
 64  Important rules:
 65  - If you're unsure whether something is a place name, you can search the web to verify
 66  - Be VERY aggressive: when in doubt, REMOVE it
 67  - Only keep if you're confident it's a real place name
 68  - Remove anything with "station", "stand", "junction", "#", state abbreviations
 69  - Preserve exact spelling/casing from input for keywords you keep
 70  
 71  Output only JSON with format: {"keep": ["place1", "place2", ...], "remove": [{"keyword": "...", "reason": "transit|abbreviation|weather|sports|university|direction|attraction|social_media|other"}]}`;
 72  
 73    const userPrompt = `Input JSON:
 74  { "language": "${language}", "country_code": "${countryCode || 'unknown'}", "keywords": ${JSON.stringify(keywords)} }
 75  
 76  Output JSON schema:
 77  { "keep": [...], "remove": [{"keyword":"...","reason":"..."}] }
 78  
 79  Output only the JSON object.`;
 80  
 81    try {
 82      const response = await callLLM({
 83        model: process.env.CLAUDE_SONNET_MODEL || 'anthropic/claude-sonnet-4-6',
 84        messages: [
 85          { role: 'system', content: systemPrompt },
 86          { role: 'user', content: userPrompt },
 87        ],
 88        temperature: 0.0,
 89        max_tokens: 4000,
 90      });
 91  
 92      // Parse JSON response
 93      let content = response.content.trim();
 94      if (content.startsWith('```json')) {
 95        content = content.replace(/^```json\n/, '').replace(/\n```$/, '');
 96      } else if (content.startsWith('```')) {
 97        content = content.replace(/^```\n/, '').replace(/\n```$/, '');
 98      }
 99  
100      const result = JSON.parse(content);
101      return result;
102    } catch (error) {
103      logger.error(`LLM filter failed: ${error.message}`);
104      // Fallback: keep all keywords if LLM fails
105      return { keep: keywords, remove: [] };
106    }
107  }
108  
/**
 * Filter a single regions CSV so that only LLM-approved place names remain.
 *
 * Reads the CSV, sends its `keyword` column to the LLM in batches, and keeps
 * only rows whose keyword (compared trimmed and case-insensitively) the LLM
 * asked to keep. Rows the LLM did not mention at all are removed with reason
 * `'other'`. The file is overwritten in place unless every row would be
 * removed or `dryRun` is set.
 *
 * @param {string} csvPath - Path of the CSV file to filter.
 * @param {string} countryCode - Label used in log output and passed to the LLM.
 * @param {{dryRun?: boolean}} [options] - `dryRun` skips the write; it defaults
 *   to whether `--dry-run` is present on the command line, so the CLI flag is
 *   honored even when callers do not pass it explicitly.
 * @returns {Promise<{kept: number, removed: number, reasons: Object<string, number>}>}
 *   Row counts, plus a per-reason tally of the rows that were removed.
 */
async function processRegionsFile(
  csvPath,
  countryCode,
  { dryRun = process.argv.includes('--dry-run') } = {},
) {
  if (!fs.existsSync(csvPath)) {
    logger.warn(`File not found: ${csvPath}`);
    return { kept: 0, removed: 0, reasons: {} };
  }

  logger.info(`Processing ${countryCode} regions...`);

  const content = fs.readFileSync(csvPath, 'utf-8');
  const rows = parse(content, { columns: true });

  if (rows.length === 0) {
    logger.warn(`Empty CSV: ${csvPath}`);
    return { kept: 0, removed: 0, reasons: {} };
  }

  // Without a keyword column there is nothing to classify; bail out instead of
  // crashing on `undefined.toLowerCase()` further down.
  if (rows[0].keyword === undefined) {
    logger.warn(`No "keyword" column in CSV: ${csvPath}`);
    return { kept: 0, removed: 0, reasons: {} };
  }

  // Single normalization used for both CSV keywords and LLM answers so the
  // two sides always compare on the same form.
  const toKey = (value) => String(value ?? '').trim().toLowerCase();

  const allKeywords = rows.map((row) => row.keyword);
  const batches = batchKeywords(allKeywords, 100);

  const keptKeywords = new Set();
  const removalReasons = new Map();

  logger.info(`  Processing ${batches.length} batches (${allKeywords.length} keywords)...`);

  for (let i = 0; i < batches.length; i++) {
    const batch = batches[i];
    logger.info(`  Batch ${i + 1}/${batches.length} (${batch.length} keywords)...`);

    const result = await filterPlaceNames(batch, 'mixed', countryCode);

    // A result without a `keep` array is handled like an LLM failure: the
    // batch is kept unchanged rather than being dropped wholesale.
    let keepList = result?.keep;
    if (!Array.isArray(keepList)) {
      logger.warn(`  Batch ${i + 1}: malformed LLM result, keeping batch unchanged`);
      keepList = batch;
    }
    const removeList = Array.isArray(result?.remove) ? result.remove : [];

    // Track kept keywords
    for (const keyword of keepList) {
      if (typeof keyword === 'string') {
        keptKeywords.add(toKey(keyword));
      }
    }

    // Track the reason given for each removed keyword
    for (const removal of removeList) {
      if (typeof removal?.keyword === 'string') {
        removalReasons.set(toKey(removal.keyword), removal.reason || 'other');
      }
    }

    // Rate limit delay (OpenRouter has limits)
    if (i < batches.length - 1) {
      await new Promise((resolve) => setTimeout(resolve, 1000));
    }
  }

  // Partition the original rows. Reasons are tallied from the rows actually
  // removed, so the tally always sums to `removed` and ignores keywords the
  // LLM reported that were never in the file.
  const keptRows = [];
  const reasons = {};
  let removedCount = 0;

  for (const row of rows) {
    const key = toKey(row.keyword);
    if (keptKeywords.has(key)) {
      keptRows.push(row);
      continue;
    }
    const reason = removalReasons.get(key) || 'other';
    reasons[reason] = (reasons[reason] || 0) + 1;
    removedCount += 1;
  }

  // Write filtered CSV
  if (keptRows.length === 0) {
    logger.warn(`  ⚠ All keywords removed! Not overwriting file.`);
  } else if (dryRun) {
    logger.info(`  [dry run] Would keep ${keptRows.length}, remove ${removedCount}`);
  } else {
    const newContent = stringify(keptRows, { header: true, columns: Object.keys(keptRows[0]) });
    fs.writeFileSync(csvPath, newContent);
    logger.success(`  ✓ Kept ${keptRows.length}, removed ${removedCount}`);
  }

  if (Object.keys(reasons).length > 0) {
    logger.info(`  Removal reasons:`);
    Object.entries(reasons)
      .sort((a, b) => b[1] - a[1])
      .forEach(([reason, count]) => {
        logger.info(`    ${reason}: ${count}`);
      });
  }

  return { kept: keptRows.length, removed: removedCount, reasons };
}
194  
/**
 * CLI entry point: run the LLM cleanup over every country's regions CSV.
 *
 * Country directories are the two-character sub-directories of `./data`.
 * Recognized arguments:
 *   --dry-run       announce dry-run mode in the log
 *   --country=XX    restrict processing to one country (case-insensitive)
 *
 * Exits the process with status 1 when no country directory is selected.
 */
async function main() {
  const args = process.argv.slice(2);
  const dryRun = args.includes('--dry-run');
  const countryFilter = args.find((arg) => arg.startsWith('--country='))?.split('=')[1];

  if (dryRun) {
    logger.info('DRY RUN MODE - no files will be modified\n');
  }

  logger.info('Starting LLM-based regions cleanup (place names only)...\n');

  // Get all countries
  const allCountries = fs
    .readdirSync('./data')
    .filter((entry) => fs.statSync(path.join('./data', entry)).isDirectory() && entry.length === 2);

  const selected = countryFilter
    ? allCountries.filter((code) => code.toLowerCase() === countryFilter.toLowerCase())
    : allCountries;

  // Sort a copy once instead of re-sorting (and mutating) the list at each use.
  const countries = [...selected].sort();

  if (countries.length === 0) {
    logger.error(
      countryFilter
        ? `No countries found matching filter: ${countryFilter}`
        : 'No country directories found in ./data',
    );
    process.exit(1);
  }

  logger.info(`Processing ${countries.length} countries: ${countries.join(', ')}\n`);

  let totalKept = 0;
  let totalRemoved = 0;
  const allReasons = {};

  // Fold one file's statistics into the run-wide totals.
  const accumulate = (result) => {
    totalKept += result.kept;
    totalRemoved += result.removed;
    for (const [reason, count] of Object.entries(result.reasons)) {
      allReasons[reason] = (allReasons[reason] || 0) + count;
    }
  };

  // Process Hindi regions file first (if exists). It lives under ./data/in, so
  // it is skipped when a --country filter has excluded India.
  const hindiRegPath = './data/in/regions-hindi-final-filtered.csv';
  const indiaSelected = countries.some((code) => code.toLowerCase() === 'in');
  if (indiaSelected && fs.existsSync(hindiRegPath)) {
    logger.info('Processing Hindi regions file...');
    accumulate(await processRegionsFile(hindiRegPath, 'in-hindi'));
    logger.info('');
  }

  // Process standard regions files for each country
  for (const country of countries) {
    const regPath = path.join('./data', country, 'regions-final-filtered.csv');

    if (!fs.existsSync(regPath)) {
      logger.warn(`${country}: No regions file found`);
      continue;
    }

    accumulate(await processRegionsFile(regPath, country));
    logger.info('');
  }

  logger.info('='.repeat(60));
  logger.success(`LLM Regions Cleanup Complete:`);
  logger.info(`  Kept: ${totalKept} keywords`);
  logger.info(`  Removed: ${totalRemoved} keywords`);

  if (Object.keys(allReasons).length > 0) {
    logger.info(`\nRemoval reasons (across all countries):`);
    Object.entries(allReasons)
      .sort((a, b) => b[1] - a[1])
      .forEach(([reason, count]) => {
        logger.info(`  ${reason}: ${count}`);
      });
  }

  logger.info(`\n${'='.repeat(60)}`);
  logger.success(`✓ Done! Processed ${countries.length} countries.`);
}
277  
// Script entry: run main() and, if it rejects, log a short message, dump the
// full error to stderr, and exit with a non-zero status.
const abortOnFatalError = (failure) => {
  logger.error(`Fatal error: ${failure.message}`);
  console.error(failure);
  process.exit(1);
};

main().catch(abortOnFatalError);