/ scripts / comprehensive-keyword-cleanup.js
comprehensive-keyword-cleanup.js
  1  #!/usr/bin/env node
  2  
  3  /**
  4   * Comprehensive Keyword Cleanup
  5   *
  6   * Applies across ALL countries:
  7   * 1. Remove junk patterns (near me, map, directions, translations, pronunciation, job, pdf)
  8   * 2. Remove low search volume (1-999, keep 0 and ≥1000)
  9   * 3. Remove >3 words
 10   * 4. Remove place names from business keywords
 11   * 5. Clean database
 12   */
 13  
 14  import fs from 'fs';
 15  import path from 'path';
 16  import { parse } from 'csv-parse/sync';
 17  import { stringify } from 'csv-stringify/sync';
 18  import { createDatabaseConnection } from '../src/utils/db.js';
 19  import Logger from '../src/utils/logger.js';
 20  
 21  const logger = new Logger('ComprehensiveCleanup');
 22  
 23  // COMPREHENSIVE JUNK PATTERNS
 24  const JUNK_PATTERNS = [
 25    // === LOCATION MODIFIERS ===
 26    // English
 27    /\bnear me\b/i,
 28    /\bnearby\b/i,
 29    /\bclose to me\b/i,
 30    /\baround me\b/i,
 31    /\bin my area\b/i,
 32    /\bmap\b$/i,
 33    /\bmaps\b$/i,
 34    /\bdirections?\b/i,
 35    /\broute to\b/i,
 36    /\bhow to get to\b/i,
 37  
 38    // Hindi (Devanagari)
 39    /पास में/,
 40    /नजदीक/,
 41    /मेरे पास/,
 42    /मेरे आसपास/,
 43    /के पास$/,
 44    /यहाँ के पास/,
 45    /रास्ता/, // route
 46    /दिशा/, // direction
 47  
 48    // French
 49    /près de moi/i,
 50    /à proximité/i,
 51    /autour de moi/i,
 52    /carte$/i,
 53    /itinéraire/i, // route
 54    /directions?$/i,
 55  
 56    // German
 57    /in meiner nähe/i,
 58    /in der nähe/i,
 59    /karte$/i,
 60    /weg zu/i, // way to
 61    /anfahrt/i, // directions
 62  
 63    // Spanish
 64    /cerca de mí/i,
 65    /cerca de mi/i,
 66    /a mi alrededor/i,
 67    /mapa$/i,
 68    /direcciones$/i,
 69    /ruta a/i,
 70  
 71    // Dutch
 72    /bij mij in de buurt/i,
 73    /in de buurt/i,
 74    /kaart$/i,
 75    /route naar/i,
 76  
 77    // Italian
 78    /vicino a me/i,
 79    /nelle vicinanze/i,
 80    /mappa$/i,
 81    /indicazioni/i,
 82  
 83    // === TRANSLATION REQUESTS ===
 84    /\bin english$/i,
 85    /\bin hindi$/i,
 86    /\bin french$/i,
 87    /\bin german$/i,
 88    /\bin spanish$/i,
 89    /\bin chinese$/i,
 90    /\bin japanese$/i,
 91    /\bin korean$/i,
 92    /\bmeaning in english\b/i,
 93    /\btranslation\b/i,
 94    /\btranslate\b/i,
 95  
 96    // === PRONUNCIATION ===
 97    /\bpronunciation\b/i,
 98    /\bhow to pronounce\b/i,
 99    /\bhow to say\b/i,
100  
101    // === DIY / EDUCATIONAL ===
102    /\bpdf$/i,
103    /\bebook\b/i,
104    /\bbook\b$/i,
105    /\bguide\b$/i,
106    /\btutorial\b/i,
107    /\bcourse\b/i,
108    /\btraining\b/i,
109    /\blearn\b/i,
110    /\bhow to\b/i,
111    /\bdiy\b/i,
112  
113    // === JOBS ===
114    /\bjob$/i,
115    /\bjobs\b/i,
116    /\bsalary\b/i,
117    /\bwage\b/i,
118    /\bpay\b$/i,
119    /\bvacancy\b/i,
120    /\bhiring\b/i,
121    /\brecruit/i,
122    /\bcareer\b/i,
123    /\bresume\b/i,
124    /\bcv\b$/i,
125  ];
126  
127  /**
128   * Check if keyword matches any junk pattern
129   */
130  function isJunk(keyword) {
131    for (const pattern of JUNK_PATTERNS) {
132      if (pattern.test(keyword)) {
133        return true;
134      }
135    }
136  
137    // Additional checks for compound keywords (e.g., "HVAC repair weather-sengkang")
138    // These were created by keyword expansion combining business + junk regions
139    const compoundJunkPatterns = [
140      / weather[\s-]/i, // "carpenter weather sengkang", "plumber weather-yishun"
141      /[\s-]weather($| )/i, // "carpenter sengkang weather", "plumber-weather yishun"
142      / to /i, // "X to Y" directions
143      / university/i, // "delhi university"
144      /super giants/i, // Sports teams
145      / stadium/i, // Sports venues
146      / temperature/i, // Weather queries
147      / climate/i, // Weather queries
148    ];
149  
150    for (const pattern of compoundJunkPatterns) {
151      if (pattern.test(keyword)) {
152        return true;
153      }
154    }
155  
156    return false;
157  }
158  
159  /**
160   * Load region names from a country's regions CSV
161   */
162  function loadRegionNames(countryCode) {
163    try {
164      const regionsPath = path.join(
165        './data',
166        countryCode.toLowerCase(),
167        'regions-final-filtered.csv'
168      );
169      if (!fs.existsSync(regionsPath)) {
170        return [];
171      }
172  
173      const content = fs.readFileSync(regionsPath, 'utf-8');
174      const records = parse(content, { columns: true, skip_empty_lines: true });
175      return records.map(row => row.keyword.toLowerCase()).filter(r => r && r.trim());
176    } catch (error) {
177      logger.warn(`Failed to load regions for ${countryCode}: ${error.message}`);
178      return [];
179    }
180  }
181  
182  /**
183   * Check if business keyword contains a place name
184   */
185  function containsPlaceName(keyword, regions) {
186    const kw = keyword.toLowerCase();
187    for (const region of regions) {
188      const regex = new RegExp(`\\b${region.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}\\b`, 'i');
189      if (regex.test(kw)) {
190        return true;
191      }
192    }
193    return false;
194  }
195  
196  /**
197   * Filter a CSV file
198   */
199  function filterCSV(csvPath, type, countryCode) {
200    if (!fs.existsSync(csvPath)) {
201      return { kept: 0, removed: 0, reasons: {} };
202    }
203  
204    const content = fs.readFileSync(csvPath, 'utf-8');
205    const rows = parse(content, { columns: true });
206  
207    const kept = [];
208    const removed = [];
209    const reasons = {};
210  
211    // Load regions for place name filtering (businesses only)
212    const regions = type === 'businesses' ? loadRegionNames(countryCode) : [];
213  
214    for (const row of rows) {
215      const { keyword, search_volume } = row;
216      const sv = parseInt(search_volume) || 0;
217      const wordCount = keyword.trim().split(/\s+/).length;
218  
219      let reason = null;
220  
221      // Check junk patterns
222      if (isJunk(keyword)) {
223        reason = 'junk_pattern';
224      }
225      // Check word count
226      else if (wordCount > 3) {
227        reason = 'too_long';
228      }
229      // Check search volume (1-999 removed, 0 and ≥1000 kept)
230      else if (sv >= 1 && sv < 1000) {
231        reason = 'low_volume';
232      }
233      // Check place names in businesses
234      else if (type === 'businesses' && regions.length > 0 && containsPlaceName(keyword, regions)) {
235        reason = 'place_name';
236      }
237  
238      if (reason) {
239        removed.push({ keyword, reason, sv });
240        reasons[reason] = (reasons[reason] || 0) + 1;
241      } else {
242        kept.push(row);
243      }
244    }
245  
246    // Write filtered CSV
247    if (kept.length > 0) {
248      const newContent = stringify(kept, { header: true, columns: Object.keys(kept[0]) });
249      fs.writeFileSync(csvPath, newContent);
250    }
251  
252    return { kept: kept.length, removed: removed.length, reasons };
253  }
254  
255  /**
256   * Clean database keywords
257   */
258  function cleanDatabase() {
259    const dbPath = process.env.DATABASE_PATH || './db/sites.db';
260    const db = createDatabaseConnection(dbPath);
261  
262    try {
263      const keywords = db.prepare('SELECT id, keyword FROM keywords').all();
264      logger.info(`Checking ${keywords.length} database keywords...`);
265  
266      const toDelete = [];
267      const reasons = {};
268  
269      for (const row of keywords) {
270        const { id, keyword } = row;
271        const wordCount = keyword.trim().split(/\s+/).length;
272  
273        let reason = null;
274  
275        if (isJunk(keyword)) {
276          reason = 'junk_pattern';
277        } else if (wordCount > 3) {
278          reason = 'too_long';
279        }
280  
281        if (reason) {
282          toDelete.push(id);
283          reasons[reason] = (reasons[reason] || 0) + 1;
284        }
285      }
286  
287      if (toDelete.length > 0) {
288        const deleteStmt = db.prepare('DELETE FROM keywords WHERE id = ?');
289        const deleteMany = db.transaction(ids => {
290          for (const id of ids) {
291            deleteStmt.run(id);
292          }
293        });
294  
295        deleteMany(toDelete);
296  
297        logger.success(`✓ Deleted ${toDelete.length} database keywords`);
298        Object.entries(reasons).forEach(([reason, count]) => {
299          logger.info(`  ${reason}: ${count}`);
300        });
301  
302        const newCount = db.prepare('SELECT COUNT(*) as count FROM keywords').get();
303        logger.info(`Remaining database keywords: ${newCount.count}`);
304      } else {
305        logger.success('✓ No junk keywords in database');
306      }
307  
308      return toDelete.length;
309    } finally {
310      db.close();
311    }
312  }
313  
314  async function main() {
315    const args = process.argv.slice(2);
316    const dryRun = args.includes('--dry-run');
317  
318    if (dryRun) {
319      logger.info('DRY RUN MODE - no files will be modified\n');
320    }
321  
322    logger.info('Starting comprehensive keyword cleanup...\n');
323  
324    // Get all countries
325    const countries = fs
326      .readdirSync('./data')
327      .filter(f => fs.statSync(path.join('./data', f)).isDirectory() && f.length === 2);
328  
329    let totalKept = 0;
330    let totalRemoved = 0;
331    const allReasons = {};
332  
333    // Process Hindi files first (if they exist)
334    const hindiFiles = [
335      {
336        path: './data/in/businesses-hindi-final-filtered.csv',
337        type: 'businesses',
338        label: 'Hindi businesses',
339      },
340      { path: './data/in/regions-hindi-final-filtered.csv', type: 'regions', label: 'Hindi regions' },
341    ];
342  
343    for (const { path: filePath, type, label } of hindiFiles) {
344      if (fs.existsSync(filePath)) {
345        const result = filterCSV(filePath, type, 'in');
346        if (result.removed > 0) {
347          logger.info(`${label}: kept ${result.kept}, removed ${result.removed}`);
348          Object.entries(result.reasons).forEach(([reason, count]) => {
349            logger.info(`  ${reason}: ${count}`);
350            allReasons[reason] = (allReasons[reason] || 0) + count;
351          });
352        }
353        totalKept += result.kept;
354        totalRemoved += result.removed;
355      }
356    }
357  
358    // Process each country
359    for (const country of countries.sort()) {
360      const countryPath = `./data/${country}`;
361  
362      // Process businesses
363      const bizPath = path.join(countryPath, 'businesses-final-filtered.csv');
364      if (fs.existsSync(bizPath)) {
365        const result = filterCSV(bizPath, 'businesses', country);
366        if (result.removed > 0) {
367          logger.info(`${country} businesses: kept ${result.kept}, removed ${result.removed}`);
368          Object.entries(result.reasons).forEach(([reason, count]) => {
369            logger.info(`  ${reason}: ${count}`);
370            allReasons[reason] = (allReasons[reason] || 0) + count;
371          });
372        }
373        totalKept += result.kept;
374        totalRemoved += result.removed;
375      }
376  
377      // Process regions
378      const regPath = path.join(countryPath, 'regions-final-filtered.csv');
379      if (fs.existsSync(regPath)) {
380        const result = filterCSV(regPath, 'regions', country);
381        if (result.removed > 0) {
382          logger.info(`${country} regions: kept ${result.kept}, removed ${result.removed}`);
383          Object.entries(result.reasons).forEach(([reason, count]) => {
384            logger.info(`  ${reason}: ${count}`);
385            allReasons[reason] = (allReasons[reason] || 0) + count;
386          });
387        }
388        totalKept += result.kept;
389        totalRemoved += result.removed;
390      }
391    }
392  
393    logger.info(`\n${'='.repeat(60)}`);
394    logger.success(`CSV Cleanup Complete:`);
395    logger.info(`  Kept: ${totalKept} keywords`);
396    logger.info(`  Removed: ${totalRemoved} keywords`);
397    logger.info(`\nRemoval reasons:`);
398    Object.entries(allReasons)
399      .sort((a, b) => b[1] - a[1])
400      .forEach(([reason, count]) => {
401        logger.info(`  ${reason}: ${count}`);
402      });
403  
404    // Clean database
405    logger.info(`\n${'='.repeat(60)}`);
406    logger.info('Cleaning database...\n');
407    const dbRemoved = cleanDatabase();
408  
409    logger.info(`\n${'='.repeat(60)}`);
410    logger.success(`✓ Cleanup complete!`);
411    logger.info(`Total removed: ${totalRemoved + dbRemoved} keywords`);
412  }
413  
414  main().catch(err => {
415    logger.error(`Fatal error: ${err.message}`);
416    console.error(err);
417    process.exit(1);
418  });