comprehensive-keyword-cleanup.js
1 #!/usr/bin/env node 2 3 /** 4 * Comprehensive Keyword Cleanup 5 * 6 * Applies across ALL countries: 7 * 1. Remove junk patterns (near me, map, directions, translations, pronunciation, job, pdf) 8 * 2. Remove low search volume (1-999, keep 0 and ≥1000) 9 * 3. Remove >3 words 10 * 4. Remove place names from business keywords 11 * 5. Clean database 12 */ 13 14 import fs from 'fs'; 15 import path from 'path'; 16 import { parse } from 'csv-parse/sync'; 17 import { stringify } from 'csv-stringify/sync'; 18 import { createDatabaseConnection } from '../src/utils/db.js'; 19 import Logger from '../src/utils/logger.js'; 20 21 const logger = new Logger('ComprehensiveCleanup'); 22 23 // COMPREHENSIVE JUNK PATTERNS 24 const JUNK_PATTERNS = [ 25 // === LOCATION MODIFIERS === 26 // English 27 /\bnear me\b/i, 28 /\bnearby\b/i, 29 /\bclose to me\b/i, 30 /\baround me\b/i, 31 /\bin my area\b/i, 32 /\bmap\b$/i, 33 /\bmaps\b$/i, 34 /\bdirections?\b/i, 35 /\broute to\b/i, 36 /\bhow to get to\b/i, 37 38 // Hindi (Devanagari) 39 /पास में/, 40 /नजदीक/, 41 /मेरे पास/, 42 /मेरे आसपास/, 43 /के पास$/, 44 /यहाँ के पास/, 45 /रास्ता/, // route 46 /दिशा/, // direction 47 48 // French 49 /près de moi/i, 50 /à proximité/i, 51 /autour de moi/i, 52 /carte$/i, 53 /itinéraire/i, // route 54 /directions?$/i, 55 56 // German 57 /in meiner nähe/i, 58 /in der nähe/i, 59 /karte$/i, 60 /weg zu/i, // way to 61 /anfahrt/i, // directions 62 63 // Spanish 64 /cerca de mí/i, 65 /cerca de mi/i, 66 /a mi alrededor/i, 67 /mapa$/i, 68 /direcciones$/i, 69 /ruta a/i, 70 71 // Dutch 72 /bij mij in de buurt/i, 73 /in de buurt/i, 74 /kaart$/i, 75 /route naar/i, 76 77 // Italian 78 /vicino a me/i, 79 /nelle vicinanze/i, 80 /mappa$/i, 81 /indicazioni/i, 82 83 // === TRANSLATION REQUESTS === 84 /\bin english$/i, 85 /\bin hindi$/i, 86 /\bin french$/i, 87 /\bin german$/i, 88 /\bin spanish$/i, 89 /\bin chinese$/i, 90 /\bin japanese$/i, 91 /\bin korean$/i, 92 /\bmeaning in english\b/i, 93 /\btranslation\b/i, 94 /\btranslate\b/i, 95 96 // === PRONUNCIATION === 97 /\bpronunciation\b/i, 98 /\bhow to pronounce\b/i, 99 /\bhow to say\b/i, 100 101 // === DIY / EDUCATIONAL === 102 /\bpdf$/i, 103 /\bebook\b/i, 104 /\bbook\b$/i, 105 /\bguide\b$/i, 106 /\btutorial\b/i, 107 /\bcourse\b/i, 108 /\btraining\b/i, 109 /\blearn\b/i, 110 /\bhow to\b/i, 111 /\bdiy\b/i, 112 113 // === JOBS === 114 /\bjob$/i, 115 /\bjobs\b/i, 116 /\bsalary\b/i, 117 /\bwage\b/i, 118 /\bpay\b$/i, 119 /\bvacancy\b/i, 120 /\bhiring\b/i, 121 /\brecruit/i, 122 /\bcareer\b/i, 123 /\bresume\b/i, 124 /\bcv\b$/i, 125 ]; 126 127 /** 128 * Check if keyword matches any junk pattern 129 */ 130 function isJunk(keyword) { 131 for (const pattern of JUNK_PATTERNS) { 132 if (pattern.test(keyword)) { 133 return true; 134 } 135 } 136 137 // Additional checks for compound keywords (e.g., "HVAC repair weather-sengkang") 138 // These were created by keyword expansion combining business + junk regions 139 const compoundJunkPatterns = [ 140 / weather[\s-]/i, // "carpenter weather sengkang", "plumber weather-yishun" 141 /[\s-]weather($| )/i, // "carpenter sengkang weather", "plumber-weather yishun" 142 / to /i, // "X to Y" directions 143 / university/i, // "delhi university" 144 /super giants/i, // Sports teams 145 / stadium/i, // Sports venues 146 / temperature/i, // Weather queries 147 / climate/i, // Weather queries 148 ]; 149 150 for (const pattern of compoundJunkPatterns) { 151 if (pattern.test(keyword)) { 152 return true; 153 } 154 } 155 156 return false; 157 } 158 159 /** 160 * Load region names from a country's regions CSV 161 */ 162 function loadRegionNames(countryCode) { 163 try { 164 const regionsPath = path.join( 165 './data', 166 countryCode.toLowerCase(), 167 'regions-final-filtered.csv' 168 ); 169 if (!fs.existsSync(regionsPath)) { 170 return []; 171 } 172 173 const content = fs.readFileSync(regionsPath, 'utf-8'); 174 const records = parse(content, { columns: true, skip_empty_lines: true }); 175 return records.map(row => row.keyword.toLowerCase()).filter(r => r && r.trim()); 176 } catch (error) { 177 logger.warn(`Failed to load regions for ${countryCode}: ${error.message}`); 178 return []; 179 } 180 } 181 182 /** 183 * Check if business keyword contains a place name 184 */ 185 function containsPlaceName(keyword, regions) { 186 const kw = keyword.toLowerCase(); 187 for (const region of regions) { 188 const regex = new RegExp(`\\b${region.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}\\b`, 'i'); 189 if (regex.test(kw)) { 190 return true; 191 } 192 } 193 return false; 194 } 195 196 /** 197 * Filter a CSV file 198 */ 199 function filterCSV(csvPath, type, countryCode) { 200 if (!fs.existsSync(csvPath)) { 201 return { kept: 0, removed: 0, reasons: {} }; 202 } 203 204 const content = fs.readFileSync(csvPath, 'utf-8'); 205 const rows = parse(content, { columns: true }); 206 207 const kept = []; 208 const removed = []; 209 const reasons = {}; 210 211 // Load regions for place name filtering (businesses only) 212 const regions = type === 'businesses' ? loadRegionNames(countryCode) : []; 213 214 for (const row of rows) { 215 const { keyword, search_volume } = row; 216 const sv = parseInt(search_volume) || 0; 217 const wordCount = keyword.trim().split(/\s+/).length; 218 219 let reason = null; 220 221 // Check junk patterns 222 if (isJunk(keyword)) { 223 reason = 'junk_pattern'; 224 } 225 // Check word count 226 else if (wordCount > 3) { 227 reason = 'too_long'; 228 } 229 // Check search volume (1-999 removed, 0 and ≥1000 kept) 230 else if (sv >= 1 && sv < 1000) { 231 reason = 'low_volume'; 232 } 233 // Check place names in businesses 234 else if (type === 'businesses' && regions.length > 0 && containsPlaceName(keyword, regions)) { 235 reason = 'place_name'; 236 } 237 238 if (reason) { 239 removed.push({ keyword, reason, sv }); 240 reasons[reason] = (reasons[reason] || 0) + 1; 241 } else { 242 kept.push(row); 243 } 244 } 245 246 // Write filtered CSV 247 if (kept.length > 0) { 248 const newContent = stringify(kept, { header: true, columns: Object.keys(kept[0]) }); 249 fs.writeFileSync(csvPath, newContent); 250 } 251 252 return { kept: kept.length, removed: removed.length, reasons }; 253 } 254 255 /** 256 * Clean database keywords 257 */ 258 function cleanDatabase() { 259 const dbPath = process.env.DATABASE_PATH || './db/sites.db'; 260 const db = createDatabaseConnection(dbPath); 261 262 try { 263 const keywords = db.prepare('SELECT id, keyword FROM keywords').all(); 264 logger.info(`Checking ${keywords.length} database keywords...`); 265 266 const toDelete = []; 267 const reasons = {}; 268 269 for (const row of keywords) { 270 const { id, keyword } = row; 271 const wordCount = keyword.trim().split(/\s+/).length; 272 273 let reason = null; 274 275 if (isJunk(keyword)) { 276 reason = 'junk_pattern'; 277 } else if (wordCount > 3) { 278 reason = 'too_long'; 279 } 280 281 if (reason) { 282 toDelete.push(id); 283 reasons[reason] = (reasons[reason] || 0) + 1; 284 } 285 } 286 287 if (toDelete.length > 0) { 288 const deleteStmt = db.prepare('DELETE FROM keywords WHERE id = ?'); 289 const deleteMany = db.transaction(ids => { 290 for (const id of ids) { 291 deleteStmt.run(id); 292 } 293 }); 294 295 deleteMany(toDelete); 296 297 logger.success(`✓ Deleted ${toDelete.length} database keywords`); 298 Object.entries(reasons).forEach(([reason, count]) => { 299 logger.info(` ${reason}: ${count}`); 300 }); 301 302 const newCount = db.prepare('SELECT COUNT(*) as count FROM keywords').get(); 303 logger.info(`Remaining database keywords: ${newCount.count}`); 304 } else { 305 logger.success('✓ No junk keywords in database'); 306 } 307 308 return toDelete.length; 309 } finally { 310 db.close(); 311 } 312 } 313 314 async function main() { 315 const args = process.argv.slice(2); 316 const dryRun = args.includes('--dry-run'); 317 318 if (dryRun) { 319 logger.info('DRY RUN MODE - no files will be modified\n'); 320 } 321 322 logger.info('Starting comprehensive keyword cleanup...\n'); 323 324 // Get all countries 325 const countries = fs 326 .readdirSync('./data') 327 .filter(f => fs.statSync(path.join('./data', f)).isDirectory() && f.length === 2); 328 329 let totalKept = 0; 330 let totalRemoved = 0; 331 const allReasons = {}; 332 333 // Process Hindi files first (if they exist) 334 const hindiFiles = [ 335 { 336 path: './data/in/businesses-hindi-final-filtered.csv', 337 type: 'businesses', 338 label: 'Hindi businesses', 339 }, 340 { path: './data/in/regions-hindi-final-filtered.csv', type: 'regions', label: 'Hindi regions' }, 341 ]; 342 343 for (const { path: filePath, type, label } of hindiFiles) { 344 if (fs.existsSync(filePath)) { 345 const result = filterCSV(filePath, type, 'in'); 346 if (result.removed > 0) { 347 logger.info(`${label}: kept ${result.kept}, removed ${result.removed}`); 348 Object.entries(result.reasons).forEach(([reason, count]) => { 349 logger.info(` ${reason}: ${count}`); 350 allReasons[reason] = (allReasons[reason] || 0) + count; 351 }); 352 } 353 totalKept += result.kept; 354 totalRemoved += result.removed; 355 } 356 } 357 358 // Process each country 359 for (const country of countries.sort()) { 360 const countryPath = `./data/${country}`; 361 362 // Process businesses 363 const bizPath = path.join(countryPath, 'businesses-final-filtered.csv'); 364 if (fs.existsSync(bizPath)) { 365 const result = filterCSV(bizPath, 'businesses', country); 366 if (result.removed > 0) { 367 logger.info(`${country} businesses: kept ${result.kept}, removed ${result.removed}`); 368 Object.entries(result.reasons).forEach(([reason, count]) => { 369 logger.info(` ${reason}: ${count}`); 370 allReasons[reason] = (allReasons[reason] || 0) + count; 371 }); 372 } 373 totalKept += result.kept; 374 totalRemoved += result.removed; 375 } 376 377 // Process regions 378 const regPath = path.join(countryPath, 'regions-final-filtered.csv'); 379 if (fs.existsSync(regPath)) { 380 const result = filterCSV(regPath, 'regions', country); 381 if (result.removed > 0) { 382 logger.info(`${country} regions: kept ${result.kept}, removed ${result.removed}`); 383 Object.entries(result.reasons).forEach(([reason, count]) => { 384 logger.info(` ${reason}: ${count}`); 385 allReasons[reason] = (allReasons[reason] || 0) + count; 386 }); 387 } 388 totalKept += result.kept; 389 totalRemoved += result.removed; 390 } 391 } 392 393 logger.info(`\n${'='.repeat(60)}`); 394 logger.success(`CSV Cleanup Complete:`); 395 logger.info(` Kept: ${totalKept} keywords`); 396 logger.info(` Removed: ${totalRemoved} keywords`); 397 logger.info(`\nRemoval reasons:`); 398 Object.entries(allReasons) 399 .sort((a, b) => b[1] - a[1]) 400 .forEach(([reason, count]) => { 401 logger.info(` ${reason}: ${count}`); 402 }); 403 404 // Clean database 405 logger.info(`\n${'='.repeat(60)}`); 406 logger.info('Cleaning database...\n'); 407 const dbRemoved = cleanDatabase(); 408 409 logger.info(`\n${'='.repeat(60)}`); 410 logger.success(`✓ Cleanup complete!`); 411 logger.info(`Total removed: ${totalRemoved + dbRemoved} keywords`); 412 } 413 414 main().catch(err => { 415 logger.error(`Fatal error: ${err.message}`); 416 console.error(err); 417 process.exit(1); 418 });