keywords.js
#!/usr/bin/env node

/**
 * Keywords Stage CLI
 *
 * Dispatches on the first positional argument (process.argv[2]):
 *   list          - print all keywords as a table
 *   add           - add one keyword: add "keyword" [priority] --country CODE
 *   generate      - insert generated keyword combinations for one/all countries
 *   priority      - update a keyword's priority: priority <id> <priority>
 *   generate-csv  - build search-volume CSVs (requires DataForSEO credentials)
 *   analyze       - print search-volume statistics for a CSV
 *   apply-cutoff  - filter keyword files by a minimum search volume
 *   (anything else) runs the keywords stage itself
 */

import '../utils/load-env.js';
import {
  runKeywordsStage,
  listKeywords,
  addKeyword,
  updateKeywordPriority,
} from '../stages/keywords.js';
import { generateKeywordCombinations } from '../utils/keyword-manager.js';
import {
  generateSearchVolumeCSV,
  filterKeywordsByVolume,
  analyzeSearchVolumes,
} from '../utils/keyword-validator.js';
import { run, getOne, withTransaction } from '../utils/db.js';
import { parseFlags } from '../utils/flag-parser.js';
import Logger from '../utils/logger.js';
import path from 'path';
import fs from 'fs';

const logger = new Logger('KeywordsCLI');

/** Root directory holding one sub-directory per (lower-cased) country code. */
const DATA_DIR = './data';

/** Keyword type used when the --type flag is absent. */
const DEFAULT_KEYWORD_TYPE = 'businesses';

/** Priority used by `add` when no priority argument is given. */
const DEFAULT_KEYWORD_PRIORITY = 5;

/** Number of keyword upserts grouped into a single transaction by `generate`. */
const INSERT_BATCH_SIZE = 10000;

/**
 * Parse the optional positional priority argument of the `add` command.
 *
 * The documented usage is `add "keyword" [priority] --country CODE`, so when
 * the priority is omitted the next argv entry is a `--flag`; that case (and a
 * missing/empty argument) yields the fallback instead of NaN.
 *
 * @param {string|undefined} raw - Raw argv entry at the priority position.
 * @param {number} fallback - Value returned when no priority was supplied.
 * @returns {number} The parsed integer priority, or `fallback`.
 * @throws {Error} If `raw` is present, is not a flag, and is not an integer.
 */
function parsePriorityArg(raw, fallback) {
  if (raw === undefined || raw === '' || raw.startsWith('--')) {
    return fallback;
  }
  const value = Number.parseInt(raw, 10);
  if (Number.isNaN(value)) {
    throw new Error(`Invalid priority "${raw}": expected an integer`);
  }
  return value;
}

/**
 * Tell whether an error raised while upserting a keyword looks like a
 * duplicate-key failure (message mentions "unique"/"duplicate", or the error
 * code is '23505').
 *
 * @param {Error & {code?: string}} err - Error thrown by the upsert.
 * @returns {boolean} True when the error should be counted as a skip.
 */
function isDuplicateKeyError(err) {
  return (
    err.message.includes('unique') || err.message.includes('duplicate') || err.code === '23505'
  );
}

/**
 * Generate search volume CSV for keywords (requires DataForSEO API)
 *
 * For each target country, reads `<DATA_DIR>/<country>/<type>.txt` and writes
 * `<DATA_DIR>/<country>/<type>-search-volume.csv` via generateSearchVolumeCSV.
 * Countries whose keyword file is missing are skipped with a warning. Any
 * keyword type other than 'businesses' is treated as 'regions' for file names.
 *
 * @param {string|undefined} country - Single country code; when falsy, all
 *   supported (or GDP-prioritised) countries are processed.
 * @param {string} keywordType - 'businesses' or 'regions'.
 * @param {boolean} [gdpPriority=false] - Use the budget-optimised country
 *   list when no explicit country is given.
 * @returns {Promise<void>}
 * @throws {Error} If the DataForSEO credentials are not in the environment.
 */
async function generateCsvCommand(country, keywordType, gdpPriority = false) {
  const { getSupportedCountries } = await import('../config/countries.js');
  const { getBudgetOptimizedCountries } = await import('../config/countries-gdp.js');

  if (!process.env.DATAFORSEO_LOGIN || !process.env.DATAFORSEO_PASSWORD) {
    throw new Error(
      'DataForSEO credentials required. Add DATAFORSEO_LOGIN and DATAFORSEO_PASSWORD to .env'
    );
  }

  // Determine which countries to process
  let targetCountries;
  if (country) {
    targetCountries = [country];
  } else if (gdpPriority) {
    const budgetOptimized = getBudgetOptimizedCountries();
    targetCountries =
      keywordType === 'businesses' ? budgetOptimized.businesses : budgetOptimized.regions;
    logger.info(
      `Using GDP-prioritized processing: ${targetCountries.length} countries for ${keywordType}`
    );
    if (keywordType === 'regions') {
      logger.info(` Budget constraint: Processing top 18 countries by GDP`);
    }
  } else {
    targetCountries = getSupportedCountries();
  }

  logger.info(
    `Generating search volume CSV for ${country ? country : `${targetCountries.length} countries`} (${keywordType})...`
  );

  // The file base name does not depend on the country, so resolve it once.
  const baseName = keywordType === 'businesses' ? 'businesses' : 'regions';

  let totalKeywords = 0;
  let processedCount = 0;

  for (const countryCode of targetCountries) {
    const countryDir = path.join(DATA_DIR, countryCode.toLowerCase());
    const filePath = path.join(countryDir, `${baseName}.txt`);
    const csvPath = path.join(countryDir, `${baseName}-search-volume.csv`);

    if (!fs.existsSync(filePath)) {
      logger.warn(`File not found: ${filePath}`);
      continue;
    }

    logger.info(`Processing ${countryCode} ${keywordType}...`);

    try {
      const stats = await generateSearchVolumeCSV(filePath, countryCode, csvPath);

      logger.success(`${countryCode}: ${stats.totalKeywords} keywords → ${csvPath}`);
      logger.info(` Seeds: ${stats.seedCount}, Expanded: ${stats.expandedCount}`);
      totalKeywords += stats.totalKeywords;
      processedCount++;
    } catch (err) {
      logger.error(`${countryCode}: ${err.message}`);
      // An error mentioning the API aborts the whole run; other errors only
      // skip the current country.
      if (err.message.includes('API')) {
        logger.warn('Stopping due to API error');
        break;
      }
    }
  }

  logger.success(`Total: Generated ${totalKeywords} keywords across ${processedCount} countries`);
}

/**
 * Analyze search volume distribution from CSV (no API)
 *
 * Prints summary statistics, a bucketed distribution and recommended cutoffs
 * as returned by analyzeSearchVolumes.
 *
 * @param {string|undefined} csvPath - Path to a search-volume CSV file.
 * @returns {Promise<void>}
 * @throws {Error} If no path is given or the file does not exist.
 */
async function analyzeCommand(csvPath) {
  if (!csvPath) {
    throw new Error(
      'CSV path required: npm run keywords analyze -- --csv data/au/businesses-search-volume.csv'
    );
  }

  if (!fs.existsSync(csvPath)) {
    throw new Error(`File not found: ${csvPath}`);
  }

  logger.info(`Analyzing search volumes from ${csvPath}...`);

  const stats = await analyzeSearchVolumes(csvPath);
  const { statistics, distribution, recommendations } = stats;
  const rule = '='.repeat(60);

  console.log(`\n${rule}`);
  console.log('Search Volume Statistics');
  console.log(rule);
  console.log(`Total Keywords: ${stats.total_keywords}`);
  console.log(`Min: ${statistics.min.toLocaleString()}`);
  console.log(`Max: ${statistics.max.toLocaleString()}`);
  console.log(`Mean: ${statistics.mean.toLocaleString()}`);
  console.log(`Median (p50): ${statistics.median.toLocaleString()}`);
  console.log(`p25: ${statistics.p25.toLocaleString()}`);
  console.log(`p75: ${statistics.p75.toLocaleString()}`);
  console.log(`p90: ${statistics.p90.toLocaleString()}`);
  console.log(`p95: ${statistics.p95.toLocaleString()}`);
  console.log(`p99: ${statistics.p99.toLocaleString()}`);

  console.log('\nDistribution:');
  console.log(` 0-10K: ${distribution['0-10k']} keywords`);
  console.log(` 10K-50K: ${distribution['10k-50k']} keywords`);
  console.log(` 50K-100K: ${distribution['50k-100k']} keywords`);
  console.log(` 100K-200K: ${distribution['100k-200k']} keywords`);
  console.log(` 200K-500K: ${distribution['200k-500k']} keywords`);
  console.log(` 500K+: ${distribution['500k+']} keywords`);

  console.log('\nRecommended Cutoffs:');
  console.log(` Conservative (p95): ${recommendations.conservative.toLocaleString()}`);
  console.log(` Balanced (p90): ${recommendations.balanced.toLocaleString()} ⭐`);
  console.log(` Inclusive (p75): ${recommendations.inclusive.toLocaleString()}`);
  console.log(`${rule}\n`);
}

/**
 * Apply search volume threshold to filter keywords (no API)
 *
 * For each target country, passes `<type>-search-volume.csv` and the
 * threshold to filterKeywordsByVolume with `<type>.txt` as the output path.
 * Countries without a CSV are skipped with a warning; a failure in one
 * country is logged and does not stop the others.
 *
 * @param {string|undefined} country - Single country code; when falsy, all
 *   supported countries are processed.
 * @param {string} keywordType - Keyword type used to build the file names.
 * @param {number} minSearchVolume - Minimum search volume passed to the filter.
 * @returns {Promise<void>}
 * @throws {Error} If `minSearchVolume` is not a number (including NaN).
 */
async function applyCutoffCommand(country, keywordType, minSearchVolume) {
  // The output path below is the keyword list itself, so refuse to run with a
  // threshold that is not a real number rather than risk rewriting the list
  // from a meaningless comparison.
  if (typeof minSearchVolume !== 'number' || Number.isNaN(minSearchVolume)) {
    throw new Error(`Invalid search volume threshold: ${minSearchVolume}`);
  }

  const { getSupportedCountries } = await import('../config/countries.js');

  const targetCountries = country ? [country] : getSupportedCountries();
  logger.info(
    `Applying cutoff ${minSearchVolume.toLocaleString()} to ${country ? country : `all ${targetCountries.length} countries`} (${keywordType})...`
  );

  let totalFiltered = 0;
  let totalRemoved = 0;
  let processedCount = 0;

  for (const countryCode of targetCountries) {
    const countryDir = path.join(DATA_DIR, countryCode.toLowerCase());
    const csvPath = path.join(countryDir, `${keywordType}-search-volume.csv`);
    const outputPath = path.join(countryDir, `${keywordType}.txt`);

    if (!fs.existsSync(csvPath)) {
      logger.warn(`CSV not found: ${csvPath}. Run generate-csv first.`);
      continue;
    }

    logger.info(`Processing ${countryCode}...`);

    try {
      const stats = await filterKeywordsByVolume(csvPath, minSearchVolume, outputPath);

      logger.success(
        `${countryCode}: ${stats.filteredKeywords}/${stats.totalKeywords} keywords kept (${stats.removedKeywords} removed)`
      );
      totalFiltered += stats.filteredKeywords;
      totalRemoved += stats.removedKeywords;
      processedCount++;
    } catch (err) {
      logger.error(`${countryCode}: ${err.message}`);
    }
  }

  // Report only the countries that were actually filtered, not the ones that
  // were skipped or failed.
  logger.success(
    `Total: ${totalFiltered} keywords kept, ${totalRemoved} removed across ${processedCount} countries`
  );
}

/**
 * Generate keyword combinations for one or all countries and upsert them.
 *
 * Combinations come from generateKeywordCombinations; each country's list is
 * truncated to an even share of `limit` (when given) and written in batches
 * of INSERT_BATCH_SIZE, one withTransaction call per batch.
 *
 * @param {string|undefined} country - Single country code; when falsy, all
 *   supported countries are processed.
 * @param {number|undefined} limit - Optional overall cap, split evenly
 *   (rounded up) across the target countries.
 * @returns {Promise<void>}
 * @throws {Error} Any error from combination generation that does not mention
 *   'Region file', and any non-duplicate upsert error.
 */
async function generateCommand(country, limit) {
  const { getSupportedCountries } = await import('../config/countries.js');
  const { upsertKeyword } = await import('../utils/keyword-manager.js');

  const targetCountries = country ? [country] : getSupportedCountries();
  logger.info(
    `Generating keyword combinations for ${country ? country : `all ${targetCountries.length} countries`}...`
  );

  let totalInserted = 0;
  let totalSkipped = 0;

  for (const countryCode of targetCountries) {
    let combinations;
    try {
      combinations = generateKeywordCombinations(countryCode);
    } catch (err) {
      // A missing region file only disqualifies this country.
      if (err.message.includes('Region file')) {
        logger.warn(`Skipping ${countryCode}: ${err.message}`);
        continue;
      }
      throw err;
    }

    let countryInserted = 0;
    let countrySkipped = 0;

    const limitCount = limit ? Math.ceil(limit / targetCountries.length) : combinations.length;

    const combosToInsert = combinations.slice(0, limitCount);
    const totalBatches = Math.ceil(combosToInsert.length / INSERT_BATCH_SIZE);

    // Process in batches to avoid overloading the PG connection
    for (let i = 0; i < combosToInsert.length; i += INSERT_BATCH_SIZE) {
      const batch = combosToInsert.slice(i, i + INSERT_BATCH_SIZE);
      const batchNum = Math.floor(i / INSERT_BATCH_SIZE) + 1;

      if (totalBatches > 1) {
        logger.info(` Processing batch ${batchNum}/${totalBatches} (${batch.length} keywords)...`);
      }

      // Use a transaction per batch for efficiency
      // NOTE(review): if withTransaction wraps a single PostgreSQL
      // transaction, a statement that fails with a unique violation aborts
      // that transaction and later statements in the batch would fail too.
      // Confirm upsertKeyword cannot raise one (e.g. ON CONFLICT) or that
      // savepoints are used.
      await withTransaction(async client => {
        for (const combo of batch) {
          try {
            await upsertKeyword(client, combo.keyword, {
              country_code: combo.countryCode,
              google_domain: combo.googleDomain,
              search_volume: combo.searchVolume,
              priority: combo.priority,
            });
            countryInserted++;
          } catch (err) {
            if (!isDuplicateKeyError(err)) {
              throw err;
            }
            countrySkipped++;
          }
        }
      });
    }

    totalInserted += countryInserted;
    totalSkipped += countrySkipped;

    if (countryInserted > 0) {
      logger.success(`${countryCode}: Generated ${countryInserted} keywords`);
    }
  }

  logger.success(
    `Total: Generated ${totalInserted} keywords across ${targetCountries.length} countries`
  );
  if (totalSkipped > 0) {
    logger.info(`Skipped ${totalSkipped} duplicate keywords`);
  }
}

/**
 * CLI entry point: reads the command from process.argv[2] and flags from
 * parseFlags(), runs the matching command, and on any error logs its message
 * and exits with status 1.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const command = process.argv[2];
  const flags = parseFlags();
  const { country, limit } = flags;

  try {
    switch (command) {
      case 'list': {
        const keywords = await listKeywords();
        console.table(keywords);
        break;
      }
      case 'add': {
        const keyword = process.argv[3];
        if (!keyword) {
          throw new Error(
            'Keyword is required: npm run keywords add "keyword" [priority] --country CODE'
          );
        }
        if (!country) {
          throw new Error(
            'Country code is required: npm run keywords add "keyword" [priority] --country CODE'
          );
        }
        // argv[4] is the priority only when one was typed; otherwise it is
        // usually the --country flag, which must not be parsed as a number.
        const priority = parsePriorityArg(process.argv[4], DEFAULT_KEYWORD_PRIORITY);
        await addKeyword(keyword, priority, country);
        break;
      }
      case 'generate':
        await generateCommand(country, limit);
        break;
      case 'priority': {
        const keywordId = parseInt(process.argv[3], 10);
        const priority = parseInt(process.argv[4], 10);
        // Rejects missing/non-numeric values; note this also rejects 0.
        if (!keywordId || !priority) {
          throw new Error('Usage: npm run keywords priority <id> <priority>');
        }
        await updateKeywordPriority(keywordId, priority);
        break;
      }
      case 'generate-csv': {
        const keywordType = flags.type || DEFAULT_KEYWORD_TYPE;
        await generateCsvCommand(country, keywordType, flags['gdp-priority']);
        break;
      }
      case 'analyze': {
        // The CSV path may be given as --csv or as the first positional arg.
        const csvPath = flags.csv || process.argv[3];
        await analyzeCommand(csvPath);
        break;
      }
      case 'apply-cutoff': {
        const { threshold } = flags;
        const keywordType = flags.type || DEFAULT_KEYWORD_TYPE;
        const minSearchVolume = Number.parseInt(threshold, 10);
        if (Number.isNaN(minSearchVolume)) {
          // Distinguish "flag missing" from "flag present but not a number".
          if (threshold) {
            throw new Error(`Invalid --threshold value: ${threshold} (expected an integer)`);
          }
          throw new Error('--threshold flag is required for apply-cutoff command');
        }
        await applyCutoffCommand(country, keywordType, minSearchVolume);
        break;
      }
      default:
        await runKeywordsStage({ limit, ...(country && { country }) });
    }
  } catch (err) {
    logger.error(err.message);
    process.exit(1);
  }
}

main();