/ src / cli / keywords.js
keywords.js
  1  #!/usr/bin/env node
  2  
  3  /**
  4   * Keywords Stage CLI
  5   */
  6  
  7  import '../utils/load-env.js';
  8  import {
  9    runKeywordsStage,
 10    listKeywords,
 11    addKeyword,
 12    updateKeywordPriority,
 13  } from '../stages/keywords.js';
 14  import { generateKeywordCombinations } from '../utils/keyword-manager.js';
 15  import {
 16    generateSearchVolumeCSV,
 17    filterKeywordsByVolume,
 18    analyzeSearchVolumes,
 19  } from '../utils/keyword-validator.js';
 20  import { run, getOne, withTransaction } from '../utils/db.js';
 21  import { parseFlags } from '../utils/flag-parser.js';
 22  import Logger from '../utils/logger.js';
 23  import path from 'path';
 24  import fs from 'fs';
 25  
 26  const logger = new Logger('KeywordsCLI');
 27  
 28  /**
 29   * Generate search volume CSV for keywords (requires DataForSEO API)
 30   */
 31  async function generateCsvCommand(country, keywordType, gdpPriority = false) {
 32    const { getSupportedCountries } = await import('../config/countries.js');
 33    const { getBudgetOptimizedCountries, getTopCountriesByGDP } =
 34      await import('../config/countries-gdp.js');
 35  
 36    if (!process.env.DATAFORSEO_LOGIN || !process.env.DATAFORSEO_PASSWORD) {
 37      throw new Error(
 38        'DataForSEO credentials required. Add DATAFORSEO_LOGIN and DATAFORSEO_PASSWORD to .env'
 39      );
 40    }
 41  
 42    // Determine which countries to process
 43    let targetCountries;
 44    if (country) {
 45      targetCountries = [country];
 46    } else if (gdpPriority) {
 47      const budgetOptimized = getBudgetOptimizedCountries();
 48      targetCountries =
 49        keywordType === 'businesses' ? budgetOptimized.businesses : budgetOptimized.regions;
 50      logger.info(
 51        `Using GDP-prioritized processing: ${targetCountries.length} countries for ${keywordType}`
 52      );
 53      if (keywordType === 'regions') {
 54        logger.info(`  Budget constraint: Processing top 18 countries by GDP`);
 55      }
 56    } else {
 57      targetCountries = getSupportedCountries();
 58    }
 59  
 60    logger.info(
 61      `Generating search volume CSV for ${country ? country : `${targetCountries.length} countries`} (${keywordType})...`
 62    );
 63  
 64    let totalKeywords = 0;
 65    let processedCount = 0;
 66  
 67    for (const countryCode of targetCountries) {
 68      const filename = keywordType === 'businesses' ? 'businesses.txt' : 'regions.txt';
 69      const dataDir = './data';
 70      const filePath = path.join(dataDir, countryCode.toLowerCase(), filename);
 71      const csvPath = filePath.replace('.txt', '-search-volume.csv');
 72  
 73      if (!fs.existsSync(filePath)) {
 74        logger.warn(`File not found: ${filePath}`);
 75        continue;
 76      }
 77  
 78      logger.info(`Processing ${countryCode} ${keywordType}...`);
 79  
 80      try {
 81        const stats = await generateSearchVolumeCSV(filePath, countryCode, csvPath);
 82  
 83        logger.success(`${countryCode}: ${stats.totalKeywords} keywords → ${csvPath}`);
 84        logger.info(`  Seeds: ${stats.seedCount}, Expanded: ${stats.expandedCount}`);
 85        totalKeywords += stats.totalKeywords;
 86        processedCount++;
 87      } catch (err) {
 88        logger.error(`${countryCode}: ${err.message}`);
 89        if (err.message.includes('API')) {
 90          logger.warn('Stopping due to API error');
 91          break;
 92        }
 93      }
 94    }
 95  
 96    logger.success(`Total: Generated ${totalKeywords} keywords across ${processedCount} countries`);
 97  }
 98  
 99  /**
100   * Analyze search volume distribution from CSV (no API)
101   */
102  async function analyzeCommand(csvPath) {
103    if (!csvPath) {
104      throw new Error(
105        'CSV path required: npm run keywords analyze -- --csv data/au/businesses-search-volume.csv'
106      );
107    }
108  
109    if (!fs.existsSync(csvPath)) {
110      throw new Error(`File not found: ${csvPath}`);
111    }
112  
113    logger.info(`Analyzing search volumes from ${csvPath}...`);
114  
115    const stats = await analyzeSearchVolumes(csvPath);
116  
117    console.log(`\n${'='.repeat(60)}`);
118    console.log('Search Volume Statistics');
119    console.log('='.repeat(60));
120    console.log(`Total Keywords: ${stats.total_keywords}`);
121    console.log(`Min: ${stats.statistics.min.toLocaleString()}`);
122    console.log(`Max: ${stats.statistics.max.toLocaleString()}`);
123    console.log(`Mean: ${stats.statistics.mean.toLocaleString()}`);
124    console.log(`Median (p50): ${stats.statistics.median.toLocaleString()}`);
125    console.log(`p25: ${stats.statistics.p25.toLocaleString()}`);
126    console.log(`p75: ${stats.statistics.p75.toLocaleString()}`);
127    console.log(`p90: ${stats.statistics.p90.toLocaleString()}`);
128    console.log(`p95: ${stats.statistics.p95.toLocaleString()}`);
129    console.log(`p99: ${stats.statistics.p99.toLocaleString()}`);
130  
131    console.log('\nDistribution:');
132    console.log(`  0-10K:     ${stats.distribution['0-10k']} keywords`);
133    console.log(`  10K-50K:   ${stats.distribution['10k-50k']} keywords`);
134    console.log(`  50K-100K:  ${stats.distribution['50k-100k']} keywords`);
135    console.log(`  100K-200K: ${stats.distribution['100k-200k']} keywords`);
136    console.log(`  200K-500K: ${stats.distribution['200k-500k']} keywords`);
137    console.log(`  500K+:     ${stats.distribution['500k+']} keywords`);
138  
139    console.log('\nRecommended Cutoffs:');
140    console.log(`  Conservative (p95): ${stats.recommendations.conservative.toLocaleString()}`);
141    console.log(`  Balanced (p90): ${stats.recommendations.balanced.toLocaleString()} ⭐`);
142    console.log(`  Inclusive (p75): ${stats.recommendations.inclusive.toLocaleString()}`);
143    console.log(`${'='.repeat(60)}\n`);
144  }
145  
/**
 * Apply a search volume threshold to filter keywords (no API).
 *
 * For each target country, reads `./data/<country>/<keywordType>-search-volume.csv`
 * and has `filterKeywordsByVolume` write the result to
 * `./data/<country>/<keywordType>.txt`. Countries without a CSV are skipped with
 * a warning; per-country failures are logged and do not stop the run.
 *
 * @param {string} [country] - Single country code; when omitted, all supported
 *   countries are processed.
 * @param {string} keywordType - File name stem, e.g. `'businesses'` or `'regions'`.
 * @param {number} minSearchVolume - Threshold passed to `filterKeywordsByVolume`.
 * @returns {Promise<void>}
 * @throws {Error} If `minSearchVolume` is not a finite number.
 */
async function applyCutoffCommand(country, keywordType, minSearchVolume) {
  // Reject NaN/Infinity/non-numbers before any output file can be written:
  // a bad threshold must never reach filterKeywordsByVolume.
  if (!Number.isFinite(minSearchVolume)) {
    throw new Error(
      `Invalid search volume threshold: ${minSearchVolume}. Expected a finite number, e.g. --threshold 1000`
    );
  }

  let targetCountries;
  if (country) {
    targetCountries = [country];
  } else {
    const { getSupportedCountries } = await import('../config/countries.js');
    targetCountries = getSupportedCountries();
  }

  logger.info(
    `Applying cutoff ${minSearchVolume.toLocaleString()} to ${country ? country : `all ${targetCountries.length} countries`} (${keywordType})...`
  );

  const dataDir = './data';
  let totalFiltered = 0;
  let totalRemoved = 0;
  // Only countries that were actually filtered count towards the summary.
  let processedCount = 0;

  for (const countryCode of targetCountries) {
    const countryDir = path.join(dataDir, countryCode.toLowerCase());
    const csvPath = path.join(countryDir, `${keywordType}-search-volume.csv`);
    const outputPath = path.join(countryDir, `${keywordType}.txt`);

    if (!fs.existsSync(csvPath)) {
      logger.warn(`CSV not found: ${csvPath}. Run generate-csv first.`);
      continue;
    }

    logger.info(`Processing ${countryCode}...`);

    try {
      const stats = await filterKeywordsByVolume(csvPath, minSearchVolume, outputPath);

      logger.success(
        `${countryCode}: ${stats.filteredKeywords}/${stats.totalKeywords} keywords kept (${stats.removedKeywords} removed)`
      );
      totalFiltered += stats.filteredKeywords;
      totalRemoved += stats.removedKeywords;
      processedCount++;
    } catch (err) {
      // Best effort: log and continue with the remaining countries.
      logger.error(`${countryCode}: ${err?.message ?? String(err)}`);
    }
  }

  logger.success(
    `Total: ${totalFiltered} keywords kept, ${totalRemoved} removed across ${processedCount} countries`
  );
}
193  
/** Priority used by `add` when none is given on the command line. */
const DEFAULT_KEYWORD_PRIORITY = 5;

/** Number of keywords written per database transaction by `generate`. */
const GENERATE_BATCH_SIZE = 10000;

/**
 * Parse the optional positional priority argument of the `add` command.
 *
 * @param {string|undefined} raw - Raw argv entry at the priority position.
 * @param {number} fallback - Value to use when no priority was supplied.
 * @returns {number} The parsed priority, or `fallback` when `raw` is missing
 *   or is actually a `--flag` (e.g. `add "kw" --country AU`).
 * @throws {Error} If `raw` is present, is not a flag and is not an integer.
 */
function parsePriorityArg(raw, fallback) {
  if (raw === undefined || raw.startsWith('--')) {
    return fallback;
  }
  const parsed = Number.parseInt(raw, 10);
  if (Number.isNaN(parsed)) {
    throw new Error(`Invalid priority "${raw}": expected an integer`);
  }
  return parsed;
}

/**
 * Generate keyword combinations for one or all countries and upsert them.
 *
 * @param {string} [country] - Single country code; all supported countries when omitted.
 * @param {number} [limit] - Overall cap; split evenly (rounded up) across countries.
 * @returns {Promise<void>}
 */
async function runGenerateCommand(country, limit) {
  const { getSupportedCountries } = await import('../config/countries.js');
  const { upsertKeyword } = await import('../utils/keyword-manager.js');

  const targetCountries = country ? [country] : getSupportedCountries();
  logger.info(
    `Generating keyword combinations for ${country ? country : `all ${targetCountries.length} countries`}...`
  );

  let totalInserted = 0;
  let totalSkipped = 0;
  // Countries skipped for a missing region file must not be counted in the summary.
  let processedCountries = 0;

  for (const countryCode of targetCountries) {
    let combinations;
    try {
      combinations = generateKeywordCombinations(countryCode);
    } catch (err) {
      if (err?.message?.includes('Region file')) {
        logger.warn(`Skipping ${countryCode}: ${err.message}`);
        continue;
      }
      throw err;
    }

    let countryInserted = 0;
    let countrySkipped = 0;

    const limitCount = limit ? Math.ceil(limit / targetCountries.length) : combinations.length;
    const combosToInsert = combinations.slice(0, limitCount);
    const totalBatches = Math.ceil(combosToInsert.length / GENERATE_BATCH_SIZE);

    // Process in batches to avoid overloading the PG connection
    for (let i = 0; i < combosToInsert.length; i += GENERATE_BATCH_SIZE) {
      const batch = combosToInsert.slice(i, i + GENERATE_BATCH_SIZE);
      const batchNum = Math.floor(i / GENERATE_BATCH_SIZE) + 1;

      if (totalBatches > 1) {
        logger.info(`  Processing batch ${batchNum}/${totalBatches} (${batch.length} keywords)...`);
      }

      // Use a transaction per batch for efficiency.
      // NOTE(review): on PostgreSQL a failed statement aborts the enclosing
      // transaction, so swallowing a unique violation here only helps if
      // upsertKeyword never actually raises one (e.g. uses ON CONFLICT) — confirm.
      await withTransaction(async client => {
        for (const combo of batch) {
          try {
            await upsertKeyword(client, combo.keyword, {
              country_code: combo.countryCode,
              google_domain: combo.googleDomain,
              search_volume: combo.searchVolume,
              priority: combo.priority,
            });
            countryInserted++;
          } catch (err) {
            const message = err?.message ?? '';
            const isDuplicate =
              message.includes('unique') || message.includes('duplicate') || err?.code === '23505';
            if (!isDuplicate) {
              throw err;
            }
            countrySkipped++;
          }
        }
      });
    }

    totalInserted += countryInserted;
    totalSkipped += countrySkipped;
    processedCountries++;

    if (countryInserted > 0) {
      logger.success(`${countryCode}: Generated ${countryInserted} keywords`);
    }
  }

  logger.success(
    `Total: Generated ${totalInserted} keywords across ${processedCountries} countries`
  );
  if (totalSkipped > 0) {
    logger.info(`Skipped ${totalSkipped} duplicate keywords`);
  }
}

/**
 * CLI entry point: dispatch on `process.argv[2]`.
 *
 * Commands: `list`, `add <keyword> [priority] --country CODE`, `generate`,
 * `priority <id> <priority>`, `generate-csv`, `analyze`, `apply-cutoff`.
 * Any other (or no) command runs the keywords stage. Every failure is logged
 * through the module logger and ends the process with exit code 1.
 *
 * @returns {Promise<void>}
 */
async function main() {
  try {
    const command = process.argv[2];
    // Parse flags once, inside the try so parser errors are reported like any other.
    const flags = parseFlags();
    const { country, limit } = flags;

    switch (command) {
      case 'list': {
        const keywords = await listKeywords();
        console.table(keywords);
        break;
      }
      case 'add': {
        const keyword = process.argv[3];
        if (!keyword) {
          throw new Error(
            'Keyword is required: npm run keywords add "keyword" [priority] --country CODE'
          );
        }
        if (!country) {
          throw new Error(
            'Country code is required: npm run keywords add "keyword" [priority] --country CODE'
          );
        }
        // The priority is optional, so argv[4] may already be a flag such as --country.
        const priority = parsePriorityArg(process.argv[4], DEFAULT_KEYWORD_PRIORITY);
        await addKeyword(keyword, priority, country);
        break;
      }
      case 'generate':
        await runGenerateCommand(country, limit);
        break;
      case 'priority': {
        const keywordId = Number.parseInt(process.argv[3], 10);
        const priority = Number.parseInt(process.argv[4], 10);
        if (!keywordId || !priority) {
          throw new Error('Usage: npm run keywords priority <id> <priority>');
        }
        await updateKeywordPriority(keywordId, priority);
        break;
      }
      case 'generate-csv': {
        const { type, 'gdp-priority': gdpPriority } = flags;
        await generateCsvCommand(country, type || 'businesses', gdpPriority);
        break;
      }
      case 'analyze': {
        const csvPath = flags.csv || process.argv[3];
        await analyzeCommand(csvPath);
        break;
      }
      case 'apply-cutoff': {
        const { type, threshold } = flags;
        // Only a truly absent value counts as "missing"; a numeric 0 is a valid threshold.
        if (threshold === undefined || threshold === null || threshold === '') {
          throw new Error('--threshold flag is required for apply-cutoff command');
        }
        const minSearchVolume = Number.parseInt(threshold, 10);
        if (Number.isNaN(minSearchVolume)) {
          throw new Error(`Invalid --threshold value "${threshold}": expected an integer`);
        }
        await applyCutoffCommand(country, type || 'businesses', minSearchVolume);
        break;
      }
      default:
        await runKeywordsStage({ limit, ...(country && { country }) });
    }
  } catch (err) {
    logger.error(err?.message ?? String(err));
    process.exit(1);
  }
}
335  
// Script entry point: dispatch the command given on the command line.
main();