// scripts/backfill-real-search-volumes.js
  1  #!/usr/bin/env node
  2  
  3  /**
  4   * Backfill Real Search Volumes from DataForSEO
  5   *
  6   * Fetches actual Google Ads search volumes for combined keywords
  7   * (e.g., "plumber sydney") and replaces geometric mean estimates
  8   * with real data for accurate prioritization.
  9   *
 10   * Usage:
 11   *   node scripts/backfill-real-search-volumes.js [options]
 12   *
 13   * Options:
 14   *   --test          Test mode - only process 100 keywords
 15   *   --limit N       Limit to N keywords (for testing)
 16   *   --country CC    Only process specific country (e.g., US, AU)
 17   *   --dry-run       Show what would be done without making changes
 18   */
 19  
 20  import 'dotenv/config';
 21  import { createDatabaseConnection } from '../src/utils/db.js';
 22  import Logger from '../src/utils/logger.js';
 23  import fs from 'fs';
 24  import path from 'path';
 25  
// Shared logger, tagged so backfill output is identifiable in combined logs
const logger = new Logger('BackfillSV');

// Root directory where raw DataForSEO API responses are archived
// (one subdirectory per country code; written by saveResponseBackup)
const BACKUP_DIR = './backups/dataforseo-responses';
 30  
 31  // DataForSEO location codes for each country
 32  const LOCATION_CODES = {
 33    US: 2840, // United States
 34    CN: 2156, // China
 35    JP: 2392, // Japan
 36    DE: 2276, // Germany
 37    IN: 2356, // India
 38    UK: 2826, // United Kingdom
 39    FR: 2250, // France
 40    IT: 2380, // Italy
 41    CA: 2124, // Canada
 42    MX: 2484, // Mexico
 43    KR: 2410, // South Korea
 44    AU: 2036, // Australia
 45    ES: 2724, // Spain
 46    ID: 2360, // Indonesia
 47    NL: 2528, // Netherlands
 48    CH: 2756, // Switzerland
 49    PL: 2616, // Poland
 50    SE: 2752, // Sweden
 51    BE: 2056, // Belgium
 52    NO: 2578, // Norway
 53    IE: 2372, // Ireland
 54    AT: 2040, // Austria
 55    SG: 2702, // Singapore
 56    DK: 2208, // Denmark
 57    NZ: 2554, // New Zealand
 58  };
 59  
 60  // Parse command line arguments
 61  const args = process.argv.slice(2);
 62  const testMode = args.includes('--test');
 63  const dryRun = args.includes('--dry-run');
 64  const limitIndex = args.indexOf('--limit');
 65  const limit = limitIndex >= 0 ? parseInt(args[limitIndex + 1]) : null;
 66  const countryIndex = args.indexOf('--country');
 67  const countryFilter = countryIndex >= 0 ? args[countryIndex + 1].toUpperCase() : null;
 68  
 69  // Apply limit
 70  const effectiveLimit = testMode ? 100 : limit;
 71  
 72  /**
 73   * Chunk array into batches
 74   */
 75  function chunk(array, size) {
 76    const chunks = [];
 77    for (let i = 0; i < array.length; i += size) {
 78      chunks.push(array.slice(i, i + size));
 79    }
 80    return chunks;
 81  }
 82  
 83  /**
 84   * Save API response backup
 85   */
 86  function saveResponseBackup(countryCode, batchNum, responseData, keywords) {
 87    const timestamp = new Date().toISOString().replace(/:/g, '-').split('.')[0];
 88    const countryDir = path.join(BACKUP_DIR, countryCode);
 89  
 90    // Ensure directory exists
 91    if (!fs.existsSync(countryDir)) {
 92      fs.mkdirSync(countryDir, { recursive: true });
 93    }
 94  
 95    const filename = `${timestamp}-batch-${batchNum}.json`;
 96    const filepath = path.join(countryDir, filename);
 97  
 98    const backup = {
 99      timestamp: new Date().toISOString(),
100      country_code: countryCode,
101      batch_number: batchNum,
102      keyword_count: keywords.length,
103      keywords,
104      response: responseData,
105    };
106  
107    fs.writeFileSync(filepath, JSON.stringify(backup, null, 2));
108    logger.debug(`  Saved backup to ${filepath}`);
109  }
110  
/**
 * Call DataForSEO Google Ads Search Volume API (Live mode).
 *
 * @param {string[]} keywords - Keywords for one batch (API max 1,000 per call)
 * @param {number} locationCode - DataForSEO location code (see LOCATION_CODES)
 * @param {string} countryCode - ISO country code, used for backup file naming
 * @param {number} batchNum - 1-based batch index, used for backup file naming
 * @param {string} [languageCode='en'] - DataForSEO language code
 * @returns {Promise<Array<{keyword: string, search_volume: number, competition: number, cpc: number}>>}
 * @throws {Error} on missing credentials, HTTP failure, or API/task-level errors
 */
async function fetchSearchVolumes(
  keywords,
  locationCode,
  countryCode,
  batchNum,
  languageCode = 'en'
) {
  const url = 'https://api.dataforseo.com/v3/keywords_data/google_ads/search_volume/live';

  // Fail fast with a clear message instead of silently sending
  // "undefined:undefined" as the Basic auth credentials
  const { DATAFORSEO_LOGIN, DATAFORSEO_PASSWORD } = process.env;
  if (!DATAFORSEO_LOGIN || !DATAFORSEO_PASSWORD) {
    throw new Error('DATAFORSEO_LOGIN and DATAFORSEO_PASSWORD environment variables are required');
  }

  const auth = Buffer.from(`${DATAFORSEO_LOGIN}:${DATAFORSEO_PASSWORD}`).toString('base64');

  const payload = [
    {
      location_code: locationCode,
      language_code: languageCode,
      keywords,
    },
  ];

  logger.debug(
    `  API call: ${keywords.length} keywords, location ${locationCode}, language ${languageCode}`
  );

  const response = await fetch(url, {
    method: 'POST',
    headers: {
      Authorization: `Basic ${auth}`,
      'Content-Type': 'application/json',
    },
    body: JSON.stringify(payload),
  });

  if (!response.ok) {
    const errorText = await response.text();
    throw new Error(
      `DataForSEO API error: ${response.status} ${response.statusText} - ${errorText}`
    );
  }

  const data = await response.json();

  if (data.status_code !== 20000) {
    throw new Error(`DataForSEO API error: ${data.status_message}`);
  }

  // Save raw API response as backup BEFORE any task-level validation,
  // so paid responses are never lost
  saveResponseBackup(countryCode, batchNum, data, keywords);

  // The top-level status can be 20000 while the individual task failed;
  // previously that was swallowed and silently produced an empty result set
  const task = data.tasks?.[0];
  if (task && task.status_code !== 20000) {
    throw new Error(`DataForSEO task error: ${task.status_code} ${task.status_message}`);
  }

  const results = task?.result || [];

  logger.debug(`  API returned ${results.length} results`);

  // Normalize missing metrics to 0 for downstream arithmetic
  return results.map(r => ({
    keyword: r.keyword,
    search_volume: r.search_volume || 0,
    competition: r.competition || 0,
    cpc: r.cpc || 0,
  }));
}
175  
/**
 * Persist fetched search volumes into the keywords table.
 *
 * All rows for the batch are applied inside one transaction so the write is
 * both fast and atomic.
 *
 * @param {object} db - Database connection exposing prepare()/transaction()
 * @param {Array<{keyword: string, search_volume: number}>} results - API results
 * @param {string} countryCode - Country the batch belongs to
 */
function updateKeywords(db, results, countryCode) {
  const updateStmt = db.prepare(`
    UPDATE keywords
    SET search_volume = ?, updated_at = CURRENT_TIMESTAMP
    WHERE keyword = ? AND country_code = ?
  `);

  const applyAll = db.transaction(rows => {
    rows.forEach(row => updateStmt.run(row.search_volume, row.keyword, countryCode));
  });

  applyAll(results);
  logger.success(`  Updated ${results.length} keywords in database`);
}
195  
/**
 * Recalculate priorities based on percentile ranking.
 *
 * Keywords with SV=0 get priority 0 (no search demand); keywords with SV>0
 * get priorities 1-10 based on descending search-volume percentile.
 *
 * Rows whose search_volume is still NULL (never fetched) are excluded in SQL.
 * Previously they were selected but matched neither the ===0 nor >0 filter,
 * so they were silently dropped and the logged counts did not add up.
 *
 * @param {object} db - Database connection exposing prepare()/transaction()
 */
function recalculatePriorities(db) {
  logger.info('Recalculating priorities based on search volumes...');

  // Only rank keywords that actually have a fetched volume
  const allKeywords = db
    .prepare(
      `
    SELECT id, search_volume
    FROM keywords
    WHERE search_volume IS NOT NULL
    ORDER BY search_volume DESC
  `
    )
    .all();

  if (allKeywords.length === 0) {
    logger.warn('No keywords to recalculate');
    return;
  }

  // Split into zero and non-zero search volumes
  const zeroSV = allKeywords.filter(kw => kw.search_volume === 0);
  const positiveSV = allKeywords.filter(kw => kw.search_volume > 0);

  logger.info(`  Keywords: ${positiveSV.length} with SV>0, ${zeroSV.length} with SV=0`);

  // Single transaction for all priority writes
  const stmt = db.prepare('UPDATE keywords SET priority = ? WHERE id = ?');
  const updateMany = db.transaction(updates => {
    for (const update of updates) {
      stmt.run(update.priority, update.id);
    }
  });

  const updates = [];

  // Priority 0 for all zero SV keywords
  for (const kw of zeroSV) {
    updates.push({ id: kw.id, priority: 0 });
  }

  // Priorities 1-10 for positive SV keywords based on percentile.
  // NOTE(review): equal-volume keywords straddling a boundary may receive
  // different priorities; acceptable for coarse prioritization.
  for (let i = 0; i < positiveSV.length; i++) {
    const kw = positiveSV[i];
    // i=0 is the highest-volume keyword -> percentile 1.0 -> priority 10
    const percentile = (positiveSV.length - i) / positiveSV.length;
    let priority;
    if (percentile >= 0.95) priority = 10;
    else if (percentile >= 0.9) priority = 9;
    else if (percentile >= 0.8) priority = 8;
    else if (percentile >= 0.7) priority = 7;
    else if (percentile >= 0.6) priority = 6;
    else if (percentile >= 0.5) priority = 5;
    else if (percentile >= 0.4) priority = 4;
    else if (percentile >= 0.3) priority = 3;
    else if (percentile >= 0.2) priority = 2;
    else priority = 1;

    updates.push({ id: kw.id, priority });
  }

  updateMany(updates);
  logger.success(
    `Recalculated priorities: ${positiveSV.length} active (1-10), ${zeroSV.length} inactive (0)`
  );
}
265  
/**
 * Persist progress so an interrupted run can be diagnosed/resumed.
 *
 * @param {number} processed - Count of keywords processed so far
 * @param {string} [filename='.backfill-checkpoint.json'] - Checkpoint path
 */
function saveCheckpoint(processed, filename = '.backfill-checkpoint.json') {
  const checkpoint = { processed, timestamp: new Date().toISOString() };
  fs.writeFileSync(filename, JSON.stringify(checkpoint));
}
272  
/**
 * Read a previous checkpoint, if one exists.
 *
 * @param {string} [filename='.backfill-checkpoint.json'] - Checkpoint path
 * @returns {number} Previously processed keyword count, or 0 when there is
 *   no checkpoint or it cannot be read/parsed
 */
function loadCheckpoint(filename = '.backfill-checkpoint.json') {
  try {
    if (!fs.existsSync(filename)) {
      return 0;
    }
    const checkpoint = JSON.parse(fs.readFileSync(filename, 'utf-8'));
    logger.info(
      `Resuming from checkpoint: ${checkpoint.processed} keywords processed at ${checkpoint.timestamp}`
    );
    return checkpoint.processed;
  } catch (error) {
    // A corrupt/unreadable checkpoint is non-fatal; start from scratch
    logger.warn(`Failed to load checkpoint: ${error.message}`);
    return 0;
  }
}
290  
/**
 * Main backfill process.
 *
 * Workflow:
 *   1. Select keywords whose search_volume is NULL (optionally filtered by
 *      country and/or capped by --limit / --test).
 *   2. Group by country and batch into 1,000-keyword API calls.
 *   3. Fetch real volumes, back up each raw response, update the database.
 *   4. Recalculate 0-10 priorities from the refreshed volumes.
 *
 * On any batch failure a checkpoint is written and the process exits 1.
 *
 * NOTE(review): loadCheckpoint() is defined but never invoked here; resume
 * works implicitly because processed rows no longer have NULL search_volume.
 * Confirm whether explicit resume was intended.
 */
async function main() {
  logger.info('DataForSEO Search Volume Backfill');
  logger.info('='.repeat(60));

  if (testMode) logger.warn('⚠️  TEST MODE: Processing only 100 keywords');
  if (dryRun) logger.warn('⚠️  DRY RUN: No changes will be made');
  if (effectiveLimit) logger.warn(`⚠️  LIMIT: Processing only ${effectiveLimit} keywords`);
  if (countryFilter) logger.info(`Filtering to country: ${countryFilter}`);

  // Open database
  const dbPath = process.env.DATABASE_PATH || './db/sites.db';
  const db = createDatabaseConnection(dbPath);

  // Get keywords that need search volumes (only NULL, not 0).
  // CLI-derived values are bound as parameters rather than interpolated into
  // the SQL string, eliminating an injection vector.
  let query = `
    SELECT id, keyword, country_code, search_volume
    FROM keywords
    WHERE search_volume IS NULL
  `;
  const params = [];

  if (countryFilter) {
    query += ' AND country_code = ?';
    params.push(countryFilter);
  }

  query += ' ORDER BY country_code, id';

  if (effectiveLimit) {
    query += ' LIMIT ?';
    params.push(effectiveLimit);
  }

  const allKeywords = db.prepare(query).all(...params);

  logger.info(`Found ${allKeywords.length} keywords to process`);

  if (allKeywords.length === 0) {
    logger.warn('No keywords found');
    db.close();
    return;
  }

  // Group by country (one API location code per country)
  const byCountry = {};
  for (const kw of allKeywords) {
    if (!byCountry[kw.country_code]) {
      byCountry[kw.country_code] = [];
    }
    byCountry[kw.country_code].push(kw);
  }

  const countries = Object.keys(byCountry).sort();
  logger.info(`Countries to process: ${countries.join(', ')}`);
  logger.info('');

  let totalProcessed = 0;
  let totalCost = 0;

  // Process each country
  for (const countryCode of countries) {
    const keywords = byCountry[countryCode];
    const locationCode = LOCATION_CODES[countryCode];

    if (!locationCode) {
      logger.warn(`❌ No location code for ${countryCode}, skipping`);
      continue;
    }

    logger.info(`Processing ${countryCode}: ${keywords.length} keywords`);

    // Batch into groups of 1,000 (DataForSEO per-request keyword cap)
    const batches = chunk(
      keywords.map(k => k.keyword),
      1000
    );
    logger.info(`  Batches: ${batches.length} (${batches[0].length} keywords per batch)`);

    for (let i = 0; i < batches.length; i++) {
      const batch = batches[i];
      const batchNum = i + 1;

      logger.info(`  Batch ${batchNum}/${batches.length}: ${batch.length} keywords`);

      if (dryRun) {
        logger.info(
          `    [DRY RUN] Would fetch search volumes for: ${batch.slice(0, 3).join(', ')}...`
        );
        totalProcessed += batch.length;
        continue;
      }

      try {
        // Fetch search volumes from API (with backup)
        const results = await fetchSearchVolumes(batch, locationCode, countryCode, batchNum);
        const cost = 0.075; // $0.075 per 1,000 keywords (Live mode)
        totalCost += cost;

        logger.success(
          `    Fetched ${results.length} results (cost: $${cost.toFixed(2)}, total: $${totalCost.toFixed(2)})`
        );

        // Update database
        updateKeywords(db, results, countryCode);
        totalProcessed += batch.length;

        // Save checkpoint every 10 batches
        if (batchNum % 10 === 0) {
          saveCheckpoint(totalProcessed);
          logger.info(`    Checkpoint saved: ${totalProcessed} keywords processed`);
        }

        // Rate limit: wait 1 second between batches
        if (batchNum < batches.length) {
          await new Promise(resolve => setTimeout(resolve, 1000));
        }
      } catch (error) {
        // Persist progress before bailing so the operator knows where it died;
        // unprocessed rows still have NULL search_volume and will be re-selected
        logger.error(`    Failed to process batch ${batchNum}: ${error.message}`);
        logger.error(`    Saving checkpoint and exiting...`);
        saveCheckpoint(totalProcessed);
        db.close();
        process.exit(1);
      }
    }

    logger.success(`✓ ${countryCode} complete: ${keywords.length} keywords processed\n`);
  }

  // Recalculate priorities (skip in dry-run: nothing was written)
  if (!dryRun) {
    recalculatePriorities(db);
  }

  // Summary
  logger.info('');
  logger.info('='.repeat(60));
  logger.info('SUMMARY');
  logger.info('='.repeat(60));
  logger.info(`Total keywords processed: ${totalProcessed}`);
  logger.info(`Total cost: $${totalCost.toFixed(2)}`);
  logger.info(`Countries: ${countries.length}`);

  // Cleanup: a completed run no longer needs its checkpoint
  if (fs.existsSync('.backfill-checkpoint.json')) {
    fs.unlinkSync('.backfill-checkpoint.json');
    logger.info('Checkpoint file deleted');
  }

  db.close();
  logger.success('✓ Backfill complete!');
}
442  
// Top-level entry point: surface any fatal error and exit non-zero so that
// shells/cron can detect the failure.
main().catch(error => {
  logger.error(`Fatal error: ${error.message}`);
  if (error.stack) logger.debug(error.stack);
  process.exit(1);
});