backfill-real-search-volumes.js
#!/usr/bin/env node

/**
 * Backfill Real Search Volumes from DataForSEO
 *
 * Fetches actual Google Ads search volumes for combined keywords
 * (e.g., "plumber sydney") and replaces geometric mean estimates
 * with real data for accurate prioritization.
 *
 * Usage:
 *   node scripts/backfill-real-search-volumes.js [options]
 *
 * Options:
 *   --test        Test mode - only process 100 keywords
 *   --limit N     Limit to N keywords (for testing)
 *   --country CC  Only process specific country (e.g., US, AU)
 *   --dry-run     Show what would be done without making changes
 */

import 'dotenv/config';
import { createDatabaseConnection } from '../src/utils/db.js';
import Logger from '../src/utils/logger.js';
import fs from 'fs';
import path from 'path';

const logger = new Logger('BackfillSV');

// Backup directory for raw API responses (one subdirectory per country code).
const BACKUP_DIR = './backups/dataforseo-responses';

// DataForSEO location codes for each supported country
const LOCATION_CODES = {
  US: 2840, // United States
  CN: 2156, // China
  JP: 2392, // Japan
  DE: 2276, // Germany
  IN: 2356, // India
  UK: 2826, // United Kingdom
  FR: 2250, // France
  IT: 2380, // Italy
  CA: 2124, // Canada
  MX: 2484, // Mexico
  KR: 2410, // South Korea
  AU: 2036, // Australia
  ES: 2724, // Spain
  ID: 2360, // Indonesia
  NL: 2528, // Netherlands
  CH: 2756, // Switzerland
  PL: 2616, // Poland
  SE: 2752, // Sweden
  BE: 2056, // Belgium
  NO: 2578, // Norway
  IE: 2372, // Ireland
  AT: 2040, // Austria
  SG: 2702, // Singapore
  DK: 2208, // Denmark
  NZ: 2554, // New Zealand
};

// ---- Command line arguments ----
const args = process.argv.slice(2);
const testMode = args.includes('--test');
const dryRun = args.includes('--dry-run');
const limitIndex = args.indexOf('--limit');
// FIX: always pass a radix to parseInt. A missing or non-numeric value yields
// NaN, which is falsy and therefore treated the same as "no limit" below.
const limit = limitIndex >= 0 ? Number.parseInt(args[limitIndex + 1], 10) : null;
const countryIndex = args.indexOf('--country');
// FIX: optional chaining so `--country` with no following value disables the
// filter instead of crashing with a TypeError on `.toUpperCase()`.
const countryFilter = countryIndex >= 0 ? (args[countryIndex + 1]?.toUpperCase() ?? null) : null;

// --test overrides any explicit --limit.
const effectiveLimit = testMode ? 100 : limit;

/**
 * Split an array into consecutive batches of at most `size` elements.
 *
 * @param {Array} array - Source array (not mutated).
 * @param {number} size - Maximum batch length (must be > 0).
 * @returns {Array<Array>} Batches in original order; last one may be short.
 */
function chunk(array, size) {
  const chunks = [];
  for (let i = 0; i < array.length; i += size) {
    chunks.push(array.slice(i, i + size));
  }
  return chunks;
}

/**
 * Persist a raw DataForSEO API response to disk so a failed or disputed run
 * can be audited/replayed without spending API credits again.
 *
 * @param {string} countryCode - ISO country code; used as subdirectory name.
 * @param {number} batchNum - 1-based batch number within the country.
 * @param {object} responseData - Parsed JSON body returned by the API.
 * @param {string[]} keywords - Keywords that were sent in this batch.
 */
function saveResponseBackup(countryCode, batchNum, responseData, keywords) {
  // e.g. "2024-01-02T03-04-05": colons are replaced because they are invalid
  // in Windows filenames; sub-second precision is dropped.
  const timestamp = new Date().toISOString().replace(/:/g, '-').split('.')[0];
  const countryDir = path.join(BACKUP_DIR, countryCode);

  // mkdirSync with recursive:true is a no-op when the directory already
  // exists, so the previous existsSync pre-check was redundant (and racy).
  fs.mkdirSync(countryDir, { recursive: true });

  const filename = `${timestamp}-batch-${batchNum}.json`;
  const filepath = path.join(countryDir, filename);

  const backup = {
    timestamp: new Date().toISOString(),
    country_code: countryCode,
    batch_number: batchNum,
    keyword_count: keywords.length,
    keywords,
    response: responseData,
  };

  fs.writeFileSync(filepath, JSON.stringify(backup, null, 2));
  logger.debug(` Saved backup to ${filepath}`);
}

/**
 * Call DataForSEO Google Ads Search Volume API (Live mode).
 *
 * Credentials come from DATAFORSEO_LOGIN / DATAFORSEO_PASSWORD env vars
 * (HTTP Basic auth). The raw response is written to BACKUP_DIR before the
 * result is mapped.
 *
 * @param {string[]} keywords - Up to 1,000 keywords per call.
 * @param {number} locationCode - DataForSEO location code (see LOCATION_CODES).
 * @param {string} countryCode - ISO country code (for the backup path only).
 * @param {number} batchNum - 1-based batch number (for the backup filename).
 * @param {string} [languageCode='en'] - DataForSEO language code.
 * @returns {Promise<Array<{keyword: string, search_volume: number, competition: number, cpc: number}>>}
 * @throws {Error} On non-2xx HTTP status or a non-20000 API status_code.
 */
async function fetchSearchVolumes(
  keywords,
  locationCode,
  countryCode,
  batchNum,
  languageCode = 'en'
) {
  const url = 'https://api.dataforseo.com/v3/keywords_data/google_ads/search_volume/live';

  const auth = Buffer.from(
    `${process.env.DATAFORSEO_LOGIN}:${process.env.DATAFORSEO_PASSWORD}`
  ).toString('base64');

  const payload = [
    {
      location_code: locationCode,
      language_code: languageCode,
      keywords,
    },
  ];

  logger.debug(
    ` API call: ${keywords.length} keywords, location ${locationCode}, language ${languageCode}`
  );

  const response = await fetch(url, {
    method: 'POST',
    headers: {
      Authorization: `Basic ${auth}`,
      'Content-Type': 'application/json',
    },
    body: JSON.stringify(payload),
  });

  if (!response.ok) {
    const errorText = await response.text();
    throw new Error(
      `DataForSEO API error: ${response.status} ${response.statusText} - ${errorText}`
    );
  }

  const data = await response.json();

  // 20000 is DataForSEO's "Ok" status; anything else is an API-level failure
  // even though the HTTP status was 2xx.
  if (data.status_code !== 20000) {
    throw new Error(`DataForSEO API error: ${data.status_message}`);
  }

  // Save raw API response as backup before any mapping can lose information.
  saveResponseBackup(countryCode, batchNum, data, keywords);

  const results = data.tasks[0]?.result || [];

  logger.debug(` API returned ${results.length} results`);

  // NOTE(review): `competition` from this endpoint may be a string label
  // (e.g. "LOW") rather than a number in some API versions — confirm against
  // the stored backups; the `|| 0` fallback only normalizes null/undefined.
  return results.map(r => ({
    keyword: r.keyword,
    search_volume: r.search_volume || 0,
    competition: r.competition || 0,
    cpc: r.cpc || 0,
  }));
}

/**
 * Update keywords table with real search volumes.
 *
 * Each result row is matched by (keyword, country_code); all updates run in a
 * single transaction so a crash cannot leave a batch half-applied.
 *
 * @param {object} db - better-sqlite3 style connection (prepare/transaction).
 * @param {Array<{keyword: string, search_volume: number}>} results
 * @param {string} countryCode - Country the whole batch belongs to.
 */
function updateKeywords(db, results, countryCode) {
  const stmt = db.prepare(`
    UPDATE keywords
    SET search_volume = ?, updated_at = CURRENT_TIMESTAMP
    WHERE keyword = ? AND country_code = ?
  `);

  const updateMany = db.transaction(updates => {
    for (const update of updates) {
      stmt.run(update.search_volume, update.keyword, countryCode);
    }
  });

  updateMany(results);
  logger.success(` Updated ${results.length} keywords in database`);
}

/**
 * Recalculate priorities based on percentile ranking.
 * Keywords with SV=0 get priority 0 (no search demand).
 * Keywords with SV>0 get priorities 1-10 based on percentile.
 *
 * NOTE: rows whose search_volume is still NULL match neither filter below and
 * keep their existing priority untouched.
 */
function recalculatePriorities(db) {
  logger.info('Recalculating priorities based on search volumes...');

  // Get all keywords sorted by search volume (descending).
  const allKeywords = db
    .prepare(
      `
    SELECT id, search_volume
    FROM keywords
    ORDER BY search_volume DESC
  `
    )
    .all();

  if (allKeywords.length === 0) {
    logger.warn('No keywords to recalculate');
    return;
  }

  // Split into zero and non-zero search volumes.
  const zeroSV = allKeywords.filter(kw => kw.search_volume === 0);
  const positiveSV = allKeywords.filter(kw => kw.search_volume > 0);

  logger.info(` Keywords: ${positiveSV.length} with SV>0, ${zeroSV.length} with SV=0`);

  const stmt = db.prepare('UPDATE keywords SET priority = ? WHERE id = ?');
  const updateMany = db.transaction(updates => {
    for (const update of updates) {
      stmt.run(update.priority, update.id);
    }
  });

  const updates = [];

  // Priority 0 for all zero-SV keywords.
  for (const kw of zeroSV) {
    updates.push({ id: kw.id, priority: 0 });
  }

  // Priorities 1-10 for positive-SV keywords based on percentile rank.
  // positiveSV is sorted DESC, so index 0 is the highest-volume keyword and
  // gets percentile 1.0 → priority 10. The bottom 20% (percentile < 0.2) all
  // collapse to priority 1 by design.
  for (let i = 0; i < positiveSV.length; i++) {
    const kw = positiveSV[i];
    const percentile = (positiveSV.length - i) / positiveSV.length;
    let priority;
    if (percentile >= 0.95) priority = 10;
    else if (percentile >= 0.9) priority = 9;
    else if (percentile >= 0.8) priority = 8;
    else if (percentile >= 0.7) priority = 7;
    else if (percentile >= 0.6) priority = 6;
    else if (percentile >= 0.5) priority = 5;
    else if (percentile >= 0.4) priority = 4;
    else if (percentile >= 0.3) priority = 3;
    else if (percentile >= 0.2) priority = 2;
    else priority = 1;

    updates.push({ id: kw.id, priority });
  }

  updateMany(updates);
  logger.success(
    `Recalculated priorities: ${positiveSV.length} active (1-10), ${zeroSV.length} inactive (0)`
  );
}

/**
 * Save checkpoint so progress is visible if the run is interrupted.
 *
 * @param {number} processed - Total keywords processed so far.
 * @param {string} [filename='.backfill-checkpoint.json'] - Checkpoint path.
 */
function saveCheckpoint(processed, filename = '.backfill-checkpoint.json') {
  fs.writeFileSync(filename, JSON.stringify({ processed, timestamp: new Date().toISOString() }));
}

/**
 * Load a previous checkpoint, if any.
 *
 * Best-effort: a missing or unreadable file logs a warning and returns 0
 * rather than aborting the run.
 *
 * @param {string} [filename='.backfill-checkpoint.json'] - Checkpoint path.
 * @returns {number} Keywords processed at the last checkpoint, or 0.
 */
function loadCheckpoint(filename = '.backfill-checkpoint.json') {
  try {
    if (fs.existsSync(filename)) {
      const data = JSON.parse(fs.readFileSync(filename, 'utf-8'));
      logger.info(
        `Resuming from checkpoint: ${data.processed} keywords processed at ${data.timestamp}`
      );
      return data.processed;
    }
  } catch (error) {
    logger.warn(`Failed to load checkpoint: ${error.message}`);
  }
  return 0;
}

/**
 * Main backfill process
 */
async function main() {
  logger.info('DataForSEO Search Volume Backfill');
  logger.info('='.repeat(60));

  if (testMode) logger.warn('⚠️ TEST MODE: Processing only 100 keywords');
  if (dryRun) logger.warn('⚠️ DRY RUN: No changes will be made');
  if (effectiveLimit) logger.warn(`⚠️ LIMIT: Processing only ${effectiveLimit} keywords`);
  if (countryFilter) logger.info(`Filtering to country: ${countryFilter}`);

  // Open database
  const dbPath = process.env.DATABASE_PATH || './db/sites.db';
  const db = createDatabaseConnection(dbPath);

  // Only rows never fetched (NULL). Rows with SV=0 are real "no demand"
  // results and must not be refetched. Because processed rows become
  // non-NULL, simply re-running the script resumes where it left off; the
  // checkpoint file written below is informational only.
  //
  // FIX: user-supplied values (countryFilter, effectiveLimit come from argv)
  // are bound as statement parameters instead of being interpolated into the
  // SQL string, which was an injection vector.
  let query = `
    SELECT id, keyword, country_code, search_volume
    FROM keywords
    WHERE search_volume IS NULL
  `;
  const params = [];

  if (countryFilter) {
    query += ' AND country_code = ?';
    params.push(countryFilter);
  }

  query += ' ORDER BY country_code, id';

  if (effectiveLimit) {
    query += ' LIMIT ?';
    params.push(effectiveLimit);
  }

  const allKeywords = db.prepare(query).all(...params);

  logger.info(`Found ${allKeywords.length} keywords to process`);

  if (allKeywords.length === 0) {
    logger.warn('No keywords found');
    db.close();
    return;
  }

  // Group rows by country so each country is fetched with its own location code.
  const byCountry = {};
  for (const kw of allKeywords) {
    if (!byCountry[kw.country_code]) {
      byCountry[kw.country_code] = [];
    }
    byCountry[kw.country_code].push(kw);
  }

  const countries = Object.keys(byCountry).sort();
  logger.info(`Countries to process: ${countries.join(', ')}`);
  logger.info('');

  let totalProcessed = 0;
  let totalCost = 0;

  // Process each country
  for (const countryCode of countries) {
    const keywords = byCountry[countryCode];
    const locationCode = LOCATION_CODES[countryCode];

    if (!locationCode) {
      logger.warn(`❌ No location code for ${countryCode}, skipping`);
      continue;
    }

    logger.info(`Processing ${countryCode}: ${keywords.length} keywords`);

    // Batch into groups of 1,000 (the API's per-call keyword ceiling).
    const batches = chunk(
      keywords.map(k => k.keyword),
      1000
    );
    logger.info(` Batches: ${batches.length} (${batches[0].length} keywords per batch)`);

    for (let i = 0; i < batches.length; i++) {
      const batch = batches[i];
      const batchNum = i + 1;

      logger.info(` Batch ${batchNum}/${batches.length}: ${batch.length} keywords`);

      if (dryRun) {
        logger.info(
          ` [DRY RUN] Would fetch search volumes for: ${batch.slice(0, 3).join(', ')}...`
        );
        totalProcessed += batch.length;
        continue;
      }

      try {
        // Fetch search volumes from API (raw response is backed up inside).
        const results = await fetchSearchVolumes(batch, locationCode, countryCode, batchNum);
        // $0.075 per Live-mode call of up to 1,000 keywords — charged per
        // call, so a short final batch costs the same as a full one.
        const cost = 0.075;
        totalCost += cost;

        logger.success(
          ` Fetched ${results.length} results (cost: $${cost.toFixed(2)}, total: $${totalCost.toFixed(2)})`
        );

        // Update database
        updateKeywords(db, results, countryCode);
        totalProcessed += batch.length;

        // Save checkpoint every 10 batches
        if (batchNum % 10 === 0) {
          saveCheckpoint(totalProcessed);
          logger.info(` Checkpoint saved: ${totalProcessed} keywords processed`);
        }

        // Rate limit: wait 1 second between batches (skip after the last one).
        if (batchNum < batches.length) {
          await new Promise(resolve => setTimeout(resolve, 1000));
        }
      } catch (error) {
        // Fail fast: completed batches are already committed, so a rerun
        // picks up with the remaining NULL-SV rows.
        logger.error(` Failed to process batch ${batchNum}: ${error.message}`);
        logger.error(` Saving checkpoint and exiting...`);
        saveCheckpoint(totalProcessed);
        db.close();
        process.exit(1);
      }
    }

    logger.success(`✓ ${countryCode} complete: ${keywords.length} keywords processed\n`);
  }

  // Recalculate priorities across the whole table now that SVs are real.
  if (!dryRun) {
    recalculatePriorities(db);
  }

  // Summary
  logger.info('');
  logger.info('='.repeat(60));
  logger.info('SUMMARY');
  logger.info('='.repeat(60));
  logger.info(`Total keywords processed: ${totalProcessed}`);
  logger.info(`Total cost: $${totalCost.toFixed(2)}`);
  logger.info(`Countries: ${countries.length}`);

  // Cleanup: a finished run needs no checkpoint.
  if (fs.existsSync('.backfill-checkpoint.json')) {
    fs.unlinkSync('.backfill-checkpoint.json');
    logger.info('Checkpoint file deleted');
  }

  db.close();
  logger.success('✓ Backfill complete!');
}

main().catch(error => {
  logger.error(`Fatal error: ${error.message}`);
  if (error.stack) {
    logger.debug(error.stack);
  }
  process.exit(1);
});