backfill-dataforseo-ads.js
1 #!/usr/bin/env node 2 3 /** 4 * Backfill Google Ads detection via DataForSEO. 5 * 6 * Checks sites where is_running_ads IS NULL or ad_signals does not yet contain 7 * a 'google_ads' key, prioritising English-speaking markets. 8 * 9 * Usage: 10 * node scripts/backfill-dataforseo-ads.js [--limit 100] [--dry-run] 11 * 12 * Options: 13 * --limit N Process at most N sites (default: unlimited) 14 * --dry-run Query + print results but do not update the database 15 */ 16 17 import '../src/utils/load-env.js'; 18 19 import { getAll, run } from '../src/utils/db.js'; 20 import { checkDomainAdActivity } from '../src/utils/dataforseo.js'; 21 import Logger from '../src/utils/logger.js'; 22 23 const logger = new Logger('BackfillDataForSEOAds'); 24 25 // English-speaking country codes (ISO 3166-1 alpha-2, upper-case) 26 const ENGLISH_MARKETS = new Set(['AU', 'US', 'CA', 'GB', 'UK', 'NZ', 'IE', 'IN', 'ZA']); 27 28 // DataForSEO location codes for our priority markets 29 const LOCATION_CODES = { 30 AU: 2036, 31 US: 2840, 32 CA: 2124, 33 GB: 2826, 34 UK: 2826, 35 NZ: 2554, 36 IE: 2372, 37 IN: 2356, 38 ZA: 2710, 39 }; 40 41 // Default to US if no location code mapping found 42 const DEFAULT_LOCATION_CODE = 2840; 43 44 // Delay between API requests (ms) 45 const REQUEST_DELAY_MS = 2000; 46 47 // --- CLI args --- 48 const args = process.argv.slice(2); 49 const dryRun = args.includes('--dry-run'); 50 const limitIdx = args.indexOf('--limit'); 51 const limit = limitIdx !== -1 ? parseInt(args[limitIdx + 1], 10) : Infinity; 52 53 if (isNaN(limit) || limit <= 0) { 54 logger.error('--limit must be a positive integer'); 55 process.exit(1); 56 } 57 58 /** 59 * Pause execution for a given number of milliseconds. 60 */ 61 function sleep(ms) { 62 return new Promise(resolve => setTimeout(resolve, ms)); 63 } 64 65 async function main() { 66 logger.info(`Starting DataForSEO ads backfill${dryRun ? ' (DRY RUN)' : ''}...`); 67 if (limit !== Infinity) logger.info(`Limit: ${limit} sites`); 68 69 // Verify credentials up-front 70 if (!process.env.DATAFORSEO_LOGIN || !process.env.DATAFORSEO_PASSWORD) { 71 logger.error( 72 'DATAFORSEO_LOGIN and/or DATAFORSEO_PASSWORD not set. ' + 73 'Add them to .env or .env.secrets and retry.' 74 ); 75 process.exit(1); 76 } 77 78 // Query sites that need Google Ads data: 79 // - is_running_ads IS NULL, AND 80 // - country_code is an English-speaking market (prioritised), or any market as fallback 81 // We ORDER BY to process English markets first, then by id for determinism. 82 const englishList = [...ENGLISH_MARKETS] 83 .map(c => `'${c}'`) 84 .join(', '); 85 86 const rows = await getAll( 87 `SELECT id, domain, country_code, ad_signals 88 FROM sites 89 WHERE is_running_ads IS NULL 90 AND domain IS NOT NULL 91 AND domain <> '' 92 ORDER BY 93 CASE WHEN upper(country_code) IN (${englishList}) THEN 0 ELSE 1 END, 94 id ASC` 95 ); 96 97 if (rows.length === 0) { 98 logger.info('No sites require DataForSEO ads backfill — all is_running_ads values are set.'); 99 return; 100 } 101 102 const total = Math.min(rows.length, limit === Infinity ? rows.length : limit); 103 logger.info(`Found ${rows.length} eligible sites; will process ${total}.`); 104 105 let processed = 0; 106 let adsDetected = 0; 107 let noAds = 0; 108 const nullResult = 0; 109 let errors = 0; 110 111 for (let i = 0; i < total; i++) { 112 const row = rows[i]; 113 const { id, domain, country_code } = row; 114 const countryUpper = (country_code || '').toUpperCase(); 115 const locationCode = LOCATION_CODES[countryUpper] ?? DEFAULT_LOCATION_CODE; 116 117 let result; 118 try { 119 result = await checkDomainAdActivity(domain, { locationCode }); 120 } catch (err) { 121 logger.warn(`site ${i + 1}/${total}: ${domain} → ERROR: ${err.message}`); 122 errors++; 123 // Delay before next request even on error 124 if (i < total - 1) await sleep(REQUEST_DELAY_MS); 125 continue; 126 } 127 128 if (result === null) { 129 // Credentials not configured — abort (already logged inside the module) 130 logger.error('checkDomainAdActivity returned null — credentials missing. Aborting.'); 131 process.exit(1); 132 } 133 134 const { confidence } = result; 135 const kwCount = result.details.keyword_count ?? 0; 136 const adKwCount = result.details.ad_keyword_count ?? 0; 137 138 logger.info( 139 `site ${i + 1}/${total}: ${domain} → ` + 140 `keywords=${kwCount} ad_keywords=${adKwCount} (confidence=${confidence})` 141 ); 142 143 // keywords_for_site returns competition data for a domain's topic area, 144 // not a direct signal that the site is actively running Google Ads. 145 // Store the data in ad_signals for informational use but do NOT set 146 // is_running_ads — that field is driven by HTML-based detection only 147 // (AW- conversion tags, Google Tag, etc. detected by ad-detector.js). 148 if (kwCount > 0) adsDetected++; 149 else noAds++; 150 151 if (!dryRun) { 152 const existingSignals = 153 typeof row.ad_signals === 'object' && row.ad_signals !== null 154 ? row.ad_signals 155 : {}; 156 157 const mergedSignals = { 158 ...existingSignals, 159 dataforseo_keywords: { 160 keyword_count: kwCount, 161 ad_keyword_count: adKwCount, 162 confidence, 163 checked_at: new Date().toISOString(), 164 top_keywords: result.details.top_keywords ?? [], 165 }, 166 }; 167 168 // Only update ad_signals + timestamp — never overwrite is_running_ads from here 169 await run( 170 `UPDATE sites 171 SET ad_signals = $1, 172 ad_signals_updated_at = NOW(), 173 updated_at = CURRENT_TIMESTAMP 174 WHERE id = $2`, 175 [JSON.stringify(mergedSignals), id] 176 ); 177 } 178 179 processed++; 180 181 // Rate limit — skip delay after last item 182 if (i < total - 1) await sleep(REQUEST_DELAY_MS); 183 } 184 185 logger.info(''); 186 logger.info('=== Backfill Complete ==='); 187 logger.info(`Processed: ${processed}`); 188 logger.info(`With keywords: ${adsDetected}`); 189 logger.info(`No keywords: ${noAds}`); 190 logger.info(`Null result: ${nullResult}`); 191 logger.info(`Errors: ${errors}`); 192 if (dryRun) { 193 logger.info(''); 194 logger.info('(DRY RUN — no database changes made)'); 195 } 196 } 197 198 main() 199 .then(() => process.exit(0)) 200 .catch(err => { 201 logger.error(`Fatal: ${err.message}`); 202 process.exit(1); 203 });