/ scripts / backfill-dataforseo-ads.js
backfill-dataforseo-ads.js
  1  #!/usr/bin/env node
  2  
  3  /**
  4   * Backfill Google Ads detection via DataForSEO.
  5   *
  6   * Checks sites where is_running_ads IS NULL or ad_signals does not yet contain
  7   * a 'google_ads' key, prioritising English-speaking markets.
  8   *
  9   * Usage:
 10   *   node scripts/backfill-dataforseo-ads.js [--limit 100] [--dry-run]
 11   *
 12   * Options:
 13   *   --limit N    Process at most N sites (default: unlimited)
 14   *   --dry-run    Query + print results but do not update the database
 15   */
 16  
 17  import '../src/utils/load-env.js';
 18  
 19  import { getAll, run } from '../src/utils/db.js';
 20  import { checkDomainAdActivity } from '../src/utils/dataforseo.js';
 21  import Logger from '../src/utils/logger.js';
 22  
 23  const logger = new Logger('BackfillDataForSEOAds');
 24  
 25  // English-speaking country codes (ISO 3166-1 alpha-2, upper-case)
 26  const ENGLISH_MARKETS = new Set(['AU', 'US', 'CA', 'GB', 'UK', 'NZ', 'IE', 'IN', 'ZA']);
 27  
 28  // DataForSEO location codes for our priority markets
 29  const LOCATION_CODES = {
 30    AU: 2036,
 31    US: 2840,
 32    CA: 2124,
 33    GB: 2826,
 34    UK: 2826,
 35    NZ: 2554,
 36    IE: 2372,
 37    IN: 2356,
 38    ZA: 2710,
 39  };
 40  
 41  // Default to US if no location code mapping found
 42  const DEFAULT_LOCATION_CODE = 2840;
 43  
 44  // Delay between API requests (ms)
 45  const REQUEST_DELAY_MS = 2000;
 46  
 47  // --- CLI args ---
 48  const args = process.argv.slice(2);
 49  const dryRun = args.includes('--dry-run');
 50  const limitIdx = args.indexOf('--limit');
 51  const limit = limitIdx !== -1 ? parseInt(args[limitIdx + 1], 10) : Infinity;
 52  
 53  if (isNaN(limit) || limit <= 0) {
 54    logger.error('--limit must be a positive integer');
 55    process.exit(1);
 56  }
 57  
 58  /**
 59   * Pause execution for a given number of milliseconds.
 60   */
 61  function sleep(ms) {
 62    return new Promise(resolve => setTimeout(resolve, ms));
 63  }
 64  
 65  async function main() {
 66    logger.info(`Starting DataForSEO ads backfill${dryRun ? ' (DRY RUN)' : ''}...`);
 67    if (limit !== Infinity) logger.info(`Limit: ${limit} sites`);
 68  
 69    // Verify credentials up-front
 70    if (!process.env.DATAFORSEO_LOGIN || !process.env.DATAFORSEO_PASSWORD) {
 71      logger.error(
 72        'DATAFORSEO_LOGIN and/or DATAFORSEO_PASSWORD not set. ' +
 73        'Add them to .env or .env.secrets and retry.'
 74      );
 75      process.exit(1);
 76    }
 77  
 78    // Query sites that need Google Ads data:
 79    //   - is_running_ads IS NULL, AND
 80    //   - country_code is an English-speaking market (prioritised), or any market as fallback
 81    // We ORDER BY to process English markets first, then by id for determinism.
 82    const englishList = [...ENGLISH_MARKETS]
 83      .map(c => `'${c}'`)
 84      .join(', ');
 85  
 86    const rows = await getAll(
 87      `SELECT id, domain, country_code, ad_signals
 88       FROM sites
 89       WHERE is_running_ads IS NULL
 90         AND domain IS NOT NULL
 91         AND domain <> ''
 92       ORDER BY
 93         CASE WHEN upper(country_code) IN (${englishList}) THEN 0 ELSE 1 END,
 94         id ASC`
 95    );
 96  
 97    if (rows.length === 0) {
 98      logger.info('No sites require DataForSEO ads backfill — all is_running_ads values are set.');
 99      return;
100    }
101  
102    const total = Math.min(rows.length, limit === Infinity ? rows.length : limit);
103    logger.info(`Found ${rows.length} eligible sites; will process ${total}.`);
104  
105    let processed = 0;
106    let adsDetected = 0;
107    let noAds = 0;
108    const nullResult = 0;
109    let errors = 0;
110  
111    for (let i = 0; i < total; i++) {
112      const row = rows[i];
113      const { id, domain, country_code } = row;
114      const countryUpper = (country_code || '').toUpperCase();
115      const locationCode = LOCATION_CODES[countryUpper] ?? DEFAULT_LOCATION_CODE;
116  
117      let result;
118      try {
119        result = await checkDomainAdActivity(domain, { locationCode });
120      } catch (err) {
121        logger.warn(`site ${i + 1}/${total}: ${domain} → ERROR: ${err.message}`);
122        errors++;
123        // Delay before next request even on error
124        if (i < total - 1) await sleep(REQUEST_DELAY_MS);
125        continue;
126      }
127  
128      if (result === null) {
129        // Credentials not configured — abort (already logged inside the module)
130        logger.error('checkDomainAdActivity returned null — credentials missing. Aborting.');
131        process.exit(1);
132      }
133  
134      const { confidence } = result;
135      const kwCount = result.details.keyword_count ?? 0;
136      const adKwCount = result.details.ad_keyword_count ?? 0;
137  
138      logger.info(
139        `site ${i + 1}/${total}: ${domain} → ` +
140        `keywords=${kwCount} ad_keywords=${adKwCount} (confidence=${confidence})`
141      );
142  
143      // keywords_for_site returns competition data for a domain's topic area,
144      // not a direct signal that the site is actively running Google Ads.
145      // Store the data in ad_signals for informational use but do NOT set
146      // is_running_ads — that field is driven by HTML-based detection only
147      // (AW- conversion tags, Google Tag, etc. detected by ad-detector.js).
148      if (kwCount > 0) adsDetected++;
149      else noAds++;
150  
151      if (!dryRun) {
152        const existingSignals =
153          typeof row.ad_signals === 'object' && row.ad_signals !== null
154            ? row.ad_signals
155            : {};
156  
157        const mergedSignals = {
158          ...existingSignals,
159          dataforseo_keywords: {
160            keyword_count: kwCount,
161            ad_keyword_count: adKwCount,
162            confidence,
163            checked_at: new Date().toISOString(),
164            top_keywords: result.details.top_keywords ?? [],
165          },
166        };
167  
168        // Only update ad_signals + timestamp — never overwrite is_running_ads from here
169        await run(
170          `UPDATE sites
171           SET ad_signals          = $1,
172               ad_signals_updated_at = NOW(),
173               updated_at          = CURRENT_TIMESTAMP
174           WHERE id = $2`,
175          [JSON.stringify(mergedSignals), id]
176        );
177      }
178  
179      processed++;
180  
181      // Rate limit — skip delay after last item
182      if (i < total - 1) await sleep(REQUEST_DELAY_MS);
183    }
184  
185    logger.info('');
186    logger.info('=== Backfill Complete ===');
187    logger.info(`Processed:     ${processed}`);
188    logger.info(`With keywords: ${adsDetected}`);
189    logger.info(`No keywords:   ${noAds}`);
190    logger.info(`Null result:   ${nullResult}`);
191    logger.info(`Errors:        ${errors}`);
192    if (dryRun) {
193      logger.info('');
194      logger.info('(DRY RUN — no database changes made)');
195    }
196  }
197  
198  main()
199    .then(() => process.exit(0))
200    .catch(err => {
201      logger.error(`Fatal: ${err.message}`);
202      process.exit(1);
203    });