/ scripts / backfill-ad-signals.js
backfill-ad-signals.js
  1  #!/usr/bin/env node
  2  
  3  /**
  4   * Backfill ad signals from existing HTML files on disk.
  5   * Scans data/html/{site_id}.json for ad platform pixels/tags.
  6   *
  7   * Usage:
  8   *   node scripts/backfill-ad-signals.js          # All sites with HTML
  9   *   node scripts/backfill-ad-signals.js --limit 100  # First 100
 10   *   node scripts/backfill-ad-signals.js --dry-run     # Preview only
 11   */
 12  
 13  import { readdirSync } from 'fs';
 14  import { join } from 'path';
 15  import { run, getOne } from '../src/utils/db.js';
 16  import { readHtmlDom, readKeyPagesHtml, DATA_DIR } from '../src/utils/html-storage.js';
 17  import { detectAdsFromHtml } from '../src/utils/ad-detector.js';
 18  import Logger from '../src/utils/logger.js';
 19  
 20  const logger = new Logger('BackfillAdSignals');
 21  
 22  const args = process.argv.slice(2);
 23  const dryRun = args.includes('--dry-run');
 24  const limitIdx = args.indexOf('--limit');
 25  const limit = limitIdx !== -1 ? parseInt(args[limitIdx + 1], 10) : Infinity;
 26  
 27  async function main() {
 28    // Get all HTML files on disk
 29    let files;
 30    try {
 31      files = readdirSync(DATA_DIR)
 32        .filter(f => f.endsWith('.json'))
 33        .map(f => parseInt(f.replace('.json', ''), 10))
 34        .filter(id => Number.isInteger(id) && id > 0);
 35    } catch (e) {
 36      logger.error(`Cannot read HTML directory: ${e.message}`);
 37      process.exit(1);
 38    }
 39  
 40    logger.info(`Found ${files.length} HTML files, processing ${Math.min(files.length, limit)}...`);
 41  
 42    let processed = 0;
 43    let adsDetected = 0;
 44    let noAds = 0;
 45    let errors = 0;
 46    const platformCounts = {};
 47  
 48    for (const siteId of files) {
 49      if (processed >= limit) break;
 50  
 51      try {
 52        // Read all HTML for this site
 53        const htmlDom = readHtmlDom(siteId);
 54        const keyPages = readKeyPagesHtml(siteId);
 55  
 56        // Combine all HTML sources
 57        let allHtml = htmlDom || '';
 58        if (keyPages) {
 59          for (const pageHtml of Object.values(keyPages)) {
 60            allHtml += `\n${  pageHtml || ''}`;
 61          }
 62        }
 63  
 64        if (!allHtml || allHtml.length < 100) {
 65          continue;
 66        }
 67  
 68        const result = detectAdsFromHtml(allHtml);
 69  
 70        // Count platforms
 71        for (const [platform, detected] of Object.entries(result.signals)) {
 72          if (detected === true) {
 73            platformCounts[platform] = (platformCounts[platform] || 0) + 1;
 74          }
 75        }
 76  
 77        if (result.is_running_ads) adsDetected++;
 78        else noAds++;
 79  
 80        if (!dryRun) {
 81          await run(
 82            `UPDATE sites
 83             SET is_running_ads = $1,
 84                 ad_signals = $2,
 85                 ad_signals_updated_at = NOW(),
 86                 updated_at = CURRENT_TIMESTAMP
 87             WHERE id = $3`,
 88            [result.is_running_ads, JSON.stringify(result.signals), siteId]
 89          );
 90        }
 91  
 92        processed++;
 93  
 94        if (processed % 50 === 0) {
 95          logger.info(`Progress: ${processed}/${Math.min(files.length, limit)} — ${adsDetected} ads detected`);
 96        }
 97      } catch (e) {
 98        errors++;
 99        logger.warn(`Site ${siteId}: ${e.message}`);
100      }
101    }
102  
103    logger.info(`\n=== Backfill Complete ===`);
104    logger.info(`Processed: ${processed}`);
105    logger.info(`Ads detected: ${adsDetected} (${((adsDetected / processed) * 100).toFixed(1)}%)`);
106    logger.info(`No ads: ${noAds}`);
107    logger.info(`Errors: ${errors}`);
108    logger.info(`\nPlatform breakdown:`);
109    for (const [platform, count] of Object.entries(platformCounts).sort((a, b) => b[1] - a[1])) {
110      logger.info(`  ${platform}: ${count}`);
111    }
112    if (dryRun) {
113      logger.info(`\n(DRY RUN — no database changes made)`);
114    }
115  }
116  
// Script entry point: exit 0 once the backfill finishes, or log the failure
// message and exit 1 if main() rejects.
try {
  await main();
  process.exit(0);
} catch (failure) {
  logger.error(failure.message);
  process.exit(1);
}