backfill-ad-signals.js
#!/usr/bin/env node

/**
 * Backfill ad signals from existing HTML files on disk.
 * Scans data/html/{site_id}.json for ad platform pixels/tags.
 *
 * Usage:
 *   node scripts/backfill-ad-signals.js              # All sites with HTML
 *   node scripts/backfill-ad-signals.js --limit 100  # First 100 (lowest site ids)
 *   node scripts/backfill-ad-signals.js --dry-run    # Preview only
 */

import { readdirSync } from 'fs';
import { join } from 'path';
import { run, getOne } from '../src/utils/db.js';
import { readHtmlDom, readKeyPagesHtml, DATA_DIR } from '../src/utils/html-storage.js';
import { detectAdsFromHtml } from '../src/utils/ad-detector.js';
import Logger from '../src/utils/logger.js';

const logger = new Logger('BackfillAdSignals');

/** Combined HTML shorter than this many characters is treated as unusable. */
const MIN_HTML_LENGTH = 100;

/**
 * Matches exactly `{digits}.json`. A strict match avoids mistaking files such
 * as `12.meta.json` or `12abc.json` for site 12, which a bare parseInt would do.
 */
const SITE_FILE_PATTERN = /^(\d+)\.json$/;

const args = process.argv.slice(2);
const dryRun = args.includes('--dry-run');

/**
 * Resolve the `--limit` option from the CLI arguments.
 *
 * @param {string[]} argv - Arguments after the script name.
 * @returns {number} The requested limit, or Infinity when `--limit` is absent.
 * @throws {Error} When `--limit` is present without a non-negative integer value.
 */
function parseLimit(argv) {
  const flagIdx = argv.indexOf('--limit');
  if (flagIdx === -1) return Infinity;

  const raw = argv[flagIdx + 1];
  // Reject missing / non-numeric values up front: a NaN limit would silently
  // disable the cap and leak "NaN" into the progress output.
  if (raw === undefined || !/^\d+$/.test(raw)) {
    throw new Error(`--limit requires a non-negative integer, got: ${raw ?? '(nothing)'}`);
  }
  return Number.parseInt(raw, 10);
}

/**
 * List the site ids that have an HTML file in DATA_DIR.
 *
 * @returns {number[]} Unique positive site ids in ascending order, so that
 *   `--limit` selects the same sites on every run and platform.
 * @throws {Error} Propagates any error raised by readdirSync.
 */
function listSiteIds() {
  const ids = new Set();
  for (const fileName of readdirSync(DATA_DIR)) {
    const match = SITE_FILE_PATTERN.exec(fileName);
    if (!match) continue;

    const id = Number.parseInt(match[1], 10);
    if (Number.isSafeInteger(id) && id > 0) ids.add(id);
  }
  return [...ids].sort((a, b) => a - b);
}

/**
 * Concatenate every stored HTML source for a site into one string.
 *
 * @param {number} siteId - Site whose HTML should be loaded.
 * @returns {string} The main DOM HTML followed by each key page's HTML,
 *   newline-separated; empty string when nothing is stored.
 */
function collectSiteHtml(siteId) {
  const htmlDom = readHtmlDom(siteId);
  const keyPages = readKeyPagesHtml(siteId);

  let allHtml = htmlDom || '';
  if (keyPages) {
    // NOTE(review): assumes keyPages is an object whose values are HTML strings — confirm against html-storage.
    for (const pageHtml of Object.values(keyPages)) {
      allHtml += `\n${pageHtml || ''}`;
    }
  }
  return allHtml;
}

/**
 * Run the backfill: detect ad signals for each site with HTML on disk and,
 * unless `--dry-run` was given, store the result on the site's row.
 * Per-site failures are logged and counted; they do not abort the run.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const limit = parseLimit(args);

  let siteIds;
  try {
    siteIds = listSiteIds();
  } catch (e) {
    logger.error(`Cannot read HTML directory: ${e.message}`);
    process.exit(1);
  }

  const target = Math.min(siteIds.length, limit);
  logger.info(`Found ${siteIds.length} HTML files, processing ${target}...`);

  let processed = 0;
  let adsDetected = 0;
  let noAds = 0;
  let skipped = 0;
  let errors = 0;
  const platformCounts = new Map();

  for (const siteId of siteIds) {
    if (processed >= limit) break;

    try {
      const allHtml = collectSiteHtml(siteId);
      if (allHtml.length < MIN_HTML_LENGTH) {
        skipped++;
        continue;
      }

      const result = detectAdsFromHtml(allHtml);

      // Derive everything that can throw before touching the database, and
      // only update the tallies after the write succeeded, so a failed site
      // is counted exactly once (as an error).
      const detectedPlatforms = Object.entries(result.signals)
        .filter(([, detected]) => detected === true)
        .map(([platform]) => platform);

      if (!dryRun) {
        await run(
          `UPDATE sites
           SET is_running_ads = $1,
               ad_signals = $2,
               ad_signals_updated_at = NOW(),
               updated_at = CURRENT_TIMESTAMP
           WHERE id = $3`,
          [result.is_running_ads, JSON.stringify(result.signals), siteId]
        );
      }

      for (const platform of detectedPlatforms) {
        platformCounts.set(platform, (platformCounts.get(platform) ?? 0) + 1);
      }

      if (result.is_running_ads) adsDetected++;
      else noAds++;

      processed++;

      if (processed % 50 === 0) {
        logger.info(`Progress: ${processed}/${target} — ${adsDetected} ads detected`);
      }
    } catch (e) {
      errors++;
      logger.warn(`Site ${siteId}: ${e.message}`);
    }
  }

  // Guard the division: with nothing processed the ratio would print as "NaN%".
  const adsPct = processed > 0 ? ((adsDetected / processed) * 100).toFixed(1) : '0.0';

  logger.info(`\n=== Backfill Complete ===`);
  logger.info(`Processed: ${processed}`);
  logger.info(`Ads detected: ${adsDetected} (${adsPct}%)`);
  logger.info(`No ads: ${noAds}`);
  logger.info(`Skipped (insufficient HTML): ${skipped}`);
  logger.info(`Errors: ${errors}`);
  logger.info(`\nPlatform breakdown:`);
  for (const [platform, count] of [...platformCounts].sort((a, b) => b[1] - a[1])) {
    logger.info(`  ${platform}: ${count}`);
  }
  if (dryRun) {
    logger.info(`\n(DRY RUN — no database changes made)`);
  }
}

main().then(() => process.exit(0)).catch(e => {
  logger.error(e.message);
  process.exit(1);
});