// ad-detector.js
/**
 * Ad Pixel / Tracking Detection
 *
 * Scans HTML source for evidence of paid advertising activity:
 * - Google Ads conversion tracking (AW- tags, remarketing, call tracking)
 * - Meta Pixel (Facebook/Instagram ads)
 * - Microsoft/Bing Ads (UET tag)
 * - LinkedIn Insight Tag
 * - TikTok Pixel
 * - Pinterest Tag
 * - Call tracking services (CallRail, CallTrackingMetrics, etc.)
 * - Retargeting (Criteo, The Trade Desk, Taboola, Outbrain, DoubleClick Floodlight)
 *
 * Works on raw HTML (no browser needed), so it can backfill from stored HTML.
 * Also provides Playwright network request interception for live detection.
 */

import Logger from './logger.js';

const logger = new Logger('AdDetector');

/** Minimum summed platform weight for a site to be reported as running ads. */
const ADS_SCORE_THRESHOLD = 25;

/** Number of request URLs kept (and reported) per platform by network detection. */
const MAX_URLS_PER_PLATFORM = 3;

/** Maximum length of a stored request URL. */
const MAX_URL_LENGTH = 200;

// --- Detection patterns ---
// Each platform: { name, weight, htmlPatterns[], networkDomains[] }
//   - htmlPatterns:   regexes tested against raw HTML (first hit wins per platform)
//   - networkDomains: "host" or "host/path-prefix" entries matched against request URLs
//   - weight:         score contributed when the platform is detected

const AD_PLATFORMS = [
  {
    name: 'google_ads',
    weight: 40,
    htmlPatterns: [
      // No closing paren required: gtag('config', 'AW-123', {...}) is also a config call.
      /gtag\s*\(\s*['"]config['"]\s*,\s*['"]AW-\d+['"]/i,
      /googletagmanager\.com\/gtag\/js\?id=AW-\d+/i, // gtag.js loader for a Google Ads ID
      /google_conversion_id\s*=\s*\d+/i,
      /googleadservices\.com\/pagead\/conversion/i,
      /googleads\.g\.doubleclick\.net\/pagead\/viewthroughconversion/i,
      /www\.googleadservices\.com\/pagead\/conversion_async\.js/i,
      /gtag\s*\(\s*['"]event['"]\s*,\s*['"]conversion['"]/i,
    ],
    networkDomains: [
      'googleads.g.doubleclick.net',
      'www.googleadservices.com/pagead',
    ],
  },
  {
    name: 'google_remarketing',
    weight: 30,
    htmlPatterns: [
      /doubleclick\.net\/ddm\//i,
      /fls\.doubleclick\.net/i,
      /www\.gstatic\.com\/wcm\/loader\.js/i, // Google forwarding number (call tracking)
    ],
    networkDomains: [
      'ad.doubleclick.net',
      'fls.doubleclick.net',
    ],
  },
  {
    name: 'meta_pixel',
    weight: 35,
    htmlPatterns: [
      /fbq\s*\(\s*['"]init['"]/i,
      /connect\.facebook\.net\/[a-z_]+\/fbevents\.js/i,
      /_fbp\s*=/i,
      /facebook\.com\/tr\//i,
    ],
    networkDomains: [
      'connect.facebook.net',
    ],
  },
  {
    name: 'bing_ads',
    weight: 25,
    htmlPatterns: [
      /bat\.bing\.com\/bat\.js/i,
      /uetq\s*=/i,
    ],
    networkDomains: [
      'bat.bing.com',
    ],
  },
  {
    name: 'linkedin_ads',
    weight: 20,
    htmlPatterns: [
      /snap\.licdn\.com\/li\.lms-analytics\/insight/i,
      /_linkedin_partner_id\s*=/i,
    ],
    networkDomains: [
      'snap.licdn.com',
    ],
  },
  {
    name: 'tiktok_pixel',
    weight: 20,
    htmlPatterns: [
      /analytics\.tiktok\.com\/i18n\/pixel\/events\.js/i,
      /ttq\.load\s*\(/i,
    ],
    networkDomains: [
      'analytics.tiktok.com',
    ],
  },
  {
    name: 'call_tracking',
    weight: 30,
    htmlPatterns: [
      /callrail\.com/i,
      /calltrackingmetrics\.com/i,
      /marchex\.io/i,
      /invoca\.net/i,
      /dialogtech\.com/i,
    ],
    networkDomains: [
      'cdn.callrail.com',
      'calltrackingmetrics.com',
      'marchex.io',
    ],
  },
  {
    name: 'retargeting',
    weight: 20,
    htmlPatterns: [
      /criteo\.com|criteo\.net/i,
      /adsrvr\.org/i, // The Trade Desk
      /taboola\.com\/libtrc/i, // Taboola
      /outbrain\.com\/outbrain\.js/i, // Outbrain
    ],
    networkDomains: [
      'static.criteo.net',
      'adsrvr.org',
    ],
  },
  {
    name: 'pinterest_ads',
    weight: 15,
    htmlPatterns: [
      /s\.pinimg\.com\/ct\/core\.js/i,
      /pintrk\s*\(\s*['"]load['"]/i,
    ],
    networkDomains: [
      's.pinimg.com',
    ],
  },
];

// Google Analytics / Tag Manager (not ads, but useful context)
const GA_PATTERNS = [
  /gtag\s*\(\s*['"]config['"]\s*,\s*['"]G-[A-Z0-9]+['"]/i, // GA4
  // Universal Analytics property IDs have the form UA-<account>-<property>.
  /gtag\s*\(\s*['"]config['"]\s*,\s*['"]UA-\d+(?:-\d+)?['"]/i,
  /googletagmanager\.com\/gtm\.js/i, // GTM container
];

// Google Tag Manager container. The standard snippet builds the gtm.js URL by
// string concatenation, so the container ID normally only appears literally in
// the <noscript> iframe URL (ns.html?id=GTM-XXXX).
const GTM_PATTERNS = [
  /googletagmanager\.com\/gtm\.js/i,
  /googletagmanager\.com\/ns\.html\?id=GTM-[A-Z0-9]+/i,
];

// First path segments under facebook.com that are endpoints/sections, not page slugs.
const FACEBOOK_NON_PAGE_SEGMENTS = new Set([
  'sharer',
  'share',
  'dialog',
  'tr',
  'plugins',
  'pages',
  'groups',
  'events',
]);

/**
 * Find the first facebook.com URL in the HTML whose first path segment looks
 * like a page slug.
 *
 * Every occurrence is examined, so a pixel or share URL that appears before the
 * real page link does not hide it. Known non-page segments and `*.php`
 * endpoints (e.g. `sharer.php`, `profile.php`) are skipped.
 *
 * @param {string} html - Raw HTML source
 * @returns {string|undefined} The slug, or undefined when none is found
 */
function extractFacebookPageSlug(html) {
  // Fresh literal per call; matchAll works on a copy, so no lastIndex state leaks.
  const facebookUrlPattern = /(?:https?:\/\/)?(?:www\.)?facebook\.com\/([a-zA-Z0-9._-]+)\/?/gi;
  for (const [, segment] of html.matchAll(facebookUrlPattern)) {
    const normalized = segment.toLowerCase();
    if (FACEBOOK_NON_PAGE_SEGMENTS.has(normalized) || normalized.endsWith('.php')) {
      continue;
    }
    return segment;
  }
  return undefined;
}

/**
 * Scan raw HTML for ad platform signals.
 * Works without a browser — just string/regex matching.
 *
 * @param {string} html - Raw HTML source
 * @returns {Object} `{ is_running_ads, score, signals, details }` where
 *   `signals` maps each platform name to a boolean (plus `has_analytics`, and
 *   `has_gtm` / `facebook_page_slug` when found) and `details` lists the first
 *   matching pattern per detected platform. For empty or non-string input,
 *   `is_running_ads` is `null` and the other fields are empty.
 */
export function detectAdsFromHtml(html) {
  if (!html || typeof html !== 'string') {
    return { is_running_ads: null, score: 0, signals: {}, details: [] };
  }

  const signals = {};
  const details = [];
  let totalScore = 0;

  for (const platform of AD_PLATFORMS) {
    let detected = false;
    for (const pattern of platform.htmlPatterns) {
      // String#match is stateless even if a pattern ever carries the g flag.
      const match = html.match(pattern);
      if (match) {
        detected = true;
        details.push({
          platform: platform.name,
          pattern: pattern.source.slice(0, 60),
          match: match[0].slice(0, 100),
        });
        break; // One match per platform is enough
      }
    }
    signals[platform.name] = detected;
    if (detected) totalScore += platform.weight;
  }

  // GTM alone isn't an ad signal (it adds no score), but it indicates that ad
  // tags may be injected at runtime and therefore be invisible in raw HTML.
  if (GTM_PATTERNS.some((pattern) => pattern.test(html))) {
    signals.has_gtm = true;
  }

  // Check for GA (not an ad signal but useful)
  signals.has_analytics = GA_PATTERNS.some((pattern) => pattern.test(html));

  // Extract Facebook Page slug if present (for Meta Ad Library lookups)
  const facebookPageSlug = extractFacebookPageSlug(html);
  if (facebookPageSlug) {
    signals.facebook_page_slug = facebookPageSlug;
  }

  return {
    is_running_ads: totalScore >= ADS_SCORE_THRESHOLD,
    score: totalScore,
    signals,
    details,
  };
}

/**
 * Split a `networkDomains` entry into its host and optional path prefix.
 *
 * @param {string} entry - e.g. `'bat.bing.com'` or `'www.googleadservices.com/pagead'`
 * @returns {{ host: string, pathPrefix: string }} `pathPrefix` is `''` when the entry has no path
 */
function parseNetworkDomain(entry) {
  const slashIndex = entry.indexOf('/');
  if (slashIndex === -1) {
    return { host: entry.toLowerCase(), pathPrefix: '' };
  }
  return {
    host: entry.slice(0, slashIndex).toLowerCase(),
    pathPrefix: entry.slice(slashIndex),
  };
}

/**
 * Test whether a parsed request URL targets the given host (or one of its
 * subdomains) under the given path prefix.
 *
 * Matching on the parsed hostname avoids false positives from URLs that merely
 * mention a vendor domain in their path or query string.
 *
 * @param {URL} parsedUrl - Parsed request URL
 * @param {{ host: string, pathPrefix: string }} matcher - From parseNetworkDomain()
 * @returns {boolean} True when host and path prefix both match
 */
function urlMatchesNetworkDomain(parsedUrl, { host, pathPrefix }) {
  const { hostname, pathname } = parsedUrl;
  const hostMatches = hostname === host || hostname.endsWith(`.${host}`);
  return hostMatches && pathname.startsWith(pathPrefix);
}

/**
 * Set up Playwright network request interception for live ad detection.
 * Call before page.goto(). Returns a collector function to get results after page load.
 *
 * The collector detaches the request listener when called, so requests made
 * afterwards are not recorded.
 *
 * @param {import('playwright').Page} page - Playwright page instance
 * @returns {Function} Call after page load to get detection results in the same
 *   `{ is_running_ads, score, signals, details }` shape as detectAdsFromHtml();
 *   each detail is `{ platform, source: 'network', urls }`.
 */
export function setupNetworkAdDetection(page) {
  const networkHits = new Map(); // platform name -> [urls]

  const platformMatchers = AD_PLATFORMS.map((platform) => ({
    name: platform.name,
    matchers: platform.networkDomains.map(parseNetworkDomain),
  }));

  const requestHandler = (request) => {
    const rawUrl = request.url();

    let parsedUrl;
    try {
      parsedUrl = new URL(rawUrl);
    } catch {
      // Deliberately ignored: a URL that cannot be parsed has no hostname to
      // compare, so it cannot be attributed to any ad platform.
      return;
    }

    for (const { name, matchers } of platformMatchers) {
      if (!matchers.some((matcher) => urlMatchesNetworkDomain(parsedUrl, matcher))) {
        continue;
      }
      const urls = networkHits.get(name) ?? [];
      // Only the first few URLs are ever reported, so don't retain more.
      if (urls.length < MAX_URLS_PER_PLATFORM) {
        urls.push(rawUrl.slice(0, MAX_URL_LENGTH));
      }
      networkHits.set(name, urls);
    }
  };

  page.on('request', requestHandler);

  // Return collector function
  return function collectNetworkAdSignals() {
    page.off('request', requestHandler);

    const signals = {};
    const details = [];
    let totalScore = 0;

    for (const platform of AD_PLATFORMS) {
      const urls = networkHits.get(platform.name) ?? [];
      const detected = urls.length > 0;
      signals[platform.name] = detected;
      if (detected) {
        totalScore += platform.weight;
        details.push({
          platform: platform.name,
          source: 'network',
          urls: urls.slice(0, MAX_URLS_PER_PLATFORM),
        });
      }
    }

    return {
      is_running_ads: totalScore >= ADS_SCORE_THRESHOLD,
      score: totalScore,
      signals,
      details,
    };
  };
}

/**
 * Merge HTML-based and network-based detection results.
 *
 * A platform counts as detected when either source detected it; the score is
 * recomputed from the merged platform flags. Non-platform signals (`has_gtm`,
 * `has_analytics`, `facebook_page_slug`) are carried over from the HTML result.
 * When one argument is missing, the other is returned unchanged.
 *
 * @param {Object} htmlResult - From detectAdsFromHtml()
 * @param {Object} networkResult - From collectNetworkAdSignals()
 * @returns {Object} Combined result
 */
export function mergeAdDetectionResults(htmlResult, networkResult) {
  if (!networkResult) return htmlResult;
  if (!htmlResult) return networkResult;

  const htmlSignals = htmlResult.signals ?? {};
  const networkSignals = networkResult.signals ?? {};

  // The copy already carries has_gtm / has_analytics / facebook_page_slug.
  const signals = { ...htmlSignals };
  let totalScore = 0;

  // Merge: if either source detected the platform, it's detected
  for (const platform of AD_PLATFORMS) {
    const detected = Boolean(htmlSignals[platform.name] || networkSignals[platform.name]);
    signals[platform.name] = detected;
    if (detected) totalScore += platform.weight;
  }

  const details = [
    ...(htmlResult.details ?? []),
    ...(networkResult.details ?? []),
  ];

  return {
    is_running_ads: totalScore >= ADS_SCORE_THRESHOLD,
    score: totalScore,
    signals,
    details,
  };
}

export { AD_PLATFORMS };