// src/utils/ad-detector.js
/**
 * Ad Pixel / Tracking Detection
 *
 * Scans HTML source for evidence of paid advertising activity:
 *   - Google Ads conversion tracking (AW- tags, conversion scripts)
 *   - Google remarketing (DoubleClick Floodlight, Google forwarding numbers)
 *   - Meta Pixel (Facebook/Instagram ads)
 *   - Microsoft/Bing Ads (UET tag)
 *   - LinkedIn Insight Tag
 *   - TikTok Pixel
 *   - Call tracking services (CallRail, CallTrackingMetrics, etc.)
 *   - Retargeting / native ad networks (Criteo, The Trade Desk, Taboola, Outbrain)
 *   - Pinterest Tag
 *
 * The HTML scanner works on raw HTML (no browser needed), so it can backfill
 * from stored HTML. A Playwright `request` listener is also provided for live
 * detection; it only observes requests and does not block or modify them.
 */
 16  
 17  import Logger from './logger.js';
 18  
// Module-scoped logger tagged 'AdDetector'. It is not referenced by any code
// visible in this file.
const logger = new Logger('AdDetector');
 20  
 21  // --- HTML-based detection patterns ---
 22  // Each pattern: { name, signals[], htmlPatterns[], networkPatterns[] }
 23  
 24  const AD_PLATFORMS = [
 25    {
 26      name: 'google_ads',
 27      weight: 40,
 28      htmlPatterns: [
 29        /gtag\s*\(\s*['"]config['"]\s*,\s*['"]AW-\d+['"]\)/i,
 30        /google_conversion_id\s*=\s*\d+/i,
 31        /googleadservices\.com\/pagead\/conversion/i,
 32        /googleads\.g\.doubleclick\.net\/pagead\/viewthroughconversion/i,
 33        /www\.googleadservices\.com\/pagead\/conversion_async\.js/i,
 34        /gtag\s*\(\s*['"]event['"]\s*,\s*['"]conversion['"]/i,
 35      ],
 36      networkDomains: [
 37        'googleads.g.doubleclick.net',
 38        'www.googleadservices.com/pagead',
 39      ],
 40    },
 41    {
 42      name: 'google_remarketing',
 43      weight: 30,
 44      htmlPatterns: [
 45        /doubleclick\.net\/ddm\//i,
 46        /fls\.doubleclick\.net/i,
 47        /www\.gstatic\.com\/wcm\/loader\.js/i, // Google forwarding number (call tracking)
 48      ],
 49      networkDomains: [
 50        'ad.doubleclick.net',
 51        'fls.doubleclick.net',
 52      ],
 53    },
 54    {
 55      name: 'meta_pixel',
 56      weight: 35,
 57      htmlPatterns: [
 58        /fbq\s*\(\s*['"]init['"]/i,
 59        /connect\.facebook\.net\/[a-z_]+\/fbevents\.js/i,
 60        /_fbp\s*=/i,
 61        /facebook\.com\/tr\//i,
 62      ],
 63      networkDomains: [
 64        'connect.facebook.net',
 65      ],
 66    },
 67    {
 68      name: 'bing_ads',
 69      weight: 25,
 70      htmlPatterns: [
 71        /bat\.bing\.com\/bat\.js/i,
 72        /uetq\s*=/i,
 73      ],
 74      networkDomains: [
 75        'bat.bing.com',
 76      ],
 77    },
 78    {
 79      name: 'linkedin_ads',
 80      weight: 20,
 81      htmlPatterns: [
 82        /snap\.licdn\.com\/li\.lms-analytics\/insight/i,
 83        /_linkedin_partner_id\s*=/i,
 84      ],
 85      networkDomains: [
 86        'snap.licdn.com',
 87      ],
 88    },
 89    {
 90      name: 'tiktok_pixel',
 91      weight: 20,
 92      htmlPatterns: [
 93        /analytics\.tiktok\.com\/i18n\/pixel\/events\.js/i,
 94        /ttq\.load\s*\(/i,
 95      ],
 96      networkDomains: [
 97        'analytics.tiktok.com',
 98      ],
 99    },
100    {
101      name: 'call_tracking',
102      weight: 30,
103      htmlPatterns: [
104        /callrail\.com/i,
105        /calltrackingmetrics\.com/i,
106        /marchex\.io/i,
107        /invoca\.net/i,
108        /dialogtech\.com/i,
109      ],
110      networkDomains: [
111        'cdn.callrail.com',
112        'calltrackingmetrics.com',
113        'marchex.io',
114      ],
115    },
116    {
117      name: 'retargeting',
118      weight: 20,
119      htmlPatterns: [
120        /criteo\.com|criteo\.net/i,
121        /adsrvr\.org/i,           // The Trade Desk
122        /taboola\.com\/libtrc/i,  // Taboola
123        /outbrain\.com\/outbrain\.js/i, // Outbrain
124      ],
125      networkDomains: [
126        'static.criteo.net',
127        'adsrvr.org',
128      ],
129    },
130    {
131      name: 'pinterest_ads',
132      weight: 15,
133      htmlPatterns: [
134        /s\.pinimg\.com\/ct\/core\.js/i,
135        /pintrk\s*\(\s*['"]load['"]/i,
136      ],
137      networkDomains: [
138        's.pinimg.com',
139      ],
140    },
141  ];
142  
// Google Analytics (not ads, but useful context).
// The config patterns accept an optional trailing options object, i.e. both
// gtag('config', 'G-XXXX') and gtag('config', 'G-XXXX', {...}).
const GA_PATTERNS = [
  /gtag\s*\(\s*['"]config['"]\s*,\s*['"]G-[A-Z0-9]+['"]\s*[,)]/i,        // GA4
  // Universal Analytics property IDs are UA-<account>-<property> (e.g. UA-12345-1);
  // the property suffix is optional here so the bare UA-<digits> form still matches.
  /gtag\s*\(\s*['"]config['"]\s*,\s*['"]UA-\d+(?:-\d+)?['"]\s*[,)]/i,    // Universal Analytics
  /googletagmanager\.com\/gtm\.js/i,                                      // GTM container
];
149  
/**
 * Scan raw HTML for ad platform signals.
 * Works without a browser — just string/regex matching.
 *
 * For each entry in AD_PLATFORMS the HTML patterns are tried in order; the
 * first one that matches marks the platform as detected, adds the platform's
 * weight to the score and records one `details` entry.
 *
 * Extra, non-scoring keys written to `signals`:
 *   - has_gtm            (only set, to true, when a GTM container snippet is found)
 *   - has_analytics      (boolean, any GA_PATTERNS match)
 *   - facebook_page_slug (first facebook.com/<slug> link that is not a
 *                         share/dialog/pixel/plugin URL)
 *
 * @param {string} html - Raw HTML source
 * @returns {{is_running_ads: (boolean|null), score: number, signals: Object, details: Array<Object>}}
 *   `is_running_ads` is null when `html` is empty or not a string, otherwise
 *   true when the score reaches 25.
 */
export function detectAdsFromHtml(html) {
  if (!html || typeof html !== 'string') {
    return { is_running_ads: null, score: 0, signals: {}, details: [] };
  }

  const signals = {};
  const details = [];
  let totalScore = 0;

  for (const platform of AD_PLATFORMS) {
    let detected = false;
    for (const pattern of platform.htmlPatterns) {
      // A single match() both detects the pattern and yields the evidence.
      const match = html.match(pattern);
      if (match) {
        detected = true;
        details.push({
          platform: platform.name,
          pattern: pattern.source.slice(0, 60),
          match: match[0].slice(0, 100),
        });
        break; // One match per platform is enough
      }
    }
    signals[platform.name] = detected;
    if (detected) totalScore += platform.weight;
  }

  // Check for GTM (indicates potential hidden ad tags).
  // GTM alone isn't an ad signal, but it's context, so it does not affect the score.
  if (/googletagmanager\.com\/gtm\.js\?id=GTM-[A-Z0-9]+/i.test(html)) {
    signals.has_gtm = true;
  }

  // Check for GA (not an ad signal but useful)
  signals.has_analytics = GA_PATTERNS.some((p) => p.test(html));

  // Extract Facebook Page URL if present (for Meta Ad Library lookups).
  // Walk every facebook.com/<segment> occurrence: the first one is often the
  // pixel (facebook.com/tr?id=...) or a share link, which must be skipped
  // rather than ending the search.
  const nonPageSegments = new Set([
    'sharer', 'share', 'dialog', 'tr', 'plugins', 'sharer.php', 'share.php',
  ]);
  const fbLinkPattern = /(?:https?:\/\/)?(?:www\.)?facebook\.com\/([a-zA-Z0-9._-]+)\/?/gi;
  for (const fbMatch of html.matchAll(fbLinkPattern)) {
    const slug = fbMatch[1];
    if (!nonPageSegments.has(slug.toLowerCase())) {
      signals.facebook_page_slug = slug;
      break;
    }
  }

  const isRunningAds = totalScore >= 25; // At least one significant signal

  return {
    is_running_ads: isRunningAds,
    score: totalScore,
    signals,
    details,
  };
}
209  
/**
 * Set up a Playwright `request` listener for live ad detection.
 * Call before page.goto(). Returns a collector function to get results after page load.
 *
 * Each AD_PLATFORMS `networkDomains` entry is treated as "host" or
 * "host/path-prefix". A request counts as a hit when its parsed hostname equals
 * the host (or is a subdomain of it) and its path starts with the prefix.
 * Matching on the parsed URL, rather than searching the whole URL string,
 * prevents false positives when an ad domain merely appears inside a query
 * string or redirect parameter of an unrelated request.
 *
 * @param {import('playwright').Page} page - Playwright page instance
 * @returns {Function} Call after page load to get detection results. Calling it
 *   detaches the listener and returns
 *   { is_running_ads, score, signals: { platform: bool }, details: [...] },
 *   where each detail lists up to three (truncated) matching URLs.
 */
export function setupNetworkAdDetection(page) {
  // Only the first few URLs per platform are ever reported, so don't keep more.
  const MAX_URLS_PER_PLATFORM = 3;
  const networkHits = new Map(); // platform -> [urls]

  // Pre-split every configured entry into host + optional path prefix.
  const targets = [];
  for (const platform of AD_PLATFORMS) {
    for (const entry of platform.networkDomains) {
      const slashAt = entry.indexOf('/');
      targets.push({
        platformName: platform.name,
        host: (slashAt === -1 ? entry : entry.slice(0, slashAt)).toLowerCase(),
        pathPrefix: slashAt === -1 ? '' : entry.slice(slashAt),
      });
    }
  }

  const requestHandler = (request) => {
    const url = request.url();

    let hostname;
    let pathname;
    try {
      ({ hostname, pathname } = new URL(url));
    } catch {
      return; // Not a parseable absolute URL — cannot be attributed to a host.
    }

    const matchedPlatforms = new Set();
    for (const { platformName, host, pathPrefix } of targets) {
      if (matchedPlatforms.has(platformName)) continue; // record a URL once per platform
      const hostMatches = hostname === host || hostname.endsWith(`.${host}`);
      if (!hostMatches || !pathname.startsWith(pathPrefix)) continue;

      matchedPlatforms.add(platformName);
      if (!networkHits.has(platformName)) {
        networkHits.set(platformName, []);
      }
      const urls = networkHits.get(platformName);
      if (urls.length < MAX_URLS_PER_PLATFORM) {
        urls.push(url.slice(0, 200));
      }
    }
  };

  page.on('request', requestHandler);

  // Return collector function
  return function collectNetworkAdSignals() {
    page.off('request', requestHandler);

    const signals = {};
    const details = [];
    let totalScore = 0;

    for (const platform of AD_PLATFORMS) {
      const urls = networkHits.get(platform.name) ?? [];
      const detected = urls.length > 0;
      signals[platform.name] = detected;
      if (detected) {
        totalScore += platform.weight;
        details.push({
          platform: platform.name,
          source: 'network',
          urls: urls.slice(0, MAX_URLS_PER_PLATFORM),
        });
      }
    }

    return {
      is_running_ads: totalScore >= 25,
      score: totalScore,
      signals,
      details,
    };
  };
}
271  
/**
 * Merge HTML-based and network-based detection results.
 *
 * A platform counts as detected when either source detected it; the score is
 * recomputed from the AD_PLATFORMS weights of the merged set. Non-platform
 * signals (has_gtm, has_analytics, facebook_page_slug, ...) are carried over
 * from the HTML result. Details are concatenated, HTML entries first.
 *
 * If only one result is provided it is returned as-is (same object).
 *
 * @param {Object} htmlResult - From detectAdsFromHtml()
 * @param {Object} networkResult - From collectNetworkAdSignals()
 * @returns {Object} Combined result { is_running_ads, score, signals, details }
 */
export function mergeAdDetectionResults(htmlResult, networkResult) {
  if (!networkResult) return htmlResult;
  if (!htmlResult) return networkResult;

  // Tolerate partial results (e.g. deserialized from storage) that lack a signals map.
  const htmlSignals = htmlResult.signals ?? {};
  const networkSignals = networkResult.signals ?? {};

  // Start from the HTML signals so its non-platform keys are preserved.
  const signals = { ...htmlSignals };
  let totalScore = 0;

  // Merge: if either source detected the platform, it's detected.
  // Boolean() keeps every platform flag a real boolean even when a platform
  // key is absent from both inputs.
  for (const platform of AD_PLATFORMS) {
    const detected = Boolean(htmlSignals[platform.name] || networkSignals[platform.name]);
    signals[platform.name] = detected;
    if (detected) totalScore += platform.weight;
  }

  const details = [
    ...(htmlResult.details ?? []),
    ...(networkResult.details ?? []),
  ];

  return {
    is_running_ads: totalScore >= 25,
    score: totalScore,
    signals,
    details,
  };
}
313  
// Export the platform table itself in addition to the detection functions.
export { AD_PLATFORMS };