/ src / stages / assets.js
assets.js
  1  /**
  2   * Assets Stage
  3   * Captures screenshots and visual assets for sites
  4   */
  5  
  6  import { run, getOne, getAll, withTransaction, getPool } from '../utils/db.js';
  7  import { captureWebsite, launchBrowser, createStealthContext } from '../capture.js';
  8  import Logger from '../utils/logger.js';
  9  import { generateStageCompletion, displayProgress } from '../utils/summary-generator.js';
 10  import { processBatch, withTimeout } from '../utils/error-handler.js';
 11  import { saveScreenshots, croppedScreenshotsExist } from '../utils/screenshot-storage.js';
 12  import { checkBlocklist } from '../utils/site-filters.js';
 13  import { incrementAssetsScraped } from '../utils/keyword-counters.js';
 14  // deduplicateSites removed — UNIQUE constraint on sites.domain handles this (DR-106)
 15  import { detectErrorPage } from '../utils/error-page-detector.js';
 16  import { writeHtmlDom, hasHtmlDom, deleteHtmlDom } from '../utils/html-storage.js';
 17  import { detectAdsFromHtml } from '../utils/ad-detector.js';
 18  import { recordFailure, resetRetries } from '../utils/retry-handler.js';
 19  import { getAdaptiveConcurrencyFast } from '../utils/adaptive-concurrency.js';
 20  import { getCountryByCode } from '../config/countries.js';
 21  import { deriveLanguageCode } from '../utils/detect-language.js';
 22  
 23  const logger = new Logger('Assets');
 24  const success = (...args) => logger.success(...args);
 25  const info = (...args) => logger.info(...args);
 26  const error = (...args) => logger.error(...args);
 27  
 28  /**
 29   * Run the assets stage
 30   * @param {Object} options - Stage options
 31   * @param {number} options.limit - Limit number of sites to process
 32   * @param {number} options.concurrency - Number of concurrent captures (default: 3)
 33   * @returns {Promise<Object>} Stage results
 34   */
 35  // eslint-disable-next-line complexity -- Stage orchestration requires multiple conditional paths
 36  export async function runAssetsStage(options = {}) {
 37    const startTime = Date.now();
 38    // Use concurrency=1 for browser captures to avoid resource exhaustion
 39    // Each capture launches a full browser instance, so parallel launches can timeout
 40    // Can be configured via BROWSER_CONCURRENCY env var
 41    const concurrency = options.concurrency || parseInt(process.env.BROWSER_CONCURRENCY || '1', 10);
 42  
 43    try {
 44      info('Starting Assets Stage...');
 45  
 46      // Check ENABLE_VISION flag (consolidates old flags)
 47      const ENABLE_VISION = process.env.ENABLE_VISION !== 'false';
 48  
 49      // Show deprecation warning if old flags are used
 50      const legacyFlags = [
 51        process.env.ENABLE_SCREENSHOT_CAPTURE,
 52        process.env.USE_COMPUTER_VISION_SCORING,
 53        process.env.USE_COMPUTER_VISION_RESCORING,
 54        process.env.USE_COMPUTER_VISION_ENRICHMENT,
 55      ];
 56      if (legacyFlags.some(flag => flag !== undefined)) {
 57        logger.warn(
 58          '[assets] WARN: Vision flags (ENABLE_SCREENSHOT_CAPTURE, USE_COMPUTER_VISION_*) are deprecated. Use ENABLE_VISION instead.'
 59        );
 60      }
 61  
 62      if (!ENABLE_VISION) {
 63        info('[assets] Vision disabled - capturing rendered DOM HTML via headless browser');
 64  
 65        // Deduplication no longer needed — UNIQUE constraint on sites.domain (DR-106)
 66        // prevents duplicates at insert time. Legacy deduplicateSites() removed.
 67  
 68        // Query found sites that don't yet have HTML content
 69        const queryLimit = options.limit || null;
 70        const htmlCandidates = await getAll(
 71          `SELECT id, domain, landing_page_url as url, country_code
 72           FROM sites
 73           WHERE status = 'found'
 74             AND status NOT IN ('ignored', 'failing')
 75             AND recapture_at IS NULL
 76             AND (html_dom IS NULL OR html_dom = '')
 77           ${queryLimit ? `LIMIT ${queryLimit}` : ''}`
 78        );
 79  
 80        if (htmlCandidates.length === 0) {
 81          info('[assets] No sites need HTML capture');
 82          return {
 83            processed: 0,
 84            succeeded: 0,
 85            failed: 0,
 86            skipped: 0,
 87            duration: Date.now() - startTime,
 88          };
 89        }
 90  
 91        info(
 92          `[assets] Capturing DOM HTML for ${htmlCandidates.length} sites (concurrency: ${concurrency})`
 93        );
 94  
 95        // Check blocklist
 96        let ignoredCount = 0;
 97        const blockedIds = new Set();
 98        for (const site of htmlCandidates) {
 99          const blocked = checkBlocklist(site.domain, site.country_code);
100          if (blocked) {
101            await run(
102              `UPDATE sites SET status = 'ignored', error_message = $1, html_dom = NULL WHERE id = $2`,
103              [blocked.reason, site.id]
104            );
105            deleteHtmlDom(site.id);
106            blockedIds.add(site.id);
107            ignoredCount++;
108          }
109        }
110        if (ignoredCount > 0) {
111          info(`Marked ${ignoredCount} sites as ignored (directories/social media)`);
112        }
113  
114        const htmlSites = htmlCandidates.filter(s => !blockedIds.has(s.id));
115        const htmlStats = { processed: 0, succeeded: 0, failed: 0, skipped: 0 };
116  
117        // Launch one shared browser for the whole batch — avoids per-site launch overhead (~500ms each).
118        // Each concurrent worker gets its own page within the shared context.
119        const sharedBrowser = await launchBrowser({ headless: true });
120        const sharedContext = await createStealthContext(sharedBrowser);
121        let htmlResults, htmlErrors;
122        try {
123          ({ results: htmlResults, errors: htmlErrors } = await processBatch(
124            htmlSites,
125            async (site, index) => {
126              displayProgress(index + 1, htmlSites.length, `Capturing DOM: ${site.url}`);
127              return withTimeout(
128                captureSiteScreenshots(site.id, site.url, site.country_code, sharedContext),
129                120000,
130                `Capture timed out after 120s for ${site.url}`
131              );
132            },
133            {
134              concurrency,
135              getDynamicConcurrency: () => getAdaptiveConcurrencyFast(1, 3, 'BROWSER_CONCURRENCY'),
136            }
137          ));
138        } finally {
139          await sharedContext.close().catch(() => {});
140          await sharedBrowser.close().catch(() => {});
141        }
142  
143        htmlStats.processed = htmlSites.length;
144        htmlStats.succeeded = htmlResults.filter(r => r !== null).length;
145        htmlStats.failed = htmlErrors.length;
146        for (const err of htmlErrors) {
147          error(`  Failed to capture ${err.item?.url || 'unknown'}: ${err.error?.message || err}`);
148        }
149  
150        htmlStats.duration = Date.now() - startTime;
151        generateStageCompletion('Assets (HTML-only)', htmlStats);
152        return htmlStats;
153      }
154  
155      // Vision enabled - proceed with normal screenshot capture
156      info('[assets] Vision enabled - capturing screenshots');
157  
158      // Deduplication no longer needed — UNIQUE constraint on sites.domain (DR-106)
159  
160      // Get sites that need screenshot capture (missing html_dom or incomplete screenshots)
161      // Pre-filter in SQL for sites likely to need screenshots to avoid loading unnecessary data
162      // This query targets sites with missing html_dom or missing screenshot_path
163      // We'll still need to check for incomplete cropped screenshots afterward (can't check file existence in SQL)
164      const queryLimit = options.limit ? options.limit * 3 : null; // Small multiplier to account for cropped screenshot checks (reduced from 50x to 3x for performance)
165      const candidates = await getAll(
166        `SELECT id, domain, landing_page_url as url, screenshot_path, html_dom, error_message, country_code
167         FROM sites
168         WHERE (status = 'found' OR status = 'assets_captured')
169           AND status NOT IN ('ignored', 'failing')
170           AND recapture_at IS NULL
171           AND (
172             html_dom IS NULL
173             OR html_dom = ''
174             OR screenshot_path IS NULL
175           )
176         ORDER BY CASE WHEN country_code IN ('AU','CA','GB','IE','IN','NZ','US','ZA') THEN 0 ELSE 1 END ASC
177         ${queryLimit ? `LIMIT ${queryLimit}` : ''}`
178      );
179  
180      if (candidates.length === 0) {
181        info('No sites to check for screenshot capture');
182        return {
183          processed: 0,
184          succeeded: 0,
185          failed: 0,
186          skipped: 0,
187          duration: Date.now() - startTime,
188        };
189      }
190  
191      info(`Checking ${candidates.length} candidate sites (limit: ${options.limit || 'none'})`);
192  
193      // Filter out blocklisted sites (directories/social media/franchises)
194      let ignoredCount = 0;
195      const blockedSiteIds = new Set();
196      for (const site of candidates) {
197        const blocked = checkBlocklist(site.domain, site.country_code);
198        if (blocked) {
199          await run(
200            `UPDATE sites SET status = 'ignored', error_message = $1, html_dom = NULL WHERE id = $2`,
201            [blocked.reason, site.id]
202          );
203          blockedSiteIds.add(site.id);
204          ignoredCount++;
205        }
206      }
207  
208      if (ignoredCount > 0) {
209        info(`Marked ${ignoredCount} sites as ignored (directories/social media)`);
210      }
211  
212      // Filter out blocked sites from candidates
213      const nonBlockedCandidates = candidates.filter(site => !blockedSiteIds.has(site.id));
214  
215      // Filter to only sites with missing html_dom or missing cropped screenshots
216      // Stop early once we've found enough sites (if limit specified)
217      info(
218        `Checking ${nonBlockedCandidates.length} sites for missing html_dom or missing cropped screenshots...`
219      );
220      const sitesNeedingCapture = [];
221      let cleanedUpCount = 0;
222  
223      for (const site of nonBlockedCandidates) {
224        // Early exit if we've found enough sites
225        if (options.limit && sitesNeedingCapture.length >= options.limit) {
226          info(`Found ${options.limit} sites needing capture, stopping search`);
227          break;
228        }
229  
230        let needsCapture = false;
231        const reasons = [];
232  
233        // Check if html_dom is missing
234        if (!site.html_dom || site.html_dom.trim() === '') {
235          needsCapture = true;
236          reasons.push('missing html_dom');
237        }
238  
239        // Check if screenshot_path is missing
240        if (!site.screenshot_path) {
241          needsCapture = true;
242          reasons.push('no screenshot_path');
243        } else {
244          // Check if the 3 essential cropped screenshots exist
245          const { exists, missing } = await croppedScreenshotsExist(site.screenshot_path);
246          if (!exists) {
247            needsCapture = true;
248            reasons.push(`missing ${missing.length}/3 cropped screenshots: ${missing.join(', ')}`);
249  
250            // Cleanup: Reset screenshot_path if files don't exist (legacy bug fix)
251            // This prevents sites from being stuck with invalid screenshot_path values
252            await run('UPDATE sites SET screenshot_path = NULL WHERE id = $1', [site.id]);
253            cleanedUpCount++;
254          }
255        }
256  
257        if (needsCapture) {
258          if (reasons.length > 0) {
259            info(`  Site ${site.id} needs capture: ${reasons.join(', ')}`);
260          }
261          sitesNeedingCapture.push(site);
262        }
263      }
264  
265      if (cleanedUpCount > 0) {
266        info(`Cleaned up ${cleanedUpCount} sites with invalid screenshot_path values`);
267      }
268  
269      const sites = sitesNeedingCapture;
270  
271      if (sites.length === 0) {
272        info('No sites need screenshot capture (all have complete screenshots)');
273        return {
274          processed: 0,
275          succeeded: 0,
276          failed: 0,
277          skipped: 0,
278          duration: Date.now() - startTime,
279        };
280      }
281  
282      info(
283        `Capturing screenshots for ${sites.length} sites (concurrency: ${concurrency}, sequential by default to avoid timeouts)`
284      );
285  
286      const stats = {
287        processed: 0,
288        succeeded: 0,
289        failed: 0,
290        skipped: 0,
291      };
292  
293      // Process sites in batches with dynamic concurrency
294      const { results, errors } = await processBatch(
295        sites,
296        (site, index) => {
297          displayProgress(index + 1, sites.length, `Capturing ${site.url}`);
298          return withTimeout(
299            captureSiteScreenshots(site.id, site.url, site.country_code),
300            120000,
301            `Capture timed out after 120s for ${site.url}`
302          );
303        },
304        {
305          concurrency,
306          getDynamicConcurrency: () => getAdaptiveConcurrencyFast(1, 3, 'BROWSER_CONCURRENCY'),
307        }
308      );
309  
310      // Count successes and failures
311      stats.processed = sites.length;
312      stats.succeeded = results.filter(r => r !== null).length;
313      stats.failed = errors.length;
314  
315      // Log errors
316      for (const err of errors) {
317        const url = err.item?.url || 'unknown site';
318        const message = err.error?.message || err.toString();
319        error(`  Failed to capture ${url}: ${message}`);
320      }
321  
322      stats.duration = Date.now() - startTime;
323      generateStageCompletion('Assets', stats);
324  
325      return stats;
326    } catch (err) {
327      error(`Assets stage failed: ${err.message}`);
328      throw err;
329    }
330  }
331  
/**
 * Capture HTML (plus status code, selected headers, locale hints and performance timings)
 * for a single URL.
 *
 * @param {string} url - Absolute URL to load
 * @param {import('playwright').BrowserContext|null} sharedContext - Reuse existing context if provided.
 *   When provided the caller owns the context lifecycle (open/close). When null a fresh
 *   browser+context is created and torn down internally (legacy path, used by backfill).
 * @returns {Promise<{html: string, httpStatusCode: number, sslStatus: string,
 *   httpHeaders: string|null, localeData: string|null, perfData: Object|null}>}
 * @throws {TypeError} If `url` is not a valid absolute URL
 * @throws {Error} If browser setup or navigation fails, or navigation yields no response
 */
async function captureHtmlOnly(url, sharedContext = null) {
  const parsedUrl = new URL(url);
  const domain = parsedUrl.hostname;
  logger.info(`Capturing HTML for ${domain}...`);

  const t0 = Date.now();
  let ownedBrowser = null;
  let ownedContext = null;
  let page = null;

  // Everything that acquires a resource lives inside the try so the finally block
  // releases whatever was created, even if setup fails part-way through.
  try {
    let ctx = sharedContext;
    if (!ctx) {
      ownedBrowser = await launchBrowser({ headless: true });
      ownedContext = await createStealthContext(ownedBrowser);
      ctx = ownedContext;
    }

    page = await ctx.newPage();

    const tNav = Date.now();
    // Navigate to URL — domcontentloaded fires as soon as HTML is parsed; no need to wait for images/JS
    const response = await page.goto(url, {
      waitUntil: 'domcontentloaded',
      timeout: 30000,
    });
    logger.debug(`[assets] nav ${Date.now() - tNav}ms for ${domain}`);

    // Playwright can resolve goto() with null; fail with a readable message instead of a TypeError.
    if (!response) {
      throw new Error(`No HTTP response received for ${url}`);
    }

    const httpStatusCode = response.status();

    // "SSL status" here is just the URL scheme; the URL was already validated above.
    const sslStatus = parsedUrl.protocol === 'https:' ? 'https' : 'http';

    // Capture a fixed subset of HTTP headers (best-effort)
    let httpHeaders = null;
    try {
      const headers = response.headers();
      httpHeaders = JSON.stringify({
        'strict-transport-security': headers['strict-transport-security'] || null,
        'content-security-policy': headers['content-security-policy'] || null,
        'x-frame-options': headers['x-frame-options'] || null,
        'x-content-type-options': headers['x-content-type-options'] || null,
        server: headers['server'] || null,
        'x-powered-by': headers['x-powered-by'] || null,
        'content-encoding': headers['content-encoding'] || null,
        'cache-control': headers['cache-control'] || null,
        'content-language': headers['content-language'] || null,
      });
    } catch (e) {
      logger.warn(`Failed to capture HTTP headers: ${e.message}`);
    }

    // NOTE: removed waitForTimeout(1000) — domcontentloaded already ensures HTML is parsed.
    // JS frameworks that inject content after DOMContentLoaded are handled by the scoring LLM
    // which reads the visible text; we don't need a JS-rendered DOM for HTML-only mode.

    const tContent = Date.now();
    const html = await page.content();
    logger.debug(`[assets] content ${Date.now() - tContent}ms for ${domain}`);

    // Capture locale data (best-effort)
    let localeData = null;
    try {
      /* eslint-disable no-undef -- document is available in browser context */
      const data = await page.evaluate(() => ({
        htmlLang: document.documentElement.lang || null,
        hreflangs: Array.from(document.querySelectorAll('link[rel="alternate"][hreflang]'))
          .map(el => ({
            hreflang: el.getAttribute('hreflang'),
            href: el.getAttribute('href'),
          }))
          .filter(link => link.hreflang),
      }));
      /* eslint-enable no-undef */
      localeData = JSON.stringify(data);
    } catch (e) {
      logger.warn(`Failed to capture locale data: ${e.message}`);
    }

    // Capture performance timing data (Core Web Vitals / page speed), best-effort
    let perfData = null;
    try {
      // The callback runs inside the page, so it must be self-contained.
      perfData = await page.evaluate(() => {
        const nav = performance.getEntriesByType('navigation')[0];
        const paint = performance.getEntriesByType('paint');
        const resources = performance.getEntriesByType('resource');

        // Count resources by type and total their transfer size in one pass
        const resourceCounts = {};
        let totalTransferSize = nav ? nav.transferSize : 0;
        for (const r of resources) {
          const type = r.initiatorType || 'other';
          resourceCounts[type] = (resourceCounts[type] || 0) + 1;
          totalTransferSize += r.transferSize || 0;
        }

        const paintTime = name => {
          const entry = paint.find(p => p.name === name);
          return entry ? Math.round(entry.startTime) : null;
        };

        return {
          loadTime: nav ? Math.round(nav.loadEventEnd - nav.startTime) : null,
          domContentLoaded: nav ? Math.round(nav.domContentLoadedEventEnd - nav.startTime) : null,
          firstPaint: paintTime('first-paint'),
          firstContentfulPaint: paintTime('first-contentful-paint'),
          transferSize: nav ? nav.transferSize : null,
          totalTransferSize,
          domInteractive: nav ? Math.round(nav.domInteractive - nav.startTime) : null,
          resourceCount: resources.length,
          resourceCounts,
        };
      });

      logger.debug(`[assets] perf: load=${perfData.loadTime}ms, DCL=${perfData.domContentLoaded}ms, FCP=${perfData.firstContentfulPaint}ms for ${domain}`);
    } catch (e) {
      logger.warn(`Failed to capture performance data: ${e.message}`);
    }

    logger.debug(`[assets] total ${Date.now() - t0}ms for ${domain}`);
    logger.success(`Captured HTML for ${domain}`);

    return { html, httpStatusCode, sslStatus, httpHeaders, localeData, perfData };
  } finally {
    if (page) {
      await page.close().catch(() => {});
    }
    // Only tear down if we own the browser/context
    if (ownedContext) {
      await ownedContext.close().catch(() => {});
    }
    if (ownedBrowser) {
      await ownedBrowser.close().catch(() => {});
    }
  }
}
474  
/**
 * Capture assets for a single site and persist the outcome.
 *
 * With vision enabled this captures screenshots + HTML via captureWebsite(); with
 * ENABLE_VISION='false' it captures HTML only via captureHtmlOnly(). HTML is written to
 * the filesystem and the sites row is updated in a transaction. Any failure is passed to
 * recordFailure() and then rethrown.
 *
 * @param {number} siteId - Site ID
 * @param {string} url - Site URL
 * @param {string} countryCode - Site country code
 * @param {import('playwright').BrowserContext|null} sharedContext - Optional shared browser
 *   context (only used on the HTML-only path)
 * @returns {Promise<boolean>} Resolves to true on success
 * @throws {Error} On error pages, non-2xx/3xx responses, empty HTML, or missing screenshots
 */
async function captureSiteScreenshots(siteId, url, countryCode, sharedContext = null) {
  // Check ENABLE_VISION flag
  const ENABLE_VISION = process.env.ENABLE_VISION !== 'false';

  try {
    // Vision: screenshots + HTML. Otherwise HTML only, reusing the shared context if provided.
    const result = ENABLE_VISION
      ? await captureWebsite(url)
      : await captureHtmlOnly(url, sharedContext);

    // Check for false-positive error pages (e.g., 403 rendered as HTML with 200 status)
    if (result.html && result.httpStatusCode) {
      const errorDetection = detectErrorPage(result.html, result.httpStatusCode);

      if (errorDetection.isErrorPage) {
        // This is a false-positive error page - schedule retry in 7 days
        const errorMsg = `False-positive error page: ${errorDetection.indicator} (${errorDetection.wordCount} words, HTTP ${result.httpStatusCode})`;

        await run(
          `UPDATE sites
           SET status = 'found',
               error_message = $1,
               recapture_at = NOW() + INTERVAL '7 days'
           WHERE id = $2`,
          [errorMsg, siteId]
        );

        logger.info(`  ${url}: ${errorMsg} - scheduled retry in 7 days`);
        throw new Error(errorMsg);
      }
    }

    // Save screenshots to file system if vision enabled
    let screenshotPath = null;
    if (ENABLE_VISION && result.screenshots) {
      const screenshotData = {
        desktop_above: result.screenshots.desktop_above,
        desktop_below: result.screenshots.desktop_below,
        mobile_above: result.screenshots.mobile_above,
        desktop_above_uncropped: result.screenshotsUncropped.desktop_above,
        desktop_below_uncropped: result.screenshotsUncropped.desktop_below,
        mobile_above_uncropped: result.screenshotsUncropped.mobile_above,
      };

      screenshotPath = await saveScreenshots(siteId, screenshotData);

      // Validate that screenshots were actually written to disk
      const { exists, missing } = await croppedScreenshotsExist(screenshotPath);
      if (!exists) {
        throw new Error(
          `Screenshot validation failed: missing ${missing.length}/3 files (${missing.join(', ')})`
        );
      }
    }

    // HTTP status counts as success for 2xx and 3xx
    const isSuccess = result.httpStatusCode >= 200 && result.httpStatusCode < 400;

    // Validate that html_dom was actually captured - without it, scoring has nothing to work with
    if (isSuccess && (!result.html || result.html.trim() === '')) {
      throw new Error(
        `HTML DOM capture failed: page returned HTTP ${result.httpStatusCode} but html content is ${result.html === null ? 'null' : 'empty'}`
      );
    }

    const hasScreenshots = screenshotPath !== null;

    // The capture only counts as successful when none of the post-write checks below
    // will throw: a good HTTP status and, if vision is enabled, screenshots on disk.
    const captureSucceeded = isSuccess && (hasScreenshots || !ENABLE_VISION);

    // Derive language_code using multi-signal detection (Content-Language header →
    // hreflangs → htmlLang with template-default detection → country fallback)
    const languageCode = deriveLanguageCode(
      countryCode,
      result.localeData,
      result.httpHeaders,
      getCountryByCode
    );

    // Write HTML to filesystem (not DB) — reduces DB bloat by ~7 GB
    // Write file first, then set 'fs' flag in DB. If file write fails, the error
    // propagates and DB isn't updated, keeping the site in a retryable state.
    if (result.html) {
      writeHtmlDom(siteId, result.html);
    }

    // Detect ad platform pixels from HTML (Google Ads, Meta, Bing, call tracking)
    const adResult = result.html ? detectAdsFromHtml(result.html) : null;

    // Only a fully successful capture advances the site; anything else stays 'found'
    // so the failure handling in the catch block starts from a consistent state.
    const newStatus = captureSucceeded ? 'assets_captured' : 'found';

    // Wrap the DB updates for this site in a transaction so status, screenshot_path
    // and html_dom are always consistent — no partial writes on crash.
    await withTransaction(async (client) => {
      await client.query(
        `UPDATE sites SET
          screenshot_path = $1,
          html_dom = $2,
          http_status_code = $3,
          ssl_status = $4,
          http_headers = $5,
          locale_data = $6,
          language_code = $7,
          perf_json = $8,
          status = $9,
          assets_captured_at = CASE WHEN $10 = 'assets_captured' THEN NOW() ELSE assets_captured_at END,
          error_message = NULL,
          recapture_at = NULL
         WHERE id = $11`,
        [
          screenshotPath,
          result.html ? 'fs' : null, // Flag: 'fs' = stored on filesystem
          result.httpStatusCode,
          result.sslStatus,
          result.httpHeaders,
          result.localeData,
          languageCode,
          result.perfData ? JSON.stringify(result.perfData) : null,
          newStatus,
          newStatus, // second occurrence for assets_captured_at CASE expression
          siteId,
        ]
      );

      // Store ad detection signals
      if (adResult) {
        await client.query(
          `UPDATE sites SET is_running_ads = $1, ad_signals = $2, ad_signals_updated_at = NOW() WHERE id = $3`,
          [adResult.is_running_ads, JSON.stringify(adResult.signals), siteId]
        );
      }

      // Failed captures must not reset the retry counter (recordFailure increments it in
      // the catch block below) and must not be counted as scraped for the keyword.
      if (!captureSucceeded) {
        return;
      }

      // NOTE(review): resetRetries/incrementAssetsScraped are not handed `client`, so they
      // presumably run outside this transaction — confirm against their implementations.
      await resetRetries(siteId);

      // Increment keyword counter
      const siteRow = await client.query(
        'SELECT keyword, country_code FROM sites WHERE id = $1',
        [siteId]
      );
      const site = siteRow.rows[0];
      if (site?.keyword && site?.country_code) {
        await incrementAssetsScraped(site.keyword, site.country_code);
      }
    });

    if (!isSuccess) {
      throw new Error(`HTTP ${result.httpStatusCode} - Cannot capture assets for error response`);
    }

    if (!hasScreenshots && ENABLE_VISION) {
      throw new Error(
        `Vision enabled but screenshots not captured - result.screenshots is ${result.screenshots ? 'empty' : 'null/undefined'}`
      );
    }

    if (hasScreenshots) {
      success(`  Captured screenshots for ${url} -> ${screenshotPath}/`);
    } else {
      success(`  Captured HTML for ${url} (screenshots disabled, using HTML DOM for scoring)`);
    }
    return true;
  } catch (err) {
    // Record failure and increment retry count (marks as 'failing' if limit exceeded)
    // Note: Error page detection (false positives) use recapture_at separately
    await recordFailure(siteId, 'assets', err, 'found');
    throw err;
  }
}
659  
/**
 * Fetch aggregate asset-capture counts across the whole sites table.
 * @returns {Promise<Object>} Row with total_sites, sites_with_screenshots, captured_sites,
 *   pending_capture and failed_capture
 */
export async function getAssetsStats() {
  // Single pass over `sites`; each COUNT(CASE ...) tallies one category.
  const statsQuery = `SELECT
      COUNT(id) as total_sites,
      COUNT(CASE WHEN screenshot_path IS NOT NULL THEN 1 END) as sites_with_screenshots,
      COUNT(CASE WHEN status = 'assets_captured' THEN 1 END) as captured_sites,
      COUNT(CASE WHEN status = 'found' AND screenshot_path IS NULL THEN 1 END) as pending_capture,
      COUNT(CASE WHEN status = 'found' AND error_message IS NOT NULL THEN 1 END) as failed_capture
     FROM sites`;

  const statsRow = await getOne(statsQuery);
  return statsRow;
}
675  
/**
 * Backfill missing screenshots for existing sites.
 *
 * First checks whether any 'found' site lacks a screenshot_path; if none do, returns an
 * empty result without starting the stage. Otherwise delegates to runAssetsStage(), which
 * runs its own candidate query and honours ENABLE_VISION.
 *
 * @param {number} limit - Limit number of sites to process
 * @returns {Promise<Object>} Backfill results: { processed, succeeded, failed, skipped, duration }
 */
export async function backfillScreenshots(limit = 10) {
  const startTime = Date.now();
  info(`Backfilling screenshots for up to ${limit} sites...`);

  // Get sites with missing screenshots (used only to decide whether the stage needs to run)
  const sites = await getAll(
    `SELECT id, landing_page_url as url
     FROM sites
     WHERE status = 'found'
       AND screenshot_path IS NULL
     LIMIT $1`,
    [limit]
  );

  if (sites.length === 0) {
    info('No sites need screenshot backfill');
    // Same shape as the results returned by runAssetsStage()
    return {
      processed: 0,
      succeeded: 0,
      failed: 0,
      skipped: 0,
      duration: Date.now() - startTime,
    };
  }

  return runAssetsStage({ limit, concurrency: 1 });
}