// assets.js
/**
 * Assets Stage
 * Captures screenshots and visual assets for sites
 */

import { run, getOne, getAll, withTransaction, getPool } from '../utils/db.js';
import { captureWebsite, launchBrowser, createStealthContext } from '../capture.js';
import Logger from '../utils/logger.js';
import { generateStageCompletion, displayProgress } from '../utils/summary-generator.js';
import { processBatch, withTimeout } from '../utils/error-handler.js';
import { saveScreenshots, croppedScreenshotsExist } from '../utils/screenshot-storage.js';
import { checkBlocklist } from '../utils/site-filters.js';
import { incrementAssetsScraped } from '../utils/keyword-counters.js';
// deduplicateSites removed — UNIQUE constraint on sites.domain handles this (DR-106)
import { detectErrorPage } from '../utils/error-page-detector.js';
import { writeHtmlDom, hasHtmlDom, deleteHtmlDom } from '../utils/html-storage.js';
import { detectAdsFromHtml } from '../utils/ad-detector.js';
import { recordFailure, resetRetries } from '../utils/retry-handler.js';
import { getAdaptiveConcurrencyFast } from '../utils/adaptive-concurrency.js';
import { getCountryByCode } from '../config/countries.js';
import { deriveLanguageCode } from '../utils/detect-language.js';

const logger = new Logger('Assets');
const success = (...args) => logger.success(...args);
const info = (...args) => logger.info(...args);
const error = (...args) => logger.error(...args);

// Per-site capture budget; the timeout message below spells out the same value in seconds.
const CAPTURE_TIMEOUT_MS = 120000;

/**
 * Validate the optional `limit` stage option.
 * Falsy values (undefined, null, 0, '') mean "no limit", matching the previous
 * `options.limit || null` behaviour. Anything else must be a positive integer,
 * so it is safe to bind as a SQL LIMIT and to compare against array lengths.
 * @param {number|string|null|undefined} limit - Raw limit from the caller
 * @returns {number|null} Positive integer limit, or null for "no limit"
 * @throws {RangeError} If the value is not a positive integer
 */
function normalizeLimit(limit) {
  if (!limit) {
    return null;
  }
  const parsed = Number(limit);
  if (!Number.isInteger(parsed) || parsed <= 0) {
    throw new RangeError(`Invalid limit: ${limit} (expected a positive integer)`);
  }
  return parsed;
}

/**
 * Build the stats object returned when a run has nothing to process.
 * @param {number} startTime - Epoch milliseconds at which the run started
 * @returns {{processed: number, succeeded: number, failed: number, skipped: number, duration: number}}
 */
function emptyStageStats(startTime) {
  return {
    processed: 0,
    succeeded: 0,
    failed: 0,
    skipped: 0,
    duration: Date.now() - startTime,
  };
}

/**
 * Run a SELECT, appending a bound `LIMIT $1` only when a limit is set.
 * The limit is passed as a query parameter rather than interpolated into the SQL text.
 * @param {string} sql - SELECT statement without a LIMIT clause
 * @param {number|null} limit - Row limit, or null for no limit
 * @returns {Promise<Array<Object>>} Rows returned by getAll
 */
function queryWithOptionalLimit(sql, limit) {
  if (limit === null) {
    return getAll(sql);
  }
  return getAll(`${sql}\n LIMIT $1`, [limit]);
}

/**
 * Mark every blocklisted candidate as 'ignored' and report which ids were blocked.
 * For each blocked site the row's html_dom flag is cleared and deleteHtmlDom is called
 * for the site id, so both capture modes clean up the same way.
 * @param {Array<{id: number, domain: string, country_code: string}>} candidates - Candidate site rows
 * @returns {Promise<Set<number>>} Ids of the sites that were marked as ignored
 */
async function ignoreBlocklistedSites(candidates) {
  const blockedIds = new Set();

  for (const site of candidates) {
    const blocked = checkBlocklist(site.domain, site.country_code);
    if (!blocked) {
      continue;
    }
    await run(
      `UPDATE sites SET status = 'ignored', error_message = $1, html_dom = NULL WHERE id = $2`,
      [blocked.reason, site.id]
    );
    deleteHtmlDom(site.id);
    blockedIds.add(site.id);
  }

  if (blockedIds.size > 0) {
    info(`Marked ${blockedIds.size} sites as ignored (directories/social media)`);
  }
  return blockedIds;
}

/**
 * Capture a list of sites through processBatch, with a per-site timeout.
 * @param {Array<{id: number, url: string, country_code: string}>} sites - Sites to capture
 * @param {number} concurrency - Initial number of concurrent captures
 * @param {string} progressPrefix - Text placed before the URL in the progress line
 * @param {import('playwright').BrowserContext|null} sharedContext - Optional context reused by HTML-only captures
 * @returns {Promise<{results: Array, errors: Array}>} Whatever processBatch resolves to
 */
function captureBatch(sites, concurrency, progressPrefix, sharedContext = null) {
  return processBatch(
    sites,
    async (site, index) => {
      displayProgress(index + 1, sites.length, `${progressPrefix}${site.url}`);
      return withTimeout(
        captureSiteScreenshots(site.id, site.url, site.country_code, sharedContext),
        CAPTURE_TIMEOUT_MS,
        `Capture timed out after 120s for ${site.url}`
      );
    },
    {
      concurrency,
      getDynamicConcurrency: () => getAdaptiveConcurrencyFast(1, 3, 'BROWSER_CONCURRENCY'),
    }
  );
}

/**
 * Log one line per failed capture.
 * @param {Array} errors - Error entries from processBatch (read via `item.url` and `error.message`)
 * @param {string} unknownLabel - Text used when the failing entry carries no URL
 */
function logCaptureErrors(errors, unknownLabel) {
  for (const err of errors) {
    const url = err.item?.url || unknownLabel;
    const message = err.error?.message || String(err);
    error(` Failed to capture ${url}: ${message}`);
  }
}

/**
 * HTML-only mode (ENABLE_VISION=false): capture the DOM HTML of 'found' sites
 * that have no html_dom yet, sharing one browser across the whole batch.
 * @param {number|null} limit - Maximum number of candidate rows, or null
 * @param {number} concurrency - Initial number of concurrent captures
 * @param {number} startTime - Epoch milliseconds at which the stage started
 * @returns {Promise<Object>} Stage stats (processed, succeeded, failed, skipped, duration)
 */
async function runHtmlOnlyCapture(limit, concurrency, startTime) {
  info('[assets] Vision disabled - capturing rendered DOM HTML via headless browser');

  // Deduplication no longer needed — UNIQUE constraint on sites.domain (DR-106)
  // prevents duplicates at insert time. Legacy deduplicateSites() removed.

  // Query found sites that don't yet have HTML content
  const htmlCandidates = await queryWithOptionalLimit(
    `SELECT id, domain, landing_page_url as url, country_code
     FROM sites
     WHERE status = 'found'
       AND status NOT IN ('ignored', 'failing')
       AND recapture_at IS NULL
       AND (html_dom IS NULL OR html_dom = '')`,
    limit
  );

  if (htmlCandidates.length === 0) {
    info('[assets] No sites need HTML capture');
    return emptyStageStats(startTime);
  }

  info(
    `[assets] Capturing DOM HTML for ${htmlCandidates.length} sites (concurrency: ${concurrency})`
  );

  const blockedIds = await ignoreBlocklistedSites(htmlCandidates);
  const htmlSites = htmlCandidates.filter(site => !blockedIds.has(site.id));

  // Every candidate was blocklisted: don't launch a browser for an empty batch.
  if (htmlSites.length === 0) {
    info('[assets] No sites need HTML capture');
    return emptyStageStats(startTime);
  }

  // Launch one shared browser for the whole batch — avoids per-site launch overhead (~500ms each).
  // Each concurrent worker gets its own page within the shared context.
  const sharedBrowser = await launchBrowser({ headless: true });
  let sharedContext = null;
  let htmlResults;
  let htmlErrors;
  try {
    // Created inside the try so the browser is still closed if context creation fails.
    sharedContext = await createStealthContext(sharedBrowser);
    ({ results: htmlResults, errors: htmlErrors } = await captureBatch(
      htmlSites,
      concurrency,
      'Capturing DOM: ',
      sharedContext
    ));
  } finally {
    // Best-effort teardown: a close failure must not mask the batch outcome.
    if (sharedContext) {
      await sharedContext.close().catch(() => {});
    }
    await sharedBrowser.close().catch(() => {});
  }

  const htmlStats = {
    processed: htmlSites.length,
    succeeded: htmlResults.filter(r => r !== null).length,
    failed: htmlErrors.length,
    skipped: 0,
  };
  logCaptureErrors(htmlErrors, 'unknown');

  htmlStats.duration = Date.now() - startTime;
  generateStageCompletion('Assets (HTML-only)', htmlStats);
  return htmlStats;
}

/**
 * Select the candidates that really need a capture: missing html_dom, missing
 * screenshot_path, or a screenshot_path whose cropped files are not on disk.
 * Rows whose files are missing get screenshot_path reset to NULL.
 * Stops as soon as `limit` sites have been collected.
 * @param {Array<Object>} candidates - Non-blocklisted candidate rows
 * @param {number|null} limit - Maximum number of sites to collect, or null
 * @returns {Promise<Array<Object>>} Sites that need capture
 */
async function findSitesNeedingCapture(candidates, limit) {
  const sitesNeedingCapture = [];
  let cleanedUpCount = 0;

  for (const site of candidates) {
    // Early exit if we've found enough sites
    if (limit !== null && sitesNeedingCapture.length >= limit) {
      info(`Found ${limit} sites needing capture, stopping search`);
      break;
    }

    const reasons = [];

    if (!site.html_dom || site.html_dom.trim() === '') {
      reasons.push('missing html_dom');
    }

    if (!site.screenshot_path) {
      reasons.push('no screenshot_path');
    } else {
      // Check if the 3 essential cropped screenshots exist
      const { exists, missing } = await croppedScreenshotsExist(site.screenshot_path);
      if (!exists) {
        reasons.push(`missing ${missing.length}/3 cropped screenshots: ${missing.join(', ')}`);

        // Cleanup: Reset screenshot_path if files don't exist (legacy bug fix)
        // This prevents sites from being stuck with invalid screenshot_path values
        await run('UPDATE sites SET screenshot_path = NULL WHERE id = $1', [site.id]);
        cleanedUpCount++;
      }
    }

    if (reasons.length > 0) {
      info(` Site ${site.id} needs capture: ${reasons.join(', ')}`);
      sitesNeedingCapture.push(site);
    }
  }

  if (cleanedUpCount > 0) {
    info(`Cleaned up ${cleanedUpCount} sites with invalid screenshot_path values`);
  }
  return sitesNeedingCapture;
}

/**
 * Vision mode: capture screenshots (and HTML) for sites that are missing them.
 * @param {number|null} limit - Maximum number of sites to capture, or null
 * @param {number} concurrency - Initial number of concurrent captures
 * @param {number} startTime - Epoch milliseconds at which the stage started
 * @returns {Promise<Object>} Stage stats (processed, succeeded, failed, skipped, duration)
 */
async function runScreenshotCapture(limit, concurrency, startTime) {
  info('[assets] Vision enabled - capturing screenshots');

  // Deduplication no longer needed — UNIQUE constraint on sites.domain (DR-106)

  // Pre-filter in SQL for sites with missing html_dom or missing screenshot_path.
  // Cropped screenshot files still have to be checked afterwards (file existence
  // can't be checked in SQL), so fetch a small multiple of the limit
  // (reduced from 50x to 3x for performance).
  const queryLimit = limit === null ? null : limit * 3;
  const candidates = await queryWithOptionalLimit(
    `SELECT id, domain, landing_page_url as url, screenshot_path, html_dom, error_message, country_code
     FROM sites
     WHERE (status = 'found' OR status = 'assets_captured')
       AND status NOT IN ('ignored', 'failing')
       AND recapture_at IS NULL
       AND (
         html_dom IS NULL
         OR html_dom = ''
         OR screenshot_path IS NULL
       )
     ORDER BY CASE WHEN country_code IN ('AU','CA','GB','IE','IN','NZ','US','ZA') THEN 0 ELSE 1 END ASC`,
    queryLimit
  );

  if (candidates.length === 0) {
    info('No sites to check for screenshot capture');
    return emptyStageStats(startTime);
  }

  info(`Checking ${candidates.length} candidate sites (limit: ${limit ?? 'none'})`);

  // Filter out blocklisted sites (directories/social media/franchises)
  const blockedSiteIds = await ignoreBlocklistedSites(candidates);
  const nonBlockedCandidates = candidates.filter(site => !blockedSiteIds.has(site.id));

  info(
    `Checking ${nonBlockedCandidates.length} sites for missing html_dom or missing cropped screenshots...`
  );
  const sites = await findSitesNeedingCapture(nonBlockedCandidates, limit);

  if (sites.length === 0) {
    info('No sites need screenshot capture (all have complete screenshots)');
    return emptyStageStats(startTime);
  }

  info(
    `Capturing screenshots for ${sites.length} sites (concurrency: ${concurrency}, sequential by default to avoid timeouts)`
  );

  // Process sites in batches with dynamic concurrency
  const { results, errors } = await captureBatch(sites, concurrency, 'Capturing ');

  const stats = {
    processed: sites.length,
    succeeded: results.filter(r => r !== null).length,
    failed: errors.length,
    skipped: 0,
  };
  logCaptureErrors(errors, 'unknown site');

  stats.duration = Date.now() - startTime;
  generateStageCompletion('Assets', stats);
  return stats;
}

/**
 * Run the assets stage
 * @param {Object} options - Stage options
 * @param {number} options.limit - Limit number of sites to process (falsy = no limit)
 * @param {number} options.concurrency - Number of concurrent captures
 *   (default: BROWSER_CONCURRENCY env var, else 1)
 * @returns {Promise<Object>} Stage results (processed, succeeded, failed, skipped, duration)
 * @throws {RangeError} If options.limit is set but is not a positive integer
 */
export async function runAssetsStage(options = {}) {
  const startTime = Date.now();
  // Use concurrency=1 for browser captures to avoid resource exhaustion
  // Each capture launches a full browser instance, so parallel launches can timeout
  // Can be configured via BROWSER_CONCURRENCY env var; an unparsable value falls back to 1.
  const envConcurrency = Number.parseInt(process.env.BROWSER_CONCURRENCY || '1', 10);
  const concurrency = options.concurrency || envConcurrency || 1;

  try {
    info('Starting Assets Stage...');

    const limit = normalizeLimit(options.limit);

    // Check ENABLE_VISION flag (consolidates old flags)
    const ENABLE_VISION = process.env.ENABLE_VISION !== 'false';

    // Show deprecation warning if old flags are used
    const legacyFlags = [
      process.env.ENABLE_SCREENSHOT_CAPTURE,
      process.env.USE_COMPUTER_VISION_SCORING,
      process.env.USE_COMPUTER_VISION_RESCORING,
      process.env.USE_COMPUTER_VISION_ENRICHMENT,
    ];
    if (legacyFlags.some(flag => flag !== undefined)) {
      logger.warn(
        '[assets] WARN: Vision flags (ENABLE_SCREENSHOT_CAPTURE, USE_COMPUTER_VISION_*) are deprecated. Use ENABLE_VISION instead.'
      );
    }

    // `return await` keeps rejections inside this try so they are logged below.
    if (!ENABLE_VISION) {
      return await runHtmlOnlyCapture(limit, concurrency, startTime);
    }
    return await runScreenshotCapture(limit, concurrency, startTime);
  } catch (err) {
    error(`Assets stage failed: ${err.message}`);
    throw err;
  }
}

/**
 * Extract the response headers this stage stores, as a JSON string.
 * @param {import('playwright').Response} response - Main navigation response
 * @returns {string|null} JSON object of selected headers, or null if reading them failed
 */
function collectHttpHeaders(response) {
  try {
    const headers = response.headers();
    return JSON.stringify({
      'strict-transport-security': headers['strict-transport-security'] || null,
      'content-security-policy': headers['content-security-policy'] || null,
      'x-frame-options': headers['x-frame-options'] || null,
      'x-content-type-options': headers['x-content-type-options'] || null,
      server: headers['server'] || null,
      'x-powered-by': headers['x-powered-by'] || null,
      'content-encoding': headers['content-encoding'] || null,
      'cache-control': headers['cache-control'] || null,
      'content-language': headers['content-language'] || null,
    });
  } catch (e) {
    logger.warn(`Failed to capture HTTP headers: ${e.message}`);
    return null;
  }
}

/**
 * Read the page's `<html lang>` and its hreflang alternate links.
 * @param {import('playwright').Page} page - Loaded page
 * @returns {Promise<string|null>} JSON string `{ htmlLang, hreflangs }`, or null on failure
 */
async function collectLocaleData(page) {
  try {
    /* eslint-disable no-undef -- document is available in browser context */
    const data = await page.evaluate(() => ({
      htmlLang: document.documentElement.lang || null,
      hreflangs: Array.from(document.querySelectorAll('link[rel="alternate"][hreflang]'))
        .map(el => ({
          hreflang: el.getAttribute('hreflang'),
          href: el.getAttribute('href'),
        }))
        .filter(link => link.hreflang),
    }));
    /* eslint-enable no-undef */
    return JSON.stringify(data);
  } catch (e) {
    logger.warn(`Failed to capture locale data: ${e.message}`);
    return null;
  }
}

/**
 * Collect navigation, paint and resource timing from the page's Performance API.
 * @param {import('playwright').Page} page - Loaded page
 * @param {string} domain - Hostname, used only in log lines
 * @returns {Promise<Object|null>} Timing summary, or null on failure
 */
async function collectPerfData(page, domain) {
  try {
    const perfData = await page.evaluate(() => {
      const nav = performance.getEntriesByType('navigation')[0];
      const paint = performance.getEntriesByType('paint');
      const resources = performance.getEntriesByType('resource');

      // Count resources by type
      const resourceCounts = {};
      for (const r of resources) {
        const type = r.initiatorType || 'other';
        resourceCounts[type] = (resourceCounts[type] || 0) + 1;
      }

      // Total transfer size from all resources
      let totalTransferSize = nav ? nav.transferSize : 0;
      for (const r of resources) {
        totalTransferSize += r.transferSize || 0;
      }

      return {
        loadTime: nav ? Math.round(nav.loadEventEnd - nav.startTime) : null,
        domContentLoaded: nav ? Math.round(nav.domContentLoadedEventEnd - nav.startTime) : null,
        // Math.round(undefined) is NaN, so a missing paint entry falls through to null.
        firstPaint: Math.round(paint.find(p => p.name === 'first-paint')?.startTime) || null,
        firstContentfulPaint:
          Math.round(paint.find(p => p.name === 'first-contentful-paint')?.startTime) || null,
        transferSize: nav ? nav.transferSize : null,
        totalTransferSize,
        domInteractive: nav ? Math.round(nav.domInteractive - nav.startTime) : null,
        resourceCount: resources.length,
        resourceCounts,
      };
    });

    logger.debug(
      `[assets] perf: load=${perfData.loadTime}ms, DCL=${perfData.domContentLoaded}ms, FCP=${perfData.firstContentfulPaint}ms for ${domain}`
    );
    return perfData;
  } catch (e) {
    logger.warn(`Failed to capture performance data: ${e.message}`);
    return null;
  }
}

/**
 * Capture HTML for a single URL.
 * @param {string} url
 * @param {import('playwright').BrowserContext|null} sharedContext - Reuse existing context if provided.
 *   When provided the caller owns the context lifecycle (open/close). When null a fresh
 *   browser+context is created and torn down internally (legacy path, used by backfill).
 * @returns {Promise<{html: string, httpStatusCode: number, sslStatus: string,
 *   httpHeaders: string|null, localeData: string|null, perfData: Object|null}>}
 * @throws {TypeError} If `url` is not a valid absolute URL
 * @throws {Error} If navigation fails, times out (30s) or yields no response
 */
async function captureHtmlOnly(url, sharedContext = null) {
  const parsedUrl = new URL(url);
  const domain = parsedUrl.hostname;
  logger.info(`Capturing HTML for ${domain}...`);

  const t0 = Date.now();
  let ownedBrowser = null;
  let ownedContext = null;
  let page = null;

  // Everything that acquires a resource happens inside the try, so a failure in
  // createStealthContext() or newPage() still closes what was already opened.
  try {
    let ctx = sharedContext;
    if (!ctx) {
      ownedBrowser = await launchBrowser({ headless: true });
      ownedContext = await createStealthContext(ownedBrowser);
      ctx = ownedContext;
    }
    page = await ctx.newPage();

    const tNav = Date.now();
    // Navigate to URL — domcontentloaded fires as soon as HTML is parsed; no need to wait for images/JS
    const response = await page.goto(url, {
      waitUntil: 'domcontentloaded',
      timeout: 30000,
    });
    logger.debug(`[assets] nav ${Date.now() - tNav}ms for ${domain}`);

    // page.goto() can resolve to null; fail with a clear message rather than a TypeError.
    if (!response) {
      throw new Error(`Navigation to ${url} returned no response`);
    }

    const httpStatusCode = response.status();

    // Derived from the URL scheme only; the certificate itself is not inspected here.
    const sslStatus = parsedUrl.protocol === 'https:' ? 'https' : 'http';

    const httpHeaders = collectHttpHeaders(response);

    // NOTE: removed waitForTimeout(1000) — domcontentloaded already ensures HTML is parsed.
    // JS frameworks that inject content after DOMContentLoaded are handled by the scoring LLM
    // which reads the visible text; we don't need a JS-rendered DOM for HTML-only mode.

    const tContent = Date.now();
    const html = await page.content();
    logger.debug(`[assets] content ${Date.now() - tContent}ms for ${domain}`);

    const localeData = await collectLocaleData(page);

    // Performance timing data (Core Web Vitals / page speed)
    const perfData = await collectPerfData(page, domain);

    logger.debug(`[assets] total ${Date.now() - t0}ms for ${domain}`);
    logger.success(`Captured HTML for ${domain}`);

    return { html, httpStatusCode, sslStatus, httpHeaders, localeData, perfData };
  } finally {
    // Best-effort teardown; close errors are deliberately ignored.
    if (page) {
      await page.close().catch(() => {});
    }
    // Only tear down if we own the browser/context
    if (ownedContext) {
      await ownedContext.close().catch(() => {});
    }
    if (ownedBrowser) {
      await ownedBrowser.close().catch(() => {});
    }
  }
}

/**
 * Capture screenshots for a single site
 *
 * Captures the page (screenshots + HTML when vision is enabled, HTML only otherwise),
 * writes the HTML through writeHtmlDom, and updates the site's row. Any failure is
 * passed to recordFailure and then rethrown.
 *
 * @param {number} siteId - Site ID
 * @param {string} url - Site URL
 * @param {string} countryCode - Site country code
 * @param {import('playwright').BrowserContext|null} sharedContext - Optional shared browser context
 *   (only used by the HTML-only path)
 * @returns {Promise<boolean>} Resolves to true on success
 * @throws {Error} On error pages, non-2xx/3xx responses, empty HTML or missing screenshots
 */
async function captureSiteScreenshots(siteId, url, countryCode, sharedContext = null) {
  // Re-read the flag here so the function behaves correctly whichever path calls it.
  const ENABLE_VISION = process.env.ENABLE_VISION !== 'false';

  try {
    // Vision: screenshots + HTML. Otherwise HTML only, reusing the shared context if provided.
    const result = ENABLE_VISION
      ? await captureWebsite(url)
      : await captureHtmlOnly(url, sharedContext);

    // Check for false-positive error pages (e.g., 403 rendered as HTML with 200 status)
    if (result.html && result.httpStatusCode) {
      const errorDetection = detectErrorPage(result.html, result.httpStatusCode);

      if (errorDetection.isErrorPage) {
        // This is a false-positive error page - schedule retry in 7 days
        const errorMsg = `False-positive error page: ${errorDetection.indicator} (${errorDetection.wordCount} words, HTTP ${result.httpStatusCode})`;

        await run(
          `UPDATE sites
           SET status = 'found',
               error_message = $1,
               recapture_at = NOW() + INTERVAL '7 days'
           WHERE id = $2`,
          [errorMsg, siteId]
        );

        logger.info(` ${url}: ${errorMsg} - scheduled retry in 7 days`);
        throw new Error(errorMsg);
      }
    }

    // Save screenshots to file system if vision enabled
    let screenshotPath = null;
    if (ENABLE_VISION && result.screenshots) {
      const screenshotData = {
        desktop_above: result.screenshots.desktop_above,
        desktop_below: result.screenshots.desktop_below,
        mobile_above: result.screenshots.mobile_above,
        desktop_above_uncropped: result.screenshotsUncropped.desktop_above,
        desktop_below_uncropped: result.screenshotsUncropped.desktop_below,
        mobile_above_uncropped: result.screenshotsUncropped.mobile_above,
      };

      screenshotPath = await saveScreenshots(siteId, screenshotData);

      // Validate that screenshots were actually written to disk
      const { exists, missing } = await croppedScreenshotsExist(screenshotPath);
      if (!exists) {
        throw new Error(
          `Screenshot validation failed: missing ${missing.length}/3 files (${missing.join(', ')})`
        );
      }
    }

    // A 2xx or 3xx response counts as a successful fetch.
    const isSuccess = result.httpStatusCode >= 200 && result.httpStatusCode < 400;

    // Validate that html_dom was actually captured - without it, scoring has nothing to work with
    if (isSuccess && (!result.html || result.html.trim() === '')) {
      throw new Error(
        `HTML DOM capture failed: page returned HTTP ${result.httpStatusCode} but html content is ${result.html === null ? 'null' : 'empty'}`
      );
    }

    const hasScreenshots = screenshotPath !== null;

    // True only when none of the post-transaction checks below will throw.
    const captureSucceeded = isSuccess && (hasScreenshots || !ENABLE_VISION);

    // Derive language_code using multi-signal detection (Content-Language header →
    // hreflangs → htmlLang with template-default detection → country fallback)
    const languageCode = deriveLanguageCode(
      countryCode,
      result.localeData,
      result.httpHeaders,
      getCountryByCode
    );

    // Write HTML to filesystem (not DB) — reduces DB bloat by ~7 GB
    // Write file first, then set 'fs' flag in DB. If file write fails, the error
    // propagates and DB isn't updated, keeping the site in a retryable state.
    if (result.html) {
      writeHtmlDom(siteId, result.html);
    }

    // Detect ad platform pixels from HTML (Google Ads, Meta, Bing, call tracking)
    const adResult = result.html ? detectAdsFromHtml(result.html) : null;

    // Mark as assets_captured if HTTP status is successful (2xx or 3xx)
    // Scoring stage can work with or without screenshots (uses HTML DOM as fallback)
    const newStatus = isSuccess ? 'assets_captured' : 'found';

    // Group this site's DB updates in one transaction so status, screenshot_path and
    // html_dom stay consistent — no partial writes on crash.
    // NOTE(review): resetRetries() and incrementAssetsScraped() are not given `client`,
    // so they presumably run outside this transaction — confirm against their implementations.
    await withTransaction(async (client) => {
      await client.query(
        `UPDATE sites SET
           screenshot_path = $1,
           html_dom = $2,
           http_status_code = $3,
           ssl_status = $4,
           http_headers = $5,
           locale_data = $6,
           language_code = $7,
           perf_json = $8,
           status = $9,
           assets_captured_at = CASE WHEN $10 = 'assets_captured' THEN NOW() ELSE assets_captured_at END,
           error_message = NULL,
           recapture_at = NULL
         WHERE id = $11`,
        [
          screenshotPath,
          result.html ? 'fs' : null, // Flag: 'fs' = stored on filesystem
          result.httpStatusCode,
          result.sslStatus,
          result.httpHeaders,
          result.localeData,
          languageCode,
          result.perfData ? JSON.stringify(result.perfData) : null,
          newStatus,
          newStatus, // second occurrence for assets_captured_at CASE expression
          siteId,
        ]
      );

      // Store ad detection signals
      if (adResult) {
        await client.query(
          `UPDATE sites SET is_running_ads = $1, ad_signals = $2, ad_signals_updated_at = NOW() WHERE id = $3`,
          [adResult.is_running_ads, JSON.stringify(adResult.signals), siteId]
        );
      }

      // Only a genuinely successful capture resets the retry counter and bumps the
      // keyword counter. Doing this for HTTP errors or missing screenshots would wipe
      // the retry count just before recordFailure() increments it, and would count
      // failed captures as scraped assets.
      if (captureSucceeded) {
        await resetRetries(siteId);

        const siteRow = await client.query(
          'SELECT keyword, country_code FROM sites WHERE id = $1',
          [siteId]
        );
        const site = siteRow.rows[0];
        if (site?.keyword && site?.country_code) {
          await incrementAssetsScraped(site.keyword, site.country_code);
        }
      }
    });

    if (!isSuccess) {
      throw new Error(`HTTP ${result.httpStatusCode} - Cannot capture assets for error response`);
    }

    if (!hasScreenshots && ENABLE_VISION) {
      throw new Error(
        `Vision enabled but screenshots not captured - result.screenshots is ${result.screenshots ? 'empty' : 'null/undefined'}`
      );
    }

    if (hasScreenshots) {
      success(` Captured screenshots for ${url} -> ${screenshotPath}/`);
    } else {
      success(` Captured HTML for ${url} (screenshots disabled, using HTML DOM for scoring)`);
    }
    return true;
  } catch (err) {
    // Record failure and increment retry count (marks as 'failing' if limit exceeded)
    // Note: Error page detection (false positives) use recapture_at separately
    await recordFailure(siteId, 'assets', err, 'found');
    throw err;
  }
}

/**
 * Get assets statistics
 * @returns {Promise<Object>} Row with total_sites, sites_with_screenshots, captured_sites,
 *   pending_capture and failed_capture counts
 */
export async function getAssetsStats() {
  return getOne(
    `SELECT
       COUNT(id) as total_sites,
       COUNT(CASE WHEN screenshot_path IS NOT NULL THEN 1 END) as sites_with_screenshots,
       COUNT(CASE WHEN status = 'assets_captured' THEN 1 END) as captured_sites,
       COUNT(CASE WHEN status = 'found' AND screenshot_path IS NULL THEN 1 END) as pending_capture,
       COUNT(CASE WHEN status = 'found' AND error_message IS NOT NULL THEN 1 END) as failed_capture
     FROM sites`
  );
}

/**
 * Backfill missing screenshots for existing sites
 *
 * Checks whether any 'found' site lacks a screenshot_path and, if so, runs the
 * assets stage with concurrency 1. The stage selects its own candidates, so the
 * rows fetched here only serve as an "is there anything to do" check.
 *
 * @param {number} limit - Limit number of sites to process
 * @returns {Promise<Object>} Backfill results (same shape as runAssetsStage)
 */
export async function backfillScreenshots(limit = 10) {
  const startTime = Date.now();
  info(`Backfilling screenshots for up to ${limit} sites...`);

  // Get sites with missing screenshots
  const sites = await getAll(
    `SELECT id, landing_page_url as url
     FROM sites
     WHERE status = 'found'
       AND screenshot_path IS NULL
     LIMIT $1`,
    [limit]
  );

  if (sites.length === 0) {
    info('No sites need screenshot backfill');
    return emptyStageStats(startTime);
  }

  return runAssetsStage({ limit, concurrency: 1 });
}