#!/usr/bin/env node
/**
 * Vision vs HTML-Only Scoring Comparison (vision-scoring-comparison.js)
 *
 * Selects up to 500 sites from the production DB that have:
 *   - status IN ('enriched','proposals_drafted','outreach_sent')
 *   - score_json with factor_scores (existing HTML-only score)
 *   - screenshot_path pointing to existing desktop_above.jpg + mobile_above.jpg
 *
 * For each site, re-scores through callLLM() with the model named by the
 * SCORING_MODEL env var, attaching both screenshots as base64 images and using
 * prompts/CONVERSION-SCORING-VISION.md as the system prompt, then compares:
 *   - Time per site
 *   - Overall score delta
 *   - Grade change
 *   - Per-factor score deltas
 *
 * Writes results to reports/vision-comparison-YYYY-MM-DD.json (raw)
 * and reports/vision-comparison-YYYY-MM-DD.html (readable report)
 *
 * Usage:
 *   node scripts/vision-scoring-comparison.js [--limit 500] [--concurrency 3]
 *        [--sample N] [--capture] [--dry-run]
 *
 *   --sample N   quick test mode: score only the first N selected sites
 *   --capture    take fresh screenshots via src/capture.js instead of reading
 *                them from disk (forces concurrency 1)
 *
 * Requirements:
 *   - Whatever configuration callLLM() needs to reach the model
 *     (NOTE(review): defined in src/utils/llm-provider.js, not visible here)
 *   - Screenshots at screenshots/{site_id}/desktop_above.jpg, mobile_above.jpg
 *     (unless --capture is used)
 *   - Does NOT modify the production DB (opened read-only)
 */

import Database from 'better-sqlite3';
import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'fs';
import { join, dirname } from 'path';
import { fileURLToPath } from 'url';
import '../src/utils/load-env.js';
import { callLLM } from '../src/utils/llm-provider.js';
import { safeJsonParse } from '../src/utils/error-handler.js';
import { readHtmlDom } from '../src/utils/html-storage.js';
import { getScoreDataWithFallback } from '../src/utils/score-storage.js';

const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
const projectRoot = join(__dirname, '..');

// Vision scoring model — same as production scoring
const VISION_MODEL = process.env.SCORING_MODEL || 'openai/gpt-4o-mini';

// Config from args
const args = process.argv.slice(2);

/**
 * Return the token following `flag` on the command line, or `defaultVal`
 * when the flag is absent or is the last token.
 */
const getArg = (flag, defaultVal) => {
  const idx = args.indexOf(flag);
  if (idx === -1) return defaultVal;
  return args[idx + 1] !== undefined ? args[idx + 1] : defaultVal;
};

const hasFlag = flag => args.includes(flag);

/**
 * Read an integer option and fail fast when it is not an integer >= `min`.
 * An unvalidated NaN/0/negative value would otherwise reach the SQL LIMIT
 * or make the batch loop step by a non-positive amount.
 */
function getIntArg(flag, defaultVal, min) {
  const raw = getArg(flag, String(defaultVal));
  const parsed = Number.parseInt(raw, 10);
  if (!Number.isInteger(parsed) || parsed < min) {
    console.error(`Invalid value for ${flag}: "${raw}" (expected an integer >= ${min})`);
    process.exit(1);
  }
  return parsed;
}

const LIMIT = getIntArg('--limit', 500, 1);
const CONCURRENCY = getIntArg('--concurrency', 3, 1);
const DRY_RUN = hasFlag('--dry-run');
const SAMPLE_SIZE = getIntArg('--sample', 0, 0); // quick test mode (0 = disabled)
// --capture: use Playwright to take fresh screenshots instead of reading from disk
const CAPTURE_MODE = hasFlag('--capture');

// Paths
const DB_PATH = process.env.DATABASE_PATH || join(projectRoot, 'db/sites.db');
const SCREENSHOTS_BASE = process.env.SCREENSHOT_BASE_PATH || join(projectRoot, 'screenshots');
const VISION_PROMPT_PATH = join(projectRoot, 'prompts/CONVERSION-SCORING-VISION.md');
const REPORTS_DIR = join(projectRoot, 'reports');

// Load vision prompt (throws at startup if the prompt file is missing)
const VISION_PROMPT = readFileSync(VISION_PROMPT_PATH, 'utf-8');

// Factor weights (must match src/score.js FACTOR_WEIGHTS)
const FACTOR_WEIGHTS = {
  headline_quality: 0.15,
  value_proposition: 0.14,
  unique_selling_proposition: 0.13,
  call_to_action: 0.13,
  urgency_messaging: 0.1,
  hook_engagement: 0.09,
  trust_signals: 0.11,
  imagery_design: 0.08,
  offer_clarity: 0.04,
  contextual_appropriateness: 0.03,
};

// Ordered from highest to lowest so the first matching threshold wins.
const GRADE_THRESHOLDS = [
  { min: 97, grade: 'A+' },
  { min: 93, grade: 'A' },
  { min: 90, grade: 'A-' },
  { min: 87, grade: 'B+' },
  { min: 83, grade: 'B' },
  { min: 80, grade: 'B-' },
  { min: 77, grade: 'C+' },
  { min: 73, grade: 'C' },
  { min: 70, grade: 'C-' },
  { min: 67, grade: 'D+' },
  { min: 63, grade: 'D' },
  { min: 60, grade: 'D-' },
  { min: 0, grade: 'F' },
];

/**
 * Weighted sum of per-factor scores, multiplied by 10 and rounded to one decimal.
 * Factors missing from `factorScores` count as 0.
 * NOTE(review): presumably each factor score is on a 0–10 scale, making the
 * result 0–100 — confirm against src/score.js.
 * @param {Object} factorScores - map of factor name to `{ score }`
 * @returns {number|null} null when `factorScores` is not an object
 */
function computeScoreFromFactors(factorScores) {
  if (!factorScores || typeof factorScores !== 'object') return null;
  let total = 0;
  for (const [factor, weight] of Object.entries(FACTOR_WEIGHTS)) {
    const score = factorScores[factor]?.score ?? 0;
    total += score * weight;
  }
  return Math.round(total * 10 * 10) / 10;
}

/**
 * Map a numeric score to a letter grade using GRADE_THRESHOLDS.
 * Null, undefined, negative and NaN scores all yield 'F'.
 */
function computeGrade(score) {
  if (score === null || score === undefined || score < 0) return 'F';
  for (const { min, grade } of GRADE_THRESHOLDS) {
    if (score >= min) return grade;
  }
  return 'F';
}

/**
 * Select sites from DB with existing scores + screenshots.
 * Uses a lightweight query (no BLOB columns, no json_extract) for performance on large DBs.
 * Heavy columns (score_json, html_dom, http_headers) loaded per-site via loadSiteData().
 * @returns {Object[]} at most `limit` rows, in random order
 */
function selectSites(db, limit) {
  // Over-select then shuffle in JS — avoids ORDER BY RANDOM() full-scan on 10GB DB
  const rows = db
    .prepare(
      `SELECT
        s.id, s.domain, s.landing_page_url as url, s.screenshot_path,
        s.score, s.grade, s.country_code, s.status
      FROM sites s
      WHERE s.status IN ('enriched','proposals_drafted','outreach_sent')
        AND s.score_json IS NOT NULL
        AND s.screenshot_path IS NOT NULL
      ORDER BY s.updated_at DESC
      LIMIT ?`
    )
    .all(Math.min(limit * 3, 2000));

  // Fisher-Yates shuffle
  for (let i = rows.length - 1; i > 0; i--) {
    const j = Math.floor(Math.random() * (i + 1));
    [rows[i], rows[j]] = [rows[j], rows[i]];
  }

  return rows.slice(0, limit);
}

/**
 * Load heavy columns (score_json, html_dom, http_headers) for a single site.
 * Returns null if the row is missing, or score_json is missing or has no factor_scores.
 */
function loadSiteData(db, siteId) {
  const row = db.prepare(`SELECT score_json, http_headers FROM sites WHERE id = ?`).get(siteId);

  if (!row) return null;

  const scoreJson = getScoreDataWithFallback(siteId, row);

  if (!scoreJson?.factor_scores) return null;

  return {
    score_json: scoreJson,
    html_dom: readHtmlDom(siteId) || '',
    http_headers: row.http_headers,
    // Derive grade from score_json since the grade DB column is often empty
    derived_grade:
      scoreJson.overall_calculation?.letter_grade ??
      computeGrade(scoreJson.overall_calculation?.conversion_score),
  };
}

/**
 * Resolve the on-disk screenshot files for a site.
 * The directory name is the last '/'-separated segment of `screenshotPath`.
 * @returns {{desktop: string, mobile: string}}
 */
function screenshotPaths(screenshotPath) {
  const siteId = screenshotPath.split('/').pop();
  return {
    desktop: join(SCREENSHOTS_BASE, siteId, 'desktop_above.jpg'),
    mobile: join(SCREENSHOTS_BASE, siteId, 'mobile_above.jpg'),
  };
}

/**
 * Check if both screenshots exist on disk for a site
 */
function screenshotsExist(screenshotPath) {
  const { desktop, mobile } = screenshotPaths(screenshotPath);
  return existsSync(desktop) && existsSync(mobile);
}

/**
 * Capture fresh above-fold screenshots for a site using Playwright.
 * Returns { desktop: string (path), mobile: string (path) } or null on failure.
 * Writes JPEG files to a temp dir. Only used in --capture mode.
 */
const TEMP_DIR = join(projectRoot, '.vision-comparison-tmp');
let captureModule = null; // lazily imported so disk mode never loads src/capture.js
async function captureScreenshots(site) {
  try {
    if (!captureModule) {
      const mod = await import('../src/capture.js');
      captureModule = mod.captureWebsite ?? mod.default?.captureWebsite;
    }
    const result = await captureModule(site.url);
    if (!result?.screenshots?.desktop_above || !result?.screenshots?.mobile_above) return null;

    const siteDir = join(TEMP_DIR, String(site.id));
    mkdirSync(siteDir, { recursive: true });
    const desktopPath = join(siteDir, 'desktop_above.jpg');
    const mobilePath = join(siteDir, 'mobile_above.jpg');
    writeFileSync(desktopPath, result.screenshots.desktop_above);
    writeFileSync(mobilePath, result.screenshots.mobile_above);
    return { desktop: desktopPath, mobile: mobilePath };
  } catch (err) {
    console.warn(` [${site.domain}] Capture failed: ${err.message?.slice(0, 100)}`);
    return null;
  }
}

/**
 * Pretty-print stored HTTP headers for the prompt.
 * Malformed JSON is passed through as raw text rather than aborting the site.
 */
function formatHttpHeaders(rawHeaders) {
  if (!rawHeaders) return 'Not available';
  if (typeof rawHeaders !== 'string') return JSON.stringify(rawHeaders, null, 2);
  try {
    return JSON.stringify(JSON.parse(rawHeaders), null, 2);
  } catch {
    return rawHeaders;
  }
}

/**
 * Run vision scoring through callLLM() with base64 screenshots.
 * Uses the same model as production scoring (SCORING_MODEL env).
 * Never throws: file-read, request and parse problems all come back as `{ error }`.
 * @param {Object} site - Site data with id, url, domain, html_dom, http_headers
 * @param {Object} [imagePaths] - { desktop, mobile } file paths. If null, reads from screenshot_path.
 * @returns {Promise<Object>} parsed JSON result with overall_calculation, or error object.
 */
async function runVisionScoring(site, imagePaths = null) {
  try {
    const { desktop, mobile } = imagePaths || screenshotPaths(site.screenshot_path);

    // Reads stay inside the try: a screenshot removed after selection must
    // produce a recorded failure, not an unhandled rejection.
    const desktopBuf = readFileSync(desktop);
    const mobileBuf = readFileSync(mobile);

    const userContent = [
      {
        type: 'text',
        text: `Evaluate this website:\n\nURL: ${site.url}\nDomain: ${site.domain}\n\nHTTP Headers:\n${formatHttpHeaders(site.http_headers)}\n\nHTML DOM (first 50000 chars):\n${(site.html_dom || '').substring(0, 50000)}`,
      },
      {
        type: 'image_url',
        image_url: { url: `data:image/jpeg;base64,${desktopBuf.toString('base64')}`, detail: 'low' },
      },
      {
        type: 'image_url',
        image_url: { url: `data:image/jpeg;base64,${mobileBuf.toString('base64')}`, detail: 'low' },
      },
    ];

    const response = await callLLM({
      model: VISION_MODEL,
      messages: [
        { role: 'system', content: VISION_PROMPT },
        { role: 'user', content: userContent },
      ],
      temperature: 0.3,
      max_tokens: 4000,
      json_mode: true,
      stage: 'other',
      siteId: site.id,
    });

    const parsed = safeJsonParse(response.content);
    if (!parsed) {
      return { error: 'parse_failed', raw: response.content?.slice(0, 500) };
    }

    if (!parsed.factor_scores) {
      return { error: 'missing_factor_scores', raw: JSON.stringify(parsed).slice(0, 500) };
    }

    // Compute score/grade programmatically from factor scores
    // (overwrites any numbers the model produced itself)
    if (!parsed.overall_calculation) parsed.overall_calculation = {};
    const computedScore = computeScoreFromFactors(parsed.factor_scores);
    const computedGrade = computeGrade(computedScore);
    parsed.overall_calculation.conversion_score = computedScore;
    parsed.overall_calculation.letter_grade = computedGrade;

    return parsed;
  } catch (err) {
    return { error: err.message?.slice(0, 200) || 'unknown' };
  }
}

/**
 * Letter grade recorded in a score_json, falling back to the grade implied by
 * its numeric score; null when neither is available.
 */
function gradeFromScoreJson(scoreJson, total) {
  const stored = scoreJson.overall_calculation?.letter_grade;
  if (stored !== null && stored !== undefined) return stored;
  return total !== null ? computeGrade(total) : null;
}

/**
 * Compare two score_json objects, returning overall and per-factor deltas.
 * All deltas are vision minus HTML; a delta is null when either side is missing.
 */
function compareScores(htmlScore, visionScore) {
  const htmlFactors = htmlScore.factor_scores || {};
  const visionFactors = visionScore.factor_scores || {};

  const htmlTotal = htmlScore.overall_calculation?.conversion_score ?? null;
  const visionTotal = visionScore.overall_calculation?.conversion_score ?? null;
  const htmlGrade = gradeFromScoreJson(htmlScore, htmlTotal);
  const visionGrade = gradeFromScoreJson(visionScore, visionTotal);

  const factorDeltas = {};
  for (const factor of Object.keys(FACTOR_WEIGHTS)) {
    const h = htmlFactors[factor]?.score ?? null;
    const v = visionFactors[factor]?.score ?? null;
    factorDeltas[factor] = {
      html: h,
      vision: v,
      delta: h !== null && v !== null ? v - h : null,
    };
  }

  return {
    html_score: htmlTotal,
    vision_score: visionTotal,
    score_delta: htmlTotal !== null && visionTotal !== null ? visionTotal - htmlTotal : null,
    html_grade: htmlGrade,
    vision_grade: visionGrade,
    grade_changed: htmlGrade !== visionGrade,
    factor_deltas: factorDeltas,
  };
}

/**
 * Process sites in sequential chunks of `concurrency`, running each chunk in parallel.
 * @returns {Promise<{results: Array, errors: Array}>} fulfilled values and rejection reasons
 */
async function processBatch(sites, fn, concurrency) {
  // A non-positive or non-integer step would never advance the loop.
  const step = Number.isInteger(concurrency) && concurrency > 0 ? concurrency : 1;
  const results = [];
  const errors = [];

  for (let i = 0; i < sites.length; i += step) {
    const chunk = sites.slice(i, i + step);
    const settled = await Promise.allSettled(chunk.map(site => fn(site)));
    for (const r of settled) {
      if (r.status === 'fulfilled') results.push(r.value);
      else errors.push(r.reason);
    }
    // Progress
    const done = Math.min(i + step, sites.length);
    process.stdout.write(`\r Progress: ${done}/${sites.length} sites scored`);
  }
  console.log(); // newline after progress
  return { results, errors };
}

/**
 * Escape a value for safe interpolation into HTML text or a quoted attribute.
 * Domains and URLs come from the sites DB and must not be trusted as markup.
 */
function escapeHtml(value) {
  return String(value)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');
}

/** CSS class for a signed number: 'pos', 'neg', or '' for zero. */
function signClass(value) {
  if (value > 0) return 'pos';
  if (value < 0) return 'neg';
  return '';
}

/**
 * Generate the HTML report and write it to `outputPath`.
 * @param {Object} data - the same object that is serialized to the JSON report
 * @param {string} outputPath - destination .html path
 */
function generateHtmlReport(data, outputPath) {
  const {
    run_date,
    total_attempted,
    total_succeeded,
    total_failed,
    duration_seconds,
    summary_stats,
    sites,
  } = data;

  const gradeOrder = ['A+', 'A', 'A-', 'B+', 'B', 'B-', 'C+', 'C', 'C-', 'D+', 'D', 'D-', 'F'];
  const successSites = sites.filter(s => s.status === 'success');

  // Grade distribution comparison
  const htmlGradeDist = {};
  const visionGradeDist = {};
  for (const g of gradeOrder) {
    htmlGradeDist[g] = 0;
    visionGradeDist[g] = 0;
  }
  for (const s of successSites) {
    const { html_grade: hg, vision_grade: vg } = s.comparison;
    if (hg) htmlGradeDist[hg] = (htmlGradeDist[hg] || 0) + 1;
    if (vg) visionGradeDist[vg] = (visionGradeDist[vg] || 0) + 1;
  }

  // Factor delta summary
  const factorSummary = {};
  for (const factor of Object.keys(FACTOR_WEIGHTS)) {
    const deltas = successSites
      .map(s => s.comparison.factor_deltas[factor]?.delta)
      .filter(d => d !== null && d !== undefined);
    if (deltas.length === 0) continue;
    const avg = deltas.reduce((a, b) => a + b, 0) / deltas.length;
    const avgAbs = deltas.reduce((a, b) => a + Math.abs(b), 0) / deltas.length;
    factorSummary[factor] = {
      avg_delta: avg.toFixed(2),
      avg_abs_delta: avgAbs.toFixed(2),
      n: deltas.length,
      vision_higher: deltas.filter(d => d > 0).length,
      html_higher: deltas.filter(d => d < 0).length,
      same: deltas.filter(d => d === 0).length,
    };
  }

  // Score delta histogram buckets
  const scoreDeltasBuckets = {
    'Vision ≥+10': 0,
    'Vision +5 to +9.9': 0,
    'Vision +1 to +4.9': 0,
    'Within ±1': 0,
    'HTML +1 to +4.9': 0,
    'HTML +5 to +9.9': 0,
    'HTML ≥+10': 0,
  };
  for (const s of successSites) {
    const d = s.comparison.score_delta;
    if (d === null) continue;
    if (d >= 10) scoreDeltasBuckets['Vision ≥+10']++;
    else if (d >= 5) scoreDeltasBuckets['Vision +5 to +9.9']++;
    else if (d >= 1) scoreDeltasBuckets['Vision +1 to +4.9']++;
    else if (d > -1) scoreDeltasBuckets['Within ±1']++;
    else if (d > -5) scoreDeltasBuckets['HTML +1 to +4.9']++;
    else if (d > -10) scoreDeltasBuckets['HTML +5 to +9.9']++;
    else scoreDeltasBuckets['HTML ≥+10']++;
  }

  // Top divergent sites
  const divergent = successSites
    .filter(s => s.comparison.score_delta !== null)
    .sort((a, b) => Math.abs(b.comparison.score_delta) - Math.abs(a.comparison.score_delta))
    .slice(0, 20);

  const tableRows = divergent
    .map(
      s => `
      <tr>
        <td><a href="${escapeHtml(s.url ?? '')}" target="_blank" rel="noopener noreferrer">${escapeHtml(s.domain ?? '')}</a></td>
        <td>${escapeHtml(s.comparison.html_grade ?? '-')} (${s.comparison.html_score?.toFixed(1) ?? '-'})</td>
        <td>${escapeHtml(s.comparison.vision_grade ?? '-')} (${s.comparison.vision_score?.toFixed(1) ?? '-'})</td>
        <td class="${signClass(s.comparison.score_delta)}">${s.comparison.score_delta?.toFixed(1) ?? '-'}</td>
        <td>${s.time_ms ? `${(s.time_ms / 1000).toFixed(1)}s` : '-'}</td>
      </tr>`
    )
    .join('');

  // Sorted by average absolute delta, as the section heading states.
  const factorRows = Object.entries(factorSummary)
    .sort((a, b) => Number(b[1].avg_abs_delta) - Number(a[1].avg_abs_delta))
    .map(([factor, stats]) => {
      const avgDelta = Number(stats.avg_delta);
      return `
      <tr>
        <td>${factor.replace(/_/g, ' ')}</td>
        <td class="${signClass(avgDelta)}">${avgDelta > 0 ? '+' : ''}${stats.avg_delta}</td>
        <td>${stats.avg_abs_delta}</td>
        <td>${stats.vision_higher} / ${stats.html_higher} / ${stats.same}</td>
        <td>${(FACTOR_WEIGHTS[factor] * 100).toFixed(0)}%</td>
      </tr>`;
    })
    .join('');

  const gradeRows = gradeOrder
    .map(g => {
      const diff = visionGradeDist[g] - htmlGradeDist[g];
      return `
      <tr>
        <td><strong>${g}</strong></td>
        <td>${htmlGradeDist[g]}</td>
        <td>${visionGradeDist[g]}</td>
        <td class="${signClass(diff)}">${diff >= 0 ? '+' : ''}${diff}</td>
      </tr>`;
    })
    .join('');

  const bucketRows = Object.entries(scoreDeltasBuckets)
    .map(
      ([label, count]) => `
      <tr>
        <td>${label}</td>
        <td>${count}</td>
        <td>${successSites.length > 0 ? `${((count / successSites.length) * 100).toFixed(1)}%` : '-'}</td>
      </tr>`
    )
    .join('');

  const html = `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Vision vs HTML-Only Scoring Comparison — ${escapeHtml(run_date)}</title>
<style>
  body { font-family: system-ui, sans-serif; max-width: 1200px; margin: 0 auto; padding: 24px; color: #1a1a1a; }
  h1 { color: #2c3e50; border-bottom: 3px solid #3498db; padding-bottom: 12px; }
  h2 { color: #2c3e50; margin-top: 32px; }
  .stat-grid { display: grid; grid-template-columns: repeat(auto-fill, minmax(180px, 1fr)); gap: 16px; margin: 24px 0; }
  .stat-card { background: #f8f9fa; border-radius: 8px; padding: 16px; text-align: center; border: 1px solid #dee2e6; }
  .stat-card .value { font-size: 2em; font-weight: bold; color: #2c3e50; }
  .stat-card .label { font-size: 0.85em; color: #666; margin-top: 4px; }
  table { width: 100%; border-collapse: collapse; margin: 16px 0; }
  th { background: #2c3e50; color: white; padding: 10px 12px; text-align: left; }
  td { padding: 8px 12px; border-bottom: 1px solid #eee; }
  tr:hover td { background: #f8f9fa; }
  .pos { color: #27ae60; font-weight: bold; }
  .neg { color: #e74c3c; font-weight: bold; }
  .note { background: #fff3cd; border: 1px solid #ffc107; border-radius: 6px; padding: 12px 16px; margin: 16px 0; font-size: 0.9em; }
  .section { margin: 40px 0; }
  .meta { color: #666; font-size: 0.9em; margin-bottom: 24px; }
  a { color: #3498db; }
</style>
</head>
<body>
<h1>Vision vs HTML-Only Scoring Comparison</h1>
<div class="meta">
  Run date: ${escapeHtml(run_date)} |
  Sites attempted: ${total_attempted} |
  Succeeded: ${total_succeeded} |
  Failed: ${total_failed} |
  Duration: ${duration_seconds}s
</div>

<div class="note">
  <strong>Method:</strong> HTML-only scores are existing production scores from DB (programmatic + Haiku semantic).
  Vision scores send <code>desktop_above.jpg</code> and <code>mobile_above.jpg</code> as images to
  <code>${escapeHtml(VISION_MODEL)}</code> with <code>prompts/CONVERSION-SCORING-VISION.md</code>.
  Vision score computation is always programmatic from factor scores (not LLM-generated numbers).
  DB is not modified.
</div>

<div class="section">
  <h2>Summary Statistics</h2>
  <div class="stat-grid">
    <div class="stat-card">
      <div class="value">${summary_stats.avg_score_delta?.toFixed(2) ?? 'N/A'}</div>
      <div class="label">Avg Score Delta (Vision - HTML)</div>
    </div>
    <div class="stat-card">
      <div class="value">${summary_stats.avg_abs_score_delta?.toFixed(2) ?? 'N/A'}</div>
      <div class="label">Avg |Score Delta|</div>
    </div>
    <div class="stat-card">
      <div class="value">${summary_stats.pct_grade_changed?.toFixed(1) ?? 'N/A'}%</div>
      <div class="label">Grade Changed</div>
    </div>
    <div class="stat-card">
      <div class="value">${summary_stats.avg_time_ms ? `${(summary_stats.avg_time_ms / 1000).toFixed(1)}s` : 'N/A'}</div>
      <div class="label">Avg Time / Site</div>
    </div>
    <div class="stat-card">
      <div class="value">${summary_stats.pct_vision_higher?.toFixed(1) ?? 'N/A'}%</div>
      <div class="label">Vision Scored Higher</div>
    </div>
    <div class="stat-card">
      <div class="value">${summary_stats.pct_html_higher?.toFixed(1) ?? 'N/A'}%</div>
      <div class="label">HTML Scored Higher</div>
    </div>
  </div>
</div>

<div class="section">
  <h2>Score Delta Distribution</h2>
  <table>
    <thead><tr><th>Bucket</th><th>Count</th><th>%</th></tr></thead>
    <tbody>${bucketRows}</tbody>
  </table>
</div>

<div class="section">
  <h2>Grade Distribution Comparison</h2>
  <table>
    <thead><tr><th>Grade</th><th>HTML-Only</th><th>Vision</th><th>Delta</th></tr></thead>
    <tbody>${gradeRows}</tbody>
  </table>
</div>

<div class="section">
  <h2>Per-Factor Agreement (sorted by avg |delta|)</h2>
  <p>Delta = Vision score − HTML score. Vision higher / HTML higher / Same.</p>
  <table>
    <thead><tr><th>Factor</th><th>Avg Delta</th><th>Avg |Delta|</th><th>Higher: V/H/=</th><th>Weight</th></tr></thead>
    <tbody>${factorRows}</tbody>
  </table>
</div>

<div class="section">
  <h2>Top 20 Most Divergent Sites</h2>
  <table>
    <thead><tr><th>Domain</th><th>HTML Grade (Score)</th><th>Vision Grade (Score)</th><th>Delta</th><th>Time</th></tr></thead>
    <tbody>${tableRows}</tbody>
  </table>
</div>

<div class="note" style="margin-top:40px">
  Raw JSON data saved to: <code>${escapeHtml(outputPath.replace(/\.html$/, '.json'))}</code>
</div>
</body>
</html>`;

  writeFileSync(outputPath, html, 'utf-8');
}

/** Build the result record for a site that could not be compared. */
function failedResult(site, error, htmlGrade, timeMs) {
  return {
    site_id: site.id,
    domain: site.domain,
    url: site.url,
    status: 'failed',
    error,
    html_score: site.score,
    html_grade: htmlGrade,
    time_ms: timeMs,
  };
}

/** Arithmetic mean, or null for an empty list. */
function average(values) {
  return values.length > 0 ? values.reduce((a, b) => a + b, 0) / values.length : null;
}

/**
 * Score one site with vision and compare against its stored HTML-only score.
 * Always resolves to a result record with status 'success' or 'failed'.
 * @param {Object} db - open better-sqlite3 handle
 * @param {Object} site - row from selectSites()
 * @param {number[]} captureTimings - collects capture durations (ms) in --capture mode
 */
async function scoreSite(db, site, captureTimings) {
  const siteStart = Date.now();

  // Load heavy columns on demand (avoids bulk-loading all BLOBs upfront)
  const siteData = loadSiteData(db, site.id);
  if (!siteData) {
    return failedResult(site, 'invalid_score_json', computeGrade(site.score), 0);
  }
  const { score_json: htmlScoreJson, html_dom, http_headers, derived_grade } = siteData;
  const htmlGrade = site.grade || derived_grade || computeGrade(site.score);
  const siteWithData = { ...site, html_dom, http_headers };

  // Get image paths — from disk or by capturing fresh screenshots
  let imagePaths = null;
  let captureTimeMs = 0;

  if (CAPTURE_MODE) {
    const captureStart = Date.now();
    const captured = await captureScreenshots(site);
    captureTimeMs = Date.now() - captureStart;

    if (!captured) {
      return failedResult(site, 'capture_failed', htmlGrade, Date.now() - siteStart);
    }
    captureTimings.push(captureTimeMs);
    imagePaths = captured;
  }

  const visionResult = await runVisionScoring(siteWithData, imagePaths);
  const timeMs = Date.now() - siteStart;

  if (visionResult.error) {
    return failedResult(site, visionResult.error, htmlGrade, timeMs);
  }

  const comparison = compareScores(htmlScoreJson, visionResult);

  const sign = comparison.score_delta > 0 ? '+' : '';
  console.log(
    ` ✓ ${(site.domain ?? '').padEnd(40)} HTML: ${(htmlGrade || '?').padEnd(3)} (${(site.score || 0).toFixed(1).padStart(5)}) → Vision: ${(comparison.vision_grade || '?').padEnd(3)} (${(comparison.vision_score || 0).toFixed(1).padStart(5)}) Δ${sign}${comparison.score_delta?.toFixed(1) ?? '?'} [${(timeMs / 1000).toFixed(1)}s]`
  );

  return {
    site_id: site.id,
    domain: site.domain,
    url: site.url,
    country_code: site.country_code,
    status: 'success',
    time_ms: timeMs,
    capture_time_ms: captureTimeMs || undefined,
    html_score: site.score,
    html_grade: htmlGrade,
    html_scoring_method: htmlScoreJson.scoring_method,
    comparison,
    vision_score_json: visionResult,
  };
}

/**
 * Main: select sites, score them, write the JSON + HTML reports, print a summary.
 */
async function main() {
  console.log(`\n=== Vision vs HTML-Only Scoring Comparison ===`);
  console.log(`DB: ${DB_PATH}`);
  console.log(
    `Mode: ${CAPTURE_MODE ? 'capture (fresh screenshots)' : `disk (${SCREENSHOTS_BASE})`}`
  );
  console.log(`Limit: ${LIMIT} | Concurrency: ${CONCURRENCY} | Dry-run: ${DRY_RUN}`);

  if (DRY_RUN) {
    console.log('\n[DRY RUN] Would select sites and run vision scoring. No changes made.');
  }

  const db = new Database(DB_PATH, { readonly: true });

  // Select candidate sites (lightweight query — no BLOBs)
  console.log('\nSelecting sites...');
  const candidates = selectSites(db, LIMIT * 3);
  console.log(` Candidates from DB: ${candidates.length}`);

  const maxSites = SAMPLE_SIZE > 0 ? SAMPLE_SIZE : LIMIT;
  let sites;
  if (CAPTURE_MODE) {
    // In capture mode, all candidates are eligible (screenshots taken on-demand)
    sites = candidates.slice(0, maxSites);
    console.log(` Selected for comparison: ${sites.length} (will capture fresh screenshots)`);
  } else {
    // Filter to those with screenshots on disk
    const withScreenshots = candidates.filter(s => screenshotsExist(s.screenshot_path));
    console.log(` With screenshots on disk: ${withScreenshots.length}`);
    sites = withScreenshots.slice(0, maxSites);
  }
  console.log(` Final selection: ${sites.length}`);

  if (sites.length === 0) {
    if (CAPTURE_MODE) {
      console.error('\nNo eligible sites found in DB.');
    } else {
      console.error(
        '\nNo sites found with screenshots on disk.\n' +
          'Options:\n' +
          ' 1. Mount the screenshots drive and retry\n' +
          ' 2. Use --capture to take fresh screenshots via Playwright\n' +
          ` 3. Set SCREENSHOT_BASE_PATH env var (current: ${SCREENSHOTS_BASE})`
      );
    }
    db.close();
    process.exit(1);
  }

  if (DRY_RUN) {
    console.log('\nSample sites that would be scored:');
    for (const s of sites.slice(0, 5)) {
      if (CAPTURE_MODE) {
        console.log(` ${s.domain} (${s.grade ?? s.score}) — will capture fresh screenshots`);
      } else {
        const { desktop } = screenshotPaths(s.screenshot_path);
        console.log(` ${s.domain} (${s.grade ?? s.score}) — desktop: ${desktop}`);
      }
    }
    db.close();
    return;
  }

  // Ensure reports + temp directories exist
  mkdirSync(REPORTS_DIR, { recursive: true });
  if (CAPTURE_MODE) mkdirSync(TEMP_DIR, { recursive: true });

  const startTime = Date.now();
  const captureTimings = [];

  // In capture mode, limit concurrency to 1 to avoid Playwright memory exhaustion
  const effectiveConcurrency = CAPTURE_MODE ? 1 : CONCURRENCY;
  console.log(`\nRunning vision scoring (concurrency=${effectiveConcurrency})...`);

  const { results, errors } = await processBatch(
    sites,
    async site => {
      try {
        return await scoreSite(db, site, captureTimings);
      } catch (err) {
        // Unexpected failures become recorded results so no site disappears
        // from the totals.
        return failedResult(
          site,
          err?.message?.slice(0, 200) || 'unknown',
          site.grade || computeGrade(site.score),
          0
        );
      }
    },
    effectiveConcurrency
  );
  if (errors.length > 0) {
    console.warn(` ${errors.length} site(s) rejected without a result record`);
  }

  const durationSeconds = ((Date.now() - startTime) / 1000).toFixed(1);

  // Compute summary stats
  const successResults = results.filter(r => r.status === 'success');
  const succeeded = successResults.length;
  const failed = sites.length - succeeded;
  const scoreDeltas = successResults.map(r => r.comparison.score_delta).filter(d => d !== null);
  const gradeChanges = successResults.filter(r => r.comparison.grade_changed).length;
  const visionHigher = scoreDeltas.filter(d => d > 0).length;
  const htmlHigher = scoreDeltas.filter(d => d < 0).length;
  const timings = successResults.map(r => r.time_ms).filter(Boolean);

  const summaryStats = {
    avg_score_delta: average(scoreDeltas),
    avg_abs_score_delta: average(scoreDeltas.map(Math.abs)),
    pct_grade_changed:
      successResults.length > 0 ? (gradeChanges / successResults.length) * 100 : null,
    pct_vision_higher: scoreDeltas.length > 0 ? (visionHigher / scoreDeltas.length) * 100 : null,
    pct_html_higher: scoreDeltas.length > 0 ? (htmlHigher / scoreDeltas.length) * 100 : null,
    avg_time_ms: average(timings),
    avg_capture_time_ms: average(captureTimings),
  };

  const today = new Date().toISOString().slice(0, 10);
  const jsonPath = join(REPORTS_DIR, `vision-comparison-${today}.json`);
  const htmlPath = join(REPORTS_DIR, `vision-comparison-${today}.html`);

  const output = {
    run_date: today,
    run_timestamp: new Date().toISOString(),
    config: {
      limit: LIMIT,
      concurrency: CONCURRENCY,
      sample_size: SAMPLE_SIZE,
      capture_mode: CAPTURE_MODE,
      vision_model: VISION_MODEL,
    },
    total_attempted: sites.length,
    total_succeeded: succeeded,
    total_failed: failed,
    duration_seconds: parseFloat(durationSeconds),
    summary_stats: summaryStats,
    sites: results,
  };

  writeFileSync(jsonPath, JSON.stringify(output, null, 2), 'utf-8');
  console.log(`\nRaw results: ${jsonPath}`);

  generateHtmlReport(output, htmlPath);
  console.log(`HTML report: ${htmlPath}`);
  console.log(`\nOpen: file://${htmlPath}`);

  console.log('\n--- Summary ---');
  console.log(` Sites: ${sites.length} attempted, ${succeeded} succeeded, ${failed} failed`);
  console.log(` Duration: ${durationSeconds}s total`);
  if (summaryStats.avg_time_ms) {
    console.log(` Avg time/site: ${(summaryStats.avg_time_ms / 1000).toFixed(1)}s`);
  }
  if (summaryStats.avg_score_delta !== null) {
    const sign = summaryStats.avg_score_delta >= 0 ? '+' : '';
    console.log(
      ` Avg score delta (Vision − HTML): ${sign}${summaryStats.avg_score_delta.toFixed(2)}`
    );
    console.log(` Avg |score delta|: ${summaryStats.avg_abs_score_delta.toFixed(2)}`);
    console.log(
      ` Grade changed: ${gradeChanges}/${successResults.length} (${summaryStats.pct_grade_changed.toFixed(1)}%)`
    );
    console.log(` Vision higher: ${visionHigher} (${summaryStats.pct_vision_higher.toFixed(1)}%)`);
    console.log(` HTML higher: ${htmlHigher} (${summaryStats.pct_html_higher.toFixed(1)}%)`);
  }

  db.close();
}

main().catch(err => {
  console.error('Fatal:', err);
  process.exit(1);
});