#!/usr/bin/env node
/**
 * calibrate-scorer.js
 *
 * Calibrate the programmatic scorer against LLM-scored sites.
 *
 * Samples up to N sites that have both stored HTML and an LLM score, runs the
 * programmatic scorer on each, and prints a JSON report to stdout containing
 * R², Pearson r, MAE, a grade-distribution comparison and the worst outliers.
 *
 * Also reports per-factor divergence to identify which factors need LLM
 * semantic scoring (headline, value prop, USP) and which are fine
 * programmatically.
 *
 * Progress and diagnostics go to stderr so stdout stays machine-readable.
 *
 * Usage: node scripts/calibrate-scorer.js [sample_size]
 */

import Database from 'better-sqlite3';
import { scoreWebsiteProgrammatically } from '../src/utils/programmatic-scorer.js';
import { readHtmlDom } from '../src/utils/html-storage.js';
import { getScoreDataWithFallback } from '../src/utils/score-storage.js';
import '../src/utils/load-env.js';

/** R² threshold the calibration must reach to be reported as passing. */
const R2_TARGET = 0.75;
/** Minimum number of scored sites required before metrics are computed. */
const MIN_RESULTS = 10;
/** HTML documents of this length or shorter are skipped. */
const MIN_HTML_LENGTH = 500;
/** Candidates fetched per requested sample, to absorb the language filter. */
const OVERSAMPLE_FACTOR = 3;
/** Number of worst-diverging sites included in the report. */
const MAX_OUTLIERS = 10;

/** Factors compared one-by-one between the LLM and programmatic scorers. */
const FACTORS = [
  'headline_quality',
  'value_proposition',
  'unique_selling_proposition',
  'call_to_action',
  'urgency_messaging',
  'hook_engagement',
  'trust_signals',
  'imagery_design',
  'offer_clarity',
  'contextual_appropriateness',
];

/** Grade bucket labels, in report order. */
const BUCKET_LABELS = ['F(0-59)', 'D(60-69)', 'C(70-79)', 'B(80-89)', 'A(90+)'];

/**
 * Round to a fixed number of decimals.
 *
 * @param {number} value - Value to round.
 * @param {number} decimals - Number of decimal places to keep.
 * @returns {number|null} Rounded value, or null when `value` is not finite
 *   (JSON cannot represent NaN/Infinity; they would serialize as null anyway).
 */
function roundTo(value, decimals) {
  if (!Number.isFinite(value)) return null;
  const scale = 10 ** decimals;
  return Math.round(value * scale) / scale;
}

/**
 * Sum a numeric projection over an array.
 *
 * @param {Array} items - Items to fold over.
 * @param {(item: any) => number} pick - Maps each item to the number to add.
 * @returns {number} The total (0 for an empty array).
 */
function sumBy(items, pick) {
  return items.reduce((total, item) => total + pick(item), 0);
}

/**
 * Map a 0-100 score to its grade bucket label.
 *
 * @param {number} score - Score to classify.
 * @returns {string} One of BUCKET_LABELS.
 */
function gradeBucket(score) {
  if (score < 60) return 'F(0-59)';
  if (score < 70) return 'D(60-69)';
  if (score < 80) return 'C(70-79)';
  if (score < 90) return 'B(80-89)';
  return 'A(90+)';
}

/**
 * Build a zeroed bucket counter keyed by BUCKET_LABELS.
 *
 * @returns {Record<string, number>} Fresh counter object.
 */
function emptyBuckets() {
  return Object.fromEntries(BUCKET_LABELS.map((label) => [label, 0]));
}

/**
 * Cheap language heuristic: a page counts as English when it declares an
 * English `lang` attribute, or declares no `lang` attribute at all.
 *
 * @param {string} html - Raw HTML of the page.
 * @returns {boolean} True when the page is treated as English.
 */
function looksEnglish(html) {
  const lower = html.toLowerCase();
  if (lower.includes('lang="en') || lower.includes("lang='en")) return true;
  return !lower.includes(' lang=') && !lower.includes('<html lang');
}

const dbPath = process.env.DATABASE_PATH || './db/sites.db';
const sampleSize = Number.parseInt(process.argv[2] || '500', 10);

// A non-numeric or non-positive argument would otherwise be bound into LIMIT.
if (!Number.isInteger(sampleSize) || sampleSize <= 0) {
  console.error(`Invalid sample_size "${process.argv[2]}": expected a positive integer`);
  process.exit(1);
}

// The connection is read-only, so no journal_mode pragma is issued here:
// switching journal mode requires write access, and the mode is a persistent
// property of the database file that readers pick up automatically.
const db = new Database(dbPath, { readonly: true });

console.error(`Sampling ${sampleSize} sites...`);

let candidates;
try {
  // Use ROWID-based sampling for speed (avoids full table scan): pick a random
  // starting id in the lower half of the id range and read a contiguous window.
  const { maxId } = db.prepare('SELECT MAX(id) as maxId FROM sites').get();
  // MAX(id) is NULL for an empty table; treat that as 0.
  const startId = Math.floor(Math.random() * (maxId ?? 0) * 0.5);

  // English-only calibration: non-English sites use neutral scores and skew R².
  // Fetch candidate sites (oversample to account for language filtering in JS).
  candidates = db
    .prepare(
      `SELECT id, domain, landing_page_url, keyword, score, score_json
       FROM sites
       WHERE score IS NOT NULL
         AND html_dom IS NOT NULL
         AND id >= ?
       ORDER BY id
       LIMIT ?`
    )
    .all(startId, sampleSize * OVERSAMPLE_FACTOR);
} finally {
  // These are the only queries this script runs, so release the handle now
  // rather than holding it open for the whole scoring pass.
  db.close();
}

// Filter to English sites using filesystem HTML
const sites = [];
for (const site of candidates) {
  if (sites.length >= sampleSize) break;
  const htmlDom = readHtmlDom(site.id);
  if (!htmlDom || htmlDom.length <= MIN_HTML_LENGTH) continue;
  if (looksEnglish(htmlDom)) {
    site.html_dom = htmlDom;
    sites.push(site);
  }
}

if (sites.length === 0) {
  console.error('No sites found with html_dom and score');
  process.exit(1);
}

console.error(`Processing ${sites.length} sites...`);

const results = [];
let errors = 0;

// Per-factor tracking: signed (programmatic - LLM) differences per factor.
const factorDiffs = Object.fromEntries(FACTORS.map((factor) => [factor, []]));

for (const site of sites) {
  try {
    const prog = scoreWebsiteProgrammatically(
      site.html_dom,
      site.landing_page_url || `https://${site.domain}`,
      site.keyword
    );

    if (prog.is_broken_site || prog.is_error_page) continue;

    const llmTotal = site.score;
    const progTotal = prog.conversion_score;
    // A single non-finite score would turn every aggregate metric into NaN.
    if (!Number.isFinite(llmTotal) || !Number.isFinite(progTotal)) {
      errors++;
      continue;
    }

    results.push({
      domain: site.domain,
      llm_score: llmTotal,
      prog_score: progTotal,
      diff: progTotal - llmTotal,
    });

    // Per-factor comparison (if LLM score_json has factor_scores)
    if (prog.factor_scores) {
      try {
        const llmJson = getScoreDataWithFallback(site.id, site);
        const llmFactors = llmJson?.factor_scores;
        if (llmFactors) {
          for (const factor of FACTORS) {
            const llmScore = llmFactors[factor]?.score;
            const progScore = prog.factor_scores[factor]?.score;
            if (typeof llmScore === 'number' && typeof progScore === 'number') {
              factorDiffs[factor].push(progScore - llmScore);
            }
          }
        }
      } catch {
        /* skip malformed score_json: the overall score is still usable */
      }
    }
  } catch (err) {
    errors++;
    console.error(`Scoring failed for ${site.domain}: ${err?.message ?? err}`);
  }
}

if (results.length < MIN_RESULTS) {
  console.error('Too few valid results for calibration');
  process.exit(1);
}

const n = results.length;
const llmMean = sumBy(results, (r) => r.llm_score) / n;
const progMean = sumBy(results, (r) => r.prog_score) / n;

// R²: how much of the LLM score variance the programmatic score explains.
const ssTot = sumBy(results, (r) => (r.llm_score - llmMean) ** 2);
const ssRes = sumBy(results, (r) => (r.llm_score - r.prog_score) ** 2);
// Undefined when every LLM score is identical (zero variance).
const r2 = ssTot === 0 ? null : 1 - ssRes / ssTot;

// MAE
const mae = sumBy(results, (r) => Math.abs(r.diff)) / n;

// Correlation (Pearson), using population covariance and standard deviations.
const cov = sumBy(results, (r) => (r.llm_score - llmMean) * (r.prog_score - progMean)) / n;
const stdLlm = Math.sqrt(ssTot / n);
const stdProg = Math.sqrt(sumBy(results, (r) => (r.prog_score - progMean) ** 2) / n);
// Undefined when either series has zero variance.
const pearson = stdLlm * stdProg === 0 ? null : cov / (stdLlm * stdProg);

// Score distribution
const buckets = emptyBuckets();
const progBuckets = emptyBuckets();
for (const r of results) {
  buckets[gradeBucket(r.llm_score)]++;
  progBuckets[gradeBucket(r.prog_score)]++;
}

// Per-factor analysis: factors with no comparable pairs are omitted.
const factorAnalysis = {};
for (const factor of FACTORS) {
  const diffs = factorDiffs[factor];
  if (diffs.length === 0) continue;
  factorAnalysis[factor] = {
    n: diffs.length,
    mean_diff: roundTo(sumBy(diffs, (d) => d) / diffs.length, 2),
    mae: roundTo(sumBy(diffs, (d) => Math.abs(d)) / diffs.length, 2),
  };
}

// Sort factors by MAE descending (worst first)
const sortedFactors = Object.fromEntries(
  Object.entries(factorAnalysis).sort(([, a], [, b]) => b.mae - a.mae)
);

// Worst outliers
const outliers = [...results]
  .sort((a, b) => Math.abs(b.diff) - Math.abs(a.diff))
  .slice(0, MAX_OUTLIERS);

console.log(
  JSON.stringify(
    {
      sample_size: n,
      errors,
      vision_enabled: process.env.ENABLE_VISION !== 'false',
      metrics: {
        r_squared: roundTo(r2, 3),
        pearson_r: roundTo(pearson, 3),
        mae: roundTo(mae, 1),
        mean_diff: roundTo(sumBy(results, (r) => r.diff) / n, 1),
        llm_mean: roundTo(llmMean, 1),
        prog_mean: roundTo(progMean, 1),
      },
      pass: r2 !== null && r2 >= R2_TARGET,
      target: 'R² >= 0.75',
      factor_divergence: sortedFactors,
      distribution: {
        llm: buckets,
        programmatic: progBuckets,
      },
      top_outliers: outliers.map((r) => ({
        domain: r.domain,
        llm: r.llm_score,
        prog: r.prog_score,
        diff: roundTo(r.diff, 1),
      })),
    },
    null,
    2
  )
);