Cradicle Explorer

/ scripts / calibrate-scorer.js
calibrate-scorer.js
  1  #!/usr/bin/env node
  2  /**
  3   * Calibrate programmatic scorer against LLM-scored sites.
  4   * Samples N sites with both html_dom and score, runs programmatic scorer,
  5   * computes R², MAE, and score distribution comparison.
  6   *
  7   * Also reports per-factor divergence to identify which factors need
  8   * LLM semantic scoring (headline, value prop, USP) vs which are fine programmatically.
  9   *
 10   * Usage: node scripts/calibrate-scorer.js [sample_size]
 11   */
 12  
 13  import Database from 'better-sqlite3';
 14  import { scoreWebsiteProgrammatically } from '../src/utils/programmatic-scorer.js';
 15  import { readHtmlDom } from '../src/utils/html-storage.js';
 16  import { getScoreDataWithFallback } from '../src/utils/score-storage.js';
 17  import '../src/utils/load-env.js';
 18  
 // Path of the SQLite database; DATABASE_PATH overrides the default location.
 const dbPath = process.env.DATABASE_PATH || './db/sites.db';

 // Number of sites to calibrate on: first CLI argument, 500 when omitted.
 const sampleSize = Number.parseInt(process.argv[2] || '500', 10);

 // Reject non-numeric, zero and negative values up front. A NaN would otherwise be
 // passed on to the LIMIT parameter of the candidate query, and a negative value makes
 // the `sites.length >= sampleSize` check stop the filter loop before any site is kept.
 if (!Number.isInteger(sampleSize) || sampleSize <= 0) {
   console.error(`Invalid sample_size "${process.argv[2]}": expected a positive integer`);
   process.exit(1);
 }

 const db = new Database(dbPath, { readonly: true });
 // NOTE(review): the handle is opened read-only, so this pragma presumably cannot
 // change the journal mode of the file — confirm it is still needed.
 db.pragma('journal_mode = WAL');

 console.error(`Sampling ${sampleSize} sites...`);
 26  
 // Sampling strategy: choose a random starting id and read forward in id order rather
 // than shuffling the table (the original note gives speed as the reason).
 const { maxId } = db.prepare('SELECT MAX(id) as maxId FROM sites').get();

 // Calibration is English-only: non-English sites use neutral scores and skew R².
 // The language check happens later in JS, so three times the requested sample size
 // is fetched here to leave room for rejected rows.
 const candidateQuery = db.prepare(
   `SELECT id, domain, landing_page_url, keyword, score, score_json
     FROM sites
     WHERE score IS NOT NULL
       AND html_dom IS NOT NULL
       AND id >= ?
     ORDER BY id
     LIMIT ?`
 );

 // The start id is drawn from the lower half of the id range.
 const startId = Math.floor(Math.random() * maxId * 0.5);
 const candidateLimit = sampleSize * 3;
 const candidates = candidateQuery.all(startId, candidateLimit);
 43  
 // Matches a standalone `lang` / `xml:lang` attribute in lower-cased markup and captures
 // the leading letters of its value. The whitespace/quote required before the attribute
 // name keeps `hreflang="en"` from being read as a language declaration, and the
 // optional quote lets unquoted values such as `lang=en` be recognised.
 const LANG_ATTR_PATTERN = /[\s"'](?:xml:)?lang\s*=\s*["']?([a-z]*)/g;

 /**
  * Collect the language codes declared by `lang` attributes in a markup fragment.
  * @param {string} lowerMarkup - lower-cased HTML (whole document or a single tag)
  * @returns {string[]} leading letters of each declared value, in document order
  */
 function declaredLanguages(lowerMarkup) {
   return Array.from(lowerMarkup.matchAll(LANG_ATTR_PATTERN), (match) => match[1]);
 }

 /**
  * Decide whether a page should be treated as English.
  *
  * The `<html>` tag's own `lang` declaration wins when present. Otherwise the page
  * counts as English when it declares no language at all, or when some element
  * declares a language starting with "en".
  *
  * @param {string} htmlDom - raw HTML of the page
  * @returns {boolean} true when the page is considered English
  */
 function isEnglishHtml(htmlDom) {
   const lower = htmlDom.toLowerCase();
   const startsWithEn = (code) => code.startsWith('en');

   const rootTag = lower.match(/<html\b[^>]*>/)?.[0];
   const rootLanguages = rootTag ? declaredLanguages(rootTag) : [];
   if (rootLanguages.length > 0) return rootLanguages.some(startsWithEn);

   const allLanguages = declaredLanguages(lower);
   return allLanguages.length === 0 || allLanguages.some(startsWithEn);
 }

 // Filter to English sites using filesystem HTML, stopping once the sample is full.
 const sites = [];
 for (const site of candidates) {
   if (sites.length >= sampleSize) break;

   const htmlDom = readHtmlDom(site.id);
   // Skip sites with no stored HTML or with 500 characters or fewer.
   if (!htmlDom || htmlDom.length <= 500) continue;
   if (!isEnglishHtml(htmlDom)) continue;

   // Attach the HTML to the row so the scoring loop can read it from `site.html_dom`.
   site.html_dom = htmlDom;
   sites.push(site);
 }
 60  
 // Nothing survived the filtering step: report and stop with a failure exit code.
 if (sites.length < 1) {
   console.error('No sites found with html_dom and score');
   process.exit(1);
 }

 console.error(`Processing ${sites.length} sites...`);

 // Names of the factors compared between the LLM score data and the programmatic scorer.
 const FACTORS = [
   'headline_quality',
   'value_proposition',
   'unique_selling_proposition',
   'call_to_action',
   'urgency_messaging',
   'hook_engagement',
   'trust_signals',
   'imagery_design',
   'offer_clarity',
   'contextual_appropriateness',
 ];

 // One list of (programmatic - LLM) differences per factor, filled by the scoring loop.
 const factorDiffs = Object.fromEntries(FACTORS.map((factor) => [factor, []]));

 // Per-site overall score comparisons and the number of sites whose scoring threw.
 const results = [];
 let errors = 0;
 86  
 // Score every sampled site programmatically and record how far the result is from the
 // stored LLM score, overall and per factor.
 for (const site of sites) {
   try {
     const prog = scoreWebsiteProgrammatically(
       site.html_dom,
       site.landing_page_url || `https://${site.domain}`,
       site.keyword
     );

     // Sites the programmatic scorer flags as broken or as error pages are left out.
     if (prog.is_broken_site || prog.is_error_page) continue;

     results.push({
       domain: site.domain,
       llm_score: site.score,
       prog_score: prog.conversion_score,
       diff: prog.conversion_score - site.score,
     });

     // Per-factor comparison (only possible when both sides expose factor_scores).
     if (prog.factor_scores) {
       try {
         const llmJson = getScoreDataWithFallback(site.id, site);
         const llmFactors = llmJson?.factor_scores;
         if (llmFactors) {
           for (const f of FACTORS) {
             const llmScore = llmFactors[f]?.score;
             const progScore = prog.factor_scores[f]?.score;
             // Only numeric pairs contribute; missing or non-numeric factors are ignored.
             if (typeof llmScore === 'number' && typeof progScore === 'number') {
               factorDiffs[f].push(progScore - llmScore);
             }
           }
         }
       } catch {
         /* skip malformed score_json */
       }
     }
   } catch (err) {
     // Keep going (best effort), but report the cause on stderr so that a non-zero
     // `errors` count in the final report can be traced back to specific sites.
     errors++;
     console.error(`Scoring failed for ${site.domain}: ${err?.message ?? err}`);
   }
 }
126  
// Fewer than ten scored sites is treated as too little data to calibrate on.
if (results.length < 10) {
  console.error('Too few valid results for calibration');
  process.exit(1);
}

const n = results.length;

// Sum a per-result term over all results, in result order.
const sumOver = (term) => results.reduce((acc, row) => acc + term(row), 0);

const llmMean = sumOver((row) => row.llm_score) / n;
const progMean = sumOver((row) => row.prog_score) / n;

// R²: one minus residual sum of squares over total sum of squares of the LLM scores.
const totalSumSquares = sumOver((row) => Math.pow(row.llm_score - llmMean, 2));
const residualSumSquares = sumOver((row) => Math.pow(row.llm_score - row.prog_score, 2));
const r2 = 1 - residualSumSquares / totalSumSquares;

// Mean absolute error between programmatic and LLM scores.
const mae = sumOver((row) => Math.abs(row.diff)) / n;

// Pearson correlation: covariance divided by the product of both standard deviations
// (all three use the population form, dividing by n).
const covariance = sumOver((row) => (row.llm_score - llmMean) * (row.prog_score - progMean)) / n;
const llmStdDev = Math.sqrt(totalSumSquares / n);
const progSumSquares = sumOver((row) => Math.pow(row.prog_score - progMean, 2));
const progStdDev = Math.sqrt(progSumSquares / n);
const pearson = covariance / (llmStdDev * progStdDev);
151  
// Score distribution: count LLM and programmatic scores per letter-grade band.

// Exclusive upper limit of each band, lowest first; anything not below 90 is an A.
const GRADE_BANDS = [
  [60, 'F(0-59)'],
  [70, 'D(60-69)'],
  [80, 'C(70-79)'],
  [90, 'B(80-89)'],
];
const TOP_GRADE = 'A(90+)';

// Label of the first band whose limit the score is below, or the top grade otherwise.
const gradeFor = (score) => GRADE_BANDS.find(([limit]) => score < limit)?.[1] ?? TOP_GRADE;

// Fresh zeroed histogram with the bands in display order.
const emptyHistogram = () => ({
  'F(0-59)': 0,
  'D(60-69)': 0,
  'C(70-79)': 0,
  'B(80-89)': 0,
  'A(90+)': 0,
});

const buckets = emptyHistogram();
const progBuckets = emptyHistogram();
for (const { llm_score: llmScore, prog_score: progScore } of results) {
  buckets[gradeFor(llmScore)] += 1;
  progBuckets[gradeFor(progScore)] += 1;
}
179  
// Per-factor analysis: sample count, signed mean difference and mean absolute
// difference for every factor that collected at least one comparison.
const roundToHundredths = (value) => Math.round(value * 100) / 100;

const factorEntries = [];
for (const factor of FACTORS) {
  const diffs = factorDiffs[factor];
  if (diffs.length === 0) continue;

  let signedTotal = 0;
  let absoluteTotal = 0;
  for (const d of diffs) {
    signedTotal += d;
    absoluteTotal += Math.abs(d);
  }

  factorEntries.push([
    factor,
    {
      n: diffs.length,
      mean_diff: roundToHundredths(signedTotal / diffs.length),
      mae: roundToHundredths(absoluteTotal / diffs.length),
    },
  ]);
}
const factorAnalysis = Object.fromEntries(factorEntries);

// Same entries ordered by MAE, largest (worst agreement) first.
const sortedFactors = Object.fromEntries(
  Object.entries(factorAnalysis).sort(([, left], [, right]) => right.mae - left.mae)
);
201  
// Round to a fixed precision: scale 10 keeps one decimal, scale 1000 keeps three.
const roundTo = (value, scale) => Math.round(value * scale) / scale;

// Worst outliers: the ten results with the largest absolute score difference.
const outliers = results
  .slice()
  .sort((a, b) => Math.abs(b.diff) - Math.abs(a.diff))
  .slice(0, 10);

const meanDiff = results.reduce((acc, row) => acc + row.diff, 0) / n;

// Assemble the calibration report; it is printed to stdout as pretty-printed JSON.
const report = {
  sample_size: n,
  errors,
  vision_enabled: process.env.ENABLE_VISION !== 'false',
  metrics: {
    r_squared: roundTo(r2, 1000),
    pearson_r: roundTo(pearson, 1000),
    mae: roundTo(mae, 10),
    mean_diff: roundTo(meanDiff, 10),
    llm_mean: roundTo(llmMean, 10),
    prog_mean: roundTo(progMean, 10),
  },
  // Calibration passes when R² reaches the 0.75 target.
  pass: r2 >= 0.75,
  target: 'R² >= 0.75',
  factor_divergence: sortedFactors,
  distribution: {
    llm: buckets,
    programmatic: progBuckets,
  },
  top_outliers: outliers.map((row) => ({
    domain: row.domain,
    llm: row.llm_score,
    prog: row.prog_score,
    diff: roundTo(row.diff, 10),
  })),
};

console.log(JSON.stringify(report, null, 2));

db.close();