// scripts/benchmark-contact-extraction.js
  1  #!/usr/bin/env node
  2  /**
  3   * Contact Extraction Model Benchmarking Script
  4   *
  5   * Compares multiple LLM models on contact extraction quality using real site HTML
  6   * from the database. Designed to be re-run monthly as models improve and prices drop.
  7   *
  8   * Usage:
  9   *   npm run benchmark:contacts
 10   *   npm run benchmark:contacts -- --limit 25
 11   *   npm run benchmark:contacts -- --limit 10 --models mini,deepseek
 12   *   npm run benchmark:contacts -- --dry-run
 13   *
 14   * Output: reports/contact-extraction-benchmark-YYYY-MM-DD.md
 15   */
 16  
 17  import Database from 'better-sqlite3';
 18  import { writeFileSync, mkdirSync } from 'fs';
 19  import { join, dirname } from 'path';
 20  import { fileURLToPath } from 'url';
 21  import '../src/utils/load-env.js';
 22  import { callLLM } from '../src/utils/llm-provider.js';
 23  import { extractContactsFromHtml } from '../src/utils/html-contact-extractor.js';
 24  
 25  const __dirname = dirname(fileURLToPath(import.meta.url));
 26  const PROJECT_ROOT = join(__dirname, '..');
 27  
 28  // ── Model registry ────────────────────────────────────────────────────────────
 29  // To add a new model: add an entry here. The key becomes the --models shorthand.
 30  // Prices in USD per million tokens (update periodically from openrouter.ai/pricing).
 31  const MODELS = {
 32    mini: {
 33      id: 'openai/gpt-4o-mini',
 34      label: 'GPT-4o Mini',
 35      inputPer1M: 0.15,
 36      outputPer1M: 0.6,
 37      note: 'Current baseline',
 38    },
 39    deepseek: {
 40      id: 'deepseek/deepseek-v3.2',
 41      label: 'DeepSeek V3.2',
 42      inputPer1M: 0.24,
 43      outputPer1M: 0.38,
 44      note: 'Near-frontier at budget price',
 45    },
 46    flash: {
 47      id: 'google/gemini-2.5-flash',
 48      label: 'Gemini 2.5 Flash',
 49      inputPer1M: 0.3,
 50      outputPer1M: 2.5,
 51      note: 'Fast, 1M context',
 52    },
 53    haiku: {
 54      id: 'anthropic/claude-3.5-haiku',
 55      label: 'Claude 3.5 Haiku',
 56      inputPer1M: 0.8,
 57      outputPer1M: 4.0,
 58      note: 'Good structured JSON',
 59    },
 60    'gemini-pro': {
 61      id: 'google/gemini-2.5-pro',
 62      label: 'Gemini 2.5 Pro',
 63      inputPer1M: 1.25,
 64      outputPer1M: 10.0,
 65      note: 'Strong multilingual',
 66    },
 67    'gpt-5': {
 68      id: 'openai/gpt-5.2',
 69      label: 'GPT-5.2',
 70      inputPer1M: 1.75,
 71      outputPer1M: 14.0,
 72      note: 'Latest OpenAI frontier',
 73    },
 74    sonnet: {
 75      id: 'anthropic/claude-sonnet-4.6',
 76      label: 'Claude Sonnet 4.6',
 77      inputPer1M: 3.0,
 78      outputPer1M: 15.0,
 79      note: 'Best JSON compliance',
 80    },
 81    opus: {
 82      id: 'anthropic/claude-opus-4.6',
 83      label: 'Claude Opus 4.6',
 84      inputPer1M: 5.0,
 85      outputPer1M: 25.0,
 86      note: 'Maximum intelligence',
 87    },
 88  };
 89  
 90  // Extraction prompt (mirrors extractInitialContacts in src/stages/enrich.js)
 91  const SYSTEM_PROMPT = `Extract contact information from this HTML page. Return JSON with this structure:
 92  {
 93    "business_name": "Company Name",
 94    "email_addresses": [{ "email": "info@example.com", "label": "General", "source": "//a[@href='mailto:...']" }],
 95    "phone_numbers": [{ "number": "+1234567890", "label": "Office", "source": "//span[@class='phone']" }],
 96    "social_profiles": [{ "url": "https://twitter.com/handle", "label": "Twitter", "source": "..." }],
 97    "key_pages": ["https://example.com/contact"],
 98    "primary_contact_form": { "form_url": "https://example.com/contact", "form_action_url": "/submit" }
 99  }
100  Return empty arrays for missing fields. Omit primary_contact_form if none found.`;
101  
102  // ── Argument parsing ──────────────────────────────────────────────────────────
103  function parseArgs() {
104    const args = process.argv.slice(2);
105    const opts = { limit: 50, models: Object.keys(MODELS), dryRun: false };
106  
107    for (let i = 0; i < args.length; i++) {
108      if (args[i] === '--limit' && args[i + 1]) {
109        opts.limit = parseInt(args[++i], 10);
110      } else if (args[i] === '--models' && args[i + 1]) {
111        opts.models = args[++i].split(',').map(s => s.trim());
112      } else if (args[i] === '--dry-run') {
113        opts.dryRun = true;
114      }
115    }
116  
117    // Validate model shorthands
118    const invalid = opts.models.filter(k => !MODELS[k]);
119    if (invalid.length > 0) {
120      console.error(`Unknown model keys: ${invalid.join(', ')}`);
121      console.error(`Valid keys: ${Object.keys(MODELS).join(', ')}`);
122      process.exit(1);
123    }
124  
125    return opts;
126  }
127  
128  // ── Database sampling ─────────────────────────────────────────────────────────
129  function sampleSites(db, limit) {
130    // Sample with geographic diversity: try to get at least 5 countries
131    const sites = db
132      .prepare(
133        `
134      SELECT id, landing_page_url, html_dom, contacts_json, country_code, score, grade
135      FROM sites
136      WHERE html_dom IS NOT NULL
137        AND html_dom != 'HTML removed after scoring'
138        AND html_dom != ''
139        AND length(html_dom) > 500
140        AND status IN ('semantic_scored', 'vision_scored', 'enriched', 'proposals_drafted', 'outreach_sent')
141      ORDER BY RANDOM()
142      LIMIT ?
143    `
144      )
145      .all(limit * 3); // Over-sample, then select for diversity
146  
147    if (sites.length === 0) return [];
148  
149    // Build a geographically diverse sample
150    const byCountry = {};
151    for (const site of sites) {
152      const cc = site.country_code || 'unknown';
153      if (!byCountry[cc]) byCountry[cc] = [];
154      byCountry[cc].push(site);
155    }
156  
157    const countries = Object.keys(byCountry);
158    const perCountry = Math.max(1, Math.floor(limit / Math.min(countries.length, 10)));
159    const selected = [];
160  
161    for (const cc of countries) {
162      const take = byCountry[cc].slice(0, perCountry);
163      selected.push(...take);
164      if (selected.length >= limit) break;
165    }
166  
167    // Fill remaining slots if we didn't hit the limit
168    if (selected.length < limit) {
169      const selectedIds = new Set(selected.map(s => s.id));
170      for (const site of sites) {
171        if (!selectedIds.has(site.id)) {
172          selected.push(site);
173          if (selected.length >= limit) break;
174        }
175      }
176    }
177  
178    return selected.slice(0, limit);
179  }
180  
181  // ── Cost estimation ───────────────────────────────────────────────────────────
182  function estimateCost(sites, modelKeys) {
183    // Assume ~5K input tokens per site (50K HTML chars / ~10 chars per token), ~300 output tokens
184    const avgInputTokens = 5000;
185    const avgOutputTokens = 300;
186    const costs = {};
187  
188    for (const key of modelKeys) {
189      const m = MODELS[key];
190      const perSite =
191        (avgInputTokens / 1_000_000) * m.inputPer1M + (avgOutputTokens / 1_000_000) * m.outputPer1M;
192      costs[key] = { perSite, total: perSite * sites.length };
193    }
194  
195    return costs;
196  }
197  
198  // Models that support OpenAI-style response_format: json_object via OpenRouter
199  const OPENAI_JSON_MODE_MODELS = new Set(['openai/gpt-4o-mini', 'openai/gpt-5.2']);
200  
201  /**
202   * Extract JSON from LLM response content, handling markdown fences.
203   * Some models (Claude, DeepSeek, Gemini) wrap JSON in ```json...``` fences
204   * even when asked for plain JSON, so we strip them before parsing.
205   */
206  function safeJsonParse(content) {
207    if (!content || typeof content !== 'string') return null;
208  
209    // Try raw parse first (OpenAI models return clean JSON)
210    try {
211      return JSON.parse(content);
212    } catch {
213      // Fall through to fence-stripping
214    }
215  
216    // Strip ```json ... ``` or ``` ... ``` fences
217    const fenceMatch = content.match(/```(?:json)?\s*\n?([\s\S]*?)\n?```/);
218    if (fenceMatch) {
219      try {
220        return JSON.parse(fenceMatch[1].trim());
221      } catch {
222        // Fall through
223      }
224    }
225  
226    // Try extracting the first {...} block (handles leading/trailing text)
227    const braceMatch = content.match(/\{[\s\S]*\}/);
228    if (braceMatch) {
229      try {
230        return JSON.parse(braceMatch[0]);
231      } catch {
232        // Give up
233      }
234    }
235  
236    return null;
237  }
238  
239  // ── LLM extraction ────────────────────────────────────────────────────────────
240  async function extractWithModel(url, html, modelId) {
241    const messages = [
242      { role: 'system', content: SYSTEM_PROMPT },
243      {
244        role: 'user',
245        content: `URL: ${url}\n\nHTML (first 50000 chars):\n${html ? html.substring(0, 50000) : 'No HTML available'}`,
246      },
247    ];
248  
249    // Only send json_mode for OpenAI models — Claude/DeepSeek/Gemini don't support
250    // response_format: json_object via OpenRouter and respond with markdown-fenced JSON
251    const jsonMode = OPENAI_JSON_MODE_MODELS.has(modelId);
252  
253    const start = Date.now();
254    const response = await callLLM({
255      model: modelId,
256      messages,
257      temperature: 0.1,
258      max_tokens: 2000,
259      json_mode: jsonMode,
260    });
261    const elapsed = Date.now() - start;
262  
263    const parsed = safeJsonParse(response.content);
264  
265    return {
266      contacts: parsed,
267      usage: response.usage || { promptTokens: 0, completionTokens: 0 },
268      elapsedMs: elapsed,
269    };
270  }
271  
272  // ── Contact counting ──────────────────────────────────────────────────────────
273  function countContacts(contacts) {
274    if (!contacts) return { emails: 0, phones: 0, socials: 0, hasForm: false, total: 0 };
275    const emails = (contacts.email_addresses || []).length;
276    const phones = (contacts.phone_numbers || []).length;
277    const socials = (contacts.social_profiles || []).length;
278    const hasForm = !!contacts.primary_contact_form?.form_url;
279    return { emails, phones, socials, hasForm, total: emails + phones + socials + (hasForm ? 1 : 0) };
280  }
281  
282  function uniqueEmails(contacts) {
283    return new Set((contacts?.email_addresses || []).map(e => e.email?.toLowerCase()));
284  }
285  
286  function uniquePhones(contacts) {
287    return new Set((contacts?.phone_numbers || []).map(p => p.number?.replace(/\D/g, '')));
288  }
289  
290  // ── Report generation ─────────────────────────────────────────────────────────
291  function generateReport(opts, sites, programmaticResults, llmResults) {
292    const date = new Date().toISOString().split('T')[0];
293    const lines = [];
294  
295    lines.push(`# Contact Extraction Benchmark — ${date}`);
296    lines.push('');
297    lines.push('## Setup');
298    lines.push('');
299    lines.push(`- **Sites sampled:** ${sites.length}`);
300  
301    const countries = [...new Set(sites.map(s => s.country_code || 'unknown'))].filter(Boolean);
302    lines.push(`- **Countries represented:** ${countries.join(', ')}`);
303    lines.push(
304      `- **Models tested:** Programmatic (free), ${opts.models.map(k => MODELS[k].label).join(', ')}`
305    );
306    lines.push(`- **Run date:** ${date}`);
307    lines.push('');
308  
309    // Aggregate programmatic stats
310    const progTotals = { emails: 0, phones: 0, socials: 0, forms: 0 };
311    for (const r of Object.values(programmaticResults)) {
312      progTotals.emails += r.emails;
313      progTotals.phones += r.phones;
314      progTotals.socials += r.socials;
315      progTotals.forms += r.hasForm ? 1 : 0;
316    }
317  
318    // Aggregate LLM stats
319    const llmTotals = {};
320    const llmCosts = {};
321    const llmTimes = {};
322  
323    for (const key of opts.models) {
324      llmTotals[key] = { emails: 0, phones: 0, socials: 0, forms: 0, parseErrors: 0 };
325      llmCosts[key] = 0;
326      llmTimes[key] = [];
327    }
328  
329    for (const siteResults of Object.values(llmResults)) {
330      for (const key of opts.models) {
331        const r = siteResults[key];
332        if (!r) continue;
333        const m = MODELS[key];
334        const cost =
335          (r.usage.promptTokens / 1_000_000) * m.inputPer1M +
336          (r.usage.completionTokens / 1_000_000) * m.outputPer1M;
337        llmCosts[key] += cost;
338        llmTimes[key].push(r.elapsedMs);
339  
340        if (!r.contacts) {
341          llmTotals[key].parseErrors++;
342          continue;
343        }
344        const counts = countContacts(r.contacts);
345        llmTotals[key].emails += counts.emails;
346        llmTotals[key].phones += counts.phones;
347        llmTotals[key].socials += counts.socials;
348        llmTotals[key].forms += counts.hasForm ? 1 : 0;
349      }
350    }
351  
352    // Results table
353    lines.push('## Results by Method');
354    lines.push('');
355    lines.push(
356      '| Method | Emails | Phones | Socials | Forms | Total | Avg Time | Cost (run) | Cost/site |'
357    );
358    lines.push(
359      '|--------|--------|--------|---------|-------|-------|----------|------------|-----------|'
360    );
361  
362    const progTotal = progTotals.emails + progTotals.phones + progTotals.socials + progTotals.forms;
363    lines.push(
364      `| Programmatic (free) | ${progTotals.emails} | ${progTotals.phones} | ${progTotals.socials} | ${progTotals.forms} | **${progTotal}** | — | $0.00 | $0.0000 |`
365    );
366  
367    for (const key of opts.models) {
368      const t = llmTotals[key];
369      const total = t.emails + t.phones + t.socials + t.forms;
370      const avgMs =
371        llmTimes[key].length > 0
372          ? Math.round(llmTimes[key].reduce((a, b) => a + b, 0) / llmTimes[key].length)
373          : 0;
374      const totalCost = llmCosts[key];
375      const perSite = sites.length > 0 ? totalCost / sites.length : 0;
376      const errors = t.parseErrors > 0 ? ` (${t.parseErrors} parse errors)` : '';
377      lines.push(
378        `| ${MODELS[key].label} | ${t.emails} | ${t.phones} | ${t.socials} | ${t.forms} | **${total}**${errors} | ${avgMs}ms | $${totalCost.toFixed(4)} | $${perSite.toFixed(5)} |`
379      );
380    }
381  
382    // Incremental value vs programmatic
383    lines.push('');
384    lines.push('## Incremental Value vs Programmatic-Only');
385    lines.push('');
386    lines.push('| Model | +Emails | +Phones | +Socials | +Forms | +Total | Cost per Extra Contact |');
387    lines.push('|-------|---------|---------|----------|--------|--------|----------------------|');
388  
389    for (const key of opts.models) {
390      const t = llmTotals[key];
391      const extraEmails = t.emails - progTotals.emails;
392      const extraPhones = t.phones - progTotals.phones;
393      const extraSocials = t.socials - progTotals.socials;
394      const extraForms = t.forms - progTotals.forms;
395      const extraTotal = extraEmails + extraPhones + extraSocials + extraForms;
396      const costPerExtra =
397        extraTotal > 0 ? `$${(llmCosts[key] / extraTotal).toFixed(4)}` : 'N/A (none found)';
398      lines.push(
399        `| ${MODELS[key].label} | ${extraEmails > 0 ? '+' : ''}${extraEmails} | ${extraPhones > 0 ? '+' : ''}${extraPhones} | ${extraSocials > 0 ? '+' : ''}${extraSocials} | ${extraForms > 0 ? '+' : ''}${extraForms} | ${extraTotal > 0 ? '+' : ''}${extraTotal} | ${costPerExtra} |`
400      );
401    }
402  
403    // Incremental value vs baseline model (mini)
404    if (opts.models.includes('mini') && opts.models.length > 1) {
405      lines.push('');
406      lines.push('## Incremental Value vs GPT-4o-Mini (Upgrade Cost)');
407      lines.push('');
408      lines.push(
409        '| Model | +Emails | +Phones | +Socials | +Total | Extra Cost | Cost per Extra Contact |'
410      );
411      lines.push(
412        '|-------|---------|---------|----------|--------|------------|----------------------|'
413      );
414  
415      const mini = llmTotals['mini'];
416      const miniCost = llmCosts['mini'];
417  
418      for (const key of opts.models) {
419        if (key === 'mini') continue;
420        const t = llmTotals[key];
421        const extraEmails = t.emails - mini.emails;
422        const extraPhones = t.phones - mini.phones;
423        const extraSocials = t.socials - mini.socials;
424        const extraTotal = extraEmails + extraPhones + extraSocials;
425        const extraCost = llmCosts[key] - miniCost;
426        const costPerExtra =
427          extraTotal > 0 ? `$${(extraCost / extraTotal).toFixed(4)}` : 'N/A (none found)';
428        lines.push(
429          `| ${MODELS[key].label} | ${extraEmails > 0 ? '+' : ''}${extraEmails} | ${extraPhones > 0 ? '+' : ''}${extraPhones} | ${extraSocials > 0 ? '+' : ''}${extraSocials} | ${extraTotal > 0 ? '+' : ''}${extraTotal} | $${extraCost.toFixed(4)} | ${costPerExtra} |`
430        );
431      }
432    }
433  
434    // ROI note
435    lines.push('');
436    lines.push('## ROI Notes');
437    lines.push('');
438    lines.push(
439      '> Each extra contact found can lead to an extra outreach attempt. At a ~5% reply rate and ~10% deal rate from replies,'
440    );
441    lines.push(
442      '> each 100 extra contacts = ~5 extra replies = ~0.5 extra deals. Price your model upgrade decision accordingly.'
443    );
444    lines.push(
445      '> **Recommendation:** Upgrade only if cost-per-extra-contact is less than expected contact value.'
446    );
447    lines.push('');
448    lines.push('_Re-run this benchmark monthly: `npm run benchmark:contacts`_');
449    lines.push('');
450  
451    // Per-site raw data
452    lines.push('## Raw Data (Per Site)');
453    lines.push('');
454    lines.push('<details>');
455    lines.push('<summary>Expand for per-site JSON</summary>');
456    lines.push('');
457    lines.push('```json');
458  
459    const rawData = sites.map(site => {
460      const prog = programmaticResults[site.id];
461      const llm = {};
462      for (const key of opts.models) {
463        const r = (llmResults[site.id] || {})[key];
464        llm[key] = r
465          ? {
466              counts: countContacts(r.contacts),
467              uniqueEmails: [...uniqueEmails(r.contacts)],
468              uniquePhones: [...uniquePhones(r.contacts)],
469              cost: (
470                (r.usage.promptTokens / 1_000_000) * MODELS[key].inputPer1M +
471                (r.usage.completionTokens / 1_000_000) * MODELS[key].outputPer1M
472              ).toFixed(6),
473              elapsedMs: r.elapsedMs,
474              parseError: !r.contacts,
475            }
476          : null;
477      }
478      return {
479        id: site.id,
480        url: site.landing_page_url,
481        country: site.country_code,
482        score: site.score,
483        programmatic: prog,
484        llm,
485      };
486    });
487  
488    lines.push(JSON.stringify(rawData, null, 2));
489    lines.push('```');
490    lines.push('</details>');
491  
492    return lines.join('\n');
493  }
494  
495  // ── Main ──────────────────────────────────────────────────────────────────────
496  async function main() {
497    const opts = parseArgs();
498  
499    const dbPath = process.env.DATABASE_PATH || join(PROJECT_ROOT, 'db/sites.db');
500    const db = new Database(dbPath, { readonly: true });
501  
502    console.log('📊 Contact Extraction Benchmark');
503    console.log('================================');
504  
505    // Sample sites
506    const sites = sampleSites(db, opts.limit);
507    if (sites.length === 0) {
508      console.error('No sites with html_dom found in database. Run the pipeline first.');
509      process.exit(1);
510    }
511  
512    const countries = [...new Set(sites.map(s => s.country_code || 'unknown'))];
513    console.log(`\nSampled ${sites.length} sites from ${countries.length} countries`);
514    console.log(`Countries: ${countries.join(', ')}`);
515  
516    // Estimate costs
517    const costEst = estimateCost(sites, opts.models);
518    console.log('\nEstimated costs (approximate):');
519    let totalEst = 0;
520    for (const key of opts.models) {
521      console.log(
522        `  ${MODELS[key].label.padEnd(25)} $${costEst[key].total.toFixed(4)} total ($${costEst[key].perSite.toFixed(5)}/site)`
523      );
524      totalEst += costEst[key].total;
525    }
526    console.log(`  ${'TOTAL'.padEnd(25)} $${totalEst.toFixed(4)}`);
527  
528    if (opts.dryRun) {
529      console.log('\n[dry-run] Exiting without making API calls.');
530      db.close();
531      return;
532    }
533  
534    console.log('\nStarting benchmark...');
535  
536    // Run programmatic extractor (free)
537    console.log('\n[1/2] Running programmatic extractor...');
538    const programmaticResults = {};
539    for (const site of sites) {
540      const contacts = extractContactsFromHtml(site.html_dom, site.landing_page_url);
541      programmaticResults[site.id] = countContacts(contacts);
542    }
543  
544    // Run LLM models
545    console.log(`\n[2/2] Running ${opts.models.length} LLM model(s) on ${sites.length} sites...`);
546    const llmResults = {};
547  
548    for (const site of sites) {
549      llmResults[site.id] = {};
550    }
551  
552    const CONCURRENCY = 5; // Parallel requests per model
553  
554    for (const key of opts.models) {
555      const model = MODELS[key];
556      console.log(`\n  → ${model.label} (${model.id})`);
557      let done = 0;
558      const errors = [];
559  
560      // Process sites in batches of CONCURRENCY
561      for (let i = 0; i < sites.length; i += CONCURRENCY) {
562        const batch = sites.slice(i, i + CONCURRENCY);
563        await Promise.all(
564          batch.map(async site => {
565            try {
566              const result = await extractWithModel(site.landing_page_url, site.html_dom, model.id);
567              llmResults[site.id][key] = result;
568            } catch (err) {
569              errors.push({ siteId: site.id, error: err.message });
570              llmResults[site.id][key] = {
571                contacts: null,
572                usage: { promptTokens: 0, completionTokens: 0 },
573                elapsedMs: 0,
574              };
575            } finally {
576              process.stdout.write(`\r     ${++done}/${sites.length} sites`);
577            }
578          })
579        );
580      }
581  
582      if (errors.length > 0) {
583        console.log(`\n     ⚠️  ${errors.length} API errors`);
584      } else {
585        console.log('');
586      }
587    }
588  
589    // Generate and save report
590    const report = generateReport(opts, sites, programmaticResults, llmResults);
591    const date = new Date().toISOString().split('T')[0];
592    const reportDir = join(PROJECT_ROOT, 'reports');
593    mkdirSync(reportDir, { recursive: true });
594    const reportPath = join(reportDir, `contact-extraction-benchmark-${date}.md`);
595    writeFileSync(reportPath, report, 'utf8');
596  
597    console.log(`\n✅ Report saved to: reports/contact-extraction-benchmark-${date}.md`);
598  
599    // Print summary to console
600    console.log('\n── Quick Summary ─────────────────────────────────────────────');
601    const progTotal = Object.values(programmaticResults).reduce((s, r) => s + r.total, 0);
602    console.log(`  Programmatic:  ${progTotal} total contacts (free)`);
603  
604    for (const key of opts.models) {
605      let total = 0;
606      for (const siteRes of Object.values(llmResults)) {
607        total += countContacts(siteRes[key]?.contacts).total;
608      }
609      const extra = total - progTotal;
610      const sign = extra >= 0 ? '+' : '';
611      console.log(
612        `  ${MODELS[key].label.padEnd(22)} ${total} total (${sign}${extra} vs programmatic)`
613      );
614    }
615  
616    db.close();
617  }
618  
619  main().catch(err => {
620    console.error('Benchmark failed:', err.message);
621    process.exit(1);
622  });