#!/usr/bin/env node
/**
 * benchmark-contact-extraction.js — Contact Extraction Model Benchmarking Script
 *
 * Compares multiple LLM models on contact extraction quality using real site HTML
 * from the database. Designed to be re-run monthly as models improve and prices drop.
 *
 * Usage:
 *   npm run benchmark:contacts
 *   npm run benchmark:contacts -- --limit 25
 *   npm run benchmark:contacts -- --limit 10 --models mini,deepseek
 *   npm run benchmark:contacts -- --dry-run
 *
 * Output: reports/contact-extraction-benchmark-YYYY-MM-DD.md
 */

import Database from 'better-sqlite3';
import { writeFileSync, mkdirSync } from 'fs';
import { join, dirname } from 'path';
import { fileURLToPath } from 'url';
import '../src/utils/load-env.js';
import { callLLM } from '../src/utils/llm-provider.js';
import { extractContactsFromHtml } from '../src/utils/html-contact-extractor.js';

// ES modules have no __dirname; reconstruct it from import.meta.url.
const __dirname = dirname(fileURLToPath(import.meta.url));
const PROJECT_ROOT = join(__dirname, '..');

// ── Model registry ────────────────────────────────────────────────────────────
// To add a new model: add an entry here. The key becomes the --models shorthand.
// Prices in USD per million tokens (update periodically from openrouter.ai/pricing).
const MODELS = {
  mini: {
    id: 'openai/gpt-4o-mini',
    label: 'GPT-4o Mini',
    inputPer1M: 0.15,
    outputPer1M: 0.6,
    note: 'Current baseline',
  },
  deepseek: {
    id: 'deepseek/deepseek-v3.2',
    label: 'DeepSeek V3.2',
    inputPer1M: 0.24,
    outputPer1M: 0.38,
    note: 'Near-frontier at budget price',
  },
  flash: {
    id: 'google/gemini-2.5-flash',
    label: 'Gemini 2.5 Flash',
    inputPer1M: 0.3,
    outputPer1M: 2.5,
    note: 'Fast, 1M context',
  },
  haiku: {
    id: 'anthropic/claude-3.5-haiku',
    label: 'Claude 3.5 Haiku',
    inputPer1M: 0.8,
    outputPer1M: 4.0,
    note: 'Good structured JSON',
  },
  'gemini-pro': {
    id: 'google/gemini-2.5-pro',
    label: 'Gemini 2.5 Pro',
    inputPer1M: 1.25,
    outputPer1M: 10.0,
    note: 'Strong multilingual',
  },
  'gpt-5': {
    id: 'openai/gpt-5.2',
    label: 'GPT-5.2',
    inputPer1M: 1.75,
    outputPer1M: 14.0,
    note: 'Latest OpenAI frontier',
  },
  sonnet: {
    id: 'anthropic/claude-sonnet-4.6',
    label: 'Claude Sonnet 4.6',
    inputPer1M: 3.0,
    outputPer1M: 15.0,
    note: 'Best JSON compliance',
  },
  opus: {
    id: 'anthropic/claude-opus-4.6',
    label: 'Claude Opus 4.6',
    inputPer1M: 5.0,
    outputPer1M: 25.0,
    note: 'Maximum intelligence',
  },
};

// Extraction prompt (mirrors extractInitialContacts in src/stages/enrich.js)
const SYSTEM_PROMPT = `Extract contact information from this HTML page. Return JSON with this structure:
{
  "business_name": "Company Name",
  "email_addresses": [{ "email": "info@example.com", "label": "General", "source": "//a[@href='mailto:...']" }],
  "phone_numbers": [{ "number": "+1234567890", "label": "Office", "source": "//span[@class='phone']" }],
  "social_profiles": [{ "url": "https://twitter.com/handle", "label": "Twitter", "source": "..." }],
  "key_pages": ["https://example.com/contact"],
  "primary_contact_form": { "form_url": "https://example.com/contact", "form_action_url": "/submit" }
}
Return empty arrays for missing fields. Omit primary_contact_form if none found.`;

// ── Argument parsing ──────────────────────────────────────────────────────────
/**
 * Parse CLI flags: --limit <n>, --models <key,key,...>, --dry-run.
 * Exits the process (code 1) on an invalid limit or unknown model keys.
 * @returns {{ limit: number, models: string[], dryRun: boolean }}
 */
function parseArgs() {
  const args = process.argv.slice(2);
  const opts = { limit: 50, models: Object.keys(MODELS), dryRun: false };

  for (let i = 0; i < args.length; i++) {
    if (args[i] === '--limit' && args[i + 1]) {
      opts.limit = Number.parseInt(args[++i], 10);
    } else if (args[i] === '--models' && args[i + 1]) {
      // De-duplicate so a repeated key can't double-bill and double-count results
      opts.models = [...new Set(args[++i].split(',').map(s => s.trim()))];
    } else if (args[i] === '--dry-run') {
      opts.dryRun = true;
    }
  }

  // parseInt can yield NaN ("--limit abc"), which would silently break the SQL
  // LIMIT clause and every loop bound downstream — fail fast instead.
  if (!Number.isInteger(opts.limit) || opts.limit <= 0) {
    console.error('Invalid --limit value; expected a positive integer');
    process.exit(1);
  }

  // Validate model shorthands
  const invalid = opts.models.filter(k => !MODELS[k]);
  if (invalid.length > 0) {
    console.error(`Unknown model keys: ${invalid.join(', ')}`);
    console.error(`Valid keys: ${Object.keys(MODELS).join(', ')}`);
    process.exit(1);
  }

  return opts;
}

// ── Database sampling ─────────────────────────────────────────────────────────
/**
 * Sample up to `limit` sites that still have usable HTML, preferring geographic
 * diversity (roughly even spread across up to 10 countries).
 * @param {import('better-sqlite3').Database} db - open (readonly) sites DB
 * @param {number} limit - maximum number of sites to return
 * @returns {Array<object>} site rows (id, landing_page_url, html_dom, ...)
 */
function sampleSites(db, limit) {
  // Sample with geographic diversity: try to get at least 5 countries
  const sites = db
    .prepare(
      `
      SELECT id, landing_page_url, html_dom, contacts_json, country_code, score, grade
      FROM sites
      WHERE html_dom IS NOT NULL
        AND html_dom != 'HTML removed after scoring'
        AND html_dom != ''
        AND length(html_dom) > 500
        AND status IN ('semantic_scored', 'vision_scored', 'enriched', 'proposals_drafted', 'outreach_sent')
      ORDER BY RANDOM()
      LIMIT ?
      `
    )
    .all(limit * 3); // Over-sample, then select for diversity

  if (sites.length === 0) return [];

  // Build a geographically diverse sample
  const byCountry = {};
  for (const site of sites) {
    const cc = site.country_code || 'unknown';
    if (!byCountry[cc]) byCountry[cc] = [];
    byCountry[cc].push(site);
  }

  const countries = Object.keys(byCountry);
  const perCountry = Math.max(1, Math.floor(limit / Math.min(countries.length, 10)));
  const selected = [];

  for (const cc of countries) {
    const take = byCountry[cc].slice(0, perCountry);
    selected.push(...take);
    if (selected.length >= limit) break;
  }

  // Fill remaining slots if we didn't hit the limit
  if (selected.length < limit) {
    const selectedIds = new Set(selected.map(s => s.id));
    for (const site of sites) {
      if (!selectedIds.has(site.id)) {
        selected.push(site);
        if (selected.length >= limit) break;
      }
    }
  }

  return selected.slice(0, limit);
}

// ── Cost estimation ───────────────────────────────────────────────────────────
/**
 * Rough pre-flight cost estimate per model, used for the console preview and
 * --dry-run. Token counts are assumptions, not measurements.
 * @param {Array<object>} sites - sampled site rows
 * @param {string[]} modelKeys - MODELS registry keys to estimate
 * @returns {Record<string, { perSite: number, total: number }>} USD estimates
 */
function estimateCost(sites, modelKeys) {
  // Assume ~5K input tokens per site (50K HTML chars / ~10 chars per token), ~300 output tokens
  const avgInputTokens = 5000;
  const avgOutputTokens = 300;
  const costs = {};

  for (const key of modelKeys) {
    const m = MODELS[key];
    const perSite =
      (avgInputTokens / 1_000_000) * m.inputPer1M + (avgOutputTokens / 1_000_000) * m.outputPer1M;
    costs[key] = { perSite, total: perSite * sites.length };
  }

  return costs;
}

// Models that support OpenAI-style response_format: json_object via OpenRouter
const OPENAI_JSON_MODE_MODELS = new Set(['openai/gpt-4o-mini', 'openai/gpt-5.2']);

/**
 * Extract JSON from LLM response content, handling markdown fences.
 * Some models (Claude, DeepSeek, Gemini) wrap JSON in ```json...``` fences
 * even when asked for plain JSON, so we strip them before parsing.
 * @param {string} content - raw LLM response text
 * @returns {object|null} parsed object, or null if nothing parseable was found
 */
function safeJsonParse(content) {
  if (!content || typeof content !== 'string') return null;

  // Try raw parse first (OpenAI models return clean JSON)
  try {
    return JSON.parse(content);
  } catch {
    // Fall through to fence-stripping
  }

  // Strip ```json ... ``` or ``` ... ``` fences
  const fenceMatch = content.match(/```(?:json)?\s*\n?([\s\S]*?)\n?```/);
  if (fenceMatch) {
    try {
      return JSON.parse(fenceMatch[1].trim());
    } catch {
      // Fall through
    }
  }

  // Try extracting the first {...} block (handles leading/trailing text)
  const braceMatch = content.match(/\{[\s\S]*\}/);
  if (braceMatch) {
    try {
      return JSON.parse(braceMatch[0]);
    } catch {
      // Give up
    }
  }

  return null;
}

// ── LLM extraction ────────────────────────────────────────────────────────────
/**
 * Run one extraction call against one model for one site.
 * @param {string} url - landing page URL (given to the model for context)
 * @param {string} html - raw page HTML (truncated to 50K chars)
 * @param {string} modelId - full OpenRouter model id (e.g. 'openai/gpt-4o-mini')
 * @returns {Promise<{ contacts: object|null, usage: object, elapsedMs: number }>}
 *   contacts is null when the response couldn't be parsed as JSON.
 * @throws propagates API/network errors from callLLM
 */
async function extractWithModel(url, html, modelId) {
  const messages = [
    { role: 'system', content: SYSTEM_PROMPT },
    {
      role: 'user',
      content: `URL: ${url}\n\nHTML (first 50000 chars):\n${html ? html.substring(0, 50000) : 'No HTML available'}`,
    },
  ];

  // Only send json_mode for OpenAI models — Claude/DeepSeek/Gemini don't support
  // response_format: json_object via OpenRouter and respond with markdown-fenced JSON
  const jsonMode = OPENAI_JSON_MODE_MODELS.has(modelId);

  const start = Date.now();
  const response = await callLLM({
    model: modelId,
    messages,
    temperature: 0.1,
    max_tokens: 2000,
    json_mode: jsonMode,
  });
  const elapsed = Date.now() - start;

  const parsed = safeJsonParse(response.content);

  return {
    contacts: parsed,
    usage: response.usage || { promptTokens: 0, completionTokens: 0 },
    elapsedMs: elapsed,
  };
}

// ── Contact counting ──────────────────────────────────────────────────────────
/**
 * Count contacts in an extraction result. Null-safe: a null/undefined input
 * (parse error or missing result) counts as zero of everything.
 * @param {object|null} contacts - parsed extraction JSON
 * @returns {{ emails: number, phones: number, socials: number, hasForm: boolean, total: number }}
 */
function countContacts(contacts) {
  if (!contacts) return { emails: 0, phones: 0, socials: 0, hasForm: false, total: 0 };
  const emails = (contacts.email_addresses || []).length;
  const phones = (contacts.phone_numbers || []).length;
  const socials = (contacts.social_profiles || []).length;
  const hasForm = !!contacts.primary_contact_form?.form_url;
  return { emails, phones, socials, hasForm, total: emails + phones + socials + (hasForm ? 1 : 0) };
}

/**
 * Normalized (lowercased) set of email addresses found by a model.
 * Records without an `email` field are skipped rather than adding `undefined`.
 * @param {object|null} contacts
 * @returns {Set<string>}
 */
function uniqueEmails(contacts) {
  return new Set(
    (contacts?.email_addresses || []).map(e => e.email?.toLowerCase()).filter(Boolean)
  );
}

/**
 * Normalized (digits-only) set of phone numbers found by a model.
 * Records without a `number` field are skipped rather than adding `undefined`.
 * @param {object|null} contacts
 * @returns {Set<string>}
 */
function uniquePhones(contacts) {
  return new Set(
    (contacts?.phone_numbers || []).map(p => p.number?.replace(/\D/g, '')).filter(Boolean)
  );
}

// ── Report generation ─────────────────────────────────────────────────────────
/**
 * Build the full markdown benchmark report.
 * @param {{ models: string[] }} opts - parsed CLI options
 * @param {Array<object>} sites - sampled site rows
 * @param {Record<number, object>} programmaticResults - siteId -> countContacts() result
 * @param {Record<number, Record<string, object>>} llmResults - siteId -> modelKey -> extraction result
 * @returns {string} markdown document
 */
function generateReport(opts, sites, programmaticResults, llmResults) {
  const date = new Date().toISOString().split('T')[0];
  const lines = [];

  lines.push(`# Contact Extraction Benchmark — ${date}`);
  lines.push('');
  lines.push('## Setup');
  lines.push('');
  lines.push(`- **Sites sampled:** ${sites.length}`);

  const countries = [...new Set(sites.map(s => s.country_code || 'unknown'))].filter(Boolean);
  lines.push(`- **Countries represented:** ${countries.join(', ')}`);
  lines.push(
    `- **Models tested:** Programmatic (free), ${opts.models.map(k => MODELS[k].label).join(', ')}`
  );
  lines.push(`- **Run date:** ${date}`);
  lines.push('');

  // Aggregate programmatic stats
  const progTotals = { emails: 0, phones: 0, socials: 0, forms: 0 };
  for (const r of Object.values(programmaticResults)) {
    progTotals.emails += r.emails;
    progTotals.phones += r.phones;
    progTotals.socials += r.socials;
    progTotals.forms += r.hasForm ? 1 : 0;
  }

  // Aggregate LLM stats
  const llmTotals = {};
  const llmCosts = {};
  const llmTimes = {};

  for (const key of opts.models) {
    llmTotals[key] = { emails: 0, phones: 0, socials: 0, forms: 0, parseErrors: 0 };
    llmCosts[key] = 0;
    llmTimes[key] = [];
  }

  for (const siteResults of Object.values(llmResults)) {
    for (const key of opts.models) {
      const r = siteResults[key];
      if (!r) continue;
      const m = MODELS[key];
      const cost =
        (r.usage.promptTokens / 1_000_000) * m.inputPer1M +
        (r.usage.completionTokens / 1_000_000) * m.outputPer1M;
      llmCosts[key] += cost;
      // Failed API calls are recorded with elapsedMs 0 — exclude those
      // placeholders so the average reflects real request latency.
      if (r.elapsedMs > 0) llmTimes[key].push(r.elapsedMs);

      if (!r.contacts) {
        llmTotals[key].parseErrors++;
        continue;
      }
      const counts = countContacts(r.contacts);
      llmTotals[key].emails += counts.emails;
      llmTotals[key].phones += counts.phones;
      llmTotals[key].socials += counts.socials;
      llmTotals[key].forms += counts.hasForm ? 1 : 0;
    }
  }

  // Results table
  lines.push('## Results by Method');
  lines.push('');
  lines.push(
    '| Method | Emails | Phones | Socials | Forms | Total | Avg Time | Cost (run) | Cost/site |'
  );
  lines.push(
    '|--------|--------|--------|---------|-------|-------|----------|------------|-----------|'
  );

  const progTotal = progTotals.emails + progTotals.phones + progTotals.socials + progTotals.forms;
  lines.push(
    `| Programmatic (free) | ${progTotals.emails} | ${progTotals.phones} | ${progTotals.socials} | ${progTotals.forms} | **${progTotal}** | — | $0.00 | $0.0000 |`
  );

  for (const key of opts.models) {
    const t = llmTotals[key];
    const total = t.emails + t.phones + t.socials + t.forms;
    const avgMs =
      llmTimes[key].length > 0
        ? Math.round(llmTimes[key].reduce((a, b) => a + b, 0) / llmTimes[key].length)
        : 0;
    const totalCost = llmCosts[key];
    const perSite = sites.length > 0 ? totalCost / sites.length : 0;
    const errors = t.parseErrors > 0 ? ` (${t.parseErrors} parse errors)` : '';
    lines.push(
      `| ${MODELS[key].label} | ${t.emails} | ${t.phones} | ${t.socials} | ${t.forms} | **${total}**${errors} | ${avgMs}ms | $${totalCost.toFixed(4)} | $${perSite.toFixed(5)} |`
    );
  }

  // Incremental value vs programmatic
  lines.push('');
  lines.push('## Incremental Value vs Programmatic-Only');
  lines.push('');
  lines.push('| Model | +Emails | +Phones | +Socials | +Forms | +Total | Cost per Extra Contact |');
  lines.push('|-------|---------|---------|----------|--------|--------|----------------------|');

  for (const key of opts.models) {
    const t = llmTotals[key];
    const extraEmails = t.emails - progTotals.emails;
    const extraPhones = t.phones - progTotals.phones;
    const extraSocials = t.socials - progTotals.socials;
    const extraForms = t.forms - progTotals.forms;
    const extraTotal = extraEmails + extraPhones + extraSocials + extraForms;
    const costPerExtra =
      extraTotal > 0 ? `$${(llmCosts[key] / extraTotal).toFixed(4)}` : 'N/A (none found)';
    lines.push(
      `| ${MODELS[key].label} | ${extraEmails > 0 ? '+' : ''}${extraEmails} | ${extraPhones > 0 ? '+' : ''}${extraPhones} | ${extraSocials > 0 ? '+' : ''}${extraSocials} | ${extraForms > 0 ? '+' : ''}${extraForms} | ${extraTotal > 0 ? '+' : ''}${extraTotal} | ${costPerExtra} |`
    );
  }

  // Incremental value vs baseline model (mini)
  if (opts.models.includes('mini') && opts.models.length > 1) {
    lines.push('');
    lines.push('## Incremental Value vs GPT-4o-Mini (Upgrade Cost)');
    lines.push('');
    lines.push(
      '| Model | +Emails | +Phones | +Socials | +Total | Extra Cost | Cost per Extra Contact |'
    );
    lines.push(
      '|-------|---------|---------|----------|--------|------------|----------------------|'
    );

    const mini = llmTotals['mini'];
    const miniCost = llmCosts['mini'];

    for (const key of opts.models) {
      if (key === 'mini') continue;
      const t = llmTotals[key];
      const extraEmails = t.emails - mini.emails;
      const extraPhones = t.phones - mini.phones;
      const extraSocials = t.socials - mini.socials;
      const extraTotal = extraEmails + extraPhones + extraSocials;
      const extraCost = llmCosts[key] - miniCost;
      const costPerExtra =
        extraTotal > 0 ? `$${(extraCost / extraTotal).toFixed(4)}` : 'N/A (none found)';
      lines.push(
        `| ${MODELS[key].label} | ${extraEmails > 0 ? '+' : ''}${extraEmails} | ${extraPhones > 0 ? '+' : ''}${extraPhones} | ${extraSocials > 0 ? '+' : ''}${extraSocials} | ${extraTotal > 0 ? '+' : ''}${extraTotal} | $${extraCost.toFixed(4)} | ${costPerExtra} |`
      );
    }
  }

  // ROI note
  lines.push('');
  lines.push('## ROI Notes');
  lines.push('');
  lines.push(
    '> Each extra contact found can lead to an extra outreach attempt. At a ~5% reply rate and ~10% deal rate from replies,'
  );
  lines.push(
    '> each 100 extra contacts = ~5 extra replies = ~0.5 extra deals. Price your model upgrade decision accordingly.'
  );
  lines.push(
    '> **Recommendation:** Upgrade only if cost-per-extra-contact is less than expected contact value.'
  );
  lines.push('');
  lines.push('_Re-run this benchmark monthly: `npm run benchmark:contacts`_');
  lines.push('');

  // Per-site raw data
  lines.push('## Raw Data (Per Site)');
  lines.push('');
  lines.push('<details>');
  lines.push('<summary>Expand for per-site JSON</summary>');
  lines.push('');
  lines.push('```json');

  const rawData = sites.map(site => {
    const prog = programmaticResults[site.id];
    const llm = {};
    for (const key of opts.models) {
      const r = (llmResults[site.id] || {})[key];
      llm[key] = r
        ? {
            counts: countContacts(r.contacts),
            uniqueEmails: [...uniqueEmails(r.contacts)],
            uniquePhones: [...uniquePhones(r.contacts)],
            cost: (
              (r.usage.promptTokens / 1_000_000) * MODELS[key].inputPer1M +
              (r.usage.completionTokens / 1_000_000) * MODELS[key].outputPer1M
            ).toFixed(6),
            elapsedMs: r.elapsedMs,
            parseError: !r.contacts,
          }
        : null;
    }
    return {
      id: site.id,
      url: site.landing_page_url,
      country: site.country_code,
      score: site.score,
      programmatic: prog,
      llm,
    };
  });

  lines.push(JSON.stringify(rawData, null, 2));
  lines.push('```');
  lines.push('</details>');

  return lines.join('\n');
}

// ── Main ──────────────────────────────────────────────────────────────────────
/**
 * Entry point: sample sites, preview costs, run the programmatic extractor and
 * each requested LLM model, then write the markdown report to reports/.
 */
async function main() {
  const opts = parseArgs();

  const dbPath = process.env.DATABASE_PATH || join(PROJECT_ROOT, 'db/sites.db');
  const db = new Database(dbPath, { readonly: true });

  console.log('📊 Contact Extraction Benchmark');
  console.log('================================');

  // Sample sites
  const sites = sampleSites(db, opts.limit);
  if (sites.length === 0) {
    console.error('No sites with html_dom found in database. Run the pipeline first.');
    process.exit(1);
  }

  const countries = [...new Set(sites.map(s => s.country_code || 'unknown'))];
  console.log(`\nSampled ${sites.length} sites from ${countries.length} countries`);
  console.log(`Countries: ${countries.join(', ')}`);

  // Estimate costs
  const costEst = estimateCost(sites, opts.models);
  console.log('\nEstimated costs (approximate):');
  let totalEst = 0;
  for (const key of opts.models) {
    console.log(
      `  ${MODELS[key].label.padEnd(25)} $${costEst[key].total.toFixed(4)} total ($${costEst[key].perSite.toFixed(5)}/site)`
    );
    totalEst += costEst[key].total;
  }
  console.log(`  ${'TOTAL'.padEnd(25)} $${totalEst.toFixed(4)}`);

  if (opts.dryRun) {
    console.log('\n[dry-run] Exiting without making API calls.');
    db.close();
    return;
  }

  console.log('\nStarting benchmark...');

  // Run programmatic extractor (free)
  console.log('\n[1/2] Running programmatic extractor...');
  const programmaticResults = {};
  for (const site of sites) {
    const contacts = extractContactsFromHtml(site.html_dom, site.landing_page_url);
    programmaticResults[site.id] = countContacts(contacts);
  }

  // Run LLM models
  console.log(`\n[2/2] Running ${opts.models.length} LLM model(s) on ${sites.length} sites...`);
  const llmResults = {};

  for (const site of sites) {
    llmResults[site.id] = {};
  }

  const CONCURRENCY = 5; // Parallel requests per model

  for (const key of opts.models) {
    const model = MODELS[key];
    console.log(`\n  → ${model.label} (${model.id})`);
    let done = 0;
    const errors = [];

    // Process sites in batches of CONCURRENCY
    for (let i = 0; i < sites.length; i += CONCURRENCY) {
      const batch = sites.slice(i, i + CONCURRENCY);
      await Promise.all(
        batch.map(async site => {
          try {
            const result = await extractWithModel(site.landing_page_url, site.html_dom, model.id);
            llmResults[site.id][key] = result;
          } catch (err) {
            // Record a zeroed placeholder so aggregation stays shape-stable;
            // elapsedMs 0 marks it as a failed call (excluded from avg time).
            errors.push({ siteId: site.id, error: err.message });
            llmResults[site.id][key] = {
              contacts: null,
              usage: { promptTokens: 0, completionTokens: 0 },
              elapsedMs: 0,
            };
          } finally {
            process.stdout.write(`\r    ${++done}/${sites.length} sites`);
          }
        })
      );
    }

    if (errors.length > 0) {
      console.log(`\n    ⚠️  ${errors.length} API errors`);
    } else {
      console.log('');
    }
  }

  // Generate and save report
  const report = generateReport(opts, sites, programmaticResults, llmResults);
  const date = new Date().toISOString().split('T')[0];
  const reportDir = join(PROJECT_ROOT, 'reports');
  mkdirSync(reportDir, { recursive: true });
  const reportPath = join(reportDir, `contact-extraction-benchmark-${date}.md`);
  writeFileSync(reportPath, report, 'utf8');

  console.log(`\n✅ Report saved to: reports/contact-extraction-benchmark-${date}.md`);

  // Print summary to console
  console.log('\n── Quick Summary ─────────────────────────────────────────────');
  const progTotal = Object.values(programmaticResults).reduce((s, r) => s + r.total, 0);
  console.log(`  Programmatic: ${progTotal} total contacts (free)`);

  for (const key of opts.models) {
    let total = 0;
    for (const siteRes of Object.values(llmResults)) {
      total += countContacts(siteRes[key]?.contacts).total;
    }
    const extra = total - progTotal;
    const sign = extra >= 0 ? '+' : '';
    console.log(
      `  ${MODELS[key].label.padEnd(22)} ${total} total (${sign}${extra} vs programmatic)`
    );
  }

  db.close();
}

main().catch(err => {
  // Log the full error (with stack), not just .message — this is a diagnostic
  // script and the stack is the most useful part of a failure.
  console.error('Benchmark failed:', err);
  process.exit(1);
});