#!/usr/bin/env node
/**
 * backfill-regex-contacts.js
 *
 * One-time backfill: run regex contact extraction on sites that still have a
 * real html_dom stored, so that sites which will not flow through enrichment
 * again still receive regex-extracted contacts.
 *
 * Default target statuses: enriched, proposals_drafted, semantic_scored,
 * vision_scored. (An earlier revision of this header called the last two
 * "rescored" — NOTE(review): confirm that is the same set.)
 * Other statuses (e.g. found, assets_captured, failing, ignore) are not
 * selected unless passed explicitly; per the original author's note they flow
 * through the pipeline naturally.
 *
 * Rows are selected only when html_dom is non-null, non-empty and not equal to
 * the HTML_SENTINEL placeholder. A site is written only when merging the regex
 * results increases its usable-contact count.
 *
 * Usage:
 *   node scripts/backfill-regex-contacts.js [--dry-run] [--status=enriched,proposals_drafted]
 *   node scripts/backfill-regex-contacts.js [--dry-run] [--status enriched,proposals_drafted]
 */

import { createDatabaseConnection } from '../src/utils/db.js';
import {
  extractContactsFromHtml,
  mergeExtractedContacts,
  countUsableContacts,
} from '../src/utils/html-contact-extractor.js';
import { getContactsDataWithFallback, setContactsJson } from '../src/utils/contacts-storage.js';

// Value written to sites.contacts_json after the merged contacts have been
// handed to setContactsJson. NOTE(review): presumably marks "contacts live in
// external storage" — confirm against contacts-storage.js.
const FS_SENTINEL = '{"_fs":true}';

// Placeholder stored in html_dom for rows that no longer hold real HTML;
// such rows are excluded by the query below.
const HTML_SENTINEL = 'HTML removed after scoring';

// Statuses processed when no --status option is supplied.
const DEFAULT_STATUSES = ['enriched', 'proposals_drafted', 'semantic_scored', 'vision_scored'];

const STATUS_FLAG = '--status';
const STATUS_PREFIX = `${STATUS_FLAG}=`;

/**
 * Split a comma-separated status list, trimming whitespace and dropping
 * empty entries.
 *
 * @param {string} raw - e.g. "enriched, proposals_drafted"
 * @returns {string[]} cleaned status names (possibly empty)
 */
function splitStatuses(raw) {
  return raw
    .split(',')
    .map((status) => status.trim())
    .filter((status) => status !== '');
}

/**
 * Read the --status option from the CLI arguments. Both `--status=a,b` and
 * the space-separated `--status a,b` form are accepted.
 *
 * @param {string[]} argv - arguments after the script name
 * @returns {string[] | null} the requested statuses, or null when the option
 *   was not supplied at all
 */
function parseStatusArg(argv) {
  const inline = argv.find((arg) => arg.startsWith(STATUS_PREFIX));
  if (inline !== undefined) {
    return splitStatuses(inline.slice(STATUS_PREFIX.length));
  }

  const flagIndex = argv.indexOf(STATUS_FLAG);
  if (flagIndex === -1) {
    return null;
  }

  const value = argv[flagIndex + 1];
  // A missing value, or another option in its place, means no statuses given.
  if (value === undefined || value.startsWith('--')) {
    return [];
  }
  return splitStatuses(value);
}

const args = process.argv.slice(2);
const DRY_RUN = args.includes('--dry-run');

const requestedStatuses = parseStatusArg(args);
if (requestedStatuses !== null && requestedStatuses.length === 0) {
  // Refuse to guess: an empty list would otherwise match no rows at all.
  console.error(
    'No statuses supplied to --status. Usage: node scripts/backfill-regex-contacts.js [--dry-run] [--status=enriched,proposals_drafted]'
  );
  process.exit(1);
}
const TARGET_STATUSES = requestedStatuses === null ? DEFAULT_STATUSES : requestedStatuses;

const db = createDatabaseConnection();

try {
  // One placeholder per status keeps the IN (...) list fully parameterized.
  const sites = db
    .prepare(
      `SELECT id, landing_page_url, status, contacts_json, html_dom
       FROM sites
       WHERE status IN (${TARGET_STATUSES.map(() => '?').join(',')})
         AND html_dom IS NOT NULL
         AND html_dom != ''
         AND html_dom != ?
       ORDER BY status, id`
    )
    .all(...TARGET_STATUSES, HTML_SENTINEL);

  console.log(`Backfill target: ${sites.length} sites (statuses: ${TARGET_STATUSES.join(', ')})`);
  if (DRY_RUN) console.log('DRY RUN — no DB writes');

  const updateStmt = db.prepare(
    `UPDATE sites SET contacts_json = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?`
  );

  let updated = 0;
  let skipped = 0;
  let totalNewContacts = 0;

  // status -> number of sites updated with that status
  const statusBreakdown = {};

  for (const site of sites) {
    let existing;
    try {
      existing = getContactsDataWithFallback(site.id, site);
    } catch (err) {
      // Best-effort: continue as if the site had no contacts, but say so.
      // NOTE(review): if contacts do exist and merely failed to load, the
      // write below may replace them — confirm this fallback is intended.
      const reason = err && err.message ? err.message : String(err);
      console.warn(
        `  [${site.status}] ${site.landing_page_url} — could not load existing contacts (${reason}); treating as none`
      );
      existing = null;
    }
    const beforeCount = countUsableContacts(existing);

    const regexContacts = extractContactsFromHtml(site.html_dom, site.landing_page_url);
    const merged = mergeExtractedContacts(existing, regexContacts, site.landing_page_url);
    const afterCount = countUsableContacts(merged);

    const added = afterCount - beforeCount;

    // Only write when the merge actually gained usable contacts.
    if (added <= 0) {
      skipped++;
      continue;
    }

    if (!DRY_RUN) {
      setContactsJson(site.id, JSON.stringify(merged));
      updateStmt.run(FS_SENTINEL, site.id);
    }

    updated++;
    totalNewContacts += added;
    statusBreakdown[site.status] = (statusBreakdown[site.status] || 0) + 1;

    console.log(
      `  [${site.status}] ${site.landing_page_url} — +${added} contacts (${beforeCount} → ${afterCount})`
    );
  }

  console.log('\n=== Backfill complete ===');
  console.log(`Sites updated: ${updated}`);
  console.log(`Sites unchanged: ${skipped}`);
  console.log(`Total new usable contacts added: ${totalNewContacts}`);
  console.log('By status:', statusBreakdown);
} finally {
  // Release the connection even when extraction or a write throws mid-run.
  db.close();
}