/ scripts / backfill-regex-contacts.js
backfill-regex-contacts.js
 1  #!/usr/bin/env node
/**
 * One-time backfill: run regex contact extraction on all sites with real html_dom
 * that won't naturally flow through enrich again (proposals_drafted, enriched).
 * Also runs on rescored sites to pre-populate before the LLM enrichment call.
 *
 * Default targets (when --status is omitted): enriched, proposals_drafted,
 * semantic_scored, vision_scored (with real html_dom)
 * Skips: found, assets_captured, failing, ignore (will flow through pipeline naturally)
 *
 * Usage: node scripts/backfill-regex-contacts.js [--dry-run] [--status=enriched,proposals_drafted]
 */
12  
13  import { createDatabaseConnection } from '../src/utils/db.js';
14  import {
15    extractContactsFromHtml,
16    mergeExtractedContacts,
17    countUsableContacts,
18  } from '../src/utils/html-contact-extractor.js';
19  import { getContactsDataWithFallback, setContactsJson } from '../src/utils/contacts-storage.js';
20  
// Value written to sites.contacts_json once the merged payload has been handed
// to setContactsJson (presumably marks the contacts as stored outside the
// column — confirm against contacts-storage.js).
const FS_SENTINEL = '{"_fs":true}';

// Rows whose html_dom equals this placeholder text are excluded from the backfill query.
const HTML_SENTINEL = 'HTML removed after scoring';

// Statuses processed when no --status flag is supplied.
const DEFAULT_TARGET_STATUSES = ['enriched', 'proposals_drafted', 'semantic_scored', 'vision_scored'];

/**
 * Resolve the list of site statuses to backfill from the CLI arguments.
 *
 * Accepts both `--status=a,b` and `--status a,b`. Previously only the `=` form
 * was recognised, so the space-separated form was silently ignored and the
 * script ran (and wrote) against the default statuses instead.
 *
 * @param {string[]} argv - CLI arguments without the node/script prefix.
 * @returns {string[]} Trimmed, de-duplicated, non-empty status names.
 * @throws {Error} If --status is present but yields no status names.
 */
function parseTargetStatuses(argv) {
  const inlineFlag = argv.find((arg) => arg.startsWith('--status='));
  const flagIndex = argv.indexOf('--status');

  if (inlineFlag === undefined && flagIndex === -1) {
    return [...DEFAULT_TARGET_STATUSES];
  }

  let rawValue = '';
  if (inlineFlag !== undefined) {
    // The first --status=... argument wins, as before.
    rawValue = inlineFlag.slice('--status='.length);
  } else {
    const next = argv[flagIndex + 1];
    // A following "--flag" is another option, not the value for --status.
    if (typeof next === 'string' && !next.startsWith('--')) {
      rawValue = next;
    }
  }

  const statuses = [
    ...new Set(
      rawValue
        .split(',')
        .map((status) => status.trim())
        .filter((status) => status !== '')
    ),
  ];

  if (statuses.length === 0) {
    throw new Error('--status requires a comma-separated list of statuses, e.g. --status=enriched,proposals_drafted');
  }

  return statuses;
}

const args = process.argv.slice(2);
const DRY_RUN = args.includes('--dry-run');
const TARGET_STATUSES = parseTargetStatuses(args);
31  
// Database handle used for the read below and by the statements that follow.
const db = createDatabaseConnection();

// One "?" per target status so the IN (...) list stays fully parameterised.
const statusPlaceholders = TARGET_STATUSES.map(() => '?').join(',');

// Candidate rows: a target status and an html_dom that is present, non-empty
// and not the HTML_SENTINEL placeholder; ordered by status, then id.
const selectSitesSql = `SELECT id, landing_page_url, status, contacts_json, html_dom
     FROM sites
     WHERE status IN (${statusPlaceholders})
       AND html_dom IS NOT NULL
       AND html_dom != ''
       AND html_dom != ?
     ORDER BY status, id`;

const selectSitesStmt = db.prepare(selectSitesSql);
const sites = selectSitesStmt.all(...TARGET_STATUSES, HTML_SENTINEL);
45  
// Backfill driver: for every selected site, merge regex-extracted contacts into
// the existing contacts and persist the result only when the number of usable
// contacts increases. With --dry-run nothing is written; the log output is the same.
console.log(`Backfill target: ${sites.length} sites (statuses: ${TARGET_STATUSES.join(', ')})`);
if (DRY_RUN) {
  console.log('DRY RUN — no DB writes');
}

let updated = 0;
let skipped = 0;
let totalNewContacts = 0;

// status -> number of sites updated with that status.
const statusBreakdown = {};

try {
  const updateStmt = db.prepare(
    `UPDATE sites SET contacts_json = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?`
  );

  for (const site of sites) {
    let existing;
    try {
      existing = getContactsDataWithFallback(site.id, site);
    } catch (err) {
      // Keep the original fallback (treat as "no existing contacts") but make
      // the failure visible instead of swallowing it.
      // NOTE(review): if the read failed for a site that does have stored
      // contacts, the merge below starts from nothing and the write may replace
      // the stored payload — confirm this is acceptable.
      const reason = err instanceof Error ? err.message : String(err);
      console.warn(
        `  [${site.status}] ${site.landing_page_url} — could not read existing contacts (${reason}); treating as none`
      );
      existing = null;
    }
    const beforeCount = countUsableContacts(existing);

    const regexContacts = extractContactsFromHtml(site.html_dom, site.landing_page_url);
    const merged = mergeExtractedContacts(existing, regexContacts, site.landing_page_url);
    const afterCount = countUsableContacts(merged);

    const added = afterCount - beforeCount;

    // Nothing gained from the regex pass: leave the site untouched.
    if (added <= 0) {
      skipped++;
      continue;
    }

    if (!DRY_RUN) {
      // Payload goes through setContactsJson; the contacts_json column itself
      // only receives the FS_SENTINEL marker.
      setContactsJson(site.id, JSON.stringify(merged));
      updateStmt.run(FS_SENTINEL, site.id);
    }

    updated++;
    totalNewContacts += added;
    statusBreakdown[site.status] = (statusBreakdown[site.status] || 0) + 1;

    console.log(
      `  [${site.status}] ${site.landing_page_url} — +${added} contacts (${beforeCount} → ${afterCount})`
    );
  }

  console.log('\n=== Backfill complete ===');
  console.log(`Sites updated: ${updated}`);
  console.log(`Sites unchanged: ${skipped}`);
  console.log(`Total new usable contacts added: ${totalNewContacts}`);
  console.log('By status:', statusBreakdown);
} finally {
  // Release the connection even when a site fails mid-run.
  db.close();
}