/ tests / utils / enriched-status-split.test.js
enriched-status-split.test.js
  1  /**
  2   * Tests for the enriched status split:
  3   *   enriched_regex  — pipeline browser pass complete (contact pages scraped, regex extracted)
  4   *   enriched_llm    — orchestrator LLM pass complete (key_pages_html processed, key_pages_html cleared)
  5   *   enriched        — legacy status from the former contact-form fast-path (since unified into enriched_regex); downstream checks still accept it
  6   *
  7   * Covers:
  8   *   1. enrich.js: browser pass sets status='enriched_regex' and stores key_pages_html
  9   *   2. enrich.js: "has contacts, no contact pages" path sets status='enriched_regex'
 10   *   3. enrich.js: all enrichment paths set status='enriched_regex' (the "already has form" fast-path was unified into it)
 11   *   4. claude-store.js: enrichment_pass='llm' → sets enriched_llm, clears key_pages_html
 12   *   5. claude-store.js: enrichment_pass='initial' → keeps semantic_scored/vision_scored status
 13   *   6. claude-batch.js: fetchEnrichSites includes enriched_regex sites with combined HTML
 14   *   7. claude-batch.js: fetchProposalsBatch accepts enriched_llm sites
 15   *
 16   * Run: NODE_ENV=test node --test tests/utils/enriched-status-split.test.js
 17   */
 18  
 19  import { test, describe } from 'node:test';
 20  import assert from 'node:assert/strict';
 21  import { readFileSync } from 'fs';
 22  import { join, dirname } from 'path';
 23  import { fileURLToPath } from 'url';
 24  
 25  const __dirname = dirname(fileURLToPath(import.meta.url));
 26  const projectRoot = join(__dirname, '../..');
 27  
 28  // ─── Source code text tests (no mocking needed) ─────────────────────────────
 29  
 30  describe('enrich.js source: status split', () => {
 31    const src = readFileSync(join(projectRoot, 'src/stages/enrich.js'), 'utf8');
 32  
 33    test('browser pass final UPDATE uses enriched_regex, not enriched', () => {
 34      assert.ok(
 35        src.includes("status = 'enriched_regex',") || src.includes("status = 'enriched_regex'\n"),
 36        'Final UPDATE in enrichSite browser pass should set status=enriched_regex'
 37      );
 38    });
 39  
 40    test('has-contacts-no-pages path uses enriched_regex', () => {
 41      // The "Has some contacts but no contact pages" path should set enriched_regex
 42      // so the orchestrator LLM pass can still run against html_dom
 43      const hasContactsBlock = src.indexOf('Has some contacts but no contact pages');
 44      assert.ok(hasContactsBlock !== -1, 'should have "has contacts, no contact pages" comment');
 45  
 46      // Search in a 800-char window (the UPDATE block is ~600 chars after the comment)
 47      const afterComment = src.slice(hasContactsBlock, hasContactsBlock + 800);
 48      assert.ok(
 49        afterComment.includes("status = 'enriched_regex'"),
 50        'has-contacts-no-pages path should set status=enriched_regex'
 51      );
 52    });
 53  
 54    test('all enrichment paths use enriched_regex (LLM pass always runs)', () => {
 55      // All paths now use enriched_regex so the orchestrator LLM pass always runs.
 56      // (The form fast-path was unified into enriched_regex in a later refactor.)
 57      const enrichedRegexCount = (src.match(/status = 'enriched_regex'/g) || []).length;
 58      assert.ok(
 59        enrichedRegexCount >= 2,
 60        `Should have at least 2 status='enriched_regex' (browser pass + no-pages path), found ${enrichedRegexCount}`
 61      );
 62    });
 63  
 64    test('key_pages_html accumulator is initialised before page loop', () => {
 65      assert.ok(
 66        src.includes('const keyPagesHtml = {};'),
 67        'enrich.js should initialise keyPagesHtml = {} before the page loop'
 68      );
 69    });
 70  
 71    test('key_pages_html is populated during page loop', () => {
 72      assert.ok(
 73        src.includes('keyPagesHtml[pageUrl] = pageData.html'),
 74        'enrich.js should store rendered HTML per page URL into keyPagesHtml'
 75      );
 76    });
 77  
 78    test('key_pages_html is written to filesystem and flagged in UPDATE', () => {
 79      // key_pages_html is now stored on filesystem via writeKeyPagesHtmlFile,
 80      // and the DB column is set to 'fs' sentinel (not JSON inline)
 81      assert.ok(
 82        src.includes('writeKeyPagesHtmlFile'),
 83        'enrich.js should write key pages HTML to filesystem via writeKeyPagesHtmlFile'
 84      );
 85      // PG-style ($N) or SQLite-style (?) positional parameter
 86      assert.ok(
 87        src.includes('key_pages_html = ?') || /key_pages_html\s*=\s*\$\d+/.test(src),
 88        'final UPDATE should include key_pages_html = $N (or ?) column'
 89      );
 90    });
 91  });
 92  
 93  // ─── claude-store.js source: enriched_llm pass ───────────────────────────────
 94  
 95  describe('claude-store.js source: enrichment_pass handling', () => {
 96    const src = readFileSync(join(projectRoot, 'scripts/claude-store.js'), 'utf8');
 97  
 98    test('enrichment_pass=llm sets status=enriched_llm', () => {
 99      assert.ok(
100        src.includes("status = 'enriched_llm'"),
101        "claude-store.js should set status='enriched_llm' for LLM pass"
102      );
103    });
104  
105    test('enrichment_pass=llm clears key_pages_html', () => {
106      // After LLM extraction, key_pages_html should be NULLed to reclaim disk space
107      const llmBlock = src.indexOf("enrichment_pass === 'llm'");
108      assert.ok(llmBlock !== -1, "should have enrichment_pass === 'llm' check");
109  
110      const afterCheck = src.slice(llmBlock, llmBlock + 800);
111      assert.ok(
112        afterCheck.includes('key_pages_html = NULL'),
113        'LLM pass UPDATE should clear key_pages_html = NULL'
114      );
115    });
116  
117    test('enrichment_pass=initial does NOT set enriched_llm', () => {
118      // Initial pass from html_dom should leave status as semantic_scored/vision_scored
119      const llmBlockEnd = src.indexOf("enrichment_pass === 'llm'") + 800;
120      // The else clause starts after the LLM block — check it doesn't contain enriched_llm
121      const elseBlock = src.slice(llmBlockEnd, llmBlockEnd + 600);
122      assert.ok(
123        !elseBlock.includes("status = 'enriched_llm'"),
124        'Initial pass should not set enriched_llm status'
125      );
126      assert.ok(
127        !elseBlock.includes('key_pages_html = NULL'),
128        'Initial pass should not clear key_pages_html'
129      );
130    });
131  
132    test('storeEnrichResult reads enrichment_pass from item', () => {
133      assert.ok(
134        src.includes('item.enrichment_pass'),
135        'storeEnrichResult should check item.enrichment_pass to determine which pass this is'
136      );
137    });
138  });
139  
140  // ─── claude-batch.js source: fetchEnrichSites and proposal queries ───────────
141  
describe('claude-batch.js source: enriched_regex/enriched_llm inclusion', () => {
  // Raw text of claude-batch.js; the tests below are substring probes against it.
  const src = readFileSync(join(projectRoot, 'scripts/claude-batch.js'), 'utf8');

  const FETCH_ENRICH_SITES = 'function fetchEnrichSites';
  const FETCH_PROPOSALS_BATCH = 'function fetchProposalsBatch';

  /**
   * Returns the `length` characters of the source that start at `marker`.
   *
   * Fails the calling test with an explicit message when the marker is absent.
   * Without this check, indexOf returns -1 and slice(-1, ...) yields at most the
   * final character of the file, so the test would fail with a message that
   * blames the query text instead of the missing function.
   *
   * @param {string} marker - text that opens the region of interest
   * @param {number} length - number of characters to include from the marker onward
   * @returns {string} the source window beginning at the marker
   */
  const sourceWindow = (marker, length) => {
    const start = src.indexOf(marker);
    assert.ok(start !== -1, `claude-batch.js should contain "${marker}"`);
    return src.slice(start, start + length);
  };

  test('fetchEnrichSites queries enriched_regex status', () => {
    const enrichSitesFn = sourceWindow(FETCH_ENRICH_SITES, 2000);
    assert.ok(
      enrichSitesFn.includes("'enriched_regex'"),
      'fetchEnrichSites should include enriched_regex in its WHERE clause'
    );
  });

  test('fetchEnrichSites includes key_pages_html in SELECT', () => {
    const enrichSitesFn = sourceWindow(FETCH_ENRICH_SITES, 2000);
    assert.ok(
      enrichSitesFn.includes('key_pages_html'),
      'fetchEnrichSites SELECT should include key_pages_html column'
    );
  });

  test('fetchEnrichSites sets enrichment_pass=llm for enriched_regex sites', () => {
    // Wider window: this mapping appears later in the function than the query text.
    const enrichSitesFn = sourceWindow(FETCH_ENRICH_SITES, 3000);
    assert.ok(
      enrichSitesFn.includes(
        "enrichment_pass: site.status === 'enriched_regex' ? 'llm' : 'initial'"
      ),
      "fetchEnrichSites should mark enriched_regex sites with enrichment_pass='llm'"
    );
  });

  test('fetchEnrichSites combines key_pages_html + html_dom for enriched_regex sites', () => {
    const enrichSitesFn = sourceWindow(FETCH_ENRICH_SITES, 3000);
    assert.ok(
      enrichSitesFn.includes("site.status === 'enriched_regex' && keyPagesHtml"),
      'fetchEnrichSites should use combined key_pages_html + html_dom for enriched_regex'
    );
  });

  test('fetchProposalsBatch accepts enriched_llm sites', () => {
    const proposalsFn = sourceWindow(FETCH_PROPOSALS_BATCH, 1500);
    assert.ok(
      proposalsFn.includes("'enriched_llm'"),
      "fetchProposalsBatch WHERE clause should include 'enriched_llm'"
    );
  });

  test('queue_depths.enriched count includes enriched_regex and enriched_llm', () => {
    // Checked against the whole file rather than a single function window.
    assert.ok(
      src.includes("status IN ('enriched','enriched_regex','enriched_llm')"),
      'queue_depths.enriched count should include enriched_regex and enriched_llm'
    );
  });
});
209  
210  // ─── proposals.js: enriched_llm acceptance ───────────────────────────────────
211  
describe('proposals.js source: enriched_llm acceptance', () => {
  // Raw text of the proposals stage; both tests probe the whole file for quoted status names.
  const proposalsSource = readFileSync(join(projectRoot, 'src/stages/proposals.js'), 'utf8');

  test('proposal stage fetches enriched_llm sites', () => {
    const mentionsEnrichedLlm = proposalsSource.includes("'enriched_llm'");
    assert.ok(mentionsEnrichedLlm, "proposals.js should include 'enriched_llm' in its WHERE clause");
  });

  test('proposal stage re-queue check includes enriched_llm', () => {
    // The re-queue UPDATE is expected to list both statuses. These are whole-file
    // substring checks for the quoted literals 'enriched_llm' and 'enriched';
    // they are not scoped to the UPDATE statement itself.
    const expectations = [
      ["'enriched_llm'", 'proposals.js re-queue UPDATE should target enriched_llm sites'],
      ["'enriched'", "proposals.js re-queue UPDATE should target 'enriched' sites"],
    ];
    for (const [quotedStatus, failureMessage] of expectations) {
      assert.ok(proposalsSource.includes(quotedStatus), failureMessage);
    }
  });
});
235  
236  // ─── proposal-generator-v2.js and templates ──────────────────────────────────
237  
describe('proposal-generator-v2.js and templates: enriched_llm acceptance', () => {
  // Both generator modules live under src/ and get the same substring check,
  // so one test is registered per file name, in this order.
  const generatorFiles = ['proposal-generator-v2.js', 'proposal-generator-templates.js'];

  for (const fileName of generatorFiles) {
    test(`${fileName} fetchSites includes enriched_llm`, () => {
      // Read inside the test so a missing file fails only that file's test.
      const generatorSource = readFileSync(join(projectRoot, `src/${fileName}`), 'utf8');
      assert.ok(
        generatorSource.includes("'enriched_llm'"),
        `${fileName} should include 'enriched_llm' in its status filter`
      );
    });
  }
});
255  
256  // ─── pipeline-service.js: surplus detection maps to enriched_regex ───────────
257  
describe('pipeline-service.js: surplus detection uses enriched_regex', () => {
  // Raw text of the pipeline service; the test probes it for the stage-to-status mapping entry.
  const serviceSource = readFileSync(join(projectRoot, 'src/pipeline-service.js'), 'utf8');

  test('STAGE_OUTPUT_STATUS.enrich maps to enriched_regex', () => {
    const hasEnrichMapping = serviceSource.includes("enrich: 'enriched_regex'");
    assert.ok(
      hasEnrichMapping,
      "STAGE_OUTPUT_STATUS.enrich should map to 'enriched_regex' (the main output of the pipeline enrich stage)"
    );
  });
});
268  
269  // ─── cleanup-html-dom.js: includes enriched_llm ──────────────────────────────
270  
describe('cleanup-html-dom.js: enriched_llm sites get html_dom cleaned', () => {
  // Raw text of the cleanup cron job; the test probes it for the quoted status name.
  const cleanupSource = readFileSync(join(projectRoot, 'src/cron/cleanup-html-dom.js'), 'utf8');

  test('cleanup includes enriched_llm in post-enrichment status list', () => {
    const listsEnrichedLlm = cleanupSource.includes("'enriched_llm'");
    assert.ok(
      listsEnrichedLlm,
      "cleanup-html-dom.js should include 'enriched_llm' so html_dom is cleaned after LLM pass"
    );
  });
});