// enriched-status-split.test.js
/**
 * Tests for the enriched status split:
 *   enriched_regex — pipeline browser pass complete (contact pages scraped, regex extracted)
 *   enriched_llm   — orchestrator LLM pass complete (key_pages_html processed, key_pages_html cleared)
 *   enriched       — fast-path status for sites that already have a contact form.
 *                    NOTE(review): a test comment below says this fast-path was later
 *                    unified into enriched_regex; downstream queries still accept
 *                    'enriched' — confirm whether enrich.js still writes it.
 *
 * These are source-text tests: each one reads a project file and asserts that a
 * specific snippet is present (or absent) in it. Nothing is executed or mocked.
 *
 * Covers:
 *   1. enrich.js: browser pass sets status='enriched_regex' and stores key_pages_html
 *   2. enrich.js: "has contacts, no contact pages" path sets status='enriched_regex'
 *   3. enrich.js: at least two code paths set status='enriched_regex'
 *   4. claude-store.js: enrichment_pass='llm' → sets enriched_llm, clears key_pages_html
 *   5. claude-store.js: enrichment_pass='initial' → keeps semantic_scored/vision_scored status
 *   6. claude-batch.js: fetchEnrichSites includes enriched_regex sites with combined HTML
 *   7. claude-batch.js: fetchProposalsBatch accepts enriched_llm sites
 *   8. proposals.js / proposal-generator-v2.js / proposal-generator-templates.js:
 *      status filters mention enriched_llm
 *   9. pipeline-service.js: STAGE_OUTPUT_STATUS.enrich maps to enriched_regex
 *  10. cleanup-html-dom.js: status list mentions enriched_llm
 *
 * Run: NODE_ENV=test node --test tests/utils/enriched-status-split.test.js
 */

import { test, describe } from 'node:test';
import assert from 'node:assert/strict';
import { readFileSync } from 'fs';
import { join, dirname } from 'path';
import { fileURLToPath } from 'url';

const __dirname = dirname(fileURLToPath(import.meta.url));
const projectRoot = join(__dirname, '../..');

// Markers that several tests search for in the inspected sources.
const FETCH_ENRICH_SITES = 'function fetchEnrichSites';
const FETCH_PROPOSALS_BATCH = 'function fetchProposalsBatch';
const LLM_PASS_CHECK = "enrichment_pass === 'llm'";

// Size (in characters) of the window inspected after the LLM-pass check.
const LLM_BLOCK_WINDOW = 800;

/**
 * Read a project file as UTF-8 text.
 *
 * @param {string} relativePath - Path relative to the project root.
 * @returns {string} The file contents.
 * @throws {Error} Whatever `readFileSync` throws (e.g. when the file is missing).
 */
function readSource(relativePath) {
  return readFileSync(join(projectRoot, relativePath), 'utf8');
}

/**
 * Find `marker` in `src` and fail the current test if it is absent.
 *
 * Without this guard, `indexOf` returning -1 would silently shift every
 * subsequent `slice` onto an unrelated region of the file.
 *
 * @param {string} src - Source text to search.
 * @param {string} marker - Literal text to locate.
 * @param {string} [message] - Assertion message used when the marker is missing.
 * @returns {number} Index of the first occurrence of `marker`.
 * @throws {assert.AssertionError} When `marker` does not occur in `src`.
 */
function indexOfMarker(
  src,
  marker,
  message = `source should contain ${JSON.stringify(marker)}`
) {
  const index = src.indexOf(marker);
  assert.notStrictEqual(index, -1, message);
  return index;
}

/**
 * Return the `length`-character window of `src` that starts at `marker`.
 *
 * @param {string} src - Source text to search.
 * @param {string} marker - Literal text marking the start of the window.
 * @param {number} length - Maximum number of characters to return.
 * @param {string} [message] - Assertion message used when the marker is missing.
 * @returns {string} The window, beginning with `marker` itself.
 * @throws {assert.AssertionError} When `marker` does not occur in `src`.
 */
function windowAfter(src, marker, length, message) {
  const start = indexOfMarker(src, marker, message);
  return src.slice(start, start + length);
}

// ─── Source code text tests (no mocking needed) ─────────────────────────────

describe('enrich.js source: status split', () => {
  const src = readSource('src/stages/enrich.js');

  test('browser pass final UPDATE uses enriched_regex, not enriched', () => {
    assert.ok(
      src.includes("status = 'enriched_regex',") || src.includes("status = 'enriched_regex'\n"),
      'Final UPDATE in enrichSite browser pass should set status=enriched_regex'
    );
  });

  test('has-contacts-no-pages path uses enriched_regex', () => {
    // The "Has some contacts but no contact pages" path should set enriched_regex
    // so the orchestrator LLM pass can still run against html_dom.
    // Search in a 800-char window (the UPDATE block is ~600 chars after the comment).
    const afterComment = windowAfter(
      src,
      'Has some contacts but no contact pages',
      800,
      'should have "has contacts, no contact pages" comment'
    );
    assert.ok(
      afterComment.includes("status = 'enriched_regex'"),
      'has-contacts-no-pages path should set status=enriched_regex'
    );
  });

  test('all enrichment paths use enriched_regex (LLM pass always runs)', () => {
    // All paths now use enriched_regex so the orchestrator LLM pass always runs.
    // (The form fast-path was unified into enriched_regex in a later refactor.)
    const enrichedRegexCount = (src.match(/status = 'enriched_regex'/g) || []).length;
    assert.ok(
      enrichedRegexCount >= 2,
      `Should have at least 2 status='enriched_regex' (browser pass + no-pages path), found ${enrichedRegexCount}`
    );
  });

  test('key_pages_html accumulator is initialised before page loop', () => {
    assert.ok(
      src.includes('const keyPagesHtml = {};'),
      'enrich.js should initialise keyPagesHtml = {} before the page loop'
    );
  });

  test('key_pages_html is populated during page loop', () => {
    assert.ok(
      src.includes('keyPagesHtml[pageUrl] = pageData.html'),
      'enrich.js should store rendered HTML per page URL into keyPagesHtml'
    );
  });

  test('key_pages_html is written to filesystem and flagged in UPDATE', () => {
    // key_pages_html is now stored on filesystem via writeKeyPagesHtmlFile,
    // and the DB column is set to 'fs' sentinel (not JSON inline).
    assert.ok(
      src.includes('writeKeyPagesHtmlFile'),
      'enrich.js should write key pages HTML to filesystem via writeKeyPagesHtmlFile'
    );
    // PG-style ($N) or SQLite-style (?) positional parameter.
    assert.ok(
      src.includes('key_pages_html = ?') || /key_pages_html\s*=\s*\$\d+/.test(src),
      'final UPDATE should include key_pages_html = $N (or ?) column'
    );
  });
});

// ─── claude-store.js source: enriched_llm pass ───────────────────────────────

describe('claude-store.js source: enrichment_pass handling', () => {
  const src = readSource('scripts/claude-store.js');

  test('enrichment_pass=llm sets status=enriched_llm', () => {
    assert.ok(
      src.includes("status = 'enriched_llm'"),
      "claude-store.js should set status='enriched_llm' for LLM pass"
    );
  });

  test('enrichment_pass=llm clears key_pages_html', () => {
    // After LLM extraction, key_pages_html should be NULLed to reclaim disk space.
    const afterCheck = windowAfter(
      src,
      LLM_PASS_CHECK,
      LLM_BLOCK_WINDOW,
      "should have enrichment_pass === 'llm' check"
    );
    assert.ok(
      afterCheck.includes('key_pages_html = NULL'),
      'LLM pass UPDATE should clear key_pages_html = NULL'
    );
  });

  test('enrichment_pass=initial does NOT set enriched_llm', () => {
    // Initial pass from html_dom should leave status as semantic_scored/vision_scored.
    // The marker must exist: otherwise the offsets below would point at an
    // unrelated part of the file and the negative assertions could pass vacuously.
    const llmBlockStart = indexOfMarker(
      src,
      LLM_PASS_CHECK,
      "should have enrichment_pass === 'llm' check"
    );
    const llmBlockEnd = llmBlockStart + LLM_BLOCK_WINDOW;
    // The else clause starts after the LLM block — check it doesn't contain enriched_llm.
    const elseBlock = src.slice(llmBlockEnd, llmBlockEnd + 600);
    assert.ok(
      !elseBlock.includes("status = 'enriched_llm'"),
      'Initial pass should not set enriched_llm status'
    );
    assert.ok(
      !elseBlock.includes('key_pages_html = NULL'),
      'Initial pass should not clear key_pages_html'
    );
  });

  test('storeEnrichResult reads enrichment_pass from item', () => {
    assert.ok(
      src.includes('item.enrichment_pass'),
      'storeEnrichResult should check item.enrichment_pass to determine which pass this is'
    );
  });
});

// ─── claude-batch.js source: fetchEnrichSites and proposal queries ───────────

describe('claude-batch.js source: enriched_regex/enriched_llm inclusion', () => {
  const src = readSource('scripts/claude-batch.js');

  test('fetchEnrichSites queries enriched_regex status', () => {
    const enrichSitesFn = windowAfter(
      src,
      FETCH_ENRICH_SITES,
      2000,
      'claude-batch.js should define function fetchEnrichSites'
    );
    assert.ok(
      enrichSitesFn.includes("'enriched_regex'"),
      'fetchEnrichSites should include enriched_regex in its WHERE clause'
    );
  });

  test('fetchEnrichSites includes key_pages_html in SELECT', () => {
    const enrichSitesFn = windowAfter(
      src,
      FETCH_ENRICH_SITES,
      2000,
      'claude-batch.js should define function fetchEnrichSites'
    );
    assert.ok(
      enrichSitesFn.includes('key_pages_html'),
      'fetchEnrichSites SELECT should include key_pages_html column'
    );
  });

  test('fetchEnrichSites sets enrichment_pass=llm for enriched_regex sites', () => {
    const enrichSitesFn = windowAfter(
      src,
      FETCH_ENRICH_SITES,
      3000,
      'claude-batch.js should define function fetchEnrichSites'
    );
    assert.ok(
      enrichSitesFn.includes(
        "enrichment_pass: site.status === 'enriched_regex' ? 'llm' : 'initial'"
      ),
      "fetchEnrichSites should mark enriched_regex sites with enrichment_pass='llm'"
    );
  });

  test('fetchEnrichSites combines key_pages_html + html_dom for enriched_regex sites', () => {
    const enrichSitesFn = windowAfter(
      src,
      FETCH_ENRICH_SITES,
      3000,
      'claude-batch.js should define function fetchEnrichSites'
    );
    assert.ok(
      enrichSitesFn.includes("site.status === 'enriched_regex' && keyPagesHtml"),
      'fetchEnrichSites should use combined key_pages_html + html_dom for enriched_regex'
    );
  });

  test('fetchProposalsBatch accepts enriched_llm sites', () => {
    const proposalsFn = windowAfter(
      src,
      FETCH_PROPOSALS_BATCH,
      1500,
      'claude-batch.js should define function fetchProposalsBatch'
    );
    assert.ok(
      proposalsFn.includes("'enriched_llm'"),
      "fetchProposalsBatch WHERE clause should include 'enriched_llm'"
    );
  });

  test('queue_depths.enriched count includes enriched_regex and enriched_llm', () => {
    assert.ok(
      src.includes("status IN ('enriched','enriched_regex','enriched_llm')"),
      'queue_depths.enriched count should include enriched_regex and enriched_llm'
    );
  });
});

// ─── proposals.js: enriched_llm acceptance ───────────────────────────────────

describe('proposals.js source: enriched_llm acceptance', () => {
  const src = readSource('src/stages/proposals.js');

  test('proposal stage fetches enriched_llm sites', () => {
    assert.ok(
      src.includes("'enriched_llm'"),
      "proposals.js should include 'enriched_llm' in its WHERE clause"
    );
  });

  test('proposal stage re-queue check includes enriched_llm', () => {
    // The UPDATE that fires the re-queue must include enriched_llm
    // (status list may also include enriched_regex — check both values are present).
    assert.ok(
      src.includes("'enriched_llm'"),
      'proposals.js re-queue UPDATE should target enriched_llm sites'
    );
    assert.ok(
      src.includes("'enriched'"),
      "proposals.js re-queue UPDATE should target 'enriched' sites"
    );
  });
});

// ─── proposal-generator-v2.js and templates ──────────────────────────────────

describe('proposal-generator-v2.js and templates: enriched_llm acceptance', () => {
  test('proposal-generator-v2.js fetchSites includes enriched_llm', () => {
    const src = readSource('src/proposal-generator-v2.js');
    assert.ok(
      src.includes("'enriched_llm'"),
      "proposal-generator-v2.js should include 'enriched_llm' in its status filter"
    );
  });

  test('proposal-generator-templates.js fetchSites includes enriched_llm', () => {
    const src = readSource('src/proposal-generator-templates.js');
    assert.ok(
      src.includes("'enriched_llm'"),
      "proposal-generator-templates.js should include 'enriched_llm' in its status filter"
    );
  });
});

// ─── pipeline-service.js: surplus detection maps to enriched_regex ───────────

describe('pipeline-service.js: surplus detection uses enriched_regex', () => {
  const src = readSource('src/pipeline-service.js');

  test('STAGE_OUTPUT_STATUS.enrich maps to enriched_regex', () => {
    assert.ok(
      src.includes("enrich: 'enriched_regex'"),
      "STAGE_OUTPUT_STATUS.enrich should map to 'enriched_regex' (the main output of the pipeline enrich stage)"
    );
  });
});

// ─── cleanup-html-dom.js: includes enriched_llm ──────────────────────────────

describe('cleanup-html-dom.js: enriched_llm sites get html_dom cleaned', () => {
  const src = readSource('src/cron/cleanup-html-dom.js');

  test('cleanup includes enriched_llm in post-enrichment status list', () => {
    assert.ok(
      src.includes("'enriched_llm'"),
      "cleanup-html-dom.js should include 'enriched_llm' so html_dom is cleaned after LLM pass"
    );
  });
});