enrich.test.js
/**
 * Enrichment Stage Unit Tests
 *
 * Drives runEnrichmentStage() with every collaborator replaced by a mock:
 *   - better-sqlite3: MockDatabase class with SQL-keyword matching
 *   - ../utils/stealth-browser.js: mock browser launch/context/page
 *   - ../utils/llm-provider.js: controlled LLM responses
 *   - ../utils/error-handler.js: pass-through processBatch, safeJsonParse
 *   - ../utils/summary-generator.js, adaptive-concurrency.js, site-filters.js
 *   - ../utils/gdpr-verification.js, config/countries.js
 *   - ../utils/retry-handler.js, tld-detector.js, phone-normalizer.js
 *   - ../contacts/prioritize.js
 *   - fs (readFileSync for ENRICHMENT_PROMPT)
 *   - globalThis.fetch (discoverContactPagesFromSitemap uses fetch)
 *
 * MUST be run with --experimental-test-module-mocks.
 * Run: NODE_ENV=test LOGS_DIR=/tmp/test-logs DATABASE_PATH=/tmp/test-sites.db \
 *      node --experimental-test-module-mocks --test tests/stages/enrich.test.js
 */

import { test, describe, mock, beforeEach } from 'node:test';
import assert from 'node:assert/strict';
import * as realFs from 'node:fs';
import { createPgMock } from '../helpers/pg-mock.js'; // eslint-disable-line no-unused-vars

// ═══════════════════════════════════════════════════════════════
// Environment setup BEFORE any mock.module() or imports
// ═══════════════════════════════════════════════════════════════
Object.assign(process.env, {
  NODE_ENV: 'test',
  LOGS_DIR: '/tmp/test-logs',
  DATABASE_PATH: '/tmp/test-enrich.db',
  ENABLE_VISION: 'true',
  ENRICHMENT_CONCURRENCY: '1',
});

// ═══════════════════════════════════════════════════════════════
// Mock globalThis.fetch to prevent real network calls from
// discoverContactPagesFromSitemap (which fetches /sitemap.xml)
// ═══════════════════════════════════════════════════════════════
// Always answers with a non-ok 404 so the sitemap fallback returns [].
globalThis.fetch = async () => ({
  ok: false,
  status: 404,
  text: async () => '',
});

// ═══════════════════════════════════════════════════════════════
// Shared mutable state (mutated per-test in resetState/beforeEach)
// ═══════════════════════════════════════════════════════════════

// DB state
let mockSiteRows = [];
let runCalls = [];
let getCalls = [];
let dbClosed = false;

// Browser state — default implementations stored for reset
let browserCloseCalled = false;
let mockHtml = '<html><body>Contact: info@example.com</body></html>';
let mockScreenshot = Buffer.from('png_data');

// LLM state
let mockLLMResponse = null;
let llmCallCount = 0;

// Blocklist state — default: not blocked
let mockBlocklistResult = null;

// Country lookup state: only GB (GDPR) and AU (non-GDPR) are known by default
let mockCountryByCode = code => {
  switch (code) {
    case 'GB':
      return { code: 'GB', requiresGDPRCheck: true };
    case 'AU':
      return { code: 'AU', requiresGDPRCheck: false };
    default:
      return null;
  }
};

// TLD detector state
let mockParseCountryFromGoogleDomain = _domain => 'AU';

// ═══════════════════════════════════════════════════════════════
// 1. fs — mock readFileSync for the ENRICHMENT_PROMPT load
// ═══════════════════════════════════════════════════════════════
const mockReadFileSync = mock.fn((_path, _enc) => 'MOCK ENRICHMENT PROMPT');

// Everything from the real fs is passed through except the two overrides below.
const fsNamedExports = {
  ...realFs,
  readFileSync: mockReadFileSync,
  existsSync: () => false,
};
mock.module('fs', { namedExports: fsNamedExports });

// ═══════════════════════════════════════════════════════════════
// 2. 
// better-sqlite3 — full MockDatabase
// ═══════════════════════════════════════════════════════════════

/**
 * Prepared-statement stand-in. Behaviour is chosen by substring-matching
 * the SQL text it was created with; calls to get()/run() are recorded in
 * the shared getCalls/runCalls arrays.
 */
class MockStatement {
  constructor(sql) {
    this.sql = sql;
  }

  /** Returns mockSiteRows for the main enrichment sites query, otherwise []. */
  all(..._args) {
    const isSitesQuery =
      this.sql.includes('FROM sites') && this.sql.includes('enriched_at IS NULL');
    return isSitesQuery ? mockSiteRows : [];
  }

  /** Answers the google_domain lookup and the stats query; null for anything else. */
  get(...args) {
    getCalls.push({ sql: this.sql, args });

    // google_domain lookup (called during country mismatch check)
    if (this.sql.includes('google_domain')) {
      const [siteId] = args;
      const match = mockSiteRows.find(row => row.id === siteId);
      return match ? { google_domain: match.google_domain || 'google.com.au' } : null;
    }

    // Stats query (getEnrichmentStats)
    if (this.sql.includes('enriched_at IS NOT NULL')) {
      return { total_enriched: 5, with_forms: 2, with_emails: 4, with_phones: 1 };
    }

    return null;
  }

  /** Records the write and reports a single changed row. */
  run(...args) {
    runCalls.push({ sql: this.sql, args });
    return { changes: 1, lastInsertRowid: 1 };
  }
}

/**
 * Database stand-in: hands out MockStatement instances and tracks
 * open/closed state through the shared dbClosed flag.
 */
class MockDatabase {
  constructor(_path) {
    dbClosed = false;
  }

  prepare(sql) {
    return new MockStatement(sql);
  }

  // pragma() and exec() are accepted and ignored.
  pragma() {}

  exec() {}

  /** Returns a wrapper that simply invokes fn with the given arguments. */
  transaction(fn) {
    return (...args) => fn(...args);
  }

  close() {
    dbClosed = true;
  }
}

mock.module('better-sqlite3', { defaultExport: MockDatabase });

// ═══════════════════════════════════════════════════════════════
// 2b. 
// db.js mock — enrich.js uses db.js (PostgreSQL) not better-sqlite3
// ═══════════════════════════════════════════════════════════════

// Row for the google_domain lookup, or null when the site id is unknown.
const googleDomainRowFor = siteId => {
  const match = mockSiteRows.find(row => row.id === siteId);
  return match ? { google_domain: match.google_domain || 'google.com.au' } : null;
};

// True for statements the mock treats as reads (expects already-trimmed SQL).
const isReadOnlySql = sql => sql.startsWith('SELECT') || sql.startsWith('WITH');

mock.module('../../src/utils/db.js', {
  namedExports: {
    getPool: () => ({}),

    // Main site query returns the per-test rows; every other query is empty.
    getAll: async sql =>
      sql.includes('FROM sites') && sql.includes('enriched_at IS NULL') ? mockSiteRows : [],

    getOne: async (sql, params) => {
      // google_domain lookup
      if (sql.includes('google_domain')) {
        return googleDomainRowFor(params?.[0]);
      }
      // Stats query (getEnrichmentStats)
      if (sql.includes('enriched_at IS NOT NULL')) {
        return { total_enriched: 5, with_forms: 2, with_emails: 4, with_phones: 1 };
      }
      return null;
    },

    // Writes are recorded in runCalls so tests can inspect the SQL.
    run: async (sql, params) => {
      runCalls.push({ sql: sql.trim(), args: params || [] });
      return { changes: 1, lastInsertRowid: 1 };
    },

    query: async (sql, params) => {
      const statement = sql.trim();
      if (statement.includes('google_domain')) {
        const row = googleDomainRowFor(params?.[0]);
        return row ? { rows: [row], rowCount: 1 } : { rows: [], rowCount: 0 };
      }
      if (isReadOnlySql(statement)) {
        return { rows: [], rowCount: 0 };
      }
      runCalls.push({ sql: statement, args: params || [] });
      return { rows: [], rowCount: 1 };
    },

    // The fake client records every statement (reads included) in runCalls.
    withTransaction: async fn => {
      const fakeClient = {
        query: async (sql, params) => {
          const statement = sql.trim();
          runCalls.push({ sql: statement, args: params || [] });
          return { rows: [], rowCount: isReadOnlySql(statement) ? 0 : 1 };
        },
      };
      return await fn(fakeClient);
    },

    closePool: async () => {},
    createDatabaseConnection: () => ({}),
    closeDatabaseConnection: async () => {},
  },
});

// ═══════════════════════════════════════════════════════════════
// 3. Logger — silent no-op
// ═══════════════════════════════════════════════════════════════
class MockLogger {
  info() {}
  success() {}
  error() {}
  warn() {}
  debug() {}
}

mock.module('../../src/utils/logger.js', { defaultExport: MockLogger });

// ═══════════════════════════════════════════════════════════════
// 4. Stealth browser — mock browser/context/page chain
//    Default implementations are assigned in resetState() so
//    individual tests can override them without leaking into others.
// ═══════════════════════════════════════════════════════════════

// These mock fns are created once; their implementations are reset per-test.

// Page level
const mockPageGoto = mock.fn(async () => {});
const mockPageContent = mock.fn(async () => mockHtml);
const mockPageScreenshot = mock.fn(async () => mockScreenshot);
const mockPageClose = mock.fn(async () => {});
const mockPageWaitForLoadState = mock.fn(async () => {});
const mockPage = {
  goto: mockPageGoto,
  content: mockPageContent,
  screenshot: mockPageScreenshot,
  close: mockPageClose,
  waitForLoadState: mockPageWaitForLoadState,
};

// Context level
const mockContextNewPage = mock.fn(async () => mockPage);
const mockContextClose = mock.fn(async () => {});
const mockContext = { newPage: mockContextNewPage, close: mockContextClose };

// Browser level — closing flips the shared browserCloseCalled flag
const mockBrowserClose = mock.fn(async () => {
  browserCloseCalled = true;
});
const mockBrowser = { close: mockBrowserClose };

// Module-level helpers exported by stealth-browser.js
const launchStealthBrowserMock = mock.fn(async () => mockBrowser);
const createStealthContextMock = mock.fn(async () => mockContext);
const humanScrollMock = mock.fn(async () => {});
const randomDelayMock = mock.fn(async () => {});
const isSocialMediaUrlMock = mock.fn(() => false);
const waitForCloudflareMock = mock.fn(async () => true);

mock.module('../../src/utils/stealth-browser.js', {
  namedExports: {
    launchStealthBrowser: launchStealthBrowserMock,
    createStealthContext: createStealthContextMock,
    humanScroll: humanScrollMock,
    randomDelay: randomDelayMock,
    isSocialMediaUrl: isSocialMediaUrlMock,
    waitForCloudflare: waitForCloudflareMock,
  },
});

// ═══════════════════════════════════════════════════════════════
// 5. LLM provider
// ═══════════════════════════════════════════════════════════════
// Counts every call; answers with mockLLMResponse when a test has set one,
// otherwise with a canned single-email extraction.
const callLLMMock = mock.fn(async () => {
  llmCallCount++;
  if (mockLLMResponse) {
    return mockLLMResponse;
  }
  const cannedContacts = {
    business_name: 'Test Corp',
    email_addresses: [{ email: 'info@testcorp.com', label: 'General' }],
    phone_numbers: [],
    social_profiles: [],
    key_pages: [],
    primary_contact_form: null,
  };
  return {
    content: JSON.stringify(cannedContacts),
    usage: { promptTokens: 100, completionTokens: 50 },
  };
});

const getProviderMock = mock.fn(() => 'openrouter');

mock.module('../../src/utils/llm-provider.js', {
  namedExports: { callLLM: callLLMMock, getProvider: getProviderMock },
});

// ═══════════════════════════════════════════════════════════════
// 6. LLM usage tracker
// ═══════════════════════════════════════════════════════════════
mock.module('../../src/utils/llm-usage-tracker.js', {
  namedExports: { logLLMUsage: mock.fn(() => {}) },
});

// ═══════════════════════════════════════════════════════════════
// 7. 
// error-handler — real safeJsonParse logic, pass-through processBatch
// ═══════════════════════════════════════════════════════════════

// Runs the processor over the items one at a time. A thrown error is
// collected and leaves a null placeholder in the results.
const processBatchMock = mock.fn(async (items, processor, _opts) => {
  const results = [];
  const errors = [];
  let index = 0;
  while (index < items.length) {
    try {
      results.push(await processor(items[index], index));
    } catch (err) {
      errors.push(err);
      results.push(null);
    }
    index += 1;
  }
  return { results, errors };
});

// Falsy input or malformed JSON yields the fallback.
const safeJsonParseMock = mock.fn((str, fallback = null) => {
  if (!str) {
    return fallback;
  }
  try {
    return JSON.parse(str);
  } catch {
    return fallback;
  }
});

mock.module('../../src/utils/error-handler.js', {
  namedExports: {
    processBatch: processBatchMock,
    safeJsonParse: safeJsonParseMock,
    // Single attempt, no backoff.
    retryWithBackoff: mock.fn(async fn => fn()),
  },
});

// ═══════════════════════════════════════════════════════════════
// 8. summary-generator — no-op
// ═══════════════════════════════════════════════════════════════
const summaryGeneratorExports = {
  generateStageCompletion: mock.fn(() => {}),
  displayProgress: mock.fn(() => {}),
};
mock.module('../../src/utils/summary-generator.js', { namedExports: summaryGeneratorExports });

// ═══════════════════════════════════════════════════════════════
// 9. adaptive-concurrency — return fixed value
// ═══════════════════════════════════════════════════════════════
const adaptiveConcurrencyExports = {
  getAdaptiveConcurrencyFast: mock.fn(() => 1),
  getAdaptiveConcurrency: mock.fn(() => 1),
  isScreenActive: mock.fn(() => false),
};
mock.module('../../src/utils/adaptive-concurrency.js', {
  namedExports: adaptiveConcurrencyExports,
});

// ═══════════════════════════════════════════════════════════════
// 10. 
// site-filters — configurable per-test via mockBlocklistResult
// ═══════════════════════════════════════════════════════════════
// Returns whatever the current test stored in mockBlocklistResult.
const checkBlocklistMock = mock.fn((_domain, _country) => mockBlocklistResult);

const siteFilterExports = {
  checkBlocklist: checkBlocklistMock,
  DIRECTORY_DOMAINS: [],
  SOCIAL_MEDIA_DOMAINS: [],
  DEMO_EMAIL_DOMAINS: [],
  loadFranchiseDomains: mock.fn(() => []),
  isGovernmentDomain: mock.fn(() => false),
  isEducationDomain: mock.fn(() => false),
  isNonCommercialDomain: mock.fn(() => false),
  isDemoEmail: mock.fn(() => false),
  isGovernmentEmail: mock.fn(() => false),
};
mock.module('../../src/utils/site-filters.js', { namedExports: siteFilterExports });

// ═══════════════════════════════════════════════════════════════
// 11. gdpr-verification
// ═══════════════════════════════════════════════════════════════
// Always reports exactly one verified, high-confidence result.
const batchVerifyEmailsMock = mock.fn(() => [
  { isVerified: true, confidence: 'high', reason: 'Company domain' },
]);

const gdprVerificationExports = {
  verifyCompanyEmail: mock.fn(() => ({
    isVerified: true,
    confidence: 'high',
    reason: 'Company domain',
  })),
  batchVerifyEmails: batchVerifyEmailsMock,
  isFreeEmailProvider: mock.fn(() => false),
  searchCompanyTypes: mock.fn(() => []),
  searchCompanyKeywords: mock.fn(() => []),
  getKeyPageNames: mock.fn(() => []),
};
mock.module('../../src/utils/gdpr-verification.js', { namedExports: gdprVerificationExports });

// ═══════════════════════════════════════════════════════════════
// 12. 
// countries.js — configurable via mockCountryByCode
// ═══════════════════════════════════════════════════════════════
// Delegates to the current mockCountryByCode so tests can swap the lookup.
const getCountryByCodeMock = mock.fn(code => mockCountryByCode(code));

const countriesExports = {
  getCountryByCode: getCountryByCodeMock,
  getCountryByGoogleDomain: mock.fn(() => null),
  normaliseCountryCode: mock.fn(code => code),
  COUNTRIES: {},
  FREE_EMAIL_PROVIDERS: [],
  isFreeEmailProvider: mock.fn(() => false),
  getSupportedCountries: mock.fn(() => []),
  getGDPRCountries: mock.fn(() => []),
  isMobileNumber: mock.fn(() => false),
};
mock.module('../../src/config/countries.js', { namedExports: countriesExports });

// ═══════════════════════════════════════════════════════════════
// 13. retry-handler
// ═══════════════════════════════════════════════════════════════
const recordFailureMock = mock.fn(() => {});
const resetRetriesMock = mock.fn(() => {});

const retryHandlerExports = {
  recordFailure: recordFailureMock,
  resetRetries: resetRetriesMock,
  getRetryStats: mock.fn(() => ({})),
};
mock.module('../../src/utils/retry-handler.js', { namedExports: retryHandlerExports });

// ═══════════════════════════════════════════════════════════════
// 14. tld-detector — configurable via mockParseCountryFromGoogleDomain
// ═══════════════════════════════════════════════════════════════
// Delegates to the current mockParseCountryFromGoogleDomain.
const parseCountryFromGoogleDomainMock = mock.fn(domain =>
  mockParseCountryFromGoogleDomain(domain)
);

const tldDetectorExports = {
  parseCountryFromGoogleDomain: parseCountryFromGoogleDomainMock,
  detectCountryFromTLD: mock.fn(() => null),
};
mock.module('../../src/utils/tld-detector.js', { namedExports: tldDetectorExports });

// ═══════════════════════════════════════════════════════════════
// 15. 
// phone-normalizer — identity transform
// ═══════════════════════════════════════════════════════════════
const normalizePhoneNumberMock = mock.fn(p => p);

const phoneNormalizerExports = {
  normalizePhoneNumber: normalizePhoneNumberMock,
  normalizePhoneNumbers: mock.fn(ps => ps),
  addCountryCode: mock.fn(p => p),
  isFakeNumber: mock.fn(() => false),
  isValidSmsNumber: mock.fn(() => ({ valid: true })),
};
mock.module('../../src/utils/phone-normalizer.js', { namedExports: phoneNormalizerExports });

// ═══════════════════════════════════════════════════════════════
// 16. contacts/prioritize.js
// ═══════════════════════════════════════════════════════════════
// Passes contacts through untouched; a falsy input becomes an empty object.
const cleanInvalidSocialLinksMock = mock.fn(contacts => contacts || {});

const prioritizeExports = {
  cleanInvalidSocialLinks: cleanInvalidSocialLinksMock,
  getAllContacts: mock.fn(() => []),
  parseAvailableChannels: mock.fn(() => []),
  prioritizeContacts: mock.fn(() => []),
  updateOutreachContacts: mock.fn(() => {}),
  bulkUpdateOutreachContacts: mock.fn(() => {}),
  getOutreachReadinessReport: mock.fn(() => ({})),
  getAllContactsWithNames: mock.fn(async () => []),
};
mock.module('../../src/contacts/prioritize.js', {
  namedExports: prioritizeExports,
  defaultExport: {},
});

// ═══════════════════════════════════════════════════════════════
// 17. 
// contacts-storage — use site row fallback, avoid real filesystem
// ═══════════════════════════════════════════════════════════════
const setContactsJsonMock = mock.fn(() => {});

// Parses the row's contacts_json; a missing value or malformed JSON gives null.
const parseContactsOrNull = dbRow => {
  const serialized = dbRow?.contacts_json;
  if (!serialized) {
    return null;
  }
  try {
    return JSON.parse(serialized);
  } catch {
    return null;
  }
};

const contactsStorageExports = {
  getContactsJson: mock.fn(() => null),
  getContactsData: mock.fn(() => null),
  setContactsJson: setContactsJsonMock,
  deleteContactsJson: mock.fn(() => false),
  hasContactsJson: mock.fn(() => false),
  getContactsJsonWithFallback: mock.fn((siteId, dbRow) => dbRow?.contacts_json || null),
  getContactsDataWithFallback: mock.fn((siteId, dbRow) => parseContactsOrNull(dbRow)),
  DATA_DIR: '/tmp/test-contacts',
};
mock.module('../../src/utils/contacts-storage.js', { namedExports: contactsStorageExports });

// ═══════════════════════════════════════════════════════════════
// 18. html-storage — return html_dom from mockSiteRows
// ═══════════════════════════════════════════════════════════════
// Serves the html_dom of the matching mock site row, or null.
const readHtmlDomMock = mock.fn(siteId => {
  const match = mockSiteRows.find(row => row.id === siteId);
  return match?.html_dom || null;
});

const htmlStorageExports = {
  readHtmlDom: readHtmlDomMock,
  writeKeyPagesHtml: mock.fn(() => {}),
  readKeyPagesHtml: mock.fn(() => null),
  deleteHtmlDom: mock.fn(() => {}),
  deleteKeyPagesHtml: mock.fn(() => {}),
};
mock.module('../../src/utils/html-storage.js', { namedExports: htmlStorageExports });

// ═══════════════════════════════════════════════════════════════
// 19. 
score-storage — use site row fallback 547 // ═══════════════════════════════════════════════════════════════ 548 mock.module('../../src/utils/score-storage.js', { 549 namedExports: { 550 getScoreJson: mock.fn(() => null), 551 getScoreData: mock.fn(() => null), 552 setScoreJson: mock.fn(() => {}), 553 getScoreJsonWithFallback: mock.fn((siteId, dbRow) => dbRow?.score_json || null), 554 getScoreDataWithFallback: mock.fn((siteId, dbRow) => { 555 const raw = dbRow?.score_json; 556 if (!raw) return null; 557 try { return JSON.parse(raw); } catch { return null; } 558 }), 559 }, 560 }); 561 562 // ═══════════════════════════════════════════════════════════════ 563 // Import module under test AFTER all mocks 564 // ═══════════════════════════════════════════════════════════════ 565 const { runEnrichmentStage, getEnrichmentStats } = await import('../../src/stages/enrich.js'); 566 567 // ═══════════════════════════════════════════════════════════════ 568 // Helper factories 569 // ═══════════════════════════════════════════════════════════════ 570 571 /** 572 * Build a minimal site record matching the SQL query columns. 573 */ 574 function makeSite(overrides = {}) { 575 return { 576 id: 1, 577 domain: 'example.com', 578 url: 'https://example.com', 579 contacts_json: null, 580 html_dom: '<html><body>Test</body></html>', 581 score_json: null, 582 country_code: 'AU', 583 google_domain: 'google.com.au', 584 ...overrides, 585 }; 586 } 587 588 /** 589 * Build a contacts_json string without a contact form. 590 */ 591 function makeContactsJson(extra = {}) { 592 return JSON.stringify({ 593 business_name: 'Test Co', 594 email_addresses: [], 595 phone_numbers: [], 596 social_profiles: [], 597 key_pages: [], 598 ...extra, 599 }); 600 } 601 602 /** 603 * Build a contacts_json string WITH a contact form. 
604 */ 605 function makeContactsWithForm(extra = {}) { 606 return JSON.stringify({ 607 business_name: 'Test Co', 608 email_addresses: [], 609 phone_numbers: [], 610 social_profiles: [], 611 key_pages: [], 612 primary_contact_form: { form_url: 'https://example.com/contact' }, 613 ...extra, 614 }); 615 } 616 617 /** 618 * Reset all per-test mutable state. 619 * CRITICAL: also resets mock.fn implementations that individual tests may override. 620 */ 621 function resetState() { 622 // State variables 623 mockSiteRows = []; 624 runCalls = []; 625 getCalls = []; 626 dbClosed = false; 627 browserCloseCalled = false; 628 llmCallCount = 0; 629 mockHtml = '<html><body>Contact us</body></html>'; 630 mockScreenshot = Buffer.from('png_data'); 631 mockLLMResponse = null; 632 mockBlocklistResult = null; 633 634 // Reset configurable functions 635 mockCountryByCode = code => { 636 if (code === 'GB') return { code: 'GB', requiresGDPRCheck: true }; 637 if (code === 'AU') return { code: 'AU', requiresGDPRCheck: false }; 638 return null; 639 }; 640 mockParseCountryFromGoogleDomain = _domain => 'AU'; 641 642 // Reset mock.fn call counts 643 mockPageGoto.mock.resetCalls(); 644 mockPageContent.mock.resetCalls(); 645 mockPageScreenshot.mock.resetCalls(); 646 mockPageClose.mock.resetCalls(); 647 mockPageWaitForLoadState.mock.resetCalls(); 648 mockContextNewPage.mock.resetCalls(); 649 mockContextClose.mock.resetCalls(); 650 mockBrowserClose.mock.resetCalls(); 651 callLLMMock.mock.resetCalls(); 652 recordFailureMock.mock.resetCalls(); 653 resetRetriesMock.mock.resetCalls(); 654 checkBlocklistMock.mock.resetCalls(); 655 batchVerifyEmailsMock.mock.resetCalls(); 656 getCountryByCodeMock.mock.resetCalls(); 657 parseCountryFromGoogleDomainMock.mock.resetCalls(); 658 normalizePhoneNumberMock.mock.resetCalls(); 659 cleanInvalidSocialLinksMock.mock.resetCalls(); 660 launchStealthBrowserMock.mock.resetCalls(); 661 readHtmlDomMock.mock.resetCalls(); 662 setContactsJsonMock.mock.resetCalls(); 663 664 
// IMPORTANT: Reset mock implementations to defaults (prevent test leakage) 665 mockPageGoto.mock.mockImplementation(async () => {}); 666 callLLMMock.mock.mockImplementation(async () => { 667 llmCallCount++; 668 return ( 669 mockLLMResponse || { 670 content: JSON.stringify({ 671 business_name: 'Test Corp', 672 email_addresses: [{ email: 'info@testcorp.com', label: 'General' }], 673 phone_numbers: [], 674 social_profiles: [], 675 key_pages: [], 676 primary_contact_form: null, 677 }), 678 usage: { promptTokens: 100, completionTokens: 50 }, 679 } 680 ); 681 }); 682 getCountryByCodeMock.mock.mockImplementation(code => mockCountryByCode(code)); 683 parseCountryFromGoogleDomainMock.mock.mockImplementation(domain => 684 mockParseCountryFromGoogleDomain(domain) 685 ); 686 mockPageContent.mock.mockImplementation(async () => mockHtml); 687 mockPageScreenshot.mock.mockImplementation(async () => mockScreenshot); 688 } 689 690 // ═══════════════════════════════════════════════════════════════ 691 // Test Suites 692 // ═══════════════════════════════════════════════════════════════ 693 694 describe('Enrichment Stage', () => { 695 // ───────────────────────────────────────────────────────────── 696 // Suite: No sites to enrich 697 // ───────────────────────────────────────────────────────────── 698 699 describe('No sites to enrich', () => { 700 beforeEach(() => { 701 resetState(); 702 }); 703 704 test('returns zero counts when no sites are in rescored status', async () => { 705 mockSiteRows = []; 706 const result = await runEnrichmentStage(); 707 708 assert.equal(result.processed, 0, 'processed should be 0'); 709 assert.equal(result.succeeded, 0, 'succeeded should be 0'); 710 assert.equal(result.failed, 0, 'failed should be 0'); 711 assert.equal(result.skipped, 0, 'skipped should be 0'); 712 assert.ok(typeof result.duration === 'number', 'duration should be a number'); 713 }); 714 715 test('closes database even when no sites found', { skip: 'SQLite-era test: enrich.js now uses 
db.js PG pool (singleton, not closed per-run)' }, async () => { 716 mockSiteRows = []; 717 await runEnrichmentStage(); 718 assert.ok(dbClosed, 'database should be closed'); 719 }); 720 721 test('does not launch browser when no sites found', async () => { 722 mockSiteRows = []; 723 await runEnrichmentStage(); 724 assert.equal(launchStealthBrowserMock.mock.calls.length, 0, 'browser should not be launched'); 725 }); 726 }); 727 728 // ───────────────────────────────────────────────────────────── 729 // Suite: Sites already have contact forms — skip browser enrichment 730 // ───────────────────────────────────────────────────────────── 731 732 describe('Sites with existing contact forms', () => { 733 beforeEach(() => { 734 resetState(); 735 }); 736 737 test('marks non-GDPR site with form directly as enriched without browser', async () => { 738 mockSiteRows = [ 739 makeSite({ 740 id: 10, 741 domain: 'au-site.com', 742 country_code: 'AU', 743 contacts_json: makeContactsWithForm(), 744 }), 745 ]; 746 747 const result = await runEnrichmentStage(); 748 749 assert.equal(result.succeeded, 1, 'should succeed for 1 site'); 750 assert.equal(result.failed, 0, 'should have 0 failures'); 751 752 // Should have set status=enriched 753 const enrichUpdate = runCalls.find(c => c.sql.includes("status = 'enriched_regex'")); 754 assert.ok(enrichUpdate, 'should UPDATE status to enriched'); 755 756 // Page.goto should not be called (no contact page browsing for form sites) 757 assert.equal( 758 mockPageGoto.mock.calls.length, 759 0, 760 'page.goto should not be called for form sites with no contact pages' 761 ); 762 }); 763 764 test('returns all sites as succeeded when all have forms', async () => { 765 mockSiteRows = [ 766 makeSite({ id: 11, contacts_json: makeContactsWithForm() }), 767 makeSite({ id: 12, domain: 'site2.com', contacts_json: makeContactsWithForm() }), 768 ]; 769 770 const result = await runEnrichmentStage(); 771 772 assert.equal(result.processed, 2, 'processed should be 
2'); 773 assert.equal(result.succeeded, 2, 'succeeded should be 2'); 774 assert.equal(result.skipped, 0, 'skipped should be 0'); 775 }); 776 777 test('resets retries on form site marked enriched', async () => { 778 mockSiteRows = [makeSite({ id: 20, contacts_json: makeContactsWithForm() })]; 779 780 await runEnrichmentStage(); 781 782 assert.ok( 783 resetRetriesMock.mock.calls.length >= 1, 784 'resetRetries should be called for form sites' 785 ); 786 }); 787 788 test('runs GDPR verification for GDPR country with form + emails', async () => { 789 mockSiteRows = [ 790 makeSite({ 791 id: 30, 792 domain: 'uk-site.co.uk', 793 country_code: 'GB', 794 contacts_json: JSON.stringify({ 795 business_name: 'UK Ltd', 796 email_addresses: [{ email: 'info@uk-site.co.uk', label: 'Office' }], 797 phone_numbers: [], 798 social_profiles: [], 799 key_pages: [], 800 primary_contact_form: { form_url: 'https://uk-site.co.uk/contact' }, 801 country_code: 'GB', 802 }), 803 html_dom: '<html><body>UK Ltd info@uk-site.co.uk</body></html>', 804 }), 805 ]; 806 807 await runEnrichmentStage(); 808 809 // Form-site path marks enriched_regex (GDPR verification happens in browser enrichment path only) 810 const enrichUpdate = runCalls.find(c => c.sql.includes("status = 'enriched_regex'")); 811 assert.ok(enrichUpdate, 'should UPDATE status to enriched_regex'); 812 }); 813 814 test('skips GDPR check and marks enriched for non-GDPR country with form', async () => { 815 mockSiteRows = [ 816 makeSite({ 817 id: 31, 818 domain: 'au-site.com.au', 819 country_code: 'AU', 820 contacts_json: makeContactsWithForm(), 821 }), 822 ]; 823 824 await runEnrichmentStage(); 825 826 const enrichUpdate = runCalls.find(c => c.sql.includes("status = 'enriched_regex'")); 827 assert.ok(enrichUpdate, 'AU site with form should be marked enriched without GDPR check'); 828 }); 829 }); 830 831 // ───────────────────────────────────────────────────────────── 832 // Suite: Blocklisted sites 833 // 
───────────────────────────────────────────────────────────── 834 835 describe('Blocklisted sites', () => { 836 beforeEach(() => { 837 resetState(); 838 }); 839 840 test('marks blocklisted site as ignored and skips enrichment', async () => { 841 mockBlocklistResult = { reason: 'Directory domain' }; 842 mockSiteRows = [makeSite({ id: 50, domain: 'yellowpages.com' })]; 843 844 await runEnrichmentStage(); 845 846 const ignoreUpdate = runCalls.find(c => c.sql.includes("status = 'ignored'")); 847 assert.ok(ignoreUpdate, "should UPDATE status to 'ignore' for blocklisted site"); 848 }); 849 850 test('does not run browser enrichment for blocklisted sites', async () => { 851 mockBlocklistResult = { reason: 'Social media domain' }; 852 mockSiteRows = [makeSite({ id: 51, domain: 'facebook.com' })]; 853 854 await runEnrichmentStage(); 855 856 // All sites blocked — filtered before sitesNeedingEnrichment 857 assert.equal(mockPageGoto.mock.calls.length, 0, 'should not browse any pages'); 858 }); 859 }); 860 861 // ───────────────────────────────────────────────────────────── 862 // Suite: Happy path — no existing contacts, no contact pages 863 // ───────────────────────────────────────────────────────────── 864 865 describe('Happy path — no contact pages found', () => { 866 // HTML with an email so regex extraction finds a contact (hasAnyContact=true → enriched) 867 // Must not use example.com/acme.com — these may be in EMAIL_NOISE_DOMAINS 868 const htmlWithEmail = 869 '<html><body><p>Contact us at <a href="mailto:info@plumbersydney.com.au">info@plumbersydney.com.au</a></p></body></html>'; 870 871 beforeEach(() => { 872 resetState(); 873 mockHtml = htmlWithEmail; 874 }); 875 876 test('enriches a site with no prior contacts, no contact pages', async () => { 877 mockSiteRows = [makeSite({ id: 100, contacts_json: null, html_dom: htmlWithEmail })]; 878 879 const result = await runEnrichmentStage(); 880 881 assert.equal(result.processed, 1, 'processed should be 1'); 882 
assert.equal(result.succeeded, 1, 'succeeded should be 1'); 883 assert.equal(result.failed, 0, 'failed should be 0'); 884 }); 885 886 test('updates site to enriched status', async () => { 887 mockSiteRows = [makeSite({ id: 101, contacts_json: null, html_dom: htmlWithEmail })]; 888 889 await runEnrichmentStage(); 890 891 const enrichUpdate = runCalls.find( 892 c => c.sql.includes("status = 'enriched_regex'") && c.sql.includes('enriched_at') 893 ); 894 assert.ok(enrichUpdate, 'should UPDATE site to enriched status'); 895 }); 896 897 test('resets retries after successful enrichment', async () => { 898 mockSiteRows = [makeSite({ id: 102, contacts_json: null, html_dom: htmlWithEmail })]; 899 await runEnrichmentStage(); 900 assert.ok(resetRetriesMock.mock.calls.length >= 1, 'resetRetries should be called'); 901 }); 902 903 test('closes database after successful run', { skip: 'SQLite-era test: enrich.js now uses db.js PG pool (singleton, not closed per-run)' }, async () => { 904 mockSiteRows = [makeSite({ id: 103, contacts_json: null, html_dom: htmlWithEmail })]; 905 await runEnrichmentStage(); 906 assert.ok(dbClosed, 'database should be closed'); 907 }); 908 909 test('closes browser after successful run', async () => { 910 mockSiteRows = [makeSite({ id: 104, contacts_json: null, html_dom: htmlWithEmail })]; 911 await runEnrichmentStage(); 912 assert.ok(mockBrowserClose.mock.calls.length >= 1, 'browser should be closed'); 913 }); 914 915 test('returns correct stats shape', async () => { 916 mockSiteRows = [makeSite({ id: 105, contacts_json: null, html_dom: htmlWithEmail })]; 917 const result = await runEnrichmentStage(); 918 919 assert.ok('processed' in result, 'result should have processed'); 920 assert.ok('succeeded' in result, 'result should have succeeded'); 921 assert.ok('failed' in result, 'result should have failed'); 922 assert.ok('skipped' in result, 'result should have skipped'); 923 assert.ok('duration' in result, 'result should have duration'); 924 }); 925 926 
test('handles null html_dom gracefully', async () => { 927 mockSiteRows = [makeSite({ id: 106, contacts_json: null, html_dom: null })]; 928 929 const result = await runEnrichmentStage(); 930 assert.equal(result.processed, 1, 'should still process site with null html_dom'); 931 }); 932 }); 933 934 // ───────────────────────────────────────────────────────────── 935 // Suite: Happy path — contact pages found, browser enrichment 936 // ───────────────────────────────────────────────────────────── 937 938 describe('Happy path — contact pages found, browser enrichment', () => { 939 beforeEach(() => { 940 resetState(); 941 }); 942 943 test('launches browser and browses contact pages', async () => { 944 mockSiteRows = [ 945 makeSite({ 946 id: 200, 947 contacts_json: makeContactsJson({ 948 key_pages: ['https://example.com/contact'], 949 }), 950 }), 951 ]; 952 953 mockLLMResponse = { 954 content: JSON.stringify({ 955 business_name: 'Example Corp', 956 email_addresses: [{ email: 'hello@example.com', label: 'Office' }], 957 phone_numbers: [], 958 social_profiles: [], 959 key_pages: [], 960 }), 961 usage: { promptTokens: 300, completionTokens: 100 }, 962 }; 963 964 const result = await runEnrichmentStage(); 965 966 assert.ok(mockPageGoto.mock.calls.length >= 1, 'should navigate to contact page'); 967 assert.equal(result.succeeded, 1, 'should succeed'); 968 }); 969 970 test('finds a contact form and sets formFound=true in stats', async () => { 971 // Page HTML with <form> so regex extractor detects has_contact_form=true 972 mockHtml = 973 '<html><body><form action="/contact" method="post"><input name="email"/><button>Submit</button></form></body></html>'; 974 975 mockSiteRows = [ 976 makeSite({ 977 id: 201, 978 contacts_json: makeContactsJson({ 979 key_pages: ['https://example.com/contact-us'], 980 }), 981 }), 982 ]; 983 984 const result = await runEnrichmentStage(); 985 986 assert.equal(result.formsFound, 1, 'should count form as found'); 987 }); 988 989 test('counts emails 
found from contact page regex result', async () => { 990 // Page HTML contains 2 emails that regex extractor will find 991 // (must not use example.com — it is in EMAIL_NOISE_DOMAINS) 992 mockHtml = 993 '<html><body>' + 994 '<a href="mailto:sales@testcorp.com.au">Sales</a> ' + 995 '<a href="mailto:support@testcorp.com.au">Support</a>' + 996 '</body></html>'; 997 998 mockSiteRows = [ 999 makeSite({ 1000 id: 202, 1001 contacts_json: makeContactsJson({ 1002 key_pages: ['https://example.com/about'], 1003 }), 1004 }), 1005 ]; 1006 1007 const result = await runEnrichmentStage(); 1008 1009 assert.ok( 1010 result.emailsFound >= 2, 1011 `should count ≥2 emails from contact page, got ${result.emailsFound}` 1012 ); 1013 }); 1014 1015 test('stops browsing after finding a form (break on foundForm)', async () => { 1016 // Page HTML must contain <form> so extractContactsFromHtml detects has_contact_form=true 1017 mockHtml = 1018 '<html><body><form action="/contact" method="post"><input name="email"/></form></body></html>'; 1019 1020 mockSiteRows = [ 1021 makeSite({ 1022 id: 203, 1023 contacts_json: makeContactsJson({ 1024 key_pages: [ 1025 'https://example.com/contact', 1026 'https://example.com/about', 1027 'https://example.com/support', 1028 ], 1029 }), 1030 }), 1031 ]; 1032 1033 await runEnrichmentStage(); 1034 1035 // Should only visit 1 page (stopped after finding form on first page) 1036 assert.equal( 1037 mockPageGoto.mock.calls.length, 1038 1, 1039 'should stop browsing after finding form on first page' 1040 ); 1041 }); 1042 1043 test('closes context and page after each contact page', async () => { 1044 mockSiteRows = [ 1045 makeSite({ 1046 id: 204, 1047 contacts_json: makeContactsJson({ 1048 key_pages: ['https://example.com/contact'], 1049 }), 1050 }), 1051 ]; 1052 1053 mockLLMResponse = { 1054 content: JSON.stringify({ 1055 business_name: 'Close Test Corp', 1056 email_addresses: [], 1057 phone_numbers: [], 1058 social_profiles: [], 1059 key_pages: [], 1060 }), 1061 
usage: { promptTokens: 100, completionTokens: 40 }, 1062 }; 1063 1064 await runEnrichmentStage(); 1065 1066 assert.ok(mockPageClose.mock.calls.length >= 1, 'page should be closed after browsing'); 1067 assert.ok(mockContextClose.mock.calls.length >= 1, 'context should be closed after browsing'); 1068 }); 1069 1070 test('reads page HTML during browser enrichment (regex extraction path)', async () => { 1071 mockSiteRows = [ 1072 makeSite({ 1073 id: 205, 1074 contacts_json: makeContactsJson({ 1075 key_pages: ['https://example.com/contact'], 1076 }), 1077 }), 1078 ]; 1079 1080 await runEnrichmentStage(); 1081 1082 assert.ok(mockPageContent.mock.calls.length >= 1, 'should call page.content()'); 1083 assert.ok(mockPageGoto.mock.calls.length >= 1, 'should navigate to contact page'); 1084 }); 1085 1086 test('marks site enriched after browsing contact pages', async () => { 1087 mockSiteRows = [ 1088 makeSite({ 1089 id: 206, 1090 contacts_json: makeContactsJson({ 1091 key_pages: ['https://example.com/contact'], 1092 }), 1093 }), 1094 ]; 1095 1096 mockLLMResponse = { 1097 content: JSON.stringify({ 1098 business_name: 'Enriched Corp', 1099 email_addresses: [], 1100 phone_numbers: [], 1101 social_profiles: [], 1102 key_pages: [], 1103 }), 1104 usage: { promptTokens: 100, completionTokens: 40 }, 1105 }; 1106 1107 await runEnrichmentStage(); 1108 1109 const enrichUpdate = runCalls.find( 1110 c => c.sql.includes("status = 'enriched_regex'") 1111 ); 1112 assert.ok(enrichUpdate, 'should UPDATE site to enriched_regex status'); 1113 }); 1114 }); 1115 1116 // ───────────────────────────────────────────────────────────── 1117 // Suite: Contacts from score_json 1118 // ───────────────────────────────────────────────────────────── 1119 1120 describe('Contacts from score_json (fallback path)', () => { 1121 beforeEach(() => { 1122 resetState(); 1123 }); 1124 1125 test('uses contact_details from score_json when contacts_json is null', async () => { 1126 mockSiteRows = [ 1127 makeSite({ 1128 
id: 300, 1129 contacts_json: null, 1130 score_json: JSON.stringify({ 1131 contact_details: { 1132 business_name: 'Score Corp', 1133 email_addresses: [{ email: 'score@score.com', label: 'Main' }], 1134 phone_numbers: [], 1135 social_profiles: [], 1136 key_pages: [], 1137 }, 1138 }), 1139 }), 1140 ]; 1141 1142 const result = await runEnrichmentStage(); 1143 1144 // No LLM call needed (score_json has contacts, no key_pages) 1145 assert.equal(result.processed, 1, 'should process the site'); 1146 assert.equal(result.succeeded, 1, 'should succeed'); 1147 }); 1148 1149 test('falls back to regex extraction if score_json has no contact_details', async () => { 1150 mockSiteRows = [ 1151 makeSite({ 1152 id: 301, 1153 contacts_json: null, 1154 score_json: JSON.stringify({ overall_score: 65 }), // no contact_details 1155 html_dom: '<html><body><p>Contact: fallback@fallback.com</p></body></html>', 1156 }), 1157 ]; 1158 1159 const result = await runEnrichmentStage(); 1160 1161 // Code uses regex extraction (ENABLE_ENRICHMENT_LLM !== 'false' → regex-only path) 1162 assert.equal(result.processed, 1, 'should process the site'); 1163 assert.equal(result.succeeded, 1, 'should succeed'); 1164 }); 1165 1166 test('handles invalid score_json gracefully', async () => { 1167 mockSiteRows = [ 1168 makeSite({ 1169 id: 302, 1170 contacts_json: null, 1171 score_json: 'NOT VALID JSON{{{', 1172 }), 1173 ]; 1174 1175 mockLLMResponse = { 1176 content: JSON.stringify({ 1177 business_name: 'Invalid JSON Corp', 1178 email_addresses: [], 1179 phone_numbers: [], 1180 social_profiles: [], 1181 key_pages: [], 1182 }), 1183 usage: { promptTokens: 100, completionTokens: 30 }, 1184 }; 1185 1186 // Should not throw — falls back to extractInitialContacts 1187 const result = await runEnrichmentStage(); 1188 assert.equal(result.processed, 1, 'should process even with invalid score_json'); 1189 }); 1190 1191 test('cleans invalid social links from score_json contacts', async () => { 1192 mockSiteRows = [ 1193 
makeSite({ 1194 id: 303, 1195 contacts_json: null, 1196 score_json: JSON.stringify({ 1197 contact_details: { 1198 business_name: 'Social Corp', 1199 email_addresses: [], 1200 phone_numbers: [], 1201 social_profiles: [{ url: 'https://twitter.com/', label: 'Twitter' }], 1202 key_pages: [], 1203 }, 1204 }), 1205 }), 1206 ]; 1207 1208 await runEnrichmentStage(); 1209 1210 // cleanInvalidSocialLinks should have been called 1211 assert.ok( 1212 cleanInvalidSocialLinksMock.mock.calls.length >= 1, 1213 'cleanInvalidSocialLinks should be called' 1214 ); 1215 }); 1216 }); 1217 1218 // ───────────────────────────────────────────────────────────── 1219 // Suite: Error handling 1220 // ───────────────────────────────────────────────────────────── 1221 1222 describe('Error handling', () => { 1223 beforeEach(() => { 1224 resetState(); 1225 }); 1226 1227 test('records failure when browser page navigation throws', async () => { 1228 mockSiteRows = [ 1229 makeSite({ 1230 id: 400, 1231 contacts_json: makeContactsJson({ 1232 key_pages: ['https://example.com/contact'], 1233 }), 1234 }), 1235 ]; 1236 1237 // Override goto to throw 1238 mockPageGoto.mock.mockImplementation(async () => { 1239 throw new Error('Navigation timeout'); 1240 }); 1241 1242 const result = await runEnrichmentStage(); 1243 1244 assert.equal(result.failed, 1, 'should count failure'); 1245 assert.ok(recordFailureMock.mock.calls.length >= 1, 'should call recordFailure'); 1246 }); 1247 1248 test('records failure when page navigation throws for contact page enrichment', async () => { 1249 mockSiteRows = [ 1250 makeSite({ 1251 id: 401, 1252 contacts_json: makeContactsJson({ 1253 key_pages: ['https://example.com/contact'], 1254 }), 1255 }), 1256 ]; 1257 1258 mockPageGoto.mock.mockImplementation(async () => { 1259 throw new Error('Navigation error'); 1260 }); 1261 1262 const result = await runEnrichmentStage(); 1263 1264 assert.equal(result.failed, 1, 'should count failure'); 1265 
assert.ok(recordFailureMock.mock.calls.length >= 1, 'recordFailure should be called'); 1266 }); 1267 1268 test('processes second site when first fails during navigation', async () => { 1269 // Site1: has contact pages → page.goto throws → fails 1270 // Site2: has an email in contacts_json (no contact pages) → succeeds 1271 let navCallCount = 0; 1272 mockPageGoto.mock.mockImplementation(async () => { 1273 navCallCount++; 1274 if (navCallCount === 1) throw new Error('Navigation failed for site1'); 1275 }); 1276 1277 mockSiteRows = [ 1278 makeSite({ 1279 id: 410, 1280 domain: 'site1.com', 1281 url: 'https://site1.com', 1282 contacts_json: makeContactsJson({ key_pages: ['https://site1.com/contact'] }), 1283 }), 1284 makeSite({ 1285 id: 411, 1286 domain: 'site2.com', 1287 url: 'https://site2.com', 1288 contacts_json: JSON.stringify({ 1289 business_name: 'Site 2', 1290 email_addresses: [{ email: 'info@site2.com', label: 'Main' }], 1291 phone_numbers: [], 1292 social_profiles: [], 1293 key_pages: [], // no contact pages → no browsing → direct enriched 1294 }), 1295 }), 1296 ]; 1297 1298 const result = await runEnrichmentStage(); 1299 1300 assert.equal(result.processed, 2, 'should process all sites'); 1301 assert.equal(result.failed, 1, 'should count 1 failure (site1)'); 1302 assert.equal(result.succeeded, 1, 'should count 1 success (site2)'); 1303 }); 1304 1305 test('closes browser even when enrichment fails', async () => { 1306 mockSiteRows = [ 1307 makeSite({ 1308 id: 420, 1309 contacts_json: makeContactsJson({ 1310 key_pages: ['https://example.com/contact'], 1311 }), 1312 }), 1313 ]; 1314 1315 mockPageGoto.mock.mockImplementation(async () => { 1316 throw new Error('Navigation timeout'); 1317 }); 1318 1319 await runEnrichmentStage(); 1320 1321 assert.ok( 1322 mockBrowserClose.mock.calls.length >= 1, 1323 'browser should be closed even on failure' 1324 ); 1325 }); 1326 1327 test('closes database even when enrichment fails', { skip: 'SQLite-era test: enrich.js now uses 
db.js PG pool (singleton, not closed per-run)' }, async () => { 1328 mockSiteRows = [ 1329 makeSite({ 1330 id: 421, 1331 contacts_json: makeContactsJson({ 1332 key_pages: ['https://example.com/contact'], 1333 }), 1334 }), 1335 ]; 1336 1337 mockPageGoto.mock.mockImplementation(async () => { 1338 throw new Error('Navigation timeout'); 1339 }); 1340 1341 await runEnrichmentStage(); 1342 1343 assert.ok(dbClosed, 'database should be closed even on failure'); 1344 }); 1345 1346 test('recordFailure called with semantic_scored or vision_scored as currentStatus', async () => { 1347 mockSiteRows = [ 1348 makeSite({ 1349 id: 430, 1350 contacts_json: makeContactsJson({ key_pages: ['https://example.com/contact'] }), 1351 }), 1352 ]; 1353 1354 mockPageGoto.mock.mockImplementation(async () => { 1355 throw new Error('Fatal navigation error'); 1356 }); 1357 1358 await runEnrichmentStage(); 1359 1360 assert.ok(recordFailureMock.mock.calls.length >= 1, 'recordFailure should be called'); 1361 // The third argument to recordFailure should be 'semantic_scored' or 'vision_scored' 1362 const call = recordFailureMock.mock.calls[0]; 1363 assert.ok( 1364 call.arguments.includes('semantic_scored') || call.arguments.includes('vision_scored'), 1365 'recordFailure should pass semantic_scored or vision_scored as currentStatus' 1366 ); 1367 }); 1368 }); 1369 1370 // ───────────────────────────────────────────────────────────── 1371 // Suite: Country mismatch detection 1372 // ───────────────────────────────────────────────────────────── 1373 1374 describe('Country mismatch detection', () => { 1375 beforeEach(() => { 1376 resetState(); 1377 }); 1378 1379 test('marks site as ignored when detected country differs from google_domain country', async () => { 1380 // parseCountryFromGoogleDomain returns AU (from google.com.au) 1381 // contacts_json has country_code=US → after browsing, country mismatch check triggers → ignore 1382 mockParseCountryFromGoogleDomain = _domain => 'AU'; 1383 1384 mockSiteRows 
= [ 1385 makeSite({ 1386 id: 500, 1387 domain: 'us-site.com', 1388 country_code: 'AU', 1389 google_domain: 'google.com.au', 1390 contacts_json: JSON.stringify({ 1391 business_name: 'US Site', 1392 email_addresses: [{ email: 'us@us-site.com', label: 'Main' }], 1393 phone_numbers: [], 1394 social_profiles: [], 1395 // Has a contact page so browsing path executes and reaches country mismatch check 1396 key_pages: ['https://us-site.com/contact'], 1397 country_code: 'US', // triggers mismatch check 1398 city: 'New York', 1399 }), 1400 }), 1401 ]; 1402 1403 await runEnrichmentStage(); 1404 1405 const ignoreUpdate = runCalls.find(c => c.sql.includes("status = 'ignored'")); 1406 assert.ok(ignoreUpdate, 'should mark country-mismatched site as ignored'); 1407 }); 1408 1409 test('does not ignore site when country matches google_domain', async () => { 1410 // Both AU → no mismatch 1411 mockParseCountryFromGoogleDomain = _domain => 'AU'; 1412 1413 mockSiteRows = [ 1414 makeSite({ 1415 id: 501, 1416 domain: 'au-site.com.au', 1417 country_code: 'AU', 1418 google_domain: 'google.com.au', 1419 contacts_json: makeContactsJson({ 1420 key_pages: ['https://au-site.com.au/contact'], 1421 }), 1422 }), 1423 ]; 1424 1425 mockLLMResponse = { 1426 content: JSON.stringify({ 1427 business_name: 'AU Site', 1428 email_addresses: [], 1429 phone_numbers: [], 1430 social_profiles: [], 1431 key_pages: [], 1432 country_code: 'AU', 1433 }), 1434 usage: { promptTokens: 200, completionTokens: 60 }, 1435 }; 1436 1437 await runEnrichmentStage(); 1438 1439 const ignoreUpdate = runCalls.find(c => c.sql.includes("status = 'ignored'")); 1440 assert.equal(ignoreUpdate, undefined, 'should NOT mark site as ignored when country matches'); 1441 const enrichUpdate = runCalls.find(c => c.sql.includes("status = 'enriched_regex'")); 1442 assert.ok(enrichUpdate, 'site should be marked enriched when country matches'); 1443 }); 1444 }); 1445 1446 // ───────────────────────────────────────────────────────────── 1447 // 
Suite: GDPR verification during browser enrichment 1448 // ───────────────────────────────────────────────────────────── 1449 1450 describe('GDPR verification during browser enrichment', () => { 1451 beforeEach(() => { 1452 resetState(); 1453 }); 1454 1455 test('runs GDPR check for GDPR country sites with emails from enrichment', async () => { 1456 // Configure: GB requires GDPR, google.co.uk → GB (no mismatch) 1457 mockParseCountryFromGoogleDomain = _domain => 'GB'; 1458 mockCountryByCode = code => { 1459 if (code === 'GB') return { code: 'GB', requiresGDPRCheck: true }; 1460 return null; 1461 }; 1462 1463 mockSiteRows = [ 1464 makeSite({ 1465 id: 600, 1466 domain: 'uk-site.co.uk', 1467 country_code: 'GB', 1468 google_domain: 'google.co.uk', 1469 contacts_json: makeContactsJson({ 1470 key_pages: ['https://uk-site.co.uk/contact'], 1471 country_code: 'GB', 1472 }), 1473 }), 1474 ]; 1475 1476 mockLLMResponse = { 1477 content: JSON.stringify({ 1478 business_name: 'UK Corp', 1479 email_addresses: [{ email: 'info@uk-site.co.uk', label: 'Main' }], 1480 phone_numbers: [], 1481 social_profiles: [], 1482 key_pages: [], 1483 country_code: 'GB', 1484 }), 1485 usage: { promptTokens: 200, completionTokens: 60 }, 1486 }; 1487 1488 await runEnrichmentStage(); 1489 1490 // Should have company_proof in the final UPDATE 1491 const gdprUpdate = runCalls.find( 1492 c => c.sql.includes('company_proof') && c.sql.includes("status = 'enriched_regex'") 1493 ); 1494 assert.ok(gdprUpdate, 'should set company_proof for GDPR site'); 1495 }); 1496 1497 test('skips GDPR check for non-GDPR countries during browser enrichment', async () => { 1498 mockParseCountryFromGoogleDomain = _domain => 'AU'; 1499 mockCountryByCode = code => { 1500 if (code === 'AU') return { code: 'AU', requiresGDPRCheck: false }; 1501 return null; 1502 }; 1503 1504 mockSiteRows = [ 1505 makeSite({ 1506 id: 601, 1507 domain: 'au-site.com.au', 1508 country_code: 'AU', 1509 google_domain: 'google.com.au', 1510 contacts_json: 
makeContactsJson({ 1511 key_pages: ['https://au-site.com.au/contact'], 1512 country_code: 'AU', 1513 }), 1514 }), 1515 ]; 1516 1517 mockLLMResponse = { 1518 content: JSON.stringify({ 1519 business_name: 'AU Corp', 1520 email_addresses: [{ email: 'info@au-site.com.au', label: 'Main' }], 1521 phone_numbers: [], 1522 social_profiles: [], 1523 key_pages: [], 1524 country_code: 'AU', 1525 }), 1526 usage: { promptTokens: 200, completionTokens: 60 }, 1527 }; 1528 1529 await runEnrichmentStage(); 1530 1531 // Should still enrich without GDPR verification 1532 const enrichUpdate = runCalls.find(c => c.sql.includes("status = 'enriched_regex'")); 1533 assert.ok(enrichUpdate, 'AU site should still be marked enriched'); 1534 1535 // batchVerifyEmails should NOT be called (non-GDPR country) 1536 assert.equal( 1537 batchVerifyEmailsMock.mock.calls.length, 1538 0, 1539 'batchVerifyEmails should not be called for non-GDPR country' 1540 ); 1541 }); 1542 }); 1543 1544 // ───────────────────────────────────────────────────────────── 1545 // Suite: Phone number normalization 1546 // ───────────────────────────────────────────────────────────── 1547 1548 describe('Phone number normalization', () => { 1549 beforeEach(() => { 1550 resetState(); 1551 }); 1552 1553 test('normalizes phone numbers from score_json contact_details', async () => { 1554 let normalizeCalled = false; 1555 normalizePhoneNumberMock.mock.mockImplementation(p => { 1556 normalizeCalled = true; 1557 return p; 1558 }); 1559 1560 mockSiteRows = [ 1561 makeSite({ 1562 id: 700, 1563 contacts_json: null, 1564 score_json: JSON.stringify({ 1565 contact_details: { 1566 business_name: 'Phone Corp', 1567 email_addresses: [], 1568 phone_numbers: [{ number: '+61412345678', label: 'Office' }], 1569 social_profiles: [], 1570 key_pages: [], 1571 }, 1572 }), 1573 }), 1574 ]; 1575 1576 await runEnrichmentStage(); 1577 1578 assert.ok( 1579 normalizeCalled, 1580 'normalizePhoneNumber should be called for phone numbers in score_json' 1581 
); 1582 }); 1583 1584 test('handles string phone format in score_json', async () => { 1585 mockSiteRows = [ 1586 makeSite({ 1587 id: 701, 1588 contacts_json: null, 1589 score_json: JSON.stringify({ 1590 contact_details: { 1591 business_name: 'String Phone Corp', 1592 email_addresses: [], 1593 phone_numbers: ['0412345678'], // legacy string format 1594 social_profiles: [], 1595 key_pages: [], 1596 }, 1597 }), 1598 }), 1599 ]; 1600 1601 // Should not throw 1602 const result = await runEnrichmentStage(); 1603 assert.equal(result.processed, 1, 'should handle string phone format'); 1604 }); 1605 1606 test('filters null phones from cleanPhoneNumbers', async () => { 1607 // Phone with missing .number property → should be filtered out, not crash 1608 mockSiteRows = [ 1609 makeSite({ 1610 id: 702, 1611 contacts_json: null, 1612 score_json: JSON.stringify({ 1613 contact_details: { 1614 business_name: 'Null Phone Corp', 1615 email_addresses: [], 1616 phone_numbers: [{ label: 'No number field' }], // invalid — no .number 1617 social_profiles: [], 1618 key_pages: [], 1619 }, 1620 }), 1621 }), 1622 ]; 1623 1624 const result = await runEnrichmentStage(); 1625 assert.equal(result.processed, 1, 'should handle invalid phone objects gracefully'); 1626 }); 1627 }); 1628 1629 // ───────────────────────────────────────────────────────────── 1630 // Suite: Batch processing with limit option 1631 // ───────────────────────────────────────────────────────────── 1632 1633 describe('Batch processing options', () => { 1634 beforeEach(() => { 1635 resetState(); 1636 }); 1637 1638 test('respects limit option passed to the stage', async () => { 1639 // Limit is applied at DB query level (SQL LIMIT clause in the source) 1640 // We verify it doesn't break when limit is passed 1641 mockSiteRows = [makeSite({ id: 800, contacts_json: null })]; 1642 1643 mockLLMResponse = { 1644 content: JSON.stringify({ 1645 business_name: 'Limit Corp', 1646 email_addresses: [], 1647 phone_numbers: [], 1648 
social_profiles: [], 1649 key_pages: [], 1650 }), 1651 usage: { promptTokens: 100, completionTokens: 30 }, 1652 }; 1653 1654 const result = await runEnrichmentStage({ limit: 10 }); 1655 assert.ok(result, 'should return a result with limit option'); 1656 assert.equal(result.processed, 1, 'should process the site'); 1657 }); 1658 1659 test('processes multiple sites sequentially in mock batch', async () => { 1660 // All sites have empty key_pages (no contact pages) → no browser needed 1661 // fetch is mocked globally to return 404 → sitemap fallback returns [] 1662 mockSiteRows = [ 1663 makeSite({ id: 810, contacts_json: makeContactsJson() }), 1664 makeSite({ 1665 id: 811, 1666 domain: 'site2.com', 1667 url: 'https://site2.com', 1668 contacts_json: makeContactsJson(), 1669 }), 1670 makeSite({ 1671 id: 812, 1672 domain: 'site3.com', 1673 url: 'https://site3.com', 1674 contacts_json: makeContactsJson(), 1675 }), 1676 ]; 1677 1678 const result = await runEnrichmentStage(); 1679 1680 assert.equal(result.processed, 3, 'should process all 3 sites'); 1681 assert.equal(result.succeeded, 3, 'all 3 should succeed'); 1682 assert.equal(result.failed, 0, 'no failures'); 1683 }); 1684 1685 test('concurrency option is accepted without error', async () => { 1686 mockSiteRows = []; 1687 const result = await runEnrichmentStage({ concurrency: 2 }); 1688 assert.ok(result, 'should accept concurrency option'); 1689 }); 1690 }); 1691 1692 // ───────────────────────────────────────────────────────────── 1693 // Suite: getEnrichmentStats 1694 // ───────────────────────────────────────────────────────────── 1695 1696 describe('getEnrichmentStats', () => { 1697 beforeEach(() => { 1698 resetState(); 1699 }); 1700 1701 test('returns stats object with expected fields', async () => { 1702 const stats = await getEnrichmentStats(); 1703 1704 assert.ok(stats !== null, 'stats should not be null'); 1705 assert.ok('total_enriched' in stats, 'stats should have total_enriched'); 1706 assert.ok('with_forms' 
in stats, 'stats should have with_forms'); 1707 assert.ok('with_emails' in stats, 'stats should have with_emails'); 1708 assert.ok('with_phones' in stats, 'stats should have with_phones'); 1709 }); 1710 1711 test('closes database after returning stats', { skip: 'SQLite-era test: enrich.js now uses db.js PG pool (singleton, not closed per-run)' }, () => { 1712 getEnrichmentStats(); 1713 assert.ok(dbClosed, 'database should be closed after getEnrichmentStats'); 1714 }); 1715 1716 test('returns numeric values in stats', async () => { 1717 const stats = await getEnrichmentStats(); 1718 assert.ok(typeof stats.total_enriched === 'number', 'total_enriched should be a number'); 1719 assert.ok(typeof stats.with_forms === 'number', 'with_forms should be a number'); 1720 }); 1721 }); 1722 1723 // ───────────────────────────────────────────────────────────── 1724 // Suite: ENABLE_VISION flag handling 1725 // ───────────────────────────────────────────────────────────── 1726 1727 describe('ENABLE_VISION flag', () => { 1728 beforeEach(() => { 1729 resetState(); 1730 }); 1731 1732 test('stage runs and returns empty results with ENABLE_VISION=false and no sites', async () => { 1733 process.env.ENABLE_VISION = 'false'; 1734 mockSiteRows = []; 1735 1736 const result = await runEnrichmentStage(); 1737 1738 assert.equal(result.processed, 0, 'should return 0 processed with no sites'); 1739 1740 process.env.ENABLE_VISION = 'true'; 1741 }); 1742 1743 test('stage runs normally with ENABLE_VISION=true', async () => { 1744 process.env.ENABLE_VISION = 'true'; 1745 mockSiteRows = []; 1746 1747 const result = await runEnrichmentStage(); 1748 assert.equal(result.processed, 0, 'should return 0 processed with no sites'); 1749 }); 1750 }); 1751 1752 // ───────────────────────────────────────────────────────────── 1753 // Suite: Mixed sites — some with forms, some without 1754 // ───────────────────────────────────────────────────────────── 1755 1756 describe('Mixed site set — forms and non-forms', 
() => { 1757 beforeEach(() => { 1758 resetState(); 1759 }); 1760 1761 test('handles mix of form-sites and non-form-sites correctly', async () => { 1762 mockSiteRows = [ 1763 // Site with form → handled in sitesWithForms loop → enriched without browser 1764 makeSite({ id: 900, contacts_json: makeContactsWithForm() }), 1765 // Site without form, no contact pages → browser launched, sitemap fails, marked enriched 1766 makeSite({ 1767 id: 901, 1768 domain: 'no-form.com', 1769 url: 'https://no-form.com', 1770 contacts_json: makeContactsJson(), 1771 }), 1772 ]; 1773 1774 const result = await runEnrichmentStage(); 1775 1776 // All sites go through processBatch (no fast-path split for form sites) 1777 assert.equal(result.processed, 2, 'should process both sites'); 1778 assert.ok(result.succeeded >= 1, 'should have at least 1 success'); 1779 }); 1780 1781 test('skipped count reflects sites that have forms', async () => { 1782 mockSiteRows = [ 1783 makeSite({ id: 910, contacts_json: makeContactsWithForm() }), 1784 makeSite({ id: 911, domain: 's2.com', contacts_json: makeContactsWithForm() }), 1785 makeSite({ 1786 id: 912, 1787 domain: 's3.com', 1788 url: 'https://s3.com', 1789 contacts_json: makeContactsJson(), 1790 }), 1791 ]; 1792 1793 const result = await runEnrichmentStage(); 1794 1795 // All 3 sites go through processBatch (no skipping based on forms) 1796 assert.equal(result.processed, 3, 'all 3 sites should be processed'); 1797 assert.equal(result.skipped, 0, 'no sites are skipped from processBatch'); 1798 }); 1799 }); 1800 1801 // ───────────────────────────────────────────────────────────── 1802 // Suite: dedupeByUrl utility (via social_profiles merge) 1803 // ───────────────────────────────────────────────────────────── 1804 1805 describe('Social profile deduplication', () => { 1806 beforeEach(() => { 1807 resetState(); 1808 }); 1809 1810 test('deduplicates social profiles when merging existing and new contacts', async () => { 1811 const sharedProfileUrl = 
'https://linkedin.com/company/example'; 1812 1813 mockSiteRows = [ 1814 makeSite({ 1815 id: 1000, 1816 contacts_json: makeContactsJson({ 1817 key_pages: ['https://example.com/contact'], 1818 social_profiles: [{ url: sharedProfileUrl, label: 'LinkedIn' }], 1819 }), 1820 }), 1821 ]; 1822 1823 // LLM returns the same social profile URL 1824 mockLLMResponse = { 1825 content: JSON.stringify({ 1826 business_name: 'Dedup Corp', 1827 email_addresses: [], 1828 phone_numbers: [], 1829 social_profiles: [{ url: sharedProfileUrl, label: 'LinkedIn' }], 1830 key_pages: [], 1831 }), 1832 usage: { promptTokens: 100, completionTokens: 40 }, 1833 }; 1834 1835 await runEnrichmentStage(); 1836 1837 // Should succeed and save enriched contacts 1838 const enrichUpdate = runCalls.find(c => c.sql.includes("status = 'enriched_regex'")); 1839 assert.ok(enrichUpdate, 'should mark site as enriched even with duplicate social profiles'); 1840 1841 // Verify contacts_json arg doesn't have duplicate LinkedIn entries 1842 if (enrichUpdate) { 1843 const contactsArg = enrichUpdate.args.find( 1844 a => typeof a === 'string' && a.includes(sharedProfileUrl) 1845 ); 1846 if (contactsArg) { 1847 const parsed = JSON.parse(contactsArg); 1848 const linkedinCount = (parsed.social_profiles || []).filter( 1849 p => (typeof p === 'string' ? p : p.url) === sharedProfileUrl 1850 ).length; 1851 assert.equal(linkedinCount, 1, 'LinkedIn profile should appear only once (deduped)'); 1852 } 1853 } 1854 }); 1855 }); 1856 });