enrich-supplement.test.js
/**
 * Enrichment Stage — Supplemental Coverage Tests
 *
 * Exercises branches that enrich.test.js leaves uncovered:
 * - scrapePage: social media path (humanScroll/randomDelay), cloudflare unresolved warning,
 *   waitForLoadState timeout catch (lines 884-885, 889-891, 901-902)
 * - GDPR verification block inside enrichSite (after contact pages browsed) (lines 783-810)
 * - extractInitialContacts: happy path, parse failure fallback, error fallback (lines 991-1062)
 *
 * MUST be run with --experimental-test-module-mocks.
 */

import { describe, test, mock, beforeEach } from 'node:test';
import assert from 'node:assert/strict';
import * as realFs from 'node:fs';
import { createPgMock } from '../helpers/pg-mock.js'; // eslint-disable-line no-unused-vars

// ═══════════════════════════════════════════════════════════════
// Environment setup BEFORE any mock.module() or imports
// ═══════════════════════════════════════════════════════════════
Object.assign(process.env, {
  NODE_ENV: 'test',
  LOGS_DIR: '/tmp/test-logs',
  DATABASE_PATH: '/tmp/test-enrich-supp.db',
  ENABLE_VISION: 'true',
  ENRICHMENT_CONCURRENCY: '1',
});

// Mock fetch globally to prevent real network calls from sitemap discovery.
// Every request resolves to a 404 response with an empty body.
globalThis.fetch = async function () {
  return {
    ok: false,
    status: 404,
    text: async () => '',
  };
};

// ═══════════════════════════════════════════════════════════════
// Mutable state (reset per test)
// ═══════════════════════════════════════════════════════════════
let mockSiteRows = [];
let runCalls = [];
let getCalls = [];
let dbClosed = false;
let browserCloseCalled = false;
let mockHtml = '<html><body>Contact us</body></html>';
let mockScreenshot = Buffer.from('png_data');
let mockLLMResponse = null;
let llmCallCount = 0;
let mockBlocklistResult = null;

// Country lookup stub: only GB (GDPR check required) and AU (not required) are known.
let mockCountryByCode = code => {
  switch (code) {
    case 'GB':
      return { code: 'GB', requiresGDPRCheck: true };
    case 'AU':
      return { code: 'AU', requiresGDPRCheck: false };
    default:
      return null;
  }
};

// Google-domain → country stub: answers 'AU' regardless of the domain passed in.
let mockParseCountryFromGoogleDomain = _domain => 'AU';

// ═══════════════════════════════════════════════════════════════
// 1. fs — mock readFileSync for ENRICHMENT_PROMPT
// ═══════════════════════════════════════════════════════════════
const fsMockExports = {
  ...realFs,
  // Any file read yields the same fixed prompt text.
  readFileSync: mock.fn((_path, _enc) => 'MOCK ENRICHMENT PROMPT'),
  // No path is ever reported as existing.
  existsSync: () => false,
};

mock.module('fs', { namedExports: fsMockExports });

// ═══════════════════════════════════════════════════════════════
// 2. better-sqlite3
// ═══════════════════════════════════════════════════════════════

/**
 * Prepared-statement stand-in that answers based on substrings of its SQL text.
 */
class MockStatement {
  /**
   * @param {string} sql - SQL text handed to `prepare()`.
   */
  constructor(sql) {
    this.sql = sql;
  }

  /**
   * Returns `mockSiteRows` for the "sites still to enrich" query, otherwise an empty list.
   */
  all(..._args) {
    const selectsPendingSites =
      this.sql.includes('FROM sites') && this.sql.includes('enriched_at IS NULL');
    return selectsPendingSites ? mockSiteRows : [];
  }

  /**
   * Records the call in `getCalls`, then answers google_domain lookups and
   * enrichment-stats queries; everything else yields null.
   */
  get(...args) {
    getCalls.push({ sql: this.sql, args });

    if (this.sql.includes('google_domain')) {
      const [siteId] = args;
      const match = mockSiteRows.find(row => row.id === siteId);
      // Rows without a google_domain fall back to the Australian domain.
      return match ? { google_domain: match.google_domain || 'google.com.au' } : null;
    }

    return this.sql.includes('enriched_at IS NOT NULL')
      ? { total_enriched: 5, with_forms: 2, with_emails: 4, with_phones: 1 }
      : null;
  }

  /**
   * Records the call in `runCalls` and reports a single changed row.
   */
  run(...args) {
    runCalls.push({ sql: this.sql, args });
    return { changes: 1, lastInsertRowid: 1 };
  }
}

/**
 * Database stand-in: hands out MockStatement instances and tracks open/closed
 * state through the module-level `dbClosed` flag.
 */
class MockDatabase {
  constructor(_path) {
    // A freshly constructed database counts as open.
    dbClosed = false;
  }

  prepare(sql) {
    return new MockStatement(sql);
  }

  // No-op; yields undefined.
  pragma() {}

  // No-op; yields undefined.
  exec() {}

  // Returns a wrapper that simply forwards its arguments to `fn`.
  transaction(fn) {
    return (...callArgs) => fn(...callArgs);
  }

  close() {
    dbClosed = true;
  }
}

mock.module('better-sqlite3', { defaultExport: MockDatabase });

// ═══════════════════════════════════════════════════════════════
// 2b. db.js mock — enrich.js uses db.js (not better-sqlite3 directly)
// ═══════════════════════════════════════════════════════════════

/**
 * Query stub used both at pool level and inside transactions: logs the
 * statement to `runCalls` and reports an empty result set.
 */
const recordQueryCall = async (sql, params = []) => {
  runCalls.push({ sql, args: params });
  return { rows: [], rowCount: 0 };
};

const dbMockExports = {
  // Site-selection queries get `mockSiteRows`; any other query gets no rows.
  async getAll(sql, _params = []) {
    const targetsSites = sql.includes('FROM sites');
    const mentionsEnrichment = sql.includes('enriched_at IS NULL') || sql.includes('enrich');
    return targetsSites && mentionsEnrichment ? mockSiteRows : [];
  },

  // Answers google_domain lookups (by site id in params[0]) and the enrichment-stats query.
  async getOne(sql, params = []) {
    if (sql.includes('google_domain')) {
      const [siteId] = params;
      const match = mockSiteRows.find(row => row.id === siteId);
      // Rows without a google_domain fall back to the Australian domain.
      return match ? { google_domain: match.google_domain || 'google.com.au' } : null;
    }

    return sql.includes('enriched_at IS NOT NULL')
      ? { total_enriched: 5, with_forms: 2, with_emails: 4, with_phones: 1 }
      : null;
  },

  // Writes are only recorded in `runCalls`; one affected row is always reported.
  async run(sql, args = []) {
    runCalls.push({ sql, args });
    return { changes: 1, rowCount: 1 };
  },

  query: recordQueryCall,

  // Invokes the callback with a client whose `query` records into `runCalls`.
  async withTransaction(fn) {
    const transactionClient = { query: recordQueryCall };
    return fn(transactionClient);
  },

  async closePool() {},

  getPool() {
    return {};
  },

  createDatabaseConnection() {
    return {};
  },

  async closeDatabaseConnection() {},
};

mock.module('../../src/utils/db.js', { namedExports: dbMockExports });

// ═══════════════════════════════════════════════════════════════
// 3. Logger — silent no-op
// ═══════════════════════════════════════════════════════════════

// Every log method is an empty body so test output stays quiet.
const SilentLogger = class MockLogger {
  info() {}

  success() {}

  error() {}

  warn() {}

  debug() {}
};

mock.module('../../src/utils/logger.js', { defaultExport: SilentLogger });

// ═══════════════════════════════════════════════════════════════
// 4.
// Stealth browser — mock browser/context/page chain
// ═══════════════════════════════════════════════════════════════

// Page-level stubs. `content` and `screenshot` read the module-level fixtures
// at call time, so tests can swap `mockHtml` / `mockScreenshot` between runs.
const mockPageGoto = mock.fn(async () => undefined);
const mockPageContent = mock.fn(async () => {
  return mockHtml;
});
const mockPageScreenshot = mock.fn(async () => {
  return mockScreenshot;
});
const mockPageClose = mock.fn(async () => undefined);
const mockPageWaitForLoadState = mock.fn(async () => undefined);

const mockPage = {
  goto: mockPageGoto,
  content: mockPageContent,
  screenshot: mockPageScreenshot,
  close: mockPageClose,
  waitForLoadState: mockPageWaitForLoadState,
};

// Context-level stubs: every new page is the single shared `mockPage`.
const mockContextNewPage = mock.fn(async () => {
  return mockPage;
});
const mockContextClose = mock.fn(async () => undefined);

const mockContext = {
  newPage: mockContextNewPage,
  close: mockContextClose,
};

// Browser-level stub: closing flips the module-level `browserCloseCalled` flag.
const mockBrowserClose = mock.fn(async () => {
  browserCloseCalled = true;
});

const mockBrowser = { close: mockBrowserClose };

// Stand-ins for the stealth-browser module's exported functions.
const launchStealthBrowserMock = mock.fn(async () => {
  return mockBrowser;
});
const createStealthContextMock = mock.fn(async () => {
  return mockContext;
});
const humanScrollMock = mock.fn(async () => undefined);
const randomDelayMock = mock.fn(async () => undefined);
// Defaults: no URL is social media, and the Cloudflare wait reports success.
const isSocialMediaUrlMock = mock.fn(() => false);
const waitForCloudflareMock = mock.fn(async () => true);

const stealthBrowserExports = {
  launchStealthBrowser: launchStealthBrowserMock,
  createStealthContext: createStealthContextMock,
  humanScroll: humanScrollMock,
  randomDelay: randomDelayMock,
  isSocialMediaUrl: isSocialMediaUrlMock,
  waitForCloudflare: waitForCloudflareMock,
};

mock.module('../../src/utils/stealth-browser.js', { namedExports: stealthBrowserExports });

// ═══════════════════════════════════════════════════════════════
// 5.
// LLM provider
// ═══════════════════════════════════════════════════════════════

/**
 * callLLM stand-in. Counts every invocation in `llmCallCount` and resolves to
 * `mockLLMResponse` when a test has set one, otherwise to a canned
 * single-email extraction result.
 */
const callLLMMock = mock.fn(async () => {
  llmCallCount += 1;

  if (mockLLMResponse) {
    return mockLLMResponse;
  }

  const defaultExtraction = {
    business_name: 'Test Corp',
    email_addresses: [{ email: 'info@testcorp.com', label: 'General' }],
    phone_numbers: [],
    social_profiles: [],
    key_pages: [],
    primary_contact_form: null,
  };

  return {
    content: JSON.stringify(defaultExtraction),
    usage: { promptTokens: 100, completionTokens: 50 },
  };
});

const llmProviderExports = {
  callLLM: callLLMMock,
  getProvider: mock.fn(() => 'openrouter'),
};

mock.module('../../src/utils/llm-provider.js', { namedExports: llmProviderExports });

// ═══════════════════════════════════════════════════════════════
// 6. LLM usage tracker
// ═══════════════════════════════════════════════════════════════
const llmUsageTrackerExports = {
  logLLMUsage: mock.fn(() => {}),
};

mock.module('../../src/utils/llm-usage-tracker.js', { namedExports: llmUsageTrackerExports });

// ═══════════════════════════════════════════════════════════════
// 7. error-handler
// ═══════════════════════════════════════════════════════════════
const errorHandlerExports = {
  // Runs the processor over the items one at a time, in order. A failing item
  // contributes its error to `errors` and a null placeholder to `results`.
  processBatch: mock.fn(async (items, processor, _opts) => {
    const results = [];
    const errors = [];

    for (let index = 0; index < items.length; index += 1) {
      let outcome = null;
      try {
        outcome = await processor(items[index], index);
      } catch (failure) {
        errors.push(failure);
      }
      results.push(outcome);
    }

    return { results, errors };
  }),

  // Falsy input or unparseable JSON both yield the fallback.
  safeJsonParse: mock.fn((str, fallback = null) => {
    if (!str) {
      return fallback;
    }
    try {
      return JSON.parse(str);
    } catch {
      return fallback;
    }
  }),

  // No retries in tests: the operation is invoked exactly once.
  retryWithBackoff: mock.fn(async operation => operation()),
};

mock.module('../../src/utils/error-handler.js', { namedExports: errorHandlerExports });

// ═══════════════════════════════════════════════════════════════
// 8.
// summary-generator
// ═══════════════════════════════════════════════════════════════

// Both reporting functions are replaced by recording no-ops.
const summaryGeneratorExports = {
  generateStageCompletion: mock.fn(() => {}),
  displayProgress: mock.fn(() => {}),
};

mock.module('../../src/utils/summary-generator.js', { namedExports: summaryGeneratorExports });

// ═══════════════════════════════════════════════════════════════
// 9. adaptive-concurrency
// ═══════════════════════════════════════════════════════════════

// Concurrency is pinned to 1 and the screen is always reported inactive.
const adaptiveConcurrencyExports = {
  getAdaptiveConcurrencyFast: mock.fn(() => 1),
  getAdaptiveConcurrency: mock.fn(() => 1),
  isScreenActive: mock.fn(() => false),
};

mock.module('../../src/utils/adaptive-concurrency.js', {
  namedExports: adaptiveConcurrencyExports,
});

// ═══════════════════════════════════════════════════════════════
// 10. site-filters
// ═══════════════════════════════════════════════════════════════

// Reads `mockBlocklistResult` at call time so each test can choose the verdict.
const checkBlocklistMock = mock.fn((_domain, _country) => {
  return mockBlocklistResult;
});

const siteFiltersExports = {
  checkBlocklist: checkBlocklistMock,

  // Domain lists are empty in tests.
  DIRECTORY_DOMAINS: [],
  SOCIAL_MEDIA_DOMAINS: [],
  DEMO_EMAIL_DOMAINS: [],
  loadFranchiseDomains: mock.fn(() => []),

  // Every classification predicate answers false.
  isGovernmentDomain: mock.fn(() => false),
  isEducationDomain: mock.fn(() => false),
  isNonCommercialDomain: mock.fn(() => false),
  isDemoEmail: mock.fn(() => false),
  isGovernmentEmail: mock.fn(() => false),
};

mock.module('../../src/utils/site-filters.js', { namedExports: siteFiltersExports });

// ═══════════════════════════════════════════════════════════════
// 11.
// gdpr-verification
// ═══════════════════════════════════════════════════════════════

// Default batch verdict: a single high-confidence verified entry, built fresh per call.
const batchVerifyEmailsMock = mock.fn(() => {
  return [{ isVerified: true, confidence: 'high', reason: 'Company domain' }];
});

const gdprVerificationExports = {
  // Single-email verification mirrors the batch default.
  verifyCompanyEmail: mock.fn(() => {
    return {
      isVerified: true,
      confidence: 'high',
      reason: 'Company domain',
    };
  }),
  batchVerifyEmails: batchVerifyEmailsMock,
  isFreeEmailProvider: mock.fn(() => false),
  searchCompanyTypes: mock.fn(() => []),
  searchCompanyKeywords: mock.fn(() => []),
  getKeyPageNames: mock.fn(() => []),
};

mock.module('../../src/utils/gdpr-verification.js', { namedExports: gdprVerificationExports });

// ═══════════════════════════════════════════════════════════════
// 12. countries.js
// ═══════════════════════════════════════════════════════════════

// Delegates to the reassignable `mockCountryByCode`, looked up on every call.
const getCountryByCodeMock = mock.fn(code => {
  return mockCountryByCode(code);
});

const countriesExports = {
  getCountryByCode: getCountryByCodeMock,
  getCountryByGoogleDomain: mock.fn(() => null),
  // Codes pass through unchanged.
  normaliseCountryCode: mock.fn(code => code),
  COUNTRIES: {},
  FREE_EMAIL_PROVIDERS: [],
  isFreeEmailProvider: mock.fn(() => false),
  getSupportedCountries: mock.fn(() => []),
  getGDPRCountries: mock.fn(() => []),
  isMobileNumber: mock.fn(() => false),
};

mock.module('../../src/config/countries.js', { namedExports: countriesExports });

// ═══════════════════════════════════════════════════════════════
// 13. retry-handler
// ═══════════════════════════════════════════════════════════════

// Recording no-ops; kept as named constants so their call lists can be inspected.
const recordFailureMock = mock.fn(() => {});
const resetRetriesMock = mock.fn(() => {});

const retryHandlerExports = {
  recordFailure: recordFailureMock,
  resetRetries: resetRetriesMock,
  getRetryStats: mock.fn(() => ({})),
};

mock.module('../../src/utils/retry-handler.js', { namedExports: retryHandlerExports });

// ═══════════════════════════════════════════════════════════════
// 14.
// tld-detector
// ═══════════════════════════════════════════════════════════════

// Delegates to the reassignable `mockParseCountryFromGoogleDomain`, looked up on every call.
const parseCountryFromGoogleDomainMock = mock.fn(domain => {
  return mockParseCountryFromGoogleDomain(domain);
});

const tldDetectorExports = {
  parseCountryFromGoogleDomain: parseCountryFromGoogleDomainMock,
  detectCountryFromTLD: mock.fn(() => null),
};

mock.module('../../src/utils/tld-detector.js', { namedExports: tldDetectorExports });

// ═══════════════════════════════════════════════════════════════
// 15. phone-normalizer
// ═══════════════════════════════════════════════════════════════

// Identity implementation shared by the pass-through stubs below; each stub is
// still its own mock function with its own call history.
const returnInputUnchanged = value => value;

const normalizePhoneNumberMock = mock.fn(returnInputUnchanged);

const phoneNormalizerExports = {
  normalizePhoneNumber: normalizePhoneNumberMock,
  normalizePhoneNumbers: mock.fn(returnInputUnchanged),
  addCountryCode: mock.fn(returnInputUnchanged),
  isFakeNumber: mock.fn(() => false),
  cleanPhoneNumbers: mock.fn(returnInputUnchanged),
  // Every number is accepted as SMS-capable.
  isValidSmsNumber: mock.fn(() => {
    return { valid: true };
  }),
};

mock.module('../../src/utils/phone-normalizer.js', { namedExports: phoneNormalizerExports });

// ═══════════════════════════════════════════════════════════════
// 16.
contacts/prioritize.js 427 // ═══════════════════════════════════════════════════════════════ 428 const cleanInvalidSocialLinksMock = mock.fn(contacts => contacts || {}); 429 430 mock.module('../../src/contacts/prioritize.js', { 431 namedExports: { 432 cleanInvalidSocialLinks: cleanInvalidSocialLinksMock, 433 getAllContacts: mock.fn(() => []), 434 parseAvailableChannels: mock.fn(() => []), 435 prioritizeContacts: mock.fn(() => []), 436 updateOutreachContacts: mock.fn(() => {}), 437 bulkUpdateOutreachContacts: mock.fn(() => {}), 438 getOutreachReadinessReport: mock.fn(() => ({})), 439 getAllContactsWithNames: mock.fn(async () => []), 440 }, 441 defaultExport: {}, 442 }); 443 444 // ═══════════════════════════════════════════════════════════════ 445 // contacts-storage — use site row fallback, avoid real filesystem 446 // ═══════════════════════════════════════════════════════════════ 447 mock.module('../../src/utils/contacts-storage.js', { 448 namedExports: { 449 getContactsJson: mock.fn(() => null), 450 getContactsData: mock.fn(() => null), 451 setContactsJson: mock.fn(() => {}), 452 deleteContactsJson: mock.fn(() => false), 453 hasContactsJson: mock.fn(() => false), 454 getContactsJsonWithFallback: mock.fn((siteId, dbRow) => dbRow?.contacts_json || null), 455 getContactsDataWithFallback: mock.fn((siteId, dbRow) => { 456 const raw = dbRow?.contacts_json; 457 if (!raw) return null; 458 try { return JSON.parse(raw); } catch { return null; } 459 }), 460 DATA_DIR: '/tmp/test-contacts', 461 }, 462 }); 463 464 // ═══════════════════════════════════════════════════════════════ 465 // html-storage — return html_dom from mockSiteRows 466 // ═══════════════════════════════════════════════════════════════ 467 mock.module('../../src/utils/html-storage.js', { 468 namedExports: { 469 readHtmlDom: mock.fn(siteId => { 470 const site = mockSiteRows.find(s => s.id === siteId); 471 return site?.html_dom || null; 472 }), 473 writeKeyPagesHtml: mock.fn(() => {}), 474 readKeyPagesHtml: 
mock.fn(() => null), 475 deleteHtmlDom: mock.fn(() => {}), 476 deleteKeyPagesHtml: mock.fn(() => {}), 477 }, 478 }); 479 480 // ═══════════════════════════════════════════════════════════════ 481 // score-storage — use site row fallback 482 // ═══════════════════════════════════════════════════════════════ 483 mock.module('../../src/utils/score-storage.js', { 484 namedExports: { 485 getScoreJson: mock.fn(() => null), 486 getScoreData: mock.fn(() => null), 487 setScoreJson: mock.fn(() => {}), 488 getScoreJsonWithFallback: mock.fn((siteId, dbRow) => dbRow?.score_json || null), 489 getScoreDataWithFallback: mock.fn((siteId, dbRow) => { 490 const raw = dbRow?.score_json; 491 if (!raw) return null; 492 try { return JSON.parse(raw); } catch { return null; } 493 }), 494 }, 495 }); 496 497 // ═══════════════════════════════════════════════════════════════ 498 // Import module under test AFTER all mocks 499 // ═══════════════════════════════════════════════════════════════ 500 const { runEnrichmentStage } = await import('../../src/stages/enrich.js'); 501 502 // ═══════════════════════════════════════════════════════════════ 503 // Helpers 504 // ═══════════════════════════════════════════════════════════════ 505 function makeSite(overrides = {}) { 506 return { 507 id: 1, 508 domain: 'example.com', 509 url: 'https://example.com', 510 contacts_json: null, 511 html_dom: '<html><body>Test</body></html>', 512 score_json: null, 513 country_code: 'AU', 514 google_domain: 'google.com.au', 515 ...overrides, 516 }; 517 } 518 519 function makeContactsJson(extra = {}) { 520 return JSON.stringify({ 521 business_name: 'Test Co', 522 email_addresses: [], 523 phone_numbers: [], 524 social_profiles: [], 525 key_pages: [], 526 ...extra, 527 }); 528 } 529 530 function resetState() { 531 mockSiteRows = []; 532 runCalls = []; 533 getCalls = []; 534 dbClosed = false; 535 browserCloseCalled = false; 536 llmCallCount = 0; 537 mockHtml = '<html><body>Contact us</body></html>'; 538 mockScreenshot = 
Buffer.from('png_data'); 539 mockLLMResponse = null; 540 mockBlocklistResult = null; 541 542 mockCountryByCode = code => { 543 if (code === 'GB') return { code: 'GB', requiresGDPRCheck: true }; 544 if (code === 'AU') return { code: 'AU', requiresGDPRCheck: false }; 545 return null; 546 }; 547 mockParseCountryFromGoogleDomain = _domain => 'AU'; 548 549 mockPageGoto.mock.resetCalls(); 550 mockPageContent.mock.resetCalls(); 551 mockPageScreenshot.mock.resetCalls(); 552 mockPageClose.mock.resetCalls(); 553 mockPageWaitForLoadState.mock.resetCalls(); 554 mockContextNewPage.mock.resetCalls(); 555 mockContextClose.mock.resetCalls(); 556 mockBrowserClose.mock.resetCalls(); 557 callLLMMock.mock.resetCalls(); 558 recordFailureMock.mock.resetCalls(); 559 resetRetriesMock.mock.resetCalls(); 560 checkBlocklistMock.mock.resetCalls(); 561 batchVerifyEmailsMock.mock.resetCalls(); 562 getCountryByCodeMock.mock.resetCalls(); 563 parseCountryFromGoogleDomainMock.mock.resetCalls(); 564 normalizePhoneNumberMock.mock.resetCalls(); 565 cleanInvalidSocialLinksMock.mock.resetCalls(); 566 launchStealthBrowserMock.mock.resetCalls(); 567 isSocialMediaUrlMock.mock.resetCalls(); 568 waitForCloudflareMock.mock.resetCalls(); 569 humanScrollMock.mock.resetCalls(); 570 randomDelayMock.mock.resetCalls(); 571 572 mockPageGoto.mock.mockImplementation(async () => {}); 573 mockPageContent.mock.mockImplementation(async () => mockHtml); 574 mockPageScreenshot.mock.mockImplementation(async () => mockScreenshot); 575 mockPageWaitForLoadState.mock.mockImplementation(async () => {}); 576 isSocialMediaUrlMock.mock.mockImplementation(() => false); 577 waitForCloudflareMock.mock.mockImplementation(async () => true); 578 579 callLLMMock.mock.mockImplementation(async () => { 580 llmCallCount++; 581 return ( 582 mockLLMResponse || { 583 content: JSON.stringify({ 584 business_name: 'Test Corp', 585 email_addresses: [{ email: 'info@testcorp.com', label: 'General' }], 586 phone_numbers: [], 587 social_profiles: [], 
588 key_pages: [], 589 primary_contact_form: null, 590 }), 591 usage: { promptTokens: 100, completionTokens: 50 }, 592 } 593 ); 594 }); 595 getCountryByCodeMock.mock.mockImplementation(code => mockCountryByCode(code)); 596 parseCountryFromGoogleDomainMock.mock.mockImplementation(domain => 597 mockParseCountryFromGoogleDomain(domain) 598 ); 599 } 600 601 // ═══════════════════════════════════════════════════════════════ 602 // Test Suites 603 // ═══════════════════════════════════════════════════════════════ 604 605 describe('Enrichment Stage — Supplement Coverage', () => { 606 // ───────────────────────────────────────────────────────────── 607 // Suite: scrapePage — social media stealth path (lines 888-891) 608 // ───────────────────────────────────────────────────────────── 609 describe('scrapePage — social media stealth path', () => { 610 beforeEach(() => { 611 resetState(); 612 }); 613 614 test('calls humanScroll and randomDelay for social media URLs', async () => { 615 // Make isSocialMediaUrl return true so the social branch in scrapePage executes 616 isSocialMediaUrlMock.mock.mockImplementation(() => true); 617 618 mockSiteRows = [ 619 makeSite({ 620 id: 1001, 621 contacts_json: makeContactsJson({ 622 key_pages: ['https://facebook.com/testbiz/about'], 623 }), 624 }), 625 ]; 626 627 await runEnrichmentStage(); 628 629 assert.ok( 630 humanScrollMock.mock.calls.length >= 1, 631 'humanScroll should be called for social media pages' 632 ); 633 assert.ok( 634 randomDelayMock.mock.calls.length >= 2, 635 'randomDelay should be called multiple times for social media pages' 636 ); 637 }); 638 639 test('uses aggressive stealth context for social media URLs', async () => { 640 isSocialMediaUrlMock.mock.mockImplementation(() => true); 641 642 mockSiteRows = [ 643 makeSite({ 644 id: 1002, 645 contacts_json: makeContactsJson({ 646 key_pages: ['https://instagram.com/testbiz/contact'], 647 }), 648 }), 649 ]; 650 651 await runEnrichmentStage(); 652 653 // createStealthContext 
should be called with stealthLevel: 'aggressive' 654 const stealthCalls = createStealthContextMock.mock.calls; 655 assert.ok(stealthCalls.length >= 1, 'createStealthContext should be called'); 656 const opts = stealthCalls[0].arguments[1]; 657 assert.equal( 658 opts.stealthLevel, 659 'aggressive', 660 'should use aggressive stealth for social URLs' 661 ); 662 }); 663 664 test('uses minimal stealth context for non-social URLs', async () => { 665 // Reset call counts explicitly before this test to avoid cross-test contamination 666 createStealthContextMock.mock.resetCalls(); 667 isSocialMediaUrlMock.mock.mockImplementation(() => false); 668 669 mockSiteRows = [ 670 makeSite({ 671 id: 1003, 672 contacts_json: makeContactsJson({ 673 key_pages: ['https://example.com/contact'], 674 }), 675 }), 676 ]; 677 678 await runEnrichmentStage(); 679 680 const stealthCalls = createStealthContextMock.mock.calls; 681 assert.ok(stealthCalls.length >= 1, 'createStealthContext should be called'); 682 // Find the call made during this test (last call if multiple) 683 const lastCall = stealthCalls[stealthCalls.length - 1]; 684 const opts = lastCall.arguments[1]; 685 assert.equal(opts.stealthLevel, 'minimal', 'should use minimal stealth for non-social URLs'); 686 }); 687 }); 688 689 // ───────────────────────────────────────────────────────────── 690 // Suite: scrapePage — Cloudflare not resolved warning (lines 883-885) 691 // ───────────────────────────────────────────────────────────── 692 describe('scrapePage — Cloudflare challenge not resolved', () => { 693 beforeEach(() => { 694 resetState(); 695 }); 696 697 test('logs warning but continues when Cloudflare is not resolved', async () => { 698 // Make waitForCloudflare return false (challenge still blocking) 699 waitForCloudflareMock.mock.mockImplementation(async () => false); 700 701 mockSiteRows = [ 702 makeSite({ 703 id: 1010, 704 contacts_json: makeContactsJson({ 705 key_pages: ['https://example.com/contact'], 706 }), 707 }), 708 ]; 
709 710 // Should still succeed (warning only, not a hard error) 711 const result = await runEnrichmentStage(); 712 713 assert.ok(waitForCloudflareMock.mock.calls.length >= 1, 'waitForCloudflare should be called'); 714 // Site should succeed even with Cloudflare warning 715 assert.equal(result.processed, 1, 'site should still be processed'); 716 }); 717 718 test('still reads page content after Cloudflare warning', async () => { 719 waitForCloudflareMock.mock.mockImplementation(async () => false); 720 721 mockSiteRows = [ 722 makeSite({ 723 id: 1011, 724 contacts_json: makeContactsJson({ 725 key_pages: ['https://example.com/about'], 726 }), 727 }), 728 ]; 729 730 await runEnrichmentStage(); 731 732 assert.ok( 733 mockPageContent.mock.calls.length >= 1, 734 'page.content() should still be called after Cloudflare warning' 735 ); 736 }); 737 }); 738 739 // ───────────────────────────────────────────────────────────── 740 // Suite: scrapePage — waitForLoadState timeout (lines 901-902) 741 // ───────────────────────────────────────────────────────────── 742 describe('scrapePage — waitForLoadState timeout caught', () => { 743 beforeEach(() => { 744 resetState(); 745 }); 746 747 test('continues enrichment when waitForLoadState times out', async () => { 748 // Make waitForLoadState throw to exercise the inner try/catch 749 mockPageWaitForLoadState.mock.mockImplementation(async () => { 750 throw new Error('Timeout waiting for load state'); 751 }); 752 753 mockSiteRows = [ 754 makeSite({ 755 id: 1020, 756 contacts_json: makeContactsJson({ 757 key_pages: ['https://example.com/contact'], 758 }), 759 }), 760 ]; 761 762 // Should not throw — the timeout is caught and ignored 763 const result = await runEnrichmentStage(); 764 765 assert.equal(result.processed, 1, 'should process site even when waitForLoadState times out'); 766 }); 767 768 test('still calls page.content() after waitForLoadState timeout', async () => { 769 mockPageWaitForLoadState.mock.mockImplementation(async () => 
{ 770 throw new Error('Timed out waiting for load state'); 771 }); 772 773 mockSiteRows = [ 774 makeSite({ 775 id: 1021, 776 contacts_json: makeContactsJson({ 777 key_pages: ['https://example.com/contact'], 778 }), 779 }), 780 ]; 781 782 await runEnrichmentStage(); 783 784 assert.ok( 785 mockPageContent.mock.calls.length >= 1, 786 'page.content() should still be called after waitForLoadState timeout' 787 ); 788 }); 789 }); 790 791 // ───────────────────────────────────────────────────────────── 792 // Suite: ENABLE_VISION=false path (disables screenshot in scrapePage) 793 // ───────────────────────────────────────────────────────────── 794 describe('scrapePage — ENABLE_VISION=false skips screenshot', () => { 795 beforeEach(() => { 796 resetState(); 797 process.env.ENABLE_VISION = 'false'; 798 }); 799 800 // Restore after suite 801 test('does not call page.screenshot() when ENABLE_VISION=false', async () => { 802 mockSiteRows = [ 803 makeSite({ 804 id: 1030, 805 contacts_json: makeContactsJson({ 806 key_pages: ['https://example.com/contact'], 807 }), 808 }), 809 ]; 810 811 await runEnrichmentStage(); 812 813 assert.equal( 814 mockPageScreenshot.mock.calls.length, 815 0, 816 'page.screenshot() should not be called when ENABLE_VISION=false' 817 ); 818 819 // Restore 820 process.env.ENABLE_VISION = 'true'; 821 }); 822 823 test('still enriches site without screenshot', async () => { 824 mockSiteRows = [ 825 makeSite({ 826 id: 1031, 827 contacts_json: makeContactsJson({ 828 key_pages: ['https://example.com/contact'], 829 }), 830 }), 831 ]; 832 833 const result = await runEnrichmentStage(); 834 835 assert.equal(result.processed, 1, 'should process site without screenshot'); 836 837 process.env.ENABLE_VISION = 'true'; 838 }); 839 }); 840 841 // ───────────────────────────────────────────────────────────── 842 // Suite: extractInitialContacts path (lines 991-1062) 843 // Triggered by: contacts_json=null, score_json=null, 844 // ENABLE_ENRICHMENT_LLM=false 845 // 
───────────────────────────────────────────────────────────── 846 describe('extractInitialContacts path (ENABLE_ENRICHMENT_LLM=false)', () => { 847 beforeEach(() => { 848 resetState(); 849 process.env.ENABLE_ENRICHMENT_LLM = 'false'; 850 }); 851 852 test('calls LLM when ENABLE_ENRICHMENT_LLM=false and no contacts/score_json', async () => { 853 mockSiteRows = [ 854 makeSite({ 855 id: 1040, 856 contacts_json: null, 857 score_json: null, 858 html_dom: '<html><body>Contact page</body></html>', 859 }), 860 ]; 861 862 mockLLMResponse = { 863 content: JSON.stringify({ 864 business_name: 'LLM Corp', 865 email_addresses: [{ email: 'llm@llmcorp.com', label: 'Main' }], 866 phone_numbers: [], 867 social_profiles: [], 868 key_pages: [], 869 }), 870 usage: { promptTokens: 100, completionTokens: 50 }, 871 }; 872 873 const result = await runEnrichmentStage(); 874 875 assert.equal(result.processed, 1, 'should process the site'); 876 assert.ok(llmCallCount >= 1, 'LLM should be called via extractInitialContacts'); 877 878 process.env.ENABLE_ENRICHMENT_LLM = 'true'; 879 }); 880 881 test('returns minimal contact structure when LLM parse fails', async () => { 882 mockSiteRows = [ 883 makeSite({ 884 id: 1041, 885 contacts_json: null, 886 score_json: null, 887 html_dom: '<html><body>Contact</body></html>', 888 }), 889 ]; 890 891 // Return invalid JSON so safeJsonParse returns null 892 mockLLMResponse = { 893 content: 'NOT JSON AT ALL <<<>>>', 894 usage: { promptTokens: 100, completionTokens: 10 }, 895 }; 896 897 const result = await runEnrichmentStage(); 898 899 // With null parse result, extractInitialContacts returns minimal structure 900 // which has no contacts → recordFailure is called (no contacts, no key_pages) 901 assert.equal(result.processed, 1, 'should process site even with LLM parse failure'); 902 903 process.env.ENABLE_ENRICHMENT_LLM = 'true'; 904 }); 905 906 test('returns minimal structure when LLM call throws', async () => { 907 mockSiteRows = [ 908 makeSite({ 909 id: 
1042, 910 contacts_json: null, 911 score_json: null, 912 html_dom: '<html><body>Contact</body></html>', 913 }), 914 ]; 915 916 // Make LLM throw an error to hit the catch block in extractInitialContacts 917 callLLMMock.mock.mockImplementation(async () => { 918 llmCallCount++; 919 throw new Error('LLM API error'); 920 }); 921 922 // extractInitialContacts catches the error and returns minimal structure 923 // minimal structure has no contacts → recordFailure path 924 const result = await runEnrichmentStage(); 925 926 assert.equal(result.processed, 1, 'should process even when LLM throws'); 927 928 process.env.ENABLE_ENRICHMENT_LLM = 'true'; 929 }); 930 931 test('cleans invalid social links from LLM extractInitialContacts result', async () => { 932 mockSiteRows = [ 933 makeSite({ 934 id: 1043, 935 contacts_json: null, 936 score_json: null, 937 html_dom: '<html><body>Contact</body></html>', 938 }), 939 ]; 940 941 mockLLMResponse = { 942 content: JSON.stringify({ 943 business_name: 'Social LLM Corp', 944 email_addresses: [{ email: 'info@sociallm.com', label: 'Main' }], 945 phone_numbers: [], 946 social_profiles: [{ url: 'https://twitter.com/handle', label: 'Twitter' }], 947 key_pages: [], 948 }), 949 usage: { promptTokens: 100, completionTokens: 50 }, 950 }; 951 952 await runEnrichmentStage(); 953 954 assert.ok( 955 cleanInvalidSocialLinksMock.mock.calls.length >= 1, 956 'cleanInvalidSocialLinks should be called on extractInitialContacts result' 957 ); 958 959 process.env.ENABLE_ENRICHMENT_LLM = 'true'; 960 }); 961 962 test('normalizes phone numbers from LLM extractInitialContacts result', async () => { 963 mockSiteRows = [ 964 makeSite({ 965 id: 1044, 966 contacts_json: null, 967 score_json: null, 968 html_dom: '<html><body>Contact</body></html>', 969 }), 970 ]; 971 972 mockLLMResponse = { 973 content: JSON.stringify({ 974 business_name: 'Phone LLM Corp', 975 email_addresses: [{ email: 'info@phonellm.com', label: 'Main' }], 976 phone_numbers: [{ number: 
'+61412345678', label: 'Office' }], 977 social_profiles: [], 978 key_pages: [], 979 }), 980 usage: { promptTokens: 100, completionTokens: 50 }, 981 }; 982 983 let normalizeCalled = false; 984 normalizePhoneNumberMock.mock.mockImplementation(p => { 985 normalizeCalled = true; 986 return p; 987 }); 988 989 await runEnrichmentStage(); 990 991 assert.ok( 992 normalizeCalled, 993 'normalizePhoneNumber should be called on phone numbers from extractInitialContacts' 994 ); 995 996 process.env.ENABLE_ENRICHMENT_LLM = 'true'; 997 }); 998 }); 999 1000 // ───────────────────────────────────────────────────────────── 1001 // Suite: GDPR verification in enrichSite after browsing 1002 // (lines 783-810 — the block with companyProof / gdprVerified) 1003 // ───────────────────────────────────────────────────────────── 1004 describe('GDPR verification after contact page browsing', () => { 1005 beforeEach(() => { 1006 resetState(); 1007 }); 1008 1009 test('runs GDPR verification and sets company_proof when GB site has emails after browsing', async () => { 1010 // Configure: GB requires GDPR, google.co.uk → GB (no country mismatch) 1011 mockParseCountryFromGoogleDomain = _domain => 'GB'; 1012 mockCountryByCode = code => { 1013 if (code === 'GB') return { code: 'GB', requiresGDPRCheck: true }; 1014 return null; 1015 }; 1016 parseCountryFromGoogleDomainMock.mock.mockImplementation(domain => 1017 mockParseCountryFromGoogleDomain(domain) 1018 ); 1019 getCountryByCodeMock.mock.mockImplementation(code => mockCountryByCode(code)); 1020 1021 // Page HTML that regex extractor will detect emails from 1022 mockHtml = '<html><body><a href="mailto:info@uk-co.co.uk">info@uk-co.co.uk</a></body></html>'; 1023 1024 mockSiteRows = [ 1025 makeSite({ 1026 id: 1050, 1027 domain: 'uk-co.co.uk', 1028 country_code: 'GB', 1029 google_domain: 'google.co.uk', 1030 contacts_json: JSON.stringify({ 1031 business_name: 'UK Co', 1032 email_addresses: [], 1033 phone_numbers: [], 1034 social_profiles: [], 1035 
key_pages: ['https://uk-co.co.uk/contact'], 1036 country_code: 'GB', 1037 }), 1038 }), 1039 ]; 1040 1041 await runEnrichmentStage(); 1042 1043 // Should have called batchVerifyEmails (GDPR path) 1044 assert.ok( 1045 batchVerifyEmailsMock.mock.calls.length >= 1, 1046 'batchVerifyEmails should be called for GB site after browsing' 1047 ); 1048 1049 // The final UPDATE should include company_proof 1050 const gdprUpdate = runCalls.find( 1051 c => c.sql.includes('company_proof') && c.sql.includes("status = 'enriched_regex'") 1052 ); 1053 assert.ok(gdprUpdate, 'should store company_proof for GDPR site after browser enrichment'); 1054 }); 1055 1056 test('sets gdpr_verified=1 when at least one email has high confidence', async () => { 1057 mockParseCountryFromGoogleDomain = _domain => 'GB'; 1058 mockCountryByCode = code => { 1059 if (code === 'GB') return { code: 'GB', requiresGDPRCheck: true }; 1060 return null; 1061 }; 1062 parseCountryFromGoogleDomainMock.mock.mockImplementation(domain => 1063 mockParseCountryFromGoogleDomain(domain) 1064 ); 1065 getCountryByCodeMock.mock.mockImplementation(code => mockCountryByCode(code)); 1066 1067 batchVerifyEmailsMock.mock.mockImplementation(() => [ 1068 { isVerified: true, confidence: 'high', reason: 'Company domain' }, 1069 ]); 1070 1071 mockHtml = 1072 '<html><body><a href="mailto:info@gbsite.co.uk">info@gbsite.co.uk</a></body></html>'; 1073 1074 mockSiteRows = [ 1075 makeSite({ 1076 id: 1051, 1077 domain: 'gbsite.co.uk', 1078 country_code: 'GB', 1079 google_domain: 'google.co.uk', 1080 contacts_json: JSON.stringify({ 1081 business_name: 'GB Site', 1082 email_addresses: [], 1083 phone_numbers: [], 1084 social_profiles: [], 1085 key_pages: ['https://gbsite.co.uk/contact'], 1086 country_code: 'GB', 1087 }), 1088 }), 1089 ]; 1090 1091 await runEnrichmentStage(); 1092 1093 const gdprUpdate = runCalls.find( 1094 c => c.sql.includes('gdpr_verified') && c.sql.includes("status = 'enriched_regex'") 1095 ); 1096 assert.ok(gdprUpdate, 
'should have a DB update with gdpr_verified field'); 1097 1098 // Find the gdpr_verified value in the run args (it's the 6th positional param) 1099 if (gdprUpdate) { 1100 const { args } = gdprUpdate; 1101 // Look for value 1 in args (gdpr_verified=1 for verified) 1102 assert.ok(args.includes(1), 'gdpr_verified should be 1 when high confidence email found'); 1103 } 1104 }); 1105 1106 test('sets gdpr_verified=0 when all emails are unverified', async () => { 1107 mockParseCountryFromGoogleDomain = _domain => 'GB'; 1108 mockCountryByCode = code => { 1109 if (code === 'GB') return { code: 'GB', requiresGDPRCheck: true }; 1110 return null; 1111 }; 1112 parseCountryFromGoogleDomainMock.mock.mockImplementation(domain => 1113 mockParseCountryFromGoogleDomain(domain) 1114 ); 1115 getCountryByCodeMock.mock.mockImplementation(code => mockCountryByCode(code)); 1116 1117 // Return unverified results 1118 batchVerifyEmailsMock.mock.mockImplementation(() => [ 1119 { isVerified: false, confidence: 'low', reason: 'Free email provider' }, 1120 ]); 1121 1122 mockHtml = 1123 '<html><body><a href="mailto:info@gbsite2.co.uk">info@gbsite2.co.uk</a></body></html>'; 1124 1125 mockSiteRows = [ 1126 makeSite({ 1127 id: 1052, 1128 domain: 'gbsite2.co.uk', 1129 country_code: 'GB', 1130 google_domain: 'google.co.uk', 1131 contacts_json: JSON.stringify({ 1132 business_name: 'GB Site 2', 1133 email_addresses: [], 1134 phone_numbers: [], 1135 social_profiles: [], 1136 key_pages: ['https://gbsite2.co.uk/contact'], 1137 country_code: 'GB', 1138 }), 1139 }), 1140 ]; 1141 1142 await runEnrichmentStage(); 1143 1144 const gdprUpdate = runCalls.find( 1145 c => c.sql.includes('gdpr_verified') && c.sql.includes("status = 'enriched_regex'") 1146 ); 1147 assert.ok(gdprUpdate, 'should have a DB update with gdpr_verified field'); 1148 }); 1149 1150 test('skips GDPR check when countryCode throws from getCountryByCode', async () => { 1151 mockParseCountryFromGoogleDomain = _domain => 'XX'; 1152 mockCountryByCode 
= _code => { 1153 throw new Error('Unknown country code'); 1154 }; 1155 parseCountryFromGoogleDomainMock.mock.mockImplementation(domain => 1156 mockParseCountryFromGoogleDomain(domain) 1157 ); 1158 getCountryByCodeMock.mock.mockImplementation(code => mockCountryByCode(code)); 1159 1160 mockHtml = '<html><body><a href="mailto:info@xxsite.xx">info@xxsite.xx</a></body></html>'; 1161 1162 mockSiteRows = [ 1163 makeSite({ 1164 id: 1053, 1165 domain: 'xxsite.xx', 1166 country_code: 'XX', 1167 google_domain: 'google.xx', 1168 contacts_json: JSON.stringify({ 1169 business_name: 'XX Site', 1170 email_addresses: [], 1171 phone_numbers: [], 1172 social_profiles: [], 1173 key_pages: ['https://xxsite.xx/contact'], 1174 country_code: 'XX', 1175 }), 1176 }), 1177 ]; 1178 1179 // Should not throw — exception from getCountryByCode is caught 1180 const result = await runEnrichmentStage(); 1181 assert.equal(result.processed, 1, 'should process site even when getCountryByCode throws'); 1182 }); 1183 }); 1184 1185 // ───────────────────────────────────────────────────────────── 1186 // Suite: Social + Cloudflare combined 1187 // ───────────────────────────────────────────────────────────── 1188 describe('scrapePage — social media with Cloudflare unresolved', () => { 1189 beforeEach(() => { 1190 resetState(); 1191 }); 1192 1193 test('social media page with Cloudflare unresolved still completes', async () => { 1194 humanScrollMock.mock.resetCalls(); 1195 isSocialMediaUrlMock.mock.mockImplementation(() => true); 1196 waitForCloudflareMock.mock.mockImplementation(async () => false); 1197 1198 // key_pages URL must match contactPagePattern to trigger browsing 1199 mockSiteRows = [ 1200 makeSite({ 1201 id: 1060, 1202 contacts_json: makeContactsJson({ 1203 key_pages: ['https://facebook.com/testbiz/contact'], 1204 }), 1205 }), 1206 ]; 1207 1208 const result = await runEnrichmentStage(); 1209 1210 assert.equal(result.processed, 1, 'should process social page even with Cloudflare issue'); 1211 
assert.ok( 1212 humanScrollMock.mock.calls.length >= 1, 1213 'humanScroll should still be called for social page' 1214 ); 1215 }); 1216 }); 1217 });