/ tests / stages / enrich.test.js
enrich.test.js
   1  /**
   2   * Enrichment Stage Unit Tests
   3   *
   4   * Exercises runEnrichmentStage() through mocked dependencies:
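 *   - ../utils/db.js: PostgreSQL helper mock (getAll/getOne/run/query/withTransaction) — the DB layer enrich.js uses
 *   - better-sqlite3: legacy MockDatabase class with SQL-keyword matching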
   6   *   - ../utils/stealth-browser.js: mock browser launch/context/page
   7   *   - ../utils/llm-provider.js: controlled LLM responses
   8   *   - ../utils/error-handler.js: pass-through processBatch, safeJsonParse
   9   *   - ../utils/summary-generator.js, adaptive-concurrency.js, site-filters.js
  10   *   - ../utils/gdpr-verification.js, config/countries.js
  11   *   - ../utils/retry-handler.js, tld-detector.js, phone-normalizer.js
  12   *   - ../contacts/prioritize.js
  13   *   - fs (readFileSync for ENRICHMENT_PROMPT)
  14   *   - globalThis.fetch (discoverContactPagesFromSitemap uses fetch)
  15   *
  16   * MUST be run with --experimental-test-module-mocks.
  17   * Run: NODE_ENV=test LOGS_DIR=/tmp/test-logs DATABASE_PATH=/tmp/test-sites.db \
  18   *   node --experimental-test-module-mocks --test tests/stages/enrich.test.js
  19   */
  20  
  21  import { test, describe, mock, beforeEach } from 'node:test';
  22  import assert from 'node:assert/strict';
  23  import * as realFs from 'node:fs';
  24  import { createPgMock } from '../helpers/pg-mock.js'; // eslint-disable-line no-unused-vars
  25  
  26  // ═══════════════════════════════════════════════════════════════
  27  // Environment setup BEFORE any mock.module() or imports
  28  // ═══════════════════════════════════════════════════════════════
  29  process.env.NODE_ENV = 'test';
  30  process.env.LOGS_DIR = '/tmp/test-logs';
  31  process.env.DATABASE_PATH = '/tmp/test-enrich.db';
  32  process.env.ENABLE_VISION = 'true';
  33  process.env.ENRICHMENT_CONCURRENCY = '1';
  34  
  35  // ═══════════════════════════════════════════════════════════════
  36  // Mock globalThis.fetch to prevent real network calls from
  37  // discoverContactPagesFromSitemap (which fetches /sitemap.xml)
  38  // ═══════════════════════════════════════════════════════════════
  39  globalThis.fetch = async () => {
  40    // Return a non-ok response so sitemap fallback always returns []
  41    return {
  42      ok: false,
  43      status: 404,
  44      text: async () => '',
  45    };
  46  };
  47  
  48  // ═══════════════════════════════════════════════════════════════
  49  // Shared mutable state (mutated per-test in resetState/beforeEach)
  50  // ═══════════════════════════════════════════════════════════════
  51  
  52  // DB state
  53  let mockSiteRows = [];
  54  let runCalls = [];
  55  let getCalls = [];
  56  let dbClosed = false;
  57  
  58  // Browser state — default implementations stored for reset
  59  let browserCloseCalled = false;
  60  let mockHtml = '<html><body>Contact: info@example.com</body></html>';
  61  let mockScreenshot = Buffer.from('png_data');
  62  
  63  // LLM state
  64  let mockLLMResponse = null;
  65  let llmCallCount = 0;
  66  
  67  // Blocklist state — default: not blocked
  68  let mockBlocklistResult = null;
  69  
  70  // Country lookup state
  71  let mockCountryByCode = code => {
  72    if (code === 'GB') return { code: 'GB', requiresGDPRCheck: true };
  73    if (code === 'AU') return { code: 'AU', requiresGDPRCheck: false };
  74    return null;
  75  };
  76  
  77  // TLD detector state
  78  let mockParseCountryFromGoogleDomain = _domain => 'AU';
  79  
  80  // ═══════════════════════════════════════════════════════════════
  81  // 1. fs — mock readFileSync for the ENRICHMENT_PROMPT load
  82  // ═══════════════════════════════════════════════════════════════
  83  const mockReadFileSync = mock.fn((_path, _enc) => 'MOCK ENRICHMENT PROMPT');
  84  mock.module('fs', {
  85    namedExports: {
  86      ...realFs,
  87      readFileSync: mockReadFileSync,
  88      existsSync: () => false,
  89    },
  90  });
  91  
  92  // ═══════════════════════════════════════════════════════════════
  93  // 2. better-sqlite3 — full MockDatabase
  94  // ═══════════════════════════════════════════════════════════════
  95  
  96  class MockStatement {
  97    constructor(sql) {
  98      this.sql = sql;
  99    }
 100  
 101    all(..._args) {
 102      // Sites query — main fetch for enrichment stage
 103      if (this.sql.includes('FROM sites') && this.sql.includes('enriched_at IS NULL')) {
 104        return mockSiteRows;
 105      }
 106      return [];
 107    }
 108  
 109    get(...args) {
 110      getCalls.push({ sql: this.sql, args });
 111      // google_domain lookup (called during country mismatch check)
 112      if (this.sql.includes('google_domain')) {
 113        const id = args[0];
 114        const site = mockSiteRows.find(s => s.id === id);
 115        if (site) return { google_domain: site.google_domain || 'google.com.au' };
 116        return null;
 117      }
 118      // Stats query (getEnrichmentStats)
 119      if (this.sql.includes('enriched_at IS NOT NULL')) {
 120        return { total_enriched: 5, with_forms: 2, with_emails: 4, with_phones: 1 };
 121      }
 122      return null;
 123    }
 124  
 125    run(...args) {
 126      runCalls.push({ sql: this.sql, args });
 127      return { changes: 1, lastInsertRowid: 1 };
 128    }
 129  }
 130  
 131  class MockDatabase {
 132    constructor(_path) {
 133      dbClosed = false;
 134    }
 135  
 136    prepare(sql) {
 137      return new MockStatement(sql);
 138    }
 139  
 140    pragma() {
 141      return undefined;
 142    }
 143  
 144    exec() {
 145      return undefined;
 146    }
 147  
 148    transaction(fn) {
 149      return (...args) => fn(...args);
 150    }
 151  
 152    close() {
 153      dbClosed = true;
 154    }
 155  }
 156  
 157  mock.module('better-sqlite3', {
 158    defaultExport: MockDatabase,
 159  });
 160  
 161  // ═══════════════════════════════════════════════════════════════
 162  // 2b. db.js mock — enrich.js uses db.js (PostgreSQL) not better-sqlite3
 163  // ═══════════════════════════════════════════════════════════════
 164  mock.module('../../src/utils/db.js', {
 165    namedExports: {
 166      getPool: () => ({}),
 167      getAll: async (sql) => {
 168        // Main site query
 169        if (sql.includes('FROM sites') && sql.includes('enriched_at IS NULL')) {
 170          return mockSiteRows;
 171        }
 172        return [];
 173      },
 174      getOne: async (sql, params) => {
 175        // google_domain lookup
 176        if (sql.includes('google_domain')) {
 177          const id = params?.[0];
 178          const site = mockSiteRows.find(s => s.id === id);
 179          if (site) return { google_domain: site.google_domain || 'google.com.au' };
 180          return null;
 181        }
 182        // Stats query (getEnrichmentStats)
 183        if (sql.includes('enriched_at IS NOT NULL')) {
 184          return { total_enriched: 5, with_forms: 2, with_emails: 4, with_phones: 1 };
 185        }
 186        return null;
 187      },
 188      run: async (sql, params) => {
 189        runCalls.push({ sql: sql.trim(), args: params || [] });
 190        return { changes: 1, lastInsertRowid: 1 };
 191      },
 192      query: async (sql, params) => {
 193        const trimmed = sql.trim();
 194        if (trimmed.includes('google_domain')) {
 195          const id = params?.[0];
 196          const site = mockSiteRows.find(s => s.id === id);
 197          if (site) return { rows: [{ google_domain: site.google_domain || 'google.com.au' }], rowCount: 1 };
 198          return { rows: [], rowCount: 0 };
 199        }
 200        if (trimmed.startsWith('SELECT') || trimmed.startsWith('WITH')) {
 201          return { rows: [], rowCount: 0 };
 202        }
 203        runCalls.push({ sql: trimmed, args: params || [] });
 204        return { rows: [], rowCount: 1 };
 205      },
 206      withTransaction: async (fn) => {
 207        const fakeClient = {
 208          query: async (sql, params) => {
 209            const trimmed = sql.trim();
 210            runCalls.push({ sql: trimmed, args: params || [] });
 211            if (trimmed.startsWith('SELECT') || trimmed.startsWith('WITH')) {
 212              return { rows: [], rowCount: 0 };
 213            }
 214            return { rows: [], rowCount: 1 };
 215          },
 216        };
 217        return await fn(fakeClient);
 218      },
 219      closePool: async () => {},
 220      createDatabaseConnection: () => ({}),
 221      closeDatabaseConnection: async () => {},
 222    },
 223  });
 224  
 225  // ═══════════════════════════════════════════════════════════════
 226  // 3. Logger — silent no-op
 227  // ═══════════════════════════════════════════════════════════════
 228  mock.module('../../src/utils/logger.js', {
 229    defaultExport: class MockLogger {
 230      info() {}
 231      success() {}
 232      error() {}
 233      warn() {}
 234      debug() {}
 235    },
 236  });
 237  
 238  // ═══════════════════════════════════════════════════════════════
 239  // 4. Stealth browser — mock browser/context/page chain
 240  //    Default implementations are assigned in resetState() so
 241  //    individual tests can override them without leaking into others.
 242  // ═══════════════════════════════════════════════════════════════
 243  
 244  // These mock fns are created once; their implementations are reset per-test.
 245  const mockPageGoto = mock.fn(async () => {});
 246  const mockPageContent = mock.fn(async () => mockHtml);
 247  const mockPageScreenshot = mock.fn(async () => mockScreenshot);
 248  const mockPageClose = mock.fn(async () => {});
 249  const mockPageWaitForLoadState = mock.fn(async () => {});
 250  
 251  const mockPage = {
 252    goto: mockPageGoto,
 253    content: mockPageContent,
 254    screenshot: mockPageScreenshot,
 255    close: mockPageClose,
 256    waitForLoadState: mockPageWaitForLoadState,
 257  };
 258  
 259  const mockContextNewPage = mock.fn(async () => mockPage);
 260  const mockContextClose = mock.fn(async () => {});
 261  
 262  const mockContext = {
 263    newPage: mockContextNewPage,
 264    close: mockContextClose,
 265  };
 266  
 267  const mockBrowserClose = mock.fn(async () => {
 268    browserCloseCalled = true;
 269  });
 270  
 271  const mockBrowser = {
 272    close: mockBrowserClose,
 273  };
 274  
 275  const launchStealthBrowserMock = mock.fn(async () => mockBrowser);
 276  const createStealthContextMock = mock.fn(async () => mockContext);
 277  const humanScrollMock = mock.fn(async () => {});
 278  const randomDelayMock = mock.fn(async () => {});
 279  const isSocialMediaUrlMock = mock.fn(() => false);
 280  const waitForCloudflareMock = mock.fn(async () => true);
 281  
 282  mock.module('../../src/utils/stealth-browser.js', {
 283    namedExports: {
 284      launchStealthBrowser: launchStealthBrowserMock,
 285      createStealthContext: createStealthContextMock,
 286      humanScroll: humanScrollMock,
 287      randomDelay: randomDelayMock,
 288      isSocialMediaUrl: isSocialMediaUrlMock,
 289      waitForCloudflare: waitForCloudflareMock,
 290    },
 291  });
 292  
 293  // ═══════════════════════════════════════════════════════════════
 294  // 5. LLM provider
 295  // ═══════════════════════════════════════════════════════════════
 296  const callLLMMock = mock.fn(async () => {
 297    llmCallCount++;
 298    return (
 299      mockLLMResponse || {
 300        content: JSON.stringify({
 301          business_name: 'Test Corp',
 302          email_addresses: [{ email: 'info@testcorp.com', label: 'General' }],
 303          phone_numbers: [],
 304          social_profiles: [],
 305          key_pages: [],
 306          primary_contact_form: null,
 307        }),
 308        usage: { promptTokens: 100, completionTokens: 50 },
 309      }
 310    );
 311  });
 312  
 313  const getProviderMock = mock.fn(() => 'openrouter');
 314  
 315  mock.module('../../src/utils/llm-provider.js', {
 316    namedExports: {
 317      callLLM: callLLMMock,
 318      getProvider: getProviderMock,
 319    },
 320  });
 321  
 322  // ═══════════════════════════════════════════════════════════════
 323  // 6. LLM usage tracker
 324  // ═══════════════════════════════════════════════════════════════
 325  mock.module('../../src/utils/llm-usage-tracker.js', {
 326    namedExports: {
 327      logLLMUsage: mock.fn(() => {}),
 328    },
 329  });
 330  
 331  // ═══════════════════════════════════════════════════════════════
 332  // 7. error-handler — real safeJsonParse logic, pass-through processBatch
 333  // ═══════════════════════════════════════════════════════════════
 334  mock.module('../../src/utils/error-handler.js', {
 335    namedExports: {
 336      processBatch: mock.fn(async (items, processor, _opts) => {
 337        const results = [];
 338        const errors = [];
 339        for (let i = 0; i < items.length; i++) {
 340          try {
 341            const r = await processor(items[i], i);
 342            results.push(r);
 343          } catch (err) {
 344            errors.push(err);
 345            results.push(null);
 346          }
 347        }
 348        return { results, errors };
 349      }),
 350      safeJsonParse: mock.fn((str, fallback = null) => {
 351        try {
 352          return str ? JSON.parse(str) : fallback;
 353        } catch {
 354          return fallback;
 355        }
 356      }),
 357      retryWithBackoff: mock.fn(async fn => fn()),
 358    },
 359  });
 360  
 361  // ═══════════════════════════════════════════════════════════════
 362  // 8. summary-generator — no-op
 363  // ═══════════════════════════════════════════════════════════════
 364  mock.module('../../src/utils/summary-generator.js', {
 365    namedExports: {
 366      generateStageCompletion: mock.fn(() => {}),
 367      displayProgress: mock.fn(() => {}),
 368    },
 369  });
 370  
 371  // ═══════════════════════════════════════════════════════════════
 372  // 9. adaptive-concurrency — return fixed value
 373  // ═══════════════════════════════════════════════════════════════
 374  mock.module('../../src/utils/adaptive-concurrency.js', {
 375    namedExports: {
 376      getAdaptiveConcurrencyFast: mock.fn(() => 1),
 377      getAdaptiveConcurrency: mock.fn(() => 1),
 378      isScreenActive: mock.fn(() => false),
 379    },
 380  });
 381  
 382  // ═══════════════════════════════════════════════════════════════
 383  // 10. site-filters — configurable per-test via mockBlocklistResult
 384  // ═══════════════════════════════════════════════════════════════
 385  const checkBlocklistMock = mock.fn((_domain, _country) => mockBlocklistResult);
 386  
 387  mock.module('../../src/utils/site-filters.js', {
 388    namedExports: {
 389      checkBlocklist: checkBlocklistMock,
 390      DIRECTORY_DOMAINS: [],
 391      SOCIAL_MEDIA_DOMAINS: [],
 392      DEMO_EMAIL_DOMAINS: [],
 393      loadFranchiseDomains: mock.fn(() => []),
 394      isGovernmentDomain: mock.fn(() => false),
 395      isEducationDomain: mock.fn(() => false),
 396      isNonCommercialDomain: mock.fn(() => false),
 397      isDemoEmail: mock.fn(() => false),
 398      isGovernmentEmail: mock.fn(() => false),
 399    },
 400  });
 401  
 402  // ═══════════════════════════════════════════════════════════════
 403  // 11. gdpr-verification
 404  // ═══════════════════════════════════════════════════════════════
 405  const batchVerifyEmailsMock = mock.fn(() => [
 406    { isVerified: true, confidence: 'high', reason: 'Company domain' },
 407  ]);
 408  
 409  mock.module('../../src/utils/gdpr-verification.js', {
 410    namedExports: {
 411      verifyCompanyEmail: mock.fn(() => ({
 412        isVerified: true,
 413        confidence: 'high',
 414        reason: 'Company domain',
 415      })),
 416      batchVerifyEmails: batchVerifyEmailsMock,
 417      isFreeEmailProvider: mock.fn(() => false),
 418      searchCompanyTypes: mock.fn(() => []),
 419      searchCompanyKeywords: mock.fn(() => []),
 420      getKeyPageNames: mock.fn(() => []),
 421    },
 422  });
 423  
 424  // ═══════════════════════════════════════════════════════════════
 425  // 12. countries.js — configurable via mockCountryByCode
 426  // ═══════════════════════════════════════════════════════════════
 427  const getCountryByCodeMock = mock.fn(code => mockCountryByCode(code));
 428  
 429  mock.module('../../src/config/countries.js', {
 430    namedExports: {
 431      getCountryByCode: getCountryByCodeMock,
 432      getCountryByGoogleDomain: mock.fn(() => null),
 433      normaliseCountryCode: mock.fn(code => code),
 434      COUNTRIES: {},
 435      FREE_EMAIL_PROVIDERS: [],
 436      isFreeEmailProvider: mock.fn(() => false),
 437      getSupportedCountries: mock.fn(() => []),
 438      getGDPRCountries: mock.fn(() => []),
 439      isMobileNumber: mock.fn(() => false),
 440    },
 441  });
 442  
 443  // ═══════════════════════════════════════════════════════════════
 444  // 13. retry-handler
 445  // ═══════════════════════════════════════════════════════════════
 446  const recordFailureMock = mock.fn(() => {});
 447  const resetRetriesMock = mock.fn(() => {});
 448  
 449  mock.module('../../src/utils/retry-handler.js', {
 450    namedExports: {
 451      recordFailure: recordFailureMock,
 452      resetRetries: resetRetriesMock,
 453      getRetryStats: mock.fn(() => ({})),
 454    },
 455  });
 456  
 457  // ═══════════════════════════════════════════════════════════════
 458  // 14. tld-detector — configurable via mockParseCountryFromGoogleDomain
 459  // ═══════════════════════════════════════════════════════════════
 460  const parseCountryFromGoogleDomainMock = mock.fn(domain =>
 461    mockParseCountryFromGoogleDomain(domain)
 462  );
 463  
 464  mock.module('../../src/utils/tld-detector.js', {
 465    namedExports: {
 466      parseCountryFromGoogleDomain: parseCountryFromGoogleDomainMock,
 467      detectCountryFromTLD: mock.fn(() => null),
 468    },
 469  });
 470  
 471  // ═══════════════════════════════════════════════════════════════
 472  // 15. phone-normalizer — identity transform
 473  // ═══════════════════════════════════════════════════════════════
 474  const normalizePhoneNumberMock = mock.fn(p => p);
 475  
 476  mock.module('../../src/utils/phone-normalizer.js', {
 477    namedExports: {
 478      normalizePhoneNumber: normalizePhoneNumberMock,
 479      normalizePhoneNumbers: mock.fn(ps => ps),
 480      addCountryCode: mock.fn(p => p),
 481      isFakeNumber: mock.fn(() => false),
 482      isValidSmsNumber: mock.fn(() => ({ valid: true })),
 483    },
 484  });
 485  
 486  // ═══════════════════════════════════════════════════════════════
 487  // 16. contacts/prioritize.js
 488  // ═══════════════════════════════════════════════════════════════
 489  const cleanInvalidSocialLinksMock = mock.fn(contacts => contacts || {});
 490  
 491  mock.module('../../src/contacts/prioritize.js', {
 492    namedExports: {
 493      cleanInvalidSocialLinks: cleanInvalidSocialLinksMock,
 494      getAllContacts: mock.fn(() => []),
 495      parseAvailableChannels: mock.fn(() => []),
 496      prioritizeContacts: mock.fn(() => []),
 497      updateOutreachContacts: mock.fn(() => {}),
 498      bulkUpdateOutreachContacts: mock.fn(() => {}),
 499      getOutreachReadinessReport: mock.fn(() => ({})),
 500      getAllContactsWithNames: mock.fn(async () => []),
 501    },
 502    defaultExport: {},
 503  });
 504  
 505  // ═══════════════════════════════════════════════════════════════
 506  // 17. contacts-storage — use site row fallback, avoid real filesystem
 507  // ═══════════════════════════════════════════════════════════════
 508  const setContactsJsonMock = mock.fn(() => {});
 509  
 510  mock.module('../../src/utils/contacts-storage.js', {
 511    namedExports: {
 512      getContactsJson: mock.fn(() => null),
 513      getContactsData: mock.fn(() => null),
 514      setContactsJson: setContactsJsonMock,
 515      deleteContactsJson: mock.fn(() => false),
 516      hasContactsJson: mock.fn(() => false),
 517      getContactsJsonWithFallback: mock.fn((siteId, dbRow) => dbRow?.contacts_json || null),
 518      getContactsDataWithFallback: mock.fn((siteId, dbRow) => {
 519        const raw = dbRow?.contacts_json;
 520        if (!raw) return null;
 521        try { return JSON.parse(raw); } catch { return null; }
 522      }),
 523      DATA_DIR: '/tmp/test-contacts',
 524    },
 525  });
 526  
 527  // ═══════════════════════════════════════════════════════════════
 528  // 18. html-storage — return html_dom from mockSiteRows
 529  // ═══════════════════════════════════════════════════════════════
 530  const readHtmlDomMock = mock.fn(siteId => {
 531    const site = mockSiteRows.find(s => s.id === siteId);
 532    return site?.html_dom || null;
 533  });
 534  
 535  mock.module('../../src/utils/html-storage.js', {
 536    namedExports: {
 537      readHtmlDom: readHtmlDomMock,
 538      writeKeyPagesHtml: mock.fn(() => {}),
 539      readKeyPagesHtml: mock.fn(() => null),
 540      deleteHtmlDom: mock.fn(() => {}),
 541      deleteKeyPagesHtml: mock.fn(() => {}),
 542    },
 543  });
 544  
 545  // ═══════════════════════════════════════════════════════════════
 546  // 19. score-storage — use site row fallback
 547  // ═══════════════════════════════════════════════════════════════
 548  mock.module('../../src/utils/score-storage.js', {
 549    namedExports: {
 550      getScoreJson: mock.fn(() => null),
 551      getScoreData: mock.fn(() => null),
 552      setScoreJson: mock.fn(() => {}),
 553      getScoreJsonWithFallback: mock.fn((siteId, dbRow) => dbRow?.score_json || null),
 554      getScoreDataWithFallback: mock.fn((siteId, dbRow) => {
 555        const raw = dbRow?.score_json;
 556        if (!raw) return null;
 557        try { return JSON.parse(raw); } catch { return null; }
 558      }),
 559    },
 560  });
 561  
 562  // ═══════════════════════════════════════════════════════════════
 563  // Import module under test AFTER all mocks
 564  // ═══════════════════════════════════════════════════════════════
 565  const { runEnrichmentStage, getEnrichmentStats } = await import('../../src/stages/enrich.js');
 566  
 567  // ═══════════════════════════════════════════════════════════════
 568  // Helper factories
 569  // ═══════════════════════════════════════════════════════════════
 570  
 571  /**
 572   * Build a minimal site record matching the SQL query columns.
 573   */
 574  function makeSite(overrides = {}) {
 575    return {
 576      id: 1,
 577      domain: 'example.com',
 578      url: 'https://example.com',
 579      contacts_json: null,
 580      html_dom: '<html><body>Test</body></html>',
 581      score_json: null,
 582      country_code: 'AU',
 583      google_domain: 'google.com.au',
 584      ...overrides,
 585    };
 586  }
 587  
 588  /**
 589   * Build a contacts_json string without a contact form.
 590   */
 591  function makeContactsJson(extra = {}) {
 592    return JSON.stringify({
 593      business_name: 'Test Co',
 594      email_addresses: [],
 595      phone_numbers: [],
 596      social_profiles: [],
 597      key_pages: [],
 598      ...extra,
 599    });
 600  }
 601  
 602  /**
 603   * Build a contacts_json string WITH a contact form.
 604   */
 605  function makeContactsWithForm(extra = {}) {
 606    return JSON.stringify({
 607      business_name: 'Test Co',
 608      email_addresses: [],
 609      phone_numbers: [],
 610      social_profiles: [],
 611      key_pages: [],
 612      primary_contact_form: { form_url: 'https://example.com/contact' },
 613      ...extra,
 614    });
 615  }
 616  
 617  /**
 618   * Reset all per-test mutable state.
 619   * CRITICAL: also resets mock.fn implementations that individual tests may override.
 620   */
 621  function resetState() {
 622    // State variables
 623    mockSiteRows = [];
 624    runCalls = [];
 625    getCalls = [];
 626    dbClosed = false;
 627    browserCloseCalled = false;
 628    llmCallCount = 0;
 629    mockHtml = '<html><body>Contact us</body></html>';
 630    mockScreenshot = Buffer.from('png_data');
 631    mockLLMResponse = null;
 632    mockBlocklistResult = null;
 633  
 634    // Reset configurable functions
 635    mockCountryByCode = code => {
 636      if (code === 'GB') return { code: 'GB', requiresGDPRCheck: true };
 637      if (code === 'AU') return { code: 'AU', requiresGDPRCheck: false };
 638      return null;
 639    };
 640    mockParseCountryFromGoogleDomain = _domain => 'AU';
 641  
 642    // Reset mock.fn call counts
 643    mockPageGoto.mock.resetCalls();
 644    mockPageContent.mock.resetCalls();
 645    mockPageScreenshot.mock.resetCalls();
 646    mockPageClose.mock.resetCalls();
 647    mockPageWaitForLoadState.mock.resetCalls();
 648    mockContextNewPage.mock.resetCalls();
 649    mockContextClose.mock.resetCalls();
 650    mockBrowserClose.mock.resetCalls();
 651    callLLMMock.mock.resetCalls();
 652    recordFailureMock.mock.resetCalls();
 653    resetRetriesMock.mock.resetCalls();
 654    checkBlocklistMock.mock.resetCalls();
 655    batchVerifyEmailsMock.mock.resetCalls();
 656    getCountryByCodeMock.mock.resetCalls();
 657    parseCountryFromGoogleDomainMock.mock.resetCalls();
 658    normalizePhoneNumberMock.mock.resetCalls();
 659    cleanInvalidSocialLinksMock.mock.resetCalls();
 660    launchStealthBrowserMock.mock.resetCalls();
 661    readHtmlDomMock.mock.resetCalls();
 662    setContactsJsonMock.mock.resetCalls();
 663  
 664    // IMPORTANT: Reset mock implementations to defaults (prevent test leakage)
 665    mockPageGoto.mock.mockImplementation(async () => {});
 666    callLLMMock.mock.mockImplementation(async () => {
 667      llmCallCount++;
 668      return (
 669        mockLLMResponse || {
 670          content: JSON.stringify({
 671            business_name: 'Test Corp',
 672            email_addresses: [{ email: 'info@testcorp.com', label: 'General' }],
 673            phone_numbers: [],
 674            social_profiles: [],
 675            key_pages: [],
 676            primary_contact_form: null,
 677          }),
 678          usage: { promptTokens: 100, completionTokens: 50 },
 679        }
 680      );
 681    });
 682    getCountryByCodeMock.mock.mockImplementation(code => mockCountryByCode(code));
 683    parseCountryFromGoogleDomainMock.mock.mockImplementation(domain =>
 684      mockParseCountryFromGoogleDomain(domain)
 685    );
 686    mockPageContent.mock.mockImplementation(async () => mockHtml);
 687    mockPageScreenshot.mock.mockImplementation(async () => mockScreenshot);
 688  }
 689  
 690  // ═══════════════════════════════════════════════════════════════
 691  // Test Suites
 692  // ═══════════════════════════════════════════════════════════════
 693  
 694  describe('Enrichment Stage', () => {
 695    // ─────────────────────────────────────────────────────────────
 696    // Suite: No sites to enrich
 697    // ─────────────────────────────────────────────────────────────
 698  
 699    describe('No sites to enrich', () => {
 700      beforeEach(() => {
 701        resetState();
 702      });
 703  
 704      test('returns zero counts when no sites are in rescored status', async () => {
 705        mockSiteRows = [];
 706        const result = await runEnrichmentStage();
 707  
 708        assert.equal(result.processed, 0, 'processed should be 0');
 709        assert.equal(result.succeeded, 0, 'succeeded should be 0');
 710        assert.equal(result.failed, 0, 'failed should be 0');
 711        assert.equal(result.skipped, 0, 'skipped should be 0');
 712        assert.ok(typeof result.duration === 'number', 'duration should be a number');
 713      });
 714  
 715      test('closes database even when no sites found', { skip: 'SQLite-era test: enrich.js now uses db.js PG pool (singleton, not closed per-run)' }, async () => {
 716        mockSiteRows = [];
 717        await runEnrichmentStage();
 718        assert.ok(dbClosed, 'database should be closed');
 719      });
 720  
 721      test('does not launch browser when no sites found', async () => {
 722        mockSiteRows = [];
 723        await runEnrichmentStage();
 724        assert.equal(launchStealthBrowserMock.mock.calls.length, 0, 'browser should not be launched');
 725      });
 726    });
 727  
 728    // ─────────────────────────────────────────────────────────────
 729    // Suite: Sites already have contact forms — skip browser enrichment
 730    // ─────────────────────────────────────────────────────────────
 731  
 732    describe('Sites with existing contact forms', () => {
 733      beforeEach(() => {
 734        resetState();
 735      });
 736  
 737      test('marks non-GDPR site with form directly as enriched without browser', async () => {
 738        mockSiteRows = [
 739          makeSite({
 740            id: 10,
 741            domain: 'au-site.com',
 742            country_code: 'AU',
 743            contacts_json: makeContactsWithForm(),
 744          }),
 745        ];
 746  
 747        const result = await runEnrichmentStage();
 748  
 749        assert.equal(result.succeeded, 1, 'should succeed for 1 site');
 750        assert.equal(result.failed, 0, 'should have 0 failures');
 751  
 752        // Should have set status=enriched
 753        const enrichUpdate = runCalls.find(c => c.sql.includes("status = 'enriched_regex'"));
 754        assert.ok(enrichUpdate, 'should UPDATE status to enriched');
 755  
 756        // Page.goto should not be called (no contact page browsing for form sites)
 757        assert.equal(
 758          mockPageGoto.mock.calls.length,
 759          0,
 760          'page.goto should not be called for form sites with no contact pages'
 761        );
 762      });
 763  
 764      test('returns all sites as succeeded when all have forms', async () => {
 765        mockSiteRows = [
 766          makeSite({ id: 11, contacts_json: makeContactsWithForm() }),
 767          makeSite({ id: 12, domain: 'site2.com', contacts_json: makeContactsWithForm() }),
 768        ];
 769  
 770        const result = await runEnrichmentStage();
 771  
 772        assert.equal(result.processed, 2, 'processed should be 2');
 773        assert.equal(result.succeeded, 2, 'succeeded should be 2');
 774        assert.equal(result.skipped, 0, 'skipped should be 0');
 775      });
 776  
 777      test('resets retries on form site marked enriched', async () => {
 778        mockSiteRows = [makeSite({ id: 20, contacts_json: makeContactsWithForm() })];
 779  
 780        await runEnrichmentStage();
 781  
 782        assert.ok(
 783          resetRetriesMock.mock.calls.length >= 1,
 784          'resetRetries should be called for form sites'
 785        );
 786      });
 787  
 788      test('runs GDPR verification for GDPR country with form + emails', async () => {
 789        mockSiteRows = [
 790          makeSite({
 791            id: 30,
 792            domain: 'uk-site.co.uk',
 793            country_code: 'GB',
 794            contacts_json: JSON.stringify({
 795              business_name: 'UK Ltd',
 796              email_addresses: [{ email: 'info@uk-site.co.uk', label: 'Office' }],
 797              phone_numbers: [],
 798              social_profiles: [],
 799              key_pages: [],
 800              primary_contact_form: { form_url: 'https://uk-site.co.uk/contact' },
 801              country_code: 'GB',
 802            }),
 803            html_dom: '<html><body>UK Ltd info@uk-site.co.uk</body></html>',
 804          }),
 805        ];
 806  
 807        await runEnrichmentStage();
 808  
 809        // Form-site path marks enriched_regex (GDPR verification happens in browser enrichment path only)
 810        const enrichUpdate = runCalls.find(c => c.sql.includes("status = 'enriched_regex'"));
 811        assert.ok(enrichUpdate, 'should UPDATE status to enriched_regex');
 812      });
 813  
 814      test('skips GDPR check and marks enriched for non-GDPR country with form', async () => {
 815        mockSiteRows = [
 816          makeSite({
 817            id: 31,
 818            domain: 'au-site.com.au',
 819            country_code: 'AU',
 820            contacts_json: makeContactsWithForm(),
 821          }),
 822        ];
 823  
 824        await runEnrichmentStage();
 825  
 826        const enrichUpdate = runCalls.find(c => c.sql.includes("status = 'enriched_regex'"));
 827        assert.ok(enrichUpdate, 'AU site with form should be marked enriched without GDPR check');
 828      });
 829    });
 830  
 831    // ─────────────────────────────────────────────────────────────
 832    // Suite: Blocklisted sites
 833    // ─────────────────────────────────────────────────────────────
 834  
 835    describe('Blocklisted sites', () => {
 836      beforeEach(() => {
 837        resetState();
 838      });
 839  
 840      test('marks blocklisted site as ignored and skips enrichment', async () => {
 841        mockBlocklistResult = { reason: 'Directory domain' };
 842        mockSiteRows = [makeSite({ id: 50, domain: 'yellowpages.com' })];
 843  
 844        await runEnrichmentStage();
 845  
 846        const ignoreUpdate = runCalls.find(c => c.sql.includes("status = 'ignored'"));
 847        assert.ok(ignoreUpdate, "should UPDATE status to 'ignore' for blocklisted site");
 848      });
 849  
 850      test('does not run browser enrichment for blocklisted sites', async () => {
 851        mockBlocklistResult = { reason: 'Social media domain' };
 852        mockSiteRows = [makeSite({ id: 51, domain: 'facebook.com' })];
 853  
 854        await runEnrichmentStage();
 855  
 856        // All sites blocked — filtered before sitesNeedingEnrichment
 857        assert.equal(mockPageGoto.mock.calls.length, 0, 'should not browse any pages');
 858      });
 859    });
 860  
 861    // ─────────────────────────────────────────────────────────────
 862    // Suite: Happy path — no existing contacts, no contact pages
 863    // ─────────────────────────────────────────────────────────────
 864  
 865    describe('Happy path — no contact pages found', () => {
 866      // HTML with an email so regex extraction finds a contact (hasAnyContact=true → enriched)
 867      // Must not use example.com/acme.com — these may be in EMAIL_NOISE_DOMAINS
 868      const htmlWithEmail =
 869        '<html><body><p>Contact us at <a href="mailto:info@plumbersydney.com.au">info@plumbersydney.com.au</a></p></body></html>';
 870  
 871      beforeEach(() => {
 872        resetState();
 873        mockHtml = htmlWithEmail;
 874      });
 875  
 876      test('enriches a site with no prior contacts, no contact pages', async () => {
 877        mockSiteRows = [makeSite({ id: 100, contacts_json: null, html_dom: htmlWithEmail })];
 878  
 879        const result = await runEnrichmentStage();
 880  
 881        assert.equal(result.processed, 1, 'processed should be 1');
 882        assert.equal(result.succeeded, 1, 'succeeded should be 1');
 883        assert.equal(result.failed, 0, 'failed should be 0');
 884      });
 885  
 886      test('updates site to enriched status', async () => {
 887        mockSiteRows = [makeSite({ id: 101, contacts_json: null, html_dom: htmlWithEmail })];
 888  
 889        await runEnrichmentStage();
 890  
 891        const enrichUpdate = runCalls.find(
 892          c => c.sql.includes("status = 'enriched_regex'") && c.sql.includes('enriched_at')
 893        );
 894        assert.ok(enrichUpdate, 'should UPDATE site to enriched status');
 895      });
 896  
 897      test('resets retries after successful enrichment', async () => {
 898        mockSiteRows = [makeSite({ id: 102, contacts_json: null, html_dom: htmlWithEmail })];
 899        await runEnrichmentStage();
 900        assert.ok(resetRetriesMock.mock.calls.length >= 1, 'resetRetries should be called');
 901      });
 902  
 903      test('closes database after successful run', { skip: 'SQLite-era test: enrich.js now uses db.js PG pool (singleton, not closed per-run)' }, async () => {
 904        mockSiteRows = [makeSite({ id: 103, contacts_json: null, html_dom: htmlWithEmail })];
 905        await runEnrichmentStage();
 906        assert.ok(dbClosed, 'database should be closed');
 907      });
 908  
 909      test('closes browser after successful run', async () => {
 910        mockSiteRows = [makeSite({ id: 104, contacts_json: null, html_dom: htmlWithEmail })];
 911        await runEnrichmentStage();
 912        assert.ok(mockBrowserClose.mock.calls.length >= 1, 'browser should be closed');
 913      });
 914  
 915      test('returns correct stats shape', async () => {
 916        mockSiteRows = [makeSite({ id: 105, contacts_json: null, html_dom: htmlWithEmail })];
 917        const result = await runEnrichmentStage();
 918  
 919        assert.ok('processed' in result, 'result should have processed');
 920        assert.ok('succeeded' in result, 'result should have succeeded');
 921        assert.ok('failed' in result, 'result should have failed');
 922        assert.ok('skipped' in result, 'result should have skipped');
 923        assert.ok('duration' in result, 'result should have duration');
 924      });
 925  
 926      test('handles null html_dom gracefully', async () => {
 927        mockSiteRows = [makeSite({ id: 106, contacts_json: null, html_dom: null })];
 928  
 929        const result = await runEnrichmentStage();
 930        assert.equal(result.processed, 1, 'should still process site with null html_dom');
 931      });
 932    });
 933  
 934    // ─────────────────────────────────────────────────────────────
 935    // Suite: Happy path — contact pages found, browser enrichment
 936    // ─────────────────────────────────────────────────────────────
 937  
 938    describe('Happy path — contact pages found, browser enrichment', () => {
 939      beforeEach(() => {
 940        resetState();
 941      });
 942  
 943      test('launches browser and browses contact pages', async () => {
 944        mockSiteRows = [
 945          makeSite({
 946            id: 200,
 947            contacts_json: makeContactsJson({
 948              key_pages: ['https://example.com/contact'],
 949            }),
 950          }),
 951        ];
 952  
 953        mockLLMResponse = {
 954          content: JSON.stringify({
 955            business_name: 'Example Corp',
 956            email_addresses: [{ email: 'hello@example.com', label: 'Office' }],
 957            phone_numbers: [],
 958            social_profiles: [],
 959            key_pages: [],
 960          }),
 961          usage: { promptTokens: 300, completionTokens: 100 },
 962        };
 963  
 964        const result = await runEnrichmentStage();
 965  
 966        assert.ok(mockPageGoto.mock.calls.length >= 1, 'should navigate to contact page');
 967        assert.equal(result.succeeded, 1, 'should succeed');
 968      });
 969  
 970      test('finds a contact form and sets formFound=true in stats', async () => {
 971        // Page HTML with <form> so regex extractor detects has_contact_form=true
 972        mockHtml =
 973          '<html><body><form action="/contact" method="post"><input name="email"/><button>Submit</button></form></body></html>';
 974  
 975        mockSiteRows = [
 976          makeSite({
 977            id: 201,
 978            contacts_json: makeContactsJson({
 979              key_pages: ['https://example.com/contact-us'],
 980            }),
 981          }),
 982        ];
 983  
 984        const result = await runEnrichmentStage();
 985  
 986        assert.equal(result.formsFound, 1, 'should count form as found');
 987      });
 988  
 989      test('counts emails found from contact page regex result', async () => {
 990        // Page HTML contains 2 emails that regex extractor will find
 991        // (must not use example.com — it is in EMAIL_NOISE_DOMAINS)
 992        mockHtml =
 993          '<html><body>' +
 994          '<a href="mailto:sales@testcorp.com.au">Sales</a> ' +
 995          '<a href="mailto:support@testcorp.com.au">Support</a>' +
 996          '</body></html>';
 997  
 998        mockSiteRows = [
 999          makeSite({
1000            id: 202,
1001            contacts_json: makeContactsJson({
1002              key_pages: ['https://example.com/about'],
1003            }),
1004          }),
1005        ];
1006  
1007        const result = await runEnrichmentStage();
1008  
1009        assert.ok(
1010          result.emailsFound >= 2,
1011          `should count ≥2 emails from contact page, got ${result.emailsFound}`
1012        );
1013      });
1014  
1015      test('stops browsing after finding a form (break on foundForm)', async () => {
1016        // Page HTML must contain <form> so extractContactsFromHtml detects has_contact_form=true
1017        mockHtml =
1018          '<html><body><form action="/contact" method="post"><input name="email"/></form></body></html>';
1019  
1020        mockSiteRows = [
1021          makeSite({
1022            id: 203,
1023            contacts_json: makeContactsJson({
1024              key_pages: [
1025                'https://example.com/contact',
1026                'https://example.com/about',
1027                'https://example.com/support',
1028              ],
1029            }),
1030          }),
1031        ];
1032  
1033        await runEnrichmentStage();
1034  
1035        // Should only visit 1 page (stopped after finding form on first page)
1036        assert.equal(
1037          mockPageGoto.mock.calls.length,
1038          1,
1039          'should stop browsing after finding form on first page'
1040        );
1041      });
1042  
1043      test('closes context and page after each contact page', async () => {
1044        mockSiteRows = [
1045          makeSite({
1046            id: 204,
1047            contacts_json: makeContactsJson({
1048              key_pages: ['https://example.com/contact'],
1049            }),
1050          }),
1051        ];
1052  
1053        mockLLMResponse = {
1054          content: JSON.stringify({
1055            business_name: 'Close Test Corp',
1056            email_addresses: [],
1057            phone_numbers: [],
1058            social_profiles: [],
1059            key_pages: [],
1060          }),
1061          usage: { promptTokens: 100, completionTokens: 40 },
1062        };
1063  
1064        await runEnrichmentStage();
1065  
1066        assert.ok(mockPageClose.mock.calls.length >= 1, 'page should be closed after browsing');
1067        assert.ok(mockContextClose.mock.calls.length >= 1, 'context should be closed after browsing');
1068      });
1069  
1070      test('reads page HTML during browser enrichment (regex extraction path)', async () => {
1071        mockSiteRows = [
1072          makeSite({
1073            id: 205,
1074            contacts_json: makeContactsJson({
1075              key_pages: ['https://example.com/contact'],
1076            }),
1077          }),
1078        ];
1079  
1080        await runEnrichmentStage();
1081  
1082        assert.ok(mockPageContent.mock.calls.length >= 1, 'should call page.content()');
1083        assert.ok(mockPageGoto.mock.calls.length >= 1, 'should navigate to contact page');
1084      });
1085  
1086      test('marks site enriched after browsing contact pages', async () => {
1087        mockSiteRows = [
1088          makeSite({
1089            id: 206,
1090            contacts_json: makeContactsJson({
1091              key_pages: ['https://example.com/contact'],
1092            }),
1093          }),
1094        ];
1095  
1096        mockLLMResponse = {
1097          content: JSON.stringify({
1098            business_name: 'Enriched Corp',
1099            email_addresses: [],
1100            phone_numbers: [],
1101            social_profiles: [],
1102            key_pages: [],
1103          }),
1104          usage: { promptTokens: 100, completionTokens: 40 },
1105        };
1106  
1107        await runEnrichmentStage();
1108  
1109        const enrichUpdate = runCalls.find(
1110          c => c.sql.includes("status = 'enriched_regex'")
1111        );
1112        assert.ok(enrichUpdate, 'should UPDATE site to enriched_regex status');
1113      });
1114    });
1115  
1116    // ─────────────────────────────────────────────────────────────
1117    // Suite: Contacts from score_json
1118    // ─────────────────────────────────────────────────────────────
1119  
1120    describe('Contacts from score_json (fallback path)', () => {
1121      beforeEach(() => {
1122        resetState();
1123      });
1124  
1125      test('uses contact_details from score_json when contacts_json is null', async () => {
1126        mockSiteRows = [
1127          makeSite({
1128            id: 300,
1129            contacts_json: null,
1130            score_json: JSON.stringify({
1131              contact_details: {
1132                business_name: 'Score Corp',
1133                email_addresses: [{ email: 'score@score.com', label: 'Main' }],
1134                phone_numbers: [],
1135                social_profiles: [],
1136                key_pages: [],
1137              },
1138            }),
1139          }),
1140        ];
1141  
1142        const result = await runEnrichmentStage();
1143  
1144        // No LLM call needed (score_json has contacts, no key_pages)
1145        assert.equal(result.processed, 1, 'should process the site');
1146        assert.equal(result.succeeded, 1, 'should succeed');
1147      });
1148  
1149      test('falls back to regex extraction if score_json has no contact_details', async () => {
1150        mockSiteRows = [
1151          makeSite({
1152            id: 301,
1153            contacts_json: null,
1154            score_json: JSON.stringify({ overall_score: 65 }), // no contact_details
1155            html_dom: '<html><body><p>Contact: fallback@fallback.com</p></body></html>',
1156          }),
1157        ];
1158  
1159        const result = await runEnrichmentStage();
1160  
1161        // Code uses regex extraction (ENABLE_ENRICHMENT_LLM !== 'false' → regex-only path)
1162        assert.equal(result.processed, 1, 'should process the site');
1163        assert.equal(result.succeeded, 1, 'should succeed');
1164      });
1165  
1166      test('handles invalid score_json gracefully', async () => {
1167        mockSiteRows = [
1168          makeSite({
1169            id: 302,
1170            contacts_json: null,
1171            score_json: 'NOT VALID JSON{{{',
1172          }),
1173        ];
1174  
1175        mockLLMResponse = {
1176          content: JSON.stringify({
1177            business_name: 'Invalid JSON Corp',
1178            email_addresses: [],
1179            phone_numbers: [],
1180            social_profiles: [],
1181            key_pages: [],
1182          }),
1183          usage: { promptTokens: 100, completionTokens: 30 },
1184        };
1185  
1186        // Should not throw — falls back to extractInitialContacts
1187        const result = await runEnrichmentStage();
1188        assert.equal(result.processed, 1, 'should process even with invalid score_json');
1189      });
1190  
1191      test('cleans invalid social links from score_json contacts', async () => {
1192        mockSiteRows = [
1193          makeSite({
1194            id: 303,
1195            contacts_json: null,
1196            score_json: JSON.stringify({
1197              contact_details: {
1198                business_name: 'Social Corp',
1199                email_addresses: [],
1200                phone_numbers: [],
1201                social_profiles: [{ url: 'https://twitter.com/', label: 'Twitter' }],
1202                key_pages: [],
1203              },
1204            }),
1205          }),
1206        ];
1207  
1208        await runEnrichmentStage();
1209  
1210        // cleanInvalidSocialLinks should have been called
1211        assert.ok(
1212          cleanInvalidSocialLinksMock.mock.calls.length >= 1,
1213          'cleanInvalidSocialLinks should be called'
1214        );
1215      });
1216    });
1217  
1218    // ─────────────────────────────────────────────────────────────
1219    // Suite: Error handling
1220    // ─────────────────────────────────────────────────────────────
1221  
1222    describe('Error handling', () => {
1223      beforeEach(() => {
1224        resetState();
1225      });
1226  
1227      test('records failure when browser page navigation throws', async () => {
1228        mockSiteRows = [
1229          makeSite({
1230            id: 400,
1231            contacts_json: makeContactsJson({
1232              key_pages: ['https://example.com/contact'],
1233            }),
1234          }),
1235        ];
1236  
1237        // Override goto to throw
1238        mockPageGoto.mock.mockImplementation(async () => {
1239          throw new Error('Navigation timeout');
1240        });
1241  
1242        const result = await runEnrichmentStage();
1243  
1244        assert.equal(result.failed, 1, 'should count failure');
1245        assert.ok(recordFailureMock.mock.calls.length >= 1, 'should call recordFailure');
1246      });
1247  
1248      test('records failure when page navigation throws for contact page enrichment', async () => {
1249        mockSiteRows = [
1250          makeSite({
1251            id: 401,
1252            contacts_json: makeContactsJson({
1253              key_pages: ['https://example.com/contact'],
1254            }),
1255          }),
1256        ];
1257  
1258        mockPageGoto.mock.mockImplementation(async () => {
1259          throw new Error('Navigation error');
1260        });
1261  
1262        const result = await runEnrichmentStage();
1263  
1264        assert.equal(result.failed, 1, 'should count failure');
1265        assert.ok(recordFailureMock.mock.calls.length >= 1, 'recordFailure should be called');
1266      });
1267  
1268      test('processes second site when first fails during navigation', async () => {
1269        // Site1: has contact pages → page.goto throws → fails
1270        // Site2: has an email in contacts_json (no contact pages) → succeeds
1271        let navCallCount = 0;
1272        mockPageGoto.mock.mockImplementation(async () => {
1273          navCallCount++;
1274          if (navCallCount === 1) throw new Error('Navigation failed for site1');
1275        });
1276  
1277        mockSiteRows = [
1278          makeSite({
1279            id: 410,
1280            domain: 'site1.com',
1281            url: 'https://site1.com',
1282            contacts_json: makeContactsJson({ key_pages: ['https://site1.com/contact'] }),
1283          }),
1284          makeSite({
1285            id: 411,
1286            domain: 'site2.com',
1287            url: 'https://site2.com',
1288            contacts_json: JSON.stringify({
1289              business_name: 'Site 2',
1290              email_addresses: [{ email: 'info@site2.com', label: 'Main' }],
1291              phone_numbers: [],
1292              social_profiles: [],
1293              key_pages: [], // no contact pages → no browsing → direct enriched
1294            }),
1295          }),
1296        ];
1297  
1298        const result = await runEnrichmentStage();
1299  
1300        assert.equal(result.processed, 2, 'should process all sites');
1301        assert.equal(result.failed, 1, 'should count 1 failure (site1)');
1302        assert.equal(result.succeeded, 1, 'should count 1 success (site2)');
1303      });
1304  
1305      test('closes browser even when enrichment fails', async () => {
1306        mockSiteRows = [
1307          makeSite({
1308            id: 420,
1309            contacts_json: makeContactsJson({
1310              key_pages: ['https://example.com/contact'],
1311            }),
1312          }),
1313        ];
1314  
1315        mockPageGoto.mock.mockImplementation(async () => {
1316          throw new Error('Navigation timeout');
1317        });
1318  
1319        await runEnrichmentStage();
1320  
1321        assert.ok(
1322          mockBrowserClose.mock.calls.length >= 1,
1323          'browser should be closed even on failure'
1324        );
1325      });
1326  
1327      test('closes database even when enrichment fails', { skip: 'SQLite-era test: enrich.js now uses db.js PG pool (singleton, not closed per-run)' }, async () => {
1328        mockSiteRows = [
1329          makeSite({
1330            id: 421,
1331            contacts_json: makeContactsJson({
1332              key_pages: ['https://example.com/contact'],
1333            }),
1334          }),
1335        ];
1336  
1337        mockPageGoto.mock.mockImplementation(async () => {
1338          throw new Error('Navigation timeout');
1339        });
1340  
1341        await runEnrichmentStage();
1342  
1343        assert.ok(dbClosed, 'database should be closed even on failure');
1344      });
1345  
1346      test('recordFailure called with semantic_scored or vision_scored as currentStatus', async () => {
1347        mockSiteRows = [
1348          makeSite({
1349            id: 430,
1350            contacts_json: makeContactsJson({ key_pages: ['https://example.com/contact'] }),
1351          }),
1352        ];
1353  
1354        mockPageGoto.mock.mockImplementation(async () => {
1355          throw new Error('Fatal navigation error');
1356        });
1357  
1358        await runEnrichmentStage();
1359  
1360        assert.ok(recordFailureMock.mock.calls.length >= 1, 'recordFailure should be called');
1361        // The third argument to recordFailure should be 'semantic_scored' or 'vision_scored'
1362        const call = recordFailureMock.mock.calls[0];
1363        assert.ok(
1364          call.arguments.includes('semantic_scored') || call.arguments.includes('vision_scored'),
1365          'recordFailure should pass semantic_scored or vision_scored as currentStatus'
1366        );
1367      });
1368    });
1369  
1370    // ─────────────────────────────────────────────────────────────
1371    // Suite: Country mismatch detection
1372    // ─────────────────────────────────────────────────────────────
1373  
1374    describe('Country mismatch detection', () => {
1375      beforeEach(() => {
1376        resetState();
1377      });
1378  
1379      test('marks site as ignored when detected country differs from google_domain country', async () => {
1380        // parseCountryFromGoogleDomain returns AU (from google.com.au)
1381        // contacts_json has country_code=US → after browsing, country mismatch check triggers → ignore
1382        mockParseCountryFromGoogleDomain = _domain => 'AU';
1383  
1384        mockSiteRows = [
1385          makeSite({
1386            id: 500,
1387            domain: 'us-site.com',
1388            country_code: 'AU',
1389            google_domain: 'google.com.au',
1390            contacts_json: JSON.stringify({
1391              business_name: 'US Site',
1392              email_addresses: [{ email: 'us@us-site.com', label: 'Main' }],
1393              phone_numbers: [],
1394              social_profiles: [],
1395              // Has a contact page so browsing path executes and reaches country mismatch check
1396              key_pages: ['https://us-site.com/contact'],
1397              country_code: 'US', // triggers mismatch check
1398              city: 'New York',
1399            }),
1400          }),
1401        ];
1402  
1403        await runEnrichmentStage();
1404  
1405        const ignoreUpdate = runCalls.find(c => c.sql.includes("status = 'ignored'"));
1406        assert.ok(ignoreUpdate, 'should mark country-mismatched site as ignored');
1407      });
1408  
1409      test('does not ignore site when country matches google_domain', async () => {
1410        // Both AU → no mismatch
1411        mockParseCountryFromGoogleDomain = _domain => 'AU';
1412  
1413        mockSiteRows = [
1414          makeSite({
1415            id: 501,
1416            domain: 'au-site.com.au',
1417            country_code: 'AU',
1418            google_domain: 'google.com.au',
1419            contacts_json: makeContactsJson({
1420              key_pages: ['https://au-site.com.au/contact'],
1421            }),
1422          }),
1423        ];
1424  
1425        mockLLMResponse = {
1426          content: JSON.stringify({
1427            business_name: 'AU Site',
1428            email_addresses: [],
1429            phone_numbers: [],
1430            social_profiles: [],
1431            key_pages: [],
1432            country_code: 'AU',
1433          }),
1434          usage: { promptTokens: 200, completionTokens: 60 },
1435        };
1436  
1437        await runEnrichmentStage();
1438  
1439        const ignoreUpdate = runCalls.find(c => c.sql.includes("status = 'ignored'"));
1440        assert.equal(ignoreUpdate, undefined, 'should NOT mark site as ignored when country matches');
1441        const enrichUpdate = runCalls.find(c => c.sql.includes("status = 'enriched_regex'"));
1442        assert.ok(enrichUpdate, 'site should be marked enriched when country matches');
1443      });
1444    });
1445  
1446    // ─────────────────────────────────────────────────────────────
1447    // Suite: GDPR verification during browser enrichment
1448    // ─────────────────────────────────────────────────────────────
1449  
1450    describe('GDPR verification during browser enrichment', () => {
1451      beforeEach(() => {
1452        resetState();
1453      });
1454  
1455      test('runs GDPR check for GDPR country sites with emails from enrichment', async () => {
1456        // Configure: GB requires GDPR, google.co.uk → GB (no mismatch)
1457        mockParseCountryFromGoogleDomain = _domain => 'GB';
1458        mockCountryByCode = code => {
1459          if (code === 'GB') return { code: 'GB', requiresGDPRCheck: true };
1460          return null;
1461        };
1462  
1463        mockSiteRows = [
1464          makeSite({
1465            id: 600,
1466            domain: 'uk-site.co.uk',
1467            country_code: 'GB',
1468            google_domain: 'google.co.uk',
1469            contacts_json: makeContactsJson({
1470              key_pages: ['https://uk-site.co.uk/contact'],
1471              country_code: 'GB',
1472            }),
1473          }),
1474        ];
1475  
1476        mockLLMResponse = {
1477          content: JSON.stringify({
1478            business_name: 'UK Corp',
1479            email_addresses: [{ email: 'info@uk-site.co.uk', label: 'Main' }],
1480            phone_numbers: [],
1481            social_profiles: [],
1482            key_pages: [],
1483            country_code: 'GB',
1484          }),
1485          usage: { promptTokens: 200, completionTokens: 60 },
1486        };
1487  
1488        await runEnrichmentStage();
1489  
1490        // Should have company_proof in the final UPDATE
1491        const gdprUpdate = runCalls.find(
1492          c => c.sql.includes('company_proof') && c.sql.includes("status = 'enriched_regex'")
1493        );
1494        assert.ok(gdprUpdate, 'should set company_proof for GDPR site');
1495      });
1496  
1497      test('skips GDPR check for non-GDPR countries during browser enrichment', async () => {
1498        mockParseCountryFromGoogleDomain = _domain => 'AU';
1499        mockCountryByCode = code => {
1500          if (code === 'AU') return { code: 'AU', requiresGDPRCheck: false };
1501          return null;
1502        };
1503  
1504        mockSiteRows = [
1505          makeSite({
1506            id: 601,
1507            domain: 'au-site.com.au',
1508            country_code: 'AU',
1509            google_domain: 'google.com.au',
1510            contacts_json: makeContactsJson({
1511              key_pages: ['https://au-site.com.au/contact'],
1512              country_code: 'AU',
1513            }),
1514          }),
1515        ];
1516  
1517        mockLLMResponse = {
1518          content: JSON.stringify({
1519            business_name: 'AU Corp',
1520            email_addresses: [{ email: 'info@au-site.com.au', label: 'Main' }],
1521            phone_numbers: [],
1522            social_profiles: [],
1523            key_pages: [],
1524            country_code: 'AU',
1525          }),
1526          usage: { promptTokens: 200, completionTokens: 60 },
1527        };
1528  
1529        await runEnrichmentStage();
1530  
1531        // Should still enrich without GDPR verification
1532        const enrichUpdate = runCalls.find(c => c.sql.includes("status = 'enriched_regex'"));
1533        assert.ok(enrichUpdate, 'AU site should still be marked enriched');
1534  
1535        // batchVerifyEmails should NOT be called (non-GDPR country)
1536        assert.equal(
1537          batchVerifyEmailsMock.mock.calls.length,
1538          0,
1539          'batchVerifyEmails should not be called for non-GDPR country'
1540        );
1541      });
1542    });
1543  
1544    // ─────────────────────────────────────────────────────────────
1545    // Suite: Phone number normalization
1546    // ─────────────────────────────────────────────────────────────
1547  
1548    describe('Phone number normalization', () => {
1549      beforeEach(() => {
1550        resetState();
1551      });
1552  
1553      test('normalizes phone numbers from score_json contact_details', async () => {
1554        let normalizeCalled = false;
1555        normalizePhoneNumberMock.mock.mockImplementation(p => {
1556          normalizeCalled = true;
1557          return p;
1558        });
1559  
1560        mockSiteRows = [
1561          makeSite({
1562            id: 700,
1563            contacts_json: null,
1564            score_json: JSON.stringify({
1565              contact_details: {
1566                business_name: 'Phone Corp',
1567                email_addresses: [],
1568                phone_numbers: [{ number: '+61412345678', label: 'Office' }],
1569                social_profiles: [],
1570                key_pages: [],
1571              },
1572            }),
1573          }),
1574        ];
1575  
1576        await runEnrichmentStage();
1577  
1578        assert.ok(
1579          normalizeCalled,
1580          'normalizePhoneNumber should be called for phone numbers in score_json'
1581        );
1582      });
1583  
1584      test('handles string phone format in score_json', async () => {
1585        mockSiteRows = [
1586          makeSite({
1587            id: 701,
1588            contacts_json: null,
1589            score_json: JSON.stringify({
1590              contact_details: {
1591                business_name: 'String Phone Corp',
1592                email_addresses: [],
1593                phone_numbers: ['0412345678'], // legacy string format
1594                social_profiles: [],
1595                key_pages: [],
1596              },
1597            }),
1598          }),
1599        ];
1600  
1601        // Should not throw
1602        const result = await runEnrichmentStage();
1603        assert.equal(result.processed, 1, 'should handle string phone format');
1604      });
1605  
1606      test('filters null phones from cleanPhoneNumbers', async () => {
1607        // Phone with missing .number property → should be filtered out, not crash
1608        mockSiteRows = [
1609          makeSite({
1610            id: 702,
1611            contacts_json: null,
1612            score_json: JSON.stringify({
1613              contact_details: {
1614                business_name: 'Null Phone Corp',
1615                email_addresses: [],
1616                phone_numbers: [{ label: 'No number field' }], // invalid — no .number
1617                social_profiles: [],
1618                key_pages: [],
1619              },
1620            }),
1621          }),
1622        ];
1623  
1624        const result = await runEnrichmentStage();
1625        assert.equal(result.processed, 1, 'should handle invalid phone objects gracefully');
1626      });
1627    });
1628  
1629    // ─────────────────────────────────────────────────────────────
1630    // Suite: Batch processing with limit option
1631    // ─────────────────────────────────────────────────────────────
1632  
1633    describe('Batch processing options', () => {
1634      beforeEach(() => {
1635        resetState();
1636      });
1637  
1638      test('respects limit option passed to the stage', async () => {
1639        // Limit is applied at DB query level (SQL LIMIT clause in the source)
1640        // We verify it doesn't break when limit is passed
1641        mockSiteRows = [makeSite({ id: 800, contacts_json: null })];
1642  
1643        mockLLMResponse = {
1644          content: JSON.stringify({
1645            business_name: 'Limit Corp',
1646            email_addresses: [],
1647            phone_numbers: [],
1648            social_profiles: [],
1649            key_pages: [],
1650          }),
1651          usage: { promptTokens: 100, completionTokens: 30 },
1652        };
1653  
1654        const result = await runEnrichmentStage({ limit: 10 });
1655        assert.ok(result, 'should return a result with limit option');
1656        assert.equal(result.processed, 1, 'should process the site');
1657      });
1658  
1659      test('processes multiple sites sequentially in mock batch', async () => {
1660        // All sites have empty key_pages (no contact pages) → no browser needed
1661        // fetch is mocked globally to return 404 → sitemap fallback returns []
1662        mockSiteRows = [
1663          makeSite({ id: 810, contacts_json: makeContactsJson() }),
1664          makeSite({
1665            id: 811,
1666            domain: 'site2.com',
1667            url: 'https://site2.com',
1668            contacts_json: makeContactsJson(),
1669          }),
1670          makeSite({
1671            id: 812,
1672            domain: 'site3.com',
1673            url: 'https://site3.com',
1674            contacts_json: makeContactsJson(),
1675          }),
1676        ];
1677  
1678        const result = await runEnrichmentStage();
1679  
1680        assert.equal(result.processed, 3, 'should process all 3 sites');
1681        assert.equal(result.succeeded, 3, 'all 3 should succeed');
1682        assert.equal(result.failed, 0, 'no failures');
1683      });
1684  
1685      test('concurrency option is accepted without error', async () => {
1686        mockSiteRows = [];
1687        const result = await runEnrichmentStage({ concurrency: 2 });
1688        assert.ok(result, 'should accept concurrency option');
1689      });
1690    });
1691  
1692    // ─────────────────────────────────────────────────────────────
1693    // Suite: getEnrichmentStats
1694    // ─────────────────────────────────────────────────────────────
1695  
1696    describe('getEnrichmentStats', () => {
1697      beforeEach(() => {
1698        resetState();
1699      });
1700  
1701      test('returns stats object with expected fields', async () => {
1702        const stats = await getEnrichmentStats();
1703  
1704        assert.ok(stats !== null, 'stats should not be null');
1705        assert.ok('total_enriched' in stats, 'stats should have total_enriched');
1706        assert.ok('with_forms' in stats, 'stats should have with_forms');
1707        assert.ok('with_emails' in stats, 'stats should have with_emails');
1708        assert.ok('with_phones' in stats, 'stats should have with_phones');
1709      });
1710  
1711      test('closes database after returning stats', { skip: 'SQLite-era test: enrich.js now uses db.js PG pool (singleton, not closed per-run)' }, () => {
1712        getEnrichmentStats();
1713        assert.ok(dbClosed, 'database should be closed after getEnrichmentStats');
1714      });
1715  
1716      test('returns numeric values in stats', async () => {
1717        const stats = await getEnrichmentStats();
1718        assert.ok(typeof stats.total_enriched === 'number', 'total_enriched should be a number');
1719        assert.ok(typeof stats.with_forms === 'number', 'with_forms should be a number');
1720      });
1721    });
1722  
1723    // ─────────────────────────────────────────────────────────────
1724    // Suite: ENABLE_VISION flag handling
1725    // ─────────────────────────────────────────────────────────────
1726  
1727    describe('ENABLE_VISION flag', () => {
1728      beforeEach(() => {
1729        resetState();
1730      });
1731  
1732      test('stage runs and returns empty results with ENABLE_VISION=false and no sites', async () => {
1733        process.env.ENABLE_VISION = 'false';
1734        mockSiteRows = [];
1735  
1736        const result = await runEnrichmentStage();
1737  
1738        assert.equal(result.processed, 0, 'should return 0 processed with no sites');
1739  
1740        process.env.ENABLE_VISION = 'true';
1741      });
1742  
1743      test('stage runs normally with ENABLE_VISION=true', async () => {
1744        process.env.ENABLE_VISION = 'true';
1745        mockSiteRows = [];
1746  
1747        const result = await runEnrichmentStage();
1748        assert.equal(result.processed, 0, 'should return 0 processed with no sites');
1749      });
1750    });
1751  
1752    // ─────────────────────────────────────────────────────────────
1753    // Suite: Mixed sites — some with forms, some without
1754    // ─────────────────────────────────────────────────────────────
1755  
1756    describe('Mixed site set — forms and non-forms', () => {
1757      beforeEach(() => {
1758        resetState();
1759      });
1760  
1761      test('handles mix of form-sites and non-form-sites correctly', async () => {
1762        mockSiteRows = [
1763          // Site with form → handled in sitesWithForms loop → enriched without browser
1764          makeSite({ id: 900, contacts_json: makeContactsWithForm() }),
1765          // Site without form, no contact pages → browser launched, sitemap fails, marked enriched
1766          makeSite({
1767            id: 901,
1768            domain: 'no-form.com',
1769            url: 'https://no-form.com',
1770            contacts_json: makeContactsJson(),
1771          }),
1772        ];
1773  
1774        const result = await runEnrichmentStage();
1775  
1776        // All sites go through processBatch (no fast-path split for form sites)
1777        assert.equal(result.processed, 2, 'should process both sites');
1778        assert.ok(result.succeeded >= 1, 'should have at least 1 success');
1779      });
1780  
1781      test('skipped count reflects sites that have forms', async () => {
1782        mockSiteRows = [
1783          makeSite({ id: 910, contacts_json: makeContactsWithForm() }),
1784          makeSite({ id: 911, domain: 's2.com', contacts_json: makeContactsWithForm() }),
1785          makeSite({
1786            id: 912,
1787            domain: 's3.com',
1788            url: 'https://s3.com',
1789            contacts_json: makeContactsJson(),
1790          }),
1791        ];
1792  
1793        const result = await runEnrichmentStage();
1794  
1795        // All 3 sites go through processBatch (no skipping based on forms)
1796        assert.equal(result.processed, 3, 'all 3 sites should be processed');
1797        assert.equal(result.skipped, 0, 'no sites are skipped from processBatch');
1798      });
1799    });
1800  
1801    // ─────────────────────────────────────────────────────────────
1802    // Suite: dedupeByUrl utility (via social_profiles merge)
1803    // ─────────────────────────────────────────────────────────────
1804  
1805    describe('Social profile deduplication', () => {
1806      beforeEach(() => {
1807        resetState();
1808      });
1809  
1810      test('deduplicates social profiles when merging existing and new contacts', async () => {
1811        const sharedProfileUrl = 'https://linkedin.com/company/example';
1812  
1813        mockSiteRows = [
1814          makeSite({
1815            id: 1000,
1816            contacts_json: makeContactsJson({
1817              key_pages: ['https://example.com/contact'],
1818              social_profiles: [{ url: sharedProfileUrl, label: 'LinkedIn' }],
1819            }),
1820          }),
1821        ];
1822  
1823        // LLM returns the same social profile URL
1824        mockLLMResponse = {
1825          content: JSON.stringify({
1826            business_name: 'Dedup Corp',
1827            email_addresses: [],
1828            phone_numbers: [],
1829            social_profiles: [{ url: sharedProfileUrl, label: 'LinkedIn' }],
1830            key_pages: [],
1831          }),
1832          usage: { promptTokens: 100, completionTokens: 40 },
1833        };
1834  
1835        await runEnrichmentStage();
1836  
1837        // Should succeed and save enriched contacts
1838        const enrichUpdate = runCalls.find(c => c.sql.includes("status = 'enriched_regex'"));
1839        assert.ok(enrichUpdate, 'should mark site as enriched even with duplicate social profiles');
1840  
1841        // Verify contacts_json arg doesn't have duplicate LinkedIn entries
1842        if (enrichUpdate) {
1843          const contactsArg = enrichUpdate.args.find(
1844            a => typeof a === 'string' && a.includes(sharedProfileUrl)
1845          );
1846          if (contactsArg) {
1847            const parsed = JSON.parse(contactsArg);
1848            const linkedinCount = (parsed.social_profiles || []).filter(
1849              p => (typeof p === 'string' ? p : p.url) === sharedProfileUrl
1850            ).length;
1851            assert.equal(linkedinCount, 1, 'LinkedIn profile should appear only once (deduped)');
1852          }
1853        }
1854      });
1855    });
1856  });