Cradicle Explorer

/ src / stages / enrich.js
enrich.js
/**
 * Enrichment Stage
 * Browses each site's key_pages to extract additional contact information.
 * Runs for every eligible site, including sites that already have a contact form.
 */
   6  
   7  import { run, getOne, getAll, withTransaction } from '../utils/db.js';
   8  import {
   9    launchStealthBrowser,
  10    createStealthContext,
  11    humanScroll,
  12    randomDelay,
  13    isSocialMediaUrl,
  14    waitForCloudflare,
  15  } from '../utils/stealth-browser.js';
  16  import { readFileSync } from 'fs';
  17  import { join, dirname } from 'path';
  18  import { fileURLToPath } from 'url';
  19  import Logger from '../utils/logger.js';
  20  import { generateStageCompletion, displayProgress } from '../utils/summary-generator.js';
  21  import { processBatch } from '../utils/error-handler.js';
  22  import { callLLM } from '../utils/llm-provider.js';
  23  import { openRouterLimiter } from '../utils/rate-limiter.js';
  24  import { openRouterBreaker } from '../utils/circuit-breaker.js';
  25  import { safeJsonParse } from '../utils/error-handler.js';
  26  import { normalizePhoneNumber, isValidSmsNumber } from '../utils/phone-normalizer.js';
  27  import { getAdaptiveConcurrencyFast } from '../utils/adaptive-concurrency.js';
  28  import { checkBlocklist } from '../utils/site-filters.js';
  29  import { verifyCompanyEmail, batchVerifyEmails } from '../utils/gdpr-verification.js';
  30  import { getCountryByCode, normaliseCountryCode } from '../config/countries.js';
  31  import { recordFailure, resetRetries } from '../utils/retry-handler.js';
  32  import { parseCountryFromGoogleDomain } from '../utils/tld-detector.js';
  33  import { cleanInvalidSocialLinks } from '../contacts/prioritize.js';
  34  import { extractFromSocialProfiles } from '../utils/social-contact-extractor.js';
  35  import {
  36    extractContactsFromHtml,
  37    mergeExtractedContacts,
  38  } from '../utils/html-contact-extractor.js';
  39  import { sanitizeHtmlForPrompt, wrapUntrusted } from '../utils/llm-sanitizer.js';
  40  import { validateEnrichmentResponse } from '../utils/llm-response-validator.js';
  41  import {
  42    readHtmlDom,
  43    writeKeyPagesHtml as writeKeyPagesHtmlFile,
  44    readKeyPagesHtml as readKeyPagesHtmlFile,
  45  } from '../utils/html-storage.js';
  46  import { getScoreJsonWithFallback } from '../utils/score-storage.js';
  47  import { getContactsJsonWithFallback, setContactsJson } from '../utils/contacts-storage.js';
  48  import { compactContacts } from '../utils/compact-contacts.js';
  49  
  50  const __filename = fileURLToPath(import.meta.url);
  51  const __dirname = dirname(__filename);
  52  const projectRoot = join(__dirname, '../..');
  53  
  54  const logger = new Logger('Enrich');
  55  const success = (...args) => logger.success(...args);
  56  const info = (...args) => logger.info(...args);
  57  const error = (...args) => logger.error(...args);
  58  
  59  const ENGLISH_ONLY_MARKETS = process.env.ENGLISH_ONLY_MARKETS
  60    ? process.env.ENGLISH_ONLY_MARKETS.split(',').map(c => c.trim().toUpperCase())
  61    : null;
  62  
  63  // Load enrichment prompt
  64  const ENRICHMENT_PROMPT = readFileSync(join(projectRoot, 'prompts/ENRICHMENT.md'), 'utf-8');
  65  
  66  // Model configuration (from env or default)
  67  const ENRICHMENT_MODEL = process.env.ENRICHMENT_MODEL || 'openai/gpt-4o-mini';
  68  
  69  /** Deduplicate social_profiles by URL (handles both string and {url, label} formats) */
  70  function dedupeByUrl(profiles) {
  71    const seen = new Set();
  72    return profiles.filter(p => {
  73      const url = typeof p === 'string' ? p : p?.url;
  74      if (!url || seen.has(url)) return false;
  75      seen.add(url);
  76      return true;
  77    });
  78  }
  79  
  80  /**
  81   * Clean phone numbers from LLM output
  82   * - Supports both object format {number, label} and legacy string format
  83   * - Normalizes to E.164 format
  84   * - Logs transformations for troubleshooting
  85   * @param {Array<Object|string>} phones - Phone numbers from LLM
  86   * @param {string} context - URL or identifier for logging
  87   * @returns {Array<Object>} Cleaned phone numbers in object format
  88   */
  89  function cleanPhoneNumbers(phones, context) {
  90    if (!Array.isArray(phones)) return [];
  91  
  92    return phones
  93      .map((phone, idx) => {
  94        // Handle legacy string format
  95        if (typeof phone === 'string') {
  96          const cleaned = normalizePhoneNumber(phone);
  97          if (cleaned !== phone) {
  98            info(`  Phone normalization [${context}]: '${phone}' → '${cleaned}'`);
  99          }
 100          return { number: cleaned, label: '' };
 101        }
 102  
 103        // Handle object format {number, label}
 104        if (phone.number) {
 105          const original = phone.number;
 106          const cleaned = normalizePhoneNumber(original);
 107          if (cleaned !== original) {
 108            info(`  Phone normalization [${context}]: '${original}' → '${cleaned}'`);
 109          }
 110          return {
 111            number: cleaned,
 112            label: phone.label || '',
 113          };
 114        }
 115  
 116        // Invalid format
 117        error(`  Invalid phone format at index ${idx} for ${context}: ${JSON.stringify(phone)}`);
 118        return null;
 119      })
 120      .filter(p => {
 121        if (!p) return false;
 122        const reason = isValidSmsNumber(p.number);
 123        if (reason) {
 124          info(`  Phone rejected [${context}]: ${p.number} — ${reason}`);
 125          return false;
 126        }
 127        return true;
 128      });
 129  }
 130  
 131  /**
 132   * Run the enrichment stage
 133   * @param {Object} options - Stage options
 134   * @param {number} options.limit - Limit number of sites to enrich
 135   * @param {number} options.concurrency - Number of concurrent enrichment operations (default: 8)
 136   * @param {string|string[]} [options.statusFilter] - Optional status value(s) to filter by instead
 137   *   of the default ENABLE_VISION-derived status. Pass a single string or array of strings.
 138   * @returns {Promise<Object>} Stage results
 139   */
 140  export async function runEnrichmentStage(options = {}) {
 141    const startTime = Date.now();
 142    // Use ENRICHMENT_CONCURRENCY (defaults to 8) for parallel browser + LLM operations
 143    // Each concurrent operation runs a browser context + multiple LLM calls
 144    const concurrency =
 145      options.concurrency || parseInt(process.env.ENRICHMENT_CONCURRENCY || '8', 10);
 146  
 147    try {
 148      info('Starting Enrichment Stage...');
 149  
 150      // Check ENABLE_VISION flag to determine input status
 151      const ENABLE_VISION = process.env.ENABLE_VISION !== 'false';
 152  
 153      // Show deprecation warning if old flags are used
 154      const legacyFlags = [
 155        process.env.USE_COMPUTER_VISION_SCORING,
 156        process.env.USE_COMPUTER_VISION_RESCORING,
 157        process.env.USE_COMPUTER_VISION_ENRICHMENT,
 158      ];
 159      if (legacyFlags.some(flag => flag !== undefined)) {
 160        logger.warn(
 161          '[enrich] WARN: Vision flags (USE_COMPUTER_VISION_*) are deprecated. Use ENABLE_VISION instead.'
 162        );
 163      }
 164  
 165      // semantic_scored = HTML-only mode (ENABLE_VISION=false), semantic Haiku pass complete
 166      // vision_scored   = vision mode (ENABLE_VISION=true), vision rescoring complete
 167      const defaultInputStatus = ENABLE_VISION ? 'vision_scored' : 'semantic_scored';
 168  
 169      // statusFilter may be injected (e.g. 2Step passes 'reviews_downloaded').
 170      // Supports a single string or an array. Falls back to the default derived from ENABLE_VISION.
 171      const rawStatusFilter = options.statusFilter || defaultInputStatus;
 172      const statusValues = Array.isArray(rawStatusFilter) ? rawStatusFilter : [rawStatusFilter];
 173  
 174      info(
 175        `[enrich] Mode: ${ENABLE_VISION ? 'vision enabled' : 'HTML-only (vision disabled)'} - pulling from status IN (${statusValues.map(s => `'${s}'`).join(', ')})`
 176      );
 177  
 178      // Status-based gate: only enrich sites that have completed the appropriate scoring stage.
 179      // The status IS the gate — no json_extract needed.
 180      const englishFilter = ENGLISH_ONLY_MARKETS
 181        ? `AND country_code = ANY($2::text[])`
 182        : '';
 183      const sitesQuery = `
 184        SELECT id, domain, landing_page_url as url, country_code
 185        FROM sites
 186        WHERE status = ANY($1::text[])
 187          AND (enriched_at IS NULL OR error_message IS NOT NULL)
 188          AND html_dom IS NOT NULL
 189          ${englishFilter}
 190        ORDER BY CASE WHEN country_code IN ('AU','CA','GB','IE','IN','NZ','US','ZA') THEN 0 ELSE 1 END ASC
 191        ${options.limit ? `LIMIT ${options.limit}` : ''}
 192      `;
 193  
 194      const sitesParams = ENGLISH_ONLY_MARKETS
 195        ? [statusValues, ENGLISH_ONLY_MARKETS]
 196        : [statusValues];
 197      const sites = await getAll(sitesQuery, sitesParams);
 198  
 199      if (sites.length === 0) {
 200        info('No sites need enrichment');
 201        return {
 202          processed: 0,
 203          succeeded: 0,
 204          failed: 0,
 205          skipped: 0,
 206          duration: Date.now() - startTime,
 207        };
 208      }
 209  
 210      // Filter out blocklisted sites (directories/social media/franchises)
 211      let ignoredCount = 0;
 212      for (const site of sites) {
 213        const blocked = checkBlocklist(site.domain, site.country_code);
 214        if (blocked) {
 215          await run(
 216            `UPDATE sites SET status = 'ignored', error_message = $1 WHERE id = $2`,
 217            [blocked.reason, site.id]
 218          );
 219          ignoredCount++;
 220        }
 221      }
 222  
 223      if (ignoredCount > 0) {
 224        info(`Marked ${ignoredCount} sites as ignored (directories/social media)`);
 225      }
 226  
 227      // All sites go through browser enrichment — no fast-path skip.
 228      // Even sites with contact forms may have additional emails/phones on key pages.
 229      info(`Found ${sites.length} sites to enrich (all go through browser + regex)`);
 230  
 231      const stats = {
 232        processed: 0,
 233        succeeded: 0,
 234        failed: 0,
 235        skipped: 0,
 236        formsFound: 0,
 237        emailsFound: 0,
 238        phonesFound: 0,
 239      };
 240  
 241      // Launch browser once for all enrichment operations with smart stealth
 242      // Will use aggressive stealth for social media, minimal for prospect sites
 243      const browser = await launchStealthBrowser({
 244        headless: true,
 245        stealthLevel: 'standard', // Base level, will be adjusted per-page in scrapePage
 246      });
 247  
 248      try {
 249        // Process sites in batches
 250        const { results, errors } = await processBatch(
 251          sites,
 252          // eslint-disable-next-line require-await -- Wrapper for async function enrichSite
 253          async (site, index) => {
 254            displayProgress(index + 1, sites.length, `Enriching ${site.url}`);
 255            return enrichSite(
 256              site.id,
 257              site.url,
 258              getContactsJsonWithFallback(site.id, site),
 259              readHtmlDom(site.id),
 260              getScoreJsonWithFallback(site.id, site),
 261              browser
 262            );
 263          },
 264          {
 265            concurrency,
 266            getDynamicConcurrency: () => getAdaptiveConcurrencyFast(1, 8, 'ENRICHMENT_CONCURRENCY'),
 267          }
 268        );
 269  
 270        // Count successes and failures
 271        stats.processed = sites.length;
 272        stats.succeeded = results.filter(r => r !== null).length;
 273        stats.failed = errors.length;
 274  
 275        // Count enrichments
 276        /* eslint-disable max-depth -- Necessary for counting nested result stats */
 277        for (const result of results) {
 278          if (result) {
 279            if (result.formFound) stats.formsFound++;
 280            if (result.emailsFound) stats.emailsFound += result.emailsFound;
 281            if (result.phonesFound) stats.phonesFound += result.phonesFound;
 282          }
 283        }
 284        /* eslint-enable max-depth */
 285  
 286        // Log errors
 287        for (const err of errors) {
 288          const errorMsg = err.message || err.toString();
 289          error(`  Enrichment error: ${errorMsg}`);
 290        }
 291  
 292        stats.duration = Date.now() - startTime;
 293        generateStageCompletion('Enrichment', stats);
 294        info(
 295          `Found: ${stats.formsFound} forms, ${stats.emailsFound} emails, ${stats.phonesFound} phones`
 296        );
 297  
 298        return stats;
 299      } finally {
 300        await browser.close();
 301      }
 302    } catch (err) {
 303      error(`Enrichment stage failed: ${err.message}`);
 304      throw err;
 305    }
 306  }
 307  
 308  /**
 309   * Discover contact-related pages from sitemap.xml when LLM found no key_pages.
 310   * Fetches /sitemap.xml (and follows sitemap index entries), extracts <loc> URLs,
 311   * and filters them against the multilingual contact page pattern.
 312   * @param {string} siteUrl - The site's landing page URL
 313   * @param {RegExp} pattern - Contact page pattern to match against
 314   * @returns {Promise<string[]>} Array of matching URLs (may be empty)
 315   */
 316  async function discoverContactPagesFromSitemap(siteUrl, pattern) {
 317    try {
 318      const { origin } = new URL(siteUrl);
 319      const sitemapUrl = `${origin}/sitemap.xml`;
 320  
 321      const controller = new AbortController();
 322      const timeout = setTimeout(() => controller.abort(), 10000);
 323  
 324      const resp = await fetch(sitemapUrl, {
 325        signal: controller.signal,
 326        headers: { 'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1)' },
 327      });
 328      clearTimeout(timeout);
 329  
 330      if (!resp.ok) return [];
 331  
 332      const xml = await resp.text();
 333      if (!xml || xml.length < 20) return [];
 334  
 335      // Extract all <loc> URLs from the sitemap
 336      const locMatches = xml.match(/<loc>\s*(https?:\/\/[^<]+?)\s*<\/loc>/gi) || [];
 337      const urls = locMatches.map(m => m.replace(/<\/?loc>/gi, '').trim());
 338  
 339      // If this is a sitemap index, fetch the first few child sitemaps
 340      const isSitemapIndex = /<sitemapindex/i.test(xml);
 341      const allUrls = [...urls];
 342  
 343      if (isSitemapIndex) {
 344        // Only fetch up to 3 child sitemaps to avoid excessive requests
 345        const childSitemaps = urls.slice(0, 3);
 346        for (const childUrl of childSitemaps) {
 347          try {
 348            const childCtrl = new AbortController();
 349            const childTimeout = setTimeout(() => childCtrl.abort(), 8000);
 350            const childResp = await fetch(childUrl, {
 351              signal: childCtrl.signal,
 352              headers: { 'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1)' },
 353            });
 354            clearTimeout(childTimeout);
 355  
 356            if (childResp.ok) {
 357              const childXml = await childResp.text();
 358              const childLocs = childXml.match(/<loc>\s*(https?:\/\/[^<]+?)\s*<\/loc>/gi) || [];
 359              allUrls.push(...childLocs.map(m => m.replace(/<\/?loc>/gi, '').trim()));
 360            }
 361          } catch {
 362            // Skip failed child sitemaps
 363          }
 364        }
 365      }
 366  
 367      // Filter URLs matching the contact page pattern
 368      const matches = allUrls.filter(u => pattern.test(u));
 369  
 370      // Deduplicate and limit to 5 pages max
 371      const unique = [...new Set(matches)].slice(0, 5);
 372  
 373      if (unique.length > 0) {
 374        info(`  Sitemap fallback found ${unique.length} contact page(s)`);
 375      }
 376  
 377      return unique;
 378    } catch {
 379      // Network errors, timeouts, invalid URLs — all expected for many sites
 380      return [];
 381    }
 382  }
 383  
 384  /**
 385   * Enrich a single site by browsing key_pages
 386   * @param {number} siteId - Site ID
 387   * @param {string} url - Site URL
 388   * @param {string} contactsJsonStr - Current contacts_json (can be NULL)
 389   * @param {string} htmlDom - HTML DOM for initial contact extraction
 390   * @param {string} scoreJsonStr - Score JSON string
 391   * @param {Browser} browser - Playwright browser instance
 392   * @returns {Promise<Object>} Enrichment result
 393   */
 394  // eslint-disable-next-line complexity -- Contact enrichment requires checking multiple contact types and merging arrays
 395  async function enrichSite(siteId, url, contactsJsonStr, htmlDom, scoreJsonStr, browser) {
 396    try {
 397      // If contacts_json is NULL, try to get contact_details from score_json first
 398      let contactsJson;
 399      if (!contactsJsonStr) {
 400        // First try to get contacts from rescoring (includes vision text extraction)
 401        if (scoreJsonStr) {
 402          try {
 403            const scoreJson = JSON.parse(scoreJsonStr);
 404            if (scoreJson.contact_details) {
 405              info(`  Using contacts from rescoring (with vision text) for ${url}`);
 406              contactsJson = scoreJson.contact_details;
 407  
 408              // Clean phone numbers to E.164 format
 409              if (contactsJson.phone_numbers) {
 410                contactsJson.phone_numbers = cleanPhoneNumbers(contactsJson.phone_numbers, url);
 411              }
 412  
 413              // Clean up invalid Twitter/X.com links
 414              contactsJson = cleanInvalidSocialLinks(contactsJson);
 415            }
 416          } catch (err) {
 417            error(`  Failed to parse score_json: ${err.message}`);
 418          }
 419        }
 420  
 421        // Fallback to extracting from HTML if not found in score_json.
 422        // The orchestrator's enrich_sites batch handles LLM contact extraction via Claude Max
 423        // (claude -p, zero API cost). The pipeline stage uses regex-only here so we don't
 424        // double-spend on OpenRouter calls that the orchestrator will handle.
 425        if (!contactsJson) {
 426          const ENABLE_ENRICHMENT_LLM_FLAG = process.env.ENABLE_ENRICHMENT_LLM !== 'false';
 427  
 428          if (ENABLE_ENRICHMENT_LLM_FLAG) {
 429            info(
 430              `  [orchestrator] Using regex extraction for ${url} (enrich_sites batch handles LLM via Claude Max)`
 431            );
 432            contactsJson = extractContactsFromHtml(htmlDom || '', url);
 433          } else {
 434            info(`  Extracting initial contacts from ${url}`);
 435            contactsJson = await extractInitialContacts(url, htmlDom, siteId);
 436          }
 437        }
 438      } else {
 439        contactsJson = JSON.parse(contactsJsonStr);
 440        // Clean up invalid Twitter/X.com links from existing contacts_json
 441        contactsJson = cleanInvalidSocialLinks(contactsJson);
 442      }
 443  
 444      // Regex pre-pass: extract contacts from html_dom without any LLM call.
 445      // Runs unconditionally — cheap, and catches emails/phones/socials/key_pages the LLM missed.
 446      const realHtml = htmlDom && htmlDom !== 'HTML removed after scoring' ? htmlDom : null;
 447      if (realHtml) {
 448        const regexContacts = extractContactsFromHtml(realHtml, url);
 449        const before =
 450          (contactsJson.email_addresses?.length || 0) +
 451          (contactsJson.phone_numbers?.length || 0) +
 452          (contactsJson.key_pages?.length || 0) +
 453          (contactsJson.social_profiles?.length || 0);
 454        contactsJson = mergeExtractedContacts(contactsJson, regexContacts, url);
 455        const after =
 456          (contactsJson.email_addresses?.length || 0) +
 457          (contactsJson.phone_numbers?.length || 0) +
 458          (contactsJson.key_pages?.length || 0) +
 459          (contactsJson.social_profiles?.length || 0);
 460        if (after > before) {
 461          info(
 462            `  Regex pre-pass added ${after - before} contacts/pages for ${url} (emails:${contactsJson.email_addresses?.length || 0} phones:${contactsJson.phone_numbers?.length || 0} social:${contactsJson.social_profiles?.length || 0} pages:${contactsJson.key_pages?.length || 0})`
 463          );
 464        }
 465      }
 466  
 467      // Get key_pages that look like contact pages (multilingual)
 468      const keyPages = contactsJson.key_pages || [];
 469      const contactPagePattern =
 470        /contact|support|get-in-touch|about|kontakt|contacto|contato|a-propos|apropos|chi-siamo|uber-uns|ueber-uns|impressum|mentions-legales|aviso-legal|datenschutz|privacidad|hubungi|kontak/i;
 471      const contactPages = keyPages.filter(pageUrl => contactPagePattern.test(pageUrl.toLowerCase()));
 472  
 473      // Sitemap fallback: if no contact pages from LLM, check sitemap.xml
 474      if (contactPages.length === 0) {
 475        const sitemapPages = await discoverContactPagesFromSitemap(url, contactPagePattern);
 476        contactPages.push(...sitemapPages);
 477      }
 478  
 479      if (contactPages.length === 0) {
 480        info(`  No contact pages found for ${url}`);
 481  
 482        // Check if we actually have any usable contacts
 483        const hasEmail = contactsJson.email_addresses?.length > 0;
 484        const hasPhone = contactsJson.phone_numbers?.length > 0;
 485        const hasSocial = contactsJson.social_profiles?.length > 0;
 486        const hasForm = !!contactsJson.primary_contact_form?.form_url;
 487        const hasAnyContact = hasEmail || hasPhone || hasSocial || hasForm;
 488  
 489        if (!hasAnyContact) {
 490          // Zero contacts found and no key_pages to browse — don't mark as enriched.
 491          // Record as a retriable failure so the site gets another pass
 492          // (e.g. after html_dom becomes available or on next pipeline cycle).
 493          const retryStatus =
 494            process.env.ENABLE_VISION !== 'false' ? 'vision_scored' : 'semantic_scored';
 495          info(`  No contacts found for ${url} — keeping at ${retryStatus} for retry`);
 496          await recordFailure(siteId, 'enrichment', 'No contacts or key_pages found', retryStatus);
 497          return { formFound: false, emailsFound: 0, phonesFound: 0 };
 498        }
 499  
 500        // Has some contacts but no contact pages to browse — mark enriched_regex so
 501        // the orchestrator LLM pass can still run against html_dom
 502        const city = contactsJson.city || null;
 503        const countryCode = normaliseCountryCode(contactsJson.country_code) || null;
 504  
 505        // Write contacts to filesystem (contacts_json column was dropped in migration 121)
 506        contactsJson = compactContacts(contactsJson);
 507        setContactsJson(siteId, JSON.stringify(contactsJson));
 508  
 509        await withTransaction(async (client) => {
 510          await client.query(
 511            `UPDATE sites
 512             SET city = $1,
 513                 country_code = $2,
 514                 status = 'enriched_regex',
 515                 enriched_at = NOW(),
 516                 error_message = NULL
 517             WHERE id = $3`,
 518            [city, countryCode, siteId]
 519          );
 520          await resetRetries(siteId);
 521        });
 522  
 523        return { formFound: false, emailsFound: 0, phonesFound: 0 };
 524      }
 525  
 526      info(`  Browsing ${contactPages.length} pages for ${url}`);
 527  
 528      const newContactInfo = {};
 529      const keyPagesHtml = {}; // Accumulate {url: renderedHtml} for orchestrator LLM pass
 530      let foundForm = false;
 531  
 532      // Browse each contact page
 533      for (const pageUrl of contactPages) {
 534        if (foundForm) break; // Stop if we found a form
 535  
 536        const pageData = await scrapePage(pageUrl, browser);
 537  
 538        // Store rendered HTML for orchestrator LLM pass (cleared after enriched_llm)
 539        if (pageData.html) {
 540          keyPagesHtml[pageUrl] = pageData.html;
 541        }
 542  
 543        // Extract contacts — orchestrator batch (Claude Max) or regex-only.
 544        // The enrich_sites orchestrator batch handles LLM extraction from html_dom via claude -p.
 545        // Browser-navigated pages use regex extraction here (orchestrator handles the html_dom pass).
 546        const ENABLE_ENRICHMENT_LLM = process.env.ENABLE_ENRICHMENT_LLM !== 'false';
 547        let enrichmentResult;
 548  
 549        {
 550          // Regex-only extraction for browser-navigated pages.
 551          // The enrich_sites orchestrator batch handles LLM extraction from html_dom via claude -p
 552          // (Claude Max, zero API cost). OpenRouter enrichment calls are no longer used.
 553          const regexResult = extractContactsFromHtml(pageData.html || '', pageUrl);
 554          enrichmentResult = {
 555            email_addresses: regexResult.email_addresses,
 556            phone_numbers: regexResult.phone_numbers,
 557            social_profiles: regexResult.social_profiles,
 558            key_pages: regexResult.key_pages,
 559            primary_contact_form: regexResult.has_contact_form ? pageUrl : null,
 560          };
 561        }
 562  
 563        // Merge new contact info
 564        if (enrichmentResult.primary_contact_form) {
 565          newContactInfo.primary_contact_form = enrichmentResult.primary_contact_form;
 566          foundForm = true;
 567        }
 568  
 569        // Merge location info (prefer first found)
 570        if (enrichmentResult.city && !contactsJson.city) {
 571          newContactInfo.city = enrichmentResult.city;
 572        }
 573        if (enrichmentResult.country_code && !contactsJson.country_code) {
 574          newContactInfo.country_code = enrichmentResult.country_code;
 575        }
 576  
 577        // Merge arrays (avoid duplicates)
 578        if (enrichmentResult.email_addresses) {
 579          newContactInfo.email_addresses = [
 580            ...(newContactInfo.email_addresses || []),
 581            ...enrichmentResult.email_addresses,
 582          ];
 583        }
 584        if (enrichmentResult.phone_numbers) {
 585          newContactInfo.phone_numbers = [
 586            ...(newContactInfo.phone_numbers || []),
 587            ...enrichmentResult.phone_numbers,
 588          ];
 589        }
 590        if (enrichmentResult.social_profiles) {
 591          newContactInfo.social_profiles = [
 592            ...(newContactInfo.social_profiles || []),
 593            ...enrichmentResult.social_profiles,
 594          ];
 595        }
 596        if (enrichmentResult.key_pages) {
 597          newContactInfo.key_pages = [
 598            ...(newContactInfo.key_pages || []),
 599            ...enrichmentResult.key_pages,
 600          ];
 601        }
 602        if (enrichmentResult.business_name && !contactsJson.business_name) {
 603          newContactInfo.business_name = enrichmentResult.business_name;
 604        }
 605      }
 606  
 607      // Merge new contact info with existing
 608      let enrichedContacts = {
 609        ...contactsJson,
 610        ...newContactInfo,
 611        // Merge arrays without duplicates
 612        email_addresses: [
 613          ...(contactsJson.email_addresses || []),
 614          ...(newContactInfo.email_addresses || []),
 615        ],
 616        phone_numbers: cleanPhoneNumbers(
 617          [...(contactsJson.phone_numbers || []), ...(newContactInfo.phone_numbers || [])],
 618          url
 619        ),
 620        social_profiles: dedupeByUrl([
 621          ...(contactsJson.social_profiles || []),
 622          ...(newContactInfo.social_profiles || []),
 623        ]),
 624        key_pages: Array.from(
 625          new Set([...(contactsJson.key_pages || []), ...(newContactInfo.key_pages || [])])
 626        ),
 627      };
 628  
 629      // Social profile contact extraction (YouTube via raw HTTP, others via Playwright stealth)
 630      const extractableProfiles = (enrichedContacts.social_profiles || []).filter(sp => {
 631        const spUrl = typeof sp === 'string' ? sp : sp.url;
 632        return spUrl && /youtube\.com|facebook\.com|yelp\.com|instagram\.com|linkedin\.com\/company/i.test(spUrl)
 633          && !/profile\.php\?id=|\/groups\/|\/intent\//i.test(spUrl);
 634      });
 635      if (extractableProfiles.length > 0) {
 636        const socialContacts = await extractFromSocialProfiles(extractableProfiles, url, browser);
 637        if (socialContacts) {
 638          enrichedContacts = mergeExtractedContacts(enrichedContacts, socialContacts, url);
 639          if (socialContacts._city && !enrichedContacts.city) {
 640            enrichedContacts.city = socialContacts._city;
 641          }
 642        }
 643      }
 644  
 645      // Clean up invalid Twitter/X.com links (those without usernames)
 646      enrichedContacts = cleanInvalidSocialLinks(enrichedContacts);
 647  
 648      // Extract location information from enriched contacts
 649      const city = enrichedContacts.city || null;
 650      const countryCode = normaliseCountryCode(enrichedContacts.country_code) || null;
 651      const state = enrichedContacts.state || null;
 652  
 653      // Check for country mismatch (site from wrong country for the search)
 654      if (countryCode) {
 655        const siteRow = await getOne(
 656          'SELECT google_domain FROM sites WHERE id = $1',
 657          [siteId]
 658        );
 659        const googleDomain = siteRow?.google_domain;
 660  
 661        if (googleDomain) {
 662          const expectedCountry = parseCountryFromGoogleDomain(googleDomain);
 663  
 664          // Check if detected country matches expected country
 665          if (countryCode !== expectedCountry) {
 666            setContactsJson(siteId, JSON.stringify(enrichedContacts));
 667            await run(
 668              `UPDATE sites
 669               SET status = 'ignored',
 670                   error_message = $1,
 671                   city = $2,
 672                   country_code = $3,
 673                   state = $4
 674               WHERE id = $5`,
 675              [
 676                `Country mismatch: Site is in ${countryCode}, but search was for ${expectedCountry} (${googleDomain})`,
 677                city,
 678                countryCode,
 679                state,
 680                siteId,
 681              ]
 682            );
 683  
 684            info(
 685              `  ${url} country mismatch: Site in ${countryCode}, search for ${expectedCountry} - marked as ignored`
 686            );
 687  
 688            return {
 689              formFound: foundForm,
 690              emailsFound: 0,
 691              phonesFound: 0,
 692            };
 693          }
 694        }
 695      }
 696  
 697      // GDPR verification for EU/UK countries
 698      let companyProof = null;
 699      let gdprVerified = null;
 700      if (countryCode) {
 701        let country = null;
 702        try {
 703          country = getCountryByCode(countryCode);
 704        } catch {
 705          // Unsupported country code — skip GDPR check
 706        }
 707        if (country && country.requiresGDPRCheck && enrichedContacts.email_addresses?.length > 0) {
 708          info(`  Running GDPR verification for ${countryCode} site: ${url}`);
 709  
 710          // Combine landing page HTML with all browsed key pages HTML.
 711          // Company proof keywords (Ltd, VAT number, Registered office etc.) appear far
 712          // more often on About/Legal/Contact pages than the landing page.
 713          const allHtml = [htmlDom || '', ...Object.values(keyPagesHtml)].join('\n');
 714  
 715          // Verify all email addresses
 716          const verificationResults = batchVerifyEmails({
 717            emails: enrichedContacts.email_addresses.map(e => (typeof e === 'string' ? e : e.email)),
 718            html: allHtml,
 719            countryCode,
 720            domain: url,
 721          });
 722  
 723          // Store verification results
 724          companyProof = JSON.stringify({
 725            verifiedAt: new Date().toISOString(),
 726            results: verificationResults,
 727            countryCode,
 728            totalEmails: enrichedContacts.email_addresses.length,
 729            verifiedCount: verificationResults.filter(r => r.isVerified).length,
 730            unverifiedCount: verificationResults.filter(r => !r.isVerified).length,
 731          });
 732  
 733          // Set gdpr_verified flag based on results
 734          // TRUE if at least one email is verified (high or medium confidence)
 735          // FALSE if all emails are unverified (free email or uncertain)
 736          const hasVerifiedEmail = verificationResults.some(
 737            r => r.isVerified && ['high', 'medium'].includes(r.confidence)
 738          );
 739          gdprVerified = hasVerifiedEmail ? 1 : 0;
 740  
 741          if (hasVerifiedEmail) {
 742            success(
 743              `  GDPR: ${verificationResults.filter(r => r.isVerified).length}/${verificationResults.length} emails verified`
 744            );
 745          } else {
 746            info(
 747              `  GDPR: No verified company emails found (${verificationResults.filter(r => r.reason.includes('Free email')).length} free emails)`
 748            );
 749          }
 750        }
 751      }
 752  
 753      // Update database with enriched contacts and GDPR verification.
 754      // Status = 'enriched_regex': browser regex pass complete; orchestrator LLM pass will set 'enriched_llm'.
 755      // key_pages_html written to filesystem; cleared once enriched_llm completes.
 756      const hasKeyPages = Object.keys(keyPagesHtml).length > 0;
 757      if (hasKeyPages) {
 758        writeKeyPagesHtmlFile(siteId, keyPagesHtml);
 759      }
 760  
 761      // Wrap all DB updates for this site in a transaction so GDPR fields,
 762      // status, and retry counter are always consistent — no partial writes on crash.
 763      // contacts_json column was dropped in migration 121 — write to filesystem only
 764      const compactedContacts = compactContacts(enrichedContacts);
 765      setContactsJson(siteId, JSON.stringify(compactedContacts));
 766  
 767      await withTransaction(async (client) => {
 768        await client.query(
 769          `UPDATE sites
 770           SET city = $1,
 771               country_code = $2,
 772               state = $3,
 773               company_proof = $4,
 774               gdpr_verified = $5,
 775               gdpr_verified_at = CASE WHEN $6 IS NOT NULL THEN NOW() ELSE NULL END,
 776               status = 'enriched_regex',
 777               enriched_at = NOW(),
 778               key_pages_html = $7,
 779               error_message = NULL
 780           WHERE id = $8`,
 781          [
 782            city,
 783            countryCode,
 784            state,
 785            companyProof,
 786            gdprVerified,
 787            gdprVerified,
 788            hasKeyPages ? 'fs' : null, // Flag: 'fs' = stored on filesystem
 789            siteId,
 790          ]
 791        );
 792  
 793        // Reset retry count on successful enrichment
 794        await resetRetries(siteId);
 795      });
 796  
 797      success(
 798        `  Enriched ${url}: ${foundForm ? 'form' : ''} ${newContactInfo.email_addresses?.length || 0} emails, ${newContactInfo.phone_numbers?.length || 0} phones`
 799      );
 800  
 801      return {
 802        formFound: foundForm,
 803        emailsFound: newContactInfo.email_addresses?.length || 0,
 804        phonesFound: newContactInfo.phone_numbers?.length || 0,
 805      };
 806    } catch (err) {
 807      // Record failure and increment retry count (marks as 'failing' if limit exceeded)
 808      const retryStatus = process.env.ENABLE_VISION !== 'false' ? 'vision_scored' : 'semantic_scored';
 809      await recordFailure(siteId, 'enrichment', err, retryStatus);
 810      throw err;
 811    }
 812  }
 813  
 814  /**
 815   * Scrape a page to get HTML and screenshot with smart stealth detection
 816   * @param {string} pageUrl - Page URL
 817   * @param {Browser} browser - Playwright browser instance
 818   * @returns {Promise<Object>} Page data
 819   */
 820  async function scrapePage(pageUrl, browser) {
 821    // Smart stealth level detection: aggressive for socials, minimal for prospects
 822    const isSocial = isSocialMediaUrl(pageUrl);
 823    const stealthLevel = isSocial ? 'aggressive' : 'minimal';
 824  
 825    info(`  ${isSocial ? '[SOCIAL]' : '[PROSPECT]'} Using ${stealthLevel} stealth for ${pageUrl}`);
 826  
 827    const context = await createStealthContext(browser, {
 828      viewport: { width: 1280, height: 720 },
 829      stealthLevel,
 830    });
 831  
 832    const page = await context.newPage();
 833  
 834    try {
 835      await page.goto(pageUrl, { waitUntil: 'domcontentloaded', timeout: 30000 });
 836  
 837      // Wait for potential Cloudflare/Turnstile challenges
 838      const cloudflareResolved = await waitForCloudflare(page, { timeout: 30000 });
 839      if (!cloudflareResolved) {
 840        info(`  Cloudflare/Turnstile challenge may still be blocking ${pageUrl}`);
 841      }
 842  
 843      // Human-like browsing behavior (more aggressive for socials)
 844      if (isSocial) {
 845        await randomDelay(1500, 2500);
 846        await humanScroll(page, { distance: 'short' });
 847        await randomDelay(500, 1000);
 848      } else {
 849        await randomDelay(300, 700);
 850      }
 851  
 852      // Wait for page to stabilise before reading content — avoids "page is navigating" error
 853      // when redirects or JS navigations fire after domcontentloaded
 854      try {
 855        await page.waitForLoadState('domcontentloaded', { timeout: 5000 });
 856      } catch {
 857        // Already loaded or timed out — proceed anyway
 858      }
 859  
 860      // Get HTML DOM
 861      const html = await page.content();
 862  
 863      // Take screenshot only when vision is enabled — saves bandwidth and avoids
 864      // OpenRouter 400 errors from large base64 payloads at higher concurrency
 865      const ENABLE_VISION = process.env.ENABLE_VISION !== 'false';
 866      let screenshot = null;
 867      if (ENABLE_VISION) {
 868        const buf = await page.screenshot({
 869          type: 'png',
 870          fullPage: false, // Above-fold only
 871        });
 872        screenshot = buf.toString('base64');
 873      }
 874  
 875      return {
 876        html,
 877        screenshot,
 878      };
 879    } finally {
 880      await page.close();
 881      await context.close();
 882    }
 883  }
 884  
 885  /**
 886   * Call LLM API for contact enrichment
 887   * @param {Object} params - Enrichment parameters
 888   * @param {number} params.siteId - Site ID for usage tracking
 889   * @returns {Promise<Object>} Enrichment result
 890   */
 891  async function callEnrichmentAPI({ pageUrl, currentContacts, html, screenshot, siteId = null }) {
 892    const messages = [
 893      {
 894        role: 'system',
 895        content: ENRICHMENT_PROMPT,
 896      },
 897      {
 898        role: 'user',
 899        content: screenshot
 900          ? [
 901              {
 902                type: 'text',
 903                text: `Current contacts_json:\n${JSON.stringify(currentContacts, null, 2)}\n\nPage URL: ${pageUrl}\n\n${wrapUntrusted(sanitizeHtmlForPrompt(html.substring(0, 50000)), 'website_html')}`,
 904              },
 905              {
 906                type: 'image_url',
 907                image_url: {
 908                  url: `data:image/png;base64,${screenshot}`,
 909                },
 910              },
 911            ]
 912          : `Current contacts_json:\n${JSON.stringify(currentContacts, null, 2)}\n\nPage URL: ${pageUrl}\n\n${wrapUntrusted(sanitizeHtmlForPrompt(html.substring(0, 50000)), 'website_html')}`,
 913      },
 914    ];
 915  
 916    const response = await openRouterBreaker.fire(async () =>
 917      openRouterLimiter.schedule(() =>
 918        callLLM({
 919          model: ENRICHMENT_MODEL,
 920          messages,
 921          temperature: 0.1,
 922          max_tokens: 4000,
 923          json_mode: true,
 924          stage: 'enrichment',
 925          siteId,
 926        })
 927      )
 928    );
 929  
 930    // Parse JSON response
 931    const result = safeJsonParse(response.content);
 932  
 933    if (!result) {
 934      throw new Error('Failed to parse LLM response as JSON');
 935    }
 936  
 937    // Validate LLM response (check email/phone formats, drop unexpected fields)
 938    validateEnrichmentResponse(result);
 939  
 940    // Clean up invalid Twitter/X.com links
 941    const cleanedResult = cleanInvalidSocialLinks(result);
 942  
 943    return cleanedResult;
 944  }
 945  
 946  /**
 947   * Extract initial contacts from landing page HTML
 948   * @param {string} url - Landing page URL
 949   * @param {string} html - HTML DOM
 950   * @param {number} siteId
 951   * @returns {Promise<Object>} Initial contacts_json structure
 952   */
 953  async function extractInitialContacts(url, html, siteId = null) {
 954    try {
 955      const messages = [
 956        {
 957          role: 'system',
 958          content: `Extract contact information from this HTML page. Return JSON with this structure:
 959  {
 960    "business_name": "Company Name",
 961    "email_addresses": [{ "email": "info@example.com", "label": "General", "source": "//a[@href='mailto:info@example.com']" }],
 962    "phone_numbers": [{ "number": "+1234567890", "label": "Office", "source": "//span[@class='phone']" }],
 963    "social_profiles": [{ "url": "https://twitter.com/handle", "label": "Twitter", "source": "//a[@href='https://twitter.com/handle']" }],
 964    "key_pages": ["https://example.com/contact", "https://example.com/about"],
 965    "primary_contact_form": {
 966      "form_url": "https://example.com/contact",
 967      "form_action_url": "/submit-contact"
 968    }
 969  }
 970  For each contact item, include a "source" field with the XPath where you found it in the HTML.
 971  Return empty arrays for missing fields. If no contact form is found, omit primary_contact_form.`,
 972        },
 973        {
 974          role: 'user',
 975          content: `URL: ${url}\n\n${wrapUntrusted(sanitizeHtmlForPrompt(html ? html.substring(0, 50000) : 'No HTML available'), 'website_html')}`,
 976        },
 977      ];
 978  
 979      const response = await openRouterBreaker.fire(async () =>
 980        openRouterLimiter.schedule(() =>
 981          callLLM({
 982            model: ENRICHMENT_MODEL,
 983            messages,
 984            temperature: 0.1,
 985            max_tokens: 2000,
 986            json_mode: true,
 987            stage: 'enrichment',
 988            siteId,
 989          })
 990        )
 991      );
 992  
 993      const result = safeJsonParse(response.content);
 994      if (!result) {
 995        info(`  Failed to parse LLM response, using minimal contact structure for ${url}`);
 996        // Return minimal structure if parsing fails
 997        return {
 998          business_name: '',
 999          email_addresses: [],
1000          phone_numbers: [],
1001          social_profiles: [],
1002          key_pages: [url],
1003        };
1004      }
1005  
1006      // Validate LLM response (check email/phone formats, drop unexpected fields)
1007      validateEnrichmentResponse(result);
1008  
1009      // Clean phone numbers and log transformations
1010      if (result.phone_numbers && Array.isArray(result.phone_numbers)) {
1011        result.phone_numbers = cleanPhoneNumbers(result.phone_numbers, url);
1012      }
1013  
1014      // Clean up invalid Twitter/X.com links
1015      const cleanedResult = cleanInvalidSocialLinks(result);
1016  
1017      return cleanedResult;
1018    } catch (err) {
1019      error(`  Error extracting contacts from ${url}: ${err.message}`);
1020      // Return minimal structure on error
1021      return {
1022        business_name: '',
1023        email_addresses: [],
1024        phone_numbers: [],
1025        social_profiles: [],
1026        key_pages: [url],
1027      };
1028    }
1029  }
1030  
/**
 * Get enrichment statistics
 *
 * Counts the sites whose enriched_at is set. The with_forms, with_emails and
 * with_phones columns are literal zeros in the query, so they are always 0.
 *
 * @returns {Promise<Object>} Enrichment statistics
 */
export async function getEnrichmentStats() {
  const statsQuery = `SELECT
      COUNT(id) as total_enriched,
      0 as with_forms,
      0 as with_emails,
      0 as with_phones
     FROM sites
     WHERE enriched_at IS NOT NULL`;

  const stats = await getOne(statsQuery);
  return stats;
}