enrich.js
1 /** 2 * Enrichment Stage 3 * Browse key_pages to extract additional contact information 4 * Only runs if we don't already have a contact form 5 */ 6 7 import { run, getOne, getAll, withTransaction } from '../utils/db.js'; 8 import { 9 launchStealthBrowser, 10 createStealthContext, 11 humanScroll, 12 randomDelay, 13 isSocialMediaUrl, 14 waitForCloudflare, 15 } from '../utils/stealth-browser.js'; 16 import { readFileSync } from 'fs'; 17 import { join, dirname } from 'path'; 18 import { fileURLToPath } from 'url'; 19 import Logger from '../utils/logger.js'; 20 import { generateStageCompletion, displayProgress } from '../utils/summary-generator.js'; 21 import { processBatch } from '../utils/error-handler.js'; 22 import { callLLM } from '../utils/llm-provider.js'; 23 import { openRouterLimiter } from '../utils/rate-limiter.js'; 24 import { openRouterBreaker } from '../utils/circuit-breaker.js'; 25 import { safeJsonParse } from '../utils/error-handler.js'; 26 import { normalizePhoneNumber, isValidSmsNumber } from '../utils/phone-normalizer.js'; 27 import { getAdaptiveConcurrencyFast } from '../utils/adaptive-concurrency.js'; 28 import { checkBlocklist } from '../utils/site-filters.js'; 29 import { verifyCompanyEmail, batchVerifyEmails } from '../utils/gdpr-verification.js'; 30 import { getCountryByCode, normaliseCountryCode } from '../config/countries.js'; 31 import { recordFailure, resetRetries } from '../utils/retry-handler.js'; 32 import { parseCountryFromGoogleDomain } from '../utils/tld-detector.js'; 33 import { cleanInvalidSocialLinks } from '../contacts/prioritize.js'; 34 import { extractFromSocialProfiles } from '../utils/social-contact-extractor.js'; 35 import { 36 extractContactsFromHtml, 37 mergeExtractedContacts, 38 } from '../utils/html-contact-extractor.js'; 39 import { sanitizeHtmlForPrompt, wrapUntrusted } from '../utils/llm-sanitizer.js'; 40 import { validateEnrichmentResponse } from '../utils/llm-response-validator.js'; 41 import { 42 readHtmlDom, 43 writeKeyPagesHtml as writeKeyPagesHtmlFile, 44 readKeyPagesHtml as readKeyPagesHtmlFile, 45 } from '../utils/html-storage.js'; 46 import { getScoreJsonWithFallback } from '../utils/score-storage.js'; 47 import { getContactsJsonWithFallback, setContactsJson } from '../utils/contacts-storage.js'; 48 import { compactContacts } from '../utils/compact-contacts.js'; 49 50 const __filename = fileURLToPath(import.meta.url); 51 const __dirname = dirname(__filename); 52 const projectRoot = join(__dirname, '../..'); 53 54 const logger = new Logger('Enrich'); 55 const success = (...args) => logger.success(...args); 56 const info = (...args) => logger.info(...args); 57 const error = (...args) => logger.error(...args); 58 59 const ENGLISH_ONLY_MARKETS = process.env.ENGLISH_ONLY_MARKETS 60 ? process.env.ENGLISH_ONLY_MARKETS.split(',').map(c => c.trim().toUpperCase()) 61 : null; 62 63 // Load enrichment prompt 64 const ENRICHMENT_PROMPT = readFileSync(join(projectRoot, 'prompts/ENRICHMENT.md'), 'utf-8'); 65 66 // Model configuration (from env or default) 67 const ENRICHMENT_MODEL = process.env.ENRICHMENT_MODEL || 'openai/gpt-4o-mini'; 68 69 /** Deduplicate social_profiles by URL (handles both string and {url, label} formats) */ 70 function dedupeByUrl(profiles) { 71 const seen = new Set(); 72 return profiles.filter(p => { 73 const url = typeof p === 'string' ? p : p?.url; 74 if (!url || seen.has(url)) return false; 75 seen.add(url); 76 return true; 77 }); 78 } 79 80 /** 81 * Clean phone numbers from LLM output 82 * - Supports both object format {number, label} and legacy string format 83 * - Normalizes to E.164 format 84 * - Logs transformations for troubleshooting 85 * @param {Array<Object|string>} phones - Phone numbers from LLM 86 * @param {string} context - URL or identifier for logging 87 * @returns {Array<Object>} Cleaned phone numbers in object format 88 */ 89 function cleanPhoneNumbers(phones, context) { 90 if (!Array.isArray(phones)) return []; 91 92 return phones 93 .map((phone, idx) => { 94 // Handle legacy string format 95 if (typeof phone === 'string') { 96 const cleaned = normalizePhoneNumber(phone); 97 if (cleaned !== phone) { 98 info(` Phone normalization [${context}]: '${phone}' → '${cleaned}'`); 99 } 100 return { number: cleaned, label: '' }; 101 } 102 103 // Handle object format {number, label} 104 if (phone.number) { 105 const original = phone.number; 106 const cleaned = normalizePhoneNumber(original); 107 if (cleaned !== original) { 108 info(` Phone normalization [${context}]: '${original}' → '${cleaned}'`); 109 } 110 return { 111 number: cleaned, 112 label: phone.label || '', 113 }; 114 } 115 116 // Invalid format 117 error(` Invalid phone format at index ${idx} for ${context}: ${JSON.stringify(phone)}`); 118 return null; 119 }) 120 .filter(p => { 121 if (!p) return false; 122 const reason = isValidSmsNumber(p.number); 123 if (reason) { 124 info(` Phone rejected [${context}]: ${p.number} — ${reason}`); 125 return false; 126 } 127 return true; 128 }); 129 } 130 131 /** 132 * Run the enrichment stage 133 * @param {Object} options - Stage options 134 * @param {number} options.limit - Limit number of sites to enrich 135 * @param {number} options.concurrency - Number of concurrent enrichment operations (default: 8) 136 * @param {string|string[]} [options.statusFilter] - Optional status value(s) to filter by instead 137 * of the default ENABLE_VISION-derived status. Pass a single string or array of strings. 138 * @returns {Promise<Object>} Stage results 139 */ 140 export async function runEnrichmentStage(options = {}) { 141 const startTime = Date.now(); 142 // Use ENRICHMENT_CONCURRENCY (defaults to 8) for parallel browser + LLM operations 143 // Each concurrent operation runs a browser context + multiple LLM calls 144 const concurrency = 145 options.concurrency || parseInt(process.env.ENRICHMENT_CONCURRENCY || '8', 10); 146 147 try { 148 info('Starting Enrichment Stage...'); 149 150 // Check ENABLE_VISION flag to determine input status 151 const ENABLE_VISION = process.env.ENABLE_VISION !== 'false'; 152 153 // Show deprecation warning if old flags are used 154 const legacyFlags = [ 155 process.env.USE_COMPUTER_VISION_SCORING, 156 process.env.USE_COMPUTER_VISION_RESCORING, 157 process.env.USE_COMPUTER_VISION_ENRICHMENT, 158 ]; 159 if (legacyFlags.some(flag => flag !== undefined)) { 160 logger.warn( 161 '[enrich] WARN: Vision flags (USE_COMPUTER_VISION_*) are deprecated. Use ENABLE_VISION instead.' 162 ); 163 } 164 165 // semantic_scored = HTML-only mode (ENABLE_VISION=false), semantic Haiku pass complete 166 // vision_scored = vision mode (ENABLE_VISION=true), vision rescoring complete 167 const defaultInputStatus = ENABLE_VISION ? 'vision_scored' : 'semantic_scored'; 168 169 // statusFilter may be injected (e.g. 2Step passes 'reviews_downloaded'). 170 // Supports a single string or an array. Falls back to the default derived from ENABLE_VISION. 171 const rawStatusFilter = options.statusFilter || defaultInputStatus; 172 const statusValues = Array.isArray(rawStatusFilter) ? rawStatusFilter : [rawStatusFilter]; 173 174 info( 175 `[enrich] Mode: ${ENABLE_VISION ? 'vision enabled' : 'HTML-only (vision disabled)'} - pulling from status IN (${statusValues.map(s => `'${s}'`).join(', ')})` 176 ); 177 178 // Status-based gate: only enrich sites that have completed the appropriate scoring stage. 179 // The status IS the gate — no json_extract needed. 180 const englishFilter = ENGLISH_ONLY_MARKETS 181 ? `AND country_code = ANY($2::text[])` 182 : ''; 183 const sitesQuery = ` 184 SELECT id, domain, landing_page_url as url, country_code 185 FROM sites 186 WHERE status = ANY($1::text[]) 187 AND (enriched_at IS NULL OR error_message IS NOT NULL) 188 AND html_dom IS NOT NULL 189 ${englishFilter} 190 ORDER BY CASE WHEN country_code IN ('AU','CA','GB','IE','IN','NZ','US','ZA') THEN 0 ELSE 1 END ASC 191 ${options.limit ? `LIMIT ${options.limit}` : ''} 192 `; 193 194 const sitesParams = ENGLISH_ONLY_MARKETS 195 ? [statusValues, ENGLISH_ONLY_MARKETS] 196 : [statusValues]; 197 const sites = await getAll(sitesQuery, sitesParams); 198 199 if (sites.length === 0) { 200 info('No sites need enrichment'); 201 return { 202 processed: 0, 203 succeeded: 0, 204 failed: 0, 205 skipped: 0, 206 duration: Date.now() - startTime, 207 }; 208 } 209 210 // Filter out blocklisted sites (directories/social media/franchises) 211 let ignoredCount = 0; 212 for (const site of sites) { 213 const blocked = checkBlocklist(site.domain, site.country_code); 214 if (blocked) { 215 await run( 216 `UPDATE sites SET status = 'ignored', error_message = $1 WHERE id = $2`, 217 [blocked.reason, site.id] 218 ); 219 ignoredCount++; 220 } 221 } 222 223 if (ignoredCount > 0) { 224 info(`Marked ${ignoredCount} sites as ignored (directories/social media)`); 225 } 226 227 // All sites go through browser enrichment — no fast-path skip. 228 // Even sites with contact forms may have additional emails/phones on key pages. 229 info(`Found ${sites.length} sites to enrich (all go through browser + regex)`); 230 231 const stats = { 232 processed: 0, 233 succeeded: 0, 234 failed: 0, 235 skipped: 0, 236 formsFound: 0, 237 emailsFound: 0, 238 phonesFound: 0, 239 }; 240 241 // Launch browser once for all enrichment operations with smart stealth 242 // Will use aggressive stealth for social media, minimal for prospect sites 243 const browser = await launchStealthBrowser({ 244 headless: true, 245 stealthLevel: 'standard', // Base level, will be adjusted per-page in scrapePage 246 }); 247 248 try { 249 // Process sites in batches 250 const { results, errors } = await processBatch( 251 sites, 252 // eslint-disable-next-line require-await -- Wrapper for async function enrichSite 253 async (site, index) => { 254 displayProgress(index + 1, sites.length, `Enriching ${site.url}`); 255 return enrichSite( 256 site.id, 257 site.url, 258 getContactsJsonWithFallback(site.id, site), 259 readHtmlDom(site.id), 260 getScoreJsonWithFallback(site.id, site), 261 browser 262 ); 263 }, 264 { 265 concurrency, 266 getDynamicConcurrency: () => getAdaptiveConcurrencyFast(1, 8, 'ENRICHMENT_CONCURRENCY'), 267 } 268 ); 269 270 // Count successes and failures 271 stats.processed = sites.length; 272 stats.succeeded = results.filter(r => r !== null).length; 273 stats.failed = errors.length; 274 275 // Count enrichments 276 /* eslint-disable max-depth -- Necessary for counting nested result stats */ 277 for (const result of results) { 278 if (result) { 279 if (result.formFound) stats.formsFound++; 280 if (result.emailsFound) stats.emailsFound += result.emailsFound; 281 if (result.phonesFound) stats.phonesFound += result.phonesFound; 282 } 283 } 284 /* eslint-enable max-depth */ 285 286 // Log errors 287 for (const err of errors) { 288 const errorMsg = err.message || err.toString(); 289 error(` Enrichment error: ${errorMsg}`); 290 } 291 292 stats.duration = Date.now() - startTime; 293 generateStageCompletion('Enrichment', stats); 294 info( 295 `Found: ${stats.formsFound} forms, ${stats.emailsFound} emails, ${stats.phonesFound} phones` 296 ); 297 298 return stats; 299 } finally { 300 await browser.close(); 301 } 302 } catch (err) { 303 error(`Enrichment stage failed: ${err.message}`); 304 throw err; 305 } 306 } 307 308 /** 309 * Discover contact-related pages from sitemap.xml when LLM found no key_pages. 310 * Fetches /sitemap.xml (and follows sitemap index entries), extracts <loc> URLs, 311 * and filters them against the multilingual contact page pattern. 312 * @param {string} siteUrl - The site's landing page URL 313 * @param {RegExp} pattern - Contact page pattern to match against 314 * @returns {Promise<string[]>} Array of matching URLs (may be empty) 315 */ 316 async function discoverContactPagesFromSitemap(siteUrl, pattern) { 317 try { 318 const { origin } = new URL(siteUrl); 319 const sitemapUrl = `${origin}/sitemap.xml`; 320 321 const controller = new AbortController(); 322 const timeout = setTimeout(() => controller.abort(), 10000); 323 324 const resp = await fetch(sitemapUrl, { 325 signal: controller.signal, 326 headers: { 'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1)' }, 327 }); 328 clearTimeout(timeout); 329 330 if (!resp.ok) return []; 331 332 const xml = await resp.text(); 333 if (!xml || xml.length < 20) return []; 334 335 // Extract all <loc> URLs from the sitemap 336 const locMatches = xml.match(/<loc>\s*(https?:\/\/[^<]+?)\s*<\/loc>/gi) || []; 337 const urls = locMatches.map(m => m.replace(/<\/?loc>/gi, '').trim()); 338 339 // If this is a sitemap index, fetch the first few child sitemaps 340 const isSitemapIndex = /<sitemapindex/i.test(xml); 341 const allUrls = [...urls]; 342 343 if (isSitemapIndex) { 344 // Only fetch up to 3 child sitemaps to avoid excessive requests 345 const childSitemaps = urls.slice(0, 3); 346 for (const childUrl of childSitemaps) { 347 try { 348 const childCtrl = new AbortController(); 349 const childTimeout = setTimeout(() => childCtrl.abort(), 8000); 350 const childResp = await fetch(childUrl, { 351 signal: childCtrl.signal, 352 headers: { 'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1)' }, 353 }); 354 clearTimeout(childTimeout); 355 356 if (childResp.ok) { 357 const childXml = await childResp.text(); 358 const childLocs = childXml.match(/<loc>\s*(https?:\/\/[^<]+?)\s*<\/loc>/gi) || []; 359 allUrls.push(...childLocs.map(m => m.replace(/<\/?loc>/gi, '').trim())); 360 } 361 } catch { 362 // Skip failed child sitemaps 363 } 364 } 365 } 366 367 // Filter URLs matching the contact page pattern 368 const matches = allUrls.filter(u => pattern.test(u)); 369 370 // Deduplicate and limit to 5 pages max 371 const unique = [...new Set(matches)].slice(0, 5); 372 373 if (unique.length > 0) { 374 info(` Sitemap fallback found ${unique.length} contact page(s)`); 375 } 376 377 return unique; 378 } catch { 379 // Network errors, timeouts, invalid URLs — all expected for many sites 380 return []; 381 } 382 } 383 384 /** 385 * Enrich a single site by browsing key_pages 386 * @param {number} siteId - Site ID 387 * @param {string} url - Site URL 388 * @param {string} contactsJsonStr - Current contacts_json (can be NULL) 389 * @param {string} htmlDom - HTML DOM for initial contact extraction 390 * @param {string} scoreJsonStr - Score JSON string 391 * @param {Browser} browser - Playwright browser instance 392 * @returns {Promise<Object>} Enrichment result 393 */ 394 // eslint-disable-next-line complexity -- Contact enrichment requires checking multiple contact types and merging arrays 395 async function enrichSite(siteId, url, contactsJsonStr, htmlDom, scoreJsonStr, browser) { 396 try { 397 // If contacts_json is NULL, try to get contact_details from score_json first 398 let contactsJson; 399 if (!contactsJsonStr) { 400 // First try to get contacts from rescoring (includes vision text extraction) 401 if (scoreJsonStr) { 402 try { 403 const scoreJson = JSON.parse(scoreJsonStr); 404 if (scoreJson.contact_details) { 405 info(` Using contacts from rescoring (with vision text) for ${url}`); 406 contactsJson = scoreJson.contact_details; 407 408 // Clean phone numbers to E.164 format 409 if (contactsJson.phone_numbers) { 410 contactsJson.phone_numbers = cleanPhoneNumbers(contactsJson.phone_numbers, url); 411 } 412 413 // Clean up invalid Twitter/X.com links 414 contactsJson = cleanInvalidSocialLinks(contactsJson); 415 } 416 } catch (err) { 417 error(` Failed to parse score_json: ${err.message}`); 418 } 419 } 420 421 // Fallback to extracting from HTML if not found in score_json. 422 // The orchestrator's enrich_sites batch handles LLM contact extraction via Claude Max 423 // (claude -p, zero API cost). The pipeline stage uses regex-only here so we don't 424 // double-spend on OpenRouter calls that the orchestrator will handle. 425 if (!contactsJson) { 426 const ENABLE_ENRICHMENT_LLM_FLAG = process.env.ENABLE_ENRICHMENT_LLM !== 'false'; 427 428 if (ENABLE_ENRICHMENT_LLM_FLAG) { 429 info( 430 ` [orchestrator] Using regex extraction for ${url} (enrich_sites batch handles LLM via Claude Max)` 431 ); 432 contactsJson = extractContactsFromHtml(htmlDom || '', url); 433 } else { 434 info(` Extracting initial contacts from ${url}`); 435 contactsJson = await extractInitialContacts(url, htmlDom, siteId); 436 } 437 } 438 } else { 439 contactsJson = JSON.parse(contactsJsonStr); 440 // Clean up invalid Twitter/X.com links from existing contacts_json 441 contactsJson = cleanInvalidSocialLinks(contactsJson); 442 } 443 444 // Regex pre-pass: extract contacts from html_dom without any LLM call. 445 // Runs unconditionally — cheap, and catches emails/phones/socials/key_pages the LLM missed. 446 const realHtml = htmlDom && htmlDom !== 'HTML removed after scoring' ? htmlDom : null; 447 if (realHtml) { 448 const regexContacts = extractContactsFromHtml(realHtml, url); 449 const before = 450 (contactsJson.email_addresses?.length || 0) + 451 (contactsJson.phone_numbers?.length || 0) + 452 (contactsJson.key_pages?.length || 0) + 453 (contactsJson.social_profiles?.length || 0); 454 contactsJson = mergeExtractedContacts(contactsJson, regexContacts, url); 455 const after = 456 (contactsJson.email_addresses?.length || 0) + 457 (contactsJson.phone_numbers?.length || 0) + 458 (contactsJson.key_pages?.length || 0) + 459 (contactsJson.social_profiles?.length || 0); 460 if (after > before) { 461 info( 462 ` Regex pre-pass added ${after - before} contacts/pages for ${url} (emails:${contactsJson.email_addresses?.length || 0} phones:${contactsJson.phone_numbers?.length || 0} social:${contactsJson.social_profiles?.length || 0} pages:${contactsJson.key_pages?.length || 0})` 463 ); 464 } 465 } 466 467 // Get key_pages that look like contact pages (multilingual) 468 const keyPages = contactsJson.key_pages || []; 469 const contactPagePattern = 470 /contact|support|get-in-touch|about|kontakt|contacto|contato|a-propos|apropos|chi-siamo|uber-uns|ueber-uns|impressum|mentions-legales|aviso-legal|datenschutz|privacidad|hubungi|kontak/i; 471 const contactPages = keyPages.filter(pageUrl => contactPagePattern.test(pageUrl.toLowerCase())); 472 473 // Sitemap fallback: if no contact pages from LLM, check sitemap.xml 474 if (contactPages.length === 0) { 475 const sitemapPages = await discoverContactPagesFromSitemap(url, contactPagePattern); 476 contactPages.push(...sitemapPages); 477 } 478 479 if (contactPages.length === 0) { 480 info(` No contact pages found for ${url}`); 481 482 // Check if we actually have any usable contacts 483 const hasEmail = contactsJson.email_addresses?.length > 0; 484 const hasPhone = contactsJson.phone_numbers?.length > 0; 485 const hasSocial = contactsJson.social_profiles?.length > 0; 486 const hasForm = !!contactsJson.primary_contact_form?.form_url; 487 const hasAnyContact = hasEmail || hasPhone || hasSocial || hasForm; 488 489 if (!hasAnyContact) { 490 // Zero contacts found and no key_pages to browse — don't mark as enriched. 491 // Record as a retriable failure so the site gets another pass 492 // (e.g. after html_dom becomes available or on next pipeline cycle). 493 const retryStatus = 494 process.env.ENABLE_VISION !== 'false' ? 'vision_scored' : 'semantic_scored'; 495 info(` No contacts found for ${url} — keeping at ${retryStatus} for retry`); 496 await recordFailure(siteId, 'enrichment', 'No contacts or key_pages found', retryStatus); 497 return { formFound: false, emailsFound: 0, phonesFound: 0 }; 498 } 499 500 // Has some contacts but no contact pages to browse — mark enriched_regex so 501 // the orchestrator LLM pass can still run against html_dom 502 const city = contactsJson.city || null; 503 const countryCode = normaliseCountryCode(contactsJson.country_code) || null; 504 505 // Write contacts to filesystem (contacts_json column was dropped in migration 121) 506 contactsJson = compactContacts(contactsJson); 507 setContactsJson(siteId, JSON.stringify(contactsJson)); 508 509 await withTransaction(async (client) => { 510 await client.query( 511 `UPDATE sites 512 SET city = $1, 513 country_code = $2, 514 status = 'enriched_regex', 515 enriched_at = NOW(), 516 error_message = NULL 517 WHERE id = $3`, 518 [city, countryCode, siteId] 519 ); 520 await resetRetries(siteId); 521 }); 522 523 return { formFound: false, emailsFound: 0, phonesFound: 0 }; 524 } 525 526 info(` Browsing ${contactPages.length} pages for ${url}`); 527 528 const newContactInfo = {}; 529 const keyPagesHtml = {}; // Accumulate {url: renderedHtml} for orchestrator LLM pass 530 let foundForm = false; 531 532 // Browse each contact page 533 for (const pageUrl of contactPages) { 534 if (foundForm) break; // Stop if we found a form 535 536 const pageData = await scrapePage(pageUrl, browser); 537 538 // Store rendered HTML for orchestrator LLM pass (cleared after enriched_llm) 539 if (pageData.html) { 540 keyPagesHtml[pageUrl] = pageData.html; 541 } 542 543 // Extract contacts — orchestrator batch (Claude Max) or regex-only. 544 // The enrich_sites orchestrator batch handles LLM extraction from html_dom via claude -p. 545 // Browser-navigated pages use regex extraction here (orchestrator handles the html_dom pass). 546 const ENABLE_ENRICHMENT_LLM = process.env.ENABLE_ENRICHMENT_LLM !== 'false'; 547 let enrichmentResult; 548 549 { 550 // Regex-only extraction for browser-navigated pages. 551 // The enrich_sites orchestrator batch handles LLM extraction from html_dom via claude -p 552 // (Claude Max, zero API cost). OpenRouter enrichment calls are no longer used. 553 const regexResult = extractContactsFromHtml(pageData.html || '', pageUrl); 554 enrichmentResult = { 555 email_addresses: regexResult.email_addresses, 556 phone_numbers: regexResult.phone_numbers, 557 social_profiles: regexResult.social_profiles, 558 key_pages: regexResult.key_pages, 559 primary_contact_form: regexResult.has_contact_form ? pageUrl : null, 560 }; 561 } 562 563 // Merge new contact info 564 if (enrichmentResult.primary_contact_form) { 565 newContactInfo.primary_contact_form = enrichmentResult.primary_contact_form; 566 foundForm = true; 567 } 568 569 // Merge location info (prefer first found) 570 if (enrichmentResult.city && !contactsJson.city) { 571 newContactInfo.city = enrichmentResult.city; 572 } 573 if (enrichmentResult.country_code && !contactsJson.country_code) { 574 newContactInfo.country_code = enrichmentResult.country_code; 575 } 576 577 // Merge arrays (avoid duplicates) 578 if (enrichmentResult.email_addresses) { 579 newContactInfo.email_addresses = [ 580 ...(newContactInfo.email_addresses || []), 581 ...enrichmentResult.email_addresses, 582 ]; 583 } 584 if (enrichmentResult.phone_numbers) { 585 newContactInfo.phone_numbers = [ 586 ...(newContactInfo.phone_numbers || []), 587 ...enrichmentResult.phone_numbers, 588 ]; 589 } 590 if (enrichmentResult.social_profiles) { 591 newContactInfo.social_profiles = [ 592 ...(newContactInfo.social_profiles || []), 593 ...enrichmentResult.social_profiles, 594 ]; 595 } 596 if (enrichmentResult.key_pages) { 597 newContactInfo.key_pages = [ 598 ...(newContactInfo.key_pages || []), 599 ...enrichmentResult.key_pages, 600 ]; 601 } 602 if (enrichmentResult.business_name && !contactsJson.business_name) { 603 newContactInfo.business_name = enrichmentResult.business_name; 604 } 605 } 606 607 // Merge new contact info with existing 608 let enrichedContacts = { 609 ...contactsJson, 610 ...newContactInfo, 611 // Merge arrays without duplicates 612 email_addresses: [ 613 ...(contactsJson.email_addresses || []), 614 ...(newContactInfo.email_addresses || []), 615 ], 616 phone_numbers: cleanPhoneNumbers( 617 [...(contactsJson.phone_numbers || []), ...(newContactInfo.phone_numbers || [])], 618 url 619 ), 620 social_profiles: dedupeByUrl([ 621 ...(contactsJson.social_profiles || []), 622 ...(newContactInfo.social_profiles || []), 623 ]), 624 key_pages: Array.from( 625 new Set([...(contactsJson.key_pages || []), ...(newContactInfo.key_pages || [])]) 626 ), 627 }; 628 629 // Social profile contact extraction (YouTube via raw HTTP, others via Playwright stealth) 630 const extractableProfiles = (enrichedContacts.social_profiles || []).filter(sp => { 631 const spUrl = typeof sp === 'string' ? sp : sp.url; 632 return spUrl && /youtube\.com|facebook\.com|yelp\.com|instagram\.com|linkedin\.com\/company/i.test(spUrl) 633 && !/profile\.php\?id=|\/groups\/|\/intent\//i.test(spUrl); 634 }); 635 if (extractableProfiles.length > 0) { 636 const socialContacts = await extractFromSocialProfiles(extractableProfiles, url, browser); 637 if (socialContacts) { 638 enrichedContacts = mergeExtractedContacts(enrichedContacts, socialContacts, url); 639 if (socialContacts._city && !enrichedContacts.city) { 640 enrichedContacts.city = socialContacts._city; 641 } 642 } 643 } 644 645 // Clean up invalid Twitter/X.com links (those without usernames) 646 enrichedContacts = cleanInvalidSocialLinks(enrichedContacts); 647 648 // Extract location information from enriched contacts 649 const city = enrichedContacts.city || null; 650 const countryCode = normaliseCountryCode(enrichedContacts.country_code) || null; 651 const state = enrichedContacts.state || null; 652 653 // Check for country mismatch (site from wrong country for the search) 654 if (countryCode) { 655 const siteRow = await getOne( 656 'SELECT google_domain FROM sites WHERE id = $1', 657 [siteId] 658 ); 659 const googleDomain = siteRow?.google_domain; 660 661 if (googleDomain) { 662 const expectedCountry = parseCountryFromGoogleDomain(googleDomain); 663 664 // Check if detected country matches expected country 665 if (countryCode !== expectedCountry) { 666 setContactsJson(siteId, JSON.stringify(enrichedContacts)); 667 await run( 668 `UPDATE sites 669 SET status = 'ignored', 670 error_message = $1, 671 city = $2, 672 country_code = $3, 673 state = $4 674 WHERE id = $5`, 675 [ 676 `Country mismatch: Site is in ${countryCode}, but search was for ${expectedCountry} (${googleDomain})`, 677 city, 678 countryCode, 679 state, 680 siteId, 681 ] 682 ); 683 684 info( 685 ` ${url} country mismatch: Site in ${countryCode}, search for ${expectedCountry} - marked as ignored` 686 ); 687 688 return { 689 formFound: foundForm, 690 emailsFound: 0, 691 phonesFound: 0, 692 }; 693 } 694 } 695 } 696 697 // GDPR verification for EU/UK countries 698 let companyProof = null; 699 let gdprVerified = null; 700 if (countryCode) { 701 let country = null; 702 try { 703 country = getCountryByCode(countryCode); 704 } catch { 705 // Unsupported country code — skip GDPR check 706 } 707 if (country && country.requiresGDPRCheck && enrichedContacts.email_addresses?.length > 0) { 708 info(` Running GDPR verification for ${countryCode} site: ${url}`); 709 710 // Combine landing page HTML with all browsed key pages HTML. 711 // Company proof keywords (Ltd, VAT number, Registered office etc.) appear far 712 // more often on About/Legal/Contact pages than the landing page. 713 const allHtml = [htmlDom || '', ...Object.values(keyPagesHtml)].join('\n'); 714 715 // Verify all email addresses 716 const verificationResults = batchVerifyEmails({ 717 emails: enrichedContacts.email_addresses.map(e => (typeof e === 'string' ? e : e.email)), 718 html: allHtml, 719 countryCode, 720 domain: url, 721 }); 722 723 // Store verification results 724 companyProof = JSON.stringify({ 725 verifiedAt: new Date().toISOString(), 726 results: verificationResults, 727 countryCode, 728 totalEmails: enrichedContacts.email_addresses.length, 729 verifiedCount: verificationResults.filter(r => r.isVerified).length, 730 unverifiedCount: verificationResults.filter(r => !r.isVerified).length, 731 }); 732 733 // Set gdpr_verified flag based on results 734 // TRUE if at least one email is verified (high or medium confidence) 735 // FALSE if all emails are unverified (free email or uncertain) 736 const hasVerifiedEmail = verificationResults.some( 737 r => r.isVerified && ['high', 'medium'].includes(r.confidence) 738 ); 739 gdprVerified = hasVerifiedEmail ? 1 : 0; 740 741 if (hasVerifiedEmail) { 742 success( 743 ` GDPR: ${verificationResults.filter(r => r.isVerified).length}/${verificationResults.length} emails verified` 744 ); 745 } else { 746 info( 747 ` GDPR: No verified company emails found (${verificationResults.filter(r => r.reason.includes('Free email')).length} free emails)` 748 ); 749 } 750 } 751 } 752 753 // Update database with enriched contacts and GDPR verification. 754 // Status = 'enriched_regex': browser regex pass complete; orchestrator LLM pass will set 'enriched_llm'. 755 // key_pages_html written to filesystem; cleared once enriched_llm completes. 756 const hasKeyPages = Object.keys(keyPagesHtml).length > 0; 757 if (hasKeyPages) { 758 writeKeyPagesHtmlFile(siteId, keyPagesHtml); 759 } 760 761 // Wrap all DB updates for this site in a transaction so GDPR fields, 762 // status, and retry counter are always consistent — no partial writes on crash. 763 // contacts_json column was dropped in migration 121 — write to filesystem only 764 const compactedContacts = compactContacts(enrichedContacts); 765 setContactsJson(siteId, JSON.stringify(compactedContacts)); 766 767 await withTransaction(async (client) => { 768 await client.query( 769 `UPDATE sites 770 SET city = $1, 771 country_code = $2, 772 state = $3, 773 company_proof = $4, 774 gdpr_verified = $5, 775 gdpr_verified_at = CASE WHEN $6 IS NOT NULL THEN NOW() ELSE NULL END, 776 status = 'enriched_regex', 777 enriched_at = NOW(), 778 key_pages_html = $7, 779 error_message = NULL 780 WHERE id = $8`, 781 [ 782 city, 783 countryCode, 784 state, 785 companyProof, 786 gdprVerified, 787 gdprVerified, 788 hasKeyPages ? 'fs' : null, // Flag: 'fs' = stored on filesystem 789 siteId, 790 ] 791 ); 792 793 // Reset retry count on successful enrichment 794 await resetRetries(siteId); 795 }); 796 797 success( 798 ` Enriched ${url}: ${foundForm ? 'form' : ''} ${newContactInfo.email_addresses?.length || 0} emails, ${newContactInfo.phone_numbers?.length || 0} phones` 799 ); 800 801 return { 802 formFound: foundForm, 803 emailsFound: newContactInfo.email_addresses?.length || 0, 804 phonesFound: newContactInfo.phone_numbers?.length || 0, 805 }; 806 } catch (err) { 807 // Record failure and increment retry count (marks as 'failing' if limit exceeded) 808 const retryStatus = process.env.ENABLE_VISION !== 'false' ? 'vision_scored' : 'semantic_scored'; 809 await recordFailure(siteId, 'enrichment', err, retryStatus); 810 throw err; 811 } 812 } 813 814 /** 815 * Scrape a page to get HTML and screenshot with smart stealth detection 816 * @param {string} pageUrl - Page URL 817 * @param {Browser} browser - Playwright browser instance 818 * @returns {Promise<Object>} Page data 819 */ 820 async function scrapePage(pageUrl, browser) { 821 // Smart stealth level detection: aggressive for socials, minimal for prospects 822 const isSocial = isSocialMediaUrl(pageUrl); 823 const stealthLevel = isSocial ? 'aggressive' : 'minimal'; 824 825 info(` ${isSocial ? '[SOCIAL]' : '[PROSPECT]'} Using ${stealthLevel} stealth for ${pageUrl}`); 826 827 const context = await createStealthContext(browser, { 828 viewport: { width: 1280, height: 720 }, 829 stealthLevel, 830 }); 831 832 const page = await context.newPage(); 833 834 try { 835 await page.goto(pageUrl, { waitUntil: 'domcontentloaded', timeout: 30000 }); 836 837 // Wait for potential Cloudflare/Turnstile challenges 838 const cloudflareResolved = await waitForCloudflare(page, { timeout: 30000 }); 839 if (!cloudflareResolved) { 840 info(` Cloudflare/Turnstile challenge may still be blocking ${pageUrl}`); 841 } 842 843 // Human-like browsing behavior (more aggressive for socials) 844 if (isSocial) { 845 await randomDelay(1500, 2500); 846 await humanScroll(page, { distance: 'short' }); 847 await randomDelay(500, 1000); 848 } else { 849 await randomDelay(300, 700); 850 } 851 852 // Wait for page to stabilise before reading content — avoids "page is navigating" error 853 // when redirects or JS navigations fire after domcontentloaded 854 try { 855 await page.waitForLoadState('domcontentloaded', { timeout: 5000 }); 856 } catch { 857 // Already loaded or timed out — proceed anyway 858 } 859 860 // Get HTML DOM 861 const html = await page.content(); 862 863 // Take screenshot only when vision is enabled — saves bandwidth and avoids 864 // OpenRouter 400 errors from large base64 payloads at higher concurrency 865 const ENABLE_VISION = process.env.ENABLE_VISION !== 'false'; 866 let screenshot = null; 867 if (ENABLE_VISION) { 868 const buf = await page.screenshot({ 869 type: 'png', 870 fullPage: false, // Above-fold only 871 }); 872 screenshot = buf.toString('base64'); 873 } 874 875 return { 876 html, 877 screenshot, 878 }; 879 } finally { 880 await page.close(); 881 await context.close(); 882 } 883 } 884 885 /** 886 * Call LLM API for contact enrichment 887 * @param {Object} params - Enrichment parameters 888 * @param {number} params.siteId - Site ID for usage tracking 889 * @returns {Promise<Object>} Enrichment result 890 */ 891 async function callEnrichmentAPI({ pageUrl, currentContacts, html, screenshot, siteId = null }) { 892 const messages = [ 893 { 894 role: 'system', 895 content: ENRICHMENT_PROMPT, 896 }, 897 { 898 role: 'user', 899 content: screenshot 900 ? [ 901 { 902 type: 'text', 903 text: `Current contacts_json:\n${JSON.stringify(currentContacts, null, 2)}\n\nPage URL: ${pageUrl}\n\n${wrapUntrusted(sanitizeHtmlForPrompt(html.substring(0, 50000)), 'website_html')}`, 904 }, 905 { 906 type: 'image_url', 907 image_url: { 908 url: `data:image/png;base64,${screenshot}`, 909 }, 910 }, 911 ] 912 : `Current contacts_json:\n${JSON.stringify(currentContacts, null, 2)}\n\nPage URL: ${pageUrl}\n\n${wrapUntrusted(sanitizeHtmlForPrompt(html.substring(0, 50000)), 'website_html')}`, 913 }, 914 ]; 915 916 const response = await openRouterBreaker.fire(async () => 917 openRouterLimiter.schedule(() => 918 callLLM({ 919 model: ENRICHMENT_MODEL, 920 messages, 921 temperature: 0.1, 922 max_tokens: 4000, 923 json_mode: true, 924 stage: 'enrichment', 925 siteId, 926 }) 927 ) 928 ); 929 930 // Parse JSON response 931 const result = safeJsonParse(response.content); 932 933 if (!result) { 934 throw new Error('Failed to parse LLM response as JSON'); 935 } 936 937 // Validate LLM response (check email/phone formats, drop unexpected fields) 938 validateEnrichmentResponse(result); 939 940 // Clean up invalid Twitter/X.com links 941 const cleanedResult = cleanInvalidSocialLinks(result); 942 943 return cleanedResult; 944 } 945 946 /** 947 * Extract initial contacts from landing page HTML 948 * @param {string} url - Landing page URL 949 * @param {string} html - HTML DOM 950 * @param {number} siteId 951 * @returns {Promise<Object>} Initial contacts_json structure 952 */ 953 async function extractInitialContacts(url, html, siteId = null) { 954 try { 955 const messages = [ 956 { 957 role: 'system', 958 content: `Extract contact information from this HTML page. Return JSON with this structure: 959 { 960 "business_name": "Company Name", 961 "email_addresses": [{ "email": "info@example.com", "label": "General", "source": "//a[@href='mailto:info@example.com']" }], 962 "phone_numbers": [{ "number": "+1234567890", "label": "Office", "source": "//span[@class='phone']" }], 963 "social_profiles": [{ "url": "https://twitter.com/handle", "label": "Twitter", "source": "//a[@href='https://twitter.com/handle']" }], 964 "key_pages": ["https://example.com/contact", "https://example.com/about"], 965 "primary_contact_form": { 966 "form_url": "https://example.com/contact", 967 "form_action_url": "/submit-contact" 968 } 969 } 970 For each contact item, include a "source" field with the XPath where you found it in the HTML. 971 Return empty arrays for missing fields. If no contact form is found, omit primary_contact_form.`, 972 }, 973 { 974 role: 'user', 975 content: `URL: ${url}\n\n${wrapUntrusted(sanitizeHtmlForPrompt(html ? html.substring(0, 50000) : 'No HTML available'), 'website_html')}`, 976 }, 977 ]; 978 979 const response = await openRouterBreaker.fire(async () => 980 openRouterLimiter.schedule(() => 981 callLLM({ 982 model: ENRICHMENT_MODEL, 983 messages, 984 temperature: 0.1, 985 max_tokens: 2000, 986 json_mode: true, 987 stage: 'enrichment', 988 siteId, 989 }) 990 ) 991 ); 992 993 const result = safeJsonParse(response.content); 994 if (!result) { 995 info(` Failed to parse LLM response, using minimal contact structure for ${url}`); 996 // Return minimal structure if parsing fails 997 return { 998 business_name: '', 999 email_addresses: [], 1000 phone_numbers: [], 1001 social_profiles: [], 1002 key_pages: [url], 1003 }; 1004 } 1005 1006 // Validate LLM response (check email/phone formats, drop unexpected fields) 1007 validateEnrichmentResponse(result); 1008 1009 // Clean phone numbers and log transformations 1010 if (result.phone_numbers && Array.isArray(result.phone_numbers)) { 1011 result.phone_numbers = cleanPhoneNumbers(result.phone_numbers, url); 1012 } 1013 1014 // Clean up invalid Twitter/X.com links 1015 const cleanedResult = cleanInvalidSocialLinks(result); 1016 1017 return cleanedResult; 1018 } catch (err) { 1019 error(` Error extracting contacts from ${url}: ${err.message}`); 1020 // Return minimal structure on error 1021 return { 1022 business_name: '', 1023 email_addresses: [], 1024 phone_numbers: [], 1025 social_profiles: [], 1026 key_pages: [url], 1027 }; 1028 } 1029 } 1030 1031 /** 1032 * Get enrichment statistics 1033 * @returns {Promise<Object>} Enrichment statistics 1034 */ 1035 export async function getEnrichmentStats() { 1036 return await getOne( 1037 `SELECT 1038 COUNT(id) as total_enriched, 1039 0 as with_forms, 1040 0 as with_emails, 1041 0 as with_phones 1042 FROM sites 1043 WHERE enriched_at IS NOT NULL` 1044 ); 1045 }