// html-contact-extractor.js
/**
 * Programmatic contact extraction from raw HTML.
 * Zero LLM cost — pure regex. Runs as a pre-pass before the LLM enrichment call.
 *
 * Extracts:
 * - Emails: mailto: hrefs + plaintext + HTML entity decoding + text deobfuscation
 *   + data-email attributes + CloudFlare protection + CSS RTL reversal
 *   + Unicode homoglyph normalization
 * - Phones: tel: hrefs + plaintext international (+prefix) numbers
 * - Social profiles: x.com / twitter.com / linkedin.com only (usable channels);
 *   facebook / instagram / youtube collected but flagged as non-outreach
 * - Key pages: same-domain href links matching contact/about keyword patterns
 * - Contact form signal: <form> element or known WP form plugin detected
 *
 * Returns a contacts_json-compatible object ready to merge into existing contacts.
 */

// Email domains that indicate noise (internal tooling, CDNs, schema definitions, etc.)
const EMAIL_NOISE_DOMAINS = new Set([
  'sentry.io',
  'sentry-next.wixpress.com',
  'sentry.wixpress.com',
  'wixpress.com',
  'schema.org',
  'example.com',
  'example.org',
  'example.net',
  'w3.org',
  'gravatar.com',
  'parastorage.com',
  'googleusercontent.com',
  'amazonaws.com',
  'cloudfront.net',
  'wordpress.com',
  'wpengine.com',
]);

// File extension artifacts that look like emails but aren't (e.g. background.jpg@cdn.com)
const IMAGE_EXTS = new Set(['png', 'jpg', 'jpeg', 'gif', 'svg', 'webp', 'css', 'js', 'min']);

// Social platform detection config.
// usable = can we actually send outreach via this channel?
// NOTE: every `pattern` carries the /g flag and is shared module state, so a
// consumer must reset `lastIndex` (or use matchAll) before scanning a new document.
const SOCIAL_PLATFORMS = [
  {
    pattern: /https?:\/\/(?:www\.)?(x\.com|twitter\.com)\/([^"'<>\s/?#]+)/gi,
    label: url => (url.includes('twitter.com') ? 'Twitter' : 'X'),
    usable: true,
    // Exclude tracking/intent/login paths (note: pattern stops at ?, so check both /i/ and /i$)
    exclude:
      /\/(i(\/|$)|intent\/|share(\?|$)|login(\?|$)|flow\/|home$|explore$|notifications$|messages$|settings$|compose\/)/i,
  },
  {
    pattern: /https?:\/\/(?:www\.)?linkedin\.com\/(company|in)\/([^"'<>\s/?#]+)/gi,
    label: () => 'LinkedIn',
    usable: true,
    exclude: /\/(sharing\/|shareArticle|login\?)/i,
  },
  {
    pattern:
      /https?:\/\/(?:www\.)?facebook\.com\/(?!tr\?|sharer|share\.php|plugins|login|dialog)([^"'<>\s/?#][^"'<>\s]*)/gi,
    label: () => 'Facebook',
    usable: false,
  },
  {
    pattern: /https?:\/\/(?:www\.)?instagram\.com\/([^"'<>\s/?#]+)/gi,
    label: () => 'Instagram',
    usable: false,
    exclude: /\/(p\/|reel\/|explore\/|accounts\/)/i,
  },
  {
    // Handle URLs have no slash after "@" (youtube.com/@name), whereas the
    // legacy forms do (youtube.com/channel/ID), so the slash belongs to the
    // individual prefixes rather than after the whole alternation.
    pattern: /https?:\/\/(?:www\.)?youtube\.com\/(channel\/|c\/|user\/|@)([^"'<>\s/?#]+)/gi,
    label: () => 'YouTube',
    usable: false,
  },
];

// Contact-page keyword pattern (multilingual) — same as enrich.js
const CONTACT_PAGE_PATTERN =
  /\b(contact|support|get-in-touch|about|kontakt|kontakty|contacto|contato|a-propos|apropos|chi-siamo|uber-uns|ueber-uns|impressum|mentions-legales|aviso-legal|datenschutz|privacidad|hubungi|kontak)\b/i;

// Known WP/form-builder plugin signatures in HTML
const FORM_PLUGIN_PATTERNS = [
  /contact-form-7/i,
  /\bwpcf7\b/i,
  /wpforms/i,
  /gravity.?form/i,
  /ninja.?form/i,
  /forminator/i,
  /caldera.?form/i,
  /fluentform/i,
  /wc-booking/i,
];

// Words that precede numbers but indicate non-phone context (order IDs, postcodes, etc.).
// "#" is kept outside the \b group: a word boundary can never sit between
// whitespace and "#", so "\b#" would miss the common "Order # 123" shape.
const PHONE_NOISE_RE =
  /(?:\b(?:order|ref|invoice|abn|acn|postcode|zip|color|colour)|#)\s*$/i;

// ── Obfuscation decoders ────────────────────────────────────────────────────
// Named entities worth decoding for contact extraction (lower-cased keys).
const NAMED_HTML_ENTITIES = { commat: '@', period: '.', amp: '&', lt: '<', gt: '>' };

/**
 * Decode HTML character entities to their Unicode equivalents.
 * Returns a new string — the input is never mutated.
 *
 * Handles:
 *   &#64;    → @  (decimal)
 *   &#x40;   → @  (hex)
 *   &commat; → @  (named)
 *   &#46;    → .  (decimal dot)
 *   &period; &amp; &lt; &gt; (named)
 *
 * All entity forms are decoded in a single pass, so text produced by one
 * replacement is never decoded a second time ("&amp;lt;" becomes "&lt;", not "<").
 *
 * @param {string} text - HTML or text possibly containing character entities
 * @returns {string} Text with the supported entities replaced
 */
function decodeHtmlEntities(text) {
  return text.replace(
    /&(?:#(\d+)|#x([0-9a-f]+)|(commat|period|amp|lt|gt));/gi,
    (entity, decimal, hex, name) => {
      if (name !== undefined) return NAMED_HTML_ENTITIES[name.toLowerCase()];
      const codePoint =
        decimal !== undefined ? Number.parseInt(decimal, 10) : Number.parseInt(hex, 16);
      // fromCodePoint throws a RangeError above U+10FFFF; leave such junk untouched.
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
    }
  );
}

// Look-alike characters used in email obfuscation, mapped to their ASCII form.
const HOMOGLYPH_TO_ASCII = new Map([
  ['\uFF20', '@'], // ＠ FULLWIDTH COMMERCIAL AT
  ['\uFF0E', '.'], // ． FULLWIDTH FULL STOP
  ['\u0430', 'a'], // Cyrillic а
  ['\u0435', 'e'], // Cyrillic е
  ['\u043E', 'o'], // Cyrillic о
  ['\u0440', 'r'], // Cyrillic р
  ['\u0441', 'c'], // Cyrillic с
  ['\u0456', 'i'], // Cyrillic і
]);

/**
 * Normalize Unicode homoglyph characters to their ASCII equivalents.
 * Covers full-width punctuation and common Cyrillic look-alikes used in email obfuscation.
 *
 * @param {string} text - Text to normalize
 * @returns {string} Text with every mapped homoglyph replaced by its ASCII character
 */
function normalizeHomoglyphs(text) {
  return text.replace(/[\uFF20\uFF0E\u0430\u0435\u043E\u0440\u0441\u0456]/g, glyph =>
    HOMOGLYPH_TO_ASCII.get(glyph)
  );
}
/**
 * Normalize text-based email obfuscation to recoverable email patterns.
 *
 * Bracket variants are always intentional — replaced globally:
 *   info [at] site [dot] com  → info@site.com
 *   info(at)site(dot)com      → info@site.com
 *
 * Space-word variants use a targeted regex that requires the full email
 * structure to avoid matching natural language like "look at this":
 *   info at site dot com      → info@site.com  (only when dot follows)
 *   info at site.com          → info@site.com  (when literal . is present)
 *
 * @param {string} text - Text that may contain obfuscated addresses
 * @returns {string} Text with the obfuscation rewritten into plain addresses
 */
function deobfuscateEmailText(text) {
  // Bracket/paren/brace variants — always safe, natural language never uses [at]
  const bracketsResolved = text
    .replace(/\s*\[at\]\s*/gi, '@')
    .replace(/\s*\(at\)\s*/gi, '@')
    .replace(/\s*\{at\}\s*/gi, '@')
    .replace(/\s*\[dot\]\s*/gi, '.')
    .replace(/\s*\(dot\)\s*/gi, '.')
    .replace(/\s*\{dot\}\s*/gi, '.');

  // Space-word " at " followed by a domain with a literal dot: "info at company.com"
  const literalDotResolved = bracketsResolved.replace(
    /\b([a-zA-Z0-9._%+-]+)\s+at\s+([a-zA-Z0-9][a-zA-Z0-9.-]*\.[a-zA-Z]{2,})\b/gi,
    (_, local, domain) => `${local}@${domain}`
  );

  // Space-word " at ... dot " pattern: "info at company dot com" or "info at co dot co dot nz".
  // Requires BOTH "at" and "dot" to form email structure (prevents "look at this").
  return literalDotResolved.replace(
    /\b([a-zA-Z0-9._%+-]+)\s+at\s+([a-zA-Z0-9][a-zA-Z0-9.-]*)\s+dot\s+([a-zA-Z]{2,}(?:\s+dot\s+[a-zA-Z]{2,})*)\b/gi,
    (_, local, domain, tld) => `${local}@${domain}.${tld.replace(/\s+dot\s+/gi, '.')}`
  );
}

/**
 * Decode a CloudFlare email protection hex string.
 * Format: data-cfemail="HEXSTRING" where the first byte is the XOR key and
 * every following byte is one character XOR-ed with that key.
 *
 * @param {string} hexStr - Value of the data-cfemail attribute
 * @returns {string|null} The decoded string if it contains "@", otherwise null
 *   (also null for missing, odd-length, too-short or non-hex input)
 */
function decodeCfEmail(hexStr) {
  if (typeof hexStr !== 'string' || hexStr.length < 4 || hexStr.length % 2 !== 0) return null;
  // parseInt never throws: on non-hex text it yields NaN (and NaN ^ key === key)
  // or a partial parse, which would fabricate characters. Validate up front.
  if (!/^[0-9a-f]+$/i.test(hexStr)) return null;

  const key = Number.parseInt(hexStr.slice(0, 2), 16);
  let email = '';
  for (let i = 2; i < hexStr.length; i += 2) {
    email += String.fromCharCode(Number.parseInt(hexStr.slice(i, i + 2), 16) ^ key);
  }
  return email.includes('@') ? email : null;
}

// ── Shared helpers ──────────────────────────────────────────────────────────
/**
 * Decode a URL-encoded string (e.g. tel:%20john → tel: john).
 * Falls back to the original string when it is not valid percent-encoding.
 *
 * @param {string} str - Possibly percent-encoded text
 * @returns {string} Decoded text, or `str` unchanged on a decoding error
 */
function safeDecode(str) {
  try {
    return decodeURIComponent(str);
  } catch {
    // Deliberate best-effort: malformed escapes are kept verbatim.
    return str;
  }
}

/**
 * Normalise a phone string to E.164-ish for deduplication.
 * Keeps the digits plus a single leading "+" (when the first dialable
 * character is "+"); every other character, including stray interior "+"
 * signs, is dropped.
 * Also rejects obviously invalid numbers early to prevent garbage in contacts_json.
 *
 * @param {string} raw - Phone number as scraped
 * @returns {string|null} Normalised number, or null when it is rejected
 */
function normalisePhone(raw) {
  if (raw == null) return null;

  const compact = String(raw).replace(/[^\d+]/g, '');
  const digits = compact.replace(/\D/g, '');
  const stripped = compact.startsWith('+') ? `+${digits}` : digits;

  // Reject noise (too short, too long)
  if (digits.length < 8) return null;
  if (digits.length > 15) return null;

  // Reject +0... — never valid E.164
  if (stripped.startsWith('+0')) return null;

  // Reject AU 1800/1300/1900 service numbers and NANP (+1) toll-free numbers
  if (/^\+61(1800|1300|1900)/.test(stripped)) return null;
  if (/^\+1(800|888|877|866|855|844|833)\d{7}$/.test(stripped)) return null;

  // Reject short codes: after stripping common country codes, < 7 digits remain
  const withoutCC = digits.replace(/^(1|61|44|64|91|33|49|81|82|52|62|27|353|48|31|39)/, '');
  if (withoutCC.length > 0 && withoutCC.length < 7) return null;

  return stripped;
}

/**
 * Extract a clean hostname from a URL for same-domain checks.
 *
 * @param {string} url - Absolute URL
 * @returns {string|null} Hostname without a leading "www.", or null if `url` cannot be parsed
 */
function hostname(url) {
  try {
    return new URL(url).hostname.replace(/^www\./, '');
  } catch {
    return null;
  }
}

/**
 * Resolve a potentially relative href to an absolute URL.
 *
 * @param {string} href - Link target as written in the HTML
 * @param {string} baseUrl - URL that relative hrefs are resolved against
 * @returns {string|null} Absolute http(s) URL, or null if `href` is empty,
 *   cannot be resolved, or uses another scheme (mailto:, tel:, javascript:, ...)
 */
function resolveHref(href, baseUrl) {
  if (!href) return null;
  try {
    const resolved = new URL(href, baseUrl);
    const isHttp = resolved.protocol === 'http:' || resolved.protocol === 'https:';
    return isHttp ? resolved.href : null;
  } catch {
    return null;
  }
}
/**
 * Report whether `domain` is a known noise domain or a subdomain of one
 * (e.g. "o1.ingest.sentry.io" is covered by "sentry.io").
 */
function isNoiseDomain(domain) {
  const labels = domain.split('.');
  // Stop before the bare TLD: only multi-label suffixes are listed as noise.
  for (let i = 0; i < labels.length - 1; i += 1) {
    if (EMAIL_NOISE_DOMAINS.has(labels.slice(i).join('.'))) return true;
  }
  return false;
}

/**
 * Validate a candidate email string against noise filters.
 *
 * @param {string} raw - Lower-cased candidate address
 * @returns {boolean} true if it looks like a real email worth keeping
 */
function isValidEmail(raw) {
  const parts = raw.split('@');
  if (parts.length !== 2) return false;
  const [local, domain] = parts;
  if (!local || !domain) return false;
  if (isNoiseDomain(domain)) return false;
  // Asset names such as "logo@2x.png" match the email regex; drop them.
  const tld = domain.split('.').pop();
  const localExt = local.split('.').pop();
  return !IMAGE_EXTS.has(tld) && !IMAGE_EXTS.has(localExt);
}

/**
 * Add an email to the collection if it is valid and not already seen.
 *
 * @param {Array<{email: string, label: string, source: string}>} email_addresses - Output list (mutated)
 * @param {Set<string>} emailsSeen - Normalised addresses already collected (mutated)
 * @param {string} raw - Candidate address
 * @param {string} source - Which extraction pass found it
 */
function addEmail(email_addresses, emailsSeen, raw, source) {
  const norm = raw.toLowerCase().trim();
  if (!norm.includes('@')) return;
  if (!isValidEmail(norm)) return;
  if (emailsSeen.has(norm)) return;
  emailsSeen.add(norm);
  email_addresses.push({ email: norm, label: 'General', source });
}

/** Return every plaintext email-shaped token in `text`, in document order. */
function findPlainEmails(text) {
  return text.match(/\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b/g) ?? [];
}

/** Run all email passes over `html`; the first pass to find an address sets its `source`. */
function collectEmails(html) {
  const seen = new Set();
  const email_addresses = [];
  const add = (raw, source) => addEmail(email_addresses, seen, raw, source);

  // 1. mailto: hrefs (most reliable — explicit intent). A mailto: target may
  //    list several recipients, so split it instead of storing "a@x.com,b@y.com".
  for (const [, target] of html.matchAll(/href=["']mailto:([^"'<>\s?]+)/gi)) {
    for (const recipient of safeDecode(target).split(/[,;]/)) add(recipient, 'mailto:href');
  }

  // 2. Plaintext email pattern (obfuscation-free addresses in visible text / JSON-LD)
  for (const email of findPlainEmails(html)) add(email, 'text');

  // 3. HTML entity decoding + homoglyph normalization + text deobfuscation pass.
  //    Works on a decoded copy — the original HTML is still used by the other passes.
  const decodedText = deobfuscateEmailText(normalizeHomoglyphs(decodeHtmlEntities(html)));
  for (const email of findPlainEmails(decodedText)) add(email, 'decoded');

  // 4. data-email / data-contact / data-href attributes
  for (const [, value] of html.matchAll(/data-(?:email|contact|href)=["']([^"'<>\s]+)["']/gi)) {
    const found = safeDecode(value).match(/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/);
    if (found) add(found[0], 'data-attr');
  }

  // 5. CloudFlare Email Protection: data-cfemail="hexstring" (XOR-encoded)
  for (const [, hex] of html.matchAll(/data-cfemail=["']([0-9a-f]+)["']/gi)) {
    const decoded = decodeCfEmail(hex);
    if (decoded) add(decoded.trim(), 'cloudflare');
  }

  // 6. CSS direction:rtl reversal — text reversed visually, readable when flipped back
  const rtlRe =
    /<(?:span|div|p|td|th)[^>]+style=["'][^"']*(?:direction\s*:\s*rtl|unicode-bidi)[^"']*["'][^>]*>([^<]+)<\//gi;
  for (const [, inner] of html.matchAll(rtlRe)) {
    const flipped = [...inner.trim()].reverse().join('');
    const [email] = findPlainEmails(flipped);
    if (email) add(email, 'rtl-reversed');
  }

  return email_addresses;
}

/** Collect phone numbers from tel: hrefs, then from "+"-prefixed numbers in the tag-stripped text. */
function collectPhones(html) {
  const seen = new Set();
  const phone_numbers = [];
  const add = (raw, source) => {
    const normalised = normalisePhone(raw);
    if (!normalised || seen.has(normalised)) return;
    seen.add(normalised);
    phone_numbers.push({ number: normalised, label: 'General', source });
  };

  // 7. tel: hrefs (most reliable)
  for (const [, target] of html.matchAll(/href=["']tel:([^"'<>\s]+)/gi)) {
    add(safeDecode(target).trim(), 'tel:href');
  }

  // 8. Plaintext international phone numbers (+ prefix only, to minimize false positives).
  //    Length limits are enforced by normalisePhone.
  const visibleText = html.replace(/<[^>]+>/g, ' ');
  const intlPhoneRe =
    /\+\d{1,3}[-\s.(]?\(?\d{1,4}\)?[\s.-]?\d{2,4}[\s.-]?\d{2,4}(?:[\s.-]?\d{0,4})?/g;
  for (const match of visibleText.matchAll(intlPhoneRe)) {
    // Check 25 chars of preceding context for noise words (order IDs, ABNs, postcodes, etc.)
    const preceding = visibleText.slice(Math.max(0, match.index - 25), match.index);
    if (PHONE_NOISE_RE.test(preceding)) continue;
    add(match[0].trim(), 'text');
  }

  return phone_numbers;
}

/** Collect de-duplicated social profile URLs for every configured platform. */
function collectSocialProfiles(html) {
  const seen = new Set();
  const social_profiles = [];

  for (const platform of SOCIAL_PLATFORMS) {
    // The patterns are shared /g regexes; matchAll starts from lastIndex, so reset it.
    platform.pattern.lastIndex = 0;
    for (const match of html.matchAll(platform.pattern)) {
      // Cut at the first quote/bracket/whitespace and drop one trailing slash
      const url = match[0].replace(/["'<>\s].*$/, '').replace(/\/$/, '');
      if (platform.exclude && platform.exclude.test(url)) continue;
      const key = url.toLowerCase();
      if (seen.has(key)) continue;
      seen.add(key);
      social_profiles.push({ url, label: platform.label(url), usable: platform.usable });
    }
  }

  return social_profiles;
}

/** Collect same-domain links whose href matches the contact/about keyword pattern. */
function collectKeyPages(html, pageUrl) {
  const baseDomain = hostname(pageUrl);
  const seen = new Set();
  const key_pages = [];

  for (const [, href] of html.matchAll(/href=["']([^"'<>\s#][^"'<>\s]*)/gi)) {
    if (!CONTACT_PAGE_PATTERN.test(href)) continue;
    const resolved = resolveHref(href, pageUrl);
    if (!resolved) continue;
    // Same-domain only (skipped when the page URL itself could not be parsed)
    if (baseDomain && hostname(resolved) !== baseDomain) continue;
    // Strip query/fragment and trailing slash for dedup; keep the full URL in the result
    const clean = resolved.split(/[?#]/)[0].replace(/\/$/, '');
    if (seen.has(clean)) continue;
    seen.add(clean);
    key_pages.push(resolved);
  }

  return key_pages;
}

/**
 * Extract all contacts from raw HTML without any LLM calls.
 *
 * @param {string} html - Raw HTML string from html_dom
 * @param {string} pageUrl - The page's URL (used for same-domain checks and relative href resolution)
 * @returns {{
 *   email_addresses: Array<{email: string, label: string, source: string}>,
 *   phone_numbers: Array<{number: string, label: string, source: string}>,
 *   social_profiles: Array<{url: string, label: string, usable: boolean}>,
 *   key_pages: string[],
 *   has_contact_form: boolean,
 * }} Empty lists and `false` when `html` is missing, not a string, or the
 *   'HTML removed after scoring' placeholder.
 */
export function extractContactsFromHtml(html, pageUrl) {
  if (!html || typeof html !== 'string' || html === 'HTML removed after scoring') {
    return {
      email_addresses: [],
      phone_numbers: [],
      social_profiles: [],
      key_pages: [],
      has_contact_form: false,
    };
  }

  const email_addresses = collectEmails(html);
  const phone_numbers = collectPhones(html);
  const social_profiles = collectSocialProfiles(html);
  const key_pages = collectKeyPages(html, pageUrl);

  // Contact form detection: a literal <form> tag or a known form-plugin signature
  const has_contact_form = /<form[\s>]/i.test(html) || FORM_PLUGIN_PATTERNS.some(p => p.test(html));

  return { email_addresses, phone_numbers, social_profiles, key_pages, has_contact_form };
}

/**
 * Append to `base[field]` every candidate whose key is accepted and not yet present.
 * `base[field]` is replaced by a new array (and only when something is added),
 * so arrays shared with the caller's original object are never mutated.
 */
function appendUnique(base, field, candidates, keyOf, isUsableKey = () => true) {
  const current = base[field] || [];
  const seen = new Set(current.map(keyOf));
  const additions = [];
  for (const candidate of candidates || []) {
    if (candidate == null) continue;
    const key = keyOf(candidate);
    if (!isUsableKey(key) || seen.has(key)) continue;
    seen.add(key);
    additions.push(candidate);
  }
  if (additions.length > 0) base[field] = [...current, ...additions];
}

/**
 * Merge regex-extracted contacts into an existing contacts_json object.
 * Deduplicates by normalised value. Existing entries take precedence for label/source.
 * Only adds entries not already present. `existing` itself is not modified.
 *
 * @param {Object} existing - Parsed contacts_json (may be null/undefined)
 * @param {Object} extracted - Result of extractContactsFromHtml() (missing lists are treated as empty)
 * @param {string} pageUrl - Page URL (used to populate form_url if missing)
 * @returns {Object} Merged contacts_json
 */
export function mergeExtractedContacts(existing, extracted, pageUrl) {
  const base = existing
    ? { ...existing }
    : {
        email_addresses: [],
        phone_numbers: [],
        social_profiles: [],
        key_pages: [],
      };
  const incoming = extracted ?? {};

  // Entries may be plain strings or objects; each keyOf yields the dedup key.
  const emailKey = e => String(typeof e === 'string' ? e : e?.email || '').toLowerCase().trim();
  const phoneKey = p => normalisePhone(typeof p === 'string' ? p : p?.number || '') || '';
  const socialKey = s =>
    String(typeof s === 'string' ? s : s?.url || '').toLowerCase().replace(/\/$/, '');
  const pageKey = u => String(u ?? '').split(/[?#]/)[0].replace(/\/$/, '');

  appendUnique(base, 'email_addresses', incoming.email_addresses, emailKey);
  // Phones that fail normalisation have no usable key and are skipped.
  appendUnique(base, 'phone_numbers', incoming.phone_numbers, phoneKey, key => key !== '');
  appendUnique(base, 'social_profiles', incoming.social_profiles, socialKey);
  appendUnique(base, 'key_pages', incoming.key_pages, pageKey);

  // Set contact form if regex found one and we don't already have a form_url
  if (incoming.has_contact_form && !base.primary_contact_form?.form_url && pageUrl) {
    base.primary_contact_form = { form_url: pageUrl, form_action_url: null };
  }

  return base;
}
/**
 * Report whether a profile URL points at LinkedIn, X or Twitter.
 * The host is matched on label boundaries (scheme optional, subdomains allowed),
 * so look-alike hosts such as "dropbox.com" are not mistaken for "x.com".
 */
function isOutreachSocialUrl(url) {
  return /^(?:https?:)?(?:\/\/)?(?:[a-z0-9-]+\.)*(?:linkedin\.com|x\.com|twitter\.com)(?=[/?#:]|$)/i.test(
    String(url).trim()
  );
}

/**
 * Count usable contacts in a contacts_json object.
 * Usable = email, phone, x.com/twitter, linkedin, or contact form.
 * (Facebook, Instagram, YouTube are collected but not counted as outreach channels.)
 *
 * Social profiles may be URL strings or objects. When an object carries a
 * `usable` property, only `usable === true` counts; otherwise usability is
 * inferred from the URL's host. Null/undefined entries are ignored.
 *
 * @param {Object} contactsJson - Parsed contacts_json
 * @returns {number} Number of usable contact channels (0 for a missing object)
 */
export function countUsableContacts(contactsJson) {
  if (!contactsJson) return 0;

  let count = 0;
  count += (contactsJson.email_addresses || []).length;
  count += (contactsJson.phone_numbers || []).length;
  if (contactsJson.primary_contact_form?.form_url) count += 1;

  for (const profile of contactsJson.social_profiles || []) {
    // typeof null === 'object', so null must be ruled out before using `in`.
    if (profile == null) continue;

    const hasUsableFlag = typeof profile === 'object' && 'usable' in profile;
    if (hasUsableFlag) {
      if (profile.usable === true) count += 1;
      continue;
    }

    const url = typeof profile === 'string' ? profile : profile.url || '';
    if (isOutreachSocialUrl(url)) count += 1;
  }

  return count;
}