Cradicle Explorer

/ src / utils / html-contact-extractor.js
html-contact-extractor.js
  1  /**
  2   * Programmatic contact extraction from raw HTML.
  3   * Zero LLM cost — pure regex. Runs as a pre-pass before the LLM enrichment call.
  4   *
  5   * Extracts:
  6   *   - Emails: mailto: hrefs + plaintext + HTML entity decoding + text deobfuscation
  7   *             + data-email attributes + CloudFlare protection + CSS RTL reversal
  8   *             + Unicode homoglyph normalization
  9   *   - Phones: tel: hrefs + plaintext international (+prefix) numbers
 10   *   - Social profiles: x.com / twitter.com / linkedin.com only (usable channels);
 11   *     facebook / instagram / youtube collected but flagged as non-outreach
 12   *   - Key pages: same-domain href links matching contact/about keyword patterns
 13   *   - Contact form signal: <form> element or known WP form plugin detected
 14   *
 15   * Returns a contacts_json-compatible object ready to merge into existing contacts.
 16   */
 17  
 18  // Email domains that indicate noise (internal tooling, CDNs, schema definitions, etc.)
 19  const EMAIL_NOISE_DOMAINS = new Set([
 20    'sentry.io',
 21    'sentry-next.wixpress.com',
 22    'sentry.wixpress.com',
 23    'wixpress.com',
 24    'schema.org',
 25    'example.com',
 26    'example.org',
 27    'example.net',
 28    'w3.org',
 29    'gravatar.com',
 30    'parastorage.com',
 31    'googleusercontent.com',
 32    'amazonaws.com',
 33    'cloudfront.net',
 34    'wordpress.com',
 35    'wpengine.com',
 36  ]);
 37  
 38  // File extension artifacts that look like emails but aren't (e.g. background.jpg@cdn.com)
 39  const IMAGE_EXTS = new Set(['png', 'jpg', 'jpeg', 'gif', 'svg', 'webp', 'css', 'js', 'min']);
 40  
 41  // Social platform detection config
 42  // usable = can we actually send outreach via this channel?
 43  const SOCIAL_PLATFORMS = [
 44    {
 45      pattern: /https?:\/\/(?:www\.)?(x\.com|twitter\.com)\/([^"'<>\s/?#]+)/gi,
 46      label: url => (url.includes('twitter.com') ? 'Twitter' : 'X'),
 47      usable: true,
 48      // Exclude tracking/intent/login paths (note: pattern stops at ?, so check both /i/ and /i$)
 49      exclude:
 50        /\/(i(\/|$)|intent\/|share(\?|$)|login(\?|$)|flow\/|home$|explore$|notifications$|messages$|settings$|compose\/)/i,
 51    },
 52    {
 53      pattern: /https?:\/\/(?:www\.)?linkedin\.com\/(company|in)\/([^"'<>\s/?#]+)/gi,
 54      label: () => 'LinkedIn',
 55      usable: true,
 56      exclude: /\/(sharing\/|shareArticle|login\?)/i,
 57    },
 58    {
 59      pattern:
 60        /https?:\/\/(?:www\.)?facebook\.com\/(?!tr\?|sharer|share\.php|plugins|login|dialog)([^"'<>\s/?#][^"'<>\s]*)/gi,
 61      label: () => 'Facebook',
 62      usable: false,
 63    },
 64    {
 65      pattern: /https?:\/\/(?:www\.)?instagram\.com\/([^"'<>\s/?#]+)/gi,
 66      label: () => 'Instagram',
 67      usable: false,
 68      exclude: /\/(p\/|reel\/|explore\/|accounts\/)/i,
 69    },
 70    {
 71      pattern: /https?:\/\/(?:www\.)?youtube\.com\/(channel|c|user|@)\/([^"'<>\s/?#]+)/gi,
 72      label: () => 'YouTube',
 73      usable: false,
 74    },
 75  ];
 76  
 77  // Contact-page keyword pattern (multilingual) — same as enrich.js
 78  const CONTACT_PAGE_PATTERN =
 79    /\b(contact|support|get-in-touch|about|kontakt|kontakty|contacto|contato|a-propos|apropos|chi-siamo|uber-uns|ueber-uns|impressum|mentions-legales|aviso-legal|datenschutz|privacidad|hubungi|kontak)\b/i;
 80  
 81  // Known WP/form-builder plugin signatures in HTML
 82  const FORM_PLUGIN_PATTERNS = [
 83    /contact-form-7/i,
 84    /\bwpcf7\b/i,
 85    /wpforms/i,
 86    /gravity.?form/i,
 87    /ninja.?form/i,
 88    /forminator/i,
 89    /caldera.?form/i,
 90    /fluentform/i,
 91    /wc-booking/i,
 92  ];
 93  
 94  // Words that precede numbers but indicate non-phone context (order IDs, postcodes, etc.)
 95  const PHONE_NOISE_RE = /\b(?:order|ref|invoice|abn|acn|postcode|zip|color|colour|#)\s*$/i;
 96  
 97  // ── Obfuscation decoders ────────────────────────────────────────────────────
 98  
 99  /**
100   * Decode HTML character entities to their Unicode equivalents.
101   * Operates on a copy — never mutates the original HTML.
102   *
103   * Handles:
104   *   &#64;       → @   (decimal)
105   *   &#x40;      → @   (hex)
106   *   &commat;    → @   (named)
107   *   &#46;       → .   (decimal dot)
108   */
109  function decodeHtmlEntities(text) {
110    return text
111      .replace(/&#(\d+);/g, (_, n) => String.fromCharCode(parseInt(n, 10)))
112      .replace(/&#x([0-9a-fA-F]+);/g, (_, h) => String.fromCharCode(parseInt(h, 16)))
113      .replace(/&commat;/gi, '@')
114      .replace(/&period;/gi, '.')
115      .replace(/&amp;/gi, '&')
116      .replace(/&lt;/gi, '<')
117      .replace(/&gt;/gi, '>');
118  }
119  
/**
 * Map look-alike Unicode characters onto the ASCII characters they imitate.
 * Covers the full-width "@" and "." plus a handful of Cyrillic letters that
 * are visually identical to Latin ones and are used to obfuscate emails.
 *
 * @param {string} text - Input text
 * @returns {string} Text with the listed homoglyphs replaced by ASCII
 */
function normalizeHomoglyphs(text) {
  const asciiFor = {
    '\uFF20': '@', // ＠ FULLWIDTH COMMERCIAL AT
    '\uFF0E': '.', // ． FULLWIDTH FULL STOP
    '\u0430': 'a', // Cyrillic а
    '\u0435': 'e', // Cyrillic е
    '\u043E': 'o', // Cyrillic о
    '\u0440': 'r', // Cyrillic р
    '\u0441': 'c', // Cyrillic с
    '\u0456': 'i', // Cyrillic і
  };

  return text.replace(
    /[\uFF20\uFF0E\u0430\u0435\u043E\u0440\u0441\u0456]/g,
    lookalike => asciiFor[lookalike]
  );
}
135  
/**
 * Rewrite common textual email obfuscations back into plain addresses.
 *
 * Bracketed tokens are unambiguous and are replaced everywhere:
 *   info [at] site [dot] com  →  info@site.com
 *   info(at)site(dot)com      →  info@site.com
 *   info{at}site{dot}com      →  info@site.com
 *
 * Bare words are only rewritten when the surrounding text has the full shape
 * of an address, so ordinary prose such as "look at this" is left alone:
 *   info at site.com          →  info@site.com  (domain has a literal dot)
 *   info at site dot com      →  info@site.com  ("at" AND "dot" both present)
 *
 * @param {string} text - Text to scan
 * @returns {string} Text with recognised obfuscations replaced
 */
function deobfuscateEmailText(text) {
  // Step 1: bracket / paren / brace tokens, with any whitespace around them.
  const bracketRules = [
    [/\s*\[at\]\s*/gi, '@'],
    [/\s*\(at\)\s*/gi, '@'],
    [/\s*\{at\}\s*/gi, '@'],
    [/\s*\[dot\]\s*/gi, '.'],
    [/\s*\(dot\)\s*/gi, '.'],
    [/\s*\{dot\}\s*/gi, '.'],
  ];
  let output = text;
  for (const [token, symbol] of bracketRules) {
    output = output.replace(token, symbol);
  }

  // Step 2: "local at domain.tld" — the literal dot in the domain is the guard.
  const atWithDottedDomain =
    /\b([a-zA-Z0-9._%+-]+)\s+at\s+([a-zA-Z0-9][a-zA-Z0-9.-]*\.[a-zA-Z]{2,})\b/gi;
  output = output.replace(atWithDottedDomain, (_whole, local, domain) => {
    return local + '@' + domain;
  });

  // Step 3: "local at domain dot tld [dot tld ...]" — needs both words to fire.
  const atWithSpelledDots =
    /\b([a-zA-Z0-9._%+-]+)\s+at\s+([a-zA-Z0-9][a-zA-Z0-9.-]*)\s+dot\s+([a-zA-Z]{2,}(?:\s+dot\s+[a-zA-Z]{2,})*)\b/gi;
  output = output.replace(atWithSpelledDots, (_whole, local, domain, tld) => {
    const dottedTld = tld.replace(/\s+dot\s+/gi, '.');
    return local + '@' + domain + '.' + dottedTld;
  });

  return output;
}
173  
/**
 * Decode a CloudFlare email-protection hex string (the value of data-cfemail).
 *
 * Format: pairs of hex digits; the first byte is an XOR key and every
 * following byte is one character of the address XOR-ed with that key.
 *
 * Input is validated up front: it must be a string of at least two bytes, an
 * even number of characters, and hex digits only. Without the hex check,
 * parseInt() yields NaN for bad pairs, which the bitwise XOR silently coerces
 * to 0 and garbage text comes out instead of a rejection.
 *
 * @param {string} hexStr - Hex payload from the data-cfemail attribute
 * @returns {string|null} Decoded text if it contains "@", otherwise null
 */
function decodeCfEmail(hexStr) {
  if (typeof hexStr !== 'string') return null;
  if (hexStr.length < 4 || hexStr.length % 2 !== 0) return null;
  if (!/^[0-9a-f]+$/i.test(hexStr)) return null;

  const key = parseInt(hexStr.slice(0, 2), 16);
  let email = '';
  for (let i = 2; i < hexStr.length; i += 2) {
    email += String.fromCharCode(parseInt(hexStr.slice(i, i + 2), 16) ^ key);
  }
  return email.includes('@') ? email : null;
}
192  
193  // ── Shared helpers ──────────────────────────────────────────────────────────
194  
/**
 * Percent-decode a string (e.g. "tel:%20john" → "tel: john") without ever
 * throwing: if decodeURIComponent rejects the input (malformed escape
 * sequence), the input is returned unchanged.
 *
 * @param {string} str - Possibly percent-encoded text
 * @returns {string} Decoded text, or the original on failure
 */
function safeDecode(str) {
  let decoded = str;
  try {
    decoded = decodeURIComponent(str);
  } catch {
    // Malformed escape sequence — fall through with the untouched input.
  }
  return decoded;
}
206  
/**
 * Normalise a phone string to an E.164-ish form used as a dedup key.
 *
 * The result consists of digits only, prefixed by a single "+" when the first
 * kept character of the input is "+". Any other "+" in the input is dropped
 * (previously interior "+" characters leaked into the result and produced
 * keys that never matched the same number written normally).
 *
 * Returns null for values that are obviously not usable contact numbers, so
 * garbage does not reach contacts_json.
 *
 * @param {string} raw - Phone number as found in HTML (any punctuation/spacing)
 * @returns {string|null} Normalised number, or null if rejected / not a string
 */
function normalisePhone(raw) {
  if (typeof raw !== 'string') return null;

  const kept = raw.replace(/[^\d+]/g, '');
  const digits = kept.replace(/\D/g, '');
  const stripped = kept.startsWith('+') ? `+${digits}` : digits;

  // Reject noise: E.164 allows at most 15 digits; fewer than 8 is too short here.
  if (digits.length < 8 || digits.length > 15) return null;

  // Reject +0... — never valid E.164 (country codes do not start with 0)
  if (stripped.startsWith('+0')) return null;

  // Reject +61 numbers beginning 1800/1300/1900 and +1 numbers with
  // 800/888/877/866/855/844/833 area codes (service / toll-free lines scraped
  // from HTML rather than direct contact numbers).
  if (/^\+61(1800|1300|1900)/.test(stripped)) return null;
  if (/^\+1(800|888|877|866|855|844|833)\d{7}$/.test(stripped)) return null;

  // Reject short codes: after stripping common country codes, < 7 digits remain
  const withoutCC = digits.replace(/^(1|61|44|64|91|33|49|81|82|52|62|27|353|48|31|39)/, '');
  if (withoutCC.length > 0 && withoutCC.length < 7) return null;

  return stripped;
}
233  
/**
 * Return the host of a URL with a leading "www." removed, for same-domain
 * comparisons. Anything the URL parser rejects yields null.
 *
 * @param {string} url - Absolute URL
 * @returns {string|null} Hostname without "www." prefix, or null if unparsable
 */
function hostname(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return null;
  }
  const host = parsed.hostname;
  return host.startsWith('www.') ? host.slice('www.'.length) : host;
}
244  
/**
 * Turn an href (absolute or relative) into an absolute URL string, using
 * baseUrl to resolve relative references.
 *
 * @param {string} href - Raw href attribute value
 * @param {string} baseUrl - URL of the page the href was found on
 * @returns {string|null} Absolute http(s) URL, or null when href is empty,
 *   cannot be parsed, or uses another scheme (mailto:, javascript:, ...)
 */
function resolveHref(href, baseUrl) {
  if (!href) return null;

  let target;
  try {
    target = new URL(href, baseUrl);
  } catch {
    return null;
  }

  const isWeb = target.protocol === 'http:' || target.protocol === 'https:';
  return isWeb ? target.href : null;
}
259  
/**
 * Decide whether an email-shaped string is worth keeping.
 *
 * Rejects:
 *   - strings with nothing after the first "@";
 *   - addresses whose domain IS, or is a subdomain of, an entry in
 *     EMAIL_NOISE_DOMAINS (e.g. "key@o123.ingest.sentry.io" is caught by the
 *     "sentry.io" entry — several listed hosts such as cloudfront.net or
 *     amazonaws.com only ever occur with a subdomain in front);
 *   - asset file names that merely look like emails: the domain's last label
 *     or the local part's last dot-suffix is in IMAGE_EXTS ("logo@2x.png").
 *
 * @param {string} raw - Candidate address (callers pass it lower-cased)
 * @returns {boolean} true if the candidate passes all noise filters
 */
function isValidEmail(raw) {
  const [local, domain] = raw.split('@');
  if (!domain) return false;

  // Check the full domain and every parent domain against the noise list.
  const labels = domain.toLowerCase().split('.');
  for (let i = 0; i < labels.length; i += 1) {
    if (EMAIL_NOISE_DOMAINS.has(labels.slice(i).join('.'))) return false;
  }

  const tld = labels[labels.length - 1];
  const localExt = local.toLowerCase().split('.').pop();
  if (IMAGE_EXTS.has(tld) || IMAGE_EXTS.has(localExt)) return false;

  return true;
}
273  
/**
 * Record an email candidate unless it is malformed, noise, or a duplicate.
 *
 * The candidate is lower-cased and trimmed first; that normalised form is
 * what gets validated, deduplicated and stored. Both collections are mutated
 * in place.
 *
 * @param {Array<{email: string, label: string, source: string}>} email_addresses - Output list
 * @param {Set<string>} emailsSeen - Normalised addresses already recorded
 * @param {string} raw - Candidate address as found
 * @param {string} source - Tag describing which extraction pass found it
 */
function addEmail(email_addresses, emailsSeen, raw, source) {
  const candidate = raw.toLowerCase().trim();

  const shouldKeep =
    candidate.includes('@') && isValidEmail(candidate) && !emailsSeen.has(candidate);
  if (!shouldKeep) return;

  emailsSeen.add(candidate);
  email_addresses.push({ email: candidate, label: 'General', source });
}
285  
/**
 * Extract all contacts from raw HTML without any LLM calls.
 *
 * Passes, in order:
 *   Emails  1. mailto: hrefs (comma/semicolon-separated recipients are split)
 *           2. plaintext addresses in the raw HTML
 *           3. addresses revealed after entity decoding, homoglyph
 *              normalisation and text de-obfuscation
 *           4. data-email / data-contact / data-href attributes
 *           5. CloudFlare data-cfemail payloads
 *           6. text inside elements styled direction:rtl / unicode-bidi, reversed
 *   Phones  7. tel: hrefs
 *           8. "+"-prefixed numbers in tag-stripped text
 *   Social profiles, same-domain key pages, and a contact-form flag.
 *
 * Returns empty results when html is empty, not a string, or the
 * 'HTML removed after scoring' placeholder.
 *
 * @param {string} html - Raw HTML string from html_dom
 * @param {string} pageUrl - The page's URL (used for same-domain checks and relative href resolution)
 * @returns {{
 *   email_addresses: Array<{email: string, label: string, source: string}>,
 *   phone_numbers:   Array<{number: string, label: string, source: string}>,
 *   social_profiles: Array<{url: string, label: string, usable: boolean}>,
 *   key_pages:       string[],
 *   has_contact_form: boolean,
 * }}
 */
export function extractContactsFromHtml(html, pageUrl) {
  if (!html || typeof html !== 'string' || html === 'HTML removed after scoring') {
    return {
      email_addresses: [],
      phone_numbers: [],
      social_profiles: [],
      key_pages: [],
      has_contact_form: false,
    };
  }

  const baseDomain = hostname(pageUrl);

  // ── Emails ──────────────────────────────────────────────────────────────────
  const emailsSeen = new Set();
  const email_addresses = [];
  const keepEmail = (raw, source) => addEmail(email_addresses, emailsSeen, raw, source);

  // 1. mailto: hrefs (most reliable — explicit intent).
  //    A mailto may list several recipients ("a@x.com,b@y.com"); record each one
  //    separately instead of storing the whole list as a single bogus address.
  for (const [, target] of html.matchAll(/href=["']mailto:([^"'<>\s?]+)/gi)) {
    for (const recipient of safeDecode(target).split(/[,;]/)) {
      keepEmail(recipient, 'mailto:href');
    }
  }

  // 2. Plaintext email pattern (catches obfuscation-free addresses in visible text / JSON-LD)
  for (const [, email] of html.matchAll(/\b([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})\b/g)) {
    keepEmail(email, 'text');
  }

  // 3. HTML entity decoding + homoglyph normalization + text deobfuscation pass.
  //    Operates on a decoded copy — never touches the original HTML used for other passes.
  const decodedText = deobfuscateEmailText(normalizeHomoglyphs(decodeHtmlEntities(html)));
  for (const [, email] of decodedText.matchAll(
    /\b([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})\b/g
  )) {
    keepEmail(email, 'decoded');
  }

  // 4. data-email / data-contact / data-href attributes
  for (const [, attrValue] of html.matchAll(/data-(?:email|contact|href)=["']([^"'<>\s]+)["']/gi)) {
    const found = safeDecode(attrValue).match(/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/);
    if (found) keepEmail(found[0], 'data-attr');
  }

  // 5. CloudFlare Email Protection: data-cfemail="hexstring" (XOR-encoded)
  for (const [, payload] of html.matchAll(/data-cfemail=["']([0-9a-f]+)["']/gi)) {
    const decoded = decodeCfEmail(payload);
    if (decoded) keepEmail(decoded.trim(), 'cloudflare');
  }

  // 6. CSS direction:rtl reversal — text reversed visually, readable when flipped back
  const rtlRe =
    /<(?:span|div|p|td|th)[^>]+style=["'][^"']*(?:direction\s*:\s*rtl|unicode-bidi)[^"']*["'][^>]*>([^<]+)<\//gi;
  for (const [, inner] of html.matchAll(rtlRe)) {
    const reversed = inner.trim().split('').reverse().join('');
    const found = reversed.match(/\b([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})\b/);
    if (found) keepEmail(found[1], 'rtl-reversed');
  }

  // ── Phones ───────────────────────────────────────────────────────────────────
  const phonesSeen = new Set();
  const phone_numbers = [];
  const keepPhone = (normalised, source) => {
    if (phonesSeen.has(normalised)) return;
    phonesSeen.add(normalised);
    phone_numbers.push({ number: normalised, label: 'General', source });
  };

  // 7. tel: hrefs (most reliable)
  for (const [, target] of html.matchAll(/href=["']tel:([^"'<>\s]+)/gi)) {
    const normalised = normalisePhone(safeDecode(target).trim());
    if (normalised) keepPhone(normalised, 'tel:href');
  }

  // 8. Plaintext international phone numbers (+ prefix only, to minimize false positives).
  //    normalisePhone() already enforces the digit-count limits, so no re-check here.
  const visibleText = html.replace(/<[^>]+>/g, ' ');
  const intlPhoneRe =
    /\+\d{1,3}[-\s.(]?\(?\d{1,4}\)?[\s.-]?\d{2,4}[\s.-]?\d{2,4}(?:[\s.-]?\d{0,4})?/g;
  for (const match of visibleText.matchAll(intlPhoneRe)) {
    const normalised = normalisePhone(match[0].trim());
    if (!normalised) continue;
    // Check 25 chars of preceding context for noise words (order IDs, ABNs, postcodes, etc.)
    const preceding = visibleText.slice(Math.max(0, match.index - 25), match.index);
    if (PHONE_NOISE_RE.test(preceding)) continue;
    keepPhone(normalised, 'text');
  }

  // ── Social profiles ──────────────────────────────────────────────────────────
  const socialSeen = new Set();
  const social_profiles = [];

  for (const platform of SOCIAL_PLATFORMS) {
    // The patterns are shared module-level /g regexes; matchAll starts from the
    // regex's current lastIndex, so make sure it is 0.
    platform.pattern.lastIndex = 0;
    for (const match of html.matchAll(platform.pattern)) {
      const url = match[0].replace(/["'<>\s].*$/, '').replace(/\/$/, ''); // strip trailing punctuation
      if (platform.exclude && platform.exclude.test(url)) continue;
      const key = url.toLowerCase();
      if (socialSeen.has(key)) continue;
      socialSeen.add(key);
      social_profiles.push({
        url,
        label: platform.label(url),
        usable: platform.usable,
      });
    }
  }

  // ── Key pages (same-domain contact/about links) ───────────────────────────
  const keyPagesSeen = new Set();
  const key_pages = [];

  for (const [, rawHref] of html.matchAll(/href=["']([^"'<>\s#][^"'<>\s]*)/gi)) {
    // Cheap pre-filter on the raw attribute before paying for URL parsing.
    if (!CONTACT_PAGE_PATTERN.test(rawHref)) continue;
    const resolved = resolveHref(rawHref, pageUrl);
    if (!resolved) continue;
    // Same-domain only
    if (baseDomain && hostname(resolved) !== baseDomain) continue;
    // The keyword must occur in the path/query/fragment, not merely in the
    // hostname — otherwise on a site such as "my-support.com" every absolute
    // same-domain link would qualify as a key page.
    const { pathname, search, hash } = new URL(resolved);
    if (!CONTACT_PAGE_PATTERN.test(`${pathname}${search}${hash}`)) continue;
    // Strip query/fragment for dedup
    const clean = resolved.split(/[?#]/)[0].replace(/\/$/, '');
    if (keyPagesSeen.has(clean)) continue;
    keyPagesSeen.add(clean);
    key_pages.push(resolved);
  }

  // ── Contact form detection ────────────────────────────────────────────────
  const hasFormTag = /<form[\s>]/i.test(html);
  const hasFormPlugin = FORM_PLUGIN_PATTERNS.some(p => p.test(html));
  const has_contact_form = hasFormTag || hasFormPlugin;

  return { email_addresses, phone_numbers, social_profiles, key_pages, has_contact_form };
}
440  
/**
 * Merge regex-extracted contacts into an existing contacts_json object.
 * Deduplicates by normalised value. Existing entries take precedence for label/source.
 * Only adds entries not already present.
 *
 * `existing` is shallow-copied and its arrays are replaced rather than
 * mutated, so the caller's object is left untouched. A missing/null
 * `extracted`, or one lacking some of the arrays, is treated as "nothing
 * found" for those fields instead of throwing.
 *
 * @param {Object} existing - Parsed contacts_json (may be null/undefined)
 * @param {Object} extracted - Result of extractContactsFromHtml()
 * @param {string} pageUrl - Page URL (used to populate form_url if missing)
 * @returns {Object} Merged contacts_json
 */
export function mergeExtractedContacts(existing, extracted, pageUrl) {
  const base = existing
    ? { ...existing }
    : {
        email_addresses: [],
        phone_numbers: [],
        social_profiles: [],
        key_pages: [],
      };
  const found = extracted ?? {};

  const stripTrailingSlash = s => s.replace(/\/$/, '');
  const pageKey = url => stripTrailingSlash(url.split(/[?#]/)[0]);

  /**
   * Append to base[field] every incoming item whose dedup key is not already
   * present. `existingKey` normalises entries already in base (plain strings
   * or objects); `incomingKey` normalises extracted entries and returns null
   * to skip an item. The array is rebuilt once, and only if something was
   * added, so absent fields stay absent.
   */
  const mergeField = (field, existingKey, incomingKey) => {
    const current = base[field] || [];
    const seen = new Set(current.map(existingKey));
    const additions = [];
    for (const item of found[field] || []) {
      const key = incomingKey(item);
      if (key === null || seen.has(key)) continue;
      seen.add(key);
      additions.push(item);
    }
    if (additions.length > 0) base[field] = [...current, ...additions];
  };

  mergeField(
    'email_addresses',
    e => (typeof e === 'string' ? e : e?.email || '').toLowerCase().trim(),
    e => e.email
  );

  mergeField(
    'phone_numbers',
    p => normalisePhone(typeof p === 'string' ? p : p?.number || '') || '',
    p => normalisePhone(p.number) || null // unnormalisable numbers are not merged
  );

  mergeField(
    'social_profiles',
    s => stripTrailingSlash((typeof s === 'string' ? s : s?.url || '').toLowerCase()),
    s => stripTrailingSlash(s.url.toLowerCase())
  );

  mergeField(
    'key_pages',
    // Entries are expected to be URL strings; tolerate {url} objects like the
    // other fields do rather than throwing on them.
    u => pageKey(typeof u === 'string' ? u : u?.url || ''),
    kp => pageKey(kp)
  );

  // Set contact form if regex found one and we don't already have a form_url
  if (found.has_contact_form && !base.primary_contact_form?.form_url && pageUrl) {
    base.primary_contact_form = { form_url: pageUrl, form_action_url: null };
  }

  return base;
}
523  
/**
 * Count usable contacts in a contacts_json object.
 * Usable = email, phone, x.com/twitter, linkedin, or contact form.
 * (Facebook, Instagram, YouTube are collected but not counted as outreach channels.)
 *
 * Social profiles may be plain URL strings or objects. An object's `usable`
 * flag decides when present (only `true` counts); a `usable` of null, or no
 * flag at all, falls back to inferring from the URL. The URL check is
 * anchored on host boundaries so that e.g. "dropbox.com" or "wix.com" is not
 * mistaken for "x.com". Null/undefined entries in the list are skipped.
 *
 * @param {Object} contactsJson - Parsed contacts_json
 * @returns {number}
 */
export function countUsableContacts(contactsJson) {
  if (!contactsJson) return 0;

  // Host must start the string or follow "/", "." or "@", and be followed by
  // a path/query/fragment/port delimiter or the end of the string.
  const outreachHostRe = /(?:^|[/.@])(?:linkedin\.com|x\.com|twitter\.com)(?:[/?#:]|$)/i;

  let count = 0;
  count += (contactsJson.email_addresses || []).length;
  count += (contactsJson.phone_numbers || []).length;
  if (contactsJson.primary_contact_form?.form_url) count += 1;

  for (const profile of contactsJson.social_profiles || []) {
    if (profile == null) continue; // typeof null === 'object' — guard before `in`

    const url = typeof profile === 'string' ? profile : profile.url || '';
    const usable = typeof profile === 'object' && 'usable' in profile ? profile.usable : null;

    // If usable flag present, use it; otherwise infer from URL
    if (usable === true || (usable === null && outreachHostRe.test(url))) {
      count += 1;
    }
  }

  return count;
}