// tests/html-contact-extractor.test.js
  1  import { test, describe } from 'node:test';
  2  import assert from 'node:assert/strict';
  3  import {
  4    extractContactsFromHtml,
  5    mergeExtractedContacts,
  6    countUsableContacts,
  7  } from '../src/utils/html-contact-extractor.js';
  8  
// Page URL handed to the extractor and merger in (almost) every test below.
// The key_pages tests expect relative hrefs to resolve against it and links on
// other domains to be excluded.
const BASE_URL = 'https://example.com.au';
 10  
 11  describe('extractContactsFromHtml', () => {
 12    test('extracts mailto: email', () => {
 13      const html = '<a href="mailto:info@acmeplumbing.com.au">Contact us</a>';
 14      const result = extractContactsFromHtml(html, BASE_URL);
 15      assert.equal(result.email_addresses.length, 1);
 16      assert.equal(result.email_addresses[0].email, 'info@acmeplumbing.com.au');
 17      assert.equal(result.email_addresses[0].source, 'mailto:href');
 18    });
 19  
 20    test('extracts plaintext email', () => {
 21      const html = '<p>Email us at info@acmeplumbing.com.au for a quote</p>';
 22      const result = extractContactsFromHtml(html, BASE_URL);
 23      assert.equal(result.email_addresses.length, 1);
 24      assert.equal(result.email_addresses[0].source, 'text');
 25    });
 26  
 27    test('deduplicates mailto and plaintext email', () => {
 28      const html = '<a href="mailto:info@acme.com.au">email</a> or info@acme.com.au';
 29      const result = extractContactsFromHtml(html, BASE_URL);
 30      assert.equal(result.email_addresses.length, 1);
 31    });
 32  
 33    test('filters noise email domains', () => {
 34      const html =
 35        '<p>605a7bae@sentry-next.wixpress.com</p>' +
 36        '<a href="mailto:real@business.com.au">Contact</a>';
 37      const result = extractContactsFromHtml(html, BASE_URL);
 38      assert.equal(result.email_addresses.length, 1);
 39      assert.equal(result.email_addresses[0].email, 'real@business.com.au');
 40    });
 41  
 42    test('filters image extension artefacts', () => {
 43      const html = '<p>background.jpg@cdn.com</p>';
 44      const result = extractContactsFromHtml(html, BASE_URL);
 45      assert.equal(result.email_addresses.length, 0);
 46    });
 47  
 48    test('extracts tel: phone', () => {
 49      const html = '<a href="tel:+61418804934">Call us</a>';
 50      const result = extractContactsFromHtml(html, BASE_URL);
 51      assert.equal(result.phone_numbers.length, 1);
 52      assert.equal(result.phone_numbers[0].number, '+61418804934');
 53    });
 54  
 55    test('extracts URL-encoded tel:', () => {
 56      const html = '<a href="tel:0401%20524%20647">Call</a>';
 57      const result = extractContactsFromHtml(html, BASE_URL);
 58      assert.equal(result.phone_numbers.length, 1);
 59      assert.ok(result.phone_numbers[0].number.includes('0401'));
 60    });
 61  
 62    test('rejects short phone noise', () => {
 63      // CSS colour code or short number
 64      const html = '<a href="tel:123">Ring</a>';
 65      const result = extractContactsFromHtml(html, BASE_URL);
 66      assert.equal(result.phone_numbers.length, 0);
 67    });
 68  
 69    test('extracts x.com social profile', () => {
 70      const html = '<a href="https://x.com/acmeplumbing">Follow us</a>';
 71      const result = extractContactsFromHtml(html, BASE_URL);
 72      const x = result.social_profiles.find(s => s.label === 'X');
 73      assert.ok(x);
 74      assert.equal(x.usable, true);
 75    });
 76  
 77    test('extracts linkedin.com/company profile', () => {
 78      const html = '<a href="https://linkedin.com/company/acmeplumbing">LinkedIn</a>';
 79      const result = extractContactsFromHtml(html, BASE_URL);
 80      const li = result.social_profiles.find(s => s.label === 'LinkedIn');
 81      assert.ok(li);
 82      assert.equal(li.usable, true);
 83    });
 84  
 85    test('extracts facebook but marks not usable', () => {
 86      const html = '<a href="https://www.facebook.com/acmeplumbing">Facebook</a>';
 87      const result = extractContactsFromHtml(html, BASE_URL);
 88      const fb = result.social_profiles.find(s => s.label === 'Facebook');
 89      assert.ok(fb);
 90      assert.equal(fb.usable, false);
 91    });
 92  
 93    test('excludes facebook tracking pixels', () => {
 94      const html = '<img src="https://www.facebook.com/tr?id=12345&ev=PageView" />';
 95      const result = extractContactsFromHtml(html, BASE_URL);
 96      assert.equal(result.social_profiles.filter(s => s.label === 'Facebook').length, 0);
 97    });
 98  
 99    test('excludes x.com login/intent URLs', () => {
100      const html = '<a href="https://x.com/i/flow/login?redirect_after_login=%2Facme">X</a>';
101      const result = extractContactsFromHtml(html, BASE_URL);
102      assert.equal(result.social_profiles.filter(s => s.label === 'X').length, 0);
103    });
104  
105    test('extracts same-domain contact page links', () => {
106      const html =
107        '<a href="https://example.com.au/contact-us">Contact</a>' +
108        '<a href="https://example.com.au/about-us">About</a>';
109      const result = extractContactsFromHtml(html, BASE_URL);
110      assert.ok(result.key_pages.length >= 2);
111    });
112  
113    test('excludes cross-domain links from key_pages', () => {
114      const html = '<a href="https://otherdomain.com/contact">Contact</a>';
115      const result = extractContactsFromHtml(html, BASE_URL);
116      assert.equal(result.key_pages.length, 0);
117    });
118  
119    test('resolves relative hrefs for key_pages', () => {
120      const html = '<a href="/contact-us">Contact</a>';
121      const result = extractContactsFromHtml(html, BASE_URL);
122      assert.equal(result.key_pages.length, 1);
123      assert.ok(result.key_pages[0].startsWith('https://example.com.au'));
124    });
125  
126    test('detects <form> tag as contact form signal', () => {
127      const html = '<form action="/submit"><input type="text"></form>';
128      const result = extractContactsFromHtml(html, BASE_URL);
129      assert.equal(result.has_contact_form, true);
130    });
131  
132    test('detects WP contact-form-7 plugin', () => {
133      const html = '<script src="/wp-content/plugins/contact-form-7/js/index.js"></script>';
134      const result = extractContactsFromHtml(html, BASE_URL);
135      assert.equal(result.has_contact_form, true);
136    });
137  
138    test('returns empty result for sentinel html_dom', () => {
139      const result = extractContactsFromHtml('HTML removed after scoring', BASE_URL);
140      assert.equal(result.email_addresses.length, 0);
141      assert.equal(result.phone_numbers.length, 0);
142      assert.equal(result.has_contact_form, false);
143    });
144  
145    test('returns empty result for null html', () => {
146      const result = extractContactsFromHtml(null, BASE_URL);
147      assert.equal(result.email_addresses.length, 0);
148    });
149  });
150  
/**
 * Merge suite: folding a fresh extraction into a stored contacts object
 * without duplicating emails or clobbering an existing contact form.
 */
describe('mergeExtractedContacts', () => {
  // Builds a fresh extraction payload; each call returns new arrays so no state
  // is shared between tests. Overridden keys keep their original position.
  const extraction = (overrides = {}) => ({
    email_addresses: [],
    phone_numbers: [],
    social_profiles: [],
    key_pages: [],
    has_contact_form: false,
    ...overrides,
  });

  test('merges new emails into existing contacts_json', () => {
    const stored = { email_addresses: [{ email: 'old@site.com', label: 'Office' }] };
    const fresh = extraction({
      email_addresses: [{ email: 'new@site.com', label: 'General', source: 'text' }],
    });
    const combined = mergeExtractedContacts(stored, fresh, BASE_URL);
    assert.equal(combined.email_addresses.length, 2);
  });

  test('does not duplicate existing emails', () => {
    const stored = { email_addresses: [{ email: 'info@site.com', label: 'Office' }] };
    const fresh = extraction({
      email_addresses: [{ email: 'info@site.com', label: 'General', source: 'mailto:href' }],
    });
    const combined = mergeExtractedContacts(stored, fresh, BASE_URL);
    assert.equal(combined.email_addresses.length, 1);
  });

  test('sets contact form when not present', () => {
    const stored = {};
    const fresh = extraction({ has_contact_form: true });
    const combined = mergeExtractedContacts(stored, fresh, BASE_URL);
    assert.ok(combined.primary_contact_form?.form_url);
  });

  test('does not overwrite existing contact form', () => {
    const stored = { primary_contact_form: { form_url: 'https://example.com.au/old-form' } };
    const fresh = extraction({ has_contact_form: true });
    // A different page URL is supplied; the stored form URL must still win.
    const combined = mergeExtractedContacts(stored, fresh, 'https://example.com.au/page');
    assert.equal(combined.primary_contact_form.form_url, 'https://example.com.au/old-form');
  });
});
204  
205  // ── Obfuscation decoder tests ────────────────────────────────────────────────
206  
/**
 * 1A: emails hidden behind HTML character references (decimal, hex, named)
 * must be found once the entities are decoded.
 */
describe('1A: HTML entity decoding', () => {
  // [test title, markup]; every row is expected to yield info@site.com.
  const decodableCases = [
    [
      'decodes decimal entities: &#105;&#110;&#102;&#111;&#64;site.com',
      '<p>&#105;&#110;&#102;&#111;&#64;site.com</p>',
    ],
    [
      'decodes hex entities: &#x69;&#x6e;&#x66;&#x6f;&#x40;site.com',
      '<p>&#x69;&#x6e;&#x66;&#x6f;&#x40;site.com</p>',
    ],
    [
      'decodes named entity &commat;: info&commat;site.com',
      '<p>info&commat;site.com</p>',
    ],
    [
      'decodes mixed entities: &#105;nfo&#64;site&period;com',
      '<p>&#105;nfo&#64;site&period;com</p>',
    ],
  ];

  for (const [title, markup] of decodableCases) {
    test(title, () => {
      const contacts = extractContactsFromHtml(markup, BASE_URL);
      assert.ok(contacts.email_addresses.some((entry) => entry.email === 'info@site.com'));
    });
  }

  test('false-positive guard: &amp;lt;3 numbers does not produce email', () => {
    // Entity-laden text with no address in it must not yield an email.
    const contacts = extractContactsFromHtml('<p>&amp;lt;3 numbers</p>', BASE_URL);
    assert.equal(contacts.email_addresses.length, 0);
  });
});
238  
/**
 * 1B: human-readable obfuscations ("[at]", "(dot)", spelled-out words) should
 * be recovered, while ordinary prose containing "at"/"dot" must not match.
 */
describe('1B: Text obfuscation patterns', () => {
  // [test title, markup, expected email]
  const recoverable = [
    ['[at] and [dot] bracket notation', '<p>info [at] site [dot] com.au</p>', 'info@site.com.au'],
    ['(at) and (dot) paren notation', '<p>info(at)site(dot)com</p>', 'info@site.com'],
    [
      'space-word "at" and "dot" with full email structure',
      '<p>contact at company dot co.nz</p>',
      'contact@company.co.nz',
    ],
    [
      'space-word "at" with literal dot in domain',
      '<p>hello at business.com.au</p>',
      'hello@business.com.au',
    ],
  ];

  // [test title, markup]; none of these may produce an email.
  const prose = [
    [
      'false-positive guard: "what a dotcom world" does not produce email',
      '<p>what a dotcom world</p>',
    ],
    [
      'false-positive guard: "at least 3 things" does not produce email',
      '<p>at least 3 things</p>',
    ],
    [
      'false-positive guard: "look at this" (no dot pattern) does not produce email',
      '<p>look at this website</p>',
    ],
  ];

  for (const [title, markup, expected] of recoverable) {
    test(title, () => {
      const contacts = extractContactsFromHtml(markup, BASE_URL);
      assert.ok(contacts.email_addresses.some((entry) => entry.email === expected));
    });
  }

  for (const [title, markup] of prose) {
    test(title, () => {
      const contacts = extractContactsFromHtml(markup, BASE_URL);
      assert.equal(contacts.email_addresses.length, 0);
    });
  }
});
282  
/**
 * 1C: emails carried in data-* attributes (data-email, data-contact,
 * data-href) rather than in visible text or mailto links.
 */
describe('1C: data-email and data-contact attributes', () => {
  // [test title, markup, expected email]
  const attributeCases = [
    [
      'data-email attribute extracts email',
      '<span data-email="info@site.com">Contact</span>',
      'info@site.com',
    ],
    [
      'data-contact attribute extracts email',
      '<a data-contact="sales@company.com">Contact</a>',
      'sales@company.com',
    ],
    [
      'data-href attribute extracts email',
      '<div data-href="info@site.com"></div>',
      'info@site.com',
    ],
  ];

  for (const [title, markup, expected] of attributeCases) {
    test(title, () => {
      const contacts = extractContactsFromHtml(markup, BASE_URL);
      assert.ok(contacts.email_addresses.some((entry) => entry.email === expected));
    });
  }

  test('false-positive guard: non-email data-email value produces no email', () => {
    const contacts = extractContactsFromHtml('<div data-email="not-an-email"></div>', BASE_URL);
    assert.equal(contacts.email_addresses.length, 0);
  });
});
308  
/**
 * 1D: Cloudflare-style protected emails stored as a hex string in the
 * data-cfemail attribute.
 */
describe('1D: CloudFlare email protection', () => {
  // Fixture for "info@site.com" with XOR key 0x42 (all values hex):
  //   leading byte = key                         → 42
  //   i 69→2b  n 6e→2c  f 66→24  o 6f→2d  @ 40→02 → 2b2c242d02
  //   s 73→31  i 69→2b  t 74→36  e 65→27          → 312b3627
  //   . 2e→6c  c 63→21  o 6f→2d  m 6d→2f          → 6c212d2f
  const CF_ENCODED_INFO_AT_SITE_COM = '422b2c242d02312b36276c212d2f';

  const emailsIn = (markup) => extractContactsFromHtml(markup, BASE_URL).email_addresses;

  test('decodes data-cfemail attribute to email', () => {
    const markup =
      '<a href="/cdn-cgi/l/email-protection" data-cfemail="' +
      CF_ENCODED_INFO_AT_SITE_COM +
      '">[email protected]</a>';
    assert.ok(emailsIn(markup).some((entry) => entry.email === 'info@site.com'));
  });

  test('source is tagged as cloudflare', () => {
    const markup = '<a data-cfemail="' + CF_ENCODED_INFO_AT_SITE_COM + '">email</a>';
    const tagged = emailsIn(markup).find((entry) => entry.source === 'cloudflare');
    assert.ok(tagged);
  });

  test('false-positive guard: odd-length hex string does not crash', () => {
    const markup = '<a data-cfemail="abc">broken</a>';
    // First call proves it does not throw; second call inspects the output.
    assert.doesNotThrow(() => extractContactsFromHtml(markup, BASE_URL));
    assert.equal(emailsIn(markup).length, 0);
  });

  test('false-positive guard: short hex string produces no email', () => {
    assert.equal(emailsIn('<a data-cfemail="0000">short</a>').length, 0);
  });
});
344  
/**
 * 1E: emails written backwards inside elements styled with direction:rtl
 * should be un-reversed; genuine RTL prose must not yield an address.
 */
describe('1E: CSS direction:rtl reversal', () => {
  // [test title, markup, expected email]; the element text is the expected
  // address spelled in reverse.
  const reversedCases = [
    [
      'reverses span with direction:rtl to extract email',
      '<span style="direction:rtl">moc.etis@ofni</span>',
      'info@site.com',
    ],
    [
      'handles unicode-bidi with direction:rtl',
      '<div style="unicode-bidi:bidi-override;direction:rtl">moc.etis@tcatnoc</div>',
      'contact@site.com',
    ],
    [
      'handles direction: rtl with spaces around colon',
      '<span style="direction : rtl">moc.etis@ofni</span>',
      'info@site.com',
    ],
  ];

  for (const [title, markup, expected] of reversedCases) {
    test(title, () => {
      const contacts = extractContactsFromHtml(markup, BASE_URL);
      assert.ok(contacts.email_addresses.some((entry) => entry.email === expected));
    });
  }

  test('false-positive guard: Arabic RTL text without email does not produce email', () => {
    const contacts = extractContactsFromHtml(
      '<span style="direction:rtl">مرحبا بالعالم</span>',
      BASE_URL,
    );
    assert.equal(contacts.email_addresses.length, 0);
  });
});
372  
/**
 * 1F: "+"-prefixed international numbers in visible text are extracted;
 * bare digit runs (order numbers, colours, ABNs) are not.
 */
describe('1F: Plaintext international phone numbers', () => {
  const phonesIn = (markup) => extractContactsFromHtml(markup, BASE_URL).phone_numbers;

  test('extracts +61 Australian mobile from visible text', () => {
    const phones = phonesIn('<p>Call us: +61 418 804 934</p>');
    assert.ok(phones.length > 0);
    // Compare on digits only so the stored formatting does not matter.
    const hasMobile = phones.some((phone) =>
      phone.number.replace(/\D/g, '').includes('61418804934'),
    );
    assert.ok(hasMobile);
  });

  // [test title, markup]; each must yield at least one phone number.
  const detectable = [
    ['extracts +1 US number with parens', '<p>Book now: +1 (555) 123-4567</p>'],
    ['extracts +44 UK number', '<p>Ring us: +44 20 7946 0958</p>'],
  ];
  for (const [title, markup] of detectable) {
    test(title, () => {
      assert.ok(phonesIn(markup).length > 0);
    });
  }

  test('source is tagged as text for plaintext phones', () => {
    const fromText = phonesIn('<p>+61 418 804 934</p>').find((phone) => phone.source === 'text');
    assert.ok(fromText);
  });

  // [test title, markup]; none carries a "+"-prefixed number, so nothing may
  // be extracted.
  const lookalikes = [
    [
      'false-positive guard: "Order #1234567890" does not extract phone',
      '<p>Order #1234567890 confirmed</p>',
    ],
    [
      'false-positive guard: CSS hex color does not extract phone',
      '<p style="color: #FF6600">Hello</p>',
    ],
    [
      // An ABN is an 11-digit business identifier, not a phone number.
      'false-positive guard: ABN-adjacent number skipped',
      '<p>ABN 12 345 678 901</p>',
    ],
  ];
  for (const [title, markup] of lookalikes) {
    test(title, () => {
      assert.equal(phonesIn(markup).length, 0);
    });
  }
});
421  
/**
 * 1G: look-alike Unicode characters (full-width @, Cyrillic letters) inside
 * an address should be normalized to their ASCII counterparts.
 */
describe('1G: Unicode homoglyph normalization', () => {
  // [test title, markup]; every row should normalize to info@site.com.
  const homoglyphCases = [
    // U+FF20 is the full-width commercial at.
    ['full-width @ sign (U+FF20) is treated as @', '<p>info\uFF20site.com</p>'],
    // U+043E is Cyrillic "о", standing in for the final letter of "info".
    ['Cyrillic о (U+043E) in local part is normalized to o', '<p>inf\u043E@site.com</p>'],
    // U+0456 is Cyrillic "і", standing in for the first letter of "info".
    ['Cyrillic і (U+0456) normalized to i', '<p>\u0456nfo@site.com</p>'],
  ];

  for (const [title, markup] of homoglyphCases) {
    test(title, () => {
      const contacts = extractContactsFromHtml(markup, BASE_URL);
      assert.ok(contacts.email_addresses.some((entry) => entry.email === 'info@site.com'));
    });
  }

  test('false-positive guard: purely Cyrillic word does not produce email', () => {
    const contacts = extractContactsFromHtml('<p>привет</p>', BASE_URL);
    assert.equal(contacts.email_addresses.length, 0);
  });
});
450  
/**
 * countUsableContacts: expected totals for emails, phones, a contact form,
 * usable vs. non-usable social profiles, and a null input.
 */
describe('countUsableContacts', () => {
  // [test title, contacts argument, expected count]
  const scenarios = [
    [
      'counts emails and phones',
      {
        email_addresses: [{ email: 'a@b.com' }, { email: 'c@d.com' }],
        phone_numbers: [{ number: '+61400000000' }],
        social_profiles: [],
      },
      3,
    ],
    [
      'counts form as 1',
      {
        email_addresses: [],
        phone_numbers: [],
        social_profiles: [],
        primary_contact_form: { form_url: 'https://example.com/contact' },
      },
      1,
    ],
    [
      'counts x.com social as usable',
      {
        email_addresses: [],
        phone_numbers: [],
        social_profiles: [{ url: 'https://x.com/acme', label: 'X', usable: true }],
      },
      1,
    ],
    [
      'does not count facebook as usable',
      {
        email_addresses: [],
        phone_numbers: [],
        social_profiles: [{ url: 'https://facebook.com/acme', label: 'Facebook', usable: false }],
      },
      0,
    ],
    ['returns 0 for null', null, 0],
  ];

  for (const [title, contacts, expected] of scenarios) {
    test(title, () => {
      assert.equal(countUsableContacts(contacts), expected);
    });
  }
});