Cradicle Explorer

/ tests / utils / html-contact-extractor.test.js
html-contact-extractor.test.js
  1  /**
  2   * Tests for src/utils/html-contact-extractor.js
  3   *
  4   * Covers: extractContactsFromHtml, mergeExtractedContacts, countUsableContacts
  5   *
  6   * Pure regex/logic — no DB, no LLM, no external dependencies.
  7   */
  8  
  9  import { test, describe } from 'node:test';
 10  import assert from 'node:assert/strict';
 11  
 12  import {
 13    extractContactsFromHtml,
 14    mergeExtractedContacts,
 15    countUsableContacts,
 16  } from '../../src/utils/html-contact-extractor.js';
 17  
 18  // ─── extractContactsFromHtml ──────────────────────────────────────────────────
 19  
 20  describe('extractContactsFromHtml', () => {
 21    describe('guard clauses', () => {
 22      test('returns empty result for null', () => {
 23        const r = extractContactsFromHtml(null, 'https://example.com');
 24        assert.deepEqual(r.email_addresses, []);
 25        assert.deepEqual(r.phone_numbers, []);
 26        assert.deepEqual(r.social_profiles, []);
 27        assert.deepEqual(r.key_pages, []);
 28        assert.equal(r.has_contact_form, false);
 29      });
 30  
 31      test('returns empty result for "HTML removed after scoring"', () => {
 32        const r = extractContactsFromHtml('HTML removed after scoring', 'https://example.com');
 33        assert.equal(r.email_addresses.length, 0);
 34      });
 35  
 36      test('returns empty result for empty string', () => {
 37        const r = extractContactsFromHtml('', 'https://example.com');
 38        assert.equal(r.email_addresses.length, 0);
 39      });
 40    });
 41  
 42    describe('email extraction — mailto: hrefs', () => {
 43      test('extracts email from mailto: href', () => {
 44        const html = '<a href="mailto:info@acme.com">Contact</a>';
 45        const r = extractContactsFromHtml(html, 'https://acme.com');
 46        assert.equal(r.email_addresses.length, 1);
 47        assert.equal(r.email_addresses[0].email, 'info@acme.com');
 48        assert.equal(r.email_addresses[0].source, 'mailto:href');
 49      });
 50  
 51      test('deduplicates emails across sources', () => {
 52        const html = '<a href="mailto:info@acme.com">Contact</a> <p>info@acme.com</p>';
 53        const r = extractContactsFromHtml(html, 'https://acme.com');
 54        assert.equal(r.email_addresses.length, 1);
 55      });
 56  
 57      test('lowercases email addresses', () => {
 58        const html = '<a href="mailto:INFO@Acme.COM">email</a>';
 59        const r = extractContactsFromHtml(html, 'https://acme.com');
 60        assert.equal(r.email_addresses[0].email, 'info@acme.com');
 61      });
 62    });
 63  
 64    describe('email extraction — plaintext', () => {
 65      test('extracts plaintext email from body text', () => {
 66        const html = '<p>Email us: hello@acme-business.com</p>';
 67        const r = extractContactsFromHtml(html, 'https://acme-business.com');
 68        const emails = r.email_addresses.map(e => e.email);
 69        assert.ok(emails.includes('hello@acme-business.com'));
 70      });
 71  
 72      test('filters out noise domains (schema.org)', () => {
 73        const html = '<p>See schema.org: info@schema.org</p>';
 74        const r = extractContactsFromHtml(html, 'https://mysite.com');
 75        const emails = r.email_addresses.map(e => e.email);
 76        assert.ok(!emails.includes('info@schema.org'));
 77      });
 78  
 79      test('filters out image extension fake emails', () => {
 80        const html = '<img src="background.jpg@cdn.com">';
 81        const r = extractContactsFromHtml(html, 'https://mysite.com');
 82        assert.equal(r.email_addresses.length, 0);
 83      });
 84    });
 85  
 86    describe('email extraction — HTML entity decoding', () => {
 87      test('decodes &#64; entity for @', () => {
 88        const html = '<p>info&#64;acme.com</p>';
 89        const r = extractContactsFromHtml(html, 'https://acme.com');
 90        const emails = r.email_addresses.map(e => e.email);
 91        assert.ok(emails.includes('info@acme.com'));
 92      });
 93  
 94      test('decodes [at] obfuscation', () => {
 95        const html = '<p>info [at] acme.com</p>';
 96        const r = extractContactsFromHtml(html, 'https://acme.com');
 97        const emails = r.email_addresses.map(e => e.email);
 98        assert.ok(emails.includes('info@acme.com'));
 99      });
100  
101      test('decodes (at) obfuscation', () => {
102        const html = '<p>info(at)acme.com</p>';
103        const r = extractContactsFromHtml(html, 'https://acme.com');
104        const emails = r.email_addresses.map(e => e.email);
105        assert.ok(emails.includes('info@acme.com'));
106      });
107  
108      test('decodes [dot] obfuscation', () => {
109        const html = '<p>info [at] acme [dot] com</p>';
110        const r = extractContactsFromHtml(html, 'https://acme.com');
111        const emails = r.email_addresses.map(e => e.email);
112        assert.ok(emails.includes('info@acme.com'), `got: ${JSON.stringify(emails)}`);
113      });
114    });
115  
116    describe('email extraction — CloudFlare protection', () => {
117      test('decodes CloudFlare cfemail', () => {
118        // Encode info@test.com: key=0x10
119        // i=0x69^0x10=0x79='y' ... actually let's compute properly:
120        // key = 0x10, then XOR each char with key
121        // 'info@test.com' -> bytes, XOR with 0x10
122        const email = 'info@test.com';
123        const key = 0x10;
124        let hex = key.toString(16).padStart(2, '0');
125        for (const c of email) {
126          hex += (c.charCodeAt(0) ^ key).toString(16).padStart(2, '0');
127        }
128        const html = `<a href="/cdn-cgi/l/email-protection" data-cfemail="${hex}">[email protected]</a>`;
129        const r = extractContactsFromHtml(html, 'https://test.com');
130        const emails = r.email_addresses.map(e => e.email);
131        assert.ok(emails.includes('info@test.com'), `got: ${JSON.stringify(emails)}`);
132      });
133    });
134  
135    describe('phone extraction', () => {
136      test('extracts phone from tel: href', () => {
137        const html = '<a href="tel:+61412345678">Call us</a>';
138        const r = extractContactsFromHtml(html, 'https://acme.com.au');
139        assert.equal(r.phone_numbers.length, 1);
140        assert.equal(r.phone_numbers[0].number, '+61412345678');
141        assert.equal(r.phone_numbers[0].source, 'tel:href');
142      });
143  
144      test('extracts international plaintext phone with + prefix', () => {
145        const html = '<p>Call us on +44 20 7946 0958</p>';
146        const r = extractContactsFromHtml(html, 'https://acme.co.uk');
147        assert.ok(r.phone_numbers.length >= 1);
148      });
149  
150      test('deduplicates phone numbers', () => {
151        const html = '<a href="tel:+61412345678">Call</a> <p>+61 412 345 678</p>';
152        const r = extractContactsFromHtml(html, 'https://acme.com.au');
153        // Both should resolve to same normalized number, so only 1
154        // (tel:href normalizes, text normalizes — may or may not dedup depending on format)
155        assert.ok(r.phone_numbers.length >= 1);
156      });
157  
158      test('ignores numbers preceded by noise words (ABN)', () => {
159        const html = '<p>ABN 123456789</p>';
160        const r = extractContactsFromHtml(html, 'https://acme.com.au');
161        // ABN numbers without + prefix won't be picked up by intl regex anyway
162        // (the plaintext regex requires + prefix)
163        assert.ok(true); // just verify no crash
164      });
165  
166      test('rejects phone with fewer than 6 digits', () => {
167        const html = '<a href="tel:+123">Short</a>';
168        const r = extractContactsFromHtml(html, 'https://acme.com');
169        assert.equal(r.phone_numbers.length, 0);
170      });
171    });
172  
173    describe('social profile extraction', () => {
174      test('extracts X.com profile', () => {
175        const html = '<a href="https://x.com/acmecorp">Follow</a>';
176        const r = extractContactsFromHtml(html, 'https://acme.com');
177        const xProfile = r.social_profiles.find(s => s.url.includes('x.com'));
178        assert.ok(xProfile);
179        assert.equal(xProfile.label, 'X');
180        assert.equal(xProfile.usable, true);
181      });
182  
183      test('extracts Twitter.com profile', () => {
184        const html = '<a href="https://twitter.com/acmecorp">Follow</a>';
185        const r = extractContactsFromHtml(html, 'https://acme.com');
186        const tProfile = r.social_profiles.find(s => s.url.includes('twitter.com'));
187        assert.ok(tProfile);
188        assert.equal(tProfile.label, 'Twitter');
189        assert.equal(tProfile.usable, true);
190      });
191  
192      test('extracts LinkedIn company profile', () => {
193        const html = '<a href="https://linkedin.com/company/acme-corp">LinkedIn</a>';
194        const r = extractContactsFromHtml(html, 'https://acme.com');
195        const li = r.social_profiles.find(s => s.url.includes('linkedin.com'));
196        assert.ok(li);
197        assert.equal(li.label, 'LinkedIn');
198        assert.equal(li.usable, true);
199      });
200  
201      test('extracts Facebook profile (usable=false)', () => {
202        const html = '<a href="https://facebook.com/acmecorp">Facebook</a>';
203        const r = extractContactsFromHtml(html, 'https://acme.com');
204        const fb = r.social_profiles.find(s => s.url.includes('facebook.com'));
205        assert.ok(fb);
206        assert.equal(fb.usable, false);
207      });
208  
209      test('excludes X.com /i/ tracking paths', () => {
210        // /i/ is an excluded tracking path prefix
211        const html = '<a href="https://x.com/i/some-tracking-url">Track</a>';
212        const r = extractContactsFromHtml(html, 'https://acme.com');
213        // The /i/ prefix should be excluded by the exclude pattern
214        const iProfile = r.social_profiles.find(s => /x\.com\/i$/.test(s.url));
215        assert.ok(
216          !iProfile,
217          `should not have x.com/i profile, got: ${JSON.stringify(r.social_profiles)}`
218        );
219      });
220  
221      test('excludes LinkedIn sharing paths', () => {
222        const html = '<a href="https://linkedin.com/sharing/share-offsite/?url=foo">Share</a>';
223        const r = extractContactsFromHtml(html, 'https://acme.com');
224        // sharing/ path should be excluded
225        const sharing = r.social_profiles.find(s => s.url.includes('sharing'));
226        assert.ok(!sharing);
227      });
228  
229      test('deduplicates social profiles', () => {
230        const html =
231          '<a href="https://x.com/acmecorp">X1</a><a href="https://x.com/acmecorp/">X2</a>';
232        const r = extractContactsFromHtml(html, 'https://acme.com');
233        const xProfiles = r.social_profiles.filter(s => s.url.includes('x.com'));
234        assert.equal(xProfiles.length, 1);
235      });
236    });
237  
238    describe('key pages extraction', () => {
239      test('extracts contact page links', () => {
240        const html = '<a href="https://acme.com/contact-us">Contact</a>';
241        const r = extractContactsFromHtml(html, 'https://acme.com');
242        assert.ok(r.key_pages.length >= 1);
243        assert.ok(r.key_pages.some(p => p.includes('contact')));
244      });
245  
246      test('extracts about page links', () => {
247        const html = '<a href="https://acme.com/about">About Us</a>';
248        const r = extractContactsFromHtml(html, 'https://acme.com');
249        assert.ok(r.key_pages.some(p => p.includes('about')));
250      });
251  
252      test('excludes cross-domain links', () => {
253        const html = '<a href="https://otherdomain.com/contact">Contact</a>';
254        const r = extractContactsFromHtml(html, 'https://acme.com');
255        assert.ok(!r.key_pages.some(p => p.includes('otherdomain')));
256      });
257  
258      test('resolves relative hrefs to absolute', () => {
259        const html = '<a href="/contact">Contact</a>';
260        const r = extractContactsFromHtml(html, 'https://acme.com');
261        assert.ok(r.key_pages.some(p => p.startsWith('https://acme.com/contact')));
262      });
263  
264      test('deduplicates key pages', () => {
265        const html =
266          '<a href="https://acme.com/contact">C1</a><a href="https://acme.com/contact/">C2</a>';
267        const r = extractContactsFromHtml(html, 'https://acme.com');
268        const contactPages = r.key_pages.filter(p => p.includes('contact'));
269        assert.equal(contactPages.length, 1);
270      });
271    });
272  
273    describe('contact form detection', () => {
274      test('detects <form> tag', () => {
275        const html = '<html><body><form method="POST"><input name="email"></form></body></html>';
276        const r = extractContactsFromHtml(html, 'https://acme.com');
277        assert.equal(r.has_contact_form, true);
278      });
279  
280      test('detects WP contact-form-7 plugin', () => {
281        const html = '<div class="contact-form-7"><form></form></div>';
282        const r = extractContactsFromHtml(html, 'https://acme.com');
283        assert.equal(r.has_contact_form, true);
284      });
285  
286      test('detects wpforms plugin', () => {
287        const html = '<div class="wpforms-container"><form></form></div>';
288        const r = extractContactsFromHtml(html, 'https://acme.com');
289        assert.equal(r.has_contact_form, true);
290      });
291  
292      test('returns false when no form present', () => {
293        const html = '<html><body><p>No forms here</p></body></html>';
294        const r = extractContactsFromHtml(html, 'https://acme.com');
295        assert.equal(r.has_contact_form, false);
296      });
297    });
298  });
299  
300  // ─── mergeExtractedContacts ────────────────────────────────────────────────────
301  
describe('mergeExtractedContacts', () => {
  const SITE_URL = 'https://acme.com';

  /** Builds an "existing contacts" fixture: four empty lists plus any overrides. */
  const existingContacts = (overrides = {}) => ({
    email_addresses: [],
    phone_numbers: [],
    social_profiles: [],
    key_pages: [],
    ...overrides,
  });

  /** Builds an extractor-shaped fixture: four empty lists, has_contact_form=false, plus overrides. */
  const extractedContacts = (overrides = {}) => ({
    email_addresses: [],
    phone_numbers: [],
    social_profiles: [],
    key_pages: [],
    has_contact_form: false,
    ...overrides,
  });

  test('merges into empty existing contacts', () => {
    const incoming = extractedContacts({
      email_addresses: [{ email: 'info@acme.com', label: 'General', source: 'mailto:href' }],
      phone_numbers: [{ number: '+61412345678', label: 'General', source: 'tel:href' }],
    });

    // No existing record at all: null is passed in its place.
    const merged = mergeExtractedContacts(null, incoming, SITE_URL);

    assert.equal(merged.email_addresses.length, 1);
    assert.equal(merged.phone_numbers.length, 1);
  });

  test('does not duplicate existing emails', () => {
    const current = existingContacts({
      email_addresses: [{ email: 'info@acme.com', label: 'Work', source: 'llm' }],
    });
    const incoming = extractedContacts({
      email_addresses: [{ email: 'info@acme.com', label: 'General', source: 'mailto:href' }],
    });

    const merged = mergeExtractedContacts(current, incoming, SITE_URL);

    assert.equal(merged.email_addresses.length, 1);
    // The pre-existing entry wins over the newly extracted one.
    assert.equal(merged.email_addresses[0].source, 'llm');
  });

  test('adds new emails not in existing', () => {
    const current = existingContacts({
      email_addresses: [{ email: 'old@acme.com', label: 'Work', source: 'llm' }],
    });
    const incoming = extractedContacts({
      email_addresses: [{ email: 'new@acme.com', label: 'General', source: 'mailto:href' }],
    });

    const merged = mergeExtractedContacts(current, incoming, SITE_URL);

    assert.equal(merged.email_addresses.length, 2);
  });

  test('sets primary_contact_form when has_contact_form=true and none exists', () => {
    const current = existingContacts();
    const incoming = extractedContacts({ has_contact_form: true });

    const merged = mergeExtractedContacts(current, incoming, 'https://acme.com/contact');

    // The page URL passed as the third argument is expected to become the form URL.
    assert.ok(merged.primary_contact_form?.form_url);
    assert.equal(merged.primary_contact_form.form_url, 'https://acme.com/contact');
  });

  test('does not overwrite existing primary_contact_form', () => {
    const current = existingContacts({
      primary_contact_form: { form_url: 'https://acme.com/old-form', form_action_url: null },
    });
    const incoming = extractedContacts({ has_contact_form: true });

    const merged = mergeExtractedContacts(current, incoming, 'https://acme.com/new-form');

    assert.equal(merged.primary_contact_form.form_url, 'https://acme.com/old-form');
  });

  test('deduplicates phones by normalized value', () => {
    // Both sides carry the same number string; only label and source differ.
    const current = existingContacts({
      phone_numbers: [{ number: '+61412345678', label: 'Work', source: 'llm' }],
    });
    const incoming = extractedContacts({
      phone_numbers: [{ number: '+61412345678', label: 'General', source: 'tel:href' }],
    });

    const merged = mergeExtractedContacts(current, incoming, SITE_URL);

    assert.equal(merged.phone_numbers.length, 1);
  });

  test('merges social profiles without duplicates', () => {
    const current = existingContacts({
      social_profiles: [{ url: 'https://x.com/acme', label: 'X', usable: true }],
    });
    const incoming = extractedContacts({
      social_profiles: [
        { url: 'https://x.com/acme', label: 'X', usable: true },
        { url: 'https://linkedin.com/company/acme', label: 'LinkedIn', usable: true },
      ],
    });

    const merged = mergeExtractedContacts(current, incoming, SITE_URL);

    // One shared X profile plus the new LinkedIn profile.
    assert.equal(merged.social_profiles.length, 2);
  });

  test('handles existing with string format emails (legacy)', () => {
    // Legacy records store emails as bare strings instead of objects.
    const current = existingContacts({ email_addresses: ['info@acme.com'] });
    const incoming = extractedContacts({
      email_addresses: [{ email: 'info@acme.com', label: 'General', source: 'mailto:href' }],
    });

    const merged = mergeExtractedContacts(current, incoming, SITE_URL);

    // The string entry and the object entry refer to the same address.
    assert.equal(merged.email_addresses.length, 1);
  });
});
445  
446  // ─── countUsableContacts ──────────────────────────────────────────────────────
447  
describe('countUsableContacts', () => {
  /** Builds a contacts fixture with empty email/phone/social lists plus any overrides. */
  const contactsWith = (overrides = {}) => ({
    email_addresses: [],
    phone_numbers: [],
    social_profiles: [],
    ...overrides,
  });

  test('returns 0 for null', () => {
    assert.equal(countUsableContacts(null), 0);
  });

  test('returns 0 for empty contacts', () => {
    assert.equal(countUsableContacts(contactsWith()), 0);
  });

  test('counts email addresses', () => {
    const fixture = contactsWith({
      email_addresses: [
        { email: 'a@example.com', label: 'Work' },
        { email: 'b@example.com', label: 'General' },
      ],
    });
    assert.equal(countUsableContacts(fixture), 2);
  });

  test('counts phone numbers', () => {
    const fixture = contactsWith({
      phone_numbers: [{ number: '+61412345678', label: 'General' }],
    });
    assert.equal(countUsableContacts(fixture), 1);
  });

  test('counts contact form', () => {
    const fixture = contactsWith({
      primary_contact_form: { form_url: 'https://acme.com/contact', form_action_url: null },
    });
    assert.equal(countUsableContacts(fixture), 1);
  });

  test('counts usable social profiles (X, LinkedIn)', () => {
    const fixture = contactsWith({
      social_profiles: [
        { url: 'https://x.com/acme', label: 'X', usable: true },
        { url: 'https://linkedin.com/company/acme', label: 'LinkedIn', usable: true },
        { url: 'https://facebook.com/acme', label: 'Facebook', usable: false },
      ],
    });
    // The Facebook entry is flagged usable=false and is not counted.
    assert.equal(countUsableContacts(fixture), 2);
  });

  test('counts X/LinkedIn by URL when usable flag is null', () => {
    const fixture = contactsWith({
      social_profiles: [
        { url: 'https://x.com/acme', label: 'X' }, // no usable flag
        { url: 'https://instagram.com/acme', label: 'Instagram' }, // not usable
      ],
    });
    assert.equal(countUsableContacts(fixture), 1);
  });

  test('counts string-format social profiles by URL', () => {
    const fixture = contactsWith({
      social_profiles: [
        'https://linkedin.com/company/acme', // legacy string format
        'https://facebook.com/acme', // not usable by URL
      ],
    });
    assert.equal(countUsableContacts(fixture), 1);
  });

  test('counts all types together', () => {
    const fixture = contactsWith({
      email_addresses: [{ email: 'info@acme.com', label: 'Work' }],
      phone_numbers: [{ number: '+61412345678', label: 'General' }],
      social_profiles: [{ url: 'https://x.com/acme', label: 'X', usable: true }],
      primary_contact_form: { form_url: 'https://acme.com/contact', form_action_url: null },
    });
    // One email + one phone + one usable social profile + one contact form.
    assert.equal(countUsableContacts(fixture), 4);
  });
});