html-contact-extractor.test.js
1 import { test, describe } from 'node:test'; 2 import assert from 'node:assert/strict'; 3 import { 4 extractContactsFromHtml, 5 mergeExtractedContacts, 6 countUsableContacts, 7 } from '../src/utils/html-contact-extractor.js'; 8 9 const BASE_URL = 'https://example.com.au'; 10 11 describe('extractContactsFromHtml', () => { 12 test('extracts mailto: email', () => { 13 const html = '<a href="mailto:info@acmeplumbing.com.au">Contact us</a>'; 14 const result = extractContactsFromHtml(html, BASE_URL); 15 assert.equal(result.email_addresses.length, 1); 16 assert.equal(result.email_addresses[0].email, 'info@acmeplumbing.com.au'); 17 assert.equal(result.email_addresses[0].source, 'mailto:href'); 18 }); 19 20 test('extracts plaintext email', () => { 21 const html = '<p>Email us at info@acmeplumbing.com.au for a quote</p>'; 22 const result = extractContactsFromHtml(html, BASE_URL); 23 assert.equal(result.email_addresses.length, 1); 24 assert.equal(result.email_addresses[0].source, 'text'); 25 }); 26 27 test('deduplicates mailto and plaintext email', () => { 28 const html = '<a href="mailto:info@acme.com.au">email</a> or info@acme.com.au'; 29 const result = extractContactsFromHtml(html, BASE_URL); 30 assert.equal(result.email_addresses.length, 1); 31 }); 32 33 test('filters noise email domains', () => { 34 const html = 35 '<p>605a7bae@sentry-next.wixpress.com</p>' + 36 '<a href="mailto:real@business.com.au">Contact</a>'; 37 const result = extractContactsFromHtml(html, BASE_URL); 38 assert.equal(result.email_addresses.length, 1); 39 assert.equal(result.email_addresses[0].email, 'real@business.com.au'); 40 }); 41 42 test('filters image extension artefacts', () => { 43 const html = '<p>background.jpg@cdn.com</p>'; 44 const result = extractContactsFromHtml(html, BASE_URL); 45 assert.equal(result.email_addresses.length, 0); 46 }); 47 48 test('extracts tel: phone', () => { 49 const html = '<a href="tel:+61418804934">Call us</a>'; 50 const result = extractContactsFromHtml(html, BASE_URL); 51 assert.equal(result.phone_numbers.length, 1); 52 assert.equal(result.phone_numbers[0].number, '+61418804934'); 53 }); 54 55 test('extracts URL-encoded tel:', () => { 56 const html = '<a href="tel:0401%20524%20647">Call</a>'; 57 const result = extractContactsFromHtml(html, BASE_URL); 58 assert.equal(result.phone_numbers.length, 1); 59 assert.ok(result.phone_numbers[0].number.includes('0401')); 60 }); 61 62 test('rejects short phone noise', () => { 63 // CSS colour code or short number 64 const html = '<a href="tel:123">Ring</a>'; 65 const result = extractContactsFromHtml(html, BASE_URL); 66 assert.equal(result.phone_numbers.length, 0); 67 }); 68 69 test('extracts x.com social profile', () => { 70 const html = '<a href="https://x.com/acmeplumbing">Follow us</a>'; 71 const result = extractContactsFromHtml(html, BASE_URL); 72 const x = result.social_profiles.find(s => s.label === 'X'); 73 assert.ok(x); 74 assert.equal(x.usable, true); 75 }); 76 77 test('extracts linkedin.com/company profile', () => { 78 const html = '<a href="https://linkedin.com/company/acmeplumbing">LinkedIn</a>'; 79 const result = extractContactsFromHtml(html, BASE_URL); 80 const li = result.social_profiles.find(s => s.label === 'LinkedIn'); 81 assert.ok(li); 82 assert.equal(li.usable, true); 83 }); 84 85 test('extracts facebook but marks not usable', () => { 86 const html = '<a href="https://www.facebook.com/acmeplumbing">Facebook</a>'; 87 const result = extractContactsFromHtml(html, BASE_URL); 88 const fb = result.social_profiles.find(s => s.label === 'Facebook'); 89 assert.ok(fb); 90 assert.equal(fb.usable, false); 91 }); 92 93 test('excludes facebook tracking pixels', () => { 94 const html = '<img src="https://www.facebook.com/tr?id=12345&ev=PageView" />'; 95 const result = extractContactsFromHtml(html, BASE_URL); 96 assert.equal(result.social_profiles.filter(s => s.label === 'Facebook').length, 0); 97 }); 98 99 test('excludes x.com login/intent URLs', () => { 100 const html = '<a href="https://x.com/i/flow/login?redirect_after_login=%2Facme">X</a>'; 101 const result = extractContactsFromHtml(html, BASE_URL); 102 assert.equal(result.social_profiles.filter(s => s.label === 'X').length, 0); 103 }); 104 105 test('extracts same-domain contact page links', () => { 106 const html = 107 '<a href="https://example.com.au/contact-us">Contact</a>' + 108 '<a href="https://example.com.au/about-us">About</a>'; 109 const result = extractContactsFromHtml(html, BASE_URL); 110 assert.ok(result.key_pages.length >= 2); 111 }); 112 113 test('excludes cross-domain links from key_pages', () => { 114 const html = '<a href="https://otherdomain.com/contact">Contact</a>'; 115 const result = extractContactsFromHtml(html, BASE_URL); 116 assert.equal(result.key_pages.length, 0); 117 }); 118 119 test('resolves relative hrefs for key_pages', () => { 120 const html = '<a href="/contact-us">Contact</a>'; 121 const result = extractContactsFromHtml(html, BASE_URL); 122 assert.equal(result.key_pages.length, 1); 123 assert.ok(result.key_pages[0].startsWith('https://example.com.au')); 124 }); 125 126 test('detects <form> tag as contact form signal', () => { 127 const html = '<form action="/submit"><input type="text"></form>'; 128 const result = extractContactsFromHtml(html, BASE_URL); 129 assert.equal(result.has_contact_form, true); 130 }); 131 132 test('detects WP contact-form-7 plugin', () => { 133 const html = '<script src="/wp-content/plugins/contact-form-7/js/index.js"></script>'; 134 const result = extractContactsFromHtml(html, BASE_URL); 135 assert.equal(result.has_contact_form, true); 136 }); 137 138 test('returns empty result for sentinel html_dom', () => { 139 const result = extractContactsFromHtml('HTML removed after scoring', BASE_URL); 140 assert.equal(result.email_addresses.length, 0); 141 assert.equal(result.phone_numbers.length, 0); 142 assert.equal(result.has_contact_form, false); 143 }); 144 145 test('returns empty result for null html', () => { 146 const result = extractContactsFromHtml(null, BASE_URL); 147 assert.equal(result.email_addresses.length, 0); 148 }); 149 }); 150 151 describe('mergeExtractedContacts', () => { 152 test('merges new emails into existing contacts_json', () => { 153 const existing = { email_addresses: [{ email: 'old@site.com', label: 'Office' }] }; 154 const extracted = { 155 email_addresses: [{ email: 'new@site.com', label: 'General', source: 'text' }], 156 phone_numbers: [], 157 social_profiles: [], 158 key_pages: [], 159 has_contact_form: false, 160 }; 161 const merged = mergeExtractedContacts(existing, extracted, BASE_URL); 162 assert.equal(merged.email_addresses.length, 2); 163 }); 164 165 test('does not duplicate existing emails', () => { 166 const existing = { email_addresses: [{ email: 'info@site.com', label: 'Office' }] }; 167 const extracted = { 168 email_addresses: [{ email: 'info@site.com', label: 'General', source: 'mailto:href' }], 169 phone_numbers: [], 170 social_profiles: [], 171 key_pages: [], 172 has_contact_form: false, 173 }; 174 const merged = mergeExtractedContacts(existing, extracted, BASE_URL); 175 assert.equal(merged.email_addresses.length, 1); 176 }); 177 178 test('sets contact form when not present', () => { 179 const existing = {}; 180 const extracted = { 181 email_addresses: [], 182 phone_numbers: [], 183 social_profiles: [], 184 key_pages: [], 185 has_contact_form: true, 186 }; 187 const merged = mergeExtractedContacts(existing, extracted, BASE_URL); 188 assert.ok(merged.primary_contact_form?.form_url); 189 }); 190 191 test('does not overwrite existing contact form', () => { 192 const existing = { primary_contact_form: { form_url: 'https://example.com.au/old-form' } }; 193 const extracted = { 194 email_addresses: [], 195 phone_numbers: [], 196 social_profiles: [], 197 key_pages: [], 198 has_contact_form: true, 199 }; 200 const merged = mergeExtractedContacts(existing, extracted, 'https://example.com.au/page'); 201 assert.equal(merged.primary_contact_form.form_url, 'https://example.com.au/old-form'); 202 }); 203 }); 204 205 // ── Obfuscation decoder tests ──────────────────────────────────────────────── 206 207 describe('1A: HTML entity decoding', () => { 208 test('decodes decimal entities: info@site.com', () => { 209 const html = '<p>info@site.com</p>'; 210 const result = extractContactsFromHtml(html, BASE_URL); 211 assert.ok(result.email_addresses.some(e => e.email === 'info@site.com')); 212 }); 213 214 test('decodes hex entities: info@site.com', () => { 215 const html = '<p>info@site.com</p>'; 216 const result = extractContactsFromHtml(html, BASE_URL); 217 assert.ok(result.email_addresses.some(e => e.email === 'info@site.com')); 218 }); 219 220 test('decodes named entity @: info@site.com', () => { 221 const html = '<p>info@site.com</p>'; 222 const result = extractContactsFromHtml(html, BASE_URL); 223 assert.ok(result.email_addresses.some(e => e.email === 'info@site.com')); 224 }); 225 226 test('decodes mixed entities: info@site.com', () => { 227 const html = '<p>info@site.com</p>'; 228 const result = extractContactsFromHtml(html, BASE_URL); 229 assert.ok(result.email_addresses.some(e => e.email === 'info@site.com')); 230 }); 231 232 test('false-positive guard: &lt;3 numbers does not produce email', () => { 233 const html = '<p>&lt;3 numbers</p>'; 234 const result = extractContactsFromHtml(html, BASE_URL); 235 assert.equal(result.email_addresses.length, 0); 236 }); 237 }); 238 239 describe('1B: Text obfuscation patterns', () => { 240 test('[at] and [dot] bracket notation', () => { 241 const html = '<p>info [at] site [dot] com.au</p>'; 242 const result = extractContactsFromHtml(html, BASE_URL); 243 assert.ok(result.email_addresses.some(e => e.email === 'info@site.com.au')); 244 }); 245 246 test('(at) and (dot) paren notation', () => { 247 const html = '<p>info(at)site(dot)com</p>'; 248 const result = extractContactsFromHtml(html, BASE_URL); 249 assert.ok(result.email_addresses.some(e => e.email === 'info@site.com')); 250 }); 251 252 test('space-word "at" and "dot" with full email structure', () => { 253 const html = '<p>contact at company dot co.nz</p>'; 254 const result = extractContactsFromHtml(html, BASE_URL); 255 assert.ok(result.email_addresses.some(e => e.email === 'contact@company.co.nz')); 256 }); 257 258 test('space-word "at" with literal dot in domain', () => { 259 const html = '<p>hello at business.com.au</p>'; 260 const result = extractContactsFromHtml(html, BASE_URL); 261 assert.ok(result.email_addresses.some(e => e.email === 'hello@business.com.au')); 262 }); 263 264 test('false-positive guard: "what a dotcom world" does not produce email', () => { 265 const html = '<p>what a dotcom world</p>'; 266 const result = extractContactsFromHtml(html, BASE_URL); 267 assert.equal(result.email_addresses.length, 0); 268 }); 269 270 test('false-positive guard: "at least 3 things" does not produce email', () => { 271 const html = '<p>at least 3 things</p>'; 272 const result = extractContactsFromHtml(html, BASE_URL); 273 assert.equal(result.email_addresses.length, 0); 274 }); 275 276 test('false-positive guard: "look at this" (no dot pattern) does not produce email', () => { 277 const html = '<p>look at this website</p>'; 278 const result = extractContactsFromHtml(html, BASE_URL); 279 assert.equal(result.email_addresses.length, 0); 280 }); 281 }); 282 283 describe('1C: data-email and data-contact attributes', () => { 284 test('data-email attribute extracts email', () => { 285 const html = '<span data-email="info@site.com">Contact</span>'; 286 const result = extractContactsFromHtml(html, BASE_URL); 287 assert.ok(result.email_addresses.some(e => e.email === 'info@site.com')); 288 }); 289 290 test('data-contact attribute extracts email', () => { 291 const html = '<a data-contact="sales@company.com">Contact</a>'; 292 const result = extractContactsFromHtml(html, BASE_URL); 293 assert.ok(result.email_addresses.some(e => e.email === 'sales@company.com')); 294 }); 295 296 test('data-href attribute extracts email', () => { 297 const html = '<div data-href="info@site.com"></div>'; 298 const result = extractContactsFromHtml(html, BASE_URL); 299 assert.ok(result.email_addresses.some(e => e.email === 'info@site.com')); 300 }); 301 302 test('false-positive guard: non-email data-email value produces no email', () => { 303 const html = '<div data-email="not-an-email"></div>'; 304 const result = extractContactsFromHtml(html, BASE_URL); 305 assert.equal(result.email_addresses.length, 0); 306 }); 307 }); 308 309 describe('1D: CloudFlare email protection', () => { 310 // CF encodes info@site.com as XOR with key 0x42: 311 // key=42, then each char XOR 0x42 312 // i(69)^42=2b, n(6e)^42=2c, f(66)^42=24, o(6f)^42=2d, @(40)^42=02, 313 // s(73)^42=31, i(69)^42=2b, t(74)^42=36, e(65)^42=27, .(2e)^42=6c, 314 // c(63)^42=21, o(6f)^42=2d, m(6d)^42=2f 315 // → "422b2c242d02312b3627" + "6c212d2f" → full hex below 316 const CF_ENCODED_INFO_AT_SITE_COM = '422b2c242d02312b36276c212d2f'; 317 318 test('decodes data-cfemail attribute to email', () => { 319 const html = `<a href="/cdn-cgi/l/email-protection" data-cfemail="${CF_ENCODED_INFO_AT_SITE_COM}">[email protected]</a>`; 320 const result = extractContactsFromHtml(html, BASE_URL); 321 assert.ok(result.email_addresses.some(e => e.email === 'info@site.com')); 322 }); 323 324 test('source is tagged as cloudflare', () => { 325 const html = `<a data-cfemail="${CF_ENCODED_INFO_AT_SITE_COM}">email</a>`; 326 const result = extractContactsFromHtml(html, BASE_URL); 327 const cf = result.email_addresses.find(e => e.source === 'cloudflare'); 328 assert.ok(cf); 329 }); 330 331 test('false-positive guard: odd-length hex string does not crash', () => { 332 const html = '<a data-cfemail="abc">broken</a>'; 333 assert.doesNotThrow(() => extractContactsFromHtml(html, BASE_URL)); 334 const result = extractContactsFromHtml(html, BASE_URL); 335 assert.equal(result.email_addresses.length, 0); 336 }); 337 338 test('false-positive guard: short hex string produces no email', () => { 339 const html = '<a data-cfemail="0000">short</a>'; 340 const result = extractContactsFromHtml(html, BASE_URL); 341 assert.equal(result.email_addresses.length, 0); 342 }); 343 }); 344 345 describe('1E: CSS direction:rtl reversal', () => { 346 test('reverses span with direction:rtl to extract email', () => { 347 // "info@site.com" reversed = "moc.etis@ofni" 348 const html = '<span style="direction:rtl">moc.etis@ofni</span>'; 349 const result = extractContactsFromHtml(html, BASE_URL); 350 assert.ok(result.email_addresses.some(e => e.email === 'info@site.com')); 351 }); 352 353 test('handles unicode-bidi with direction:rtl', () => { 354 // "contact@site.com" reversed = "moc.etis@tcatnoc" 355 const html = '<div style="unicode-bidi:bidi-override;direction:rtl">moc.etis@tcatnoc</div>'; 356 const result = extractContactsFromHtml(html, BASE_URL); 357 assert.ok(result.email_addresses.some(e => e.email === 'contact@site.com')); 358 }); 359 360 test('handles direction: rtl with spaces around colon', () => { 361 const html = '<span style="direction : rtl">moc.etis@ofni</span>'; 362 const result = extractContactsFromHtml(html, BASE_URL); 363 assert.ok(result.email_addresses.some(e => e.email === 'info@site.com')); 364 }); 365 366 test('false-positive guard: Arabic RTL text without email does not produce email', () => { 367 const html = '<span style="direction:rtl">مرحبا بالعالم</span>'; 368 const result = extractContactsFromHtml(html, BASE_URL); 369 assert.equal(result.email_addresses.length, 0); 370 }); 371 }); 372 373 describe('1F: Plaintext international phone numbers', () => { 374 test('extracts +61 Australian mobile from visible text', () => { 375 const html = '<p>Call us: +61 418 804 934</p>'; 376 const result = extractContactsFromHtml(html, BASE_URL); 377 assert.ok(result.phone_numbers.length > 0); 378 assert.ok(result.phone_numbers.some(p => p.number.replace(/\D/g, '').includes('61418804934'))); 379 }); 380 381 test('extracts +1 US number with parens', () => { 382 const html = '<p>Book now: +1 (555) 123-4567</p>'; 383 const result = extractContactsFromHtml(html, BASE_URL); 384 assert.ok(result.phone_numbers.length > 0); 385 }); 386 387 test('extracts +44 UK number', () => { 388 const html = '<p>Ring us: +44 20 7946 0958</p>'; 389 const result = extractContactsFromHtml(html, BASE_URL); 390 assert.ok(result.phone_numbers.length > 0); 391 }); 392 393 test('source is tagged as text for plaintext phones', () => { 394 const html = '<p>+61 418 804 934</p>'; 395 const result = extractContactsFromHtml(html, BASE_URL); 396 const textPhone = result.phone_numbers.find(p => p.source === 'text'); 397 assert.ok(textPhone); 398 }); 399 400 test('false-positive guard: "Order #1234567890" does not extract phone', () => { 401 const html = '<p>Order #1234567890 confirmed</p>'; 402 const result = extractContactsFromHtml(html, BASE_URL); 403 // No + prefix, so should not be extracted 404 assert.equal(result.phone_numbers.length, 0); 405 }); 406 407 test('false-positive guard: CSS hex color does not extract phone', () => { 408 const html = '<p style="color: #FF6600">Hello</p>'; 409 const result = extractContactsFromHtml(html, BASE_URL); 410 assert.equal(result.phone_numbers.length, 0); 411 }); 412 413 test('false-positive guard: ABN-adjacent number skipped', () => { 414 // ABN 12 345 678 901 — not a phone number 415 const html = '<p>ABN 12 345 678 901</p>'; 416 const result = extractContactsFromHtml(html, BASE_URL); 417 // No + prefix, not extracted 418 assert.equal(result.phone_numbers.length, 0); 419 }); 420 }); 421 422 describe('1G: Unicode homoglyph normalization', () => { 423 test('full-width @ sign (U+FF20) is treated as @', () => { 424 // info@site.com — @ is U+FF20 425 const html = '<p>info\uFF20site.com</p>'; 426 const result = extractContactsFromHtml(html, BASE_URL); 427 assert.ok(result.email_addresses.some(e => e.email === 'info@site.com')); 428 }); 429 430 test('Cyrillic о (U+043E) in local part is normalized to o', () => { 431 // "inf" + Cyrillic о + "@site.com" 432 const html = '<p>inf\u043E@site.com</p>'; 433 const result = extractContactsFromHtml(html, BASE_URL); 434 assert.ok(result.email_addresses.some(e => e.email === 'info@site.com')); 435 }); 436 437 test('Cyrillic і (U+0456) normalized to i', () => { 438 // "nfo" with Cyrillic і at start → "info@site.com" 439 const html = '<p>\u0456nfo@site.com</p>'; 440 const result = extractContactsFromHtml(html, BASE_URL); 441 assert.ok(result.email_addresses.some(e => e.email === 'info@site.com')); 442 }); 443 444 test('false-positive guard: purely Cyrillic word does not produce email', () => { 445 const html = '<p>привет</p>'; 446 const result = extractContactsFromHtml(html, BASE_URL); 447 assert.equal(result.email_addresses.length, 0); 448 }); 449 }); 450 451 describe('countUsableContacts', () => { 452 test('counts emails and phones', () => { 453 const contacts = { 454 email_addresses: [{ email: 'a@b.com' }, { email: 'c@d.com' }], 455 phone_numbers: [{ number: '+61400000000' }], 456 social_profiles: [], 457 }; 458 assert.equal(countUsableContacts(contacts), 3); 459 }); 460 461 test('counts form as 1', () => { 462 const contacts = { 463 email_addresses: [], 464 phone_numbers: [], 465 social_profiles: [], 466 primary_contact_form: { form_url: 'https://example.com/contact' }, 467 }; 468 assert.equal(countUsableContacts(contacts), 1); 469 }); 470 471 test('counts x.com social as usable', () => { 472 const contacts = { 473 email_addresses: [], 474 phone_numbers: [], 475 social_profiles: [{ url: 'https://x.com/acme', label: 'X', usable: true }], 476 }; 477 assert.equal(countUsableContacts(contacts), 1); 478 }); 479 480 test('does not count facebook as usable', () => { 481 const contacts = { 482 email_addresses: [], 483 phone_numbers: [], 484 social_profiles: [{ url: 'https://facebook.com/acme', label: 'Facebook', usable: false }], 485 }; 486 assert.equal(countUsableContacts(contacts), 0); 487 }); 488 489 test('returns 0 for null', () => { 490 assert.equal(countUsableContacts(null), 0); 491 }); 492 });