// html-contact-extractor.test.js
/**
 * Tests for src/utils/html-contact-extractor.js
 *
 * Covers: extractContactsFromHtml, mergeExtractedContacts, countUsableContacts
 *
 * Pure regex/logic — no DB, no LLM, no external dependencies.
 */

import { test, describe } from 'node:test';
import assert from 'node:assert/strict';

import {
  extractContactsFromHtml,
  mergeExtractedContacts,
  countUsableContacts,
} from '../../src/utils/html-contact-extractor.js';

/**
 * Builds a `data-cfemail` attribute value for use as a test fixture.
 *
 * The output is a hex string whose first byte is the key and whose remaining
 * bytes are each character code of `email` XOR-ed with that key.
 *
 * @param {string} email - Plain address to encode.
 * @param {number} key - Single-byte XOR key.
 * @returns {string} Lowercase hex string (key byte followed by encoded bytes).
 */
function encodeCfEmail(email, key) {
  const toHex = byte => byte.toString(16).padStart(2, '0');
  let hex = toHex(key);
  for (const c of email) {
    hex += toHex(c.charCodeAt(0) ^ key);
  }
  return hex;
}

// ─── extractContactsFromHtml ──────────────────────────────────────────────────

describe('extractContactsFromHtml', () => {
  describe('guard clauses', () => {
    test('returns empty result for null', () => {
      const r = extractContactsFromHtml(null, 'https://example.com');
      assert.deepEqual(r.email_addresses, []);
      assert.deepEqual(r.phone_numbers, []);
      assert.deepEqual(r.social_profiles, []);
      assert.deepEqual(r.key_pages, []);
      assert.equal(r.has_contact_form, false);
    });

    test('returns empty result for "HTML removed after scoring"', () => {
      const r = extractContactsFromHtml('HTML removed after scoring', 'https://example.com');
      assert.equal(r.email_addresses.length, 0);
    });

    test('returns empty result for empty string', () => {
      const r = extractContactsFromHtml('', 'https://example.com');
      assert.equal(r.email_addresses.length, 0);
    });
  });

  describe('email extraction — mailto: hrefs', () => {
    test('extracts email from mailto: href', () => {
      const html = '<a href="mailto:info@acme.com">Contact</a>';
      const r = extractContactsFromHtml(html, 'https://acme.com');
      assert.equal(r.email_addresses.length, 1);
      assert.equal(r.email_addresses[0].email, 'info@acme.com');
      assert.equal(r.email_addresses[0].source, 'mailto:href');
    });

    test('deduplicates emails across sources', () => {
      const html = '<a href="mailto:info@acme.com">Contact</a> <p>info@acme.com</p>';
      const r = extractContactsFromHtml(html, 'https://acme.com');
      assert.equal(r.email_addresses.length, 1);
    });

    test('lowercases email addresses', () => {
      const html = '<a href="mailto:INFO@Acme.COM">email</a>';
      const r = extractContactsFromHtml(html, 'https://acme.com');
      assert.equal(r.email_addresses[0].email, 'info@acme.com');
    });
  });

  describe('email extraction — plaintext', () => {
    test('extracts plaintext email from body text', () => {
      const html = '<p>Email us: hello@acme-business.com</p>';
      const r = extractContactsFromHtml(html, 'https://acme-business.com');
      const emails = r.email_addresses.map(e => e.email);
      assert.ok(emails.includes('hello@acme-business.com'));
    });

    test('filters out noise domains (schema.org)', () => {
      const html = '<p>See schema.org: info@schema.org</p>';
      const r = extractContactsFromHtml(html, 'https://mysite.com');
      const emails = r.email_addresses.map(e => e.email);
      assert.ok(!emails.includes('info@schema.org'));
    });

    test('filters out image extension fake emails', () => {
      const html = '<img src="background.jpg@cdn.com">';
      const r = extractContactsFromHtml(html, 'https://mysite.com');
      assert.equal(r.email_addresses.length, 0);
    });
  });

  describe('email extraction — HTML entity decoding', () => {
    test('decodes &#64; entity for @', () => {
      // The fixture must contain the literal character reference, not a plain
      // "@", otherwise the entity-decoding path is never exercised.
      const html = '<p>info&#64;acme.com</p>';
      const r = extractContactsFromHtml(html, 'https://acme.com');
      const emails = r.email_addresses.map(e => e.email);
      assert.ok(emails.includes('info@acme.com'), `got: ${JSON.stringify(emails)}`);
    });

    test('decodes [at] obfuscation', () => {
      const html = '<p>info [at] acme.com</p>';
      const r = extractContactsFromHtml(html, 'https://acme.com');
      const emails = r.email_addresses.map(e => e.email);
      assert.ok(emails.includes('info@acme.com'));
    });

    test('decodes (at) obfuscation', () => {
      const html = '<p>info(at)acme.com</p>';
      const r = extractContactsFromHtml(html, 'https://acme.com');
      const emails = r.email_addresses.map(e => e.email);
      assert.ok(emails.includes('info@acme.com'));
    });

    test('decodes [dot] obfuscation', () => {
      const html = '<p>info [at] acme [dot] com</p>';
      const r = extractContactsFromHtml(html, 'https://acme.com');
      const emails = r.email_addresses.map(e => e.email);
      assert.ok(emails.includes('info@acme.com'), `got: ${JSON.stringify(emails)}`);
    });
  });

  describe('email extraction — CloudFlare protection', () => {
    test('decodes CloudFlare cfemail', () => {
      // Fixture: 'info@test.com' encoded with key 0x10 (key byte first, then
      // every character XOR-ed with the key).
      const hex = encodeCfEmail('info@test.com', 0x10);
      const html = `<a href="/cdn-cgi/l/email-protection" data-cfemail="${hex}">[email protected]</a>`;
      const r = extractContactsFromHtml(html, 'https://test.com');
      const emails = r.email_addresses.map(e => e.email);
      assert.ok(emails.includes('info@test.com'), `got: ${JSON.stringify(emails)}`);
    });
  });

  describe('phone extraction', () => {
    test('extracts phone from tel: href', () => {
      const html = '<a href="tel:+61412345678">Call us</a>';
      const r = extractContactsFromHtml(html, 'https://acme.com.au');
      assert.equal(r.phone_numbers.length, 1);
      assert.equal(r.phone_numbers[0].number, '+61412345678');
      assert.equal(r.phone_numbers[0].source, 'tel:href');
    });

    test('extracts international plaintext phone with + prefix', () => {
      const html = '<p>Call us on +44 20 7946 0958</p>';
      const r = extractContactsFromHtml(html, 'https://acme.co.uk');
      assert.ok(r.phone_numbers.length >= 1);
    });

    test('deduplicates phone numbers', () => {
      const html = '<a href="tel:+61412345678">Call</a> <p>+61 412 345 678</p>';
      const r = extractContactsFromHtml(html, 'https://acme.com.au');
      // Both should resolve to same normalized number, so only 1
      // (tel:href normalizes, text normalizes — may or may not dedup depending on format)
      assert.ok(r.phone_numbers.length >= 1);
    });

    test('ignores numbers preceded by noise words (ABN)', () => {
      const html = '<p>ABN 123456789</p>';
      const r = extractContactsFromHtml(html, 'https://acme.com.au');
      // The plaintext phone regex requires a + prefix, so the ABN digits must
      // never surface as a phone number. Compare on digits only so formatting
      // differences in the returned number cannot hide a false positive.
      const digitStrings = r.phone_numbers.map(p => String(p.number).replace(/\D/g, ''));
      assert.ok(
        !digitStrings.some(d => d.includes('123456789')),
        `ABN should not be extracted as a phone, got: ${JSON.stringify(r.phone_numbers)}`
      );
    });

    test('rejects phone with fewer than 6 digits', () => {
      const html = '<a href="tel:+123">Short</a>';
      const r = extractContactsFromHtml(html, 'https://acme.com');
      assert.equal(r.phone_numbers.length, 0);
    });
  });

  describe('social profile extraction', () => {
    test('extracts X.com profile', () => {
      const html = '<a href="https://x.com/acmecorp">Follow</a>';
      const r = extractContactsFromHtml(html, 'https://acme.com');
      const xProfile = r.social_profiles.find(s => s.url.includes('x.com'));
      assert.ok(xProfile);
      assert.equal(xProfile.label, 'X');
      assert.equal(xProfile.usable, true);
    });

    test('extracts Twitter.com profile', () => {
      const html = '<a href="https://twitter.com/acmecorp">Follow</a>';
      const r = extractContactsFromHtml(html, 'https://acme.com');
      const tProfile = r.social_profiles.find(s => s.url.includes('twitter.com'));
      assert.ok(tProfile);
      assert.equal(tProfile.label, 'Twitter');
      assert.equal(tProfile.usable, true);
    });

    test('extracts LinkedIn company profile', () => {
      const html = '<a href="https://linkedin.com/company/acme-corp">LinkedIn</a>';
      const r = extractContactsFromHtml(html, 'https://acme.com');
      const li = r.social_profiles.find(s => s.url.includes('linkedin.com'));
      assert.ok(li);
      assert.equal(li.label, 'LinkedIn');
      assert.equal(li.usable, true);
    });

    test('extracts Facebook profile (usable=false)', () => {
      const html = '<a href="https://facebook.com/acmecorp">Facebook</a>';
      const r = extractContactsFromHtml(html, 'https://acme.com');
      const fb = r.social_profiles.find(s => s.url.includes('facebook.com'));
      assert.ok(fb);
      assert.equal(fb.usable, false);
    });

    test('excludes X.com /i/ tracking paths', () => {
      // /i/ is an excluded tracking path prefix
      const html = '<a href="https://x.com/i/some-tracking-url">Track</a>';
      const r = extractContactsFromHtml(html, 'https://acme.com');
      // Match both a URL truncated to ".../i" and one that keeps the full
      // "/i/..." path, so neither form can slip through unnoticed.
      const iProfile = r.social_profiles.find(s => /x\.com\/i(?:\/|$)/.test(s.url));
      assert.ok(
        !iProfile,
        `should not have x.com/i profile, got: ${JSON.stringify(r.social_profiles)}`
      );
    });

    test('excludes LinkedIn sharing paths', () => {
      const html = '<a href="https://linkedin.com/sharing/share-offsite/?url=foo">Share</a>';
      const r = extractContactsFromHtml(html, 'https://acme.com');
      // sharing/ path should be excluded
      const sharing = r.social_profiles.find(s => s.url.includes('sharing'));
      assert.ok(!sharing);
    });

    test('deduplicates social profiles', () => {
      // Same profile with and without a trailing slash.
      const html =
        '<a href="https://x.com/acmecorp">X1</a><a href="https://x.com/acmecorp/">X2</a>';
      const r = extractContactsFromHtml(html, 'https://acme.com');
      const xProfiles = r.social_profiles.filter(s => s.url.includes('x.com'));
      assert.equal(xProfiles.length, 1);
    });
  });

  describe('key pages extraction', () => {
    test('extracts contact page links', () => {
      const html = '<a href="https://acme.com/contact-us">Contact</a>';
      const r = extractContactsFromHtml(html, 'https://acme.com');
      assert.ok(r.key_pages.length >= 1);
      assert.ok(r.key_pages.some(p => p.includes('contact')));
    });

    test('extracts about page links', () => {
      const html = '<a href="https://acme.com/about">About Us</a>';
      const r = extractContactsFromHtml(html, 'https://acme.com');
      assert.ok(r.key_pages.some(p => p.includes('about')));
    });

    test('excludes cross-domain links', () => {
      const html = '<a href="https://otherdomain.com/contact">Contact</a>';
      const r = extractContactsFromHtml(html, 'https://acme.com');
      assert.ok(!r.key_pages.some(p => p.includes('otherdomain')));
    });

    test('resolves relative hrefs to absolute', () => {
      const html = '<a href="/contact">Contact</a>';
      const r = extractContactsFromHtml(html, 'https://acme.com');
      assert.ok(r.key_pages.some(p => p.startsWith('https://acme.com/contact')));
    });

    test('deduplicates key pages', () => {
      // Same page with and without a trailing slash.
      const html =
        '<a href="https://acme.com/contact">C1</a><a href="https://acme.com/contact/">C2</a>';
      const r = extractContactsFromHtml(html, 'https://acme.com');
      const contactPages = r.key_pages.filter(p => p.includes('contact'));
      assert.equal(contactPages.length, 1);
    });
  });

  describe('contact form detection', () => {
    test('detects <form> tag', () => {
      const html = '<html><body><form method="POST"><input name="email"></form></body></html>';
      const r = extractContactsFromHtml(html, 'https://acme.com');
      assert.equal(r.has_contact_form, true);
    });

    test('detects WP contact-form-7 plugin', () => {
      const html = '<div class="contact-form-7"><form></form></div>';
      const r = extractContactsFromHtml(html, 'https://acme.com');
      assert.equal(r.has_contact_form, true);
    });

    test('detects wpforms plugin', () => {
      const html = '<div class="wpforms-container"><form></form></div>';
      const r = extractContactsFromHtml(html, 'https://acme.com');
      assert.equal(r.has_contact_form, true);
    });

    test('returns false when no form present', () => {
      const html = '<html><body><p>No forms here</p></body></html>';
      const r = extractContactsFromHtml(html, 'https://acme.com');
      assert.equal(r.has_contact_form, false);
    });
  });
});

// ─── mergeExtractedContacts ────────────────────────────────────────────────────

describe('mergeExtractedContacts', () => {
  test('merges into empty existing contacts', () => {
    const extracted = {
      email_addresses: [{ email: 'info@acme.com', label: 'General', source: 'mailto:href' }],
      phone_numbers: [{ number: '+61412345678', label: 'General', source: 'tel:href' }],
      social_profiles: [],
      key_pages: [],
      has_contact_form: false,
    };
    const result = mergeExtractedContacts(null, extracted, 'https://acme.com');
    assert.equal(result.email_addresses.length, 1);
    assert.equal(result.phone_numbers.length, 1);
  });

  test('does not duplicate existing emails', () => {
    const existing = {
      email_addresses: [{ email: 'info@acme.com', label: 'Work', source: 'llm' }],
      phone_numbers: [],
      social_profiles: [],
      key_pages: [],
    };
    const extracted = {
      email_addresses: [{ email: 'info@acme.com', label: 'General', source: 'mailto:href' }],
      phone_numbers: [],
      social_profiles: [],
      key_pages: [],
      has_contact_form: false,
    };
    const result = mergeExtractedContacts(existing, extracted, 'https://acme.com');
    assert.equal(result.email_addresses.length, 1);
    // Original entry should be kept
    assert.equal(result.email_addresses[0].source, 'llm');
  });

  test('adds new emails not in existing', () => {
    const existing = {
      email_addresses: [{ email: 'old@acme.com', label: 'Work', source: 'llm' }],
      phone_numbers: [],
      social_profiles: [],
      key_pages: [],
    };
    const extracted = {
      email_addresses: [{ email: 'new@acme.com', label: 'General', source: 'mailto:href' }],
      phone_numbers: [],
      social_profiles: [],
      key_pages: [],
      has_contact_form: false,
    };
    const result = mergeExtractedContacts(existing, extracted, 'https://acme.com');
    assert.equal(result.email_addresses.length, 2);
  });

  test('sets primary_contact_form when has_contact_form=true and none exists', () => {
    const existing = { email_addresses: [], phone_numbers: [], social_profiles: [], key_pages: [] };
    const extracted = {
      email_addresses: [],
      phone_numbers: [],
      social_profiles: [],
      key_pages: [],
      has_contact_form: true,
    };
    const result = mergeExtractedContacts(existing, extracted, 'https://acme.com/contact');
    assert.ok(result.primary_contact_form?.form_url);
    assert.equal(result.primary_contact_form.form_url, 'https://acme.com/contact');
  });

  test('does not overwrite existing primary_contact_form', () => {
    const existing = {
      email_addresses: [],
      phone_numbers: [],
      social_profiles: [],
      key_pages: [],
      primary_contact_form: { form_url: 'https://acme.com/old-form', form_action_url: null },
    };
    const extracted = {
      email_addresses: [],
      phone_numbers: [],
      social_profiles: [],
      key_pages: [],
      has_contact_form: true,
    };
    const result = mergeExtractedContacts(existing, extracted, 'https://acme.com/new-form');
    assert.equal(result.primary_contact_form.form_url, 'https://acme.com/old-form');
  });

  test('deduplicates phones by normalized value', () => {
    const existing = {
      email_addresses: [],
      phone_numbers: [{ number: '+61412345678', label: 'Work', source: 'llm' }],
      social_profiles: [],
      key_pages: [],
    };
    const extracted = {
      email_addresses: [],
      phone_numbers: [{ number: '+61412345678', label: 'General', source: 'tel:href' }],
      social_profiles: [],
      key_pages: [],
      has_contact_form: false,
    };
    const result = mergeExtractedContacts(existing, extracted, 'https://acme.com');
    assert.equal(result.phone_numbers.length, 1);
  });

  test('merges social profiles without duplicates', () => {
    const existing = {
      email_addresses: [],
      phone_numbers: [],
      social_profiles: [{ url: 'https://x.com/acme', label: 'X', usable: true }],
      key_pages: [],
    };
    const extracted = {
      email_addresses: [],
      phone_numbers: [],
      social_profiles: [
        { url: 'https://x.com/acme', label: 'X', usable: true },
        { url: 'https://linkedin.com/company/acme', label: 'LinkedIn', usable: true },
      ],
      key_pages: [],
      has_contact_form: false,
    };
    const result = mergeExtractedContacts(existing, extracted, 'https://acme.com');
    assert.equal(result.social_profiles.length, 2);
  });

  test('handles existing with string format emails (legacy)', () => {
    const existing = {
      email_addresses: ['info@acme.com'], // legacy string format
      phone_numbers: [],
      social_profiles: [],
      key_pages: [],
    };
    const extracted = {
      email_addresses: [{ email: 'info@acme.com', label: 'General', source: 'mailto:href' }],
      phone_numbers: [],
      social_profiles: [],
      key_pages: [],
      has_contact_form: false,
    };
    const result = mergeExtractedContacts(existing, extracted, 'https://acme.com');
    // Should not duplicate
    assert.equal(result.email_addresses.length, 1);
  });
});

// ─── countUsableContacts ──────────────────────────────────────────────────────

describe('countUsableContacts', () => {
  test('returns 0 for null', () => {
    assert.equal(countUsableContacts(null), 0);
  });

  test('returns 0 for empty contacts', () => {
    assert.equal(
      countUsableContacts({ email_addresses: [], phone_numbers: [], social_profiles: [] }),
      0
    );
  });

  test('counts email addresses', () => {
    const contacts = {
      email_addresses: [
        { email: 'a@example.com', label: 'Work' },
        { email: 'b@example.com', label: 'General' },
      ],
      phone_numbers: [],
      social_profiles: [],
    };
    assert.equal(countUsableContacts(contacts), 2);
  });

  test('counts phone numbers', () => {
    const contacts = {
      email_addresses: [],
      phone_numbers: [{ number: '+61412345678', label: 'General' }],
      social_profiles: [],
    };
    assert.equal(countUsableContacts(contacts), 1);
  });

  test('counts contact form', () => {
    const contacts = {
      email_addresses: [],
      phone_numbers: [],
      social_profiles: [],
      primary_contact_form: { form_url: 'https://acme.com/contact', form_action_url: null },
    };
    assert.equal(countUsableContacts(contacts), 1);
  });

  test('counts usable social profiles (X, LinkedIn)', () => {
    const contacts = {
      email_addresses: [],
      phone_numbers: [],
      social_profiles: [
        { url: 'https://x.com/acme', label: 'X', usable: true },
        { url: 'https://linkedin.com/company/acme', label: 'LinkedIn', usable: true },
        { url: 'https://facebook.com/acme', label: 'Facebook', usable: false },
      ],
    };
    assert.equal(countUsableContacts(contacts), 2);
  });

  test('counts X/LinkedIn by URL when usable flag is null', () => {
    const contacts = {
      email_addresses: [],
      phone_numbers: [],
      social_profiles: [
        { url: 'https://x.com/acme', label: 'X' }, // no usable flag
        { url: 'https://instagram.com/acme', label: 'Instagram' }, // not usable
      ],
    };
    assert.equal(countUsableContacts(contacts), 1);
  });

  test('counts string-format social profiles by URL', () => {
    const contacts = {
      email_addresses: [],
      phone_numbers: [],
      social_profiles: [
        'https://linkedin.com/company/acme', // legacy string format
        'https://facebook.com/acme', // not usable by URL
      ],
    };
    assert.equal(countUsableContacts(contacts), 1);
  });

  test('counts all types together', () => {
    // One email + one phone + one usable social profile + one contact form.
    const contacts = {
      email_addresses: [{ email: 'info@acme.com', label: 'Work' }],
      phone_numbers: [{ number: '+61412345678', label: 'General' }],
      social_profiles: [{ url: 'https://x.com/acme', label: 'X', usable: true }],
      primary_contact_form: { form_url: 'https://acme.com/contact', form_action_url: null },
    };
    assert.equal(countUsableContacts(contacts), 4);
  });
});