// social-contact-extractor-browser.test.js
/**
 * Tests for src/utils/social-contact-extractor.js — Playwright browser fallback paths
 *
 * Covers Playwright stealth fallback for LinkedIn, Facebook, Yelp, Instagram.
 * The Outscraper API is NOT available (no OUTSCRAPER_API_KEY), so the extractor
 * falls through to the browser path.
 *
 * Uses a configurable browser mock to exercise page-level behaviour:
 *   - Successful extraction
 *   - HTTP error (status >= 400)
 *   - Null response from goto
 *   - Exception from page operations
 *   - Instagram login wall detection
 */

import { test, describe, mock, beforeEach } from 'node:test';
import assert from 'node:assert/strict';

// ── Configurable browser mock ─────────────────────────────────────────────────

/**
 * Baseline behaviour of the mocked page. Kept in one frozen object so the
 * initial state and the per-test reset can never drift apart.
 */
const DEFAULT_BROWSER_STATE = Object.freeze({
  gotoStatus: 200, // value returned by response.status()
  gotoNull: false, // when true, page.goto() resolves to null
  evaluateResult: null, // value every page.evaluate() call resolves to
  pageHtml: '<html><body>contact@company.com</body></html>',
  throwOnGoto: false, // when true, page.goto() rejects with 'goto error'
  throwOnContent: false, // when true, page.content() rejects with 'content error'
});

// Browser mock state — mutated per test, restored by resetBrowserState().
const browserState = { ...DEFAULT_BROWSER_STATE };

/** Restores every mock-state field to its default value. */
function resetBrowserState() {
  Object.assign(browserState, DEFAULT_BROWSER_STATE);
}

// Mock stealth-browser using the configurable state above. The page object reads
// browserState lazily on each call, so tests may change it right up to the call.
mock.module('../../src/utils/stealth-browser.js', {
  namedExports: {
    createStealthContext: mock.fn(async () => {
      const page = {
        goto: async () => {
          if (browserState.throwOnGoto) throw new Error('goto error');
          if (browserState.gotoNull) return null;
          return { status: () => browserState.gotoStatus };
        },
        evaluate: async () => browserState.evaluateResult,
        content: async () => {
          if (browserState.throwOnContent) throw new Error('content error');
          return browserState.pageHtml;
        },
        close: async () => {},
      };
      return {
        newPage: async () => page,
        close: async () => {},
      };
    }),
    humanScroll: mock.fn(async () => {}),
    randomDelay: mock.fn(async () => {}),
    waitForCloudflare: mock.fn(async () => true),
  },
});

// Mock html-contact-extractor: always yields one fixed email and one fixed phone,
// regardless of the HTML it is given.
mock.module('../../src/utils/html-contact-extractor.js', {
  namedExports: {
    extractContactsFromHtml: mock.fn(() => ({
      email_addresses: [{ email: 'extracted@company.com', label: 'General', source: 'page' }],
      phone_numbers: [{ number: '+61400111111', label: 'General', source: 'page' }],
    })),
  },
});

// Silent logger and empty env loader so importing the extractor has no side effects here.
mock.module('../../src/utils/logger.js', {
  defaultExport: class {
    info() {}
    warn() {}
    error() {}
    debug() {}
    success() {}
  },
});
mock.module('../../src/utils/load-env.js', { defaultExport: {} });

// Ensure OUTSCRAPER_API_KEY is not set — forces fallback to browser
delete process.env.OUTSCRAPER_API_KEY;
delete process.env.ENABLE_SOCIAL_EXTRACTION;

// Import after mocks so the extractor binds to the mocked modules.
const { extractFromSocialProfiles } = await import('../../src/utils/social-contact-extractor.js');

// ── Shared fixtures ───────────────────────────────────────────────────────────

// Minimal browser object — it's only passed through to createStealthContext
const mockBrowser = {};

const WEBSITE_URL = 'https://acme.com';
const LINKEDIN_URL = 'https://www.linkedin.com/company/acme';
const FACEBOOK_URL = 'https://www.facebook.com/AcmePlumbing';
const YELP_URL = 'https://www.yelp.com/biz/test-plumbing';
const INSTAGRAM_URL = 'https://www.instagram.com/acmebiz';

/**
 * Runs the extractor against a single social profile URL.
 *
 * @param {string} profileUrl - Social profile URL to extract from.
 * @param {object|null} [browser=mockBrowser] - Browser handle passed through to the extractor.
 * @returns {Promise<object|null>} Whatever extractFromSocialProfiles resolves to.
 */
function extractVia(profileUrl, browser = mockBrowser) {
  return extractFromSocialProfiles([profileUrl], WEBSITE_URL, browser);
}

/** Common per-test setup: default mock state and no Outscraper key. */
function resetEnvironment() {
  resetBrowserState();
  delete process.env.OUTSCRAPER_API_KEY;
}

// ═══════════════════════════════════════════════════════════════════════════════
// LinkedIn — Playwright fallback
// ═══════════════════════════════════════════════════════════════════════════════

describe('LinkedIn Playwright fallback', () => {
  beforeEach(resetEnvironment);

  test('extracts city from Headquarters field via page.evaluate', async () => {
    browserState.evaluateResult = { Headquarters: 'Melbourne, VIC', Phone: null };

    const result = await extractVia(LINKEDIN_URL);
    assert.ok(result);
    assert.equal(result._city, 'Melbourne');
    assert.equal(result.email_addresses.length, 1); // from html-contact-extractor mock
  });

  test('extracts emails and phones from page content', async () => {
    browserState.evaluateResult = {};

    const result = await extractVia(LINKEDIN_URL);
    assert.ok(result);
    assert.equal(result.email_addresses[0].email, 'extracted@company.com');
    assert.equal(result.phone_numbers[0].number, '+61400111111');
  });

  test('returns null when page status >= 400', async () => {
    browserState.gotoStatus = 404;

    assert.equal(await extractVia(LINKEDIN_URL), null);
  });

  test('returns null when goto returns null response', async () => {
    browserState.gotoNull = true;

    assert.equal(await extractVia(LINKEDIN_URL), null);
  });

  test('returns null when page throws during goto', async () => {
    browserState.throwOnGoto = true;

    assert.equal(await extractVia(LINKEDIN_URL), null);
  });

  test('handles evaluate returning empty object (no Headquarters field found)', async () => {
    // When no keywords are found in the page, evaluate returns {}
    browserState.evaluateResult = {};

    const result = await extractVia(LINKEDIN_URL);
    // Should still return a result with contacts from HTML
    assert.ok(result);
    assert.ok(result.email_addresses.length > 0);
    assert.equal(result._city, undefined); // no headquarters found
  });
});

// ═══════════════════════════════════════════════════════════════════════════════
// Facebook — Playwright fallback
// ═══════════════════════════════════════════════════════════════════════════════

describe('Facebook Playwright fallback', () => {
  beforeEach(resetEnvironment);

  test('extracts contacts from rendered page HTML', async () => {
    const result = await extractVia(FACEBOOK_URL);
    assert.ok(result);
    assert.equal(result.email_addresses[0].email, 'extracted@company.com');
  });

  test('returns null when page status >= 400', async () => {
    browserState.gotoStatus = 403;

    assert.equal(await extractVia(FACEBOOK_URL), null);
  });

  test('returns null when goto returns null', async () => {
    browserState.gotoNull = true;

    assert.equal(await extractVia(FACEBOOK_URL), null);
  });

  test('returns null when page throws', async () => {
    browserState.throwOnGoto = true;

    assert.equal(await extractVia(FACEBOOK_URL), null);
  });
});

// ═══════════════════════════════════════════════════════════════════════════════
// Yelp — Playwright fallback
// ═══════════════════════════════════════════════════════════════════════════════

describe('Yelp Playwright fallback', () => {
  beforeEach(resetEnvironment);

  test('extracts contacts and city from page HTML and evaluate', async () => {
    browserState.evaluateResult = 'Adelaide'; // cityFromLd

    const result = await extractVia(YELP_URL);
    assert.ok(result);
    assert.equal(result._city, 'Adelaide');
    assert.equal(result.email_addresses[0].email, 'extracted@company.com');
  });

  test('does not set _city when evaluate returns null', async () => {
    browserState.evaluateResult = null;

    const result = await extractVia(YELP_URL);
    assert.ok(result);
    assert.equal(result._city, undefined);
  });

  test('returns null when page status >= 400', async () => {
    browserState.gotoStatus = 404;

    assert.equal(await extractVia(YELP_URL), null);
  });

  test('returns null when goto returns null (Cloudflare block)', async () => {
    browserState.gotoNull = true;

    assert.equal(await extractVia(YELP_URL), null);
  });

  test('returns null when page throws', async () => {
    browserState.throwOnGoto = true;

    assert.equal(await extractVia(YELP_URL), null);
  });

  test('handles Cloudflare not resolved (waitForCloudflare returns false)', async () => {
    // The Yelp path is expected to call waitForCloudflare and carry on (with a
    // warning) when it reports false. The shared mock resolves to true by
    // default, so flip it to false for this test only.
    const { waitForCloudflare } = await import('../../src/utils/stealth-browser.js');
    waitForCloudflare.mock.mockImplementation(async () => false);

    try {
      const result = await extractVia(YELP_URL);
      // Even with Cloudflare unresolved, still tries to extract
      assert.ok(result !== undefined); // may be null or object
    } finally {
      // Restore even if the call or the assertion above throws, so the override
      // cannot leak into later tests that share this module-level mock.
      waitForCloudflare.mock.mockImplementation(async () => true);
    }
  });
});

// ═══════════════════════════════════════════════════════════════════════════════
// Instagram — Playwright (always requires browser)
// ═══════════════════════════════════════════════════════════════════════════════

describe('Instagram Playwright extraction', () => {
  beforeEach(resetEnvironment);

  test('extracts contacts when page is accessible', async () => {
    browserState.evaluateResult = false; // isLoginWall = false

    const result = await extractVia(INSTAGRAM_URL);
    assert.ok(result);
    assert.equal(result.email_addresses[0].email, 'extracted@company.com');
  });

  test('returns null when Instagram shows login wall', async () => {
    browserState.evaluateResult = true; // isLoginWall = true

    assert.equal(await extractVia(INSTAGRAM_URL), null);
  });

  test('returns null when page status >= 400', async () => {
    browserState.gotoStatus = 404;
    browserState.evaluateResult = false;

    assert.equal(await extractVia(INSTAGRAM_URL), null);
  });

  test('returns null when goto returns null', async () => {
    browserState.gotoNull = true;

    assert.equal(await extractVia(INSTAGRAM_URL), null);
  });

  test('returns null when page throws during extraction', async () => {
    browserState.throwOnGoto = true;

    assert.equal(await extractVia(INSTAGRAM_URL), null);
  });

  test('returns null when no browser is provided (regardless of ENABLE_SOCIAL_EXTRACTION)', async () => {
    const result = await extractVia(INSTAGRAM_URL, null); // no browser
    assert.equal(result, null);
  });
});