/ tests / utils / social-contact-extractor-browser.test.js
social-contact-extractor-browser.test.js
  1  /**
  2   * Tests for src/utils/social-contact-extractor.js — Playwright browser fallback paths
  3   *
  4   * Covers Playwright stealth fallback for LinkedIn, Facebook, Yelp, Instagram.
  5   * The Outscraper API is NOT available (no OUTSCRAPER_API_KEY), so the extractor
  6   * falls through to the browser path.
  7   *
  8   * Uses a configurable browser mock to exercise page-level behaviour:
  9   *   - Successful extraction
 10   *   - HTTP error (status >= 400)
 11   *   - Null response from goto
 12   *   - Exception from page operations
 13   *   - Instagram login wall detection
 14   */
 15  
 16  import { test, describe, mock, beforeEach } from 'node:test';
 17  import assert from 'node:assert/strict';
 18  
 19  // ── Configurable browser mock ─────────────────────────────────────────────────
 20  
 21  // Browser mock state — mutated per test
 22  const _browserState = {
 23    gotoStatus: 200,
 24    gotoNull: false,
 25    evaluateResult: null,
 26    pageHtml: '<html><body>contact@company.com</body></html>',
 27    throwOnGoto: false,
 28    throwOnContent: false,
 29  };
 30  
 31  function resetBrowserState() {
 32    _browserState.gotoStatus = 200;
 33    _browserState.gotoNull = false;
 34    _browserState.evaluateResult = null;
 35    _browserState.pageHtml = '<html><body>contact@company.com</body></html>';
 36    _browserState.throwOnGoto = false;
 37    _browserState.throwOnContent = false;
 38  }
 39  
 40  // Mock stealth-browser using configurable state
 41  mock.module('../../src/utils/stealth-browser.js', {
 42    namedExports: {
 43      createStealthContext: mock.fn(async () => {
 44        const page = {
 45          goto: async () => {
 46            if (_browserState.throwOnGoto) throw new Error('goto error');
 47            if (_browserState.gotoNull) return null;
 48            return { status: () => _browserState.gotoStatus };
 49          },
 50          evaluate: async () => _browserState.evaluateResult,
 51          content: async () => {
 52            if (_browserState.throwOnContent) throw new Error('content error');
 53            return _browserState.pageHtml;
 54          },
 55          close: async () => {},
 56        };
 57        return {
 58          newPage: async () => page,
 59          close: async () => {},
 60        };
 61      }),
 62      humanScroll: mock.fn(async () => {}),
 63      randomDelay: mock.fn(async () => {}),
 64      waitForCloudflare: mock.fn(async () => true),
 65    },
 66  });
 67  
 68  // Mock html-contact-extractor
 69  mock.module('../../src/utils/html-contact-extractor.js', {
 70    namedExports: {
 71      extractContactsFromHtml: mock.fn(() => ({
 72        email_addresses: [{ email: 'extracted@company.com', label: 'General', source: 'page' }],
 73        phone_numbers: [{ number: '+61400111111', label: 'General', source: 'page' }],
 74      })),
 75    },
 76  });
 77  
 78  mock.module('../../src/utils/logger.js', {
 79    defaultExport: class {
 80      info() {} warn() {} error() {} debug() {} success() {}
 81    },
 82  });
 83  mock.module('../../src/utils/load-env.js', { defaultExport: {} });
 84  
 85  // Ensure OUTSCRAPER_API_KEY is not set — forces fallback to browser
 86  delete process.env.OUTSCRAPER_API_KEY;
 87  delete process.env.ENABLE_SOCIAL_EXTRACTION;
 88  
 89  // Fetch mock (for when Outscraper is tried despite no key - should be skipped)
 90  const origFetch = globalThis.fetch;
 91  
 92  // Import after mocks
 93  const { extractFromSocialProfiles } = await import('../../src/utils/social-contact-extractor.js');
 94  
 95  // ── Browser mock object ───────────────────────────────────────────────────────
 96  
// Placeholder browser handed to extractFromSocialProfiles as its third argument.
// An empty object suffices because the mocked createStealthContext ignores its arguments.
// NOTE(review): assumes the extractor only forwards the browser to createStealthContext — confirm.
const mockBrowser = {};
 99  
100  // ═══════════════════════════════════════════════════════════════════════════════
101  // LinkedIn — Playwright fallback
102  // ═══════════════════════════════════════════════════════════════════════════════
103  
describe('LinkedIn Playwright fallback', () => {
  const LINKEDIN_URL = 'https://www.linkedin.com/company/acme';

  // Runs the extractor for the single LinkedIn profile through the shared mock browser
  const scrapeLinkedIn = () =>
    extractFromSocialProfiles([LINKEDIN_URL], 'https://acme.com', mockBrowser);

  // Start each test from default page behaviour and without an Outscraper key
  beforeEach(() => {
    delete process.env.OUTSCRAPER_API_KEY;
    resetBrowserState();
  });

  test('extracts city from Headquarters field via page.evaluate', async () => {
    _browserState.evaluateResult = { Headquarters: 'Melbourne, VIC', Phone: null };

    const profile = await scrapeLinkedIn();

    assert.ok(profile);
    assert.equal(profile._city, 'Melbourne');
    assert.equal(profile.email_addresses.length, 1); // from html-contact-extractor mock
  });

  test('extracts emails and phones from page content', async () => {
    _browserState.evaluateResult = {};

    const profile = await scrapeLinkedIn();

    assert.ok(profile);
    const [firstEmail] = profile.email_addresses;
    const [firstPhone] = profile.phone_numbers;
    assert.equal(firstEmail.email, 'extracted@company.com');
    assert.equal(firstPhone.number, '+61400111111');
  });

  // Navigation failures: each entry is [test title, state overrides]; all must yield null
  const navigationFailures = [
    ['returns null when page status >= 400', { gotoStatus: 404 }],
    ['returns null when goto returns null response', { gotoNull: true }],
    ['returns null when page throws during goto', { throwOnGoto: true }],
  ];
  for (const [title, overrides] of navigationFailures) {
    test(title, async () => {
      Object.assign(_browserState, overrides);

      assert.equal(await scrapeLinkedIn(), null);
    });
  }

  test('handles evaluate returning empty object (no Headquarters field found)', async () => {
    // When no keywords are found in the page, evaluate returns {}
    _browserState.evaluateResult = {};

    const profile = await scrapeLinkedIn();

    // Contacts from the HTML should still come back, just without a city
    assert.ok(profile);
    assert.ok(profile.email_addresses.length > 0);
    assert.equal(profile._city, undefined); // no headquarters found
  });
});
184  
185  // ═══════════════════════════════════════════════════════════════════════════════
186  // Facebook — Playwright fallback
187  // ═══════════════════════════════════════════════════════════════════════════════
188  
describe('Facebook Playwright fallback', () => {
  const FACEBOOK_URL = 'https://www.facebook.com/AcmePlumbing';

  // Runs the extractor for the single Facebook page through the shared mock browser
  const scrapeFacebook = () =>
    extractFromSocialProfiles([FACEBOOK_URL], 'https://acme.com', mockBrowser);

  // Start each test from default page behaviour and without an Outscraper key
  beforeEach(() => {
    delete process.env.OUTSCRAPER_API_KEY;
    resetBrowserState();
  });

  test('extracts contacts from rendered page HTML', async () => {
    const contacts = await scrapeFacebook();

    assert.ok(contacts);
    const [firstEmail] = contacts.email_addresses;
    assert.equal(firstEmail.email, 'extracted@company.com');
  });

  // Navigation failures: each entry is [test title, state overrides]; all must yield null
  const navigationFailures = [
    ['returns null when page status >= 400', { gotoStatus: 403 }],
    ['returns null when goto returns null', { gotoNull: true }],
    ['returns null when page throws', { throwOnGoto: true }],
  ];
  for (const [title, overrides] of navigationFailures) {
    test(title, async () => {
      Object.assign(_browserState, overrides);

      assert.equal(await scrapeFacebook(), null);
    });
  }
});
238  
239  // ═══════════════════════════════════════════════════════════════════════════════
240  // Yelp — Playwright fallback
241  // ═══════════════════════════════════════════════════════════════════════════════
242  
describe('Yelp Playwright fallback', () => {
  const YELP_URL = 'https://www.yelp.com/biz/test-plumbing';

  // Runs the extractor for the single Yelp listing through the shared mock browser
  const scrapeYelp = () =>
    extractFromSocialProfiles([YELP_URL], 'https://acme.com', mockBrowser);

  // Start each test from default page behaviour and without an Outscraper key
  beforeEach(() => {
    resetBrowserState();
    delete process.env.OUTSCRAPER_API_KEY;
  });

  test('extracts contacts and city from page HTML and evaluate', async () => {
    _browserState.evaluateResult = 'Adelaide'; // cityFromLd

    const result = await scrapeYelp();

    assert.ok(result);
    assert.equal(result._city, 'Adelaide');
    assert.equal(result.email_addresses[0].email, 'extracted@company.com');
  });

  test('does not set _city when evaluate returns null', async () => {
    _browserState.evaluateResult = null;

    const result = await scrapeYelp();

    assert.ok(result);
    assert.equal(result._city, undefined);
  });

  // Navigation failures: each entry is [test title, state overrides]; all must yield null
  const navigationFailures = [
    ['returns null when page status >= 400', { gotoStatus: 404 }],
    ['returns null when goto returns null (Cloudflare block)', { gotoNull: true }],
    ['returns null when page throws', { throwOnGoto: true }],
  ];
  for (const [title, overrides] of navigationFailures) {
    test(title, async () => {
      Object.assign(_browserState, overrides);

      assert.equal(await scrapeYelp(), null);
    });
  }

  test('handles Cloudflare not resolved (waitForCloudflare returns false)', async () => {
    // The Yelp path calls waitForCloudflare; if it returns false, it warns but continues.
    // The import resolves to the module registered with mock.module, so the export is the
    // shared mock.fn whose implementation can be swapped for this one test.
    const { waitForCloudflare } = await import('../../src/utils/stealth-browser.js');
    waitForCloudflare.mock.mockImplementation(async () => false);

    try {
      const result = await scrapeYelp();

      // Even with Cloudflare unresolved, still tries to extract
      assert.ok(result !== undefined); // may be null or object
    } finally {
      // Restore in `finally`: the mock is shared by every test in this file, so a thrown
      // error or failed assertion above must not leave it returning false for later tests.
      waitForCloudflare.mock.mockImplementation(async () => true);
    }
  });
});
327  
328  // ═══════════════════════════════════════════════════════════════════════════════
329  // Instagram — Playwright (always requires browser)
330  // ═══════════════════════════════════════════════════════════════════════════════
331  
describe('Instagram Playwright extraction', () => {
  const INSTAGRAM_URL = 'https://www.instagram.com/acmebiz';

  // Runs the extractor for the single Instagram profile. The browser defaults to the
  // shared mock; passing null explicitly exercises the "no browser" path.
  const scrapeInstagram = (browser = mockBrowser) =>
    extractFromSocialProfiles([INSTAGRAM_URL], 'https://acme.com', browser);

  // Start each test from default page behaviour and without an Outscraper key
  beforeEach(() => {
    delete process.env.OUTSCRAPER_API_KEY;
    resetBrowserState();
  });

  test('extracts contacts when page is accessible', async () => {
    _browserState.evaluateResult = false; // isLoginWall = false

    const contacts = await scrapeInstagram();

    assert.ok(contacts);
    const [firstEmail] = contacts.email_addresses;
    assert.equal(firstEmail.email, 'extracted@company.com');
  });

  // Each entry is [test title, state overrides]; every scenario must yield null
  const nullScenarios = [
    // isLoginWall = true
    ['returns null when Instagram shows login wall', { evaluateResult: true }],
    ['returns null when page status >= 400', { gotoStatus: 404, evaluateResult: false }],
    ['returns null when goto returns null', { gotoNull: true }],
    ['returns null when page throws during extraction', { throwOnGoto: true }],
  ];
  for (const [title, overrides] of nullScenarios) {
    test(title, async () => {
      Object.assign(_browserState, overrides);

      assert.equal(await scrapeInstagram(), null);
    });
  }

  test('returns null when no browser is provided (regardless of ENABLE_SOCIAL_EXTRACTION)', async () => {
    // no browser
    const outcome = await scrapeInstagram(null);

    assert.equal(outcome, null);
  });
});