/ src / utils / social-contact-extractor.js
social-contact-extractor.js
  1  /**
  2   * Extract contact details (email, phone, city) from social media profile pages.
  3   *
  4   * Platforms:
  5   *   - YouTube:   raw HTTP fetch -> parse ytInitialData JSON from page HTML
  6   *   - LinkedIn:  Outscraper API (primary) -> /linkedin/companies -> headquarters city
  7   *                Playwright stealth (fallback if no API key)
  8   *   - Facebook:  Outscraper API (primary) -> /facebook-pages -> email + phone
  9   *                Playwright stealth (fallback if no API key)
 10   *   - Yelp:      Outscraper API (primary) -> /yelp -> phone + city (no CAPTCHA needed)
 11   *                Playwright stealth + nopecha CAPTCHA (fallback if no API key)
 12   *   - Instagram: Playwright stealth -> extractContactsFromHtml on rendered HTML
 13   *
 14   * Outscraper API (OUTSCRAPER_API_KEY) is preferred for Yelp/Facebook/LinkedIn:
 15   * - No browser/CAPTCHA overhead
 16   * - Returns structured fields (email, phone, city) directly
 17   * - ~$3/1k requests vs Playwright's operational complexity
 18   *
 19   * Returns a contacts_json-compatible partial that merges via mergeExtractedContacts().
 20   */
 21  
 22  import { extractContactsFromHtml } from './html-contact-extractor.js';
 23  import {
 24    createStealthContext,
 25    humanScroll,
 26    randomDelay,
 27    waitForCloudflare,
 28  } from './stealth-browser.js';
 29  import Logger from './logger.js';
 30  
 31  const logger = new Logger('SocialExtract');
 32  const info = (...args) => logger.info(...args);
 33  const warn = (...args) => logger.warn(...args);
 34  
 35  // ---------------------------------------------------------------------------
 36  // Helpers
 37  // ---------------------------------------------------------------------------
 38  
 39  const DESKTOP_UA =
 40    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36';
 41  
 42  const DESKTOP_HEADERS = {
 43    'User-Agent': DESKTOP_UA,
 44    Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
 45    'Accept-Language': 'en-US,en;q=0.9',
 46    'Sec-Fetch-Dest': 'document',
 47    'Sec-Fetch-Mode': 'navigate',
 48    'Sec-Fetch-Site': 'none',
 49    'Sec-Fetch-User': '?1',
 50    'Sec-Ch-Ua': '"Chromium";v="122", "Not(A:Brand";v="24", "Google Chrome";v="122"',
 51    'Sec-Ch-Ua-Mobile': '?0',
 52    'Sec-Ch-Ua-Platform': '"macOS"',
 53  };
 54  
 55  const OUTSCRAPER_BASE = 'https://api.app.outscraper.com';
 56  
 57  /** Classify a social profile URL by platform. */
 58  function classifyPlatform(url) {
 59    const u = url.toLowerCase();
 60    if (u.includes('youtube.com') || u.includes('youtu.be')) return 'youtube';
 61    if (u.includes('linkedin.com/company')) return 'linkedin';
 62    if (u.includes('facebook.com')) return 'facebook';
 63    if (u.includes('yelp.com/biz')) return 'yelp';
 64    if (u.includes('instagram.com')) return 'instagram';
 65    return null;
 66  }
 67  
 68  /** Skip patterns -- personal FB profiles, FB groups, IG intent pages, etc. */
 69  function shouldSkip(url) {
 70    return /profile\.php\?id=|\/groups\/|\/intent\/|\/share\?|\/login/i.test(url);
 71  }
 72  
 73  /** Build an empty contacts partial. */
 74  function emptyResult() {
 75    return { email_addresses: [], phone_numbers: [], social_profiles: [], key_pages: [] };
 76  }
 77  
 78  // ---------------------------------------------------------------------------
 79  // Outscraper API helper
 80  // ---------------------------------------------------------------------------
 81  
 82  /**
 83   * Call an Outscraper endpoint synchronously.
 84   * Returns the first result object, or null on error/empty.
 85   */
 86  async function outscraperFetch(endpoint, query) {
 87    const apiKey = process.env.OUTSCRAPER_API_KEY;
 88    if (!apiKey) return null;
 89  
 90    const url = new URL(`${OUTSCRAPER_BASE}/${endpoint}`);
 91    url.searchParams.set('query', query);
 92    url.searchParams.set('limit', '1');
 93    url.searchParams.set('async', 'false');
 94  
 95    try {
 96      const res = await fetch(url.toString(), {
 97        headers: { 'X-API-KEY': apiKey },
 98      });
 99      if (!res.ok) {
100        warn(`Outscraper ${endpoint} HTTP ${res.status} for ${query}`);
101        return null;
102      }
103      const json = await res.json();
104      if (json.status !== 'Success' || !json.data?.length) return null;
105      // Synchronous results are nested: data[0] is either the object or an array of objects
106      const first = Array.isArray(json.data[0]) ? json.data[0][0] : json.data[0];
107      return first || null;
108    } catch (err) {
109      warn(`Outscraper ${endpoint} error for ${query}: ${err.message}`);
110      return null;
111    }
112  }
113  
114  // ---------------------------------------------------------------------------
115  // YouTube -- raw HTTP (no browser needed)
116  // ---------------------------------------------------------------------------
117  
/**
 * Extract contacts from a YouTube channel via a plain HTTP fetch of its /about page.
 *
 * The channel description is read from the embedded `ytInitialData` JSON and passed
 * through extractContactsFromHtml; if that yields no emails or phones, the generic
 * extractor is run over the full page HTML instead. The channel's country, when
 * present, is returned as `_city` for the caller to use as a location fallback.
 *
 * @param {string} profileUrl - Absolute YouTube channel URL
 * @returns {Promise<Object|null>} contacts partial (optionally with `_city`), or null
 *   when the URL is invalid, the response is not OK, or the request fails / times out
 */
async function extractFromYouTube(profileUrl) {
  // Same budget as the 30s Playwright navigations used elsewhere in this file.
  const timeoutMs = 30000;

  try {
    // Normalise to the /about path to get the aboutChannelViewModel. Editing the
    // pathname (instead of appending to the raw string) keeps any query string or
    // hash in place, e.g. ".../@acme?si=x" -> ".../@acme/about?si=x".
    const aboutUrl = new URL(profileUrl);
    const basePath = aboutUrl.pathname.replace(/\/$/, '');
    aboutUrl.pathname = basePath.endsWith('/about') ? basePath : `${basePath}/about`;

    const res = await fetch(aboutUrl.toString(), {
      headers: DESKTOP_HEADERS,
      redirect: 'follow',
      signal: AbortSignal.timeout(timeoutMs),
    });
    if (!res.ok) return null;
    const html = await res.text();

    const result = emptyResult();

    // Parse ytInitialData JSON blob
    const match = html.match(/var\s+ytInitialData\s*=\s*(\{.*?\});\s*<\/script>/s);
    if (match) {
      try {
        const data = JSON.parse(match[1]);

        // Channel description (often has email/phone for SMBs)
        const description =
          findNestedValue(data, 'aboutChannelViewModel', 'description') ||
          data?.metadata?.channelMetadataRenderer?.description ||
          '';

        if (typeof description === 'string' && description) {
          const descContacts = extractContactsFromHtml(
            `<p>${escapeHtml(description)}</p>`,
            profileUrl
          );
          result.email_addresses.push(...descContacts.email_addresses);
          result.phone_numbers.push(...descContacts.phone_numbers);
        }

        // Country -> city fallback
        const country = findNestedValue(data, 'aboutChannelViewModel', 'country');
        if (country) {
          result._city = country; // Will be used as fallback if no city already set
        }
      } catch {
        // Best-effort: structured parsing failed -- fall through to regex on raw HTML
      }
    }

    // Fallback: run generic regex on the full HTML
    if (result.email_addresses.length === 0 && result.phone_numbers.length === 0) {
      const htmlContacts = extractContactsFromHtml(html, profileUrl);
      result.email_addresses.push(...htmlContacts.email_addresses);
      result.phone_numbers.push(...htmlContacts.phone_numbers);
    }

    return result;
  } catch (err) {
    warn(`YouTube fetch failed for ${profileUrl}: ${err.message}`);
    return null;
  }
}
174  
/**
 * Depth-first (pre-order) search of a nested object/array for `targetKey`.
 *
 * At the first node that has `targetKey` (per the `in` operator), the result is that
 * node's value for the key, or its `field` property when `field` is truthy. A node that
 * has the key but yields undefined is not searched any deeper; the search carries on
 * with the remaining nodes.
 *
 * @param {*} obj - Root value; non-objects and null yield undefined
 * @param {string} targetKey - Key to look for
 * @param {string} [field] - Optional property to read from the matched value
 * @returns {*} the first defined match, or undefined when nothing matches
 */
function findNestedValue(obj, targetKey, field) {
  // Explicit stack instead of recursion; children are pushed in reverse so they are
  // popped (visited) in their original order.
  const pending = [obj];
  while (pending.length > 0) {
    const node = pending.pop();
    if (!node || typeof node !== 'object') continue;

    if (targetKey in node) {
      const hit = field ? node[targetKey]?.[field] : node[targetKey];
      if (hit !== undefined) return hit;
      continue;
    }

    const children = Object.values(node);
    for (let i = children.length - 1; i >= 0; i -= 1) {
      pending.push(children[i]);
    }
  }
  return undefined;
}
185  
/**
 * Escape the characters &, < and > so text can be embedded in HTML element content.
 * Ampersands are handled first so the entities produced for < and > are not re-escaped.
 *
 * @param {string} text - Raw text
 * @returns {string} text with & -> &amp;, < -> &lt;, > -> &gt;
 */
function escapeHtml(text) {
  const ampersandsEscaped = text.replace(/&/g, '&amp;');
  const openersEscaped = ampersandsEscaped.replace(/</g, '&lt;');
  return openersEscaped.replace(/>/g, '&gt;');
}
189  
190  // ---------------------------------------------------------------------------
191  // LinkedIn -- Outscraper API (primary) / Playwright stealth (fallback)
192  // ---------------------------------------------------------------------------
193  
/**
 * Extract contact details from a LinkedIn company page.
 *
 * Primary path: outscraperFetch('linkedin/companies', ...) -- only the headquarters
 * city is taken from the returned row. Fallback path (no row and a browser supplied):
 * load the page in a stealth context, read the "Headquarters" value from the rendered
 * text, and run extractContactsFromHtml over the rendered HTML.
 *
 * @param {string} profileUrl - LinkedIn company page URL
 * @param {Browser|null} browser - Playwright browser used for the fallback path
 * @returns {Promise<Object|null>} contacts partial (optionally with `_city`), or null
 *   when neither path is available, the page responds with status >= 400, or the
 *   browser extraction fails
 */
async function extractFromLinkedIn(profileUrl, browser) {
  // Primary: Outscraper /linkedin/companies — returns headquarters city
  const row = await outscraperFetch('linkedin/companies', profileUrl);
  if (row) {
    const result = emptyResult();
    // headquarters is e.g. "Amberley" or "Amberley, Canterbury"
    const hq = row.headquarters || row.locations?.[0] || '';
    // Only split string values; a non-string location entry is ignored rather than
    // throwing on .split().
    const city = typeof hq === 'string' ? hq.split(',')[0].trim() : '';
    if (city) result._city = city;
    info(`  [SOCIAL] LinkedIn via Outscraper: city=${city || 'n/a'}`);
    return result;
  }

  // Fallback: Playwright stealth
  if (!browser) return null;
  const context = await createStealthContext(browser, {
    viewport: { width: 1280, height: 720 },
    stealthLevel: 'aggressive',
  });
  let page;

  try {
    // Inside the try so a failing newPage() still releases the context below.
    page = await context.newPage();

    const res = await page.goto(profileUrl, { waitUntil: 'domcontentloaded', timeout: 30000 });
    if (!res || res.status() >= 400) return null;

    await randomDelay(2000, 3000);
    await humanScroll(page, { distance: 'short' });
    await randomDelay(500, 1000);

    const result = emptyResult();

    // Extract structured company info from rendered text: the line following a
    // line that is exactly the keyword is taken as that keyword's value.
    // NOTE(review): 'Phone' is collected here but never read below; phone numbers
    // currently come only from the HTML regex pass.
    const pageInfo = await page.evaluate(() => {
      // eslint-disable-next-line no-undef
      const lines = document.body.innerText.split('\n').map(l => l.trim()).filter(Boolean);
      const found = {};
      const keywords = ['Headquarters', 'Phone'];
      for (const kw of keywords) {
        const idx = lines.findIndex(l => l === kw);
        if (idx > -1 && idx < lines.length - 1) {
          found[kw] = lines[idx + 1];
        }
      }
      return found;
    });

    // Extract city from Headquarters
    if (pageInfo.Headquarters) {
      // Format: "City, State" or "City, Country"
      const city = pageInfo.Headquarters.split(',')[0]?.trim();
      if (city) result._city = city;
    }

    // Run email/phone regex on full rendered HTML
    const html = await page.content();
    const htmlContacts = extractContactsFromHtml(html, profileUrl);
    result.email_addresses.push(...htmlContacts.email_addresses);
    result.phone_numbers.push(...htmlContacts.phone_numbers);

    return result;
  } catch (err) {
    warn(`LinkedIn extraction failed for ${profileUrl}: ${err.message}`);
    return null;
  } finally {
    // Close page and context independently: a failing page.close() must neither leak
    // the context nor replace the extraction result with a cleanup error.
    for (const resource of [page, context]) {
      try {
        await resource?.close();
      } catch (closeErr) {
        warn(`LinkedIn cleanup failed for ${profileUrl}: ${closeErr.message}`);
      }
    }
  }
}
262  
263  // ---------------------------------------------------------------------------
264  // Facebook -- Outscraper API (primary) / Playwright stealth (fallback)
265  // ---------------------------------------------------------------------------
266  
/**
 * Extract contact details from a Facebook page.
 *
 * Primary path: outscraperFetch('facebook-pages', ...) -- `email` and `phone` are read
 * from the returned row. Fallback path (no row and a browser supplied): load the page
 * in a stealth context and run extractContactsFromHtml over the rendered HTML.
 *
 * @param {string} profileUrl - Facebook page URL
 * @param {Browser|null} browser - Playwright browser used for the fallback path
 * @returns {Promise<Object|null>} contacts partial, or null when neither path is
 *   available, the page responds with status >= 400, or the browser extraction fails
 */
async function extractFromFacebook(profileUrl, browser) {
  // Primary: Outscraper /facebook-pages — returns email + phone as structured fields
  const row = await outscraperFetch('facebook-pages', profileUrl);
  if (row) {
    const result = emptyResult();
    if (row.email) {
      result.email_addresses.push({ email: row.email, label: 'General', source: 'facebook' });
    }
    if (row.phone) {
      // Outscraper returns phone as digits only (e.g. "64800624473") — normalise to +E.164
      const phoneStr = String(row.phone).trim();
      // A whitespace-only value would otherwise become the bogus number "+".
      if (phoneStr) {
        const normalised = phoneStr.startsWith('+') ? phoneStr : `+${phoneStr}`;
        result.phone_numbers.push({ number: normalised, label: 'General', source: 'facebook' });
      }
    }
    info(`  [SOCIAL] Facebook via Outscraper: email=${row.email || 'n/a'} phone=${row.phone || 'n/a'}`);
    return result;
  }

  // Fallback: Playwright stealth
  if (!browser) return null;
  const context = await createStealthContext(browser, {
    viewport: { width: 1280, height: 720 },
    stealthLevel: 'aggressive',
  });
  let page;

  try {
    // Inside the try so a failing newPage() still releases the context below.
    page = await context.newPage();

    const res = await page.goto(profileUrl, { waitUntil: 'domcontentloaded', timeout: 30000 });
    if (!res || res.status() >= 400) return null;

    await randomDelay(2000, 3000);
    await humanScroll(page, { distance: 'medium' });
    await randomDelay(1000, 1500);

    const result = emptyResult();

    // Run regex extraction on full rendered HTML
    const html = await page.content();
    const fullContacts = extractContactsFromHtml(html, profileUrl);
    result.email_addresses.push(...fullContacts.email_addresses);
    result.phone_numbers.push(...fullContacts.phone_numbers);

    return result;
  } catch (err) {
    warn(`Facebook extraction failed for ${profileUrl}: ${err.message}`);
    return null;
  } finally {
    // Close page and context independently: a failing page.close() must neither leak
    // the context nor replace the extraction result with a cleanup error.
    for (const resource of [page, context]) {
      try {
        await resource?.close();
      } catch (closeErr) {
        warn(`Facebook cleanup failed for ${profileUrl}: ${closeErr.message}`);
      }
    }
  }
}
318  
319  // ---------------------------------------------------------------------------
320  // Yelp -- Outscraper API (primary) / Playwright stealth + nopecha (fallback)
321  // ---------------------------------------------------------------------------
322  
/**
 * Extract contact details from a Yelp business page.
 *
 * Primary path: outscraperFetch('yelp', ...) -- `phone` and `city` are read from the
 * returned row. Fallback path (no row and a browser supplied): load the page in a
 * stealth context, wait for any Cloudflare challenge, run extractContactsFromHtml over
 * the rendered HTML, and look for a city in the page's JSON-LD blocks.
 *
 * @param {string} profileUrl - Yelp business page URL
 * @param {Browser|null} browser - Playwright browser used for the fallback path
 * @returns {Promise<Object|null>} contacts partial (optionally with `_city`), or null
 *   when neither path is available, the page responds with status >= 400, or the
 *   browser extraction fails
 */
async function extractFromYelp(profileUrl, browser) {
  // Primary: Outscraper /yelp — returns phone + city, no CAPTCHA
  const row = await outscraperFetch('yelp', profileUrl);
  if (row) {
    const result = emptyResult();
    if (row.phone) {
      result.phone_numbers.push({ number: row.phone, label: 'General', source: 'yelp' });
    }
    if (row.city) result._city = row.city;
    info(`  [SOCIAL] Yelp via Outscraper: phone=${row.phone || 'n/a'} city=${row.city || 'n/a'}`);
    return result;
  }

  // Fallback: Playwright stealth + nopecha CAPTCHA
  if (!browser) return null;
  const context = await createStealthContext(browser, {
    viewport: { width: 1280, height: 720 },
    stealthLevel: 'aggressive',
  });
  let page;

  try {
    // Inside the try so a failing newPage() still releases the context below.
    page = await context.newPage();

    const res = await page.goto(profileUrl, { waitUntil: 'domcontentloaded', timeout: 30000 });
    if (!res || res.status() >= 400) return null;

    // Wait for Cloudflare/CAPTCHA -- nopecha should auto-solve
    const cfResolved = await waitForCloudflare(page, { timeout: 45000 });
    if (!cfResolved) {
      // Best-effort: extraction still proceeds on whatever HTML is present.
      warn(`Yelp Cloudflare challenge not resolved for ${profileUrl}`);
    }

    await randomDelay(2000, 3000);
    await humanScroll(page, { distance: 'short' });
    await randomDelay(500, 1000);

    const result = emptyResult();

    // Run regex extraction on rendered HTML
    const html = await page.content();
    const htmlContacts = extractContactsFromHtml(html, profileUrl);
    result.email_addresses.push(...htmlContacts.email_addresses);
    result.phone_numbers.push(...htmlContacts.phone_numbers);

    // Try to extract city from JSON-LD: the first address.addressLocality found on a
    // root node or on one of its @graph items. A JSON-LD block may hold a single
    // object or a top-level array of objects; both are handled.
    const cityFromLd = await page.evaluate(() => {
      // eslint-disable-next-line no-undef
      const ldScripts = document.querySelectorAll('script[type="application/ld+json"]');
      for (const script of ldScripts) {
        try {
          const parsed = JSON.parse(script.textContent);
          const roots = Array.isArray(parsed) ? parsed : [parsed];
          for (const root of roots) {
            const graph = Array.isArray(root?.['@graph']) ? root['@graph'] : [];
            for (const node of [root, ...graph]) {
              const locality = node?.address?.addressLocality;
              if (locality) return locality;
            }
          }
        } catch { /* ignore parse errors */ }
      }
      return null;
    });
    if (cityFromLd) result._city = cityFromLd;

    return result;
  } catch (err) {
    warn(`Yelp extraction failed for ${profileUrl}: ${err.message}`);
    return null;
  } finally {
    // Close page and context independently: a failing page.close() must neither leak
    // the context nor replace the extraction result with a cleanup error.
    for (const resource of [page, context]) {
      try {
        await resource?.close();
      } catch (closeErr) {
        warn(`Yelp cleanup failed for ${profileUrl}: ${closeErr.message}`);
      }
    }
  }
}
394  
395  // ---------------------------------------------------------------------------
396  // Instagram -- Playwright stealth (best-effort, login walls frequent)
397  // ---------------------------------------------------------------------------
398  
/**
 * Extract contact details from an Instagram profile (browser only, best-effort).
 *
 * Loads the profile in a stealth context. If the rendered text looks like a login
 * wall (contains both "Log in" and "Sign up" and is shorter than 5000 characters)
 * the profile is skipped; otherwise extractContactsFromHtml is run over the rendered
 * HTML.
 *
 * @param {string} profileUrl - Instagram profile URL
 * @param {Browser|null} browser - Playwright browser; required
 * @returns {Promise<Object|null>} contacts partial, or null when no browser is given,
 *   the page responds with status >= 400, a login wall is detected, or extraction fails
 */
async function extractFromInstagram(profileUrl, browser) {
  if (!browser) return null;
  const context = await createStealthContext(browser, {
    viewport: { width: 1280, height: 720 },
    stealthLevel: 'aggressive',
  });
  let page;

  try {
    // Inside the try so a failing newPage() still releases the context below.
    page = await context.newPage();

    const res = await page.goto(profileUrl, { waitUntil: 'domcontentloaded', timeout: 30000 });
    if (!res || res.status() >= 400) return null;

    await randomDelay(2000, 3000);
    await humanScroll(page, { distance: 'short' });
    await randomDelay(500, 1000);

    // Check for login wall -- if present, bail out
    const isLoginWall = await page.evaluate(() => {
      // eslint-disable-next-line no-undef
      const text = document.body.innerText || '';
      return text.includes('Log in') && text.includes('Sign up') && text.length < 5000;
    });
    if (isLoginWall) {
      info(`  Instagram login wall for ${profileUrl} -- skipping`);
      return null;
    }

    const result = emptyResult();

    // Run regex extraction on rendered HTML
    const html = await page.content();
    const htmlContacts = extractContactsFromHtml(html, profileUrl);
    result.email_addresses.push(...htmlContacts.email_addresses);
    result.phone_numbers.push(...htmlContacts.phone_numbers);

    return result;
  } catch (err) {
    warn(`Instagram extraction failed for ${profileUrl}: ${err.message}`);
    return null;
  } finally {
    // Close page and context independently: a failing page.close() must neither leak
    // the context nor replace the extraction result with a cleanup error.
    for (const resource of [page, context]) {
      try {
        await resource?.close();
      } catch (closeErr) {
        warn(`Instagram cleanup failed for ${profileUrl}: ${closeErr.message}`);
      }
    }
  }
}
443  
444  // ---------------------------------------------------------------------------
445  // Main entry point
446  // ---------------------------------------------------------------------------
447  
// Dispatch table: platform key -> { fn, needsBrowser }.
// `needsBrowser: true` marks extractors that cannot run without a browser instance;
// the others are attempted even when no browser is supplied.
const PLATFORM_EXTRACTORS = (() => {
  const runsWithoutBrowser = (fn) => ({ fn, needsBrowser: false });
  const requiresBrowser = (fn) => ({ fn, needsBrowser: true });

  return {
    youtube: runsWithoutBrowser(extractFromYouTube),
    linkedin: runsWithoutBrowser(extractFromLinkedIn),
    facebook: runsWithoutBrowser(extractFromFacebook),
    yelp: runsWithoutBrowser(extractFromYelp),
    instagram: requiresBrowser(extractFromInstagram),
  };
})();
455  
456  /**
457   * Extract contact details from social media profile pages.
458   *
459   * @param {Array} socialProfiles - Array of {url, label} or plain URL strings
460   * @param {string} siteUrl - The prospect's website URL (for logging)
461   * @param {Browser|null} browser - Playwright browser instance (fallback for non-Outscraper paths)
462   * @returns {Promise<Object|null>} Contacts partial with email_addresses, phone_numbers, _city
463   */
464  // Exported for testing
465  export { classifyPlatform, shouldSkip, emptyResult, findNestedValue, escapeHtml };
466  
467  export async function extractFromSocialProfiles(socialProfiles, siteUrl, browser = null) {
468    if (!socialProfiles?.length) return null;
469    if (process.env.ENABLE_SOCIAL_EXTRACTION === 'false') return null;
470  
471    const merged = emptyResult();
472    let cityFound = null;
473    let platformsProcessed = 0;
474  
475    for (const sp of socialProfiles) {
476      const url = typeof sp === 'string' ? sp : sp.url;
477      if (!url) continue;
478  
479      const platform = classifyPlatform(url);
480      if (!platform) continue;
481      if (shouldSkip(url)) continue;
482  
483      const extractor = PLATFORM_EXTRACTORS[platform];
484      if (!extractor) continue;
485  
486      // Instagram still needs a browser; others use Outscraper (or free HTTP)
487      if (extractor.needsBrowser && !browser) continue;
488  
489      try {
490        info(`  [SOCIAL] Extracting from ${platform}: ${url}`);
491        const result = await extractor.fn(url, browser);
492  
493        if (result) {
494          merged.email_addresses.push(...result.email_addresses);
495          merged.phone_numbers.push(...result.phone_numbers);
496          if (result._city && !cityFound) cityFound = result._city;
497          platformsProcessed++;
498        }
499      } catch (err) {
500        warn(`  [SOCIAL] ${platform} error for ${url}: ${err.message}`);
501      }
502    }
503  
504    if (platformsProcessed === 0) return null;
505  
506    info(`  [SOCIAL] Processed ${platformsProcessed} profiles for ${siteUrl}: ${merged.email_addresses.length} emails, ${merged.phone_numbers.length} phones`);
507  
508    // Attach city as a top-level field for the caller to merge
509    if (cityFound) merged._city = cityFound;
510  
511    return merged;
512  }