// social-contact-extractor.js
/**
 * Extract contact details (email, phone, city) from social media profile pages.
 *
 * Platforms:
 * - YouTube:   raw HTTP fetch -> parse ytInitialData JSON from page HTML
 * - LinkedIn:  Outscraper API (primary) -> /linkedin/companies -> headquarters city
 *              Playwright stealth (fallback)
 * - Facebook:  Outscraper API (primary) -> /facebook-pages -> email + phone
 *              Playwright stealth (fallback)
 * - Yelp:      Outscraper API (primary) -> /yelp -> phone + city (no CAPTCHA needed)
 *              Playwright stealth + nopecha CAPTCHA (fallback)
 * - Instagram: Playwright stealth -> extractContactsFromHtml on rendered HTML
 *
 * The Playwright fallback runs whenever the Outscraper call yields nothing:
 * no OUTSCRAPER_API_KEY, a non-2xx response, a network error, or an empty
 * result set. It requires a browser instance; without one the platform is
 * skipped.
 *
 * Outscraper API (OUTSCRAPER_API_KEY) is preferred for Yelp/Facebook/LinkedIn:
 * - No browser/CAPTCHA overhead
 * - Returns structured fields (email, phone, city) directly
 * - ~$3/1k requests vs Playwright's operational complexity
 *
 * Returns a contacts_json-compatible partial that merges via mergeExtractedContacts().
 */

import { extractContactsFromHtml } from './html-contact-extractor.js';
import {
  createStealthContext,
  humanScroll,
  randomDelay,
  waitForCloudflare,
} from './stealth-browser.js';
import Logger from './logger.js';

const logger = new Logger('SocialExtract');
const info = (...args) => logger.info(...args);
const warn = (...args) => logger.warn(...args);

// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------

const DESKTOP_UA =
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36';

// Browser-like request headers used for the raw (non-Playwright) YouTube fetch.
const DESKTOP_HEADERS = {
  'User-Agent': DESKTOP_UA,
  Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
  'Accept-Language': 'en-US,en;q=0.9',
  'Sec-Fetch-Dest': 'document',
  'Sec-Fetch-Mode': 'navigate',
  'Sec-Fetch-Site': 'none',
  'Sec-Fetch-User': '?1',
  'Sec-Ch-Ua': '"Chromium";v="122", "Not(A:Brand";v="24", "Google Chrome";v="122"',
  'Sec-Ch-Ua-Mobile': '?0',
  'Sec-Ch-Ua-Platform': '"macOS"',
};

const OUTSCRAPER_BASE = 'https://api.app.outscraper.com';

/**
 * Classify a social profile URL by platform.
 *
 * Matching is a case-insensitive substring test on the whole URL, so
 * scheme-less inputs such as "youtube.com/@name" are classified too.
 *
 * @param {string} url - Profile URL.
 * @returns {'youtube'|'linkedin'|'facebook'|'yelp'|'instagram'|null}
 *   Platform key, or null when the URL is not a string or matches no platform.
 */
function classifyPlatform(url) {
  // Non-string input (e.g. a malformed profile entry) is "unknown", not a crash.
  if (typeof url !== 'string') return null;

  const u = url.toLowerCase();
  if (u.includes('youtube.com') || u.includes('youtu.be')) return 'youtube';
  if (u.includes('linkedin.com/company')) return 'linkedin';
  if (u.includes('facebook.com')) return 'facebook';
  if (u.includes('yelp.com/biz')) return 'yelp';
  if (u.includes('instagram.com')) return 'instagram';
  return null;
}

/**
 * Skip patterns -- personal FB profiles, FB groups, IG intent pages, etc.
 *
 * @param {string} url - Profile URL.
 * @returns {boolean} True when the URL matches a pattern that should not be scraped.
 */
function shouldSkip(url) {
  return /profile\.php\?id=|\/groups\/|\/intent\/|\/share\?|\/login/i.test(url);
}

/**
 * Build an empty contacts partial.
 *
 * @returns {{email_addresses: Array, phone_numbers: Array, social_profiles: Array, key_pages: Array}}
 *   A fresh object with fresh arrays on every call.
 */
function emptyResult() {
  return { email_addresses: [], phone_numbers: [], social_profiles: [], key_pages: [] };
}

// ---------------------------------------------------------------------------
// Outscraper API helper
// ---------------------------------------------------------------------------

/**
 * Call an Outscraper endpoint synchronously (async=false, limit=1).
 *
 * @param {string} endpoint - Path below the Outscraper base URL, e.g. 'yelp'.
 * @param {string} query - Value of the `query` parameter (here: the profile URL).
 * @returns {Promise<Object|null>} The first result object, or null when the API
 *   key is missing, the request fails, or the response carries no data.
 */
async function outscraperFetch(endpoint, query) {
  const apiKey = process.env.OUTSCRAPER_API_KEY;
  if (!apiKey) return null;

  const url = new URL(`${OUTSCRAPER_BASE}/${endpoint}`);
  url.searchParams.set('query', query);
  url.searchParams.set('limit', '1');
  url.searchParams.set('async', 'false');

  try {
    const res = await fetch(url.toString(), {
      headers: { 'X-API-KEY': apiKey },
    });
    if (!res.ok) {
      warn(`Outscraper ${endpoint} HTTP ${res.status} for ${query}`);
      return null;
    }
    const json = await res.json();
    if (json.status !== 'Success' || !json.data?.length) return null;
    // Synchronous results are nested: data[0] is either the object or an array of objects
    const first = Array.isArray(json.data[0]) ? json.data[0][0] : json.data[0];
    return first || null;
  } catch (err) {
    warn(`Outscraper ${endpoint} error for ${query}: ${err.message}`);
    return null;
  }
}

// ---------------------------------------------------------------------------
// Playwright helpers shared by the browser fallbacks
// ---------------------------------------------------------------------------

/**
 * Close a page or browser context without letting a close failure escape.
 *
 * Used from `finally` blocks: a throwing close() there would otherwise
 * replace the already-computed return value with a rejection.
 *
 * @param {{close: Function}|null} closable - Object exposing an async close().
 * @param {string} label - Description used in the warning message.
 * @returns {Promise<void>}
 */
async function closeQuietly(closable, label) {
  if (!closable) return;
  try {
    await closable.close();
  } catch (err) {
    warn(`Failed to close ${label}: ${err.message}`);
  }
}

/**
 * Open `profileUrl` in a fresh stealth context and run `scrape` on the page.
 *
 * The page and the context are always closed, including when opening the
 * page or navigating fails. Errors from createStealthContext itself are not
 * caught here and propagate to the caller.
 *
 * @param {Object} browser - Browser instance passed to createStealthContext.
 * @param {string} platformLabel - Platform name used in the failure warning.
 * @param {string} profileUrl - URL to navigate to.
 * @param {(page: Object) => Promise<Object|null>} scrape - Extraction callback.
 * @returns {Promise<Object|null>} Whatever `scrape` returns; null when the
 *   navigation yields no response or an HTTP status >= 400, or when an error
 *   is thrown while opening, navigating or scraping.
 */
async function withStealthPage(browser, platformLabel, profileUrl, scrape) {
  const context = await createStealthContext(browser, {
    viewport: { width: 1280, height: 720 },
    stealthLevel: 'aggressive',
  });
  let page = null;

  try {
    page = await context.newPage();
    const res = await page.goto(profileUrl, { waitUntil: 'domcontentloaded', timeout: 30000 });
    if (!res || res.status() >= 400) return null;
    return await scrape(page);
  } catch (err) {
    warn(`${platformLabel} extraction failed for ${profileUrl}: ${err.message}`);
    return null;
  } finally {
    await closeQuietly(page, `${platformLabel} page`);
    await closeQuietly(context, `${platformLabel} context`);
  }
}

/**
 * Run the generic email/phone extraction on the page's rendered HTML and
 * append the findings to `result` (mutated in place).
 *
 * @param {Object} page - Page exposing an async content() method.
 * @param {string} profileUrl - Passed through to extractContactsFromHtml.
 * @param {Object} result - Contacts partial created by emptyResult().
 * @returns {Promise<void>}
 */
async function appendRenderedHtmlContacts(page, profileUrl, result) {
  const html = await page.content();
  const contacts = extractContactsFromHtml(html, profileUrl);
  result.email_addresses.push(...contacts.email_addresses);
  result.phone_numbers.push(...contacts.phone_numbers);
}

// ---------------------------------------------------------------------------
// YouTube -- raw HTTP (no browser needed)
// ---------------------------------------------------------------------------

/**
 * Build the channel's /about URL.
 *
 * The suffix is appended to the URL *path*, so a query string or fragment
 * (e.g. "?si=...") is kept intact instead of swallowing the suffix.
 *
 * @param {string} profileUrl - Channel URL.
 * @returns {string} URL whose path ends in /about.
 */
function buildYouTubeAboutUrl(profileUrl) {
  try {
    const parsed = new URL(profileUrl);
    const path = parsed.pathname.replace(/\/+$/, '');
    if (!path.endsWith('/about')) parsed.pathname = `${path}/about`;
    return parsed.toString();
  } catch {
    // Not an absolute URL -- fall back to plain string handling.
    const trimmed = profileUrl.replace(/\/$/, '');
    return trimmed.endsWith('/about') ? trimmed : `${trimmed}/about`;
  }
}

/**
 * Extract contacts from a YouTube channel page via a plain HTTP fetch.
 *
 * Reads the channel description (and country) from the embedded
 * ytInitialData JSON; when that yields no email or phone, falls back to the
 * generic extraction over the full page HTML.
 *
 * @param {string} profileUrl - Channel URL.
 * @returns {Promise<Object|null>} Contacts partial (optionally with `_city`),
 *   or null when the request fails or returns a non-2xx status.
 */
async function extractFromYouTube(profileUrl) {
  // Normalise to /about path to get the aboutChannelViewModel
  const fetchUrl = buildYouTubeAboutUrl(profileUrl);

  try {
    const res = await fetch(fetchUrl, { headers: DESKTOP_HEADERS, redirect: 'follow' });
    if (!res.ok) return null;
    const html = await res.text();

    const result = emptyResult();

    // Parse ytInitialData JSON blob
    const match = html.match(/var\s+ytInitialData\s*=\s*(\{.*?\});\s*<\/script>/s);
    if (match) {
      try {
        const data = JSON.parse(match[1]);

        // Channel description (often has email/phone for SMBs)
        const description =
          findNestedValue(data, 'aboutChannelViewModel', 'description') ||
          data?.metadata?.channelMetadataRenderer?.description ||
          '';

        if (typeof description === 'string' && description) {
          const descContacts = extractContactsFromHtml(
            `<p>${escapeHtml(description)}</p>`,
            profileUrl
          );
          result.email_addresses.push(...descContacts.email_addresses);
          result.phone_numbers.push(...descContacts.phone_numbers);
        }

        // Country -> city fallback
        const country = findNestedValue(data, 'aboutChannelViewModel', 'country');
        if (country) {
          result._city = country; // Will be used as fallback if no city already set
        }
      } catch {
        // Deliberate best-effort: JSON parse (or description extraction) failed
        // -- fall through to regex on raw HTML
      }
    }

    // Fallback: run generic regex on the full HTML
    if (result.email_addresses.length === 0 && result.phone_numbers.length === 0) {
      const htmlContacts = extractContactsFromHtml(html, profileUrl);
      result.email_addresses.push(...htmlContacts.email_addresses);
      result.phone_numbers.push(...htmlContacts.phone_numbers);
    }

    return result;
  } catch (err) {
    warn(`YouTube fetch failed for ${profileUrl}: ${err.message}`);
    return null;
  }
}

/**
 * Recursively find a key in a nested object/array, then return a specific
 * field from it.
 *
 * The search is depth-first in Object.values() order and stops at the first
 * object that contains `targetKey`, even when `field` is absent there (the
 * result is then undefined).
 *
 * @param {*} obj - Value to search; non-objects yield undefined.
 * @param {string} targetKey - Key to look for.
 * @param {string} [field] - Field to read from the matched value; when
 *   omitted the matched value itself is returned.
 * @returns {*} The located value, or undefined.
 */
function findNestedValue(obj, targetKey, field) {
  if (!obj || typeof obj !== 'object') return undefined;
  if (targetKey in obj) return field ? obj[targetKey]?.[field] : obj[targetKey];
  for (const val of Object.values(obj)) {
    const found = findNestedValue(val, targetKey, field);
    if (found !== undefined) return found;
  }
  return undefined;
}

/**
 * Escape the HTML-significant characters &, < and > as entities.
 *
 * @param {string} text - Plain text.
 * @returns {string} Text safe to embed as HTML element content.
 */
function escapeHtml(text) {
  // '&' must be handled first so the entities produced below are not re-escaped.
  return text.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
}

// ---------------------------------------------------------------------------
// LinkedIn -- Outscraper API (primary) / Playwright stealth (fallback)
// ---------------------------------------------------------------------------

/**
 * Extract the headquarters city (and any contacts found in the rendered
 * HTML, fallback path only) from a LinkedIn company page.
 *
 * @param {string} profileUrl - LinkedIn company URL.
 * @param {Object|null} browser - Browser instance for the fallback path.
 * @returns {Promise<Object|null>} Contacts partial, or null when neither path
 *   produced a result.
 */
async function extractFromLinkedIn(profileUrl, browser) {
  // Primary: Outscraper /linkedin/companies — returns headquarters city
  const row = await outscraperFetch('linkedin/companies', profileUrl);
  if (row) {
    const result = emptyResult();
    // headquarters is e.g. "Amberley" or "Amberley, Canterbury"
    const hq = row.headquarters || row.locations?.[0] || '';
    // Guard against non-string payloads: calling .split on one would throw
    // and abort the whole LinkedIn extraction.
    const city = typeof hq === 'string' ? hq.split(',')[0].trim() : '';
    if (city) result._city = city;
    info(` [SOCIAL] LinkedIn via Outscraper: city=${city || 'n/a'}`);
    return result;
  }

  // Fallback: Playwright stealth
  if (!browser) return null;

  return withStealthPage(browser, 'LinkedIn', profileUrl, async (page) => {
    await randomDelay(2000, 3000);
    await humanScroll(page, { distance: 'short' });
    await randomDelay(500, 1000);

    const result = emptyResult();

    // Extract structured company info from rendered text: the value is the
    // line directly following a line that is exactly the keyword.
    const pageInfo = await page.evaluate(() => {
      // eslint-disable-next-line no-undef
      const lines = document.body.innerText.split('\n').map(l => l.trim()).filter(Boolean);
      const found = {};
      const keywords = ['Headquarters', 'Phone'];
      for (const kw of keywords) {
        const idx = lines.findIndex(l => l === kw);
        if (idx > -1 && idx < lines.length - 1) {
          found[kw] = lines[idx + 1];
        }
      }
      return found;
    });
    // NOTE(review): pageInfo.Phone is collected above but not used; phones
    // currently come only from the HTML extraction below.

    // Extract city from Headquarters
    if (pageInfo.Headquarters) {
      // Format: "City, State" or "City, Country"
      const city = pageInfo.Headquarters.split(',')[0]?.trim();
      if (city) result._city = city;
    }

    // Run email/phone regex on full rendered HTML
    await appendRenderedHtmlContacts(page, profileUrl, result);

    return result;
  });
}

// ---------------------------------------------------------------------------
// Facebook -- Outscraper API (primary) / Playwright stealth (fallback)
// ---------------------------------------------------------------------------

/**
 * Extract email and phone from a Facebook page.
 *
 * @param {string} profileUrl - Facebook page URL.
 * @param {Object|null} browser - Browser instance for the fallback path.
 * @returns {Promise<Object|null>} Contacts partial, or null when neither path
 *   produced a result.
 */
async function extractFromFacebook(profileUrl, browser) {
  // Primary: Outscraper /facebook-pages — returns email + phone as structured fields
  const row = await outscraperFetch('facebook-pages', profileUrl);
  if (row) {
    const result = emptyResult();
    if (row.email) {
      result.email_addresses.push({ email: row.email, label: 'General', source: 'facebook' });
    }
    if (row.phone) {
      // Outscraper returns phone as digits only (e.g. "64800624473") — normalise to +E.164
      const phoneStr = String(row.phone).trim();
      const normalised = phoneStr.startsWith('+') ? phoneStr : `+${phoneStr}`;
      result.phone_numbers.push({ number: normalised, label: 'General', source: 'facebook' });
    }
    info(` [SOCIAL] Facebook via Outscraper: email=${row.email || 'n/a'} phone=${row.phone || 'n/a'}`);
    return result;
  }

  // Fallback: Playwright stealth
  if (!browser) return null;

  return withStealthPage(browser, 'Facebook', profileUrl, async (page) => {
    await randomDelay(2000, 3000);
    await humanScroll(page, { distance: 'medium' });
    await randomDelay(1000, 1500);

    const result = emptyResult();

    // Run regex extraction on full rendered HTML
    await appendRenderedHtmlContacts(page, profileUrl, result);

    return result;
  });
}

// ---------------------------------------------------------------------------
// Yelp -- Outscraper API (primary) / Playwright stealth + nopecha (fallback)
// ---------------------------------------------------------------------------

/**
 * Extract phone and city from a Yelp business page.
 *
 * @param {string} profileUrl - Yelp business URL.
 * @param {Object|null} browser - Browser instance for the fallback path.
 * @returns {Promise<Object|null>} Contacts partial, or null when neither path
 *   produced a result.
 */
async function extractFromYelp(profileUrl, browser) {
  // Primary: Outscraper /yelp — returns phone + city, no CAPTCHA
  const row = await outscraperFetch('yelp', profileUrl);
  if (row) {
    const result = emptyResult();
    if (row.phone) {
      result.phone_numbers.push({ number: row.phone, label: 'General', source: 'yelp' });
    }
    if (row.city) result._city = row.city;
    info(` [SOCIAL] Yelp via Outscraper: phone=${row.phone || 'n/a'} city=${row.city || 'n/a'}`);
    return result;
  }

  // Fallback: Playwright stealth + nopecha CAPTCHA
  if (!browser) return null;

  return withStealthPage(browser, 'Yelp', profileUrl, async (page) => {
    // Wait for Cloudflare/CAPTCHA -- nopecha should auto-solve
    const cfResolved = await waitForCloudflare(page, { timeout: 45000 });
    if (!cfResolved) {
      // Best-effort: keep going and extract whatever the page shows.
      warn(`Yelp Cloudflare challenge not resolved for ${profileUrl}`);
    }

    await randomDelay(2000, 3000);
    await humanScroll(page, { distance: 'short' });
    await randomDelay(500, 1000);

    const result = emptyResult();

    // Run regex extraction on rendered HTML
    await appendRenderedHtmlContacts(page, profileUrl, result);

    // Try to extract city from JSON-LD LocalBusiness schema. Each script may
    // hold a single object, an object with an @graph list, or a top-level array.
    const cityFromLd = await page.evaluate(() => {
      // eslint-disable-next-line no-undef
      const ldScripts = document.querySelectorAll('script[type="application/ld+json"]');
      for (const script of ldScripts) {
        try {
          const data = JSON.parse(script.textContent);
          const roots = Array.isArray(data) ? data : [data];
          for (const root of roots) {
            const graph = Array.isArray(root?.['@graph']) ? root['@graph'] : [];
            for (const item of [root, ...graph]) {
              const locality = item?.address?.addressLocality;
              if (locality) return locality;
            }
          }
        } catch { /* ignore parse errors */ }
      }
      return null;
    });
    if (cityFromLd) result._city = cityFromLd;

    return result;
  });
}

// ---------------------------------------------------------------------------
// Instagram -- Playwright stealth (best-effort, login walls frequent)
// ---------------------------------------------------------------------------

/**
 * Extract contacts from the rendered HTML of an Instagram profile.
 *
 * @param {string} profileUrl - Instagram profile URL.
 * @param {Object|null} browser - Browser instance (required).
 * @returns {Promise<Object|null>} Contacts partial, or null when there is no
 *   browser, the page shows a login wall, or extraction fails.
 */
async function extractFromInstagram(profileUrl, browser) {
  if (!browser) return null;

  return withStealthPage(browser, 'Instagram', profileUrl, async (page) => {
    await randomDelay(2000, 3000);
    await humanScroll(page, { distance: 'short' });
    await randomDelay(500, 1000);

    // Check for login wall -- if present, bail out. Heuristic: a short page
    // that mentions both "Log in" and "Sign up".
    const isLoginWall = await page.evaluate(() => {
      // eslint-disable-next-line no-undef
      const text = document.body.innerText || '';
      return text.includes('Log in') && text.includes('Sign up') && text.length < 5000;
    });
    if (isLoginWall) {
      info(` Instagram login wall for ${profileUrl} -- skipping`);
      return null;
    }

    const result = emptyResult();

    // Run regex extraction on rendered HTML
    await appendRenderedHtmlContacts(page, profileUrl, result);

    return result;
  });
}

// ---------------------------------------------------------------------------
// Main entry point
// ---------------------------------------------------------------------------

// needsBrowser: true means the platform has no browser-free path and is
// skipped outright when no browser is supplied.
const PLATFORM_EXTRACTORS = {
  youtube: { fn: extractFromYouTube, needsBrowser: false },
  linkedin: { fn: extractFromLinkedIn, needsBrowser: false },
  facebook: { fn: extractFromFacebook, needsBrowser: false },
  yelp: { fn: extractFromYelp, needsBrowser: false },
  instagram: { fn: extractFromInstagram, needsBrowser: true },
};

// Exported for testing
export { classifyPlatform, shouldSkip, emptyResult, findNestedValue, escapeHtml };

/**
 * Extract contact details from social media profile pages.
 *
 * Profiles are processed sequentially. A failure on one profile is logged
 * and does not stop the others. Extraction is disabled entirely when the
 * environment variable ENABLE_SOCIAL_EXTRACTION is the string 'false'.
 *
 * @param {Array} socialProfiles - Array of {url, label} or plain URL strings
 * @param {string} siteUrl - The prospect's website URL (for logging)
 * @param {Browser|null} browser - Playwright browser instance (fallback for non-Outscraper paths)
 * @returns {Promise<Object|null>} Contacts partial with email_addresses, phone_numbers, _city;
 *   null when there is nothing to process or no profile yielded a result
 */
export async function extractFromSocialProfiles(socialProfiles, siteUrl, browser = null) {
  if (!socialProfiles?.length) return null;
  if (process.env.ENABLE_SOCIAL_EXTRACTION === 'false') return null;

  const merged = emptyResult();
  let cityFound = null;
  let platformsProcessed = 0;

  for (const sp of socialProfiles) {
    // Tolerate null/undefined entries instead of throwing on `.url`.
    const url = typeof sp === 'string' ? sp : sp?.url;
    if (!url) continue;

    const platform = classifyPlatform(url);
    if (!platform) continue;
    if (shouldSkip(url)) continue;

    const extractor = PLATFORM_EXTRACTORS[platform];
    if (!extractor) continue;

    // Instagram still needs a browser; others use Outscraper (or free HTTP)
    if (extractor.needsBrowser && !browser) continue;

    try {
      info(` [SOCIAL] Extracting from ${platform}: ${url}`);
      const result = await extractor.fn(url, browser);

      if (result) {
        merged.email_addresses.push(...result.email_addresses);
        merged.phone_numbers.push(...result.phone_numbers);
        // First city wins; later profiles never override it.
        if (result._city && !cityFound) cityFound = result._city;
        platformsProcessed++;
      }
    } catch (err) {
      warn(` [SOCIAL] ${platform} error for ${url}: ${err.message}`);
    }
  }

  if (platformsProcessed === 0) return null;

  info(` [SOCIAL] Processed ${platformsProcessed} profiles for ${siteUrl}: ${merged.email_addresses.length} emails, ${merged.phone_numbers.length} phones`);

  // Attach city as a top-level field for the caller to merge
  if (cityFound) merged._city = cityFound;

  return merged;
}