#!/usr/bin/env node
/**
 * Test Yelp extraction with launchWithExtensions (nopecha CAPTCHA solving).
 *
 * Uses launchWithExtensions directly — returns a BrowserContext with nopecha loaded.
 * The Yelp extractor uses createStealthContext(browser), so we bypass it here
 * and drive the context/page directly, mirroring exactly what extractFromYelp does.
 *
 * Usage: node scripts/test-yelp-nopecha.js
 */

import '../src/utils/load-env.js';
import { readdirSync, readFileSync } from 'fs';
import { join } from 'path';
import {
  launchWithExtensions,
  waitForCloudflare,
  humanScroll,
  randomDelay,
} from '../src/utils/stealth-browser.js';
import { extractContactsFromHtml } from '../src/utils/html-contact-extractor.js';

const contactsDir = join(import.meta.dirname, '../data/contacts');

// Upper bound on how many Yelp pages a single run visits.
const MAX_YELP_URLS = 3;

/**
 * Find the first Yelp business URL in a contact file's `social_profiles` value.
 *
 * Each entry may be a plain URL string or an object with a `url` property.
 * Entries of any other shape are ignored rather than aborting the scan, so a
 * single malformed entry cannot hide a valid Yelp URL that follows it.
 *
 * @param {unknown} profiles - Value of `social_profiles` from a parsed contact file.
 * @returns {string|null} The first URL containing `yelp.com/biz`, or null if none.
 */
function findYelpUrl(profiles) {
  if (!Array.isArray(profiles)) return null;
  for (const profile of profiles) {
    const candidate = profile?.url || profile;
    if (typeof candidate === 'string' && candidate.includes('yelp.com/biz')) {
      return candidate;
    }
  }
  return null;
}

// Collect up to MAX_YELP_URLS Yelp URLs (at most one per contact file).
const yelpUrls = [];
for (const file of readdirSync(contactsDir).filter((f) => f.endsWith('.json'))) {
  if (yelpUrls.length >= MAX_YELP_URLS) break;
  try {
    const data = JSON.parse(readFileSync(join(contactsDir, file), 'utf8'));
    const url = findYelpUrl(data?.social_profiles);
    if (url) yelpUrls.push({ url, siteId: file });
  } catch (err) {
    // Best effort: an unreadable or malformed file is skipped, but not silently.
    console.warn(`Skipping ${file}: ${err.message}`);
  }
}

console.log(`Found ${yelpUrls.length} Yelp URLs:`);
for (const u of yelpUrls) console.log(`  ${u.url} (${u.siteId})`);

console.log('\nLaunching browser with extensions (nopecha)...');
const { context, close, hasNopeCHA } = await launchWithExtensions({
  headless: true,
  stealthLevel: 'aggressive',
});
console.log(`hasNopeCHA: ${hasNopeCHA}`);

try {
  // Pages are visited one at a time; a failure on one URL is logged and the
  // run continues with the next.
  for (const { url } of yelpUrls) {
    console.log(`\n--- Testing: ${url} ---`);
    const page = await context.newPage();
    try {
      const res = await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 });
      console.log(`  HTTP status: ${res?.status()}`);

      const cfResolved = await waitForCloudflare(page, { timeout: 45000 });
      console.log(`  Cloudflare resolved: ${cfResolved}`);

      await randomDelay(2000, 3000);
      await humanScroll(page, { distance: 'short' });
      await randomDelay(500, 1000);

      const html = await page.content();
      const contacts = extractContactsFromHtml(html, url);
      console.log(`  Emails: ${contacts.email_addresses.map((e) => e.email).join(', ') || '(none)'}`);
      console.log(`  Phones: ${contacts.phone_numbers.map((p) => p.number).join(', ') || '(none)'}`);

      // JSON-LD city
      // The callback below is executed inside the page, so it must stay
      // self-contained and cannot use helpers defined in this module.
      /* eslint-disable no-undef */
      const city = await page.evaluate(() => {
        const scripts = document.querySelectorAll('script[type="application/ld+json"]');
        for (const s of scripts) {
          try {
            const d = JSON.parse(s.textContent);
            if (d.address?.addressLocality) return d.address.addressLocality;
            if (d['@graph']) {
              for (const item of d['@graph']) {
                if (item.address?.addressLocality) return item.address.addressLocality;
              }
            }
          } catch { /* skip: this script tag is not usable JSON-LD, try the next one */ }
        }
        return null;
      });
      /* eslint-enable no-undef */
      console.log(`  City (JSON-LD): ${city || '(none)'}`);
    } catch (err) {
      console.error(`  Error: ${err.message}`);
    } finally {
      await page.close();
    }
  }
} finally {
  // Always release the browser, even if a page could not be opened.
  await close();
  console.log('\nDone.');
}