/ scripts / test-yelp-nopecha.js
test-yelp-nopecha.js
 1  #!/usr/bin/env node
 2  /**
 3   * Test Yelp extraction with launchWithExtensions (nopecha CAPTCHA solving).
 4   *
 5   * Uses launchWithExtensions directly — returns a BrowserContext with nopecha loaded.
 6   * The Yelp extractor uses createStealthContext(browser), so we bypass it here
 7   * and drive the context/page directly, mirroring exactly what extractFromYelp does.
 8   *
 9   * Usage: node scripts/test-yelp-nopecha.js
10   */
11  
12  import '../src/utils/load-env.js';
13  import { readdirSync, readFileSync } from 'fs';
14  import { join } from 'path';
15  import {
16    launchWithExtensions,
17    waitForCloudflare,
18    humanScroll,
19    randomDelay,
20  } from '../src/utils/stealth-browser.js';
21  import { extractContactsFromHtml } from '../src/utils/html-contact-extractor.js';
22  
23  const contactsDir = join(import.meta.dirname, '../data/contacts');
24  
25  // Collect up to 3 Yelp URLs
26  const yelpUrls = [];
27  for (const file of readdirSync(contactsDir).filter(f => f.endsWith('.json'))) {
28    if (yelpUrls.length >= 3) break;
29    try {
30      const data = JSON.parse(readFileSync(join(contactsDir, file), 'utf8'));
31      for (const sp of data.social_profiles || []) {
32        const url = sp.url || sp;
33        if (url?.includes('yelp.com/biz')) {
34          yelpUrls.push({ url, siteId: file });
35          break;
36        }
37      }
38    } catch { /* skip */ }
39  }
40  
41  console.log(`Found ${yelpUrls.length} Yelp URLs:`);
42  for (const u of yelpUrls) console.log(`  ${u.url} (${u.siteId})`);
43  
// Start one browser session that is shared by every URL tested below.
console.log('\nLaunching browser with extensions (nopecha)...');
const launchOptions = { headless: true, stealthLevel: 'aggressive' };
const session = await launchWithExtensions(launchOptions);

// `context` opens pages and `close` tears the session down.
// `hasNopeCHA` presumably reports whether the NopeCHA extension was loaded — confirm in stealth-browser.js.
const { context, close, hasNopeCHA } = session;
console.log(`hasNopeCHA: ${hasNopeCHA}`);
50  
/* eslint-disable no-undef */
/**
 * Find the first `address.addressLocality` in the page's JSON-LD blocks.
 *
 * Passed to `page.evaluate`, so it executes in the browser page and must stay
 * self-contained: it may only use page globals such as `document`.
 *
 * Looks at the top-level object of each `application/ld+json` script and at
 * the items of its `@graph`, if present. Blocks that fail to parse (or whose
 * shape makes the lookup throw) are skipped.
 *
 * @returns {string | null} The locality, or null when none is found.
 */
const readJsonLdCity = () => {
  const scripts = document.querySelectorAll('script[type="application/ld+json"]');
  for (const s of scripts) {
    try {
      const d = JSON.parse(s.textContent);
      if (d.address?.addressLocality) return d.address.addressLocality;
      if (d['@graph']) {
        for (const item of d['@graph']) {
          if (item.address?.addressLocality) return item.address.addressLocality;
        }
      }
    } catch { /* skip */ }
  }
  return null;
};
/* eslint-enable no-undef */

// Visit each collected Yelp URL in its own page and print what was extracted.
// The outer `finally` closes the browser session even if a page cannot be opened.
try {
  for (const { url } of yelpUrls) {
    console.log(`\n--- Testing: ${url} ---`);
    const page = await context.newPage();
    try {
      const res = await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 });
      console.log(`  HTTP status: ${res?.status()}`);

      // The result is only logged; extraction is attempted either way.
      const cfResolved = await waitForCloudflare(page, { timeout: 45000 });
      console.log(`  Cloudflare resolved: ${cfResolved}`);

      // Pause and scroll before reading the page content.
      await randomDelay(2000, 3000);
      await humanScroll(page, { distance: 'short' });
      await randomDelay(500, 1000);

      const html = await page.content();
      const contacts = extractContactsFromHtml(html, url);
      console.log(`  Emails: ${contacts.email_addresses.map(e => e.email).join(', ') || '(none)'}`);
      console.log(`  Phones: ${contacts.phone_numbers.map(p => p.number).join(', ') || '(none)'}`);

      // JSON-LD city
      const city = await page.evaluate(readJsonLdCity);
      console.log(`  City (JSON-LD): ${city || '(none)'}`);
    } catch (err) {
      // A failure on one URL is reported and the run moves on to the next.
      console.error(`  Error: ${err.message}`);
    } finally {
      await page.close();
    }
  }
} finally {
  await close();
  console.log('\nDone.');
}