/ scripts / test-social-extraction.js
test-social-extraction.js
 1  #!/usr/bin/env node
 2  /**
 3   * Integration test: run social contact extractor against real social profile URLs
 4   * from existing contacts data.
 5   *
 6   * Usage: node scripts/test-social-extraction.js
 7   */
 8  
 9  import '../src/utils/load-env.js';
10  import { readdirSync, readFileSync } from 'fs';
11  import { join } from 'path';
12  import { extractFromSocialProfiles } from '../src/utils/social-contact-extractor.js';
13  import { launchStealthBrowser } from '../src/utils/stealth-browser.js';
14  
const contactsDir = join(import.meta.dirname, '../data/contacts');

// Upper bound on how many sample URLs are kept for each platform.
const SAMPLES_PER_PLATFORM = 3;

// Ordered URL matchers. The first rule that both matches the URL and still has
// room for another sample wins, which mirrors an if / else-if chain.
// `defaultLabel` is used when a profile entry carries no label of its own.
const PLATFORM_RULES = [
  {
    platform: 'youtube',
    defaultLabel: 'YouTube',
    matches: (url) => url.includes('youtube.com/channel'),
  },
  {
    platform: 'facebook',
    defaultLabel: 'Facebook',
    // URLs containing profile.php or /groups/ are not sampled.
    matches: (url) =>
      url.includes('facebook.com/') && !url.includes('profile.php') && !url.includes('/groups/'),
  },
  {
    platform: 'linkedin',
    defaultLabel: 'LinkedIn',
    matches: (url) => url.includes('linkedin.com/company'),
  },
  {
    platform: 'yelp',
    defaultLabel: 'Yelp',
    matches: (url) => url.includes('yelp.com/biz'),
  },
  {
    platform: 'instagram',
    defaultLabel: 'Instagram',
    // URLs containing /p/ or /reel/ are not sampled.
    matches: (url) =>
      url.includes('instagram.com/') && !url.includes('/p/') && !url.includes('/reel/'),
  },
];

// Collect sample URLs per platform
const samples = { youtube: [], facebook: [], linkedin: [], yelp: [], instagram: [] };

/**
 * Adds one social-profile entry to `samples` under the first platform whose
 * rule matches its URL and which has not yet reached SAMPLES_PER_PLATFORM.
 * Entries without a usable string URL are ignored individually, so a single
 * malformed entry no longer discards the remaining profiles of its file.
 *
 * @param {unknown} sp - Either a bare URL string or an object with a `url`
 *   (and optionally a `label`) property.
 * @param {string} file - Name of the contacts file the entry came from;
 *   stored as `siteId` on the sample.
 * @returns {void}
 */
function addSample(sp, file) {
  const url = typeof sp === 'string' ? sp : sp?.url;
  if (typeof url !== 'string' || url === '') return;

  const rule = PLATFORM_RULES.find(
    ({ platform, matches }) => samples[platform].length < SAMPLES_PER_PLATFORM && matches(url),
  );
  if (!rule) return;

  // `||` (not `??`) so an empty label also falls back to the platform default.
  samples[rule.platform].push({ url, label: sp.label || rule.defaultLabel, siteId: file });
}

const files = readdirSync(contactsDir).filter((name) => name.endsWith('.json'));
for (const file of files) {
  // Stop once we have enough samples for every platform.
  if (Object.values(samples).every((list) => list.length >= SAMPLES_PER_PLATFORM)) break;

  let data;
  try {
    data = JSON.parse(readFileSync(join(contactsDir, file), 'utf8'));
  } catch (err) {
    // Best effort: an unreadable or invalid file is reported and skipped.
    console.warn(`Skipping ${file}: ${err.message}`);
    continue;
  }

  // Tolerate files whose `social_profiles` is missing or not an array.
  const profiles = Array.isArray(data?.social_profiles) ? data.social_profiles : [];
  for (const sp of profiles) addSample(sp, file);
}
46  
// Summarize what was gathered: one line per platform with its sample count,
// followed by each sampled URL and the contacts file it came from.
console.log('Collected samples:');
Object.entries(samples).forEach(([name, entries]) => {
  console.log(`  ${name}: ${entries.length} URLs`);
  entries.forEach(({ url, siteId }) => {
    console.log(`    ${url} (from ${siteId})`);
  });
});
52  
// Test YouTube first (no browser needed): the extractor is called with `null`
// in place of a browser instance.
console.log('\n=== YOUTUBE (raw HTTP) ===');
for (const sp of samples.youtube) {
  console.log(`\nTesting: ${sp.url}`);
  // Contain failures per URL so one rejected extraction cannot abort the whole
  // run before the remaining URLs and the browser-driven platforms are tried.
  try {
    const result = await extractFromSocialProfiles([sp], 'test', null);
    console.log('Result:', JSON.stringify(result, null, 2));
  } catch (err) {
    console.error(`Error: ${err.message}`);
  }
}
60  
// The remaining platforms are exercised through a single shared browser
// instance, which is closed in `finally` regardless of how the run ends.
console.log('\n=== Launching stealth browser ===');
const browser = await launchStealthBrowser({ headless: true, stealthLevel: 'standard' });

try {
  const browserPlatforms = ['linkedin', 'facebook', 'yelp', 'instagram'];

  /**
   * Runs the extractor for one sampled profile with the shared browser and
   * prints either the JSON result or the error message; never rethrows.
   *
   * @param {{url: string}} profile - One entry from `samples`.
   * @returns {Promise<void>}
   */
  const reportExtraction = async (profile) => {
    console.log(`\nTesting: ${profile.url}`);
    try {
      const extracted = await extractFromSocialProfiles([profile], 'test', browser);
      console.log('Result:', JSON.stringify(extracted, null, 2));
    } catch (failure) {
      console.error(`Error: ${failure.message}`);
    }
  };

  for (const name of browserPlatforms) {
    const heading = name.toUpperCase();
    console.log(`\n=== ${heading} (Playwright stealth) ===`);
    // Profiles are processed strictly one at a time.
    for (const profile of samples[name]) {
      await reportExtraction(profile);
    }
  }
} finally {
  await browser.close();
  console.log('\nBrowser closed. Done.');
}