/ src / scrape.js
scrape.js
  1  /**
  2   * SERP Scraping Module
  3   * Uses ZenRows Google Search SERP API to fetch structured search results
  4   * Supports multi-country targeting with localized Google domains
  5   */
  6  
  7  import axios from 'axios';
  8  import Logger from './utils/logger.js';
  9  import { retryWithBackoff, isRetryableError } from './utils/error-handler.js';
 10  import { zenRowsBreaker } from './utils/circuit-breaker.js';
 11  import { zenrowsLimiter } from './utils/rate-limiter.js';
 12  import { getCountryByCode } from './config/countries.js';
 13  import './utils/load-env.js';
 14  
// Module-scoped logger shared by every function in this file. 'Scraper' is
// passed to the Logger constructor (presumably the label shown on each log
// line — confirm in ./utils/logger.js).
const logger = new Logger('Scraper');
 16  
 17  const DEFAULT_TIMEOUT = 180000; // 3 minutes - ZenRows recommended minimum
 18  const SLOW_TIMEOUT = parseInt(process.env.ZENROWS_SLOW_TIMEOUT || '300000', 10); // 5 minutes default
 19  const SLOW_COUNTRIES = new Set(
 20    (process.env.ZENROWS_SLOW_COUNTRIES || '')
 21      .split(',')
 22      .map(c => c.trim().toUpperCase())
 23      .filter(Boolean)
 24  );
 25  
 26  /**
 27   * Scrape SERP for business listings using ZenRows Google Search API
 28   * @param {string} keyword - Search keyword (e.g., "plumber seattle")
 29   * @param {number} limit - Max number of URLs to extract (default 10)
 30   * @param {string} countryCode - ISO country code (e.g., 'AU', 'US', 'UK')
 31   * @returns {Promise<Object>} - {results: Array, metadata: Object}
 32   */
 33  // eslint-disable-next-line require-await -- Wraps retryWithBackoff which handles async
 34  export async function scrapeSERP(keyword, limit = 10, countryCode = 'AU') {
 35    const country = getCountryByCode(countryCode);
 36  
 37    logger.info(
 38      `Scraping SERP for keyword: "${keyword}" (limit: ${limit}, country: ${country.name})`
 39    );
 40  
 41    const apiKey = process.env.ZENROWS_API_KEY;
 42    if (!apiKey) {
 43      throw new Error('ZENROWS_API_KEY not found in environment variables');
 44    }
 45  
 46    const encodedQuery = encodeURIComponent(keyword);
 47    const apiEndpoint = `https://serp.api.zenrows.com/v1/targets/google/search/${encodedQuery}`;
 48  
 49    return retryWithBackoff(
 50      // eslint-disable-next-line require-await -- Wrapper for circuit breaker fire()
 51      async () => {
 52        // Wrap the API call with circuit breaker and rate limiter
 53        return zenRowsBreaker.fire(async () => {
 54          // Build params with optional geo-targeting
 55          const params = {
 56            apikey: apiKey,
 57            // Add geo-targeting for non-US countries if premium proxy enabled
 58            ...(process.env.ZENROWS_PREMIUM === 'true' && country.code !== 'US'
 59              ? {
 60                  premium_proxy: 'true',
 61                  proxy_country: country.code.toLowerCase(),
 62                }
 63              : {}),
 64          };
 65  
 66          logger.debug(`ZenRows params:`, params);
 67  
 68          const isSlow = SLOW_COUNTRIES.has(country.code);
 69          const timeout = isSlow ? SLOW_TIMEOUT : DEFAULT_TIMEOUT;
 70          if (isSlow) {
 71            logger.debug(
 72              `Using extended timeout (${timeout / 1000}s) for slow country: ${country.code}`
 73            );
 74          }
 75  
 76          const response = await zenrowsLimiter.schedule(() =>
 77            axios({
 78              method: 'GET',
 79              url: apiEndpoint,
 80              params,
 81              timeout,
 82            })
 83          );
 84  
 85          const { data } = response;
 86          logger.info(`Received structured SERP data from ZenRows (${country.googleDomain})`);
 87  
 88          // Extract organic results from the API response
 89          const organicResults = data.organic_results || [];
 90  
 91          if (organicResults.length === 0) {
 92            logger.error('No organic results found in API response');
 93            throw new Error('Empty SERP response: zero organic results');
 94          }
 95  
 96          // Take only the requested number of results
 97          const limitedResults = organicResults.slice(0, limit);
 98  
 99          // Transform to our format
100          const results = limitedResults.map(result => ({
101            url: result.link,
102            business_name: extractBusinessName(result.link, result.title),
103            serp_contacts: null,
104            source: 'ZenRows SERP API',
105            keyword,
106            title: result.title,
107            snippet: result.snippet || null,
108          }));
109  
110          // Include country metadata with results
111          const metadata = {
112            countryCode: country.code,
113            googleDomain: country.googleDomain,
114            language: country.language,
115            currency: country.currency,
116            currencySymbol: country.currencySymbol,
117          };
118  
119          logger.success(`Found ${results.length} results from ZenRows SERP API (${country.name})`);
120  
121          return { results, metadata };
122        });
123      },
124      {
125        maxRetries: 3,
126        shouldRetry: isRetryableError,
127        onRetry: attempt => {
128          logger.warn(`Retrying ZenRows SERP API request (attempt ${attempt + 1})...`);
129        },
130      }
131    );
132  }
133  
/**
 * Derive a business name for a search result, preferring its title and
 * falling back to its URL.
 *
 * Title handling: everything from the first separator onward is dropped, then
 * anything from the first "(" onward. A separator is a pipe, an en/em dash, or
 * a hyphen with whitespace on at least one side — a hyphen inside a word
 * ("Roto-Rooter", "24-Hour") is part of the name and is kept.
 *
 * URL fallback (used when the title is missing or cleans down to nothing):
 * for hosts containing "yelp.com" the `/biz/<slug>` path segment is used with
 * hyphens turned into spaces; otherwise the first label of the hostname
 * (after stripping a leading "www.") is returned.
 *
 * @param {string} url - Result URL.
 * @param {string} [title] - Result title, if any.
 * @returns {string} The derived name; `url` is returned unchanged if anything throws
 *   (for example when `url` cannot be parsed).
 */
function extractBusinessName(url, title) {
  try {
    // Try to use title first if available
    if (title) {
      const cleanTitle = title
        // Cut at the first suffix separator (see the rules in the doc comment).
        .replace(/(?:\s+-|-\s|\s*[|–—]).*/g, '')
        .replace(/\s*\(.*/g, '') // Remove parentheticals
        .trim();

      if (cleanTitle) {
        return cleanTitle;
      }
    }

    // Fallback to URL parsing
    const { hostname } = new URL(url);
    const domain = hostname.replace(/^www\./, '');

    // For Yelp URLs, extract from path
    if (domain.includes('yelp.com')) {
      const match = url.match(/\/biz\/([^/?]+)/);
      if (match) {
        return match[1].replace(/-/g, ' ');
      }
    }

    // Otherwise use domain
    return domain.split('.')[0];
  } catch {
    // Best effort: an unparsable URL (or non-string title) yields the raw URL.
    return url;
  }
}
171  
// Default export mirrors the named export so both import styles work.
export default { scrapeSERP };