// scrape.js
/**
 * SERP Scraping Module
 * Uses ZenRows Google Search SERP API to fetch structured search results
 * Supports multi-country targeting with localized Google domains
 */

import axios from 'axios';
import Logger from './utils/logger.js';
import { retryWithBackoff, isRetryableError } from './utils/error-handler.js';
import { zenRowsBreaker } from './utils/circuit-breaker.js';
import { zenrowsLimiter } from './utils/rate-limiter.js';
import { getCountryByCode } from './config/countries.js';
import './utils/load-env.js';

const logger = new Logger('Scraper');

const DEFAULT_TIMEOUT = 180000; // 3 minutes - ZenRows recommended minimum
const DEFAULT_SLOW_TIMEOUT = 300000; // 5 minutes

// A malformed ZENROWS_SLOW_TIMEOUT (e.g. "5min") parses to NaN; fall back to
// the default rather than handing NaN to axios as the request timeout.
const parsedSlowTimeout = Number.parseInt(process.env.ZENROWS_SLOW_TIMEOUT || '', 10);
const SLOW_TIMEOUT = Number.isNaN(parsedSlowTimeout) ? DEFAULT_SLOW_TIMEOUT : parsedSlowTimeout;

// Comma-separated country codes (upper-cased here) that get SLOW_TIMEOUT
// instead of DEFAULT_TIMEOUT.
const SLOW_COUNTRIES = new Set(
  (process.env.ZENROWS_SLOW_COUNTRIES || '')
    .split(',')
    .map(c => c.trim().toUpperCase())
    .filter(Boolean)
);

/**
 * Scrape SERP for business listings using ZenRows Google Search API
 *
 * The request is scheduled through zenrowsLimiter, executed via
 * zenRowsBreaker.fire() and wrapped in retryWithBackoff (maxRetries: 3,
 * retry decision delegated to isRetryableError).
 *
 * @param {string} keyword - Search keyword (e.g., "plumber seattle")
 * @param {number} limit - Max number of URLs to extract (default 10)
 * @param {string} countryCode - ISO country code (e.g., 'AU', 'US', 'UK')
 * @returns {Promise<Object>} - {results: Array, metadata: Object}
 * @throws {Error} If the country cannot be resolved, ZENROWS_API_KEY is not
 *   set, or the API response contains zero organic results
 */
// eslint-disable-next-line require-await -- Wraps retryWithBackoff which handles async
export async function scrapeSERP(keyword, limit = 10, countryCode = 'AU') {
  const country = getCountryByCode(countryCode);
  // NOTE(review): whether getCountryByCode falls back or throws for unknown
  // codes is not visible here; guard so a falsy result gives a clear message
  // instead of a TypeError on the property reads below.
  if (!country) {
    throw new Error(`Unknown country code: ${countryCode}`);
  }

  logger.info(
    `Scraping SERP for keyword: "${keyword}" (limit: ${limit}, country: ${country.name})`
  );

  const apiKey = process.env.ZENROWS_API_KEY;
  if (!apiKey) {
    throw new Error('ZENROWS_API_KEY not found in environment variables');
  }

  // The search query is a path segment of the endpoint, so it must be URL-encoded.
  const encodedQuery = encodeURIComponent(keyword);
  const apiEndpoint = `https://serp.api.zenrows.com/v1/targets/google/search/${encodedQuery}`;

  return retryWithBackoff(
    // eslint-disable-next-line require-await -- Wrapper for circuit breaker fire()
    async () => {
      // Wrap the API call with circuit breaker and rate limiter
      return zenRowsBreaker.fire(async () => {
        // Build params with optional geo-targeting
        const params = {
          apikey: apiKey,
          // Add geo-targeting for non-US countries if premium proxy enabled
          ...(process.env.ZENROWS_PREMIUM === 'true' && country.code !== 'US'
            ? {
                premium_proxy: 'true',
                proxy_country: country.code.toLowerCase(),
              }
            : {}),
        };

        // Log a redacted copy: the API key is a secret and must not reach the logs.
        logger.debug(`ZenRows params:`, { ...params, apikey: '[REDACTED]' });

        const isSlow = SLOW_COUNTRIES.has(country.code);
        const timeout = isSlow ? SLOW_TIMEOUT : DEFAULT_TIMEOUT;
        if (isSlow) {
          logger.debug(
            `Using extended timeout (${timeout / 1000}s) for slow country: ${country.code}`
          );
        }

        const response = await zenrowsLimiter.schedule(() =>
          axios({
            method: 'GET',
            url: apiEndpoint,
            params,
            timeout,
          })
        );

        const { data } = response;
        logger.info(`Received structured SERP data from ZenRows (${country.googleDomain})`);

        // Extract organic results from the API response. A null body or a
        // non-array field is treated the same as "no results".
        const organicResults =
          data && Array.isArray(data.organic_results) ? data.organic_results : [];

        if (organicResults.length === 0) {
          logger.error('No organic results found in API response');
          throw new Error('Empty SERP response: zero organic results');
        }

        // Take only the requested number of results
        const limitedResults = organicResults.slice(0, limit);

        // Transform to our format
        const results = limitedResults.map(result => ({
          url: result.link,
          business_name: extractBusinessName(result.link, result.title),
          serp_contacts: null,
          source: 'ZenRows SERP API',
          keyword,
          title: result.title,
          snippet: result.snippet || null,
        }));

        // Include country metadata with results
        const metadata = {
          countryCode: country.code,
          googleDomain: country.googleDomain,
          language: country.language,
          currency: country.currency,
          currencySymbol: country.currencySymbol,
        };

        logger.success(`Found ${results.length} results from ZenRows SERP API (${country.name})`);

        return { results, metadata };
      });
    },
    {
      maxRetries: 3,
      shouldRetry: isRetryableError,
      onRetry: attempt => {
        logger.warn(`Retrying ZenRows SERP API request (attempt ${attempt + 1})...`);
      },
    }
  );
}

/**
 * Extract business name from URL and title
 *
 * Prefers the title with trailing separators/parentheticals stripped; if that
 * leaves nothing, derives a name from the URL (Yelp "/biz/<slug>" path, else
 * the first label of the hostname). If the URL cannot be parsed, the raw
 * `url` value is returned unchanged.
 *
 * @param {string} url - Result URL
 * @param {string} [title] - Result title
 * @returns {string} Best-effort business name
 */
function extractBusinessName(url, title) {
  try {
    // Try to use title first if available
    if (title) {
      const cleanTitle = title
        // Pipes and en/em dashes are treated as separators with or without spaces
        .replace(/\s*[|–—].*/g, '')
        // A plain hyphen is a separator only when it stands alone ("Acme - Seattle");
        // hyphens inside words ("Smith-Jones Plumbing") are part of the name.
        .replace(/(?:^|\s+)-(?=\s|$).*/g, '')
        // Remove parentheticals
        .replace(/\s*\(.*/g, '')
        .trim();

      if (cleanTitle) {
        return cleanTitle;
      }
    }

    // Fallback to URL parsing
    const urlObj = new URL(url);
    const domain = urlObj.hostname.replace(/^www\./, '');

    // For Yelp URLs, extract from path
    if (domain.includes('yelp.com')) {
      const match = url.match(/\/biz\/([^/?]+)/);
      if (match) {
        return match[1].replace(/-/g, ' ');
      }
    }

    // Otherwise use domain
    return domain.split('.')[0];
  } catch {
    // Best-effort: an unparseable URL is returned as-is rather than failing the scrape
    return url;
  }
}

export default {
  scrapeSERP,
};