scrape.test.js
/**
 * Tests for SERP Scraping Module
 *
 * Uses Node.js 22+ mock.module() to mock axios and the circuit-breaker module
 * so that scrapeSERP() can be exercised without real network access.
 *
 * Notes:
 * - mock.module() must be called before the module under test is first
 *   imported, which is why scrape.js is loaded with a dynamic import() below.
 * - mock.module() is an experimental test-runner API; on Node.js 22 it has to
 *   be enabled with the --experimental-test-module-mocks CLI flag.
 * - The module mocks persist for all tests in this file; only the axios mock's
 *   call history is reset between tests, so every test that reaches axios
 *   installs its own implementation first.
 */

import { describe, test, mock, beforeEach, afterEach, before } from 'node:test';
import assert from 'node:assert';

// Mock axios function that is reconfigured per test via mockImplementation().
const axiosMockFn = mock.fn();

// Mock circuit breaker that simply invokes the wrapped function and never opens.
const mockZenRowsBreaker = {
  fire: async fn => fn(),
  opened: false,
  halfOpen: false,
};

// Mock axios module BEFORE importing scrape.js
mock.module('axios', {
  defaultExport: axiosMockFn,
});

// Mock circuit-breaker module BEFORE importing scrape.js
mock.module('../../src/utils/circuit-breaker.js', {
  namedExports: {
    zenRowsBreaker: mockZenRowsBreaker,
  },
});

// Now import scrape.js - it will use the mocked modules
const { scrapeSERP } = await import('../../src/scrape.js');

describe('Scrape Module', () => {
  let originalApiKey;

  beforeEach(() => {
    originalApiKey = process.env.ZENROWS_API_KEY;
    process.env.ZENROWS_API_KEY = 'test-api-key';
    // Reset recorded calls between tests (the implementation is set per test).
    axiosMockFn.mock.resetCalls();
  });

  afterEach(() => {
    // Compare against undefined rather than relying on truthiness so that an
    // originally-empty value is restored instead of being deleted.
    if (originalApiKey !== undefined) {
      process.env.ZENROWS_API_KEY = originalApiKey;
    } else {
      delete process.env.ZENROWS_API_KEY;
    }
  });

  describe('scrapeSERP - Success Cases', () => {
    test('should scrape SERP and return formatted results', async () => {
      const mockResponse = {
        data: {
          organic_results: [
            {
              link: 'https://example.com',
              title: 'Example Plumbing - Seattle',
              snippet: 'Best plumber in Seattle',
            },
            {
              link: 'https://acme-plumbing.com',
              title: 'Acme Plumbing Services',
              snippet: 'Professional plumbing',
            },
          ],
        },
      };

      // Configure mock for this test
      axiosMockFn.mock.mockImplementation(async () => mockResponse);

      const result = await scrapeSERP('plumber seattle', 10, 'US');

      assert.ok(result.results, 'Should have results');
      assert.strictEqual(result.results.length, 2);
      assert.strictEqual(result.results[0].url, 'https://example.com');
      assert.strictEqual(result.results[0].business_name, 'Example Plumbing');
      assert.strictEqual(result.results[0].keyword, 'plumber seattle');
      assert.strictEqual(result.results[0].source, 'ZenRows SERP API');

      // Verify metadata
      assert.ok(result.metadata, 'Should have metadata');
      assert.strictEqual(result.metadata.countryCode, 'US');
    });

    test('should limit results to specified number', async () => {
      const mockResponse = {
        data: {
          organic_results: Array.from({ length: 20 }, (_, i) => ({
            link: `https://example${i}.com`,
            title: `Business ${i}`,
            snippet: `Description ${i}`,
          })),
        },
      };

      axiosMockFn.mock.mockImplementation(async () => mockResponse);

      const result = await scrapeSERP('plumber seattle', 5, 'US');

      assert.strictEqual(result.results.length, 5, 'Should limit to 5 results');
      assert.strictEqual(result.results[0].url, 'https://example0.com');
      assert.strictEqual(result.results[4].url, 'https://example4.com');
    });

    test('should include keyword in all results', async () => {
      const keyword = 'electrician portland';
      const mockResponse = {
        data: {
          organic_results: [
            {
              link: 'https://example.com',
              title: 'Example Electrician',
              snippet: 'Test',
            },
          ],
        },
      };

      axiosMockFn.mock.mockImplementation(async () => mockResponse);

      const result = await scrapeSERP(keyword, 10, 'US');

      result.results.forEach(r => {
        assert.strictEqual(r.keyword, keyword);
      });
    });

    test('should handle missing snippet gracefully', async () => {
      const mockResponse = {
        data: {
          organic_results: [
            {
              link: 'https://example.com',
              title: 'Example Business',
              // No snippet
            },
          ],
        },
      };

      axiosMockFn.mock.mockImplementation(async () => mockResponse);

      const result = await scrapeSERP('test keyword', 10, 'US');

      assert.strictEqual(result.results[0].snippet, null);
    });

    test('should work with different country codes', async () => {
      const mockResponse = {
        data: {
          organic_results: [
            {
              link: 'https://example.com.au',
              title: 'Australian Business',
              snippet: 'Test',
            },
          ],
        },
      };

      axiosMockFn.mock.mockImplementation(async () => mockResponse);

      const result = await scrapeSERP('plumber sydney', 10, 'AU');

      assert.strictEqual(result.metadata.countryCode, 'AU');
      assert.strictEqual(result.metadata.googleDomain, 'google.com.au');
      assert.strictEqual(result.metadata.currency, 'AUD');
    });
  });

  describe('scrapeSERP - Error Cases', () => {
    test('should throw error when ZENROWS_API_KEY is missing', async () => {
      // The beforeEach/afterEach hooks re-establish the variable for the
      // following tests, so no manual restore is needed here.
      delete process.env.ZENROWS_API_KEY;

      axiosMockFn.mock.mockImplementation(async () => ({}));

      await assert.rejects(
        async () => {
          await scrapeSERP('plumber seattle');
        },
        { message: 'ZENROWS_API_KEY not found in environment variables' }
      );
    });

    test('should throw error when no organic results found', async () => {
      const mockResponse = {
        data: {
          organic_results: [],
        },
      };

      axiosMockFn.mock.mockImplementation(async () => mockResponse);

      await assert.rejects(
        async () => {
          await scrapeSERP('plumber seattle', 10, 'US');
        },
        { message: /zero organic results/ }
      );
    });

    test('should handle API errors with retry logic', async () => {
      // Circuit breaker is already mocked at module level to pass through
      let attemptCount = 0;
      const mockResponse = {
        status: 200,
        data: {
          organic_results: [
            {
              link: 'https://example.com',
              title: 'Test Result',
              snippet: 'Test description',
            },
          ],
        },
      };

      axiosMockFn.mock.mockImplementation(async () => {
        attemptCount += 1;
        // maxRetries=3 means 4 total attempts (0,1,2,3), so fail first 3 and succeed on 4th
        // Use a retryable error (ECONNRESET matches isRetryableError check)
        if (attemptCount < 4) {
          throw new Error('ECONNRESET: connection reset by peer');
        }
        return mockResponse;
      });

      const keyword = 'test keyword';
      // scrapeSERP(keyword, limit, countryCode) - pass limit and countryCode correctly
      const result = await scrapeSERP(keyword, 10, 'US');

      assert.strictEqual(attemptCount, 4, 'Should retry 3 times before succeeding on 4th attempt');
      // scrapeSERP returns { results, metadata } not a plain array
      assert.strictEqual(result.results.length, 1);
      assert.strictEqual(result.results[0].title, 'Test Result');
    });

    test('should throw error after max retries exceeded', async () => {
      // Circuit breaker is already mocked at module level to pass through
      axiosMockFn.mock.mockImplementation(async () => {
        throw new Error('Persistent API error');
      });

      const keyword = 'test keyword';
      // Call as scrapeSERP(keyword, limit, countryCode), like every other test
      // in this file, so that 'US' is not passed in the limit position.
      await assert.rejects(async () => scrapeSERP(keyword, 10, 'US'), {
        message: /Persistent API error|Max retries exceeded/,
      });
    });
  });

  describe('URL Encoding', () => {
    test('should encode keyword properly in API request', () => {
      const keyword = 'plumber & electrician seattle';
      const encoded = encodeURIComponent(keyword);
      assert.strictEqual(encoded, 'plumber%20%26%20electrician%20seattle');
    });

    test('should handle special characters in keywords', () => {
      const testCases = [
        { input: 'café repair', expected: 'caf%C3%A9%20repair' },
        { input: '24/7 service', expected: '24%2F7%20service' },
        { input: 'plumber (emergency)', expected: 'plumber%20(emergency)' },
      ];

      for (const { input, expected } of testCases) {
        assert.strictEqual(encodeURIComponent(input), expected);
      }
    });
  });

  describe('Business Name Extraction', () => {
    test('should prioritize title over URL', () => {
      const title = 'Professional Plumbing Co';
      const url = 'https://example.com';

      // Title should be used when available
      const businessName = title
        .replace(/\s*[-|–—]\s*.*/g, '')
        .replace(/\s*\|.*/g, '')
        .replace(/\s*\(.*/g, '')
        .trim();

      // Name that would result from falling back to the URL's first host label.
      const urlFallbackName = new URL(url).hostname.replace(/^www\./, '').split('.')[0];

      assert.strictEqual(businessName, 'Professional Plumbing Co');
      assert.strictEqual(urlFallbackName, 'example');
      assert.notStrictEqual(businessName, urlFallbackName);
    });

    test('should clean title of common suffixes', () => {
      const titles = [
        'Company Name - About Us',
        'Company Name | Services',
        'Company Name – Location',
        'Company Name — Description',
      ];

      for (const title of titles) {
        const cleaned = title.replace(/\s*[-|–—]\s*.*/g, '').trim();
        assert.strictEqual(cleaned, 'Company Name');
      }
    });

    test('should remove parenthetical content from title', () => {
      const title = 'Company Name (Established 1990)';
      const cleaned = title.replace(/\s*\(.*/g, '').trim();
      assert.strictEqual(cleaned, 'Company Name');
    });

    test('should extract business name from Yelp URLs', () => {
      const url = 'https://www.yelp.com/biz/acme-plumbing-seattle';
      const match = url.match(/\/biz\/([^/?]+)/);
      assert.ok(match);
      assert.strictEqual(match[1], 'acme-plumbing-seattle');
      const businessName = match[1].replace(/-/g, ' ');
      assert.strictEqual(businessName, 'acme plumbing seattle');
    });

    test('should extract domain from regular URLs', () => {
      const url = 'https://www.acmeplumbing.com/services';
      const urlObj = new URL(url);
      const domain = urlObj.hostname.replace(/^www\./, '');
      assert.strictEqual(domain, 'acmeplumbing.com');
      const businessName = domain.split('.')[0];
      assert.strictEqual(businessName, 'acmeplumbing');
    });

    test('should handle URLs without www prefix', () => {
      const url = 'https://example.com';
      const urlObj = new URL(url);
      const domain = urlObj.hostname.replace(/^www\./, '');
      assert.strictEqual(domain, 'example.com');
    });

    test('should handle malformed URLs gracefully', () => {
      const invalidUrl = 'not a valid url';
      assert.throws(() => {
        new URL(invalidUrl);
      }, Error);
    });

    test('should handle multiple cleaning rules', () => {
      const title = 'Company Name (Location) - Services | About';
      let cleaned = title.replace(/\s*\(.*/g, ''); // Remove parentheses first
      cleaned = cleaned.replace(/\s*[-|–—]\s*.*/g, '').trim();
      assert.strictEqual(cleaned, 'Company Name');
    });
  });

  describe('URL Parsing', () => {
    test('should handle various Yelp URL formats', () => {
      const urls = [
        'https://www.yelp.com/biz/acme-plumbing-seattle',
        'https://yelp.com/biz/best-electrician-portland?start=20',
        'https://www.yelp.com/biz/top-contractor-boston#reviews',
      ];

      for (const url of urls) {
        const match = url.match(/\/biz\/([^/?#]+)/);
        assert.ok(match, `Should match ${url}`);
        assert.ok(match[1].length > 0, `Should extract business slug from ${url}`);
      }
    });

    test('should not match non-Yelp URLs with /biz/ path', () => {
      // '/business/' starts with 'biz'-like text but is not a '/biz/' segment,
      // so the pattern must not match it.
      const url = 'https://example.com/business/info';
      const match = url.match(/\/biz\/([^/?]+)/);
      assert.strictEqual(match, null);
    });

    test('should extract top-level domain', () => {
      const urls = [
        { url: 'https://acme.com', expected: 'acme' },
        { url: 'https://best-plumbing.net', expected: 'best-plumbing' },
        { url: 'https://super.plumber.co.uk', expected: 'super' },
      ];

      for (const { url, expected } of urls) {
        const urlObj = new URL(url);
        const domain = urlObj.hostname.replace(/^www\./, '');
        const businessName = domain.split('.')[0];
        assert.strictEqual(businessName, expected);
      }
    });
  });

  describe('Result Formatting', () => {
    test('should include all required fields in result', () => {
      const mockResult = {
        link: 'https://example.com',
        title: 'Example Business',
        snippet: 'Test snippet',
      };

      const formatted = {
        url: mockResult.link,
        business_name: mockResult.title,
        serp_contacts: null,
        source: 'ZenRows SERP API',
        keyword: 'test keyword',
        title: mockResult.title,
        snippet: mockResult.snippet || null,
      };

      assert.ok(formatted.url);
      assert.ok(formatted.business_name);
      assert.strictEqual(formatted.serp_contacts, null);
      assert.strictEqual(formatted.source, 'ZenRows SERP API');
      assert.ok(formatted.keyword);
      assert.ok(formatted.title);
    });

    test('should handle missing snippet gracefully', () => {
      const mockResult = {
        link: 'https://example.com',
        title: 'Example Business',
        // No snippet
      };

      const formatted = {
        url: mockResult.link,
        business_name: mockResult.title,
        serp_contacts: null,
        source: 'ZenRows SERP API',
        keyword: 'test keyword',
        title: mockResult.title,
        snippet: mockResult.snippet || null,
      };

      assert.strictEqual(formatted.snippet, null);
    });
  });
});