/ tests / pipeline / scrape.test.js
scrape.test.js
/**
 * Tests for SERP Scraping Module
 *
 * Uses mock.module() from node:test (Node.js 22.3+, enabled with the
 * --experimental-test-module-mocks flag) to mock axios and the circuit-breaker
 * module for comprehensive testing of API interactions, error handling,
 * circuit breaker, and rate limiting.
 *
 * Note: mock.module() must be called before the module under test is first imported.
 * The mocks persist for all tests in this file.
 */
 10  
 11  import { describe, test, mock, beforeEach, afterEach, before } from 'node:test';
 12  import assert from 'node:assert';
 13  
 14  // Create a mock axios function that can be reconfigured per test
 15  const axiosMockFn = mock.fn();
 16  
 17  // Create mock circuit breaker that passes through without opening
 18  const mockZenRowsBreaker = {
 19    fire: async fn => fn(),
 20    opened: false,
 21    halfOpen: false,
 22  };
 23  
 24  // Mock axios module BEFORE importing scrape.js
 25  mock.module('axios', {
 26    defaultExport: axiosMockFn,
 27  });
 28  
 29  // Mock circuit-breaker module BEFORE importing scrape.js
 30  mock.module('../../src/utils/circuit-breaker.js', {
 31    namedExports: {
 32      zenRowsBreaker: mockZenRowsBreaker,
 33    },
 34  });
 35  
 36  // Now import scrape.js - it will use the mocked modules
 37  const { scrapeSERP } = await import('../../src/scrape.js');
 38  
 39  describe('Scrape Module', () => {
 40    let originalApiKey;
 41  
 42    beforeEach(() => {
 43      originalApiKey = process.env.ZENROWS_API_KEY;
 44      process.env.ZENROWS_API_KEY = 'test-api-key';
 45      // Reset mock between tests
 46      axiosMockFn.mock.resetCalls();
 47    });
 48  
 49    afterEach(() => {
 50      if (originalApiKey) {
 51        process.env.ZENROWS_API_KEY = originalApiKey;
 52      } else {
 53        delete process.env.ZENROWS_API_KEY;
 54      }
 55    });
 56  
 57    describe('scrapeSERP - Success Cases', () => {
 58      test('should scrape SERP and return formatted results', async () => {
 59        const mockResponse = {
 60          data: {
 61            organic_results: [
 62              {
 63                link: 'https://example.com',
 64                title: 'Example Plumbing - Seattle',
 65                snippet: 'Best plumber in Seattle',
 66              },
 67              {
 68                link: 'https://acme-plumbing.com',
 69                title: 'Acme Plumbing Services',
 70                snippet: 'Professional plumbing',
 71              },
 72            ],
 73          },
 74        };
 75  
 76        // Configure mock for this test
 77        axiosMockFn.mock.mockImplementation(async () => mockResponse);
 78  
 79        const result = await scrapeSERP('plumber seattle', 10, 'US');
 80  
 81        assert.ok(result.results, 'Should have results');
 82        assert.strictEqual(result.results.length, 2);
 83        assert.strictEqual(result.results[0].url, 'https://example.com');
 84        assert.strictEqual(result.results[0].business_name, 'Example Plumbing');
 85        assert.strictEqual(result.results[0].keyword, 'plumber seattle');
 86        assert.strictEqual(result.results[0].source, 'ZenRows SERP API');
 87  
 88        // Verify metadata
 89        assert.ok(result.metadata, 'Should have metadata');
 90        assert.strictEqual(result.metadata.countryCode, 'US');
 91      });
 92  
 93      test('should limit results to specified number', async () => {
 94        const mockResponse = {
 95          data: {
 96            organic_results: Array.from({ length: 20 }, (_, i) => ({
 97              link: `https://example${i}.com`,
 98              title: `Business ${i}`,
 99              snippet: `Description ${i}`,
100            })),
101          },
102        };
103  
104        axiosMockFn.mock.mockImplementation(async () => mockResponse);
105  
106        const result = await scrapeSERP('plumber seattle', 5, 'US');
107  
108        assert.strictEqual(result.results.length, 5, 'Should limit to 5 results');
109        assert.strictEqual(result.results[0].url, 'https://example0.com');
110        assert.strictEqual(result.results[4].url, 'https://example4.com');
111      });
112  
113      test('should include keyword in all results', async () => {
114        const keyword = 'electrician portland';
115        const mockResponse = {
116          data: {
117            organic_results: [
118              {
119                link: 'https://example.com',
120                title: 'Example Electrician',
121                snippet: 'Test',
122              },
123            ],
124          },
125        };
126  
127        axiosMockFn.mock.mockImplementation(async () => mockResponse);
128  
129        const result = await scrapeSERP(keyword, 10, 'US');
130  
131        result.results.forEach(r => {
132          assert.strictEqual(r.keyword, keyword);
133        });
134      });
135  
136      test('should handle missing snippet gracefully', async () => {
137        const mockResponse = {
138          data: {
139            organic_results: [
140              {
141                link: 'https://example.com',
142                title: 'Example Business',
143                // No snippet
144              },
145            ],
146          },
147        };
148  
149        axiosMockFn.mock.mockImplementation(async () => mockResponse);
150  
151        const result = await scrapeSERP('test keyword', 10, 'US');
152  
153        assert.strictEqual(result.results[0].snippet, null);
154      });
155  
156      test('should work with different country codes', async () => {
157        const mockResponse = {
158          data: {
159            organic_results: [
160              {
161                link: 'https://example.com.au',
162                title: 'Australian Business',
163                snippet: 'Test',
164              },
165            ],
166          },
167        };
168  
169        axiosMockFn.mock.mockImplementation(async () => mockResponse);
170  
171        const result = await scrapeSERP('plumber sydney', 10, 'AU');
172  
173        assert.strictEqual(result.metadata.countryCode, 'AU');
174        assert.strictEqual(result.metadata.googleDomain, 'google.com.au');
175        assert.strictEqual(result.metadata.currency, 'AUD');
176      });
177    });
178  
179    describe('scrapeSERP - Error Cases', () => {
180      test('should throw error when ZENROWS_API_KEY is missing', async () => {
181        delete process.env.ZENROWS_API_KEY;
182  
183        axiosMockFn.mock.mockImplementation(async () => ({}));
184  
185        await assert.rejects(
186          async () => {
187            await scrapeSERP('plumber seattle');
188          },
189          { message: 'ZENROWS_API_KEY not found in environment variables' }
190        );
191  
192        // Restore for other tests
193        process.env.ZENROWS_API_KEY = 'test-api-key';
194      });
195  
196      test('should throw error when no organic results found', async () => {
197        const mockResponse = {
198          data: {
199            organic_results: [],
200          },
201        };
202  
203        axiosMockFn.mock.mockImplementation(async () => mockResponse);
204  
205        await assert.rejects(
206          async () => {
207            await scrapeSERP('plumber seattle', 10, 'US');
208          },
209          { message: /zero organic results/ }
210        );
211      });
212  
213      test('should handle API errors with retry logic', async () => {
214        // Circuit breaker is already mocked at module level to pass through
215        let attemptCount = 0;
216        const mockResponse = {
217          status: 200,
218          data: {
219            organic_results: [
220              {
221                link: 'https://example.com',
222                title: 'Test Result',
223                snippet: 'Test description',
224              },
225            ],
226          },
227        };
228  
229        axiosMockFn.mock.mockImplementation(async () => {
230          attemptCount++;
231          // maxRetries=3 means 4 total attempts (0,1,2,3), so fail first 3 and succeed on 4th
232          // Use a retryable error (ECONNRESET matches isRetryableError check)
233          if (attemptCount < 4) {
234            throw new Error('ECONNRESET: connection reset by peer');
235          }
236          return mockResponse;
237        });
238  
239        const keyword = 'test keyword';
240        // scrapeSERP(keyword, limit, countryCode) - pass limit and countryCode correctly
241        const result = await scrapeSERP(keyword, 10, 'US');
242  
243        assert.strictEqual(attemptCount, 4, 'Should retry 3 times before succeeding on 4th attempt');
244        // scrapeSERP returns { results, metadata } not a plain array
245        assert.strictEqual(result.results.length, 1);
246        assert.strictEqual(result.results[0].title, 'Test Result');
247      });
248  
249      test('should throw error after max retries exceeded', async () => {
250        // Circuit breaker is already mocked at module level to pass through
251        axiosMockFn.mock.mockImplementation(async () => {
252          throw new Error('Persistent API error');
253        });
254  
255        const keyword = 'test keyword';
256        await assert.rejects(async () => scrapeSERP(keyword, 'US'), {
257          message: /Persistent API error|Max retries exceeded/,
258        });
259      });
260    });
261  
262    describe('URL Encoding', () => {
263      test('should encode keyword properly in API request', () => {
264        const keyword = 'plumber & electrician seattle';
265        const encoded = encodeURIComponent(keyword);
266        assert.strictEqual(encoded, 'plumber%20%26%20electrician%20seattle');
267      });
268  
269      test('should handle special characters in keywords', () => {
270        const testCases = [
271          { input: 'café repair', expected: 'caf%C3%A9%20repair' },
272          { input: '24/7 service', expected: '24%2F7%20service' },
273          { input: 'plumber (emergency)', expected: 'plumber%20(emergency)' },
274        ];
275  
276        testCases.forEach(({ input, expected }) => {
277          assert.strictEqual(encodeURIComponent(input), expected);
278        });
279      });
280    });
281  
282    describe('Business Name Extraction', () => {
283      test('should prioritize title over URL', () => {
284        const title = 'Professional Plumbing Co';
285        const url = 'https://example.com';
286  
287        // Title should be used when available
288        const businessName = title
289          .replace(/\s*[-|–—]\s*.*/g, '')
290          .replace(/\s*\|.*/g, '')
291          .replace(/\s*\(.*/g, '')
292          .trim();
293  
294        assert.strictEqual(businessName, 'Professional Plumbing Co');
295        assert.notStrictEqual(businessName, 'example');
296      });
297  
298      test('should clean title of common suffixes', () => {
299        const titles = [
300          'Company Name - About Us',
301          'Company Name | Services',
302          'Company Name – Location',
303          'Company Name — Description',
304        ];
305  
306        titles.forEach(title => {
307          const cleaned = title.replace(/\s*[-|–—]\s*.*/g, '').trim();
308          assert.strictEqual(cleaned, 'Company Name');
309        });
310      });
311  
312      test('should remove parenthetical content from title', () => {
313        const title = 'Company Name (Established 1990)';
314        const cleaned = title.replace(/\s*\(.*/g, '').trim();
315        assert.strictEqual(cleaned, 'Company Name');
316      });
317  
318      test('should extract business name from Yelp URLs', () => {
319        const url = 'https://www.yelp.com/biz/acme-plumbing-seattle';
320        const match = url.match(/\/biz\/([^/?]+)/);
321        assert.ok(match);
322        assert.strictEqual(match[1], 'acme-plumbing-seattle');
323        const businessName = match[1].replace(/-/g, ' ');
324        assert.strictEqual(businessName, 'acme plumbing seattle');
325      });
326  
327      test('should extract domain from regular URLs', () => {
328        const url = 'https://www.acmeplumbing.com/services';
329        const urlObj = new URL(url);
330        const domain = urlObj.hostname.replace(/^www\./, '');
331        assert.strictEqual(domain, 'acmeplumbing.com');
332        const businessName = domain.split('.')[0];
333        assert.strictEqual(businessName, 'acmeplumbing');
334      });
335  
336      test('should handle URLs without www prefix', () => {
337        const url = 'https://example.com';
338        const urlObj = new URL(url);
339        const domain = urlObj.hostname.replace(/^www\./, '');
340        assert.strictEqual(domain, 'example.com');
341      });
342  
343      test('should handle malformed URLs gracefully', () => {
344        const invalidUrl = 'not a valid url';
345        assert.throws(() => {
346          new URL(invalidUrl);
347        }, Error);
348      });
349  
350      test('should handle multiple cleaning rules', () => {
351        const title = 'Company Name (Location) - Services | About';
352        let cleaned = title.replace(/\s*\(.*/g, ''); // Remove parentheses first
353        cleaned = cleaned.replace(/\s*[-|–—]\s*.*/g, '').trim();
354        assert.strictEqual(cleaned, 'Company Name');
355      });
356    });
357  
358    describe('URL Parsing', () => {
359      test('should handle various Yelp URL formats', () => {
360        const urls = [
361          'https://www.yelp.com/biz/acme-plumbing-seattle',
362          'https://yelp.com/biz/best-electrician-portland?start=20',
363          'https://www.yelp.com/biz/top-contractor-boston#reviews',
364        ];
365  
366        urls.forEach(url => {
367          const match = url.match(/\/biz\/([^/?#]+)/);
368          assert.ok(match, `Should match ${url}`);
369          assert.ok(match[1].length > 0, `Should extract business slug from ${url}`);
370        });
371      });
372  
373      test('should not match non-Yelp URLs with /biz/ path', () => {
374        const url = 'https://example.com/business/info';
375        const match = url.match(/\/biz\/([^/?]+)/);
376        assert.strictEqual(match, null);
377      });
378  
379      test('should extract top-level domain', () => {
380        const urls = [
381          { url: 'https://acme.com', expected: 'acme' },
382          { url: 'https://best-plumbing.net', expected: 'best-plumbing' },
383          { url: 'https://super.plumber.co.uk', expected: 'super' },
384        ];
385  
386        urls.forEach(({ url, expected }) => {
387          const urlObj = new URL(url);
388          const domain = urlObj.hostname.replace(/^www\./, '');
389          const businessName = domain.split('.')[0];
390          assert.strictEqual(businessName, expected);
391        });
392      });
393    });
394  
395    describe('Result Formatting', () => {
396      test('should include all required fields in result', () => {
397        const mockResult = {
398          link: 'https://example.com',
399          title: 'Example Business',
400          snippet: 'Test snippet',
401        };
402  
403        const formatted = {
404          url: mockResult.link,
405          business_name: mockResult.title,
406          serp_contacts: null,
407          source: 'ZenRows SERP API',
408          keyword: 'test keyword',
409          title: mockResult.title,
410          snippet: mockResult.snippet || null,
411        };
412  
413        assert.ok(formatted.url);
414        assert.ok(formatted.business_name);
415        assert.strictEqual(formatted.serp_contacts, null);
416        assert.strictEqual(formatted.source, 'ZenRows SERP API');
417        assert.ok(formatted.keyword);
418        assert.ok(formatted.title);
419      });
420  
421      test('should handle missing snippet gracefully', () => {
422        const mockResult = {
423          link: 'https://example.com',
424          title: 'Example Business',
425          // No snippet
426        };
427  
428        const formatted = {
429          url: mockResult.link,
430          business_name: mockResult.title,
431          serp_contacts: null,
432          source: 'ZenRows SERP API',
433          keyword: 'test keyword',
434          title: mockResult.title,
435          snippet: mockResult.snippet || null,
436        };
437  
438        assert.strictEqual(formatted.snippet, null);
439      });
440    });
441  });