// tests/pipeline/site-filters.test.js
  1  /**
  2   * Tests for site-filters module
  3   */
  4  
  5  import { test, describe } from 'node:test';
  6  import assert from 'node:assert';
  7  import fs from 'fs';
  8  import path from 'path';
  9  import { fileURLToPath } from 'url';
 10  import {
 11    checkBlocklist,
 12    DIRECTORY_DOMAINS,
 13    SOCIAL_MEDIA_DOMAINS,
 14    DEMO_EMAIL_DOMAINS,
 15    loadFranchiseDomains,
 16    isGovernmentDomain,
 17    isEducationDomain,
 18    isDemoEmail,
 19    isGovernmentEmail,
 20    isEducationEmail,
 21  } from '../../src/utils/site-filters.js';
 22  
// ES modules do not provide __filename/__dirname, so rebuild them from
// import.meta.url; __dirname is used below to locate the repository root.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
 25  
 26  test('checkBlocklist should detect social media domains', () => {
 27    const socialDomains = ['facebook.com', 'twitter.com', 'linkedin.com', 'instagram.com', 'x.com'];
 28  
 29    for (const domain of socialDomains) {
 30      const result = checkBlocklist(domain);
 31      assert.ok(result, `${domain} should be detected`);
 32      assert.strictEqual(result.blocklist, 'social_media');
 33      assert.strictEqual(result.reason, 'Ignored: Social media platform');
 34    }
 35  });
 36  
 37  test('checkBlocklist should detect business directory domains', () => {
 38    const directoryDomains = [
 39      'yelp.com',
 40      'yellowpages.com',
 41      'craigslist.com',
 42      'thumbtack.com',
 43      'angi.com',
 44    ];
 45  
 46    for (const domain of directoryDomains) {
 47      const result = checkBlocklist(domain);
 48      assert.ok(result, `${domain} should be detected`);
 49      assert.strictEqual(result.blocklist, 'directory');
 50      assert.strictEqual(result.reason, 'Ignored: Business directory');
 51    }
 52  });
 53  
 54  test('checkBlocklist should handle case-insensitive matching', () => {
 55    const testCases = ['YELP.COM', 'YeLp.CoM', 'Facebook.COM', 'TWITTER.com'];
 56  
 57    for (const domain of testCases) {
 58      const result = checkBlocklist(domain);
 59      assert.ok(result, `${domain} should be detected (case-insensitive)`);
 60    }
 61  });
 62  
 63  test('checkBlocklist should handle subdomain matching', () => {
 64    const testCases = ['www.yelp.com', 'au.yelp.com', 'm.facebook.com', 'business.linkedin.com'];
 65  
 66    for (const domain of testCases) {
 67      const result = checkBlocklist(domain);
 68      assert.ok(result, `${domain} should be detected (with subdomain)`);
 69    }
 70  });
 71  
 72  test('checkBlocklist should return null for normal business sites', () => {
 73    const normalDomains = [
 74      'example.com',
 75      'myplumber.com',
 76      'joes-pizza.com',
 77      'dentist-office.co.uk',
 78      'law-firm.com',
 79    ];
 80  
 81    for (const domain of normalDomains) {
 82      const result = checkBlocklist(domain);
 83      assert.strictEqual(result, null, `${domain} should not be blocked`);
 84    }
 85  });
 86  
 87  test('checkBlocklist should handle null/undefined input', () => {
 88    assert.strictEqual(checkBlocklist(null), null);
 89    assert.strictEqual(checkBlocklist(undefined), null);
 90    assert.strictEqual(checkBlocklist(''), null);
 91  });
 92  
 93  test('DIRECTORY_DOMAINS should include major directories', () => {
 94    const expectedDomains = [
 95      'yelp.com',
 96      'yellowpages.com',
 97      'craigslist.com',
 98      'craigslist.org',
 99      'thumbtack.com',
100      'angi.com',
101      'homeadvisor.com',
102      'zillow.com',
103      'tripadvisor.com',
104    ];
105  
106    for (const domain of expectedDomains) {
107      assert.ok(DIRECTORY_DOMAINS.includes(domain), `DIRECTORY_DOMAINS should include ${domain}`);
108    }
109  });
110  
// Guards against accidental removal of the major platforms from the
// exported SOCIAL_MEDIA_DOMAINS list.
test('SOCIAL_MEDIA_DOMAINS should include major platforms', () => {
  const required = [
    'facebook.com',
    'instagram.com',
    'twitter.com',
    'x.com',
    'linkedin.com',
    'youtube.com',
    'tiktok.com',
    'pinterest.com',
  ];

  required.forEach((entry) => {
    const present = SOCIAL_MEDIA_DOMAINS.includes(entry);
    assert.ok(present, `SOCIAL_MEDIA_DOMAINS should include ${entry}`);
  });
});
130  
// linkedin.com must be reported as social_media (per the original note it is
// present in both lists, so this pins which category wins).
test('checkBlocklist should prioritize social media over directory', () => {
  const verdict = checkBlocklist('linkedin.com');

  assert.ok(verdict);
  assert.strictEqual(verdict.blocklist, 'social_media');
});
137  
138  // ===== Franchise Detection Tests =====
139  
// With country code 'us', known franchise brands are flagged as 'franchise'.
test('checkBlocklist should detect US franchises', () => {
  // Mr. Rooter: the bare domain also pins the exact reason text.
  const rooter = checkBlocklist('mrrooter.com', 'us');
  assert.ok(rooter);
  assert.strictEqual(rooter.blocklist, 'franchise');
  assert.strictEqual(rooter.reason, 'Ignored: Home service franchise');

  // A location subdomain of Mr. Rooter, then the Molly Maid brand.
  for (const host of ['austin.mrrooter.com', 'mollymaid.com']) {
    const verdict = checkBlocklist(host, 'us');
    assert.ok(verdict);
    assert.strictEqual(verdict.blocklist, 'franchise');
  }
});
157  
// With country code 'au', the Jim's Mowing and Jim's Cleaning domains are
// flagged as 'franchise'.
test('checkBlocklist should detect AU franchises', () => {
  ['jimsmowing.com.au', 'jimscleaning.com.au'].forEach((host) => {
    const verdict = checkBlocklist(host, 'au');

    assert.ok(verdict);
    assert.strictEqual(verdict.blocklist, 'franchise');
  });
});
169  
// Franchise matching is opt-in: without a country code the franchise domain
// must come back as null.
test('checkBlocklist should not detect franchises without country code', () => {
  assert.strictEqual(checkBlocklist('mrrooter.com'), null);
});
175  
// Hyphenated and dotted spellings of a franchise brand must still match.
test('checkBlocklist should handle franchise domain normalization', () => {
  const spellings = [
    'mr-rooter.com', // with hyphens
    'mr.rooter.com', // with dots
  ];

  for (const host of spellings) {
    const verdict = checkBlocklist(host, 'us');
    assert.ok(verdict);
    assert.strictEqual(verdict.blocklist, 'franchise');
  }
});
187  
// Non-franchise trade businesses must not be caught by the franchise check.
test('checkBlocklist should not block independent businesses', () => {
  const independents = [
    ['acmeplumbing.com', 'us'], // non-franchise plumber
    ['smithcleaning.com.au', 'au'], // non-franchise cleaner
  ];

  for (const [host, country] of independents) {
    assert.strictEqual(checkBlocklist(host, country), null);
  }
});
197  
198  // ===== loadFranchiseDomains Tests =====
199  
/**
 * loadFranchiseDomains: empty results for missing/unknown country codes,
 * reference-stable results across repeated calls, and normalisation of brand
 * names read from a franchise list file.
 */
describe('loadFranchiseDomains', () => {
  test('returns empty array for null country code', () => {
    assert.deepStrictEqual(loadFranchiseDomains(null), []);
  });

  test('returns empty array for undefined country code', () => {
    assert.deepStrictEqual(loadFranchiseDomains(undefined), []);
  });

  test('returns empty array for non-existent country code', () => {
    assert.deepStrictEqual(loadFranchiseDomains('zz'), []);
  });

  test('caches franchise lists (second call returns same reference)', () => {
    const first = loadFranchiseDomains('us');
    const second = loadFranchiseDomains('us');
    // Identity (not deep equality) is what demonstrates the cache hit.
    assert.strictEqual(first, second);
  });

  test('normalizes brand names to domain-friendly format', () => {
    // Write to data/franchises/ at repo root (where site-filters.js reads from)
    const projectRoot = path.join(__dirname, '../..');
    const testFile = path.join(projectRoot, 'data/franchises/testload99.txt');

    // Fixture mixes comment lines, a blank line and brand names containing
    // punctuation, spaces and capitals.
    const fixture = [
      '# Test franchise list',
      'Mr. Rooter',
      "Jim's Mowing",
      'CertaPro Painters',
      '',
      '# Another franchise',
      'The UPS Store',
    ].join('\n');

    // recursive: true is a no-op when the directory already exists, so no
    // existsSync pre-check (and no check-then-create race) is needed.
    fs.mkdirSync(path.dirname(testFile), { recursive: true });

    try {
      // The write lives inside the try so a failed or partial write still
      // reaches the cleanup below.
      fs.writeFileSync(testFile, fixture);

      const result = loadFranchiseDomains('testload99');

      assert.ok(result.includes('mrrooter'));
      assert.ok(result.includes('jimsmowing'));
      assert.ok(result.includes('certapropainters'));
      assert.ok(result.includes('theupsstore'));
      // Comment lines must never leak into the parsed list.
      assert.strictEqual(result.filter((entry) => entry.startsWith('#')).length, 0);
    } finally {
      // force: true ignores a missing file, so cleanup cannot mask the
      // original failure with an ENOENT of its own.
      fs.rmSync(testFile, { force: true });
    }
  });
});
258  
259  // ===== Government Domain Tests =====
260  
/**
 * isGovernmentDomain: input validation, the government suffixes exercised
 * below, and tolerance of letter case and surrounding whitespace.
 */
describe('isGovernmentDomain', () => {
  // Asserts, in order, that every input yields the same expected boolean.
  const expectEach = (inputs, expected) => {
    for (const input of inputs) {
      assert.strictEqual(isGovernmentDomain(input), expected);
    }
  };

  test('returns false for invalid inputs', () => {
    expectEach([null, undefined, '', 123], false);
  });

  test('detects US federal .gov domains', () => {
    expectEach(['whitehouse.gov', 'nasa.gov'], true);
  });

  test('detects country-specific .gov domains', () => {
    expectEach(['example.gov.au', 'example.gov.uk', 'example.gov.in'], true);
  });

  test('detects Canadian .gc.ca domains', () => {
    expectEach(['example.gc.ca'], true);
  });

  test('detects New Zealand .govt.nz domains', () => {
    expectEach(['example.govt.nz'], true);
  });

  test('detects Spanish .gob domains', () => {
    expectEach(['example.gob.mx', 'example.gob.es'], true);
  });

  test('detects French .gouv domains', () => {
    expectEach(['example.gouv.fr'], true);
  });

  test('detects Asian .go domains', () => {
    expectEach(['example.go.jp', 'example.go.kr'], true);
  });

  test('detects Brazilian .gov.br domains', () => {
    expectEach(['example.gov.br'], true);
  });

  test('detects US military .mil domains', () => {
    expectEach(['army.mil'], true);
  });

  test('returns false for non-government domains', () => {
    expectEach(['example.com', 'government.com'], false);
  });

  test('is case insensitive', () => {
    expectEach(['EXAMPLE.GOV'], true);
  });

  test('handles whitespace', () => {
    expectEach(['  example.gov  '], true);
  });
});
323  
324  // ===== Education Domain Tests =====
325  
/**
 * isEducationDomain: input validation, the .edu / .ac suffixes exercised
 * below, and tolerance of letter case and surrounding whitespace.
 */
describe('isEducationDomain', () => {
  // Asserts, in order, that every input yields the same expected boolean.
  const expectEach = (inputs, expected) => {
    for (const input of inputs) {
      assert.strictEqual(isEducationDomain(input), expected);
    }
  };

  test('returns false for invalid inputs', () => {
    expectEach([null, undefined, '', 123], false);
  });

  test('detects US .edu domains', () => {
    expectEach(['harvard.edu', 'mit.edu'], true);
  });

  test('detects country-specific .edu domains', () => {
    expectEach(['example.edu.au', 'example.edu.uk'], true);
  });

  test('detects .ac academic domains', () => {
    expectEach(['example.ac.uk', 'example.ac.jp', 'example.ac.nz'], true);
  });

  test('returns false for non-education domains', () => {
    expectEach(['example.com', 'education.com'], false);
  });

  test('is case insensitive', () => {
    expectEach(['EXAMPLE.EDU'], true);
  });

  test('handles whitespace', () => {
    expectEach(['  example.edu  '], true);
  });
});
363  
364  // ===== Demo Email Tests =====
365  
/**
 * isDemoEmail: input validation, exact and subdomain matches on demo
 * domains, throwaway mail services, and tolerance of letter case and
 * surrounding whitespace.
 */
describe('isDemoEmail', () => {
  // Asserts, in order, that every input yields the same expected boolean.
  const expectEach = (inputs, expected) => {
    for (const input of inputs) {
      assert.strictEqual(isDemoEmail(input), expected);
    }
  };

  test('returns false for invalid inputs', () => {
    expectEach([null, undefined, '', 123], false);
  });

  test('detects exact demo domain matches', () => {
    expectEach(['user@example.com', 'user@test.com', 'user@mailinator.com'], true);
  });

  test('detects subdomain demo emails', () => {
    expectEach(['user@subdomain.example.com', 'user@api.test.com'], true);
  });

  test('returns false for emails without @ symbol', () => {
    expectEach(['notanemail'], false);
  });

  test('returns false for legitimate email domains', () => {
    expectEach(['user@gmail.com', 'user@company.com'], false);
  });

  test('is case insensitive', () => {
    expectEach(['USER@EXAMPLE.COM'], true);
  });

  test('handles whitespace', () => {
    expectEach(['  user@example.com  '], true);
  });

  test('detects throwaway email services', () => {
    expectEach(
      ['user@10minutemail.com', 'user@guerrillamail.com', 'user@throwaway.email'],
      true
    );
  });
});
408  
409  // ===== Government Email Tests =====
410  
/**
 * isGovernmentEmail: input validation, government address detection, the
 * requirement for an @ sign, and tolerance of letter case and surrounding
 * whitespace.
 */
describe('isGovernmentEmail', () => {
  // Asserts, in order, that every input yields the same expected boolean.
  const expectEach = (inputs, expected) => {
    for (const input of inputs) {
      assert.strictEqual(isGovernmentEmail(input), expected);
    }
  };

  test('returns false for invalid inputs', () => {
    expectEach([null, undefined, '', 123], false);
  });

  test('detects government email addresses', () => {
    expectEach(['user@whitehouse.gov', 'user@example.gov.au', 'user@example.gc.ca'], true);
  });

  test('returns false for emails without @ symbol', () => {
    expectEach(['notanemail.gov'], false);
  });

  test('returns false for non-government emails', () => {
    expectEach(['user@example.com', 'user@government.com'], false);
  });

  test('is case insensitive', () => {
    expectEach(['USER@EXAMPLE.GOV'], true);
  });

  test('handles whitespace', () => {
    expectEach(['  user@example.gov  '], true);
  });
});
442  
443  // ===== Education Email Tests =====
444  
/**
 * isEducationEmail: input validation, education address detection, the
 * requirement for an @ sign, and tolerance of letter case and surrounding
 * whitespace.
 */
describe('isEducationEmail', () => {
  // Asserts, in order, that every input yields the same expected boolean.
  const expectEach = (inputs, expected) => {
    for (const input of inputs) {
      assert.strictEqual(isEducationEmail(input), expected);
    }
  };

  test('returns false for invalid inputs', () => {
    expectEach([null, undefined, '', 123], false);
  });

  test('detects education email addresses', () => {
    expectEach(
      ['student@harvard.edu', 'student@example.edu.au', 'student@example.ac.uk'],
      true
    );
  });

  test('returns false for emails without @ symbol', () => {
    expectEach(['notanemail.edu'], false);
  });

  test('returns false for non-education emails', () => {
    expectEach(['user@example.com', 'user@education.com'], false);
  });

  test('is case insensitive', () => {
    expectEach(['STUDENT@EXAMPLE.EDU'], true);
  });

  test('handles whitespace', () => {
    expectEach(['  student@example.edu  '], true);
  });
});
476  
477  // ===== Government/Education Blocking Integration =====
478  
/**
 * Integration of the government/education detection into checkBlocklist:
 * the verdict carries the right reason and category, and country-specific
 * suffixes are blocked too.
 */
describe('checkBlocklist - government and education domains', () => {
  test('blocks government domains', () => {
    const verdict = checkBlocklist('example.gov');

    assert.ok(verdict);
    assert.strictEqual(verdict.reason, 'Ignored: Government domain');
    assert.strictEqual(verdict.blocklist, 'government');
  });

  test('blocks education domains', () => {
    const verdict = checkBlocklist('example.edu');

    assert.ok(verdict);
    assert.strictEqual(verdict.reason, 'Ignored: Education domain');
    assert.strictEqual(verdict.blocklist, 'education');
  });

  test('blocks country-specific government domains', () => {
    for (const host of ['example.gov.au', 'example.gc.ca', 'example.gob.mx']) {
      assert.ok(checkBlocklist(host));
    }
  });

  test('blocks country-specific education domains', () => {
    for (const host of ['example.edu.au', 'example.ac.uk']) {
      assert.ok(checkBlocklist(host));
    }
  });
});