// src/utils/site-filters.js
  1  /**
  2   * Site Filtering Utilities
  3   * Blocklists for directories, social media, and franchises to prevent wasting API credits
  4   */
  5  
  6  import fs from 'fs';
  7  import path from 'path';
  8  import { fileURLToPath } from 'url';
  9  
// Resolve this module's own file path and directory from its module URL so
// that data files can be located relative to this source file.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// Memo of parsed franchise lists, keyed by lower-cased country code.
// loadFranchiseDomains() stores an empty array here for countries that have no
// franchise file (or whose file failed to load), so each country is read from
// disk at most once per process.
const franchiseCache = new Map();
 15  
 16  // Known business directory domains
 17  export const DIRECTORY_DOMAINS = [
 18    // Better Business Bureau (all regional variants)
 19    'bbb.org',
 20    'bbb.com',
 21    'bbb.ca', // Canadian BBB
 22    'bbbonline.org', // BBB Online arm
 23  
 24    // General directories
 25    'yelp.com',
 26    'yelp.com.au',
 27    'yellowpages.com',
 28    'yellowpages.com.au',
 29    'yellowpages.ca',
 30    'canpages.ca', // Canadian yellow pages
 31    'manta.com',
 32    'superpages.com',
 33    'whitepages.com',
 34    'mapquest.com',
 35    'foursquare.com',
 36    'merchantcircle.com',
 37    'local.com',
 38    'citysearch.com',
 39  
 40    // Consumer review aggregators (often outrank actual business sites)
 41    'trustpilot.com',
 42    'reviews.io',
 43    'g2.com',
 44    'capterra.com',
 45    'getapp.com',
 46    'sitejabber.com',
 47    'pissedconsumer.com',
 48  
 49    // Home services marketplaces
 50    'trustedchoice.com',
 51    'thumbtack.com',
 52    'angi.com',
 53    'angieslist.com',
 54    'homeadvisor.com',
 55    'houzz.com',
 56    'porch.com',
 57    'bark.com', // UK/US/AU home services marketplace
 58    'tasker.com',
 59    'taskrabbit.com',
 60    'checkatrade.com', // UK trades directory
 61    'trustatrader.com', // UK trades directory
 62    'rated-people.com', // UK home improvement
 63    'mybuilder.com', // UK trades directory
 64    'tradesman.ie', // Ireland
 65    // AU/NZ home services
 66    'hipages.com.au',
 67    'serviceseeking.com.au',
 68    'oneflare.com',
 69    'oneflare.com.au',
 70    'homestars.com', // Canadian home services
 71  
 72    // Real estate
 73    'zillow.com',
 74    'realtor.com',
 75    'redfin.com',
 76    'rightmove.co.uk', // UK property
 77    'zoopla.co.uk', // UK property
 78    'domain.com.au', // AU property
 79    'realestate.com.au', // AU property
 80    'trademe.co.nz', // NZ property/marketplace
 81  
 82    // Hospitality/dining
 83    'tripadvisor.com',
 84    'opentable.com',
 85    'zomato.com',
 86    'grubhub.com',
 87    'doordash.com',
 88    'ubereats.com',
 89  
 90    // Jobs/HR — use base-domain patterns (with dot) to catch all country TLDs
 91    // e.g. 'glassdoor.' matches glassdoor.com, glassdoor.co.in, glassdoor.co.uk, etc.
 92    'indeed.',
 93    'glassdoor.',
 94    'seek.com.au',
 95    'seek.co.nz',
 96    'simplyhired.',
 97    'jobstreet.',
 98    'naukri.com',
 99    'sulekha.com', // Indian service/job directory
100    'monster.com',
101    'monster.co.', // monster.co.uk, monster.co.in etc.
102    'careerjet.',
103    'jora.com',
104    'neuvoo.com',
105    'talent.com',
106    'jobrapido.',
107    'fixfinder.', // Service finder directories
108    'numberdekho.com', // Car classifieds India
109  
110    // Healthcare
111    'healthgrades.com',
112    'zocdoc.com',
113    'vitals.com',
114    'ratemds.com',
115    'webmd.com',
116  
117    // Legal
118    'findlaw.com',
119    'avvo.com',
120    'lawyers.com',
121    'justia.com',
122    'martindale.com',
123    'lawinfo.com',
124    'hg.org',
125    'superlawyers.com',
126    'nolo.com',
127    'legalmatch.com',
128  
129    // Classifieds
130    'craigslist.com',
131    'craigslist.org',
132    'gumtree.com',
133    'gumtree.com.au',
134    'kijiji.ca',
135  
136    // News / media (not local businesses — produce incomplete LLM responses)
137    'barrietoday.com',
138    'newmarkettoday.ca',
139    'allafrica.com',
140    'thestar.com',
141    'theglobeandmail.com',
142    'cbc.ca',
143    'abc.net.au',
144    'smh.com.au',
145    'ninemsn.com.au',
146  
147    // Free website builders / subdomains (not real business sites)
148    'ueniweb.com',
149    'wixsite.com',
150    'weebly.com',
151    'jimdo.com',
152    'site123.me',
153    'webnode.com',
154  
155    // International directories / platforms found during proofreading
156    'all.biz',
157    'wheree.com',
158    'kompass.com',
159  
160    // Obituary / memorial aggregators (SERP noise from name-based keywords)
161    'legacy.com',
162    'dignitymemorial.com',
163    'tributearchive.com',
164    'echovita.com',
165  
166    // Government / municipal domains
167    'lille.fr',
168  ];
169  
/**
 * Social media platform domains. A result hosted on one of these (or on a
 * subdomain of one) is a profile page rather than a business's own site.
 */
export const SOCIAL_MEDIA_DOMAINS = [
  'facebook.com', 'instagram.com',
  'twitter.com', 'x.com',
  'linkedin.com',
  'youtube.com', 'tiktok.com',
  'pinterest.com', 'snapchat.com',
  'reddit.com', 'tumblr.com',
];
184  
/**
 * Email domains that indicate a demo, test or placeholder address and should
 * be filtered out.
 */
export const DEMO_EMAIL_DOMAINS = [
  // example.* placeholders
  'example.com', 'example.org', 'example.net',

  // Test / demo / sample placeholders
  'test.com', 'test.org', 'test.net',
  'testing.com', 'demo.com', 'sample.com',

  // Non-routable or explicitly invalid hosts
  'localhost', 'invalid', 'invalid.com',

  // Temporary / throwaway mailbox providers
  'tempmail.com', 'throwaway.email', '10minutemail.com',
  'guerrillamail.com', 'mailinator.com',
];
205  
/**
 * Load the franchise brand list for a country and normalise it for domain matching.
 *
 * Reads `../../data/franchises/<code>.txt` relative to this module: one brand
 * per line, blank lines and lines starting with `#` ignored. Each brand is
 * lower-cased and stripped of everything except a-z and 0-9
 * ("Mr. Rooter" -> "mrrooter", "Jim's Mowing" -> "jimsmowing").
 *
 * Results are memoised per lower-cased country code, including the empty
 * result for a missing or unreadable file. The returned array is the cached
 * instance, so callers should treat it as read-only.
 *
 * @param {string} countryCode - Country code (e.g., 'us', 'au'); case-insensitive
 * @returns {string[]} Normalised brand names; empty if the code is missing,
 *   not a string, contains characters other than a-z, 0-9, '_' or '-', or has
 *   no readable franchise file
 */
export function loadFranchiseDomains(countryCode) {
  // Non-strings would throw on toLowerCase(); treat them like a missing code.
  if (typeof countryCode !== 'string' || countryCode === '') return [];

  const lowerCountryCode = countryCode.toLowerCase();

  // The code becomes part of a file name below, so accept only plain
  // identifier characters. This rejects path separators and '..' segments
  // that could point outside the franchises directory.
  if (!/^[a-z0-9_-]+$/.test(lowerCountryCode)) return [];

  // Check cache first
  if (franchiseCache.has(lowerCountryCode)) {
    return franchiseCache.get(lowerCountryCode);
  }

  const franchiseFilePath = path.join(
    __dirname,
    '../../data/franchises',
    `${lowerCountryCode}.txt`
  );

  try {
    if (!fs.existsSync(franchiseFilePath)) {
      // No franchise file for this country - cache empty array
      franchiseCache.set(lowerCountryCode, []);
      return [];
    }

    const franchises = fs
      .readFileSync(franchiseFilePath, 'utf-8')
      .split('\n')
      .map((line) => line.trim()) // also drops the '\r' of CRLF line endings
      .filter((line) => line && !line.startsWith('#')) // skip blanks and comments
      .map((brandName) => brandName.toLowerCase().replace(/[^a-z0-9]/g, ''))
      .filter((brand) => brand.length > 0); // e.g. a line made only of punctuation

    franchiseCache.set(lowerCountryCode, franchises);
    return franchises;
  } catch (error) {
    // Best effort: a broken list must not stop filtering, so log and fall back
    // to "no franchises" (cached, so the failing read is not retried).
    console.error(`Error loading franchise list for ${countryCode}:`, error.message);
    franchiseCache.set(lowerCountryCode, []);
    return [];
  }
}
259  
/**
 * Check a domain against every blocklist, in this order: social media,
 * directories, government, education, non-commercial TLDs, then (only when a
 * country code is given) that country's franchise brands.
 *
 * Blocklist entries come in two forms:
 *  - a full domain ('x.com'): matches that domain and any subdomain of it
 *    ('sub.x.com'), but not names that merely end with the same letters
 *    ('auditandfix.com');
 *  - a base-domain pattern ending in a dot ('glassdoor.', 'monster.co.'):
 *    matches that name at a label boundary when only a TLD follows, e.g.
 *    'glassdoor.com', 'www.glassdoor.co.uk', 'monster.co.in'.
 *
 * @param {string} domain - Domain to check
 * @param {string} countryCode - Optional country code for franchise checking (e.g., 'us', 'au')
 * @returns {object|null} - {reason, blocklist} if matched, null otherwise
 */
export function checkBlocklist(domain, countryCode = null) {
  // Match the sibling helpers: anything that is not a non-empty string is "no match".
  if (!domain || typeof domain !== 'string') return null;

  const lowerDomain = domain.toLowerCase();

  // What may follow a base-domain pattern: a single TLD label ('com', 'jobs')
  // or a short second-level + country pair ('co.uk', 'com.au').
  const tldSuffix = /^(?:[a-z]{2,}|[a-z]{2,3}\.[a-z]{2})$/;

  const domainMatches = (candidate, blocked) => {
    if (!blocked.endsWith('.')) {
      // Full-domain entry: exact match or subdomain.
      return candidate === blocked || candidate.endsWith(`.${blocked}`);
    }

    // Base-domain pattern: locate it at a label boundary (after a dot, or at
    // the very start), then require that only a TLD remains after it. The
    // right-most occurrence is the one sitting next to the TLD.
    const afterDot = candidate.lastIndexOf(`.${blocked}`);
    let start;
    if (afterDot !== -1) {
      start = afterDot + 1;
    } else if (candidate.startsWith(blocked)) {
      start = 0;
    } else {
      return false;
    }
    return tldSuffix.test(candidate.slice(start + blocked.length));
  };

  // Check social media
  if (SOCIAL_MEDIA_DOMAINS.some((blocked) => domainMatches(lowerDomain, blocked))) {
    return {
      reason: 'Ignored: Social media platform',
      blocklist: 'social_media',
    };
  }

  // Check directories
  if (DIRECTORY_DOMAINS.some((blocked) => domainMatches(lowerDomain, blocked))) {
    return {
      reason: 'Ignored: Business directory',
      blocklist: 'directory',
    };
  }

  // Check government/education/non-commercial TLDs
  if (isGovernmentDomain(lowerDomain)) {
    return {
      reason: 'Ignored: Government domain',
      blocklist: 'government',
    };
  }

  if (isEducationDomain(lowerDomain)) {
    return {
      reason: 'Ignored: Education domain',
      blocklist: 'education',
    };
  }

  if (isNonCommercialDomain(lowerDomain)) {
    return {
      reason: 'Ignored: Non-commercial domain',
      blocklist: 'non-commercial',
    };
  }

  // Check franchises (country-specific)
  if (countryCode) {
    // Franchise names are stored without punctuation, so strip it from the
    // domain too before the substring test (www.mr-rooter.com -> wwwmrrootercom).
    const cleanDomain = lowerDomain.replace(/[^a-z0-9]/g, '');
    const isFranchise = loadFranchiseDomains(countryCode).some((brand) =>
      cleanDomain.includes(brand)
    );

    if (isFranchise) {
      return {
        reason: 'Ignored: Home service franchise',
        blocklist: 'franchise',
      };
    }
  }

  return null;
}
331  
/**
 * Report whether a domain ends in a government or military suffix.
 * The input is trimmed and lower-cased before testing; anything that is not a
 * non-empty string yields false.
 * @param {string} domain - Domain to check
 * @returns {boolean} True if government domain
 */
export function isGovernmentDomain(domain) {
  if (typeof domain !== 'string' || domain === '') {
    return false;
  }

  const candidate = domain.trim().toLowerCase();

  // Each suffix must be preceded by a dot, i.e. it matches hosts under the
  // suffix ('health.gov.au') rather than the bare suffix itself.
  const governmentSuffixes = [
    /\.gov$/i, // US federal (.gov)
    /\.gov\.[a-z]{2}$/i, // Country-specific (.gov.au, .gov.uk, .gov.in, etc.)
    /\.gc\.ca$/i, // Canada (Government of Canada)
    /\.govt\.nz$/i, // New Zealand
    /\.gob\.[a-z]{2}$/i, // Spanish-speaking countries (.gob.mx, .gob.es, etc.)
    /\.gouv\.[a-z]{2}$/i, // French-speaking countries (.gouv.fr, .gouv.be, etc.)
    /\.go\.[a-z]{2}$/i, // Japan, Korea (.go.jp, .go.kr)
    /\.gov\.br$/i, // Brazil
    /\.mil$/i, // US military
    /\.mil\.[a-z]{2}$/i, // Country military domains
  ];

  for (const suffix of governmentSuffixes) {
    if (suffix.test(candidate)) {
      return true;
    }
  }
  return false;
}
358  
/**
 * Report whether a domain ends in an education or academic suffix.
 * The input is trimmed and lower-cased before testing; anything that is not a
 * non-empty string yields false.
 * @param {string} domain - Domain to check
 * @returns {boolean} True if education domain
 */
export function isEducationDomain(domain) {
  if (typeof domain !== 'string' || domain === '') {
    return false;
  }

  const candidate = domain.trim().toLowerCase();

  const educationSuffixes = [
    /\.edu$/i, // US education (.edu)
    /\.edu\.[a-z]{2}$/i, // Country-specific (.edu.au, .edu.uk, etc.)
    /\.ac\.[a-z]{2}$/i, // Academic domains (.ac.uk, .ac.jp, .ac.nz, etc.)
  ];

  const firstHit = educationSuffixes.findIndex((suffix) => suffix.test(candidate));
  return firstHit !== -1;
}
378  
/**
 * Report whether a domain uses a non-commercial suffix (charities,
 * associations, non-profits). The input is trimmed and lower-cased before
 * testing; anything that is not a non-empty string yields false.
 * @param {string} domain - Domain to check
 * @returns {boolean} True if non-commercial domain
 */
export function isNonCommercialDomain(domain) {
  const usable = typeof domain === 'string' && domain !== '';
  if (!usable) return false;

  const candidate = domain.trim().toLowerCase();

  const matchesSuffix = (suffix) => suffix.test(candidate);

  return [
    /\.org$/i, // Generic non-profit (.org)
    /\.org\.[a-z]{2}$/i, // Country-specific (.org.au, .org.nz, .org.uk, .org.za, etc.)
    /\.asn\.au$/i, // Australian associations
    /\.ngo$/i, // Non-governmental organisations
    /\.charity$/i, // UK charities
    /\.foundation$/i, // Foundations
  ].some(matchesSuffix);
}
400  
/**
 * Check if email address uses a demo/test domain.
 *
 * The domain is the text after the LAST '@', so an address whose quoted local
 * part itself contains '@' is still judged by its real domain. The domain
 * matches when it equals a DEMO_EMAIL_DOMAINS entry or is a subdomain of one.
 *
 * @param {string} email - Email address to check
 * @returns {boolean} True if demo email; false for non-strings, addresses
 *   without '@', or an empty domain part
 */
export function isDemoEmail(email) {
  if (!email || typeof email !== 'string') return false;

  const lower = email.toLowerCase().trim();

  const atIndex = lower.lastIndexOf('@');
  if (atIndex === -1) return false;

  const domain = lower.slice(atIndex + 1);
  if (!domain) return false;

  return DEMO_EMAIL_DOMAINS.some(
    (blocked) => domain === blocked || domain.endsWith(`.${blocked}`)
  );
}
415  
/**
 * Check if email address is from a government domain.
 *
 * The domain is the text after the LAST '@', so an address whose quoted local
 * part itself contains '@' is still judged by its real domain; that domain is
 * then passed to isGovernmentDomain().
 *
 * @param {string} email - Email address to check
 * @returns {boolean} True if government email; false for non-strings,
 *   addresses without '@', or an empty domain part
 */
export function isGovernmentEmail(email) {
  if (!email || typeof email !== 'string') return false;

  const lower = email.toLowerCase().trim();

  const atIndex = lower.lastIndexOf('@');
  if (atIndex === -1) return false;

  const domain = lower.slice(atIndex + 1);
  if (!domain) return false;

  return isGovernmentDomain(domain);
}
430  
/**
 * Check if email address is from an education domain.
 *
 * The domain is the text after the LAST '@', so an address whose quoted local
 * part itself contains '@' is still judged by its real domain; that domain is
 * then passed to isEducationDomain().
 *
 * @param {string} email - Email address to check
 * @returns {boolean} True if education email; false for non-strings,
 *   addresses without '@', or an empty domain part
 */
export function isEducationEmail(email) {
  if (!email || typeof email !== 'string') return false;

  const lower = email.toLowerCase().trim();

  const atIndex = lower.lastIndexOf('@');
  if (atIndex === -1) return false;

  const domain = lower.slice(atIndex + 1);
  if (!domain) return false;

  return isEducationDomain(domain);
}
445  
// Domain-name keywords per industry type. classifyIndustry() walks the types
// and their keywords in the order written here and returns on the first hit,
// so ordering decides which keyword is reported when several could match.
const INDUSTRY_KEYWORDS = {
  legal: [
    'solicitor',
    'barrister',
    'attorney',
    'lawfirm',
    'lawyer',
    'legal',
    'counsel',
    'esq',
  ],
  healthcare: [
    'dental', 'dentist',
    'medical', 'clinic', 'hospital',
    'pharmacy', 'physio', 'veterinary',
  ],
  financial: [
    'financialadviser',
    'mortgagebroker',
    'accounting',
    'bookkeeper',
  ],
};
461  
/**
 * Classify industry from domain name keywords.
 *
 * The domain is lower-cased and its hyphens are removed, so multi-word
 * keywords such as 'lawfirm' or 'mortgagebroker' also match hyphenated
 * domains ('smith-law-firm.com'). Matching is a plain substring test over the
 * whole domain, TLD included; industry types and keywords are tried in the
 * order they appear in INDUSTRY_KEYWORDS and the first hit wins.
 *
 * @param {string} domain - Domain to check (e.g., 'smith-solicitors.com.au')
 * @returns {object|null} - {type, reason} if matched, null otherwise
 */
export function classifyIndustry(domain) {
  if (!domain || typeof domain !== 'string') return null;

  // Keywords contain no hyphens, so dropping hyphens only adds matches: every
  // keyword found in the raw domain is still found in the normalized form.
  const normalizedDomain = domain.toLowerCase().replaceAll('-', '');

  for (const [type, keywords] of Object.entries(INDUSTRY_KEYWORDS)) {
    const keyword = keywords.find((candidate) => normalizedDomain.includes(candidate));
    if (keyword !== undefined) {
      return { type, reason: `domain keyword: ${keyword}` };
    }
  }

  return null;
}