// site-filters.js
/**
 * Site Filtering Utilities
 * Blocklists for directories, social media, and franchises to prevent wasting API credits
 */

import fs from 'fs';
import path from 'path';
import { fileURLToPath } from 'url';

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// Cache for franchise lists by country code (lower-cased code -> normalized brand names)
const franchiseCache = new Map();

// Country codes are interpolated into a file path, so only accept characters
// that cannot introduce path separators or parent-directory segments.
const COUNTRY_CODE_PATTERN = /^[a-z0-9_-]+$/;

// Known business directory domains.
// Entries ending in a dot (e.g. 'glassdoor.') are prefix patterns: they match the
// named label followed by ANY suffix, at a label boundary (see matchesBlockedDomain).
export const DIRECTORY_DOMAINS = [
  // Better Business Bureau (all regional variants)
  'bbb.org',
  'bbb.com',
  'bbb.ca', // Canadian BBB
  'bbbonline.org', // BBB Online arm

  // General directories
  'yelp.com',
  'yelp.com.au',
  'yellowpages.com',
  'yellowpages.com.au',
  'yellowpages.ca',
  'canpages.ca', // Canadian yellow pages
  'manta.com',
  'superpages.com',
  'whitepages.com',
  'mapquest.com',
  'foursquare.com',
  'merchantcircle.com',
  'local.com',
  'citysearch.com',

  // Consumer review aggregators (often outrank actual business sites)
  'trustpilot.com',
  'reviews.io',
  'g2.com',
  'capterra.com',
  'getapp.com',
  'sitejabber.com',
  'pissedconsumer.com',

  // Home services marketplaces
  'trustedchoice.com',
  'thumbtack.com',
  'angi.com',
  'angieslist.com',
  'homeadvisor.com',
  'houzz.com',
  'porch.com',
  'bark.com', // UK/US/AU home services marketplace
  'tasker.com',
  'taskrabbit.com',
  'checkatrade.com', // UK trades directory
  'trustatrader.com', // UK trades directory
  'rated-people.com', // UK home improvement
  'mybuilder.com', // UK trades directory
  'tradesman.ie', // Ireland
  // AU/NZ home services
  'hipages.com.au',
  'serviceseeking.com.au',
  'oneflare.com',
  'oneflare.com.au',
  'homestars.com', // Canadian home services

  // Real estate
  'zillow.com',
  'realtor.com',
  'redfin.com',
  'rightmove.co.uk', // UK property
  'zoopla.co.uk', // UK property
  'domain.com.au', // AU property
  'realestate.com.au', // AU property
  'trademe.co.nz', // NZ property/marketplace

  // Hospitality/dining
  'tripadvisor.com',
  'opentable.com',
  'zomato.com',
  'grubhub.com',
  'doordash.com',
  'ubereats.com',

  // Jobs/HR — use base-domain patterns (with dot) to catch all country TLDs
  // e.g. 'glassdoor.' matches glassdoor.com, glassdoor.co.in, glassdoor.co.uk, etc.
  'indeed.',
  'glassdoor.',
  'seek.com.au',
  'seek.co.nz',
  'simplyhired.',
  'jobstreet.',
  'naukri.com',
  'sulekha.com', // Indian service/job directory
  'monster.com',
  'monster.co.', // monster.co.uk, monster.co.in etc.
  'careerjet.',
  'jora.com',
  'neuvoo.com',
  'talent.com',
  'jobrapido.',
  'fixfinder.', // Service finder directories
  'numberdekho.com', // Car classifieds India

  // Healthcare
  'healthgrades.com',
  'zocdoc.com',
  'vitals.com',
  'ratemds.com',
  'webmd.com',

  // Legal
  'findlaw.com',
  'avvo.com',
  'lawyers.com',
  'justia.com',
  'martindale.com',
  'lawinfo.com',
  'hg.org',
  'superlawyers.com',
  'nolo.com',
  'legalmatch.com',

  // Classifieds
  'craigslist.com',
  'craigslist.org',
  'gumtree.com',
  'gumtree.com.au',
  'kijiji.ca',

  // News / media (not local businesses — produce incomplete LLM responses)
  'barrietoday.com',
  'newmarkettoday.ca',
  'allafrica.com',
  'thestar.com',
  'theglobeandmail.com',
  'cbc.ca',
  'abc.net.au',
  'smh.com.au',
  'ninemsn.com.au',

  // Free website builders / subdomains (not real business sites)
  'ueniweb.com',
  'wixsite.com',
  'weebly.com',
  'jimdo.com',
  'site123.me',
  'webnode.com',

  // International directories / platforms found during proofreading
  'all.biz',
  'wheree.com',
  'kompass.com',

  // Obituary / memorial aggregators (SERP noise from name-based keywords)
  'legacy.com',
  'dignitymemorial.com',
  'tributearchive.com',
  'echovita.com',

  // Government / municipal domains
  'lille.fr',
];

// Social media platforms
export const SOCIAL_MEDIA_DOMAINS = [
  'facebook.com',
  'instagram.com',
  'twitter.com',
  'x.com',
  'linkedin.com',
  'youtube.com',
  'tiktok.com',
  'pinterest.com',
  'snapchat.com',
  'reddit.com',
  'tumblr.com',
];

// Demo/test email domains to filter out
export const DEMO_EMAIL_DOMAINS = [
  'example.com',
  'example.org',
  'example.net',
  'test.com',
  'test.org',
  'test.net',
  'testing.com',
  'demo.com',
  'sample.com',
  'localhost',
  'invalid',
  'invalid.com',
  'tempmail.com',
  'throwaway.email',
  '10minutemail.com',
  'guerrillamail.com',
  'mailinator.com',
];

// Government TLD patterns by country
const GOVERNMENT_TLD_PATTERNS = [
  /\.gov$/i, // US federal (.gov)
  /\.gov\.[a-z]{2}$/i, // Country-specific (.gov.au, .gov.uk, .gov.in, etc.)
  /\.gc\.ca$/i, // Canada (Government of Canada)
  /\.govt\.nz$/i, // New Zealand
  /\.gob\.[a-z]{2}$/i, // Spanish-speaking countries (.gob.mx, .gob.es, etc.)
  /\.gouv\.[a-z]{2}$/i, // French-speaking countries (.gouv.fr, .gouv.be, etc.)
  /\.go\.[a-z]{2}$/i, // Japan, Korea (.go.jp, .go.kr)
  /\.gov\.br$/i, // Brazil
  /\.mil$/i, // US military
  /\.mil\.[a-z]{2}$/i, // Country military domains
];

// Education TLD patterns
const EDUCATION_TLD_PATTERNS = [
  /\.edu$/i, // US education (.edu)
  /\.edu\.[a-z]{2}$/i, // Country-specific (.edu.au, .edu.uk, etc.)
  /\.ac\.[a-z]{2}$/i, // Academic domains (.ac.uk, .ac.jp, .ac.nz, etc.)
];

// Non-commercial TLD patterns (charities, associations, non-profits)
const NON_COMMERCIAL_TLD_PATTERNS = [
  /\.org$/i, // Generic non-profit (.org)
  /\.org\.[a-z]{2}$/i, // Country-specific (.org.au, .org.nz, .org.uk, .org.za, etc.)
  /\.asn\.au$/i, // Australian associations
  /\.ngo$/i, // Non-governmental organisations
  /\.charity$/i, // UK charities
  /\.foundation$/i, // Foundations
];

/**
 * Test a lower-cased host name against one blocklist entry, matching only at
 * domain-label boundaries.
 *
 * - Plain entry ('x.com'): matches 'x.com' and 'sub.x.com' but NOT 'auditandfix.com'.
 * - Entry ending in a dot ('glassdoor.'): prefix pattern, matches 'glassdoor.com',
 *   'glassdoor.co.uk' and 'www.glassdoor.co.in' but NOT 'notglassdoor.com'.
 *
 * @param {string} domain - Lower-cased host name
 * @param {string} blocked - Blocklist entry
 * @returns {boolean} True if the host is covered by the entry
 */
function matchesBlockedDomain(domain, blocked) {
  if (blocked.endsWith('.')) {
    // The label must start the host or follow a dot, so partial-label hits are excluded.
    return domain.startsWith(blocked) || domain.includes(`.${blocked}`);
  }
  return domain === blocked || domain.endsWith(`.${blocked}`);
}

/**
 * Extract the lower-cased, trimmed domain part of an email address.
 * Uses the LAST '@' so a local part that itself contains '@' does not shift the result.
 *
 * @param {string} email - Email address
 * @returns {string|null} Domain part, or null if the input is not a string or has no domain
 */
function getEmailDomain(email) {
  if (!email || typeof email !== 'string') return null;

  const normalized = email.toLowerCase().trim();
  const atIndex = normalized.lastIndexOf('@');
  if (atIndex === -1) return null;

  return normalized.slice(atIndex + 1) || null;
}

/**
 * Load franchise domains for a specific country
 *
 * Reads `../../data/franchises/<code>.txt` relative to this module, one brand per
 * line ('#' lines are comments). Results, including "no list available", are cached
 * per lower-cased country code.
 *
 * @param {string} countryCode - Two-letter country code (e.g., 'us', 'au')
 * @returns {string[]} - Array of franchise brand names for domain matching
 */
export function loadFranchiseDomains(countryCode) {
  if (!countryCode || typeof countryCode !== 'string') return [];

  const lowerCountryCode = countryCode.toLowerCase();

  // Refuse anything that could escape the franchises directory when used in a path.
  if (!COUNTRY_CODE_PATTERN.test(lowerCountryCode)) return [];

  // Check cache first
  if (franchiseCache.has(lowerCountryCode)) {
    return franchiseCache.get(lowerCountryCode);
  }

  const franchiseFilePath = path.join(
    __dirname,
    '../../data/franchises',
    `${lowerCountryCode}.txt`
  );

  let fileContent;
  try {
    fileContent = fs.readFileSync(franchiseFilePath, 'utf-8');
  } catch (error) {
    // A missing file simply means there is no franchise list for this country;
    // anything else is unexpected and worth logging. Either way cache the empty
    // result so the file system is not hit again for this code.
    if (error.code !== 'ENOENT') {
      console.error(`Error loading franchise list for ${countryCode}:`, error.message);
    }
    franchiseCache.set(lowerCountryCode, []);
    return [];
  }

  const franchises = fileContent
    .split('\n')
    .map(line => line.trim())
    .filter(line => line && !line.startsWith('#')) // Remove empty lines and comments
    // Convert brand name to domain-friendly format for matching
    // "Mr. Rooter" -> "mrrooter", "Jim's Mowing" -> "jimsmowing"
    .map(brandName => brandName.toLowerCase().replace(/[^a-z0-9]/g, ''))
    .filter(brand => brand.length > 0); // Remove names that normalized to nothing

  franchiseCache.set(lowerCountryCode, franchises);
  return franchises;
}

/**
 * Check if domain matches any blocklist (directories, social media, franchises, or government/education)
 *
 * Checks run in this order and the first hit wins: social media, directories,
 * government, education, non-commercial, then (only when a country code is given)
 * franchises.
 *
 * @param {string} domain - Domain to check
 * @param {string} countryCode - Optional country code for franchise checking (e.g., 'us', 'au')
 * @returns {object|null} - {reason, blocklist} if matched, null otherwise
 */
export function checkBlocklist(domain, countryCode = null) {
  if (!domain || typeof domain !== 'string') return null;

  const lowerDomain = domain.toLowerCase().trim();
  if (!lowerDomain) return null;

  const isListed = list => list.some(blocked => matchesBlockedDomain(lowerDomain, blocked));

  // Check social media
  if (isListed(SOCIAL_MEDIA_DOMAINS)) {
    return {
      reason: 'Ignored: Social media platform',
      blocklist: 'social_media',
    };
  }

  // Check directories
  if (isListed(DIRECTORY_DOMAINS)) {
    return {
      reason: 'Ignored: Business directory',
      blocklist: 'directory',
    };
  }

  // Check government/education/non-commercial TLDs
  if (isGovernmentDomain(lowerDomain)) {
    return {
      reason: 'Ignored: Government domain',
      blocklist: 'government',
    };
  }

  if (isEducationDomain(lowerDomain)) {
    return {
      reason: 'Ignored: Education domain',
      blocklist: 'education',
    };
  }

  if (isNonCommercialDomain(lowerDomain)) {
    return {
      reason: 'Ignored: Non-commercial domain',
      blocklist: 'non-commercial',
    };
  }

  // Check franchises (country-specific)
  if (countryCode) {
    const franchiseDomains = loadFranchiseDomains(countryCode);
    // Remove special chars from domain for matching (www.mr-rooter.com -> wwwmrrootercom)
    // NOTE(review): this is a substring test against the whole compacted host,
    // so very short brand names could over-match — confirm against the data files.
    const cleanDomain = lowerDomain.replace(/[^a-z0-9]/g, '');

    if (franchiseDomains.some(franchiseDomain => cleanDomain.includes(franchiseDomain))) {
      return {
        reason: 'Ignored: Home service franchise',
        blocklist: 'franchise',
      };
    }
  }

  return null;
}

/**
 * Check if domain is a government domain
 * @param {string} domain - Domain to check
 * @returns {boolean} True if government domain
 */
export function isGovernmentDomain(domain) {
  if (!domain || typeof domain !== 'string') return false;

  const lower = domain.toLowerCase().trim();
  return GOVERNMENT_TLD_PATTERNS.some(pattern => pattern.test(lower));
}

/**
 * Check if domain is an education domain
 * @param {string} domain - Domain to check
 * @returns {boolean} True if education domain
 */
export function isEducationDomain(domain) {
  if (!domain || typeof domain !== 'string') return false;

  const lower = domain.toLowerCase().trim();
  return EDUCATION_TLD_PATTERNS.some(pattern => pattern.test(lower));
}

/**
 * Check if domain uses a non-commercial TLD (charities, associations, non-profits)
 * @param {string} domain - Domain to check
 * @returns {boolean} True if non-commercial domain
 */
export function isNonCommercialDomain(domain) {
  if (!domain || typeof domain !== 'string') return false;

  const lower = domain.toLowerCase().trim();
  return NON_COMMERCIAL_TLD_PATTERNS.some(pattern => pattern.test(lower));
}

/**
 * Check if email address uses a demo/test domain
 * @param {string} email - Email address to check
 * @returns {boolean} True if demo email
 */
export function isDemoEmail(email) {
  const domain = getEmailDomain(email);
  if (!domain) return false;

  return DEMO_EMAIL_DOMAINS.some(blocked => matchesBlockedDomain(domain, blocked));
}

/**
 * Check if email address is from a government domain
 * @param {string} email - Email address to check
 * @returns {boolean} True if government email
 */
export function isGovernmentEmail(email) {
  const domain = getEmailDomain(email);
  if (!domain) return false;

  return isGovernmentDomain(domain);
}

/**
 * Check if email address is from an education domain
 * @param {string} email - Email address to check
 * @returns {boolean} True if education email
 */
export function isEducationEmail(email) {
  const domain = getEmailDomain(email);
  if (!domain) return false;

  return isEducationDomain(domain);
}

// Industry classification keywords by type
const INDUSTRY_KEYWORDS = {
  legal: ['solicitor', 'barrister', 'attorney', 'lawfirm', 'lawyer', 'legal', 'counsel', 'esq'],
  healthcare: [
    'dental',
    'dentist',
    'medical',
    'clinic',
    'hospital',
    'pharmacy',
    'physio',
    'veterinary',
  ],
  financial: ['financialadviser', 'mortgagebroker', 'accounting', 'bookkeeper'],
};

/**
 * Classify industry from domain name keywords
 *
 * Industry types are tried in declaration order (legal, healthcare, financial) and
 * the first keyword found anywhere in the lower-cased domain wins.
 *
 * @param {string} domain - Domain to check (e.g., 'smith-solicitors.com.au')
 * @returns {object|null} - {type, reason} if matched, null otherwise
 */
export function classifyIndustry(domain) {
  if (!domain || typeof domain !== 'string') return null;

  // Only lower-casing is applied: hyphens, dots and TLD labels are kept, so a
  // compound keyword such as 'lawfirm' matches 'smithlawfirm.com' but not
  // 'smith-law-firm.com'.
  // NOTE(review): stripping separators would catch those, but also creates
  // cross-label false positives (e.g. 'circle-gallery' contains 'legal') — confirm
  // the desired trade-off before changing.
  const lowerDomain = domain.toLowerCase();

  for (const [type, keywords] of Object.entries(INDUSTRY_KEYWORDS)) {
    const keyword = keywords.find(candidate => lowerDomain.includes(candidate));
    if (keyword !== undefined) {
      return { type, reason: `domain keyword: ${keyword}` };
    }
  }

  return null;
}