gdpr-verification.js
/**
 * GDPR Verification Utilities
 *
 * Verifies that email addresses belong to registered companies (not individuals)
 * before sending outreach in EU/UK countries, as required by GDPR.
 *
 * Verification steps, in the order applied by verifyCompanyEmail:
 * 1. Free email provider check (gmail.com, outlook.com, etc.)
 * 2. Individual subscriber search in HTML (sole trader, partnership, etc.)
 * 3. Company type search in HTML (GmbH, Ltd, AG, S.r.l., etc.)
 * 4. Company keyword search (registration numbers, VAT IDs, etc.)
 *
 * getKeyPageNames exposes the per-country key page names (Impressum, About Us,
 * Contact pages). This module only inspects the HTML string it is given; it
 * does not fetch any page itself.
 */

import { getCountryByCode } from '../config/countries.js';

/**
 * Free email provider domains (international plus EU/UK country-specific).
 * These indicate individual/personal emails, not company emails.
 * Stored as a Set so membership checks are constant-time and duplicates
 * cannot creep in. All entries must be lowercase.
 */
const FREE_EMAIL_PROVIDERS = new Set([
  // International
  'gmail.com',
  'yahoo.com',
  'outlook.com',
  'hotmail.com',
  'aol.com',
  'icloud.com',
  'mail.com',
  'protonmail.com',
  'zoho.com',
  'yandex.com',

  // Germany (GMX family, including its .at/.ch domains)
  'gmx.de',
  'gmx.net',
  'web.de',
  't-online.de',
  'freenet.de',
  '1und1.de',
  'arcor.de',
  'online.de',
  'gmx.at',
  'gmx.ch',

  // France
  'laposte.net',
  'orange.fr',
  'free.fr',
  'wanadoo.fr',
  'sfr.fr',
  'neuf.fr',
  'bbox.fr',
  'club-internet.fr',
  'voila.fr',
  'yahoo.fr',

  // UK
  'btinternet.com',
  'virginmedia.com',
  'sky.com',
  'talktalk.net',
  'ntlworld.com',
  'blueyonder.co.uk',
  'tiscali.co.uk',
  'yahoo.co.uk',

  // Italy
  'libero.it',
  'virgilio.it',
  'tiscali.it',
  'alice.it',
  'tin.it',
  'email.it',
  'fastwebnet.it',
  'inwind.it',
  'yahoo.it',

  // Spain
  'telefonica.net',
  'terra.es',
  'ono.com',
  'ya.com',
  'hotmail.es',
  'yahoo.es',
  'gmail.es',

  // Netherlands
  'ziggo.nl',
  'hetnet.nl',
  'planet.nl',
  'home.nl',
  'xs4all.nl',
  'telfort.nl',
  'chello.nl',
  'yahoo.nl',

  // Belgium
  'skynet.be',
  'telenet.be',
  'pandora.be',
  'yahoo.be',
  'hotmail.be',

  // Austria (gmx.at is listed with the GMX family above)
  'aon.at',
  'chello.at',
  'yahoo.at',

  // Sweden
  'telia.com',
  'bredband.net',
  'spray.se',
  'yahoo.se',

  // Denmark
  'jubii.dk',
  'mail.dk',
  'yahoo.dk',

  // Norway
  'online.no',
  'start.no',
  'yahoo.no',

  // Poland
  'wp.pl',
  'onet.pl',
  'interia.pl',
  'o2.pl',
  'tlen.pl',
  'gazeta.pl',
  'poczta.fm',
  'yahoo.pl',

  // Ireland
  'eircom.net',
  'yahoo.ie',
  'hotmail.ie',
]);

/**
 * Check if email domain is a free email provider
 * @param {string} email - Email address to check
 * @returns {boolean} True if free email provider; false for non-strings,
 *   addresses without "@", or an empty domain part
 */
export function isFreeEmailProvider(email) {
  if (typeof email !== 'string') {
    return false;
  }

  // Trim first: scraped addresses often carry stray whitespace, and a
  // trailing space would otherwise make "gmail.com " miss the lookup.
  const normalized = email.trim().toLowerCase();

  // The domain is whatever follows the LAST "@"; a quoted local part may
  // itself contain "@", so taking the second split segment is not reliable.
  const atIndex = normalized.lastIndexOf('@');
  if (atIndex === -1) {
    return false;
  }

  return FREE_EMAIL_PROVIDERS.has(normalized.slice(atIndex + 1));
}

/**
 * Escape every regex metacharacter in a string so it matches literally.
 * @param {string} text - Literal text taken from the country config
 * @returns {string} Text safe to embed in a RegExp source
 */
function escapeRegExp(text) {
  return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/**
 * Build a case-insensitive regex matching a term only when it is delimited by
 * start/end of input, whitespace, or common punctuation/markup characters.
 * Whitespace inside the term matches any run of whitespace or hyphens,
 * e.g. "sole-trader", "sole trader", "Sole Trader" all match "Sole trader".
 * @param {string} term - Company type or individual indicator
 * @returns {RegExp} Regex to test against HTML
 */
function buildDelimitedTermRegex(term) {
  const flexTerm = escapeRegExp(term).replace(/\s+/g, '[\\s\\-]+');
  /* eslint-disable-next-line security/detect-non-literal-regexp -- term is from trusted config, not user input */
  return new RegExp(`(?:^|[\\s,;.(>"'])${flexTerm}(?:[\\s,;.)<"']|$)`, 'i');
}

/**
 * Build a case-insensitive, unanchored regex for a registration keyword.
 * Whitespace inside the keyword matches any run of whitespace or ". , : ;",
 * e.g. "Company No" also matches "Company. No" and "Company:  No". Because
 * the pattern is unanchored, any text may precede or follow the keyword.
 * @param {string} keyword - Registration keyword
 * @returns {RegExp} Regex to test against HTML
 */
function buildKeywordRegex(keyword) {
  const flexPattern = escapeRegExp(keyword).replace(/\s+/g, '[\\s.,:;]+');
  /* eslint-disable-next-line security/detect-non-literal-regexp -- keyword is from trusted config */
  return new RegExp(flexPattern, 'i');
}

/**
 * Collect the configured terms whose regex matches the HTML.
 * @param {string} html - HTML content to search
 * @param {Iterable<string>|undefined|null} terms - Terms from the country config
 * @param {function(string): RegExp} buildRegex - Regex factory for one term
 * @returns {Object} { found: boolean, matches: string[] } (a fresh object per call)
 */
function collectMatchingTerms(html, terms, buildRegex) {
  if (!html || !terms) {
    return { found: false, matches: [] };
  }

  const matches = [];
  for (const term of terms) {
    if (buildRegex(term).test(html)) {
      matches.push(term);
    }
  }

  return {
    found: matches.length > 0,
    matches,
  };
}

/**
 * Search HTML for company type indicators (GmbH, Ltd, AG, etc.)
 * @param {string} html - HTML content to search
 * @param {Object} country - Country config from countries.js
 * @returns {Object} { found: boolean, matches: string[] }; not found when
 *   html is empty, country is missing, or country has no companyTypes
 */
export function searchCompanyTypes(html, country) {
  return collectMatchingTerms(html, country && country.companyTypes, buildDelimitedTermRegex);
}

/**
 * Search HTML for individual subscriber indicators (sole trader, partnership).
 * Under UK PECR / EU ePrivacy, these are NOT corporate subscribers and
 * cannot receive unsolicited marketing without consent.
 * @param {string} html - HTML content to search
 * @param {Object} country - Country config from countries.js
 * @returns {Object} { found: boolean, matches: string[] }; not found when
 *   html is empty, country is missing, or country has no individualIndicators
 */
export function searchIndividualIndicators(html, country) {
  return collectMatchingTerms(
    html,
    country && country.individualIndicators,
    buildDelimitedTermRegex
  );
}

/**
 * Search HTML for company registration keywords
 * (Handelsregister, VAT numbers, registration numbers, etc.)
 * @param {string} html - HTML content to search
 * @param {Object} country - Country config from countries.js
 * @returns {Object} { found: boolean, matches: string[] }; not found when
 *   html is empty, country is missing, or country has no companyKeywords
 */
export function searchCompanyKeywords(html, country) {
  return collectMatchingTerms(html, country && country.companyKeywords, buildKeywordRegex);
}

/**
 * Get key page names for a country
 * @param {string} countryCode - ISO country code
 * @returns {string[]} Array of key page names ([] when the country config has none)
 */
export function getKeyPageNames(countryCode) {
  // NOTE(review): assumes getCountryByCode always returns a config object;
  // its behavior for unknown codes is defined in countries.js — confirm.
  const country = getCountryByCode(countryCode);
  return country.keyPageNames || [];
}

/**
 * Verify company email for GDPR compliance
 *
 * Multi-step verification process (first matching step wins):
 * 1. Country does not require a GDPR check -> PASS, confidence 'n/a'
 * 2. Email is from a free provider -> FAIL, confidence 'high'
 * 3. Individual subscriber indicator in HTML -> FAIL, confidence 'high'
 * 4. Company type in HTML -> PASS, confidence 'high'
 * 5. Company registration keyword in HTML -> PASS, confidence 'medium'
 * 6. Nothing found -> FAIL, confidence 'uncertain'
 *
 * @param {Object} params - Verification parameters
 * @param {string} params.email - Email address to verify
 * @param {string} params.html - HTML content to search
 * @param {string} params.countryCode - ISO country code
 * @param {string} [params.domain] - Site domain (accepted for caller
 *   compatibility; not used by the current checks)
 * @returns {Object} Verification result
 * @returns {boolean} result.isVerified - True if verified as company email
 * @returns {string} result.reason - Verification reason
 * @returns {string} result.confidence - One of 'high', 'medium', 'uncertain', 'n/a'
 * @returns {string[]} result.matches - Matching indicators, company types or keywords
 */
export function verifyCompanyEmail({ email, html, countryCode }) {
  // NOTE(review): assumes getCountryByCode always returns a config object;
  // its behavior for unknown codes is defined in countries.js — confirm.
  const country = getCountryByCode(countryCode);

  // Step 1: Check if GDPR verification required
  if (!country.requiresGDPRCheck) {
    return {
      isVerified: true,
      reason: 'GDPR verification not required for this country',
      confidence: 'n/a',
      matches: [],
    };
  }

  // Step 2: Check for free email provider (FAIL)
  if (isFreeEmailProvider(email)) {
    return {
      isVerified: false,
      reason: 'Free email provider (likely individual, not company)',
      confidence: 'high',
      matches: [],
    };
  }

  // Step 3: Check for individual indicators (sole trader, partnership) — FAIL immediately.
  // Under UK PECR / EU ePrivacy, sole traders and non-LLP partnerships are
  // individual subscribers — unsolicited marketing requires consent.
  // This runs before the company-type check so an individual indicator
  // always overrides any company evidence on the same page.
  const individualResult = searchIndividualIndicators(html, country);
  if (individualResult.found) {
    return {
      isVerified: false,
      reason: `Individual subscriber (not corporate): ${individualResult.matches.join(', ')}`,
      confidence: 'high',
      matches: individualResult.matches,
    };
  }

  // Step 4: Search for company type (PASS with high confidence)
  const companyTypeResult = searchCompanyTypes(html, country);
  if (companyTypeResult.found) {
    return {
      isVerified: true,
      reason: `Company type found: ${companyTypeResult.matches.join(', ')}`,
      confidence: 'high',
      matches: companyTypeResult.matches,
    };
  }

  // Step 5: Search for company keywords (PASS with medium confidence)
  const keywordResult = searchCompanyKeywords(html, country);
  if (keywordResult.found) {
    return {
      isVerified: true,
      reason: `Company registration keywords found: ${keywordResult.matches.join(', ')}`,
      confidence: 'medium',
      matches: keywordResult.matches,
    };
  }

  // Step 6: Unable to verify (UNCERTAIN)
  return {
    isVerified: false,
    reason: 'Unable to verify company registration (no company type or keywords found)',
    confidence: 'uncertain',
    matches: [],
  };
}

/**
 * Batch verify multiple emails for a site
 * @param {Object} params - Batch verification parameters
 * @param {string[]} params.emails - Array of email addresses
 * @param {string} params.html - HTML content to search
 * @param {string} params.countryCode - ISO country code
 * @param {string} params.domain - Site domain
 * @returns {Object[]} Array of verification results (one per email, same
 *   order); [] when emails is not an array
 */
export function batchVerifyEmails({ emails, html, countryCode, domain }) {
  if (!Array.isArray(emails)) {
    return [];
  }

  return emails.map((email) => verifyCompanyEmail({ email, html, countryCode, domain }));
}