/ src / utils / gdpr-verification.js
gdpr-verification.js
  1  /**
  2   * GDPR Verification Utilities
  3   *
  4   * Verifies that email addresses belong to registered companies (not individuals)
  5   * before sending outreach in EU/UK countries, as required by GDPR.
  6   *
 * Verification steps:
 * 1. Free email provider check (gmail.com, outlook.com, etc.)
 * 2. Individual subscriber indicator search (sole trader, partnership, etc.)
 * 3. Company type search in HTML (GmbH, Ltd, AG, S.r.l., etc.)
 * 4. Company keyword search (registration numbers, VAT IDs, etc.)
 * 5. Key page verification (Impressum, About Us, Contact pages)
 12   */
 13  
 14  import { getCountryByCode } from '../config/countries.js';
 15  
 16  /**
 17   * Free email providers list (75+ domains across EU/UK)
 18   * These indicate individual/personal emails, not company emails
 19   */
 20  const FREE_EMAIL_PROVIDERS = [
 21    // International
 22    'gmail.com',
 23    'yahoo.com',
 24    'outlook.com',
 25    'hotmail.com',
 26    'aol.com',
 27    'icloud.com',
 28    'mail.com',
 29    'protonmail.com',
 30    'zoho.com',
 31    'yandex.com',
 32  
 33    // Germany
 34    'gmx.de',
 35    'gmx.net',
 36    'web.de',
 37    't-online.de',
 38    'freenet.de',
 39    '1und1.de',
 40    'arcor.de',
 41    'online.de',
 42    'gmx.at',
 43    'gmx.ch',
 44  
 45    // France
 46    'laposte.net',
 47    'orange.fr',
 48    'free.fr',
 49    'wanadoo.fr',
 50    'sfr.fr',
 51    'neuf.fr',
 52    'bbox.fr',
 53    'club-internet.fr',
 54    'voila.fr',
 55    'yahoo.fr',
 56  
 57    // UK
 58    'btinternet.com',
 59    'virginmedia.com',
 60    'sky.com',
 61    'talktalk.net',
 62    'ntlworld.com',
 63    'blueyonder.co.uk',
 64    'tiscali.co.uk',
 65    'yahoo.co.uk',
 66  
 67    // Italy
 68    'libero.it',
 69    'virgilio.it',
 70    'tiscali.it',
 71    'alice.it',
 72    'tin.it',
 73    'email.it',
 74    'fastwebnet.it',
 75    'inwind.it',
 76    'yahoo.it',
 77  
 78    // Spain
 79    'telefonica.net',
 80    'terra.es',
 81    'ono.com',
 82    'ya.com',
 83    'hotmail.es',
 84    'yahoo.es',
 85    'gmail.es',
 86  
 87    // Netherlands
 88    'ziggo.nl',
 89    'hetnet.nl',
 90    'planet.nl',
 91    'home.nl',
 92    'xs4all.nl',
 93    'telfort.nl',
 94    'chello.nl',
 95    'yahoo.nl',
 96  
 97    // Belgium
 98    'skynet.be',
 99    'telenet.be',
100    'pandora.be',
101    'yahoo.be',
102    'hotmail.be',
103  
104    // Austria
105    'aon.at',
106    'chello.at',
107    'gmx.at',
108    'yahoo.at',
109  
110    // Sweden
111    'telia.com',
112    'bredband.net',
113    'spray.se',
114    'yahoo.se',
115  
116    // Denmark
117    'jubii.dk',
118    'mail.dk',
119    'yahoo.dk',
120  
121    // Norway
122    'online.no',
123    'start.no',
124    'yahoo.no',
125  
126    // Poland
127    'wp.pl',
128    'onet.pl',
129    'interia.pl',
130    'o2.pl',
131    'tlen.pl',
132    'gazeta.pl',
133    'poczta.fm',
134    'yahoo.pl',
135  
136    // Ireland
137    'eircom.net',
138    'yahoo.ie',
139    'hotmail.ie',
140  ];
141  
142  /**
143   * Check if email domain is a free email provider
144   * @param {string} email - Email address to check
145   * @returns {boolean} True if free email provider
146   */
147  export function isFreeEmailProvider(email) {
148    if (!email || typeof email !== 'string') {
149      return false;
150    }
151  
152    const domain = email.toLowerCase().split('@')[1];
153    if (!domain) {
154      return false;
155    }
156  
157    return FREE_EMAIL_PROVIDERS.includes(domain);
158  }
159  
160  /**
161   * Search HTML for company type indicators (GmbH, Ltd, AG, etc.)
162   * @param {string} html - HTML content to search
163   * @param {Object} country - Country config from countries.js
164   * @returns {Object} { found: boolean, matches: string[] }
165   */
166  export function searchCompanyTypes(html, country) {
167    if (!html || !country.companyTypes) {
168      return { found: false, matches: [] };
169    }
170  
171    const matches = [];
172  
173    for (const companyType of country.companyTypes) {
174      // Allow word boundaries OR hyphens/whitespace around the type
175      // e.g., "sole-trader", "sole trader", "Sole Trader" all match "Sole trader"
176      const escapedType = companyType.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
177      // Replace spaces with flexible whitespace/hyphen pattern
178      const flexType = escapedType.replace(/\s+/g, '[\\s\\-]+');
179      /* eslint-disable-next-line security/detect-non-literal-regexp -- companyType is from trusted config, not user input */
180      const regex = new RegExp(`(?:^|[\\s,;.(>"'])${flexType}(?:[\\s,;.)<"']|$)`, 'i');
181  
182      if (regex.test(html)) {
183        matches.push(companyType);
184      }
185    }
186  
187    return {
188      found: matches.length > 0,
189      matches,
190    };
191  }
192  
193  /**
194   * Search HTML for individual subscriber indicators (sole trader, partnership).
195   * Under UK PECR / EU ePrivacy, these are NOT corporate subscribers and
196   * cannot receive unsolicited marketing without consent.
197   * @param {string} html - HTML content to search
198   * @param {Object} country - Country config from countries.js
199   * @returns {Object} { found: boolean, matches: string[] }
200   */
201  export function searchIndividualIndicators(html, country) {
202    if (!html || !country.individualIndicators) {
203      return { found: false, matches: [] };
204    }
205  
206    const matches = [];
207  
208    for (const indicator of country.individualIndicators) {
209      const escaped = indicator.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
210      const flexIndicator = escaped.replace(/\s+/g, '[\\s\\-]+');
211      /* eslint-disable-next-line security/detect-non-literal-regexp -- indicator is from trusted config */
212      const regex = new RegExp(`(?:^|[\\s,;.(>"'])${flexIndicator}(?:[\\s,;.)<"']|$)`, 'i');
213  
214      if (regex.test(html)) {
215        matches.push(indicator);
216      }
217    }
218  
219    return {
220      found: matches.length > 0,
221      matches,
222    };
223  }
224  
225  /**
226   * Search HTML for company registration keywords
227   * (Handelsregister, VAT numbers, registration numbers, etc.)
228   * @param {string} html - HTML content to search
229   * @param {Object} country - Country config from countries.js
230   * @returns {Object} { found: boolean, matches: string[] }
231   */
232  export function searchCompanyKeywords(html, country) {
233    if (!html || !country.companyKeywords) {
234      return { found: false, matches: [] };
235    }
236  
237    const matches = [];
238  
239    for (const keyword of country.companyKeywords) {
240      // Build flexible regex: collapse whitespace/punctuation variations
241      // "Company No" matches "Company No.", "Company  No", "Company No:"
242      const escaped = keyword.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
243      const flexPattern = escaped.replace(/\s+/g, '[\\s.,:;]+');
244      /* eslint-disable-next-line security/detect-non-literal-regexp -- keyword is from trusted config */
245      const regex = new RegExp(flexPattern, 'i');
246      if (regex.test(html)) {
247        matches.push(keyword);
248      }
249    }
250  
251    return {
252      found: matches.length > 0,
253      matches,
254    };
255  }
256  
257  /**
258   * Get key page names for a country
259   * @param {string} countryCode - ISO country code
260   * @returns {string[]} Array of key page names
261   */
262  export function getKeyPageNames(countryCode) {
263    const country = getCountryByCode(countryCode);
264    return country.keyPageNames || [];
265  }
266  
267  /**
268   * Verify company email for GDPR compliance
269   *
270   * Multi-step verification process:
271   * 1. Check if GDPR verification required for country
272   * 2. Check if email is from free provider (FAIL if yes)
273   * 3. Search for company type in HTML (PASS with high confidence)
274   * 4. Search for company keywords (PASS with medium confidence)
275   * 5. Return uncertain if unable to verify
276   *
277   * @param {Object} params - Verification parameters
278   * @param {string} params.email - Email address to verify
279   * @param {string} params.html - HTML content to search
280   * @param {string} params.countryCode - ISO country code
281   * @param {string} params.domain - Site domain
282   * @returns {Object} Verification result
283   * @returns {boolean} result.isVerified - True if verified as company email
284   * @returns {string} result.reason - Verification reason
285   * @returns {string} result.confidence - Confidence level (high/medium/low/uncertain/n/a)
286   * @returns {string[]} result.matches - Matching company types or keywords
287   */
288  export function verifyCompanyEmail({ email, html, countryCode, domain: _domain }) {
289    const country = getCountryByCode(countryCode);
290  
291    // Step 1: Check if GDPR verification required
292    if (!country.requiresGDPRCheck) {
293      return {
294        isVerified: true,
295        reason: 'GDPR verification not required for this country',
296        confidence: 'n/a',
297        matches: [],
298      };
299    }
300  
301    // Step 2: Check for free email provider (FAIL)
302    if (isFreeEmailProvider(email)) {
303      return {
304        isVerified: false,
305        reason: 'Free email provider (likely individual, not company)',
306        confidence: 'high',
307        matches: [],
308      };
309    }
310  
311    // Step 3: Check for individual indicators (sole trader, partnership) — FAIL immediately.
312    // Under UK PECR / EU ePrivacy, sole traders and non-LLP partnerships are
313    // individual subscribers — unsolicited marketing requires consent.
314    const individualResult = searchIndividualIndicators(html, country);
315    if (individualResult.found) {
316      return {
317        isVerified: false,
318        reason: `Individual subscriber (not corporate): ${individualResult.matches.join(', ')}`,
319        confidence: 'high',
320        matches: individualResult.matches,
321      };
322    }
323  
324    // Step 4: Search for company type (PASS with high confidence)
325    const companyTypeResult = searchCompanyTypes(html, country);
326    if (companyTypeResult.found) {
327      return {
328        isVerified: true,
329        reason: `Company type found: ${companyTypeResult.matches.join(', ')}`,
330        confidence: 'high',
331        matches: companyTypeResult.matches,
332      };
333    }
334  
335    // Step 5: Search for company keywords (PASS with medium confidence)
336    const keywordResult = searchCompanyKeywords(html, country);
337    if (keywordResult.found) {
338      return {
339        isVerified: true,
340        reason: `Company registration keywords found: ${keywordResult.matches.join(', ')}`,
341        confidence: 'medium',
342        matches: keywordResult.matches,
343      };
344    }
345  
346    // Step 6: Unable to verify (UNCERTAIN)
347    return {
348      isVerified: false,
349      reason: 'Unable to verify company registration (no company type or keywords found)',
350      confidence: 'uncertain',
351      matches: [],
352    };
353  }
354  
355  /**
356   * Batch verify multiple emails for a site
357   * @param {Object} params - Batch verification parameters
358   * @param {string[]} params.emails - Array of email addresses
359   * @param {string} params.html - HTML content to search
360   * @param {string} params.countryCode - ISO country code
361   * @param {string} params.domain - Site domain
362   * @returns {Object[]} Array of verification results (one per email)
363   */
364  export function batchVerifyEmails({ emails, html, countryCode, domain }) {
365    if (!emails || !Array.isArray(emails)) {
366      return [];
367    }
368  
369    return emails.map(email => verifyCompanyEmail({ email, html, countryCode, domain }));
370  }