// detect-language.js
/**
 * Reduce a language tag to its lower-cased primary subtag.
 *
 * Splits on both '-' (BCP 47 standard) and '_' (non-standard but common in HTML
 * attributes and server configs), so 'en-US', 'en_US' and ' EN ' all yield 'en'.
 *
 * @param {unknown} tag - Raw tag; anything that is not a string is rejected.
 * @returns {string|null} Primary subtag, or null when the tag is missing, empty,
 *   or 'zxx' (ISO 639-2 "no linguistic content", treated as unknown).
 */
function toPrimarySubtag(tag) {
  if (typeof tag !== 'string') return null;
  const [primary] = tag.trim().split(/[-_]/);
  const subtag = primary.trim().toLowerCase();
  if (subtag === '' || subtag === 'zxx') return null;
  return subtag;
}

/**
 * Parse a JSON string that is expected to hold an object.
 *
 * Detection is best-effort: a missing value, malformed JSON, or a JSON value
 * that is not an object all yield an empty object so callers fall through to
 * the next signal instead of failing.
 *
 * @param {string|null} json - JSON text, or null/empty for "no data".
 * @returns {object} The parsed object (or array), or {} when unusable.
 */
function parseJsonObject(json) {
  try {
    const value = JSON.parse(json || '{}');
    return value !== null && typeof value === 'object' ? value : {};
  } catch {
    // Malformed JSON is an expected input here; treat it as "no data".
    return {};
  }
}

// Exact header spellings the lookup has always honoured, in priority order.
const CONTENT_LANGUAGE_KEYS = ['content-language', 'Content-Language', 'content_language'];

/**
 * Extract the first usable language from a Content-Language header.
 *
 * The header name is matched case-insensitively, with '_' accepted in place
 * of '-'. The value may list several tags ("de, en"); the first entry that
 * yields a usable primary subtag wins.
 *
 * @param {object} headers - Parsed response headers.
 * @returns {string|null} Primary language subtag, or null when the header is
 *   absent or contains no usable tag.
 */
function readContentLanguage(headers) {
  const rank = (key) => {
    const index = CONTENT_LANGUAGE_KEYS.indexOf(key);
    return index === -1 ? CONTENT_LANGUAGE_KEYS.length : index;
  };
  // Known spellings keep their historical priority; other casings follow.
  const matchingKeys = Object.keys(headers)
    .filter((key) => key.toLowerCase().replace(/_/g, '-') === 'content-language')
    .sort((a, b) => rank(a) - rank(b));

  for (const key of matchingKeys) {
    const value = headers[key];
    if (typeof value !== 'string') continue;
    for (const tag of value.split(',')) {
      const lang = toPrimarySubtag(tag);
      if (lang) return lang;
    }
  }
  return null;
}

/**
 * Language detection utility for pipeline sites.
 *
 * Priority order (most → least reliable):
 * 1. Content-Language HTTP response header (server-declared)
 * 2. hreflang tags that confirm the country's primary language
 * 3. hreflang tags that explicitly confirm English
 * 4. htmlLang attribute — trusted ONLY when hreflangs or country confirms it
 *    (htmlLang='en' with no hreflangs in a non-English country → template default, ignored)
 * 5. Country config fallback (primary language for that country)
 *
 * Malformed JSON in either input is ignored, and a Content-Language header that
 * yields no usable subtag falls through to the page-level signals.
 *
 * @param {string} countryCode - ISO country code (e.g. 'DE', 'JP')
 * @param {string|null} localeData - JSON string from assets stage: {htmlLang, hreflangs:[]}
 * @param {string|null} httpHeaders - JSON string of response headers
 * @param {Function} getCountryByCode - Injected to avoid circular deps
 * @returns {string|null} BCP 47 primary language subtag (e.g. 'en', 'de', 'ja') or null
 */
export function deriveLanguageCode(countryCode, localeData, httpHeaders, getCountryByCode) {
  const country = countryCode ? getCountryByCode(countryCode) : null;
  const countryLang = country?.language || null;

  // Page-level locale data; each field is validated on its own so one bad
  // field cannot discard the other.
  const locale = parseJsonObject(localeData);
  const htmlLang = toPrimarySubtag(locale.htmlLang);
  const hreflangs = Array.isArray(locale.hreflangs) ? locale.hreflangs : [];

  // 1. Content-Language header — most authoritative when it names a language
  const headerLang = readContentLanguage(parseJsonObject(httpHeaders));
  if (headerLang) {
    return headerLang;
  }

  // Unique language subtags from hreflangs; x-default and non-string values
  // carry no language information and are skipped.
  const hreflangLangs = new Set(
    hreflangs
      .map((entry) => entry?.hreflang)
      .filter((tag) => typeof tag === 'string' && tag.trim().toLowerCase() !== 'x-default')
      .map((tag) => toPrimarySubtag(tag))
      .filter(Boolean),
  );

  // 2. hreflangs that match the country's primary language — strongest page-level signal
  if (countryLang && hreflangLangs.has(countryLang)) {
    return countryLang;
  }

  // 3. hreflangs explicitly confirm English (e.g. Japanese company with English site)
  if (htmlLang === 'en' && hreflangLangs.has('en')) {
    return 'en';
  }

  // 4a. htmlLang is non-English — generally trust it (German sites declaring 'de', etc.)
  if (htmlLang && htmlLang !== 'en') {
    return htmlLang;
  }

  // 4b. htmlLang='en', hreflangs are empty, country primary language is non-English
  //     → very likely a template default (WordPress, Wix, etc.) — don't trust it
  if (htmlLang === 'en' && hreflangs.length === 0 && countryLang && countryLang !== 'en') {
    return countryLang;
  }

  // 4c. htmlLang='en' with hreflangs present but no language confirmed above
  //     → use htmlLang (could be genuinely English-language foreign site)
  if (htmlLang) {
    return htmlLang;
  }

  // 5. Country fallback
  return countryLang || null;
}