/ src / utils / detect-language.js
detect-language.js
 1  /**
 2   * Language detection utility for pipeline sites.
 3   *
 4   * Priority order (most → least reliable):
 5   *  1. Content-Language HTTP response header (server-declared)
 6   *  2. hreflang tags that confirm the country's primary language
 7   *  3. hreflang tags that explicitly confirm English
 8   *  4. htmlLang attribute — trusted ONLY when hreflangs or country confirms it
 9   *     (htmlLang='en' with no hreflangs in a non-English country → template default, ignored)
10   *  5. Country config fallback (primary language for that country)
11   *
12   * @param {string} countryCode - ISO country code (e.g. 'DE', 'JP')
13   * @param {string|null} localeData - JSON string from assets stage: {htmlLang, hreflangs:[]}
14   * @param {string|null} httpHeaders - JSON string of response headers
15   * @param {Function} getCountryByCode - Injected to avoid circular deps
16   * @returns {string|null} BCP 47 primary language subtag (e.g. 'en', 'de', 'ja') or null
17   */
18  export function deriveLanguageCode(countryCode, localeData, httpHeaders, getCountryByCode) {
19    const country = countryCode ? getCountryByCode(countryCode) : null;
20    const countryLang = country?.language || null;
21  
22    // Parse locale data from the page
23    let htmlLang = null;
24    let hreflangs = [];
25    try {
26      const parsed = JSON.parse(localeData || '{}');
27      if (parsed.htmlLang) {
28        // Split on both '-' (BCP 47 standard) and '_' (non-standard but common in HTML)
29        htmlLang = parsed.htmlLang.split(/[-_]/)[0].toLowerCase();
30        // 'zxx' = ISO 639-2 "no linguistic content" — treat as unknown, fall through to country
31        if (htmlLang === 'zxx') htmlLang = null;
32      }
33      hreflangs = Array.isArray(parsed.hreflangs) ? parsed.hreflangs : [];
34    } catch {
35      // malformed JSON — fall through
36    }
37  
38    // 1. Content-Language header — most authoritative when present
39    try {
40      const headers = JSON.parse(httpHeaders || '{}');
41      const contentLang =
42        headers['content-language'] || headers['Content-Language'] || headers['content_language'];
43      if (contentLang && typeof contentLang === 'string') {
44        return contentLang.split(',')[0].split('-')[0].toLowerCase().trim() || null;
45      }
46    } catch {
47      // malformed headers JSON — fall through
48    }
49  
50    // Extract unique language subtags from hreflangs (exclude x-default, region-only variants)
51    const hreflangLangs = hreflangs
52      .map(h => h?.hreflang)
53      .filter(h => h && h !== 'x-default')
54      .map(h => h.split('-')[0].toLowerCase());
55  
56    // 2. hreflangs that match the country's primary language — strongest page-level signal
57    if (countryLang && hreflangLangs.includes(countryLang)) {
58      return countryLang;
59    }
60  
61    // 3. hreflangs explicitly confirm English (e.g. Japanese company with English site)
62    if (hreflangLangs.includes('en') && htmlLang === 'en') {
63      return 'en';
64    }
65  
66    // 4a. htmlLang is non-English — generally trust it (German sites declaring 'de', etc.)
67    if (htmlLang && htmlLang !== 'en') {
68      return htmlLang;
69    }
70  
71    // 4b. htmlLang='en', hreflangs are empty, country primary language is non-English
72    //     → very likely a template default (WordPress, Wix, etc.) — don't trust it
73    if (htmlLang === 'en' && hreflangs.length === 0 && countryLang && countryLang !== 'en') {
74      return countryLang;
75    }
76  
77    // 4c. htmlLang='en' with hreflangs present but no language confirmed above
78    //     → use htmlLang (could be genuinely English-language foreign site)
79    if (htmlLang) {
80      return htmlLang;
81    }
82  
83    // 5. Country fallback
84    return countryLang || null;
85  }