/ scripts / backfill-language-code.js
backfill-language-code.js
  1  #!/usr/bin/env node
  2  /**
  3   * Backfill language_code from locale_data
  4   *
  5   * One-off script to populate language_code for all existing sites.
  6   * Parses locale_data.htmlLang and falls back to the country's primary language.
  7   *
  8   * Usage:
  9   *   node scripts/backfill-language-code.js [--dry-run]
 10   */
 11  
 12  import { createDatabaseConnection } from '../src/utils/db.js';
 13  import { getCountryByCode } from '../src/config/countries.js';
 14  import dotenv from 'dotenv';
 15  
 16  dotenv.config();
 17  
 18  const BATCH_SIZE = 500;
 19  const isDryRun = process.argv.includes('--dry-run');
 20  
 21  function normalizeLanguageCode(htmlLang) {
 22    if (!htmlLang) return null;
 23    // Split on either hyphen or underscore: 'de-DE' → 'de', 'de_DE' → 'de'
 24    return htmlLang.split(/[-_]/)[0].toLowerCase();
 25  }
 26  
 27  function deriveLanguageCode(site) {
 28    // 1. Try locale_data.htmlLang
 29    if (site.locale_data) {
 30      try {
 31        const parsed = JSON.parse(site.locale_data);
 32        const code = normalizeLanguageCode(parsed.htmlLang);
 33        if (code) return code;
 34      } catch {
 35        // malformed locale_data JSON — fall through to country default
 36      }
 37    }
 38    // 2. Fall back to country's primary language
 39    if (site.country_code) {
 40      try {
 41        const country = getCountryByCode(site.country_code);
 42        if (country?.language) return country.language;
 43      } catch {
 44        // Unknown or unsupported country code — skip
 45      }
 46    }
 47    return null;
 48  }
 49  
 50  const dbPath = process.env.DATABASE_PATH || './db/sites.db';
 51  const db = createDatabaseConnection(dbPath);
 52  db.pragma('foreign_keys = ON');
 53  
 54  const sites = db
 55    .prepare(
 56      `SELECT id, country_code, locale_data, language_code
 57       FROM sites
 58       WHERE status != 'ignored'
 59       ORDER BY id`
 60    )
 61    .all();
 62  
 63  console.log(
 64    `Processing ${sites.length} sites (batch size: ${BATCH_SIZE})${isDryRun ? ' [DRY RUN]' : ''}...`
 65  );
 66  
 67  let updated = 0;
 68  let skipped = 0;
 69  let alreadySet = 0;
 70  let unchanged = 0;
 71  
 72  const updateStmt = db.prepare('UPDATE sites SET language_code = ? WHERE id = ?');
 73  
 74  for (let i = 0; i < sites.length; i += BATCH_SIZE) {
 75    const batch = sites.slice(i, i + BATCH_SIZE);
 76  
 77    if (!isDryRun) {
 78      const runBatch = db.transaction(items => {
 79        for (const site of items) {
 80          const derived = deriveLanguageCode(site);
 81  
 82          if (!derived) {
 83            skipped++;
 84            continue;
 85          }
 86  
 87          if (site.language_code === derived) {
 88            alreadySet++;
 89            continue;
 90          }
 91  
 92          if (site.language_code && site.language_code !== derived) {
 93            // Already has a different value — prefer keeping existing (may have been set by scoring)
 94            unchanged++;
 95            continue;
 96          }
 97  
 98          updateStmt.run(derived, site.id);
 99          updated++;
100        }
101      });
102      runBatch(batch);
103    } else {
104      for (const site of batch) {
105        const derived = deriveLanguageCode(site);
106        if (!derived) {
107          skipped++;
108          continue;
109        }
110        if (site.language_code === derived) {
111          alreadySet++;
112          continue;
113        }
114        if (site.language_code && site.language_code !== derived) {
115          unchanged++;
116          continue;
117        }
118        updated++;
119      }
120    }
121  
122    process.stdout.write(
123      `\r  Processed ${Math.min(i + BATCH_SIZE, sites.length)}/${sites.length}...`
124    );
125  }
126  
127  console.log('\n');
128  console.log('Results:');
129  console.log(`  Updated:      ${updated}`);
130  console.log(`  Already set:  ${alreadySet}`);
131  console.log(`  Skipped (no language derived): ${skipped}`);
132  console.log(`  Skipped (existing value differs): ${unchanged}`);
133  
134  if (isDryRun) {
135    console.log('\n[DRY RUN] No changes written. Re-run without --dry-run to apply.');
136  }
137  
138  db.close();