#!/usr/bin/env node
/**
 * backfill-language-code.js — Backfill language_code from locale_data
 *
 * One-off script to populate language_code for all existing sites.
 * Parses locale_data.htmlLang and falls back to the country's primary language.
 *
 * Usage:
 *   node scripts/backfill-language-code.js [--dry-run]
 */

import { createDatabaseConnection } from '../src/utils/db.js';
import { getCountryByCode } from '../src/config/countries.js';
import dotenv from 'dotenv';

dotenv.config();

const BATCH_SIZE = 500;
const isDryRun = process.argv.includes('--dry-run');

/**
 * Reduce an HTML `lang` value to its lower-cased primary subtag.
 *
 * @param {unknown} htmlLang - Value read from locale_data.htmlLang, e.g. 'de-DE' or 'de_DE'.
 * @returns {string|null} The primary subtag ('de'), or null when the input is
 *   not a string or has no primary subtag (empty, whitespace-only, '-DE').
 */
function normalizeLanguageCode(htmlLang) {
  // locale_data comes from JSON, so htmlLang may be a number, object, etc.
  if (typeof htmlLang !== 'string') return null;
  // Split on either hyphen or underscore: 'de-DE' → 'de', 'de_DE' → 'de'
  const [primary] = htmlLang.trim().split(/[-_]/);
  return primary ? primary.toLowerCase() : null;
}

/**
 * Work out the language code for one site row.
 *
 * @param {{ locale_data?: string|null, country_code?: string|null }} site - Row from `sites`.
 * @returns {string|null} Code taken from locale_data.htmlLang when usable,
 *   otherwise the `language` of the country record, otherwise null.
 */
function deriveLanguageCode(site) {
  // 1. Try locale_data.htmlLang
  if (site.locale_data) {
    try {
      const parsed = JSON.parse(site.locale_data);
      const code = normalizeLanguageCode(parsed?.htmlLang);
      if (code) return code;
    } catch {
      // malformed locale_data JSON — fall through to country default
    }
  }

  // 2. Fall back to country's primary language
  if (site.country_code) {
    try {
      const country = getCountryByCode(site.country_code);
      if (country?.language) return country.language;
    } catch {
      // Unknown or unsupported country code — skip
    }
  }

  return null;
}

/**
 * Decide what should happen to one site. Shared by the dry-run and the write
 * path so both always report the same counts.
 *
 * @param {{ language_code?: string|null }} site - Row from `sites`.
 * @returns {{ outcome: 'skipped'|'alreadySet'|'unchanged'|'updated', derived: string|null }}
 *   `skipped`: no language could be derived; `alreadySet`: stored value equals
 *   the derived one; `unchanged`: a different non-empty value is stored and is
 *   kept; `updated`: the stored value is empty and should be written.
 */
function classifySite(site) {
  const derived = deriveLanguageCode(site);
  if (!derived) return { outcome: 'skipped', derived: null };
  if (site.language_code === derived) return { outcome: 'alreadySet', derived };
  // Already has a different value — prefer keeping existing (may have been set by scoring)
  if (site.language_code) return { outcome: 'unchanged', derived };
  return { outcome: 'updated', derived };
}

const dbPath = process.env.DATABASE_PATH || './db/sites.db';
const db = createDatabaseConnection(dbPath);

// Tally of outcomes, keyed by the values classifySite returns.
const counts = { updated: 0, alreadySet: 0, skipped: 0, unchanged: 0 };

try {
  db.pragma('foreign_keys = ON');

  const sites = db
    .prepare(
      `SELECT id, country_code, locale_data, language_code
       FROM sites
       WHERE status != 'ignored'
       ORDER BY id`
    )
    .all();

  console.log(
    `Processing ${sites.length} sites (batch size: ${BATCH_SIZE})${isDryRun ? ' [DRY RUN]' : ''}...`
  );

  const updateStmt = db.prepare('UPDATE sites SET language_code = ? WHERE id = ?');

  /**
   * Classify every site in a batch, count the outcome, and write the derived
   * code unless this is a dry run.
   *
   * @param {Array<object>} batch - Slice of the selected site rows.
   */
  const processBatch = batch => {
    for (const site of batch) {
      const { outcome, derived } = classifySite(site);
      counts[outcome]++;
      if (outcome === 'updated' && !isDryRun) {
        updateStmt.run(derived, site.id);
      }
    }
  };

  // Real runs wrap each batch in one transaction; the wrapper is built once
  // rather than per batch. Dry runs never touch the database for writes.
  const runBatch = isDryRun ? processBatch : db.transaction(processBatch);

  for (let i = 0; i < sites.length; i += BATCH_SIZE) {
    runBatch(sites.slice(i, i + BATCH_SIZE));

    process.stdout.write(
      `\r Processed ${Math.min(i + BATCH_SIZE, sites.length)}/${sites.length}...`
    );
  }

  console.log('\n');
  console.log('Results:');
  console.log(` Updated: ${counts.updated}`);
  console.log(` Already set: ${counts.alreadySet}`);
  console.log(` Skipped (no language derived): ${counts.skipped}`);
  console.log(` Skipped (existing value differs): ${counts.unchanged}`);

  if (isDryRun) {
    console.log('\n[DRY RUN] No changes written. Re-run without --dry-run to apply.');
  }
} finally {
  // Release the connection even when the query or a batch throws.
  db.close();
}