120-extract-score-contacts-json.js
1 #!/usr/bin/env node 2 /** 3 * 120-extract-score-contacts-json.js 4 * 5 * Extracts score_json and contacts_json blobs from the sites table to 6 * the filesystem: 7 * data/scores/{site_id}.json 8 * data/contacts/{site_id}.json 9 * 10 * After writing to disk, replaces each DB column value with a sentinel 11 * '{"_fs":true}' to reclaim space while preserving IS NOT NULL semantics. 12 * json_extract() calls on the sentinel return NULL gracefully (already 13 * handled in all code paths via getScoreJsonWithFallback / getContactsJsonWithFallback). 14 * 15 * Safe to re-run (idempotent): 16 * - If the file already exists on disk the DB blob is set to sentinel and skipped. 17 * - If the DB column already holds the sentinel the row is skipped entirely. 18 * 19 * After this migration, the pipeline no longer writes to these columns at all. 20 * Migration 121 NULLs them out entirely once VACUUM has been run. 21 * 22 * Usage: 23 * node db/migrations/120-extract-score-contacts-json.js 24 * node db/migrations/120-extract-score-contacts-json.js --dry-run 25 * node db/migrations/120-extract-score-contacts-json.js --scores-only 26 * node db/migrations/120-extract-score-contacts-json.js --contacts-only 27 * DATABASE_PATH=/path/to/sites.db node db/migrations/120-extract-score-contacts-json.js 28 */ 29 30 import Database from 'better-sqlite3'; 31 import { existsSync, mkdirSync, writeFileSync } from 'fs'; 32 import { join } from 'path'; 33 import path from 'path'; 34 import { fileURLToPath } from 'url'; 35 36 const __dirname = path.dirname(fileURLToPath(import.meta.url)); 37 38 const DRY_RUN = process.argv.includes('--dry-run'); 39 const SCORES_ONLY = process.argv.includes('--scores-only'); 40 const CONTACTS_ONLY = process.argv.includes('--contacts-only'); 41 const DO_SCORES = !CONTACTS_ONLY; 42 const DO_CONTACTS = !SCORES_ONLY; 43 44 const dbPath = 45 process.env.DATABASE_PATH || path.resolve(__dirname, '../../db/sites.db'); 46 47 const SCORES_DIR = path.resolve(__dirname, '../../data/scores'); 48 const CONTACTS_DIR = path.resolve(__dirname, '../../data/contacts'); 49 50 // Sentinel: valid JSON that signals "data is on filesystem". 51 // Preserves IS NOT NULL while shrinking the column to 14 bytes. 52 const FS_SENTINEL = '{"_fs":true}'; 53 54 // Batch size for SELECT + UPDATE loops 55 const BATCH_SIZE = 500; 56 57 // ------------------------------------------------------------------------- 58 // Helpers 59 // ------------------------------------------------------------------------- 60 61 function scoreFilePath(siteId) { 62 return join(SCORES_DIR, `${siteId}.json`); 63 } 64 65 function contactsFilePath(siteId) { 66 return join(CONTACTS_DIR, `${siteId}.json`); 67 } 68 69 function writeJson(filePath, json) { 70 writeFileSync(filePath, json, 'utf8'); 71 } 72 73 // ------------------------------------------------------------------------- 74 // Extract score_json 75 // ------------------------------------------------------------------------- 76 77 function extractScoreJson(db) { 78 console.log('\n=== Extracting score_json ==='); 79 80 if (!DRY_RUN) { 81 mkdirSync(SCORES_DIR, { recursive: true }); 82 } 83 84 const total = db 85 .prepare(`SELECT COUNT(*) as c FROM sites WHERE score_json IS NOT NULL AND score_json != ?`) 86 .get(FS_SENTINEL).c; 87 console.log(`Sites with score_json in DB: ${total.toLocaleString()}`); 88 89 if (total === 0) { 90 console.log('Nothing to extract.'); 91 return { extracted: 0, skipped: 0, errors: 0 }; 92 } 93 94 const setSentinel = db.prepare('UPDATE sites SET score_json = ? WHERE id = ?'); 95 96 let extracted = 0; 97 let skipped = 0; 98 let errors = 0; 99 let offset = 0; 100 101 while (true) { 102 const rows = db 103 .prepare( 104 `SELECT id, score_json FROM sites 105 WHERE score_json IS NOT NULL AND score_json != ? 106 LIMIT ? OFFSET ?` 107 ) 108 .all(FS_SENTINEL, BATCH_SIZE, offset); 109 110 if (rows.length === 0) break; 111 112 for (const row of rows) { 113 try { 114 const filePath = scoreFilePath(row.id); 115 116 if (existsSync(filePath)) { 117 // Already on disk — just write sentinel to DB (reclaim space) 118 if (!DRY_RUN) setSentinel.run(FS_SENTINEL, row.id); 119 skipped++; 120 continue; 121 } 122 123 if (DRY_RUN) { 124 extracted++; 125 continue; 126 } 127 128 writeJson(filePath, row.score_json); 129 setSentinel.run(FS_SENTINEL, row.id); 130 extracted++; 131 } catch (err) { 132 console.error(` Error extracting score_json for site ${row.id}: ${err.message}`); 133 errors++; 134 } 135 } 136 137 // In dry-run mode rows are not modified so advance offset manually 138 if (DRY_RUN) offset += BATCH_SIZE; 139 140 const progress = extracted + skipped + errors; 141 if (progress > 0 && (progress % 10000 === 0 || rows.length < BATCH_SIZE)) { 142 console.log( 143 ` Progress: ${progress.toLocaleString()}/${total.toLocaleString()} (extracted=${extracted}, skipped=${skipped}, errors=${errors})` 144 ); 145 } 146 } 147 148 console.log( 149 `Score extraction complete: extracted=${extracted}, skipped=${skipped}, errors=${errors}` 150 ); 151 return { extracted, skipped, errors }; 152 } 153 154 // ------------------------------------------------------------------------- 155 // Extract contacts_json 156 // ------------------------------------------------------------------------- 157 158 function extractContactsJson(db) { 159 console.log('\n=== Extracting contacts_json ==='); 160 161 if (!DRY_RUN) { 162 mkdirSync(CONTACTS_DIR, { recursive: true }); 163 } 164 165 const total = db 166 .prepare(`SELECT COUNT(*) as c FROM sites WHERE contacts_json IS NOT NULL AND contacts_json != ?`) 167 .get(FS_SENTINEL).c; 168 console.log(`Sites with contacts_json in DB: ${total.toLocaleString()}`); 169 170 if (total === 0) { 171 console.log('Nothing to extract.'); 172 return { extracted: 0, skipped: 0, errors: 0 }; 173 } 174 175 const setSentinel = db.prepare('UPDATE sites SET contacts_json = ? WHERE id = ?'); 176 177 let extracted = 0; 178 let skipped = 0; 179 let errors = 0; 180 let offset = 0; 181 182 while (true) { 183 const rows = db 184 .prepare( 185 `SELECT id, contacts_json FROM sites 186 WHERE contacts_json IS NOT NULL AND contacts_json != ? 187 LIMIT ? OFFSET ?` 188 ) 189 .all(FS_SENTINEL, BATCH_SIZE, offset); 190 191 if (rows.length === 0) break; 192 193 for (const row of rows) { 194 try { 195 const filePath = contactsFilePath(row.id); 196 197 if (existsSync(filePath)) { 198 if (!DRY_RUN) setSentinel.run(FS_SENTINEL, row.id); 199 skipped++; 200 continue; 201 } 202 203 if (DRY_RUN) { 204 extracted++; 205 continue; 206 } 207 208 writeJson(filePath, row.contacts_json); 209 setSentinel.run(FS_SENTINEL, row.id); 210 extracted++; 211 } catch (err) { 212 console.error(` Error extracting contacts_json for site ${row.id}: ${err.message}`); 213 errors++; 214 } 215 } 216 217 if (DRY_RUN) offset += BATCH_SIZE; 218 219 const progress = extracted + skipped + errors; 220 if (progress > 0 && (progress % 10000 === 0 || rows.length < BATCH_SIZE)) { 221 console.log( 222 ` Progress: ${progress.toLocaleString()}/${total.toLocaleString()} (extracted=${extracted}, skipped=${skipped}, errors=${errors})` 223 ); 224 } 225 } 226 227 console.log( 228 `Contacts extraction complete: extracted=${extracted}, skipped=${skipped}, errors=${errors}` 229 ); 230 return { extracted, skipped, errors }; 231 } 232 233 // ------------------------------------------------------------------------- 234 // Main 235 // ------------------------------------------------------------------------- 236 237 if (!existsSync(dbPath)) { 238 console.error(`ERROR: Database not found at: ${dbPath}`); 239 process.exit(1); 240 } 241 242 const db = new Database(dbPath); 243 db.pragma('journal_mode = WAL'); 244 db.pragma('busy_timeout = 30000'); 245 246 console.log('=== 120-extract-score-contacts-json migration ==='); 247 console.log(`Database: ${dbPath}`); 248 if (DRY_RUN) console.log('MODE: DRY RUN — no changes will be made'); 249 else console.log('MODE: LIVE'); 250 251 const startTime = Date.now(); 252 253 try { 254 if (DO_SCORES) extractScoreJson(db); 255 if (DO_CONTACTS) extractContactsJson(db); 256 257 if (!DRY_RUN) { 258 console.log('\nRunning VACUUM to reclaim disk space from replaced blobs...'); 259 db.exec('VACUUM'); 260 console.log('VACUUM complete.'); 261 } 262 } finally { 263 db.close(); 264 } 265 266 const elapsed = ((Date.now() - startTime) / 1000).toFixed(1); 267 console.log(`\nDone in ${elapsed}s`); 268 console.log('\nNext step: run migration 121 to NULL out the now-empty DB columns.');