extract-score-contacts-json.js
1 #!/usr/bin/env node 2 /** 3 * Extract score_json and contacts_json from DB to filesystem. 4 * 5 * For each site row that has score_json or contacts_json: 6 * 1. Write the JSON blob to data/scores/{id}.json or data/contacts/{id}.json 7 * 2. Replace the DB column with a sentinel value '{"_fs":true}' to reclaim space 8 * while keeping IS NOT NULL checks and json_extract() calls working 9 * 10 * The sentinel '{"_fs":true}' is valid JSON so: 11 * - score_json IS NOT NULL → still true 12 * - json_extract(score_json, '$.overall_calculation.conversion_score') → NULL (graceful) 13 * - json_array_length(contacts_json, '$.contacts') → NULL (graceful) 14 * 15 * Safe to re-run: skips sites already extracted (sentinel in DB or file exists on fs). 16 * 17 * Usage: 18 * node scripts/extract-score-contacts-json.js # extract both 19 * node scripts/extract-score-contacts-json.js --scores-only # extract score_json only 20 * node scripts/extract-score-contacts-json.js --contacts-only # extract contacts_json only 21 * node scripts/extract-score-contacts-json.js --dry-run # show what would happen 22 */ 23 24 import { createDatabaseConnection } from '../src/utils/db.js'; 25 import { join, dirname } from 'path'; 26 import { fileURLToPath } from 'url'; 27 import { setScoreJson, hasScoreJson } from '../src/utils/score-storage.js'; 28 import { setContactsJson, hasContactsJson } from '../src/utils/contacts-storage.js'; 29 30 const __filename = fileURLToPath(import.meta.url); 31 const __dirname = dirname(__filename); 32 const projectRoot = join(__dirname, '..'); 33 const dbPath = join(projectRoot, 'db', 'sites.db'); 34 35 const args = process.argv.slice(2); 36 const dryRun = args.includes('--dry-run'); 37 const scoresOnly = args.includes('--scores-only'); 38 const contactsOnly = args.includes('--contacts-only'); 39 const doScores = !contactsOnly; 40 const doContacts = !scoresOnly; 41 42 // Sentinel value: valid JSON that preserves IS NOT NULL while reclaiming space. 43 // json_extract() calls on real paths return NULL gracefully. 44 const FS_SENTINEL = '{"_fs":true}'; 45 46 function extractScoreJson(db) { 47 console.log('\n=== Extracting score_json ==='); 48 49 // Count total (exclude already-extracted sentinel rows) 50 const total = db.prepare( 51 `SELECT COUNT(*) as c FROM sites WHERE score_json IS NOT NULL AND score_json != ?` 52 ).get(FS_SENTINEL).c; 53 console.log(`Sites with score_json in DB: ${total}`); 54 55 if (total === 0) { 56 console.log('Nothing to extract.'); 57 return { extracted: 0, skipped: 0, errors: 0 }; 58 } 59 60 // Process in batches of 500 61 const BATCH_SIZE = 500; 62 let extracted = 0; 63 let skipped = 0; 64 let errors = 0; 65 let offset = 0; 66 67 const setSentinel = db.prepare('UPDATE sites SET score_json = ? WHERE id = ?'); 68 69 while (true) { 70 const rows = db.prepare( 71 `SELECT id, score_json FROM sites WHERE score_json IS NOT NULL AND score_json != ? LIMIT ? OFFSET ?` 72 ).all(FS_SENTINEL, BATCH_SIZE, offset); 73 74 if (rows.length === 0) break; 75 76 for (const row of rows) { 77 try { 78 // Skip if already on filesystem — just replace DB blob with sentinel 79 if (hasScoreJson(row.id)) { 80 if (!dryRun) { 81 setSentinel.run(FS_SENTINEL, row.id); 82 } 83 skipped++; 84 continue; 85 } 86 87 if (dryRun) { 88 extracted++; 89 continue; 90 } 91 92 // Write to filesystem 93 setScoreJson(row.id, row.score_json); 94 95 // Replace DB column with sentinel 96 setSentinel.run(FS_SENTINEL, row.id); 97 extracted++; 98 } catch (err) { 99 console.error(` Error extracting score_json for site ${row.id}: ${err.message}`); 100 errors++; 101 } 102 } 103 104 // Since we're replacing rows with sentinel, they won't match the WHERE next time 105 // For dry-run, advance offset since rows aren't being modified 106 if (dryRun) { 107 offset += BATCH_SIZE; 108 } 109 110 const progress = extracted + skipped + errors; 111 if (progress % 1000 === 0 || rows.length < BATCH_SIZE) { 112 console.log(` Progress: ${progress}/${total} (extracted=${extracted}, skipped=${skipped}, errors=${errors})`); 113 } 114 } 115 116 console.log(`Score extraction complete: extracted=${extracted}, skipped=${skipped}, errors=${errors}`); 117 return { extracted, skipped, errors }; 118 } 119 120 function extractContactsJson(db) { 121 console.log('\n=== Extracting contacts_json ==='); 122 123 const total = db.prepare( 124 `SELECT COUNT(*) as c FROM sites WHERE contacts_json IS NOT NULL AND contacts_json != ?` 125 ).get(FS_SENTINEL).c; 126 console.log(`Sites with contacts_json in DB: ${total}`); 127 128 if (total === 0) { 129 console.log('Nothing to extract.'); 130 return { extracted: 0, skipped: 0, errors: 0 }; 131 } 132 133 const BATCH_SIZE = 500; 134 let extracted = 0; 135 let skipped = 0; 136 let errors = 0; 137 let offset = 0; 138 139 const setSentinel = db.prepare('UPDATE sites SET contacts_json = ? WHERE id = ?'); 140 141 while (true) { 142 const rows = db.prepare( 143 `SELECT id, contacts_json FROM sites WHERE contacts_json IS NOT NULL AND contacts_json != ? LIMIT ? OFFSET ?` 144 ).all(FS_SENTINEL, BATCH_SIZE, offset); 145 146 if (rows.length === 0) break; 147 148 for (const row of rows) { 149 try { 150 if (hasContactsJson(row.id)) { 151 if (!dryRun) { 152 setSentinel.run(FS_SENTINEL, row.id); 153 } 154 skipped++; 155 continue; 156 } 157 158 if (dryRun) { 159 extracted++; 160 continue; 161 } 162 163 setContactsJson(row.id, row.contacts_json); 164 setSentinel.run(FS_SENTINEL, row.id); 165 extracted++; 166 } catch (err) { 167 console.error(` Error extracting contacts_json for site ${row.id}: ${err.message}`); 168 errors++; 169 } 170 } 171 172 if (dryRun) { 173 offset += BATCH_SIZE; 174 } 175 176 const progress = extracted + skipped + errors; 177 if (progress % 1000 === 0 || rows.length < BATCH_SIZE) { 178 console.log(` Progress: ${progress}/${total} (extracted=${extracted}, skipped=${skipped}, errors=${errors})`); 179 } 180 } 181 182 console.log(`Contacts extraction complete: extracted=${extracted}, skipped=${skipped}, errors=${errors}`); 183 return { extracted, skipped, errors }; 184 } 185 186 // Main 187 const db = createDatabaseConnection(dbPath); 188 db.pragma('journal_mode = WAL'); 189 db.pragma('busy_timeout = 30000'); 190 191 if (dryRun) { 192 console.log('=== DRY RUN — no changes will be made ==='); 193 } 194 195 const startTime = Date.now(); 196 197 try { 198 if (doScores) extractScoreJson(db); 199 if (doContacts) extractContactsJson(db); 200 201 if (!dryRun) { 202 // Reclaim space after replacing large blobs with sentinels 203 console.log('\nRunning VACUUM to reclaim disk space...'); 204 db.exec('VACUUM'); 205 console.log('VACUUM complete.'); 206 } 207 } finally { 208 db.close(); 209 } 210 211 const elapsed = ((Date.now() - startTime) / 1000).toFixed(1); 212 console.log(`\nDone in ${elapsed}s`);