/ db / migrations / 120-extract-score-contacts-json.js
120-extract-score-contacts-json.js
  1  #!/usr/bin/env node
  2  /**
  3   * 120-extract-score-contacts-json.js
  4   *
  5   * Extracts score_json and contacts_json blobs from the sites table to
  6   * the filesystem:
  7   *   data/scores/{site_id}.json
  8   *   data/contacts/{site_id}.json
  9   *
 10   * After writing to disk, replaces each DB column value with a sentinel
 11   * '{"_fs":true}' to reclaim space while preserving IS NOT NULL semantics.
 12   * json_extract() calls on the sentinel return NULL gracefully (already
 13   * handled in all code paths via getScoreJsonWithFallback / getContactsJsonWithFallback).
 14   *
 15   * Safe to re-run (idempotent):
 16   *   - If the file already exists on disk the DB blob is set to sentinel and skipped.
 17   *   - If the DB column already holds the sentinel the row is skipped entirely.
 18   *
 19   * After this migration, the pipeline no longer writes to these columns at all.
 20   * Migration 121 NULLs them out entirely once VACUUM has been run.
 21   *
 22   * Usage:
 23   *   node db/migrations/120-extract-score-contacts-json.js
 24   *   node db/migrations/120-extract-score-contacts-json.js --dry-run
 25   *   node db/migrations/120-extract-score-contacts-json.js --scores-only
 26   *   node db/migrations/120-extract-score-contacts-json.js --contacts-only
 27   *   DATABASE_PATH=/path/to/sites.db node db/migrations/120-extract-score-contacts-json.js
 28   */
 29  
 30  import Database from 'better-sqlite3';
 31  import { existsSync, mkdirSync, writeFileSync } from 'fs';
 32  import { join } from 'path';
 33  import path from 'path';
 34  import { fileURLToPath } from 'url';
 35  
 36  const __dirname = path.dirname(fileURLToPath(import.meta.url));
 37  
 38  const DRY_RUN = process.argv.includes('--dry-run');
 39  const SCORES_ONLY = process.argv.includes('--scores-only');
 40  const CONTACTS_ONLY = process.argv.includes('--contacts-only');
 41  const DO_SCORES = !CONTACTS_ONLY;
 42  const DO_CONTACTS = !SCORES_ONLY;
 43  
 44  const dbPath =
 45    process.env.DATABASE_PATH || path.resolve(__dirname, '../../db/sites.db');
 46  
 47  const SCORES_DIR = path.resolve(__dirname, '../../data/scores');
 48  const CONTACTS_DIR = path.resolve(__dirname, '../../data/contacts');
 49  
 50  // Sentinel: valid JSON that signals "data is on filesystem".
 51  // Preserves IS NOT NULL while shrinking the column to 14 bytes.
 52  const FS_SENTINEL = '{"_fs":true}';
 53  
 54  // Batch size for SELECT + UPDATE loops
 55  const BATCH_SIZE = 500;
 56  
 57  // -------------------------------------------------------------------------
 58  // Helpers
 59  // -------------------------------------------------------------------------
 60  
 61  function scoreFilePath(siteId) {
 62    return join(SCORES_DIR, `${siteId}.json`);
 63  }
 64  
 65  function contactsFilePath(siteId) {
 66    return join(CONTACTS_DIR, `${siteId}.json`);
 67  }
 68  
 69  function writeJson(filePath, json) {
 70    writeFileSync(filePath, json, 'utf8');
 71  }
 72  
 73  // -------------------------------------------------------------------------
 74  // Extract score_json
 75  // -------------------------------------------------------------------------
 76  
 77  function extractScoreJson(db) {
 78    console.log('\n=== Extracting score_json ===');
 79  
 80    if (!DRY_RUN) {
 81      mkdirSync(SCORES_DIR, { recursive: true });
 82    }
 83  
 84    const total = db
 85      .prepare(`SELECT COUNT(*) as c FROM sites WHERE score_json IS NOT NULL AND score_json != ?`)
 86      .get(FS_SENTINEL).c;
 87    console.log(`Sites with score_json in DB: ${total.toLocaleString()}`);
 88  
 89    if (total === 0) {
 90      console.log('Nothing to extract.');
 91      return { extracted: 0, skipped: 0, errors: 0 };
 92    }
 93  
 94    const setSentinel = db.prepare('UPDATE sites SET score_json = ? WHERE id = ?');
 95  
 96    let extracted = 0;
 97    let skipped = 0;
 98    let errors = 0;
 99    let offset = 0;
100  
101    while (true) {
102      const rows = db
103        .prepare(
104          `SELECT id, score_json FROM sites
105           WHERE score_json IS NOT NULL AND score_json != ?
106           LIMIT ? OFFSET ?`
107        )
108        .all(FS_SENTINEL, BATCH_SIZE, offset);
109  
110      if (rows.length === 0) break;
111  
112      for (const row of rows) {
113        try {
114          const filePath = scoreFilePath(row.id);
115  
116          if (existsSync(filePath)) {
117            // Already on disk — just write sentinel to DB (reclaim space)
118            if (!DRY_RUN) setSentinel.run(FS_SENTINEL, row.id);
119            skipped++;
120            continue;
121          }
122  
123          if (DRY_RUN) {
124            extracted++;
125            continue;
126          }
127  
128          writeJson(filePath, row.score_json);
129          setSentinel.run(FS_SENTINEL, row.id);
130          extracted++;
131        } catch (err) {
132          console.error(`  Error extracting score_json for site ${row.id}: ${err.message}`);
133          errors++;
134        }
135      }
136  
137      // In dry-run mode rows are not modified so advance offset manually
138      if (DRY_RUN) offset += BATCH_SIZE;
139  
140      const progress = extracted + skipped + errors;
141      if (progress > 0 && (progress % 10000 === 0 || rows.length < BATCH_SIZE)) {
142        console.log(
143          `  Progress: ${progress.toLocaleString()}/${total.toLocaleString()} (extracted=${extracted}, skipped=${skipped}, errors=${errors})`
144        );
145      }
146    }
147  
148    console.log(
149      `Score extraction complete: extracted=${extracted}, skipped=${skipped}, errors=${errors}`
150    );
151    return { extracted, skipped, errors };
152  }
153  
154  // -------------------------------------------------------------------------
155  // Extract contacts_json
156  // -------------------------------------------------------------------------
157  
158  function extractContactsJson(db) {
159    console.log('\n=== Extracting contacts_json ===');
160  
161    if (!DRY_RUN) {
162      mkdirSync(CONTACTS_DIR, { recursive: true });
163    }
164  
165    const total = db
166      .prepare(`SELECT COUNT(*) as c FROM sites WHERE contacts_json IS NOT NULL AND contacts_json != ?`)
167      .get(FS_SENTINEL).c;
168    console.log(`Sites with contacts_json in DB: ${total.toLocaleString()}`);
169  
170    if (total === 0) {
171      console.log('Nothing to extract.');
172      return { extracted: 0, skipped: 0, errors: 0 };
173    }
174  
175    const setSentinel = db.prepare('UPDATE sites SET contacts_json = ? WHERE id = ?');
176  
177    let extracted = 0;
178    let skipped = 0;
179    let errors = 0;
180    let offset = 0;
181  
182    while (true) {
183      const rows = db
184        .prepare(
185          `SELECT id, contacts_json FROM sites
186           WHERE contacts_json IS NOT NULL AND contacts_json != ?
187           LIMIT ? OFFSET ?`
188        )
189        .all(FS_SENTINEL, BATCH_SIZE, offset);
190  
191      if (rows.length === 0) break;
192  
193      for (const row of rows) {
194        try {
195          const filePath = contactsFilePath(row.id);
196  
197          if (existsSync(filePath)) {
198            if (!DRY_RUN) setSentinel.run(FS_SENTINEL, row.id);
199            skipped++;
200            continue;
201          }
202  
203          if (DRY_RUN) {
204            extracted++;
205            continue;
206          }
207  
208          writeJson(filePath, row.contacts_json);
209          setSentinel.run(FS_SENTINEL, row.id);
210          extracted++;
211        } catch (err) {
212          console.error(`  Error extracting contacts_json for site ${row.id}: ${err.message}`);
213          errors++;
214        }
215      }
216  
217      if (DRY_RUN) offset += BATCH_SIZE;
218  
219      const progress = extracted + skipped + errors;
220      if (progress > 0 && (progress % 10000 === 0 || rows.length < BATCH_SIZE)) {
221        console.log(
222          `  Progress: ${progress.toLocaleString()}/${total.toLocaleString()} (extracted=${extracted}, skipped=${skipped}, errors=${errors})`
223        );
224      }
225    }
226  
227    console.log(
228      `Contacts extraction complete: extracted=${extracted}, skipped=${skipped}, errors=${errors}`
229    );
230    return { extracted, skipped, errors };
231  }
232  
233  // -------------------------------------------------------------------------
234  // Main
235  // -------------------------------------------------------------------------
236  
237  if (!existsSync(dbPath)) {
238    console.error(`ERROR: Database not found at: ${dbPath}`);
239    process.exit(1);
240  }
241  
242  const db = new Database(dbPath);
243  db.pragma('journal_mode = WAL');
244  db.pragma('busy_timeout = 30000');
245  
246  console.log('=== 120-extract-score-contacts-json migration ===');
247  console.log(`Database: ${dbPath}`);
248  if (DRY_RUN) console.log('MODE: DRY RUN — no changes will be made');
249  else console.log('MODE: LIVE');
250  
251  const startTime = Date.now();
252  
253  try {
254    if (DO_SCORES) extractScoreJson(db);
255    if (DO_CONTACTS) extractContactsJson(db);
256  
257    if (!DRY_RUN) {
258      console.log('\nRunning VACUUM to reclaim disk space from replaced blobs...');
259      db.exec('VACUUM');
260      console.log('VACUUM complete.');
261    }
262  } finally {
263    db.close();
264  }
265  
266  const elapsed = ((Date.now() - startTime) / 1000).toFixed(1);
267  console.log(`\nDone in ${elapsed}s`);
268  console.log('\nNext step: run migration 121 to NULL out the now-empty DB columns.');