/ scripts / extract-score-contacts-json.js
extract-score-contacts-json.js
  1  #!/usr/bin/env node
  2  /**
  3   * Extract score_json and contacts_json from DB to filesystem.
  4   *
  5   * For each site row that has score_json or contacts_json:
  6   *   1. Write the JSON blob to data/scores/{id}.json or data/contacts/{id}.json
  7   *   2. Replace the DB column with a sentinel value '{"_fs":true}' to reclaim space
  8   *      while keeping IS NOT NULL checks and json_extract() calls working
  9   *
 10   * The sentinel '{"_fs":true}' is valid JSON so:
 11   *   - score_json IS NOT NULL  → still true
 12   *   - json_extract(score_json, '$.overall_calculation.conversion_score') → NULL (graceful)
 13   *   - json_array_length(contacts_json, '$.contacts') → NULL (graceful)
 14   *
 15   * Safe to re-run: skips sites already extracted (sentinel in DB or file exists on fs).
 16   *
 17   * Usage:
 18   *   node scripts/extract-score-contacts-json.js                # extract both
 19   *   node scripts/extract-score-contacts-json.js --scores-only  # extract score_json only
 20   *   node scripts/extract-score-contacts-json.js --contacts-only # extract contacts_json only
 21   *   node scripts/extract-score-contacts-json.js --dry-run      # show what would happen
 22   */
 23  
 24  import { createDatabaseConnection } from '../src/utils/db.js';
 25  import { join, dirname } from 'path';
 26  import { fileURLToPath } from 'url';
 27  import { setScoreJson, hasScoreJson } from '../src/utils/score-storage.js';
 28  import { setContactsJson, hasContactsJson } from '../src/utils/contacts-storage.js';
 29  
 30  const __filename = fileURLToPath(import.meta.url);
 31  const __dirname = dirname(__filename);
 32  const projectRoot = join(__dirname, '..');
 33  const dbPath = join(projectRoot, 'db', 'sites.db');
 34  
 35  const args = process.argv.slice(2);
 36  const dryRun = args.includes('--dry-run');
 37  const scoresOnly = args.includes('--scores-only');
 38  const contactsOnly = args.includes('--contacts-only');
 39  const doScores = !contactsOnly;
 40  const doContacts = !scoresOnly;
 41  
 42  // Sentinel value: valid JSON that preserves IS NOT NULL while reclaiming space.
 43  // json_extract() calls on real paths return NULL gracefully.
 44  const FS_SENTINEL = '{"_fs":true}';
 45  
 46  function extractScoreJson(db) {
 47    console.log('\n=== Extracting score_json ===');
 48  
 49    // Count total (exclude already-extracted sentinel rows)
 50    const total = db.prepare(
 51      `SELECT COUNT(*) as c FROM sites WHERE score_json IS NOT NULL AND score_json != ?`
 52    ).get(FS_SENTINEL).c;
 53    console.log(`Sites with score_json in DB: ${total}`);
 54  
 55    if (total === 0) {
 56      console.log('Nothing to extract.');
 57      return { extracted: 0, skipped: 0, errors: 0 };
 58    }
 59  
 60    // Process in batches of 500
 61    const BATCH_SIZE = 500;
 62    let extracted = 0;
 63    let skipped = 0;
 64    let errors = 0;
 65    let offset = 0;
 66  
 67    const setSentinel = db.prepare('UPDATE sites SET score_json = ? WHERE id = ?');
 68  
 69    while (true) {
 70      const rows = db.prepare(
 71        `SELECT id, score_json FROM sites WHERE score_json IS NOT NULL AND score_json != ? LIMIT ? OFFSET ?`
 72      ).all(FS_SENTINEL, BATCH_SIZE, offset);
 73  
 74      if (rows.length === 0) break;
 75  
 76      for (const row of rows) {
 77        try {
 78          // Skip if already on filesystem — just replace DB blob with sentinel
 79          if (hasScoreJson(row.id)) {
 80            if (!dryRun) {
 81              setSentinel.run(FS_SENTINEL, row.id);
 82            }
 83            skipped++;
 84            continue;
 85          }
 86  
 87          if (dryRun) {
 88            extracted++;
 89            continue;
 90          }
 91  
 92          // Write to filesystem
 93          setScoreJson(row.id, row.score_json);
 94  
 95          // Replace DB column with sentinel
 96          setSentinel.run(FS_SENTINEL, row.id);
 97          extracted++;
 98        } catch (err) {
 99          console.error(`  Error extracting score_json for site ${row.id}: ${err.message}`);
100          errors++;
101        }
102      }
103  
104      // Since we're replacing rows with sentinel, they won't match the WHERE next time
105      // For dry-run, advance offset since rows aren't being modified
106      if (dryRun) {
107        offset += BATCH_SIZE;
108      }
109  
110      const progress = extracted + skipped + errors;
111      if (progress % 1000 === 0 || rows.length < BATCH_SIZE) {
112        console.log(`  Progress: ${progress}/${total} (extracted=${extracted}, skipped=${skipped}, errors=${errors})`);
113      }
114    }
115  
116    console.log(`Score extraction complete: extracted=${extracted}, skipped=${skipped}, errors=${errors}`);
117    return { extracted, skipped, errors };
118  }
119  
120  function extractContactsJson(db) {
121    console.log('\n=== Extracting contacts_json ===');
122  
123    const total = db.prepare(
124      `SELECT COUNT(*) as c FROM sites WHERE contacts_json IS NOT NULL AND contacts_json != ?`
125    ).get(FS_SENTINEL).c;
126    console.log(`Sites with contacts_json in DB: ${total}`);
127  
128    if (total === 0) {
129      console.log('Nothing to extract.');
130      return { extracted: 0, skipped: 0, errors: 0 };
131    }
132  
133    const BATCH_SIZE = 500;
134    let extracted = 0;
135    let skipped = 0;
136    let errors = 0;
137    let offset = 0;
138  
139    const setSentinel = db.prepare('UPDATE sites SET contacts_json = ? WHERE id = ?');
140  
141    while (true) {
142      const rows = db.prepare(
143        `SELECT id, contacts_json FROM sites WHERE contacts_json IS NOT NULL AND contacts_json != ? LIMIT ? OFFSET ?`
144      ).all(FS_SENTINEL, BATCH_SIZE, offset);
145  
146      if (rows.length === 0) break;
147  
148      for (const row of rows) {
149        try {
150          if (hasContactsJson(row.id)) {
151            if (!dryRun) {
152              setSentinel.run(FS_SENTINEL, row.id);
153            }
154            skipped++;
155            continue;
156          }
157  
158          if (dryRun) {
159            extracted++;
160            continue;
161          }
162  
163          setContactsJson(row.id, row.contacts_json);
164          setSentinel.run(FS_SENTINEL, row.id);
165          extracted++;
166        } catch (err) {
167          console.error(`  Error extracting contacts_json for site ${row.id}: ${err.message}`);
168          errors++;
169        }
170      }
171  
172      if (dryRun) {
173        offset += BATCH_SIZE;
174      }
175  
176      const progress = extracted + skipped + errors;
177      if (progress % 1000 === 0 || rows.length < BATCH_SIZE) {
178        console.log(`  Progress: ${progress}/${total} (extracted=${extracted}, skipped=${skipped}, errors=${errors})`);
179      }
180    }
181  
182    console.log(`Contacts extraction complete: extracted=${extracted}, skipped=${skipped}, errors=${errors}`);
183    return { extracted, skipped, errors };
184  }
185  
186  // Main
187  const db = createDatabaseConnection(dbPath);
188  db.pragma('journal_mode = WAL');
189  db.pragma('busy_timeout = 30000');
190  
191  if (dryRun) {
192    console.log('=== DRY RUN — no changes will be made ===');
193  }
194  
195  const startTime = Date.now();
196  
197  try {
198    if (doScores) extractScoreJson(db);
199    if (doContacts) extractContactsJson(db);
200  
201    if (!dryRun) {
202      // Reclaim space after replacing large blobs with sentinels
203      console.log('\nRunning VACUUM to reclaim disk space...');
204      db.exec('VACUUM');
205      console.log('VACUUM complete.');
206    }
207  } finally {
208    db.close();
209  }
210  
211  const elapsed = ((Date.now() - startTime) / 1000).toFixed(1);
212  console.log(`\nDone in ${elapsed}s`);