/ scripts / migrate-html-to-filesystem.js
migrate-html-to-filesystem.js
  1  #!/usr/bin/env node
  2  /**
  3   * One-time migration: move html_dom and key_pages_html from SQLite to filesystem.
  4   *
  5   * For each site with non-null, non-sentinel html_dom:
  6   *   1. Write HTML to data/html/{site_id}.json
  7   *   2. Set html_dom = 'fs' in DB (flag indicating filesystem storage)
  8   *
  9   * For each site with non-null key_pages_html:
 10   *   1. Write key_pages_html to the same JSON file
 11   *   2. Set key_pages_html = 'fs' in DB
 12   *
 13   * After migration completes successfully:
 14   *   3. NULL out the old html_dom/key_pages_html content for migrated rows
 15   *   4. Run VACUUM to reclaim disk space
 16   *
 17   * Usage:
 18   *   node scripts/migrate-html-to-filesystem.js [--dry-run] [--batch-size=1000] [--skip-vacuum]
 19   *
 20   * IMPORTANT: Stop the pipeline before running this migration.
 21   */
 22  
 23  import { createDatabaseConnection } from '../src/utils/db.js';
 24  import { writeHtmlDom, writeKeyPagesHtml } from '../src/utils/html-storage.js';
 25  import '../src/utils/load-env.js';
 26  
 27  const args = process.argv.slice(2);
 28  const dryRun = args.includes('--dry-run');
 29  const skipVacuum = args.includes('--skip-vacuum');
 30  const batchSizeArg = args.find(a => a.startsWith('--batch-size='));
 31  const batchSize = batchSizeArg ? parseInt(batchSizeArg.split('=')[1], 10) : 1000;
 32  
 33  const HTML_SENTINEL = 'HTML removed after scoring';
 34  
 35  const dbPath = process.env.DATABASE_PATH || './db/sites.db';
 36  const db = createDatabaseConnection(dbPath);
 37  db.pragma('journal_mode = WAL');
 38  db.pragma('busy_timeout = 30000');
 39  
 40  console.log(`Migration: html_dom + key_pages_html → filesystem`);
 41  console.log(`Database: ${dbPath}`);
 42  console.log(`Batch size: ${batchSize}`);
 43  console.log(`Dry run: ${dryRun}`);
 44  console.log(`Skip vacuum: ${skipVacuum}`);
 45  console.log('');
 46  
 47  // Phase 1: Migrate html_dom
 48  const htmlCount = db
 49    .prepare(
 50      `SELECT COUNT(*) as count FROM sites
 51       WHERE html_dom IS NOT NULL
 52         AND html_dom != '${HTML_SENTINEL}'
 53         AND html_dom != 'fs'
 54         AND length(html_dom) > 10`
 55    )
 56    .get().count;
 57  
 58  console.log(`Phase 1: ${htmlCount} sites have html_dom to migrate`);
 59  
 60  let htmlMigrated = 0;
 61  let htmlErrors = 0;
 62  
 63  while (htmlMigrated + htmlErrors < htmlCount) {
 64    const batch = db
 65      .prepare(
 66        `SELECT id, html_dom FROM sites
 67         WHERE html_dom IS NOT NULL
 68           AND html_dom != '${HTML_SENTINEL}'
 69           AND html_dom != 'fs'
 70           AND length(html_dom) > 10
 71         LIMIT ?`
 72      )
 73      .all(batchSize);
 74  
 75    if (batch.length === 0) break;
 76  
 77    for (const site of batch) {
 78      try {
 79        if (!dryRun) {
 80          writeHtmlDom(site.id, site.html_dom);
 81          db.prepare(`UPDATE sites SET html_dom = 'fs' WHERE id = ?`).run(site.id);
 82        }
 83        htmlMigrated++;
 84      } catch (err) {
 85        console.error(`  Error migrating site ${site.id}: ${err.message}`);
 86        htmlErrors++;
 87      }
 88    }
 89  
 90    console.log(`  html_dom: ${htmlMigrated}/${htmlCount} migrated (${htmlErrors} errors)`);
 91  }
 92  
 93  // Phase 2: Migrate key_pages_html
 94  const kphCount = db
 95    .prepare(
 96      `SELECT COUNT(*) as count FROM sites
 97       WHERE key_pages_html IS NOT NULL
 98         AND key_pages_html != 'fs'
 99         AND length(key_pages_html) > 10`
100    )
101    .get().count;
102  
103  console.log(`\nPhase 2: ${kphCount} sites have key_pages_html to migrate`);
104  
105  let kphMigrated = 0;
106  let kphErrors = 0;
107  
108  while (kphMigrated + kphErrors < kphCount) {
109    const batch = db
110      .prepare(
111        `SELECT id, key_pages_html FROM sites
112         WHERE key_pages_html IS NOT NULL
113           AND key_pages_html != 'fs'
114           AND length(key_pages_html) > 10
115         LIMIT ?`
116      )
117      .all(batchSize);
118  
119    if (batch.length === 0) break;
120  
121    for (const site of batch) {
122      try {
123        const keyPages = JSON.parse(site.key_pages_html);
124        if (!dryRun) {
125          writeKeyPagesHtml(site.id, keyPages);
126          db.prepare(`UPDATE sites SET key_pages_html = 'fs' WHERE id = ?`).run(site.id);
127        }
128        kphMigrated++;
129      } catch (err) {
130        console.error(`  Error migrating key_pages_html for site ${site.id}: ${err.message}`);
131        kphErrors++;
132      }
133    }
134  
135    console.log(`  key_pages_html: ${kphMigrated}/${kphCount} migrated (${kphErrors} errors)`);
136  }
137  
138  // Phase 3: Clear sentinel values (set to NULL — no longer needed)
139  if (!dryRun) {
140    console.log('\nPhase 3: Clearing sentinel values...');
141    const sentinelCleared = db
142      .prepare(`UPDATE sites SET html_dom = NULL WHERE html_dom = '${HTML_SENTINEL}'`)
143      .run().changes;
144    console.log(`  Cleared ${sentinelCleared} sentinel values`);
145  }
146  
147  // Phase 4: VACUUM
148  if (!dryRun && !skipVacuum) {
149    console.log('\nPhase 4: Running VACUUM (this will take a while for a 19 GB database)...');
150    const startTime = Date.now();
151    db.exec('VACUUM');
152    const duration = ((Date.now() - startTime) / 1000).toFixed(1);
153    console.log(`  VACUUM completed in ${duration}s`);
154  }
155  
156  db.close();
157  
158  console.log('\nMigration complete!');
159  console.log(`  html_dom: ${htmlMigrated} migrated, ${htmlErrors} errors`);
160  console.log(`  key_pages_html: ${kphMigrated} migrated, ${kphErrors} errors`);
161  if (dryRun) console.log('  (dry run — no changes made)');