migrate-html-to-filesystem.js
1 #!/usr/bin/env node 2 /** 3 * One-time migration: move html_dom and key_pages_html from SQLite to filesystem. 4 * 5 * For each site with non-null, non-sentinel html_dom: 6 * 1. Write HTML to data/html/{site_id}.json 7 * 2. Set html_dom = 'fs' in DB (flag indicating filesystem storage) 8 * 9 * For each site with non-null key_pages_html: 10 * 1. Write key_pages_html to the same JSON file 11 * 2. Set key_pages_html = 'fs' in DB 12 * 13 * After migration completes successfully: 14 * 3. NULL out the old html_dom/key_pages_html content for migrated rows 15 * 4. Run VACUUM to reclaim disk space 16 * 17 * Usage: 18 * node scripts/migrate-html-to-filesystem.js [--dry-run] [--batch-size=1000] [--skip-vacuum] 19 * 20 * IMPORTANT: Stop the pipeline before running this migration. 21 */ 22 23 import { createDatabaseConnection } from '../src/utils/db.js'; 24 import { writeHtmlDom, writeKeyPagesHtml } from '../src/utils/html-storage.js'; 25 import '../src/utils/load-env.js'; 26 27 const args = process.argv.slice(2); 28 const dryRun = args.includes('--dry-run'); 29 const skipVacuum = args.includes('--skip-vacuum'); 30 const batchSizeArg = args.find(a => a.startsWith('--batch-size=')); 31 const batchSize = batchSizeArg ? parseInt(batchSizeArg.split('=')[1], 10) : 1000; 32 33 const HTML_SENTINEL = 'HTML removed after scoring'; 34 35 const dbPath = process.env.DATABASE_PATH || './db/sites.db'; 36 const db = createDatabaseConnection(dbPath); 37 db.pragma('journal_mode = WAL'); 38 db.pragma('busy_timeout = 30000'); 39 40 console.log(`Migration: html_dom + key_pages_html → filesystem`); 41 console.log(`Database: ${dbPath}`); 42 console.log(`Batch size: ${batchSize}`); 43 console.log(`Dry run: ${dryRun}`); 44 console.log(`Skip vacuum: ${skipVacuum}`); 45 console.log(''); 46 47 // Phase 1: Migrate html_dom 48 const htmlCount = db 49 .prepare( 50 `SELECT COUNT(*) as count FROM sites 51 WHERE html_dom IS NOT NULL 52 AND html_dom != '${HTML_SENTINEL}' 53 AND html_dom != 'fs' 54 AND length(html_dom) > 10` 55 ) 56 .get().count; 57 58 console.log(`Phase 1: ${htmlCount} sites have html_dom to migrate`); 59 60 let htmlMigrated = 0; 61 let htmlErrors = 0; 62 63 while (htmlMigrated + htmlErrors < htmlCount) { 64 const batch = db 65 .prepare( 66 `SELECT id, html_dom FROM sites 67 WHERE html_dom IS NOT NULL 68 AND html_dom != '${HTML_SENTINEL}' 69 AND html_dom != 'fs' 70 AND length(html_dom) > 10 71 LIMIT ?` 72 ) 73 .all(batchSize); 74 75 if (batch.length === 0) break; 76 77 for (const site of batch) { 78 try { 79 if (!dryRun) { 80 writeHtmlDom(site.id, site.html_dom); 81 db.prepare(`UPDATE sites SET html_dom = 'fs' WHERE id = ?`).run(site.id); 82 } 83 htmlMigrated++; 84 } catch (err) { 85 console.error(` Error migrating site ${site.id}: ${err.message}`); 86 htmlErrors++; 87 } 88 } 89 90 console.log(` html_dom: ${htmlMigrated}/${htmlCount} migrated (${htmlErrors} errors)`); 91 } 92 93 // Phase 2: Migrate key_pages_html 94 const kphCount = db 95 .prepare( 96 `SELECT COUNT(*) as count FROM sites 97 WHERE key_pages_html IS NOT NULL 98 AND key_pages_html != 'fs' 99 AND length(key_pages_html) > 10` 100 ) 101 .get().count; 102 103 console.log(`\nPhase 2: ${kphCount} sites have key_pages_html to migrate`); 104 105 let kphMigrated = 0; 106 let kphErrors = 0; 107 108 while (kphMigrated + kphErrors < kphCount) { 109 const batch = db 110 .prepare( 111 `SELECT id, key_pages_html FROM sites 112 WHERE key_pages_html IS NOT NULL 113 AND key_pages_html != 'fs' 114 AND length(key_pages_html) > 10 115 LIMIT ?` 116 ) 117 .all(batchSize); 118 119 if (batch.length === 0) break; 120 121 for (const site of batch) { 122 try { 123 const keyPages = JSON.parse(site.key_pages_html); 124 if (!dryRun) { 125 writeKeyPagesHtml(site.id, keyPages); 126 db.prepare(`UPDATE sites SET key_pages_html = 'fs' WHERE id = ?`).run(site.id); 127 } 128 kphMigrated++; 129 } catch (err) { 130 console.error(` Error migrating key_pages_html for site ${site.id}: ${err.message}`); 131 kphErrors++; 132 } 133 } 134 135 console.log(` key_pages_html: ${kphMigrated}/${kphCount} migrated (${kphErrors} errors)`); 136 } 137 138 // Phase 3: Clear sentinel values (set to NULL — no longer needed) 139 if (!dryRun) { 140 console.log('\nPhase 3: Clearing sentinel values...'); 141 const sentinelCleared = db 142 .prepare(`UPDATE sites SET html_dom = NULL WHERE html_dom = '${HTML_SENTINEL}'`) 143 .run().changes; 144 console.log(` Cleared ${sentinelCleared} sentinel values`); 145 } 146 147 // Phase 4: VACUUM 148 if (!dryRun && !skipVacuum) { 149 console.log('\nPhase 4: Running VACUUM (this will take a while for a 19 GB database)...'); 150 const startTime = Date.now(); 151 db.exec('VACUUM'); 152 const duration = ((Date.now() - startTime) / 1000).toFixed(1); 153 console.log(` VACUUM completed in ${duration}s`); 154 } 155 156 db.close(); 157 158 console.log('\nMigration complete!'); 159 console.log(` html_dom: ${htmlMigrated} migrated, ${htmlErrors} errors`); 160 console.log(` key_pages_html: ${kphMigrated} migrated, ${kphErrors} errors`); 161 if (dryRun) console.log(' (dry run — no changes made)');