116-archive-ignored-sites.js
1 #!/usr/bin/env node 2 /** 3 * 116-archive-ignored-sites.js 4 * 5 * One-time migration: moves 826K+ status='ignored' rows from the main sites 6 * table into a sites_archive table. Ignored rows are never queried by the 7 * pipeline — they participate in every full-table scan as dead weight (69% of 8 * all rows at time of writing). 9 * 10 * The archive table uses CREATE TABLE AS SELECT to mirror the exact column 11 * layout of sites at the time the migration runs, so it stays valid even if 12 * future migrations add columns to sites (they won't be in the archive, which 13 * is intentional — the archive is read-only historical data). 14 * 15 * Safety: 16 * - Counts are verified before DELETE commits 17 * - Entire operation runs in a single transaction 18 * - Supports --dry-run to preview without writing 19 * - Idempotent: safe to re-run (uses INSERT OR IGNORE; skips if archive already 20 * contains matching ids) 21 * 22 * WARNING: This migration is one-time and largely irreversible. The DELETE from 23 * sites cannot be undone without restoring from backup. The archive table persists 24 * in the database for audit/reference purposes. 25 * 26 * New ignored rows written by the pipeline after this migration runs will remain 27 * in the sites table (the pipeline must continue writing there). Run this migration 28 * periodically or schedule it as a cron job to keep the archive current. 29 * 30 * Usage: 31 * node db/migrations/116-archive-ignored-sites.js # live run 32 * node db/migrations/116-archive-ignored-sites.js --dry-run # preview only 33 * DATABASE_PATH=/path/to/sites.db node db/migrations/116-archive-ignored-sites.js 34 */ 35 36 import Database from 'better-sqlite3'; 37 import { existsSync } from 'fs'; 38 import path from 'path'; 39 import { fileURLToPath } from 'url'; 40 41 const __dirname = path.dirname(fileURLToPath(import.meta.url)); 42 const DRY_RUN = process.argv.includes('--dry-run'); 43 44 const dbPath = 45 process.env.DATABASE_PATH || path.resolve(__dirname, '../../db/sites.db'); 46 47 if (!existsSync(dbPath)) { 48 console.error(`ERROR: Database not found at: ${dbPath}`); 49 process.exit(1); 50 } 51 52 const db = new Database(dbPath); 53 db.pragma('journal_mode = WAL'); 54 db.pragma('busy_timeout = 60000'); 55 db.pragma('foreign_keys = OFF'); // sites_archive has no FKs to maintain 56 57 // --------------------------------------------------------------------------- 58 // Pre-migration counts 59 // --------------------------------------------------------------------------- 60 61 const ignoredInSites = db 62 .prepare("SELECT COUNT(*) AS c FROM sites WHERE status = 'ignored'") 63 .get().c; 64 65 console.log('=== 116-archive-ignored-sites migration ==='); 66 console.log(`Database: ${dbPath}`); 67 if (DRY_RUN) console.log('MODE: DRY RUN — no changes will be committed'); 68 else console.log('MODE: LIVE — changes will be committed'); 69 console.log(''); 70 console.log(`Ignored rows in sites: ${ignoredInSites.toLocaleString()}`); 71 72 if (ignoredInSites === 0) { 73 console.log('Nothing to archive. Exiting.'); 74 db.close(); 75 process.exit(0); 76 } 77 78 // --------------------------------------------------------------------------- 79 // Ensure archive table exists (schema mirrors sites at migration time) 80 // --------------------------------------------------------------------------- 81 82 // If the archive table exists but has a different column count than sites, 83 // it was created from an older schema — drop and recreate it. 84 const archiveExists = db 85 .prepare("SELECT COUNT(*) AS c FROM sqlite_master WHERE type='table' AND name='sites_archive'") 86 .get().c; 87 88 if (archiveExists) { 89 const sitesColCount = db.prepare('SELECT COUNT(*) AS c FROM pragma_table_info(\'sites\')').get().c; 90 const archiveColCount = db.prepare('SELECT COUNT(*) AS c FROM pragma_table_info(\'sites_archive\')').get().c; 91 if (sitesColCount !== archiveColCount) { 92 console.log(`Schema mismatch: sites has ${sitesColCount} cols, archive has ${archiveColCount}. Recreating archive table.`); 93 db.exec('DROP TABLE sites_archive'); 94 } 95 } 96 97 db.exec(` 98 CREATE TABLE IF NOT EXISTS sites_archive AS 99 SELECT * FROM sites WHERE 1 = 0; 100 `); 101 102 // Add a unique index on id to support INSERT OR IGNORE idempotency 103 try { 104 db.exec(`CREATE UNIQUE INDEX IF NOT EXISTS idx_sites_archive_id ON sites_archive(id)`); 105 } catch { 106 // Index may already exist from a prior run — ignore 107 } 108 109 const archiveBefore = db 110 .prepare('SELECT COUNT(*) AS c FROM sites_archive') 111 .get().c; 112 113 console.log(`Existing archive rows: ${archiveBefore.toLocaleString()}`); 114 console.log(''); 115 116 // --------------------------------------------------------------------------- 117 // Migration transaction 118 // --------------------------------------------------------------------------- 119 120 const migrate = db.transaction(() => { 121 // Step 1: Insert ignored rows into archive (skip any already archived) 122 const insertResult = db 123 .prepare( 124 `INSERT OR IGNORE INTO sites_archive 125 SELECT * FROM sites WHERE status = 'ignored'` 126 ) 127 .run(); 128 129 const inserted = insertResult.changes; 130 131 // Step 2: Count archive rows now (should equal archiveBefore + inserted) 132 const archiveAfterInsert = db 133 .prepare('SELECT COUNT(*) AS c FROM sites_archive') 134 .get().c; 135 136 const expectedArchive = archiveBefore + inserted; 137 138 if (archiveAfterInsert !== expectedArchive) { 139 throw new Error( 140 `Archive count mismatch after INSERT: expected ${expectedArchive}, got ${archiveAfterInsert}. Aborting.` 141 ); 142 } 143 144 // Step 3: Delete from sites only for rows we just archived 145 // (rows that exist in sites_archive AND are still in sites) 146 const deleteResult = db 147 .prepare( 148 `DELETE FROM sites 149 WHERE status = 'ignored' 150 AND id IN (SELECT id FROM sites_archive)` 151 ) 152 .run(); 153 154 const deleted = deleteResult.changes; 155 156 // Step 4: Final verification — deleted count must match inserted count 157 if (deleted !== inserted) { 158 throw new Error( 159 `Delete count (${deleted}) does not match insert count (${inserted}). Aborting.` 160 ); 161 } 162 163 return { inserted, deleted, archiveAfterInsert }; 164 }); 165 166 // --------------------------------------------------------------------------- 167 // Execute (or dry-run rollback) 168 // --------------------------------------------------------------------------- 169 170 let stats; 171 172 if (DRY_RUN) { 173 const dryTx = db.transaction(() => { 174 stats = migrate(); 175 throw new Error('DRY_RUN_ROLLBACK'); 176 }); 177 try { 178 dryTx(); 179 } catch (e) { 180 if (e.message !== 'DRY_RUN_ROLLBACK') throw e; 181 } 182 } else { 183 stats = migrate(); 184 } 185 186 // --------------------------------------------------------------------------- 187 // Post-migration report 188 // --------------------------------------------------------------------------- 189 190 const ignoredAfter = db 191 .prepare("SELECT COUNT(*) AS c FROM sites WHERE status = 'ignored'") 192 .get().c; 193 const archiveAfter = db 194 .prepare('SELECT COUNT(*) AS c FROM sites_archive') 195 .get().c; 196 const totalAfter = db 197 .prepare('SELECT COUNT(*) AS c FROM sites') 198 .get().c; 199 200 console.log(`Results${DRY_RUN ? ' (DRY RUN — rolled back)' : ''}:`); 201 console.log(` Rows inserted into sites_archive: ${stats.inserted.toLocaleString()}`); 202 console.log(` Rows deleted from sites: ${stats.deleted.toLocaleString()}`); 203 console.log(''); 204 console.log('Post-migration state:'); 205 console.log(` sites total: ${totalAfter.toLocaleString()}`); 206 console.log(` sites ignored: ${ignoredAfter.toLocaleString()}`); 207 console.log(` sites_archive total: ${archiveAfter.toLocaleString()}`); 208 209 if (!DRY_RUN) { 210 console.log(''); 211 console.log('Migration complete.'); 212 console.log(''); 213 console.log('NOTE: New ignored rows written by the pipeline will accumulate in'); 214 console.log('sites again over time. Re-run this migration periodically to keep'); 215 console.log('the archive current and the main table lean.'); 216 } 217 218 db.close();