/ db / migrations / 116-archive-ignored-sites.js
116-archive-ignored-sites.js
  1  #!/usr/bin/env node
  2  /**
  3   * 116-archive-ignored-sites.js
  4   *
  5   * One-time migration: moves 826K+ status='ignored' rows from the main sites
  6   * table into a sites_archive table. Ignored rows are never queried by the
  7   * pipeline — they participate in every full-table scan as dead weight (69% of
  8   * all rows at time of writing).
  9   *
 10   * The archive table uses CREATE TABLE AS SELECT to mirror the exact column
 11   * layout of sites at the time the migration runs, so it stays valid even if
 12   * future migrations add columns to sites (they won't be in the archive, which
 13   * is intentional — the archive is read-only historical data).
 14   *
 15   * Safety:
 16   *   - Counts are verified before DELETE commits
 17   *   - Entire operation runs in a single transaction
 18   *   - Supports --dry-run to preview without writing
 19   *   - Idempotent: safe to re-run (uses INSERT OR IGNORE; skips if archive already
 20   *     contains matching ids)
 21   *
 22   * WARNING: This migration is one-time and largely irreversible. The DELETE from
 23   * sites cannot be undone without restoring from backup. The archive table persists
 24   * in the database for audit/reference purposes.
 25   *
 26   * New ignored rows written by the pipeline after this migration runs will remain
 27   * in the sites table (the pipeline must continue writing there). Run this migration
 28   * periodically or schedule it as a cron job to keep the archive current.
 29   *
 30   * Usage:
 31   *   node db/migrations/116-archive-ignored-sites.js           # live run
 32   *   node db/migrations/116-archive-ignored-sites.js --dry-run # preview only
 33   *   DATABASE_PATH=/path/to/sites.db node db/migrations/116-archive-ignored-sites.js
 34   */
 35  
 36  import Database from 'better-sqlite3';
 37  import { existsSync } from 'fs';
 38  import path from 'path';
 39  import { fileURLToPath } from 'url';
 40  
 41  const __dirname = path.dirname(fileURLToPath(import.meta.url));
 42  const DRY_RUN = process.argv.includes('--dry-run');
 43  
 44  const dbPath =
 45    process.env.DATABASE_PATH || path.resolve(__dirname, '../../db/sites.db');
 46  
 47  if (!existsSync(dbPath)) {
 48    console.error(`ERROR: Database not found at: ${dbPath}`);
 49    process.exit(1);
 50  }
 51  
 52  const db = new Database(dbPath);
 53  db.pragma('journal_mode = WAL');
 54  db.pragma('busy_timeout = 60000');
 55  db.pragma('foreign_keys = OFF'); // sites_archive has no FKs to maintain
 56  
 57  // ---------------------------------------------------------------------------
 58  // Pre-migration counts
 59  // ---------------------------------------------------------------------------
 60  
 61  const ignoredInSites = db
 62    .prepare("SELECT COUNT(*) AS c FROM sites WHERE status = 'ignored'")
 63    .get().c;
 64  
 65  console.log('=== 116-archive-ignored-sites migration ===');
 66  console.log(`Database:   ${dbPath}`);
 67  if (DRY_RUN) console.log('MODE:       DRY RUN — no changes will be committed');
 68  else console.log('MODE:       LIVE — changes will be committed');
 69  console.log('');
 70  console.log(`Ignored rows in sites:   ${ignoredInSites.toLocaleString()}`);
 71  
 72  if (ignoredInSites === 0) {
 73    console.log('Nothing to archive. Exiting.');
 74    db.close();
 75    process.exit(0);
 76  }
 77  
 78  // ---------------------------------------------------------------------------
 79  // Ensure archive table exists (schema mirrors sites at migration time)
 80  // ---------------------------------------------------------------------------
 81  
 82  // If the archive table exists but has a different column count than sites,
 83  // it was created from an older schema — drop and recreate it.
 84  const archiveExists = db
 85    .prepare("SELECT COUNT(*) AS c FROM sqlite_master WHERE type='table' AND name='sites_archive'")
 86    .get().c;
 87  
 88  if (archiveExists) {
 89    const sitesColCount = db.prepare('SELECT COUNT(*) AS c FROM pragma_table_info(\'sites\')').get().c;
 90    const archiveColCount = db.prepare('SELECT COUNT(*) AS c FROM pragma_table_info(\'sites_archive\')').get().c;
 91    if (sitesColCount !== archiveColCount) {
 92      console.log(`Schema mismatch: sites has ${sitesColCount} cols, archive has ${archiveColCount}. Recreating archive table.`);
 93      db.exec('DROP TABLE sites_archive');
 94    }
 95  }
 96  
 97  db.exec(`
 98    CREATE TABLE IF NOT EXISTS sites_archive AS
 99      SELECT * FROM sites WHERE 1 = 0;
100  `);
101  
102  // Add a unique index on id to support INSERT OR IGNORE idempotency
103  try {
104    db.exec(`CREATE UNIQUE INDEX IF NOT EXISTS idx_sites_archive_id ON sites_archive(id)`);
105  } catch {
106    // Index may already exist from a prior run — ignore
107  }
108  
109  const archiveBefore = db
110    .prepare('SELECT COUNT(*) AS c FROM sites_archive')
111    .get().c;
112  
113  console.log(`Existing archive rows:   ${archiveBefore.toLocaleString()}`);
114  console.log('');
115  
116  // ---------------------------------------------------------------------------
117  // Migration transaction
118  // ---------------------------------------------------------------------------
119  
120  const migrate = db.transaction(() => {
121    // Step 1: Insert ignored rows into archive (skip any already archived)
122    const insertResult = db
123      .prepare(
124        `INSERT OR IGNORE INTO sites_archive
125         SELECT * FROM sites WHERE status = 'ignored'`
126      )
127      .run();
128  
129    const inserted = insertResult.changes;
130  
131    // Step 2: Count archive rows now (should equal archiveBefore + inserted)
132    const archiveAfterInsert = db
133      .prepare('SELECT COUNT(*) AS c FROM sites_archive')
134      .get().c;
135  
136    const expectedArchive = archiveBefore + inserted;
137  
138    if (archiveAfterInsert !== expectedArchive) {
139      throw new Error(
140        `Archive count mismatch after INSERT: expected ${expectedArchive}, got ${archiveAfterInsert}. Aborting.`
141      );
142    }
143  
144    // Step 3: Delete from sites only for rows we just archived
145    //         (rows that exist in sites_archive AND are still in sites)
146    const deleteResult = db
147      .prepare(
148        `DELETE FROM sites
149         WHERE status = 'ignored'
150           AND id IN (SELECT id FROM sites_archive)`
151      )
152      .run();
153  
154    const deleted = deleteResult.changes;
155  
156    // Step 4: Final verification — deleted count must match inserted count
157    if (deleted !== inserted) {
158      throw new Error(
159        `Delete count (${deleted}) does not match insert count (${inserted}). Aborting.`
160      );
161    }
162  
163    return { inserted, deleted, archiveAfterInsert };
164  });
165  
166  // ---------------------------------------------------------------------------
167  // Execute (or dry-run rollback)
168  // ---------------------------------------------------------------------------
169  
170  let stats;
171  
172  if (DRY_RUN) {
173    const dryTx = db.transaction(() => {
174      stats = migrate();
175      throw new Error('DRY_RUN_ROLLBACK');
176    });
177    try {
178      dryTx();
179    } catch (e) {
180      if (e.message !== 'DRY_RUN_ROLLBACK') throw e;
181    }
182  } else {
183    stats = migrate();
184  }
185  
186  // ---------------------------------------------------------------------------
187  // Post-migration report
188  // ---------------------------------------------------------------------------
189  
190  const ignoredAfter = db
191    .prepare("SELECT COUNT(*) AS c FROM sites WHERE status = 'ignored'")
192    .get().c;
193  const archiveAfter = db
194    .prepare('SELECT COUNT(*) AS c FROM sites_archive')
195    .get().c;
196  const totalAfter = db
197    .prepare('SELECT COUNT(*) AS c FROM sites')
198    .get().c;
199  
200  console.log(`Results${DRY_RUN ? ' (DRY RUN — rolled back)' : ''}:`);
201  console.log(`  Rows inserted into sites_archive: ${stats.inserted.toLocaleString()}`);
202  console.log(`  Rows deleted from sites:          ${stats.deleted.toLocaleString()}`);
203  console.log('');
204  console.log('Post-migration state:');
205  console.log(`  sites total:         ${totalAfter.toLocaleString()}`);
206  console.log(`  sites ignored:       ${ignoredAfter.toLocaleString()}`);
207  console.log(`  sites_archive total: ${archiveAfter.toLocaleString()}`);
208  
209  if (!DRY_RUN) {
210    console.log('');
211    console.log('Migration complete.');
212    console.log('');
213    console.log('NOTE: New ignored rows written by the pipeline will accumulate in');
214    console.log('sites again over time. Re-run this migration periodically to keep');
215    console.log('the archive current and the main table lean.');
216  }
217  
218  db.close();