/ scripts / reset-agent-circuit-breakers.js
reset-agent-circuit-breakers.js
  1  #!/usr/bin/env node
  2  
  3  /**
  4   * Reset Agent Circuit Breakers
  5   *
  6   * Resets circuit breakers for agents that have been blocked for >30 minutes
  7   * and cleans up old failed tasks (>24 hours) to prepare for system activation.
  8   *
  9   * Usage:
 *   node scripts/reset-agent-circuit-breakers.js [--dry-run] [--cleanup-tasks] [--force]
 11   *
 12   * Options:
 13   *   --dry-run         Show what would be reset without making changes
 14   *   --cleanup-tasks   Also clean up old failed tasks (>24 hours)
 15   *   --force           Reset all circuit breakers regardless of cooldown
 16   */
 17  
 18  import { createDatabaseConnection } from '../src/utils/db.js';
 19  import Logger from '../src/utils/logger.js';
 20  
 21  const logger = new Logger('CircuitBreakerReset');
 22  
 23  // Parse CLI arguments
 24  const args = process.argv.slice(2);
 25  const isDryRun = args.includes('--dry-run');
 26  const cleanupTasks = args.includes('--cleanup-tasks');
 27  const force = args.includes('--force');
 28  
 29  // Default cooldown: 30 minutes
 30  const COOLDOWN_MINUTES = parseInt(process.env.AGENT_CIRCUIT_BREAKER_COOLDOWN || '30', 10);
 31  
 32  // Database
 33  const db = createDatabaseConnection();
 34  db.pragma('foreign_keys = ON');
 35  
 36  /**
 37   * Get agents with active circuit breakers
 38   */
 39  function getBlockedAgents() {
 40    return db
 41      .prepare(
 42        `
 43      SELECT
 44        agent_name,
 45        status,
 46        metrics_json
 47      FROM tel.agent_state
 48      WHERE status = 'blocked'
 49    `
 50      )
 51      .all();
 52  }
 53  
 54  /**
 55   * Reset circuit breaker for an agent
 56   */
 57  function resetCircuitBreaker(agentName, reason) {
 58    if (isDryRun) {
 59      logger.info(`[DRY RUN] Would reset circuit breaker for ${agentName}: ${reason}`);
 60      return;
 61    }
 62  
 63    db.prepare(
 64      `
 65      UPDATE tel.agent_state
 66      SET status = 'idle',
 67          metrics_json = json_set(
 68            COALESCE(metrics_json, '{}'),
 69            '$.circuit_breaker_recovered_at', datetime('now'),
 70            '$.manual_reset', 1,
 71            '$.reset_reason', ?
 72          )
 73      WHERE agent_name = ?
 74    `
 75    ).run(reason, agentName);
 76  
 77    // Log recovery
 78    db.prepare(
 79      `
 80      INSERT INTO tel.agent_logs (agent_name, log_level, message, data_json, created_at)
 81      VALUES (?, 'info', ?, ?, datetime('now'))
 82    `
 83    ).run(
 84      agentName,
 85      'Circuit breaker manually reset',
 86      JSON.stringify({ reason, manual_reset: true })
 87    );
 88  
 89    logger.success(`Reset circuit breaker for ${agentName}: ${reason}`);
 90  }
 91  
 92  /**
 93   * Get old failed tasks
 94   */
 95  function getOldFailedTasks() {
 96    return db
 97      .prepare(
 98        `
 99      SELECT
100        id,
101        task_type,
102        assigned_to,
103        error_message,
104        created_at,
105        julianday('now') - julianday(created_at) as age_days
106      FROM tel.agent_tasks
107      WHERE status = 'failed'
108        AND created_at < datetime('now', '-24 hours')
109      ORDER BY created_at ASC
110    `
111      )
112      .all();
113  }
114  
/**
 * Close out failed tasks that were created more than 24 hours ago.
 *
 * Each matching row gets a new status, an ' [Auto-cancelled after 24h]' suffix
 * on error_message, and completed_at set to the current time. In dry-run mode
 * only a log line is emitted.
 *
 * NOTE(review): the statement writes status 'completed' although the comments
 * and messages around it speak of "cancelled" - confirm which status the
 * tel.agent_tasks schema expects here.
 *
 * @returns {number} Number of rows changed; always 0 in dry-run mode.
 */
function cleanupOldTasks() {
  if (isDryRun) {
    logger.info('[DRY RUN] Would cleanup old failed tasks');
    return 0;
  }

  const closeStaleFailures = db.prepare(
    `
    UPDATE tel.agent_tasks
    SET status = 'completed',
        error_message = COALESCE(error_message, '') || ' [Auto-cancelled after 24h]',
        completed_at = datetime('now')
    WHERE status = 'failed'
      AND created_at < datetime('now', '-24 hours')
  `
  );

  const { changes } = closeStaleFailures.run();
  return changes;
}
139  
// Number of old failed tasks printed in detail before the list is summarized.
const TASK_SAMPLE_SIZE = 5;

// Maximum number of error-message characters shown per task.
const ERROR_PREVIEW_LENGTH = 60;

/**
 * Decode an agent's metrics_json column without throwing.
 *
 * @param {string|null|undefined} rawJson - Raw column value.
 * @returns {object|null} The decoded object; an empty object when the column
 *   is empty or holds a non-object JSON value (such as `null`); null when the
 *   text is not valid JSON.
 */
function parseAgentMetrics(rawJson) {
  if (!rawJson) {
    return {};
  }

  try {
    const decoded = JSON.parse(rawJson);
    return decoded !== null && typeof decoded === 'object' ? decoded : {};
  } catch {
    return null;
  }
}

/**
 * Convert a stored circuit-breaker trigger timestamp into a Date.
 *
 * NOTE(review): `new Date()` reads zone-less date-time strings as local time;
 * if producers store UTC text the computed age is skewed by the UTC offset -
 * confirm the stored format.
 *
 * @param {*} rawTimestamp - Value of metrics.circuit_breaker_triggered_at.
 * @returns {Date|null} The parsed date, or null when the value is missing or
 *   cannot be parsed (an Invalid Date would make toISOString() throw).
 */
function parseTriggerTime(rawTimestamp) {
  if (!rawTimestamp) {
    return null;
  }

  const parsed = new Date(rawTimestamp);
  return Number.isNaN(parsed.getTime()) ? null : parsed;
}

/**
 * Build the one-line error preview shown for a task.
 *
 * @param {string|null|undefined} errorMessage - Task error message.
 * @returns {string} 'N/A' when empty; otherwise the message, cut to
 *   ERROR_PREVIEW_LENGTH characters with '...' appended only if it was cut.
 */
function previewError(errorMessage) {
  const text = errorMessage || 'N/A';
  return text.length > ERROR_PREVIEW_LENGTH
    ? `${text.substring(0, ERROR_PREVIEW_LENGTH)}...`
    : text;
}

/**
 * Step 1: inspect every blocked agent and reset those that qualify.
 *
 * An agent is reset when it has no usable trigger timestamp, when --force is
 * set, or when it has been blocked for at least COOLDOWN_MINUTES.
 */
function resetEligibleAgents() {
  const blockedAgents = getBlockedAgents();

  if (blockedAgents.length === 0) {
    logger.success('No agents with active circuit breakers');
    return;
  }

  logger.info(`Found ${blockedAgents.length} blocked agents:\n`);

  let resetCount = 0;

  for (const agent of blockedAgents) {
    const metrics = parseAgentMetrics(agent.metrics_json);

    // A corrupt metrics blob must not abort the run for the remaining agents.
    if (metrics === null) {
      logger.warn(`  ${agent.agent_name}: metrics_json is not valid JSON - skipping`);
      continue;
    }

    const triggeredAt = parseTriggerTime(metrics.circuit_breaker_triggered_at);

    if (!triggeredAt) {
      logger.warn(`  ${agent.agent_name}: Blocked but no trigger timestamp found - resetting`);
      resetCircuitBreaker(agent.agent_name, 'No trigger timestamp');
      resetCount++;
      continue;
    }

    const ageMinutes = Math.floor((Date.now() - triggeredAt.getTime()) / 60000);
    const failureRate = metrics.failure_rate || 0;

    logger.info(`  ${agent.agent_name}:`);
    logger.info(`    - Triggered: ${triggeredAt.toISOString()}`);
    logger.info(`    - Age: ${ageMinutes} minutes`);
    logger.info(`    - Failure rate: ${(failureRate * 100).toFixed(1)}%`);

    if (force) {
      logger.info(`    - Action: FORCE RESET`);
      resetCircuitBreaker(agent.agent_name, `Force reset (age: ${ageMinutes}m)`);
      resetCount++;
    } else if (ageMinutes >= COOLDOWN_MINUTES) {
      logger.info(`    - Action: RESET (cooldown expired)`);
      resetCircuitBreaker(agent.agent_name, `Cooldown expired (${COOLDOWN_MINUTES}m threshold)`);
      resetCount++;
    } else {
      const remainingMinutes = COOLDOWN_MINUTES - ageMinutes;
      logger.warn(`    - Action: SKIP (cooldown expires in ${remainingMinutes} minutes)`);
      logger.info(`      Use --force to reset immediately`);
    }

    logger.info('');
  }

  logger.info(`Reset ${resetCount}/${blockedAgents.length} circuit breakers\n`);
}

/**
 * Step 2: list a sample of old failed tasks, then run the cleanup.
 */
function reportAndCleanOldTasks() {
  const oldTasks = getOldFailedTasks();

  if (oldTasks.length === 0) {
    logger.success('No old failed tasks to cleanup');
    return;
  }

  logger.info(`Found ${oldTasks.length} old failed tasks (>24 hours):\n`);

  // Show sample of tasks
  for (const task of oldTasks.slice(0, TASK_SAMPLE_SIZE)) {
    logger.info(`  Task #${task.id} (${task.assigned_to}):`);
    logger.info(`    - Type: ${task.task_type}`);
    logger.info(`    - Age: ${task.age_days.toFixed(1)} days`);
    logger.info(`    - Error: ${previewError(task.error_message)}`);
    logger.info('');
  }

  if (oldTasks.length > TASK_SAMPLE_SIZE) {
    logger.info(`  ... and ${oldTasks.length - TASK_SAMPLE_SIZE} more\n`);
  }

  const cleaned = cleanupOldTasks();
  if (!isDryRun) {
    logger.success(`Marked ${cleaned} old failed tasks as cancelled\n`);
  }
}

/**
 * Step 3: print status and headline metrics for every agent.
 */
function printAgentStatus() {
  logger.info('Current Agent Status:');
  logger.info('=====================\n');

  const allAgents = db
    .prepare(
      `
    SELECT
      agent_name,
      status,
      metrics_json
    FROM tel.agent_state
    ORDER BY agent_name
  `
    )
    .all();

  for (const agent of allAgents) {
    // Unreadable metrics are shown as zeros rather than aborting the report.
    const metrics = parseAgentMetrics(agent.metrics_json) || {};
    const status = agent.status === 'blocked' ? '🔴 BLOCKED' : '✅ ACTIVE';

    logger.info(`  ${agent.agent_name}: ${status}`);
    logger.info(`    - Success rate: ${((metrics.success_rate || 0) * 100).toFixed(1)}%`);
    logger.info(`    - Failure rate: ${((metrics.failure_rate || 0) * 100).toFixed(1)}%`);
    logger.info(`    - Total tasks (24h): ${metrics.total_tasks_24h || 0}`);
    logger.info('');
  }
}

/**
 * Main execution: reset eligible circuit breakers, optionally clean up old
 * failed tasks (--cleanup-tasks), then print the status of every agent.
 *
 * @returns {void}
 */
function main() {
  logger.info('Agent Circuit Breaker Reset Tool');
  logger.info('=================================\n');

  if (isDryRun) {
    logger.warn('DRY RUN MODE - No changes will be made\n');
  }

  // Step 1: Check for blocked agents
  resetEligibleAgents();

  // Step 2: Clean up old failed tasks if requested
  if (cleanupTasks) {
    reportAndCleanOldTasks();
  }

  // Step 3: Show current agent status
  printAgentStatus();

  logger.success('Circuit breaker reset complete!');

  if (isDryRun) {
    logger.warn('\nDRY RUN COMPLETE - Run without --dry-run to apply changes');
  }
}
266  
// Run. A failure is reported through process.exitCode instead of
// process.exit(): process.exit() terminates the process immediately, which
// would skip the `finally` block and leave the database handle unclosed.
try {
  main();
} catch (error) {
  logger.error('Failed to reset circuit breakers:', error);
  process.exitCode = 1;
} finally {
  db.close();
}