reset-agent-circuit-breakers.js
#!/usr/bin/env node

/**
 * Reset Agent Circuit Breakers
 *
 * Resets circuit breakers for agents that have been blocked for longer than
 * the cooldown (30 minutes unless AGENT_CIRCUIT_BREAKER_COOLDOWN overrides it)
 * and, on request, cleans up old failed tasks (>24 hours) to prepare for
 * system activation.
 *
 * Usage:
 *   node scripts/reset-agent-circuit-breakers.js [--dry-run] [--cleanup-tasks] [--force]
 *
 * Options:
 *   --dry-run        Show what would be reset without making changes
 *   --cleanup-tasks  Also clean up old failed tasks (>24 hours)
 *   --force          Reset all circuit breakers regardless of cooldown
 */

import { createDatabaseConnection } from '../src/utils/db.js';
import Logger from '../src/utils/logger.js';

const logger = new Logger('CircuitBreakerReset');

// Parse CLI arguments
const args = process.argv.slice(2);
const isDryRun = args.includes('--dry-run');
const cleanupTasks = args.includes('--cleanup-tasks');
const force = args.includes('--force');

// Default cooldown: 30 minutes
const DEFAULT_COOLDOWN_MINUTES = 30;

/**
 * Turn the raw AGENT_CIRCUIT_BREAKER_COOLDOWN value into a number of minutes.
 *
 * A value that does not parse as an integer would yield NaN, and every
 * `ageMinutes >= NaN` comparison is false, silently disabling cooldown-based
 * resets. Such values fall back to the default instead.
 *
 * @param {string | undefined} raw - Environment variable value, if any.
 * @returns {number} Cooldown in minutes.
 */
function resolveCooldownMinutes(raw) {
  if (!raw) {
    return DEFAULT_COOLDOWN_MINUTES;
  }

  const parsed = Number.parseInt(raw, 10);
  if (Number.isNaN(parsed)) {
    logger.warn(
      `Invalid AGENT_CIRCUIT_BREAKER_COOLDOWN "${raw}" - using ${DEFAULT_COOLDOWN_MINUTES} minutes`
    );
    return DEFAULT_COOLDOWN_MINUTES;
  }

  return parsed;
}

const COOLDOWN_MINUTES = resolveCooldownMinutes(process.env.AGENT_CIRCUIT_BREAKER_COOLDOWN);

// Matches zone-less "YYYY-MM-DD HH:MM:SS[.fff]" strings, the shape SQLite's
// datetime() returns.
const SQLITE_DATETIME_PATTERN = /^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}(\.\d+)?$/;

// Database
const db = createDatabaseConnection();
db.pragma('foreign_keys = ON');

/**
 * Parse an agent's metrics_json column without letting one bad row abort the
 * whole run.
 *
 * @param {string | null} raw - Raw metrics_json value.
 * @param {string} agentName - Agent the value belongs to (used in the warning).
 * @returns {object | null} The parsed object, `{}` when the column is empty,
 *   or `null` when the value is not a JSON object.
 */
function parseMetrics(raw, agentName) {
  if (!raw) {
    return {};
  }

  try {
    const parsed = JSON.parse(raw);
    if (parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed)) {
      return parsed;
    }
    logger.warn(` ${agentName}: metrics_json is not a JSON object`);
  } catch (error) {
    logger.warn(` ${agentName}: metrics_json is not valid JSON (${error.message})`);
  }

  return null;
}

/**
 * Convert a stored timestamp into a Date.
 *
 * NOTE(review): zone-less "YYYY-MM-DD HH:MM:SS" strings are read as UTC on the
 * assumption that they were produced by SQLite's datetime('now'), as the
 * writes in this script are - confirm against whatever sets
 * circuit_breaker_triggered_at. Without this, `new Date` would interpret them
 * as local time and skew the computed age by the UTC offset.
 *
 * @param {unknown} value - Stored timestamp.
 * @returns {Date | null} A valid Date, or `null` when the value is missing or
 *   cannot be parsed.
 */
function parseTimestamp(value) {
  if (!value) {
    return null;
  }

  const normalized =
    typeof value === 'string' && SQLITE_DATETIME_PATTERN.test(value)
      ? `${value.replace(' ', 'T')}Z`
      : value;
  const date = new Date(normalized);

  return Number.isNaN(date.getTime()) ? null : date;
}

/**
 * Get agents with active circuit breakers
 *
 * @returns {Array<{agent_name: string, status: string, metrics_json: string | null}>}
 *   Rows of tel.agent_state whose status is 'blocked'.
 */
function getBlockedAgents() {
  return db
    .prepare(
      `
      SELECT
        agent_name,
        status,
        metrics_json
      FROM tel.agent_state
      WHERE status = 'blocked'
    `
    )
    .all();
}

/**
 * Reset circuit breaker for an agent
 *
 * Sets the agent's status to 'idle', records the recovery time, a manual-reset
 * flag and the reason inside metrics_json, and writes an entry to
 * tel.agent_logs. In dry-run mode it only logs what would happen.
 *
 * @param {string} agentName - Agent whose breaker is reset.
 * @param {string} reason - Human-readable reason stored with the reset.
 * @returns {void}
 */
function resetCircuitBreaker(agentName, reason) {
  if (isDryRun) {
    logger.info(`[DRY RUN] Would reset circuit breaker for ${agentName}: ${reason}`);
    return;
  }

  db.prepare(
    `
    UPDATE tel.agent_state
    SET status = 'idle',
        metrics_json = json_set(
          COALESCE(metrics_json, '{}'),
          '$.circuit_breaker_recovered_at', datetime('now'),
          '$.manual_reset', 1,
          '$.reset_reason', ?
        )
    WHERE agent_name = ?
  `
  ).run(reason, agentName);

  // Log recovery
  db.prepare(
    `
    INSERT INTO tel.agent_logs (agent_name, log_level, message, data_json, created_at)
    VALUES (?, 'info', ?, ?, datetime('now'))
  `
  ).run(
    agentName,
    'Circuit breaker manually reset',
    JSON.stringify({ reason, manual_reset: true })
  );

  logger.success(`Reset circuit breaker for ${agentName}: ${reason}`);
}

/**
 * Get old failed tasks
 *
 * @returns {Array<object>} Failed tasks created more than 24 hours ago, oldest
 *   first, each with an `age_days` column computed by SQLite.
 */
function getOldFailedTasks() {
  return db
    .prepare(
      `
      SELECT
        id,
        task_type,
        assigned_to,
        error_message,
        created_at,
        julianday('now') - julianday(created_at) as age_days
      FROM tel.agent_tasks
      WHERE status = 'failed'
        AND created_at < datetime('now', '-24 hours')
      ORDER BY created_at ASC
    `
    )
    .all();
}

/**
 * Mark old failed tasks as cancelled
 *
 * NOTE(review): the status actually written is 'completed', while the appended
 * marker and the log output describe these tasks as cancelled. Left unchanged
 * because the set of allowed status values is not visible here - confirm
 * which one is intended.
 *
 * @returns {number} Number of rows updated (always 0 in dry-run mode).
 */
function cleanupOldTasks() {
  if (isDryRun) {
    logger.info('[DRY RUN] Would cleanup old failed tasks');
    return 0;
  }

  const result = db
    .prepare(
      `
      UPDATE tel.agent_tasks
      SET status = 'completed',
          error_message = COALESCE(error_message, '') || ' [Auto-cancelled after 24h]',
          completed_at = datetime('now')
      WHERE status = 'failed'
        AND created_at < datetime('now', '-24 hours')
    `
    )
    .run();

  return result.changes;
}

/**
 * Main execution
 *
 * Step 1 resets eligible blocked agents, step 2 optionally cleans up old
 * failed tasks, step 3 prints the status of every agent.
 *
 * @returns {void}
 */
function main() {
  logger.info('Agent Circuit Breaker Reset Tool');
  logger.info('=================================\n');

  if (isDryRun) {
    logger.warn('DRY RUN MODE - No changes will be made\n');
  }

  // Step 1: Check for blocked agents
  const blockedAgents = getBlockedAgents();

  if (blockedAgents.length === 0) {
    logger.success('No agents with active circuit breakers');
  } else {
    logger.info(`Found ${blockedAgents.length} blocked agents:\n`);

    let resetCount = 0;

    for (const agent of blockedAgents) {
      const metrics = parseMetrics(agent.metrics_json, agent.agent_name);

      if (metrics === null) {
        // The reset statement feeds metrics_json to json_set(), so a value
        // that is not a JSON object has to be repaired by hand first.
        logger.warn(` - Action: SKIP (metrics_json must be repaired manually)`);
        logger.info('');
        continue;
      }

      const rawTriggeredAt = metrics.circuit_breaker_triggered_at;
      const triggeredAt = parseTimestamp(rawTriggeredAt);

      if (!triggeredAt) {
        // A present-but-unparseable timestamp is handled like a missing one;
        // formatting it would otherwise throw a RangeError.
        if (rawTriggeredAt) {
          logger.warn(
            ` ${agent.agent_name}: Blocked but trigger timestamp is unparseable - resetting`
          );
          resetCircuitBreaker(agent.agent_name, 'Invalid trigger timestamp');
        } else {
          logger.warn(` ${agent.agent_name}: Blocked but no trigger timestamp found - resetting`);
          resetCircuitBreaker(agent.agent_name, 'No trigger timestamp');
        }
        resetCount++;
        continue;
      }

      const ageMinutes = Math.floor((Date.now() - triggeredAt.getTime()) / 60000);
      const failureRate = metrics.failure_rate || 0;

      logger.info(` ${agent.agent_name}:`);
      logger.info(` - Triggered: ${triggeredAt.toISOString()}`);
      logger.info(` - Age: ${ageMinutes} minutes`);
      logger.info(` - Failure rate: ${(failureRate * 100).toFixed(1)}%`);

      if (force) {
        logger.info(` - Action: FORCE RESET`);
        resetCircuitBreaker(agent.agent_name, `Force reset (age: ${ageMinutes}m)`);
        resetCount++;
      } else if (ageMinutes >= COOLDOWN_MINUTES) {
        logger.info(` - Action: RESET (cooldown expired)`);
        resetCircuitBreaker(agent.agent_name, `Cooldown expired (${COOLDOWN_MINUTES}m threshold)`);
        resetCount++;
      } else {
        const remainingMinutes = COOLDOWN_MINUTES - ageMinutes;
        logger.warn(` - Action: SKIP (cooldown expires in ${remainingMinutes} minutes)`);
        logger.info(` Use --force to reset immediately`);
      }

      logger.info('');
    }

    // In dry-run mode nothing was changed, so do not claim that it was.
    const summaryVerb = isDryRun ? 'Would reset' : 'Reset';
    logger.info(`${summaryVerb} ${resetCount}/${blockedAgents.length} circuit breakers\n`);
  }

  // Step 2: Clean up old failed tasks if requested
  if (cleanupTasks) {
    const oldTasks = getOldFailedTasks();

    if (oldTasks.length === 0) {
      logger.success('No old failed tasks to cleanup');
    } else {
      logger.info(`Found ${oldTasks.length} old failed tasks (>24 hours):\n`);

      // Show sample of tasks
      const sample = oldTasks.slice(0, 5);
      for (const task of sample) {
        const errorText = task.error_message || 'N/A';
        // Only mark the preview as truncated when it actually is.
        const errorPreview =
          errorText.length > 60 ? `${errorText.substring(0, 60)}...` : errorText;

        logger.info(` Task #${task.id} (${task.assigned_to}):`);
        logger.info(` - Type: ${task.task_type}`);
        logger.info(` - Age: ${task.age_days.toFixed(1)} days`);
        logger.info(` - Error: ${errorPreview}`);
        logger.info('');
      }

      if (oldTasks.length > 5) {
        logger.info(` ... and ${oldTasks.length - 5} more\n`);
      }

      const cleaned = cleanupOldTasks();
      if (!isDryRun) {
        logger.success(`Marked ${cleaned} old failed tasks as cancelled\n`);
      }
    }
  }

  // Step 3: Show current agent status
  logger.info('Current Agent Status:');
  logger.info('=====================\n');

  const allAgents = db
    .prepare(
      `
      SELECT
        agent_name,
        status,
        metrics_json
      FROM tel.agent_state
      ORDER BY agent_name
    `
    )
    .all();

  for (const agent of allAgents) {
    // Unusable metrics are reported as zeroes rather than aborting the listing.
    const metrics = parseMetrics(agent.metrics_json, agent.agent_name) || {};
    const status = agent.status === 'blocked' ? '🔴 BLOCKED' : '✅ ACTIVE';

    logger.info(` ${agent.agent_name}: ${status}`);
    logger.info(` - Success rate: ${((metrics.success_rate || 0) * 100).toFixed(1)}%`);
    logger.info(` - Failure rate: ${((metrics.failure_rate || 0) * 100).toFixed(1)}%`);
    logger.info(` - Total tasks (24h): ${metrics.total_tasks_24h || 0}`);
    logger.info('');
  }

  logger.success('Circuit breaker reset complete!');

  if (isDryRun) {
    logger.warn('\nDRY RUN COMPLETE - Run without --dry-run to apply changes');
  }
}

// Run
try {
  main();
} catch (error) {
  logger.error('Failed to reset circuit breakers:', error);
  // process.exit() would terminate immediately and skip the finally block,
  // leaving the database handle unclosed; set the exit code instead.
  process.exitCode = 1;
} finally {
  db.close();
}