// process-reaper.js
/**
 * Process Reaper - Stale Agent Process Cleanup
 *
 * Runs every 5 minutes via cron. Kills stale claude subprocesses (background
 * agents that have been running too long) and logs system health metrics.
 *
 * Why zombies happen:
 * The Claude Code Task tool spawns claude subprocesses for background agents.
 * Those agents run git commands via execSync. Under memory pressure (swap
 * thrashing), git can hang. When the parent claude process exits first, the
 * orphaned git process eventually exits and becomes a zombie. Zombies can't
 * be killed — they're already dead process-table entries. Container restart
 * clears them. Prevention is the fix: kill stale parents before they
 * accumulate, and add timeouts to all git execSync calls.
 *
 * What this does:
 * 1. Kill claude subprocesses older than MAX_AGE (default 2h)
 * 2. Log zombie count + memory pressure to system_health table
 * 3. Emit warnings when memory is critically low
 */

import { execSync } from 'child_process';
import { freemem, totalmem } from 'os';
import { run } from './../utils/db.js';
import { join, dirname } from 'path';
import { fileURLToPath, pathToFileURL } from 'url';
import Logger from '../utils/logger.js';
import '../utils/load-env.js';

const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

const logger = new Logger('ProcessReaper');

/**
 * Read a numeric setting from the environment, falling back to a default when
 * the variable is unset, empty, unparseable, or below the allowed minimum.
 *
 * Falling back matters for safety: a NaN threshold makes every `<` / `>`
 * comparison false, which would either disable warnings or (for the max-age
 * setting) make every candidate process look stale.
 *
 * @param {string} name - Environment variable name.
 * @param {number} fallback - Value used when the variable is missing or invalid.
 * @param {object} [options]
 * @param {(raw: string) => number} [options.parse] - Parser applied to the raw string.
 * @param {number} [options.min] - Smallest accepted value (inclusive).
 * @returns {number} The parsed value, or `fallback`.
 */
function readNumericEnv(name, fallback, { parse = (raw) => parseInt(raw, 10), min = 0 } = {}) {
  const raw = process.env[name];
  if (!raw) return fallback;

  const value = parse(raw);
  if (!Number.isFinite(value) || value < min) {
    logger.warn(`Ignoring invalid ${name}=${JSON.stringify(raw)}; using default ${fallback}`);
    return fallback;
  }
  return value;
}

// A max age below one minute would make essentially every agent "stale".
const MAX_AGE_MINUTES = readNumericEnv('REAPER_MAX_AGENT_AGE_MINUTES', 120, { min: 1 });
const ZOMBIE_WARN_THRESHOLD = readNumericEnv('REAPER_ZOMBIE_WARN_THRESHOLD', 500);
const FREE_MEM_WARN_MB = readNumericEnv('REAPER_FREE_MEM_WARN_MB', 1024);
const SWAP_WARN_PCT = readNumericEnv('REAPER_SWAP_WARN_PCT', 0.7, { parse: parseFloat });

/**
 * Count processes whose `ps` state starts with `Z` (zombie).
 *
 * @returns {number} Zombie count, or 0 when `ps` fails or its output is not a number.
 */
function getZombieCount() {
  try {
    // `|| true` keeps the shell exit status at 0 when grep finds no matches.
    const out = execSync("ps -eo stat --no-headers | grep -c '^Z' || true", {
      encoding: 'utf8',
      timeout: 10000,
      shell: '/bin/sh',
    }).trim();
    return parseInt(out, 10) || 0;
  } catch {
    return 0;
  }
}

/**
 * Collect free/total RAM (from `os`) and swap usage (from `free -m`).
 *
 * @returns {{freeMemMB: number, totalMemMB: number, swapUsedMB: number, swapTotalMB: number}}
 *   Swap figures stay 0 when `free` is unavailable or prints no Swap line.
 */
function getMemoryInfo() {
  const freeMemMB = freemem() / 1024 / 1024;
  const totalMemMB = totalmem() / 1024 / 1024;
  let swapUsedMB = 0;
  let swapTotalMB = 0;

  try {
    const [total, used] = execSync("free -m | awk '/^Swap/{print $2, $3}'", {
      encoding: 'utf8',
      timeout: 5000,
      shell: '/bin/sh',
    })
      .trim()
      .split(' ')
      .map(Number);
    swapTotalMB = total || 0;
    swapUsedMB = used || 0;
  } catch {
    /* swap stats are best-effort; keep the zero defaults */
  }

  return { freeMemMB, totalMemMB, swapUsedMB, swapTotalMB };
}

/**
 * Find live (non-zombie) agent subprocesses older than MAX_AGE_MINUTES and kill them.
 *
 * Candidates are processes whose command name is exactly `claude` or starts
 * with `npm`. An `npm` process is only killed when its /proc cmdline contains
 * the substring "agent". The reaper never kills itself or its direct parent.
 *
 * Each victim gets SIGTERM, a one-second grace period, then SIGKILL.
 *
 * @returns {Array<{pid: number, ageMinutes: number, comm: string, note?: string}>}
 *   One entry per process a kill was attempted on. Empty when `ps` fails or
 *   nothing qualifies.
 */
function killStaleAgentProcesses() {
  const killed = [];
  const currentPid = process.pid;
  const parentPid = process.ppid;
  const maxAgeSeconds = MAX_AGE_MINUTES * 60;

  // List every claude/npm process; age and state filtering happens below.
  let psOutput = '';
  try {
    psOutput = execSync(
      'ps -eo pid,ppid,etimes,stat,comm --no-headers | awk \'$5=="claude" || $5~/^npm/{print $0}\'',
      { encoding: 'utf8', timeout: 10000, shell: '/bin/sh' }
    ).trim();
  } catch {
    return killed;
  }

  if (!psOutput) return killed;

  for (const line of psOutput.split('\n')) {
    const parts = line.trim().split(/\s+/);
    const pid = parseInt(parts[0], 10);
    const ppid = parseInt(parts[1], 10);
    const ageSeconds = parseInt(parts[2], 10);
    const stat = parts[3] || '';
    const comm = parts[4] || '';

    // Never reap ourselves or the process that launched us (e.g. a long-lived
    // npm/claude parent that scheduled this run).
    if (!pid || pid === currentPid || pid === parentPid) continue;
    if (stat.startsWith('Z')) continue; // Skip zombies — already dead, can't kill
    // Fail closed: an unparseable age must not be treated as "old enough".
    if (!Number.isFinite(ageSeconds) || ageSeconds < maxAgeSeconds) continue;

    // For npm processes, only proceed when the command line mentions "agent".
    if (comm.startsWith('npm')) {
      try {
        const cmdline = execSync(`cat /proc/${pid}/cmdline 2>/dev/null | tr '\\0' ' '`, {
          encoding: 'utf8',
          timeout: 2000,
          shell: '/bin/sh',
        }).trim();
        if (!cmdline.includes('agent')) continue; // Not an agent worker, skip
      } catch {
        continue; // Can't read cmdline, skip
      }
    }

    const ageMinutes = Math.floor(ageSeconds / 60);
    logger.warn(`Killing stale ${comm} PID ${pid} (${ageMinutes}min old, ppid=${ppid})`);

    try {
      // The trailing `true` forces exit status 0, so this only throws when the
      // shell itself fails or the timeout is hit.
      execSync(`kill -TERM ${pid} 2>/dev/null; sleep 1; kill -KILL ${pid} 2>/dev/null; true`, {
        encoding: 'utf8',
        timeout: 6000,
        shell: '/bin/sh',
      });
      killed.push({ pid, ageMinutes, comm });
    } catch {
      killed.push({ pid, ageMinutes, comm, note: 'already_gone' });
    }
  }

  return killed;
}

/**
 * Insert one `process_reaper` row into tel.system_health and occasionally
 * prune rows older than seven days.
 *
 * @param {string} status - Health status for this run ('ok' or 'warning').
 * @param {object} details - Metrics object; stored as a JSON string.
 * @param {string|null} actionTaken - Description of kills performed, or null.
 * @returns {Promise<void>} Rejects if either query rejects.
 */
async function logToDb(status, details, actionTaken) {
  await run(
    `INSERT INTO tel.system_health (check_type, status, details, action_taken) VALUES ($1, $2, $3, $4)`,
    ['process_reaper', status, JSON.stringify(details), actionTaken]
  );

  // Housekeeping (10% chance per run)
  if (Math.random() < 0.1) {
    await run(
      `DELETE FROM tel.system_health WHERE check_type = $1 AND created_at < NOW() - INTERVAL '7 days'`,
      ['process_reaper']
    );
  }
}

/**
 * Run one reaper pass: sample zombie count and memory, kill stale agent
 * processes, log warnings, and persist the metrics.
 *
 * @returns {Promise<object>} The persisted metrics plus `status`,
 *   `duration_seconds` and the `killed` list.
 */
export async function runProcessReaper() {
  const startTime = Date.now();
  logger.info('Process reaper starting');

  const zombieCount = getZombieCount();
  const mem = getMemoryInfo();
  const killed = killStaleAgentProcesses();

  // Guard the division: hosts without swap report a total of 0.
  const swapPct = mem.swapTotalMB > 0 ? mem.swapUsedMB / mem.swapTotalMB : 0;
  const memWarning = mem.freeMemMB < FREE_MEM_WARN_MB || swapPct > SWAP_WARN_PCT;
  const status = memWarning || zombieCount > ZOMBIE_WARN_THRESHOLD ? 'warning' : 'ok';

  if (zombieCount > 0) {
    logger.warn(
      `${zombieCount} zombie processes detected (already-dead; clears on container restart). ` +
        `Prevention: git execSync timeouts + stale process reaping.`
    );
  }
  if (memWarning) {
    logger.warn(
      `Memory pressure: ${mem.freeMemMB.toFixed(0)}MB free, swap ${(swapPct * 100).toFixed(0)}% used`
    );
  }
  if (killed.length > 0) {
    logger.info(`Killed ${killed.length} stale agent processes`, {
      pids: killed.map(p => p.pid),
    });
  }

  const details = {
    zombie_count: zombieCount,
    zombie_threshold: ZOMBIE_WARN_THRESHOLD,
    free_mem_mb: Math.round(mem.freeMemMB),
    swap_used_mb: mem.swapUsedMB,
    swap_pct: Math.round(swapPct * 100),
    stale_processes_killed: killed.length,
    max_age_minutes: MAX_AGE_MINUTES,
  };

  await logToDb(
    status,
    details,
    killed.length > 0 ? `Killed PIDs: ${killed.map(p => p.pid).join(', ')}` : null
  );

  const duration = ((Date.now() - startTime) / 1000).toFixed(1);
  logger.info(
    `Process reaper done in ${duration}s: ${zombieCount} zombies, ` +
      `${mem.freeMemMB.toFixed(0)}MB free, ${killed.length} stale killed`
  );

  return { ...details, status, duration_seconds: parseFloat(duration), killed };
}

// CLI support: run only when this file is the script Node was started with.
// pathToFileURL resolves relative paths and percent-encodes special characters,
// which a hand-built `file://` string does not.
const entryPoint = process.argv[1];
if (entryPoint && import.meta.url === pathToFileURL(entryPoint).href) {
  runProcessReaper()
    .then(r => {
      console.log(JSON.stringify(r, null, 2));
      process.exit(0);
    })
    .catch(err => {
      console.error('Process reaper error:', err.message);
      process.exit(1);
    });
}