/ src / cron / process-reaper.js
process-reaper.js
  1  /**
  2   * Process Reaper - Stale Agent Process Cleanup
  3   *
  4   * Runs every 5 minutes via cron. Kills stale claude subprocesses (background
  5   * agents that have been running too long) and logs system health metrics.
  6   *
  7   * Why zombies happen:
  8   *   The Claude Code Task tool spawns claude subprocesses for background agents.
  9   *   Those agents run git commands via execSync. Under memory pressure (swap
 10   *   thrashing), git can hang. When the parent claude process exits first, the
 11   *   orphaned git process eventually exits and becomes a zombie. Zombies can't
 12   *   be killed — they're already dead process-table entries. Container restart
 13   *   clears them. Prevention is the fix: kill stale parents before they
 14   *   accumulate, and add timeouts to all git execSync calls.
 15   *
 16   * What this does:
 17   *   1. Kill claude subprocesses older than MAX_AGE (default 2h)
 18   *   2. Log zombie count + memory pressure to system_health table
 19   *   3. Emit warnings when memory is critically low
 20   */
 21  
 22  import { execSync } from 'child_process';
 23  import { freemem, totalmem } from 'os';
 24  import { run } from './../utils/db.js';
 25  import { join, dirname } from 'path';
 26  import { fileURLToPath } from 'url';
 27  import Logger from '../utils/logger.js';
 28  import '../utils/load-env.js';
 29  
 30  const __filename = fileURLToPath(import.meta.url);
 31  const __dirname = dirname(__filename);
 32  
 33  const logger = new Logger('ProcessReaper');
 34  
 35  const MAX_AGE_MINUTES = parseInt(process.env.REAPER_MAX_AGENT_AGE_MINUTES || '120', 10);
 36  const ZOMBIE_WARN_THRESHOLD = parseInt(process.env.REAPER_ZOMBIE_WARN_THRESHOLD || '500', 10);
 37  const FREE_MEM_WARN_MB = parseInt(process.env.REAPER_FREE_MEM_WARN_MB || '1024', 10);
 38  const SWAP_WARN_PCT = parseFloat(process.env.REAPER_SWAP_WARN_PCT || '0.7');
 39  
 40  function getZombieCount() {
 41    try {
 42      const out = execSync("ps -eo stat --no-headers | grep -c '^Z' || true", {
 43        encoding: 'utf8',
 44        timeout: 10000,
 45        shell: '/bin/sh',
 46      }).trim();
 47      return parseInt(out, 10) || 0;
 48    } catch {
 49      return 0;
 50    }
 51  }
 52  
 53  function getMemoryInfo() {
 54    const freeMemMB = freemem() / 1024 / 1024;
 55    const totalMemMB = totalmem() / 1024 / 1024;
 56    let swapUsedMB = 0;
 57    let swapTotalMB = 0;
 58  
 59    try {
 60      const [total, used] = execSync("free -m | awk '/^Swap/{print $2, $3}'", {
 61        encoding: 'utf8',
 62        timeout: 5000,
 63        shell: '/bin/sh',
 64      })
 65        .trim()
 66        .split(' ')
 67        .map(Number);
 68      swapTotalMB = total || 0;
 69      swapUsedMB = used || 0;
 70    } catch {
 71      /* ignore */
 72    }
 73  
 74    return { freeMemMB, totalMemMB, swapUsedMB, swapTotalMB };
 75  }
 76  
 77  /**
 78   * Find live (non-zombie) agent subprocesses older than MAX_AGE_MINUTES and kill them.
 79   * Targets both 'claude' and 'npm' processes (agent:run:single workers).
 80   * Only kills processes in zombie-prone parent chains (codium/electron/PID 1).
 81   */
 82  function killStaleAgentProcesses() {
 83    const killed = [];
 84    const currentPid = process.pid;
 85    const maxAgeSeconds = MAX_AGE_MINUTES * 60;
 86  
 87    // Get all non-zombie claude and npm processes older than threshold
 88    let psOutput = '';
 89    try {
 90      // stat field: Z = zombie (skip), others = killable
 91      psOutput = execSync(
 92        'ps -eo pid,ppid,etimes,stat,comm --no-headers | awk \'$5=="claude" || $5~/^npm/{print $0}\'',
 93        { encoding: 'utf8', timeout: 10000, shell: '/bin/sh' }
 94      ).trim();
 95    } catch {
 96      return killed;
 97    }
 98  
 99    if (!psOutput) return killed;
100  
101    for (const line of psOutput.split('\n')) {
102      const parts = line.trim().split(/\s+/);
103      const pid = parseInt(parts[0], 10);
104      const ppid = parseInt(parts[1], 10);
105      const ageSeconds = parseInt(parts[2], 10);
106      const stat = parts[3] || '';
107      const comm = parts[4] || '';
108  
109      if (!pid || pid === currentPid) continue;
110      if (stat.startsWith('Z')) continue; // Skip zombies — already dead, can't kill
111      if (ageSeconds < maxAgeSeconds) continue;
112  
113      // For npm processes, verify they're running agent:run:single (via /proc/PID/cmdline)
114      if (comm.startsWith('npm')) {
115        try {
116          const cmdline = execSync(`cat /proc/${pid}/cmdline 2>/dev/null | tr '\\0' ' '`, {
117            encoding: 'utf8',
118            timeout: 2000,
119            shell: '/bin/sh',
120          }).trim();
121          if (!cmdline.includes('agent')) continue; // Not an agent worker, skip
122        } catch {
123          continue; // Can't read cmdline, skip
124        }
125      }
126  
127      const ageMinutes = Math.floor(ageSeconds / 60);
128      logger.warn(`Killing stale ${comm} PID ${pid} (${ageMinutes}min old, ppid=${ppid})`);
129  
130      try {
131        execSync(`kill -TERM ${pid} 2>/dev/null; sleep 1; kill -KILL ${pid} 2>/dev/null; true`, {
132          encoding: 'utf8',
133          timeout: 6000,
134          shell: '/bin/sh',
135        });
136        killed.push({ pid, ageMinutes, comm });
137      } catch {
138        killed.push({ pid, ageMinutes, comm, note: 'already_gone' });
139      }
140    }
141  
142    return killed;
143  }
144  
/**
 * Persist one reaper result as a row in tel.system_health.
 *
 * After the insert, roughly one call in ten also deletes this check type's
 * rows older than seven days, so the table is pruned without a separate job.
 *
 * @param {string} status - Health status stored in the `status` column.
 * @param {object} details - Metrics object; stored as a JSON string.
 * @param {string|null} actionTaken - Description of the action taken, or null.
 * @returns {Promise<void>}
 */
async function logToDb(status, details, actionTaken) {
  const checkType = 'process_reaper';
  const insertParams = [checkType, status, JSON.stringify(details), actionTaken];

  await run(
    `INSERT INTO tel.system_health (check_type, status, details, action_taken) VALUES ($1, $2, $3, $4)`,
    insertParams
  );

  // Housekeeping (10% chance per run)
  const shouldPrune = Math.random() < 0.1;
  if (!shouldPrune) return;

  await run(
    `DELETE FROM tel.system_health WHERE check_type = $1 AND created_at < NOW() - INTERVAL '7 days'`,
    [checkType]
  );
}
159  
/**
 * Run one reaper pass: sample zombie count and memory, kill stale agent
 * processes, log warnings, record the result in the database and return it.
 *
 * Status is 'warning' when free memory is below FREE_MEM_WARN_MB, swap usage
 * is above SWAP_WARN_PCT, or the zombie count exceeds ZOMBIE_WARN_THRESHOLD;
 * otherwise it is 'ok'.
 *
 * @returns {Promise<object>} The recorded metrics plus `status`,
 *   `duration_seconds` and the `killed` process list.
 */
export async function runProcessReaper() {
  const startedAt = Date.now();
  logger.info('Process reaper starting');

  // Sample first, then reap, so the metrics describe the pre-kill state.
  const zombieCount = getZombieCount();
  const { freeMemMB, swapUsedMB, swapTotalMB } = getMemoryInfo();
  const killed = killStaleAgentProcesses();

  const killedCount = killed.length;
  const killedPids = killed.map(entry => entry.pid);
  const freeMemLabel = freeMemMB.toFixed(0);

  // Swap usage as a fraction; 0 when the host has no swap configured.
  const swapPct = swapTotalMB > 0 ? swapUsedMB / swapTotalMB : 0;
  const lowFreeMem = freeMemMB < FREE_MEM_WARN_MB;
  const heavySwap = swapPct > SWAP_WARN_PCT;
  const memWarning = lowFreeMem || heavySwap;
  const tooManyZombies = zombieCount > ZOMBIE_WARN_THRESHOLD;
  const status = memWarning || tooManyZombies ? 'warning' : 'ok';

  if (zombieCount > 0) {
    logger.warn(
      `${zombieCount} zombie processes detected (already-dead; clears on container restart). ` +
        `Prevention: git execSync timeouts + stale process reaping.`
    );
  }
  if (memWarning) {
    logger.warn(`Memory pressure: ${freeMemLabel}MB free, swap ${(swapPct * 100).toFixed(0)}% used`);
  }
  if (killedCount > 0) {
    logger.info(`Killed ${killedCount} stale agent processes`, {
      pids: killedPids,
    });
  }

  const details = {
    zombie_count: zombieCount,
    zombie_threshold: ZOMBIE_WARN_THRESHOLD,
    free_mem_mb: Math.round(freeMemMB),
    swap_used_mb: swapUsedMB,
    swap_pct: Math.round(swapPct * 100),
    stale_processes_killed: killedCount,
    max_age_minutes: MAX_AGE_MINUTES,
  };

  let actionTaken = null;
  if (killedCount > 0) {
    actionTaken = `Killed PIDs: ${killedPids.join(', ')}`;
  }
  await logToDb(status, details, actionTaken);

  const duration = ((Date.now() - startedAt) / 1000).toFixed(1);
  logger.info(
    `Process reaper done in ${duration}s: ${zombieCount} zombies, ` +
      `${freeMemLabel}MB free, ${killedCount} stale killed`
  );

  return { ...details, status, duration_seconds: parseFloat(duration), killed };
}
213  
// CLI support: run the reaper only when this file is the script Node was
// started with, not when it is imported by another module.
//
// The check compares filesystem paths instead of gluing "file://" onto
// process.argv[1]: import.meta.url is a percent-encoded URL, so the string
// concatenation never matches when the path contains spaces or other
// characters that get encoded, nor on Windows drive-letter paths.
if (process.argv[1] && fileURLToPath(import.meta.url) === process.argv[1]) {
  runProcessReaper()
    .then(result => {
      console.log(JSON.stringify(result, null, 2));
      process.exit(0);
    })
    .catch(err => {
      // A rejection is not guaranteed to be an Error instance.
      console.error('Process reaper error:', err instanceof Error ? err.message : err);
      process.exit(1);
    });
}
225  }