/ tests / cron / process-guardian.test.js
process-guardian.test.js
  1  /**
  2   * Tests for src/cron/process-guardian.js
  3   *
  4   * Covers all 6 health checks:
  5   *  1. checkPipelineService — active/inactive/cooldown/crash-loop paths
  6   *  2. checkClearanceCycle — running/completed transition
  7   *  3. checkCircuitBreaker — ok/critical paths
  8   *  4. cleanExpiredRateLimits — ok/error paths
  9   *  5. checkBrowserLoopHung — no data/vision-disabled/hung/cooldown/ok paths
 10   *  6. checkCronTimer — ok/infinity/cooldown paths
 11   *  Plus: writeStatusFile, cleanupOldRecords, runProcessGuardian
 12   *
 13   * Strategy: mock child_process (execSync) and rate-limit-scheduler at module level,
 14   * use pg-mock (in-memory SQLite via db.js mock) for database interactions.
 15   * NODE_ENV=test ensures Logger skips createWriteStream.
 16   *
 17   * NOTE: requires --experimental-test-module-mocks
 18   */
 19  
 20  import { test, describe, mock, before, after } from 'node:test';
 21  import assert from 'node:assert/strict';
 22  import Database from 'better-sqlite3';
 23  import { createPgMock } from '../helpers/pg-mock.js';
 24  
 25  // ── Mutable stubs ────────────────────────────────────────────────────────────
 26  
 27  let execSyncFn = (_cmd, _opts) => '';
 28  let getSkipStagesFn = () => new Set();
 29  let getRateLimitStatusFn = () => [];
 30  
 31  // ─── Create in-memory test DB ─────────────────────────────────────────────────
 32  
 33  const db = new Database(':memory:');
 34  
 35  db.exec(`
 36    CREATE TABLE IF NOT EXISTS sites (
 37      id INTEGER PRIMARY KEY AUTOINCREMENT,
 38      domain TEXT NOT NULL DEFAULT 'test.com',
 39      status TEXT DEFAULT 'found',
 40      score REAL,
 41      error_message TEXT,
 42      updated_at TEXT DEFAULT CURRENT_TIMESTAMP,
 43      rescored_at DATETIME
 44    );
 45  
 46    CREATE TABLE IF NOT EXISTS system_health (
 47      id INTEGER PRIMARY KEY AUTOINCREMENT,
 48      check_type TEXT NOT NULL,
 49      status TEXT NOT NULL,
 50      details TEXT,
 51      action_taken TEXT,
 52      created_at TEXT DEFAULT (datetime('now'))
 53    );
 54  
 55    CREATE TABLE IF NOT EXISTS agent_tasks (
 56      id INTEGER PRIMARY KEY AUTOINCREMENT,
 57      task_type TEXT NOT NULL,
 58      assigned_to TEXT,
 59      priority INTEGER DEFAULT 5,
 60      status TEXT DEFAULT 'pending',
 61      context_json TEXT,
 62      result_json TEXT,
 63      error_message TEXT,
 64      created_at TEXT DEFAULT (datetime('now')),
 65      updated_at TEXT DEFAULT (datetime('now'))
 66    );
 67  
 68    CREATE TABLE IF NOT EXISTS pipeline_control (
 69      key TEXT PRIMARY KEY,
 70      value TEXT,
 71      last_browser_loop_at TEXT,
 72      last_api_loop_at TEXT
 73    );
 74  
 75    CREATE TABLE IF NOT EXISTS settings (
 76      key TEXT PRIMARY KEY,
 77      value TEXT,
 78      description TEXT,
 79      updated_at TEXT DEFAULT CURRENT_TIMESTAMP
 80    );
 81  `);
 82  
 83  // ── Mock modules BEFORE any import ───────────────────────────────────────────
 84  
 85  mock.module('../../src/utils/db.js', {
 86    namedExports: createPgMock(db),
 87  });
 88  
 89  mock.module('child_process', {
 90    namedExports: {
 91      execSync: (...args) => execSyncFn(...args),
 92    },
 93  });
 94  
 95  mock.module('../../src/utils/rate-limit-scheduler.js', {
 96    namedExports: {
 97      getSkipStages: () => getSkipStagesFn(),
 98      getRateLimitStatus: () => getRateLimitStatusFn(),
 99      setRateLimit: () => {},
100    },
101  });
102  
103  mock.module('../../src/utils/load-env.js', {
104    namedExports: {},
105  });
106  
107  // ── Env setup ────────────────────────────────────────────────────────────────
108  
109  process.env.NODE_ENV = 'test';
110  
111  // Import AFTER mocks
112  const { runProcessGuardian } = await import('../../src/cron/process-guardian.js');
113  
114  // ── Helpers ───────────────────────────────────────────────────────────────────
115  
116  function clearTable(tableName) {
117    db.prepare(`DELETE FROM ${tableName}`).run();
118  }
119  
120  // Default happy-path execSync: pipeline active, no clearance, timer ok
121  function happyExecSync(cmd) {
122    if (cmd.includes('is-active')) return 'active\n';
123    if (cmd.includes('ps aux')) return 'root 123 node\n';
124    if (cmd.includes('show mmo-cron.timer')) return 'NextElapseUSecMonotonic=1234567\n';
125    return '';
126  }
127  
128  // ── Tests ─────────────────────────────────────────────────────────────────────
129  
130  describe('runProcessGuardian — pipeline active (happy path)', () => {
131    before(() => {
132      clearTable('system_health');
133      execSyncFn = happyExecSync;
134      getSkipStagesFn = () => new Set();
135      getRateLimitStatusFn = () => [];
136    });
137  
138    test('returns summary object with expected fields', async () => {
139      const result = await runProcessGuardian();
140      assert.equal(typeof result, 'object');
141      assert.ok(result.checks_run >= 5, 'should run at least 5 checks');
142      assert.ok(typeof result.ok === 'number');
143      assert.ok(typeof result.warnings === 'number');
144      assert.ok(typeof result.critical === 'number');
145      assert.ok(typeof result.duration_seconds === 'number');
146      assert.ok(Array.isArray(result.results));
147    });
148  
149    test('writes system_health records to DB', async () => {
150      clearTable('system_health');
151      await runProcessGuardian();
152      const rows = db.prepare('SELECT check_type FROM system_health').all();
153      assert.ok(rows.length >= 5, `expected ≥5 health records, got ${rows.length}`);
154      const types = rows.map(r => r.check_type);
155      assert.ok(types.includes('pipeline_service'));
156      assert.ok(types.includes('circuit_breaker'));
157      assert.ok(types.includes('browser_loop_hung'));
158      assert.ok(types.includes('cron_timer_dead'));
159    });
160  });
161  
162  describe('checkPipelineService — pipeline inactive → restart', () => {
163    before(() => {
164      clearTable('system_health');
165      execSyncFn = cmd => {
166        if (cmd.includes('is-active') && !cmd.includes('cron')) {
167          const err = new Error('inactive');
168          err.stdout = 'inactive\n';
169          throw err;
170        }
171        if (cmd.includes('ps aux')) return 'root 123 node\n';
172        if (cmd.includes('show mmo-cron.timer')) return 'NextElapseUSecMonotonic=1234\n';
173        if (cmd.includes('restart --no-block 333method-pipeline')) return '';
174        return '';
175      };
176      getSkipStagesFn = () => new Set();
177    });
178  
179    test('restarts pipeline and logs restarted_pipeline action', async () => {
180      await runProcessGuardian();
181      const row = db
182        .prepare(
183          "SELECT * FROM system_health WHERE check_type='pipeline_service' ORDER BY id DESC LIMIT 1"
184        )
185        .get();
186      assert.ok(row, 'should have a pipeline_service health record');
187      assert.ok(
188        row.action_taken?.includes('restart') || row.action_taken?.includes('crash'),
189        `action should involve restart: ${row.action_taken}`
190      );
191    });
192  });
193  
194  describe('checkPipelineService — cooldown active (recent restart in last 3min)', () => {
195    before(() => {
196      clearTable('system_health');
197      // Insert a recent restart record
198      db.prepare(
199        `INSERT INTO system_health (check_type, status, action_taken, created_at)
200         VALUES ('pipeline_service', 'warning', 'restarted_pipeline', datetime('now', '-1 minute'))`
201      ).run();
202  
203      execSyncFn = cmd => {
204        if (cmd.includes('is-active') && !cmd.includes('cron')) {
205          const err = new Error('inactive');
206          err.stdout = 'inactive\n';
207          throw err;
208        }
209        if (cmd.includes('show mmo-cron.timer')) return 'NextElapseUSecMonotonic=1234\n';
210        if (cmd.includes('ps aux')) return 'root 123 node\n';
211        return '';
212      };
213      getSkipStagesFn = () => new Set();
214    });
215  
216    test('skips restart when cooldown active', async () => {
217      await runProcessGuardian();
218      const rows = db
219        .prepare(
220          "SELECT action_taken FROM system_health WHERE check_type='pipeline_service' ORDER BY id DESC LIMIT 3"
221        )
222        .all();
223      const latestAction = rows[0]?.action_taken || '';
224      assert.ok(
225        latestAction.includes('cooldown') || latestAction.includes('restarted'),
226        `expected cooldown skip or restart: ${latestAction}`
227      );
228    });
229  });
230  
231  describe('checkCircuitBreaker — critical when >3 breaker-open errors', () => {
232    before(() => {
233      clearTable('system_health');
234      clearTable('agent_tasks');
235      for (let i = 0; i < 4; i++) {
236        db.prepare(
237          `INSERT INTO agent_tasks (task_type, assigned_to, context_json, created_at, updated_at)
238           VALUES ('triage', 'triage', '{"error": "Breaker is open"}', datetime('now'), datetime('now'))`
239        ).run();
240      }
241  
242      execSyncFn = happyExecSync;
243      getSkipStagesFn = () => new Set();
244    });
245  
246    test('reports critical status for circuit_breaker check', async () => {
247      const result = await runProcessGuardian();
248      const cb = result.results?.find(r => r.check === 'circuit_breaker');
249      assert.ok(cb, 'should have circuit_breaker result');
250      assert.equal(cb.status, 'critical');
251      assert.ok(cb.breaker_open_errors_last_hour >= 4);
252    });
253  });
254  
255  describe('checkCircuitBreaker — ok when ≤3 open errors', () => {
256    before(() => {
257      clearTable('system_health');
258      clearTable('agent_tasks');
259      execSyncFn = happyExecSync;
260      getSkipStagesFn = () => new Set();
261    });
262  
263    test('reports ok status when no breaker errors', async () => {
264      const result = await runProcessGuardian();
265      const cb = result.results?.find(r => r.check === 'circuit_breaker');
266      assert.ok(cb, 'should have circuit_breaker result');
267      assert.equal(cb.status, 'ok');
268      assert.equal(cb.breaker_open_errors_last_hour, 0);
269    });
270  });
271  
272  describe('cleanExpiredRateLimits — with active rate limits', () => {
273    before(() => {
274      clearTable('system_health');
275      execSyncFn = happyExecSync;
276      getSkipStagesFn = () => new Set(['scoring', 'enrich']);
277      getRateLimitStatusFn = () => [
278        { api: 'zenrows', waitMinutes: 15 },
279        { api: 'openrouter', waitMinutes: 5 },
280      ];
281    });
282  
283    test('rate_limit_cleanup result has activeRateLimits=2', async () => {
284      const result = await runProcessGuardian();
285      const rl = result.results?.find(r => r.check === 'rate_limit_cleanup');
286      assert.ok(rl, 'should have rate_limit_cleanup result');
287      assert.equal(rl.activeRateLimits, 2);
288      assert.equal(rl.status, 'ok');
289    });
290  });
291  
292  describe('cleanExpiredRateLimits — getSkipStages throws', () => {
293    before(() => {
294      clearTable('system_health');
295      execSyncFn = happyExecSync;
296      getSkipStagesFn = () => {
297        throw new Error('rate-limits.json parse error');
298      };
299    });
300  
301    test('returns warning status without throwing', async () => {
302      const result = await runProcessGuardian();
303      const rl = result.results?.find(r => r.check === 'rate_limit_cleanup');
304      assert.ok(rl, 'should have rate_limit_cleanup result');
305      assert.equal(rl.status, 'warning');
306    });
307  });
308  
309  describe('checkBrowserLoopHung — no pipeline_control data', () => {
310    before(() => {
311      clearTable('system_health');
312      clearTable('pipeline_control');
313      execSyncFn = happyExecSync;
314      getSkipStagesFn = () => new Set();
315      delete process.env.ENABLE_VISION;
316    });
317  
318    test('returns ok when no pipeline_control row exists', async () => {
319      const result = await runProcessGuardian();
320      const br = result.results?.find(r => r.check === 'browser_loop_hung');
321      assert.ok(br, 'should have browser_loop_hung result');
322      assert.equal(br.status, 'ok');
323      assert.equal(br.browserAgeMin, null);
324    });
325  });
326  
327  describe('checkBrowserLoopHung — ENABLE_VISION=false skips check', () => {
328    before(() => {
329      clearTable('system_health');
330      db.prepare(
331        `INSERT OR REPLACE INTO pipeline_control (key, value, last_browser_loop_at, last_api_loop_at)
332         VALUES ('singleton', '1', datetime('now', '-60 minutes'), datetime('now', '-5 minutes'))`
333      ).run();
334      process.env.ENABLE_VISION = 'false';
335      execSyncFn = happyExecSync;
336      getSkipStagesFn = () => new Set();
337    });
338  
339    after(() => {
340      delete process.env.ENABLE_VISION;
341    });
342  
343    test('skips browser hung check when ENABLE_VISION=false', async () => {
344      const result = await runProcessGuardian();
345      const br = result.results?.find(r => r.check === 'browser_loop_hung');
346      assert.ok(br, 'should have browser_loop_hung result');
347      assert.equal(br.status, 'ok');
348      assert.equal(br.actionTaken, 'skipped_vision_disabled');
349    });
350  });
351  
352  describe('checkBrowserLoopHung — browser hung + API active → restart', () => {
353    before(() => {
354      clearTable('system_health');
355      db.prepare(
356        `INSERT OR REPLACE INTO pipeline_control (key, value, last_browser_loop_at, last_api_loop_at)
357         VALUES ('singleton', '1', datetime('now', '-60 minutes'), datetime('now', '-3 minutes'))`
358      ).run();
359      delete process.env.ENABLE_VISION;
360      execSyncFn = cmd => {
361        if (cmd.includes('is-active') && cmd.includes('pipeline')) return 'active\n';
362        if (cmd.includes('show mmo-cron.timer')) return 'NextElapseUSecMonotonic=1234\n';
363        if (cmd.includes('ps aux')) return 'root 123\n';
364        if (cmd.includes('restart --no-block 333method-pipeline')) return '';
365        return '';
366      };
367      getSkipStagesFn = () => new Set();
368    });
369  
370    test('restarts pipeline when browser loop is hung', async () => {
371      const result = await runProcessGuardian();
372      const br = result.results?.find(r => r.check === 'browser_loop_hung');
373      assert.ok(br, 'should have browser_loop_hung result');
374      assert.ok(
375        br.status === 'warning' || br.actionTaken?.includes('restarted'),
376        `expected warning or restart: ${br.status} / ${br.actionTaken}`
377      );
378    });
379  });
380  
381  describe('checkBrowserLoopHung — hung but cooldown active', () => {
382    before(() => {
383      clearTable('system_health');
384      db.prepare(
385        `INSERT INTO system_health (check_type, status, action_taken, created_at)
386         VALUES ('browser_loop_hung', 'warning', 'restarted_pipeline_browser_hung_50min', datetime('now', '-5 minutes'))`
387      ).run();
388      db.prepare(
389        `INSERT OR REPLACE INTO pipeline_control (key, value, last_browser_loop_at, last_api_loop_at)
390         VALUES ('singleton', '1', datetime('now', '-60 minutes'), datetime('now', '-2 minutes'))`
391      ).run();
392      delete process.env.ENABLE_VISION;
393      execSyncFn = cmd => {
394        if (cmd.includes('is-active') && cmd.includes('pipeline')) return 'active\n';
395        if (cmd.includes('show mmo-cron.timer')) return 'NextElapseUSecMonotonic=1234\n';
396        if (cmd.includes('ps aux')) return 'root 123\n';
397        return '';
398      };
399      getSkipStagesFn = () => new Set();
400    });
401  
402    test('skips restart when cooldown active', async () => {
403      const result = await runProcessGuardian();
404      const br = result.results?.find(r => r.check === 'browser_loop_hung');
405      assert.ok(br, 'should have browser_loop_hung result');
406      assert.ok(br.actionTaken?.includes('cooldown'), `expected cooldown: ${br.actionTaken}`);
407    });
408  });
409  
410  describe('checkBrowserLoopHung — browser ok (recent cycle)', () => {
411    before(() => {
412      clearTable('system_health');
413      db.prepare(
414        `INSERT OR REPLACE INTO pipeline_control (key, value, last_browser_loop_at, last_api_loop_at)
415         VALUES ('singleton', '1', datetime('now', '-5 minutes'), datetime('now', '-2 minutes'))`
416      ).run();
417      delete process.env.ENABLE_VISION;
418      execSyncFn = cmd => {
419        if (cmd.includes('is-active') && cmd.includes('pipeline')) return 'active\n';
420        if (cmd.includes('show mmo-cron.timer')) return 'NextElapseUSecMonotonic=1234\n';
421        if (cmd.includes('ps aux')) return 'root 123\n';
422        return '';
423      };
424      getSkipStagesFn = () => new Set();
425    });
426  
427    test('returns ok when browser cycle is recent', async () => {
428      const result = await runProcessGuardian();
429      const br = result.results?.find(r => r.check === 'browser_loop_hung');
430      assert.ok(br, 'should have browser_loop_hung result');
431      assert.equal(br.status, 'ok');
432    });
433  });
434  
435  describe('checkCronTimer — timer has next trigger (ok)', () => {
436    before(() => {
437      clearTable('system_health');
438      execSyncFn = happyExecSync;
439      getSkipStagesFn = () => new Set();
440    });
441  
442    test('returns ok status for cron_timer_dead check', async () => {
443      const result = await runProcessGuardian();
444      const cr = result.results?.find(r => r.check === 'cron_timer_dead');
445      assert.ok(cr, 'should have cron_timer_dead result');
446      assert.equal(cr.status, 'ok');
447      assert.equal(cr.actionTaken, null);
448    });
449  });
450  
451  describe('checkCronTimer — timer shows infinity → restart', () => {
452    before(() => {
453      clearTable('system_health');
454      execSyncFn = cmd => {
455        if (cmd.includes('is-active')) return 'active\n';
456        if (cmd.includes('ps aux')) return 'root 123\n';
457        if (cmd.includes('show mmo-cron.timer')) return 'NextElapseUSecMonotonic=infinity\n';
458        if (cmd.includes('restart --no-block mmo-cron.timer')) return '';
459        return '';
460      };
461      getSkipStagesFn = () => new Set();
462    });
463  
464    test('restarts cron timer when it shows infinity', async () => {
465      const result = await runProcessGuardian();
466      const cr = result.results?.find(r => r.check === 'cron_timer_dead');
467      assert.ok(cr, 'should have cron_timer_dead result');
468      assert.equal(cr.status, 'warning');
469      assert.equal(cr.actionTaken, 'restarted_cron_timer');
470    });
471  });
472  
473  describe('checkCronTimer — infinity but cooldown active', () => {
474    before(() => {
475      clearTable('system_health');
476      db.prepare(
477        `INSERT INTO system_health (check_type, status, action_taken, created_at)
478         VALUES ('cron_timer_dead', 'warning', 'restarted_cron_timer', datetime('now', '-2 minutes'))`
479      ).run();
480      execSyncFn = cmd => {
481        if (cmd.includes('is-active')) return 'active\n';
482        if (cmd.includes('ps aux')) return 'root 123\n';
483        if (cmd.includes('show mmo-cron.timer')) return 'NextElapseUSecMonotonic=infinity\n';
484        return '';
485      };
486      getSkipStagesFn = () => new Set();
487    });
488  
489    test('skips restart when cooldown active', async () => {
490      const result = await runProcessGuardian();
491      const cr = result.results?.find(r => r.check === 'cron_timer_dead');
492      assert.ok(cr, 'should have cron_timer_dead result');
493      assert.equal(cr.actionTaken, 'cooldown_skip_cron_timer');
494    });
495  });
496  
497  describe('checkClearanceCycle — was running, now stopped', () => {
498    before(() => {
499      clearTable('system_health');
500      db.prepare(
501        `INSERT INTO system_health (check_type, status, details, created_at)
502         VALUES ('clearance_cycle', 'ok', '{"clearance_running":true}', datetime('now', '-1 minute'))`
503      ).run();
504      execSyncFn = cmd => {
505        if (cmd.includes('is-active')) return 'active\n';
506        if (cmd.includes('ps aux')) return 'root 123 node\n'; // no clearance script
507        if (cmd.includes('show mmo-cron.timer')) return 'NextElapseUSecMonotonic=1234\n';
508        if (cmd.includes('restart --no-block 333method-pipeline')) return '';
509        return '';
510      };
511      getSkipStagesFn = () => new Set();
512    });
513  
514    test('restarts pipeline after clearance cycle completion', async () => {
515      const result = await runProcessGuardian();
516      const cl = result.results?.find(r => r.check === 'clearance_cycle');
517      assert.ok(cl, 'should have clearance_cycle result');
518      assert.ok(
519        cl.actionTaken === 'restarted_pipeline_after_clearance' || cl.status === 'ok',
520        `got action: ${cl.actionTaken}, status: ${cl.status}`
521      );
522    });
523  });
524  
525  describe('cleanupOldRecords — probabilistic housekeeping', () => {
526    test('does not throw when called via runProcessGuardian many times', async () => {
527      clearTable('system_health');
528      execSyncFn = happyExecSync;
529      getSkipStagesFn = () => new Set();
530      // Run 3 times — at 10% probability, cleanup may or may not fire
531      for (let i = 0; i < 3; i++) {
532        await assert.doesNotReject(() => runProcessGuardian());
533      }
534    });
535  });
536  
537  // ── Additional branch coverage tests ─────────────────────────────────────────
538  
539  describe('checkPipelineService — crash loop escalation (>10min stuck)', () => {
540    before(() => {
541      clearTable('system_health');
542      clearTable('agent_tasks');
543      // Insert restart records 5-25 minutes ago (none within last 3min, so cooldown won't trigger).
544      // MIN(created_at) will be ~25min ago → minutesStuck > 10 → crash loop path.
545      for (let i = 25; i >= 5; i -= 4) {
546        db.prepare(
547          `INSERT INTO system_health (check_type, status, action_taken, created_at)
548           VALUES ('pipeline_service', 'warning', 'restarted_pipeline', datetime('now', '-${i} minutes'))`
549        ).run();
550      }
551  
552      execSyncFn = cmd => {
553        if (cmd.includes('is-active') && !cmd.includes('cron')) {
554          const err = new Error('inactive');
555          err.stdout = 'inactive\n';
556          throw err;
557        }
558        if (cmd.includes('journalctl')) return 'some startup log\n';
559        if (cmd.includes('show mmo-cron.timer')) return 'NextElapseUSecMonotonic=1234\n';
560        if (cmd.includes('ps aux')) return 'root 123 node\n';
561        if (cmd.includes('restart --no-block 333method-pipeline')) return '';
562        return '';
563      };
564      getSkipStagesFn = () => new Set();
565    });
566  
567    test('escalates to triage task and marks status critical', async () => {
568      const result = await runProcessGuardian();
569      const ps = result.results?.find(r => r.check === 'pipeline_service');
570      assert.ok(ps, 'should have pipeline_service result');
571      assert.equal(ps.status, 'critical');
572      assert.ok(
573        ps.actionTaken?.includes('crash_loop_escalated') || ps.actionTaken?.includes('restarted'),
574        `expected crash_loop_escalated in actionTaken: ${ps.actionTaken}`
575      );
576      // A triage task should have been inserted
577      const task = db
578        .prepare("SELECT * FROM agent_tasks WHERE task_type='classify_error' LIMIT 1")
579        .get();
580      assert.ok(task, 'should have created a triage agent_task');
581    });
582  });
583  
584  describe('checkPipelineService — crash loop escalation, journalctl throws', () => {
585    before(() => {
586      clearTable('system_health');
587      clearTable('agent_tasks');
588      for (let i = 25; i >= 5; i -= 4) {
589        db.prepare(
590          `INSERT INTO system_health (check_type, status, action_taken, created_at)
591           VALUES ('pipeline_service', 'warning', 'restarted_pipeline', datetime('now', '-${i} minutes'))`
592        ).run();
593      }
594  
595      execSyncFn = cmd => {
596        if (cmd.includes('is-active') && !cmd.includes('cron')) {
597          const err = new Error('inactive');
598          err.stdout = 'inactive\n';
599          throw err;
600        }
601        if (cmd.includes('journalctl')) throw new Error('journalctl not available');
602        if (cmd.includes('show mmo-cron.timer')) return 'NextElapseUSecMonotonic=1234\n';
603        if (cmd.includes('ps aux')) return 'root 123 node\n';
604        if (cmd.includes('restart --no-block 333method-pipeline')) return '';
605        return '';
606      };
607      getSkipStagesFn = () => new Set();
608    });
609  
610    test('handles journalctl failure gracefully (startupError stays null)', async () => {
611      const result = await runProcessGuardian();
612      const ps = result.results?.find(r => r.check === 'pipeline_service');
613      assert.ok(ps, 'should have pipeline_service result');
614      // Should still escalate even without journalctl output
615      assert.ok(
616        ps.status === 'critical' || ps.actionTaken?.includes('restart'),
617        `expected critical/restart: ${ps.status} / ${ps.actionTaken}`
618      );
619    });
620  });
621  
622  describe('checkPipelineService — restart command fails', () => {
623    before(() => {
624      clearTable('system_health');
625      execSyncFn = cmd => {
626        if (cmd.includes('is-active') && !cmd.includes('cron')) {
627          const err = new Error('inactive');
628          err.stdout = 'inactive\n';
629          throw err;
630        }
631        if (cmd.includes('restart --no-block 333method-pipeline')) {
632          throw new Error('Failed to connect to bus: No such file or directory');
633        }
634        if (cmd.includes('show mmo-cron.timer')) return 'NextElapseUSecMonotonic=1234\n';
635        if (cmd.includes('ps aux')) return 'root 123 node\n';
636        return '';
637      };
638      getSkipStagesFn = () => new Set();
639    });
640  
641    test('marks status critical when restart command fails', async () => {
642      const result = await runProcessGuardian();
643      const ps = result.results?.find(r => r.check === 'pipeline_service');
644      assert.ok(ps, 'should have pipeline_service result');
645      assert.equal(ps.status, 'critical');
646      assert.ok(
647        ps.actionTaken?.includes('restart_failed'),
648        `expected restart_failed: ${ps.actionTaken}`
649      );
650    });
651  });
652  
653  describe('checkClearanceCycle — ps aux throws (clearanceRunning defaults false)', () => {
654    before(() => {
655      clearTable('system_health');
656      execSyncFn = cmd => {
657        if (cmd.includes('ps aux')) throw new Error('ps not found');
658        if (cmd.includes('is-active')) return 'active\n';
659        if (cmd.includes('show mmo-cron.timer')) return 'NextElapseUSecMonotonic=1234\n';
660        return '';
661      };
662      getSkipStagesFn = () => new Set();
663    });
664  
665    test('returns ok and records clearance_running=false when ps throws', async () => {
666      const result = await runProcessGuardian();
667      const cl = result.results?.find(r => r.check === 'clearance_cycle');
668      assert.ok(cl, 'should have clearance_cycle result');
669      assert.equal(cl.status, 'ok');
670      assert.equal(cl.clearanceRunning, false);
671    });
672  });
673  
674  describe('checkClearanceCycle — malformed JSON details in last health row', () => {
675    before(() => {
676      clearTable('system_health');
677      db.prepare(
678        `INSERT INTO system_health (check_type, status, details, created_at)
679         VALUES ('clearance_cycle', 'ok', 'not-valid-json{{', datetime('now', '-1 minute'))`
680      ).run();
681      execSyncFn = cmd => {
682        if (cmd.includes('ps aux')) return 'root 123 node\n';
683        if (cmd.includes('is-active')) return 'active\n';
684        if (cmd.includes('show mmo-cron.timer')) return 'NextElapseUSecMonotonic=1234\n';
685        return '';
686      };
687      getSkipStagesFn = () => new Set();
688    });
689  
690    test('handles JSON parse error gracefully (wasRunning defaults false)', async () => {
691      const result = await runProcessGuardian();
692      const cl = result.results?.find(r => r.check === 'clearance_cycle');
693      assert.ok(cl, 'should have clearance_cycle result');
694      assert.equal(cl.status, 'ok');
695      // wasRunning was false (parse error), clearanceRunning is false → no restart
696      assert.equal(cl.actionTaken, null);
697    });
698  });
699  
700  describe('checkClearanceCycle — restart after clearance fails', () => {
701    before(() => {
702      clearTable('system_health');
703      db.prepare(
704        `INSERT INTO system_health (check_type, status, details, created_at)
705         VALUES ('clearance_cycle', 'ok', '{"clearance_running":true}', datetime('now', '-1 minute'))`
706      ).run();
707      execSyncFn = cmd => {
708        if (cmd.includes('ps aux')) return 'root 123 node\n'; // clearance not running
709        if (cmd.includes('is-active')) return 'active\n';
710        if (cmd.includes('show mmo-cron.timer')) return 'NextElapseUSecMonotonic=1234\n';
711        if (cmd.includes('restart --no-block 333method-pipeline')) {
712          throw new Error('systemctl restart failed');
713        }
714        return '';
715      };
716      getSkipStagesFn = () => new Set();
717    });
718  
719    test('marks status warning when clearance restart fails', async () => {
720      const result = await runProcessGuardian();
721      const cl = result.results?.find(r => r.check === 'clearance_cycle');
722      assert.ok(cl, 'should have clearance_cycle result');
723      assert.equal(cl.status, 'warning');
724      assert.ok(
725        cl.actionTaken?.includes('clearance_restart_failed'),
726        `expected clearance_restart_failed: ${cl.actionTaken}`
727      );
728    });
729  });
730  
/**
 * checkBrowserLoopHung when the `systemctl is-active` probe for the pipeline throws.
 *
 * Fixture: the browser loop heartbeat is 60 minutes old and the API loop
 * heartbeat is 5 minutes old; every other stubbed command succeeds.
 */
describe('checkBrowserLoopHung — systemctl is-active throws (falls back to timestamp)', () => {
  before(() => {
    clearTable('system_health');

    // Browser heartbeat stale by 60min, API heartbeat 5min old (within the
    // 60min threshold) — the "hung browser loop" scenario.
    const seedHeartbeats = db.prepare(
      `INSERT OR REPLACE INTO pipeline_control (key, value, last_browser_loop_at, last_api_loop_at)
       VALUES ('singleton', '1', datetime('now', '-60 minutes'), datetime('now', '-5 minutes'))`
    );
    seedHeartbeats.run();

    delete process.env.ENABLE_VISION;
    getSkipStagesFn = () => new Set();

    // Canned output for commands that succeed, checked in order; anything
    // unlisted yields an empty string.
    const cannedOutput = [
      ['show mmo-cron.timer', 'NextElapseUSecMonotonic=1234\n'],
      ['ps aux', 'root 123\n'],
      ['restart --no-block 333method-pipeline', ''],
    ];
    execSyncFn = command => {
      const isPipelineActiveProbe = command.includes('is-active') && command.includes('pipeline');
      if (isPipelineActiveProbe) {
        throw new Error('D-Bus not available');
      }
      const matched = cannedOutput.find(([fragment]) => command.includes(fragment));
      return matched ? matched[1] : '';
    };
  });

  test('falls back to timestamp check when systemctl throws, restarts if hung', async () => {
    const guardianRun = await runProcessGuardian();
    const hungCheck = guardianRun.results?.find(entry => entry.check === 'browser_loop_hung');
    assert.ok(hungCheck, 'should have browser_loop_hung result');

    // API within 60min threshold (5min) + browser stale 60min > 45min threshold → restart
    const restartSignalled =
      hungCheck.status === 'warning' || hungCheck.actionTaken?.includes('restarted');
    assert.ok(
      restartSignalled,
      `expected warning/restart: ${hungCheck.status} / ${hungCheck.actionTaken}`
    );
  });
});
763  
/**
 * checkBrowserLoopHung when the pipeline reports active, the browser loop
 * heartbeat is 60 minutes old, and the restart command itself throws.
 */
describe('checkBrowserLoopHung — browser hung restart fails', () => {
  before(() => {
    clearTable('system_health');

    // Browser heartbeat 60min old, API heartbeat 3min old
    const seedHeartbeats = db.prepare(
      `INSERT OR REPLACE INTO pipeline_control (key, value, last_browser_loop_at, last_api_loop_at)
       VALUES ('singleton', '1', datetime('now', '-60 minutes'), datetime('now', '-3 minutes'))`
    );
    seedHeartbeats.run();

    delete process.env.ENABLE_VISION;
    getSkipStagesFn = () => new Set();

    execSyncFn = command => {
      const mentions = fragment => command.includes(fragment);

      if (mentions('is-active') && mentions('pipeline')) {
        return 'active\n';
      }
      if (mentions('show mmo-cron.timer')) {
        return 'NextElapseUSecMonotonic=1234\n';
      }
      if (mentions('ps aux')) {
        return 'root 123\n';
      }
      if (mentions('restart --no-block 333method-pipeline')) {
        // The restart attempt is the failure under test
        throw new Error('restart failed: no bus');
      }
      return '';
    };
  });

  test('marks status critical when browser-hung restart fails', async () => {
    const { results } = await runProcessGuardian();
    const hungCheck = results?.find(({ check }) => check === 'browser_loop_hung');

    assert.ok(hungCheck, 'should have browser_loop_hung result');
    assert.equal(hungCheck.status, 'critical');

    const { actionTaken } = hungCheck;
    assert.ok(
      actionTaken?.includes('browser_hung_restart_failed'),
      `expected browser_hung_restart_failed: ${actionTaken}`
    );
  });
});
795  
/**
 * checkBrowserLoopHung when the pipeline_control row exists but both loop
 * heartbeat columns are NULL.
 *
 * Nothing in this fixture makes a database query throw; the scenario under
 * test is "no heartbeat data yet", which is expected to be reported as ok
 * rather than as a hung browser loop.
 */
describe('checkBrowserLoopHung — pipeline_control has NULL loop timestamps', () => {
  before(() => {
    clearTable('system_health');

    // Insert a pipeline_control row with NULL timestamps
    db.prepare(
      `INSERT OR REPLACE INTO pipeline_control (key, value, last_browser_loop_at, last_api_loop_at)
       VALUES ('singleton', '1', NULL, NULL)`
    ).run();

    delete process.env.ENABLE_VISION;
    execSyncFn = cmd => {
      if (cmd.includes('is-active')) return 'active\n';
      if (cmd.includes('show mmo-cron.timer')) return 'NextElapseUSecMonotonic=1234\n';
      if (cmd.includes('ps aux')) return 'root 123\n';
      return '';
    };
    getSkipStagesFn = () => new Set();
  });

  test('returns ok when pipeline_control has NULL timestamps', async () => {
    const result = await runProcessGuardian();
    const br = result.results?.find(r => r.check === 'browser_loop_hung');
    assert.ok(br, 'should have browser_loop_hung result');

    // NULL timestamps → julianday(NULL) is NULL, so the reported browser age is
    // either null or 0 and the loop must not be flagged as hung.
    assert.equal(br.status, 'ok');
    assert.ok(
      br.browserAgeMin === null || br.browserAgeMin === 0,
      `expected null or 0 browserAgeMin, got ${br.browserAgeMin}`
    );
  });
});
823  
/**
 * checkCronTimer when the timer reports `NextElapseUSecMonotonic=infinity`
 * and the attempt to restart mmo-cron.timer throws.
 */
describe('checkCronTimer — timer restart fails', () => {
  before(() => {
    clearTable('system_health');
    getSkipStagesFn = () => new Set();

    execSyncFn = command => {
      const mentions = fragment => command.includes(fragment);

      if (mentions('is-active')) {
        return 'active\n';
      }
      if (mentions('ps aux')) {
        return 'root 123\n';
      }
      if (mentions('show mmo-cron.timer')) {
        return 'NextElapseUSecMonotonic=infinity\n';
      }
      if (mentions('restart --no-block mmo-cron.timer')) {
        // The timer restart is the failure under test
        throw new Error('D-Bus connection failed');
      }
      return '';
    };
  });

  test('marks status critical when cron timer restart fails', async () => {
    const { results } = await runProcessGuardian();
    const timerCheck = results?.find(({ check }) => check === 'cron_timer_dead');

    assert.ok(timerCheck, 'should have cron_timer_dead result');
    assert.equal(timerCheck.status, 'critical');

    const { actionTaken } = timerCheck;
    assert.ok(
      actionTaken?.includes('cron_timer_restart_failed'),
      `expected cron_timer_restart_failed: ${actionTaken}`
    );
  });
});
850  
/**
 * checkCronTimer when `systemctl show mmo-cron.timer` itself throws: the check
 * is expected to report ok with no action taken.
 */
describe('checkCronTimer — systemctl show throws (outer catch)', () => {
  before(() => {
    clearTable('system_health');
    getSkipStagesFn = () => new Set();

    execSyncFn = command => {
      const mentions = fragment => command.includes(fragment);

      if (mentions('is-active')) {
        return 'active\n';
      }
      if (mentions('ps aux')) {
        return 'root 123\n';
      }
      if (mentions('show mmo-cron.timer')) {
        // Querying the timer is the failure under test
        throw new Error('systemctl not available');
      }
      return '';
    };
  });

  test('returns ok when systemctl show throws (outer catch swallows error)', async () => {
    const { results } = await runProcessGuardian();
    const timerCheck = results?.find(({ check }) => check === 'cron_timer_dead');

    assert.ok(timerCheck, 'should have cron_timer_dead result');
    assert.equal(timerCheck.status, 'ok');
    assert.equal(timerCheck.actionTaken, null);
  });
});
871  
/**
 * cleanupOldRecords: system_health rows older than seven days must be gone
 * after a guardian run in which Math.random is pinned below the 0.1 threshold.
 */
describe('cleanupOldRecords — actually deletes old rows (covers line 510-511)', () => {
  test('logs when old records are deleted', async () => {
    clearTable('system_health');

    // Seed two rows dated 8 and 10 days ago, i.e. older than 7 days
    const staleRowInserts = [
      `INSERT INTO system_health (check_type, status, details, action_taken, created_at)
       VALUES ('pipeline_service', 'ok', '{}', NULL, datetime('now', '-8 days'))`,
      `INSERT INTO system_health (check_type, status, details, action_taken, created_at)
       VALUES ('circuit_breaker', 'ok', '{}', NULL, datetime('now', '-10 days'))`,
    ];
    for (const insertSql of staleRowInserts) {
      db.prepare(insertSql).run();
    }

    execSyncFn = happyExecSync;
    getSkipStagesFn = () => new Set();

    // Force cleanup to run: pin Math.random to 0.05 (below the 0.1 threshold)
    // and always put the real implementation back afterwards.
    const realRandom = Math.random;
    Math.random = () => 0.05;
    try {
      await assert.doesNotReject(() => runProcessGuardian());
    } finally {
      Math.random = realRandom;
    }

    // After cleanup, no row older than 7 days may remain
    const staleCountQuery = db.prepare(
      "SELECT count(*) as cnt FROM system_health WHERE created_at < datetime('now', '-7 days')"
    );
    const { cnt: remainingStale } = staleCountQuery.get();
    assert.equal(remainingStale, 0, 'old records should have been deleted');
  });
});
905  
/**
 * writeStatusFile: after a guardian run with an inactive pipeline and four
 * "Breaker is open" task errors, /tmp/watchdog-status.txt must contain the
 * guardian header and mention the pipeline state.
 *
 * The breaker-error fixtures are removed again in `after` so they do not leak
 * into suites that run later against the same in-memory database.
 */
describe('writeStatusFile — pipeline stopped + circuit breaker errors in status file', () => {
  before(() => {
    clearTable('system_health');
    clearTable('agent_tasks');
    // Insert 4 breaker-open errors so cbErrors > 3
    for (let i = 0; i < 4; i++) {
      db.prepare(
        `INSERT INTO agent_tasks (task_type, assigned_to, context_json, created_at, updated_at)
         VALUES ('triage', 'triage', '{"error": "Breaker is open"}', datetime('now'), datetime('now'))`
      ).run();
    }
    execSyncFn = cmd => {
      if (cmd.includes('is-active') && !cmd.includes('cron')) {
        // Stub an execSync failure for the non-cron is-active probe: throw an
        // Error carrying the unit state on `stdout`.
        const err = new Error('inactive');
        err.stdout = 'inactive\n';
        throw err;
      }
      if (cmd.includes('show mmo-cron.timer')) return 'NextElapseUSecMonotonic=1234\n';
      if (cmd.includes('ps aux')) return 'root 123 node\n';
      if (cmd.includes('restart --no-block 333method-pipeline')) return '';
      return '';
    };
    getSkipStagesFn = () => new Set();
    getRateLimitStatusFn = () => [];
  });

  after(() => {
    // Drop the seeded breaker errors so later suites start from a clean table
    clearTable('agent_tasks');
  });

  test('writes status file with stopped pipeline and circuit breaker errors', async () => {
    const { readFileSync } = await import('node:fs');
    await runProcessGuardian();
    const content = readFileSync('/tmp/watchdog-status.txt', 'utf8');
    assert.ok(content.includes('Process Guardian'), 'should have header');
    assert.ok(
      content.includes('stopped') || content.includes('Pipeline'),
      'should mention pipeline state'
    );
  });
});
943  
/**
 * checkBrowserLoopHung when both heartbeats are stale and the pipeline
 * is-active probe fails with "inactive": no restart is expected.
 */
describe('checkBrowserLoopHung — API stale + pipeline inactive → apiActive false (no restart)', () => {
  before(() => {
    clearTable('system_health');

    // Browser stale 60min (> 45min threshold) but API also stale 90min (> 60min) + pipeline inactive
    // → apiActive=false → no restart needed
    const seedHeartbeats = db.prepare(
      `INSERT OR REPLACE INTO pipeline_control (key, value, last_browser_loop_at, last_api_loop_at)
       VALUES ('singleton', '1', datetime('now', '-60 minutes'), datetime('now', '-90 minutes'))`
    );
    seedHeartbeats.run();

    delete process.env.ENABLE_VISION;
    getSkipStagesFn = () => new Set();

    // Canned output for commands that succeed, checked in order; anything
    // unlisted yields an empty string.
    const cannedOutput = [
      ['show mmo-cron.timer', 'NextElapseUSecMonotonic=1234\n'],
      ['ps aux', 'root 123\n'],
      ['restart --no-block 333method-pipeline', ''],
    ];
    execSyncFn = command => {
      if (command.includes('is-active') && command.includes('pipeline')) {
        // Stub an execSync failure: an Error carrying the unit state on `stdout`
        throw Object.assign(new Error('inactive'), { stdout: 'inactive\n' });
      }
      const matched = cannedOutput.find(([fragment]) => command.includes(fragment));
      return matched ? matched[1] : '';
    };
  });

  test('returns ok when both loops are stale (apiActive=false)', async () => {
    const { results } = await runProcessGuardian();
    const hungCheck = results?.find(({ check }) => check === 'browser_loop_hung');

    assert.ok(hungCheck, 'should have browser_loop_hung result');
    assert.equal(hungCheck.status, 'ok');
    assert.equal(hungCheck.actionTaken, null);
  });
});