process-guardian.test.js
/**
 * Tests for src/cron/process-guardian.js
 *
 * Covers all 6 health checks:
 *   1. checkPipelineService — active/inactive/cooldown/crash-loop paths
 *   2. checkClearanceCycle — running/completed transition
 *   3. checkCircuitBreaker — ok/critical paths
 *   4. cleanExpiredRateLimits — ok/error paths
 *   5. checkBrowserLoopHung — no data/vision-disabled/hung/cooldown/ok paths
 *   6. checkCronTimer — ok/infinity/cooldown paths
 * Plus: writeStatusFile, cleanupOldRecords, runProcessGuardian
 *
 * Strategy: mock child_process (execSync) and rate-limit-scheduler at module level,
 * use pg-mock (in-memory SQLite via db.js mock) for database interactions.
 * NODE_ENV=test ensures Logger skips createWriteStream.
 *
 * NOTE: requires --experimental-test-module-mocks
 */

import { test, describe, mock, before, after } from 'node:test';
import assert from 'node:assert/strict';
import Database from 'better-sqlite3';
import { createPgMock } from '../helpers/pg-mock.js';

// ── Mutable stubs ────────────────────────────────────────────────────────────
// The module mocks registered below delegate to these bindings on every call,
// so each suite can swap behaviour by reassigning them in its `before` hook.

let execSyncFn = (_cmd, _opts) => '';
let getSkipStagesFn = () => new Set();
let getRateLimitStatusFn = () => [];

// ─── Create in-memory test DB ─────────────────────────────────────────────────

const db = new Database(':memory:');

db.exec(`
  CREATE TABLE IF NOT EXISTS sites (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    domain TEXT NOT NULL DEFAULT 'test.com',
    status TEXT DEFAULT 'found',
    score REAL,
    error_message TEXT,
    updated_at TEXT DEFAULT CURRENT_TIMESTAMP,
    rescored_at DATETIME
  );

  CREATE TABLE IF NOT EXISTS system_health (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    check_type TEXT NOT NULL,
    status TEXT NOT NULL,
    details TEXT,
    action_taken TEXT,
    created_at TEXT DEFAULT (datetime('now'))
  );

  CREATE TABLE IF NOT EXISTS agent_tasks (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    task_type TEXT NOT NULL,
    assigned_to TEXT,
    priority INTEGER DEFAULT 5,
    status TEXT DEFAULT 'pending',
    context_json TEXT,
    result_json TEXT,
    error_message TEXT,
    created_at TEXT DEFAULT (datetime('now')),
    updated_at TEXT DEFAULT (datetime('now'))
  );

  CREATE TABLE IF NOT EXISTS pipeline_control (
    key TEXT PRIMARY KEY,
    value TEXT,
    last_browser_loop_at TEXT,
    last_api_loop_at TEXT
  );

  CREATE TABLE IF NOT EXISTS settings (
    key TEXT PRIMARY KEY,
    value TEXT,
    description TEXT,
    updated_at TEXT DEFAULT CURRENT_TIMESTAMP
  );
`);

// ── Mock modules BEFORE any import ───────────────────────────────────────────

mock.module('../../src/utils/db.js', {
  namedExports: createPgMock(db),
});

mock.module('child_process', {
  namedExports: {
    // Indirection through the mutable binding keeps the stub swappable per suite.
    execSync: (...args) => execSyncFn(...args),
  },
});

mock.module('../../src/utils/rate-limit-scheduler.js', {
  namedExports: {
    getSkipStages: () => getSkipStagesFn(),
    getRateLimitStatus: () => getRateLimitStatusFn(),
    setRateLimit: () => {},
  },
});

mock.module('../../src/utils/load-env.js', {
  namedExports: {},
});

// ── Env setup ────────────────────────────────────────────────────────────────

process.env.NODE_ENV = 'test';

// Import AFTER mocks
const { runProcessGuardian } = await import('../../src/cron/process-guardian.js');

// ── Helpers ───────────────────────────────────────────────────────────────────

/** Tables created by the schema above; the only names clearTable() will accept. */
const CLEARABLE_TABLES = new Set([
  'sites',
  'system_health',
  'agent_tasks',
  'pipeline_control',
  'settings',
]);

/**
 * Delete every row from one of the in-memory test tables.
 *
 * The name is spliced into the SQL text (identifiers cannot be bound as
 * parameters), so it is checked against the known schema first: a typo or an
 * unexpected value fails loudly instead of executing arbitrary SQL.
 *
 * @param {string} tableName - One of the tables created by the schema above.
 * @throws {Error} If tableName is not a known test table.
 */
function clearTable(tableName) {
  if (!CLEARABLE_TABLES.has(tableName)) {
    throw new Error(`clearTable: unknown table "${tableName}"`);
  }
  db.prepare(`DELETE FROM ${tableName}`).run();
}

/**
 * Default happy-path execSync stub: pipeline active, no clearance, timer ok.
 *
 * @param {string} cmd - Shell command the code under test tried to run.
 * @returns {string} Canned stdout for the recognised command, '' otherwise.
 */
function happyExecSync(cmd) {
  if (cmd.includes('is-active')) return 'active\n';
  if (cmd.includes('ps aux')) return 'root 123 node\n';
  if (cmd.includes('show mmo-cron.timer')) return 'NextElapseUSecMonotonic=1234567\n';
  return '';
}

// ── Tests
// ── Suite-local helpers ──────────────────────────────────────────────────────

/**
 * Find the entry for a single health check in a runProcessGuardian() summary.
 *
 * @param {object} summary - Value resolved by runProcessGuardian().
 * @param {string} checkName - Value of the entry's `check` field.
 * @returns {object|undefined} The matching entry, or undefined when absent.
 */
function findCheckResult(summary, checkName) {
  return summary.results?.find(r => r.check === checkName);
}

/** Restore the rate-limit-scheduler stubs to their neutral defaults. */
function resetSchedulerStubs() {
  getSkipStagesFn = () => new Set();
  getRateLimitStatusFn = () => [];
}

/**
 * Throw the error shape these suites use to simulate a failed
 * `systemctl is-active` call: an Error carrying 'inactive\n' on `stdout`.
 */
function throwInactive() {
  const err = new Error('inactive');
  err.stdout = 'inactive\n';
  throw err;
}

/**
 * Seed pipeline_service restart records 25, 21, 17, 13, 9 and 5 minutes old.
 * None fall within the last 3 minutes; the crash-loop suites rely on the
 * oldest one being ~25 minutes old.
 */
function seedStaleRestartHistory() {
  for (let i = 25; i >= 5; i -= 4) {
    db.prepare(
      `INSERT INTO system_health (check_type, status, action_taken, created_at)
       VALUES ('pipeline_service', 'warning', 'restarted_pipeline', datetime('now', '-${i} minutes'))`
    ).run();
  }
}

/**
 * Upsert the singleton pipeline_control row with loop timestamps expressed as
 * "minutes ago".
 *
 * @param {number} browserMinutesAgo - Age of last_browser_loop_at.
 * @param {number} apiMinutesAgo - Age of last_api_loop_at.
 */
function setLoopAges(browserMinutesAgo, apiMinutesAgo) {
  db.prepare(
    `INSERT OR REPLACE INTO pipeline_control (key, value, last_browser_loop_at, last_api_loop_at)
     VALUES ('singleton', '1', datetime('now', '-${browserMinutesAgo} minutes'), datetime('now', '-${apiMinutesAgo} minutes'))`
  ).run();
}

/** Insert four "Breaker is open" agent_tasks rows (one more than the >3 limit). */
function seedBreakerOpenErrors() {
  for (let i = 0; i < 4; i++) {
    db.prepare(
      `INSERT INTO agent_tasks (task_type, assigned_to, context_json, created_at, updated_at)
       VALUES ('triage', 'triage', '{"error": "Breaker is open"}', datetime('now'), datetime('now'))`
    ).run();
  }
}

/**
 * execSync stub: pipeline unit reports active, cron timer has a next trigger.
 * Any other command (including restarts) succeeds with empty output.
 */
function pipelineActiveExecSync(cmd) {
  if (cmd.includes('is-active') && cmd.includes('pipeline')) return 'active\n';
  if (cmd.includes('show mmo-cron.timer')) return 'NextElapseUSecMonotonic=1234\n';
  if (cmd.includes('ps aux')) return 'root 123\n';
  return '';
}

/**
 * execSync stub: everything healthy except the cron timer, which reports
 * NextElapseUSecMonotonic=infinity. Restart commands succeed with empty output.
 */
function deadTimerExecSync(cmd) {
  if (cmd.includes('is-active')) return 'active\n';
  if (cmd.includes('ps aux')) return 'root 123\n';
  if (cmd.includes('show mmo-cron.timer')) return 'NextElapseUSecMonotonic=infinity\n';
  return '';
}

// ── runProcessGuardian ───────────────────────────────────────────────────────

describe('runProcessGuardian — pipeline active (happy path)', () => {
  before(() => {
    clearTable('system_health');
    execSyncFn = happyExecSync;
    resetSchedulerStubs();
  });

  test('returns summary object with expected fields', async () => {
    const result = await runProcessGuardian();
    assert.equal(typeof result, 'object');
    assert.ok(result.checks_run >= 5, 'should run at least 5 checks');
    for (const field of ['ok', 'warnings', 'critical', 'duration_seconds']) {
      assert.ok(typeof result[field] === 'number', `${field} should be a number`);
    }
    assert.ok(Array.isArray(result.results));
  });

  test('writes system_health records to DB', async () => {
    clearTable('system_health');
    await runProcessGuardian();
    const rows = db.prepare('SELECT check_type FROM system_health').all();
    assert.ok(rows.length >= 5, `expected ≥5 health records, got ${rows.length}`);
    const types = rows.map(r => r.check_type);
    assert.ok(types.includes('pipeline_service'));
    assert.ok(types.includes('circuit_breaker'));
    assert.ok(types.includes('browser_loop_hung'));
    assert.ok(types.includes('cron_timer_dead'));
  });
});

// ── checkPipelineService ─────────────────────────────────────────────────────

describe('checkPipelineService — pipeline inactive → restart', () => {
  before(() => {
    clearTable('system_health');
    execSyncFn = cmd => {
      if (cmd.includes('is-active') && !cmd.includes('cron')) throwInactive();
      if (cmd.includes('ps aux')) return 'root 123 node\n';
      if (cmd.includes('show mmo-cron.timer')) return 'NextElapseUSecMonotonic=1234\n';
      // `restart --no-block 333method-pipeline` falls through: success, no output.
      return '';
    };
    getSkipStagesFn = () => new Set();
  });

  test('restarts pipeline and logs restarted_pipeline action', async () => {
    await runProcessGuardian();
    const row = db
      .prepare(
        "SELECT * FROM system_health WHERE check_type='pipeline_service' ORDER BY id DESC LIMIT 1"
      )
      .get();
    assert.ok(row, 'should have a pipeline_service health record');
    assert.ok(
      row.action_taken?.includes('restart') || row.action_taken?.includes('crash'),
      `action should involve restart: ${row.action_taken}`
    );
  });
});

describe('checkPipelineService — cooldown active (recent restart in last 3min)', () => {
  before(() => {
    clearTable('system_health');
    // Insert a recent restart record
    db.prepare(
      `INSERT INTO system_health (check_type, status, action_taken, created_at)
       VALUES ('pipeline_service', 'warning', 'restarted_pipeline', datetime('now', '-1 minute'))`
    ).run();

    execSyncFn = cmd => {
      if (cmd.includes('is-active') && !cmd.includes('cron')) throwInactive();
      if (cmd.includes('show mmo-cron.timer')) return 'NextElapseUSecMonotonic=1234\n';
      if (cmd.includes('ps aux')) return 'root 123 node\n';
      return '';
    };
    getSkipStagesFn = () => new Set();
  });

  test('skips restart when cooldown active', async () => {
    await runProcessGuardian();
    const rows = db
      .prepare(
        "SELECT action_taken FROM system_health WHERE check_type='pipeline_service' ORDER BY id DESC LIMIT 3"
      )
      .all();
    const latestAction = rows[0]?.action_taken || '';
    assert.ok(
      latestAction.includes('cooldown') || latestAction.includes('restarted'),
      `expected cooldown skip or restart: ${latestAction}`
    );
  });
});

// ── checkCircuitBreaker ──────────────────────────────────────────────────────

describe('checkCircuitBreaker — critical when >3 breaker-open errors', () => {
  before(() => {
    clearTable('system_health');
    clearTable('agent_tasks');
    seedBreakerOpenErrors();

    execSyncFn = happyExecSync;
    getSkipStagesFn = () => new Set();
  });

  test('reports critical status for circuit_breaker check', async () => {
    const result = await runProcessGuardian();
    const cb = findCheckResult(result, 'circuit_breaker');
    assert.ok(cb, 'should have circuit_breaker result');
    assert.equal(cb.status, 'critical');
    assert.ok(cb.breaker_open_errors_last_hour >= 4);
  });
});

describe('checkCircuitBreaker — ok when ≤3 open errors', () => {
  before(() => {
    clearTable('system_health');
    clearTable('agent_tasks');
    execSyncFn = happyExecSync;
    getSkipStagesFn = () => new Set();
  });

  test('reports ok status when no breaker errors', async () => {
    const result = await runProcessGuardian();
    const cb = findCheckResult(result, 'circuit_breaker');
    assert.ok(cb, 'should have circuit_breaker result');
    assert.equal(cb.status, 'ok');
    assert.equal(cb.breaker_open_errors_last_hour, 0);
  });
});

// ── cleanExpiredRateLimits ───────────────────────────────────────────────────

describe('cleanExpiredRateLimits — with active rate limits', () => {
  before(() => {
    clearTable('system_health');
    execSyncFn = happyExecSync;
    getSkipStagesFn = () => new Set(['scoring', 'enrich']);
    getRateLimitStatusFn = () => [
      { api: 'zenrows', waitMinutes: 15 },
      { api: 'openrouter', waitMinutes: 5 },
    ];
  });

  // Do not let the fake rate limits leak into the suites that follow.
  after(() => resetSchedulerStubs());

  test('rate_limit_cleanup result has activeRateLimits=2', async () => {
    const result = await runProcessGuardian();
    const rl = findCheckResult(result, 'rate_limit_cleanup');
    assert.ok(rl, 'should have rate_limit_cleanup result');
    assert.equal(rl.activeRateLimits, 2);
    assert.equal(rl.status, 'ok');
  });
});

describe('cleanExpiredRateLimits — getSkipStages throws', () => {
  before(() => {
    clearTable('system_health');
    execSyncFn = happyExecSync;
    getSkipStagesFn = () => {
      throw new Error('rate-limits.json parse error');
    };
  });

  // A throwing stub must never outlive the suite that needs it.
  after(() => resetSchedulerStubs());

  test('returns warning status without throwing', async () => {
    const result = await runProcessGuardian();
    const rl = findCheckResult(result, 'rate_limit_cleanup');
    assert.ok(rl, 'should have rate_limit_cleanup result');
    assert.equal(rl.status, 'warning');
  });
});

// ── checkBrowserLoopHung ─────────────────────────────────────────────────────

describe('checkBrowserLoopHung — no pipeline_control data', () => {
  before(() => {
    clearTable('system_health');
    clearTable('pipeline_control');
    execSyncFn = happyExecSync;
    getSkipStagesFn = () => new Set();
    delete process.env.ENABLE_VISION;
  });

  test('returns ok when no pipeline_control row exists', async () => {
    const result = await runProcessGuardian();
    const br = findCheckResult(result, 'browser_loop_hung');
    assert.ok(br, 'should have browser_loop_hung result');
    assert.equal(br.status, 'ok');
    assert.equal(br.browserAgeMin, null);
  });
});

describe('checkBrowserLoopHung — ENABLE_VISION=false skips check', () => {
  before(() => {
    clearTable('system_health');
    setLoopAges(60, 5);
    process.env.ENABLE_VISION = 'false';
    execSyncFn = happyExecSync;
    getSkipStagesFn = () => new Set();
  });

  after(() => {
    delete process.env.ENABLE_VISION;
  });

  test('skips browser hung check when ENABLE_VISION=false', async () => {
    const result = await runProcessGuardian();
    const br = findCheckResult(result, 'browser_loop_hung');
    assert.ok(br, 'should have browser_loop_hung result');
    assert.equal(br.status, 'ok');
    assert.equal(br.actionTaken, 'skipped_vision_disabled');
  });
});

describe('checkBrowserLoopHung — browser hung + API active → restart', () => {
  before(() => {
    clearTable('system_health');
    setLoopAges(60, 3);
    delete process.env.ENABLE_VISION;
    execSyncFn = pipelineActiveExecSync;
    getSkipStagesFn = () => new Set();
  });

  test('restarts pipeline when browser loop is hung', async () => {
    const result = await runProcessGuardian();
    const br = findCheckResult(result, 'browser_loop_hung');
    assert.ok(br, 'should have browser_loop_hung result');
    assert.ok(
      br.status === 'warning' || br.actionTaken?.includes('restarted'),
      `expected warning or restart: ${br.status} / ${br.actionTaken}`
    );
  });
});

describe('checkBrowserLoopHung — hung but cooldown active', () => {
  before(() => {
    clearTable('system_health');
    // A browser-hung restart recorded 5 minutes ago.
    db.prepare(
      `INSERT INTO system_health (check_type, status, action_taken, created_at)
       VALUES ('browser_loop_hung', 'warning', 'restarted_pipeline_browser_hung_50min', datetime('now', '-5 minutes'))`
    ).run();
    setLoopAges(60, 2);
    delete process.env.ENABLE_VISION;
    execSyncFn = pipelineActiveExecSync;
    getSkipStagesFn = () => new Set();
  });

  test('skips restart when cooldown active', async () => {
    const result = await runProcessGuardian();
    const br = findCheckResult(result, 'browser_loop_hung');
    assert.ok(br, 'should have browser_loop_hung result');
    assert.ok(br.actionTaken?.includes('cooldown'), `expected cooldown: ${br.actionTaken}`);
  });
});

describe('checkBrowserLoopHung — browser ok (recent cycle)', () => {
  before(() => {
    clearTable('system_health');
    setLoopAges(5, 2);
    delete process.env.ENABLE_VISION;
    execSyncFn = pipelineActiveExecSync;
    getSkipStagesFn = () => new Set();
  });

  test('returns ok when browser cycle is recent', async () => {
    const result = await runProcessGuardian();
    const br = findCheckResult(result, 'browser_loop_hung');
    assert.ok(br, 'should have browser_loop_hung result');
    assert.equal(br.status, 'ok');
  });
});

// ── checkCronTimer ───────────────────────────────────────────────────────────

describe('checkCronTimer — timer has next trigger (ok)', () => {
  before(() => {
    clearTable('system_health');
    execSyncFn = happyExecSync;
    getSkipStagesFn = () => new Set();
  });

  test('returns ok status for cron_timer_dead check', async () => {
    const result = await runProcessGuardian();
    const cr = findCheckResult(result, 'cron_timer_dead');
    assert.ok(cr, 'should have cron_timer_dead result');
    assert.equal(cr.status, 'ok');
    assert.equal(cr.actionTaken, null);
  });
});

describe('checkCronTimer — timer shows infinity → restart', () => {
  before(() => {
    clearTable('system_health');
    // `restart --no-block mmo-cron.timer` succeeds via the stub's '' default.
    execSyncFn = deadTimerExecSync;
    getSkipStagesFn = () => new Set();
  });

  test('restarts cron timer when it shows infinity', async () => {
    const result = await runProcessGuardian();
    const cr = findCheckResult(result, 'cron_timer_dead');
    assert.ok(cr, 'should have cron_timer_dead result');
    assert.equal(cr.status, 'warning');
    assert.equal(cr.actionTaken, 'restarted_cron_timer');
  });
});

describe('checkCronTimer — infinity but cooldown active', () => {
  before(() => {
    clearTable('system_health');
    // A timer restart recorded 2 minutes ago.
    db.prepare(
      `INSERT INTO system_health (check_type, status, action_taken, created_at)
       VALUES ('cron_timer_dead', 'warning', 'restarted_cron_timer', datetime('now', '-2 minutes'))`
    ).run();
    execSyncFn = deadTimerExecSync;
    getSkipStagesFn = () => new Set();
  });

  test('skips restart when cooldown active', async () => {
    const result = await runProcessGuardian();
    const cr = findCheckResult(result, 'cron_timer_dead');
    assert.ok(cr, 'should have cron_timer_dead result');
    assert.equal(cr.actionTaken, 'cooldown_skip_cron_timer');
  });
});

// ── checkClearanceCycle ──────────────────────────────────────────────────────

describe('checkClearanceCycle — was running, now stopped', () => {
  before(() => {
    clearTable('system_health');
    db.prepare(
      `INSERT INTO system_health (check_type, status, details, created_at)
       VALUES ('clearance_cycle', 'ok', '{"clearance_running":true}', datetime('now', '-1 minute'))`
    ).run();
    execSyncFn = cmd => {
      if (cmd.includes('is-active')) return 'active\n';
      if (cmd.includes('ps aux')) return 'root 123 node\n'; // no clearance script
      if (cmd.includes('show mmo-cron.timer')) return 'NextElapseUSecMonotonic=1234\n';
      // `restart --no-block 333method-pipeline` falls through: success, no output.
      return '';
    };
    getSkipStagesFn = () => new Set();
  });

  test('restarts pipeline after clearance cycle completion', async () => {
    const result = await runProcessGuardian();
    const cl = findCheckResult(result, 'clearance_cycle');
    assert.ok(cl, 'should have clearance_cycle result');
    assert.ok(
      cl.actionTaken === 'restarted_pipeline_after_clearance' || cl.status === 'ok',
      `got action: ${cl.actionTaken}, status: ${cl.status}`
    );
  });
});

// ── cleanupOldRecords ────────────────────────────────────────────────────────

describe('cleanupOldRecords — probabilistic housekeeping', () => {
  test('does not throw when called via runProcessGuardian many times', async () => {
    clearTable('system_health');
    execSyncFn = happyExecSync;
    getSkipStagesFn = () => new Set();
    // Run 3 times — at 10% probability, cleanup may or may not fire
    for (let i = 0; i < 3; i++) {
      await assert.doesNotReject(() => runProcessGuardian());
    }
  });
});

// ── Additional branch coverage tests ─────────────────────────────────────────

describe('checkPipelineService — crash loop escalation (>10min stuck)', () => {
  before(() => {
    clearTable('system_health');
    clearTable('agent_tasks');
    // Insert restart records 5-25 minutes ago (none within last 3min, so cooldown won't trigger).
    // MIN(created_at) will be ~25min ago → minutesStuck > 10 → crash loop path.
    seedStaleRestartHistory();

    execSyncFn = cmd => {
      if (cmd.includes('is-active') && !cmd.includes('cron')) throwInactive();
      if (cmd.includes('journalctl')) return 'some startup log\n';
      if (cmd.includes('show mmo-cron.timer')) return 'NextElapseUSecMonotonic=1234\n';
      if (cmd.includes('ps aux')) return 'root 123 node\n';
      // `restart --no-block 333method-pipeline` falls through: success, no output.
      return '';
    };
    getSkipStagesFn = () => new Set();
  });

  test('escalates to triage task and marks status critical', async () => {
    const result = await runProcessGuardian();
    const ps = findCheckResult(result, 'pipeline_service');
    assert.ok(ps, 'should have pipeline_service result');
    assert.equal(ps.status, 'critical');
    assert.ok(
      ps.actionTaken?.includes('crash_loop_escalated') || ps.actionTaken?.includes('restarted'),
      `expected crash_loop_escalated in actionTaken: ${ps.actionTaken}`
    );
    // A triage task should have been inserted
    const task = db
      .prepare("SELECT * FROM agent_tasks WHERE task_type='classify_error' LIMIT 1")
      .get();
    assert.ok(task, 'should have created a triage agent_task');
  });
});

describe('checkPipelineService — crash loop escalation, journalctl throws', () => {
  before(() => {
    clearTable('system_health');
    clearTable('agent_tasks');
    seedStaleRestartHistory();

    execSyncFn = cmd => {
      if (cmd.includes('is-active') && !cmd.includes('cron')) throwInactive();
      if (cmd.includes('journalctl')) throw new Error('journalctl not available');
      if (cmd.includes('show mmo-cron.timer')) return 'NextElapseUSecMonotonic=1234\n';
      if (cmd.includes('ps aux')) return 'root 123 node\n';
      // `restart --no-block 333method-pipeline` falls through: success, no output.
      return '';
    };
    getSkipStagesFn = () => new Set();
  });

  test('handles journalctl failure gracefully (startupError stays null)', async () => {
    const result = await runProcessGuardian();
    const ps = findCheckResult(result, 'pipeline_service');
    assert.ok(ps, 'should have pipeline_service result');
    // Should still escalate even without journalctl output
    assert.ok(
      ps.status === 'critical' || ps.actionTaken?.includes('restart'),
      `expected critical/restart: ${ps.status} / ${ps.actionTaken}`
    );
  });
});

describe('checkPipelineService — restart command fails', () => {
  before(() => {
    clearTable('system_health');
    execSyncFn = cmd => {
      if (cmd.includes('is-active') && !cmd.includes('cron')) throwInactive();
      if (cmd.includes('restart --no-block 333method-pipeline')) {
        throw new Error('Failed to connect to bus: No such file or directory');
      }
      if (cmd.includes('show mmo-cron.timer')) return 'NextElapseUSecMonotonic=1234\n';
      if (cmd.includes('ps aux')) return 'root 123 node\n';
      return '';
    };
    getSkipStagesFn = () => new Set();
  });

  test('marks status critical when restart command fails', async () => {
    const result = await runProcessGuardian();
    const ps = findCheckResult(result, 'pipeline_service');
    assert.ok(ps, 'should have pipeline_service result');
    assert.equal(ps.status, 'critical');
    assert.ok(
      ps.actionTaken?.includes('restart_failed'),
      `expected restart_failed: ${ps.actionTaken}`
    );
  });
});

describe('checkClearanceCycle — ps aux throws (clearanceRunning defaults false)', () => {
  before(() => {
    clearTable('system_health');
    execSyncFn = cmd => {
      if (cmd.includes('ps aux')) throw new Error('ps not found');
      if (cmd.includes('is-active')) return 'active\n';
      if (cmd.includes('show mmo-cron.timer')) return 'NextElapseUSecMonotonic=1234\n';
      return '';
    };
    getSkipStagesFn = () => new Set();
  });

  test('returns ok and records clearance_running=false when ps throws', async () => {
    const result = await runProcessGuardian();
    const cl = findCheckResult(result, 'clearance_cycle');
    assert.ok(cl, 'should have clearance_cycle result');
    assert.equal(cl.status, 'ok');
    assert.equal(cl.clearanceRunning, false);
  });
});

describe('checkClearanceCycle — malformed JSON details in last health row', () => {
  before(() => {
    clearTable('system_health');
    db.prepare(
      `INSERT INTO system_health (check_type, status, details, created_at)
       VALUES ('clearance_cycle', 'ok', 'not-valid-json{{', datetime('now', '-1 minute'))`
    ).run();
    execSyncFn = cmd => {
      if (cmd.includes('ps aux')) return 'root 123 node\n';
      if (cmd.includes('is-active')) return 'active\n';
      if (cmd.includes('show mmo-cron.timer')) return 'NextElapseUSecMonotonic=1234\n';
      return '';
    };
    getSkipStagesFn = () => new Set();
  });

  test('handles JSON parse error gracefully (wasRunning defaults false)', async () => {
    const result = await runProcessGuardian();
    const cl = findCheckResult(result, 'clearance_cycle');
    assert.ok(cl, 'should have clearance_cycle result');
    assert.equal(cl.status, 'ok');
    // wasRunning was false (parse error), clearanceRunning is false → no restart
    assert.equal(cl.actionTaken, null);
  });
});

describe('checkClearanceCycle — restart after clearance fails', () => {
  before(() => {
    clearTable('system_health');
    db.prepare(
      `INSERT INTO system_health (check_type, status, details, created_at)
       VALUES ('clearance_cycle', 'ok', '{"clearance_running":true}', datetime('now', '-1 minute'))`
    ).run();
    execSyncFn = cmd => {
      if (cmd.includes('ps aux')) return 'root 123 node\n'; // clearance not running
      if (cmd.includes('is-active')) return 'active\n';
      if (cmd.includes('show mmo-cron.timer')) return 'NextElapseUSecMonotonic=1234\n';
      if (cmd.includes('restart --no-block 333method-pipeline')) {
        throw new Error('systemctl restart failed');
      }
      return '';
    };
    getSkipStagesFn = () => new Set();
  });

  test('marks status warning when clearance restart fails', async () => {
    const result = await runProcessGuardian();
    const cl = findCheckResult(result, 'clearance_cycle');
    assert.ok(cl, 'should have clearance_cycle result');
    assert.equal(cl.status, 'warning');
    assert.ok(
      cl.actionTaken?.includes('clearance_restart_failed'),
      `expected clearance_restart_failed: ${cl.actionTaken}`
    );
  });
});

describe('checkBrowserLoopHung — systemctl is-active throws (falls back to timestamp)', () => {
  before(() => {
    clearTable('system_health');
    // browser stale 60min, API stale 5min (within 60min threshold) — hung scenario
    setLoopAges(60, 5);
    delete process.env.ENABLE_VISION;
    execSyncFn = cmd => {
      if (cmd.includes('is-active') && cmd.includes('pipeline')) {
        throw new Error('D-Bus not available');
      }
      if (cmd.includes('show mmo-cron.timer')) return 'NextElapseUSecMonotonic=1234\n';
      if (cmd.includes('ps aux')) return 'root 123\n';
      // `restart --no-block 333method-pipeline` falls through: success, no output.
      return '';
    };
    getSkipStagesFn = () => new Set();
  });

  test('falls back to timestamp check when systemctl throws, restarts if hung', async () => {
    const result = await runProcessGuardian();
    const br = findCheckResult(result, 'browser_loop_hung');
    assert.ok(br, 'should have browser_loop_hung result');
    // API within 60min threshold (5min) + browser stale 60min > 45min threshold → restart
    assert.ok(
      br.status === 'warning' || br.actionTaken?.includes('restarted'),
      `expected warning/restart: ${br.status} / ${br.actionTaken}`
    );
  });
});

describe('checkBrowserLoopHung — browser hung restart fails', () => {
  before(() => {
    clearTable('system_health');
    setLoopAges(60, 3);
    delete process.env.ENABLE_VISION;
    execSyncFn = cmd => {
      if (cmd.includes('is-active') && cmd.includes('pipeline')) return 'active\n';
      if (cmd.includes('show mmo-cron.timer')) return 'NextElapseUSecMonotonic=1234\n';
      if (cmd.includes('ps aux')) return 'root 123\n';
      if (cmd.includes('restart --no-block 333method-pipeline')) {
        throw new Error('restart failed: no bus');
      }
      return '';
    };
    getSkipStagesFn = () => new Set();
  });

  test('marks status critical when browser-hung restart fails', async () => {
    const result = await runProcessGuardian();
    const br = findCheckResult(result, 'browser_loop_hung');
    assert.ok(br, 'should have browser_loop_hung result');
    assert.equal(br.status, 'critical');
    assert.ok(
      br.actionTaken?.includes('browser_hung_restart_failed'),
      `expected browser_hung_restart_failed: ${br.actionTaken}`
    );
  });
});

// Title corrected: nothing in this suite makes the DB query throw — it seeds a
// pipeline_control row whose loop timestamps are both NULL.
describe('checkBrowserLoopHung — pipeline_control row with NULL timestamps', () => {
  before(() => {
    clearTable('system_health');
    // Insert a pipeline_control row with NULL timestamps
    db.prepare(
      `INSERT OR REPLACE INTO pipeline_control (key, value, last_browser_loop_at, last_api_loop_at)
       VALUES ('singleton', '1', NULL, NULL)`
    ).run();
    delete process.env.ENABLE_VISION;
    execSyncFn = cmd => {
      if (cmd.includes('is-active')) return 'active\n';
      if (cmd.includes('show mmo-cron.timer')) return 'NextElapseUSecMonotonic=1234\n';
      if (cmd.includes('ps aux')) return 'root 123\n';
      return '';
    };
    getSkipStagesFn = () => new Set();
  });

  test('returns ok when pipeline_control has NULL timestamps', async () => {
    const result = await runProcessGuardian();
    const br = result.results?.find(r => r.check === 'browser_loop_hung');
    assert.ok(br, 'should have browser_loop_hung result');
    // NULL timestamps → julianday(null) = null → age computed as 0 → ok (not hung)
    assert.equal(br.status, 'ok');
    assert.ok(
      br.browserAgeMin === null || br.browserAgeMin === 0,
      `expected null or 0 browserAgeMin, got ${br.browserAgeMin}`
    );
  });
});

describe('checkCronTimer — timer restart fails', () => {
  before(() => {
    clearTable('system_health');
    execSyncFn = cmd => {
      if (cmd.includes('is-active')) return 'active\n';
      if (cmd.includes('ps aux')) return 'root 123\n';
      if (cmd.includes('show mmo-cron.timer')) return 'NextElapseUSecMonotonic=infinity\n';
      if (cmd.includes('restart --no-block mmo-cron.timer')) {
        throw new Error('D-Bus connection failed');
      }
      return '';
    };
    getSkipStagesFn = () => new Set();
  });

  test('marks status critical when cron timer restart fails', async () => {
    const result = await runProcessGuardian();
    const cr = result.results?.find(r => r.check === 'cron_timer_dead');
    assert.ok(cr, 'should have cron_timer_dead result');
    assert.equal(cr.status, 'critical');
    assert.ok(
      cr.actionTaken?.includes('cron_timer_restart_failed'),
      `expected cron_timer_restart_failed: ${cr.actionTaken}`
    );
  });
});

describe('checkCronTimer — systemctl show throws (outer catch)', () => {
  before(() => {
    clearTable('system_health');
    execSyncFn = cmd => {
      if (cmd.includes('is-active')) return 'active\n';
      if (cmd.includes('ps aux')) return 'root 123\n';
      if (cmd.includes('show mmo-cron.timer')) throw new Error('systemctl not available');
      return '';
    };
    getSkipStagesFn = () => new Set();
  });

  test('returns ok when systemctl show throws (outer catch swallows error)', async () => {
    const result = await runProcessGuardian();
    const cr = result.results?.find(r => r.check === 'cron_timer_dead');
    assert.ok(cr, 'should have cron_timer_dead result');
    assert.equal(cr.status, 'ok');
    assert.equal(cr.actionTaken, null);
  });
});

describe('cleanupOldRecords — actually deletes old rows (covers line 510-511)', () => {
  test('logs when old records are deleted', async () => {
    clearTable('system_health');
    // Insert records that are older than 7 days
    db.prepare(
      `INSERT INTO system_health (check_type, status, details, action_taken, created_at)
       VALUES ('pipeline_service', 'ok', '{}', NULL, datetime('now', '-8 days'))`
    ).run();
    db.prepare(
      `INSERT INTO system_health (check_type, status, details, action_taken, created_at)
       VALUES ('circuit_breaker', 'ok', '{}', NULL, datetime('now', '-10 days'))`
    ).run();

    // Force cleanup to always run by patching Math.random temporarily
    const origRandom = Math.random;
    Math.random = () => 0.05; // below 0.1 threshold
    execSyncFn = happyExecSync;
    getSkipStagesFn = () => new Set();
    try {
      await assert.doesNotReject(() => runProcessGuardian());
    } finally {
      // Always restore the real RNG, even if the run above rejects.
      Math.random = origRandom;
    }

    // After cleanup, old records should be gone
    const oldRows = db
      .prepare(
        "SELECT count(*) as cnt FROM system_health WHERE created_at < datetime('now', '-7 days')"
      )
      .get();
    assert.equal(oldRows.cnt, 0, 'old records should have been deleted');
  });
});

describe('writeStatusFile — pipeline stopped + circuit breaker errors in status file', () => {
  before(() => {
    clearTable('system_health');
    clearTable('agent_tasks');
    // Insert 4 breaker-open errors so cbErrors > 3
    for (let i = 0; i < 4; i++) {
      db.prepare(
        `INSERT INTO agent_tasks (task_type, assigned_to, context_json, created_at, updated_at)
         VALUES ('triage', 'triage', '{"error": "Breaker is open"}', datetime('now'), datetime('now'))`
      ).run();
    }
    execSyncFn = cmd => {
      if (cmd.includes('is-active') && !cmd.includes('cron')) {
        const err = new Error('inactive');
        err.stdout = 'inactive\n';
        throw err;
      }
      if (cmd.includes('show mmo-cron.timer')) return 'NextElapseUSecMonotonic=1234\n';
      if (cmd.includes('ps aux')) return 'root 123 node\n';
      // `restart --no-block 333method-pipeline` falls through: success, no output.
      return '';
    };
    getSkipStagesFn = () => new Set();
    getRateLimitStatusFn = () => [];
  });

  test('writes status file with stopped pipeline and circuit breaker errors', async () => {
    const { readFileSync, rmSync } = await import('node:fs');
    const statusPath = '/tmp/watchdog-status.txt';
    // Remove any file left by an earlier run so the assertions below can only
    // pass on content written by THIS invocation, never on stale output.
    rmSync(statusPath, { force: true });
    await runProcessGuardian();
    const content = readFileSync(statusPath, 'utf8');
    assert.ok(content.includes('Process Guardian'), 'should have header');
    assert.ok(
      content.includes('stopped') || content.includes('Pipeline'),
      'should mention pipeline state'
    );
  });
});

describe('checkBrowserLoopHung — API stale + pipeline inactive → apiActive false (no restart)', () => {
  before(() => {
    clearTable('system_health');
    // Browser stale 60min (> 45min threshold) but API also stale 90min (> 60min) + pipeline inactive
    // → apiActive=false → no restart needed
    db.prepare(
      `INSERT OR REPLACE INTO pipeline_control (key, value, last_browser_loop_at, last_api_loop_at)
       VALUES ('singleton', '1', datetime('now', '-60 minutes'), datetime('now', '-90 minutes'))`
    ).run();
    delete process.env.ENABLE_VISION;
    execSyncFn = cmd => {
      if (cmd.includes('is-active') && cmd.includes('pipeline')) {
        const err = new Error('inactive');
        err.stdout = 'inactive\n';
        throw err;
      }
      if (cmd.includes('show mmo-cron.timer')) return 'NextElapseUSecMonotonic=1234\n';
      if (cmd.includes('ps aux')) return 'root 123\n';
      // `restart --no-block 333method-pipeline` falls through: success, no output.
      return '';
    };
    getSkipStagesFn = () => new Set();
  });

  test('returns ok when both loops are stale (apiActive=false)', async () => {
    const result = await runProcessGuardian();
    const br = result.results?.find(r => r.check === 'browser_loop_hung');
    assert.ok(br, 'should have browser_loop_hung result');
    assert.equal(br.status, 'ok');
    assert.equal(br.actionTaken, null);
  });
});