# test_stale_code_self_check.py
"""Tests for the gateway stale-code self-check (Issue #17648).

When a gateway process outlives ``hermes update``, the modules it loaded
before the update stay cached in ``sys.modules``.  Importing a name that
only exists post-update (e.g. ``cfg_get`` from PR #17304) then fails with
ImportError against the stale module object.

``GatewayRunner._detect_stale_code()`` spots this situation by comparing
the sentinel-file mtimes recorded at boot with the current ones, and
``_trigger_stale_code_restart()`` asks for a graceful restart.
"""

import os
import time
from pathlib import Path
from unittest.mock import MagicMock, patch

import pytest

from gateway.run import (
    GatewayRunner,
    _compute_repo_mtime,
    _STALE_CODE_SENTINELS,
)


def _make_tmp_repo(tmp_path: Path) -> Path:
    """Populate *tmp_path* with every stale-code sentinel file and return it."""
    for sentinel in _STALE_CODE_SENTINELS:
        sentinel_path = tmp_path / sentinel
        # Sentinels may live in nested packages; create the parents first.
        sentinel_path.parent.mkdir(parents=True, exist_ok=True)
        sentinel_path.write_text("# test sentinel\n")
    return tmp_path


def _make_runner(repo_root: Path, *, boot_mtime: float, boot_wall: float):
    """Return a GatewayRunner built without __init__, carrying only stale-check state."""
    runner = object.__new__(GatewayRunner)
    stale_check_state = {
        "_repo_root_for_staleness": repo_root,
        "_boot_wall_time": boot_wall,
        "_boot_repo_mtime": boot_mtime,
        "_stale_code_notified": set(),
        "_stale_code_restart_triggered": False,
    }
    for attr_name, attr_value in stale_check_state.items():
        setattr(runner, attr_name, attr_value)
    return runner


def test_compute_repo_mtime_returns_newest(tmp_path):
    """The newest sentinel mtime is what _compute_repo_mtime reports."""
    repo = _make_tmp_repo(tmp_path)

    # Give every sentinel the same, older timestamp first.
    old_stamp = time.time() - 100
    for sentinel in _STALE_CODE_SENTINELS:
        os.utime(repo / sentinel, (old_stamp, old_stamp))

    # Then move a single file forward in time.
    fresh_stamp = time.time()
    os.utime(repo / "hermes_cli/config.py", (fresh_stamp, fresh_stamp))

    drift = abs(_compute_repo_mtime(repo) - fresh_stamp)
    # Allow up to 1s to absorb filesystem mtime resolution.
    assert drift < 1.0


def test_compute_repo_mtime_missing_files_returns_zero(tmp_path):
    """With no sentinels present the result is 0.0 ('can't tell' upstream)."""
    # tmp_path is deliberately left empty here.
    observed = _compute_repo_mtime(tmp_path)
    assert observed == 0.0


def test_compute_repo_mtime_partial_files_still_works(tmp_path):
    """A lone readable sentinel is enough to produce its mtime."""
    package_dir = tmp_path / "hermes_cli"
    package_dir.mkdir()
    lone_sentinel = package_dir / "config.py"
    lone_sentinel.write_text("# partial\n")

    expected_mtime = time.time() - 50
    os.utime(lone_sentinel, (expected_mtime, expected_mtime))

    drift = abs(_compute_repo_mtime(tmp_path) - expected_mtime)
    assert drift < 1.0


def test_detect_stale_code_false_when_no_boot_snapshot(tmp_path):
    """A zeroed boot snapshot means 'can't tell', so not stale (no restart loop)."""
    runner = _make_runner(_make_tmp_repo(tmp_path), boot_mtime=0.0, boot_wall=0.0)
    verdict = runner._detect_stale_code()
    assert verdict is False


def test_detect_stale_code_false_when_files_unchanged(tmp_path):
    """Sentinels still carrying the boot mtime are not reported as stale."""
    repo = _make_tmp_repo(tmp_path)

    # Pin every sentinel to one shared timestamp.
    boot_stamp = time.time() - 100
    for sentinel in _STALE_CODE_SENTINELS:
        os.utime(repo / sentinel, (boot_stamp, boot_stamp))

    runner = _make_runner(repo, boot_mtime=boot_stamp, boot_wall=boot_stamp)
    verdict = runner._detect_stale_code()
    assert verdict is False


def test_detect_stale_code_true_after_update(tmp_path):
    """A sentinel newer than the boot snapshot is reported as stale."""
    repo = _make_tmp_repo(tmp_path)
    boot_stamp = time.time() - 100
    for sentinel in _STALE_CODE_SENTINELS:
        os.utime(repo / sentinel, (boot_stamp, boot_stamp))

    runner = _make_runner(repo, boot_mtime=boot_stamp, boot_wall=boot_stamp)

    # Mimic ``hermes update`` rewriting config.py after boot.
    updated_stamp = time.time()
    os.utime(repo / "hermes_cli/config.py", (updated_stamp, updated_stamp))

    verdict = runner._detect_stale_code()
    assert verdict is True


def test_detect_stale_code_ignores_subsecond_drift(tmp_path):
    """The 2-second slack avoids false positives on coarse-mtime filesystems."""
    repo = _make_tmp_repo(tmp_path)
    boot_stamp = time.time() - 100
    for sentinel in _STALE_CODE_SENTINELS:
        os.utime(repo / sentinel, (boot_stamp, boot_stamp))

    runner = _make_runner(repo, boot_mtime=boot_stamp, boot_wall=boot_stamp)
    config_file = repo / "hermes_cli/config.py"

    # +1s stays inside the 2s slack (not stale); +5s is beyond it (stale).
    for offset, expected in ((1.0, False), (5.0, True)):
        touched = boot_stamp + offset
        os.utime(config_file, (touched, touched))
        assert runner._detect_stale_code() is expected


def test_trigger_stale_code_restart_is_idempotent(tmp_path):
    """Repeated _trigger_stale_code_restart calls request a restart only once."""
    runner = _make_runner(_make_tmp_repo(tmp_path), boot_mtime=1.0, boot_wall=1.0)

    restart_requests = []

    def recording_request_restart(*, detached=False, via_service=False):
        restart_requests.append((detached, via_service))
        return True

    runner.request_restart = recording_request_restart

    for _ in range(3):
        runner._trigger_stale_code_restart()

    assert len(restart_requests) == 1
    assert runner._stale_code_restart_triggered is True


def test_trigger_stale_code_restart_survives_request_failure(tmp_path):
    """A raising request_restart is swallowed and the trigger flag is still set."""
    runner = _make_runner(_make_tmp_repo(tmp_path), boot_mtime=1.0, boot_wall=1.0)

    def failing_request_restart(*, detached=False, via_service=False):
        raise RuntimeError("no event loop")

    runner.request_restart = failing_request_restart

    # The call itself must not propagate the RuntimeError.
    runner._trigger_stale_code_restart()

    # The flag is set regardless, so later messages don't retry the restart.
    assert runner._stale_code_restart_triggered is True


def test_detect_stale_code_handles_disappearing_repo_root(tmp_path):
    """Sentinels vanishing after boot yield False rather than a restart loop."""
    repo = _make_tmp_repo(tmp_path)
    boot_stamp = time.time() - 100
    for sentinel in _STALE_CODE_SENTINELS:
        os.utime(repo / sentinel, (boot_stamp, boot_stamp))

    runner = _make_runner(repo, boot_mtime=boot_stamp, boot_wall=boot_stamp)

    # With every sentinel deleted, _compute_repo_mtime returns 0.0.
    for sentinel in _STALE_CODE_SENTINELS:
        doomed = repo / sentinel
        doomed.unlink(missing_ok=True)

    verdict = runner._detect_stale_code()
    assert verdict is False


def test_class_level_defaults_prevent_uninitialized_access():
    """A runner made via object.__new__ must not crash _detect_stale_code."""
    bare_runner = object.__new__(GatewayRunner)
    # Only the repo root is assigned; everything else relies on class defaults.
    bare_runner._repo_root_for_staleness = Path(".")

    # _boot_wall_time / _boot_repo_mtime resolve to the class defaults (0.0).
    verdict = bare_runner._detect_stale_code()
    assert verdict is False

    # _stale_code_restart_triggered resolves to the class default (False).
    assert bare_runner._stale_code_restart_triggered is False


def test_init_captures_boot_snapshot(monkeypatch, tmp_path):
    """GatewayRunner.__init__ captures a usable stale-code baseline."""
    # The heavy parts of __init__ are not needed; the goal is only to show
    # that the stale-code snapshot gets captured.
    from gateway import run as run_mod

    fake_mtime = 1234567890.0
    seen = {}

    def fake_compute(repo_root):
        seen["repo_root"] = repo_root
        return fake_mtime

    monkeypatch.setattr(run_mod, "_compute_repo_mtime", fake_compute)

    # Skip the full __init__ and replay, by hand, the stale-check init
    # block that __init__ contains.
    runner = object.__new__(GatewayRunner)
    runner._boot_wall_time = time.time()
    module_path = Path(run_mod.__file__).resolve()
    runner._repo_root_for_staleness = module_path.parent.parent
    runner._boot_repo_mtime = run_mod._compute_repo_mtime(
        runner._repo_root_for_staleness
    )
    runner._stale_code_notified = set()
    runner._stale_code_restart_triggered = False

    assert runner._boot_repo_mtime == 1234567890.0
    assert seen["repo_root"] == runner._repo_root_for_staleness
    assert runner._boot_wall_time > 0