/ tests / gateway / test_stale_code_self_check.py
test_stale_code_self_check.py
  1  """Tests for the gateway stale-code self-check (Issue #17648).
  2  
  3  A gateway that survives ``hermes update`` keeps pre-update modules cached
  4  in ``sys.modules``.  Later imports of names added post-update (e.g.
  5  ``cfg_get`` from PR #17304) raise ImportError against the stale module
  6  object.  The self-check in ``GatewayRunner._detect_stale_code()`` detects
  7  this by comparing boot-time sentinel-file mtimes against current ones,
  8  and ``_trigger_stale_code_restart()`` triggers a graceful restart.
  9  """
 10  
 11  import os
 12  import time
 13  from pathlib import Path
 14  from unittest.mock import MagicMock, patch
 15  
 16  import pytest
 17  
 18  from gateway.run import (
 19      GatewayRunner,
 20      _compute_repo_mtime,
 21      _STALE_CODE_SENTINELS,
 22  )
 23  
 24  
 25  def _make_tmp_repo(tmp_path: Path) -> Path:
 26      """Create a fake repo with all stale-code sentinel files."""
 27      for rel in _STALE_CODE_SENTINELS:
 28          p = tmp_path / rel
 29          p.parent.mkdir(parents=True, exist_ok=True)
 30          p.write_text("# test sentinel\n")
 31      return tmp_path
 32  
 33  
 34  def _make_runner(repo_root: Path, *, boot_mtime: float, boot_wall: float):
 35      """Bare GatewayRunner with just the stale-check attributes set."""
 36      runner = object.__new__(GatewayRunner)
 37      runner._repo_root_for_staleness = repo_root
 38      runner._boot_wall_time = boot_wall
 39      runner._boot_repo_mtime = boot_mtime
 40      runner._stale_code_notified = set()
 41      runner._stale_code_restart_triggered = False
 42      return runner
 43  
 44  
 45  def test_compute_repo_mtime_returns_newest(tmp_path):
 46      """_compute_repo_mtime returns the newest mtime across sentinel files."""
 47      repo = _make_tmp_repo(tmp_path)
 48  
 49      # Stamp a baseline mtime across all sentinels
 50      baseline = time.time() - 100
 51      for rel in _STALE_CODE_SENTINELS:
 52          os.utime(repo / rel, (baseline, baseline))
 53  
 54      # Touch one file forward
 55      newer = time.time()
 56      os.utime(repo / "hermes_cli/config.py", (newer, newer))
 57  
 58      result = _compute_repo_mtime(repo)
 59      assert abs(result - newer) < 1.0  # within 1s (filesystem mtime resolution)
 60  
 61  
 62  def test_compute_repo_mtime_missing_files_returns_zero(tmp_path):
 63      """Missing sentinel files return 0.0 (treated as 'can't tell' upstream)."""
 64      # tmp_path has none of the sentinels
 65      assert _compute_repo_mtime(tmp_path) == 0.0
 66  
 67  
 68  def test_compute_repo_mtime_partial_files_still_works(tmp_path):
 69      """Partial sentinel presence still returns newest of the readable ones."""
 70      (tmp_path / "hermes_cli").mkdir()
 71      target = tmp_path / "hermes_cli" / "config.py"
 72      target.write_text("# partial\n")
 73      target_mtime = time.time() - 50
 74      os.utime(target, (target_mtime, target_mtime))
 75  
 76      result = _compute_repo_mtime(tmp_path)
 77      assert abs(result - target_mtime) < 1.0
 78  
 79  
 80  def test_detect_stale_code_false_when_no_boot_snapshot(tmp_path):
 81      """No boot snapshot → can't tell → not stale (no restart loop)."""
 82      repo = _make_tmp_repo(tmp_path)
 83      runner = _make_runner(repo, boot_mtime=0.0, boot_wall=0.0)
 84      assert runner._detect_stale_code() is False
 85  
 86  
 87  def test_detect_stale_code_false_when_files_unchanged(tmp_path):
 88      """Source files at boot mtime → not stale."""
 89      repo = _make_tmp_repo(tmp_path)
 90      # Freeze all sentinels to the same mtime
 91      baseline = time.time() - 100
 92      for rel in _STALE_CODE_SENTINELS:
 93          os.utime(repo / rel, (baseline, baseline))
 94  
 95      runner = _make_runner(repo, boot_mtime=baseline, boot_wall=baseline)
 96      assert runner._detect_stale_code() is False
 97  
 98  
 99  def test_detect_stale_code_true_after_update(tmp_path):
100      """Sentinel files newer than boot snapshot → stale."""
101      repo = _make_tmp_repo(tmp_path)
102      baseline = time.time() - 100
103      for rel in _STALE_CODE_SENTINELS:
104          os.utime(repo / rel, (baseline, baseline))
105  
106      runner = _make_runner(repo, boot_mtime=baseline, boot_wall=baseline)
107  
108      # Simulate hermes update touching config.py
109      new_mtime = time.time()
110      os.utime(repo / "hermes_cli/config.py", (new_mtime, new_mtime))
111  
112      assert runner._detect_stale_code() is True
113  
114  
def test_detect_stale_code_ignores_subsecond_drift(tmp_path):
    """Small mtime drift (inside the 2-second slack) must not count as stale."""
    repo = _make_tmp_repo(tmp_path)

    boot_stamp = time.time() - 100
    for rel in _STALE_CODE_SENTINELS:
        os.utime(repo / rel, (boot_stamp, boot_stamp))

    runner = _make_runner(repo, boot_mtime=boot_stamp, boot_wall=boot_stamp)
    config_file = repo / "hermes_cli/config.py"

    # +1s is inside the slack window, so the code is still considered fresh.
    slightly_newer = boot_stamp + 1.0
    os.utime(config_file, (slightly_newer, slightly_newer))
    assert runner._detect_stale_code() is False

    # +5s is outside the slack window, so the code is now considered stale.
    clearly_newer = boot_stamp + 5.0
    os.utime(config_file, (clearly_newer, clearly_newer))
    assert runner._detect_stale_code() is True
131  
132  
def test_trigger_stale_code_restart_is_idempotent(tmp_path):
    """Repeated _trigger_stale_code_restart calls request a single restart."""
    runner = _make_runner(_make_tmp_repo(tmp_path), boot_mtime=1.0, boot_wall=1.0)

    recorded = []

    def fake_request_restart(*, detached=False, via_service=False):
        # Record each invocation so the test can count restart requests.
        recorded.append((detached, via_service))
        return True

    runner.request_restart = fake_request_restart

    # Fire the trigger three times; only the first should reach request_restart.
    for _ in range(3):
        runner._trigger_stale_code_restart()

    assert len(recorded) == 1
    assert runner._stale_code_restart_triggered is True
152  
153  
def test_trigger_stale_code_restart_survives_request_failure(tmp_path):
    """A raising request_restart is swallowed and the trigger flag is still set."""
    runner = _make_runner(_make_tmp_repo(tmp_path), boot_mtime=1.0, boot_wall=1.0)

    def failing_request_restart(*, detached=False, via_service=False):
        raise RuntimeError("no event loop")

    runner.request_restart = failing_request_restart

    # The RuntimeError must not propagate out of the trigger.
    runner._trigger_stale_code_restart()

    # The flag stays set so later messages do not retry the restart.
    triggered = runner._stale_code_restart_triggered
    assert triggered is True
169  
170  
def test_detect_stale_code_handles_disappearing_repo_root(tmp_path):
    """Sentinels vanishing after boot yields False (no restart loop).

    The "disappearing repo root" is simulated by deleting every sentinel
    file; the directory itself is left in place.
    """
    repo = _make_tmp_repo(tmp_path)

    boot_stamp = time.time() - 100
    boot_times = (boot_stamp, boot_stamp)
    for rel in _STALE_CODE_SENTINELS:
        os.utime(repo / rel, boot_times)

    runner = _make_runner(repo, boot_mtime=boot_stamp, boot_wall=boot_stamp)

    # With no sentinels left, _compute_repo_mtime has nothing to read (0.0).
    for rel in _STALE_CODE_SENTINELS:
        sentinel = repo / rel
        sentinel.unlink(missing_ok=True)

    verdict = runner._detect_stale_code()
    assert verdict is False
185  
186  
def test_class_level_defaults_prevent_uninitialized_access():
    """A runner built via object.__new__ must not crash _detect_stale_code."""
    bare_runner = object.__new__(GatewayRunner)
    # Only the repo root is set on the instance; every other stale-check
    # attribute is expected to come from class-level defaults.
    bare_runner._repo_root_for_staleness = Path(".")

    # _boot_wall_time / _boot_repo_mtime resolve to the class defaults (0.0).
    verdict = bare_runner._detect_stale_code()
    assert verdict is False

    # _stale_code_restart_triggered resolves to the class default (False).
    triggered = bare_runner._stale_code_restart_triggered
    assert triggered is False
196  
197  
def test_init_captures_boot_snapshot(monkeypatch, tmp_path):
    """The stale-code boot snapshot assignments produce a usable baseline.

    NOTE(review): this test does not call ``GatewayRunner.__init__``.  It
    replays the stale-check assignments by hand on a bare instance, so it
    only proves those assignments work against a stubbed
    ``_compute_repo_mtime`` -- presumably they mirror what ``__init__``
    does; confirm against ``gateway.run``.
    """
    from gateway import run as run_mod

    seen = {}

    def fake_compute(repo_root):
        # Remember which root was asked for and return a fixed mtime.
        seen["repo_root"] = repo_root
        return 1234567890.0

    monkeypatch.setattr(run_mod, "_compute_repo_mtime", fake_compute)

    # Skip the heavy parts of __init__: start from a bare instance and
    # apply only the stale-check setup steps.
    runner = object.__new__(GatewayRunner)
    runner._boot_wall_time = time.time()
    package_root = Path(run_mod.__file__).resolve().parent.parent
    runner._repo_root_for_staleness = package_root
    runner._boot_repo_mtime = run_mod._compute_repo_mtime(
        runner._repo_root_for_staleness
    )
    runner._stale_code_notified = set()
    runner._stale_code_restart_triggered = False

    assert runner._boot_repo_mtime == 1234567890.0
    assert seen["repo_root"] == runner._repo_root_for_staleness
    assert runner._boot_wall_time > 0