test_update_stale_dashboard.py
1 """Tests for the stale-dashboard handling run at the end of ``hermes update``. 2 3 ``hermes update`` detects ``hermes dashboard`` processes left over from the 4 previous version and kills them (SIGTERM + SIGKILL grace, or ``taskkill /F`` 5 on Windows). Without this, the running backend silently serves stale Python 6 against a freshly-updated JS bundle, producing 401s / empty data. 7 8 History: 9 - #16872 introduced the warn-only helper (``_warn_stale_dashboard_processes``). 10 - #17049 fixed a Windows wmic UnicodeDecodeError crash on non-UTF-8 locales. 11 - This file now also covers the kill semantics that replaced the warning. 12 """ 13 14 from __future__ import annotations 15 16 import importlib 17 import os 18 import sys 19 from unittest.mock import patch, MagicMock, call 20 21 import pytest 22 23 from hermes_cli.main import ( 24 _find_stale_dashboard_pids, 25 _kill_stale_dashboard_processes, 26 _warn_stale_dashboard_processes, # back-compat alias 27 ) 28 29 30 @pytest.fixture(autouse=True) 31 def _refresh_bindings_against_live_module(): 32 """Rebind module-level names to the *current* ``hermes_cli.main``. 33 34 Other tests in the suite (notably ``test_env_loader.py`` and 35 ``test_skills_subparser.py``) reload or delete ``hermes_cli.main`` from 36 ``sys.modules``. When that happens on the same xdist worker before we 37 run, our top-of-file ``from hermes_cli.main import ...`` bindings end 38 up pointing at the *old* module object. ``patch(\"hermes_cli.main.X\")`` 39 then patches the *new* module, but the function we call still resolves 40 ``_find_stale_dashboard_pids`` via its stale ``__globals__``, so every 41 patch becomes a no-op and the kill path silently returns early. 42 43 Refreshing the bindings (and the patch target) to the live module 44 object — and keeping them consistent — makes the tests immune to 45 ordering within the worker. The fix lives in the test module because 46 the two pollutants above are load-bearing for their own tests. 47 """ 48 global _find_stale_dashboard_pids 49 global _kill_stale_dashboard_processes 50 global _warn_stale_dashboard_processes 51 52 live = sys.modules.get("hermes_cli.main") 53 if live is None: 54 live = importlib.import_module("hermes_cli.main") 55 56 _find_stale_dashboard_pids = live._find_stale_dashboard_pids 57 _kill_stale_dashboard_processes = live._kill_stale_dashboard_processes 58 _warn_stale_dashboard_processes = live._warn_stale_dashboard_processes 59 yield 60 61 62 def _ps_line(pid: int, cmd: str) -> str: 63 """Format a line as it would appear in ``ps -A -o pid=,command=`` output.""" 64 return f"{pid:>7} {cmd}" 65 66 67 def _ps_runner(stdout: str): 68 """Build a subprocess.run side_effect that only stubs ps -A calls. 69 70 Any other subprocess.run invocation (e.g. taskkill on Windows) is 71 handed back as a successful no-op. This lets tests exercise the real 72 scan path without having to re-stub every unrelated subprocess call 73 made later in ``_kill_stale_dashboard_processes``. 74 """ 75 def _side_effect(args, *a, **kw): 76 if isinstance(args, (list, tuple)) and args and args[0] == "ps": 77 return MagicMock(returncode=0, stdout=stdout, stderr="") 78 # Any other subprocess.run (e.g. taskkill) — benign success stub. 79 return MagicMock(returncode=0, stdout="", stderr="") 80 return _side_effect 81 82 83 class TestFindStaleDashboardPids: 84 """Unit tests for the ps/wmic-based detection step.""" 85 86 def test_no_matches_returns_empty(self): 87 with patch("subprocess.run") as mock_run: 88 mock_run.return_value = MagicMock( 89 returncode=0, 90 stdout=_ps_line(111, "/usr/bin/python3 -m some.other.module") 91 + "\n" 92 + _ps_line(222, "/usr/bin/bash") 93 + "\n", 94 stderr="", 95 ) 96 assert _find_stale_dashboard_pids() == [] 97 98 def test_matches_running_dashboard(self): 99 with patch("subprocess.run") as mock_run: 100 mock_run.return_value = MagicMock( 101 returncode=0, 102 stdout=_ps_line(12345, "python3 -m hermes_cli.main dashboard --port 9119") + "\n", 103 stderr="", 104 ) 105 assert _find_stale_dashboard_pids() == [12345] 106 107 def test_multiple_matches(self): 108 with patch("subprocess.run") as mock_run: 109 mock_run.return_value = MagicMock( 110 returncode=0, 111 stdout="\n".join([ 112 _ps_line(12345, "python3 -m hermes_cli.main dashboard --port 9119"), 113 _ps_line(12346, "hermes dashboard --port 9120 --no-open"), 114 _ps_line(12347, "python /home/x/hermes_cli/main.py dashboard"), 115 ]) + "\n", 116 stderr="", 117 ) 118 assert sorted(_find_stale_dashboard_pids()) == [12345, 12346, 12347] 119 120 def test_self_pid_excluded(self): 121 with patch("subprocess.run") as mock_run: 122 mock_run.return_value = MagicMock( 123 returncode=0, 124 stdout="\n".join([ 125 _ps_line(os.getpid(), "python3 -m hermes_cli.main dashboard"), 126 _ps_line(12345, "hermes dashboard --port 9119"), 127 ]) + "\n", 128 stderr="", 129 ) 130 pids = _find_stale_dashboard_pids() 131 assert os.getpid() not in pids 132 assert 12345 in pids 133 134 def test_ps_not_found_returns_empty(self): 135 with patch("subprocess.run", side_effect=FileNotFoundError): 136 assert _find_stale_dashboard_pids() == [] 137 138 def test_ps_timeout_returns_empty(self): 139 import subprocess as sp 140 with patch("subprocess.run", side_effect=sp.TimeoutExpired("ps", 10)): 141 assert _find_stale_dashboard_pids() == [] 142 143 def test_unrelated_process_containing_word_dashboard_not_matched(self): 144 """Guards against greedy pgrep-style matching catching chat sessions 145 or unrelated processes whose cmdline happens to contain 'dashboard'. 146 """ 147 with patch("subprocess.run") as mock_run: 148 mock_run.return_value = MagicMock( 149 returncode=0, 150 stdout="\n".join([ 151 _ps_line(12345, "python3 -m hermes_cli.main dashboard --port 9119"), 152 _ps_line(22222, "python3 -m hermes_cli.main chat -q 'rewrite my dashboard'"), 153 _ps_line(33333, "node /opt/grafana/dashboard-server.js"), 154 ]) + "\n", 155 stderr="", 156 ) 157 pids = _find_stale_dashboard_pids() 158 assert pids == [12345] 159 160 def test_grep_lines_ignored(self): 161 with patch("subprocess.run") as mock_run: 162 mock_run.return_value = MagicMock( 163 returncode=0, 164 stdout="\n".join([ 165 _ps_line(99999, "grep hermes dashboard"), 166 _ps_line(12345, "hermes dashboard --port 9119"), 167 ]) + "\n", 168 stderr="", 169 ) 170 pids = _find_stale_dashboard_pids() 171 assert 99999 not in pids 172 assert 12345 in pids 173 174 def test_invalid_pid_lines_skipped(self): 175 with patch("subprocess.run") as mock_run: 176 mock_run.return_value = MagicMock( 177 returncode=0, 178 stdout="\n".join([ 179 "notapid hermes dashboard --bad", 180 _ps_line(12345, "hermes dashboard --port 9119"), 181 " ", 182 ]) + "\n", 183 stderr="", 184 ) 185 pids = _find_stale_dashboard_pids() 186 assert pids == [12345] 187 188 189 @pytest.mark.skipif(sys.platform == "win32", reason="POSIX kill semantics") 190 class TestKillStaleDashboardPosix: 191 """Kill path on Linux / macOS: SIGTERM then SIGKILL any survivors.""" 192 193 def test_no_stale_processes_is_a_noop(self, capsys): 194 with patch("hermes_cli.main._find_stale_dashboard_pids", return_value=[]): 195 _kill_stale_dashboard_processes() 196 assert capsys.readouterr().out == "" 197 198 def test_sigterm_graceful_exit(self, capsys): 199 """Processes that exit on SIGTERM (the probe gets ProcessLookupError) 200 are reported as stopped and SIGKILL is never sent.""" 201 import signal as _signal 202 203 killed_signals: list[tuple[int, int]] = [] 204 205 def fake_kill(pid, sig): 206 killed_signals.append((pid, sig)) 207 if sig == 0: 208 # Probe after SIGTERM → "process gone". 209 raise ProcessLookupError 210 # SIGTERM itself: succeed silently. 211 212 with patch("hermes_cli.main._find_stale_dashboard_pids", 213 return_value=[12345, 12346]), \ 214 patch("os.kill", side_effect=fake_kill), \ 215 patch("time.sleep"): 216 _kill_stale_dashboard_processes() 217 218 # Both got SIGTERM. 219 sigterms = [pid for pid, sig in killed_signals if sig == _signal.SIGTERM] 220 assert sorted(sigterms) == [12345, 12346] 221 # No SIGKILL was needed. 222 assert not any(sig == _signal.SIGKILL for _, sig in killed_signals) 223 224 out = capsys.readouterr().out 225 assert "Stopping 2 dashboard" in out 226 assert "✓ stopped PID 12345" in out 227 assert "✓ stopped PID 12346" in out 228 assert "Restart the dashboard" in out 229 230 def test_sigkill_fallback_for_survivors(self, capsys): 231 """If a process survives SIGTERM + the grace window, SIGKILL is sent.""" 232 import signal as _signal 233 234 sent: list[tuple[int, int]] = [] 235 236 def fake_kill(pid, sig): 237 sent.append((pid, sig)) 238 # Simulate stubborn process: probe (sig 0) always succeeds, 239 # SIGTERM does nothing, SIGKILL is where it "dies". 240 if sig in (_signal.SIGTERM, 0, _signal.SIGKILL): 241 return 242 # Any other signal — also fine. 243 244 with patch("hermes_cli.main._find_stale_dashboard_pids", 245 return_value=[99999]), \ 246 patch("os.kill", side_effect=fake_kill), \ 247 patch("time.sleep"), \ 248 patch("time.monotonic", side_effect=[0.0] + [10.0] * 20): 249 # monotonic jumps past the 3s deadline on the second read so the 250 # grace loop exits immediately after one iteration. 251 _kill_stale_dashboard_processes() 252 253 signals_sent = [sig for _, sig in sent] 254 assert _signal.SIGTERM in signals_sent 255 assert _signal.SIGKILL in signals_sent 256 257 out = capsys.readouterr().out 258 assert "✓ stopped PID 99999" in out 259 260 def test_permission_error_is_reported_not_raised(self, capsys): 261 """os.kill raising PermissionError (e.g. another user's process) 262 must not abort hermes update — it's reported as a failure and we 263 move on.""" 264 def fake_kill(pid, sig): 265 raise PermissionError("Operation not permitted") 266 267 with patch("hermes_cli.main._find_stale_dashboard_pids", 268 return_value=[12345]), \ 269 patch("os.kill", side_effect=fake_kill), \ 270 patch("time.sleep"): 271 _kill_stale_dashboard_processes() # must not raise 272 273 out = capsys.readouterr().out 274 assert "✗ failed to stop PID 12345" in out 275 assert "Operation not permitted" in out 276 277 def test_process_already_gone_counts_as_stopped(self, capsys): 278 """ProcessLookupError on the initial SIGTERM means the process 279 already exited between detection and the kill — treat as success.""" 280 def fake_kill(pid, sig): 281 raise ProcessLookupError 282 283 with patch("hermes_cli.main._find_stale_dashboard_pids", 284 return_value=[12345]), \ 285 patch("os.kill", side_effect=fake_kill), \ 286 patch("time.sleep"): 287 _kill_stale_dashboard_processes() 288 289 out = capsys.readouterr().out 290 assert "✓ stopped PID 12345" in out 291 assert "failed to stop" not in out 292 293 294 class TestKillStaleDashboardWindows: 295 """Kill path on Windows: taskkill /F.""" 296 297 def test_taskkill_invoked_for_each_pid(self, monkeypatch, capsys): 298 monkeypatch.setattr(sys, "platform", "win32") 299 300 def fake_run(args, *a, **kw): 301 # taskkill returns 0 on success 302 return MagicMock(returncode=0, stdout="", stderr="") 303 304 with patch("hermes_cli.main._find_stale_dashboard_pids", 305 return_value=[12345, 12346]), \ 306 patch("subprocess.run", side_effect=fake_run) as mock_run: 307 _kill_stale_dashboard_processes() 308 309 # Each PID triggered a taskkill /PID <n> /F invocation. 310 taskkill_calls = [ 311 c for c in mock_run.call_args_list 312 if c.args and isinstance(c.args[0], list) and c.args[0][:1] == ["taskkill"] 313 ] 314 assert len(taskkill_calls) == 2 315 assert ["taskkill", "/PID", "12345", "/F"] in [c.args[0] for c in taskkill_calls] 316 assert ["taskkill", "/PID", "12346", "/F"] in [c.args[0] for c in taskkill_calls] 317 318 out = capsys.readouterr().out 319 assert "✓ stopped PID 12345" in out 320 assert "✓ stopped PID 12346" in out 321 322 def test_taskkill_failure_is_reported(self, monkeypatch, capsys): 323 monkeypatch.setattr(sys, "platform", "win32") 324 325 def fake_run(args, *a, **kw): 326 return MagicMock(returncode=128, stdout="", 327 stderr="ERROR: Access is denied.") 328 329 with patch("hermes_cli.main._find_stale_dashboard_pids", 330 return_value=[12345]), \ 331 patch("subprocess.run", side_effect=fake_run): 332 _kill_stale_dashboard_processes() # must not raise 333 334 out = capsys.readouterr().out 335 assert "✗ failed to stop PID 12345" in out 336 assert "Access is denied" in out 337 338 339 class TestBackCompatAlias: 340 """``_warn_stale_dashboard_processes`` is kept as an alias for the 341 new kill function so old imports don't break.""" 342 343 def test_alias_is_the_kill_function(self): 344 assert _warn_stale_dashboard_processes is _kill_stale_dashboard_processes 345 346 347 class TestWindowsWmicEncoding: 348 """Regression tests for #17049 — the Windows wmic branch must not crash 349 `hermes update` on non-UTF-8 system locales (e.g. cp936 on zh-CN). 350 """ 351 352 def test_wmic_invoked_with_utf8_ignore_errors(self, monkeypatch): 353 """The wmic subprocess.run call must pass encoding='utf-8' and 354 errors='ignore' so the subprocess reader thread cannot raise 355 UnicodeDecodeError on non-UTF-8 wmic output.""" 356 monkeypatch.setattr(sys, "platform", "win32") 357 with patch("subprocess.run") as mock_run: 358 mock_run.return_value = MagicMock( 359 returncode=0, 360 stdout=( 361 "CommandLine=python -m hermes_cli.main dashboard\n" 362 "ProcessId=12345\n" 363 ), 364 stderr="", 365 ) 366 _find_stale_dashboard_pids() 367 368 # The wmic call is the first subprocess.run invocation. 369 assert mock_run.called, "subprocess.run was not invoked" 370 wmic_call = mock_run.call_args_list[0] 371 kwargs = wmic_call.kwargs 372 assert kwargs.get("encoding") == "utf-8", ( 373 "encoding kwarg must be 'utf-8' so wmic output is decoded " 374 "deterministically rather than via the implicit reader-thread " 375 "default that crashes on non-UTF-8 locales (#17049)." 376 ) 377 assert kwargs.get("errors") == "ignore", ( 378 "errors kwarg must be 'ignore' so undecodable bytes don't take " 379 "down the reader thread (#17049)." 380 ) 381 382 def test_wmic_returns_none_stdout_does_not_crash(self, monkeypatch): 383 """If subprocess.run returns successfully but stdout is None — which 384 is what Python 3.11 leaves behind when the reader thread silently 385 crashed on UnicodeDecodeError before this fix landed — detection 386 must short-circuit instead of raising AttributeError on 387 ``None.split('\\n')`` and aborting `hermes update` (#17049).""" 388 monkeypatch.setattr(sys, "platform", "win32") 389 with patch("subprocess.run") as mock_run: 390 mock_run.return_value = MagicMock( 391 returncode=0, stdout=None, stderr="" 392 ) 393 # Must not raise. 394 assert _find_stale_dashboard_pids() == []