/ tests / hermes_cli / test_update_stale_dashboard.py
test_update_stale_dashboard.py
  1  """Tests for the stale-dashboard handling run at the end of ``hermes update``.
  2  
  3  ``hermes update`` detects ``hermes dashboard`` processes left over from the
  4  previous version and kills them (SIGTERM + SIGKILL grace, or ``taskkill /F``
  5  on Windows).  Without this, the running backend silently serves stale Python
  6  against a freshly-updated JS bundle, producing 401s / empty data.
  7  
  8  History:
  9  - #16872 introduced the warn-only helper (``_warn_stale_dashboard_processes``).
 10  - #17049 fixed a Windows wmic UnicodeDecodeError crash on non-UTF-8 locales.
 11  - This file now also covers the kill semantics that replaced the warning.
 12  """
 13  
 14  from __future__ import annotations
 15  
 16  import importlib
 17  import os
 18  import sys
 19  from unittest.mock import patch, MagicMock, call
 20  
 21  import pytest
 22  
 23  from hermes_cli.main import (
 24      _find_stale_dashboard_pids,
 25      _kill_stale_dashboard_processes,
 26      _warn_stale_dashboard_processes,  # back-compat alias
 27  )
 28  
 29  
 30  @pytest.fixture(autouse=True)
 31  def _refresh_bindings_against_live_module():
 32      """Rebind module-level names to the *current* ``hermes_cli.main``.
 33  
 34      Other tests in the suite (notably ``test_env_loader.py`` and
 35      ``test_skills_subparser.py``) reload or delete ``hermes_cli.main`` from
 36      ``sys.modules``.  When that happens on the same xdist worker before we
 37      run, our top-of-file ``from hermes_cli.main import ...`` bindings end
 38      up pointing at the *old* module object.  ``patch(\"hermes_cli.main.X\")``
 39      then patches the *new* module, but the function we call still resolves
 40      ``_find_stale_dashboard_pids`` via its stale ``__globals__``, so every
 41      patch becomes a no-op and the kill path silently returns early.
 42  
 43      Refreshing the bindings (and the patch target) to the live module
 44      object — and keeping them consistent — makes the tests immune to
 45      ordering within the worker.  The fix lives in the test module because
 46      the two pollutants above are load-bearing for their own tests.
 47      """
 48      global _find_stale_dashboard_pids
 49      global _kill_stale_dashboard_processes
 50      global _warn_stale_dashboard_processes
 51  
 52      live = sys.modules.get("hermes_cli.main")
 53      if live is None:
 54          live = importlib.import_module("hermes_cli.main")
 55  
 56      _find_stale_dashboard_pids = live._find_stale_dashboard_pids
 57      _kill_stale_dashboard_processes = live._kill_stale_dashboard_processes
 58      _warn_stale_dashboard_processes = live._warn_stale_dashboard_processes
 59      yield
 60  
 61  
 62  def _ps_line(pid: int, cmd: str) -> str:
 63      """Format a line as it would appear in ``ps -A -o pid=,command=`` output."""
 64      return f"{pid:>7} {cmd}"
 65  
 66  
 67  def _ps_runner(stdout: str):
 68      """Build a subprocess.run side_effect that only stubs ps -A calls.
 69  
 70      Any other subprocess.run invocation (e.g. taskkill on Windows) is
 71      handed back as a successful no-op.  This lets tests exercise the real
 72      scan path without having to re-stub every unrelated subprocess call
 73      made later in ``_kill_stale_dashboard_processes``.
 74      """
 75      def _side_effect(args, *a, **kw):
 76          if isinstance(args, (list, tuple)) and args and args[0] == "ps":
 77              return MagicMock(returncode=0, stdout=stdout, stderr="")
 78          # Any other subprocess.run (e.g. taskkill) — benign success stub.
 79          return MagicMock(returncode=0, stdout="", stderr="")
 80      return _side_effect
 81  
 82  
 83  class TestFindStaleDashboardPids:
 84      """Unit tests for the ps/wmic-based detection step."""
 85  
 86      def test_no_matches_returns_empty(self):
 87          with patch("subprocess.run") as mock_run:
 88              mock_run.return_value = MagicMock(
 89                  returncode=0,
 90                  stdout=_ps_line(111, "/usr/bin/python3 -m some.other.module")
 91                  + "\n"
 92                  + _ps_line(222, "/usr/bin/bash")
 93                  + "\n",
 94                  stderr="",
 95              )
 96              assert _find_stale_dashboard_pids() == []
 97  
 98      def test_matches_running_dashboard(self):
 99          with patch("subprocess.run") as mock_run:
100              mock_run.return_value = MagicMock(
101                  returncode=0,
102                  stdout=_ps_line(12345, "python3 -m hermes_cli.main dashboard --port 9119") + "\n",
103                  stderr="",
104              )
105              assert _find_stale_dashboard_pids() == [12345]
106  
107      def test_multiple_matches(self):
108          with patch("subprocess.run") as mock_run:
109              mock_run.return_value = MagicMock(
110                  returncode=0,
111                  stdout="\n".join([
112                      _ps_line(12345, "python3 -m hermes_cli.main dashboard --port 9119"),
113                      _ps_line(12346, "hermes dashboard --port 9120 --no-open"),
114                      _ps_line(12347, "python /home/x/hermes_cli/main.py dashboard"),
115                  ]) + "\n",
116                  stderr="",
117              )
118              assert sorted(_find_stale_dashboard_pids()) == [12345, 12346, 12347]
119  
120      def test_self_pid_excluded(self):
121          with patch("subprocess.run") as mock_run:
122              mock_run.return_value = MagicMock(
123                  returncode=0,
124                  stdout="\n".join([
125                      _ps_line(os.getpid(), "python3 -m hermes_cli.main dashboard"),
126                      _ps_line(12345, "hermes dashboard --port 9119"),
127                  ]) + "\n",
128                  stderr="",
129              )
130              pids = _find_stale_dashboard_pids()
131          assert os.getpid() not in pids
132          assert 12345 in pids
133  
134      def test_ps_not_found_returns_empty(self):
135          with patch("subprocess.run", side_effect=FileNotFoundError):
136              assert _find_stale_dashboard_pids() == []
137  
138      def test_ps_timeout_returns_empty(self):
139          import subprocess as sp
140          with patch("subprocess.run", side_effect=sp.TimeoutExpired("ps", 10)):
141              assert _find_stale_dashboard_pids() == []
142  
143      def test_unrelated_process_containing_word_dashboard_not_matched(self):
144          """Guards against greedy pgrep-style matching catching chat sessions
145          or unrelated processes whose cmdline happens to contain 'dashboard'.
146          """
147          with patch("subprocess.run") as mock_run:
148              mock_run.return_value = MagicMock(
149                  returncode=0,
150                  stdout="\n".join([
151                      _ps_line(12345, "python3 -m hermes_cli.main dashboard --port 9119"),
152                      _ps_line(22222, "python3 -m hermes_cli.main chat -q 'rewrite my dashboard'"),
153                      _ps_line(33333, "node /opt/grafana/dashboard-server.js"),
154                  ]) + "\n",
155                  stderr="",
156              )
157              pids = _find_stale_dashboard_pids()
158          assert pids == [12345]
159  
160      def test_grep_lines_ignored(self):
161          with patch("subprocess.run") as mock_run:
162              mock_run.return_value = MagicMock(
163                  returncode=0,
164                  stdout="\n".join([
165                      _ps_line(99999, "grep hermes dashboard"),
166                      _ps_line(12345, "hermes dashboard --port 9119"),
167                  ]) + "\n",
168                  stderr="",
169              )
170              pids = _find_stale_dashboard_pids()
171          assert 99999 not in pids
172          assert 12345 in pids
173  
174      def test_invalid_pid_lines_skipped(self):
175          with patch("subprocess.run") as mock_run:
176              mock_run.return_value = MagicMock(
177                  returncode=0,
178                  stdout="\n".join([
179                      "notapid hermes dashboard --bad",
180                      _ps_line(12345, "hermes dashboard --port 9119"),
181                      "   ",
182                  ]) + "\n",
183                  stderr="",
184              )
185              pids = _find_stale_dashboard_pids()
186          assert pids == [12345]
187  
188  
189  @pytest.mark.skipif(sys.platform == "win32", reason="POSIX kill semantics")
190  class TestKillStaleDashboardPosix:
191      """Kill path on Linux / macOS: SIGTERM then SIGKILL any survivors."""
192  
193      def test_no_stale_processes_is_a_noop(self, capsys):
194          with patch("hermes_cli.main._find_stale_dashboard_pids", return_value=[]):
195              _kill_stale_dashboard_processes()
196          assert capsys.readouterr().out == ""
197  
198      def test_sigterm_graceful_exit(self, capsys):
199          """Processes that exit on SIGTERM (the probe gets ProcessLookupError)
200          are reported as stopped and SIGKILL is never sent."""
201          import signal as _signal
202  
203          killed_signals: list[tuple[int, int]] = []
204  
205          def fake_kill(pid, sig):
206              killed_signals.append((pid, sig))
207              if sig == 0:
208                  # Probe after SIGTERM → "process gone".
209                  raise ProcessLookupError
210              # SIGTERM itself: succeed silently.
211  
212          with patch("hermes_cli.main._find_stale_dashboard_pids",
213                     return_value=[12345, 12346]), \
214               patch("os.kill", side_effect=fake_kill), \
215               patch("time.sleep"):
216              _kill_stale_dashboard_processes()
217  
218          # Both got SIGTERM.
219          sigterms = [pid for pid, sig in killed_signals if sig == _signal.SIGTERM]
220          assert sorted(sigterms) == [12345, 12346]
221          # No SIGKILL was needed.
222          assert not any(sig == _signal.SIGKILL for _, sig in killed_signals)
223  
224          out = capsys.readouterr().out
225          assert "Stopping 2 dashboard" in out
226          assert "✓ stopped PID 12345" in out
227          assert "✓ stopped PID 12346" in out
228          assert "Restart the dashboard" in out
229  
230      def test_sigkill_fallback_for_survivors(self, capsys):
231          """If a process survives SIGTERM + the grace window, SIGKILL is sent."""
232          import signal as _signal
233  
234          sent: list[tuple[int, int]] = []
235  
236          def fake_kill(pid, sig):
237              sent.append((pid, sig))
238              # Simulate stubborn process: probe (sig 0) always succeeds,
239              # SIGTERM does nothing, SIGKILL is where it "dies".
240              if sig in (_signal.SIGTERM, 0, _signal.SIGKILL):
241                  return
242              # Any other signal — also fine.
243  
244          with patch("hermes_cli.main._find_stale_dashboard_pids",
245                     return_value=[99999]), \
246               patch("os.kill", side_effect=fake_kill), \
247               patch("time.sleep"), \
248               patch("time.monotonic", side_effect=[0.0] + [10.0] * 20):
249              # monotonic jumps past the 3s deadline on the second read so the
250              # grace loop exits immediately after one iteration.
251              _kill_stale_dashboard_processes()
252  
253          signals_sent = [sig for _, sig in sent]
254          assert _signal.SIGTERM in signals_sent
255          assert _signal.SIGKILL in signals_sent
256  
257          out = capsys.readouterr().out
258          assert "✓ stopped PID 99999" in out
259  
260      def test_permission_error_is_reported_not_raised(self, capsys):
261          """os.kill raising PermissionError (e.g. another user's process)
262          must not abort hermes update — it's reported as a failure and we
263          move on."""
264          def fake_kill(pid, sig):
265              raise PermissionError("Operation not permitted")
266  
267          with patch("hermes_cli.main._find_stale_dashboard_pids",
268                     return_value=[12345]), \
269               patch("os.kill", side_effect=fake_kill), \
270               patch("time.sleep"):
271              _kill_stale_dashboard_processes()  # must not raise
272  
273          out = capsys.readouterr().out
274          assert "✗ failed to stop PID 12345" in out
275          assert "Operation not permitted" in out
276  
277      def test_process_already_gone_counts_as_stopped(self, capsys):
278          """ProcessLookupError on the initial SIGTERM means the process
279          already exited between detection and the kill — treat as success."""
280          def fake_kill(pid, sig):
281              raise ProcessLookupError
282  
283          with patch("hermes_cli.main._find_stale_dashboard_pids",
284                     return_value=[12345]), \
285               patch("os.kill", side_effect=fake_kill), \
286               patch("time.sleep"):
287              _kill_stale_dashboard_processes()
288  
289          out = capsys.readouterr().out
290          assert "✓ stopped PID 12345" in out
291          assert "failed to stop" not in out
292  
293  
294  class TestKillStaleDashboardWindows:
295      """Kill path on Windows: taskkill /F."""
296  
297      def test_taskkill_invoked_for_each_pid(self, monkeypatch, capsys):
298          monkeypatch.setattr(sys, "platform", "win32")
299  
300          def fake_run(args, *a, **kw):
301              # taskkill returns 0 on success
302              return MagicMock(returncode=0, stdout="", stderr="")
303  
304          with patch("hermes_cli.main._find_stale_dashboard_pids",
305                     return_value=[12345, 12346]), \
306               patch("subprocess.run", side_effect=fake_run) as mock_run:
307              _kill_stale_dashboard_processes()
308  
309          # Each PID triggered a taskkill /PID <n> /F invocation.
310          taskkill_calls = [
311              c for c in mock_run.call_args_list
312              if c.args and isinstance(c.args[0], list) and c.args[0][:1] == ["taskkill"]
313          ]
314          assert len(taskkill_calls) == 2
315          assert ["taskkill", "/PID", "12345", "/F"] in [c.args[0] for c in taskkill_calls]
316          assert ["taskkill", "/PID", "12346", "/F"] in [c.args[0] for c in taskkill_calls]
317  
318          out = capsys.readouterr().out
319          assert "✓ stopped PID 12345" in out
320          assert "✓ stopped PID 12346" in out
321  
322      def test_taskkill_failure_is_reported(self, monkeypatch, capsys):
323          monkeypatch.setattr(sys, "platform", "win32")
324  
325          def fake_run(args, *a, **kw):
326              return MagicMock(returncode=128, stdout="",
327                               stderr="ERROR: Access is denied.")
328  
329          with patch("hermes_cli.main._find_stale_dashboard_pids",
330                     return_value=[12345]), \
331               patch("subprocess.run", side_effect=fake_run):
332              _kill_stale_dashboard_processes()  # must not raise
333  
334          out = capsys.readouterr().out
335          assert "✗ failed to stop PID 12345" in out
336          assert "Access is denied" in out
337  
338  
339  class TestBackCompatAlias:
340      """``_warn_stale_dashboard_processes`` is kept as an alias for the
341      new kill function so old imports don't break."""
342  
343      def test_alias_is_the_kill_function(self):
344          assert _warn_stale_dashboard_processes is _kill_stale_dashboard_processes
345  
346  
347  class TestWindowsWmicEncoding:
348      """Regression tests for #17049 — the Windows wmic branch must not crash
349      `hermes update` on non-UTF-8 system locales (e.g. cp936 on zh-CN).
350      """
351  
352      def test_wmic_invoked_with_utf8_ignore_errors(self, monkeypatch):
353          """The wmic subprocess.run call must pass encoding='utf-8' and
354          errors='ignore' so the subprocess reader thread cannot raise
355          UnicodeDecodeError on non-UTF-8 wmic output."""
356          monkeypatch.setattr(sys, "platform", "win32")
357          with patch("subprocess.run") as mock_run:
358              mock_run.return_value = MagicMock(
359                  returncode=0,
360                  stdout=(
361                      "CommandLine=python -m hermes_cli.main dashboard\n"
362                      "ProcessId=12345\n"
363                  ),
364                  stderr="",
365              )
366              _find_stale_dashboard_pids()
367  
368          # The wmic call is the first subprocess.run invocation.
369          assert mock_run.called, "subprocess.run was not invoked"
370          wmic_call = mock_run.call_args_list[0]
371          kwargs = wmic_call.kwargs
372          assert kwargs.get("encoding") == "utf-8", (
373              "encoding kwarg must be 'utf-8' so wmic output is decoded "
374              "deterministically rather than via the implicit reader-thread "
375              "default that crashes on non-UTF-8 locales (#17049)."
376          )
377          assert kwargs.get("errors") == "ignore", (
378              "errors kwarg must be 'ignore' so undecodable bytes don't take "
379              "down the reader thread (#17049)."
380          )
381  
382      def test_wmic_returns_none_stdout_does_not_crash(self, monkeypatch):
383          """If subprocess.run returns successfully but stdout is None — which
384          is what Python 3.11 leaves behind when the reader thread silently
385          crashed on UnicodeDecodeError before this fix landed — detection
386          must short-circuit instead of raising AttributeError on
387          ``None.split('\\n')`` and aborting `hermes update` (#17049)."""
388          monkeypatch.setattr(sys, "platform", "win32")
389          with patch("subprocess.run") as mock_run:
390              mock_run.return_value = MagicMock(
391                  returncode=0, stdout=None, stderr=""
392              )
393              # Must not raise.
394              assert _find_stale_dashboard_pids() == []