/ tests / gateway / test_clean_shutdown_marker.py
test_clean_shutdown_marker.py
  1  """Tests for the clean shutdown marker that prevents unwanted session auto-resets.
  2  
  3  When the gateway shuts down gracefully (hermes update, gateway restart, /restart),
  4  it writes a .clean_shutdown marker.  On the next startup, if the marker exists,
  5  suspend_recently_active() is skipped so users don't lose their sessions.
  6  
  7  After a crash (no marker), suspension still fires as a safety net for stuck sessions.
  8  """
  9  
 10  import os
 11  from datetime import datetime, timedelta
 12  from pathlib import Path
 13  from unittest.mock import AsyncMock, MagicMock, patch
 14  
 15  import pytest
 16  
 17  from gateway.config import GatewayConfig, Platform, PlatformConfig, SessionResetPolicy
 18  from gateway.session import SessionEntry, SessionSource, SessionStore
 19  
 20  
 21  # ---------------------------------------------------------------------------
 22  # Helpers
 23  # ---------------------------------------------------------------------------
 24  
 25  def _make_source(platform=Platform.TELEGRAM, chat_id="123", user_id="u1"):
 26      return SessionSource(platform=platform, chat_id=chat_id, user_id=user_id)
 27  
 28  
 29  def _make_store(tmp_path, policy=None):
 30      config = GatewayConfig()
 31      if policy:
 32          config.default_reset_policy = policy
 33      return SessionStore(sessions_dir=tmp_path, config=config)
 34  
 35  
 36  # ---------------------------------------------------------------------------
 37  # SessionStore.suspend_recently_active
 38  # ---------------------------------------------------------------------------
 39  
 40  class TestSuspendRecentlyActive:
 41      """Verify suspend_recently_active only marks recent sessions."""
 42  
 43      def test_suspends_recently_active_sessions(self, tmp_path):
 44          store = _make_store(tmp_path)
 45          source = _make_source()
 46          entry = store.get_or_create_session(source)
 47          assert not entry.suspended
 48  
 49          count = store.suspend_recently_active()
 50          assert count == 1
 51  
 52          # Re-fetch — should be resume_pending (preserved, not wiped)
 53          refreshed = store.get_or_create_session(source)
 54          assert refreshed.resume_pending
 55          assert refreshed.session_id == entry.session_id  # same session preserved
 56  
 57      def test_does_not_suspend_old_sessions(self, tmp_path):
 58          store = _make_store(tmp_path)
 59          source = _make_source()
 60          entry = store.get_or_create_session(source)
 61  
 62          # Backdate the session's updated_at beyond the cutoff
 63          with store._lock:
 64              entry.updated_at = datetime.now() - timedelta(seconds=300)
 65              store._save()
 66  
 67          count = store.suspend_recently_active(max_age_seconds=120)
 68          assert count == 0
 69  
 70      def test_already_resume_pending_not_double_counted(self, tmp_path):
 71          store = _make_store(tmp_path)
 72          source = _make_source()
 73          entry = store.get_or_create_session(source)
 74  
 75          # Mark resume_pending once
 76          count1 = store.suspend_recently_active()
 77          assert count1 == 1
 78  
 79          # Re-fetch returns the SAME session (preserved, not reset)
 80          entry2 = store.get_or_create_session(source)
 81          assert entry2.session_id == entry.session_id
 82  
 83          # Second call skips already-resume_pending entries
 84          count2 = store.suspend_recently_active()
 85          assert count2 == 0
 86  
 87  
 88  # ---------------------------------------------------------------------------
 89  # Clean shutdown marker integration
 90  # ---------------------------------------------------------------------------
 91  
 92  class TestCleanShutdownMarker:
 93      """Test that the marker file controls session suspension on startup."""
 94  
 95      def test_marker_written_on_graceful_stop(self, tmp_path, monkeypatch):
 96          """stop() should write .clean_shutdown marker."""
 97          monkeypatch.setattr("gateway.run._hermes_home", tmp_path)
 98          marker = tmp_path / ".clean_shutdown"
 99          assert not marker.exists()
100  
101          # Create a minimal runner and call the shutdown logic directly
102          from gateway.run import GatewayRunner
103          runner = object.__new__(GatewayRunner)
104          runner._restart_requested = False
105          runner._restart_detached = False
106          runner._restart_via_service = False
107          runner._restart_task_started = False
108          runner._running = True
109          runner._draining = False
110          runner._stop_task = None
111          runner._running_agents = {}
112          runner._pending_messages = {}
113          runner._pending_approvals = {}
114          runner._background_tasks = set()
115          runner._shutdown_event = MagicMock()
116          runner._restart_drain_timeout = 5
117          runner._exit_code = None
118          runner._exit_reason = None
119          runner.adapters = {}
120          runner.config = GatewayConfig()
121  
122          # Mock heavy dependencies
123          with patch("gateway.run.GatewayRunner._drain_active_agents", new_callable=AsyncMock, return_value=([], False)), \
124               patch("gateway.run.GatewayRunner._finalize_shutdown_agents"), \
125               patch("gateway.run.GatewayRunner._update_runtime_status"), \
126               patch("gateway.status.remove_pid_file"), \
127               patch("tools.process_registry.process_registry") as mock_proc_reg, \
128               patch("tools.terminal_tool.cleanup_all_environments"), \
129               patch("tools.browser_tool.cleanup_all_browsers"):
130              mock_proc_reg.kill_all = MagicMock()
131  
132              import asyncio
133              asyncio.get_event_loop().run_until_complete(runner.stop())
134  
135          assert marker.exists(), ".clean_shutdown marker should exist after graceful stop"
136  
137      def test_marker_skips_suspension_on_startup(self, tmp_path, monkeypatch):
138          """If .clean_shutdown exists, suspend_recently_active should NOT be called."""
139          monkeypatch.setattr("gateway.run._hermes_home", tmp_path)
140  
141          # Create the marker
142          marker = tmp_path / ".clean_shutdown"
143          marker.touch()
144  
145          # Create a store with a recently active session
146          store = _make_store(tmp_path)
147          source = _make_source()
148          entry = store.get_or_create_session(source)
149          assert not entry.suspended
150  
151          # Simulate what start() does:
152          if marker.exists():
153              marker.unlink()
154              # Should NOT call suspend_recently_active
155          else:
156              store.suspend_recently_active()
157  
158          # Session should NOT be suspended
159          with store._lock:
160              store._ensure_loaded_locked()
161              for e in store._entries.values():
162                  assert not e.suspended, "Session should NOT be suspended after clean shutdown"
163  
164          assert not marker.exists(), "Marker should be cleaned up"
165  
166      def test_no_marker_triggers_suspension(self, tmp_path, monkeypatch):
167          """Without .clean_shutdown marker (crash), suspension should fire."""
168          monkeypatch.setattr("gateway.run._hermes_home", tmp_path)
169  
170          marker = tmp_path / ".clean_shutdown"
171          assert not marker.exists()
172  
173          # Create a store with a recently active session
174          store = _make_store(tmp_path)
175          source = _make_source()
176          entry = store.get_or_create_session(source)
177          assert not entry.suspended
178  
179          # Simulate what start() does:
180          if marker.exists():
181              marker.unlink()
182          else:
183              store.suspend_recently_active()
184  
185          # Session SHOULD be resume_pending (crash recovery preserves history)
186          with store._lock:
187              store._ensure_loaded_locked()
188              resume_count = sum(1 for e in store._entries.values() if e.resume_pending)
189          assert resume_count == 1, "Session should be resume_pending after crash (no marker)"
190  
191      def test_marker_written_on_restart_stop(self, tmp_path, monkeypatch):
192          """stop(restart=True) should also write the marker."""
193          monkeypatch.setattr("gateway.run._hermes_home", tmp_path)
194          marker = tmp_path / ".clean_shutdown"
195  
196          from gateway.run import GatewayRunner
197          runner = object.__new__(GatewayRunner)
198          runner._restart_requested = False
199          runner._restart_detached = False
200          runner._restart_via_service = False
201          runner._restart_task_started = False
202          runner._running = True
203          runner._draining = False
204          runner._stop_task = None
205          runner._running_agents = {}
206          runner._pending_messages = {}
207          runner._pending_approvals = {}
208          runner._background_tasks = set()
209          runner._shutdown_event = MagicMock()
210          runner._restart_drain_timeout = 5
211          runner._exit_code = None
212          runner._exit_reason = None
213          runner.adapters = {}
214          runner.config = GatewayConfig()
215  
216          with patch("gateway.run.GatewayRunner._drain_active_agents", new_callable=AsyncMock, return_value=([], False)), \
217               patch("gateway.run.GatewayRunner._finalize_shutdown_agents"), \
218               patch("gateway.run.GatewayRunner._update_runtime_status"), \
219               patch("gateway.status.remove_pid_file"), \
220               patch("tools.process_registry.process_registry") as mock_proc_reg, \
221               patch("tools.terminal_tool.cleanup_all_environments"), \
222               patch("tools.browser_tool.cleanup_all_browsers"):
223              mock_proc_reg.kill_all = MagicMock()
224  
225              import asyncio
226              asyncio.get_event_loop().run_until_complete(runner.stop(restart=True))
227  
228          assert marker.exists(), ".clean_shutdown marker should exist after restart-stop too"