# test_clean_shutdown_marker.py
"""Tests for the clean shutdown marker that prevents unwanted session auto-resets.

When the gateway shuts down gracefully (hermes update, gateway restart, /restart),
it writes a .clean_shutdown marker.  On the next startup, if the marker exists,
suspend_recently_active() is skipped so users don't lose their sessions.

After a crash (no marker), suspension still fires as a safety net for stuck sessions.
"""

import asyncio
import os
from contextlib import ExitStack
from datetime import datetime, timedelta
from pathlib import Path
from unittest.mock import AsyncMock, MagicMock, patch

import pytest

from gateway.config import GatewayConfig, Platform, PlatformConfig, SessionResetPolicy
from gateway.session import SessionEntry, SessionSource, SessionStore


# Name of the marker file the tests look for inside the (patched) hermes home.
_CLEAN_SHUTDOWN_MARKER = ".clean_shutdown"


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _make_source(platform=Platform.TELEGRAM, chat_id="123", user_id="u1"):
    """Return a SessionSource with test-friendly default identifiers."""
    return SessionSource(platform=platform, chat_id=chat_id, user_id=user_id)


def _make_store(tmp_path, policy=None):
    """Return a SessionStore persisting under *tmp_path*.

    When *policy* is truthy it replaces the config's default reset policy.
    """
    config = GatewayConfig()
    if policy:
        config.default_reset_policy = policy
    return SessionStore(sessions_dir=tmp_path, config=config)


def _make_runner():
    """Build a bare GatewayRunner on which ``stop()`` can be called directly.

    ``object.__new__`` skips ``GatewayRunner.__init__``, so every attribute is
    assigned by hand here.  The set below is presumably the state ``stop()``
    touches -- keep it in sync with GatewayRunner if that method changes.
    """
    # Imported lazily so callers can monkeypatch ``gateway.run`` first.
    from gateway.run import GatewayRunner

    runner = object.__new__(GatewayRunner)
    runner._restart_requested = False
    runner._restart_detached = False
    runner._restart_via_service = False
    runner._restart_task_started = False
    runner._running = True
    runner._draining = False
    runner._stop_task = None
    runner._running_agents = {}
    runner._pending_messages = {}
    runner._pending_approvals = {}
    runner._background_tasks = set()
    runner._shutdown_event = MagicMock()
    runner._restart_drain_timeout = 5
    runner._exit_code = None
    runner._exit_reason = None
    runner.adapters = {}
    runner.config = GatewayConfig()
    return runner


def _run_stop(runner, **stop_kwargs):
    """Run ``runner.stop(**stop_kwargs)`` to completion with heavy deps mocked.

    The patches are entered in a fixed order through an ExitStack and are all
    undone when this function returns.
    """
    with ExitStack() as stack:
        stack.enter_context(
            patch(
                "gateway.run.GatewayRunner._drain_active_agents",
                new_callable=AsyncMock,
                return_value=([], False),
            )
        )
        stack.enter_context(patch("gateway.run.GatewayRunner._finalize_shutdown_agents"))
        stack.enter_context(patch("gateway.run.GatewayRunner._update_runtime_status"))
        stack.enter_context(patch("gateway.status.remove_pid_file"))
        mock_proc_reg = stack.enter_context(
            patch("tools.process_registry.process_registry")
        )
        stack.enter_context(patch("tools.terminal_tool.cleanup_all_environments"))
        stack.enter_context(patch("tools.browser_tool.cleanup_all_browsers"))
        mock_proc_reg.kill_all = MagicMock()

        # Use a private loop instead of asyncio.get_event_loop(): calling the
        # latter without a running loop is deprecated, and a private loop
        # leaves the process-wide current loop untouched for other tests.
        loop = asyncio.new_event_loop()
        try:
            loop.run_until_complete(runner.stop(**stop_kwargs))
        finally:
            loop.close()


def _simulate_startup(store, marker):
    """Apply the marker check these tests attribute to the gateway's start().

    If *marker* exists it is consumed and suspension is skipped; otherwise
    ``store.suspend_recently_active()`` is called.
    """
    if marker.exists():
        marker.unlink()
        return
    store.suspend_recently_active()


def _loaded_entries(store):
    """Return a snapshot list of the store's entries, read under its lock."""
    with store._lock:
        store._ensure_loaded_locked()
        return list(store._entries.values())


# ---------------------------------------------------------------------------
# SessionStore.suspend_recently_active
# ---------------------------------------------------------------------------

class TestSuspendRecentlyActive:
    """Verify suspend_recently_active only marks recent sessions."""

    def test_suspends_recently_active_sessions(self, tmp_path):
        """A freshly created session is counted and flagged resume_pending."""
        store = _make_store(tmp_path)
        source = _make_source()
        entry = store.get_or_create_session(source)
        assert not entry.suspended

        count = store.suspend_recently_active()
        assert count == 1

        # Re-fetch — should be resume_pending (preserved, not wiped)
        refreshed = store.get_or_create_session(source)
        assert refreshed.resume_pending
        assert refreshed.session_id == entry.session_id  # same session preserved

    def test_does_not_suspend_old_sessions(self, tmp_path):
        """A session last updated before the cutoff is left alone."""
        store = _make_store(tmp_path)
        source = _make_source()
        entry = store.get_or_create_session(source)

        # Backdate the session's updated_at beyond the cutoff
        with store._lock:
            entry.updated_at = datetime.now() - timedelta(seconds=300)
            store._save()

        count = store.suspend_recently_active(max_age_seconds=120)
        assert count == 0

    def test_already_resume_pending_not_double_counted(self, tmp_path):
        """A second call does not count an entry that is already pending."""
        store = _make_store(tmp_path)
        source = _make_source()
        entry = store.get_or_create_session(source)

        # Mark resume_pending once
        count1 = store.suspend_recently_active()
        assert count1 == 1

        # Re-fetch returns the SAME session (preserved, not reset)
        entry2 = store.get_or_create_session(source)
        assert entry2.session_id == entry.session_id

        # Second call skips already-resume_pending entries
        count2 = store.suspend_recently_active()
        assert count2 == 0


# ---------------------------------------------------------------------------
# Clean shutdown marker integration
# ---------------------------------------------------------------------------

class TestCleanShutdownMarker:
    """Test that the marker file controls session suspension on startup."""

    def test_marker_written_on_graceful_stop(self, tmp_path, monkeypatch):
        """stop() should write .clean_shutdown marker."""
        monkeypatch.setattr("gateway.run._hermes_home", tmp_path)
        marker = tmp_path / _CLEAN_SHUTDOWN_MARKER
        assert not marker.exists()

        # Create a minimal runner and call the shutdown logic directly
        _run_stop(_make_runner())

        assert marker.exists(), ".clean_shutdown marker should exist after graceful stop"

    def test_marker_skips_suspension_on_startup(self, tmp_path, monkeypatch):
        """If .clean_shutdown exists, suspend_recently_active should NOT be called."""
        monkeypatch.setattr("gateway.run._hermes_home", tmp_path)

        # Create the marker
        marker = tmp_path / _CLEAN_SHUTDOWN_MARKER
        marker.touch()

        # Create a store with a recently active session
        store = _make_store(tmp_path)
        source = _make_source()
        entry = store.get_or_create_session(source)
        assert not entry.suspended

        # Marker present: it must be consumed and suspension skipped.
        _simulate_startup(store, marker)

        # Session should NOT be suspended.  resume_pending is checked as well
        # because that is the flag suspend_recently_active() is observed to
        # set in the other tests; checking only `suspended` would pass even
        # if suspension had fired.
        for e in _loaded_entries(store):
            assert not e.suspended, "Session should NOT be suspended after clean shutdown"
            assert not e.resume_pending, (
                "Session should NOT be resume_pending after clean shutdown"
            )

        assert not marker.exists(), "Marker should be cleaned up"

    def test_no_marker_triggers_suspension(self, tmp_path, monkeypatch):
        """Without .clean_shutdown marker (crash), suspension should fire."""
        monkeypatch.setattr("gateway.run._hermes_home", tmp_path)

        marker = tmp_path / _CLEAN_SHUTDOWN_MARKER
        assert not marker.exists()

        # Create a store with a recently active session
        store = _make_store(tmp_path)
        source = _make_source()
        entry = store.get_or_create_session(source)
        assert not entry.suspended

        # No marker: the crash path must call suspend_recently_active().
        _simulate_startup(store, marker)

        # Session SHOULD be resume_pending (crash recovery preserves history)
        resume_count = sum(1 for e in _loaded_entries(store) if e.resume_pending)
        assert resume_count == 1, "Session should be resume_pending after crash (no marker)"

    def test_marker_written_on_restart_stop(self, tmp_path, monkeypatch):
        """stop(restart=True) should also write the marker."""
        monkeypatch.setattr("gateway.run._hermes_home", tmp_path)
        marker = tmp_path / _CLEAN_SHUTDOWN_MARKER

        _run_stop(_make_runner(), restart=True)

        assert marker.exists(), ".clean_shutdown marker should exist after restart-stop too"