# test_1630_context_overflow_loop.py
"""Tests for #1630 — gateway infinite 400 failure loop prevention.

Verifies that:
1. Generic 400 errors with large sessions are treated as context-length errors
   and trigger compression instead of aborting.
2. The gateway does not persist messages when the agent fails early, preventing
   the session from growing on each failure.
3. Context-overflow failures produce helpful error messages suggesting /compact.

NOTE(review): every test below re-states the heuristic it checks inside this
module (via small private helpers on each test class); none of them calls the
production implementation. Confirm the thresholds and phrase lists here stay in
sync with the agent / gateway code they are meant to describe.
"""

import pytest
from types import SimpleNamespace
from unittest.mock import MagicMock, patch


# ---------------------------------------------------------------------------
# Test 1: Agent heuristic — generic 400 with large session → compression
# ---------------------------------------------------------------------------


class TestGeneric400Heuristic:
    """The agent should treat a generic 400 with a large session as a
    probable context-length error and trigger compression, not abort."""

    # Substrings that mark an error message as an explicit context-length
    # error. Matching is a plain, case-sensitive ``in`` check.
    _CONTEXT_PHRASES = (
        'context length', 'context size', 'maximum context',
        'token limit', 'too many tokens', 'reduce the length',
        'exceeds the limit', 'context window',
        'request entity too large',
        'prompt is too long',
    )

    # Context window size assumed by every scenario in this class.
    _CTX_LEN = 200000

    @classmethod
    def _has_context_phrase(cls, error_msg):
        """Return True if *error_msg* contains any known context-length phrase."""
        return any(phrase in error_msg for phrase in cls._CONTEXT_PHRASES)

    @classmethod
    def _is_large_session(cls, approx_tokens, message_count):
        """Return True if the session counts as "large".

        A session is large when the token estimate exceeds 40% of the context
        window, or when it holds more than 80 messages.
        """
        return approx_tokens > cls._CTX_LEN * 0.4 or message_count > 80

    @staticmethod
    def _is_generic_error(error_msg):
        """Return True if the stripped message is shorter than 30 characters."""
        return len(error_msg.strip()) < 30

    def _make_agent(self):
        """Create a minimal AIAgent for testing error handling.

        The tool-definition lookup, toolset requirement check and OpenAI
        client are patched while the agent is constructed; the client is then
        replaced with a ``MagicMock``.

        NOTE(review): no test visible in this module calls this helper.
        """
        with (
            patch("run_agent.get_tool_definitions", return_value=[]),
            patch("run_agent.check_toolset_requirements", return_value={}),
            patch("run_agent.OpenAI"),
        ):
            from run_agent import AIAgent
            agent = AIAgent(
                api_key="test-key-12345",
                base_url="https://openrouter.ai/api/v1",
                quiet_mode=True,
                skip_context_files=True,
                skip_memory=True,
            )
        agent.client = MagicMock()
        agent._cached_system_prompt = "You are helpful."
        agent._use_prompt_caching = False
        agent.tool_delay = 0
        agent.compression_enabled = False
        return agent

    def test_generic_400_with_small_session_is_client_error(self):
        """A generic 400 with a small session should still be treated
        as a non-retryable client error (not context overflow)."""
        error_msg = "error"
        approx_tokens = 1000  # Small session
        api_messages = [{"role": "user", "content": "hi"}]

        # No explicit context-length phrase in the message.
        assert not self._has_context_phrase(error_msg)

        # The message is generic, so only the session size keeps the
        # heuristic from firing.
        assert self._is_generic_error(error_msg)
        assert not self._is_large_session(approx_tokens, len(api_messages))

    def test_generic_400_with_large_token_count_triggers_heuristic(self):
        """A generic 400 with high token count should be treated as
        probable context overflow."""
        error_msg = "error"
        approx_tokens = 100000  # > 40% of 200k
        api_messages = [{"role": "user", "content": "hi"}] * 20

        assert not self._has_context_phrase(error_msg)

        # Both conditions true → should be treated as context overflow
        assert self._is_large_session(approx_tokens, len(api_messages))
        assert self._is_generic_error(error_msg)

    def test_generic_400_with_many_messages_triggers_heuristic(self):
        """A generic 400 with >80 messages should trigger the heuristic
        even if estimated tokens are low."""
        error_msg = "error"
        approx_tokens = 5000  # Low token estimate
        api_messages = [{"role": "user", "content": "x"}] * 100  # > 80 messages

        assert self._is_large_session(approx_tokens, len(api_messages))
        assert self._is_generic_error(error_msg)

    def test_specific_error_message_bypasses_heuristic(self):
        """A 400 with a specific, long error message should NOT trigger
        the heuristic even with a large session."""
        error_msg = "invalid model: anthropic/claude-nonexistent-model is not available"

        # Long specific message → heuristic doesn't fire
        assert not self._is_generic_error(error_msg)

    def test_descriptive_context_error_caught_by_phrases(self):
        """Descriptive context-length errors should still be caught by
        the existing phrase matching (not the heuristic)."""
        error_msg = "prompt is too long: 250000 tokens > 200000 maximum"
        assert self._has_context_phrase(error_msg)


# ---------------------------------------------------------------------------
# Test 2: Gateway skips persistence on failed agent results
# ---------------------------------------------------------------------------

class TestGatewaySkipsPersistenceOnFailure:
    """When the agent returns failed=True, with or without a
    final_response, the gateway should NOT persist messages to the
    transcript."""

    @staticmethod
    def _agent_failed_early(agent_result):
        """Return True when the result dict carries a truthy ``failed`` key."""
        return bool(agent_result.get("failed"))

    def test_agent_failed_early_detected(self):
        """The agent_failed_early flag is True when failed=True,
        regardless of final_response."""
        agent_result = {
            "failed": True,
            "final_response": None,
            "messages": [],
            "error": "Non-retryable client error",
        }
        assert self._agent_failed_early(agent_result)

    def test_agent_failed_with_error_response_still_detected(self):
        """When _run_agent_blocking converts an error to final_response,
        the failed flag should still trigger agent_failed_early. This
        was the core bug in #9893 — the old guard checked
        ``not final_response``, which was always falsy after conversion
        because final_response then holds a non-empty string."""
        agent_result = {
            "failed": True,
            "final_response": "⚠️ Request payload too large: max compression attempts reached.",
            "messages": [],
        }
        assert self._agent_failed_early(agent_result)

    def test_successful_agent_not_failed_early(self):
        """A successful agent result should not trigger skip."""
        agent_result = {
            "final_response": "Hello!",
            "messages": [{"role": "assistant", "content": "Hello!"}],
        }
        assert not self._agent_failed_early(agent_result)


class TestCompressionExhaustedFlag:
    """When compression is exhausted, the agent should set both
    failed=True and compression_exhausted=True so the gateway can
    auto-reset the session. (#9893)"""

    def test_compression_exhausted_returns_carry_flag(self):
        """Simulate the return dict from a compression-exhausted agent."""
        agent_result = {
            "messages": [],
            "completed": False,
            "api_calls": 3,
            "error": "Request payload too large: max compression attempts (3) reached.",
            "partial": True,
            "failed": True,
            "compression_exhausted": True,
        }
        assert agent_result.get("failed")
        assert agent_result.get("compression_exhausted")

    def test_normal_failure_not_compression_exhausted(self):
        """Non-compression failures should not have compression_exhausted."""
        agent_result = {
            "messages": [],
            "completed": False,
            "failed": True,
            "error": "Invalid API response after 3 retries",
        }
        assert agent_result.get("failed")
        assert not agent_result.get("compression_exhausted")


# ---------------------------------------------------------------------------
# Test 3: Context-overflow error messages
# ---------------------------------------------------------------------------

class TestContextOverflowErrorMessages:
    """The gateway should produce helpful error messages when the failure
    looks like a context overflow."""

    # Case-insensitive substrings that flag an error as context-related.
    _CTX_KEYWORDS = (
        "context", "token", "too large", "too long",
        "exceed", "payload",
    )

    @classmethod
    def _is_ctx_fail(cls, error_str, history_len=0):
        """Return True if the error looks like a context overflow.

        Either the lower-cased error text contains one of the keywords, or
        it mentions "400" and the history holds more than 50 entries.
        """
        lowered = error_str.lower()
        if any(keyword in lowered for keyword in cls._CTX_KEYWORDS):
            return True
        return "400" in lowered and history_len > 50

    def test_detects_context_keywords(self):
        """Error messages containing context-related keywords should be
        identified as context failures."""
        keywords = [
            "context length exceeded",
            "too many tokens in the prompt",
            "request entity too large",
            "payload too large for model",
            "context window exceeded",
        ]
        for error_str in keywords:
            assert self._is_ctx_fail(error_str), f"Should detect: {error_str}"

    def test_detects_generic_400_with_large_history(self):
        """A generic 400 error code in the string with a large history
        should be flagged as context failure."""
        error_str = "error code: 400 - {'type': 'error', 'message': 'Error'}"
        history_len = 100  # Large session

        assert self._is_ctx_fail(error_str, history_len)

    def test_unrelated_error_not_flagged(self):
        """Unrelated errors should not be flagged as context failures."""
        error_str = "invalid api key: authentication failed"
        history_len = 10

        assert not self._is_ctx_fail(error_str, history_len)


# ---------------------------------------------------------------------------
# Test 4: Agent skips persistence for large failed sessions
# ---------------------------------------------------------------------------

class TestAgentSkipsPersistenceForLargeFailedSessions:
    """When a 400 error occurs and the session is large, the agent
    should skip persisting to prevent the growth loop."""

    @staticmethod
    def _should_skip(status_code, approx_tokens, message_count):
        """Return True when persistence should be skipped.

        That is the case only for status 400 combined with either more than
        50000 estimated tokens or more than 80 messages.
        """
        return status_code == 400 and (approx_tokens > 50000 or message_count > 80)

    def test_large_session_400_skips_persistence(self):
        """Status 400 + high token count should skip persistence."""
        status_code = 400
        approx_tokens = 60000  # > 50000 threshold
        api_messages = [{"role": "user", "content": "x"}] * 10

        assert self._should_skip(status_code, approx_tokens, len(api_messages))

    def test_small_session_400_persists_normally(self):
        """Status 400 + small session should still persist."""
        status_code = 400
        approx_tokens = 5000  # < 50000
        api_messages = [{"role": "user", "content": "x"}] * 10  # < 80

        assert not self._should_skip(status_code, approx_tokens, len(api_messages))

    def test_non_400_error_persists_normally(self):
        """Non-400 errors should always persist normally."""
        status_code = 401  # Auth error
        approx_tokens = 100000  # Large session, but not a 400
        api_messages = [{"role": "user", "content": "x"}] * 100

        assert not self._should_skip(status_code, approx_tokens, len(api_messages))