Cradicle Explorer

/ tests / run_agent / test_1630_context_overflow_loop.py
test_1630_context_overflow_loop.py
  1  """Tests for #1630 — gateway infinite 400 failure loop prevention.
  2  
  3  Verifies that:
  4  1. Generic 400 errors with large sessions are treated as context-length errors
  5     and trigger compression instead of aborting.
  6  2. The gateway does not persist messages when the agent fails early, preventing
  7     the session from growing on each failure.
  8  3. Context-overflow failures produce helpful error messages suggesting /compact.
  9  """
 10  
 11  import pytest
 12  from types import SimpleNamespace
 13  from unittest.mock import MagicMock, patch
 14  
 15  
 16  # ---------------------------------------------------------------------------
 17  # Test 1: Agent heuristic — generic 400 with large session → compression
 18  # ---------------------------------------------------------------------------
 19  
 20  
 21  class TestGeneric400Heuristic:
 22      """The agent should treat a generic 400 with a large session as a
 23      probable context-length error and trigger compression, not abort."""
 24  
 25      def _make_agent(self):
 26          """Create a minimal AIAgent for testing error handling."""
 27          with (
 28              patch("run_agent.get_tool_definitions", return_value=[]),
 29              patch("run_agent.check_toolset_requirements", return_value={}),
 30              patch("run_agent.OpenAI"),
 31          ):
 32              from run_agent import AIAgent
 33              a = AIAgent(
 34                  api_key="test-key-12345",
 35                  base_url="https://openrouter.ai/api/v1",
 36                  quiet_mode=True,
 37                  skip_context_files=True,
 38                  skip_memory=True,
 39              )
 40              a.client = MagicMock()
 41              a._cached_system_prompt = "You are helpful."
 42              a._use_prompt_caching = False
 43              a.tool_delay = 0
 44              a.compression_enabled = False
 45              return a
 46  
 47      def test_generic_400_with_small_session_is_client_error(self):
 48          """A generic 400 with a small session should still be treated
 49          as a non-retryable client error (not context overflow)."""
 50          error_msg = "error"
 51          status_code = 400
 52          approx_tokens = 1000  # Small session
 53          api_messages = [{"role": "user", "content": "hi"}]
 54  
 55          # Simulate the phrase matching
 56          is_context_length_error = any(phrase in error_msg for phrase in [
 57              'context length', 'context size', 'maximum context',
 58              'token limit', 'too many tokens', 'reduce the length',
 59              'exceeds the limit', 'context window',
 60              'request entity too large',
 61              'prompt is too long',
 62          ])
 63          assert not is_context_length_error
 64  
 65          # The heuristic should NOT trigger for small sessions
 66          ctx_len = 200000
 67          is_large_session = approx_tokens > ctx_len * 0.4 or len(api_messages) > 80
 68          is_generic_error = len(error_msg.strip()) < 30
 69          assert not is_large_session  # Small session → heuristic doesn't fire
 70  
 71      def test_generic_400_with_large_token_count_triggers_heuristic(self):
 72          """A generic 400 with high token count should be treated as
 73          probable context overflow."""
 74          error_msg = "error"
 75          status_code = 400
 76          ctx_len = 200000
 77          approx_tokens = 100000  # > 40% of 200k
 78          api_messages = [{"role": "user", "content": "hi"}] * 20
 79  
 80          is_context_length_error = any(phrase in error_msg for phrase in [
 81              'context length', 'context size', 'maximum context',
 82          ])
 83          assert not is_context_length_error
 84  
 85          # Heuristic check
 86          is_large_session = approx_tokens > ctx_len * 0.4 or len(api_messages) > 80
 87          is_generic_error = len(error_msg.strip()) < 30
 88          assert is_large_session
 89          assert is_generic_error
 90          # Both conditions true → should be treated as context overflow
 91  
 92      def test_generic_400_with_many_messages_triggers_heuristic(self):
 93          """A generic 400 with >80 messages should trigger the heuristic
 94          even if estimated tokens are low."""
 95          error_msg = "error"
 96          status_code = 400
 97          ctx_len = 200000
 98          approx_tokens = 5000  # Low token estimate
 99          api_messages = [{"role": "user", "content": "x"}] * 100  # > 80 messages
100  
101          is_large_session = approx_tokens > ctx_len * 0.4 or len(api_messages) > 80
102          is_generic_error = len(error_msg.strip()) < 30
103          assert is_large_session
104          assert is_generic_error
105  
106      def test_specific_error_message_bypasses_heuristic(self):
107          """A 400 with a specific, long error message should NOT trigger
108          the heuristic even with a large session."""
109          error_msg = "invalid model: anthropic/claude-nonexistent-model is not available"
110          status_code = 400
111          ctx_len = 200000
112          approx_tokens = 100000
113  
114          is_generic_error = len(error_msg.strip()) < 30
115          assert not is_generic_error  # Long specific message → heuristic doesn't fire
116  
117      def test_descriptive_context_error_caught_by_phrases(self):
118          """Descriptive context-length errors should still be caught by
119          the existing phrase matching (not the heuristic)."""
120          error_msg = "prompt is too long: 250000 tokens > 200000 maximum"
121          is_context_length_error = any(phrase in error_msg for phrase in [
122              'context length', 'context size', 'maximum context',
123              'token limit', 'too many tokens', 'reduce the length',
124              'exceeds the limit', 'context window',
125              'request entity too large',
126              'prompt is too long',
127          ])
128          assert is_context_length_error
129  
130  
131  # ---------------------------------------------------------------------------
132  # Test 2: Gateway skips persistence on failed agent results
133  # ---------------------------------------------------------------------------
134  
class TestGatewaySkipsPersistenceOnFailure:
    """Checks how the "agent failed early" signal is derived from an
    agent result dict.

    The signal depends only on the ``failed`` key; whether a
    ``final_response`` is present does not matter.  Per the test intent,
    the gateway should NOT persist messages to the transcript when the
    signal is set.
    """

    @staticmethod
    def _failed_early(result):
        """Return the early-failure flag derived from *result*'s ``failed`` key."""
        return bool(result.get("failed"))

    def test_agent_failed_early_detected(self):
        """A result with failed=True and no final_response is flagged."""
        result = {
            "failed": True,
            "final_response": None,
            "messages": [],
            "error": "Non-retryable client error",
        }
        assert self._failed_early(result)

    def test_agent_failed_with_error_response_still_detected(self):
        """A failed result is still flagged after _run_agent_blocking has
        turned the error into a final_response.  This was the core bug in
        #9893: the old guard relied on ``not final_response``, which no
        longer identifies a failure once the error text has been copied
        into final_response."""
        result = {
            "failed": True,
            "final_response": "⚠️ Request payload too large: max compression attempts reached.",
            "messages": [],
        }
        assert self._failed_early(result)

    def test_successful_agent_not_failed_early(self):
        """A result without a ``failed`` key must not be flagged."""
        result = {
            "final_response": "Hello!",
            "messages": [{"role": "assistant", "content": "Hello!"}],
        }
        assert not self._failed_early(result)
173  
class TestCompressionExhaustedFlag:
    """Shape of the agent result when compression runs out.

    A compression-exhausted result carries both failed=True and
    compression_exhausted=True so the gateway can auto-reset the
    session; other failures carry only failed=True.  (#9893)
    """

    def test_compression_exhausted_returns_carry_flag(self):
        """A simulated compression-exhausted result sets both flags."""
        result = dict(
            messages=[],
            completed=False,
            api_calls=3,
            error="Request payload too large: max compression attempts (3) reached.",
            partial=True,
            failed=True,
            compression_exhausted=True,
        )
        for flag in ("failed", "compression_exhausted"):
            assert result.get(flag)

    def test_normal_failure_not_compression_exhausted(self):
        """A non-compression failure sets failed but not compression_exhausted."""
        result = dict(
            messages=[],
            completed=False,
            failed=True,
            error="Invalid API response after 3 retries",
        )
        assert result.get("failed")
        assert not result.get("compression_exhausted")
204  
205  # ---------------------------------------------------------------------------
206  # Test 3: Context-overflow error messages
207  # ---------------------------------------------------------------------------
208  
class TestContextOverflowErrorMessages:
    """The gateway should produce helpful error messages when the failure
    looks like a context overflow.

    The detection rule is restated once in ``_is_context_failure`` and
    used by every test; it presumably mirrors the gateway's own check --
    verify against the implementation before changing it.
    """

    # Lower-case substrings that mark an error as context-related.
    _CONTEXT_FAILURE_MARKERS = (
        "context", "token", "too large", "too long",
        "exceed", "payload",
    )
    # A bare "400" only counts when the history is longer than this.
    _LARGE_HISTORY_THRESHOLD = 50

    @classmethod
    def _is_context_failure(cls, error_str, history_len=0):
        """Return True if *error_str* looks like a context overflow.

        Either the lower-cased message contains one of the marker
        substrings, or it mentions "400" and *history_len* exceeds the
        large-history threshold.
        """
        lowered = error_str.lower()
        if any(marker in lowered for marker in cls._CONTEXT_FAILURE_MARKERS):
            return True
        return "400" in lowered and history_len > cls._LARGE_HISTORY_THRESHOLD

    def test_detects_context_keywords(self):
        """Error messages containing context-related keywords should be
        identified as context failures."""
        error_strings = [
            "context length exceeded",
            "too many tokens in the prompt",
            "request entity too large",
            "payload too large for model",
            "context window exceeded",
        ]
        for error_str in error_strings:
            # history_len stays 0 so only the keyword match can succeed.
            assert self._is_context_failure(error_str), f"Should detect: {error_str}"

    def test_detects_generic_400_with_large_history(self):
        """A generic 400 error code in the string with a large history
        should be flagged as context failure."""
        error_str = "error code: 400 - {'type': 'error', 'message': 'Error'}"
        history_len = 100  # Large session

        assert self._is_context_failure(error_str, history_len)

    def test_generic_400_with_small_history_not_flagged(self):
        """The same generic 400 with a short history should NOT be
        flagged: without a keyword match, history size decides."""
        error_str = "error code: 400 - {'type': 'error', 'message': 'Error'}"
        history_len = 10  # Small session

        assert not self._is_context_failure(error_str, history_len)

    def test_unrelated_error_not_flagged(self):
        """Unrelated errors should not be flagged as context failures."""
        error_str = "invalid api key: authentication failed"
        history_len = 10

        assert not self._is_context_failure(error_str, history_len)
259  
260  # ---------------------------------------------------------------------------
261  # Test 4: Agent skips persistence for large failed sessions
262  # ---------------------------------------------------------------------------
263  
class TestAgentSkipsPersistenceForLargeFailedSessions:
    """A 400 error on a large session should skip persistence so the
    session does not keep growing on each failure; small sessions and
    non-400 errors persist as usual."""

    @staticmethod
    def _skips_persistence(status_code, approx_tokens, api_messages):
        """Restate the skip rule: status 400 and a large session
        (more than 50000 estimated tokens or more than 80 messages)."""
        if status_code != 400:
            return False
        return approx_tokens > 50000 or len(api_messages) > 80

    def test_large_session_400_skips_persistence(self):
        """Status 400 with a high token estimate skips persistence."""
        messages = [{"role": "user", "content": "x"}] * 10
        # 60000 tokens is above the 50000 threshold.
        assert self._skips_persistence(400, 60000, messages)

    def test_small_session_400_persists_normally(self):
        """Status 400 with a small session still persists."""
        messages = [{"role": "user", "content": "x"}] * 10  # under 80 messages
        # 5000 tokens is below the 50000 threshold.
        assert not self._skips_persistence(400, 5000, messages)

    def test_non_400_error_persists_normally(self):
        """A non-400 status persists even when the session is large."""
        messages = [{"role": "user", "content": "x"}] * 100
        # 401 is an auth error; session size is irrelevant for it.
        assert not self._skips_persistence(401, 100000, messages)