/ tests / run_agent / test_agent_loop_tool_calling.py
test_agent_loop_tool_calling.py
  1  """Integration tests for HermesAgentLoop tool calling.
  2  
  3  Tests the full agent loop with real LLM calls via OpenRouter.
  4  Uses stepfun/step-3.5-flash:free by default (zero cost), falls back
  5  to anthropic/claude-sonnet-4 if the free model is unavailable.
  6  
  7  These tests verify:
  8  1. Single tool call: model calls a tool, gets result, responds
  9  2. Multi-tool call: model calls multiple tools in one turn
 10  3. Multi-turn: model calls tools across multiple turns
 11  4. Unknown tool rejection: model calling a non-existent tool gets an error
 12  5. Max turns: loop stops when max_turns is reached
 13  6. No tools: model responds without calling any tools
 14  7. Tool error handling: tool execution errors are captured
 15  
 16  Run:
 17      pytest tests/run_agent/test_agent_loop_tool_calling.py -v
 18      pytest tests/run_agent/test_agent_loop_tool_calling.py -v -k "single"  # run one test
 19  """
 20  
import asyncio
import json
import os
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional, Set
from unittest.mock import patch
 28  
 29  import pytest
 30  
# pytestmark removed — tests skip gracefully via the OPENROUTER_API_KEY check in _get_api_key()
 32  
# Ensure repo root is importable regardless of the directory pytest runs from.
# The file lives at <repo>/tests/run_agent/, so three .parent hops reach the root.
_repo_root = Path(__file__).resolve().parent.parent.parent
if str(_repo_root) not in sys.path:
    sys.path.insert(0, str(_repo_root))

try:
    from environments.agent_loop import AgentResult, HermesAgentLoop
    from atroposlib.envs.server_handling.openai_server import OpenAIServer  # noqa: F401
except ImportError:
    # Skip the whole module when the project packages are not installed.
    pytest.skip("atroposlib not installed", allow_module_level=True)
 43  
 44  
 45  # =========================================================================
 46  # Test infrastructure
 47  # =========================================================================
 48  
# Models to try, in order of preference (free first).
# _try_models walks this list top to bottom on rate-limit errors.
_MODELS = [
    "stepfun/step-3.5-flash:free",  # ":free" tier — zero cost
    "google/gemini-2.0-flash-001",  # first fallback
    "anthropic/claude-sonnet-4",  # last resort
]
 55  
 56  def _get_api_key():
 57      key = os.getenv("OPENROUTER_API_KEY", "")
 58      if not key:
 59          pytest.skip("OPENROUTER_API_KEY not set")
 60      return key
 61  
 62  
 63  def _make_server(model: str = None):
 64      """Create an OpenAI server for testing."""
 65      from atroposlib.envs.server_handling.openai_server import OpenAIServer
 66      from atroposlib.envs.server_handling.server_manager import APIServerConfig
 67  
 68      config = APIServerConfig(
 69          base_url="https://openrouter.ai/api/v1",
 70          model_name=model or _MODELS[0],
 71          server_type="openai",
 72          api_key=_get_api_key(),
 73          health_check=False,
 74      )
 75      return OpenAIServer(config)
 76  
 77  
 78  async def _try_models(test_fn):
 79      """Try running a test with each model until one works."""
 80      last_error = None
 81      for model in _MODELS:
 82          try:
 83              server = _make_server(model)
 84              return await test_fn(server, model)
 85          except Exception as e:
 86              last_error = e
 87              if "rate" in str(e).lower() or "limit" in str(e).lower():
 88                  continue  # Rate limited, try next model
 89              raise  # Real error
 90      pytest.skip(f"All models failed. Last error: {last_error}")
 91  
 92  
 93  # =========================================================================
 94  # Fake tools for testing
 95  # =========================================================================
 96  
# OpenAI function-calling tool schemas used as fixtures by the tests below.
# Each matches the {"type": "function", "function": {...}} wire format.

# Simple calculator tool
CALC_TOOL = {
    "type": "function",
    "function": {
        "name": "calculate",
        "description": "Calculate a math expression. Returns the numeric result.",
        "parameters": {
            "type": "object",
            "properties": {
                "expression": {
                    "type": "string",
                    "description": "Math expression to evaluate, e.g. '2 + 3'"
                }
            },
            "required": ["expression"],
        },
    },
}

# Weather lookup tool (the fake handler returns canned data for any city)
WEATHER_TOOL = {
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get the current weather for a city. Returns temperature and conditions.",
        "parameters": {
            "type": "object",
            "properties": {
                "city": {
                    "type": "string",
                    "description": "City name, e.g. 'Tokyo'"
                }
            },
            "required": ["city"],
        },
    },
}

# Lookup tool (always succeeds)
LOOKUP_TOOL = {
    "type": "function",
    "function": {
        "name": "lookup",
        "description": "Look up a fact. Returns a short answer string.",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": "What to look up"
                }
            },
            "required": ["query"],
        },
    },
}

# Error tool (the fake handler always raises for this one)
ERROR_TOOL = {
    "type": "function",
    "function": {
        "name": "failing_tool",
        "description": "A tool that always fails with an error.",
        "parameters": {
            "type": "object",
            "properties": {
                "input": {"type": "string"}
            },
            "required": ["input"],
        },
    },
}
169  
170  
171  def _fake_tool_handler(tool_name: str, args: Dict[str, Any], **kwargs) -> str:
172      """Handle fake tool calls for testing."""
173      if tool_name == "calculate":
174          expr = args.get("expression", "0")
175          try:
176              # Safe eval for simple math
177              result = eval(expr, {"__builtins__": {}}, {})
178              return json.dumps({"result": result})
179          except Exception as e:
180              return json.dumps({"error": str(e)})
181  
182      elif tool_name == "get_weather":
183          city = args.get("city", "Unknown")
184          # Return canned weather
185          return json.dumps({
186              "city": city,
187              "temperature": 22,
188              "conditions": "sunny",
189              "humidity": 45,
190          })
191  
192      elif tool_name == "lookup":
193          query = args.get("query", "")
194          return json.dumps({"answer": f"The answer to '{query}' is 42."})
195  
196      elif tool_name == "failing_tool":
197          raise RuntimeError("This tool always fails!")
198  
199      return json.dumps({"error": f"Unknown tool: {tool_name}"})
200  
201  
202  # =========================================================================
203  # Tests
204  # =========================================================================
205  
@pytest.mark.asyncio
async def test_single_tool_call():
    """Model should call a single tool, get the result, and respond."""

    async def _body(server, model):
        agent = HermesAgentLoop(
            server=server,
            tool_schemas=[WEATHER_TOOL],
            valid_tool_names={"get_weather"},
            max_turns=5,
            temperature=0.0,
            max_tokens=500,
        )

        messages = [
            {"role": "user", "content": "What's the weather in Tokyo? Use the get_weather tool."},
        ]

        with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
            result = await agent.run(messages)

        assert isinstance(result, AgentResult)
        assert result.turns_used >= 2, f"Expected at least 2 turns (tool call + response), got {result.turns_used}"

        # The assistant must have issued at least one get_weather call with a city arg.
        saw_weather_call = False
        for msg in result.messages:
            if msg.get("role") != "assistant" or not msg.get("tool_calls"):
                continue
            for call in msg["tool_calls"]:
                if call["function"]["name"] != "get_weather":
                    continue
                saw_weather_call = True
                parsed_args = json.loads(call["function"]["arguments"])
                assert "city" in parsed_args
        assert saw_weather_call, "Model should have called get_weather"

        # The tool's output must appear in the transcript as a tool-role message.
        tool_messages = [m for m in result.messages if m.get("role") == "tool"]
        assert len(tool_messages) >= 1, "Should have at least one tool result"

        # The conversation must end with a non-empty assistant reply.
        final_msg = result.messages[-1]
        assert final_msg["role"] == "assistant"
        assert final_msg["content"], "Final response should have content"

        return result

    await _try_models(_body)
253  
254  
@pytest.mark.asyncio
async def test_multi_tool_single_turn():
    """Model should call multiple tools in a single turn."""

    async def _body(server, model):
        agent = HermesAgentLoop(
            server=server,
            tool_schemas=[WEATHER_TOOL, CALC_TOOL],
            valid_tool_names={"get_weather", "calculate"},
            max_turns=5,
            temperature=0.0,
            max_tokens=500,
        )

        prompt = (
            "I need two things at once: "
            "1) What's the weather in Paris? Use get_weather. "
            "2) What is 15 * 7? Use calculate. "
            "Call BOTH tools in a single response."
        )
        messages = [{"role": "user", "content": prompt}]

        with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
            result = await agent.run(messages)

        # Gather the distinct tool names the assistant invoked anywhere in the run.
        tools_called = {
            call["function"]["name"]
            for msg in result.messages
            if msg.get("role") == "assistant" and msg.get("tool_calls")
            for call in msg["tool_calls"]
        }

        # At minimum, both tools should have been called (maybe in different turns)
        assert "get_weather" in tools_called, f"get_weather not called. Called: {tools_called}"
        assert "calculate" in tools_called, f"calculate not called. Called: {tools_called}"

        return result

    await _try_models(_body)
295  
296  
@pytest.mark.asyncio
async def test_multi_turn_conversation():
    """Agent should handle multiple turns of tool calls."""

    async def _body(server, model):
        agent = HermesAgentLoop(
            server=server,
            tool_schemas=[LOOKUP_TOOL, CALC_TOOL],
            valid_tool_names={"lookup", "calculate"},
            max_turns=10,
            temperature=0.0,
            max_tokens=500,
        )

        prompt = (
            "First, use the lookup tool to look up 'meaning of life'. "
            "Then use calculate to compute 6 * 7. "
            "Do these in separate tool calls, one at a time."
        )
        messages = [{"role": "user", "content": prompt}]

        with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
            result = await agent.run(messages)

        # Both tools should appear among the assistant's tool calls.
        tools_called = {
            call["function"]["name"]
            for msg in result.messages
            if msg.get("role") == "assistant" and msg.get("tool_calls")
            for call in msg["tool_calls"]
        }

        assert "lookup" in tools_called, f"lookup not called. Called: {tools_called}"
        assert "calculate" in tools_called, f"calculate not called. Called: {tools_called}"

        # Should finish naturally
        assert result.finished_naturally, "Should finish naturally after answering"

        return result

    await _try_models(_body)
338  
339  
@pytest.mark.asyncio
async def test_unknown_tool_rejected():
    """If the model calls a tool not in valid_tool_names, it gets an error."""

    async def _body(server, model):
        # Advertise both schemas but whitelist only "calculate"; get_weather is
        # NOT allowed, so any attempt to call it must be rejected by the loop.
        agent = HermesAgentLoop(
            server=server,
            tool_schemas=[CALC_TOOL, WEATHER_TOOL],
            valid_tool_names={"calculate"},
            max_turns=5,
            temperature=0.0,
            max_tokens=500,
        )

        messages = [
            {"role": "user", "content": "What's the weather in London? Use get_weather."},
        ]

        with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
            result = await agent.run(messages)

        # If the model did attempt get_weather, the loop must have recorded a rejection.
        # (Best-effort: a model that never tries the tool leaves nothing to check.)
        if result.tool_errors:
            weather_errors = [e for e in result.tool_errors if e.tool_name == "get_weather"]
            assert len(weather_errors) > 0, "get_weather should have been rejected"
            assert "Unknown tool" in weather_errors[0].error

        return result

    await _try_models(_body)
371  
372  
@pytest.mark.asyncio
async def test_max_turns_limit():
    """Agent should stop after max_turns even if model keeps calling tools."""

    async def _body(server, model):
        # Deliberately tiny turn budget: the prompt asks for four lookups,
        # so the loop must cut the run short rather than finish naturally.
        agent = HermesAgentLoop(
            server=server,
            tool_schemas=[LOOKUP_TOOL],
            valid_tool_names={"lookup"},
            max_turns=2,
            temperature=0.0,
            max_tokens=500,
        )

        prompt = (
            "Keep looking up facts. Look up 'fact 1', then 'fact 2', "
            "then 'fact 3', then 'fact 4'. Do them one at a time."
        )
        messages = [{"role": "user", "content": prompt}]

        with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
            result = await agent.run(messages)

        assert result.turns_used <= 2, f"Should stop at max_turns=2, used {result.turns_used}"
        assert not result.finished_naturally, "Should NOT finish naturally (hit max_turns)"

        return result

    await _try_models(_body)
403  
404  
@pytest.mark.asyncio
async def test_no_tools_direct_response():
    """When no tools are useful, model should respond directly."""

    async def _body(server, model):
        # A weather tool is available but irrelevant to the question asked.
        agent = HermesAgentLoop(
            server=server,
            tool_schemas=[WEATHER_TOOL],
            valid_tool_names={"get_weather"},
            max_turns=5,
            temperature=0.0,
            max_tokens=200,
        )

        messages = [
            {"role": "user", "content": "What is 2 + 2? Just answer directly, no tools needed."},
        ]

        with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
            result = await agent.run(messages)

        assert result.finished_naturally, "Should finish naturally with a direct response"
        assert result.turns_used == 1, f"Should take exactly 1 turn for a direct answer, took {result.turns_used}"

        # The last message carries the model's direct textual answer.
        final = result.messages[-1]
        assert final["role"] == "assistant"
        assert final["content"], "Should have text content"
        assert "4" in final["content"], "Should contain the answer '4'"

        return result

    await _try_models(_body)
437  
438  
@pytest.mark.asyncio
async def test_tool_error_handling():
    """Tool execution errors should be captured and reported to the model."""

    async def _body(server, model):
        agent = HermesAgentLoop(
            server=server,
            tool_schemas=[ERROR_TOOL],
            valid_tool_names={"failing_tool"},
            max_turns=5,
            temperature=0.0,
            max_tokens=500,
        )

        messages = [
            {"role": "user", "content": "Please call the failing_tool with input 'test'."},
        ]

        with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
            result = await agent.run(messages)

        # The loop must record the failure rather than crash.
        assert len(result.tool_errors) >= 1, "Should have at least one tool error"
        first_error = result.tool_errors[0].error
        assert "RuntimeError" in first_error or "always fails" in first_error

        # The error must also be surfaced to the model as a tool-role message.
        tool_messages = [m for m in result.messages if m.get("role") == "tool"]
        assert len(tool_messages) >= 1
        payload = json.loads(tool_messages[0]["content"])
        assert "error" in payload

        return result

    await _try_models(_body)
473  
474  
@pytest.mark.asyncio
async def test_agent_result_structure():
    """Verify the AgentResult has all expected fields populated."""

    async def _body(server, model):
        agent = HermesAgentLoop(
            server=server,
            tool_schemas=[CALC_TOOL],
            valid_tool_names={"calculate"},
            max_turns=5,
            temperature=0.0,
            max_tokens=300,
        )

        messages = [
            {"role": "user", "content": "What is 3 + 4? Use the calculate tool."},
        ]

        with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
            result = await agent.run(messages)

        # Field-by-field structural checks on the result object.
        assert isinstance(result, AgentResult)
        assert isinstance(result.messages, list)
        assert len(result.messages) >= 3, "Should have user + assistant(tool) + tool_result + assistant(final)"
        assert isinstance(result.turns_used, int)
        assert result.turns_used > 0
        assert isinstance(result.finished_naturally, bool)
        assert isinstance(result.tool_errors, list)
        assert isinstance(result.reasoning_per_turn, list)

        # Every message must be well-formed OpenAI chat format.
        allowed_roles = ("system", "user", "assistant", "tool")
        for msg in result.messages:
            assert "role" in msg, f"Message missing 'role': {msg}"
            assert msg["role"] in allowed_roles, f"Invalid role: {msg['role']}"

        return result

    await _try_models(_body)
514  
515  
@pytest.mark.asyncio
async def test_conversation_history_preserved():
    """The full conversation history should be in result.messages."""

    async def _body(server, model):
        agent = HermesAgentLoop(
            server=server,
            tool_schemas=[WEATHER_TOOL],
            valid_tool_names={"get_weather"},
            max_turns=5,
            temperature=0.0,
            max_tokens=500,
        )

        messages = [
            {"role": "system", "content": "You are a helpful weather assistant."},
            {"role": "user", "content": "What's the weather in Berlin? Use get_weather."},
        ]

        with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
            result = await agent.run(messages)

        # The seed messages must survive untouched at the head of the transcript.
        system_msg, user_msg = result.messages[0], result.messages[1]
        assert system_msg["role"] == "system"
        assert "weather assistant" in system_msg["content"]
        assert user_msg["role"] == "user"
        assert "Berlin" in user_msg["content"]

        # Tool output must appear somewhere in the transcript.
        roles = [m["role"] for m in result.messages]
        assert "tool" in roles, "Should have tool results in conversation"

        return result

    await _try_models(_body)