# test_agent_loop_tool_calling.py
"""Integration tests for HermesAgentLoop tool calling.

Tests the full agent loop with real LLM calls via OpenRouter.
Uses stepfun/step-3.5-flash:free by default (zero cost), falls back
to anthropic/claude-sonnet-4 if the free model is unavailable.

These tests verify:
1. Single tool call: model calls a tool, gets result, responds
2. Multi-tool call: model calls multiple tools in one turn
3. Multi-turn: model calls tools across multiple turns
4. Unknown tool rejection: model calling a non-existent tool gets an error
5. Max turns: loop stops when max_turns is reached
6. No tools: model responds without calling any tools
7. Tool error handling: tool execution errors are captured

Run:
    pytest tests/test_agent_loop_tool_calling.py -v
    pytest tests/test_agent_loop_tool_calling.py -v -k "single"  # run one test
"""

import json
import os
import sys
from pathlib import Path
from typing import Any, Dict, Optional
from unittest.mock import patch

import pytest

# No module-level pytestmark: tests skip gracefully via the
# OPENROUTER_API_KEY check inside _get_api_key().

# Ensure repo root is importable.
# NOTE(review): three .parent hops assumes this file lives two directories
# below the repo root — confirm against the actual test-tree layout.
_repo_root = Path(__file__).resolve().parent.parent.parent
if str(_repo_root) not in sys.path:
    sys.path.insert(0, str(_repo_root))

try:
    from environments.agent_loop import AgentResult, HermesAgentLoop
    from atroposlib.envs.server_handling.openai_server import OpenAIServer  # noqa: F401
except ImportError:
    pytest.skip("atroposlib not installed", allow_module_level=True)


# =========================================================================
# Test infrastructure
# =========================================================================

# Models to try, in order of preference (free first)
_MODELS = [
    "stepfun/step-3.5-flash:free",
    "google/gemini-2.0-flash-001",
    "anthropic/claude-sonnet-4",
]


def _get_api_key() -> str:
    """Return the OpenRouter API key, skipping the calling test if unset."""
    key = os.getenv("OPENROUTER_API_KEY", "")
    if not key:
        pytest.skip("OPENROUTER_API_KEY not set")
    return key


def _make_server(model: Optional[str] = None):
    """Create an OpenAI-compatible server pointed at OpenRouter.

    Args:
        model: Model name to use; defaults to the first entry of _MODELS.

    Returns:
        A configured OpenAIServer instance (no health check performed).
    """
    from atroposlib.envs.server_handling.openai_server import OpenAIServer
    from atroposlib.envs.server_handling.server_manager import APIServerConfig

    config = APIServerConfig(
        base_url="https://openrouter.ai/api/v1",
        model_name=model or _MODELS[0],
        server_type="openai",
        api_key=_get_api_key(),
        health_check=False,
    )
    return OpenAIServer(config)


async def _try_models(test_fn):
    """Try running a test with each model until one works.

    Rate-limit failures fall through to the next model in _MODELS; any
    other exception is re-raised immediately. If every model fails, the
    calling test is skipped with the last error attached.
    """
    last_error = None
    for model in _MODELS:
        try:
            server = _make_server(model)
            return await test_fn(server, model)
        except Exception as e:
            last_error = e
            if "rate" in str(e).lower() or "limit" in str(e).lower():
                continue  # Rate limited, try next model
            raise  # Real error
    pytest.skip(f"All models failed. Last error: {last_error}")


# =========================================================================
# Fake tools for testing
# =========================================================================

# Simple calculator tool
CALC_TOOL = {
    "type": "function",
    "function": {
        "name": "calculate",
        "description": "Calculate a math expression. Returns the numeric result.",
        "parameters": {
            "type": "object",
            "properties": {
                "expression": {
                    "type": "string",
                    "description": "Math expression to evaluate, e.g. '2 + 3'"
                }
            },
            "required": ["expression"],
        },
    },
}

# Weather lookup tool
WEATHER_TOOL = {
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get the current weather for a city. Returns temperature and conditions.",
        "parameters": {
            "type": "object",
            "properties": {
                "city": {
                    "type": "string",
                    "description": "City name, e.g. 'Tokyo'"
                }
            },
            "required": ["city"],
        },
    },
}

# Lookup tool (always succeeds)
LOOKUP_TOOL = {
    "type": "function",
    "function": {
        "name": "lookup",
        "description": "Look up a fact. Returns a short answer string.",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": "What to look up"
                }
            },
            "required": ["query"],
        },
    },
}

# Error tool (always fails)
ERROR_TOOL = {
    "type": "function",
    "function": {
        "name": "failing_tool",
        "description": "A tool that always fails with an error.",
        "parameters": {
            "type": "object",
            "properties": {
                "input": {"type": "string"}
            },
            "required": ["input"],
        },
    },
}


def _fake_tool_handler(tool_name: str, args: Dict[str, Any], **kwargs) -> str:
    """Handle fake tool calls for testing.

    Returns a JSON string like a real tool dispatcher would; "failing_tool"
    raises RuntimeError so tests can exercise error capture.
    """
    if tool_name == "calculate":
        expr = args.get("expression", "0")
        try:
            # eval with emptied builtins — acceptable for model-generated
            # test expressions only; NOT a general sandbox, do not reuse.
            result = eval(expr, {"__builtins__": {}}, {})
            return json.dumps({"result": result})
        except Exception as e:
            return json.dumps({"error": str(e)})

    elif tool_name == "get_weather":
        city = args.get("city", "Unknown")
        # Return canned weather
        return json.dumps({
            "city": city,
            "temperature": 22,
            "conditions": "sunny",
            "humidity": 45,
        })

    elif tool_name == "lookup":
        query = args.get("query", "")
        return json.dumps({"answer": f"The answer to '{query}' is 42."})

    elif tool_name == "failing_tool":
        raise RuntimeError("This tool always fails!")

    return json.dumps({"error": f"Unknown tool: {tool_name}"})


# =========================================================================
# Tests
# =========================================================================
# =========================================================================


def _tool_names_called(messages):
    """Return the set of tool names the assistant invoked in *messages*."""
    return {
        call["function"]["name"]
        for msg in messages
        if msg.get("role") == "assistant" and msg.get("tool_calls")
        for call in msg["tool_calls"]
    }


@pytest.mark.asyncio
async def test_single_tool_call():
    """Model should call a single tool, get the result, and respond."""

    async def scenario(server, model):
        agent = HermesAgentLoop(
            server=server,
            tool_schemas=[WEATHER_TOOL],
            valid_tool_names={"get_weather"},
            max_turns=5,
            temperature=0.0,
            max_tokens=500,
        )

        convo = [
            {"role": "user", "content": "What's the weather in Tokyo? Use the get_weather tool."},
        ]

        with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
            result = await agent.run(convo)

        assert isinstance(result, AgentResult)
        assert result.turns_used >= 2, f"Expected at least 2 turns (tool call + response), got {result.turns_used}"

        # Every get_weather invocation must carry a "city" argument.
        weather_calls = [
            call
            for msg in result.messages
            if msg.get("role") == "assistant" and msg.get("tool_calls")
            for call in msg["tool_calls"]
            if call["function"]["name"] == "get_weather"
        ]
        for call in weather_calls:
            assert "city" in json.loads(call["function"]["arguments"])
        assert weather_calls, "Model should have called get_weather"

        # Verify tool result is in conversation.
        tool_msgs = [m for m in result.messages if m.get("role") == "tool"]
        assert len(tool_msgs) >= 1, "Should have at least one tool result"

        # Final assistant message should carry text content.
        final_msg = result.messages[-1]
        assert final_msg["role"] == "assistant"
        assert final_msg["content"], "Final response should have content"

        return result

    await _try_models(scenario)


@pytest.mark.asyncio
async def test_multi_tool_single_turn():
    """Model should call multiple tools in a single turn."""

    async def scenario(server, model):
        agent = HermesAgentLoop(
            server=server,
            tool_schemas=[WEATHER_TOOL, CALC_TOOL],
            valid_tool_names={"get_weather", "calculate"},
            max_turns=5,
            temperature=0.0,
            max_tokens=500,
        )

        convo = [
            {"role": "user", "content": (
                "I need two things at once: "
                "1) What's the weather in Paris? Use get_weather. "
                "2) What is 15 * 7? Use calculate. "
                "Call BOTH tools in a single response."
            )},
        ]

        with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
            result = await agent.run(convo)

        # At minimum, both tools should have been called (maybe in different turns).
        tools_called = _tool_names_called(result.messages)
        assert "get_weather" in tools_called, f"get_weather not called. Called: {tools_called}"
        assert "calculate" in tools_called, f"calculate not called. Called: {tools_called}"

        return result

    await _try_models(scenario)


@pytest.mark.asyncio
async def test_multi_turn_conversation():
    """Agent should handle multiple turns of tool calls."""

    async def scenario(server, model):
        agent = HermesAgentLoop(
            server=server,
            tool_schemas=[LOOKUP_TOOL, CALC_TOOL],
            valid_tool_names={"lookup", "calculate"},
            max_turns=10,
            temperature=0.0,
            max_tokens=500,
        )

        convo = [
            {"role": "user", "content": (
                "First, use the lookup tool to look up 'meaning of life'. "
                "Then use calculate to compute 6 * 7. "
                "Do these in separate tool calls, one at a time."
            )},
        ]

        with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
            result = await agent.run(convo)

        # Should have used both tools.
        tools_called = _tool_names_called(result.messages)
        assert "lookup" in tools_called, f"lookup not called. Called: {tools_called}"
        assert "calculate" in tools_called, f"calculate not called. Called: {tools_called}"

        # Should finish naturally.
        assert result.finished_naturally, "Should finish naturally after answering"

        return result

    await _try_models(scenario)


@pytest.mark.asyncio
async def test_unknown_tool_rejected():
    """If the model calls a tool not in valid_tool_names, it gets an error."""

    async def scenario(server, model):
        # Only allow "calculate" but give schema for both.
        agent = HermesAgentLoop(
            server=server,
            tool_schemas=[CALC_TOOL, WEATHER_TOOL],
            valid_tool_names={"calculate"},  # weather NOT allowed
            max_turns=5,
            temperature=0.0,
            max_tokens=500,
        )

        convo = [
            {"role": "user", "content": "What's the weather in London? Use get_weather."},
        ]

        with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
            result = await agent.run(convo)

        # Only checked when errors were recorded: the model may refuse to
        # call a disallowed tool at all, which is also acceptable.
        if result.tool_errors:
            rejected = [e for e in result.tool_errors if e.tool_name == "get_weather"]
            assert len(rejected) > 0, "get_weather should have been rejected"
            assert "Unknown tool" in rejected[0].error

        return result

    await _try_models(scenario)


@pytest.mark.asyncio
async def test_max_turns_limit():
    """Agent should stop after max_turns even if model keeps calling tools."""

    async def scenario(server, model):
        agent = HermesAgentLoop(
            server=server,
            tool_schemas=[LOOKUP_TOOL],
            valid_tool_names={"lookup"},
            max_turns=2,  # Very low limit
            temperature=0.0,
            max_tokens=500,
        )

        convo = [
            {"role": "user", "content": (
                "Keep looking up facts. Look up 'fact 1', then 'fact 2', "
                "then 'fact 3', then 'fact 4'. Do them one at a time."
            )},
        ]

        with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
            result = await agent.run(convo)

        assert result.turns_used <= 2, f"Should stop at max_turns=2, used {result.turns_used}"
        assert not result.finished_naturally, "Should NOT finish naturally (hit max_turns)"

        return result

    await _try_models(scenario)


@pytest.mark.asyncio
async def test_no_tools_direct_response():
    """When no tools are useful, model should respond directly."""

    async def scenario(server, model):
        agent = HermesAgentLoop(
            server=server,
            tool_schemas=[WEATHER_TOOL],
            valid_tool_names={"get_weather"},
            max_turns=5,
            temperature=0.0,
            max_tokens=200,
        )

        convo = [
            {"role": "user", "content": "What is 2 + 2? Just answer directly, no tools needed."},
        ]

        with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
            result = await agent.run(convo)

        assert result.finished_naturally, "Should finish naturally with a direct response"
        assert result.turns_used == 1, f"Should take exactly 1 turn for a direct answer, took {result.turns_used}"

        final_msg = result.messages[-1]
        assert final_msg["role"] == "assistant"
        assert final_msg["content"], "Should have text content"
        assert "4" in final_msg["content"], "Should contain the answer '4'"

        return result

    await _try_models(scenario)


@pytest.mark.asyncio
async def test_tool_error_handling():
    """Tool execution errors should be captured and reported to the model."""

    async def scenario(server, model):
        agent = HermesAgentLoop(
            server=server,
            tool_schemas=[ERROR_TOOL],
            valid_tool_names={"failing_tool"},
            max_turns=5,
            temperature=0.0,
            max_tokens=500,
        )

        convo = [
            {"role": "user", "content": "Please call the failing_tool with input 'test'."},
        ]

        with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
            result = await agent.run(convo)

        # The tool error should be recorded.
        assert len(result.tool_errors) >= 1, "Should have at least one tool error"
        first_error = result.tool_errors[0].error
        assert "RuntimeError" in first_error or "always fails" in first_error

        # The error should be in the conversation as a tool result.
        tool_msgs = [m for m in result.messages if m.get("role") == "tool"]
        assert len(tool_msgs) >= 1
        assert "error" in json.loads(tool_msgs[0]["content"])

        return result

    await _try_models(scenario)


@pytest.mark.asyncio
async def test_agent_result_structure():
    """Verify the AgentResult has all expected fields populated."""

    async def scenario(server, model):
        agent = HermesAgentLoop(
            server=server,
            tool_schemas=[CALC_TOOL],
            valid_tool_names={"calculate"},
            max_turns=5,
            temperature=0.0,
            max_tokens=300,
        )

        convo = [
            {"role": "user", "content": "What is 3 + 4? Use the calculate tool."},
        ]

        with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
            result = await agent.run(convo)

        # Structural checks.
        assert isinstance(result, AgentResult)
        assert isinstance(result.messages, list)
        assert len(result.messages) >= 3, "Should have user + assistant(tool) + tool_result + assistant(final)"
        assert isinstance(result.turns_used, int)
        assert result.turns_used > 0
        assert isinstance(result.finished_naturally, bool)
        assert isinstance(result.tool_errors, list)
        assert isinstance(result.reasoning_per_turn, list)

        # Messages should follow OpenAI format.
        valid_roles = ("system", "user", "assistant", "tool")
        for msg in result.messages:
            assert "role" in msg, f"Message missing 'role': {msg}"
            assert msg["role"] in valid_roles, f"Invalid role: {msg['role']}"

        return result

    await _try_models(scenario)


@pytest.mark.asyncio
async def test_conversation_history_preserved():
    """The full conversation history should be in result.messages."""

    async def scenario(server, model):
        agent = HermesAgentLoop(
            server=server,
            tool_schemas=[WEATHER_TOOL],
            valid_tool_names={"get_weather"},
            max_turns=5,
            temperature=0.0,
            max_tokens=500,
        )

        convo = [
            {"role": "system", "content": "You are a helpful weather assistant."},
            {"role": "user", "content": "What's the weather in Berlin? Use get_weather."},
        ]

        with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
            result = await agent.run(convo)

        # System message should be preserved.
        first = result.messages[0]
        assert first["role"] == "system"
        assert "weather assistant" in first["content"]

        # User message should be preserved.
        second = result.messages[1]
        assert second["role"] == "user"
        assert "Berlin" in second["content"]

        # Should have assistant + tool + assistant sequence.
        roles = [m["role"] for m in result.messages]
        assert "tool" in roles, "Should have tool results in conversation"

        return result

    await _try_models(scenario)