# mini_swe_runner.py
#!/usr/bin/env python3
"""
SWE Runner with Hermes Trajectory Format

A runner that uses Hermes-Agent's built-in execution environments
(local, docker, modal) and outputs trajectories in the Hermes-Agent format
compatible with batch_runner.py and trajectory_compressor.py.

Features:
- Uses Hermes-Agent's Docker, Modal, or Local environments for command execution
- Outputs trajectories in Hermes format (from/value pairs with <tool_call>/<tool_response> XML)
- Compatible with the trajectory compression pipeline
- Supports batch processing from JSONL prompt files

Usage:
    # Run a single task with local environment
    python mini_swe_runner.py --task "Create a hello world Python script" --env local

    # Run with Docker
    python mini_swe_runner.py --task "List files in /tmp" --env docker --image python:3.11-slim

    # Run with Modal (cloud)
    python mini_swe_runner.py --task "Install numpy and test it" --env modal --image python:3.11-slim

    # Batch mode from JSONL file
    python mini_swe_runner.py --prompts_file prompts.jsonl --output_file trajectories.jsonl --env docker
"""

import json
import logging
import os
import sys
import time
import uuid
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any, Optional, Literal

import fire
from dotenv import load_dotenv

# Pull API keys / endpoints from a local .env file, if one exists.
load_dotenv()


def _effective_temperature_for_model(
    model: str,
    base_url: Optional[str] = None,
) -> Optional[float]:
    """Return a fixed temperature for models with strict sampling contracts.

    Returns ``None`` when the model manages temperature server-side (Kimi);
    callers must omit the ``temperature`` kwarg entirely in that case.
    """
    try:
        from agent.auxiliary_client import _fixed_temperature_for_model, OMIT_TEMPERATURE
    except Exception:
        # Router helpers unavailable (e.g. standalone checkout): no override.
        return None
    fixed = _fixed_temperature_for_model(model, base_url)
    # OMIT_TEMPERATURE is a sentinel meaning "do not send the kwarg at all".
    return None if fixed is OMIT_TEMPERATURE else fixed


# ============================================================================
# Terminal Tool Definition (matches Hermes-Agent format)
# ============================================================================

TERMINAL_TOOL_DEFINITION = {
    "type": "function",
    "function": {
        "name": "terminal",
        "description": """Execute bash commands in a sandboxed environment.

**Environment:**
- Isolated execution environment (local, Docker, or Modal cloud)
- Filesystem persists between tool calls within the same task
- Internet access available

**Command Execution:**
- Provide the command to execute via the 'command' parameter
- Optional 'timeout' parameter in seconds (default: 60)

**Examples:**
- Run command: `{"command": "ls -la"}`
- With timeout: `{"command": "long_task.sh", "timeout": 300}`

**Best Practices:**
- Use non-interactive commands (avoid vim, nano, interactive python)
- Pipe to cat if output might be large
- Install tools with apt-get or pip as needed

**Completion:**
- When task is complete, output: echo "MINI_SWE_AGENT_FINAL_OUTPUT" followed by your result
""",
        "parameters": {
            "type": "object",
            "properties": {
                "command": {
                    "type": "string",
                    "description": "The bash command to execute"
                },
                "timeout": {
                    "type": "integer",
                    "description": "Command timeout in seconds (default: 60)"
                }
            },
            "required": ["command"]
        }
    }
}


# ============================================================================
# Environment Factory
# ============================================================================

def create_environment(
    env_type: str = "local",
    image: str = "python:3.11-slim",
    cwd: str = "/tmp",
    timeout: int = 60,
    **kwargs
):
    """
    Create an execution environment using Hermes-Agent's built-in backends.

    Args:
        env_type: One of "local", "docker", "modal"
        image: Docker/Modal image name (ignored for local)
        cwd: Working directory
        timeout: Default command timeout
        **kwargs: Additional environment-specific options

    Returns:
        Environment instance with execute() and cleanup() methods

    Raises:
        ValueError: If ``env_type`` is not a supported backend name.
    """
    # Backend imports are deferred so only the selected backend's
    # dependencies must be importable.
    if env_type == "local":
        from tools.environments.local import LocalEnvironment
        return LocalEnvironment(cwd=cwd, timeout=timeout)

    elif env_type == "docker":
        from tools.environments.docker import DockerEnvironment
        return DockerEnvironment(image=image, cwd=cwd, timeout=timeout, **kwargs)

    elif env_type == "modal":
        from tools.environments.modal import ModalEnvironment
        return ModalEnvironment(image=image, cwd=cwd, timeout=timeout, **kwargs)

    else:
        raise ValueError(f"Unknown environment type: {env_type}. Use 'local', 'docker', or 'modal'")


# ============================================================================
# Mini-SWE Runner with Hermes Trajectory Format
# ============================================================================

class MiniSWERunner:
    """
    Agent runner that uses Hermes-Agent's built-in execution environments
    and outputs trajectories in Hermes-Agent format.
    """

    def __init__(
        self,
        model: str = "anthropic/claude-sonnet-4.6",
        base_url: str = None,
        api_key: str = None,
        env_type: str = "local",
        image: str = "python:3.11-slim",
        cwd: str = "/tmp",
        max_iterations: int = 15,
        command_timeout: int = 60,
        verbose: bool = False,
    ):
        """
        Initialize the Mini-SWE Runner.

        Args:
            model: Model name for OpenAI-compatible API
            base_url: API base URL (optional, uses env vars if not provided)
            api_key: API key (optional, uses env vars if not provided)
            env_type: Environment type - "local", "docker", or "modal"
            image: Docker/Modal image (ignored for local)
            cwd: Working directory for commands
            max_iterations: Maximum tool-calling iterations
            command_timeout: Default timeout for commands
            verbose: Enable verbose logging
        """
        self.model = model
        self.max_iterations = max_iterations
        self.command_timeout = command_timeout
        self.verbose = verbose
        self.env_type = env_type
        self.image = image
        self.cwd = cwd

        # Setup logging
        logging.basicConfig(
            level=logging.DEBUG if verbose else logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            datefmt='%H:%M:%S'
        )
        self.logger = logging.getLogger(__name__)

        # Initialize LLM client via centralized provider router.
        # If explicit api_key/base_url are provided (e.g. from CLI args),
        # construct directly. Otherwise use the router for OpenRouter.
        if api_key or base_url:
            from openai import OpenAI
            client_kwargs = {
                "base_url": base_url or "https://openrouter.ai/api/v1",
                # Key precedence: explicit arg, then OPENROUTER/ANTHROPIC/OPENAI env vars.
                "api_key": api_key or os.getenv(
                    "OPENROUTER_API_KEY",
                    os.getenv("ANTHROPIC_API_KEY",
                              os.getenv("OPENAI_API_KEY", ""))),
            }
            self.client = OpenAI(**client_kwargs)
        else:
            from agent.auxiliary_client import resolve_provider_client
            self.client, _ = resolve_provider_client("openrouter", model=model)
            if self.client is None:
                # Fallback: try auto-detection
                self.client, _ = resolve_provider_client("auto", model=model)
            if self.client is None:
                # Last resort: raw OpenRouter client straight from env vars.
                from openai import OpenAI
                self.client = OpenAI(
                    base_url="https://openrouter.ai/api/v1",
                    api_key=os.getenv("OPENROUTER_API_KEY", ""))

        # Environment will be created per-task
        self.env = None

        # Tool definition
        self.tools = [TERMINAL_TOOL_DEFINITION]

        print("š¤ Mini-SWE Runner initialized")
        print(f" Model: {self.model}")
        print(f" Environment: {self.env_type}")
        if self.env_type != "local":
            print(f" Image: {self.image}")
        print(f" Max iterations: {self.max_iterations}")

    def _create_env(self):
        """Create the execution environment."""
        print(f"š§ Creating {self.env_type} environment...")
        self.env = create_environment(
            env_type=self.env_type,
            image=self.image,
            cwd=self.cwd,
            timeout=self.command_timeout
        )
        print("ā Environment ready")

    def _cleanup_env(self):
        """Tear down the execution environment (best-effort).

        This runs inside the ``finally`` of run_task(): a teardown failure
        must never mask the task's own exception, so errors are logged and
        swallowed, and ``self.env`` is always cleared.
        """
        if self.env is not None:
            try:
                if hasattr(self.env, 'cleanup'):
                    self.env.cleanup()
                elif hasattr(self.env, 'stop'):
                    self.env.stop()
            except Exception as e:
                self.logger.warning(f"Environment cleanup failed: {e}")
            finally:
                self.env = None

    def _execute_command(self, command: str, timeout: int = None) -> Dict[str, Any]:
        """
        Execute a command in the environment.

        Args:
            command: Bash command to execute
            timeout: Optional timeout override; ``None`` means use the
                runner default. (An explicit value of 0 is honored — the
                previous ``timeout or default`` silently discarded it.)

        Returns:
            Dict with 'output', 'exit_code', and 'error' keys. Execution
            failures are returned as data (exit_code=-1, error set) rather
            than raised, so the model can react to them.
        """
        if self.env is None:
            self._create_env()

        effective_timeout = self.command_timeout if timeout is None else timeout
        try:
            result = self.env.execute(command, timeout=effective_timeout)
            return {
                "output": result.get("output", ""),
                "exit_code": result.get("returncode", 0),
                "error": None
            }
        except Exception as e:
            return {
                "output": "",
                "exit_code": -1,
                "error": str(e)
            }

    def _format_tools_for_system_message(self) -> str:
        """Serialize tool definitions as JSON for the <tools> system block."""
        formatted_tools = []
        for tool in self.tools:
            func = tool["function"]
            formatted_tools.append({
                "name": func["name"],
                "description": func.get("description", ""),
                "parameters": func.get("parameters", {}),
                # NOTE(review): 'required' is emitted as None, presumably to
                # match the downstream Hermes schema — confirm against
                # batch_runner.py before changing.
                "required": None
            })
        return json.dumps(formatted_tools, ensure_ascii=False)

    def _convert_to_hermes_format(
        self,
        messages: List[Dict[str, Any]],
        user_query: str,
        completed: bool
    ) -> List[Dict[str, Any]]:
        """
        Convert internal message format to Hermes trajectory format.

        This produces the exact format used by batch_runner.py: a list of
        {"from": ..., "value": ...} turns where tool calls/results are
        embedded as <tool_call>/<tool_response> XML inside the text.

        Args:
            messages: Internal OpenAI-style message list (first entry is
                the user task).
            user_query: The original task text (emitted as the "human" turn).
            completed: Accepted for interface parity; not currently used.
        """
        trajectory = []

        # System message with tool definitions
        system_msg = (
            "You are a function calling AI model. You are provided with function signatures within <tools> </tools> XML tags. "
            "You may call one or more functions to assist with the user query. If available tools are not relevant in assisting "
            "with user query, just respond in natural conversational language. Don't make assumptions about what values to plug "
            "into functions. After calling & executing the functions, you will be provided with function results within "
            "<tool_response> </tool_response> XML tags. Here are the available tools:\n"
            f"<tools>\n{self._format_tools_for_system_message()}\n</tools>\n"
            "For each function call return a JSON object, with the following pydantic model json schema for each:\n"
            "{'title': 'FunctionCall', 'type': 'object', 'properties': {'name': {'title': 'Name', 'type': 'string'}, "
            "'arguments': {'title': 'Arguments', 'type': 'object'}}, 'required': ['name', 'arguments']}\n"
            "Each function call should be enclosed within <tool_call> </tool_call> XML tags.\n"
            "Example:\n<tool_call>\n{'name': <function-name>,'arguments': <args-dict>}\n</tool_call>"
        )

        trajectory.append({"from": "system", "value": system_msg})
        trajectory.append({"from": "human", "value": user_query})

        # Process messages (skip first user message as we already added it)
        i = 1
        while i < len(messages):
            msg = messages[i]

            if msg["role"] == "assistant":
                if "tool_calls" in msg and msg["tool_calls"]:
                    # Assistant message with tool calls
                    content = ""

                    # Add reasoning if present
                    if msg.get("reasoning"):
                        content = f"<think>{msg['reasoning']}</think>"

                    if msg.get("content"):
                        content += msg["content"] + "\n"

                    # Add tool calls in XML format
                    for tool_call in msg["tool_calls"]:
                        if not tool_call or not isinstance(tool_call, dict):
                            continue
                        try:
                            arguments = json.loads(tool_call["function"]["arguments"]) \
                                if isinstance(tool_call["function"]["arguments"], str) \
                                else tool_call["function"]["arguments"]
                        except json.JSONDecodeError:
                            arguments = {}

                        tool_call_json = {
                            "name": tool_call["function"]["name"],
                            "arguments": arguments
                        }
                        content += f"<tool_call>\n{json.dumps(tool_call_json, ensure_ascii=False)}\n</tool_call>\n"

                    trajectory.append({"from": "gpt", "value": content.rstrip()})

                    # Collect subsequent tool responses into one "tool" turn
                    tool_responses = []
                    j = i + 1
                    while j < len(messages) and messages[j]["role"] == "tool":
                        tool_msg = messages[j]
                        tool_content = tool_msg["content"]

                        # Try to parse as JSON so the response embeds as an object
                        try:
                            if tool_content.strip().startswith(("{", "[")):
                                tool_content = json.loads(tool_content)
                        except (json.JSONDecodeError, AttributeError):
                            pass

                        tool_response = "<tool_response>\n"
                        tool_response += json.dumps({
                            "tool_call_id": tool_msg.get("tool_call_id", ""),
                            # Pair each response with the call at the same index;
                            # fall back to "unknown" if counts are mismatched.
                            "name": msg["tool_calls"][len(tool_responses)]["function"]["name"] \
                                if len(tool_responses) < len(msg["tool_calls"]) else "unknown",
                            "content": tool_content
                        }, ensure_ascii=False)
                        tool_response += "\n</tool_response>"
                        tool_responses.append(tool_response)
                        j += 1

                    if tool_responses:
                        trajectory.append({"from": "tool", "value": "\n".join(tool_responses)})
                        # Skip past the consumed tool messages (i += 1 below lands on j)
                        i = j - 1

                else:
                    # Regular assistant message (no tool calls)
                    content = ""
                    if msg.get("reasoning"):
                        content = f"<think>{msg['reasoning']}</think>"
                    content += msg.get("content") or ""
                    trajectory.append({"from": "gpt", "value": content})

            elif msg["role"] == "user":
                trajectory.append({"from": "human", "value": msg["content"]})

            i += 1

        return trajectory

    def run_task(self, task: str) -> Dict[str, Any]:
        """
        Run a single task and return the result with trajectory.

        Args:
            task: The task/prompt to execute

        Returns:
            Dict with 'conversations' (Hermes trajectory), 'completed',
            'api_calls', and 'metadata'.
        """
        print(f"\n{'='*60}")
        print(f"š Task: {task[:80]}{'...' if len(task) > 80 else ''}")
        print(f"{'='*60}")

        # Initialize environment
        self._create_env()

        # Message history (the system prompt below is sent per-call but is
        # deliberately NOT saved into the trajectory).
        messages = [{"role": "user", "content": task}]

        system_prompt = """You are an AI agent that can execute bash commands to complete tasks.

When you need to run commands, use the 'terminal' tool with your bash command.

**Important:**
- When you have completed the task successfully, run: echo "MINI_SWE_AGENT_FINAL_OUTPUT" followed by a summary
- Be concise and efficient in your approach
- Install any needed tools with apt-get or pip
- Avoid interactive commands (no vim, nano, less, etc.)

Complete the user's task step by step."""

        api_call_count = 0
        completed = False

        try:
            while api_call_count < self.max_iterations:
                api_call_count += 1
                print(f"\nš API call #{api_call_count}/{self.max_iterations}")

                # Prepare API messages
                api_messages = [{"role": "system", "content": system_prompt}] + messages

                # Make API call
                try:
                    api_kwargs = {
                        "model": self.model,
                        "messages": api_messages,
                        "tools": self.tools,
                        "timeout": 300.0,
                    }
                    # Some models pin temperature (or require omitting it);
                    # only send the kwarg when a fixed value is mandated.
                    fixed_temperature = _effective_temperature_for_model(
                        self.model,
                        str(getattr(self.client, "base_url", "") or ""),
                    )
                    if fixed_temperature is not None:
                        api_kwargs["temperature"] = fixed_temperature

                    response = self.client.chat.completions.create(**api_kwargs)
                except Exception as e:
                    self.logger.error(f"API call failed: {e}")
                    break

                assistant_message = response.choices[0].message

                # Log assistant response
                if assistant_message.content:
                    print(f"š¤ Assistant: {assistant_message.content[:100]}...")

                # Check for tool calls
                if assistant_message.tool_calls:
                    print(f"š§ Tool calls: {len(assistant_message.tool_calls)}")

                    # Add assistant message with tool calls
                    messages.append({
                        "role": "assistant",
                        "content": assistant_message.content,
                        "tool_calls": [
                            {
                                "id": tc.id,
                                "type": tc.type,
                                "function": {
                                    "name": tc.function.name,
                                    "arguments": tc.function.arguments
                                }
                            }
                            for tc in assistant_message.tool_calls
                        ]
                    })

                    # Execute each tool call
                    for tc in assistant_message.tool_calls:
                        try:
                            args = json.loads(tc.function.arguments)
                        except json.JSONDecodeError:
                            args = {}

                        command = args.get("command", "echo 'No command provided'")
                        timeout = args.get("timeout", self.command_timeout)

                        print(f" š terminal: {command[:60]}...")

                        # Execute command
                        result = self._execute_command(command, timeout)

                        # Format result
                        result_json = json.dumps({
                            "content": {
                                "output": result["output"],
                                "exit_code": result["exit_code"],
                                "error": result["error"]
                            }
                        }, ensure_ascii=False)

                        # Check for task completion signal
                        if "MINI_SWE_AGENT_FINAL_OUTPUT" in result["output"]:
                            print(" ā Task completion signal detected!")
                            completed = True

                        # Add tool response
                        messages.append({
                            "role": "tool",
                            "content": result_json,
                            "tool_call_id": tc.id
                        })

                        print(f" ā exit_code={result['exit_code']}, output={len(result['output'])} chars")

                    # If task completed, we can stop
                    if completed:
                        break

                else:
                    # No tool calls - treat as the agent's final response
                    messages.append({
                        "role": "assistant",
                        "content": assistant_message.content or ""
                    })
                    completed = True
                    print("š Agent finished (no more tool calls)")
                    break

            # Only warn when we actually ran out of budget without finishing.
            if not completed and api_call_count >= self.max_iterations:
                print(f"ā ļø Reached max iterations ({self.max_iterations})")

        finally:
            # Cleanup environment
            self._cleanup_env()

        # Convert to Hermes trajectory format
        trajectory = self._convert_to_hermes_format(messages, task, completed)

        return {
            "conversations": trajectory,
            "completed": completed,
            "api_calls": api_call_count,
            "metadata": {
                "model": self.model,
                "env_type": self.env_type,
                "timestamp": datetime.now().isoformat()
            }
        }

    def run_batch(
        self,
        prompts: List[str],
        output_file: str
    ) -> List[Dict[str, Any]]:
        """
        Run multiple tasks and save trajectories to a JSONL file.

        Each result is written and flushed immediately so partial progress
        survives a crash mid-batch. A task failure is recorded as an error
        entry rather than aborting the batch.

        Args:
            prompts: List of task prompts
            output_file: Output JSONL file path

        Returns:
            List of results
        """
        results = []

        print(f"\nš¦ Running batch of {len(prompts)} tasks")
        print(f"š Output: {output_file}")

        with open(output_file, 'w', encoding='utf-8') as f:
            for i, prompt in enumerate(prompts, 1):
                print(f"\n{'='*60}")
                print(f"š Task {i}/{len(prompts)}")
                print(f"{'='*60}")

                try:
                    result = self.run_task(prompt)
                    results.append(result)

                    # Write to file immediately
                    f.write(json.dumps(result, ensure_ascii=False) + "\n")
                    f.flush()

                    print(f"ā Task {i} completed (api_calls={result['api_calls']})")

                except Exception as e:
                    self.logger.error(f"Error on task {i}: {e}")
                    error_result = {
                        "conversations": [],
                        "completed": False,
                        "api_calls": 0,
                        "error": str(e),
                        "metadata": {"timestamp": datetime.now().isoformat()}
                    }
                    results.append(error_result)
                    f.write(json.dumps(error_result, ensure_ascii=False) + "\n")
                    f.flush()

        print(f"\nā Batch complete! {len(results)} trajectories saved to {output_file}")
        return results


# ============================================================================
# CLI Interface
# ============================================================================

def main(
    task: str = None,
    prompts_file: str = None,
    output_file: str = "swe-runner-test1.jsonl",
    model: str = "claude-sonnet-4-20250514",
    base_url: str = None,
    api_key: str = None,
    env: str = "local",
    image: str = "python:3.11-slim",
    cwd: str = "/tmp",
    max_iterations: int = 15,
    timeout: int = 60,
    verbose: bool = False,
):
    """
    Run SWE tasks with Hermes trajectory format output.

    Args:
        task: Single task to run (use this OR prompts_file)
        prompts_file: JSONL file with prompts (each line: {"prompt": "..."})
        output_file: Output JSONL file for trajectories
        model: Model name (default: claude-sonnet-4-20250514)
        base_url: API base URL (optional)
        api_key: API key (optional, uses env vars)
        env: Environment type - "local", "docker", or "modal"
        image: Docker/Modal image (default: python:3.11-slim)
        cwd: Working directory (default: /tmp)
        max_iterations: Maximum tool-calling iterations (default: 15)
        timeout: Command timeout in seconds (default: 60)
        verbose: Enable verbose logging

    Examples:
        # Single task with local environment
        python mini_swe_runner.py --task "Create hello.py that prints Hello World"

        # Single task with Docker
        python mini_swe_runner.py --task "List files" --env docker

        # Batch from file
        python mini_swe_runner.py --prompts_file tasks.jsonl --output_file results.jsonl
    """
    print("š Mini-SWE Runner with Hermes Trajectory Format")
    print("=" * 60)

    # Initialize runner
    runner = MiniSWERunner(
        model=model,
        base_url=base_url,
        api_key=api_key,
        env_type=env,
        image=image,
        cwd=cwd,
        max_iterations=max_iterations,
        command_timeout=timeout,
        verbose=verbose,
    )

    if task:
        # Single task mode
        result = runner.run_task(task)

        # Save to file
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(json.dumps(result, ensure_ascii=False) + "\n")

        print(f"\nš Trajectory saved to: {output_file}")
        print(f"ā Completed: {result['completed']}")
        print(f"š API calls: {result['api_calls']}")
        print(f"š¬ Turns: {len(result['conversations'])}")

    elif prompts_file:
        # Batch mode: each line is JSON ({"prompt": ...} or {"task": ...})
        # or, as a fallback, a raw prompt string.
        prompts = []
        with open(prompts_file, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line:
                    try:
                        entry = json.loads(line)
                        prompts.append(entry.get("prompt", entry.get("task", "")))
                    except json.JSONDecodeError:
                        prompts.append(line)

        if not prompts:
            print(f"ā No prompts found in {prompts_file}")
            return

        runner.run_batch(prompts, output_file)

    else:
        print("ā Please provide either --task or --prompts_file")
        print(" Example: python mini_swe_runner.py --task 'Create a hello world script'")


if __name__ == "__main__":
    fire.Fire(main)