# mini_swe_runner.py
#!/usr/bin/env python3
"""
SWE Runner with Hermes Trajectory Format

A runner that uses Hermes-Agent's built-in execution environments
(local, docker, modal) and outputs trajectories in the Hermes-Agent format
compatible with batch_runner.py and trajectory_compressor.py.

Features:
- Uses Hermes-Agent's Docker, Modal, or Local environments for command execution
- Outputs trajectories in Hermes format (from/value pairs with <tool_call>/<tool_response> XML)
- Compatible with the trajectory compression pipeline
- Supports batch processing from JSONL prompt files

Usage:
    # Run a single task with local environment
    python mini_swe_runner.py --task "Create a hello world Python script" --env local

    # Run with Docker
    python mini_swe_runner.py --task "List files in /tmp" --env docker --image python:3.11-slim

    # Run with Modal (cloud)
    python mini_swe_runner.py --task "Install numpy and test it" --env modal --image python:3.11-slim

    # Batch mode from JSONL file
    python mini_swe_runner.py --prompts_file prompts.jsonl --output_file trajectories.jsonl --env docker
"""

import json
import logging
import os
import sys
import time
import uuid
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any, Optional, Literal

import fire
from dotenv import load_dotenv

# Pull API keys / endpoints from a local .env file, if one exists.
load_dotenv()


def _effective_temperature_for_model(
    model: str,
    base_url: Optional[str] = None,
) -> Optional[float]:
    """Return a fixed temperature for models with strict sampling contracts.

    Returns ``None`` when the model manages temperature server-side (Kimi);
    callers must omit the ``temperature`` kwarg entirely in that case.
    """
    try:
        from agent.auxiliary_client import _fixed_temperature_for_model, OMIT_TEMPERATURE
    except Exception:
        # Router helpers unavailable (e.g. standalone checkout): no override.
        return None
    fixed = _fixed_temperature_for_model(model, base_url)
    # OMIT_TEMPERATURE is a sentinel meaning "do not send the kwarg at all".
    return None if fixed is OMIT_TEMPERATURE else fixed


# ============================================================================
# Terminal Tool Definition (matches Hermes-Agent format)
# ============================================================================

TERMINAL_TOOL_DEFINITION = {
    "type": "function",
    "function": {
        "name": "terminal",
        "description": """Execute bash commands in a sandboxed environment.

**Environment:**
- Isolated execution environment (local, Docker, or Modal cloud)
- Filesystem persists between tool calls within the same task
- Internet access available

**Command Execution:**
- Provide the command to execute via the 'command' parameter
- Optional 'timeout' parameter in seconds (default: 60)

**Examples:**
- Run command: `{"command": "ls -la"}`
- With timeout: `{"command": "long_task.sh", "timeout": 300}`

**Best Practices:**
- Use non-interactive commands (avoid vim, nano, interactive python)
- Pipe to cat if output might be large
- Install tools with apt-get or pip as needed

**Completion:**
- When task is complete, output: echo "MINI_SWE_AGENT_FINAL_OUTPUT" followed by your result
""",
        "parameters": {
            "type": "object",
            "properties": {
                "command": {
                    "type": "string",
                    "description": "The bash command to execute"
                },
                "timeout": {
                    "type": "integer",
                    "description": "Command timeout in seconds (default: 60)"
                }
            },
            "required": ["command"]
        }
    }
}


# ============================================================================
# Environment Factory
# ============================================================================

def create_environment(
    env_type: str = "local",
    image: str = "python:3.11-slim",
    cwd: str = "/tmp",
    timeout: int = 60,
    **kwargs
):
    """
    Create an execution environment using Hermes-Agent's built-in backends.

    Args:
        env_type: One of "local", "docker", "modal"
        image: Docker/Modal image name (ignored for local)
        cwd: Working directory
        timeout: Default command timeout
        **kwargs: Additional environment-specific options

    Returns:
        Environment instance with execute() and cleanup() methods

    Raises:
        ValueError: If ``env_type`` is not a supported backend name.
    """
    # Backend imports are deferred so only the selected backend's
    # dependencies must be importable.
    if env_type == "local":
        from tools.environments.local import LocalEnvironment
        return LocalEnvironment(cwd=cwd, timeout=timeout)

    elif env_type == "docker":
        from tools.environments.docker import DockerEnvironment
        return DockerEnvironment(image=image, cwd=cwd, timeout=timeout, **kwargs)

    elif env_type == "modal":
        from tools.environments.modal import ModalEnvironment
        return ModalEnvironment(image=image, cwd=cwd, timeout=timeout, **kwargs)

    else:
        raise ValueError(f"Unknown environment type: {env_type}. Use 'local', 'docker', or 'modal'")


# ============================================================================
# Mini-SWE Runner with Hermes Trajectory Format
# ============================================================================

class MiniSWERunner:
    """
    Agent runner that uses Hermes-Agent's built-in execution environments
    and outputs trajectories in Hermes-Agent format.
    """

    def __init__(
        self,
        model: str = "anthropic/claude-sonnet-4.6",
        base_url: str = None,
        api_key: str = None,
        env_type: str = "local",
        image: str = "python:3.11-slim",
        cwd: str = "/tmp",
        max_iterations: int = 15,
        command_timeout: int = 60,
        verbose: bool = False,
    ):
        """
        Initialize the Mini-SWE Runner.

        Args:
            model: Model name for OpenAI-compatible API
            base_url: API base URL (optional, uses env vars if not provided)
            api_key: API key (optional, uses env vars if not provided)
            env_type: Environment type - "local", "docker", or "modal"
            image: Docker/Modal image (ignored for local)
            cwd: Working directory for commands
            max_iterations: Maximum tool-calling iterations
            command_timeout: Default timeout for commands
            verbose: Enable verbose logging
        """
        self.model = model
        self.max_iterations = max_iterations
        self.command_timeout = command_timeout
        self.verbose = verbose
        self.env_type = env_type
        self.image = image
        self.cwd = cwd

        # Setup logging
        logging.basicConfig(
            level=logging.DEBUG if verbose else logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            datefmt='%H:%M:%S'
        )
        self.logger = logging.getLogger(__name__)

        # Initialize LLM client via centralized provider router.
        # If explicit api_key/base_url are provided (e.g. from CLI args),
        # construct directly. Otherwise use the router for OpenRouter.
        if api_key or base_url:
            from openai import OpenAI
            client_kwargs = {
                "base_url": base_url or "https://openrouter.ai/api/v1",
                # Key precedence: explicit arg, then OPENROUTER/ANTHROPIC/OPENAI env vars.
                "api_key": api_key or os.getenv(
                    "OPENROUTER_API_KEY",
                    os.getenv("ANTHROPIC_API_KEY",
                              os.getenv("OPENAI_API_KEY", ""))),
            }
            self.client = OpenAI(**client_kwargs)
        else:
            from agent.auxiliary_client import resolve_provider_client
            self.client, _ = resolve_provider_client("openrouter", model=model)
            if self.client is None:
                # Fallback: try auto-detection
                self.client, _ = resolve_provider_client("auto", model=model)
            if self.client is None:
                # Last resort: raw OpenRouter client straight from env vars.
                from openai import OpenAI
                self.client = OpenAI(
                    base_url="https://openrouter.ai/api/v1",
                    api_key=os.getenv("OPENROUTER_API_KEY", ""))

        # Environment will be created per-task
        self.env = None

        # Tool definition
        self.tools = [TERMINAL_TOOL_DEFINITION]

        print("š¤ Mini-SWE Runner initialized")
        print(f" Model: {self.model}")
        print(f" Environment: {self.env_type}")
        if self.env_type != "local":
            print(f" Image: {self.image}")
        print(f" Max iterations: {self.max_iterations}")

    def _create_env(self):
        """Create the execution environment."""
        print(f"š§ Creating {self.env_type} environment...")
        self.env = create_environment(
            env_type=self.env_type,
            image=self.image,
            cwd=self.cwd,
            timeout=self.command_timeout
        )
        print("ā Environment ready")

    def _cleanup_env(self):
        """Tear down the execution environment (best-effort).

        This runs inside the ``finally`` of run_task(): a teardown failure
        must never mask the task's own exception, so errors are logged and
        swallowed, and ``self.env`` is always cleared.
        """
        if self.env is not None:
            try:
                if hasattr(self.env, 'cleanup'):
                    self.env.cleanup()
                elif hasattr(self.env, 'stop'):
                    self.env.stop()
            except Exception as e:
                self.logger.warning(f"Environment cleanup failed: {e}")
            finally:
                self.env = None

    def _execute_command(self, command: str, timeout: int = None) -> Dict[str, Any]:
        """
        Execute a command in the environment.

        Args:
            command: Bash command to execute
            timeout: Optional timeout override; ``None`` means use the
                runner default. (An explicit value of 0 is honored — the
                previous ``timeout or default`` silently discarded it.)

        Returns:
            Dict with 'output', 'exit_code', and 'error' keys. Execution
            failures are returned as data (exit_code=-1, error set) rather
            than raised, so the model can react to them.
        """
        if self.env is None:
            self._create_env()

        effective_timeout = self.command_timeout if timeout is None else timeout
        try:
            result = self.env.execute(command, timeout=effective_timeout)
            return {
                "output": result.get("output", ""),
                "exit_code": result.get("returncode", 0),
                "error": None
            }
        except Exception as e:
            return {
                "output": "",
                "exit_code": -1,
                "error": str(e)
            }

    def _format_tools_for_system_message(self) -> str:
        """Serialize tool definitions as JSON for the <tools> system block."""
        formatted_tools = []
        for tool in self.tools:
            func = tool["function"]
            formatted_tools.append({
                "name": func["name"],
                "description": func.get("description", ""),
                "parameters": func.get("parameters", {}),
                # NOTE(review): 'required' is emitted as None, presumably to
                # match the downstream Hermes schema — confirm against
                # batch_runner.py before changing.
                "required": None
            })
        return json.dumps(formatted_tools, ensure_ascii=False)

    def _convert_to_hermes_format(
        self,
        messages: List[Dict[str, Any]],
        user_query: str,
        completed: bool
    ) -> List[Dict[str, Any]]:
        """
        Convert internal message format to Hermes trajectory format.

        This produces the exact format used by batch_runner.py: a list of
        {"from": ..., "value": ...} turns where tool calls/results are
        embedded as <tool_call>/<tool_response> XML inside the text.

        Args:
            messages: Internal OpenAI-style message list (first entry is
                the user task).
            user_query: The original task text (emitted as the "human" turn).
            completed: Accepted for interface parity; not currently used.
        """
        trajectory = []

        # System message with tool definitions
        system_msg = (
            "You are a function calling AI model. You are provided with function signatures within <tools> </tools> XML tags. "
            "You may call one or more functions to assist with the user query. If available tools are not relevant in assisting "
            "with user query, just respond in natural conversational language. Don't make assumptions about what values to plug "
            "into functions. After calling & executing the functions, you will be provided with function results within "
            "<tool_response> </tool_response> XML tags. Here are the available tools:\n"
            f"<tools>\n{self._format_tools_for_system_message()}\n</tools>\n"
            "For each function call return a JSON object, with the following pydantic model json schema for each:\n"
            "{'title': 'FunctionCall', 'type': 'object', 'properties': {'name': {'title': 'Name', 'type': 'string'}, "
            "'arguments': {'title': 'Arguments', 'type': 'object'}}, 'required': ['name', 'arguments']}\n"
            "Each function call should be enclosed within <tool_call> </tool_call> XML tags.\n"
            "Example:\n<tool_call>\n{'name': <function-name>,'arguments': <args-dict>}\n</tool_call>"
        )

        trajectory.append({"from": "system", "value": system_msg})
        trajectory.append({"from": "human", "value": user_query})

        # Process messages (skip first user message as we already added it)
        i = 1
        while i < len(messages):
            msg = messages[i]

            if msg["role"] == "assistant":
                if "tool_calls" in msg and msg["tool_calls"]:
                    # Assistant message with tool calls
                    content = ""

                    # Add reasoning if present
                    if msg.get("reasoning"):
                        content = f"<think>{msg['reasoning']}</think>"

                    if msg.get("content"):
                        content += msg["content"] + "\n"

                    # Add tool calls in XML format
                    for tool_call in msg["tool_calls"]:
                        if not tool_call or not isinstance(tool_call, dict):
                            continue
                        try:
                            arguments = json.loads(tool_call["function"]["arguments"]) \
                                if isinstance(tool_call["function"]["arguments"], str) \
                                else tool_call["function"]["arguments"]
                        except json.JSONDecodeError:
                            arguments = {}

                        tool_call_json = {
                            "name": tool_call["function"]["name"],
                            "arguments": arguments
                        }
                        content += f"<tool_call>\n{json.dumps(tool_call_json, ensure_ascii=False)}\n</tool_call>\n"

                    trajectory.append({"from": "gpt", "value": content.rstrip()})

                    # Collect subsequent tool responses into one "tool" turn
                    tool_responses = []
                    j = i + 1
                    while j < len(messages) and messages[j]["role"] == "tool":
                        tool_msg = messages[j]
                        tool_content = tool_msg["content"]

                        # Try to parse as JSON so the response embeds as an object
                        try:
                            if tool_content.strip().startswith(("{", "[")):
                                tool_content = json.loads(tool_content)
                        except (json.JSONDecodeError, AttributeError):
                            pass

                        tool_response = "<tool_response>\n"
                        tool_response += json.dumps({
                            "tool_call_id": tool_msg.get("tool_call_id", ""),
                            # Pair each response with the call at the same index;
                            # fall back to "unknown" if counts are mismatched.
                            "name": msg["tool_calls"][len(tool_responses)]["function"]["name"] \
                                if len(tool_responses) < len(msg["tool_calls"]) else "unknown",
                            "content": tool_content
                        }, ensure_ascii=False)
                        tool_response += "\n</tool_response>"
                        tool_responses.append(tool_response)
                        j += 1

                    if tool_responses:
                        trajectory.append({"from": "tool", "value": "\n".join(tool_responses)})
                        # Skip past the consumed tool messages (i += 1 below lands on j)
                        i = j - 1

                else:
                    # Regular assistant message (no tool calls)
                    content = ""
                    if msg.get("reasoning"):
                        content = f"<think>{msg['reasoning']}</think>"
                    content += msg.get("content") or ""
                    trajectory.append({"from": "gpt", "value": content})

            elif msg["role"] == "user":
                trajectory.append({"from": "human", "value": msg["content"]})

            i += 1

        return trajectory

    def run_task(self, task: str) -> Dict[str, Any]:
        """
        Run a single task and return the result with trajectory.

        Args:
            task: The task/prompt to execute

        Returns:
            Dict with 'conversations' (Hermes trajectory), 'completed',
            'api_calls', and 'metadata'.
        """
        print(f"\n{'='*60}")
        print(f"š Task: {task[:80]}{'...' if len(task) > 80 else ''}")
        print(f"{'='*60}")

        # Initialize environment
        self._create_env()

        # Message history (the system prompt below is sent per-call but is
        # deliberately NOT saved into the trajectory).
        messages = [{"role": "user", "content": task}]

        system_prompt = """You are an AI agent that can execute bash commands to complete tasks.

When you need to run commands, use the 'terminal' tool with your bash command.

**Important:**
- When you have completed the task successfully, run: echo "MINI_SWE_AGENT_FINAL_OUTPUT" followed by a summary
- Be concise and efficient in your approach
- Install any needed tools with apt-get or pip
- Avoid interactive commands (no vim, nano, less, etc.)

Complete the user's task step by step."""

        api_call_count = 0
        completed = False

        try:
            while api_call_count < self.max_iterations:
                api_call_count += 1
                print(f"\nš API call #{api_call_count}/{self.max_iterations}")

                # Prepare API messages
                api_messages = [{"role": "system", "content": system_prompt}] + messages

                # Make API call
                try:
                    api_kwargs = {
                        "model": self.model,
                        "messages": api_messages,
                        "tools": self.tools,
                        "timeout": 300.0,
                    }
                    # Some models pin temperature (or require omitting it);
                    # only send the kwarg when a fixed value is mandated.
                    fixed_temperature = _effective_temperature_for_model(
                        self.model,
                        str(getattr(self.client, "base_url", "") or ""),
                    )
                    if fixed_temperature is not None:
                        api_kwargs["temperature"] = fixed_temperature

                    response = self.client.chat.completions.create(**api_kwargs)
                except Exception as e:
                    self.logger.error(f"API call failed: {e}")
                    break

                assistant_message = response.choices[0].message

                # Log assistant response
                if assistant_message.content:
                    print(f"š¤ Assistant: {assistant_message.content[:100]}...")

                # Check for tool calls
                if assistant_message.tool_calls:
                    print(f"š§ Tool calls: {len(assistant_message.tool_calls)}")

                    # Add assistant message with tool calls
                    messages.append({
                        "role": "assistant",
                        "content": assistant_message.content,
                        "tool_calls": [
                            {
                                "id": tc.id,
                                "type": tc.type,
                                "function": {
                                    "name": tc.function.name,
                                    "arguments": tc.function.arguments
                                }
                            }
                            for tc in assistant_message.tool_calls
                        ]
                    })

                    # Execute each tool call
                    for tc in assistant_message.tool_calls:
                        try:
                            args = json.loads(tc.function.arguments)
                        except json.JSONDecodeError:
                            args = {}

                        command = args.get("command", "echo 'No command provided'")
                        timeout = args.get("timeout", self.command_timeout)

                        print(f" š terminal: {command[:60]}...")

                        # Execute command
                        result = self._execute_command(command, timeout)

                        # Format result
                        result_json = json.dumps({
                            "content": {
                                "output": result["output"],
                                "exit_code": result["exit_code"],
                                "error": result["error"]
                            }
                        }, ensure_ascii=False)

                        # Check for task completion signal
                        if "MINI_SWE_AGENT_FINAL_OUTPUT" in result["output"]:
                            print(" ā Task completion signal detected!")
                            completed = True

                        # Add tool response
                        messages.append({
                            "role": "tool",
                            "content": result_json,
                            "tool_call_id": tc.id
                        })

                        print(f" ā exit_code={result['exit_code']}, output={len(result['output'])} chars")

                    # If task completed, we can stop
                    if completed:
                        break

                else:
                    # No tool calls - treat as the agent's final response
                    messages.append({
                        "role": "assistant",
                        "content": assistant_message.content or ""
                    })
                    completed = True
                    print("š Agent finished (no more tool calls)")
                    break

            # Only warn when we actually ran out of budget without finishing.
            if not completed and api_call_count >= self.max_iterations:
                print(f"ā ļø Reached max iterations ({self.max_iterations})")

        finally:
            # Cleanup environment
            self._cleanup_env()

        # Convert to Hermes trajectory format
        trajectory = self._convert_to_hermes_format(messages, task, completed)

        return {
            "conversations": trajectory,
            "completed": completed,
            "api_calls": api_call_count,
            "metadata": {
                "model": self.model,
                "env_type": self.env_type,
                "timestamp": datetime.now().isoformat()
            }
        }

    def run_batch(
        self,
        prompts: List[str],
        output_file: str
    ) -> List[Dict[str, Any]]:
        """
        Run multiple tasks and save trajectories to a JSONL file.

        Each result is written and flushed immediately so partial progress
        survives a crash mid-batch. A task failure is recorded as an error
        entry rather than aborting the batch.

        Args:
            prompts: List of task prompts
            output_file: Output JSONL file path

        Returns:
            List of results
        """
        results = []

        print(f"\nš¦ Running batch of {len(prompts)} tasks")
        print(f"š Output: {output_file}")

        with open(output_file, 'w', encoding='utf-8') as f:
            for i, prompt in enumerate(prompts, 1):
                print(f"\n{'='*60}")
                print(f"š Task {i}/{len(prompts)}")
                print(f"{'='*60}")

                try:
                    result = self.run_task(prompt)
                    results.append(result)

                    # Write to file immediately
                    f.write(json.dumps(result, ensure_ascii=False) + "\n")
                    f.flush()

                    print(f"ā Task {i} completed (api_calls={result['api_calls']})")

                except Exception as e:
                    self.logger.error(f"Error on task {i}: {e}")
                    error_result = {
                        "conversations": [],
                        "completed": False,
                        "api_calls": 0,
                        "error": str(e),
                        "metadata": {"timestamp": datetime.now().isoformat()}
                    }
                    results.append(error_result)
                    f.write(json.dumps(error_result, ensure_ascii=False) + "\n")
                    f.flush()

        print(f"\nā Batch complete! {len(results)} trajectories saved to {output_file}")
        return results


# ============================================================================
# CLI Interface
# ============================================================================

def main(
    task: str = None,
    prompts_file: str = None,
    output_file: str = "swe-runner-test1.jsonl",
    model: str = "claude-sonnet-4-20250514",
    base_url: str = None,
    api_key: str = None,
    env: str = "local",
    image: str = "python:3.11-slim",
    cwd: str = "/tmp",
    max_iterations: int = 15,
    timeout: int = 60,
    verbose: bool = False,
):
    """
    Run SWE tasks with Hermes trajectory format output.

    Args:
        task: Single task to run (use this OR prompts_file)
        prompts_file: JSONL file with prompts (each line: {"prompt": "..."})
        output_file: Output JSONL file for trajectories
        model: Model name (default: claude-sonnet-4-20250514)
        base_url: API base URL (optional)
        api_key: API key (optional, uses env vars)
        env: Environment type - "local", "docker", or "modal"
        image: Docker/Modal image (default: python:3.11-slim)
        cwd: Working directory (default: /tmp)
        max_iterations: Maximum tool-calling iterations (default: 15)
        timeout: Command timeout in seconds (default: 60)
        verbose: Enable verbose logging

    Examples:
        # Single task with local environment
        python mini_swe_runner.py --task "Create hello.py that prints Hello World"

        # Single task with Docker
        python mini_swe_runner.py --task "List files" --env docker

        # Batch from file
        python mini_swe_runner.py --prompts_file tasks.jsonl --output_file results.jsonl
    """
    print("š Mini-SWE Runner with Hermes Trajectory Format")
    print("=" * 60)

    # Initialize runner
    runner = MiniSWERunner(
        model=model,
        base_url=base_url,
        api_key=api_key,
        env_type=env,
        image=image,
        cwd=cwd,
        max_iterations=max_iterations,
        command_timeout=timeout,
        verbose=verbose,
    )

    if task:
        # Single task mode
        result = runner.run_task(task)

        # Save to file
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(json.dumps(result, ensure_ascii=False) + "\n")

        print(f"\nš Trajectory saved to: {output_file}")
        print(f"ā Completed: {result['completed']}")
        print(f"š API calls: {result['api_calls']}")
        print(f"š¬ Turns: {len(result['conversations'])}")

    elif prompts_file:
        # Batch mode: each line is JSON ({"prompt": ...} or {"task": ...})
        # or, as a fallback, a raw prompt string.
        prompts = []
        with open(prompts_file, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line:
                    try:
                        entry = json.loads(line)
                        prompts.append(entry.get("prompt", entry.get("task", "")))
                    except json.JSONDecodeError:
                        prompts.append(line)

        if not prompts:
            print(f"ā No prompts found in {prompts_file}")
            return

        runner.run_batch(prompts, output_file)

    else:
        print("ā Please provide either --task or --prompts_file")
        print(" Example: python mini_swe_runner.py --task 'Create a hello world script'")


if __name__ == "__main__":
    fire.Fire(main)