Cradicle Explorer

/ environments / terminal_test_env / terminal_test_env.py
terminal_test_env.py
  1  """
  2  TerminalTestEnv -- Simple Test Environment for Validating the Stack
  3  
  4  A self-contained environment with inline tasks (no external dataset needed).
  5  Each task asks the model to create a file at a known path with specific content.
  6  The reward verifier cats the file and checks if the content matches.
  7  
  8  Enables only terminal + file toolsets. Uses Modal terminal backend with
  9  OpenRouter (Claude) by default.
 10  
 11  Training tasks (3):
 12      1. Create ~/greeting.txt with "Hello from Hermes Agent"
 13      2. Create ~/count.txt with numbers 1-5, one per line
 14      3. Create ~/answer.txt with the result of 123 + 456
 15  
 16  Eval task (1):
 17      1. Create ~/result.txt with the result of 6 * 7
 18  
 19  Usage:
 20      # Start Atropos API server
 21      run-api
 22  
 23      # Run environment (uses OpenRouter + Modal by default)
 24      python environments/terminal_test_env.py serve
 25  
 26      # Process mode (no run-api needed, saves to JSONL)
 27      python environments/terminal_test_env.py process \\
 28          --env.data_path_to_save_groups terminal_test_output.jsonl
 29  """
 30  
 31  import logging
 32  import os
 33  import sys
 34  import time
 35  from pathlib import Path
 36  from typing import Any, Dict, List, Optional, Tuple, Union
 37  
 38  # Ensure repo root is on sys.path for imports
 39  _repo_root = Path(__file__).resolve().parent.parent.parent
 40  if str(_repo_root) not in sys.path:
 41      sys.path.insert(0, str(_repo_root))
 42  
 43  from atroposlib.envs.base import ScoredDataGroup
 44  from atroposlib.envs.server_handling.server_manager import APIServerConfig
 45  from atroposlib.type_definitions import Item
 46  
 47  from environments.agent_loop import AgentResult
 48  from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig
 49  from environments.tool_context import ToolContext
 50  
 51  logger = logging.getLogger(__name__)
 52  
 53  
 54  # =============================================================================
 55  # Inline task definitions -- no external dataset needed
 56  # =============================================================================
 57  
 58  TRAIN_TASKS = [
 59      {
 60          "prompt": "Create a file at ~/greeting.txt containing exactly the text: Hello from Hermes Agent",
 61          "verify_path": "~/greeting.txt",
 62          "expected_content": "Hello from Hermes Agent",
 63      },
 64      {
 65          "prompt": "Create a file at ~/count.txt containing the numbers 1 through 5, one per line",
 66          "verify_path": "~/count.txt",
 67          "expected_content": "1\n2\n3\n4\n5",
 68      },
 69      {
 70          "prompt": "Create a file at ~/answer.txt containing the result of 123 + 456",
 71          "verify_path": "~/answer.txt",
 72          "expected_content": "579",
 73      },
 74  ]
 75  
 76  EVAL_TASKS = [
 77      {
 78          "prompt": "Create a file at ~/result.txt containing the result of 6 * 7",
 79          "verify_path": "~/result.txt",
 80          "expected_content": "42",
 81      },
 82  ]
 83  
 84  
 85  class TerminalTestEnvConfig(HermesAgentEnvConfig):
 86      """Config with defaults suitable for terminal testing."""
 87  
 88      pass  # Inherits all fields, overrides defaults in config_init
 89  
 90  
 91  class TerminalTestEnv(HermesAgentBaseEnv):
 92      """
 93      Simple test environment with inline file-creation tasks.
 94  
 95      All tasks follow the same pattern: "create a file at ~/X.txt with content Y".
 96      The verifier runs `cat ~/X.txt` in the rollout's terminal and checks the output
 97      against the expected string. Same verifier logic for all tasks.
 98  
 99      This environment is designed to validate the full stack end-to-end:
100      - Agent loop executes tool calls (terminal/file)
101      - ToolContext provides terminal access to the reward function
102      - Reward function verifies file content via cat
103      - Scored data flows through the Atropos pipeline
104      """
105  
106      name = "terminal-test"
107      env_config_cls = TerminalTestEnvConfig
108  
109      @classmethod
110      def config_init(cls) -> Tuple[TerminalTestEnvConfig, List[APIServerConfig]]:
111          """
112          Default configuration for the terminal test environment.
113  
114          Uses Modal terminal backend for cloud isolation and OpenRouter with
115          Claude for inference. API keys loaded from ~/hermes-agent/.env.
116          """
117          env_config = TerminalTestEnvConfig(
118              # Terminal + file tools only
119              enabled_toolsets=["terminal", "file"],
120              disabled_toolsets=None,
121              distribution=None,
122              # Agent settings
123              max_agent_turns=10,  # Simple tasks, don't need many turns
124              max_token_length=16000,
125              agent_temperature=1.0,
126              system_prompt=(
127                  "You are a helpful assistant with access to a terminal and file tools. "
128                  "Complete the user's request by using the available tools. "
129                  "Be precise and follow instructions exactly."
130              ),
131              # Modal terminal backend for cloud-isolated sandboxes per rollout
132              terminal_backend="modal",
133              # Atropos settings
134              group_size=3,              # 3 rollouts per group
135              tokenizer_name="NousResearch/q-30b-t-h45-e1",
136              tool_call_parser="hermes",
137              steps_per_eval=3,          # Eval after all 3 steps
138              total_steps=3,             # 3 groups total (1 group per step)
139              use_wandb=True,
140              wandb_name="terminal-test",
141              ensure_scores_are_not_same=False,  # Allow all-same scores for simple tasks
142              # No external dataset
143              dataset_name=None,
144          )
145  
146          # OpenRouter with Claude -- API key loaded from .env (OPENROUTER_API_KEY)
147          server_configs = [
148              APIServerConfig(
149                  base_url="https://openrouter.ai/api/v1",
150                  model_name="anthropic/claude-opus-4.6",
151                  server_type="openai",
152                  api_key=os.getenv("OPENROUTER_API_KEY", ""),
153                  health_check=False,  # OpenRouter doesn't have a /health endpoint
154              )
155          ]
156  
157          return env_config, server_configs
158  
159      async def setup(self):
160          """Initialize inline task lists."""
161          self.train_tasks = list(TRAIN_TASKS)
162          self.eval_tasks = list(EVAL_TASKS)
163          self.iter = 0
164          # Track reward stats for wandb logging
165          self.reward_buffer: List[float] = []
166  
167      async def get_next_item(self) -> Dict[str, str]:
168          """Cycle through training tasks."""
169          item = self.train_tasks[self.iter % len(self.train_tasks)]
170          self.iter += 1
171          return item
172  
173      def format_prompt(self, item: Dict[str, str]) -> str:
174          """The prompt is directly in the task item."""
175          return item["prompt"]
176  
177      async def compute_reward(
178          self, item: Dict[str, str], result: AgentResult, ctx: ToolContext
179      ) -> float:
180          """
181          Verify by cat-ing the expected file path and checking content matches.
182          Same verifier for all tasks -- they all write a file at a known path.
183  
184          Scoring:
185              1.0 = exact match
186              0.5 = expected content is present but has extra stuff
187              0.0 = file doesn't exist or content doesn't match
188          """
189          verify_result = ctx.terminal(f"cat {item['verify_path']}")
190  
191          # File doesn't exist or can't be read
192          if verify_result["exit_code"] != 0:
193              self.reward_buffer.append(0.0)
194              return 0.0
195  
196          actual = verify_result.get("output", "").strip()
197          expected = item["expected_content"].strip()
198  
199          # Exact match
200          if actual == expected:
201              self.reward_buffer.append(1.0)
202              return 1.0
203  
204          # Partial credit: expected content is present but has extra stuff
205          if expected in actual:
206              self.reward_buffer.append(0.5)
207              return 0.5
208  
209          self.reward_buffer.append(0.0)
210          return 0.0
211  
212      async def evaluate(self, *args, **kwargs):
213          """
214          Run eval tasks using the agent loop and verify results.
215          Logs accuracy metrics.
216          """
217          start_time = time.time()
218          correct = 0
219          total = len(self.eval_tasks)
220          samples = []
221  
222          for eval_item in self.eval_tasks:
223              try:
224                  # For eval, we do a simple single-turn completion (not full agent loop)
225                  # to keep eval fast. The agent loop is tested via training.
226                  completion = await self.server.chat_completion(
227                      messages=[
228                          {"role": "system", "content": self.config.system_prompt or ""},
229                          {"role": "user", "content": eval_item["prompt"]},
230                      ],
231                      n=1,
232                      max_tokens=self.config.max_token_length,
233                      temperature=0.0,
234                      split="eval",
235                  )
236  
237                  response_content = (
238                      completion.choices[0].message.content if completion.choices else ""
239                  )
240  
241                  samples.append(
242                      {
243                          "prompt": eval_item["prompt"],
244                          "response": response_content,
245                          "expected": eval_item["expected_content"],
246                      }
247                  )
248  
249              except Exception as e:
250                  logger.error("Eval failed for item: %s", e)
251                  samples.append(
252                      {
253                          "prompt": eval_item["prompt"],
254                          "response": f"ERROR: {e}",
255                          "expected": eval_item["expected_content"],
256                      }
257                  )
258  
259          end_time = time.time()
260  
261          eval_metrics = {
262              "eval/num_samples": total,
263          }
264  
265          await self.evaluate_log(
266              metrics=eval_metrics,
267              samples=samples,
268              start_time=start_time,
269              end_time=end_time,
270          )
271  
272      async def wandb_log(self, wandb_metrics: Optional[Dict] = None):
273          """Log training metrics including reward stats and accuracy."""
274          if wandb_metrics is None:
275              wandb_metrics = {}
276  
277          if self.reward_buffer:
278              total = len(self.reward_buffer)
279              correct = sum(1 for r in self.reward_buffer if r == 1.0)
280              partial = sum(1 for r in self.reward_buffer if r == 0.5)
281  
282              wandb_metrics["train/avg_reward"] = sum(self.reward_buffer) / total
283              wandb_metrics["train/accuracy"] = correct / total
284              wandb_metrics["train/partial_match_rate"] = partial / total
285              wandb_metrics["train/total_rollouts"] = total
286              self.reward_buffer = []
287  
288          await super().wandb_log(wandb_metrics)
289  
290  
291  if __name__ == "__main__":
292      TerminalTestEnv.cli()