terminal_test_env.py
1 """ 2 TerminalTestEnv -- Simple Test Environment for Validating the Stack 3 4 A self-contained environment with inline tasks (no external dataset needed). 5 Each task asks the model to create a file at a known path with specific content. 6 The reward verifier cats the file and checks if the content matches. 7 8 Enables only terminal + file toolsets. Uses Modal terminal backend with 9 OpenRouter (Claude) by default. 10 11 Training tasks (3): 12 1. Create ~/greeting.txt with "Hello from Hermes Agent" 13 2. Create ~/count.txt with numbers 1-5, one per line 14 3. Create ~/answer.txt with the result of 123 + 456 15 16 Eval task (1): 17 1. Create ~/result.txt with the result of 6 * 7 18 19 Usage: 20 # Start Atropos API server 21 run-api 22 23 # Run environment (uses OpenRouter + Modal by default) 24 python environments/terminal_test_env.py serve 25 26 # Process mode (no run-api needed, saves to JSONL) 27 python environments/terminal_test_env.py process \\ 28 --env.data_path_to_save_groups terminal_test_output.jsonl 29 """ 30 31 import logging 32 import os 33 import sys 34 import time 35 from pathlib import Path 36 from typing import Any, Dict, List, Optional, Tuple, Union 37 38 # Ensure repo root is on sys.path for imports 39 _repo_root = Path(__file__).resolve().parent.parent.parent 40 if str(_repo_root) not in sys.path: 41 sys.path.insert(0, str(_repo_root)) 42 43 from atroposlib.envs.base import ScoredDataGroup 44 from atroposlib.envs.server_handling.server_manager import APIServerConfig 45 from atroposlib.type_definitions import Item 46 47 from environments.agent_loop import AgentResult 48 from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig 49 from environments.tool_context import ToolContext 50 51 logger = logging.getLogger(__name__) 52 53 54 # ============================================================================= 55 # Inline task definitions -- no external dataset needed 56 # ============================================================================= 57 58 TRAIN_TASKS = [ 59 { 60 "prompt": "Create a file at ~/greeting.txt containing exactly the text: Hello from Hermes Agent", 61 "verify_path": "~/greeting.txt", 62 "expected_content": "Hello from Hermes Agent", 63 }, 64 { 65 "prompt": "Create a file at ~/count.txt containing the numbers 1 through 5, one per line", 66 "verify_path": "~/count.txt", 67 "expected_content": "1\n2\n3\n4\n5", 68 }, 69 { 70 "prompt": "Create a file at ~/answer.txt containing the result of 123 + 456", 71 "verify_path": "~/answer.txt", 72 "expected_content": "579", 73 }, 74 ] 75 76 EVAL_TASKS = [ 77 { 78 "prompt": "Create a file at ~/result.txt containing the result of 6 * 7", 79 "verify_path": "~/result.txt", 80 "expected_content": "42", 81 }, 82 ] 83 84 85 class TerminalTestEnvConfig(HermesAgentEnvConfig): 86 """Config with defaults suitable for terminal testing.""" 87 88 pass # Inherits all fields, overrides defaults in config_init 89 90 91 class TerminalTestEnv(HermesAgentBaseEnv): 92 """ 93 Simple test environment with inline file-creation tasks. 94 95 All tasks follow the same pattern: "create a file at ~/X.txt with content Y". 96 The verifier runs `cat ~/X.txt` in the rollout's terminal and checks the output 97 against the expected string. Same verifier logic for all tasks. 


class TerminalTestEnvConfig(HermesAgentEnvConfig):
    """Config with defaults suitable for terminal testing."""

    pass  # Inherits all fields; defaults are overridden in config_init


class TerminalTestEnv(HermesAgentBaseEnv):
    """
    Simple test environment with inline file-creation tasks.

    All tasks follow the same pattern: "create a file at ~/X.txt with content Y".
    The verifier runs `cat ~/X.txt` in the rollout's terminal and checks the
    output against the expected string, so the same verifier logic covers every
    task.

    This environment is designed to validate the full stack end-to-end:
    - Agent loop executes tool calls (terminal/file)
    - ToolContext provides terminal access to the reward function
    - Reward function verifies file content via cat
    - Scored data flows through the Atropos pipeline
    """

    name = "terminal-test"
    env_config_cls = TerminalTestEnvConfig

    @classmethod
    def config_init(cls) -> Tuple[TerminalTestEnvConfig, List[APIServerConfig]]:
        """
        Default configuration for the terminal test environment.

        Uses the Modal terminal backend for cloud isolation and OpenRouter with
        Claude for inference. API keys are loaded from ~/hermes-agent/.env.
        """
        env_config = TerminalTestEnvConfig(
            # Terminal + file tools only
            enabled_toolsets=["terminal", "file"],
            disabled_toolsets=None,
            distribution=None,
            # Agent settings
            max_agent_turns=10,  # Simple tasks; they don't need many turns
            max_token_length=16000,
            agent_temperature=1.0,
            system_prompt=(
                "You are a helpful assistant with access to a terminal and file tools. "
                "Complete the user's request by using the available tools. "
                "Be precise and follow instructions exactly."
            ),
            # Modal terminal backend for cloud-isolated sandboxes per rollout
            terminal_backend="modal",
            # Atropos settings
            group_size=3,  # 3 rollouts per group
            tokenizer_name="NousResearch/q-30b-t-h45-e1",
            tool_call_parser="hermes",
            steps_per_eval=3,  # Eval after all 3 steps
            total_steps=3,  # 3 groups total (1 group per step)
            use_wandb=True,
            wandb_name="terminal-test",
            ensure_scores_are_not_same=False,  # Allow all-same scores for simple tasks
            # No external dataset
            dataset_name=None,
        )

        # OpenRouter with Claude -- API key loaded from .env (OPENROUTER_API_KEY)
        server_configs = [
            APIServerConfig(
                base_url="https://openrouter.ai/api/v1",
                model_name="anthropic/claude-opus-4.6",
                server_type="openai",
                api_key=os.getenv("OPENROUTER_API_KEY", ""),
                health_check=False,  # OpenRouter doesn't expose a /health endpoint
            )
        ]

        return env_config, server_configs

    async def setup(self):
        """Initialize inline task lists."""
        self.train_tasks = list(TRAIN_TASKS)
        self.eval_tasks = list(EVAL_TASKS)
        self.iter = 0
        # Track reward stats for wandb logging
        self.reward_buffer: List[float] = []

    async def get_next_item(self) -> Dict[str, str]:
        """Cycle through the training tasks round-robin."""
        item = self.train_tasks[self.iter % len(self.train_tasks)]
        self.iter += 1
        return item

    def format_prompt(self, item: Dict[str, str]) -> str:
        """The prompt is stored directly in the task item."""
        return item["prompt"]

    async def compute_reward(
        self, item: Dict[str, str], result: AgentResult, ctx: ToolContext
    ) -> float:
        """
        Verify by cat-ing the expected file path and checking that the content
        matches. The same verifier covers all tasks -- each one writes a file
        at a known path.

        Scoring:
            1.0 = exact match
            0.5 = expected content is present but surrounded by extra text
            0.0 = file doesn't exist or content doesn't match
        """
        verify_result = ctx.terminal(f"cat {item['verify_path']}")

        # File doesn't exist or can't be read
        if verify_result["exit_code"] != 0:
            self.reward_buffer.append(0.0)
            return 0.0

        actual = verify_result.get("output", "").strip()
        expected = item["expected_content"].strip()

        # Exact match
        if actual == expected:
            self.reward_buffer.append(1.0)
            return 1.0

        # Partial credit: expected content is present but surrounded by extra text
        if expected in actual:
            self.reward_buffer.append(0.5)
            return 0.5

        self.reward_buffer.append(0.0)
        return 0.0
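
    # Scoring illustration (assuming ctx.terminal returns a dict shaped like
    # {"exit_code": int, "output": str}, which is what compute_reward above
    # relies on):
    #
    #     cat ~/answer.txt -> "579"                 => exact match, reward 1.0
    #     cat ~/answer.txt -> "the sum is 579"      => substring match, reward 0.5
    #     cat ~/answer.txt -> "580" / missing file  => reward 0.0
    #
    # Both sides are .strip()ed first, so the trailing newline that `cat`
    # emits does not cost the exact-match reward.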

    async def evaluate(self, *args, **kwargs):
        """
        Run the eval tasks as single-turn completions and log the results.

        Accuracy is scored with a simple heuristic: a response counts as
        correct if the expected file content appears verbatim in it.
        """
        start_time = time.time()
        total = len(self.eval_tasks)
        samples = []

        for eval_item in self.eval_tasks:
            try:
                # For eval, we do a simple single-turn completion (not the full
                # agent loop) to keep eval fast. The agent loop is exercised
                # during training.
                completion = await self.server.chat_completion(
                    messages=[
                        {"role": "system", "content": self.config.system_prompt or ""},
                        {"role": "user", "content": eval_item["prompt"]},
                    ],
                    n=1,
                    max_tokens=self.config.max_token_length,
                    temperature=0.0,
                    split="eval",
                )

                response_content = (
                    completion.choices[0].message.content if completion.choices else ""
                )

                samples.append(
                    {
                        "prompt": eval_item["prompt"],
                        "response": response_content,
                        "expected": eval_item["expected_content"],
                    }
                )

            except Exception as e:
                logger.error("Eval failed for item: %s", e)
                samples.append(
                    {
                        "prompt": eval_item["prompt"],
                        "response": f"ERROR: {e}",
                        "expected": eval_item["expected_content"],
                    }
                )

        end_time = time.time()

        # Heuristic correctness: the expected content appears in the response.
        correct = sum(1 for s in samples if s["expected"] in s["response"])

        eval_metrics = {
            "eval/num_samples": total,
            "eval/accuracy": correct / total if total else 0.0,
        }

        await self.evaluate_log(
            metrics=eval_metrics,
            samples=samples,
            start_time=start_time,
            end_time=end_time,
        )

    async def wandb_log(self, wandb_metrics: Optional[Dict] = None):
        """Log training metrics, including reward stats and accuracy."""
        if wandb_metrics is None:
            wandb_metrics = {}

        if self.reward_buffer:
            total = len(self.reward_buffer)
            correct = sum(1 for r in self.reward_buffer if r == 1.0)
            partial = sum(1 for r in self.reward_buffer if r == 0.5)

            wandb_metrics["train/avg_reward"] = sum(self.reward_buffer) / total
            wandb_metrics["train/accuracy"] = correct / total
            wandb_metrics["train/partial_match_rate"] = partial / total
            wandb_metrics["train/total_rollouts"] = total
            self.reward_buffer = []

        await super().wandb_log(wandb_metrics)


if __name__ == "__main__":
    TerminalTestEnv.cli()
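
# A quick import/config smoke test that avoids Modal and OpenRouter entirely
# (hypothetical invocation, run from the repo root; config_init only builds
# config objects and reads OPENROUTER_API_KEY, so it makes no network calls):
#
#     python -c "from environments.terminal_test_env import TerminalTestEnv; \
#         print(TerminalTestEnv.config_init()[0].group_size)"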