# rl_cli.py
#!/usr/bin/env python3
"""
RL Training CLI Runner

Dedicated CLI runner for RL training workflows with:
- Extended timeouts for long-running training
- RL-focused system prompts
- Full toolset including RL training tools
- Special handling for 30-minute check intervals

Usage:
    python rl_cli.py "Train a model on GSM8k for math reasoning"
    python rl_cli.py --interactive
    python rl_cli.py --list-environments

Environment Variables:
    TINKER_API_KEY: API key for Tinker service (required)
    WANDB_API_KEY: API key for WandB metrics (required)
    OPENROUTER_API_KEY: API key for OpenRouter (required for agent)
"""

import asyncio
import json
import os
import sys
from pathlib import Path
from typing import Optional

import fire
import yaml

from hermes_constants import OPENROUTER_BASE_URL, get_hermes_home

# Load .env from ~/.hermes/.env first, then project root as dev fallback.
# User-managed env files should override stale shell exports on restart.
_hermes_home = get_hermes_home()
_project_env = Path(__file__).parent / '.env'

from hermes_cli.env_loader import load_hermes_dotenv

_loaded_env_paths = load_hermes_dotenv(hermes_home=_hermes_home, project_env=_project_env)
for _env_path in _loaded_env_paths:
    print(f"✅ Loaded environment variables from {_env_path}")

# Set terminal working directory to tinker-atropos submodule.
# This ensures terminal commands run in the right context for RL work.
tinker_atropos_dir = Path(__file__).parent / 'tinker-atropos'
if tinker_atropos_dir.exists():
    os.environ['TERMINAL_CWD'] = str(tinker_atropos_dir)
    os.environ['HERMES_QUIET'] = '1'  # Disable temp subdirectory creation
    print(f"📁 Terminal working directory: {tinker_atropos_dir}")
else:
    # Fall back to hermes-agent directory if submodule not found
    os.environ['TERMINAL_CWD'] = str(Path(__file__).parent)
    os.environ['HERMES_QUIET'] = '1'
    print(f"⚠️ tinker-atropos submodule not found, using: {Path(__file__).parent}")

# Import agent and tools (after env setup so they see TERMINAL_CWD etc.)
from run_agent import AIAgent
from tools.rl_training_tool import get_missing_keys


# ============================================================================
# Config Loading
# ============================================================================

DEFAULT_MODEL = "anthropic/claude-opus-4.5"
DEFAULT_BASE_URL = OPENROUTER_BASE_URL


def load_hermes_config() -> dict:
    """
    Load configuration from ~/.hermes/config.yaml.

    Falls back to DEFAULT_MODEL / DEFAULT_BASE_URL for any key that is
    missing or unreadable; a malformed config file is reported but never
    fatal.

    Returns:
        dict: Configuration with "model" and "base_url" keys.
    """
    config_path = _hermes_home / 'config.yaml'

    config = {
        "model": DEFAULT_MODEL,
        "base_url": DEFAULT_BASE_URL,
    }

    if config_path.exists():
        try:
            with open(config_path, "r") as f:
                file_config = yaml.safe_load(f) or {}

            # "model" may be a plain string or a mapping with a "default" key.
            if "model" in file_config:
                if isinstance(file_config["model"], str):
                    config["model"] = file_config["model"]
                elif isinstance(file_config["model"], dict):
                    config["model"] = file_config["model"].get("default", DEFAULT_MODEL)

            # Optional override of the API endpoint.
            if "base_url" in file_config:
                config["base_url"] = file_config["base_url"]

        except Exception as e:
            print(f"⚠️ Warning: Failed to load config.yaml: {e}")

    return config


# ============================================================================
# RL-Specific Configuration
# ============================================================================

# Extended iteration budget for long-running RL operations.
RL_MAX_ITERATIONS = 200  # Allow many more iterations for long workflows

# RL-focused system prompt injected into the agent for every conversation.
RL_SYSTEM_PROMPT = """You are an automated post-training engineer specializing in reinforcement learning for language models.

## Your Capabilities

You have access to RL training tools for running reinforcement learning on models through Tinker-Atropos:

1. **DISCOVER**: Use `rl_list_environments` to see available RL environments
2. **INSPECT**: Read environment files to understand how they work (verifiers, data loading, rewards)
3. **INSPECT DATA**: Use terminal to explore HuggingFace datasets and understand their format
4. **CREATE**: Copy existing environments as templates, modify for your needs
5. **CONFIGURE**: Use `rl_select_environment` and `rl_edit_config` to set up training
6. **TEST**: Always use `rl_test_inference` before full training to validate your setup
7. **TRAIN**: Use `rl_start_training` to begin, `rl_check_status` to monitor
8. **EVALUATE**: Use `rl_get_results` and analyze WandB metrics to assess performance

## Environment Files

Environment files are located in: `tinker-atropos/tinker_atropos/environments/`

Study existing environments to learn patterns. Look for:
- `load_dataset()` calls - how data is loaded
- `score_answer()` / `score()` - verification logic
- `get_next_item()` - prompt formatting
- `system_prompt` - instruction format
- `config_init()` - default configuration

## Creating New Environments

To create a new environment:
1. Read an existing environment file (e.g., gsm8k_tinker.py)
2. Use terminal to explore the target dataset format
3. Copy the environment file as a template
4. Modify the dataset loading, prompt formatting, and verifier logic
5. Test with `rl_test_inference` before training

## Important Guidelines

- **Always test before training**: Training runs take hours - verify everything works first
- **Monitor metrics**: Check WandB for reward/mean and percent_correct
- **Status check intervals**: Wait at least 30 minutes between status checks
- **Early stopping**: Stop training early if metrics look bad or stagnant
- **Iterate quickly**: Start with small total_steps to validate, then scale up

## Available Toolsets

You have access to:
- **RL tools**: Environment discovery, config management, training, testing
- **Terminal**: Run commands, inspect files, explore datasets
- **Web**: Search for information, documentation, papers
- **File tools**: Read and modify code files

When asked to train a model, follow this workflow:
1. List available environments
2. Select and configure the appropriate environment
3. Test with sample prompts
4. Start training with conservative settings
5. Monitor progress and adjust as needed
"""

# Toolsets to enable for RL workflows
RL_TOOLSETS = ["terminal", "web", "rl"]


# ============================================================================
# Helper Functions
# ============================================================================

def check_requirements() -> bool:
    """
    Check that all required environment variables are available.

    Returns:
        bool: True if all requirements are met; otherwise prints the list
        of missing variables and returns False.
    """
    errors = []

    # Agent-side key.
    if not os.getenv("OPENROUTER_API_KEY"):
        errors.append("OPENROUTER_API_KEY not set - required for agent")

    # RL-side keys (TINKER_API_KEY, WANDB_API_KEY, ...) are validated by the tool.
    missing_rl_keys = get_missing_keys()
    if missing_rl_keys:
        errors.append(f"Missing RL API keys: {', '.join(missing_rl_keys)}")

    if errors:
        print("❌ Missing requirements:")
        for error in errors:
            print(f"   - {error}")
        print("\nPlease set these environment variables in your .env file or shell.")
        return False

    return True


def check_tinker_atropos():
    """
    Check if the tinker-atropos submodule is properly set up.

    Returns:
        tuple[bool, str | dict]: (True, info dict with "path" and
        "environments_count") on success, or (False, error message string)
        describing what is missing.
    """
    tinker_path = Path(__file__).parent / "tinker-atropos"

    if not tinker_path.exists():
        return False, "tinker-atropos submodule not found. Run: git submodule update --init"

    envs_path = tinker_path / "tinker_atropos" / "environments"
    if not envs_path.exists():
        return False, f"environments directory not found at {envs_path}"

    # Count environment modules, excluding private/dunder files like __init__.py.
    env_files = [f for f in envs_path.glob("*.py") if not f.name.startswith("_")]

    return True, {"path": str(tinker_path), "environments_count": len(env_files)}


def list_environments_sync() -> dict:
    """
    List available environments (synchronous wrapper around the async tool).

    Returns:
        dict: Parsed JSON payload from `rl_list_environments` — either an
        {"environments": [...]} mapping or an {"error": ...} mapping.
    """
    from tools.rl_training_tool import rl_list_environments

    async def _list():
        result = await rl_list_environments()
        return json.loads(result)

    return asyncio.run(_list())


# ============================================================================
# Main CLI
# ============================================================================

def main(
    task: Optional[str] = None,
    model: Optional[str] = None,
    api_key: Optional[str] = None,
    base_url: Optional[str] = None,
    max_iterations: int = RL_MAX_ITERATIONS,
    interactive: bool = False,
    list_environments: bool = False,
    check_server: bool = False,
    verbose: bool = False,
    save_trajectories: bool = True,
):
    """
    RL Training CLI - Dedicated runner for RL training workflows.

    Args:
        task: The training task/goal (e.g., "Train a model on GSM8k for math")
        model: Model to use for the agent (reads from ~/.hermes/config.yaml if not provided)
        api_key: OpenRouter API key (uses OPENROUTER_API_KEY env var if not provided)
        base_url: API base URL (reads from config or defaults to OpenRouter)
        max_iterations: Maximum agent iterations (default: 200 for long workflows)
        interactive: Run in interactive mode (multiple conversations)
        list_environments: Just list available RL environments and exit
        check_server: Check if RL API server is running and exit
        verbose: Enable verbose logging
        save_trajectories: Save conversation trajectories (default: True for RL)

    Examples:
        # Train on a specific environment
        python rl_cli.py "Train a model on GSM8k math problems"

        # Interactive mode
        python rl_cli.py --interactive

        # List available environments
        python rl_cli.py --list-environments

        # Check server status
        python rl_cli.py --check-server
    """
    # Load config from ~/.hermes/config.yaml and fill in unspecified options.
    config = load_hermes_config()
    if model is None:
        model = config["model"]
    if base_url is None:
        base_url = config["base_url"]

    print("🎯 RL Training Agent")
    print("=" * 60)

    # --check-server: report submodule + key status, then exit.
    if check_server:
        print("\n🔍 Checking tinker-atropos setup...")
        ok, result = check_tinker_atropos()
        if ok:
            print("✅ tinker-atropos submodule found")
            print(f"   Path: {result.get('path')}")
            print(f"   Environments found: {result.get('environments_count', 0)}")

            # Also check API keys
            missing = get_missing_keys()
            if missing:
                print(f"\n⚠️ Missing API keys: {', '.join(missing)}")
                print("   Add them to ~/.hermes/.env")
            else:
                print("✅ API keys configured")
        else:
            print(f"❌ tinker-atropos not set up: {result}")
            print("\nTo set up:")
            print("  git submodule update --init")
            print("  pip install -e ./tinker-atropos")
        return

    # --list-environments: print the catalog, then exit.
    if list_environments:
        print("\n📋 Available RL Environments:")
        print("-" * 40)
        try:
            data = list_environments_sync()
            if "error" in data:
                print(f"❌ Error: {data['error']}")
                return

            envs = data.get("environments", [])
            if not envs:
                print("No environments found.")
                print("\nMake sure tinker-atropos is set up:")
                print("  git submodule update --init")
                return

            for env in envs:
                # Use .get() defensively: the payload comes from an external
                # tool and a malformed entry shouldn't crash the listing.
                print(f"\n  📦 {env.get('name', '?')}")
                print(f"     Class: {env.get('class_name', '?')}")
                print(f"     Path: {env.get('file_path', '?')}")
                desc = env.get('description') or ''
                if desc:
                    if len(desc) > 100:
                        desc = desc[:100] + "..."
                    print(f"     Description: {desc}")

            print(f"\n📊 Total: {len(envs)} environments")
            print("\nUse `rl_select_environment(name)` to select an environment for training.")
        except Exception as e:
            print(f"❌ Error listing environments: {e}")
            print("\nMake sure tinker-atropos is set up:")
            print("  git submodule update --init")
            print("  pip install -e ./tinker-atropos")
        return

    # Hard requirements: exit non-zero so scripts can detect the failure.
    if not check_requirements():
        sys.exit(1)

    # Require either a task or interactive mode.
    if not task and not interactive:
        print("\n⚠️ No task provided. Use --interactive for interactive mode or provide a task.")
        print("\nExamples:")
        print('  python rl_cli.py "Train a model on GSM8k math problems"')
        print('  python rl_cli.py "Create an RL environment for code generation"')
        print('  python rl_cli.py --interactive')
        return

    # Resolve the API key (flag takes precedence over the environment).
    api_key = api_key or os.getenv("OPENROUTER_API_KEY")
    if not api_key:
        print("❌ No API key provided. Set OPENROUTER_API_KEY or pass --api-key")
        sys.exit(1)

    print(f"\n🤖 Model: {model}")
    print(f"🔧 Max iterations: {max_iterations}")
    print(f"🛠 Toolsets: {', '.join(RL_TOOLSETS)}")
    print("=" * 60)

    # Create agent with RL configuration.
    agent = AIAgent(
        base_url=base_url,
        api_key=api_key,
        model=model,
        max_iterations=max_iterations,
        enabled_toolsets=RL_TOOLSETS,
        save_trajectories=save_trajectories,
        verbose_logging=verbose,
        quiet_mode=False,
        ephemeral_system_prompt=RL_SYSTEM_PROMPT,
    )

    if interactive:
        # Interactive mode - multiple conversations in one session.
        print("\n🔄 Interactive RL Training Mode")
        print("Type 'quit' or 'exit' to end the session.")
        print("Type 'status' to check active training runs.")
        print("-" * 40)

        while True:
            try:
                user_input = input("\n🎯 RL Task> ").strip()

                if not user_input:
                    continue

                if user_input.lower() in ('quit', 'exit', 'q'):
                    print("\n👋 Goodbye!")
                    break

                if user_input.lower() == 'status':
                    # Quick status check without spinning up the agent.
                    from tools.rl_training_tool import rl_list_runs
                    result = asyncio.run(rl_list_runs())
                    runs = json.loads(result)
                    if isinstance(runs, list) and runs:
                        print("\n📊 Active Runs:")
                        for run in runs:
                            print(f"  - {run.get('run_id')}: {run.get('environment')} ({run.get('status')})")
                    else:
                        print("\nNo active runs.")
                    continue

                # Run the agent on the user's task.
                print("\n" + "=" * 60)
                agent.run_conversation(user_input)
                print("\n" + "=" * 60)

            except KeyboardInterrupt:
                print("\n\n👋 Interrupted. Goodbye!")
                break
            except Exception as e:
                print(f"\n❌ Error: {e}")
                if verbose:
                    import traceback
                    traceback.print_exc()
    else:
        # Single task mode.
        print(f"\n📝 Task: {task}")
        print("-" * 40)

        try:
            agent.run_conversation(task)
            print("\n" + "=" * 60)
            print("✅ Task completed")
        except KeyboardInterrupt:
            print("\n\n⚠️ Interrupted by user")
        except Exception as e:
            print(f"\n❌ Error: {e}")
            if verbose:
                import traceback
                traceback.print_exc()
            sys.exit(1)


if __name__ == "__main__":
    fire.Fire(main)