/ rl_cli.py
rl_cli.py
  1  #!/usr/bin/env python3
  2  """
  3  RL Training CLI Runner
  4  
  5  Dedicated CLI runner for RL training workflows with:
  6  - Extended timeouts for long-running training
  7  - RL-focused system prompts
  8  - Full toolset including RL training tools
  9  - Special handling for 30-minute check intervals
 10  
 11  Usage:
 12      python rl_cli.py "Train a model on GSM8k for math reasoning"
 13      python rl_cli.py --interactive
 14      python rl_cli.py --list-environments
 15  
 16  Environment Variables:
 17      TINKER_API_KEY: API key for Tinker service (required)
 18      WANDB_API_KEY: API key for WandB metrics (required)
 19      OPENROUTER_API_KEY: API key for OpenRouter (required for agent)
 20  """
 21  
 22  import asyncio
 23  import os
 24  import sys
 25  from pathlib import Path
 26  
 27  import fire
 28  import yaml
 29  
 30  from hermes_constants import OPENROUTER_BASE_URL, get_hermes_home
 31  
# Load .env from ~/.hermes/.env first, then project root as dev fallback.
# User-managed env files should override stale shell exports on restart.
# _hermes_home is also used by load_hermes_config() to locate config.yaml.
_hermes_home = get_hermes_home()
# Candidate .env file sitting next to this script.
_project_env = Path(__file__).parent / '.env'

# NOTE(review): this import sits below the path setup rather than with the
# other imports; nothing visible here requires that order — confirm before moving.
from hermes_cli.env_loader import load_hermes_dotenv

# Report every path the loader returned so the user can see which env files
# were picked up.
_loaded_env_paths = load_hermes_dotenv(hermes_home=_hermes_home, project_env=_project_env)
for _env_path in _loaded_env_paths:
    print(f"āœ… Loaded environment variables from {_env_path}")
 43  # Set terminal working directory to tinker-atropos submodule
 44  # This ensures terminal commands run in the right context for RL work
 45  tinker_atropos_dir = Path(__file__).parent / 'tinker-atropos'
 46  if tinker_atropos_dir.exists():
 47      os.environ['TERMINAL_CWD'] = str(tinker_atropos_dir)
 48      os.environ['HERMES_QUIET'] = '1'  # Disable temp subdirectory creation
 49      print(f"šŸ“‚ Terminal working directory: {tinker_atropos_dir}")
 50  else:
 51      # Fall back to hermes-agent directory if submodule not found
 52      os.environ['TERMINAL_CWD'] = str(Path(__file__).parent)
 53      os.environ['HERMES_QUIET'] = '1'
 54      print(f"āš ļø  tinker-atropos submodule not found, using: {Path(__file__).parent}")
 55  
 56  # Import agent and tools
 57  from run_agent import AIAgent
 58  from tools.rl_training_tool import get_missing_keys
 59  
 60  
 61  # ============================================================================
 62  # Config Loading
 63  # ============================================================================
 64  
# Fallback model identifier used when config.yaml does not name one.
DEFAULT_MODEL = "anthropic/claude-opus-4.5"
# Fallback API endpoint used when config.yaml does not set base_url.
DEFAULT_BASE_URL = OPENROUTER_BASE_URL
 67  
 68  
 69  def load_hermes_config() -> dict:
 70      """
 71      Load configuration from ~/.hermes/config.yaml.
 72      
 73      Returns:
 74          dict: Configuration with model, base_url, etc.
 75      """
 76      config_path = _hermes_home / 'config.yaml'
 77      
 78      config = {
 79          "model": DEFAULT_MODEL,
 80          "base_url": DEFAULT_BASE_URL,
 81      }
 82      
 83      if config_path.exists():
 84          try:
 85              with open(config_path, "r") as f:
 86                  file_config = yaml.safe_load(f) or {}
 87              
 88              # Get model from config
 89              if "model" in file_config:
 90                  if isinstance(file_config["model"], str):
 91                      config["model"] = file_config["model"]
 92                  elif isinstance(file_config["model"], dict):
 93                      config["model"] = file_config["model"].get("default", DEFAULT_MODEL)
 94              
 95              # Get base_url if specified
 96              if "base_url" in file_config:
 97                  config["base_url"] = file_config["base_url"]
 98                  
 99          except Exception as e:
100              print(f"āš ļø  Warning: Failed to load config.yaml: {e}")
101      
102      return config
103  
104  
105  # ============================================================================
106  # RL-Specific Configuration
107  # ============================================================================
108  
# Iteration budget for long-running RL workflows. This is a cap on agent
# iterations (not a timeout) and is the default for main()'s max_iterations.
RL_MAX_ITERATIONS = 200  # Allow many more iterations for long workflows
111  
# RL-focused system prompt. main() hands this to AIAgent as
# ephemeral_system_prompt. The text is sent to the model verbatim, so edits
# here change agent behavior.
RL_SYSTEM_PROMPT = """You are an automated post-training engineer specializing in reinforcement learning for language models.

## Your Capabilities

You have access to RL training tools for running reinforcement learning on models through Tinker-Atropos:

1. **DISCOVER**: Use `rl_list_environments` to see available RL environments
2. **INSPECT**: Read environment files to understand how they work (verifiers, data loading, rewards)
3. **INSPECT DATA**: Use terminal to explore HuggingFace datasets and understand their format
4. **CREATE**: Copy existing environments as templates, modify for your needs
5. **CONFIGURE**: Use `rl_select_environment` and `rl_edit_config` to set up training
6. **TEST**: Always use `rl_test_inference` before full training to validate your setup
7. **TRAIN**: Use `rl_start_training` to begin, `rl_check_status` to monitor
8. **EVALUATE**: Use `rl_get_results` and analyze WandB metrics to assess performance

## Environment Files

Environment files are located in: `tinker-atropos/tinker_atropos/environments/`

Study existing environments to learn patterns. Look for:
- `load_dataset()` calls - how data is loaded
- `score_answer()` / `score()` - verification logic
- `get_next_item()` - prompt formatting
- `system_prompt` - instruction format
- `config_init()` - default configuration

## Creating New Environments

To create a new environment:
1. Read an existing environment file (e.g., gsm8k_tinker.py)
2. Use terminal to explore the target dataset format
3. Copy the environment file as a template
4. Modify the dataset loading, prompt formatting, and verifier logic
5. Test with `rl_test_inference` before training

## Important Guidelines

- **Always test before training**: Training runs take hours - verify everything works first
- **Monitor metrics**: Check WandB for reward/mean and percent_correct
- **Status check intervals**: Wait at least 30 minutes between status checks
- **Early stopping**: Stop training early if metrics look bad or stagnant
- **Iterate quickly**: Start with small total_steps to validate, then scale up

## Available Toolsets

You have access to:
- **RL tools**: Environment discovery, config management, training, testing
- **Terminal**: Run commands, inspect files, explore datasets
- **Web**: Search for information, documentation, papers
- **File tools**: Read and modify code files

When asked to train a model, follow this workflow:
1. List available environments
2. Select and configure the appropriate environment
3. Test with sample prompts
4. Start training with conservative settings
5. Monitor progress and adjust as needed
"""
171  
# Toolsets to enable for RL workflows (passed to AIAgent as enabled_toolsets).
# NOTE(review): RL_SYSTEM_PROMPT also advertises "File tools", but no file
# toolset is listed here — confirm whether one of these toolsets already
# covers file access or whether an entry is missing.
RL_TOOLSETS = ["terminal", "web", "rl"]
174  
175  
176  # ============================================================================
177  # Helper Functions
178  # ============================================================================
179  
def check_requirements():
    """
    Verify that the API keys needed for an RL run are present.

    Looks for OPENROUTER_API_KEY in the environment and asks
    get_missing_keys() for any missing RL keys. Every problem found is
    printed.

    Returns:
        bool: True when nothing is missing, False otherwise.
    """
    problems = []

    # Agent key
    if not os.getenv("OPENROUTER_API_KEY"):
        problems.append("OPENROUTER_API_KEY not set - required for agent")

    # RL service keys
    absent_rl_keys = get_missing_keys()
    if absent_rl_keys:
        problems.append(f"Missing RL API keys: {', '.join(absent_rl_keys)}")

    if not problems:
        return True

    print("āŒ Missing requirements:")
    for problem in problems:
        print(f"   - {problem}")
    print("\nPlease set these environment variables in your .env file or shell.")
    return False
200  
201  
def check_tinker_atropos():
    """
    Report whether the tinker-atropos submodule is present next to this file.

    Returns:
        tuple: ``(False, message)`` when the submodule directory or its
        ``tinker_atropos/environments`` directory is missing; otherwise
        ``(True, info)`` where ``info`` holds ``"path"`` (the submodule path
        as a string) and ``"environments_count"`` (number of ``*.py`` files
        in the environments directory whose name does not start with ``_``).
    """
    submodule_root = Path(__file__).parent / "tinker-atropos"
    if not submodule_root.exists():
        return False, "tinker-atropos submodule not found. Run: git submodule update --init"

    environments_dir = submodule_root / "tinker_atropos" / "environments"
    if not environments_dir.exists():
        return False, f"environments directory not found at {environments_dir}"

    # Count only public modules; underscore-prefixed files are skipped.
    public_module_count = sum(
        1
        for candidate in environments_dir.glob("*.py")
        if not candidate.name.startswith("_")
    )

    return True, {"path": str(submodule_root), "environments_count": public_module_count}
217  
218  
def list_environments_sync():
    """
    Blocking wrapper around the async ``rl_list_environments`` tool.

    Runs the coroutine to completion with ``asyncio.run`` and returns its
    result parsed from JSON.
    """
    import json
    from tools.rl_training_tool import rl_list_environments

    async def _fetch_and_decode():
        raw_payload = await rl_list_environments()
        return json.loads(raw_payload)

    return asyncio.run(_fetch_and_decode())
229  
230  
231  # ============================================================================
232  # Main CLI
233  # ============================================================================
234  
def _print_setup_status():
    """Print whether the tinker-atropos submodule and the RL API keys are in place."""
    print("\nšŸ” Checking tinker-atropos setup...")
    ok, result = check_tinker_atropos()
    if not ok:
        print(f"āŒ tinker-atropos not set up: {result}")
        print("\nTo set up:")
        print("  git submodule update --init")
        print("  pip install -e ./tinker-atropos")
        return

    print("āœ… tinker-atropos submodule found")
    print(f"   Path: {result.get('path')}")
    print(f"   Environments found: {result.get('environments_count', 0)}")

    # Also check API keys
    missing = get_missing_keys()
    if missing:
        print(f"\nāš ļø  Missing API keys: {', '.join(missing)}")
        print("   Add them to ~/.hermes/.env")
    else:
        print("āœ… API keys configured")


def _print_environments():
    """Print the RL environments reported by list_environments_sync()."""
    print("\nšŸ“‹ Available RL Environments:")
    print("-" * 40)
    try:
        data = list_environments_sync()
        if "error" in data:
            print(f"āŒ Error: {data['error']}")
            return

        envs = data.get("environments", [])
        if not envs:
            print("No environments found.")
            print("\nMake sure tinker-atropos is set up:")
            print("  git submodule update --init")
            return

        for env in envs:
            print(f"\n  šŸ“¦ {env['name']}")
            print(f"     Class: {env['class_name']}")
            print(f"     Path: {env['file_path']}")
            description = env.get('description')
            if description:
                # Keep the listing compact: truncate long descriptions.
                desc = description[:100] + "..." if len(description) > 100 else description
                print(f"     Description: {desc}")

        print(f"\nšŸ“Š Total: {len(envs)} environments")
        print("\nUse `rl_select_environment(name)` to select an environment for training.")
    except Exception as e:
        # Listing is informational only; report the failure with setup hints.
        print(f"āŒ Error listing environments: {e}")
        print("\nMake sure tinker-atropos is set up:")
        print("  git submodule update --init")
        print("  pip install -e ./tinker-atropos")


def _run_interactive_session(agent, verbose):
    """Read tasks from stdin and run each through the agent until the user quits."""
    print("\nšŸ”„ Interactive RL Training Mode")
    print("Type 'quit' or 'exit' to end the session.")
    print("Type 'status' to check active training runs.")
    print("-" * 40)

    while True:
        try:
            user_input = input("\nšŸŽÆ RL Task> ").strip()

            if not user_input:
                continue

            command = user_input.lower()
            if command in ('quit', 'exit', 'q'):
                print("\nšŸ‘‹ Goodbye!")
                break

            if command == 'status':
                # Quick status check
                from tools.rl_training_tool import rl_list_runs
                import json
                result = asyncio.run(rl_list_runs())
                runs = json.loads(result)
                if isinstance(runs, list) and runs:
                    print("\nšŸ“Š Active Runs:")
                    for run in runs:
                        print(f"  - {run['run_id']}: {run['environment']} ({run['status']})")
                else:
                    print("\nNo active runs.")
                continue

            # Run the agent
            print("\n" + "=" * 60)
            agent.run_conversation(user_input)
            print("\n" + "=" * 60)

        except EOFError:
            # stdin is closed (Ctrl-D or exhausted pipe). input() would raise
            # again on every iteration, so end the session instead of
            # looping forever through the generic handler below.
            print("\nšŸ‘‹ Goodbye!")
            break
        except KeyboardInterrupt:
            print("\n\nšŸ‘‹ Interrupted. Goodbye!")
            break
        except Exception as e:
            # Keep the session alive after a failed task.
            print(f"\nāŒ Error: {e}")
            if verbose:
                import traceback
                traceback.print_exc()


def main(
    task: str = None,
    model: str = None,
    api_key: str = None,
    base_url: str = None,
    max_iterations: int = RL_MAX_ITERATIONS,
    interactive: bool = False,
    list_environments: bool = False,
    check_server: bool = False,
    verbose: bool = False,
    save_trajectories: bool = True,
):
    """
    RL Training CLI - Dedicated runner for RL training workflows.

    Exits with status 1 when required API keys are missing or when a
    single-task run fails.

    Args:
        task: The training task/goal (e.g., "Train a model on GSM8k for math")
        model: Model to use for the agent (reads from ~/.hermes/config.yaml if not provided)
        api_key: OpenRouter API key (uses OPENROUTER_API_KEY env var if not provided)
        base_url: API base URL (reads from config or defaults to OpenRouter)
        max_iterations: Maximum agent iterations (default: 200 for long workflows)
        interactive: Run in interactive mode (multiple conversations)
        list_environments: Just list available RL environments and exit
        check_server: Check the tinker-atropos setup and RL API keys, then exit
        verbose: Enable verbose logging
        save_trajectories: Save conversation trajectories (default: True for RL)

    Examples:
        # Train on a specific environment
        python rl_cli.py "Train a model on GSM8k math problems"

        # Interactive mode
        python rl_cli.py --interactive

        # List available environments
        python rl_cli.py --list-environments

        # Check setup status
        python rl_cli.py --check-server
    """
    # Load config from ~/.hermes/config.yaml
    config = load_hermes_config()

    # Use config values if not explicitly provided
    if model is None:
        model = config["model"]
    if base_url is None:
        base_url = config["base_url"]

    print("šŸŽÆ RL Training Agent")
    print("=" * 60)

    # Informational modes: report and exit without creating an agent.
    if check_server:
        _print_setup_status()
        return

    if list_environments:
        _print_environments()
        return

    # check_requirements() only looks at the environment. Publish an explicit
    # --api-key there first, otherwise the flag could never satisfy the check
    # and the run would abort even though a key was supplied.
    if api_key and not os.getenv("OPENROUTER_API_KEY"):
        os.environ["OPENROUTER_API_KEY"] = str(api_key)

    # Check requirements
    if not check_requirements():
        sys.exit(1)

    # Nothing to do without a task unless running interactively.
    if not task and not interactive:
        print("\nāš ļø  No task provided. Use --interactive for interactive mode or provide a task.")
        print("\nExamples:")
        print('  python rl_cli.py "Train a model on GSM8k math problems"')
        print('  python rl_cli.py "Create an RL environment for code generation"')
        print('  python rl_cli.py --interactive')
        return

    # Get API key (explicit argument wins over the environment).
    api_key = api_key or os.getenv("OPENROUTER_API_KEY")
    if not api_key:
        print("āŒ No API key provided. Set OPENROUTER_API_KEY or pass --api-key")
        sys.exit(1)

    print(f"\nšŸ¤– Model: {model}")
    print(f"šŸ”§ Max iterations: {max_iterations}")
    print(f"šŸ“ Toolsets: {', '.join(RL_TOOLSETS)}")
    print("=" * 60)

    # Create agent with RL configuration
    agent = AIAgent(
        base_url=base_url,
        api_key=api_key,
        model=model,
        max_iterations=max_iterations,
        enabled_toolsets=RL_TOOLSETS,
        save_trajectories=save_trajectories,
        verbose_logging=verbose,
        quiet_mode=False,
        ephemeral_system_prompt=RL_SYSTEM_PROMPT,
    )

    if interactive:
        # Interactive mode - multiple conversations
        _run_interactive_session(agent, verbose)
        return

    # Single task mode
    print(f"\nšŸ“ Task: {task}")
    print("-" * 40)

    try:
        agent.run_conversation(task)
        print("\n" + "=" * 60)
        print("āœ… Task completed")
    except KeyboardInterrupt:
        print("\n\nāš ļø Interrupted by user")
    except Exception as e:
        print(f"\nāŒ Error: {e}")
        if verbose:
            import traceback
            traceback.print_exc()
        sys.exit(1)
442              sys.exit(1)
443  
444  
# Script entry point: python-fire exposes main()'s parameters as CLI
# arguments (e.g. --interactive, --list-environments).
if __name__ == "__main__":
    fire.Fire(main)