code_execution_tool.py
#!/usr/bin/env python3
"""
Code Execution Tool -- Programmatic Tool Calling (PTC)

Lets the LLM write a Python script that calls Hermes tools via RPC,
collapsing multi-step tool chains into a single inference turn.

Architecture (two transports):

**Local backend (UDS):**
1. Parent generates a `hermes_tools.py` stub module with UDS RPC functions
2. Parent opens a Unix domain socket and starts an RPC listener thread
3. Parent spawns a child process that runs the LLM's script
4. Tool calls travel over the UDS back to the parent for dispatch

**Remote backends (file-based RPC):**
1. Parent generates `hermes_tools.py` with file-based RPC stubs
2. Parent ships both files to the remote environment
3. Script runs inside the terminal backend (Docker/SSH/Modal/Daytona/etc.)
4. Tool calls are written as request files; a polling thread on the parent
   reads them via env.execute(), dispatches, and writes response files
5. The script polls for response files and continues

In both cases, only the script's stdout is returned to the LLM; intermediate
tool results never enter the context window.

Platform: Linux / macOS only (Unix domain sockets for local). Disabled on Windows.
Remote execution additionally requires Python 3 in the terminal backend.
"""

import base64
import functools
import json
import logging
import os
import platform
import shlex
import signal
import socket
import subprocess
import sys
import tempfile
import threading
import time
import uuid

_IS_WINDOWS = platform.system() == "Windows"
from typing import Any, Dict, List, Optional

# Module-level logger for this tool.
logger = logging.getLogger(__name__)

# Availability gate: UDS requires a POSIX OS
SANDBOX_AVAILABLE = sys.platform != "win32"

# The 7 tools allowed inside the sandbox. The intersection of this list
# and the session's enabled tools determines which stubs are generated.
SANDBOX_ALLOWED_TOOLS = frozenset([
    "web_search",
    "web_extract",
    "read_file",
    "write_file",
    "search_files",
    "patch",
    "terminal",
])

# Resource limit defaults (overridable via config.yaml → code_execution.*)
DEFAULT_TIMEOUT = 300  # 5 minutes
DEFAULT_MAX_TOOL_CALLS = 50
MAX_STDOUT_BYTES = 50_000  # 50 KB
MAX_STDERR_BYTES = 10_000  # 10 KB


def check_sandbox_requirements() -> bool:
    """Code execution sandbox requires a POSIX OS for Unix domain sockets.

    Returns True when the platform is POSIX and the configured terminal
    backend is usable; for the ``vercel_sandbox`` backend this defers to
    the terminal tool's own requirement check.
    """
    if not SANDBOX_AVAILABLE:
        return False

    try:
        from tools.terminal_tool import (
            _check_vercel_sandbox_requirements,
            _get_env_config,
        )

        config = _get_env_config()
    except Exception:
        # Best-effort: if the terminal config cannot be resolved, report
        # the feature as unavailable rather than crashing the caller.
        logger.debug("Could not resolve terminal config for execute_code availability", exc_info=True)
        return False

    if config.get("env_type") == "vercel_sandbox":
        return _check_vercel_sandbox_requirements(config)

    return True


# ---------------------------------------------------------------------------
# hermes_tools.py code generator
# ---------------------------------------------------------------------------

# Per-tool stub templates: (function_name, signature, docstring, args_dict_expr)
# The args_dict_expr builds the JSON payload sent over the RPC socket.
_TOOL_STUBS = {
    "web_search": (
        "web_search",
        "query: str, limit: int = 5",
        '"""Search the web. Returns dict with data.web list of {url, title, description}."""',
        '{"query": query, "limit": limit}',
    ),
    "web_extract": (
        "web_extract",
        "urls: list",
        '"""Extract content from URLs. Returns dict with results list of {url, title, content, error}."""',
        '{"urls": urls}',
    ),
    "read_file": (
        "read_file",
        "path: str, offset: int = 1, limit: int = 500",
        '"""Read a file (1-indexed lines). Returns dict with "content" and "total_lines"."""',
        '{"path": path, "offset": offset, "limit": limit}',
    ),
    "write_file": (
        "write_file",
        "path: str, content: str",
        '"""Write content to a file (always overwrites). Returns dict with status."""',
        '{"path": path, "content": content}',
    ),
    "search_files": (
        "search_files",
        'pattern: str, target: str = "content", path: str = ".", file_glob: str = None, limit: int = 50, offset: int = 0, output_mode: str = "content", context: int = 0',
        '"""Search file contents (target="content") or find files by name (target="files"). Returns dict with "matches"."""',
        '{"pattern": pattern, "target": target, "path": path, "file_glob": file_glob, "limit": limit, "offset": offset, "output_mode": output_mode, "context": context}',
    ),
    "patch": (
        "patch",
        'path: str = None, old_string: str = None, new_string: str = None, replace_all: bool = False, mode: str = "replace", patch: str = None',
        '"""Targeted find-and-replace (mode="replace") or V4A multi-file patches (mode="patch"). Returns dict with status."""',
        '{"path": path, "old_string": old_string, "new_string": new_string, "replace_all": replace_all, "mode": mode, "patch": patch}',
    ),
    "terminal": (
        "terminal",
        "command: str, timeout: int = None, workdir: str = None",
        '"""Run a shell command (foreground only). Returns dict with "output" and "exit_code"."""',
        '{"command": command, "timeout": timeout, "workdir": workdir}',
    ),
}


def generate_hermes_tools_module(enabled_tools: List[str],
                                 transport: str = "uds") -> str:
    """
    Build the source code for the hermes_tools.py stub module.

    Only tools in both SANDBOX_ALLOWED_TOOLS and enabled_tools get stubs.

    Args:
        enabled_tools: Tool names enabled in the current session.
        transport: ``"uds"`` for Unix domain socket (local backend) or
            ``"file"`` for file-based RPC (remote backends).

    Returns:
        Python source text: transport header + one stub function per tool.
    """
    tools_to_generate = sorted(SANDBOX_ALLOWED_TOOLS & set(enabled_tools))

    stub_functions = []
    # NOTE(review): export_names is accumulated but never used below —
    # presumably intended for an __all__ declaration; confirm and either
    # emit it or drop the list.
    export_names = []
    for tool_name in tools_to_generate:
        if tool_name not in _TOOL_STUBS:
            continue
        func_name, sig, doc, args_expr = _TOOL_STUBS[tool_name]
        stub_functions.append(
            f"def {func_name}({sig}):\n"
            f"    {doc}\n"
            f"    return _call({func_name!r}, {args_expr})\n"
        )
        export_names.append(func_name)

    if transport == "file":
        header = _FILE_TRANSPORT_HEADER
    else:
        header = _UDS_TRANSPORT_HEADER

    return header + "\n".join(stub_functions)


# ---- Shared helpers section (embedded in both transport headers) ----------

_COMMON_HELPERS = '''\

# ---------------------------------------------------------------------------
# Convenience helpers (avoid common scripting pitfalls)
# ---------------------------------------------------------------------------

def json_parse(text: str):
    """Parse JSON tolerant of control characters (strict=False).
    Use this instead of json.loads() when parsing output from terminal()
    or web_extract() that may contain raw tabs/newlines in strings."""
    return json.loads(text, strict=False)


def shell_quote(s: str) -> str:
    """Shell-escape a string for safe interpolation into commands.
    Use this when inserting dynamic content into terminal() commands:
        terminal(f"echo {shell_quote(user_input)}")
    """
    return shlex.quote(s)


def retry(fn, max_attempts=3, delay=2):
    """Retry a function up to max_attempts times with exponential backoff.
    Use for transient failures (network errors, API rate limits):
        result = retry(lambda: terminal("gh issue list ..."))
    """
    last_err = None
    for attempt in range(max_attempts):
        try:
            return fn()
        except Exception as e:
            last_err = e
            if attempt < max_attempts - 1:
                time.sleep(delay * (2 ** attempt))
    raise last_err

'''

# ---- UDS transport (local backend) ---------------------------------------

_UDS_TRANSPORT_HEADER = '''\
"""Auto-generated Hermes tools RPC stubs."""
import json, os, socket, shlex, threading, time

_sock = None
# The RPC server handles a single client connection serially and has no
# request-id in the protocol, so concurrent _call() invocations from multiple
# threads (e.g. ThreadPoolExecutor) would race on the shared socket and get
# each other's responses. Serialize the entire send+recv round-trip.
_call_lock = threading.Lock()
''' + _COMMON_HELPERS + '''\

def _connect():
    global _sock
    if _sock is None:
        _sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        _sock.connect(os.environ["HERMES_RPC_SOCKET"])
        _sock.settimeout(300)
    return _sock

def _call(tool_name, args):
    """Send a tool call to the parent process and return the parsed result."""
    request = json.dumps({"tool": tool_name, "args": args}) + "\\n"
    with _call_lock:
        conn = _connect()
        conn.sendall(request.encode())
        buf = b""
        while True:
            chunk = conn.recv(65536)
            if not chunk:
                raise RuntimeError("Agent process disconnected")
            buf += chunk
            if buf.endswith(b"\\n"):
                break
    raw = buf.decode().strip()
    result = json.loads(raw)
    if isinstance(result, str):
        try:
            return json.loads(result)
        except (json.JSONDecodeError, TypeError):
            return result
    return result

'''

# ---- File-based transport (remote backends) -------------------------------

_FILE_TRANSPORT_HEADER = '''\
"""Auto-generated Hermes tools RPC stubs (file-based transport)."""
import json, os, shlex, tempfile, threading, time

_RPC_DIR = os.environ.get("HERMES_RPC_DIR") or os.path.join(tempfile.gettempdir(), "hermes_rpc")
_seq = 0
# `_seq += 1` is not atomic (read-modify-write), so concurrent _call()
# invocations from multiple threads could allocate the same sequence number
# and clobber each other's request files. Guard seq allocation with a lock.
_seq_lock = threading.Lock()
''' + _COMMON_HELPERS + '''\

def _call(tool_name, args):
    """Send a tool call request via file-based RPC and wait for response."""
    global _seq
    with _seq_lock:
        _seq += 1
        seq = _seq
    seq_str = f"{seq:06d}"
    req_file = os.path.join(_RPC_DIR, f"req_{seq_str}")
    res_file = os.path.join(_RPC_DIR, f"res_{seq_str}")

    # Write request atomically (write to .tmp, then rename)
    tmp = req_file + ".tmp"
    with open(tmp, "w") as f:
        json.dump({"tool": tool_name, "args": args, "seq": seq}, f)
    os.rename(tmp, req_file)

    # Wait for response with adaptive polling
    deadline = time.monotonic() + 300  # 5-minute timeout per tool call
    poll_interval = 0.05  # Start at 50ms
    while not os.path.exists(res_file):
        if time.monotonic() > deadline:
            raise RuntimeError(f"RPC timeout: no response for {tool_name} after 300s")
        time.sleep(poll_interval)
        poll_interval = min(poll_interval * 1.2, 0.25)  # Back off to 250ms

    with open(res_file) as f:
        raw = f.read()

    # Clean up response file
    try:
        os.unlink(res_file)
    except OSError:
        pass

    result = json.loads(raw)
    if isinstance(result, str):
        try:
            return json.loads(result)
        except (json.JSONDecodeError, TypeError):
            return result
    return result

'''


# ---------------------------------------------------------------------------
# RPC server (runs in a thread inside the parent process)
331 # --------------------------------------------------------------------------- 332 333 # Terminal parameters that must not be used from ephemeral sandbox scripts 334 _TERMINAL_BLOCKED_PARAMS = {"background", "pty", "notify_on_complete", "watch_patterns"} 335 336 337 def _rpc_server_loop( 338 server_sock: socket.socket, 339 task_id: str, 340 tool_call_log: list, 341 tool_call_counter: list, # mutable [int] so the thread can increment 342 max_tool_calls: int, 343 allowed_tools: frozenset, 344 ): 345 """ 346 Accept one client connection and dispatch tool-call requests until 347 the client disconnects or the call limit is reached. 348 """ 349 from model_tools import handle_function_call 350 351 conn = None 352 try: 353 server_sock.settimeout(5) 354 conn, _ = server_sock.accept() 355 conn.settimeout(300) 356 357 buf = b"" 358 while True: 359 try: 360 chunk = conn.recv(65536) 361 except socket.timeout: 362 break 363 if not chunk: 364 break 365 buf += chunk 366 367 # Process all complete newline-delimited messages in the buffer 368 while b"\n" in buf: 369 line, buf = buf.split(b"\n", 1) 370 line = line.strip() 371 if not line: 372 continue 373 374 call_start = time.monotonic() 375 try: 376 request = json.loads(line.decode()) 377 except (json.JSONDecodeError, UnicodeDecodeError) as exc: 378 resp = tool_error(f"Invalid RPC request: {exc}") 379 conn.sendall((resp + "\n").encode()) 380 continue 381 382 tool_name = request.get("tool", "") 383 tool_args = request.get("args", {}) 384 385 # Enforce the allow-list 386 if tool_name not in allowed_tools: 387 available = ", ".join(sorted(allowed_tools)) 388 resp = json.dumps({ 389 "error": ( 390 f"Tool '{tool_name}' is not available in execute_code. " 391 f"Available: {available}" 392 ) 393 }) 394 conn.sendall((resp + "\n").encode()) 395 continue 396 397 # Enforce tool call limit 398 if tool_call_counter[0] >= max_tool_calls: 399 resp = json.dumps({ 400 "error": ( 401 f"Tool call limit reached ({max_tool_calls}). 
" 402 "No more tool calls allowed in this execution." 403 ) 404 }) 405 conn.sendall((resp + "\n").encode()) 406 continue 407 408 # Strip forbidden terminal parameters 409 if tool_name == "terminal" and isinstance(tool_args, dict): 410 for param in _TERMINAL_BLOCKED_PARAMS: 411 tool_args.pop(param, None) 412 413 # Dispatch through the standard tool handler. 414 # Suppress stdout/stderr from internal tool handlers so 415 # their status prints don't leak into the CLI spinner. 416 try: 417 _real_stdout, _real_stderr = sys.stdout, sys.stderr 418 devnull = open(os.devnull, "w") 419 try: 420 sys.stdout = devnull 421 sys.stderr = devnull 422 result = handle_function_call( 423 tool_name, tool_args, task_id=task_id 424 ) 425 finally: 426 sys.stdout, sys.stderr = _real_stdout, _real_stderr 427 devnull.close() 428 except Exception as exc: 429 logger.error("Tool call failed in sandbox: %s", exc, exc_info=True) 430 result = tool_error(str(exc)) 431 432 tool_call_counter[0] += 1 433 call_duration = time.monotonic() - call_start 434 435 # Log for observability 436 args_preview = str(tool_args)[:80] 437 tool_call_log.append({ 438 "tool": tool_name, 439 "args_preview": args_preview, 440 "duration": round(call_duration, 2), 441 }) 442 443 conn.sendall((result + "\n").encode()) 444 445 except socket.timeout: 446 logger.debug("RPC listener socket timeout") 447 except OSError as e: 448 logger.debug("RPC listener socket error: %s", e, exc_info=True) 449 finally: 450 if conn: 451 try: 452 conn.close() 453 except OSError as e: 454 logger.debug("RPC conn close error: %s", e) 455 456 457 # --------------------------------------------------------------------------- 458 # Remote execution support (file-based RPC via terminal backend) 459 # --------------------------------------------------------------------------- 460 461 def _get_or_create_env(task_id: str): 462 """Get or create the terminal environment for *task_id*. 
463 464 Reuses the same environment (container/sandbox/SSH session) that the 465 terminal and file tools use, creating one if it doesn't exist yet. 466 Returns ``(env, env_type)`` tuple. 467 """ 468 from tools.terminal_tool import ( 469 _active_environments, _env_lock, _create_environment, 470 _get_env_config, _last_activity, _start_cleanup_thread, 471 _creation_locks, _creation_locks_lock, _task_env_overrides, 472 _resolve_container_task_id, 473 ) 474 475 effective_task_id = _resolve_container_task_id(task_id) 476 477 # Fast path: environment already exists 478 with _env_lock: 479 if effective_task_id in _active_environments: 480 _last_activity[effective_task_id] = time.time() 481 return _active_environments[effective_task_id], _get_env_config()["env_type"] 482 483 # Slow path: create environment (same pattern as file_tools._get_file_ops) 484 with _creation_locks_lock: 485 if effective_task_id not in _creation_locks: 486 _creation_locks[effective_task_id] = threading.Lock() 487 task_lock = _creation_locks[effective_task_id] 488 489 with task_lock: 490 with _env_lock: 491 if effective_task_id in _active_environments: 492 _last_activity[effective_task_id] = time.time() 493 return _active_environments[effective_task_id], _get_env_config()["env_type"] 494 495 config = _get_env_config() 496 env_type = config["env_type"] 497 overrides = _task_env_overrides.get(effective_task_id, {}) 498 499 if env_type == "docker": 500 image = overrides.get("docker_image") or config["docker_image"] 501 elif env_type == "singularity": 502 image = overrides.get("singularity_image") or config["singularity_image"] 503 elif env_type == "modal": 504 image = overrides.get("modal_image") or config["modal_image"] 505 elif env_type == "daytona": 506 image = overrides.get("daytona_image") or config["daytona_image"] 507 else: 508 image = "" 509 510 cwd = overrides.get("cwd") or config["cwd"] 511 512 container_config = None 513 if env_type in ("docker", "singularity", "modal", "daytona", 
"vercel_sandbox"): 514 container_config = { 515 "container_cpu": config.get("container_cpu", 1), 516 "container_memory": config.get("container_memory", 5120), 517 "container_disk": config.get("container_disk", 51200), 518 "container_persistent": config.get("container_persistent", True), 519 "vercel_runtime": config.get("vercel_runtime", ""), 520 "docker_volumes": config.get("docker_volumes", []), 521 "docker_run_as_host_user": config.get("docker_run_as_host_user", False), 522 } 523 524 ssh_config = None 525 if env_type == "ssh": 526 ssh_config = { 527 "host": config.get("ssh_host", ""), 528 "user": config.get("ssh_user", ""), 529 "port": config.get("ssh_port", 22), 530 "key": config.get("ssh_key", ""), 531 "persistent": config.get("ssh_persistent", False), 532 } 533 534 local_config = None 535 if env_type == "local": 536 local_config = { 537 "persistent": config.get("local_persistent", False), 538 } 539 540 logger.info("Creating new %s environment for execute_code task %s...", 541 env_type, effective_task_id[:8]) 542 env = _create_environment( 543 env_type=env_type, 544 image=image, 545 cwd=cwd, 546 timeout=config["timeout"], 547 ssh_config=ssh_config, 548 container_config=container_config, 549 local_config=local_config, 550 task_id=effective_task_id, 551 host_cwd=config.get("host_cwd"), 552 ) 553 554 with _env_lock: 555 _active_environments[effective_task_id] = env 556 _last_activity[effective_task_id] = time.time() 557 558 _start_cleanup_thread() 559 logger.info("%s environment ready for execute_code task %s", 560 env_type, effective_task_id[:8]) 561 return env, env_type 562 563 564 def _ship_file_to_remote(env, remote_path: str, content: str) -> None: 565 """Write *content* to *remote_path* on the remote environment. 566 567 Uses ``echo … | base64 -d`` rather than stdin piping because some 568 backends (Modal) don't reliably deliver stdin_data to chained 569 commands. Base64 output is shell-safe ([A-Za-z0-9+/=]) so single 570 quotes are fine. 
571 """ 572 encoded = base64.b64encode(content.encode("utf-8")).decode("ascii") 573 quoted_remote_path = shlex.quote(remote_path) 574 env.execute( 575 f"echo '{encoded}' | base64 -d > {quoted_remote_path}", 576 cwd="/", 577 timeout=30, 578 ) 579 580 581 def _env_temp_dir(env: Any) -> str: 582 """Return a writable temp dir for env-backed execute_code sandboxes.""" 583 get_temp_dir = getattr(env, "get_temp_dir", None) 584 if callable(get_temp_dir): 585 try: 586 temp_dir = get_temp_dir() 587 if isinstance(temp_dir, str) and temp_dir.startswith("/"): 588 return temp_dir.rstrip("/") or "/" 589 except Exception as exc: 590 logger.debug("Could not resolve execute_code env temp dir: %s", exc) 591 candidate = tempfile.gettempdir() 592 if isinstance(candidate, str) and candidate.startswith("/"): 593 return candidate.rstrip("/") or "/" 594 return "/tmp" 595 596 597 def _rpc_poll_loop( 598 env, 599 rpc_dir: str, 600 task_id: str, 601 tool_call_log: list, 602 tool_call_counter: list, 603 max_tool_calls: int, 604 allowed_tools: frozenset, 605 stop_event: threading.Event, 606 ): 607 """Poll the remote filesystem for tool call requests and dispatch them. 608 609 Runs in a background thread. Each ``env.execute()`` spawns an 610 independent process, so these calls run safely concurrent with the 611 script-execution thread. 
612 """ 613 from model_tools import handle_function_call 614 615 poll_interval = 0.1 # 100 ms 616 617 quoted_rpc_dir = shlex.quote(rpc_dir) 618 while not stop_event.is_set(): 619 try: 620 # List pending request files (skip .tmp partials) 621 ls_result = env.execute( 622 f"ls -1 {quoted_rpc_dir}/req_* 2>/dev/null || true", 623 cwd="/", 624 timeout=10, 625 ) 626 output = ls_result.get("output", "").strip() 627 if not output: 628 stop_event.wait(poll_interval) 629 continue 630 631 req_files = sorted([ 632 f.strip() for f in output.split("\n") 633 if f.strip() 634 and not f.strip().endswith(".tmp") 635 and "/req_" in f.strip() 636 ]) 637 638 for req_file in req_files: 639 if stop_event.is_set(): 640 break 641 642 call_start = time.monotonic() 643 644 quoted_req_file = shlex.quote(req_file) 645 # Read request 646 read_result = env.execute( 647 f"cat {quoted_req_file}", 648 cwd="/", 649 timeout=10, 650 ) 651 try: 652 request = json.loads(read_result.get("output", "")) 653 except (json.JSONDecodeError, ValueError): 654 logger.debug("Malformed RPC request in %s", req_file) 655 # Remove bad request to avoid infinite retry 656 env.execute(f"rm -f {quoted_req_file}", cwd="/", timeout=5) 657 continue 658 659 tool_name = request.get("tool", "") 660 tool_args = request.get("args", {}) 661 seq = request.get("seq", 0) 662 seq_str = f"{seq:06d}" 663 res_file = f"{rpc_dir}/res_{seq_str}" 664 quoted_res_file = shlex.quote(res_file) 665 666 # Enforce allow-list 667 if tool_name not in allowed_tools: 668 available = ", ".join(sorted(allowed_tools)) 669 tool_result = json.dumps({ 670 "error": ( 671 f"Tool '{tool_name}' is not available in execute_code. " 672 f"Available: {available}" 673 ) 674 }) 675 # Enforce tool call limit 676 elif tool_call_counter[0] >= max_tool_calls: 677 tool_result = json.dumps({ 678 "error": ( 679 f"Tool call limit reached ({max_tool_calls}). " 680 "No more tool calls allowed in this execution." 
681 ) 682 }) 683 else: 684 # Strip forbidden terminal parameters 685 if tool_name == "terminal" and isinstance(tool_args, dict): 686 for param in _TERMINAL_BLOCKED_PARAMS: 687 tool_args.pop(param, None) 688 689 # Dispatch through the standard tool handler 690 try: 691 _real_stdout, _real_stderr = sys.stdout, sys.stderr 692 devnull = open(os.devnull, "w") 693 try: 694 sys.stdout = devnull 695 sys.stderr = devnull 696 tool_result = handle_function_call( 697 tool_name, tool_args, task_id=task_id 698 ) 699 finally: 700 sys.stdout, sys.stderr = _real_stdout, _real_stderr 701 devnull.close() 702 except Exception as exc: 703 logger.error("Tool call failed in remote sandbox: %s", 704 exc, exc_info=True) 705 tool_result = tool_error(str(exc)) 706 707 tool_call_counter[0] += 1 708 call_duration = time.monotonic() - call_start 709 tool_call_log.append({ 710 "tool": tool_name, 711 "args_preview": str(tool_args)[:80], 712 "duration": round(call_duration, 2), 713 }) 714 715 # Write response atomically (tmp + rename). 716 # Use echo piping (not stdin_data) because Modal doesn't 717 # reliably deliver stdin to chained commands. 718 encoded_result = base64.b64encode( 719 tool_result.encode("utf-8") 720 ).decode("ascii") 721 env.execute( 722 f"echo '{encoded_result}' | base64 -d > {quoted_res_file}.tmp" 723 f" && mv {quoted_res_file}.tmp {quoted_res_file}", 724 cwd="/", 725 timeout=60, 726 ) 727 728 # Remove the request file 729 env.execute(f"rm -f {quoted_req_file}", cwd="/", timeout=5) 730 731 except Exception as e: 732 if not stop_event.is_set(): 733 logger.debug("RPC poll error: %s", e, exc_info=True) 734 735 if not stop_event.is_set(): 736 stop_event.wait(poll_interval) 737 738 739 def _execute_remote( 740 code: str, 741 task_id: Optional[str], 742 enabled_tools: Optional[List[str]], 743 ) -> str: 744 """Run a script on the remote terminal backend via file-based RPC. 
745 746 The script and the generated hermes_tools.py module are shipped to 747 the remote environment, and tool calls are proxied through a polling 748 thread that communicates via request/response files. 749 """ 750 751 _cfg = _load_config() 752 timeout = _cfg.get("timeout", DEFAULT_TIMEOUT) 753 max_tool_calls = _cfg.get("max_tool_calls", DEFAULT_MAX_TOOL_CALLS) 754 755 session_tools = set(enabled_tools) if enabled_tools else set() 756 sandbox_tools = frozenset(SANDBOX_ALLOWED_TOOLS & session_tools) 757 if not sandbox_tools: 758 sandbox_tools = SANDBOX_ALLOWED_TOOLS 759 760 effective_task_id = task_id or "default" 761 env, env_type = _get_or_create_env(effective_task_id) 762 763 sandbox_id = uuid.uuid4().hex[:12] 764 temp_dir = _env_temp_dir(env) 765 sandbox_dir = f"{temp_dir}/hermes_exec_{sandbox_id}" 766 quoted_sandbox_dir = shlex.quote(sandbox_dir) 767 quoted_rpc_dir = shlex.quote(f"{sandbox_dir}/rpc") 768 769 tool_call_log: list = [] 770 tool_call_counter = [0] 771 exec_start = time.monotonic() 772 stop_event = threading.Event() 773 rpc_thread = None 774 775 try: 776 # Verify Python is available on the remote 777 py_check = env.execute( 778 "command -v python3 >/dev/null 2>&1 && echo OK", 779 cwd="/", timeout=15, 780 ) 781 if "OK" not in py_check.get("output", ""): 782 return json.dumps({ 783 "status": "error", 784 "error": ( 785 f"Python 3 is not available in the {env_type} terminal " 786 "environment. Install Python to use execute_code with " 787 "remote backends." 
788 ), 789 "tool_calls_made": 0, 790 "duration_seconds": 0, 791 }) 792 793 # Create sandbox directory on remote 794 env.execute( 795 f"mkdir -p {quoted_rpc_dir}", cwd="/", timeout=10, 796 ) 797 798 # Generate and ship files 799 tools_src = generate_hermes_tools_module( 800 list(sandbox_tools), transport="file", 801 ) 802 _ship_file_to_remote(env, f"{sandbox_dir}/hermes_tools.py", tools_src) 803 _ship_file_to_remote(env, f"{sandbox_dir}/script.py", code) 804 805 # Start RPC polling thread 806 rpc_thread = threading.Thread( 807 target=_rpc_poll_loop, 808 args=( 809 env, f"{sandbox_dir}/rpc", effective_task_id, 810 tool_call_log, tool_call_counter, max_tool_calls, 811 sandbox_tools, stop_event, 812 ), 813 daemon=True, 814 ) 815 rpc_thread.start() 816 817 # Build environment variable prefix for the script 818 env_prefix = ( 819 f"HERMES_RPC_DIR={shlex.quote(f'{sandbox_dir}/rpc')} " 820 f"PYTHONDONTWRITEBYTECODE=1" 821 ) 822 tz = os.getenv("HERMES_TIMEZONE", "").strip() 823 if tz: 824 env_prefix += f" TZ={tz}" 825 826 # Execute the script on the remote backend 827 logger.info("Executing code on %s backend (task %s)...", 828 env_type, effective_task_id[:8]) 829 script_result = env.execute( 830 f"cd {quoted_sandbox_dir} && {env_prefix} python3 script.py", 831 timeout=timeout, 832 ) 833 834 stdout_text = script_result.get("output", "") 835 exit_code = script_result.get("returncode", -1) 836 status = "success" 837 838 # Check for timeout/interrupt from the backend 839 if exit_code == 124: 840 status = "timeout" 841 elif exit_code == 130: 842 status = "interrupted" 843 844 except Exception as exc: 845 duration = round(time.monotonic() - exec_start, 2) 846 logger.error( 847 "execute_code remote failed after %ss with %d tool calls: %s: %s", 848 duration, tool_call_counter[0], type(exc).__name__, exc, 849 exc_info=True, 850 ) 851 return json.dumps({ 852 "status": "error", 853 "error": str(exc), 854 "tool_calls_made": tool_call_counter[0], 855 "duration_seconds": duration, 856 
}, ensure_ascii=False) 857 858 finally: 859 # Stop the polling thread 860 stop_event.set() 861 if rpc_thread is not None: 862 rpc_thread.join(timeout=5) 863 864 # Clean up remote sandbox dir 865 try: 866 env.execute( 867 f"rm -rf {quoted_sandbox_dir}", cwd="/", timeout=15, 868 ) 869 except Exception: 870 logger.debug("Failed to clean up remote sandbox %s", sandbox_dir) 871 872 duration = round(time.monotonic() - exec_start, 2) 873 874 # --- Post-process output (same as local path) --- 875 876 # Truncate stdout to cap 877 if len(stdout_text) > MAX_STDOUT_BYTES: 878 head_bytes = int(MAX_STDOUT_BYTES * 0.4) 879 tail_bytes = MAX_STDOUT_BYTES - head_bytes 880 head = stdout_text[:head_bytes] 881 tail = stdout_text[-tail_bytes:] 882 omitted = len(stdout_text) - len(head) - len(tail) 883 stdout_text = ( 884 head 885 + f"\n\n... [OUTPUT TRUNCATED - {omitted:,} chars omitted " 886 f"out of {len(stdout_text):,} total] ...\n\n" 887 + tail 888 ) 889 890 # Strip ANSI escape sequences 891 from tools.ansi_strip import strip_ansi 892 stdout_text = strip_ansi(stdout_text) 893 894 # Redact secrets 895 from agent.redact import redact_sensitive_text 896 stdout_text = redact_sensitive_text(stdout_text) 897 898 # Build response 899 result: Dict[str, Any] = { 900 "status": status, 901 "output": stdout_text, 902 "tool_calls_made": tool_call_counter[0], 903 "duration_seconds": duration, 904 } 905 906 if status == "timeout": 907 timeout_msg = f"Script timed out after {timeout}s and was killed." 908 result["error"] = timeout_msg 909 # Include timeout message in output so the LLM always surfaces it 910 # to the user (see local path comment — same reasoning, #10807). 
911 if stdout_text: 912 result["output"] = stdout_text + f"\n\n⏰ {timeout_msg}" 913 else: 914 result["output"] = f"⏰ {timeout_msg}" 915 logger.warning( 916 "execute_code (remote) timed out after %ss (limit %ss) with %d tool calls", 917 duration, timeout, tool_call_counter[0], 918 ) 919 elif status == "interrupted": 920 result["output"] = ( 921 stdout_text + "\n[execution interrupted — user sent a new message]" 922 ) 923 elif exit_code != 0: 924 result["status"] = "error" 925 result["error"] = f"Script exited with code {exit_code}" 926 927 return json.dumps(result, ensure_ascii=False) 928 929 930 # --------------------------------------------------------------------------- 931 # Main entry point 932 # --------------------------------------------------------------------------- 933 934 def execute_code( 935 code: str, 936 task_id: Optional[str] = None, 937 enabled_tools: Optional[List[str]] = None, 938 ) -> str: 939 """ 940 Run a Python script in a sandboxed child process with RPC access 941 to a subset of Hermes tools. 942 943 Dispatches to the local (UDS) or remote (file-based RPC) path 944 depending on the configured terminal backend. 945 946 Args: 947 code: Python source code to execute. 948 task_id: Session task ID for tool isolation (terminal env, etc.). 949 enabled_tools: Tool names enabled in the current session. The sandbox 950 gets the intersection with SANDBOX_ALLOWED_TOOLS. 951 952 Returns: 953 JSON string with execution results. 954 """ 955 if not SANDBOX_AVAILABLE: 956 return json.dumps({ 957 "error": "execute_code is not available on Windows. Use normal tool calls instead." 
958 }) 959 960 if not code or not code.strip(): 961 return tool_error("No code provided.") 962 963 # Dispatch: remote backends use file-based RPC, local uses UDS 964 from tools.terminal_tool import _get_env_config 965 env_type = _get_env_config()["env_type"] 966 if env_type != "local": 967 return _execute_remote(code, task_id, enabled_tools) 968 969 # --- Local execution path (UDS) --- below this line is unchanged --- 970 971 # Import per-thread interrupt check (cooperative cancellation) 972 from tools.interrupt import is_interrupted as _is_interrupted 973 974 # Resolve config 975 _cfg = _load_config() 976 timeout = _cfg.get("timeout", DEFAULT_TIMEOUT) 977 max_tool_calls = _cfg.get("max_tool_calls", DEFAULT_MAX_TOOL_CALLS) 978 979 # Determine which tools the sandbox can call 980 session_tools = set(enabled_tools) if enabled_tools else set() 981 sandbox_tools = frozenset(SANDBOX_ALLOWED_TOOLS & session_tools) 982 983 if not sandbox_tools: 984 sandbox_tools = SANDBOX_ALLOWED_TOOLS 985 986 # --- Set up temp directory with hermes_tools.py and script.py --- 987 tmpdir = tempfile.mkdtemp(prefix="hermes_sandbox_") 988 # Use /tmp on macOS to avoid the long /var/folders/... path that pushes 989 # Unix domain socket paths past the 104-byte macOS AF_UNIX limit. 990 # On Linux, tempfile.gettempdir() already returns /tmp. 991 _sock_tmpdir = "/tmp" if sys.platform == "darwin" else tempfile.gettempdir() 992 sock_path = os.path.join(_sock_tmpdir, f"hermes_rpc_{uuid.uuid4().hex}.sock") 993 994 tool_call_log: list = [] 995 tool_call_counter = [0] # mutable so the RPC thread can increment 996 exec_start = time.monotonic() 997 server_sock = None 998 999 try: 1000 # Write the auto-generated hermes_tools module 1001 # sandbox_tools is already the correct set (intersection with session 1002 # tools, or SANDBOX_ALLOWED_TOOLS as fallback — see lines above). 
        tools_src = generate_hermes_tools_module(list(sandbox_tools))
        with open(os.path.join(tmpdir, "hermes_tools.py"), "w") as f:
            f.write(tools_src)

        # Write the user's script
        with open(os.path.join(tmpdir, "script.py"), "w") as f:
            f.write(code)

        # --- Start UDS server ---
        server_sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        server_sock.bind(sock_path)
        os.chmod(sock_path, 0o600)  # owner-only: other local users can't call our tools
        server_sock.listen(1)

        rpc_thread = threading.Thread(
            target=_rpc_server_loop,
            args=(
                server_sock, task_id, tool_call_log,
                tool_call_counter, max_tool_calls, sandbox_tools,
            ),
            daemon=True,
        )
        rpc_thread.start()

        # --- Spawn child process ---
        # Build a minimal environment for the child. We intentionally exclude
        # API keys and tokens to prevent credential exfiltration from LLM-
        # generated scripts. The child accesses tools via RPC, not direct API.
        # Exception: env vars declared by loaded skills (via env_passthrough
        # registry) or explicitly allowed by the user in config.yaml
        # (terminal.env_passthrough) are passed through.
        _SAFE_ENV_PREFIXES = ("PATH", "HOME", "USER", "LANG", "LC_", "TERM",
                              "TMPDIR", "TMP", "TEMP", "SHELL", "LOGNAME",
                              "XDG_", "PYTHONPATH", "VIRTUAL_ENV", "CONDA",
                              "HERMES_")
        _SECRET_SUBSTRINGS = ("KEY", "TOKEN", "SECRET", "PASSWORD", "CREDENTIAL",
                              "PASSWD", "AUTH")
        try:
            from tools.env_passthrough import is_env_passthrough as _is_passthrough
        except Exception:
            # Registry module unavailable — treat nothing as passthrough.
            _is_passthrough = lambda _: False  # noqa: E731
        child_env: Dict[str, str] = {}
        for k, v in os.environ.items():
            # Passthrough vars (skill-declared or user-configured) always pass.
            if _is_passthrough(k):
                child_env[k] = v
                continue
            # Block vars with secret-like names.
            if any(s in k.upper() for s in _SECRET_SUBSTRINGS):
                continue
            # Allow vars with known safe prefixes.
            if any(k.startswith(p) for p in _SAFE_ENV_PREFIXES):
                child_env[k] = v
        child_env["HERMES_RPC_SOCKET"] = sock_path
        child_env["PYTHONDONTWRITEBYTECODE"] = "1"
        # Ensure the hermes-agent root is importable in the sandbox so
        # repo-root modules are available to child scripts. We also prepend
        # the staging tmpdir so ``from hermes_tools import ...`` resolves even
        # when the subprocess CWD is not tmpdir (project mode).
        _hermes_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        _existing_pp = child_env.get("PYTHONPATH", "")
        _pp_parts = [tmpdir, _hermes_root]
        if _existing_pp:
            _pp_parts.append(_existing_pp)
        child_env["PYTHONPATH"] = os.pathsep.join(_pp_parts)
        # Inject user's configured timezone so datetime.now() in sandboxed
        # code reflects the correct wall-clock time. Only TZ is set —
        # HERMES_TIMEZONE is an internal Hermes setting and must not leak
        # into child processes.
        _tz_name = os.getenv("HERMES_TIMEZONE", "").strip()
        if _tz_name:
            child_env["TZ"] = _tz_name
        child_env.pop("HERMES_TIMEZONE", None)

        # Per-profile HOME isolation: redirect system tool configs into
        # {HERMES_HOME}/home/ when that directory exists.
        from hermes_constants import get_subprocess_home
        _profile_home = get_subprocess_home()
        if _profile_home:
            child_env["HOME"] = _profile_home

        # Resolve interpreter + CWD based on execute_code mode.
        # - strict : today's behavior (sys.executable + tmpdir CWD).
        # - project: user's venv python + session's working directory, so
        #   project deps like pandas and user files resolve.
        # Env scrubbing and tool whitelist apply identically in both modes.
        _mode = _get_execution_mode()
        _child_python = _resolve_child_python(_mode)
        _child_cwd = _resolve_child_cwd(_mode, tmpdir)
        _script_path = os.path.join(tmpdir, "script.py")

        proc = subprocess.Popen(
            [_child_python, _script_path],
            cwd=_child_cwd,
            env=child_env,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            stdin=subprocess.DEVNULL,
            # New session/process group so _kill_process_group can killpg()
            # the script AND anything it spawned.
            preexec_fn=None if _IS_WINDOWS else os.setsid,
        )

        # --- Poll loop: watch for exit, timeout, and interrupt ---
        deadline = time.monotonic() + timeout
        stderr_chunks: list = []

        # Background readers to avoid pipe buffer deadlocks.
        # For stdout we use a head+tail strategy: keep the first HEAD_BYTES
        # and a rolling window of the last TAIL_BYTES so the final print()
        # output is never lost. Stderr keeps head-only (errors appear early).
        _STDOUT_HEAD_BYTES = int(MAX_STDOUT_BYTES * 0.4)  # 40% head
        _STDOUT_TAIL_BYTES = MAX_STDOUT_BYTES - _STDOUT_HEAD_BYTES  # 60% tail

        def _drain(pipe, chunks, max_bytes):
            """Simple head-only drain (used for stderr).

            Keeps at most max_bytes in ``chunks`` but always reads the pipe
            to EOF so the child never blocks on a full pipe buffer.
            """
            total = 0
            try:
                while True:
                    data = pipe.read(4096)
                    if not data:
                        break
                    if total < max_bytes:
                        keep = max_bytes - total
                        chunks.append(data[:keep])
                    total += len(data)
            except (ValueError, OSError) as e:
                logger.debug("Error reading process output: %s", e, exc_info=True)

        stdout_total_bytes = [0]  # mutable ref for total bytes seen

        def _drain_head_tail(pipe, head_chunks, tail_chunks, head_bytes, tail_bytes, total_ref):
            """Drain stdout keeping both head and tail data."""
            head_collected = 0
            from collections import deque
            tail_buf = deque()
            tail_collected = 0
            try:
                while True:
                    data = pipe.read(4096)
                    if not data:
                        break
                    total_ref[0] += len(data)
                    # Fill head buffer first
                    if head_collected < head_bytes:
                        keep = min(len(data), head_bytes - head_collected)
                        head_chunks.append(data[:keep])
                        head_collected += keep
                        data = data[keep:]  # remaining goes to tail
                        if not data:
                            continue
                    # Everything past head goes into rolling tail buffer
                    tail_buf.append(data)
                    tail_collected += len(data)
                    # Evict old tail data to stay within tail_bytes budget
                    # (whole chunks, so the kept tail may be slightly smaller
                    # than the budget).
                    while tail_collected > tail_bytes and tail_buf:
                        oldest = tail_buf.popleft()
                        tail_collected -= len(oldest)
            except (ValueError, OSError):
                pass
            # Transfer final tail to output list
            tail_chunks.extend(tail_buf)

        stdout_head_chunks: list = []
        stdout_tail_chunks: list = []

        stdout_reader = threading.Thread(
            target=_drain_head_tail,
            args=(proc.stdout, stdout_head_chunks, stdout_tail_chunks,
                  _STDOUT_HEAD_BYTES, _STDOUT_TAIL_BYTES, stdout_total_bytes),
            daemon=True
        )
        stderr_reader = threading.Thread(
            target=_drain, args=(proc.stderr, stderr_chunks, MAX_STDERR_BYTES), daemon=True
        )
        stdout_reader.start()
        stderr_reader.start()

        status = "success"
        _activity_state = {
            "last_touch": time.monotonic(),
            "start": exec_start,
        }
        while proc.poll() is None:
            if _is_interrupted():
                _kill_process_group(proc)
                status = "interrupted"
                break
            if time.monotonic() > deadline:
                _kill_process_group(proc, escalate=True)
                status = "timeout"
                break
            # Periodic activity touch so the gateway's inactivity timeout
            # doesn't kill the agent during long code execution (#10807).
            try:
                from tools.environments.base import touch_activity_if_due
                touch_activity_if_due(_activity_state, "execute_code running")
            except Exception:
                pass  # best-effort: missing module must not abort the poll loop
            time.sleep(0.2)

        # Wait for readers to finish draining
        stdout_reader.join(timeout=3)
        stderr_reader.join(timeout=3)

        stdout_head = b"".join(stdout_head_chunks).decode("utf-8", errors="replace")
        stdout_tail = b"".join(stdout_tail_chunks).decode("utf-8", errors="replace")
        stderr_text = b"".join(stderr_chunks).decode("utf-8", errors="replace")

        # Assemble stdout with head+tail truncation
        # NOTE(review): total_stdout counts bytes while len(stdout_head/tail)
        # count decoded characters, so the "chars omitted" figure is
        # approximate for non-ASCII output.
        total_stdout = stdout_total_bytes[0]
        if total_stdout > MAX_STDOUT_BYTES and stdout_tail:
            omitted = total_stdout - len(stdout_head) - len(stdout_tail)
            truncated_notice = (
                f"\n\n... [OUTPUT TRUNCATED - {omitted:,} chars omitted "
                f"out of {total_stdout:,} total] ...\n\n"
            )
            stdout_text = stdout_head + truncated_notice + stdout_tail
        else:
            stdout_text = stdout_head + stdout_tail

        exit_code = proc.returncode if proc.returncode is not None else -1
        duration = round(time.monotonic() - exec_start, 2)

        # Wait for RPC thread to finish
        server_sock.close()  # break accept() so thread exits promptly
        server_sock = None  # prevent double close in finally
        rpc_thread.join(timeout=3)

        # Strip ANSI escape sequences so the model never sees terminal
        # formatting — prevents it from copying escapes into file writes.
        from tools.ansi_strip import strip_ansi
        stdout_text = strip_ansi(stdout_text)
        stderr_text = strip_ansi(stderr_text)

        # Redact secrets (API keys, tokens, etc.) from sandbox output.
        # The sandbox env-var filter (lines 434-454) blocks os.environ access,
        # but scripts can still read secrets from disk (e.g. open('~/.hermes/.env')).
        # This ensures leaked secrets never enter the model context.
        from agent.redact import redact_sensitive_text
        stdout_text = redact_sensitive_text(stdout_text)
        stderr_text = redact_sensitive_text(stderr_text)

        # Build response
        result: Dict[str, Any] = {
            "status": status,
            "output": stdout_text,
            "tool_calls_made": tool_call_counter[0],
            "duration_seconds": duration,
        }

        if status == "timeout":
            timeout_msg = f"Script timed out after {timeout}s and was killed."
            result["error"] = timeout_msg
            # Include timeout message in output so the LLM always surfaces it
            # to the user. When output is empty, models often treat the result
            # as "nothing happened" and produce an empty response, which the
            # gateway stream consumer silently drops (#10807).
            if stdout_text:
                result["output"] = stdout_text + f"\n\n⏰ {timeout_msg}"
            else:
                result["output"] = f"⏰ {timeout_msg}"
            logger.warning(
                "execute_code timed out after %ss (limit %ss) with %d tool calls",
                duration, timeout, tool_call_counter[0],
            )
        elif status == "interrupted":
            result["output"] = stdout_text + "\n[execution interrupted — user sent a new message]"
        elif exit_code != 0:
            result["status"] = "error"
            result["error"] = stderr_text or f"Script exited with code {exit_code}"
            # Include stderr in output so the LLM sees the traceback
            if stderr_text:
                result["output"] = stdout_text + "\n--- stderr ---\n" + stderr_text

        return json.dumps(result, ensure_ascii=False)

    except Exception as exc:
        # Unexpected host-side failure (not a script error): log with stack
        # and return a structured error payload instead of raising.
        duration = round(time.monotonic() - exec_start, 2)
        logger.error(
            "execute_code failed after %ss with %d tool calls: %s: %s",
            duration,
            tool_call_counter[0],
            type(exc).__name__,
            exc,
            exc_info=True,
        )
        return json.dumps({
            "status": "error",
            "error": str(exc),
            "tool_calls_made": tool_call_counter[0],
            "duration_seconds": duration,
        }, ensure_ascii=False)

    finally:
        # Cleanup temp dir and socket
        if server_sock is not None:
            try:
                server_sock.close()
            except OSError as e:
                logger.debug("Server socket close error: %s", e)
        import shutil
        shutil.rmtree(tmpdir, ignore_errors=True)
        try:
            os.unlink(sock_path)
        except OSError:
            pass  # already cleaned up or never created


def _kill_process_group(proc, escalate: bool = False):
    """Kill the child and its entire process group.

    Sends SIGTERM to the child's process group (created via os.setsid at
    spawn). With ``escalate=True``, waits up to 5s and then SIGKILLs the
    group. Falls back to killing just ``proc`` when the group signal fails.
    """
    try:
        if _IS_WINDOWS:
            proc.terminate()
        else:
            os.killpg(os.getpgid(proc.pid), signal.SIGTERM)
    except (ProcessLookupError, PermissionError) as e:
        logger.debug("Could not kill process group: %s", e, exc_info=True)
        # NOTE(review): fallback direct kill assumed to be nested in this
        # except branch (only runs when the group kill failed) — confirm
        # against the original formatting.
        try:
            proc.kill()
        except Exception as e2:
            logger.debug("Could not kill process: %s", e2, exc_info=True)

    if escalate:
        # Give the process 5s to exit after SIGTERM, then SIGKILL
        try:
            proc.wait(timeout=5)
        except subprocess.TimeoutExpired:
            try:
                if _IS_WINDOWS:
                    proc.kill()
                else:
                    os.killpg(os.getpgid(proc.pid), signal.SIGKILL)
            except (ProcessLookupError, PermissionError) as e:
                logger.debug("Could not kill process group with SIGKILL: %s", e, exc_info=True)
                try:
                    proc.kill()
                except Exception as e2:
                    logger.debug("Could not kill process: %s", e2, exc_info=True)


def _load_config() -> dict:
    """Load code_execution config without importing the interactive CLI.

    This helper is called while building the module-level execute_code schema
    during tool discovery. Importing ``cli`` here pulls prompt_toolkit/Rich and
    a large chunk of the classic REPL onto every agent startup path, including
    ``hermes --tui`` where it is never used. Read the lightweight raw config
    instead; the config layer already caches by (mtime, size), and an absent
    key cleanly falls back to DEFAULT_EXECUTION_MODE.
    """
    try:
        from hermes_cli.config import read_raw_config

        cfg = read_raw_config().get("code_execution", {})
        return cfg if isinstance(cfg, dict) else {}
    except Exception:
        # Any config-read failure degrades to defaults rather than crashing
        # tool discovery.
        return {}


# ---------------------------------------------------------------------------
# Execution mode resolution (strict vs project)
# ---------------------------------------------------------------------------

# Valid values for code_execution.mode. Kept as a module constant so tests
# and the config layer can reference the canonical set.
EXECUTION_MODES = ("project", "strict")
DEFAULT_EXECUTION_MODE = "project"


def _get_execution_mode() -> str:
    """Return the active execute_code mode — 'project' or 'strict'.

    Reads ``code_execution.mode`` from config.yaml; invalid values fall back
    to ``DEFAULT_EXECUTION_MODE`` ('project') with a log warning.

    Mode semantics:
    - ``project`` (default): scripts run in the session's working directory
      with the active virtual environment's python, so project dependencies
      (pandas, torch, project packages) and files resolve naturally.
    - ``strict``: scripts run in an isolated temp directory with
      ``sys.executable`` (hermes-agent's python). Reproducible and the
      interpreter is guaranteed to work, but project deps and relative paths
      won't resolve.

    Env scrubbing and tool whitelist apply identically in both modes.
    """
    cfg_value = str(_load_config().get("mode", DEFAULT_EXECUTION_MODE)).strip().lower()
    if cfg_value in EXECUTION_MODES:
        return cfg_value
    logger.warning(
        "Ignoring code_execution.mode=%r (expected one of %s), falling back to %r",
        cfg_value, EXECUTION_MODES, DEFAULT_EXECUTION_MODE,
    )
    return DEFAULT_EXECUTION_MODE


@functools.lru_cache(maxsize=32)
def _is_usable_python(python_path: str) -> bool:
    """Check whether a candidate Python interpreter is usable for execute_code.

    Requires Python 3.8+ (f-strings and stdlib modules the RPC stubs need).
    Cached so we don't fork a subprocess on every execute_code call.
    """
    try:
        result = subprocess.run(
            [python_path, "-c",
             "import sys; sys.exit(0 if sys.version_info >= (3, 8) else 1)"],
            timeout=5,
            capture_output=True,
        )
        return result.returncode == 0
    except (OSError, subprocess.TimeoutExpired, subprocess.SubprocessError):
        # Missing binary, hung interpreter, or spawn failure — not usable.
        return False


def _resolve_child_python(mode: str) -> str:
    """Pick the Python interpreter for the execute_code subprocess.

    In ``strict`` mode, always ``sys.executable`` — guaranteed to work and
    keeps behavior fully reproducible across sessions.

    In ``project`` mode, prefer the user's active virtualenv/conda env's
    python so ``import pandas`` etc. work. Falls back to ``sys.executable``
    if no venv is detected, the candidate binary is missing/not executable,
    or it fails a Python 3.8+ version check.
    """
    if mode != "project":
        return sys.executable

    if _IS_WINDOWS:
        exe_names = ("python.exe", "python3.exe")
        subdirs = ("Scripts",)
    else:
        exe_names = ("python", "python3")
        subdirs = ("bin",)

    # VIRTUAL_ENV takes precedence over CONDA_PREFIX.
    for var in ("VIRTUAL_ENV", "CONDA_PREFIX"):
        root = os.environ.get(var, "").strip()
        if not root:
            continue
        for subdir in subdirs:
            for exe in exe_names:
                candidate = os.path.join(root, subdir, exe)
                if not (os.path.isfile(candidate) and os.access(candidate, os.X_OK)):
                    continue
                if _is_usable_python(candidate):
                    return candidate
                # Found the interpreter but it failed the version check —
                # log once and fall through to sys.executable.
                logger.info(
                    "execute_code: skipping %s=%s (Python version < 3.8 or broken). "
                    "Using sys.executable instead.", var, candidate,
                )
                return sys.executable

    return sys.executable


def _resolve_child_cwd(mode: str, staging_dir: str) -> str:
    """Resolve the working directory for the execute_code subprocess.

    - ``strict``: the staging tmpdir (today's behavior).
    - ``project``: the session's TERMINAL_CWD (same as the terminal tool), or
      ``os.getcwd()`` if TERMINAL_CWD is unset or doesn't point at a real dir.
      Falls back to the staging tmpdir as a last resort so we never invoke
      Popen with a nonexistent cwd.
    """
    if mode != "project":
        return staging_dir
    raw = os.environ.get("TERMINAL_CWD", "").strip()
    if raw:
        expanded = os.path.expanduser(raw)
        if os.path.isdir(expanded):
            return expanded
    here = os.getcwd()
    if os.path.isdir(here):
        return here
    return staging_dir


# ---------------------------------------------------------------------------
# OpenAI Function-Calling Schema
# ---------------------------------------------------------------------------

# Per-tool documentation lines for the execute_code description.
# Ordered to match the canonical display order.
_TOOL_DOC_LINES = [
    ("web_search",
     " web_search(query: str, limit: int = 5) -> dict\n"
     " Returns {\"data\": {\"web\": [{\"url\", \"title\", \"description\"}, ...]}}"),
    ("web_extract",
     " web_extract(urls: list[str]) -> dict\n"
     " Returns {\"results\": [{\"url\", \"title\", \"content\", \"error\"}, ...]} where content is markdown"),
    ("read_file",
     " read_file(path: str, offset: int = 1, limit: int = 500) -> dict\n"
     " Lines are 1-indexed. Returns {\"content\": \"...\", \"total_lines\": N}"),
    ("write_file",
     " write_file(path: str, content: str) -> dict\n"
     " Always overwrites the entire file."),
    ("search_files",
     " search_files(pattern: str, target=\"content\", path=\".\", file_glob=None, limit=50) -> dict\n"
     " target: \"content\" (search inside files) or \"files\" (find files by name). Returns {\"matches\": [...]}"),
    ("patch",
     " patch(path: str, old_string: str, new_string: str, replace_all: bool = False) -> dict\n"
     " Replaces old_string with new_string in the file."),
    ("terminal",
     " terminal(command: str, timeout=None, workdir=None) -> dict\n"
     " Foreground only (no background/pty). Returns {\"output\": \"...\", \"exit_code\": N}"),
]


def build_execute_code_schema(enabled_sandbox_tools: set = None,
                              mode: str = None) -> dict:
    """Build the execute_code schema with description listing only enabled tools.

    When tools are disabled via ``hermes tools`` (e.g. web is turned off),
    the schema description should NOT mention web_search / web_extract —
    otherwise the model thinks they are available and keeps trying to use them.

    ``mode`` controls the working-directory sentence in the description:
    - ``'strict'``: scripts run in a temp dir (not the session's CWD)
    - ``'project'`` (default): scripts run in the session's CWD with the
      active venv's python
    If ``mode`` is None, the current ``code_execution.mode`` config is read.
    """
    if enabled_sandbox_tools is None:
        enabled_sandbox_tools = SANDBOX_ALLOWED_TOOLS
    if mode is None:
        mode = _get_execution_mode()

    # Build tool documentation lines for only the enabled tools
    tool_lines = "\n".join(
        doc for name, doc in _TOOL_DOC_LINES if name in enabled_sandbox_tools
    )

    # Build example import list from enabled tools
    import_examples = [n for n in ("web_search", "terminal") if n in enabled_sandbox_tools]
    if not import_examples:
        import_examples = sorted(enabled_sandbox_tools)[:2]
    if import_examples:
        import_str = ", ".join(import_examples) + ", ..."
    else:
        import_str = "..."

    # Mode-specific CWD guidance. Project mode is the default and matches
    # terminal()'s filesystem/interpreter; strict mode retains the isolated
    # temp-dir staging and hermes-agent's own python.
    if mode == "strict":
        cwd_note = (
            "Scripts run in their own temp dir, not the session's CWD — use absolute paths "
            "(os.path.expanduser('~/.hermes/.env')) or terminal()/read_file() for user files."
        )
    else:
        cwd_note = (
            "Scripts run in the session's working directory with the active venv's python, "
            "so project deps (pandas, etc.) and relative paths work like in terminal()."
        )

    description = (
        "Run a Python script that can call Hermes tools programmatically. "
        "Use this when you need 3+ tool calls with processing logic between them, "
        "need to filter/reduce large tool outputs before they enter your context, "
        "need conditional branching (if X then Y else Z), or need to loop "
        "(fetch N pages, process N files, retry on failure).\n\n"
        "Use normal tool calls instead when: single tool call with no processing, "
        "you need to see the full result and apply complex reasoning, "
        "or the task requires interactive user input.\n\n"
        f"Available via `from hermes_tools import ...`:\n\n"
        f"{tool_lines}\n\n"
        "Limits: 5-minute timeout, 50KB stdout cap, max 50 tool calls per script. "
        "terminal() is foreground-only (no background or pty).\n\n"
        f"{cwd_note}\n\n"
        "Print your final result to stdout. Use Python stdlib (json, re, math, csv, "
        "datetime, collections, etc.) for processing between tool calls.\n\n"
        "Also available (no import needed — built into hermes_tools):\n"
        " json_parse(text: str) — json.loads with strict=False; use for terminal() output with control chars\n"
        " shell_quote(s: str) — shlex.quote(); use when interpolating dynamic strings into shell commands\n"
        " retry(fn, max_attempts=3, delay=2) — retry with exponential backoff for transient failures"
    )

    # OpenAI function-calling schema shape: name / description / parameters.
    return {
        "name": "execute_code",
        "description": description,
        "parameters": {
            "type": "object",
            "properties": {
                "code": {
                    "type": "string",
                    "description": (
                        "Python code to execute. Import tools with "
                        f"`from hermes_tools import {import_str}` "
                        "and print your final result to stdout."
                    ),
                },
            },
            "required": ["code"],
        },
    }


# Default schema used at registration time (all sandbox tools listed,
# current configured mode). model_tools.py rebuilds per-session anyway.
EXECUTE_CODE_SCHEMA = build_execute_code_schema()


# --- Registry ---
from tools.registry import registry, tool_error

registry.register(
    name="execute_code",
    toolset="code_execution",
    schema=EXECUTE_CODE_SCHEMA,
    # Handler adapts the registry calling convention (args dict + session
    # kwargs) to execute_code's signature.
    handler=lambda args, **kw: execute_code(
        code=args.get("code", ""),
        task_id=kw.get("task_id"),
        enabled_tools=kw.get("enabled_tools")),
    check_fn=check_sandbox_requirements,
    emoji="🐍",
    max_result_size_chars=100_000,
)