# tools/code_execution_tool.py
   1  #!/usr/bin/env python3
   2  """
   3  Code Execution Tool -- Programmatic Tool Calling (PTC)
   4  
   5  Lets the LLM write a Python script that calls Hermes tools via RPC,
   6  collapsing multi-step tool chains into a single inference turn.
   7  
   8  Architecture (two transports):
   9  
  10    **Local backend (UDS):**
  11    1. Parent generates a `hermes_tools.py` stub module with UDS RPC functions
  12    2. Parent opens a Unix domain socket and starts an RPC listener thread
  13    3. Parent spawns a child process that runs the LLM's script
  14    4. Tool calls travel over the UDS back to the parent for dispatch
  15  
  16    **Remote backends (file-based RPC):**
  17    1. Parent generates `hermes_tools.py` with file-based RPC stubs
  18    2. Parent ships both files to the remote environment
  19    3. Script runs inside the terminal backend (Docker/SSH/Modal/Daytona/etc.)
  20    4. Tool calls are written as request files; a polling thread on the parent
  21       reads them via env.execute(), dispatches, and writes response files
  22    5. The script polls for response files and continues
  23  
  24  In both cases, only the script's stdout is returned to the LLM; intermediate
  25  tool results never enter the context window.
  26  
  27  Platform: Linux / macOS only (Unix domain sockets for local). Disabled on Windows.
  28  Remote execution additionally requires Python 3 in the terminal backend.
  29  """
  30  
  31  import base64
  32  import functools
  33  import json
  34  import logging
  35  import os
  36  import platform
  37  import shlex
  38  import signal
  39  import socket
  40  import subprocess
  41  import sys
  42  import tempfile
  43  import threading
  44  import time
  45  import uuid
  46  
# NOTE(review): _IS_WINDOWS is not referenced anywhere in this chunk —
# presumably used later in the file; confirm before removing.
_IS_WINDOWS = platform.system() == "Windows"
from typing import Any, Dict, List, Optional

# Module-level logger; configured by the host application.
logger = logging.getLogger(__name__)

# Availability gate: the local sandbox transport uses Unix domain sockets,
# which do not exist on Windows.
SANDBOX_AVAILABLE = sys.platform != "win32"

# The 7 tools allowed inside the sandbox. The intersection of this list
# and the session's enabled tools determines which stubs are generated
# (see generate_hermes_tools_module).
SANDBOX_ALLOWED_TOOLS = frozenset([
    "web_search",
    "web_extract",
    "read_file",
    "write_file",
    "search_files",
    "patch",
    "terminal",
])

# Resource limit defaults (overridable via config.yaml → code_execution.*)
DEFAULT_TIMEOUT = 300        # 5 minutes
DEFAULT_MAX_TOOL_CALLS = 50
MAX_STDOUT_BYTES = 50_000    # 50 KB
MAX_STDERR_BYTES = 10_000    # 10 KB
  72  
  73  
  74  def check_sandbox_requirements() -> bool:
  75      """Code execution sandbox requires a POSIX OS for Unix domain sockets."""
  76      if not SANDBOX_AVAILABLE:
  77          return False
  78  
  79      try:
  80          from tools.terminal_tool import (
  81              _check_vercel_sandbox_requirements,
  82              _get_env_config,
  83          )
  84  
  85          config = _get_env_config()
  86      except Exception:
  87          logger.debug("Could not resolve terminal config for execute_code availability", exc_info=True)
  88          return False
  89  
  90      if config.get("env_type") == "vercel_sandbox":
  91          return _check_vercel_sandbox_requirements(config)
  92  
  93      return True
  94  
  95  
  96  # ---------------------------------------------------------------------------
  97  # hermes_tools.py code generator
  98  # ---------------------------------------------------------------------------
  99  
# Per-tool stub templates: (function_name, signature, docstring, args_dict_expr)
# The args_dict_expr builds the JSON payload sent over the RPC socket.
# NOTE: every entry's signature and args_dict_expr are interpolated verbatim
# into generated Python source (see generate_hermes_tools_module), so each
# must remain a valid Python parameter list / expression.
_TOOL_STUBS = {
    "web_search": (
        "web_search",
        "query: str, limit: int = 5",
        '"""Search the web. Returns dict with data.web list of {url, title, description}."""',
        '{"query": query, "limit": limit}',
    ),
    "web_extract": (
        "web_extract",
        "urls: list",
        '"""Extract content from URLs. Returns dict with results list of {url, title, content, error}."""',
        '{"urls": urls}',
    ),
    "read_file": (
        "read_file",
        "path: str, offset: int = 1, limit: int = 500",
        '"""Read a file (1-indexed lines). Returns dict with "content" and "total_lines"."""',
        '{"path": path, "offset": offset, "limit": limit}',
    ),
    "write_file": (
        "write_file",
        "path: str, content: str",
        '"""Write content to a file (always overwrites). Returns dict with status."""',
        '{"path": path, "content": content}',
    ),
    "search_files": (
        "search_files",
        'pattern: str, target: str = "content", path: str = ".", file_glob: str = None, limit: int = 50, offset: int = 0, output_mode: str = "content", context: int = 0',
        '"""Search file contents (target="content") or find files by name (target="files"). Returns dict with "matches"."""',
        '{"pattern": pattern, "target": target, "path": path, "file_glob": file_glob, "limit": limit, "offset": offset, "output_mode": output_mode, "context": context}',
    ),
    "patch": (
        "patch",
        'path: str = None, old_string: str = None, new_string: str = None, replace_all: bool = False, mode: str = "replace", patch: str = None',
        '"""Targeted find-and-replace (mode="replace") or V4A multi-file patches (mode="patch"). Returns dict with status."""',
        '{"path": path, "old_string": old_string, "new_string": new_string, "replace_all": replace_all, "mode": mode, "patch": patch}',
    ),
    "terminal": (
        "terminal",
        "command: str, timeout: int = None, workdir: str = None",
        '"""Run a shell command (foreground only). Returns dict with "output" and "exit_code"."""',
        '{"command": command, "timeout": timeout, "workdir": workdir}',
    ),
}
 146  
 147  
 148  def generate_hermes_tools_module(enabled_tools: List[str],
 149                                   transport: str = "uds") -> str:
 150      """
 151      Build the source code for the hermes_tools.py stub module.
 152  
 153      Only tools in both SANDBOX_ALLOWED_TOOLS and enabled_tools get stubs.
 154  
 155      Args:
 156          enabled_tools: Tool names enabled in the current session.
 157          transport: ``"uds"`` for Unix domain socket (local backend) or
 158                     ``"file"`` for file-based RPC (remote backends).
 159      """
 160      tools_to_generate = sorted(SANDBOX_ALLOWED_TOOLS & set(enabled_tools))
 161  
 162      stub_functions = []
 163      export_names = []
 164      for tool_name in tools_to_generate:
 165          if tool_name not in _TOOL_STUBS:
 166              continue
 167          func_name, sig, doc, args_expr = _TOOL_STUBS[tool_name]
 168          stub_functions.append(
 169              f"def {func_name}({sig}):\n"
 170              f"    {doc}\n"
 171              f"    return _call({func_name!r}, {args_expr})\n"
 172          )
 173          export_names.append(func_name)
 174  
 175      if transport == "file":
 176          header = _FILE_TRANSPORT_HEADER
 177      else:
 178          header = _UDS_TRANSPORT_HEADER
 179  
 180      return header + "\n".join(stub_functions)
 181  
 182  
# ---- Shared helpers section (embedded in both transport headers) ----------

# This literal is spliced into both generated transport headers; its contents
# become part of the hermes_tools.py module executed inside the sandbox, so
# any edit here changes sandbox-side runtime behavior.
_COMMON_HELPERS = '''\

# ---------------------------------------------------------------------------
# Convenience helpers (avoid common scripting pitfalls)
# ---------------------------------------------------------------------------

def json_parse(text: str):
    """Parse JSON tolerant of control characters (strict=False).
    Use this instead of json.loads() when parsing output from terminal()
    or web_extract() that may contain raw tabs/newlines in strings."""
    return json.loads(text, strict=False)


def shell_quote(s: str) -> str:
    """Shell-escape a string for safe interpolation into commands.
    Use this when inserting dynamic content into terminal() commands:
        terminal(f"echo {shell_quote(user_input)}")
    """
    return shlex.quote(s)


def retry(fn, max_attempts=3, delay=2):
    """Retry a function up to max_attempts times with exponential backoff.
    Use for transient failures (network errors, API rate limits):
        result = retry(lambda: terminal("gh issue list ..."))
    """
    last_err = None
    for attempt in range(max_attempts):
        try:
            return fn()
        except Exception as e:
            last_err = e
            if attempt < max_attempts - 1:
                time.sleep(delay * (2 ** attempt))
    raise last_err

'''
 222  
# ---- UDS transport (local backend) ---------------------------------------

# Generated-module header for the Unix-domain-socket transport. The embedded
# code runs inside the sandboxed child process; it connects to the parent's
# RPC socket named by the HERMES_RPC_SOCKET environment variable.
_UDS_TRANSPORT_HEADER = '''\
"""Auto-generated Hermes tools RPC stubs."""
import json, os, socket, shlex, threading, time

_sock = None
# The RPC server handles a single client connection serially and has no
# request-id in the protocol, so concurrent _call() invocations from multiple
# threads (e.g. ThreadPoolExecutor) would race on the shared socket and get
# each other's responses. Serialize the entire send+recv round-trip.
_call_lock = threading.Lock()
''' + _COMMON_HELPERS + '''\

def _connect():
    global _sock
    if _sock is None:
        _sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        _sock.connect(os.environ["HERMES_RPC_SOCKET"])
        _sock.settimeout(300)
    return _sock

def _call(tool_name, args):
    """Send a tool call to the parent process and return the parsed result."""
    request = json.dumps({"tool": tool_name, "args": args}) + "\\n"
    with _call_lock:
        conn = _connect()
        conn.sendall(request.encode())
        buf = b""
        while True:
            chunk = conn.recv(65536)
            if not chunk:
                raise RuntimeError("Agent process disconnected")
            buf += chunk
            if buf.endswith(b"\\n"):
                break
    raw = buf.decode().strip()
    result = json.loads(raw)
    if isinstance(result, str):
        try:
            return json.loads(result)
        except (json.JSONDecodeError, TypeError):
            return result
    return result

'''
 269  
# ---- File-based transport (remote backends) -------------------------------

# Generated-module header for the file-based transport. The embedded code runs
# inside the remote terminal backend; requests/responses are exchanged as
# req_NNNNNN / res_NNNNNN files in HERMES_RPC_DIR, with the parent-side
# _rpc_poll_loop acting as the server.
_FILE_TRANSPORT_HEADER = '''\
"""Auto-generated Hermes tools RPC stubs (file-based transport)."""
import json, os, shlex, tempfile, threading, time

_RPC_DIR = os.environ.get("HERMES_RPC_DIR") or os.path.join(tempfile.gettempdir(), "hermes_rpc")
_seq = 0
# `_seq += 1` is not atomic (read-modify-write), so concurrent _call()
# invocations from multiple threads could allocate the same sequence number
# and clobber each other's request files. Guard seq allocation with a lock.
_seq_lock = threading.Lock()
''' + _COMMON_HELPERS + '''\

def _call(tool_name, args):
    """Send a tool call request via file-based RPC and wait for response."""
    global _seq
    with _seq_lock:
        _seq += 1
        seq = _seq
    seq_str = f"{seq:06d}"
    req_file = os.path.join(_RPC_DIR, f"req_{seq_str}")
    res_file = os.path.join(_RPC_DIR, f"res_{seq_str}")

    # Write request atomically (write to .tmp, then rename)
    tmp = req_file + ".tmp"
    with open(tmp, "w") as f:
        json.dump({"tool": tool_name, "args": args, "seq": seq}, f)
    os.rename(tmp, req_file)

    # Wait for response with adaptive polling
    deadline = time.monotonic() + 300  # 5-minute timeout per tool call
    poll_interval = 0.05  # Start at 50ms
    while not os.path.exists(res_file):
        if time.monotonic() > deadline:
            raise RuntimeError(f"RPC timeout: no response for {tool_name} after 300s")
        time.sleep(poll_interval)
        poll_interval = min(poll_interval * 1.2, 0.25)  # Back off to 250ms

    with open(res_file) as f:
        raw = f.read()

    # Clean up response file
    try:
        os.unlink(res_file)
    except OSError:
        pass

    result = json.loads(raw)
    if isinstance(result, str):
        try:
            return json.loads(result)
        except (json.JSONDecodeError, TypeError):
            return result
    return result

'''
 327  
 328  
 329  # ---------------------------------------------------------------------------
 330  # RPC server (runs in a thread inside the parent process)
 331  # ---------------------------------------------------------------------------
 332  
# Terminal parameters that must not be used from ephemeral sandbox scripts
# (presumably because they start background/interactive sessions that would
# outlive the one-shot script — confirm against the terminal tool). Both RPC
# dispatch loops silently strip these from args rather than rejecting the call.
_TERMINAL_BLOCKED_PARAMS = {"background", "pty", "notify_on_complete", "watch_patterns"}
 335  
 336  
def _rpc_server_loop(
    server_sock: socket.socket,
    task_id: str,
    tool_call_log: list,
    tool_call_counter: list,   # mutable [int] so the thread can increment
    max_tool_calls: int,
    allowed_tools: frozenset,
):
    """
    Accept one client connection and dispatch tool-call requests until
    the client disconnects or the call limit is reached.

    Runs in a background thread inside the parent process (UDS transport).
    Wire protocol: newline-delimited JSON, one request object
    ``{"tool": ..., "args": ...}`` per line; each response is a single JSON
    line. There is no request id, so the client must serialize its calls.

    Args:
        server_sock: Listening AF_UNIX socket; exactly one client is accepted.
        task_id: Forwarded to ``handle_function_call`` for dispatch.
        tool_call_log: Mutable list; one observability entry is appended per
            dispatched call.
        tool_call_counter: Single-element mutable list so the caller can read
            the final count after this thread exits.
        max_tool_calls: Hard cap; calls beyond it receive an error response.
        allowed_tools: Tool names permitted for dispatch.
    """
    from model_tools import handle_function_call

    conn = None
    try:
        # Short accept timeout: if the child process dies before connecting,
        # this thread exits instead of blocking forever.
        server_sock.settimeout(5)
        conn, _ = server_sock.accept()
        conn.settimeout(300)

        buf = b""
        while True:
            try:
                chunk = conn.recv(65536)
            except socket.timeout:
                break
            if not chunk:
                break  # client closed its end of the socket
            buf += chunk

            # Process all complete newline-delimited messages in the buffer
            while b"\n" in buf:
                line, buf = buf.split(b"\n", 1)
                line = line.strip()
                if not line:
                    continue

                call_start = time.monotonic()
                try:
                    request = json.loads(line.decode())
                except (json.JSONDecodeError, UnicodeDecodeError) as exc:
                    # NOTE(review): tool_error is not defined in this chunk —
                    # presumably a module-level helper returning a JSON error
                    # string; confirm it exists elsewhere in the file.
                    resp = tool_error(f"Invalid RPC request: {exc}")
                    conn.sendall((resp + "\n").encode())
                    continue

                tool_name = request.get("tool", "")
                tool_args = request.get("args", {})

                # Enforce the allow-list
                if tool_name not in allowed_tools:
                    available = ", ".join(sorted(allowed_tools))
                    resp = json.dumps({
                        "error": (
                            f"Tool '{tool_name}' is not available in execute_code. "
                            f"Available: {available}"
                        )
                    })
                    conn.sendall((resp + "\n").encode())
                    continue

                # Enforce tool call limit (rejected calls do not increment the
                # counter, so the cap applies to dispatched calls only)
                if tool_call_counter[0] >= max_tool_calls:
                    resp = json.dumps({
                        "error": (
                            f"Tool call limit reached ({max_tool_calls}). "
                            "No more tool calls allowed in this execution."
                        )
                    })
                    conn.sendall((resp + "\n").encode())
                    continue

                # Strip forbidden terminal parameters (silently dropped)
                if tool_name == "terminal" and isinstance(tool_args, dict):
                    for param in _TERMINAL_BLOCKED_PARAMS:
                        tool_args.pop(param, None)

                # Dispatch through the standard tool handler.
                # Suppress stdout/stderr from internal tool handlers so
                # their status prints don't leak into the CLI spinner.
                try:
                    _real_stdout, _real_stderr = sys.stdout, sys.stderr
                    devnull = open(os.devnull, "w")
                    try:
                        sys.stdout = devnull
                        sys.stderr = devnull
                        result = handle_function_call(
                            tool_name, tool_args, task_id=task_id
                        )
                    finally:
                        # Restore streams even if the handler raised.
                        sys.stdout, sys.stderr = _real_stdout, _real_stderr
                        devnull.close()
                except Exception as exc:
                    logger.error("Tool call failed in sandbox: %s", exc, exc_info=True)
                    result = tool_error(str(exc))

                tool_call_counter[0] += 1
                call_duration = time.monotonic() - call_start

                # Log for observability
                args_preview = str(tool_args)[:80]
                tool_call_log.append({
                    "tool": tool_name,
                    "args_preview": args_preview,
                    "duration": round(call_duration, 2),
                })

                # handle_function_call is expected to return a str here (it is
                # concatenated with the newline terminator) — TODO confirm.
                conn.sendall((result + "\n").encode())

    except socket.timeout:
        logger.debug("RPC listener socket timeout")
    except OSError as e:
        logger.debug("RPC listener socket error: %s", e, exc_info=True)
    finally:
        if conn:
            try:
                conn.close()
            except OSError as e:
                logger.debug("RPC conn close error: %s", e)
 455  
 456  
 457  # ---------------------------------------------------------------------------
 458  # Remote execution support (file-based RPC via terminal backend)
 459  # ---------------------------------------------------------------------------
 460  
def _get_or_create_env(task_id: str):
    """Get or create the terminal environment for *task_id*.

    Reuses the same environment (container/sandbox/SSH session) that the
    terminal and file tools use, creating one if it doesn't exist yet.
    Returns ``(env, env_type)`` tuple.
    """
    from tools.terminal_tool import (
        _active_environments, _env_lock, _create_environment,
        _get_env_config, _last_activity, _start_cleanup_thread,
        _creation_locks, _creation_locks_lock, _task_env_overrides,
        _resolve_container_task_id,
    )

    effective_task_id = _resolve_container_task_id(task_id)

    # Fast path: environment already exists
    with _env_lock:
        if effective_task_id in _active_environments:
            _last_activity[effective_task_id] = time.time()
            return _active_environments[effective_task_id], _get_env_config()["env_type"]

    # Slow path: create environment (same pattern as file_tools._get_file_ops)
    # Per-task creation lock so two threads don't build the same environment
    # concurrently; the _creation_locks dict itself is guarded by its own lock.
    with _creation_locks_lock:
        if effective_task_id not in _creation_locks:
            _creation_locks[effective_task_id] = threading.Lock()
        task_lock = _creation_locks[effective_task_id]

    with task_lock:
        # Double-check under _env_lock: another thread may have finished
        # creating the environment while we waited on task_lock.
        with _env_lock:
            if effective_task_id in _active_environments:
                _last_activity[effective_task_id] = time.time()
                return _active_environments[effective_task_id], _get_env_config()["env_type"]

        config = _get_env_config()
        env_type = config["env_type"]
        overrides = _task_env_overrides.get(effective_task_id, {})

        # Image selection: per-task override wins over the global config value.
        if env_type == "docker":
            image = overrides.get("docker_image") or config["docker_image"]
        elif env_type == "singularity":
            image = overrides.get("singularity_image") or config["singularity_image"]
        elif env_type == "modal":
            image = overrides.get("modal_image") or config["modal_image"]
        elif env_type == "daytona":
            image = overrides.get("daytona_image") or config["daytona_image"]
        else:
            # ssh / local / vercel_sandbox have no image concept here.
            image = ""

        cwd = overrides.get("cwd") or config["cwd"]

        container_config = None
        if env_type in ("docker", "singularity", "modal", "daytona", "vercel_sandbox"):
            container_config = {
                "container_cpu": config.get("container_cpu", 1),
                "container_memory": config.get("container_memory", 5120),
                "container_disk": config.get("container_disk", 51200),
                "container_persistent": config.get("container_persistent", True),
                "vercel_runtime": config.get("vercel_runtime", ""),
                "docker_volumes": config.get("docker_volumes", []),
                "docker_run_as_host_user": config.get("docker_run_as_host_user", False),
            }

        ssh_config = None
        if env_type == "ssh":
            ssh_config = {
                "host": config.get("ssh_host", ""),
                "user": config.get("ssh_user", ""),
                "port": config.get("ssh_port", 22),
                "key": config.get("ssh_key", ""),
                "persistent": config.get("ssh_persistent", False),
            }

        local_config = None
        if env_type == "local":
            local_config = {
                "persistent": config.get("local_persistent", False),
            }

        logger.info("Creating new %s environment for execute_code task %s...",
                     env_type, effective_task_id[:8])
        env = _create_environment(
            env_type=env_type,
            image=image,
            cwd=cwd,
            timeout=config["timeout"],
            ssh_config=ssh_config,
            container_config=container_config,
            local_config=local_config,
            task_id=effective_task_id,
            host_cwd=config.get("host_cwd"),
        )

        # Publish the new environment under the shared lock.
        with _env_lock:
            _active_environments[effective_task_id] = env
            _last_activity[effective_task_id] = time.time()

        _start_cleanup_thread()
        logger.info("%s environment ready for execute_code task %s",
                     env_type, effective_task_id[:8])
        return env, env_type
 562  
 563  
 564  def _ship_file_to_remote(env, remote_path: str, content: str) -> None:
 565      """Write *content* to *remote_path* on the remote environment.
 566  
 567      Uses ``echo … | base64 -d`` rather than stdin piping because some
 568      backends (Modal) don't reliably deliver stdin_data to chained
 569      commands.  Base64 output is shell-safe ([A-Za-z0-9+/=]) so single
 570      quotes are fine.
 571      """
 572      encoded = base64.b64encode(content.encode("utf-8")).decode("ascii")
 573      quoted_remote_path = shlex.quote(remote_path)
 574      env.execute(
 575          f"echo '{encoded}' | base64 -d > {quoted_remote_path}",
 576          cwd="/",
 577          timeout=30,
 578      )
 579  
 580  
 581  def _env_temp_dir(env: Any) -> str:
 582      """Return a writable temp dir for env-backed execute_code sandboxes."""
 583      get_temp_dir = getattr(env, "get_temp_dir", None)
 584      if callable(get_temp_dir):
 585          try:
 586              temp_dir = get_temp_dir()
 587              if isinstance(temp_dir, str) and temp_dir.startswith("/"):
 588                  return temp_dir.rstrip("/") or "/"
 589          except Exception as exc:
 590              logger.debug("Could not resolve execute_code env temp dir: %s", exc)
 591      candidate = tempfile.gettempdir()
 592      if isinstance(candidate, str) and candidate.startswith("/"):
 593          return candidate.rstrip("/") or "/"
 594      return "/tmp"
 595  
 596  
def _rpc_poll_loop(
    env,
    rpc_dir: str,
    task_id: str,
    tool_call_log: list,
    tool_call_counter: list,
    max_tool_calls: int,
    allowed_tools: frozenset,
    stop_event: threading.Event,
):
    """Poll the remote filesystem for tool call requests and dispatch them.

    Runs in a background thread.  Each ``env.execute()`` spawns an
    independent process, so these calls run safely concurrent with the
    script-execution thread.

    Args:
        env: Terminal environment whose ``execute()`` returns a dict with at
            least an ``"output"`` key.
        rpc_dir: Remote directory holding ``req_*`` / ``res_*`` files.
        task_id: Forwarded to ``handle_function_call`` for dispatch.
        tool_call_log: Mutable list of observability entries.
        tool_call_counter: Single-element mutable [int] call counter.
        max_tool_calls: Hard cap on dispatched calls.
        allowed_tools: Tool names permitted for dispatch.
        stop_event: Set by the parent to stop this loop.
    """
    from model_tools import handle_function_call

    poll_interval = 0.1  # 100 ms

    quoted_rpc_dir = shlex.quote(rpc_dir)
    while not stop_event.is_set():
        try:
            # List pending request files (skip .tmp partials)
            ls_result = env.execute(
                f"ls -1 {quoted_rpc_dir}/req_* 2>/dev/null || true",
                cwd="/",
                timeout=10,
            )
            output = ls_result.get("output", "").strip()
            if not output:
                stop_event.wait(poll_interval)
                continue

            # Sorted so requests are served in sequence-number order.
            req_files = sorted([
                f.strip() for f in output.split("\n")
                if f.strip()
                and not f.strip().endswith(".tmp")
                and "/req_" in f.strip()
            ])

            for req_file in req_files:
                if stop_event.is_set():
                    break

                call_start = time.monotonic()

                quoted_req_file = shlex.quote(req_file)
                # Read request
                read_result = env.execute(
                    f"cat {quoted_req_file}",
                    cwd="/",
                    timeout=10,
                )
                try:
                    request = json.loads(read_result.get("output", ""))
                except (json.JSONDecodeError, ValueError):
                    logger.debug("Malformed RPC request in %s", req_file)
                    # Remove bad request to avoid infinite retry
                    env.execute(f"rm -f {quoted_req_file}", cwd="/", timeout=5)
                    continue

                tool_name = request.get("tool", "")
                tool_args = request.get("args", {})
                # seq names the response file; the client-side stub allocates
                # it under a lock, so a missing/duplicate seq only occurs on
                # malformed input.
                seq = request.get("seq", 0)
                seq_str = f"{seq:06d}"
                res_file = f"{rpc_dir}/res_{seq_str}"
                quoted_res_file = shlex.quote(res_file)

                # Enforce allow-list
                if tool_name not in allowed_tools:
                    available = ", ".join(sorted(allowed_tools))
                    tool_result = json.dumps({
                        "error": (
                            f"Tool '{tool_name}' is not available in execute_code. "
                            f"Available: {available}"
                        )
                    })
                # Enforce tool call limit (rejected calls are not counted)
                elif tool_call_counter[0] >= max_tool_calls:
                    tool_result = json.dumps({
                        "error": (
                            f"Tool call limit reached ({max_tool_calls}). "
                            "No more tool calls allowed in this execution."
                        )
                    })
                else:
                    # Strip forbidden terminal parameters (silently dropped)
                    if tool_name == "terminal" and isinstance(tool_args, dict):
                        for param in _TERMINAL_BLOCKED_PARAMS:
                            tool_args.pop(param, None)

                    # Dispatch through the standard tool handler, muting its
                    # stdout/stderr so status prints don't leak into the CLI.
                    try:
                        _real_stdout, _real_stderr = sys.stdout, sys.stderr
                        devnull = open(os.devnull, "w")
                        try:
                            sys.stdout = devnull
                            sys.stderr = devnull
                            tool_result = handle_function_call(
                                tool_name, tool_args, task_id=task_id
                            )
                        finally:
                            sys.stdout, sys.stderr = _real_stdout, _real_stderr
                            devnull.close()
                    except Exception as exc:
                        logger.error("Tool call failed in remote sandbox: %s",
                                     exc, exc_info=True)
                        # NOTE(review): tool_error is not defined in this
                        # chunk — presumably a module-level helper returning a
                        # JSON error string; confirm elsewhere in the file.
                        tool_result = tool_error(str(exc))

                    tool_call_counter[0] += 1
                    call_duration = time.monotonic() - call_start
                    tool_call_log.append({
                        "tool": tool_name,
                        "args_preview": str(tool_args)[:80],
                        "duration": round(call_duration, 2),
                    })

                # Write response atomically (tmp + rename).
                # Use echo piping (not stdin_data) because Modal doesn't
                # reliably deliver stdin to chained commands.
                encoded_result = base64.b64encode(
                    tool_result.encode("utf-8")
                ).decode("ascii")
                env.execute(
                    f"echo '{encoded_result}' | base64 -d > {quoted_res_file}.tmp"
                    f" && mv {quoted_res_file}.tmp {quoted_res_file}",
                    cwd="/",
                    timeout=60,
                )

                # Remove the request file
                env.execute(f"rm -f {quoted_req_file}", cwd="/", timeout=5)

        except Exception as e:
            # Best-effort loop: transient env.execute failures are logged and
            # retried on the next poll rather than killing the thread.
            if not stop_event.is_set():
                logger.debug("RPC poll error: %s", e, exc_info=True)

        if not stop_event.is_set():
            stop_event.wait(poll_interval)
 737  
 738  
 739  def _execute_remote(
 740      code: str,
 741      task_id: Optional[str],
 742      enabled_tools: Optional[List[str]],
 743  ) -> str:
 744      """Run a script on the remote terminal backend via file-based RPC.
 745  
 746      The script and the generated hermes_tools.py module are shipped to
 747      the remote environment, and tool calls are proxied through a polling
 748      thread that communicates via request/response files.
 749      """
 750  
 751      _cfg = _load_config()
 752      timeout = _cfg.get("timeout", DEFAULT_TIMEOUT)
 753      max_tool_calls = _cfg.get("max_tool_calls", DEFAULT_MAX_TOOL_CALLS)
 754  
 755      session_tools = set(enabled_tools) if enabled_tools else set()
 756      sandbox_tools = frozenset(SANDBOX_ALLOWED_TOOLS & session_tools)
 757      if not sandbox_tools:
 758          sandbox_tools = SANDBOX_ALLOWED_TOOLS
 759  
 760      effective_task_id = task_id or "default"
 761      env, env_type = _get_or_create_env(effective_task_id)
 762  
 763      sandbox_id = uuid.uuid4().hex[:12]
 764      temp_dir = _env_temp_dir(env)
 765      sandbox_dir = f"{temp_dir}/hermes_exec_{sandbox_id}"
 766      quoted_sandbox_dir = shlex.quote(sandbox_dir)
 767      quoted_rpc_dir = shlex.quote(f"{sandbox_dir}/rpc")
 768  
 769      tool_call_log: list = []
 770      tool_call_counter = [0]
 771      exec_start = time.monotonic()
 772      stop_event = threading.Event()
 773      rpc_thread = None
 774  
 775      try:
 776          # Verify Python is available on the remote
 777          py_check = env.execute(
 778              "command -v python3 >/dev/null 2>&1 && echo OK",
 779              cwd="/", timeout=15,
 780          )
 781          if "OK" not in py_check.get("output", ""):
 782              return json.dumps({
 783                  "status": "error",
 784                  "error": (
 785                      f"Python 3 is not available in the {env_type} terminal "
 786                      "environment. Install Python to use execute_code with "
 787                      "remote backends."
 788                  ),
 789                  "tool_calls_made": 0,
 790                  "duration_seconds": 0,
 791              })
 792  
 793          # Create sandbox directory on remote
 794          env.execute(
 795              f"mkdir -p {quoted_rpc_dir}", cwd="/", timeout=10,
 796          )
 797  
 798          # Generate and ship files
 799          tools_src = generate_hermes_tools_module(
 800              list(sandbox_tools), transport="file",
 801          )
 802          _ship_file_to_remote(env, f"{sandbox_dir}/hermes_tools.py", tools_src)
 803          _ship_file_to_remote(env, f"{sandbox_dir}/script.py", code)
 804  
 805          # Start RPC polling thread
 806          rpc_thread = threading.Thread(
 807              target=_rpc_poll_loop,
 808              args=(
 809                  env, f"{sandbox_dir}/rpc", effective_task_id,
 810                  tool_call_log, tool_call_counter, max_tool_calls,
 811                  sandbox_tools, stop_event,
 812              ),
 813              daemon=True,
 814          )
 815          rpc_thread.start()
 816  
 817          # Build environment variable prefix for the script
 818          env_prefix = (
 819              f"HERMES_RPC_DIR={shlex.quote(f'{sandbox_dir}/rpc')} "
 820              f"PYTHONDONTWRITEBYTECODE=1"
 821          )
 822          tz = os.getenv("HERMES_TIMEZONE", "").strip()
 823          if tz:
 824              env_prefix += f" TZ={tz}"
 825  
 826          # Execute the script on the remote backend
 827          logger.info("Executing code on %s backend (task %s)...",
 828                       env_type, effective_task_id[:8])
 829          script_result = env.execute(
 830              f"cd {quoted_sandbox_dir} && {env_prefix} python3 script.py",
 831              timeout=timeout,
 832          )
 833  
 834          stdout_text = script_result.get("output", "")
 835          exit_code = script_result.get("returncode", -1)
 836          status = "success"
 837  
 838          # Check for timeout/interrupt from the backend
 839          if exit_code == 124:
 840              status = "timeout"
 841          elif exit_code == 130:
 842              status = "interrupted"
 843  
 844      except Exception as exc:
 845          duration = round(time.monotonic() - exec_start, 2)
 846          logger.error(
 847              "execute_code remote failed after %ss with %d tool calls: %s: %s",
 848              duration, tool_call_counter[0], type(exc).__name__, exc,
 849              exc_info=True,
 850          )
 851          return json.dumps({
 852              "status": "error",
 853              "error": str(exc),
 854              "tool_calls_made": tool_call_counter[0],
 855              "duration_seconds": duration,
 856          }, ensure_ascii=False)
 857  
 858      finally:
 859          # Stop the polling thread
 860          stop_event.set()
 861          if rpc_thread is not None:
 862              rpc_thread.join(timeout=5)
 863  
 864          # Clean up remote sandbox dir
 865          try:
 866              env.execute(
 867                  f"rm -rf {quoted_sandbox_dir}", cwd="/", timeout=15,
 868              )
 869          except Exception:
 870              logger.debug("Failed to clean up remote sandbox %s", sandbox_dir)
 871  
 872      duration = round(time.monotonic() - exec_start, 2)
 873  
 874      # --- Post-process output (same as local path) ---
 875  
 876      # Truncate stdout to cap
 877      if len(stdout_text) > MAX_STDOUT_BYTES:
 878          head_bytes = int(MAX_STDOUT_BYTES * 0.4)
 879          tail_bytes = MAX_STDOUT_BYTES - head_bytes
 880          head = stdout_text[:head_bytes]
 881          tail = stdout_text[-tail_bytes:]
 882          omitted = len(stdout_text) - len(head) - len(tail)
 883          stdout_text = (
 884              head
 885              + f"\n\n... [OUTPUT TRUNCATED - {omitted:,} chars omitted "
 886              f"out of {len(stdout_text):,} total] ...\n\n"
 887              + tail
 888          )
 889  
 890      # Strip ANSI escape sequences
 891      from tools.ansi_strip import strip_ansi
 892      stdout_text = strip_ansi(stdout_text)
 893  
 894      # Redact secrets
 895      from agent.redact import redact_sensitive_text
 896      stdout_text = redact_sensitive_text(stdout_text)
 897  
 898      # Build response
 899      result: Dict[str, Any] = {
 900          "status": status,
 901          "output": stdout_text,
 902          "tool_calls_made": tool_call_counter[0],
 903          "duration_seconds": duration,
 904      }
 905  
 906      if status == "timeout":
 907          timeout_msg = f"Script timed out after {timeout}s and was killed."
 908          result["error"] = timeout_msg
 909          # Include timeout message in output so the LLM always surfaces it
 910          # to the user (see local path comment — same reasoning, #10807).
 911          if stdout_text:
 912              result["output"] = stdout_text + f"\n\n⏰ {timeout_msg}"
 913          else:
 914              result["output"] = f"⏰ {timeout_msg}"
 915          logger.warning(
 916              "execute_code (remote) timed out after %ss (limit %ss) with %d tool calls",
 917              duration, timeout, tool_call_counter[0],
 918          )
 919      elif status == "interrupted":
 920          result["output"] = (
 921              stdout_text + "\n[execution interrupted — user sent a new message]"
 922          )
 923      elif exit_code != 0:
 924          result["status"] = "error"
 925          result["error"] = f"Script exited with code {exit_code}"
 926  
 927      return json.dumps(result, ensure_ascii=False)
 928  
 929  
 930  # ---------------------------------------------------------------------------
 931  # Main entry point
 932  # ---------------------------------------------------------------------------
 933  
def execute_code(
    code: str,
    task_id: Optional[str] = None,
    enabled_tools: Optional[List[str]] = None,
) -> str:
    """
    Run a Python script in a sandboxed child process with RPC access
    to a subset of Hermes tools.

    Dispatches to the local (UDS) or remote (file-based RPC) path
    depending on the configured terminal backend.

    Args:
        code:          Python source code to execute.
        task_id:       Session task ID for tool isolation (terminal env, etc.).
        enabled_tools: Tool names enabled in the current session. The sandbox
                       gets the intersection with SANDBOX_ALLOWED_TOOLS.

    Returns:
        JSON string with execution results (status, output, tool call
        count, duration; plus an "error" field on failure/timeout).
    """
    # Hard gate: the local path needs Unix domain sockets (POSIX only).
    if not SANDBOX_AVAILABLE:
        return json.dumps({
            "error": "execute_code is not available on Windows. Use normal tool calls instead."
        })

    if not code or not code.strip():
        return tool_error("No code provided.")

    # Dispatch: remote backends use file-based RPC, local uses UDS
    from tools.terminal_tool import _get_env_config
    env_type = _get_env_config()["env_type"]
    if env_type != "local":
        return _execute_remote(code, task_id, enabled_tools)

    # --- Local execution path (UDS) --- below this line is unchanged ---

    # Import per-thread interrupt check (cooperative cancellation)
    from tools.interrupt import is_interrupted as _is_interrupted

    # Resolve config
    _cfg = _load_config()
    timeout = _cfg.get("timeout", DEFAULT_TIMEOUT)
    max_tool_calls = _cfg.get("max_tool_calls", DEFAULT_MAX_TOOL_CALLS)

    # Determine which tools the sandbox can call
    session_tools = set(enabled_tools) if enabled_tools else set()
    sandbox_tools = frozenset(SANDBOX_ALLOWED_TOOLS & session_tools)

    # Empty intersection means no session filter applies — allow the
    # full sandbox-safe set rather than nothing.
    if not sandbox_tools:
        sandbox_tools = SANDBOX_ALLOWED_TOOLS

    # --- Set up temp directory with hermes_tools.py and script.py ---
    tmpdir = tempfile.mkdtemp(prefix="hermes_sandbox_")
    # Use /tmp on macOS to avoid the long /var/folders/... path that pushes
    # Unix domain socket paths past the 104-byte macOS AF_UNIX limit.
    # On Linux, tempfile.gettempdir() already returns /tmp.
    _sock_tmpdir = "/tmp" if sys.platform == "darwin" else tempfile.gettempdir()
    sock_path = os.path.join(_sock_tmpdir, f"hermes_rpc_{uuid.uuid4().hex}.sock")

    tool_call_log: list = []
    tool_call_counter = [0]  # mutable so the RPC thread can increment
    exec_start = time.monotonic()
    server_sock = None

    try:
        # Write the auto-generated hermes_tools module
        # sandbox_tools is already the correct set (intersection with session
        # tools, or SANDBOX_ALLOWED_TOOLS as fallback — see lines above).
        tools_src = generate_hermes_tools_module(list(sandbox_tools))
        with open(os.path.join(tmpdir, "hermes_tools.py"), "w") as f:
            f.write(tools_src)

        # Write the user's script
        with open(os.path.join(tmpdir, "script.py"), "w") as f:
            f.write(code)

        # --- Start UDS server ---
        server_sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        server_sock.bind(sock_path)
        # Owner-only permissions: the socket dispatches tool calls.
        os.chmod(sock_path, 0o600)
        server_sock.listen(1)

        rpc_thread = threading.Thread(
            target=_rpc_server_loop,
            args=(
                server_sock, task_id, tool_call_log,
                tool_call_counter, max_tool_calls, sandbox_tools,
            ),
            daemon=True,
        )
        rpc_thread.start()

        # --- Spawn child process ---
        # Build a minimal environment for the child. We intentionally exclude
        # API keys and tokens to prevent credential exfiltration from LLM-
        # generated scripts. The child accesses tools via RPC, not direct API.
        # Exception: env vars declared by loaded skills (via env_passthrough
        # registry) or explicitly allowed by the user in config.yaml
        # (terminal.env_passthrough) are passed through.
        _SAFE_ENV_PREFIXES = ("PATH", "HOME", "USER", "LANG", "LC_", "TERM",
                              "TMPDIR", "TMP", "TEMP", "SHELL", "LOGNAME",
                              "XDG_", "PYTHONPATH", "VIRTUAL_ENV", "CONDA",
                              "HERMES_")
        _SECRET_SUBSTRINGS = ("KEY", "TOKEN", "SECRET", "PASSWORD", "CREDENTIAL",
                              "PASSWD", "AUTH")
        try:
            from tools.env_passthrough import is_env_passthrough as _is_passthrough
        except Exception:
            # Registry unavailable — default to passing nothing through.
            _is_passthrough = lambda _: False  # noqa: E731
        child_env = {}
        for k, v in os.environ.items():
            # Passthrough vars (skill-declared or user-configured) always pass.
            if _is_passthrough(k):
                child_env[k] = v
                continue
            # Block vars with secret-like names.
            if any(s in k.upper() for s in _SECRET_SUBSTRINGS):
                continue
            # Allow vars with known safe prefixes.
            if any(k.startswith(p) for p in _SAFE_ENV_PREFIXES):
                child_env[k] = v
        child_env["HERMES_RPC_SOCKET"] = sock_path
        child_env["PYTHONDONTWRITEBYTECODE"] = "1"
        # Ensure the hermes-agent root is importable in the sandbox so
        # repo-root modules are available to child scripts.  We also prepend
        # the staging tmpdir so ``from hermes_tools import ...`` resolves even
        # when the subprocess CWD is not tmpdir (project mode).
        _hermes_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        _existing_pp = child_env.get("PYTHONPATH", "")
        _pp_parts = [tmpdir, _hermes_root]
        if _existing_pp:
            _pp_parts.append(_existing_pp)
        child_env["PYTHONPATH"] = os.pathsep.join(_pp_parts)
        # Inject user's configured timezone so datetime.now() in sandboxed
        # code reflects the correct wall-clock time.  Only TZ is set —
        # HERMES_TIMEZONE is an internal Hermes setting and must not leak
        # into child processes.
        _tz_name = os.getenv("HERMES_TIMEZONE", "").strip()
        if _tz_name:
            child_env["TZ"] = _tz_name
        child_env.pop("HERMES_TIMEZONE", None)

        # Per-profile HOME isolation: redirect system tool configs into
        # {HERMES_HOME}/home/ when that directory exists.
        from hermes_constants import get_subprocess_home
        _profile_home = get_subprocess_home()
        if _profile_home:
            child_env["HOME"] = _profile_home

        # Resolve interpreter + CWD based on execute_code mode.
        #   - strict : today's behavior (sys.executable + tmpdir CWD).
        #   - project: user's venv python + session's working directory, so
        #              project deps like pandas and user files resolve.
        # Env scrubbing and tool whitelist apply identically in both modes.
        _mode = _get_execution_mode()
        _child_python = _resolve_child_python(_mode)
        _child_cwd = _resolve_child_cwd(_mode, tmpdir)
        _script_path = os.path.join(tmpdir, "script.py")

        # os.setsid puts the child in its own process group so
        # _kill_process_group can signal the whole tree at once.
        proc = subprocess.Popen(
            [_child_python, _script_path],
            cwd=_child_cwd,
            env=child_env,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            stdin=subprocess.DEVNULL,
            preexec_fn=None if _IS_WINDOWS else os.setsid,
        )

        # --- Poll loop: watch for exit, timeout, and interrupt ---
        deadline = time.monotonic() + timeout
        stderr_chunks: list = []

        # Background readers to avoid pipe buffer deadlocks.
        # For stdout we use a head+tail strategy: keep the first HEAD_BYTES
        # and a rolling window of the last TAIL_BYTES so the final print()
        # output is never lost.  Stderr keeps head-only (errors appear early).
        _STDOUT_HEAD_BYTES = int(MAX_STDOUT_BYTES * 0.4)   # 40% head
        _STDOUT_TAIL_BYTES = MAX_STDOUT_BYTES - _STDOUT_HEAD_BYTES  # 60% tail

        def _drain(pipe, chunks, max_bytes):
            """Simple head-only drain (used for stderr)."""
            total = 0
            try:
                while True:
                    data = pipe.read(4096)
                    if not data:
                        break
                    if total < max_bytes:
                        keep = max_bytes - total
                        chunks.append(data[:keep])
                    total += len(data)
            except (ValueError, OSError) as e:
                logger.debug("Error reading process output: %s", e, exc_info=True)

        stdout_total_bytes = [0]  # mutable ref for total bytes seen

        def _drain_head_tail(pipe, head_chunks, tail_chunks, head_bytes, tail_bytes, total_ref):
            """Drain stdout keeping both head and tail data."""
            head_collected = 0
            from collections import deque
            tail_buf = deque()
            tail_collected = 0
            try:
                while True:
                    data = pipe.read(4096)
                    if not data:
                        break
                    total_ref[0] += len(data)
                    # Fill head buffer first
                    if head_collected < head_bytes:
                        keep = min(len(data), head_bytes - head_collected)
                        head_chunks.append(data[:keep])
                        head_collected += keep
                        data = data[keep:]  # remaining goes to tail
                        if not data:
                            continue
                    # Everything past head goes into rolling tail buffer
                    tail_buf.append(data)
                    tail_collected += len(data)
                    # Evict old tail data to stay within tail_bytes budget
                    while tail_collected > tail_bytes and tail_buf:
                        oldest = tail_buf.popleft()
                        tail_collected -= len(oldest)
            except (ValueError, OSError):
                pass
            # Transfer final tail to output list
            tail_chunks.extend(tail_buf)

        stdout_head_chunks: list = []
        stdout_tail_chunks: list = []

        stdout_reader = threading.Thread(
            target=_drain_head_tail,
            args=(proc.stdout, stdout_head_chunks, stdout_tail_chunks,
                  _STDOUT_HEAD_BYTES, _STDOUT_TAIL_BYTES, stdout_total_bytes),
            daemon=True
        )
        stderr_reader = threading.Thread(
            target=_drain, args=(proc.stderr, stderr_chunks, MAX_STDERR_BYTES), daemon=True
        )
        stdout_reader.start()
        stderr_reader.start()

        status = "success"
        _activity_state = {
            "last_touch": time.monotonic(),
            "start": exec_start,
        }
        # Poll every 200ms: interrupt beats timeout; both kill the group.
        while proc.poll() is None:
            if _is_interrupted():
                _kill_process_group(proc)
                status = "interrupted"
                break
            if time.monotonic() > deadline:
                _kill_process_group(proc, escalate=True)
                status = "timeout"
                break
            # Periodic activity touch so the gateway's inactivity timeout
            # doesn't kill the agent during long code execution (#10807).
            try:
                from tools.environments.base import touch_activity_if_due
                touch_activity_if_due(_activity_state, "execute_code running")
            except Exception:
                pass
            time.sleep(0.2)

        # Wait for readers to finish draining
        stdout_reader.join(timeout=3)
        stderr_reader.join(timeout=3)

        stdout_head = b"".join(stdout_head_chunks).decode("utf-8", errors="replace")
        stdout_tail = b"".join(stdout_tail_chunks).decode("utf-8", errors="replace")
        stderr_text = b"".join(stderr_chunks).decode("utf-8", errors="replace")

        # Assemble stdout with head+tail truncation
        total_stdout = stdout_total_bytes[0]
        if total_stdout > MAX_STDOUT_BYTES and stdout_tail:
            omitted = total_stdout - len(stdout_head) - len(stdout_tail)
            truncated_notice = (
                f"\n\n... [OUTPUT TRUNCATED - {omitted:,} chars omitted "
                f"out of {total_stdout:,} total] ...\n\n"
            )
            stdout_text = stdout_head + truncated_notice + stdout_tail
        else:
            stdout_text = stdout_head + stdout_tail

        # returncode can still be None here (e.g. killed but not yet
        # reaped) — report -1 in that case.
        exit_code = proc.returncode if proc.returncode is not None else -1
        duration = round(time.monotonic() - exec_start, 2)

        # Wait for RPC thread to finish
        server_sock.close()  # break accept() so thread exits promptly
        server_sock = None  # prevent double close in finally
        rpc_thread.join(timeout=3)

        # Strip ANSI escape sequences so the model never sees terminal
        # formatting — prevents it from copying escapes into file writes.
        from tools.ansi_strip import strip_ansi
        stdout_text = strip_ansi(stdout_text)
        stderr_text = strip_ansi(stderr_text)

        # Redact secrets (API keys, tokens, etc.) from sandbox output.
        # The sandbox env-var filter (lines 434-454) blocks os.environ access,
        # but scripts can still read secrets from disk (e.g. open('~/.hermes/.env')).
        # This ensures leaked secrets never enter the model context.
        from agent.redact import redact_sensitive_text
        stdout_text = redact_sensitive_text(stdout_text)
        stderr_text = redact_sensitive_text(stderr_text)

        # Build response
        result: Dict[str, Any] = {
            "status": status,
            "output": stdout_text,
            "tool_calls_made": tool_call_counter[0],
            "duration_seconds": duration,
        }

        if status == "timeout":
            timeout_msg = f"Script timed out after {timeout}s and was killed."
            result["error"] = timeout_msg
            # Include timeout message in output so the LLM always surfaces it
            # to the user.  When output is empty, models often treat the result
            # as "nothing happened" and produce an empty response, which the
            # gateway stream consumer silently drops (#10807).
            if stdout_text:
                result["output"] = stdout_text + f"\n\n⏰ {timeout_msg}"
            else:
                result["output"] = f"⏰ {timeout_msg}"
            logger.warning(
                "execute_code timed out after %ss (limit %ss) with %d tool calls",
                duration, timeout, tool_call_counter[0],
            )
        elif status == "interrupted":
            result["output"] = stdout_text + "\n[execution interrupted — user sent a new message]"
        elif exit_code != 0:
            result["status"] = "error"
            result["error"] = stderr_text or f"Script exited with code {exit_code}"
            # Include stderr in output so the LLM sees the traceback
            if stderr_text:
                result["output"] = stdout_text + "\n--- stderr ---\n" + stderr_text

        return json.dumps(result, ensure_ascii=False)

    except Exception as exc:
        duration = round(time.monotonic() - exec_start, 2)
        logger.error(
            "execute_code failed after %ss with %d tool calls: %s: %s",
            duration,
            tool_call_counter[0],
            type(exc).__name__,
            exc,
            exc_info=True,
        )
        return json.dumps({
            "status": "error",
            "error": str(exc),
            "tool_calls_made": tool_call_counter[0],
            "duration_seconds": duration,
        }, ensure_ascii=False)

    finally:
        # Cleanup temp dir and socket
        if server_sock is not None:
            try:
                server_sock.close()
            except OSError as e:
                logger.debug("Server socket close error: %s", e)
        import shutil
        shutil.rmtree(tmpdir, ignore_errors=True)
        try:
            os.unlink(sock_path)
        except OSError:
            pass  # already cleaned up or never created
1308  
1309  
def _kill_process_group(proc, escalate: bool = False):
    """Terminate the sandbox child along with its whole process group.

    Sends SIGTERM to the child's process group (plain ``terminate()`` on
    Windows).  With ``escalate=True``, waits up to 5 seconds for the
    child to exit, then follows up with SIGKILL to the group.  If the
    group signal fails (group already gone, or no permission), falls
    back to killing just the child process.
    """

    def _signal_group(use_sigkill, group_fail_msg):
        # One TERM-or-KILL attempt against the group, with the
        # single-process fallback used by both phases.
        try:
            if _IS_WINDOWS:
                if use_sigkill:
                    proc.kill()
                else:
                    proc.terminate()
            else:
                sig = signal.SIGKILL if use_sigkill else signal.SIGTERM
                os.killpg(os.getpgid(proc.pid), sig)
        except (ProcessLookupError, PermissionError) as e:
            logger.debug(group_fail_msg, e, exc_info=True)
            try:
                proc.kill()
            except Exception as e2:
                logger.debug("Could not kill process: %s", e2, exc_info=True)

    _signal_group(False, "Could not kill process group: %s")

    if escalate:
        # Give the process 5s to exit after SIGTERM, then SIGKILL
        try:
            proc.wait(timeout=5)
        except subprocess.TimeoutExpired:
            _signal_group(True, "Could not kill process group with SIGKILL: %s")
1340  
1341  
1342  def _load_config() -> dict:
1343      """Load code_execution config without importing the interactive CLI.
1344  
1345      This helper is called while building the module-level execute_code schema
1346      during tool discovery.  Importing ``cli`` here pulls prompt_toolkit/Rich and
1347      a large chunk of the classic REPL onto every agent startup path, including
1348      ``hermes --tui`` where it is never used.  Read the lightweight raw config
1349      instead; the config layer already caches by (mtime, size), and an absent
1350      key cleanly falls back to DEFAULT_EXECUTION_MODE.
1351      """
1352      try:
1353          from hermes_cli.config import read_raw_config
1354  
1355          cfg = read_raw_config().get("code_execution", {})
1356          return cfg if isinstance(cfg, dict) else {}
1357      except Exception:
1358          return {}
1359  
1360  
1361  # ---------------------------------------------------------------------------
1362  # Execution mode resolution (strict vs project)
1363  # ---------------------------------------------------------------------------
1364  
1365  # Valid values for code_execution.mode. Kept as a module constant so tests
1366  # and the config layer can reference the canonical set.
1367  EXECUTION_MODES = ("project", "strict")
1368  DEFAULT_EXECUTION_MODE = "project"
1369  
1370  
def _get_execution_mode() -> str:
    """Return the active execute_code mode — 'project' or 'strict'.

    Reads ``code_execution.mode`` from config.yaml; invalid values fall back
    to ``DEFAULT_EXECUTION_MODE`` ('project') with a log warning.

    Mode semantics:
      - ``project`` (default): scripts run in the session's working directory
        with the active virtual environment's python, so project dependencies
        (pandas, torch, project packages) and files resolve naturally.
      - ``strict``: scripts run in an isolated temp directory with
        ``sys.executable`` (hermes-agent's python). Reproducible and the
        interpreter is guaranteed to work, but project deps and relative paths
        won't resolve.

    Env scrubbing and tool whitelist apply identically in both modes.
    """
    # Normalize whatever the config holds (could be any YAML scalar).
    raw_value = _load_config().get("mode", DEFAULT_EXECUTION_MODE)
    mode = str(raw_value).strip().lower()
    if mode not in EXECUTION_MODES:
        logger.warning(
            "Ignoring code_execution.mode=%r (expected one of %s), falling back to %r",
            mode, EXECUTION_MODES, DEFAULT_EXECUTION_MODE,
        )
        return DEFAULT_EXECUTION_MODE
    return mode
1396  
1397  
1398  @functools.lru_cache(maxsize=32)
1399  def _is_usable_python(python_path: str) -> bool:
1400      """Check whether a candidate Python interpreter is usable for execute_code.
1401  
1402      Requires Python 3.8+ (f-strings and stdlib modules the RPC stubs need).
1403      Cached so we don't fork a subprocess on every execute_code call.
1404      """
1405      try:
1406          result = subprocess.run(
1407              [python_path, "-c",
1408               "import sys; sys.exit(0 if sys.version_info >= (3, 8) else 1)"],
1409              timeout=5,
1410              capture_output=True,
1411          )
1412          return result.returncode == 0
1413      except (OSError, subprocess.TimeoutExpired, subprocess.SubprocessError):
1414          return False
1415  
1416  
1417  def _resolve_child_python(mode: str) -> str:
1418      """Pick the Python interpreter for the execute_code subprocess.
1419  
1420      In ``strict`` mode, always ``sys.executable`` — guaranteed to work and
1421      keeps behavior fully reproducible across sessions.
1422  
1423      In ``project`` mode, prefer the user's active virtualenv/conda env's
1424      python so ``import pandas`` etc. work. Falls back to ``sys.executable``
1425      if no venv is detected, the candidate binary is missing/not executable,
1426      or it fails a Python 3.8+ version check.
1427      """
1428      if mode != "project":
1429          return sys.executable
1430  
1431      if _IS_WINDOWS:
1432          exe_names = ("python.exe", "python3.exe")
1433          subdirs = ("Scripts",)
1434      else:
1435          exe_names = ("python", "python3")
1436          subdirs = ("bin",)
1437  
1438      for var in ("VIRTUAL_ENV", "CONDA_PREFIX"):
1439          root = os.environ.get(var, "").strip()
1440          if not root:
1441              continue
1442          for subdir in subdirs:
1443              for exe in exe_names:
1444                  candidate = os.path.join(root, subdir, exe)
1445                  if not (os.path.isfile(candidate) and os.access(candidate, os.X_OK)):
1446                      continue
1447                  if _is_usable_python(candidate):
1448                      return candidate
1449                  # Found the interpreter but it failed the version check —
1450                  # log once and fall through to sys.executable.
1451                  logger.info(
1452                      "execute_code: skipping %s=%s (Python version < 3.8 or broken). "
1453                      "Using sys.executable instead.", var, candidate,
1454                  )
1455                  return sys.executable
1456  
1457      return sys.executable
1458  
1459  
1460  def _resolve_child_cwd(mode: str, staging_dir: str) -> str:
1461      """Resolve the working directory for the execute_code subprocess.
1462  
1463      - ``strict``: the staging tmpdir (today's behavior).
1464      - ``project``: the session's TERMINAL_CWD (same as the terminal tool), or
1465        ``os.getcwd()`` if TERMINAL_CWD is unset or doesn't point at a real dir.
1466        Falls back to the staging tmpdir as a last resort so we never invoke
1467        Popen with a nonexistent cwd.
1468      """
1469      if mode != "project":
1470          return staging_dir
1471      raw = os.environ.get("TERMINAL_CWD", "").strip()
1472      if raw:
1473          expanded = os.path.expanduser(raw)
1474          if os.path.isdir(expanded):
1475              return expanded
1476      here = os.getcwd()
1477      if os.path.isdir(here):
1478          return here
1479      return staging_dir
1480  
1481  
1482  # ---------------------------------------------------------------------------
1483  # OpenAI Function-Calling Schema
1484  # ---------------------------------------------------------------------------
1485  
# Per-tool documentation lines for the execute_code description.
# Ordered to match the canonical display order.
#
# Each entry is (tool_name, doc_text).  The first element must match the
# identifiers held in the enabled-tools set given to
# build_execute_code_schema(), which filters this list by membership;
# entries whose name is not enabled are omitted from the schema text.
_TOOL_DOC_LINES = [
    ("web_search",
     "  web_search(query: str, limit: int = 5) -> dict\n"
     "    Returns {\"data\": {\"web\": [{\"url\", \"title\", \"description\"}, ...]}}"),
    ("web_extract",
     "  web_extract(urls: list[str]) -> dict\n"
     "    Returns {\"results\": [{\"url\", \"title\", \"content\", \"error\"}, ...]} where content is markdown"),
    ("read_file",
     "  read_file(path: str, offset: int = 1, limit: int = 500) -> dict\n"
     "    Lines are 1-indexed. Returns {\"content\": \"...\", \"total_lines\": N}"),
    ("write_file",
     "  write_file(path: str, content: str) -> dict\n"
     "    Always overwrites the entire file."),
    ("search_files",
     "  search_files(pattern: str, target=\"content\", path=\".\", file_glob=None, limit=50) -> dict\n"
     "    target: \"content\" (search inside files) or \"files\" (find files by name). Returns {\"matches\": [...]}"),
    ("patch",
     "  patch(path: str, old_string: str, new_string: str, replace_all: bool = False) -> dict\n"
     "    Replaces old_string with new_string in the file."),
    ("terminal",
     "  terminal(command: str, timeout=None, workdir=None) -> dict\n"
     "    Foreground only (no background/pty). Returns {\"output\": \"...\", \"exit_code\": N}"),
]
1511  
1512  
def build_execute_code_schema(enabled_sandbox_tools: Optional[set] = None,
                              mode: Optional[str] = None) -> dict:
    """Build the execute_code schema with description listing only enabled tools.

    When tools are disabled via ``hermes tools`` (e.g. web is turned off),
    the schema description should NOT mention web_search / web_extract —
    otherwise the model thinks they are available and keeps trying to use them.

    ``mode`` controls the working-directory sentence in the description:
      - ``'strict'``: scripts run in a temp dir (not the session's CWD)
      - ``'project'`` (default): scripts run in the session's CWD with the
        active venv's python
    If ``mode`` is None, the current ``code_execution.mode`` config is read.

    Args:
        enabled_sandbox_tools: Names of the sandbox tools to document;
            defaults to SANDBOX_ALLOWED_TOOLS (all of them).
        mode: Execution mode string; defaults to the configured
            ``code_execution.mode``.

    Returns:
        An OpenAI function-calling schema dict for the execute_code tool.
    """
    if enabled_sandbox_tools is None:
        enabled_sandbox_tools = SANDBOX_ALLOWED_TOOLS
    if mode is None:
        mode = _get_execution_mode()

    # Build tool documentation lines for only the enabled tools, so the
    # model never sees stubs that would fail at dispatch time.
    tool_lines = "\n".join(
        doc for name, doc in _TOOL_DOC_LINES if name in enabled_sandbox_tools
    )

    # Build example import list from enabled tools, preferring the two
    # most representative ones when available.
    import_examples = [n for n in ("web_search", "terminal") if n in enabled_sandbox_tools]
    if not import_examples:
        import_examples = sorted(enabled_sandbox_tools)[:2]
    if import_examples:
        import_str = ", ".join(import_examples) + ", ..."
    else:
        import_str = "..."

    # Mode-specific CWD guidance. Project mode is the default and matches
    # terminal()'s filesystem/interpreter; strict mode retains the isolated
    # temp-dir staging and hermes-agent's own python.
    if mode == "strict":
        cwd_note = (
            "Scripts run in their own temp dir, not the session's CWD — use absolute paths "
            "(os.path.expanduser('~/.hermes/.env')) or terminal()/read_file() for user files."
        )
    else:
        cwd_note = (
            "Scripts run in the session's working directory with the active venv's python, "
            "so project deps (pandas, etc.) and relative paths work like in terminal()."
        )

    description = (
        "Run a Python script that can call Hermes tools programmatically. "
        "Use this when you need 3+ tool calls with processing logic between them, "
        "need to filter/reduce large tool outputs before they enter your context, "
        "need conditional branching (if X then Y else Z), or need to loop "
        "(fetch N pages, process N files, retry on failure).\n\n"
        "Use normal tool calls instead when: single tool call with no processing, "
        "you need to see the full result and apply complex reasoning, "
        "or the task requires interactive user input.\n\n"
        "Available via `from hermes_tools import ...`:\n\n"
        f"{tool_lines}\n\n"
        "Limits: 5-minute timeout, 50KB stdout cap, max 50 tool calls per script. "
        "terminal() is foreground-only (no background or pty).\n\n"
        f"{cwd_note}\n\n"
        "Print your final result to stdout. Use Python stdlib (json, re, math, csv, "
        "datetime, collections, etc.) for processing between tool calls.\n\n"
        "Also available (no import needed — built into hermes_tools):\n"
        "  json_parse(text: str) — json.loads with strict=False; use for terminal() output with control chars\n"
        "  shell_quote(s: str) — shlex.quote(); use when interpolating dynamic strings into shell commands\n"
        "  retry(fn, max_attempts=3, delay=2) — retry with exponential backoff for transient failures"
    )

    return {
        "name": "execute_code",
        "description": description,
        "parameters": {
            "type": "object",
            "properties": {
                "code": {
                    "type": "string",
                    "description": (
                        "Python code to execute. Import tools with "
                        f"`from hermes_tools import {import_str}` "
                        "and print your final result to stdout."
                    ),
                },
            },
            "required": ["code"],
        },
    }
1600  
1601  
# Default schema used at registration time (all sandbox tools listed,
# current configured mode).  model_tools.py rebuilds per-session anyway.
EXECUTE_CODE_SCHEMA = build_execute_code_schema()


# --- Registry ---
# NOTE(review): this import is mid-file rather than at the top —
# presumably to avoid a circular import with tools.registry; confirm
# before moving it.
from tools.registry import registry, tool_error

# Register execute_code with the tool registry.  The lambda adapts the
# registry's (args, **kw) calling convention onto execute_code's keyword
# signature; task_id / enabled_tools are forwarded from the session when
# present (dict.get returns None otherwise).
registry.register(
    name="execute_code",
    toolset="code_execution",
    schema=EXECUTE_CODE_SCHEMA,
    handler=lambda args, **kw: execute_code(
        code=args.get("code", ""),
        task_id=kw.get("task_id"),
        enabled_tools=kw.get("enabled_tools")),
    # Gate availability (POSIX-only per the module docstring) at lookup time.
    check_fn=check_sandbox_requirements,
    emoji="🐍",
    max_result_size_chars=100_000,
)