/ tools / browser_cdp_tool.py
browser_cdp_tool.py
  1  #!/usr/bin/env python3
  2  """
  3  Raw Chrome DevTools Protocol (CDP) passthrough tool.
  4  
  5  Exposes a single tool, ``browser_cdp``, that sends arbitrary CDP commands to
  6  the browser's DevTools WebSocket endpoint.  Works when a CDP URL is
  7  configured — either via ``/browser connect`` (sets ``BROWSER_CDP_URL``) or
  8  ``browser.cdp_url`` in ``config.yaml`` — or when a CDP-backed cloud provider
  9  session is active.
 10  
 11  This is the escape hatch for browser operations not covered by the main
 12  browser tool surface (``browser_navigate``, ``browser_click``,
 13  ``browser_console``, etc.) — handling native dialogs, iframe-scoped
 14  evaluation, cookie/network control, low-level tab management, etc.
 15  
 16  Method reference: https://chromedevtools.github.io/devtools-protocol/
 17  """
 18  from __future__ import annotations
 19  
 20  import asyncio
 21  import json
 22  import logging
 23  from typing import Any, Dict, Optional
 24  
 25  from tools.registry import registry, tool_error
 26  
 27  logger = logging.getLogger(__name__)
 28  
 29  CDP_DOCS_URL = "https://chromedevtools.github.io/devtools-protocol/"
 30  
 31  # ``websockets`` is a transitive dependency of hermes-agent (via fal_client
 32  # and firecrawl-py) and is already imported by gateway/platforms/feishu.py.
 33  # Wrap the import so a clean error surfaces if the package is ever absent.
 34  try:
 35      import websockets
 36      from websockets.exceptions import WebSocketException
 37  
 38      _WS_AVAILABLE = True
 39  except ImportError:
 40      websockets = None  # type: ignore[assignment]
 41      WebSocketException = Exception  # type: ignore[assignment,misc]
 42      _WS_AVAILABLE = False
 43  
 44  
 45  # ---------------------------------------------------------------------------
 46  # Async-from-sync bridge (matches the pattern in homeassistant_tool.py)
 47  # ---------------------------------------------------------------------------
 48  
 49  
 50  def _run_async(coro):
 51      """Run an async coroutine from a sync handler, safe inside or outside a loop."""
 52      try:
 53          loop = asyncio.get_running_loop()
 54      except RuntimeError:
 55          loop = None
 56  
 57      if loop and loop.is_running():
 58          import concurrent.futures
 59  
 60          with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
 61              future = pool.submit(asyncio.run, coro)
 62              return future.result()
 63      return asyncio.run(coro)
 64  
 65  
 66  # ---------------------------------------------------------------------------
 67  # Endpoint resolution
 68  # ---------------------------------------------------------------------------
 69  
 70  
 71  def _resolve_cdp_endpoint() -> str:
 72      """Return the normalized CDP WebSocket URL, or empty string if unavailable.
 73  
 74      Delegates to ``tools.browser_tool._get_cdp_override`` so precedence stays
 75      consistent with the rest of the browser tool surface:
 76  
 77      1. ``BROWSER_CDP_URL`` env var (live override from ``/browser connect``)
 78      2. ``browser.cdp_url`` in ``config.yaml``
 79      """
 80      try:
 81          from tools.browser_tool import _get_cdp_override  # type: ignore[import-not-found]
 82  
 83          return (_get_cdp_override() or "").strip()
 84      except Exception as exc:  # pragma: no cover — defensive
 85          logger.debug("browser_cdp: failed to resolve CDP endpoint: %s", exc)
 86          return ""
 87  
 88  
 89  # ---------------------------------------------------------------------------
 90  # Core CDP call
 91  # ---------------------------------------------------------------------------
 92  
 93  
 94  async def _cdp_call(
 95      ws_url: str,
 96      method: str,
 97      params: Dict[str, Any],
 98      target_id: Optional[str],
 99      timeout: float,
100  ) -> Dict[str, Any]:
101      """Make a single CDP call, optionally attaching to a target first.
102  
103      When ``target_id`` is provided, we call ``Target.attachToTarget`` with
104      ``flatten=True`` to multiplex a page-level session over the same
105      browser-level WebSocket, then send ``method`` with that ``sessionId``.
106      When ``target_id`` is None, ``method`` is sent at browser level — which
107      works for ``Target.*``, ``Browser.*``, ``Storage.*`` and a few other
108      globally-scoped domains.
109      """
110      assert websockets is not None  # guarded by _WS_AVAILABLE at call-site
111  
112      async with websockets.connect(
113          ws_url,
114          max_size=None,  # CDP responses (e.g. DOM.getDocument) can be large
115          open_timeout=timeout,
116          close_timeout=5,
117          ping_interval=None,  # CDP server doesn't expect pings
118      ) as ws:
119          next_id = 1
120          session_id: Optional[str] = None
121  
122          # --- Step 1: attach to target if requested ---
123          if target_id:
124              attach_id = next_id
125              next_id += 1
126              await ws.send(
127                  json.dumps(
128                      {
129                          "id": attach_id,
130                          "method": "Target.attachToTarget",
131                          "params": {"targetId": target_id, "flatten": True},
132                      }
133                  )
134              )
135              deadline = asyncio.get_event_loop().time() + timeout
136              while True:
137                  remaining = deadline - asyncio.get_event_loop().time()
138                  if remaining <= 0:
139                      raise TimeoutError(
140                          f"Timed out attaching to target {target_id}"
141                      )
142                  raw = await asyncio.wait_for(ws.recv(), timeout=remaining)
143                  msg = json.loads(raw)
144                  if msg.get("id") == attach_id:
145                      if "error" in msg:
146                          raise RuntimeError(
147                              f"Target.attachToTarget failed: {msg['error']}"
148                          )
149                      session_id = msg.get("result", {}).get("sessionId")
150                      if not session_id:
151                          raise RuntimeError(
152                              "Target.attachToTarget did not return a sessionId"
153                          )
154                      break
155                  # Ignore events (messages without "id") while waiting
156  
157          # --- Step 2: dispatch the real method ---
158          call_id = next_id
159          next_id += 1
160          req: Dict[str, Any] = {
161              "id": call_id,
162              "method": method,
163              "params": params or {},
164          }
165          if session_id:
166              req["sessionId"] = session_id
167          await ws.send(json.dumps(req))
168  
169          deadline = asyncio.get_event_loop().time() + timeout
170          while True:
171              remaining = deadline - asyncio.get_event_loop().time()
172              if remaining <= 0:
173                  raise TimeoutError(
174                      f"Timed out waiting for response to {method}"
175                  )
176              raw = await asyncio.wait_for(ws.recv(), timeout=remaining)
177              msg = json.loads(raw)
178              if msg.get("id") == call_id:
179                  if "error" in msg:
180                      raise RuntimeError(f"CDP error: {msg['error']}")
181                  return msg.get("result", {})
182              # Ignore events / out-of-order responses
183  
184  
185  # ---------------------------------------------------------------------------
186  # Public tool function
187  # ---------------------------------------------------------------------------
188  
189  
def _browser_cdp_via_supervisor(
    task_id: str,
    frame_id: str,
    method: str,
    params: Optional[Dict[str, Any]],
    timeout: float,
) -> str:
    """Route a CDP call through the live supervisor session for an OOPIF frame.

    Looks up the frame in the supervisor's snapshot (falling back to the
    supervisor's raw frame table), reads the frame's ``session_id``, and
    dispatches ``method`` with that sessionId via the supervisor's ``_cdp``
    coroutine, scheduled onto the supervisor loop with
    ``asyncio.run_coroutine_threadsafe``.

    Args:
        task_id: Key used to find the supervisor in ``SUPERVISOR_REGISTRY``.
        frame_id: Frame to target; must have a ``session_id`` of its own.
        method: CDP method name to send.
        params: Method parameters; ``None`` is sent as ``{}``.
        timeout: Seconds passed to the supervisor call.  This thread waits
            two seconds longer so the supervisor's own timeout can surface
            first.

    Returns:
        JSON string with ``success``, ``method``, ``frame_id``,
        ``session_id`` and ``result`` on success; otherwise the value
        returned by ``tool_error``.
    """
    import concurrent.futures

    try:
        from tools.browser_supervisor import SUPERVISOR_REGISTRY  # type: ignore[import-not-found]
    except Exception as exc:  # pragma: no cover — defensive
        return tool_error(
            f"CDP supervisor is not available: {exc}. frame_id routing requires "
            f"a running supervisor attached via /browser connect or an active "
            f"Browserbase session."
        )

    supervisor = SUPERVISOR_REGISTRY.get(task_id)
    if supervisor is None:
        return tool_error(
            f"No CDP supervisor is attached for task={task_id!r}. Call "
            f"browser_navigate or /browser connect first so the supervisor "
            f"can attach. Once attached, browser_snapshot will populate "
            f"frame_tree with frame_ids you can pass here."
        )

    snap = supervisor.snapshot()
    # Search both the top frame and the children for the requested id.
    top = snap.frame_tree.get("top")
    frame_info: Optional[Dict[str, Any]] = None
    if top and top.get("frame_id") == frame_id:
        frame_info = top
    else:
        for child in snap.frame_tree.get("children", []) or []:
            if child.get("frame_id") == frame_id:
                frame_info = child
                break
    if frame_info is None:
        # Check the raw frames dict too (frame_tree is capped at 30 entries)
        with supervisor._state_lock:  # type: ignore[attr-defined]
            raw = supervisor._frames.get(frame_id)  # type: ignore[attr-defined]
        if raw is not None:
            frame_info = raw.to_dict()

    if frame_info is None:
        return tool_error(
            f"frame_id {frame_id!r} not found in supervisor state. "
            f"Call browser_snapshot to see current frame_tree."
        )

    child_sid = frame_info.get("session_id")
    if not child_sid:
        # Not an OOPIF: there is no dedicated session to route through, so
        # report an error instead of guessing.  Same-origin iframes don't
        # get their own sessionId; the agent can still reach them via
        # contentWindow/contentDocument from the parent page.
        return tool_error(
            f"frame_id {frame_id!r} is not an out-of-process iframe (no "
            f"dedicated CDP session). For same-origin iframes, use "
            f"`browser_cdp(method='Runtime.evaluate', params={{'expression': "
            f"\"document.querySelector('iframe').contentDocument.title\"}})` "
            f"at the top-level page instead."
        )

    # Dispatch onto the supervisor's loop.
    loop = supervisor._loop  # type: ignore[attr-defined]
    if loop is None or not loop.is_running():
        return tool_error(
            "CDP supervisor loop is not running. Try reconnecting with "
            "/browser connect."
        )

    async def _do_cdp():
        return await supervisor._cdp(  # type: ignore[attr-defined]
            method,
            params or {},
            session_id=child_sid,
            timeout=timeout,
        )

    fut = None
    try:
        fut = asyncio.run_coroutine_threadsafe(_do_cdp(), loop)
        result_msg = fut.result(timeout=timeout + 2)
    except concurrent.futures.TimeoutError:
        # Stop the call on the supervisor loop instead of leaving it
        # running after we have given up on it.
        if fut is not None:
            fut.cancel()
        return tool_error(
            f"CDP call via supervisor timed out after {timeout}s",
            cdp_docs=CDP_DOCS_URL,
        )
    except Exception as exc:
        return tool_error(
            f"CDP call via supervisor failed: {type(exc).__name__}: {exc}",
            cdp_docs=CDP_DOCS_URL,
        )

    payload: Dict[str, Any] = {
        "success": True,
        "method": method,
        "frame_id": frame_id,
        "session_id": child_sid,
        "result": result_msg.get("result", {}),
    }
    return json.dumps(payload, ensure_ascii=False)
293  
294  
def browser_cdp(
    method: str,
    params: Optional[Dict[str, Any]] = None,
    target_id: Optional[str] = None,
    frame_id: Optional[str] = None,
    timeout: float = 30.0,
    task_id: Optional[str] = None,
) -> str:
    """Send a raw CDP command.  See ``CDP_DOCS_URL`` for method documentation.

    ``method``, ``params`` and ``timeout`` are validated and normalised
    before any routing decision, so the supervisor-routed (``frame_id``)
    path and the stateless path both receive the same sanitised inputs.

    Args:
        method: CDP method name, e.g. ``"Target.getTargets"``.
        params: Method-specific parameters; defaults to ``{}``.
        target_id: Optional target/tab ID for page-level methods.  When set,
            we first attach to the target (``flatten=True``) and send
            ``method`` with the resulting ``sessionId``.  Uses a fresh
            stateless CDP connection.  Ignored when ``frame_id`` is set.
        frame_id: Optional cross-origin (OOPIF) iframe ``frame_id`` from
            ``browser_snapshot.frame_tree.children[]``.  When set, the call
            is routed through the CDP supervisor's existing WebSocket
            instead of opening a fresh connection.
        timeout: Seconds to wait for the call to complete.  Falsy or
            unparseable values become 30; the result is clamped to
            the range 1–300.
        task_id: Task identifier for supervisor lookup.  Only used when
            ``frame_id`` is set; ``None`` is treated as ``"default"``.

    Returns:
        JSON string ``{"success": True, "method": ..., "result": {...}}`` on
        success, or the error payload produced by ``tool_error`` on failure.
    """
    # --- Validate inputs shared by both routing paths -------------------
    if not method or not isinstance(method, str):
        return tool_error(
            "'method' is required (e.g. 'Target.getTargets')",
            cdp_docs=CDP_DOCS_URL,
        )

    call_params: Dict[str, Any] = params or {}
    if not isinstance(call_params, dict):
        return tool_error(
            f"'params' must be an object/dict, got {type(call_params).__name__}"
        )

    try:
        safe_timeout = float(timeout) if timeout else 30.0
    except (TypeError, ValueError, OverflowError):
        safe_timeout = 30.0
    safe_timeout = max(1.0, min(safe_timeout, 300.0))

    # --- Route iframe-scoped calls through the supervisor ---------------
    if frame_id:
        return _browser_cdp_via_supervisor(
            task_id=task_id or "default",
            frame_id=frame_id,
            method=method,
            params=call_params,
            timeout=safe_timeout,
        )

    # --- Stateless path: one fresh WebSocket connection per call --------
    if not _WS_AVAILABLE:
        return tool_error(
            "The 'websockets' Python package is required but not installed. "
            "Install it with: pip install websockets"
        )

    endpoint = _resolve_cdp_endpoint()
    if not endpoint:
        return tool_error(
            "No CDP endpoint is available. Run '/browser connect' to attach "
            "to a running Chrome, or set 'browser.cdp_url' in config.yaml. "
            "The Camofox backend is REST-only and does not expose CDP.",
            cdp_docs=CDP_DOCS_URL,
        )

    if not endpoint.startswith(("ws://", "wss://")):
        return tool_error(
            f"CDP endpoint is not a WebSocket URL: {endpoint!r}. "
            "Expected ws://... or wss://... — the /browser connect "
            "resolver should have rewritten this. Check that Chrome is "
            "actually listening on the debug port."
        )

    try:
        result = _run_async(
            _cdp_call(endpoint, method, call_params, target_id, safe_timeout)
        )
    except asyncio.TimeoutError as exc:
        return tool_error(
            f"CDP call timed out after {safe_timeout}s: {exc}",
            method=method,
        )
    except TimeoutError as exc:
        return tool_error(str(exc), method=method)
    except RuntimeError as exc:
        return tool_error(str(exc), method=method)
    except WebSocketException as exc:
        return tool_error(
            f"WebSocket error talking to CDP at {endpoint}: {exc}. The "
            "browser may have disconnected — try '/browser connect' again.",
            method=method,
        )
    except OSError as exc:
        # Connection refused / unreachable host: an expected condition when
        # the browser has gone away, so report it without a traceback.
        # Must stay after the TimeoutError handlers (TimeoutError is an
        # OSError subclass).
        return tool_error(
            f"Could not connect to CDP at {endpoint}: {exc}. The browser "
            "may have exited or the debug port may be closed — try "
            "'/browser connect' again.",
            method=method,
        )
    except Exception as exc:  # pragma: no cover — unexpected
        logger.exception("browser_cdp unexpected error")
        return tool_error(
            f"Unexpected error: {type(exc).__name__}: {exc}",
            method=method,
        )

    payload: Dict[str, Any] = {
        "success": True,
        "method": method,
        "result": result,
    }
    if target_id:
        payload["target_id"] = target_id
    return json.dumps(payload, ensure_ascii=False)
414  
415  
416  # ---------------------------------------------------------------------------
417  # Registry
418  # ---------------------------------------------------------------------------
419  
420  
# The tool description is written as separate paragraphs / bullet lists and
# stitched together below: paragraphs are separated by a blank line, bullets
# within a list by a single newline.
_CDP_DESC_SUMMARY = (
    "Send a raw Chrome DevTools Protocol (CDP) command. Escape hatch for "
    "browser operations not covered by browser_navigate, browser_click, "
    "browser_console, etc."
)

_CDP_DESC_AVAILABILITY = (
    "**Requires a reachable CDP endpoint.** Available when the user has "
    "run '/browser connect' to attach to a running Chrome, or when "
    "'browser.cdp_url' is set in config.yaml. Not currently wired up for "
    "cloud backends (Browserbase, Browser Use, Firecrawl) — those expose "
    "CDP per session but live-session routing is a follow-up. Camofox is "
    "REST-only and will never support CDP. If the tool is in your toolset "
    "at all, a CDP endpoint is already reachable."
)

_CDP_DESC_REFERENCE = (
    f"**CDP method reference:** {CDP_DOCS_URL} — use web_extract on a "
    "method's URL (e.g. '/tot/Page/#method-handleJavaScriptDialog') "
    "to look up parameters and return shape."
)

_CDP_DESC_PATTERNS = "\n".join(
    (
        "**Common patterns:**",
        "- List tabs: method='Target.getTargets', params={}",
        (
            "- Handle a native JS dialog: method='Page.handleJavaScriptDialog', "
            "params={'accept': true, 'promptText': ''}, target_id=<tabId>"
        ),
        "- Get all cookies: method='Network.getAllCookies', params={}",
        (
            "- Eval in a specific tab: method='Runtime.evaluate', "
            "params={'expression': '...', 'returnByValue': true}, "
            "target_id=<tabId>"
        ),
        (
            "- Set viewport for a tab: method='Emulation.setDeviceMetricsOverride', "
            "params={'width': 1280, 'height': 720, 'deviceScaleFactor': 1, "
            "'mobile': false}, target_id=<tabId>"
        ),
    )
)

_CDP_DESC_RULES = "\n".join(
    (
        "**Usage rules:**",
        (
            "- Browser-level methods (Target.*, Browser.*, Storage.*): omit "
            "target_id and frame_id."
        ),
        (
            "- Page-level methods (Page.*, Runtime.*, DOM.*, Emulation.*, "
            "Network.* scoped to a tab): pass target_id from Target.getTargets."
        ),
        (
            "- **Cross-origin iframe scope** (Runtime.evaluate inside an OOPIF, "
            "Page.* targeting a frame target, etc.): pass frame_id from the "
            "browser_snapshot frame_tree output. This routes through the CDP "
            "supervisor's live connection — the only reliable way on "
            "Browserbase where stateless CDP calls hit signed-URL expiry."
        ),
        (
            "- Each stateless call (without frame_id) is independent — sessions "
            "and event subscriptions do not persist between calls. For stateful "
            "workflows, prefer the dedicated browser tools or use frame_id "
            "routing."
        ),
    )
)

# JSON-schema entries for each argument the tool accepts.
_CDP_PARAM_PROPERTIES: Dict[str, Any] = {
    "method": {
        "type": "string",
        "description": (
            "CDP method name, e.g. 'Target.getTargets', "
            "'Runtime.evaluate', 'Page.handleJavaScriptDialog'."
        ),
    },
    "params": {
        "type": "object",
        "description": (
            "Method-specific parameters as a JSON object. Omit or "
            "pass {} for methods that take no parameters."
        ),
        "properties": {},
        "additionalProperties": True,
    },
    "target_id": {
        "type": "string",
        "description": (
            "Optional. Target/tab ID from Target.getTargets result "
            "(each entry's 'targetId'). Use for page-level methods "
            "at the top-level tab scope. Mutually exclusive with "
            "frame_id."
        ),
    },
    "frame_id": {
        "type": "string",
        "description": (
            "Optional. Out-of-process iframe (OOPIF) frame_id from "
            "browser_snapshot.frame_tree.children[] where "
            "is_oopif=true. When set, routes the call through the "
            "CDP supervisor's live session for that iframe. "
            "Essential for Runtime.evaluate inside cross-origin "
            "iframes, especially on Browserbase where fresh "
            "per-call CDP connections can't keep up with signed "
            "URL rotation. For same-origin iframes, use parent "
            "contentWindow/contentDocument from Runtime.evaluate "
            "at the top-level page instead."
        ),
    },
    "timeout": {
        "type": "number",
        "description": (
            "Timeout in seconds (default 30, max 300)."
        ),
        "default": 30,
    },
}

# Tool schema handed to the registry: name, model-facing description, and
# the JSON schema of the accepted arguments (only ``method`` is required).
BROWSER_CDP_SCHEMA: Dict[str, Any] = {
    "name": "browser_cdp",
    "description": "\n\n".join(
        (
            _CDP_DESC_SUMMARY,
            _CDP_DESC_AVAILABILITY,
            _CDP_DESC_REFERENCE,
            _CDP_DESC_PATTERNS,
            _CDP_DESC_RULES,
        )
    ),
    "parameters": {
        "type": "object",
        "properties": _CDP_PARAM_PROPERTIES,
        "required": ["method"],
    },
}
517  
518  
def _browser_cdp_check() -> bool:
    """Report whether ``browser_cdp`` should be offered right now.

    Returns ``True`` only when ``tools.browser_tool`` imports cleanly,
    ``check_browser_requirements()`` is truthy, and ``_get_cdp_override()``
    yields a truthy value — i.e. a static CDP URL is configured via
    ``/browser connect`` (``BROWSER_CDP_URL``) or ``browser.cdp_url`` in
    ``config.yaml``.  The override is not consulted when the requirements
    check fails.

    Backends that do *not* currently expose CDP to us — Camofox (REST-only),
    the default local agent-browser mode (Playwright hides its internal CDP
    port), and cloud providers whose per-session ``cdp_url`` is not yet
    surfaced — are gated out so the model doesn't see a tool that would
    reliably fail.  Cloud-provider CDP routing is a follow-up.

    This lives in a small function so that the ``registry.register(...)``
    statement itself can remain at module top level, where the
    tool-discovery AST scan looks for it.
    """
    try:
        from tools.browser_tool import (  # type: ignore[import-not-found]
            _get_cdp_override,
            check_browser_requirements,
        )
    except ImportError as exc:  # pragma: no cover — defensive
        logger.debug("browser_cdp check: browser_tool import failed: %s", exc)
        return False

    # ``and`` short-circuits, so the override lookup only happens once the
    # general browser requirements are satisfied.
    return bool(check_browser_requirements() and _get_cdp_override())
547  
548  
def _handle_browser_cdp(args, **kw):
    """Adapt the registry's ``(args, **kw)`` handler call to ``browser_cdp``.

    Tool arguments are read from ``args`` (``method`` defaults to ``""`` and
    ``timeout`` to ``30.0`` when absent); ``task_id`` is taken from the
    keyword arguments supplied by the caller.
    """
    return browser_cdp(
        method=args.get("method", ""),
        params=args.get("params"),
        target_id=args.get("target_id"),
        frame_id=args.get("frame_id"),
        timeout=args.get("timeout", 30.0),
        task_id=kw.get("task_id"),
    )


# Kept as a top-level statement: tool discovery scans for module-level
# ``registry.register(...)`` calls.
registry.register(
    name="browser_cdp",
    toolset="browser-cdp",
    schema=BROWSER_CDP_SCHEMA,
    handler=_handle_browser_cdp,
    check_fn=_browser_cdp_check,
    emoji="🧪",
)