   1  #!/usr/bin/env python3
   2  """
   3  Standalone Web Tools Module
   4  
   5  This module provides generic web tools that work with multiple backend providers.
   6  Backend is selected during ``hermes tools`` setup (web.backend in config.yaml).
   7  When configured, Hermes can route Firecrawl calls through a Nous-hosted
   8  tool gateway (available to Nous subscribers only).
   9  
  10  Available tools:
  11  - web_search_tool: Search the web for information
  12  - web_extract_tool: Extract content from specific web pages
  13  - web_crawl_tool: Crawl websites with specific instructions
  14  
  15  Backend compatibility:
  16  - Exa: https://exa.ai (search, extract)
  17  - Firecrawl: https://docs.firecrawl.dev/introduction (search, extract, crawl; direct or derived firecrawl-gateway.<domain> for Nous Subscribers)
  18  - Parallel: https://docs.parallel.ai (search, extract)
  19  - Tavily: https://tavily.com (search, extract, crawl)
  20  
  21  LLM Processing:
  22  - Uses the configured auxiliary LLM (Gemini 3 Flash Preview via OpenRouter by default) for intelligent content extraction
  23  - Extracts key excerpts and creates markdown summaries to reduce token usage
  24  
  25  Debug Mode:
  26  - Set WEB_TOOLS_DEBUG=true to enable detailed logging
  27  - Creates web_tools_debug_UUID.json in ./logs directory
  28  - Captures all tool calls, results, and compression metrics
  29  
  30  Usage:
  31      from web_tools import web_search_tool, web_extract_tool, web_crawl_tool
  32      
  33      # Search the web
  34      results = web_search_tool("Python machine learning libraries", limit=3)
  35      
  36      # Extract content from URLs  
   37      content = await web_extract_tool(["https://example.com"], format="markdown")
  38      
  39      # Crawl a website
  40      crawl_data = web_crawl_tool("example.com", "Find contact information")
  41  """
  42  
  43  import json
  44  import logging
  45  import os
  46  import re
  47  import asyncio
  48  from typing import List, Dict, Any, Optional, TYPE_CHECKING
  49  import httpx
  50  # NOTE: `from firecrawl import Firecrawl` is deliberately NOT at module top —
  51  # the SDK pulls ~200 ms of imports (httpcore, firecrawl.v1/v2 type trees) and
  52  # we only need it when the backend is actually "firecrawl". We expose
  53  # ``Firecrawl`` as a thin proxy that imports the SDK on first call/
  54  # isinstance check, so both (a) the in-module ``Firecrawl(...)`` construction
  55  # site in _get_firecrawl_client() works unchanged, and (b) tests using
  56  # ``patch("tools.web_tools.Firecrawl", ...)`` keep working.
  57  if TYPE_CHECKING:
  58      from firecrawl import Firecrawl  # noqa: F401 — type hints only
  59  
  60  _FIRECRAWL_CLS_CACHE: Optional[type] = None
  61  
  62  
  63  def _load_firecrawl_cls() -> type:
  64      """Import and cache ``firecrawl.Firecrawl``."""
  65      global _FIRECRAWL_CLS_CACHE
  66      if _FIRECRAWL_CLS_CACHE is None:
  67          from firecrawl import Firecrawl as _cls
  68          _FIRECRAWL_CLS_CACHE = _cls
  69      return _FIRECRAWL_CLS_CACHE
  70  
  71  
  72  class _FirecrawlProxy:
  73      """Module-level proxy that looks like ``firecrawl.Firecrawl`` but imports lazily."""
  74  
  75      __slots__ = ()
  76  
  77      def __call__(self, *args, **kwargs):
  78          return _load_firecrawl_cls()(*args, **kwargs)
  79  
  80      def __instancecheck__(self, obj):
  81          return isinstance(obj, _load_firecrawl_cls())
  82  
  83      def __repr__(self):
  84          return "<lazy firecrawl.Firecrawl proxy>"
  85  
  86  
  87  Firecrawl = _FirecrawlProxy()
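
      # Illustrative sketch of how the lazy proxy behaves (not executed here;
      # assumes the `firecrawl` SDK is installed and the API key is valid):
      #
      #     client = Firecrawl(api_key="fc-...")   # first construction imports the real SDK
      #     assert isinstance(client, Firecrawl)   # __instancecheck__ defers to the real class
      #
      # Tests can still swap the symbol, e.g. (FakeFirecrawl is a placeholder test double):
      #
      #     patch("tools.web_tools.Firecrawl", FakeFirecrawl)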
  88  
  89  from agent.auxiliary_client import (
  90      async_call_llm,
  91      extract_content_or_reasoning,
  92      get_async_text_auxiliary_client,
  93  )
  94  from tools.debug_helpers import DebugSession
  95  from tools.managed_tool_gateway import (
  96      build_vendor_gateway_url,
  97      read_nous_access_token as _read_nous_access_token,
  98      resolve_managed_tool_gateway,
  99  )
 100  from tools.tool_backend_helpers import managed_nous_tools_enabled, prefers_gateway
 101  from tools.url_safety import is_safe_url
 102  from tools.website_policy import check_website_access
 103  
 104  logger = logging.getLogger(__name__)
 105  
 106  
 107  # ─── Backend Selection ────────────────────────────────────────────────────────
 108  
 109  def _has_env(name: str) -> bool:
 110      val = os.getenv(name)
 111      return bool(val and val.strip())
 112  
 113  def _load_web_config() -> dict:
 114      """Load the ``web:`` section from ~/.hermes/config.yaml."""
 115      try:
 116          from hermes_cli.config import load_config
 117          return load_config().get("web", {})
  118      except Exception:
 119          return {}
 120  
 121  def _get_backend() -> str:
 122      """Determine which web backend to use.
 123  
 124      Reads ``web.backend`` from config.yaml (set by ``hermes tools``).
 125      Falls back to whichever API key is present for users who configured
 126      keys manually without running setup.
 127      """
 128      configured = (_load_web_config().get("backend") or "").lower().strip()
 129      if configured in ("parallel", "firecrawl", "tavily", "exa"):
 130          return configured
 131  
 132      # Fallback for manual / legacy config — pick the highest-priority
 133      # available backend. Firecrawl also counts as available when the managed
 134      # tool gateway is configured for Nous subscribers.
 135      backend_candidates = (
 136          ("firecrawl", _has_env("FIRECRAWL_API_KEY") or _has_env("FIRECRAWL_API_URL") or _is_tool_gateway_ready()),
 137          ("parallel", _has_env("PARALLEL_API_KEY")),
 138          ("tavily", _has_env("TAVILY_API_KEY")),
 139          ("exa", _has_env("EXA_API_KEY")),
 140      )
 141      for backend, available in backend_candidates:
 142          if available:
 143              return backend
 144  
 145      return "firecrawl"  # default (backward compat)
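
      # Example configuration (illustrative): ``hermes tools`` writes something like
      # the following to ~/.hermes/config.yaml, which _get_backend() reads:
      #
      #     web:
      #       backend: tavily        # one of: exa | firecrawl | parallel | tavily
      #
      # Without a configured backend, the env-var fallback above picks the first
      # available candidate, e.g. setting only TAVILY_API_KEY selects "tavily".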
 146  
 147  
 148  def _is_backend_available(backend: str) -> bool:
 149      """Return True when the selected backend is currently usable."""
 150      if backend == "exa":
 151          return _has_env("EXA_API_KEY")
 152      if backend == "parallel":
 153          return _has_env("PARALLEL_API_KEY")
 154      if backend == "firecrawl":
 155          return check_firecrawl_api_key()
 156      if backend == "tavily":
 157          return _has_env("TAVILY_API_KEY")
 158      return False
 159  
 160  # ─── Firecrawl Client ────────────────────────────────────────────────────────
 161  
 162  _firecrawl_client = None
 163  _firecrawl_client_config = None
 164  
 165  
 166  def _get_direct_firecrawl_config() -> Optional[tuple[Dict[str, str], tuple[str, Optional[str], Optional[str]]]]:
 167      """Return explicit direct Firecrawl kwargs + cache key, or None when unset."""
 168      api_key = os.getenv("FIRECRAWL_API_KEY", "").strip()
 169      api_url = os.getenv("FIRECRAWL_API_URL", "").strip().rstrip("/")
 170  
 171      if not api_key and not api_url:
 172          return None
 173  
 174      kwargs: Dict[str, str] = {}
 175      if api_key:
 176          kwargs["api_key"] = api_key
 177      if api_url:
 178          kwargs["api_url"] = api_url
 179  
 180      return kwargs, ("direct", api_url or None, api_key or None)
 181  
 182  
 183  def _get_firecrawl_gateway_url() -> str:
 184      """Return configured Firecrawl gateway URL."""
 185      return build_vendor_gateway_url("firecrawl")
 186  
 187  
 188  def _is_tool_gateway_ready() -> bool:
 189      """Return True when gateway URL and a Nous Subscriber token are available."""
 190      return resolve_managed_tool_gateway("firecrawl", token_reader=_read_nous_access_token) is not None
 191  
 192  
 193  def _has_direct_firecrawl_config() -> bool:
 194      """Return True when direct Firecrawl config is explicitly configured."""
 195      return _get_direct_firecrawl_config() is not None
 196  
 197  
 198  def _raise_web_backend_configuration_error() -> None:
 199      """Raise a clear error for unsupported web backend configuration."""
 200      message = (
 201          "Web tools are not configured. "
 202          "Set FIRECRAWL_API_KEY for cloud Firecrawl or set FIRECRAWL_API_URL for a self-hosted Firecrawl instance."
 203      )
 204      if managed_nous_tools_enabled():
 205          message += (
 206              " With your Nous subscription you can also use the Tool Gateway — "
 207              "run `hermes tools` and select Nous Subscription as the web provider."
 208          )
 209      raise ValueError(message)
 210  
 211  
 212  def _firecrawl_backend_help_suffix() -> str:
 213      """Return optional managed-gateway guidance for Firecrawl help text."""
 214      if not managed_nous_tools_enabled():
 215          return ""
 216      return (
 217          ", or use the Nous Tool Gateway via your subscription "
 218          "(FIRECRAWL_GATEWAY_URL or TOOL_GATEWAY_DOMAIN)"
 219      )
 220  
 221  
 222  def _web_requires_env() -> list[str]:
 223      """Return tool metadata env vars for the currently enabled web backends."""
 224      requires = [
 225          "EXA_API_KEY",
 226          "PARALLEL_API_KEY",
 227          "TAVILY_API_KEY",
 228          "FIRECRAWL_API_KEY",
 229          "FIRECRAWL_API_URL",
 230      ]
 231      if managed_nous_tools_enabled():
 232          requires.extend(
 233              [
 234                  "FIRECRAWL_GATEWAY_URL",
 235                  "TOOL_GATEWAY_DOMAIN",
 236                  "TOOL_GATEWAY_SCHEME",
 237                  "TOOL_GATEWAY_USER_TOKEN",
 238              ]
 239          )
 240      return requires
 241  
 242  
 243  def _get_firecrawl_client():
 244      """Get or create Firecrawl client.
 245  
 246      When ``web.use_gateway`` is set in config, the Tool Gateway is preferred
 247      even if direct Firecrawl credentials are present.  Otherwise direct
 248      Firecrawl takes precedence when explicitly configured.
 249      """
 250      global _firecrawl_client, _firecrawl_client_config
 251  
 252      direct_config = _get_direct_firecrawl_config()
 253      if direct_config is not None and not prefers_gateway("web"):
 254          kwargs, client_config = direct_config
 255      else:
 256          managed_gateway = resolve_managed_tool_gateway(
 257              "firecrawl",
 258              token_reader=_read_nous_access_token,
 259          )
 260          if managed_gateway is None:
 261              logger.error("Firecrawl client initialization failed: missing direct config and tool-gateway auth.")
 262              _raise_web_backend_configuration_error()
 263  
 264          kwargs = {
 265              "api_key": managed_gateway.nous_user_token,
 266              "api_url": managed_gateway.gateway_origin,
 267          }
 268          client_config = (
 269              "tool-gateway",
 270              kwargs["api_url"],
 271              managed_gateway.nous_user_token,
 272          )
 273  
 274      if _firecrawl_client is not None and _firecrawl_client_config == client_config:
 275          return _firecrawl_client
 276  
 277      # Uses the module-level `Firecrawl` name (lazy proxy at module top).
 278      _firecrawl_client = Firecrawl(**kwargs)
 279      _firecrawl_client_config = client_config
 280      return _firecrawl_client
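
      # Illustrative environment setups for the two Firecrawl paths (sketch only;
      # values are placeholders):
      #
      #     # Direct Firecrawl (cloud or self-hosted):
      #     export FIRECRAWL_API_KEY=fc-...
      #     export FIRECRAWL_API_URL=https://firecrawl.internal.example   # optional, self-hosted
      #
      #     # Nous Tool Gateway (subscribers), preferred when web.use_gateway is set:
      #     export TOOL_GATEWAY_DOMAIN=example.com   # gateway derived as firecrawl-gateway.<domain>
      #
      # See _web_requires_env() for the full list of recognized variables.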
 281  
 282  # ─── Parallel Client ─────────────────────────────────────────────────────────
 283  
 284  _parallel_client = None
 285  _async_parallel_client = None
 286  
 287  def _get_parallel_client():
 288      """Get or create the Parallel sync client (lazy initialization).
 289  
 290      Requires PARALLEL_API_KEY environment variable.
 291      """
 292      from parallel import Parallel
 293      global _parallel_client
 294      if _parallel_client is None:
 295          api_key = os.getenv("PARALLEL_API_KEY")
 296          if not api_key:
 297              raise ValueError(
 298                  "PARALLEL_API_KEY environment variable not set. "
 299                  "Get your API key at https://parallel.ai"
 300              )
 301          _parallel_client = Parallel(api_key=api_key)
 302      return _parallel_client
 303  
 304  
 305  def _get_async_parallel_client():
 306      """Get or create the Parallel async client (lazy initialization).
 307  
 308      Requires PARALLEL_API_KEY environment variable.
 309      """
 310      from parallel import AsyncParallel
 311      global _async_parallel_client
 312      if _async_parallel_client is None:
 313          api_key = os.getenv("PARALLEL_API_KEY")
 314          if not api_key:
 315              raise ValueError(
 316                  "PARALLEL_API_KEY environment variable not set. "
 317                  "Get your API key at https://parallel.ai"
 318              )
 319          _async_parallel_client = AsyncParallel(api_key=api_key)
 320      return _async_parallel_client
 321  
 322  # ─── Tavily Client ───────────────────────────────────────────────────────────
 323  
 324  _TAVILY_BASE_URL = os.getenv("TAVILY_BASE_URL", "https://api.tavily.com")
 325  
 326  
 327  def _tavily_request(endpoint: str, payload: dict) -> dict:
 328      """Send a POST request to the Tavily API.
 329  
 330      Auth is provided via ``api_key`` in the JSON body (no header-based auth).
 331      Raises ``ValueError`` if ``TAVILY_API_KEY`` is not set.
 332      """
 333      api_key = os.getenv("TAVILY_API_KEY")
 334      if not api_key:
 335          raise ValueError(
 336              "TAVILY_API_KEY environment variable not set. "
 337              "Get your API key at https://app.tavily.com/home"
 338          )
 339      payload["api_key"] = api_key
 340      url = f"{_TAVILY_BASE_URL}/{endpoint.lstrip('/')}"
 341      logger.info("Tavily %s request to %s", endpoint, url)
 342      response = httpx.post(url, json=payload, timeout=60)
 343      response.raise_for_status()
 344      return response.json()
 345  
 346  
 347  def _normalize_tavily_search_results(response: dict) -> dict:
 348      """Normalize Tavily /search response to the standard web search format.
 349  
 350      Tavily returns ``{results: [{title, url, content, score, ...}]}``.
 351      We map to ``{success, data: {web: [{title, url, description, position}]}}``.
 352      """
 353      web_results = []
 354      for i, result in enumerate(response.get("results", [])):
 355          web_results.append({
 356              "title": result.get("title", ""),
 357              "url": result.get("url", ""),
 358              "description": result.get("content", ""),
 359              "position": i + 1,
 360          })
 361      return {"success": True, "data": {"web": web_results}}
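
      # Sketch of the mapping performed above (field names per the Tavily /search
      # response handled by this function; values are illustrative):
      #
      #     {"results": [{"title": "T", "url": "https://e.com", "content": "snippet", "score": 0.9}]}
      #         -> {"success": True, "data": {"web": [
      #                {"title": "T", "url": "https://e.com", "description": "snippet", "position": 1}]}}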
 362  
 363  
 364  def _normalize_tavily_documents(response: dict, fallback_url: str = "") -> List[Dict[str, Any]]:
 365      """Normalize Tavily /extract or /crawl response to the standard document format.
 366  
 367      Maps results to ``{url, title, content, raw_content, metadata}`` and
 368      includes any ``failed_results`` / ``failed_urls`` as error entries.
 369      """
 370      documents: List[Dict[str, Any]] = []
 371      for result in response.get("results", []):
 372          url = result.get("url", fallback_url)
 373          raw = result.get("raw_content", "") or result.get("content", "")
 374          documents.append({
 375              "url": url,
 376              "title": result.get("title", ""),
 377              "content": raw,
 378              "raw_content": raw,
 379              "metadata": {"sourceURL": url, "title": result.get("title", "")},
 380          })
 381      # Handle failed results
 382      for fail in response.get("failed_results", []):
 383          documents.append({
 384              "url": fail.get("url", fallback_url),
 385              "title": "",
 386              "content": "",
 387              "raw_content": "",
 388              "error": fail.get("error", "extraction failed"),
 389              "metadata": {"sourceURL": fail.get("url", fallback_url)},
 390          })
 391      for fail_url in response.get("failed_urls", []):
 392          url_str = fail_url if isinstance(fail_url, str) else str(fail_url)
 393          documents.append({
 394              "url": url_str,
 395              "title": "",
 396              "content": "",
 397              "raw_content": "",
 398              "error": "extraction failed",
 399              "metadata": {"sourceURL": url_str},
 400          })
 401      return documents
 402  
 403  
 404  def _to_plain_object(value: Any) -> Any:
 405      """Convert SDK objects to plain python data structures when possible."""
 406      if value is None:
 407          return None
 408  
 409      if isinstance(value, (dict, list, str, int, float, bool)):
 410          return value
 411  
 412      if hasattr(value, "model_dump"):
 413          try:
 414              return value.model_dump()
 415          except Exception:
 416              pass
 417  
 418      if hasattr(value, "__dict__"):
 419          try:
 420              return {k: v for k, v in value.__dict__.items() if not k.startswith("_")}
 421          except Exception:
 422              pass
 423  
 424      return value
 425  
 426  
 427  def _normalize_result_list(values: Any) -> List[Dict[str, Any]]:
 428      """Normalize mixed SDK/list payloads into a list of dicts."""
 429      if not isinstance(values, list):
 430          return []
 431  
 432      normalized: List[Dict[str, Any]] = []
 433      for item in values:
 434          plain = _to_plain_object(item)
 435          if isinstance(plain, dict):
 436              normalized.append(plain)
 437      return normalized
 438  
 439  
 440  def _extract_web_search_results(response: Any) -> List[Dict[str, Any]]:
 441      """Extract Firecrawl search results across SDK/direct/gateway response shapes."""
 442      response_plain = _to_plain_object(response)
 443  
 444      if isinstance(response_plain, dict):
 445          data = response_plain.get("data")
 446          if isinstance(data, list):
 447              return _normalize_result_list(data)
 448  
 449          if isinstance(data, dict):
 450              data_web = _normalize_result_list(data.get("web"))
 451              if data_web:
 452                  return data_web
 453              data_results = _normalize_result_list(data.get("results"))
 454              if data_results:
 455                  return data_results
 456  
 457          top_web = _normalize_result_list(response_plain.get("web"))
 458          if top_web:
 459              return top_web
 460  
 461          top_results = _normalize_result_list(response_plain.get("results"))
 462          if top_results:
 463              return top_results
 464  
 465      if hasattr(response, "web"):
 466          return _normalize_result_list(getattr(response, "web", []))
 467  
 468      return []
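
      # Response shapes this helper is written to accept (illustrative examples):
      #
      #     {"data": [ {...}, {...} ]}                # direct / gateway list payload
      #     {"data": {"web": [...]}}                  # nested web results
      #     {"data": {"results": [...]}}              # nested generic results
      #     {"web": [...]} or {"results": [...]}      # top-level variants
      #     <Firecrawl SDK object with a .web attr>   # SDK response object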
 469  
 470  
 471  def _extract_scrape_payload(scrape_result: Any) -> Dict[str, Any]:
 472      """Normalize Firecrawl scrape payload shape across SDK and gateway variants."""
 473      result_plain = _to_plain_object(scrape_result)
 474      if not isinstance(result_plain, dict):
 475          return {}
 476  
 477      nested = result_plain.get("data")
 478      if isinstance(nested, dict):
 479          return nested
 480  
 481      return result_plain
 482  
 483  
 484  DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION = 5000
 485  
 486  def _is_nous_auxiliary_client(client: Any) -> bool:
 487      """Return True when the resolved auxiliary backend is Nous Portal."""
 488      from urllib.parse import urlparse
 489  
 490      base_url = str(getattr(client, "base_url", "") or "")
 491      host = (urlparse(base_url).hostname or "").lower()
 492      return host == "nousresearch.com" or host.endswith(".nousresearch.com")
 493  
 494  
 495  def _resolve_web_extract_auxiliary(model: Optional[str] = None) -> tuple[Optional[Any], Optional[str], Dict[str, Any]]:
 496      """Resolve the current web-extract auxiliary client, model, and extra body."""
 497      client, default_model = get_async_text_auxiliary_client("web_extract")
 498      configured_model = os.getenv("AUXILIARY_WEB_EXTRACT_MODEL", "").strip()
 499      effective_model = model or configured_model or default_model
 500  
 501      extra_body: Dict[str, Any] = {}
 502      if client is not None and _is_nous_auxiliary_client(client):
 503          from agent.auxiliary_client import get_auxiliary_extra_body
 504          extra_body = get_auxiliary_extra_body() or {"tags": ["product=hermes-agent"]}
 505  
 506      return client, effective_model, extra_body
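
      # Example override (illustrative): pin the web-extract summarizer to a
      # specific model without editing config.yaml:
      #
      #     export AUXILIARY_WEB_EXTRACT_MODEL="google/gemini-3-flash-preview"
      #
      # An explicit ``model`` argument still wins over the env var when provided.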
 507  
 508  
 509  def _get_default_summarizer_model() -> Optional[str]:
 510      """Return the current default model for web extraction summarization."""
 511      _, model, _ = _resolve_web_extract_auxiliary()
 512      return model
 513  
 514  _debug = DebugSession("web_tools", env_var="WEB_TOOLS_DEBUG")
 515  
 516  
 517  async def process_content_with_llm(
 518      content: str, 
 519      url: str = "", 
 520      title: str = "",
 521      model: Optional[str] = None,
 522      min_length: int = DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION
 523  ) -> Optional[str]:
 524      """
 525      Process web content using LLM to create intelligent summaries with key excerpts.
 526      
  527      This function uses the configured auxiliary web-extract model (or the specified model)
 528      to intelligently extract key information and create markdown summaries,
 529      significantly reducing token usage while preserving all important information.
 530      
 531      For very large content (>500k chars), uses chunked processing with synthesis.
 532      For extremely large content (>2M chars), refuses to process entirely.
 533      
 534      Args:
 535          content (str): The raw content to process
 536          url (str): The source URL (for context, optional)
 537          title (str): The page title (for context, optional)
  538          model (Optional[str]): The model to use for processing (defaults to the configured auxiliary web-extract model)
 539          min_length (int): Minimum content length to trigger processing (default: 5000)
 540          
 541      Returns:
 542          Optional[str]: Processed markdown content, or None if content too short or processing fails
 543      """
 544      # Size thresholds
 545      MAX_CONTENT_SIZE = 2_000_000  # 2M chars - refuse entirely above this
 546      CHUNK_THRESHOLD = 500_000     # 500k chars - use chunked processing above this
 547      CHUNK_SIZE = 100_000          # 100k chars per chunk
 548      MAX_OUTPUT_SIZE = 5000        # Hard cap on final output size
 549      
 550      try:
 551          content_len = len(content)
 552          
 553          # Refuse if content is absurdly large
 554          if content_len > MAX_CONTENT_SIZE:
 555              size_mb = content_len / 1_000_000
 556              logger.warning("Content too large (%.1fMB > 2MB limit). Refusing to process.", size_mb)
 557              return f"[Content too large to process: {size_mb:.1f}MB. Try using web_crawl with specific extraction instructions, or search for a more focused source.]"
 558          
 559          # Skip processing if content is too short
 560          if content_len < min_length:
 561              logger.debug("Content too short (%d < %d chars), skipping LLM processing", content_len, min_length)
 562              return None
 563          
 564          # Create context information
 565          context_info = []
 566          if title:
 567              context_info.append(f"Title: {title}")
 568          if url:
 569              context_info.append(f"Source: {url}")
 570          context_str = "\n".join(context_info) + "\n\n" if context_info else ""
 571          
 572          # Check if we need chunked processing
 573          if content_len > CHUNK_THRESHOLD:
 574              logger.info("Content large (%d chars). Using chunked processing...", content_len)
 575              return await _process_large_content_chunked(
 576                  content, context_str, model, CHUNK_SIZE, MAX_OUTPUT_SIZE
 577              )
 578          
 579          # Standard single-pass processing for normal content
 580          logger.info("Processing content with LLM (%d characters)", content_len)
 581          
 582          processed_content = await _call_summarizer_llm(content, context_str, model)
 583          
 584          if processed_content:
 585              # Enforce output cap
 586              if len(processed_content) > MAX_OUTPUT_SIZE:
 587                  processed_content = processed_content[:MAX_OUTPUT_SIZE] + "\n\n[... summary truncated for context management ...]"
 588              
 589              # Log compression metrics
 590              processed_length = len(processed_content)
 591              compression_ratio = processed_length / content_len if content_len > 0 else 1.0
 592              logger.info("Content processed: %d -> %d chars (%.1f%%)", content_len, processed_length, compression_ratio * 100)
 593          
 594          return processed_content
 595          
 596      except Exception as e:
 597          logger.warning(
 598              "web_extract LLM summarization failed (%s). "
 599              "Tip: increase auxiliary.web_extract.timeout in config.yaml "
 600              "or switch to a faster auxiliary model.",
 601              str(e)[:120],
 602          )
 603          # Fall back to truncated raw content instead of returning a useless
 604          # error message.  The first ~5000 chars are almost always more useful
 605          # to the model than "[Failed to process content: ...]".
 606          truncated = content[:MAX_OUTPUT_SIZE]
 607          if len(content) > MAX_OUTPUT_SIZE:
 608              truncated += (
 609                  f"\n\n[Content truncated — showing first {MAX_OUTPUT_SIZE:,} of "
 610                  f"{len(content):,} chars. LLM summarization timed out. "
 611                  f"To fix: increase auxiliary.web_extract.timeout in config.yaml, "
 612                  f"or use a faster auxiliary model. Use browser_navigate for the full page.]"
 613              )
 614          return truncated
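
      # Minimal usage sketch (illustrative; assumes an auxiliary backend is configured):
      #
      #     summary = await process_content_with_llm(raw_text, url="https://example.com",
      #                                              title="Example page")
      #     # -> markdown summary, None (content under min_length), or truncated raw text on failure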
 615  
 616  
 617  async def _call_summarizer_llm(
 618      content: str, 
 619      context_str: str, 
 620      model: Optional[str], 
 621      max_tokens: int = 20000,
 622      is_chunk: bool = False,
 623      chunk_info: str = ""
 624  ) -> Optional[str]:
 625      """
 626      Make a single LLM call to summarize content.
 627      
 628      Args:
 629          content: The content to summarize
 630          context_str: Context information (title, URL)
 631          model: Model to use
 632          max_tokens: Maximum output tokens
 633          is_chunk: Whether this is a chunk of a larger document
 634          chunk_info: Information about chunk position (e.g., "Chunk 2/5")
 635          
 636      Returns:
 637          Summarized content or None on failure
 638      """
 639      if is_chunk:
 640          # Chunk-specific prompt - aware that this is partial content
 641          system_prompt = """You are an expert content analyst processing a SECTION of a larger document. Your job is to extract and summarize the key information from THIS SECTION ONLY.
 642  
 643  Important guidelines for chunk processing:
 644  1. Do NOT write introductions or conclusions - this is a partial document
 645  2. Focus on extracting ALL key facts, figures, data points, and insights from this section
 646  3. Preserve important quotes, code snippets, and specific details verbatim
 647  4. Use bullet points and structured formatting for easy synthesis later
 648  5. Note any references to other sections (e.g., "as mentioned earlier", "see below") without trying to resolve them
 649  
 650  Your output will be combined with summaries of other sections, so focus on thorough extraction rather than narrative flow."""
 651  
 652          user_prompt = f"""Extract key information from this SECTION of a larger document:
 653  
 654  {context_str}{chunk_info}
 655  
 656  SECTION CONTENT:
 657  {content}
 658  
 659  Extract all important information from this section in a structured format. Focus on facts, data, insights, and key details. Do not add introductions or conclusions."""
 660  
 661      else:
 662          # Standard full-document prompt
 663          system_prompt = """You are an expert content analyst. Your job is to process web content and create a comprehensive yet concise summary that preserves all important information while dramatically reducing bulk.
 664  
 665  Create a well-structured markdown summary that includes:
 666  1. Key excerpts (quotes, code snippets, important facts) in their original format
 667  2. Comprehensive summary of all other important information
 668  3. Proper markdown formatting with headers, bullets, and emphasis
 669  
 670  Your goal is to preserve ALL important information while reducing length. Never lose key facts, figures, insights, or actionable information. Make it scannable and well-organized."""
 671  
 672          user_prompt = f"""Please process this web content and create a comprehensive markdown summary:
 673  
 674  {context_str}CONTENT TO PROCESS:
 675  {content}
 676  
 677  Create a markdown summary that captures all key information in a well-organized, scannable format. Include important quotes and code snippets in their original formatting. Focus on actionable information, specific details, and unique insights."""
 678  
 679      # Call the LLM with retry logic — keep retries low since summarization
 680      # is a nice-to-have; the caller falls back to truncated content on failure.
 681      max_retries = 2
 682      retry_delay = 2
 683      last_error = None
 684  
 685      for attempt in range(max_retries):
 686          try:
 687              aux_client, effective_model, extra_body = _resolve_web_extract_auxiliary(model)
 688              if aux_client is None or not effective_model:
 689                  logger.warning("No auxiliary model available for web content processing")
 690                  return None
 691              call_kwargs = {
 692                  "task": "web_extract",
 693                  "model": effective_model,
 694                  "messages": [
 695                      {"role": "system", "content": system_prompt},
 696                      {"role": "user", "content": user_prompt},
 697                  ],
 698                  "temperature": 0.1,
 699                  "max_tokens": max_tokens,
 700                  # No explicit timeout — async_call_llm reads auxiliary.web_extract.timeout
 701                  # from config (default 360s / 6min).  Users with slow local models can
 702                  # increase it in config.yaml.
 703              }
 704              if extra_body:
 705                  call_kwargs["extra_body"] = extra_body
 706              response = await async_call_llm(**call_kwargs)
 707              content = extract_content_or_reasoning(response)
 708              if content:
 709                  return content
 710              # Reasoning-only / empty response — let the retry loop handle it
 711              logger.warning("LLM returned empty content (attempt %d/%d), retrying", attempt + 1, max_retries)
 712              if attempt < max_retries - 1:
 713                  await asyncio.sleep(retry_delay)
 714                  retry_delay = min(retry_delay * 2, 60)
 715                  continue
 716              return content  # Return whatever we got after exhausting retries
 717          except RuntimeError:
 718              logger.warning("No auxiliary model available for web content processing")
 719              return None
 720          except Exception as api_error:
 721              last_error = api_error
 722              if attempt < max_retries - 1:
 723                  logger.warning("LLM API call failed (attempt %d/%d): %s", attempt + 1, max_retries, str(api_error)[:100])
 724                  logger.warning("Retrying in %ds...", retry_delay)
 725                  await asyncio.sleep(retry_delay)
 726                  retry_delay = min(retry_delay * 2, 60)
 727              else:
 728                  raise last_error
 729      
 730      return None
 731  
 732  
 733  async def _process_large_content_chunked(
 734      content: str, 
 735      context_str: str, 
 736      model: Optional[str], 
 737      chunk_size: int,
 738      max_output_size: int
 739  ) -> Optional[str]:
 740      """
 741      Process large content by chunking, summarizing each chunk in parallel,
 742      then synthesizing the summaries.
 743      
 744      Args:
 745          content: The large content to process
 746          context_str: Context information
 747          model: Model to use
 748          chunk_size: Size of each chunk in characters
 749          max_output_size: Maximum final output size
 750          
 751      Returns:
 752          Synthesized summary or None on failure
 753      """
 754      # Split content into chunks
 755      chunks = []
 756      for i in range(0, len(content), chunk_size):
 757          chunk = content[i:i + chunk_size]
 758          chunks.append(chunk)
 759      
 760      logger.info("Split into %d chunks of ~%d chars each", len(chunks), chunk_size)
 761      
 762      # Summarize each chunk in parallel
 763      async def summarize_chunk(chunk_idx: int, chunk_content: str) -> tuple[int, Optional[str]]:
 764          """Summarize a single chunk."""
 765          try:
 766              chunk_info = f"[Processing chunk {chunk_idx + 1} of {len(chunks)}]"
 767              summary = await _call_summarizer_llm(
 768                  chunk_content, 
 769                  context_str, 
 770                  model, 
 771                  max_tokens=10000,
 772                  is_chunk=True,
 773                  chunk_info=chunk_info
 774              )
 775              if summary:
 776                  logger.info("Chunk %d/%d summarized: %d -> %d chars", chunk_idx + 1, len(chunks), len(chunk_content), len(summary))
 777              return chunk_idx, summary
 778          except Exception as e:
 779              logger.warning("Chunk %d/%d failed: %s", chunk_idx + 1, len(chunks), str(e)[:50])
 780              return chunk_idx, None
 781      
 782      # Run all chunk summarizations in parallel
 783      tasks = [summarize_chunk(i, chunk) for i, chunk in enumerate(chunks)]
 784      results = await asyncio.gather(*tasks)
 785      
 786      # Collect successful summaries in order
 787      summaries = []
 788      for chunk_idx, summary in sorted(results, key=lambda x: x[0]):
 789          if summary:
 790              summaries.append(f"## Section {chunk_idx + 1}\n{summary}")
 791      
 792      if not summaries:
 793          logger.debug("All chunk summarizations failed")
 794          return "[Failed to process large content: all chunk summarizations failed]"
 795      
 796      logger.info("Got %d/%d chunk summaries", len(summaries), len(chunks))
 797      
 798      # If only one chunk succeeded, just return it (with cap)
 799      if len(summaries) == 1:
 800          result = summaries[0]
 801          if len(result) > max_output_size:
 802              result = result[:max_output_size] + "\n\n[... truncated ...]"
 803          return result
 804      
 805      # Synthesize the summaries into a final summary
 806      logger.info("Synthesizing %d summaries...", len(summaries))
 807      
 808      combined_summaries = "\n\n---\n\n".join(summaries)
 809      
 810      synthesis_prompt = f"""You have been given summaries of different sections of a large document. 
 811  Synthesize these into ONE cohesive, comprehensive summary that:
 812  1. Removes redundancy between sections
 813  2. Preserves all key facts, figures, and actionable information
 814  3. Is well-organized with clear structure
 815  4. Is under {max_output_size} characters
 816  
 817  {context_str}SECTION SUMMARIES:
 818  {combined_summaries}
 819  
 820  Create a single, unified markdown summary."""
 821  
 822      try:
 823          aux_client, effective_model, extra_body = _resolve_web_extract_auxiliary(model)
 824          if aux_client is None or not effective_model:
 825              logger.warning("No auxiliary model for synthesis, concatenating summaries")
 826              fallback = "\n\n".join(summaries)
 827              if len(fallback) > max_output_size:
 828                  fallback = fallback[:max_output_size] + "\n\n[... truncated ...]"
 829              return fallback
 830  
 831          call_kwargs = {
 832              "task": "web_extract",
 833              "model": effective_model,
 834              "messages": [
 835                  {"role": "system", "content": "You synthesize multiple summaries into one cohesive, comprehensive summary. Be thorough but concise."},
 836                  {"role": "user", "content": synthesis_prompt},
 837              ],
 838              "temperature": 0.1,
 839              "max_tokens": 20000,
 840          }
 841          if extra_body:
 842              call_kwargs["extra_body"] = extra_body
 843          response = await async_call_llm(**call_kwargs)
 844          final_summary = extract_content_or_reasoning(response)
 845  
 846          # Retry once on empty content (reasoning-only response)
 847          if not final_summary:
 848              logger.warning("Synthesis LLM returned empty content, retrying once")
 849              response = await async_call_llm(**call_kwargs)
 850              final_summary = extract_content_or_reasoning(response)
 851  
 852          # If still None after retry, fall back to concatenated summaries
 853          if not final_summary:
 854              logger.warning("Synthesis failed after retry — concatenating chunk summaries")
 855              fallback = "\n\n".join(summaries)
 856              if len(fallback) > max_output_size:
 857                  fallback = fallback[:max_output_size] + "\n\n[... truncated ...]"
 858              return fallback
 859  
 860          # Enforce hard cap
 861          if len(final_summary) > max_output_size:
 862              final_summary = final_summary[:max_output_size] + "\n\n[... summary truncated for context management ...]"
 863          
 864          original_len = len(content)
 865          final_len = len(final_summary)
 866          compression = final_len / original_len if original_len > 0 else 1.0
 867          
 868          logger.info("Synthesis complete: %d -> %d chars (%.2f%%)", original_len, final_len, compression * 100)
 869          return final_summary
 870          
 871      except Exception as e:
 872          logger.warning("Synthesis failed: %s", str(e)[:100])
 873          # Fall back to concatenated summaries with truncation
 874          fallback = "\n\n".join(summaries)
 875          if len(fallback) > max_output_size:
 876              fallback = fallback[:max_output_size] + "\n\n[... truncated due to synthesis failure ...]"
 877          return fallback
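
      # Worked sizing example (illustrative): a 750,000-char page exceeds
      # CHUNK_THRESHOLD (500k), so it is split into 8 chunks of CHUNK_SIZE=100k
      # (seven full chunks plus a 50k tail), each summarized in parallel, and the
      # section summaries are then synthesized into one summary capped at
      # MAX_OUTPUT_SIZE characters.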
 878  
 879  
 880  def clean_base64_images(text: str) -> str:
 881      """
 882      Remove base64 encoded images from text to reduce token count and clutter.
 883      
 884      This function finds and removes base64 encoded images in various formats:
 885      - (data:image/png;base64,...)
 886      - (data:image/jpeg;base64,...)
 887      - (data:image/svg+xml;base64,...)
 888      - data:image/[type];base64,... (without parentheses)
 889      
 890      Args:
 891          text: The text content to clean
 892          
 893      Returns:
 894          Cleaned text with base64 images replaced with placeholders
 895      """
 896      # Pattern to match base64 encoded images wrapped in parentheses
 897      # Matches: (data:image/[type];base64,[base64-string])
 898      base64_with_parens_pattern = r'\(data:image/[^;]+;base64,[A-Za-z0-9+/=]+\)'
 899      
 900      # Pattern to match base64 encoded images without parentheses
 901      # Matches: data:image/[type];base64,[base64-string]
 902      base64_pattern = r'data:image/[^;]+;base64,[A-Za-z0-9+/=]+'
 903      
 904      # Replace parentheses-wrapped images first
 905      cleaned_text = re.sub(base64_with_parens_pattern, '[BASE64_IMAGE_REMOVED]', text)
 906      
 907      # Then replace any remaining non-parentheses images
 908      cleaned_text = re.sub(base64_pattern, '[BASE64_IMAGE_REMOVED]', cleaned_text)
 909      
 910      return cleaned_text
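
      # Example transformation (illustrative strings):
      #
      #     clean_base64_images("![logo](data:image/png;base64,iVBORw0KGgo=) end")
      #     # -> "![logo][BASE64_IMAGE_REMOVED] end"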
 911  
 912  
 913  # ─── Exa Client ──────────────────────────────────────────────────────────────
 914  
 915  _exa_client = None
 916  
 917  def _get_exa_client():
 918      """Get or create the Exa client (lazy initialization).
 919  
 920      Requires EXA_API_KEY environment variable.
 921      """
 922      from exa_py import Exa
 923      global _exa_client
 924      if _exa_client is None:
 925          api_key = os.getenv("EXA_API_KEY")
 926          if not api_key:
 927              raise ValueError(
 928                  "EXA_API_KEY environment variable not set. "
 929                  "Get your API key at https://exa.ai"
 930              )
 931          _exa_client = Exa(api_key=api_key)
 932          _exa_client.headers["x-exa-integration"] = "hermes-agent"
 933      return _exa_client
 934  
 935  
 936  # ─── Exa Search & Extract Helpers ─────────────────────────────────────────────
 937  
 938  def _exa_search(query: str, limit: int = 10) -> dict:
 939      """Search using the Exa SDK and return results as a dict."""
 940      from tools.interrupt import is_interrupted
 941      if is_interrupted():
 942          return {"error": "Interrupted", "success": False}
 943  
 944      logger.info("Exa search: '%s' (limit=%d)", query, limit)
 945      response = _get_exa_client().search(
 946          query,
 947          num_results=limit,
 948          contents={
 949              "highlights": True,
 950          },
 951      )
 952  
 953      web_results = []
 954      for i, result in enumerate(response.results or []):
 955          highlights = result.highlights or []
 956          web_results.append({
 957              "url": result.url or "",
 958              "title": result.title or "",
 959              "description": " ".join(highlights) if highlights else "",
 960              "position": i + 1,
 961          })
 962  
 963      return {"success": True, "data": {"web": web_results}}
 964  
 965  
 966  def _exa_extract(urls: List[str]) -> List[Dict[str, Any]]:
 967      """Extract content from URLs using the Exa SDK.
 968  
 969      Returns a list of result dicts matching the structure expected by the
 970      LLM post-processing pipeline (url, title, content, metadata).
 971      """
 972      from tools.interrupt import is_interrupted
 973      if is_interrupted():
 974          return [{"url": u, "error": "Interrupted", "title": ""} for u in urls]
 975  
 976      logger.info("Exa extract: %d URL(s)", len(urls))
 977      response = _get_exa_client().get_contents(
 978          urls,
 979          text=True,
 980      )
 981  
 982      results = []
 983      for result in response.results or []:
 984          content = result.text or ""
 985          url = result.url or ""
 986          title = result.title or ""
 987          results.append({
 988              "url": url,
 989              "title": title,
 990              "content": content,
 991              "raw_content": content,
 992              "metadata": {"sourceURL": url, "title": title},
 993          })
 994  
 995      return results
 996  
 997  
 998  # ─── Parallel Search & Extract Helpers ────────────────────────────────────────
 999  
1000  def _parallel_search(query: str, limit: int = 5) -> dict:
1001      """Search using the Parallel SDK and return results as a dict."""
1002      from tools.interrupt import is_interrupted
1003      if is_interrupted():
1004          return {"error": "Interrupted", "success": False}
1005  
1006      mode = os.getenv("PARALLEL_SEARCH_MODE", "agentic").lower().strip()
1007      if mode not in ("fast", "one-shot", "agentic"):
1008          mode = "agentic"
1009  
1010      logger.info("Parallel search: '%s' (mode=%s, limit=%d)", query, mode, limit)
1011      response = _get_parallel_client().beta.search(
1012          search_queries=[query],
1013          objective=query,
1014          mode=mode,
1015          max_results=min(limit, 20),
1016      )
1017  
1018      web_results = []
1019      for i, result in enumerate(response.results or []):
1020          excerpts = result.excerpts or []
1021          web_results.append({
1022              "url": result.url or "",
1023              "title": result.title or "",
1024              "description": " ".join(excerpts) if excerpts else "",
1025              "position": i + 1,
1026          })
1027  
1028      return {"success": True, "data": {"web": web_results}}
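
      # Optional tuning (illustrative): pick a different Parallel search mode via
      # the environment; any value other than fast/one-shot/agentic falls back to
      # "agentic" (the default above):
      #
      #     export PARALLEL_SEARCH_MODE=fast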
1029  
1030  
1031  async def _parallel_extract(urls: List[str]) -> List[Dict[str, Any]]:
1032      """Extract content from URLs using the Parallel async SDK.
1033  
1034      Returns a list of result dicts matching the structure expected by the
1035      LLM post-processing pipeline (url, title, content, metadata).
1036      """
1037      from tools.interrupt import is_interrupted
1038      if is_interrupted():
1039          return [{"url": u, "error": "Interrupted", "title": ""} for u in urls]
1040  
1041      logger.info("Parallel extract: %d URL(s)", len(urls))
1042      response = await _get_async_parallel_client().beta.extract(
1043          urls=urls,
1044          full_content=True,
1045      )
1046  
1047      results = []
1048      for result in response.results or []:
1049          content = result.full_content or ""
1050          if not content:
1051              content = "\n\n".join(result.excerpts or [])
1052          url = result.url or ""
1053          title = result.title or ""
1054          results.append({
1055              "url": url,
1056              "title": title,
1057              "content": content,
1058              "raw_content": content,
1059              "metadata": {"sourceURL": url, "title": title},
1060          })
1061  
1062      for error in response.errors or []:
1063          results.append({
1064              "url": error.url or "",
1065              "title": "",
1066              "content": "",
1067              "error": error.content or error.error_type or "extraction failed",
1068              "metadata": {"sourceURL": error.url or ""},
1069          })
1070  
1071      return results
1072  
1073  
1074  def web_search_tool(query: str, limit: int = 5) -> str:
1075      """
1076      Search the web for information using available search API backend.
1077  
1078      This function provides a generic interface for web search that can work
1079      with multiple backends (Exa, Firecrawl, Parallel, or Tavily).
1080  
1081      Note: This function returns search result metadata only (URLs, titles, descriptions).
1082      Use web_extract_tool to get full content from specific URLs.
1083      
1084      Args:
1085          query (str): The search query to look up
1086          limit (int): Maximum number of results to return (default: 5)
1087      
1088      Returns:
1089          str: JSON string containing search results with the following structure:
1090               {
1091                   "success": bool,
1092                   "data": {
1093                       "web": [
1094                           {
1095                               "title": str,
1096                               "url": str,
1097                               "description": str,
1098                               "position": int
1099                           },
1100                           ...
1101                       ]
1102                   }
1103               }
1104      
1105      Raises:
1106          Exception: If search fails or API key is not set
1107      """
1108      try:
1109          limit = int(limit)
1110      except (TypeError, ValueError):
1111          limit = 5
1112      limit = min(max(limit, 1), 100)
1113  
1114      debug_call_data = {
1115          "parameters": {
1116              "query": query,
1117              "limit": limit
1118          },
1119          "error": None,
1120          "results_count": 0,
1121          "original_response_size": 0,
1122          "final_response_size": 0
1123      }
1124      
1125      try:
1126          from tools.interrupt import is_interrupted
1127          if is_interrupted():
1128              return tool_error("Interrupted", success=False)
1129  
1130          # Dispatch to the configured backend
1131          backend = _get_backend()
1132          if backend == "parallel":
1133              response_data = _parallel_search(query, limit)
1134              debug_call_data["results_count"] = len(response_data.get("data", {}).get("web", []))
1135              result_json = json.dumps(response_data, indent=2, ensure_ascii=False)
1136              debug_call_data["final_response_size"] = len(result_json)
1137              _debug.log_call("web_search_tool", debug_call_data)
1138              _debug.save()
1139              return result_json
1140  
1141          if backend == "exa":
1142              response_data = _exa_search(query, limit)
1143              debug_call_data["results_count"] = len(response_data.get("data", {}).get("web", []))
1144              result_json = json.dumps(response_data, indent=2, ensure_ascii=False)
1145              debug_call_data["final_response_size"] = len(result_json)
1146              _debug.log_call("web_search_tool", debug_call_data)
1147              _debug.save()
1148              return result_json
1149  
1150          if backend == "tavily":
1151              logger.info("Tavily search: '%s' (limit: %d)", query, limit)
1152              raw = _tavily_request("search", {
1153                  "query": query,
1154                  "max_results": min(limit, 20),
1155                  "include_raw_content": False,
1156                  "include_images": False,
1157              })
1158              response_data = _normalize_tavily_search_results(raw)
1159              debug_call_data["results_count"] = len(response_data.get("data", {}).get("web", []))
1160              result_json = json.dumps(response_data, indent=2, ensure_ascii=False)
1161              debug_call_data["final_response_size"] = len(result_json)
1162              _debug.log_call("web_search_tool", debug_call_data)
1163              _debug.save()
1164              return result_json
1165  
1166          logger.info("Searching the web for: '%s' (limit: %d)", query, limit)
1167  
1168          response = _get_firecrawl_client().search(
1169              query=query,
1170              limit=limit
1171          )
1172  
1173          web_results = _extract_web_search_results(response)
1174          results_count = len(web_results)
1175          logger.info("Found %d search results", results_count)
1176          
1177          # Build response with just search metadata (URLs, titles, descriptions)
1178          response_data = {
1179              "success": True,
1180              "data": {
1181                  "web": web_results
1182              }
1183          }
1184          
1185          # Capture debug information
1186          debug_call_data["results_count"] = results_count
1187          
1188          # Convert to JSON
1189          result_json = json.dumps(response_data, indent=2, ensure_ascii=False)
1190          
1191          debug_call_data["final_response_size"] = len(result_json)
1192          
1193          # Log debug information
1194          _debug.log_call("web_search_tool", debug_call_data)
1195          _debug.save()
1196          
1197          return result_json
1198          
1199      except Exception as e:
1200          error_msg = f"Error searching web: {str(e)}"
1201          logger.debug("%s", error_msg)
1202  
1203          debug_call_data["error"] = error_msg
1204          _debug.log_call("web_search_tool", debug_call_data)
1205          _debug.save()
1206  
1207          return tool_error(error_msg)
1208  
1209  
1210  async def web_extract_tool(
1211      urls: List[str],
1212      format: Optional[str] = None,
1213      use_llm_processing: bool = True,
1214      model: Optional[str] = None,
1215      min_length: int = DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION
1216  ) -> str:
1217      """
1218      Extract content from specific web pages using available extraction API backend.
1219  
1220      This function provides a generic interface for web content extraction that
1221      can work with multiple backends (Exa, Firecrawl, Parallel, or Tavily).
1222  
1223      Args:
1224          urls (List[str]): List of URLs to extract content from
1225          format (str): Desired output format ("markdown" or "html", optional)
1226          use_llm_processing (bool): Whether to process content with LLM for summarization (default: True)
1227          model (Optional[str]): The model to use for LLM processing (defaults to current auxiliary backend model)
1228          min_length (int): Minimum content length to trigger LLM processing (default: 5000)
1229  
1230      Security: URLs are checked for embedded secrets before fetching.
1231      
1232      Returns:
1233          str: JSON string containing extracted content. If LLM processing is enabled and successful,
1234               the 'content' field will contain the processed markdown summary instead of raw content.
1235      
1236      Raises:
1237          Exception: If extraction fails or API key is not set
1238      """
1239      # Block URLs containing embedded secrets (exfiltration prevention).
1240      # URL-decode first so percent-encoded secrets (%73k- = sk-) are caught.
1241      from agent.redact import _PREFIX_RE
1242      from urllib.parse import unquote
1243      for _url in urls:
1244          if _PREFIX_RE.search(_url) or _PREFIX_RE.search(unquote(_url)):
1245              return json.dumps({
1246                  "success": False,
1247                  "error": "Blocked: URL contains what appears to be an API key or token. "
1248                           "Secrets must not be sent in URLs.",
1249              })
1250  
1251      debug_call_data = {
1252          "parameters": {
1253              "urls": urls,
1254              "format": format,
1255              "use_llm_processing": use_llm_processing,
1256              "model": model,
1257              "min_length": min_length
1258          },
1259          "error": None,
1260          "pages_extracted": 0,
1261          "pages_processed_with_llm": 0,
1262          "original_response_size": 0,
1263          "final_response_size": 0,
1264          "compression_metrics": [],
1265          "processing_applied": []
1266      }
1267      
1268      try:
1269          logger.info("Extracting content from %d URL(s)", len(urls))
1270  
1271          # ── SSRF protection — filter out private/internal URLs before any backend ──
1272          safe_urls = []
1273          ssrf_blocked: List[Dict[str, Any]] = []
1274          for url in urls:
1275              if not is_safe_url(url):
1276                  ssrf_blocked.append({
1277                      "url": url, "title": "", "content": "",
1278                      "error": "Blocked: URL targets a private or internal network address",
1279                  })
1280              else:
1281                  safe_urls.append(url)
1282  
1283          # Dispatch only safe URLs to the configured backend
1284          if not safe_urls:
1285              results = []
1286          else:
1287              backend = _get_backend()
1288  
1289              if backend == "parallel":
1290                  results = await _parallel_extract(safe_urls)
1291              elif backend == "exa":
1292                  results = _exa_extract(safe_urls)
1293              elif backend == "tavily":
1294                  logger.info("Tavily extract: %d URL(s)", len(safe_urls))
1295                  raw = _tavily_request("extract", {
1296                      "urls": safe_urls,
1297                      "include_images": False,
1298                  })
1299                  results = _normalize_tavily_documents(raw, fallback_url=safe_urls[0] if safe_urls else "")
1300              else:
1301                  # ── Firecrawl extraction ──
1302                  # Determine requested formats for Firecrawl v2
1303                  formats: List[str] = []
1304                  if format == "markdown":
1305                      formats = ["markdown"]
1306                  elif format == "html":
1307                      formats = ["html"]
1308                  else:
1309                      # Default: request markdown for LLM-readiness and include html as backup
1310                      formats = ["markdown", "html"]
1311  
1312                  # Always use individual scraping for simplicity and reliability
1313                  # Batch scraping adds complexity without much benefit for small numbers of URLs
1314                  results: List[Dict[str, Any]] = []
1315  
1316                  from tools.interrupt import is_interrupted as _is_interrupted
1317                  for url in safe_urls:
1318                      if _is_interrupted():
1319                          results.append({"url": url, "error": "Interrupted", "title": ""})
1320                          continue
1321  
1322                      # Website policy check — block before fetching
1323                      blocked = check_website_access(url)
1324                      if blocked:
1325                          logger.info("Blocked web_extract for %s by rule %s", blocked["host"], blocked["rule"])
1326                          results.append({
1327                              "url": url, "title": "", "content": "",
1328                              "error": blocked["message"],
1329                              "blocked_by_policy": {"host": blocked["host"], "rule": blocked["rule"], "source": blocked["source"]},
1330                          })
1331                          continue
1332  
1333                      try:
1334                          logger.info("Scraping: %s", url)
1335                          # Run synchronous Firecrawl scrape in a thread with a
1336                          # 60s timeout so a hung fetch doesn't block the session.
1337                          try:
1338                              scrape_result = await asyncio.wait_for(
1339                                  asyncio.to_thread(
1340                                      _get_firecrawl_client().scrape,
1341                                      url=url,
1342                                      formats=formats,
1343                                  ),
1344                                  timeout=60,
1345                              )
1346                          except asyncio.TimeoutError:
1347                              logger.warning("Firecrawl scrape timed out for %s", url)
1348                              results.append({
1349                                  "url": url, "title": "", "content": "",
1350                                  "error": "Scrape timed out after 60s — page may be too large or unresponsive. Try browser_navigate instead.",
1351                              })
1352                              continue
1353  
1354                          scrape_payload = _extract_scrape_payload(scrape_result)
1355                          metadata = scrape_payload.get("metadata", {})
1356                          title = ""
1357                          content_markdown = scrape_payload.get("markdown")
1358                          content_html = scrape_payload.get("html")
1359  
1360                          # Ensure metadata is a dict (not an object)
1361                          if not isinstance(metadata, dict):
1362                              if hasattr(metadata, 'model_dump'):
1363                                  metadata = metadata.model_dump()
1364                              elif hasattr(metadata, '__dict__'):
1365                                  metadata = metadata.__dict__
1366                              else:
1367                                  metadata = {}
1368  
1369                          # Get title from metadata
1370                          title = metadata.get("title", "")
1371  
1372                          # Re-check final URL after redirect
1373                          final_url = metadata.get("sourceURL", url)
1374                          final_blocked = check_website_access(final_url)
1375                          if final_blocked:
1376                              logger.info("Blocked redirected web_extract for %s by rule %s", final_blocked["host"], final_blocked["rule"])
1377                              results.append({
1378                                  "url": final_url, "title": title, "content": "", "raw_content": "",
1379                                  "error": final_blocked["message"],
1380                                  "blocked_by_policy": {"host": final_blocked["host"], "rule": final_blocked["rule"], "source": final_blocked["source"]},
1381                              })
1382                              continue
1383  
1384                          # Choose content based on requested format
1385                          chosen_content = content_markdown if (format == "markdown" or (format is None and content_markdown)) else content_html or content_markdown or ""
1386  
1387                          results.append({
1388                              "url": final_url,
1389                              "title": title,
1390                              "content": chosen_content,
1391                              "raw_content": chosen_content,
1392                              "metadata": metadata  # Now guaranteed to be a dict
1393                          })
1394  
1395                      except Exception as scrape_err:
1396                          logger.debug("Scrape failed for %s: %s", url, scrape_err)
1397                          results.append({
1398                              "url": url,
1399                              "title": "",
1400                              "content": "",
1401                              "raw_content": "",
1402                              "error": str(scrape_err)
1403                          })
1404  
1405          # Merge any SSRF-blocked results back in
1406          if ssrf_blocked:
1407              results = ssrf_blocked + results
1408  
1409          response = {"results": results}
1410          
1411          pages_extracted = len(response.get('results', []))
1412          logger.info("Extracted content from %d pages", pages_extracted)
1413          
1414          debug_call_data["pages_extracted"] = pages_extracted
1415          debug_call_data["original_response_size"] = len(json.dumps(response))
1416          effective_model = model or _get_default_summarizer_model()
1417          auxiliary_available = check_auxiliary_model()
1418          
1419          # Process each result with LLM if enabled
1420          if use_llm_processing and auxiliary_available:
1421              logger.info("Processing extracted content with LLM (parallel)...")
1422              debug_call_data["processing_applied"].append("llm_processing")
1423              
1424              # Prepare tasks for parallel processing
1425              async def process_single_result(result):
1426                  """Process a single result with LLM and return updated result with metrics."""
1427                  url = result.get('url', 'Unknown URL')
1428                  title = result.get('title', '')
1429                  raw_content = result.get('raw_content', '') or result.get('content', '')
1430                  
1431                  if not raw_content:
1432                      return result, None, "no_content"
1433                  
1434                  original_size = len(raw_content)
1435                  
1436                  # Process content with LLM
1437                  processed = await process_content_with_llm(
1438                      raw_content, url, title, effective_model, min_length
1439                  )
1440                  
1441                  if processed:
1442                      processed_size = len(processed)
1443                      compression_ratio = processed_size / original_size if original_size > 0 else 1.0
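                          # e.g. a 12,000-char page summarized to 4,000 chars yields a compression_ratio of ~0.33 (illustrative numbers)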
1444                      
1445                      # Update result with processed content
1446                      result['content'] = processed
1447                      result['raw_content'] = raw_content
1448                      
1449                      metrics = {
1450                          "url": url,
1451                          "original_size": original_size,
1452                          "processed_size": processed_size,
1453                          "compression_ratio": compression_ratio,
1454                          "model_used": effective_model
1455                      }
1456                      return result, metrics, "processed"
1457                  else:
1458                      metrics = {
1459                          "url": url,
1460                          "original_size": original_size,
1461                          "processed_size": original_size,
1462                          "compression_ratio": 1.0,
1463                          "model_used": None,
1464                          "reason": "content_too_short"
1465                      }
1466                      return result, metrics, "too_short"
1467              
1468              # Run all LLM processing in parallel
1469              results_list = response.get('results', [])
1470              tasks = [process_single_result(result) for result in results_list]
1471              processed_results = await asyncio.gather(*tasks)
1472              
1473              # Collect metrics and print results
1474              for result, metrics, status in processed_results:
1475                  url = result.get('url', 'Unknown URL')
1476                  if status == "processed":
1477                      debug_call_data["compression_metrics"].append(metrics)
1478                      debug_call_data["pages_processed_with_llm"] += 1
1479                      logger.info("%s (processed)", url)
1480                  elif status == "too_short":
1481                      debug_call_data["compression_metrics"].append(metrics)
1482                      logger.info("%s (no processing - content too short)", url)
1483                  else:
1484                      logger.warning("%s (no content to process)", url)
1485          else:
1486              if use_llm_processing and not auxiliary_available:
1487                  logger.warning("LLM processing requested but no auxiliary model available, returning raw content")
1488                  debug_call_data["processing_applied"].append("llm_processing_unavailable")
1489              # Print summary of extracted pages for debugging (original behavior)
1490              for result in response.get('results', []):
1491                  url = result.get('url', 'Unknown URL')
1492                  content_length = len(result.get('raw_content', ''))
1493                  logger.info("%s (%d characters)", url, content_length)
1494          
1495          # Trim output to minimal fields per entry: title, content, error
1496          trimmed_results = [
1497              {
1498                  "url": r.get("url", ""),
1499                  "title": r.get("title", ""),
1500                  "content": r.get("content", ""),
1501                  "error": r.get("error"),
1502                  **({"blocked_by_policy": r["blocked_by_policy"]} if "blocked_by_policy" in r else {}),
1503              }
1504              for r in response.get("results", [])
1505          ]
1506          trimmed_response = {"results": trimmed_results}
1507  
1508          if not trimmed_results:
1509              result_json = tool_error("Content was inaccessible or not found")
1513          else:
1514              result_json = json.dumps(trimmed_response, indent=2, ensure_ascii=False)
1516          cleaned_result = clean_base64_images(result_json)
1517
1518          debug_call_data["final_response_size"] = len(cleaned_result)
1519          debug_call_data["processing_applied"].append("base64_image_removal")
1520          
1521          # Log debug information
1522          _debug.log_call("web_extract_tool", debug_call_data)
1523          _debug.save()
1524          
1525          return cleaned_result
1526              
1527      except Exception as e:
1528          error_msg = f"Error extracting content: {str(e)}"
1529          logger.debug("%s", error_msg)
1530          
1531          debug_call_data["error"] = error_msg
1532          _debug.log_call("web_extract_tool", debug_call_data)
1533          _debug.save()
1534          
1535          return tool_error(error_msg)
1536  
1537  
1538  async def web_crawl_tool(
1539      url: str, 
1540      instructions: Optional[str] = None,
1541      depth: str = "basic", 
1542      use_llm_processing: bool = True,
1543      model: Optional[str] = None,
1544      min_length: int = DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION
1545  ) -> str:
1546      """
1547      Crawl a website with specific instructions using available crawling API backend.
1548      
1549      This function provides a generic interface for web crawling that can work
1550      with multiple backends. Crawling is currently supported by the Firecrawl and Tavily backends.
1551      
1552      Args:
1553          url (str): The base URL to crawl (can include or exclude https://)
1554          instructions (Optional[str]): Natural-language instructions for what to crawl/extract (used by the Tavily backend; the Firecrawl crawl API ignores them)
1555          depth (str): Depth of extraction ("basic" or "advanced", default: "basic")
1556          use_llm_processing (bool): Whether to process content with LLM for summarization (default: True)
1557          model (Optional[str]): The model to use for LLM processing (defaults to current auxiliary backend model)
1558          min_length (int): Minimum content length to trigger LLM processing (default: 5000)
1559      
1560      Returns:
1561          str: JSON string containing crawled content. If LLM processing is enabled and successful,
1562               the 'content' field will contain the processed markdown summary instead of raw content.
1563               Each page is processed individually.
1564      
1565      Raises:
1566          Exception: If crawling fails or API key is not set
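
          Example (illustrative; the URL is a placeholder and instructions are only honored by backends that support them):
              crawl_data = await web_crawl_tool("docs.example.com", "Find the API reference pages")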
1567      """
1568      debug_call_data = {
1569          "parameters": {
1570              "url": url,
1571              "instructions": instructions,
1572              "depth": depth,
1573              "use_llm_processing": use_llm_processing,
1574              "model": model,
1575              "min_length": min_length
1576          },
1577          "error": None,
1578          "pages_crawled": 0,
1579          "pages_processed_with_llm": 0,
1580          "original_response_size": 0,
1581          "final_response_size": 0,
1582          "compression_metrics": [],
1583          "processing_applied": []
1584      }
1585      
1586      try:
1587          effective_model = model or _get_default_summarizer_model()
1588          auxiliary_available = check_auxiliary_model()
1589          backend = _get_backend()
1590  
1591          # Tavily supports crawl via its /crawl endpoint
1592          if backend == "tavily":
1593              # Ensure URL has protocol
1594              if not url.startswith(('http://', 'https://')):
1595                  url = f'https://{url}'
1596  
1597              # SSRF protection — block private/internal addresses
1598              if not is_safe_url(url):
1599                  return json.dumps({"results": [{"url": url, "title": "", "content": "",
1600                      "error": "Blocked: URL targets a private or internal network address"}]}, ensure_ascii=False)
1601  
1602              # Website policy check
1603              blocked = check_website_access(url)
1604              if blocked:
1605                  logger.info("Blocked web_crawl for %s by rule %s", blocked["host"], blocked["rule"])
1606                  return json.dumps({"results": [{"url": url, "title": "", "content": "", "error": blocked["message"],
1607                      "blocked_by_policy": {"host": blocked["host"], "rule": blocked["rule"], "source": blocked["source"]}}]}, ensure_ascii=False)
1608  
1609              from tools.interrupt import is_interrupted as _is_int
1610              if _is_int():
1611                  return tool_error("Interrupted", success=False)
1612  
1613              logger.info("Tavily crawl: %s", url)
1614              payload: Dict[str, Any] = {
1615                  "url": url,
1616                  "limit": 20,
1617                  "extract_depth": depth,
1618              }
1619              if instructions:
1620                  payload["instructions"] = instructions
1621              raw = _tavily_request("crawl", payload)
1622              results = _normalize_tavily_documents(raw, fallback_url=url)
1623  
1624              response = {"results": results}
1625              # The Tavily branch runs its own LLM processing and trimming below and
1626              # returns early, skipping the Firecrawl-specific crawl logic entirely.
1627              pages_crawled = len(response.get('results', []))
1628              logger.info("Crawled %d pages", pages_crawled)
1629              debug_call_data["pages_crawled"] = pages_crawled
1630              debug_call_data["original_response_size"] = len(json.dumps(response))
1631  
1632              # Process each result with LLM if enabled
1633              if use_llm_processing and auxiliary_available:
1634                  logger.info("Processing crawled content with LLM (parallel)...")
1635                  debug_call_data["processing_applied"].append("llm_processing")
1636  
1637                  async def _process_tavily_crawl(result):
1638                      page_url = result.get('url', 'Unknown URL')
1639                      title = result.get('title', '')
1640                      content = result.get('content', '')
1641                      if not content:
1642                          return result, None, "no_content"
1643                      original_size = len(content)
1644                      processed = await process_content_with_llm(content, page_url, title, effective_model, min_length)
1645                      if processed:
1646                          result['raw_content'] = content
1647                          result['content'] = processed
1648                          metrics = {"url": page_url, "original_size": original_size, "processed_size": len(processed),
1649                                     "compression_ratio": len(processed) / original_size if original_size else 1.0, "model_used": effective_model}
1650                          return result, metrics, "processed"
1651                      metrics = {"url": page_url, "original_size": original_size, "processed_size": original_size,
1652                                 "compression_ratio": 1.0, "model_used": None, "reason": "content_too_short"}
1653                      return result, metrics, "too_short"
1654  
1655                  tasks = [_process_tavily_crawl(r) for r in response.get('results', [])]
1656                  processed_results = await asyncio.gather(*tasks)
1657                  for result, metrics, status in processed_results:
1658                      if status == "processed":
1659                          debug_call_data["compression_metrics"].append(metrics)
1660                          debug_call_data["pages_processed_with_llm"] += 1
1661  
1662              if use_llm_processing and not auxiliary_available:
1663                  logger.warning("LLM processing requested but no auxiliary model available, returning raw content")
1664                  debug_call_data["processing_applied"].append("llm_processing_unavailable")
1665  
1666              trimmed_results = [{"url": r.get("url", ""), "title": r.get("title", ""), "content": r.get("content", ""), "error": r.get("error"),
1667                  **({"blocked_by_policy": r["blocked_by_policy"]} if "blocked_by_policy" in r else {})} for r in response.get("results", [])]
1668              result_json = json.dumps({"results": trimmed_results}, indent=2, ensure_ascii=False)
1669              cleaned_result = clean_base64_images(result_json)
1670              debug_call_data["final_response_size"] = len(cleaned_result)
1671              _debug.log_call("web_crawl_tool", debug_call_data)
1672              _debug.save()
1673              return cleaned_result
1674  
1675          # web_crawl requires Firecrawl or the Firecrawl tool-gateway — the Exa and Parallel backends have no crawl API
1676          if not check_firecrawl_api_key():
1677              return json.dumps({
1678                  "error": "web_crawl requires Firecrawl. Set FIRECRAWL_API_KEY, FIRECRAWL_API_URL"
1679                           f"{_firecrawl_backend_help_suffix()}, or use web_search + web_extract instead.",
1680                  "success": False,
1681              }, ensure_ascii=False)
1682  
1683          # Ensure URL has protocol
1684          if not url.startswith(('http://', 'https://')):
1685              url = f'https://{url}'
1686              logger.info("Added https:// prefix to URL: %s", url)
1687          
1688          instructions_text = f" with instructions: '{instructions}'" if instructions else ""
1689          logger.info("Crawling %s%s", url, instructions_text)
1690          
1691          # SSRF protection — block private/internal addresses
1692          if not is_safe_url(url):
1693              return json.dumps({"results": [{"url": url, "title": "", "content": "",
1694                  "error": "Blocked: URL targets a private or internal network address"}]}, ensure_ascii=False)
1695  
1696          # Website policy check — block before crawling
1697          blocked = check_website_access(url)
1698          if blocked:
1699              logger.info("Blocked web_crawl for %s by rule %s", blocked["host"], blocked["rule"])
1700              return json.dumps({"results": [{"url": url, "title": "", "content": "", "error": blocked["message"],
1701                  "blocked_by_policy": {"host": blocked["host"], "rule": blocked["rule"], "source": blocked["source"]}}]}, ensure_ascii=False)
1702  
1703          # Use Firecrawl's v2 crawl functionality
1704          # Docs: https://docs.firecrawl.dev/features/crawl
1705          # The crawl() method automatically waits for completion and returns all data
1706          
1707          # Build crawl parameters - keep it simple
1708          crawl_params = {
1709              "limit": 20,  # Limit number of pages to crawl
1710              "scrape_options": {
1711                  "formats": ["markdown"]  # Just markdown for simplicity
1712              }
1713          }
1714          
1715          # Note: The 'prompt' parameter is not documented for crawl
1716          # Instructions are typically used with the Extract endpoint, not Crawl
1717          if instructions:
1718              logger.info("Instructions parameter ignored (not supported in crawl API)")
1719          
1720          from tools.interrupt import is_interrupted as _is_int
1721          if _is_int():
1722              return tool_error("Interrupted", success=False)
1723  
1724          try:
1725              crawl_result = _get_firecrawl_client().crawl(
1726                  url=url,
1727                  **crawl_params
1728              )
1729          except Exception as e:
1730              logger.debug("Crawl API call failed: %s", e)
1731              raise
1732  
1733          pages: List[Dict[str, Any]] = []
1734          
1735          # Process crawl results - the crawl method returns a CrawlJob object with data attribute
1736          data_list = []
1737          
1738          # The crawl_result is a CrawlJob object with a 'data' attribute containing list of Document objects
1739          if hasattr(crawl_result, 'data'):
1740              data_list = crawl_result.data if crawl_result.data else []
1741              logger.info("Status: %s", getattr(crawl_result, 'status', 'unknown'))
1742              logger.info("Retrieved %d pages", len(data_list))
1743              
1744              # Debug: Check other attributes if no data
1745              if not data_list:
1746                  logger.debug("CrawlJob attributes: %s", [attr for attr in dir(crawl_result) if not attr.startswith('_')])
1747                  logger.debug("Status: %s", getattr(crawl_result, 'status', 'N/A'))
1748                  logger.debug("Total: %s", getattr(crawl_result, 'total', 'N/A'))
1749                  logger.debug("Completed: %s", getattr(crawl_result, 'completed', 'N/A'))
1750                  
1751          elif isinstance(crawl_result, dict) and 'data' in crawl_result:
1752              data_list = crawl_result.get("data", [])
1753          else:
1754              logger.warning("Unexpected crawl result type")
1755              logger.debug("Result type: %s", type(crawl_result))
1756              if hasattr(crawl_result, '__dict__'):
1757                  logger.debug("Result attributes: %s", list(crawl_result.__dict__.keys()))
1758          
1759          for item in data_list:
1760              # Process each crawled page - properly handle object serialization
1761              page_url = "Unknown URL"
1762              title = ""
1763              content_markdown = None
1764              content_html = None
1765              metadata = {}
1766              
1767              # Extract data from the item
1768              if hasattr(item, 'model_dump'):
1769                  # Pydantic model - use model_dump to get dict
1770                  item_dict = item.model_dump()
1771                  content_markdown = item_dict.get('markdown')
1772                  content_html = item_dict.get('html')
1773                  metadata = item_dict.get('metadata', {})
1774              elif hasattr(item, '__dict__'):
1775                  # Regular object with attributes
1776                  content_markdown = getattr(item, 'markdown', None)
1777                  content_html = getattr(item, 'html', None)
1778                  
1779                  # Handle metadata - convert to dict if it's an object
1780                  metadata_obj = getattr(item, 'metadata', {})
1781                  if hasattr(metadata_obj, 'model_dump'):
1782                      metadata = metadata_obj.model_dump()
1783                  elif hasattr(metadata_obj, '__dict__'):
1784                      metadata = metadata_obj.__dict__
1785                  elif isinstance(metadata_obj, dict):
1786                      metadata = metadata_obj
1787                  else:
1788                      metadata = {}
1789              elif isinstance(item, dict):
1790                  # Already a dictionary
1791                  content_markdown = item.get('markdown')
1792                  content_html = item.get('html')
1793                  metadata = item.get('metadata', {})
1794              
1795              # Ensure metadata is a dict (not an object)
1796              if not isinstance(metadata, dict):
1797                  if hasattr(metadata, 'model_dump'):
1798                      metadata = metadata.model_dump()
1799                  elif hasattr(metadata, '__dict__'):
1800                      metadata = metadata.__dict__
1801                  else:
1802                      metadata = {}
1803              
1804              # Extract URL and title from metadata
1805              page_url = metadata.get("sourceURL", metadata.get("url", "Unknown URL"))
1806              title = metadata.get("title", "")
1807              
1808              # Re-check crawled page URL against policy
1809              page_blocked = check_website_access(page_url)
1810              if page_blocked:
1811                  logger.info("Blocked crawled page %s by rule %s", page_blocked["host"], page_blocked["rule"])
1812                  pages.append({
1813                      "url": page_url, "title": title, "content": "", "raw_content": "",
1814                      "error": page_blocked["message"],
1815                      "blocked_by_policy": {"host": page_blocked["host"], "rule": page_blocked["rule"], "source": page_blocked["source"]},
1816                  })
1817                  continue
1818  
1819              # Choose content (prefer markdown)
1820              content = content_markdown or content_html or ""
1821              
1822              pages.append({
1823                  "url": page_url,
1824                  "title": title,
1825                  "content": content,
1826                  "raw_content": content,
1827                  "metadata": metadata  # Now guaranteed to be a dict
1828              })
1829  
1830          response = {"results": pages}
1831          
1832          pages_crawled = len(response.get('results', []))
1833          logger.info("Crawled %d pages", pages_crawled)
1834          
1835          debug_call_data["pages_crawled"] = pages_crawled
1836          debug_call_data["original_response_size"] = len(json.dumps(response))
1837          
1838          # Process each result with LLM if enabled
1839          if use_llm_processing and auxiliary_available:
1840              logger.info("Processing crawled content with LLM (parallel)...")
1841              debug_call_data["processing_applied"].append("llm_processing")
1842              
1843              # Prepare tasks for parallel processing
1844              async def process_single_crawl_result(result):
1845                  """Process a single crawl result with LLM and return updated result with metrics."""
1846                  page_url = result.get('url', 'Unknown URL')
1847                  title = result.get('title', '')
1848                  content = result.get('content', '')
1849                  
1850                  if not content:
1851                      return result, None, "no_content"
1852                  
1853                  original_size = len(content)
1854                  
1855                  # Process content with LLM
1856                  processed = await process_content_with_llm(
1857                      content, page_url, title, effective_model, min_length
1858                  )
1859                  
1860                  if processed:
1861                      processed_size = len(processed)
1862                      compression_ratio = processed_size / original_size if original_size > 0 else 1.0
1863                      
1864                      # Update result with processed content
1865                      result['raw_content'] = content
1866                      result['content'] = processed
1867                      
1868                      metrics = {
1869                          "url": page_url,
1870                          "original_size": original_size,
1871                          "processed_size": processed_size,
1872                          "compression_ratio": compression_ratio,
1873                          "model_used": effective_model
1874                      }
1875                      return result, metrics, "processed"
1876                  else:
1877                      metrics = {
1878                          "url": page_url,
1879                          "original_size": original_size,
1880                          "processed_size": original_size,
1881                          "compression_ratio": 1.0,
1882                          "model_used": None,
1883                          "reason": "content_too_short"
1884                      }
1885                      return result, metrics, "too_short"
1886              
1887              # Run all LLM processing in parallel
1888              results_list = response.get('results', [])
1889              tasks = [process_single_crawl_result(result) for result in results_list]
1890              processed_results = await asyncio.gather(*tasks)
1891              
1892              # Collect metrics and print results
1893              for result, metrics, status in processed_results:
1894                  page_url = result.get('url', 'Unknown URL')
1895                  if status == "processed":
1896                      debug_call_data["compression_metrics"].append(metrics)
1897                      debug_call_data["pages_processed_with_llm"] += 1
1898                      logger.info("%s (processed)", page_url)
1899                  elif status == "too_short":
1900                      debug_call_data["compression_metrics"].append(metrics)
1901                      logger.info("%s (no processing - content too short)", page_url)
1902                  else:
1903                      logger.warning("%s (no content to process)", page_url)
1904          else:
1905              if use_llm_processing and not auxiliary_available:
1906                  logger.warning("LLM processing requested but no auxiliary model available, returning raw content")
1907                  debug_call_data["processing_applied"].append("llm_processing_unavailable")
1908              # Print summary of crawled pages for debugging (original behavior)
1909              for result in response.get('results', []):
1910                  page_url = result.get('url', 'Unknown URL')
1911                  content_length = len(result.get('content', ''))
1912                  logger.info("%s (%d characters)", page_url, content_length)
1913          
1914          # Trim output to minimal fields per entry: title, content, error
1915          trimmed_results = [
1916              {
1917                  "url": r.get("url", ""),
1918                  "title": r.get("title", ""),
1919                  "content": r.get("content", ""),
1920                  "error": r.get("error"),
1921                  **({"blocked_by_policy": r["blocked_by_policy"]} if "blocked_by_policy" in r else {}),
1922              }
1923              for r in response.get("results", [])
1924          ]
1925          trimmed_response = {"results": trimmed_results}
1926          
1927          result_json = json.dumps(trimmed_response, indent=2, ensure_ascii=False)
1928          # Clean base64 images from crawled content
1929          cleaned_result = clean_base64_images(result_json)
1930          
1931          debug_call_data["final_response_size"] = len(cleaned_result)
1932          debug_call_data["processing_applied"].append("base64_image_removal")
1933          
1934          # Log debug information
1935          _debug.log_call("web_crawl_tool", debug_call_data)
1936          _debug.save()
1937          
1938          return cleaned_result
1939          
1940      except Exception as e:
1941          error_msg = f"Error crawling website: {str(e)}"
1942          logger.debug("%s", error_msg)
1943          
1944          debug_call_data["error"] = error_msg
1945          _debug.log_call("web_crawl_tool", debug_call_data)
1946          _debug.save()
1947          
1948          return tool_error(error_msg)
1949  
1950  
1951  # Convenience function to check Firecrawl availability (direct credentials or tool-gateway)
1952  def check_firecrawl_api_key() -> bool:
1953      """
1954      Check whether the Firecrawl backend is available.
1955  
1956      Firecrawl is considered available when either:
1957      1) direct Firecrawl config (`FIRECRAWL_API_KEY` or `FIRECRAWL_API_URL`), or
1958      2) Firecrawl gateway origin + Nous Subscriber access token
1959         (fallback when direct Firecrawl is not configured).
1960  
1961      Returns:
1962          bool: True if direct Firecrawl or the tool-gateway can be used.
1963      """
1964      return _has_direct_firecrawl_config() or _is_tool_gateway_ready()
1965  
1966  
1967  def check_web_api_key() -> bool:
1968      """Check whether the configured web backend is available (any available backend counts when none is explicitly configured)."""
1969      configured = _load_web_config().get("backend", "").lower().strip()
1970      if configured in ("exa", "parallel", "firecrawl", "tavily"):
1971          return _is_backend_available(configured)
1972      return any(_is_backend_available(backend) for backend in ("exa", "parallel", "firecrawl", "tavily"))
1973  
1974  
1975  def check_auxiliary_model() -> bool:
1976      """Check if an auxiliary text model is available for LLM content processing."""
1977      client, _, _ = _resolve_web_extract_auxiliary()
1978      return client is not None
1979  
1980  
1981  
1982  
1983  if __name__ == "__main__":
1984      # Simple test/demo when run directly.
1987      print("🌐 Standalone Web Tools Module")
1988      print("=" * 40)
1989      
1990      # Check if API keys are available
1991      web_available = check_web_api_key()
1992      tool_gateway_available = _is_tool_gateway_ready()
1993      firecrawl_key_available = bool(os.getenv("FIRECRAWL_API_KEY", "").strip())
1994      firecrawl_url_available = bool(os.getenv("FIRECRAWL_API_URL", "").strip())
1995      nous_available = check_auxiliary_model()
1996      default_summarizer_model = _get_default_summarizer_model()
1997  
1998      if web_available:
1999          backend = _get_backend()
2000          print(f"✅ Web backend: {backend}")
2001          if backend == "exa":
2002              print("   Using Exa API (https://exa.ai)")
2003          elif backend == "parallel":
2004              print("   Using Parallel API (https://parallel.ai)")
2005          elif backend == "tavily":
2006              print("   Using Tavily API (https://tavily.com)")
2007          else:
2008              if firecrawl_url_available:
2009                  print(f"   Using self-hosted Firecrawl: {os.getenv('FIRECRAWL_API_URL').strip().rstrip('/')}")
2010              elif firecrawl_key_available:
2011                  print("   Using direct Firecrawl cloud API")
2012              elif tool_gateway_available:
2013                  print(f"   Using Firecrawl tool-gateway: {_get_firecrawl_gateway_url()}")
2014              else:
2015                  print("   Firecrawl backend selected but not configured")
2016      else:
2017          print("❌ No web search backend configured")
2018          print(
2019              "Set EXA_API_KEY, PARALLEL_API_KEY, TAVILY_API_KEY, FIRECRAWL_API_KEY, FIRECRAWL_API_URL"
2020              f"{_firecrawl_backend_help_suffix()}"
2021          )
2022  
2023      if not nous_available:
2024          print("❌ No auxiliary model available for LLM content processing")
2025          print("Set OPENROUTER_API_KEY, configure Nous Portal, or set OPENAI_BASE_URL + OPENAI_API_KEY")
2026          print("⚠️  Without an auxiliary model, LLM content processing will be disabled")
2027      else:
2028          print(f"✅ Auxiliary model available: {default_summarizer_model}")
2029  
2030      if not web_available:
2031          exit(1)
2032  
2033      print("🛠️  Web tools ready for use!")
2034      
2035      if nous_available:
2036          print(f"🧠 LLM content processing available with {default_summarizer_model}")
2037          print(f"   Default min length for processing: {DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION} chars")
2038      
2039      # Show debug mode status
2040      if _debug.active:
2041          print(f"🐛 Debug mode ENABLED - Session ID: {_debug.session_id}")
2042          print(f"   Debug logs will be saved to: {_debug.log_dir}/web_tools_debug_{_debug.session_id}.json")
2043      else:
2044          print("🐛 Debug mode disabled (set WEB_TOOLS_DEBUG=true to enable)")
2045      
2046      print("\nBasic usage:")
2047      print("  from web_tools import web_search_tool, web_extract_tool, web_crawl_tool")
2048      print("  import asyncio")
2049      print("")
2050      print("  # Search (synchronous)")
2051      print("  results = web_search_tool('Python tutorials')")
2052      print("")
2053      print("  # Extract and crawl (asynchronous)")
2054      print("  async def main():")
2055      print("      content = await web_extract_tool(['https://example.com'])")
2056      print("      crawl_data = await web_crawl_tool('example.com', 'Find docs')")
2057      print("  asyncio.run(main())")
2058      
2059      if nous_available:
2060          print("\nLLM-enhanced usage:")
2061          print("  # Content automatically processed for pages >5000 chars (default)")
2062          print("  content = await web_extract_tool(['https://python.org/about/'])")
2063          print("")
2064          print("  # Customize processing parameters")
2065          print("  crawl_data = await web_crawl_tool(")
2066          print("      'docs.python.org',")
2067          print("      'Find key concepts',")
2068          print("      model='google/gemini-3-flash-preview',")
2069          print("      min_length=3000")
2070          print("  )")
2071          print("")
2072          print("  # Disable LLM processing")
2073          print("  raw_content = await web_extract_tool(['https://example.com'], use_llm_processing=False)")
2074      
2075      print("\nDebug mode:")
2076      print("  # Enable debug logging")
2077      print("  export WEB_TOOLS_DEBUG=true")
2078      print("  # Debug logs capture:")
2079      print("  # - All tool calls with parameters")
2080      print("  # - Original API responses")
2081      print("  # - LLM compression metrics")
2082      print("  # - Final processed results")
2083      print("  # Logs saved to: ./logs/web_tools_debug_UUID.json")
2084      
2085      print("\n📝 Run 'python test_web_tools_llm.py' to test LLM processing capabilities")
2086  
2087  
2088  # ---------------------------------------------------------------------------
2089  # Registry
2090  # ---------------------------------------------------------------------------
2091  from tools.registry import registry, tool_error
2092  
2093  WEB_SEARCH_SCHEMA = {
2094      "name": "web_search",
2095      "description": "Search the web for information. Returns up to 5 results by default with titles, URLs, and descriptions. The query is passed through to the configured backend, so operators such as site:domain, filetype:pdf, intitle:word, -term, and \"exact phrase\" may work when the backend supports them.",
2096      "parameters": {
2097          "type": "object",
2098          "properties": {
2099              "query": {
2100                  "type": "string",
2101                  "description": "The search query to look up on the web. You may include backend-supported operators such as site:example.com, filetype:pdf, intitle:word, -term, or \"exact phrase\"."
2102              },
2103              "limit": {
2104                  "type": "integer",
2105                  "description": "Maximum number of results to return. Defaults to 5.",
2106                  "minimum": 1,
2107                  "maximum": 100,
2108                  "default": 5
2109              }
2110          },
2111          "required": ["query"]
2112      }
2113  }
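
      # Example web_search tool-call arguments matching the schema above (illustrative values):
      #   {"query": "site:python.org asyncio tutorial", "limit": 3}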
2114  
2115  WEB_EXTRACT_SCHEMA = {
2116      "name": "web_extract",
2117      "description": "Extract content from web page URLs. Returns page content in markdown format. Also works with PDF URLs (arxiv papers, documents, etc.) — pass the PDF link directly and it converts to markdown text. Pages under 5000 chars return full markdown; larger pages are LLM-summarized and capped at ~5000 chars per page. Pages over 2M chars are refused. If a URL fails or times out, use the browser tool to access it instead.",
2118      "parameters": {
2119          "type": "object",
2120          "properties": {
2121              "urls": {
2122                  "type": "array",
2123                  "items": {"type": "string"},
2124                  "description": "List of URLs to extract content from (max 5 URLs per call)",
2125                  "maxItems": 5
2126              }
2127          },
2128          "required": ["urls"]
2129      }
2130  }
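
      # Example web_extract tool-call arguments matching the schema above (illustrative values):
      #   {"urls": ["https://example.org/docs/intro", "https://example.org/whitepaper.pdf"]}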
2131  
2132  registry.register(
2133      name="web_search",
2134      toolset="web",
2135      schema=WEB_SEARCH_SCHEMA,
2136      handler=lambda args, **kw: web_search_tool(args.get("query", ""), limit=args.get("limit", 5)),
2137      check_fn=check_web_api_key,
2138      requires_env=_web_requires_env(),
2139      emoji="🔍",
2140      max_result_size_chars=100_000,
2141  )
2142  registry.register(
2143      name="web_extract",
2144      toolset="web",
2145      schema=WEB_EXTRACT_SCHEMA,
2146      handler=lambda args, **kw: web_extract_tool(
2147          args.get("urls", [])[:5] if isinstance(args.get("urls"), list) else [], "markdown"),
2148      check_fn=check_web_api_key,
2149      requires_env=_web_requires_env(),
2150      is_async=True,
2151      emoji="📄",
2152      max_result_size_chars=100_000,
2153  )