web_tools.py
1 #!/usr/bin/env python3 2 """ 3 Standalone Web Tools Module 4 5 This module provides generic web tools that work with multiple backend providers. 6 Backend is selected during ``hermes tools`` setup (web.backend in config.yaml). 7 When available, Hermes can route Firecrawl calls through a Nous-hosted tool-gateway 8 for Nous Subscribers only. 9 10 Available tools: 11 - web_search_tool: Search the web for information 12 - web_extract_tool: Extract content from specific web pages 13 - web_crawl_tool: Crawl websites with specific instructions 14 15 Backend compatibility: 16 - Exa: https://exa.ai (search, extract) 17 - Firecrawl: https://docs.firecrawl.dev/introduction (search, extract, crawl; direct or derived firecrawl-gateway.<domain> for Nous Subscribers) 18 - Parallel: https://docs.parallel.ai (search, extract) 19 - Tavily: https://tavily.com (search, extract, crawl) 20 21 LLM Processing: 22 - Uses OpenRouter API with Gemini 3 Flash Preview for intelligent content extraction 23 - Extracts key excerpts and creates markdown summaries to reduce token usage 24 25 Debug Mode: 26 - Set WEB_TOOLS_DEBUG=true to enable detailed logging 27 - Creates web_tools_debug_UUID.json in ./logs directory 28 - Captures all tool calls, results, and compression metrics 29 30 Usage: 31 from web_tools import web_search_tool, web_extract_tool, web_crawl_tool 32 33 # Search the web 34 results = web_search_tool("Python machine learning libraries", limit=3) 35 36 # Extract content from URLs 37 content = web_extract_tool(["https://example.com"], format="markdown") 38 39 # Crawl a website 40 crawl_data = web_crawl_tool("example.com", "Find contact information") 41 """ 42 43 import json 44 import logging 45 import os 46 import re 47 import asyncio 48 from typing import List, Dict, Any, Optional, TYPE_CHECKING 49 import httpx 50 # NOTE: `from firecrawl import Firecrawl` is deliberately NOT at module top — 51 # the SDK pulls ~200 ms of imports (httpcore, firecrawl.v1/v2 type trees) and 52 # we only need it when the backend is actually "firecrawl". We expose 53 # ``Firecrawl`` as a thin proxy that imports the SDK on first call/ 54 # isinstance check, so both (a) the in-module ``Firecrawl(...)`` construction 55 # site in _get_firecrawl_client() works unchanged, and (b) tests using 56 # ``patch("tools.web_tools.Firecrawl", ...)`` keep working. 57 if TYPE_CHECKING: 58 from firecrawl import Firecrawl # noqa: F401 — type hints only 59 60 _FIRECRAWL_CLS_CACHE: Optional[type] = None 61 62 63 def _load_firecrawl_cls() -> type: 64 """Import and cache ``firecrawl.Firecrawl``.""" 65 global _FIRECRAWL_CLS_CACHE 66 if _FIRECRAWL_CLS_CACHE is None: 67 from firecrawl import Firecrawl as _cls 68 _FIRECRAWL_CLS_CACHE = _cls 69 return _FIRECRAWL_CLS_CACHE 70 71 72 class _FirecrawlProxy: 73 """Module-level proxy that looks like ``firecrawl.Firecrawl`` but imports lazily.""" 74 75 __slots__ = () 76 77 def __call__(self, *args, **kwargs): 78 return _load_firecrawl_cls()(*args, **kwargs) 79 80 def __instancecheck__(self, obj): 81 return isinstance(obj, _load_firecrawl_cls()) 82 83 def __repr__(self): 84 return "<lazy firecrawl.Firecrawl proxy>" 85 86 87 Firecrawl = _FirecrawlProxy() 88 89 from agent.auxiliary_client import ( 90 async_call_llm, 91 extract_content_or_reasoning, 92 get_async_text_auxiliary_client, 93 ) 94 from tools.debug_helpers import DebugSession 95 from tools.managed_tool_gateway import ( 96 build_vendor_gateway_url, 97 read_nous_access_token as _read_nous_access_token, 98 resolve_managed_tool_gateway, 99 ) 100 from tools.tool_backend_helpers import managed_nous_tools_enabled, prefers_gateway 101 from tools.url_safety import is_safe_url 102 from tools.website_policy import check_website_access 103 104 logger = logging.getLogger(__name__) 105 106 107 # ─── Backend Selection ──────────────────────────────────────────────────────── 108 109 def _has_env(name: str) -> bool: 110 val = os.getenv(name) 111 return bool(val and val.strip()) 112 113 def _load_web_config() -> dict: 114 """Load the ``web:`` section from ~/.hermes/config.yaml.""" 115 try: 116 from hermes_cli.config import load_config 117 return load_config().get("web", {}) 118 except (ImportError, Exception): 119 return {} 120 121 def _get_backend() -> str: 122 """Determine which web backend to use. 123 124 Reads ``web.backend`` from config.yaml (set by ``hermes tools``). 125 Falls back to whichever API key is present for users who configured 126 keys manually without running setup. 127 """ 128 configured = (_load_web_config().get("backend") or "").lower().strip() 129 if configured in ("parallel", "firecrawl", "tavily", "exa"): 130 return configured 131 132 # Fallback for manual / legacy config — pick the highest-priority 133 # available backend. Firecrawl also counts as available when the managed 134 # tool gateway is configured for Nous subscribers. 135 backend_candidates = ( 136 ("firecrawl", _has_env("FIRECRAWL_API_KEY") or _has_env("FIRECRAWL_API_URL") or _is_tool_gateway_ready()), 137 ("parallel", _has_env("PARALLEL_API_KEY")), 138 ("tavily", _has_env("TAVILY_API_KEY")), 139 ("exa", _has_env("EXA_API_KEY")), 140 ) 141 for backend, available in backend_candidates: 142 if available: 143 return backend 144 145 return "firecrawl" # default (backward compat) 146 147 148 def _is_backend_available(backend: str) -> bool: 149 """Return True when the selected backend is currently usable.""" 150 if backend == "exa": 151 return _has_env("EXA_API_KEY") 152 if backend == "parallel": 153 return _has_env("PARALLEL_API_KEY") 154 if backend == "firecrawl": 155 return check_firecrawl_api_key() 156 if backend == "tavily": 157 return _has_env("TAVILY_API_KEY") 158 return False 159 160 # ─── Firecrawl Client ──────────────────────────────────────────────────────── 161 162 _firecrawl_client = None 163 _firecrawl_client_config = None 164 165 166 def _get_direct_firecrawl_config() -> Optional[tuple[Dict[str, str], tuple[str, Optional[str], Optional[str]]]]: 167 """Return explicit direct Firecrawl kwargs + cache key, or None when unset.""" 168 api_key = os.getenv("FIRECRAWL_API_KEY", "").strip() 169 api_url = os.getenv("FIRECRAWL_API_URL", "").strip().rstrip("/") 170 171 if not api_key and not api_url: 172 return None 173 174 kwargs: Dict[str, str] = {} 175 if api_key: 176 kwargs["api_key"] = api_key 177 if api_url: 178 kwargs["api_url"] = api_url 179 180 return kwargs, ("direct", api_url or None, api_key or None) 181 182 183 def _get_firecrawl_gateway_url() -> str: 184 """Return configured Firecrawl gateway URL.""" 185 return build_vendor_gateway_url("firecrawl") 186 187 188 def _is_tool_gateway_ready() -> bool: 189 """Return True when gateway URL and a Nous Subscriber token are available.""" 190 return resolve_managed_tool_gateway("firecrawl", token_reader=_read_nous_access_token) is not None 191 192 193 def _has_direct_firecrawl_config() -> bool: 194 """Return True when direct Firecrawl config is explicitly configured.""" 195 return _get_direct_firecrawl_config() is not None 196 197 198 def _raise_web_backend_configuration_error() -> None: 199 """Raise a clear error for unsupported web backend configuration.""" 200 message = ( 201 "Web tools are not configured. " 202 "Set FIRECRAWL_API_KEY for cloud Firecrawl or set FIRECRAWL_API_URL for a self-hosted Firecrawl instance." 203 ) 204 if managed_nous_tools_enabled(): 205 message += ( 206 " With your Nous subscription you can also use the Tool Gateway — " 207 "run `hermes tools` and select Nous Subscription as the web provider." 208 ) 209 raise ValueError(message) 210 211 212 def _firecrawl_backend_help_suffix() -> str: 213 """Return optional managed-gateway guidance for Firecrawl help text.""" 214 if not managed_nous_tools_enabled(): 215 return "" 216 return ( 217 ", or use the Nous Tool Gateway via your subscription " 218 "(FIRECRAWL_GATEWAY_URL or TOOL_GATEWAY_DOMAIN)" 219 ) 220 221 222 def _web_requires_env() -> list[str]: 223 """Return tool metadata env vars for the currently enabled web backends.""" 224 requires = [ 225 "EXA_API_KEY", 226 "PARALLEL_API_KEY", 227 "TAVILY_API_KEY", 228 "FIRECRAWL_API_KEY", 229 "FIRECRAWL_API_URL", 230 ] 231 if managed_nous_tools_enabled(): 232 requires.extend( 233 [ 234 "FIRECRAWL_GATEWAY_URL", 235 "TOOL_GATEWAY_DOMAIN", 236 "TOOL_GATEWAY_SCHEME", 237 "TOOL_GATEWAY_USER_TOKEN", 238 ] 239 ) 240 return requires 241 242 243 def _get_firecrawl_client(): 244 """Get or create Firecrawl client. 245 246 When ``web.use_gateway`` is set in config, the Tool Gateway is preferred 247 even if direct Firecrawl credentials are present. Otherwise direct 248 Firecrawl takes precedence when explicitly configured. 249 """ 250 global _firecrawl_client, _firecrawl_client_config 251 252 direct_config = _get_direct_firecrawl_config() 253 if direct_config is not None and not prefers_gateway("web"): 254 kwargs, client_config = direct_config 255 else: 256 managed_gateway = resolve_managed_tool_gateway( 257 "firecrawl", 258 token_reader=_read_nous_access_token, 259 ) 260 if managed_gateway is None: 261 logger.error("Firecrawl client initialization failed: missing direct config and tool-gateway auth.") 262 _raise_web_backend_configuration_error() 263 264 kwargs = { 265 "api_key": managed_gateway.nous_user_token, 266 "api_url": managed_gateway.gateway_origin, 267 } 268 client_config = ( 269 "tool-gateway", 270 kwargs["api_url"], 271 managed_gateway.nous_user_token, 272 ) 273 274 if _firecrawl_client is not None and _firecrawl_client_config == client_config: 275 return _firecrawl_client 276 277 # Uses the module-level `Firecrawl` name (lazy proxy at module top). 278 _firecrawl_client = Firecrawl(**kwargs) 279 _firecrawl_client_config = client_config 280 return _firecrawl_client 281 282 # ─── Parallel Client ───────────────────────────────────────────────────────── 283 284 _parallel_client = None 285 _async_parallel_client = None 286 287 def _get_parallel_client(): 288 """Get or create the Parallel sync client (lazy initialization). 289 290 Requires PARALLEL_API_KEY environment variable. 291 """ 292 from parallel import Parallel 293 global _parallel_client 294 if _parallel_client is None: 295 api_key = os.getenv("PARALLEL_API_KEY") 296 if not api_key: 297 raise ValueError( 298 "PARALLEL_API_KEY environment variable not set. " 299 "Get your API key at https://parallel.ai" 300 ) 301 _parallel_client = Parallel(api_key=api_key) 302 return _parallel_client 303 304 305 def _get_async_parallel_client(): 306 """Get or create the Parallel async client (lazy initialization). 307 308 Requires PARALLEL_API_KEY environment variable. 309 """ 310 from parallel import AsyncParallel 311 global _async_parallel_client 312 if _async_parallel_client is None: 313 api_key = os.getenv("PARALLEL_API_KEY") 314 if not api_key: 315 raise ValueError( 316 "PARALLEL_API_KEY environment variable not set. " 317 "Get your API key at https://parallel.ai" 318 ) 319 _async_parallel_client = AsyncParallel(api_key=api_key) 320 return _async_parallel_client 321 322 # ─── Tavily Client ─────────────────────────────────────────────────────────── 323 324 _TAVILY_BASE_URL = os.getenv("TAVILY_BASE_URL", "https://api.tavily.com") 325 326 327 def _tavily_request(endpoint: str, payload: dict) -> dict: 328 """Send a POST request to the Tavily API. 329 330 Auth is provided via ``api_key`` in the JSON body (no header-based auth). 331 Raises ``ValueError`` if ``TAVILY_API_KEY`` is not set. 332 """ 333 api_key = os.getenv("TAVILY_API_KEY") 334 if not api_key: 335 raise ValueError( 336 "TAVILY_API_KEY environment variable not set. " 337 "Get your API key at https://app.tavily.com/home" 338 ) 339 payload["api_key"] = api_key 340 url = f"{_TAVILY_BASE_URL}/{endpoint.lstrip('/')}" 341 logger.info("Tavily %s request to %s", endpoint, url) 342 response = httpx.post(url, json=payload, timeout=60) 343 response.raise_for_status() 344 return response.json() 345 346 347 def _normalize_tavily_search_results(response: dict) -> dict: 348 """Normalize Tavily /search response to the standard web search format. 349 350 Tavily returns ``{results: [{title, url, content, score, ...}]}``. 351 We map to ``{success, data: {web: [{title, url, description, position}]}}``. 352 """ 353 web_results = [] 354 for i, result in enumerate(response.get("results", [])): 355 web_results.append({ 356 "title": result.get("title", ""), 357 "url": result.get("url", ""), 358 "description": result.get("content", ""), 359 "position": i + 1, 360 }) 361 return {"success": True, "data": {"web": web_results}} 362 363 364 def _normalize_tavily_documents(response: dict, fallback_url: str = "") -> List[Dict[str, Any]]: 365 """Normalize Tavily /extract or /crawl response to the standard document format. 366 367 Maps results to ``{url, title, content, raw_content, metadata}`` and 368 includes any ``failed_results`` / ``failed_urls`` as error entries. 369 """ 370 documents: List[Dict[str, Any]] = [] 371 for result in response.get("results", []): 372 url = result.get("url", fallback_url) 373 raw = result.get("raw_content", "") or result.get("content", "") 374 documents.append({ 375 "url": url, 376 "title": result.get("title", ""), 377 "content": raw, 378 "raw_content": raw, 379 "metadata": {"sourceURL": url, "title": result.get("title", "")}, 380 }) 381 # Handle failed results 382 for fail in response.get("failed_results", []): 383 documents.append({ 384 "url": fail.get("url", fallback_url), 385 "title": "", 386 "content": "", 387 "raw_content": "", 388 "error": fail.get("error", "extraction failed"), 389 "metadata": {"sourceURL": fail.get("url", fallback_url)}, 390 }) 391 for fail_url in response.get("failed_urls", []): 392 url_str = fail_url if isinstance(fail_url, str) else str(fail_url) 393 documents.append({ 394 "url": url_str, 395 "title": "", 396 "content": "", 397 "raw_content": "", 398 "error": "extraction failed", 399 "metadata": {"sourceURL": url_str}, 400 }) 401 return documents 402 403 404 def _to_plain_object(value: Any) -> Any: 405 """Convert SDK objects to plain python data structures when possible.""" 406 if value is None: 407 return None 408 409 if isinstance(value, (dict, list, str, int, float, bool)): 410 return value 411 412 if hasattr(value, "model_dump"): 413 try: 414 return value.model_dump() 415 except Exception: 416 pass 417 418 if hasattr(value, "__dict__"): 419 try: 420 return {k: v for k, v in value.__dict__.items() if not k.startswith("_")} 421 except Exception: 422 pass 423 424 return value 425 426 427 def _normalize_result_list(values: Any) -> List[Dict[str, Any]]: 428 """Normalize mixed SDK/list payloads into a list of dicts.""" 429 if not isinstance(values, list): 430 return [] 431 432 normalized: List[Dict[str, Any]] = [] 433 for item in values: 434 plain = _to_plain_object(item) 435 if isinstance(plain, dict): 436 normalized.append(plain) 437 return normalized 438 439 440 def _extract_web_search_results(response: Any) -> List[Dict[str, Any]]: 441 """Extract Firecrawl search results across SDK/direct/gateway response shapes.""" 442 response_plain = _to_plain_object(response) 443 444 if isinstance(response_plain, dict): 445 data = response_plain.get("data") 446 if isinstance(data, list): 447 return _normalize_result_list(data) 448 449 if isinstance(data, dict): 450 data_web = _normalize_result_list(data.get("web")) 451 if data_web: 452 return data_web 453 data_results = _normalize_result_list(data.get("results")) 454 if data_results: 455 return data_results 456 457 top_web = _normalize_result_list(response_plain.get("web")) 458 if top_web: 459 return top_web 460 461 top_results = _normalize_result_list(response_plain.get("results")) 462 if top_results: 463 return top_results 464 465 if hasattr(response, "web"): 466 return _normalize_result_list(getattr(response, "web", [])) 467 468 return [] 469 470 471 def _extract_scrape_payload(scrape_result: Any) -> Dict[str, Any]: 472 """Normalize Firecrawl scrape payload shape across SDK and gateway variants.""" 473 result_plain = _to_plain_object(scrape_result) 474 if not isinstance(result_plain, dict): 475 return {} 476 477 nested = result_plain.get("data") 478 if isinstance(nested, dict): 479 return nested 480 481 return result_plain 482 483 484 DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION = 5000 485 486 def _is_nous_auxiliary_client(client: Any) -> bool: 487 """Return True when the resolved auxiliary backend is Nous Portal.""" 488 from urllib.parse import urlparse 489 490 base_url = str(getattr(client, "base_url", "") or "") 491 host = (urlparse(base_url).hostname or "").lower() 492 return host == "nousresearch.com" or host.endswith(".nousresearch.com") 493 494 495 def _resolve_web_extract_auxiliary(model: Optional[str] = None) -> tuple[Optional[Any], Optional[str], Dict[str, Any]]: 496 """Resolve the current web-extract auxiliary client, model, and extra body.""" 497 client, default_model = get_async_text_auxiliary_client("web_extract") 498 configured_model = os.getenv("AUXILIARY_WEB_EXTRACT_MODEL", "").strip() 499 effective_model = model or configured_model or default_model 500 501 extra_body: Dict[str, Any] = {} 502 if client is not None and _is_nous_auxiliary_client(client): 503 from agent.auxiliary_client import get_auxiliary_extra_body 504 extra_body = get_auxiliary_extra_body() or {"tags": ["product=hermes-agent"]} 505 506 return client, effective_model, extra_body 507 508 509 def _get_default_summarizer_model() -> Optional[str]: 510 """Return the current default model for web extraction summarization.""" 511 _, model, _ = _resolve_web_extract_auxiliary() 512 return model 513 514 _debug = DebugSession("web_tools", env_var="WEB_TOOLS_DEBUG") 515 516 517 async def process_content_with_llm( 518 content: str, 519 url: str = "", 520 title: str = "", 521 model: Optional[str] = None, 522 min_length: int = DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION 523 ) -> Optional[str]: 524 """ 525 Process web content using LLM to create intelligent summaries with key excerpts. 526 527 This function uses Gemini 3 Flash Preview (or specified model) via OpenRouter API 528 to intelligently extract key information and create markdown summaries, 529 significantly reducing token usage while preserving all important information. 530 531 For very large content (>500k chars), uses chunked processing with synthesis. 532 For extremely large content (>2M chars), refuses to process entirely. 533 534 Args: 535 content (str): The raw content to process 536 url (str): The source URL (for context, optional) 537 title (str): The page title (for context, optional) 538 model (str): The model to use for processing (default: google/gemini-3-flash-preview) 539 min_length (int): Minimum content length to trigger processing (default: 5000) 540 541 Returns: 542 Optional[str]: Processed markdown content, or None if content too short or processing fails 543 """ 544 # Size thresholds 545 MAX_CONTENT_SIZE = 2_000_000 # 2M chars - refuse entirely above this 546 CHUNK_THRESHOLD = 500_000 # 500k chars - use chunked processing above this 547 CHUNK_SIZE = 100_000 # 100k chars per chunk 548 MAX_OUTPUT_SIZE = 5000 # Hard cap on final output size 549 550 try: 551 content_len = len(content) 552 553 # Refuse if content is absurdly large 554 if content_len > MAX_CONTENT_SIZE: 555 size_mb = content_len / 1_000_000 556 logger.warning("Content too large (%.1fMB > 2MB limit). Refusing to process.", size_mb) 557 return f"[Content too large to process: {size_mb:.1f}MB. Try using web_crawl with specific extraction instructions, or search for a more focused source.]" 558 559 # Skip processing if content is too short 560 if content_len < min_length: 561 logger.debug("Content too short (%d < %d chars), skipping LLM processing", content_len, min_length) 562 return None 563 564 # Create context information 565 context_info = [] 566 if title: 567 context_info.append(f"Title: {title}") 568 if url: 569 context_info.append(f"Source: {url}") 570 context_str = "\n".join(context_info) + "\n\n" if context_info else "" 571 572 # Check if we need chunked processing 573 if content_len > CHUNK_THRESHOLD: 574 logger.info("Content large (%d chars). Using chunked processing...", content_len) 575 return await _process_large_content_chunked( 576 content, context_str, model, CHUNK_SIZE, MAX_OUTPUT_SIZE 577 ) 578 579 # Standard single-pass processing for normal content 580 logger.info("Processing content with LLM (%d characters)", content_len) 581 582 processed_content = await _call_summarizer_llm(content, context_str, model) 583 584 if processed_content: 585 # Enforce output cap 586 if len(processed_content) > MAX_OUTPUT_SIZE: 587 processed_content = processed_content[:MAX_OUTPUT_SIZE] + "\n\n[... summary truncated for context management ...]" 588 589 # Log compression metrics 590 processed_length = len(processed_content) 591 compression_ratio = processed_length / content_len if content_len > 0 else 1.0 592 logger.info("Content processed: %d -> %d chars (%.1f%%)", content_len, processed_length, compression_ratio * 100) 593 594 return processed_content 595 596 except Exception as e: 597 logger.warning( 598 "web_extract LLM summarization failed (%s). " 599 "Tip: increase auxiliary.web_extract.timeout in config.yaml " 600 "or switch to a faster auxiliary model.", 601 str(e)[:120], 602 ) 603 # Fall back to truncated raw content instead of returning a useless 604 # error message. The first ~5000 chars are almost always more useful 605 # to the model than "[Failed to process content: ...]". 606 truncated = content[:MAX_OUTPUT_SIZE] 607 if len(content) > MAX_OUTPUT_SIZE: 608 truncated += ( 609 f"\n\n[Content truncated — showing first {MAX_OUTPUT_SIZE:,} of " 610 f"{len(content):,} chars. LLM summarization timed out. " 611 f"To fix: increase auxiliary.web_extract.timeout in config.yaml, " 612 f"or use a faster auxiliary model. Use browser_navigate for the full page.]" 613 ) 614 return truncated 615 616 617 async def _call_summarizer_llm( 618 content: str, 619 context_str: str, 620 model: Optional[str], 621 max_tokens: int = 20000, 622 is_chunk: bool = False, 623 chunk_info: str = "" 624 ) -> Optional[str]: 625 """ 626 Make a single LLM call to summarize content. 627 628 Args: 629 content: The content to summarize 630 context_str: Context information (title, URL) 631 model: Model to use 632 max_tokens: Maximum output tokens 633 is_chunk: Whether this is a chunk of a larger document 634 chunk_info: Information about chunk position (e.g., "Chunk 2/5") 635 636 Returns: 637 Summarized content or None on failure 638 """ 639 if is_chunk: 640 # Chunk-specific prompt - aware that this is partial content 641 system_prompt = """You are an expert content analyst processing a SECTION of a larger document. Your job is to extract and summarize the key information from THIS SECTION ONLY. 642 643 Important guidelines for chunk processing: 644 1. Do NOT write introductions or conclusions - this is a partial document 645 2. Focus on extracting ALL key facts, figures, data points, and insights from this section 646 3. Preserve important quotes, code snippets, and specific details verbatim 647 4. Use bullet points and structured formatting for easy synthesis later 648 5. Note any references to other sections (e.g., "as mentioned earlier", "see below") without trying to resolve them 649 650 Your output will be combined with summaries of other sections, so focus on thorough extraction rather than narrative flow.""" 651 652 user_prompt = f"""Extract key information from this SECTION of a larger document: 653 654 {context_str}{chunk_info} 655 656 SECTION CONTENT: 657 {content} 658 659 Extract all important information from this section in a structured format. Focus on facts, data, insights, and key details. Do not add introductions or conclusions.""" 660 661 else: 662 # Standard full-document prompt 663 system_prompt = """You are an expert content analyst. Your job is to process web content and create a comprehensive yet concise summary that preserves all important information while dramatically reducing bulk. 664 665 Create a well-structured markdown summary that includes: 666 1. Key excerpts (quotes, code snippets, important facts) in their original format 667 2. Comprehensive summary of all other important information 668 3. Proper markdown formatting with headers, bullets, and emphasis 669 670 Your goal is to preserve ALL important information while reducing length. Never lose key facts, figures, insights, or actionable information. Make it scannable and well-organized.""" 671 672 user_prompt = f"""Please process this web content and create a comprehensive markdown summary: 673 674 {context_str}CONTENT TO PROCESS: 675 {content} 676 677 Create a markdown summary that captures all key information in a well-organized, scannable format. Include important quotes and code snippets in their original formatting. Focus on actionable information, specific details, and unique insights.""" 678 679 # Call the LLM with retry logic — keep retries low since summarization 680 # is a nice-to-have; the caller falls back to truncated content on failure. 681 max_retries = 2 682 retry_delay = 2 683 last_error = None 684 685 for attempt in range(max_retries): 686 try: 687 aux_client, effective_model, extra_body = _resolve_web_extract_auxiliary(model) 688 if aux_client is None or not effective_model: 689 logger.warning("No auxiliary model available for web content processing") 690 return None 691 call_kwargs = { 692 "task": "web_extract", 693 "model": effective_model, 694 "messages": [ 695 {"role": "system", "content": system_prompt}, 696 {"role": "user", "content": user_prompt}, 697 ], 698 "temperature": 0.1, 699 "max_tokens": max_tokens, 700 # No explicit timeout — async_call_llm reads auxiliary.web_extract.timeout 701 # from config (default 360s / 6min). Users with slow local models can 702 # increase it in config.yaml. 703 } 704 if extra_body: 705 call_kwargs["extra_body"] = extra_body 706 response = await async_call_llm(**call_kwargs) 707 content = extract_content_or_reasoning(response) 708 if content: 709 return content 710 # Reasoning-only / empty response — let the retry loop handle it 711 logger.warning("LLM returned empty content (attempt %d/%d), retrying", attempt + 1, max_retries) 712 if attempt < max_retries - 1: 713 await asyncio.sleep(retry_delay) 714 retry_delay = min(retry_delay * 2, 60) 715 continue 716 return content # Return whatever we got after exhausting retries 717 except RuntimeError: 718 logger.warning("No auxiliary model available for web content processing") 719 return None 720 except Exception as api_error: 721 last_error = api_error 722 if attempt < max_retries - 1: 723 logger.warning("LLM API call failed (attempt %d/%d): %s", attempt + 1, max_retries, str(api_error)[:100]) 724 logger.warning("Retrying in %ds...", retry_delay) 725 await asyncio.sleep(retry_delay) 726 retry_delay = min(retry_delay * 2, 60) 727 else: 728 raise last_error 729 730 return None 731 732 733 async def _process_large_content_chunked( 734 content: str, 735 context_str: str, 736 model: Optional[str], 737 chunk_size: int, 738 max_output_size: int 739 ) -> Optional[str]: 740 """ 741 Process large content by chunking, summarizing each chunk in parallel, 742 then synthesizing the summaries. 743 744 Args: 745 content: The large content to process 746 context_str: Context information 747 model: Model to use 748 chunk_size: Size of each chunk in characters 749 max_output_size: Maximum final output size 750 751 Returns: 752 Synthesized summary or None on failure 753 """ 754 # Split content into chunks 755 chunks = [] 756 for i in range(0, len(content), chunk_size): 757 chunk = content[i:i + chunk_size] 758 chunks.append(chunk) 759 760 logger.info("Split into %d chunks of ~%d chars each", len(chunks), chunk_size) 761 762 # Summarize each chunk in parallel 763 async def summarize_chunk(chunk_idx: int, chunk_content: str) -> tuple[int, Optional[str]]: 764 """Summarize a single chunk.""" 765 try: 766 chunk_info = f"[Processing chunk {chunk_idx + 1} of {len(chunks)}]" 767 summary = await _call_summarizer_llm( 768 chunk_content, 769 context_str, 770 model, 771 max_tokens=10000, 772 is_chunk=True, 773 chunk_info=chunk_info 774 ) 775 if summary: 776 logger.info("Chunk %d/%d summarized: %d -> %d chars", chunk_idx + 1, len(chunks), len(chunk_content), len(summary)) 777 return chunk_idx, summary 778 except Exception as e: 779 logger.warning("Chunk %d/%d failed: %s", chunk_idx + 1, len(chunks), str(e)[:50]) 780 return chunk_idx, None 781 782 # Run all chunk summarizations in parallel 783 tasks = [summarize_chunk(i, chunk) for i, chunk in enumerate(chunks)] 784 results = await asyncio.gather(*tasks) 785 786 # Collect successful summaries in order 787 summaries = [] 788 for chunk_idx, summary in sorted(results, key=lambda x: x[0]): 789 if summary: 790 summaries.append(f"## Section {chunk_idx + 1}\n{summary}") 791 792 if not summaries: 793 logger.debug("All chunk summarizations failed") 794 return "[Failed to process large content: all chunk summarizations failed]" 795 796 logger.info("Got %d/%d chunk summaries", len(summaries), len(chunks)) 797 798 # If only one chunk succeeded, just return it (with cap) 799 if len(summaries) == 1: 800 result = summaries[0] 801 if len(result) > max_output_size: 802 result = result[:max_output_size] + "\n\n[... truncated ...]" 803 return result 804 805 # Synthesize the summaries into a final summary 806 logger.info("Synthesizing %d summaries...", len(summaries)) 807 808 combined_summaries = "\n\n---\n\n".join(summaries) 809 810 synthesis_prompt = f"""You have been given summaries of different sections of a large document. 811 Synthesize these into ONE cohesive, comprehensive summary that: 812 1. Removes redundancy between sections 813 2. Preserves all key facts, figures, and actionable information 814 3. Is well-organized with clear structure 815 4. Is under {max_output_size} characters 816 817 {context_str}SECTION SUMMARIES: 818 {combined_summaries} 819 820 Create a single, unified markdown summary.""" 821 822 try: 823 aux_client, effective_model, extra_body = _resolve_web_extract_auxiliary(model) 824 if aux_client is None or not effective_model: 825 logger.warning("No auxiliary model for synthesis, concatenating summaries") 826 fallback = "\n\n".join(summaries) 827 if len(fallback) > max_output_size: 828 fallback = fallback[:max_output_size] + "\n\n[... truncated ...]" 829 return fallback 830 831 call_kwargs = { 832 "task": "web_extract", 833 "model": effective_model, 834 "messages": [ 835 {"role": "system", "content": "You synthesize multiple summaries into one cohesive, comprehensive summary. Be thorough but concise."}, 836 {"role": "user", "content": synthesis_prompt}, 837 ], 838 "temperature": 0.1, 839 "max_tokens": 20000, 840 } 841 if extra_body: 842 call_kwargs["extra_body"] = extra_body 843 response = await async_call_llm(**call_kwargs) 844 final_summary = extract_content_or_reasoning(response) 845 846 # Retry once on empty content (reasoning-only response) 847 if not final_summary: 848 logger.warning("Synthesis LLM returned empty content, retrying once") 849 response = await async_call_llm(**call_kwargs) 850 final_summary = extract_content_or_reasoning(response) 851 852 # If still None after retry, fall back to concatenated summaries 853 if not final_summary: 854 logger.warning("Synthesis failed after retry — concatenating chunk summaries") 855 fallback = "\n\n".join(summaries) 856 if len(fallback) > max_output_size: 857 fallback = fallback[:max_output_size] + "\n\n[... truncated ...]" 858 return fallback 859 860 # Enforce hard cap 861 if len(final_summary) > max_output_size: 862 final_summary = final_summary[:max_output_size] + "\n\n[... summary truncated for context management ...]" 863 864 original_len = len(content) 865 final_len = len(final_summary) 866 compression = final_len / original_len if original_len > 0 else 1.0 867 868 logger.info("Synthesis complete: %d -> %d chars (%.2f%%)", original_len, final_len, compression * 100) 869 return final_summary 870 871 except Exception as e: 872 logger.warning("Synthesis failed: %s", str(e)[:100]) 873 # Fall back to concatenated summaries with truncation 874 fallback = "\n\n".join(summaries) 875 if len(fallback) > max_output_size: 876 fallback = fallback[:max_output_size] + "\n\n[... truncated due to synthesis failure ...]" 877 return fallback 878 879 880 def clean_base64_images(text: str) -> str: 881 """ 882 Remove base64 encoded images from text to reduce token count and clutter. 883 884 This function finds and removes base64 encoded images in various formats: 885 - (data:image/png;base64,...) 886 - (data:image/jpeg;base64,...) 887 - (data:image/svg+xml;base64,...) 888 - data:image/[type];base64,... (without parentheses) 889 890 Args: 891 text: The text content to clean 892 893 Returns: 894 Cleaned text with base64 images replaced with placeholders 895 """ 896 # Pattern to match base64 encoded images wrapped in parentheses 897 # Matches: (data:image/[type];base64,[base64-string]) 898 base64_with_parens_pattern = r'\(data:image/[^;]+;base64,[A-Za-z0-9+/=]+\)' 899 900 # Pattern to match base64 encoded images without parentheses 901 # Matches: data:image/[type];base64,[base64-string] 902 base64_pattern = r'data:image/[^;]+;base64,[A-Za-z0-9+/=]+' 903 904 # Replace parentheses-wrapped images first 905 cleaned_text = re.sub(base64_with_parens_pattern, '[BASE64_IMAGE_REMOVED]', text) 906 907 # Then replace any remaining non-parentheses images 908 cleaned_text = re.sub(base64_pattern, '[BASE64_IMAGE_REMOVED]', cleaned_text) 909 910 return cleaned_text 911 912 913 # ─── Exa Client ────────────────────────────────────────────────────────────── 914 915 _exa_client = None 916 917 def _get_exa_client(): 918 """Get or create the Exa client (lazy initialization). 919 920 Requires EXA_API_KEY environment variable. 921 """ 922 from exa_py import Exa 923 global _exa_client 924 if _exa_client is None: 925 api_key = os.getenv("EXA_API_KEY") 926 if not api_key: 927 raise ValueError( 928 "EXA_API_KEY environment variable not set. " 929 "Get your API key at https://exa.ai" 930 ) 931 _exa_client = Exa(api_key=api_key) 932 _exa_client.headers["x-exa-integration"] = "hermes-agent" 933 return _exa_client 934 935 936 # ─── Exa Search & Extract Helpers ───────────────────────────────────────────── 937 938 def _exa_search(query: str, limit: int = 10) -> dict: 939 """Search using the Exa SDK and return results as a dict.""" 940 from tools.interrupt import is_interrupted 941 if is_interrupted(): 942 return {"error": "Interrupted", "success": False} 943 944 logger.info("Exa search: '%s' (limit=%d)", query, limit) 945 response = _get_exa_client().search( 946 query, 947 num_results=limit, 948 contents={ 949 "highlights": True, 950 }, 951 ) 952 953 web_results = [] 954 for i, result in enumerate(response.results or []): 955 highlights = result.highlights or [] 956 web_results.append({ 957 "url": result.url or "", 958 "title": result.title or "", 959 "description": " ".join(highlights) if highlights else "", 960 "position": i + 1, 961 }) 962 963 return {"success": True, "data": {"web": web_results}} 964 965 966 def _exa_extract(urls: List[str]) -> List[Dict[str, Any]]: 967 """Extract content from URLs using the Exa SDK. 968 969 Returns a list of result dicts matching the structure expected by the 970 LLM post-processing pipeline (url, title, content, metadata). 971 """ 972 from tools.interrupt import is_interrupted 973 if is_interrupted(): 974 return [{"url": u, "error": "Interrupted", "title": ""} for u in urls] 975 976 logger.info("Exa extract: %d URL(s)", len(urls)) 977 response = _get_exa_client().get_contents( 978 urls, 979 text=True, 980 ) 981 982 results = [] 983 for result in response.results or []: 984 content = result.text or "" 985 url = result.url or "" 986 title = result.title or "" 987 results.append({ 988 "url": url, 989 "title": title, 990 "content": content, 991 "raw_content": content, 992 "metadata": {"sourceURL": url, "title": title}, 993 }) 994 995 return results 996 997 998 # ─── Parallel Search & Extract Helpers ──────────────────────────────────────── 999 1000 def _parallel_search(query: str, limit: int = 5) -> dict: 1001 """Search using the Parallel SDK and return results as a dict.""" 1002 from tools.interrupt import is_interrupted 1003 if is_interrupted(): 1004 return {"error": "Interrupted", "success": False} 1005 1006 mode = os.getenv("PARALLEL_SEARCH_MODE", "agentic").lower().strip() 1007 if mode not in ("fast", "one-shot", "agentic"): 1008 mode = "agentic" 1009 1010 logger.info("Parallel search: '%s' (mode=%s, limit=%d)", query, mode, limit) 1011 response = _get_parallel_client().beta.search( 1012 search_queries=[query], 1013 objective=query, 1014 mode=mode, 1015 max_results=min(limit, 20), 1016 ) 1017 1018 web_results = [] 1019 for i, result in enumerate(response.results or []): 1020 excerpts = result.excerpts or [] 1021 web_results.append({ 1022 "url": result.url or "", 1023 "title": result.title or "", 1024 "description": " ".join(excerpts) if excerpts else "", 1025 "position": i + 1, 1026 }) 1027 1028 return {"success": True, "data": {"web": web_results}} 1029 1030 1031 async def _parallel_extract(urls: List[str]) -> List[Dict[str, Any]]: 1032 """Extract content from URLs using the Parallel async SDK. 1033 1034 Returns a list of result dicts matching the structure expected by the 1035 LLM post-processing pipeline (url, title, content, metadata). 1036 """ 1037 from tools.interrupt import is_interrupted 1038 if is_interrupted(): 1039 return [{"url": u, "error": "Interrupted", "title": ""} for u in urls] 1040 1041 logger.info("Parallel extract: %d URL(s)", len(urls)) 1042 response = await _get_async_parallel_client().beta.extract( 1043 urls=urls, 1044 full_content=True, 1045 ) 1046 1047 results = [] 1048 for result in response.results or []: 1049 content = result.full_content or "" 1050 if not content: 1051 content = "\n\n".join(result.excerpts or []) 1052 url = result.url or "" 1053 title = result.title or "" 1054 results.append({ 1055 "url": url, 1056 "title": title, 1057 "content": content, 1058 "raw_content": content, 1059 "metadata": {"sourceURL": url, "title": title}, 1060 }) 1061 1062 for error in response.errors or []: 1063 results.append({ 1064 "url": error.url or "", 1065 "title": "", 1066 "content": "", 1067 "error": error.content or error.error_type or "extraction failed", 1068 "metadata": {"sourceURL": error.url or ""}, 1069 }) 1070 1071 return results 1072 1073 1074 def web_search_tool(query: str, limit: int = 5) -> str: 1075 """ 1076 Search the web for information using available search API backend. 1077 1078 This function provides a generic interface for web search that can work 1079 with multiple backends (Parallel or Firecrawl). 1080 1081 Note: This function returns search result metadata only (URLs, titles, descriptions). 1082 Use web_extract_tool to get full content from specific URLs. 1083 1084 Args: 1085 query (str): The search query to look up 1086 limit (int): Maximum number of results to return (default: 5) 1087 1088 Returns: 1089 str: JSON string containing search results with the following structure: 1090 { 1091 "success": bool, 1092 "data": { 1093 "web": [ 1094 { 1095 "title": str, 1096 "url": str, 1097 "description": str, 1098 "position": int 1099 }, 1100 ... 1101 ] 1102 } 1103 } 1104 1105 Raises: 1106 Exception: If search fails or API key is not set 1107 """ 1108 try: 1109 limit = int(limit) 1110 except (TypeError, ValueError): 1111 limit = 5 1112 limit = min(max(limit, 1), 100) 1113 1114 debug_call_data = { 1115 "parameters": { 1116 "query": query, 1117 "limit": limit 1118 }, 1119 "error": None, 1120 "results_count": 0, 1121 "original_response_size": 0, 1122 "final_response_size": 0 1123 } 1124 1125 try: 1126 from tools.interrupt import is_interrupted 1127 if is_interrupted(): 1128 return tool_error("Interrupted", success=False) 1129 1130 # Dispatch to the configured backend 1131 backend = _get_backend() 1132 if backend == "parallel": 1133 response_data = _parallel_search(query, limit) 1134 debug_call_data["results_count"] = len(response_data.get("data", {}).get("web", [])) 1135 result_json = json.dumps(response_data, indent=2, ensure_ascii=False) 1136 debug_call_data["final_response_size"] = len(result_json) 1137 _debug.log_call("web_search_tool", debug_call_data) 1138 _debug.save() 1139 return result_json 1140 1141 if backend == "exa": 1142 response_data = _exa_search(query, limit) 1143 debug_call_data["results_count"] = len(response_data.get("data", {}).get("web", [])) 1144 result_json = json.dumps(response_data, indent=2, ensure_ascii=False) 1145 debug_call_data["final_response_size"] = len(result_json) 1146 _debug.log_call("web_search_tool", debug_call_data) 1147 _debug.save() 1148 return result_json 1149 1150 if backend == "tavily": 1151 logger.info("Tavily search: '%s' (limit: %d)", query, limit) 1152 raw = _tavily_request("search", { 1153 "query": query, 1154 "max_results": min(limit, 20), 1155 "include_raw_content": False, 1156 "include_images": False, 1157 }) 1158 response_data = _normalize_tavily_search_results(raw) 1159 debug_call_data["results_count"] = len(response_data.get("data", {}).get("web", [])) 1160 result_json = json.dumps(response_data, indent=2, ensure_ascii=False) 1161 debug_call_data["final_response_size"] = len(result_json) 1162 _debug.log_call("web_search_tool", debug_call_data) 1163 _debug.save() 1164 return result_json 1165 1166 logger.info("Searching the web for: '%s' (limit: %d)", query, limit) 1167 1168 response = _get_firecrawl_client().search( 1169 query=query, 1170 limit=limit 1171 ) 1172 1173 web_results = _extract_web_search_results(response) 1174 results_count = len(web_results) 1175 logger.info("Found %d search results", results_count) 1176 1177 # Build response with just search metadata (URLs, titles, descriptions) 1178 response_data = { 1179 "success": True, 1180 "data": { 1181 "web": web_results 1182 } 1183 } 1184 1185 # Capture debug information 1186 debug_call_data["results_count"] = results_count 1187 1188 # Convert to JSON 1189 result_json = json.dumps(response_data, indent=2, ensure_ascii=False) 1190 1191 debug_call_data["final_response_size"] = len(result_json) 1192 1193 # Log debug information 1194 _debug.log_call("web_search_tool", debug_call_data) 1195 _debug.save() 1196 1197 return result_json 1198 1199 except Exception as e: 1200 error_msg = f"Error searching web: {str(e)}" 1201 logger.debug("%s", error_msg) 1202 1203 debug_call_data["error"] = error_msg 1204 _debug.log_call("web_search_tool", debug_call_data) 1205 _debug.save() 1206 1207 return tool_error(error_msg) 1208 1209 1210 async def web_extract_tool( 1211 urls: List[str], 1212 format: str = None, 1213 use_llm_processing: bool = True, 1214 model: Optional[str] = None, 1215 min_length: int = DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION 1216 ) -> str: 1217 """ 1218 Extract content from specific web pages using available extraction API backend. 1219 1220 This function provides a generic interface for web content extraction that 1221 can work with multiple backends. Currently uses Firecrawl. 1222 1223 Args: 1224 urls (List[str]): List of URLs to extract content from 1225 format (str): Desired output format ("markdown" or "html", optional) 1226 use_llm_processing (bool): Whether to process content with LLM for summarization (default: True) 1227 model (Optional[str]): The model to use for LLM processing (defaults to current auxiliary backend model) 1228 min_length (int): Minimum content length to trigger LLM processing (default: 5000) 1229 1230 Security: URLs are checked for embedded secrets before fetching. 1231 1232 Returns: 1233 str: JSON string containing extracted content. If LLM processing is enabled and successful, 1234 the 'content' field will contain the processed markdown summary instead of raw content. 1235 1236 Raises: 1237 Exception: If extraction fails or API key is not set 1238 """ 1239 # Block URLs containing embedded secrets (exfiltration prevention). 1240 # URL-decode first so percent-encoded secrets (%73k- = sk-) are caught. 1241 from agent.redact import _PREFIX_RE 1242 from urllib.parse import unquote 1243 for _url in urls: 1244 if _PREFIX_RE.search(_url) or _PREFIX_RE.search(unquote(_url)): 1245 return json.dumps({ 1246 "success": False, 1247 "error": "Blocked: URL contains what appears to be an API key or token. " 1248 "Secrets must not be sent in URLs.", 1249 }) 1250 1251 debug_call_data = { 1252 "parameters": { 1253 "urls": urls, 1254 "format": format, 1255 "use_llm_processing": use_llm_processing, 1256 "model": model, 1257 "min_length": min_length 1258 }, 1259 "error": None, 1260 "pages_extracted": 0, 1261 "pages_processed_with_llm": 0, 1262 "original_response_size": 0, 1263 "final_response_size": 0, 1264 "compression_metrics": [], 1265 "processing_applied": [] 1266 } 1267 1268 try: 1269 logger.info("Extracting content from %d URL(s)", len(urls)) 1270 1271 # ── SSRF protection — filter out private/internal URLs before any backend ── 1272 safe_urls = [] 1273 ssrf_blocked: List[Dict[str, Any]] = [] 1274 for url in urls: 1275 if not is_safe_url(url): 1276 ssrf_blocked.append({ 1277 "url": url, "title": "", "content": "", 1278 "error": "Blocked: URL targets a private or internal network address", 1279 }) 1280 else: 1281 safe_urls.append(url) 1282 1283 # Dispatch only safe URLs to the configured backend 1284 if not safe_urls: 1285 results = [] 1286 else: 1287 backend = _get_backend() 1288 1289 if backend == "parallel": 1290 results = await _parallel_extract(safe_urls) 1291 elif backend == "exa": 1292 results = _exa_extract(safe_urls) 1293 elif backend == "tavily": 1294 logger.info("Tavily extract: %d URL(s)", len(safe_urls)) 1295 raw = _tavily_request("extract", { 1296 "urls": safe_urls, 1297 "include_images": False, 1298 }) 1299 results = _normalize_tavily_documents(raw, fallback_url=safe_urls[0] if safe_urls else "") 1300 else: 1301 # ── Firecrawl extraction ── 1302 # Determine requested formats for Firecrawl v2 1303 formats: List[str] = [] 1304 if format == "markdown": 1305 formats = ["markdown"] 1306 elif format == "html": 1307 formats = ["html"] 1308 else: 1309 # Default: request markdown for LLM-readiness and include html as backup 1310 formats = ["markdown", "html"] 1311 1312 # Always use individual scraping for simplicity and reliability 1313 # Batch scraping adds complexity without much benefit for small numbers of URLs 1314 results: List[Dict[str, Any]] = [] 1315 1316 from tools.interrupt import is_interrupted as _is_interrupted 1317 for url in safe_urls: 1318 if _is_interrupted(): 1319 results.append({"url": url, "error": "Interrupted", "title": ""}) 1320 continue 1321 1322 # Website policy check — block before fetching 1323 blocked = check_website_access(url) 1324 if blocked: 1325 logger.info("Blocked web_extract for %s by rule %s", blocked["host"], blocked["rule"]) 1326 results.append({ 1327 "url": url, "title": "", "content": "", 1328 "error": blocked["message"], 1329 "blocked_by_policy": {"host": blocked["host"], "rule": blocked["rule"], "source": blocked["source"]}, 1330 }) 1331 continue 1332 1333 try: 1334 logger.info("Scraping: %s", url) 1335 # Run synchronous Firecrawl scrape in a thread with a 1336 # 60s timeout so a hung fetch doesn't block the session. 1337 try: 1338 scrape_result = await asyncio.wait_for( 1339 asyncio.to_thread( 1340 _get_firecrawl_client().scrape, 1341 url=url, 1342 formats=formats, 1343 ), 1344 timeout=60, 1345 ) 1346 except asyncio.TimeoutError: 1347 logger.warning("Firecrawl scrape timed out for %s", url) 1348 results.append({ 1349 "url": url, "title": "", "content": "", 1350 "error": "Scrape timed out after 60s — page may be too large or unresponsive. Try browser_navigate instead.", 1351 }) 1352 continue 1353 1354 scrape_payload = _extract_scrape_payload(scrape_result) 1355 metadata = scrape_payload.get("metadata", {}) 1356 title = "" 1357 content_markdown = scrape_payload.get("markdown") 1358 content_html = scrape_payload.get("html") 1359 1360 # Ensure metadata is a dict (not an object) 1361 if not isinstance(metadata, dict): 1362 if hasattr(metadata, 'model_dump'): 1363 metadata = metadata.model_dump() 1364 elif hasattr(metadata, '__dict__'): 1365 metadata = metadata.__dict__ 1366 else: 1367 metadata = {} 1368 1369 # Get title from metadata 1370 title = metadata.get("title", "") 1371 1372 # Re-check final URL after redirect 1373 final_url = metadata.get("sourceURL", url) 1374 final_blocked = check_website_access(final_url) 1375 if final_blocked: 1376 logger.info("Blocked redirected web_extract for %s by rule %s", final_blocked["host"], final_blocked["rule"]) 1377 results.append({ 1378 "url": final_url, "title": title, "content": "", "raw_content": "", 1379 "error": final_blocked["message"], 1380 "blocked_by_policy": {"host": final_blocked["host"], "rule": final_blocked["rule"], "source": final_blocked["source"]}, 1381 }) 1382 continue 1383 1384 # Choose content based on requested format 1385 chosen_content = content_markdown if (format == "markdown" or (format is None and content_markdown)) else content_html or content_markdown or "" 1386 1387 results.append({ 1388 "url": final_url, 1389 "title": title, 1390 "content": chosen_content, 1391 "raw_content": chosen_content, 1392 "metadata": metadata # Now guaranteed to be a dict 1393 }) 1394 1395 except Exception as scrape_err: 1396 logger.debug("Scrape failed for %s: %s", url, scrape_err) 1397 results.append({ 1398 "url": url, 1399 "title": "", 1400 "content": "", 1401 "raw_content": "", 1402 "error": str(scrape_err) 1403 }) 1404 1405 # Merge any SSRF-blocked results back in 1406 if ssrf_blocked: 1407 results = ssrf_blocked + results 1408 1409 response = {"results": results} 1410 1411 pages_extracted = len(response.get('results', [])) 1412 logger.info("Extracted content from %d pages", pages_extracted) 1413 1414 debug_call_data["pages_extracted"] = pages_extracted 1415 debug_call_data["original_response_size"] = len(json.dumps(response)) 1416 effective_model = model or _get_default_summarizer_model() 1417 auxiliary_available = check_auxiliary_model() 1418 1419 # Process each result with LLM if enabled 1420 if use_llm_processing and auxiliary_available: 1421 logger.info("Processing extracted content with LLM (parallel)...") 1422 debug_call_data["processing_applied"].append("llm_processing") 1423 1424 # Prepare tasks for parallel processing 1425 async def process_single_result(result): 1426 """Process a single result with LLM and return updated result with metrics.""" 1427 url = result.get('url', 'Unknown URL') 1428 title = result.get('title', '') 1429 raw_content = result.get('raw_content', '') or result.get('content', '') 1430 1431 if not raw_content: 1432 return result, None, "no_content" 1433 1434 original_size = len(raw_content) 1435 1436 # Process content with LLM 1437 processed = await process_content_with_llm( 1438 raw_content, url, title, effective_model, min_length 1439 ) 1440 1441 if processed: 1442 processed_size = len(processed) 1443 compression_ratio = processed_size / original_size if original_size > 0 else 1.0 1444 1445 # Update result with processed content 1446 result['content'] = processed 1447 result['raw_content'] = raw_content 1448 1449 metrics = { 1450 "url": url, 1451 "original_size": original_size, 1452 "processed_size": processed_size, 1453 "compression_ratio": compression_ratio, 1454 "model_used": effective_model 1455 } 1456 return result, metrics, "processed" 1457 else: 1458 metrics = { 1459 "url": url, 1460 "original_size": original_size, 1461 "processed_size": original_size, 1462 "compression_ratio": 1.0, 1463 "model_used": None, 1464 "reason": "content_too_short" 1465 } 1466 return result, metrics, "too_short" 1467 1468 # Run all LLM processing in parallel 1469 results_list = response.get('results', []) 1470 tasks = [process_single_result(result) for result in results_list] 1471 processed_results = await asyncio.gather(*tasks) 1472 1473 # Collect metrics and print results 1474 for result, metrics, status in processed_results: 1475 url = result.get('url', 'Unknown URL') 1476 if status == "processed": 1477 debug_call_data["compression_metrics"].append(metrics) 1478 debug_call_data["pages_processed_with_llm"] += 1 1479 logger.info("%s (processed)", url) 1480 elif status == "too_short": 1481 debug_call_data["compression_metrics"].append(metrics) 1482 logger.info("%s (no processing - content too short)", url) 1483 else: 1484 logger.warning("%s (no content to process)", url) 1485 else: 1486 if use_llm_processing and not auxiliary_available: 1487 logger.warning("LLM processing requested but no auxiliary model available, returning raw content") 1488 debug_call_data["processing_applied"].append("llm_processing_unavailable") 1489 # Print summary of extracted pages for debugging (original behavior) 1490 for result in response.get('results', []): 1491 url = result.get('url', 'Unknown URL') 1492 content_length = len(result.get('raw_content', '')) 1493 logger.info("%s (%d characters)", url, content_length) 1494 1495 # Trim output to minimal fields per entry: title, content, error 1496 trimmed_results = [ 1497 { 1498 "url": r.get("url", ""), 1499 "title": r.get("title", ""), 1500 "content": r.get("content", ""), 1501 "error": r.get("error"), 1502 **({ "blocked_by_policy": r["blocked_by_policy"]} if "blocked_by_policy" in r else {}), 1503 } 1504 for r in response.get("results", []) 1505 ] 1506 trimmed_response = {"results": trimmed_results} 1507 1508 if trimmed_response.get("results") == []: 1509 result_json = tool_error("Content was inaccessible or not found") 1510 1511 cleaned_result = clean_base64_images(result_json) 1512 1513 else: 1514 result_json = json.dumps(trimmed_response, indent=2, ensure_ascii=False) 1515 1516 cleaned_result = clean_base64_images(result_json) 1517 1518 debug_call_data["final_response_size"] = len(cleaned_result) 1519 debug_call_data["processing_applied"].append("base64_image_removal") 1520 1521 # Log debug information 1522 _debug.log_call("web_extract_tool", debug_call_data) 1523 _debug.save() 1524 1525 return cleaned_result 1526 1527 except Exception as e: 1528 error_msg = f"Error extracting content: {str(e)}" 1529 logger.debug("%s", error_msg) 1530 1531 debug_call_data["error"] = error_msg 1532 _debug.log_call("web_extract_tool", debug_call_data) 1533 _debug.save() 1534 1535 return tool_error(error_msg) 1536 1537 1538 async def web_crawl_tool( 1539 url: str, 1540 instructions: str = None, 1541 depth: str = "basic", 1542 use_llm_processing: bool = True, 1543 model: Optional[str] = None, 1544 min_length: int = DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION 1545 ) -> str: 1546 """ 1547 Crawl a website with specific instructions using available crawling API backend. 1548 1549 This function provides a generic interface for web crawling that can work 1550 with multiple backends. Currently uses Firecrawl. 1551 1552 Args: 1553 url (str): The base URL to crawl (can include or exclude https://) 1554 instructions (str): Instructions for what to crawl/extract using LLM intelligence (optional) 1555 depth (str): Depth of extraction ("basic" or "advanced", default: "basic") 1556 use_llm_processing (bool): Whether to process content with LLM for summarization (default: True) 1557 model (Optional[str]): The model to use for LLM processing (defaults to current auxiliary backend model) 1558 min_length (int): Minimum content length to trigger LLM processing (default: 5000) 1559 1560 Returns: 1561 str: JSON string containing crawled content. If LLM processing is enabled and successful, 1562 the 'content' field will contain the processed markdown summary instead of raw content. 1563 Each page is processed individually. 1564 1565 Raises: 1566 Exception: If crawling fails or API key is not set 1567 """ 1568 debug_call_data = { 1569 "parameters": { 1570 "url": url, 1571 "instructions": instructions, 1572 "depth": depth, 1573 "use_llm_processing": use_llm_processing, 1574 "model": model, 1575 "min_length": min_length 1576 }, 1577 "error": None, 1578 "pages_crawled": 0, 1579 "pages_processed_with_llm": 0, 1580 "original_response_size": 0, 1581 "final_response_size": 0, 1582 "compression_metrics": [], 1583 "processing_applied": [] 1584 } 1585 1586 try: 1587 effective_model = model or _get_default_summarizer_model() 1588 auxiliary_available = check_auxiliary_model() 1589 backend = _get_backend() 1590 1591 # Tavily supports crawl via its /crawl endpoint 1592 if backend == "tavily": 1593 # Ensure URL has protocol 1594 if not url.startswith(('http://', 'https://')): 1595 url = f'https://{url}' 1596 1597 # SSRF protection — block private/internal addresses 1598 if not is_safe_url(url): 1599 return json.dumps({"results": [{"url": url, "title": "", "content": "", 1600 "error": "Blocked: URL targets a private or internal network address"}]}, ensure_ascii=False) 1601 1602 # Website policy check 1603 blocked = check_website_access(url) 1604 if blocked: 1605 logger.info("Blocked web_crawl for %s by rule %s", blocked["host"], blocked["rule"]) 1606 return json.dumps({"results": [{"url": url, "title": "", "content": "", "error": blocked["message"], 1607 "blocked_by_policy": {"host": blocked["host"], "rule": blocked["rule"], "source": blocked["source"]}}]}, ensure_ascii=False) 1608 1609 from tools.interrupt import is_interrupted as _is_int 1610 if _is_int(): 1611 return tool_error("Interrupted", success=False) 1612 1613 logger.info("Tavily crawl: %s", url) 1614 payload: Dict[str, Any] = { 1615 "url": url, 1616 "limit": 20, 1617 "extract_depth": depth, 1618 } 1619 if instructions: 1620 payload["instructions"] = instructions 1621 raw = _tavily_request("crawl", payload) 1622 results = _normalize_tavily_documents(raw, fallback_url=url) 1623 1624 response = {"results": results} 1625 # Fall through to the shared LLM processing and trimming below 1626 # (skip the Firecrawl-specific crawl logic) 1627 pages_crawled = len(response.get('results', [])) 1628 logger.info("Crawled %d pages", pages_crawled) 1629 debug_call_data["pages_crawled"] = pages_crawled 1630 debug_call_data["original_response_size"] = len(json.dumps(response)) 1631 1632 # Process each result with LLM if enabled 1633 if use_llm_processing and auxiliary_available: 1634 logger.info("Processing crawled content with LLM (parallel)...") 1635 debug_call_data["processing_applied"].append("llm_processing") 1636 1637 async def _process_tavily_crawl(result): 1638 page_url = result.get('url', 'Unknown URL') 1639 title = result.get('title', '') 1640 content = result.get('content', '') 1641 if not content: 1642 return result, None, "no_content" 1643 original_size = len(content) 1644 processed = await process_content_with_llm(content, page_url, title, effective_model, min_length) 1645 if processed: 1646 result['raw_content'] = content 1647 result['content'] = processed 1648 metrics = {"url": page_url, "original_size": original_size, "processed_size": len(processed), 1649 "compression_ratio": len(processed) / original_size if original_size else 1.0, "model_used": effective_model} 1650 return result, metrics, "processed" 1651 metrics = {"url": page_url, "original_size": original_size, "processed_size": original_size, 1652 "compression_ratio": 1.0, "model_used": None, "reason": "content_too_short"} 1653 return result, metrics, "too_short" 1654 1655 tasks = [_process_tavily_crawl(r) for r in response.get('results', [])] 1656 processed_results = await asyncio.gather(*tasks) 1657 for result, metrics, status in processed_results: 1658 if status == "processed": 1659 debug_call_data["compression_metrics"].append(metrics) 1660 debug_call_data["pages_processed_with_llm"] += 1 1661 1662 if use_llm_processing and not auxiliary_available: 1663 logger.warning("LLM processing requested but no auxiliary model available, returning raw content") 1664 debug_call_data["processing_applied"].append("llm_processing_unavailable") 1665 1666 trimmed_results = [{"url": r.get("url", ""), "title": r.get("title", ""), "content": r.get("content", ""), "error": r.get("error"), 1667 **({ "blocked_by_policy": r["blocked_by_policy"]} if "blocked_by_policy" in r else {})} for r in response.get("results", [])] 1668 result_json = json.dumps({"results": trimmed_results}, indent=2, ensure_ascii=False) 1669 cleaned_result = clean_base64_images(result_json) 1670 debug_call_data["final_response_size"] = len(cleaned_result) 1671 _debug.log_call("web_crawl_tool", debug_call_data) 1672 _debug.save() 1673 return cleaned_result 1674 1675 # web_crawl requires Firecrawl or the Firecrawl tool-gateway — Parallel has no crawl API 1676 if not check_firecrawl_api_key(): 1677 return json.dumps({ 1678 "error": "web_crawl requires Firecrawl. Set FIRECRAWL_API_KEY, FIRECRAWL_API_URL" 1679 f"{_firecrawl_backend_help_suffix()}, or use web_search + web_extract instead.", 1680 "success": False, 1681 }, ensure_ascii=False) 1682 1683 # Ensure URL has protocol 1684 if not url.startswith(('http://', 'https://')): 1685 url = f'https://{url}' 1686 logger.info("Added https:// prefix to URL: %s", url) 1687 1688 instructions_text = f" with instructions: '{instructions}'" if instructions else "" 1689 logger.info("Crawling %s%s", url, instructions_text) 1690 1691 # SSRF protection — block private/internal addresses 1692 if not is_safe_url(url): 1693 return json.dumps({"results": [{"url": url, "title": "", "content": "", 1694 "error": "Blocked: URL targets a private or internal network address"}]}, ensure_ascii=False) 1695 1696 # Website policy check — block before crawling 1697 blocked = check_website_access(url) 1698 if blocked: 1699 logger.info("Blocked web_crawl for %s by rule %s", blocked["host"], blocked["rule"]) 1700 return json.dumps({"results": [{"url": url, "title": "", "content": "", "error": blocked["message"], 1701 "blocked_by_policy": {"host": blocked["host"], "rule": blocked["rule"], "source": blocked["source"]}}]}, ensure_ascii=False) 1702 1703 # Use Firecrawl's v2 crawl functionality 1704 # Docs: https://docs.firecrawl.dev/features/crawl 1705 # The crawl() method automatically waits for completion and returns all data 1706 1707 # Build crawl parameters - keep it simple 1708 crawl_params = { 1709 "limit": 20, # Limit number of pages to crawl 1710 "scrape_options": { 1711 "formats": ["markdown"] # Just markdown for simplicity 1712 } 1713 } 1714 1715 # Note: The 'prompt' parameter is not documented for crawl 1716 # Instructions are typically used with the Extract endpoint, not Crawl 1717 if instructions: 1718 logger.info("Instructions parameter ignored (not supported in crawl API)") 1719 1720 from tools.interrupt import is_interrupted as _is_int 1721 if _is_int(): 1722 return tool_error("Interrupted", success=False) 1723 1724 try: 1725 crawl_result = _get_firecrawl_client().crawl( 1726 url=url, 1727 **crawl_params 1728 ) 1729 except Exception as e: 1730 logger.debug("Crawl API call failed: %s", e) 1731 raise 1732 1733 pages: List[Dict[str, Any]] = [] 1734 1735 # Process crawl results - the crawl method returns a CrawlJob object with data attribute 1736 data_list = [] 1737 1738 # The crawl_result is a CrawlJob object with a 'data' attribute containing list of Document objects 1739 if hasattr(crawl_result, 'data'): 1740 data_list = crawl_result.data if crawl_result.data else [] 1741 logger.info("Status: %s", getattr(crawl_result, 'status', 'unknown')) 1742 logger.info("Retrieved %d pages", len(data_list)) 1743 1744 # Debug: Check other attributes if no data 1745 if not data_list: 1746 logger.debug("CrawlJob attributes: %s", [attr for attr in dir(crawl_result) if not attr.startswith('_')]) 1747 logger.debug("Status: %s", getattr(crawl_result, 'status', 'N/A')) 1748 logger.debug("Total: %s", getattr(crawl_result, 'total', 'N/A')) 1749 logger.debug("Completed: %s", getattr(crawl_result, 'completed', 'N/A')) 1750 1751 elif isinstance(crawl_result, dict) and 'data' in crawl_result: 1752 data_list = crawl_result.get("data", []) 1753 else: 1754 logger.warning("Unexpected crawl result type") 1755 logger.debug("Result type: %s", type(crawl_result)) 1756 if hasattr(crawl_result, '__dict__'): 1757 logger.debug("Result attributes: %s", list(crawl_result.__dict__.keys())) 1758 1759 for item in data_list: 1760 # Process each crawled page - properly handle object serialization 1761 page_url = "Unknown URL" 1762 title = "" 1763 content_markdown = None 1764 content_html = None 1765 metadata = {} 1766 1767 # Extract data from the item 1768 if hasattr(item, 'model_dump'): 1769 # Pydantic model - use model_dump to get dict 1770 item_dict = item.model_dump() 1771 content_markdown = item_dict.get('markdown') 1772 content_html = item_dict.get('html') 1773 metadata = item_dict.get('metadata', {}) 1774 elif hasattr(item, '__dict__'): 1775 # Regular object with attributes 1776 content_markdown = getattr(item, 'markdown', None) 1777 content_html = getattr(item, 'html', None) 1778 1779 # Handle metadata - convert to dict if it's an object 1780 metadata_obj = getattr(item, 'metadata', {}) 1781 if hasattr(metadata_obj, 'model_dump'): 1782 metadata = metadata_obj.model_dump() 1783 elif hasattr(metadata_obj, '__dict__'): 1784 metadata = metadata_obj.__dict__ 1785 elif isinstance(metadata_obj, dict): 1786 metadata = metadata_obj 1787 else: 1788 metadata = {} 1789 elif isinstance(item, dict): 1790 # Already a dictionary 1791 content_markdown = item.get('markdown') 1792 content_html = item.get('html') 1793 metadata = item.get('metadata', {}) 1794 1795 # Ensure metadata is a dict (not an object) 1796 if not isinstance(metadata, dict): 1797 if hasattr(metadata, 'model_dump'): 1798 metadata = metadata.model_dump() 1799 elif hasattr(metadata, '__dict__'): 1800 metadata = metadata.__dict__ 1801 else: 1802 metadata = {} 1803 1804 # Extract URL and title from metadata 1805 page_url = metadata.get("sourceURL", metadata.get("url", "Unknown URL")) 1806 title = metadata.get("title", "") 1807 1808 # Re-check crawled page URL against policy 1809 page_blocked = check_website_access(page_url) 1810 if page_blocked: 1811 logger.info("Blocked crawled page %s by rule %s", page_blocked["host"], page_blocked["rule"]) 1812 pages.append({ 1813 "url": page_url, "title": title, "content": "", "raw_content": "", 1814 "error": page_blocked["message"], 1815 "blocked_by_policy": {"host": page_blocked["host"], "rule": page_blocked["rule"], "source": page_blocked["source"]}, 1816 }) 1817 continue 1818 1819 # Choose content (prefer markdown) 1820 content = content_markdown or content_html or "" 1821 1822 pages.append({ 1823 "url": page_url, 1824 "title": title, 1825 "content": content, 1826 "raw_content": content, 1827 "metadata": metadata # Now guaranteed to be a dict 1828 }) 1829 1830 response = {"results": pages} 1831 1832 pages_crawled = len(response.get('results', [])) 1833 logger.info("Crawled %d pages", pages_crawled) 1834 1835 debug_call_data["pages_crawled"] = pages_crawled 1836 debug_call_data["original_response_size"] = len(json.dumps(response)) 1837 1838 # Process each result with LLM if enabled 1839 if use_llm_processing and auxiliary_available: 1840 logger.info("Processing crawled content with LLM (parallel)...") 1841 debug_call_data["processing_applied"].append("llm_processing") 1842 1843 # Prepare tasks for parallel processing 1844 async def process_single_crawl_result(result): 1845 """Process a single crawl result with LLM and return updated result with metrics.""" 1846 page_url = result.get('url', 'Unknown URL') 1847 title = result.get('title', '') 1848 content = result.get('content', '') 1849 1850 if not content: 1851 return result, None, "no_content" 1852 1853 original_size = len(content) 1854 1855 # Process content with LLM 1856 processed = await process_content_with_llm( 1857 content, page_url, title, effective_model, min_length 1858 ) 1859 1860 if processed: 1861 processed_size = len(processed) 1862 compression_ratio = processed_size / original_size if original_size > 0 else 1.0 1863 1864 # Update result with processed content 1865 result['raw_content'] = content 1866 result['content'] = processed 1867 1868 metrics = { 1869 "url": page_url, 1870 "original_size": original_size, 1871 "processed_size": processed_size, 1872 "compression_ratio": compression_ratio, 1873 "model_used": effective_model 1874 } 1875 return result, metrics, "processed" 1876 else: 1877 metrics = { 1878 "url": page_url, 1879 "original_size": original_size, 1880 "processed_size": original_size, 1881 "compression_ratio": 1.0, 1882 "model_used": None, 1883 "reason": "content_too_short" 1884 } 1885 return result, metrics, "too_short" 1886 1887 # Run all LLM processing in parallel 1888 results_list = response.get('results', []) 1889 tasks = [process_single_crawl_result(result) for result in results_list] 1890 processed_results = await asyncio.gather(*tasks) 1891 1892 # Collect metrics and print results 1893 for result, metrics, status in processed_results: 1894 page_url = result.get('url', 'Unknown URL') 1895 if status == "processed": 1896 debug_call_data["compression_metrics"].append(metrics) 1897 debug_call_data["pages_processed_with_llm"] += 1 1898 logger.info("%s (processed)", page_url) 1899 elif status == "too_short": 1900 debug_call_data["compression_metrics"].append(metrics) 1901 logger.info("%s (no processing - content too short)", page_url) 1902 else: 1903 logger.warning("%s (no content to process)", page_url) 1904 else: 1905 if use_llm_processing and not auxiliary_available: 1906 logger.warning("LLM processing requested but no auxiliary model available, returning raw content") 1907 debug_call_data["processing_applied"].append("llm_processing_unavailable") 1908 # Print summary of crawled pages for debugging (original behavior) 1909 for result in response.get('results', []): 1910 page_url = result.get('url', 'Unknown URL') 1911 content_length = len(result.get('content', '')) 1912 logger.info("%s (%d characters)", page_url, content_length) 1913 1914 # Trim output to minimal fields per entry: title, content, error 1915 trimmed_results = [ 1916 { 1917 "url": r.get("url", ""), 1918 "title": r.get("title", ""), 1919 "content": r.get("content", ""), 1920 "error": r.get("error"), 1921 **({ "blocked_by_policy": r["blocked_by_policy"]} if "blocked_by_policy" in r else {}), 1922 } 1923 for r in response.get("results", []) 1924 ] 1925 trimmed_response = {"results": trimmed_results} 1926 1927 result_json = json.dumps(trimmed_response, indent=2, ensure_ascii=False) 1928 # Clean base64 images from crawled content 1929 cleaned_result = clean_base64_images(result_json) 1930 1931 debug_call_data["final_response_size"] = len(cleaned_result) 1932 debug_call_data["processing_applied"].append("base64_image_removal") 1933 1934 # Log debug information 1935 _debug.log_call("web_crawl_tool", debug_call_data) 1936 _debug.save() 1937 1938 return cleaned_result 1939 1940 except Exception as e: 1941 error_msg = f"Error crawling website: {str(e)}" 1942 logger.debug("%s", error_msg) 1943 1944 debug_call_data["error"] = error_msg 1945 _debug.log_call("web_crawl_tool", debug_call_data) 1946 _debug.save() 1947 1948 return tool_error(error_msg) 1949 1950 1951 # Convenience function to check Firecrawl credentials 1952 def check_firecrawl_api_key() -> bool: 1953 """ 1954 Check whether the Firecrawl backend is available. 1955 1956 Availability is true when either: 1957 1) direct Firecrawl config (`FIRECRAWL_API_KEY` or `FIRECRAWL_API_URL`), or 1958 2) Firecrawl gateway origin + Nous Subscriber access token 1959 (fallback when direct Firecrawl is not configured). 1960 1961 Returns: 1962 bool: True if direct Firecrawl or the tool-gateway can be used. 1963 """ 1964 return _has_direct_firecrawl_config() or _is_tool_gateway_ready() 1965 1966 1967 def check_web_api_key() -> bool: 1968 """Check whether the configured web backend is available.""" 1969 configured = _load_web_config().get("backend", "").lower().strip() 1970 if configured in ("exa", "parallel", "firecrawl", "tavily"): 1971 return _is_backend_available(configured) 1972 return any(_is_backend_available(backend) for backend in ("exa", "parallel", "firecrawl", "tavily")) 1973 1974 1975 def check_auxiliary_model() -> bool: 1976 """Check if an auxiliary text model is available for LLM content processing.""" 1977 client, _, _ = _resolve_web_extract_auxiliary() 1978 return client is not None 1979 1980 1981 1982 1983 if __name__ == "__main__": 1984 """ 1985 Simple test/demo when run directly 1986 """ 1987 print("🌐 Standalone Web Tools Module") 1988 print("=" * 40) 1989 1990 # Check if API keys are available 1991 web_available = check_web_api_key() 1992 tool_gateway_available = _is_tool_gateway_ready() 1993 firecrawl_key_available = bool(os.getenv("FIRECRAWL_API_KEY", "").strip()) 1994 firecrawl_url_available = bool(os.getenv("FIRECRAWL_API_URL", "").strip()) 1995 nous_available = check_auxiliary_model() 1996 default_summarizer_model = _get_default_summarizer_model() 1997 1998 if web_available: 1999 backend = _get_backend() 2000 print(f"✅ Web backend: {backend}") 2001 if backend == "exa": 2002 print(" Using Exa API (https://exa.ai)") 2003 elif backend == "parallel": 2004 print(" Using Parallel API (https://parallel.ai)") 2005 elif backend == "tavily": 2006 print(" Using Tavily API (https://tavily.com)") 2007 else: 2008 if firecrawl_url_available: 2009 print(f" Using self-hosted Firecrawl: {os.getenv('FIRECRAWL_API_URL').strip().rstrip('/')}") 2010 elif firecrawl_key_available: 2011 print(" Using direct Firecrawl cloud API") 2012 elif tool_gateway_available: 2013 print(f" Using Firecrawl tool-gateway: {_get_firecrawl_gateway_url()}") 2014 else: 2015 print(" Firecrawl backend selected but not configured") 2016 else: 2017 print("❌ No web search backend configured") 2018 print( 2019 "Set EXA_API_KEY, PARALLEL_API_KEY, TAVILY_API_KEY, FIRECRAWL_API_KEY, FIRECRAWL_API_URL" 2020 f"{_firecrawl_backend_help_suffix()}" 2021 ) 2022 2023 if not nous_available: 2024 print("❌ No auxiliary model available for LLM content processing") 2025 print("Set OPENROUTER_API_KEY, configure Nous Portal, or set OPENAI_BASE_URL + OPENAI_API_KEY") 2026 print("⚠️ Without an auxiliary model, LLM content processing will be disabled") 2027 else: 2028 print(f"✅ Auxiliary model available: {default_summarizer_model}") 2029 2030 if not web_available: 2031 exit(1) 2032 2033 print("🛠️ Web tools ready for use!") 2034 2035 if nous_available: 2036 print(f"🧠 LLM content processing available with {default_summarizer_model}") 2037 print(f" Default min length for processing: {DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION} chars") 2038 2039 # Show debug mode status 2040 if _debug.active: 2041 print(f"🐛 Debug mode ENABLED - Session ID: {_debug.session_id}") 2042 print(f" Debug logs will be saved to: {_debug.log_dir}/web_tools_debug_{_debug.session_id}.json") 2043 else: 2044 print("🐛 Debug mode disabled (set WEB_TOOLS_DEBUG=true to enable)") 2045 2046 print("\nBasic usage:") 2047 print(" from web_tools import web_search_tool, web_extract_tool, web_crawl_tool") 2048 print(" import asyncio") 2049 print("") 2050 print(" # Search (synchronous)") 2051 print(" results = web_search_tool('Python tutorials')") 2052 print("") 2053 print(" # Extract and crawl (asynchronous)") 2054 print(" async def main():") 2055 print(" content = await web_extract_tool(['https://example.com'])") 2056 print(" crawl_data = await web_crawl_tool('example.com', 'Find docs')") 2057 print(" asyncio.run(main())") 2058 2059 if nous_available: 2060 print("\nLLM-enhanced usage:") 2061 print(" # Content automatically processed for pages >5000 chars (default)") 2062 print(" content = await web_extract_tool(['https://python.org/about/'])") 2063 print("") 2064 print(" # Customize processing parameters") 2065 print(" crawl_data = await web_crawl_tool(") 2066 print(" 'docs.python.org',") 2067 print(" 'Find key concepts',") 2068 print(" model='google/gemini-3-flash-preview',") 2069 print(" min_length=3000") 2070 print(" )") 2071 print("") 2072 print(" # Disable LLM processing") 2073 print(" raw_content = await web_extract_tool(['https://example.com'], use_llm_processing=False)") 2074 2075 print("\nDebug mode:") 2076 print(" # Enable debug logging") 2077 print(" export WEB_TOOLS_DEBUG=true") 2078 print(" # Debug logs capture:") 2079 print(" # - All tool calls with parameters") 2080 print(" # - Original API responses") 2081 print(" # - LLM compression metrics") 2082 print(" # - Final processed results") 2083 print(" # Logs saved to: ./logs/web_tools_debug_UUID.json") 2084 2085 print("\n📝 Run 'python test_web_tools_llm.py' to test LLM processing capabilities") 2086 2087 2088 # --------------------------------------------------------------------------- 2089 # Registry 2090 # --------------------------------------------------------------------------- 2091 from tools.registry import registry, tool_error 2092 2093 WEB_SEARCH_SCHEMA = { 2094 "name": "web_search", 2095 "description": "Search the web for information. Returns up to 5 results by default with titles, URLs, and descriptions. The query is passed through to the configured backend, so operators such as site:domain, filetype:pdf, intitle:word, -term, and \"exact phrase\" may work when the backend supports them.", 2096 "parameters": { 2097 "type": "object", 2098 "properties": { 2099 "query": { 2100 "type": "string", 2101 "description": "The search query to look up on the web. You may include backend-supported operators such as site:example.com, filetype:pdf, intitle:word, -term, or \"exact phrase\"." 2102 }, 2103 "limit": { 2104 "type": "integer", 2105 "description": "Maximum number of results to return. Defaults to 5.", 2106 "minimum": 1, 2107 "maximum": 100, 2108 "default": 5 2109 } 2110 }, 2111 "required": ["query"] 2112 } 2113 } 2114 2115 WEB_EXTRACT_SCHEMA = { 2116 "name": "web_extract", 2117 "description": "Extract content from web page URLs. Returns page content in markdown format. Also works with PDF URLs (arxiv papers, documents, etc.) — pass the PDF link directly and it converts to markdown text. Pages under 5000 chars return full markdown; larger pages are LLM-summarized and capped at ~5000 chars per page. Pages over 2M chars are refused. If a URL fails or times out, use the browser tool to access it instead.", 2118 "parameters": { 2119 "type": "object", 2120 "properties": { 2121 "urls": { 2122 "type": "array", 2123 "items": {"type": "string"}, 2124 "description": "List of URLs to extract content from (max 5 URLs per call)", 2125 "maxItems": 5 2126 } 2127 }, 2128 "required": ["urls"] 2129 } 2130 } 2131 2132 registry.register( 2133 name="web_search", 2134 toolset="web", 2135 schema=WEB_SEARCH_SCHEMA, 2136 handler=lambda args, **kw: web_search_tool(args.get("query", ""), limit=args.get("limit", 5)), 2137 check_fn=check_web_api_key, 2138 requires_env=_web_requires_env(), 2139 emoji="🔍", 2140 max_result_size_chars=100_000, 2141 ) 2142 registry.register( 2143 name="web_extract", 2144 toolset="web", 2145 schema=WEB_EXTRACT_SCHEMA, 2146 handler=lambda args, **kw: web_extract_tool( 2147 args.get("urls", [])[:5] if isinstance(args.get("urls"), list) else [], "markdown"), 2148 check_fn=check_web_api_key, 2149 requires_env=_web_requires_env(), 2150 is_async=True, 2151 emoji="📄", 2152 max_result_size_chars=100_000, 2153 )