# web_tools.py
"""
Collection of Python tools for web-related tasks, such as making HTTP requests.
"""

import asyncio
import logging
import json
import random
import uuid
from datetime import datetime, timezone
from typing import Any, Dict, Optional
import ipaddress
from urllib.parse import urlparse
import socket

import httpx
from markdownify import markdownify as md
from bs4 import BeautifulSoup

from google.adk.tools import ToolContext

from google.genai import types as adk_types
from .tool_definition import BuiltinTool
from .tool_result import ToolResult, DataObject, DataDisposition
from .registry import tool_registry
from ...common.constants import ARTIFACT_TAG_WORKING

log = logging.getLogger(__name__)

CATEGORY_NAME = "Web Access"
CATEGORY_DESCRIPTION = "Access the web to find information to complete user requests."

# Response size limits (in bytes)
DEFAULT_MAX_RESPONSE_SIZE = 10 * 1024 * 1024  # 10 MB default
ABSOLUTE_MAX_RESPONSE_SIZE = 50 * 1024 * 1024  # 50 MB hard cap (cannot be exceeded even via config)

# Exact processed-content-type -> artifact file extension mapping.
# Types not listed fall back to ".txt" for text/* and ".bin" otherwise.
_EXTENSION_BY_TYPE = {
    "text/markdown": ".md",
    "application/json": ".json",
    "application/xml": ".xml",
    "image/jpeg": ".jpg",
    "image/png": ".png",
    "image/gif": ".gif",
    "application/pdf": ".pdf",
}

# Non-"text/*" content types that we still treat as UTF-8-decodable text.
_TEXTUAL_APPLICATION_TYPES = (
    "application/json",
    "application/xml",
    "application/javascript",
)


def _is_safe_url(url: str) -> bool:
    """
    Check whether a URL is safe to request (SSRF guard).

    Resolves the URL's hostname and rejects the URL if ANY resolved address
    (IPv4 or IPv6) falls in a private, reserved, loopback, or link-local
    range. Using getaddrinfo (rather than gethostbyname) covers IPv6
    records, which would otherwise let an AAAA-only hostname bypass the
    check entirely.

    NOTE(review): the HTTP client performs its own DNS resolution later, so
    a DNS-rebinding attacker can still race this check (TOCTOU). This is a
    best-effort filter, not a complete defense.

    Args:
        url: The URL to validate.

    Returns:
        True if every resolved address is publicly routable, False otherwise
        (including on resolution failure or any unexpected error).
    """
    try:
        parsed_url = urlparse(url)
        hostname = parsed_url.hostname
        if not hostname:
            log.warning(f"URL has no hostname: {url}")
            return False

        try:
            # getaddrinfo returns both A and AAAA records; check them all.
            addr_infos = socket.getaddrinfo(hostname, None)
        except socket.gaierror:
            log.warning(f"Could not resolve hostname: {hostname}")
            return False

        for addr_info in addr_infos:
            # sockaddr[0] is the address string for both AF_INET and AF_INET6.
            ip = ipaddress.ip_address(addr_info[4][0])
            if ip.is_private or ip.is_reserved or ip.is_loopback or ip.is_link_local:
                log.warning(f"URL {url} resolved to a blocked IP: {ip}")
                return False

        return True

    except Exception as e:
        log.error(f"Error during URL safety check for {url}: {e}", exc_info=True)
        return False


def _backoff_delay(attempt: int) -> float:
    """
    Compute the sleep (seconds) before the retry following `attempt`.

    Exponential backoff starting at 3s (3s, 6s, 12s, ...) plus up to 1s of
    random jitter to avoid synchronized retries.
    """
    return 3.0 * (2 ** (attempt - 1)) + random.uniform(0, 1.0)


def _extension_for(content_type: str) -> str:
    """Return the artifact file extension for a processed content type."""
    if content_type in _EXTENSION_BY_TYPE:
        return _EXTENSION_BY_TYPE[content_type]
    if content_type.startswith("text/"):
        return ".txt"
    return ".bin"


class _StreamedResponse:
    """
    Minimal response facade capturing the parts of a streamed httpx
    response that are still needed after the stream context has closed.
    """

    def __init__(self, status_code: int, headers: Any, content: bytes):
        self.status_code = status_code
        self.headers = headers
        self.content = content


async def web_request(
    url: str,
    method: str = "GET",
    headers: Optional[Dict[str, str]] = None,
    body: Optional[str] = None,
    output_artifact_filename: Optional[str] = None,
    tool_context: ToolContext = None,
    tool_config: Optional[Dict[str, Any]] = None,
    max_retries: int = 2,
) -> ToolResult:
    """
    Makes an HTTP request to the specified URL with retry logic, processes the content (e.g., HTML to Markdown),
    and saves the result as an artifact.

    Args:
        url: The URL to fetch.
        method: HTTP method (e.g., "GET", "POST"). Defaults to "GET".
        headers: Optional dictionary of request headers. The caller's dict is
            not mutated; a defensive copy is taken before defaults are injected.
        body: Optional request body string for methods like POST/PUT. If sending JSON, this should be a valid JSON string.
        output_artifact_filename: Optional. Desired filename for the output artifact.
        tool_context: The context provided by the ADK framework.
        tool_config: Optional. Configuration passed by the ADK. Supports:
            - allow_loopback: bool - Allow requests to loopback addresses (for testing)
            - max_response_size_bytes: int - Maximum response size in bytes (default: 10MB, max: 50MB)
        max_retries: Maximum number of retry attempts for failed requests. Defaults to 2.

    Returns:
        ToolResult with artifact details if successful.
    """
    log_identifier = f"[WebTools:web_request:{method}:{url}]"
    if not tool_context:
        log.error(f"{log_identifier} ToolContext is missing.")
        return ToolResult.error("ToolContext is missing.")

    # Check if loopback URLs are allowed (for testing)
    allow_loopback = False
    max_response_size = DEFAULT_MAX_RESPONSE_SIZE

    if tool_config:
        allow_loopback = tool_config.get("allow_loopback", False)
        # Get max response size from config, but enforce hard cap
        configured_size = tool_config.get("max_response_size_bytes")
        if configured_size is not None:
            max_response_size = min(int(configured_size), ABSOLUTE_MAX_RESPONSE_SIZE)
            log.debug(f"{log_identifier} Using configured max_response_size: {max_response_size} bytes")

    if not allow_loopback and not _is_safe_url(url):
        log.error(f"{log_identifier} URL is not safe to request: {url}")
        return ToolResult.error("URL is not safe to request.")

    # Copy so we never mutate the caller's dict when injecting defaults.
    headers = dict(headers) if headers else {}

    if not any(h.lower() == "user-agent" for h in headers):
        headers["User-Agent"] = (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"
        )

    try:
        log.info(f"{log_identifier} Processing request.")

        request_body_bytes = body.encode("utf-8") if body else None

        # Retry loop with exponential backoff.
        response = None
        for attempt in range(1, max_retries + 1):
            try:
                async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
                    log.info(
                        f"{log_identifier} Attempt {attempt}/{max_retries}: Making {method} request to {url}"
                    )

                    # Stream the response so the size limit can be enforced
                    # without buffering an arbitrarily large body in memory.
                    async with client.stream(
                        method=method.upper(),
                        url=url,
                        headers=headers,
                        content=request_body_bytes,
                    ) as stream_response:
                        # Fast-path rejection when the server declares an
                        # oversized body up front via Content-Length.
                        content_length = stream_response.headers.get("content-length")
                        if content_length:
                            content_length_int = int(content_length)
                            if content_length_int > max_response_size:
                                log.warning(
                                    f"{log_identifier} Response Content-Length ({content_length_int} bytes) exceeds "
                                    f"max_response_size ({max_response_size} bytes). Rejecting request."
                                )
                                return ToolResult.error(
                                    f"Response too large: {content_length_int} bytes exceeds limit of {max_response_size} bytes ({max_response_size // (1024*1024)} MB). "
                                    f"Consider using a more specific URL or a different approach."
                                )

                        # Read with a hard cap; on overflow keep what was
                        # already read and stop (truncation, not failure).
                        chunks = []
                        total_size = 0
                        async for chunk in stream_response.aiter_bytes():
                            total_size += len(chunk)
                            if total_size > max_response_size:
                                log.warning(
                                    f"{log_identifier} Response size ({total_size} bytes) exceeded "
                                    f"max_response_size ({max_response_size} bytes) during streaming. Truncating."
                                )
                                break
                            chunks.append(chunk)

                        response = _StreamedResponse(
                            stream_response.status_code,
                            stream_response.headers,
                            b"".join(chunks),
                        )

                    log.info(
                        f"{log_identifier} Received response with status code: {response.status_code}, "
                        f"size: {len(response.content)} bytes"
                    )

                # Success - break out of retry loop
                break

            except (httpx.ReadTimeout, httpx.ConnectTimeout) as timeout_error:
                log.warning(
                    f"{log_identifier} Attempt {attempt}/{max_retries} timed out: {timeout_error}"
                )
                if attempt >= max_retries:
                    # Final attempt failed
                    error_message = f"Request timed out after {max_retries} attempts (30s timeout per attempt). The website may be slow or blocking automated requests."
                    log.error(f"{log_identifier} {error_message}")
                    return ToolResult.error(error_message)
                delay = _backoff_delay(attempt)
                log.info(f"{log_identifier} Waiting {delay:.1f}s before retry {attempt + 1}/{max_retries}")
                await asyncio.sleep(delay)

            except httpx.RequestError as req_error:
                log.warning(
                    f"{log_identifier} Attempt {attempt}/{max_retries} failed with request error: {req_error}"
                )
                if attempt >= max_retries:
                    # Handled by the outer httpx.RequestError handler below.
                    raise
                delay = _backoff_delay(attempt)
                log.info(f"{log_identifier} Waiting {delay:.1f}s before retry {attempt + 1}/{max_retries}")
                await asyncio.sleep(delay)

        response_content_bytes = response.content
        response_status_code = response.status_code
        original_content_type = (
            response.headers.get("content-type", "application/octet-stream")
            .split(";")[0]
            .strip()
        )

        final_content_to_save_str = ""
        final_content_to_save_bytes = response_content_bytes
        processed_content_type = original_content_type
        is_textual = original_content_type.startswith("text/") or (
            original_content_type in _TEXTUAL_APPLICATION_TYPES
        )

        if response_status_code < 400:
            if original_content_type.startswith("text/html"):
                soup = BeautifulSoup(response_content_bytes, "html.parser")

                # Remove images before conversion
                for img in soup.find_all("img"):
                    img.decompose()

                final_content_to_save_str = md(str(soup), heading_style="ATX")
                final_content_to_save_bytes = final_content_to_save_str.encode("utf-8")
                processed_content_type = "text/markdown"
                log.debug(f"{log_identifier} Converted HTML to Markdown.")
            elif is_textual:
                try:
                    final_content_to_save_str = response_content_bytes.decode("utf-8")
                    log.debug(
                        f"{log_identifier} Decoded text-based content: {original_content_type}"
                    )
                except UnicodeDecodeError:
                    log.warning(
                        f"{log_identifier} Could not decode content as UTF-8. Original type: {original_content_type}. Saving raw bytes."
                    )
        else:
            log.warning(
                f"{log_identifier} HTTP request returned status {response_status_code}. Saving raw response content."
            )
            if is_textual:
                try:
                    final_content_to_save_str = response_content_bytes.decode(
                        "utf-8", errors="replace"
                    )
                except Exception:
                    final_content_to_save_str = "[Binary or undecodable content]"

        file_extension = _extension_for(processed_content_type)

        if output_artifact_filename:
            # Append an extension only when the final path component lacks one.
            if "." not in output_artifact_filename.split("/")[-1]:
                final_artifact_filename = f"{output_artifact_filename}{file_extension}"
            else:
                final_artifact_filename = output_artifact_filename
        else:
            final_artifact_filename = f"web_content_{uuid.uuid4()}{file_extension}"

        metadata = {
            "url": url,
            "method": method.upper(),
            # Authorization is redacted so credentials never land in metadata.
            "request_headers": json.dumps(
                {k: v for k, v in headers.items() if k.lower() != "authorization"}
            ),
            "response_status_code": response_status_code,
            "response_headers": json.dumps(dict(response.headers)),
            "original_content_type": original_content_type,
            "processed_content_type": processed_content_type,
            "generation_tool": "web_request",
            "timestamp": datetime.now(timezone.utc).isoformat(),
        }

        # Build a short preview: decoded text if available, otherwise (for
        # error responses) a best-effort decode of the raw bytes.
        preview_text = ""
        if final_content_to_save_str:
            preview_text = final_content_to_save_str[:500]
            if len(final_content_to_save_str) > 500:
                preview_text += "..."
        elif response_status_code >= 400:
            preview_text = response_content_bytes[:500].decode("utf-8", errors="replace")
            if len(response_content_bytes) > 500:
                preview_text += "..."

        log.info(f"{log_identifier} Returning web content as DataObject for artifact storage")

        return ToolResult.ok(
            f"Successfully fetched content from {url} (status: {response_status_code}). "
            f"Analyze the content before providing a final answer to the user.",
            data={
                "response_status_code": response_status_code,
                "original_content_type": original_content_type,
                "processed_content_type": processed_content_type,
            },
            data_objects=[
                DataObject(
                    name=final_artifact_filename,
                    content=final_content_to_save_bytes,
                    mime_type=processed_content_type,
                    disposition=DataDisposition.ARTIFACT_WITH_PREVIEW,
                    description=f"Web content from {url} (status: {response_status_code})",
                    metadata=metadata,
                    preview=preview_text if preview_text else None,
                    # Auto-generated web_content_ filenames are intermediate deep-research
                    # artifacts that should be hidden from users by default.
                    tags=[ARTIFACT_TAG_WORKING] if not output_artifact_filename else None,
                )
            ],
        )

    except httpx.HTTPStatusError as hse:
        error_message = f"HTTP error {hse.response.status_code} while fetching {url}: {hse.response.text[:500]}"
        log.error(f"{log_identifier} {error_message}", exc_info=True)
        error_filename = f"error_response_{uuid.uuid4()}.txt"
        error_metadata = {
            "url": url,
            "method": method.upper(),
            "error_type": "HTTPStatusError",
            "status_code": hse.response.status_code,
            "timestamp": datetime.now(timezone.utc).isoformat(),
        }
        return ToolResult.error(
            error_message,
            data={"error_artifact": error_filename},
            data_objects=[
                DataObject(
                    name=error_filename,
                    content=hse.response.text.encode("utf-8", errors="replace"),
                    mime_type="text/plain",
                    disposition=DataDisposition.ARTIFACT,
                    description=f"Error response from {url} (HTTP {hse.response.status_code})",
                    metadata=error_metadata,
                )
            ],
        )

    except httpx.RequestError as request_error:
        error_message = f"Request error while fetching {url} after {max_retries} attempts: {request_error}. The website may be unreachable or blocking requests."
        log.error(f"{log_identifier} {error_message}", exc_info=True)
        return ToolResult.error(error_message)
    except ValueError as ve:
        log.error(f"{log_identifier} Value error: {ve}", exc_info=True)
        return ToolResult.error(str(ve))
    except Exception as e:
        log.exception(f"{log_identifier} Unexpected error in web_request: {e}")
        return ToolResult.error(f"An unexpected error occurred: {e}")


web_request_tool_def = BuiltinTool(
    name="web_request",
    implementation=web_request,
    description="Makes an HTTP request to a URL, processes content (e.g., HTML to Markdown), and saves the result as an artifact.",
    category="web",
    category_name=CATEGORY_NAME,
    category_description=CATEGORY_DESCRIPTION,
    required_scopes=["tool:web:request"],
    parameters=adk_types.Schema(
        type=adk_types.Type.OBJECT,
        properties={
            "url": adk_types.Schema(
                type=adk_types.Type.STRING, description="The URL to fetch."
            ),
            "method": adk_types.Schema(
                type=adk_types.Type.STRING,
                description="HTTP method (e.g., 'GET', 'POST'). Defaults to 'GET'.",
                nullable=True,
            ),
            "headers": adk_types.Schema(
                type=adk_types.Type.OBJECT,
                description="Optional dictionary of request headers.",
                nullable=True,
            ),
            "body": adk_types.Schema(
                type=adk_types.Type.STRING,
                description="Optional request body string for methods like POST/PUT.",
                nullable=True,
            ),
            "output_artifact_filename": adk_types.Schema(
                type=adk_types.Type.STRING,
                description="Optional. Desired filename for the output artifact.",
                nullable=True,
            ),
        },
        required=["url"],
    ),
    examples=[],
)

tool_registry.register(web_request_tool_def)