# src/solace_agent_mesh/agent/tools/web_tools.py
  1  """
  2  Collection of Python tools for web-related tasks, such as making HTTP requests.
  3  """
  4  
import asyncio
import ipaddress
import json
import logging
import random
import socket
import uuid
from datetime import datetime, timezone
from typing import Any, Dict, Optional
from urllib.parse import urlparse
 14  
 15  import httpx
 16  from markdownify import markdownify as md
 17  from bs4 import BeautifulSoup
 18  
 19  from google.adk.tools import ToolContext
 20  
 21  from google.genai import types as adk_types
 22  from .tool_definition import BuiltinTool
 23  from .tool_result import ToolResult, DataObject, DataDisposition
 24  from .registry import tool_registry
 25  from ...common.constants import ARTIFACT_TAG_WORKING
 26  
 27  log = logging.getLogger(__name__)
 28  
 29  CATEGORY_NAME = "Web Access"
 30  CATEGORY_DESCRIPTION = "Access the web to find information to complete user requests."
 31  
 32  # Response size limits (in bytes)
 33  DEFAULT_MAX_RESPONSE_SIZE = 10 * 1024 * 1024  # 10 MB default
 34  ABSOLUTE_MAX_RESPONSE_SIZE = 50 * 1024 * 1024  # 50 MB hard cap (cannot be exceeded even via config)
 35  
 36  def _is_safe_url(url: str) -> bool:
 37      """
 38      Checks if a URL is safe to request by resolving its hostname and checking
 39      if the IP address is in a private, reserved, or loopback range.
 40      """
 41      try:
 42          parsed_url = urlparse(url)
 43          hostname = parsed_url.hostname
 44          if not hostname:
 45              log.warning(f"URL has no hostname: {url}")
 46              return False
 47  
 48          try:
 49              ip_str = socket.gethostbyname(hostname)
 50              ip = ipaddress.ip_address(ip_str)
 51          except socket.gaierror:
 52              log.warning(f"Could not resolve hostname: {hostname}")
 53              return False
 54  
 55          if ip.is_private or ip.is_reserved or ip.is_loopback:
 56              log.warning(f"URL {url} resolved to a blocked IP: {ip}")
 57              return False
 58  
 59          return True
 60  
 61      except Exception as e:
 62          log.error(f"Error during URL safety check for {url}: {e}", exc_info=True)
 63          return False
 64  
 65  
 66  async def web_request(
 67      url: str,
 68      method: str = "GET",
 69      headers: Optional[Dict[str, str]] = None,
 70      body: Optional[str] = None,
 71      output_artifact_filename: Optional[str] = None,
 72      tool_context: ToolContext = None,
 73      tool_config: Optional[Dict[str, Any]] = None,
 74      max_retries: int = 2,
 75  ) -> ToolResult:
 76      """
 77      Makes an HTTP request to the specified URL with retry logic, processes the content (e.g., HTML to Markdown),
 78      and saves the result as an artifact.
 79  
 80      Args:
 81          url: The URL to fetch.
 82          method: HTTP method (e.g., "GET", "POST"). Defaults to "GET".
 83          headers: Optional dictionary of request headers.
 84          body: Optional request body string for methods like POST/PUT. If sending JSON, this should be a valid JSON string.
 85          output_artifact_filename: Optional. Desired filename for the output artifact.
 86          tool_context: The context provided by the ADK framework.
 87          tool_config: Optional. Configuration passed by the ADK. Supports:
 88              - allow_loopback: bool - Allow requests to loopback addresses (for testing)
 89              - max_response_size_bytes: int - Maximum response size in bytes (default: 10MB, max: 50MB)
 90          max_retries: Maximum number of retry attempts for failed requests. Defaults to 2.
 91  
 92      Returns:
 93          ToolResult with artifact details if successful.
 94      """
 95      log_identifier = f"[WebTools:web_request:{method}:{url}]"
 96      if not tool_context:
 97          log.error(f"{log_identifier} ToolContext is missing.")
 98          return ToolResult.error("ToolContext is missing.")
 99  
100      # Check if loopback URLs are allowed (for testing)
101      allow_loopback = False
102      max_response_size = DEFAULT_MAX_RESPONSE_SIZE
103      
104      if tool_config:
105          allow_loopback = tool_config.get("allow_loopback", False)
106          # Get max response size from config, but enforce hard cap
107          configured_size = tool_config.get("max_response_size_bytes")
108          if configured_size is not None:
109              max_response_size = min(int(configured_size), ABSOLUTE_MAX_RESPONSE_SIZE)
110              log.debug(f"{log_identifier} Using configured max_response_size: {max_response_size} bytes")
111  
112      if not allow_loopback and not _is_safe_url(url):
113          log.error(f"{log_identifier} URL is not safe to request: {url}")
114          return ToolResult.error("URL is not safe to request.")
115  
116      if headers is None:
117          headers = {}
118  
119      if not any(h.lower() == "user-agent" for h in headers):
120          headers["User-Agent"] = (
121              "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"
122          )
123  
124      try:
125          log.info(f"{log_identifier} Processing request.")
126  
127          request_body_bytes = None
128          if body:
129              request_body_bytes = body.encode("utf-8")
130  
131          # Retry logic with exponential backoff
132          last_error = None
133          response = None
134          for attempt in range(1, max_retries + 1):
135              try:
136                  async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
137                      log.info(
138                          f"{log_identifier} Attempt {attempt}/{max_retries}: Making {method} request to {url}"
139                      )
140                      
141                      # Use streaming to check Content-Length and limit response size
142                      async with client.stream(
143                          method=method.upper(),
144                          url=url,
145                          headers=headers,
146                          content=request_body_bytes,
147                      ) as stream_response:
148                          # Check Content-Length header if available
149                          content_length = stream_response.headers.get("content-length")
150                          if content_length:
151                              content_length_int = int(content_length)
152                              if content_length_int > max_response_size:
153                                  log.warning(
154                                      f"{log_identifier} Response Content-Length ({content_length_int} bytes) exceeds "
155                                      f"max_response_size ({max_response_size} bytes). Rejecting request."
156                                  )
157                                  return ToolResult.error(
158                                      f"Response too large: {content_length_int} bytes exceeds limit of {max_response_size} bytes ({max_response_size // (1024*1024)} MB). "
159                                      f"Consider using a more specific URL or a different approach."
160                                  )
161                          
162                          # Read response with size limit using streaming
163                          chunks = []
164                          total_size = 0
165                          async for chunk in stream_response.aiter_bytes():
166                              total_size += len(chunk)
167                              if total_size > max_response_size:
168                                  log.warning(
169                                      f"{log_identifier} Response size ({total_size} bytes) exceeded "
170                                      f"max_response_size ({max_response_size} bytes) during streaming. Truncating."
171                                  )
172                                  # Keep what we have so far but stop reading
173                                  break
174                              chunks.append(chunk)
175                          
176                          # Create a response-like object with the data we need
177                          class StreamedResponse:
178                              def __init__(self, stream_resp, content_bytes):
179                                  self.status_code = stream_resp.status_code
180                                  self.headers = stream_resp.headers
181                                  self.content = content_bytes
182                          
183                          response = StreamedResponse(stream_response, b"".join(chunks))
184                          
185                          log.info(
186                              f"{log_identifier} Received response with status code: {response.status_code}, "
187                              f"size: {len(response.content)} bytes"
188                          )
189                      
190                      # Success - break out of retry loop
191                      break
192                      
193              except (httpx.ReadTimeout, httpx.ConnectTimeout) as timeout_error:
194                  last_error = timeout_error
195                  log.warning(
196                      f"{log_identifier} Attempt {attempt}/{max_retries} timed out: {timeout_error}"
197                  )
198                  
199                  if attempt < max_retries:
200                      # Exponential backoff with jitter
201                      import random
202                      base_delay = 3.0 * (2 ** (attempt - 1))  # 3s, 6s, 12s...
203                      jitter = random.uniform(0, 1.0)
204                      delay = base_delay + jitter
205                      log.info(f"{log_identifier} Waiting {delay:.1f}s before retry {attempt + 1}/{max_retries}")
206                      await asyncio.sleep(delay)
207                  else:
208                      # Final attempt failed
209                      error_message = f"Request timed out after {max_retries} attempts (30s timeout per attempt). The website may be slow or blocking automated requests."
210                      log.error(f"{log_identifier} {error_message}")
211                      return ToolResult.error(error_message)
212                      
213              except httpx.RequestError as req_error:
214                  last_error = req_error
215                  log.warning(
216                      f"{log_identifier} Attempt {attempt}/{max_retries} failed with request error: {req_error}"
217                  )
218                  
219                  if attempt < max_retries:
220                      import random
221                      base_delay = 3.0 * (2 ** (attempt - 1))
222                      jitter = random.uniform(0, 1.0)
223                      delay = base_delay + jitter
224                      log.info(f"{log_identifier} Waiting {delay:.1f}s before retry {attempt + 1}/{max_retries}")
225                      await asyncio.sleep(delay)
226                  else:
227                      # Will be handled by the outer exception handler
228                      raise
229  
230          response_content_bytes = response.content
231          response_status_code = response.status_code
232          original_content_type = (
233              response.headers.get("content-type", "application/octet-stream")
234              .split(";")[0]
235              .strip()
236          )
237  
238          final_content_to_save_str = ""
239          final_content_to_save_bytes = response_content_bytes
240          processed_content_type = original_content_type
241  
242          if response_status_code < 400:
243              if original_content_type.startswith("text/html"):
244                  soup = BeautifulSoup(response_content_bytes, "html.parser")
245  
246                  # Remove images before conversion
247                  for img in soup.find_all('img'):
248                      img.decompose()
249  
250                  final_content_to_save_str = md(str(soup), heading_style="ATX")
251                  final_content_to_save_bytes = final_content_to_save_str.encode("utf-8")
252                  processed_content_type = "text/markdown"
253                  log.debug(f"{log_identifier} Converted HTML to Markdown.")
254              elif original_content_type.startswith("text/") or original_content_type in [
255                  "application/json",
256                  "application/xml",
257                  "application/javascript",
258              ]:
259                  try:
260                      final_content_to_save_str = response_content_bytes.decode("utf-8")
261                      log.debug(
262                          f"{log_identifier} Decoded text-based content: {original_content_type}"
263                      )
264                  except UnicodeDecodeError:
265                      log.warning(
266                          f"{log_identifier} Could not decode content as UTF-8. Original type: {original_content_type}. Saving raw bytes."
267                      )
268  
269          else:
270              log.warning(
271                  f"{log_identifier} HTTP request returned status {response_status_code}. Saving raw response content."
272              )
273              if original_content_type.startswith("text/") or original_content_type in [
274                  "application/json",
275                  "application/xml",
276                  "application/javascript",
277              ]:
278                  try:
279                      final_content_to_save_str = response_content_bytes.decode(
280                          "utf-8", errors="replace"
281                      )
282                  except Exception:
283                      final_content_to_save_str = "[Binary or undecodable content]"
284  
285          file_extension = ".bin"
286          if processed_content_type == "text/markdown":
287              file_extension = ".md"
288          elif processed_content_type == "application/json":
289              file_extension = ".json"
290          elif processed_content_type == "application/xml":
291              file_extension = ".xml"
292          elif processed_content_type.startswith("text/"):
293              file_extension = ".txt"
294          elif processed_content_type == "image/jpeg":
295              file_extension = ".jpg"
296          elif processed_content_type == "image/png":
297              file_extension = ".png"
298          elif processed_content_type == "image/gif":
299              file_extension = ".gif"
300          elif processed_content_type == "application/pdf":
301              file_extension = ".pdf"
302  
303          if output_artifact_filename:
304              if "." not in output_artifact_filename.split("/")[-1]:
305                  final_artifact_filename = f"{output_artifact_filename}{file_extension}"
306              else:
307                  final_artifact_filename = output_artifact_filename
308          else:
309              final_artifact_filename = f"web_content_{uuid.uuid4()}{file_extension}"
310  
311          metadata = {
312              "url": url,
313              "method": method.upper(),
314              "request_headers": json.dumps(
315                  {k: v for k, v in headers.items() if k.lower() != "authorization"}
316              ),
317              "response_status_code": response_status_code,
318              "response_headers": json.dumps(dict(response.headers)),
319              "original_content_type": original_content_type,
320              "processed_content_type": processed_content_type,
321              "generation_tool": "web_request",
322              "timestamp": datetime.now(timezone.utc).isoformat(),
323          }
324  
325          preview_text = ""
326          if final_content_to_save_str:
327              preview_text = final_content_to_save_str[:500]
328              if len(final_content_to_save_str) > 500:
329                  preview_text += "..."
330          elif response_status_code >= 400:
331              preview_text = response_content_bytes[:500].decode('utf-8', errors='replace')
332              if len(response_content_bytes) > 500:
333                  preview_text += "..."
334  
335          log.info(f"{log_identifier} Returning web content as DataObject for artifact storage")
336  
337          return ToolResult.ok(
338              f"Successfully fetched content from {url} (status: {response_status_code}). "
339              f"Analyze the content before providing a final answer to the user.",
340              data={
341                  "response_status_code": response_status_code,
342                  "original_content_type": original_content_type,
343                  "processed_content_type": processed_content_type,
344              },
345              data_objects=[
346                  DataObject(
347                      name=final_artifact_filename,
348                      content=final_content_to_save_bytes,
349                      mime_type=processed_content_type,
350                      disposition=DataDisposition.ARTIFACT_WITH_PREVIEW,
351                      description=f"Web content from {url} (status: {response_status_code})",
352                      metadata=metadata,
353                      preview=preview_text if preview_text else None,
354                      # Auto-generated web_content_ filenames are intermediate deep-research
355                      # artifacts that should be hidden from users by default.
356                      tags=[ARTIFACT_TAG_WORKING] if not output_artifact_filename else None,
357                  )
358              ],
359          )
360  
361      except httpx.HTTPStatusError as hse:
362          error_message = f"HTTP error {hse.response.status_code} while fetching {url}: {hse.response.text[:500]}"
363          log.error(f"{log_identifier} {error_message}", exc_info=True)
364          error_filename = f"error_response_{uuid.uuid4()}.txt"
365          error_metadata = {
366              "url": url,
367              "method": method.upper(),
368              "error_type": "HTTPStatusError",
369              "status_code": hse.response.status_code,
370              "timestamp": datetime.now(timezone.utc).isoformat(),
371          }
372          return ToolResult.error(
373              error_message,
374              data={"error_artifact": error_filename},
375              data_objects=[
376                  DataObject(
377                      name=error_filename,
378                      content=hse.response.text.encode("utf-8", errors="replace"),
379                      mime_type="text/plain",
380                      disposition=DataDisposition.ARTIFACT,
381                      description=f"Error response from {url} (HTTP {hse.response.status_code})",
382                      metadata=error_metadata,
383                  )
384              ],
385          )
386  
387      except httpx.RequestError as re:
388          error_message = f"Request error while fetching {url} after {max_retries} attempts: {re}. The website may be unreachable or blocking requests."
389          log.error(f"{log_identifier} {error_message}", exc_info=True)
390          return ToolResult.error(error_message)
391      except ValueError as ve:
392          log.error(f"{log_identifier} Value error: {ve}", exc_info=True)
393          return ToolResult.error(str(ve))
394      except Exception as e:
395          log.exception(f"{log_identifier} Unexpected error in web_request: {e}")
396          return ToolResult.error(f"An unexpected error occurred: {e}")
397  
398  
# Parameter schema exposed to the LLM for the web_request tool.
_WEB_REQUEST_PARAMETERS = adk_types.Schema(
    type=adk_types.Type.OBJECT,
    properties={
        "url": adk_types.Schema(
            type=adk_types.Type.STRING, description="The URL to fetch."
        ),
        "method": adk_types.Schema(
            type=adk_types.Type.STRING,
            description="HTTP method (e.g., 'GET', 'POST'). Defaults to 'GET'.",
            nullable=True,
        ),
        "headers": adk_types.Schema(
            type=adk_types.Type.OBJECT,
            description="Optional dictionary of request headers.",
            nullable=True,
        ),
        "body": adk_types.Schema(
            type=adk_types.Type.STRING,
            description="Optional request body string for methods like POST/PUT.",
            nullable=True,
        ),
        "output_artifact_filename": adk_types.Schema(
            type=adk_types.Type.STRING,
            description="Optional. Desired filename for the output artifact.",
            nullable=True,
        ),
    },
    required=["url"],
)

# Registry entry binding the web_request implementation to its tool metadata.
web_request_tool_def = BuiltinTool(
    name="web_request",
    implementation=web_request,
    description="Makes an HTTP request to a URL, processes content (e.g., HTML to Markdown), and saves the result as an artifact.",
    category="web",
    category_name=CATEGORY_NAME,
    category_description=CATEGORY_DESCRIPTION,
    required_scopes=["tool:web:request"],
    parameters=_WEB_REQUEST_PARAMETERS,
    examples=[],
)

tool_registry.register(web_request_tool_def)