Cradicle Explorer

/ tools / vision_tools.py
vision_tools.py
   1  #!/usr/bin/env python3
   2  """
   3  Vision Tools Module
   4  
   5  This module provides vision analysis tools that work with image URLs.
   6  Uses the centralized auxiliary vision router, which can select OpenRouter,
   7  Nous, Codex, native Anthropic, or a custom OpenAI-compatible endpoint.
   8  
   9  Available tools:
  10  - vision_analyze_tool: Analyze images from URLs with custom prompts
  11  
  12  Features:
  13  - Downloads images from URLs and converts to base64 for API compatibility
  14  - Comprehensive image description
  15  - Context-aware analysis based on user queries
  16  - Automatic temporary file cleanup
  17  - Proper error handling and validation
  18  - Debug logging support
  19  
  20  Usage:
  21      from vision_tools import vision_analyze_tool
  22      import asyncio
  23      
  24      # Analyze an image
  25      result = await vision_analyze_tool(
  26          image_url="https://example.com/image.jpg",
  27          user_prompt="What architectural style is this building?"
  28      )
  29  """
  30  
  31  import base64
  32  import json
  33  import logging
  34  import os
  35  import uuid
  36  from pathlib import Path
  37  from typing import Any, Awaitable, Dict, Optional
  38  from urllib.parse import urlparse
  39  import httpx
  40  from agent.auxiliary_client import async_call_llm, extract_content_or_reasoning
  41  from hermes_constants import get_hermes_dir
  42  from tools.debug_helpers import DebugSession
  43  from tools.website_policy import check_website_access
  44  
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Debug-call recorder for this module. The demo output at the bottom of this
# file states it is enabled by setting VISION_TOOLS_DEBUG=true.
_debug = DebugSession("vision_tools", env_var="VISION_TOOLS_DEBUG")
  48  
# Configurable HTTP download timeout for _download_image().
# Separate from auxiliary.vision.timeout, which governs the LLM API call.
# Resolution order: HERMES_VISION_DOWNLOAD_TIMEOUT env var →
# config.yaml auxiliary.vision.download_timeout → 30s default.
  52  def _resolve_download_timeout() -> float:
  53      env_val = os.getenv("HERMES_VISION_DOWNLOAD_TIMEOUT", "").strip()
  54      if env_val:
  55          try:
  56              return float(env_val)
  57          except ValueError:
  58              pass
  59      try:
  60          from hermes_cli.config import cfg_get, load_config
  61          cfg = load_config()
  62          val = cfg_get(cfg, "auxiliary", "vision", "download_timeout")
  63          if val is not None:
  64              return float(val)
  65      except Exception:
  66          pass
  67      return 30.0
  68  
# Resolved once, when this module is imported; later changes to the
# environment variable or config are not picked up by this constant.
_VISION_DOWNLOAD_TIMEOUT = _resolve_download_timeout()

# Hard cap on downloaded image file size (50 MB), checked in
# _download_image(). Intended to limit memory use when a remote host serves
# a multi-gigabyte file or a decompression bomb.
_VISION_MAX_DOWNLOAD_BYTES = 50 * 1024 * 1024
  74  
  75  
  76  def _validate_image_url(url: str) -> bool:
  77      """
  78      Basic validation of image URL format.
  79      
  80      Args:
  81          url (str): The URL to validate
  82          
  83      Returns:
  84          bool: True if URL appears to be valid, False otherwise
  85      """
  86      if not url or not isinstance(url, str):
  87          return False
  88  
  89      # Basic HTTP/HTTPS URL check
  90      if not url.startswith(("http://", "https://")):
  91          return False
  92  
  93      # Parse to ensure we at least have a network location; still allow URLs
  94      # without file extensions (e.g. CDN endpoints that redirect to images).
  95      parsed = urlparse(url)
  96      if not parsed.netloc:
  97          return False
  98  
  99      # Block private/internal addresses to prevent SSRF
 100      from tools.url_safety import is_safe_url
 101      if not is_safe_url(url):
 102          return False
 103  
 104      return True
 105  
 106  
 107  def _detect_image_mime_type(image_path: Path) -> Optional[str]:
 108      """Return a MIME type when the file looks like a supported image."""
 109      with image_path.open("rb") as f:
 110          header = f.read(64)
 111  
 112      if header.startswith(b"\x89PNG\r\n\x1a\n"):
 113          return "image/png"
 114      if header.startswith(b"\xff\xd8\xff"):
 115          return "image/jpeg"
 116      if header.startswith((b"GIF87a", b"GIF89a")):
 117          return "image/gif"
 118      if header.startswith(b"BM"):
 119          return "image/bmp"
 120      if len(header) >= 12 and header[:4] == b"RIFF" and header[8:12] == b"WEBP":
 121          return "image/webp"
 122      if image_path.suffix.lower() == ".svg":
 123          head = image_path.read_text(encoding="utf-8", errors="ignore")[:4096].lower()
 124          if "<svg" in head:
 125              return "image/svg+xml"
 126      return None
 127  
 128  
 129  async def _download_image(image_url: str, destination: Path, max_retries: int = 3) -> Path:
 130      """
 131      Download an image from a URL to a local destination (async) with retry logic.
 132      
 133      Args:
 134          image_url (str): The URL of the image to download
 135          destination (Path): The path where the image should be saved
 136          max_retries (int): Maximum number of retry attempts (default: 3)
 137          
 138      Returns:
 139          Path: The path to the downloaded image
 140          
 141      Raises:
 142          Exception: If download fails after all retries
 143      """
 144      import asyncio
 145      
 146      # Create parent directories if they don't exist
 147      destination.parent.mkdir(parents=True, exist_ok=True)
 148      
 149      async def _ssrf_redirect_guard(response):
 150          """Re-validate each redirect target to prevent redirect-based SSRF.
 151  
 152          Without this, an attacker can host a public URL that 302-redirects
 153          to http://169.254.169.254/ and bypass the pre-flight is_safe_url check.
 154  
 155          Must be async because httpx.AsyncClient awaits event hooks.
 156          """
 157          if response.is_redirect and response.next_request:
 158              redirect_url = str(response.next_request.url)
 159              from tools.url_safety import is_safe_url
 160              if not is_safe_url(redirect_url):
 161                  raise ValueError(
 162                      f"Blocked redirect to private/internal address: {redirect_url}"
 163                  )
 164  
 165      last_error = None
 166      for attempt in range(max_retries):
 167          try:
 168              blocked = check_website_access(image_url)
 169              if blocked:
 170                  raise PermissionError(blocked["message"])
 171  
 172              # Download the image with appropriate headers using async httpx
 173              # Enable follow_redirects to handle image CDNs that redirect (e.g., Imgur, Picsum)
 174              # SSRF: event_hooks validates each redirect target against private IP ranges
 175              async with httpx.AsyncClient(
 176                  timeout=_VISION_DOWNLOAD_TIMEOUT,
 177                  follow_redirects=True,
 178                  event_hooks={"response": [_ssrf_redirect_guard]},
 179              ) as client:
 180                  response = await client.get(
 181                      image_url,
 182                      headers={
 183                          "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
 184                          "Accept": "image/*,*/*;q=0.8",
 185                      },
 186                  )
 187                  response.raise_for_status()
 188  
 189                  # Reject overly large images early via Content-Length header.
 190                  cl = response.headers.get("content-length")
 191                  if cl and int(cl) > _VISION_MAX_DOWNLOAD_BYTES:
 192                      raise ValueError(
 193                          f"Image too large ({int(cl)} bytes, max {_VISION_MAX_DOWNLOAD_BYTES})"
 194                      )
 195  
 196                  final_url = str(response.url)
 197                  blocked = check_website_access(final_url)
 198                  if blocked:
 199                      raise PermissionError(blocked["message"])
 200                  
 201                  # Save the image content (double-check actual size)
 202                  body = response.content
 203                  if len(body) > _VISION_MAX_DOWNLOAD_BYTES:
 204                      raise ValueError(
 205                          f"Image too large ({len(body)} bytes, max {_VISION_MAX_DOWNLOAD_BYTES})"
 206                      )
 207                  destination.write_bytes(body)
 208              
 209              return destination
 210          except Exception as e:
 211              last_error = e
 212              if attempt < max_retries - 1:
 213                  wait_time = 2 ** (attempt + 1)  # 2s, 4s, 8s
 214                  logger.warning("Image download failed (attempt %s/%s): %s", attempt + 1, max_retries, str(e)[:50])
 215                  logger.warning("Retrying in %ss...", wait_time)
 216                  await asyncio.sleep(wait_time)
 217              else:
 218                  logger.error(
 219                      "Image download failed after %s attempts: %s",
 220                      max_retries,
 221                      str(e)[:100],
 222                      exc_info=True,
 223                  )
 224      
 225      if last_error is None:
 226          raise RuntimeError(
 227              f"_download_image exited retry loop without attempting (max_retries={max_retries})"
 228          )
 229      raise last_error
 230  
 231  
 232  def _determine_mime_type(image_path: Path) -> str:
 233      """
 234      Determine the MIME type of an image based on its file extension.
 235      
 236      Args:
 237          image_path (Path): Path to the image file
 238          
 239      Returns:
 240          str: The MIME type (defaults to image/jpeg if unknown)
 241      """
 242      extension = image_path.suffix.lower()
 243      mime_types = {
 244          '.jpg': 'image/jpeg',
 245          '.jpeg': 'image/jpeg',
 246          '.png': 'image/png',
 247          '.gif': 'image/gif',
 248          '.bmp': 'image/bmp',
 249          '.webp': 'image/webp',
 250          '.svg': 'image/svg+xml'
 251      }
 252      return mime_types.get(extension, 'image/jpeg')
 253  
 254  
 255  def _image_to_base64_data_url(image_path: Path, mime_type: Optional[str] = None) -> str:
 256      """
 257      Convert an image file to a base64-encoded data URL.
 258      
 259      Args:
 260          image_path (Path): Path to the image file
 261          mime_type (Optional[str]): MIME type of the image (auto-detected if None)
 262          
 263      Returns:
 264          str: Base64-encoded data URL (e.g., "data:image/jpeg;base64,...")
 265      """
 266      # Read the image as bytes
 267      data = image_path.read_bytes()
 268      
 269      # Encode to base64
 270      encoded = base64.b64encode(data).decode("ascii")
 271      
 272      # Determine MIME type
 273      mime = mime_type or _determine_mime_type(image_path)
 274      
 275      # Create data URL
 276      data_url = f"data:{mime};base64,{encoded}"
 277      
 278      return data_url
 279  
 280  
# Hard limit for vision API payloads (20 MB) — matches the most restrictive
# major provider (Gemini inline data limit).  Images above this are rejected.
# vision_analyze_tool compares this against the length of the full data URL
# string (header plus base64 text), not the raw file size.
_MAX_BASE64_BYTES = 20 * 1024 * 1024

# Target size when auto-resizing on API failure (5 MB).  After a provider
# rejects an image, we downscale to this target and retry once.
# Also the default ``max_base64_bytes`` of _resize_image_for_vision().
_RESIZE_TARGET_BYTES = 5 * 1024 * 1024
 288  
 289  
 290  def _is_image_size_error(error: Exception) -> bool:
 291      """Detect if an API error is related to image or payload size."""
 292      err_str = str(error).lower()
 293      return any(hint in err_str for hint in (
 294          "too large", "payload", "413", "content_too_large",
 295          "request_too_large", "image_url", "invalid_request",
 296          "exceeds", "size limit",
 297      ))
 298  
 299  
 300  def _resize_image_for_vision(image_path: Path, mime_type: Optional[str] = None,
 301                                max_base64_bytes: int = _RESIZE_TARGET_BYTES) -> str:
 302      """Convert an image to a base64 data URL, auto-resizing if too large.
 303  
 304      Tries Pillow first to progressively downscale oversized images.  If Pillow
 305      is not installed or resizing still exceeds the limit, falls back to the raw
 306      bytes and lets the caller handle the size check.
 307  
 308      Returns the base64 data URL string.
 309      """
 310      # Quick file-size estimate: base64 expands by ~4/3, plus data URL header.
 311      # Skip the expensive full-read + encode if Pillow can resize directly.
 312      file_size = image_path.stat().st_size
 313      estimated_b64 = (file_size * 4) // 3 + 100  # ~header overhead
 314      if estimated_b64 <= max_base64_bytes:
 315          # Small enough — just encode directly.
 316          data_url = _image_to_base64_data_url(image_path, mime_type=mime_type)
 317          if len(data_url) <= max_base64_bytes:
 318              return data_url
 319      else:
 320          data_url = None  # defer full encode; try Pillow resize first
 321  
 322      # Attempt auto-resize with Pillow (soft dependency)
 323      try:
 324          from PIL import Image
 325          import io as _io
 326      except ImportError:
 327          logger.info("Pillow not installed — cannot auto-resize oversized image")
 328          if data_url is None:
 329              data_url = _image_to_base64_data_url(image_path, mime_type=mime_type)
 330          return data_url  # caller will raise the size error
 331  
 332      logger.info("Image file is %.1f MB (estimated base64 %.1f MB, limit %.1f MB), auto-resizing...",
 333                  file_size / (1024 * 1024), estimated_b64 / (1024 * 1024),
 334                  max_base64_bytes / (1024 * 1024))
 335  
 336      mime = mime_type or _determine_mime_type(image_path)
 337      # Choose output format: JPEG for photos (smaller), PNG for transparency
 338      pil_format = "PNG" if mime == "image/png" else "JPEG"
 339      out_mime = "image/png" if pil_format == "PNG" else "image/jpeg"
 340  
 341      try:
 342          img = Image.open(image_path)
 343      except Exception as exc:
 344          logger.info("Pillow cannot open image for resizing: %s", exc)
 345          if data_url is None:
 346              data_url = _image_to_base64_data_url(image_path, mime_type=mime_type)
 347          return data_url  # fall through to size-check in caller
 348      # Convert RGBA to RGB for JPEG output
 349      if pil_format == "JPEG" and img.mode in ("RGBA", "P"):
 350          img = img.convert("RGB")
 351  
 352      # Strategy: halve dimensions until base64 fits, up to 4 rounds.
 353      # For JPEG, also try reducing quality at each size step.
 354      # For PNG, quality is irrelevant — only dimension reduction helps.
 355      quality_steps = (85, 70, 50) if pil_format == "JPEG" else (None,)
 356      prev_dims = (img.width, img.height)
 357      candidate = None  # will be set on first loop iteration
 358  
 359      for attempt in range(5):
 360          if attempt > 0:
 361              # Proportional scaling: halve the longer side and scale the
 362              # shorter side to preserve aspect ratio (min dimension 64).
 363              scale = 0.5
 364              new_w = max(int(img.width * scale), 64)
 365              new_h = max(int(img.height * scale), 64)
 366              # Re-derive the scale from whichever dimension hit the floor
 367              # so both axes shrink by the same factor.
 368              if new_w == 64 and img.width > 0:
 369                  effective_scale = 64 / img.width
 370                  new_h = max(int(img.height * effective_scale), 64)
 371              elif new_h == 64 and img.height > 0:
 372                  effective_scale = 64 / img.height
 373                  new_w = max(int(img.width * effective_scale), 64)
 374              # Stop if dimensions can't shrink further
 375              if (new_w, new_h) == prev_dims:
 376                  break
 377              img = img.resize((new_w, new_h), Image.LANCZOS)
 378              prev_dims = (new_w, new_h)
 379              logger.info("Resized to %dx%d (attempt %d)", new_w, new_h, attempt)
 380  
 381          for q in quality_steps:
 382              buf = _io.BytesIO()
 383              save_kwargs = {"format": pil_format}
 384              if q is not None:
 385                  save_kwargs["quality"] = q
 386              img.save(buf, **save_kwargs)
 387              encoded = base64.b64encode(buf.getvalue()).decode("ascii")
 388              candidate = f"data:{out_mime};base64,{encoded}"
 389              if len(candidate) <= max_base64_bytes:
 390                  logger.info("Auto-resized image fits: %.1f MB (quality=%s, %dx%d)",
 391                              len(candidate) / (1024 * 1024), q,
 392                              img.width, img.height)
 393                  return candidate
 394  
 395      # If we still can't get it small enough, return the best attempt
 396      # and let the caller decide
 397      if candidate is not None:
 398          logger.warning("Auto-resize could not fit image under %.1f MB (best: %.1f MB)",
 399                         max_base64_bytes / (1024 * 1024), len(candidate) / (1024 * 1024))
 400          return candidate
 401  
 402      # Shouldn't reach here, but fall back to full encode
 403      return data_url or _image_to_base64_data_url(image_path, mime_type=mime_type)
 404  
 405  
 406  async def vision_analyze_tool(
 407      image_url: str,
 408      user_prompt: str,
 409      model: str = None,
 410  ) -> str:
 411      """
 412      Analyze an image from a URL or local file path using vision AI.
 413      
 414      This tool accepts either an HTTP/HTTPS URL or a local file path. For URLs,
 415      it downloads the image first. In both cases, the image is converted to base64
 416      and processed using Gemini 3 Flash Preview via OpenRouter API.
 417      
 418      The user_prompt parameter is expected to be pre-formatted by the calling
 419      function (typically model_tools.py) to include both full description
 420      requests and specific questions.
 421      
 422      Args:
 423          image_url (str): The URL or local file path of the image to analyze.
 424                           Accepts http://, https:// URLs or absolute/relative file paths.
 425          user_prompt (str): The pre-formatted prompt for the vision model
 426          model (str): The vision model to use (default: google/gemini-3-flash-preview)
 427      
 428      Returns:
 429          str: JSON string containing the analysis results with the following structure:
 430               {
 431                   "success": bool,
 432                   "analysis": str (defaults to error message if None)
 433               }
 434      
 435      Raises:
 436          Exception: If download fails, analysis fails, or API key is not set
 437          
 438      Note:
 439          - For URLs, temporary images are stored under $HERMES_HOME/cache/vision/ and cleaned up
 440          - For local file paths, the file is used directly and NOT deleted
 441          - Supports common image formats (JPEG, PNG, GIF, WebP, etc.)
 442      """
 443      if not isinstance(user_prompt, str):
 444          user_prompt = str(user_prompt) if user_prompt is not None else ""
 445      debug_call_data = {
 446          "parameters": {
 447              "image_url": image_url,
 448              "user_prompt": user_prompt[:200] + "..." if len(user_prompt) > 200 else user_prompt,
 449              "model": model
 450          },
 451          "error": None,
 452          "success": False,
 453          "analysis_length": 0,
 454          "model_used": model,
 455          "image_size_bytes": 0
 456      }
 457      
 458      temp_image_path = None
 459      # Track whether we should clean up the file after processing.
 460      # Local files (e.g. from the image cache) should NOT be deleted.
 461      should_cleanup = True
 462      detected_mime_type = None
 463      
 464      try:
 465          from tools.interrupt import is_interrupted
 466          if is_interrupted():
 467              return tool_error("Interrupted", success=False)
 468  
 469          logger.info("Analyzing image: %s", image_url[:60])
 470          logger.info("User prompt: %s", user_prompt[:100])
 471          
 472          # Determine if this is a local file path or a remote URL
 473          # Strip file:// scheme so file URIs resolve as local paths.
 474          resolved_url = image_url
 475          if resolved_url.startswith("file://"):
 476              resolved_url = resolved_url[len("file://"):]
 477          local_path = Path(os.path.expanduser(resolved_url))
 478          if local_path.is_file():
 479              # Local file path (e.g. from platform image cache) -- skip download
 480              logger.info("Using local image file: %s", image_url)
 481              temp_image_path = local_path
 482              should_cleanup = False  # Don't delete cached/local files
 483          elif _validate_image_url(image_url):
 484              # Remote URL -- download to a temporary location
 485              blocked = check_website_access(image_url)
 486              if blocked:
 487                  raise PermissionError(blocked["message"])
 488              logger.info("Downloading image from URL...")
 489              temp_dir = get_hermes_dir("cache/vision", "temp_vision_images")
 490              temp_image_path = temp_dir / f"temp_image_{uuid.uuid4()}.jpg"
 491              await _download_image(image_url, temp_image_path)
 492              should_cleanup = True
 493          else:
 494              raise ValueError(
 495                  "Invalid image source. Provide an HTTP/HTTPS URL or a valid local file path."
 496              )
 497          
 498          # Get image file size for logging
 499          image_size_bytes = temp_image_path.stat().st_size
 500          image_size_kb = image_size_bytes / 1024
 501          logger.info("Image ready (%.1f KB)", image_size_kb)
 502  
 503          detected_mime_type = _detect_image_mime_type(temp_image_path)
 504          if not detected_mime_type:
 505              raise ValueError("Only real image files are supported for vision analysis.")
 506          
 507          # Convert image to base64 — send at full resolution first.
 508          # If the provider rejects it as too large, we auto-resize and retry.
 509          logger.info("Converting image to base64...")
 510          image_data_url = _image_to_base64_data_url(temp_image_path, mime_type=detected_mime_type)
 511          data_size_kb = len(image_data_url) / 1024
 512          logger.info("Image converted to base64 (%.1f KB)", data_size_kb)
 513  
 514          # Hard limit (20 MB) — no provider accepts payloads this large.
 515          if len(image_data_url) > _MAX_BASE64_BYTES:
 516              # Try to resize down to 5 MB before giving up.
 517              image_data_url = _resize_image_for_vision(
 518                  temp_image_path, mime_type=detected_mime_type)
 519              if len(image_data_url) > _MAX_BASE64_BYTES:
 520                  raise ValueError(
 521                      f"Image too large for vision API: base64 payload is "
 522                      f"{len(image_data_url) / (1024 * 1024):.1f} MB "
 523                      f"(limit {_MAX_BASE64_BYTES / (1024 * 1024):.0f} MB) "
 524                      f"even after resizing. "
 525                      f"Install Pillow (`pip install Pillow`) for better auto-resize, "
 526                      f"or compress the image manually."
 527                  )
 528  
 529          debug_call_data["image_size_bytes"] = image_size_bytes
 530          
 531          # Use the prompt as provided (model_tools.py now handles full description formatting)
 532          comprehensive_prompt = user_prompt
 533          
 534          # Prepare the message with base64-encoded image
 535          messages = [
 536              {
 537                  "role": "user",
 538                  "content": [
 539                      {
 540                          "type": "text",
 541                          "text": comprehensive_prompt
 542                      },
 543                      {
 544                          "type": "image_url",
 545                          "image_url": {
 546                              "url": image_data_url
 547                          }
 548                      }
 549                  ]
 550              }
 551          ]
 552          
 553          logger.info("Processing image with vision model...")
 554          
 555          # Call the vision API via centralized router.
 556          # Read timeout from config.yaml (auxiliary.vision.timeout), default 120s.
 557          # Local vision models (llama.cpp, ollama) can take well over 30s.
 558          vision_timeout = 120.0
 559          vision_temperature = 0.1
 560          try:
 561              from hermes_cli.config import cfg_get, load_config
 562              _cfg = load_config()
 563              _vision_cfg = cfg_get(_cfg, "auxiliary", "vision", default={})
 564              _vt = _vision_cfg.get("timeout")
 565              if _vt is not None:
 566                  vision_timeout = float(_vt)
 567              _vtemp = _vision_cfg.get("temperature")
 568              if _vtemp is not None:
 569                  vision_temperature = float(_vtemp)
 570          except Exception:
 571              pass
 572          call_kwargs = {
 573              "task": "vision",
 574              "messages": messages,
 575              "temperature": vision_temperature,
 576              "max_tokens": 2000,
 577              "timeout": vision_timeout,
 578          }
 579          if model:
 580              call_kwargs["model"] = model
 581          # Try full-size image first; on size-related rejection, downscale and retry.
 582          try:
 583              response = await async_call_llm(**call_kwargs)
 584          except Exception as _api_err:
 585              if (_is_image_size_error(_api_err)
 586                      and len(image_data_url) > _RESIZE_TARGET_BYTES):
 587                  logger.info(
 588                      "API rejected image (%.1f MB, likely too large); "
 589                      "auto-resizing to ~%.0f MB and retrying...",
 590                      len(image_data_url) / (1024 * 1024),
 591                      _RESIZE_TARGET_BYTES / (1024 * 1024),
 592                  )
 593                  image_data_url = _resize_image_for_vision(
 594                      temp_image_path, mime_type=detected_mime_type)
 595                  messages[0]["content"][1]["image_url"]["url"] = image_data_url
 596                  response = await async_call_llm(**call_kwargs)
 597              else:
 598                  raise
 599          
 600          # Extract the analysis — fall back to reasoning if content is empty
 601          analysis = extract_content_or_reasoning(response)
 602  
 603          # Retry once on empty content (reasoning-only response)
 604          if not analysis:
 605              logger.warning("Vision LLM returned empty content, retrying once")
 606              response = await async_call_llm(**call_kwargs)
 607              analysis = extract_content_or_reasoning(response)
 608  
 609          analysis_length = len(analysis)
 610          
 611          logger.info("Image analysis completed (%s characters)", analysis_length)
 612          
 613          # Prepare successful response
 614          result = {
 615              "success": True,
 616              "analysis": analysis or "There was a problem with the request and the image could not be analyzed."
 617          }
 618          
 619          debug_call_data["success"] = True
 620          debug_call_data["analysis_length"] = analysis_length
 621          
 622          # Log debug information
 623          _debug.log_call("vision_analyze_tool", debug_call_data)
 624          _debug.save()
 625          
 626          return json.dumps(result, indent=2, ensure_ascii=False)
 627          
 628      except Exception as e:
 629          error_msg = f"Error analyzing image: {str(e)}"
 630          logger.error("%s", error_msg, exc_info=True)
 631          
 632          # Detect vision capability errors — give the model a clear message
 633          # so it can inform the user instead of a cryptic API error.
 634          err_str = str(e).lower()
 635          if any(hint in err_str for hint in (
 636              "402", "insufficient", "payment required", "credits", "billing",
 637          )):
 638              analysis = (
 639                  "Insufficient credits or payment required. Please top up your "
 640                  f"API provider account and try again. Error: {e}"
 641              )
 642          elif any(hint in err_str for hint in (
 643              "does not support", "not support image",
 644              "content_policy", "multimodal",
 645              "unrecognized request argument", "image input",
 646          )):
 647              analysis = (
 648                  f"{model} does not support vision or our request was not "
 649                  f"accepted by the server. Error: {e}"
 650              )
 651          elif "invalid_request" in err_str or "image_url" in err_str:
 652              analysis = (
 653                  "The vision API rejected the image. This can happen when the "
 654                  "image is in an unsupported format, corrupted, or still too "
 655                  "large after auto-resize. Try a smaller JPEG/PNG and retry. "
 656                  f"Error: {e}"
 657              )
 658          else:
 659              analysis = (
 660                  "There was a problem with the request and the image could not "
 661                  f"be analyzed. Error: {e}"
 662              )
 663          
 664          # Prepare error response
 665          result = {
 666              "success": False,
 667              "error": error_msg,
 668              "analysis": analysis,
 669          }
 670          
 671          debug_call_data["error"] = error_msg
 672          _debug.log_call("vision_analyze_tool", debug_call_data)
 673          _debug.save()
 674          
 675          return json.dumps(result, indent=2, ensure_ascii=False)
 676      
 677      finally:
 678          # Clean up temporary image file (but NOT local/cached files)
 679          if should_cleanup and temp_image_path and temp_image_path.exists():
 680              try:
 681                  temp_image_path.unlink()
 682                  logger.debug("Cleaned up temporary image file")
 683              except Exception as cleanup_error:
 684                  logger.warning(
 685                      "Could not delete temporary file: %s", cleanup_error, exc_info=True
 686                  )
 687  
 688  
 689  def check_vision_requirements() -> bool:
 690      """Check if the configured runtime vision path can resolve a client."""
 691      try:
 692          from agent.auxiliary_client import resolve_vision_provider_client
 693  
 694          _provider, client, _model = resolve_vision_provider_client()
 695          return client is not None
 696      except Exception:
 697          return False
 698  
 699  
 700  
 701  if __name__ == "__main__":
 702      """
 703      Simple test/demo when run directly
 704      """
 705      print("👁️ Vision Tools Module")
 706      print("=" * 40)
 707      
 708      # Check if vision model is available
 709      api_available = check_vision_requirements()
 710      
 711      if not api_available:
 712          print("❌ No auxiliary vision model available")
 713          print("Configure a supported multimodal backend (OpenRouter, Nous, Codex, Anthropic, or a custom OpenAI-compatible endpoint).")
 714          exit(1)
 715      else:
 716          print("✅ Vision model available")
 717      
 718      print("🛠️ Vision tools ready for use!")
 719      
 720      # Show debug mode status
 721      if _debug.active:
 722          print(f"🐛 Debug mode ENABLED - Session ID: {_debug.session_id}")
 723          print(f"   Debug logs will be saved to: ./logs/vision_tools_debug_{_debug.session_id}.json")
 724      else:
 725          print("🐛 Debug mode disabled (set VISION_TOOLS_DEBUG=true to enable)")
 726      
 727      print("\nBasic usage:")
 728      print("  from vision_tools import vision_analyze_tool")
 729      print("  import asyncio")
 730      print("")
 731      print("  async def main():")
 732      print("      result = await vision_analyze_tool(")
 733      print("          image_url='https://example.com/image.jpg',")
 734      print("          user_prompt='What do you see in this image?'")
 735      print("      )")
 736      print("      print(result)")
 737      print("  asyncio.run(main())")
 738      
 739      print("\nExample prompts:")
 740      print("  - 'What architectural style is this building?'")
 741      print("  - 'Describe the emotions and mood in this image'")
 742      print("  - 'What text can you read in this image?'")
 743      print("  - 'Identify any safety hazards visible'")
 744      print("  - 'What products or brands are shown?'")
 745      
 746      print("\nDebug mode:")
 747      print("  # Enable debug logging")
 748      print("  export VISION_TOOLS_DEBUG=true")
 749      print("  # Debug logs capture all vision analysis calls and results")
 750      print("  # Logs saved to: ./logs/vision_tools_debug_UUID.json")
 751  
 752  
 753  # ---------------------------------------------------------------------------
 754  # Registry
 755  # ---------------------------------------------------------------------------
 756  from tools.registry import registry, tool_error
 757  
 758  VISION_ANALYZE_SCHEMA = {
 759      "name": "vision_analyze",
 760      "description": (
 761          "Inspect an image from a URL, file path, or tool output when you need "
 762          "closer detail than what's visible in the conversation. If the user's "
 763          "image is already attached to the conversation and you can see it, "
 764          "just answer directly — only call this tool for images referenced by "
 765          "URL/path, images returned inside other tool results (browser "
 766          "screenshots, search thumbnails), or when you need a deeper look at "
 767          "a specific region the main model's vision may have missed."
 768      ),
 769      "parameters": {
 770          "type": "object",
 771          "properties": {
 772              "image_url": {
 773                  "type": "string",
 774                  "description": "Image URL (http/https) or local file path to analyze."
 775              },
 776              "question": {
 777                  "type": "string",
 778                  "description": "Your specific question or request about the image to resolve. The AI will automatically provide a complete image description AND answer your specific question."
 779              }
 780          },
 781          "required": ["image_url", "question"]
 782      }
 783  }
 784  
 785  
 786  def _handle_vision_analyze(args: Dict[str, Any], **kw: Any) -> Awaitable[str]:
 787      image_url = args.get("image_url", "")
 788      question = args.get("question", "")
 789      full_prompt = (
 790          "Fully describe and explain everything about this image, then answer the "
 791          f"following question:\n\n{question}"
 792      )
 793      model = os.getenv("AUXILIARY_VISION_MODEL", "").strip() or None
 794      return vision_analyze_tool(image_url, full_prompt, model)
 795  
 796  
# Register the vision_analyze tool with the shared registry as an import-time
# side effect. The handler returns an awaitable, hence is_async=True.
# NOTE(review): check_fn is presumably what the registry uses to decide whether
# the tool is offered — confirm against tools.registry.
registry.register(
    name="vision_analyze",
    toolset="vision",
    schema=VISION_ANALYZE_SCHEMA,
    handler=_handle_vision_analyze,
    check_fn=check_vision_requirements,
    is_async=True,
    emoji="👁️",
)
 806  
 807  
 808  # ---------------------------------------------------------------------------
 809  # Video Analysis Tool
 810  # ---------------------------------------------------------------------------
 811  
 812  # Extension → MIME. avi/mkv fall back to mp4.
 813  _VIDEO_MIME_TYPES = {
 814      ".mp4": "video/mp4",
 815      ".webm": "video/webm",
 816      ".mov": "video/mov",
 817      ".avi": "video/mp4",
 818      ".mkv": "video/mp4",
 819      ".mpeg": "video/mpeg",
 820      ".mpg": "video/mpeg",
 821  }
 822  
 823  _MAX_VIDEO_BASE64_BYTES = 50 * 1024 * 1024  # 50 MB hard cap
 824  _VIDEO_SIZE_WARN_BYTES = 20 * 1024 * 1024
 825  
 826  
 827  def _detect_video_mime_type(video_path: Path) -> Optional[str]:
 828      """Return a video MIME type based on file extension, or None if unsupported."""
 829      ext = video_path.suffix.lower()
 830      return _VIDEO_MIME_TYPES.get(ext)
 831  
 832  
 833  def _video_to_base64_data_url(video_path: Path, mime_type: Optional[str] = None) -> str:
 834      """Convert a video file to a base64-encoded data URL."""
 835      data = video_path.read_bytes()
 836      encoded = base64.b64encode(data).decode("ascii")
 837      mime = mime_type or _VIDEO_MIME_TYPES.get(video_path.suffix.lower(), "video/mp4")
 838      return f"data:{mime};base64,{encoded}"
 839  
 840  
 841  async def _download_video(video_url: str, destination: Path, max_retries: int = 3) -> Path:
 842      """Download video from URL with SSRF protection and retry."""
 843      import asyncio
 844  
 845      destination.parent.mkdir(parents=True, exist_ok=True)
 846  
 847      async def _ssrf_redirect_guard(response):
 848          if response.is_redirect and response.next_request:
 849              redirect_url = str(response.next_request.url)
 850              from tools.url_safety import is_safe_url
 851              if not is_safe_url(redirect_url):
 852                  raise ValueError(
 853                      f"Blocked redirect to private/internal address: {redirect_url}"
 854                  )
 855  
 856      last_error = None
 857      for attempt in range(max_retries):
 858          try:
 859              blocked = check_website_access(video_url)
 860              if blocked:
 861                  raise PermissionError(blocked["message"])
 862  
 863              async with httpx.AsyncClient(
 864                  timeout=60.0,
 865                  follow_redirects=True,
 866                  event_hooks={"response": [_ssrf_redirect_guard]},
 867              ) as client:
 868                  response = await client.get(
 869                      video_url,
 870                      headers={
 871                          "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
 872                          "Accept": "video/*,*/*;q=0.8",
 873                      },
 874                  )
 875                  response.raise_for_status()
 876  
 877                  cl = response.headers.get("content-length")
 878                  if cl and int(cl) > _MAX_VIDEO_BASE64_BYTES:
 879                      raise ValueError(
 880                          f"Video too large ({int(cl)} bytes, max {_MAX_VIDEO_BASE64_BYTES})"
 881                      )
 882  
 883                  final_url = str(response.url)
 884                  blocked = check_website_access(final_url)
 885                  if blocked:
 886                      raise PermissionError(blocked["message"])
 887  
 888                  body = response.content
 889                  if len(body) > _MAX_VIDEO_BASE64_BYTES:
 890                      raise ValueError(
 891                          f"Video too large ({len(body)} bytes, max {_MAX_VIDEO_BASE64_BYTES})"
 892                      )
 893                  destination.write_bytes(body)
 894  
 895              return destination
 896          except Exception as e:
 897              last_error = e
 898              if attempt < max_retries - 1:
 899                  wait_time = 2 ** (attempt + 1)
 900                  logger.warning("Video download failed (attempt %s/%s): %s", attempt + 1, max_retries, str(e)[:50])
 901                  await asyncio.sleep(wait_time)
 902              else:
 903                  logger.error(
 904                      "Video download failed after %s attempts: %s",
 905                      max_retries, str(e)[:100], exc_info=True,
 906                  )
 907  
 908      if last_error is None:
 909          raise RuntimeError(
 910              f"_download_video exited retry loop without attempting (max_retries={max_retries})"
 911          )
 912      raise last_error
 913  
 914  
 915  async def video_analyze_tool(
 916      video_url: str,
 917      user_prompt: str,
 918      model: str = None,
 919  ) -> str:
 920      """Analyze a video via multimodal LLM. Returns JSON {success, analysis}."""
 921      if not isinstance(user_prompt, str):
 922          user_prompt = str(user_prompt) if user_prompt is not None else ""
 923      debug_call_data = {
 924          "parameters": {
 925              "video_url": video_url,
 926              "user_prompt": user_prompt[:200] + "..." if len(user_prompt) > 200 else user_prompt,
 927              "model": model,
 928          },
 929          "error": None,
 930          "success": False,
 931          "analysis_length": 0,
 932          "model_used": model,
 933          "video_size_bytes": 0,
 934      }
 935  
 936      temp_video_path = None
 937      should_cleanup = True
 938  
 939      try:
 940          from tools.interrupt import is_interrupted
 941          if is_interrupted():
 942              return tool_error("Interrupted", success=False)
 943  
 944          logger.info("Analyzing video: %s", video_url[:60])
 945          logger.info("User prompt: %s", user_prompt[:100])
 946  
 947          # Resolve local path vs remote URL
 948          resolved_url = video_url
 949          if resolved_url.startswith("file://"):
 950              resolved_url = resolved_url[len("file://"):]
 951          local_path = Path(os.path.expanduser(resolved_url))
 952  
 953          if local_path.is_file():
 954              logger.info("Using local video file: %s", video_url)
 955              temp_video_path = local_path
 956              should_cleanup = False
 957          elif _validate_image_url(video_url):
 958              blocked = check_website_access(video_url)
 959              if blocked:
 960                  raise PermissionError(blocked["message"])
 961              temp_dir = get_hermes_dir("cache/video", "temp_video_files")
 962              temp_video_path = temp_dir / f"temp_video_{uuid.uuid4()}.mp4"
 963              await _download_video(video_url, temp_video_path)
 964              should_cleanup = True
 965          else:
 966              raise ValueError(
 967                  "Invalid video source. Provide an HTTP/HTTPS URL or a valid local file path."
 968              )
 969  
 970          video_size_bytes = temp_video_path.stat().st_size
 971          video_size_mb = video_size_bytes / (1024 * 1024)
 972          logger.info("Video ready (%.1f MB)", video_size_mb)
 973  
 974          detected_mime = _detect_video_mime_type(temp_video_path)
 975          if not detected_mime:
 976              raise ValueError(
 977                  f"Unsupported video format: '{temp_video_path.suffix}'. "
 978                  f"Supported: {', '.join(sorted(_VIDEO_MIME_TYPES.keys()))}"
 979              )
 980  
 981          if video_size_bytes > _VIDEO_SIZE_WARN_BYTES:
 982              logger.warning("Video is %.1f MB — may be slow or rejected", video_size_mb)
 983  
 984          video_data_url = _video_to_base64_data_url(temp_video_path, mime_type=detected_mime)
 985          data_size_mb = len(video_data_url) / (1024 * 1024)
 986  
 987          if len(video_data_url) > _MAX_VIDEO_BASE64_BYTES:
 988              raise ValueError(
 989                  f"Video too large for API: base64 payload is {data_size_mb:.1f} MB "
 990                  f"(limit {_MAX_VIDEO_BASE64_BYTES / (1024 * 1024):.0f} MB). "
 991                  f"Compress or trim the video and retry."
 992              )
 993  
 994          debug_call_data["video_size_bytes"] = video_size_bytes
 995  
 996          messages = [
 997              {
 998                  "role": "user",
 999                  "content": [
1000                      {
1001                          "type": "text",
1002                          "text": user_prompt,
1003                      },
1004                      {
1005                          "type": "video_url",
1006                          "video_url": {
1007                              "url": video_data_url,
1008                          },
1009                      },
1010                  ],
1011              }
1012          ]
1013  
1014          vision_timeout = 180.0
1015          vision_temperature = 0.1
1016          try:
1017              from hermes_cli.config import cfg_get, load_config
1018              _cfg = load_config()
1019              _vision_cfg = cfg_get(_cfg, "auxiliary", "vision", default={})
1020              _vt = _vision_cfg.get("timeout")
1021              if _vt is not None:
1022                  vision_timeout = max(float(_vt), 180.0)
1023              _vtemp = _vision_cfg.get("temperature")
1024              if _vtemp is not None:
1025                  vision_temperature = float(_vtemp)
1026          except Exception:
1027              pass
1028  
1029          call_kwargs = {
1030              "task": "vision",
1031              "messages": messages,
1032              "temperature": vision_temperature,
1033              "max_tokens": 4000,
1034              "timeout": vision_timeout,
1035          }
1036          if model:
1037              call_kwargs["model"] = model
1038  
1039          response = await async_call_llm(**call_kwargs)
1040          analysis = extract_content_or_reasoning(response)
1041  
1042          if not analysis:
1043              logger.warning("Empty video response, retrying once")
1044              response = await async_call_llm(**call_kwargs)
1045              analysis = extract_content_or_reasoning(response)
1046  
1047          analysis_length = len(analysis) if analysis else 0
1048          logger.info("Video analysis completed (%s characters)", analysis_length)
1049  
1050          result = {
1051              "success": True,
1052              "analysis": analysis or "There was a problem with the request and the video could not be analyzed.",
1053          }
1054  
1055          debug_call_data["success"] = True
1056          debug_call_data["analysis_length"] = analysis_length
1057          _debug.log_call("video_analyze_tool", debug_call_data)
1058          _debug.save()
1059  
1060          return json.dumps(result, indent=2, ensure_ascii=False)
1061  
1062      except Exception as e:
1063          error_msg = f"Error analyzing video: {str(e)}"
1064          logger.error("%s", error_msg, exc_info=True)
1065  
1066          err_str = str(e).lower()
1067          if any(hint in err_str for hint in (
1068              "402", "insufficient", "payment required", "credits", "billing",
1069          )):
1070              analysis = (
1071                  "Insufficient credits or payment required. Please top up your "
1072                  f"API provider account and try again. Error: {e}"
1073              )
1074          elif any(hint in err_str for hint in (
1075              "does not support", "not support video",
1076              "content_policy", "multimodal",
1077              "unrecognized request argument", "video input",
1078              "video_url",
1079          )):
1080              analysis = (
1081                  f"The model does not support video analysis or the request was "
1082                  f"rejected. Ensure you're using a video-capable model "
1083                  f"(e.g. google/gemini-2.5-flash). Error: {e}"
1084              )
1085          elif any(hint in err_str for hint in (
1086              "too large", "payload", "413", "content_too_large",
1087              "request_too_large", "exceeds", "size limit",
1088          )):
1089              analysis = (
1090                  "The video is too large for the API. Try compressing or trimming "
1091                  f"the video (max ~50 MB). Error: {e}"
1092              )
1093          else:
1094              analysis = (
1095                  "There was a problem with the request and the video could not "
1096                  f"be analyzed. Error: {e}"
1097              )
1098  
1099          result = {
1100              "success": False,
1101              "error": error_msg,
1102              "analysis": analysis,
1103          }
1104  
1105          debug_call_data["error"] = error_msg
1106          _debug.log_call("video_analyze_tool", debug_call_data)
1107          _debug.save()
1108  
1109          return json.dumps(result, indent=2, ensure_ascii=False)
1110  
1111      finally:
1112          if should_cleanup and temp_video_path and temp_video_path.exists():
1113              try:
1114                  temp_video_path.unlink()
1115                  logger.debug("Cleaned up temporary video file")
1116              except Exception as cleanup_error:
1117                  logger.warning(
1118                      "Could not delete temporary file: %s", cleanup_error, exc_info=True
1119                  )
1120  
1121  
# Function-calling schema for the ``video_analyze`` tool: the tool name, the
# guidance text shown to the model, and the two required string arguments.
VIDEO_ANALYZE_SCHEMA = {
    "name": "video_analyze",
    "description": " ".join(
        (
            "Analyze a video from a URL or local file path using a multimodal AI model.",
            "Sends the video to a video-capable model (e.g. Gemini) for understanding.",
            "Use this for video files — for images, use vision_analyze instead.",
            "Supports mp4, webm, mov, avi, mkv, mpeg formats.",
            "Note: large videos (>20 MB) may be slow; max ~50 MB.",
        )
    ),
    "parameters": {
        "type": "object",
        "properties": {
            # Where the video comes from.
            "video_url": dict(
                type="string",
                description="Video URL (http/https) or local file path to analyze.",
            ),
            # What the caller wants to know about it.
            "question": dict(
                type="string",
                description=(
                    "Your specific question about the video. "
                    "The AI will describe what happens in the video "
                    "and answer your question."
                ),
            ),
        },
        "required": ["video_url", "question"],
    },
}
1146  
1147  
def _handle_video_analyze(args: Dict[str, Any], **kw: Any) -> Awaitable[str]:
    """Adapt a ``video_analyze`` tool call to ``video_analyze_tool``.

    Reads ``video_url`` and ``question`` from ``args`` (each defaulting to an
    empty string), wraps the question in a "describe, then answer" prompt, and
    returns the awaitable produced by ``video_analyze_tool``. Extra keyword
    arguments are accepted and ignored.
    """
    # First non-blank override wins: the video-specific variable, then the
    # general vision one; None when neither is set.
    model_override = None
    for env_name in ("AUXILIARY_VIDEO_MODEL", "AUXILIARY_VISION_MODEL"):
        candidate = os.getenv(env_name, "").strip()
        if candidate:
            model_override = candidate
            break

    source = args.get("video_url", "")
    asked = args.get("question", "")
    prompt = (
        "Fully describe and explain everything happening in this video, including "
        "visual content, motion, audio cues, text overlays, and scene transitions. "
        f"Then answer the following question:\n\n{asked}"
    )
    return video_analyze_tool(source, prompt, model_override)
1158  
1159  
# Register the video_analyze tool with the shared registry as an import-time
# side effect. It reuses check_vision_requirements as its check_fn, and the
# handler returns an awaitable, hence is_async=True.
# NOTE(review): check_fn is presumably what the registry uses to decide whether
# the tool is offered — confirm against tools.registry.
registry.register(
    name="video_analyze",
    toolset="video",
    schema=VIDEO_ANALYZE_SCHEMA,
    handler=_handle_video_analyze,
    check_fn=check_vision_requirements,
    is_async=True,
    emoji="🎬",
)