vision_tools.py
1 #!/usr/bin/env python3 2 """ 3 Vision Tools Module 4 5 This module provides vision analysis tools that work with image URLs. 6 Uses the centralized auxiliary vision router, which can select OpenRouter, 7 Nous, Codex, native Anthropic, or a custom OpenAI-compatible endpoint. 8 9 Available tools: 10 - vision_analyze_tool: Analyze images from URLs with custom prompts 11 12 Features: 13 - Downloads images from URLs and converts to base64 for API compatibility 14 - Comprehensive image description 15 - Context-aware analysis based on user queries 16 - Automatic temporary file cleanup 17 - Proper error handling and validation 18 - Debug logging support 19 20 Usage: 21 from vision_tools import vision_analyze_tool 22 import asyncio 23 24 # Analyze an image 25 result = await vision_analyze_tool( 26 image_url="https://example.com/image.jpg", 27 user_prompt="What architectural style is this building?" 28 ) 29 """ 30 31 import base64 32 import json 33 import logging 34 import os 35 import uuid 36 from pathlib import Path 37 from typing import Any, Awaitable, Dict, Optional 38 from urllib.parse import urlparse 39 import httpx 40 from agent.auxiliary_client import async_call_llm, extract_content_or_reasoning 41 from hermes_constants import get_hermes_dir 42 from tools.debug_helpers import DebugSession 43 from tools.website_policy import check_website_access 44 45 logger = logging.getLogger(__name__) 46 47 _debug = DebugSession("vision_tools", env_var="VISION_TOOLS_DEBUG") 48 49 # Configurable HTTP download timeout for _download_image(). 50 # Separate from auxiliary.vision.timeout which governs the LLM API call. 51 # Resolution: config.yaml auxiliary.vision.download_timeout → env var → 30s default. 
def _resolve_download_timeout() -> float:
    """Resolve the HTTP timeout (in seconds) used for media downloads.

    Lookup order as implemented here:

    1. ``HERMES_VISION_DOWNLOAD_TIMEOUT`` environment variable,
    2. ``auxiliary.vision.download_timeout`` from config.yaml,
    3. a 30 second default.

    NOTE: the environment variable is consulted *before* config.yaml.
    Unparseable values and any failure while loading the config are
    skipped on purpose so that importing this module can never fail.

    Returns:
        float: Timeout in seconds.
    """
    env_val = os.getenv("HERMES_VISION_DOWNLOAD_TIMEOUT", "").strip()
    if env_val:
        try:
            return float(env_val)
        except ValueError:
            pass  # malformed env value: fall through to config/default
    try:
        from hermes_cli.config import cfg_get, load_config
        cfg = load_config()
        val = cfg_get(cfg, "auxiliary", "vision", "download_timeout")
        if val is not None:
            return float(val)
    except Exception:
        pass  # best effort: config is optional at import time
    return 30.0


_VISION_DOWNLOAD_TIMEOUT = _resolve_download_timeout()

# Hard cap on downloaded image file size (50 MB). Prevents OOM from
# attacker-hosted multi-gigabyte files or decompression bombs.
_VISION_MAX_DOWNLOAD_BYTES = 50 * 1024 * 1024


def _validate_image_url(url: str) -> bool:
    """
    Basic validation of image URL format.

    Args:
        url (str): The URL to validate

    Returns:
        bool: True if the URL is an http(s) URL with a network location that
        passes the SSRF safety check, False otherwise
    """
    if not url or not isinstance(url, str):
        return False

    # Basic HTTP/HTTPS URL check
    if not url.startswith(("http://", "https://")):
        return False

    # Parse to ensure we at least have a network location; still allow URLs
    # without file extensions (e.g. CDN endpoints that redirect to images).
    parsed = urlparse(url)
    if not parsed.netloc:
        return False

    # Block private/internal addresses to prevent SSRF
    from tools.url_safety import is_safe_url
    if not is_safe_url(url):
        return False

    return True


def _detect_image_mime_type(image_path: Path) -> Optional[str]:
    """Return a MIME type when the file looks like a supported image.

    Detection is based on magic bytes for PNG, JPEG, GIF, BMP and WebP.
    SVG is only recognised for files with a ``.svg`` suffix whose leading
    bytes contain an ``<svg`` tag.

    Args:
        image_path (Path): Path to the file to inspect

    Returns:
        Optional[str]: The detected MIME type, or None if unrecognised
    """
    # One bounded read serves both the magic-byte checks and the SVG sniff,
    # so a multi-megabyte file is never loaded just to inspect its prefix.
    with image_path.open("rb") as f:
        header = f.read(4096)

    if header.startswith(b"\x89PNG\r\n\x1a\n"):
        return "image/png"
    if header.startswith(b"\xff\xd8\xff"):
        return "image/jpeg"
    if header.startswith((b"GIF87a", b"GIF89a")):
        return "image/gif"
    if header.startswith(b"BM"):
        return "image/bmp"
    if len(header) >= 12 and header[:4] == b"RIFF" and header[8:12] == b"WEBP":
        return "image/webp"
    if image_path.suffix.lower() == ".svg":
        head = header.decode("utf-8", errors="ignore").lower()
        if "<svg" in head:
            return "image/svg+xml"
    return None


async def _download_image(image_url: str, destination: Path, max_retries: int = 3) -> Path:
    """
    Download an image from a URL to a local destination (async) with retry logic.

    The response body is streamed and the size cap is enforced while reading,
    so an oversized (or Content-Length-less) response is abandoned before it
    is fully buffered in memory.

    Args:
        image_url (str): The URL of the image to download
        destination (Path): The path where the image should be saved
        max_retries (int): Maximum number of attempts (default: 3)

    Returns:
        Path: The path to the downloaded image

    Raises:
        PermissionError: If the URL (or the final redirected URL) is blocked
            by the website policy. Not retried.
        ValueError: If a redirect targets a private/internal address or the
            image exceeds the size cap. Not retried.
        RuntimeError: If ``max_retries`` is less than 1 so no attempt ran.
        Exception: The last transport/HTTP error after all retries failed
    """
    import asyncio

    # Create parent directories if they don't exist
    destination.parent.mkdir(parents=True, exist_ok=True)

    async def _ssrf_redirect_guard(response):
        """Re-validate each redirect target to prevent redirect-based SSRF.

        Without this, an attacker can host a public URL that 302-redirects
        to http://169.254.169.254/ and bypass the pre-flight is_safe_url check.

        Must be async because httpx.AsyncClient awaits event hooks.
        """
        if response.is_redirect and response.next_request:
            redirect_url = str(response.next_request.url)
            from tools.url_safety import is_safe_url
            if not is_safe_url(redirect_url):
                raise ValueError(
                    f"Blocked redirect to private/internal address: {redirect_url}"
                )

    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Accept": "image/*,*/*;q=0.8",
    }

    last_error = None
    for attempt in range(max_retries):
        try:
            blocked = check_website_access(image_url)
            if blocked:
                raise PermissionError(blocked["message"])

            # Enable follow_redirects to handle image CDNs that redirect (e.g., Imgur, Picsum)
            # SSRF: event_hooks validates each redirect target against private IP ranges
            async with httpx.AsyncClient(
                timeout=_VISION_DOWNLOAD_TIMEOUT,
                follow_redirects=True,
                event_hooks={"response": [_ssrf_redirect_guard]},
            ) as client:
                # Stream so headers can be inspected before any body bytes
                # are pulled into memory.
                async with client.stream(
                    "GET", image_url, headers=request_headers
                ) as response:
                    response.raise_for_status()

                    # Reject overly large images early via Content-Length header.
                    # A malformed header is ignored; the streamed cap below
                    # still applies.
                    cl = response.headers.get("content-length")
                    try:
                        declared_size = int(cl) if cl else None
                    except ValueError:
                        declared_size = None
                    if declared_size is not None and declared_size > _VISION_MAX_DOWNLOAD_BYTES:
                        raise ValueError(
                            f"Image too large ({declared_size} bytes, max {_VISION_MAX_DOWNLOAD_BYTES})"
                        )

                    final_url = str(response.url)
                    blocked = check_website_access(final_url)
                    if blocked:
                        raise PermissionError(blocked["message"])

                    # Read the body incrementally, enforcing the real size cap
                    # (Content-Length may be absent or wrong).
                    body = bytearray()
                    async for chunk in response.aiter_bytes():
                        body.extend(chunk)
                        if len(body) > _VISION_MAX_DOWNLOAD_BYTES:
                            raise ValueError(
                                f"Image too large (more than {_VISION_MAX_DOWNLOAD_BYTES} bytes, "
                                f"max {_VISION_MAX_DOWNLOAD_BYTES})"
                            )

            destination.write_bytes(body)
            return destination
        except (PermissionError, ValueError) as e:
            # Policy blocks, SSRF redirect blocks and size-cap violations are
            # deterministic: retrying only adds backoff delay.
            logger.error("Image download rejected (not retrying): %s", str(e)[:100])
            raise
        except Exception as e:
            last_error = e
            if attempt < max_retries - 1:
                wait_time = 2 ** (attempt + 1)  # exponential backoff: 2s, 4s, ...
                logger.warning("Image download failed (attempt %s/%s): %s", attempt + 1, max_retries, str(e)[:50])
                logger.warning("Retrying in %ss...", wait_time)
                await asyncio.sleep(wait_time)
            else:
                logger.error(
                    "Image download failed after %s attempts: %s",
                    max_retries,
                    str(e)[:100],
                    exc_info=True,
                )

    if last_error is None:
        raise RuntimeError(
            f"_download_image exited retry loop without attempting (max_retries={max_retries})"
        )
    raise last_error


def _determine_mime_type(image_path: Path) -> str:
    """
    Determine the MIME type of an image based on its file extension.

    Args:
        image_path (Path): Path to the image file

    Returns:
        str: The MIME type (defaults to image/jpeg if unknown)
    """
    extension = image_path.suffix.lower()
    mime_types = {
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
        '.gif': 'image/gif',
        '.bmp': 'image/bmp',
        '.webp': 'image/webp',
        '.svg': 'image/svg+xml'
    }
    return mime_types.get(extension, 'image/jpeg')


def _image_to_base64_data_url(image_path: Path, mime_type: Optional[str] = None) -> str:
    """
    Convert an image file to a base64-encoded data URL.

    Args:
        image_path (Path): Path to the image file
        mime_type (Optional[str]): MIME type of the image (derived from the
            file extension if None)

    Returns:
        str: Base64-encoded data URL (e.g., "data:image/jpeg;base64,...")
    """
    # Read the image as bytes
    data = image_path.read_bytes()

    # Encode to base64
    encoded = base64.b64encode(data).decode("ascii")

    # Determine MIME type
    mime = mime_type or _determine_mime_type(image_path)

    # Create data URL
    data_url = f"data:{mime};base64,{encoded}"

    return data_url


# Hard limit for vision API payloads (20 MB) — matches the most restrictive
# major provider (Gemini inline data limit). Images above this are rejected.
_MAX_BASE64_BYTES = 20 * 1024 * 1024

# Target size when auto-resizing on API failure (5 MB). After a provider
# rejects an image, we downscale to this target and retry once.
_RESIZE_TARGET_BYTES = 5 * 1024 * 1024


def _is_image_size_error(error: Exception) -> bool:
    """Detect if an API error is related to image or payload size.

    This is a substring heuristic over ``str(error)``; several hints
    ("image_url", "invalid_request") are deliberately broad, so a match
    means "worth one downscale-and-retry", not a confirmed size problem.
    """
    err_str = str(error).lower()
    return any(hint in err_str for hint in (
        "too large", "payload", "413", "content_too_large",
        "request_too_large", "image_url", "invalid_request",
        "exceeds", "size limit",
    ))


def _resize_image_for_vision(image_path: Path, mime_type: Optional[str] = None,
                             max_base64_bytes: int = _RESIZE_TARGET_BYTES) -> str:
    """Convert an image to a base64 data URL, auto-resizing if too large.

    Tries Pillow first to progressively downscale oversized images. If Pillow
    is not installed or resizing still exceeds the limit, falls back to the raw
    bytes and lets the caller handle the size check.

    Args:
        image_path (Path): Path to the image file
        mime_type (Optional[str]): MIME type of the source image (derived
            from the file extension if None)
        max_base64_bytes (int): Target upper bound for the data URL length

    Returns:
        str: The base64 data URL string. It may still exceed
        ``max_base64_bytes`` when no resize attempt fits.
    """
    # Quick file-size estimate: base64 expands by ~4/3, plus data URL header.
    # Skip the expensive full-read + encode if Pillow can resize directly.
    file_size = image_path.stat().st_size
    estimated_b64 = (file_size * 4) // 3 + 100  # ~header overhead
    if estimated_b64 <= max_base64_bytes:
        # Small enough — just encode directly.
        data_url = _image_to_base64_data_url(image_path, mime_type=mime_type)
        if len(data_url) <= max_base64_bytes:
            return data_url
    else:
        data_url = None  # defer full encode; try Pillow resize first

    # Attempt auto-resize with Pillow (soft dependency)
    try:
        from PIL import Image
        import io as _io
    except ImportError:
        logger.info("Pillow not installed — cannot auto-resize oversized image")
        if data_url is None:
            data_url = _image_to_base64_data_url(image_path, mime_type=mime_type)
        return data_url  # caller will raise the size error

    logger.info("Image file is %.1f MB (estimated base64 %.1f MB, limit %.1f MB), auto-resizing...",
                file_size / (1024 * 1024), estimated_b64 / (1024 * 1024),
                max_base64_bytes / (1024 * 1024))

    mime = mime_type or _determine_mime_type(image_path)
    # Choose output format: JPEG for photos (smaller), PNG for transparency
    pil_format = "PNG" if mime == "image/png" else "JPEG"
    out_mime = "image/png" if pil_format == "PNG" else "image/jpeg"

    try:
        img = Image.open(image_path)
    except Exception as exc:
        logger.info("Pillow cannot open image for resizing: %s", exc)
        if data_url is None:
            data_url = _image_to_base64_data_url(image_path, mime_type=mime_type)
        return data_url  # fall through to size-check in caller
    # The JPEG writer only stores a fixed set of modes. Convert anything else
    # (RGBA, P, LA, I, F, ...) to RGB so img.save() cannot fail on the mode.
    if pil_format == "JPEG" and img.mode not in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr"):
        img = img.convert("RGB")

    # Strategy: halve dimensions until base64 fits, up to 4 rounds.
    # For JPEG, also try reducing quality at each size step.
    # For PNG, quality is irrelevant — only dimension reduction helps.
    quality_steps = (85, 70, 50) if pil_format == "JPEG" else (None,)
    prev_dims = (img.width, img.height)
    candidate = None  # will be set on first loop iteration

    for attempt in range(5):
        if attempt > 0:
            # Proportional scaling: halve both sides, never going below 64 px
            # on either axis.
            scale = 0.5
            new_w = max(int(img.width * scale), 64)
            new_h = max(int(img.height * scale), 64)
            # Re-derive the scale from whichever dimension hit the floor
            # so both axes shrink by the same factor.
            if new_w == 64 and img.width > 0:
                effective_scale = 64 / img.width
                new_h = max(int(img.height * effective_scale), 64)
            elif new_h == 64 and img.height > 0:
                effective_scale = 64 / img.height
                new_w = max(int(img.width * effective_scale), 64)
            # Stop if dimensions can't shrink further
            if (new_w, new_h) == prev_dims:
                break
            img = img.resize((new_w, new_h), Image.LANCZOS)
            prev_dims = (new_w, new_h)
            logger.info("Resized to %dx%d (attempt %d)", new_w, new_h, attempt)

        for q in quality_steps:
            buf = _io.BytesIO()
            save_kwargs = {"format": pil_format}
            if q is not None:
                save_kwargs["quality"] = q
            img.save(buf, **save_kwargs)
            encoded = base64.b64encode(buf.getvalue()).decode("ascii")
            candidate = f"data:{out_mime};base64,{encoded}"
            if len(candidate) <= max_base64_bytes:
                logger.info("Auto-resized image fits: %.1f MB (quality=%s, %dx%d)",
                            len(candidate) / (1024 * 1024), q,
                            img.width, img.height)
                return candidate

    # If we still can't get it small enough, return the best attempt
    # and let the caller decide
    if candidate is not None:
        logger.warning("Auto-resize could not fit image under %.1f MB (best: %.1f MB)",
                       max_base64_bytes / (1024 * 1024), len(candidate) / (1024 * 1024))
        return candidate

    # Shouldn't reach here, but fall back to full encode
    return data_url or _image_to_base64_data_url(image_path, mime_type=mime_type)


async def vision_analyze_tool(
    image_url: str,
    user_prompt: str,
    model: Optional[str] = None,
) -> str:
    """
    Analyze an image from a URL or local file path using vision AI.

    This tool accepts either an HTTP/HTTPS URL or a local file path
    (optionally as a ``file://`` URI). For URLs, it downloads the image
    first. In both cases, the image is converted to a base64 data URL and
    sent through the centralized auxiliary vision router (``task="vision"``).

    The user_prompt parameter is expected to be pre-formatted by the calling
    function to include both full description requests and specific
    questions.

    Args:
        image_url (str): The URL or local file path of the image to analyze.
                         Accepts http://, https:// URLs or absolute/relative file paths.
        user_prompt (str): The pre-formatted prompt for the vision model
        model (Optional[str]): Vision model override. When None, no model is
                               passed and the router picks one.

    Returns:
        str: JSON string containing the analysis results with the following structure:
             {
                 "success": bool,
                 "analysis": str (an explanatory message on failure),
                 "error": str (only present on failure)
             }

    Note:
        - Failures (download, validation, API) are caught and reported in the
          returned JSON rather than raised
        - For URLs, temporary images are stored under the Hermes vision cache
          directory and deleted afterwards
        - For local file paths, the file is used directly and NOT deleted
        - Supports common image formats (JPEG, PNG, GIF, WebP, etc.)
    """
    if not isinstance(user_prompt, str):
        user_prompt = str(user_prompt) if user_prompt is not None else ""
    debug_call_data = {
        "parameters": {
            "image_url": image_url,
            "user_prompt": user_prompt[:200] + "..." if len(user_prompt) > 200 else user_prompt,
            "model": model
        },
        "error": None,
        "success": False,
        "analysis_length": 0,
        "model_used": model,
        "image_size_bytes": 0
    }

    temp_image_path = None
    # Track whether we should clean up the file after processing.
    # Local files (e.g. from the image cache) should NOT be deleted.
    should_cleanup = True
    detected_mime_type = None

    try:
        from tools.interrupt import is_interrupted
        if is_interrupted():
            return tool_error("Interrupted", success=False)

        logger.info("Analyzing image: %s", image_url[:60])
        logger.info("User prompt: %s", user_prompt[:100])

        # Determine if this is a local file path or a remote URL
        # Strip file:// scheme so file URIs resolve as local paths.
        resolved_url = image_url
        if resolved_url.startswith("file://"):
            resolved_url = resolved_url[len("file://"):]
        local_path = Path(os.path.expanduser(resolved_url))
        # Probing a remote URL as a path can raise instead of returning False
        # (e.g. "File name too long" for long signed URLs, or ValueError for
        # embedded NUL bytes). Treat that as "not a local file" so the URL
        # branch below still gets its chance.
        try:
            is_local_file = local_path.is_file()
        except (OSError, ValueError):
            is_local_file = False

        if is_local_file:
            # Local file path (e.g. from platform image cache) -- skip download
            logger.info("Using local image file: %s", image_url)
            temp_image_path = local_path
            should_cleanup = False  # Don't delete cached/local files
        elif _validate_image_url(image_url):
            # Remote URL -- download to a temporary location
            blocked = check_website_access(image_url)
            if blocked:
                raise PermissionError(blocked["message"])
            logger.info("Downloading image from URL...")
            temp_dir = get_hermes_dir("cache/vision", "temp_vision_images")
            temp_image_path = temp_dir / f"temp_image_{uuid.uuid4()}.jpg"
            await _download_image(image_url, temp_image_path)
            should_cleanup = True
        else:
            raise ValueError(
                "Invalid image source. Provide an HTTP/HTTPS URL or a valid local file path."
            )

        # Get image file size for logging
        image_size_bytes = temp_image_path.stat().st_size
        image_size_kb = image_size_bytes / 1024
        logger.info("Image ready (%.1f KB)", image_size_kb)

        detected_mime_type = _detect_image_mime_type(temp_image_path)
        if not detected_mime_type:
            raise ValueError("Only real image files are supported for vision analysis.")

        # Convert image to base64 — send at full resolution first.
        # If the provider rejects it as too large, we auto-resize and retry.
        logger.info("Converting image to base64...")
        image_data_url = _image_to_base64_data_url(temp_image_path, mime_type=detected_mime_type)
        data_size_kb = len(image_data_url) / 1024
        logger.info("Image converted to base64 (%.1f KB)", data_size_kb)

        # Hard limit (20 MB) — no provider accepts payloads this large.
        if len(image_data_url) > _MAX_BASE64_BYTES:
            # Try to resize down to 5 MB before giving up.
            image_data_url = _resize_image_for_vision(
                temp_image_path, mime_type=detected_mime_type)
            if len(image_data_url) > _MAX_BASE64_BYTES:
                raise ValueError(
                    f"Image too large for vision API: base64 payload is "
                    f"{len(image_data_url) / (1024 * 1024):.1f} MB "
                    f"(limit {_MAX_BASE64_BYTES / (1024 * 1024):.0f} MB) "
                    f"even after resizing. "
                    f"Install Pillow (`pip install Pillow`) for better auto-resize, "
                    f"or compress the image manually."
                )

        debug_call_data["image_size_bytes"] = image_size_bytes

        # Use the prompt as provided (the caller handles full description formatting)
        comprehensive_prompt = user_prompt

        # Prepare the message with base64-encoded image
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": comprehensive_prompt
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": image_data_url
                        }
                    }
                ]
            }
        ]

        logger.info("Processing image with vision model...")

        # Call the vision API via centralized router.
        # Read timeout from config.yaml (auxiliary.vision.timeout), default 120s.
        # Local vision models (llama.cpp, ollama) can take well over 30s.
        vision_timeout = 120.0
        vision_temperature = 0.1
        try:
            from hermes_cli.config import cfg_get, load_config
            _cfg = load_config()
            _vision_cfg = cfg_get(_cfg, "auxiliary", "vision", default={})
            _vt = _vision_cfg.get("timeout")
            if _vt is not None:
                vision_timeout = float(_vt)
            _vtemp = _vision_cfg.get("temperature")
            if _vtemp is not None:
                vision_temperature = float(_vtemp)
        except Exception:
            pass  # best effort: keep the defaults when config is unavailable
        call_kwargs = {
            "task": "vision",
            "messages": messages,
            "temperature": vision_temperature,
            "max_tokens": 2000,
            "timeout": vision_timeout,
        }
        if model:
            call_kwargs["model"] = model
        # Try full-size image first; on size-related rejection, downscale and retry.
        try:
            response = await async_call_llm(**call_kwargs)
        except Exception as _api_err:
            if (_is_image_size_error(_api_err)
                    and len(image_data_url) > _RESIZE_TARGET_BYTES):
                logger.info(
                    "API rejected image (%.1f MB, likely too large); "
                    "auto-resizing to ~%.0f MB and retrying...",
                    len(image_data_url) / (1024 * 1024),
                    _RESIZE_TARGET_BYTES / (1024 * 1024),
                )
                image_data_url = _resize_image_for_vision(
                    temp_image_path, mime_type=detected_mime_type)
                # call_kwargs["messages"] is this same list, so the retry
                # below sends the resized payload.
                messages[0]["content"][1]["image_url"]["url"] = image_data_url
                response = await async_call_llm(**call_kwargs)
            else:
                raise

        # Extract the analysis — fall back to reasoning if content is empty
        analysis = extract_content_or_reasoning(response)

        # Retry once on empty content (reasoning-only response)
        if not analysis:
            logger.warning("Vision LLM returned empty content, retrying once")
            response = await async_call_llm(**call_kwargs)
            analysis = extract_content_or_reasoning(response)

        # The retry can still come back empty/None; don't let len() raise.
        analysis_length = len(analysis) if analysis else 0

        logger.info("Image analysis completed (%s characters)", analysis_length)

        # Prepare successful response
        result = {
            "success": True,
            "analysis": analysis or "There was a problem with the request and the image could not be analyzed."
        }

        debug_call_data["success"] = True
        debug_call_data["analysis_length"] = analysis_length

        # Log debug information
        _debug.log_call("vision_analyze_tool", debug_call_data)
        _debug.save()

        return json.dumps(result, indent=2, ensure_ascii=False)

    except Exception as e:
        error_msg = f"Error analyzing image: {str(e)}"
        logger.error("%s", error_msg, exc_info=True)

        # Detect vision capability errors — give the model a clear message
        # so it can inform the user instead of a cryptic API error.
        err_str = str(e).lower()
        if any(hint in err_str for hint in (
            "402", "insufficient", "payment required", "credits", "billing",
        )):
            analysis = (
                "Insufficient credits or payment required. Please top up your "
                f"API provider account and try again. Error: {e}"
            )
        elif any(hint in err_str for hint in (
            "does not support", "not support image",
            "content_policy", "multimodal",
            "unrecognized request argument", "image input",
        )):
            # Avoid rendering "None does not support vision" when the router
            # chose the model.
            model_label = model or "The configured vision model"
            analysis = (
                f"{model_label} does not support vision or our request was not "
                f"accepted by the server. Error: {e}"
            )
        elif "invalid_request" in err_str or "image_url" in err_str:
            analysis = (
                "The vision API rejected the image. This can happen when the "
                "image is in an unsupported format, corrupted, or still too "
                "large after auto-resize. Try a smaller JPEG/PNG and retry. "
                f"Error: {e}"
            )
        else:
            analysis = (
                "There was a problem with the request and the image could not "
                f"be analyzed. Error: {e}"
            )

        # Prepare error response
        result = {
            "success": False,
            "error": error_msg,
            "analysis": analysis,
        }

        debug_call_data["error"] = error_msg
        _debug.log_call("vision_analyze_tool", debug_call_data)
        _debug.save()

        return json.dumps(result, indent=2, ensure_ascii=False)

    finally:
        # Clean up temporary image file (but NOT local/cached files)
        if should_cleanup and temp_image_path and temp_image_path.exists():
            try:
                temp_image_path.unlink()
                logger.debug("Cleaned up temporary image file")
            except Exception as cleanup_error:
                logger.warning(
                    "Could not delete temporary file: %s", cleanup_error, exc_info=True
                )


def check_vision_requirements() -> bool:
    """Check if the configured runtime vision path can resolve a client.

    Returns:
        bool: True when the auxiliary client resolver yields a client,
        False when it yields none or raises.
    """
    try:
        from agent.auxiliary_client import resolve_vision_provider_client

        _provider, client, _model = resolve_vision_provider_client()
        return client is not None
    except Exception:
        return False



if __name__ == "__main__":
    # Simple test/demo when run directly
    print("👁️ Vision Tools Module")
    print("=" * 40)

    # Check if vision model is available
    api_available = check_vision_requirements()

    if not api_available:
        print("❌ No auxiliary vision model available")
        print("Configure a supported multimodal backend (OpenRouter, Nous, Codex, Anthropic, or a custom OpenAI-compatible endpoint).")
        exit(1)
    else:
        print("✅ Vision model available")

    print("🛠️ Vision tools ready for use!")

    # Show debug mode status
    if _debug.active:
        print(f"🐛 Debug mode ENABLED - Session ID: {_debug.session_id}")
        print(f" Debug logs will be saved to: ./logs/vision_tools_debug_{_debug.session_id}.json")
    else:
        print("🐛 Debug mode disabled (set VISION_TOOLS_DEBUG=true to enable)")

    print("\nBasic usage:")
    print(" from vision_tools import vision_analyze_tool")
    print(" import asyncio")
    print("")
    print(" async def main():")
    print(" result = await vision_analyze_tool(")
    print(" image_url='https://example.com/image.jpg',")
    print(" user_prompt='What do you see in this image?'")
    print(" )")
    print(" print(result)")
    print(" asyncio.run(main())")

    print("\nExample prompts:")
    print(" - 'What architectural style is this building?'")
    print(" - 'Describe the emotions and mood in this image'")
    print(" - 'What text can you read in this image?'")
    print(" - 'Identify any safety hazards visible'")
    print(" - 'What products or brands are shown?'")

    print("\nDebug mode:")
    print(" # Enable debug logging")
    print(" export VISION_TOOLS_DEBUG=true")
    print(" # Debug logs capture all vision analysis calls and results")
    print(" # Logs saved to: ./logs/vision_tools_debug_UUID.json")


# ---------------------------------------------------------------------------
# Registry
# ---------------------------------------------------------------------------
# Imported here rather than at the top of the file; ``tool_error`` is looked
# up at call time by the tool functions above, after this import has run.
from tools.registry import registry, tool_error

VISION_ANALYZE_SCHEMA = {
    "name": "vision_analyze",
    "description": (
        "Inspect an image from a URL, file path, or tool output when you need "
        "closer detail than what's visible in the conversation. If the user's "
        "image is already attached to the conversation and you can see it, "
        "just answer directly — only call this tool for images referenced by "
        "URL/path, images returned inside other tool results (browser "
        "screenshots, search thumbnails), or when you need a deeper look at "
        "a specific region the main model's vision may have missed."
    ),
    "parameters": {
        "type": "object",
        "properties": {
            "image_url": {
                "type": "string",
                "description": "Image URL (http/https) or local file path to analyze."
            },
            "question": {
                "type": "string",
                "description": "Your specific question or request about the image to resolve. The AI will automatically provide a complete image description AND answer your specific question."
            }
        },
        "required": ["image_url", "question"]
    }
}


def _handle_vision_analyze(args: Dict[str, Any], **kw: Any) -> Awaitable[str]:
    """Registry handler: build the full prompt and start the vision analysis.

    Reads ``image_url`` and ``question`` from *args* (both default to ""),
    wraps the question in a describe-then-answer prompt, and takes an
    optional model override from the ``AUXILIARY_VISION_MODEL`` environment
    variable. Returns the un-awaited coroutine from ``vision_analyze_tool``.
    """
    image_url = args.get("image_url", "")
    question = args.get("question", "")
    full_prompt = (
        "Fully describe and explain everything about this image, then answer the "
        f"following question:\n\n{question}"
    )
    model = os.getenv("AUXILIARY_VISION_MODEL", "").strip() or None
    return vision_analyze_tool(image_url, full_prompt, model)


registry.register(
    name="vision_analyze",
    toolset="vision",
    schema=VISION_ANALYZE_SCHEMA,
    handler=_handle_vision_analyze,
    check_fn=check_vision_requirements,
    is_async=True,
    emoji="👁️",
)


# ---------------------------------------------------------------------------
# Video Analysis Tool
# ---------------------------------------------------------------------------

# Extension → MIME. avi/mkv fall back to mp4.
# File extension -> MIME type declared in the data URL sent to the model.
# NOTE(review): ".avi" and ".mkv" are labelled "video/mp4" and ".mov" is
# labelled "video/mov" rather than their registered MIME types — presumably to
# stay within the set of types the downstream video APIs accept; confirm
# before "correcting" these values.
_VIDEO_MIME_TYPES = {
    ".mp4": "video/mp4",
    ".webm": "video/webm",
    ".mov": "video/mov",
    ".avi": "video/mp4",
    ".mkv": "video/mp4",
    ".mpeg": "video/mpeg",
    ".mpg": "video/mpeg",
}

# Used both as the cap on downloaded bytes and as the cap on the length of the
# base64 data URL handed to the model.
_MAX_VIDEO_BASE64_BYTES = 50 * 1024 * 1024  # 50 MB hard cap
# Above this raw file size a warning is logged but the request still proceeds.
_VIDEO_SIZE_WARN_BYTES = 20 * 1024 * 1024


def _detect_video_mime_type(video_path: Path) -> Optional[str]:
    """Return a video MIME type based on file extension, or None if unsupported."""
    return _VIDEO_MIME_TYPES.get(video_path.suffix.lower())


def _video_to_base64_data_url(video_path: Path, mime_type: Optional[str] = None) -> str:
    """Convert a video file to a base64-encoded data URL.

    Args:
        video_path: File to read; its whole content is loaded into memory.
        mime_type: MIME type to declare. When omitted, it is looked up from
            the file extension and falls back to ``video/mp4``.

    Returns:
        A ``data:<mime>;base64,<payload>`` string.
    """
    mime = mime_type or _VIDEO_MIME_TYPES.get(video_path.suffix.lower(), "video/mp4")
    encoded = base64.b64encode(video_path.read_bytes()).decode("ascii")
    return f"data:{mime};base64,{encoded}"


def _video_data_url_length(size_bytes: int, mime_type: str) -> int:
    """Return the length of the data URL that *size_bytes* of video produces.

    Base64 emits 4 characters for every (padded) group of 3 input bytes, so
    the length can be computed without reading or encoding the file.
    """
    return len(f"data:{mime_type};base64,") + 4 * ((size_bytes + 2) // 3)


def _video_payload_too_large(payload_length: int) -> ValueError:
    """Build the error raised when the base64 payload exceeds the hard cap."""
    data_size_mb = payload_length / (1024 * 1024)
    return ValueError(
        f"Video too large for API: base64 payload is {data_size_mb:.1f} MB "
        f"(limit {_MAX_VIDEO_BASE64_BYTES / (1024 * 1024):.0f} MB). "
        f"Compress or trim the video and retry."
    )


async def _download_video(video_url: str, destination: Path, max_retries: int = 3) -> Path:
    """Download video from URL with SSRF protection and retry.

    The body is streamed and the download is aborted as soon as more than
    ``_MAX_VIDEO_BASE64_BYTES`` have been received, so a missing or incorrect
    Content-Length header cannot cause an unbounded amount of data to be
    buffered. ``destination`` is only written once the whole body has arrived.

    Args:
        video_url: HTTP/HTTPS URL to fetch.
        destination: File to write; parent directories are created.
        max_retries: Number of attempts for failures that may be transient.

    Returns:
        ``destination``.

    Raises:
        PermissionError: The website policy blocks the URL or its final
            (post-redirect) URL. Not retried.
        ValueError: A redirect targets an unsafe address or the video exceeds
            the size cap. Not retried.
        RuntimeError: ``max_retries`` is less than 1, so nothing was attempted.
        Exception: The last error from the final failed attempt.
    """
    import asyncio

    destination.parent.mkdir(parents=True, exist_ok=True)

    async def _ssrf_redirect_guard(response) -> None:
        # Runs for every response; vets each redirect target before it is followed.
        if response.is_redirect and response.next_request:
            redirect_url = str(response.next_request.url)
            from tools.url_safety import is_safe_url
            if not is_safe_url(redirect_url):
                raise ValueError(
                    f"Blocked redirect to private/internal address: {redirect_url}"
                )

    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Accept": "video/*,*/*;q=0.8",
    }

    last_error = None
    for attempt in range(max_retries):
        try:
            blocked = check_website_access(video_url)
            if blocked:
                raise PermissionError(blocked["message"])

            async with httpx.AsyncClient(
                timeout=60.0,
                follow_redirects=True,
                event_hooks={"response": [_ssrf_redirect_guard]},
            ) as client:
                async with client.stream(
                    "GET", video_url, headers=request_headers
                ) as response:
                    response.raise_for_status()

                    # Cheap early rejection; a malformed header is ignored
                    # because the streaming cap below is authoritative.
                    declared = (response.headers.get("content-length") or "").strip()
                    if declared.isdigit() and int(declared) > _MAX_VIDEO_BASE64_BYTES:
                        raise ValueError(
                            f"Video too large ({int(declared)} bytes, max {_MAX_VIDEO_BASE64_BYTES})"
                        )

                    # Re-check the policy against the URL we actually landed on.
                    final_url = str(response.url)
                    blocked = check_website_access(final_url)
                    if blocked:
                        raise PermissionError(blocked["message"])

                    body = bytearray()
                    async for chunk in response.aiter_bytes():
                        body.extend(chunk)
                        if len(body) > _MAX_VIDEO_BASE64_BYTES:
                            raise ValueError(
                                f"Video too large (download exceeded max {_MAX_VIDEO_BASE64_BYTES} bytes)"
                            )

            destination.write_bytes(body)
            return destination
        except (PermissionError, ValueError) as e:
            # Policy blocks, unsafe redirects and size rejections give the
            # same answer on every attempt: fail fast instead of backing off.
            logger.error(
                "Video download rejected (not retrying): %s",
                str(e)[:100], exc_info=True,
            )
            raise
        except Exception as e:
            last_error = e
            if attempt < max_retries - 1:
                wait_time = 2 ** (attempt + 1)
                logger.warning("Video download failed (attempt %s/%s): %s", attempt + 1, max_retries, str(e)[:50])
                await asyncio.sleep(wait_time)
            else:
                logger.error(
                    "Video download failed after %s attempts: %s",
                    max_retries, str(e)[:100], exc_info=True,
                )

    if last_error is None:
        raise RuntimeError(
            f"_download_video exited retry loop without attempting (max_retries={max_retries})"
        )
    raise last_error


def _describe_video_error(exc: Exception) -> str:
    """Map an exception to the user-facing ``analysis`` text for a failed call.

    Categories are tried in order: billing, unsupported model/request, payload
    size, then a generic fallback. HTTP status codes only count when they
    appear as standalone numbers, so a byte count such as ``54020000`` is not
    mistaken for a 402 response.
    """
    import re

    err_str = str(exc).lower()

    def _mentions(*hints: str) -> bool:
        return any(hint in err_str for hint in hints)

    def _has_status(code: str) -> bool:
        return re.search(rf"(?<!\d){code}(?!\d)", err_str) is not None

    if _has_status("402") or _mentions(
        "insufficient", "payment required", "credits", "billing",
    ):
        return (
            "Insufficient credits or payment required. Please top up your "
            f"API provider account and try again. Error: {exc}"
        )
    if _mentions(
        "does not support", "not support video",
        "content_policy", "multimodal",
        "unrecognized request argument", "video input",
        "video_url",
    ):
        return (
            f"The model does not support video analysis or the request was "
            f"rejected. Ensure you're using a video-capable model "
            f"(e.g. google/gemini-2.5-flash). Error: {exc}"
        )
    if _has_status("413") or _mentions(
        "too large", "payload", "content_too_large",
        "request_too_large", "exceeds", "size limit",
    ):
        return (
            "The video is too large for the API. Try compressing or trimming "
            f"the video (max ~50 MB). Error: {exc}"
        )
    return (
        "There was a problem with the request and the video could not "
        f"be analyzed. Error: {exc}"
    )


async def video_analyze_tool(
    video_url: str,
    user_prompt: str,
    model: Optional[str] = None,
) -> str:
    """Analyze a video via multimodal LLM. Returns JSON {success, analysis}.

    Args:
        video_url: HTTP/HTTPS URL, ``file://`` URL, or local file path.
        user_prompt: Instruction sent alongside the video; non-string values
            are converted with ``str()`` and ``None`` becomes ``""``.
        model: Optional model override passed through to the LLM call.

    Returns:
        A JSON string. On success it holds ``success`` and ``analysis``; on
        failure it holds ``success=False``, ``error`` and a user-facing
        ``analysis`` explaining the likely cause. Failures are reported in the
        JSON rather than raised.
    """
    if not isinstance(user_prompt, str):
        user_prompt = str(user_prompt) if user_prompt is not None else ""
    debug_call_data = {
        "parameters": {
            "video_url": video_url,
            "user_prompt": user_prompt[:200] + "..." if len(user_prompt) > 200 else user_prompt,
            "model": model,
        },
        "error": None,
        "success": False,
        "analysis_length": 0,
        "model_used": model,
        "video_size_bytes": 0,
    }

    temp_video_path = None
    should_cleanup = True

    try:
        from tools.interrupt import is_interrupted
        if is_interrupted():
            return tool_error("Interrupted", success=False)

        logger.info("Analyzing video: %s", video_url[:60])
        logger.info("User prompt: %s", user_prompt[:100])

        # Resolve local path vs remote URL. HTTP(S) URLs are never probed on
        # the filesystem: they cannot name a local file, and stat()-ing a very
        # long URL can raise "File name too long" on some Python versions.
        local_path = None
        if not video_url.startswith(("http://", "https://")):
            resolved_url = video_url
            if resolved_url.startswith("file://"):
                resolved_url = resolved_url[len("file://"):]
            local_path = Path(os.path.expanduser(resolved_url))

        if local_path is not None and local_path.is_file():
            logger.info("Using local video file: %s", video_url)
            temp_video_path = local_path
            should_cleanup = False  # never delete the user's own file
        elif _validate_image_url(video_url):
            blocked = check_website_access(video_url)
            if blocked:
                raise PermissionError(blocked["message"])
            temp_dir = get_hermes_dir("cache/video", "temp_video_files")
            # Keep the URL's extension when it is a supported one so the MIME
            # type matches the content; otherwise fall back to ".mp4".
            url_suffix = Path(urlparse(video_url).path).suffix.lower()
            if url_suffix not in _VIDEO_MIME_TYPES:
                url_suffix = ".mp4"
            temp_video_path = temp_dir / f"temp_video_{uuid.uuid4()}{url_suffix}"
            await _download_video(video_url, temp_video_path)
            should_cleanup = True
        else:
            raise ValueError(
                "Invalid video source. Provide an HTTP/HTTPS URL or a valid local file path."
            )

        video_size_bytes = temp_video_path.stat().st_size
        debug_call_data["video_size_bytes"] = video_size_bytes
        video_size_mb = video_size_bytes / (1024 * 1024)
        logger.info("Video ready (%.1f MB)", video_size_mb)

        detected_mime = _detect_video_mime_type(temp_video_path)
        if not detected_mime:
            raise ValueError(
                f"Unsupported video format: '{temp_video_path.suffix}'. "
                f"Supported: {', '.join(sorted(_VIDEO_MIME_TYPES))}"
            )

        if video_size_bytes > _VIDEO_SIZE_WARN_BYTES:
            logger.warning("Video is %.1f MB — may be slow or rejected", video_size_mb)

        # Reject oversized files from their on-disk size, before the whole
        # file is read and base64-encoded in memory.
        expected_length = _video_data_url_length(video_size_bytes, detected_mime)
        if expected_length > _MAX_VIDEO_BASE64_BYTES:
            raise _video_payload_too_large(expected_length)

        video_data_url = _video_to_base64_data_url(temp_video_path, mime_type=detected_mime)
        # Re-check the real payload in case the file changed after stat().
        if len(video_data_url) > _MAX_VIDEO_BASE64_BYTES:
            raise _video_payload_too_large(len(video_data_url))

        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": user_prompt,
                    },
                    {
                        "type": "video_url",
                        "video_url": {
                            "url": video_data_url,
                        },
                    },
                ],
            }
        ]

        vision_timeout = 180.0
        vision_temperature = 0.1
        try:
            from hermes_cli.config import cfg_get, load_config
            _cfg = load_config()
            _vision_cfg = cfg_get(_cfg, "auxiliary", "vision", default={})
            _vt = _vision_cfg.get("timeout")
            if _vt is not None:
                # A configured timeout can raise the 180 s floor, never lower it.
                vision_timeout = max(float(_vt), 180.0)
            _vtemp = _vision_cfg.get("temperature")
            if _vtemp is not None:
                vision_temperature = float(_vtemp)
        except Exception as cfg_error:
            # Best effort: fall back to the defaults above.
            logger.debug("Could not read auxiliary.vision config: %s", cfg_error)

        call_kwargs = {
            "task": "vision",
            "messages": messages,
            "temperature": vision_temperature,
            "max_tokens": 4000,
            "timeout": vision_timeout,
        }
        if model:
            call_kwargs["model"] = model

        response = await async_call_llm(**call_kwargs)
        analysis = extract_content_or_reasoning(response)

        if not analysis:
            logger.warning("Empty video response, retrying once")
            response = await async_call_llm(**call_kwargs)
            analysis = extract_content_or_reasoning(response)

        analysis_length = len(analysis) if analysis else 0
        logger.info("Video analysis completed (%s characters)", analysis_length)

        # NOTE(review): a still-empty response after the retry is reported
        # with success=True plus a fallback message — confirm this is intended.
        result = {
            "success": True,
            "analysis": analysis or "There was a problem with the request and the video could not be analyzed.",
        }

        debug_call_data["success"] = True
        debug_call_data["analysis_length"] = analysis_length
        _debug.log_call("video_analyze_tool", debug_call_data)
        _debug.save()

        return json.dumps(result, indent=2, ensure_ascii=False)

    except Exception as e:
        error_msg = f"Error analyzing video: {str(e)}"
        logger.error("%s", error_msg, exc_info=True)

        result = {
            "success": False,
            "error": error_msg,
            "analysis": _describe_video_error(e),
        }

        debug_call_data["error"] = error_msg
        _debug.log_call("video_analyze_tool", debug_call_data)
        _debug.save()

        return json.dumps(result, indent=2, ensure_ascii=False)

    finally:
        # Only downloaded temp files are removed; local inputs are left alone.
        if should_cleanup and temp_video_path and temp_video_path.exists():
            try:
                temp_video_path.unlink()
                logger.debug("Cleaned up temporary video file")
            except Exception as cleanup_error:
                logger.warning(
                    "Could not delete temporary file: %s", cleanup_error, exc_info=True
                )


VIDEO_ANALYZE_SCHEMA = {
    "name": "video_analyze",
    "description": (
        "Analyze a video from a URL or local file path using a multimodal AI model. "
        "Sends the video to a video-capable model (e.g. Gemini) for understanding. "
        "Use this for video files — for images, use vision_analyze instead. "
        "Supports mp4, webm, mov, avi, mkv, mpeg formats. "
        "Note: large videos (>20 MB) may be slow; max ~50 MB."
    ),
    "parameters": {
        "type": "object",
        "properties": {
            "video_url": {
                "type": "string",
                "description": "Video URL (http/https) or local file path to analyze.",
            },
            "question": {
                "type": "string",
                "description": "Your specific question about the video. The AI will describe what happens in the video and answer your question.",
            },
        },
        "required": ["video_url", "question"],
    },
}


def _handle_video_analyze(args: Dict[str, Any], **kw: Any) -> Awaitable[str]:
    """Registry handler: build the full prompt and start the video analysis.

    Reads ``video_url`` and ``question`` from *args* (both default to ``""``).
    The model comes from ``AUXILIARY_VIDEO_MODEL``, then
    ``AUXILIARY_VISION_MODEL``; when neither is set no override is passed.
    Returns the un-awaited ``video_analyze_tool`` coroutine.
    """
    video_url = args.get("video_url", "")
    question = args.get("question", "")
    full_prompt = (
        "Fully describe and explain everything happening in this video, "
        "including visual content, motion, audio cues, text overlays, and scene "
        f"transitions. Then answer the following question:\n\n{question}"
    )
    model = os.getenv("AUXILIARY_VIDEO_MODEL", "").strip() or os.getenv("AUXILIARY_VISION_MODEL", "").strip() or None
    return video_analyze_tool(video_url, full_prompt, model)


registry.register(
    name="video_analyze",
    toolset="video",
    schema=VIDEO_ANALYZE_SCHEMA,
    handler=_handle_video_analyze,
    check_fn=check_vision_requirements,
    is_async=True,
    emoji="🎬",
)