model_metadata.py
1 """Model metadata, context lengths, and token estimation utilities. 2 3 Pure utility functions with no AIAgent dependency. Used by ContextCompressor 4 and run_agent.py for pre-flight context checks. 5 """ 6 7 import ipaddress 8 import logging 9 import os 10 import re 11 import time 12 from pathlib import Path 13 from typing import Any, Dict, List, Optional 14 from urllib.parse import urlparse 15 16 import requests 17 import yaml 18 19 from utils import base_url_host_matches, base_url_hostname 20 21 from hermes_constants import OPENROUTER_MODELS_URL 22 23 logger = logging.getLogger(__name__) 24 25 26 def _resolve_requests_verify() -> bool | str: 27 """Resolve SSL verify setting for `requests` calls from env vars. 28 29 The `requests` library only honours REQUESTS_CA_BUNDLE / CURL_CA_BUNDLE 30 by default. Hermes also honours HERMES_CA_BUNDLE (its own convention) 31 and SSL_CERT_FILE (used by the stdlib `ssl` module and by httpx), so 32 that a single env var can cover both `requests` and `httpx` callsites 33 inside the same process. 34 35 Returns either a filesystem path to a CA bundle, or True to defer to 36 the requests default (certifi). 37 """ 38 for env_var in ("HERMES_CA_BUNDLE", "REQUESTS_CA_BUNDLE", "SSL_CERT_FILE"): 39 val = os.getenv(env_var) 40 if val and os.path.isfile(val): 41 return val 42 return True 43 44 # Provider names that can appear as a "provider:" prefix before a model ID. 45 # Only these are stripped — Ollama-style "model:tag" colons (e.g. "qwen3.5:27b") 46 # are preserved so the full model name reaches cache lookups and server queries. 47 _PROVIDER_PREFIXES: frozenset[str] = frozenset({ 48 "openrouter", "nous", "openai-codex", "copilot", "copilot-acp", "opencode-kimi-oauth", 49 "gemini", "ollama-cloud", "zai", "kimi-coding", "kimi-coding-cn", "stepfun", "minimax", "minimax-oauth", "minimax-cn", "anthropic", "deepseek", 50 "opencode-zen", "opencode-go", "ai-gateway", "kilocode", "alibaba", 51 "qwen-oauth", 52 "xiaomi", 53 "arcee", 54 "gmi", 55 "tencent-tokenhub", 56 "custom", "local", 57 # Common aliases 58 "google", "google-gemini", "google-ai-studio", 59 "glm", "z-ai", "z.ai", "zhipu", "github", "github-copilot", 60 "github-models", "kimi", "moonshot", "kimi-cn", "moonshot-cn", "claude", "deep-seek", 61 "ollama", 62 "stepfun", "opencode", "zen", "go", "vercel", "kilo", "dashscope", "aliyun", "qwen", 63 "mimo", "xiaomi-mimo", 64 "tencent", "tokenhub", "tencent-cloud", "tencentmaas", 65 "arcee-ai", "arceeai", 66 "gmi-cloud", "gmicloud", 67 "xai", "x-ai", "x.ai", "grok", 68 "nvidia", "nim", "nvidia-nim", "nemotron", 69 "qwen-portal", 70 }) 71 72 73 _OLLAMA_TAG_PATTERN = re.compile( 74 r"^(\d+\.?\d*b|latest|stable|q\d|fp?\d|instruct|chat|coder|vision|text)", 75 re.IGNORECASE, 76 ) 77 78 79 # Tailscale's CGNAT range (RFC 6598). `ipaddress.is_private` excludes this 80 # block, so without an explicit check Ollama reached over Tailscale (e.g. 81 # `http://100.77.243.5:11434`) wouldn't be treated as local and its stream 82 # read / stale timeouts wouldn't get auto-bumped. Built once at import time. 83 _TAILSCALE_CGNAT = ipaddress.IPv4Network("100.64.0.0/10") 84 85 86 def _strip_provider_prefix(model: str) -> str: 87 """Strip a recognised provider prefix from a model string. 
88 89 ``"local:my-model"`` → ``"my-model"`` 90 ``"qwen3.5:27b"`` → ``"qwen3.5:27b"`` (unchanged — not a provider prefix) 91 ``"qwen:0.5b"`` → ``"qwen:0.5b"`` (unchanged — Ollama model:tag) 92 ``"deepseek:latest"``→ ``"deepseek:latest"``(unchanged — Ollama model:tag) 93 """ 94 if ":" not in model or model.startswith("http"): 95 return model 96 prefix, suffix = model.split(":", 1) 97 prefix_lower = prefix.strip().lower() 98 if prefix_lower in _PROVIDER_PREFIXES: 99 # Don't strip if suffix looks like an Ollama tag (e.g. "7b", "latest", "q4_0") 100 if _OLLAMA_TAG_PATTERN.match(suffix.strip()): 101 return model 102 return suffix 103 return model 104 105 _model_metadata_cache: Dict[str, Dict[str, Any]] = {} 106 _model_metadata_cache_time: float = 0 107 _MODEL_CACHE_TTL = 3600 108 _endpoint_model_metadata_cache: Dict[str, Dict[str, Dict[str, Any]]] = {} 109 _endpoint_model_metadata_cache_time: Dict[str, float] = {} 110 _ENDPOINT_MODEL_CACHE_TTL = 300 111 112 # Descending tiers for context length probing when the model is unknown. 113 # We start at 256K (covers GPT-5.x, many current large-context models) and 114 # step down on context-length errors until one works. Tier[0] is also the 115 # default fallback when no detection method succeeds. 116 CONTEXT_PROBE_TIERS = [ 117 256_000, 118 128_000, 119 64_000, 120 32_000, 121 16_000, 122 8_000, 123 ] 124 125 # Default context length when no detection method succeeds. 126 DEFAULT_FALLBACK_CONTEXT = CONTEXT_PROBE_TIERS[0] 127 128 # Minimum context length required to run Hermes Agent. Models with fewer 129 # tokens cannot maintain enough working memory for tool-calling workflows. 130 # Sessions, model switches, and cron jobs should reject models below this. 131 MINIMUM_CONTEXT_LENGTH = 64_000 132 133 # Thin fallback defaults — only broad model family patterns. 134 # These fire only when provider is unknown AND models.dev/OpenRouter/Anthropic 135 # all miss. Replaced the previous 80+ entry dict. 136 # For provider-specific context lengths, models.dev is the primary source. 137 DEFAULT_CONTEXT_LENGTHS = { 138 # Anthropic Claude 4.6 (1M context) — bare IDs only to avoid 139 # fuzzy-match collisions (e.g. "anthropic/claude-sonnet-4" is a 140 # substring of "anthropic/claude-sonnet-4.6"). 141 # OpenRouter-prefixed models resolve via OpenRouter live API or models.dev. 142 "claude-opus-4-7": 1000000, 143 "claude-opus-4.7": 1000000, 144 "claude-opus-4-6": 1000000, 145 "claude-sonnet-4-6": 1000000, 146 "claude-opus-4.6": 1000000, 147 "claude-sonnet-4.6": 1000000, 148 # Catch-all for older Claude models (must sort after specific entries) 149 "claude": 200000, 150 # OpenAI — GPT-5 family (most have 400k; specific overrides first) 151 # Source: https://developers.openai.com/api/docs/models 152 # GPT-5.5 (launched Apr 23 2026) is 1.05M on the direct OpenAI API and 153 # ChatGPT Codex OAuth caps it at 272K; both paths resolve via their own 154 # provider-aware branches (_resolve_codex_oauth_context_length + models.dev). 155 # This hardcoded value is only reached when every probe misses. 
156 "gpt-5.5": 1050000, 157 "gpt-5.4-nano": 400000, # 400k (not 1.05M like full 5.4) 158 "gpt-5.4-mini": 400000, # 400k (not 1.05M like full 5.4) 159 "gpt-5.4": 1050000, # GPT-5.4, GPT-5.4 Pro (1.05M context) 160 "gpt-5.1-chat": 128000, # Chat variant has 128k context 161 "gpt-5": 400000, # GPT-5.x base, mini, codex variants (400k) 162 "gpt-4.1": 1047576, 163 "gpt-4": 128000, 164 # Google 165 "gemini": 1048576, 166 # Gemma (open models served via AI Studio) 167 "gemma-4": 256000, # Gemma 4 family 168 "gemma4": 256000, # Ollama-style naming (e.g. gemma4:31b-cloud) 169 "gemma-4-31b": 256000, 170 "gemma-3": 131072, 171 "gemma": 8192, # fallback for older gemma models 172 # DeepSeek — V4 family ships with a 1M context window. The legacy 173 # aliases ``deepseek-chat`` / ``deepseek-reasoner`` are server-side 174 # mapped to the non-thinking / thinking modes of ``deepseek-v4-flash`` 175 # and inherit the same 1M window. The ``deepseek`` substring entry 176 # below remains as a 128K fallback for older / unknown DeepSeek model 177 # ids (e.g. via custom endpoints). 178 # https://api-docs.deepseek.com/zh-cn/quick_start/pricing 179 "deepseek-v4-pro": 1_000_000, 180 "deepseek-v4-flash": 1_000_000, 181 "deepseek-chat": 1_000_000, 182 "deepseek-reasoner": 1_000_000, 183 "deepseek": 128000, 184 # Meta 185 "llama": 131072, 186 # Qwen — specific model families before the catch-all. 187 # Official docs: https://help.aliyun.com/zh/model-studio/developer-reference/ 188 "qwen3-coder-plus": 1000000, # 1M context 189 "qwen3-coder": 262144, # 256K context 190 "qwen": 131072, 191 # MiniMax — official docs: 204,800 context for all models 192 # https://platform.minimax.io/docs/api-reference/text-anthropic-api 193 "minimax": 204800, 194 # GLM 195 "glm": 202752, 196 # xAI Grok — xAI /v1/models does not return context_length metadata, 197 # so these hardcoded fallbacks prevent Hermes from probing-down to 198 # the default 128k when the user points at https://api.x.ai/v1 199 # via a custom provider. Values sourced from models.dev (2026-04). 200 # Keys use substring matching (longest-first), so e.g. "grok-4.20" 201 # matches "grok-4.20-0309-reasoning" / "-non-reasoning" / "-multi-agent-0309". 
202 "grok-code-fast": 256000, # grok-code-fast-1 203 "grok-4-1-fast": 2000000, # grok-4-1-fast-(non-)reasoning 204 "grok-2-vision": 8192, # grok-2-vision, -1212, -latest 205 "grok-4-fast": 2000000, # grok-4-fast-(non-)reasoning 206 "grok-4.20": 2000000, # grok-4.20-0309-(non-)reasoning, -multi-agent-0309 207 "grok-4": 256000, # grok-4, grok-4-0709 208 "grok-3": 131072, # grok-3, grok-3-mini, grok-3-fast, grok-3-mini-fast 209 "grok-2": 131072, # grok-2, grok-2-1212, grok-2-latest 210 "grok": 131072, # catch-all (grok-beta, unknown grok-*) 211 # Kimi 212 "kimi": 262144, 213 # Tencent — Hy3 Preview (Hunyuan) with 256K context window 214 "hy3-preview": 256000, 215 # Nemotron — NVIDIA's open-weights series (128K context across all sizes) 216 "nemotron": 131072, 217 # Arcee 218 "trinity": 262144, 219 # OpenRouter 220 "elephant": 262144, 221 # Hugging Face Inference Providers — model IDs use org/name format 222 "Qwen/Qwen3.5-397B-A17B": 131072, 223 "Qwen/Qwen3.5-35B-A3B": 131072, 224 "deepseek-ai/DeepSeek-V3.2": 65536, 225 "moonshotai/Kimi-K2.5": 262144, 226 "moonshotai/Kimi-K2.6": 262144, 227 "moonshotai/Kimi-K2-Thinking": 262144, 228 "MiniMaxAI/MiniMax-M2.5": 204800, 229 "XiaomiMiMo/MiMo-V2-Flash": 262144, 230 "mimo-v2-pro": 1048576, 231 "mimo-v2.5-pro": 1048576, 232 "mimo-v2.5": 1048576, 233 "mimo-v2-omni": 262144, 234 "mimo-v2-flash": 262144, 235 "zai-org/GLM-5": 202752, 236 } 237 238 _CONTEXT_LENGTH_KEYS = ( 239 "context_length", 240 "context_window", 241 "max_context_length", 242 "max_position_embeddings", 243 "max_model_len", 244 "max_input_tokens", 245 "max_sequence_length", 246 "max_seq_len", 247 "n_ctx_train", 248 "n_ctx", 249 "ctx_size", 250 ) 251 252 _MAX_COMPLETION_KEYS = ( 253 "max_completion_tokens", 254 "max_output_tokens", 255 "max_tokens", 256 ) 257 258 # Local server hostnames / address patterns 259 _LOCAL_HOSTS = ("localhost", "127.0.0.1", "::1", "0.0.0.0") 260 # Docker / Podman / Lima DNS names that resolve to the host machine 261 _CONTAINER_LOCAL_SUFFIXES = ( 262 ".docker.internal", 263 ".containers.internal", 264 ".lima.internal", 265 ) 266 267 268 def _normalize_base_url(base_url: str) -> str: 269 return (base_url or "").strip().rstrip("/") 270 271 272 def _auth_headers(api_key: str = "") -> Dict[str, str]: 273 token = str(api_key or "").strip() 274 if not token: 275 return {} 276 return {"Authorization": f"Bearer {token}"} 277 278 279 def _is_openrouter_base_url(base_url: str) -> bool: 280 return base_url_host_matches(base_url, "openrouter.ai") 281 282 283 def _is_custom_endpoint(base_url: str) -> bool: 284 normalized = _normalize_base_url(base_url) 285 return bool(normalized) and not _is_openrouter_base_url(normalized) 286 287 288 _URL_TO_PROVIDER: Dict[str, str] = { 289 "api.openai.com": "openai", 290 "chatgpt.com": "openai", 291 "api.anthropic.com": "anthropic", 292 "api.z.ai": "zai", 293 "open.bigmodel.cn": "zai", 294 "api.moonshot.ai": "kimi-coding", 295 "api.moonshot.cn": "kimi-coding-cn", 296 "api.kimi.com": "kimi-coding", 297 "api.stepfun.ai": "stepfun", 298 "api.stepfun.com": "stepfun", 299 "api.arcee.ai": "arcee", 300 "api.minimax": "minimax", 301 "dashscope.aliyuncs.com": "alibaba", 302 "dashscope-intl.aliyuncs.com": "alibaba", 303 "portal.qwen.ai": "qwen-oauth", 304 "openrouter.ai": "openrouter", 305 "generativelanguage.googleapis.com": "gemini", 306 "inference-api.nousresearch.com": "nous", 307 "api.deepseek.com": "deepseek", 308 "api.githubcopilot.com": "copilot", 309 "models.github.ai": "copilot", 310 "api.fireworks.ai": "fireworks", 311 "opencode.ai": 
"opencode-go", 312 "api.x.ai": "xai", 313 "integrate.api.nvidia.com": "nvidia", 314 "api.xiaomimimo.com": "xiaomi", 315 "xiaomimimo.com": "xiaomi", 316 "api.gmi-serving.com": "gmi", 317 "tokenhub.tencentmaas.com": "tencent-tokenhub", 318 "ollama.com": "ollama-cloud", 319 } 320 321 322 def _infer_provider_from_url(base_url: str) -> Optional[str]: 323 """Infer the models.dev provider name from a base URL. 324 325 This allows context length resolution via models.dev for custom endpoints 326 like DashScope (Alibaba), Z.AI, Kimi, etc. without requiring the user to 327 explicitly set the provider name in config. 328 """ 329 normalized = _normalize_base_url(base_url) 330 if not normalized: 331 return None 332 parsed = urlparse(normalized if "://" in normalized else f"https://{normalized}") 333 host = parsed.netloc.lower() or parsed.path.lower() 334 for url_part, provider in _URL_TO_PROVIDER.items(): 335 if url_part in host: 336 return provider 337 return None 338 339 340 def _is_known_provider_base_url(base_url: str) -> bool: 341 return _infer_provider_from_url(base_url) is not None 342 343 344 def is_local_endpoint(base_url: str) -> bool: 345 """Return True if base_url points to a local machine. 346 347 Recognises loopback (``localhost``, ``127.0.0.0/8``, ``::1``), 348 container-internal DNS names (``host.docker.internal`` et al.), 349 RFC-1918 private ranges (``10/8``, ``172.16/12``, ``192.168/16``), 350 link-local, and Tailscale CGNAT (``100.64.0.0/10``). Tailscale CGNAT 351 is included so remote-but-trusted Ollama boxes reached over a 352 Tailscale mesh get the same timeout auto-bumps as localhost Ollama. 353 """ 354 normalized = _normalize_base_url(base_url) 355 if not normalized: 356 return False 357 url = normalized if "://" in normalized else f"http://{normalized}" 358 try: 359 parsed = urlparse(url) 360 host = parsed.hostname or "" 361 except Exception: 362 return False 363 if host in _LOCAL_HOSTS: 364 return True 365 # Docker / Podman / Lima internal DNS names (e.g. host.docker.internal) 366 if any(host.endswith(suffix) for suffix in _CONTAINER_LOCAL_SUFFIXES): 367 return True 368 # RFC-1918 private ranges, link-local, and Tailscale CGNAT 369 try: 370 addr = ipaddress.ip_address(host) 371 if addr.is_private or addr.is_loopback or addr.is_link_local: 372 return True 373 if isinstance(addr, ipaddress.IPv4Address) and addr in _TAILSCALE_CGNAT: 374 return True 375 except ValueError: 376 pass 377 # Bare IP that looks like a private range (e.g. 172.26.x.x for WSL) 378 # or Tailscale CGNAT (100.64.x.x–100.127.x.x). 379 parts = host.split(".") 380 if len(parts) == 4: 381 try: 382 first, second = int(parts[0]), int(parts[1]) 383 if first == 10: 384 return True 385 if first == 172 and 16 <= second <= 31: 386 return True 387 if first == 192 and second == 168: 388 return True 389 if first == 100 and 64 <= second <= 127: 390 return True 391 except ValueError: 392 pass 393 return False 394 395 396 def detect_local_server_type(base_url: str, api_key: str = "") -> Optional[str]: 397 """Detect which local server is running at base_url by probing known endpoints. 398 399 Returns one of: "ollama", "lm-studio", "vllm", "llamacpp", or None. 
400 """ 401 import httpx 402 403 normalized = _normalize_base_url(base_url) 404 server_url = normalized 405 if server_url.endswith("/v1"): 406 server_url = server_url[:-3] 407 408 headers = _auth_headers(api_key) 409 410 try: 411 with httpx.Client(timeout=2.0, headers=headers) as client: 412 # LM Studio exposes /api/v1/models — check first (most specific) 413 try: 414 r = client.get(f"{server_url}/api/v1/models") 415 if r.status_code == 200: 416 return "lm-studio" 417 except Exception: 418 pass 419 # Ollama exposes /api/tags and responds with {"models": [...]} 420 # LM Studio returns {"error": "Unexpected endpoint"} with status 200 421 # on this path, so we must verify the response contains "models". 422 try: 423 r = client.get(f"{server_url}/api/tags") 424 if r.status_code == 200: 425 try: 426 data = r.json() 427 if "models" in data: 428 return "ollama" 429 except Exception: 430 pass 431 except Exception: 432 pass 433 # llama.cpp exposes /v1/props (older builds used /props without the /v1 prefix) 434 try: 435 r = client.get(f"{server_url}/v1/props") 436 if r.status_code != 200: 437 r = client.get(f"{server_url}/props") # fallback for older builds 438 if r.status_code == 200 and "default_generation_settings" in r.text: 439 return "llamacpp" 440 except Exception: 441 pass 442 # vLLM: /version 443 try: 444 r = client.get(f"{server_url}/version") 445 if r.status_code == 200: 446 data = r.json() 447 if "version" in data: 448 return "vllm" 449 except Exception: 450 pass 451 except Exception: 452 pass 453 454 return None 455 456 457 def _iter_nested_dicts(value: Any): 458 if isinstance(value, dict): 459 yield value 460 for nested in value.values(): 461 yield from _iter_nested_dicts(nested) 462 elif isinstance(value, list): 463 for item in value: 464 yield from _iter_nested_dicts(item) 465 466 467 def _coerce_reasonable_int(value: Any, minimum: int = 1024, maximum: int = 10_000_000) -> Optional[int]: 468 try: 469 if isinstance(value, bool): 470 return None 471 if isinstance(value, str): 472 value = value.strip().replace(",", "") 473 result = int(value) 474 except (TypeError, ValueError): 475 return None 476 if minimum <= result <= maximum: 477 return result 478 return None 479 480 481 def _extract_first_int(payload: Dict[str, Any], keys: tuple[str, ...]) -> Optional[int]: 482 keyset = {key.lower() for key in keys} 483 for mapping in _iter_nested_dicts(payload): 484 for key, value in mapping.items(): 485 if str(key).lower() not in keyset: 486 continue 487 coerced = _coerce_reasonable_int(value) 488 if coerced is not None: 489 return coerced 490 return None 491 492 493 def _extract_context_length(payload: Dict[str, Any]) -> Optional[int]: 494 return _extract_first_int(payload, _CONTEXT_LENGTH_KEYS) 495 496 497 def _extract_max_completion_tokens(payload: Dict[str, Any]) -> Optional[int]: 498 return _extract_first_int(payload, _MAX_COMPLETION_KEYS) 499 500 501 def _extract_pricing(payload: Dict[str, Any]) -> Dict[str, Any]: 502 alias_map = { 503 "prompt": ("prompt", "input", "input_cost_per_token", "prompt_token_cost"), 504 "completion": ("completion", "output", "output_cost_per_token", "completion_token_cost"), 505 "request": ("request", "request_cost"), 506 "cache_read": ("cache_read", "cached_prompt", "input_cache_read", "cache_read_cost_per_token"), 507 "cache_write": ("cache_write", "cache_creation", "input_cache_write", "cache_write_cost_per_token"), 508 } 509 for mapping in _iter_nested_dicts(payload): 510 normalized = {str(key).lower(): value for key, value in mapping.items()} 511 if not 
any(any(alias in normalized for alias in aliases) for aliases in alias_map.values()): 512 continue 513 pricing: Dict[str, Any] = {} 514 for target, aliases in alias_map.items(): 515 for alias in aliases: 516 if alias in normalized and normalized[alias] not in (None, ""): 517 pricing[target] = normalized[alias] 518 break 519 if pricing: 520 return pricing 521 return {} 522 523 524 def _add_model_aliases(cache: Dict[str, Dict[str, Any]], model_id: str, entry: Dict[str, Any]) -> None: 525 cache[model_id] = entry 526 if "/" in model_id: 527 bare_model = model_id.split("/", 1)[1] 528 cache.setdefault(bare_model, entry) 529 530 531 def fetch_model_metadata(force_refresh: bool = False) -> Dict[str, Dict[str, Any]]: 532 """Fetch model metadata from OpenRouter (cached for 1 hour).""" 533 global _model_metadata_cache, _model_metadata_cache_time 534 535 if not force_refresh and _model_metadata_cache and (time.time() - _model_metadata_cache_time) < _MODEL_CACHE_TTL: 536 return _model_metadata_cache 537 538 try: 539 response = requests.get(OPENROUTER_MODELS_URL, timeout=10, verify=_resolve_requests_verify()) 540 response.raise_for_status() 541 data = response.json() 542 543 cache = {} 544 for model in data.get("data", []): 545 model_id = model.get("id", "") 546 entry = { 547 "context_length": model.get("context_length", 128000), 548 "max_completion_tokens": model.get("top_provider", {}).get("max_completion_tokens", 4096), 549 "name": model.get("name", model_id), 550 "pricing": model.get("pricing", {}), 551 } 552 _add_model_aliases(cache, model_id, entry) 553 canonical = model.get("canonical_slug", "") 554 if canonical and canonical != model_id: 555 _add_model_aliases(cache, canonical, entry) 556 557 _model_metadata_cache = cache 558 _model_metadata_cache_time = time.time() 559 logger.debug("Fetched metadata for %s models from OpenRouter", len(cache)) 560 return cache 561 562 except Exception as e: 563 logging.warning(f"Failed to fetch model metadata from OpenRouter: {e}") 564 return _model_metadata_cache or {} 565 566 567 def fetch_endpoint_model_metadata( 568 base_url: str, 569 api_key: str = "", 570 force_refresh: bool = False, 571 ) -> Dict[str, Dict[str, Any]]: 572 """Fetch model metadata from an OpenAI-compatible ``/models`` endpoint. 573 574 This is used for explicit custom endpoints where hardcoded global model-name 575 defaults are unreliable. Results are cached in memory per base URL. 
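    Returns a mapping keyed by model id (plus a bare alias without the ``org/``
    prefix, when one is present) to an entry that may carry ``context_length``,
    ``max_completion_tokens``, ``name``, and ``pricing``. A hypothetical entry,
    shown for shape only::

        {"my-model": {"name": "my-model", "context_length": 32768,
                      "max_completion_tokens": 4096}}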
576 """ 577 normalized = _normalize_base_url(base_url) 578 if not normalized or _is_openrouter_base_url(normalized): 579 return {} 580 581 if not force_refresh: 582 cached = _endpoint_model_metadata_cache.get(normalized) 583 cached_at = _endpoint_model_metadata_cache_time.get(normalized, 0) 584 if cached is not None and (time.time() - cached_at) < _ENDPOINT_MODEL_CACHE_TTL: 585 return cached 586 587 candidates = [normalized] 588 if normalized.endswith("/v1"): 589 alternate = normalized[:-3].rstrip("/") 590 else: 591 alternate = normalized + "/v1" 592 if alternate and alternate not in candidates: 593 candidates.append(alternate) 594 595 headers = {"Authorization": f"Bearer {api_key}"} if api_key else {} 596 last_error: Optional[Exception] = None 597 598 if is_local_endpoint(normalized): 599 try: 600 if detect_local_server_type(normalized, api_key=api_key) == "lm-studio": 601 server_url = normalized[:-3].rstrip("/") if normalized.endswith("/v1") else normalized 602 response = requests.get( 603 server_url.rstrip("/") + "/api/v1/models", 604 headers=headers, 605 timeout=10, 606 verify=_resolve_requests_verify(), 607 ) 608 response.raise_for_status() 609 payload = response.json() 610 cache: Dict[str, Dict[str, Any]] = {} 611 for model in payload.get("models", []): 612 if not isinstance(model, dict): 613 continue 614 model_id = model.get("key") or model.get("id") 615 if not model_id: 616 continue 617 entry: Dict[str, Any] = {"name": model.get("name", model_id)} 618 619 context_length = None 620 for inst in model.get("loaded_instances", []) or []: 621 if not isinstance(inst, dict): 622 continue 623 cfg = inst.get("config", {}) 624 ctx = cfg.get("context_length") if isinstance(cfg, dict) else None 625 if isinstance(ctx, int) and ctx > 0: 626 context_length = ctx 627 break 628 if context_length is not None: 629 entry["context_length"] = context_length 630 631 max_completion_tokens = _extract_max_completion_tokens(model) 632 if max_completion_tokens is not None: 633 entry["max_completion_tokens"] = max_completion_tokens 634 635 pricing = _extract_pricing(model) 636 if pricing: 637 entry["pricing"] = pricing 638 639 _add_model_aliases(cache, model_id, entry) 640 alt_id = model.get("id") 641 if isinstance(alt_id, str) and alt_id and alt_id != model_id: 642 _add_model_aliases(cache, alt_id, entry) 643 644 _endpoint_model_metadata_cache[normalized] = cache 645 _endpoint_model_metadata_cache_time[normalized] = time.time() 646 return cache 647 except Exception as exc: 648 last_error = exc 649 650 for candidate in candidates: 651 url = candidate.rstrip("/") + "/models" 652 try: 653 response = requests.get(url, headers=headers, timeout=10, verify=_resolve_requests_verify()) 654 response.raise_for_status() 655 payload = response.json() 656 cache: Dict[str, Dict[str, Any]] = {} 657 for model in payload.get("data", []): 658 if not isinstance(model, dict): 659 continue 660 model_id = model.get("id") 661 if not model_id: 662 continue 663 entry: Dict[str, Any] = {"name": model.get("name", model_id)} 664 context_length = _extract_context_length(model) 665 if context_length is not None: 666 entry["context_length"] = context_length 667 max_completion_tokens = _extract_max_completion_tokens(model) 668 if max_completion_tokens is not None: 669 entry["max_completion_tokens"] = max_completion_tokens 670 pricing = _extract_pricing(model) 671 if pricing: 672 entry["pricing"] = pricing 673 _add_model_aliases(cache, model_id, entry) 674 675 # If this is a llama.cpp server, query /props for actual allocated context 676 
is_llamacpp = any( 677 m.get("owned_by") == "llamacpp" 678 for m in payload.get("data", []) if isinstance(m, dict) 679 ) 680 if is_llamacpp: 681 try: 682 # Try /v1/props first (current llama.cpp); fall back to /props for older builds 683 base = candidate.rstrip("/").replace("/v1", "") 684 _verify = _resolve_requests_verify() 685 props_resp = requests.get(base + "/v1/props", headers=headers, timeout=5, verify=_verify) 686 if not props_resp.ok: 687 props_resp = requests.get(base + "/props", headers=headers, timeout=5, verify=_verify) 688 if props_resp.ok: 689 props = props_resp.json() 690 gen_settings = props.get("default_generation_settings", {}) 691 n_ctx = gen_settings.get("n_ctx") 692 model_alias = props.get("model_alias", "") 693 if n_ctx and model_alias and model_alias in cache: 694 cache[model_alias]["context_length"] = n_ctx 695 except Exception: 696 pass 697 698 _endpoint_model_metadata_cache[normalized] = cache 699 _endpoint_model_metadata_cache_time[normalized] = time.time() 700 return cache 701 except Exception as exc: 702 last_error = exc 703 704 if last_error: 705 logger.debug("Failed to fetch model metadata from %s/models: %s", normalized, last_error) 706 _endpoint_model_metadata_cache[normalized] = {} 707 _endpoint_model_metadata_cache_time[normalized] = time.time() 708 return {} 709 710 711 def _resolve_endpoint_context_length( 712 model: str, 713 base_url: str, 714 api_key: str = "", 715 ) -> Optional[int]: 716 """Resolve context length from an endpoint's live ``/models`` metadata.""" 717 endpoint_metadata = fetch_endpoint_model_metadata(base_url, api_key=api_key) 718 matched = endpoint_metadata.get(model) 719 if not matched: 720 if len(endpoint_metadata) == 1: 721 matched = next(iter(endpoint_metadata.values())) 722 else: 723 for key, entry in endpoint_metadata.items(): 724 if model in key or key in model: 725 matched = entry 726 break 727 if matched: 728 context_length = matched.get("context_length") 729 if isinstance(context_length, int): 730 return context_length 731 return None 732 733 734 def _get_context_cache_path() -> Path: 735 """Return path to the persistent context length cache file.""" 736 from hermes_constants import get_hermes_home 737 return get_hermes_home() / "context_length_cache.yaml" 738 739 740 def _load_context_cache() -> Dict[str, int]: 741 """Load the model+provider -> context_length cache from disk.""" 742 path = _get_context_cache_path() 743 if not path.exists(): 744 return {} 745 try: 746 with open(path) as f: 747 data = yaml.safe_load(f) or {} 748 return data.get("context_lengths", {}) 749 except Exception as e: 750 logger.debug("Failed to load context length cache: %s", e) 751 return {} 752 753 754 def save_context_length(model: str, base_url: str, length: int) -> None: 755 """Persist a discovered context length for a model+provider combo. 756 757 Cache key is ``model@base_url`` so the same model name served from 758 different providers can have different limits. 
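    On disk the cache is a small YAML document (``context_length_cache.yaml``
    under the Hermes home directory) holding a single ``context_lengths``
    mapping. A hypothetical entry, shown for shape only::

        context_lengths:
          "qwen3.5:27b@http://localhost:11434/v1": 32768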
759 """ 760 key = f"{model}@{base_url}" 761 cache = _load_context_cache() 762 if cache.get(key) == length: 763 return # already stored 764 cache[key] = length 765 path = _get_context_cache_path() 766 try: 767 path.parent.mkdir(parents=True, exist_ok=True) 768 with open(path, "w") as f: 769 yaml.dump({"context_lengths": cache}, f, default_flow_style=False) 770 logger.info("Cached context length %s -> %s tokens", key, f"{length:,}") 771 except Exception as e: 772 logger.debug("Failed to save context length cache: %s", e) 773 774 775 def get_cached_context_length(model: str, base_url: str) -> Optional[int]: 776 """Look up a previously discovered context length for model+provider.""" 777 key = f"{model}@{base_url}" 778 cache = _load_context_cache() 779 return cache.get(key) 780 781 782 def _invalidate_cached_context_length(model: str, base_url: str) -> None: 783 """Drop a stale cache entry so it gets re-resolved on the next lookup.""" 784 key = f"{model}@{base_url}" 785 cache = _load_context_cache() 786 if key not in cache: 787 return 788 del cache[key] 789 path = _get_context_cache_path() 790 try: 791 path.parent.mkdir(parents=True, exist_ok=True) 792 with open(path, "w") as f: 793 yaml.dump({"context_lengths": cache}, f, default_flow_style=False) 794 except Exception as e: 795 logger.debug("Failed to invalidate context length cache entry %s: %s", key, e) 796 797 798 def get_next_probe_tier(current_length: int) -> Optional[int]: 799 """Return the next lower probe tier, or None if already at minimum.""" 800 for tier in CONTEXT_PROBE_TIERS: 801 if tier < current_length: 802 return tier 803 return None 804 805 806 def parse_context_limit_from_error(error_msg: str) -> Optional[int]: 807 """Try to extract the actual context limit from an API error message. 808 809 Many providers include the limit in their error text, e.g.: 810 - "maximum context length is 32768 tokens" 811 - "context_length_exceeded: 131072" 812 - "Maximum context size 32768 exceeded" 813 - "model's max context length is 65536" 814 """ 815 error_lower = error_msg.lower() 816 # Pattern: look for numbers near context-related keywords 817 patterns = [ 818 r'(?:max(?:imum)?|limit)\s*(?:context\s*)?(?:length|size|window)?\s*(?:is|of|:)?\s*(\d{4,})', 819 r'context\s*(?:length|size|window)\s*(?:is|of|:)?\s*(\d{4,})', 820 r'(\d{4,})\s*(?:token)?\s*(?:context|limit)', 821 r'>\s*(\d{4,})\s*(?:max|limit|token)', # "250000 tokens > 200000 maximum" 822 r'(\d{4,})\s*(?:max(?:imum)?)\b', # "200000 maximum" 823 ] 824 for pattern in patterns: 825 match = re.search(pattern, error_lower) 826 if match: 827 limit = int(match.group(1)) 828 # Sanity check: must be a reasonable context length 829 if 1024 <= limit <= 10_000_000: 830 return limit 831 return None 832 833 834 def parse_available_output_tokens_from_error(error_msg: str) -> Optional[int]: 835 """Detect an "output cap too large" error and return how many output tokens are available. 836 837 Background — two distinct context errors exist: 838 1. "Prompt too long" — the INPUT itself exceeds the context window. 839 Fix: compress history and/or halve context_length. 840 2. "max_tokens too large" — input is fine, but input + requested_output > window. 841 Fix: reduce max_tokens (the output cap) for this call. 842 Do NOT touch context_length — the window hasn't shrunk. 843 844 Anthropic's API returns errors like: 845 "max_tokens: 32768 > context_window: 200000 - input_tokens: 190000 = available_tokens: 10000" 846 847 Returns the number of output tokens that would fit (e.g. 
10000 above), or None if 848 the error does not look like a max_tokens-too-large error. 849 """ 850 error_lower = error_msg.lower() 851 852 # Must look like an output-cap error, not a prompt-length error. 853 is_output_cap_error = ( 854 "max_tokens" in error_lower 855 and ("available_tokens" in error_lower or "available tokens" in error_lower) 856 ) 857 if not is_output_cap_error: 858 return None 859 860 # Extract the available_tokens figure. 861 # Anthropic format: "… = available_tokens: 10000" 862 patterns = [ 863 r'available_tokens[:\s]+(\d+)', 864 r'available\s+tokens[:\s]+(\d+)', 865 # fallback: last number after "=" in expressions like "200000 - 190000 = 10000" 866 r'=\s*(\d+)\s*$', 867 ] 868 for pattern in patterns: 869 match = re.search(pattern, error_lower) 870 if match: 871 tokens = int(match.group(1)) 872 if tokens >= 1: 873 return tokens 874 return None 875 876 877 def _model_id_matches(candidate_id: str, lookup_model: str) -> bool: 878 """Return True if *candidate_id* (from server) matches *lookup_model* (configured). 879 880 Supports two forms: 881 - Exact match: "nvidia-nemotron-super-49b-v1" == "nvidia-nemotron-super-49b-v1" 882 - Slug match: "nvidia/nvidia-nemotron-super-49b-v1" matches "nvidia-nemotron-super-49b-v1" 883 (the part after the last "/" equals lookup_model) 884 885 This covers LM Studio's native API which stores models as "publisher/slug" 886 while users typically configure only the slug after the "local:" prefix. 887 """ 888 if candidate_id == lookup_model: 889 return True 890 # Slug match: basename of candidate equals the lookup name 891 if "/" in candidate_id and candidate_id.rsplit("/", 1)[1] == lookup_model: 892 return True 893 return False 894 895 896 def query_ollama_num_ctx(model: str, base_url: str, api_key: str = "") -> Optional[int]: 897 """Query an Ollama server for the model's context length. 898 899 Returns the model's maximum context from GGUF metadata via ``/api/show``, 900 or the explicit ``num_ctx`` from the Modelfile if set. Returns None if 901 the server is unreachable or not Ollama. 902 903 This is the value that should be passed as ``num_ctx`` in Ollama chat 904 requests to override the default 2048. 
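    Illustrative call against a hypothetical local Ollama install (the tag and
    the returned value are placeholders)::

        >>> query_ollama_num_ctx("qwen3.5:27b", "http://localhost:11434")  # doctest: +SKIP
        32768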
905 """ 906 import httpx 907 908 bare_model = _strip_provider_prefix(model) 909 server_url = base_url.rstrip("/") 910 if server_url.endswith("/v1"): 911 server_url = server_url[:-3] 912 913 try: 914 server_type = detect_local_server_type(base_url, api_key=api_key) 915 except Exception: 916 return None 917 if server_type != "ollama": 918 return None 919 920 headers = _auth_headers(api_key) 921 922 try: 923 with httpx.Client(timeout=3.0, headers=headers) as client: 924 resp = client.post(f"{server_url}/api/show", json={"name": bare_model}) 925 if resp.status_code != 200: 926 return None 927 data = resp.json() 928 929 # Prefer explicit num_ctx from Modelfile parameters (user override) 930 params = data.get("parameters", "") 931 if "num_ctx" in params: 932 for line in params.split("\n"): 933 if "num_ctx" in line: 934 parts = line.strip().split() 935 if len(parts) >= 2: 936 try: 937 return int(parts[-1]) 938 except ValueError: 939 pass 940 941 # Fall back to GGUF model_info context_length (training max) 942 model_info = data.get("model_info", {}) 943 for key, value in model_info.items(): 944 if "context_length" in key and isinstance(value, (int, float)): 945 return int(value) 946 except Exception: 947 pass 948 return None 949 950 951 def _query_local_context_length(model: str, base_url: str, api_key: str = "") -> Optional[int]: 952 """Query a local server for the model's context length.""" 953 import httpx 954 955 # Strip recognised provider prefix (e.g., "local:model-name" → "model-name"). 956 # Ollama "model:tag" colons (e.g. "qwen3.5:27b") are intentionally preserved. 957 model = _strip_provider_prefix(model) 958 959 # Strip /v1 suffix to get the server root 960 server_url = base_url.rstrip("/") 961 if server_url.endswith("/v1"): 962 server_url = server_url[:-3] 963 964 headers = _auth_headers(api_key) 965 966 try: 967 server_type = detect_local_server_type(base_url, api_key=api_key) 968 except Exception: 969 server_type = None 970 971 try: 972 with httpx.Client(timeout=3.0, headers=headers) as client: 973 # Ollama: /api/show returns model details with context info 974 if server_type == "ollama": 975 resp = client.post(f"{server_url}/api/show", json={"name": model}) 976 if resp.status_code == 200: 977 data = resp.json() 978 # Prefer explicit num_ctx from Modelfile parameters: this is 979 # the *runtime* context Ollama will actually allocate KV cache 980 # for. The GGUF model_info.context_length is the training max, 981 # which can be larger than num_ctx — using it here would let 982 # Hermes grow conversations past the runtime limit and Ollama 983 # would silently truncate. Matches query_ollama_num_ctx(). 984 params = data.get("parameters", "") 985 if "num_ctx" in params: 986 for line in params.split("\n"): 987 if "num_ctx" in line: 988 parts = line.strip().split() 989 if len(parts) >= 2: 990 try: 991 return int(parts[-1]) 992 except ValueError: 993 pass 994 # Fall back to GGUF model_info context_length (training max) 995 model_info = data.get("model_info", {}) 996 for key, value in model_info.items(): 997 if "context_length" in key and isinstance(value, (int, float)): 998 return int(value) 999 1000 # LM Studio native API: /api/v1/models returns max_context_length. 1001 # This is more reliable than the OpenAI-compat /v1/models which 1002 # doesn't include context window information for LM Studio servers. 1003 # Use _model_id_matches for fuzzy matching: LM Studio stores models as 1004 # "publisher/slug" but users configure only "slug" after "local:" prefix. 
1005 if server_type == "lm-studio": 1006 resp = client.get(f"{server_url}/api/v1/models") 1007 if resp.status_code == 200: 1008 data = resp.json() 1009 for m in data.get("models", []): 1010 if _model_id_matches(m.get("key", ""), model) or _model_id_matches(m.get("id", ""), model): 1011 # Prefer loaded instance context (actual runtime value) 1012 for inst in m.get("loaded_instances", []): 1013 cfg = inst.get("config", {}) 1014 ctx = cfg.get("context_length") 1015 if ctx and isinstance(ctx, (int, float)): 1016 return int(ctx) 1017 break 1018 1019 # LM Studio / vLLM / llama.cpp: try /v1/models/{model} 1020 resp = client.get(f"{server_url}/v1/models/{model}") 1021 if resp.status_code == 200: 1022 data = resp.json() 1023 # vLLM returns max_model_len 1024 ctx = data.get("max_model_len") or data.get("context_length") or data.get("max_tokens") 1025 if ctx and isinstance(ctx, (int, float)): 1026 return int(ctx) 1027 1028 # Try /v1/models and find the model in the list. 1029 # Use _model_id_matches to handle "publisher/slug" vs bare "slug". 1030 resp = client.get(f"{server_url}/v1/models") 1031 if resp.status_code == 200: 1032 data = resp.json() 1033 models_list = data.get("data", []) 1034 for m in models_list: 1035 if _model_id_matches(m.get("id", ""), model): 1036 ctx = m.get("max_model_len") or m.get("context_length") or m.get("max_tokens") 1037 if ctx and isinstance(ctx, (int, float)): 1038 return int(ctx) 1039 except Exception: 1040 pass 1041 1042 return None 1043 1044 1045 def _normalize_model_version(model: str) -> str: 1046 """Normalize version separators for matching. 1047 1048 Nous uses dashes: claude-opus-4-6, claude-sonnet-4-5 1049 OpenRouter uses dots: claude-opus-4.6, claude-sonnet-4.5 1050 Normalize both to dashes for comparison. 1051 """ 1052 return model.replace(".", "-") 1053 1054 1055 def _query_anthropic_context_length(model: str, base_url: str, api_key: str) -> Optional[int]: 1056 """Query Anthropic's /v1/models endpoint for context length. 1057 1058 Only works with regular ANTHROPIC_API_KEY (sk-ant-api*). 1059 OAuth tokens (sk-ant-oat*) from Claude Code return 401. 1060 """ 1061 if not api_key or api_key.startswith("sk-ant-oat"): 1062 return None # OAuth tokens can't access /v1/models 1063 try: 1064 base = base_url.rstrip("/") 1065 if base.endswith("/v1"): 1066 base = base[:-3] 1067 url = f"{base}/v1/models?limit=1000" 1068 headers = { 1069 "x-api-key": api_key, 1070 "anthropic-version": "2023-06-01", 1071 } 1072 resp = requests.get(url, headers=headers, timeout=10, verify=_resolve_requests_verify()) 1073 if resp.status_code != 200: 1074 return None 1075 data = resp.json() 1076 for m in data.get("data", []): 1077 if m.get("id") == model: 1078 ctx = m.get("max_input_tokens") 1079 if isinstance(ctx, int) and ctx > 0: 1080 return ctx 1081 except Exception as e: 1082 logger.debug("Anthropic /v1/models query failed: %s", e) 1083 return None 1084 1085 1086 # Known ChatGPT Codex OAuth context windows (observed via live 1087 # chatgpt.com/backend-api/codex/models probe, Apr 2026). These are the 1088 # `context_window` values, which are what Codex actually enforces — the 1089 # direct OpenAI API has larger limits for the same slugs, but Codex OAuth 1090 # caps lower (e.g. gpt-5.5 is 1.05M on the API, 272K on Codex). 1091 # 1092 # Used as a fallback when the live probe fails (no token, network error). 1093 # Longest keys first so substring match picks the most specific entry. 
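# For example, a hypothetical slug "gpt-5.5-codex" matches no longer key, so it
# resolves via the "gpt-5.5" entry below.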
1094 _CODEX_OAUTH_CONTEXT_FALLBACK: Dict[str, int] = { 1095 "gpt-5.1-codex-max": 272_000, 1096 "gpt-5.1-codex-mini": 272_000, 1097 "gpt-5.3-codex": 272_000, 1098 "gpt-5.2-codex": 272_000, 1099 "gpt-5.4-mini": 272_000, 1100 "gpt-5.5": 272_000, 1101 "gpt-5.4": 272_000, 1102 "gpt-5.2": 272_000, 1103 "gpt-5": 272_000, 1104 } 1105 1106 1107 _codex_oauth_context_cache: Dict[str, int] = {} 1108 _codex_oauth_context_cache_time: float = 0.0 1109 _CODEX_OAUTH_CONTEXT_CACHE_TTL = 3600 # 1 hour 1110 1111 1112 def _fetch_codex_oauth_context_lengths(access_token: str) -> Dict[str, int]: 1113 """Probe the ChatGPT Codex /models endpoint for per-slug context windows. 1114 1115 Codex OAuth imposes its own context limits that differ from the direct 1116 OpenAI API (e.g. gpt-5.5 is 1.05M on the API, 272K on Codex). The 1117 `context_window` field in each model entry is the authoritative source. 1118 1119 Returns a ``{slug: context_window}`` dict. Empty on failure. 1120 """ 1121 global _codex_oauth_context_cache, _codex_oauth_context_cache_time 1122 now = time.time() 1123 if ( 1124 _codex_oauth_context_cache 1125 and now - _codex_oauth_context_cache_time < _CODEX_OAUTH_CONTEXT_CACHE_TTL 1126 ): 1127 return _codex_oauth_context_cache 1128 1129 try: 1130 resp = requests.get( 1131 "https://chatgpt.com/backend-api/codex/models?client_version=1.0.0", 1132 headers={"Authorization": f"Bearer {access_token}"}, 1133 timeout=10, 1134 verify=_resolve_requests_verify(), 1135 ) 1136 if resp.status_code != 200: 1137 logger.debug( 1138 "Codex /models probe returned HTTP %s; falling back to hardcoded defaults", 1139 resp.status_code, 1140 ) 1141 return {} 1142 data = resp.json() 1143 except Exception as exc: 1144 logger.debug("Codex /models probe failed: %s", exc) 1145 return {} 1146 1147 entries = data.get("models", []) if isinstance(data, dict) else [] 1148 result: Dict[str, int] = {} 1149 for item in entries: 1150 if not isinstance(item, dict): 1151 continue 1152 slug = item.get("slug") 1153 ctx = item.get("context_window") 1154 if isinstance(slug, str) and isinstance(ctx, int) and ctx > 0: 1155 result[slug.strip()] = ctx 1156 1157 if result: 1158 _codex_oauth_context_cache = result 1159 _codex_oauth_context_cache_time = now 1160 return result 1161 1162 1163 def _resolve_codex_oauth_context_length( 1164 model: str, access_token: str = "" 1165 ) -> Optional[int]: 1166 """Resolve a Codex OAuth model's real context window. 1167 1168 Prefers a live probe of chatgpt.com/backend-api/codex/models (when we 1169 have a bearer token), then falls back to ``_CODEX_OAUTH_CONTEXT_FALLBACK``. 1170 """ 1171 model_bare = _strip_provider_prefix(model).strip() 1172 if not model_bare: 1173 return None 1174 1175 if access_token: 1176 live = _fetch_codex_oauth_context_lengths(access_token) 1177 if model_bare in live: 1178 return live[model_bare] 1179 # Case-insensitive match in case casing drifts 1180 model_lower = model_bare.lower() 1181 for slug, ctx in live.items(): 1182 if slug.lower() == model_lower: 1183 return ctx 1184 1185 # Fallback: longest-key-first substring match over hardcoded defaults. 1186 model_lower = model_bare.lower() 1187 for slug, ctx in sorted( 1188 _CODEX_OAUTH_CONTEXT_FALLBACK.items(), key=lambda x: len(x[0]), reverse=True 1189 ): 1190 if slug in model_lower: 1191 return ctx 1192 1193 return None 1194 1195 1196 def _resolve_nous_context_length(model: str) -> Optional[int]: 1197 """Resolve Nous Portal model context length via OpenRouter metadata. 1198 1199 Nous model IDs are bare (e.g. 
'claude-opus-4-6') while OpenRouter uses 1200 prefixed IDs (e.g. 'anthropic/claude-opus-4.6'). Try suffix matching 1201 with version normalization (dot↔dash). 1202 """ 1203 metadata = fetch_model_metadata() # OpenRouter cache 1204 # Exact match first 1205 if model in metadata: 1206 return metadata[model].get("context_length") 1207 1208 normalized = _normalize_model_version(model).lower() 1209 1210 for or_id, entry in metadata.items(): 1211 bare = or_id.split("/", 1)[1] if "/" in or_id else or_id 1212 if bare.lower() == model.lower() or _normalize_model_version(bare).lower() == normalized: 1213 return entry.get("context_length") 1214 1215 # Partial prefix match for cases like gemini-3-flash → gemini-3-flash-preview 1216 # Require match to be at a word boundary (followed by -, :, or end of string) 1217 model_lower = model.lower() 1218 for or_id, entry in metadata.items(): 1219 bare = or_id.split("/", 1)[1] if "/" in or_id else or_id 1220 for candidate, query in [(bare.lower(), model_lower), (_normalize_model_version(bare).lower(), normalized)]: 1221 if candidate.startswith(query) and ( 1222 len(candidate) == len(query) or candidate[len(query)] in "-:." 1223 ): 1224 return entry.get("context_length") 1225 1226 return None 1227 1228 1229 def get_model_context_length( 1230 model: str, 1231 base_url: str = "", 1232 api_key: str = "", 1233 config_context_length: int | None = None, 1234 provider: str = "", 1235 custom_providers: list | None = None, 1236 ) -> int: 1237 """Get the context length for a model. 1238 1239 Resolution order (step numbers mirror the inline comments below): 1240 0. Explicit config override (model.context_length or custom_providers per-model) 1241 1. Persistent cache (previously discovered via probing) 1242 1b. AWS Bedrock static table (must precede custom-endpoint probe) 1243 2. Active endpoint metadata (/models for explicit custom endpoints) 1244 3. Local server query (for local endpoints) 1245 4. Anthropic /v1/models API (API-key users only, not OAuth) 1246 5. Provider-aware lookups (Copilot live /models, Nous, Codex OAuth, GMI, models.dev) 1247 6. OpenRouter live API metadata (provider-unaware fallback) 1248 8. Thin hardcoded defaults (broad family patterns) 1249 9. Local server query as a last resort 1250 10. Default fallback (256K) 1251 """ 1252 # 0. Explicit config override — user knows best 1253 if config_context_length is not None and isinstance(config_context_length, int) and config_context_length > 0: 1254 return config_context_length 1255 1256 # 0b. custom_providers per-model override — check before any probe. 1257 # This closes the gap where /model switch and display paths used to fall 1258 # back to 128K despite the user having a per-model context_length set. 1259 # See #15779. 1260 if custom_providers and base_url and model: 1261 try: 1262 from hermes_cli.config import get_custom_provider_context_length 1263 cp_ctx = get_custom_provider_context_length( 1264 model=model, 1265 base_url=base_url, 1266 custom_providers=custom_providers, 1267 ) 1268 if cp_ctx: 1269 return cp_ctx 1270 except Exception: 1271 pass # fall through to probing 1272 1273 # Normalise provider-prefixed model names (e.g. "local:model-name" → 1274 # "model-name") so cache lookups and server queries use the bare ID that 1275 # local servers actually know about. Ollama "model:tag" colons are preserved. 1276 model = _strip_provider_prefix(model) 1277 1278 # 1.
Check persistent cache (model+provider) 1279 # LM Studio is excluded — its loaded context length is transient (the 1280 # user can reload the model with a different context_length at any time 1281 # via /api/v1/models/load), so a stale cached value would mask reloads. 1282 if base_url and provider != "lmstudio": 1283 cached = get_cached_context_length(model, base_url) 1284 if cached is not None: 1285 # Invalidate stale Codex OAuth cache entries: pre-PR #14935 builds 1286 # resolved gpt-5.x to the direct-API value (e.g. 1.05M) via 1287 # models.dev and persisted it. Codex OAuth caps at 272K for every 1288 # slug, so any cached Codex entry at or above 400K is a leftover 1289 # from the old resolution path. Drop it and fall through to the 1290 # live /models probe in step 5 below. 1291 if provider == "openai-codex" and cached >= 400_000: 1292 logger.info( 1293 "Dropping stale Codex cache entry %s@%s -> %s (pre-fix value); " 1294 "re-resolving via live /models probe", 1295 model, base_url, f"{cached:,}", 1296 ) 1297 _invalidate_cached_context_length(model, base_url) 1298 else: 1299 return cached 1300 1301 # 1b. AWS Bedrock — use static context length table. 1302 # Bedrock's ListFoundationModels API doesn't expose context window sizes, 1303 # so we maintain a curated table in bedrock_adapter.py that reflects 1304 # AWS-imposed limits (e.g. 200K for Claude models vs 1M on the native 1305 # Anthropic API). This must run BEFORE the custom-endpoint probe at 1306 # step 2 — bedrock-runtime.<region>.amazonaws.com is not in 1307 # _URL_TO_PROVIDER, so it would otherwise be treated as a custom endpoint, 1308 # fail the /models probe (Bedrock doesn't expose that shape), and fall 1309 # back to the 128K default before reaching the original step 4b branch. 1310 if provider == "bedrock" or ( 1311 base_url 1312 and base_url_hostname(base_url).startswith("bedrock-runtime.") 1313 and base_url_host_matches(base_url, "amazonaws.com") 1314 ): 1315 try: 1316 from agent.bedrock_adapter import get_bedrock_context_length 1317 return get_bedrock_context_length(model) 1318 except ImportError: 1319 pass # boto3 not installed — fall through to generic resolution 1320 1321 # 2. Active endpoint metadata for truly custom/unknown endpoints. 1322 # Known providers (Copilot, OpenAI, Anthropic, etc.) skip this — their 1323 # /models endpoint may report a provider-imposed limit (e.g. Copilot 1324 # returns 128k) instead of the model's full context (400k). models.dev 1325 # has the correct per-provider values and is checked at step 5+. 1326 if _is_custom_endpoint(base_url) and not _is_known_provider_base_url(base_url): 1327 context_length = _resolve_endpoint_context_length(model, base_url, api_key=api_key) 1328 if context_length is not None: 1329 return context_length 1330 if not _is_known_provider_base_url(base_url): 1331 # 3. Try querying local server directly 1332 if is_local_endpoint(base_url): 1333 local_ctx = _query_local_context_length(model, base_url, api_key=api_key) 1334 if local_ctx and local_ctx > 0: 1335 if provider != "lmstudio": 1336 save_context_length(model, base_url, local_ctx) 1337 return local_ctx 1338 logger.info( 1339 "Could not detect context length for model %r at %s — " 1340 "defaulting to %s tokens (probe-down). Set model.context_length " 1341 "in config.yaml to override.", 1342 model, base_url, f"{DEFAULT_FALLBACK_CONTEXT:,}", 1343 ) 1344 return DEFAULT_FALLBACK_CONTEXT 1345 1346 # 4. 
Anthropic /v1/models API (only for regular API keys, not OAuth) 1347 if provider == "anthropic" or ( 1348 base_url and base_url_hostname(base_url) == "api.anthropic.com" 1349 ): 1350 ctx = _query_anthropic_context_length(model, base_url or "https://api.anthropic.com", api_key) 1351 if ctx: 1352 return ctx 1353 1354 # 4b. (Bedrock handled earlier at step 1b — before custom-endpoint probe.) 1355 1356 # 5. Provider-aware lookups (before generic OpenRouter cache) 1357 # These are provider-specific and take priority over the generic OR cache, 1358 # since the same model can have different context limits per provider 1359 # (e.g. claude-opus-4.6 is 1M on Anthropic but 128K on GitHub Copilot). 1360 # If provider is generic (openrouter/custom/empty), try to infer from URL. 1361 effective_provider = provider 1362 if not effective_provider or effective_provider in ("openrouter", "custom"): 1363 if base_url: 1364 inferred = _infer_provider_from_url(base_url) 1365 if inferred: 1366 effective_provider = inferred 1367 1368 # 5a. Copilot live /models API — max_prompt_tokens from the user's account. 1369 # This catches account-specific models (e.g. claude-opus-4.6-1m) that 1370 # don't exist in models.dev. For models that ARE in models.dev, this 1371 # returns the provider-enforced limit which is what users can actually use. 1372 if effective_provider in ("copilot", "copilot-acp", "github-copilot"): 1373 try: 1374 from hermes_cli.models import get_copilot_model_context 1375 ctx = get_copilot_model_context(model, api_key=api_key) 1376 if ctx: 1377 return ctx 1378 except Exception: 1379 pass # Fall through to models.dev 1380 1381 if effective_provider == "nous": 1382 ctx = _resolve_nous_context_length(model) 1383 if ctx: 1384 return ctx 1385 if effective_provider == "openai-codex": 1386 # Codex OAuth enforces lower context limits than the direct OpenAI 1387 # API for the same slug (e.g. gpt-5.5 is 1.05M on the API but 272K 1388 # on Codex). Authoritative source is Codex's own /models endpoint. 1389 codex_ctx = _resolve_codex_oauth_context_length(model, access_token=api_key or "") 1390 if codex_ctx: 1391 if base_url: 1392 save_context_length(model, base_url, codex_ctx) 1393 return codex_ctx 1394 if effective_provider == "gmi" and base_url: 1395 # GMI exposes authoritative context_length via /models, but it is not 1396 # in models.dev yet. Preserve that higher-fidelity endpoint lookup. 1397 ctx = _resolve_endpoint_context_length(model, base_url, api_key=api_key) 1398 if ctx is not None: 1399 return ctx 1400 if effective_provider: 1401 from agent.models_dev import lookup_models_dev_context 1402 ctx = lookup_models_dev_context(effective_provider, model) 1403 if ctx: 1404 return ctx 1405 1406 # 6. OpenRouter live API metadata (provider-unaware fallback) 1407 metadata = fetch_model_metadata() 1408 if model in metadata: 1409 return metadata[model].get("context_length", DEFAULT_FALLBACK_CONTEXT) 1410 1411 # 8. Hardcoded defaults (fuzzy match — longest key first for specificity) 1412 # Only check `default_model in model` (is the key a substring of the input). 1413 # The reverse (`model in default_model`) causes shorter names like 1414 # "claude-sonnet-4" to incorrectly match "claude-sonnet-4-6" and return 1M. 1415 model_lower = model.lower() 1416 for default_model, length in sorted( 1417 DEFAULT_CONTEXT_LENGTHS.items(), key=lambda x: len(x[0]), reverse=True 1418 ): 1419 if default_model in model_lower: 1420 return length 1421 1422 # 9. 
Query local server as last resort 1423 if base_url and is_local_endpoint(base_url): 1424 local_ctx = _query_local_context_length(model, base_url, api_key=api_key) 1425 if local_ctx and local_ctx > 0: 1426 if provider != "lmstudio": 1427 save_context_length(model, base_url, local_ctx) 1428 return local_ctx 1429 1430 # 10. Default fallback — 256K 1431 return DEFAULT_FALLBACK_CONTEXT 1432 1433 1434 def estimate_tokens_rough(text: str) -> int: 1435 """Rough token estimate (~4 chars/token) for pre-flight checks. 1436 1437 Uses ceiling division so short texts (1-3 chars) never estimate as 1438 0 tokens, which would cause the compressor and pre-flight checks to 1439 systematically undercount when many short tool results are present. 1440 """ 1441 if not text: 1442 return 0 1443 return (len(text) + 3) // 4 1444 1445 1446 def estimate_messages_tokens_rough(messages: List[Dict[str, Any]]) -> int: 1447 """Rough token estimate for a message list (pre-flight only).""" 1448 total_chars = sum(len(str(msg)) for msg in messages) 1449 return (total_chars + 3) // 4 1450 1451 1452 def estimate_request_tokens_rough( 1453 messages: List[Dict[str, Any]], 1454 *, 1455 system_prompt: str = "", 1456 tools: Optional[List[Dict[str, Any]]] = None, 1457 ) -> int: 1458 """Rough token estimate for a full chat-completions request. 1459 1460 Includes the major payload buckets Hermes sends to providers: 1461 system prompt, conversation messages, and tool schemas. With 50+ 1462 tools enabled, schemas alone can add 20-30K tokens — a significant 1463 blind spot when only counting messages. 1464 """ 1465 total_chars = 0 1466 if system_prompt: 1467 total_chars += len(system_prompt) 1468 if messages: 1469 total_chars += sum(len(str(msg)) for msg in messages) 1470 if tools: 1471 total_chars += len(str(tools)) 1472 return (total_chars + 3) // 4
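

# Illustrative pre-flight sketch, in the spirit of the checks described in the
# module docstring: resolve the model's context window, roughly size a request,
# and compare the two. The model name, endpoint, and override value are
# hypothetical placeholders; passing config_context_length short-circuits all
# probing (resolution step 0), so this particular example runs offline.
if __name__ == "__main__":
    ctx = get_model_context_length(
        "local:my-model",
        base_url="http://localhost:11434/v1",
        config_context_length=32_000,
    )
    messages = [{"role": "user", "content": "Summarise the latest run logs."}]
    needed = estimate_request_tokens_rough(
        messages, system_prompt="You are Hermes.", tools=None
    )
    print(f"context window: {ctx:,} tokens; estimated request: {needed:,} tokens")
    if needed > ctx:
        print("request would overflow the window; compress history first")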