/ agent / model_metadata.py
model_metadata.py
   1  """Model metadata, context lengths, and token estimation utilities.
   2  
   3  Pure utility functions with no AIAgent dependency. Used by ContextCompressor
   4  and run_agent.py for pre-flight context checks.
   5  """
   6  
   7  import ipaddress
   8  import logging
   9  import os
  10  import re
  11  import time
  12  from pathlib import Path
  13  from typing import Any, Dict, List, Optional
  14  from urllib.parse import urlparse
  15  
  16  import requests
  17  import yaml
  18  
  19  from utils import base_url_host_matches, base_url_hostname
  20  
  21  from hermes_constants import OPENROUTER_MODELS_URL
  22  
  23  logger = logging.getLogger(__name__)
  24  
  25  
  26  def _resolve_requests_verify() -> bool | str:
  27      """Resolve SSL verify setting for `requests` calls from env vars.
  28  
  29      The `requests` library only honours REQUESTS_CA_BUNDLE / CURL_CA_BUNDLE
  30      by default. Hermes also honours HERMES_CA_BUNDLE (its own convention)
  31      and SSL_CERT_FILE (used by the stdlib `ssl` module and by httpx), so
  32      that a single env var can cover both `requests` and `httpx` callsites
  33      inside the same process.
  34  
  35      Returns either a filesystem path to a CA bundle, or True to defer to
  36      the requests default (certifi).
  37      """
  38      for env_var in ("HERMES_CA_BUNDLE", "REQUESTS_CA_BUNDLE", "SSL_CERT_FILE"):
  39          val = os.getenv(env_var)
  40          if val and os.path.isfile(val):
  41              return val
  42      return True
  43  
  44  # Provider names that can appear as a "provider:" prefix before a model ID.
  45  # Only these are stripped — Ollama-style "model:tag" colons (e.g. "qwen3.5:27b")
  46  # are preserved so the full model name reaches cache lookups and server queries.
  47  _PROVIDER_PREFIXES: frozenset[str] = frozenset({
  48      "openrouter", "nous", "openai-codex", "copilot", "copilot-acp", "opencode-kimi-oauth",
  49      "gemini", "ollama-cloud", "zai", "kimi-coding", "kimi-coding-cn", "stepfun", "minimax", "minimax-oauth", "minimax-cn", "anthropic", "deepseek",
  50      "opencode-zen", "opencode-go", "ai-gateway", "kilocode", "alibaba",
  51      "qwen-oauth",
  52      "xiaomi",
  53      "arcee",
  54      "gmi",
  55      "tencent-tokenhub",
  56      "custom", "local",
  57      # Common aliases
  58      "google", "google-gemini", "google-ai-studio",
  59      "glm", "z-ai", "z.ai", "zhipu", "github", "github-copilot",
  60      "github-models", "kimi", "moonshot", "kimi-cn", "moonshot-cn", "claude", "deep-seek",
  61      "ollama",
  62      "stepfun", "opencode", "zen", "go", "vercel", "kilo", "dashscope", "aliyun", "qwen",
  63      "mimo", "xiaomi-mimo",
  64      "tencent", "tokenhub", "tencent-cloud", "tencentmaas",
  65      "arcee-ai", "arceeai",
  66      "gmi-cloud", "gmicloud",
  67      "xai", "x-ai", "x.ai", "grok",
  68      "nvidia", "nim", "nvidia-nim", "nemotron",
  69      "qwen-portal",
  70  })
  71  
  72  
  73  _OLLAMA_TAG_PATTERN = re.compile(
  74      r"^(\d+\.?\d*b|latest|stable|q\d|fp?\d|instruct|chat|coder|vision|text)",
  75      re.IGNORECASE,
  76  )
  77  
  78  
  79  # Tailscale's CGNAT range (RFC 6598). `ipaddress.is_private` excludes this
  80  # block, so without an explicit check Ollama reached over Tailscale (e.g.
  81  # `http://100.77.243.5:11434`) wouldn't be treated as local and its stream
  82  # read / stale timeouts wouldn't get auto-bumped. Built once at import time.
  83  _TAILSCALE_CGNAT = ipaddress.IPv4Network("100.64.0.0/10")
  84  
  85  
  86  def _strip_provider_prefix(model: str) -> str:
  87      """Strip a recognised provider prefix from a model string.
  88  
  89      ``"local:my-model"`` → ``"my-model"``
  90      ``"qwen3.5:27b"``   → ``"qwen3.5:27b"``  (unchanged — not a provider prefix)
  91      ``"qwen:0.5b"``     → ``"qwen:0.5b"``    (unchanged — Ollama model:tag)
  92      ``"deepseek:latest"``→ ``"deepseek:latest"``(unchanged — Ollama model:tag)
  93      """
  94      if ":" not in model or model.startswith("http"):
  95          return model
  96      prefix, suffix = model.split(":", 1)
  97      prefix_lower = prefix.strip().lower()
  98      if prefix_lower in _PROVIDER_PREFIXES:
  99          # Don't strip if suffix looks like an Ollama tag (e.g. "7b", "latest", "q4_0")
 100          if _OLLAMA_TAG_PATTERN.match(suffix.strip()):
 101              return model
 102          return suffix
 103      return model
 104  
 105  _model_metadata_cache: Dict[str, Dict[str, Any]] = {}
 106  _model_metadata_cache_time: float = 0
 107  _MODEL_CACHE_TTL = 3600
 108  _endpoint_model_metadata_cache: Dict[str, Dict[str, Dict[str, Any]]] = {}
 109  _endpoint_model_metadata_cache_time: Dict[str, float] = {}
 110  _ENDPOINT_MODEL_CACHE_TTL = 300
 111  
 112  # Descending tiers for context length probing when the model is unknown.
 113  # We start at 256K (covers GPT-5.x, many current large-context models) and
 114  # step down on context-length errors until one works.  Tier[0] is also the
 115  # default fallback when no detection method succeeds.
 116  CONTEXT_PROBE_TIERS = [
 117      256_000,
 118      128_000,
 119      64_000,
 120      32_000,
 121      16_000,
 122      8_000,
 123  ]
 124  
 125  # Default context length when no detection method succeeds.
 126  DEFAULT_FALLBACK_CONTEXT = CONTEXT_PROBE_TIERS[0]
 127  
 128  # Minimum context length required to run Hermes Agent.  Models with fewer
 129  # tokens cannot maintain enough working memory for tool-calling workflows.
 130  # Sessions, model switches, and cron jobs should reject models below this.
 131  MINIMUM_CONTEXT_LENGTH = 64_000
 132  
 133  # Thin fallback defaults — only broad model family patterns.
 134  # These fire only when provider is unknown AND models.dev/OpenRouter/Anthropic
 135  # all miss. Replaced the previous 80+ entry dict.
 136  # For provider-specific context lengths, models.dev is the primary source.
 137  DEFAULT_CONTEXT_LENGTHS = {
 138      # Anthropic Claude 4.6 (1M context) — bare IDs only to avoid
 139      # fuzzy-match collisions (e.g. "anthropic/claude-sonnet-4" is a
 140      # substring of "anthropic/claude-sonnet-4.6").
 141      # OpenRouter-prefixed models resolve via OpenRouter live API or models.dev.
 142      "claude-opus-4-7": 1000000,
 143      "claude-opus-4.7": 1000000,
 144      "claude-opus-4-6": 1000000,
 145      "claude-sonnet-4-6": 1000000,
 146      "claude-opus-4.6": 1000000,
 147      "claude-sonnet-4.6": 1000000,
 148      # Catch-all for older Claude models (must sort after specific entries)
 149      "claude": 200000,
 150      # OpenAI — GPT-5 family (most have 400k; specific overrides first)
 151      # Source: https://developers.openai.com/api/docs/models
 152      # GPT-5.5 (launched Apr 23 2026) is 1.05M on the direct OpenAI API and
 153      # ChatGPT Codex OAuth caps it at 272K; both paths resolve via their own
 154      # provider-aware branches (_resolve_codex_oauth_context_length + models.dev).
 155      # This hardcoded value is only reached when every probe misses.
 156      "gpt-5.5": 1050000,
 157      "gpt-5.4-nano": 400000,           # 400k (not 1.05M like full 5.4)
 158      "gpt-5.4-mini": 400000,           # 400k (not 1.05M like full 5.4)
 159      "gpt-5.4": 1050000,               # GPT-5.4, GPT-5.4 Pro (1.05M context)
 160      "gpt-5.1-chat": 128000,           # Chat variant has 128k context
 161      "gpt-5": 400000,                  # GPT-5.x base, mini, codex variants (400k)
 162      "gpt-4.1": 1047576,
 163      "gpt-4": 128000,
 164      # Google
 165      "gemini": 1048576,
 166      # Gemma (open models served via AI Studio)
 167      "gemma-4": 256000,  # Gemma 4 family
 168      "gemma4": 256000,  # Ollama-style naming (e.g. gemma4:31b-cloud)
 169      "gemma-4-31b": 256000,
 170      "gemma-3": 131072,
 171      "gemma": 8192,  # fallback for older gemma models
 172      # DeepSeek — V4 family ships with a 1M context window. The legacy
 173      # aliases ``deepseek-chat`` / ``deepseek-reasoner`` are server-side
 174      # mapped to the non-thinking / thinking modes of ``deepseek-v4-flash``
 175      # and inherit the same 1M window. The ``deepseek`` substring entry
 176      # below remains as a 128K fallback for older / unknown DeepSeek model
 177      # ids (e.g. via custom endpoints).
 178      # https://api-docs.deepseek.com/zh-cn/quick_start/pricing
 179      "deepseek-v4-pro": 1_000_000,
 180      "deepseek-v4-flash": 1_000_000,
 181      "deepseek-chat": 1_000_000,
 182      "deepseek-reasoner": 1_000_000,
 183      "deepseek": 128000,
 184      # Meta
 185      "llama": 131072,
 186      # Qwen — specific model families before the catch-all.
 187      # Official docs: https://help.aliyun.com/zh/model-studio/developer-reference/
 188      "qwen3-coder-plus": 1000000,  # 1M context
 189      "qwen3-coder": 262144,        # 256K context
 190      "qwen": 131072,
 191      # MiniMax — official docs: 204,800 context for all models
 192      # https://platform.minimax.io/docs/api-reference/text-anthropic-api
 193      "minimax": 204800,
 194      # GLM
 195      "glm": 202752,
 196      # xAI Grok — xAI /v1/models does not return context_length metadata,
 197      # so these hardcoded fallbacks prevent Hermes from probing-down to
 198      # the default 128k when the user points at https://api.x.ai/v1
 199      # via a custom provider. Values sourced from models.dev (2026-04).
 200      # Keys use substring matching (longest-first), so e.g. "grok-4.20"
 201      # matches "grok-4.20-0309-reasoning" / "-non-reasoning" / "-multi-agent-0309".
 202      "grok-code-fast": 256000,   # grok-code-fast-1
 203      "grok-4-1-fast": 2000000,   # grok-4-1-fast-(non-)reasoning
 204      "grok-2-vision": 8192,      # grok-2-vision, -1212, -latest
 205      "grok-4-fast": 2000000,     # grok-4-fast-(non-)reasoning
 206      "grok-4.20": 2000000,       # grok-4.20-0309-(non-)reasoning, -multi-agent-0309
 207      "grok-4": 256000,           # grok-4, grok-4-0709
 208      "grok-3": 131072,           # grok-3, grok-3-mini, grok-3-fast, grok-3-mini-fast
 209      "grok-2": 131072,           # grok-2, grok-2-1212, grok-2-latest
 210      "grok": 131072,             # catch-all (grok-beta, unknown grok-*)
 211      # Kimi
 212      "kimi": 262144,
 213      # Tencent — Hy3 Preview (Hunyuan) with 256K context window
 214      "hy3-preview": 256000,
 215      # Nemotron — NVIDIA's open-weights series (128K context across all sizes)
 216      "nemotron": 131072,
 217      # Arcee
 218      "trinity": 262144,
 219      # OpenRouter
 220      "elephant": 262144,
 221      # Hugging Face Inference Providers — model IDs use org/name format
 222      "Qwen/Qwen3.5-397B-A17B": 131072,
 223      "Qwen/Qwen3.5-35B-A3B": 131072,
 224      "deepseek-ai/DeepSeek-V3.2": 65536,
 225      "moonshotai/Kimi-K2.5": 262144,
 226      "moonshotai/Kimi-K2.6": 262144,
 227      "moonshotai/Kimi-K2-Thinking": 262144,
 228      "MiniMaxAI/MiniMax-M2.5": 204800,
 229      "XiaomiMiMo/MiMo-V2-Flash": 262144,
 230      "mimo-v2-pro": 1048576,
 231      "mimo-v2.5-pro": 1048576,
 232      "mimo-v2.5": 1048576,
 233      "mimo-v2-omni": 262144,
 234      "mimo-v2-flash": 262144,
 235      "zai-org/GLM-5": 202752,
 236  }
 237  
 238  _CONTEXT_LENGTH_KEYS = (
 239      "context_length",
 240      "context_window",
 241      "max_context_length",
 242      "max_position_embeddings",
 243      "max_model_len",
 244      "max_input_tokens",
 245      "max_sequence_length",
 246      "max_seq_len",
 247      "n_ctx_train",
 248      "n_ctx",
 249      "ctx_size",
 250  )
 251  
 252  _MAX_COMPLETION_KEYS = (
 253      "max_completion_tokens",
 254      "max_output_tokens",
 255      "max_tokens",
 256  )
 257  
 258  # Local server hostnames / address patterns
 259  _LOCAL_HOSTS = ("localhost", "127.0.0.1", "::1", "0.0.0.0")
 260  # Docker / Podman / Lima DNS names that resolve to the host machine
 261  _CONTAINER_LOCAL_SUFFIXES = (
 262      ".docker.internal",
 263      ".containers.internal",
 264      ".lima.internal",
 265  )
 266  
 267  
 268  def _normalize_base_url(base_url: str) -> str:
 269      return (base_url or "").strip().rstrip("/")
 270  
 271  
 272  def _auth_headers(api_key: str = "") -> Dict[str, str]:
 273      token = str(api_key or "").strip()
 274      if not token:
 275          return {}
 276      return {"Authorization": f"Bearer {token}"}
 277  
 278  
 279  def _is_openrouter_base_url(base_url: str) -> bool:
 280      return base_url_host_matches(base_url, "openrouter.ai")
 281  
 282  
 283  def _is_custom_endpoint(base_url: str) -> bool:
 284      normalized = _normalize_base_url(base_url)
 285      return bool(normalized) and not _is_openrouter_base_url(normalized)
 286  
 287  
 288  _URL_TO_PROVIDER: Dict[str, str] = {
 289      "api.openai.com": "openai",
 290      "chatgpt.com": "openai",
 291      "api.anthropic.com": "anthropic",
 292      "api.z.ai": "zai",
 293      "open.bigmodel.cn": "zai",
 294      "api.moonshot.ai": "kimi-coding",
 295      "api.moonshot.cn": "kimi-coding-cn",
 296      "api.kimi.com": "kimi-coding",
 297      "api.stepfun.ai": "stepfun",
 298      "api.stepfun.com": "stepfun",
 299      "api.arcee.ai": "arcee",
 300      "api.minimax": "minimax",
 301      "dashscope.aliyuncs.com": "alibaba",
 302      "dashscope-intl.aliyuncs.com": "alibaba",
 303      "portal.qwen.ai": "qwen-oauth",
 304      "openrouter.ai": "openrouter",
 305      "generativelanguage.googleapis.com": "gemini",
 306      "inference-api.nousresearch.com": "nous",
 307      "api.deepseek.com": "deepseek",
 308      "api.githubcopilot.com": "copilot",
 309      "models.github.ai": "copilot",
 310      "api.fireworks.ai": "fireworks",
 311      "opencode.ai": "opencode-go",
 312      "api.x.ai": "xai",
 313      "integrate.api.nvidia.com": "nvidia",
 314      "api.xiaomimimo.com": "xiaomi",
 315      "xiaomimimo.com": "xiaomi",
 316      "api.gmi-serving.com": "gmi",
 317      "tokenhub.tencentmaas.com": "tencent-tokenhub",
 318      "ollama.com": "ollama-cloud",
 319  }
 320  
 321  
 322  def _infer_provider_from_url(base_url: str) -> Optional[str]:
 323      """Infer the models.dev provider name from a base URL.
 324  
 325      This allows context length resolution via models.dev for custom endpoints
 326      like DashScope (Alibaba), Z.AI, Kimi, etc. without requiring the user to
 327      explicitly set the provider name in config.
 328      """
 329      normalized = _normalize_base_url(base_url)
 330      if not normalized:
 331          return None
 332      parsed = urlparse(normalized if "://" in normalized else f"https://{normalized}")
 333      host = parsed.netloc.lower() or parsed.path.lower()
 334      for url_part, provider in _URL_TO_PROVIDER.items():
 335          if url_part in host:
 336              return provider
 337      return None
 338  
 339  
 340  def _is_known_provider_base_url(base_url: str) -> bool:
 341      return _infer_provider_from_url(base_url) is not None
 342  
 343  
 344  def is_local_endpoint(base_url: str) -> bool:
 345      """Return True if base_url points to a local machine.
 346  
 347      Recognises loopback (``localhost``, ``127.0.0.0/8``, ``::1``),
 348      container-internal DNS names (``host.docker.internal`` et al.),
 349      RFC-1918 private ranges (``10/8``, ``172.16/12``, ``192.168/16``),
 350      link-local, and Tailscale CGNAT (``100.64.0.0/10``). Tailscale CGNAT
 351      is included so remote-but-trusted Ollama boxes reached over a
 352      Tailscale mesh get the same timeout auto-bumps as localhost Ollama.
 353      """
 354      normalized = _normalize_base_url(base_url)
 355      if not normalized:
 356          return False
 357      url = normalized if "://" in normalized else f"http://{normalized}"
 358      try:
 359          parsed = urlparse(url)
 360          host = parsed.hostname or ""
 361      except Exception:
 362          return False
 363      if host in _LOCAL_HOSTS:
 364          return True
 365      # Docker / Podman / Lima internal DNS names (e.g. host.docker.internal)
 366      if any(host.endswith(suffix) for suffix in _CONTAINER_LOCAL_SUFFIXES):
 367          return True
 368      # RFC-1918 private ranges, link-local, and Tailscale CGNAT
 369      try:
 370          addr = ipaddress.ip_address(host)
 371          if addr.is_private or addr.is_loopback or addr.is_link_local:
 372              return True
 373          if isinstance(addr, ipaddress.IPv4Address) and addr in _TAILSCALE_CGNAT:
 374              return True
 375      except ValueError:
 376          pass
 377      # Bare IP that looks like a private range (e.g. 172.26.x.x for WSL)
 378      # or Tailscale CGNAT (100.64.x.x–100.127.x.x).
 379      parts = host.split(".")
 380      if len(parts) == 4:
 381          try:
 382              first, second = int(parts[0]), int(parts[1])
 383              if first == 10:
 384                  return True
 385              if first == 172 and 16 <= second <= 31:
 386                  return True
 387              if first == 192 and second == 168:
 388                  return True
 389              if first == 100 and 64 <= second <= 127:
 390                  return True
 391          except ValueError:
 392              pass
 393      return False
 394  
 395  
 396  def detect_local_server_type(base_url: str, api_key: str = "") -> Optional[str]:
 397      """Detect which local server is running at base_url by probing known endpoints.
 398  
 399      Returns one of: "ollama", "lm-studio", "vllm", "llamacpp", or None.
 400      """
 401      import httpx
 402  
 403      normalized = _normalize_base_url(base_url)
 404      server_url = normalized
 405      if server_url.endswith("/v1"):
 406          server_url = server_url[:-3]
 407  
 408      headers = _auth_headers(api_key)
 409  
 410      try:
 411          with httpx.Client(timeout=2.0, headers=headers) as client:
 412              # LM Studio exposes /api/v1/models — check first (most specific)
 413              try:
 414                  r = client.get(f"{server_url}/api/v1/models")
 415                  if r.status_code == 200:
 416                      return "lm-studio"
 417              except Exception:
 418                  pass
 419              # Ollama exposes /api/tags and responds with {"models": [...]}
 420              # LM Studio returns {"error": "Unexpected endpoint"} with status 200
 421              # on this path, so we must verify the response contains "models".
 422              try:
 423                  r = client.get(f"{server_url}/api/tags")
 424                  if r.status_code == 200:
 425                      try:
 426                          data = r.json()
 427                          if "models" in data:
 428                              return "ollama"
 429                      except Exception:
 430                          pass
 431              except Exception:
 432                  pass
 433              # llama.cpp exposes /v1/props (older builds used /props without the /v1 prefix)
 434              try:
 435                  r = client.get(f"{server_url}/v1/props")
 436                  if r.status_code != 200:
 437                      r = client.get(f"{server_url}/props")  # fallback for older builds
 438                  if r.status_code == 200 and "default_generation_settings" in r.text:
 439                      return "llamacpp"
 440              except Exception:
 441                  pass
 442              # vLLM: /version
 443              try:
 444                  r = client.get(f"{server_url}/version")
 445                  if r.status_code == 200:
 446                      data = r.json()
 447                      if "version" in data:
 448                          return "vllm"
 449              except Exception:
 450                  pass
 451      except Exception:
 452          pass
 453  
 454      return None
 455  
 456  
 457  def _iter_nested_dicts(value: Any):
 458      if isinstance(value, dict):
 459          yield value
 460          for nested in value.values():
 461              yield from _iter_nested_dicts(nested)
 462      elif isinstance(value, list):
 463          for item in value:
 464              yield from _iter_nested_dicts(item)
 465  
 466  
 467  def _coerce_reasonable_int(value: Any, minimum: int = 1024, maximum: int = 10_000_000) -> Optional[int]:
 468      try:
 469          if isinstance(value, bool):
 470              return None
 471          if isinstance(value, str):
 472              value = value.strip().replace(",", "")
 473          result = int(value)
 474      except (TypeError, ValueError):
 475          return None
 476      if minimum <= result <= maximum:
 477          return result
 478      return None
 479  
 480  
 481  def _extract_first_int(payload: Dict[str, Any], keys: tuple[str, ...]) -> Optional[int]:
 482      keyset = {key.lower() for key in keys}
 483      for mapping in _iter_nested_dicts(payload):
 484          for key, value in mapping.items():
 485              if str(key).lower() not in keyset:
 486                  continue
 487              coerced = _coerce_reasonable_int(value)
 488              if coerced is not None:
 489                  return coerced
 490      return None
 491  
 492  
 493  def _extract_context_length(payload: Dict[str, Any]) -> Optional[int]:
 494      return _extract_first_int(payload, _CONTEXT_LENGTH_KEYS)
 495  
 496  
 497  def _extract_max_completion_tokens(payload: Dict[str, Any]) -> Optional[int]:
 498      return _extract_first_int(payload, _MAX_COMPLETION_KEYS)
 499  
 500  
 501  def _extract_pricing(payload: Dict[str, Any]) -> Dict[str, Any]:
 502      alias_map = {
 503          "prompt": ("prompt", "input", "input_cost_per_token", "prompt_token_cost"),
 504          "completion": ("completion", "output", "output_cost_per_token", "completion_token_cost"),
 505          "request": ("request", "request_cost"),
 506          "cache_read": ("cache_read", "cached_prompt", "input_cache_read", "cache_read_cost_per_token"),
 507          "cache_write": ("cache_write", "cache_creation", "input_cache_write", "cache_write_cost_per_token"),
 508      }
 509      for mapping in _iter_nested_dicts(payload):
 510          normalized = {str(key).lower(): value for key, value in mapping.items()}
 511          if not any(any(alias in normalized for alias in aliases) for aliases in alias_map.values()):
 512              continue
 513          pricing: Dict[str, Any] = {}
 514          for target, aliases in alias_map.items():
 515              for alias in aliases:
 516                  if alias in normalized and normalized[alias] not in (None, ""):
 517                      pricing[target] = normalized[alias]
 518                      break
 519          if pricing:
 520              return pricing
 521      return {}
 522  
 523  
 524  def _add_model_aliases(cache: Dict[str, Dict[str, Any]], model_id: str, entry: Dict[str, Any]) -> None:
 525      cache[model_id] = entry
 526      if "/" in model_id:
 527          bare_model = model_id.split("/", 1)[1]
 528          cache.setdefault(bare_model, entry)
 529  
 530  
 531  def fetch_model_metadata(force_refresh: bool = False) -> Dict[str, Dict[str, Any]]:
 532      """Fetch model metadata from OpenRouter (cached for 1 hour)."""
 533      global _model_metadata_cache, _model_metadata_cache_time
 534  
 535      if not force_refresh and _model_metadata_cache and (time.time() - _model_metadata_cache_time) < _MODEL_CACHE_TTL:
 536          return _model_metadata_cache
 537  
 538      try:
 539          response = requests.get(OPENROUTER_MODELS_URL, timeout=10, verify=_resolve_requests_verify())
 540          response.raise_for_status()
 541          data = response.json()
 542  
 543          cache = {}
 544          for model in data.get("data", []):
 545              model_id = model.get("id", "")
 546              entry = {
 547                  "context_length": model.get("context_length", 128000),
 548                  "max_completion_tokens": model.get("top_provider", {}).get("max_completion_tokens", 4096),
 549                  "name": model.get("name", model_id),
 550                  "pricing": model.get("pricing", {}),
 551              }
 552              _add_model_aliases(cache, model_id, entry)
 553              canonical = model.get("canonical_slug", "")
 554              if canonical and canonical != model_id:
 555                  _add_model_aliases(cache, canonical, entry)
 556  
 557          _model_metadata_cache = cache
 558          _model_metadata_cache_time = time.time()
 559          logger.debug("Fetched metadata for %s models from OpenRouter", len(cache))
 560          return cache
 561  
 562      except Exception as e:
 563          logging.warning(f"Failed to fetch model metadata from OpenRouter: {e}")
 564          return _model_metadata_cache or {}
 565  
 566  
 567  def fetch_endpoint_model_metadata(
 568      base_url: str,
 569      api_key: str = "",
 570      force_refresh: bool = False,
 571  ) -> Dict[str, Dict[str, Any]]:
 572      """Fetch model metadata from an OpenAI-compatible ``/models`` endpoint.
 573  
 574      This is used for explicit custom endpoints where hardcoded global model-name
 575      defaults are unreliable. Results are cached in memory per base URL.
 576      """
 577      normalized = _normalize_base_url(base_url)
 578      if not normalized or _is_openrouter_base_url(normalized):
 579          return {}
 580  
 581      if not force_refresh:
 582          cached = _endpoint_model_metadata_cache.get(normalized)
 583          cached_at = _endpoint_model_metadata_cache_time.get(normalized, 0)
 584          if cached is not None and (time.time() - cached_at) < _ENDPOINT_MODEL_CACHE_TTL:
 585              return cached
 586  
 587      candidates = [normalized]
 588      if normalized.endswith("/v1"):
 589          alternate = normalized[:-3].rstrip("/")
 590      else:
 591          alternate = normalized + "/v1"
 592      if alternate and alternate not in candidates:
 593          candidates.append(alternate)
 594  
 595      headers = {"Authorization": f"Bearer {api_key}"} if api_key else {}
 596      last_error: Optional[Exception] = None
 597  
 598      if is_local_endpoint(normalized):
 599          try:
 600              if detect_local_server_type(normalized, api_key=api_key) == "lm-studio":
 601                  server_url = normalized[:-3].rstrip("/") if normalized.endswith("/v1") else normalized
 602                  response = requests.get(
 603                      server_url.rstrip("/") + "/api/v1/models",
 604                      headers=headers,
 605                      timeout=10,
 606                      verify=_resolve_requests_verify(),
 607                  )
 608                  response.raise_for_status()
 609                  payload = response.json()
 610                  cache: Dict[str, Dict[str, Any]] = {}
 611                  for model in payload.get("models", []):
 612                      if not isinstance(model, dict):
 613                          continue
 614                      model_id = model.get("key") or model.get("id")
 615                      if not model_id:
 616                          continue
 617                      entry: Dict[str, Any] = {"name": model.get("name", model_id)}
 618  
 619                      context_length = None
 620                      for inst in model.get("loaded_instances", []) or []:
 621                          if not isinstance(inst, dict):
 622                              continue
 623                          cfg = inst.get("config", {})
 624                          ctx = cfg.get("context_length") if isinstance(cfg, dict) else None
 625                          if isinstance(ctx, int) and ctx > 0:
 626                              context_length = ctx
 627                              break
 628                      if context_length is not None:
 629                          entry["context_length"] = context_length
 630  
 631                      max_completion_tokens = _extract_max_completion_tokens(model)
 632                      if max_completion_tokens is not None:
 633                          entry["max_completion_tokens"] = max_completion_tokens
 634  
 635                      pricing = _extract_pricing(model)
 636                      if pricing:
 637                          entry["pricing"] = pricing
 638  
 639                      _add_model_aliases(cache, model_id, entry)
 640                      alt_id = model.get("id")
 641                      if isinstance(alt_id, str) and alt_id and alt_id != model_id:
 642                          _add_model_aliases(cache, alt_id, entry)
 643  
 644                  _endpoint_model_metadata_cache[normalized] = cache
 645                  _endpoint_model_metadata_cache_time[normalized] = time.time()
 646                  return cache
 647          except Exception as exc:
 648              last_error = exc
 649  
 650      for candidate in candidates:
 651          url = candidate.rstrip("/") + "/models"
 652          try:
 653              response = requests.get(url, headers=headers, timeout=10, verify=_resolve_requests_verify())
 654              response.raise_for_status()
 655              payload = response.json()
 656              cache: Dict[str, Dict[str, Any]] = {}
 657              for model in payload.get("data", []):
 658                  if not isinstance(model, dict):
 659                      continue
 660                  model_id = model.get("id")
 661                  if not model_id:
 662                      continue
 663                  entry: Dict[str, Any] = {"name": model.get("name", model_id)}
 664                  context_length = _extract_context_length(model)
 665                  if context_length is not None:
 666                      entry["context_length"] = context_length
 667                  max_completion_tokens = _extract_max_completion_tokens(model)
 668                  if max_completion_tokens is not None:
 669                      entry["max_completion_tokens"] = max_completion_tokens
 670                  pricing = _extract_pricing(model)
 671                  if pricing:
 672                      entry["pricing"] = pricing
 673                  _add_model_aliases(cache, model_id, entry)
 674  
 675              # If this is a llama.cpp server, query /props for actual allocated context
 676              is_llamacpp = any(
 677                  m.get("owned_by") == "llamacpp"
 678                  for m in payload.get("data", []) if isinstance(m, dict)
 679              )
 680              if is_llamacpp:
 681                  try:
 682                      # Try /v1/props first (current llama.cpp); fall back to /props for older builds
 683                      base = candidate.rstrip("/").replace("/v1", "")
 684                      _verify = _resolve_requests_verify()
 685                      props_resp = requests.get(base + "/v1/props", headers=headers, timeout=5, verify=_verify)
 686                      if not props_resp.ok:
 687                          props_resp = requests.get(base + "/props", headers=headers, timeout=5, verify=_verify)
 688                      if props_resp.ok:
 689                          props = props_resp.json()
 690                          gen_settings = props.get("default_generation_settings", {})
 691                          n_ctx = gen_settings.get("n_ctx")
 692                          model_alias = props.get("model_alias", "")
 693                          if n_ctx and model_alias and model_alias in cache:
 694                              cache[model_alias]["context_length"] = n_ctx
 695                  except Exception:
 696                      pass
 697  
 698              _endpoint_model_metadata_cache[normalized] = cache
 699              _endpoint_model_metadata_cache_time[normalized] = time.time()
 700              return cache
 701          except Exception as exc:
 702              last_error = exc
 703  
 704      if last_error:
 705          logger.debug("Failed to fetch model metadata from %s/models: %s", normalized, last_error)
 706      _endpoint_model_metadata_cache[normalized] = {}
 707      _endpoint_model_metadata_cache_time[normalized] = time.time()
 708      return {}
 709  
 710  
 711  def _resolve_endpoint_context_length(
 712      model: str,
 713      base_url: str,
 714      api_key: str = "",
 715  ) -> Optional[int]:
 716      """Resolve context length from an endpoint's live ``/models`` metadata."""
 717      endpoint_metadata = fetch_endpoint_model_metadata(base_url, api_key=api_key)
 718      matched = endpoint_metadata.get(model)
 719      if not matched:
 720          if len(endpoint_metadata) == 1:
 721              matched = next(iter(endpoint_metadata.values()))
 722          else:
 723              for key, entry in endpoint_metadata.items():
 724                  if model in key or key in model:
 725                      matched = entry
 726                      break
 727      if matched:
 728          context_length = matched.get("context_length")
 729          if isinstance(context_length, int):
 730              return context_length
 731      return None
 732  
 733  
 734  def _get_context_cache_path() -> Path:
 735      """Return path to the persistent context length cache file."""
 736      from hermes_constants import get_hermes_home
 737      return get_hermes_home() / "context_length_cache.yaml"
 738  
 739  
 740  def _load_context_cache() -> Dict[str, int]:
 741      """Load the model+provider -> context_length cache from disk."""
 742      path = _get_context_cache_path()
 743      if not path.exists():
 744          return {}
 745      try:
 746          with open(path) as f:
 747              data = yaml.safe_load(f) or {}
 748          return data.get("context_lengths", {})
 749      except Exception as e:
 750          logger.debug("Failed to load context length cache: %s", e)
 751          return {}
 752  
 753  
 754  def save_context_length(model: str, base_url: str, length: int) -> None:
 755      """Persist a discovered context length for a model+provider combo.
 756  
 757      Cache key is ``model@base_url`` so the same model name served from
 758      different providers can have different limits.
 759      """
 760      key = f"{model}@{base_url}"
 761      cache = _load_context_cache()
 762      if cache.get(key) == length:
 763          return  # already stored
 764      cache[key] = length
 765      path = _get_context_cache_path()
 766      try:
 767          path.parent.mkdir(parents=True, exist_ok=True)
 768          with open(path, "w") as f:
 769              yaml.dump({"context_lengths": cache}, f, default_flow_style=False)
 770          logger.info("Cached context length %s -> %s tokens", key, f"{length:,}")
 771      except Exception as e:
 772          logger.debug("Failed to save context length cache: %s", e)
 773  
 774  
 775  def get_cached_context_length(model: str, base_url: str) -> Optional[int]:
 776      """Look up a previously discovered context length for model+provider."""
 777      key = f"{model}@{base_url}"
 778      cache = _load_context_cache()
 779      return cache.get(key)
 780  
 781  
 782  def _invalidate_cached_context_length(model: str, base_url: str) -> None:
 783      """Drop a stale cache entry so it gets re-resolved on the next lookup."""
 784      key = f"{model}@{base_url}"
 785      cache = _load_context_cache()
 786      if key not in cache:
 787          return
 788      del cache[key]
 789      path = _get_context_cache_path()
 790      try:
 791          path.parent.mkdir(parents=True, exist_ok=True)
 792          with open(path, "w") as f:
 793              yaml.dump({"context_lengths": cache}, f, default_flow_style=False)
 794      except Exception as e:
 795          logger.debug("Failed to invalidate context length cache entry %s: %s", key, e)
 796  
 797  
 798  def get_next_probe_tier(current_length: int) -> Optional[int]:
 799      """Return the next lower probe tier, or None if already at minimum."""
 800      for tier in CONTEXT_PROBE_TIERS:
 801          if tier < current_length:
 802              return tier
 803      return None
 804  
 805  
 806  def parse_context_limit_from_error(error_msg: str) -> Optional[int]:
 807      """Try to extract the actual context limit from an API error message.
 808  
 809      Many providers include the limit in their error text, e.g.:
 810        - "maximum context length is 32768 tokens"
 811        - "context_length_exceeded: 131072"
 812        - "Maximum context size 32768 exceeded"
 813        - "model's max context length is 65536"
 814      """
 815      error_lower = error_msg.lower()
 816      # Pattern: look for numbers near context-related keywords
 817      patterns = [
 818          r'(?:max(?:imum)?|limit)\s*(?:context\s*)?(?:length|size|window)?\s*(?:is|of|:)?\s*(\d{4,})',
 819          r'context\s*(?:length|size|window)\s*(?:is|of|:)?\s*(\d{4,})',
 820          r'(\d{4,})\s*(?:token)?\s*(?:context|limit)',
 821          r'>\s*(\d{4,})\s*(?:max|limit|token)',  # "250000 tokens > 200000 maximum"
 822          r'(\d{4,})\s*(?:max(?:imum)?)\b',  # "200000 maximum"
 823      ]
 824      for pattern in patterns:
 825          match = re.search(pattern, error_lower)
 826          if match:
 827              limit = int(match.group(1))
 828              # Sanity check: must be a reasonable context length
 829              if 1024 <= limit <= 10_000_000:
 830                  return limit
 831      return None
 832  
 833  
 834  def parse_available_output_tokens_from_error(error_msg: str) -> Optional[int]:
 835      """Detect an "output cap too large" error and return how many output tokens are available.
 836  
 837      Background — two distinct context errors exist:
 838        1. "Prompt too long"  — the INPUT itself exceeds the context window.
 839             Fix: compress history and/or halve context_length.
 840        2. "max_tokens too large" — input is fine, but input + requested_output > window.
 841             Fix: reduce max_tokens (the output cap) for this call.
 842             Do NOT touch context_length — the window hasn't shrunk.
 843  
 844      Anthropic's API returns errors like:
 845        "max_tokens: 32768 > context_window: 200000 - input_tokens: 190000 = available_tokens: 10000"
 846  
 847      Returns the number of output tokens that would fit (e.g. 10000 above), or None if
 848      the error does not look like a max_tokens-too-large error.
 849      """
 850      error_lower = error_msg.lower()
 851  
 852      # Must look like an output-cap error, not a prompt-length error.
 853      is_output_cap_error = (
 854          "max_tokens" in error_lower
 855          and ("available_tokens" in error_lower or "available tokens" in error_lower)
 856      )
 857      if not is_output_cap_error:
 858          return None
 859  
 860      # Extract the available_tokens figure.
 861      # Anthropic format: "… = available_tokens: 10000"
 862      patterns = [
 863          r'available_tokens[:\s]+(\d+)',
 864          r'available\s+tokens[:\s]+(\d+)',
 865          # fallback: last number after "=" in expressions like "200000 - 190000 = 10000"
 866          r'=\s*(\d+)\s*$',
 867      ]
 868      for pattern in patterns:
 869          match = re.search(pattern, error_lower)
 870          if match:
 871              tokens = int(match.group(1))
 872              if tokens >= 1:
 873                  return tokens
 874      return None
 875  
 876  
 877  def _model_id_matches(candidate_id: str, lookup_model: str) -> bool:
 878      """Return True if *candidate_id* (from server) matches *lookup_model* (configured).
 879  
 880      Supports two forms:
 881      - Exact match:  "nvidia-nemotron-super-49b-v1" == "nvidia-nemotron-super-49b-v1"
 882      - Slug match:   "nvidia/nvidia-nemotron-super-49b-v1" matches "nvidia-nemotron-super-49b-v1"
 883                      (the part after the last "/" equals lookup_model)
 884  
 885      This covers LM Studio's native API which stores models as "publisher/slug"
 886      while users typically configure only the slug after the "local:" prefix.
 887      """
 888      if candidate_id == lookup_model:
 889          return True
 890      # Slug match: basename of candidate equals the lookup name
 891      if "/" in candidate_id and candidate_id.rsplit("/", 1)[1] == lookup_model:
 892          return True
 893      return False
 894  
 895  
 896  def query_ollama_num_ctx(model: str, base_url: str, api_key: str = "") -> Optional[int]:
 897      """Query an Ollama server for the model's context length.
 898  
 899      Returns the model's maximum context from GGUF metadata via ``/api/show``,
 900      or the explicit ``num_ctx`` from the Modelfile if set.  Returns None if
 901      the server is unreachable or not Ollama.
 902  
 903      This is the value that should be passed as ``num_ctx`` in Ollama chat
 904      requests to override the default 2048.
 905      """
 906      import httpx
 907  
 908      bare_model = _strip_provider_prefix(model)
 909      server_url = base_url.rstrip("/")
 910      if server_url.endswith("/v1"):
 911          server_url = server_url[:-3]
 912  
 913      try:
 914          server_type = detect_local_server_type(base_url, api_key=api_key)
 915      except Exception:
 916          return None
 917      if server_type != "ollama":
 918          return None
 919  
 920      headers = _auth_headers(api_key)
 921  
 922      try:
 923          with httpx.Client(timeout=3.0, headers=headers) as client:
 924              resp = client.post(f"{server_url}/api/show", json={"name": bare_model})
 925              if resp.status_code != 200:
 926                  return None
 927              data = resp.json()
 928  
 929              # Prefer explicit num_ctx from Modelfile parameters (user override)
 930              params = data.get("parameters", "")
 931              if "num_ctx" in params:
 932                  for line in params.split("\n"):
 933                      if "num_ctx" in line:
 934                          parts = line.strip().split()
 935                          if len(parts) >= 2:
 936                              try:
 937                                  return int(parts[-1])
 938                              except ValueError:
 939                                  pass
 940  
 941              # Fall back to GGUF model_info context_length (training max)
 942              model_info = data.get("model_info", {})
 943              for key, value in model_info.items():
 944                  if "context_length" in key and isinstance(value, (int, float)):
 945                      return int(value)
 946      except Exception:
 947          pass
 948      return None
 949  
 950  
 951  def _query_local_context_length(model: str, base_url: str, api_key: str = "") -> Optional[int]:
 952      """Query a local server for the model's context length."""
 953      import httpx
 954  
 955      # Strip recognised provider prefix (e.g., "local:model-name" → "model-name").
 956      # Ollama "model:tag" colons (e.g. "qwen3.5:27b") are intentionally preserved.
 957      model = _strip_provider_prefix(model)
 958  
 959      # Strip /v1 suffix to get the server root
 960      server_url = base_url.rstrip("/")
 961      if server_url.endswith("/v1"):
 962          server_url = server_url[:-3]
 963  
 964      headers = _auth_headers(api_key)
 965  
 966      try:
 967          server_type = detect_local_server_type(base_url, api_key=api_key)
 968      except Exception:
 969          server_type = None
 970  
 971      try:
 972          with httpx.Client(timeout=3.0, headers=headers) as client:
 973              # Ollama: /api/show returns model details with context info
 974              if server_type == "ollama":
 975                  resp = client.post(f"{server_url}/api/show", json={"name": model})
 976                  if resp.status_code == 200:
 977                      data = resp.json()
 978                      # Prefer explicit num_ctx from Modelfile parameters: this is
 979                      # the *runtime* context Ollama will actually allocate KV cache
 980                      # for. The GGUF model_info.context_length is the training max,
 981                      # which can be larger than num_ctx — using it here would let
 982                      # Hermes grow conversations past the runtime limit and Ollama
 983                      # would silently truncate. Matches query_ollama_num_ctx().
 984                      params = data.get("parameters", "")
 985                      if "num_ctx" in params:
 986                          for line in params.split("\n"):
 987                              if "num_ctx" in line:
 988                                  parts = line.strip().split()
 989                                  if len(parts) >= 2:
 990                                      try:
 991                                          return int(parts[-1])
 992                                      except ValueError:
 993                                          pass
 994                      # Fall back to GGUF model_info context_length (training max)
 995                      model_info = data.get("model_info", {})
 996                      for key, value in model_info.items():
 997                          if "context_length" in key and isinstance(value, (int, float)):
 998                              return int(value)
 999  
1000              # LM Studio native API: /api/v1/models returns max_context_length.
1001              # This is more reliable than the OpenAI-compat /v1/models which
1002              # doesn't include context window information for LM Studio servers.
1003              # Use _model_id_matches for fuzzy matching: LM Studio stores models as
1004              # "publisher/slug" but users configure only "slug" after "local:" prefix.
1005              if server_type == "lm-studio":
1006                  resp = client.get(f"{server_url}/api/v1/models")
1007                  if resp.status_code == 200:
1008                      data = resp.json()
1009                      for m in data.get("models", []):
1010                          if _model_id_matches(m.get("key", ""), model) or _model_id_matches(m.get("id", ""), model):
1011                              # Prefer loaded instance context (actual runtime value)
1012                              for inst in m.get("loaded_instances", []):
1013                                  cfg = inst.get("config", {})
1014                                  ctx = cfg.get("context_length")
1015                                  if ctx and isinstance(ctx, (int, float)):
1016                                      return int(ctx)
1017                              break
1018  
1019              # LM Studio / vLLM / llama.cpp: try /v1/models/{model}
1020              resp = client.get(f"{server_url}/v1/models/{model}")
1021              if resp.status_code == 200:
1022                  data = resp.json()
1023                  # vLLM returns max_model_len
1024                  ctx = data.get("max_model_len") or data.get("context_length") or data.get("max_tokens")
1025                  if ctx and isinstance(ctx, (int, float)):
1026                      return int(ctx)
1027  
1028              # Try /v1/models and find the model in the list.
1029              # Use _model_id_matches to handle "publisher/slug" vs bare "slug".
1030              resp = client.get(f"{server_url}/v1/models")
1031              if resp.status_code == 200:
1032                  data = resp.json()
1033                  models_list = data.get("data", [])
1034                  for m in models_list:
1035                      if _model_id_matches(m.get("id", ""), model):
1036                          ctx = m.get("max_model_len") or m.get("context_length") or m.get("max_tokens")
1037                          if ctx and isinstance(ctx, (int, float)):
1038                              return int(ctx)
1039      except Exception:
1040          pass
1041  
1042      return None
1043  
1044  
1045  def _normalize_model_version(model: str) -> str:
1046      """Normalize version separators for matching.
1047  
1048      Nous uses dashes: claude-opus-4-6, claude-sonnet-4-5
1049      OpenRouter uses dots: claude-opus-4.6, claude-sonnet-4.5
1050      Normalize both to dashes for comparison.
1051      """
1052      return model.replace(".", "-")
1053  
1054  
1055  def _query_anthropic_context_length(model: str, base_url: str, api_key: str) -> Optional[int]:
1056      """Query Anthropic's /v1/models endpoint for context length.
1057  
1058      Only works with regular ANTHROPIC_API_KEY (sk-ant-api*).
1059      OAuth tokens (sk-ant-oat*) from Claude Code return 401.
1060      """
1061      if not api_key or api_key.startswith("sk-ant-oat"):
1062          return None  # OAuth tokens can't access /v1/models
1063      try:
1064          base = base_url.rstrip("/")
1065          if base.endswith("/v1"):
1066              base = base[:-3]
1067          url = f"{base}/v1/models?limit=1000"
1068          headers = {
1069              "x-api-key": api_key,
1070              "anthropic-version": "2023-06-01",
1071          }
1072          resp = requests.get(url, headers=headers, timeout=10, verify=_resolve_requests_verify())
1073          if resp.status_code != 200:
1074              return None
1075          data = resp.json()
1076          for m in data.get("data", []):
1077              if m.get("id") == model:
1078                  ctx = m.get("max_input_tokens")
1079                  if isinstance(ctx, int) and ctx > 0:
1080                      return ctx
1081      except Exception as e:
1082          logger.debug("Anthropic /v1/models query failed: %s", e)
1083      return None
1084  
1085  
1086  # Known ChatGPT Codex OAuth context windows (observed via live
1087  # chatgpt.com/backend-api/codex/models probe, Apr 2026). These are the
1088  # `context_window` values, which are what Codex actually enforces — the
1089  # direct OpenAI API has larger limits for the same slugs, but Codex OAuth
1090  # caps lower (e.g. gpt-5.5 is 1.05M on the API, 272K on Codex).
1091  #
1092  # Used as a fallback when the live probe fails (no token, network error).
1093  # Longest keys first so substring match picks the most specific entry.
1094  _CODEX_OAUTH_CONTEXT_FALLBACK: Dict[str, int] = {
1095      "gpt-5.1-codex-max": 272_000,
1096      "gpt-5.1-codex-mini": 272_000,
1097      "gpt-5.3-codex": 272_000,
1098      "gpt-5.2-codex": 272_000,
1099      "gpt-5.4-mini": 272_000,
1100      "gpt-5.5": 272_000,
1101      "gpt-5.4": 272_000,
1102      "gpt-5.2": 272_000,
1103      "gpt-5": 272_000,
1104  }
1105  
1106  
1107  _codex_oauth_context_cache: Dict[str, int] = {}
1108  _codex_oauth_context_cache_time: float = 0.0
1109  _CODEX_OAUTH_CONTEXT_CACHE_TTL = 3600  # 1 hour
1110  
1111  
1112  def _fetch_codex_oauth_context_lengths(access_token: str) -> Dict[str, int]:
1113      """Probe the ChatGPT Codex /models endpoint for per-slug context windows.
1114  
1115      Codex OAuth imposes its own context limits that differ from the direct
1116      OpenAI API (e.g. gpt-5.5 is 1.05M on the API, 272K on Codex). The
1117      `context_window` field in each model entry is the authoritative source.
1118  
1119      Returns a ``{slug: context_window}`` dict. Empty on failure.
1120      """
1121      global _codex_oauth_context_cache, _codex_oauth_context_cache_time
1122      now = time.time()
1123      if (
1124          _codex_oauth_context_cache
1125          and now - _codex_oauth_context_cache_time < _CODEX_OAUTH_CONTEXT_CACHE_TTL
1126      ):
1127          return _codex_oauth_context_cache
1128  
1129      try:
1130          resp = requests.get(
1131              "https://chatgpt.com/backend-api/codex/models?client_version=1.0.0",
1132              headers={"Authorization": f"Bearer {access_token}"},
1133              timeout=10,
1134              verify=_resolve_requests_verify(),
1135          )
1136          if resp.status_code != 200:
1137              logger.debug(
1138                  "Codex /models probe returned HTTP %s; falling back to hardcoded defaults",
1139                  resp.status_code,
1140              )
1141              return {}
1142          data = resp.json()
1143      except Exception as exc:
1144          logger.debug("Codex /models probe failed: %s", exc)
1145          return {}
1146  
1147      entries = data.get("models", []) if isinstance(data, dict) else []
1148      result: Dict[str, int] = {}
1149      for item in entries:
1150          if not isinstance(item, dict):
1151              continue
1152          slug = item.get("slug")
1153          ctx = item.get("context_window")
1154          if isinstance(slug, str) and isinstance(ctx, int) and ctx > 0:
1155              result[slug.strip()] = ctx
1156  
1157      if result:
1158          _codex_oauth_context_cache = result
1159          _codex_oauth_context_cache_time = now
1160      return result
1161  
1162  
1163  def _resolve_codex_oauth_context_length(
1164      model: str, access_token: str = ""
1165  ) -> Optional[int]:
1166      """Resolve a Codex OAuth model's real context window.
1167  
1168      Prefers a live probe of chatgpt.com/backend-api/codex/models (when we
1169      have a bearer token), then falls back to ``_CODEX_OAUTH_CONTEXT_FALLBACK``.
1170      """
1171      model_bare = _strip_provider_prefix(model).strip()
1172      if not model_bare:
1173          return None
1174  
1175      if access_token:
1176          live = _fetch_codex_oauth_context_lengths(access_token)
1177          if model_bare in live:
1178              return live[model_bare]
1179          # Case-insensitive match in case casing drifts
1180          model_lower = model_bare.lower()
1181          for slug, ctx in live.items():
1182              if slug.lower() == model_lower:
1183                  return ctx
1184  
1185      # Fallback: longest-key-first substring match over hardcoded defaults.
1186      model_lower = model_bare.lower()
1187      for slug, ctx in sorted(
1188          _CODEX_OAUTH_CONTEXT_FALLBACK.items(), key=lambda x: len(x[0]), reverse=True
1189      ):
1190          if slug in model_lower:
1191              return ctx
1192  
1193      return None
1194  
1195  
1196  def _resolve_nous_context_length(model: str) -> Optional[int]:
1197      """Resolve Nous Portal model context length via OpenRouter metadata.
1198  
1199      Nous model IDs are bare (e.g. 'claude-opus-4-6') while OpenRouter uses
1200      prefixed IDs (e.g. 'anthropic/claude-opus-4.6'). Try suffix matching
1201      with version normalization (dot↔dash).
1202      """
1203      metadata = fetch_model_metadata()  # OpenRouter cache
1204      # Exact match first
1205      if model in metadata:
1206          return metadata[model].get("context_length")
1207  
1208      normalized = _normalize_model_version(model).lower()
1209  
1210      for or_id, entry in metadata.items():
1211          bare = or_id.split("/", 1)[1] if "/" in or_id else or_id
1212          if bare.lower() == model.lower() or _normalize_model_version(bare).lower() == normalized:
1213              return entry.get("context_length")
1214  
1215      # Partial prefix match for cases like gemini-3-flash → gemini-3-flash-preview
1216      # Require match to be at a word boundary (followed by -, :, or end of string)
1217      model_lower = model.lower()
1218      for or_id, entry in metadata.items():
1219          bare = or_id.split("/", 1)[1] if "/" in or_id else or_id
1220          for candidate, query in [(bare.lower(), model_lower), (_normalize_model_version(bare).lower(), normalized)]:
1221              if candidate.startswith(query) and (
1222                  len(candidate) == len(query) or candidate[len(query)] in "-:."
1223              ):
1224                  return entry.get("context_length")
1225  
1226      return None
1227  
1228  
1229  def get_model_context_length(
1230      model: str,
1231      base_url: str = "",
1232      api_key: str = "",
1233      config_context_length: int | None = None,
1234      provider: str = "",
1235      custom_providers: list | None = None,
1236  ) -> int:
1237      """Get the context length for a model.
1238  
1239      Resolution order:
1240      0. Explicit config override (model.context_length or custom_providers per-model)
1241      1. Persistent cache (previously discovered via probing)
1242      1b. AWS Bedrock static table (must precede custom-endpoint probe)
1243      2. Active endpoint metadata (/models for explicit custom endpoints)
1244      3. Local server query (for local endpoints)
1245      4. Anthropic /v1/models API (API-key users only, not OAuth)
1246      5. OpenRouter live API metadata
1247      6. Nous suffix-match via OpenRouter cache
1248      7. models.dev registry lookup (provider-aware)
1249      8. Thin hardcoded defaults (broad family patterns)
1250      9. Default fallback (256K)
1251      """
1252      # 0. Explicit config override — user knows best
1253      if config_context_length is not None and isinstance(config_context_length, int) and config_context_length > 0:
1254          return config_context_length
1255  
1256      # 0b. custom_providers per-model override — check before any probe.
1257      # This closes the gap where /model switch and display paths used to fall
1258      # back to 128K despite the user having a per-model context_length set.
1259      # See #15779.
1260      if custom_providers and base_url and model:
1261          try:
1262              from hermes_cli.config import get_custom_provider_context_length
1263              cp_ctx = get_custom_provider_context_length(
1264                  model=model,
1265                  base_url=base_url,
1266                  custom_providers=custom_providers,
1267              )
1268              if cp_ctx:
1269                  return cp_ctx
1270          except Exception:
1271              pass  # fall through to probing
1272  
1273      # Normalise provider-prefixed model names (e.g. "local:model-name" →
1274      # "model-name") so cache lookups and server queries use the bare ID that
1275      # local servers actually know about.  Ollama "model:tag" colons are preserved.
1276      model = _strip_provider_prefix(model)
1277  
1278      # 1. Check persistent cache (model+provider)
1279      # LM Studio is excluded — its loaded context length is transient (the
1280      # user can reload the model with a different context_length at any time
1281      # via /api/v1/models/load), so a stale cached value would mask reloads.
1282      if base_url and provider != "lmstudio":
1283          cached = get_cached_context_length(model, base_url)
1284          if cached is not None:
1285              # Invalidate stale Codex OAuth cache entries: pre-PR #14935 builds
1286              # resolved gpt-5.x to the direct-API value (e.g. 1.05M) via
1287              # models.dev and persisted it. Codex OAuth caps at 272K for every
1288              # slug, so any cached Codex entry at or above 400K is a leftover
1289              # from the old resolution path. Drop it and fall through to the
1290              # live /models probe in step 5 below.
1291              if provider == "openai-codex" and cached >= 400_000:
1292                  logger.info(
1293                      "Dropping stale Codex cache entry %s@%s -> %s (pre-fix value); "
1294                      "re-resolving via live /models probe",
1295                      model, base_url, f"{cached:,}",
1296                  )
1297                  _invalidate_cached_context_length(model, base_url)
1298              else:
1299                  return cached
1300  
1301      # 1b. AWS Bedrock — use static context length table.
1302      # Bedrock's ListFoundationModels API doesn't expose context window sizes,
1303      # so we maintain a curated table in bedrock_adapter.py that reflects
1304      # AWS-imposed limits (e.g. 200K for Claude models vs 1M on the native
1305      # Anthropic API).  This must run BEFORE the custom-endpoint probe at
1306      # step 2 — bedrock-runtime.<region>.amazonaws.com is not in
1307      # _URL_TO_PROVIDER, so it would otherwise be treated as a custom endpoint,
1308      # fail the /models probe (Bedrock doesn't expose that shape), and fall
1309      # back to the 128K default before reaching the original step 4b branch.
1310      if provider == "bedrock" or (
1311          base_url
1312          and base_url_hostname(base_url).startswith("bedrock-runtime.")
1313          and base_url_host_matches(base_url, "amazonaws.com")
1314      ):
1315          try:
1316              from agent.bedrock_adapter import get_bedrock_context_length
1317              return get_bedrock_context_length(model)
1318          except ImportError:
1319              pass  # boto3 not installed — fall through to generic resolution
1320  
1321      # 2. Active endpoint metadata for truly custom/unknown endpoints.
1322      # Known providers (Copilot, OpenAI, Anthropic, etc.) skip this — their
1323      # /models endpoint may report a provider-imposed limit (e.g. Copilot
1324      # returns 128k) instead of the model's full context (400k).  models.dev
1325      # has the correct per-provider values and is checked at step 5+.
1326      if _is_custom_endpoint(base_url) and not _is_known_provider_base_url(base_url):
1327          context_length = _resolve_endpoint_context_length(model, base_url, api_key=api_key)
1328          if context_length is not None:
1329              return context_length
1330          if not _is_known_provider_base_url(base_url):
1331              # 3. Try querying local server directly
1332              if is_local_endpoint(base_url):
1333                  local_ctx = _query_local_context_length(model, base_url, api_key=api_key)
1334                  if local_ctx and local_ctx > 0:
1335                      if provider != "lmstudio":
1336                          save_context_length(model, base_url, local_ctx)
1337                      return local_ctx
1338              logger.info(
1339                  "Could not detect context length for model %r at %s — "
1340                  "defaulting to %s tokens (probe-down). Set model.context_length "
1341                  "in config.yaml to override.",
1342                  model, base_url, f"{DEFAULT_FALLBACK_CONTEXT:,}",
1343              )
1344              return DEFAULT_FALLBACK_CONTEXT
1345  
1346      # 4. Anthropic /v1/models API (only for regular API keys, not OAuth)
1347      if provider == "anthropic" or (
1348          base_url and base_url_hostname(base_url) == "api.anthropic.com"
1349      ):
1350          ctx = _query_anthropic_context_length(model, base_url or "https://api.anthropic.com", api_key)
1351          if ctx:
1352              return ctx
1353  
1354      # 4b. (Bedrock handled earlier at step 1b — before custom-endpoint probe.)
1355  
1356      # 5. Provider-aware lookups (before generic OpenRouter cache)
1357      # These are provider-specific and take priority over the generic OR cache,
1358      # since the same model can have different context limits per provider
1359      # (e.g. claude-opus-4.6 is 1M on Anthropic but 128K on GitHub Copilot).
1360      # If provider is generic (openrouter/custom/empty), try to infer from URL.
1361      effective_provider = provider
1362      if not effective_provider or effective_provider in ("openrouter", "custom"):
1363          if base_url:
1364              inferred = _infer_provider_from_url(base_url)
1365              if inferred:
1366                  effective_provider = inferred
1367  
1368      # 5a. Copilot live /models API — max_prompt_tokens from the user's account.
1369      # This catches account-specific models (e.g. claude-opus-4.6-1m) that
1370      # don't exist in models.dev. For models that ARE in models.dev, this
1371      # returns the provider-enforced limit which is what users can actually use.
1372      if effective_provider in ("copilot", "copilot-acp", "github-copilot"):
1373          try:
1374              from hermes_cli.models import get_copilot_model_context
1375              ctx = get_copilot_model_context(model, api_key=api_key)
1376              if ctx:
1377                  return ctx
1378          except Exception:
1379              pass  # Fall through to models.dev
1380  
1381      if effective_provider == "nous":
1382          ctx = _resolve_nous_context_length(model)
1383          if ctx:
1384              return ctx
1385      if effective_provider == "openai-codex":
1386          # Codex OAuth enforces lower context limits than the direct OpenAI
1387          # API for the same slug (e.g. gpt-5.5 is 1.05M on the API but 272K
1388          # on Codex). Authoritative source is Codex's own /models endpoint.
1389          codex_ctx = _resolve_codex_oauth_context_length(model, access_token=api_key or "")
1390          if codex_ctx:
1391              if base_url:
1392                  save_context_length(model, base_url, codex_ctx)
1393              return codex_ctx
1394      if effective_provider == "gmi" and base_url:
1395          # GMI exposes authoritative context_length via /models, but it is not
1396          # in models.dev yet. Preserve that higher-fidelity endpoint lookup.
1397          ctx = _resolve_endpoint_context_length(model, base_url, api_key=api_key)
1398          if ctx is not None:
1399              return ctx
1400      if effective_provider:
1401          from agent.models_dev import lookup_models_dev_context
1402          ctx = lookup_models_dev_context(effective_provider, model)
1403          if ctx:
1404              return ctx
1405  
1406      # 6. OpenRouter live API metadata (provider-unaware fallback)
1407      metadata = fetch_model_metadata()
1408      if model in metadata:
1409          return metadata[model].get("context_length", DEFAULT_FALLBACK_CONTEXT)
1410  
1411      # 8. Hardcoded defaults (fuzzy match — longest key first for specificity)
1412      # Only check `default_model in model` (is the key a substring of the input).
1413      # The reverse (`model in default_model`) causes shorter names like
1414      # "claude-sonnet-4" to incorrectly match "claude-sonnet-4-6" and return 1M.
1415      model_lower = model.lower()
1416      for default_model, length in sorted(
1417          DEFAULT_CONTEXT_LENGTHS.items(), key=lambda x: len(x[0]), reverse=True
1418      ):
1419          if default_model in model_lower:
1420              return length
1421  
1422      # 9. Query local server as last resort
1423      if base_url and is_local_endpoint(base_url):
1424          local_ctx = _query_local_context_length(model, base_url, api_key=api_key)
1425          if local_ctx and local_ctx > 0:
1426              if provider != "lmstudio":
1427                  save_context_length(model, base_url, local_ctx)
1428              return local_ctx
1429  
1430      # 10. Default fallback — 256K
1431      return DEFAULT_FALLBACK_CONTEXT
1432  
1433  
1434  def estimate_tokens_rough(text: str) -> int:
1435      """Rough token estimate (~4 chars/token) for pre-flight checks.
1436  
1437      Uses ceiling division so short texts (1-3 chars) never estimate as
1438      0 tokens, which would cause the compressor and pre-flight checks to
1439      systematically undercount when many short tool results are present.
1440      """
1441      if not text:
1442          return 0
1443      return (len(text) + 3) // 4
1444  
1445  
1446  def estimate_messages_tokens_rough(messages: List[Dict[str, Any]]) -> int:
1447      """Rough token estimate for a message list (pre-flight only)."""
1448      total_chars = sum(len(str(msg)) for msg in messages)
1449      return (total_chars + 3) // 4
1450  
1451  
1452  def estimate_request_tokens_rough(
1453      messages: List[Dict[str, Any]],
1454      *,
1455      system_prompt: str = "",
1456      tools: Optional[List[Dict[str, Any]]] = None,
1457  ) -> int:
1458      """Rough token estimate for a full chat-completions request.
1459  
1460      Includes the major payload buckets Hermes sends to providers:
1461      system prompt, conversation messages, and tool schemas.  With 50+
1462      tools enabled, schemas alone can add 20-30K tokens — a significant
1463      blind spot when only counting messages.
1464      """
1465      total_chars = 0
1466      if system_prompt:
1467          total_chars += len(system_prompt)
1468      if messages:
1469          total_chars += sum(len(str(msg)) for msg in messages)
1470      if tools:
1471          total_chars += len(str(tools))
1472      return (total_chars + 3) // 4