# tests/agent/test_model_metadata.py
   1  """Tests for agent/model_metadata.py — token estimation, context lengths,
   2  probing, caching, and error parsing.
   3  
   4  Coverage levels:
   5    Token estimation       — concrete value assertions, edge cases
   6    Context length lookup  — resolution order, fuzzy match, cache priority
   7    API metadata fetch     — caching, TTL, canonical slugs, stale fallback
   8    Probe tiers            — descending, boundaries, extreme inputs
   9    Error parsing          — OpenAI, Ollama, Anthropic, edge cases
  10    Persistent cache       — save/load, corruption, update, provider isolation
  11  """
  12  
  13  import os
  14  import time
  15  import tempfile
  16  
  17  import pytest
  18  import yaml
  19  from pathlib import Path
  20  from unittest.mock import patch, MagicMock
  21  
  22  from agent.model_metadata import (
  23      CONTEXT_PROBE_TIERS,
  24      DEFAULT_CONTEXT_LENGTHS,
  25      _strip_provider_prefix,
  26      estimate_tokens_rough,
  27      estimate_messages_tokens_rough,
  28      get_model_context_length,
  29      get_next_probe_tier,
  30      get_cached_context_length,
  31      parse_context_limit_from_error,
  32      save_context_length,
  33      fetch_model_metadata,
  34      _MODEL_CACHE_TTL,
  35  )
  36  
  37  
  38  # =========================================================================
  39  # Token estimation
  40  # =========================================================================
  41  
  42  class TestEstimateTokensRough:
  43      def test_empty_string(self):
  44          assert estimate_tokens_rough("") == 0
  45  
  46      def test_none_returns_zero(self):
  47          assert estimate_tokens_rough(None) == 0
  48  
  49      def test_known_length(self):
  50          assert estimate_tokens_rough("a" * 400) == 100
  51  
  52      def test_short_text(self):
  53          # "hello" = 5 chars → ceil(5/4) = 2
  54          assert estimate_tokens_rough("hello") == 2
  55  
  56      def test_proportional(self):
  57          short = estimate_tokens_rough("hello world")
  58          long = estimate_tokens_rough("hello world " * 100)
  59          assert long > short
  60  
  61      def test_unicode_multibyte(self):
  62          """Unicode chars are still 1 Python char each — 4 chars/token holds."""
  63          text = "你好世界"  # 4 CJK characters
  64          assert estimate_tokens_rough(text) == 1
  65  
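
# A minimal sketch of the estimator the assertions above imply (~4 chars per
# token, ceiling division). Illustrative assumption only — not the real
# agent.model_metadata implementation.
def _estimate_tokens_rough_sketch(text):
    if not text:
        return 0
    return (len(text) + 3) // 4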

class TestEstimateMessagesTokensRough:
    def test_empty_list(self):
        assert estimate_messages_tokens_rough([]) == 0

    def test_single_message_concrete_value(self):
        """Verify against known str(msg) length (ceiling division)."""
        msg = {"role": "user", "content": "a" * 400}
        result = estimate_messages_tokens_rough([msg])
        n = len(str(msg))
        expected = (n + 3) // 4
        assert result == expected

    def test_multiple_messages_additive(self):
        msgs = [
            {"role": "user", "content": "Hello"},
            {"role": "assistant", "content": "Hi there, how can I help?"},
        ]
        result = estimate_messages_tokens_rough(msgs)
        n = sum(len(str(m)) for m in msgs)
        expected = (n + 3) // 4
        assert result == expected

    def test_tool_call_message(self):
        """Tool call messages with content=None still contribute tokens."""
        msg = {"role": "assistant", "content": None,
               "tool_calls": [{"id": "1", "function": {"name": "terminal", "arguments": "{}"}}]}
        result = estimate_messages_tokens_rough([msg])
        assert result > 0
        assert result == (len(str(msg)) + 3) // 4

    def test_message_with_list_content(self):
        """Vision messages with multimodal content arrays."""
        msg = {"role": "user", "content": [
            {"type": "text", "text": "describe"},
            {"type": "image_url", "image_url": {"url": "data:image/png;base64,AAAA"}}
        ]}
        result = estimate_messages_tokens_rough([msg])
        assert result == (len(str(msg)) + 3) // 4
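
# Message-level counterpart implied by the expected values above: serialize
# each message with str() and apply the same ceil(n/4) rule to the combined
# length. Assumed shape only — the real function may serialize differently.
def _estimate_messages_tokens_rough_sketch(messages):
    total_chars = sum(len(str(m)) for m in messages)
    return (total_chars + 3) // 4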

# =========================================================================
# Default context lengths
# =========================================================================

class TestDefaultContextLengths:
    def test_claude_models_context_lengths(self):
        for key, value in DEFAULT_CONTEXT_LENGTHS.items():
            if "claude" not in key:
                continue
            # Claude 4.6+ models (4.6 and 4.7) have 1M context at standard
            # API pricing (no long-context premium).  Older Claude 4.x and
            # 3.x models cap at 200k.
            if any(tag in key for tag in ("4.6", "4-6", "4.7", "4-7")):
                assert value == 1000000, f"{key} should be 1000000"
            else:
                assert value == 200000, f"{key} should be 200000"

    def test_gpt4_models_128k_or_1m(self):
        # gpt-4.1 and gpt-4.1-mini have 1M context; other gpt-4* have 128k
        for key, value in DEFAULT_CONTEXT_LENGTHS.items():
            if "gpt-4" in key and "gpt-4.1" not in key:
                assert value == 128000, f"{key} should be 128000"

    def test_gpt41_models_1m(self):
        for key, value in DEFAULT_CONTEXT_LENGTHS.items():
            if "gpt-4.1" in key:
                assert value == 1047576, f"{key} should be 1047576"

    def test_gemini_models_1m(self):
        for key, value in DEFAULT_CONTEXT_LENGTHS.items():
            if "gemini" in key:
                assert value == 1048576, f"{key} should be 1048576"

    def test_grok_models_context_lengths(self):
        # xAI /v1/models does not return context_length metadata, so
        # DEFAULT_CONTEXT_LENGTHS must cover the Grok family explicitly.
        # Values sourced from models.dev (2026-04).
        expected = {
            "grok-4.20": 2000000,
            "grok-4-1-fast": 2000000,
            "grok-4-fast": 2000000,
            "grok-4": 256000,
            "grok-code-fast": 256000,
            "grok-3": 131072,
            "grok-2": 131072,
            "grok-2-vision": 8192,
            "grok": 131072,
        }
        for key, value in expected.items():
            assert key in DEFAULT_CONTEXT_LENGTHS, f"{key} missing from DEFAULT_CONTEXT_LENGTHS"
            assert DEFAULT_CONTEXT_LENGTHS[key] == value, (
                f"{key} should be {value}, got {DEFAULT_CONTEXT_LENGTHS[key]}"
            )

    def test_grok_substring_matching(self):
        # Longest-first substring matching must resolve the real xAI model
        # IDs to the correct fallback entries without 128k probe-down.

        # Fake the provider/API/cache layers so the lookup falls through
        # to DEFAULT_CONTEXT_LENGTHS.
        with patch("agent.model_metadata.fetch_model_metadata", return_value={}), \
             patch("agent.model_metadata.fetch_endpoint_model_metadata", return_value={}), \
             patch("agent.model_metadata.get_cached_context_length", return_value=None):
            cases = [
                ("grok-4.20-0309-reasoning", 2000000),
                ("grok-4.20-0309-non-reasoning", 2000000),
                ("grok-4.20-multi-agent-0309", 2000000),
                ("grok-4-1-fast-reasoning", 2000000),
                ("grok-4-1-fast-non-reasoning", 2000000),
                ("grok-4-fast-reasoning", 2000000),
                ("grok-4-fast-non-reasoning", 2000000),
                ("grok-4", 256000),
                ("grok-4-0709", 256000),
                ("grok-code-fast-1", 256000),
                ("grok-3", 131072),
                ("grok-3-mini", 131072),
                ("grok-3-mini-fast", 131072),
                ("grok-2", 131072),
                ("grok-2-vision", 8192),
                ("grok-2-vision-1212", 8192),
                ("grok-beta", 131072),
            ]
            for model_id, expected_ctx in cases:
                actual = get_model_context_length(model_id)
                assert actual == expected_ctx, (
                    f"{model_id}: expected {expected_ctx}, got {actual}"
                )

    def test_deepseek_v4_models_1m_context(self):
        expected_keys = {
            "deepseek-v4-pro": 1_000_000,
            "deepseek-v4-flash": 1_000_000,
            "deepseek-chat": 1_000_000,
            "deepseek-reasoner": 1_000_000,
        }
        for key, value in expected_keys.items():
            assert key in DEFAULT_CONTEXT_LENGTHS, f"{key} missing"
            assert DEFAULT_CONTEXT_LENGTHS[key] == value, (
                f"{key} should be {value}, got {DEFAULT_CONTEXT_LENGTHS[key]}"
            )

        # Longest-first substring matching must resolve both the bare V4
        # ids (native DeepSeek) and the vendor-prefixed forms (OpenRouter
        # / Nous Portal) to 1M without probing down to the legacy 128K
        # ``deepseek`` substring fallback.
        with patch("agent.model_metadata.fetch_model_metadata", return_value={}), \
             patch("agent.model_metadata.fetch_endpoint_model_metadata", return_value={}), \
             patch("agent.model_metadata.get_cached_context_length", return_value=None):
            cases = [
                ("deepseek-v4-pro", 1_000_000),
                ("deepseek-v4-flash", 1_000_000),
                ("deepseek/deepseek-v4-pro", 1_000_000),
                ("deepseek/deepseek-v4-flash", 1_000_000),
                ("deepseek-chat", 1_000_000),
                ("deepseek-reasoner", 1_000_000),
            ]
            for model_id, expected_ctx in cases:
                actual = get_model_context_length(model_id)
                assert actual == expected_ctx, (
                    f"{model_id}: expected {expected_ctx}, got {actual}"
                )

    def test_all_values_positive(self):
        for key, value in DEFAULT_CONTEXT_LENGTHS.items():
            assert value > 0, f"{key} has non-positive context length"

    def test_dict_is_not_empty(self):
        assert len(DEFAULT_CONTEXT_LENGTHS) >= 10
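
# Longest-first substring fallback assumed by the Grok and DeepSeek cases
# above (an illustrative sketch, not the library's actual lookup): sorting
# candidate keys by length makes "grok-4-fast" win over "grok-4" and "grok"
# for an ID like "grok-4-fast-reasoning".
def _default_context_sketch(model_id):
    for key in sorted(DEFAULT_CONTEXT_LENGTHS, key=len, reverse=True):
        if key in model_id:
            return DEFAULT_CONTEXT_LENGTHS[key]
    return None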

# =========================================================================
# Codex OAuth context-window resolution (provider="openai-codex")
# =========================================================================

class TestCodexOAuthContextLength:
    """ChatGPT Codex OAuth imposes lower context limits than the direct
    OpenAI API for the same slugs. Verified Apr 2026 via live probe of
    chatgpt.com/backend-api/codex/models: every model returns 272k, while
    models.dev reports 1.05M for gpt-5.5/gpt-5.4 and 400k for the rest.
    """

    def setup_method(self):
        import agent.model_metadata as mm
        mm._codex_oauth_context_cache = {}
        mm._codex_oauth_context_cache_time = 0.0

    def test_fallback_table_used_without_token(self):
        """With no access token, the hardcoded Codex fallback table wins
        over models.dev (which reports 1.05M for gpt-5.5 but Codex is 272k).
        """
        with patch("agent.model_metadata.get_cached_context_length", return_value=None), \
             patch("agent.model_metadata.save_context_length"):
            for model in (
                "gpt-5.5",
                "gpt-5.4",
                "gpt-5.4-mini",
                "gpt-5.3-codex",
                "gpt-5.2-codex",
                "gpt-5.1-codex-max",
                "gpt-5.1-codex-mini",
            ):
                ctx = get_model_context_length(
                    model=model,
                    base_url="https://chatgpt.com/backend-api/codex",
                    api_key="",
                    provider="openai-codex",
                )
                assert ctx == 272_000, (
                    f"Codex {model}: expected 272000 fallback, got {ctx} "
                    "(models.dev leakage?)"
                )

    def test_live_probe_overrides_fallback(self):
        """When a token is provided, the live /models probe is preferred
        and its context_window drives the result."""
        fake_response = MagicMock()
        fake_response.status_code = 200
        fake_response.json.return_value = {
            "models": [
                {"slug": "gpt-5.5", "context_window": 300_000},
                {"slug": "gpt-5.4", "context_window": 400_000},
            ]
        }

        with patch("agent.model_metadata.requests.get", return_value=fake_response), \
             patch("agent.model_metadata.get_cached_context_length", return_value=None), \
             patch("agent.model_metadata.save_context_length"):
            ctx_55 = get_model_context_length(
                model="gpt-5.5",
                base_url="https://chatgpt.com/backend-api/codex",
                api_key="fake-token",
                provider="openai-codex",
            )
            ctx_54 = get_model_context_length(
                model="gpt-5.4",
                base_url="https://chatgpt.com/backend-api/codex",
                api_key="fake-token",
                provider="openai-codex",
            )
        assert ctx_55 == 300_000
        assert ctx_54 == 400_000

    def test_probe_failure_falls_back_to_hardcoded(self):
        """If the probe fails (non-200 / network error), we still return
        the hardcoded 272k rather than leaking through to models.dev 1.05M."""
        fake_response = MagicMock()
        fake_response.status_code = 401
        fake_response.json.return_value = {}

        with patch("agent.model_metadata.requests.get", return_value=fake_response), \
             patch("agent.model_metadata.get_cached_context_length", return_value=None), \
             patch("agent.model_metadata.save_context_length"):
            ctx = get_model_context_length(
                model="gpt-5.5",
                base_url="https://chatgpt.com/backend-api/codex",
                api_key="expired-token",
                provider="openai-codex",
            )
        assert ctx == 272_000

    def test_non_codex_providers_unaffected(self):
        """Resolving gpt-5.5 on non-Codex providers must NOT use the Codex
        272k override — OpenRouter / direct OpenAI API have different limits.
        """
        # OpenRouter — should hit its own catalog path first; when mocked
        # empty, falls through to hardcoded DEFAULT_CONTEXT_LENGTHS (1.05M,
        # matching the real direct-API value — Codex OAuth's 272k cap is
        # provider-specific and must not leak here).
        with patch("agent.model_metadata.fetch_model_metadata", return_value={}), \
             patch("agent.model_metadata.fetch_endpoint_model_metadata", return_value={}), \
             patch("agent.model_metadata.get_cached_context_length", return_value=None), \
             patch("agent.models_dev.lookup_models_dev_context", return_value=None):
            ctx = get_model_context_length(
                model="openai/gpt-5.5",
                base_url="https://openrouter.ai/api/v1",
                api_key="",
                provider="openrouter",
            )
        assert ctx == 1_050_000, (
            f"Non-Codex gpt-5.5 resolved to {ctx}; Codex 272k override "
            "leaked outside openai-codex provider"
        )

    def test_stale_codex_cache_over_400k_is_invalidated(self, tmp_path, monkeypatch):
        """Pre-PR #14935 builds cached gpt-5.5 at 1.05M (from models.dev)
        before the Codex-aware branch existed. Upgrading users keep that
        stale entry on disk and the cache-first lookup returns it forever.
        Codex OAuth caps at 272k for every slug, so any cached Codex
        entry >= 400k must be dropped and re-resolved via the live probe.
        """
        from agent import model_metadata as mm

        # Isolate the cache file to tmp_path
        cache_file = tmp_path / "context_length_cache.yaml"
        monkeypatch.setattr(mm, "_get_context_cache_path", lambda: cache_file)

        base_url = "https://chatgpt.com/backend-api/codex/"
        stale_key = f"gpt-5.5@{base_url}"
        other_key = "other-model@https://api.openai.com/v1/"
        cache_file.write_text(yaml.dump({"context_lengths": {
            stale_key: 1_050_000,   # stale pre-fix value
            other_key: 128_000,     # unrelated, must survive
        }}))

        fake_response = MagicMock()
        fake_response.status_code = 200
        fake_response.json.return_value = {
            "models": [{"slug": "gpt-5.5", "context_window": 272_000}]
        }

        with patch("agent.model_metadata.requests.get", return_value=fake_response), \
             patch("agent.model_metadata.save_context_length") as mock_save:
            ctx = mm.get_model_context_length(
                model="gpt-5.5",
                base_url=base_url,
                api_key="fake-token",
                provider="openai-codex",
            )

        assert ctx == 272_000, f"Stale entry should have been re-resolved to 272k, got {ctx}"
        # Live save was called with the fresh value
        mock_save.assert_called_with("gpt-5.5", base_url, 272_000)
        # The stale entry was removed from disk; unrelated entries survived
        remaining = yaml.safe_load(cache_file.read_text()).get("context_lengths", {})
        assert stale_key not in remaining, "Stale entry was not invalidated from the cache file"
        assert remaining.get(other_key) == 128_000, "Unrelated cache entries must not be touched"

    def test_fresh_codex_cache_under_400k_is_respected(self, tmp_path, monkeypatch):
        """Codex entries at the correct 272k must NOT be invalidated —
        only stale pre-fix values (>= 400k) get dropped."""
        from agent import model_metadata as mm

        cache_file = tmp_path / "context_length_cache.yaml"
        monkeypatch.setattr(mm, "_get_context_cache_path", lambda: cache_file)

        base_url = "https://chatgpt.com/backend-api/codex/"
        cache_file.write_text(yaml.dump({"context_lengths": {
            f"gpt-5.5@{base_url}": 272_000,
        }}))

        # If the invalidation incorrectly fired, this would be called; assert it isn't.
        with patch("agent.model_metadata.requests.get") as mock_get:
            ctx = mm.get_model_context_length(
                model="gpt-5.5",
                base_url=base_url,
                api_key="fake-token",
                provider="openai-codex",
            )
        assert ctx == 272_000
        mock_get.assert_not_called()

    def test_stale_invalidation_scoped_to_codex_provider(self, tmp_path, monkeypatch):
        """A cached 1M entry for a non-Codex provider (e.g. Anthropic opus on
        OpenRouter, legitimately 1M) must NOT be invalidated by this guard."""
        from agent import model_metadata as mm

        cache_file = tmp_path / "context_length_cache.yaml"
        monkeypatch.setattr(mm, "_get_context_cache_path", lambda: cache_file)

        base_url = "https://openrouter.ai/api/v1"
        cache_file.write_text(yaml.dump({"context_lengths": {
            f"anthropic/claude-opus-4.6@{base_url}": 1_000_000,
        }}))

        ctx = mm.get_model_context_length(
            model="anthropic/claude-opus-4.6",
            base_url=base_url,
            api_key="fake",
            provider="openrouter",
        )
        assert ctx == 1_000_000, "Non-codex 1M cache entries must be respected"
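
# Codex resolution order implied by the tests above (assumed flow, not the
# verified internals): persistent cache first — unless the cached Codex entry
# is a stale >= 400k value — then a live probe of <base_url>/models when an
# access token is present, then the hardcoded 272k fallback table. models.dev
# is never consulted for provider="openai-codex".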

# =========================================================================
# get_model_context_length — resolution order
# =========================================================================

class TestGetModelContextLength:
    @patch("agent.model_metadata.fetch_model_metadata")
    def test_known_model_from_api(self, mock_fetch):
        mock_fetch.return_value = {
            "test/model": {"context_length": 32000}
        }
        assert get_model_context_length("test/model") == 32000

    @patch("agent.model_metadata.fetch_model_metadata")
    def test_fallback_to_defaults(self, mock_fetch):
        mock_fetch.return_value = {}
        assert get_model_context_length("anthropic/claude-sonnet-4") == 200000

    @patch("agent.model_metadata.fetch_model_metadata")
    def test_unknown_model_returns_first_probe_tier(self, mock_fetch):
        mock_fetch.return_value = {}
        assert get_model_context_length("unknown/never-heard-of-this") == CONTEXT_PROBE_TIERS[0]

    @patch("agent.model_metadata.fetch_model_metadata")
    def test_partial_match_in_defaults(self, mock_fetch):
        mock_fetch.return_value = {}
        assert get_model_context_length("openai/gpt-4o") == 128000

    @patch("agent.model_metadata.fetch_model_metadata")
    def test_qwen3_coder_plus_context_length(self, mock_fetch):
        """qwen3-coder-plus has a 1M context window, not the generic 128K Qwen default."""
        mock_fetch.return_value = {}
        assert get_model_context_length("qwen3-coder-plus") == 1000000

    @patch("agent.model_metadata.fetch_model_metadata")
    def test_qwen3_coder_context_length(self, mock_fetch):
        """qwen3-coder has a 256K context window, not the generic 128K Qwen default."""
        mock_fetch.return_value = {}
        assert get_model_context_length("qwen3-coder") == 262144

    @patch("agent.model_metadata.fetch_model_metadata")
    def test_qwen_generic_context_length(self, mock_fetch):
        """Generic qwen models still get the 128K default."""
        mock_fetch.return_value = {}
        assert get_model_context_length("qwen3-plus") == 131072

    @patch("agent.model_metadata.fetch_model_metadata")
    def test_api_missing_context_length_key(self, mock_fetch):
        """Model in API but without context_length → defaults to the top
        probe tier (currently 256K)."""
        mock_fetch.return_value = {"test/model": {"name": "Test"}}
        assert get_model_context_length("test/model") == CONTEXT_PROBE_TIERS[0]

    @patch("agent.model_metadata.fetch_model_metadata")
    def test_cache_takes_priority_over_api(self, mock_fetch, tmp_path):
        """Persistent cache should be checked BEFORE API metadata."""
        mock_fetch.return_value = {"my/model": {"context_length": 999999}}
        cache_file = tmp_path / "cache.yaml"
        with patch("agent.model_metadata._get_context_cache_path", return_value=cache_file):
            save_context_length("my/model", "http://local", 32768)
            result = get_model_context_length("my/model", base_url="http://local")
            assert result == 32768  # cache wins over API's 999999

    @patch("agent.model_metadata.fetch_model_metadata")
    def test_no_base_url_skips_cache(self, mock_fetch, tmp_path):
        """Without base_url, cache lookup is skipped."""
        mock_fetch.return_value = {}
        cache_file = tmp_path / "cache.yaml"
        with patch("agent.model_metadata._get_context_cache_path", return_value=cache_file):
            save_context_length("custom/model", "http://local", 32768)
            # No base_url → cache skipped → falls to probe tier
            result = get_model_context_length("custom/model")
            assert result == CONTEXT_PROBE_TIERS[0]

    @patch("agent.model_metadata.fetch_model_metadata")
    @patch("agent.model_metadata.fetch_endpoint_model_metadata")
    def test_custom_endpoint_metadata_beats_fuzzy_default(self, mock_endpoint_fetch, mock_fetch):
        mock_fetch.return_value = {}
        mock_endpoint_fetch.return_value = {
            "zai-org/GLM-5-TEE": {"context_length": 65536}
        }

        result = get_model_context_length(
            "zai-org/GLM-5-TEE",
            base_url="https://llm.chutes.ai/v1",
            api_key="test-key",
        )

        assert result == 65536

    @patch("agent.model_metadata.fetch_model_metadata")
    @patch("agent.model_metadata.fetch_endpoint_model_metadata")
    def test_custom_endpoint_without_metadata_skips_name_based_default(self, mock_endpoint_fetch, mock_fetch):
        mock_fetch.return_value = {}
        mock_endpoint_fetch.return_value = {}

        result = get_model_context_length(
            "zai-org/GLM-5-TEE",
            base_url="https://llm.chutes.ai/v1",
            api_key="test-key",
        )

        assert result == CONTEXT_PROBE_TIERS[0]

    @patch("agent.model_metadata.fetch_model_metadata")
    @patch("agent.model_metadata.fetch_endpoint_model_metadata")
    def test_custom_endpoint_single_model_fallback(self, mock_endpoint_fetch, mock_fetch):
        """Single-model servers: use the only model even if name doesn't match."""
        mock_fetch.return_value = {}
        mock_endpoint_fetch.return_value = {
            "Qwen3.5-9B-Q4_K_M.gguf": {"context_length": 131072}
        }

        result = get_model_context_length(
            "qwen3.5:9b",
            base_url="http://myserver.example.com:8080/v1",
            api_key="test-key",
        )

        assert result == 131072

    @patch("agent.model_metadata.fetch_model_metadata")
    @patch("agent.model_metadata.fetch_endpoint_model_metadata")
    def test_custom_endpoint_fuzzy_substring_match(self, mock_endpoint_fetch, mock_fetch):
        """Fuzzy match: configured model name is substring of endpoint model."""
        mock_fetch.return_value = {}
        mock_endpoint_fetch.return_value = {
            "org/llama-3.3-70b-instruct-fp8": {"context_length": 131072},
            "org/qwen-2.5-72b": {"context_length": 32768},
        }

        result = get_model_context_length(
            "llama-3.3-70b-instruct",
            base_url="http://myserver.example.com:8080/v1",
            api_key="test-key",
        )

        assert result == 131072

    @patch("agent.model_metadata.fetch_model_metadata")
    def test_config_context_length_overrides_all(self, mock_fetch):
        """Explicit config_context_length takes priority over everything."""
        mock_fetch.return_value = {
            "test/model": {"context_length": 200000}
        }

        result = get_model_context_length(
            "test/model",
            config_context_length=65536,
        )

        assert result == 65536

    @patch("agent.model_metadata.fetch_model_metadata")
    def test_config_context_length_zero_is_ignored(self, mock_fetch):
        """config_context_length=0 should be treated as unset."""
        mock_fetch.return_value = {}

        result = get_model_context_length(
            "anthropic/claude-sonnet-4",
            config_context_length=0,
        )

        assert result == 200000

    @patch("agent.model_metadata.fetch_model_metadata")
    def test_config_context_length_none_is_ignored(self, mock_fetch):
        """config_context_length=None should be treated as unset."""
        mock_fetch.return_value = {}

        result = get_model_context_length(
            "anthropic/claude-sonnet-4",
            config_context_length=None,
        )

        assert result == 200000
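
# End-to-end resolution order implied by TestGetModelContextLength (a sketch
# of the assumed control flow, not the verified implementation):
#   1. explicit config_context_length, when truthy
#   2. persistent cache — only consulted when base_url is given
#   3. custom-endpoint /models metadata (exact id, single-model fallback,
#      fuzzy substring match)
#   4. catalog metadata from fetch_model_metadata
#   5. DEFAULT_CONTEXT_LENGTHS substring match
#   6. CONTEXT_PROBE_TIERS[0] as the final default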

# =========================================================================
# Bedrock context resolution — must run BEFORE custom-endpoint probe
# =========================================================================

class TestBedrockContextResolution:
    """Regression tests for Bedrock context-length resolution order.

    Bug: because ``bedrock-runtime.<region>.amazonaws.com`` is not listed in
    ``_URL_TO_PROVIDER``, ``_is_known_provider_base_url`` returned False and
    the custom-endpoint probe at step 2 ran first — fetching ``/models`` from
    Bedrock (which it doesn't serve), returning the 128K default-fallback
    before execution ever reached the Bedrock branch.

    Fix: promote the Bedrock branch ahead of the custom-endpoint probe.
    """

    @patch("agent.model_metadata.fetch_endpoint_model_metadata")
    def test_bedrock_provider_returns_static_table_before_probe(self, mock_fetch):
        """provider='bedrock' resolves via static table, bypasses /models probe."""
        ctx = get_model_context_length(
            "anthropic.claude-opus-4-v1:0",
            provider="bedrock",
            base_url="https://bedrock-runtime.us-east-1.amazonaws.com",
        )
        # Must return the static Bedrock table value (200K for Claude),
        # NOT DEFAULT_FALLBACK_CONTEXT (128K).
        assert ctx == 200000
        mock_fetch.assert_not_called()

    @patch("agent.model_metadata.fetch_endpoint_model_metadata")
    def test_bedrock_url_without_provider_hint(self, mock_fetch):
        """bedrock-runtime host infers Bedrock even when provider is omitted."""
        ctx = get_model_context_length(
            "anthropic.claude-sonnet-4-v1:0",
            base_url="https://bedrock-runtime.us-west-2.amazonaws.com",
        )
        assert ctx == 200000
        mock_fetch.assert_not_called()

    @patch("agent.model_metadata.fetch_endpoint_model_metadata")
    def test_non_bedrock_url_still_probes(self, mock_fetch):
        """Non-Bedrock hosts still reach the custom-endpoint probe."""
        mock_fetch.return_value = {"some-model": {"context_length": 50000}}
        ctx = get_model_context_length(
            "some-model",
            base_url="https://api.example.com/v1",
        )
        assert ctx == 50000
        assert mock_fetch.called


# =========================================================================
# _strip_provider_prefix — Ollama model:tag vs provider:model
# =========================================================================

class TestStripProviderPrefix:
    def test_known_provider_prefix_is_stripped(self):
        assert _strip_provider_prefix("local:my-model") == "my-model"
        assert _strip_provider_prefix("openrouter:anthropic/claude-sonnet-4") == "anthropic/claude-sonnet-4"
        assert _strip_provider_prefix("anthropic:claude-sonnet-4") == "claude-sonnet-4"
        assert _strip_provider_prefix("stepfun:step-3.5-flash") == "step-3.5-flash"

    def test_ollama_model_tag_preserved(self):
        """Ollama model:tag format must NOT be stripped."""
        assert _strip_provider_prefix("qwen3.5:27b") == "qwen3.5:27b"
        assert _strip_provider_prefix("llama3.3:70b") == "llama3.3:70b"
        assert _strip_provider_prefix("gemma2:9b") == "gemma2:9b"
        assert _strip_provider_prefix("codellama:13b-instruct-q4_0") == "codellama:13b-instruct-q4_0"

    def test_http_urls_preserved(self):
        assert _strip_provider_prefix("http://example.com") == "http://example.com"
        assert _strip_provider_prefix("https://example.com") == "https://example.com"

    def test_no_colon_returns_unchanged(self):
        assert _strip_provider_prefix("gpt-4o") == "gpt-4o"
        assert _strip_provider_prefix("anthropic/claude-sonnet-4") == "anthropic/claude-sonnet-4"

    @patch("agent.model_metadata.fetch_model_metadata")
    def test_ollama_model_tag_not_mangled_in_context_lookup(self, mock_fetch):
        """Ensure 'qwen3.5:27b' is NOT reduced to '27b' during context length lookup.

        We mock a custom endpoint that knows 'qwen3.5:27b' — the full name
        must reach the endpoint metadata lookup intact.
        """
        mock_fetch.return_value = {}
        with patch("agent.model_metadata.fetch_endpoint_model_metadata") as mock_ep, \
             patch("agent.model_metadata._is_custom_endpoint", return_value=True):
            mock_ep.return_value = {"qwen3.5:27b": {"context_length": 32768}}
            result = get_model_context_length(
                "qwen3.5:27b",
                base_url="http://localhost:11434/v1",
            )
        assert result == 32768
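
# Stripping rule implied by TestStripProviderPrefix (a sketch; the provider
# set here is illustrative, not the real list): only drop "name:" when name
# is a known provider, so Ollama "model:tag" IDs and URLs pass through.
_KNOWN_PROVIDERS_SKETCH = {"local", "openrouter", "anthropic", "stepfun"}

def _strip_provider_prefix_sketch(model):
    prefix, sep, rest = model.partition(":")
    if sep and prefix in _KNOWN_PROVIDERS_SKETCH:
        return rest
    return model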

# =========================================================================
# fetch_model_metadata — caching, TTL, slugs, failures
# =========================================================================

class TestFetchModelMetadata:
    def _reset_cache(self):
        import agent.model_metadata as mm
        mm._model_metadata_cache = {}
        mm._model_metadata_cache_time = 0

    @patch("agent.model_metadata.requests.get")
    def test_caches_result(self, mock_get):
        self._reset_cache()
        mock_response = MagicMock()
        mock_response.json.return_value = {
            "data": [{"id": "test/model", "context_length": 99999, "name": "Test"}]
        }
        mock_response.raise_for_status = MagicMock()
        mock_get.return_value = mock_response

        result1 = fetch_model_metadata(force_refresh=True)
        assert "test/model" in result1
        assert mock_get.call_count == 1

        result2 = fetch_model_metadata()
        assert "test/model" in result2
        assert mock_get.call_count == 1  # cached

    @patch("agent.model_metadata.requests.get")
    def test_api_failure_returns_empty_on_cold_cache(self, mock_get):
        self._reset_cache()
        mock_get.side_effect = Exception("Network error")
        result = fetch_model_metadata(force_refresh=True)
        assert result == {}

    @patch("agent.model_metadata.requests.get")
    def test_api_failure_returns_stale_cache(self, mock_get):
        """On API failure with existing cache, stale data is returned."""
        import agent.model_metadata as mm
        mm._model_metadata_cache = {"old/model": {"context_length": 50000}}
        mm._model_metadata_cache_time = 0  # expired

        mock_get.side_effect = Exception("Network error")
        result = fetch_model_metadata(force_refresh=True)
        assert "old/model" in result
        assert result["old/model"]["context_length"] == 50000

    @patch("agent.model_metadata.requests.get")
    def test_canonical_slug_aliasing(self, mock_get):
        """Models with canonical_slug get indexed under both IDs."""
        self._reset_cache()
        mock_response = MagicMock()
        mock_response.json.return_value = {
            "data": [{
                "id": "anthropic/claude-3.5-sonnet:beta",
                "canonical_slug": "anthropic/claude-3.5-sonnet",
                "context_length": 200000,
                "name": "Claude 3.5 Sonnet"
            }]
        }
        mock_response.raise_for_status = MagicMock()
        mock_get.return_value = mock_response

        result = fetch_model_metadata(force_refresh=True)
        # Both the original ID and canonical slug should work
        assert "anthropic/claude-3.5-sonnet:beta" in result
        assert "anthropic/claude-3.5-sonnet" in result
        assert result["anthropic/claude-3.5-sonnet"]["context_length"] == 200000

    @patch("agent.model_metadata.requests.get")
    def test_provider_prefixed_models_get_bare_aliases(self, mock_get):
        self._reset_cache()
        mock_response = MagicMock()
        mock_response.json.return_value = {
            "data": [{
                "id": "provider/test-model",
                "context_length": 123456,
                "name": "Provider: Test Model",
            }]
        }
        mock_response.raise_for_status = MagicMock()
        mock_get.return_value = mock_response

        result = fetch_model_metadata(force_refresh=True)

        assert result["provider/test-model"]["context_length"] == 123456
        assert result["test-model"]["context_length"] == 123456

    @patch("agent.model_metadata.requests.get")
    def test_ttl_expiry_triggers_refetch(self, mock_get):
        """Cache expires after _MODEL_CACHE_TTL seconds."""
        import agent.model_metadata as mm
        self._reset_cache()

        mock_response = MagicMock()
        mock_response.json.return_value = {
            "data": [{"id": "m1", "context_length": 1000, "name": "M1"}]
        }
        mock_response.raise_for_status = MagicMock()
        mock_get.return_value = mock_response

        fetch_model_metadata(force_refresh=True)
        assert mock_get.call_count == 1

        # Simulate TTL expiry
        mm._model_metadata_cache_time = time.time() - _MODEL_CACHE_TTL - 1
        fetch_model_metadata()
        assert mock_get.call_count == 2  # refetched

    @patch("agent.model_metadata.requests.get")
    def test_malformed_json_no_data_key(self, mock_get):
        """API returns JSON without 'data' key — empty cache, no crash."""
        self._reset_cache()
        mock_response = MagicMock()
        mock_response.json.return_value = {"error": "something"}
        mock_response.raise_for_status = MagicMock()
        mock_get.return_value = mock_response

        result = fetch_model_metadata(force_refresh=True)
        assert result == {}
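
# The TTL-cache behaviour exercised above, as a standalone sketch (assumed
# shape of the module-level cache, not the actual code): serve from cache
# inside the TTL, refetch when forced or expired, keep stale data on failure.
def _fetch_with_ttl_sketch(fetch, state, ttl=_MODEL_CACHE_TTL, force=False):
    now = time.time()
    if not force and state["cache"] and now - state["time"] < ttl:
        return state["cache"]
    try:
        state["cache"] = fetch()
        state["time"] = now
    except Exception:
        pass  # keep whatever is cached (possibly stale, possibly {})
    return state["cache"]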

# =========================================================================
# Context probe tiers
# =========================================================================

class TestContextProbeTiers:
    def test_tiers_descending(self):
        for i in range(len(CONTEXT_PROBE_TIERS) - 1):
            assert CONTEXT_PROBE_TIERS[i] > CONTEXT_PROBE_TIERS[i + 1]

    def test_first_tier_is_256k(self):
        assert CONTEXT_PROBE_TIERS[0] == 256_000

    def test_last_tier_is_8k(self):
        assert CONTEXT_PROBE_TIERS[-1] == 8_000


class TestGetNextProbeTier:
    def test_from_256k(self):
        assert get_next_probe_tier(256_000) == 128_000

    def test_from_128k(self):
        assert get_next_probe_tier(128_000) == 64_000

    def test_from_64k(self):
        assert get_next_probe_tier(64_000) == 32_000

    def test_from_32k(self):
        assert get_next_probe_tier(32_000) == 16_000

    def test_from_8k_returns_none(self):
        assert get_next_probe_tier(8_000) is None

    def test_from_below_min_returns_none(self):
        assert get_next_probe_tier(4_000) is None

    def test_from_arbitrary_value(self):
        assert get_next_probe_tier(100_000) == 64_000

    def test_above_max_tier(self):
        """Value above 256K should return 256K."""
        assert get_next_probe_tier(500_000) == 256_000

    def test_zero_returns_none(self):
        assert get_next_probe_tier(0) is None
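
# Tier walk implied by the boundary cases above (a sketch; assumes the tiers
# are exactly 256k/128k/64k/32k/16k/8k): return the largest tier strictly
# below `current`, or None at or below the smallest tier.
def _next_probe_tier_sketch(current):
    for tier in CONTEXT_PROBE_TIERS:  # descending
        if tier < current:
            return tier
    return None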

# =========================================================================
# Error message parsing
# =========================================================================

class TestParseContextLimitFromError:
    def test_openai_format(self):
        msg = "This model's maximum context length is 32768 tokens. However, your messages resulted in 45000 tokens."
        assert parse_context_limit_from_error(msg) == 32768

    def test_context_length_exceeded(self):
        msg = "context_length_exceeded: maximum context length is 131072"
        assert parse_context_limit_from_error(msg) == 131072

    def test_context_size_exceeded(self):
        msg = "Maximum context size 65536 exceeded"
        assert parse_context_limit_from_error(msg) == 65536

    def test_no_limit_in_message(self):
        assert parse_context_limit_from_error("Something went wrong with the API") is None

    def test_unreasonable_small_number_rejected(self):
        assert parse_context_limit_from_error("context length is 42 tokens") is None

    def test_ollama_format(self):
        msg = "Context size has been exceeded. Maximum context size is 32768"
        assert parse_context_limit_from_error(msg) == 32768

    def test_anthropic_format(self):
        msg = "prompt is too long: 250000 tokens > 200000 maximum"
        # Should extract 200000 (the limit), not 250000 (the input size)
        assert parse_context_limit_from_error(msg) == 200000

    def test_lmstudio_format(self):
        msg = "Error: context window of 4096 tokens exceeded"
        assert parse_context_limit_from_error(msg) == 4096

    def test_minimax_delta_only_message_returns_none(self):
        msg = "invalid params, context window exceeds limit (2013)"
        assert parse_context_limit_from_error(msg) is None

    def test_completely_unrelated_error(self):
        assert parse_context_limit_from_error("Invalid API key") is None

    def test_empty_string(self):
        assert parse_context_limit_from_error("") is None

    def test_number_outside_reasonable_range(self):
        """Very large number (>10M) should be rejected."""
        msg = "maximum context length is 99999999999"
        assert parse_context_limit_from_error(msg) is None
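
# Rough shape of the parser exercised above (a sketch — these patterns and
# plausibility bounds are assumptions, not the real implementation): try
# provider-specific regexes in order, then sanity-check the number.
import re

_LIMIT_PATTERNS_SKETCH = [
    r"maximum context (?:length|size) (?:is )?(\d+)",  # OpenAI / Ollama
    r">\s*(\d+)\s+maximum",                            # Anthropic "x > y maximum"
    r"context window of (\d+) tokens",                 # LM Studio
]

def _parse_context_limit_sketch(message):
    for pattern in _LIMIT_PATTERNS_SKETCH:
        m = re.search(pattern, message, re.IGNORECASE)
        if m:
            n = int(m.group(1))
            if 2048 <= n <= 10_000_000:  # reject implausible limits
                return n
    return None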

# =========================================================================
# Persistent context length cache
# =========================================================================

class TestContextLengthCache:
    def test_save_and_load(self, tmp_path):
        cache_file = tmp_path / "cache.yaml"
        with patch("agent.model_metadata._get_context_cache_path", return_value=cache_file):
            save_context_length("test/model", "http://localhost:8080/v1", 32768)
            assert get_cached_context_length("test/model", "http://localhost:8080/v1") == 32768

    def test_missing_cache_returns_none(self, tmp_path):
        cache_file = tmp_path / "nonexistent.yaml"
        with patch("agent.model_metadata._get_context_cache_path", return_value=cache_file):
            assert get_cached_context_length("test/model", "http://x") is None

    def test_multiple_models_cached(self, tmp_path):
        cache_file = tmp_path / "cache.yaml"
        with patch("agent.model_metadata._get_context_cache_path", return_value=cache_file):
            save_context_length("model-a", "http://a", 64000)
            save_context_length("model-b", "http://b", 128000)
            assert get_cached_context_length("model-a", "http://a") == 64000
            assert get_cached_context_length("model-b", "http://b") == 128000

    def test_same_model_different_providers(self, tmp_path):
        cache_file = tmp_path / "cache.yaml"
        with patch("agent.model_metadata._get_context_cache_path", return_value=cache_file):
            save_context_length("llama-3", "http://local:8080", 32768)
            save_context_length("llama-3", "https://openrouter.ai/api/v1", 131072)
            assert get_cached_context_length("llama-3", "http://local:8080") == 32768
            assert get_cached_context_length("llama-3", "https://openrouter.ai/api/v1") == 131072

    def test_idempotent_save(self, tmp_path):
        cache_file = tmp_path / "cache.yaml"
        with patch("agent.model_metadata._get_context_cache_path", return_value=cache_file):
            save_context_length("model", "http://x", 32768)
            save_context_length("model", "http://x", 32768)
            with open(cache_file) as f:
                data = yaml.safe_load(f)
            assert len(data["context_lengths"]) == 1

    def test_update_existing_value(self, tmp_path):
        """Saving a different value for the same key overwrites it."""
        cache_file = tmp_path / "cache.yaml"
        with patch("agent.model_metadata._get_context_cache_path", return_value=cache_file):
            save_context_length("model", "http://x", 128000)
            save_context_length("model", "http://x", 64000)
            assert get_cached_context_length("model", "http://x") == 64000

    def test_corrupted_yaml_returns_empty(self, tmp_path):
        """Corrupted cache file is handled gracefully."""
        cache_file = tmp_path / "cache.yaml"
        cache_file.write_text("{{{{not valid yaml: [[[")
        with patch("agent.model_metadata._get_context_cache_path", return_value=cache_file):
            assert get_cached_context_length("model", "http://x") is None

    def test_wrong_structure_returns_none(self, tmp_path):
        """YAML that loads but has wrong structure."""
        cache_file = tmp_path / "cache.yaml"
        cache_file.write_text("just_a_string\n")
        with patch("agent.model_metadata._get_context_cache_path", return_value=cache_file):
            assert get_cached_context_length("model", "http://x") is None

    @patch("agent.model_metadata.fetch_model_metadata")
    def test_cached_value_takes_priority(self, mock_fetch, tmp_path):
        mock_fetch.return_value = {}
        cache_file = tmp_path / "cache.yaml"
        with patch("agent.model_metadata._get_context_cache_path", return_value=cache_file):
            save_context_length("unknown/model", "http://local", 65536)
            assert get_model_context_length("unknown/model", base_url="http://local") == 65536

    def test_special_chars_in_model_name(self, tmp_path):
        """Model names with colons, slashes, etc. don't break the cache."""
        cache_file = tmp_path / "cache.yaml"
        model = "anthropic/claude-3.5-sonnet:beta"
        url = "https://api.example.com/v1"
        with patch("agent.model_metadata._get_context_cache_path", return_value=cache_file):
            save_context_length(model, url, 200000)
            assert get_cached_context_length(model, url) == 200000
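
# On-disk schema implied by the cache tests above (keys assumed to be
# "model@base_url", as exercised by the Codex stale-cache tests):
#
#   context_lengths:
#     "llama-3@http://local:8080": 32768
#     "llama-3@https://openrouter.ai/api/v1": 131072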