tests/run_agent/test_run_agent_codex_responses.py
   1  import sys
   2  import types
   3  from types import SimpleNamespace
   4  
   5  import pytest
   6  
   7  
   8  sys.modules.setdefault("fire", types.SimpleNamespace(Fire=lambda *a, **k: None))
   9  sys.modules.setdefault("firecrawl", types.SimpleNamespace(Firecrawl=object))
  10  sys.modules.setdefault("fal_client", types.SimpleNamespace())
  11  
  12  import run_agent
  13  
  14  
  15  @pytest.fixture(autouse=True)
  16  def _no_codex_backoff(monkeypatch):
  17      """Short-circuit retry backoff so Codex retry tests don't block on real
  18      wall-clock waits (5s jittered_backoff base delay + tight time.sleep loop)."""
  19      import time as _time
  20      monkeypatch.setattr(run_agent, "jittered_backoff", lambda *a, **k: 0.0)
  21      monkeypatch.setattr(_time, "sleep", lambda *_a, **_k: None)
  22  
  23  
  24  def _patch_agent_bootstrap(monkeypatch):
  25      monkeypatch.setattr(
  26          run_agent,
  27          "get_tool_definitions",
  28          lambda **kwargs: [
  29              {
  30                  "type": "function",
  31                  "function": {
  32                      "name": "terminal",
  33                      "description": "Run shell commands.",
  34                      "parameters": {"type": "object", "properties": {}},
  35                  },
  36              }
  37          ],
  38      )
  39      monkeypatch.setattr(run_agent, "check_toolset_requirements", lambda: {})
  40  
  41  
  42  def _build_agent(monkeypatch):
  43      _patch_agent_bootstrap(monkeypatch)
  44  
  45      agent = run_agent.AIAgent(
  46          model="gpt-5-codex",
  47          base_url="https://chatgpt.com/backend-api/codex",
  48          api_key="codex-token",
  49          quiet_mode=True,
  50          max_iterations=4,
  51          skip_context_files=True,
  52          skip_memory=True,
  53      )
  54      agent._cleanup_task_resources = lambda task_id: None
  55      agent._persist_session = lambda messages, history=None: None
  56      agent._save_trajectory = lambda messages, user_message, completed: None
  57      agent._save_session_log = lambda messages: None
  58      return agent
  59  
  60  
  61  def _build_copilot_agent(monkeypatch, *, model="gpt-5.4"):
  62      _patch_agent_bootstrap(monkeypatch)
  63  
  64      agent = run_agent.AIAgent(
  65          model=model,
  66          provider="copilot",
  67          api_mode="codex_responses",
  68          base_url="https://api.githubcopilot.com",
  69          api_key="gh-token",
  70          quiet_mode=True,
  71          max_iterations=4,
  72          skip_context_files=True,
  73          skip_memory=True,
  74      )
  75      agent._cleanup_task_resources = lambda task_id: None
  76      agent._persist_session = lambda messages, history=None: None
  77      agent._save_trajectory = lambda messages, user_message, completed: None
  78      agent._save_session_log = lambda messages: None
  79      return agent
  80  
  81  
  82  def _codex_message_response(text: str):
  83      return SimpleNamespace(
  84          output=[
  85              SimpleNamespace(
  86                  type="message",
  87                  content=[SimpleNamespace(type="output_text", text=text)],
  88              )
  89          ],
  90          usage=SimpleNamespace(input_tokens=5, output_tokens=3, total_tokens=8),
  91          status="completed",
  92          model="gpt-5-codex",
  93      )
  94  
  95  
  96  def _codex_tool_call_response():
  97      return SimpleNamespace(
  98          output=[
  99              SimpleNamespace(
 100                  type="function_call",
 101                  id="fc_1",
 102                  call_id="call_1",
 103                  name="terminal",
 104                  arguments="{}",
 105              )
 106          ],
 107          usage=SimpleNamespace(input_tokens=12, output_tokens=4, total_tokens=16),
 108          status="completed",
 109          model="gpt-5-codex",
 110      )
 111  
 112  
 113  def _codex_incomplete_message_response(text: str):
 114      return SimpleNamespace(
 115          output=[
 116              SimpleNamespace(
 117                  type="message",
 118                  status="in_progress",
 119                  content=[SimpleNamespace(type="output_text", text=text)],
 120              )
 121          ],
 122          usage=SimpleNamespace(input_tokens=4, output_tokens=2, total_tokens=6),
 123          status="in_progress",
 124          model="gpt-5-codex",
 125      )
 126  
 127  
 128  def _codex_commentary_message_response(text: str):
 129      return SimpleNamespace(
 130          output=[
 131              SimpleNamespace(
 132                  type="message",
 133                  phase="commentary",
 134                  status="completed",
 135                  content=[SimpleNamespace(type="output_text", text=text)],
 136              )
 137          ],
 138          usage=SimpleNamespace(input_tokens=4, output_tokens=2, total_tokens=6),
 139          status="completed",
 140          model="gpt-5-codex",
 141      )
 142  
 143  
 144  def _codex_ack_message_response(text: str):
 145      return SimpleNamespace(
 146          output=[
 147              SimpleNamespace(
 148                  type="message",
 149                  status="completed",
 150                  content=[SimpleNamespace(type="output_text", text=text)],
 151              )
 152          ],
 153          usage=SimpleNamespace(input_tokens=4, output_tokens=2, total_tokens=6),
 154          status="completed",
 155          model="gpt-5-codex",
 156      )
 157  
 158  
 159  class _FakeResponsesStream:
 160      def __init__(self, *, final_response=None, final_error=None):
 161          self._final_response = final_response
 162          self._final_error = final_error
 163  
 164      def __enter__(self):
 165          return self
 166  
 167      def __exit__(self, exc_type, exc, tb):
 168          return False
 169  
 170      def __iter__(self):
 171          return iter(())
 172  
 173      def get_final_response(self):
 174          if self._final_error is not None:
 175              raise self._final_error
 176          return self._final_response
 177  
 178  
 179  class _FakeCreateStream:
 180      def __init__(self, events):
 181          self._events = list(events)
 182          self.closed = False
 183  
 184      def __iter__(self):
 185          return iter(self._events)
 186  
 187      def close(self):
 188          self.closed = True
 189  
 190  
 191  def _codex_request_kwargs():
 192      return {
 193          "model": "gpt-5-codex",
 194          "instructions": "You are Hermes.",
 195          "input": [{"role": "user", "content": "Ping"}],
 196          "tools": None,
 197          "store": False,
 198      }
 199  
 200  
 201  def test_api_mode_uses_explicit_provider_when_codex(monkeypatch):
 202      _patch_agent_bootstrap(monkeypatch)
 203      agent = run_agent.AIAgent(
 204          model="gpt-5-codex",
 205          base_url="https://openrouter.ai/api/v1",
 206          provider="openai-codex",
 207          api_key="codex-token",
 208          quiet_mode=True,
 209          max_iterations=1,
 210          skip_context_files=True,
 211          skip_memory=True,
 212      )
 213      assert agent.api_mode == "codex_responses"
 214      assert agent.provider == "openai-codex"
 215  
 216  
 217  def test_api_mode_normalizes_provider_case(monkeypatch):
 218      _patch_agent_bootstrap(monkeypatch)
 219      agent = run_agent.AIAgent(
 220          model="gpt-5-codex",
 221          base_url="https://openrouter.ai/api/v1",
 222          provider="OpenAI-Codex",
 223          api_key="codex-token",
 224          quiet_mode=True,
 225          max_iterations=1,
 226          skip_context_files=True,
 227          skip_memory=True,
 228      )
 229      assert agent.provider == "openai-codex"
 230      assert agent.api_mode == "codex_responses"
 231  
 232  
 233  def test_api_mode_respects_explicit_openrouter_provider_over_codex_url(monkeypatch):
 234      """GPT-5.x models need codex_responses even on OpenRouter.
 235  
 236      OpenRouter rejects GPT-5 models on /v1/chat/completions with
 237      ``unsupported_api_for_model``.  The model-level check overrides
 238      the provider default.
 239      """
 240      _patch_agent_bootstrap(monkeypatch)
 241      agent = run_agent.AIAgent(
 242          model="gpt-5-codex",
 243          base_url="https://chatgpt.com/backend-api/codex",
 244          provider="openrouter",
 245          api_key="test-token",
 246          quiet_mode=True,
 247          max_iterations=1,
 248          skip_context_files=True,
 249          skip_memory=True,
 250      )
 251      assert agent.api_mode == "codex_responses"
 252      assert agent.provider == "openrouter"
 253  
 254  
 255  def test_copilot_acp_stays_on_chat_completions_for_gpt_5_models(monkeypatch):
 256      _patch_agent_bootstrap(monkeypatch)
 257      agent = run_agent.AIAgent(
 258          model="gpt-5.4-mini",
 259          base_url="acp://copilot",
 260          provider="copilot-acp",
 261          api_key="copilot-acp",
 262          quiet_mode=True,
 263          max_iterations=1,
 264          skip_context_files=True,
 265          skip_memory=True,
 266      )
 267      assert agent.provider == "copilot-acp"
 268      assert agent.api_mode == "chat_completions"
 269  
 270  
 271  def test_copilot_gpt_5_mini_stays_on_chat_completions(monkeypatch):
 272      _patch_agent_bootstrap(monkeypatch)
 273      agent = run_agent.AIAgent(
 274          model="gpt-5-mini",
 275          base_url="https://api.githubcopilot.com",
 276          provider="copilot",
 277          api_key="gh-token",
 278          api_mode="chat_completions",
 279          quiet_mode=True,
 280          max_iterations=1,
 281          skip_context_files=True,
 282          skip_memory=True,
 283      )
 284      assert agent.provider == "copilot"
 285      assert agent.api_mode == "chat_completions"
 286  
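
# The api_mode tests above pin down how the agent picks its wire protocol.  A
# minimal sketch of that resolution, assuming the ordering implied by the
# assertions (the helper name below is hypothetical; the real logic lives in
# run_agent.AIAgent.__init__ and may differ in detail):
def _sketch_resolve_api_mode(model, provider=None, api_mode=None):
    provider = (provider or "").lower()        # providers compare case-insensitively
    if api_mode:                               # an explicit api_mode always wins
        return api_mode
    if provider == "copilot-acp":              # ACP bridge stays on chat completions
        return "chat_completions"
    if provider == "openai-codex":             # Codex provider defaults to Responses
        return "codex_responses"
    if model.startswith("gpt-5"):              # model-level override, e.g. on OpenRouter
        return "codex_responses"
    return "chat_completions"
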
 287  
 288  def test_build_api_kwargs_codex(monkeypatch):
 289      agent = _build_agent(monkeypatch)
 290      kwargs = agent._build_api_kwargs(
 291          [
 292              {"role": "system", "content": "You are Hermes."},
 293              {"role": "user", "content": "Ping"},
 294          ]
 295      )
 296  
 297      assert kwargs["model"] == "gpt-5-codex"
 298      assert kwargs["instructions"] == "You are Hermes."
 299      assert kwargs["store"] is False
 300      assert isinstance(kwargs["input"], list)
 301      assert kwargs["input"][0]["role"] == "user"
 302      assert kwargs["tools"][0]["type"] == "function"
 303      assert kwargs["tools"][0]["name"] == "terminal"
 304      assert kwargs["tools"][0]["strict"] is False
 305      assert "function" not in kwargs["tools"][0]
 307      assert kwargs["tool_choice"] == "auto"
 308      assert kwargs["parallel_tool_calls"] is True
 309      assert isinstance(kwargs["prompt_cache_key"], str)
 310      assert len(kwargs["prompt_cache_key"]) > 0
 311      assert "timeout" not in kwargs
 312      assert "max_tokens" not in kwargs
 313      assert "extra_body" not in kwargs
 314  
 315  
 316  def test_build_api_kwargs_codex_clamps_minimal_effort(monkeypatch):
 317      """'minimal' reasoning effort is clamped to 'low' on the Responses API.
 318  
 319      GPT-5.4 supports none/low/medium/high/xhigh but NOT 'minimal'.
 320      Users may configure 'minimal' via OpenRouter conventions, so the Codex
 321      Responses path must clamp it to the nearest supported level.
 322      """
 323      _patch_agent_bootstrap(monkeypatch)
 324  
 325      agent = run_agent.AIAgent(
 326          model="gpt-5-codex",
 327          base_url="https://chatgpt.com/backend-api/codex",
 328          api_key="codex-token",
 329          quiet_mode=True,
 330          max_iterations=4,
 331          skip_context_files=True,
 332          skip_memory=True,
 333          reasoning_config={"enabled": True, "effort": "minimal"},
 334      )
 335      agent._cleanup_task_resources = lambda task_id: None
 336      agent._persist_session = lambda messages, history=None: None
 337      agent._save_trajectory = lambda messages, user_message, completed: None
 338      agent._save_session_log = lambda messages: None
 339  
 340      kwargs = agent._build_api_kwargs(
 341          [
 342              {"role": "system", "content": "You are Hermes."},
 343              {"role": "user", "content": "Ping"},
 344          ]
 345      )
 346  
 347      assert kwargs["reasoning"]["effort"] == "low"
 348  
 349  
 350  def test_build_api_kwargs_codex_preserves_supported_efforts(monkeypatch):
 351      """Effort levels natively supported by the Responses API pass through unchanged."""
 352      _patch_agent_bootstrap(monkeypatch)
 353  
 354      for effort in ("low", "medium", "high", "xhigh"):
 355          agent = run_agent.AIAgent(
 356              model="gpt-5-codex",
 357              base_url="https://chatgpt.com/backend-api/codex",
 358              api_key="codex-token",
 359              quiet_mode=True,
 360              max_iterations=4,
 361              skip_context_files=True,
 362              skip_memory=True,
 363              reasoning_config={"enabled": True, "effort": effort},
 364          )
 365          agent._cleanup_task_resources = lambda task_id: None
 366          agent._persist_session = lambda messages, history=None: None
 367          agent._save_trajectory = lambda messages, user_message, completed: None
 368          agent._save_session_log = lambda messages: None
 369  
 370          kwargs = agent._build_api_kwargs(
 371              [
 372                  {"role": "system", "content": "sys"},
 373                  {"role": "user", "content": "hi"},
 374              ]
 375          )
 376          assert kwargs["reasoning"]["effort"] == effort, f"{effort} should pass through unchanged"
 377  
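
# Sketch of the clamp the two reasoning-effort tests above rely on.  Assumed
# behaviour only: the docstrings guarantee that 'minimal' maps to 'low' and
# that the levels the Responses API accepts (none/low/medium/high/xhigh) pass
# through untouched; the helper name is hypothetical.
def _sketch_clamp_reasoning_effort(effort):
    if effort == "minimal":        # OpenRouter-style level the Responses API rejects
        return "low"
    return effort                  # supported levels pass through unchanged
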
 378  
 379  def test_build_api_kwargs_copilot_responses_omits_openai_only_fields(monkeypatch):
 380      agent = _build_copilot_agent(monkeypatch)
 381      kwargs = agent._build_api_kwargs([{"role": "user", "content": "hi"}])
 382  
 383      assert kwargs["model"] == "gpt-5.4"
 384      assert kwargs["store"] is False
 385      assert kwargs["tool_choice"] == "auto"
 386      assert kwargs["parallel_tool_calls"] is True
 387      assert kwargs["reasoning"] == {"effort": "medium"}
 388      assert "prompt_cache_key" not in kwargs
 389      assert "include" not in kwargs
 390  
 391  
 392  def test_build_api_kwargs_copilot_responses_omits_reasoning_for_non_reasoning_model(monkeypatch):
 393      agent = _build_copilot_agent(monkeypatch, model="gpt-4.1")
 394      kwargs = agent._build_api_kwargs([{"role": "user", "content": "hi"}])
 395  
 396      assert "reasoning" not in kwargs
 397      assert "include" not in kwargs
 398      assert "prompt_cache_key" not in kwargs
 399  
 400  
 401  def test_run_codex_stream_retries_when_completed_event_missing(monkeypatch):
 402      agent = _build_agent(monkeypatch)
 403      calls = {"stream": 0}
 404  
 405      def _fake_stream(**kwargs):
 406          calls["stream"] += 1
 407          if calls["stream"] == 1:
 408              return _FakeResponsesStream(
 409                  final_error=RuntimeError("Didn't receive a `response.completed` event.")
 410              )
 411          return _FakeResponsesStream(final_response=_codex_message_response("stream ok"))
 412  
 413      agent.client = SimpleNamespace(
 414          responses=SimpleNamespace(
 415              stream=_fake_stream,
 416              create=lambda **kwargs: _codex_message_response("fallback"),
 417          )
 418      )
 419  
 420      response = agent._run_codex_stream(_codex_request_kwargs())
 421      assert calls["stream"] == 2
 422      assert response.output[0].content[0].text == "stream ok"
 423  
 424  
 425  def test_run_codex_stream_falls_back_to_create_after_stream_completion_error(monkeypatch):
 426      agent = _build_agent(monkeypatch)
 427      calls = {"stream": 0, "create": 0}
 428  
 429      def _fake_stream(**kwargs):
 430          calls["stream"] += 1
 431          return _FakeResponsesStream(
 432              final_error=RuntimeError("Didn't receive a `response.completed` event.")
 433          )
 434  
 435      def _fake_create(**kwargs):
 436          calls["create"] += 1
 437          return _codex_message_response("create fallback ok")
 438  
 439      agent.client = SimpleNamespace(
 440          responses=SimpleNamespace(
 441              stream=_fake_stream,
 442              create=_fake_create,
 443          )
 444      )
 445  
 446      response = agent._run_codex_stream(_codex_request_kwargs())
 447      assert calls["stream"] == 2
 448      assert calls["create"] == 1
 449      assert response.output[0].content[0].text == "create fallback ok"
 450  
 451  
 452  def test_run_codex_stream_fallback_parses_create_stream_events(monkeypatch):
 453      agent = _build_agent(monkeypatch)
 454      calls = {"stream": 0, "create": 0}
 455      create_stream = _FakeCreateStream(
 456          [
 457              SimpleNamespace(type="response.created"),
 458              SimpleNamespace(type="response.in_progress"),
 459              SimpleNamespace(type="response.completed", response=_codex_message_response("streamed create ok")),
 460          ]
 461      )
 462  
 463      def _fake_stream(**kwargs):
 464          calls["stream"] += 1
 465          return _FakeResponsesStream(
 466              final_error=RuntimeError("Didn't receive a `response.completed` event.")
 467          )
 468  
 469      def _fake_create(**kwargs):
 470          calls["create"] += 1
 471          assert kwargs.get("stream") is True
 472          return create_stream
 473  
 474      agent.client = SimpleNamespace(
 475          responses=SimpleNamespace(
 476              stream=_fake_stream,
 477              create=_fake_create,
 478          )
 479      )
 480  
 481      response = agent._run_codex_stream(_codex_request_kwargs())
 482      assert calls["stream"] == 2
 483      assert calls["create"] == 1
 484      assert create_stream.closed is True
 485      assert response.output[0].content[0].text == "streamed create ok"
 486  
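
# Rough control flow assumed by the three _run_codex_stream tests above; the
# retry budget and fallback kwargs are read off the assertions, not the
# production code, and the helper below is a sketch only.
def _sketch_run_codex_stream(responses_api, kwargs):
    for _attempt in range(2):                         # stream once, retry once
        try:
            with responses_api.stream(**kwargs) as stream:
                return stream.get_final_response()
        except RuntimeError:                          # "Didn't receive a `response.completed` event."
            continue
    result = responses_api.create(stream=True, **kwargs)   # final fallback
    if hasattr(result, "output"):                     # provider returned a plain response
        return result
    final = None
    try:                                              # create() returned an event stream
        for event in result:
            if getattr(event, "type", "") == "response.completed":
                final = event.response
    finally:
        result.close()
    return final
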
 487  
 488  def test_run_conversation_codex_plain_text(monkeypatch):
 489      agent = _build_agent(monkeypatch)
 490      monkeypatch.setattr(agent, "_interruptible_api_call", lambda api_kwargs: _codex_message_response("OK"))
 491  
 492      result = agent.run_conversation("Say OK")
 493  
 494      assert result["completed"] is True
 495      assert result["final_response"] == "OK"
 496      assert result["messages"][-1]["role"] == "assistant"
 497      assert result["messages"][-1]["content"] == "OK"
 498  
 499  
 500  def test_run_conversation_codex_empty_output_with_output_text(monkeypatch):
 501      """Regression: empty response.output + valid output_text should succeed,
 502      not trigger retry/fallback. The validation stage must defer to
 503      _normalize_codex_response which synthesizes output from output_text."""
 504      agent = _build_agent(monkeypatch)
 505  
 506      def _empty_output_response(api_kwargs):
 507          return SimpleNamespace(
 508              output=[],
 509              output_text="Hello from Codex",
 510              usage=SimpleNamespace(input_tokens=5, output_tokens=3, total_tokens=8),
 511              status="completed",
 512              model="gpt-5-codex",
 513          )
 514  
 515      monkeypatch.setattr(agent, "_interruptible_api_call", _empty_output_response)
 516  
 517      result = agent.run_conversation("Say hello")
 518  
 519      assert result["completed"] is True
 520      assert result["final_response"] == "Hello from Codex"
 521  
 522  
 523  def test_run_conversation_codex_empty_output_no_output_text_retries(monkeypatch):
 524      """When both output and output_text are empty, validation should
 525      correctly mark the response as invalid and trigger retry."""
 526      agent = _build_agent(monkeypatch)
 527      calls = {"api": 0}
 528  
 529      def _fake_api_call(api_kwargs):
 530          calls["api"] += 1
 531          if calls["api"] == 1:
 532              return SimpleNamespace(
 533                  output=[],
 534                  output_text=None,
 535                  usage=SimpleNamespace(input_tokens=5, output_tokens=3, total_tokens=8),
 536                  status="completed",
 537                  model="gpt-5-codex",
 538              )
 539          return _codex_message_response("Recovered")
 540  
 541      monkeypatch.setattr(agent, "_interruptible_api_call", _fake_api_call)
 542  
 543      result = agent.run_conversation("Say hello")
 544  
 545      assert calls["api"] >= 2
 546      assert result["completed"] is True
 547      assert result["final_response"] == "Recovered"
 548  
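
# Hedged sketch of the empty-output handling the two tests above describe:
# when response.output is empty but output_text carries text, a message item
# is assumed to be synthesized from it; when both are empty the response is
# treated as invalid and retried.  Helper name and exact shape are assumptions.
def _sketch_synthesize_output_items(response):
    if getattr(response, "output", None):
        return response.output
    text = getattr(response, "output_text", None)
    if not text:
        return []                                 # genuinely empty -> caller retries
    return [
        SimpleNamespace(
            type="message",
            status="completed",
            content=[SimpleNamespace(type="output_text", text=text)],
        )
    ]
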
 549  
 550  def test_run_conversation_codex_refreshes_after_401_and_retries(monkeypatch):
 551      agent = _build_agent(monkeypatch)
 552      calls = {"api": 0, "refresh": 0}
 553  
 554      class _UnauthorizedError(RuntimeError):
 555          def __init__(self):
 556              super().__init__("Error code: 401 - unauthorized")
 557              self.status_code = 401
 558  
 559      def _fake_api_call(api_kwargs):
 560          calls["api"] += 1
 561          if calls["api"] == 1:
 562              raise _UnauthorizedError()
 563          return _codex_message_response("Recovered after refresh")
 564  
 565      def _fake_refresh(*, force=True):
 566          calls["refresh"] += 1
 567          assert force is True
 568          return True
 569  
 570      monkeypatch.setattr(agent, "_interruptible_api_call", _fake_api_call)
 571      monkeypatch.setattr(agent, "_try_refresh_codex_client_credentials", _fake_refresh)
 572  
 573      result = agent.run_conversation("Say OK")
 574  
 575      assert calls["api"] == 2
 576      assert calls["refresh"] == 1
 577      assert result["completed"] is True
 578      assert result["final_response"] == "Recovered after refresh"
 579  
 580  
 581  def test_run_conversation_copilot_refreshes_after_401_and_retries(monkeypatch):
 582      agent = _build_copilot_agent(monkeypatch)
 583      calls = {"api": 0, "refresh": 0}
 584  
 585      class _UnauthorizedError(RuntimeError):
 586          def __init__(self):
 587              super().__init__("Error code: 401 - unauthorized")
 588              self.status_code = 401
 589  
 590      def _fake_api_call(api_kwargs):
 591          calls["api"] += 1
 592          if calls["api"] == 1:
 593              raise _UnauthorizedError()
 594          return _codex_message_response("Recovered after copilot refresh")
 595  
 596      def _fake_refresh():
 597          calls["refresh"] += 1
 598          return True
 599  
 600      monkeypatch.setattr(agent, "_interruptible_api_call", _fake_api_call)
 601      monkeypatch.setattr(agent, "_try_refresh_copilot_client_credentials", _fake_refresh)
 602  
 603      result = agent.run_conversation("Say OK")
 604  
 605      assert calls["api"] == 2
 606      assert calls["refresh"] == 1
 607      assert result["completed"] is True
 608      assert result["final_response"] == "Recovered after copilot refresh"
 609  
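
# Shape assumed by the two 401-recovery tests above (sketch only; the status
# code check and the single-retry budget are inferred from the assertions):
def _sketch_call_with_auth_retry(call_api, refresh_credentials, api_kwargs):
    try:
        return call_api(api_kwargs)
    except Exception as exc:
        if getattr(exc, "status_code", None) != 401:
            raise
        if not refresh_credentials():             # rebuild the client with a fresh token
            raise
        return call_api(api_kwargs)               # exactly one retry after a refresh
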
 610  
 611  def test_try_refresh_codex_client_credentials_rebuilds_client(monkeypatch):
 612      agent = _build_agent(monkeypatch)
 613      closed = {"value": False}
 614      rebuilt = {"kwargs": None}
 615  
 616      class _ExistingClient:
 617          def close(self):
 618              closed["value"] = True
 619  
 620      class _RebuiltClient:
 621          pass
 622  
 623      def _fake_openai(**kwargs):
 624          rebuilt["kwargs"] = kwargs
 625          return _RebuiltClient()
 626  
 627      monkeypatch.setattr(
 628          "hermes_cli.auth.resolve_codex_runtime_credentials",
 629          lambda force_refresh=True: {
 630              "api_key": "new-codex-token",
 631              "base_url": "https://chatgpt.com/backend-api/codex",
 632          },
 633      )
 634      monkeypatch.setattr(run_agent, "OpenAI", _fake_openai)
 635  
 636      agent.client = _ExistingClient()
 637      ok = agent._try_refresh_codex_client_credentials(force=True)
 638  
 639      assert ok is True
 640      assert closed["value"] is True
 641      assert rebuilt["kwargs"]["api_key"] == "new-codex-token"
 642      assert rebuilt["kwargs"]["base_url"] == "https://chatgpt.com/backend-api/codex"
 643      assert isinstance(agent.client, _RebuiltClient)
 644  
 645  
 646  def test_try_refresh_copilot_client_credentials_rebuilds_client(monkeypatch):
 647      agent = _build_copilot_agent(monkeypatch)
 648      closed = {"value": False}
 649      rebuilt = {"kwargs": None}
 650  
 651      class _ExistingClient:
 652          def close(self):
 653              closed["value"] = True
 654  
 655      class _RebuiltClient:
 656          pass
 657  
 658      def _fake_openai(**kwargs):
 659          rebuilt["kwargs"] = kwargs
 660          return _RebuiltClient()
 661  
 662      monkeypatch.setattr(
 663          "hermes_cli.copilot_auth.resolve_copilot_token",
 664          lambda: ("gho_new_token", "GH_TOKEN"),
 665      )
 666      monkeypatch.setattr(run_agent, "OpenAI", _fake_openai)
 667  
 668      agent.client = _ExistingClient()
 669      ok = agent._try_refresh_copilot_client_credentials()
 670  
 671      assert ok is True
 672      assert closed["value"] is True
 673      assert rebuilt["kwargs"]["api_key"] == "gho_new_token"
 674      assert rebuilt["kwargs"]["base_url"] == "https://api.githubcopilot.com"
 675      assert rebuilt["kwargs"]["default_headers"]["Copilot-Integration-Id"] == "vscode-chat"
 676      assert isinstance(agent.client, _RebuiltClient)
 677  
 678  
 679  def test_try_refresh_copilot_client_credentials_rebuilds_even_if_token_unchanged(monkeypatch):
 680      agent = _build_copilot_agent(monkeypatch)
 681      rebuilt = {"count": 0}
 682  
 683      class _RebuiltClient:
 684          pass
 685  
 686      def _fake_openai(**kwargs):
 687          rebuilt["count"] += 1
 688          return _RebuiltClient()
 689  
 690      monkeypatch.setattr(
 691          "hermes_cli.copilot_auth.resolve_copilot_token",
 692          lambda: ("gh-token", "gh auth token"),
 693      )
 694      monkeypatch.setattr(run_agent, "OpenAI", _fake_openai)
 695  
 696      ok = agent._try_refresh_copilot_client_credentials()
 697  
 698      assert ok is True
 699      assert rebuilt["count"] == 1
 700  
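
# Sketch of the client rebuild the three refresh tests above exercise.  The
# hermes_cli helper is the one the tests monkeypatch; closing the old client
# and always constructing a fresh run_agent.OpenAI client (even when the token
# is unchanged) is inferred from the assertions.  The Copilot variant is
# assumed to do the same with resolve_copilot_token plus the
# Copilot-Integration-Id default header.
def _sketch_refresh_codex_client(agent, force=True):
    from hermes_cli.auth import resolve_codex_runtime_credentials

    creds = resolve_codex_runtime_credentials(force_refresh=force)
    old_client = getattr(agent, "client", None)
    if old_client is not None and hasattr(old_client, "close"):
        old_client.close()
    agent.client = run_agent.OpenAI(api_key=creds["api_key"], base_url=creds["base_url"])
    return True
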
 701  
 702  def test_run_conversation_codex_tool_round_trip(monkeypatch):
 703      agent = _build_agent(monkeypatch)
 704      responses = [_codex_tool_call_response(), _codex_message_response("done")]
 705      monkeypatch.setattr(agent, "_interruptible_api_call", lambda api_kwargs: responses.pop(0))
 706  
 707      def _fake_execute_tool_calls(assistant_message, messages, effective_task_id):
 708          for call in assistant_message.tool_calls:
 709              messages.append(
 710                  {
 711                      "role": "tool",
 712                      "tool_call_id": call.id,
 713                      "content": '{"ok":true}',
 714                  }
 715              )
 716  
 717      monkeypatch.setattr(agent, "_execute_tool_calls", _fake_execute_tool_calls)
 718  
 719      result = agent.run_conversation("run a command")
 720  
 721      assert result["completed"] is True
 722      assert result["final_response"] == "done"
 723      assert any(msg.get("tool_calls") for msg in result["messages"] if msg.get("role") == "assistant")
 724      assert any(msg.get("role") == "tool" and msg.get("tool_call_id") == "call_1" for msg in result["messages"])
 725  
 726  
 727  def test_chat_messages_to_responses_input_uses_call_id_for_function_call(monkeypatch):
 728      agent = _build_agent(monkeypatch)
 729      from agent.codex_responses_adapter import _chat_messages_to_responses_input
 730      items = _chat_messages_to_responses_input(
 731          [
 732              {"role": "user", "content": "Run terminal"},
 733              {
 734                  "role": "assistant",
 735                  "content": "",
 736                  "tool_calls": [
 737                      {
 738                          "id": "call_abc123",
 739                          "type": "function",
 740                          "function": {"name": "terminal", "arguments": "{}"},
 741                      }
 742                  ],
 743              },
 744              {"role": "tool", "tool_call_id": "call_abc123", "content": '{"ok":true}'},
 745          ]
 746      )
 747  
 748      function_call = next(item for item in items if item.get("type") == "function_call")
 749      function_output = next(item for item in items if item.get("type") == "function_call_output")
 750  
 751      assert function_call["call_id"] == "call_abc123"
 752      assert "id" not in function_call
 753      assert function_output["call_id"] == "call_abc123"
 754  
 755  
 756  def test_chat_messages_to_responses_input_accepts_call_pipe_fc_ids(monkeypatch):
 757      agent = _build_agent(monkeypatch)
 758      from agent.codex_responses_adapter import _chat_messages_to_responses_input
 759      items = _chat_messages_to_responses_input(
 760          [
 761              {"role": "user", "content": "Run terminal"},
 762              {
 763                  "role": "assistant",
 764                  "content": "",
 765                  "tool_calls": [
 766                      {
 767                          "id": "call_pair123|fc_pair123",
 768                          "type": "function",
 769                          "function": {"name": "terminal", "arguments": "{}"},
 770                      }
 771                  ],
 772              },
 773              {"role": "tool", "tool_call_id": "call_pair123|fc_pair123", "content": '{"ok":true}'},
 774          ]
 775      )
 776  
 777      function_call = next(item for item in items if item.get("type") == "function_call")
 778      function_output = next(item for item in items if item.get("type") == "function_call_output")
 779  
 780      assert function_call["call_id"] == "call_pair123"
 781      assert "id" not in function_call
 782      assert function_output["call_id"] == "call_pair123"
 783  
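
# Hedged sketch of the chat-message -> Responses-input mapping the two adapter
# tests above pin down.  Only tool-call handling is shown; plain user and
# assistant text is assumed to pass through as role/content items, and
# composite "call_x|fc_x" ids keep just the call_ half.
def _sketch_chat_to_responses_input(messages):
    items = []
    for msg in messages:
        for call in msg.get("tool_calls") or []:
            items.append(
                {
                    "type": "function_call",
                    "call_id": call["id"].split("|", 1)[0],   # no "id" field is emitted
                    "name": call["function"]["name"],
                    "arguments": call["function"]["arguments"],
                }
            )
        if msg.get("role") == "tool":
            items.append(
                {
                    "type": "function_call_output",
                    "call_id": msg["tool_call_id"].split("|", 1)[0],
                    "output": msg["content"],
                }
            )
        elif not msg.get("tool_calls"):
            items.append({"role": msg["role"], "content": msg.get("content", "")})
    return items
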
 784  
 785  def test_preflight_codex_api_kwargs_strips_optional_function_call_id(monkeypatch):
 786      agent = _build_agent(monkeypatch)
 787      from agent.codex_responses_adapter import _preflight_codex_api_kwargs
 788      preflight = _preflight_codex_api_kwargs(
 789          {
 790              "model": "gpt-5-codex",
 791              "instructions": "You are Hermes.",
 792              "input": [
 793                  {"role": "user", "content": "hi"},
 794                  {
 795                      "type": "function_call",
 796                      "id": "call_bad",
 797                      "call_id": "call_good",
 798                      "name": "terminal",
 799                      "arguments": "{}",
 800                  },
 801              ],
 802              "tools": [],
 803              "store": False,
 804          }
 805      )
 806  
 807      fn_call = next(item for item in preflight["input"] if item.get("type") == "function_call")
 808      assert fn_call["call_id"] == "call_good"
 809      assert "id" not in fn_call
 810  
 811  
 812  def test_preflight_codex_api_kwargs_rejects_function_call_output_without_call_id(monkeypatch):
 813      agent = _build_agent(monkeypatch)
 814      from agent.codex_responses_adapter import _preflight_codex_api_kwargs
 815  
 816      with pytest.raises(ValueError, match="function_call_output is missing call_id"):
 817          _preflight_codex_api_kwargs(
 818              {
 819                  "model": "gpt-5-codex",
 820                  "instructions": "You are Hermes.",
 821                  "input": [{"type": "function_call_output", "output": "{}"}],
 822                  "tools": [],
 823                  "store": False,
 824              }
 825          )
 826  
 827  
 828  def test_preflight_codex_api_kwargs_rejects_unsupported_request_fields(monkeypatch):
 829      agent = _build_agent(monkeypatch)
 830      kwargs = _codex_request_kwargs()
 831      kwargs["some_unknown_field"] = "value"
 832      from agent.codex_responses_adapter import _preflight_codex_api_kwargs
 833  
 834      with pytest.raises(ValueError, match="unsupported field"):
 835          _preflight_codex_api_kwargs(kwargs)
 836  
 837  
 838  def test_preflight_codex_api_kwargs_allows_reasoning_and_temperature(monkeypatch):
 839      agent = _build_agent(monkeypatch)
 840      kwargs = _codex_request_kwargs()
 841      kwargs["reasoning"] = {"effort": "high", "summary": "auto"}
 842      kwargs["include"] = ["reasoning.encrypted_content"]
 843      kwargs["temperature"] = 0.7
 844      kwargs["max_output_tokens"] = 4096
 845  
 846      from agent.codex_responses_adapter import _preflight_codex_api_kwargs
 847      result = _preflight_codex_api_kwargs(kwargs)
 848      assert result["reasoning"] == {"effort": "high", "summary": "auto"}
 849      assert result["include"] == ["reasoning.encrypted_content"]
 850      assert result["temperature"] == 0.7
 851      assert result["max_output_tokens"] == 4096
 852  
 853  
 854  def test_preflight_codex_api_kwargs_allows_service_tier(monkeypatch):
 855      agent = _build_agent(monkeypatch)
 856      kwargs = _codex_request_kwargs()
 857      kwargs["service_tier"] = "priority"
 858  
 859      from agent.codex_responses_adapter import _preflight_codex_api_kwargs
 860      result = _preflight_codex_api_kwargs(kwargs)
 861      assert result["service_tier"] == "priority"
 862  
 863  
 864  def test_run_conversation_codex_replay_payload_keeps_call_id(monkeypatch):
 865      agent = _build_agent(monkeypatch)
 866      responses = [_codex_tool_call_response(), _codex_message_response("done")]
 867      requests = []
 868  
 869      def _fake_api_call(api_kwargs):
 870          requests.append(api_kwargs)
 871          return responses.pop(0)
 872  
 873      monkeypatch.setattr(agent, "_interruptible_api_call", _fake_api_call)
 874  
 875      def _fake_execute_tool_calls(assistant_message, messages, effective_task_id):
 876          for call in assistant_message.tool_calls:
 877              messages.append(
 878                  {
 879                      "role": "tool",
 880                      "tool_call_id": call.id,
 881                      "content": '{"ok":true}',
 882                  }
 883              )
 884  
 885      monkeypatch.setattr(agent, "_execute_tool_calls", _fake_execute_tool_calls)
 886  
 887      result = agent.run_conversation("run a command")
 888  
 889      assert result["completed"] is True
 890      assert result["final_response"] == "done"
 891      assert len(requests) >= 2
 892  
 893      replay_input = requests[1]["input"]
 894      function_call = next(item for item in replay_input if item.get("type") == "function_call")
 895      function_output = next(item for item in replay_input if item.get("type") == "function_call_output")
 896      assert function_call["call_id"] == "call_1"
 897      assert "id" not in function_call
 898      assert function_output["call_id"] == "call_1"
 899  
 900  
 901  def test_run_conversation_codex_continues_after_incomplete_interim_message(monkeypatch):
 902      agent = _build_agent(monkeypatch)
 903      responses = [
 904          _codex_incomplete_message_response("I'll inspect the repo structure first."),
 905          _codex_tool_call_response(),
 906          _codex_message_response("Architecture summary complete."),
 907      ]
 908      monkeypatch.setattr(agent, "_interruptible_api_call", lambda api_kwargs: responses.pop(0))
 909  
 910      def _fake_execute_tool_calls(assistant_message, messages, effective_task_id):
 911          for call in assistant_message.tool_calls:
 912              messages.append(
 913                  {
 914                      "role": "tool",
 915                      "tool_call_id": call.id,
 916                      "content": '{"ok":true}',
 917                  }
 918              )
 919  
 920      monkeypatch.setattr(agent, "_execute_tool_calls", _fake_execute_tool_calls)
 921  
 922      result = agent.run_conversation("analyze repo")
 923  
 924      assert result["completed"] is True
 925      assert result["final_response"] == "Architecture summary complete."
 926      assert any(
 927          msg.get("role") == "assistant"
 928          and msg.get("finish_reason") == "incomplete"
 929          and "inspect the repo structure" in (msg.get("content") or "")
 930          for msg in result["messages"]
 931      )
 932      assert any(msg.get("role") == "tool" and msg.get("tool_call_id") == "call_1" for msg in result["messages"])
 933  
 934  
 935  def test_normalize_codex_response_marks_commentary_only_message_as_incomplete(monkeypatch):
 936      agent = _build_agent(monkeypatch)
 937      from agent.codex_responses_adapter import _normalize_codex_response
 938      assistant_message, finish_reason = _normalize_codex_response(
 939          _codex_commentary_message_response("I'll inspect the repository first.")
 940      )
 941  
 942      assert finish_reason == "incomplete"
 943      assert "inspect the repository" in (assistant_message.content or "")
 944  
 945  
 946  def test_normalize_codex_response_preserves_message_status_for_replay(monkeypatch):
 947      """Incomplete Codex output messages must not be replayed as completed."""
 948      agent = _build_agent(monkeypatch)
 949      from agent.codex_responses_adapter import _normalize_codex_response
 950  
 951      response = SimpleNamespace(
 952          output=[
 953              SimpleNamespace(
 954                  type="message",
 955                  id="msg_partial",
 956                  phase="commentary",
 957                  status="in_progress",
 958                  content=[SimpleNamespace(type="output_text", text="Still working...")],
 959              )
 960          ],
 961          usage=SimpleNamespace(input_tokens=4, output_tokens=2, total_tokens=6),
 962          status="in_progress",
 963          model="gpt-5-codex",
 964      )
 965  
 966      assistant_message, finish_reason = _normalize_codex_response(response)
 967  
 968      assert finish_reason == "incomplete"
 969      assert assistant_message.codex_message_items[0]["id"] == "msg_partial"
 970      assert assistant_message.codex_message_items[0]["status"] == "in_progress"
 971  
 972  
 973  def test_normalize_codex_response_detects_leaked_tool_call_text(monkeypatch):
 974      """Harmony-style `to=functions.foo` leaked into assistant content with no
 975      structured function_call items must be treated as incomplete so the
 976      continuation path can re-elicit a proper tool call. This is the
 977      Taiwan-embassy-email (Discord bug report) failure mode: child agent
 978      produces a confident-looking summary, tool_trace is empty because no
 979      tools actually ran, parent can't audit the claim.
 980      """
 981      agent = _build_agent(monkeypatch)
 982      from agent.codex_responses_adapter import _normalize_codex_response
 983  
 984      leaked_content = (
 985          "I'll check the official page directly.\n"
 986          "to=functions.exec_command {\"cmd\": \"curl https://example.test\"}\n"
 987          "assistant to=functions.exec_command {\"stdout\": \"mailto:foo@example.test\"}\n"
 988          "Extracted: foo@example.test"
 989      )
 990      response = SimpleNamespace(
 991          output=[
 992              SimpleNamespace(
 993                  type="message",
 994                  status="completed",
 995                  content=[SimpleNamespace(type="output_text", text=leaked_content)],
 996              )
 997          ],
 998          usage=SimpleNamespace(input_tokens=4, output_tokens=2, total_tokens=6),
 999          status="completed",
1000          model="gpt-5.4",
1001      )
1002  
1003      assistant_message, finish_reason = _normalize_codex_response(response)
1004  
1005      assert finish_reason == "incomplete"
1006      # Content is scrubbed so the parent never surfaces the leaked text as a
1007      # summary. tool_calls stays empty because no structured function_call
1008      # item existed.
1009      assert (assistant_message.content or "") == ""
1010      assert assistant_message.tool_calls == []
1011  
1012  
1013  def test_normalize_codex_response_ignores_tool_call_text_when_real_tool_call_present(monkeypatch):
1014      """If the model emitted BOTH a structured function_call AND some text that
1015      happens to contain `to=functions.*` (unlikely but possible), trust the
1016      structured call — don't wipe content that came alongside a real tool use.
1017      """
1018      agent = _build_agent(monkeypatch)
1019      from agent.codex_responses_adapter import _normalize_codex_response
1020  
1021      response = SimpleNamespace(
1022          output=[
1023              SimpleNamespace(
1024                  type="message",
1025                  status="completed",
1026                  content=[SimpleNamespace(
1027                      type="output_text",
1028                      text="Running the command via to=functions.exec_command now.",
1029                  )],
1030              ),
1031              SimpleNamespace(
1032                  type="function_call",
1033                  id="fc_1",
1034                  call_id="call_1",
1035                  name="terminal",
1036                  arguments="{}",
1037              ),
1038          ],
1039          usage=SimpleNamespace(input_tokens=4, output_tokens=2, total_tokens=6),
1040          status="completed",
1041          model="gpt-5.4",
1042      )
1043  
1044      assistant_message, finish_reason = _normalize_codex_response(response)
1045  
1046      assert finish_reason == "tool_calls"
1047      assert assistant_message.tool_calls  # real call preserved
1048      assert "Running the command" in (assistant_message.content or "")
1049  
1050  
1051  def test_normalize_codex_response_no_leak_passes_through(monkeypatch):
1052      """Sanity: normal assistant content that doesn't contain the leak pattern
1053      is returned verbatim with finish_reason=stop."""
1054      agent = _build_agent(monkeypatch)
1055      from agent.codex_responses_adapter import _normalize_codex_response
1056  
1057      response = SimpleNamespace(
1058          output=[
1059              SimpleNamespace(
1060                  type="message",
1061                  status="completed",
1062                  content=[SimpleNamespace(
1063                      type="output_text",
1064                      text="Here is the answer with no leak.",
1065                  )],
1066              )
1067          ],
1068          usage=SimpleNamespace(input_tokens=4, output_tokens=2, total_tokens=6),
1069          status="completed",
1070          model="gpt-5.4",
1071      )
1072  
1073      assistant_message, finish_reason = _normalize_codex_response(response)
1074  
1075      assert finish_reason == "stop"
1076      assert assistant_message.content == "Here is the answer with no leak."
1077      assert assistant_message.tool_calls == []
1078  
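
# Sketch of the leak detection the three leaked-text tests above rely on: a
# Harmony-style `to=functions.<name>` marker in assistant text only scrubs the
# content (and forces finish_reason "incomplete") when no structured
# function_call item accompanied it.  The regex is an assumption; the adapter
# may recognise the syntax differently.
def _sketch_has_leaked_tool_call_text(content, tool_calls):
    import re

    if tool_calls:                 # a real structured call wins; the text is kept
        return False
    return bool(re.search(r"\bto=functions\.\w+", content or ""))
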
1079  
1080  def test_interim_commentary_is_not_marked_already_streamed_without_callbacks(monkeypatch):
1081      agent = _build_agent(monkeypatch)
1082      observed = {}
1083  
1084      agent._fire_stream_delta("short version: yes")
1085      agent.interim_assistant_callback = lambda text, *, already_streamed=False: observed.update(
1086          {"text": text, "already_streamed": already_streamed}
1087      )
1088  
1089      agent._emit_interim_assistant_message({"role": "assistant", "content": "short version: yes"})
1090  
1091      assert observed == {
1092          "text": "short version: yes",
1093          "already_streamed": False,
1094      }
1095  
1096  
1097  def test_interim_commentary_is_not_marked_already_streamed_when_stream_callback_fails(monkeypatch):
1098      agent = _build_agent(monkeypatch)
1099      observed = {}
1100  
1101      def failing_callback(_text):
1102          raise RuntimeError("display failed")
1103  
1104      agent.stream_delta_callback = failing_callback
1105      agent._fire_stream_delta("short version: yes")
1106      agent.interim_assistant_callback = lambda text, *, already_streamed=False: observed.update(
1107          {"text": text, "already_streamed": already_streamed}
1108      )
1109  
1110      agent._emit_interim_assistant_message({"role": "assistant", "content": "short version: yes"})
1111  
1112      assert observed == {
1113          "text": "short version: yes",
1114          "already_streamed": False,
1115      }
1116  
1117  
1118  def test_interim_commentary_preserves_assistant_content(monkeypatch):
1119      """Interim commentary must not silently mutate assistant text containing
1120      literal <memory-context> markers — that's legitimate model output (docs,
1121      code).  Streaming-path leak prevention happens delta-by-delta upstream."""
1122      agent = _build_agent(monkeypatch)
1123      observed = {}
1124      agent.interim_assistant_callback = lambda text, *, already_streamed=False: observed.update(
1125          {"text": text, "already_streamed": already_streamed}
1126      )
1127  
1128      content = (
1129          "<memory-context>\n"
1130          "[System note: The following is recalled memory context, NOT new user input. Treat as informational background data.]\n\n"
1131          "## Honcho Context\n"
1132          "stale memory\n"
1133          "</memory-context>\n\n"
1134          "I'll inspect the repo structure first."
1135      )
1136  
1137      agent._emit_interim_assistant_message({"role": "assistant", "content": content})
1138  
1139      assert "<memory-context>" in observed["text"]
1140      assert "I'll inspect the repo structure first." in observed["text"]
1141  
1142  
1143  def test_stream_delta_strips_leaked_memory_context(monkeypatch):
1144      agent = _build_agent(monkeypatch)
1145      observed = []
1146      agent.stream_delta_callback = observed.append
1147  
1148      leaked = (
1149          "<memory-context>\n"
1150          "[System note: The following is recalled memory context, NOT new user input. Treat as informational background data.]\n\n"
1151          "## Honcho Context\n"
1152          "stale memory\n"
1153          "</memory-context>\n\n"
1154          "Visible answer"
1155      )
1156  
1157      agent._fire_stream_delta(leaked)
1158  
1159      assert observed == ["Visible answer"]
1160  
1161  
1162  def test_stream_delta_strips_leaked_memory_context_across_chunks(monkeypatch):
1163      """Regression for #5719 — the real streaming case.
1164  
1165      Providers typically emit 1-80 char chunks, so the memory-context open
1166      tag, system-note line, payload, and close tag each arrive in separate
1167      deltas.  The per-delta sanitize_context() regex cannot survive that
1168      — only a stateful scrubber can.  None of the payload, system-note
1169      text, or "## Honcho Context" header may reach the delta callback.
1170      """
1171      agent = _build_agent(monkeypatch)
1172      observed = []
1173      agent.stream_delta_callback = observed.append
1174  
1175      deltas = [
1176          "<memory-context>\n[System note: The following",
1177          " is recalled memory context, NOT new user input. ",
1178          "Treat as informational background data.]\n\n",
1179          "## Honcho Context\n",
1180          "stale memory about eri\n",
1181          "</memory-context>\n\n",
1182          "Visible answer",
1183      ]
1184      for d in deltas:
1185          agent._fire_stream_delta(d)
1186  
1187      combined = "".join(observed)
1188      assert "Visible answer" in combined
1189      # None of the leaked payload may surface.
1190      assert "System note" not in combined
1191      assert "Honcho Context" not in combined
1192      assert "stale memory" not in combined
1193      assert "<memory-context>" not in combined
1194      assert "</memory-context>" not in combined
1195  
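
# Minimal sketch of the stateful scrubber behaviour the chunked-streaming test
# above requires: suppress everything between <memory-context> and
# </memory-context> even when the span arrives split across many deltas.  The
# feed()/reset() surface mirrors the agent._stream_context_scrubber attribute
# poked at in the next test; the real class may also buffer partially
# delivered tags, which this sketch does not attempt.
class _SketchContextScrubber:
    OPEN, CLOSE = "<memory-context>", "</memory-context>"

    def __init__(self):
        self._inside = False

    def reset(self):
        self._inside = False

    def feed(self, delta):
        out, text = [], delta
        while text:
            if self._inside:
                end = text.find(self.CLOSE)
                if end < 0:
                    return "".join(out)            # still inside the span: emit nothing
                text = text[end + len(self.CLOSE):]
                self._inside = False
            else:
                start = text.find(self.OPEN)
                if start < 0:
                    out.append(text)
                    break
                out.append(text[:start])
                text = text[start + len(self.OPEN):]
                self._inside = True
        return "".join(out)
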
1196  
1197  def test_stream_delta_scrubber_resets_between_turns(monkeypatch):
1198      """An unterminated span from a prior turn must not taint the next turn."""
1199      agent = _build_agent(monkeypatch)
1200  
1201      # Simulate a hung span carried over — directly populate the scrubber.
1202      agent._stream_context_scrubber.feed("pre <memory-context>leaked")
1203  
1204      # Normally run_conversation() resets the scrubber at turn start.
1205      agent._stream_context_scrubber.reset()
1206  
1207      observed = []
1208      agent.stream_delta_callback = observed.append
1209      agent._fire_stream_delta("clean new turn text")
1210      assert "".join(observed) == "clean new turn text"
1211  
1212  
1213  def test_stream_delta_preserves_mid_stream_leading_newlines(monkeypatch):
1214      """Mid-stream leading newlines must survive — they are legitimate
1215      markdown (lists, code fences, paragraph breaks).  Stripping them
1216      based on chunk boundaries silently breaks formatting.
1217  
1218      Only the very first delta of a stream gets leading-newlines stripped
1219      (so stale provider preamble doesn't leak); after that, deltas are
1220      emitted verbatim.
1221      """
1222      agent = _build_agent(monkeypatch)
1223      observed = []
1224      agent.stream_delta_callback = observed.append
1225  
1226      # First delta delivers text — strips its own leading "\n" once.
1227      agent._fire_stream_delta("\nHere is a list:")
1228      # Second delta starts with "\n- item" — must NOT be stripped.
1229      agent._fire_stream_delta("\n- first")
1230      agent._fire_stream_delta("\n- second")
1231  
1232      combined = "".join(observed)
1233      assert combined == "Here is a list:\n- first\n- second"
1234  
1235  
1236  def test_stream_delta_preserves_code_fence_newlines(monkeypatch):
1237      """Code blocks span multiple deltas.  A "\\n```python\\n" boundary
1238      is the canonical case where stripping leading newlines corrupts output."""
1239      agent = _build_agent(monkeypatch)
1240      observed = []
1241      agent.stream_delta_callback = observed.append
1242  
1243      agent._fire_stream_delta("Here is the code:")
1244      agent._fire_stream_delta("\n```python\n")
1245      agent._fire_stream_delta("print('hi')\n")
1246      agent._fire_stream_delta("```\n")
1247  
1248      combined = "".join(observed)
1249      assert "```python\n" in combined
1250      assert combined.startswith("Here is the code:\n```python\n")
1251  
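
# Hedged sketch of the delta emission the two newline tests above rely on:
# only the first delta that actually carries text has its leading newlines
# stripped (so stale provider preamble cannot leak); every later delta is
# forwarded verbatim, which is what keeps markdown lists and code fences
# intact.  The state dict below is illustrative only.
def _sketch_emit_delta(state, callback, delta):
    if not state.get("emitted_any"):
        delta = delta.lstrip("\n")
        if not delta:
            return                  # an all-newline first chunk emits nothing
        state["emitted_any"] = True
    callback(delta)
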
1252  
1253  def test_run_conversation_codex_continues_after_commentary_phase_message(monkeypatch):
1254      agent = _build_agent(monkeypatch)
1255      responses = [
1256          _codex_commentary_message_response("I'll inspect the repo structure first."),
1257          _codex_tool_call_response(),
1258          _codex_message_response("Architecture summary complete."),
1259      ]
1260      monkeypatch.setattr(agent, "_interruptible_api_call", lambda api_kwargs: responses.pop(0))
1261  
1262      def _fake_execute_tool_calls(assistant_message, messages, effective_task_id):
1263          for call in assistant_message.tool_calls:
1264              messages.append(
1265                  {
1266                      "role": "tool",
1267                      "tool_call_id": call.id,
1268                      "content": '{"ok":true}',
1269                  }
1270              )
1271  
1272      monkeypatch.setattr(agent, "_execute_tool_calls", _fake_execute_tool_calls)
1273  
1274      result = agent.run_conversation("analyze repo")
1275  
1276      assert result["completed"] is True
1277      assert result["final_response"] == "Architecture summary complete."
1278      assert any(
1279          msg.get("role") == "assistant"
1280          and msg.get("finish_reason") == "incomplete"
1281          and "inspect the repo structure" in (msg.get("content") or "")
1282          for msg in result["messages"]
1283      )
1284      assert any(msg.get("role") == "tool" and msg.get("tool_call_id") == "call_1" for msg in result["messages"])
1285  
1286  
1287  def test_run_conversation_codex_continues_after_ack_stop_message(monkeypatch):
1288      agent = _build_agent(monkeypatch)
1289      responses = [
1290          _codex_ack_message_response(
1291              "Absolutely — I can do that. I'll inspect ~/openclaw-studio and report back with a walkthrough."
1292          ),
1293          _codex_tool_call_response(),
1294          _codex_message_response("Architecture summary complete."),
1295      ]
1296      monkeypatch.setattr(agent, "_interruptible_api_call", lambda api_kwargs: responses.pop(0))
1297  
1298      def _fake_execute_tool_calls(assistant_message, messages, effective_task_id):
1299          for call in assistant_message.tool_calls:
1300              messages.append(
1301                  {
1302                      "role": "tool",
1303                      "tool_call_id": call.id,
1304                      "content": '{"ok":true}',
1305                  }
1306              )
1307  
1308      monkeypatch.setattr(agent, "_execute_tool_calls", _fake_execute_tool_calls)
1309  
1310      result = agent.run_conversation("look into ~/openclaw-studio and tell me how it works")
1311  
1312      assert result["completed"] is True
1313      assert result["final_response"] == "Architecture summary complete."
1314      assert any(
1315          msg.get("role") == "assistant"
1316          and msg.get("finish_reason") == "incomplete"
1317          and "inspect ~/openclaw-studio" in (msg.get("content") or "")
1318          for msg in result["messages"]
1319      )
1320      assert any(
1321          msg.get("role") == "user"
1322          and "Continue now. Execute the required tool calls" in (msg.get("content") or "")
1323          for msg in result["messages"]
1324      )
1325      assert any(msg.get("role") == "tool" and msg.get("tool_call_id") == "call_1" for msg in result["messages"])
1326  
1327  
1328  def test_run_conversation_codex_continues_after_ack_for_directory_listing_prompt(monkeypatch):
1329      agent = _build_agent(monkeypatch)
1330      responses = [
1331          _codex_ack_message_response(
1332              "I'll check what's in the current directory and call out 3 notable items."
1333          ),
1334          _codex_tool_call_response(),
1335          _codex_message_response("Directory summary complete."),
1336      ]
1337      monkeypatch.setattr(agent, "_interruptible_api_call", lambda api_kwargs: responses.pop(0))
1338  
1339      def _fake_execute_tool_calls(assistant_message, messages, effective_task_id):
1340          for call in assistant_message.tool_calls:
1341              messages.append(
1342                  {
1343                      "role": "tool",
1344                      "tool_call_id": call.id,
1345                      "content": '{"ok":true}',
1346                  }
1347              )
1348  
1349      monkeypatch.setattr(agent, "_execute_tool_calls", _fake_execute_tool_calls)
1350  
1351      result = agent.run_conversation("look at current directory and list 3 notable things")
1352  
1353      assert result["completed"] is True
1354      assert result["final_response"] == "Directory summary complete."
1355      assert any(
1356          msg.get("role") == "assistant"
1357          and msg.get("finish_reason") == "incomplete"
1358          and "current directory" in (msg.get("content") or "")
1359          for msg in result["messages"]
1360      )
1361      assert any(
1362          msg.get("role") == "user"
1363          and "Continue now. Execute the required tool calls" in (msg.get("content") or "")
1364          for msg in result["messages"]
1365      )
1366      assert any(msg.get("role") == "tool" and msg.get("tool_call_id") == "call_1" for msg in result["messages"])
1367  
1368  
1369  def test_dump_api_request_debug_uses_responses_url(monkeypatch, tmp_path):
1370      """Debug dumps should show /responses URL when in codex_responses mode."""
1371      import json
1372      agent = _build_agent(monkeypatch)
1373      agent.base_url = "http://127.0.0.1:9208/v1"
1374      agent.logs_dir = tmp_path
1375  
1376      dump_file = agent._dump_api_request_debug(_codex_request_kwargs(), reason="preflight")
1377  
1378      payload = json.loads(dump_file.read_text())
1379      assert payload["request"]["url"] == "http://127.0.0.1:9208/v1/responses"
1380  
1381  
1382  def test_dump_api_request_debug_uses_chat_completions_url(monkeypatch, tmp_path):
1383      """Debug dumps should show the /chat/completions URL when in chat_completions mode."""
1384      import json
1385      _patch_agent_bootstrap(monkeypatch)
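          # No Codex base URL or api_mode is passed here, so the agent should use the
          # default chat_completions request path.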
1386      agent = run_agent.AIAgent(
1387          model="gpt-4o",
1388          base_url="http://127.0.0.1:9208/v1",
1389          api_key="test-key",
1390          quiet_mode=True,
1391          max_iterations=1,
1392          skip_context_files=True,
1393          skip_memory=True,
1394      )
1395      agent.logs_dir = tmp_path
1396  
1397      dump_file = agent._dump_api_request_debug(
1398          {"model": "gpt-4o", "messages": [{"role": "user", "content": "hi"}]},
1399          reason="preflight",
1400      )
1401  
1402      payload = json.loads(dump_file.read_text())
1403      assert payload["request"]["url"] == "http://127.0.0.1:9208/v1/chat/completions"
1404  
1405  
1406  # --- Reasoning-only response tests (fix for the empty-content retry loop) ---
1407  
1408  
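      # Illustrative sketch only (an assumption about the adapter's behaviour, not its
      # actual code): the rule the tests below pin down. The "function_call" tool-call
      # item type is likewise assumed.
      def _finish_reason_rule_sketch(output_items):
          """Reasoning-only output (no message text, no tool calls) maps to 'incomplete'."""
          kinds = {getattr(item, "type", None) for item in output_items}
          if "reasoning" in kinds and not kinds & {"message", "function_call"}:
              return "incomplete"
          return "stop"

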
1409  def _codex_reasoning_only_response(*, encrypted_content="enc_abc123", summary_text="Thinking..."):
1410      """Codex response containing only reasoning items — no message text, no tool calls."""
1411      return SimpleNamespace(
1412          output=[
1413              SimpleNamespace(
1414                  type="reasoning",
1415                  id="rs_001",
1416                  encrypted_content=encrypted_content,
1417                  summary=[SimpleNamespace(type="summary_text", text=summary_text)],
1418                  status="completed",
1419              )
1420          ],
1421          usage=SimpleNamespace(input_tokens=50, output_tokens=100, total_tokens=150),
1422          status="completed",
1423          model="gpt-5-codex",
1424      )
1425  
1426  
1427  def test_normalize_codex_response_marks_reasoning_only_as_incomplete(monkeypatch):
1428      """A response with only reasoning items and no content should be 'incomplete', not 'stop'.
1429  
1430      Without this fix, reasoning-only responses get finish_reason='stop', which
1431      sends them into the empty-content retry loop (3 retries, then failure).
1432      """
1433      agent = _build_agent(monkeypatch)
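          # The agent is built only to exercise bootstrap patching; the normalization
          # helper below is called directly on a synthetic response.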
1434      from agent.codex_responses_adapter import _normalize_codex_response
1435      assistant_message, finish_reason = _normalize_codex_response(
1436          _codex_reasoning_only_response()
1437      )
1438  
1439      assert finish_reason == "incomplete"
1440      assert assistant_message.content == ""
1441      assert assistant_message.codex_reasoning_items is not None
1442      assert len(assistant_message.codex_reasoning_items) == 1
1443      assert assistant_message.codex_reasoning_items[0]["encrypted_content"] == "enc_abc123"
1444  
1445  
1446  def test_normalize_codex_response_reasoning_with_content_is_stop(monkeypatch):
1447      """If a response has both reasoning and message content, it should still be 'stop'."""
1448      agent = _build_agent(monkeypatch)
1449      response = SimpleNamespace(
1450          output=[
1451              SimpleNamespace(
1452                  type="reasoning",
1453                  id="rs_001",
1454                  encrypted_content="enc_xyz",
1455                  summary=[SimpleNamespace(type="summary_text", text="Thinking...")],
1456                  status="completed",
1457              ),
1458              SimpleNamespace(
1459                  type="message",
1460                  content=[SimpleNamespace(type="output_text", text="Here is the answer.")],
1461                  status="completed",
1462              ),
1463          ],
1464          usage=SimpleNamespace(input_tokens=50, output_tokens=100, total_tokens=150),
1465          status="completed",
1466          model="gpt-5-codex",
1467      )
1468      from agent.codex_responses_adapter import _normalize_codex_response
1469      assistant_message, finish_reason = _normalize_codex_response(response)
1470  
1471      assert finish_reason == "stop"
1472      assert "Here is the answer" in assistant_message.content
1473  
1474  
1475  def test_run_conversation_codex_continues_after_reasoning_only_response(monkeypatch):
1476      """End-to-end: reasoning-only → final message should succeed, not hit retry loop."""
1477      agent = _build_agent(monkeypatch)
1478      responses = [
1479          _codex_reasoning_only_response(),
1480          _codex_message_response("The final answer is 42."),
1481      ]
1482      monkeypatch.setattr(agent, "_interruptible_api_call", lambda api_kwargs: responses.pop(0))
1483  
1484      result = agent.run_conversation("what is the answer?")
1485  
1486      assert result["completed"] is True
1487      assert result["final_response"] == "The final answer is 42."
1488      # The reasoning-only turn should appear in the message history as an incomplete interim message
1489      assert any(
1490          msg.get("role") == "assistant"
1491          and msg.get("finish_reason") == "incomplete"
1492          and msg.get("codex_reasoning_items") is not None
1493          for msg in result["messages"]
1494      )
1495  
1496  
1497  def test_run_conversation_codex_preserves_encrypted_reasoning_in_interim(monkeypatch):
1498      """Encrypted codex_reasoning_items must be preserved in interim messages
1499      even when there is no visible reasoning text or content."""
1500      agent = _build_agent(monkeypatch)
1501      # Response with encrypted reasoning but no human-readable summary
1502      reasoning_response = SimpleNamespace(
1503          output=[
1504              SimpleNamespace(
1505                  type="reasoning",
1506                  id="rs_002",
1507                  encrypted_content="enc_opaque_blob",
1508                  summary=[],
1509                  status="completed",
1510              )
1511          ],
1512          usage=SimpleNamespace(input_tokens=50, output_tokens=100, total_tokens=150),
1513          status="completed",
1514          model="gpt-5-codex",
1515      )
1516      responses = [
1517          reasoning_response,
1518          _codex_message_response("Done thinking."),
1519      ]
1520      monkeypatch.setattr(agent, "_interruptible_api_call", lambda api_kwargs: responses.pop(0))
1521  
1522      result = agent.run_conversation("think hard")
1523  
1524      assert result["completed"] is True
1525      assert result["final_response"] == "Done thinking."
1526      # The interim message must have codex_reasoning_items preserved
1527      interim_msgs = [
1528          msg for msg in result["messages"]
1529          if msg.get("role") == "assistant"
1530          and msg.get("finish_reason") == "incomplete"
1531      ]
1532      assert len(interim_msgs) >= 1
1533      assert interim_msgs[0].get("codex_reasoning_items") is not None
1534      assert interim_msgs[0]["codex_reasoning_items"][0]["encrypted_content"] == "enc_opaque_blob"
1535  
1536  
1537  def test_chat_messages_to_responses_input_reasoning_only_has_following_item(monkeypatch):
1538      """When converting a reasoning-only interim message to Responses API input,
1539      the reasoning items must be followed by an assistant message (even if empty)
1540      to satisfy the API's 'required following item' constraint."""
1541      agent = _build_agent(monkeypatch)
1542      messages = [
1543          {"role": "user", "content": "think hard"},
1544          {
1545              "role": "assistant",
1546              "content": "",
1547              "reasoning": None,
1548              "finish_reason": "incomplete",
1549              "codex_reasoning_items": [
1550                  {"type": "reasoning", "id": "rs_001", "encrypted_content": "enc_abc", "summary": []},
1551              ],
1552          },
1553      ]
1554      from agent.codex_responses_adapter import _chat_messages_to_responses_input
1555      items = _chat_messages_to_responses_input(messages)
1556  
1557      # Find the reasoning item
1558      reasoning_indices = [i for i, it in enumerate(items) if it.get("type") == "reasoning"]
1559      assert len(reasoning_indices) == 1
1560      ri_idx = reasoning_indices[0]
1561  
1562      # There must be a following item after the reasoning
1563      assert ri_idx < len(items) - 1, "Reasoning item must not be the last item (missing_following_item)"
1564      following = items[ri_idx + 1]
1565      assert following.get("role") == "assistant"
1566  
1567  
1568  def test_codex_message_item_status_survives_conversion_and_preflight(monkeypatch):
1569      """Stored Codex assistant message statuses must survive replay normalization."""
1570      agent = _build_agent(monkeypatch)
1571      from agent.codex_responses_adapter import (
1572          _chat_messages_to_responses_input,
1573          _preflight_codex_input_items,
1574      )
1575  
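          # Replay path: a stored codex_message_items entry should keep its original
          # status when converted back into Responses API input.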
1576      items = _chat_messages_to_responses_input([
1577          {
1578              "role": "assistant",
1579              "content": "partial",
1580              "codex_message_items": [
1581                  {
1582                      "type": "message",
1583                      "role": "assistant",
1584                      "status": "incomplete",
1585                      "id": "msg_incomplete",
1586                      "phase": "commentary",
1587                      "content": [{"type": "output_text", "text": "partial"}],
1588                  }
1589              ],
1590          }
1591      ])
1592      replay_item = next(item for item in items if item.get("type") == "message")
1593      assert replay_item["status"] == "incomplete"
1594  
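          # Preflight path: normalization of raw input items should leave an
          # in_progress message status untouched.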
1595      normalized = _preflight_codex_input_items([
1596          {
1597              "type": "message",
1598              "role": "assistant",
1599              "status": "in_progress",
1600              "content": [{"type": "output_text", "text": "working"}],
1601          }
1602      ])
1603      assert normalized[0]["status"] == "in_progress"
1604  
1605  
1606  def test_duplicate_detection_distinguishes_different_codex_reasoning(monkeypatch):
1607      """Two consecutive reasoning-only responses with different encrypted content
1608      must NOT be treated as duplicates."""
1609      agent = _build_agent(monkeypatch)
1610      responses = [
1611          # First reasoning-only response
1612          SimpleNamespace(
1613              output=[
1614                  SimpleNamespace(
1615                      type="reasoning", id="rs_001",
1616                      encrypted_content="enc_first", summary=[], status="completed",
1617                  )
1618              ],
1619              usage=SimpleNamespace(input_tokens=50, output_tokens=100, total_tokens=150),
1620              status="completed", model="gpt-5-codex",
1621          ),
1622          # Second reasoning-only response (different encrypted content)
1623          SimpleNamespace(
1624              output=[
1625                  SimpleNamespace(
1626                      type="reasoning", id="rs_002",
1627                      encrypted_content="enc_second", summary=[], status="completed",
1628                  )
1629              ],
1630              usage=SimpleNamespace(input_tokens=50, output_tokens=100, total_tokens=150),
1631              status="completed", model="gpt-5-codex",
1632          ),
1633          _codex_message_response("Final answer after thinking."),
1634      ]
1635      monkeypatch.setattr(agent, "_interruptible_api_call", lambda api_kwargs: responses.pop(0))
1636  
1637      result = agent.run_conversation("think very hard")
1638  
1639      assert result["completed"] is True
1640      assert result["final_response"] == "Final answer after thinking."
1641      # Both reasoning-only interim messages should be in history (not collapsed)
1642      interim_msgs = [
1643          msg for msg in result["messages"]
1644          if msg.get("role") == "assistant"
1645          and msg.get("finish_reason") == "incomplete"
1646      ]
1647      assert len(interim_msgs) == 2
1648      encrypted_contents = [
1649          msg["codex_reasoning_items"][0]["encrypted_content"]
1650          for msg in interim_msgs
1651      ]
1652      assert "enc_first" in encrypted_contents
1653      assert "enc_second" in encrypted_contents
1654  
1655  
1656  def test_duplicate_detection_distinguishes_different_codex_message_items(monkeypatch):
1657      """Incomplete turns with new message IDs, phases, or statuses must not be collapsed."""
1658      agent = _build_agent(monkeypatch)
1659      responses = [
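              # First in-progress commentary message.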
1660          SimpleNamespace(
1661              output=[
1662                  SimpleNamespace(
1663                      type="message",
1664                      id="msg_first",
1665                      phase="commentary",
1666                      status="in_progress",
1667                      content=[SimpleNamespace(type="output_text", text="Still working...")],
1668                  )
1669              ],
1670              usage=SimpleNamespace(input_tokens=50, output_tokens=10, total_tokens=60),
1671              status="in_progress",
1672              model="gpt-5-codex",
1673          ),
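              # Second in-progress message: identical text, but a new message id.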
1674          SimpleNamespace(
1675              output=[
1676                  SimpleNamespace(
1677                      type="message",
1678                      id="msg_second",
1679                      phase="commentary",
1680                      status="in_progress",
1681                      content=[SimpleNamespace(type="output_text", text="Still working...")],
1682                  )
1683              ],
1684              usage=SimpleNamespace(input_tokens=50, output_tokens=10, total_tokens=60),
1685              status="in_progress",
1686              model="gpt-5-codex",
1687          ),
1688          _codex_message_response("Final answer after progress updates."),
1689      ]
1690      monkeypatch.setattr(agent, "_interruptible_api_call", lambda api_kwargs: responses.pop(0))
1691  
1692      result = agent.run_conversation("keep going")
1693  
1694      assert result["completed"] is True
1695      interim_msgs = [
1696          msg for msg in result["messages"]
1697          if msg.get("role") == "assistant"
1698          and msg.get("finish_reason") == "incomplete"
1699      ]
1700      assert len(interim_msgs) == 2
1701      assert [msg["codex_message_items"][0]["id"] for msg in interim_msgs] == [
1702          "msg_first",
1703          "msg_second",
1704      ]
1705      assert all(msg["codex_message_items"][0]["status"] == "in_progress" for msg in interim_msgs)
1706  
1707  
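      # Illustrative sketch only (assumed, not the adapter's code): deduplicate replayed
      # reasoning items by id, keeping the first occurrence, and strip the id field,
      # since stored-item lookups are unavailable when requests are sent with store=False.
      def _dedup_reasoning_items_sketch(items):
          seen_ids = set()
          deduped = []
          for item in items:
              if item.get("type") == "reasoning":
                  item_id = item.get("id")
                  if item_id in seen_ids:
                      continue
                  if item_id is not None:
                      seen_ids.add(item_id)
                  item = {key: value for key, value in item.items() if key != "id"}
              deduped.append(item)
          return deduped

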
1708  def test_chat_messages_to_responses_input_deduplicates_reasoning_ids(monkeypatch):
1709      """Duplicate reasoning item IDs across multi-turn incomplete responses
1710      must be deduplicated so the Responses API doesn't reject the request with HTTP 400."""
1711      agent = _build_agent(monkeypatch)
1712      messages = [
1713          {"role": "user", "content": "think hard"},
1714          {
1715              "role": "assistant",
1716              "content": "",
1717              "codex_reasoning_items": [
1718                  {"type": "reasoning", "id": "rs_aaa", "encrypted_content": "enc_1"},
1719                  {"type": "reasoning", "id": "rs_bbb", "encrypted_content": "enc_2"},
1720              ],
1721          },
1722          {
1723              "role": "assistant",
1724              "content": "partial answer",
1725              "codex_reasoning_items": [
1726                  # rs_aaa is duplicated from the previous turn
1727                  {"type": "reasoning", "id": "rs_aaa", "encrypted_content": "enc_1"},
1728                  {"type": "reasoning", "id": "rs_ccc", "encrypted_content": "enc_3"},
1729              ],
1730          },
1731      ]
1732      from agent.codex_responses_adapter import _chat_messages_to_responses_input
1733      items = _chat_messages_to_responses_input(messages)
1734  
1735      reasoning_items = [it for it in items if it.get("type") == "reasoning"]
1736      # Dedup: rs_aaa appears in both turns but should only be emitted once.
1737      # 3 unique items total: enc_1 (from rs_aaa), enc_2 (rs_bbb), enc_3 (rs_ccc).
1738      assert len(reasoning_items) == 3
1739      encrypted = [it["encrypted_content"] for it in reasoning_items]
1740      assert encrypted.count("enc_1") == 1
1741      assert "enc_2" in encrypted
1742      assert "enc_3" in encrypted
1743      # IDs must be stripped — with store=False the API 404s on id lookups.
1744      for it in reasoning_items:
1745          assert "id" not in it
1746  
1747  
1748  def test_preflight_codex_input_deduplicates_reasoning_ids(monkeypatch):
1749      """_preflight_codex_input_items should also deduplicate reasoning items by ID."""
1750      agent = _build_agent(monkeypatch)
1751      raw_input = [
1752          {"role": "user", "content": [{"type": "input_text", "text": "hello"}]},
1753          {"type": "reasoning", "id": "rs_xyz", "encrypted_content": "enc_a"},
1754          {"role": "assistant", "content": "ok"},
1755          {"type": "reasoning", "id": "rs_xyz", "encrypted_content": "enc_a"},
1756          {"type": "reasoning", "id": "rs_zzz", "encrypted_content": "enc_b"},
1757          {"role": "assistant", "content": "done"},
1758      ]
1759      from agent.codex_responses_adapter import _preflight_codex_input_items
1760      normalized = _preflight_codex_input_items(raw_input)
1761  
1762      reasoning_items = [it for it in normalized if it.get("type") == "reasoning"]
1763      # rs_xyz duplicate should be collapsed to one item; rs_zzz kept.
1764      assert len(reasoning_items) == 2
1765      encrypted = [it["encrypted_content"] for it in reasoning_items]
1766      assert encrypted.count("enc_a") == 1
1767      assert "enc_b" in encrypted
1768      # IDs must be stripped — with store=False the API 404s on id lookups.
1769      for it in reasoning_items:
1770          assert "id" not in it