test_run_agent_codex_responses.py
"""Tests for the Codex Responses API path of ``run_agent.AIAgent``.

Covers api-mode/provider selection, request kwarg construction
(``_build_api_kwargs``), streaming retry/fallback (``_run_codex_stream``),
401 credential refresh, tool-call round-trips, the chat-message ->
Responses-input adapter, request preflight validation, and response
normalization (``_normalize_codex_response``).
"""

import sys
import types
from types import SimpleNamespace

import pytest


# Stub out optional third-party modules so importing run_agent never requires
# them to be installed in the test environment.
sys.modules.setdefault("fire", types.SimpleNamespace(Fire=lambda *a, **k: None))
sys.modules.setdefault("firecrawl", types.SimpleNamespace(Firecrawl=object))
sys.modules.setdefault("fal_client", types.SimpleNamespace())

import run_agent


@pytest.fixture(autouse=True)
def _no_codex_backoff(monkeypatch):
    """Short-circuit retry backoff so Codex retry tests don't block on real
    wall-clock waits (5s jittered_backoff base delay + tight time.sleep loop)."""
    import time as _time
    monkeypatch.setattr(run_agent, "jittered_backoff", lambda *a, **k: 0.0)
    monkeypatch.setattr(_time, "sleep", lambda *_a, **_k: None)


def _patch_agent_bootstrap(monkeypatch):
    """Patch agent bootstrap hooks: a single stub `terminal` tool definition
    and a no-op toolset requirement check, so AIAgent can be constructed
    without the real tool registry."""
    monkeypatch.setattr(
        run_agent,
        "get_tool_definitions",
        lambda **kwargs: [
            {
                "type": "function",
                "function": {
                    "name": "terminal",
                    "description": "Run shell commands.",
                    "parameters": {"type": "object", "properties": {}},
                },
            }
        ],
    )
    monkeypatch.setattr(run_agent, "check_toolset_requirements", lambda: {})


def _build_agent(monkeypatch):
    """Build an AIAgent wired for the Codex backend, with all persistence
    hooks (cleanup/session/trajectory/log) replaced by no-ops."""
    _patch_agent_bootstrap(monkeypatch)

    agent = run_agent.AIAgent(
        model="gpt-5-codex",
        base_url="https://chatgpt.com/backend-api/codex",
        api_key="codex-token",
        quiet_mode=True,
        max_iterations=4,
        skip_context_files=True,
        skip_memory=True,
    )
    # No-op the side-effecting persistence hooks so tests stay hermetic.
    agent._cleanup_task_resources = lambda task_id: None
    agent._persist_session = lambda messages, history=None: None
    agent._save_trajectory = lambda messages, user_message, completed: None
    agent._save_session_log = lambda messages: None
    return agent


def _build_copilot_agent(monkeypatch, *, model="gpt-5.4"):
    """Build an AIAgent targeting the Copilot provider in codex_responses
    mode, with persistence hooks no-op'd (mirrors _build_agent)."""
    _patch_agent_bootstrap(monkeypatch)

    agent = run_agent.AIAgent(
        model=model,
        provider="copilot",
        api_mode="codex_responses",
        base_url="https://api.githubcopilot.com",
        api_key="gh-token",
        quiet_mode=True,
        max_iterations=4,
        skip_context_files=True,
        skip_memory=True,
    )
    agent._cleanup_task_resources = lambda task_id: None
    agent._persist_session = lambda messages, history=None: None
    agent._save_trajectory = lambda messages, user_message, completed: None
    agent._save_session_log = lambda messages: None
    return agent


def _codex_message_response(text: str):
    """Fake a completed Responses-API result containing one plain-text
    assistant message."""
    return SimpleNamespace(
        output=[
            SimpleNamespace(
                type="message",
                content=[SimpleNamespace(type="output_text", text=text)],
            )
        ],
        usage=SimpleNamespace(input_tokens=5, output_tokens=3, total_tokens=8),
        status="completed",
        model="gpt-5-codex",
    )


def _codex_tool_call_response():
    """Fake a completed Responses-API result whose only output item is a
    `terminal` function call (call_id `call_1`)."""
    return SimpleNamespace(
        output=[
            SimpleNamespace(
                type="function_call",
                id="fc_1",
                call_id="call_1",
                name="terminal",
                arguments="{}",
            )
        ],
        usage=SimpleNamespace(input_tokens=12, output_tokens=4, total_tokens=16),
        status="completed",
        model="gpt-5-codex",
    )


def _codex_incomplete_message_response(text: str):
    """Fake an in-progress response (both message and top-level status are
    `in_progress`) used to exercise the continuation path."""
    return SimpleNamespace(
        output=[
            SimpleNamespace(
                type="message",
                status="in_progress",
                content=[SimpleNamespace(type="output_text", text=text)],
            )
        ],
        usage=SimpleNamespace(input_tokens=4, output_tokens=2, total_tokens=6),
        status="in_progress",
        model="gpt-5-codex",
    )


def _codex_commentary_message_response(text: str):
    """Fake a completed response whose single message is commentary-phase
    (phase="commentary") rather than a final answer."""
    return SimpleNamespace(
        output=[
            SimpleNamespace(
                type="message",
                phase="commentary",
                status="completed",
                content=[SimpleNamespace(type="output_text", text=text)],
            )
        ],
        usage=SimpleNamespace(input_tokens=4, output_tokens=2, total_tokens=6),
        status="completed",
        model="gpt-5-codex",
    )


def _codex_ack_message_response(text: str):
    """Fake a completed response with an explicitly completed message item
    (like _codex_message_response but with message-level status set)."""
    return SimpleNamespace(
        output=[
            SimpleNamespace(
                type="message",
                status="completed",
                content=[SimpleNamespace(type="output_text", text=text)],
            )
        ],
        usage=SimpleNamespace(input_tokens=4, output_tokens=2, total_tokens=6),
        status="completed",
        model="gpt-5-codex",
    )


class _FakeResponsesStream:
    """Stand-in for client.responses.stream(...): a context manager that
    yields no events and either returns a canned final response or raises
    the configured error from get_final_response()."""

    def __init__(self, *, final_response=None, final_error=None):
        self._final_response = final_response
        self._final_error = final_error

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        return False

    def __iter__(self):
        # No streaming events; callers go straight to get_final_response().
        return iter(())

    def get_final_response(self):
        if self._final_error is not None:
            raise self._final_error
        return self._final_response

    class _FakeCreateStream:
        pass  # placeholder removed; see module-level _FakeCreateStream below


class _FakeCreateStream:
    """Stand-in for client.responses.create(stream=True): iterates a fixed
    event list and records whether close() was called."""

    def __init__(self, events):
        self._events = list(events)
        self.closed = False

    def __iter__(self):
        return iter(self._events)

    def close(self):
        self.closed = True


def _codex_request_kwargs():
    """Minimal well-formed Codex Responses request kwargs used by the
    preflight and stream tests."""
    return {
        "model": "gpt-5-codex",
        "instructions": "You are Hermes.",
        "input": [{"role": "user", "content": "Ping"}],
        "tools": None,
        "store": False,
    }


def test_api_mode_uses_explicit_provider_when_codex(monkeypatch):
    """provider="openai-codex" selects codex_responses regardless of base_url."""
    _patch_agent_bootstrap(monkeypatch)
    agent = run_agent.AIAgent(
        model="gpt-5-codex",
        base_url="https://openrouter.ai/api/v1",
        provider="openai-codex",
        api_key="codex-token",
        quiet_mode=True,
        max_iterations=1,
        skip_context_files=True,
        skip_memory=True,
    )
    assert agent.api_mode == "codex_responses"
    assert agent.provider == "openai-codex"


def test_api_mode_normalizes_provider_case(monkeypatch):
    """Provider names are case-insensitive and stored lowercased."""
    _patch_agent_bootstrap(monkeypatch)
    agent = run_agent.AIAgent(
        model="gpt-5-codex",
        base_url="https://openrouter.ai/api/v1",
        provider="OpenAI-Codex",
        api_key="codex-token",
        quiet_mode=True,
        max_iterations=1,
        skip_context_files=True,
        skip_memory=True,
    )
    assert agent.provider == "openai-codex"
    assert agent.api_mode == "codex_responses"


def test_api_mode_respects_explicit_openrouter_provider_over_codex_url(monkeypatch):
    """GPT-5.x models need codex_responses even on OpenRouter.

    OpenRouter rejects GPT-5 models on /v1/chat/completions with
    ``unsupported_api_for_model``. The model-level check overrides
    the provider default.
    """
    _patch_agent_bootstrap(monkeypatch)
    agent = run_agent.AIAgent(
        model="gpt-5-codex",
        base_url="https://chatgpt.com/backend-api/codex",
        provider="openrouter",
        api_key="test-token",
        quiet_mode=True,
        max_iterations=1,
        skip_context_files=True,
        skip_memory=True,
    )
    assert agent.api_mode == "codex_responses"
    assert agent.provider == "openrouter"


def test_copilot_acp_stays_on_chat_completions_for_gpt_5_models(monkeypatch):
    """copilot-acp provider keeps chat_completions even for gpt-5.x models."""
    _patch_agent_bootstrap(monkeypatch)
    agent = run_agent.AIAgent(
        model="gpt-5.4-mini",
        base_url="acp://copilot",
        provider="copilot-acp",
        api_key="copilot-acp",
        quiet_mode=True,
        max_iterations=1,
        skip_context_files=True,
        skip_memory=True,
    )
    assert agent.provider == "copilot-acp"
    assert agent.api_mode == "chat_completions"


def test_copilot_gpt_5_mini_stays_on_chat_completions(monkeypatch):
    """An explicit api_mode="chat_completions" is honored for copilot."""
    _patch_agent_bootstrap(monkeypatch)
    agent = run_agent.AIAgent(
        model="gpt-5-mini",
        base_url="https://api.githubcopilot.com",
        provider="copilot",
        api_key="gh-token",
        api_mode="chat_completions",
        quiet_mode=True,
        max_iterations=1,
        skip_context_files=True,
        skip_memory=True,
    )
    assert agent.provider == "copilot"
    assert agent.api_mode == "chat_completions"


def test_build_api_kwargs_codex(monkeypatch):
    """Codex _build_api_kwargs: system message becomes `instructions`, tools
    are flattened to Responses format, and chat-completions-only fields
    (timeout/max_tokens/extra_body) are omitted."""
    agent = _build_agent(monkeypatch)
    kwargs = agent._build_api_kwargs(
        [
            {"role": "system", "content": "You are Hermes."},
            {"role": "user", "content": "Ping"},
        ]
    )

    assert kwargs["model"] == "gpt-5-codex"
    assert kwargs["instructions"] == "You are Hermes."
    assert kwargs["store"] is False
    assert isinstance(kwargs["input"], list)
    assert kwargs["input"][0]["role"] == "user"
    # Tools are flattened: name/strict at top level, no nested "function".
    assert kwargs["tools"][0]["type"] == "function"
    assert kwargs["tools"][0]["name"] == "terminal"
    assert kwargs["tools"][0]["strict"] is False
    assert "function" not in kwargs["tools"][0]
    assert kwargs["store"] is False
    assert kwargs["tool_choice"] == "auto"
    assert kwargs["parallel_tool_calls"] is True
    assert isinstance(kwargs["prompt_cache_key"], str)
    assert len(kwargs["prompt_cache_key"]) > 0
    assert "timeout" not in kwargs
    assert "max_tokens" not in kwargs
    assert "extra_body" not in kwargs


def test_build_api_kwargs_codex_clamps_minimal_effort(monkeypatch):
    """'minimal' reasoning effort is clamped to 'low' on the Responses API.

    GPT-5.4 supports none/low/medium/high/xhigh but NOT 'minimal'.
    Users may configure 'minimal' via OpenRouter conventions, so the Codex
    Responses path must clamp it to the nearest supported level.
    """
    _patch_agent_bootstrap(monkeypatch)

    agent = run_agent.AIAgent(
        model="gpt-5-codex",
        base_url="https://chatgpt.com/backend-api/codex",
        api_key="codex-token",
        quiet_mode=True,
        max_iterations=4,
        skip_context_files=True,
        skip_memory=True,
        reasoning_config={"enabled": True, "effort": "minimal"},
    )
    agent._cleanup_task_resources = lambda task_id: None
    agent._persist_session = lambda messages, history=None: None
    agent._save_trajectory = lambda messages, user_message, completed: None
    agent._save_session_log = lambda messages: None

    kwargs = agent._build_api_kwargs(
        [
            {"role": "system", "content": "You are Hermes."},
            {"role": "user", "content": "Ping"},
        ]
    )

    assert kwargs["reasoning"]["effort"] == "low"


def test_build_api_kwargs_codex_preserves_supported_efforts(monkeypatch):
    """Effort levels natively supported by the Responses API pass through unchanged."""
    _patch_agent_bootstrap(monkeypatch)

    for effort in ("low", "medium", "high", "xhigh"):
        agent = run_agent.AIAgent(
            model="gpt-5-codex",
            base_url="https://chatgpt.com/backend-api/codex",
            api_key="codex-token",
            quiet_mode=True,
            max_iterations=4,
            skip_context_files=True,
            skip_memory=True,
            reasoning_config={"enabled": True, "effort": effort},
        )
        agent._cleanup_task_resources = lambda task_id: None
        agent._persist_session = lambda messages, history=None: None
        agent._save_trajectory = lambda messages, user_message, completed: None
        agent._save_session_log = lambda messages: None

        kwargs = agent._build_api_kwargs(
            [
                {"role": "system", "content": "sys"},
                {"role": "user", "content": "hi"},
            ]
        )
        assert kwargs["reasoning"]["effort"] == effort, f"{effort} should pass through unchanged"


def test_build_api_kwargs_copilot_responses_omits_openai_only_fields(monkeypatch):
    """Copilot Responses requests drop OpenAI-only fields (prompt_cache_key,
    include) while keeping the shared ones."""
    agent = _build_copilot_agent(monkeypatch)
    kwargs = agent._build_api_kwargs([{"role": "user", "content": "hi"}])

    assert kwargs["model"] == "gpt-5.4"
    assert kwargs["store"] is False
    assert kwargs["tool_choice"] == "auto"
    assert kwargs["parallel_tool_calls"] is True
    assert kwargs["reasoning"] == {"effort": "medium"}
    assert "prompt_cache_key" not in kwargs
    assert "include" not in kwargs


def test_build_api_kwargs_copilot_responses_omits_reasoning_for_non_reasoning_model(monkeypatch):
    """Non-reasoning models (gpt-4.1) get no `reasoning` block at all."""
    agent = _build_copilot_agent(monkeypatch, model="gpt-4.1")
    kwargs = agent._build_api_kwargs([{"role": "user", "content": "hi"}])

    assert "reasoning" not in kwargs
    assert "include" not in kwargs
    assert "prompt_cache_key" not in kwargs


def test_run_codex_stream_retries_when_completed_event_missing(monkeypatch):
    """A missing `response.completed` event triggers one stream retry."""
    agent = _build_agent(monkeypatch)
    calls = {"stream": 0}

    def _fake_stream(**kwargs):
        calls["stream"] += 1
        if calls["stream"] == 1:
            return _FakeResponsesStream(
                final_error=RuntimeError("Didn't receive a `response.completed` event.")
            )
        return _FakeResponsesStream(final_response=_codex_message_response("stream ok"))

    agent.client = SimpleNamespace(
        responses=SimpleNamespace(
            stream=_fake_stream,
            create=lambda **kwargs: _codex_message_response("fallback"),
        )
    )

    response = agent._run_codex_stream(_codex_request_kwargs())
    assert calls["stream"] == 2
    assert response.output[0].content[0].text == "stream ok"


def test_run_codex_stream_falls_back_to_create_after_stream_completion_error(monkeypatch):
    """When both stream attempts fail, _run_codex_stream falls back to
    a non-streaming responses.create call."""
    agent = _build_agent(monkeypatch)
    calls = {"stream": 0, "create": 0}

    def _fake_stream(**kwargs):
        calls["stream"] += 1
        return _FakeResponsesStream(
            final_error=RuntimeError("Didn't receive a `response.completed` event.")
        )

    def _fake_create(**kwargs):
        calls["create"] += 1
        return _codex_message_response("create fallback ok")

    agent.client = SimpleNamespace(
        responses=SimpleNamespace(
            stream=_fake_stream,
            create=_fake_create,
        )
    )

    response = agent._run_codex_stream(_codex_request_kwargs())
    assert calls["stream"] == 2
    assert calls["create"] == 1
    assert response.output[0].content[0].text == "create fallback ok"


def test_run_codex_stream_fallback_parses_create_stream_events(monkeypatch):
    """The create fallback may itself stream (stream=True); events are parsed
    up to response.completed and the stream is closed."""
    agent = _build_agent(monkeypatch)
    calls = {"stream": 0, "create": 0}
    create_stream = _FakeCreateStream(
        [
            SimpleNamespace(type="response.created"),
            SimpleNamespace(type="response.in_progress"),
            SimpleNamespace(type="response.completed", response=_codex_message_response("streamed create ok")),
        ]
    )

    def _fake_stream(**kwargs):
        calls["stream"] += 1
        return _FakeResponsesStream(
            final_error=RuntimeError("Didn't receive a `response.completed` event.")
        )

    def _fake_create(**kwargs):
        calls["create"] += 1
        assert kwargs.get("stream") is True
        return create_stream

    agent.client = SimpleNamespace(
        responses=SimpleNamespace(
            stream=_fake_stream,
            create=_fake_create,
        )
    )

    response = agent._run_codex_stream(_codex_request_kwargs())
    assert calls["stream"] == 2
    assert calls["create"] == 1
    assert create_stream.closed is True
    assert response.output[0].content[0].text == "streamed create ok"


def test_run_conversation_codex_plain_text(monkeypatch):
    """A single plain-text Codex response completes the conversation."""
    agent = _build_agent(monkeypatch)
    monkeypatch.setattr(agent, "_interruptible_api_call", lambda api_kwargs: _codex_message_response("OK"))

    result = agent.run_conversation("Say OK")

    assert result["completed"] is True
    assert result["final_response"] == "OK"
    assert result["messages"][-1]["role"] == "assistant"
    assert result["messages"][-1]["content"] == "OK"


def test_run_conversation_codex_empty_output_with_output_text(monkeypatch):
    """Regression: empty response.output + valid output_text should succeed,
    not trigger retry/fallback. The validation stage must defer to
    _normalize_codex_response which synthesizes output from output_text."""
    agent = _build_agent(monkeypatch)

    def _empty_output_response(api_kwargs):
        return SimpleNamespace(
            output=[],
            output_text="Hello from Codex",
            usage=SimpleNamespace(input_tokens=5, output_tokens=3, total_tokens=8),
            status="completed",
            model="gpt-5-codex",
        )

    monkeypatch.setattr(agent, "_interruptible_api_call", _empty_output_response)

    result = agent.run_conversation("Say hello")

    assert result["completed"] is True
    assert result["final_response"] == "Hello from Codex"


def test_run_conversation_codex_empty_output_no_output_text_retries(monkeypatch):
    """When both output and output_text are empty, validation should
    correctly mark the response as invalid and trigger retry."""
    agent = _build_agent(monkeypatch)
    calls = {"api": 0}

    def _fake_api_call(api_kwargs):
        calls["api"] += 1
        if calls["api"] == 1:
            return SimpleNamespace(
                output=[],
                output_text=None,
                usage=SimpleNamespace(input_tokens=5, output_tokens=3, total_tokens=8),
                status="completed",
                model="gpt-5-codex",
            )
        return _codex_message_response("Recovered")

    monkeypatch.setattr(agent, "_interruptible_api_call", _fake_api_call)

    result = agent.run_conversation("Say hello")

    assert calls["api"] >= 2
    assert result["completed"] is True
    assert result["final_response"] == "Recovered"


def test_run_conversation_codex_refreshes_after_401_and_retries(monkeypatch):
    """A 401 triggers exactly one forced Codex credential refresh, then retry."""
    agent = _build_agent(monkeypatch)
    calls = {"api": 0, "refresh": 0}

    class _UnauthorizedError(RuntimeError):
        def __init__(self):
            super().__init__("Error code: 401 - unauthorized")
            self.status_code = 401

    def _fake_api_call(api_kwargs):
        calls["api"] += 1
        if calls["api"] == 1:
            raise _UnauthorizedError()
        return _codex_message_response("Recovered after refresh")

    def _fake_refresh(*, force=True):
        calls["refresh"] += 1
        assert force is True
        return True

    monkeypatch.setattr(agent, "_interruptible_api_call", _fake_api_call)
    monkeypatch.setattr(agent, "_try_refresh_codex_client_credentials", _fake_refresh)

    result = agent.run_conversation("Say OK")

    assert calls["api"] == 2
    assert calls["refresh"] == 1
    assert result["completed"] is True
    assert result["final_response"] == "Recovered after refresh"


def test_run_conversation_copilot_refreshes_after_401_and_retries(monkeypatch):
    """Copilot mirror of the 401-refresh-retry flow."""
    agent = _build_copilot_agent(monkeypatch)
    calls = {"api": 0, "refresh": 0}

    class _UnauthorizedError(RuntimeError):
        def __init__(self):
            super().__init__("Error code: 401 - unauthorized")
            self.status_code = 401

    def _fake_api_call(api_kwargs):
        calls["api"] += 1
        if calls["api"] == 1:
            raise _UnauthorizedError()
        return _codex_message_response("Recovered after copilot refresh")

    def _fake_refresh():
        calls["refresh"] += 1
        return True

    monkeypatch.setattr(agent, "_interruptible_api_call", _fake_api_call)
    monkeypatch.setattr(agent, "_try_refresh_copilot_client_credentials", _fake_refresh)

    result = agent.run_conversation("Say OK")

    assert calls["api"] == 2
    assert calls["refresh"] == 1
    assert result["completed"] is True
    assert result["final_response"] == "Recovered after copilot refresh"


def test_try_refresh_codex_client_credentials_rebuilds_client(monkeypatch):
    """Refresh closes the old client and rebuilds OpenAI with new credentials."""
    agent = _build_agent(monkeypatch)
    closed = {"value": False}
    rebuilt = {"kwargs": None}

    class _ExistingClient:
        def close(self):
            closed["value"] = True

    class _RebuiltClient:
        pass

    def _fake_openai(**kwargs):
        rebuilt["kwargs"] = kwargs
        return _RebuiltClient()

    monkeypatch.setattr(
        "hermes_cli.auth.resolve_codex_runtime_credentials",
        lambda force_refresh=True: {
            "api_key": "new-codex-token",
            "base_url": "https://chatgpt.com/backend-api/codex",
        },
    )
    monkeypatch.setattr(run_agent, "OpenAI", _fake_openai)

    agent.client = _ExistingClient()
    ok = agent._try_refresh_codex_client_credentials(force=True)

    assert ok is True
    assert closed["value"] is True
    assert rebuilt["kwargs"]["api_key"] == "new-codex-token"
    assert rebuilt["kwargs"]["base_url"] == "https://chatgpt.com/backend-api/codex"
    assert isinstance(agent.client, _RebuiltClient)


def test_try_refresh_copilot_client_credentials_rebuilds_client(monkeypatch):
    """Copilot refresh rebuilds the client with the resolved token and the
    Copilot-Integration-Id header."""
    agent = _build_copilot_agent(monkeypatch)
    closed = {"value": False}
    rebuilt = {"kwargs": None}

    class _ExistingClient:
        def close(self):
            closed["value"] = True

    class _RebuiltClient:
        pass

    def _fake_openai(**kwargs):
        rebuilt["kwargs"] = kwargs
        return _RebuiltClient()

    monkeypatch.setattr(
        "hermes_cli.copilot_auth.resolve_copilot_token",
        lambda: ("gho_new_token", "GH_TOKEN"),
    )
    monkeypatch.setattr(run_agent, "OpenAI", _fake_openai)

    agent.client = _ExistingClient()
    ok = agent._try_refresh_copilot_client_credentials()

    assert ok is True
    assert closed["value"] is True
    assert rebuilt["kwargs"]["api_key"] == "gho_new_token"
    assert rebuilt["kwargs"]["base_url"] == "https://api.githubcopilot.com"
    assert rebuilt["kwargs"]["default_headers"]["Copilot-Integration-Id"] == "vscode-chat"
    assert isinstance(agent.client, _RebuiltClient)


def test_try_refresh_copilot_client_credentials_rebuilds_even_if_token_unchanged(monkeypatch):
    """Refresh rebuilds the client even when the resolved token is identical."""
    agent = _build_copilot_agent(monkeypatch)
    rebuilt = {"count": 0}

    class _RebuiltClient:
        pass

    def _fake_openai(**kwargs):
        rebuilt["count"] += 1
        return _RebuiltClient()

    monkeypatch.setattr(
        "hermes_cli.copilot_auth.resolve_copilot_token",
        lambda: ("gh-token", "gh auth token"),
    )
    monkeypatch.setattr(run_agent, "OpenAI", _fake_openai)

    ok = agent._try_refresh_copilot_client_credentials()

    assert ok is True
    assert rebuilt["count"] == 1


def test_run_conversation_codex_tool_round_trip(monkeypatch):
    """tool_call response -> tool execution -> final message completes."""
    agent = _build_agent(monkeypatch)
    responses = [_codex_tool_call_response(), _codex_message_response("done")]
    monkeypatch.setattr(agent, "_interruptible_api_call", lambda api_kwargs: responses.pop(0))

    def _fake_execute_tool_calls(assistant_message, messages, effective_task_id):
        for call in assistant_message.tool_calls:
            messages.append(
                {
                    "role": "tool",
                    "tool_call_id": call.id,
                    "content": '{"ok":true}',
                }
            )

    monkeypatch.setattr(agent, "_execute_tool_calls", _fake_execute_tool_calls)

    result = agent.run_conversation("run a command")

    assert result["completed"] is True
    assert result["final_response"] == "done"
    assert any(msg.get("tool_calls") for msg in result["messages"] if msg.get("role") == "assistant")
    assert any(msg.get("role") == "tool" and msg.get("tool_call_id") == "call_1" for msg in result["messages"])


def test_chat_messages_to_responses_input_uses_call_id_for_function_call(monkeypatch):
    """Adapter emits call_id (not id) on function_call/function_call_output items."""
    agent = _build_agent(monkeypatch)
    from agent.codex_responses_adapter import _chat_messages_to_responses_input
    items = _chat_messages_to_responses_input(
        [
            {"role": "user", "content": "Run terminal"},
            {
                "role": "assistant",
                "content": "",
                "tool_calls": [
                    {
                        "id": "call_abc123",
                        "type": "function",
                        "function": {"name": "terminal", "arguments": "{}"},
                    }
                ],
            },
            {"role": "tool", "tool_call_id": "call_abc123", "content": '{"ok":true}'},
        ]
    )

    function_call = next(item for item in items if item.get("type") == "function_call")
    function_output = next(item for item in items if item.get("type") == "function_call_output")

    assert function_call["call_id"] == "call_abc123"
    assert "id" not in function_call
    assert function_output["call_id"] == "call_abc123"


def test_chat_messages_to_responses_input_accepts_call_pipe_fc_ids(monkeypatch):
    """Composite "call_x|fc_x" ids are split; only the call_ part is kept."""
    agent = _build_agent(monkeypatch)
    from agent.codex_responses_adapter import _chat_messages_to_responses_input
    items = _chat_messages_to_responses_input(
        [
            {"role": "user", "content": "Run terminal"},
            {
                "role": "assistant",
                "content": "",
                "tool_calls": [
                    {
                        "id": "call_pair123|fc_pair123",
                        "type": "function",
                        "function": {"name": "terminal", "arguments": "{}"},
                    }
                ],
            },
            {"role": "tool", "tool_call_id": "call_pair123|fc_pair123", "content": '{"ok":true}'},
        ]
    )

    function_call = next(item for item in items if item.get("type") == "function_call")
    function_output = next(item for item in items if item.get("type") == "function_call_output")

    assert function_call["call_id"] == "call_pair123"
    assert "id" not in function_call
    assert function_output["call_id"] == "call_pair123"


def test_preflight_codex_api_kwargs_strips_optional_function_call_id(monkeypatch):
    """Preflight drops the optional `id` field from function_call items,
    keeping call_id."""
    agent = _build_agent(monkeypatch)
    from agent.codex_responses_adapter import _preflight_codex_api_kwargs
    preflight = _preflight_codex_api_kwargs(
        {
            "model": "gpt-5-codex",
            "instructions": "You are Hermes.",
            "input": [
                {"role": "user", "content": "hi"},
                {
                    "type": "function_call",
                    "id": "call_bad",
                    "call_id": "call_good",
                    "name": "terminal",
                    "arguments": "{}",
                },
            ],
            "tools": [],
            "store": False,
        }
    )

    fn_call = next(item for item in preflight["input"] if item.get("type") == "function_call")
    assert fn_call["call_id"] == "call_good"
    assert "id" not in fn_call


def test_preflight_codex_api_kwargs_rejects_function_call_output_without_call_id(monkeypatch):
    """Preflight raises when a function_call_output item lacks call_id."""
    agent = _build_agent(monkeypatch)

    with pytest.raises(ValueError, match="function_call_output is missing call_id"):
        from agent.codex_responses_adapter import _preflight_codex_api_kwargs
        _preflight_codex_api_kwargs(
            {
                "model": "gpt-5-codex",
                "instructions": "You are Hermes.",
                "input": [{"type": "function_call_output", "output": "{}"}],
                "tools": [],
                "store": False,
            }
        )


def test_preflight_codex_api_kwargs_rejects_unsupported_request_fields(monkeypatch):
    """Preflight rejects request kwargs outside the supported field set."""
    agent = _build_agent(monkeypatch)
    kwargs = _codex_request_kwargs()
    kwargs["some_unknown_field"] = "value"

    with pytest.raises(ValueError, match="unsupported field"):
        from agent.codex_responses_adapter import _preflight_codex_api_kwargs
        _preflight_codex_api_kwargs(kwargs)


def test_preflight_codex_api_kwargs_allows_reasoning_and_temperature(monkeypatch):
    """reasoning/include/temperature/max_output_tokens pass preflight unchanged."""
    agent = _build_agent(monkeypatch)
    kwargs = _codex_request_kwargs()
    kwargs["reasoning"] = {"effort": "high", "summary": "auto"}
    kwargs["include"] = ["reasoning.encrypted_content"]
    kwargs["temperature"] = 0.7
    kwargs["max_output_tokens"] = 4096

    from agent.codex_responses_adapter import _preflight_codex_api_kwargs
    result = _preflight_codex_api_kwargs(kwargs)
    assert result["reasoning"] == {"effort": "high", "summary": "auto"}
    assert result["include"] == ["reasoning.encrypted_content"]
    assert result["temperature"] == 0.7
    assert result["max_output_tokens"] == 4096


def test_preflight_codex_api_kwargs_allows_service_tier(monkeypatch):
    """service_tier is an allowed request field."""
    agent = _build_agent(monkeypatch)
    kwargs = _codex_request_kwargs()
    kwargs["service_tier"] = "priority"

    from agent.codex_responses_adapter import _preflight_codex_api_kwargs
    result = _preflight_codex_api_kwargs(kwargs)
    assert result["service_tier"] == "priority"


def test_run_conversation_codex_replay_payload_keeps_call_id(monkeypatch):
    """The second request's replay input keeps call_id on both the
    function_call and its function_call_output, without a stray id."""
    agent = _build_agent(monkeypatch)
    responses = [_codex_tool_call_response(), _codex_message_response("done")]
    requests = []

    def _fake_api_call(api_kwargs):
        requests.append(api_kwargs)
        return responses.pop(0)

    monkeypatch.setattr(agent, "_interruptible_api_call", _fake_api_call)

    def _fake_execute_tool_calls(assistant_message, messages, effective_task_id):
        for call in assistant_message.tool_calls:
            messages.append(
                {
                    "role": "tool",
                    "tool_call_id": call.id,
                    "content": '{"ok":true}',
                }
            )

    monkeypatch.setattr(agent, "_execute_tool_calls", _fake_execute_tool_calls)

    result = agent.run_conversation("run a command")

    assert result["completed"] is True
    assert result["final_response"] == "done"
    assert len(requests) >= 2

    replay_input = requests[1]["input"]
    function_call = next(item for item in replay_input if item.get("type") == "function_call")
    function_output = next(item for item in replay_input if item.get("type") == "function_call_output")
    assert function_call["call_id"] == "call_1"
    assert "id" not in function_call
    assert function_output["call_id"] == "call_1"


def test_run_conversation_codex_continues_after_incomplete_interim_message(monkeypatch):
    """An in_progress interim message is recorded (finish_reason="incomplete")
    and the loop continues through a tool call to the final answer."""
    agent = _build_agent(monkeypatch)
    responses = [
        _codex_incomplete_message_response("I'll inspect the repo structure first."),
        _codex_tool_call_response(),
        _codex_message_response("Architecture summary complete."),
    ]
    monkeypatch.setattr(agent, "_interruptible_api_call", lambda api_kwargs: responses.pop(0))

    def _fake_execute_tool_calls(assistant_message, messages, effective_task_id):
        for call in assistant_message.tool_calls:
            messages.append(
                {
                    "role": "tool",
                    "tool_call_id": call.id,
                    "content": '{"ok":true}',
                }
            )

    monkeypatch.setattr(agent, "_execute_tool_calls", _fake_execute_tool_calls)

    result = agent.run_conversation("analyze repo")

    assert result["completed"] is True
    assert result["final_response"] == "Architecture summary complete."
    assert any(
        msg.get("role") == "assistant"
        and msg.get("finish_reason") == "incomplete"
        and "inspect the repo structure" in (msg.get("content") or "")
        for msg in result["messages"]
    )
    assert any(msg.get("role") == "tool" and msg.get("tool_call_id") == "call_1" for msg in result["messages"])


def test_normalize_codex_response_marks_commentary_only_message_as_incomplete(monkeypatch):
    """A commentary-phase-only message normalizes to finish_reason="incomplete"."""
    agent = _build_agent(monkeypatch)
    from agent.codex_responses_adapter import _normalize_codex_response
    assistant_message, finish_reason = _normalize_codex_response(
        _codex_commentary_message_response("I'll inspect the repository first.")
    )

    assert finish_reason == "incomplete"
    assert "inspect the repository" in (assistant_message.content or "")


def test_normalize_codex_response_preserves_message_status_for_replay(monkeypatch):
    """Incomplete Codex output messages must not be replayed as completed."""
    agent = _build_agent(monkeypatch)
    from agent.codex_responses_adapter import _normalize_codex_response

    response = SimpleNamespace(
        output=[
            SimpleNamespace(
                type="message",
                id="msg_partial",
                phase="commentary",
                status="in_progress",
                content=[SimpleNamespace(type="output_text", text="Still working...")],
            )
        ],
        usage=SimpleNamespace(input_tokens=4, output_tokens=2, total_tokens=6),
        status="in_progress",
        model="gpt-5-codex",
    )

    assistant_message, finish_reason = _normalize_codex_response(response)

    assert finish_reason == "incomplete"
    assert assistant_message.codex_message_items[0]["id"] == "msg_partial"
    assert assistant_message.codex_message_items[0]["status"] == "in_progress"


def test_normalize_codex_response_detects_leaked_tool_call_text(monkeypatch):
    """Harmony-style `to=functions.foo` leaked into assistant content with no
    structured function_call items must be treated as incomplete so the
    continuation path can re-elicit a proper tool call. This is the
    Taiwan-embassy-email (Discord bug report) failure mode: child agent
    produces a confident-looking summary, tool_trace is empty because no
    tools actually ran, parent can't audit the claim.
    """
    agent = _build_agent(monkeypatch)
    from agent.codex_responses_adapter import _normalize_codex_response

    leaked_content = (
        "I'll check the official page directly.\n"
        "to=functions.exec_command {\"cmd\": \"curl https://example.test\"}\n"
        "assistant to=functions.exec_command {\"stdout\": \"mailto:foo@example.test\"}\n"
        "Extracted: foo@example.test"
    )
    response = SimpleNamespace(
        output=[
            SimpleNamespace(
                type="message",
                status="completed",
                content=[SimpleNamespace(type="output_text", text=leaked_content)],
            )
        ],
        usage=SimpleNamespace(input_tokens=4, output_tokens=2, total_tokens=6),
        status="completed",
        model="gpt-5.4",
    )

    assistant_message, finish_reason = _normalize_codex_response(response)

    assert finish_reason == "incomplete"
    # Content is scrubbed so the parent never surfaces the leaked text as a
    # summary. tool_calls stays empty because no structured function_call
    # item existed.
    assert (assistant_message.content or "") == ""
    assert assistant_message.tool_calls == []


def test_normalize_codex_response_ignores_tool_call_text_when_real_tool_call_present(monkeypatch):
    """If the model emitted BOTH a structured function_call AND some text that
    happens to contain `to=functions.*` (unlikely but possible), trust the
    structured call — don't wipe content that came alongside a real tool use.
    """
    agent = _build_agent(monkeypatch)
    from agent.codex_responses_adapter import _normalize_codex_response

    response = SimpleNamespace(
        output=[
            SimpleNamespace(
                type="message",
                status="completed",
                content=[SimpleNamespace(
                    type="output_text",
                    text="Running the command via to=functions.exec_command now.",
                )],
            ),
            SimpleNamespace(
                type="function_call",
                id="fc_1",
                call_id="call_1",
                name="terminal",
                arguments="{}",
            ),
        ],
        usage=SimpleNamespace(input_tokens=4, output_tokens=2, total_tokens=6),
        status="completed",
        model="gpt-5.4",
    )

    assistant_message, finish_reason = _normalize_codex_response(response)

    assert finish_reason == "tool_calls"
    assert assistant_message.tool_calls  # real call preserved
    assert "Running the command" in (assistant_message.content or "")


def test_normalize_codex_response_no_leak_passes_through(monkeypatch):
    """Sanity: normal assistant content that doesn't contain the leak pattern
    is returned verbatim with finish_reason=stop."""
    agent = _build_agent(monkeypatch)
    from agent.codex_responses_adapter import _normalize_codex_response

    response = SimpleNamespace(
        output=[
            SimpleNamespace(
                type="message",
                status="completed",
                content=[SimpleNamespace(
                    type="output_text",
                    text="Here is the answer with no leak.",
                )],
            )
        ],
        usage=SimpleNamespace(input_tokens=4, output_tokens=2, total_tokens=6),
        status="completed",
        model="gpt-5.4",
    )

    assistant_message, finish_reason = _normalize_codex_response(response)

    assert finish_reason == "stop"
    assert assistant_message.content == "Here is the answer with no leak."
    assert assistant_message.tool_calls == []


def test_interim_commentary_is_not_marked_already_streamed_without_callbacks(monkeypatch):
    """Without a stream_delta_callback, interim text is NOT flagged as
    already streamed."""
    agent = _build_agent(monkeypatch)
    observed = {}

    agent._fire_stream_delta("short version: yes")
    agent.interim_assistant_callback = lambda text, *, already_streamed=False: observed.update(
        {"text": text, "already_streamed": already_streamed}
    )

    agent._emit_interim_assistant_message({"role": "assistant", "content": "short version: yes"})

    assert observed == {
        "text": "short version: yes",
        "already_streamed": False,
    }


def test_interim_commentary_is_not_marked_already_streamed_when_stream_callback_fails(monkeypatch):
    """A raising stream_delta_callback must not mark text as already streamed."""
    agent = _build_agent(monkeypatch)
    observed = {}

    def failing_callback(_text):
        raise RuntimeError("display failed")

    agent.stream_delta_callback = failing_callback
    agent._fire_stream_delta("short version: yes")
    agent.interim_assistant_callback = lambda text, *, already_streamed=False: observed.update(
        {"text": text, "already_streamed": already_streamed}
    )

    agent._emit_interim_assistant_message({"role": "assistant", "content": "short version: yes"})

    assert observed == {
        "text": "short version: yes",
        "already_streamed": False,
    }


def test_interim_commentary_preserves_assistant_content(monkeypatch):
    """Interim commentary must not silently mutate assistant text containing
    literal <memory-context> markers — that's legitimate model output (docs,
    code).
Streaming-path leak prevention happens delta-by-delta upstream.""" 1122 agent = _build_agent(monkeypatch) 1123 observed = {} 1124 agent.interim_assistant_callback = lambda text, *, already_streamed=False: observed.update( 1125 {"text": text, "already_streamed": already_streamed} 1126 ) 1127 1128 content = ( 1129 "<memory-context>\n" 1130 "[System note: The following is recalled memory context, NOT new user input. Treat as informational background data.]\n\n" 1131 "## Honcho Context\n" 1132 "stale memory\n" 1133 "</memory-context>\n\n" 1134 "I'll inspect the repo structure first." 1135 ) 1136 1137 agent._emit_interim_assistant_message({"role": "assistant", "content": content}) 1138 1139 assert "<memory-context>" in observed["text"] 1140 assert "I'll inspect the repo structure first." in observed["text"] 1141 1142 1143 def test_stream_delta_strips_leaked_memory_context(monkeypatch): 1144 agent = _build_agent(monkeypatch) 1145 observed = [] 1146 agent.stream_delta_callback = observed.append 1147 1148 leaked = ( 1149 "<memory-context>\n" 1150 "[System note: The following is recalled memory context, NOT new user input. Treat as informational background data.]\n\n" 1151 "## Honcho Context\n" 1152 "stale memory\n" 1153 "</memory-context>\n\n" 1154 "Visible answer" 1155 ) 1156 1157 agent._fire_stream_delta(leaked) 1158 1159 assert observed == ["Visible answer"] 1160 1161 1162 def test_stream_delta_strips_leaked_memory_context_across_chunks(monkeypatch): 1163 """Regression for #5719 — the real streaming case. 1164 1165 Providers typically emit 1-80 char chunks, so the memory-context open 1166 tag, system-note line, payload, and close tag each arrive in separate 1167 deltas. The per-delta sanitize_context() regex cannot survive that 1168 — only a stateful scrubber can. None of the payload, system-note 1169 text, or "## Honcho Context" header may reach the delta callback. 
1170 """ 1171 agent = _build_agent(monkeypatch) 1172 observed = [] 1173 agent.stream_delta_callback = observed.append 1174 1175 deltas = [ 1176 "<memory-context>\n[System note: The following", 1177 " is recalled memory context, NOT new user input. ", 1178 "Treat as informational background data.]\n\n", 1179 "## Honcho Context\n", 1180 "stale memory about eri\n", 1181 "</memory-context>\n\n", 1182 "Visible answer", 1183 ] 1184 for d in deltas: 1185 agent._fire_stream_delta(d) 1186 1187 combined = "".join(observed) 1188 assert "Visible answer" in combined 1189 # None of the leaked payload may surface. 1190 assert "System note" not in combined 1191 assert "Honcho Context" not in combined 1192 assert "stale memory" not in combined 1193 assert "<memory-context>" not in combined 1194 assert "</memory-context>" not in combined 1195 1196 1197 def test_stream_delta_scrubber_resets_between_turns(monkeypatch): 1198 """An unterminated span from a prior turn must not taint the next turn.""" 1199 agent = _build_agent(monkeypatch) 1200 1201 # Simulate a hung span carried over — directly populate the scrubber. 1202 agent._stream_context_scrubber.feed("pre <memory-context>leaked") 1203 1204 # Normally run_conversation() resets the scrubber at turn start. 1205 agent._stream_context_scrubber.reset() 1206 1207 observed = [] 1208 agent.stream_delta_callback = observed.append 1209 agent._fire_stream_delta("clean new turn text") 1210 assert "".join(observed) == "clean new turn text" 1211 1212 1213 def test_stream_delta_preserves_mid_stream_leading_newlines(monkeypatch): 1214 """Mid-stream leading newlines must survive — they are legitimate 1215 markdown (lists, code fences, paragraph breaks). Stripping them 1216 based on chunk boundaries silently breaks formatting. 1217 1218 Only the very first delta of a stream gets leading-newlines stripped 1219 (so stale provider preamble doesn't leak); after that, deltas are 1220 emitted verbatim. 
1221 """ 1222 agent = _build_agent(monkeypatch) 1223 observed = [] 1224 agent.stream_delta_callback = observed.append 1225 1226 # First delta delivers text — strips its own leading "\n" once. 1227 agent._fire_stream_delta("\nHere is a list:") 1228 # Second delta starts with "\n- item" — must NOT be stripped. 1229 agent._fire_stream_delta("\n- first") 1230 agent._fire_stream_delta("\n- second") 1231 1232 combined = "".join(observed) 1233 assert combined == "Here is a list:\n- first\n- second" 1234 1235 1236 def test_stream_delta_preserves_code_fence_newlines(monkeypatch): 1237 """Code blocks span multiple deltas. A "\\n```python\\n" boundary 1238 is the canonical case where stripping leading newlines corrupts output.""" 1239 agent = _build_agent(monkeypatch) 1240 observed = [] 1241 agent.stream_delta_callback = observed.append 1242 1243 agent._fire_stream_delta("Here is the code:") 1244 agent._fire_stream_delta("\n```python\n") 1245 agent._fire_stream_delta("print('hi')\n") 1246 agent._fire_stream_delta("```\n") 1247 1248 combined = "".join(observed) 1249 assert "```python\n" in combined 1250 assert combined.startswith("Here is the code:\n```python\n") 1251 1252 1253 def test_run_conversation_codex_continues_after_commentary_phase_message(monkeypatch): 1254 agent = _build_agent(monkeypatch) 1255 responses = [ 1256 _codex_commentary_message_response("I'll inspect the repo structure first."), 1257 _codex_tool_call_response(), 1258 _codex_message_response("Architecture summary complete."), 1259 ] 1260 monkeypatch.setattr(agent, "_interruptible_api_call", lambda api_kwargs: responses.pop(0)) 1261 1262 def _fake_execute_tool_calls(assistant_message, messages, effective_task_id): 1263 for call in assistant_message.tool_calls: 1264 messages.append( 1265 { 1266 "role": "tool", 1267 "tool_call_id": call.id, 1268 "content": '{"ok":true}', 1269 } 1270 ) 1271 1272 monkeypatch.setattr(agent, "_execute_tool_calls", _fake_execute_tool_calls) 1273 1274 result = 
agent.run_conversation("analyze repo") 1275 1276 assert result["completed"] is True 1277 assert result["final_response"] == "Architecture summary complete." 1278 assert any( 1279 msg.get("role") == "assistant" 1280 and msg.get("finish_reason") == "incomplete" 1281 and "inspect the repo structure" in (msg.get("content") or "") 1282 for msg in result["messages"] 1283 ) 1284 assert any(msg.get("role") == "tool" and msg.get("tool_call_id") == "call_1" for msg in result["messages"]) 1285 1286 1287 def test_run_conversation_codex_continues_after_ack_stop_message(monkeypatch): 1288 agent = _build_agent(monkeypatch) 1289 responses = [ 1290 _codex_ack_message_response( 1291 "Absolutely — I can do that. I'll inspect ~/openclaw-studio and report back with a walkthrough." 1292 ), 1293 _codex_tool_call_response(), 1294 _codex_message_response("Architecture summary complete."), 1295 ] 1296 monkeypatch.setattr(agent, "_interruptible_api_call", lambda api_kwargs: responses.pop(0)) 1297 1298 def _fake_execute_tool_calls(assistant_message, messages, effective_task_id): 1299 for call in assistant_message.tool_calls: 1300 messages.append( 1301 { 1302 "role": "tool", 1303 "tool_call_id": call.id, 1304 "content": '{"ok":true}', 1305 } 1306 ) 1307 1308 monkeypatch.setattr(agent, "_execute_tool_calls", _fake_execute_tool_calls) 1309 1310 result = agent.run_conversation("look into ~/openclaw-studio and tell me how it works") 1311 1312 assert result["completed"] is True 1313 assert result["final_response"] == "Architecture summary complete." 1314 assert any( 1315 msg.get("role") == "assistant" 1316 and msg.get("finish_reason") == "incomplete" 1317 and "inspect ~/openclaw-studio" in (msg.get("content") or "") 1318 for msg in result["messages"] 1319 ) 1320 assert any( 1321 msg.get("role") == "user" 1322 and "Continue now. 
Execute the required tool calls" in (msg.get("content") or "") 1323 for msg in result["messages"] 1324 ) 1325 assert any(msg.get("role") == "tool" and msg.get("tool_call_id") == "call_1" for msg in result["messages"]) 1326 1327 1328 def test_run_conversation_codex_continues_after_ack_for_directory_listing_prompt(monkeypatch): 1329 agent = _build_agent(monkeypatch) 1330 responses = [ 1331 _codex_ack_message_response( 1332 "I'll check what's in the current directory and call out 3 notable items." 1333 ), 1334 _codex_tool_call_response(), 1335 _codex_message_response("Directory summary complete."), 1336 ] 1337 monkeypatch.setattr(agent, "_interruptible_api_call", lambda api_kwargs: responses.pop(0)) 1338 1339 def _fake_execute_tool_calls(assistant_message, messages, effective_task_id): 1340 for call in assistant_message.tool_calls: 1341 messages.append( 1342 { 1343 "role": "tool", 1344 "tool_call_id": call.id, 1345 "content": '{"ok":true}', 1346 } 1347 ) 1348 1349 monkeypatch.setattr(agent, "_execute_tool_calls", _fake_execute_tool_calls) 1350 1351 result = agent.run_conversation("look at current directory and list 3 notable things") 1352 1353 assert result["completed"] is True 1354 assert result["final_response"] == "Directory summary complete." 1355 assert any( 1356 msg.get("role") == "assistant" 1357 and msg.get("finish_reason") == "incomplete" 1358 and "current directory" in (msg.get("content") or "") 1359 for msg in result["messages"] 1360 ) 1361 assert any( 1362 msg.get("role") == "user" 1363 and "Continue now. 
Execute the required tool calls" in (msg.get("content") or "") 1364 for msg in result["messages"] 1365 ) 1366 assert any(msg.get("role") == "tool" and msg.get("tool_call_id") == "call_1" for msg in result["messages"]) 1367 1368 1369 def test_dump_api_request_debug_uses_responses_url(monkeypatch, tmp_path): 1370 """Debug dumps should show /responses URL when in codex_responses mode.""" 1371 import json 1372 agent = _build_agent(monkeypatch) 1373 agent.base_url = "http://127.0.0.1:9208/v1" 1374 agent.logs_dir = tmp_path 1375 1376 dump_file = agent._dump_api_request_debug(_codex_request_kwargs(), reason="preflight") 1377 1378 payload = json.loads(dump_file.read_text()) 1379 assert payload["request"]["url"] == "http://127.0.0.1:9208/v1/responses" 1380 1381 1382 def test_dump_api_request_debug_uses_chat_completions_url(monkeypatch, tmp_path): 1383 """Debug dumps should show /chat/completions URL for chat_completions mode.""" 1384 import json 1385 _patch_agent_bootstrap(monkeypatch) 1386 agent = run_agent.AIAgent( 1387 model="gpt-4o", 1388 base_url="http://127.0.0.1:9208/v1", 1389 api_key="test-key", 1390 quiet_mode=True, 1391 max_iterations=1, 1392 skip_context_files=True, 1393 skip_memory=True, 1394 ) 1395 agent.logs_dir = tmp_path 1396 1397 dump_file = agent._dump_api_request_debug( 1398 {"model": "gpt-4o", "messages": [{"role": "user", "content": "hi"}]}, 1399 reason="preflight", 1400 ) 1401 1402 payload = json.loads(dump_file.read_text()) 1403 assert payload["request"]["url"] == "http://127.0.0.1:9208/v1/chat/completions" 1404 1405 1406 # --- Reasoning-only response tests (fix for empty content retry loop) --- 1407 1408 1409 def _codex_reasoning_only_response(*, encrypted_content="enc_abc123", summary_text="Thinking..."): 1410 """Codex response containing only reasoning items — no message text, no tool calls.""" 1411 return SimpleNamespace( 1412 output=[ 1413 SimpleNamespace( 1414 type="reasoning", 1415 id="rs_001", 1416 encrypted_content=encrypted_content, 1417 
summary=[SimpleNamespace(type="summary_text", text=summary_text)], 1418 status="completed", 1419 ) 1420 ], 1421 usage=SimpleNamespace(input_tokens=50, output_tokens=100, total_tokens=150), 1422 status="completed", 1423 model="gpt-5-codex", 1424 ) 1425 1426 1427 def test_normalize_codex_response_marks_reasoning_only_as_incomplete(monkeypatch): 1428 """A response with only reasoning items and no content should be 'incomplete', not 'stop'. 1429 1430 Without this fix, reasoning-only responses get finish_reason='stop' which 1431 sends them into the empty-content retry loop (3 retries then failure). 1432 """ 1433 agent = _build_agent(monkeypatch) 1434 from agent.codex_responses_adapter import _normalize_codex_response 1435 assistant_message, finish_reason = _normalize_codex_response( 1436 _codex_reasoning_only_response() 1437 ) 1438 1439 assert finish_reason == "incomplete" 1440 assert assistant_message.content == "" 1441 assert assistant_message.codex_reasoning_items is not None 1442 assert len(assistant_message.codex_reasoning_items) == 1 1443 assert assistant_message.codex_reasoning_items[0]["encrypted_content"] == "enc_abc123" 1444 1445 1446 def test_normalize_codex_response_reasoning_with_content_is_stop(monkeypatch): 1447 """If a response has both reasoning and message content, it should still be 'stop'.""" 1448 agent = _build_agent(monkeypatch) 1449 response = SimpleNamespace( 1450 output=[ 1451 SimpleNamespace( 1452 type="reasoning", 1453 id="rs_001", 1454 encrypted_content="enc_xyz", 1455 summary=[SimpleNamespace(type="summary_text", text="Thinking...")], 1456 status="completed", 1457 ), 1458 SimpleNamespace( 1459 type="message", 1460 content=[SimpleNamespace(type="output_text", text="Here is the answer.")], 1461 status="completed", 1462 ), 1463 ], 1464 usage=SimpleNamespace(input_tokens=50, output_tokens=100, total_tokens=150), 1465 status="completed", 1466 model="gpt-5-codex", 1467 ) 1468 from agent.codex_responses_adapter import _normalize_codex_response 1469 
assistant_message, finish_reason = _normalize_codex_response(response) 1470 1471 assert finish_reason == "stop" 1472 assert "Here is the answer" in assistant_message.content 1473 1474 1475 def test_run_conversation_codex_continues_after_reasoning_only_response(monkeypatch): 1476 """End-to-end: reasoning-only → final message should succeed, not hit retry loop.""" 1477 agent = _build_agent(monkeypatch) 1478 responses = [ 1479 _codex_reasoning_only_response(), 1480 _codex_message_response("The final answer is 42."), 1481 ] 1482 monkeypatch.setattr(agent, "_interruptible_api_call", lambda api_kwargs: responses.pop(0)) 1483 1484 result = agent.run_conversation("what is the answer?") 1485 1486 assert result["completed"] is True 1487 assert result["final_response"] == "The final answer is 42." 1488 # The reasoning-only turn should be in messages as an incomplete interim 1489 assert any( 1490 msg.get("role") == "assistant" 1491 and msg.get("finish_reason") == "incomplete" 1492 and msg.get("codex_reasoning_items") is not None 1493 for msg in result["messages"] 1494 ) 1495 1496 1497 def test_run_conversation_codex_preserves_encrypted_reasoning_in_interim(monkeypatch): 1498 """Encrypted codex_reasoning_items must be preserved in interim messages 1499 even when there is no visible reasoning text or content.""" 1500 agent = _build_agent(monkeypatch) 1501 # Response with encrypted reasoning but no human-readable summary 1502 reasoning_response = SimpleNamespace( 1503 output=[ 1504 SimpleNamespace( 1505 type="reasoning", 1506 id="rs_002", 1507 encrypted_content="enc_opaque_blob", 1508 summary=[], 1509 status="completed", 1510 ) 1511 ], 1512 usage=SimpleNamespace(input_tokens=50, output_tokens=100, total_tokens=150), 1513 status="completed", 1514 model="gpt-5-codex", 1515 ) 1516 responses = [ 1517 reasoning_response, 1518 _codex_message_response("Done thinking."), 1519 ] 1520 monkeypatch.setattr(agent, "_interruptible_api_call", lambda api_kwargs: responses.pop(0)) 1521 1522 
result = agent.run_conversation("think hard") 1523 1524 assert result["completed"] is True 1525 assert result["final_response"] == "Done thinking." 1526 # The interim message must have codex_reasoning_items preserved 1527 interim_msgs = [ 1528 msg for msg in result["messages"] 1529 if msg.get("role") == "assistant" 1530 and msg.get("finish_reason") == "incomplete" 1531 ] 1532 assert len(interim_msgs) >= 1 1533 assert interim_msgs[0].get("codex_reasoning_items") is not None 1534 assert interim_msgs[0]["codex_reasoning_items"][0]["encrypted_content"] == "enc_opaque_blob" 1535 1536 1537 def test_chat_messages_to_responses_input_reasoning_only_has_following_item(monkeypatch): 1538 """When converting a reasoning-only interim message to Responses API input, 1539 the reasoning items must be followed by an assistant message (even if empty) 1540 to satisfy the API's 'required following item' constraint.""" 1541 agent = _build_agent(monkeypatch) 1542 messages = [ 1543 {"role": "user", "content": "think hard"}, 1544 { 1545 "role": "assistant", 1546 "content": "", 1547 "reasoning": None, 1548 "finish_reason": "incomplete", 1549 "codex_reasoning_items": [ 1550 {"type": "reasoning", "id": "rs_001", "encrypted_content": "enc_abc", "summary": []}, 1551 ], 1552 }, 1553 ] 1554 from agent.codex_responses_adapter import _chat_messages_to_responses_input 1555 items = _chat_messages_to_responses_input(messages) 1556 1557 # Find the reasoning item 1558 reasoning_indices = [i for i, it in enumerate(items) if it.get("type") == "reasoning"] 1559 assert len(reasoning_indices) == 1 1560 ri_idx = reasoning_indices[0] 1561 1562 # There must be a following item after the reasoning 1563 assert ri_idx < len(items) - 1, "Reasoning item must not be the last item (missing_following_item)" 1564 following = items[ri_idx + 1] 1565 assert following.get("role") == "assistant" 1566 1567 1568 def test_codex_message_item_status_survives_conversion_and_preflight(monkeypatch): 1569 """Stored Codex assistant 
message statuses must survive replay normalization.""" 1570 agent = _build_agent(monkeypatch) 1571 from agent.codex_responses_adapter import ( 1572 _chat_messages_to_responses_input, 1573 _preflight_codex_input_items, 1574 ) 1575 1576 items = _chat_messages_to_responses_input([ 1577 { 1578 "role": "assistant", 1579 "content": "partial", 1580 "codex_message_items": [ 1581 { 1582 "type": "message", 1583 "role": "assistant", 1584 "status": "incomplete", 1585 "id": "msg_incomplete", 1586 "phase": "commentary", 1587 "content": [{"type": "output_text", "text": "partial"}], 1588 } 1589 ], 1590 } 1591 ]) 1592 replay_item = next(item for item in items if item.get("type") == "message") 1593 assert replay_item["status"] == "incomplete" 1594 1595 normalized = _preflight_codex_input_items([ 1596 { 1597 "type": "message", 1598 "role": "assistant", 1599 "status": "in_progress", 1600 "content": [{"type": "output_text", "text": "working"}], 1601 } 1602 ]) 1603 assert normalized[0]["status"] == "in_progress" 1604 1605 1606 def test_duplicate_detection_distinguishes_different_codex_reasoning(monkeypatch): 1607 """Two consecutive reasoning-only responses with different encrypted content 1608 must NOT be treated as duplicates.""" 1609 agent = _build_agent(monkeypatch) 1610 responses = [ 1611 # First reasoning-only response 1612 SimpleNamespace( 1613 output=[ 1614 SimpleNamespace( 1615 type="reasoning", id="rs_001", 1616 encrypted_content="enc_first", summary=[], status="completed", 1617 ) 1618 ], 1619 usage=SimpleNamespace(input_tokens=50, output_tokens=100, total_tokens=150), 1620 status="completed", model="gpt-5-codex", 1621 ), 1622 # Second reasoning-only response (different encrypted content) 1623 SimpleNamespace( 1624 output=[ 1625 SimpleNamespace( 1626 type="reasoning", id="rs_002", 1627 encrypted_content="enc_second", summary=[], status="completed", 1628 ) 1629 ], 1630 usage=SimpleNamespace(input_tokens=50, output_tokens=100, total_tokens=150), 1631 status="completed", 
model="gpt-5-codex", 1632 ), 1633 _codex_message_response("Final answer after thinking."), 1634 ] 1635 monkeypatch.setattr(agent, "_interruptible_api_call", lambda api_kwargs: responses.pop(0)) 1636 1637 result = agent.run_conversation("think very hard") 1638 1639 assert result["completed"] is True 1640 assert result["final_response"] == "Final answer after thinking." 1641 # Both reasoning-only interim messages should be in history (not collapsed) 1642 interim_msgs = [ 1643 msg for msg in result["messages"] 1644 if msg.get("role") == "assistant" 1645 and msg.get("finish_reason") == "incomplete" 1646 ] 1647 assert len(interim_msgs) == 2 1648 encrypted_contents = [ 1649 msg["codex_reasoning_items"][0]["encrypted_content"] 1650 for msg in interim_msgs 1651 ] 1652 assert "enc_first" in encrypted_contents 1653 assert "enc_second" in encrypted_contents 1654 1655 1656 def test_duplicate_detection_distinguishes_different_codex_message_items(monkeypatch): 1657 """Incomplete turns with new message ids/phases/statuses must not be collapsed.""" 1658 agent = _build_agent(monkeypatch) 1659 responses = [ 1660 SimpleNamespace( 1661 output=[ 1662 SimpleNamespace( 1663 type="message", 1664 id="msg_first", 1665 phase="commentary", 1666 status="in_progress", 1667 content=[SimpleNamespace(type="output_text", text="Still working...")], 1668 ) 1669 ], 1670 usage=SimpleNamespace(input_tokens=50, output_tokens=10, total_tokens=60), 1671 status="in_progress", 1672 model="gpt-5-codex", 1673 ), 1674 SimpleNamespace( 1675 output=[ 1676 SimpleNamespace( 1677 type="message", 1678 id="msg_second", 1679 phase="commentary", 1680 status="in_progress", 1681 content=[SimpleNamespace(type="output_text", text="Still working...")], 1682 ) 1683 ], 1684 usage=SimpleNamespace(input_tokens=50, output_tokens=10, total_tokens=60), 1685 status="in_progress", 1686 model="gpt-5-codex", 1687 ), 1688 _codex_message_response("Final answer after progress updates."), 1689 ] 1690 monkeypatch.setattr(agent, 
"_interruptible_api_call", lambda api_kwargs: responses.pop(0)) 1691 1692 result = agent.run_conversation("keep going") 1693 1694 assert result["completed"] is True 1695 interim_msgs = [ 1696 msg for msg in result["messages"] 1697 if msg.get("role") == "assistant" 1698 and msg.get("finish_reason") == "incomplete" 1699 ] 1700 assert len(interim_msgs) == 2 1701 assert [msg["codex_message_items"][0]["id"] for msg in interim_msgs] == [ 1702 "msg_first", 1703 "msg_second", 1704 ] 1705 assert all(msg["codex_message_items"][0]["status"] == "in_progress" for msg in interim_msgs) 1706 1707 1708 def test_chat_messages_to_responses_input_deduplicates_reasoning_ids(monkeypatch): 1709 """Duplicate reasoning item IDs across multi-turn incomplete responses 1710 must be deduplicated so the Responses API doesn't reject with HTTP 400.""" 1711 agent = _build_agent(monkeypatch) 1712 messages = [ 1713 {"role": "user", "content": "think hard"}, 1714 { 1715 "role": "assistant", 1716 "content": "", 1717 "codex_reasoning_items": [ 1718 {"type": "reasoning", "id": "rs_aaa", "encrypted_content": "enc_1"}, 1719 {"type": "reasoning", "id": "rs_bbb", "encrypted_content": "enc_2"}, 1720 ], 1721 }, 1722 { 1723 "role": "assistant", 1724 "content": "partial answer", 1725 "codex_reasoning_items": [ 1726 # rs_aaa is duplicated from the previous turn 1727 {"type": "reasoning", "id": "rs_aaa", "encrypted_content": "enc_1"}, 1728 {"type": "reasoning", "id": "rs_ccc", "encrypted_content": "enc_3"}, 1729 ], 1730 }, 1731 ] 1732 from agent.codex_responses_adapter import _chat_messages_to_responses_input 1733 items = _chat_messages_to_responses_input(messages) 1734 1735 reasoning_items = [it for it in items if it.get("type") == "reasoning"] 1736 # Dedup: rs_aaa appears in both turns but should only be emitted once. 1737 # 3 unique items total: enc_1 (from rs_aaa), enc_2 (rs_bbb), enc_3 (rs_ccc). 
1738 assert len(reasoning_items) == 3 1739 encrypted = [it["encrypted_content"] for it in reasoning_items] 1740 assert encrypted.count("enc_1") == 1 1741 assert "enc_2" in encrypted 1742 assert "enc_3" in encrypted 1743 # IDs must be stripped — with store=False the API 404s on id lookups. 1744 for it in reasoning_items: 1745 assert "id" not in it 1746 1747 1748 def test_preflight_codex_input_deduplicates_reasoning_ids(monkeypatch): 1749 """_preflight_codex_input_items should also deduplicate reasoning items by ID.""" 1750 agent = _build_agent(monkeypatch) 1751 raw_input = [ 1752 {"role": "user", "content": [{"type": "input_text", "text": "hello"}]}, 1753 {"type": "reasoning", "id": "rs_xyz", "encrypted_content": "enc_a"}, 1754 {"role": "assistant", "content": "ok"}, 1755 {"type": "reasoning", "id": "rs_xyz", "encrypted_content": "enc_a"}, 1756 {"type": "reasoning", "id": "rs_zzz", "encrypted_content": "enc_b"}, 1757 {"role": "assistant", "content": "done"}, 1758 ] 1759 from agent.codex_responses_adapter import _preflight_codex_input_items 1760 normalized = _preflight_codex_input_items(raw_input) 1761 1762 reasoning_items = [it for it in normalized if it.get("type") == "reasoning"] 1763 # rs_xyz duplicate should be collapsed to one item; rs_zzz kept. 1764 assert len(reasoning_items) == 2 1765 encrypted = [it["encrypted_content"] for it in reasoning_items] 1766 assert encrypted.count("enc_a") == 1 1767 assert "enc_b" in encrypted 1768 # IDs must be stripped — with store=False the API 404s on id lookups. 1769 for it in reasoning_items: 1770 assert "id" not in it