Cradicle Explorer

/ tests / run_agent / test_deepseek_v4_thinking_live.py
test_deepseek_v4_thinking_live.py
  1  """Live DeepSeek V4 thinking-mode tool-call replay smoke test.
  2  
  3  Opt-in only:
  4      HERMES_LIVE_TESTS=1 pytest tests/run_agent/test_deepseek_v4_thinking_live.py -q
  5  
  6  Requires DEEPSEEK_API_KEY in the process environment. The key is captured at
  7  module import time because tests/conftest.py intentionally removes credential
  8  environment variables before each test body runs.
  9  """
 10  
 11  from __future__ import annotations
 12  
 13  import json
 14  import os
 15  import sys
 16  from typing import Any
 17  
 18  import pytest
 19  
 20  
# Live tests are opt-in: they run only when HERMES_LIVE_TESTS is exactly "1".
LIVE = os.environ.get("HERMES_LIVE_TESTS") == "1"
# Captured at import time; per the module docstring, tests/conftest.py removes
# credential environment variables before each test body runs.
DEEPSEEK_KEY = os.environ.get("DEEPSEEK_API_KEY", "")
# Model ids the round-trip test is parametrized over.
LIVE_MODELS = ("deepseek-v4-flash", "deepseek-v4-pro")
# Base URL handed to the OpenAI client and to the stub agent.
LIVE_BASE_URL = "https://api.deepseek.com"

# Module-wide markers: every test in this file is skipped unless live mode is
# enabled and an API key was present at import time.
pytestmark = [
    pytest.mark.skipif(not LIVE, reason="live-only: set HERMES_LIVE_TESTS=1"),
    pytest.mark.skipif(not DEEPSEEK_KEY, reason="DEEPSEEK_API_KEY not configured"),
]

# The single function tool offered to the model in the first request.
TOOL_NAME = "lookup_ticket_status"
TOOLS = [
    {
        "type": "function",
        "function": {
            "name": TOOL_NAME,
            "description": "Return the status for a test ticket id.",
            "parameters": {
                "type": "object",
                "properties": {
                    "ticket_id": {
                        "type": "string",
                        "description": "The ticket id to look up.",
                    },
                },
                "required": ["ticket_id"],
                "additionalProperties": False,
            },
        },
    }
]
 52  
 53  
 54  def _thinking_kwargs() -> dict:
 55      return {
 56          "reasoning_effort": "high",
 57          "extra_body": {"thinking": {"type": "enabled"}},
 58      }
 59  
 60  
 61  def _jsonable(value: Any) -> Any:
 62      if hasattr(value, "model_dump"):
 63          return value.model_dump(mode="json")
 64      if isinstance(value, dict):
 65          return {k: _jsonable(v) for k, v in value.items()}
 66      if isinstance(value, list):
 67          return [_jsonable(v) for v in value]
 68      return value
 69  
 70  
 71  def _print_trace(label: str, value: Any) -> None:
 72      sys.__stdout__.write(f"\n--- {label} ---\n")
 73      sys.__stdout__.write(
 74          json.dumps(_jsonable(value), ensure_ascii=False, indent=2, sort_keys=True)
 75      )
 76      sys.__stdout__.write("\n")
 77      sys.__stdout__.flush()
 78  
 79  
 80  def _message_snapshot(message) -> dict:
 81      return {
 82          "content": getattr(message, "content", None),
 83          "reasoning": getattr(message, "reasoning", None),
 84          "reasoning_content": _raw_reasoning_content(message),
 85          "model_extra": getattr(message, "model_extra", None),
 86          "tool_calls": _jsonable(getattr(message, "tool_calls", None)),
 87      }
 88  
 89  
 90  def _make_live_client():
 91      from openai import OpenAI
 92  
 93      return OpenAI(api_key=DEEPSEEK_KEY, base_url=LIVE_BASE_URL)
 94  
 95  
 96  def _make_agent_for_message_building(model: str):
 97      from run_agent import AIAgent
 98  
 99      agent = object.__new__(AIAgent)
100      agent.provider = "deepseek"
101      agent.model = model
102      agent.base_url = LIVE_BASE_URL
103      agent.verbose_logging = False
104      agent.reasoning_callback = None
105      agent.stream_delta_callback = None
106      agent._stream_callback = None
107      return agent
108  
109  
110  def _raw_reasoning_content(message):
111      direct = getattr(message, "reasoning_content", None)
112      if direct is not None:
113          return direct
114      model_extra = getattr(message, "model_extra", None) or {}
115      if isinstance(model_extra, dict) and "reasoning_content" in model_extra:
116          return model_extra["reasoning_content"]
117      return None
118  
119  
@pytest.mark.parametrize("live_model", LIVE_MODELS)
def test_deepseek_v4_thinking_tool_call_replay_round_trip(live_model: str):
    """Hit DeepSeek twice and replay the assistant tool-call turn.

    The first request forces a tool call with thinking enabled. The second
    request replays that assistant message with content, reasoning_content,
    and tool_calls, then appends the tool result. DeepSeek accepting the
    second request is the live guardrail for the V4 thinking replay contract.

    Args:
        live_model: Model id supplied by parametrization over LIVE_MODELS.
    """

    client = _make_live_client()
    agent = _make_agent_for_message_building(live_model)

    # --- Turn 1: offer the tool with thinking enabled ---
    first_request = {
        "model": live_model,
        "messages": [
            {
                "role": "user",
                "content": (
                    "You must use the provided lookup_ticket_status tool "
                    "exactly once with ticket_id 'DS-4242'. Do not answer "
                    "directly."
                ),
            }
        ],
        "tools": TOOLS,
        "max_tokens": 1024,
        "timeout": 90,
        **_thinking_kwargs(),
    }
    _print_trace(f"{live_model} first request", first_request)
    first = client.chat.completions.create(**first_request)
    _print_trace(f"{live_model} first raw response", first)

    first_choice = first.choices[0]
    first_message = first_choice.message
    _print_trace(
        f"{live_model} first assistant message",
        {
            "finish_reason": first_choice.finish_reason,
            **_message_snapshot(first_message),
        },
    )
    # The model must have called the expected tool with JSON-object arguments.
    assert first_message.tool_calls, "DeepSeek did not return a tool call"
    first_tool_call = first_message.tool_calls[0]
    assert first_tool_call.function.name == TOOL_NAME
    assert isinstance(json.loads(first_tool_call.function.arguments or "{}"), dict)

    # An empty string is accepted here; only a missing value (None) fails.
    raw_reasoning_content = _raw_reasoning_content(first_message)
    assert raw_reasoning_content is not None, (
        "DeepSeek did not return reasoning_content; the thinking payload may "
        "not have been honored"
    )

    # --- Store the turn through the agent and check reasoning survives ---
    # Falls back to "tool_calls" when the API left finish_reason empty.
    stored_assistant = agent._build_assistant_message(
        first_message,
        first_choice.finish_reason or "tool_calls",
    )
    _print_trace(f"{live_model} stored assistant message", stored_assistant)
    assert stored_assistant["reasoning_content"] == raw_reasoning_content

    # Build the replayed assistant turn by hand; reasoning_content is not set
    # here but left to the agent helper below.
    replay_assistant = {
        "role": "assistant",
        "content": stored_assistant.get("content") or "",
        "tool_calls": stored_assistant["tool_calls"],
    }
    # NOTE(review): presumably copies reasoning_content from the stored message
    # onto replay_assistant; the assertion after transport conversion relies
    # on it being present.
    agent._copy_reasoning_content_for_api(stored_assistant, replay_assistant)
    _print_trace(f"{live_model} replay assistant message", replay_assistant)

    # Conversation for turn 2: the same user prompt, the replayed assistant
    # turn, and a canned tool result serialized as compact JSON.
    tool_call_id = stored_assistant["tool_calls"][0]["id"]
    messages = [
        {
            "role": "user",
            "content": (
                "You must use the provided lookup_ticket_status tool "
                "exactly once with ticket_id 'DS-4242'. Do not answer "
                "directly."
            ),
        },
        replay_assistant,
        {
            "role": "tool",
            "tool_call_id": tool_call_id,
            "content": json.dumps(
                {"ticket_id": "DS-4242", "status": "green", "source": "live-test"},
                separators=(",", ":"),
            ),
        },
    ]

    from agent.transports.chat_completions import ChatCompletionsTransport

    api_messages = ChatCompletionsTransport().convert_messages(messages)
    _print_trace(
        f"{live_model} second request messages after transport conversion",
        api_messages,
    )
    # After conversion the assistant turn must still carry reasoning_content,
    # and its tool call must not contain call_id / response_item_id keys.
    assert api_messages[1]["reasoning_content"] == raw_reasoning_content
    assert "call_id" not in api_messages[1]["tool_calls"][0]
    assert "response_item_id" not in api_messages[1]["tool_calls"][0]

    # --- Turn 2: send the replayed conversation (no "tools" key this time) ---
    second_request = {
        "model": live_model,
        "messages": api_messages,
        "max_tokens": 1024,
        "timeout": 90,
        **_thinking_kwargs(),
    }
    _print_trace(f"{live_model} second request", second_request)
    second = client.chat.completions.create(**second_request)
    _print_trace(f"{live_model} second raw response", second)
    _print_trace(
        f"{live_model} second assistant message",
        {
            "finish_reason": second.choices[0].finish_reason,
            **_message_snapshot(second.choices[0].message),
        },
    )

    # The reply must finish with "stop" and carry visible content or
    # reasoning text that is not blank.
    second_message = second.choices[0].message
    final_content = second_message.content or ""
    final_reasoning = _raw_reasoning_content(second_message) or ""
    assert second.choices[0].finish_reason == "stop"
    assert final_content.strip() or final_reasoning.strip(), (
        "DeepSeek returned neither visible content nor reasoning_content"
    )
242      assert second.choices[0].finish_reason == "stop"
243      assert final_content.strip() or final_reasoning.strip(), (
244          "DeepSeek returned neither visible content nor reasoning_content"
245      )