# test_deepseek_v4_thinking_live.py
"""Live DeepSeek V4 thinking-mode tool-call replay smoke test.

Opt-in only:
    HERMES_LIVE_TESTS=1 pytest tests/run_agent/test_deepseek_v4_thinking_live.py -q

Requires DEEPSEEK_API_KEY in the process environment. The key is captured at
module import time because tests/conftest.py intentionally removes credential
environment variables before each test body runs.
"""

from __future__ import annotations

import json
import os
import sys
from typing import Any

import pytest


LIVE = os.environ.get("HERMES_LIVE_TESTS") == "1"
DEEPSEEK_KEY = os.environ.get("DEEPSEEK_API_KEY", "")
LIVE_MODELS = ("deepseek-v4-flash", "deepseek-v4-pro")
LIVE_BASE_URL = "https://api.deepseek.com"

pytestmark = [
    pytest.mark.skipif(not LIVE, reason="live-only: set HERMES_LIVE_TESTS=1"),
    pytest.mark.skipif(not DEEPSEEK_KEY, reason="DEEPSEEK_API_KEY not configured"),
]

TOOL_NAME = "lookup_ticket_status"
TOOLS = [
    {
        "type": "function",
        "function": {
            "name": TOOL_NAME,
            "description": "Return the status for a test ticket id.",
            "parameters": {
                "type": "object",
                "properties": {
                    "ticket_id": {
                        "type": "string",
                        "description": "The ticket id to look up.",
                    },
                },
                "required": ["ticket_id"],
                "additionalProperties": False,
            },
        },
    }
]

# The user turn is sent in the first request and replayed verbatim in the
# second one; keeping a single definition guarantees both stay identical.
TICKET_ID = "DS-4242"
USER_PROMPT = (
    f"You must use the provided {TOOL_NAME} tool "
    f"exactly once with ticket_id '{TICKET_ID}'. Do not answer "
    "directly."
)

# Canned tool output returned for every tool call the model makes.
TOOL_RESULT = json.dumps(
    {"ticket_id": TICKET_ID, "status": "green", "source": "live-test"},
    separators=(",", ":"),
)


def _thinking_kwargs() -> dict[str, Any]:
    """Return the request kwargs that switch thinking mode on.

    A fresh dict is built on every call so callers can splat or mutate it
    without affecting later requests.
    """
    return {
        "reasoning_effort": "high",
        "extra_body": {"thinking": {"type": "enabled"}},
    }


def _jsonable(value: Any) -> Any:
    """Recursively convert *value* into something ``json.dumps`` can handle.

    Objects exposing ``model_dump`` are dumped with ``mode="json"``; dicts,
    lists and tuples are walked element by element (tuples come back as
    lists, which is how JSON would render them anyway). Anything else is
    returned unchanged.
    """
    if hasattr(value, "model_dump"):
        return value.model_dump(mode="json")
    if isinstance(value, dict):
        return {k: _jsonable(v) for k, v in value.items()}
    if isinstance(value, (list, tuple)):
        return [_jsonable(v) for v in value]
    return value


def _print_trace(label: str, value: Any) -> None:
    """Write a labelled, pretty-printed JSON dump of *value* for debugging.

    Output goes to ``sys.__stdout__`` (the interpreter's original stdout
    object rather than whatever ``sys.stdout`` currently points at). When that
    is ``None`` the helper falls back to ``sys.stdout`` and silently does
    nothing if neither stream exists, so tracing can never fail the test.
    """
    stream = sys.__stdout__ if sys.__stdout__ is not None else sys.stdout
    if stream is None:
        return
    rendered = json.dumps(
        _jsonable(value),
        ensure_ascii=False,
        indent=2,
        sort_keys=True,
        # Deliberate: this is a best-effort debug dump, so unknown objects are
        # shown via repr() instead of raising TypeError mid-test.
        default=repr,
    )
    stream.write(f"\n--- {label} ---\n{rendered}\n")
    stream.flush()


def _raw_reasoning_content(message):
    """Return the message's ``reasoning_content``, or ``None`` if absent.

    The direct attribute wins when it is not ``None``; otherwise the value is
    looked up in the message's ``model_extra`` mapping.
    """
    direct = getattr(message, "reasoning_content", None)
    if direct is not None:
        return direct
    model_extra = getattr(message, "model_extra", None) or {}
    if isinstance(model_extra, dict) and "reasoning_content" in model_extra:
        return model_extra["reasoning_content"]
    return None


def _message_snapshot(message) -> dict:
    """Collect the fields of an assistant message that are worth tracing.

    Every field is read with ``getattr(..., None)``, so a message lacking an
    attribute yields ``None`` for it instead of raising.
    """
    return {
        "content": getattr(message, "content", None),
        "reasoning": getattr(message, "reasoning", None),
        "reasoning_content": _raw_reasoning_content(message),
        "model_extra": getattr(message, "model_extra", None),
        "tool_calls": _jsonable(getattr(message, "tool_calls", None)),
    }


def _make_live_client():
    """Build an OpenAI SDK client pointed at the live DeepSeek endpoint."""
    # Imported lazily; presumably so the module can still be collected (and
    # skipped) when the SDK is not installed -- confirm before relying on it.
    from openai import OpenAI

    return OpenAI(api_key=DEEPSEEK_KEY, base_url=LIVE_BASE_URL)


def _make_agent_for_message_building(model: str):
    """Create a bare ``AIAgent`` usable only for its message-building helpers.

    ``object.__new__`` allocates the instance without running
    ``AIAgent.__init__``; the attributes assigned below are set by hand.
    NOTE(review): this assumes they are the only attributes the helpers used
    by this test read -- confirm against ``run_agent.AIAgent``.
    """
    from run_agent import AIAgent

    agent = object.__new__(AIAgent)
    agent.provider = "deepseek"
    agent.model = model
    agent.base_url = LIVE_BASE_URL
    agent.verbose_logging = False
    agent.reasoning_callback = None
    agent.stream_delta_callback = None
    agent._stream_callback = None
    return agent


def _tool_result_messages(tool_calls) -> list[dict[str, str]]:
    """Return one ``tool`` message per entry in *tool_calls*.

    Each entry must be subscriptable with an ``"id"`` key. Every replayed
    tool call gets an answer, so the follow-up request never carries a
    ``tool_call_id`` without a matching tool message.
    """
    return [
        {"role": "tool", "tool_call_id": call["id"], "content": TOOL_RESULT}
        for call in tool_calls
    ]


@pytest.mark.parametrize("live_model", LIVE_MODELS)
def test_deepseek_v4_thinking_tool_call_replay_round_trip(live_model: str):
    """Hit DeepSeek twice and replay the assistant tool-call turn.

    The first request forces a tool call with thinking enabled. The second
    request replays that assistant message with content, reasoning_content,
    and tool_calls, then appends the tool result. DeepSeek accepting the
    second request is the live guardrail for the V4 thinking replay contract.
    """

    client = _make_live_client()
    agent = _make_agent_for_message_building(live_model)

    # --- Round 1: force a tool call with thinking enabled -----------------
    first_request = {
        "model": live_model,
        "messages": [{"role": "user", "content": USER_PROMPT}],
        "tools": TOOLS,
        "max_tokens": 1024,
        "timeout": 90,
        **_thinking_kwargs(),
    }
    _print_trace(f"{live_model} first request", first_request)
    first = client.chat.completions.create(**first_request)
    _print_trace(f"{live_model} first raw response", first)

    first_choice = first.choices[0]
    first_message = first_choice.message
    _print_trace(
        f"{live_model} first assistant message",
        {
            "finish_reason": first_choice.finish_reason,
            **_message_snapshot(first_message),
        },
    )
    assert first_message.tool_calls, "DeepSeek did not return a tool call"
    first_tool_call = first_message.tool_calls[0]
    assert first_tool_call.function.name == TOOL_NAME
    assert isinstance(json.loads(first_tool_call.function.arguments or "{}"), dict)

    raw_reasoning_content = _raw_reasoning_content(first_message)
    assert raw_reasoning_content is not None, (
        "DeepSeek did not return reasoning_content; the thinking payload may "
        "not have been honored"
    )

    # --- Store the turn the way the agent would, then rebuild it ----------
    stored_assistant = agent._build_assistant_message(
        first_message,
        first_choice.finish_reason or "tool_calls",
    )
    _print_trace(f"{live_model} stored assistant message", stored_assistant)
    assert stored_assistant["reasoning_content"] == raw_reasoning_content

    replay_assistant = {
        "role": "assistant",
        "content": stored_assistant.get("content") or "",
        "tool_calls": stored_assistant["tool_calls"],
    }
    agent._copy_reasoning_content_for_api(stored_assistant, replay_assistant)
    _print_trace(f"{live_model} replay assistant message", replay_assistant)

    # Answer every replayed tool call, not just the first one: if the model
    # emits several calls, an unanswered id would make the second request
    # fail for a reason unrelated to the reasoning_content replay.
    messages = [
        {"role": "user", "content": USER_PROMPT},
        replay_assistant,
        *_tool_result_messages(stored_assistant["tool_calls"]),
    ]

    from agent.transports.chat_completions import ChatCompletionsTransport

    api_messages = ChatCompletionsTransport().convert_messages(messages)
    _print_trace(
        f"{live_model} second request messages after transport conversion",
        api_messages,
    )
    # Index 1 is the replayed assistant turn (index 0 is the user prompt).
    assert api_messages[1]["reasoning_content"] == raw_reasoning_content
    # These keys must not be present on the wire-format tool call.
    assert "call_id" not in api_messages[1]["tool_calls"][0]
    assert "response_item_id" not in api_messages[1]["tool_calls"][0]

    # --- Round 2: DeepSeek must accept the replayed conversation ----------
    second_request = {
        "model": live_model,
        "messages": api_messages,
        "max_tokens": 1024,
        "timeout": 90,
        **_thinking_kwargs(),
    }
    _print_trace(f"{live_model} second request", second_request)
    second = client.chat.completions.create(**second_request)
    _print_trace(f"{live_model} second raw response", second)

    second_choice = second.choices[0]
    second_message = second_choice.message
    _print_trace(
        f"{live_model} second assistant message",
        {
            "finish_reason": second_choice.finish_reason,
            **_message_snapshot(second_message),
        },
    )

    final_content = second_message.content or ""
    final_reasoning = _raw_reasoning_content(second_message) or ""
    assert second_choice.finish_reason == "stop"
    assert final_content.strip() or final_reasoning.strip(), (
        "DeepSeek returned neither visible content nor reasoning_content"
    )