# test_semantic_kernel_autolog.py
import asyncio
from unittest import mock

import openai
import pytest
import pytest_asyncio
from semantic_kernel import Kernel
from semantic_kernel.agents import AgentResponseItem
from semantic_kernel.connectors.ai.open_ai import OpenAIChatCompletion
from semantic_kernel.contents import ChatMessageContent
from semantic_kernel.exceptions import FunctionExecutionException, KernelInvokeException
from semantic_kernel.functions.function_result import FunctionResult
from semantic_kernel.utils.telemetry.agent_diagnostics import (
    gen_ai_attributes as agent_gen_ai_attributes,
)
from semantic_kernel.utils.telemetry.model_diagnostics import (
    gen_ai_attributes as model_gen_ai_attributes,
)

import mlflow.semantic_kernel
from mlflow.entities import SpanType
from mlflow.entities.span_status import SpanStatusCode
from mlflow.environment_variables import MLFLOW_USE_DEFAULT_TRACER_PROVIDER
from mlflow.semantic_kernel.autolog import SemanticKernelSpanProcessor
from mlflow.tracing.constant import (
    SpanAttributeKey,
    TokenUsageKey,
)
from mlflow.version import IS_TRACING_SDK_ONLY

from tests.semantic_kernel.resources import (
    _create_and_invoke_chat_agent,
    _create_and_invoke_chat_completion_direct,
    _create_and_invoke_embeddings,
    _create_and_invoke_kernel_complex,
    _create_and_invoke_kernel_function_object,
    _create_and_invoke_kernel_simple,
    _create_and_invoke_kernel_simple as _unused_alias_guard,  # noqa: F401 — kept import list identical to original below
    _create_and_invoke_text_completion,
)
from tests.tracing.helper import get_traces

# Module-level lock: these tests mutate global tracing state (autolog patches,
# tracer provider), so they must not run concurrently within one event loop.
lock = asyncio.Lock()


@pytest_asyncio.fixture(autouse=True)
async def lock_fixture():
    """Serialize every async test in this module behind the shared lock."""
    async with lock:
        yield


@pytest.fixture(params=[True, False])
def with_openai_autolog(request):
    """Parametrized fixture: run the test with OpenAI autologging enabled and disabled."""
    if request.param:
        mlflow.openai.autolog()
    else:
        mlflow.openai.autolog(disable=True)

    return request.param


@pytest.mark.asyncio
async def test_sk_invoke_simple(mock_openai, with_openai_autolog, mock_litellm_cost):
    mlflow.semantic_kernel.autolog()
    result = await _create_and_invoke_kernel_simple(mock_openai)

    # The mock OpenAI endpoint echoes the user message back
    prompt = "Is sushi the best food ever?"
    expected_content = '[{"role": "user", "content": "Is sushi the best food ever?"}]'

    # Validate the result is not mutated by tracing logic
    assert isinstance(result, FunctionResult)
    assert isinstance(result.value[0], ChatMessageContent)
    assert result.value[0].items[0].text == expected_content

    # Trace
    traces = get_traces()
    assert len(traces) == 1
    trace = traces[0]
    assert trace.info.request_id
    assert trace.info.experiment_id == "0"
    assert trace.info.timestamp_ms > 0
    assert trace.info.status == "OK"
    assert "Is sushi the best food ever?" in trace.info.request_preview
    assert "Is sushi the best food ever?" in trace.info.response_preview

    spans = trace.data.spans
    assert len(spans) == (5 if with_openai_autolog else 4)

    # Kernel.invoke_prompt
    assert spans[0].name == "Kernel.invoke_prompt"
    assert spans[0].span_type == SpanType.AGENT
    assert spans[0].inputs == {"prompt": prompt}
    assert spans[0].outputs == [{"role": "assistant", "content": expected_content}]

    # Kernel.invoke
    assert spans[1].name == "Kernel.invoke"
    assert spans[1].span_type == SpanType.AGENT
    assert spans[1].inputs["function"] is not None
    assert spans[1].outputs == [{"role": "assistant", "content": expected_content}]

    # Execute LLM as a tool
    assert spans[2].name.startswith("execute_tool")
    assert spans[2].span_type == SpanType.TOOL

    # Actual LLM call
    assert spans[3].name in ("chat.completions gpt-4o-mini", "chat gpt-4o-mini")
    assert "gen_ai.operation.name" in spans[3].attributes
    assert spans[3].inputs == {"messages": [{"role": "user", "content": prompt}]}
    assert spans[3].outputs == {"messages": [{"role": "assistant", "content": expected_content}]}

    chat_usage = spans[3].get_attribute(SpanAttributeKey.CHAT_USAGE)
    assert chat_usage[TokenUsageKey.INPUT_TOKENS] == 9
    assert chat_usage[TokenUsageKey.OUTPUT_TOKENS] == 12
    assert chat_usage[TokenUsageKey.TOTAL_TOKENS] == 21
    assert spans[3].get_attribute(SpanAttributeKey.SPAN_TYPE) == SpanType.CHAT_MODEL
    assert spans[3].model_name == "gpt-4o-mini"
    if not IS_TRACING_SDK_ONLY:
        # Verify cost is calculated (9 input tokens * 1.0 + 12 output tokens * 2.0)
        assert spans[3].llm_cost == {
            "input_cost": 9.0,
            "output_cost": 24.0,
            "total_cost": 33.0,
        }

    # OpenAI autologging
    if with_openai_autolog:
        assert spans[4].name == "AsyncCompletions"
        assert spans[4].span_type == SpanType.CHAT_MODEL
        assert spans[4].parent_id == spans[3].span_id
        assert spans[4].inputs == {
            "messages": [{"role": "user", "content": prompt}],
            "model": "gpt-4o-mini",
            "stream": False,
        }
        assert spans[4].get_attribute(SpanAttributeKey.CHAT_USAGE) == {
            "input_tokens": 9,
            "output_tokens": 12,
            "total_tokens": 21,
        }
        assert spans[4].model_name == "gpt-4o-mini"
        if not IS_TRACING_SDK_ONLY:
            assert spans[4].llm_cost == {
                "input_cost": 9.0,
                "output_cost": 24.0,
                "total_cost": 33.0,
            }

    # Trace level token usage should not double-count
    assert trace.info.token_usage == {
        "input_tokens": 9,
        "output_tokens": 12,
        "total_tokens": 21,
    }


@pytest.mark.asyncio
async def test_sk_invoke_simple_with_sk_initialization_of_tracer(mock_openai):
    from opentelemetry.sdk.resources import Resource
    from opentelemetry.sdk.trace import TracerProvider
    from opentelemetry.sdk.trace.export import ConsoleSpanExporter, SimpleSpanProcessor
    from opentelemetry.semconv.resource import ResourceAttributes
    from opentelemetry.trace import get_tracer_provider, set_tracer_provider

    resource = Resource.create({ResourceAttributes.SERVICE_NAME: "telemetry-console-quickstart"})
    tracer_provider = TracerProvider(resource=resource)
    tracer_provider.add_span_processor(SimpleSpanProcessor(ConsoleSpanExporter()))
    set_tracer_provider(tracer_provider)

    mlflow.semantic_kernel.autolog()
    _tracer_provider = get_tracer_provider()
    assert isinstance(_tracer_provider, TracerProvider)
    span_processors = _tracer_provider._active_span_processor._span_processors
    assert len(span_processors) == 2
    assert any(isinstance(p, SemanticKernelSpanProcessor) for p in span_processors)

    _ = await _create_and_invoke_kernel_simple(mock_openai)

    traces = get_traces()
    assert len(traces) == 1
    trace = traces[0]
    assert trace.info.request_id
    assert len(trace.data.spans) == 4


@pytest.mark.asyncio
async def test_sk_invoke_complex(mock_openai, mock_litellm_cost):
    mlflow.semantic_kernel.autolog()
    result = await _create_and_invoke_kernel_complex(mock_openai)

    # Validate the result is not mutated by tracing logic
    assert isinstance(result, FunctionResult)
    assert isinstance(result.value[0], ChatMessageContent)
    assert result.value[0].items[0].text.startswith('[{"role": "system",')

    # Trace
    traces = get_traces()
    assert len(traces) == 1
    spans = traces[0].data.spans
    assert len(spans) == 3  # Kernel.invoke, execute_tool, chat.completions

    kernel_span, tool_span, chat_span = spans
    assert kernel_span.name == "Kernel.invoke"
    assert kernel_span.span_type == SpanType.AGENT
    function_metadata = kernel_span.inputs["function"]["metadata"]
    assert function_metadata["name"] == "Chat"
    assert function_metadata["plugin_name"] == "ChatBot"
    prompt = kernel_span.inputs["function"]["prompt_template"]["prompt_template_config"]
    assert prompt["template"] == "{{$chat_history}}{{$user_input}}"
    arguments = kernel_span.inputs["arguments"]
    assert arguments["user_input"] == "I want to find a hotel in Seattle with free wifi and a pool."
    assert len(arguments["chat_history"]) == 2

    assert tool_span.name == "execute_tool ChatBot-Chat"
    assert tool_span.span_type == SpanType.TOOL
    assert tool_span.parent_id == kernel_span.span_id

    assert chat_span.name in ("chat.completions gpt-4o-mini", "chat gpt-4o-mini")
    assert chat_span.parent_id == tool_span.span_id
    assert chat_span.span_type == SpanType.CHAT_MODEL
    assert chat_span.get_attribute(model_gen_ai_attributes.OPERATION).startswith("chat")
    assert chat_span.get_attribute(model_gen_ai_attributes.SYSTEM) == "openai"
    assert chat_span.get_attribute(model_gen_ai_attributes.MODEL) == "gpt-4o-mini"
    assert chat_span.get_attribute(model_gen_ai_attributes.RESPONSE_ID) == "chatcmpl-123"
    assert chat_span.get_attribute(model_gen_ai_attributes.FINISH_REASON) == "FinishReason.STOP"
    assert chat_span.get_attribute(model_gen_ai_attributes.INPUT_TOKENS) == 9
    assert chat_span.get_attribute(model_gen_ai_attributes.OUTPUT_TOKENS) == 12
    assert chat_span.model_name == "gpt-4o-mini"

    assert any(
        "I want to find a hotel in Seattle with free wifi and a pool." in m.get("content", "")
        for m in chat_span.inputs.get("messages", [])
    )
    assert isinstance(chat_span.outputs["messages"], list)

    chat_usage = chat_span.get_attribute(SpanAttributeKey.CHAT_USAGE)
    assert chat_usage[TokenUsageKey.INPUT_TOKENS] == 9
    assert chat_usage[TokenUsageKey.OUTPUT_TOKENS] == 12
    assert chat_usage[TokenUsageKey.TOTAL_TOKENS] == 21
    if not IS_TRACING_SDK_ONLY:
        assert chat_span.llm_cost == {
            "input_cost": 9.0,
            "output_cost": 24.0,
            "total_cost": 33.0,
        }


@pytest.mark.asyncio
async def test_sk_invoke_agent(mock_openai):
    mlflow.semantic_kernel.autolog()
    result = await _create_and_invoke_chat_agent(mock_openai)
    assert isinstance(result, AgentResponseItem)

    traces = get_traces()
    assert len(traces) == 1
    trace = traces[0]
    spans = trace.data.spans
    assert len(spans) == 3

    root_span, child_span, grandchild_span = spans

    assert root_span.name == "invoke_agent sushi_agent"
    assert root_span.span_type == SpanType.AGENT
    assert root_span.get_attribute(model_gen_ai_attributes.OPERATION) == "invoke_agent"
    assert root_span.get_attribute(agent_gen_ai_attributes.AGENT_NAME) == "sushi_agent"

    assert child_span.name == "AutoFunctionInvocationLoop"
    assert child_span.span_type == SpanType.UNKNOWN
    assert "sk.available_functions" in child_span.attributes

    assert grandchild_span.name.startswith("chat")
    assert grandchild_span.span_type == SpanType.CHAT_MODEL
    assert grandchild_span.get_attribute(model_gen_ai_attributes.MODEL) == "gpt-4o-mini"
    assert grandchild_span.model_name == "gpt-4o-mini"
    assert isinstance(grandchild_span.inputs["messages"], list)
    assert isinstance(grandchild_span.outputs["messages"], list)
    assert (
        grandchild_span.get_attribute(model_gen_ai_attributes.FINISH_REASON) == "FinishReason.STOP"
    )


@pytest.mark.asyncio
async def test_sk_autolog_trace_on_exception(mock_openai):
    mlflow.semantic_kernel.autolog()
    openai_client = openai.AsyncOpenAI(api_key="test", base_url=mock_openai)

    kernel = Kernel()
    kernel.add_service(
        OpenAIChatCompletion(
            service_id="chat-gpt",
            ai_model_id="gpt-4o-mini",
            async_client=openai_client,
        )
    )

    error_message = "thiswillfail"
    with mock.patch.object(
        openai_client.chat.completions, "create", side_effect=RuntimeError(error_message)
    ):
        with pytest.raises(
            KernelInvokeException, match="Error occurred while invoking function"
        ) as exc_info:
            await kernel.invoke_prompt("Hello?")

    assert isinstance(exc_info.value.__cause__, FunctionExecutionException)

    traces = get_traces()
    assert traces, "No traces recorded"
    assert len(traces) == 1
    trace = traces[0]
    assert len(trace.data.spans) == 4
    assert trace.info.status == "ERROR"

    _, _, _, llm_span = trace.data.spans
    assert llm_span.status.status_code == SpanStatusCode.ERROR
    assert llm_span.events[0].name == "exception"
    assert error_message in llm_span.events[0].attributes["exception.message"]


@pytest.mark.asyncio
async def test_tracing_autolog_with_active_span(mock_openai, with_openai_autolog):
    mlflow.semantic_kernel.autolog()

    with mlflow.start_span("parent"):
        response = await _create_and_invoke_kernel_simple(mock_openai)

    assert isinstance(response, FunctionResult)

    traces = get_traces()
    assert len(traces) == 1

    trace = traces[0]
    spans = trace.data.spans
    assert len(spans) == (6 if with_openai_autolog else 5)

    assert trace.info.request_id is not None
    assert trace.info.status == "OK"
    assert trace.info.tags["mlflow.traceName"] == "parent"

    parent = trace.data.spans[0]
    assert parent.name == "parent"
    assert parent.parent_id is None
    assert parent.span_type == SpanType.UNKNOWN

    assert spans[1].name == "Kernel.invoke_prompt"
    assert spans[1].parent_id == parent.span_id
    assert spans[2].name == "Kernel.invoke"
    assert spans[2].parent_id == spans[1].span_id
    assert spans[3].name.startswith("execute_tool")
    assert spans[3].parent_id == spans[2].span_id
    assert spans[4].name in ("chat.completions gpt-4o-mini", "chat gpt-4o-mini")
    assert spans[4].parent_id == spans[3].span_id

    if with_openai_autolog:
        assert spans[5].name == "AsyncCompletions"
        assert spans[5].parent_id == spans[4].span_id


@pytest.mark.asyncio
async def test_tracing_attribution_with_threaded_calls(mock_openai):
    mlflow.semantic_kernel.autolog()

    n = 3
    openai_client = openai.AsyncOpenAI(api_key="test", base_url=mock_openai)

    kernel = Kernel()
    kernel.add_service(
        OpenAIChatCompletion(
            service_id="chat-gpt",
            ai_model_id="gpt-4o-mini",
            async_client=openai_client,
        )
    )

    async def call(prompt: str):
        return await kernel.invoke_prompt(prompt)

    prompts = [f"What is this number: {i}" for i in range(n)]
    _ = await asyncio.gather(*(call(p) for p in prompts))

    traces = get_traces()
    assert len(traces) == n

    unique_messages = set()
    for trace in traces:
        spans = trace.data.spans
        assert len(spans) == 4

        assert spans[0].span_type == SpanType.AGENT
        assert spans[1].span_type == SpanType.AGENT
        assert spans[2].span_type == SpanType.TOOL
        assert spans[3].span_type == SpanType.CHAT_MODEL
        assert spans[3].model_name == "gpt-4o-mini"

        message = spans[3].inputs["messages"][0]["content"]
        assert message.startswith("What is this number: ")
        unique_messages.add(message)
        assert spans[3].outputs["messages"][0]["content"]

    # Each concurrent call must be attributed to its own trace
    assert len(unique_messages) == n


@pytest.mark.parametrize(
    ("create_and_invoke_func", "span_name_pattern", "expected_span_input_keys"),
    [
        (
            _create_and_invoke_kernel_simple,
            "chat",
            ["messages"],
        ),
        (
            _create_and_invoke_text_completion,
            "text",
            # Text completion input should be stored as a raw string
            None,
        ),
        (
            _create_and_invoke_chat_completion_direct,
            "chat",
            ["messages"],
        ),
    ],
)
@pytest.mark.asyncio
async def test_sk_input_parsing(
    mock_openai, create_and_invoke_func, span_name_pattern, expected_span_input_keys
):
    mlflow.semantic_kernel.autolog()

    _ = await create_and_invoke_func(mock_openai)

    traces = get_traces()
    assert len(traces) == 1
    trace = traces[0]

    target_span = None
    for span in trace.data.spans:
        if span_name_pattern in span.name:
            target_span = span
            break

    assert target_span is not None, f"No span found with pattern '{span_name_pattern}'"
    if expected_span_input_keys:
        for key in expected_span_input_keys:
            assert key in target_span.inputs, (
                f"Expected '{key}' in span inputs for {target_span.name}, got: {target_span.inputs}"
            )
    else:
        assert isinstance(target_span.inputs, str)


@pytest.mark.asyncio
async def test_sk_invoke_with_kernel_arguments(mock_openai):
    mlflow.semantic_kernel.autolog()
    _ = await _create_and_invoke_kernel_function_object(mock_openai)
    traces = get_traces()
    assert len(traces) == 1

    # Check that kernel arguments were passed through to the prompt
    child_span = next(s for s in traces[0].data.spans if "chat" in s.name)
    assert child_span.inputs["messages"][0]["content"] == "Add 5 and 3"


@pytest.mark.asyncio
async def test_sk_embeddings(mock_openai):
    mlflow.semantic_kernel.autolog()

    result = await _create_and_invoke_embeddings(mock_openai)

    assert result is not None
    assert len(result) == 3

    # NOTE: Semantic Kernel currently does not instrument embeddings with OpenTelemetry
    # spans, so no traces are generated for embedding operations
    traces = get_traces()
    assert len(traces) == 0


@pytest.mark.asyncio
async def test_kernel_invoke_function_object(mock_openai):
    mlflow.semantic_kernel.autolog()

    await _create_and_invoke_kernel_function_object(mock_openai)

    traces = get_traces()
    assert len(traces) == 1

    # Verify trace structure
    assert len(traces[0].data.spans) == 3

    # Root span is Kernel.invoke, followed by the execute_tool and chat completion spans
    kernel_span, tool_span, chat_span = traces[0].data.spans

    assert kernel_span.name == "Kernel.invoke"
    assert kernel_span.span_type == SpanType.AGENT
    assert kernel_span.inputs["function"] is not None
    assert kernel_span.outputs is not None

    assert tool_span.name == "execute_tool MathPlugin-Add"
    assert tool_span.span_type == SpanType.TOOL

    # Child span should be chat completion
    assert chat_span.name in ("chat.completions gpt-4o-mini", "chat gpt-4o-mini")
    assert chat_span.span_type == SpanType.CHAT_MODEL
    assert chat_span.model_name == "gpt-4o-mini"


@pytest.mark.asyncio
async def test_sk_shared_provider_no_recursion(monkeypatch, mock_openai):
    # Verify semantic_kernel.autolog() works with shared tracer provider (no RecursionError)
    monkeypatch.setenv(MLFLOW_USE_DEFAULT_TRACER_PROVIDER.name, "false")

    mlflow.semantic_kernel.autolog()
    result = await _create_and_invoke_kernel_simple(mock_openai)

    assert isinstance(result, FunctionResult)

    traces = get_traces()
    assert len(traces) == 1
    spans = traces[0].data.spans
    assert len(spans) >= 3
    assert spans[0].name == "Kernel.invoke_prompt"