# tests/claude_code/test_tracing.py
  1  import importlib
  2  import json
  3  import logging
  4  from pathlib import Path
  5  
  6  import pytest
  7  from claude_agent_sdk.types import (
  8      AssistantMessage,
  9      ResultMessage,
 10      TextBlock,
 11      ToolResultBlock,
 12      ToolUseBlock,
 13      UserMessage,
 14  )
 15  
 16  import mlflow
 17  import mlflow.claude_code.tracing as tracing_module
 18  from mlflow.claude_code.tracing import (
 19      CLAUDE_TRACING_LEVEL,
 20      METADATA_KEY_CLAUDE_CODE_VERSION,
 21      find_last_user_message_index,
 22      get_hook_response,
 23      parse_timestamp_to_ns,
 24      process_sdk_messages,
 25      process_transcript,
 26      setup_logging,
 27  )
 28  from mlflow.entities.span import SpanType
 29  from mlflow.tracing.constant import SpanAttributeKey, TraceMetadataKey
 30  
 31  # ============================================================================
 32  # TIMESTAMP PARSING TESTS
 33  # ============================================================================
 34  
 35  
 36  def test_parse_timestamp_to_ns_iso_string():
 37      iso_timestamp = "2024-01-15T10:30:45.123456Z"
 38      result = parse_timestamp_to_ns(iso_timestamp)
 39  
 40      # Verify it returns an integer (nanoseconds)
 41      assert isinstance(result, int)
 42      assert result > 0
 43  
 44  
 45  def test_parse_timestamp_to_ns_unix_seconds():
 46      unix_timestamp = 1705312245.123456
 47      result = parse_timestamp_to_ns(unix_timestamp)
 48  
 49      # Should convert seconds to nanoseconds
 50      expected = int(unix_timestamp * 1_000_000_000)
 51      assert result == expected
 52  
 53  
 54  def test_parse_timestamp_to_ns_large_number():
 55      large_timestamp = 1705312245123
 56      result = parse_timestamp_to_ns(large_timestamp)
 57  
 58      # Function treats large numbers as seconds and converts to nanoseconds
 59      # Just verify we get a reasonable nanosecond value
 60      assert isinstance(result, int)
 61      assert result > 0
 62  
 63  
 64  # ============================================================================
 65  # LOGGING TESTS
 66  # ============================================================================
 67  
 68  
 69  def test_setup_logging_creates_logger(monkeypatch, tmp_path):
 70      monkeypatch.chdir(tmp_path)
 71      logger = setup_logging()
 72  
 73      # Verify logger was created
 74      assert logger is not None
 75      assert logger.name == "mlflow.claude_code.tracing"
 76  
 77      # Verify log directory was created
 78      log_dir = tmp_path / ".claude" / "mlflow"
 79      assert log_dir.exists()
 80      assert log_dir.is_dir()
 81  
 82  
 83  def test_custom_logging_level():
 84      setup_logging()
 85  
 86      assert CLAUDE_TRACING_LEVEL > logging.INFO
 87      assert CLAUDE_TRACING_LEVEL < logging.WARNING
 88      assert logging.getLevelName(CLAUDE_TRACING_LEVEL) == "CLAUDE_TRACING"
 89  
 90  
 91  def test_get_logger_lazy_initialization(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
 92      monkeypatch.chdir(tmp_path)
 93  
 94      # Force reload to reset the module state
 95      importlib.reload(tracing_module)
 96  
 97      log_dir = tmp_path / ".claude" / "mlflow"
 98  
 99      # Before calling get_logger(), the log directory should NOT exist
100      assert not log_dir.exists()
101  
102      # Call get_logger() for the first time - this should trigger initialization
103      logger1 = tracing_module.get_logger()
104  
105      # After calling get_logger(), the log directory SHOULD exist
106      assert log_dir.exists()
107      assert log_dir.is_dir()
108  
109      # Verify logger was created properly
110      assert logger1 is not None
111      assert logger1.name == "mlflow.claude_code.tracing"
112  
113      # Call get_logger() again - should return the same logger instance
114      logger2 = tracing_module.get_logger()
115      assert logger2 is logger1
116  
117  
118  # ============================================================================
119  # HOOK RESPONSE TESTS
120  # ============================================================================
121  
122  
def test_get_hook_response_success():
    """With no arguments the hook response simply allows continuation."""
    assert get_hook_response() == {"continue": True}
126  
127  
def test_get_hook_response_with_error():
    """An error flips `continue` to False and surfaces the message as stopReason."""
    result = get_hook_response(error="Test error")
    assert result == {"continue": False, "stopReason": "Test error"}
131  
132  
def test_get_hook_response_with_additional_fields():
    """Extra keyword arguments are passed through into the response payload."""
    result = get_hook_response(custom_field="value")
    assert result == {"continue": True, "custom_field": "value"}
136  
137  
138  # ============================================================================
139  # ASYNC TRACE LOGGING UTILITY TESTS
140  # ============================================================================
141  
142  
def test_flush_trace_async_logging_calls_flush(monkeypatch):
    """When the exporter exposes an async queue, the helper delegates to mlflow."""
    exporter_with_queue = type("MockExporter", (), {"_async_queue": True})()
    monkeypatch.setattr(tracing_module, "_get_trace_exporter", lambda: exporter_with_queue)

    calls = []
    monkeypatch.setattr(mlflow, "flush_trace_async_logging", lambda: calls.append(True))

    tracing_module._flush_trace_async_logging()
    assert len(calls) == 1
150  
151  
def test_flush_trace_async_logging_skips_without_async_queue(monkeypatch):
    """Exporters without an `_async_queue` attribute must not trigger a flush."""
    monkeypatch.setattr(tracing_module, "_get_trace_exporter", lambda: object())

    calls = []
    monkeypatch.setattr(mlflow, "flush_trace_async_logging", lambda: calls.append(True))

    tracing_module._flush_trace_async_logging()
    assert len(calls) == 0
159  
160  
161  # ============================================================================
162  # INTEGRATION TESTS
163  # ============================================================================
164  
165  # Sample Claude Code transcript for testing
# Five JSONL-style entries covering a full tool-use round trip:
# user prompt -> assistant text -> tool_use (Bash) -> tool_result -> final text.
DUMMY_TRANSCRIPT = [
    {
        "type": "user",
        "message": {"role": "user", "content": "What is 2 + 2?"},
        "timestamp": "2025-01-15T10:00:00.000Z",
        "sessionId": "test-session-123",
    },
    {
        "type": "assistant",
        "message": {
            "role": "assistant",
            "content": [{"type": "text", "text": "Let me calculate that for you."}],
        },
        "timestamp": "2025-01-15T10:00:01.000Z",
    },
    {
        "type": "assistant",
        "message": {
            "role": "assistant",
            "content": [
                {
                    "type": "tool_use",
                    "id": "tool_123",
                    "name": "Bash",
                    "input": {"command": "echo $((2 + 2))"},
                }
            ],
        },
        "timestamp": "2025-01-15T10:00:02.000Z",
    },
    {
        "type": "user",
        "message": {
            "role": "user",
            "content": [{"type": "tool_result", "tool_use_id": "tool_123", "content": "4"}],
        },
        "timestamp": "2025-01-15T10:00:03.000Z",
    },
    {
        "type": "assistant",
        "message": {
            "role": "assistant",
            "content": [{"type": "text", "text": "The answer is 4."}],
        },
        "timestamp": "2025-01-15T10:00:04.000Z",
    },
]
213  
214  
@pytest.fixture
def mock_transcript_file(tmp_path):
    """Write DUMMY_TRANSCRIPT as a JSONL file and return its path as a string."""
    path = tmp_path / "transcript.jsonl"
    path.write_text("".join(json.dumps(entry) + "\n" for entry in DUMMY_TRANSCRIPT))
    return str(path)
222  
223  
def test_process_transcript_creates_trace(mock_transcript_file):
    """process_transcript builds a trace whose root span is the conversation agent.

    Renamed from ``test_process_transript_creates_trace`` to fix the
    "transript" typo in the test name.
    """
    trace = process_transcript(mock_transcript_file, "test-session-123")

    # Verify trace was created
    assert trace is not None

    # Verify trace has spans
    spans = list(trace.search_spans())
    assert len(spans) > 0

    # The root span wraps the whole conversation and carries the session metadata.
    root_span = trace.data.spans[0]
    assert root_span.name == "claude_code_conversation"
    assert root_span.span_type == SpanType.AGENT
    assert trace.info.trace_metadata.get("mlflow.trace.session") == "test-session-123"
239  
240  
def test_process_transcript_creates_spans(mock_transcript_file):
    """Transcript processing yields LLM and tool spans in Anthropic format."""
    trace = process_transcript(mock_transcript_file, "test-session-123")
    assert trace is not None

    all_spans = list(trace.search_spans())
    assert len(all_spans) > 0

    # Partition by span type: the dummy transcript has two LLM turns and one tool call.
    llm_spans = [span for span in all_spans if span.span_type == SpanType.LLM]
    tool_spans = [span for span in all_spans if span.span_type == SpanType.TOOL]
    assert len(llm_spans) == 2
    assert len(tool_spans) == 1

    # Tool spans are named after the tool that was invoked.
    assert tool_spans[0].name == "tool_Bash"

    # Every LLM span declares the "anthropic" message format for Chat UI rendering.
    for span in llm_spans:
        assert span.get_attribute(SpanAttributeKey.MESSAGE_FORMAT) == "anthropic"

    # Outputs follow the Anthropic response envelope.
    first_llm = llm_spans[0]
    assert first_llm.outputs["type"] == "message"
    assert first_llm.outputs["role"] == "assistant"
    assert isinstance(first_llm.outputs["content"], list)

    # Inputs carry the conversation as Anthropic-style messages with a user turn.
    assert "messages" in first_llm.inputs
    assert any(msg["role"] == "user" for msg in first_llm.inputs["messages"])
277  
278  
def test_process_transcript_returns_none_for_nonexistent_file():
    """A missing transcript path yields None rather than raising."""
    missing_path = "/nonexistent/path/transcript.jsonl"
    assert process_transcript(missing_path, "test-session-123") is None
282  
283  
def test_process_transcript_links_trace_to_run(mock_transcript_file):
    """Traces created inside an active run record that run as their source."""
    with mlflow.start_run() as active_run:
        trace = process_transcript(mock_transcript_file, "test-session-123")

        assert trace is not None
        linked = trace.info.trace_metadata.get(TraceMetadataKey.SOURCE_RUN)
        assert linked == active_run.info.run_id
290  
291  
292  # Sample Claude Code transcript with token usage for testing
# Two-entry transcript whose assistant turn reports model + token usage,
# used to verify CHAT_USAGE propagation and trace-level aggregation.
DUMMY_TRANSCRIPT_WITH_USAGE = [
    {
        "type": "user",
        "message": {"role": "user", "content": "Hello Claude!"},
        "timestamp": "2025-01-15T10:00:00.000Z",
        "sessionId": "test-session-usage",
    },
    {
        "type": "assistant",
        "message": {
            "role": "assistant",
            "content": [{"type": "text", "text": "Hello! How can I help you today?"}],
            "model": "claude-sonnet-4-20250514",
            "usage": {"input_tokens": 150, "output_tokens": 25},
        },
        "timestamp": "2025-01-15T10:00:01.000Z",
    },
]
311  
312  
@pytest.fixture
def mock_transcript_file_with_usage(tmp_path):
    """Write DUMMY_TRANSCRIPT_WITH_USAGE as JSONL and return its path as a string."""
    path = tmp_path / "transcript_with_usage.jsonl"
    lines = [json.dumps(entry) for entry in DUMMY_TRANSCRIPT_WITH_USAGE]
    path.write_text("\n".join(lines) + "\n")
    return str(path)
320  
321  
def test_process_transcript_tracks_token_usage(mock_transcript_file_with_usage):
    """Per-span CHAT_USAGE and trace-level aggregation reflect transcript usage."""
    trace = process_transcript(mock_transcript_file_with_usage, "test-session-usage")
    assert trace is not None

    # The usage transcript contains exactly one assistant (LLM) turn.
    llm_spans = [s for s in trace.search_spans() if s.span_type == SpanType.LLM]
    assert len(llm_spans) == 1

    # The standardized CHAT_USAGE attribute carries input/output/total counts.
    usage = llm_spans[0].get_attribute(SpanAttributeKey.CHAT_USAGE)
    assert usage is not None
    assert usage["input_tokens"] == 150
    assert usage["output_tokens"] == 25
    assert usage["total_tokens"] == 175

    # Trace-level aggregation mirrors the single LLM span.
    aggregated = trace.info.token_usage
    assert aggregated is not None
    assert aggregated["input_tokens"] == 150
    assert aggregated["output_tokens"] == 25
    assert aggregated["total_tokens"] == 175
346  
347  
def test_process_transcript_preserves_cache_tokens(tmp_path):
    """Verify cache_read/cache_creation fields from Anthropic usage survive on the
    CHAT_USAGE span attribute so prompt-cache hit rate is observable.
    """
    # Prompt plus one assistant turn whose usage includes cache token counts.
    transcript_entries = [
        {
            "type": "user",
            "message": {"role": "user", "content": "Cached prompt"},
            "timestamp": "2025-01-15T10:00:00.000Z",
            "sessionId": "cache-transcript-session",
        },
        {
            "type": "assistant",
            "message": {
                "role": "assistant",
                "content": [{"type": "text", "text": "Answer using cache."}],
                "model": "claude-sonnet-4-20250514",
                "usage": {
                    "input_tokens": 36,
                    "cache_creation_input_tokens": 23554,
                    "cache_read_input_tokens": 139035,
                    "output_tokens": 3344,
                },
            },
            "timestamp": "2025-01-15T10:00:01.000Z",
        },
    ]

    transcript_path = tmp_path / "transcript_cache.jsonl"
    with open(transcript_path, "w") as f:
        for entry in transcript_entries:
            f.write(json.dumps(entry) + "\n")

    trace = process_transcript(str(transcript_path), "cache-transcript-session")

    assert trace is not None
    llm_spans = [s for s in trace.search_spans() if s.span_type == SpanType.LLM]
    assert len(llm_spans) == 1

    # input_tokens is the non-cached input the Anthropic API reports, matching
    # mlflow.anthropic.autolog. Cache fields are exposed as separate keys so
    # consumers can compute cache hit rate.
    token_usage = llm_spans[0].get_attribute(SpanAttributeKey.CHAT_USAGE)
    assert token_usage["input_tokens"] == 36
    assert token_usage["output_tokens"] == 3344
    assert token_usage["total_tokens"] == 36 + 3344
    assert token_usage["cache_read_input_tokens"] == 139035
    assert token_usage["cache_creation_input_tokens"] == 23554
396  
397  
398  # ============================================================================
399  # SDK MESSAGE PROCESSING TESTS
400  # ============================================================================
401  
402  
def test_process_sdk_messages_empty_list():
    """An empty SDK message list produces no trace."""
    result = process_sdk_messages([])
    assert result is None
405  
406  
def test_process_sdk_messages_no_user_prompt():
    """A conversation with no user prompt yields no trace."""
    assistant_only = [
        AssistantMessage(
            content=[TextBlock(text="Hello!")],
            model="claude-sonnet-4-20250514",
        ),
    ]
    assert process_sdk_messages(assistant_only) is None
415  
416  
def test_process_sdk_messages_simple_conversation():
    """End-to-end SDK processing: span structure, usage, duration, and previews."""
    messages = [
        UserMessage(content="What is 2 + 2?"),
        AssistantMessage(
            content=[TextBlock(text="The answer is 4.")],
            model="claude-sonnet-4-20250514",
        ),
        ResultMessage(
            subtype="success",
            duration_ms=1000,
            duration_api_ms=800,
            is_error=False,
            num_turns=1,
            session_id="test-sdk-session",
            usage={"input_tokens": 100, "output_tokens": 20},
        ),
    ]

    trace = process_sdk_messages(messages, "test-sdk-session")

    assert trace is not None
    spans = list(trace.search_spans())

    # Root span wraps the whole conversation as an AGENT span.
    root_span = trace.data.spans[0]
    assert root_span.name == "claude_code_conversation"
    assert root_span.span_type == SpanType.AGENT

    # LLM span should have conversation context as input in Anthropic format
    llm_spans = [s for s in spans if s.span_type == SpanType.LLM]
    assert len(llm_spans) == 1
    assert llm_spans[0].name == "llm"
    assert llm_spans[0].inputs["model"] == "claude-sonnet-4-20250514"
    assert llm_spans[0].inputs["messages"] == [{"role": "user", "content": "What is 2 + 2?"}]
    assert llm_spans[0].get_attribute(SpanAttributeKey.MESSAGE_FORMAT) == "anthropic"

    # Output should be in Anthropic response format
    outputs = llm_spans[0].outputs
    assert outputs["type"] == "message"
    assert outputs["role"] == "assistant"
    assert outputs["content"] == [{"type": "text", "text": "The answer is 4."}]

    # Token usage from ResultMessage should be on the root span and trace level
    token_usage = root_span.get_attribute(SpanAttributeKey.CHAT_USAGE)
    assert token_usage is not None
    assert token_usage["input_tokens"] == 100
    assert token_usage["output_tokens"] == 20
    assert token_usage["total_tokens"] == 120

    assert trace.info.token_usage is not None
    assert trace.info.token_usage["input_tokens"] == 100
    assert trace.info.token_usage["output_tokens"] == 20
    assert trace.info.token_usage["total_tokens"] == 120

    # Duration should reflect ResultMessage.duration_ms (1000ms = 1s)
    duration_ns = root_span.end_time_ns - root_span.start_time_ns
    assert abs(duration_ns - 1_000_000_000) < 1_000_000  # within 1ms tolerance

    assert trace.info.trace_metadata.get("mlflow.trace.session") == "test-sdk-session"
    assert trace.info.request_preview == "What is 2 + 2?"
    assert trace.info.response_preview == "The answer is 4."
477  
478  
def test_process_sdk_messages_multiple_tools():
    """Two parallel tool calls each get their own TOOL span with matched results."""
    messages = [
        UserMessage(content="Read two files"),
        AssistantMessage(
            content=[
                ToolUseBlock(id="tool_1", name="Read", input={"path": "a.py"}),
                ToolUseBlock(id="tool_2", name="Read", input={"path": "b.py"}),
            ],
            model="claude-sonnet-4-20250514",
        ),
        UserMessage(
            content=[
                ToolResultBlock(tool_use_id="tool_1", content="content of a"),
                ToolResultBlock(tool_use_id="tool_2", content="content of b"),
            ],
            tool_use_result={"tool_use_id": "tool_1"},
        ),
        AssistantMessage(
            content=[TextBlock(text="Here are the contents.")],
            model="claude-sonnet-4-20250514",
        ),
        ResultMessage(
            subtype="success",
            duration_ms=2000,
            duration_api_ms=1500,
            is_error=False,
            num_turns=2,
            session_id="multi-tool-session",
        ),
    ]

    trace = process_sdk_messages(messages, "multi-tool-session")

    assert trace is not None
    spans = list(trace.search_spans())

    # Both tool calls become TOOL spans named after the invoked tool, and each
    # span's output is matched to its own tool result.
    tool_spans = [s for s in spans if s.span_type == SpanType.TOOL]
    assert len(tool_spans) == 2
    assert all(s.name == "tool_Read" for s in tool_spans)
    tool_results = {s.outputs["result"] for s in tool_spans}
    assert tool_results == {"content of a", "content of b"}
520  
521  
def test_process_sdk_messages_cache_tokens():
    """Cache token counts from the SDK ResultMessage survive on the root span."""
    messages = [
        UserMessage(content="Hello"),
        AssistantMessage(
            content=[TextBlock(text="Hi!")],
            model="claude-sonnet-4-20250514",
        ),
        ResultMessage(
            subtype="success",
            duration_ms=5000,
            duration_api_ms=4000,
            is_error=False,
            num_turns=1,
            session_id="cache-session",
            usage={
                "input_tokens": 36,
                "cache_creation_input_tokens": 23554,
                "cache_read_input_tokens": 139035,
                "output_tokens": 3344,
            },
        ),
    ]

    trace = process_sdk_messages(messages, "cache-session")

    assert trace is not None
    root_span = trace.data.spans[0]

    # input_tokens is the non-cached input the Anthropic API reports, matching
    # mlflow.anthropic.autolog. Cache fields are exposed as separate keys so
    # consumers can compute cache hit rate without scraping transcripts.
    token_usage = root_span.get_attribute(SpanAttributeKey.CHAT_USAGE)
    assert token_usage["input_tokens"] == 36
    assert token_usage["output_tokens"] == 3344
    assert token_usage["total_tokens"] == 36 + 3344
    assert token_usage["cache_read_input_tokens"] == 139035
    assert token_usage["cache_creation_input_tokens"] == 23554

    # Trace-level aggregation should match
    assert trace.info.token_usage["input_tokens"] == 36
    assert trace.info.token_usage["output_tokens"] == 3344
562      assert trace.info.token_usage["output_tokens"] == 3344
563  
564  
565  # ============================================================================
566  # FIND LAST USER MESSAGE INDEX TESTS
567  # ============================================================================
568  
569  
def test_find_last_user_message_skips_skill_injection():
    """Skill-content injection entries (type "user" but synthetic) are not
    mistaken for the user's actual prompt when locating the last user message.
    """
    transcript = [
        {"type": "queue-operation"},
        {"type": "queue-operation"},
        # Entry 2: actual user prompt
        {
            "type": "user",
            "message": {"role": "user", "content": "Enable tracing on the agent."},
            "timestamp": "2025-01-01T00:00:00Z",
        },
        # Entry 3: assistant thinking
        {
            "type": "assistant",
            "message": {
                "role": "assistant",
                "content": [{"type": "thinking", "thinking": "Let me use the skill."}],
            },
            "timestamp": "2025-01-01T00:00:01Z",
        },
        # Entry 4: assistant invokes Skill tool
        {
            "type": "assistant",
            "message": {
                "role": "assistant",
                "content": [
                    {
                        "type": "tool_use",
                        "id": "toolu_abc123",
                        "name": "Skill",
                        "input": {"skill": "instrumenting-with-mlflow-tracing"},
                    }
                ],
            },
            "timestamp": "2025-01-01T00:00:02Z",
        },
        # Entry 5: tool result with commandName (correctly skipped by toolUseResult check)
        {
            "type": "user",
            "toolUseResult": {
                "success": True,
                "commandName": "instrumenting-with-mlflow-tracing",
            },
            "message": {
                "role": "user",
                "content": [
                    {
                        "type": "tool_result",
                        "tool_use_id": "toolu_abc123",
                        "content": "Launching skill: instrumenting-with-mlflow-tracing",
                    }
                ],
            },
            "timestamp": "2025-01-01T00:00:03Z",
        },
        # Entry 6: skill content injection (BUG: not flagged as tool result)
        {
            "type": "user",
            "message": {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": (
                            "Base directory for this skill: /path/to/skill\n\n"
                            "# MLflow Tracing Guide\n\n...(full skill content)..."
                        ),
                    }
                ],
            },
            "timestamp": "2025-01-01T00:00:04Z",
        },
        # Entry 7: assistant continues
        {
            "type": "assistant",
            "message": {
                "role": "assistant",
                "content": [{"type": "thinking", "thinking": "Now let me implement tracing."}],
            },
            "timestamp": "2025-01-01T00:00:05Z",
        },
        # Entry 8: assistant text response
        {
            "type": "assistant",
            "message": {
                "role": "assistant",
                "content": [{"type": "text", "text": "I've enabled tracing on the agent."}],
            },
            "timestamp": "2025-01-01T00:00:06Z",
        },
    ]

    idx = find_last_user_message_index(transcript)

    # Should return index 2 (actual user prompt), not 6 (skill injection)
    assert idx == 2
    assert transcript[idx]["message"]["content"] == "Enable tracing on the agent."
666  
667  
def test_find_last_user_message_index_basic():
    """With plain alternating turns, the most recent user entry wins."""
    entries = [
        {"type": "queue-operation"},
        {
            "type": "user",
            "message": {"role": "user", "content": "First question"},
            "timestamp": "2025-01-01T00:00:00Z",
        },
        {
            "type": "assistant",
            "message": {
                "role": "assistant",
                "content": [{"type": "text", "text": "First answer"}],
            },
            "timestamp": "2025-01-01T00:00:01Z",
        },
        {
            "type": "user",
            "message": {"role": "user", "content": "Second question"},
            "timestamp": "2025-01-01T00:00:02Z",
        },
        {
            "type": "assistant",
            "message": {
                "role": "assistant",
                "content": [{"type": "text", "text": "Second answer"}],
            },
            "timestamp": "2025-01-01T00:00:03Z",
        },
    ]

    result = find_last_user_message_index(entries)

    # Index 3 is the second (latest) genuine user prompt.
    assert result == 3
    assert entries[result]["message"]["content"] == "Second question"
703  
704  
def test_find_last_user_message_skips_consecutive_skill_injections():
    """Multiple back-to-back skill injections are all skipped, so the search
    falls back to the original user prompt at the start of the transcript.
    """
    transcript = [
        # Entry 0: actual user prompt
        {
            "type": "user",
            "message": {"role": "user", "content": "Do the thing."},
            "timestamp": "2025-01-01T00:00:00Z",
        },
        # Entry 1: assistant invokes first Skill
        {
            "type": "assistant",
            "message": {
                "role": "assistant",
                "content": [
                    {
                        "type": "tool_use",
                        "id": "toolu_1",
                        "name": "Skill",
                        "input": {"skill": "skill-one"},
                    }
                ],
            },
            "timestamp": "2025-01-01T00:00:01Z",
        },
        # Entry 2: first skill tool result
        {
            "type": "user",
            "toolUseResult": {"success": True, "commandName": "skill-one"},
            "message": {
                "role": "user",
                "content": [
                    {
                        "type": "tool_result",
                        "tool_use_id": "toolu_1",
                        "content": "Launching skill: skill-one",
                    }
                ],
            },
            "timestamp": "2025-01-01T00:00:02Z",
        },
        # Entry 3: first skill content injection
        {
            "type": "user",
            "message": {
                "role": "user",
                "content": [{"type": "text", "text": "Base directory: /skill-one\n# Skill One"}],
            },
            "timestamp": "2025-01-01T00:00:03Z",
        },
        # Entry 4: assistant invokes second Skill
        {
            "type": "assistant",
            "message": {
                "role": "assistant",
                "content": [
                    {
                        "type": "tool_use",
                        "id": "toolu_2",
                        "name": "Skill",
                        "input": {"skill": "skill-two"},
                    }
                ],
            },
            "timestamp": "2025-01-01T00:00:04Z",
        },
        # Entry 5: second skill tool result
        {
            "type": "user",
            "toolUseResult": {"success": True, "commandName": "skill-two"},
            "message": {
                "role": "user",
                "content": [
                    {
                        "type": "tool_result",
                        "tool_use_id": "toolu_2",
                        "content": "Launching skill: skill-two",
                    }
                ],
            },
            "timestamp": "2025-01-01T00:00:05Z",
        },
        # Entry 6: second skill content injection
        {
            "type": "user",
            "message": {
                "role": "user",
                "content": [{"type": "text", "text": "Base directory: /skill-two\n# Skill Two"}],
            },
            "timestamp": "2025-01-01T00:00:06Z",
        },
        # Entry 7: assistant response
        {
            "type": "assistant",
            "message": {
                "role": "assistant",
                "content": [{"type": "text", "text": "Done."}],
            },
            "timestamp": "2025-01-01T00:00:07Z",
        },
    ]

    idx = find_last_user_message_index(transcript)

    # Should skip both skill injections (entries 3 and 6) and return entry 0
    assert idx == 0
    assert transcript[idx]["message"]["content"] == "Do the thing."
811  
812  
def test_process_transcript_captures_claude_code_version(tmp_path):
    """The `version` field on transcript entries is recorded as trace metadata."""
    entries = [
        {
            "type": "queue-operation",
            "operation": "dequeue",
            "timestamp": "2025-01-15T09:59:59.000Z",
            "sessionId": "test-version-session",
        },
        {
            "type": "user",
            "version": "2.1.34",
            "message": {"role": "user", "content": "Hello!"},
            "timestamp": "2025-01-15T10:00:00.000Z",
        },
        {
            "type": "assistant",
            "version": "2.1.34",
            "message": {
                "role": "assistant",
                "content": [{"type": "text", "text": "Hi there!"}],
            },
            "timestamp": "2025-01-15T10:00:01.000Z",
        },
    ]

    path = tmp_path / "version_transcript.jsonl"
    path.write_text("".join(json.dumps(entry) + "\n" for entry in entries))

    trace = process_transcript(str(path), "test-version-session")

    assert trace is not None
    assert trace.info.trace_metadata.get(METADATA_KEY_CLAUDE_CODE_VERSION) == "2.1.34"
844  
845  
def test_process_transcript_no_version_field(mock_transcript_file):
    """Transcripts lacking a `version` field produce no version metadata key."""
    trace = process_transcript(mock_transcript_file, "test-session-no-version")

    assert trace is not None
    assert METADATA_KEY_CLAUDE_CODE_VERSION not in trace.info.trace_metadata
851  
852  
def test_process_transcript_includes_steer_messages(tmp_path):
    """Steer messages enqueued mid-conversation (queue-operation "enqueue")
    appear as user-role messages in the next LLM span's inputs.
    """
    transcript = [
        {
            "type": "user",
            "message": {"role": "user", "content": "Tell me about Python."},
            "timestamp": "2025-01-15T10:00:00.000Z",
        },
        {
            "type": "assistant",
            "message": {
                "role": "assistant",
                "content": [{"type": "text", "text": "Python is a programming language."}],
            },
            "timestamp": "2025-01-15T10:00:01.000Z",
        },
        # Steer message: enqueued while the assistant was responding.
        {
            "type": "queue-operation",
            "operation": "enqueue",
            "content": "also tell me about Java",
            "timestamp": "2025-01-15T10:00:02.000Z",
            "sessionId": "test-steer-session",
        },
        # Removal operations carry no content and should not add messages.
        {
            "type": "queue-operation",
            "operation": "remove",
            "timestamp": "2025-01-15T10:00:03.000Z",
            "sessionId": "test-steer-session",
        },
        {
            "type": "assistant",
            "message": {
                "role": "assistant",
                "content": [{"type": "text", "text": "Java is also a programming language."}],
            },
            "timestamp": "2025-01-15T10:00:04.000Z",
        },
    ]

    transcript_path = tmp_path / "steer_transcript.jsonl"
    transcript_path.write_text("\n".join(json.dumps(entry) for entry in transcript) + "\n")
    trace = process_transcript(str(transcript_path), "test-steer-session")
    assert trace is not None

    spans = list(trace.search_spans())
    llm_spans = [s for s in spans if s.span_type == SpanType.LLM]
    assert len(llm_spans) == 2

    # The second LLM span should include the steer message in its inputs
    second_llm = llm_spans[1]
    input_messages = second_llm.inputs["messages"]
    steer_messages = [m for m in input_messages if m.get("content") == "also tell me about Java"]
    assert len(steer_messages) == 1
    assert steer_messages[0]["role"] == "user"