# test_telemetry.py
"""Tests for MLflow GenAI evaluation telemetry (``emit_metric_usage_event``).

These tests verify which usage events are emitted to the Databricks
``/api/2.0/agents/evaluation-client-usage-events`` endpoint for custom vs.
builtin scorers, trace-level vs. session-level scorers, and the request
headers that accompany each event. All HTTP traffic is mocked; no network
calls are made.
"""
from unittest import mock

import pytest

from mlflow.genai import Scorer, scorer
from mlflow.genai.evaluation.telemetry import (
    _BATCH_SIZE_HEADER,
    _CLIENT_NAME_HEADER,
    _CLIENT_VERSION_HEADER,
    _SESSION_ID_HEADER,
    emit_metric_usage_event,
)
from mlflow.genai.judges import make_judge
from mlflow.genai.scorers import Correctness, Guidelines, UserFrustration
from mlflow.genai.scorers.validation import IS_DBX_AGENTS_INSTALLED
from mlflow.version import VERSION

# Skip the entire module when the Databricks agents package is unavailable;
# everything below (including the deferred `databricks.agents.evals` import)
# assumes it is installed.
if not IS_DBX_AGENTS_INSTALLED:
    pytest.skip("Skipping Databricks only test.", allow_module_level=True)


# --- Trace-level custom scorers used as test fixtures -----------------------

@scorer
def is_concise(outputs) -> bool:
    """Custom scorer: passes when the output is shorter than 100 characters."""
    return len(outputs) < 100


@scorer
def is_correct(outputs, expectations) -> bool:
    """Custom scorer: passes when the output matches the expected response."""
    return outputs == expectations["expected_response"]


class IsEmpty(Scorer):
    """Class-based custom scorer: passes when the output is the empty string."""

    name: str = "is_empty"

    def __call__(self, *, outputs) -> bool:
        return outputs == ""


# Imported here (not at the top) on purpose: the module-level skip above must
# run first, so this import only executes when the package is installed.
from databricks.agents.evals import metric


@metric
def not_empty(response):
    """Legacy databricks-agents `@metric`: passes when the response is non-empty."""
    return response != ""


# Session-level custom judge: references {{ conversation }}, so it is
# aggregated per-session rather than per-trace in the emitted payload.
session_level_judge = make_judge(
    name="session_quality",
    instructions="Evaluate if the {{ conversation }} is coherent and complete.",
    feedback_value_type=bool,
)


@pytest.fixture
def mock_http_request():
    """Simulate running inside Databricks and capture the telemetry HTTP call.

    Patches ``is_databricks_uri`` to True, stubs out host credential lookup,
    and yields the mocked ``http_request`` so tests can inspect the request
    that ``emit_metric_usage_event`` would have sent.
    """
    with (
        mock.patch("mlflow.genai.evaluation.telemetry.is_databricks_uri", return_value=True),
        mock.patch(
            "mlflow.genai.evaluation.telemetry.http_request", autospec=True
        ) as mock_http_request,
        mock.patch("mlflow.genai.evaluation.telemetry.get_databricks_host_creds"),
    ):
        yield mock_http_request


def test_emit_metric_usage_event_skip_outside_databricks():
    """No telemetry is sent when the tracking URI is not a Databricks URI."""
    with (
        mock.patch("mlflow.genai.evaluation.telemetry.is_databricks_uri", return_value=False),
        mock.patch(
            "mlflow.genai.evaluation.telemetry.http_request", autospec=True
        ) as mock_http_request,
        mock.patch("mlflow.genai.evaluation.telemetry.get_databricks_host_creds"),
    ):
        emit_metric_usage_event(
            scorers=[is_concise],
            trace_count=10,
            session_count=0,
            aggregated_metrics={"is_concise/mean": 0.5},
        )
        mock_http_request.assert_not_called()


def test_emit_metric_usage_event_skip_when_no_scorers(mock_http_request):
    """No telemetry is sent when there is nothing to report."""
    emit_metric_usage_event(scorers=[], trace_count=10, session_count=0, aggregated_metrics={})
    mock_http_request.assert_not_called()


def test_emit_metric_usage_event_custom_scorers_only(mock_http_request):
    """All custom scorer flavors (decorator, class, judge, legacy @metric)
    are reported in a single custom_metric_usage_event with anonymized names."""
    is_kind = make_judge(
        name="is_kind",
        instructions="The answer must be kind. {{ outputs }}",
        feedback_value_type=str,
    )
    emit_metric_usage_event(
        scorers=[is_concise, is_correct, IsEmpty(), is_kind, not_empty],
        trace_count=10,
        session_count=0,
        aggregated_metrics={
            "is_concise/mean": 0.1,
            "is_correct/mean": 0.2,
            "is_empty/mean": 0.3,
            "is_kind/mean": 0.4,
            "not_empty/mean": 0.5,
        },
    )

    mock_http_request.assert_called_once()
    payload = mock_http_request.call_args[1]["json"]

    # Custom metric names are not asserted literally (mock.ANY) —
    # presumably they are hashed/anonymized before emission; TODO confirm.
    assert payload == {
        "agent_evaluation_client_usage_events": [
            {
                "custom_metric_usage_event": {
                    "eval_count": 10,
                    "metrics": [
                        {"name": mock.ANY, "average": 0.1, "count": 10},
                        {"name": mock.ANY, "average": 0.2, "count": 10},
                        {"name": mock.ANY, "average": 0.3, "count": 10},
                        {"name": mock.ANY, "average": 0.4, "count": 10},
                        {"name": mock.ANY, "average": 0.5, "count": 10},
                    ],
                }
            }
        ]
    }


def test_emit_metric_usage_event_builtin_scorers_only(mock_http_request):
    """Builtin scorers are reported by class name in builtin_scorer_usage_event."""
    emit_metric_usage_event(
        scorers=[Correctness(), Guidelines(guidelines="Be concise")],
        trace_count=5,
        session_count=0,
        aggregated_metrics={"correctness/mean": 0.8, "guidelines/mean": 0.9},
    )

    mock_http_request.assert_called_once()
    payload = mock_http_request.call_args[1]["json"]

    assert payload == {
        "agent_evaluation_client_usage_events": [
            {
                "builtin_scorer_usage_event": {
                    "metrics": [
                        {"name": "Correctness", "count": 5},
                        {"name": "Guidelines", "count": 5},
                    ],
                }
            }
        ]
    }


def test_emit_metric_usage_event_mixed_custom_and_builtin_scorers(mock_http_request):
    """Custom and builtin scorers are split into two separate events in one request."""
    emit_metric_usage_event(
        scorers=[Correctness(), is_concise, Guidelines(guidelines="Be concise")],
        trace_count=10,
        session_count=0,
        aggregated_metrics={
            "correctness/mean": 0.7,
            "is_concise/mean": 0.5,
            "guidelines/mean": 0.8,
        },
    )

    mock_http_request.assert_called_once()
    payload = mock_http_request.call_args[1]["json"]

    assert payload == {
        "agent_evaluation_client_usage_events": [
            {
                "custom_metric_usage_event": {
                    "eval_count": 10,
                    "metrics": [{"name": mock.ANY, "average": 0.5, "count": 10}],
                }
            },
            {
                "builtin_scorer_usage_event": {
                    "metrics": [
                        {"name": "Correctness", "count": 10},
                        {"name": "Guidelines", "count": 10},
                    ],
                }
            },
        ]
    }


def test_emit_metric_usage_event_headers(mock_http_request):
    """The request is POSTed to the usage-events endpoint with client metadata headers."""
    emit_metric_usage_event(
        scorers=[is_concise],
        trace_count=10,
        session_count=0,
        aggregated_metrics={"is_concise/mean": 0.5},
    )

    call_args = mock_http_request.call_args[1]
    assert call_args["method"] == "POST"
    assert call_args["endpoint"] == "/api/2.0/agents/evaluation-client-usage-events"

    headers = call_args["extra_headers"]
    assert headers[_CLIENT_VERSION_HEADER] == VERSION
    assert headers[_SESSION_ID_HEADER] is not None
    # Batch size header reflects trace_count, serialized as a string.
    assert headers[_BATCH_SIZE_HEADER] == "10"
    assert headers[_CLIENT_NAME_HEADER] == "mlflow"


def test_emit_metric_usage_event_with_multiple_calls(mock_http_request):
    """The session ID header is stable across repeated emissions in one process."""
    for _ in range(3):
        emit_metric_usage_event(
            scorers=[is_concise, Correctness()],
            trace_count=10,
            session_count=0,
            aggregated_metrics={"is_concise/mean": 0.5, "correctness/mean": 0.8},
        )

    assert mock_http_request.call_count == 3
    session_ids = [
        call[1]["extra_headers"][_SESSION_ID_HEADER] for call in mock_http_request.call_args_list
    ]
    # All three calls must share one session ID.
    assert len(set(session_ids)) == 1


def test_emit_metric_usage_event_session_level_custom_scorer(mock_http_request):
    """Session-level custom scorers use session_count (3) as the metric count,
    while eval_count still reports the trace count (10)."""
    emit_metric_usage_event(
        scorers=[session_level_judge],
        trace_count=10,
        session_count=3,
        aggregated_metrics={"session_quality/mean": 0.7},
    )

    mock_http_request.assert_called_once()
    payload = mock_http_request.call_args[1]["json"]

    assert payload == {
        "agent_evaluation_client_usage_events": [
            {
                "custom_metric_usage_event": {
                    "eval_count": 10,
                    "metrics": [{"name": mock.ANY, "average": 0.7, "count": 3}],
                }
            }
        ]
    }


def test_emit_metric_usage_event_session_level_builtin_scorer(mock_http_request):
    """Session-level builtin scorers (UserFrustration) are counted per session."""
    emit_metric_usage_event(
        scorers=[UserFrustration()],
        trace_count=10,
        session_count=3,
        aggregated_metrics={"user_frustration/mean": 0.8},
    )

    mock_http_request.assert_called_once()
    payload = mock_http_request.call_args[1]["json"]

    assert payload == {
        "agent_evaluation_client_usage_events": [
            {
                "builtin_scorer_usage_event": {
                    "metrics": [{"name": "UserFrustration", "count": 3}],
                }
            }
        ]
    }


def test_emit_metric_usage_event_mixed_session_and_trace_level_scorers(mock_http_request):
    """Trace-level metrics use trace_count and session-level metrics use
    session_count within the same emitted payload."""
    emit_metric_usage_event(
        scorers=[is_concise, session_level_judge, Correctness()],
        trace_count=10,
        session_count=3,
        aggregated_metrics={
            "is_concise/mean": 0.5,
            "session_quality/mean": 0.7,
            "correctness/mean": 0.8,
        },
    )

    mock_http_request.assert_called_once()
    payload = mock_http_request.call_args[1]["json"]

    assert payload == {
        "agent_evaluation_client_usage_events": [
            {
                "custom_metric_usage_event": {
                    "eval_count": 10,
                    "metrics": [
                        {"name": mock.ANY, "average": 0.5, "count": 10},
                        {"name": mock.ANY, "average": 0.7, "count": 3},
                    ],
                }
            },
            {
                "builtin_scorer_usage_event": {
                    "metrics": [{"name": "Correctness", "count": 10}],
                }
            },
        ]
    }