test_evaluation.py
1 import hashlib 2 import io 3 import json 4 import os 5 import re 6 import signal 7 import subprocess 8 import uuid 9 from typing import Any, NamedTuple 10 from unittest import mock 11 12 import numpy as np 13 import pandas as pd 14 import pytest 15 import sklearn 16 import sklearn.compose 17 import sklearn.datasets 18 import sklearn.impute 19 import sklearn.linear_model 20 import sklearn.pipeline 21 import sklearn.preprocessing 22 from mlflow_test_plugin.dummy_evaluator import Array2DEvaluationArtifact, DummyEvaluator 23 from PIL import Image, ImageChops 24 from pyspark.ml.linalg import Vectors 25 from pyspark.ml.regression import LinearRegression as SparkLinearRegression 26 from pyspark.sql import SparkSession 27 from sklearn.metrics import ( 28 accuracy_score, 29 confusion_matrix, 30 mean_absolute_error, 31 mean_squared_error, 32 ) 33 34 import mlflow 35 from mlflow import MlflowClient 36 from mlflow.data.evaluation_dataset import EvaluationDataset, _gen_md5_for_arraylike_obj 37 from mlflow.data.pandas_dataset import from_pandas 38 from mlflow.entities import Trace, TraceData 39 from mlflow.exceptions import MlflowException 40 from mlflow.models.evaluation import ( 41 EvaluationArtifact, 42 EvaluationResult, 43 ModelEvaluator, 44 evaluate, 45 ) 46 from mlflow.models.evaluation.artifacts import ImageEvaluationArtifact 47 from mlflow.models.evaluation.base import ( 48 _get_model_from_deployment_endpoint_uri, 49 _is_model_deployment_endpoint_uri, 50 _start_run_or_reuse_active_run, 51 resolve_evaluators_and_configs, 52 ) 53 from mlflow.models.evaluation.evaluator_registry import _model_evaluation_registry 54 from mlflow.pyfunc import _ServedPyFuncModel 55 from mlflow.pyfunc.scoring_server.client import ScoringServerClient 56 from mlflow.tracing.constant import AssessmentMetadataKey, TraceMetadataKey 57 from mlflow.tracking.artifact_utils import get_artifact_uri 58 from mlflow.utils.file_utils import TempDir 59 60 from tests.tracing.helper import create_test_trace_info, get_traces 61 from tests.utils.test_file_utils import spark_session # noqa: F401 62 63 INFERENCE_FILE_NAME = "inference_inputs_outputs.json" 64 65 66 def get_iris(): 67 iris = sklearn.datasets.load_iris() 68 return iris.data, iris.target 69 70 71 def get_diabetes_dataset(): 72 data = sklearn.datasets.load_diabetes() 73 return data.data, data.target 74 75 76 def get_diabetes_spark_dataset(): 77 data = sklearn.datasets.load_diabetes() 78 spark = SparkSession.builder.master("local[*]").getOrCreate() 79 rows = [ 80 (Vectors.dense(features), float(label)) for features, label in zip(data.data, data.target) 81 ] 82 return spark.createDataFrame(spark.sparkContext.parallelize(rows, 1), ["features", "label"]) 83 84 85 def get_breast_cancer_dataset(): 86 data = sklearn.datasets.load_breast_cancer() 87 return data.data, data.target 88 89 90 class RunData(NamedTuple): 91 params: dict[str, Any] 92 metrics: dict[str, Any] 93 tags: dict[str, Any] 94 artifacts: list[str] 95 96 97 def get_run_data(run_id): 98 client = MlflowClient() 99 data = client.get_run(run_id).data 100 artifacts = [f.path for f in client.list_artifacts(run_id)] 101 return RunData(params=data.params, metrics=data.metrics, tags=data.tags, artifacts=artifacts) 102 103 104 def get_run_datasets(run_id): 105 client = MlflowClient() 106 return client.get_run(run_id).inputs.dataset_inputs 107 108 109 def get_raw_tag(run_id, tag_name): 110 client = MlflowClient() 111 data = client.get_run(run_id).data 112 return data.tags[tag_name] 113 114 115 def get_local_artifact_path(run_id, artifact_path): 116 return get_artifact_uri(run_id, artifact_path).replace("file://", "") 117 118 119 @pytest.fixture(scope="module") 120 def iris_dataset(): 121 X, y = get_iris() 122 eval_X = X[0::3] 123 eval_y = y[0::3] 124 constructor_args = {"data": eval_X, "targets": eval_y, "name": "dataset"} 125 ds = EvaluationDataset(**constructor_args) 126 ds._constructor_args = constructor_args 127 return ds 128 129 130 @pytest.fixture(scope="module") 131 def diabetes_dataset(): 132 X, y = get_diabetes_dataset() 133 eval_X = X[0::3] 134 eval_y = y[0::3] 135 constructor_args = {"data": eval_X, "targets": eval_y} 136 ds = EvaluationDataset(**constructor_args) 137 ds._constructor_args = constructor_args 138 return ds 139 140 141 @pytest.fixture(scope="module") 142 def diabetes_spark_dataset(): 143 spark_df = get_diabetes_spark_dataset().sample(fraction=0.3, seed=1) 144 constructor_args = {"data": spark_df, "targets": "label"} 145 ds = EvaluationDataset(**constructor_args) 146 ds._constructor_args = constructor_args 147 return ds 148 149 150 @pytest.fixture(scope="module") 151 def breast_cancer_dataset(): 152 X, y = get_breast_cancer_dataset() 153 eval_X = X[0::3] 154 eval_y = y[0::3] 155 constructor_args = {"data": eval_X, "targets": eval_y} 156 ds = EvaluationDataset(**constructor_args) 157 ds._constructor_args = constructor_args 158 return ds 159 160 161 def get_pipeline_model_dataset(): 162 """ 163 The dataset tweaks the IRIS dataset by changing its first 2 features into categorical features, 164 and replace some feature values with NA values. 165 The dataset is prepared for a pipeline model, see `pipeline_model_uri`. 166 """ 167 X, y = get_iris() 168 169 def convert_num_to_label(x): 170 return f"v_{round(x)}" 171 172 f1 = np.array(list(map(convert_num_to_label, X[:, 0]))) 173 f2 = np.array(list(map(convert_num_to_label, X[:, 1]))) 174 f3 = X[:, 2] 175 f4 = X[:, 3] 176 177 f1[0::8] = None 178 f2[1::8] = None 179 f3[2::8] = np.nan 180 f4[3::8] = np.nan 181 182 data = pd.DataFrame({ 183 "f1": f1, 184 "f2": f2, 185 "f3": f3, 186 "f4": f4, 187 "y": y, 188 }) 189 return data, "y" 190 191 192 @pytest.fixture 193 def pipeline_model_uri(): 194 return get_pipeline_model_uri() 195 196 197 def get_pipeline_model_uri(): 198 """ 199 Create a pipeline model that transforms and trains on the dataset returned by 200 `get_pipeline_model_dataset`. The pipeline model imputes the missing values in 201 input dataset, encodes categorical features, and then trains a logistic regression 202 model. 203 """ 204 data, target_col = get_pipeline_model_dataset() 205 X = data.drop(target_col, axis=1) 206 y = data[target_col].to_numpy() 207 208 encoder = sklearn.preprocessing.OrdinalEncoder() 209 str_imputer = sklearn.impute.SimpleImputer(missing_values=None, strategy="most_frequent") 210 num_imputer = sklearn.impute.SimpleImputer(missing_values=np.nan, strategy="mean") 211 preproc_pipeline = sklearn.pipeline.Pipeline([ 212 ("imputer", str_imputer), 213 ("encoder", encoder), 214 ]) 215 216 pipeline = sklearn.pipeline.Pipeline([ 217 ( 218 "transformer", 219 sklearn.compose.make_column_transformer( 220 (preproc_pipeline, ["f1", "f2"]), 221 (num_imputer, ["f3", "f4"]), 222 ), 223 ), 224 ("clf", sklearn.linear_model.LogisticRegression()), 225 ]) 226 pipeline.fit(X, y) 227 228 with mlflow.start_run(): 229 model_info = mlflow.sklearn.log_model(pipeline, name="pipeline_model") 230 return model_info.model_uri 231 232 233 @pytest.fixture 234 def linear_regressor_model_uri(): 235 return get_linear_regressor_model_uri() 236 237 238 def get_linear_regressor_model_uri(): 239 X, y = get_diabetes_dataset() 240 reg = sklearn.linear_model.LinearRegression() 241 reg.fit(X, y) 242 243 with mlflow.start_run(): 244 model_info = mlflow.sklearn.log_model(reg, name="reg_model") 245 return model_info.model_uri 246 247 248 @pytest.fixture 249 def spark_linear_regressor_model_uri(): 250 return get_spark_linear_regressor_model_uri() 251 252 253 def get_spark_linear_regressor_model_uri(): 254 spark_df = get_diabetes_spark_dataset() 255 reg = SparkLinearRegression() 256 spark_reg_model = reg.fit(spark_df) 257 258 with mlflow.start_run(): 259 model_info = mlflow.spark.log_model(spark_reg_model, artifact_path="spark_reg_model") 260 return model_info.model_uri 261 262 263 @pytest.fixture 264 def multiclass_logistic_regressor_model_uri(): 265 return multiclass_logistic_regressor_model_uri_by_max_iter(2) 266 267 268 def multiclass_logistic_regressor_model_uri_by_max_iter(max_iter): 269 X, y = get_iris() 270 clf = sklearn.linear_model.LogisticRegression(max_iter=max_iter) 271 clf.fit(X, y) 272 273 with mlflow.start_run(): 274 model_info = mlflow.sklearn.log_model(clf, name=f"clf_model_{max_iter}_iters") 275 return model_info.model_uri 276 277 278 @pytest.fixture 279 def binary_logistic_regressor_model_uri(): 280 return get_binary_logistic_regressor_model_uri() 281 282 283 def get_binary_logistic_regressor_model_uri(): 284 X, y = get_breast_cancer_dataset() 285 clf = sklearn.linear_model.LogisticRegression() 286 clf.fit(X, y) 287 288 with mlflow.start_run(): 289 model_info = mlflow.sklearn.log_model(clf, name="bin_clf_model") 290 return model_info.model_uri 291 292 293 @pytest.fixture 294 def svm_model_uri(): 295 return get_svm_model_url() 296 297 298 def get_svm_model_url(): 299 X, y = get_breast_cancer_dataset() 300 clf = sklearn.svm.LinearSVC() 301 clf.fit(X, y) 302 303 with mlflow.start_run(): 304 model_info = mlflow.sklearn.log_model(clf, name="svm_model") 305 return model_info.model_uri 306 307 308 @pytest.fixture 309 def iris_pandas_df_dataset(): 310 X, y = get_iris() 311 eval_X = X[0::3] 312 eval_y = y[0::3] 313 data = pd.DataFrame({ 314 "f1": eval_X[:, 0], 315 "f2": eval_X[:, 1], 316 "f3": eval_X[:, 2], 317 "f4": eval_X[:, 3], 318 "y": eval_y, 319 }) 320 constructor_args = {"data": data, "targets": "y"} 321 ds = EvaluationDataset(**constructor_args) 322 ds._constructor_args = constructor_args 323 return ds 324 325 326 @pytest.fixture 327 def iris_pandas_df_num_cols_dataset(): 328 X, y = get_iris() 329 eval_X = X[0::3] 330 eval_y = y[0::3] 331 data = pd.DataFrame(eval_X) 332 data["y"] = eval_y 333 constructor_args = {"data": data, "targets": "y"} 334 ds = EvaluationDataset(**constructor_args) 335 ds._constructor_args = constructor_args 336 return ds 337 338 339 def test_mlflow_evaluate_logs_traces(): 340 eval_data = pd.DataFrame({ 341 "inputs": [ 342 "What is MLflow?", 343 "What is Spark?", 344 ], 345 "ground_truth": ["What is MLflow?", "Not what is Spark?"], 346 }) 347 348 @mlflow.trace 349 def model(inputs): 350 return inputs 351 352 with mlflow.start_run() as run: 353 evaluate( 354 model, eval_data, targets="ground_truth", extra_metrics=[mlflow.metrics.exact_match()] 355 ) 356 assert len(get_traces()) == 1 357 assert run.info.run_id == get_traces()[0].info.request_metadata[TraceMetadataKey.SOURCE_RUN] 358 359 360 def test_pyfunc_evaluate_logs_traces(): 361 class Model(mlflow.pyfunc.PythonModel): 362 @mlflow.trace() 363 def predict(self, context, model_input): 364 return self.add(model_input, model_input) 365 366 @mlflow.trace() 367 def add(self, x, y): 368 return x + y 369 370 eval_data = pd.DataFrame({ 371 "inputs": [1, 2, 4], 372 "ground_truth": [2, 4, 8], 373 }) 374 375 with mlflow.start_run() as run: 376 model_info = mlflow.pyfunc.log_model(name="model", python_model=Model()) 377 evaluate( 378 model_info.model_uri, 379 eval_data, 380 targets="ground_truth", 381 extra_metrics=[mlflow.metrics.exact_match()], 382 ) 383 traces = get_traces() 384 assert len(traces) == 1 385 assert len(traces[0].data.spans) == 2 386 assert run.info.run_id == traces[0].info.request_metadata[TraceMetadataKey.SOURCE_RUN] 387 assert traces[0].info.request_metadata[TraceMetadataKey.MODEL_ID] == model_info.model_id 388 389 390 def test_classifier_evaluate(multiclass_logistic_regressor_model_uri, iris_dataset): 391 y_true = iris_dataset.labels_data 392 classifier_model = mlflow.pyfunc.load_model(multiclass_logistic_regressor_model_uri) 393 y_pred = classifier_model.predict(iris_dataset.features_data) 394 expected_accuracy_score = accuracy_score(y_true, y_pred) 395 expected_metrics = { 396 "accuracy_score": expected_accuracy_score, 397 } 398 expected_saved_metrics = { 399 "accuracy_score": expected_accuracy_score, 400 } 401 402 expected_csv_artifact = confusion_matrix(y_true, y_pred) 403 cm_figure = sklearn.metrics.ConfusionMatrixDisplay.from_predictions(y_true, y_pred).figure_ 404 img_buf = io.BytesIO() 405 cm_figure.savefig(img_buf) 406 img_buf.seek(0) 407 expected_image_artifact = Image.open(img_buf) 408 409 with mlflow.start_run() as run: 410 eval_result = evaluate( 411 multiclass_logistic_regressor_model_uri, 412 iris_dataset._constructor_args["data"], 413 model_type="classifier", 414 targets=iris_dataset._constructor_args["targets"], 415 evaluators="dummy_evaluator", 416 ) 417 418 csv_artifact_name = "confusion_matrix" 419 saved_csv_artifact_path = get_local_artifact_path(run.info.run_id, csv_artifact_name + ".csv") 420 421 png_artifact_name = "confusion_matrix_image" 422 saved_png_artifact_path = get_local_artifact_path(run.info.run_id, png_artifact_name) + ".png" 423 424 _, saved_metrics, _, saved_artifacts = get_run_data(run.info.run_id) 425 assert saved_metrics == expected_saved_metrics 426 assert set(saved_artifacts) == {csv_artifact_name + ".csv", png_artifact_name + ".png"} 427 428 assert eval_result.metrics == expected_metrics 429 confusion_matrix_artifact = eval_result.artifacts[csv_artifact_name] 430 np.testing.assert_array_equal(confusion_matrix_artifact.content, expected_csv_artifact) 431 assert confusion_matrix_artifact.uri == get_artifact_uri( 432 run.info.run_id, csv_artifact_name + ".csv" 433 ) 434 np.testing.assert_array_equal( 435 confusion_matrix_artifact._load(saved_csv_artifact_path), expected_csv_artifact 436 ) 437 confusion_matrix_image_artifact = eval_result.artifacts[png_artifact_name] 438 assert ( 439 ImageChops.difference( 440 confusion_matrix_image_artifact.content, expected_image_artifact 441 ).getbbox() 442 is None 443 ) 444 assert confusion_matrix_image_artifact.uri == get_artifact_uri( 445 run.info.run_id, png_artifact_name + ".png" 446 ) 447 assert ( 448 ImageChops.difference( 449 confusion_matrix_image_artifact._load(saved_png_artifact_path), 450 expected_image_artifact, 451 ).getbbox() 452 is None 453 ) 454 455 with TempDir() as temp_dir: 456 temp_dir_path = temp_dir.path() 457 eval_result.save(temp_dir_path) 458 459 with open(temp_dir.path("metrics.json")) as fp: 460 assert json.load(fp) == eval_result.metrics 461 462 with open(temp_dir.path("artifacts_metadata.json")) as fp: 463 json_dict = json.load(fp) 464 assert "confusion_matrix" in json_dict 465 assert json_dict["confusion_matrix"] == { 466 "uri": confusion_matrix_artifact.uri, 467 "class_name": "mlflow_test_plugin.dummy_evaluator.Array2DEvaluationArtifact", 468 } 469 470 assert "confusion_matrix_image" in json_dict 471 assert json_dict["confusion_matrix_image"] == { 472 "uri": confusion_matrix_image_artifact.uri, 473 "class_name": "mlflow.models.evaluation.artifacts.ImageEvaluationArtifact", 474 } 475 476 assert set(os.listdir(temp_dir.path("artifacts"))) == { 477 "confusion_matrix.csv", 478 "confusion_matrix_image.png", 479 } 480 481 loaded_eval_result = EvaluationResult.load(temp_dir_path) 482 assert loaded_eval_result.metrics == eval_result.metrics 483 loaded_confusion_matrix_artifact = loaded_eval_result.artifacts[csv_artifact_name] 484 assert confusion_matrix_artifact.uri == loaded_confusion_matrix_artifact.uri 485 np.testing.assert_array_equal( 486 confusion_matrix_artifact.content, 487 loaded_confusion_matrix_artifact.content, 488 ) 489 loaded_confusion_matrix_image_artifact = loaded_eval_result.artifacts[png_artifact_name] 490 assert confusion_matrix_image_artifact.uri == loaded_confusion_matrix_image_artifact.uri 491 assert ( 492 ImageChops.difference( 493 confusion_matrix_image_artifact.content, 494 loaded_confusion_matrix_image_artifact.content, 495 ).getbbox() 496 is None 497 ) 498 499 new_confusion_matrix_artifact = Array2DEvaluationArtifact(uri=confusion_matrix_artifact.uri) 500 new_confusion_matrix_artifact._load() 501 np.testing.assert_array_equal( 502 confusion_matrix_artifact.content, 503 new_confusion_matrix_artifact.content, 504 ) 505 new_confusion_matrix_image_artifact = ImageEvaluationArtifact( 506 uri=confusion_matrix_image_artifact.uri 507 ) 508 new_confusion_matrix_image_artifact._load() 509 np.testing.assert_array_equal( 510 confusion_matrix_image_artifact.content, 511 new_confusion_matrix_image_artifact.content, 512 ) 513 514 515 def test_regressor_evaluate(linear_regressor_model_uri, diabetes_dataset): 516 y_true = diabetes_dataset.labels_data 517 regressor_model = mlflow.pyfunc.load_model(linear_regressor_model_uri) 518 y_pred = regressor_model.predict(diabetes_dataset.features_data) 519 expected_mae = mean_absolute_error(y_true, y_pred) 520 expected_mse = mean_squared_error(y_true, y_pred) 521 expected_metrics = { 522 "mean_absolute_error": expected_mae, 523 "mean_squared_error": expected_mse, 524 } 525 expected_saved_metrics = { 526 "mean_absolute_error": expected_mae, 527 "mean_squared_error": expected_mse, 528 } 529 530 with mlflow.start_run() as run: 531 eval_result = evaluate( 532 linear_regressor_model_uri, 533 diabetes_dataset._constructor_args["data"], 534 model_type="regressor", 535 targets=diabetes_dataset._constructor_args["targets"], 536 evaluators="dummy_evaluator", 537 ) 538 _, saved_metrics, _, _ = get_run_data(run.info.run_id) 539 assert saved_metrics == expected_saved_metrics 540 assert eval_result.metrics == expected_metrics 541 542 543 def _load_diabetes_dataset_in_required_format(format): 544 data = sklearn.datasets.load_diabetes() 545 if format == "numpy": 546 return data.data, data.target 547 elif format == "pandas": 548 df = pd.DataFrame(data.data, columns=data.feature_names) 549 df["label"] = data.target 550 return df, "label" 551 elif format == "spark": 552 spark = SparkSession.builder.master("local[*]").getOrCreate() 553 panda_df = pd.DataFrame(data.data, columns=data.feature_names) 554 panda_df["label"] = data.target 555 spark_df = spark.createDataFrame(panda_df) 556 return spark_df, "label" 557 elif format == "list": 558 return data.data.tolist(), data.target.tolist() 559 else: 560 raise TypeError( 561 f"`format` must be one of 'numpy', 'pandas', 'spark' or 'list', but received {format}." 562 ) 563 564 565 @pytest.mark.parametrize("data_format", ["list", "numpy", "pandas", "spark"]) 566 def test_regressor_evaluation(linear_regressor_model_uri, data_format): 567 data, target = _load_diabetes_dataset_in_required_format(data_format) 568 569 with mlflow.start_run() as run: 570 eval_result = evaluate( 571 linear_regressor_model_uri, 572 data=data, 573 targets=target, 574 model_type="regressor", 575 evaluators=["default"], 576 ) 577 _, saved_metrics, _, _ = get_run_data(run.info.run_id) 578 579 for k, v in eval_result.metrics.items(): 580 assert v == saved_metrics[k] 581 582 datasets = get_run_datasets(run.info.run_id) 583 assert len(datasets) == 1 584 assert len(datasets[0].tags) == 0 585 assert datasets[0].dataset.source_type == "code" 586 587 588 def test_pandas_df_regressor_evaluation_mlflow_dataset_with_metric_prefix( 589 linear_regressor_model_uri, 590 ): 591 data = sklearn.datasets.load_diabetes() 592 df = pd.DataFrame(data.data, columns=data.feature_names) 593 df["y"] = data.target 594 mlflow_df = from_pandas(df=df, source="my_src", targets="y") 595 with mlflow.start_run() as run: 596 eval_result = evaluate( 597 linear_regressor_model_uri, 598 data=mlflow_df, 599 model_type="regressor", 600 evaluators=["default"], 601 evaluator_config={ 602 "default": { 603 "metric_prefix": "eval", 604 } 605 }, 606 ) 607 _, saved_metrics, _, _ = get_run_data(run.info.run_id) 608 609 for k, v in eval_result.metrics.items(): 610 assert v == saved_metrics[k] 611 612 datasets = get_run_datasets(run.info.run_id) 613 assert len(datasets) == 1 614 assert datasets[0].tags[0].value == "eval" 615 616 617 def test_pandas_df_regressor_evaluation_mlflow_dataset(linear_regressor_model_uri): 618 data = sklearn.datasets.load_diabetes() 619 df = pd.DataFrame(data.data, columns=data.feature_names) 620 df["y"] = data.target 621 mlflow_df = from_pandas(df=df, source="my_src", targets="y") 622 with mlflow.start_run() as run: 623 eval_result = evaluate( 624 linear_regressor_model_uri, 625 data=mlflow_df, 626 model_type="regressor", 627 evaluators=["default"], 628 ) 629 _, saved_metrics, _, _ = get_run_data(run.info.run_id) 630 631 for k, v in eval_result.metrics.items(): 632 assert v == saved_metrics[k] 633 634 datasets = get_run_datasets(run.info.run_id) 635 assert len(datasets) == 1 636 assert len(datasets[0].tags) == 0 637 638 639 def test_pandas_df_regressor_evaluation_mlflow_dataset_with_targets_from_dataset( 640 linear_regressor_model_uri, 641 ): 642 data = sklearn.datasets.load_diabetes() 643 df = pd.DataFrame(data.data, columns=data.feature_names) 644 df["y"] = data.target 645 mlflow_df = from_pandas(df=df, source="my_src", targets="y") 646 with mlflow.start_run() as run: 647 eval_result = evaluate( 648 linear_regressor_model_uri, 649 data=mlflow_df, 650 model_type="regressor", 651 evaluators=["default"], 652 ) 653 _, saved_metrics, _, _ = get_run_data(run.info.run_id) 654 655 for k, v in eval_result.metrics.items(): 656 assert v == saved_metrics[k] 657 658 datasets = get_run_datasets(run.info.run_id) 659 assert len(datasets) == 1 660 assert len(datasets[0].tags) == 0 661 662 663 def test_dataset_name(): 664 X, y = get_iris() 665 d1 = EvaluationDataset(data=X, targets=y, name="a1") 666 assert d1.name == "a1" 667 d2 = EvaluationDataset(data=X, targets=y) 668 assert d2.name == d2.hash 669 670 671 def test_dataset_metadata(): 672 X, y = get_iris() 673 d1 = EvaluationDataset(data=X, targets=y, name="a1", path="/path/to/a1") 674 assert d1._metadata == { 675 "hash": "6bdf4e119bf1a37e7907dfd9f0e68733", 676 "name": "a1", 677 "path": "/path/to/a1", 678 } 679 680 681 def test_gen_md5_for_arraylike_obj(): 682 def get_md5(data): 683 md5_gen = hashlib.md5(usedforsecurity=False) 684 _gen_md5_for_arraylike_obj(md5_gen, data) 685 return md5_gen.hexdigest() 686 687 list0 = list(range(20)) 688 list1 = [100] + list0[1:] 689 list2 = list0[:-1] + [100] 690 list3 = list0[:10] + [100] + list0[10:] 691 692 assert len({get_md5(list0), get_md5(list1), get_md5(list2), get_md5(list3)}) == 4 693 694 list4 = list0[:10] + [99] + list0[10:] 695 assert get_md5(list3) == get_md5(list4) 696 697 698 def test_gen_md5_for_arraylike_obj_with_pandas_df_using_float_idx_does_not_raise_keyerror(): 699 float_indices = np.random.uniform(low=0.5, high=13.3, size=(10,)) 700 df = pd.DataFrame(np.random.randn(10, 4), index=float_indices, columns=["A", "B", "C", "D"]) 701 md5_gen = hashlib.md5(usedforsecurity=False) 702 assert _gen_md5_for_arraylike_obj(md5_gen, df) is None 703 704 705 def test_dataset_hash( 706 iris_dataset, iris_pandas_df_dataset, iris_pandas_df_num_cols_dataset, diabetes_spark_dataset 707 ): 708 assert iris_dataset.hash == "99329a790dc483e7382c0d1d27aac3f3" 709 assert iris_pandas_df_dataset.hash == "799d4f50e2e353127f94a0e5300add06" 710 assert iris_pandas_df_num_cols_dataset.hash == "3c5fc56830a0646001253e25e17bdce4" 711 assert diabetes_spark_dataset.hash == "ebfb050519e7e5b463bd38b0c8d04243" 712 713 714 def test_trace_dataset_hash(): 715 # Validates that a dataset containing Traces can be hashed. 716 df = pd.DataFrame({ 717 "request": ["Hello"], 718 "trace": [Trace(info=create_test_trace_info("tr"), data=TraceData([]))], 719 }) 720 dataset = EvaluationDataset(data=df) 721 assert dataset.hash == "757c14bf38aa42d36b93ccd70b1ea719" 722 # Hash of a dataset with a different column should be different 723 df2 = pd.DataFrame({ 724 "request": ["Hi"], 725 "trace": [Trace(info=create_test_trace_info("tr"), data=TraceData([]))], 726 }) 727 dataset2 = EvaluationDataset(data=df2) 728 assert dataset2.hash != dataset.hash 729 730 731 def test_dataset_with_pandas_dataframe(): 732 data = pd.DataFrame({"f1": [1, 2], "f2": [3, 4], "f3": [5, 6], "label": [0, 1]}) 733 eval_dataset = EvaluationDataset(data=data, targets="label") 734 735 assert list(eval_dataset.features_data.columns) == ["f1", "f2", "f3"] 736 np.testing.assert_array_equal(eval_dataset.features_data.f1.to_numpy(), [1, 2]) 737 np.testing.assert_array_equal(eval_dataset.features_data.f2.to_numpy(), [3, 4]) 738 np.testing.assert_array_equal(eval_dataset.features_data.f3.to_numpy(), [5, 6]) 739 np.testing.assert_array_equal(eval_dataset.labels_data, [0, 1]) 740 741 eval_dataset2 = EvaluationDataset(data=data, targets="label", feature_names=["f3", "f2"]) 742 assert list(eval_dataset2.features_data.columns) == ["f3", "f2"] 743 np.testing.assert_array_equal(eval_dataset2.features_data.f2.to_numpy(), [3, 4]) 744 np.testing.assert_array_equal(eval_dataset2.features_data.f3.to_numpy(), [5, 6]) 745 746 747 def test_dataset_with_array_data(): 748 features = [[1, 2], [3, 4]] 749 labels = [0, 1] 750 751 for input_data in [features, np.array(features)]: 752 eval_dataset1 = EvaluationDataset(data=input_data, targets=labels) 753 np.testing.assert_array_equal(eval_dataset1.features_data, features) 754 np.testing.assert_array_equal(eval_dataset1.labels_data, labels) 755 assert list(eval_dataset1.feature_names) == ["feature_1", "feature_2"] 756 757 assert EvaluationDataset( 758 data=input_data, targets=labels, feature_names=["a", "b"] 759 ).feature_names == ["a", "b"] 760 761 with pytest.raises(MlflowException, match="all elements must have the same length"): 762 EvaluationDataset(data=[[1, 2], [3, 4, 5]], targets=labels) 763 764 765 def test_dataset_autogen_feature_names(): 766 labels = [0] 767 eval_dataset2 = EvaluationDataset(data=[list(range(9))], targets=labels) 768 assert eval_dataset2.feature_names == [f"feature_{i + 1}" for i in range(9)] 769 770 eval_dataset2 = EvaluationDataset(data=[list(range(10))], targets=labels) 771 assert eval_dataset2.feature_names == [f"feature_{i + 1:02d}" for i in range(10)] 772 773 eval_dataset2 = EvaluationDataset(data=[list(range(99))], targets=labels) 774 assert eval_dataset2.feature_names == [f"feature_{i + 1:02d}" for i in range(99)] 775 776 eval_dataset2 = EvaluationDataset(data=[list(range(100))], targets=labels) 777 assert eval_dataset2.feature_names == [f"feature_{i + 1:03d}" for i in range(100)] 778 779 with pytest.raises( 780 MlflowException, match="features example rows must be the same length with labels array" 781 ): 782 EvaluationDataset(data=[[1, 2], [3, 4]], targets=[1, 2, 3]) 783 784 785 def test_dataset_from_spark_df(spark_session): 786 spark_df = spark_session.createDataFrame([(1.0, 2.0, 3.0)] * 10, ["f1", "f2", "y"]) 787 with mock.patch.object(EvaluationDataset, "SPARK_DATAFRAME_LIMIT", 5): 788 dataset = EvaluationDataset(spark_df, targets="y") 789 assert list(dataset.features_data.columns) == ["f1", "f2"] 790 assert list(dataset.features_data["f1"]) == [1.0] * 5 791 assert list(dataset.features_data["f2"]) == [2.0] * 5 792 assert list(dataset.labels_data) == [3.0] * 5 793 794 795 def test_log_dataset_tag(iris_dataset, iris_pandas_df_dataset): 796 model_uuid = uuid.uuid4().hex 797 with mlflow.start_run() as run: 798 client = MlflowClient() 799 iris_dataset._log_dataset_tag(client, run.info.run_id, model_uuid=model_uuid) 800 _, _, tags, _ = get_run_data(run.info.run_id) 801 802 logged_meta1 = {**iris_dataset._metadata, "model": model_uuid} 803 logged_meta2 = {**iris_pandas_df_dataset._metadata, "model": model_uuid} 804 805 assert json.loads(tags["mlflow.datasets"]) == [logged_meta1] 806 807 raw_tag = get_raw_tag(run.info.run_id, "mlflow.datasets") 808 assert " " not in raw_tag # assert the tag string remove all whitespace chars. 809 810 # Test appending dataset tag 811 iris_pandas_df_dataset._log_dataset_tag(client, run.info.run_id, model_uuid=model_uuid) 812 _, _, tags, _ = get_run_data(run.info.run_id) 813 assert json.loads(tags["mlflow.datasets"]) == [ 814 logged_meta1, 815 logged_meta2, 816 ] 817 818 # Test log repetitive dataset 819 iris_dataset._log_dataset_tag(client, run.info.run_id, model_uuid=model_uuid) 820 _, _, tags, _ = get_run_data(run.info.run_id) 821 assert json.loads(tags["mlflow.datasets"]) == [ 822 logged_meta1, 823 logged_meta2, 824 ] 825 826 827 class FakeEvaluator1(ModelEvaluator): 828 @classmethod 829 def can_evaluate(cls, *, model_type, evaluator_config, **kwargs): 830 raise RuntimeError() 831 832 def evaluate(self, *, model, model_type, dataset, run_id, evaluator_config, **kwargs): 833 raise RuntimeError() 834 835 836 class FakeEvaluator2(ModelEvaluator): 837 @classmethod 838 def can_evaluate(cls, *, model_type, evaluator_config, **kwargs): 839 raise RuntimeError() 840 841 def evaluate(self, *, model, model_type, dataset, run_id, evaluator_config, **kwargs): 842 raise RuntimeError() 843 844 845 class FakeArtifact1(EvaluationArtifact): 846 def _save(self, output_artifact_path): 847 raise RuntimeError() 848 849 def _load_content_from_file(self, local_artifact_path): 850 raise RuntimeError() 851 852 853 class FakeArtifact2(EvaluationArtifact): 854 def _save(self, output_artifact_path): 855 raise RuntimeError() 856 857 def _load_content_from_file(self, local_artifact_path): 858 raise RuntimeError() 859 860 861 class PyFuncModelMatcher: 862 def __eq__(self, other): 863 return isinstance(other, mlflow.pyfunc.PyFuncModel) 864 865 866 class ModelPredictFuncMatcher: 867 def __eq__(self, other): 868 return callable(other) 869 870 871 def test_evaluator_evaluation_interface(multiclass_logistic_regressor_model_uri, iris_dataset): 872 with mock.patch.object( 873 _model_evaluation_registry, "_registry", {"test_evaluator1": FakeEvaluator1} 874 ): 875 evaluator1_config = {"eval1_config_a": 3, "eval1_config_b": 4} 876 evaluator1_return_value = EvaluationResult( 877 metrics={"m1": 5, "m2": 6}, 878 artifacts={"a1": FakeArtifact1(uri="uri1"), "a2": FakeArtifact2(uri="uri2")}, 879 ) 880 with ( 881 mock.patch.object( 882 FakeEvaluator1, "can_evaluate", return_value=False 883 ) as mock_can_evaluate, 884 mock.patch.object( 885 FakeEvaluator1, "evaluate", return_value=evaluator1_return_value 886 ) as mock_evaluate, 887 ): 888 with mlflow.start_run(): 889 with pytest.raises( 890 MlflowException, 891 match="The model could not be evaluated by any of the registered evaluators", 892 ): 893 evaluate( 894 multiclass_logistic_regressor_model_uri, 895 data=iris_dataset._constructor_args["data"], 896 model_type="classifier", 897 targets=iris_dataset._constructor_args["targets"], 898 evaluators="test_evaluator1", 899 evaluator_config=evaluator1_config, 900 ) 901 mock_can_evaluate.assert_called_once_with( 902 model_type="classifier", evaluator_config=evaluator1_config 903 ) 904 mock_evaluate.assert_not_called() 905 with ( 906 mock.patch.object( 907 FakeEvaluator1, "can_evaluate", return_value=True 908 ) as mock_can_evaluate, 909 mock.patch.object( 910 FakeEvaluator1, "evaluate", return_value=evaluator1_return_value 911 ) as mock_evaluate, 912 ): 913 with mlflow.start_run() as run: 914 eval1_result = evaluate( 915 multiclass_logistic_regressor_model_uri, 916 iris_dataset._constructor_args["data"], 917 model_type="classifier", 918 targets=iris_dataset._constructor_args["targets"], 919 evaluators="test_evaluator1", 920 evaluator_config=evaluator1_config, 921 extra_metrics=None, 922 ) 923 assert eval1_result.metrics == evaluator1_return_value.metrics 924 assert eval1_result.artifacts == evaluator1_return_value.artifacts 925 926 mock_can_evaluate.assert_called_once_with( 927 model_type="classifier", evaluator_config=evaluator1_config 928 ) 929 mock_evaluate.assert_called_once_with( 930 model=PyFuncModelMatcher(), 931 model_type="classifier", 932 model_id=multiclass_logistic_regressor_model_uri.split("/")[-1], 933 dataset=iris_dataset, 934 run_id=run.info.run_id, 935 evaluator_config=evaluator1_config, 936 extra_metrics=None, 937 custom_artifacts=None, 938 predictions=None, 939 ) 940 941 942 def test_evaluate_with_multi_evaluators( 943 multiclass_logistic_regressor_model_uri, 944 iris_dataset, 945 ): 946 with mock.patch.object( 947 _model_evaluation_registry, 948 "_registry", 949 {"test_evaluator1": FakeEvaluator1, "test_evaluator2": FakeEvaluator2}, 950 ): 951 evaluator1_config = {"eval1_config": 3} 952 evaluator2_config = {"eval2_config": 4} 953 evaluator1_return_value = EvaluationResult( 954 metrics={"m1": 5}, artifacts={"a1": FakeArtifact1(uri="uri1")} 955 ) 956 957 evaluator2_return_value = EvaluationResult( 958 metrics={"m2": 6}, artifacts={"a2": FakeArtifact2(uri="uri2")} 959 ) 960 961 def get_evaluate_call_arg(model, evaluator_config): 962 return { 963 "model": model, 964 "model_type": "classifier", 965 "model_id": model.model_id, 966 "dataset": iris_dataset, 967 "run_id": run.info.run_id, 968 "evaluator_config": evaluator_config, 969 "extra_metrics": None, 970 "custom_artifacts": None, 971 "predictions": None, 972 } 973 974 # evaluators = None is the case evaluators unspecified, it should fetch all registered 975 # evaluators, and the evaluation results should equal to the case of 976 # evaluators=["test_evaluator1", "test_evaluator2"] 977 for evaluators in [None, ["test_evaluator1", "test_evaluator2"]]: 978 with ( 979 mock.patch.object( 980 FakeEvaluator1, "can_evaluate", return_value=True 981 ) as mock_can_evaluate1, 982 mock.patch.object( 983 FakeEvaluator1, "evaluate", return_value=evaluator1_return_value 984 ) as mock_evaluate1, 985 mock.patch.object( 986 FakeEvaluator2, "can_evaluate", return_value=True 987 ) as mock_can_evaluate2, 988 mock.patch.object( 989 FakeEvaluator2, "evaluate", return_value=evaluator2_return_value 990 ) as mock_evaluate2, 991 ): 992 with mlflow.start_run() as run: 993 eval_result = evaluate( 994 multiclass_logistic_regressor_model_uri, 995 iris_dataset._constructor_args["data"], 996 model_type="classifier", 997 targets=iris_dataset._constructor_args["targets"], 998 evaluators=evaluators, 999 evaluator_config={ 1000 "test_evaluator1": evaluator1_config, 1001 "test_evaluator2": evaluator2_config, 1002 }, 1003 ) 1004 assert eval_result.metrics == { 1005 **evaluator1_return_value.metrics, 1006 **evaluator2_return_value.metrics, 1007 } 1008 assert eval_result.artifacts == { 1009 **evaluator1_return_value.artifacts, 1010 **evaluator2_return_value.artifacts, 1011 } 1012 mock_evaluate1.assert_called_once_with( 1013 **get_evaluate_call_arg( 1014 mlflow.pyfunc.load_model(multiclass_logistic_regressor_model_uri), 1015 evaluator1_config, 1016 ) 1017 ) 1018 mock_can_evaluate1.assert_has_calls([ 1019 mock.call(model_type="classifier", evaluator_config=evaluator1_config) 1020 ]) 1021 mock_evaluate2.assert_called_once_with( 1022 **get_evaluate_call_arg( 1023 mlflow.pyfunc.load_model(multiclass_logistic_regressor_model_uri), 1024 evaluator2_config, 1025 ) 1026 ) 1027 mock_can_evaluate2.assert_has_calls([ 1028 mock.call(model_type="classifier", evaluator_config=evaluator2_config) 1029 ]) 1030 1031 1032 def test_custom_evaluators_no_model_or_preds(multiclass_logistic_regressor_model_uri, iris_dataset): 1033 """ 1034 Tests that custom evaluators are called correctly when no model or predictions are provided 1035 """ 1036 with ( 1037 mock.patch.object( 1038 _model_evaluation_registry, "_registry", {"test_evaluator1": FakeEvaluator1} 1039 ), 1040 mock.patch.object(FakeEvaluator1, "can_evaluate", return_value=True) as mock_can_evaluate, 1041 mock.patch.object(FakeEvaluator1, "evaluate") as mock_evaluate, 1042 ): 1043 with mlflow.start_run() as run: 1044 evaluate( 1045 model=None, 1046 data=iris_dataset._constructor_args["data"], 1047 predictions=None, 1048 model_type="classifier", 1049 targets=iris_dataset._constructor_args["targets"], 1050 evaluators="test_evaluator1", 1051 evaluator_config=None, 1052 extra_metrics=None, 1053 ) 1054 1055 mock_can_evaluate.assert_called_once_with(model_type="classifier", evaluator_config={}) 1056 mock_evaluate.assert_called_once_with( 1057 model=None, 1058 dataset=iris_dataset, 1059 predictions=None, 1060 model_type="classifier", 1061 model_id=None, 1062 run_id=run.info.run_id, 1063 evaluator_config={}, 1064 extra_metrics=None, 1065 custom_artifacts=None, 1066 ) 1067 1068 1069 def test_start_run_or_reuse_active_run(): 1070 with _start_run_or_reuse_active_run() as run: 1071 assert mlflow.active_run().info.run_id == run.info.run_id 1072 1073 with mlflow.start_run() as run: 1074 active_run_id = run.info.run_id 1075 1076 with _start_run_or_reuse_active_run() as run: 1077 assert run.info.run_id == active_run_id 1078 1079 with _start_run_or_reuse_active_run() as run: 1080 assert run.info.run_id == active_run_id 1081 1082 1083 def test_resolve_evaluators_and_configs(): 1084 from mlflow.models.evaluation.evaluators.classifier import ClassifierEvaluator 1085 from mlflow.models.evaluation.evaluators.default import DefaultEvaluator 1086 from mlflow.models.evaluation.evaluators.regressor import RegressorEvaluator 1087 from mlflow.models.evaluation.evaluators.shap import ShapEvaluator 1088 1089 def assert_equal(actual, expected): 1090 assert len(actual) == len(expected) 1091 for actual_i, expected_i in zip(actual, expected): 1092 assert actual_i.name == expected_i[0] 1093 assert isinstance(actual_i.evaluator, expected_i[1]) 1094 assert actual_i.config == expected_i[2] 1095 1096 with mock.patch.object( 1097 _model_evaluation_registry, 1098 "_registry", 1099 {"default": DefaultEvaluator}, 1100 ): 1101 assert_equal( 1102 resolve_evaluators_and_configs(None, None), [("default", DefaultEvaluator, {})] 1103 ) 1104 assert_equal( 1105 actual=resolve_evaluators_and_configs(None, {"a": 3}), 1106 expected=[("default", DefaultEvaluator, {"a": 3})], 1107 ) 1108 assert_equal( 1109 actual=resolve_evaluators_and_configs(None, {"default": {"a": 3}}), 1110 expected=[("default", DefaultEvaluator, {"a": 3})], 1111 ) 1112 1113 # 1. evaluators is None -> only default evaluator is used 1114 assert_equal( 1115 actual=resolve_evaluators_and_configs(None, None), 1116 expected=[("default", DefaultEvaluator, {})], 1117 ) 1118 assert_equal( 1119 actual=resolve_evaluators_and_configs(None, {"a": 3}), 1120 expected=[("default", DefaultEvaluator, {"a": 3})], 1121 ) 1122 1123 # 2. evaluators is None and model type is classifier -> builtin classifier evaluators 1124 # are used instead of the default. Also dummy evaluator can evaluate classifier. 1125 assert_equal( 1126 actual=resolve_evaluators_and_configs( 1127 evaluators=None, evaluator_config={"a": 3}, model_type="classifier" 1128 ), 1129 expected=[ 1130 ("classifier", ClassifierEvaluator, {"a": 3}), 1131 ("shap", ShapEvaluator, {"a": 3}), 1132 ("dummy_evaluator", DummyEvaluator, {"a": 3}), 1133 ], 1134 ) 1135 1136 assert_equal( 1137 resolve_evaluators_and_configs( 1138 evaluators=None, 1139 # config for a specific evaluator 1140 evaluator_config={"shap": {"a": 3}}, 1141 model_type="classifier", 1142 ), 1143 expected=[ 1144 ("classifier", ClassifierEvaluator, {}), 1145 ("shap", ShapEvaluator, {"a": 3}), 1146 ("dummy_evaluator", DummyEvaluator, {}), 1147 ], 1148 ) 1149 1150 assert_equal( 1151 resolve_evaluators_and_configs( 1152 evaluators=None, 1153 # config for a "default" copied to builtin evaluators 1154 evaluator_config={"default": {"a": 3}}, 1155 model_type="classifier", 1156 ), 1157 expected=[ 1158 ("classifier", ClassifierEvaluator, {"a": 3}), 1159 ("shap", ShapEvaluator, {"a": 3}), 1160 ("dummy_evaluator", DummyEvaluator, {}), 1161 ], 1162 ) 1163 1164 # 3. evaluators is string -> the specified evaluator is used 1165 assert_equal( 1166 actual=resolve_evaluators_and_configs("dummy_evaluator", {"a": 3}, "regressor"), 1167 expected=[("dummy_evaluator", DummyEvaluator, {"a": 3})], 1168 ) 1169 assert_equal( 1170 actual=resolve_evaluators_and_configs("default", {"a": 3}), 1171 expected=[("default", DefaultEvaluator, {"a": 3})], 1172 ) 1173 assert_equal( 1174 actual=resolve_evaluators_and_configs("default", {"a": 3}, "regressor"), 1175 expected=[ 1176 ("regressor", RegressorEvaluator, {"a": 3}), 1177 ("shap", ShapEvaluator, {"a": 3}), 1178 ], 1179 ) 1180 assert_equal( 1181 actual=resolve_evaluators_and_configs("regressor", {"a": 3}, "regressor"), 1182 expected=[("regressor", RegressorEvaluator, {"a": 3})], 1183 ) 1184 assert_equal( 1185 actual=resolve_evaluators_and_configs("non-existing", {"a": 3}), 1186 expected=[], # empty because not registered evaluator 1187 ) 1188 1189 # 4. evaluators is a list of strings -> the specified evaluators are used 1190 assert_equal( 1191 actual=resolve_evaluators_and_configs( 1192 evaluators=["default", "dummy_evaluator"], 1193 evaluator_config={"dummy_evaluator": {"a": 3}, "default": {"a": 5}}, 1194 model_type="classifier", 1195 ), 1196 expected=[ 1197 ("classifier", ClassifierEvaluator, {"a": 5}), 1198 ("shap", ShapEvaluator, {"a": 5}), 1199 ("dummy_evaluator", DummyEvaluator, {"a": 3}), 1200 ], 1201 ) 1202 1203 assert_equal( 1204 actual=resolve_evaluators_and_configs( 1205 evaluators=["regressor"], 1206 evaluator_config={"regressor": {"a": 5}}, 1207 model_type="regressor", 1208 ), 1209 expected=[("regressor", RegressorEvaluator, {"a": 5})], 1210 ) 1211 1212 with pytest.raises( 1213 MlflowException, 1214 match="If `evaluators` argument is an evaluator name list, evaluator_config must", 1215 ): 1216 resolve_evaluators_and_configs(["default", "dummy_evaluator"], {"abc": {"a": 3}}) 1217 1218 1219 def test_resolve_evaluators_raise_for_missing_databricks_agent_dependency(): 1220 with pytest.raises( 1221 MlflowException, 1222 match="Databricks Agents SDK must be installed to use the `databricks-agent` model type.", 1223 ): 1224 resolve_evaluators_and_configs( 1225 evaluators=None, evaluator_config=None, model_type="databricks-agent" 1226 ) 1227 1228 1229 def test_evaluate_env_manager_params(multiclass_logistic_regressor_model_uri, iris_dataset): 1230 model = mlflow.pyfunc.load_model(multiclass_logistic_regressor_model_uri) 1231 1232 with mock.patch.object( 1233 _model_evaluation_registry, "_registry", {"test_evaluator1": FakeEvaluator1} 1234 ): 1235 with pytest.raises(MlflowException, match="The model argument must be a string URI"): 1236 evaluate( 1237 model, 1238 iris_dataset._constructor_args["data"], 1239 model_type="classifier", 1240 targets=iris_dataset._constructor_args["targets"], 1241 evaluators=None, 1242 env_manager="virtualenv", 1243 ) 1244 1245 with pytest.raises(MlflowException, match="Invalid value for `env_manager`"): 1246 evaluate( 1247 multiclass_logistic_regressor_model_uri, 1248 iris_dataset._constructor_args["data"], 1249 model_type="classifier", 1250 targets=iris_dataset._constructor_args["targets"], 1251 evaluators=None, 1252 env_manager="manager", 1253 ) 1254 1255 1256 @pytest.mark.parametrize("env_manager", ["virtualenv", "conda"]) 1257 def test_evaluate_restores_env(tmp_path, env_manager, iris_dataset): 1258 class EnvRestoringTestModel(mlflow.pyfunc.PythonModel): 1259 def __init__(self): 1260 pass 1261 1262 def predict(self, context, model_input, params=None): 1263 pred_value = 1 if sklearn.__version__ == "1.4.2" else 0 1264 1265 return model_input.apply(lambda row: pred_value, axis=1) 1266 1267 class FakeEvauatorEnv(ModelEvaluator): 1268 @classmethod 1269 def can_evaluate(cls, *, model_type, evaluator_config, **kwargs): 1270 return True 1271 1272 def evaluate(self, *, model, model_type, dataset, run_id, evaluator_config, **kwargs): 1273 y = model.predict(pd.DataFrame(dataset.features_data)) 1274 return EvaluationResult(metrics={"test": y[0]}, artifacts={}) 1275 1276 model_path = os.path.join(tmp_path, "model") 1277 1278 mlflow.pyfunc.save_model( 1279 path=model_path, 1280 python_model=EnvRestoringTestModel(), 1281 pip_requirements=["scikit-learn==1.4.2"], 1282 ) 1283 1284 with mock.patch.object( 1285 _model_evaluation_registry, 1286 "_registry", 1287 {"test_evaluator_env": FakeEvauatorEnv}, 1288 ): 1289 result = evaluate( 1290 model_path, 1291 iris_dataset._constructor_args["data"], 1292 model_type="classifier", 1293 targets=iris_dataset._constructor_args["targets"], 1294 evaluators=None, 1295 env_manager=env_manager, 1296 ) 1297 assert result.metrics["test"] == 1 1298 1299 1300 def test_evaluate_terminates_model_servers(multiclass_logistic_regressor_model_uri, iris_dataset): 1301 # Mock the _load_model_or_server() results to avoid starting model servers 1302 model = mlflow.pyfunc.load_model(multiclass_logistic_regressor_model_uri) 1303 client = ScoringServerClient("127.0.0.1", "8080") 1304 served_model_1 = _ServedPyFuncModel(model_meta=model.metadata, client=client, server_pid=1) 1305 served_model_2 = _ServedPyFuncModel(model_meta=model.metadata, client=client, server_pid=2) 1306 1307 with ( 1308 mock.patch.object( 1309 _model_evaluation_registry, 1310 "_registry", 1311 {"test_evaluator1": FakeEvaluator1}, 1312 ), 1313 mock.patch.object(FakeEvaluator1, "can_evaluate", return_value=True), 1314 mock.patch.object( 1315 FakeEvaluator1, "evaluate", return_value=EvaluationResult(metrics={}, artifacts={}) 1316 ), 1317 mock.patch("mlflow.pyfunc._load_model_or_server") as server_loader, 1318 mock.patch("os.kill") as os_mock, 1319 ): 1320 server_loader.side_effect = [served_model_1, served_model_2] 1321 evaluate( 1322 multiclass_logistic_regressor_model_uri, 1323 iris_dataset._constructor_args["data"], 1324 model_type="classifier", 1325 targets=iris_dataset._constructor_args["targets"], 1326 evaluators=None, 1327 env_manager="virtualenv", 1328 ) 1329 assert os_mock.call_count == 1 1330 os_mock.assert_has_calls([mock.call(1, signal.SIGTERM)]) 1331 1332 1333 def test_evaluate_stdin_scoring_server(): 1334 X, y = sklearn.datasets.load_iris(return_X_y=True) 1335 X = X[::5] 1336 y = y[::5] 1337 model = sklearn.linear_model.LogisticRegression() 1338 model.fit(X, y) 1339 1340 with mlflow.start_run(): 1341 model_info = mlflow.sklearn.log_model(model, name="model") 1342 1343 with mock.patch("mlflow.pyfunc.check_port_connectivity", return_value=False): 1344 mlflow.evaluate( 1345 model_info.model_uri, 1346 X, 1347 targets=y, 1348 model_type="classifier", 1349 evaluators=["default"], 1350 env_manager="virtualenv", 1351 ) 1352 1353 1354 @pytest.mark.parametrize("model_type", ["regressor", "classifier"]) 1355 def test_targets_is_required_for_regressor_and_classifier_models(model_type): 1356 with pytest.raises(MlflowException, match="The targets argument must be specified"): 1357 mlflow.evaluate( 1358 "models:/test", 1359 data=pd.DataFrame(), 1360 model_type=model_type, 1361 ) 1362 1363 1364 def test_evaluate_xgboost_classifier(): 1365 import xgboost as xgb 1366 1367 X, y = sklearn.datasets.load_iris(return_X_y=True, as_frame=True) 1368 X = X[::5] 1369 y = y[::5] 1370 data = xgb.DMatrix(X, label=y) 1371 model = xgb.train({"objective": "multi:softmax", "num_class": 3}, data, num_boost_round=5) 1372 1373 with mlflow.start_run() as run: 1374 model_info = mlflow.xgboost.log_model(model, name="model") 1375 mlflow.evaluate( 1376 model_info.model_uri, 1377 X.assign(y=y), 1378 targets="y", 1379 model_type="classifier", 1380 ) 1381 1382 run = mlflow.get_run(run.info.run_id) 1383 assert "accuracy_score" in run.data.metrics 1384 assert "recall_score" in run.data.metrics 1385 assert "precision_score" in run.data.metrics 1386 assert "f1_score" in run.data.metrics 1387 1388 1389 def test_evaluate_lightgbm_regressor(): 1390 import lightgbm as lgb 1391 1392 X, y = sklearn.datasets.load_diabetes(return_X_y=True, as_frame=True) 1393 X = X[::5] 1394 y = y[::5] 1395 data = lgb.Dataset(X, label=y) 1396 model = lgb.train({"objective": "regression"}, data, num_boost_round=5) 1397 1398 with mlflow.start_run() as run: 1399 model_info = mlflow.lightgbm.log_model(model, name="model") 1400 mlflow.evaluate( 1401 model_info.model_uri, 1402 X.assign(y=y), 1403 targets="y", 1404 model_type="regressor", 1405 ) 1406 1407 run = mlflow.get_run(run.info.run_id) 1408 assert "mean_absolute_error" in run.data.metrics 1409 assert "mean_squared_error" in run.data.metrics 1410 assert "root_mean_squared_error" in run.data.metrics 1411 1412 1413 def test_evaluate_with_targets_error_handling(): 1414 import lightgbm as lgb 1415 1416 X, y = sklearn.datasets.load_diabetes(return_X_y=True, as_frame=True) 1417 X = X[::5] 1418 y = y[::5] 1419 lgb_data = lgb.Dataset(X, label=y) 1420 model = lgb.train({"objective": "regression"}, lgb_data, num_boost_round=5) 1421 ERROR_TYPE_1 = ( 1422 "The top-level targets parameter should not be specified since a Dataset " 1423 "is used. Please only specify the targets column name in the Dataset. For example: " 1424 "`data = mlflow.data.from_pandas(df=X.assign(y=y), targets='y')`. " 1425 "Meanwhile, please specify `mlflow.evaluate(..., targets=None, ...)`." 1426 ) 1427 ERROR_TYPE_2 = ( 1428 "The targets column name must be specified in the provided Dataset " 1429 "for regressor models. For example: " 1430 "`data = mlflow.data.from_pandas(df=X.assign(y=y), targets='y')`" 1431 ) 1432 ERROR_TYPE_3 = "The targets argument must be specified for regressor models." 1433 1434 pandas_dataset_no_targets = X 1435 mlflow_dataset_no_targets = mlflow.data.from_pandas(df=X.assign(y=y)) 1436 mlflow_dataset_with_targets = mlflow.data.from_pandas(df=X.assign(y=y), targets="y") 1437 1438 with mlflow.start_run(): 1439 with pytest.raises(MlflowException, match=re.escape(ERROR_TYPE_1)): 1440 mlflow.evaluate( 1441 model=model, 1442 data=mlflow_dataset_with_targets, 1443 model_type="regressor", 1444 targets="y", 1445 ) 1446 1447 with pytest.raises(MlflowException, match=re.escape(ERROR_TYPE_1)): 1448 mlflow.evaluate( 1449 model=model, 1450 data=mlflow_dataset_no_targets, 1451 model_type="regressor", 1452 targets="y", 1453 ) 1454 1455 with pytest.raises(MlflowException, match=re.escape(ERROR_TYPE_1)): 1456 mlflow.evaluate( 1457 model=model, 1458 data=mlflow_dataset_with_targets, 1459 model_type="question-answering", 1460 targets="y", 1461 ) 1462 1463 with pytest.raises(MlflowException, match=re.escape(ERROR_TYPE_1)): 1464 mlflow.evaluate( 1465 model=model, 1466 data=mlflow_dataset_no_targets, 1467 model_type="question-answering", 1468 targets="y", 1469 ) 1470 1471 with pytest.raises(MlflowException, match=re.escape(ERROR_TYPE_2)): 1472 mlflow.evaluate( 1473 model=model, 1474 data=mlflow_dataset_no_targets, 1475 model_type="regressor", 1476 ) 1477 1478 with pytest.raises(MlflowException, match=re.escape(ERROR_TYPE_3)): 1479 mlflow.evaluate( 1480 model=model, 1481 data=pandas_dataset_no_targets, 1482 model_type="regressor", 1483 ) 1484 1485 1486 def test_evaluate_with_predictions_error_handling(): 1487 import lightgbm as lgb 1488 1489 X, y = sklearn.datasets.load_diabetes(return_X_y=True, as_frame=True) 1490 X = X[::5] 1491 y = y[::5] 1492 lgb_data = lgb.Dataset(X, label=y) 1493 model = lgb.train({"objective": "regression"}, lgb_data, num_boost_round=5) 1494 mlflow_dataset_with_predictions = mlflow.data.from_pandas( 1495 df=X.assign(y=y, model_output=y), 1496 targets="y", 1497 predictions="model_output", 1498 ) 1499 with mlflow.start_run(): 1500 with pytest.raises( 1501 MlflowException, 1502 match="The predictions parameter should not be specified in the Dataset since a model " 1503 "is specified. Please remove the predictions column from the Dataset.", 1504 ): 1505 mlflow.evaluate( 1506 model=model, 1507 data=mlflow_dataset_with_predictions, 1508 model_type="regressor", 1509 ) 1510 1511 1512 def test_evaluate_with_function_input_single_output(): 1513 import lightgbm as lgb 1514 1515 X, y = sklearn.datasets.load_diabetes(return_X_y=True, as_frame=True) 1516 X = X[::5] 1517 y = y[::5] 1518 data = lgb.Dataset(X, label=y) 1519 model = lgb.train({"objective": "regression"}, data, num_boost_round=5) 1520 1521 def fn(X): 1522 return model.predict(X) 1523 1524 with mlflow.start_run() as run: 1525 mlflow.evaluate( 1526 fn, 1527 X.assign(y=y), 1528 targets="y", 1529 model_type="regressor", 1530 ) 1531 run = mlflow.get_run(run.info.run_id) 1532 assert "mean_absolute_error" in run.data.metrics 1533 assert "mean_squared_error" in run.data.metrics 1534 assert "root_mean_squared_error" in run.data.metrics 1535 1536 1537 def test_evaluate_with_loaded_pyfunc_model(): 1538 import lightgbm as lgb 1539 1540 X, y = sklearn.datasets.load_diabetes(return_X_y=True, as_frame=True) 1541 X = X[::5] 1542 y = y[::5] 1543 data = lgb.Dataset(X, label=y) 1544 model = lgb.train({"objective": "regression"}, data, num_boost_round=5) 1545 1546 with mlflow.start_run() as run: 1547 model_info = mlflow.lightgbm.log_model(model, name="model") 1548 loaded_model = mlflow.pyfunc.load_model(model_info.model_uri) 1549 mlflow.evaluate( 1550 loaded_model, 1551 X.assign(y=y), 1552 targets="y", 1553 model_type="regressor", 1554 ) 1555 1556 run = mlflow.get_run(run.info.run_id) 1557 assert "mean_absolute_error" in run.data.metrics 1558 assert "mean_squared_error" in run.data.metrics 1559 assert "root_mean_squared_error" in run.data.metrics 1560 1561 1562 def test_evaluate_with_static_dataset_input_single_output(): 1563 X, y = sklearn.datasets.load_diabetes(return_X_y=True, as_frame=True) 1564 X = X[::5] 1565 y = y[::5] 1566 with mlflow.start_run() as run: 1567 mlflow.evaluate( 1568 data=X.assign(y=y, model_output=y), 1569 targets="y", 1570 predictions="model_output", 1571 model_type="regressor", 1572 ) 1573 1574 run = mlflow.get_run(run.info.run_id) 1575 assert "mean_absolute_error" in run.data.metrics 1576 assert "mean_squared_error" in run.data.metrics 1577 assert "root_mean_squared_error" in run.data.metrics 1578 1579 1580 def test_evaluate_with_static_mlflow_dataset_input(): 1581 X, y = sklearn.datasets.load_diabetes(return_X_y=True, as_frame=True) 1582 X = X[::5] 1583 y = y[::5] 1584 data = mlflow.data.from_pandas( 1585 df=X.assign(y=y, model_output=y), targets="y", predictions="model_output" 1586 ) 1587 with mlflow.start_run() as run: 1588 mlflow.evaluate( 1589 data=data, 1590 model_type="regressor", 1591 ) 1592 1593 run = mlflow.get_run(run.info.run_id) 1594 assert "mean_absolute_error" in run.data.metrics 1595 assert "mean_squared_error" in run.data.metrics 1596 assert "root_mean_squared_error" in run.data.metrics 1597 1598 1599 def test_evaluate_with_static_dataset_error_handling_pandas_dataframe(): 1600 X, y = sklearn.datasets.load_diabetes(return_X_y=True, as_frame=True) 1601 X = X[::5] 1602 y = y[::5] 1603 with mlflow.start_run(): 1604 with pytest.raises(MlflowException, match="The data argument cannot be None."): 1605 mlflow.evaluate( 1606 data=None, 1607 targets="y", 1608 model_type="regressor", 1609 ) 1610 1611 with pytest.raises( 1612 MlflowException, 1613 match="The specified pandas DataFrame does not contain the specified predictions" 1614 " column 'prediction'.", 1615 ): 1616 mlflow.evaluate( 1617 data=X.assign(y=y, model_output=y), 1618 targets="y", 1619 predictions="prediction", 1620 model_type="regressor", 1621 ) 1622 1623 1624 def test_evaluate_with_static_dataset_error_handling_pandas_dataset(): 1625 X, y = sklearn.datasets.load_diabetes(return_X_y=True, as_frame=True) 1626 X = X[::5] 1627 y = y[::5] 1628 dataset_with_predictions = mlflow.data.from_pandas( 1629 df=X.assign(y=y, model_output=y), targets="y", predictions="model_output" 1630 ) 1631 dataset_no_predictions = mlflow.data.from_pandas(df=X.assign(y=y, model_output=y), targets="y") 1632 ERROR_MESSAGE = ( 1633 "The top-level predictions parameter should not be specified since a Dataset is " 1634 "used. Please only specify the predictions column name in the Dataset. For example: " 1635 "`data = mlflow.data.from_pandas(df=X.assign(y=y), predictions='y')`" 1636 "Meanwhile, please specify `mlflow.evaluate(..., predictions=None, ...)`." 1637 ) 1638 with mlflow.start_run(): 1639 with pytest.raises(MlflowException, match=re.escape(ERROR_MESSAGE)): 1640 mlflow.evaluate( 1641 data=dataset_with_predictions, 1642 model_type="regressor", 1643 predictions="model_output", 1644 ) 1645 1646 with pytest.raises(MlflowException, match=re.escape(ERROR_MESSAGE)): 1647 mlflow.evaluate( 1648 data=dataset_no_predictions, 1649 model_type="regressor", 1650 predictions="model_output", 1651 ) 1652 1653 1654 def test_binary_classification_missing_minority_class_exception_override( 1655 binary_logistic_regressor_model_uri, breast_cancer_dataset, monkeypatch 1656 ): 1657 monkeypatch.setenv("_MLFLOW_EVALUATE_SUPPRESS_CLASSIFICATION_ERRORS", "True") 1658 1659 ds_targets = breast_cancer_dataset._constructor_args["targets"] 1660 # Simulate a missing target label 1661 ds_targets = np.where(ds_targets == 0, 1, ds_targets) 1662 1663 with mlflow.start_run() as run: 1664 eval_result = evaluate( 1665 binary_logistic_regressor_model_uri, 1666 breast_cancer_dataset._constructor_args["data"], 1667 model_type="classifier", 1668 targets=ds_targets, 1669 evaluators=["default"], 1670 ) 1671 _, saved_metrics, _, _ = get_run_data(run.info.run_id) 1672 1673 for key, saved_val in saved_metrics.items(): 1674 eval_val = eval_result.metrics[key] 1675 # some nan fields are due to the class imbalance. 1676 # for example, the roc_auc_score metric will return 1677 # nan since we override all classes to `1` here 1678 if np.isnan(saved_val): 1679 assert np.isnan(eval_val) 1680 else: 1681 assert eval_val == saved_val 1682 1683 1684 def test_multiclass_classification_missing_minority_class_exception_override( 1685 multiclass_logistic_regressor_model_uri, iris_dataset, monkeypatch 1686 ): 1687 monkeypatch.setenv("_MLFLOW_EVALUATE_SUPPRESS_CLASSIFICATION_ERRORS", "True") 1688 1689 ds_targets = iris_dataset._constructor_args["targets"] 1690 # Simulate a missing target label 1691 ds_targets = np.where(ds_targets == 0, 1, ds_targets) 1692 1693 with mlflow.start_run() as run: 1694 eval_result = evaluate( 1695 multiclass_logistic_regressor_model_uri, 1696 iris_dataset._constructor_args["data"], 1697 model_type="classifier", 1698 targets=ds_targets, 1699 evaluators=["default"], 1700 ) 1701 _, saved_metrics, _, saved_artifacts = get_run_data(run.info.run_id) 1702 1703 assert saved_metrics == eval_result.metrics 1704 assert "shap_beeswarm_plot.png" not in saved_artifacts 1705 1706 1707 @pytest.mark.parametrize( 1708 ("model", "is_endpoint_uri"), 1709 [ 1710 ("endpoints:/test", True), 1711 ("endpoints:///my-chat", True), 1712 ("models:/test", False), 1713 (None, False), 1714 ], 1715 ) 1716 def test_is_model_deployment_endpoint_uri(model, is_endpoint_uri): 1717 assert _is_model_deployment_endpoint_uri(model) == is_endpoint_uri 1718 1719 1720 _DUMMY_CHAT_RESPONSE = { 1721 "id": "1", 1722 "object": "text_completion", 1723 "created": "2021-10-01T00:00:00.000000Z", 1724 "model": "gpt-4o-mini", 1725 "choices": [ 1726 { 1727 "index": 0, 1728 "message": { 1729 "content": "This is a response", 1730 "role": "assistant", 1731 }, 1732 "finish_reason": "length", 1733 } 1734 ], 1735 "usage": { 1736 "prompt_tokens": 1, 1737 "completion_tokens": 1, 1738 "total_tokens": 2, 1739 }, 1740 } 1741 1742 _TEST_QUERY_LIST = ["What is MLflow?", "What is Spark?"] 1743 _TEST_GT_LIST = [ 1744 "MLflow is an open-source platform for machine learning (ML).", 1745 "Apache Spark is an open-source, distributed computing system.", 1746 ] 1747 1748 1749 @pytest.mark.parametrize( 1750 ("input_data", "feature_names", "targets"), 1751 [ 1752 # String input column 1753 ( 1754 pd.DataFrame({"inputs": _TEST_QUERY_LIST, "ground_truth": _TEST_GT_LIST}), 1755 None, 1756 "ground_truth", 1757 ), 1758 # String input column with feature_names 1759 ( 1760 pd.DataFrame({"question": _TEST_QUERY_LIST, "ground_truth": _TEST_GT_LIST}), 1761 ["question"], 1762 "ground_truth", 1763 ), 1764 # Dictionary input column that contains message history 1765 ( 1766 pd.DataFrame({ 1767 "inputs": [ 1768 { 1769 "messages": [{"content": q, "role": "user"}], 1770 "max_tokens": 10, 1771 } 1772 for q in _TEST_QUERY_LIST 1773 ], 1774 "ground_truth": _TEST_GT_LIST, 1775 }), 1776 None, 1777 "ground_truth", 1778 ), 1779 # List of string 1780 ( 1781 _TEST_QUERY_LIST, 1782 None, 1783 _TEST_GT_LIST, 1784 ), 1785 # List of string with feature_names 1786 ( 1787 _TEST_QUERY_LIST, 1788 ["question"], 1789 _TEST_GT_LIST, 1790 ), 1791 # List of string with feature_names and w/o targets 1792 ( 1793 _TEST_QUERY_LIST, 1794 ["question"], 1795 None, 1796 ), 1797 # List of dictionary with feature_names 1798 ( 1799 [ 1800 { 1801 "messages": [{"content": q, "role": "user"}], 1802 "max_tokens": 10, 1803 } 1804 for q in _TEST_QUERY_LIST 1805 ], 1806 None, 1807 _TEST_GT_LIST, 1808 ), 1809 ], 1810 ) 1811 def test_evaluate_on_chat_model_endpoint(input_data, feature_names, targets): 1812 with mock.patch("mlflow.deployments.get_deploy_client") as mock_deploy_client: 1813 mock_deploy_client.return_value.predict.return_value = _DUMMY_CHAT_RESPONSE 1814 mock_deploy_client.return_value.get_endpoint.return_value = {"task": "llm/v1/chat"} 1815 1816 with mlflow.start_run(): 1817 eval_result = mlflow.evaluate( 1818 model="endpoints:/chat", 1819 data=input_data, 1820 model_type="question-answering", 1821 feature_names=feature_names, 1822 targets=targets, 1823 inference_params={"max_tokens": 10, "temperature": 0.5}, 1824 ) 1825 1826 # Validate the endpoint is called with correct payloads 1827 call_args_list = mock_deploy_client.return_value.predict.call_args_list 1828 expected_calls = [ 1829 mock.call( 1830 endpoint="chat", 1831 inputs={ 1832 "messages": [{"content": "What is MLflow?", "role": "user"}], 1833 "max_tokens": 10, 1834 "temperature": 0.5, 1835 }, 1836 ), 1837 mock.call( 1838 endpoint="chat", 1839 inputs={ 1840 "messages": [{"content": "What is Spark?", "role": "user"}], 1841 "max_tokens": 10, 1842 "temperature": 0.5, 1843 }, 1844 ), 1845 ] 1846 assert call_args_list == expected_calls 1847 1848 # Validate the evaluation metrics 1849 expected_metrics_subset = {"toxicity/v1/ratio", "ari_grade_level/v1/mean"} 1850 if targets: 1851 expected_metrics_subset.add("exact_match/v1") 1852 assert expected_metrics_subset.issubset(set(eval_result.metrics.keys())) 1853 1854 # Validate the model output is passed to the evaluator in the correct format (string) 1855 eval_results_table = eval_result.tables["eval_results_table"] 1856 assert eval_results_table["outputs"].equals(pd.Series(["This is a response"] * 2)) 1857 1858 1859 _DUMMY_COMPLETION_RESPONSE = { 1860 "id": "1", 1861 "object": "text_completion", 1862 "created": "2021-10-01T00:00:00.000000Z", 1863 "model": "gpt-4o-mini", 1864 "choices": [{"index": 0, "text": "This is a response", "finish_reason": "length"}], 1865 "usage": { 1866 "prompt_tokens": 1, 1867 "completion_tokens": 1, 1868 "total_tokens": 2, 1869 }, 1870 } 1871 1872 1873 @pytest.mark.parametrize( 1874 ("input_data", "feature_names"), 1875 [ 1876 (pd.DataFrame({"inputs": _TEST_QUERY_LIST}), None), 1877 (pd.DataFrame({"question": _TEST_QUERY_LIST}), ["question"]), 1878 (pd.DataFrame({"inputs": [{"prompt": q} for q in _TEST_QUERY_LIST]}), None), 1879 (_TEST_QUERY_LIST, None), 1880 ([{"prompt": q} for q in _TEST_QUERY_LIST], None), 1881 ], 1882 ) 1883 def test_evaluate_on_completion_model_endpoint(input_data, feature_names): 1884 with mock.patch("mlflow.deployments.get_deploy_client") as mock_deploy_client: 1885 mock_deploy_client.return_value.predict.return_value = _DUMMY_COMPLETION_RESPONSE 1886 mock_deploy_client.return_value.get_endpoint.return_value = {"task": "llm/v1/completions"} 1887 1888 with mlflow.start_run(): 1889 eval_result = mlflow.evaluate( 1890 model="endpoints:/completions", 1891 data=input_data, 1892 inference_params={"max_tokens": 10}, 1893 model_type="text", 1894 feature_names=feature_names, 1895 ) 1896 1897 # Validate the endpoint is called with correct payloads 1898 call_args_list = mock_deploy_client.return_value.predict.call_args_list 1899 expected_calls = [ 1900 mock.call(endpoint="completions", inputs={"prompt": "What is MLflow?", "max_tokens": 10}), 1901 mock.call(endpoint="completions", inputs={"prompt": "What is Spark?", "max_tokens": 10}), 1902 ] 1903 assert call_args_list == expected_calls 1904 1905 # Validate the evaluation metrics 1906 expected_metrics_subset = { 1907 "toxicity/v1/ratio", 1908 "ari_grade_level/v1/mean", 1909 "flesch_kincaid_grade_level/v1/mean", 1910 } 1911 assert expected_metrics_subset.issubset(set(eval_result.metrics.keys())) 1912 1913 # Validate the model output is passed to the evaluator in the correct format (string) 1914 eval_results_table = eval_result.tables["eval_results_table"] 1915 assert eval_results_table["outputs"].equals(pd.Series(["This is a response"] * 2)) 1916 1917 1918 def test_evaluate_on_model_endpoint_without_type(): 1919 with mock.patch("mlflow.deployments.get_deploy_client") as mock_deploy_client: 1920 # An endpoint that does not have endpoint type. For such endpoint, we simply 1921 # pass the input data to the endpoint without any modification and return 1922 # the response as is. 1923 mock_deploy_client.return_value.get_endpoint.return_value = {} 1924 mock_deploy_client.return_value.predict.return_value = "This is a response" 1925 1926 input_data = pd.DataFrame({ 1927 "inputs": [ 1928 { 1929 "messages": [{"content": q, "role": "user"}], 1930 "max_tokens": 10, 1931 } 1932 for q in _TEST_QUERY_LIST 1933 ], 1934 "ground_truth": _TEST_GT_LIST, 1935 }) 1936 1937 with mlflow.start_run(): 1938 eval_result = mlflow.evaluate( 1939 model="endpoints:/random", 1940 data=input_data, 1941 model_type="question-answering", 1942 targets="ground_truth", 1943 inference_params={"max_tokens": 10, "temperature": 0.5}, 1944 ) 1945 1946 # Validate the endpoint is called with correct payloads 1947 call_args_list = mock_deploy_client.return_value.predict.call_args_list 1948 expected_calls = [ 1949 mock.call( 1950 endpoint="random", 1951 inputs={ 1952 "messages": [{"content": "What is MLflow?", "role": "user"}], 1953 "max_tokens": 10, 1954 "temperature": 0.5, 1955 }, 1956 ), 1957 mock.call( 1958 endpoint="random", 1959 inputs={ 1960 "messages": [{"content": "What is Spark?", "role": "user"}], 1961 "max_tokens": 10, 1962 "temperature": 0.5, 1963 }, 1964 ), 1965 ] 1966 assert call_args_list == expected_calls 1967 1968 # Validate the evaluation metrics 1969 expected_metrics_subset = {"toxicity/v1/ratio", "ari_grade_level/v1/mean", "exact_match/v1"} 1970 assert expected_metrics_subset.issubset(set(eval_result.metrics.keys())) 1971 1972 # Validate the model output is passed to the evaluator in the correct format (string) 1973 eval_results_table = eval_result.tables["eval_results_table"] 1974 assert eval_results_table["outputs"].equals(pd.Series(["This is a response"] * 2)) 1975 1976 1977 def test_evaluate_on_model_endpoint_invalid_payload(): 1978 with mock.patch("mlflow.deployments.get_deploy_client") as mock_deploy_client: 1979 # An endpoint that does not have endpoint type. For such endpoint, we simply 1980 # pass the input data to the endpoint without any modification and return 1981 # the response as is. 1982 mock_deploy_client.return_value.get_endpoint.return_value = {} 1983 mock_deploy_client.return_value.predict.side_effect = ValueError("Invalid payload") 1984 1985 input_data = pd.DataFrame({ 1986 "inputs": [{"invalid": "payload"}], 1987 }) 1988 1989 with pytest.raises(MlflowException, match="Failed to call the deployment endpoint"): 1990 mlflow.evaluate( 1991 model="endpoints:/random", 1992 data=input_data, 1993 model_type="question-answering", 1994 inference_params={"max_tokens": 10, "temperature": 0.5}, 1995 ) 1996 1997 1998 @pytest.mark.parametrize( 1999 ("input_data", "error_message"), 2000 [ 2001 # Extra input columns 2002 ( 2003 pd.DataFrame({ 2004 "inputs": _TEST_QUERY_LIST, 2005 "extra_input": ["a", "b"], 2006 "ground_truth": _TEST_GT_LIST, 2007 }), 2008 "The number of input columns must be 1", 2009 ), 2010 # Missing input columns 2011 ( 2012 pd.DataFrame({"ground_truth": _TEST_GT_LIST}), 2013 "The number of input columns must be 1", 2014 ), 2015 # Input column not str or dict 2016 ( 2017 pd.DataFrame({"inputs": [1, 2], "ground_truth": _TEST_GT_LIST}), 2018 "Invalid input data type", 2019 ), 2020 ], 2021 ) 2022 def test_evaluate_on_model_endpoint_invalid_input_data(input_data, error_message): 2023 with mock.patch("mlflow.deployments.get_deploy_client") as mock_deploy_client: 2024 mock_deploy_client.return_value.get_endpoint.return_value = {"task": "llm/v1/chat"} 2025 2026 with pytest.raises(MlflowException, match=error_message): 2027 with mlflow.start_run(): 2028 mlflow.evaluate( 2029 model="endpoints:/chat", 2030 data=input_data, 2031 model_type="question-answering", 2032 targets="ground_truth", 2033 inference_params={"max_tokens": 10, "temperature": 0.5}, 2034 ) 2035 2036 2037 @pytest.mark.parametrize( 2038 "model_input", 2039 [ 2040 # Case 1: Single chat dictionary. 2041 # This is an expected input format from the Databricks RAG Evaluator. 2042 { 2043 "messages": [{"content": "What is MLflow?", "role": "user"}], 2044 "max_tokens": 10, 2045 }, 2046 # Case 2: List of chat dictionaries. 2047 # This is not a typical input format from either default or Databricks RAG evaluators, 2048 # but we support it for compatibility with the normal Pyfunc models. 2049 [ 2050 {"messages": [{"content": "What is MLflow?", "role": "user"}]}, 2051 {"messages": [{"content": "What is Spark?", "role": "user"}]}, 2052 ], 2053 # Case 3: DataFrame with a column of dictionaries 2054 pd.DataFrame({ 2055 "inputs": [ 2056 { 2057 "messages": [{"content": "What is MLflow?", "role": "user"}], 2058 "max_tokens": 10, 2059 }, 2060 { 2061 "messages": [{"content": "What is Spark?", "role": "user"}], 2062 }, 2063 ] 2064 }), 2065 # Case 4: DataFrame with a column of strings 2066 pd.DataFrame({ 2067 "inputs": ["What is MLflow?", "What is Spark?"], 2068 }), 2069 ], 2070 ) 2071 def test_model_from_deployment_endpoint(model_input): 2072 with mock.patch("mlflow.deployments.get_deploy_client") as mock_deploy_client: 2073 mock_deploy_client.return_value.predict.return_value = _DUMMY_CHAT_RESPONSE 2074 mock_deploy_client.return_value.get_endpoint.return_value = {"task": "llm/v1/chat"} 2075 2076 model = _get_model_from_deployment_endpoint_uri("endpoints:/chat") 2077 2078 response = model.predict(model_input) 2079 2080 if isinstance(model_input, dict): 2081 assert mock_deploy_client.return_value.predict.call_count == 1 2082 # Chat response should be unwrapped 2083 assert response == "This is a response" 2084 else: 2085 assert mock_deploy_client.return_value.predict.call_count == 2 2086 assert pd.Series(response).equals(pd.Series(["This is a response"] * 2)) 2087 2088 2089 def test_import_evaluation_dataset(): 2090 # This test is to validate both imports work at the same time 2091 from mlflow.models.evaluation import EvaluationDataset 2092 from mlflow.models.evaluation.base import EvaluationDataset # noqa: F401 2093 2094 2095 def test_evaluate_shows_server_stdout_and_stderr_on_error( 2096 linear_regressor_model_uri, diabetes_dataset 2097 ): 2098 with mlflow.start_run(): 2099 server_proc = subprocess.Popen( 2100 ["echo", "test1324"], stdout=subprocess.PIPE, stderr=subprocess.STDOUT 2101 ) 2102 with mock.patch( 2103 "mlflow.pyfunc.backend.PyFuncBackend.serve", 2104 return_value=server_proc, 2105 ) as mock_serve: 2106 with pytest.raises(MlflowException, match="test1324"): 2107 evaluate( 2108 linear_regressor_model_uri, 2109 diabetes_dataset._constructor_args["data"], 2110 model_type="regressor", 2111 targets=diabetes_dataset._constructor_args["targets"], 2112 evaluators="dummy_evaluator", 2113 env_manager="virtualenv", 2114 ) 2115 mock_serve.assert_called_once() 2116 2117 2118 def test_env_manager_set_on_served_pyfunc_model(multiclass_logistic_regressor_model_uri): 2119 model = mlflow.pyfunc.load_model(multiclass_logistic_regressor_model_uri) 2120 client = ScoringServerClient("127.0.0.1", "8080") 2121 served_model_1 = _ServedPyFuncModel(model_meta=model.metadata, client=client, server_pid=1) 2122 served_model_1.env_manager = "virtualenv" 2123 assert served_model_1.env_manager == "virtualenv" 2124 2125 2126 def test_metrics_logged_to_model_on_evaluation( 2127 multiclass_logistic_regressor_model_uri, iris_dataset 2128 ): 2129 with mlflow.start_run(): 2130 # Log the model and retrieve its model_id 2131 model_info = mlflow.sklearn.log_model( 2132 mlflow.sklearn.load_model(multiclass_logistic_regressor_model_uri), name="model" 2133 ) 2134 model_id = model_info.model_id 2135 2136 # Evaluate the model using its model_id 2137 eval_result = mlflow.evaluate( 2138 model=model_info.model_uri, 2139 data=iris_dataset._constructor_args["data"], 2140 model_type="classifier", 2141 targets=iris_dataset._constructor_args["targets"], 2142 evaluators=["default"], 2143 ) 2144 2145 # Retrieve metrics logged to the model 2146 logged_model_metrics = mlflow.get_logged_model(model_id).metrics 2147 2148 # Ensure metrics are logged to the model 2149 assert eval_result.metrics == {metric.key: metric.value for metric in logged_model_metrics} 2150 2151 # Validate that all metrics have the correct model_id in their metadata 2152 assert all(metric.model_id == model_id for metric in logged_model_metrics) 2153 2154 2155 def test_evaluate_with_model_id(iris_dataset): 2156 # Create and log a model 2157 with mlflow.start_run(): 2158 model = sklearn.linear_model.LogisticRegression() 2159 model.fit(iris_dataset._constructor_args["data"], iris_dataset._constructor_args["targets"]) 2160 model_info = mlflow.sklearn.log_model(model, name="model") 2161 model_id = model_info.model_id 2162 2163 # Evaluate the model with the specified model ID 2164 with mlflow.start_run(): 2165 result = evaluate( 2166 model_info.model_uri, 2167 iris_dataset._constructor_args["data"], 2168 model_type="classifier", 2169 targets=iris_dataset._constructor_args["targets"], 2170 model_id=model_id, 2171 ) 2172 2173 # Verify metrics were logged 2174 assert result.metrics is not None 2175 assert len(result.metrics) > 0 2176 2177 # Verify metrics are linked to the model ID 2178 logged_model = mlflow.get_logged_model(model_id) 2179 assert logged_model is not None 2180 assert logged_model.model_id == model_id 2181 2182 # Convert metrics list to a dictionary for easier lookup 2183 logged_metrics = {metric.key: metric.value for metric in logged_model.metrics} 2184 2185 # Verify each metric from the evaluation result matches the logged model metrics 2186 for metric_name, metric_value in result.metrics.items(): 2187 assert metric_name in logged_metrics, ( 2188 f"Metric {metric_name} not found in logged model metrics" 2189 ) 2190 assert logged_metrics[metric_name] == metric_value, ( 2191 f"Metric {metric_name} value mismatch: " 2192 f"expected {metric_value}, got {logged_metrics[metric_name]}" 2193 ) 2194 2195 2196 def test_evaluate_model_id_consistency_check(multiclass_logistic_regressor_model_uri, iris_dataset): 2197 """ 2198 Test that an error is thrown when the specified model_id contradicts the model's associated ID. 2199 """ 2200 # Create a model with a known model ID 2201 with mlflow.start_run(): 2202 model = sklearn.linear_model.LogisticRegression() 2203 model.fit(iris_dataset._constructor_args["data"], iris_dataset._constructor_args["targets"]) 2204 model_info = mlflow.sklearn.log_model( 2205 model, 2206 name="model", 2207 ) 2208 model_uri = model_info.model_uri 2209 model_id = model_info.model_uuid 2210 2211 # Test that specifying matching model_id works 2212 evaluate( 2213 model_uri, 2214 iris_dataset._constructor_args["data"], 2215 targets=iris_dataset._constructor_args["targets"], 2216 model_type="classifier", 2217 model_id=model_id, 2218 ) 2219 2220 # Test that specifying different model_id raises 2221 with pytest.raises( 2222 MlflowException, 2223 match=( 2224 r"The specified value of the 'model_id' parameter '.*' " 2225 r"contradicts the model_id '.*' associated with the model\. Please ensure " 2226 r"they match or omit the 'model_id' parameter\." 2227 ), 2228 ): 2229 evaluate( 2230 model_uri, 2231 iris_dataset._constructor_args["data"], 2232 targets=iris_dataset._constructor_args["targets"], 2233 model_type="classifier", 2234 model_id="different_model_id", 2235 ) 2236 2237 # Test that not specifying model_id works 2238 evaluate( 2239 model_uri, 2240 iris_dataset._constructor_args["data"], 2241 targets=iris_dataset._constructor_args["targets"], 2242 model_type="classifier", 2243 ) 2244 2245 2246 def test_evaluate_log_metrics_to_active_model(iris_dataset): 2247 # Set active model 2248 mlflow.set_active_model(name="my-model") 2249 active_model_id = mlflow.get_active_model_id() 2250 2251 model = sklearn.linear_model.LogisticRegression() 2252 model.fit(iris_dataset._constructor_args["data"], iris_dataset._constructor_args["targets"]) 2253 eval_df = pd.DataFrame({ 2254 "inputs": iris_dataset._constructor_args["data"].tolist(), 2255 "targets": iris_dataset._constructor_args["targets"], 2256 "predictions": model.predict(iris_dataset._constructor_args["data"]), 2257 }) 2258 2259 eval_dataset = mlflow.data.from_pandas( 2260 df=eval_df, 2261 name="eval_dataset", 2262 targets="targets", 2263 predictions="predictions", 2264 ) 2265 2266 # Evaluate the model without model_id, active model_id should be used 2267 with mlflow.start_run(): 2268 result = evaluate( 2269 data=eval_dataset, 2270 model_type="classifier", 2271 ) 2272 2273 # Verify metrics were logged 2274 assert result.metrics is not None 2275 assert len(result.metrics) > 0 2276 2277 # Verify metrics are linked to the active model ID 2278 logged_model = mlflow.get_logged_model(active_model_id) 2279 assert logged_model is not None 2280 assert logged_model.model_id == active_model_id 2281 2282 # Convert metrics list to a dictionary for easier lookup 2283 logged_metrics = {metric.key: metric.value for metric in logged_model.metrics} 2284 2285 # Verify each metric from the evaluation result matches the logged model metrics 2286 assert logged_metrics.items() <= result.metrics.items() 2287 2288 2289 def test_mlflow_evaluate_logs_traces_to_active_model(): 2290 eval_data = pd.DataFrame({ 2291 "inputs": [ 2292 "What is MLflow?", 2293 "What is Spark?", 2294 ], 2295 "ground_truth": ["What is MLflow?", "Not what is Spark?"], 2296 }) 2297 2298 @mlflow.trace 2299 def model(inputs): 2300 return inputs 2301 2302 # no model_id used when no active model is set or passed 2303 evaluate(model, eval_data, targets="ground_truth", extra_metrics=[mlflow.metrics.exact_match()]) 2304 traces = get_traces() 2305 assert len(traces) == 1 2306 assert TraceMetadataKey.MODEL_ID not in traces[0].info.request_metadata 2307 2308 # no active model set and pass model_id explicitly 2309 assert mlflow.get_active_model_id() is None 2310 model_id = mlflow.create_external_model(name="my-model").model_id 2311 evaluate( 2312 model, 2313 eval_data, 2314 targets="ground_truth", 2315 extra_metrics=[mlflow.metrics.exact_match()], 2316 model_id=model_id, 2317 ) 2318 traces = get_traces() 2319 assert len(traces) == 2 2320 assert traces[0].info.request_metadata[TraceMetadataKey.MODEL_ID] == model_id 2321 2322 # set active model 2323 with mlflow.set_active_model(name="my-model") as active_model: 2324 model_id = active_model.model_id 2325 evaluate( 2326 model, eval_data, targets="ground_truth", extra_metrics=[mlflow.metrics.exact_match()] 2327 ) 2328 traces = get_traces() 2329 assert len(traces) == 3 2330 assert traces[0].info.request_metadata[TraceMetadataKey.MODEL_ID] == model_id 2331 2332 # pass model_id explicitly takes precedence over active model 2333 assert mlflow.get_active_model_id() is not None 2334 another_model_id = mlflow.create_external_model(name="another-model").model_id 2335 evaluate( 2336 model, 2337 eval_data, 2338 targets="ground_truth", 2339 extra_metrics=[mlflow.metrics.exact_match()], 2340 model_id=another_model_id, 2341 ) 2342 traces = get_traces() 2343 assert len(traces) == 4 2344 assert traces[0].info.request_metadata[TraceMetadataKey.MODEL_ID] == another_model_id 2345 2346 # model_id of the passed model takes precedence over active model 2347 assert mlflow.get_active_model_id() is not None 2348 model_info = mlflow.pyfunc.log_model( 2349 name="model", 2350 python_model=model, 2351 input_example="What is MLflow?", 2352 ) 2353 evaluate( 2354 model_info.model_uri, 2355 eval_data, 2356 targets="ground_truth", 2357 extra_metrics=[mlflow.metrics.exact_match()], 2358 ) 2359 traces = get_traces() 2360 assert len(traces) == 5 2361 assert traces[0].info.request_metadata[TraceMetadataKey.MODEL_ID] == model_info.model_id 2362 # TODO: test registered ModelVersion's model_id works after it's supported 2363 2364 2365 def test_delete_run_deletes_assessments_with_source_run_id(): 2366 @mlflow.trace 2367 def model(inputs): 2368 return inputs 2369 2370 eval_data = pd.DataFrame({ 2371 "inputs": ["What is MLflow?"], 2372 "ground_truth": ["MLflow is an ML platform."], 2373 }) 2374 2375 with mlflow.start_run() as run: 2376 evaluate( 2377 model, eval_data, targets="ground_truth", extra_metrics=[mlflow.metrics.exact_match()] 2378 ) 2379 2380 traces = get_traces() 2381 assert len(traces) == 1 2382 trace_id = traces[0].info.trace_id 2383 2384 # Log a feedback assessment linked to the run via sourceRunId metadata 2385 linked_feedback = mlflow.log_feedback( 2386 trace_id=trace_id, 2387 name="eval_feedback", 2388 value="good", 2389 metadata={AssessmentMetadataKey.SOURCE_RUN_ID: run.info.run_id}, 2390 ) 2391 2392 # Log another feedback assessment NOT linked to any run 2393 unlinked_feedback = mlflow.log_feedback( 2394 trace_id=trace_id, 2395 name="unlinked_feedback", 2396 value="also good", 2397 ) 2398 2399 # Verify both assessments exist 2400 trace = mlflow.get_trace(trace_id) 2401 assert len(trace.info.assessments) >= 2 2402 assessment_ids = {a.assessment_id for a in trace.info.assessments} 2403 assert linked_feedback.assessment_id in assessment_ids 2404 assert unlinked_feedback.assessment_id in assessment_ids 2405 2406 # Delete the run 2407 MlflowClient().delete_run(run.info.run_id) 2408 2409 # Verify the linked assessment was deleted but the unlinked one survives 2410 trace = mlflow.get_trace(trace_id) 2411 remaining_ids = {a.assessment_id for a in trace.info.assessments} 2412 assert linked_feedback.assessment_id not in remaining_ids 2413 assert unlinked_feedback.assessment_id in remaining_ids