tests/evaluate/test_evaluation.py
import hashlib
import io
import json
import os
import re
import signal
import subprocess
import uuid
from typing import Any, NamedTuple
from unittest import mock

import numpy as np
import pandas as pd
import pytest
import sklearn
import sklearn.compose
import sklearn.datasets
import sklearn.impute
import sklearn.linear_model
import sklearn.pipeline
import sklearn.preprocessing
import sklearn.svm
from mlflow_test_plugin.dummy_evaluator import Array2DEvaluationArtifact, DummyEvaluator
from PIL import Image, ImageChops
from pyspark.ml.linalg import Vectors
from pyspark.ml.regression import LinearRegression as SparkLinearRegression
from pyspark.sql import SparkSession
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    mean_absolute_error,
    mean_squared_error,
)

import mlflow
from mlflow import MlflowClient
from mlflow.data.evaluation_dataset import EvaluationDataset, _gen_md5_for_arraylike_obj
from mlflow.data.pandas_dataset import from_pandas
from mlflow.entities import Trace, TraceData
from mlflow.exceptions import MlflowException
from mlflow.models.evaluation import (
    EvaluationArtifact,
    EvaluationResult,
    ModelEvaluator,
    evaluate,
)
from mlflow.models.evaluation.artifacts import ImageEvaluationArtifact
from mlflow.models.evaluation.base import (
    _get_model_from_deployment_endpoint_uri,
    _is_model_deployment_endpoint_uri,
    _start_run_or_reuse_active_run,
    resolve_evaluators_and_configs,
)
from mlflow.models.evaluation.evaluator_registry import _model_evaluation_registry
from mlflow.pyfunc import _ServedPyFuncModel
from mlflow.pyfunc.scoring_server.client import ScoringServerClient
from mlflow.tracing.constant import AssessmentMetadataKey, TraceMetadataKey
from mlflow.tracking.artifact_utils import get_artifact_uri
from mlflow.utils.file_utils import TempDir

from tests.tracing.helper import create_test_trace_info, get_traces
from tests.utils.test_file_utils import spark_session  # noqa: F401

INFERENCE_FILE_NAME = "inference_inputs_outputs.json"


def get_iris():
    iris = sklearn.datasets.load_iris()
    return iris.data, iris.target


def get_diabetes_dataset():
    data = sklearn.datasets.load_diabetes()
    return data.data, data.target


def get_diabetes_spark_dataset():
    data = sklearn.datasets.load_diabetes()
    spark = SparkSession.builder.master("local[*]").getOrCreate()
    rows = [
        (Vectors.dense(features), float(label)) for features, label in zip(data.data, data.target)
    ]
    return spark.createDataFrame(spark.sparkContext.parallelize(rows, 1), ["features", "label"])


def get_breast_cancer_dataset():
    data = sklearn.datasets.load_breast_cancer()
    return data.data, data.target


class RunData(NamedTuple):
    params: dict[str, Any]
    metrics: dict[str, Any]
    tags: dict[str, Any]
    artifacts: list[str]


def get_run_data(run_id):
    client = MlflowClient()
    data = client.get_run(run_id).data
    artifacts = [f.path for f in client.list_artifacts(run_id)]
    return RunData(params=data.params, metrics=data.metrics, tags=data.tags, artifacts=artifacts)


def get_run_datasets(run_id):
    client = MlflowClient()
    return client.get_run(run_id).inputs.dataset_inputs


def get_raw_tag(run_id, tag_name):
    client = MlflowClient()
    data = client.get_run(run_id).data
    return data.tags[tag_name]


def get_local_artifact_path(run_id, artifact_path):
    return get_artifact_uri(run_id, artifact_path).replace("file://", "")


@pytest.fixture(scope="module")
def iris_dataset():
    X, y = get_iris()
    eval_X = X[0::3]
    eval_y = y[0::3]
    constructor_args = {"data": eval_X, "targets": eval_y, "name": "dataset"}
    ds = EvaluationDataset(**constructor_args)
    ds._constructor_args = constructor_args
    return ds


@pytest.fixture(scope="module")
def diabetes_dataset():
    X, y = get_diabetes_dataset()
    eval_X = X[0::3]
    eval_y = y[0::3]
    constructor_args = {"data": eval_X, "targets": eval_y}
    ds = EvaluationDataset(**constructor_args)
    ds._constructor_args = constructor_args
    return ds


@pytest.fixture(scope="module")
def diabetes_spark_dataset():
    spark_df = get_diabetes_spark_dataset().sample(fraction=0.3, seed=1)
    constructor_args = {"data": spark_df, "targets": "label"}
    ds = EvaluationDataset(**constructor_args)
    ds._constructor_args = constructor_args
    return ds


@pytest.fixture(scope="module")
def breast_cancer_dataset():
    X, y = get_breast_cancer_dataset()
    eval_X = X[0::3]
    eval_y = y[0::3]
    constructor_args = {"data": eval_X, "targets": eval_y}
    ds = EvaluationDataset(**constructor_args)
    ds._constructor_args = constructor_args
    return ds


def get_pipeline_model_dataset():
    """
    Tweak the Iris dataset by converting its first two features into categorical
    features and replacing some feature values with missing values.
    The dataset is prepared for a pipeline model; see `pipeline_model_uri`.
    """
    X, y = get_iris()

    def convert_num_to_label(x):
        return f"v_{round(x)}"

    f1 = np.array(list(map(convert_num_to_label, X[:, 0])))
    f2 = np.array(list(map(convert_num_to_label, X[:, 1])))
    f3 = X[:, 2]
    f4 = X[:, 3]

    f1[0::8] = None
    f2[1::8] = None
    f3[2::8] = np.nan
    f4[3::8] = np.nan

    data = pd.DataFrame({
        "f1": f1,
        "f2": f2,
        "f3": f3,
        "f4": f4,
        "y": y,
    })
    return data, "y"


@pytest.fixture
def pipeline_model_uri():
    return get_pipeline_model_uri()


def get_pipeline_model_uri():
    """
    Create a pipeline model that transforms and trains on the dataset returned by
    `get_pipeline_model_dataset`. The pipeline imputes the missing values in the
    input dataset, encodes the categorical features, and then trains a logistic
    regression model.
    """
    data, target_col = get_pipeline_model_dataset()
    X = data.drop(target_col, axis=1)
    y = data[target_col].to_numpy()

    encoder = sklearn.preprocessing.OrdinalEncoder()
    str_imputer = sklearn.impute.SimpleImputer(missing_values=None, strategy="most_frequent")
    num_imputer = sklearn.impute.SimpleImputer(missing_values=np.nan, strategy="mean")
    preproc_pipeline = sklearn.pipeline.Pipeline([
        ("imputer", str_imputer),
        ("encoder", encoder),
    ])

    pipeline = sklearn.pipeline.Pipeline([
        (
            "transformer",
            sklearn.compose.make_column_transformer(
                (preproc_pipeline, ["f1", "f2"]),
                (num_imputer, ["f3", "f4"]),
            ),
        ),
        ("clf", sklearn.linear_model.LogisticRegression()),
    ])
    pipeline.fit(X, y)

    with mlflow.start_run():
        model_info = mlflow.sklearn.log_model(pipeline, name="pipeline_model")
        return model_info.model_uri


@pytest.fixture
def linear_regressor_model_uri():
    return get_linear_regressor_model_uri()


def get_linear_regressor_model_uri():
    X, y = get_diabetes_dataset()
    reg = sklearn.linear_model.LinearRegression()
    reg.fit(X, y)

    with mlflow.start_run():
        model_info = mlflow.sklearn.log_model(reg, name="reg_model")
        return model_info.model_uri


@pytest.fixture
def spark_linear_regressor_model_uri():
    return get_spark_linear_regressor_model_uri()


def get_spark_linear_regressor_model_uri():
    spark_df = get_diabetes_spark_dataset()
    reg = SparkLinearRegression()
    spark_reg_model = reg.fit(spark_df)

    with mlflow.start_run():
        model_info = mlflow.spark.log_model(spark_reg_model, name="spark_reg_model")
        return model_info.model_uri


@pytest.fixture
def multiclass_logistic_regressor_model_uri():
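    # max_iter=2 under-trains the model on purpose: these tests compare results
    # against the model's own predictions, so convergence does not matter here and
    # the tiny iteration count keeps the fixture fast.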
    return multiclass_logistic_regressor_model_uri_by_max_iter(2)


def multiclass_logistic_regressor_model_uri_by_max_iter(max_iter):
    X, y = get_iris()
    clf = sklearn.linear_model.LogisticRegression(max_iter=max_iter)
    clf.fit(X, y)

    with mlflow.start_run():
        model_info = mlflow.sklearn.log_model(clf, name=f"clf_model_{max_iter}_iters")
        return model_info.model_uri


@pytest.fixture
def binary_logistic_regressor_model_uri():
    return get_binary_logistic_regressor_model_uri()


def get_binary_logistic_regressor_model_uri():
    X, y = get_breast_cancer_dataset()
    clf = sklearn.linear_model.LogisticRegression()
    clf.fit(X, y)

    with mlflow.start_run():
        model_info = mlflow.sklearn.log_model(clf, name="bin_clf_model")
        return model_info.model_uri


@pytest.fixture
def svm_model_uri():
    return get_svm_model_uri()


def get_svm_model_uri():
    X, y = get_breast_cancer_dataset()
    clf = sklearn.svm.LinearSVC()
    clf.fit(X, y)

    with mlflow.start_run():
        model_info = mlflow.sklearn.log_model(clf, name="svm_model")
        return model_info.model_uri


@pytest.fixture
def iris_pandas_df_dataset():
    X, y = get_iris()
    eval_X = X[0::3]
    eval_y = y[0::3]
    data = pd.DataFrame({
        "f1": eval_X[:, 0],
        "f2": eval_X[:, 1],
        "f3": eval_X[:, 2],
        "f4": eval_X[:, 3],
        "y": eval_y,
    })
    constructor_args = {"data": data, "targets": "y"}
    ds = EvaluationDataset(**constructor_args)
    ds._constructor_args = constructor_args
    return ds


@pytest.fixture
def iris_pandas_df_num_cols_dataset():
    X, y = get_iris()
    eval_X = X[0::3]
    eval_y = y[0::3]
    data = pd.DataFrame(eval_X)
    data["y"] = eval_y
    constructor_args = {"data": data, "targets": "y"}
    ds = EvaluationDataset(**constructor_args)
    ds._constructor_args = constructor_args
    return ds


def test_mlflow_evaluate_logs_traces():
    eval_data = pd.DataFrame({
        "inputs": [
            "What is MLflow?",
            "What is Spark?",
        ],
        "ground_truth": ["What is MLflow?", "Not what is Spark?"],
    })

    @mlflow.trace
    def model(inputs):
        return inputs

    with mlflow.start_run() as run:
        evaluate(
            model, eval_data, targets="ground_truth", extra_metrics=[mlflow.metrics.exact_match()]
        )
    assert len(get_traces()) == 1
    assert run.info.run_id == get_traces()[0].info.request_metadata[TraceMetadataKey.SOURCE_RUN]


def test_pyfunc_evaluate_logs_traces():
    class Model(mlflow.pyfunc.PythonModel):
        @mlflow.trace()
        def predict(self, context, model_input):
            return self.add(model_input, model_input)

        @mlflow.trace()
        def add(self, x, y):
            return x + y

    eval_data = pd.DataFrame({
        "inputs": [1, 2, 4],
        "ground_truth": [2, 4, 8],
    })

    with mlflow.start_run() as run:
        model_info = mlflow.pyfunc.log_model(name="model", python_model=Model())
        evaluate(
            model_info.model_uri,
            eval_data,
            targets="ground_truth",
            extra_metrics=[mlflow.metrics.exact_match()],
        )
    traces = get_traces()
    assert len(traces) == 1
    assert len(traces[0].data.spans) == 2
    assert run.info.run_id == traces[0].info.request_metadata[TraceMetadataKey.SOURCE_RUN]
    assert traces[0].info.request_metadata[TraceMetadataKey.MODEL_ID] == model_info.model_id


def test_classifier_evaluate(multiclass_logistic_regressor_model_uri, iris_dataset):
    y_true = iris_dataset.labels_data
    classifier_model = mlflow.pyfunc.load_model(multiclass_logistic_regressor_model_uri)
    y_pred = classifier_model.predict(iris_dataset.features_data)
    expected_accuracy_score = accuracy_score(y_true, y_pred)
    expected_metrics = {
        "accuracy_score": expected_accuracy_score,
    }
    expected_saved_metrics = {
        "accuracy_score": expected_accuracy_score,
    }

    expected_csv_artifact = confusion_matrix(y_true, y_pred)
    cm_figure = sklearn.metrics.ConfusionMatrixDisplay.from_predictions(y_true, y_pred).figure_
    img_buf = io.BytesIO()
    cm_figure.savefig(img_buf)
    img_buf.seek(0)
    expected_image_artifact = Image.open(img_buf)

    with mlflow.start_run() as run:
        eval_result = evaluate(
            multiclass_logistic_regressor_model_uri,
            iris_dataset._constructor_args["data"],
            model_type="classifier",
            targets=iris_dataset._constructor_args["targets"],
            evaluators="dummy_evaluator",
        )

    csv_artifact_name = "confusion_matrix"
    saved_csv_artifact_path = get_local_artifact_path(run.info.run_id, csv_artifact_name + ".csv")

    png_artifact_name = "confusion_matrix_image"
    saved_png_artifact_path = get_local_artifact_path(run.info.run_id, png_artifact_name) + ".png"

    _, saved_metrics, _, saved_artifacts = get_run_data(run.info.run_id)
    assert saved_metrics == expected_saved_metrics
    assert set(saved_artifacts) == {csv_artifact_name + ".csv", png_artifact_name + ".png"}

    assert eval_result.metrics == expected_metrics
    confusion_matrix_artifact = eval_result.artifacts[csv_artifact_name]
    np.testing.assert_array_equal(confusion_matrix_artifact.content, expected_csv_artifact)
    assert confusion_matrix_artifact.uri == get_artifact_uri(
        run.info.run_id, csv_artifact_name + ".csv"
    )
    np.testing.assert_array_equal(
        confusion_matrix_artifact._load(saved_csv_artifact_path), expected_csv_artifact
    )
    confusion_matrix_image_artifact = eval_result.artifacts[png_artifact_name]
    assert (
        ImageChops.difference(
            confusion_matrix_image_artifact.content, expected_image_artifact
        ).getbbox()
        is None
    )
    assert confusion_matrix_image_artifact.uri == get_artifact_uri(
        run.info.run_id, png_artifact_name + ".png"
    )
    assert (
        ImageChops.difference(
            confusion_matrix_image_artifact._load(saved_png_artifact_path),
            expected_image_artifact,
        ).getbbox()
        is None
    )

    with TempDir() as temp_dir:
        temp_dir_path = temp_dir.path()
        eval_result.save(temp_dir_path)

        with open(temp_dir.path("metrics.json")) as fp:
            assert json.load(fp) == eval_result.metrics

        with open(temp_dir.path("artifacts_metadata.json")) as fp:
            json_dict = json.load(fp)
            assert "confusion_matrix" in json_dict
            assert json_dict["confusion_matrix"] == {
                "uri": confusion_matrix_artifact.uri,
                "class_name": "mlflow_test_plugin.dummy_evaluator.Array2DEvaluationArtifact",
            }

            assert "confusion_matrix_image" in json_dict
            assert json_dict["confusion_matrix_image"] == {
                "uri": confusion_matrix_image_artifact.uri,
                "class_name": "mlflow.models.evaluation.artifacts.ImageEvaluationArtifact",
            }

        assert set(os.listdir(temp_dir.path("artifacts"))) == {
            "confusion_matrix.csv",
            "confusion_matrix_image.png",
        }

        loaded_eval_result = EvaluationResult.load(temp_dir_path)
        assert loaded_eval_result.metrics == eval_result.metrics
        loaded_confusion_matrix_artifact = loaded_eval_result.artifacts[csv_artifact_name]
        assert confusion_matrix_artifact.uri == loaded_confusion_matrix_artifact.uri
        np.testing.assert_array_equal(
            confusion_matrix_artifact.content,
            loaded_confusion_matrix_artifact.content,
        )
        loaded_confusion_matrix_image_artifact = loaded_eval_result.artifacts[png_artifact_name]
        assert confusion_matrix_image_artifact.uri == loaded_confusion_matrix_image_artifact.uri
        assert (
            ImageChops.difference(
                confusion_matrix_image_artifact.content,
                loaded_confusion_matrix_image_artifact.content,
            ).getbbox()
            is None
        )

        new_confusion_matrix_artifact = Array2DEvaluationArtifact(uri=confusion_matrix_artifact.uri)
        new_confusion_matrix_artifact._load()
        np.testing.assert_array_equal(
            confusion_matrix_artifact.content,
            new_confusion_matrix_artifact.content,
        )
        new_confusion_matrix_image_artifact = ImageEvaluationArtifact(
            uri=confusion_matrix_image_artifact.uri
        )
        new_confusion_matrix_image_artifact._load()
        np.testing.assert_array_equal(
            confusion_matrix_image_artifact.content,
            new_confusion_matrix_image_artifact.content,
        )


def test_regressor_evaluate(linear_regressor_model_uri, diabetes_dataset):
    y_true = diabetes_dataset.labels_data
    regressor_model = mlflow.pyfunc.load_model(linear_regressor_model_uri)
    y_pred = regressor_model.predict(diabetes_dataset.features_data)
    expected_mae = mean_absolute_error(y_true, y_pred)
    expected_mse = mean_squared_error(y_true, y_pred)
    expected_metrics = {
        "mean_absolute_error": expected_mae,
        "mean_squared_error": expected_mse,
    }
    expected_saved_metrics = {
        "mean_absolute_error": expected_mae,
        "mean_squared_error": expected_mse,
    }

    with mlflow.start_run() as run:
        eval_result = evaluate(
            linear_regressor_model_uri,
            diabetes_dataset._constructor_args["data"],
            model_type="regressor",
            targets=diabetes_dataset._constructor_args["targets"],
            evaluators="dummy_evaluator",
        )
    _, saved_metrics, _, _ = get_run_data(run.info.run_id)
    assert saved_metrics == expected_saved_metrics
    assert eval_result.metrics == expected_metrics


def _load_diabetes_dataset_in_required_format(format):
    data = sklearn.datasets.load_diabetes()
    if format == "numpy":
        return data.data, data.target
    elif format == "pandas":
        df = pd.DataFrame(data.data, columns=data.feature_names)
        df["label"] = data.target
        return df, "label"
    elif format == "spark":
        spark = SparkSession.builder.master("local[*]").getOrCreate()
        panda_df = pd.DataFrame(data.data, columns=data.feature_names)
        panda_df["label"] = data.target
        spark_df = spark.createDataFrame(panda_df)
        return spark_df, "label"
    elif format == "list":
        return data.data.tolist(), data.target.tolist()
    else:
        raise TypeError(
            f"`format` must be one of 'numpy', 'pandas', 'spark' or 'list', but received {format}."
        )


@pytest.mark.parametrize("data_format", ["list", "numpy", "pandas", "spark"])
def test_regressor_evaluation(linear_regressor_model_uri, data_format):
    data, target = _load_diabetes_dataset_in_required_format(data_format)

    with mlflow.start_run() as run:
        eval_result = evaluate(
            linear_regressor_model_uri,
            data=data,
            targets=target,
            model_type="regressor",
            evaluators=["default"],
        )
    _, saved_metrics, _, _ = get_run_data(run.info.run_id)

    for k, v in eval_result.metrics.items():
        assert v == saved_metrics[k]

    datasets = get_run_datasets(run.info.run_id)
    assert len(datasets) == 1
    assert len(datasets[0].tags) == 0
    assert datasets[0].dataset.source_type == "code"


def test_pandas_df_regressor_evaluation_mlflow_dataset_with_metric_prefix(
    linear_regressor_model_uri,
):
    data = sklearn.datasets.load_diabetes()
    df = pd.DataFrame(data.data, columns=data.feature_names)
    df["y"] = data.target
    mlflow_df = from_pandas(df=df, source="my_src", targets="y")
    with mlflow.start_run() as run:
        eval_result = evaluate(
            linear_regressor_model_uri,
            data=mlflow_df,
            model_type="regressor",
            evaluators=["default"],
            evaluator_config={
                "default": {
                    "metric_prefix": "eval",
                }
            },
        )
    _, saved_metrics, _, _ = get_run_data(run.info.run_id)

    for k, v in eval_result.metrics.items():
        assert v == saved_metrics[k]

    datasets = get_run_datasets(run.info.run_id)
    assert len(datasets) == 1
    assert datasets[0].tags[0].value == "eval"


def test_pandas_df_regressor_evaluation_mlflow_dataset(linear_regressor_model_uri):
    data = sklearn.datasets.load_diabetes()
    df = pd.DataFrame(data.data, columns=data.feature_names)
    df["y"] = data.target
    mlflow_df = from_pandas(df=df, source="my_src", targets="y")
    with mlflow.start_run() as run:
        eval_result = evaluate(
            linear_regressor_model_uri,
            data=mlflow_df,
            model_type="regressor",
            evaluators=["default"],
        )
    _, saved_metrics, _, _ = get_run_data(run.info.run_id)

    for k, v in eval_result.metrics.items():
        assert v == saved_metrics[k]

    datasets = get_run_datasets(run.info.run_id)
    assert len(datasets) == 1
    assert len(datasets[0].tags) == 0


def test_pandas_df_regressor_evaluation_mlflow_dataset_with_targets_from_dataset(
    linear_regressor_model_uri,
):
    data = sklearn.datasets.load_diabetes()
    df = pd.DataFrame(data.data, columns=data.feature_names)
    df["y"] = data.target
    mlflow_df = from_pandas(df=df, source="my_src", targets="y")
    with mlflow.start_run() as run:
        eval_result = evaluate(
            linear_regressor_model_uri,
            data=mlflow_df,
            model_type="regressor",
            evaluators=["default"],
        )
    _, saved_metrics, _, _ = get_run_data(run.info.run_id)

    for k, v in eval_result.metrics.items():
        assert v == saved_metrics[k]

    datasets = get_run_datasets(run.info.run_id)
    assert len(datasets) == 1
    assert len(datasets[0].tags) == 0


def test_dataset_name():
    X, y = get_iris()
    d1 = EvaluationDataset(data=X, targets=y, name="a1")
    assert d1.name == "a1"
    d2 = EvaluationDataset(data=X, targets=y)
    assert d2.name == d2.hash


def test_dataset_metadata():
    X, y = get_iris()
    d1 = EvaluationDataset(data=X, targets=y, name="a1", path="/path/to/a1")
    assert d1._metadata == {
        "hash": "6bdf4e119bf1a37e7907dfd9f0e68733",
        "name": "a1",
        "path": "/path/to/a1",
    }


def test_gen_md5_for_arraylike_obj():
    def get_md5(data):
        md5_gen = hashlib.md5(usedforsecurity=False)
        _gen_md5_for_arraylike_obj(md5_gen, data)
        return md5_gen.hexdigest()

    list0 = list(range(20))
    list1 = [100] + list0[1:]
    list2 = list0[:-1] + [100]
    list3 = list0[:10] + [100] + list0[10:]

    assert len({get_md5(list0), get_md5(list1), get_md5(list2), get_md5(list3)}) == 4

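    # list3 and list4 differ only in their middle element. _gen_md5_for_arraylike_obj
    # folds in the length plus a sample of leading and trailing rows, so a middle-only
    # change is expected to leave the digest unchanged.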
    list4 = list0[:10] + [99] + list0[10:]
    assert get_md5(list3) == get_md5(list4)


def test_gen_md5_for_arraylike_obj_with_pandas_df_using_float_idx_does_not_raise_keyerror():
    float_indices = np.random.uniform(low=0.5, high=13.3, size=(10,))
    df = pd.DataFrame(np.random.randn(10, 4), index=float_indices, columns=["A", "B", "C", "D"])
    md5_gen = hashlib.md5(usedforsecurity=False)
    assert _gen_md5_for_arraylike_obj(md5_gen, df) is None


def test_dataset_hash(
    iris_dataset, iris_pandas_df_dataset, iris_pandas_df_num_cols_dataset, diabetes_spark_dataset
):
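    # Hard-coded digests pin the current hashing scheme; they should only change if
    # EvaluationDataset's hashing implementation changes.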
    assert iris_dataset.hash == "99329a790dc483e7382c0d1d27aac3f3"
    assert iris_pandas_df_dataset.hash == "799d4f50e2e353127f94a0e5300add06"
    assert iris_pandas_df_num_cols_dataset.hash == "3c5fc56830a0646001253e25e17bdce4"
    assert diabetes_spark_dataset.hash == "ebfb050519e7e5b463bd38b0c8d04243"


def test_trace_dataset_hash():
    # Validates that a dataset containing Traces can be hashed.
    df = pd.DataFrame({
        "request": ["Hello"],
        "trace": [Trace(info=create_test_trace_info("tr"), data=TraceData([]))],
    })
    dataset = EvaluationDataset(data=df)
    assert dataset.hash == "757c14bf38aa42d36b93ccd70b1ea719"
    # A dataset with different content should produce a different hash
    df2 = pd.DataFrame({
        "request": ["Hi"],
        "trace": [Trace(info=create_test_trace_info("tr"), data=TraceData([]))],
    })
    dataset2 = EvaluationDataset(data=df2)
    assert dataset2.hash != dataset.hash


def test_dataset_with_pandas_dataframe():
    data = pd.DataFrame({"f1": [1, 2], "f2": [3, 4], "f3": [5, 6], "label": [0, 1]})
    eval_dataset = EvaluationDataset(data=data, targets="label")

    assert list(eval_dataset.features_data.columns) == ["f1", "f2", "f3"]
    np.testing.assert_array_equal(eval_dataset.features_data.f1.to_numpy(), [1, 2])
    np.testing.assert_array_equal(eval_dataset.features_data.f2.to_numpy(), [3, 4])
    np.testing.assert_array_equal(eval_dataset.features_data.f3.to_numpy(), [5, 6])
    np.testing.assert_array_equal(eval_dataset.labels_data, [0, 1])

    eval_dataset2 = EvaluationDataset(data=data, targets="label", feature_names=["f3", "f2"])
    assert list(eval_dataset2.features_data.columns) == ["f3", "f2"]
    np.testing.assert_array_equal(eval_dataset2.features_data.f2.to_numpy(), [3, 4])
    np.testing.assert_array_equal(eval_dataset2.features_data.f3.to_numpy(), [5, 6])


def test_dataset_with_array_data():
    features = [[1, 2], [3, 4]]
    labels = [0, 1]

    for input_data in [features, np.array(features)]:
        eval_dataset1 = EvaluationDataset(data=input_data, targets=labels)
        np.testing.assert_array_equal(eval_dataset1.features_data, features)
        np.testing.assert_array_equal(eval_dataset1.labels_data, labels)
        assert list(eval_dataset1.feature_names) == ["feature_1", "feature_2"]

    assert EvaluationDataset(
        data=input_data, targets=labels, feature_names=["a", "b"]
    ).feature_names == ["a", "b"]

    with pytest.raises(MlflowException, match="all elements must have the same length"):
        EvaluationDataset(data=[[1, 2], [3, 4, 5]], targets=labels)


def test_dataset_autogen_feature_names():
    labels = [0]
    eval_dataset2 = EvaluationDataset(data=[list(range(9))], targets=labels)
    assert eval_dataset2.feature_names == [f"feature_{i + 1}" for i in range(9)]

    eval_dataset2 = EvaluationDataset(data=[list(range(10))], targets=labels)
    assert eval_dataset2.feature_names == [f"feature_{i + 1:02d}" for i in range(10)]

    eval_dataset2 = EvaluationDataset(data=[list(range(99))], targets=labels)
    assert eval_dataset2.feature_names == [f"feature_{i + 1:02d}" for i in range(99)]

    eval_dataset2 = EvaluationDataset(data=[list(range(100))], targets=labels)
    assert eval_dataset2.feature_names == [f"feature_{i + 1:03d}" for i in range(100)]

    with pytest.raises(
        MlflowException, match="features example rows must be the same length with labels array"
    ):
        EvaluationDataset(data=[[1, 2], [3, 4]], targets=[1, 2, 3])


def test_dataset_from_spark_df(spark_session):
    spark_df = spark_session.createDataFrame([(1.0, 2.0, 3.0)] * 10, ["f1", "f2", "y"])
    with mock.patch.object(EvaluationDataset, "SPARK_DATAFRAME_LIMIT", 5):
        dataset = EvaluationDataset(spark_df, targets="y")
        assert list(dataset.features_data.columns) == ["f1", "f2"]
        assert list(dataset.features_data["f1"]) == [1.0] * 5
        assert list(dataset.features_data["f2"]) == [2.0] * 5
        assert list(dataset.labels_data) == [3.0] * 5


def test_log_dataset_tag(iris_dataset, iris_pandas_df_dataset):
    model_uuid = uuid.uuid4().hex
    with mlflow.start_run() as run:
        client = MlflowClient()
        iris_dataset._log_dataset_tag(client, run.info.run_id, model_uuid=model_uuid)
        _, _, tags, _ = get_run_data(run.info.run_id)

        logged_meta1 = {**iris_dataset._metadata, "model": model_uuid}
        logged_meta2 = {**iris_pandas_df_dataset._metadata, "model": model_uuid}

        assert json.loads(tags["mlflow.datasets"]) == [logged_meta1]

        raw_tag = get_raw_tag(run.info.run_id, "mlflow.datasets")
        assert " " not in raw_tag  # assert that all whitespace is stripped from the tag string

        # Test appending a second dataset to the tag
        iris_pandas_df_dataset._log_dataset_tag(client, run.info.run_id, model_uuid=model_uuid)
        _, _, tags, _ = get_run_data(run.info.run_id)
        assert json.loads(tags["mlflow.datasets"]) == [
            logged_meta1,
            logged_meta2,
        ]

        # Logging the same dataset again should not append a duplicate entry
        iris_dataset._log_dataset_tag(client, run.info.run_id, model_uuid=model_uuid)
        _, _, tags, _ = get_run_data(run.info.run_id)
        assert json.loads(tags["mlflow.datasets"]) == [
            logged_meta1,
            logged_meta2,
        ]


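# Stub evaluators and artifacts for the interface tests below. Every method raises
# if called directly; individual tests patch can_evaluate/evaluate with mocks to
# control their behavior.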
class FakeEvaluator1(ModelEvaluator):
    @classmethod
    def can_evaluate(cls, *, model_type, evaluator_config, **kwargs):
        raise RuntimeError()

    def evaluate(self, *, model, model_type, dataset, run_id, evaluator_config, **kwargs):
        raise RuntimeError()


class FakeEvaluator2(ModelEvaluator):
    @classmethod
    def can_evaluate(cls, *, model_type, evaluator_config, **kwargs):
        raise RuntimeError()

    def evaluate(self, *, model, model_type, dataset, run_id, evaluator_config, **kwargs):
        raise RuntimeError()


class FakeArtifact1(EvaluationArtifact):
    def _save(self, output_artifact_path):
        raise RuntimeError()

    def _load_content_from_file(self, local_artifact_path):
        raise RuntimeError()


class FakeArtifact2(EvaluationArtifact):
    def _save(self, output_artifact_path):
        raise RuntimeError()

    def _load_content_from_file(self, local_artifact_path):
        raise RuntimeError()


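# Matchers whose __eq__ only checks the type (or callability) of the other operand,
# letting mock call assertions match arguments without comparing object identity.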
class PyFuncModelMatcher:
    def __eq__(self, other):
        return isinstance(other, mlflow.pyfunc.PyFuncModel)


class ModelPredictFuncMatcher:
    def __eq__(self, other):
        return callable(other)


def test_evaluator_evaluation_interface(multiclass_logistic_regressor_model_uri, iris_dataset):
    with mock.patch.object(
        _model_evaluation_registry, "_registry", {"test_evaluator1": FakeEvaluator1}
    ):
        evaluator1_config = {"eval1_config_a": 3, "eval1_config_b": 4}
        evaluator1_return_value = EvaluationResult(
            metrics={"m1": 5, "m2": 6},
            artifacts={"a1": FakeArtifact1(uri="uri1"), "a2": FakeArtifact2(uri="uri2")},
        )
        with (
            mock.patch.object(
                FakeEvaluator1, "can_evaluate", return_value=False
            ) as mock_can_evaluate,
            mock.patch.object(
                FakeEvaluator1, "evaluate", return_value=evaluator1_return_value
            ) as mock_evaluate,
        ):
            with mlflow.start_run():
                with pytest.raises(
                    MlflowException,
                    match="The model could not be evaluated by any of the registered evaluators",
                ):
                    evaluate(
                        multiclass_logistic_regressor_model_uri,
                        data=iris_dataset._constructor_args["data"],
                        model_type="classifier",
                        targets=iris_dataset._constructor_args["targets"],
                        evaluators="test_evaluator1",
                        evaluator_config=evaluator1_config,
                    )
                mock_can_evaluate.assert_called_once_with(
                    model_type="classifier", evaluator_config=evaluator1_config
                )
                mock_evaluate.assert_not_called()
        with (
            mock.patch.object(
                FakeEvaluator1, "can_evaluate", return_value=True
            ) as mock_can_evaluate,
            mock.patch.object(
                FakeEvaluator1, "evaluate", return_value=evaluator1_return_value
            ) as mock_evaluate,
        ):
            with mlflow.start_run() as run:
                eval1_result = evaluate(
                    multiclass_logistic_regressor_model_uri,
                    iris_dataset._constructor_args["data"],
                    model_type="classifier",
                    targets=iris_dataset._constructor_args["targets"],
                    evaluators="test_evaluator1",
                    evaluator_config=evaluator1_config,
                    extra_metrics=None,
                )
                assert eval1_result.metrics == evaluator1_return_value.metrics
                assert eval1_result.artifacts == evaluator1_return_value.artifacts

                mock_can_evaluate.assert_called_once_with(
                    model_type="classifier", evaluator_config=evaluator1_config
                )
                mock_evaluate.assert_called_once_with(
                    model=PyFuncModelMatcher(),
                    model_type="classifier",
                    model_id=multiclass_logistic_regressor_model_uri.split("/")[-1],
                    dataset=iris_dataset,
                    run_id=run.info.run_id,
                    evaluator_config=evaluator1_config,
                    extra_metrics=None,
                    custom_artifacts=None,
                    predictions=None,
                )


def test_evaluate_with_multi_evaluators(
    multiclass_logistic_regressor_model_uri,
    iris_dataset,
):
    with mock.patch.object(
        _model_evaluation_registry,
        "_registry",
        {"test_evaluator1": FakeEvaluator1, "test_evaluator2": FakeEvaluator2},
    ):
        evaluator1_config = {"eval1_config": 3}
        evaluator2_config = {"eval2_config": 4}
        evaluator1_return_value = EvaluationResult(
            metrics={"m1": 5}, artifacts={"a1": FakeArtifact1(uri="uri1")}
        )

        evaluator2_return_value = EvaluationResult(
            metrics={"m2": 6}, artifacts={"a2": FakeArtifact2(uri="uri2")}
        )

        def get_evaluate_call_arg(model, evaluator_config):
            return {
                "model": model,
                "model_type": "classifier",
                "model_id": model.model_id,
                "dataset": iris_dataset,
                "run_id": run.info.run_id,
                "evaluator_config": evaluator_config,
                "extra_metrics": None,
                "custom_artifacts": None,
                "predictions": None,
            }

        # evaluators=None means the evaluators are unspecified: all registered evaluators
        # should be fetched, and the evaluation results should equal those of
        # evaluators=["test_evaluator1", "test_evaluator2"]
        for evaluators in [None, ["test_evaluator1", "test_evaluator2"]]:
            with (
                mock.patch.object(
                    FakeEvaluator1, "can_evaluate", return_value=True
                ) as mock_can_evaluate1,
                mock.patch.object(
                    FakeEvaluator1, "evaluate", return_value=evaluator1_return_value
                ) as mock_evaluate1,
                mock.patch.object(
                    FakeEvaluator2, "can_evaluate", return_value=True
                ) as mock_can_evaluate2,
                mock.patch.object(
                    FakeEvaluator2, "evaluate", return_value=evaluator2_return_value
                ) as mock_evaluate2,
            ):
                with mlflow.start_run() as run:
                    eval_result = evaluate(
                        multiclass_logistic_regressor_model_uri,
                        iris_dataset._constructor_args["data"],
                        model_type="classifier",
                        targets=iris_dataset._constructor_args["targets"],
                        evaluators=evaluators,
                        evaluator_config={
                            "test_evaluator1": evaluator1_config,
                            "test_evaluator2": evaluator2_config,
                        },
                    )
                    assert eval_result.metrics == {
                        **evaluator1_return_value.metrics,
                        **evaluator2_return_value.metrics,
                    }
                    assert eval_result.artifacts == {
                        **evaluator1_return_value.artifacts,
                        **evaluator2_return_value.artifacts,
                    }
                    mock_evaluate1.assert_called_once_with(
                        **get_evaluate_call_arg(
                            mlflow.pyfunc.load_model(multiclass_logistic_regressor_model_uri),
                            evaluator1_config,
                        )
                    )
                    mock_can_evaluate1.assert_has_calls([
                        mock.call(model_type="classifier", evaluator_config=evaluator1_config)
                    ])
                    mock_evaluate2.assert_called_once_with(
                        **get_evaluate_call_arg(
                            mlflow.pyfunc.load_model(multiclass_logistic_regressor_model_uri),
                            evaluator2_config,
                        )
                    )
                    mock_can_evaluate2.assert_has_calls([
                        mock.call(model_type="classifier", evaluator_config=evaluator2_config)
                    ])


def test_custom_evaluators_no_model_or_preds(multiclass_logistic_regressor_model_uri, iris_dataset):
    """
    Tests that custom evaluators are called correctly when no model or predictions are provided.
    """
    with (
        mock.patch.object(
            _model_evaluation_registry, "_registry", {"test_evaluator1": FakeEvaluator1}
        ),
        mock.patch.object(FakeEvaluator1, "can_evaluate", return_value=True) as mock_can_evaluate,
        mock.patch.object(FakeEvaluator1, "evaluate") as mock_evaluate,
    ):
        with mlflow.start_run() as run:
            evaluate(
                model=None,
                data=iris_dataset._constructor_args["data"],
                predictions=None,
                model_type="classifier",
                targets=iris_dataset._constructor_args["targets"],
                evaluators="test_evaluator1",
                evaluator_config=None,
                extra_metrics=None,
            )

            mock_can_evaluate.assert_called_once_with(model_type="classifier", evaluator_config={})
            mock_evaluate.assert_called_once_with(
                model=None,
                dataset=iris_dataset,
                predictions=None,
                model_type="classifier",
                model_id=None,
                run_id=run.info.run_id,
                evaluator_config={},
                extra_metrics=None,
                custom_artifacts=None,
            )


def test_start_run_or_reuse_active_run():
    with _start_run_or_reuse_active_run() as run:
        assert mlflow.active_run().info.run_id == run.info.run_id

    with mlflow.start_run() as run:
        active_run_id = run.info.run_id

        with _start_run_or_reuse_active_run() as run:
            assert run.info.run_id == active_run_id

        with _start_run_or_reuse_active_run() as run:
            assert run.info.run_id == active_run_id


def test_resolve_evaluators_and_configs():
    from mlflow.models.evaluation.evaluators.classifier import ClassifierEvaluator
    from mlflow.models.evaluation.evaluators.default import DefaultEvaluator
    from mlflow.models.evaluation.evaluators.regressor import RegressorEvaluator
    from mlflow.models.evaluation.evaluators.shap import ShapEvaluator

    def assert_equal(actual, expected):
        assert len(actual) == len(expected)
        for actual_i, expected_i in zip(actual, expected):
            assert actual_i.name == expected_i[0]
            assert isinstance(actual_i.evaluator, expected_i[1])
            assert actual_i.config == expected_i[2]

    with mock.patch.object(
        _model_evaluation_registry,
        "_registry",
        {"default": DefaultEvaluator},
    ):
        assert_equal(
            resolve_evaluators_and_configs(None, None), [("default", DefaultEvaluator, {})]
        )
        assert_equal(
            actual=resolve_evaluators_and_configs(None, {"a": 3}),
            expected=[("default", DefaultEvaluator, {"a": 3})],
        )
        assert_equal(
            actual=resolve_evaluators_and_configs(None, {"default": {"a": 3}}),
            expected=[("default", DefaultEvaluator, {"a": 3})],
        )

    # 1. evaluators is None -> only the default evaluator is used
    assert_equal(
        actual=resolve_evaluators_and_configs(None, None),
        expected=[("default", DefaultEvaluator, {})],
    )
    assert_equal(
        actual=resolve_evaluators_and_configs(None, {"a": 3}),
        expected=[("default", DefaultEvaluator, {"a": 3})],
    )

    # 2. evaluators is None and the model type is classifier -> the builtin classifier
    #    evaluators are used instead of the default one. The dummy evaluator can also
    #    evaluate classifiers.
    assert_equal(
        actual=resolve_evaluators_and_configs(
            evaluators=None, evaluator_config={"a": 3}, model_type="classifier"
        ),
        expected=[
            ("classifier", ClassifierEvaluator, {"a": 3}),
            ("shap", ShapEvaluator, {"a": 3}),
            ("dummy_evaluator", DummyEvaluator, {"a": 3}),
        ],
    )

    assert_equal(
        resolve_evaluators_and_configs(
            evaluators=None,
            # config for a specific evaluator
            evaluator_config={"shap": {"a": 3}},
            model_type="classifier",
        ),
        expected=[
            ("classifier", ClassifierEvaluator, {}),
            ("shap", ShapEvaluator, {"a": 3}),
            ("dummy_evaluator", DummyEvaluator, {}),
        ],
    )

    assert_equal(
        resolve_evaluators_and_configs(
            evaluators=None,
            # config under "default" is copied to the builtin evaluators
            evaluator_config={"default": {"a": 3}},
            model_type="classifier",
        ),
        expected=[
            ("classifier", ClassifierEvaluator, {"a": 3}),
            ("shap", ShapEvaluator, {"a": 3}),
            ("dummy_evaluator", DummyEvaluator, {}),
        ],
    )

    # 3. evaluators is a string -> the specified evaluator is used
    assert_equal(
        actual=resolve_evaluators_and_configs("dummy_evaluator", {"a": 3}, "regressor"),
        expected=[("dummy_evaluator", DummyEvaluator, {"a": 3})],
    )
    assert_equal(
        actual=resolve_evaluators_and_configs("default", {"a": 3}),
        expected=[("default", DefaultEvaluator, {"a": 3})],
    )
    assert_equal(
        actual=resolve_evaluators_and_configs("default", {"a": 3}, "regressor"),
        expected=[
            ("regressor", RegressorEvaluator, {"a": 3}),
            ("shap", ShapEvaluator, {"a": 3}),
        ],
    )
    assert_equal(
        actual=resolve_evaluators_and_configs("regressor", {"a": 3}, "regressor"),
        expected=[("regressor", RegressorEvaluator, {"a": 3})],
    )
    assert_equal(
        actual=resolve_evaluators_and_configs("non-existing", {"a": 3}),
        expected=[],  # empty because the evaluator is not registered
    )

    # 4. evaluators is a list of strings -> the specified evaluators are used
    assert_equal(
        actual=resolve_evaluators_and_configs(
            evaluators=["default", "dummy_evaluator"],
            evaluator_config={"dummy_evaluator": {"a": 3}, "default": {"a": 5}},
            model_type="classifier",
        ),
        expected=[
            ("classifier", ClassifierEvaluator, {"a": 5}),
            ("shap", ShapEvaluator, {"a": 5}),
            ("dummy_evaluator", DummyEvaluator, {"a": 3}),
        ],
    )

    assert_equal(
        actual=resolve_evaluators_and_configs(
            evaluators=["regressor"],
            evaluator_config={"regressor": {"a": 5}},
            model_type="regressor",
        ),
        expected=[("regressor", RegressorEvaluator, {"a": 5})],
    )

    with pytest.raises(
        MlflowException,
        match="If `evaluators` argument is an evaluator name list, evaluator_config must",
    ):
        resolve_evaluators_and_configs(["default", "dummy_evaluator"], {"abc": {"a": 3}})


def test_resolve_evaluators_raise_for_missing_databricks_agent_dependency():
    with pytest.raises(
        MlflowException,
        match="Databricks Agents SDK must be installed to use the `databricks-agent` model type.",
    ):
        resolve_evaluators_and_configs(
            evaluators=None, evaluator_config=None, model_type="databricks-agent"
        )


def test_evaluate_env_manager_params(multiclass_logistic_regressor_model_uri, iris_dataset):
    model = mlflow.pyfunc.load_model(multiclass_logistic_regressor_model_uri)

    with mock.patch.object(
        _model_evaluation_registry, "_registry", {"test_evaluator1": FakeEvaluator1}
    ):
        with pytest.raises(MlflowException, match="The model argument must be a string URI"):
            evaluate(
                model,
                iris_dataset._constructor_args["data"],
                model_type="classifier",
                targets=iris_dataset._constructor_args["targets"],
                evaluators=None,
                env_manager="virtualenv",
            )

        with pytest.raises(MlflowException, match="Invalid value for `env_manager`"):
            evaluate(
                multiclass_logistic_regressor_model_uri,
                iris_dataset._constructor_args["data"],
                model_type="classifier",
                targets=iris_dataset._constructor_args["targets"],
                evaluators=None,
                env_manager="manager",
            )


@pytest.mark.parametrize("env_manager", ["virtualenv", "conda"])
def test_evaluate_restores_env(tmp_path, env_manager, iris_dataset):
    class EnvRestoringTestModel(mlflow.pyfunc.PythonModel):
        def __init__(self):
            pass

        def predict(self, context, model_input, params=None):
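            # The model is saved below with scikit-learn==1.4.2 pinned, so a
            # prediction of 1 signals that the pinned environment was restored
            # rather than the test environment's scikit-learn being used.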
            pred_value = 1 if sklearn.__version__ == "1.4.2" else 0

            return model_input.apply(lambda row: pred_value, axis=1)

    class FakeEvaluatorEnv(ModelEvaluator):
        @classmethod
        def can_evaluate(cls, *, model_type, evaluator_config, **kwargs):
            return True

        def evaluate(self, *, model, model_type, dataset, run_id, evaluator_config, **kwargs):
            y = model.predict(pd.DataFrame(dataset.features_data))
            return EvaluationResult(metrics={"test": y[0]}, artifacts={})

    model_path = os.path.join(tmp_path, "model")

    mlflow.pyfunc.save_model(
        path=model_path,
        python_model=EnvRestoringTestModel(),
        pip_requirements=["scikit-learn==1.4.2"],
    )

    with mock.patch.object(
        _model_evaluation_registry,
        "_registry",
        {"test_evaluator_env": FakeEvaluatorEnv},
    ):
        result = evaluate(
            model_path,
            iris_dataset._constructor_args["data"],
            model_type="classifier",
            targets=iris_dataset._constructor_args["targets"],
            evaluators=None,
            env_manager=env_manager,
        )
        assert result.metrics["test"] == 1


def test_evaluate_terminates_model_servers(multiclass_logistic_regressor_model_uri, iris_dataset):
    # Mock the _load_model_or_server() results to avoid starting model servers
    model = mlflow.pyfunc.load_model(multiclass_logistic_regressor_model_uri)
    client = ScoringServerClient("127.0.0.1", "8080")
    served_model_1 = _ServedPyFuncModel(model_meta=model.metadata, client=client, server_pid=1)
    served_model_2 = _ServedPyFuncModel(model_meta=model.metadata, client=client, server_pid=2)
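    # The server PIDs above are fabricated and os.kill is mocked below, so the test
    # only verifies that a SIGTERM was issued, without signaling a real process.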

    with (
        mock.patch.object(
            _model_evaluation_registry,
            "_registry",
            {"test_evaluator1": FakeEvaluator1},
        ),
        mock.patch.object(FakeEvaluator1, "can_evaluate", return_value=True),
        mock.patch.object(
            FakeEvaluator1, "evaluate", return_value=EvaluationResult(metrics={}, artifacts={})
        ),
        mock.patch("mlflow.pyfunc._load_model_or_server") as server_loader,
        mock.patch("os.kill") as os_mock,
    ):
        server_loader.side_effect = [served_model_1, served_model_2]
        evaluate(
            multiclass_logistic_regressor_model_uri,
            iris_dataset._constructor_args["data"],
            model_type="classifier",
            targets=iris_dataset._constructor_args["targets"],
            evaluators=None,
            env_manager="virtualenv",
        )
        assert os_mock.call_count == 1
        os_mock.assert_has_calls([mock.call(1, signal.SIGTERM)])


def test_evaluate_stdin_scoring_server():
    X, y = sklearn.datasets.load_iris(return_X_y=True)
    X = X[::5]
    y = y[::5]
    model = sklearn.linear_model.LogisticRegression()
    model.fit(X, y)

    with mlflow.start_run():
        model_info = mlflow.sklearn.log_model(model, name="model")

    with mock.patch("mlflow.pyfunc.check_port_connectivity", return_value=False):
        mlflow.evaluate(
            model_info.model_uri,
            X,
            targets=y,
            model_type="classifier",
            evaluators=["default"],
            env_manager="virtualenv",
        )


@pytest.mark.parametrize("model_type", ["regressor", "classifier"])
def test_targets_is_required_for_regressor_and_classifier_models(model_type):
    with pytest.raises(MlflowException, match="The targets argument must be specified"):
        mlflow.evaluate(
            "models:/test",
            data=pd.DataFrame(),
            model_type=model_type,
        )


1364  def test_evaluate_xgboost_classifier():
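          # The default evaluator should compute classification metrics for a
          # native XGBoost Booster logged via mlflow.xgboost.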
1365      import xgboost as xgb
1366  
1367      X, y = sklearn.datasets.load_iris(return_X_y=True, as_frame=True)
1368      X = X[::5]
1369      y = y[::5]
1370      data = xgb.DMatrix(X, label=y)
1371      model = xgb.train({"objective": "multi:softmax", "num_class": 3}, data, num_boost_round=5)
1372  
1373      with mlflow.start_run() as run:
1374          model_info = mlflow.xgboost.log_model(model, name="model")
1375          mlflow.evaluate(
1376              model_info.model_uri,
1377              X.assign(y=y),
1378              targets="y",
1379              model_type="classifier",
1380          )
1381  
1382      run = mlflow.get_run(run.info.run_id)
1383      assert "accuracy_score" in run.data.metrics
1384      assert "recall_score" in run.data.metrics
1385      assert "precision_score" in run.data.metrics
1386      assert "f1_score" in run.data.metrics
1387  
1388  
1389  def test_evaluate_lightgbm_regressor():
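          # The default evaluator should compute regression metrics for a
          # native LightGBM Booster logged via mlflow.lightgbm.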
1390      import lightgbm as lgb
1391  
1392      X, y = sklearn.datasets.load_diabetes(return_X_y=True, as_frame=True)
1393      X = X[::5]
1394      y = y[::5]
1395      data = lgb.Dataset(X, label=y)
1396      model = lgb.train({"objective": "regression"}, data, num_boost_round=5)
1397  
1398      with mlflow.start_run() as run:
1399          model_info = mlflow.lightgbm.log_model(model, name="model")
1400          mlflow.evaluate(
1401              model_info.model_uri,
1402              X.assign(y=y),
1403              targets="y",
1404              model_type="regressor",
1405          )
1406  
1407      run = mlflow.get_run(run.info.run_id)
1408      assert "mean_absolute_error" in run.data.metrics
1409      assert "mean_squared_error" in run.data.metrics
1410      assert "root_mean_squared_error" in run.data.metrics
1411  
1412  
1413  def test_evaluate_with_targets_error_handling():
1414      import lightgbm as lgb
1415  
1416      X, y = sklearn.datasets.load_diabetes(return_X_y=True, as_frame=True)
1417      X = X[::5]
1418      y = y[::5]
1419      lgb_data = lgb.Dataset(X, label=y)
1420      model = lgb.train({"objective": "regression"}, lgb_data, num_boost_round=5)
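          # Three distinct errors are expected, depending on how targets are
          # (mis)specified in the calls below.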
1421      ERROR_TYPE_1 = (
1422          "The top-level targets parameter should not be specified since a Dataset "
1423          "is used. Please only specify the targets column name in the Dataset. For example: "
1424          "`data = mlflow.data.from_pandas(df=X.assign(y=y), targets='y')`. "
1425          "Meanwhile, please specify `mlflow.evaluate(..., targets=None, ...)`."
1426      )
1427      ERROR_TYPE_2 = (
1428          "The targets column name must be specified in the provided Dataset "
1429          "for regressor models. For example: "
1430          "`data = mlflow.data.from_pandas(df=X.assign(y=y), targets='y')`"
1431      )
1432      ERROR_TYPE_3 = "The targets argument must be specified for regressor models."
1433  
1434      pandas_dataset_no_targets = X
1435      mlflow_dataset_no_targets = mlflow.data.from_pandas(df=X.assign(y=y))
1436      mlflow_dataset_with_targets = mlflow.data.from_pandas(df=X.assign(y=y), targets="y")
1437  
1438      with mlflow.start_run():
1439          with pytest.raises(MlflowException, match=re.escape(ERROR_TYPE_1)):
1440              mlflow.evaluate(
1441                  model=model,
1442                  data=mlflow_dataset_with_targets,
1443                  model_type="regressor",
1444                  targets="y",
1445              )
1446  
1447          with pytest.raises(MlflowException, match=re.escape(ERROR_TYPE_1)):
1448              mlflow.evaluate(
1449                  model=model,
1450                  data=mlflow_dataset_no_targets,
1451                  model_type="regressor",
1452                  targets="y",
1453              )
1454  
1455          with pytest.raises(MlflowException, match=re.escape(ERROR_TYPE_1)):
1456              mlflow.evaluate(
1457                  model=model,
1458                  data=mlflow_dataset_with_targets,
1459                  model_type="question-answering",
1460                  targets="y",
1461              )
1462  
1463          with pytest.raises(MlflowException, match=re.escape(ERROR_TYPE_1)):
1464              mlflow.evaluate(
1465                  model=model,
1466                  data=mlflow_dataset_no_targets,
1467                  model_type="question-answering",
1468                  targets="y",
1469              )
1470  
1471          with pytest.raises(MlflowException, match=re.escape(ERROR_TYPE_2)):
1472              mlflow.evaluate(
1473                  model=model,
1474                  data=mlflow_dataset_no_targets,
1475                  model_type="regressor",
1476              )
1477  
1478          with pytest.raises(MlflowException, match=re.escape(ERROR_TYPE_3)):
1479              mlflow.evaluate(
1480                  model=model,
1481                  data=pandas_dataset_no_targets,
1482                  model_type="regressor",
1483              )
1484  
1485  
1486  def test_evaluate_with_predictions_error_handling():
1487      import lightgbm as lgb
1488  
1489      X, y = sklearn.datasets.load_diabetes(return_X_y=True, as_frame=True)
1490      X = X[::5]
1491      y = y[::5]
1492      lgb_data = lgb.Dataset(X, label=y)
1493      model = lgb.train({"objective": "regression"}, lgb_data, num_boost_round=5)
1494      mlflow_dataset_with_predictions = mlflow.data.from_pandas(
1495          df=X.assign(y=y, model_output=y),
1496          targets="y",
1497          predictions="model_output",
1498      )
1499      with mlflow.start_run():
1500          with pytest.raises(
1501              MlflowException,
1502              match="The predictions parameter should not be specified in the Dataset since a model "
1503              "is specified. Please remove the predictions column from the Dataset.",
1504          ):
1505              mlflow.evaluate(
1506                  model=model,
1507                  data=mlflow_dataset_with_predictions,
1508                  model_type="regressor",
1509              )
1510  
1511  
1512  def test_evaluate_with_function_input_single_output():
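          # evaluate() accepts a plain Python function as the model to score with.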
1513      import lightgbm as lgb
1514  
1515      X, y = sklearn.datasets.load_diabetes(return_X_y=True, as_frame=True)
1516      X = X[::5]
1517      y = y[::5]
1518      data = lgb.Dataset(X, label=y)
1519      model = lgb.train({"objective": "regression"}, data, num_boost_round=5)
1520  
1521      def fn(X):
1522          return model.predict(X)
1523  
1524      with mlflow.start_run() as run:
1525          mlflow.evaluate(
1526              fn,
1527              X.assign(y=y),
1528              targets="y",
1529              model_type="regressor",
1530          )
1531      run = mlflow.get_run(run.info.run_id)
1532      assert "mean_absolute_error" in run.data.metrics
1533      assert "mean_squared_error" in run.data.metrics
1534      assert "root_mean_squared_error" in run.data.metrics
1535  
1536  
1537  def test_evaluate_with_loaded_pyfunc_model():
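          # An already-loaded PyFuncModel instance can be passed directly to evaluate().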
1538      import lightgbm as lgb
1539  
1540      X, y = sklearn.datasets.load_diabetes(return_X_y=True, as_frame=True)
1541      X = X[::5]
1542      y = y[::5]
1543      data = lgb.Dataset(X, label=y)
1544      model = lgb.train({"objective": "regression"}, data, num_boost_round=5)
1545  
1546      with mlflow.start_run() as run:
1547          model_info = mlflow.lightgbm.log_model(model, name="model")
1548          loaded_model = mlflow.pyfunc.load_model(model_info.model_uri)
1549          mlflow.evaluate(
1550              loaded_model,
1551              X.assign(y=y),
1552              targets="y",
1553              model_type="regressor",
1554          )
1555  
1556      run = mlflow.get_run(run.info.run_id)
1557      assert "mean_absolute_error" in run.data.metrics
1558      assert "mean_squared_error" in run.data.metrics
1559      assert "root_mean_squared_error" in run.data.metrics
1560  
1561  
1562  def test_evaluate_with_static_dataset_input_single_output():
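          # Static evaluation: no model is supplied, so predictions are read from
          # the 'model_output' column of the data.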
1563      X, y = sklearn.datasets.load_diabetes(return_X_y=True, as_frame=True)
1564      X = X[::5]
1565      y = y[::5]
1566      with mlflow.start_run() as run:
1567          mlflow.evaluate(
1568              data=X.assign(y=y, model_output=y),
1569              targets="y",
1570              predictions="model_output",
1571              model_type="regressor",
1572          )
1573  
1574      run = mlflow.get_run(run.info.run_id)
1575      assert "mean_absolute_error" in run.data.metrics
1576      assert "mean_squared_error" in run.data.metrics
1577      assert "root_mean_squared_error" in run.data.metrics
1578  
1579  
1580  def test_evaluate_with_static_mlflow_dataset_input():
1581      X, y = sklearn.datasets.load_diabetes(return_X_y=True, as_frame=True)
1582      X = X[::5]
1583      y = y[::5]
1584      data = mlflow.data.from_pandas(
1585          df=X.assign(y=y, model_output=y), targets="y", predictions="model_output"
1586      )
1587      with mlflow.start_run() as run:
1588          mlflow.evaluate(
1589              data=data,
1590              model_type="regressor",
1591          )
1592  
1593      run = mlflow.get_run(run.info.run_id)
1594      assert "mean_absolute_error" in run.data.metrics
1595      assert "mean_squared_error" in run.data.metrics
1596      assert "root_mean_squared_error" in run.data.metrics
1597  
1598  
1599  def test_evaluate_with_static_dataset_error_handling_pandas_dataframe():
1600      X, y = sklearn.datasets.load_diabetes(return_X_y=True, as_frame=True)
1601      X = X[::5]
1602      y = y[::5]
1603      with mlflow.start_run():
1604          with pytest.raises(MlflowException, match="The data argument cannot be None."):
1605              mlflow.evaluate(
1606                  data=None,
1607                  targets="y",
1608                  model_type="regressor",
1609              )
1610  
1611          with pytest.raises(
1612              MlflowException,
1613              match="The specified pandas DataFrame does not contain the specified predictions"
1614              " column 'prediction'.",
1615          ):
1616              mlflow.evaluate(
1617                  data=X.assign(y=y, model_output=y),
1618                  targets="y",
1619                  predictions="prediction",
1620                  model_type="regressor",
1621              )
1622  
1623  
1624  def test_evaluate_with_static_dataset_error_handling_pandas_dataset():
1625      X, y = sklearn.datasets.load_diabetes(return_X_y=True, as_frame=True)
1626      X = X[::5]
1627      y = y[::5]
1628      dataset_with_predictions = mlflow.data.from_pandas(
1629          df=X.assign(y=y, model_output=y), targets="y", predictions="model_output"
1630      )
1631      dataset_no_predictions = mlflow.data.from_pandas(df=X.assign(y=y, model_output=y), targets="y")
1632      ERROR_MESSAGE = (
1633          "The top-level predictions parameter should not be specified since a Dataset is "
1634          "used. Please only specify the predictions column name in the Dataset. For example: "
1635          "`data = mlflow.data.from_pandas(df=X.assign(y=y), predictions='y')`"
1636          "Meanwhile, please specify `mlflow.evaluate(..., predictions=None, ...)`."
1637      )
1638      with mlflow.start_run():
1639          with pytest.raises(MlflowException, match=re.escape(ERROR_MESSAGE)):
1640              mlflow.evaluate(
1641                  data=dataset_with_predictions,
1642                  model_type="regressor",
1643                  predictions="model_output",
1644              )
1645  
1646          with pytest.raises(MlflowException, match=re.escape(ERROR_MESSAGE)):
1647              mlflow.evaluate(
1648                  data=dataset_no_predictions,
1649                  model_type="regressor",
1650                  predictions="model_output",
1651              )
1652  
1653  
1654  def test_binary_classification_missing_minority_class_exception_override(
1655      binary_logistic_regressor_model_uri, breast_cancer_dataset, monkeypatch
1656  ):
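          # This env var suppresses the exception the default evaluator would
          # otherwise raise when a class is missing from the targets.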
1657      monkeypatch.setenv("_MLFLOW_EVALUATE_SUPPRESS_CLASSIFICATION_ERRORS", "True")
1658  
1659      ds_targets = breast_cancer_dataset._constructor_args["targets"]
1660      # Simulate a missing target label
1661      ds_targets = np.where(ds_targets == 0, 1, ds_targets)
1662  
1663      with mlflow.start_run() as run:
1664          eval_result = evaluate(
1665              binary_logistic_regressor_model_uri,
1666              breast_cancer_dataset._constructor_args["data"],
1667              model_type="classifier",
1668              targets=ds_targets,
1669              evaluators=["default"],
1670          )
1671      _, saved_metrics, _, _ = get_run_data(run.info.run_id)
1672  
1673      for key, saved_val in saved_metrics.items():
1674          eval_val = eval_result.metrics[key]
1675          # Some metrics are NaN due to the class imbalance; for example,
1676          # roc_auc_score returns NaN because all targets were
1677          # overridden to `1` above.
1678          if np.isnan(saved_val):
1679              assert np.isnan(eval_val)
1680          else:
1681              assert eval_val == saved_val
1682  
1683  
1684  def test_multiclass_classification_missing_minority_class_exception_override(
1685      multiclass_logistic_regressor_model_uri, iris_dataset, monkeypatch
1686  ):
1687      monkeypatch.setenv("_MLFLOW_EVALUATE_SUPPRESS_CLASSIFICATION_ERRORS", "True")
1688  
1689      ds_targets = iris_dataset._constructor_args["targets"]
1690      # Simulate a missing target label
1691      ds_targets = np.where(ds_targets == 0, 1, ds_targets)
1692  
1693      with mlflow.start_run() as run:
1694          eval_result = evaluate(
1695              multiclass_logistic_regressor_model_uri,
1696              iris_dataset._constructor_args["data"],
1697              model_type="classifier",
1698              targets=ds_targets,
1699              evaluators=["default"],
1700          )
1701      _, saved_metrics, _, saved_artifacts = get_run_data(run.info.run_id)
1702  
1703      assert saved_metrics == eval_result.metrics
1704      assert "shap_beeswarm_plot.png" not in saved_artifacts
1705  
1706  
1707  @pytest.mark.parametrize(
1708      ("model", "is_endpoint_uri"),
1709      [
1710          ("endpoints:/test", True),
1711          ("endpoints:///my-chat", True),
1712          ("models:/test", False),
1713          (None, False),
1714      ],
1715  )
1716  def test_is_model_deployment_endpoint_uri(model, is_endpoint_uri):
1717      assert _is_model_deployment_endpoint_uri(model) == is_endpoint_uri
1718  
1719  
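      # Minimal llm/v1/chat-style response payload used to mock deployment endpoints.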
1720  _DUMMY_CHAT_RESPONSE = {
1721      "id": "1",
1722      "object": "text_completion",
1723      "created": "2021-10-01T00:00:00.000000Z",
1724      "model": "gpt-4o-mini",
1725      "choices": [
1726          {
1727              "index": 0,
1728              "message": {
1729                  "content": "This is a response",
1730                  "role": "assistant",
1731              },
1732              "finish_reason": "length",
1733          }
1734      ],
1735      "usage": {
1736          "prompt_tokens": 1,
1737          "completion_tokens": 1,
1738          "total_tokens": 2,
1739      },
1740  }
1741  
1742  _TEST_QUERY_LIST = ["What is MLflow?", "What is Spark?"]
1743  _TEST_GT_LIST = [
1744      "MLflow is an open-source platform for machine learning (ML).",
1745      "Apache Spark is an open-source, distributed computing system.",
1746  ]
1747  
1748  
1749  @pytest.mark.parametrize(
1750      ("input_data", "feature_names", "targets"),
1751      [
1752          # String input column
1753          (
1754              pd.DataFrame({"inputs": _TEST_QUERY_LIST, "ground_truth": _TEST_GT_LIST}),
1755              None,
1756              "ground_truth",
1757          ),
1758          # String input column with feature_names
1759          (
1760              pd.DataFrame({"question": _TEST_QUERY_LIST, "ground_truth": _TEST_GT_LIST}),
1761              ["question"],
1762              "ground_truth",
1763          ),
1764          # Dictionary input column that contains message history
1765          (
1766              pd.DataFrame({
1767                  "inputs": [
1768                      {
1769                          "messages": [{"content": q, "role": "user"}],
1770                          "max_tokens": 10,
1771                      }
1772                      for q in _TEST_QUERY_LIST
1773                  ],
1774                  "ground_truth": _TEST_GT_LIST,
1775              }),
1776              None,
1777              "ground_truth",
1778          ),
1779          # List of strings
1780          (
1781              _TEST_QUERY_LIST,
1782              None,
1783              _TEST_GT_LIST,
1784          ),
1785          # List of strings with feature_names
1786          (
1787              _TEST_QUERY_LIST,
1788              ["question"],
1789              _TEST_GT_LIST,
1790          ),
1791          # List of strings with feature_names and without targets
1792          (
1793              _TEST_QUERY_LIST,
1794              ["question"],
1795              None,
1796          ),
1797          # List of dictionaries, without feature_names
1798          (
1799              [
1800                  {
1801                      "messages": [{"content": q, "role": "user"}],
1802                      "max_tokens": 10,
1803                  }
1804                  for q in _TEST_QUERY_LIST
1805              ],
1806              None,
1807              _TEST_GT_LIST,
1808          ),
1809      ],
1810  )
1811  def test_evaluate_on_chat_model_endpoint(input_data, feature_names, targets):
1812      with mock.patch("mlflow.deployments.get_deploy_client") as mock_deploy_client:
1813          mock_deploy_client.return_value.predict.return_value = _DUMMY_CHAT_RESPONSE
1814          mock_deploy_client.return_value.get_endpoint.return_value = {"task": "llm/v1/chat"}
1815  
1816          with mlflow.start_run():
1817              eval_result = mlflow.evaluate(
1818                  model="endpoints:/chat",
1819                  data=input_data,
1820                  model_type="question-answering",
1821                  feature_names=feature_names,
1822                  targets=targets,
1823                  inference_params={"max_tokens": 10, "temperature": 0.5},
1824              )
1825  
1826      # Validate the endpoint is called with correct payloads
1827      call_args_list = mock_deploy_client.return_value.predict.call_args_list
1828      expected_calls = [
1829          mock.call(
1830              endpoint="chat",
1831              inputs={
1832                  "messages": [{"content": "What is MLflow?", "role": "user"}],
1833                  "max_tokens": 10,
1834                  "temperature": 0.5,
1835              },
1836          ),
1837          mock.call(
1838              endpoint="chat",
1839              inputs={
1840                  "messages": [{"content": "What is Spark?", "role": "user"}],
1841                  "max_tokens": 10,
1842                  "temperature": 0.5,
1843              },
1844          ),
1845      ]
1846      assert call_args_list == expected_calls
1847  
1848      # Validate the evaluation metrics
1849      expected_metrics_subset = {"toxicity/v1/ratio", "ari_grade_level/v1/mean"}
1850      if targets:
1851          expected_metrics_subset.add("exact_match/v1")
1852      assert expected_metrics_subset.issubset(set(eval_result.metrics.keys()))
1853  
1854      # Validate the model output is passed to the evaluator in the correct format (string)
1855      eval_results_table = eval_result.tables["eval_results_table"]
1856      assert eval_results_table["outputs"].equals(pd.Series(["This is a response"] * 2))
1857  
1858  
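      # Minimal llm/v1/completions-style response payload used to mock deployment endpoints.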
1859  _DUMMY_COMPLETION_RESPONSE = {
1860      "id": "1",
1861      "object": "text_completion",
1862      "created": "2021-10-01T00:00:00.000000Z",
1863      "model": "gpt-4o-mini",
1864      "choices": [{"index": 0, "text": "This is a response", "finish_reason": "length"}],
1865      "usage": {
1866          "prompt_tokens": 1,
1867          "completion_tokens": 1,
1868          "total_tokens": 2,
1869      },
1870  }
1871  
1872  
1873  @pytest.mark.parametrize(
1874      ("input_data", "feature_names"),
1875      [
1876          (pd.DataFrame({"inputs": _TEST_QUERY_LIST}), None),
1877          (pd.DataFrame({"question": _TEST_QUERY_LIST}), ["question"]),
1878          (pd.DataFrame({"inputs": [{"prompt": q} for q in _TEST_QUERY_LIST]}), None),
1879          (_TEST_QUERY_LIST, None),
1880          ([{"prompt": q} for q in _TEST_QUERY_LIST], None),
1881      ],
1882  )
1883  def test_evaluate_on_completion_model_endpoint(input_data, feature_names):
1884      with mock.patch("mlflow.deployments.get_deploy_client") as mock_deploy_client:
1885          mock_deploy_client.return_value.predict.return_value = _DUMMY_COMPLETION_RESPONSE
1886          mock_deploy_client.return_value.get_endpoint.return_value = {"task": "llm/v1/completions"}
1887  
1888          with mlflow.start_run():
1889              eval_result = mlflow.evaluate(
1890                  model="endpoints:/completions",
1891                  data=input_data,
1892                  inference_params={"max_tokens": 10},
1893                  model_type="text",
1894                  feature_names=feature_names,
1895              )
1896  
1897      # Validate the endpoint is called with correct payloads
1898      call_args_list = mock_deploy_client.return_value.predict.call_args_list
1899      expected_calls = [
1900          mock.call(endpoint="completions", inputs={"prompt": "What is MLflow?", "max_tokens": 10}),
1901          mock.call(endpoint="completions", inputs={"prompt": "What is Spark?", "max_tokens": 10}),
1902      ]
1903      assert call_args_list == expected_calls
1904  
1905      # Validate the evaluation metrics
1906      expected_metrics_subset = {
1907          "toxicity/v1/ratio",
1908          "ari_grade_level/v1/mean",
1909          "flesch_kincaid_grade_level/v1/mean",
1910      }
1911      assert expected_metrics_subset.issubset(set(eval_result.metrics.keys()))
1912  
1913      # Validate the model output is passed to the evaluator in the correct format (string)
1914      eval_results_table = eval_result.tables["eval_results_table"]
1915      assert eval_results_table["outputs"].equals(pd.Series(["This is a response"] * 2))
1916  
1917  
1918  def test_evaluate_on_model_endpoint_without_type():
1919      with mock.patch("mlflow.deployments.get_deploy_client") as mock_deploy_client:
1920          # An endpoint that does not declare an endpoint type. For such an endpoint,
1921          # the input data is passed through without any modification and the
1922          # response is returned as-is.
1923          mock_deploy_client.return_value.get_endpoint.return_value = {}
1924          mock_deploy_client.return_value.predict.return_value = "This is a response"
1925  
1926          input_data = pd.DataFrame({
1927              "inputs": [
1928                  {
1929                      "messages": [{"content": q, "role": "user"}],
1930                      "max_tokens": 10,
1931                  }
1932                  for q in _TEST_QUERY_LIST
1933              ],
1934              "ground_truth": _TEST_GT_LIST,
1935          })
1936  
1937          with mlflow.start_run():
1938              eval_result = mlflow.evaluate(
1939                  model="endpoints:/random",
1940                  data=input_data,
1941                  model_type="question-answering",
1942                  targets="ground_truth",
1943                  inference_params={"max_tokens": 10, "temperature": 0.5},
1944              )
1945  
1946      # Validate the endpoint is called with correct payloads
1947      call_args_list = mock_deploy_client.return_value.predict.call_args_list
1948      expected_calls = [
1949          mock.call(
1950              endpoint="random",
1951              inputs={
1952                  "messages": [{"content": "What is MLflow?", "role": "user"}],
1953                  "max_tokens": 10,
1954                  "temperature": 0.5,
1955              },
1956          ),
1957          mock.call(
1958              endpoint="random",
1959              inputs={
1960                  "messages": [{"content": "What is Spark?", "role": "user"}],
1961                  "max_tokens": 10,
1962                  "temperature": 0.5,
1963              },
1964          ),
1965      ]
1966      assert call_args_list == expected_calls
1967  
1968      # Validate the evaluation metrics
1969      expected_metrics_subset = {"toxicity/v1/ratio", "ari_grade_level/v1/mean", "exact_match/v1"}
1970      assert expected_metrics_subset.issubset(set(eval_result.metrics.keys()))
1971  
1972      # Validate the model output is passed to the evaluator in the correct format (string)
1973      eval_results_table = eval_result.tables["eval_results_table"]
1974      assert eval_results_table["outputs"].equals(pd.Series(["This is a response"] * 2))
1975  
1976  
1977  def test_evaluate_on_model_endpoint_invalid_payload():
1978      with mock.patch("mlflow.deployments.get_deploy_client") as mock_deploy_client:
1979          # An endpoint that does not declare an endpoint type; the input data is
1980          # passed through unmodified. Here, predict raises to simulate an
1981          # invalid payload.
1982          mock_deploy_client.return_value.get_endpoint.return_value = {}
1983          mock_deploy_client.return_value.predict.side_effect = ValueError("Invalid payload")
1984  
1985          input_data = pd.DataFrame({
1986              "inputs": [{"invalid": "payload"}],
1987          })
1988  
1989          with pytest.raises(MlflowException, match="Failed to call the deployment endpoint"):
1990              mlflow.evaluate(
1991                  model="endpoints:/random",
1992                  data=input_data,
1993                  model_type="question-answering",
1994                  inference_params={"max_tokens": 10, "temperature": 0.5},
1995              )
1996  
1997  
1998  @pytest.mark.parametrize(
1999      ("input_data", "error_message"),
2000      [
2001          # Extra input columns
2002          (
2003              pd.DataFrame({
2004                  "inputs": _TEST_QUERY_LIST,
2005                  "extra_input": ["a", "b"],
2006                  "ground_truth": _TEST_GT_LIST,
2007              }),
2008              "The number of input columns must be 1",
2009          ),
2010          # Missing input columns
2011          (
2012              pd.DataFrame({"ground_truth": _TEST_GT_LIST}),
2013              "The number of input columns must be 1",
2014          ),
2015          # Input column not str or dict
2016          (
2017              pd.DataFrame({"inputs": [1, 2], "ground_truth": _TEST_GT_LIST}),
2018              "Invalid input data type",
2019          ),
2020      ],
2021  )
2022  def test_evaluate_on_model_endpoint_invalid_input_data(input_data, error_message):
2023      with mock.patch("mlflow.deployments.get_deploy_client") as mock_deploy_client:
2024          mock_deploy_client.return_value.get_endpoint.return_value = {"task": "llm/v1/chat"}
2025  
2026          with pytest.raises(MlflowException, match=error_message):
2027              with mlflow.start_run():
2028                  mlflow.evaluate(
2029                      model="endpoints:/chat",
2030                      data=input_data,
2031                      model_type="question-answering",
2032                      targets="ground_truth",
2033                      inference_params={"max_tokens": 10, "temperature": 0.5},
2034                  )
2035  
2036  
2037  @pytest.mark.parametrize(
2038      "model_input",
2039      [
2040          # Case 1: Single chat dictionary.
2041          # This is an expected input format from the Databricks RAG Evaluator.
2042          {
2043              "messages": [{"content": "What is MLflow?", "role": "user"}],
2044              "max_tokens": 10,
2045          },
2046          # Case 2: List of chat dictionaries.
2047          # This is not a typical input format from either the default or Databricks RAG
2048          # evaluators, but we support it for compatibility with normal pyfunc models.
2049          [
2050              {"messages": [{"content": "What is MLflow?", "role": "user"}]},
2051              {"messages": [{"content": "What is Spark?", "role": "user"}]},
2052          ],
2053          # Case 3: DataFrame with a column of dictionaries
2054          pd.DataFrame({
2055              "inputs": [
2056                  {
2057                      "messages": [{"content": "What is MLflow?", "role": "user"}],
2058                      "max_tokens": 10,
2059                  },
2060                  {
2061                      "messages": [{"content": "What is Spark?", "role": "user"}],
2062                  },
2063              ]
2064          }),
2065          # Case 4: DataFrame with a column of strings
2066          pd.DataFrame({
2067              "inputs": ["What is MLflow?", "What is Spark?"],
2068          }),
2069      ],
2070  )
2071  def test_model_from_deployment_endpoint(model_input):
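          # A single dict should yield one predict call and an unwrapped string response;
          # list and DataFrame inputs should yield one predict call per row.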
2072      with mock.patch("mlflow.deployments.get_deploy_client") as mock_deploy_client:
2073          mock_deploy_client.return_value.predict.return_value = _DUMMY_CHAT_RESPONSE
2074          mock_deploy_client.return_value.get_endpoint.return_value = {"task": "llm/v1/chat"}
2075  
2076          model = _get_model_from_deployment_endpoint_uri("endpoints:/chat")
2077  
2078          response = model.predict(model_input)
2079  
2080      if isinstance(model_input, dict):
2081          assert mock_deploy_client.return_value.predict.call_count == 1
2082          # Chat response should be unwrapped
2083          assert response == "This is a response"
2084      else:
2085          assert mock_deploy_client.return_value.predict.call_count == 2
2086          assert pd.Series(response).equals(pd.Series(["This is a response"] * 2))
2087  
2088  
2089  def test_import_evaluation_dataset():
2090      # Validate that both import paths work at the same time
2091      from mlflow.models.evaluation import EvaluationDataset
2092      from mlflow.models.evaluation.base import EvaluationDataset  # noqa: F401
2093  
2094  
2095  def test_evaluate_shows_server_stdout_and_stderr_on_error(
2096      linear_regressor_model_uri, diabetes_dataset
2097  ):
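          # The fake server process only echoes a marker string; evaluate should
          # surface the server's output in the raised exception message.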
2098      with mlflow.start_run():
2099          server_proc = subprocess.Popen(
2100              ["echo", "test1324"], stdout=subprocess.PIPE, stderr=subprocess.STDOUT
2101          )
2102          with mock.patch(
2103              "mlflow.pyfunc.backend.PyFuncBackend.serve",
2104              return_value=server_proc,
2105          ) as mock_serve:
2106              with pytest.raises(MlflowException, match="test1324"):
2107                  evaluate(
2108                      linear_regressor_model_uri,
2109                      diabetes_dataset._constructor_args["data"],
2110                      model_type="regressor",
2111                      targets=diabetes_dataset._constructor_args["targets"],
2112                      evaluators="dummy_evaluator",
2113                      env_manager="virtualenv",
2114                  )
2115              mock_serve.assert_called_once()
2116  
2117  
2118  def test_env_manager_set_on_served_pyfunc_model(multiclass_logistic_regressor_model_uri):
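          # _ServedPyFuncModel should expose env_manager as a settable property.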
2119      model = mlflow.pyfunc.load_model(multiclass_logistic_regressor_model_uri)
2120      client = ScoringServerClient("127.0.0.1", "8080")
2121      served_model_1 = _ServedPyFuncModel(model_meta=model.metadata, client=client, server_pid=1)
2122      served_model_1.env_manager = "virtualenv"
2123      assert served_model_1.env_manager == "virtualenv"
2124  
2125  
2126  def test_metrics_logged_to_model_on_evaluation(
2127      multiclass_logistic_regressor_model_uri, iris_dataset
2128  ):
2129      with mlflow.start_run():
2130          # Log the model and retrieve its model_id
2131          model_info = mlflow.sklearn.log_model(
2132              mlflow.sklearn.load_model(multiclass_logistic_regressor_model_uri), name="model"
2133          )
2134          model_id = model_info.model_id
2135  
2136          # Evaluate the model via its URI; metrics should be linked to its model_id
2137          eval_result = mlflow.evaluate(
2138              model=model_info.model_uri,
2139              data=iris_dataset._constructor_args["data"],
2140              model_type="classifier",
2141              targets=iris_dataset._constructor_args["targets"],
2142              evaluators=["default"],
2143          )
2144  
2145          # Retrieve metrics logged to the model
2146          logged_model_metrics = mlflow.get_logged_model(model_id).metrics
2147  
2148          # Ensure metrics are logged to the model
2149          assert eval_result.metrics == {metric.key: metric.value for metric in logged_model_metrics}
2150  
2151          # Validate that all metrics have the correct model_id in their metadata
2152          assert all(metric.model_id == model_id for metric in logged_model_metrics)
2153  
2154  
2155  def test_evaluate_with_model_id(iris_dataset):
2156      # Create and log a model
2157      with mlflow.start_run():
2158          model = sklearn.linear_model.LogisticRegression()
2159          model.fit(iris_dataset._constructor_args["data"], iris_dataset._constructor_args["targets"])
2160          model_info = mlflow.sklearn.log_model(model, name="model")
2161          model_id = model_info.model_id
2162  
2163      # Evaluate the model with the specified model ID
2164      with mlflow.start_run():
2165          result = evaluate(
2166              model_info.model_uri,
2167              iris_dataset._constructor_args["data"],
2168              model_type="classifier",
2169              targets=iris_dataset._constructor_args["targets"],
2170              model_id=model_id,
2171          )
2172  
2173          # Verify metrics were logged
2174          assert result.metrics is not None
2175          assert len(result.metrics) > 0
2176  
2177          # Verify metrics are linked to the model ID
2178          logged_model = mlflow.get_logged_model(model_id)
2179          assert logged_model is not None
2180          assert logged_model.model_id == model_id
2181  
2182          # Convert metrics list to a dictionary for easier lookup
2183          logged_metrics = {metric.key: metric.value for metric in logged_model.metrics}
2184  
2185          # Verify each metric from the evaluation result matches the logged model metrics
2186          for metric_name, metric_value in result.metrics.items():
2187              assert metric_name in logged_metrics, (
2188                  f"Metric {metric_name} not found in logged model metrics"
2189              )
2190              assert logged_metrics[metric_name] == metric_value, (
2191                  f"Metric {metric_name} value mismatch: "
2192                  f"expected {metric_value}, got {logged_metrics[metric_name]}"
2193              )
2194  
2195  
2196  def test_evaluate_model_id_consistency_check(multiclass_logistic_regressor_model_uri, iris_dataset):
2197      """
2198      Test that an error is raised when the specified model_id contradicts the model's associated ID.
2199      """
2200      # Create a model with a known model ID
2201      with mlflow.start_run():
2202          model = sklearn.linear_model.LogisticRegression()
2203          model.fit(iris_dataset._constructor_args["data"], iris_dataset._constructor_args["targets"])
2204          model_info = mlflow.sklearn.log_model(
2205              model,
2206              name="model",
2207          )
2208          model_uri = model_info.model_uri
2209          model_id = model_info.model_uuid
2210  
2211          # Test that specifying a matching model_id works
2212          evaluate(
2213              model_uri,
2214              iris_dataset._constructor_args["data"],
2215              targets=iris_dataset._constructor_args["targets"],
2216              model_type="classifier",
2217              model_id=model_id,
2218          )
2219  
2220          # Test that specifying a different model_id raises an error
2221          with pytest.raises(
2222              MlflowException,
2223              match=(
2224                  r"The specified value of the 'model_id' parameter '.*' "
2225                  r"contradicts the model_id '.*' associated with the model\. Please ensure "
2226                  r"they match or omit the 'model_id' parameter\."
2227              ),
2228          ):
2229              evaluate(
2230                  model_uri,
2231                  iris_dataset._constructor_args["data"],
2232                  targets=iris_dataset._constructor_args["targets"],
2233                  model_type="classifier",
2234                  model_id="different_model_id",
2235              )
2236  
2237          # Test that not specifying model_id works
2238          evaluate(
2239              model_uri,
2240              iris_dataset._constructor_args["data"],
2241              targets=iris_dataset._constructor_args["targets"],
2242              model_type="classifier",
2243          )
2244  
2245  
2246  def test_evaluate_log_metrics_to_active_model(iris_dataset):
2247      # Set active model
2248      mlflow.set_active_model(name="my-model")
2249      active_model_id = mlflow.get_active_model_id()
2250  
2251      model = sklearn.linear_model.LogisticRegression()
2252      model.fit(iris_dataset._constructor_args["data"], iris_dataset._constructor_args["targets"])
2253      eval_df = pd.DataFrame({
2254          "inputs": iris_dataset._constructor_args["data"].tolist(),
2255          "targets": iris_dataset._constructor_args["targets"],
2256          "predictions": model.predict(iris_dataset._constructor_args["data"]),
2257      })
2258  
2259      eval_dataset = mlflow.data.from_pandas(
2260          df=eval_df,
2261          name="eval_dataset",
2262          targets="targets",
2263          predictions="predictions",
2264      )
2265  
2266      # Evaluate the model without model_id; the active model's ID should be used
2267      with mlflow.start_run():
2268          result = evaluate(
2269              data=eval_dataset,
2270              model_type="classifier",
2271          )
2272  
2273          # Verify metrics were logged
2274          assert result.metrics is not None
2275          assert len(result.metrics) > 0
2276  
2277          # Verify metrics are linked to the active model ID
2278          logged_model = mlflow.get_logged_model(active_model_id)
2279          assert logged_model is not None
2280          assert logged_model.model_id == active_model_id
2281  
2282          # Convert metrics list to a dictionary for easier lookup
2283          logged_metrics = {metric.key: metric.value for metric in logged_model.metrics}
2284  
2285          # Verify each metric from the evaluation result matches the logged model metrics
2286          assert logged_metrics.items() <= result.metrics.items()
2287  
2288  
2289  def test_mlflow_evaluate_logs_traces_to_active_model():
2290      eval_data = pd.DataFrame({
2291          "inputs": [
2292              "What is MLflow?",
2293              "What is Spark?",
2294          ],
2295          "ground_truth": ["What is MLflow?", "Not what is Spark?"],
2296      })
2297  
2298      @mlflow.trace
2299      def model(inputs):
2300          return inputs
2301  
2302      # No model_id is attached when no active model is set and none is passed
2303      evaluate(model, eval_data, targets="ground_truth", extra_metrics=[mlflow.metrics.exact_match()])
2304      traces = get_traces()
2305      assert len(traces) == 1
2306      assert TraceMetadataKey.MODEL_ID not in traces[0].info.request_metadata
2307  
2308      # No active model is set; pass model_id explicitly
2309      assert mlflow.get_active_model_id() is None
2310      model_id = mlflow.create_external_model(name="my-model").model_id
2311      evaluate(
2312          model,
2313          eval_data,
2314          targets="ground_truth",
2315          extra_metrics=[mlflow.metrics.exact_match()],
2316          model_id=model_id,
2317      )
2318      traces = get_traces()
2319      assert len(traces) == 2
2320      assert traces[0].info.request_metadata[TraceMetadataKey.MODEL_ID] == model_id
2321  
2322      # set active model
2323      with mlflow.set_active_model(name="my-model") as active_model:
2324          model_id = active_model.model_id
2325          evaluate(
2326              model, eval_data, targets="ground_truth", extra_metrics=[mlflow.metrics.exact_match()]
2327          )
2328          traces = get_traces()
2329          assert len(traces) == 3
2330          assert traces[0].info.request_metadata[TraceMetadataKey.MODEL_ID] == model_id
2331  
2332          # An explicitly passed model_id takes precedence over the active model
2333          assert mlflow.get_active_model_id() is not None
2334          another_model_id = mlflow.create_external_model(name="another-model").model_id
2335          evaluate(
2336              model,
2337              eval_data,
2338              targets="ground_truth",
2339              extra_metrics=[mlflow.metrics.exact_match()],
2340              model_id=another_model_id,
2341          )
2342          traces = get_traces()
2343          assert len(traces) == 4
2344          assert traces[0].info.request_metadata[TraceMetadataKey.MODEL_ID] == another_model_id
2345  
2346          # The model_id of the passed model takes precedence over the active model
2347          assert mlflow.get_active_model_id() is not None
2348          model_info = mlflow.pyfunc.log_model(
2349              name="model",
2350              python_model=model,
2351              input_example="What is MLflow?",
2352          )
2353          evaluate(
2354              model_info.model_uri,
2355              eval_data,
2356              targets="ground_truth",
2357              extra_metrics=[mlflow.metrics.exact_match()],
2358          )
2359          traces = get_traces()
2360          assert len(traces) == 5
2361          assert traces[0].info.request_metadata[TraceMetadataKey.MODEL_ID] == model_info.model_id
2362      # TODO: test that a registered ModelVersion's model_id works once it is supported
2363  
2364  
2365  def test_delete_run_deletes_assessments_with_source_run_id():
2366      @mlflow.trace
2367      def model(inputs):
2368          return inputs
2369  
2370      eval_data = pd.DataFrame({
2371          "inputs": ["What is MLflow?"],
2372          "ground_truth": ["MLflow is an ML platform."],
2373      })
2374  
2375      with mlflow.start_run() as run:
2376          evaluate(
2377              model, eval_data, targets="ground_truth", extra_metrics=[mlflow.metrics.exact_match()]
2378          )
2379  
2380      traces = get_traces()
2381      assert len(traces) == 1
2382      trace_id = traces[0].info.trace_id
2383  
2384      # Log a feedback assessment linked to the run via sourceRunId metadata
2385      linked_feedback = mlflow.log_feedback(
2386          trace_id=trace_id,
2387          name="eval_feedback",
2388          value="good",
2389          metadata={AssessmentMetadataKey.SOURCE_RUN_ID: run.info.run_id},
2390      )
2391  
2392      # Log another feedback assessment NOT linked to any run
2393      unlinked_feedback = mlflow.log_feedback(
2394          trace_id=trace_id,
2395          name="unlinked_feedback",
2396          value="also good",
2397      )
2398  
2399      # Verify both assessments exist
2400      trace = mlflow.get_trace(trace_id)
2401      assert len(trace.info.assessments) >= 2
2402      assessment_ids = {a.assessment_id for a in trace.info.assessments}
2403      assert linked_feedback.assessment_id in assessment_ids
2404      assert unlinked_feedback.assessment_id in assessment_ids
2405  
2406      # Delete the run
2407      MlflowClient().delete_run(run.info.run_id)
2408  
2409      # Verify the linked assessment was deleted but the unlinked one survives
2410      trace = mlflow.get_trace(trace_id)
2411      remaining_ids = {a.assessment_id for a in trace.info.assessments}
2412      assert linked_feedback.assessment_id not in remaining_ids
2413      assert unlinked_feedback.assessment_id in remaining_ids