mlflow/models/evaluation/base.py
   1  import inspect
   2  import json
   3  import keyword
   4  import logging
   5  import os
   6  import pathlib
   7  import signal
   8  import urllib.parse
   9  from abc import ABCMeta, abstractmethod
  10  from contextlib import contextmanager, nullcontext
  11  from dataclasses import dataclass
  12  from inspect import Parameter, Signature
  13  from types import FunctionType
  14  from typing import Any
  15  
  16  import mlflow
  17  from mlflow.data.dataset import Dataset
  18  from mlflow.data.evaluation_dataset import (
  19      EvaluationDataset,
  20      convert_data_to_mlflow_dataset,
  21  )
  22  from mlflow.entities.dataset_input import DatasetInput
  23  from mlflow.entities.input_tag import InputTag
  24  from mlflow.entities.logged_model_input import LoggedModelInput
  25  from mlflow.exceptions import MlflowException
  26  from mlflow.models.evaluation.utils.trace import configure_autologging_for_evaluation
  27  from mlflow.protos.databricks_pb2 import INVALID_PARAMETER_VALUE
  28  from mlflow.store.artifact.utils.models import _parse_model_id_if_present
  29  from mlflow.telemetry.events import EvaluateEvent
  30  from mlflow.telemetry.track import record_usage_event
  31  from mlflow.tracking.artifact_utils import _download_artifact_from_uri
  32  from mlflow.tracking.client import MlflowClient
  33  from mlflow.tracking.fluent import _set_active_model
  34  from mlflow.utils import _get_fully_qualified_class_name
  35  from mlflow.utils.annotations import developer_stable
  36  from mlflow.utils.class_utils import _get_class_from_string
  37  from mlflow.utils.file_utils import TempDir
  38  from mlflow.utils.mlflow_tags import MLFLOW_DATASET_CONTEXT
  39  from mlflow.utils.proto_json_utils import NumpyEncoder
  40  
  41  try:
  42      # `pandas` is not required for `mlflow-skinny`.
  43      import pandas as pd
  44  except ImportError:
  45      pass
  46  
  47  _logger = logging.getLogger(__name__)
  48  
  49  
  50  class _ModelType:
  51      REGRESSOR = "regressor"
  52      CLASSIFIER = "classifier"
  53      QUESTION_ANSWERING = "question-answering"
  54      TEXT_SUMMARIZATION = "text-summarization"
  55      TEXT = "text"
  56      RETRIEVER = "retriever"
   57      # This model type is used for Mosaic AI Agent evaluation and is only available in Databricks
  58      # https://docs.databricks.com/en/generative-ai/agent-evaluation/index.html
  59      DATABRICKS_AGENT = "databricks-agent"
  60  
  61      def __init__(self):
  62          raise NotImplementedError("This class is not meant to be instantiated.")
  63  
  64      @classmethod
  65      def values(cls):
  66          return (
  67              cls.REGRESSOR,
  68              cls.CLASSIFIER,
  69              cls.QUESTION_ANSWERING,
  70              cls.TEXT_SUMMARIZATION,
  71              cls.TEXT,
  72              cls.RETRIEVER,
  73          )
  74  
  75  
  76  class EvaluationMetric:
  77      '''
  78      An evaluation metric.
  79  
  80      Args:
  81          eval_fn: A function that computes the metric with the following signature:
  82  
  83              .. code-block:: python
  84  
  85                  def eval_fn(
  86                      predictions: pandas.Series,
  87                      targets: pandas.Series,
  88                      metrics: Dict[str, MetricValue],
  89                      **kwargs,
  90                  ) -> Union[float, MetricValue]:
  91                      """
  92                      Args:
  93                          predictions: A pandas Series containing the predictions made by the model.
  94                          targets: (Optional) A pandas Series containing the corresponding labels
  95                              for the predictions made on that input.
  96                          metrics: (Optional) A dictionary containing the metrics calculated by the
  97                              default evaluator.  The keys are the names of the metrics and the values
  98                              are the metric values.  To access the MetricValue for the metrics
  99                              calculated by the system, make sure to specify the type hint for this
 100                              parameter as Dict[str, MetricValue].  Refer to the DefaultEvaluator
 101                              behavior section for what metrics will be returned based on the type of
 102                              model (i.e. classifier or regressor).
 103                          kwargs: Includes a list of args that are used to compute the metric. These
 104                              args could be information coming from input data, model outputs,
 105                              other metrics, or parameters specified in the `evaluator_config`
 106                              argument of the `mlflow.evaluate` API.
 107  
 108                      Returns: MetricValue with per-row scores, per-row justifications, and aggregate
 109                          results.
 110                      """
 111                      ...
 112  
 113          name: The name of the metric.
 114          greater_is_better: Whether a greater value of the metric is better.
 115          long_name: (Optional) The long name of the metric. For example,
  116              ``"root_mean_squared_error"`` for ``"rmse"``.
 117          version: (Optional) The metric version. For example ``v1``.
 118          metric_details: (Optional) A description of the metric and how it is calculated.
 119          metric_metadata: (Optional) A dictionary containing metadata for the metric.
 120          genai_metric_args: (Optional) A dictionary containing arguments specified by users
 121              when calling make_genai_metric or make_genai_metric_from_prompt. Those args
 122              are persisted so that we can deserialize the same metric object later.
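
           A minimal illustrative sketch of an ``eval_fn`` that returns a ``MetricValue`` (the
           ``row_match`` metric below is hypothetical); in practice, ``EvaluationMetric`` objects are
           created via :py:func:`make_metric <mlflow.models.make_metric>` rather than instantiated
           directly:

           .. code-block:: python

               import pandas as pd

               from mlflow.metrics import MetricValue
               from mlflow.models import make_metric


               def row_match(predictions: pd.Series, targets: pd.Series, metrics) -> MetricValue:
                   # Per-row 0/1 scores plus an aggregate mean, packaged as a MetricValue.
                   scores = (predictions == targets).astype(float).tolist()
                   return MetricValue(
                       scores=scores,
                       aggregate_results={"mean": sum(scores) / len(scores)},
                   )


               row_match_metric = make_metric(eval_fn=row_match, greater_is_better=True)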
 123      '''
 124  
 125      def __init__(
 126          self,
 127          eval_fn,
 128          name,
 129          greater_is_better,
 130          long_name=None,
 131          version=None,
 132          metric_details=None,
 133          metric_metadata=None,
 134          genai_metric_args=None,
 135      ):
 136          self.eval_fn = eval_fn
 137          self.name = name
 138          self.greater_is_better = greater_is_better
 139          self.long_name = long_name or name
 140          self.version = version
 141          self.metric_details = metric_details
 142          self.metric_metadata = metric_metadata
 143          self.genai_metric_args = genai_metric_args
 144  
 145      def __str__(self):
 146          parts = [f"name={self.name}, greater_is_better={self.greater_is_better}"]
 147  
 148          if self.long_name:
 149              parts.append(f"long_name={self.long_name}")
 150          if self.version:
 151              parts.append(f"version={self.version}")
 152          if self.metric_details:
 153              parts.append(f"metric_details={self.metric_details}")
 154          if self.metric_metadata:
 155              parts.append(f"metric_metadata={self.metric_metadata}")
 156  
 157          return "EvaluationMetric(" + ", ".join(parts) + ")"
 158  
 159  
 160  # NB: we need this function because we cannot modify the signature of
 161  # a class's __call__ method after the class has been defined.
  162  # This is also useful for distinguishing between metrics whose eval_fn signatures differ.
 163  def _generate_eval_metric_class(eval_fn, require_strict_signature=False):
 164      """
  165      Dynamically generate a CallableEvaluationMetric class that can be used to evaluate the
  166      metric on the given input data. The generated class is callable: its __call__ method takes
  167      the arguments specified in the signature of the eval_fn function.
 168  
 169      Args:
 170          eval_fn: the evaluation function of the EvaluationMetric.
 171          require_strict_signature: (Optional) Whether the eval_fn needs to follow a strict signature.
  172              If True, then the eval_fn must follow the signature below:
 173  
 174                  .. code-block:: python
 175  
 176                      def eval_fn(
 177                          predictions: "pd.Series",
 178                          metrics: Dict[str, MetricValue],
 179                          inputs: "pd.Series",
 180                          *args,
 181                      ) -> MetricValue:
 182                          pass
 183  
 184              When generating a metric from `make_genai_metric`, this should be set to True.
  185              Defaults to False.
 186  
 187      Returns:
 188          A dynamically generated callable CallableEvaluationMetric class.
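
           A rough internal-usage sketch (``my_eval_fn`` and ``my_metric`` below are hypothetical):

           .. code-block:: python

               import pandas as pd


               def my_eval_fn(predictions, targets):
                   return float((predictions == targets).mean())


               metric_cls = _generate_eval_metric_class(my_eval_fn)
               metric = metric_cls(eval_fn=my_eval_fn, name="my_metric", greater_is_better=True)
               # Instances are callable; only keyword arguments matching ``eval_fn`` are accepted.
               score = metric(predictions=pd.Series([1, 2]), targets=pd.Series([1, 3]))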
 189      """
 190      from mlflow.metrics.base import MetricValue
 191  
 192      if require_strict_signature:
 193          allowed_kwargs_names = [
 194              param_name
 195              for param_name in inspect.signature(eval_fn).parameters.keys()
 196              if param_name not in ["predictions", "metrics", "inputs"]
 197          ]
 198  
 199          def genai_call_method(
 200              self,
 201              *,
 202              predictions: pd.Series | str | list[str],
 203              inputs: pd.Series | str | list[str],
 204              metrics: dict[str, MetricValue] | None = None,
 205              **kwargs,
 206          ) -> MetricValue:
 207              if missed_kwargs := set(allowed_kwargs_names) - set(kwargs.keys()):
 208                  raise MlflowException.invalid_parameter_value(
 209                      f"Missing required arguments: {missed_kwargs}",
 210                  )
 211              if extra_kwargs := set(kwargs.keys()) - set(allowed_kwargs_names):
 212                  raise MlflowException.invalid_parameter_value(
 213                      f"Unexpected arguments: {extra_kwargs}",
 214                  )
 215              return self.eval_fn(
 216                  _convert_val_to_pd_Series(predictions, "predictions"),
 217                  metrics or {},
 218                  _convert_val_to_pd_Series(inputs, "inputs"),
 219                  # Note: based on https://github.com/mlflow/mlflow/blob/4fef77afdbe4d76302cb0b1aad2bd72b5cde64e9/mlflow/metrics/genai/genai_metric.py#L49-L53
 220                  # the extra params passed https://github.com/mlflow/mlflow/blob/4fef77afdbe4d76302cb0b1aad2bd72b5cde64e9/mlflow/metrics/genai/genai_metric.py#L513
 221                  # should always be pandas Series
 222                  *[
 223                      _convert_val_to_pd_Series(kwargs[arg_name], arg_name)
 224                      for arg_name in allowed_kwargs_names
 225                  ],
 226              )
 227  
 228          genai_call_method.__signature__ = Signature(
 229              parameters=[
 230                  Parameter("self", Parameter.POSITIONAL_OR_KEYWORD),
 231                  Parameter(
 232                      "predictions",
 233                      Parameter.KEYWORD_ONLY,
 234                      annotation=pd.Series | str | list[str],
 235                  ),
 236                  Parameter(
 237                      "inputs",
 238                      Parameter.KEYWORD_ONLY,
 239                      annotation=pd.Series | str | list[str],
 240                  ),
 241                  Parameter(
 242                      "metrics",
 243                      Parameter.KEYWORD_ONLY,
 244                      annotation=dict[str, MetricValue] | None,
 245                      default=None,
 246                  ),
 247                  *[
 248                      Parameter(name, Parameter.KEYWORD_ONLY, annotation=pd.Series | str | list[str])
 249                      for name in allowed_kwargs_names
 250                  ],
 251              ]
 252          )
 253          genai_call_method.__doc__ = f"""
 254              Evaluate the metric on the given inputs and predictions.
 255              Note: only keyword arguments are supported.
 256  
 257              Args:
 258                  predictions: predictions made by the model.
 259                  inputs: inputs used to make the predictions.
 260                  metrics: metrics calculated by the default evaluator.
 261                  kwargs: additional arguments used to compute the metric.
 262                      Required arguments: {allowed_kwargs_names}
 263  
 264              Returns:
 265                  evaluation result as MetricValue object.
 266              """
 267          call_method = genai_call_method
 268  
 269      else:
 270  
 271          def _call_method(
 272              self,
 273              **kwargs,
 274          ) -> MetricValue:
 275              return self.eval_fn(**kwargs)
 276  
 277          allowed_kwargs_params = inspect.signature(eval_fn).parameters
 278          _call_method.__signature__ = Signature(
 279              parameters=[
 280                  Parameter("self", Parameter.POSITIONAL_OR_KEYWORD),
 281                  *[
 282                      Parameter(
 283                          name,
 284                          Parameter.KEYWORD_ONLY,
 285                          annotation=allowed_kwargs_params[name].annotation,
 286                      )
 287                      for name in allowed_kwargs_params.keys()
 288                  ],
 289              ]
 290          )
 291          _call_method.__doc__ = f"""
 292              Evaluate the metric on the given inputs and predictions.
 293              Note: only keyword arguments are supported.
 294  
 295              Args:
 296                  kwargs: additional arguments used to compute the metric.
 297                      Required arguments: {list(allowed_kwargs_params.keys())}
 298  
 299              Returns:
 300                  evaluation result as MetricValue object.
 301              """
 302          call_method = _call_method
 303  
 304      return type(
 305          "CallableEvaluationMetric",
 306          (EvaluationMetric,),
 307          {"__call__": call_method},
 308      )
 309  
 310  
 311  def _convert_val_to_pd_Series(val, name):
 312      if val is not None and not isinstance(val, pd.Series):
 313          if isinstance(val, str):
 314              return pd.Series([val])
 315          elif isinstance(val, list):
 316              return pd.Series(val)
 317          else:
 318              raise TypeError(
 319                  f"Expected {name} to be a string, list, or Pandas Series, got {type(val)}"
 320              )
 321      return val
 322  
 323  
 324  def make_metric(
 325      *,
 326      eval_fn,
 327      greater_is_better,
 328      name=None,
 329      long_name=None,
 330      version=None,
 331      metric_details=None,
 332      metric_metadata=None,
 333      genai_metric_args=None,
 334  ):
 335      '''
 336      A factory function to create an :py:class:`EvaluationMetric` object.
 337  
 338      Args:
 339          eval_fn: A function that computes the metric with the following signature:
 340  
 341              .. code-block:: python
 342  
 343                  def eval_fn(
 344                      predictions: pandas.Series,
 345                      targets: pandas.Series,
 346                      metrics: Dict[str, MetricValue],
 347                      **kwargs,
 348                  ) -> Union[float, MetricValue]:
 349                      """
 350                      Args:
 351                          predictions: A pandas Series containing the predictions made by the model.
 352                          targets: (Optional) A pandas Series containing the corresponding labels
 353                              for the predictions made on that input.
 354                          metrics: (Optional) A dictionary containing the metrics calculated by the
 355                              default evaluator.  The keys are the names of the metrics and the values
 356                              are the metric values.  To access the MetricValue for the metrics
 357                              calculated by the system, make sure to specify the type hint for this
 358                              parameter as Dict[str, MetricValue].  Refer to the DefaultEvaluator
 359                              behavior section for what metrics will be returned based on the type of
  360                              model (i.e. classifier or regressor).
 364                          kwargs: Includes a list of args that are used to compute the metric. These
 365                              args could be information coming from input data, model outputs,
 366                              other metrics, or parameters specified in the `evaluator_config`
 367                              argument of the `mlflow.evaluate` API.
 368  
 369                      Returns: MetricValue with per-row scores, per-row justifications, and aggregate
 370                          results.
 371                      """
 372                      ...
 373  
 374          greater_is_better: Whether a greater value of the metric is better.
 375          name: The name of the metric. This argument must be specified if ``eval_fn`` is a lambda
 376                      function or the ``eval_fn.__name__`` attribute is not available.
 377          long_name: (Optional) The long name of the metric. For example, ``"mean_squared_error"``
 378              for ``"mse"``.
 379          version: (Optional) The metric version. For example ``v1``.
 380          metric_details: (Optional) A description of the metric and how it is calculated.
 381          metric_metadata: (Optional) A dictionary containing metadata for the metric.
 382          genai_metric_args: (Optional) A dictionary containing arguments specified by users
 383              when calling make_genai_metric or make_genai_metric_from_prompt. Those args
 384              are persisted so that we can deserialize the same metric object later.
 385  
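           A minimal usage sketch (the ``tolerance_rate`` metric below is a hypothetical example):

           .. code-block:: python

               import pandas as pd

               from mlflow.models import make_metric


               def tolerance_rate(predictions: pd.Series, targets: pd.Series, metrics) -> float:
                   # Fraction of predictions within +/-1.0 of the corresponding target.
                   return float(((predictions - targets).abs() <= 1.0).mean())


               tolerance_metric = make_metric(eval_fn=tolerance_rate, greater_is_better=True)
               # ``tolerance_metric`` can then be passed to ``mlflow.evaluate(..., extra_metrics=[...])``.
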
 386      .. seealso::
 387  
 388          - :py:class:`mlflow.models.EvaluationMetric`
 389          - :py:func:`mlflow.evaluate`
 390      '''
 391      return _make_metric(
 392          eval_fn=eval_fn,
 393          greater_is_better=greater_is_better,
 394          name=name,
 395          long_name=long_name,
 396          version=version,
 397          metric_details=metric_details,
 398          metric_metadata=metric_metadata,
 399          genai_metric_args=genai_metric_args,
 400          require_strict_signature=False,
 401      )
 402  
 403  
 404  def _make_metric(
 405      *,
 406      eval_fn,
 407      greater_is_better,
 408      name=None,
 409      long_name=None,
 410      version=None,
 411      metric_details=None,
 412      metric_metadata=None,
 413      genai_metric_args=None,
 414      require_strict_signature=False,
 415  ):
 416      '''
 417      A factory function to create an :py:class:`EvaluationMetric` object.
 418  
 419      Args:
 420          eval_fn: A function that computes the metric with the following signature:
 421  
 422              .. code-block:: python
 423  
 424                  def eval_fn(
 425                      predictions: pandas.Series,
 426                      targets: pandas.Series,
 427                      metrics: Dict[str, MetricValue],
 428                      **kwargs,
 429                  ) -> Union[float, MetricValue]:
 430                      """
 431                      Args:
 432                          predictions: A pandas Series containing the predictions made by the model.
 433                          targets: (Optional) A pandas Series containing the corresponding labels
 434                              for the predictions made on that input.
 435                          metrics: (Optional) A dictionary containing the metrics calculated by the
 436                              default evaluator.  The keys are the names of the metrics and the values
 437                              are the metric values.  To access the MetricValue for the metrics
 438                              calculated by the system, make sure to specify the type hint for this
 439                              parameter as Dict[str, MetricValue].  Refer to the DefaultEvaluator
 440                              behavior section for what metrics will be returned based on the type of
  441                              model (i.e. classifier or regressor).
 445                          kwargs: Includes a list of args that are used to compute the metric. These
 446                              args could be information coming from input data, model outputs,
 447                              other metrics, or parameters specified in the `evaluator_config`
 448                              argument of the `mlflow.evaluate` API.
 449  
 450                      Returns: MetricValue with per-row scores, per-row justifications, and aggregate
 451                          results.
 452                      """
 453                      ...
 454  
 455          greater_is_better: Whether a greater value of the metric is better.
 456          name: The name of the metric. This argument must be specified if ``eval_fn`` is a lambda
 457                      function or the ``eval_fn.__name__`` attribute is not available.
 458          long_name: (Optional) The long name of the metric. For example, ``"mean_squared_error"``
 459              for ``"mse"``.
 460          version: (Optional) The metric version. For example ``v1``.
 461          metric_details: (Optional) A description of the metric and how it is calculated.
 462          metric_metadata: (Optional) A dictionary containing metadata for the metric.
 463          genai_metric_args: (Optional) A dictionary containing arguments specified by users
 464              when calling make_genai_metric or make_genai_metric_from_prompt. Those args
 465              are persisted so that we can deserialize the same metric object later.
 466          require_strict_signature: (Optional) Whether the eval_fn needs to follow a strict signature.
  467              If True, then the eval_fn must follow the signature below:
 468  
 469                  .. code-block:: python
 470  
 471                      def eval_fn(
 472                          predictions: "pd.Series",
 473                          metrics: Dict[str, MetricValue],
 474                          inputs: "pd.Series",
 475                          *args,
 476                      ) -> MetricValue:
 477                          pass
 478  
 479              When generating a metric from `make_genai_metric`, this should be set to True.
  480              Defaults to False.
 481  
 482      .. seealso::
 483  
 484          - :py:class:`mlflow.models.EvaluationMetric`
 485          - :py:func:`mlflow.evaluate`
 486      '''
 487      if name is None:
 488          if isinstance(eval_fn, FunctionType) and eval_fn.__name__ == "<lambda>":
 489              raise MlflowException(
 490                  "`name` must be specified if `eval_fn` is a lambda function.",
 491                  INVALID_PARAMETER_VALUE,
 492              )
 493          if not hasattr(eval_fn, "__name__"):
 494              raise MlflowException(
 495                  "`name` must be specified if `eval_fn` does not have a `__name__` attribute.",
 496                  INVALID_PARAMETER_VALUE,
 497              )
 498          name = eval_fn.__name__
 499  
 500      if "/" in name:
 501          raise MlflowException(
 502              f"Invalid metric name '{name}'. Metric names cannot include forward slashes ('/').",
 503              INVALID_PARAMETER_VALUE,
 504          )
 505  
 506      if not name.isidentifier():
 507          _logger.warning(
 508              f"The metric name '{name}' provided is not a valid Python identifier, which will "
 509              "prevent its use as a base metric for derived metrics. Please use a valid identifier "
 510              "to enable creation of derived metrics that use the given metric."
 511          )
 512  
 513      if keyword.iskeyword(name):
 514          _logger.warning(
 515              f"The metric name '{name}' is a reserved Python keyword, which will "
 516              "prevent its use as a base metric for derived metrics. Please use a valid identifier "
 517              "to enable creation of derived metrics that use the given metric."
 518          )
 519  
 520      if name in ["predictions", "targets", "metrics"]:
 521          _logger.warning(
 522              f"The metric name '{name}' is used as a special parameter in MLflow metrics, which "
 523              "will prevent its use as a base metric for derived metrics. Please use a different "
 524              "name to enable creation of derived metrics that use the given metric."
 525          )
 526  
 527      return _generate_eval_metric_class(eval_fn, require_strict_signature=require_strict_signature)(
 528          eval_fn=eval_fn,
 529          name=name,
 530          greater_is_better=greater_is_better,
 531          long_name=long_name,
 532          version=version,
 533          metric_details=metric_details,
 534          metric_metadata=metric_metadata,
 535          genai_metric_args=genai_metric_args,
 536      )
 537  
 538  
 539  @developer_stable
 540  class EvaluationArtifact(metaclass=ABCMeta):
 541      """
  542      A model evaluation artifact containing an artifact URI and content.
 543      """
 544  
 545      def __init__(self, uri, content=None):
 546          self._uri = uri
 547          self._content = content
 548  
 549      @abstractmethod
 550      def _load_content_from_file(self, local_artifact_path):
 551          """
  552          Abstract interface to load the content from the local artifact file path,
 553          and return the loaded content.
 554          """
 555  
 556      def _load(self, local_artifact_path=None):
 557          """
 558          If ``local_artifact_path`` is ``None``, download artifact from the artifact uri.
 559          Otherwise, load artifact content from the specified path. Assign the loaded content to
 560          ``self._content``, and return the loaded content.
 561          """
 562          if local_artifact_path is not None:
 563              self._content = self._load_content_from_file(local_artifact_path)
 564          else:
 565              with TempDir() as temp_dir:
 566                  temp_dir_path = temp_dir.path()
 567                  _download_artifact_from_uri(self._uri, temp_dir_path)
 568                  local_artifact_file = temp_dir.path(os.listdir(temp_dir_path)[0])
 569                  self._content = self._load_content_from_file(local_artifact_file)
 570          return self._content
 571  
 572      @abstractmethod
 573      def _save(self, output_artifact_path):
 574          """Save artifact content into specified path."""
 575  
 576      @property
 577      def content(self):
 578          """
 579          The content of the artifact (representation varies)
 580          """
 581          if self._content is None:
 582              self._load()
 583          return self._content
 584  
 585      @property
 586      def uri(self) -> str:
 587          """
 588          The URI of the artifact
 589          """
 590          return self._uri
 591  
 592      def __repr__(self):
 593          return f"{self.__class__.__name__}(uri='{self.uri}')"
 594  
 595  
 596  class EvaluationResult:
 597      """
 598      Represents the model evaluation outputs of a `mlflow.evaluate()` API call, containing
 599      both scalar metrics and output artifacts such as performance plots.
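
           For example (a sketch; the model URI and data below are placeholders):

           .. code-block:: python

               import mlflow
               import pandas as pd

               eval_df = pd.DataFrame({"feature": [1.0, 2.0, 3.0], "label": [0, 1, 1]})

               result = mlflow.evaluate(
                   "models:/my_model/1",  # hypothetical model URI
                   data=eval_df,
                   targets="label",
                   model_type="classifier",
               )
               print(result.metrics)  # e.g. {"accuracy_score": ..., "f1_score": ...}
               eval_table = result.tables.get("eval_results_table")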
 600      """
 601  
 602      def __init__(self, metrics, artifacts, run_id=None):
 603          self._metrics = metrics
 604          self._artifacts = artifacts
 605          self._run_id = (
 606              run_id
 607              if run_id is not None
 608              else (mlflow.active_run().info.run_id if mlflow.active_run() is not None else None)
 609          )
 610  
 611      @classmethod
 612      def load(cls, path):
 613          """Load the evaluation results from the specified local filesystem path"""
 614          with open(os.path.join(path, "metrics.json")) as fp:
 615              metrics = json.load(fp)
 616  
 617          with open(os.path.join(path, "artifacts_metadata.json")) as fp:
 618              artifacts_metadata = json.load(fp)
 619  
 620          artifacts = {}
 621  
 622          artifacts_dir = os.path.join(path, "artifacts")
 623  
 624          for artifact_name, meta in artifacts_metadata.items():
 625              uri = meta["uri"]
 626              ArtifactCls = _get_class_from_string(meta["class_name"])
 627              artifact = ArtifactCls(uri=uri)
 628              filename = pathlib.Path(urllib.parse.urlparse(uri).path).name
 629              artifact._load(os.path.join(artifacts_dir, filename))
 630              artifacts[artifact_name] = artifact
 631  
 632          return EvaluationResult(metrics=metrics, artifacts=artifacts)
 633  
 634      def save(self, path):
 635          """Write the evaluation results to the specified local filesystem path"""
 636          os.makedirs(path, exist_ok=True)
 637          with open(os.path.join(path, "metrics.json"), "w") as fp:
 638              json.dump(self.metrics, fp, cls=NumpyEncoder)
 639  
 640          artifacts_metadata = {
 641              artifact_name: {
 642                  "uri": artifact.uri,
 643                  "class_name": _get_fully_qualified_class_name(artifact),
 644              }
 645              for artifact_name, artifact in self.artifacts.items()
 646          }
 647          with open(os.path.join(path, "artifacts_metadata.json"), "w") as fp:
 648              json.dump(artifacts_metadata, fp)
 649  
 650          artifacts_dir = os.path.join(path, "artifacts")
 651          os.makedirs(artifacts_dir, exist_ok=True)
 652  
 653          for artifact in self.artifacts.values():
 654              filename = pathlib.Path(urllib.parse.urlparse(artifact.uri).path).name
 655              artifact._save(os.path.join(artifacts_dir, filename))
 656  
 657      @property
 658      def metrics(self) -> dict[str, Any]:
 659          """
 660          A dictionary mapping scalar metric names to scalar metric values
 661          """
 662          return self._metrics
 663  
 664      @property
 665      def artifacts(self) -> dict[str, "mlflow.models.EvaluationArtifact"]:
 666          """
 667          A dictionary mapping standardized artifact names (e.g. "roc_data") to
 668          artifact content and location information
 669          """
 670          return self._artifacts
 671  
 672      @property
 673      def run_id(self) -> str:
 674          """
 675          The ID of the MLflow Run to which the evaluation results were logged.
 676          """
 677          return self._run_id
 678  
 679      @property
 680      def tables(self) -> dict[str, "pd.DataFrame"]:
 681          """
 682          A dictionary mapping standardized artifact names (e.g. "eval_results_table") to
  683          corresponding table content as a pandas DataFrame.
 684          """
 685          eval_tables = {}
 686          if self._run_id is None:
 687              _logger.warning("Cannot load eval_results_table because run_id is not specified.")
 688              return eval_tables
 689  
 690          for table_name, table_path in self._artifacts.items():
 691              path = urllib.parse.urlparse(table_path.uri).path
  692              table_file_name = os.path.basename(path)
  693              try:
  694                  eval_tables[table_name] = mlflow.load_table(table_file_name, run_ids=[self._run_id])
  695              except Exception:
  696                  pass  # Swallow the exception since we assume it's not a table.
 697  
 698          return eval_tables
 699  
 700  
 701  @developer_stable
 702  class ModelEvaluator(metaclass=ABCMeta):
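           """
           The abstract interface implemented by model evaluators. Concrete evaluators are
           registered with MLflow's evaluator registry (built-in evaluators, or third-party
           evaluators exposed through a plugin entry point) and are selected via the
           ``evaluators`` argument of :py:func:`mlflow.evaluate`.

           A rough sketch of a custom evaluator (the class name and metric below are hypothetical):

           .. code-block:: python

               from mlflow.models.evaluation import EvaluationResult, ModelEvaluator


               class ExampleCountEvaluator(ModelEvaluator):
                   @classmethod
                   def can_evaluate(cls, *, model_type, evaluator_config, **kwargs) -> bool:
                       return model_type == "regressor"

                   def evaluate(self, *, model_type, dataset, run_id, evaluator_config, **kwargs):
                       # Compute metrics from ``dataset`` and return them as an EvaluationResult.
                       metrics = {"example_count": len(dataset.features_data)}
                       return EvaluationResult(metrics=metrics, artifacts={})
           """
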
 703      @classmethod
 704      @abstractmethod
 705      def can_evaluate(cls, *, model_type, evaluator_config, **kwargs) -> bool:
 706          """
 707          Args:
 708              model_type: A string describing the model type (e.g., "regressor", "classifier", …).
 709              evaluator_config: A dictionary of additional configurations for
 710                  the evaluator.
 711              kwargs: For forwards compatibility, a placeholder for additional arguments
 712                  that may be added to the evaluation interface in the future.
 713  
 714          Returns:
 715              True if the evaluator can evaluate the specified model on the
 716              specified dataset. False otherwise.
 717          """
 718  
 719      @abstractmethod
 720      def evaluate(
 721          self,
 722          *,
 723          model_type,
 724          dataset,
 725          run_id,
 726          evaluator_config,
 727          model=None,
 728          extra_metrics=None,
 729          custom_artifacts=None,
 730          predictions=None,
 731          **kwargs,
 732      ):
 733          """
 734          The abstract API to log metrics and artifacts, and return evaluation results.
 735  
 736          Args:
 737              model_type: A string describing the model type
 738                  (e.g., ``"regressor"``, ``"classifier"``, …).
 739              dataset: An instance of `mlflow.models.evaluation.base._EvaluationDataset`
 740                  containing features and labels (optional) for model evaluation.
 741              run_id: The ID of the MLflow Run to which to log results.
 742              evaluator_config: A dictionary of additional configurations for
 743                  the evaluator.
  744              model: A pyfunc model instance. If None, the model output is expected to be found in
 745                  ``dataset.predictions_data``.
 746              extra_metrics: A list of :py:class:`EvaluationMetric` objects.
 747              custom_artifacts: A list of callable custom artifact functions.
 748              predictions: The column name of the model output column that is used for evaluation.
 749                  This is only used when a model returns a pandas dataframe that contains
 750                  multiple columns.
 751              kwargs: For forwards compatibility, a placeholder for additional arguments that
 752                  may be added to the evaluation interface in the future.
 753  
 754          Returns:
 755              A :py:class:`mlflow.models.EvaluationResult` instance containing
 756              evaluation metrics and artifacts for the model.
 757          """
 758  
 759  
 760  def list_evaluators():
 761      """
  762      Return a list of the names of all available evaluators.
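
           A quick illustrative example (the exact names depend on the registered evaluator plugins):

           .. code-block:: python

               import mlflow

               mlflow.models.list_evaluators()  # e.g. ["default", "classifier", "regressor", ...]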
 763      """
  764      # import _model_evaluation_registry inside the function to avoid circular imports
 765      from mlflow.models.evaluation.evaluator_registry import _model_evaluation_registry
 766  
 767      return list(_model_evaluation_registry._registry.keys())
 768  
 769  
 770  @contextmanager
 771  def _start_run_or_reuse_active_run():
 772      """
  773      A context manager that:
  774       - yields the active run, if one exists;
  775       - otherwise starts a new MLflow run and yields it.
 777      """
 778      active_run = mlflow.active_run()
 779      if not active_run:
 780          # Note `mlflow.start_run` throws if `run_id` is not found.
 781          with mlflow.start_run() as run:
 782              yield run
 783      else:
 784          yield active_run
 785  
 786  
 787  # NB: We often pass around evaluator name, config, and its instance together. Ideally, the
  788  # evaluator class should have name and config as class attributes; however, it was not
 789  # designed that way. Adding them while keeping backward compatibility is not trivial.
 790  # So, we use a dataclass to bundle them together.
 791  @dataclass
 792  class EvaluatorBundle:
 793      name: str
 794      evaluator: ModelEvaluator
 795      config: dict[str, Any]
 796  
 797  
 798  def _resolve_default_evaluator(model_type, default_config) -> list[EvaluatorBundle]:
 799      """
 800      Determine which built-in evaluators should be used for the given model type by default.
 801  
  802      Previously, the MLflow evaluate API had only a single "default" evaluator used for all
  803      models, e.g., classifiers and regressors. We split it into multiple built-in evaluators for
  804      different model types for maintainability, but to preserve backward compatibility we need to
  805      map the "default" value provided by users to the correct built-in evaluators.
 806  
 807      Args:
 808          model_type: A string describing the model type (e.g., "regressor", "classifier", …).
 809          default_config: A dictionary of configurations for the "default" evaluator. If any
 810              non-default built-in evaluator is applicable, this config will be applied to them.
 811      """
 812      from mlflow.models.evaluation.evaluator_registry import _model_evaluation_registry
 813  
 814      builtin_evaluators = []
 815      for name in _model_evaluation_registry._registry:
 816          evaluator = _model_evaluation_registry.get_evaluator(name)
 817          if (
 818              name != "default"
 819              and _model_evaluation_registry.is_builtin(name)
 820              and evaluator.can_evaluate(model_type=model_type, evaluator_config=default_config)
 821          ):
 822              builtin_evaluators.append(EvaluatorBundle(name, evaluator, default_config))
 823  
 824      # We should use DefaultEvaluator only if there is no other built-in evaluator applicable.
 825      if not builtin_evaluators:
 826          default_evaluator = _model_evaluation_registry.get_evaluator("default")
 827          builtin_evaluators = [EvaluatorBundle("default", default_evaluator, default_config)]
 828  
 829      return builtin_evaluators
 830  
 831  
 832  def resolve_evaluators_and_configs(
 833      evaluators: str | list[str] | None,
 834      evaluator_config: dict[str, Any] | None,
 835      model_type: str | None = None,
 836  ) -> list[EvaluatorBundle]:
 837      """
 838      The `evaluators` and `evaluator_config` arguments of the `evaluate` API can be specified
 839      in multiple ways. This function normalizes the arguments into a single format for easier
 840      downstream processing.
 841  
 842      Args:
 843          evaluators: A string or a list of strings specifying the evaluators to use for model
 844              evaluation. If None, all available evaluators will be used.
 845          evaluator_config: A dictionary containing configuration items for the evaluators.
 846          model_type: A string describing the model type (e.g., "regressor", "classifier", …).
 847  
 848      Returns:
  849          A list of EvaluatorBundle objects, each holding an evaluator's name, instance, and config.
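
           For example (an illustrative sketch; the config values are placeholders), the following two
           calls resolve to the same built-in evaluator bundles for a classifier model:

           .. code-block:: python

               # A single evaluator name with a flat config dict ...
               resolve_evaluators_and_configs("default", {"pos_label": 1}, model_type="classifier")
               # ... is equivalent to the explicit list form with a nested config dict.
               resolve_evaluators_and_configs(["default"], {"default": {"pos_label": 1}}, "classifier")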
 850      """
 851      from mlflow.models.evaluation.evaluator_registry import _model_evaluation_registry as rg
 852  
 853      # NB: The `databricks-agents` package must be installed to use the 'databricks-agent' model
 854      # type. Ideally this check should be done in the 'databricks-agent' evaluator implementation,
 855      # but we need to do it here because the code won't reach the evaluator implementation if the
 856      # package is not installed.
 857      if model_type == _ModelType.DATABRICKS_AGENT:
 858          try:
 859              import databricks.agents  # noqa: F401
 860          except ImportError as e:
 861              raise MlflowException(
 862                  message="Databricks Agents SDK must be installed to use the "
 863                  f"`{_ModelType.DATABRICKS_AGENT}` model type. Run `pip install databricks-agents` "
 864                  "to install the package and try again.",
 865                  error_code=INVALID_PARAMETER_VALUE,
 866              ) from e
 867  
 868      def check_nesting_config_dict(_evaluator_name_list, _evaluator_name_to_conf_map):
 869          return isinstance(_evaluator_name_to_conf_map, dict) and all(
 870              k in _evaluator_name_list and isinstance(v, dict)
 871              for k, v in _evaluator_name_to_conf_map.items()
 872          )
 873  
 874      if evaluators is None:
 875          # If no evaluators are specified, use all available evaluators.
 876          evaluators = list(rg._registry.keys())
 877  
 878          evaluator_config = evaluator_config or {}
 879          if evaluator_config is not None and not any(
 880              name in evaluator_config for name in evaluators
 881          ):
  882              # If an evaluator config is passed but none of the available evaluator names
  883              # appear as keys in it, we assume the config is a flat dict that applies
  884              # globally to all evaluators.
 885              evaluator_config = dict.fromkeys(evaluators, evaluator_config)
 886  
 887          # Filter out evaluators that cannot evaluate the model type.
 888          resolved = []
 889          for name in evaluators:
 890              evaluator = rg.get_evaluator(name)
 891              config = evaluator_config.get(name, {})
 892              if evaluator.can_evaluate(model_type=model_type, evaluator_config=config):
 893                  resolved.append(EvaluatorBundle(name=name, evaluator=evaluator, config=config))
 894  
  895          # If any non-default built-in evaluator applies, skip the "default" evaluator.
 896          default = next((ev for ev in resolved if ev.name == "default"), None)
 897          non_default_builtins = [
 898              ev for ev in resolved if ev.name != "default" and rg.is_builtin(ev.name)
 899          ]
 900          if default and non_default_builtins:
 901              resolved.remove(default)
 902              # Apply default config (passed like `evaluator_config={"default": config}`) to
 903              # non-default built-in evaluators (e.g., ClassifierEvaluator) if they don't have
 904              # explicitly specified configs. This is for backward compatibility where we only
 905              # had a single "default" evaluator used for all models.
 906              # For example, if the user passes this for a classifier model:
 907              #     evaluator_config = {"default": my_config}
 908              # it should be equivalent to
 909              #    evaluator_config = {"classifier": my_config, "shap": my_config}
 910              for ev in non_default_builtins:
 911                  ev.config = ev.config or default.config
 912  
 913          return resolved
 914  
 915      elif isinstance(evaluators, str):
 916          # Single evaluator name specified
 917          if not (evaluator_config is None or isinstance(evaluator_config, dict)):
 918              raise MlflowException(
 919                  message="If `evaluators` argument is the name of an evaluator, evaluator_config"
 920                  " must be None or a dict containing config items for the evaluator.",
 921                  error_code=INVALID_PARAMETER_VALUE,
 922              )
 923  
 924          evaluator_config = evaluator_config or {}
 925          if evaluators == "default":
 926              # Previously we only had a single "default" evaluator used for all models.
 927              # We need to map "default" to the new dedicated builtin evaluators.
 928              return _resolve_default_evaluator(model_type, evaluator_config)
 929          elif rg.is_registered(evaluators):
 930              return [EvaluatorBundle(evaluators, rg.get_evaluator(evaluators), evaluator_config)]
 931          else:
 932              return []
 933  
 934      elif isinstance(evaluators, list):
 935          if evaluator_config is not None and not check_nesting_config_dict(
 936              evaluators, evaluator_config
 937          ):
 938              raise MlflowException(
 939                  message="If `evaluators` argument is an evaluator name list, evaluator_config "
 940                  "must be a dict containing mapping from evaluator name to individual "
 941                  "evaluator config dict.",
 942                  error_code=INVALID_PARAMETER_VALUE,
 943              )
 944          evaluator_config = evaluator_config or {}
 945  
 946          # Previously we only had a single "default" evaluator used for all models.
 947          # We need to map "default" to the new dedicated builtin evaluators.
 948          resolved = []
 949          for name in evaluators:
 950              config = evaluator_config.get(name, {})
 951              if name == "default":
 952                  builtin_evaluators = _resolve_default_evaluator(model_type, config)
 953                  resolved.extend(builtin_evaluators)
 954              else:
 955                  resolved.append(EvaluatorBundle(name, rg.get_evaluator(name), config))
 956          return resolved
 957      else:
 958          raise MlflowException(
 959              message="Invalid `evaluators` and `evaluator_config` arguments. "
 960              "Please refer to the documentation for correct usage.",
 961              error_code=INVALID_PARAMETER_VALUE,
 962          )
 963  
 964  
 965  def _model_validation_contains_model_comparison(validation_thresholds):
 966      """
 967      Helper function for determining if validation_thresholds contains
 968      thresholds for model comparison: either min_relative_change or min_absolute_change
 969      """
 970      if not validation_thresholds:
 971          return False
 972      thresholds = validation_thresholds.values()
 973      return any(
 974          threshold.min_relative_change or threshold.min_absolute_change for threshold in thresholds
 975      )
 976  
 977  
 978  _last_failed_evaluator = None
 979  
 980  
 981  def _get_last_failed_evaluator():
 982      """
  983      Return the name of the evaluator that last failed during a call to `evaluate`.
  984      This can be used to check which evaluator failed when the `evaluate` API fails.
 985      """
 986      return _last_failed_evaluator
 987  
 988  
 989  # DO NOT CHANGE THE ORDER OF THE ARGUMENTS
  990  # The order of the arguments needs to be preserved. You can add new arguments at the end
 991  # of the argument list, but do not change the order of the existing arguments.
 992  @record_usage_event(EvaluateEvent)
 993  def _evaluate(
 994      *,
 995      model,
 996      model_type,
 997      model_id,
 998      dataset,
 999      run_id,
1000      # The `evaluator_name_list` and `evaluator_name_to_conf_map` are not used by MLflow at all,
1001      # but we need to keep these for backward compatibility.
1002      evaluator_name_list,
1003      evaluator_name_to_conf_map,
1004      extra_metrics,
1005      custom_artifacts,
1006      predictions,
1007      evaluators,
1008  ):
1009      """
 1010      The public `evaluate` API verifies its arguments first, and then passes the normalized
 1011      arguments to this `_evaluate` method.
1012      """
1013      global _last_failed_evaluator
1014      _last_failed_evaluator = None
1015  
1016      client = MlflowClient()
1017  
1018      model_uuid = getattr(model, "metadata", None)
1019  
1020      if model_uuid is not None:
1021          model_uuid = model_uuid.model_uuid
1022          dataset._log_dataset_tag(client, run_id, model_uuid)
1023  
1024      eval_results = []
1025      should_enable_tracing = model is not None  # Do not enable tracing if static dataset is provided
1026      for eval_ in evaluators:
1027          _logger.debug(f"Evaluating the model with the {eval_.name} evaluator.")
1028          _last_failed_evaluator = eval_.name
1029          if eval_.evaluator.can_evaluate(model_type=model_type, evaluator_config=eval_.config):
1030              with configure_autologging_for_evaluation(enable_tracing=should_enable_tracing):
1031                  eval_result = eval_.evaluator.evaluate(
1032                      model=model,
1033                      model_type=model_type,
1034                      model_id=model_id,
1035                      dataset=dataset,
1036                      run_id=run_id,
1037                      evaluator_config=eval_.config,
1038                      extra_metrics=extra_metrics,
1039                      custom_artifacts=custom_artifacts,
1040                      predictions=predictions,
1041                  )
1042  
1043              if eval_result is not None:
1044                  eval_results.append(eval_result)
1045  
1046      _last_failed_evaluator = None
1047  
1048      if len(eval_results) == 0:
1049          raise MlflowException(
 1050              message="The model could not be evaluated by any of the registered evaluators. Please "
1051              "verify that the model type and other configs are set correctly.",
1052              error_code=INVALID_PARAMETER_VALUE,
1053          )
1054  
1055      merged_eval_result = EvaluationResult({}, {}, None)
1056  
1057      for eval_result in eval_results:
1058          merged_eval_result.metrics.update(eval_result.metrics)
1059          merged_eval_result.artifacts.update(eval_result.artifacts)
1060  
1061      return merged_eval_result
1062  
1063  
1064  def _get_model_from_function(fn):
1065      from mlflow.pyfunc.model import _PythonModelPyfuncWrapper
1066  
1067      class ModelFromFunction(mlflow.pyfunc.PythonModel):
1068          def predict(self, context, model_input: pd.DataFrame):
1069              return fn(model_input)
1070  
1071      python_model = ModelFromFunction()
1072      return _PythonModelPyfuncWrapper(python_model, None, None)
1073  
1074  
1075  def _is_model_deployment_endpoint_uri(model: Any) -> bool:
1076      if not isinstance(model, str):
1077          return False
1078  
1079      from mlflow.metrics.genai.model_utils import _parse_model_uri
1080  
1081      try:
1082          schema, path = _parse_model_uri(model)
1083          return schema in ["endpoints", "apps"]
1084      except MlflowException:
1085          return False
1086  
1087  
1088  def _get_model_from_deployment_endpoint_uri(
1089      endpoint_uri: str, params: dict[str, Any] | None = None
1090  ):
1091      from mlflow.metrics.genai.model_utils import _parse_model_uri
1092      from mlflow.pyfunc.model import ModelFromDeploymentEndpoint, _PythonModelPyfuncWrapper
1093  
1094      _, endpoint = _parse_model_uri(endpoint_uri)
1095      params = params or {}
1096  
1097      python_model = ModelFromDeploymentEndpoint(endpoint, params)
1098      return _PythonModelPyfuncWrapper(python_model, None, None)
1099  
1100  
1101  def evaluate(
1102      model=None,
1103      data=None,
1104      *,
1105      model_type=None,
1106      targets=None,
1107      predictions=None,
1108      dataset_path=None,
1109      feature_names=None,
1110      evaluators=None,
1111      evaluator_config=None,
1112      extra_metrics=None,
1113      custom_artifacts=None,
1114      env_manager="local",
1115      model_config=None,
1116      inference_params=None,
1117      model_id=None,
1118      _called_from_genai_evaluate=False,
1119  ):
1120      '''
1121      Evaluate the model performance on given data and selected metrics.
1122  
1123      This function evaluates a PyFunc model or custom callable on the specified dataset using
 1124      This function evaluates a PyFunc model or custom callable on the specified dataset using the
 1125      specified ``evaluators``, and logs the resulting metrics and artifacts to the MLflow tracking server.
1126      evaluation. For detailed information, please read
1127      `the Model Evaluation documentation <../../model-evaluation/index.html>`_.
1128  
1129      Default Evaluator behavior:
1130       - The default evaluator, which can be invoked with ``evaluators="default"`` or
 1131         ``evaluators=None``, supports the model types listed below. For each pre-defined model type,
 1132         the default evaluator evaluates your model on a selected set of metrics and generates
 1133         artifacts like plots. Please find more details below.
1134  
1135       - For both the ``"regressor"`` and ``"classifier"`` model types, the default evaluator
1136         generates model summary plots and feature importance plots using
1137         `SHAP <https://shap.readthedocs.io/en/latest/index.html>`_.
1138  
1139       - For regressor models, the default evaluator additionally logs:
1140          - **metrics**: example_count, mean_absolute_error, mean_squared_error,
1141            root_mean_squared_error, sum_on_target, mean_on_target, r2_score, max_error,
1142            mean_absolute_percentage_error.
1143  
1144       - For binary classifiers, the default evaluator additionally logs:
1145          - **metrics**: true_negatives, false_positives, false_negatives, true_positives, recall,
1146            precision, f1_score, accuracy_score, example_count, log_loss, roc_auc,
1147            precision_recall_auc.
1148          - **artifacts**: lift curve plot, precision-recall plot, ROC plot.
1149  
1150       - For multiclass classifiers, the default evaluator additionally logs:
1151          - **metrics**: accuracy_score, example_count, f1_score_micro, f1_score_macro, log_loss
 1152          - **artifacts**: A CSV file for "per_class_metrics" (per-class metrics include
1153            true_negatives/false_positives/false_negatives/true_positives/recall/precision/roc_auc,
1154            precision_recall_auc), precision-recall merged curves plot, ROC merged curves plot.
1155  
1156       - For question-answering models, the default evaluator logs:
 1157          - **metrics**: ``exact_match``, ``token_count``, `toxicity`_ (requires `evaluate`_, `torch`_,
 1158            `transformers`_), `flesch_kincaid_grade_level`_ (requires `textstat`_) and `ari_grade_level`_.
1159          - **artifacts**: A JSON file containing the inputs, outputs, targets (if the ``targets``
1160            argument is supplied), and per-row metrics of the model in tabular format.
1161  
1162          .. _toxicity:
1163              https://huggingface.co/spaces/evaluate-measurement/toxicity
1164  
1165          .. _torch:
1166              https://pytorch.org/get-started/locally/
1167  
1168          .. _transformers:
1169              https://huggingface.co/docs/transformers/installation
1170  
1171          .. _ari_grade_level:
1172              https://en.wikipedia.org/wiki/Automated_readability_index
1173  
1174          .. _flesch_kincaid_grade_level:
1175              https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch%E2%80%93Kincaid_grade_level
1176  
1177          .. _evaluate:
1178              https://pypi.org/project/evaluate
1179  
1180          .. _textstat:
1181              https://pypi.org/project/textstat
1182  
1183       - For text-summarization models, the default evaluator logs:
1184          - **metrics**: ``token_count``, `ROUGE`_ (requires `evaluate`_, `nltk`_, and
1185            `rouge_score`_ to be installed), `toxicity`_ (requires `evaluate`_, `torch`_,
1186            `transformers`_), `ari_grade_level`_ (requires `textstat`_),
1187            `flesch_kincaid_grade_level`_ (requires `textstat`_).
1188          - **artifacts**: A JSON file containing the inputs, outputs, targets (if the ``targets``
1189            argument is supplied), and per-row metrics of the model in the tabular format.
1190  
1191          .. _ROUGE:
1192              https://huggingface.co/spaces/evaluate-metric/rouge
1193  
1194          .. _toxicity:
1195              https://huggingface.co/spaces/evaluate-measurement/toxicity
1196  
1197          .. _torch:
1198              https://pytorch.org/get-started/locally/
1199  
1200          .. _transformers:
1201              https://huggingface.co/docs/transformers/installation
1202  
1203          .. _ari_grade_level:
1204              https://en.wikipedia.org/wiki/Automated_readability_index
1205  
1206          .. _flesch_kincaid_grade_level:
1207              https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch%E2%80%93Kincaid_grade_level
1208  
1209          .. _evaluate:
1210              https://pypi.org/project/evaluate
1211  
1212          .. _nltk:
1213              https://pypi.org/project/nltk
1214  
1215          .. _rouge_score:
1216              https://pypi.org/project/rouge-score
1217  
1218          .. _textstat:
1219              https://pypi.org/project/textstat
1220  
1221       - For text models, the default evaluator logs:
1222          - **metrics**: ``token_count``, `toxicity`_ (requires `evaluate`_, `torch`_,
1223            `transformers`_), `ari_grade_level`_ (requires `textstat`_),
1224            `flesch_kincaid_grade_level`_ (requires `textstat`_).
1225          - **artifacts**: A JSON file containing the inputs, outputs, targets (if the ``targets``
1226            argument is supplied), and per-row metrics of the model in tabular format.
1227  
1228          .. _evaluate:
1229              https://pypi.org/project/evaluate
1230  
1231          .. _toxicity:
1232              https://huggingface.co/spaces/evaluate-measurement/toxicity
1233  
1234          .. _torch:
1235              https://pytorch.org/get-started/locally/
1236  
1237          .. _transformers:
1238              https://huggingface.co/docs/transformers/installation
1239  
1240          .. _ari_grade_level:
1241              https://en.wikipedia.org/wiki/Automated_readability_index
1242  
1243          .. _flesch_kincaid_grade_level:
1244              https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch%E2%80%93Kincaid_grade_level
1245  
1246          .. _textstat:
1247              https://pypi.org/project/textstat
1248  
1249       - For retriever models, the default evaluator logs:
1250          - **metrics**: :mod:`precision_at_k(k) <mlflow.metrics.precision_at_k>`,
1251            :mod:`recall_at_k(k) <mlflow.metrics.recall_at_k>` and
1252            :mod:`ndcg_at_k(k) <mlflow.metrics.ndcg_at_k>` - all have a default value of
1253            ``retriever_k`` = 3.
1254          - **artifacts**: A JSON file containing the inputs, outputs, targets, and per-row metrics
1255            of the model in tabular format.
1256  
1257       - For sklearn models, the default evaluator additionally logs the model's evaluation criterion
1258         (e.g. mean accuracy for a classifier) computed by the `model.score` method.
1259  
1260       - The metrics/artifacts listed above are logged to the active MLflow run.
1261         If no active run exists, a new MLflow run is created for logging these metrics and
1262         artifacts.
1263  
1264       - Additionally, information about the specified dataset - hash, name (if specified), path
1265         (if specified), and the UUID of the model that evaluated it - is logged to the
1266         ``mlflow.datasets`` tag.
1267  
1268       - The available ``evaluator_config`` options for the default evaluator include:
1269          - **log_model_explainability**: A boolean value specifying whether or not to log model
1270            explainability insights. Default value is True.
1271          - **log_explainer**: If True, log the explainer used to compute model explainability
1272            insights as a model. Default value is False.
1273          - **explainability_algorithm**: A string to specify the SHAP Explainer algorithm for model
1274            explainability. Supported algorithms include: 'exact', 'permutation', 'partition',
1275            'kernel'.
1276            If not set, ``shap.Explainer`` is used with the "auto" algorithm, which chooses the best
1277            Explainer based on the model.
1278          - **explainability_nsamples**: The number of sample rows to use for computing model
1279            explainability insights. Default value is 2000.
1280          - **explainability_kernel_link**: The kernel link function used by shap kernel explainer.
1281            Available values are "identity" and "logit". Default value is "identity".
1282          - **max_classes_for_multiclass_roc_pr**:
1283            For multiclass classification tasks, the maximum number of classes for which to log
1284            the per-class ROC curve and Precision-Recall curve. If the number of classes is
1285            larger than the configured maximum, these curves are not logged.
1286          - **metric_prefix**: An optional prefix to prepend to the name of each metric and artifact
1287            produced during evaluation.
1288          - **log_metrics_with_dataset_info**: A boolean value specifying whether or not to include
1289            information about the evaluation dataset in the name of each metric logged to MLflow
1290            Tracking during evaluation. Default value is True.
1291          - **pos_label**: If specified, the positive label to use when computing classification
1292            metrics such as precision, recall, f1, etc. for binary classification models. For
1293            multiclass classification and regression models, this parameter will be ignored.
1294          - **average**: The averaging method to use when computing classification metrics such as
1295            precision, recall, f1, etc. for multiclass classification models
1296            (default: ``'weighted'``). For binary classification and regression models, this
1297            parameter will be ignored.
1298          - **sample_weights**: Weights for each sample to apply when computing model performance
1299            metrics.
1300          - **col_mapping**: A dictionary mapping column names in the input dataset or output
1301            predictions to column names used when invoking the evaluation functions.
1302          - **retriever_k**: A parameter used when ``model_type="retriever"`` as the number of
1303            top-ranked retrieved documents to use when computing the built-in metric
1304            :mod:`precision_at_k(k) <mlflow.metrics.precision_at_k>`,
1305            :mod:`recall_at_k(k) <mlflow.metrics.recall_at_k>` and
1306            :mod:`ndcg_at_k(k) <mlflow.metrics.ndcg_at_k>`. Default value is 3. For all other
1307            model types, this parameter will be ignored.
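
             A minimal sketch of combining a few of these options in a flat dictionary when only the
             default evaluator runs; the model URI, DataFrame contents, and column name below are
             hypothetical:

             .. code-block:: python
                 :caption: Example evaluator_config for the default evaluator

                 import mlflow
                 import pandas as pd

                 eval_df = pd.DataFrame(  # hypothetical queries and ground-truth document IDs
                     {"question": ["q1"], "relevant_docs": [["doc1", "doc2"]]}
                 )
                 results = mlflow.evaluate(
                     model="models:/my-retriever/1",  # hypothetical model URI
                     data=eval_df,
                     targets="relevant_docs",
                     model_type="retriever",
                     evaluator_config={"retriever_k": 5, "metric_prefix": "eval_"},
                 )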
1308  
1309       - Limitations of evaluation dataset:
1310          - For classification tasks, dataset labels are used to infer the total number of classes.
1311          - For binary classification tasks, the negative label value must be 0 or -1 or False, and
1312            the positive label value must be 1 or True.
1313  
1314       - Limitations of metrics/artifacts computation:
1315          - For classification tasks, some metric and artifact computations require the model to
1316            output class probabilities. Currently, for scikit-learn models, the default evaluator
1317            calls the ``predict_proba`` method on the underlying model to obtain probabilities. For
1318            other model types, the default evaluator does not compute metrics/artifacts that require
1319            probability outputs.
1320  
1321       - Limitations of default evaluator logging model explainability insights:
1322          - The ``shap.Explainer`` ``auto`` algorithm uses the ``Linear`` explainer for linear models
1323            and the ``Tree`` explainer for tree models. Because SHAP's ``Linear`` and ``Tree``
1324            explainers do not support multi-class classification, the default evaluator falls back to
1325            using the ``Exact`` or ``Permutation`` explainers for multi-class classification tasks.
1326          - Logging model explainability insights is not currently supported for PySpark models.
1327          - The evaluation dataset label values must be numeric or boolean, all feature values
1328            must be numeric, and each feature column must only contain scalar values.
1329  
1330       - Limitations when environment restoration is enabled:
1331          - When environment restoration is enabled for the evaluated model (i.e. a non-local
1332            ``env_manager`` is specified), the model is loaded as a client that invokes an MLflow
1333            Model Scoring Server process in an independent Python environment with the model's
1334            training time dependencies installed. As such, methods like ``predict_proba`` (for
1335            probability outputs) or ``score`` (computes the evaluation criterion for sklearn models)
1336            of the model become inaccessible and the default evaluator does not compute metrics or
1337            artifacts that require those methods.
1338          - Because the model is an MLflow Model Server process, SHAP explanations are slower to
1339            compute. As such, model explainability is disabled when a non-local ``env_manager`` is
1340            specified, unless the ``evaluator_config`` option **log_model_explainability** is
1341            explicitly set to ``True``.
1342  
1343      Args:
1344          model: Optional. If specified, it should be one of the following:
1345  
1346              - A pyfunc model instance
1347              - A URI referring to a pyfunc model
1348              - A URI referring to an MLflow Deployments endpoint e.g. ``"endpoints:/my-chat"``
1349              - A callable function: This function should be able to take in model input and
1350                return predictions. It should follow the signature of the
1351                :py:func:`predict <mlflow.pyfunc.PyFuncModel.predict>` method. Here's an example
1352                of a valid function:
1353  
1354                .. code-block:: python
1355  
1356                    model = mlflow.pyfunc.load_model(model_uri)
1357  
1358  
1359                    def fn(model_input):
1360                        return model.predict(model_input)
1361  
1362              If omitted, it indicates a static dataset will be used for evaluation instead of a
1363              model.  In this case, the ``data`` argument must be a Pandas DataFrame or an mlflow
1364              PandasDataset that contains model outputs, and the ``predictions`` argument must be the
1365              name of the column in ``data`` that contains model outputs.
1366  
1367          data: One of the following:
1369  
1370              - A numpy array or list of evaluation features, excluding labels.
1371              - A Pandas DataFrame containing evaluation features, labels, and optionally model
1372                  outputs. Model outputs must be provided when ``model`` is unspecified.
1373                  If the ``feature_names`` argument is not specified, all columns except for the
1374                  label column and predictions column are regarded as feature columns. Otherwise,
1375                  only column names present in ``feature_names`` are regarded as feature columns.
1376              - A Spark DataFrame containing evaluation features and labels. If the
1377                  ``feature_names`` argument is not specified, all columns except for the label
1378                  column are regarded as feature columns. Otherwise, only column names present in
1379                  ``feature_names`` are regarded as feature columns. Only the first 10000 rows in
1380                  the Spark DataFrame will be used as evaluation data.
1381              - A :py:class:`mlflow.data.dataset.Dataset` instance containing evaluation
1382                  features, labels, and optionally model outputs. Model outputs are only supported
1383                  with a PandasDataset. Model outputs are required when model is unspecified, and
1384                  should be specified via the ``predictions`` property of the PandasDataset.
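
                  A minimal sketch of passing an MLflow Dataset built with ``mlflow.data.from_pandas``;
                  the DataFrame contents and column name below are hypothetical:

                  .. code-block:: python
                      :caption: Example usage of an MLflow Dataset

                      import mlflow
                      import pandas as pd

                      df = pd.DataFrame({"x": [1.0, 0.0, 1.0], "y": [1, 0, 1]})  # hypothetical data
                      dataset = mlflow.data.from_pandas(df, targets="y")
                      results = mlflow.evaluate(
                          model="models:/my-model/1",  # hypothetical model URI
                          data=dataset,
                          model_type="classifier",
                      )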
1385  
1386          model_type: (Optional) A string describing the model type. The default evaluator
1387              supports the following model types:
1388  
1389              - ``'classifier'``
1390              - ``'regressor'``
1391              - ``'question-answering'``
1392              - ``'text-summarization'``
1393              - ``'text'``
1394              - ``'retriever'``
1395  
1396              If no ``model_type`` is specified, then you must provide a list of
1397              metrics to compute via the ``extra_metrics`` param.
1398  
1399              .. note::
1400                  ``'question-answering'``, ``'text-summarization'``, ``'text'``, and
1401                  ``'retriever'`` are experimental and may be changed or removed in a
1402                  future release.
1403  
1404          targets: If ``data`` is a numpy array or list, a numpy array or list of evaluation
1405              labels. If ``data`` is a DataFrame, the string name of a column from ``data``
1406              that contains evaluation labels. Required for classifier and regressor models,
1407              but optional for question-answering, text-summarization, and text models. If
1408              ``data`` is a :py:class:`mlflow.data.dataset.Dataset` that defines targets,
1409              then ``targets`` is optional.
1410  
1411          predictions: Optional. The name of the column that contains model outputs.
1412  
1413              - When ``model`` is specified and outputs multiple columns, ``predictions`` can be used
1414                to specify the name of the column that will be used to store model outputs for
1415                evaluation.
1416              - When ``model`` is not specified and ``data`` is a pandas dataframe,
1417                ``predictions`` can be used to specify the name of the column in ``data`` that
1418                contains model outputs.
1419  
1420              .. code-block:: python
1421                  :caption: Example usage of predictions
1422  
1423                  # Evaluate a model that outputs multiple columns
1424                  data = pd.DataFrame({"question": ["foo"]})
1425  
1426  
1427                  def model(inputs):
1428                      return pd.DataFrame({"answer": ["bar"], "source": ["baz"]})
1429  
1430  
1431                  results = evaluate(
1432                      model=model,
1433                      data=data,
1434                      predictions="answer",
1435                      # other arguments if needed
1436                  )
1437  
1438                  # Evaluate a static dataset
1439                  data = pd.DataFrame({"question": ["foo"], "answer": ["bar"], "source": ["baz"]})
1440                  results = evaluate(
1441                      data=data,
1442                      predictions="answer",
1443                      # other arguments if needed
1444                  )
1445          dataset_path: (Optional) The path where the data is stored. Must not contain double
1446              quotes (``"``). If specified, the path is logged to the ``mlflow.datasets``
1447              tag for lineage tracking purposes.
1448  
1449          feature_names: (Optional) A list. If the ``data`` argument is a numpy array or list,
1450              ``feature_names`` is a list of the feature names for each feature. If
1451              ``feature_names=None``, then the ``feature_names`` are generated using the
1452              format ``feature_{feature_index}``. If the ``data`` argument is a Pandas
1453              DataFrame or a Spark DataFrame, ``feature_names`` is a list of the names
1454              of the feature columns in the DataFrame. If ``feature_names=None``, then
1455              all columns except the label column and the predictions column are
1456              regarded as feature columns.
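
                  For example, a rough sketch with numpy data; the feature values, labels, model URI,
                  and feature names below are hypothetical:

                  .. code-block:: python
                      :caption: Example usage of feature_names

                      import mlflow
                      import numpy as np

                      X = np.array([[25, 50000.0], [40, 72000.0]])  # hypothetical features
                      y = [0, 1]  # hypothetical labels
                      results = mlflow.evaluate(
                          model="models:/my-model/1",  # hypothetical model URI
                          data=X,
                          targets=y,
                          model_type="classifier",
                          feature_names=["age", "income"],
                      )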
1457  
1458          evaluators: The name of the evaluator to use for model evaluation, or a list of
1459              evaluator names. If unspecified, all evaluators capable of evaluating the
1460              specified model on the specified dataset are used. The default evaluator
1461              can be referred to by the name ``"default"``. To see all available
1462              evaluators, call :py:func:`mlflow.models.list_evaluators`.
1463  
1464          evaluator_config: A dictionary of additional configurations to supply to the evaluator.
1465              If multiple evaluators are specified, each configuration should be
1466              supplied as a nested dictionary whose key is the evaluator name.
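
                  For example, a sketch of the nested form when an evaluator is named explicitly,
                  following the ``...`` placeholder convention used elsewhere in this docstring; the
                  config values are illustrative:

                  .. code-block:: python
                      :caption: Example usage of evaluator_config

                      mlflow.evaluate(
                          ...,
                          evaluators=["default"],
                          evaluator_config={"default": {"metric_prefix": "test_", "pos_label": 1}},
                      )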
1467  
1468          extra_metrics:
1469              (Optional) A list of :py:class:`EvaluationMetric <mlflow.models.EvaluationMetric>`
1470              objects.  These metrics are computed in addition to the default metrics associated with
1471              pre-defined `model_type`, and setting `model_type=None` will only compute the metrics
1472              specified in `extra_metrics`. See the `mlflow.metrics` module for more information about
1473              the builtin metrics and how to define extra metrics.
1474  
1475              .. code-block:: python
1476                  :caption: Example usage of extra metrics
1477  
1478                  import mlflow
1479                  import numpy as np
1480  
1481  
1482                  def root_mean_squared_error(eval_df, _builtin_metrics):
1483                      return np.sqrt((np.abs(eval_df["prediction"] - eval_df["target"]) ** 2).mean())
1484  
1485  
1486                  rmse_metric = mlflow.models.make_metric(
1487                      eval_fn=root_mean_squared_error,
1488                      greater_is_better=False,
1489                  )
1490                  mlflow.evaluate(..., extra_metrics=[rmse_metric])
1491  
1492          custom_artifacts:
1493              (Optional) A list of custom artifact functions with the following signature:
1494  
1495              .. code-block:: python
1496  
1497                  def custom_artifact(
1498                      eval_df: Union[pandas.DataFrame, pyspark.sql.DataFrame],
1499                      builtin_metrics: Dict[str, float],
1500                      artifacts_dir: str,
1501                  ) -> Dict[str, Any]:
1502                      """
1503                      Args:
1504                          eval_df:
1505                              A Pandas or Spark DataFrame containing ``prediction`` and ``target``
1506                              column.  The ``prediction`` column contains the predictions made by the
1507                              model.  The ``target`` column contains the corresponding labels to the
1508                              predictions made on that row.
1509                          builtin_metrics:
1510                              A dictionary containing the metrics calculated by the default evaluator.
1511                              The keys are the names of the metrics and the values are the scalar
1512                              values of the metrics. Refer to the DefaultEvaluator behavior section
1513                              for what metrics will be returned based on the type of model (i.e.
1514                              classifier or regressor).
1515                          artifacts_dir:
1516                              A temporary directory path that can be used by the custom artifacts
1517                              function to temporarily store produced artifacts. The directory will be
1518                              deleted after the artifacts are logged.
1519  
1520                      Returns:
1521                          A dictionary that maps artifact names to artifact objects
1522                          (e.g. a Matplotlib Figure) or to artifact paths within ``artifacts_dir``.
1523                      """
1524                      ...
1525  
1526              Object types that artifacts can be represented as:
1527  
1528                  - A string uri representing the file path to the artifact. MLflow will infer the
1529                    type of the artifact based on the file extension.
1530                  - A string representation of a JSON object. This will be saved as a .json artifact.
1531                  - Pandas DataFrame. This will be resolved as a CSV artifact.
1532                  - Numpy array. This will be saved as a .npy artifact.
1533                  - Matplotlib Figure. This will be saved as an image artifact. Note that
1534                    ``matplotlib.pyplot.savefig`` is called behind the scenes with default
1535                    configurations.
1536                    To customize, either save the figure with the desired configurations and return
1537                    its file path or define customizations through environment variables in
1538                    ``matplotlib.rcParams``.
1539                  - For other objects, MLflow attempts to pickle them with the default protocol.
1540  
1541              .. code-block:: python
1542                  :caption: Example usage of custom artifacts
1543  
1544                  import os
1545                  import matplotlib.pyplot as plt
                      import mlflow
1546  
1547  
1548                  def scatter_plot(eval_df, builtin_metrics, artifacts_dir):
1549                      plt.scatter(eval_df["prediction"], eval_df["target"])
1550                      plt.xlabel("Targets")
1551                      plt.ylabel("Predictions")
1552                      plt.title("Targets vs. Predictions")
1553                      plt.savefig(os.path.join(artifacts_dir, "example.png"))
1554                      plt.close()
1555                      return {"pred_target_scatter": os.path.join(artifacts_dir, "example.png")}
1556  
1557  
1558                  def pred_sample(eval_df, _builtin_metrics, _artifacts_dir):
1559                      return {"pred_sample": pred_sample.head(10)}
1560  
1561  
1562                  mlflow.evaluate(..., custom_artifacts=[scatter_plot, pred_sample])
1563  
1564          env_manager: Specify an environment manager to load the candidate ``model`` in an
1565              isolated Python environment and restore its dependencies. Default value is
1566              ``local``, and the following values are supported:
1567              supported:
1568  
1569              - ``virtualenv``: (Recommended) Use virtualenv to restore the python
1570                environment that was used to train the model.
1571              - ``conda``:  Use Conda to restore the software environment that was used
1572                to train the model.
1573              - ``local``: Use the current Python environment for model inference, which
1574                may differ from the environment used to train the model and may lead to
1575                errors or invalid predictions.
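
                  For instance, a sketch of restoring the model's training-time environment. Note that
                  ``model`` must be a URI string here (a non-local ``env_manager`` cannot be used with an
                  in-memory model object); the URI and data below are hypothetical:

                  .. code-block:: python
                      :caption: Example usage of env_manager

                      results = mlflow.evaluate(
                          model="runs:/<run_id>/model",  # hypothetical model URI
                          data=eval_df,  # hypothetical evaluation DataFrame
                          targets="label",  # hypothetical label column
                          model_type="classifier",
                          env_manager="virtualenv",
                      )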
1576  
1577          model_config: The model configuration to use for loading the model with pyfunc. Inspect
1578              the model's pyfunc flavor to know which keys are supported for your
1579              specific model. If not indicated, the default model configuration
1580              from the model is used (if any).
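
                  A sketch only; the configuration key below is hypothetical (valid keys depend on the
                  model's pyfunc flavor), and ``model`` must be a URI or callable here because
                  ``model_config`` cannot be combined with an already-loaded ``PyFuncModel``:

                  .. code-block:: python
                      :caption: Example usage of model_config

                      results = mlflow.evaluate(
                          model="models:/my-llm/1",  # hypothetical model URI
                          data=eval_df,  # hypothetical evaluation DataFrame
                          model_type="text",
                          model_config={"max_new_tokens": 64},  # hypothetical key
                      )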
1581  
1582          inference_params: (Optional) A dictionary of inference parameters to be passed to the model
1583              when making predictions, such as ``{"max_tokens": 100}``. This is only used when
1584              the ``model`` is an MLflow Deployments endpoint URI e.g. ``"endpoints:/my-chat"``
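
                  A minimal sketch with a Deployments endpoint (the endpoint name mirrors the example
                  above; the prompt data and column name are hypothetical):

                  .. code-block:: python
                      :caption: Example usage of inference_params

                      import mlflow
                      import pandas as pd

                      eval_df = pd.DataFrame({"inputs": ["What is MLflow?"]})  # hypothetical prompts
                      results = mlflow.evaluate(
                          model="endpoints:/my-chat",
                          data=eval_df,
                          model_type="question-answering",
                          inference_params={"max_tokens": 100},
                      )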
1585  
1586          model_id: (Optional) The ID of the MLflow LoggedModel or Model Version to which the
1587              evaluation results (e.g. metrics and traces) will be linked. If `model_id` is not
1588              specified but `model` is specified, the ID from `model` will be used.
1589  
1590          _called_from_genai_evaluate: (Optional) Only used internally.
1591  
1592      Returns:
1593          An :py:class:`mlflow.models.EvaluationResult` instance containing
1594          metrics of evaluating the model with the given dataset.
1595      '''
1596      from mlflow.models.evaluation.evaluator_registry import _model_evaluation_registry
1597      from mlflow.pyfunc import PyFuncModel, _load_model_or_server, _ServedPyFuncModel
1598      from mlflow.utils import env_manager as _EnvManager
1599  
1600      # Inference params are currently only supported for passing a deployment endpoint as the model.
1601      # TODO: We should support inference_params for other model types
1602  
1603      if inference_params is not None and not _is_model_deployment_endpoint_uri(model):
1604          raise MlflowException(
1605              message="The inference_params argument can only be specified when the model "
1606              "is an MLflow Deployments endpoint URI like `endpoints:/my-chat`",
1607              error_code=INVALID_PARAMETER_VALUE,
1608          )
1609  
1610      if evaluator_config is not None:
1611          col_mapping = evaluator_config.get("col_mapping", {})
1612  
1613          if isinstance(targets, str):
1614              targets = col_mapping.get(targets, targets)
1615  
1616          if isinstance(predictions, str):
1617              predictions = col_mapping.get(predictions, predictions)
1618  
1619      if data is None:
1620          raise MlflowException(
1621              message="The data argument cannot be None.",
1622              error_code=INVALID_PARAMETER_VALUE,
1623          )
1624  
1625      _EnvManager.validate(env_manager)
1626  
1627      # If Dataset is provided, the targets can only be specified by the Dataset,
1628      # not the targets parameters of the mlflow.evaluate() API.
1629      if isinstance(data, Dataset) and targets is not None:
1630          raise MlflowException(
1631              message="The top-level targets parameter should not be specified since a Dataset "
1632              "is used. Please only specify the targets column name in the Dataset. For example: "
1633              "`data = mlflow.data.from_pandas(df=X.assign(y=y), targets='y')`. "
1634              "Meanwhile, please specify `mlflow.evaluate(..., targets=None, ...)`.",
1635              error_code=INVALID_PARAMETER_VALUE,
1636          )
1637      # If Dataset is provided and model is None, then the predictions can only be specified by the
1638      # Dataset, not the predictions parameters of the mlflow.evaluate() API.
1639      if isinstance(data, Dataset) and model is None and predictions is not None:
1640          raise MlflowException(
1641              message="The top-level predictions parameter should not be specified since a Dataset "
1642              "is used. Please only specify the predictions column name in the Dataset. For example:"
1643              " `data = mlflow.data.from_pandas(df=X.assign(y=y), predictions='y')`"
1644              "Meanwhile, please specify `mlflow.evaluate(..., predictions=None, ...)`.",
1645              error_code=INVALID_PARAMETER_VALUE,
1646          )
1647      # If Dataset is provided and model is specified, then the data.predictions cannot be specified.
1648      if (
1649          isinstance(data, Dataset)
1650          and model is not None
1651          and getattr(data, "predictions", None) is not None
1652      ):
1653          raise MlflowException(
1654              message="The predictions parameter should not be specified in the Dataset since a "
1655              "model is specified. Please remove the predictions column from the Dataset.",
1656              error_code=INVALID_PARAMETER_VALUE,
1657          )
1658  
1659      if model_type in [_ModelType.REGRESSOR, _ModelType.CLASSIFIER]:
1660          if isinstance(data, Dataset):
1661              if getattr(data, "targets", None) is not None:
1662                  targets = data.targets
1663              else:
1664                  raise MlflowException(
1665                      message="The targets column name must be specified in the provided Dataset "
1666                      f"for {model_type} models. For example: "
1667                      "`data = mlflow.data.from_pandas(df=X.assign(y=y), targets='y')`",
1668                      error_code=INVALID_PARAMETER_VALUE,
1669                  )
1670          else:
1671              if targets is None:
1672                  raise MlflowException(
1673                      f"The targets argument must be specified for {model_type} models.",
1674                      error_code=INVALID_PARAMETER_VALUE,
1675                  )
1676      elif model_type is None:
1677          if not extra_metrics:
1678              raise MlflowException(
1679                  message="The extra_metrics argument must be specified model_type is None.",
1680                  error_code=INVALID_PARAMETER_VALUE,
1681              )
1682  
1683      specified_model_id = model_id
1684      model_id = None
1685      if isinstance(model, str):
1686          model_id = _parse_model_id_if_present(model)
1687          if _is_model_deployment_endpoint_uri(model):
1688              model = _get_model_from_deployment_endpoint_uri(model, inference_params)
1689          else:
1690              model = _load_model_or_server(model, env_manager, model_config)
1691      elif env_manager != _EnvManager.LOCAL:
1692          raise MlflowException(
1693              message="The model argument must be a string URI referring to an MLflow model when a "
1694              "non-local env_manager is specified.",
1695              error_code=INVALID_PARAMETER_VALUE,
1696          )
1697      elif isinstance(model, PyFuncModel):
1698          model_id = model.model_id
1699          if model_config:
1700              raise MlflowException(
1701                  message="Indicating ``model_config`` when passing a `PyFuncModel`` object as "
1702                  "model argument is not allowed. If you need to change the model configuration "
1703                  "for the evaluation model, use "
1704                  "``mlflow.pyfunc.load_model(model_uri, model_config=<value>)`` and indicate "
1705                  "the desired configuration there.",
1706                  error_code=INVALID_PARAMETER_VALUE,
1707              )
1708      elif callable(model):
1709          model = _get_model_from_function(model)
1710      elif model is not None:
1711          raise MlflowException(
1712              message="The model argument must be a string URI referring to an MLflow model, "
1713              "an MLflow Deployments endpoint URI, an instance of `mlflow.pyfunc.PyFuncModel`, "
1714              "a function, or None.",
1715              error_code=INVALID_PARAMETER_VALUE,
1716          )
1717  
1718      # If model_id is specified, verify it matches the derived model_id
1719      if specified_model_id is not None and model_id is not None and specified_model_id != model_id:
1720          raise MlflowException(
1721              message=(
1722                  f"The specified value of the 'model_id' parameter '{specified_model_id}' "
1723                  f"contradicts the model_id '{model_id}' associated with the model. Please ensure "
1724                  f"they match or omit the 'model_id' parameter."
1725              ),
1726              error_code=INVALID_PARAMETER_VALUE,
1727          )
1728  
1729      # Use specified model_id if provided, otherwise use derived model_id
1730      model_id = specified_model_id if specified_model_id is not None else model_id
1731      # If neither model_id nor model is specified, use the active model_id
1732      model_id = model_id or mlflow.get_active_model_id()
1733  
1734      evaluators: list[EvaluatorBundle] = resolve_evaluators_and_configs(
1735          evaluators, evaluator_config, model_type
1736      )
1737  
1738      # NB: MLflow does not use either of these two variables. However, we need to pass them to
1739      # the _evaluate() function for backward compatibility.
1740      evaluator_name_list = [evaluator.name for evaluator in evaluators]
1741      evaluator_name_to_conf_map = {evaluator.name: evaluator.config for evaluator in evaluators}
1742  
1743      with _start_run_or_reuse_active_run() as run:
1744          if not isinstance(data, Dataset):
1745              # Convert data to `mlflow.data.dataset.Dataset`.
1746              if model is None:
1747                  data = convert_data_to_mlflow_dataset(
1748                      data=data, targets=targets, predictions=predictions
1749                  )
1750              else:
1751                  data = convert_data_to_mlflow_dataset(data=data, targets=targets)
1752  
1753          from mlflow.data.pyfunc_dataset_mixin import PyFuncConvertibleDatasetMixin
1754  
1755          # model_id could be None
1756          with _set_active_model(model_id=model_id) if model_id else nullcontext():
1757              if isinstance(data, Dataset) and issubclass(
1758                  data.__class__, PyFuncConvertibleDatasetMixin
1759              ):
1760                  dataset = data.to_evaluation_dataset(dataset_path, feature_names)
1761  
1762                  # Use metric_prefix configured for builtin evaluators as a dataset tag
1763                  context = None
1764                  for e in evaluators:
1765                      if _model_evaluation_registry.is_builtin(e.name) and e.config.get(
1766                          "metric_prefix"
1767                      ):
1768                          context = e.config.get("metric_prefix")
1769                          break
1770  
1771                  client = MlflowClient()
1772                  tags = [InputTag(key=MLFLOW_DATASET_CONTEXT, value=context)] if context else []
1773                  dataset_input = DatasetInput(dataset=data._to_mlflow_entity(), tags=tags)
1774                  client.log_inputs(
1775                      run.info.run_id,
1776                      [dataset_input],
1777                      models=[LoggedModelInput(model_id)] if model_id else None,
1778                  )
1779              else:
1780                  dataset = EvaluationDataset(
1781                      data,
1782                      targets=targets,
1783                      path=dataset_path,
1784                      feature_names=feature_names,
1785                      predictions=predictions,
1786                  )
1787              predictions_expected_in_model_output = predictions if model is not None else None
1788  
1789              try:
1790                  evaluate_result = _evaluate(
1791                      model=model,
1792                      model_type=model_type,
1793                      model_id=model_id,
1794                      dataset=dataset,
1795                      run_id=run.info.run_id,
1796                      evaluator_name_list=evaluator_name_list,
1797                      evaluator_name_to_conf_map=evaluator_name_to_conf_map,
1798                      extra_metrics=extra_metrics,
1799                      custom_artifacts=custom_artifacts,
1800                      predictions=predictions_expected_in_model_output,
1801                      evaluators=evaluators,
1802                  )
1803              finally:
1804                  if isinstance(model, _ServedPyFuncModel):
1805                      os.kill(model.pid, signal.SIGTERM)
1806  
1807              # If model_id is specified, log metrics to the eval run and the logged model
1808              if model_id is not None:
1809                  mlflow.log_metrics(metrics=evaluate_result.metrics, dataset=data, model_id=model_id)
1810  
1811      return evaluate_result