base.py
1 import inspect 2 import json 3 import keyword 4 import logging 5 import os 6 import pathlib 7 import signal 8 import urllib.parse 9 from abc import ABCMeta, abstractmethod 10 from contextlib import contextmanager, nullcontext 11 from dataclasses import dataclass 12 from inspect import Parameter, Signature 13 from types import FunctionType 14 from typing import Any 15 16 import mlflow 17 from mlflow.data.dataset import Dataset 18 from mlflow.data.evaluation_dataset import ( 19 EvaluationDataset, 20 convert_data_to_mlflow_dataset, 21 ) 22 from mlflow.entities.dataset_input import DatasetInput 23 from mlflow.entities.input_tag import InputTag 24 from mlflow.entities.logged_model_input import LoggedModelInput 25 from mlflow.exceptions import MlflowException 26 from mlflow.models.evaluation.utils.trace import configure_autologging_for_evaluation 27 from mlflow.protos.databricks_pb2 import INVALID_PARAMETER_VALUE 28 from mlflow.store.artifact.utils.models import _parse_model_id_if_present 29 from mlflow.telemetry.events import EvaluateEvent 30 from mlflow.telemetry.track import record_usage_event 31 from mlflow.tracking.artifact_utils import _download_artifact_from_uri 32 from mlflow.tracking.client import MlflowClient 33 from mlflow.tracking.fluent import _set_active_model 34 from mlflow.utils import _get_fully_qualified_class_name 35 from mlflow.utils.annotations import developer_stable 36 from mlflow.utils.class_utils import _get_class_from_string 37 from mlflow.utils.file_utils import TempDir 38 from mlflow.utils.mlflow_tags import MLFLOW_DATASET_CONTEXT 39 from mlflow.utils.proto_json_utils import NumpyEncoder 40 41 try: 42 # `pandas` is not required for `mlflow-skinny`. 43 import pandas as pd 44 except ImportError: 45 pass 46 47 _logger = logging.getLogger(__name__) 48 49 50 class _ModelType: 51 REGRESSOR = "regressor" 52 CLASSIFIER = "classifier" 53 QUESTION_ANSWERING = "question-answering" 54 TEXT_SUMMARIZATION = "text-summarization" 55 TEXT = "text" 56 RETRIEVER = "retriever" 57 # This model type is used for Mosaic AI Agent evaluation and only available in Databricks 58 # https://docs.databricks.com/en/generative-ai/agent-evaluation/index.html 59 DATABRICKS_AGENT = "databricks-agent" 60 61 def __init__(self): 62 raise NotImplementedError("This class is not meant to be instantiated.") 63 64 @classmethod 65 def values(cls): 66 return ( 67 cls.REGRESSOR, 68 cls.CLASSIFIER, 69 cls.QUESTION_ANSWERING, 70 cls.TEXT_SUMMARIZATION, 71 cls.TEXT, 72 cls.RETRIEVER, 73 ) 74 75 76 class EvaluationMetric: 77 ''' 78 An evaluation metric. 79 80 Args: 81 eval_fn: A function that computes the metric with the following signature: 82 83 .. code-block:: python 84 85 def eval_fn( 86 predictions: pandas.Series, 87 targets: pandas.Series, 88 metrics: Dict[str, MetricValue], 89 **kwargs, 90 ) -> Union[float, MetricValue]: 91 """ 92 Args: 93 predictions: A pandas Series containing the predictions made by the model. 94 targets: (Optional) A pandas Series containing the corresponding labels 95 for the predictions made on that input. 96 metrics: (Optional) A dictionary containing the metrics calculated by the 97 default evaluator. The keys are the names of the metrics and the values 98 are the metric values. To access the MetricValue for the metrics 99 calculated by the system, make sure to specify the type hint for this 100 parameter as Dict[str, MetricValue]. Refer to the DefaultEvaluator 101 behavior section for what metrics will be returned based on the type of 102 model (i.e. classifier or regressor). 
103 kwargs: Includes a list of args that are used to compute the metric. These 104 args could be information coming from input data, model outputs, 105 other metrics, or parameters specified in the `evaluator_config` 106 argument of the `mlflow.evaluate` API. 107 108 Returns: MetricValue with per-row scores, per-row justifications, and aggregate 109 results. 110 """ 111 ... 112 113 name: The name of the metric. 114 greater_is_better: Whether a greater value of the metric is better. 115 long_name: (Optional) The long name of the metric. For example, 116 ``"root_mean_squared_error"`` for ``"mse"``. 117 version: (Optional) The metric version. For example ``v1``. 118 metric_details: (Optional) A description of the metric and how it is calculated. 119 metric_metadata: (Optional) A dictionary containing metadata for the metric. 120 genai_metric_args: (Optional) A dictionary containing arguments specified by users 121 when calling make_genai_metric or make_genai_metric_from_prompt. Those args 122 are persisted so that we can deserialize the same metric object later. 123 ''' 124 125 def __init__( 126 self, 127 eval_fn, 128 name, 129 greater_is_better, 130 long_name=None, 131 version=None, 132 metric_details=None, 133 metric_metadata=None, 134 genai_metric_args=None, 135 ): 136 self.eval_fn = eval_fn 137 self.name = name 138 self.greater_is_better = greater_is_better 139 self.long_name = long_name or name 140 self.version = version 141 self.metric_details = metric_details 142 self.metric_metadata = metric_metadata 143 self.genai_metric_args = genai_metric_args 144 145 def __str__(self): 146 parts = [f"name={self.name}, greater_is_better={self.greater_is_better}"] 147 148 if self.long_name: 149 parts.append(f"long_name={self.long_name}") 150 if self.version: 151 parts.append(f"version={self.version}") 152 if self.metric_details: 153 parts.append(f"metric_details={self.metric_details}") 154 if self.metric_metadata: 155 parts.append(f"metric_metadata={self.metric_metadata}") 156 157 return "EvaluationMetric(" + ", ".join(parts) + ")" 158 159 160 # NB: we need this function because we cannot modify the signature of 161 # a class's __call__ method after the class has been defined. 162 # This is also useful to distinguish between the metric signatures with different eval_fn signatures 163 def _generate_eval_metric_class(eval_fn, require_strict_signature=False): 164 """ 165 Dynamically generate a GenAIEvaluationMetric class that can be used to evaluate the metric 166 on the given input data. The generated class is callable with a __call__ method that 167 takes the arguments specified in the signature of the eval_fn function. 168 169 Args: 170 eval_fn: the evaluation function of the EvaluationMetric. 171 require_strict_signature: (Optional) Whether the eval_fn needs to follow a strict signature. 172 If True, then the eval_fn must follow below signature: 173 174 .. code-block:: python 175 176 def eval_fn( 177 predictions: "pd.Series", 178 metrics: Dict[str, MetricValue], 179 inputs: "pd.Series", 180 *args, 181 ) -> MetricValue: 182 pass 183 184 When generating a metric from `make_genai_metric`, this should be set to True. 185 Default to False. 186 187 Returns: 188 A dynamically generated callable CallableEvaluationMetric class. 
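
    An illustrative usage sketch (``answer_similarity`` is a hypothetical metric created with
    ``make_genai_metric``; the extra keyword arguments depend on the specific ``eval_fn``
    signature):

    .. code-block:: python

        # Metrics built through the strict-signature path are called with keyword-only
        # arguments and return a ``MetricValue``.
        result = answer_similarity(
            predictions=["MLflow is an open source MLOps platform."],
            inputs=["What is MLflow?"],
            targets=["MLflow is an open source platform for the ML lifecycle."],
        )
        print(result.aggregate_results)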
189 """ 190 from mlflow.metrics.base import MetricValue 191 192 if require_strict_signature: 193 allowed_kwargs_names = [ 194 param_name 195 for param_name in inspect.signature(eval_fn).parameters.keys() 196 if param_name not in ["predictions", "metrics", "inputs"] 197 ] 198 199 def genai_call_method( 200 self, 201 *, 202 predictions: pd.Series | str | list[str], 203 inputs: pd.Series | str | list[str], 204 metrics: dict[str, MetricValue] | None = None, 205 **kwargs, 206 ) -> MetricValue: 207 if missed_kwargs := set(allowed_kwargs_names) - set(kwargs.keys()): 208 raise MlflowException.invalid_parameter_value( 209 f"Missing required arguments: {missed_kwargs}", 210 ) 211 if extra_kwargs := set(kwargs.keys()) - set(allowed_kwargs_names): 212 raise MlflowException.invalid_parameter_value( 213 f"Unexpected arguments: {extra_kwargs}", 214 ) 215 return self.eval_fn( 216 _convert_val_to_pd_Series(predictions, "predictions"), 217 metrics or {}, 218 _convert_val_to_pd_Series(inputs, "inputs"), 219 # Note: based on https://github.com/mlflow/mlflow/blob/4fef77afdbe4d76302cb0b1aad2bd72b5cde64e9/mlflow/metrics/genai/genai_metric.py#L49-L53 220 # the extra params passed https://github.com/mlflow/mlflow/blob/4fef77afdbe4d76302cb0b1aad2bd72b5cde64e9/mlflow/metrics/genai/genai_metric.py#L513 221 # should always be pandas Series 222 *[ 223 _convert_val_to_pd_Series(kwargs[arg_name], arg_name) 224 for arg_name in allowed_kwargs_names 225 ], 226 ) 227 228 genai_call_method.__signature__ = Signature( 229 parameters=[ 230 Parameter("self", Parameter.POSITIONAL_OR_KEYWORD), 231 Parameter( 232 "predictions", 233 Parameter.KEYWORD_ONLY, 234 annotation=pd.Series | str | list[str], 235 ), 236 Parameter( 237 "inputs", 238 Parameter.KEYWORD_ONLY, 239 annotation=pd.Series | str | list[str], 240 ), 241 Parameter( 242 "metrics", 243 Parameter.KEYWORD_ONLY, 244 annotation=dict[str, MetricValue] | None, 245 default=None, 246 ), 247 *[ 248 Parameter(name, Parameter.KEYWORD_ONLY, annotation=pd.Series | str | list[str]) 249 for name in allowed_kwargs_names 250 ], 251 ] 252 ) 253 genai_call_method.__doc__ = f""" 254 Evaluate the metric on the given inputs and predictions. 255 Note: only keyword arguments are supported. 256 257 Args: 258 predictions: predictions made by the model. 259 inputs: inputs used to make the predictions. 260 metrics: metrics calculated by the default evaluator. 261 kwargs: additional arguments used to compute the metric. 262 Required arguments: {allowed_kwargs_names} 263 264 Returns: 265 evaluation result as MetricValue object. 266 """ 267 call_method = genai_call_method 268 269 else: 270 271 def _call_method( 272 self, 273 **kwargs, 274 ) -> MetricValue: 275 return self.eval_fn(**kwargs) 276 277 allowed_kwargs_params = inspect.signature(eval_fn).parameters 278 _call_method.__signature__ = Signature( 279 parameters=[ 280 Parameter("self", Parameter.POSITIONAL_OR_KEYWORD), 281 *[ 282 Parameter( 283 name, 284 Parameter.KEYWORD_ONLY, 285 annotation=allowed_kwargs_params[name].annotation, 286 ) 287 for name in allowed_kwargs_params.keys() 288 ], 289 ] 290 ) 291 _call_method.__doc__ = f""" 292 Evaluate the metric on the given inputs and predictions. 293 Note: only keyword arguments are supported. 294 295 Args: 296 kwargs: additional arguments used to compute the metric. 297 Required arguments: {list(allowed_kwargs_params.keys())} 298 299 Returns: 300 evaluation result as MetricValue object. 
301 """ 302 call_method = _call_method 303 304 return type( 305 "CallableEvaluationMetric", 306 (EvaluationMetric,), 307 {"__call__": call_method}, 308 ) 309 310 311 def _convert_val_to_pd_Series(val, name): 312 if val is not None and not isinstance(val, pd.Series): 313 if isinstance(val, str): 314 return pd.Series([val]) 315 elif isinstance(val, list): 316 return pd.Series(val) 317 else: 318 raise TypeError( 319 f"Expected {name} to be a string, list, or Pandas Series, got {type(val)}" 320 ) 321 return val 322 323 324 def make_metric( 325 *, 326 eval_fn, 327 greater_is_better, 328 name=None, 329 long_name=None, 330 version=None, 331 metric_details=None, 332 metric_metadata=None, 333 genai_metric_args=None, 334 ): 335 ''' 336 A factory function to create an :py:class:`EvaluationMetric` object. 337 338 Args: 339 eval_fn: A function that computes the metric with the following signature: 340 341 .. code-block:: python 342 343 def eval_fn( 344 predictions: pandas.Series, 345 targets: pandas.Series, 346 metrics: Dict[str, MetricValue], 347 **kwargs, 348 ) -> Union[float, MetricValue]: 349 """ 350 Args: 351 predictions: A pandas Series containing the predictions made by the model. 352 targets: (Optional) A pandas Series containing the corresponding labels 353 for the predictions made on that input. 354 metrics: (Optional) A dictionary containing the metrics calculated by the 355 default evaluator. The keys are the names of the metrics and the values 356 are the metric values. To access the MetricValue for the metrics 357 calculated by the system, make sure to specify the type hint for this 358 parameter as Dict[str, MetricValue]. Refer to the DefaultEvaluator 359 behavior section for what metrics will be returned based on the type of 360 model (i.e. classifier or regressor). kwargs: Includes a list of args 361 that are used to compute the metric. These args could information coming 362 from input data, model outputs or parameters specified in the 363 `evaluator_config` argument of the `mlflow.evaluate` API. 364 kwargs: Includes a list of args that are used to compute the metric. These 365 args could be information coming from input data, model outputs, 366 other metrics, or parameters specified in the `evaluator_config` 367 argument of the `mlflow.evaluate` API. 368 369 Returns: MetricValue with per-row scores, per-row justifications, and aggregate 370 results. 371 """ 372 ... 373 374 greater_is_better: Whether a greater value of the metric is better. 375 name: The name of the metric. This argument must be specified if ``eval_fn`` is a lambda 376 function or the ``eval_fn.__name__`` attribute is not available. 377 long_name: (Optional) The long name of the metric. For example, ``"mean_squared_error"`` 378 for ``"mse"``. 379 version: (Optional) The metric version. For example ``v1``. 380 metric_details: (Optional) A description of the metric and how it is calculated. 381 metric_metadata: (Optional) A dictionary containing metadata for the metric. 382 genai_metric_args: (Optional) A dictionary containing arguments specified by users 383 when calling make_genai_metric or make_genai_metric_from_prompt. Those args 384 are persisted so that we can deserialize the same metric object later. 385 386 .. 
seealso:: 387 388 - :py:class:`mlflow.models.EvaluationMetric` 389 - :py:func:`mlflow.evaluate` 390 ''' 391 return _make_metric( 392 eval_fn=eval_fn, 393 greater_is_better=greater_is_better, 394 name=name, 395 long_name=long_name, 396 version=version, 397 metric_details=metric_details, 398 metric_metadata=metric_metadata, 399 genai_metric_args=genai_metric_args, 400 require_strict_signature=False, 401 ) 402 403 404 def _make_metric( 405 *, 406 eval_fn, 407 greater_is_better, 408 name=None, 409 long_name=None, 410 version=None, 411 metric_details=None, 412 metric_metadata=None, 413 genai_metric_args=None, 414 require_strict_signature=False, 415 ): 416 ''' 417 A factory function to create an :py:class:`EvaluationMetric` object. 418 419 Args: 420 eval_fn: A function that computes the metric with the following signature: 421 422 .. code-block:: python 423 424 def eval_fn( 425 predictions: pandas.Series, 426 targets: pandas.Series, 427 metrics: Dict[str, MetricValue], 428 **kwargs, 429 ) -> Union[float, MetricValue]: 430 """ 431 Args: 432 predictions: A pandas Series containing the predictions made by the model. 433 targets: (Optional) A pandas Series containing the corresponding labels 434 for the predictions made on that input. 435 metrics: (Optional) A dictionary containing the metrics calculated by the 436 default evaluator. The keys are the names of the metrics and the values 437 are the metric values. To access the MetricValue for the metrics 438 calculated by the system, make sure to specify the type hint for this 439 parameter as Dict[str, MetricValue]. Refer to the DefaultEvaluator 440 behavior section for what metrics will be returned based on the type of 441 model (i.e. classifier or regressor). kwargs: Includes a list of args 442 that are used to compute the metric. These args could information coming 443 from input data, model outputs or parameters specified in the 444 `evaluator_config` argument of the `mlflow.evaluate` API. 445 kwargs: Includes a list of args that are used to compute the metric. These 446 args could be information coming from input data, model outputs, 447 other metrics, or parameters specified in the `evaluator_config` 448 argument of the `mlflow.evaluate` API. 449 450 Returns: MetricValue with per-row scores, per-row justifications, and aggregate 451 results. 452 """ 453 ... 454 455 greater_is_better: Whether a greater value of the metric is better. 456 name: The name of the metric. This argument must be specified if ``eval_fn`` is a lambda 457 function or the ``eval_fn.__name__`` attribute is not available. 458 long_name: (Optional) The long name of the metric. For example, ``"mean_squared_error"`` 459 for ``"mse"``. 460 version: (Optional) The metric version. For example ``v1``. 461 metric_details: (Optional) A description of the metric and how it is calculated. 462 metric_metadata: (Optional) A dictionary containing metadata for the metric. 463 genai_metric_args: (Optional) A dictionary containing arguments specified by users 464 when calling make_genai_metric or make_genai_metric_from_prompt. Those args 465 are persisted so that we can deserialize the same metric object later. 466 require_strict_signature: (Optional) Whether the eval_fn needs to follow a strict signature. 467 If True, then the eval_fn must follow below signature: 468 469 .. 
code-block:: python 470 471 def eval_fn( 472 predictions: "pd.Series", 473 metrics: Dict[str, MetricValue], 474 inputs: "pd.Series", 475 *args, 476 ) -> MetricValue: 477 pass 478 479 When generating a metric from `make_genai_metric`, this should be set to True. 480 Default to False. 481 482 .. seealso:: 483 484 - :py:class:`mlflow.models.EvaluationMetric` 485 - :py:func:`mlflow.evaluate` 486 ''' 487 if name is None: 488 if isinstance(eval_fn, FunctionType) and eval_fn.__name__ == "<lambda>": 489 raise MlflowException( 490 "`name` must be specified if `eval_fn` is a lambda function.", 491 INVALID_PARAMETER_VALUE, 492 ) 493 if not hasattr(eval_fn, "__name__"): 494 raise MlflowException( 495 "`name` must be specified if `eval_fn` does not have a `__name__` attribute.", 496 INVALID_PARAMETER_VALUE, 497 ) 498 name = eval_fn.__name__ 499 500 if "/" in name: 501 raise MlflowException( 502 f"Invalid metric name '{name}'. Metric names cannot include forward slashes ('/').", 503 INVALID_PARAMETER_VALUE, 504 ) 505 506 if not name.isidentifier(): 507 _logger.warning( 508 f"The metric name '{name}' provided is not a valid Python identifier, which will " 509 "prevent its use as a base metric for derived metrics. Please use a valid identifier " 510 "to enable creation of derived metrics that use the given metric." 511 ) 512 513 if keyword.iskeyword(name): 514 _logger.warning( 515 f"The metric name '{name}' is a reserved Python keyword, which will " 516 "prevent its use as a base metric for derived metrics. Please use a valid identifier " 517 "to enable creation of derived metrics that use the given metric." 518 ) 519 520 if name in ["predictions", "targets", "metrics"]: 521 _logger.warning( 522 f"The metric name '{name}' is used as a special parameter in MLflow metrics, which " 523 "will prevent its use as a base metric for derived metrics. Please use a different " 524 "name to enable creation of derived metrics that use the given metric." 525 ) 526 527 return _generate_eval_metric_class(eval_fn, require_strict_signature=require_strict_signature)( 528 eval_fn=eval_fn, 529 name=name, 530 greater_is_better=greater_is_better, 531 long_name=long_name, 532 version=version, 533 metric_details=metric_details, 534 metric_metadata=metric_metadata, 535 genai_metric_args=genai_metric_args, 536 ) 537 538 539 @developer_stable 540 class EvaluationArtifact(metaclass=ABCMeta): 541 """ 542 A model evaluation artifact containing an artifact uri and content. 543 """ 544 545 def __init__(self, uri, content=None): 546 self._uri = uri 547 self._content = content 548 549 @abstractmethod 550 def _load_content_from_file(self, local_artifact_path): 551 """ 552 Abstract interface to load the content from local artifact file path, 553 and return the loaded content. 554 """ 555 556 def _load(self, local_artifact_path=None): 557 """ 558 If ``local_artifact_path`` is ``None``, download artifact from the artifact uri. 559 Otherwise, load artifact content from the specified path. Assign the loaded content to 560 ``self._content``, and return the loaded content. 
561 """ 562 if local_artifact_path is not None: 563 self._content = self._load_content_from_file(local_artifact_path) 564 else: 565 with TempDir() as temp_dir: 566 temp_dir_path = temp_dir.path() 567 _download_artifact_from_uri(self._uri, temp_dir_path) 568 local_artifact_file = temp_dir.path(os.listdir(temp_dir_path)[0]) 569 self._content = self._load_content_from_file(local_artifact_file) 570 return self._content 571 572 @abstractmethod 573 def _save(self, output_artifact_path): 574 """Save artifact content into specified path.""" 575 576 @property 577 def content(self): 578 """ 579 The content of the artifact (representation varies) 580 """ 581 if self._content is None: 582 self._load() 583 return self._content 584 585 @property 586 def uri(self) -> str: 587 """ 588 The URI of the artifact 589 """ 590 return self._uri 591 592 def __repr__(self): 593 return f"{self.__class__.__name__}(uri='{self.uri}')" 594 595 596 class EvaluationResult: 597 """ 598 Represents the model evaluation outputs of a `mlflow.evaluate()` API call, containing 599 both scalar metrics and output artifacts such as performance plots. 600 """ 601 602 def __init__(self, metrics, artifacts, run_id=None): 603 self._metrics = metrics 604 self._artifacts = artifacts 605 self._run_id = ( 606 run_id 607 if run_id is not None 608 else (mlflow.active_run().info.run_id if mlflow.active_run() is not None else None) 609 ) 610 611 @classmethod 612 def load(cls, path): 613 """Load the evaluation results from the specified local filesystem path""" 614 with open(os.path.join(path, "metrics.json")) as fp: 615 metrics = json.load(fp) 616 617 with open(os.path.join(path, "artifacts_metadata.json")) as fp: 618 artifacts_metadata = json.load(fp) 619 620 artifacts = {} 621 622 artifacts_dir = os.path.join(path, "artifacts") 623 624 for artifact_name, meta in artifacts_metadata.items(): 625 uri = meta["uri"] 626 ArtifactCls = _get_class_from_string(meta["class_name"]) 627 artifact = ArtifactCls(uri=uri) 628 filename = pathlib.Path(urllib.parse.urlparse(uri).path).name 629 artifact._load(os.path.join(artifacts_dir, filename)) 630 artifacts[artifact_name] = artifact 631 632 return EvaluationResult(metrics=metrics, artifacts=artifacts) 633 634 def save(self, path): 635 """Write the evaluation results to the specified local filesystem path""" 636 os.makedirs(path, exist_ok=True) 637 with open(os.path.join(path, "metrics.json"), "w") as fp: 638 json.dump(self.metrics, fp, cls=NumpyEncoder) 639 640 artifacts_metadata = { 641 artifact_name: { 642 "uri": artifact.uri, 643 "class_name": _get_fully_qualified_class_name(artifact), 644 } 645 for artifact_name, artifact in self.artifacts.items() 646 } 647 with open(os.path.join(path, "artifacts_metadata.json"), "w") as fp: 648 json.dump(artifacts_metadata, fp) 649 650 artifacts_dir = os.path.join(path, "artifacts") 651 os.makedirs(artifacts_dir, exist_ok=True) 652 653 for artifact in self.artifacts.values(): 654 filename = pathlib.Path(urllib.parse.urlparse(artifact.uri).path).name 655 artifact._save(os.path.join(artifacts_dir, filename)) 656 657 @property 658 def metrics(self) -> dict[str, Any]: 659 """ 660 A dictionary mapping scalar metric names to scalar metric values 661 """ 662 return self._metrics 663 664 @property 665 def artifacts(self) -> dict[str, "mlflow.models.EvaluationArtifact"]: 666 """ 667 A dictionary mapping standardized artifact names (e.g. 
"roc_data") to 668 artifact content and location information 669 """ 670 return self._artifacts 671 672 @property 673 def run_id(self) -> str: 674 """ 675 The ID of the MLflow Run to which the evaluation results were logged. 676 """ 677 return self._run_id 678 679 @property 680 def tables(self) -> dict[str, "pd.DataFrame"]: 681 """ 682 A dictionary mapping standardized artifact names (e.g. "eval_results_table") to 683 corresponding table content as pandas DataFrame. 684 """ 685 eval_tables = {} 686 if self._run_id is None: 687 _logger.warning("Cannot load eval_results_table because run_id is not specified.") 688 return eval_tables 689 690 for table_name, table_path in self._artifacts.items(): 691 path = urllib.parse.urlparse(table_path.uri).path 692 table_fileName = os.path.basename(path) 693 try: 694 eval_tables[table_name] = mlflow.load_table(table_fileName, run_ids=[self._run_id]) 695 except Exception: 696 pass # Swallow the exception since we assume its not a table. 697 698 return eval_tables 699 700 701 @developer_stable 702 class ModelEvaluator(metaclass=ABCMeta): 703 @classmethod 704 @abstractmethod 705 def can_evaluate(cls, *, model_type, evaluator_config, **kwargs) -> bool: 706 """ 707 Args: 708 model_type: A string describing the model type (e.g., "regressor", "classifier", …). 709 evaluator_config: A dictionary of additional configurations for 710 the evaluator. 711 kwargs: For forwards compatibility, a placeholder for additional arguments 712 that may be added to the evaluation interface in the future. 713 714 Returns: 715 True if the evaluator can evaluate the specified model on the 716 specified dataset. False otherwise. 717 """ 718 719 @abstractmethod 720 def evaluate( 721 self, 722 *, 723 model_type, 724 dataset, 725 run_id, 726 evaluator_config, 727 model=None, 728 extra_metrics=None, 729 custom_artifacts=None, 730 predictions=None, 731 **kwargs, 732 ): 733 """ 734 The abstract API to log metrics and artifacts, and return evaluation results. 735 736 Args: 737 model_type: A string describing the model type 738 (e.g., ``"regressor"``, ``"classifier"``, …). 739 dataset: An instance of `mlflow.models.evaluation.base._EvaluationDataset` 740 containing features and labels (optional) for model evaluation. 741 run_id: The ID of the MLflow Run to which to log results. 742 evaluator_config: A dictionary of additional configurations for 743 the evaluator. 744 model: A pyfunc model instance. If None, the model output is supposed to be found in 745 ``dataset.predictions_data``. 746 extra_metrics: A list of :py:class:`EvaluationMetric` objects. 747 custom_artifacts: A list of callable custom artifact functions. 748 predictions: The column name of the model output column that is used for evaluation. 749 This is only used when a model returns a pandas dataframe that contains 750 multiple columns. 751 kwargs: For forwards compatibility, a placeholder for additional arguments that 752 may be added to the evaluation interface in the future. 753 754 Returns: 755 A :py:class:`mlflow.models.EvaluationResult` instance containing 756 evaluation metrics and artifacts for the model. 757 """ 758 759 760 def list_evaluators(): 761 """ 762 Return a name list for all available Evaluators. 
763 """ 764 # import _model_evaluation_registry inside function to avoid circuit importing 765 from mlflow.models.evaluation.evaluator_registry import _model_evaluation_registry 766 767 return list(_model_evaluation_registry._registry.keys()) 768 769 770 @contextmanager 771 def _start_run_or_reuse_active_run(): 772 """ 773 A manager context return: 774 - If there's an active run, return the active run. 775 - otherwise start a mflow run with the specified run_id, 776 if specified run_id is None, start a new run. 777 """ 778 active_run = mlflow.active_run() 779 if not active_run: 780 # Note `mlflow.start_run` throws if `run_id` is not found. 781 with mlflow.start_run() as run: 782 yield run 783 else: 784 yield active_run 785 786 787 # NB: We often pass around evaluator name, config, and its instance together. Ideally, the 788 # evaluator class should have name and config as class attributes, however, it was not 789 # designed that way. Adding them while keeping backward compatibility is not trivial. 790 # So, we use a dataclass to bundle them together. 791 @dataclass 792 class EvaluatorBundle: 793 name: str 794 evaluator: ModelEvaluator 795 config: dict[str, Any] 796 797 798 def _resolve_default_evaluator(model_type, default_config) -> list[EvaluatorBundle]: 799 """ 800 Determine which built-in evaluators should be used for the given model type by default. 801 802 Previously, MLflow evaluate API only had a single "default" evaluator used for all models like 803 classifier, regressor, etc. We split it into multiple built-in evaluators for different model 804 types for maintainability, but in order to maintain backward compatibility, we need to map 805 the "default" provided by users to the correct built-in evaluators. 806 807 Args: 808 model_type: A string describing the model type (e.g., "regressor", "classifier", …). 809 default_config: A dictionary of configurations for the "default" evaluator. If any 810 non-default built-in evaluator is applicable, this config will be applied to them. 811 """ 812 from mlflow.models.evaluation.evaluator_registry import _model_evaluation_registry 813 814 builtin_evaluators = [] 815 for name in _model_evaluation_registry._registry: 816 evaluator = _model_evaluation_registry.get_evaluator(name) 817 if ( 818 name != "default" 819 and _model_evaluation_registry.is_builtin(name) 820 and evaluator.can_evaluate(model_type=model_type, evaluator_config=default_config) 821 ): 822 builtin_evaluators.append(EvaluatorBundle(name, evaluator, default_config)) 823 824 # We should use DefaultEvaluator only if there is no other built-in evaluator applicable. 825 if not builtin_evaluators: 826 default_evaluator = _model_evaluation_registry.get_evaluator("default") 827 builtin_evaluators = [EvaluatorBundle("default", default_evaluator, default_config)] 828 829 return builtin_evaluators 830 831 832 def resolve_evaluators_and_configs( 833 evaluators: str | list[str] | None, 834 evaluator_config: dict[str, Any] | None, 835 model_type: str | None = None, 836 ) -> list[EvaluatorBundle]: 837 """ 838 The `evaluators` and `evaluator_config` arguments of the `evaluate` API can be specified 839 in multiple ways. This function normalizes the arguments into a single format for easier 840 downstream processing. 841 842 Args: 843 evaluators: A string or a list of strings specifying the evaluators to use for model 844 evaluation. If None, all available evaluators will be used. 845 evaluator_config: A dictionary containing configuration items for the evaluators. 
846 model_type: A string describing the model type (e.g., "regressor", "classifier", …). 847 848 Returns: 849 A list of EvaluatorBundle that contains name, evaluator, config for each evaluator. 850 """ 851 from mlflow.models.evaluation.evaluator_registry import _model_evaluation_registry as rg 852 853 # NB: The `databricks-agents` package must be installed to use the 'databricks-agent' model 854 # type. Ideally this check should be done in the 'databricks-agent' evaluator implementation, 855 # but we need to do it here because the code won't reach the evaluator implementation if the 856 # package is not installed. 857 if model_type == _ModelType.DATABRICKS_AGENT: 858 try: 859 import databricks.agents # noqa: F401 860 except ImportError as e: 861 raise MlflowException( 862 message="Databricks Agents SDK must be installed to use the " 863 f"`{_ModelType.DATABRICKS_AGENT}` model type. Run `pip install databricks-agents` " 864 "to install the package and try again.", 865 error_code=INVALID_PARAMETER_VALUE, 866 ) from e 867 868 def check_nesting_config_dict(_evaluator_name_list, _evaluator_name_to_conf_map): 869 return isinstance(_evaluator_name_to_conf_map, dict) and all( 870 k in _evaluator_name_list and isinstance(v, dict) 871 for k, v in _evaluator_name_to_conf_map.items() 872 ) 873 874 if evaluators is None: 875 # If no evaluators are specified, use all available evaluators. 876 evaluators = list(rg._registry.keys()) 877 878 evaluator_config = evaluator_config or {} 879 if evaluator_config is not None and not any( 880 name in evaluator_config for name in evaluators 881 ): 882 # If evaluator config is passed but any of available evaluator key is not 883 # in the evaluator config, we assume the evaluator config to be a flat dict, 884 # which is globally applied to all evaluators. 885 evaluator_config = dict.fromkeys(evaluators, evaluator_config) 886 887 # Filter out evaluators that cannot evaluate the model type. 888 resolved = [] 889 for name in evaluators: 890 evaluator = rg.get_evaluator(name) 891 config = evaluator_config.get(name, {}) 892 if evaluator.can_evaluate(model_type=model_type, evaluator_config=config): 893 resolved.append(EvaluatorBundle(name=name, evaluator=evaluator, config=config)) 894 895 # If any of built-in evaluator can apply, skip "default" evaluator. 896 default = next((ev for ev in resolved if ev.name == "default"), None) 897 non_default_builtins = [ 898 ev for ev in resolved if ev.name != "default" and rg.is_builtin(ev.name) 899 ] 900 if default and non_default_builtins: 901 resolved.remove(default) 902 # Apply default config (passed like `evaluator_config={"default": config}`) to 903 # non-default built-in evaluators (e.g., ClassifierEvaluator) if they don't have 904 # explicitly specified configs. This is for backward compatibility where we only 905 # had a single "default" evaluator used for all models. 
            # For example, if the user passes this for a classifier model:
            #     evaluator_config = {"default": my_config}
            # it should be equivalent to
            #     evaluator_config = {"classifier": my_config, "shap": my_config}
            for ev in non_default_builtins:
                ev.config = ev.config or default.config

        return resolved

    elif isinstance(evaluators, str):
        # Single evaluator name specified
        if not (evaluator_config is None or isinstance(evaluator_config, dict)):
            raise MlflowException(
                message="If `evaluators` argument is the name of an evaluator, evaluator_config"
                " must be None or a dict containing config items for the evaluator.",
                error_code=INVALID_PARAMETER_VALUE,
            )

        evaluator_config = evaluator_config or {}
        if evaluators == "default":
            # Previously we only had a single "default" evaluator used for all models.
            # We need to map "default" to the new dedicated builtin evaluators.
            return _resolve_default_evaluator(model_type, evaluator_config)
        elif rg.is_registered(evaluators):
            return [EvaluatorBundle(evaluators, rg.get_evaluator(evaluators), evaluator_config)]
        else:
            return []

    elif isinstance(evaluators, list):
        if evaluator_config is not None and not check_nesting_config_dict(
            evaluators, evaluator_config
        ):
            raise MlflowException(
                message="If `evaluators` argument is an evaluator name list, evaluator_config "
                "must be a dict containing mapping from evaluator name to individual "
                "evaluator config dict.",
                error_code=INVALID_PARAMETER_VALUE,
            )
        evaluator_config = evaluator_config or {}

        # Previously we only had a single "default" evaluator used for all models.
        # We need to map "default" to the new dedicated builtin evaluators.
        resolved = []
        for name in evaluators:
            config = evaluator_config.get(name, {})
            if name == "default":
                builtin_evaluators = _resolve_default_evaluator(model_type, config)
                resolved.extend(builtin_evaluators)
            else:
                resolved.append(EvaluatorBundle(name, rg.get_evaluator(name), config))
        return resolved
    else:
        raise MlflowException(
            message="Invalid `evaluators` and `evaluator_config` arguments. "
            "Please refer to the documentation for correct usage.",
            error_code=INVALID_PARAMETER_VALUE,
        )


def _model_validation_contains_model_comparison(validation_thresholds):
    """
    Helper function for determining whether validation_thresholds contains
    thresholds for model comparison: either min_relative_change or min_absolute_change.
    """
    if not validation_thresholds:
        return False
    thresholds = validation_thresholds.values()
    return any(
        threshold.min_relative_change or threshold.min_absolute_change for threshold in thresholds
    )


_last_failed_evaluator = None


def _get_last_failed_evaluator():
    """
    Return the name of the evaluator that failed most recently when calling `evaluate`.
    This can be used to check which evaluator failed when the `evaluate` API fails.
    """
    return _last_failed_evaluator


# DO NOT CHANGE THE ORDER OF THE ARGUMENTS
# The order of the arguments needs to be preserved. You can add new arguments at the end
# of the argument list, but do not change the order of the existing arguments.
992 @record_usage_event(EvaluateEvent) 993 def _evaluate( 994 *, 995 model, 996 model_type, 997 model_id, 998 dataset, 999 run_id, 1000 # The `evaluator_name_list` and `evaluator_name_to_conf_map` are not used by MLflow at all, 1001 # but we need to keep these for backward compatibility. 1002 evaluator_name_list, 1003 evaluator_name_to_conf_map, 1004 extra_metrics, 1005 custom_artifacts, 1006 predictions, 1007 evaluators, 1008 ): 1009 """ 1010 The public API "evaluate" will verify argument first, and then pass normalized arguments 1011 to the _evaluate method. 1012 """ 1013 global _last_failed_evaluator 1014 _last_failed_evaluator = None 1015 1016 client = MlflowClient() 1017 1018 model_uuid = getattr(model, "metadata", None) 1019 1020 if model_uuid is not None: 1021 model_uuid = model_uuid.model_uuid 1022 dataset._log_dataset_tag(client, run_id, model_uuid) 1023 1024 eval_results = [] 1025 should_enable_tracing = model is not None # Do not enable tracing if static dataset is provided 1026 for eval_ in evaluators: 1027 _logger.debug(f"Evaluating the model with the {eval_.name} evaluator.") 1028 _last_failed_evaluator = eval_.name 1029 if eval_.evaluator.can_evaluate(model_type=model_type, evaluator_config=eval_.config): 1030 with configure_autologging_for_evaluation(enable_tracing=should_enable_tracing): 1031 eval_result = eval_.evaluator.evaluate( 1032 model=model, 1033 model_type=model_type, 1034 model_id=model_id, 1035 dataset=dataset, 1036 run_id=run_id, 1037 evaluator_config=eval_.config, 1038 extra_metrics=extra_metrics, 1039 custom_artifacts=custom_artifacts, 1040 predictions=predictions, 1041 ) 1042 1043 if eval_result is not None: 1044 eval_results.append(eval_result) 1045 1046 _last_failed_evaluator = None 1047 1048 if len(eval_results) == 0: 1049 raise MlflowException( 1050 message="The model could not be evaluated by any of the registered evaluators, please " 1051 "verify that the model type and other configs are set correctly.", 1052 error_code=INVALID_PARAMETER_VALUE, 1053 ) 1054 1055 merged_eval_result = EvaluationResult({}, {}, None) 1056 1057 for eval_result in eval_results: 1058 merged_eval_result.metrics.update(eval_result.metrics) 1059 merged_eval_result.artifacts.update(eval_result.artifacts) 1060 1061 return merged_eval_result 1062 1063 1064 def _get_model_from_function(fn): 1065 from mlflow.pyfunc.model import _PythonModelPyfuncWrapper 1066 1067 class ModelFromFunction(mlflow.pyfunc.PythonModel): 1068 def predict(self, context, model_input: pd.DataFrame): 1069 return fn(model_input) 1070 1071 python_model = ModelFromFunction() 1072 return _PythonModelPyfuncWrapper(python_model, None, None) 1073 1074 1075 def _is_model_deployment_endpoint_uri(model: Any) -> bool: 1076 if not isinstance(model, str): 1077 return False 1078 1079 from mlflow.metrics.genai.model_utils import _parse_model_uri 1080 1081 try: 1082 schema, path = _parse_model_uri(model) 1083 return schema in ["endpoints", "apps"] 1084 except MlflowException: 1085 return False 1086 1087 1088 def _get_model_from_deployment_endpoint_uri( 1089 endpoint_uri: str, params: dict[str, Any] | None = None 1090 ): 1091 from mlflow.metrics.genai.model_utils import _parse_model_uri 1092 from mlflow.pyfunc.model import ModelFromDeploymentEndpoint, _PythonModelPyfuncWrapper 1093 1094 _, endpoint = _parse_model_uri(endpoint_uri) 1095 params = params or {} 1096 1097 python_model = ModelFromDeploymentEndpoint(endpoint, params) 1098 return _PythonModelPyfuncWrapper(python_model, None, None) 1099 1100 1101 def evaluate( 1102 
model=None, 1103 data=None, 1104 *, 1105 model_type=None, 1106 targets=None, 1107 predictions=None, 1108 dataset_path=None, 1109 feature_names=None, 1110 evaluators=None, 1111 evaluator_config=None, 1112 extra_metrics=None, 1113 custom_artifacts=None, 1114 env_manager="local", 1115 model_config=None, 1116 inference_params=None, 1117 model_id=None, 1118 _called_from_genai_evaluate=False, 1119 ): 1120 ''' 1121 Evaluate the model performance on given data and selected metrics. 1122 1123 This function evaluates a PyFunc model or custom callable on the specified dataset using 1124 specified ``evaluators``, and logs resulting metrics & artifacts to MLflow tracking server. 1125 Users can also skip setting ``model`` and put the model outputs in ``data`` directly for 1126 evaluation. For detailed information, please read 1127 `the Model Evaluation documentation <../../model-evaluation/index.html>`_. 1128 1129 Default Evaluator behavior: 1130 - The default evaluator, which can be invoked with ``evaluators="default"`` or 1131 ``evaluators=None``, supports model types listed below. For each pre-defined model type, the 1132 default evaluator evaluates your model on a selected set of metrics and generate artifacts 1133 like plots. Please find more details below. 1134 1135 - For both the ``"regressor"`` and ``"classifier"`` model types, the default evaluator 1136 generates model summary plots and feature importance plots using 1137 `SHAP <https://shap.readthedocs.io/en/latest/index.html>`_. 1138 1139 - For regressor models, the default evaluator additionally logs: 1140 - **metrics**: example_count, mean_absolute_error, mean_squared_error, 1141 root_mean_squared_error, sum_on_target, mean_on_target, r2_score, max_error, 1142 mean_absolute_percentage_error. 1143 1144 - For binary classifiers, the default evaluator additionally logs: 1145 - **metrics**: true_negatives, false_positives, false_negatives, true_positives, recall, 1146 precision, f1_score, accuracy_score, example_count, log_loss, roc_auc, 1147 precision_recall_auc. 1148 - **artifacts**: lift curve plot, precision-recall plot, ROC plot. 1149 1150 - For multiclass classifiers, the default evaluator additionally logs: 1151 - **metrics**: accuracy_score, example_count, f1_score_micro, f1_score_macro, log_loss 1152 - **artifacts**: A CSV file for "per_class_metrics" (per-class metrics includes 1153 true_negatives/false_positives/false_negatives/true_positives/recall/precision/roc_auc, 1154 precision_recall_auc), precision-recall merged curves plot, ROC merged curves plot. 1155 1156 - For question-answering models, the default evaluator logs: 1157 - **metrics**: ``exact_match``, ``token_count``, `toxicity`_ (requires `evaluate`_, 1158 `torch`_, `flesch_kincaid_grade_level`_ (requires `textstat`_) and `ari_grade_level`_. 1159 - **artifacts**: A JSON file containing the inputs, outputs, targets (if the ``targets`` 1160 argument is supplied), and per-row metrics of the model in tabular format. 1161 1162 .. _toxicity: 1163 https://huggingface.co/spaces/evaluate-measurement/toxicity 1164 1165 .. _torch: 1166 https://pytorch.org/get-started/locally/ 1167 1168 .. _transformers: 1169 https://huggingface.co/docs/transformers/installation 1170 1171 .. _ari_grade_level: 1172 https://en.wikipedia.org/wiki/Automated_readability_index 1173 1174 .. _flesch_kincaid_grade_level: 1175 https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch%E2%80%93Kincaid_grade_level 1176 1177 .. _evaluate: 1178 https://pypi.org/project/evaluate 1179 1180 .. 
_textstat: 1181 https://pypi.org/project/textstat 1182 1183 - For text-summarization models, the default evaluator logs: 1184 - **metrics**: ``token_count``, `ROUGE`_ (requires `evaluate`_, `nltk`_, and 1185 `rouge_score`_ to be installed), `toxicity`_ (requires `evaluate`_, `torch`_, 1186 `transformers`_), `ari_grade_level`_ (requires `textstat`_), 1187 `flesch_kincaid_grade_level`_ (requires `textstat`_). 1188 - **artifacts**: A JSON file containing the inputs, outputs, targets (if the ``targets`` 1189 argument is supplied), and per-row metrics of the model in the tabular format. 1190 1191 .. _ROUGE: 1192 https://huggingface.co/spaces/evaluate-metric/rouge 1193 1194 .. _toxicity: 1195 https://huggingface.co/spaces/evaluate-measurement/toxicity 1196 1197 .. _torch: 1198 https://pytorch.org/get-started/locally/ 1199 1200 .. _transformers: 1201 https://huggingface.co/docs/transformers/installation 1202 1203 .. _ari_grade_level: 1204 https://en.wikipedia.org/wiki/Automated_readability_index 1205 1206 .. _flesch_kincaid_grade_level: 1207 https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch%E2%80%93Kincaid_grade_level 1208 1209 .. _evaluate: 1210 https://pypi.org/project/evaluate 1211 1212 .. _nltk: 1213 https://pypi.org/project/nltk 1214 1215 .. _rouge_score: 1216 https://pypi.org/project/rouge-score 1217 1218 .. _textstat: 1219 https://pypi.org/project/textstat 1220 1221 - For text models, the default evaluator logs: 1222 - **metrics**: ``token_count``, `toxicity`_ (requires `evaluate`_, `torch`_, 1223 `transformers`_), `ari_grade_level`_ (requires `textstat`_), 1224 `flesch_kincaid_grade_level`_ (requires `textstat`_). 1225 - **artifacts**: A JSON file containing the inputs, outputs, targets (if the ``targets`` 1226 argument is supplied), and per-row metrics of the model in tabular format. 1227 1228 .. _evaluate: 1229 https://pypi.org/project/evaluate 1230 1231 .. _toxicity: 1232 https://huggingface.co/spaces/evaluate-measurement/toxicity 1233 1234 .. _torch: 1235 https://pytorch.org/get-started/locally/ 1236 1237 .. _transformers: 1238 https://huggingface.co/docs/transformers/installation 1239 1240 .. _ari_grade_level: 1241 https://en.wikipedia.org/wiki/Automated_readability_index 1242 1243 .. _flesch_kincaid_grade_level: 1244 https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch%E2%80%93Kincaid_grade_level 1245 1246 .. _textstat: 1247 https://pypi.org/project/textstat 1248 1249 - For retriever models, the default evaluator logs: 1250 - **metrics**: :mod:`precision_at_k(k) <mlflow.metrics.precision_at_k>`, 1251 :mod:`recall_at_k(k) <mlflow.metrics.recall_at_k>` and 1252 :mod:`ndcg_at_k(k) <mlflow.metrics.ndcg_at_k>` - all have a default value of 1253 ``retriever_k`` = 3. 1254 - **artifacts**: A JSON file containing the inputs, outputs, targets, and per-row metrics 1255 of the model in tabular format. 1256 1257 - For sklearn models, the default evaluator additionally logs the model's evaluation criterion 1258 (e.g. mean accuracy for a classifier) computed by `model.score` method. 1259 1260 - The metrics/artifacts listed above are logged to the active MLflow run. 1261 If no active run exists, a new MLflow run is created for logging these metrics and 1262 artifacts. 1263 1264 - Additionally, information about the specified dataset - hash, name (if specified), path 1265 (if specified), and the UUID of the model that evaluated it - is logged to the 1266 ``mlflow.datasets`` tag. 
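
    - As an illustrative sketch (assuming a scikit-learn classifier already logged to MLflow
      and a pandas DataFrame ``eval_df`` whose ``"label"`` column holds the targets), a typical
      default-evaluator call looks like:

      .. code-block:: python

          import mlflow

          result = mlflow.evaluate(
              model="models:/my_classifier/1",
              data=eval_df,
              targets="label",
              model_type="classifier",
              evaluators="default",
          )
          print(result.metrics)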
1267 1268 - The available ``evaluator_config`` options for the default evaluator include: 1269 - **log_model_explainability**: A boolean value specifying whether or not to log model 1270 explainability insights, default value is True. 1271 - **log_explainer**: If True, log the explainer used to compute model explainability 1272 insights as a model. Default value is False. 1273 - **explainability_algorithm**: A string to specify the SHAP Explainer algorithm for model 1274 explainability. Supported algorithm includes: 'exact', 'permutation', 'partition', 1275 'kernel'. 1276 If not set, ``shap.Explainer`` is used with the "auto" algorithm, which chooses the best 1277 Explainer based on the model. 1278 - **explainability_nsamples**: The number of sample rows to use for computing model 1279 explainability insights. Default value is 2000. 1280 - **explainability_kernel_link**: The kernel link function used by shap kernel explainer. 1281 Available values are "identity" and "logit". Default value is "identity". 1282 - **max_classes_for_multiclass_roc_pr**: 1283 For multiclass classification tasks, the maximum number of classes for which to log 1284 the per-class ROC curve and Precision-Recall curve. If the number of classes is 1285 larger than the configured maximum, these curves are not logged. 1286 - **metric_prefix**: An optional prefix to prepend to the name of each metric and artifact 1287 produced during evaluation. 1288 - **log_metrics_with_dataset_info**: A boolean value specifying whether or not to include 1289 information about the evaluation dataset in the name of each metric logged to MLflow 1290 Tracking during evaluation, default value is True. 1291 - **pos_label**: If specified, the positive label to use when computing classification 1292 metrics such as precision, recall, f1, etc. for binary classification models. For 1293 multiclass classification and regression models, this parameter will be ignored. 1294 - **average**: The averaging method to use when computing classification metrics such as 1295 precision, recall, f1, etc. for multiclass classification models 1296 (default: ``'weighted'``). For binary classification and regression models, this 1297 parameter will be ignored. 1298 - **sample_weights**: Weights for each sample to apply when computing model performance 1299 metrics. 1300 - **col_mapping**: A dictionary mapping column names in the input dataset or output 1301 predictions to column names used when invoking the evaluation functions. 1302 - **retriever_k**: A parameter used when ``model_type="retriever"`` as the number of 1303 top-ranked retrieved documents to use when computing the built-in metric 1304 :mod:`precision_at_k(k) <mlflow.metrics.precision_at_k>`, 1305 :mod:`recall_at_k(k) <mlflow.metrics.recall_at_k>` and 1306 :mod:`ndcg_at_k(k) <mlflow.metrics.ndcg_at_k>`. Default value is 3. For all other 1307 model types, this parameter will be ignored. 1308 1309 - Limitations of evaluation dataset: 1310 - For classification tasks, dataset labels are used to infer the total number of classes. 1311 - For binary classification tasks, the negative label value must be 0 or -1 or False, and 1312 the positive label value must be 1 or True. 1313 1314 - Limitations of metrics/artifacts computation: 1315 - For classification tasks, some metric and artifact computations require the model to 1316 output class probabilities. Currently, for scikit-learn models, the default evaluator 1317 calls the ``predict_proba`` method on the underlying model to obtain probabilities. 
        For other model types, the default evaluator does not compute metrics/artifacts that
        require probability outputs.

    - Limitations of default evaluator logging model explainability insights:
      - The ``shap.Explainer`` ``auto`` algorithm uses the ``Linear`` explainer for linear
        models and the ``Tree`` explainer for tree models. Because SHAP's ``Linear`` and
        ``Tree`` explainers do not support multi-class classification, the default evaluator
        falls back to using the ``Exact`` or ``Permutation`` explainers for multi-class
        classification tasks.
      - Logging model explainability insights is not currently supported for PySpark models.
      - The evaluation dataset label values must be numeric or boolean, all feature values
        must be numeric, and each feature column must only contain scalar values.

    - Limitations when environment restoration is enabled:
      - When environment restoration is enabled for the evaluated model (i.e. a non-local
        ``env_manager`` is specified), the model is loaded as a client that invokes an MLflow
        Model Scoring Server process in an independent Python environment with the model's
        training time dependencies installed. As such, methods like ``predict_proba`` (for
        probability outputs) or ``score`` (which computes the evaluation criterion for sklearn
        models) become inaccessible and the default evaluator does not compute metrics or
        artifacts that require those methods.
      - Because the model is an MLflow Model Server process, SHAP explanations are slower to
        compute. As such, model explainability is disabled when a non-local ``env_manager``
        is specified, unless the ``evaluator_config`` option **log_model_explainability** is
        explicitly set to ``True``.

    Args:
        model: Optional. If specified, it should be one of the following:

            - A pyfunc model instance
            - A URI referring to a pyfunc model
            - A URI referring to an MLflow Deployments endpoint e.g. ``"endpoints:/my-chat"``
            - A callable function: This function should be able to take in model input and
              return predictions. It should follow the signature of the
              :py:func:`predict <mlflow.pyfunc.PyFuncModel.predict>` method. Here's an example
              of a valid function:

              .. code-block:: python

                  model = mlflow.pyfunc.load_model(model_uri)


                  def fn(model_input):
                      return model.predict(model_input)

            If omitted, it indicates a static dataset will be used for evaluation instead of a
            model. In this case, the ``data`` argument must be a Pandas DataFrame or an mlflow
            PandasDataset that contains model outputs, and the ``predictions`` argument must be
            the name of the column in ``data`` that contains model outputs.

        data: One of the following:

            - A numpy array or list of evaluation features, excluding labels.
            - A Pandas DataFrame containing evaluation features, labels, and optionally model
              outputs. Model outputs are required to be provided when model is unspecified.
              If the ``feature_names`` argument is not specified, all columns except for the
              label column and the predictions column are regarded as feature columns.
              Otherwise, only column names present in ``feature_names`` are regarded as
              feature columns.
            - A Spark DataFrame containing evaluation features and labels. If the
              ``feature_names`` argument is not specified, all columns except for the label
              column are regarded as feature columns.
Otherwise, only column names present in 1379 ``feature_names`` are regarded as feature columns. Only the first 10000 rows in 1380 the Spark DataFrame will be used as evaluation data. 1381 - A :py:class:`mlflow.data.dataset.Dataset` instance containing evaluation 1382 features, labels, and optionally model outputs. Model outputs are only supported 1383 with a PandasDataset. Model outputs are required when model is unspecified, and 1384 should be specified via the ``predictions`` property of the PandasDataset. 1385 1386 model_type: (Optional) A string describing the model type. The default evaluator 1387 supports the following model types: 1388 1389 - ``'classifier'`` 1390 - ``'regressor'`` 1391 - ``'question-answering'`` 1392 - ``'text-summarization'`` 1393 - ``'text'`` 1394 - ``'retriever'`` 1395 1396 If no ``model_type`` is specified, then you must provide a a list of 1397 metrics to compute via the ``extra_metrics`` param. 1398 1399 .. note:: 1400 ``'question-answering'``, ``'text-summarization'``, ``'text'``, and 1401 ``'retriever'`` are experimental and may be changed or removed in a 1402 future release. 1403 1404 targets: If ``data`` is a numpy array or list, a numpy array or list of evaluation 1405 labels. If ``data`` is a DataFrame, the string name of a column from ``data`` 1406 that contains evaluation labels. Required for classifier and regressor models, 1407 but optional for question-answering, text-summarization, and text models. If 1408 ``data`` is a :py:class:`mlflow.data.dataset.Dataset` that defines targets, 1409 then ``targets`` is optional. 1410 1411 predictions: Optional. The name of the column that contains model outputs. 1412 1413 - When ``model`` is specified and outputs multiple columns, ``predictions`` can be used 1414 to specify the name of the column that will be used to store model outputs for 1415 evaluation. 1416 - When ``model`` is not specified and ``data`` is a pandas dataframe, 1417 ``predictions`` can be used to specify the name of the column in ``data`` that 1418 contains model outputs. 1419 1420 .. code-block:: python 1421 :caption: Example usage of predictions 1422 1423 # Evaluate a model that outputs multiple columns 1424 data = pd.DataFrame({"question": ["foo"]}) 1425 1426 1427 def model(inputs): 1428 return pd.DataFrame({"answer": ["bar"], "source": ["baz"]}) 1429 1430 1431 results = evaluate( 1432 model=model, 1433 data=data, 1434 predictions="answer", 1435 # other arguments if needed 1436 ) 1437 1438 # Evaluate a static dataset 1439 data = pd.DataFrame({"question": ["foo"], "answer": ["bar"], "source": ["baz"]}) 1440 results = evaluate( 1441 data=data, 1442 predictions="answer", 1443 # other arguments if needed 1444 ) 1445 dataset_path: (Optional) The path where the data is stored. Must not contain double 1446 quotes (``"``). If specified, the path is logged to the ``mlflow.datasets`` 1447 tag for lineage tracking purposes. 1448 1449 feature_names: (Optional) A list. If the ``data`` argument is a numpy array or list, 1450 ``feature_names`` is a list of the feature names for each feature. If 1451 ``feature_names=None``, then the ``feature_names`` are generated using the 1452 format ``feature_{feature_index}``. If the ``data`` argument is a Pandas 1453 DataFrame or a Spark DataFrame, ``feature_names`` is a list of the names 1454 of the feature columns in the DataFrame. If ``feature_names=None``, then 1455 all columns except the label column and the predictions column are 1456 regarded as feature columns. 
    evaluators: The name of the evaluator to use for model evaluation, or a list of
        evaluator names. If unspecified, all evaluators capable of evaluating the
        specified model on the specified dataset are used. The default evaluator
        can be referred to by the name ``"default"``. To see all available
        evaluators, call :py:func:`mlflow.models.list_evaluators`.

    evaluator_config: A dictionary of additional configurations to supply to the evaluator.
        If multiple evaluators are specified, each configuration should be
        supplied as a nested dictionary whose key is the evaluator name.

    extra_metrics:
        (Optional) A list of :py:class:`EvaluationMetric <mlflow.models.EvaluationMetric>`
        objects. These metrics are computed in addition to the default metrics associated with
        pre-defined `model_type`, and setting `model_type=None` will only compute the metrics
        specified in `extra_metrics`. See the `mlflow.metrics` module for more information
        about the builtin metrics and how to define extra metrics.

        .. code-block:: python
            :caption: Example usage of extra metrics

            import mlflow
            import numpy as np


            def root_mean_squared_error(eval_df, _builtin_metrics):
                return np.sqrt((np.abs(eval_df["prediction"] - eval_df["target"]) ** 2).mean())


            rmse_metric = mlflow.models.make_metric(
                eval_fn=root_mean_squared_error,
                greater_is_better=False,
            )
            mlflow.evaluate(..., extra_metrics=[rmse_metric])

    custom_artifacts:
        (Optional) A list of custom artifact functions with the following signature:

        .. code-block:: python

            def custom_artifact(
                eval_df: Union[pandas.DataFrame, pyspark.sql.DataFrame],
                builtin_metrics: Dict[str, float],
                artifacts_dir: str,
            ) -> Dict[str, Any]:
                """
                Args:
                    eval_df:
                        A Pandas or Spark DataFrame containing ``prediction`` and ``target``
                        columns. The ``prediction`` column contains the predictions made by
                        the model. The ``target`` column contains the corresponding labels
                        to the predictions made on that row.
                    builtin_metrics:
                        A dictionary containing the metrics calculated by the default
                        evaluator. The keys are the names of the metrics and the values are
                        the scalar values of the metrics. Refer to the DefaultEvaluator
                        behavior section for what metrics will be returned based on the type
                        of model (i.e. classifier or regressor).
                    artifacts_dir:
                        A temporary directory path that can be used by the custom artifacts
                        function to temporarily store produced artifacts. The directory will
                        be deleted after the artifacts are logged.

                Returns:
                    A dictionary that maps artifact names to artifact objects
                    (e.g. a Matplotlib Figure) or to artifact paths within ``artifacts_dir``.
                """
                ...

        Object types that artifacts can be represented as:

        - A string URI representing the file path to the artifact. MLflow will infer the
          type of the artifact based on the file extension.
        - A string representation of a JSON object. This will be saved as a .json artifact.
        - A Pandas DataFrame. This will be saved as a .csv artifact.
        - A Numpy array. This will be saved as a .npy artifact.
        - A Matplotlib Figure. This will be saved as an image artifact. Note that
          ``matplotlib.pyplot.savefig`` is called behind the scenes with default
          configurations. To customize, either save the figure with the desired
          configurations and return its file path or define customizations through
          environment variables in ``matplotlib.rcParams``.
        - Other objects will be attempted to be pickled with the default protocol.

        .. code-block:: python
            :caption: Example usage of custom artifacts

            import os

            import mlflow
            import matplotlib.pyplot as plt


            def scatter_plot(eval_df, builtin_metrics, artifacts_dir):
                plt.scatter(eval_df["prediction"], eval_df["target"])
                plt.xlabel("Predictions")
                plt.ylabel("Targets")
                plt.title("Targets vs. Predictions")
                plt.savefig(os.path.join(artifacts_dir, "example.png"))
                plt.close()
                return {"pred_target_scatter": os.path.join(artifacts_dir, "example.png")}


            def pred_sample(eval_df, _builtin_metrics, _artifacts_dir):
                return {"pred_sample": eval_df.head(10)}


            mlflow.evaluate(..., custom_artifacts=[scatter_plot, pred_sample])

    env_manager: Specify an environment manager to load the candidate ``model`` in
        isolated Python environments and restore their dependencies. Default value is
        ``local``, and the following values are supported:

        - ``virtualenv``: (Recommended) Use virtualenv to restore the python
          environment that was used to train the model.
        - ``conda``: Use Conda to restore the software environment that was used
          to train the model.
        - ``local``: Use the current Python environment for model inference, which
          may differ from the environment used to train the model and may lead to
          errors or invalid predictions.

    model_config: The model configuration to use for loading the model with pyfunc. Inspect
        the model's pyfunc flavor to know which keys are supported for your
        specific model. If not indicated, the default model configuration
        from the model is used (if any).

    inference_params: (Optional) A dictionary of inference parameters to be passed to the
        model when making predictions, such as ``{"max_tokens": 100}``. This is only used
        when the ``model`` is an MLflow Deployments endpoint URI e.g. ``"endpoints:/my-chat"``.

    model_id: (Optional) The ID of the MLflow LoggedModel or Model Version to which the
        evaluation results (e.g. metrics and traces) will be linked. If `model_id` is not
        specified but `model` is specified, the ID from `model` will be used.

    _called_from_genai_evaluate: (Optional) Only used internally.

Returns:
    An :py:class:`mlflow.models.EvaluationResult` instance containing
    metrics of evaluating the model with the given dataset.
'''
from mlflow.models.evaluation.evaluator_registry import _model_evaluation_registry
from mlflow.pyfunc import PyFuncModel, _load_model_or_server, _ServedPyFuncModel
from mlflow.utils import env_manager as _EnvManager

# Inference params are currently only supported for passing a deployment endpoint as the model.
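# For illustration, such a call might look like the following (the endpoint name, data, and
# params below are placeholders):
#     mlflow.evaluate(
#         model="endpoints:/my-chat",
#         data=eval_data,
#         inference_params={"max_tokens": 100},
#     )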
# TODO: We should support inference_params for other model types

if inference_params is not None and not _is_model_deployment_endpoint_uri(model):
    raise MlflowException(
        message="The inference_params argument can only be specified when the model "
        "is an MLflow Deployments endpoint URI like `endpoints:/my-chat`",
        error_code=INVALID_PARAMETER_VALUE,
    )

if evaluator_config is not None:
    col_mapping = evaluator_config.get("col_mapping", {})

    if isinstance(targets, str):
        targets = col_mapping.get(targets, targets)

    if isinstance(predictions, str):
        predictions = col_mapping.get(predictions, predictions)

if data is None:
    raise MlflowException(
        message="The data argument cannot be None.",
        error_code=INVALID_PARAMETER_VALUE,
    )

_EnvManager.validate(env_manager)

# If a Dataset is provided, the targets can only be specified by the Dataset,
# not the targets parameter of the mlflow.evaluate() API.
if isinstance(data, Dataset) and targets is not None:
    raise MlflowException(
        message="The top-level targets parameter should not be specified since a Dataset "
        "is used. Please only specify the targets column name in the Dataset. For example: "
        "`data = mlflow.data.from_pandas(df=X.assign(y=y), targets='y')`. "
        "Meanwhile, please specify `mlflow.evaluate(..., targets=None, ...)`.",
        error_code=INVALID_PARAMETER_VALUE,
    )
# If a Dataset is provided and model is None, then the predictions can only be specified by the
# Dataset, not the predictions parameter of the mlflow.evaluate() API.
if isinstance(data, Dataset) and model is None and predictions is not None:
    raise MlflowException(
        message="The top-level predictions parameter should not be specified since a Dataset "
        "is used. Please only specify the predictions column name in the Dataset. For example:"
        " `data = mlflow.data.from_pandas(df=X.assign(y=y), predictions='y')`. "
        "Meanwhile, please specify `mlflow.evaluate(..., predictions=None, ...)`.",
        error_code=INVALID_PARAMETER_VALUE,
    )
# If a Dataset is provided and a model is specified, then data.predictions cannot be specified.
if (
    isinstance(data, Dataset)
    and model is not None
    and getattr(data, "predictions", None) is not None
):
    raise MlflowException(
        message="The predictions parameter should not be specified in the Dataset since a "
        "model is specified. Please remove the predictions column from the Dataset.",
        error_code=INVALID_PARAMETER_VALUE,
    )

if model_type in [_ModelType.REGRESSOR, _ModelType.CLASSIFIER]:
    if isinstance(data, Dataset):
        if getattr(data, "targets", None) is not None:
            targets = data.targets
        else:
            raise MlflowException(
                message="The targets column name must be specified in the provided Dataset "
                f"for {model_type} models. For example: "
                "`data = mlflow.data.from_pandas(df=X.assign(y=y), targets='y')`",
                error_code=INVALID_PARAMETER_VALUE,
            )
    else:
        if targets is None:
            raise MlflowException(
                f"The targets argument must be specified for {model_type} models.",
                error_code=INVALID_PARAMETER_VALUE,
            )
elif model_type is None:
    if not extra_metrics:
        raise MlflowException(
            message="The extra_metrics argument must be specified when model_type is None.",
            error_code=INVALID_PARAMETER_VALUE,
        )

specified_model_id = model_id
model_id = None
if isinstance(model, str):
    model_id = _parse_model_id_if_present(model)
    if _is_model_deployment_endpoint_uri(model):
        model = _get_model_from_deployment_endpoint_uri(model, inference_params)
    else:
        model = _load_model_or_server(model, env_manager, model_config)
elif env_manager != _EnvManager.LOCAL:
    raise MlflowException(
        message="The model argument must be a string URI referring to an MLflow model when a "
        "non-local env_manager is specified.",
        error_code=INVALID_PARAMETER_VALUE,
    )
elif isinstance(model, PyFuncModel):
    model_id = model.model_id
    if model_config:
        raise MlflowException(
            message="Indicating ``model_config`` when passing a ``PyFuncModel`` object as "
            "the model argument is not allowed. If you need to change the model "
            "configuration for the evaluation model, use "
            "``mlflow.pyfunc.load_model(model_uri, model_config=<value>)`` and indicate "
            "the desired configuration there.",
            error_code=INVALID_PARAMETER_VALUE,
        )
elif callable(model):
    model = _get_model_from_function(model)
elif model is not None:
    raise MlflowException(
        message="The model argument must be a string URI referring to an MLflow model, "
        "an MLflow Deployments endpoint URI, an instance of `mlflow.pyfunc.PyFuncModel`, "
        "a function, or None.",
        error_code=INVALID_PARAMETER_VALUE,
    )

# If model_id is specified, verify it matches the derived model_id
if specified_model_id is not None and model_id is not None and specified_model_id != model_id:
    raise MlflowException(
        message=(
            f"The specified value of the 'model_id' parameter '{specified_model_id}' "
            f"contradicts the model_id '{model_id}' associated with the model. Please ensure "
            f"they match or omit the 'model_id' parameter."
        ),
        error_code=INVALID_PARAMETER_VALUE,
    )

# Use the specified model_id if provided, otherwise use the derived model_id
model_id = specified_model_id if specified_model_id is not None else model_id
# If neither model_id nor model is specified, use the active model_id
model_id = model_id or mlflow.get_active_model_id()

evaluators: list[EvaluatorBundle] = resolve_evaluators_and_configs(
    evaluators, evaluator_config, model_type
)

# NB: MLflow does not use either of these two variables. However, we need to pass them to
# the _evaluate() function for backward compatibility.
evaluator_name_list = [evaluator.name for evaluator in evaluators]
evaluator_name_to_conf_map = {evaluator.name: evaluator.config for evaluator in evaluators}

with _start_run_or_reuse_active_run() as run:
    if not isinstance(data, Dataset):
        # Convert data to `mlflow.data.dataset.Dataset`.
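        # When no model is supplied, `data` is treated as a static dataset whose
        # `predictions` column already holds model outputs, so the column name is passed
        # to the dataset conversion below. When a model is supplied, outputs are produced
        # at evaluation time, so `predictions` is not part of the input dataset here.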
        if model is None:
            data = convert_data_to_mlflow_dataset(
                data=data, targets=targets, predictions=predictions
            )
        else:
            data = convert_data_to_mlflow_dataset(data=data, targets=targets)

    from mlflow.data.pyfunc_dataset_mixin import PyFuncConvertibleDatasetMixin

    # model_id could be None
    with _set_active_model(model_id=model_id) if model_id else nullcontext():
        if isinstance(data, Dataset) and issubclass(
            data.__class__, PyFuncConvertibleDatasetMixin
        ):
            dataset = data.to_evaluation_dataset(dataset_path, feature_names)

            # Use metric_prefix configured for builtin evaluators as a dataset tag
            context = None
            for e in evaluators:
                if _model_evaluation_registry.is_builtin(e.name) and e.config.get(
                    "metric_prefix"
                ):
                    context = e.config.get("metric_prefix")
                    break

            client = MlflowClient()
            tags = [InputTag(key=MLFLOW_DATASET_CONTEXT, value=context)] if context else []
            dataset_input = DatasetInput(dataset=data._to_mlflow_entity(), tags=tags)
            client.log_inputs(
                run.info.run_id,
                [dataset_input],
                models=[LoggedModelInput(model_id)] if model_id else None,
            )
        else:
            dataset = EvaluationDataset(
                data,
                targets=targets,
                path=dataset_path,
                feature_names=feature_names,
                predictions=predictions,
            )
        predictions_expected_in_model_output = predictions if model is not None else None

        try:
            evaluate_result = _evaluate(
                model=model,
                model_type=model_type,
                model_id=model_id,
                dataset=dataset,
                run_id=run.info.run_id,
                evaluator_name_list=evaluator_name_list,
                evaluator_name_to_conf_map=evaluator_name_to_conf_map,
                extra_metrics=extra_metrics,
                custom_artifacts=custom_artifacts,
                predictions=predictions_expected_in_model_output,
                evaluators=evaluators,
            )
        finally:
            if isinstance(model, _ServedPyFuncModel):
                os.kill(model.pid, signal.SIGTERM)

        # if model_id is specified log metrics to the eval run and logged model
        if model_id is not None:
            mlflow.log_metrics(metrics=evaluate_result.metrics, dataset=data, model_id=model_id)

        return evaluate_result