evaluation.py
"""
THE `mlflow.evaluation` MODULE IS LEGACY AND WILL BE REMOVED IN MLFLOW 3.0.
For assessment functionality, use `mlflow.entities.assessment` for assessment classes and
`mlflow.tracing.assessments` for assessment APIs. There are no alternatives for Evaluation and
EvaluationEntity objects and related APIs.
"""

import hashlib
import json
from typing import Any

from mlflow.entities._mlflow_object import _MlflowObject
from mlflow.entities.metric import Metric
from mlflow.evaluation.assessment import Assessment, AssessmentEntity
from mlflow.evaluation.evaluation_tag import (
    EvaluationTag,  # Assuming EvaluationTag is in this module
)
from mlflow.tracing.utils import TraceJSONEncoder
from mlflow.utils.annotations import deprecated


@deprecated(since="3.0.0")
class EvaluationEntity(_MlflowObject):
    """
    Evaluation result data, including inputs, outputs, targets, assessments, and more.

    Unlike :class:`Evaluation`, an entity also carries the ``evaluation_id`` and the ``run_id``
    it belongs to. Two entities compare equal when their dictionary forms (see
    :meth:`to_dictionary`) are equal.
    """

    def __init__(
        self,
        evaluation_id: str,
        run_id: str,
        inputs_id: str,
        inputs: dict[str, Any],
        outputs: dict[str, Any] | None = None,
        request_id: str | None = None,
        targets: dict[str, Any] | None = None,
        error_code: str | None = None,
        error_message: str | None = None,
        assessments: list[AssessmentEntity] | None = None,
        metrics: list[Metric] | None = None,
        tags: list[EvaluationTag] | None = None,
    ):
        """
        Construct a new mlflow.evaluation.EvaluationEntity instance.

        Args:
            evaluation_id: A unique identifier for the evaluation.
            run_id: The ID of the MLflow Run containing the Evaluation.
            inputs_id: A unique identifier for the input names and values for evaluation.
            inputs: Input names and values for evaluation.
            outputs: Outputs obtained during inference.
            request_id: The ID of an MLflow Trace corresponding to the inputs and outputs.
            targets: Expected values that the model should produce during inference.
            error_code: An error code representing any issues encountered during the evaluation.
            error_message: A descriptive error message representing any issues encountered during
                the evaluation.
            assessments: Assessments for the evaluation.
            metrics: Objective numerical metrics for the evaluation, e.g., "number of input tokens",
                "number of output tokens".
            tags: List of tags associated with the evaluation.
        """
        # Values are stored as given; no validation or copying is performed here.
        self._evaluation_id = evaluation_id
        self._run_id = run_id
        self._inputs_id = inputs_id
        self._inputs = inputs
        self._outputs = outputs
        self._request_id = request_id
        self._targets = targets
        self._error_code = error_code
        self._error_message = error_message
        self._assessments = assessments
        self._metrics = metrics
        self._tags = tags

    @property
    def evaluation_id(self) -> str:
        """The evaluation ID."""
        return self._evaluation_id

    @property
    def run_id(self) -> str:
        """The ID of the MLflow Run containing the evaluation"""
        return self._run_id

    @property
    def inputs_id(self) -> str:
        """The evaluation inputs ID."""
        return self._inputs_id

    @property
    def inputs(self) -> dict[str, Any]:
        """The evaluation inputs."""
        return self._inputs

    @property
    def outputs(self) -> dict[str, Any] | None:
        """The evaluation outputs."""
        return self._outputs

    @property
    def request_id(self) -> str | None:
        """The evaluation request ID."""
        return self._request_id

    @property
    def targets(self) -> dict[str, Any] | None:
        """The evaluation targets."""
        return self._targets

    @property
    def error_code(self) -> str | None:
        """The evaluation error code."""
        return self._error_code

    @property
    def error_message(self) -> str | None:
        """The evaluation error message."""
        return self._error_message

    @property
    def assessments(self) -> list[AssessmentEntity] | None:
        """The evaluation assessments."""
        return self._assessments

    @property
    def metrics(self) -> list[Metric] | None:
        """The evaluation metrics."""
        return self._metrics

    @property
    def tags(self) -> list[EvaluationTag] | None:
        """The evaluation tags."""
        return self._tags

    def __eq__(self, __o):
        # Equality is defined by the serialized form only; anything that is not an
        # instance of this object's class never compares equal.
        if isinstance(__o, self.__class__):
            return self.to_dictionary() == __o.to_dictionary()
        return False

    def to_dictionary(self) -> dict[str, Any]:
        """
        Convert the EvaluationEntity object to a dictionary.

        Fields whose value is ``None`` are omitted from the result. Empty (or ``None``)
        ``assessments``, ``metrics`` and ``tags`` collections are omitted as well.

        Returns:
            dict: The EvaluationEntity object represented as a dictionary.
        """
        evaluation_dict = {
            "evaluation_id": self.evaluation_id,
            "run_id": self.run_id,
            "inputs_id": self.inputs_id,
            "inputs": self.inputs,
            "outputs": self.outputs,
            "request_id": self.request_id,
            "targets": self.targets,
            "error_code": self.error_code,
            "error_message": self.error_message,
            # A falsy collection (None or empty) is mapped to None so the filter
            # below drops the key entirely.
            "assessments": [assess.to_dictionary() for assess in self.assessments]
            if self.assessments
            else None,
            "metrics": [metric.to_dictionary() for metric in self.metrics]
            if self.metrics
            else None,
            "tags": [tag.to_dictionary() for tag in self.tags] if self.tags else None,
        }
        return {k: v for k, v in evaluation_dict.items() if v is not None}

    @classmethod
    def from_dictionary(cls, evaluation_dict: dict[str, Any]):
        """
        Create an EvaluationEntity object from a dictionary.

        The ``"assessments"``, ``"metrics"`` and ``"tags"`` entries are optional; a missing
        key and an explicit ``None`` value are both treated as "not provided".

        Args:
            evaluation_dict (dict): Dictionary containing evaluation information. Must contain
                the ``"evaluation_id"``, ``"run_id"``, ``"inputs_id"`` and ``"inputs"`` keys.

        Returns:
            EvaluationEntity: The EvaluationEntity object created from the dictionary.

        Raises:
            KeyError: If one of the required keys is missing from ``evaluation_dict``.
        """
        # Use .get() + an explicit None check rather than ``key in dict`` so that a key
        # present with a None value does not blow up when iterated. An empty list is
        # still converted to an empty list.
        raw_assessments = evaluation_dict.get("assessments")
        assessments = None
        if raw_assessments is not None:
            assessments = [AssessmentEntity.from_dictionary(assess) for assess in raw_assessments]

        raw_metrics = evaluation_dict.get("metrics")
        metrics = None
        if raw_metrics is not None:
            metrics = [Metric.from_dictionary(metric) for metric in raw_metrics]

        raw_tags = evaluation_dict.get("tags")
        tags = None
        if raw_tags is not None:
            tags = [EvaluationTag(tag["key"], tag["value"]) for tag in raw_tags]

        return cls(
            evaluation_id=evaluation_dict["evaluation_id"],
            run_id=evaluation_dict["run_id"],
            inputs_id=evaluation_dict["inputs_id"],
            inputs=evaluation_dict["inputs"],
            outputs=evaluation_dict.get("outputs"),
            request_id=evaluation_dict.get("request_id"),
            targets=evaluation_dict.get("targets"),
            error_code=evaluation_dict.get("error_code"),
            error_message=evaluation_dict.get("error_message"),
            assessments=assessments,
            metrics=metrics,
            tags=tags,
        )


@deprecated(since="3.0.0")
class Evaluation(_MlflowObject):
    """
    Evaluation result data.

    Two evaluations compare equal when their dictionary forms (see :meth:`to_dictionary`)
    are equal.
    """

    def __init__(
        self,
        inputs: dict[str, Any],
        outputs: dict[str, Any] | None = None,
        inputs_id: str | None = None,
        request_id: str | None = None,
        targets: dict[str, Any] | None = None,
        error_code: str | None = None,
        error_message: str | None = None,
        assessments: list[Assessment] | None = None,
        metrics: dict[str, float] | list[Metric] | None = None,
        tags: dict[str, str] | list[EvaluationTag] | None = None,
    ):
        """
        Construct a new Evaluation instance.

        Args:
            inputs: Input names and values for evaluation.
            outputs: Outputs obtained during inference.
            inputs_id: A unique identifier for the input names and values for evaluation. When
                omitted (or otherwise falsy), an identifier is derived from ``inputs``.
            request_id: The ID of an MLflow Trace corresponding to the inputs and outputs.
            targets: Expected values that the model should produce during inference.
            error_code: An error code representing any issues encountered during the evaluation.
            error_message: A descriptive error message representing any issues encountered during
                the evaluation.
            assessments: Assessments for the evaluation.
            metrics: Objective numerical metrics for the evaluation, e.g., "number of input tokens",
                "number of output tokens". A dictionary is converted to a list of ``Metric``
                objects with ``timestamp=0`` and ``step=0``.
            tags: Dictionary of tags associated with the evaluation, or an already-built list of
                ``EvaluationTag`` objects. Dictionary keys and values are converted with
                ``str()``.
        """
        # Normalize the convenience dict forms into the list-of-entities form that the
        # rest of the class (to_dictionary, _to_entity) works with.
        if isinstance(metrics, dict):
            metrics = [
                Metric(key=key, value=value, timestamp=0, step=0) for key, value in metrics.items()
            ]
        if isinstance(tags, dict):
            tags = [EvaluationTag(key=str(key), value=str(value)) for key, value in tags.items()]

        self._inputs = inputs
        self._outputs = outputs
        # Any falsy inputs_id (None or "") falls back to a hash of the inputs.
        self._inputs_id = inputs_id or _generate_inputs_id(inputs)
        self._request_id = request_id
        self._targets = targets
        self._error_code = error_code
        self._error_message = error_message
        self._assessments = assessments
        self._metrics = metrics
        self._tags = tags

    @property
    def inputs_id(self) -> str:
        """The evaluation inputs ID."""
        return self._inputs_id

    @property
    def inputs(self) -> dict[str, Any]:
        """The evaluation inputs."""
        return self._inputs

    @property
    def outputs(self) -> dict[str, Any] | None:
        """The evaluation outputs."""
        return self._outputs

    @property
    def request_id(self) -> str | None:
        """The evaluation request ID."""
        return self._request_id

    @property
    def targets(self) -> dict[str, Any] | None:
        """The evaluation targets."""
        return self._targets

    @property
    def error_code(self) -> str | None:
        """The evaluation error code."""
        return self._error_code

    @property
    def error_message(self) -> str | None:
        """The evaluation error message."""
        return self._error_message

    @property
    def assessments(self) -> list[Assessment] | None:
        """The evaluation assessments."""
        return self._assessments

    @property
    def metrics(self) -> list[Metric] | None:
        """The evaluation metrics."""
        return self._metrics

    @property
    def tags(self) -> list[EvaluationTag] | None:
        """The evaluation tags (a tag dictionary given to the constructor is stored as a list)."""
        return self._tags

    def __eq__(self, __o):
        # Equality is defined by the serialized form only; anything that is not an
        # instance of this object's class never compares equal.
        if isinstance(__o, self.__class__):
            return self.to_dictionary() == __o.to_dictionary()
        return False

    def _to_entity(self, run_id: str, evaluation_id: str) -> EvaluationEntity:
        """
        Convert the Evaluation object to an EvaluationEntity object.

        Args:
            run_id: The ID of the MLflow Run to record on the entity.
            evaluation_id: The evaluation ID to record on the entity; it is also passed to
                each assessment's ``_to_entity``.

        Returns:
            EvaluationEntity: An EvaluationEntity object.
        """
        return EvaluationEntity(
            evaluation_id=evaluation_id,
            run_id=run_id,
            inputs_id=self.inputs_id,
            inputs=self.inputs,
            outputs=self.outputs,
            request_id=self.request_id,
            targets=self.targets,
            error_code=self.error_code,
            error_message=self.error_message,
            # None and empty assessment lists both become None on the entity.
            assessments=[assess._to_entity(evaluation_id) for assess in self.assessments]
            if self.assessments
            else None,
            metrics=self.metrics,
            tags=self.tags,
        )

    def to_dictionary(self) -> dict[str, Any]:
        """
        Convert the Evaluation object to a dictionary.

        Fields whose value is ``None`` are omitted from the result. Empty (or ``None``)
        ``assessments``, ``metrics`` and ``tags`` collections are omitted as well.

        Returns:
            dict: The Evaluation object represented as a dictionary.
        """
        evaluation_dict = {
            "inputs_id": self.inputs_id,
            "inputs": self.inputs,
            "outputs": self.outputs,
            "request_id": self.request_id,
            "targets": self.targets,
            "error_code": self.error_code,
            "error_message": self.error_message,
            # A falsy collection (None or empty) is mapped to None so the filter
            # below drops the key entirely.
            "assessments": [assess.to_dictionary() for assess in self.assessments]
            if self.assessments
            else None,
            "metrics": [metric.to_dictionary() for metric in self.metrics]
            if self.metrics
            else None,
            "tags": [tag.to_dictionary() for tag in self.tags] if self.tags else None,
        }
        return {k: v for k, v in evaluation_dict.items() if v is not None}

    @classmethod
    def from_dictionary(cls, evaluation_dict: dict[str, Any]):
        """
        Create an Evaluation object from a dictionary.

        The ``"assessments"``, ``"metrics"`` and ``"tags"`` entries are optional; a missing
        key and an explicit ``None`` value are both treated as "not provided".

        Args:
            evaluation_dict (dict): Dictionary containing evaluation information. Must contain
                the ``"inputs_id"`` and ``"inputs"`` keys.

        Returns:
            Evaluation: The Evaluation object created from the dictionary.

        Raises:
            KeyError: If one of the required keys is missing from ``evaluation_dict``.
        """
        # Use .get() + an explicit None check rather than ``key in dict`` so that a key
        # present with a None value does not blow up when iterated. An empty list is
        # still converted to an empty list.
        raw_assessments = evaluation_dict.get("assessments")
        assessments = None
        if raw_assessments is not None:
            assessments = [Assessment.from_dictionary(assess) for assess in raw_assessments]

        raw_metrics = evaluation_dict.get("metrics")
        metrics = None
        if raw_metrics is not None:
            metrics = [Metric.from_dictionary(metric) for metric in raw_metrics]

        raw_tags = evaluation_dict.get("tags")
        tags = None
        if raw_tags is not None:
            tags = [EvaluationTag(tag["key"], tag["value"]) for tag in raw_tags]

        return cls(
            inputs_id=evaluation_dict["inputs_id"],
            inputs=evaluation_dict["inputs"],
            outputs=evaluation_dict.get("outputs"),
            request_id=evaluation_dict.get("request_id"),
            targets=evaluation_dict.get("targets"),
            error_code=evaluation_dict.get("error_code"),
            error_message=evaluation_dict.get("error_message"),
            assessments=assessments,
            metrics=metrics,
            tags=tags,
        )


def _generate_inputs_id(inputs: dict[str, Any]) -> str:
    """
    Generates a deterministic identifier for the inputs.

    The inputs are serialized to JSON with sorted keys (using ``TraceJSONEncoder``), and the
    identifier is the hexadecimal SHA-256 digest of the UTF-8 encoded JSON text.

    Args:
        inputs (dict[str, Any]): Input fields used by the model to compute outputs.

    Returns:
        str: The SHA-256 hex digest identifying the inputs.
    """
    # sort_keys=True makes the serialization independent of dictionary key order.
    inputs_json = json.dumps(inputs, sort_keys=True, cls=TraceJSONEncoder)
    return hashlib.sha256(inputs_json.encode("utf-8")).hexdigest()