/ mlflow / evaluation / evaluation.py
evaluation.py
  1  """
THE `mlflow.evaluation` MODULE IS LEGACY AND WILL BE REMOVED IN MLFLOW 3.0.
  3  For assessment functionality, use `mlflow.entities.assessment` for assessment classes and
  4  `mlflow.tracing.assessments` for assessment APIs. There are no alternatives for Evaluation and
  5  EvaluationEntity objects and related APIs.
  6  """
  7  
  8  import hashlib
  9  import json
 10  from typing import Any
 11  
 12  from mlflow.entities._mlflow_object import _MlflowObject
 13  from mlflow.entities.metric import Metric
 14  from mlflow.evaluation.assessment import Assessment, AssessmentEntity
 15  from mlflow.evaluation.evaluation_tag import (
 16      EvaluationTag,  # Assuming EvaluationTag is in this module
 17  )
 18  from mlflow.tracing.utils import TraceJSONEncoder
 19  from mlflow.utils.annotations import deprecated
 20  
 21  
 22  @deprecated(since="3.0.0")
 23  class EvaluationEntity(_MlflowObject):
 24      """
 25      Evaluation result data, including inputs, outputs, targets, assessments, and more.
 26      """
 27  
 28      def __init__(
 29          self,
 30          evaluation_id: str,
 31          run_id: str,
 32          inputs_id: str,
 33          inputs: dict[str, Any],
 34          outputs: dict[str, Any] | None = None,
 35          request_id: str | None = None,
 36          targets: dict[str, Any] | None = None,
 37          error_code: str | None = None,
 38          error_message: str | None = None,
 39          assessments: list[AssessmentEntity] | None = None,
 40          metrics: list[Metric] | None = None,
 41          tags: list[EvaluationTag] | None = None,
 42      ):
 43          """
 44          Construct a new mlflow.evaluation.EvaluationEntity instance.
 45  
 46          Args:
 47              evaluation_id: A unique identifier for the evaluation.
 48              run_id: The ID of the MLflow Run containing the Evaluation.
 49              inputs_id: A unique identifier for the input names and values for evaluation.
 50              inputs: Input names and values for evaluation.
 51              outputs: Outputs obtained during inference.
 52              request_id: The ID of an MLflow Trace corresponding to the inputs and outputs.
 53              targets: Expected values that the model should produce during inference.
 54              error_code: An error code representing any issues encountered during the evaluation.
 55              error_message: A descriptive error message representing any issues encountered during
 56                  the evaluation.
 57              assessments: Assessments for the evaluation.
 58              metrics: Objective numerical metrics for the evaluation, e.g., "number of input tokens",
 59                  "number of output tokens".
 60              tags: List of tags associated with the evaluation.
 61          """
 62          self._evaluation_id = evaluation_id
 63          self._run_id = run_id
 64          self._inputs_id = inputs_id
 65          self._inputs = inputs
 66          self._outputs = outputs
 67          self._request_id = request_id
 68          self._targets = targets
 69          self._error_code = error_code
 70          self._error_message = error_message
 71          self._assessments = assessments
 72          self._metrics = metrics
 73          self._tags = tags
 74  
 75      @property
 76      def evaluation_id(self) -> str:
 77          """The evaluation ID."""
 78          return self._evaluation_id
 79  
 80      @property
 81      def run_id(self) -> str:
 82          """The ID of the MLflow Run containing the evaluation"""
 83          return self._run_id
 84  
 85      @property
 86      def inputs_id(self) -> str:
 87          """The evaluation inputs ID."""
 88          return self._inputs_id
 89  
 90      @property
 91      def inputs(self) -> dict[str, Any]:
 92          """The evaluation inputs."""
 93          return self._inputs
 94  
 95      @property
 96      def outputs(self) -> dict[str, Any] | None:
 97          """The evaluation outputs."""
 98          return self._outputs
 99  
100      @property
101      def request_id(self) -> str | None:
102          """The evaluation request ID."""
103          return self._request_id
104  
105      @property
106      def targets(self) -> dict[str, Any] | None:
107          """The evaluation targets."""
108          return self._targets
109  
110      @property
111      def error_code(self) -> str | None:
112          """The evaluation error code."""
113          return self._error_code
114  
115      @property
116      def error_message(self) -> str | None:
117          """The evaluation error message."""
118          return self._error_message
119  
120      @property
121      def assessments(self) -> list[AssessmentEntity] | None:
122          """The evaluation assessments."""
123          return self._assessments
124  
125      @property
126      def metrics(self) -> list[Metric] | None:
127          """The evaluation metrics."""
128          return self._metrics
129  
130      @property
131      def tags(self) -> list[EvaluationTag] | None:
132          """The evaluation tags."""
133          return self._tags
134  
135      def __eq__(self, __o):
136          if isinstance(__o, self.__class__):
137              return self.to_dictionary() == __o.to_dictionary()
138          return False
139  
140      def to_dictionary(self) -> dict[str, Any]:
141          """
142          Convert the Evaluation object to a dictionary.
143  
144          Returns:
145              dict: The Evaluation object represented as a dictionary.
146          """
147          evaluation_dict = {
148              "evaluation_id": self.evaluation_id,
149              "run_id": self.run_id,
150              "inputs_id": self.inputs_id,
151              "inputs": self.inputs,
152              "outputs": self.outputs,
153              "request_id": self.request_id,
154              "targets": self.targets,
155              "error_code": self.error_code,
156              "error_message": self.error_message,
157              "assessments": [assess.to_dictionary() for assess in self.assessments]
158              if self.assessments
159              else None,
160              "metrics": [metric.to_dictionary() for metric in self.metrics]
161              if self.metrics
162              else None,
163              "tags": [tag.to_dictionary() for tag in self.tags] if self.tags else None,
164          }
165          return {k: v for k, v in evaluation_dict.items() if v is not None}
166  
167      @classmethod
168      def from_dictionary(cls, evaluation_dict: dict[str, Any]):
169          """
170          Create an Evaluation object from a dictionary.
171  
172          Args:
173              evaluation_dict (dict): Dictionary containing evaluation information.
174  
175          Returns:
176              Evaluation: The Evaluation object created from the dictionary.
177          """
178          assessments = None
179          if "assessments" in evaluation_dict:
180              assessments = [
181                  AssessmentEntity.from_dictionary(assess)
182                  for assess in evaluation_dict["assessments"]
183              ]
184          metrics = None
185          if "metrics" in evaluation_dict:
186              metrics = [Metric.from_dictionary(metric) for metric in evaluation_dict["metrics"]]
187          tags = None
188          if "tags" in evaluation_dict:
189              tags = [EvaluationTag(tag["key"], tag["value"]) for tag in evaluation_dict["tags"]]
190          return cls(
191              evaluation_id=evaluation_dict["evaluation_id"],
192              run_id=evaluation_dict["run_id"],
193              inputs_id=evaluation_dict["inputs_id"],
194              inputs=evaluation_dict["inputs"],
195              outputs=evaluation_dict.get("outputs"),
196              request_id=evaluation_dict.get("request_id"),
197              targets=evaluation_dict.get("targets"),
198              error_code=evaluation_dict.get("error_code"),
199              error_message=evaluation_dict.get("error_message"),
200              assessments=assessments,
201              metrics=metrics,
202              tags=tags,
203          )
204  
205  
@deprecated(since="3.0.0")
class Evaluation(_MlflowObject):
    """
    Evaluation result data.

    Unlike ``EvaluationEntity``, an ``Evaluation`` carries no run ID or evaluation ID; those are
    supplied when converting with ``_to_entity``. Two evaluations compare equal when their
    ``to_dictionary()`` representations are equal.
    """

    def __init__(
        self,
        inputs: dict[str, Any],
        outputs: dict[str, Any] | None = None,
        inputs_id: str | None = None,
        request_id: str | None = None,
        targets: dict[str, Any] | None = None,
        error_code: str | None = None,
        error_message: str | None = None,
        assessments: list[Assessment] | None = None,
        metrics: dict[str, float] | list[Metric] | None = None,
        tags: dict[str, str] | list[EvaluationTag] | None = None,
    ):
        """
        Construct a new Evaluation instance.

        Args:
            inputs: Input names and values for evaluation.
            outputs: Outputs obtained during inference.
            inputs_id: A unique identifier for the input names and values for evaluation. When
                omitted (or otherwise falsy), it is derived from ``inputs``.
            request_id: The ID of an MLflow Trace corresponding to the inputs and outputs.
            targets: Expected values that the model should produce during inference.
            error_code: An error code representing any issues encountered during the evaluation.
            error_message: A descriptive error message representing any issues encountered during
                the evaluation.
            assessments: Assessments for the evaluation.
            metrics: Objective numerical metrics for the evaluation, e.g., "number of input tokens",
                "number of output tokens". A dictionary is converted to a list of ``Metric``
                objects with ``timestamp=0`` and ``step=0``.
            tags: Tags associated with the evaluation. A dictionary is converted to a list of
                ``EvaluationTag`` objects, with keys and values coerced to ``str``; a list of
                ``EvaluationTag`` objects is stored as given.
        """
        # Normalize the dict conveniences into the list-of-entities form used internally.
        if isinstance(metrics, dict):
            metrics = [
                Metric(key=key, value=value, timestamp=0, step=0) for key, value in metrics.items()
            ]
        if isinstance(tags, dict):
            tags = [EvaluationTag(key=str(key), value=str(value)) for key, value in tags.items()]

        self._inputs = inputs
        self._outputs = outputs
        self._inputs_id = inputs_id or _generate_inputs_id(inputs)
        self._request_id = request_id
        self._targets = targets
        self._error_code = error_code
        self._error_message = error_message
        self._assessments = assessments
        self._metrics = metrics
        self._tags = tags

    @property
    def inputs_id(self) -> str:
        """The evaluation inputs ID."""
        return self._inputs_id

    @property
    def inputs(self) -> dict[str, Any]:
        """The evaluation inputs."""
        return self._inputs

    @property
    def outputs(self) -> dict[str, Any] | None:
        """The evaluation outputs."""
        return self._outputs

    @property
    def request_id(self) -> str | None:
        """The evaluation request ID."""
        return self._request_id

    @property
    def targets(self) -> dict[str, Any] | None:
        """The evaluation targets."""
        return self._targets

    @property
    def error_code(self) -> str | None:
        """The evaluation error code."""
        return self._error_code

    @property
    def error_message(self) -> str | None:
        """The evaluation error message."""
        return self._error_message

    @property
    def assessments(self) -> list[Assessment] | None:
        """The evaluation assessments."""
        return self._assessments

    @property
    def metrics(self) -> list[Metric] | None:
        """The evaluation metrics."""
        return self._metrics

    @property
    def tags(self) -> list[EvaluationTag] | None:
        """The evaluation tags (dictionaries given to the constructor are stored as a list)."""
        return self._tags

    def __eq__(self, other):
        """
        Compare by dictionary representation.

        Returns True only when ``other`` is an instance of this object's class and both
        objects serialize to equal dictionaries; returns False for any other object.
        """
        # NOTE: because __eq__ is overridden without __hash__, instances are unhashable.
        if isinstance(other, self.__class__):
            return self.to_dictionary() == other.to_dictionary()
        return False

    def _to_entity(self, run_id: str, evaluation_id: str) -> EvaluationEntity:
        """
        Convert the Evaluation object to an EvaluationEntity object.

        Args:
            run_id: The run ID to record on the resulting entity.
            evaluation_id: The evaluation ID to record on the resulting entity; it is also
                passed to each assessment's ``_to_entity``.

        Returns:
            EvaluationEntity: An EvaluationEntity object. Metrics and tags are passed through
            unchanged; a ``None`` or empty assessment list becomes ``None``.
        """
        return EvaluationEntity(
            evaluation_id=evaluation_id,
            run_id=run_id,
            inputs_id=self.inputs_id,
            inputs=self.inputs,
            outputs=self.outputs,
            request_id=self.request_id,
            targets=self.targets,
            error_code=self.error_code,
            error_message=self.error_message,
            assessments=[assess._to_entity(evaluation_id) for assess in self.assessments]
            if self.assessments
            else None,
            metrics=self.metrics,
            tags=self.tags,
        )

    def to_dictionary(self) -> dict[str, Any]:
        """
        Convert the Evaluation object to a dictionary.

        Assessments, metrics, and tags are serialized through their own ``to_dictionary()``
        methods. Keys whose value is ``None`` are omitted from the result; assessment, metric,
        and tag lists that are ``None`` or empty are omitted as well.

        Returns:
            dict: The Evaluation object represented as a dictionary.
        """
        evaluation_dict = {
            "inputs_id": self.inputs_id,
            "inputs": self.inputs,
            "outputs": self.outputs,
            "request_id": self.request_id,
            "targets": self.targets,
            "error_code": self.error_code,
            "error_message": self.error_message,
            # Empty lists are falsy, so they collapse to None and are dropped below.
            "assessments": [assess.to_dictionary() for assess in self.assessments]
            if self.assessments
            else None,
            "metrics": [metric.to_dictionary() for metric in self.metrics]
            if self.metrics
            else None,
            "tags": [tag.to_dictionary() for tag in self.tags] if self.tags else None,
        }
        return {k: v for k, v in evaluation_dict.items() if v is not None}

    @classmethod
    def from_dictionary(cls, evaluation_dict: dict[str, Any]):
        """
        Create an Evaluation object from a dictionary.

        Args:
            evaluation_dict (dict): Dictionary containing evaluation information. Only "inputs"
                is required. When "inputs_id" is missing, the constructor derives it from
                "inputs". "assessments" and "metrics" entries are deserialized with
                ``Assessment.from_dictionary`` and ``Metric.from_dictionary``; each "tags"
                entry must provide "key" and "value". For these three list fields a value of
                ``None`` is treated the same as a missing key.

        Returns:
            Evaluation: The Evaluation object created from the dictionary.

        Raises:
            KeyError: If "inputs" is missing.
        """
        # Use .get() + `is not None` rather than a membership test so that an explicit
        # None value does not reach the comprehensions and fail with a TypeError.
        raw_assessments = evaluation_dict.get("assessments")
        raw_metrics = evaluation_dict.get("metrics")
        raw_tags = evaluation_dict.get("tags")

        assessments = None
        if raw_assessments is not None:
            assessments = [Assessment.from_dictionary(item) for item in raw_assessments]
        metrics = None
        if raw_metrics is not None:
            metrics = [Metric.from_dictionary(item) for item in raw_metrics]
        tags = None
        if raw_tags is not None:
            tags = [EvaluationTag(item["key"], item["value"]) for item in raw_tags]

        return cls(
            # inputs_id is optional in the constructor, so it is optional here as well.
            inputs_id=evaluation_dict.get("inputs_id"),
            inputs=evaluation_dict["inputs"],
            outputs=evaluation_dict.get("outputs"),
            request_id=evaluation_dict.get("request_id"),
            targets=evaluation_dict.get("targets"),
            error_code=evaluation_dict.get("error_code"),
            error_message=evaluation_dict.get("error_message"),
            assessments=assessments,
            metrics=metrics,
            tags=tags,
        )
398  
399  
def _generate_inputs_id(inputs: dict[str, Any]) -> str:
    """
    Derive an identifier for the inputs from their content.

    The inputs are serialized to JSON with sorted keys using ``TraceJSONEncoder``, and the
    UTF-8 bytes of that JSON text are hashed with SHA-256.

    Args:
        inputs (Dict[str, Any]): Input fields used by the model to compute outputs.

    Returns:
        str: The hexadecimal SHA-256 digest of the serialized inputs.
    """
    digest = hashlib.sha256()
    # Sorted keys make the serialized text independent of dict insertion order.
    serialized = json.dumps(inputs, cls=TraceJSONEncoder, sort_keys=True)
    digest.update(serialized.encode("utf-8"))
    return digest.hexdigest()