test_fluent.py
import pytest

import mlflow
from mlflow.entities import Metric
from mlflow.evaluation import Assessment, Evaluation, log_evaluations
from mlflow.evaluation.assessment import AssessmentSource, AssessmentSourceType
from mlflow.evaluation.evaluation_tag import EvaluationTag

from tests.evaluate.logging.utils import get_evaluation


@pytest.fixture
def end_run_at_test_end():
    yield
    mlflow.end_run()


def test_log_evaluations_with_minimal_params_succeeds():
    inputs1 = {"feature1": 1.0, "feature2": 2.0}
    outputs1 = {"prediction": 0.5}

    inputs2 = {"feature3": 3.0, "feature4": 4.0}
    outputs2 = {"prediction": 0.8}

    with mlflow.start_run():
        # Create evaluation objects
        evaluation1 = Evaluation(inputs=inputs1, outputs=outputs1)
        evaluation2 = Evaluation(inputs=inputs2, outputs=outputs2)

        # Log the evaluations
        logged_evaluations = log_evaluations(evaluations=[evaluation1, evaluation2])
        assert len(logged_evaluations) == 2

        for logged_evaluation, expected_evaluation in zip(
            logged_evaluations, [evaluation1, evaluation2]
        ):
            assert logged_evaluation.inputs == expected_evaluation.inputs
            assert logged_evaluation.outputs == expected_evaluation.outputs
            retrieved_evaluation = get_evaluation(
                evaluation_id=logged_evaluation.evaluation_id,
                run_id=mlflow.active_run().info.run_id,
            )
            assert retrieved_evaluation is not None
            assert retrieved_evaluation.inputs == logged_evaluation.inputs
            assert retrieved_evaluation.outputs == logged_evaluation.outputs


def test_log_evaluations_with_all_params():
    evaluations_data = [
        (
            {"feature1": 1.0, "feature2": 2.0},
            {"prediction": 0.5},
            {"actual": 1.0},
            [
                {
                    "name": "assessment1",
                    "value": 1.0,
                    "source": {
                        "source_type": "HUMAN",
                        "source_id": "user_1",
                    },
                },
                {
                    "name": "assessment2",
                    "value": 0.84,
                    "source": {
                        "source_type": "HUMAN",
                        "source_id": "user_1",
                    },
                },
            ],
            [
                Metric(key="metric1", value=1.4, timestamp=1717047609503, step=0),
                Metric(key="metric2", value=1.2, timestamp=1717047609504, step=0),
            ],
            {"tag1": "value1", "tag2": "value2"},
        ),
        (
            {"feature1": "text1", "feature2": "text2"},
            {"prediction": "output_text"},
            {"actual": "expected_text"},
            [
                Assessment(
                    name="accuracy",
                    value=0.8,
                    source=AssessmentSource(
                        source_type=AssessmentSourceType.HUMAN,
                        source_id="user-1",
                    ),
                )
            ],
            {"metric1": 0.8, "metric2": 0.84},
            {"tag3": "value3", "tag4": "value4"},
        ),
    ]

    inputs_id = "unique-inputs-id"
    request_id = "unique-request-id"

    with mlflow.start_run() as run:
        run_id = run.info.run_id

        evaluations = []
        for inputs, outputs, targets, assessments, metrics, tags in evaluations_data:
            if isinstance(assessments[0], dict):
                assessments = [Assessment.from_dictionary(assessment) for assessment in assessments]

            if isinstance(metrics, dict):
                metrics = [
                    Metric(key=key, value=value, timestamp=0, step=0)
                    for key, value in metrics.items()
                ]

            evaluation = Evaluation(
                inputs=inputs,
                outputs=outputs,
                inputs_id=inputs_id,
                request_id=request_id,
                targets=targets,
                assessments=assessments,
                metrics=metrics,
                tags=tags,
            )
            evaluations.append(evaluation)

        # Log the evaluations
        logged_evaluations = log_evaluations(evaluations=evaluations, run_id=run_id)

        for logged_evaluation, (inputs, outputs, targets, assessments, metrics, tags) in zip(
            logged_evaluations, evaluations_data
        ):
            # Assert the fields of the logged evaluation
            assert logged_evaluation.inputs == inputs
            assert logged_evaluation.outputs == outputs
            assert logged_evaluation.inputs_id == inputs_id
            assert logged_evaluation.request_id == request_id
            assert logged_evaluation.targets == targets

            # Build the expected key -> value mapping from the input data, not
            # from the logged evaluation itself, so the assertion is meaningful
            expected_metrics = (
                {metric.key: metric.value for metric in metrics}
                if isinstance(metrics, list) and isinstance(metrics[0], Metric)
                else metrics
            )
            assert {
                metric.key: metric.value for metric in logged_evaluation.metrics
            } == expected_metrics

            expected_tags = (
                {tag.key: tag.value for tag in tags}
                if isinstance(tags, list) and isinstance(tags[0], EvaluationTag)
                else tags
            )
            assert {tag.key: tag.value for tag in logged_evaluation.tags} == expected_tags

            assessment_entities = [
                Assessment.from_dictionary(assessment)._to_entity(
                    evaluation_id=logged_evaluation.evaluation_id
                )
                if isinstance(assessment, dict)
                else assessment._to_entity(evaluation_id=logged_evaluation.evaluation_id)
                for assessment in assessments
            ]

            for logged_assessment, assessment_entity in zip(
                logged_evaluation.assessments, assessment_entities
            ):
                assert logged_assessment.name == assessment_entity.name
                assert logged_assessment.boolean_value == assessment_entity.boolean_value
                assert logged_assessment.numeric_value == assessment_entity.numeric_value
                assert logged_assessment.string_value == assessment_entity.string_value
                assert logged_assessment.metadata == assessment_entity.metadata
                assert logged_assessment.source == assessment_entity.source

            retrieved_evaluation = get_evaluation(
                evaluation_id=logged_evaluation.evaluation_id, run_id=run_id
            )
            assert logged_evaluation == retrieved_evaluation


def test_log_evaluations_starts_run_if_not_started(end_run_at_test_end):
    inputs = {"feature1": 1.0, "feature2": {"nested_feature": 2.0}}
    outputs = {"prediction": 0.5}

    # Ensure there is no active run
    if mlflow.active_run() is not None:
        mlflow.end_run()

    # Log evaluation without explicitly starting a run
    logged_evaluation = log_evaluations(evaluations=[Evaluation(inputs=inputs, outputs=outputs)])[0]

    # Verify that a run has been started
    active_run = mlflow.active_run()
    assert active_run is not None, "Expected a run to be started automatically."

    # Retrieve the evaluation using the run ID
    retrieved_evaluation = get_evaluation(
        evaluation_id=logged_evaluation.evaluation_id, run_id=active_run.info.run_id
    )
    assert retrieved_evaluation == logged_evaluation


def test_evaluation_module_exposes_relevant_apis_for_logging():
    import mlflow.evaluation

    assert hasattr(mlflow.evaluation, "log_evaluations")
    assert hasattr(mlflow.evaluation, "Evaluation")
    assert hasattr(mlflow.evaluation, "Assessment")
    assert hasattr(mlflow.evaluation, "AssessmentSource")
    assert hasattr(mlflow.evaluation, "AssessmentSourceType")


def test_log_evaluations_works_properly_with_empty_evaluations_list():
    with mlflow.start_run():
        log_evaluations(evaluations=[])

        artifacts = mlflow.MlflowClient().list_artifacts(mlflow.active_run().info.run_id)
        assert len(artifacts) == 0
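

# A minimal additional sketch, not part of the original suite:
# test_log_evaluations_with_all_params feeds assessments both as plain
# dictionaries and as Assessment objects, normalizing the former via
# Assessment.from_dictionary. This sketch checks that the two forms converge
# on the same entity fields, assuming (as the assertions above imply) that a
# numeric "value" surfaces as numeric_value on the entity returned by
# _to_entity. The evaluation ID below is a hypothetical placeholder.
def test_assessment_dict_and_object_forms_normalize_equivalently():
    evaluation_id = "test-evaluation-id"  # hypothetical ID, only used for entity construction
    from_dict = Assessment.from_dictionary(
        {
            "name": "assessment1",
            "value": 1.0,
            "source": {"source_type": "HUMAN", "source_id": "user_1"},
        }
    )._to_entity(evaluation_id=evaluation_id)
    from_object = Assessment(
        name="assessment1",
        value=1.0,
        source=AssessmentSource(
            source_type=AssessmentSourceType.HUMAN, source_id="user_1"
        ),
    )._to_entity(evaluation_id=evaluation_id)

    assert from_dict.name == from_object.name
    assert from_dict.numeric_value == from_object.numeric_value
    assert from_dict.source == from_object.source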