"""Tests for ``mlflow.evaluation.utils.evaluations_to_dataframes``.

Each test builds one or more ``EvaluationEntity`` objects and verifies the
four DataFrames (evaluations, metrics, assessments, tags) produced by
``evaluations_to_dataframes``.
"""

from mlflow.entities import Metric
from mlflow.evaluation.assessment import AssessmentEntity, AssessmentSource
from mlflow.evaluation.evaluation import EvaluationEntity
from mlflow.evaluation.evaluation_tag import EvaluationTag
from mlflow.evaluation.utils import evaluations_to_dataframes


def test_evaluations_to_dataframes_basic():
    # Setup an evaluation with minimal data
    evaluation = EvaluationEntity(
        evaluation_id="eval1",
        run_id="run1",
        inputs_id="inputs1",
        inputs={"feature1": 1.0, "feature2": 2.0},
    )

    evaluations_df, metrics_df, assessments_df, tags_df = evaluations_to_dataframes([evaluation])

    # Check the evaluations DataFrame
    assert len(evaluations_df) == 1
    assert evaluations_df["evaluation_id"].iloc[0] == "eval1"
    assert evaluations_df["run_id"].iloc[0] == "run1"
    assert evaluations_df["inputs_id"].iloc[0] == "inputs1"
    assert evaluations_df["inputs"].iloc[0] == {"feature1": 1.0, "feature2": 2.0}

    # Check that the other DataFrames are empty
    assert metrics_df.empty
    assert assessments_df.empty
    assert tags_df.empty


def test_evaluations_to_dataframes_full_data():
    # Setup an evaluation with full data
    source = AssessmentSource(source_type="HUMAN", source_id="user_1")
    assessment = AssessmentEntity(
        evaluation_id="eval1",
        name="accuracy",
        source=source,
        timestamp=123456789,
        numeric_value=0.95,
        rationale="Good performance",
    )
    metric = Metric(key="metric1", value=0.9, timestamp=1234567890, step=0)
    tag = EvaluationTag(key="tag1", value="value1")

    evaluation = EvaluationEntity(
        evaluation_id="eval1",
        run_id="run1",
        inputs_id="inputs1",
        inputs={"feature1": 1.0, "feature2": 2.0},
        outputs={"output1": 0.5},
        request_id="request1",
        targets={"target1": 0.6},
        error_code="E001",
        error_message="An error occurred",
        assessments=[assessment],
        metrics=[metric],
        tags=[tag],
    )

    evaluations_df, metrics_df, assessments_df, tags_df = evaluations_to_dataframes([evaluation])

    # Check the evaluations DataFrame
    assert len(evaluations_df) == 1
    assert evaluations_df["evaluation_id"].iloc[0] == "eval1"
    assert evaluations_df["run_id"].iloc[0] == "run1"
    assert evaluations_df["inputs_id"].iloc[0] == "inputs1"
    assert evaluations_df["inputs"].iloc[0] == {"feature1": 1.0, "feature2": 2.0}
    assert evaluations_df["outputs"].iloc[0] == {"output1": 0.5}
    assert evaluations_df["request_id"].iloc[0] == "request1"
    assert evaluations_df["targets"].iloc[0] == {"target1": 0.6}
    assert evaluations_df["error_code"].iloc[0] == "E001"
    assert evaluations_df["error_message"].iloc[0] == "An error occurred"

    # Check the metrics DataFrame
    assert len(metrics_df) == 1
    assert metrics_df["evaluation_id"].iloc[0] == "eval1"
    assert metrics_df["key"].iloc[0] == "metric1"
    assert metrics_df["value"].iloc[0] == 0.9
    assert metrics_df["timestamp"].iloc[0] == 1234567890

    # Check the assessments DataFrame
    assert len(assessments_df) == 1
    assert assessments_df["evaluation_id"].iloc[0] == "eval1"
    assert assessments_df["name"].iloc[0] == "accuracy"
    assert assessments_df["source"].iloc[0] == source.to_dictionary()
    assert assessments_df["boolean_value"].iloc[0] is None
    assert assessments_df["numeric_value"].iloc[0] == 0.95
    assert assessments_df["string_value"].iloc[0] is None
    assert assessments_df["rationale"].iloc[0] == "Good performance"
    assert assessments_df["error_code"].iloc[0] is None
    assert assessments_df["error_message"].iloc[0] is None

    # Check the tags DataFrame
    assert len(tags_df) == 1
    assert tags_df["evaluation_id"].iloc[0] == "eval1"
    assert tags_df["key"].iloc[0] == "tag1"
    assert tags_df["value"].iloc[0] == "value1"


def test_evaluations_to_dataframes_empty():
    # Empty evaluations list
    evaluations_df, metrics_df, assessments_df, tags_df = evaluations_to_dataframes([])

    # Verify that the DataFrames are empty
    assert evaluations_df.empty
    assert metrics_df.empty
    assert assessments_df.empty
    assert tags_df.empty

    # Verify the column names of the empty DataFrames
    expected_evaluation_columns = [
        "evaluation_id",
        "run_id",
        "inputs_id",
        "inputs",
        "outputs",
        "request_id",
        "targets",
        "error_code",
        "error_message",
    ]
    expected_metrics_columns = [
        "evaluation_id",
        "key",
        "value",
        "timestamp",
        "model_id",
        "dataset_name",
        "dataset_digest",
        "run_id",
    ]
    expected_assessments_columns = [
        "evaluation_id",
        "name",
        "source",
        "timestamp",
        "boolean_value",
        "numeric_value",
        "string_value",
        "rationale",
        "metadata",
        "error_code",
        "error_message",
        "span_id",
    ]
    expected_tags_columns = ["evaluation_id", "key", "value"]

    assert list(evaluations_df.columns) == expected_evaluation_columns
    assert list(metrics_df.columns) == expected_metrics_columns
    assert list(assessments_df.columns) == expected_assessments_columns
    assert list(tags_df.columns) == expected_tags_columns


# NOTE(review): the original file contained a second, truncated copy of
# test_evaluations_to_dataframes_basic here (it ended mid-comment and was
# missing its empty-DataFrame asserts). Because it shared a name with the
# first copy, pytest would collect only the broken duplicate and silently
# skip the complete one. The duplicate was a strict subset of the first
# definition, so it has been removed.


def test_evaluations_to_dataframes_different_assessments():
    # Different types of assessments in evaluations
    source = AssessmentSource(source_type="HUMAN", source_id="user_1")
    assessment_1 = AssessmentEntity(
        evaluation_id="eval1",
        name="accuracy",
        source=source,
        timestamp=123456789,
        numeric_value=0.95,
        rationale="Good performance",
    )
    assessment_2 = AssessmentEntity(
        evaluation_id="eval1",
        name="precision",
        source=source,
        timestamp=123456789,
        numeric_value=0.85,
        rationale="Reasonable performance",
    )

    evaluation = EvaluationEntity(
        evaluation_id="eval1",
        run_id="run1",
        inputs_id="inputs1",
        inputs={"feature1": 1.0, "feature2": 2.0},
        assessments=[assessment_1, assessment_2],
    )

    evaluations_df, metrics_df, assessments_df, tags_df = evaluations_to_dataframes([evaluation])

    # Check the assessments DataFrame
    assert len(assessments_df) == 2
    assert assessments_df["evaluation_id"].iloc[0] == "eval1"
    assert assessments_df["name"].iloc[0] == "accuracy"
    assert assessments_df["numeric_value"].iloc[0] == 0.95

    assert assessments_df["evaluation_id"].iloc[1] == "eval1"
    assert assessments_df["name"].iloc[1] == "precision"
    assert assessments_df["numeric_value"].iloc[1] == 0.85


def test_evaluations_to_dataframes_different_metrics():
    # Different types of metrics in evaluations
    metric_1 = Metric(key="metric1", value=0.9, timestamp=1234567890, step=0)
    metric_2 = Metric(key="metric2", value=0.8, timestamp=1234567891, step=0)

    evaluation = EvaluationEntity(
        evaluation_id="eval1",
        run_id="run1",
        inputs_id="inputs1",
        inputs={"feature1": 1.0, "feature2": 2.0},
        metrics=[metric_1, metric_2],
    )

    evaluations_df, metrics_df, assessments_df, tags_df = evaluations_to_dataframes([evaluation])

    # Check the metrics DataFrame
    assert len(metrics_df) == 2
    assert metrics_df["evaluation_id"].iloc[0] == "eval1"
    assert metrics_df["key"].iloc[0] == "metric1"
    assert metrics_df["value"].iloc[0] == 0.9
    assert metrics_df["timestamp"].iloc[0] == 1234567890

    assert metrics_df["evaluation_id"].iloc[1] == "eval1"
    assert metrics_df["key"].iloc[1] == "metric2"
    assert metrics_df["value"].iloc[1] == 0.8
    assert metrics_df["timestamp"].iloc[1] == 1234567891


def test_evaluations_to_dataframes_different_tags():
    # Different tags in evaluations
    tag1 = EvaluationTag(key="tag1", value="value1")
    tag2 = EvaluationTag(key="tag2", value="value2")

    evaluation = EvaluationEntity(
        evaluation_id="eval1",
        run_id="run1",
        inputs_id="inputs1",
        inputs={"feature1": 1.0, "feature2": 2.0},
        tags=[tag1, tag2],
    )

    evaluations_df, metrics_df, assessments_df, tags_df = evaluations_to_dataframes([evaluation])

    # Check the tags DataFrame
    assert len(tags_df) == 2
    assert tags_df["evaluation_id"].iloc[0] == "eval1"
    assert tags_df["key"].iloc[0] == "tag1"
    assert tags_df["value"].iloc[0] == "value1"

    assert tags_df["evaluation_id"].iloc[1] == "eval1"
    assert tags_df["key"].iloc[1] == "tag2"
    assert tags_df["value"].iloc[1] == "value2"