test_column_value_list_metric.py
import json
from typing import Optional

import numpy as np
import pandas as pd
import pytest

from evidently._pydantic_compat import parse_obj_as
from evidently.legacy.metrics import ColumnValueListMetric
from evidently.legacy.metrics.data_quality.column_value_list_metric import ColumnValueListMetricResult
from evidently.legacy.metrics.data_quality.column_value_list_metric import ValueListStat
from evidently.legacy.pipeline.column_mapping import ColumnMapping
from evidently.legacy.report import Report


@pytest.mark.parametrize(
    "current_dataset, reference_dataset, metric, expected_result",
    (
        # Empty column: every counter and share is expected to be zero and both value lists empty.
        (
            pd.DataFrame({"category_feature": []}),
            None,
            ColumnValueListMetric(column_name="category_feature", values=["test"]),
            ColumnValueListMetricResult(
                column_name="category_feature",
                values=["test"],
                current=ValueListStat(
                    number_in_list=0,
                    number_not_in_list=0,
                    share_in_list=0,
                    share_not_in_list=0,
                    values_in_list=[],
                    values_not_in_list=[],
                    rows_count=0,
                ),
                reference=None,
            ),
        ),
        # All-NaN column: the three rows are counted as "not in list",
        # yet no entry is expected in values_not_in_list.
        (
            pd.DataFrame({"category_feature": [np.nan, np.nan, np.nan]}),
            None,
            ColumnValueListMetric(column_name="category_feature", values=["test"]),
            ColumnValueListMetricResult(
                column_name="category_feature",
                values=["test"],
                current=ValueListStat(
                    number_in_list=0,
                    number_not_in_list=3,
                    share_in_list=0,
                    share_not_in_list=1,
                    values_in_list=[("test", 0)],
                    values_not_in_list=[],
                    rows_count=3,
                ),
                reference=None,
            ),
        ),
        # Numeric column with one NaN: the NaN row is counted in number_not_in_list,
        # and a listed value that never occurs (3) is reported with a zero count.
        (
            pd.DataFrame({"category_feature": [1, np.nan, 1, 2]}),
            None,
            ColumnValueListMetric(column_name="category_feature", values=[1, 2, 3]),
            ColumnValueListMetricResult(
                column_name="category_feature",
                values=[1, 2, 3],
                current=ValueListStat(
                    number_in_list=3,
                    number_not_in_list=1,
                    share_in_list=0.75,
                    share_not_in_list=0.25,
                    values_in_list=[(1, 2), (2, 1), (3, 0)],
                    values_not_in_list=[],
                    rows_count=4,
                ),
                reference=None,
            ),
        ),
        # String column checked against a single allowed value.
        (
            pd.DataFrame({"category_feature": ["n", "d", "p", "n"], "numerical_feature": [0, 2, 2, 432]}),
            None,
            ColumnValueListMetric(column_name="category_feature", values=["d"]),
            ColumnValueListMetricResult(
                column_name="category_feature",
                values=["d"],
                current=ValueListStat(
                    number_in_list=1,
                    number_not_in_list=3,
                    share_in_list=0.25,
                    share_not_in_list=0.75,
                    values_in_list=[("d", 1)],
                    values_not_in_list=[("n", 2), ("p", 1)],
                    rows_count=4,
                ),
                reference=None,
            ),
        ),
        # Integer column checked against a single allowed value.
        (
            pd.DataFrame({"category_feature": ["n", "d", "p", "n"], "numerical_feature": [0, 2, 2, 432]}),
            None,
            ColumnValueListMetric(column_name="numerical_feature", values=[2]),
            ColumnValueListMetricResult(
                column_name="numerical_feature",
                values=[2],
                current=ValueListStat(
                    number_in_list=2,
                    number_not_in_list=2,
                    share_in_list=0.5,
                    share_not_in_list=0.5,
                    values_in_list=[(2, 2)],
                    values_not_in_list=[(0, 1), (432, 1)],
                    rows_count=4,
                ),
                reference=None,
            ),
        ),
        # No explicit values: the expected values equal the distinct entries of the
        # reference column, and statistics are expected for both datasets.
        (
            pd.DataFrame({"category_feature": ["n", "d", "p", "n"], "numerical_feature": [0, 2, 2, 432]}),
            pd.DataFrame({"category_feature": ["n", "d", "p", "n"], "numerical_feature": [0, 2, 2, 432]}),
            ColumnValueListMetric(column_name="category_feature"),
            ColumnValueListMetricResult(
                column_name="category_feature",
                values=["n", "d", "p"],
                current=ValueListStat(
                    number_in_list=4,
                    number_not_in_list=0,
                    share_in_list=1.0,
                    share_not_in_list=0.0,
                    values_in_list=[("n", 2), ("d", 1), ("p", 1)],
                    values_not_in_list=[],
                    rows_count=4,
                ),
                reference=ValueListStat(
                    number_in_list=4,
                    number_not_in_list=0,
                    share_in_list=1.0,
                    share_not_in_list=0.0,
                    values_in_list=[("n", 2), ("d", 1), ("p", 1)],
                    values_not_in_list=[],
                    rows_count=4,
                ),
            ),
        ),
    ),
)
def test_data_quality_value_list_metric_success(
    current_dataset: pd.DataFrame,
    reference_dataset: Optional[pd.DataFrame],
    metric: ColumnValueListMetric,
    expected_result: ColumnValueListMetricResult,
) -> None:
    """Run the metric inside a report and compare its result with the expected one.

    Args:
        current_dataset: Data passed to the report as ``current_data``.
        reference_dataset: Data passed as ``reference_data``; ``None`` when the case has no reference.
        metric: The metric instance under test.
        expected_result: The result object the metric must be equal to.
    """
    data_mapping = ColumnMapping()
    report = Report(metrics=[metric])
    report.run(current_data=current_dataset, reference_data=reference_dataset, column_mapping=data_mapping)
    result = metric.get_result()
    assert result == expected_result


@pytest.mark.parametrize(
    "current_dataset, reference_dataset, metric, error_message",
    (
        # The requested column is absent from the current data.
        (
            pd.DataFrame({"feature": [1, 2, 3]}),
            None,
            ColumnValueListMetric(column_name="test", values=[1]),
            "Column 'test' is not in current data.",
        ),
        # The requested column is absent from the reference data.
        (
            pd.DataFrame({"test": [1, 2, 3]}),
            pd.DataFrame({"feature": [1, 2, 3]}),
            ColumnValueListMetric(column_name="test"),
            "Column 'test' is not in reference data.",
        ),
        # Neither a values list nor a reference dataset is supplied.
        (
            pd.DataFrame({"test": ["a", "b", "c"]}),
            None,
            ColumnValueListMetric(column_name="test"),
            "Reference or values list should be present.",
        ),
        # An explicitly empty values list is rejected even though a reference is present.
        (
            pd.DataFrame({"feature": [1, 2, 3]}),
            pd.DataFrame({"feature": [1, 2, "a"]}),
            ColumnValueListMetric(column_name="feature", values=[]),
            "Values list should not be empty.",
        ),
        # Same as above, with a reference column that holds only NaN.
        (
            pd.DataFrame({"feature": [1, 2, 3]}),
            pd.DataFrame({"feature": [np.nan]}),
            ColumnValueListMetric(column_name="feature", values=[]),
            "Values list should not be empty.",
        ),
    ),
)
def test_data_quality_value_list_metric_value_errors(
    current_dataset: pd.DataFrame,
    reference_dataset: Optional[pd.DataFrame],
    metric: ColumnValueListMetric,
    error_message: str,
) -> None:
    """Check that invalid inputs surface as a ``ValueError`` with the exact expected message.

    Args:
        current_dataset: Data passed to the report as ``current_data``.
        reference_dataset: Data passed as ``reference_data``; ``None`` when the case has no reference.
        metric: The metric instance under test.
        error_message: The message expected as the first argument of the raised error.
    """
    # Building the report is setup, not the behaviour under test, so it stays outside the raises block.
    report = Report(metrics=[metric])

    # Both calls stay inside the block: the error may come from either the run or the result access.
    with pytest.raises(ValueError) as error:
        report.run(current_data=current_dataset, reference_data=reference_dataset, column_mapping=ColumnMapping())
        metric.get_result()

    assert error.value.args[0] == error_message


@pytest.mark.parametrize(
    "current_data, reference_data, metric, old_json",
    (
        # Current data only, explicit values list.
        (
            pd.DataFrame({"col": [1, 2, 3]}),
            None,
            ColumnValueListMetric(column_name="col", values=[1]),
            {
                "column_name": "col",
                "current": {
                    "number_in_list": 1,
                    "number_not_in_list": 2,
                    "rows_count": 3,
                    "share_in_list": 0.3333333333333333,
                    "share_not_in_list": 0.6666666666666666,
                    "values_in_list": [[1, 1]],
                    "values_not_in_list": [[2, 1], [3, 1]],
                },
                "reference": None,
                "values": [1],
            },
        ),
        # Reference data present, no explicit values list; current and reference share no values.
        (
            pd.DataFrame({"col1": [1, 2, 3], "col2": [10, 20, 3.5]}),
            pd.DataFrame(
                {
                    "col1": [10, 20, 3.5],
                    "col2": [1, 2, 3],
                }
            ),
            ColumnValueListMetric(column_name="col1"),
            {
                "column_name": "col1",
                "current": {
                    "number_in_list": 0,
                    "number_not_in_list": 3,
                    "rows_count": 3,
                    "share_in_list": 0.0,
                    "share_not_in_list": 1.0,
                    "values_in_list": [[10.0, 0], [20.0, 0], [3.5, 0]],
                    "values_not_in_list": [[1, 1], [2, 1], [3, 1]],
                },
                "reference": {
                    "number_in_list": 3,
                    "number_not_in_list": 0,
                    "rows_count": 3,
                    "share_in_list": 1.0,
                    "share_not_in_list": 0.0,
                    "values_in_list": [[10.0, 1], [20.0, 1], [3.5, 1]],
                    "values_not_in_list": [],
                },
                "values": [10.0, 20.0, 3.5],
            },
        ),
    ),
)
def test_data_quality_value_list_metric_with_report_compat(
    current_data: pd.DataFrame,
    reference_data: Optional[pd.DataFrame],
    metric: ColumnValueListMetric,
    old_json: dict,
) -> None:
    """Check that a dict using list-of-pairs value lists parses into an equal result object.

    The payload stores ``values_in_list`` / ``values_not_in_list`` as ``[value, count]`` pairs
    (presumably an earlier serialization layout, judging by the ``old_json`` name). It is parsed
    with ``parse_obj_as`` and compared with the result computed by the report run.

    Args:
        current_data: Data passed to the report as ``current_data``.
        reference_data: Data passed as ``reference_data``; ``None`` when the case has no reference.
        metric: The metric instance under test.
        old_json: Dict payload to parse into ``ColumnValueListMetricResult``.
    """
    report = Report(metrics=[metric])
    report.run(current_data=current_data, reference_data=reference_data, column_mapping=ColumnMapping())

    result = parse_obj_as(ColumnValueListMetricResult, old_json)
    assert metric.get_result() == result


@pytest.mark.parametrize(
    "current_data, reference_data, metric, expected_json",
    (
        # Current data only, explicit values list.
        (
            pd.DataFrame({"col": [1, 2, 3]}),
            None,
            ColumnValueListMetric(column_name="col", values=[1]),
            {
                "column_name": "col",
                "current": {
                    "number_in_list": 1,
                    "number_not_in_list": 2,
                    "rows_count": 3,
                    "share_in_list": 0.3333333333333333,
                    "share_not_in_list": 0.6666666666666666,
                    "values_in_list_dist": {"x": [1], "y": [1]},
                    "values_not_in_list_dist": {"x": [2, 3], "y": [1, 1]},
                },
                "reference": None,
                "values": [1],
            },
        ),
        # Reference data present, no explicit values list; current and reference share no values.
        (
            pd.DataFrame({"col1": [1, 2, 3], "col2": [10, 20, 3.5]}),
            pd.DataFrame(
                {
                    "col1": [10, 20, 3.5],
                    "col2": [1, 2, 3],
                }
            ),
            ColumnValueListMetric(column_name="col1"),
            {
                "column_name": "col1",
                "current": {
                    "number_in_list": 0,
                    "number_not_in_list": 3,
                    "rows_count": 3,
                    "share_in_list": 0.0,
                    "share_not_in_list": 1.0,
                    "values_in_list_dist": {"x": [10.0, 20.0, 3.5], "y": [0, 0, 0]},
                    "values_not_in_list_dist": {"x": [1, 2, 3], "y": [1, 1, 1]},
                },
                "reference": {
                    "number_in_list": 3,
                    "number_not_in_list": 0,
                    "rows_count": 3,
                    "share_in_list": 1.0,
                    "share_not_in_list": 0.0,
                    "values_in_list_dist": {"x": [10.0, 20.0, 3.5], "y": [1, 1, 1]},
                    "values_not_in_list_dist": {"x": [], "y": []},
                },
                "values": [10.0, 20.0, 3.5],
            },
        ),
    ),
)
def test_data_quality_value_list_metric_with_report(
    current_data: pd.DataFrame,
    reference_data: Optional[pd.DataFrame],
    metric: ColumnValueListMetric,
    expected_json: dict,
) -> None:
    """Check the report rendering and the JSON export of the metric.

    The JSON export is expected to describe the value lists as ``{"x": [...], "y": [...]}``
    mappings under the ``*_dist`` keys.

    Args:
        current_data: Data passed to the report as ``current_data``.
        reference_data: Data passed as ``reference_data``; ``None`` when the case has no reference.
        metric: The metric instance under test.
        expected_json: Expected content of the metric's ``result`` entry in the exported JSON.
    """
    report = Report(metrics=[metric])
    report.run(current_data=current_data, reference_data=reference_data, column_mapping=ColumnMapping())
    assert report.show()
    result_json = report.json()
    assert len(result_json) > 0
    result = json.loads(result_json)
    assert result["metrics"][0]["metric"] == "ColumnValueListMetric"
    assert result["metrics"][0]["result"] == expected_json