test_column_regexp_metric.py
"""Tests for the legacy ``ColumnRegExpMetric`` data-integrity metric."""

import json
from typing import Optional

import numpy as np
import pandas as pd
import pytest

from evidently.legacy.metrics import ColumnRegExpMetric
from evidently.legacy.metrics.data_integrity.column_regexp_metric import DataIntegrityValueByRegexpMetricResult
from evidently.legacy.metrics.data_integrity.column_regexp_metric import DataIntegrityValueByRegexpStat
from evidently.legacy.pipeline.column_mapping import ColumnMapping
from evidently.legacy.report import Report


@pytest.mark.parametrize(
    "current_data, reference_data, column_name, reg_exp, expected_result",
    (
        # Current data only: the NaN row is included in number_of_rows but is
        # counted neither as matched nor as not matched.
        (
            pd.DataFrame(
                {
                    "category_feature": ["3", "a", "b5", "a", np.nan],
                    "target": [1, 2, 1, 2, 1],
                    "prediction": [1, 1, 1, 2, 2],
                }
            ),
            None,
            "category_feature",
            r".*\d+.*",
            DataIntegrityValueByRegexpMetricResult(
                column_name="category_feature",
                reg_exp=r".*\d+.*",
                top=10,
                current=DataIntegrityValueByRegexpStat(
                    number_of_matched=2,
                    number_of_not_matched=2,
                    number_of_rows=5,
                    table_of_matched={"3": 1, "b5": 1},
                    table_of_not_matched={"a": 2},
                ),
                reference=None,
            ),
        ),
        # Current and reference data: statistics are reported separately for
        # each dataset; the reference has no value with leading whitespace.
        (
            pd.DataFrame(
                {
                    "feature": [" a", "a", "\tb", np.nan, np.nan],
                }
            ),
            pd.DataFrame(
                {
                    "feature": ["a", "a", "c"],
                }
            ),
            "feature",
            r"^\s+.*",
            DataIntegrityValueByRegexpMetricResult(
                column_name="feature",
                reg_exp=r"^\s+.*",
                top=10,
                current=DataIntegrityValueByRegexpStat(
                    number_of_matched=2,
                    number_of_not_matched=1,
                    number_of_rows=5,
                    table_of_matched={" a": 1, "\tb": 1},
                    table_of_not_matched={"a": 1},
                ),
                reference=DataIntegrityValueByRegexpStat(
                    number_of_matched=0,
                    number_of_not_matched=3,
                    number_of_rows=3,
                    table_of_matched={},
                    table_of_not_matched={"a": 2, "c": 1},
                ),
            ),
        ),
    ),
)
def test_column_regexp_metric_success(
    current_data: pd.DataFrame,
    reference_data: Optional[pd.DataFrame],
    column_name: str,
    reg_exp: str,
    expected_result: DataIntegrityValueByRegexpMetricResult,
) -> None:
    """Run the metric inside a report and compare the typed result object.

    ``reference_data`` is ``None`` for the cases that supply current data only.
    """
    metric = ColumnRegExpMetric(column_name=column_name, reg_exp=reg_exp)
    report = Report(metrics=[metric])
    report.run(current_data=current_data, reference_data=reference_data, column_mapping=ColumnMapping())
    result = metric.get_result()
    assert result == expected_result


@pytest.mark.parametrize(
    "current_data, reference_data, metric",
    (
        # The requested column "test" is not present in the current data.
        (
            pd.DataFrame(
                {
                    "col": [1, 2, 1, 2, 1],
                }
            ),
            None,
            ColumnRegExpMetric(column_name="test", reg_exp=r".*\d+.*"),
        ),
        # The requested column "feature" is not present in the reference data.
        (
            pd.DataFrame(
                {
                    "feature": [" a", "a", "\tb", np.nan, np.nan],
                }
            ),
            pd.DataFrame(
                {
                    "test": ["a", "a", "c"],
                }
            ),
            ColumnRegExpMetric(column_name="feature", reg_exp=r".*\d+.*"),
        ),
        # Empty regular expression.
        (
            pd.DataFrame(
                {
                    "col": [1, 2, 1, 2, 1],
                }
            ),
            None,
            ColumnRegExpMetric(column_name="col", reg_exp=""),
        ),
        # ``top`` set to zero.
        (
            pd.DataFrame(
                {
                    "col": [1, 2, 1, 2, 1],
                }
            ),
            None,
            ColumnRegExpMetric(column_name="col", reg_exp=r"\d*", top=0),
        ),
    ),
)
def test_column_regexp_metric_value_error(
    current_data: pd.DataFrame, reference_data: Optional[pd.DataFrame], metric: ColumnRegExpMetric
) -> None:
    """Check that each invalid configuration ends in a ``ValueError``."""
    # Report construction, the run and the result lookup all stay inside the
    # context manager: the test only requires that one of them raises.
    with pytest.raises(ValueError):
        report = Report(metrics=[metric])
        report.run(current_data=current_data, reference_data=reference_data, column_mapping=ColumnMapping())
        metric.get_result()


@pytest.mark.parametrize(
    "current_data, reference_data, metric, expected_json",
    (
        # Integer column: the value tables in the JSON output use string keys.
        (
            pd.DataFrame(
                {
                    "col": [1, 2, 1, 2, 1],
                }
            ),
            None,
            ColumnRegExpMetric(column_name="col", reg_exp=r".*\d+.*"),
            {
                "column_name": "col",
                "current": {
                    "number_of_matched": 5,
                    "number_of_not_matched": 0,
                    "number_of_rows": 5,
                    "table_of_matched": {"1": 3, "2": 2},
                    "table_of_not_matched": {},
                },
                "reference": None,
                "reg_exp": ".*\\d+.*",
                "top": 10,
            },
        ),
        # Current data holding only NaN: both counters are zero while
        # number_of_rows still reports the two rows.
        (
            pd.DataFrame(
                {
                    "feature": [np.nan, np.nan],
                }
            ),
            pd.DataFrame(
                {
                    "feature": ["a", "a", "c"],
                }
            ),
            ColumnRegExpMetric(column_name="feature", reg_exp=r".*a+.*"),
            {
                "column_name": "feature",
                "current": {
                    "number_of_matched": 0,
                    "number_of_not_matched": 0,
                    "number_of_rows": 2,
                    "table_of_matched": {},
                    "table_of_not_matched": {},
                },
                "reference": {
                    "number_of_matched": 2,
                    "number_of_not_matched": 1,
                    "number_of_rows": 3,
                    "table_of_matched": {"a": 2},
                    "table_of_not_matched": {"c": 1},
                },
                "reg_exp": ".*a+.*",
                "top": 10,
            },
        ),
        # Mixed int/str column with top=3: each value table is limited to
        # three entries while the counters cover every row.
        (
            pd.DataFrame(
                {
                    "col": [1, 2, 3, 4, 5, "a", "b", "c", 1, 1234567890, "a", "a", "d", "e", "f"],
                }
            ),
            None,
            ColumnRegExpMetric(column_name="col", reg_exp=r"\d", top=3),
            {
                "column_name": "col",
                "current": {
                    "number_of_matched": 7,
                    "number_of_not_matched": 8,
                    "number_of_rows": 15,
                    "table_of_matched": {"1": 2, "2": 1, "3": 1},
                    "table_of_not_matched": {"a": 3, "b": 1, "c": 1},
                },
                "reference": None,
                "reg_exp": "\\d",
                "top": 3,
            },
        ),
    ),
)
def test_column_regexp_metric_with_report(
    current_data: pd.DataFrame,
    reference_data: Optional[pd.DataFrame],
    metric: ColumnRegExpMetric,
    expected_json: dict,
) -> None:
    """Check the report rendering and the metric's JSON payload."""
    report = Report(metrics=[metric])
    report.run(current_data=current_data, reference_data=reference_data, column_mapping=ColumnMapping())
    assert report.show()
    json_result = report.json()
    assert len(json_result) > 0
    result = json.loads(json_result)
    assert result["metrics"][0]["metric"] == "ColumnRegExpMetric"
    assert result["metrics"][0]["result"] == expected_json