/ tests / metrics / data_interity / test_column_regexp_metric.py
test_column_regexp_metric.py
  1  import json
  2  
  3  import numpy as np
  4  import pandas as pd
  5  import pytest
  6  
  7  from evidently.legacy.metrics import ColumnRegExpMetric
  8  from evidently.legacy.metrics.data_integrity.column_regexp_metric import DataIntegrityValueByRegexpMetricResult
  9  from evidently.legacy.metrics.data_integrity.column_regexp_metric import DataIntegrityValueByRegexpStat
 10  from evidently.legacy.pipeline.column_mapping import ColumnMapping
 11  from evidently.legacy.report import Report
 12  
 13  
 14  @pytest.mark.parametrize(
 15      "current_data, reference_data, column_name, reg_exp, expected_result",
 16      (
 17          (
 18              pd.DataFrame(
 19                  {
 20                      "category_feature": ["3", "a", "b5", "a", np.nan],
 21                      "target": [1, 2, 1, 2, 1],
 22                      "prediction": [1, 1, 1, 2, 2],
 23                  }
 24              ),
 25              None,
 26              "category_feature",
 27              r".*\d+.*",
 28              DataIntegrityValueByRegexpMetricResult(
 29                  column_name="category_feature",
 30                  reg_exp=r".*\d+.*",
 31                  top=10,
 32                  current=DataIntegrityValueByRegexpStat(
 33                      number_of_matched=2,
 34                      number_of_not_matched=2,
 35                      number_of_rows=5,
 36                      table_of_matched={"3": 1, "b5": 1},
 37                      table_of_not_matched={"a": 2},
 38                  ),
 39                  reference=None,
 40              ),
 41          ),
 42          (
 43              pd.DataFrame(
 44                  {
 45                      "feature": [" a", "a", "\tb", np.nan, np.nan],
 46                  }
 47              ),
 48              pd.DataFrame(
 49                  {
 50                      "feature": ["a", "a", "c"],
 51                  }
 52              ),
 53              "feature",
 54              r"^\s+.*",
 55              DataIntegrityValueByRegexpMetricResult(
 56                  column_name="feature",
 57                  reg_exp=r"^\s+.*",
 58                  top=10,
 59                  current=DataIntegrityValueByRegexpStat(
 60                      number_of_matched=2,
 61                      number_of_not_matched=1,
 62                      number_of_rows=5,
 63                      table_of_matched={" a": 1, "\tb": 1},
 64                      table_of_not_matched={"a": 1},
 65                  ),
 66                  reference=DataIntegrityValueByRegexpStat(
 67                      number_of_matched=0,
 68                      number_of_not_matched=3,
 69                      number_of_rows=3,
 70                      table_of_matched={},
 71                      table_of_not_matched={"a": 2, "c": 1},
 72                  ),
 73              ),
 74          ),
 75      ),
 76  )
 77  def test_column_regexp_metric_success(
 78      current_data: pd.DataFrame,
 79      reference_data: pd.DataFrame,
 80      column_name: str,
 81      reg_exp: str,
 82      expected_result: DataIntegrityValueByRegexpMetricResult,
 83  ) -> None:
 84      metric = ColumnRegExpMetric(column_name=column_name, reg_exp=reg_exp)
 85      report = Report(metrics=[metric])
 86      report.run(current_data=current_data, reference_data=reference_data, column_mapping=ColumnMapping())
 87      result = metric.get_result()
 88      assert result == expected_result
 89  
 90  
 91  @pytest.mark.parametrize(
 92      "current_data, reference_data, metric",
 93      (
 94          (
 95              pd.DataFrame(
 96                  {
 97                      "col": [1, 2, 1, 2, 1],
 98                  }
 99              ),
100              None,
101              ColumnRegExpMetric(column_name="test", reg_exp=r".*\d+.*"),
102          ),
103          (
104              pd.DataFrame(
105                  {
106                      "feature": [" a", "a", "\tb", np.nan, np.nan],
107                  }
108              ),
109              pd.DataFrame(
110                  {
111                      "test": ["a", "a", "c"],
112                  }
113              ),
114              ColumnRegExpMetric(column_name="feature", reg_exp=r".*\d+.*"),
115          ),
116          (
117              pd.DataFrame(
118                  {
119                      "col": [1, 2, 1, 2, 1],
120                  }
121              ),
122              None,
123              ColumnRegExpMetric(column_name="col", reg_exp=""),
124          ),
125          (
126              pd.DataFrame(
127                  {
128                      "col": [1, 2, 1, 2, 1],
129                  }
130              ),
131              None,
132              ColumnRegExpMetric(column_name="col", reg_exp=r"\d*", top=0),
133          ),
134      ),
135  )
136  def test_column_regexp_metric_value_error(
137      current_data: pd.DataFrame, reference_data: pd.DataFrame, metric: ColumnRegExpMetric
138  ) -> None:
139      with pytest.raises(ValueError):
140          report = Report(metrics=[metric])
141          report.run(current_data=current_data, reference_data=reference_data, column_mapping=ColumnMapping())
142          metric.get_result()
143  
144  
@pytest.mark.parametrize(
    "current_data, reference_data, metric, expected_json",
    [
        # Integer column without reference data: every row matches and the
        # values show up as string keys in the expected table.
        pytest.param(
            pd.DataFrame(
                {
                    "col": [1, 2, 1, 2, 1],
                }
            ),
            None,
            ColumnRegExpMetric(column_name="col", reg_exp=r".*\d+.*"),
            {
                "column_name": "col",
                "current": {
                    "number_of_matched": 5,
                    "number_of_not_matched": 0,
                    "number_of_rows": 5,
                    "table_of_matched": {"1": 3, "2": 2},
                    "table_of_not_matched": {},
                },
                "reference": None,
                "reg_exp": ".*\\d+.*",
                "top": 10,
            },
        ),
        # Current data holds only NaN values: rows are counted, both tables stay empty.
        pytest.param(
            pd.DataFrame(
                {
                    "feature": [np.nan, np.nan],
                }
            ),
            pd.DataFrame(
                {
                    "feature": ["a", "a", "c"],
                }
            ),
            ColumnRegExpMetric(column_name="feature", reg_exp=r".*a+.*"),
            {
                "column_name": "feature",
                "current": {
                    "number_of_matched": 0,
                    "number_of_not_matched": 0,
                    "number_of_rows": 2,
                    "table_of_matched": {},
                    "table_of_not_matched": {},
                },
                "reference": {
                    "number_of_matched": 2,
                    "number_of_not_matched": 1,
                    "number_of_rows": 3,
                    "table_of_matched": {"a": 2},
                    "table_of_not_matched": {"c": 1},
                },
                "reg_exp": ".*a+.*",
                "top": 10,
            },
        ),
        # Mixed column with top=3: the counts cover all rows, while each table
        # is expected to list only three entries.
        pytest.param(
            pd.DataFrame(
                {
                    "col": [1, 2, 3, 4, 5, "a", "b", "c", 1, 1234567890, "a", "a", "d", "e", "f"],
                }
            ),
            None,
            ColumnRegExpMetric(column_name="col", reg_exp=r"\d", top=3),
            {
                "column_name": "col",
                "current": {
                    "number_of_matched": 7,
                    "number_of_not_matched": 8,
                    "number_of_rows": 15,
                    "table_of_matched": {"1": 2, "2": 1, "3": 1},
                    "table_of_not_matched": {"a": 3, "b": 1, "c": 1},
                },
                "reference": None,
                "reg_exp": "\\d",
                "top": 3,
            },
        ),
    ],
)
def test_column_regexp_metric_with_report(
    current_data: pd.DataFrame, reference_data: pd.DataFrame, metric: ColumnRegExpMetric, expected_json: dict
) -> None:
    """Check the rendered and JSON output of a report containing the metric.

    After running the report, the test asserts that ``show()`` returns a truthy
    value, that the JSON dump is non-empty, and that the first entry under
    ``"metrics"`` is named ``"ColumnRegExpMetric"`` with a ``"result"`` payload
    equal to ``expected_json``.
    """
    regexp_report = Report(metrics=[metric])
    regexp_report.run(
        current_data=current_data,
        reference_data=reference_data,
        column_mapping=ColumnMapping(),
    )
    assert regexp_report.show()

    serialized = regexp_report.json()
    assert len(serialized) > 0

    # Only the first (and only) metric entry of the report is inspected.
    first_metric = json.loads(serialized)["metrics"][0]
    assert first_metric["metric"] == "ColumnRegExpMetric"
    assert first_metric["result"] == expected_json