/ tests / metrics / data_quality / test_column_value_list_metric.py
test_column_value_list_metric.py
  1  import json
  2  from typing import Optional
  3  
  4  import numpy as np
  5  import pandas as pd
  6  import pytest
  7  
  8  from evidently._pydantic_compat import parse_obj_as
  9  from evidently.legacy.metrics import ColumnValueListMetric
 10  from evidently.legacy.metrics.data_quality.column_value_list_metric import ColumnValueListMetricResult
 11  from evidently.legacy.metrics.data_quality.column_value_list_metric import ValueListStat
 12  from evidently.legacy.pipeline.column_mapping import ColumnMapping
 13  from evidently.legacy.report import Report
 14  
 15  
 16  @pytest.mark.parametrize(
 17      "current_dataset, reference_dataset, metric, expected_result",
 18      (
 19          (
 20              pd.DataFrame({"category_feature": []}),
 21              None,
 22              ColumnValueListMetric(column_name="category_feature", values=["test"]),
 23              ColumnValueListMetricResult(
 24                  column_name="category_feature",
 25                  values=["test"],
 26                  current=ValueListStat(
 27                      number_in_list=0,
 28                      number_not_in_list=0,
 29                      share_in_list=0,
 30                      share_not_in_list=0,
 31                      values_in_list=[],
 32                      values_not_in_list=[],
 33                      rows_count=0,
 34                  ),
 35                  reference=None,
 36              ),
 37          ),
 38          (
 39              pd.DataFrame({"category_feature": [np.nan, np.nan, np.nan]}),
 40              None,
 41              ColumnValueListMetric(column_name="category_feature", values=["test"]),
 42              ColumnValueListMetricResult(
 43                  column_name="category_feature",
 44                  values=["test"],
 45                  current=ValueListStat(
 46                      number_in_list=0,
 47                      number_not_in_list=3,
 48                      share_in_list=0,
 49                      share_not_in_list=1,
 50                      values_in_list=[("test", 0)],
 51                      values_not_in_list=[],
 52                      rows_count=3,
 53                  ),
 54                  reference=None,
 55              ),
 56          ),
 57          (
 58              pd.DataFrame({"category_feature": [1, np.nan, 1, 2]}),
 59              None,
 60              ColumnValueListMetric(column_name="category_feature", values=[1, 2, 3]),
 61              ColumnValueListMetricResult(
 62                  column_name="category_feature",
 63                  values=[1, 2, 3],
 64                  current=ValueListStat(
 65                      number_in_list=3,
 66                      number_not_in_list=1,
 67                      share_in_list=0.75,
 68                      share_not_in_list=0.25,
 69                      values_in_list=[(1, 2), (2, 1), (3, 0)],
 70                      values_not_in_list=[],
 71                      rows_count=4,
 72                  ),
 73                  reference=None,
 74              ),
 75          ),
 76          (
 77              pd.DataFrame({"category_feature": ["n", "d", "p", "n"], "numerical_feature": [0, 2, 2, 432]}),
 78              None,
 79              ColumnValueListMetric(column_name="category_feature", values=["d"]),
 80              ColumnValueListMetricResult(
 81                  column_name="category_feature",
 82                  values=["d"],
 83                  current=ValueListStat(
 84                      number_in_list=1,
 85                      number_not_in_list=3,
 86                      share_in_list=0.25,
 87                      share_not_in_list=0.75,
 88                      values_in_list=[("d", 1)],
 89                      values_not_in_list=[("n", 2), ("p", 1)],
 90                      rows_count=4,
 91                  ),
 92                  reference=None,
 93              ),
 94          ),
 95          (
 96              pd.DataFrame({"category_feature": ["n", "d", "p", "n"], "numerical_feature": [0, 2, 2, 432]}),
 97              None,
 98              ColumnValueListMetric(column_name="numerical_feature", values=[2]),
 99              ColumnValueListMetricResult(
100                  column_name="numerical_feature",
101                  values=[2],
102                  current=ValueListStat(
103                      number_in_list=2,
104                      number_not_in_list=2,
105                      share_in_list=0.5,
106                      share_not_in_list=0.5,
107                      values_in_list=[(2, 2)],
108                      values_not_in_list=[(0, 1), (432, 1)],
109                      rows_count=4,
110                  ),
111                  reference=None,
112              ),
113          ),
114          (
115              pd.DataFrame({"category_feature": ["n", "d", "p", "n"], "numerical_feature": [0, 2, 2, 432]}),
116              pd.DataFrame({"category_feature": ["n", "d", "p", "n"], "numerical_feature": [0, 2, 2, 432]}),
117              ColumnValueListMetric(column_name="category_feature"),
118              ColumnValueListMetricResult(
119                  column_name="category_feature",
120                  values=["n", "d", "p"],
121                  current=ValueListStat(
122                      number_in_list=4,
123                      number_not_in_list=0,
124                      share_in_list=1.0,
125                      share_not_in_list=0.0,
126                      values_in_list=[("n", 2), ("d", 1), ("p", 1)],
127                      values_not_in_list=[],
128                      rows_count=4,
129                  ),
130                  reference=ValueListStat(
131                      number_in_list=4,
132                      number_not_in_list=0,
133                      share_in_list=1.0,
134                      share_not_in_list=0.0,
135                      values_in_list=[("n", 2), ("d", 1), ("p", 1)],
136                      values_not_in_list=[],
137                      rows_count=4,
138                  ),
139              ),
140          ),
141      ),
142  )
143  def test_data_quality_value_list_metric_success(
144      current_dataset: pd.DataFrame,
145      reference_dataset: Optional[pd.DataFrame],
146      metric: ColumnValueListMetric,
147      expected_result: ColumnValueListMetricResult,
148  ) -> None:
149      data_mapping = ColumnMapping()
150      report = Report(metrics=[metric])
151      report.run(current_data=current_dataset, reference_data=reference_dataset, column_mapping=data_mapping)
152      result = metric.get_result()
153      assert result == expected_result
154  
155  
@pytest.mark.parametrize(
    "current_dataset, reference_dataset, metric, error_message",
    (
        # Requested column is missing from the current data.
        (
            pd.DataFrame({"feature": [1, 2, 3]}),
            None,
            ColumnValueListMetric(column_name="test", values=[1]),
            "Column 'test' is not in current data.",
        ),
        # Requested column is missing from the reference data.
        (
            pd.DataFrame({"test": [1, 2, 3]}),
            pd.DataFrame({"feature": [1, 2, 3]}),
            ColumnValueListMetric(column_name="test"),
            "Column 'test' is not in reference data.",
        ),
        # Neither reference data nor an explicit values list is supplied.
        (
            pd.DataFrame({"test": ["a", "b", "c"]}),
            None,
            ColumnValueListMetric(column_name="test"),
            "Reference or values list should be present.",
        ),
        # Explicit but empty values list, reference data present.
        (
            pd.DataFrame({"feature": [1, 2, 3]}),
            pd.DataFrame({"feature": [1, 2, "a"]}),
            ColumnValueListMetric(column_name="feature", values=[]),
            "Values list should not be empty.",
        ),
        # Explicit but empty values list, reference column holding only NaN.
        (
            pd.DataFrame({"feature": [1, 2, 3]}),
            pd.DataFrame({"feature": [np.nan]}),
            ColumnValueListMetric(column_name="feature", values=[]),
            "Values list should not be empty.",
        ),
    ),
)
def test_data_quality_value_list_metric_value_errors(
    current_dataset: pd.DataFrame,
    reference_dataset: Optional[pd.DataFrame],
    metric: ColumnValueListMetric,
    error_message: str,
) -> None:
    """Check that invalid inputs raise ``ValueError`` with the exact expected message."""
    # Report construction, the run and the result lookup all stay inside the
    # context manager: the test only requires that one of them raises.
    with pytest.raises(ValueError) as exc_info:
        Report(metrics=[metric]).run(
            current_data=current_dataset,
            reference_data=reference_dataset,
            column_mapping=ColumnMapping(),
        )
        metric.get_result()

    raised = exc_info.value
    assert raised.args[0] == error_message
203  
204  
@pytest.mark.parametrize(
    "current_data, reference_data, metric, old_json",
    (
        # Current data only, explicit values list.
        (
            pd.DataFrame({"col": [1, 2, 3]}),
            None,
            ColumnValueListMetric(column_name="col", values=[1]),
            {
                "column_name": "col",
                "current": {
                    "number_in_list": 1,
                    "number_not_in_list": 2,
                    "rows_count": 3,
                    "share_in_list": 0.3333333333333333,
                    "share_not_in_list": 0.6666666666666666,
                    "values_in_list": [[1, 1]],
                    "values_not_in_list": [[2, 1], [3, 1]],
                },
                "reference": None,
                "values": [1],
            },
        ),
        # Reference data supplies the values; no current value is in that list.
        (
            pd.DataFrame({"col1": [1, 2, 3], "col2": [10, 20, 3.5]}),
            pd.DataFrame(
                {
                    "col1": [10, 20, 3.5],
                    "col2": [1, 2, 3],
                }
            ),
            ColumnValueListMetric(column_name="col1"),
            {
                "column_name": "col1",
                "current": {
                    "number_in_list": 0,
                    "number_not_in_list": 3,
                    "rows_count": 3,
                    "share_in_list": 0.0,
                    "share_not_in_list": 1.0,
                    "values_in_list": [[10.0, 0], [20.0, 0], [3.5, 0]],
                    "values_not_in_list": [[1, 1], [2, 1], [3, 1]],
                },
                "reference": {
                    "number_in_list": 3,
                    "number_not_in_list": 0,
                    "rows_count": 3,
                    "share_in_list": 1.0,
                    "share_not_in_list": 0.0,
                    "values_in_list": [[10.0, 1], [20.0, 1], [3.5, 1]],
                    "values_not_in_list": [],
                },
                "values": [10.0, 20.0, 3.5],
            },
        ),
    ),
)
def test_data_quality_value_list_metric_with_report_compat(
    current_data: pd.DataFrame,
    reference_data: Optional[pd.DataFrame],
    metric: ColumnValueListMetric,
    old_json: dict,
) -> None:
    """Check that a result parsed from the ``old_json`` payload equals the computed one.

    The payload stores ``values_in_list`` / ``values_not_in_list`` as lists of
    ``[value, count]`` pairs; it is parsed into ``ColumnValueListMetricResult``
    and compared with what the metric produces for the same data.

    ``reference_data`` is optional because the first case runs without it.
    """
    report = Report(metrics=[metric])
    report.run(current_data=current_data, reference_data=reference_data, column_mapping=ColumnMapping())

    expected = parse_obj_as(ColumnValueListMetricResult, old_json)
    assert metric.get_result() == expected
269  
270  
@pytest.mark.parametrize(
    "current_data, reference_data, metric, expected_json",
    (
        # Current data only, explicit values list.
        (
            pd.DataFrame({"col": [1, 2, 3]}),
            None,
            ColumnValueListMetric(column_name="col", values=[1]),
            {
                "column_name": "col",
                "current": {
                    "number_in_list": 1,
                    "number_not_in_list": 2,
                    "rows_count": 3,
                    "share_in_list": 0.3333333333333333,
                    "share_not_in_list": 0.6666666666666666,
                    "values_in_list_dist": {"x": [1], "y": [1]},
                    "values_not_in_list_dist": {"x": [2, 3], "y": [1, 1]},
                },
                "reference": None,
                "values": [1],
            },
        ),
        # Reference data supplies the values; no current value is in that list.
        (
            pd.DataFrame({"col1": [1, 2, 3], "col2": [10, 20, 3.5]}),
            pd.DataFrame(
                {
                    "col1": [10, 20, 3.5],
                    "col2": [1, 2, 3],
                }
            ),
            ColumnValueListMetric(column_name="col1"),
            {
                "column_name": "col1",
                "current": {
                    "number_in_list": 0,
                    "number_not_in_list": 3,
                    "rows_count": 3,
                    "share_in_list": 0.0,
                    "share_not_in_list": 1.0,
                    "values_in_list_dist": {"x": [10.0, 20.0, 3.5], "y": [0, 0, 0]},
                    "values_not_in_list_dist": {"x": [1, 2, 3], "y": [1, 1, 1]},
                },
                "reference": {
                    "number_in_list": 3,
                    "number_not_in_list": 0,
                    "rows_count": 3,
                    "share_in_list": 1.0,
                    "share_not_in_list": 0.0,
                    "values_in_list_dist": {"x": [10.0, 20.0, 3.5], "y": [1, 1, 1]},
                    "values_not_in_list_dist": {"x": [], "y": []},
                },
                "values": [10.0, 20.0, 3.5],
            },
        ),
    ),
)
def test_data_quality_value_list_metric_with_report(
    current_data: pd.DataFrame,
    reference_data: Optional[pd.DataFrame],
    metric: ColumnValueListMetric,
    expected_json: dict,
) -> None:
    """Check the metric's entry in the report's JSON output.

    The report is run, rendered via ``show()``, serialised with ``json()``, and
    the first metric entry is compared with the expected name and result
    payload.

    ``reference_data`` is optional because the first case runs without it.
    """
    report = Report(metrics=[metric])
    report.run(current_data=current_data, reference_data=reference_data, column_mapping=ColumnMapping())
    # Rendering must yield a truthy value.
    assert report.show()
    result_json = report.json()
    assert len(result_json) > 0
    result = json.loads(result_json)
    # Only one metric was registered, so its entry is the first one.
    metric_entry = result["metrics"][0]
    assert metric_entry["metric"] == "ColumnValueListMetric"
    assert metric_entry["result"] == expected_json