# tests/metrics/data_interity/test_dataset_missing_values_metric.py
  1  import numpy as np
  2  import pandas as pd
  3  import pytest
  4  
  5  from evidently.legacy.metrics import DatasetMissingValuesMetric
  6  from evidently.legacy.pipeline.column_mapping import ColumnMapping
  7  from evidently.legacy.report import Report
  8  
  9  
 10  @pytest.mark.parametrize(
 11      "current_data, reference_data, metric",
 12      (
 13          (
 14              pd.DataFrame(
 15                  {
 16                      "feature": [" a", "a", "\tb", np.nan, np.nan],
 17                  }
 18              ),
 19              None,
 20              DatasetMissingValuesMetric(missing_values=[None]),
 21          ),
 22          (
 23              pd.DataFrame(
 24                  {
 25                      "feature": [" a", "a", "\tb", np.nan, np.nan],
 26                  }
 27              ),
 28              pd.DataFrame(
 29                  {
 30                      "feature": [" a", np.nan, "\tb", pd.NaT, np.inf],
 31                  }
 32              ),
 33              DatasetMissingValuesMetric(missing_values=[None]),
 34          ),
 35      ),
 36  )
 37  def test_dataset_missing_values_metric_with_report(
 38      current_data: pd.DataFrame, reference_data: pd.DataFrame, metric: DatasetMissingValuesMetric
 39  ) -> None:
 40      report = Report(metrics=[metric])
 41      report.run(current_data=current_data, reference_data=reference_data)
 42      assert report.show()
 43      assert report.json()
 44  
 45  
 46  def test_dataset_missing_values_metric_different_missing_values() -> None:
 47      test_dataset = pd.DataFrame(
 48          {
 49              "category_feature_1": ["", "n/a", "3"],
 50              "category_feature_2": ["", None, np.inf],
 51              "numerical_feature_1": [3, -9999, 0],
 52              "numerical_feature_2": [0, None, -np.inf],
 53              "prediction": [1, pd.NaT, 1],
 54              "target": [None, np.nan, 1],
 55          }
 56      )
 57      data_mapping = ColumnMapping()
 58      metric = DatasetMissingValuesMetric()
 59      report = Report(metrics=[metric])
 60      report.run(current_data=test_dataset, reference_data=None, column_mapping=data_mapping)
 61      result = metric.get_result()
 62      assert result is not None
 63      # expect na values and an empty string as null-values
 64      assert result.current.different_missing_values == {None: 5, -np.inf: 1, np.inf: 1, "": 2}
 65      assert result.current.number_of_different_missing_values == 4
 66      assert result.current.number_of_missing_values == 9
 67      assert result.current.number_of_rows_with_missing_values == 3
 68      assert result.current.different_missing_values_by_column == {
 69          "category_feature_1": {None: 0, -np.inf: 0, np.inf: 0, "": 1},
 70          "category_feature_2": {None: 1, -np.inf: 0, np.inf: 1, "": 1},
 71          "numerical_feature_1": {None: 0, -np.inf: 0, np.inf: 0, "": 0},
 72          "numerical_feature_2": {None: 1, -np.inf: 1, np.inf: 0, "": 0},
 73          "prediction": {None: 1, -np.inf: 0, np.inf: 0, "": 0},
 74          "target": {None: 2, -np.inf: 0, np.inf: 0, "": 0},
 75      }
 76      assert result.current.number_of_different_missing_values_by_column == {
 77          "category_feature_1": 1,
 78          "category_feature_2": 3,
 79          "numerical_feature_1": 0,
 80          "numerical_feature_2": 2,
 81          "prediction": 1,
 82          "target": 1,
 83      }
 84      assert result.current.number_of_missing_values_by_column == {
 85          "category_feature_1": 1,
 86          "category_feature_2": 3,
 87          "numerical_feature_1": 0,
 88          "numerical_feature_2": 2,
 89          "prediction": 1,
 90          "target": 2,
 91      }
 92      assert result.reference is None
 93  
 94      metric = DatasetMissingValuesMetric(missing_values=["n/a"], replace=False)
 95      report = Report(metrics=[metric])
 96      report.run(current_data=test_dataset, reference_data=None, column_mapping=data_mapping)
 97      result = metric.get_result()
 98      assert result is not None
 99      # expect n/a and other defaults as null-values
100      assert result.current.number_of_different_missing_values == 5
101      assert result.current.number_of_missing_values == 10
102      assert result.reference is None
103  
104      # test custom list of null values, no default, but with Pandas nulls
105      metric = DatasetMissingValuesMetric(missing_values=["", 0, "n/a", -9999, None], replace=True)
106      report = Report(metrics=[metric])
107      report.run(current_data=test_dataset, reference_data=None, column_mapping=data_mapping)
108      result = metric.get_result()
109      assert result is not None
110      assert result.current.number_of_different_missing_values == 5
111      assert result.current.number_of_missing_values == 11
112      assert result.reference is None
113  
114      # test custom list of null values and ignore pandas null values
115      metric = DatasetMissingValuesMetric(missing_values=["", 0, "n/a", -9999], replace=True)
116      report = Report(metrics=[metric])
117      report.run(current_data=test_dataset, reference_data=None, column_mapping=data_mapping)
118      result = metric.get_result()
119      assert result is not None
120      assert result.current.number_of_different_missing_values == 4
121      assert result.current.number_of_missing_values == 6
122      assert result.reference is None
123  
124  
@pytest.mark.parametrize(
    "current_data, reference_data, metric",
    (
        # An empty missing_values list with replace=True is an invalid configuration.
        (
            pd.DataFrame({"col": [1, 2, 1, 2, 1]}),
            None,
            DatasetMissingValuesMetric(missing_values=[], replace=True),
        ),
    ),
)
def test_dataset_missing_values_metrics_value_error(
    current_data: pd.DataFrame, reference_data: pd.DataFrame, metric: DatasetMissingValuesMetric
) -> None:
    """An empty custom missing-values list must raise ValueError.

    NOTE(review): the `raises` block spans several statements, so it does not
    pin down which call raises — confirm against the metric implementation
    before narrowing it.
    """
    with pytest.raises(ValueError):
        failing_report = Report(metrics=[metric])
        failing_report.run(
            current_data=current_data,
            reference_data=reference_data,
            column_mapping=ColumnMapping(),
        )
        metric.get_result()