test_dataset_missing_values_metric.py
"""Tests for ``DatasetMissingValuesMetric``.

Covers report rendering/serialization, the default and custom sets of
"missing" sentinel values, and the error raised for an empty sentinel list.
"""
from typing import Optional

import numpy as np
import pandas as pd
import pytest

from evidently.legacy.metrics import DatasetMissingValuesMetric
from evidently.legacy.pipeline.column_mapping import ColumnMapping
from evidently.legacy.report import Report


@pytest.mark.parametrize(
    "current_data, reference_data, metric",
    (
        (
            pd.DataFrame(
                {
                    "feature": [" a", "a", "\tb", np.nan, np.nan],
                }
            ),
            None,
            DatasetMissingValuesMetric(missing_values=[None]),
        ),
        (
            pd.DataFrame(
                {
                    "feature": [" a", "a", "\tb", np.nan, np.nan],
                }
            ),
            pd.DataFrame(
                {
                    "feature": [" a", np.nan, "\tb", pd.NaT, np.inf],
                }
            ),
            DatasetMissingValuesMetric(missing_values=[None]),
        ),
    ),
)
def test_dataset_missing_values_metric_with_report(
    current_data: pd.DataFrame,
    reference_data: Optional[pd.DataFrame],
    metric: DatasetMissingValuesMetric,
) -> None:
    """The metric renders and serializes inside a Report, with and without reference data."""
    report = Report(metrics=[metric])
    report.run(current_data=current_data, reference_data=reference_data)
    # Smoke checks only: both render paths must produce non-empty output.
    assert report.show()
    assert report.json()


def test_dataset_missing_values_metric_different_missing_values() -> None:
    """Exhaustive counts for the default sentinel set, then for custom sentinel lists."""
    test_dataset = pd.DataFrame(
        {
            "category_feature_1": ["", "n/a", "3"],
            "category_feature_2": ["", None, np.inf],
            "numerical_feature_1": [3, -9999, 0],
            "numerical_feature_2": [0, None, -np.inf],
            "prediction": [1, pd.NaT, 1],
            "target": [None, np.nan, 1],
        }
    )
    data_mapping = ColumnMapping()

    # Default configuration: pandas nulls, +/-inf and "" count as missing.
    metric = DatasetMissingValuesMetric()
    report = Report(metrics=[metric])
    report.run(current_data=test_dataset, reference_data=None, column_mapping=data_mapping)
    result = metric.get_result()
    assert result is not None
    # expect na values and an empty string as null-values
    assert result.current.different_missing_values == {None: 5, -np.inf: 1, np.inf: 1, "": 2}
    assert result.current.number_of_different_missing_values == 4
    assert result.current.number_of_missing_values == 9
    assert result.current.number_of_rows_with_missing_values == 3
    assert result.current.different_missing_values_by_column == {
        "category_feature_1": {None: 0, -np.inf: 0, np.inf: 0, "": 1},
        "category_feature_2": {None: 1, -np.inf: 0, np.inf: 1, "": 1},
        "numerical_feature_1": {None: 0, -np.inf: 0, np.inf: 0, "": 0},
        "numerical_feature_2": {None: 1, -np.inf: 1, np.inf: 0, "": 0},
        "prediction": {None: 1, -np.inf: 0, np.inf: 0, "": 0},
        "target": {None: 2, -np.inf: 0, np.inf: 0, "": 0},
    }
    assert result.current.number_of_different_missing_values_by_column == {
        "category_feature_1": 1,
        "category_feature_2": 3,
        "numerical_feature_1": 0,
        "numerical_feature_2": 2,
        "prediction": 1,
        "target": 1,
    }
    assert result.current.number_of_missing_values_by_column == {
        "category_feature_1": 1,
        "category_feature_2": 3,
        "numerical_feature_1": 0,
        "numerical_feature_2": 2,
        "prediction": 1,
        "target": 2,
    }
    assert result.reference is None

    # replace=False extends the default sentinel set with "n/a" instead of replacing it.
    metric = DatasetMissingValuesMetric(missing_values=["n/a"], replace=False)
    report = Report(metrics=[metric])
    report.run(current_data=test_dataset, reference_data=None, column_mapping=data_mapping)
    result = metric.get_result()
    assert result is not None
    # expect n/a and other defaults as null-values
    assert result.current.number_of_different_missing_values == 5
    assert result.current.number_of_missing_values == 10
    assert result.reference is None

    # test custom list of null values, no default, but with Pandas nulls
    metric = DatasetMissingValuesMetric(missing_values=["", 0, "n/a", -9999, None], replace=True)
    report = Report(metrics=[metric])
    report.run(current_data=test_dataset, reference_data=None, column_mapping=data_mapping)
    result = metric.get_result()
    assert result is not None
    assert result.current.number_of_different_missing_values == 5
    assert result.current.number_of_missing_values == 11
    assert result.reference is None

    # test custom list of null values and ignore pandas null values
    metric = DatasetMissingValuesMetric(missing_values=["", 0, "n/a", -9999], replace=True)
    report = Report(metrics=[metric])
    report.run(current_data=test_dataset, reference_data=None, column_mapping=data_mapping)
    result = metric.get_result()
    assert result is not None
    assert result.current.number_of_different_missing_values == 4
    assert result.current.number_of_missing_values == 6
    assert result.reference is None


@pytest.mark.parametrize(
    "current_data, reference_data, metric",
    (
        (
            pd.DataFrame(
                {
                    "col": [1, 2, 1, 2, 1],
                }
            ),
            None,
            DatasetMissingValuesMetric(missing_values=[], replace=True),
        ),
    ),
)
def test_dataset_missing_values_metrics_value_error(
    current_data: pd.DataFrame,
    reference_data: Optional[pd.DataFrame],
    metric: DatasetMissingValuesMetric,
) -> None:
    """An empty ``missing_values`` list with ``replace=True`` must raise ``ValueError``."""
    with pytest.raises(ValueError):
        report = Report(metrics=[metric])
        report.run(current_data=current_data, reference_data=reference_data, column_mapping=ColumnMapping())
        metric.get_result()