/ tests / metrics / data_interity / test_column_summary_metric.py
test_column_summary_metric.py
import json
from typing import Optional

import pandas as pd
import pytest

from evidently.legacy.metrics import ColumnSummaryMetric
from evidently.legacy.pipeline.column_mapping import ColumnMapping
from evidently.legacy.report import Report
  9  
 10  
 11  @pytest.mark.parametrize(
 12      "current_data, reference_data, column_mapping, metric, expected_json",
 13      (
 14          (
 15              pd.DataFrame({"test1": ["a", "a", "c", "d", "e", "f", "g", "h", "i", "j"]}),
 16              None,
 17              ColumnMapping(),
 18              ColumnSummaryMetric(column_name="test1"),
 19              {
 20                  "column_name": "test1",
 21                  "column_type": "cat",
 22                  "current_characteristics": {
 23                      "count": 10,
 24                      "missing": 0,
 25                      "missing_percentage": 0.0,
 26                      "most_common": "a",
 27                      "most_common_percentage": 20.0,
 28                      "new_in_current_values_count": None,
 29                      "number_of_rows": 10,
 30                      "unique": 9,
 31                      "unique_percentage": 90.0,
 32                      "unused_in_current_values_count": None,
 33                  },
 34                  "reference_characteristics": None,
 35              },
 36          ),
 37          (
 38              pd.DataFrame({"test1": ["a", "a", "a"]}),
 39              pd.DataFrame({"test1": ["c", "c", "e", "f", "g", "h", "i", "j"]}),
 40              ColumnMapping(),
 41              ColumnSummaryMetric(column_name="test1"),
 42              {
 43                  "column_name": "test1",
 44                  "column_type": "cat",
 45                  "current_characteristics": {
 46                      "count": 3,
 47                      "missing": 0,
 48                      "missing_percentage": 0.0,
 49                      "most_common": "a",
 50                      "most_common_percentage": 100.0,
 51                      "new_in_current_values_count": 1,
 52                      "number_of_rows": 3,
 53                      "unique": 1,
 54                      "unique_percentage": 33.33,
 55                      "unused_in_current_values_count": 7,
 56                  },
 57                  "reference_characteristics": {
 58                      "count": 8,
 59                      "missing": 0,
 60                      "missing_percentage": 0.0,
 61                      "most_common": "c",
 62                      "most_common_percentage": 25.0,
 63                      "new_in_current_values_count": None,
 64                      "number_of_rows": 8,
 65                      "unique": 7,
 66                      "unique_percentage": 87.5,
 67                      "unused_in_current_values_count": None,
 68                  },
 69              },
 70          ),
 71          (
 72              pd.DataFrame({"test1": [1, 2, 3], "test2": [1, 2, 3], "test3": [1, 1, 1]}),
 73              pd.DataFrame({"test1": [1, 2, 3], "test2": ["a", "a", "a"], "test3": [1, 1, 1]}),
 74              ColumnMapping(numerical_features=["test1"]),
 75              ColumnSummaryMetric(column_name="test1"),
 76              {
 77                  "column_name": "test1",
 78                  "column_type": "num",
 79                  "current_characteristics": {
 80                      "count": 3,
 81                      "infinite_count": 0,
 82                      "infinite_percentage": 0.0,
 83                      "max": 3,
 84                      "mean": 2.0,
 85                      "min": 1,
 86                      "missing": 0,
 87                      "missing_percentage": 0.0,
 88                      "most_common": 1,
 89                      "most_common_percentage": 33.33,
 90                      "number_of_rows": 3,
 91                      "p25": 1.5,
 92                      "p50": 2.0,
 93                      "p75": 2.5,
 94                      "std": 1.0,
 95                      "unique": 3,
 96                      "unique_percentage": 100.0,
 97                  },
 98                  "reference_characteristics": {
 99                      "count": 3,
100                      "infinite_count": 0,
101                      "infinite_percentage": 0.0,
102                      "max": 3,
103                      "mean": 2.0,
104                      "min": 1,
105                      "missing": 0,
106                      "missing_percentage": 0.0,
107                      "most_common": 1,
108                      "most_common_percentage": 33.33,
109                      "number_of_rows": 3,
110                      "p25": 1.5,
111                      "p50": 2.0,
112                      "p75": 2.5,
113                      "std": 1.0,
114                      "unique": 3,
115                      "unique_percentage": 100.0,
116                  },
117              },
118          ),
119      ),
120  )
121  def test_column_summary_metric_with_report(
122      current_data: pd.DataFrame,
123      reference_data: pd.DataFrame,
124      column_mapping: ColumnMapping,
125      metric: ColumnSummaryMetric,
126      expected_json: dict,
127  ) -> None:
128      report = Report(metrics=[metric])
129      report.run(current_data=current_data, reference_data=reference_data, column_mapping=column_mapping)
130      assert report.show()
131      json_result = report.json()
132      assert len(json_result) > 0
133      result = json.loads(json_result)
134      assert result["metrics"][0]["metric"] == "ColumnSummaryMetric"
135      assert result["metrics"][0]["result"] == expected_json