/ tests / features / test_multicolumn.py
test_multicolumn.py
 1  from typing import List
 2  from typing import Optional
 3  
 4  import pandas as pd
 5  
 6  from evidently._pydantic_compat import PrivateAttr
 7  from evidently.legacy.base_metric import ColumnName
 8  from evidently.legacy.core import ColumnType
 9  from evidently.legacy.features.feature_generator import FeatureGenerator
10  from evidently.legacy.features.generated_features import GeneratedFeatures
11  from evidently.legacy.metrics import ColumnSummaryMetric
12  from evidently.legacy.options.base import Options
13  from evidently.legacy.report import Report
14  from evidently.legacy.utils.data_preprocessing import DataDefinition
15  
16  
class MultiColumnFeature(GeneratedFeatures):
    """Test feature that derives two numeric sub-columns from one source column.

    The sub-columns are named ``"+1"`` and ``"+5"`` and hold the source values
    shifted by 1 and 5 respectively. Every ``generate_features`` call is tallied
    in ``_called_count`` so tests can assert how many times generation ran.
    """

    class Config:
        alias_required = False

    # Name of the input column the sub-columns are computed from.
    source_column: str
    # Number of ``generate_features`` invocations made on this instance.
    _called_count: int = PrivateAttr(0)

    def generate_features(self, data: pd.DataFrame, data_definition: DataDefinition, options: Options) -> pd.DataFrame:
        """Return a frame holding the ``"+1"`` and ``"+5"`` shifts of the source column."""
        self._called_count = self._called_count + 1
        source = data[self.source_column]
        shifted = {"+1": source + 1, "+5": source + 5}
        return pd.DataFrame(shifted)

    def list_columns(self) -> List["ColumnName"]:
        """List the column descriptors of both generated sub-columns, ``"+1"`` first."""
        return [self._create_column(subcolumn=name) for name in ("+1", "+5")]

    def get_type(self, subcolumn: Optional[str] = None):
        """Report the numerical column type regardless of ``subcolumn``."""
        return ColumnType.Numerical
34  
35  
def test_feature_generator():
    """Run two multi-column features through ``FeatureGenerator`` and verify their outputs."""

    def expected(feature, plus_one, plus_five):
        # Expected column names follow the "<fingerprint>.<subcolumn>" pattern.
        return pd.DataFrame(
            {
                f"{feature.get_fingerprint()}.+1": plus_one,
                f"{feature.get_fingerprint()}.+5": plus_five,
            }
        )

    feature_a = MultiColumnFeature(source_column="a")
    feature_b = MultiColumnFeature(source_column="b")
    generator = FeatureGenerator(features=[feature_a, feature_b])

    current = pd.DataFrame({"a": [1, 2], "b": [11, 12]})
    reference = pd.DataFrame({"a": [3, 4], "b": [13, 14]})
    generator.run(current_data=current, reference_data=reference)

    # Per-feature results for the feature built on column "a".
    a_current, a_reference = generator.get_features(feature_a)
    pd.testing.assert_frame_equal(a_current, expected(feature_a, [2, 3], [6, 7]))
    pd.testing.assert_frame_equal(a_reference, expected(feature_a, [4, 5], [8, 9]))

    # Per-feature results for the feature built on column "b".
    b_current, b_reference = generator.get_features(feature_b)
    pd.testing.assert_frame_equal(b_current, expected(feature_b, [12, 13], [16, 17]))
    pd.testing.assert_frame_equal(b_reference, expected(feature_b, [14, 15], [18, 19]))

    # Without an argument, the generator returns all features joined side by side.
    combined_current, combined_reference = generator.get_features()
    pd.testing.assert_frame_equal(combined_current, a_current.join(b_current))
    pd.testing.assert_frame_equal(combined_reference, a_reference.join(b_reference))

    # Each feature is generated exactly twice: once for current, once for reference.
    for feature in (feature_a, feature_b):
        assert feature._called_count == 2
74  
75  
def test_multicolumn_in_report():
    """Use two equally-configured features in one ``Report`` and check the resulting datasets.

    Both metrics reference a ``MultiColumnFeature`` over column ``"a"``; the test
    expects the generated columns to appear once and the combined call count of
    the two instances to be 2 (one run for current, one for reference).
    """
    current = pd.DataFrame({"a": [1, 2]})
    reference = pd.DataFrame({"a": [3, 4]})

    first = MultiColumnFeature(source_column="a")
    second = MultiColumnFeature(source_column="a")
    metrics = [
        ColumnSummaryMetric(column_name=first.as_column(subcolumn="+1")),
        ColumnSummaryMetric(column_name=second.as_column(subcolumn="+5")),
    ]
    report = Report(metrics=metrics)
    report.run(current_data=current, reference_data=reference)

    out_current, out_reference = report.datasets()

    # Expected frames: the raw column plus both sub-columns keyed by the first feature's fingerprint.
    expected_current = pd.DataFrame(
        {
            "a": [1, 2],
            f"{first.get_fingerprint()}.+1": [2, 3],
            f"{first.get_fingerprint()}.+5": [6, 7],
        }
    )
    pd.testing.assert_frame_equal(out_current, expected_current)

    expected_reference = pd.DataFrame(
        {
            "a": [3, 4],
            f"{first.get_fingerprint()}.+1": [4, 5],
            f"{first.get_fingerprint()}.+5": [8, 9],
        }
    )
    pd.testing.assert_frame_equal(out_reference, expected_reference)

    # Across both instances generation ran twice in total: once for current, once for reference.
    total_calls = first._called_count + second._called_count
    assert total_calls == 2