# tests/features/test_text_contains_feature.py
  1  from typing import List
  2  
  3  import pandas as pd
  4  import pytest
  5  
  6  from evidently.legacy.features.text_contains_feature import Contains
  7  from evidently.legacy.features.text_contains_feature import DoesNotContain
  8  from evidently.legacy.features.text_contains_feature import ItemMatch
  9  from evidently.legacy.features.text_contains_feature import ItemNoMatch
 10  from evidently.legacy.pipeline.column_mapping import ColumnMapping
 11  from evidently.legacy.utils.data_preprocessing import create_data_definition
 12  
 13  test_data = [
 14      "a b c d e f g h",
 15      "b c d e f g h",
 16      "h",
 17      "A",
 18      "a B c D",
 19  ]
 20  
 21  
 22  @pytest.mark.parametrize(
 23      ("items", "case", "mode", "expected"),
 24      [
 25          (["a"], True, "any", [True, False, False, False, True]),
 26          (["b"], True, "any", [True, True, False, False, False]),
 27          (["a"], False, "any", [True, False, False, True, True]),
 28          (["b"], False, "any", [True, True, False, False, True]),
 29          (["a", "b"], True, "any", [True, True, False, False, True]),
 30          (["a", "b"], True, "all", [True, False, False, False, False]),
 31          (["a", "b"], False, "any", [True, True, False, True, True]),
 32          (["a", "b"], False, "all", [True, False, False, False, True]),
 33      ],
 34  )
 35  def test_text_contains_feature(items: List[str], case: bool, mode: str, expected: List[bool]):
 36      feature_generator = Contains("column_1", items, case_sensitive=case, mode=mode)
 37      data = pd.DataFrame(dict(column_1=test_data))
 38      result = feature_generator.generate_feature(
 39          data=data,
 40          data_definition=create_data_definition(None, data, ColumnMapping()),
 41      )
 42      column_expected = feature_generator._feature_column_name()
 43      expected_df = pd.DataFrame({column_expected: expected})
 44      assert result.equals(expected_df)
 45  
 46  
 47  @pytest.mark.parametrize(
 48      ("items", "case", "mode", "expected"),
 49      [
 50          (["a", "b"], True, "any", [False, False, True, True, False]),
 51          (["a", "b"], True, "all", [False, True, True, True, True]),
 52          (["a", "b"], False, "any", [False, False, True, False, False]),
 53          (["a", "b"], False, "all", [False, True, True, True, False]),
 54      ],
 55  )
 56  def test_text_not_contains_feature(items: List[str], case: bool, mode: str, expected: List[bool]):
 57      feature_generator = DoesNotContain("column_1", items, case_sensitive=case, mode=mode)
 58      data = pd.DataFrame(dict(column_1=test_data))
 59      result = feature_generator.generate_feature(
 60          data=data,
 61          data_definition=create_data_definition(None, data, ColumnMapping()),
 62      )
 63      column_expected = feature_generator._feature_column_name()
 64      expected_df = pd.DataFrame({column_expected: expected})
 65      assert result.equals(expected_df)
 66  
 67  
 68  @pytest.mark.parametrize(
 69      ("case", "mode", "expected"),
 70      [
 71          (True, "any", [False, True, False, True, False]),
 72          (True, "all", [False, True, False, False, False]),
 73          (False, "any", [True, True, True, True, False]),
 74          (False, "all", [False, True, True, False, False]),
 75      ],
 76  )
 77  def test_item_match(case: bool, mode: str, expected: List[bool]):
 78      data = {
 79          "generated": [
 80              "You should consider purchasing Nike or Adidas shoes.",
 81              "I eat apples, grapes, and oranges",
 82              "grapes, oranges, apples.",
 83              "Oranges are more sour than grapes.",
 84              "This test doesn't have the words.",
 85          ],
 86          "expected": [
 87              ["nike", "adidas", "puma"],
 88              ["grapes", "apples", "oranges"],
 89              ["Apples", "Oranges", "Grapes"],
 90              ["orange", "sweet", "grape"],
 91              ["none", "of", "these"],
 92          ],
 93      }
 94      df = pd.DataFrame(data)
 95      df["expected"] = df["expected"].apply(tuple)
 96      feature_generator = ItemMatch(columns=["generated", "expected"], case_sensitive=case, mode=mode)
 97      result = feature_generator.generate_feature(
 98          data=df,
 99          data_definition=create_data_definition(None, df, ColumnMapping()),
100      )
101      column_expected = feature_generator._feature_column_name()
102      column_name_obj = feature_generator._as_column()
103      expected_df = pd.DataFrame({column_expected: expected})
104      assert result.equals(expected_df)
105      assert column_name_obj.display_name == f"Text contains {mode} of defined items"
106  
107  
@pytest.mark.parametrize(
    ("case", "mode", "expected"),
    [
        # Each expected vector is the element-wise negation of the vector that
        # test_item_match expects for the same (case, mode) on the same rows.
        (True, "any", [True, False, True, False, True]),
        (True, "all", [True, False, True, True, True]),
        (False, "any", [False, False, False, False, True]),
        (False, "all", [True, False, False, True, True]),
    ],
)
def test_item_no_match(case: bool, mode: str, expected: List[bool]):
    """Check ``ItemNoMatch`` row by row and verify its display name.

    Every row pairs a generated text with its own collection of forbidden items.

    Args:
        case: Forwarded as ``case_sensitive``.
        mode: Forwarded as ``mode``; the cases use ``"any"`` and ``"all"``.
        expected: Expected boolean result, one entry per data row.
    """
    data = {
        "generated": [
            "You should consider purchasing Nike or Adidas shoes.",
            "I eat apples, grapes, and oranges",
            "grapes, oranges, apples.",
            "Oranges are more sour than grapes.",
            "This test doesn't have the words.",
        ],
        "forbidden": [
            ["nike", "adidas", "puma"],
            ["grapes", "apples", "oranges"],
            ["Apples", "Oranges", "Grapes"],
            ["orange", "sweet", "grape"],
            ["none", "of", "these"],
        ],
    }
    df = pd.DataFrame(data)
    # Each list cell is turned into a tuple before it is handed to the feature.
    # NOTE(review): presumably the feature expects hashable cells - confirm.
    df["forbidden"] = df["forbidden"].apply(tuple)

    feature_generator = ItemNoMatch(columns=["generated", "forbidden"], case_sensitive=case, mode=mode)
    result = feature_generator.generate_feature(
        data=df,
        data_definition=create_data_definition(None, df, ColumnMapping()),
    )

    column_expected = feature_generator._feature_column_name()
    column_name_obj = feature_generator._as_column()
    expected_df = pd.DataFrame({column_expected: expected})
    assert result.equals(expected_df)
    # The human-readable name must reflect the selected mode.
    assert column_name_obj.display_name == f"Text does not contain {mode} of defined items"