/ tests / features / test_words_feature.py
test_words_feature.py
  1  from typing import List
  2  
  3  import pandas as pd
  4  import pytest
  5  
  6  from evidently.legacy.features.words_feature import ExcludesWords
  7  from evidently.legacy.features.words_feature import IncludesWords
  8  from evidently.legacy.features.words_feature import WordMatch
  9  from evidently.legacy.features.words_feature import WordNoMatch
 10  from evidently.legacy.pipeline.column_mapping import ColumnMapping
 11  from evidently.legacy.utils.data_preprocessing import create_data_definition
 12  
# Shared text column for the IncludesWords / ExcludesWords tests below.
# Row 0 contains the lowercase plural forms "apples" and "grapes"; rows 1 and 2
# each contain a single capitalized singular form ("Apple" / "Grape").
input_data: List[str] = [
    "Who are you and where are my apples and grapes?",
    "Apple is red",
    "Grape is blue",
]
 18  
 19  
 20  @pytest.mark.parametrize(
 21      ["words", "mode", "lemmatize", "expected"],
 22      [
 23          (["apple", "grape"], "any", True, [True, True, True]),
 24          (["apple", "grape"], "all", True, [True, False, False]),
 25          (["apple", "grape"], "any", False, [False, True, True]),
 26          (["apple", "grape"], "all", False, [False, False, False]),
 27      ],
 28  )
 29  def test_includes_words(words: List[str], mode: str, lemmatize: bool, expected: List[bool]):
 30      feature_generator = IncludesWords("column_1", words_list=words, mode=mode, lemmatize=lemmatize)
 31      data = pd.DataFrame(dict(column_1=input_data))
 32      result = feature_generator.generate_feature(
 33          data=data,
 34          data_definition=create_data_definition(None, data, ColumnMapping()),
 35      )
 36      assert result.equals(pd.DataFrame(dict([(feature_generator._feature_column_name(), expected)])))
 37  
 38  
 39  @pytest.mark.parametrize(
 40      ["words", "mode", "lemmatize", "expected"],
 41      [
 42          (["apple", "grape"], "any", True, [False, True, True]),
 43          (["apple", "grape"], "all", True, [False, False, False]),
 44          (["apple", "grape"], "any", False, [True, True, True]),
 45          (["apple", "grape"], "all", False, [True, False, False]),
 46      ],
 47  )
 48  def test_excludes_words(words: List[str], mode: str, lemmatize: bool, expected: List[bool]):
 49      feature_generator = ExcludesWords("column_1", words_list=words, mode=mode, lemmatize=lemmatize)
 50      data = pd.DataFrame(dict(column_1=input_data))
 51      result = feature_generator.generate_feature(
 52          data=data,
 53          data_definition=create_data_definition(None, data, ColumnMapping()),
 54      )
 55      assert result.equals(pd.DataFrame(dict([(feature_generator._feature_column_name(), expected)])))
 56  
 57  
 58  @pytest.mark.parametrize(
 59      ["mode", "lemmatize", "expected"],
 60      [
 61          ("any", False, [True, True, False, False, False, True]),
 62          ("all", False, [False, True, False, False, False, True]),
 63          ("any", True, [False, False, True, True, False, True]),
 64          ("all", True, [False, False, True, False, False, True]),
 65      ],
 66  )
 67  def test_word_match(mode: str, lemmatize: bool, expected: List[bool]):
 68      data = {
 69          "generated": [
 70              "I love eating apples and grapes.",
 71              "I eat apples, grapes, and oranges",
 72              "Grapes, oranges, apples.",
 73              "Oranges are more sour than grapes.",
 74              "This test doesn't have the words.",
 75              "You are allowed to cancel at any time, and we guarantee that you will receive a refund.",
 76          ],
 77          "expected": [
 78              ["apples", "grapes", "oranges"],
 79              ["grapes", "apples", "oranges"],
 80              ["apple", "orange", "grape"],
 81              ["orange", "sweet", "grape"],
 82              ["none", "of", "these"],
 83              ["guarantee", "allowed", "refund"],
 84          ],
 85      }
 86      df = pd.DataFrame(data)
 87      df["expected"] = df["expected"].apply(tuple)
 88      feature_generator = WordMatch(columns=["generated", "expected"], mode=mode, lemmatize=lemmatize)
 89      result = feature_generator.generate_feature(
 90          data=df,
 91          data_definition=create_data_definition(None, df, ColumnMapping()),
 92      )
 93      assert result.equals(pd.DataFrame(dict([(feature_generator._feature_name(), expected)])))
 94      column_obj = feature_generator._as_column()
 95      assert column_obj.display_name == f"Text contains {mode} defined words"
 96  
 97  
 98  @pytest.mark.parametrize(
 99      ["mode", "lemmatize", "expected"],
100      [
101          ("any", False, [True, False, True, True, True, False]),
102          ("all", False, [False, False, True, True, True, False]),
103          ("any", True, [True, True, False, True, True, False]),
104          ("all", True, [True, True, False, False, True, False]),
105      ],
106  )
107  def test_word_no_match(mode: str, lemmatize: bool, expected: List[bool]):
108      data = {
109          "generated": [
110              "I love eating apples and grapes.",
111              "I eat apples, grapes, and oranges",
112              "Grapes, oranges, apples.",
113              "Oranges are more sour than grapes.",
114              "This test doesn't have the words.",
115              "You are allowed to cancel at any time, and we guarantee that you will receive a refund.",
116          ],
117          "forbidden": [
118              ["apples", "grapes", "oranges"],
119              ["grapes", "apples", "oranges"],
120              ["apple", "orange", "grape"],
121              ["orange", "sweet", "grape"],
122              ["none", "of", "these"],
123              ["guarantee", "allowed", "refund"],
124          ],
125      }
126      df = pd.DataFrame(data)
127      df["forbidden"] = df["forbidden"].apply(tuple)
128      feature_generator = WordNoMatch(columns=["generated", "forbidden"], mode=mode, lemmatize=lemmatize)
129      result = feature_generator.generate_feature(
130          data=df,
131          data_definition=create_data_definition(None, df, ColumnMapping()),
132      )
133      assert result.equals(pd.DataFrame(dict([(feature_generator._feature_name(), expected)])))
134      column_obj = feature_generator._as_column()
135      assert column_obj.display_name == f"Text does not contain {mode} defined words"