test_words_feature.py
from typing import List

import pandas as pd
import pytest

from evidently.legacy.features.words_feature import ExcludesWords
from evidently.legacy.features.words_feature import IncludesWords
from evidently.legacy.features.words_feature import WordMatch
from evidently.legacy.features.words_feature import WordNoMatch
from evidently.legacy.pipeline.column_mapping import ColumnMapping
from evidently.legacy.utils.data_preprocessing import create_data_definition

# Texts fed to the single-column IncludesWords / ExcludesWords tests.
input_data = [
    "Who are you and where are my apples and grapes?",
    "Apple is red",
    "Grape is blue",
]

# Texts shared by the row-wise WordMatch / WordNoMatch tests.
_GENERATED_TEXTS = [
    "I love eating apples and grapes.",
    "I eat apples, grapes, and oranges",
    "Grapes, oranges, apples.",
    "Oranges are more sour than grapes.",
    "This test doesn't have the words.",
    "You are allowed to cancel at any time, and we guarantee that you will receive a refund.",
]

# Per-row word lists paired positionally with _GENERATED_TEXTS.
_ROW_WORD_LISTS = [
    ["apples", "grapes", "oranges"],
    ["grapes", "apples", "oranges"],
    ["apple", "orange", "grape"],
    ["orange", "sweet", "grape"],
    ["none", "of", "these"],
    ["guarantee", "allowed", "refund"],
]


def _definition_for(frame: pd.DataFrame):
    """Build a data definition for *frame* with a default ``ColumnMapping``."""
    return create_data_definition(None, frame, ColumnMapping())


def _texts_with_words(words_column: str) -> pd.DataFrame:
    """Return a frame of the shared texts plus a tuple-valued *words_column*."""
    frame = pd.DataFrame(
        {
            "generated": _GENERATED_TEXTS,
            words_column: _ROW_WORD_LISTS,
        }
    )
    # The word lists are converted to tuples before being handed to the feature.
    frame[words_column] = frame[words_column].apply(tuple)
    return frame


@pytest.mark.parametrize(
    "words, mode, lemmatize, expected",
    [
        (["apple", "grape"], "any", True, [True, True, True]),
        (["apple", "grape"], "all", True, [True, False, False]),
        (["apple", "grape"], "any", False, [False, True, True]),
        (["apple", "grape"], "all", False, [False, False, False]),
    ],
)
def test_includes_words(words: List[str], mode: str, lemmatize: bool, expected: List[bool]):
    """IncludesWords yields the expected flag per text for each mode/lemmatize pair."""
    feature_generator = IncludesWords("column_1", words_list=words, mode=mode, lemmatize=lemmatize)
    frame = pd.DataFrame({"column_1": input_data})

    result = feature_generator.generate_feature(
        data=frame,
        data_definition=_definition_for(frame),
    )

    wanted = pd.DataFrame({feature_generator._feature_column_name(): expected})
    assert result.equals(wanted)


@pytest.mark.parametrize(
    "words, mode, lemmatize, expected",
    [
        (["apple", "grape"], "any", True, [False, True, True]),
        (["apple", "grape"], "all", True, [False, False, False]),
        (["apple", "grape"], "any", False, [True, True, True]),
        (["apple", "grape"], "all", False, [True, False, False]),
    ],
)
def test_excludes_words(words: List[str], mode: str, lemmatize: bool, expected: List[bool]):
    """ExcludesWords yields the expected flag per text for each mode/lemmatize pair."""
    feature_generator = ExcludesWords("column_1", words_list=words, mode=mode, lemmatize=lemmatize)
    frame = pd.DataFrame({"column_1": input_data})

    result = feature_generator.generate_feature(
        data=frame,
        data_definition=_definition_for(frame),
    )

    wanted = pd.DataFrame({feature_generator._feature_column_name(): expected})
    assert result.equals(wanted)


@pytest.mark.parametrize(
    "mode, lemmatize, expected",
    [
        ("any", False, [True, True, False, False, False, True]),
        ("all", False, [False, True, False, False, False, True]),
        ("any", True, [False, False, True, True, False, True]),
        ("all", True, [False, False, True, False, False, True]),
    ],
)
def test_word_match(mode: str, lemmatize: bool, expected: List[bool]):
    """WordMatch compares each text with its row's word tuple and names its column."""
    df = _texts_with_words("expected")
    feature_generator = WordMatch(columns=["generated", "expected"], mode=mode, lemmatize=lemmatize)

    result = feature_generator.generate_feature(
        data=df,
        data_definition=_definition_for(df),
    )

    wanted = pd.DataFrame({feature_generator._feature_name(): expected})
    assert result.equals(wanted)

    column_obj = feature_generator._as_column()
    assert column_obj.display_name == f"Text contains {mode} defined words"


@pytest.mark.parametrize(
    "mode, lemmatize, expected",
    [
        ("any", False, [True, False, True, True, True, False]),
        ("all", False, [False, False, True, True, True, False]),
        ("any", True, [True, True, False, True, True, False]),
        ("all", True, [True, True, False, False, True, False]),
    ],
)
def test_word_no_match(mode: str, lemmatize: bool, expected: List[bool]):
    """WordNoMatch compares each text with its row's forbidden tuple and names its column."""
    df = _texts_with_words("forbidden")
    feature_generator = WordNoMatch(columns=["generated", "forbidden"], mode=mode, lemmatize=lemmatize)

    result = feature_generator.generate_feature(
        data=df,
        data_definition=_definition_for(df),
    )

    wanted = pd.DataFrame({feature_generator._feature_name(): expected})
    assert result.equals(wanted)

    column_obj = feature_generator._as_column()
    assert column_obj.display_name == f"Text does not contain {mode} defined words"