test_text_contains_feature.py
from typing import List

import pandas as pd
import pytest

from evidently.legacy.features.text_contains_feature import Contains
from evidently.legacy.features.text_contains_feature import DoesNotContain
from evidently.legacy.features.text_contains_feature import ItemMatch
from evidently.legacy.features.text_contains_feature import ItemNoMatch
from evidently.legacy.pipeline.column_mapping import ColumnMapping
from evidently.legacy.utils.data_preprocessing import create_data_definition

# Shared single-column input for the Contains / DoesNotContain tests:
# space-separated letters in mixed case.
test_data = [
    "a b c d e f g h",
    "b c d e f g h",
    "h",
    "A",
    "a B c D",
]


def _generate(feature, frame: pd.DataFrame):
    """Run ``feature.generate_feature`` on ``frame``.

    The data definition is built from ``frame`` with a default
    ``ColumnMapping`` and no reference data, as every test in this module
    does.
    """
    definition = create_data_definition(None, frame, ColumnMapping())
    return feature.generate_feature(data=frame, data_definition=definition)


@pytest.mark.parametrize(
    ("items", "case", "mode", "expected"),
    [
        (["a"], True, "any", [True, False, False, False, True]),
        (["b"], True, "any", [True, True, False, False, False]),
        (["a"], False, "any", [True, False, False, True, True]),
        (["b"], False, "any", [True, True, False, False, True]),
        (["a", "b"], True, "any", [True, True, False, False, True]),
        (["a", "b"], True, "all", [True, False, False, False, False]),
        (["a", "b"], False, "any", [True, True, False, True, True]),
        (["a", "b"], False, "all", [True, False, False, False, True]),
    ],
)
def test_text_contains_feature(items: List[str], case: bool, mode: str, expected: List[bool]):
    """``Contains`` yields the expected boolean column for each items/case/mode combination."""
    frame = pd.DataFrame({"column_1": test_data})
    contains = Contains("column_1", items, case_sensitive=case, mode=mode)

    produced = _generate(contains, frame)

    wanted = pd.DataFrame({contains._feature_column_name(): expected})
    assert produced.equals(wanted)


@pytest.mark.parametrize(
    ("items", "case", "mode", "expected"),
    [
        (["a", "b"], True, "any", [False, False, True, True, False]),
        (["a", "b"], True, "all", [False, True, True, True, True]),
        (["a", "b"], False, "any", [False, False, True, False, False]),
        (["a", "b"], False, "all", [False, True, True, True, False]),
    ],
)
def test_text_not_contains_feature(items: List[str], case: bool, mode: str, expected: List[bool]):
    """``DoesNotContain`` yields the expected boolean column for each items/case/mode combination."""
    frame = pd.DataFrame({"column_1": test_data})
    does_not_contain = DoesNotContain("column_1", items, case_sensitive=case, mode=mode)

    produced = _generate(does_not_contain, frame)

    wanted = pd.DataFrame({does_not_contain._feature_column_name(): expected})
    assert produced.equals(wanted)


@pytest.mark.parametrize(
    ("case", "mode", "expected"),
    [
        (True, "any", [False, True, False, True, False]),
        (True, "all", [False, True, False, False, False]),
        (False, "any", [True, True, True, True, False]),
        (False, "all", [False, True, True, False, False]),
    ],
)
def test_item_match(case: bool, mode: str, expected: List[bool]):
    """``ItemMatch`` compares each text with its own per-row item collection.

    Checks both the generated boolean column and the display name of the
    column object returned by ``_as_column()``.
    """
    frame = pd.DataFrame(
        {
            "generated": [
                "You should consider purchasing Nike or Adidas shoes.",
                "I eat apples, grapes, and oranges",
                "grapes, oranges, apples.",
                "Oranges are more sour than grapes.",
                "This test doesn't have the words.",
            ],
            "expected": [
                ["nike", "adidas", "puma"],
                ["grapes", "apples", "oranges"],
                ["Apples", "Oranges", "Grapes"],
                ["orange", "sweet", "grape"],
                ["none", "of", "these"],
            ],
        }
    )
    # Each per-row list of items is turned into a tuple before the feature runs.
    frame["expected"] = frame["expected"].apply(tuple)
    matcher = ItemMatch(columns=["generated", "expected"], case_sensitive=case, mode=mode)

    produced = _generate(matcher, frame)

    feature_column = matcher._feature_column_name()
    column = matcher._as_column()
    wanted = pd.DataFrame({feature_column: expected})
    assert produced.equals(wanted)
    assert column.display_name == f"Text contains {mode} of defined items"


@pytest.mark.parametrize(
    ("case", "mode", "expected"),
    [
        (True, "any", [True, False, True, False, True]),
        (True, "all", [True, False, True, True, True]),
        (False, "any", [False, False, False, False, True]),
        (False, "all", [True, False, False, True, True]),
    ],
)
def test_item_no_match(case: bool, mode: str, expected: List[bool]):
    """``ItemNoMatch`` compares each text with its own per-row forbidden items.

    Checks both the generated boolean column and the display name of the
    column object returned by ``_as_column()``.
    """
    frame = pd.DataFrame(
        {
            "generated": [
                "You should consider purchasing Nike or Adidas shoes.",
                "I eat apples, grapes, and oranges",
                "grapes, oranges, apples.",
                "Oranges are more sour than grapes.",
                "This test doesn't have the words.",
            ],
            "forbidden": [
                ["nike", "adidas", "puma"],
                ["grapes", "apples", "oranges"],
                ["Apples", "Oranges", "Grapes"],
                ["orange", "sweet", "grape"],
                ["none", "of", "these"],
            ],
        }
    )
    # Each per-row list of items is turned into a tuple before the feature runs.
    frame["forbidden"] = frame["forbidden"].apply(tuple)
    no_matcher = ItemNoMatch(columns=["generated", "forbidden"], case_sensitive=case, mode=mode)

    produced = _generate(no_matcher, frame)

    feature_column = no_matcher._feature_column_name()
    column = no_matcher._as_column()
    wanted = pd.DataFrame({feature_column: expected})
    assert produced.equals(wanted)
    assert column.display_name == f"Text does not contain {mode} of defined items"