# test_text_match_compat.py
"""Compatibility tests: new text-match descriptors vs. their legacy features.

Each test attaches a legacy feature (wrapped in ``FeatureDescriptor``) and its
new descriptor counterpart to the same dataset, then asserts both produce an
identical result column.
"""

import pandas as pd
import pytest

from evidently.core.datasets import Dataset
from evidently.core.datasets import FeatureDescriptor
from evidently.descriptors import Contains
from evidently.descriptors import DoesNotContain
from evidently.descriptors import ExcludesWords
from evidently.descriptors import IncludesWords
from evidently.descriptors import ItemMatch
from evidently.descriptors import ItemNoMatch
from evidently.descriptors import RegExp
from evidently.descriptors import TriggerWordsPresent
from evidently.descriptors import WordMatch
from evidently.descriptors import WordNoMatch
from evidently.descriptors import WordsPresence
from evidently.legacy.features.regexp_feature import RegExp as LegacyRegExp
from evidently.legacy.features.text_contains_feature import Contains as LegacyContains
from evidently.legacy.features.text_contains_feature import DoesNotContain as LegacyDoesNotContain
from evidently.legacy.features.text_contains_feature import ItemMatch as LegacyItemMatch
from evidently.legacy.features.text_contains_feature import ItemNoMatch as LegacyItemNoMatch
from evidently.legacy.features.trigger_words_presence_feature import TriggerWordsPresent as LegacyTriggerWordsPresent
from evidently.legacy.features.words_feature import ExcludesWords as LegacyExcludesWords
from evidently.legacy.features.words_feature import IncludesWords as LegacyIncludesWords
from evidently.legacy.features.words_feature import WordMatch as LegacyWordMatch
from evidently.legacy.features.words_feature import WordNoMatch as LegacyWordNoMatch
from evidently.legacy.features.words_feature import WordsPresence as LegacyWordsPresence


@pytest.fixture
def sample_data():
    """Texts covering case variants, matching/non-matching keywords, empty lists, and symbols."""
    return pd.DataFrame(
        {
            "description": [
                "This is urgent and important message",
                "This is just a test message",
                "URGENT: Please respond immediately",
                "Normal message without keywords",
                "Contains both urgent and important words",
                "Spam test message for filtering",
                "Empty message",
                "Message with numbers 123 and symbols @#$",
            ],
            "keywords": [
                ["urgent", "important"],
                ["test"],
                ["urgent"],
                [],
                ["urgent", "important"],
                ["spam", "test"],
                [],
                ["numbers"],
            ],
        }
    )


@pytest.fixture
def sample_dataset(sample_data):
    """The sample frame wrapped in an evidently ``Dataset``."""
    return Dataset.from_pandas(sample_data)


def _assert_matches_legacy(dataset, legacy_feature, new_desc):
    """Attach both descriptors to *dataset* and assert identical output columns.

    *legacy_feature* is wrapped in a ``FeatureDescriptor``; *new_desc* is used
    as-is. The comparison is on the raw column values via ``tolist()``.
    """
    legacy_desc = FeatureDescriptor(feature=legacy_feature)
    dataset.add_descriptor(legacy_desc)
    dataset.add_descriptor(new_desc)
    legacy_result = dataset.column(legacy_desc.alias)
    new_result = dataset.column(new_desc.alias)
    assert new_result.data.tolist() == legacy_result.data.tolist()


@pytest.mark.parametrize("case_sensitive", [True, False])
@pytest.mark.parametrize("mode", ["any", "all"])
def test_contains_parameters(sample_dataset, case_sensitive, mode):
    _assert_matches_legacy(
        sample_dataset,
        LegacyContains(
            column_name="description", items=["urgent", "important"], case_sensitive=case_sensitive, mode=mode
        ),
        Contains(column_name="description", items=["urgent", "important"], case_sensitive=case_sensitive, mode=mode),
    )


def test_contains_single_item(sample_dataset):
    _assert_matches_legacy(
        sample_dataset,
        LegacyContains(column_name="description", items=["urgent"], case_sensitive=False),
        Contains(column_name="description", items=["urgent"], case_sensitive=False),
    )


@pytest.mark.parametrize("case_sensitive", [True, False])
@pytest.mark.parametrize("mode", ["any", "all"])
def test_does_not_contain_parameters(sample_dataset, case_sensitive, mode):
    _assert_matches_legacy(
        sample_dataset,
        LegacyDoesNotContain(
            column_name="description", items=["spam", "test"], case_sensitive=case_sensitive, mode=mode
        ),
        DoesNotContain(column_name="description", items=["spam", "test"], case_sensitive=case_sensitive, mode=mode),
    )


@pytest.mark.parametrize("case_sensitive", [True, False])
@pytest.mark.parametrize("mode", ["any", "all"])
def test_item_match_parameters(sample_dataset, case_sensitive, mode):
    _assert_matches_legacy(
        sample_dataset,
        LegacyItemMatch(columns=["description", "keywords"], case_sensitive=case_sensitive, mode=mode),
        ItemMatch(columns=["description", "keywords"], case_sensitive=case_sensitive, mode=mode),
    )


def test_item_match_error_handling():
    """ItemMatch must reject anything but exactly two columns."""
    with pytest.raises(ValueError, match="ItemMatch requires exactly 2 columns"):
        ItemMatch(columns=["single_column"])

    with pytest.raises(ValueError, match="ItemMatch requires exactly 2 columns"):
        ItemMatch(columns=["col1", "col2", "col3"])


@pytest.mark.parametrize("case_sensitive", [True, False])
@pytest.mark.parametrize("mode", ["any", "all"])
def test_item_no_match_parameters(sample_dataset, case_sensitive, mode):
    _assert_matches_legacy(
        sample_dataset,
        LegacyItemNoMatch(columns=["description", "keywords"], case_sensitive=case_sensitive, mode=mode),
        ItemNoMatch(columns=["description", "keywords"], case_sensitive=case_sensitive, mode=mode),
    )


@pytest.mark.parametrize("mode", ["includes_any", "includes_all", "excludes_any", "excludes_all"])
@pytest.mark.parametrize("lemmatize", [True, False])
def test_words_presence_parameters(sample_dataset, mode, lemmatize):
    _assert_matches_legacy(
        sample_dataset,
        LegacyWordsPresence(
            column_name="description", words_list=["urgent", "important"], mode=mode, lemmatize=lemmatize
        ),
        WordsPresence(column_name="description", words_list=["urgent", "important"], mode=mode, lemmatize=lemmatize),
    )


@pytest.mark.parametrize("mode", ["any", "all"])
@pytest.mark.parametrize("lemmatize", [True, False])
def test_includes_words_parameters(sample_dataset, mode, lemmatize):
    _assert_matches_legacy(
        sample_dataset,
        LegacyIncludesWords(
            column_name="description", words_list=["urgent", "important"], mode=mode, lemmatize=lemmatize
        ),
        IncludesWords(column_name="description", words_list=["urgent", "important"], mode=mode, lemmatize=lemmatize),
    )


@pytest.mark.parametrize("mode", ["any", "all"])
@pytest.mark.parametrize("lemmatize", [True, False])
def test_excludes_words_parameters(sample_dataset, mode, lemmatize):
    _assert_matches_legacy(
        sample_dataset,
        LegacyExcludesWords(column_name="description", words_list=["spam", "test"], mode=mode, lemmatize=lemmatize),
        ExcludesWords(column_name="description", words_list=["spam", "test"], mode=mode, lemmatize=lemmatize),
    )


@pytest.mark.parametrize("mode", ["any", "all"])
@pytest.mark.parametrize("lemmatize", [True, False])
def test_word_match_parameters(sample_dataset, mode, lemmatize):
    _assert_matches_legacy(
        sample_dataset,
        LegacyWordMatch(columns=["description", "keywords"], mode=mode, lemmatize=lemmatize),
        WordMatch(columns=["description", "keywords"], mode=mode, lemmatize=lemmatize),
    )


@pytest.mark.parametrize("mode", ["any", "all"])
@pytest.mark.parametrize("lemmatize", [True, False])
def test_word_no_match_parameters(sample_dataset, mode, lemmatize):
    _assert_matches_legacy(
        sample_dataset,
        LegacyWordNoMatch(columns=["description", "keywords"], mode=mode, lemmatize=lemmatize),
        WordNoMatch(columns=["description", "keywords"], mode=mode, lemmatize=lemmatize),
    )


@pytest.mark.parametrize("lemmatize", [True, False])
def test_trigger_words_present_parameters(sample_dataset, lemmatize):
    _assert_matches_legacy(
        sample_dataset,
        LegacyTriggerWordsPresent(column_name="description", words_list=["urgent", "important"], lemmatize=lemmatize),
        TriggerWordsPresent(column_name="description", words_list=["urgent", "important"], lemmatize=lemmatize),
    )


def test_regex_parameters(sample_dataset):
    """RegExp compat check; the legacy column is cast to bool before comparing."""
    legacy_desc = FeatureDescriptor(feature=LegacyRegExp(column_name="description", reg_exp=r".*(urgent|important).*"))
    new_desc = RegExp(column_name="description", reg_exp=r".*(urgent|important).*")

    sample_dataset.add_descriptor(legacy_desc)
    sample_dataset.add_descriptor(new_desc)

    legacy_result = sample_dataset.column(legacy_desc.alias)
    new_result = sample_dataset.column(new_desc.alias)

    # The legacy regex column does not come back as bool, hence the astype cast
    # on the legacy side only.
    assert new_result.data.tolist() == legacy_result.data.astype(bool).tolist()


def test_single_item_list(sample_dataset):
    # NOTE(review): this duplicates test_contains_single_item exactly —
    # consider dropping one of the two.
    _assert_matches_legacy(
        sample_dataset,
        LegacyContains(column_name="description", items=["urgent"], case_sensitive=False),
        Contains(column_name="description", items=["urgent"], case_sensitive=False),
    )


def test_contains_alias():
    """Auto-generated alias mentions the source column."""
    desc = Contains("description", ["urgent", "important"])
    assert desc.alias is not None
    assert "description" in desc.alias


def test_does_not_contain_alias():
    """Auto-generated alias mentions the source column."""
    desc = DoesNotContain("description", ["spam", "test"])
    assert desc.alias is not None
    assert "description" in desc.alias


def test_item_match_alias():
    """Auto-generated alias mentions both source columns."""
    desc = ItemMatch(["description", "keywords"])
    assert desc.alias is not None
    assert "description" in desc.alias
    assert "keywords" in desc.alias


def test_custom_alias_preservation():
    """An explicit alias overrides the auto-generated one."""
    custom_alias = "custom_contains"
    desc = Contains("description", ["urgent"], alias=custom_alias)
    assert desc.alias == custom_alias