text_contains_feature.py
from typing import ClassVar
from typing import List
from typing import Optional

import pandas as pd

from evidently.legacy.base_metric import ColumnName
from evidently.legacy.core import ColumnType
from evidently.legacy.features.generated_features import GeneratedFeature
from evidently.legacy.utils.data_preprocessing import DataDefinition

# The only aggregation modes accepted by the features in this module.
_ALLOWED_MODES = ("any", "all")


def _validated_mode(mode: str) -> str:
    """Return ``mode`` unchanged, or raise ``ValueError`` if it is not 'any' or 'all'."""
    if mode not in _ALLOWED_MODES:
        raise ValueError("mode must be either 'any' or 'all'")
    return mode


def _is_substring(item: str, text: object, case_sensitive: bool) -> bool:
    """Return True when ``item`` occurs in ``text`` as a plain (non-regex) substring.

    Anything that is not a ``str`` (NaN, None, numbers) is treated as containing
    nothing, so missing cells yield False instead of raising ``TypeError``.
    """
    if not isinstance(text, str):
        return False
    if case_sensitive:
        return item in text
    return item.casefold() in text.casefold()


class Contains(GeneratedFeature):
    """Boolean feature: does the text in ``column_name`` contain the given items?

    With ``mode="any"`` a row is True when at least one item occurs in the text;
    with ``mode="all"`` every item must occur. Items are matched as literal
    substrings in both modes.
    """

    class Config:
        type_alias = "evidently:feature:Contains"

    __feature_type__: ClassVar = ColumnType.Categorical
    column_name: str
    items: List[str]
    case_sensitive: bool
    mode: str

    def __init__(
        self,
        column_name: str,
        items: List[str],
        case_sensitive: bool = True,
        mode: str = "any",
        display_name: Optional[str] = None,
    ):
        """Store the configuration.

        Raises:
            ValueError: if ``mode`` is neither 'any' nor 'all'.
        """
        # Fields are assigned before super().__init__(), exactly as the original did.
        # NOTE(review): presumably the base model relies on this order - confirm.
        self.column_name = column_name
        self.display_name = display_name
        self.case_sensitive = case_sensitive
        self.mode = _validated_mode(mode)
        self.items = items
        super().__init__()

    def _feature_column_name(self) -> str:
        """Build the generated column name from the column, items, case flag and mode."""
        return f"{self.column_name}_{'_'.join(self.items)}_{self.case_sensitive}_{self.mode}"

    def generate_feature(self, data: pd.DataFrame, data_definition: DataDefinition) -> pd.DataFrame:
        """Compute the feature for every row of ``data``.

        Returns a single-column DataFrame named by ``_feature_column_name()``.

        Raises:
            ValueError: if ``self.mode`` is neither 'any' nor 'all'.
        """
        if self.mode == "any":
            aggregate = any
        elif self.mode == "all":
            aggregate = all
        else:
            raise ValueError("mode must be either 'any' or 'all'")
        # Both modes share the literal substring check, so items with regex
        # metacharacters ("c++", "a.b", "(") match as written and non-string
        # cells yield False.
        calculated = data[self.column_name].apply(
            lambda text: aggregate(self.comparison(item, text) for item in self.items)
        )
        return pd.DataFrame({self._feature_column_name(): calculated})

    def _as_column(self) -> ColumnName:
        """Describe the generated column, with a default human-readable display name."""
        return self._create_column(
            self._feature_column_name(),
            default_display_name=f"Text Contains of {self.mode} [{', '.join(self.items)}] for {self.column_name}",
        )

    def comparison(self, item: str, string: str):
        """Return True if ``item`` is a substring of ``string``; False for non-string input."""
        return _is_substring(item, string, self.case_sensitive)


class DoesNotContain(GeneratedFeature):
    """Boolean feature: the negation of ``Contains`` for the same settings.

    With ``mode="any"`` a row is True when none of the items occur in the text;
    with ``mode="all"`` a row is True when at least one item is missing.
    """

    class Config:
        type_alias = "evidently:feature:DoesNotContain"

    __feature_type__: ClassVar = ColumnType.Categorical
    column_name: str
    items: List[str]
    case_sensitive: bool
    mode: str

    def __init__(
        self,
        column_name: str,
        items: List[str],
        case_sensitive: bool = True,
        mode: str = "any",
        display_name: Optional[str] = None,
    ):
        """Store the configuration.

        Raises:
            ValueError: if ``mode`` is neither 'any' nor 'all'.
        """
        # Fields are assigned before super().__init__(), exactly as the original did.
        # NOTE(review): presumably the base model relies on this order - confirm.
        self.column_name = column_name
        self.display_name = display_name
        self.case_sensitive = case_sensitive
        self.mode = _validated_mode(mode)
        self.items = items
        super().__init__()

    def _feature_column_name(self) -> str:
        """Build the generated column name from the column, items, case flag and mode."""
        return f"{self.column_name}_{'_'.join(self.items)}_{self.case_sensitive}_{self.mode}"

    def generate_feature(self, data: pd.DataFrame, data_definition: DataDefinition) -> pd.DataFrame:
        """Compute the feature for every row of ``data``.

        Returns a single-column DataFrame named by ``_feature_column_name()``.

        Raises:
            ValueError: if ``self.mode`` is neither 'any' nor 'all'.
        """
        if self.mode == "any":
            aggregate = any
        elif self.mode == "all":
            aggregate = all
        else:
            raise ValueError("mode must be either 'any' or 'all'")
        # Negate per row on plain Python bools. Applying `~` to an object-dtype
        # Series (what str.contains returns when cells are missing) either raises
        # or bitwise-inverts True/False into -2/-1.
        calculated = data[self.column_name].apply(
            lambda text: not aggregate(self.comparison(item, text) for item in self.items)
        )
        return pd.DataFrame({self._feature_column_name(): calculated})

    def _as_column(self) -> ColumnName:
        """Describe the generated column, with a default human-readable display name."""
        listed_items = ", ".join(self.items)
        return self._create_column(
            self._feature_column_name(),
            default_display_name=f"Text Does Not Contain of {self.mode} [{listed_items}] for {self.column_name}",
        )

    def comparison(self, item: str, string: str):
        """Return True if ``item`` is a substring of ``string``; False for non-string input."""
        return _is_substring(item, string, self.case_sensitive)


class ItemMatch(GeneratedFeature):
    """Boolean feature: does the text in ``columns[0]`` contain the row's items from ``columns[1]``?

    Each cell of ``columns[1]`` is iterated to obtain that row's items. With
    ``mode="any"`` one matching item is enough; otherwise all must match.
    """

    class Config:
        type_alias = "evidently:feature:ItemMatch"

    __feature_type__: ClassVar = ColumnType.Categorical
    columns: List[str]
    case_sensitive: bool
    mode: str

    def __init__(
        self,
        columns: List[str],
        case_sensitive: bool = True,
        mode: str = "any",
        display_name: Optional[str] = None,
    ):
        """Store the configuration.

        Raises:
            ValueError: if ``columns`` does not hold exactly two names, or if
                ``mode`` is neither 'any' nor 'all'.
        """
        if len(columns) != 2:
            raise ValueError("two columns must be provided")
        self.columns = columns
        self.display_name = display_name
        self.case_sensitive = case_sensitive
        self.mode = _validated_mode(mode)
        super().__init__()

    def _feature_column_name(self) -> str:
        """Build the generated column name from both column names, case flag and mode."""
        return f"{self.columns[0]}_{self.columns[1]}_item_match_{self.case_sensitive}_{self.mode}"

    def generate_feature(self, data: pd.DataFrame, data_definition: DataDefinition) -> pd.DataFrame:
        """Compute the feature row by row.

        Returns a single-column DataFrame named by ``_feature_column_name()``.
        """
        text_column, items_column = self.columns[0], self.columns[1]
        # Any mode other than "any" is handled as "all", as in the original branch.
        aggregate = any if self.mode == "any" else all
        calculated = data.apply(
            lambda row: aggregate(self.comparison(word, row[text_column]) for word in row[items_column]),
            axis=1,
        )
        return pd.DataFrame({self._feature_column_name(): calculated})

    def _as_column(self) -> ColumnName:
        """Describe the generated column, with a default human-readable display name."""
        return self._create_column(
            self._feature_column_name(),
            default_display_name=f"Text contains {self.mode} of defined items",
        )

    def comparison(self, item: str, string: str):
        """Return True if ``item`` is a substring of ``string``; False for non-string input."""
        return _is_substring(item, string, self.case_sensitive)


class ItemNoMatch(GeneratedFeature):
    """Boolean feature: the negation of ``ItemMatch`` for the same settings.

    With ``mode="any"`` a row is True when none of its items occur in the text;
    otherwise a row is True when at least one item is missing.
    """

    class Config:
        type_alias = "evidently:feature:ItemNoMatch"

    __feature_type__: ClassVar = ColumnType.Categorical
    columns: List[str]
    case_sensitive: bool
    mode: str

    def __init__(
        self,
        columns: List[str],
        case_sensitive: bool = True,
        mode: str = "any",
        display_name: Optional[str] = None,
    ):
        """Store the configuration.

        Raises:
            ValueError: if ``columns`` does not hold exactly two names, or if
                ``mode`` is neither 'any' nor 'all'.
        """
        # Same guard as ItemMatch: without it a short list only fails later with
        # IndexError, and extra columns are silently ignored.
        if len(columns) != 2:
            raise ValueError("two columns must be provided")
        self.columns = columns
        self.display_name = display_name
        self.case_sensitive = case_sensitive
        self.mode = _validated_mode(mode)
        super().__init__()

    def _feature_column_name(self) -> str:
        """Build the generated column name from both column names, case flag and mode."""
        return f"{self.columns[0]}_{self.columns[1]}_item_no_match_{self.case_sensitive}_{self.mode}"

    def generate_feature(self, data: pd.DataFrame, data_definition: DataDefinition) -> pd.DataFrame:
        """Compute the feature row by row.

        Returns a single-column DataFrame named by ``_feature_column_name()``.
        """
        text_column, items_column = self.columns[0], self.columns[1]
        # Any mode other than "any" is handled as "all", as in the original branch.
        aggregate = any if self.mode == "any" else all
        calculated = data.apply(
            lambda row: not aggregate(self.comparison(word, row[text_column]) for word in row[items_column]),
            axis=1,
        )
        return pd.DataFrame({self._feature_column_name(): calculated})

    def _as_column(self) -> ColumnName:
        """Describe the generated column, with a default human-readable display name."""
        return self._create_column(
            self._feature_column_name(),
            default_display_name=f"Text does not contain {self.mode} of defined items",
        )

    def comparison(self, item: str, string: str):
        """Return True if ``item`` is a substring of ``string``; False for non-string input."""
        return _is_substring(item, string, self.case_sensitive)