/ src / evidently / legacy / features / text_contains_feature.py
text_contains_feature.py
import re
from typing import ClassVar
from typing import List
from typing import Optional

import pandas as pd

from evidently.legacy.base_metric import ColumnName
from evidently.legacy.core import ColumnType
from evidently.legacy.features.generated_features import GeneratedFeature
from evidently.legacy.utils.data_preprocessing import DataDefinition
 11  
 12  
 13  class Contains(GeneratedFeature):
 14      class Config:
 15          type_alias = "evidently:feature:Contains"
 16  
 17      __feature_type__: ClassVar = ColumnType.Categorical
 18      column_name: str
 19      items: List[str]
 20      case_sensitive: bool
 21      mode: str
 22  
 23      def __init__(
 24          self,
 25          column_name: str,
 26          items: List[str],
 27          case_sensitive: bool = True,
 28          mode: str = "any",
 29          display_name: Optional[str] = None,
 30      ):
 31          self.column_name = column_name
 32          self.display_name = display_name
 33          self.case_sensitive = case_sensitive
 34          if mode not in ["any", "all"]:
 35              raise ValueError("mode must be either 'any' or 'all'")
 36          self.mode = mode
 37          self.items = items
 38          super().__init__()
 39  
 40      def _feature_column_name(self) -> str:
 41          return f"{self.column_name}_" + "_".join(self.items) + "_" + str(self.case_sensitive) + "_" + self.mode
 42  
 43      def generate_feature(self, data: pd.DataFrame, data_definition: DataDefinition) -> pd.DataFrame:
 44          if self.mode == "any":
 45              calculated = data[self.column_name].str.contains("|".join(self.items), case=self.case_sensitive)
 46          elif self.mode == "all":
 47              calculated = data[self.column_name].apply(lambda x: all(self.comparison(i, x) for i in self.items))
 48          else:
 49              raise ValueError("mode must be either 'any' or 'all'")
 50          return pd.DataFrame({self._feature_column_name(): calculated})
 51  
 52      def _as_column(self) -> ColumnName:
 53          return self._create_column(
 54              self._feature_column_name(),
 55              default_display_name=f"Text Contains of {self.mode} [{', '.join(self.items)}] for {self.column_name}",
 56          )
 57  
 58      def comparison(self, item: str, string: str):
 59          if self.case_sensitive:
 60              return item in string
 61          return item.casefold() in string.casefold()
 62  
 63  
 64  class DoesNotContain(GeneratedFeature):
 65      class Config:
 66          type_alias = "evidently:feature:DoesNotContain"
 67  
 68      __feature_type__: ClassVar = ColumnType.Categorical
 69      column_name: str
 70      items: List[str]
 71      case_sensitive: bool
 72      mode: str
 73  
 74      def __init__(
 75          self,
 76          column_name: str,
 77          items: List[str],
 78          case_sensitive: bool = True,
 79          mode: str = "any",
 80          display_name: Optional[str] = None,
 81      ):
 82          self.column_name = column_name
 83          self.display_name = display_name
 84          self.case_sensitive = case_sensitive
 85          if mode not in ["any", "all"]:
 86              raise ValueError("mode must be either 'any' or 'all'")
 87          self.mode = mode
 88          self.items = items
 89          super().__init__()
 90  
 91      def _feature_column_name(self) -> str:
 92          return f"{self.column_name}_" + "_".join(self.items) + "_" + str(self.case_sensitive) + "_" + self.mode
 93  
 94      def generate_feature(self, data: pd.DataFrame, data_definition: DataDefinition) -> pd.DataFrame:
 95          if self.mode == "any":
 96              calculated = ~data[self.column_name].str.contains("|".join(self.items), case=self.case_sensitive)
 97          elif self.mode == "all":
 98              calculated = ~data[self.column_name].apply(lambda x: all(self.comparison(i, x) for i in self.items))
 99          else:
100              raise ValueError("mode must be either 'any' or 'all'")
101          return pd.DataFrame({self._feature_column_name(): calculated})
102  
103      def _as_column(self) -> ColumnName:
104          return self._create_column(
105              self._feature_column_name(),
106              default_display_name=f"Text Does Not Contain of {self.mode} [{', '.join(self.items)}] for {self.column_name}",
107          )
108  
109      def comparison(self, item: str, string: str):
110          if not isinstance(string, str):
111              return False
112          if self.case_sensitive:
113              return item in string
114          return item.casefold() in string.casefold()
115  
116  
class ItemMatch(GeneratedFeature):
    """Row-wise check that items from one column occur in the text of another.

    ``columns[0]`` is the column that is searched and ``columns[1]`` supplies,
    per row, the collection of items to look for (presumably a list of
    strings - confirm against callers).
    """

    class Config:
        type_alias = "evidently:feature:ItemMatch"

    __feature_type__: ClassVar = ColumnType.Categorical
    # Exactly two names: [searched column, items column].
    columns: List[str]
    # When False, matching ignores letter case.
    case_sensitive: bool
    # "any": one matching item is enough; otherwise every item must match.
    mode: str

    def __init__(
        self,
        columns: List[str],
        case_sensitive: bool = True,
        mode: str = "any",
        display_name: Optional[str] = None,
    ):
        """Validate and store the configuration.

        Raises:
            ValueError: If ``columns`` does not hold exactly two names, or if
                ``mode`` is neither ``"any"`` nor ``"all"``.
        """
        if len(columns) != 2:
            raise ValueError("two columns must be provided")
        self.columns = columns
        self.display_name = display_name
        self.case_sensitive = case_sensitive
        if mode not in ("any", "all"):
            raise ValueError("mode must be either 'any' or 'all'")
        self.mode = mode
        super().__init__()

    def _feature_column_name(self) -> str:
        """Compose the output column name from both column names and the settings."""
        return f"{self.columns[0]}_{self.columns[1]}_item_match_{self.case_sensitive}_{self.mode}"

    def generate_feature(self, data: pd.DataFrame, data_definition: DataDefinition) -> pd.DataFrame:
        """Evaluate the match for each row and return it as a one-column frame.

        ``data_definition`` is not used.
        """
        # Pick the aggregation once instead of duplicating the row lambda.
        reducer = any if self.mode == "any" else all

        def row_matches(row):
            return reducer(self.comparison(word, row[self.columns[0]]) for word in row[self.columns[1]])

        calculated = data.apply(row_matches, axis=1)
        return pd.DataFrame({self._feature_column_name(): calculated})

    def _as_column(self) -> ColumnName:
        """Return the column reference for this feature with its default display name."""
        return self._create_column(
            self._feature_column_name(),
            default_display_name=f"Text contains {self.mode} of defined items",
        )

    def comparison(self, item: str, string: str):
        """Return whether ``item`` occurs in ``string``, honouring ``case_sensitive``."""
        if not self.case_sensitive:
            item, string = item.casefold(), string.casefold()
        return item in string
169  
170  
class ItemNoMatch(GeneratedFeature):
    """Row-wise check that items from one column do not occur in the text of another.

    ``columns[0]`` is the column that is searched and ``columns[1]`` supplies,
    per row, the collection of items to look for (presumably a list of
    strings - confirm against callers). With ``mode="any"`` a row is flagged
    when none of its items occurs in the text; otherwise it is flagged unless
    every item occurs.
    """

    class Config:
        type_alias = "evidently:feature:ItemNoMatch"

    __feature_type__: ClassVar = ColumnType.Categorical
    # Exactly two names: [searched column, items column].
    columns: List[str]
    # When False, matching ignores letter case.
    case_sensitive: bool
    # Either "any" or "all".
    mode: str

    def __init__(
        self,
        columns: List[str],
        case_sensitive: bool = True,
        mode: str = "any",
        display_name: Optional[str] = None,
    ):
        """Validate and store the configuration.

        Args:
            columns: The searched column name followed by the items column name.
            case_sensitive: Whether letter case must match.
            mode: ``"any"`` or ``"all"``.
            display_name: Optional display name stored on the feature.

        Raises:
            ValueError: If ``columns`` does not hold exactly two names, or if
                ``mode`` is neither ``"any"`` nor ``"all"``.
        """
        # Fail fast on a wrong column count; the rest of the class indexes
        # columns[0] and columns[1] unconditionally.
        if len(columns) != 2:
            raise ValueError("two columns must be provided")
        self.columns = columns
        self.display_name = display_name
        self.case_sensitive = case_sensitive
        if mode not in ["any", "all"]:
            raise ValueError("mode must be either 'any' or 'all'")
        self.mode = mode
        super().__init__()

    def _feature_column_name(self) -> str:
        """Compose the output column name from both column names and the settings."""
        return f"{self.columns[0]}_{self.columns[1]}" + "_item_no_match_" + str(self.case_sensitive) + "_" + self.mode

    def generate_feature(self, data: pd.DataFrame, data_definition: DataDefinition) -> pd.DataFrame:
        """Evaluate the negated match for each row.

        Args:
            data: Frame holding both configured columns.
            data_definition: Not used by this feature.

        Returns:
            A one-column frame named by ``_feature_column_name``.
        """
        # Select the aggregation once; the result is negated per row below.
        reducer = any if self.mode == "any" else all
        calculated = data.apply(
            lambda row: not reducer(self.comparison(word, row[self.columns[0]]) for word in row[self.columns[1]]),
            axis=1,
        )
        return pd.DataFrame({self._feature_column_name(): calculated})

    def _as_column(self) -> ColumnName:
        """Return the column reference for this feature with its default display name."""
        return self._create_column(
            self._feature_column_name(),
            default_display_name=f"Text does not contain {self.mode} of defined items",
        )

    def comparison(self, item: str, string: str):
        """Return whether ``item`` occurs in ``string``, honouring ``case_sensitive``."""
        if self.case_sensitive:
            return item in string
        return item.casefold() in string.casefold()