/ src / evidently / legacy / metrics / data_drift / target_by_features_table.py
target_by_features_table.py
  1  import json
  2  from typing import Dict
  3  from typing import List
  4  from typing import Optional
  5  from typing import Union
  6  
  7  import numpy as np
  8  import pandas as pd
  9  import plotly.express as px
 10  import plotly.graph_objs as go
 11  from pandas.api.types import is_integer_dtype
 12  from pandas.api.types import is_string_dtype
 13  from plotly.subplots import make_subplots
 14  
 15  from evidently.legacy.base_metric import InputData
 16  from evidently.legacy.base_metric import Metric
 17  from evidently.legacy.base_metric import MetricResult
 18  from evidently.legacy.base_metric import UsesRawDataMixin
 19  from evidently.legacy.calculations.classification_performance import get_prediction_data
 20  from evidently.legacy.core import ColumnType
 21  from evidently.legacy.core import IncludeTags
 22  from evidently.legacy.features.non_letter_character_percentage_feature import NonLetterCharacterPercentage
 23  from evidently.legacy.features.OOV_words_percentage_feature import OOVWordsPercentage
 24  from evidently.legacy.features.text_length_feature import TextLength
 25  from evidently.legacy.metric_results import StatsByFeature
 26  from evidently.legacy.model.widget import AdditionalGraphInfo
 27  from evidently.legacy.model.widget import BaseWidgetInfo
 28  from evidently.legacy.options.base import AnyOptions
 29  from evidently.legacy.renderers.base_renderer import MetricRenderer
 30  from evidently.legacy.renderers.base_renderer import default_renderer
 31  from evidently.legacy.utils.data_operations import process_columns
 32  from evidently.legacy.utils.data_preprocessing import DataDefinition
 33  
 34  
class TargetByFeaturesTableResults(MetricResult):
    """Result payload returned by ``TargetByFeaturesTable.calculate``.

    Holds the current/reference data and the parameters that
    ``TargetByFeaturesTableRenderer`` reads to draw one target and/or
    prediction plot per feature.
    """

    class Config:
        type_alias = "evidently:metric_result:TargetByFeaturesTableResults"
        # NOTE(review): presumably keeps this result out of the dict export — confirm against MetricResult.
        dict_include = False
        # Tags attached to each field: the two datasets are tagged as Current / Reference,
        # everything else is tagged as a parameter.
        field_tags = {
            "current": {IncludeTags.Current},
            "reference": {IncludeTags.Reference},
            "target_name": {IncludeTags.Parameter},
            "columns": {IncludeTags.Parameter},
            "task": {IncludeTags.Parameter},
        }

    # Plot data (and, when a prediction column is mapped, predictions) for the current dataset.
    current: StatsByFeature
    # Same for the reference dataset; ``calculate`` sets it to None when raw-data rendering is disabled.
    reference: Optional[StatsByFeature]
    # Name of the target column; None when no target is mapped or raw-data rendering is disabled.
    target_name: Optional[str]
    # Feature names to plot; text columns are replaced by the names of their generated features.
    columns: List[str]
    # Task from the column mapping, or "classification" / "regression" inferred by ``calculate``;
    # empty string when raw-data rendering is disabled.
    task: str
 52  
 53  
 54  class TargetByFeaturesTable(UsesRawDataMixin, Metric[TargetByFeaturesTableResults]):
 55      class Config:
 56          type_alias = "evidently:metric:TargetByFeaturesTable"
 57  
 58      columns: Optional[List[str]]
 59      _text_features_gen: Optional[
 60          Dict[
 61              str,
 62              Dict[str, Union[TextLength, NonLetterCharacterPercentage, OOVWordsPercentage]],
 63          ]
 64      ]
 65  
 66      def __init__(self, columns: Optional[List[str]] = None, options: AnyOptions = None):
 67          self.columns = columns
 68          super().__init__(options=options)
 69          self._text_features_gen = None
 70  
 71      def required_features(self, data_definition: DataDefinition):
 72          if len(data_definition.get_columns(ColumnType.Text, features_only=True)) > 0:
 73              text_cols = [col.column_name for col in data_definition.get_columns(ColumnType.Text, features_only=True)]
 74              text_features_gen = {}
 75              text_features_gen_result = []
 76              for col in text_cols:
 77                  col_dict: Dict[
 78                      str,
 79                      Union[TextLength, NonLetterCharacterPercentage, OOVWordsPercentage],
 80                  ] = {}
 81                  col_dict[f"{col}: Text Length"] = TextLength(col)
 82                  col_dict[f"{col}: Non Letter Character %"] = NonLetterCharacterPercentage(col)
 83                  col_dict[f"{col}: OOV %"] = OOVWordsPercentage(col)
 84  
 85                  text_features_gen_result += [
 86                      col_dict[f"{col}: Text Length"],
 87                      col_dict[f"{col}: Non Letter Character %"],
 88                      col_dict[f"{col}: OOV %"],
 89                  ]
 90                  text_features_gen[col] = col_dict
 91              self._text_features_gen = text_features_gen
 92  
 93              return text_features_gen_result
 94          else:
 95              return []
 96  
 97      def get_parameters(self) -> tuple:
 98          return ()
 99  
100      def calculate(self, data: InputData) -> TargetByFeaturesTableResults:
101          if not self.get_options().render_options.raw_data:
102              return TargetByFeaturesTableResults(
103                  current=StatsByFeature(plot_data=pd.DataFrame()),
104                  reference=None,
105                  target_name=None,
106                  columns=[],
107                  task="",
108              )
109          dataset_columns = process_columns(data.current_data, data.column_mapping)
110          target_name = dataset_columns.utility_columns.target
111          prediction_name = dataset_columns.utility_columns.prediction
112          if target_name is None and prediction_name is None:
113              raise ValueError("The columns 'target' or 'prediction' should be present")
114          if data.reference_data is None:
115              raise ValueError("Reference data should be present")
116          curr_df = data.current_data.copy()
117          ref_df = data.reference_data.copy()
118          curr_predictions = None
119          ref_predictions = None
120          if prediction_name is not None:
121              curr_predictions = get_prediction_data(data.current_data, dataset_columns, data.column_mapping.pos_label)
122              ref_predictions = get_prediction_data(data.reference_data, dataset_columns, data.column_mapping.pos_label)
123  
124          if self.columns is None:
125              columns = (
126                  dataset_columns.num_feature_names
127                  + dataset_columns.cat_feature_names
128                  + dataset_columns.text_feature_names
129              )
130          else:
131              columns = list(
132                  np.intersect1d(
133                      self.columns,
134                      (
135                          dataset_columns.num_feature_names
136                          + dataset_columns.cat_feature_names
137                          + dataset_columns.text_feature_names
138                      ),
139                  )
140              )
141          if data.column_mapping.task is not None:
142              task = data.column_mapping.task
143          else:
144              if target_name is not None:
145                  if curr_df[target_name].nunique() < 5 or is_string_dtype(curr_df[target_name]):
146                      task = "classification"
147                  else:
148                      task = "regression"
149              elif curr_predictions is not None:
150                  if is_string_dtype(curr_predictions.predictions) or (
151                      is_integer_dtype(curr_predictions.predictions) and curr_predictions.predictions.nunique() < 5
152                  ):
153                      task = "classification"
154                  else:
155                      task = "regression"
156              else:
157                  raise ValueError("Task parameter of column_mapping should be specified")
158          # process text columns
159          if (
160              self._text_features_gen is not None
161              and len(np.intersect1d(list(self._text_features_gen.keys()), columns)) >= 1
162          ):
163              for col in np.intersect1d(list(self._text_features_gen.keys()), columns):
164                  columns += list(self._text_features_gen[col].keys())
165                  columns.remove(col)
166                  curr_text_df = pd.concat(
167                      [data.get_current_column(x.as_column()) for x in list(self._text_features_gen[col].values())],
168                      axis=1,
169                  )
170                  curr_text_df.columns = pd.Index(list(self._text_features_gen[col].keys()))
171                  curr_df = pd.concat(
172                      [
173                          curr_df.reset_index(drop=True),
174                          curr_text_df.reset_index(drop=True),
175                      ],
176                      axis=1,
177                  )
178  
179                  if ref_df is not None:
180                      ref_text_df = pd.concat(
181                          [data.get_reference_column(x.as_column()) for x in list(self._text_features_gen[col].values())],
182                          axis=1,
183                      )
184                      ref_text_df.columns = pd.Index(list(self._text_features_gen[col].keys()))
185                      ref_df = pd.concat(
186                          [
187                              ref_df.reset_index(drop=True),
188                              ref_text_df.reset_index(drop=True),
189                          ],
190                          axis=1,
191                      )
192          table_columns = columns.copy()
193          if target_name is not None:
194              table_columns += [target_name]
195          if prediction_name is not None and isinstance(prediction_name, str):
196              table_columns += [prediction_name]
197          if prediction_name is not None and isinstance(prediction_name, list):
198              table_columns += prediction_name
199  
200          return TargetByFeaturesTableResults(
201              current=StatsByFeature(
202                  plot_data=curr_df[table_columns],
203                  predictions=curr_predictions,
204              ),
205              reference=StatsByFeature(
206                  plot_data=ref_df[table_columns],
207                  predictions=ref_predictions,
208              ),
209              columns=columns,
210              target_name=target_name,
211              task=task,
212          )
213  
214  
@default_renderer(wrap_type=TargetByFeaturesTable)
class TargetByFeaturesTableRenderer(MetricRenderer):
    """HTML renderer for ``TargetByFeaturesTable``.

    Produces a single "big_table" widget with one row per feature; each row
    links to a target plot and/or a prediction plot for that feature.
    """

    def render_html(self, obj: TargetByFeaturesTable) -> List[BaseWidgetInfo]:
        """Render the metric result as a list with one table widget.

        Args:
            obj: the calculated metric.

        Returns:
            An empty list when the ``raw_data`` render option is disabled,
            otherwise a one-element list holding the table widget.

        Raises:
            ValueError: if the result has no reference data.
        """
        if not obj.get_options().render_options.raw_data:
            return []
        result = obj.get_result()
        current_data = result.current.plot_data
        # todo: better typing
        assert current_data is not None
        if result.reference is None:
            raise ValueError("reference is not set but required")
        reference_data = result.reference.plot_data
        if reference_data is None:
            raise ValueError("reference is not set but required")
        target_name = result.target_name
        curr_predictions = result.current.predictions
        ref_predictions = result.reference.predictions
        columns = result.columns
        task = result.task

        # Prediction plots need the label column in both frames, so a single
        # flag governs both adding the column and drawing the plots.
        has_predictions = False
        if curr_predictions is not None and ref_predictions is not None:
            # Copy first: the frames belong to the stored metric result and
            # rendering must not add columns to them.
            current_data = current_data.copy()
            reference_data = reference_data.copy()
            current_data["prediction_labels"] = curr_predictions.predictions.values
            reference_data["prediction_labels"] = ref_predictions.predictions.values
            has_predictions = True

        # The task is the same for every feature, so pick the figure builder once.
        build_fig = self._get_regression_fig if task == "regression" else self._get_classification_fig

        additional_graphs_data = []
        params_data = []

        for feature_name in columns:
            # add data for table in params
            parts = []

            if target_name is not None:
                target_graph_id = feature_name + "_target_values"
                parts.append({"title": "Target", "id": target_graph_id})
                target_fig = build_fig(feature_name, target_name, current_data, reference_data)
                additional_graphs_data.append(self._to_graph_info(target_graph_id, target_fig))

            if has_predictions:
                preds_graph_id = feature_name + "_prediction_values"
                parts.append({"title": "Prediction", "id": preds_graph_id})
                preds_fig = build_fig(feature_name, "prediction_labels", current_data, reference_data)
                additional_graphs_data.append(self._to_graph_info(preds_graph_id, preds_fig))

            params_data.append(
                {
                    "details": {
                        "parts": parts,
                        "insights": [],
                    },
                    "f1": feature_name,
                }
            )
        return [
            BaseWidgetInfo(
                title="Target (Prediction) Behavior By Feature",
                type="big_table",
                size=2,
                params={
                    "rowsPerPage": min(len(columns), 10),
                    "columns": [{"title": "Feature", "field": "f1"}],
                    "data": params_data,
                },
                additionalGraphs=additional_graphs_data,
            )
        ]

    @staticmethod
    def _to_graph_info(graph_id: str, fig) -> AdditionalGraphInfo:
        """Wrap a plotly figure into an ``AdditionalGraphInfo`` with the given id."""
        # Round-trip through JSON to get plain dict/list structures for the widget params.
        fig_json = json.loads(fig.to_json())
        return AdditionalGraphInfo(
            id=graph_id,
            params={
                "data": fig_json["data"],
                "layout": fig_json["layout"],
            },
        )

    def _get_regression_fig(self, feature_name: str, main_column: str, curr_data: pd.DataFrame, ref_data: pd.DataFrame):
        """Build side-by-side scatter plots of ``main_column`` against ``feature_name``.

        Args:
            feature_name: column plotted on the x axis.
            main_column: column plotted on the y axis (target or prediction labels).
            curr_data: current dataset, drawn in the left subplot.
            ref_data: reference dataset, drawn in the right subplot.

        Returns:
            A plotly figure with two subplots sharing the y axis.
        """
        fig = make_subplots(rows=1, cols=2, subplot_titles=("Current", "Reference"), shared_yaxes=True)
        fig.add_trace(
            go.Scattergl(
                x=curr_data[feature_name],
                y=curr_data[main_column],
                mode="markers",
                name="current",
                marker=dict(size=6, color=self.color_options.primary_color),
            ),
            row=1,
            col=1,
        )

        # NOTE(review): the current trace uses go.Scattergl while the reference trace
        # uses go.Scatter; left unchanged because switching alters the emitted figure
        # JSON — confirm whether the difference is intentional.
        fig.add_trace(
            go.Scatter(
                x=ref_data[feature_name],
                y=ref_data[main_column],
                mode="markers",
                name="reference",
                marker=dict(size=6, color=self.color_options.secondary_color),
            ),
            row=1,
            col=2,
        )
        fig.update_xaxes(title_text=feature_name, showgrid=True, row=1, col=1)
        fig.update_xaxes(title_text=feature_name, showgrid=True, row=1, col=2)
        # The y axis is shared, so only the left subplot gets a title.
        fig.update_yaxes(title_text=main_column, showgrid=True, row=1, col=1)

        return fig

    def _get_classification_fig(
        self, feature_name: str, main_column: str, curr_data: pd.DataFrame, ref_data: pd.DataFrame
    ):
        """Build histograms of ``feature_name`` coloured by ``main_column``, one facet per dataset.

        Args:
            feature_name: column whose distribution is drawn on the x axis.
            main_column: column used to colour the bars (target or prediction labels).
            curr_data: current dataset.
            ref_data: reference dataset.

        Returns:
            A plotly express histogram faceted into "Current" and "Reference".
        """
        # Copy so the helper "dataset" column never leaks into the callers' frames.
        curr = curr_data.copy()
        ref = ref_data.copy()
        ref["dataset"] = "Reference"
        curr["dataset"] = "Current"
        merged_data = pd.concat([ref, curr])
        fig = px.histogram(
            merged_data,
            x=feature_name,
            color=main_column,
            facet_col="dataset",
            barmode="overlay",
            # Fixes the facet order regardless of the concatenation order above.
            category_orders={"dataset": ["Current", "Reference"]},
        )

        return fig