# target_by_features_table.py
"""Metric and renderer for the "Target (Prediction) Behavior By Feature" table.

``TargetByFeaturesTable`` collects, per feature column, the raw data needed to
plot how target and/or prediction values behave against that feature;
``TargetByFeaturesTableRenderer`` turns the result into a "big_table" widget
with one scatter (regression) or histogram (classification) pair per feature.
"""
import json
from typing import Dict
from typing import List
from typing import Optional
from typing import Union

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
from pandas.api.types import is_integer_dtype
from pandas.api.types import is_string_dtype
from plotly.subplots import make_subplots

from evidently.legacy.base_metric import InputData
from evidently.legacy.base_metric import Metric
from evidently.legacy.base_metric import MetricResult
from evidently.legacy.base_metric import UsesRawDataMixin
from evidently.legacy.calculations.classification_performance import get_prediction_data
from evidently.legacy.core import ColumnType
from evidently.legacy.core import IncludeTags
from evidently.legacy.features.non_letter_character_percentage_feature import NonLetterCharacterPercentage
from evidently.legacy.features.OOV_words_percentage_feature import OOVWordsPercentage
from evidently.legacy.features.text_length_feature import TextLength
from evidently.legacy.metric_results import StatsByFeature
from evidently.legacy.model.widget import AdditionalGraphInfo
from evidently.legacy.model.widget import BaseWidgetInfo
from evidently.legacy.options.base import AnyOptions
from evidently.legacy.renderers.base_renderer import MetricRenderer
from evidently.legacy.renderers.base_renderer import default_renderer
from evidently.legacy.utils.data_operations import process_columns
from evidently.legacy.utils.data_preprocessing import DataDefinition


class TargetByFeaturesTableResults(MetricResult):
    """Computed payload for :class:`TargetByFeaturesTable`.

    Holds the raw per-dataset plotting data plus the metadata the renderer
    needs (feature list, target column name, task kind).
    """

    class Config:
        type_alias = "evidently:metric_result:TargetByFeaturesTableResults"
        dict_include = False
        field_tags = {
            "current": {IncludeTags.Current},
            "reference": {IncludeTags.Reference},
            "target_name": {IncludeTags.Parameter},
            "columns": {IncludeTags.Parameter},
            "task": {IncludeTags.Parameter},
        }

    # Plot data (and optional predictions) for the current dataset.
    current: StatsByFeature
    # Same for the reference dataset; None when raw-data rendering is disabled.
    reference: Optional[StatsByFeature]
    # Name of the target column, if a target is mapped.
    target_name: Optional[str]
    # Feature columns included in the table (text columns are replaced by
    # their generated descriptor columns, see calculate()).
    columns: List[str]
    # "classification" or "regression" (empty string in the no-raw-data stub).
    task: str


class TargetByFeaturesTable(UsesRawDataMixin, Metric[TargetByFeaturesTableResults]):
    """Metric that gathers raw rows of target/prediction vs. each feature.

    :param columns: optional subset of feature columns to include; when None,
        all numerical, categorical and text features from the column mapping
        are used.
    """

    class Config:
        type_alias = "evidently:metric:TargetByFeaturesTable"

    columns: Optional[List[str]]
    # Per text column: descriptor display-name -> generated feature object.
    # Populated lazily by required_features(); None until then.
    _text_features_gen: Optional[
        Dict[
            str,
            Dict[str, Union[TextLength, NonLetterCharacterPercentage, OOVWordsPercentage]],
        ]
    ]

    def __init__(self, columns: Optional[List[str]] = None, options: AnyOptions = None):
        self.columns = columns
        super().__init__(options=options)
        self._text_features_gen = None

    def required_features(self, data_definition: DataDefinition):
        """Declare generated text descriptors needed for text feature columns.

        For every text feature, registers three generated features
        (text length, non-letter character %, OOV words %) and remembers
        them in ``self._text_features_gen`` so calculate() can look their
        values up later. Returns the flat list of generated features, or []
        when the data has no text features.
        """
        if len(data_definition.get_columns(ColumnType.Text, features_only=True)) > 0:
            text_cols = [col.column_name for col in data_definition.get_columns(ColumnType.Text, features_only=True)]
            text_features_gen = {}
            text_features_gen_result = []
            for col in text_cols:
                col_dict: Dict[
                    str,
                    Union[TextLength, NonLetterCharacterPercentage, OOVWordsPercentage],
                ] = {}
                col_dict[f"{col}: Text Length"] = TextLength(col)
                col_dict[f"{col}: Non Letter Character %"] = NonLetterCharacterPercentage(col)
                col_dict[f"{col}: OOV %"] = OOVWordsPercentage(col)

                text_features_gen_result += [
                    col_dict[f"{col}: Text Length"],
                    col_dict[f"{col}: Non Letter Character %"],
                    col_dict[f"{col}: OOV %"],
                ]
                text_features_gen[col] = col_dict
            self._text_features_gen = text_features_gen

            return text_features_gen_result
        else:
            return []

    def get_parameters(self) -> tuple:
        # No parameters participate in metric identity/caching.
        return ()

    def calculate(self, data: InputData) -> TargetByFeaturesTableResults:
        """Build the per-feature plot data for current and reference datasets.

        Raises ValueError when neither target nor prediction is mapped, when
        reference data is missing, or when the task cannot be inferred.
        """
        # Raw-data rendering disabled: return an empty placeholder result
        # (the renderer short-circuits on the same option).
        if not self.get_options().render_options.raw_data:
            return TargetByFeaturesTableResults(
                current=StatsByFeature(plot_data=pd.DataFrame()),
                reference=None,
                target_name=None,
                columns=[],
                task="",
            )
        dataset_columns = process_columns(data.current_data, data.column_mapping)
        target_name = dataset_columns.utility_columns.target
        prediction_name = dataset_columns.utility_columns.prediction
        if target_name is None and prediction_name is None:
            raise ValueError("The columns 'target' or 'prediction' should be present")
        if data.reference_data is None:
            raise ValueError("Reference data should be present")
        curr_df = data.current_data.copy()
        ref_df = data.reference_data.copy()
        curr_predictions = None
        ref_predictions = None
        if prediction_name is not None:
            curr_predictions = get_prediction_data(data.current_data, dataset_columns, data.column_mapping.pos_label)
            ref_predictions = get_prediction_data(data.reference_data, dataset_columns, data.column_mapping.pos_label)

        if self.columns is None:
            columns = (
                dataset_columns.num_feature_names
                + dataset_columns.cat_feature_names
                + dataset_columns.text_feature_names
            )
        else:
            # NOTE(review): np.intersect1d returns a sorted unique array, so a
            # user-supplied column order is not preserved here.
            columns = list(
                np.intersect1d(
                    self.columns,
                    (
                        dataset_columns.num_feature_names
                        + dataset_columns.cat_feature_names
                        + dataset_columns.text_feature_names
                    ),
                )
            )
        if data.column_mapping.task is not None:
            task = data.column_mapping.task
        else:
            # Task not given explicitly: infer it heuristically — few distinct
            # values or a string dtype means classification, else regression.
            if target_name is not None:
                if curr_df[target_name].nunique() < 5 or is_string_dtype(curr_df[target_name]):
                    task = "classification"
                else:
                    task = "regression"
            elif curr_predictions is not None:
                if is_string_dtype(curr_predictions.predictions) or (
                    is_integer_dtype(curr_predictions.predictions) and curr_predictions.predictions.nunique() < 5
                ):
                    task = "classification"
                else:
                    task = "regression"
            else:
                raise ValueError("Task parameter of column_mapping should be specified")
        # Process text columns: replace each raw text column with its three
        # generated descriptor columns and append their values to both frames.
        if (
            self._text_features_gen is not None
            and len(np.intersect1d(list(self._text_features_gen.keys()), columns)) >= 1
        ):
            for col in np.intersect1d(list(self._text_features_gen.keys()), columns):
                columns += list(self._text_features_gen[col].keys())
                columns.remove(col)
                curr_text_df = pd.concat(
                    [data.get_current_column(x.as_column()) for x in list(self._text_features_gen[col].values())],
                    axis=1,
                )
                curr_text_df.columns = pd.Index(list(self._text_features_gen[col].keys()))
                # reset_index on both sides aligns the concat positionally,
                # regardless of the original frame indices.
                curr_df = pd.concat(
                    [
                        curr_df.reset_index(drop=True),
                        curr_text_df.reset_index(drop=True),
                    ],
                    axis=1,
                )

                # NOTE(review): ref_df is always a DataFrame at this point
                # (missing reference data raised above); check kept as-is.
                if ref_df is not None:
                    ref_text_df = pd.concat(
                        [data.get_reference_column(x.as_column()) for x in list(self._text_features_gen[col].values())],
                        axis=1,
                    )
                    ref_text_df.columns = pd.Index(list(self._text_features_gen[col].keys()))
                    ref_df = pd.concat(
                        [
                            ref_df.reset_index(drop=True),
                            ref_text_df.reset_index(drop=True),
                        ],
                        axis=1,
                    )
        # The table keeps the features plus target and prediction column(s);
        # prediction may be a single column name or a list of class columns.
        table_columns = columns.copy()
        if target_name is not None:
            table_columns += [target_name]
        if prediction_name is not None and isinstance(prediction_name, str):
            table_columns += [prediction_name]
        if prediction_name is not None and isinstance(prediction_name, list):
            table_columns += prediction_name

        return TargetByFeaturesTableResults(
            current=StatsByFeature(
                plot_data=curr_df[table_columns],
                predictions=curr_predictions,
            ),
            reference=StatsByFeature(
                plot_data=ref_df[table_columns],
                predictions=ref_predictions,
            ),
            columns=columns,
            target_name=target_name,
            task=task,
        )


@default_renderer(wrap_type=TargetByFeaturesTable)
class TargetByFeaturesTableRenderer(MetricRenderer):
    """Renders the metric result as a paginated widget with per-feature plots."""

    def render_html(self, obj: TargetByFeaturesTable) -> List[BaseWidgetInfo]:
        """Build the "big_table" widget: one row per feature, each row linking
        to target and/or prediction plots stored as additional graphs.

        Returns [] when raw-data rendering is disabled; raises ValueError when
        the result has no reference part.
        """
        if not obj.get_options().render_options.raw_data:
            return []
        result = obj.get_result()
        current_data = result.current.plot_data
        # todo: better typing
        assert current_data is not None
        if result.reference is None:
            raise ValueError("reference is not set but required")
        reference_data = result.reference.plot_data
        target_name = result.target_name
        curr_predictions = result.current.predictions
        ref_predictions = result.reference.predictions
        columns = result.columns
        task = result.task
        if curr_predictions is not None and ref_predictions is not None:
            # NOTE(review): this mutates the result's plot_data frames in place
            # by adding a "prediction_labels" column used for plotting below.
            current_data["prediction_labels"] = curr_predictions.predictions.values
            reference_data["prediction_labels"] = ref_predictions.predictions.values

        additional_graphs_data = []
        params_data = []

        for feature_name in columns:
            # add data for table in params
            parts = []

            if target_name is not None:
                parts.append({"title": "Target", "id": feature_name + "_target_values"})
                # Regression -> scatter vs. target; otherwise class histogram.
                if task == "regression":
                    target_fig = self._get_regression_fig(feature_name, target_name, current_data, reference_data)
                else:
                    target_fig = self._get_classification_fig(feature_name, target_name, current_data, reference_data)

                target_fig_json = json.loads(target_fig.to_json())

                additional_graphs_data.append(
                    AdditionalGraphInfo(
                        id=feature_name + "_target_values",
                        params={
                            "data": target_fig_json["data"],
                            "layout": target_fig_json["layout"],
                        },
                    )
                )

            if curr_predictions is not None:
                parts.append({"title": "Prediction", "id": feature_name + "_prediction_values"})
                if task == "regression":
                    preds_fig = self._get_regression_fig(
                        feature_name, "prediction_labels", current_data, reference_data
                    )
                else:
                    preds_fig = self._get_classification_fig(
                        feature_name, "prediction_labels", current_data, reference_data
                    )
                preds_fig_json = json.loads(preds_fig.to_json())

                additional_graphs_data.append(
                    AdditionalGraphInfo(
                        id=feature_name + "_prediction_values",
                        params={
                            "data": preds_fig_json["data"],
                            "layout": preds_fig_json["layout"],
                        },
                    )
                )

            params_data.append(
                {
                    "details": {
                        "parts": parts,
                        "insights": [],
                    },
                    "f1": feature_name,
                }
            )
        return [
            BaseWidgetInfo(
                title="Target (Prediction) Behavior By Feature",
                type="big_table",
                size=2,
                params={
                    "rowsPerPage": min(len(columns), 10),
                    "columns": [{"title": "Feature", "field": "f1"}],
                    "data": params_data,
                },
                additionalGraphs=additional_graphs_data,
            )
        ]

    def _get_regression_fig(self, feature_name: str, main_column: str, curr_data: pd.DataFrame, ref_data: pd.DataFrame):
        """Side-by-side scatter plots (current | reference) of main_column vs.
        feature_name, with a shared y-axis."""
        fig = make_subplots(rows=1, cols=2, subplot_titles=("Current", "Reference"), shared_yaxes=True)
        fig.add_trace(
            go.Scattergl(
                x=curr_data[feature_name],
                y=curr_data[main_column],
                mode="markers",
                name="current",
                marker=dict(size=6, color=self.color_options.primary_color),
            ),
            row=1,
            col=1,
        )

        fig.add_trace(
            go.Scatter(
                x=ref_data[feature_name],
                y=ref_data[main_column],
                mode="markers",
                name="reference",
                marker=dict(size=6, color=self.color_options.secondary_color),
            ),
            row=1,
            col=2,
        )
        fig.update_xaxes(title_text=feature_name, showgrid=True, row=1, col=1)
        fig.update_xaxes(title_text=feature_name, showgrid=True, row=1, col=2)
        fig.update_yaxes(title_text=main_column, showgrid=True, row=1, col=1)

        return fig

    def _get_classification_fig(
        self, feature_name: str, main_column: str, curr_data: pd.DataFrame, ref_data: pd.DataFrame
    ):
        """Faceted overlay histograms of feature_name colored by main_column,
        one facet per dataset (Current / Reference)."""
        curr = curr_data.copy()
        ref = ref_data.copy()
        ref["dataset"] = "Reference"
        curr["dataset"] = "Current"
        merged_data = pd.concat([ref, curr])
        fig = px.histogram(
            merged_data,
            x=feature_name,
            color=main_column,
            facet_col="dataset",
            barmode="overlay",
            category_orders={"dataset": ["Current", "Reference"]},
        )

        return fig