/ src / evidently / legacy / ui / demo_projects / reviews.py
reviews.py
  1  import os
  2  import pathlib
  3  from datetime import datetime
  4  from datetime import timedelta
  5  
  6  import numpy as np
  7  import pandas as pd
  8  from sklearn import datasets
  9  
 10  from evidently.legacy import descriptors
 11  from evidently.legacy import metrics
 12  from evidently.legacy.pipeline.column_mapping import ColumnMapping
 13  from evidently.legacy.renderers.html_widgets import WidgetSize
 14  from evidently.legacy.report import Report
 15  from evidently.legacy.ui.dashboards import CounterAgg
 16  from evidently.legacy.ui.dashboards import DashboardPanelCounter
 17  from evidently.legacy.ui.dashboards import DashboardPanelDistribution
 18  from evidently.legacy.ui.dashboards import DashboardPanelPlot
 19  from evidently.legacy.ui.dashboards import PanelValue
 20  from evidently.legacy.ui.dashboards import PlotType
 21  from evidently.legacy.ui.dashboards import ReportFilter
 22  from evidently.legacy.ui.demo_projects import DemoProject
 23  from evidently.legacy.ui.workspace import WorkspaceBase
 24  
 25  
 26  def create_data():
 27      if os.environ.get("EVIDENTLY_TEST_ENVIRONMENT", "0") != "1":
 28          reviews_data = datasets.fetch_openml(name="Womens-E-Commerce-Clothing-Reviews", version=2, as_frame="auto")
 29          reviews = reviews_data.frame
 30  
 31      else:
 32          reviews = pd.read_parquet(pathlib.Path(__file__).parent.joinpath("../../../../../test_data/reviews.parquet"))
 33  
 34      for name, rs in (
 35          ("TheOtherStore", 0),
 36          ("AMajorCompetitor", 42),
 37          ("AwesomeShop", 100),
 38      ):
 39          np.random.seed(rs)
 40          random_index = np.random.choice(reviews.index, 300, replace=False)
 41          reviews.loc[random_index, "Review_Text"] = (
 42              reviews.loc[random_index, "Review_Text"] + f" mention competitor {name}"
 43          )
 44  
 45      np.random.seed(13)
 46      random_index = np.random.choice(reviews.index, 1000, replace=False)
 47      reviews.loc[random_index, "Review_Text"] = (
 48          reviews.loc[random_index, "Review_Text"] + " mention www.someurl.someurl "
 49      )
 50      reviews["prediction"] = reviews["Rating"]
 51      np.random.seed(0)
 52      random_index = np.random.choice(reviews.index, 2000, replace=False)
 53      reviews.loc[random_index, "prediction"] = 1
 54      reference = reviews.sample(n=5000, replace=True, ignore_index=True, random_state=42)
 55      current = reviews
 56      column_mapping = ColumnMapping(
 57          target="Rating",
 58          prediction="prediction",
 59          numerical_features=["Age", "Positive_Feedback_Count"],
 60          categorical_features=["Division_Name", "Department_Name", "Class_Name"],
 61          text_features=["Review_Text", "Title"],
 62      )
 63      return current, reference, column_mapping
 64  
 65  
 66  def create_report(i: int, data):
 67      current, reference, column_mapping = data
 68      text_report = Report(
 69          metrics=[
 70              metrics.DatasetSummaryMetric(),
 71              metrics.DatasetDriftMetric(),
 72              metrics.ColumnDriftMetric(column_name="prediction"),
 73              metrics.ColumnDriftMetric(column_name="Rating"),
 74              metrics.ColumnDriftMetric(column_name="Age"),
 75              metrics.ColumnDriftMetric(column_name="Positive_Feedback_Count"),
 76              metrics.ColumnDriftMetric(column_name="Division_Name"),
 77              metrics.ColumnDriftMetric(column_name="Department_Name"),
 78              metrics.ColumnDriftMetric(column_name="Class_Name"),
 79              metrics.ColumnDriftMetric(column_name="Review_Text"),
 80              metrics.ColumnDriftMetric(column_name="Title"),
 81              metrics.ClassificationQualityMetric(),
 82              metrics.ColumnSummaryMetric(column_name=descriptors.OOV(display_name="OOV").for_column("Review_Text")),
 83              metrics.ColumnSummaryMetric(
 84                  column_name=descriptors.NonLetterCharacterPercentage(
 85                      display_name="Non Letter Character Percentage"
 86                  ).for_column("Review_Text")
 87              ),
 88              metrics.ColumnSummaryMetric(
 89                  column_name=descriptors.Sentiment(display_name="Sentiment").for_column("Review_Text")
 90              ),
 91              metrics.ColumnSummaryMetric(
 92                  column_name=descriptors.RegExp(display_name="urls", reg_exp=r".*(http|www)\S+.*").for_column(
 93                      "Review_Text"
 94                  )
 95              ),
 96              metrics.ColumnValueRangeMetric(
 97                  column_name=descriptors.TextLength(display_name="TextLength in the Range").for_column("Review_Text"),
 98                  left=1,
 99                  right=1000,
100              ),
101              metrics.ColumnCategoryMetric(
102                  column_name=descriptors.TriggerWordsPresence(
103                      display_name="competitors",
104                      words_list=["theotherstore", "amajorcompetitor", "awesomeshop"],
105                      lemmatize=False,
106                  ).for_column("Review_Text"),
107                  category=1,
108              ),
109              metrics.ColumnCategoryMetric(column_name="Rating", category=1),
110              metrics.ColumnCategoryMetric(column_name="Rating", category=5),
111          ],
112          timestamp=datetime(2023, 1, 29) + timedelta(days=i + 1),
113      )
114      text_report.set_batch_size("daily")
115  
116      if i < 17:
117          text_report.run(
118              reference_data=reference,
119              current_data=current.iloc[1000 * i : 1000 * (i + 1), :],
120              column_mapping=column_mapping,
121          )
122      else:
123          text_report.run(
124              reference_data=reference,
125              current_data=current[(current.Rating < 5)],
126              column_mapping=column_mapping,
127          )
128  
129      return text_report
130  
131  
def create_project(workspace: WorkspaceBase, name: str):
    """Create the reviews demo project in ``workspace`` and configure its dashboard.

    Panels are added in this order: a title panel, two counters, a full-width
    precision plot, then half-width panels for drift scores, ``Review_Text``
    descriptors and rating shares.

    Args:
        workspace: Workspace in which the project is created.
        name: Name passed to ``workspace.create_project``.

    Returns:
        The project, after ``project.save()`` has been called.
    """

    def no_filter():
        # Each panel gets its own empty filter object.
        return ReportFilter(metadata_values={}, tag_values=[])

    def add(panel):
        project.dashboard.add_panel(panel)

    def line_plot(title, values, size=WidgetSize.HALF):
        return DashboardPanelPlot(
            title=title,
            filter=no_filter(),
            values=values,
            plot_type=PlotType.LINE,
            size=size,
        )

    def drift_score(column, legend):
        return PanelValue(
            metric_id="ColumnDriftMetric",
            metric_args={"column_name.name": column},
            field_path=metrics.ColumnDriftMetric.fields.drift_score,
            legend=legend,
        )

    def summary_mean(feature, legend):
        return PanelValue(
            metric_id="ColumnSummaryMetric",
            metric_args={"column_name": feature},
            field_path="current_characteristics.mean",
            legend=legend,
        )

    project = workspace.create_project(name)
    project.description = "A toy demo project using E-commerce Reviews dataset. Text and tabular data, classification."

    # Title panel.
    add(
        DashboardPanelCounter(
            title="Classification of E-commerce User Reviews",
            filter=no_filter(),
            agg=CounterAgg.NONE,
        )
    )

    # Counters.
    add(
        DashboardPanelCounter(
            title="Model Calls",
            filter=no_filter(),
            value=PanelValue(
                metric_id="DatasetSummaryMetric",
                field_path=metrics.DatasetSummaryMetric.fields.current.number_of_rows,
                legend="count",
            ),
            text="count",
            agg=CounterAgg.SUM,
            size=WidgetSize.HALF,
        )
    )
    add(
        DashboardPanelCounter(
            title="Share of Drifted Features",
            filter=no_filter(),
            value=PanelValue(
                metric_id="DatasetDriftMetric",
                field_path="share_of_drifted_columns",
                legend="share",
            ),
            text="share",
            agg=CounterAgg.LAST,
            size=WidgetSize.HALF,
        )
    )

    # Model precision (the only full-width plot).
    add(
        line_plot(
            "Model Precision",
            [
                PanelValue(
                    metric_id="ClassificationQualityMetric",
                    field_path="current.precision",
                    legend="precision",
                ),
            ],
            size=WidgetSize.FULL,
        )
    )

    # Target and prediction drift.
    add(
        line_plot(
            "Target and Prediction Drift (Jensen-Shannon distance) ",
            [
                drift_score("prediction", "prediction drift score"),
                drift_score("Rating", "target drift score"),
            ],
        )
    )

    # Feature drift: text, numerical and categorical columns, one panel per group.
    feature_drift_panels = (
        ("Data Drift: review texts (domain classifier ROC AUC) ", ("Title", "Review_Text")),
        ("Data Drift: numerical features (Wasserstein distance)", ("Age", "Positive_Feedback_Count")),
        (
            "Data Drift: categorical features (Jensen-Shannon distance)",
            ("Division_Name", "Department_Name", "Class_Name"),
        ),
    )
    for panel_title, columns in feature_drift_panels:
        add(line_plot(panel_title, [drift_score(column, column) for column in columns]))

    # Text quality.
    add(
        line_plot(
            "Review Text Quality: % of out-of-vocabulary words",
            [
                summary_mean(
                    descriptors.OOV(display_name="OOV").for_column("Review_Text"),
                    "OOV % (mean)",
                ),
            ],
        )
    )
    add(
        line_plot(
            "Review Text Quality: % of non-letter characters",
            [
                summary_mean(
                    descriptors.NonLetterCharacterPercentage(
                        display_name="Non Letter Character Percentage"
                    ).for_column("Review_Text"),
                    "NonLetterCharacter % (mean)",
                ),
            ],
        )
    )
    add(
        line_plot(
            "Review Text Quality: share of non-empty reviews",
            [
                PanelValue(
                    metric_id="ColumnValueRangeMetric",
                    field_path="current.share_in_range",
                    legend="Reviews with 1-1000 symbols",
                ),
            ],
        )
    )

    # Average review sentiment.
    add(
        line_plot(
            " Review sentiment",
            [
                summary_mean(
                    descriptors.Sentiment(display_name="Sentiment").for_column("Review_Text"),
                    "sentiment (mean)",
                ),
            ],
        )
    )

    # Reviews that mention competitors.
    add(
        line_plot(
            "Share of reviews mentioning 'TheOtherStore', 'AMajorCompetitor', 'AwesomeShop'",
            [
                PanelValue(
                    metric_id="ColumnCategoryMetric",
                    metric_args={
                        "column_name": descriptors.TriggerWordsPresence(
                            display_name="competitors",
                            words_list=[
                                "theotherstore",
                                "amajorcompetitor",
                                "awesomeshop",
                            ],
                            lemmatize=False,
                        ).for_column("Review_Text"),
                        "category": 1,
                    },
                    field_path="current.category_ratio",
                    legend="reviews with competitors",
                ),
            ],
        )
    )

    # Reviews that mention a URL; this is the only panel whose filter sets include_test_suites.
    add(
        DashboardPanelDistribution(
            title="Reviews with URLs distribution",
            filter=ReportFilter(metadata_values={}, tag_values=[], include_test_suites=True),
            value=PanelValue(
                metric_id="ColumnSummaryMetric",
                metric_args={
                    "column_name": descriptors.RegExp(display_name="urls", reg_exp=r".*(http|www)\S+.*").for_column(
                        "Review_Text"
                    )
                },
                field_path="plot_data.bins_for_hist.current",
                legend="reviews with URLs",
            ),
            size=WidgetSize.HALF,
        )
    )

    # Share of the lowest and the highest rating.
    for rating in (1, 5):
        add(
            line_plot(
                f'Share of reviews ranked "{rating}"',
                [
                    PanelValue(
                        metric_id="ColumnCategoryMetric",
                        metric_args={"column_name.name": "Rating", "category": rating},
                        field_path="current.category_ratio",
                        legend=f'share of "{rating}"',
                    ),
                ],
            )
        )

    project.save()
    return project
425  
426  
# Demo-project definition for the reviews example. It wires together the data,
# report and dashboard builders defined in this module; no snapshot builder and
# no test-suite builder are supplied (both are None).
reviews_demo_project = DemoProject(
    name="Demo project - Reviews",
    create_snapshot=None,
    create_data=create_data,
    create_report=create_report,
    create_project=create_project,
    create_test_suite=None,
    # NOTE(review): presumably the number of reports to generate, i.e. create_report
    # would be called with i = 0..18 (so both of its batch branches are exercised) —
    # confirm against DemoProject.
    count=19,
)

if __name__ == "__main__":
    # create_demo_project("http://localhost:8080")
    # Running this module as a script calls DemoProject.create with the argument "workspace".
    reviews_demo_project.create("workspace")