# src/evidently/legacy/ui/demo_projects/bikes.py
  1  import io
  2  import os
  3  import zipfile
  4  from datetime import datetime
  5  from datetime import time
  6  from datetime import timedelta
  7  from itertools import cycle
  8  
  9  import pandas as pd
 10  import requests
 11  from dateutil.relativedelta import relativedelta
 12  from sklearn import ensemble
 13  
 14  from evidently.legacy import metrics
 15  from evidently.legacy.pipeline.column_mapping import ColumnMapping
 16  from evidently.legacy.renderers.html_widgets import WidgetSize
 17  from evidently.legacy.report import Report
 18  from evidently.legacy.test_preset import DataDriftTestPreset
 19  from evidently.legacy.test_suite import TestSuite
 20  from evidently.legacy.ui.dashboards import CounterAgg
 21  from evidently.legacy.ui.dashboards import DashboardPanelCounter
 22  from evidently.legacy.ui.dashboards import DashboardPanelPlot
 23  from evidently.legacy.ui.dashboards import PanelValue
 24  from evidently.legacy.ui.dashboards import PlotType
 25  from evidently.legacy.ui.dashboards import ReportFilter
 26  from evidently.legacy.ui.demo_projects import DemoProject
 27  from evidently.legacy.ui.workspace.base import WorkspaceBase
 28  
 29  
 30  def create_data():
 31      if os.path.exists("Bike-Sharing-Dataset.zip"):
 32          with open("Bike-Sharing-Dataset.zip", "rb") as f:
 33              content = f.read()
 34      elif os.path.exists("../../../../../test_data/bike_sharing_dataset.zip"):
 35          with open("../../../../../test_data/bike_sharing_dataset.zip", "rb") as f:
 36              content = f.read()
 37      else:
 38          response = requests.get(
 39              "https://archive.ics.uci.edu/static/public/275/bike+sharing+dataset.zip",
 40              verify=False,
 41          )
 42          if response.status_code != 200:
 43              raise ValueError(f"Could not download bike sharing dataset. {response.text}")
 44          if response.status_code == 200 and response.headers["content-type"] != "application/zip":
 45              raise ValueError(f"Invalid bike sharing dataset content type: {response.headers['content-type']}.")
 46          content = response.content
 47      with zipfile.ZipFile(io.BytesIO(content)) as arc:
 48          raw_data = pd.read_csv(
 49              arc.open("hour.csv"),
 50              header=0,
 51              sep=",",
 52              parse_dates=["dteday"],
 53              index_col="dteday",
 54          )
 55  
 56          raw_data.index = raw_data.apply(
 57              lambda row: datetime.combine(row.name, time(hour=int(row["hr"]))) + relativedelta(years=11),
 58              axis=1,
 59          )
 60          raw_data.sort_index(inplace=True)
 61  
 62      reference = raw_data.loc["2023-01-01 00:00:00":"2023-01-28 23:00:00"]
 63      current = raw_data.loc["2023-01-29 00:00:00":"2023-02-28 23:00:00"]
 64  
 65      target = "cnt"
 66      prediction = "prediction"
 67      numerical_features = ["temp", "atemp", "hum", "windspeed", "hr", "weekday"]
 68      categorical_features = ["season", "holiday", "workingday"]
 69  
 70      column_mapping = ColumnMapping()
 71      column_mapping.target = target
 72      column_mapping.prediction = prediction
 73      column_mapping.numerical_features = numerical_features
 74      column_mapping.categorical_features = categorical_features
 75  
 76      regressor = ensemble.RandomForestRegressor(random_state=0, n_estimators=50)
 77      regressor.fit(reference[numerical_features + categorical_features], reference[target])
 78  
 79      reference["prediction"] = regressor.predict(reference[numerical_features + categorical_features])
 80      current["prediction"] = regressor.predict(current[numerical_features + categorical_features])
 81  
 82      return current, reference, column_mapping
 83  
 84  
 85  def snapshot_tags_generator():
 86      tags = [
 87          "production_critical",
 88          "city_bikes_hourly",
 89          "tabular_data",
 90          "regression_batch_model",
 91          "high_seasonality",
 92          "numerical_features",
 93          "categorical_features",
 94          "no_missing_values",
 95      ]
 96  
 97      yield from cycle(
 98          [
 99              [tags[0], tags[1], tags[2]],
100              [tags[1]],
101              [],
102              [tags[2]],
103              [tags[3], tags[4]],
104              [],
105              [tags[4], tags[5], tags[6], tags[7]],
106              [],
107              [],
108          ]
109      )
110  
111  
# Single module-wide generator instance: every caller of next_snapshot_tags() advances the same sequence.
SNAPSHOT_TAGS = snapshot_tags_generator()


def next_snapshot_tags():
    """Advance the shared ``SNAPSHOT_TAGS`` generator by one step and return the tag list it yields."""
    upcoming_tags = next(SNAPSHOT_TAGS)
    return upcoming_tags
117  
118  
def create_report(i: int, data):
    """Build and run the report for the ``i``-th daily batch of the demo data.

    Args:
        i: zero-based batch number; batch ``i`` starts ``i`` days after 2023-01-29.
        data: ``(current, reference, column_mapping)`` tuple, as returned by ``create_data``.

    Returns:
        The ``Report`` after it has been run on the batch. Its timestamp is the end of the batch window.
    """
    current, reference, column_mapping = data

    batch_start = datetime(2023, 1, 29) + timedelta(days=i)
    batch_end = batch_start + timedelta(days=1)
    # Half-open window [batch_start, batch_end): label slicing with .loc[start:end] includes both endpoints, which
    # would put the next day's 00:00 rows into this batch as well as into the following one.
    in_batch = (current.index >= batch_start) & (current.index < batch_end)

    drift_columns = ("cnt", "prediction", "temp", "atemp", "hum", "windspeed")

    data_drift_report = Report(
        metrics=[
            metrics.RegressionQualityMetric(),
            metrics.DatasetSummaryMetric(),
            metrics.DatasetDriftMetric(),
            *[metrics.ColumnDriftMetric(column_name=column, stattest="wasserstein") for column in drift_columns],
            metrics.ColumnSummaryMetric(column_name="cnt"),
            metrics.ColumnSummaryMetric(column_name="prediction"),
        ],
        timestamp=batch_end,
        tags=next_snapshot_tags(),
    )
    data_drift_report.set_batch_size("daily")

    data_drift_report.run(
        reference_data=reference,
        current_data=current.loc[in_batch],
        column_mapping=column_mapping,
    )
    return data_drift_report
147  
148  
def create_test_suite(i: int, data):
    """Build and run the data drift test suite for the ``i``-th daily batch of the demo data.

    Args:
        i: zero-based batch number; batch ``i`` starts ``i`` days after 2023-01-29.
        data: ``(current, reference, column_mapping)`` tuple, as returned by ``create_data``.

    Returns:
        The ``TestSuite`` after it has been run on the batch. Its timestamp is the end of the batch window.
    """
    current, reference, column_mapping = data

    batch_start = datetime(2023, 1, 29) + timedelta(days=i)
    batch_end = batch_start + timedelta(days=1)
    # Half-open window [batch_start, batch_end): label slicing with .loc[start:end] includes both endpoints, which
    # would put the next day's 00:00 rows into this batch as well as into the following one.
    in_batch = (current.index >= batch_start) & (current.index < batch_end)

    data_drift_test_suite = TestSuite(
        tests=[DataDriftTestPreset()],
        timestamp=batch_end,
        tags=next_snapshot_tags(),
    )

    data_drift_test_suite.run(
        reference_data=reference,
        current_data=current.loc[in_batch],
        column_mapping=column_mapping,
    )
    return data_drift_test_suite
164  
165  
def create_project(workspace: WorkspaceBase, name: str):
    """Create the Bikes demo project in ``workspace`` and populate its dashboard.

    Panels are added in this order: a title counter, two half-width counters (model calls, share of drifted
    features), a full-width target/prediction plot, half-width MAE and MAPE plots, and a full-width
    feature-drift plot.

    Args:
        workspace: workspace in which the project is created.
        name: name passed to ``workspace.create_project``.

    Returns:
        The project, after ``save()`` has been called on it.
    """
    project = workspace.create_project(name)
    project.description = "A toy demo project using Bike Demand forecasting dataset"

    def unfiltered():
        # A fresh filter object per panel, with no metadata or tag constraints.
        return ReportFilter(metadata_values={}, tag_values=[])

    def attach(panel):
        project.dashboard.add_panel(panel)

    # Title banner: a counter panel without a value.
    attach(
        DashboardPanelCounter(
            title="Bike Rental Demand Forecast",
            agg=CounterAgg.NONE,
            filter=unfiltered(),
        )
    )

    # Half-width counters: (title, metric id, field path, label used for both legend and text, aggregation).
    counter_specs = (
        (
            "Model Calls",
            "DatasetSummaryMetric",
            metrics.DatasetSummaryMetric.fields.current.number_of_rows,
            "count",
            CounterAgg.SUM,
        ),
        (
            "Share of Drifted Features",
            "DatasetDriftMetric",
            "share_of_drifted_columns",
            "share",
            CounterAgg.LAST,
        ),
    )
    for heading, metric_name, path, label, aggregation in counter_specs:
        attach(
            DashboardPanelCounter(
                title=heading,
                filter=unfiltered(),
                value=PanelValue(metric_id=metric_name, field_path=path, legend=label),
                text=label,
                agg=aggregation,
                size=WidgetSize.HALF,
            )
        )

    # Daily mean of the target next to the daily mean of the prediction.
    mean_series = (
        ("cnt", "Target (daily mean)"),
        ("prediction", "Prediction (daily mean)"),
    )
    attach(
        DashboardPanelPlot(
            title="Target and Prediction",
            filter=unfiltered(),
            values=[
                PanelValue(
                    metric_id="ColumnSummaryMetric",
                    field_path="current_characteristics.mean",
                    metric_args={"column_name.name": column},
                    legend=caption,
                )
                for column, caption in mean_series
            ],
            plot_type=PlotType.LINE,
            size=WidgetSize.FULL,
        )
    )

    # Regression quality plots; the title doubles as the legend.
    quality_specs = (
        ("MAE", metrics.RegressionQualityMetric.fields.current.mean_abs_error),
        ("MAPE", metrics.RegressionQualityMetric.fields.current.mean_abs_perc_error),
    )
    for heading, path in quality_specs:
        attach(
            DashboardPanelPlot(
                title=heading,
                filter=unfiltered(),
                values=[
                    PanelValue(metric_id="RegressionQualityMetric", field_path=path, legend=heading),
                ],
                plot_type=PlotType.LINE,
                size=WidgetSize.HALF,
            )
        )

    # One drift-score line per monitored feature.
    attach(
        DashboardPanelPlot(
            title="Features Drift (Wasserstein Distance)",
            filter=unfiltered(),
            values=[
                PanelValue(
                    metric_id="ColumnDriftMetric",
                    metric_args={"column_name.name": feature},
                    field_path=metrics.ColumnDriftMetric.fields.drift_score,
                    legend=feature,
                )
                for feature in ("temp", "atemp", "hum", "windspeed")
            ],
            plot_type=PlotType.LINE,
            size=WidgetSize.FULL,
        )
    )

    project.save()
    return project
292  
293  
# Demo project definition wiring together the factory functions defined in this module.
bikes_demo_project = DemoProject(
    name="Demo project - Bikes",
    count=28,
    # factories
    create_project=create_project,
    create_data=create_data,
    create_report=create_report,
    create_test_suite=create_test_suite,
    create_snapshot=None,
)

# Script entry point: build the demo project, passing the literal "workspace" to create().
if __name__ == "__main__":
    # create_demo_project("http://localhost:8080")
    bikes_demo_project.create("workspace")