# bikes.py
"""Demo project built on the bike sharing dataset.

The module loads the hourly bike sharing data, trains a random forest on a
reference window, and exposes factories that build one Evidently report and
one test suite per day of the current window, plus the dashboard panels of
the demo project.
"""
import io
import os
import zipfile
from datetime import datetime
from datetime import time
from datetime import timedelta
from itertools import cycle

import pandas as pd
import requests
from dateutil.relativedelta import relativedelta
from sklearn import ensemble

from evidently.legacy import metrics
from evidently.legacy.pipeline.column_mapping import ColumnMapping
from evidently.legacy.renderers.html_widgets import WidgetSize
from evidently.legacy.report import Report
from evidently.legacy.test_preset import DataDriftTestPreset
from evidently.legacy.test_suite import TestSuite
from evidently.legacy.ui.dashboards import CounterAgg
from evidently.legacy.ui.dashboards import DashboardPanelCounter
from evidently.legacy.ui.dashboards import DashboardPanelPlot
from evidently.legacy.ui.dashboards import PanelValue
from evidently.legacy.ui.dashboards import PlotType
from evidently.legacy.ui.dashboards import ReportFilter
from evidently.legacy.ui.demo_projects import DemoProject
from evidently.legacy.ui.workspace.base import WorkspaceBase

# Local copies of the archive, tried in order before falling back to a download.
_LOCAL_DATASET_PATHS = (
    "Bike-Sharing-Dataset.zip",
    "../../../../../test_data/bike_sharing_dataset.zip",
)
_DATASET_URL = "https://archive.ics.uci.edu/static/public/275/bike+sharing+dataset.zip"
# Applied to connect and to each read, so a stalled server cannot hang the demo.
_DOWNLOAD_TIMEOUT_SECONDS = 60

# First day of the "current" window; snapshot ``i`` covers this day plus ``i`` days.
_FIRST_SNAPSHOT_DAY = datetime(2023, 1, 29)


def _load_dataset_archive():
    """Return the raw bytes of the bike sharing zip archive.

    A local file is used when one of ``_LOCAL_DATASET_PATHS`` exists;
    otherwise the archive is downloaded from ``_DATASET_URL``.

    Raises:
        ValueError: if the download does not return HTTP 200, or the response
            is not declared as ``application/zip``.
    """
    for path in _LOCAL_DATASET_PATHS:
        if os.path.exists(path):
            with open(path, "rb") as f:
                return f.read()

    # NOTE(review): ``verify=False`` disables TLS certificate verification.
    # It is kept as in the original; confirm whether it is still required
    # for this host before removing it.
    response = requests.get(
        _DATASET_URL,
        verify=False,
        timeout=_DOWNLOAD_TIMEOUT_SECONDS,
    )
    if response.status_code != 200:
        raise ValueError(f"Could not download bike sharing dataset. {response.text}")
    # ``.get`` so that a missing header is reported as an invalid content type
    # instead of surfacing as a bare KeyError.
    content_type = response.headers.get("content-type")
    if content_type != "application/zip":
        raise ValueError(f"Invalid bike sharing dataset content type: {content_type}.")
    return response.content


def create_data():
    """Load the dataset, fit the demo model and return the demo inputs.

    Returns:
        A ``(current, reference, column_mapping)`` tuple. ``reference`` holds
        2023-01-01 .. 2023-01-28 and ``current`` holds 2023-01-29 .. 2023-02-28;
        both carry a ``prediction`` column produced by a random forest fitted
        on ``reference``.

    Raises:
        ValueError: if the dataset has to be downloaded and the download fails
            or returns an unexpected content type.
    """
    content = _load_dataset_archive()
    with zipfile.ZipFile(io.BytesIO(content)) as arc:
        raw_data = pd.read_csv(
            arc.open("hour.csv"),
            header=0,
            sep=",",
            parse_dates=["dteday"],
            index_col="dteday",
        )

    # Build an hourly timestamp from the date index and the ``hr`` column and
    # shift it forward by 11 years (presumably so the data lines up with the
    # 2023 windows selected below).
    raw_data.index = raw_data.apply(
        lambda row: datetime.combine(row.name, time(hour=int(row["hr"]))) + relativedelta(years=11),
        axis=1,
    )
    raw_data.sort_index(inplace=True)

    # Explicit copies: a ``prediction`` column is added to both frames below,
    # and assigning into a slice of ``raw_data`` is chained assignment.
    reference = raw_data.loc["2023-01-01 00:00:00":"2023-01-28 23:00:00"].copy()
    current = raw_data.loc["2023-01-29 00:00:00":"2023-02-28 23:00:00"].copy()

    target = "cnt"
    prediction = "prediction"
    numerical_features = ["temp", "atemp", "hum", "windspeed", "hr", "weekday"]
    categorical_features = ["season", "holiday", "workingday"]
    features = numerical_features + categorical_features

    column_mapping = ColumnMapping()
    column_mapping.target = target
    column_mapping.prediction = prediction
    column_mapping.numerical_features = numerical_features
    column_mapping.categorical_features = categorical_features

    regressor = ensemble.RandomForestRegressor(random_state=0, n_estimators=50)
    regressor.fit(reference[features], reference[target])

    reference[prediction] = regressor.predict(reference[features])
    current[prediction] = regressor.predict(current[features])

    return current, reference, column_mapping


def snapshot_tags_generator():
    """Yield tag lists for snapshots, repeating a fixed pattern of nine lists forever."""
    tags = [
        "production_critical",
        "city_bikes_hourly",
        "tabular_data",
        "regression_batch_model",
        "high_seasonality",
        "numerical_features",
        "categorical_features",
        "no_missing_values",
    ]

    yield from cycle(
        [
            [tags[0], tags[1], tags[2]],
            [tags[1]],
            [],
            [tags[2]],
            [tags[3], tags[4]],
            [],
            [tags[4], tags[5], tags[6], tags[7]],
            [],
            [],
        ]
    )


# Single module-level generator: reports and test suites draw from the same cycle.
SNAPSHOT_TAGS = snapshot_tags_generator()


def next_snapshot_tags():
    """Return the next tag list from the shared ``SNAPSHOT_TAGS`` cycle."""
    return next(SNAPSHOT_TAGS)


def _daily_batch(current, i: int):
    """Return the rows of ``current`` that fall on day ``i`` of the current window.

    The window is half-open, ``[start, start + 1 day)``. A label slice
    ``current.loc[start:end]`` includes both endpoints, which would put the
    00:00 row of the following day into two consecutive batches.
    """
    start = _FIRST_SNAPSHOT_DAY + timedelta(days=i)
    end = start + timedelta(days=1)
    in_window = (current.index >= start) & (current.index < end)
    return current.loc[in_window]


def create_report(i: int, data):
    """Build and run the daily report for day ``i``.

    Args:
        i: zero-based day offset from ``_FIRST_SNAPSHOT_DAY``.
        data: the ``(current, reference, column_mapping)`` tuple returned by
            ``create_data``.

    Returns:
        The executed ``Report``, timestamped at the end of day ``i``.
    """
    current, reference, column_mapping = data

    drift_columns = ["cnt", "prediction", "temp", "atemp", "hum", "windspeed"]
    data_drift_report = Report(
        metrics=[
            metrics.RegressionQualityMetric(),
            metrics.DatasetSummaryMetric(),
            metrics.DatasetDriftMetric(),
            *[metrics.ColumnDriftMetric(column_name=column, stattest="wasserstein") for column in drift_columns],
            metrics.ColumnSummaryMetric(column_name="cnt"),
            metrics.ColumnSummaryMetric(column_name="prediction"),
        ],
        timestamp=_FIRST_SNAPSHOT_DAY + timedelta(days=i + 1),
        tags=next_snapshot_tags(),
    )
    data_drift_report.set_batch_size("daily")

    data_drift_report.run(
        reference_data=reference,
        current_data=_daily_batch(current, i),
        column_mapping=column_mapping,
    )
    return data_drift_report


def create_test_suite(i: int, data):
    """Build and run the daily data drift test suite for day ``i``.

    Args:
        i: zero-based day offset from ``_FIRST_SNAPSHOT_DAY``.
        data: the ``(current, reference, column_mapping)`` tuple returned by
            ``create_data``.

    Returns:
        The executed ``TestSuite``, timestamped at the end of day ``i``.
    """
    current, reference, column_mapping = data

    data_drift_test_suite = TestSuite(
        tests=[DataDriftTestPreset()],
        timestamp=_FIRST_SNAPSHOT_DAY + timedelta(days=i + 1),
        tags=next_snapshot_tags(),
    )

    data_drift_test_suite.run(
        reference_data=reference,
        current_data=_daily_batch(current, i),
        column_mapping=column_mapping,
    )
    return data_drift_test_suite


def _match_all_filter():
    """Return a fresh ``ReportFilter`` with empty metadata and tag constraints."""
    return ReportFilter(metadata_values={}, tag_values=[])


def create_project(workspace: WorkspaceBase, name: str):
    """Create the demo project in ``workspace`` and populate its dashboard.

    Args:
        workspace: workspace in which the project is created.
        name: name of the new project.

    Returns:
        The saved project.
    """
    project = workspace.create_project(name)
    project.description = "A toy demo project using Bike Demand forecasting dataset"

    # Title-only panel.
    project.dashboard.add_panel(
        DashboardPanelCounter(
            filter=_match_all_filter(),
            agg=CounterAgg.NONE,
            title="Bike Rental Demand Forecast",
        )
    )
    project.dashboard.add_panel(
        DashboardPanelCounter(
            title="Model Calls",
            filter=_match_all_filter(),
            value=PanelValue(
                metric_id="DatasetSummaryMetric",
                field_path=metrics.DatasetSummaryMetric.fields.current.number_of_rows,
                legend="count",
            ),
            text="count",
            agg=CounterAgg.SUM,
            size=WidgetSize.HALF,
        )
    )
    project.dashboard.add_panel(
        DashboardPanelCounter(
            title="Share of Drifted Features",
            filter=_match_all_filter(),
            value=PanelValue(
                metric_id="DatasetDriftMetric",
                field_path="share_of_drifted_columns",
                legend="share",
            ),
            text="share",
            agg=CounterAgg.LAST,
            size=WidgetSize.HALF,
        )
    )
    project.dashboard.add_panel(
        DashboardPanelPlot(
            title="Target and Prediction",
            filter=_match_all_filter(),
            values=[
                PanelValue(
                    metric_id="ColumnSummaryMetric",
                    field_path="current_characteristics.mean",
                    metric_args={"column_name.name": "cnt"},
                    legend="Target (daily mean)",
                ),
                PanelValue(
                    metric_id="ColumnSummaryMetric",
                    field_path="current_characteristics.mean",
                    metric_args={"column_name.name": "prediction"},
                    legend="Prediction (daily mean)",
                ),
            ],
            plot_type=PlotType.LINE,
            size=WidgetSize.FULL,
        )
    )
    project.dashboard.add_panel(
        DashboardPanelPlot(
            title="MAE",
            filter=_match_all_filter(),
            values=[
                PanelValue(
                    metric_id="RegressionQualityMetric",
                    field_path=metrics.RegressionQualityMetric.fields.current.mean_abs_error,
                    legend="MAE",
                ),
            ],
            plot_type=PlotType.LINE,
            size=WidgetSize.HALF,
        )
    )
    project.dashboard.add_panel(
        DashboardPanelPlot(
            title="MAPE",
            filter=_match_all_filter(),
            values=[
                PanelValue(
                    metric_id="RegressionQualityMetric",
                    field_path=metrics.RegressionQualityMetric.fields.current.mean_abs_perc_error,
                    legend="MAPE",
                ),
            ],
            plot_type=PlotType.LINE,
            size=WidgetSize.HALF,
        )
    )
    # One drift-score line per feature, in this order.
    project.dashboard.add_panel(
        DashboardPanelPlot(
            title="Features Drift (Wasserstein Distance)",
            filter=_match_all_filter(),
            values=[
                PanelValue(
                    metric_id="ColumnDriftMetric",
                    metric_args={"column_name.name": feature},
                    field_path=metrics.ColumnDriftMetric.fields.drift_score,
                    legend=feature,
                )
                for feature in ("temp", "atemp", "hum", "windspeed")
            ],
            plot_type=PlotType.LINE,
            size=WidgetSize.FULL,
        )
    )
    project.save()
    return project


bikes_demo_project = DemoProject(
    name="Demo project - Bikes",
    create_data=create_data,
    create_snapshot=None,
    create_report=create_report,
    create_project=create_project,
    create_test_suite=create_test_suite,
    count=28,
)

if __name__ == "__main__":
    # create_demo_project("http://localhost:8080")
    bikes_demo_project.create("workspace")