# datasets.py
import dataclasses
import pathlib
import shutil
from enum import Enum
from typing import Any
from typing import List
from typing import Optional

import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn import ensemble

from evidently.legacy.pipeline.column_mapping import ColumnMapping


class DatasetTags(Enum):
    """Labels that can be attached to a ``TestDataset`` through its ``tags`` list."""

    HAS_PREDICTION = "has_prediction"
    HAS_TARGET = "has_target"
    CLASSIFICATION = "classification"
    PROB_PREDICTIONS = "prob_predictions"
    BINARY_CLASSIFICATION = "binary_classification"
    MULTICLASS_CLASSIFICATION = "multiclass_classification"
    REGRESSION = "regression"
    RECSYS = "recsys"


@dataclasses.dataclass(eq=True)
class TestDataset:
    """A named pair of current/reference data with optional extras.

    All fields have defaults, so an instance can be built positionally
    (``name, current, reference, additional_data``) or by keyword.
    """

    name: str = ""
    current: Any = None
    reference: Any = None
    additional_data: Any = None

    tags: List[DatasetTags] = dataclasses.field(default_factory=list)
    column_mapping: Optional[ColumnMapping] = None

    def __hash__(self):
        """Hash by object identity.

        ``eq=True`` without an explicit ``__hash__`` would make the class
        unhashable; defining it here keeps instances usable as dict keys
        and set members.
        """
        return id(self)


# Filled at import time: every function decorated with ``@dataset`` is called
# once and its result is appended here, in definition order.
dataset_fixtures: List[TestDataset] = []


def dataset(f):
    """Register the dataset built by *f*.

    Calls ``f()`` immediately, appends the returned value to
    ``dataset_fixtures`` and returns *f* itself unchanged.
    """
    # fixture = pytest.fixture(scope="session")(f)
    dataset_fixtures.append(f())
    return f


@dataset
def bcancer():
    """Breast-cancer data with the positive-class probability as ``prediction``.

    Reference (300 rows) and current (200 rows) are sampled independently
    without replacement from the same frame; no ``random_state`` is passed to
    ``sample``. A 10-tree random forest is fitted on the reference sample.
    """
    bcancer_data = datasets.load_breast_cancer(as_frame=True)
    frame = bcancer_data.frame
    feature_cols = bcancer_data.feature_names.tolist()

    # Keep this order: reference is drawn before current.
    bcancer_ref = frame.sample(n=300, replace=False)
    bcancer_cur = frame.sample(n=200, replace=False)

    model = ensemble.RandomForestClassifier(random_state=1, n_estimators=10)
    model.fit(bcancer_ref[feature_cols], bcancer_ref.target)

    # Column 1 of predict_proba is the probability of the second class.
    bcancer_ref["prediction"] = model.predict_proba(bcancer_ref[feature_cols])[:, 1]
    bcancer_cur["prediction"] = model.predict_proba(bcancer_cur[feature_cols])[:, 1]

    return TestDataset(
        "bcancer",
        bcancer_cur,
        bcancer_ref,
        tags=[
            DatasetTags.CLASSIFICATION,
            DatasetTags.PROB_PREDICTIONS,
            DatasetTags.HAS_TARGET,
            DatasetTags.BINARY_CLASSIFICATION,
            DatasetTags.HAS_PREDICTION,
        ],
    )


@dataset
def bcancer_label():
    """Breast-cancer data with the predicted class label as ``prediction``.

    Built like ``bcancer`` (independent 300/200-row samples, 10-tree random
    forest fitted on the reference sample), but ``prediction`` holds the
    output of ``model.predict`` and is written to deep copies of the samples.
    """
    bcancer_data = datasets.load_breast_cancer(as_frame=True)
    frame = bcancer_data.frame
    feature_cols = bcancer_data.feature_names.tolist()

    # Keep this order: reference is drawn before current.
    bcancer_ref = frame.sample(n=300, replace=False)
    bcancer_cur = frame.sample(n=200, replace=False)

    bcancer_label_ref = bcancer_ref.copy(deep=True)
    bcancer_label_cur = bcancer_cur.copy(deep=True)

    model = ensemble.RandomForestClassifier(random_state=1, n_estimators=10)
    model.fit(bcancer_ref[feature_cols], bcancer_ref.target)

    bcancer_label_ref["prediction"] = model.predict(bcancer_label_ref[feature_cols])
    bcancer_label_cur["prediction"] = model.predict(bcancer_label_cur[feature_cols])
    return TestDataset(
        "bcancer_label",
        bcancer_label_cur,
        bcancer_label_ref,
        tags=[
            DatasetTags.CLASSIFICATION,
            DatasetTags.HAS_TARGET,
            DatasetTags.BINARY_CLASSIFICATION,
            DatasetTags.HAS_PREDICTION,
        ],
    )


@dataset
def adult():
    """Adult data split by education level, with NaNs injected into current.

    Rows whose ``education`` is "Some-college", "HS-grad" or "Bachelors" form
    the current frame; all other rows form the reference frame. The first
    2000 rows of current get NaN in positional columns 3 and 4.
    """
    adult = pd.read_parquet(
        pathlib.Path(__file__).parent.joinpath("../../test_data/adults.parquet"),
    )
    adult["education"] = adult["education"].astype(object)

    current_levels = ["Some-college", "HS-grad", "Bachelors"]
    in_current = adult["education"].isin(current_levels)

    adult_ref = adult[~in_current]
    # Explicit copy: the frame is mutated below, and writing into a filtered
    # selection raises SettingWithCopyWarning on pandas without Copy-on-Write.
    adult_cur = adult[in_current].copy()

    adult_cur.iloc[:2000, 3:5] = np.nan
    return TestDataset("adult", adult_cur, adult_ref, tags=[])


@dataset
def housing():
    """California housing data with a noisy copy of the target as ``prediction``.

    The bundled ``cal_housing_py3.pkz`` is copied into scikit-learn's data
    home before fetching (presumably so the fetch finds a local copy instead
    of downloading - confirm). ``MedHouseVal`` is renamed to ``target`` and
    ``prediction`` is the target plus normal noise (mean 0, std 3) drawn from
    NumPy's global random state. Reference and current are independent
    5000-row samples without replacement.
    """
    shutil.copy(
        pathlib.Path(__file__).resolve().parents[2] / "test_data" / "cal_housing_py3.pkz", datasets.get_data_home()
    )
    housing_data = datasets.fetch_california_housing(as_frame=True)
    housing = housing_data.frame

    housing.rename(columns={"MedHouseVal": "target"}, inplace=True)
    noise = np.random.normal(0, 3, housing.shape[0])
    housing["prediction"] = housing_data["target"].values + noise

    # Keep this order: reference is drawn before current.
    housing_ref = housing.sample(n=5000, replace=False)
    housing_cur = housing.sample(n=5000, replace=False)
    return TestDataset(
        "housing",
        housing_cur,
        housing_ref,
        tags=[DatasetTags.REGRESSION, DatasetTags.HAS_PREDICTION, DatasetTags.HAS_TARGET],
    )


@dataset
def reviews():
    """Reviews data split by rating, with an explicit column mapping.

    ``prediction`` is a copy of ``Rating``. Reference is 5000 rows sampled
    with replacement from rows with ``Rating > 3``; current is 5000 rows
    sampled with replacement from rows with ``Rating < 3``. Both samples use
    ``random_state=42``.
    """
    reviews = pd.read_parquet(
        pathlib.Path(__file__).parent.joinpath("../../test_data/reviews.parquet"),
    )

    reviews["prediction"] = reviews["Rating"]
    reviews_ref = reviews[reviews.Rating > 3].sample(
        n=5000, replace=True, ignore_index=True, random_state=42
    )  # .dropna()
    reviews_cur = reviews[reviews.Rating < 3].sample(
        n=5000, replace=True, ignore_index=True, random_state=42
    )  # .dropna()

    column_mapping = ColumnMapping(
        target="Rating",
        numerical_features=["Age", "Positive_Feedback_Count"],
        categorical_features=["Division_Name", "Department_Name", "Class_Name"],
        text_features=["Review_Text", "Title"],
    )

    return TestDataset(name="reviews", current=reviews_cur, reference=reviews_ref, column_mapping=column_mapping)


@dataset
def recsys():
    """Small synthetic 100-row frame tagged as a recommender-system dataset.

    Ten user ids (0-9) with ten rows each; ``prediction`` cycles 1..10 within
    each user. Item ids, targets and the two features are drawn from NumPy's
    global random state, which is re-seeded before each draw. The same frame
    is used as current, reference and ``additional_data["current_train_data"]``.
    """
    # Each user id repeated ten times: 0 x10, 1 x10, ..., 9 x10.
    users = [user for user in range(10) for _ in range(10)]
    np.random.seed(0)
    items = np.random.randint(0, high=100, size=100)
    rank = list(range(1, 11)) * 10
    np.random.seed(0)
    true = np.random.choice([1, 0], 100, p=[0.1, 0.9])
    np.random.seed(1)
    feature_1 = np.random.choice([1, 0], 100)
    np.random.seed(2)
    feature_2 = np.random.choice([1, 0], 100)

    df = pd.DataFrame(
        {
            "user_id": users,
            "item_id": items,
            "prediction": rank,
            "target": true,
            "feature_1": feature_1,
            "feature_2": feature_2,
        }
    )

    return TestDataset("recsys", df, df, {"current_train_data": df}, tags=[DatasetTags.RECSYS])