benchmark.py
1 # -*- coding: utf-8 -*- 2 """Benchmark of all implemented algorithms 3 """ 4 # Author: Yue Zhao <zhaoy@cmu.edu> 5 # License: BSD 2 clause 6 7 from __future__ import division 8 from __future__ import print_function 9 10 import os 11 import sys 12 from time import time 13 14 # temporary solution for relative imports in case pyod is not installed 15 # if pyod is installed, no need to use the following line 16 sys.path.append( 17 os.path.abspath(os.path.join(os.path.dirname("__file__"), '..'))) 18 # supress warnings for clean output 19 import warnings 20 21 warnings.filterwarnings("ignore") 22 23 import numpy as np 24 import pandas as pd 25 from sklearn.model_selection import train_test_split 26 27 from pyod.models.abod import ABOD 28 from pyod.models.cblof import CBLOF 29 from pyod.models.feature_bagging import FeatureBagging 30 from pyod.models.hbos import HBOS 31 from pyod.models.iforest import IForest 32 from pyod.models.knn import KNN 33 from pyod.models.lof import LOF 34 from pyod.models.mcd import MCD 35 from pyod.models.ocsvm import OCSVM 36 from pyod.models.pca import PCA 37 from pyod.models.cof import COF 38 from pyod.models.sod import SOD 39 40 41 from pyod.models.auto_encoder import AutoEncoder 42 from pyod.models.cd import CD 43 from pyod.models.copod import COPOD 44 from pyod.models.dif import DIF 45 from pyod.models.ecod import ECOD 46 from pyod.models.gmm import GMM 47 from pyod.models.kde import KDE 48 from pyod.models.lmdd import LMDD 49 from pyod.models.loci import LOCI #19S 50 from pyod.models.loda import LODA 51 from pyod.models.qmcd import QMCD 52 from pyod.models.sampling import Sampling 53 from pyod.models.sos import SOS 54 55 from pyod.models.alad import ALAD #40s 56 from pyod.models.anogan import AnoGAN #151s 57 from pyod.models.inne import INNE 58 from pyod.models.kpca import KPCA 59 from pyod.models.lscp import LSCP 60 from pyod.models.lunar import LUNAR 61 from pyod.models.mad import MAD 62 from pyod.models.mo_gaal import MO_GAAL 63 from pyod.models.rgraph import RGraph #271S 64 from pyod.models.rod import ROD 65 from pyod.models.so_gaal import SO_GAAL 66 from pyod.models.sod import SOD 67 from pyod.models.vae import VAE 68 69 from pyod.utils.utility import standardizer 70 from pyod.utils.utility import precision_n_scores 71 from sklearn.metrics import roc_auc_score 72 73 # TODO: add neural networks, LOCI, SOS, COF, SOD 74 75 # Define data file and read X and y 76 csv_file_list = ['arrhythmia.csv', 77 'cardio.csv', 78 'glass.csv', 79 'ionosphere.csv', 80 'letter.csv', 81 'lympho.csv', 82 'mnist.csv', 83 'musk.csv', 84 'optdigits.csv', 85 'pendigits.csv', 86 'pima.csv', 87 'satellite.csv', 88 'satimage-2.csv', 89 'shuttle.csv', 90 'vertebral.csv', 91 'vowels.csv', 92 'wbc.csv' 93 ] 94 95 # define the number of iterations 96 n_ite = 1 97 98 df_columns = ['Data', '#Samples', '# Dimensions', 'Outlier Perc', 99 'ABOD', 'CBLOF', 'FB', 'HBOS', 'IForest', 'KNN', 'LOF', 100 'MCD', 'OCSVM', 'PCA', 'AutoEncoder', 'CD', 'COPOD', 'DIF', 'ECOD', 101 'GMM', 'KDE', 'LODA', 'QMCD','Sampling', 'SOS', 'ALAD', 'AnoGAN ', 102 'INNE', 'KPCA', 'LMDD', 'LOCI', 'LSCP', 'LUNAR', 'MO_GAAL', 'RGraph', 'SO_GAAL', 'SOD', 'VAE'] 103 104 n_classifiers = len(df_columns)-4 105 106 # initialize the container for saving the results 107 roc_df = pd.DataFrame(columns=df_columns) 108 prn_df = pd.DataFrame(columns=df_columns) 109 time_df = pd.DataFrame(columns=df_columns) 110 111 for j in range(len(csv_file_list)): 112 113 csv_file = csv_file_list[j] 114 data = np.genfromtxt(os.path.join('data', csv_file), delimiter=',', 115 skip_header=1) 116 X = data[:, :-1] 117 y = data[:, -1].astype(int) 118 outliers_fraction = np.count_nonzero(y) / len(y) 119 outliers_percentage = round(outliers_fraction * 100, ndigits=4) 120 121 # construct containers for saving results 122 roc_list = [csv_file[:-4], X.shape[0], X.shape[1], outliers_percentage] 123 prn_list = [csv_file[:-4], X.shape[0], X.shape[1], outliers_percentage] 124 time_list = [csv_file[:-4], X.shape[0], X.shape[1], outliers_percentage] 125 126 roc_mat = np.zeros([n_ite, n_classifiers]) 127 prn_mat = np.zeros([n_ite, n_classifiers]) 128 time_mat = np.zeros([n_ite, n_classifiers]) 129 130 for i in range(n_ite): 131 print("\n... Processing", csv_file, '...', 'Iteration', i + 1) 132 random_state = np.random.RandomState(i) 133 134 # 60% data for training and 40% for testing 135 X_train, X_test, y_train, y_test = \ 136 train_test_split(X, y, test_size=0.4, random_state=random_state) 137 138 # standardizing data for processing 139 X_train_norm, X_test_norm = standardizer(X_train, X_test) 140 141 classifiers = { 142 'Angle-based Outlier Detector (ABOD)': ABOD( 143 contamination=outliers_fraction), 144 'Cluster-based Local Outlier Factor': CBLOF( 145 n_clusters=10, 146 contamination=outliers_fraction, 147 check_estimator=False, 148 random_state=random_state), 149 'Feature Bagging': FeatureBagging( 150 contamination=outliers_fraction, 151 random_state=random_state), 152 'Histogram-base Outlier Detection (HBOS)': HBOS( 153 contamination=outliers_fraction), 154 'Isolation Forest': IForest( 155 contamination=outliers_fraction, 156 random_state=random_state), 157 'K Nearest Neighbors (KNN)': KNN( 158 contamination=outliers_fraction), 159 'Local Outlier Factor (LOF)': LOF( 160 contamination=outliers_fraction), 161 'Minimum Covariance Determinant (MCD)': MCD( 162 contamination=outliers_fraction, 163 random_state=random_state), 164 'One-class SVM (OCSVM)': OCSVM( 165 contamination=outliers_fraction), 166 'Principal Component Analysis (PCA)': PCA( 167 contamination=outliers_fraction, 168 random_state=random_state), 169 'AutoEncoder': AutoEncoder( 170 contamination=outliers_fraction), 171 'CD': CD( 172 contamination=outliers_fraction), 173 'COPOD': COPOD( 174 contamination=outliers_fraction), 175 'DIF': DIF( 176 contamination=outliers_fraction), 177 'ECOD': ECOD( 178 contamination=outliers_fraction), 179 'GMM': GMM( 180 contamination=outliers_fraction), 181 'KDE': KDE( 182 contamination=outliers_fraction), 183 184 'LODA': LODA( 185 contamination=outliers_fraction), 186 'QMCD': QMCD( 187 contamination=outliers_fraction), 188 'Sampling': Sampling( 189 contamination=outliers_fraction), 190 'SOS': SOS( 191 contamination=outliers_fraction), 192 # 'ALAD': ALAD( 193 # contamination=outliers_fraction), 194 # 'AnoGAN':AnoGAN( 195 # contamination=outliers_fraction), 196 'INNE': INNE(contamination=outliers_fraction), 197 'KPCA': KPCA(contamination=outliers_fraction), 198 'LMDD': LMDD(contamination=outliers_fraction), 199 # 'LOCI': LOCI(contamination=outliers_fraction), 200 'LUNAR': LUNAR(contamination=outliers_fraction), 201 'MO_GAAL': MO_GAAL(contamination=outliers_fraction), 202 # 'RGraph': RGraph(contamination=outliers_fraction), 203 # 'SO_GAAL': SO_GAAL(contamination=outliers_fraction), 204 'SOD': SOD(contamination=outliers_fraction), 205 206 } 207 classifiers_indices = { 208 'Angle-based Outlier Detector (ABOD)': 0, 209 'Cluster-based Local Outlier Factor': 1, 210 'Feature Bagging': 2, 211 'Histogram-base Outlier Detection (HBOS)': 3, 212 'Isolation Forest': 4, 213 'K Nearest Neighbors (KNN)': 5, 214 'Local Outlier Factor (LOF)': 6, 215 'Minimum Covariance Determinant (MCD)': 7, 216 'One-class SVM (OCSVM)': 8, 217 'Principal Component Analysis (PCA)': 9, 218 'AutoEncoder': 10, 219 'CD': 11, 220 'COPOD': 12, 221 'DIF': 13, 222 'ECOD': 14, 223 'GMM': 15, 224 'KDE': 16, 225 'LODA': 17, 226 'QMCD': 18, 227 'Sampling': 19, 228 'SOS': 20, 229 'ALAD': 21, 230 'AnoGAN': 22, 231 'INNE': 23, 232 'KPCA': 24, 233 'LMDD': 25, 234 'LOCI': 26, 235 'LUNAR': 27, 236 'MO_GAAL': 28, 237 'RGraph': 29, 238 'SO_GAAL': 30, 239 'SOD': 31, 240 241 242 243 } 244 245 246 for clf_name, clf in classifiers.items(): 247 t0 = time() 248 clf.fit(X_train_norm) 249 test_scores = clf.decision_function(X_test_norm) 250 251 # Handle NaN values in test_scores 252 test_scores = np.nan_to_num(test_scores, 253 nan=0.0, 254 posinf=np.nanmax(test_scores), 255 neginf=np.nanmin(test_scores)) 256 # Handle NaN values in y_test 257 y_test = np.nan_to_num(y_test, nan=0.0, posinf=0.0, neginf=0.0) 258 259 t1 = time() 260 duration = round(t1 - t0, ndigits=4) 261 262 roc = round(roc_auc_score(y_test, test_scores), ndigits=4) 263 prn = round(precision_n_scores(y_test, test_scores), ndigits=4) 264 265 print('{clf_name} ROC:{roc}, precision @ rank n:{prn}, ' 266 'execution time: {duration}s'.format( 267 clf_name=clf_name, roc=roc, prn=prn, duration=duration)) 268 269 time_mat[i, classifiers_indices[clf_name]] = duration 270 roc_mat[i, classifiers_indices[clf_name]] = roc 271 prn_mat[i, classifiers_indices[clf_name]] = prn 272 273 time_list = time_list + np.mean(time_mat, axis=0).tolist() 274 temp_df = pd.DataFrame(time_list).transpose() 275 temp_df.columns = df_columns 276 time_df = pd.concat([time_df, temp_df], axis=0) 277 278 roc_list = roc_list + np.mean(roc_mat, axis=0).tolist() 279 temp_df = pd.DataFrame(roc_list).transpose() 280 temp_df.columns = df_columns 281 roc_df = pd.concat([roc_df, temp_df], axis=0) 282 283 prn_list = prn_list + np.mean(prn_mat, axis=0).tolist() 284 temp_df = pd.DataFrame(prn_list).transpose() 285 temp_df.columns = df_columns 286 prn_df = pd.concat([prn_df, temp_df], axis=0) 287 288 # Save the results for each run 289 time_df.to_csv('time.csv', index=False, float_format='%.3f') 290 roc_df.to_csv('roc.csv', index=False, float_format='%.3f') 291 prn_df.to_csv('prc.csv', index=False, float_format='%.3f')