compare_all_models.py
# -*- coding: utf-8 -*-
"""Compare all detection algorithms by plotting their decision boundaries
and the number of prediction errors on synthetic data.
"""
# Author: Yue Zhao <zhaoy@cmu.edu>
# License: BSD 2 clause

from __future__ import division
from __future__ import print_function

import os
import sys

# temporary solution for relative imports in case pyod is not installed
# if pyod is installed, no need to use the following line
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

# suppress warnings for clean output
import warnings

warnings.filterwarnings("ignore")
import numpy as np
from numpy import percentile
import matplotlib.pyplot as plt
import matplotlib.font_manager

# Import all models
from pyod.models.abod import ABOD
from pyod.models.cblof import CBLOF
from pyod.models.feature_bagging import FeatureBagging
from pyod.models.hbos import HBOS
from pyod.models.iforest import IForest
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.pca import PCA
from pyod.models.lscp import LSCP
from pyod.models.inne import INNE
from pyod.models.gmm import GMM
from pyod.models.kde import KDE
from pyod.models.lmdd import LMDD

from pyod.models.dif import DIF
from pyod.models.copod import COPOD
from pyod.models.ecod import ECOD
from pyod.models.suod import SUOD
from pyod.models.qmcd import QMCD
from pyod.models.sampling import Sampling
from pyod.models.kpca import KPCA
from pyod.models.lunar import LUNAR

# TODO: add neural networks, LOCI, SOS, COF, SOD

# Define the number of inliers and outliers
n_samples = 200
outliers_fraction = 0.25
clusters_separation = [0]

# Compare given detectors under given settings
# Initialize the data
xx, yy = np.meshgrid(np.linspace(-7, 7, 100), np.linspace(-7, 7, 100))
n_inliers = int((1. - outliers_fraction) * n_samples)
n_outliers = int(outliers_fraction * n_samples)
ground_truth = np.zeros(n_samples, dtype=int)
ground_truth[-n_outliers:] = 1

# initialize a set of detectors for LSCP
detector_list = [LOF(n_neighbors=5), LOF(n_neighbors=10), LOF(n_neighbors=15),
                 LOF(n_neighbors=20), LOF(n_neighbors=25), LOF(n_neighbors=30),
                 LOF(n_neighbors=35), LOF(n_neighbors=40), LOF(n_neighbors=45),
                 LOF(n_neighbors=50)]

# Show the statistics of the data
print('Number of inliers: %i' % n_inliers)
print('Number of outliers: %i' % n_outliers)
print('Ground truth shape is {shape}. Outliers are 1 and inliers are 0.\n'
      .format(shape=ground_truth.shape))
print(ground_truth, '\n')

random_state = 42
# Define the 25 outlier detection tools to be compared
classifiers = {
    'Angle-based Outlier Detector (ABOD)':
        ABOD(contamination=outliers_fraction),
    'K Nearest Neighbors (KNN)': KNN(
        contamination=outliers_fraction),
    'Average KNN': KNN(method='mean',
                       contamination=outliers_fraction),
    'Median KNN': KNN(method='median',
                      contamination=outliers_fraction),
    'Local Outlier Factor (LOF)':
        LOF(n_neighbors=35, contamination=outliers_fraction),

    'Isolation Forest': IForest(contamination=outliers_fraction,
                                random_state=random_state),
    'Deep Isolation Forest (DIF)': DIF(contamination=outliers_fraction,
                                       random_state=random_state),
    'INNE': INNE(
        max_samples=2, contamination=outliers_fraction,
        random_state=random_state,
    ),

    'Locally Selective Combination (LSCP)': LSCP(
        detector_list, contamination=outliers_fraction,
        random_state=random_state),
    'Feature Bagging':
        FeatureBagging(LOF(n_neighbors=35),
                       contamination=outliers_fraction,
                       random_state=random_state),
    'SUOD': SUOD(contamination=outliers_fraction),

    'Minimum Covariance Determinant (MCD)': MCD(
        contamination=outliers_fraction, random_state=random_state),

    'Principal Component Analysis (PCA)': PCA(
        contamination=outliers_fraction, random_state=random_state),
    'KPCA': KPCA(
        contamination=outliers_fraction),

    'Probabilistic Mixture Modeling (GMM)': GMM(
        contamination=outliers_fraction, random_state=random_state),

    'LMDD': LMDD(contamination=outliers_fraction,
                 random_state=random_state),

    'Histogram-based Outlier Detection (HBOS)': HBOS(
        contamination=outliers_fraction),

    'Copula-based Outlier Detection (COPOD)': COPOD(
        contamination=outliers_fraction),

    'ECDF-based Outlier Detection (ECOD)': ECOD(
        contamination=outliers_fraction),
    'Kernel Density Estimation (KDE)': KDE(contamination=outliers_fraction),

    'QMCD': QMCD(
        contamination=outliers_fraction),

    'Sampling': Sampling(
        contamination=outliers_fraction),

    'LUNAR': LUNAR(),

    'Cluster-based Local Outlier Factor (CBLOF)':
        CBLOF(contamination=outliers_fraction,
              check_estimator=False, random_state=random_state),

    'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction),
}

# Show all detectors
for i, clf in enumerate(classifiers.keys()):
    print('Model', i + 1, clf)

# Fit the models with the generated data and
# compare model performances
for offset in clusters_separation:
    np.random.seed(42)
    # Data generation: two Gaussian inlier clusters
    X1 = 0.3 * np.random.randn(n_inliers // 2, 2) - offset
    X2 = 0.3 * np.random.randn(n_inliers // 2, 2) + offset
    X = np.r_[X1, X2]
    # Add uniformly distributed outliers
    X = np.r_[X, np.random.uniform(low=-6, high=6, size=(n_outliers, 2))]

    # Fit the models
    plt.figure(figsize=(20, 22))
    for i, (clf_name, clf) in enumerate(classifiers.items()):
        print()
        print(i + 1, 'fitting', clf_name)
        # fit the data and tag outliers
        clf.fit(X)
        scores_pred = clf.decision_function(X) * -1
        y_pred = clf.predict(X)
        threshold = percentile(scores_pred, 100 * outliers_fraction)
        n_errors = (y_pred != ground_truth).sum()

        # plot the level lines and the points
        Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) * -1
        Z = Z.reshape(xx.shape)
        subplot = plt.subplot(5, 5, i + 1)
        subplot.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7),
                         cmap=plt.cm.Blues_r)
        # a = subplot.contour(xx, yy, Z, levels=[threshold],
        #                     linewidths=2, colors='red')
        subplot.contourf(xx, yy, Z, levels=[threshold, Z.max()],
                         colors='orange')
        b = subplot.scatter(X[:-n_outliers, 0], X[:-n_outliers, 1], c='white',
                            s=20, edgecolor='k')
        c = subplot.scatter(X[-n_outliers:, 0], X[-n_outliers:, 1], c='black',
                            s=20, edgecolor='k')
        subplot.axis('tight')
        subplot.legend(
            [
                # a.collections[0],
                b, c],
            [
                # 'learned decision function',
                'true inliers', 'true outliers'],
            prop=matplotlib.font_manager.FontProperties(size=10),
            loc='lower right')
        subplot.set_xlabel("%d. %s (errors: %d)" % (i + 1, clf_name, n_errors))
        subplot.set_xlim((-7, 7))
        subplot.set_ylim((-7, 7))
    plt.subplots_adjust(0.04, 0.1, 0.96, 0.94, 0.1, 0.26)
    plt.suptitle("25 outlier detection algorithms on synthetic data",
                 fontsize=35)
plt.savefig('ALL.png', dpi=300, bbox_inches='tight')
plt.show()
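
# Optional: a quantitative comparison to complement the plots. This is a
# minimal sketch, not part of the original example; it assumes the detectors
# above are still fitted and uses scikit-learn (already a PyOD dependency).
# After fit(), PyOD detectors expose the raw outlier scores of the training
# data in `decision_scores_`, where larger values mean more abnormal, which
# matches the orientation roc_auc_score expects for ground_truth (outliers
# labeled 1).
from sklearn.metrics import roc_auc_score

for clf_name, clf in classifiers.items():
    # score each detector against the known labels of the synthetic data
    roc = roc_auc_score(ground_truth, clf.decision_scores_)
    print('%s ROC AUC: %.4f' % (clf_name, roc))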