compare_all_models.py
# -*- coding: utf-8 -*-
"""Compare all detection algorithms by plotting their decision boundaries
and the number of prediction errors on synthetic data.
"""
# Author: Yue Zhao <zhaoy@cmu.edu>
# License: BSD 2 clause

from __future__ import division
from __future__ import print_function

import os
import sys

# temporary solution for relative imports in case pyod is not installed
# if pyod is installed, no need to use the following line
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

# suppress warnings for clean output
import warnings

warnings.filterwarnings("ignore")
import numpy as np
from numpy import percentile
import matplotlib.pyplot as plt
import matplotlib.font_manager

# Import all models
from pyod.models.abod import ABOD
from pyod.models.cblof import CBLOF
from pyod.models.feature_bagging import FeatureBagging
from pyod.models.hbos import HBOS
from pyod.models.iforest import IForest
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.pca import PCA
from pyod.models.lscp import LSCP
from pyod.models.inne import INNE
from pyod.models.gmm import GMM
from pyod.models.kde import KDE
from pyod.models.lmdd import LMDD

from pyod.models.dif import DIF
from pyod.models.copod import COPOD
from pyod.models.ecod import ECOD
from pyod.models.suod import SUOD
from pyod.models.qmcd import QMCD
from pyod.models.sampling import Sampling
from pyod.models.kpca import KPCA
from pyod.models.lunar import LUNAR

# TODO: add neural networks, LOCI, SOS, COF, SOD

# Define the number of inliers and outliers
n_samples = 200
outliers_fraction = 0.25
clusters_separation = [0]

# Compare given detectors under given settings
# Initialize the data
xx, yy = np.meshgrid(np.linspace(-7, 7, 100), np.linspace(-7, 7, 100))
n_inliers = int((1. - outliers_fraction) * n_samples)
n_outliers = int(outliers_fraction * n_samples)
ground_truth = np.zeros(n_samples, dtype=int)
ground_truth[-n_outliers:] = 1

# initialize a set of detectors for LSCP
detector_list = [LOF(n_neighbors=5), LOF(n_neighbors=10), LOF(n_neighbors=15),
                 LOF(n_neighbors=20), LOF(n_neighbors=25), LOF(n_neighbors=30),
                 LOF(n_neighbors=35), LOF(n_neighbors=40), LOF(n_neighbors=45),
                 LOF(n_neighbors=50)]

# Show the statistics of the data
print('Number of inliers: %i' % n_inliers)
print('Number of outliers: %i' % n_outliers)
print('Ground truth shape is {shape}. Outliers are 1 and inliers are 0.\n'
      .format(shape=ground_truth.shape))
print(ground_truth, '\n')

random_state = 42
# Define the 25 outlier detection tools to be compared
classifiers = {
    'Angle-based Outlier Detector (ABOD)':
        ABOD(contamination=outliers_fraction),
    'K Nearest Neighbors (KNN)': KNN(
        contamination=outliers_fraction),
    'Average KNN': KNN(method='mean',
                       contamination=outliers_fraction),
    'Median KNN': KNN(method='median',
                      contamination=outliers_fraction),
    'Local Outlier Factor (LOF)':
        LOF(n_neighbors=35, contamination=outliers_fraction),

    'Isolation Forest': IForest(contamination=outliers_fraction,
                                random_state=random_state),
    'Deep Isolation Forest (DIF)': DIF(contamination=outliers_fraction,
                                       random_state=random_state),
    'INNE': INNE(
        max_samples=2, contamination=outliers_fraction,
        random_state=random_state,
    ),

    'Locally Selective Combination (LSCP)': LSCP(
        detector_list, contamination=outliers_fraction,
        random_state=random_state),
    'Feature Bagging':
        FeatureBagging(LOF(n_neighbors=35),
                       contamination=outliers_fraction,
                       random_state=random_state),
    'SUOD': SUOD(contamination=outliers_fraction),

    'Minimum Covariance Determinant (MCD)': MCD(
        contamination=outliers_fraction, random_state=random_state),

    'Principal Component Analysis (PCA)': PCA(
        contamination=outliers_fraction, random_state=random_state),
    'KPCA': KPCA(
        contamination=outliers_fraction),

    'Probabilistic Mixture Modeling (GMM)': GMM(
        contamination=outliers_fraction, random_state=random_state),

    'LMDD': LMDD(contamination=outliers_fraction,
                 random_state=random_state),

    'Histogram-based Outlier Detection (HBOS)': HBOS(
        contamination=outliers_fraction),

    'Copula-based Outlier Detection (COPOD)': COPOD(
        contamination=outliers_fraction),

    'ECDF-based Outlier Detection (ECOD)': ECOD(
        contamination=outliers_fraction),
    'Kernel Density Estimation (KDE)': KDE(contamination=outliers_fraction),

    'QMCD': QMCD(
        contamination=outliers_fraction),

    'Sampling': Sampling(
        contamination=outliers_fraction),

    'LUNAR': LUNAR(),

    'Cluster-based Local Outlier Factor (CBLOF)':
        CBLOF(contamination=outliers_fraction,
              check_estimator=False, random_state=random_state),

    'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction),
}

# Show all detectors
for i, clf in enumerate(classifiers.keys()):
    print('Model', i + 1, clf)

# Fit the models with the generated data and
# compare model performances
for offset in clusters_separation:
    np.random.seed(42)
    # Data generation: two Gaussian inlier clusters
    X1 = 0.3 * np.random.randn(n_inliers // 2, 2) - offset
    X2 = 0.3 * np.random.randn(n_inliers // 2, 2) + offset
    X = np.r_[X1, X2]
    # Add uniformly distributed outliers
    X = np.r_[X, np.random.uniform(low=-6, high=6, size=(n_outliers, 2))]

    # Fit the models
    plt.figure(figsize=(20, 22))
    for i, (clf_name, clf) in enumerate(classifiers.items()):
        print()
        print(i + 1, 'fitting', clf_name)
        # fit the data and tag outliers
        clf.fit(X)
        scores_pred = clf.decision_function(X) * -1
        y_pred = clf.predict(X)
        threshold = percentile(scores_pred, 100 * outliers_fraction)
        n_errors = (y_pred != ground_truth).sum()

        # plot the level lines and the points
        Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) * -1
        Z = Z.reshape(xx.shape)
        subplot = plt.subplot(5, 5, i + 1)
        subplot.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7),
                         cmap=plt.cm.Blues_r)
        # a = subplot.contour(xx, yy, Z, levels=[threshold],
        #                     linewidths=2, colors='red')
        subplot.contourf(xx, yy, Z, levels=[threshold, Z.max()],
                         colors='orange')
        b = subplot.scatter(X[:-n_outliers, 0], X[:-n_outliers, 1], c='white',
                            s=20, edgecolor='k')
        c = subplot.scatter(X[-n_outliers:, 0], X[-n_outliers:, 1], c='black',
                            s=20, edgecolor='k')
        subplot.axis('tight')
        subplot.legend(
            [
                # a.collections[0],
                b, c],
            [
                # 'learned decision function',
                'true inliers', 'true outliers'],
            prop=matplotlib.font_manager.FontProperties(size=10),
            loc='lower right')
        subplot.set_xlabel("%d. %s (errors: %d)" % (i + 1, clf_name, n_errors))
        subplot.set_xlim((-7, 7))
        subplot.set_ylim((-7, 7))
    plt.subplots_adjust(0.04, 0.1, 0.96, 0.94, 0.1, 0.26)
    plt.suptitle("25 outlier detection algorithms on synthetic data",
                 fontsize=35)
plt.savefig('ALL.png', dpi=300, bbox_inches='tight')
plt.show()
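
# Optional: a quantitative comparison to complement the plots. This is a
# minimal sketch, not part of the original example; it assumes the detectors
# above are still fitted and uses scikit-learn (already a PyOD dependency).
# After fit(), PyOD detectors expose the raw outlier scores of the training
# data in `decision_scores_`, where larger values mean more abnormal, which
# matches the orientation roc_auc_score expects for ground_truth (outliers
# labeled 1).
from sklearn.metrics import roc_auc_score

for clf_name, clf in classifiers.items():
    # score each detector against the known labels of the synthetic data
    roc = roc_auc_score(ground_truth, clf.decision_scores_)
    print('%s ROC AUC: %.4f' % (clf_name, roc))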