lscp.py
1 """Locally Selective Combination of Parallel Outlier Ensembles (LSCP). 2 Adapted from the original implementation. 3 """ 4 # Author: Zain Nasrullah <zain.nasrullah.zn@gmail.com> 5 # License: BSD 2 clause 6 7 8 import collections 9 import warnings 10 11 import numpy as np 12 from sklearn.neighbors import KDTree 13 from sklearn.utils import check_array 14 from sklearn.utils.validation import check_is_fitted 15 from sklearn.utils.validation import check_random_state 16 17 # PyOD imports 18 from .base import BaseDetector 19 from ..utils.stat_models import pearsonr 20 from ..utils.utility import argmaxn 21 from ..utils.utility import check_detector 22 from ..utils.utility import generate_bagging_indices 23 from ..utils.utility import standardizer 24 25 26 # TODO: find random state that is causing runtime warning in pearson 27 28 class LSCP(BaseDetector): 29 """ Locally Selection Combination in Parallel Outlier Ensembles 30 31 LSCP is an unsupervised parallel outlier detection ensemble which selects 32 competent detectors in the local region of a test instance. This 33 implementation uses an Average of Maximum strategy. First, a heterogeneous 34 list of base detectors is fit to the training data and then generates a 35 pseudo ground truth for each train instance is generated by 36 taking the maximum outlier score. 37 38 For each test instance: 39 1) The local region is defined to be the set of nearest training points in 40 randomly sampled feature subspaces which occur more frequently than 41 a defined threshold over multiple iterations. 42 43 2) Using the local region, a local pseudo ground truth is defined and the 44 pearson correlation is calculated between each base detector's training 45 outlier scores and the pseudo ground truth. 46 47 3) A histogram is built out of pearson correlation scores; detectors in 48 the largest bin are selected as competent base detectors for the given 49 test instance. 50 51 4) The average outlier score of the selected competent detectors is taken 52 to be the final score. 53 54 See :cite:`zhao2019lscp` for details. 55 56 Parameters 57 ---------- 58 detector_list : List, length must be greater than 1 59 Base unsupervised outlier detectors from PyOD. (Note: requires fit and 60 decision_function methods) 61 62 local_region_size : int, optional (default=30) 63 Number of training points to consider in each iteration of the local 64 region generation process (30 by default). 65 66 local_max_features : float in (0.5, 1.), optional (default=1.0) 67 Maximum proportion of number of features to consider when defining the 68 local region (1.0 by default). 69 70 n_bins : int, optional (default=10) 71 Number of bins to use when selecting the local region 72 73 random_state : RandomState, optional (default=None) 74 A random number generator instance to define the state of the random 75 permutations generator. 76 77 contamination : float in (0., 0.5), optional (default=0.1) 78 The amount of contamination of the data set, i.e. 79 the proportion of outliers in the data set. Used when fitting to 80 define the threshold on the decision function (0.1 by default). 81 82 Attributes 83 ---------- 84 decision_scores_ : numpy array of shape (n_samples,) 85 The outlier scores of the training data. 86 The higher, the more abnormal. Outliers tend to have higher 87 scores. This value is available once the detector is fitted. 88 89 threshold_ : float 90 The threshold is based on ``contamination``. It is the 91 ``n_samples * contamination`` most abnormal samples in 92 ``decision_scores_``. 
        The threshold is calculated for generating binary outlier labels.

    labels_ : int, either 0 or 1
        The binary labels of the training data. 0 stands for inliers
        and 1 for outliers/anomalies. It is generated by applying
        ``threshold_`` on ``decision_scores_``.

    Examples
    --------
    >>> from pyod.utils.data import generate_data
    >>> from pyod.utils.utility import standardizer
    >>> from pyod.models.lscp import LSCP
    >>> from pyod.models.lof import LOF
    >>> X_train, y_train, X_test, y_test = generate_data(
    ...     n_train=50, n_test=50,
    ...     contamination=0.1, random_state=42)
    >>> X_train, X_test = standardizer(X_train, X_test)
    >>> detector_list = [LOF(), LOF()]
    >>> clf = LSCP(detector_list)
    >>> clf.fit(X_train)
    LSCP(...)
    """

    def __init__(self, detector_list, local_region_size=30,
                 local_max_features=1.0, n_bins=10,
                 random_state=None, contamination=0.1):
        super(LSCP, self).__init__(contamination=contamination)
        self.detector_list = detector_list
        self.n_clf = len(self.detector_list)
        self.local_region_size = local_region_size
        self.local_region_min = 30
        self.local_region_max = 200
        self.local_max_features = local_max_features
        self.local_min_features = 0.5
        self.local_region_iterations = 20
        self.local_region_threshold = int(self.local_region_iterations / 2)
        self.n_bins = n_bins
        self.n_selected = 1
        self.random_state = random_state

    def fit(self, X, y=None):
        """Fit detector. y is ignored in unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        # check detector_list
        if len(self.detector_list) < 2:
            raise ValueError("The detector list has less than 2 detectors.")

        for detector in self.detector_list:
            check_detector(detector)

        # check random state and input
        self.random_state = check_random_state(self.random_state)
        X = check_array(X)
        self._set_n_classes(y)
        self.n_features_ = X.shape[1]

        # store training data (assumed to be standardized by the caller)
        self.X_train_norm_ = X
        train_scores = np.zeros([self.X_train_norm_.shape[0], self.n_clf])

        # fit each base detector and collect its raw training outlier scores
        for k, detector in enumerate(self.detector_list):
            detector.fit(self.X_train_norm_)
            train_scores[:, k] = detector.decision_scores_
        self.train_scores_ = train_scores

        # set decision scores and threshold
        self.decision_scores_ = self._get_decision_scores(X)
        self._process_decision_scores()

        return self

    def decision_function(self, X):
        """Predict raw anomaly score of X using the fitted detector.

        The anomaly score of an input sample is computed based on different
        detector algorithms. For consistency, outliers are assigned with
        larger anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples. Sparse matrices are accepted only
            if they are supported by the base estimator.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
195 """ 196 # check whether model has been fit 197 check_is_fitted(self, ['training_pseudo_label_', 'train_scores_', 198 'X_train_norm_', 'n_features_']) 199 200 # check input array 201 X = check_array(X) 202 if self.n_features_ != X.shape[1]: 203 raise ValueError("Number of features of the model must " 204 "match the input. Model n_features is {0} and " 205 "input n_features is {1}." 206 "".format(self.n_features_, X.shape[1])) 207 208 # get decision scores and return 209 decision_scores = self._get_decision_scores(X) 210 return decision_scores 211 212 def _get_decision_scores(self, X): 213 """ Helper function for getting outlier scores on test data X (note: 214 model must already be fit) 215 216 Parameters 217 ---------- 218 X : numpy array, shape (n_samples, n_features) 219 Test data 220 221 Returns 222 ------- 223 pred_scores_ens : numpy array, shape (n_samples,) 224 Outlier scores for test samples 225 """ 226 227 # raise warning if local region size is outside acceptable limits 228 if (self.local_region_size < self.local_region_min) or ( 229 self.local_region_size > self.local_region_max): 230 warnings.warn("Local region size of {} is outside " 231 "recommended range [{}, {}]".format( 232 self.local_region_size, self.local_region_min, 233 self.local_region_max)) 234 235 # standardize test data and get local region for each test instance 236 X_test_norm = X 237 test_local_regions = self._get_local_region(X_test_norm) 238 239 # calculate test scores 240 test_scores = np.zeros([X_test_norm.shape[0], self.n_clf]) 241 for k, detector in enumerate(self.detector_list): 242 test_scores[:, k] = detector.decision_function(X_test_norm) 243 244 # generate standardized scores 245 train_scores_norm, test_scores_norm = standardizer(self.train_scores_, 246 test_scores) 247 248 # generate pseudo target for training --> for calculating weights 249 self.training_pseudo_label_ = np.max(train_scores_norm, 250 axis=1).reshape(-1, 1) 251 252 # placeholder for ensemble predictions 253 pred_scores_ens = np.zeros([X_test_norm.shape[0], ]) 254 255 # iterate through test instances (test_local_regions 256 # indices correspond to x_test) 257 for i, test_local_region in enumerate(test_local_regions): 258 259 # get pseudo target and training scores in local region of 260 # test instance 261 local_pseudo_ground_truth = self.training_pseudo_label_[ 262 test_local_region,].ravel() 263 local_train_scores = train_scores_norm[test_local_region, :] 264 265 # calculate pearson correlation between local pseudo ground truth 266 # and local train scores 267 pearson_corr_scores = np.zeros([self.n_clf, ]) 268 for d in range(self.n_clf): 269 pearson_corr_scores[d,] = pearsonr( 270 local_pseudo_ground_truth, local_train_scores[:, d])[0] 271 272 # return best score 273 pred_scores_ens[i,] = np.mean( 274 test_scores_norm[ 275 i, self._get_competent_detectors(pearson_corr_scores)]) 276 277 return pred_scores_ens 278 279 def _get_local_region(self, X_test_norm): 280 """ Get local region for each test instance 281 282 Parameters 283 ---------- 284 X_test_norm : numpy array, shape (n_samples, n_features) 285 Normalized test data 286 287 Returns 288 ------- 289 final_local_region_list : List of lists, shape of [n_samples, [local_region]] 290 Indices of training samples in the local region of each test sample 291 """ 292 293 # Initialize the local region list 294 local_region_list = [[]] * X_test_norm.shape[0] 295 296 if self.local_max_features > 1.0: 297 warnings.warn( 298 "Local max features greater than 1.0, reducing to 1.0") 299 
            self.local_max_features = 1.0

        if self.X_train_norm_.shape[1] * self.local_min_features < 1:
            warnings.warn(
                "Local min features smaller than 1, increasing to 1.0")
            self.local_min_features = 1.0

        # perform multiple iterations
        for _ in range(self.local_region_iterations):

            # if min and max are the same, then use all features
            if self.local_max_features == self.local_min_features:
                features = range(0, self.X_train_norm_.shape[1])
                warnings.warn("Local min features equals local max features; "
                              "using all features instead.")

            else:
                # randomly generate feature subspaces
                features = generate_bagging_indices(
                    self.random_state,
                    bootstrap_features=False,
                    n_features=self.X_train_norm_.shape[1],
                    min_features=int(
                        self.X_train_norm_.shape[1] * self.local_min_features),
                    max_features=int(
                        self.X_train_norm_.shape[1] * self.local_max_features))

            # build a KDTree on the training subspace
            tree = KDTree(self.X_train_norm_[:, features])

            # find neighbors of each test instance
            _, ind_arr = tree.query(X_test_norm[:, features],
                                    k=self.local_region_size)

            # add neighbors to the local region list
            for j in range(X_test_norm.shape[0]):
                local_region_list[j] = local_region_list[j] + \
                                       ind_arr[j, :].tolist()

        # keep nearby points which occur at least local_region_threshold times
        final_local_region_list = [[]] * X_test_norm.shape[0]
        for j in range(X_test_norm.shape[0]):
            tmp = [item for item, count in collections.Counter(
                local_region_list[j]).items() if
                   count > self.local_region_threshold]

            # if too few points qualify, relax the threshold until at least
            # two training points remain in the local region
            decrease_value = 0
            while len(tmp) < 2:
                decrease_value = decrease_value + 1
                assert decrease_value < self.local_region_threshold
                tmp = [item for item, count in
                       collections.Counter(local_region_list[j]).items() if
                       count > (self.local_region_threshold - decrease_value)]

            final_local_region_list[j] = tmp

        return final_local_region_list

    def _get_competent_detectors(self, scores):
        """Identify competent base detectors based on correlation scores.

        Parameters
        ----------
        scores : numpy array, shape (n_clf,)
            Correlation scores for each classifier (for a specific
            test instance)

        Returns
        -------
        candidates : List
            Indices for competent detectors (for the given test instance)
        """

        # create histogram of correlation scores
        scores = scores.reshape(-1, 1)

        # TODO: handle when Pearson score is 0
        # if scores contain nan, change it to 0
        if np.isnan(scores).any():
            scores = np.nan_to_num(scores)

        if self.n_bins > self.n_clf:
            warnings.warn(
                "The number of histogram bins is greater than the number of "
                "classifiers, reducing n_bins to n_clf.")
            self.n_bins = self.n_clf
        hist, bin_edges = np.histogram(scores, bins=self.n_bins)

        # find the n_selected largest bins
        max_bins = argmaxn(hist, n=self.n_selected)
        candidates = []

        # iterate through bins
        for max_bin in max_bins:
            # determine which detectors fall inside this bin
            selected = np.where((scores >= bin_edges[max_bin])
                                & (scores <= bin_edges[max_bin + 1]))

            # add to the list of candidates
            candidates = candidates + selected[0].tolist()

        return candidates

    def __len__(self):
        return len(self.detector_list)

    def __getitem__(self, index):
        return self.detector_list[index]

    def __iter__(self):
        return iter(self.detector_list)
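

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module: it assumes pyod
    # is installed so the absolute imports below resolve, and it mirrors the
    # class docstring example with a slightly larger, heterogeneous detector
    # list (LOF and KNN). The unpacking order of generate_data follows the
    # docstring above; newer pyod versions may instead return
    # (X_train, X_test, y_train, y_test).
    from pyod.models.knn import KNN
    from pyod.models.lof import LOF
    from pyod.models.lscp import LSCP
    from pyod.utils.data import generate_data
    from pyod.utils.utility import standardizer

    # synthetic data with 10% outliers
    X_train, y_train, X_test, y_test = generate_data(
        n_train=200, n_test=100, contamination=0.1, random_state=42)

    # LSCP expects standardized inputs, as in the docstring example
    X_train, X_test = standardizer(X_train, X_test)

    # at least two base detectors are required
    detector_list = [LOF(n_neighbors=15), LOF(n_neighbors=25), KNN()]
    clf = LSCP(detector_list, local_region_size=30, random_state=42)
    clf.fit(X_train)

    # raw outlier scores and binary labels for the test split
    test_scores = clf.decision_function(X_test)
    test_labels = clf.predict(X_test)
    print(test_scores[:5])
    print(test_labels[:5])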