# pyod/models/lscp.py
  1  """Locally Selective Combination of Parallel Outlier Ensembles (LSCP).
  2  Adapted from the original implementation.
  3  """
  4  # Author: Zain Nasrullah <zain.nasrullah.zn@gmail.com>
  5  # License: BSD 2 clause
  6  
  7  
  8  import collections
  9  import warnings
 10  
 11  import numpy as np
 12  from sklearn.neighbors import KDTree
 13  from sklearn.utils import check_array
 14  from sklearn.utils.validation import check_is_fitted
 15  from sklearn.utils.validation import check_random_state
 16  
 17  # PyOD imports
 18  from .base import BaseDetector
 19  from ..utils.stat_models import pearsonr
 20  from ..utils.utility import argmaxn
 21  from ..utils.utility import check_detector
 22  from ..utils.utility import generate_bagging_indices
 23  from ..utils.utility import standardizer
 24  
 25  
 26  # TODO: find random state that is causing runtime warning in pearson
 27  
class LSCP(BaseDetector):
    """ Locally Selective Combination in Parallel Outlier Ensembles (LSCP)

    LSCP is an unsupervised parallel outlier detection ensemble which selects
    competent detectors in the local region of a test instance. This
    implementation uses an Average of Maximum strategy. First, a heterogeneous
    list of base detectors is fit to the training data, and then a
    pseudo ground truth for each train instance is generated by
    taking the maximum outlier score across all base detectors.

    For each test instance:
    1) The local region is defined to be the set of nearest training points in
    randomly sampled feature subspaces which occur more frequently than
    a defined threshold over multiple iterations.

    2) Using the local region, a local pseudo ground truth is defined and the
    pearson correlation is calculated between each base detector's training
    outlier scores and the pseudo ground truth.

    3) A histogram is built out of pearson correlation scores; detectors in
    the largest bin are selected as competent base detectors for the given
    test instance.

    4) The average outlier score of the selected competent detectors is taken
    to be the final score.

    See :cite:`zhao2019lscp` for details.

    Parameters
    ----------
    detector_list : List, length must be greater than 1
        Base unsupervised outlier detectors from PyOD. (Note: requires fit and
        decision_function methods)

    local_region_size : int, optional (default=30)
        Number of training points to consider in each iteration of the local
        region generation process (30 by default).

    local_max_features : float in (0.5, 1.), optional (default=1.0)
        Maximum proportion of number of features to consider when defining the
        local region (1.0 by default).

    n_bins : int, optional (default=10)
        Number of bins to use when selecting the local region

    random_state : RandomState, optional (default=None)
        A random number generator instance to define the state of the random
        permutations generator.

    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set, i.e.
        the proportion of outliers in the data set. Used when fitting to
        define the threshold on the decision function (0.1 by default).

    Attributes
    ----------
    decision_scores_ : numpy array of shape (n_samples,)
        The outlier scores of the training data.
        The higher, the more abnormal. Outliers tend to have higher
        scores. This value is available once the detector is fitted.

    threshold_ : float
        The threshold is based on ``contamination``. It is the
        ``n_samples * contamination`` most abnormal samples in
        ``decision_scores_``. The threshold is calculated for generating
        binary outlier labels.

    labels_ : int, either 0 or 1
        The binary labels of the training data. 0 stands for inliers
        and 1 for outliers/anomalies. It is generated by applying
        ``threshold_`` on ``decision_scores_``.

    Examples
    --------
    >>> from pyod.utils.data import generate_data
    >>> from pyod.utils.utility import standardizer
    >>> from pyod.models.lscp import LSCP
    >>> from pyod.models.lof import LOF
    >>> X_train, y_train, X_test, y_test = generate_data(
    ...     n_train=50, n_test=50,
    ...     contamination=0.1, random_state=42)
    >>> X_train, X_test = standardizer(X_train, X_test)
    >>> detector_list = [LOF(), LOF()]
    >>> clf = LSCP(detector_list)
    >>> clf.fit(X_train)
    LSCP(...)
    """

    def __init__(self, detector_list, local_region_size=30,
                 local_max_features=1.0, n_bins=10,
                 random_state=None, contamination=0.1):
        super(LSCP, self).__init__(contamination=contamination)
        self.detector_list = detector_list
        # number of base detectors in the ensemble
        self.n_clf = len(self.detector_list)
        self.local_region_size = local_region_size
        # recommended bounds for local_region_size; only used to emit a
        # warning in _get_decision_scores, never enforced
        self.local_region_min = 30
        self.local_region_max = 200
        self.local_max_features = local_max_features
        # hard-coded lower bound on the sampled feature proportion
        self.local_min_features = 0.5
        # number of random-subspace iterations used to build local regions
        self.local_region_iterations = 20
        # a training point must appear in more than half of the iterations
        # to be kept in a test instance's local region
        self.local_region_threshold = int(self.local_region_iterations / 2)
        self.n_bins = n_bins
        # number of largest histogram bins from which competent detectors
        # are drawn (Average-of-Maximum uses the single largest bin)
        self.n_selected = 1
        self.random_state = random_state

    def fit(self, X, y=None):
        """Fit detector. y is ignored in unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        # check detector_list
        if len(self.detector_list) < 2:
            raise ValueError("The detector list has less than 2 detectors.")

        for detector in self.detector_list:
            check_detector(detector)

        # check random state and input
        # NOTE(review): this rebinds the random_state constructor parameter
        # to a RandomState instance, which deviates from the sklearn
        # convention of leaving init params untouched
        self.random_state = check_random_state(self.random_state)
        X = check_array(X)
        self._set_n_classes(y)
        self.n_features_ = X.shape[1]

        # NOTE: despite the attribute name, X is stored as-is here — no
        # normalization is applied; callers are expected to standardize
        # beforehand (see the class docstring example)
        self.X_train_norm_ = X
        train_scores = np.zeros([self.X_train_norm_.shape[0], self.n_clf])

        # fit each base detector and collect its raw training outlier
        # scores column-by-column (standardization happens later, jointly
        # with the test scores, in _get_decision_scores)
        for k, detector in enumerate(self.detector_list):
            detector.fit(self.X_train_norm_)
            train_scores[:, k] = detector.decision_scores_
        self.train_scores_ = train_scores

        # set decision scores and threshold
        self.decision_scores_ = self._get_decision_scores(X)
        self._process_decision_scores()

        return self

    def decision_function(self, X):
        """Predict raw anomaly score of X using the fitted detector.

        The anomaly score of an input sample is computed based on different
        detector algorithms. For consistency, outliers are assigned with
        larger anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The training input samples. Sparse matrices are accepted only
            if they are supported by the base estimator.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        # check whether model has been fit
        check_is_fitted(self, ['training_pseudo_label_', 'train_scores_',
                               'X_train_norm_', 'n_features_'])

        # check input array
        X = check_array(X)
        if self.n_features_ != X.shape[1]:
            raise ValueError("Number of features of the model must "
                             "match the input. Model n_features is {0} and "
                             "input n_features is {1}."
                             "".format(self.n_features_, X.shape[1]))

        # get decision scores and return
        decision_scores = self._get_decision_scores(X)
        return decision_scores

    def _get_decision_scores(self, X):
        """ Helper function for getting outlier scores on test data X (note:
        model must already be fit)

        Parameters
        ----------
        X : numpy array, shape (n_samples, n_features)
            Test data

        Returns
        -------
        pred_scores_ens : numpy array, shape (n_samples,)
            Outlier scores for test samples
        """

        # raise warning if local region size is outside acceptable limits
        if (self.local_region_size < self.local_region_min) or (
                self.local_region_size > self.local_region_max):
            warnings.warn("Local region size of {} is outside "
                          "recommended range [{}, {}]".format(
                self.local_region_size, self.local_region_min,
                self.local_region_max))

        # NOTE: as in fit, X is used as-is despite the "_norm" name — no
        # standardization is applied to the raw features here
        X_test_norm = X
        test_local_regions = self._get_local_region(X_test_norm)

        # calculate test scores
        test_scores = np.zeros([X_test_norm.shape[0], self.n_clf])
        for k, detector in enumerate(self.detector_list):
            test_scores[:, k] = detector.decision_function(X_test_norm)

        # z-normalize train and test scores jointly, using the training
        # scores' statistics (per-detector columns)
        train_scores_norm, test_scores_norm = standardizer(self.train_scores_,
                                                           test_scores)

        # pseudo ground truth: per-sample maximum of the standardized
        # training scores across all base detectors, as a column vector
        self.training_pseudo_label_ = np.max(train_scores_norm,
                                             axis=1).reshape(-1, 1)

        # placeholder for ensemble predictions
        pred_scores_ens = np.zeros([X_test_norm.shape[0], ])

        # iterate through test instances (test_local_regions
        # indices correspond to x_test)
        for i, test_local_region in enumerate(test_local_regions):

            # get pseudo target and training scores in local region of
            # test instance (trailing comma makes a tuple index; equivalent
            # to plain fancy indexing on axis 0)
            local_pseudo_ground_truth = self.training_pseudo_label_[
                test_local_region,].ravel()
            local_train_scores = train_scores_norm[test_local_region, :]

            # calculate pearson correlation between local pseudo ground truth
            # and local train scores; [0] keeps only the correlation
            # coefficient
            pearson_corr_scores = np.zeros([self.n_clf, ])
            for d in range(self.n_clf):
                pearson_corr_scores[d,] = pearsonr(
                    local_pseudo_ground_truth, local_train_scores[:, d])[0]

            # final score: mean of the standardized test scores of the
            # competent detectors selected for this instance
            pred_scores_ens[i,] = np.mean(
                test_scores_norm[
                    i, self._get_competent_detectors(pearson_corr_scores)])

        return pred_scores_ens

    def _get_local_region(self, X_test_norm):
        """ Get local region for each test instance

        Parameters
        ----------
        X_test_norm : numpy array, shape (n_samples, n_features)
            Normalized test data

        Returns
        -------
        final_local_region_list : List of lists, shape of [n_samples, [local_region]]
            Indices of training samples in the local region of each test sample
        """

        # Initialize the local region list.
        # NOTE: [[]] * n aliases one shared list n times; this is safe ONLY
        # because entries below are rebound with `+` (a new list), never
        # mutated in place
        local_region_list = [[]] * X_test_norm.shape[0]

        # NOTE(review): these two checks mutate instance attributes during
        # prediction (self.local_max_features / self.local_min_features),
        # so the clamping persists across calls
        if self.local_max_features > 1.0:
            warnings.warn(
                "Local max features greater than 1.0, reducing to 1.0")
            self.local_max_features = 1.0

        if self.X_train_norm_.shape[1] * self.local_min_features < 1:
            warnings.warn(
                "Local min features smaller than 1, increasing to 1.0")
            self.local_min_features = 1.0

        # perform multiple iterations
        for _ in range(self.local_region_iterations):

            # if min and max are the same, then use all features
            if self.local_max_features == self.local_min_features:
                features = range(0, self.X_train_norm_.shape[1])
                warnings.warn("Local min features equals local max features; "
                              "use all features instead.")

            else:
                # randomly sample a feature subspace of size between
                # min_features and max_features (consumes self.random_state)
                features = generate_bagging_indices(
                    self.random_state,
                    bootstrap_features=False,
                    n_features=self.X_train_norm_.shape[1],
                    min_features=int(
                        self.X_train_norm_.shape[1] * self.local_min_features),
                    max_features=int(
                        self.X_train_norm_.shape[1] * self.local_max_features))

            # build KDTree out of training subspace
            tree = KDTree(self.X_train_norm_[:, features])

            # Find the local_region_size nearest training neighbors of each
            # test instance within this subspace (distances are discarded)
            _, ind_arr = tree.query(X_test_norm[:, features],
                                    k=self.local_region_size)

            # accumulate neighbor indices across iterations (rebinds the
            # entry, so the shared-[] initialization above stays harmless)
            for j in range(X_test_norm.shape[0]):
                local_region_list[j] = local_region_list[j] + \
                                       ind_arr[j, :].tolist()

        # keep nearby points which occur at least local_region_threshold times
        final_local_region_list = [[]] * X_test_norm.shape[0]
        for j in range(X_test_norm.shape[0]):
            tmp = [item for item, count in collections.Counter(
                local_region_list[j]).items() if
                   count > self.local_region_threshold]
            # if the region has fewer than 2 points, relax the occurrence
            # threshold one step at a time until it is large enough
            decrease_value = 0
            while len(tmp) < 2:
                decrease_value = decrease_value + 1
                # NOTE(review): assert is stripped under `python -O`, which
                # would turn an exhausted relaxation into an infinite loop
                assert decrease_value < self.local_region_threshold
                tmp = [item for item, count in
                       collections.Counter(local_region_list[j]).items() if
                       count > (self.local_region_threshold - decrease_value)]

            final_local_region_list[j] = tmp

        return final_local_region_list

    def _get_competent_detectors(self, scores):
        """ Identifies competent base detectors based on correlation scores

        Parameters
        ----------
        scores : numpy array, shape (n_clf,)
            Correlation scores for each classifier (for a specific
            test instance)

        Returns
        -------
        candidates : List
            Indices for competent detectors (for given test instance)
        """

        # create histogram of correlation scores
        scores = scores.reshape(-1, 1)

        # TODO: handle when Pearson score is 0
        # if scores contain nan, change it to 0
        if np.isnan(scores).any():
            scores = np.nan_to_num(scores)

        # NOTE(review): this clamp mutates self.n_bins during prediction,
        # so it persists across calls
        if self.n_bins > self.n_clf:
            warnings.warn(
                "The number of histogram bins is greater than the number of "
                "classifiers, reducing n_bins to n_clf.")
            self.n_bins = self.n_clf
        hist, bin_edges = np.histogram(scores, bins=self.n_bins)

        # find n_selected largest bins
        max_bins = argmaxn(hist, n=self.n_selected)
        candidates = []

        # iterate through bins
        for max_bin in max_bins:
            # determine which detectors fall inside this bin; both edges are
            # inclusive, so a detector sitting exactly on a shared edge can
            # be selected by either adjacent bin
            selected = np.where((scores >= bin_edges[max_bin])
                                & (scores <= bin_edges[max_bin + 1]))

            # add to list of candidates
            candidates = candidates + selected[0].tolist()

        return candidates

    def __len__(self):
        """Return the number of base detectors in the ensemble."""
        return len(self.detector_list)

    def __getitem__(self, index):
        """Return the base detector at the given position."""
        return self.detector_list[index]

    def __iter__(self):
        """Iterate over the base detectors."""
        return iter(self.detector_list)