/ pyod / models / iforest.py
iforest.py
  1  # -*- coding: utf-8 -*-
  2  """IsolationForest Outlier Detector. Implemented on scikit-learn library.
  3  """
  4  # Author: Yue Zhao <yzhao062@gmail.com>
  5  # License: BSD 2 clause
  6  
  7  
  8  import numpy as np
  9  from joblib import Parallel
 10  from joblib.parallel import delayed
 11  from sklearn.ensemble import IsolationForest
 12  from sklearn.utils import check_array
 13  from sklearn.utils.validation import check_is_fitted
 14  
 15  from .base import BaseDetector
 16  # noinspection PyProtectedMember
 17  from ..utils.utility import invert_order
 18  
 19  
 20  # TODO: the behavior of Isolation Forest changes in sklearn 0.22. See below.
 21  # From 0.22 on, scikit-learn adjusts decision_function values by an offset
 22  # so that values below zero indicate outliers. In other words, it is a
 23  # constant shift, which SHOULD NOT affect the results of PyOD at all because
 24  # the order of the scores is still preserved.
 25  
 26  # Behaviour of the decision_function which can be either ‘old’ or ‘new’.
 27  # Passing behaviour='new' makes the decision_function change to match other
 28  # anomaly detection algorithm API which will be the default behaviour in the
 29  # future. As explained in details in the offset_ attribute documentation,
 30  # the decision_function becomes dependent on the contamination parameter,
 31  # in such a way that 0 becomes its natural threshold to detect outliers.
 32  
 33  # offset_ : float
 34  # Offset used to define the decision function from the raw scores.
 35  # We have the relation: decision_function = score_samples - offset_.
 36  # Assuming behaviour == ‘new’, offset_ is defined as follows.
 37  # When the contamination parameter is set to “auto”,
 38  # the offset is equal to -0.5 as the scores of inliers are close to 0 and the
 39  # scores of outliers are close to -1. When a contamination parameter different
 40  # than “auto” is provided, the offset is defined in such a way we obtain the
 41  # expected number of outliers (samples with decision function < 0) in training.
 42  # Assuming the behaviour parameter is set to ‘old’,
 43  # we always have offset_ = -0.5, making the decision function independent from
 44  # the contamination parameter.
 45  
 46  # check https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.IsolationForest.html for more information
 47  
 48  
 49  class IForest(BaseDetector):
 50      """Wrapper of scikit-learn Isolation Forest with more functionalities.
 51  
 52      The IsolationForest 'isolates' observations by randomly selecting a
 53      feature and then randomly selecting a split value between the maximum and
 54      minimum values of the selected feature.
 55      See :cite:`liu2008isolation,liu2012isolation` for details.
 56  
 57      Since recursive partitioning can be represented by a tree structure, the
 58      number of splittings required to isolate a sample is equivalent to the path
 59      length from the root node to the terminating node.
 60  
 61      This path length, averaged over a forest of such random trees, is a
 62      measure of normality and our decision function.
 63  
 64      Random partitioning produces noticeably shorter paths for anomalies.
 65      Hence, when a forest of random trees collectively produce shorter path
 66      lengths for particular samples, they are highly likely to be anomalies.
 67  
 68      Parameters
 69      ----------
 70      n_estimators : int, optional (default=100)
 71          The number of base estimators in the ensemble.
 72  
 73      max_samples : int or float, optional (default="auto")
 74          The number of samples to draw from X to train each base estimator.
 75  
 76              - If int, then draw `max_samples` samples.
 77              - If float, then draw `max_samples * X.shape[0]` samples.
 78              - If "auto", then `max_samples=min(256, n_samples)`.
 79  
 80          If max_samples is larger than the number of samples provided,
 81          all samples will be used for all trees (no sampling).
 82  
 83      contamination : float in (0., 0.5), optional (default=0.1)
 84          The amount of contamination of the data set, i.e. the proportion
 85          of outliers in the data set. Used when fitting to define the threshold
 86          on the decision function.
 87  
 88      max_features : int or float, optional (default=1.0)
 89          The number of features to draw from X to train each base estimator.
 90  
 91              - If int, then draw `max_features` features.
 92              - If float, then draw `max_features * X.shape[1]` features.
 93  
 94      bootstrap : bool, optional (default=False)
 95          If True, individual trees are fit on random subsets of the training
 96          data sampled with replacement. If False, sampling without replacement
 97          is performed.
 98  
 99      n_jobs : integer, optional (default=1)
100          The number of jobs to run in parallel for both `fit` and `predict`.
101          If -1, then the number of jobs is set to the number of cores.
102  
103      behaviour : str, default='old'
104          Behaviour of the ``decision_function`` which can be either 'old' or
105          'new'. Passing ``behaviour='new'`` makes the ``decision_function``
106          change to match other anomaly detection algorithm API which will be
107          the default behaviour in the future. As explained in details in the
108          ``offset_`` attribute documentation, the ``decision_function`` becomes
109          dependent on the contamination parameter, in such a way that 0 becomes
110          its natural threshold to detect outliers.
111  
112          .. versionadded:: 0.7.0
113             ``behaviour`` is added in 0.7.0 for back-compatibility purpose.
114  
115          .. deprecated:: 0.20
116             ``behaviour='old'`` is deprecated in sklearn 0.20 and will not be
117             possible in 0.22.
118  
119          .. deprecated:: 0.22
120             ``behaviour`` parameter will be deprecated in sklearn 0.22 and
121             removed in 0.24.
122  
123          .. warning::
124              Only applicable for sklearn 0.20 above.
125  
126      random_state : int, RandomState instance or None, optional (default=None)
127          If int, random_state is the seed used by the random number generator;
128          If RandomState instance, random_state is the random number generator;
129          If None, the random number generator is the RandomState instance used
130          by `np.random`.
131  
132      verbose : int, optional (default=0)
133          Controls the verbosity of the tree building process.
134  
135      Attributes
136      ----------
137      estimators_ : list of DecisionTreeClassifier
138          The collection of fitted sub-estimators.
139  
140      estimators_samples_ : list of arrays
141          The subset of drawn samples (i.e., the in-bag samples) for each base
142          estimator.
143  
144      max_samples_ : integer
145          The actual number of samples
146  
147      decision_scores_ : numpy array of shape (n_samples,)
148          The outlier scores of the training data.
149          The higher, the more abnormal. Outliers tend to have higher
150          scores. This value is available once the detector is
151          fitted.
152  
153      threshold_ : float
154          The threshold is based on ``contamination``. It is the
155          ``n_samples * contamination`` most abnormal samples in
156          ``decision_scores_``. The threshold is calculated for generating
157          binary outlier labels.
158  
159      labels_ : int, either 0 or 1
160          The binary labels of the training data. 0 stands for inliers
161          and 1 for outliers/anomalies. It is generated by applying
162          ``threshold_`` on ``decision_scores_``.
163      """
164  
165      def __init__(self, n_estimators=100,
166                   max_samples="auto",
167                   contamination=0.1,
168                   max_features=1.,
169                   bootstrap=False,
170                   n_jobs=1,
171                   behaviour='old',
172                   random_state=None,
173                   verbose=0):
174          super(IForest, self).__init__(contamination=contamination)
175          self.n_estimators = n_estimators
176          self.max_samples = max_samples
177          self.max_features = max_features
178          self.bootstrap = bootstrap
179          self.n_jobs = n_jobs
180          self.behaviour = behaviour
181          self.random_state = random_state
182          self.verbose = verbose
183  
184      def fit(self, X, y=None):
185          """Fit detector. y is ignored in unsupervised methods.
186  
187          Parameters
188          ----------
189          X : numpy array of shape (n_samples, n_features)
190              The input samples.
191  
192          y : Ignored
193              Not used, present for API consistency by convention.
194  
195          Returns
196          -------
197          self : object
198              Fitted estimator.
199          """
200          # validate inputs X and y (optional)
201          X = check_array(X)
202          self._set_n_classes(y)
203  
204          # In sklearn 0.20+ new behaviour is added (arg behaviour={'new','old'})
205          # to IsolationForest that shifts the location of the anomaly scores
206          # noinspection PyProtectedMember
207  
208          self.detector_ = IsolationForest(n_estimators=self.n_estimators,
209                                           max_samples=self.max_samples,
210                                           contamination=self.contamination,
211                                           max_features=self.max_features,
212                                           bootstrap=self.bootstrap,
213                                           n_jobs=self.n_jobs,
214                                           random_state=self.random_state,
215                                           verbose=self.verbose)
216  
217          self.detector_.fit(X=X, y=None, sample_weight=None)
218  
219          # invert decision_scores_. Outliers comes with higher outlier scores.
220          self.decision_scores_ = invert_order(
221              self.detector_.decision_function(X))
222          self._process_decision_scores()
223          return self
224  
225      def decision_function(self, X):
226          """Predict raw anomaly score of X using the fitted detector.
227  
228          The anomaly score of an input sample is computed based on different
229          detector algorithms. For consistency, outliers are assigned with
230          larger anomaly scores.
231  
232          Parameters
233          ----------
234          X : numpy array of shape (n_samples, n_features)
235              The training input samples. Sparse matrices are accepted only
236              if they are supported by the base estimator.
237  
238          Returns
239          -------
240          anomaly_scores : numpy array of shape (n_samples,)
241              The anomaly score of the input samples.
242          """
243          check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])
244          # invert outlier scores. Outliers comes with higher outlier scores
245          return invert_order(self.detector_.decision_function(X))
246  
247      @property
248      def estimators_(self):
249          """The collection of fitted sub-estimators.
250          Decorator for scikit-learn Isolation Forest attributes.
251          """
252          return self.detector_.estimators_
253  
254      @property
255      def estimators_samples_(self):
256          """The subset of drawn samples (i.e., the in-bag samples) for
257          each base estimator.
258          Decorator for scikit-learn Isolation Forest attributes.
259          """
260          return self.detector_.estimators_samples_
261  
262      @property
263      def max_samples_(self):
264          """The actual number of samples.
265          Decorator for scikit-learn Isolation Forest attributes.
266          """
267          return self.detector_.max_samples_
268  
269      @property
270      def estimators_features_(self):
271          """The indeces of the subset of features used to train the estimators.
272          Decorator for scikit-learn Isolation Forest attributes.
273          """
274          return self.detector_.estimators_features_
275  
276      @property
277      def n_features_in_(self):
278          """The number of features seen during the fit.
279          Decorator for scikit-learn Isolation Forest attributes.
280          """
281          return self.detector_.n_features_in_
282  
283      @property
284      def offset_(self):
285          """Offset used to define the decision function from the raw scores.
286          Decorator for scikit-learn Isolation Forest attributes.
287          """
288          return self.detector_.offset_
289  
290      @property
291      def feature_importances_(self):
292          """The impurity-based feature importance. The higher, the more
293          important the feature. The importance of a feature is computed as the
294          (normalized) total reduction of the criterion brought by that feature.
295          It is also known as the Gini importance.
296  
297          .. warning::
298          impurity-based feature importance can be misleading for
299          high cardinality features (many unique values). See
300          https://scikit-learn.org/stable/modules/generated/sklearn.inspection.permutation_importance.html
301          as an alternative.
302  
303          Returns
304          -------
305          feature_importances_ : ndarray of shape (n_features,)
306              The values of this array sum to 1, unless all trees are single node
307              trees consisting of only the root node, in which case it will be an
308              array of zeros.
309          """
310          check_is_fitted(self)
311          all_importances = Parallel(
312              n_jobs=self.n_jobs)(
313              delayed(getattr)(tree, "feature_importances_")
314              for tree in self.detector_.estimators_
315              if tree.tree_.node_count > 1
316          )
317  
318          if not all_importances:
319              return np.zeros(self.n_features_in_, dtype=np.float64)
320  
321          all_importances = np.mean(all_importances, axis=0, dtype=np.float64)
322          return all_importances / np.sum(all_importances)