iforest.py
# -*- coding: utf-8 -*-
"""IsolationForest Outlier Detector. Implemented on the scikit-learn library.
"""
# Author: Yue Zhao <yzhao062@gmail.com>
# License: BSD 2 clause


import numpy as np
from joblib import Parallel
from joblib.parallel import delayed
from sklearn.ensemble import IsolationForest
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted

from .base import BaseDetector
# noinspection PyProtectedMember
from ..utils.utility import invert_order


# TODO: behavior of Isolation Forest will change in sklearn 0.22. See below.
# In 0.22, scikit-learn will start adjusting decision_function values by an
# offset so that values below zero are treated as outliers. In other words,
# it is an absolute shift, which SHOULD NOT affect the result of PyOD at all,
# as the order is still preserved.

# Behaviour of the decision_function, which can be either 'old' or 'new'.
# Passing behaviour='new' makes the decision_function change to match the API
# of other anomaly detection algorithms, which will be the default behaviour
# in the future. As explained in detail in the offset_ attribute
# documentation, the decision_function becomes dependent on the contamination
# parameter, in such a way that 0 becomes its natural threshold to detect
# outliers.

# offset_ : float
#   Offset used to define the decision function from the raw scores.
#   We have the relation: decision_function = score_samples - offset_.
#   Assuming behaviour == 'new', offset_ is defined as follows.
#   When the contamination parameter is set to "auto", the offset is equal to
#   -0.5, as the scores of inliers are close to 0 and the scores of outliers
#   are close to -1. When a contamination parameter different from "auto" is
#   provided, the offset is defined in such a way that we obtain the expected
#   number of outliers (samples with decision function < 0) in training.
#   Assuming the behaviour parameter is set to 'old', we always have
#   offset_ = -0.5, making the decision function independent from the
#   contamination parameter.

# Check https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.IsolationForest.html
# for more information.


class IForest(BaseDetector):
    """Wrapper of scikit-learn Isolation Forest with more functionalities.

    The IsolationForest 'isolates' observations by randomly selecting a
    feature and then randomly selecting a split value between the maximum and
    minimum values of the selected feature.
    See :cite:`liu2008isolation,liu2012isolation` for details.

    Since recursive partitioning can be represented by a tree structure, the
    number of splittings required to isolate a sample is equivalent to the
    path length from the root node to the terminating node.

    This path length, averaged over a forest of such random trees, is a
    measure of normality and our decision function.

    Random partitioning produces noticeably shorter paths for anomalies.
    Hence, when a forest of random trees collectively produces shorter path
    lengths for particular samples, they are highly likely to be anomalies.

    Parameters
    ----------
    n_estimators : int, optional (default=100)
        The number of base estimators in the ensemble.

    max_samples : int or float, optional (default="auto")
        The number of samples to draw from X to train each base estimator.

        - If int, then draw `max_samples` samples.
        - If float, then draw `max_samples * X.shape[0]` samples.
78 - If "auto", then `max_samples=min(256, n_samples)`. 79 80 If max_samples is larger than the number of samples provided, 81 all samples will be used for all trees (no sampling). 82 83 contamination : float in (0., 0.5), optional (default=0.1) 84 The amount of contamination of the data set, i.e. the proportion 85 of outliers in the data set. Used when fitting to define the threshold 86 on the decision function. 87 88 max_features : int or float, optional (default=1.0) 89 The number of features to draw from X to train each base estimator. 90 91 - If int, then draw `max_features` features. 92 - If float, then draw `max_features * X.shape[1]` features. 93 94 bootstrap : bool, optional (default=False) 95 If True, individual trees are fit on random subsets of the training 96 data sampled with replacement. If False, sampling without replacement 97 is performed. 98 99 n_jobs : integer, optional (default=1) 100 The number of jobs to run in parallel for both `fit` and `predict`. 101 If -1, then the number of jobs is set to the number of cores. 102 103 behaviour : str, default='old' 104 Behaviour of the ``decision_function`` which can be either 'old' or 105 'new'. Passing ``behaviour='new'`` makes the ``decision_function`` 106 change to match other anomaly detection algorithm API which will be 107 the default behaviour in the future. As explained in details in the 108 ``offset_`` attribute documentation, the ``decision_function`` becomes 109 dependent on the contamination parameter, in such a way that 0 becomes 110 its natural threshold to detect outliers. 111 112 .. versionadded:: 0.7.0 113 ``behaviour`` is added in 0.7.0 for back-compatibility purpose. 114 115 .. deprecated:: 0.20 116 ``behaviour='old'`` is deprecated in sklearn 0.20 and will not be 117 possible in 0.22. 118 119 .. deprecated:: 0.22 120 ``behaviour`` parameter will be deprecated in sklearn 0.22 and 121 removed in 0.24. 122 123 .. warning:: 124 Only applicable for sklearn 0.20 above. 125 126 random_state : int, RandomState instance or None, optional (default=None) 127 If int, random_state is the seed used by the random number generator; 128 If RandomState instance, random_state is the random number generator; 129 If None, the random number generator is the RandomState instance used 130 by `np.random`. 131 132 verbose : int, optional (default=0) 133 Controls the verbosity of the tree building process. 134 135 Attributes 136 ---------- 137 estimators_ : list of DecisionTreeClassifier 138 The collection of fitted sub-estimators. 139 140 estimators_samples_ : list of arrays 141 The subset of drawn samples (i.e., the in-bag samples) for each base 142 estimator. 143 144 max_samples_ : integer 145 The actual number of samples 146 147 decision_scores_ : numpy array of shape (n_samples,) 148 The outlier scores of the training data. 149 The higher, the more abnormal. Outliers tend to have higher 150 scores. This value is available once the detector is 151 fitted. 152 153 threshold_ : float 154 The threshold is based on ``contamination``. It is the 155 ``n_samples * contamination`` most abnormal samples in 156 ``decision_scores_``. The threshold is calculated for generating 157 binary outlier labels. 158 159 labels_ : int, either 0 or 1 160 The binary labels of the training data. 0 stands for inliers 161 and 1 for outliers/anomalies. It is generated by applying 162 ``threshold_`` on ``decision_scores_``. 
163 """ 164 165 def __init__(self, n_estimators=100, 166 max_samples="auto", 167 contamination=0.1, 168 max_features=1., 169 bootstrap=False, 170 n_jobs=1, 171 behaviour='old', 172 random_state=None, 173 verbose=0): 174 super(IForest, self).__init__(contamination=contamination) 175 self.n_estimators = n_estimators 176 self.max_samples = max_samples 177 self.max_features = max_features 178 self.bootstrap = bootstrap 179 self.n_jobs = n_jobs 180 self.behaviour = behaviour 181 self.random_state = random_state 182 self.verbose = verbose 183 184 def fit(self, X, y=None): 185 """Fit detector. y is ignored in unsupervised methods. 186 187 Parameters 188 ---------- 189 X : numpy array of shape (n_samples, n_features) 190 The input samples. 191 192 y : Ignored 193 Not used, present for API consistency by convention. 194 195 Returns 196 ------- 197 self : object 198 Fitted estimator. 199 """ 200 # validate inputs X and y (optional) 201 X = check_array(X) 202 self._set_n_classes(y) 203 204 # In sklearn 0.20+ new behaviour is added (arg behaviour={'new','old'}) 205 # to IsolationForest that shifts the location of the anomaly scores 206 # noinspection PyProtectedMember 207 208 self.detector_ = IsolationForest(n_estimators=self.n_estimators, 209 max_samples=self.max_samples, 210 contamination=self.contamination, 211 max_features=self.max_features, 212 bootstrap=self.bootstrap, 213 n_jobs=self.n_jobs, 214 random_state=self.random_state, 215 verbose=self.verbose) 216 217 self.detector_.fit(X=X, y=None, sample_weight=None) 218 219 # invert decision_scores_. Outliers comes with higher outlier scores. 220 self.decision_scores_ = invert_order( 221 self.detector_.decision_function(X)) 222 self._process_decision_scores() 223 return self 224 225 def decision_function(self, X): 226 """Predict raw anomaly score of X using the fitted detector. 227 228 The anomaly score of an input sample is computed based on different 229 detector algorithms. For consistency, outliers are assigned with 230 larger anomaly scores. 231 232 Parameters 233 ---------- 234 X : numpy array of shape (n_samples, n_features) 235 The training input samples. Sparse matrices are accepted only 236 if they are supported by the base estimator. 237 238 Returns 239 ------- 240 anomaly_scores : numpy array of shape (n_samples,) 241 The anomaly score of the input samples. 242 """ 243 check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_']) 244 # invert outlier scores. Outliers comes with higher outlier scores 245 return invert_order(self.detector_.decision_function(X)) 246 247 @property 248 def estimators_(self): 249 """The collection of fitted sub-estimators. 250 Decorator for scikit-learn Isolation Forest attributes. 251 """ 252 return self.detector_.estimators_ 253 254 @property 255 def estimators_samples_(self): 256 """The subset of drawn samples (i.e., the in-bag samples) for 257 each base estimator. 258 Decorator for scikit-learn Isolation Forest attributes. 259 """ 260 return self.detector_.estimators_samples_ 261 262 @property 263 def max_samples_(self): 264 """The actual number of samples. 265 Decorator for scikit-learn Isolation Forest attributes. 266 """ 267 return self.detector_.max_samples_ 268 269 @property 270 def estimators_features_(self): 271 """The indeces of the subset of features used to train the estimators. 272 Decorator for scikit-learn Isolation Forest attributes. 273 """ 274 return self.detector_.estimators_features_ 275 276 @property 277 def n_features_in_(self): 278 """The number of features seen during the fit. 
        Decorator for scikit-learn Isolation Forest attributes.
        """
        return self.detector_.n_features_in_

    @property
    def offset_(self):
        """Offset used to define the decision function from the raw scores.
        Decorator for scikit-learn Isolation Forest attributes.
        """
        return self.detector_.offset_

    @property
    def feature_importances_(self):
        """The impurity-based feature importances. The higher, the more
        important the feature. The importance of a feature is computed as the
        (normalized) total reduction of the criterion brought by that feature.
        It is also known as the Gini importance.

        .. warning::
            Impurity-based feature importances can be misleading for
            high-cardinality features (many unique values). See
            https://scikit-learn.org/stable/modules/generated/sklearn.inspection.permutation_importance.html
            as an alternative.

        Returns
        -------
        feature_importances_ : ndarray of shape (n_features,)
            The values of this array sum to 1, unless all trees are
            single-node trees consisting of only the root node, in which case
            it will be an array of zeros.
        """
        check_is_fitted(self)
        # collect per-tree importances in parallel, skipping trees that
        # consist of only the root node (they carry no split information)
        all_importances = Parallel(n_jobs=self.n_jobs)(
            delayed(getattr)(tree, "feature_importances_")
            for tree in self.detector_.estimators_
            if tree.tree_.node_count > 1)

        if not all_importances:
            return np.zeros(self.n_features_in_, dtype=np.float64)

        all_importances = np.mean(all_importances, axis=0, dtype=np.float64)
        return all_importances / np.sum(all_importances)
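

# A minimal usage sketch (illustrative only; not part of the library API):
# fit the detector on synthetic two-dimensional data with a few injected
# outliers and inspect the attributes documented above. Because this module
# uses relative imports, the sketch only runs as part of the installed
# package, e.g. via ``python -m pyod.models.iforest``.
if __name__ == "__main__":
    rng = np.random.RandomState(42)
    X_train = np.concatenate([
        rng.randn(200, 2),                          # inliers around the origin
        rng.uniform(low=4, high=6, size=(10, 2)),   # a few far-away outliers
    ])

    clf = IForest(n_estimators=100, contamination=0.05, random_state=42)
    clf.fit(X_train)

    print("threshold_:", clf.threshold_)
    print("flagged training outliers:", int(clf.labels_.sum()))
    print("top-3 outlier scores:", np.sort(clf.decision_scores_)[-3:])
    print("feature importances:", clf.feature_importances_)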