# base.py
# -*- coding: utf-8 -*-
"""Base class for all outlier detector models
"""
# Author: Yue Zhao <yzhao062@gmail.com>
# License: BSD 2 clause


import abc
import warnings
from collections import defaultdict
from inspect import signature

import numpy as np
from numpy import percentile
from scipy.special import erf
from scipy.stats import binom
from sklearn.base import BaseEstimator
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import deprecated
from sklearn.utils.multiclass import check_classification_targets
from sklearn.utils.validation import check_is_fitted
from scipy.optimize import root_scalar

from .sklearn_base import _pprint
from ..utils.utility import precision_n_scores


class BaseDetector(BaseEstimator, metaclass=abc.ABCMeta):
    """Abstract class for all outlier detection algorithms.

    Subclasses implement ``fit`` and ``decision_function``; this base class
    provides thresholding, probability/confidence conversion, and
    scikit-learn-style parameter handling on top of them.

    Parameters
    ----------
    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set,
        i.e. the proportion of outliers in the data set. Used when fitting to
        define the threshold on the decision function. A non-numeric
        thresholder object (e.g. from PyThresh) exposing ``eval`` and
        ``thresh_`` is also accepted.

    Attributes
    ----------
    decision_scores_ : numpy array of shape (n_samples,)
        The outlier scores of the training data.
        The higher, the more abnormal. Outliers tend to have higher
        scores. This value is available once the detector is fitted.

    threshold_ : float
        The threshold is based on ``contamination``. It is the
        ``n_samples * contamination`` most abnormal samples in
        ``decision_scores_``. The threshold is calculated for generating
        binary outlier labels.

    labels_ : int, either 0 or 1
        The binary labels of the training data. 0 stands for inliers
        and 1 for outliers/anomalies. It is generated by applying
        ``threshold_`` on ``decision_scores_``.
    """

    @abc.abstractmethod
    def __init__(self, contamination=0.1):

        # Only validate the range when contamination is numeric; any other
        # input (e.g. a PyThresh thresholder object) is stored as-is and
        # evaluated later when labels/threshold are derived.
        if (isinstance(contamination, (float, int))):

            if not (0. < contamination <= 0.5):
                raise ValueError("contamination must be in (0, 0.5], "
                                 "got: %f" % contamination)

        # allow arbitrary input such as a PyThresh object
        self.contamination = contamination

    # noinspection PyIncorrectDocstring
    @abc.abstractmethod
    def fit(self, X, y=None):
        """Fit detector. y is ignored in unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        pass

    @abc.abstractmethod
    def decision_function(self, X):
        """Predict raw anomaly scores of X using the fitted detector.

        The anomaly score of an input sample is computed based on the fitted
        detector. For consistency, outliers are assigned with
        higher anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples. Sparse matrices are accepted only
            if they are supported by the base estimator.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        pass

    @deprecated()
    def fit_predict(self, X, y=None):
        """Fit detector first and then predict whether a particular sample
        is an outlier or not. y is ignored in unsupervised models.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        outlier_labels : numpy array of shape (n_samples,)
            For each observation, tells whether
            it should be considered as an outlier according to the
            fitted model. 0 stands for inliers and 1 for outliers.

        .. deprecated:: 0.6.9
            `fit_predict` will be removed in pyod 0.8.0.; it will be
            replaced by calling `fit` function first and then accessing
            `labels_` attribute for consistency.
        """

        self.fit(X, y)
        return self.labels_

    def predict(self, X, return_confidence=False):
        """Predict if a particular sample is an outlier or not.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        return_confidence : boolean, optional(default=False)
            If True, also return the confidence of prediction.

        Returns
        -------
        outlier_labels : numpy array of shape (n_samples,)
            For each observation, tells whether
            it should be considered as an outlier according to the
            fitted model. 0 stands for inliers and 1 for outliers.
        confidence : numpy array of shape (n_samples,).
            Only if return_confidence is set to True.
        """

        check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])
        pred_score = self.decision_function(X)

        # numeric contamination: binarize scores with the fitted threshold
        if isinstance(self.contamination, (float, int)):
            prediction = (pred_score > self.threshold_).astype('int').ravel()

        # if this is a PyThresh object
        else:
            prediction = self.contamination.eval(pred_score)

        if return_confidence:
            # NOTE: this re-runs decision_function on X inside
            # predict_confidence (see the todo there).
            confidence = self.predict_confidence(X)
            return prediction, confidence

        return prediction

    def predict_proba(self, X, method='linear', return_confidence=False):
        """Predict the probability of a sample being outlier. Two approaches
        are possible:

        1. simply use Min-max conversion to linearly transform the outlier
           scores into the range of [0,1]. The model must be
           fitted first.
        2. use unifying scores, see :cite:`kriegel2011interpreting`.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        method : str, optional (default='linear')
            probability conversion method. It must be one of
            'linear' or 'unify'.

        return_confidence : boolean, optional(default=False)
            If True, also return the confidence of prediction.


        Returns
        -------
        outlier_probability : numpy array of shape (n_samples, n_classes)
            For each observation, tells whether or not
            it should be considered as an outlier according to the
            fitted model. Return the outlier probability, ranging
            in [0,1]. Note it depends on the number of classes, which is by
            default 2 classes ([proba of normal, proba of outliers]).
        confidence : numpy array of shape (n_samples,).
            Only if return_confidence is set to True.
        """

        check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])
        train_scores = self.decision_scores_

        test_scores = self.decision_function(X)

        # _classes is set by _set_n_classes (2 by default)
        probs = np.zeros([X.shape[0], int(self._classes)])
        if method == 'linear':
            # min-max scale test scores against the training score range;
            # scores outside the training range are clipped to [0, 1]
            scaler = MinMaxScaler().fit(train_scores.reshape(-1, 1))
            probs[:, 1] = scaler.transform(
                test_scores.reshape(-1, 1)).ravel().clip(0, 1)
            probs[:, 0] = 1 - probs[:, 1]

            if return_confidence:
                confidence = self.predict_confidence(X)
                return probs, confidence

            return probs

        elif method == 'unify':
            # turn output into probability via Gaussian scaling of the
            # scores (erf of the standardized score); _mu/_sigma come
            # from _process_decision_scores
            pre_erf_score = (test_scores - self._mu) / (
                    self._sigma * np.sqrt(2))
            erf_score = erf(pre_erf_score)
            probs[:, 1] = erf_score.clip(0, 1).ravel()
            probs[:, 0] = 1 - probs[:, 1]

            if return_confidence:
                confidence = self.predict_confidence(X)
                return probs, confidence

            return probs
        else:
            raise ValueError(method,
                             'is not a valid probability conversion method')

    def predict_confidence(self, X):
        """Predict the model's confidence in making the same prediction
        under slightly different training sets.
        See :cite:`perini2020quantifying`.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        confidence : numpy array of shape (n_samples,)
            For each observation, tells how consistently the model would
            make the same prediction if the training set was perturbed.
            Return a probability, ranging in [0,1].

        """

        check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])

        n = len(self.decision_scores_)

        # todo: this has an optimization opportunity since the scores may
        # already be available
        test_scores = self.decision_function(X)

        # for each test score, count how many training scores it
        # dominates (rank within the training distribution)
        count_instances = np.vectorize(
            lambda x: np.count_nonzero(self.decision_scores_ <= x))
        n_instances = count_instances(test_scores)

        # Derive the outlier probability using Bayesian approach
        # (Laplace-smoothed empirical rank)
        posterior_prob = np.vectorize(lambda x: (1 + x) / (2 + n))(n_instances)

        # if this is a PyThresh object, estimate contamination as the
        # fraction of training samples labeled as outliers
        if not isinstance(self.contamination, (float, int)):
            contam = np.sum(self.labels_) / n
        else:
            contam = self.contamination

        # Transform the outlier probability into a confidence value
        # (binomial tail of being ranked above the threshold position)
        confidence = np.vectorize(
            lambda p: 1 - binom.cdf(n - int(n * contam), n, p))(
            posterior_prob)

        if isinstance(self.contamination, (float, int)):
            prediction = (test_scores > self.threshold_).astype('int').ravel()
        # if this is a PyThresh object
        else:
            prediction = self.contamination.eval(test_scores)
        # for samples predicted as inliers, report the confidence in the
        # inlier call instead (1 - outlier confidence)
        np.place(confidence, prediction == 0, 1 - confidence[prediction == 0])

        return confidence

    def predict_with_rejection(self, X, T=32, return_stats=False,
                               delta=0.1, c_fp=1, c_fn=1, c_r=-1):
        """Predict if a particular sample is an outlier or not,
        allowing the detector to reject (i.e., output = -2)
        low confidence predictions.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        T : int, optional(default=32)
            It allows to set the rejection threshold to 1-2exp(-T).
            The higher the value of T, the more rejections are made.

        return_stats: bool, optional (default = False)
            If true, it returns also three additional float values:
            the estimated rejection rate, the upper bound rejection
            rate, and the upper bound of the cost.

        delta: float, optional (default = 0.1)
            The upper bound rejection rate holds with probability 1-delta.

        c_fp, c_fn, c_r: floats (positive), optional (default = [1,1, contamination])
            costs for false positive predictions (c_fp), false negative
            predictions (c_fn) and rejections (c_r).

        Returns
        -------
        outlier_labels : numpy array of shape (n_samples,)
            For each observation, it tells whether it should be
            considered as an outlier according to the fitted
            model. 0 stands for inliers, 1 for outliers and
            -2 for rejection.

        expected_rejection_rate: float, if return_stats is True;
        upperbound_rejection_rate: float, if return_stats is True;
        upperbound_cost: float, if return_stats is True;

        """
        check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])
        # NOTE(review): the default c_r=-1 always triggers this warning;
        # it doubles as the "use the contamination rate" sentinel.
        if c_r < 0:
            warnings.warn(
                "The cost of rejection must be positive. "
                "It has been set to the contamination rate.")
            c_r = self.contamination

        if delta <= 0 or delta >= 1:
            warnings.warn(
                "delta must belong to (0,1). It's value has been set to 0.1")
            delta = 0.1

        # reject predictions whose (rescaled) confidence falls below
        # 1 - 2*exp(-T)
        self.rejection_threshold_ = 1 - 2 * np.exp(-T)
        prediction = self.predict(X)
        confidence = self.predict_confidence(X)
        # flip inlier confidences back to "confidence in being an outlier",
        # then rescale to distance from the 0.5 decision point in [0, 1]
        np.place(confidence, prediction == 0, 1 - confidence[prediction == 0])
        confidence = 2 * abs(confidence - .5)
        prediction[np.where(confidence <= self.rejection_threshold_)[0]] = -2

        if return_stats:
            expected_rejrate, ub_rejrate, ub_cost = self.compute_rejection_stats(
                T=T, delta=delta,
                c_fp=c_fp, c_fn=c_fn, c_r=c_r)
            return prediction, [expected_rejrate, ub_rejrate, ub_cost]

        return prediction

    def compute_rejection_stats(self, T=32, delta=0.1, c_fp=1, c_fn=1, c_r=-1,
                                verbose=False):
        """Add reject option into the unsupervised detector.
        This comes with guarantees: an estimate of the expected
        rejection rate (return_rejectrate=True), an upper
        bound of the rejection rate (return_ub_rejectrate= True),
        and an upper bound on the cost (return_ub_cost=True).

        NOTE(review): requires a numeric ``self.contamination``; the
        formulas below do arithmetic on it, so a PyThresh thresholder
        would fail here — confirm against callers.

        Parameters
        ----------
        T: int, optional(default=32)
            It allows to set the rejection threshold to 1-2exp(-T).
            The higher the value of T, the more rejections are made.

        delta: float, optional (default = 0.1)
            The upper bound rejection rate holds with probability 1-delta.

        c_fp, c_fn, c_r: floats (positive),
            optional (default = [1,1, contamination])
            costs for false positive predictions (c_fp),
            false negative predictions (c_fn) and rejections (c_r).

        verbose: bool, optional (default = False)
            If true, it prints the expected rejection rate, the upper
            bound rejection rate, and the upper bound of the cost.

        Returns
        -------
        expected_rejection_rate: float, the expected rejection rate;
        upperbound_rejection_rate: float, the upper bound for the rejection rate
            satisfied with probability 1-delta;
        upperbound_cost: float, the upper bound for the cost;
        """

        check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])

        # sentinel default: fall back to the contamination rate
        if c_r < 0:
            c_r = self.contamination

        if delta <= 0 or delta >= 1:
            delta = 0.1

        # Computing the expected rejection rate:
        # invert the binomial CDF at the two confidence levels exp(-T)
        # and 1-exp(-T) via Brent's method; the gap q2-q1 is the mass of
        # scores whose prediction gets rejected.
        n = len(self.decision_scores_)
        n_gamma_minus1 = int(n * self.contamination) - 1
        argsmin = (n_gamma_minus1, n, 1 - np.exp(-T))
        argsmax = (n_gamma_minus1, n, np.exp(-T))
        q1 = root_scalar(lambda p, k, n, C: binom.cdf(k, n, p) - C,
                         bracket=[0, 1], method='brentq', args=argsmin).root
        q2 = root_scalar(lambda p, k, n, C: binom.cdf(k, n, p) - C,
                         bracket=[0, 1], method='brentq', args=argsmax).root
        expected_reject_rate = q2 - q1

        # Computing the upper bound for the rejection rate.
        # Closed-form bounds from Perini et al.; presumably matching the
        # paper's derivation — TODO confirm the formulas against it.
        right_mar = (-self.contamination * (n + 2) + n + 1) / n + (
                T * (n + 2)) / (np.sqrt(2 * n ** 3 * T))
        right_mar = min(1, right_mar)
        left_mar = (
                (2 + n * (1 - self.contamination) * (n + 1)) / n ** 2
                - np.sqrt(
            0.5 * n ** 5 * (
                    2 * n * (
                    -3 * self.contamination ** 2
                    - 2 * n * (1 - self.contamination) ** 2
                    + 4 * self.contamination - 3
            )
                    + T * (n + 2) ** 2 - 8
            )
        ) / n ** 4
        )
        left_mar = max(0, left_mar)
        # Hoeffding-style slack so the bound holds with probability 1-delta
        add_term = 2 * np.sqrt(np.log(2 / delta) / (2 * n))
        upperbound_rejectrate = right_mar - left_mar + add_term

        # Computing the upper bound for the cost function.
        # NOTE(review): q1/q2 are recomputed here with the exact same
        # arguments as above — redundant work, results are identical.
        n_gamma_minus1 = int(n * self.contamination) - 1
        argsmin = (n_gamma_minus1, n, 1 - np.exp(-T))
        argsmax = (n_gamma_minus1, n, np.exp(-T))
        q1 = root_scalar(lambda p, k, n, C: binom.cdf(k, n, p) - C,
                         bracket=[0, 1], method='brentq', args=argsmin).root
        q2 = root_scalar(lambda p, k, n, C: binom.cdf(k, n, p) - C,
                         bracket=[0, 1], method='brentq', args=argsmax).root
        upperbound_cost = np.min([self.contamination, q1]) * c_fp + np.min(
            [1 - q2, self.contamination]) * c_fn + (q2 - q1) * c_r

        if verbose:
            # NOTE(review): the '%' suffix is printed but the values are
            # fractions in [0,1], not percentages.
            print("Expected rejection rate: ",
                  np.round(expected_reject_rate, 4), '%')
            print("Upper bound rejection rate: ",
                  np.round(upperbound_rejectrate, 4), '%')
            print("Upper bound cost: ", np.round(upperbound_cost, 4))

        return expected_reject_rate, upperbound_rejectrate, upperbound_cost

    def _predict_rank(self, X, normalized=False):
        """Predict the outlyingness rank of a sample by a fitted model. The
        method is for outlier detector score combination.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        normalized : bool, optional (default=False)
            If set to True, all ranks are normalized to [0,1].

        Returns
        -------
        ranks : array, shape (n_samples,)
            Outlying rank of a sample according to the training data.

        """

        check_is_fitted(self, ['decision_scores_'])

        test_scores = self.decision_function(X)
        train_scores = self.decision_scores_

        # rank = insertion position of each test score within the sorted
        # training scores (0 .. n_train)
        sorted_train_scores = np.sort(train_scores)
        ranks = np.searchsorted(sorted_train_scores, test_scores)

        if normalized:
            # return normalized ranks
            # NOTE(review): divides by ranks.max(); if every test score
            # ranks below all training scores this is 0/0 — confirm callers
            # never hit that case.
            ranks = ranks / ranks.max()
        return ranks

    @deprecated()
    def fit_predict_score(self, X, y, scoring='roc_auc_score'):
        """Fit the detector, predict on samples, and evaluate the model by
        predefined metrics, e.g., ROC.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : numpy array of shape (n_samples,)
            The ground truth outlier labels (0: inliers, 1: outliers),
            used only to compute the evaluation score.

        scoring : str, optional (default='roc_auc_score')
            Evaluation metric:

            - 'roc_auc_score': ROC score
            - 'prc_n_score': Precision @ rank n score

        Returns
        -------
        score : float

        .. deprecated:: 0.6.9
            `fit_predict_score` will be removed in pyod 0.8.0.; it will be
            replaced by calling `fit` function first and then accessing
            `labels_` attribute for consistency. Scoring could be done by
            calling an evaluation method, e.g., AUC ROC.
        """

        self.fit(X)

        if scoring == 'roc_auc_score':
            score = roc_auc_score(y, self.decision_scores_)
        elif scoring == 'prc_n_score':
            score = precision_n_scores(y, self.decision_scores_)
        else:
            raise NotImplementedError('PyOD built-in scoring only supports '
                                      'ROC and Precision @ rank n')

        print("{metric}: {score}".format(metric=scoring, score=score))

        return score

    def _set_n_classes(self, y):
        """Set the number of classes if `y` is presented, which is not
        expected. It could be useful for multi-class outlier detection.

        Parameters
        ----------
        y : numpy array of shape (n_samples,)
            Ground truth.

        Returns
        -------
        self
        """

        self._classes = 2  # default as binary classification
        if y is not None:
            check_classification_targets(y)
            self._classes = len(np.unique(y))
            warnings.warn(
                "y should not be presented in unsupervised learning.")
        return self

    def _process_decision_scores(self):
        """Internal function to calculate key attributes:

        - threshold_: used to decide the binary label
        - labels_: binary labels of training data

        Returns
        -------
        self
        """

        if isinstance(self.contamination, (float, int)):
            # threshold_ is the (1 - contamination)-quantile of the
            # training scores; everything strictly above it is an outlier
            self.threshold_ = percentile(self.decision_scores_,
                                         100 * (1 - self.contamination))
            self.labels_ = (self.decision_scores_ > self.threshold_).astype(
                'int').ravel()

        # if this is a PyThresh object
        else:
            self.labels_ = self.contamination.eval(self.decision_scores_)
            self.threshold_ = self.contamination.thresh_
            # NOTE(review): falsy thresh_ (None or 0) is replaced by the
            # outlier *fraction*, which is not on the score scale — TODO
            # confirm this fallback is intentional.
            if not self.threshold_:
                self.threshold_ = np.sum(self.labels_) / len(self.labels_)

        # calculate for predict_proba(): Gaussian moments of the training
        # scores used by the 'unify' conversion

        self._mu = np.mean(self.decision_scores_)
        self._sigma = np.std(self.decision_scores_)

        return self

    # noinspection PyMethodParameters
    def _get_param_names(cls):
        # noinspection PyPep8
        """Get parameter names for the estimator

        See http://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html
        and sklearn/base.py for more information.
        """

        # fetch the constructor or the original constructor before
        # deprecation wrapping if any
        init = getattr(cls.__init__, 'deprecated_original', cls.__init__)
        if init is object.__init__:
            # No explicit constructor to introspect
            return []

        # introspect the constructor arguments to find the model parameters
        # to represent
        init_signature = signature(init)
        # Consider the constructor parameters excluding 'self'
        parameters = [p for p in init_signature.parameters.values()
                      if p.name != 'self' and p.kind != p.VAR_KEYWORD]
        for p in parameters:
            if p.kind == p.VAR_POSITIONAL:
                raise RuntimeError("scikit-learn estimators should always "
                                   "specify their parameters in the signature"
                                   " of their __init__ (no varargs)."
                                   " %s with constructor %s doesn't "
                                   " follow this convention."
                                   % (cls, init_signature))
        # Extract and sort argument names excluding 'self'
        return sorted([p.name for p in parameters])

    # noinspection PyPep8
    def get_params(self, deep=True):
        """Get parameters for this estimator.

        See http://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html
        and sklearn/base.py for more information.

        Parameters
        ----------
        deep : bool, optional (default=True)
            If True, will return the parameters for this estimator and
            contained subobjects that are estimators.

        Returns
        -------
        params : mapping of string to any
            Parameter names mapped to their values.
        """

        out = dict()
        for key in self._get_param_names():
            # We need deprecation warnings to always be on in order to
            # catch deprecated param values.
            # This is set in utils/__init__.py but it gets overwritten
            # when running under python3 somehow.
            warnings.simplefilter("always", DeprecationWarning)
            try:
                with warnings.catch_warnings(record=True) as w:
                    value = getattr(self, key, None)
                if len(w) and w[0].category == DeprecationWarning:
                    # if the parameter is deprecated, don't show it
                    continue
            finally:
                # undo the simplefilter above without clobbering other filters
                warnings.filters.pop(0)

            # XXX: should we rather test if instance of estimator?
            if deep and hasattr(value, 'get_params'):
                deep_items = value.get_params().items()
                out.update((key + '__' + k, val) for k, val in deep_items)
            out[key] = value
        return out

    def set_params(self, **params):
        # noinspection PyPep8
        """Set the parameters of this estimator.
        The method works on simple estimators as well as on nested objects
        (such as pipelines). The latter have parameters of the form
        ``<component>__<parameter>`` so that it's possible to update each
        component of a nested object.

        See http://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html
        and sklearn/base.py for more information.

        Returns
        -------
        self : object
        """

        if not params:
            # Simple optimization to gain speed (inspect is slow)
            return self
        valid_params = self.get_params(deep=True)

        nested_params = defaultdict(dict)  # grouped by prefix
        for key, value in params.items():
            # split 'component__parameter' into its two halves
            key, delim, sub_key = key.partition('__')
            if key not in valid_params:
                raise ValueError('Invalid parameter %s for estimator %s. '
                                 'Check the list of available parameters '
                                 'with `estimator.get_params().keys()`.' %
                                 (key, self))

            if delim:
                nested_params[key][sub_key] = value
            else:
                setattr(self, key, value)

        # delegate nested parameters to the contained estimators
        for key, sub_params in nested_params.items():
            valid_params[key].set_params(**sub_params)

        return self

    def __repr__(self):
        # noinspection PyPep8
        """
        See http://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html
        and sklearn/base.py for more information.
        """

        class_name = self.__class__.__name__
        return '%s(%s)' % (class_name, _pprint(self.get_params(deep=False),
                                               offset=len(class_name), ),)

    def __sklearn_tags__(self):
        """Return sklearn-style Tags for compatibility with scikit-learn >= 1.8.

        We mark all PyOD detectors as 'outlier_detector' so that utilities
        such as sklearn.utils._tags.get_tags and is_outlier_detector work.
        """
        tags = super().__sklearn_tags__()
        # match sklearn's OutlierMixin
        tags.estimator_type = "outlier_detector"
        return tags