mad.py
# -*- coding: utf-8 -*-
"""
Median Absolute deviation (MAD) Algorithm.
Strictly for Univariate Data.
"""
# Author: Yahya Almardeny <almardeny@gmail.com>
# License: BSD 2 clause


import numpy as np
import sklearn
from packaging import version
from sklearn.utils import check_array

from .base import BaseDetector


def _check_dim(X):
    """Assert that ``X`` holds univariate data.

    Parameters
    ----------
    X : numpy array of shape (n_samples,) or (n_samples, n_features)
        The validated input samples.

    Raises
    ------
    ValueError
        If ``X`` has a second axis whose length is not 1.
    """
    # Callers validate with ``ensure_2d=False``, so a 1-D array can reach
    # this point. Such an array is univariate by construction and `_mad`
    # reshapes it to a column anyway; indexing ``shape[1]`` on it would
    # fail with an IndexError instead of a meaningful result.
    if X.ndim == 1:
        return

    if X.shape[1] != 1:
        raise ValueError('MAD algorithm is just for univariate data. '
                         'Got Data with {} Dimensions.'.format(X.shape[1]))


def _check_array_compat(X, **kwargs):
    """Compatibility wrapper for ``sklearn.utils.check_array``.

    Handles the ``force_all_finite`` -> ``ensure_all_finite`` rename in
    sklearn 1.6+: when the installed sklearn version is at least 1.6.0, a
    ``force_all_finite`` keyword is forwarded as ``ensure_all_finite``.

    Parameters
    ----------
    X : object
        The input to validate.

    **kwargs
        Keyword arguments forwarded to ``check_array``.

    Returns
    -------
    The value returned by ``check_array(X, **kwargs)``.
    """
    if 'force_all_finite' in kwargs:
        if version.parse(str(sklearn.__version__)) >= version.parse('1.6.0'):
            kwargs['ensure_all_finite'] = kwargs.pop('force_all_finite')
    return check_array(X, **kwargs)


class MAD(BaseDetector):
    """Median Absolute Deviation: for measuring the distances between
    data points and the median in terms of median distance.
    See :cite:`iglewicz1993detect` for details.

    Parameters
    ----------
    threshold : float, optional (default=3.5)
        The modified z-score to use as a threshold. Observations with
        a modified z-score (based on the median absolute deviation) greater
        than this value will be classified as outliers.

    contamination : float, optional (default=0.1)
        Forwarded unchanged to ``BaseDetector.__init__``. The training
        labels of this detector are derived from ``threshold``, not from
        this value.

    Attributes
    ----------
    decision_scores_ : numpy array of shape (n_samples,)
        The outlier scores of the training data.
        The higher, the more abnormal. Outliers tend to have higher
        scores. This value is available once the detector is
        fitted.

    threshold_ : float
        The modified z-score to use as a threshold. Observations with
        a modified z-score (based on the median absolute deviation) greater
        than this value will be classified as outliers.

    labels_ : numpy array of shape (n_samples,), values 0 or 1
        The binary labels of the training data. 0 stands for inliers
        and 1 for outliers/anomalies. It is generated by applying
        ``threshold_`` on ``decision_scores_``.

    median_ : float
        The median of the training data, ignoring NaN values.

    median_diff_ : float
        The median of the absolute deviations of the training data from
        ``median_``, ignoring NaN values.
    """

    def __init__(self, threshold=3.5, contamination=0.1):
        super(MAD, self).__init__(contamination=contamination)
        if not isinstance(threshold, (float, int)):
            raise TypeError(
                'threshold must be a number. Got {}'.format(type(threshold)))
        self.threshold = threshold

    def fit(self, X, y=None):
        """Fit detector. y is ignored in unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features) or (n_samples,)
            The input samples. Note that `n_features` must equal 1.
            NaN values are allowed and are ignored when the median and
            the median absolute deviation are computed.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        X = _check_array_compat(X, ensure_2d=False, force_all_finite=False)
        _check_dim(X)
        self._set_n_classes(y)
        self.threshold_ = self.threshold
        # Clearing both statistics makes `_mad` recompute them from this
        # training set instead of reusing values from an earlier fit.
        self.median_ = None
        self.median_diff_ = None
        self.decision_scores_ = self.decision_function(X)
        self._process_decision_scores()

        return self

    def decision_function(self, X):
        """Predict raw anomaly score of X using the fitted detector.

        The score of a sample is its modified z-score with respect to the
        median and median absolute deviation learned in `fit`. For
        consistency, outliers are assigned with larger anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features) or (n_samples,)
            The input samples. Note that `n_features` must equal 1.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        X = _check_array_compat(X, ensure_2d=False, force_all_finite=False)
        _check_dim(X)
        return self._mad(X)

    def _mad(self, X):
        """
        Apply the robust median absolute deviation (MAD)
        to measure the distances of data points from the median.

        Parameters
        ----------
        X : numpy array
            The observations; reshaped internally to a single column.

        Returns
        -------
        numpy array containing modified Z-scores of the observations.
        The greater the score, the greater the outlierness.
        """
        obs = np.reshape(X, (-1, 1))
        # `median_` / `median_diff_` are None only while `fit()` is
        # running (it resets them right before scoring the training data).
        # On later calls the fitted values are reused, so new samples are
        # scored against the training statistics.
        if self.median_ is None:
            self.median_ = np.nanmedian(obs)
        diff = np.abs(obs - self.median_)
        if self.median_diff_ is None:
            self.median_diff_ = np.nanmedian(diff)
        # 0.6745 is the scaling constant of the modified z-score.
        # NaN scores (NaN observations, or 0/0 when `median_diff_` is 0)
        # become 0 and infinite scores become large finite numbers via
        # `np.nan_to_num`.
        return np.nan_to_num(np.ravel(0.6745 * diff / self.median_diff_))

    def _process_decision_scores(self):
        """This overrides PyOD base class function in order to use the
        proper `threshold_` which is quite different in the base class.
        Internal function to calculate key attributes:
        - labels_: binary labels of training data.
        - _mu: mean of decision scores.
        - _sigma: standard deviation of decision scores.

        Returns
        -------
        self
        """
        # Use the fitted `threshold_` (set in `fit`) so the training labels
        # follow the same attribute this method is documented to apply.
        self.labels_ = (self.decision_scores_ > self.threshold_).astype(
            'int').ravel()

        # calculate for predict_proba()
        self._mu = np.nanmean(self.decision_scores_)
        self._sigma = np.nanstd(self.decision_scores_)

        return self