Cradicle Explorer

mad.py
  1  # -*- coding: utf-8 -*-
  2  """
  3  Median Absolute Deviation (MAD) Algorithm.
  4  Strictly for Univariate Data.
  5  """
  6  # Author: Yahya Almardeny <almardeny@gmail.com>
  7  # License: BSD 2 clause
  8  
  9  
 10  import numpy as np
 11  import sklearn
 12  from packaging import version
 13  from sklearn.utils import check_array
 14  
 15  from .base import BaseDetector
 16  
 17  
 18  def _check_dim(X):
 19      """
 20      Internal function to assert univariate data
 21      """
 22      if X.shape[1] != 1:
 23          raise ValueError('MAD algorithm is just for univariate data. '
 24                           'Got Data with {} Dimensions.'.format(X.shape[1]))
 25  
 26  
 27  def _check_array_compat(X, **kwargs):
 28      """
 29      Compatibility wrapper for sklearn.utils.check_array.
 30      Handles force_all_finite -> ensure_all_finite rename in sklearn 1.6+
 31      """
 32      if 'force_all_finite' in kwargs:
 33          if version.parse(str(sklearn.__version__)) >= version.parse('1.6.0'):
 34              kwargs['ensure_all_finite'] = kwargs.pop('force_all_finite')
 35      return check_array(X, **kwargs)
 36  
 37  
 38  class MAD(BaseDetector):
 39      """Median Absolute Deviation: for measuring the distances between
 40      data points and the median in terms of median distance.
 41      See :cite:`iglewicz1993detect` for details.
 42  
 43      Parameters
 44      ----------
 45      threshold : float, optional (default=3.5)
 46         The modified z-score to use as a threshold. Observations with
 47         a modified z-score (based on the median absolute deviation) greater
 48         than this value will be classified as outliers.
 49  
 50      Attributes
 51      ----------
 52      decision_scores_ : numpy array of shape (n_samples,)
 53          The outlier scores of the training data.
 54          The higher, the more abnormal. Outliers tend to have higher
 55          scores. This value is available once the detector is
 56          fitted.
 57  
 58      threshold_ : float
 59         The modified z-score to use as a threshold. Observations with
 60         a modified z-score (based on the median absolute deviation) greater
 61         than this value will be classified as outliers.
 62  
 63      labels_ : int, either 0 or 1
 64          The binary labels of the training data. 0 stands for inliers
 65          and 1 for outliers/anomalies. It is generated by applying
 66          ``threshold_`` on ``decision_scores_``.
 67      """
 68  
 69      def __init__(self, threshold=3.5, contamination=0.1):
 70          super(MAD, self).__init__(contamination=contamination)
 71          if not isinstance(threshold, (float, int)):
 72              raise TypeError(
 73                  'threshold must be a number. Got {}'.format(type(threshold)))
 74          self.threshold = threshold
 75  
 76      def fit(self, X, y=None):
 77          """Fit detector. y is ignored in unsupervised methods.
 78  
 79          Parameters
 80          ----------
 81          X : numpy array of shape (n_samples, n_features)
 82              The input samples. Note that `n_features` must equal 1.
 83  
 84          y : Ignored
 85              Not used, present for API consistency by convention.
 86  
 87          Returns
 88          -------
 89          self : object
 90              Fitted estimator.
 91          """
 92          X = _check_array_compat(X, ensure_2d=False, force_all_finite=False)
 93          _check_dim(X)
 94          self._set_n_classes(y)
 95          self.threshold_ = self.threshold
 96          self.median_ = None  # reset median after each call
 97          self.median_diff_ = None  # reset median_diff after each call
 98          self.decision_scores_ = self.decision_function(X)
 99          self._process_decision_scores()
100  
101          return self
102  
103      def decision_function(self, X):
104          """Predict raw anomaly score of X using the fitted detector.
105          The anomaly score of an input sample is computed based on different
106          detector algorithms. For consistency, outliers are assigned with
107          larger anomaly scores.
108  
109          Parameters
110          ----------
111          X : numpy array of shape (n_samples, n_features)
112              The training input samples. Sparse matrices are accepted only
113              if they are supported by the base estimator.
114              Note that `n_features` must equal 1.
115  
116          Returns
117          -------
118          anomaly_scores : numpy array of shape (n_samples,)
119              The anomaly score of the input samples.
120          """
121          X = _check_array_compat(X, ensure_2d=False, force_all_finite=False)
122          _check_dim(X)
123          return self._mad(X)
124  
125      def _mad(self, X):
126          """
127          Apply the robust median absolute deviation (MAD)
128          to measure the distances of data points from the median.
129  
130          Returns
131          -------
132          numpy array containing modified Z-scores of the observations.
133          The greater the score, the greater the outlierness.
134          """
135          obs = np.reshape(X, (-1, 1))
136          # `self.median` will be None only before `fit()` is called
137          self.median_ = np.nanmedian(
138              obs) if self.median_ is None else self.median_
139          diff = np.abs(obs - self.median_)
140          self.median_diff_ = np.nanmedian(
141              diff) if self.median_diff_ is None else self.median_diff_
142          return np.nan_to_num(np.ravel(0.6745 * diff / self.median_diff_))
143  
144      def _process_decision_scores(self):
145          """This overrides PyOD base class function in order to use the
146          proper `threshold_` which is quite different in the base class.
147          Internal function to calculate key attributes:
148          - labels_: binary labels of training data.
149          - _mu: mean of decision scores.
150          - _sigma: standard deviation of decision scores.
151  
152          Returns
153          -------
154          self
155          """
156          self.labels_ = (self.decision_scores_ > self.threshold).astype(
157              'int').ravel()
158  
159          # calculate for predict_proba()
160          self._mu = np.nanmean(self.decision_scores_)
161          self._sigma = np.nanstd(self.decision_scores_)
162  
163          return self