# -*- coding: utf-8 -*-
"""Kernel Density Estimation (KDE) for Unsupervised Outlier Detection.
"""
# Author: Akira Tamamori <tamamori5917@gmail.com>
# License: BSD 2 clause


from warnings import warn

from sklearn.neighbors import KernelDensity
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted

from .base import BaseDetector
from ..utils.utility import invert_order


class KDE(BaseDetector):
    """KDE class for outlier detection.

    For an observation, its negative log probability density could be viewed
    as the outlying score.

    See :cite:`latecki2007outlier` for details.

    Parameters
    ----------
    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set,
        i.e. the proportion of outliers in the data set. Used when fitting to
        define the threshold on the decision function.

    bandwidth : float, optional (default=1.0)
        The bandwidth of the kernel.

    algorithm : {'auto', 'ball_tree', 'kd_tree'}, optional
        Algorithm used to compute the kernel density estimator:

        - 'ball_tree' will use BallTree
        - 'kd_tree' will use KDTree
        - 'auto' will attempt to decide the most appropriate algorithm
          based on the values passed to :meth:`fit` method.

    leaf_size : int, optional (default = 30)
        Leaf size passed to BallTree. This can affect the
        speed of the construction and query, as well as the memory
        required to store the tree. The optimal value depends on the
        nature of the problem.

    metric : string or callable, default 'minkowski'
        metric to use for distance computation. Any metric from scikit-learn
        or scipy.spatial.distance can be used.

        If metric is a callable function, it is called on each
        pair of instances (rows) and the resulting value recorded. The callable
        should take two arrays as input and return one value indicating the
        distance between them. This works for Scipy's metrics, but is less
        efficient than passing the metric name as a string.

        Distance matrices are not supported.

        Valid values for metric are:

        - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
          'manhattan']

        - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
          'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
          'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto',
          'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath',
          'sqeuclidean', 'yule']

        See the documentation for scipy.spatial.distance for details on these
        metrics.

    metric_params : dict, optional (default = None)
        Additional keyword arguments for the metric function.

    Attributes
    ----------
    decision_scores_ : numpy array of shape (n_samples,)
        The outlier scores of the training data.
        The higher, the more abnormal. Outliers tend to have higher
        scores. This value is available once the detector is
        fitted.

    threshold_ : float
        The threshold is based on ``contamination``. It is the
        ``n_samples * contamination`` most abnormal samples in
        ``decision_scores_``. The threshold is calculated for generating
        binary outlier labels.

    labels_ : int, either 0 or 1
        The binary labels of the training data. 0 stands for inliers
        and 1 for outliers/anomalies. It is generated by applying
        ``threshold_`` on ``decision_scores_``.
    """

    def __init__(
        self,
        contamination=0.1,
        bandwidth=1.0,
        algorithm="auto",
        leaf_size=30,
        metric="minkowski",
        metric_params=None,
    ):
        super().__init__(contamination=contamination)

        # Store constructor arguments verbatim (scikit-learn convention:
        # no validation or mutation of hyperparameters in __init__).
        self.bandwidth = bandwidth
        self.algorithm = algorithm
        self.leaf_size = leaf_size
        self.metric = metric
        self.metric_params = metric_params

        # Any explicit tree choice other than the defaults triggers the
        # deprecation notice; the value is still forwarded below so current
        # callers keep their behavior.
        if self.algorithm not in ("auto", "ball_tree"):
            warn(
                "algorithm parameter is deprecated and will be removed "
                "in version 0.7.6. By default, ball_tree will be used.",
                FutureWarning,
            )

        # Underlying sklearn estimator that does the actual density modeling.
        self.kde_ = KernelDensity(
            bandwidth=self.bandwidth,
            algorithm=self.algorithm,
            leaf_size=self.leaf_size,
            metric=self.metric,
            metric_params=self.metric_params,
        )

        # Populated by fit(); None marks the detector as not yet fitted.
        self.decision_scores_ = None

    def fit(self, X, y=None):
        """Fit detector. y is ignored in unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Fitted estimator.
        """

        # Validate input X; y is only used to set n_classes_.
        X = check_array(X)
        self._set_n_classes(y)

        # KernelDensity.fit returns the estimator itself, so the density
        # model can be fitted and scored on the training data in one chain.
        # Log-densities are negated (inverted) so that outliers — which sit
        # in low-density regions — receive the HIGHER scores.
        self.decision_scores_ = invert_order(
            self.kde_.fit(X).score_samples(X)
        )
        self._process_decision_scores()

        return self

    def decision_function(self, X):
        """Predict raw anomaly score of X using the fitted detector.

        The anomaly score of an input sample is computed based on different
        detector algorithms. For consistency, outliers are assigned with
        larger anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The training input samples. Sparse matrices are accepted only
            if they are supported by the base estimator.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        # Refuse to score before fit() has populated the learned attributes.
        check_is_fitted(self, ["decision_scores_", "threshold_", "labels_"])

        X = check_array(X)

        # Flip the sign of the log-density: low density -> high outlier score.
        return invert_order(self.kde_.score_samples(X))