# -*- coding: utf-8 -*-
"""Kernel Density Estimation (KDE) for Unsupervised Outlier Detection.
"""
# Author: Akira Tamamori <tamamori5917@gmail.com>
# License: BSD 2 clause


from warnings import warn

from sklearn.neighbors import KernelDensity
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted

from .base import BaseDetector
from ..utils.utility import invert_order


class KDE(BaseDetector):
    """KDE class for outlier detection.

    For an observation, its negative log probability density could be viewed
    as the outlying score.

    See :cite:`latecki2007outlier` for details.

    Parameters
    ----------
    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set,
        i.e. the proportion of outliers in the data set. Used when fitting to
        define the threshold on the decision function.

    bandwidth : float, optional (default=1.0)
        The bandwidth of the kernel.

    algorithm : {'auto', 'ball_tree', 'kd_tree'}, optional
        Algorithm used to compute the kernel density estimator:

        - 'ball_tree' will use BallTree
        - 'kd_tree' will use KDTree
        - 'auto' will attempt to decide the most appropriate algorithm
          based on the values passed to :meth:`fit` method.

    leaf_size : int, optional (default = 30)
        Leaf size passed to BallTree. This can affect the
        speed of the construction and query, as well as the memory
        required to store the tree. The optimal value depends on the
        nature of the problem.

    metric : string or callable, default 'minkowski'
        metric to use for distance computation. Any metric from scikit-learn
        or scipy.spatial.distance can be used.

        If metric is a callable function, it is called on each
        pair of instances (rows) and the resulting value recorded. The callable
        should take two arrays as input and return one value indicating the
        distance between them. This works for Scipy's metrics, but is less
        efficient than passing the metric name as a string.

        Distance matrices are not supported.

        Valid values for metric are:

        - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
          'manhattan']

        - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
          'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
          'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto',
          'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath',
          'sqeuclidean', 'yule']

        See the documentation for scipy.spatial.distance for details on these
        metrics.

    metric_params : dict, optional (default = None)
        Additional keyword arguments for the metric function.

    Attributes
    ----------
    decision_scores_ : numpy array of shape (n_samples,)
        The outlier scores of the training data.
        The higher, the more abnormal. Outliers tend to have higher
        scores. This value is available once the detector is
        fitted.

    threshold_ : float
        The threshold is based on ``contamination``. It is the
        ``n_samples * contamination`` most abnormal samples in
        ``decision_scores_``. The threshold is calculated for generating
        binary outlier labels.

    labels_ : int, either 0 or 1
        The binary labels of the training data. 0 stands for inliers
        and 1 for outliers/anomalies. It is generated by applying
        ``threshold_`` on ``decision_scores_``.
    """

    def __init__(
        self,
        contamination=0.1,
        bandwidth=1.0,
        algorithm="auto",
        leaf_size=30,
        metric="minkowski",
        metric_params=None,
    ):
        super().__init__(contamination=contamination)

        # Store constructor arguments verbatim (scikit-learn convention:
        # no validation or mutation of hyperparameters in __init__).
        self.bandwidth = bandwidth
        self.algorithm = algorithm
        self.leaf_size = leaf_size
        self.metric = metric
        self.metric_params = metric_params

        # Any explicit tree choice other than the defaults triggers the
        # deprecation notice; the value is still forwarded below so current
        # callers keep their behavior.
        if self.algorithm not in ("auto", "ball_tree"):
            warn(
                "algorithm parameter is deprecated and will be removed "
                "in version 0.7.6. By default, ball_tree will be used.",
                FutureWarning,
            )

        # Underlying sklearn estimator that does the actual density modeling.
        self.kde_ = KernelDensity(
            bandwidth=self.bandwidth,
            algorithm=self.algorithm,
            leaf_size=self.leaf_size,
            metric=self.metric,
            metric_params=self.metric_params,
        )

        # Populated by fit(); None marks the detector as not yet fitted.
        self.decision_scores_ = None

    def fit(self, X, y=None):
        """Fit detector. y is ignored in unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Fitted estimator.
        """

        # Validate input X; y is only used to set n_classes_.
        X = check_array(X)
        self._set_n_classes(y)

        # KernelDensity.fit returns the estimator itself, so the density
        # model can be fitted and scored on the training data in one chain.
        # Log-densities are negated (inverted) so that outliers — which sit
        # in low-density regions — receive the HIGHER scores.
        self.decision_scores_ = invert_order(
            self.kde_.fit(X).score_samples(X)
        )
        self._process_decision_scores()

        return self

    def decision_function(self, X):
        """Predict raw anomaly score of X using the fitted detector.

        The anomaly score of an input sample is computed based on different
        detector algorithms. For consistency, outliers are assigned with
        larger anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The training input samples. Sparse matrices are accepted only
            if they are supported by the base estimator.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        # Refuse to score before fit() has populated the learned attributes.
        check_is_fitted(self, ["decision_scores_", "threshold_", "labels_"])

        X = check_array(X)

        # Flip the sign of the log-density: low density -> high outlier score.
        return invert_order(self.kde_.score_samples(X))