# ocsvm.py
# -*- coding: utf-8 -*-
"""One-class SVM detector. Implemented on scikit-learn library.
"""
# Author: Yue Zhao <yzhao062@gmail.com>
# License: BSD 2 clause


from sklearn.svm import OneClassSVM
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted

from .base import BaseDetector
from ..utils.utility import invert_order


class OCSVM(BaseDetector):
    """Wrapper of scikit-learn one-class SVM Class with more functionalities.
    Unsupervised Outlier Detection.

    Estimate the support of a high-dimensional distribution.

    The implementation is based on libsvm.
    See http://scikit-learn.org/stable/modules/svm.html#svm-outlier-detection
    and :cite:`scholkopf2001estimating`.

    Parameters
    ----------
    kernel : string, optional (default='rbf')
        Specifies the kernel type to be used in the algorithm.
        It must be one of 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' or
        a callable.
        If none is given, 'rbf' will be used. If a callable is given it is
        used to precompute the kernel matrix.

    nu : float, optional
        An upper bound on the fraction of training
        errors and a lower bound of the fraction of support
        vectors. Should be in the interval (0, 1]. By default 0.5
        will be taken.

    degree : int, optional (default=3)
        Degree of the polynomial kernel function ('poly').
        Ignored by all other kernels.

    gamma : float, optional (default='auto')
        Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.
        If gamma is 'auto' then 1/n_features will be used instead.

    coef0 : float, optional (default=0.0)
        Independent term in kernel function.
        It is only significant in 'poly' and 'sigmoid'.

    tol : float, optional
        Tolerance for stopping criterion.

    shrinking : bool, optional
        Whether to use the shrinking heuristic.

    cache_size : float, optional
        Specify the size of the kernel cache (in MB).

    verbose : bool, default: False
        Enable verbose output. Note that this setting takes advantage of a
        per-process runtime setting in libsvm that, if enabled, may not work
        properly in a multithreaded context.

    max_iter : int, optional (default=-1)
        Hard limit on iterations within solver, or -1 for no limit.

    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set, i.e.
        the proportion of outliers in the data set. Used when fitting to
        define the threshold on the decision function.


    Attributes
    ----------
    support_ : array-like, shape = [n_SV]
        Indices of support vectors.

    support_vectors_ : array-like, shape = [nSV, n_features]
        Support vectors.

    dual_coef_ : array, shape = [1, n_SV]
        Coefficients of the support vectors in the decision function.

    coef_ : array, shape = [1, n_features]
        Weights assigned to the features (coefficients in the primal
        problem). This is only available in the case of a linear kernel.

        `coef_` is readonly property derived from `dual_coef_` and
        `support_vectors_`

    intercept_ : array, shape = [1,]
        Constant in the decision function.

    decision_scores_ : numpy array of shape (n_samples,)
        The outlier scores of the training data.
        The higher, the more abnormal. Outliers tend to have higher
        scores. This value is available once the detector is fitted.

    threshold_ : float
        The threshold is based on ``contamination``. It is the
        ``n_samples * contamination`` most abnormal samples in
        ``decision_scores_``. The threshold is calculated for generating
        binary outlier labels.

    labels_ : int, either 0 or 1
        The binary labels of the training data. 0 stands for inliers
        and 1 for outliers/anomalies. It is generated by applying
        ``threshold_`` on ``decision_scores_``.
    """

    def __init__(self, kernel='rbf', degree=3, gamma='auto', coef0=0.0,
                 tol=1e-3, nu=0.5, shrinking=True, cache_size=200,
                 verbose=False, max_iter=-1, contamination=0.1):
        super(OCSVM, self).__init__(contamination=contamination)
        # Store hyperparameters as-is (sklearn convention: no validation or
        # mutation in __init__; the wrapped estimator validates at fit time).
        self.kernel = kernel
        self.degree = degree
        self.gamma = gamma
        self.coef0 = coef0
        self.tol = tol
        self.nu = nu
        self.shrinking = shrinking
        self.cache_size = cache_size
        self.verbose = verbose
        self.max_iter = max_iter

    def fit(self, X, y=None, sample_weight=None, **params):
        """Fit detector. y is ignored in unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : Ignored
            Not used, present for API consistency by convention.

        sample_weight : array-like, shape (n_samples,)
            Per-sample weights. Rescale C per sample. Higher weights
            force the classifier to put more emphasis on these points.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        # validate inputs X and y (optional)
        X = check_array(X)
        self._set_n_classes(y)

        self.detector_ = OneClassSVM(kernel=self.kernel,
                                     degree=self.degree,
                                     gamma=self.gamma,
                                     coef0=self.coef0,
                                     tol=self.tol,
                                     nu=self.nu,
                                     shrinking=self.shrinking,
                                     cache_size=self.cache_size,
                                     verbose=self.verbose,
                                     max_iter=self.max_iter)
        self.detector_.fit(X=X, y=y, sample_weight=sample_weight,
                           **params)

        # invert decision_scores_. Outliers comes with higher outlier scores
        self.decision_scores_ = invert_order(
            self.detector_.decision_function(X))
        self._process_decision_scores()
        return self

    def decision_function(self, X):
        """Predict raw anomaly score of X using the fitted detector.

        The anomaly score of an input sample is computed based on different
        detector algorithms. For consistency, outliers are assigned with
        larger anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The training input samples. Sparse matrices are accepted only
            if they are supported by the base estimator.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])
        # Invert outlier scores. Outliers comes with higher outlier scores
        return invert_order(self.detector_.decision_function(X))

    @property
    def support_(self):
        """Indices of support vectors.
        Decorator for scikit-learn One class SVM attributes.
        """
        return self.detector_.support_

    @property
    def support_vectors_(self):
        """Support vectors.
        Decorator for scikit-learn One class SVM attributes.
        """
        return self.detector_.support_vectors_

    @property
    def dual_coef_(self):
        """Coefficients of the support vectors in the decision function.
        Decorator for scikit-learn One class SVM attributes.
        """
        return self.detector_.dual_coef_

    @property
    def coef_(self):
        """Weights assigned to the features (coefficients in the primal
        problem). This is only available in the case of a linear kernel.
        `coef_` is readonly property derived from `dual_coef_` and
        `support_vectors_`
        Decorator for scikit-learn One class SVM attributes.
        """
        return self.detector_.coef_

    @property
    def intercept_(self):
        """ Constant in the decision function.
        Decorator for scikit-learn One class SVM attributes.
        """
        return self.detector_.intercept_