# ocsvm.py
# -*- coding: utf-8 -*-
"""One-class SVM detector. Implemented on scikit-learn library.
"""
# Author: Yue Zhao <yzhao062@gmail.com>
# License: BSD 2 clause


from sklearn.svm import OneClassSVM
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted

from .base import BaseDetector
from ..utils.utility import invert_order


class OCSVM(BaseDetector):
    """Wrapper of scikit-learn one-class SVM Class with more functionalities.
    Unsupervised Outlier Detection.

    Estimate the support of a high-dimensional distribution.

    The implementation is based on libsvm.
    See http://scikit-learn.org/stable/modules/svm.html#svm-outlier-detection
    and :cite:`scholkopf2001estimating`.

    Parameters
    ----------
    kernel : string, optional (default='rbf')
        Specifies the kernel type to be used in the algorithm.
        It must be one of 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' or
        a callable.
        If none is given, 'rbf' will be used. If a callable is given it is
        used to precompute the kernel matrix.

    nu : float, optional
        An upper bound on the fraction of training
        errors and a lower bound of the fraction of support
        vectors. Should be in the interval (0, 1]. By default 0.5
        will be taken.

    degree : int, optional (default=3)
        Degree of the polynomial kernel function ('poly').
        Ignored by all other kernels.

    gamma : float, optional (default='auto')
        Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.
        If gamma is 'auto' then 1/n_features will be used instead.

    coef0 : float, optional (default=0.0)
        Independent term in kernel function.
        It is only significant in 'poly' and 'sigmoid'.

    tol : float, optional
        Tolerance for stopping criterion.

    shrinking : bool, optional
        Whether to use the shrinking heuristic.

    cache_size : float, optional
        Specify the size of the kernel cache (in MB).

    verbose : bool, default: False
        Enable verbose output. Note that this setting takes advantage of a
        per-process runtime setting in libsvm that, if enabled, may not work
        properly in a multithreaded context.

    max_iter : int, optional (default=-1)
        Hard limit on iterations within solver, or -1 for no limit.

    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set, i.e.
        the proportion of outliers in the data set. Used when fitting to
        define the threshold on the decision function.


    Attributes
    ----------
    support_ : array-like, shape = [n_SV]
        Indices of support vectors.

    support_vectors_ : array-like, shape = [nSV, n_features]
        Support vectors.

    dual_coef_ : array, shape = [1, n_SV]
        Coefficients of the support vectors in the decision function.

    coef_ : array, shape = [1, n_features]
        Weights assigned to the features (coefficients in the primal
        problem). This is only available in the case of a linear kernel.

        `coef_` is readonly property derived from `dual_coef_` and
        `support_vectors_`

    intercept_ : array, shape = [1,]
        Constant in the decision function.

    decision_scores_ : numpy array of shape (n_samples,)
        The outlier scores of the training data.
        The higher, the more abnormal. Outliers tend to have higher
        scores. This value is available once the detector is fitted.

    threshold_ : float
        The threshold is based on ``contamination``. It is the
        ``n_samples * contamination`` most abnormal samples in
        ``decision_scores_``. The threshold is calculated for generating
        binary outlier labels.

    labels_ : int, either 0 or 1
        The binary labels of the training data. 0 stands for inliers
        and 1 for outliers/anomalies. It is generated by applying
        ``threshold_`` on ``decision_scores_``.
    """

    def __init__(self, kernel='rbf', degree=3, gamma='auto', coef0=0.0,
                 tol=1e-3, nu=0.5, shrinking=True, cache_size=200,
                 verbose=False, max_iter=-1, contamination=0.1):
        super(OCSVM, self).__init__(contamination=contamination)
        # Store hyperparameters as-is (sklearn convention: no validation or
        # mutation in __init__; the wrapped estimator validates at fit time).
        self.kernel = kernel
        self.degree = degree
        self.gamma = gamma
        self.coef0 = coef0
        self.tol = tol
        self.nu = nu
        self.shrinking = shrinking
        self.cache_size = cache_size
        self.verbose = verbose
        self.max_iter = max_iter

    def fit(self, X, y=None, sample_weight=None, **params):
        """Fit detector. y is ignored in unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : Ignored
            Not used, present for API consistency by convention.

        sample_weight : array-like, shape (n_samples,)
            Per-sample weights. Rescale C per sample. Higher weights
            force the classifier to put more emphasis on these points.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        # validate inputs X and y (optional)
        X = check_array(X)
        self._set_n_classes(y)

        self.detector_ = OneClassSVM(kernel=self.kernel,
                                     degree=self.degree,
                                     gamma=self.gamma,
                                     coef0=self.coef0,
                                     tol=self.tol,
                                     nu=self.nu,
                                     shrinking=self.shrinking,
                                     cache_size=self.cache_size,
                                     verbose=self.verbose,
                                     max_iter=self.max_iter)
        self.detector_.fit(X=X, y=y, sample_weight=sample_weight,
                           **params)

        # invert decision_scores_. Outliers comes with higher outlier scores
        self.decision_scores_ = invert_order(
            self.detector_.decision_function(X))
        self._process_decision_scores()
        return self

    def decision_function(self, X):
        """Predict raw anomaly score of X using the fitted detector.

        The anomaly score of an input sample is computed based on different
        detector algorithms. For consistency, outliers are assigned with
        larger anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The training input samples. Sparse matrices are accepted only
            if they are supported by the base estimator.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])
        # Invert outlier scores. Outliers comes with higher outlier scores
        return invert_order(self.detector_.decision_function(X))

    @property
    def support_(self):
        """Indices of support vectors.
        Decorator for scikit-learn One class SVM attributes.
        """
        return self.detector_.support_

    @property
    def support_vectors_(self):
        """Support vectors.
        Decorator for scikit-learn One class SVM attributes.
        """
        return self.detector_.support_vectors_

    @property
    def dual_coef_(self):
        """Coefficients of the support vectors in the decision function.
        Decorator for scikit-learn One class SVM attributes.
        """
        return self.detector_.dual_coef_

    @property
    def coef_(self):
        """Weights assigned to the features (coefficients in the primal
        problem). This is only available in the case of a linear kernel.
        `coef_` is readonly property derived from `dual_coef_` and
        `support_vectors_`
        Decorator for scikit-learn One class SVM attributes.
        """
        return self.detector_.coef_

    @property
    def intercept_(self):
        """ Constant in the decision function.
        Decorator for scikit-learn One class SVM attributes.
        """
        return self.detector_.intercept_