/ examples / copod_interpretability.py
copod_interpretability.py
 1  # -*- coding: utf-8 -*-
 2  """Example of using Copula Based Outlier Detector (COPOD) for outlier detection
 3  Sample wise interpretation is provided here.
 4  """
 5  # Author: Winston Li <jk_zhengli@hotmail.com>
 6  # License: BSD 2 clause
 7  
 8  from __future__ import division
 9  from __future__ import print_function
10  
11  import os
12  import sys
13  
# Temporary solution for relative imports in case pyod is not installed;
# if pyod is installed, the following statement is not needed.
# NOTE: ``__file__`` must be the module attribute, not the string literal
# "__file__": os.path.dirname("__file__") is always '' and would resolve '..'
# against the current working directory instead of this script's location.
sys.path.append(
    os.path.abspath(
        os.path.join(os.path.dirname(os.path.abspath(__file__)), '..')))
18  
19  import numpy as np
20  from sklearn.model_selection import train_test_split
21  
22  from pyod.models.copod import COPOD
23  from pyod.utils.utility import standardizer
24  
if __name__ == "__main__":
    # Define data file and read X and y: every column but the last is a
    # feature, the last column is the label (cast to int).
    csv_file = 'cardio.csv'
    data = np.genfromtxt(os.path.join('data', csv_file), delimiter=',',
                         skip_header=1)
    X = data[:, :-1]
    y = data[:, -1].astype(int)

    # Hold out 40% of the samples; the fixed random_state keeps the split
    # (and therefore which sample sits at index 0) reproducible.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4,
                                                        random_state=1)

    # standardizing data for processing
    X_train_norm, X_test_norm = standardizer(X_train, X_test)

    # train COPOD detector on the standardized training data prepared above
    # (previously the standardized matrices were computed but never used)
    clf = COPOD()

    # you could try parallel version as well.
    # clf = COPOD(n_jobs=2)
    clf.fit(X_train_norm)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # Report what is known about the sample being explained instead of
    # unconditionally claiming that it is an outlier.
    print('Ground-truth label of the first training sample:', y_train[0])
    print('Predicted label of the first training sample:', y_train_pred[0])
    print('Outlier score of the first training sample:', y_train_scores[0])

    # explain the first training sample (index 0) feature by feature
    clf.explain_outlier(0)

    # we could see features 7, 16, and 20 are above the 0.99 cutoff
    # and play a more important role in deciding it is an outlier.