# copod_interpretability.py
1 # -*- coding: utf-8 -*- 2 """Example of using Copula Based Outlier Detector (COPOD) for outlier detection 3 Sample wise interpretation is provided here. 4 """ 5 # Author: Winston Li <jk_zhengli@hotmail.com> 6 # License: BSD 2 clause 7 8 from __future__ import division 9 from __future__ import print_function 10 11 import os 12 import sys 13 14 # temporary solution for relative imports in case pyod is not installed 15 # if pyod is installed, no need to use the following line 16 sys.path.append( 17 os.path.abspath(os.path.join(os.path.dirname("__file__"), '..'))) 18 19 import numpy as np 20 from sklearn.model_selection import train_test_split 21 22 from pyod.models.copod import COPOD 23 from pyod.utils.utility import standardizer 24 25 if __name__ == "__main__": 26 # Define data file and read X and y 27 csv_file = 'cardio.csv' 28 data = np.genfromtxt(os.path.join('data', csv_file), delimiter=',', 29 skip_header=1) 30 X = data[:, :-1] 31 y = data[:, -1].astype(int) 32 33 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, 34 random_state=1) 35 36 # standardizing data for processing 37 X_train_norm, X_test_norm = standardizer(X_train, X_test) 38 39 # train COPOD detector 40 clf_name = 'COPOD' 41 clf = COPOD() 42 43 # you could try parallel version as well. 44 # clf = COPOD(n_jobs=2) 45 clf.fit(X_train) 46 47 # get the prediction labels and outlier scores of the training data 48 y_train_pred = clf.labels_ # binary labels (0: inliers, 1: outliers) 49 y_train_scores = clf.decision_scores_ # raw outlier scores 50 51 print('The first sample is an outlier', y_train[0]) 52 clf.explain_outlier(0) 53 54 # we could see feature 7, 16, and 20 is above the 0.99 cutoff 55 # and play a more important role in deciding it is an outlier.