# comb_example.py
1 # -*- coding: utf-8 -*- 2 """Example of combining multiple base outlier scores. Four combination 3 frameworks are demonstrated: 4 5 1. Average: take the average of all base detectors 6 2. maximization : take the maximum score across all detectors as the score 7 3. Average of Maximum (AOM) 8 4. Maximum of Average (MOA) 9 """ 10 # Author: Yue Zhao <zhaoy@cmu.edu> 11 # License: BSD 2 clause 12 13 from __future__ import division 14 from __future__ import print_function 15 16 import os 17 import sys 18 19 # temporary solution for relative imports in case pyod is not installed 20 # if pyod is installed, no need to use the following line 21 sys.path.append( 22 os.path.abspath(os.path.join(os.path.dirname("__file__"), '..'))) 23 24 import numpy as np 25 from sklearn.model_selection import train_test_split 26 from pyod.models.knn import KNN 27 from pyod.models.combination import aom, moa, average, maximization, median 28 from pyod.utils.utility import standardizer 29 from pyod.utils.data import generate_data 30 from pyod.utils.data import evaluate_print 31 32 if __name__ == "__main__": 33 34 # Define data file and read X and y 35 # Generate some data if the source data is missing 36 csv_file = 'cardio.csv' 37 try: 38 data = np.genfromtxt(os.path.join('data', csv_file), delimiter=',', 39 skip_header=1) 40 41 except IOError: 42 print('{data_file} does not exist. 
Use generated data'.format( 43 data_file=csv_file)) 44 X, y = generate_data(train_only=True) # load data 45 else: 46 X = data[:, :-1] 47 y = data[:, -1].astype(int) 48 49 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4) 50 51 # standardizing data for processing 52 X_train_norm, X_test_norm = standardizer(X_train, X_test) 53 54 n_clf = 20 # number of base detectors 55 56 # Initialize 20 base detectors for combination 57 k_list = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 58 150, 160, 170, 180, 190, 200] 59 60 train_scores = np.zeros([X_train.shape[0], n_clf]) 61 test_scores = np.zeros([X_test.shape[0], n_clf]) 62 63 print('Combining {n_clf} kNN detectors'.format(n_clf=n_clf)) 64 65 for i in range(n_clf): 66 k = k_list[i] 67 68 clf = KNN(n_neighbors=k, method='largest') 69 clf.fit(X_train_norm) 70 71 train_scores[:, i] = clf.decision_scores_ 72 test_scores[:, i] = clf.decision_function(X_test_norm) 73 74 # Decision scores have to be normalized before combination 75 train_scores_norm, test_scores_norm = standardizer(train_scores, 76 test_scores) 77 # Combination by average 78 y_by_average = average(test_scores_norm) 79 evaluate_print('Combination by Average', y_test, y_by_average) 80 81 # Combination by max 82 y_by_maximization = maximization(test_scores_norm) 83 evaluate_print('Combination by Maximization', y_test, y_by_maximization) 84 85 # Combination by median 86 y_by_median = median(test_scores_norm) 87 evaluate_print('Combination by Median', y_test, y_by_median) 88 89 # Combination by aom 90 y_by_aom = aom(test_scores_norm, n_buckets=5) 91 evaluate_print('Combination by AOM', y_test, y_by_aom) 92 93 # Combination by moa 94 y_by_moa = moa(test_scores_norm, n_buckets=5) 95 evaluate_print('Combination by MOA', y_test, y_by_moa)