/ examples / comb_example.py
comb_example.py
 1  # -*- coding: utf-8 -*-
"""Example of combining multiple base outlier scores. Five combination
frameworks are demonstrated:

1. Average: take the average of all base detectors
2. Maximization: take the maximum score across all detectors as the score
3. Median: take the median score across all detectors as the score
4. Average of Maximum (AOM)
5. Maximum of Average (MOA)
"""
10  # Author: Yue Zhao <zhaoy@cmu.edu>
11  # License: BSD 2 clause
12  
13  from __future__ import division
14  from __future__ import print_function
15  
16  import os
17  import sys
18  
# Temporary solution for relative imports in case pyod is not installed;
# if pyod is installed, the following lines are not needed.
# The directory above this file is appended to sys.path. It is resolved from
# the real ``__file__`` (not the string literal "__file__", whose dirname is
# always empty) so the example works from any working directory.
try:
    _example_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    # ``__file__`` is undefined in some interactive sessions; fall back to
    # the current working directory, which is what the old code always used.
    _example_dir = os.getcwd()
sys.path.append(os.path.abspath(os.path.join(_example_dir, '..')))
23  
24  import numpy as np
25  from sklearn.model_selection import train_test_split
26  from pyod.models.knn import KNN
27  from pyod.models.combination import aom, moa, average, maximization, median
28  from pyod.utils.utility import standardizer
29  from pyod.utils.data import generate_data
30  from pyod.utils.data import evaluate_print
31  
if __name__ == "__main__":

    # Read X and y from the data file (looked up under ./data relative to
    # the current working directory); generate data if the file is missing.
    csv_file = 'cardio.csv'
    try:
        data = np.genfromtxt(os.path.join('data', csv_file), delimiter=',',
                             skip_header=1)

    except IOError:
        print('{data_file} does not exist. Use generated data'.format(
            data_file=csv_file))
        X, y = generate_data(train_only=True)  # load data
    else:
        # The last column is used as the label, the rest as features
        X = data[:, :-1]
        y = data[:, -1].astype(int)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

    # standardizing data for processing
    X_train_norm, X_test_norm = standardizer(X_train, X_test)

    # One kNN base detector per neighborhood size: k = 10, 20, ..., 200
    k_list = list(range(10, 201, 10))

    # Number of base detectors; derived from k_list so the score matrices
    # below can never get out of sync with the list of detectors.
    n_clf = len(k_list)

    # Column i holds the outlier scores produced by the i-th detector
    train_scores = np.zeros([X_train.shape[0], n_clf])
    test_scores = np.zeros([X_test.shape[0], n_clf])

    print('Combining {n_clf} kNN detectors'.format(n_clf=n_clf))

    for i, k in enumerate(k_list):
        clf = KNN(n_neighbors=k, method='largest')
        clf.fit(X_train_norm)

        train_scores[:, i] = clf.decision_scores_
        test_scores[:, i] = clf.decision_function(X_test_norm)

    # Decision scores have to be normalized before combination. Only the
    # normalized test scores are combined and evaluated below.
    train_scores_norm, test_scores_norm = standardizer(train_scores,
                                                       test_scores)

    # (report label, combination function, extra keyword arguments).
    # The entries are evaluated strictly in this order.
    combiners = [
        ('Combination by Average', average, {}),
        ('Combination by Maximization', maximization, {}),
        ('Combination by Median', median, {}),
        ('Combination by AOM', aom, {'n_buckets': 5}),
        ('Combination by MOA', moa, {'n_buckets': 5}),
    ]

    for label, combine, kwargs in combiners:
        y_combined = combine(test_scores_norm, **kwargs)
        evaluate_print(label, y_test, y_combined)