/ examples / xgbod_example.py
xgbod_example.py
 1  # -*- coding: utf-8 -*-
 2  """Example of using XGBOD for outlier detection
 3  """
 4  # Author: Yue Zhao <zhaoy@cmu.edu>
 5  # License: BSD 2 clause
 6  
 7  from __future__ import division
 8  from __future__ import print_function
 9  
10  import os
11  import sys
12  
# Temporary solution for relative imports in case pyod is not installed:
# make the directory above this example importable.
# If pyod is installed, the following lines are not needed.
try:
    # Resolve against this file's real location so the fallback works no
    # matter which directory the script is launched from.
    _example_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    # __file__ is undefined when the code is pasted into an interactive
    # session; fall back to the current working directory.
    _example_dir = os.getcwd()
sys.path.append(os.path.abspath(os.path.join(_example_dir, '..')))
17  
18  import numpy as np
19  from sklearn.model_selection import train_test_split
20  from sklearn.utils.validation import check_X_y
21  
22  from pyod.models.xgbod import XGBOD
23  from pyod.utils.data import generate_data
24  from pyod.utils.data import evaluate_print
25  
if __name__ == "__main__":
    # Load X and y from the CSV file under ./data; when the file cannot be
    # read, fall back to generated data instead.
    csv_file = 'cardio.csv'
    csv_path = os.path.join('data', csv_file)
    try:
        data = np.genfromtxt(csv_path, delimiter=',', skip_header=1)
    except IOError:
        print('{data_file} does not exist. Use generated data'.format(
            data_file=csv_file))
        X, y = generate_data(train_only=True)  # generated fallback data
    else:
        # The last column holds the labels; every other column is a feature.
        X, y = check_X_y(data[:, :-1], data[:, -1].astype(int))

    # Hold out 40% of the samples for testing (fixed seed for repeatability).
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.4, random_state=42)

    # Train the XGBOD detector.
    clf_name = 'XGBOD'
    clf = XGBOD(random_state=42)
    clf.fit(X_train, y_train)

    # Prediction labels and outlier scores of the training data.
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # Predictions on the test data.
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # Evaluate and print the results, training split first.
    reports = (
        ("\nOn Training Data:", y_train, y_train_scores),
        ("\nOn Test Data:", y_test, y_test_scores),
    )
    for heading, y_true, scores in reports:
        print(heading)
        evaluate_print(clf_name, y_true, scores)