/ examples / iforest_example.py
iforest_example.py
 1  # -*- coding: utf-8 -*-
 2  """Example of using Isolation Forest for outlier detection
 3  """
 4  # Author: Yue Zhao <zhaoy@cmu.edu>
 5  # License: BSD 2 clause
 6  
 7  from __future__ import division
 8  from __future__ import print_function
 9  
10  import os
11  import sys
12  
# Temporary solution for relative imports in case pyod is not installed.
# If pyod is installed, the following line is not needed.
# NOTE: ``__file__`` must be the module attribute, not the string literal
# "__file__"; with the literal, ``os.path.dirname`` always returns '' and the
# appended path depends on the current working directory instead of on the
# location of this script.
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
17  
18  from pyod.models.iforest import IForest
19  from pyod.utils.data import generate_data
20  
21  from pyod.utils.data import evaluate_print
22  from pyod.utils.example import visualize
23  
if __name__ == "__main__":
    # Settings of the synthetic data set. The same contamination value is
    # also handed to the detector below so that the two stay in sync when
    # this value is edited.
    contamination = 0.1  # proportion of outliers (0.1 means 10%)
    n_train = 200  # number of training points
    n_test = 100  # number of testing points

    # Generate sample data
    X_train, X_test, y_train, y_test = \
        generate_data(n_train=n_train,
                      n_test=n_test,
                      n_features=2,
                      contamination=contamination,
                      random_state=42)

    # train IForest detector; pass the contamination explicitly instead of
    # relying on the detector's own default value
    clf_name = 'IForest'
    clf = IForest(contamination=contamination)
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)

    # example of the feature importance
    feature_importance = clf.feature_importances_
    print("Feature importance", feature_importance)

    # visualize the results (figure is shown on screen, not saved to disk)
    visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred,
              y_test_pred, show_figure=True, save_figure=False)