# iforest_example.py
1 # -*- coding: utf-8 -*- 2 """Example of using Isolation Forest for outlier detection 3 """ 4 # Author: Yue Zhao <zhaoy@cmu.edu> 5 # License: BSD 2 clause 6 7 from __future__ import division 8 from __future__ import print_function 9 10 import os 11 import sys 12 13 # temporary solution for relative imports in case pyod is not installed 14 # if pyod is installed, no need to use the following line 15 sys.path.append( 16 os.path.abspath(os.path.join(os.path.dirname("__file__"), '..'))) 17 18 from pyod.models.iforest import IForest 19 from pyod.utils.data import generate_data 20 21 from pyod.utils.data import evaluate_print 22 from pyod.utils.example import visualize 23 24 if __name__ == "__main__": 25 contamination = 0.1 # percentage of outliers 26 n_train = 200 # number of training points 27 n_test = 100 # number of testing points 28 29 # Generate sample data 30 X_train, X_test, y_train, y_test = \ 31 generate_data(n_train=n_train, 32 n_test=n_test, 33 n_features=2, 34 contamination=contamination, 35 random_state=42) 36 37 # train IForest detector 38 clf_name = 'IForest' 39 clf = IForest() 40 clf.fit(X_train) 41 42 # get the prediction labels and outlier scores of the training data 43 y_train_pred = clf.labels_ # binary labels (0: inliers, 1: outliers) 44 y_train_scores = clf.decision_scores_ # raw outlier scores 45 46 # get the prediction on the test data 47 y_test_pred = clf.predict(X_test) # outlier labels (0 or 1) 48 y_test_scores = clf.decision_function(X_test) # outlier scores 49 50 # evaluate and print the results 51 print("\nOn Training Data:") 52 evaluate_print(clf_name, y_train, y_train_scores) 53 print("\nOn Test Data:") 54 evaluate_print(clf_name, y_test, y_test_scores) 55 56 # example of the feature importance 57 feature_importance = clf.feature_importances_ 58 print("Feature importance", feature_importance) 59 60 # visualize the results 61 visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred, 62 y_test_pred, show_figure=True, 
save_figure=False)