test_suod.py
# -*- coding: utf-8 -*-
"""Unit tests for the SUOD detector exposed by ``pyod.models.suod``."""

import os
import sys
import unittest
from os import path

import numpy as np
# noinspection PyProtectedMember
from numpy.testing import assert_equal
from numpy.testing import assert_raises
from sklearn.base import clone
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.utils.validation import check_X_y

# temporary solution for relative imports in case pyod is not installed
# if pyod is installed, no need to use the following line
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from pyod.models.suod import SUOD
from pyod.models.lof import LOF
from pyod.models.iforest import IForest
from pyod.models.copod import COPOD
from pyod.utils.data import generate_data


def _load_cardio_data():
    """Load the feature matrix and labels used by the tests.

    Reads ``data/cardio.csv`` located next to this file (first row skipped
    as a header, last column used as the integer label). If the file cannot
    be read, a message is printed and data from ``generate_data`` is
    returned instead.

    Returns
    -------
    X, y
        Feature matrix and label vector.
    """
    this_directory = path.abspath(path.dirname(__file__))
    csv_file = 'cardio.csv'
    try:
        data = np.genfromtxt(
            path.join(this_directory, 'data', csv_file),
            delimiter=',', skip_header=1)
    except IOError:
        print('{data_file} does not exist. Use generated data'.format(
            data_file=csv_file))
        X, y = generate_data(train_only=True)  # load data
        return X, y

    # Only data read from disk is validated, as in the original flow.
    X = data[:, :-1]
    y = data[:, -1].astype(int)
    X, y = check_X_y(X, y)
    return X, y


class TestSUOD(unittest.TestCase):
    """Exercise fitting, scoring and prediction of a SUOD ensemble."""

    def setUp(self):
        """Split the data 60/40 and fit a SUOD model on the training part."""
        X, y = _load_cardio_data()

        self.X_train, self.X_test, self.y_train, self.y_test = \
            train_test_split(X, y, test_size=0.4, random_state=42)

        self.base_estimators = [LOF(), LOF(), IForest(), COPOD()]
        self.clf = SUOD(base_estimators=self.base_estimators)
        self.clf.fit(self.X_train)
        # Minimum ROC AUC accepted on the held-out split.
        self.roc_floor = 0.7

    def test_parameters(self):
        """The fitted model exposes its fitted attributes with values."""
        assert (hasattr(self.clf, 'decision_scores_') and
                self.clf.decision_scores_ is not None)
        assert (hasattr(self.clf, 'labels_') and
                self.clf.labels_ is not None)
        assert (hasattr(self.clf, 'threshold_') and
                self.clf.threshold_ is not None)
        assert (hasattr(self.clf, '_mu') and
                self.clf._mu is not None)
        assert (hasattr(self.clf, '_sigma') and
                self.clf._sigma is not None)
        assert (hasattr(self.clf, 'model_') and
                self.clf.model_ is not None)

    def test_train_scores(self):
        """One training score is stored per training sample."""
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        """Test scores have one entry per sample and reach the ROC floor."""
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert (roc_auc_score(self.y_test, pred_scores) >= self.roc_floor)

    def test_prediction_labels(self):
        """Predicted labels have the same shape as the true labels."""
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        """Default probabilities lie within [0, 1]."""
        pred_proba = self.clf.predict_proba(self.X_test)
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_linear(self):
        """Probabilities from the 'linear' method lie within [0, 1]."""
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_unify(self):
        """Probabilities from the 'unify' method lie within [0, 1]."""
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_parameter(self):
        """An unknown probability method raises ``ValueError``."""
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_prediction_labels_confidence(self):
        """``predict`` can also return a per-sample confidence in [0, 1]."""
        pred_labels, confidence = self.clf.predict(self.X_test,
                                                   return_confidence=True)
        assert_equal(pred_labels.shape, self.y_test.shape)
        assert_equal(confidence.shape, self.y_test.shape)
        assert (confidence.min() >= 0)
        assert (confidence.max() <= 1)

    def test_prediction_proba_linear_confidence(self):
        """``predict_proba`` can also return a confidence in [0, 1]."""
        pred_proba, confidence = self.clf.predict_proba(self.X_test,
                                                        method='linear',
                                                        return_confidence=True)
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

        assert_equal(confidence.shape, self.y_test.shape)
        assert (confidence.min() >= 0)
        assert (confidence.max() <= 1)

    def test_prediction_with_rejection(self):
        """Labels from rejection-aware prediction match the label shape."""
        pred_labels = self.clf.predict_with_rejection(self.X_test,
                                                      return_stats=False)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_with_rejection_stats(self):
        """Rejection statistics fall inside their expected ranges."""
        _, [expected_rejrate, ub_rejrate,
            ub_cost] = self.clf.predict_with_rejection(self.X_test,
                                                       return_stats=True)
        assert (expected_rejrate >= 0)
        assert (expected_rejrate <= 1)
        assert (ub_rejrate >= 0)
        assert (ub_rejrate <= 1)
        assert (ub_cost >= 0)

    def test_fit_predict(self):
        """``fit_predict`` returns one label per training sample."""
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        """Supported scorers run; an unknown scorer is rejected."""
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    # NOTE(review): the rank tests below were already disabled in the
    # original file; kept for reference, reason for disabling not recorded.
    #
    # def test_predict_rank(self):
    #     pred_socres = self.clf.decision_function(self.X_test)
    #     pred_ranks = self.clf._predict_rank(self.X_test)
    #
    #     # assert the order is reserved
    #     # assert_allclose(rankdata(pred_ranks), rankdata(pred_socres), atol=3)
    #     assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
    #     assert_array_less(-0.1, pred_ranks)
    #
    # def test_predict_rank_normalized(self):
    #     pred_socres = self.clf.decision_function(self.X_test)
    #     pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)
    #
    #     # assert the order is reserved
    #     # assert_allclose(rankdata(pred_ranks), rankdata(pred_socres), atol=3)
    #     assert_array_less(pred_ranks, 1.01)
    #     assert_array_less(-0.1, pred_ranks)

    def test_model_clone(self):
        """``sklearn.base.clone`` yields a separate SUOD instance."""
        clone_clf = clone(self.clf)
        assert isinstance(clone_clf, SUOD)
        assert clone_clf is not self.clf

    def test_default_njobs(self):
        """SUOD built with only ``n_jobs=2`` fits and scores every sample.

        No ``base_estimators`` are passed, so the model relies on whatever
        SUOD uses when none are given. A local model is used so the
        fixtures created in ``setUp`` are left untouched.
        """
        clf = SUOD(n_jobs=2)
        clf.fit(self.X_train)
        assert_equal(len(clf.decision_scores_), self.X_train.shape[0])

    def tearDown(self):
        """Nothing to clean up."""
        pass


if __name__ == '__main__':
    unittest.main()