# test_data.py
# -*- coding: utf-8 -*-
"""Unit tests for the data helpers in ``pyod.utils.data``."""

import os
import sys
import unittest

import numpy as np
# noinspection PyProtectedMember
from numpy.testing import assert_allclose
from numpy.testing import assert_equal
from numpy.testing import assert_raises

# temporary solution for relative imports in case pyod is not installed
# if pyod is installed, no need to use the following line
# NOTE: the path tweak must run before *any* pyod import to have an effect.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from pyod.utils.data import generate_data
from pyod.utils.data import generate_data_categorical
from pyod.utils.data import evaluate_print
from pyod.utils.data import get_outliers_inliers
from pyod.utils.data import check_consistent_shape
from pyod.utils.data import generate_data_clusters


class TestData(unittest.TestCase):
    """Exercise the synthetic-data generators and related helpers."""

    def setUp(self):
        """Define the sizes, contamination rate and seed shared by the tests."""
        self.n_train = 1000
        self.n_test = 500
        self.contamination = 0.1
        self.n_samples = 1000
        self.test_size = 0.2
        self.value_lists = [0.1, 0.3, 0.2, -2, 1.5, 0, 1, -1, -0.5, 11]
        self.random_state = 42

    def _assert_categorical_raises(self, **overrides):
        """Assert ``generate_data_categorical`` raises ``ValueError``.

        The call starts from a fixed keyword set (the one used by most cases
        in ``test_data_generate_categorical5``) and applies ``overrides`` on
        top, so each caller only spells out the argument(s) under test.
        """
        kwargs = dict(n_train=self.n_train, n_test=self.n_test,
                      n_category_in=5, n_category_out=3,
                      n_informative=1, n_features=1,
                      contamination=self.contamination,
                      random_state=self.random_state)
        kwargs.update(overrides)
        with assert_raises(ValueError):
            generate_data_categorical(**kwargs)

    def test_data_generate(self):
        """Default ``generate_data`` output has the expected shapes and
        outlier ratio in both the train and the test split."""
        X_train, X_test, y_train, y_test = \
            generate_data(n_train=self.n_train,
                          n_test=self.n_test,
                          contamination=self.contamination)

        assert_equal(y_train.shape[0], X_train.shape[0])
        assert_equal(y_test.shape[0], X_test.shape[0])

        self.assertLessEqual(self.n_train - X_train.shape[0], 1)
        assert_equal(X_train.shape[1], 2)

        self.assertLessEqual(self.n_test - X_test.shape[0], 1)
        assert_equal(X_test.shape[1], 2)

        out_perc = np.sum(y_train) / self.n_train
        assert_allclose(self.contamination, out_perc, atol=0.01)

        out_perc = np.sum(y_test) / self.n_test
        assert_allclose(self.contamination, out_perc, atol=0.01)

    def test_data_generate2(self):
        """``n_features`` controls the number of generated columns."""
        X_train, X_test, y_train, y_test = \
            generate_data(n_train=self.n_train,
                          n_test=self.n_test,
                          n_features=3,
                          contamination=self.contamination)
        # shapes are integer tuples, so compare them exactly
        assert_equal(X_train.shape, (self.n_train, 3))
        assert_equal(X_test.shape, (self.n_test, 3))

    def test_data_generate3(self):
        """Two calls with the same ``random_state`` return identical data."""
        X_train, X_test, y_train, y_test = \
            generate_data(n_train=self.n_train,
                          n_test=self.n_test,
                          n_features=2,
                          contamination=self.contamination,
                          random_state=42)

        X_train2, X_test2, y_train2, y_test2 = \
            generate_data(n_train=self.n_train,
                          n_test=self.n_test,
                          n_features=2,
                          contamination=self.contamination,
                          random_state=42)

        assert_allclose(X_train, X_train2)
        assert_allclose(X_test, X_test2)
        assert_allclose(y_train, y_train2)
        assert_allclose(y_test, y_test2)

    def test_data_generate_cluster(self):
        """``generate_data_clusters`` output has the expected shapes and
        overall (train + test) outlier ratio."""
        X_train, X_test, y_train, y_test = \
            generate_data_clusters(n_train=self.n_train,
                                   n_test=self.n_test,
                                   n_features=2,
                                   contamination=self.contamination,
                                   random_state=self.random_state)

        assert_equal(y_train.shape[0], X_train.shape[0])
        assert_equal(y_test.shape[0], X_test.shape[0])

        self.assertLessEqual(self.n_train - X_train.shape[0], 1)
        assert_equal(X_train.shape[1], 2)

        self.assertLessEqual(self.n_test - X_test.shape[0], 1)
        assert_equal(X_test.shape[1], 2)

        out_perc = (np.sum(y_train) + np.sum(y_test)) / (
                self.n_train + self.n_test)
        assert_allclose(self.contamination, out_perc, atol=0.01)

    def test_data_generate_cluster2(self):
        """``n_features`` controls the number of generated columns."""
        X_train, X_test, y_train, y_test = \
            generate_data_clusters(n_train=self.n_train,
                                   n_test=self.n_test,
                                   n_features=4,
                                   contamination=self.contamination,
                                   random_state=self.random_state)

        assert_equal(X_train.shape, (self.n_train, 4))
        assert_equal(X_test.shape, (self.n_test, 4))

    def test_data_generate_cluster3(self):
        """Two calls with the same ``random_state`` return identical data."""
        X_train, X_test, y_train, y_test = \
            generate_data_clusters(n_train=self.n_train,
                                   n_test=self.n_test,
                                   n_features=3,
                                   contamination=self.contamination,
                                   random_state=self.random_state)

        X_train2, X_test2, y_train2, y_test2 = \
            generate_data_clusters(n_train=self.n_train,
                                   n_test=self.n_test,
                                   n_features=3,
                                   contamination=self.contamination,
                                   random_state=self.random_state)

        assert_allclose(X_train, X_train2)
        assert_allclose(X_test, X_test2)
        assert_allclose(y_train, y_train2)
        assert_allclose(y_test, y_test2)

    def test_data_generate_cluster5(self):
        """Non-numeric / unknown argument values raise ``ValueError``."""
        with assert_raises(ValueError):
            generate_data_clusters(n_train=self.n_train,
                                   n_test=self.n_test,
                                   n_features=3,
                                   n_clusters='e',
                                   contamination=self.contamination,
                                   random_state=self.random_state)

        with assert_raises(ValueError):
            generate_data_clusters(n_train=self.n_train,
                                   n_test=self.n_test,
                                   n_features='e',
                                   contamination=self.contamination,
                                   random_state=self.random_state)

        with assert_raises(ValueError):
            generate_data_clusters(n_train=self.n_train,
                                   n_test=self.n_test,
                                   n_features=3,
                                   contamination='e',
                                   random_state=self.random_state)

        with assert_raises(ValueError):
            generate_data_clusters(n_train=self.n_train,
                                   n_test=self.n_test,
                                   n_features=3,
                                   contamination=self.contamination,
                                   dist='e',
                                   random_state=self.random_state)

    def test_data_generate_cluster6(self):
        """Shapes and outlier ratio still hold with ``size='different'``
        and ``density='different'``."""
        X_train, X_test, y_train, y_test = \
            generate_data_clusters(n_train=self.n_train,
                                   n_test=self.n_test,
                                   n_features=2,
                                   size='different',
                                   density='different',
                                   contamination=self.contamination,
                                   random_state=self.random_state)

        assert_equal(y_train.shape[0], X_train.shape[0])
        assert_equal(y_test.shape[0], X_test.shape[0])

        self.assertLessEqual(self.n_train - X_train.shape[0], 1)
        assert_equal(X_train.shape[1], 2)

        self.assertLessEqual(self.n_test - X_test.shape[0], 1)
        assert_equal(X_test.shape[1], 2)

        out_perc = (np.sum(y_train) + np.sum(y_test)) / (
                self.n_train + self.n_test)
        assert_allclose(self.contamination, out_perc, atol=0.01)

    def test_data_generate_categorical(self):
        """``generate_data_categorical`` output has the expected shapes and
        overall (train + test) outlier ratio."""
        X_train, X_test, y_train, y_test = \
            generate_data_categorical(n_train=self.n_train,
                                      n_test=self.n_test,
                                      n_features=2,
                                      contamination=self.contamination,
                                      random_state=self.random_state)

        assert_equal(y_train.shape[0], X_train.shape[0])
        assert_equal(y_test.shape[0], X_test.shape[0])

        self.assertLessEqual(self.n_train - X_train.shape[0], 1)
        assert_equal(X_train.shape[1], 2)

        self.assertLessEqual(self.n_test - X_test.shape[0], 1)
        assert_equal(X_test.shape[1], 2)

        out_perc = (np.sum(y_train) + np.sum(y_test)) / (
                self.n_train + self.n_test)
        assert_allclose(self.contamination, out_perc, atol=0.01)

    def test_data_generate_categorical2(self):
        """``n_features`` controls the number of generated columns."""
        X_train, X_test, y_train, y_test = \
            generate_data_categorical(n_train=self.n_train,
                                      n_test=self.n_test,
                                      n_features=4,
                                      contamination=self.contamination,
                                      random_state=self.random_state)

        assert_equal(X_train.shape, (self.n_train, 4))
        assert_equal(X_test.shape, (self.n_test, 4))

    def test_data_generate_categorical3(self):
        """Two calls with the same ``random_state`` return identical data."""
        X_train, X_test, y_train, y_test = \
            generate_data_categorical(n_train=self.n_train,
                                      n_test=self.n_test,
                                      n_features=3,
                                      contamination=self.contamination,
                                      random_state=self.random_state)

        X_train2, X_test2, y_train2, y_test2 = \
            generate_data_categorical(n_train=self.n_train,
                                      n_test=self.n_test,
                                      n_features=3,
                                      contamination=self.contamination,
                                      random_state=self.random_state)

        self.assertTrue(np.array_equal(X_train, X_train2))
        self.assertTrue(np.array_equal(X_test, X_test2))
        self.assertTrue(np.array_equal(y_train, y_train2))
        self.assertTrue(np.array_equal(y_test, y_test2))

    def test_data_generate_categorical5(self):
        """Each invalid argument value raises ``ValueError``.

        One call per case so a failure's traceback line identifies the
        offending argument.
        """
        # random_state
        self._assert_categorical_raises(random_state=-1)

        # n_train / n_test
        self._assert_categorical_raises(n_train=0)
        self._assert_categorical_raises(n_test=-1)
        self._assert_categorical_raises(n_train='not int')
        self._assert_categorical_raises(n_test='not int')

        # n_features
        self._assert_categorical_raises(n_features=0)
        self._assert_categorical_raises(n_features='not int')

        # n_informative
        self._assert_categorical_raises(n_informative=-1)
        self._assert_categorical_raises(n_informative='not int')

        # contamination
        self._assert_categorical_raises(contamination=0.6)
        self._assert_categorical_raises(contamination='not float')

        # n_category_in
        self._assert_categorical_raises(n_category_in=-1)
        self._assert_categorical_raises(n_category_in='not int')
        self._assert_categorical_raises(
            n_category_in=self.n_train + self.n_test + 1)

        # n_category_out
        self._assert_categorical_raises(n_category_out=-1)
        self._assert_categorical_raises(n_category_out='not int')
        self._assert_categorical_raises(
            n_category_out=self.n_train + self.n_test + 1)

        # shuffle
        self._assert_categorical_raises(n_category_out=5,
                                        n_informative=2, n_features=2,
                                        shuffle='not bool')

    def test_evaluate_print(self):
        """``evaluate_print`` runs without raising on labels and scores."""
        X_train, X_test, y_train, y_test = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            contamination=self.contamination)
        evaluate_print('dummy', y_train, y_train * 0.1)

    def test_get_outliers_inliers(self):
        """``get_outliers_inliers`` splits ``X_train`` by label.

        The test expects the first ``int(n_train * (1 - contamination))``
        rows of the generated data to be the inliers and the remaining rows
        to be the outliers.
        """
        X_train, y_train = generate_data(
            n_train=self.n_train, train_only=True,
            contamination=self.contamination)

        X_outliers, X_inliers = get_outliers_inliers(X_train, y_train)

        inlier_index = int(self.n_train * (1 - self.contamination))

        assert_allclose(X_train[0:inlier_index, :], X_inliers)
        assert_allclose(X_train[inlier_index:, :], X_outliers)

    def test_check_consistent_shape(self):
        """``check_consistent_shape`` passes consistent inputs through
        unchanged and raises ``ValueError`` on mismatched shapes."""
        X_train, X_test, y_train, y_test = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            contamination=self.contamination)

        X_train_n, y_train_n, X_test_n, y_test_n, y_train_pred_n, y_test_pred_n \
            = check_consistent_shape(X_train, y_train, X_test, y_test,
                                     y_train, y_test)

        assert_allclose(X_train_n, X_train)
        assert_allclose(y_train_n, y_train)
        assert_allclose(X_test_n, X_test)
        assert_allclose(y_test_n, y_test)
        assert_allclose(y_train_pred_n, y_train)
        assert_allclose(y_test_pred_n, y_test)

        # test shape difference
        with assert_raises(ValueError):
            check_consistent_shape(X_train, y_train, y_train, y_test,
                                   y_train, y_test)

        # test shape difference between X_train and X_test
        X_test = np.hstack((X_test, np.zeros(
            (X_test.shape[0], 1))))  # add extra column/feature
        with assert_raises(ValueError):
            check_consistent_shape(X_train, y_train, X_test, y_test,
                                   y_train_pred_n, y_test_pred_n)

    def tearDown(self):
        """Nothing to clean up."""
        pass


if __name__ == '__main__':
    unittest.main()