/ pyod / test / test_data.py
test_data.py
  1  # -*- coding: utf-8 -*-
  2  
  3  
  4  import os
  5  import sys
  6  import unittest
  7  
  8  import numpy as np
  9  # noinspection PyProtectedMember
 10  from numpy.testing import assert_allclose
 11  from numpy.testing import assert_equal
 12  from numpy.testing import assert_raises
 13  
# Temporary workaround for relative imports when pyod is not installed:
# the sys.path.append(...) call below puts the parent directory on the
# import path. It is unnecessary once pyod is installed.
 16  from pyod.utils.data import generate_data_categorical
 17  
 18  sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
 19  
 20  from pyod.utils.data import generate_data
 21  from pyod.utils.data import evaluate_print
 22  from pyod.utils.data import get_outliers_inliers
 23  from pyod.utils.data import check_consistent_shape
 24  from pyod.utils.data import generate_data_clusters
 25  
 26  
 27  class TestData(unittest.TestCase):
 28      def setUp(self):
 29          self.n_train = 1000
 30          self.n_test = 500
 31          self.contamination = 0.1
 32          self.n_samples = 1000
 33          self.test_size = 0.2
 34          self.value_lists = [0.1, 0.3, 0.2, -2, 1.5, 0, 1, -1, -0.5, 11]
 35          self.random_state = 42
 36  
 37      def test_data_generate(self):
 38          X_train, X_test, y_train, y_test = \
 39              generate_data(n_train=self.n_train,
 40                            n_test=self.n_test,
 41                            contamination=self.contamination)
 42  
 43          assert_equal(y_train.shape[0], X_train.shape[0])
 44          assert_equal(y_test.shape[0], X_test.shape[0])
 45  
 46          assert (self.n_train - X_train.shape[0] <= 1)
 47          assert_equal(X_train.shape[1], 2)
 48  
 49          assert (self.n_test - X_test.shape[0] <= 1)
 50          assert_equal(X_test.shape[1], 2)
 51  
 52          out_perc = np.sum(y_train) / self.n_train
 53          assert_allclose(self.contamination, out_perc, atol=0.01)
 54  
 55          out_perc = np.sum(y_test) / self.n_test
 56          assert_allclose(self.contamination, out_perc, atol=0.01)
 57  
 58      def test_data_generate2(self):
 59          X_train, X_test, y_train, y_test = \
 60              generate_data(n_train=self.n_train,
 61                            n_test=self.n_test,
 62                            n_features=3,
 63                            contamination=self.contamination)
 64          assert_allclose(X_train.shape, (self.n_train, 3))
 65          assert_allclose(X_test.shape, (self.n_test, 3))
 66  
 67      def test_data_generate3(self):
 68          X_train, y_train, X_test, y_test = \
 69              generate_data(n_train=self.n_train,
 70                            n_test=self.n_test,
 71                            n_features=2,
 72                            contamination=self.contamination,
 73                            random_state=42)
 74  
 75          X_train2, y_train2, X_test2, y_test2 = \
 76              generate_data(n_train=self.n_train,
 77                            n_test=self.n_test,
 78                            n_features=2,
 79                            contamination=self.contamination,
 80                            random_state=42)
 81  
 82          assert_allclose(X_train, X_train2)
 83          assert_allclose(X_test, X_test2)
 84          assert_allclose(y_train, y_train2)
 85          assert_allclose(y_test, y_test2)
 86  
 87      def test_data_generate_cluster(self):
 88          X_train, X_test, y_train, y_test = \
 89              generate_data_clusters(n_train=self.n_train,
 90                                     n_test=self.n_test,
 91                                     n_features=2,
 92                                     contamination=self.contamination,
 93                                     random_state=self.random_state)
 94  
 95          assert_equal(y_train.shape[0], X_train.shape[0])
 96          assert_equal(y_test.shape[0], X_test.shape[0])
 97  
 98          assert (self.n_train - X_train.shape[0] <= 1)
 99          assert_equal(X_train.shape[1], 2)
100  
101          assert (self.n_test - X_test.shape[0] <= 1)
102          assert_equal(X_test.shape[1], 2)
103  
104          out_perc = (np.sum(y_train) + np.sum(y_test)) / (
105                  self.n_train + self.n_test)
106          assert_allclose(self.contamination, out_perc, atol=0.01)
107  
108      def test_data_generate_cluster2(self):
109          X_train, X_test, y_train, y_test = \
110              generate_data_clusters(n_train=self.n_train,
111                                     n_test=self.n_test,
112                                     n_features=4,
113                                     contamination=self.contamination,
114                                     random_state=self.random_state)
115  
116          assert_allclose(X_train.shape, (self.n_train, 4))
117          assert_allclose(X_test.shape, (self.n_test, 4))
118  
119      def test_data_generate_cluster3(self):
120          X_train, y_train, X_test, y_test = \
121              generate_data_clusters(n_train=self.n_train,
122                                     n_test=self.n_test,
123                                     n_features=3,
124                                     contamination=self.contamination,
125                                     random_state=self.random_state)
126  
127          X_train2, y_train2, X_test2, y_test2 = \
128              generate_data_clusters(n_train=self.n_train,
129                                     n_test=self.n_test,
130                                     n_features=3,
131                                     contamination=self.contamination,
132                                     random_state=self.random_state)
133  
134          assert_allclose(X_train, X_train2)
135          assert_allclose(X_test, X_test2)
136          assert_allclose(y_train, y_train2)
137          assert_allclose(y_test, y_test2)
138  
139      def test_data_generate_cluster5(self):
140          with assert_raises(ValueError):
141              generate_data_clusters(n_train=self.n_train,
142                                     n_test=self.n_test,
143                                     n_features=3,
144                                     n_clusters='e',
145                                     contamination=self.contamination,
146                                     random_state=self.random_state)
147  
148          with assert_raises(ValueError):
149              generate_data_clusters(n_train=self.n_train,
150                                     n_test=self.n_test,
151                                     n_features='e',
152                                     contamination=self.contamination,
153                                     random_state=self.random_state)
154  
155          with assert_raises(ValueError):
156              generate_data_clusters(n_train=self.n_train,
157                                     n_test=self.n_test,
158                                     n_features=3,
159                                     contamination='e',
160                                     random_state=self.random_state)
161  
162          with assert_raises(ValueError):
163              generate_data_clusters(n_train=self.n_train,
164                                     n_test=self.n_test,
165                                     n_features=3,
166                                     contamination=self.contamination,
167                                     dist='e',
168                                     random_state=self.random_state)
169  
170      def test_data_generate_cluster6(self):
171          X_train, X_test, y_train, y_test = \
172              generate_data_clusters(n_train=self.n_train,
173                                     n_test=self.n_test,
174                                     n_features=2,
175                                     size='different',
176                                     density='different',
177                                     contamination=self.contamination,
178                                     random_state=self.random_state)
179  
180          assert_equal(y_train.shape[0], X_train.shape[0])
181          assert_equal(y_test.shape[0], X_test.shape[0])
182  
183          assert (self.n_train - X_train.shape[0] <= 1)
184          assert_equal(X_train.shape[1], 2)
185  
186          assert (self.n_test - X_test.shape[0] <= 1)
187          assert_equal(X_test.shape[1], 2)
188  
189          out_perc = (np.sum(y_train) + np.sum(y_test)) / (
190                  self.n_train + self.n_test)
191          assert_allclose(self.contamination, out_perc, atol=0.01)
192  
193      def test_data_generate_categorical(self):
194          X_train, X_test, y_train, y_test = \
195              generate_data_categorical(n_train=self.n_train,
196                                        n_test=self.n_test,
197                                        n_features=2,
198                                        contamination=self.contamination,
199                                        random_state=self.random_state)
200  
201          assert_equal(y_train.shape[0], X_train.shape[0])
202          assert_equal(y_test.shape[0], X_test.shape[0])
203  
204          assert (self.n_train - X_train.shape[0] <= 1)
205          assert_equal(X_train.shape[1], 2)
206  
207          assert (self.n_test - X_test.shape[0] <= 1)
208          assert_equal(X_test.shape[1], 2)
209  
210          out_perc = (np.sum(y_train) + np.sum(y_test)) / (
211                  self.n_train + self.n_test)
212          assert_allclose(self.contamination, out_perc, atol=0.01)
213  
214      def test_data_generate_categorical2(self):
215          X_train, X_test, y_train, y_test = \
216              generate_data_categorical(n_train=self.n_train,
217                                        n_test=self.n_test,
218                                        n_features=4,
219                                        contamination=self.contamination,
220                                        random_state=self.random_state)
221  
222          assert_allclose(X_train.shape, (self.n_train, 4))
223          assert_allclose(X_test.shape, (self.n_test, 4))
224  
225      def test_data_generate_categorical3(self):
226          X_train, y_train, X_test, y_test = \
227              generate_data_categorical(n_train=self.n_train,
228                                        n_test=self.n_test,
229                                        n_features=3,
230                                        contamination=self.contamination,
231                                        random_state=self.random_state)
232  
233          X_train2, y_train2, X_test2, y_test2 = \
234              generate_data_categorical(n_train=self.n_train,
235                                        n_test=self.n_test,
236                                        n_features=3,
237                                        contamination=self.contamination,
238                                        random_state=self.random_state)
239  
240          assert np.array_equal(X_train, X_train2)
241          assert np.array_equal(X_train, X_train2)
242          assert np.array_equal(X_test, X_test2)
243          assert np.array_equal(y_train, y_train2)
244          assert np.array_equal(y_test, y_test2)
245  
246      def test_data_generate_categorical5(self):
247          with assert_raises(ValueError):
248              generate_data_categorical(n_train=self.n_train, n_test=self.n_test,
249                                        n_category_in=5, n_category_out=3,
250                                        n_informative=1, n_features=1,
251                                        contamination=self.contamination,
252                                        random_state=-1)
253  
254          with assert_raises(ValueError):
255              generate_data_categorical(n_train=0, n_test=self.n_test,
256                                        n_category_in=5, n_category_out=3,
257                                        n_informative=1, n_features=1,
258                                        contamination=self.contamination,
259                                        random_state=self.random_state)
260  
261          with assert_raises(ValueError):
262              generate_data_categorical(n_train=self.n_train, n_test=-1,
263                                        n_category_in=5, n_category_out=3,
264                                        n_informative=1, n_features=1,
265                                        contamination=self.contamination,
266                                        random_state=self.random_state)
267  
268          with assert_raises(ValueError):
269              generate_data_categorical(n_train='not int', n_test=self.n_test,
270                                        n_category_in=5, n_category_out=3,
271                                        n_informative=1, n_features=1,
272                                        contamination=self.contamination,
273                                        random_state=self.random_state)
274  
275          with assert_raises(ValueError):
276              generate_data_categorical(n_train=self.n_train, n_test='not int',
277                                        n_category_in=5, n_category_out=3,
278                                        n_informative=1, n_features=1,
279                                        contamination=self.contamination,
280                                        random_state=self.random_state)
281  
282          with assert_raises(ValueError):
283              generate_data_categorical(n_train=self.n_train, n_test=self.n_test,
284                                        n_category_in=5, n_category_out=3,
285                                        n_informative=1, n_features=0,
286                                        contamination=self.contamination,
287                                        random_state=self.random_state)
288  
289          with assert_raises(ValueError):
290              generate_data_categorical(n_train=self.n_train, n_test=self.n_test,
291                                        n_category_in=5, n_category_out=3,
292                                        n_informative=1, n_features='not int',
293                                        contamination=self.contamination,
294                                        random_state=self.random_state)
295          with assert_raises(ValueError):
296              generate_data_categorical(n_train=self.n_train, n_test=self.n_test,
297                                        n_category_in=5, n_category_out=3,
298                                        n_informative=-1, n_features=1,
299                                        contamination=self.contamination,
300                                        random_state=self.random_state)
301          with assert_raises(ValueError):
302              generate_data_categorical(n_train=self.n_train, n_test=self.n_test,
303                                        n_category_in=5, n_category_out=3,
304                                        n_informative='not int', n_features=1,
305                                        contamination=self.contamination,
306                                        random_state=self.random_state)
307          with assert_raises(ValueError):
308              generate_data_categorical(n_train=self.n_train, n_test=self.n_test,
309                                        n_category_in=5, n_category_out=3,
310                                        n_informative=1, n_features=1,
311                                        contamination=0.6,
312                                        random_state=self.random_state)
313          with assert_raises(ValueError):
314              generate_data_categorical(n_train=self.n_train, n_test=self.n_test,
315                                        n_category_in=5, n_category_out=3,
316                                        n_informative=1, n_features=1,
317                                        contamination='not float',
318                                        random_state=self.random_state)
319          with assert_raises(ValueError):
320              generate_data_categorical(n_train=self.n_train, n_test=self.n_test,
321                                        n_category_in=-1, n_category_out=3,
322                                        n_informative=1, n_features=1,
323                                        contamination=self.contamination,
324                                        random_state=self.random_state)
325          with assert_raises(ValueError):
326              generate_data_categorical(n_train=self.n_train, n_test=self.n_test,
327                                        n_category_in='not int',
328                                        n_category_out=3,
329                                        n_informative=1, n_features=1,
330                                        contamination=self.contamination,
331                                        random_state=self.random_state)
332          with assert_raises(ValueError):
333              generate_data_categorical(n_train=self.n_train, n_test=self.n_test,
334                                        n_category_in=self.n_train + self.n_test + 1,
335                                        n_category_out=3,
336                                        n_informative=1, n_features=1,
337                                        contamination=self.contamination,
338                                        random_state=self.random_state)
339  
340          with assert_raises(ValueError):
341              generate_data_categorical(n_train=self.n_train, n_test=self.n_test,
342                                        n_category_in=5, n_category_out=-1,
343                                        n_informative=1, n_features=1,
344                                        contamination=self.contamination,
345                                        random_state=self.random_state)
346          with assert_raises(ValueError):
347              generate_data_categorical(n_train=self.n_train, n_test=self.n_test,
348                                        n_category_in=5,
349                                        n_category_out='not int',
350                                        n_informative=1, n_features=1,
351                                        contamination=self.contamination,
352                                        random_state=self.random_state)
353          with assert_raises(ValueError):
354              generate_data_categorical(n_train=self.n_train, n_test=self.n_test,
355                                        n_category_in=5,
356                                        n_category_out=self.n_train + self.n_test + 1,
357                                        n_informative=1, n_features=1,
358                                        contamination=self.contamination,
359                                        random_state=self.random_state)
360  
361          with assert_raises(ValueError):
362              generate_data_categorical(n_train=self.n_train, n_test=self.n_test,
363                                        n_category_in=5,
364                                        n_category_out=5,
365                                        n_informative=2, n_features=2,
366                                        contamination=self.contamination,
367                                        shuffle='not bool',
368                                        random_state=self.random_state)
369  
370      def test_evaluate_print(self):
371          X_train, X_test, y_train, y_test = generate_data(
372              n_train=self.n_train,
373              n_test=self.n_test,
374              contamination=self.contamination)
375          evaluate_print('dummy', y_train, y_train * 0.1)
376  
377      def test_get_outliers_inliers(self):
378          X_train, y_train = generate_data(
379              n_train=self.n_train, train_only=True,
380              contamination=self.contamination)
381  
382          X_outliers, X_inliers = get_outliers_inliers(X_train, y_train)
383  
384          inlier_index = int(self.n_train * (1 - self.contamination))
385  
386          assert_allclose(X_train[0:inlier_index, :], X_inliers)
387          assert_allclose(X_train[inlier_index:, :], X_outliers)
388  
389      def test_check_consistent_shape(self):
390          X_train, X_test, y_train, y_test = generate_data(
391              n_train=self.n_train,
392              n_test=self.n_test,
393              contamination=self.contamination)
394  
395          X_train_n, y_train_n, X_test_n, y_test_n, y_train_pred_n, y_test_pred_n \
396              = check_consistent_shape(X_train, y_train, X_test, y_test,
397                                       y_train, y_test)
398  
399          assert_allclose(X_train_n, X_train)
400          assert_allclose(y_train_n, y_train)
401          assert_allclose(X_test_n, X_test)
402          assert_allclose(y_test_n, y_test)
403          assert_allclose(y_train_pred_n, y_train)
404          assert_allclose(y_test_pred_n, y_test)
405  
406          # test shape difference
407          with assert_raises(ValueError):
408              check_consistent_shape(X_train, y_train, y_train, y_test,
409                                     y_train, y_test)
410  
411          # test shape difference between X_train and X_test
412          X_test = np.hstack((X_test, np.zeros(
413              (X_test.shape[0], 1))))  # add extra column/feature
414          with assert_raises(ValueError):
415              check_consistent_shape(X_train, y_train, X_test, y_test,
416                                     y_train_pred_n, y_test_pred_n)
417  
418      def tearDown(self):
419          pass
420  
421  
# Run the tests in this module when the file is executed directly.
if __name__ == '__main__':
    unittest.main()