Cradicle Explorer

data.py
  1  # -*- coding: utf-8 -*-
  2  """Utility functions for manipulating data
  3  """
  4  # Author: Yue Zhao <zhaoy@cmu.edu>
  5  # Author: Yahya Almardeny <almardeny@gmail.com>
  6  # License: BSD 2 clause
  7  
  8  from __future__ import division
  9  from __future__ import print_function
 10  
 11  from warnings import warn
 12  
 13  import numpy as np
 14  from sklearn.datasets import make_blobs
 15  from sklearn.metrics import roc_auc_score
 16  from sklearn.model_selection import train_test_split
 17  from sklearn.utils import check_X_y
 18  from sklearn.utils import check_consistent_length
 19  from sklearn.utils import check_random_state
 20  from sklearn.utils import column_or_1d
 21  
 22  from .utility import check_parameter
 23  from .utility import precision_n_scores
 24  
 25  MAX_INT = np.iinfo(np.int32).max  # largest value representable by a signed 32-bit integer
 26  
 27  
 28  def _generate_data(n_inliers, n_outliers, n_features, coef, offset,
 29                     random_state, n_nan=0, n_inf=0):
 30      """Internal function to generate data samples.
 31  
 32      Parameters
 33      ----------
 34      n_inliers : int
 35          The number of inliers.
 36  
 37      n_outliers : int
 38          The number of outliers.
 39  
 40      n_features : int
 41          The number of features (dimensions).
 42  
 43      coef : float in range [0,1)+0.001
 44          The coefficient of data generation.
 45  
 46      offset : int
 47          Adjust the value range of Gaussian and Uniform.
 48  
 49      random_state : int, RandomState instance or None, optional (default=None)
 50          If int, random_state is the seed used by the random number generator;
 51          If RandomState instance, random_state is the random number generator;
 52          If None, the random number generator is the RandomState instance used
 53          by `np.random`.
 54  
 55      n_nan : int
 56          The number of values that are missing (np.nan). Defaults to zero.
 57  
 58      n_inf : int
 59          The number of values that are infinite. (np.inf). Defaults to zero.
 60  
 61      Returns
 62      -------
 63      X : numpy array of shape (n_train, n_features)
 64          Data.
 65  
 66      y : numpy array of shape (n_train,)
 67          Ground truth.
 68      """
 69  
 70      inliers = coef * random_state.randn(n_inliers, n_features) + offset
 71      outliers = random_state.uniform(low=-1 * offset, high=offset,
 72                                      size=(n_outliers, n_features))
 73      X = np.r_[inliers, outliers]
 74  
 75      y = np.r_[np.zeros((n_inliers,)), np.ones((n_outliers,))]
 76  
 77      if n_nan > 0:
 78          X = np.r_[X, np.full((n_nan, n_features), np.nan)]
 79          y = np.r_[y, np.full((n_nan), np.nan)]
 80  
 81      if n_inf > 0:
 82          X = np.r_[X, np.full((n_inf, n_features), np.inf)]
 83          y = np.r_[y, np.full((n_inf), np.inf)]
 84  
 85      return X, y
 86  
 87  
 88  def get_outliers_inliers(X, y):
 89      """Internal method to separate inliers from outliers.
 90  
 91      Parameters
 92      ----------
 93      X : numpy array of shape (n_samples, n_features)
 94          The input samples
 95  
 96      y : list or array of shape (n_samples,)
 97          The ground truth of input samples.
 98  
 99      Returns
100      -------
101      X_outliers : numpy array of shape (n_samples, n_features)
102          Outliers.
103  
104      X_inliers : numpy array of shape (n_samples, n_features)
105          Inliers.
106  
107      """
108      X_outliers = X[np.where(y == 1)]
109      X_inliers = X[np.where(y == 0)]
110      return X_outliers, X_inliers
111  
112  
def generate_data(n_train=1000, n_test=500, n_features=2, contamination=0.1,
                  train_only=False, offset=10, behaviour='new',
                  random_state=None, n_nan=0, n_inf=0):
    """Utility function to generate synthesized data.

    Normal data is drawn from a Gaussian distribution and outliers from a
    uniform distribution. The training and test splits share the same
    randomly drawn shift and scale.

    Parameters
    ----------
    n_train : int, (default=1000)
        The number of training points to generate.

    n_test : int, (default=500)
        The number of test points to generate.

    n_features : int, optional (default=2)
        The number of features (dimensions).

    contamination : float in (0., 0.5), optional (default=0.1)
        The proportion of outliers in each generated split. A value that is
        neither a float nor an int is silently replaced by 0.1.

    train_only : bool, optional (default=False)
        If true, generate train data only.

    offset : int, optional (default=10)
        Upper bound (exclusive) of the random integer shift applied to the
        Gaussian data; the drawn shift is also the half-width of the uniform
        outlier range.

    behaviour : str, default='new'
        Order of the returned datasets, either 'old' or 'new'.
        ``behaviour='old'`` returns "X_train, y_train, X_test, y_test" and
        emits a FutureWarning; any other value returns
        "X_train, X_test, y_train, y_test".

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    n_nan : int
        The number of NaN rows appended to each split. Defaults to zero.

    n_inf : int
        The number of infinite rows appended to each split. Defaults to zero.

    Returns
    -------
    X_train : numpy array of shape (n_train, n_features)
        Training data.

    X_test : numpy array of shape (n_test, n_features)
        Test data. Omitted when ``train_only`` is true.

    y_train : numpy array of shape (n_train,)
        Training ground truth.

    y_test : numpy array of shape (n_test,)
        Test ground truth. Omitted when ``train_only`` is true.

    Notes
    -----
    With ``train_only=True`` only ``(X_train, y_train)`` is returned. When
    ``n_nan`` or ``n_inf`` is positive, each split has that many extra rows.
    """
    rng = check_random_state(random_state)

    # One shift and one scale are drawn up front and reused for both splits.
    # The 0.001 keeps the scale strictly positive.
    shared_offset = rng.randint(low=offset)
    shared_coef = rng.random_sample() + 0.001

    # Anything that is not a plain number falls back to the default ratio
    if not isinstance(contamination, (float, int)):
        contamination = 0.1

    def _make_split(n_points):
        # Outlier count is truncated; the remainder becomes inliers
        n_out = int(n_points * contamination)
        n_in = int(n_points - n_out)
        return _generate_data(n_in, n_out, n_features, shared_coef,
                              shared_offset, rng, n_nan, n_inf)

    X_train, y_train = _make_split(n_train)
    if train_only:
        return X_train, y_train

    X_test, y_test = _make_split(n_test)

    if behaviour != 'old':
        return X_train, X_test, y_train, y_test

    warn('behaviour="old" is deprecated and will be removed '
         'in version 0.9.0. Please use behaviour="new", which '
         'makes the returned datasets in the order of '
         'X_train, X_test, y_train, y_test.',
         FutureWarning)
    return X_train, y_train, X_test, y_test
214  
215  
def check_consistent_shape(X_train, y_train, X_test, y_test, y_train_pred,
                           y_test_pred):
    """Internal function to check that input data shapes are consistent.

    Each (X, y) pair is passed through ``check_X_y``, both prediction arrays
    through ``column_or_1d``, and every prediction array is checked to have
    the same length as its ground truth. Finally the train and test feature
    counts are compared.

    Parameters
    ----------
    X_train : numpy array of shape (n_samples, n_features)
        The training samples.

    y_train : list or array of shape (n_samples,)
        The ground truth of training samples.

    X_test : numpy array of shape (n_samples, n_features)
        The test samples.

    y_test : list or array of shape (n_samples,)
        The ground truth of test samples.

    y_train_pred : list or array of shape (n_samples,)
        The predicted binary labels of the training samples.

    y_test_pred : list or array of shape (n_samples,)
        The predicted binary labels of the test samples.

    Returns
    -------
    X_train, y_train, X_test, y_test, y_train_pred, y_test_pred
        The six inputs, in this order, as returned by the validators.

    Raises
    ------
    ValueError
        If X_train and X_test have a different number of features. The
        validators called here may raise on their own for invalid input.
    """
    # Feature matrices are validated together with their labels
    X_train, y_train = check_X_y(X_train, y_train)
    X_test, y_test = check_X_y(X_test, y_test)

    y_test_pred = column_or_1d(y_test_pred)
    y_train_pred = column_or_1d(y_train_pred)

    # Predictions must line up one-to-one with the ground truth
    for truth, predicted in ((y_train, y_train_pred), (y_test, y_test_pred)):
        check_consistent_length(truth, predicted)

    n_feat_train, n_feat_test = X_train.shape[1], X_test.shape[1]
    if n_feat_train == n_feat_test:
        return X_train, y_train, X_test, y_test, y_train_pred, y_test_pred

    raise ValueError("X_train {0} and X_test {1} have different number "
                     "of features.".format(X_train.shape, X_test.shape))
276  
277  
def evaluate_print(clf_name, y, y_pred):
    """Utility function for evaluating and printing the results for examples.

    Prints a single line with the ROC AUC and the precision @ rank n, both
    rounded to four decimals. Nothing is returned.

    Parameters
    ----------
    clf_name : str
        The name of the detector.

    y : list or numpy array of shape (n_samples,)
        The ground truth. Binary (0: inliers, 1: outliers).

    y_pred : list or numpy array of shape (n_samples,)
        The raw outlier scores as returned by a fitted model.

    """
    y = column_or_1d(y)
    y_pred = column_or_1d(y_pred)
    check_consistent_length(y, y_pred)

    # ROC is computed before precision @ n, then both go into one report line
    roc = np.round(roc_auc_score(y, y_pred), decimals=4)
    prn = np.round(precision_n_scores(y, y_pred), decimals=4)

    report = '{clf_name} ROC:{roc}, precision @ rank n:{prn}'.format(
        clf_name=clf_name, roc=roc, prn=prn)
    print(report)
303  
304  
def generate_data_clusters(n_train=1000, n_test=500, n_clusters=2,
                           n_features=2, contamination=0.1, size='same',
                           density='same', dist=0.25, random_state=None,
                           return_in_clusters=False):
    """Utility function to generate synthesized data in clusters.

    Generated data can involve the low density pattern problem and global
    outliers which are considered as difficult tasks for outliers detection
    algorithms.

    Parameters
    ----------
    n_train : int, (default=1000)
        The number of training points to generate.

    n_test : int, (default=500)
        The number of test points to generate.

    n_clusters : int, optional (default=2)
       The number of centers (i.e. clusters) to generate.

    n_features : int, optional (default=2)
       The number of features for each sample.

    contamination : float in (0., 0.5), optional (default=0.1)
       The amount of contamination of the data set, i.e.
       the proportion of outliers in the data set.

    size : str, optional (default='same')
       Size of each cluster: 'same' generates clusters with same size,
       'different' generate clusters with different sizes.

    density : str, optional (default='same')
       Density of each cluster: 'same' generates clusters with same density,
       'different' generate clusters with different densities.

    dist: float, optional (default=0.25)
       Distance between clusters. Should be between 0. and 1.0 and must be
       a ``float`` instance (an ``int`` is rejected).
       It is used to avoid clusters overlapping as much as possible.
       However, if number of samples and number of clusters are too high,
       it is unlikely to separate them fully even if ``dist`` set to 1.0

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    return_in_clusters : bool, optional (default=False)
        If True, the function returns ``X_clusters, y_clusters``: two lists
        of numpy arrays where each index represents a cluster (inliers
        followed by that cluster's outliers). No train/test split is made.
        If False, the clusters are joined and split into train and test
        sets.

    Returns
    -------
    X_train : numpy array of shape (n_train, n_features)
        Training data.

    X_test : numpy array of shape (n_test, n_features)
        Test data.

    y_train : numpy array of shape (n_train,)
        Training ground truth.

    y_test : numpy array of shape (n_test,)
        Test ground truth.

    Notes
    -----
    The four arrays above are returned in that order (the order produced by
    ``train_test_split``) when ``return_in_clusters`` is False. When it is
    True, ``(X_clusters, y_clusters)`` is returned instead.

    Raises
    ------
    ValueError
        If an argument has the wrong type, if ``size`` or ``density`` is not
        'same' or 'different', or if there are too few samples for
        ``size='different'``.
    """
    # initialize a random state and seeds for the instance
    random_state = check_random_state(random_state)

    if isinstance(n_clusters, int):
        check_parameter(n_clusters, low=1, param_name='n_clusters')
    else:
        raise ValueError("n_clusters should be int, got %s" % n_clusters)

    if isinstance(n_features, int):
        check_parameter(n_features, low=1, param_name='n_features')
    else:
        raise ValueError("n_features should be int, got %s" % n_features)

    if isinstance(contamination, (float, int)):
        check_parameter(contamination, low=0, high=0.5,
                        param_name='contamination')
    else:
        raise ValueError(
            "contamination should be float, got %s" % contamination)

    if isinstance(dist, float):
        check_parameter(dist, low=0, high=1.0, param_name='dist')
    else:
        raise ValueError("dist should be float, got %s" % dist)

    if not isinstance(return_in_clusters, bool):
        raise ValueError("return_in_clusters should be of type bool, "
                         "got %s" % return_in_clusters)

    # find the required number of outliers and inliers
    n_samples = n_train + n_test
    n_outliers = int(n_samples * contamination)
    n_inliers = n_samples - n_outliers

    if size == 'same':
        # equal shares; the last cluster absorbs the rounding remainder
        a_ = [int(n_inliers / n_clusters)] * (n_clusters - 1)
        clusters_size = a_ + [int(n_inliers - sum(a_))]
    elif size == 'different':
        if (n_clusters * 10) > n_samples:
            raise ValueError('number of samples should be at least 10 times '
                             'of the number of clusters')
        if (n_clusters * 10) > n_inliers:
            raise ValueError('contamination ratio is too high, try to increase'
                             ' number of samples or decrease the contamination')
        # Perturb the equal share 1/n_clusters by paired +/- offsets so the
        # perturbations cancel out across clusters.
        _r = 1. / n_clusters
        _offset = random_state.uniform(_r * 0.2, _r * 0.4,
                                       size=(int(n_clusters / 2),)).tolist()
        _offset += [i * -1. for i in _offset]
        clusters_size = np.round(
            np.multiply(n_inliers, np.add(_r, _offset))).astype(int)
        if n_clusters % 2 == 0:  # if it is even number
            clusters_size[n_clusters - 1] += n_inliers - sum(clusters_size)
        else:
            # odd count: the unpaired cluster takes whatever is left
            clusters_size = np.append(clusters_size,
                                      n_inliers - sum(clusters_size))
    else:
        raise ValueError(
            'size should be a string of value \'same\' or \'different\'')

    # check for clusters densities and apply split accordingly
    if density == 'same':
        clusters_density = random_state.uniform(low=0.1, high=0.5, size=(
            1,)).tolist() * n_clusters
    elif density == 'different':
        clusters_density = random_state.uniform(low=0.1, high=0.5,
                                                size=(n_clusters,))
    else:
        raise ValueError(
            'density should be a string of value \'same\' or \'different\'')

    # calculate number of outliers for every cluster
    n_outliers_ = []
    for i in range(n_clusters):
        n_outliers_.append(int(round(clusters_size[i] * contamination)))
    # spread the rounding difference; the last cluster takes the remainder
    _diff = int((n_outliers - sum(n_outliers_)) / n_clusters)
    for i in range(n_clusters - 1):
        n_outliers_[i] += _diff
    n_outliers_[n_clusters - 1] += n_outliers - sum(n_outliers_)
    random_state.shuffle(n_outliers_)

    # generate data
    X_clusters, y_clusters = [], []
    X, y = np.zeros([n_samples, n_features]), np.zeros([n_samples, ])

    # evenly spaced, non-zero boundaries; consecutive pairs bound one cluster
    center_box = list(filter(lambda a: a != 0, np.linspace(
        -np.power(n_samples * n_clusters, dist),
        np.power(n_samples * n_clusters, dist),
        n_clusters + 2)))

    # index tracker for value assignment
    tracker_idx = 0

    for i in range(n_clusters):
        inlier_blob, _y = make_blobs(n_samples=clusters_size[i], centers=1,
                                     cluster_std=clusters_density[i],
                                     center_box=(center_box[i],
                                                 center_box[i + 1]),
                                     n_features=n_features,
                                     random_state=random_state)

        # outliers are centred in a stretched copy of the cluster's box
        center_box_l = center_box[i] * (1.2 + dist + clusters_density[i])
        center_box_r = center_box[i + 1] * (1.2 + dist + clusters_density[i])

        # outliers are spread 3.5x to 4x wider than the cluster's inliers
        outlier_std = random_state.uniform(clusters_density[i] * 3.5,
                                           clusters_density[i] * 4.,
                                           size=1)
        outlier_blob = make_blobs(n_samples=n_outliers_[i], centers=1,
                                  cluster_std=outlier_std,
                                  center_box=(center_box_l, center_box_r),
                                  n_features=n_features,
                                  random_state=random_state)[0]
        _y = np.append(_y, [1] * int(n_outliers_[i]))

        # generate X: inliers first, then this cluster's outliers (if any)
        if outlier_blob.shape[0] > 0:
            cluster_X = np.vstack((inlier_blob, outlier_blob))
        else:
            cluster_X = inlier_blob
        X_clusters.append(cluster_X)

        # Always advance the write window, including for clusters without
        # outliers; otherwise the inliers are never copied into X and the
        # label assignment uses an undefined or stale end index.
        tracker_idx_new = tracker_idx + cluster_X.shape[0]
        X[tracker_idx:tracker_idx_new, :] = cluster_X

        # generate Y
        y_clusters.append(_y)
        y[tracker_idx:tracker_idx_new] = _y

        tracker_idx = tracker_idx_new

    if return_in_clusters:
        return X_clusters, y_clusters

    # return X_train, X_test, y_train, y_test
    return train_test_split(X, y, test_size=n_test,
                            random_state=random_state)
510  
511  
def generate_data_categorical(n_train=1000, n_test=500, n_features=2,
                              n_informative=2, n_category_in=2,
                              n_category_out=2, contamination=0.1,
                              shuffle=True, random_state=None):
    """Utility function to generate synthesized categorical data.

    Every feature is a column of string categories named after a
    spreadsheet-style label ('A', 'B', ..., 'Z', 'AA', ...) followed by a
    category index. In the first ``n_informative`` features the outliers use
    category indices starting at ``2 * n_category_in``; in the remaining
    features the outliers are resampled from the inlier values.

    Parameters
    ----------
    n_train : int, (default=1000)
        The number of training points to generate.

    n_test : int, (default=500)
        The number of test points to generate.

    n_features : int, optional (default=2)
       The number of features for each sample.

    n_informative : int, optional (default=2)
       The number of informative features in the outlier points.
       The higher the easier the outlier detection should be.
       It is validated against an upper bound of ``n_features + 1``.

    n_category_in : int, optional (default=2)
       The number of categories in the inlier points. It is validated
       against an upper bound of ``n_inliers + 1``.

    n_category_out : int, optional (default=2)
       The number of categories in the outlier points. It is validated
       against an upper bound of ``n_outliers + 1``.

    contamination : float in (0., 0.5), optional (default=0.1)
       The amount of contamination of the data set, i.e.
       the proportion of outliers in the data set.

    shuffle: bool, optional(default=True)
        If True, inliers will be shuffled which makes more noisy distribution.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Returns
    -------
    X_train : numpy array of shape (n_train, n_features)
        Training data.

    X_test : numpy array of shape (n_test, n_features)
        Test data.

    y_train : numpy array of shape (n_train,)
        Training ground truth.

    y_test : numpy array of shape (n_test,)
        Test ground truth.

    Notes
    -----
    The result is whatever ``train_test_split`` returns, i.e. the four
    arrays in the order X_train, X_test, y_train, y_test.
    """
    # initialize a random state and seeds for the instance
    random_state = check_random_state(random_state)

    # --- argument validation: type first, then range -----------------------
    if not isinstance(n_train, int):
        raise ValueError("n_train should be int, got %s" % n_train)
    check_parameter(n_train, low=1, param_name='n_train')

    if not isinstance(n_test, int):
        raise ValueError("n_test should be int, got %s" % n_test)
    check_parameter(n_test, low=0, param_name='n_test')

    if not isinstance(n_features, int):
        raise ValueError("n_features should be int, got %s" % n_features)
    check_parameter(n_features, low=0, param_name='n_features')

    if not isinstance(n_informative, int):
        raise ValueError("n_informative should be int, got %s" % n_informative)
    check_parameter(n_informative, low=0, high=n_features + 1,
                    param_name='n_informative')

    if not isinstance(contamination, (float, int)):
        raise ValueError("contamination should be float, got %s" % contamination)
    check_parameter(contamination, low=0, high=0.5,
                    param_name='contamination')

    if not isinstance(shuffle, bool):
        raise ValueError("shuffle should be bool, got %s" % shuffle)

    # find the required number of outliers and inliers
    n_samples = n_train + n_test
    n_outliers = int(n_samples * contamination)
    n_inliers = n_samples - n_outliers

    # category counts can only be validated once the sample counts are known
    if not isinstance(n_category_in, int):
        raise ValueError("n_category_in should be int, got %s" % n_category_in)
    check_parameter(n_category_in, low=0, high=n_inliers + 1,
                    param_name='n_category_in')

    if not isinstance(n_category_out, int):
        raise ValueError("n_category_out should be int, got %s" % n_category_out)
    check_parameter(n_category_out, low=0, high=n_outliers + 1,
                    param_name='n_category_out')

    def _column_label(index):
        # 1 -> 'A', 26 -> 'Z', 27 -> 'AA', ... (0 gives the empty string)
        label = ''
        while index != 0:
            index, rem = divmod(index - 1, 26)
            label = chr(rem + ord('A')) + label
        return label

    def _split_evenly(total, n_parts):
        # equal shares; the last part absorbs the remainder
        head = [int(total / n_parts)] * (n_parts - 1)
        return head + [int(total - sum(head))]

    # generate pool of features to be the base for naming the data points
    features = [_column_label(k) for k in range(1, n_features + 1)]

    # find the required distributions of categories over inliers and outliers
    dist_in = _split_evenly(n_inliers, n_category_in)
    dist_out = _split_evenly(n_outliers, n_category_out)

    # generate categorical data, one column per feature
    columns = []
    for position, name in enumerate(features):
        inliers = np.hstack(
            [[name + str(c)] * dist_in[c] for c in range(n_category_in)])
        if shuffle:
            random_state.shuffle(inliers)

        if position < n_informative:
            # informative feature: outliers get categories unseen in inliers
            outliers = list(np.hstack(
                [[name + str((n_category_in * 2) + c)] * dist_out[c]
                 for c in range(n_category_out)]))
        else:
            # uninformative feature: outliers reuse random inlier values
            picks = random_state.randint(0, len(inliers), size=n_outliers)
            outliers = list(inliers[picks])

        columns.append(list(inliers) + outliers)

    samples = np.array(columns).T
    labels = np.array(([0] * n_inliers) + ([1] * n_outliers))
    return train_test_split(samples, labels,
                            test_size=n_test,
                            random_state=random_state)
653  
654  
def generate_ts_data(n_train=500, n_test=200, n_channels=1,
                     contamination=0.05, period=50, noise_std=0.3,
                     anomaly_type='point', random_state=None):
    """Generate synthetic time series data with injected anomalies.

    Creates a sinusoidal base signal with Gaussian noise and injects
    anomalies at random locations. Follows conventions from the TS-AD
    literature (e.g., TSB-AD benchmark).

    Parameters
    ----------
    n_train : int, optional (default=500)
        Length of training time series. Must be >= 20.
    n_test : int, optional (default=200)
        Length of test time series. Must be >= 20.
    n_channels : int, optional (default=1)
        Number of channels (univariate=1, multivariate>1).
    contamination : float, optional (default=0.05)
        Fraction of timestamps that are anomalous (approximately).
        Must lie strictly between 0 and 0.5. For subsequence anomalies,
        the number of events is reduced so the labeled timestamps stay near
        this fraction.
    period : int, optional (default=50)
        Period of the sinusoidal base signal. Must be non-zero, since the
        time index is divided by it. Subsequence anomalies span
        ``max(3, period // 5)`` timestamps.
    noise_std : float, optional (default=0.3)
        Standard deviation of Gaussian noise.
    anomaly_type : str, optional (default='point')
        Type of anomaly: 'point' (spikes), 'subsequence' (shape change),
        or 'both'.
    random_state : int, RandomState instance, or None (default=None)
        Random seed for reproducibility.

    Returns
    -------
    X_train : np.ndarray of shape (n_train,) or (n_train, n_channels)
        Training time series. Univariate returned as 1D.
    X_test : np.ndarray of shape (n_test,) or (n_test, n_channels)
        Test time series.
    y_train : np.ndarray of shape (n_train,)
        Binary labels (1=anomaly, 0=normal) for training, dtype int32.
    y_test : np.ndarray of shape (n_test,)
        Binary labels for test, dtype int32.

    Raises
    ------
    ValueError
        If any of the constraints listed above is violated.
    """
    rng = check_random_state(random_state)

    # Validate parameters
    if n_train < 20:
        raise ValueError("n_train must be >= 20, got %d" % n_train)
    if n_test < 20:
        raise ValueError("n_test must be >= 20, got %d" % n_test)
    if n_channels < 1:
        raise ValueError("n_channels must be >= 1, got %d" % n_channels)
    if not 0 < contamination < 0.5:
        raise ValueError("contamination must be in (0, 0.5), got %f"
                         % contamination)
    if period == 0:
        # t / 0 would silently turn the whole base signal into NaN
        raise ValueError("period must be non-zero, got %s" % period)
    if anomaly_type not in ('point', 'subsequence', 'both'):
        raise ValueError("anomaly_type must be 'point', 'subsequence', "
                         "or 'both', got '%s'" % anomaly_type)

    def _make_series(length):
        # Clean signal: noisy sinusoid, one per channel
        t = np.arange(length, dtype=np.float64)
        if n_channels == 1:
            base = np.sin(2 * np.pi * t / period)
            X = base + noise_std * rng.randn(length)
        else:
            X = np.empty((length, n_channels))
            for ch in range(n_channels):
                # each channel gets its own phase shift and a longer period
                phase = 2 * np.pi * ch / n_channels
                freq = period * (1 + 0.2 * ch)
                X[:, ch] = np.sin(2 * np.pi * t / freq + phase) \
                    + noise_std * rng.randn(length)
        return X

    def _inject_anomalies(X, length):
        # Modifies X in place and returns it together with the labels
        target_n_anom_timestamps = max(1, int(length * contamination))
        y = np.zeros(length, dtype=np.int32)

        # Choose anomaly locations (avoid first/last 10%, at least 5 points)
        margin = max(5, length // 10)
        candidates = np.arange(margin, length - margin)
        if len(candidates) == 0:
            # safeguard only: lengths >= 20 always leave candidates
            candidates = np.arange(1, length - 1)

        # For subsequence anomalies, compute how many events we need
        # to approximately hit the target timestamp count
        subseq_len = max(3, period // 5)
        if anomaly_type == 'point':
            n_events = target_n_anom_timestamps
        elif anomaly_type == 'subsequence':
            n_events = max(1, target_n_anom_timestamps // subseq_len)
        else:  # both
            # events are a mix of length-1 and length-subseq_len anomalies
            avg_len = (1 + subseq_len) / 2
            n_events = max(1, int(target_n_anom_timestamps / avg_len))

        # sampling without replacement cannot exceed the candidate count
        n_events = min(n_events, len(candidates))
        anom_indices = rng.choice(candidates, size=n_events, replace=False)
        anom_indices.sort()

        for idx in anom_indices:
            if anomaly_type == 'point' or \
                    (anomaly_type == 'both' and rng.rand() > 0.5):
                # Point anomaly: spike of magnitude in [4, 6), random sign
                magnitude = 4.0 + 2.0 * rng.rand()
                sign = 1 if rng.rand() > 0.5 else -1
                if n_channels == 1:
                    X[idx] += sign * magnitude
                else:
                    # only one randomly chosen channel is affected
                    ch = rng.randint(n_channels)
                    X[idx, ch] += sign * magnitude
                y[idx] = 1
            else:
                # Subsequence anomaly: shape change, clipped at series end
                end = min(idx + subseq_len, length)
                if n_channels == 1:
                    # keep the local level, replace the shape by 3x noise
                    X[idx:end] = np.mean(X[idx:end]) + \
                        3.0 * noise_std * rng.randn(end - idx)
                else:
                    # one random channel is replaced by zero-mean 3x noise
                    ch = rng.randint(n_channels)
                    X[idx:end, ch] = 3.0 * noise_std * rng.randn(end - idx)
                y[idx:end] = 1

        return X, y

    # Both clean series are generated before any anomaly is injected
    X_train = _make_series(n_train)
    X_test = _make_series(n_test)
    X_train, y_train = _inject_anomalies(X_train, n_train)
    X_test, y_test = _inject_anomalies(X_test, n_test)

    return X_train, X_test, y_train, y_test
783  
784  
def generate_graph_data(n_nodes=300, n_features=16, n_edges_per_node=5,
                        contamination=0.1, random_state=None):
    """Generate synthetic attributed graph data with planted anomalies.

    Normal nodes have features from N(0, 1). Anomaly nodes have features
    from N(0, 1) shifted by +5. Nodes are shuffled after generation. Edges
    come from random neighbor selection (undirected, no self-loops, no
    duplicates).

    Parameters
    ----------
    n_nodes : int, default=300
        Number of nodes.

    n_features : int, default=16
        Dimensionality of node features.

    n_edges_per_node : int, default=5
        Mean of the Poisson draw that sets how many neighbors each node
        samples. At least one is always drawn, and one extra candidate is
        sampled so a self-pick can be discarded.

    contamination : float, default=0.1
        Fraction of nodes that are anomalies. At least one node is always
        an anomaly.

    random_state : int, RandomState or None, default=None
        Seed for reproducibility.

    Returns
    -------
    X : np.ndarray of shape (n_nodes, n_features)
        Node feature matrix (float32).

    edge_index : np.ndarray of shape (2, n_edges)
        COO-format edge list (int64, no self-loops). Every undirected edge
        appears twice, once in each direction.

    y : np.ndarray of shape (n_nodes,)
        Binary labels (int32): 0 = normal, 1 = anomaly.
    """
    rng = check_random_state(random_state)

    n_anomalies = max(1, int(n_nodes * contamination))
    n_normal = n_nodes - n_anomalies

    # Features: normal from N(0,1), anomalies shifted by +5
    normal_feats = rng.randn(n_normal, n_features).astype(np.float32)
    anomaly_feats = (rng.randn(n_anomalies, n_features) + 5.0).astype(
        np.float32)
    X = np.vstack([normal_feats, anomaly_feats])
    y = np.concatenate([np.zeros(n_normal, dtype=np.int32),
                        np.ones(n_anomalies, dtype=np.int32)])

    # Shuffle so anomalies are not grouped at the end
    order = rng.permutation(n_nodes)
    X = X[order]
    y = y[order]

    # Undirected edges are stored once as (smaller, larger) to drop duplicates
    undirected = set()
    for node in range(n_nodes):
        n_nbrs = max(1, rng.poisson(n_edges_per_node))
        n_draws = min(n_nbrs + 1, n_nodes)
        for other in rng.choice(n_nodes, size=n_draws, replace=False):
            if other == node:
                continue  # no self-loops
            pair = (node, other) if node < other else (other, node)
            undirected.add(pair)

    # Emit each stored pair in both directions: (u, v) then (v, u)
    sources = [end for u, v in undirected for end in (u, v)]
    targets = [end for u, v in undirected for end in (v, u)]

    edge_index = np.array([sources, targets], dtype=np.int64)
    return X, edge_index, y