# -*- coding: utf-8 -*-
"""Utility functions for manipulating data
"""
# Author: Yue Zhao <zhaoy@cmu.edu>
# Author: Yahya Almardeny <almardeny@gmail.com>
# License: BSD 2 clause

from __future__ import division
from __future__ import print_function

from warnings import warn

import numpy as np
from sklearn.datasets import make_blobs
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.utils import check_X_y
from sklearn.utils import check_consistent_length
from sklearn.utils import check_random_state
from sklearn.utils import column_or_1d

from .utility import check_parameter
from .utility import precision_n_scores

MAX_INT = np.iinfo(np.int32).max


def _generate_data(n_inliers, n_outliers, n_features, coef, offset,
                   random_state, n_nan=0, n_inf=0):
    """Internal function to generate data samples.

    Parameters
    ----------
    n_inliers : int
        The number of inliers.

    n_outliers : int
        The number of outliers.

    n_features : int
        The number of features (dimensions).

    coef : float in range [0,1)+0.001
        The coefficient of data generation.

    offset : int
        Adjust the value range of Gaussian and Uniform.

    random_state : RandomState instance
        The random number generator used to draw the samples. It must
        expose ``randn`` and ``uniform``.

    n_nan : int
        The number of rows that are missing (np.nan). Defaults to zero.

    n_inf : int
        The number of rows that are infinite (np.inf). Defaults to zero.

    Returns
    -------
    X : numpy array of shape (n_inliers + n_outliers + n_nan + n_inf, n_features)
        Data.

    y : numpy array of shape (n_inliers + n_outliers + n_nan + n_inf,)
        Ground truth. Rows appended for ``n_nan`` / ``n_inf`` carry
        ``np.nan`` / ``np.inf`` as their label.
    """

    inliers = coef * random_state.randn(n_inliers, n_features) + offset
    outliers = random_state.uniform(low=-1 * offset, high=offset,
                                    size=(n_outliers, n_features))
    X = np.r_[inliers, outliers]

    y = np.r_[np.zeros((n_inliers,)), np.ones((n_outliers,))]

    if n_nan > 0:
        X = np.r_[X, np.full((n_nan, n_features), np.nan)]
        y = np.r_[y, np.full((n_nan), np.nan)]

    if n_inf > 0:
        X = np.r_[X, np.full((n_inf, n_features), np.inf)]
        y = np.r_[y, np.full((n_inf), np.inf)]

    return X, y


def get_outliers_inliers(X, y):
    """Internal method to separate inliers from outliers.

    Parameters
    ----------
    X : numpy array of shape (n_samples, n_features)
        The input samples

    y : list or array of shape (n_samples,)
        The ground truth of input samples.

    Returns
    -------
    X_outliers : numpy array of shape (n_outliers, n_features)
        Outliers (rows of ``X`` whose label equals 1).

    X_inliers : numpy array of shape (n_inliers, n_features)
        Inliers (rows of ``X`` whose label equals 0).

    """
    # A plain list compared with ``== 1`` yields a single scalar ``False``
    # instead of an element-wise mask, so convert to an array first.
    y = np.asarray(y)
    X_outliers = X[np.where(y == 1)]
    X_inliers = X[np.where(y == 0)]
    return X_outliers, X_inliers


def generate_data(n_train=1000, n_test=500, n_features=2, contamination=0.1,
                  train_only=False, offset=10, behaviour='new',
                  random_state=None, n_nan=0, n_inf=0):
    """Utility function to generate synthesized data.
    Normal data is generated by a multivariate Gaussian distribution and
    outliers are generated by a uniform distribution.
    "X_train, X_test, y_train, y_test" are returned.

    Parameters
    ----------
    n_train : int, (default=1000)
        The number of training points to generate.

    n_test : int, (default=500)
        The number of test points to generate.

    n_features : int, optional (default=2)
        The number of features (dimensions).

    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set, i.e.
        the proportion of outliers in the data set. Used when fitting to
        define the threshold on the decision function. A value that is
        neither float nor int is replaced by 0.1.

    train_only : bool, optional (default=False)
        If true, generate train data only.

    offset : int, optional (default=10)
        Adjust the value range of Gaussian and Uniform.

    behaviour : str, default='new'
        Behaviour of the returned datasets which can be either 'old' or
        'new'. Passing ``behaviour='new'`` returns
        "X_train, X_test, y_train, y_test", while passing ``behaviour='old'``
        returns "X_train, y_train, X_test, y_test".

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    n_nan : int
        The number of values that are missing (np.nan). Defaults to zero.

    n_inf : int
        The number of values that are infinite. (np.inf). Defaults to zero.

    Returns
    -------
    X_train : numpy array of shape (n_train, n_features)
        Training data.

    X_test : numpy array of shape (n_test, n_features)
        Test data.

    y_train : numpy array of shape (n_train,)
        Training ground truth.

    y_test : numpy array of shape (n_test,)
        Test ground truth.

    Notes
    -----
    When ``train_only`` is true only ``(X_train, y_train)`` is returned.

    """

    # initialize a random state and seeds for the instance
    random_state = check_random_state(random_state)
    offset_ = random_state.randint(low=offset)
    coef_ = random_state.random_sample() + 0.001  # in case of underflow

    # fall back to the default ratio when contamination is not numeric
    if not isinstance(contamination, (float, int)):
        contamination = 0.1

    n_outliers_train = int(n_train * contamination)
    n_inliers_train = int(n_train - n_outliers_train)

    X_train, y_train = _generate_data(n_inliers_train, n_outliers_train,
                                      n_features, coef_, offset_, random_state,
                                      n_nan, n_inf)

    if train_only:
        return X_train, y_train

    n_outliers_test = int(n_test * contamination)
    n_inliers_test = int(n_test - n_outliers_test)

    X_test, y_test = _generate_data(n_inliers_test, n_outliers_test,
                                    n_features, coef_, offset_, random_state,
                                    n_nan, n_inf)

    if behaviour == 'old':
        warn('behaviour="old" is deprecated and will be removed '
             'in version 0.9.0. Please use behaviour="new", which '
             'makes the returned datasets in the order of '
             'X_train, X_test, y_train, y_test.',
             FutureWarning)
        return X_train, y_train, X_test, y_test

    else:
        return X_train, X_test, y_train, y_test


def check_consistent_shape(X_train, y_train, X_test, y_test, y_train_pred,
                           y_test_pred):
    """Internal function to check input data shapes are consistent.

    Parameters
    ----------
    X_train : numpy array of shape (n_samples, n_features)
        The training samples.

    y_train : list or array of shape (n_samples,)
        The ground truth of training samples.

    X_test : numpy array of shape (n_samples, n_features)
        The test samples.

    y_test : list or array of shape (n_samples,)
        The ground truth of test samples.

    y_train_pred : numpy array of shape (n_samples,)
        The predicted binary labels of the training samples.

    y_test_pred : numpy array of shape (n_samples,)
        The predicted binary labels of the test samples.

    Returns
    -------
    X_train : numpy array of shape (n_samples, n_features)
        The training samples.

    y_train : list or array of shape (n_samples,)
        The ground truth of training samples.

    X_test : numpy array of shape (n_samples, n_features)
        The test samples.

    y_test : list or array of shape (n_samples,)
        The ground truth of test samples.

    y_train_pred : numpy array of shape (n_samples,)
        The predicted binary labels of the training samples.

    y_test_pred : numpy array of shape (n_samples,)
        The predicted binary labels of the test samples.

    Raises
    ------
    ValueError
        If ``X_train`` and ``X_test`` have a different number of features.
    """

    # check input data shapes are consistent
    X_train, y_train = check_X_y(X_train, y_train)
    X_test, y_test = check_X_y(X_test, y_test)

    y_test_pred = column_or_1d(y_test_pred)
    y_train_pred = column_or_1d(y_train_pred)

    check_consistent_length(y_train, y_train_pred)
    check_consistent_length(y_test, y_test_pred)

    if X_train.shape[1] != X_test.shape[1]:
        raise ValueError("X_train {0} and X_test {1} have different number "
                         "of features.".format(X_train.shape, X_test.shape))

    return X_train, y_train, X_test, y_test, y_train_pred, y_test_pred


def evaluate_print(clf_name, y, y_pred):
    """Utility function for evaluating and printing the results for examples.
    Default metrics include ROC and Precision @ n

    Parameters
    ----------
    clf_name : str
        The name of the detector.

    y : list or numpy array of shape (n_samples,)
        The ground truth. Binary (0: inliers, 1: outliers).

    y_pred : list or numpy array of shape (n_samples,)
        The raw outlier scores as returned by a fitted model.

    """

    y = column_or_1d(y)
    y_pred = column_or_1d(y_pred)
    check_consistent_length(y, y_pred)

    print('{clf_name} ROC:{roc}, precision @ rank n:{prn}'.format(
        clf_name=clf_name,
        roc=np.round(roc_auc_score(y, y_pred), decimals=4),
        prn=np.round(precision_n_scores(y, y_pred), decimals=4)))


def generate_data_clusters(n_train=1000, n_test=500, n_clusters=2,
                           n_features=2, contamination=0.1, size='same',
                           density='same', dist=0.25, random_state=None,
                           return_in_clusters=False):
    """Utility function to generate synthesized data in clusters.
    Generated data can involve the low density pattern problem and global
    outliers which are considered as difficult tasks for outliers detection
    algorithms.

    Parameters
    ----------
    n_train : int, (default=1000)
        The number of training points to generate.

    n_test : int, (default=500)
        The number of test points to generate.

    n_clusters : int, optional (default=2)
       The number of centers (i.e. clusters) to generate.

    n_features : int, optional (default=2)
       The number of features for each sample.

    contamination : float in (0., 0.5), optional (default=0.1)
       The amount of contamination of the data set, i.e.
       the proportion of outliers in the data set.

    size : str, optional (default='same')
       Size of each cluster: 'same' generates clusters with same size,
       'different' generate clusters with different sizes.

    density : str, optional (default='same')
       Density of each cluster: 'same' generates clusters with same density,
       'different' generate clusters with different densities.

    dist: float, optional (default=0.25)
       Distance between clusters. Should be between 0. and 1.0
       It is used to avoid clusters overlapping as much as possible.
       However, if number of samples and number of clusters are too high,
       it is unlikely to separate them fully even if ``dist`` set to 1.0

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    return_in_clusters : bool, optional (default=False)
        If True, the function returns ``(X_clusters, y_clusters)``: two
        lists of numpy arrays where each index represents a cluster.
        If False, the clusters are joined and split into
        X_train, X_test, y_train, y_test.

    Returns
    -------
    X_train : numpy array of shape (n_train, n_features)
        Training data.

    X_test : numpy array of shape (n_test, n_features)
        Test data.

    y_train : numpy array of shape (n_train,)
        Training ground truth.

    y_test : numpy array of shape (n_test,)
        Test ground truth.

    Raises
    ------
    ValueError
        If a parameter has the wrong type or value, or if there are too few
        samples for the requested number of clusters.
    """
    # initialize a random state and seeds for the instance
    random_state = check_random_state(random_state)

    if isinstance(n_clusters, int):
        check_parameter(n_clusters, low=1, param_name='n_clusters')
    else:
        raise ValueError("n_clusters should be int, got %s" % n_clusters)

    if isinstance(n_features, int):
        check_parameter(n_features, low=1, param_name='n_features')
    else:
        raise ValueError("n_features should be int, got %s" % n_features)

    if isinstance(contamination, (float, int)):
        check_parameter(contamination, low=0, high=0.5,
                        param_name='contamination')
    else:
        raise ValueError(
            "contamination should be float, got %s" % contamination)

    if isinstance(dist, float):
        check_parameter(dist, low=0, high=1.0, param_name='dist')
    else:
        raise ValueError("dist should be float, got %s" % dist)

    if not isinstance(return_in_clusters, bool):
        raise ValueError("return_in_clusters should be of type bool, "
                         "got %s" % return_in_clusters)

    # find the required number of outliers and inliers
    n_samples = n_train + n_test
    n_outliers = int(n_samples * contamination)
    n_inliers = n_samples - n_outliers

    if size == 'same':
        a_ = [int(n_inliers / n_clusters)] * (n_clusters - 1)
        # the last cluster absorbs the remainder of the integer division
        clusters_size = a_ + [int(n_inliers - sum(a_))]
    elif size == 'different':
        if (n_clusters * 10) > n_samples:
            raise ValueError('number of samples should be at least 10 times '
                             'of the number of clusters')
        if (n_clusters * 10) > n_inliers:
            raise ValueError('contamination ratio is too high, try to increase'
                             ' number of samples or decrease the contamination')
        _r = 1. / n_clusters
        # draw positive offsets for half of the clusters and mirror them so
        # that the size deviations cancel out
        _offset = random_state.uniform(_r * 0.2, _r * 0.4,
                                       size=(int(n_clusters / 2),)).tolist()
        _offset += [i * -1. for i in _offset]
        clusters_size = np.round(
            np.multiply(n_inliers, np.add(_r, _offset))).astype(int)
        if n_clusters % 2 == 0:  # if it is even number
            clusters_size[n_clusters - 1] += n_inliers - sum(clusters_size)
        else:
            # odd count: the mirrored offsets cover n_clusters - 1 clusters,
            # the remaining inliers form the last one
            clusters_size = np.append(clusters_size,
                                      n_inliers - sum(clusters_size))
    else:
        raise ValueError(
            'size should be a string of value \'same\' or \'different\'')

    # check for clusters densities and apply split accordingly
    if density == 'same':
        clusters_density = random_state.uniform(low=0.1, high=0.5, size=(
            1,)).tolist() * n_clusters
    elif density == 'different':
        clusters_density = random_state.uniform(low=0.1, high=0.5,
                                                size=(n_clusters,))
    else:
        raise ValueError(
            'density should be a string of value \'same\' or \'different\'')

    # calculate number of outliers for every cluster
    n_outliers_ = [int(round(clusters_size[i] * contamination))
                   for i in range(n_clusters)]
    _diff = int((n_outliers - sum(n_outliers_)) / n_clusters)
    for i in range(n_clusters - 1):
        n_outliers_[i] += _diff
    n_outliers_[n_clusters - 1] += n_outliers - sum(n_outliers_)
    random_state.shuffle(n_outliers_)

    # generate data
    X_clusters, y_clusters = [], []
    X, y = np.zeros([n_samples, n_features]), np.zeros([n_samples, ])

    # cluster i draws its center between boundary i and boundary i + 1;
    # the zero boundary is dropped
    box_limit = np.power(n_samples * n_clusters, dist)
    center_box = [b for b in np.linspace(-box_limit, box_limit,
                                         n_clusters + 2) if b != 0]

    # index tracker for value assignment
    tracker_idx = 0

    for i in range(n_clusters):
        inlier_blob, _y = make_blobs(n_samples=clusters_size[i], centers=1,
                                     cluster_std=clusters_density[i],
                                     center_box=(center_box[i],
                                                 center_box[i + 1]),
                                     n_features=n_features,
                                     random_state=random_state)

        n_out = int(n_outliers_[i])
        if n_out > 0:
            center_box_l = center_box[i] * (
                    1.2 + dist + clusters_density[i])
            center_box_r = center_box[i + 1] * (
                    1.2 + dist + clusters_density[i])
            # outliers are spread 3.5x to 4x wider than the cluster itself;
            # drawn before the blob so the random stream order is unchanged
            outlier_std = random_state.uniform(clusters_density[i] * 3.5,
                                               clusters_density[i] * 4.,
                                               size=1)
            outlier_blob = make_blobs(n_samples=n_out, centers=1,
                                      cluster_std=outlier_std,
                                      center_box=(center_box_l, center_box_r),
                                      n_features=n_features,
                                      random_state=random_state)[0]
        else:
            # a cluster may receive no outliers at all; keep an empty block
            # so the bookkeeping below stays valid
            outlier_blob = np.empty((0, n_features))

        _y = np.append(_y, np.ones(n_out, dtype=_y.dtype))

        # generate X
        cluster_X = np.vstack((inlier_blob, outlier_blob))
        X_clusters.append(cluster_X)
        tracker_idx_new = tracker_idx + cluster_X.shape[0]
        X[tracker_idx:tracker_idx_new, :] = cluster_X

        # generate Y
        y_clusters.append(_y)
        y[tracker_idx:tracker_idx_new, ] = _y

        tracker_idx = tracker_idx_new

    if return_in_clusters:
        return X_clusters, y_clusters

    # return X_train, X_test, y_train, y_test
    else:
        return train_test_split(X, y, test_size=n_test,
                                random_state=random_state)


def generate_data_categorical(n_train=1000, n_test=500, n_features=2,
                              n_informative=2, n_category_in=2,
                              n_category_out=2, contamination=0.1,
                              shuffle=True, random_state=None):
    """Utility function to generate synthesized categorical data.

    Parameters
    ----------
    n_train : int, (default=1000)
        The number of training points to generate.

    n_test : int, (default=500)
        The number of test points to generate.

    n_features : int, optional (default=2)
       The number of features for each sample.

    n_informative : int in (1, n_features), optional (default=2)
       The number of informative features in the outlier points.
       The higher the easier the outlier detection should be.
       Note that n_informative should not be greater than n_features.

    n_category_in : int in (1, n_inliers), optional (default=2)
       The number of categories in the inlier points.

    n_category_out : int in (1, n_outliers), optional (default=2)
       The number of categories in the outlier points.

    contamination : float in (0., 0.5), optional (default=0.1)
       The amount of contamination of the data set, i.e.
       the proportion of outliers in the data set.

    shuffle: bool, optional(default=True)
        If True, inliers will be shuffled which makes more noisy distribution.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.


    Returns
    -------
    X_train : numpy array of shape (n_train, n_features)
        Training data.

    X_test : numpy array of shape (n_test, n_features)
        Test data.

    y_train : numpy array of shape (n_train,)
        Training ground truth.

    y_test : numpy array of shape (n_test,)
        Test ground truth.

    Raises
    ------
    ValueError
        If a parameter has the wrong type.
    """

    # initialize a random state and seeds for the instance
    random_state = check_random_state(random_state)

    if isinstance(n_train, int):
        check_parameter(n_train, low=1, param_name='n_train')
    else:
        raise ValueError("n_train should be int, got %s" % n_train)

    if isinstance(n_test, int):
        check_parameter(n_test, low=0, param_name='n_test')
    else:
        raise ValueError("n_test should be int, got %s" % n_test)

    if isinstance(n_features, int):
        check_parameter(n_features, low=0, param_name='n_features')
    else:
        raise ValueError("n_features should be int, got %s" % n_features)

    if isinstance(n_informative, int):
        check_parameter(n_informative, low=0, high=n_features + 1,
                        param_name='n_informative')
    else:
        raise ValueError("n_informative should be int, got %s" % n_informative)

    if isinstance(contamination, (float, int)):
        check_parameter(contamination, low=0, high=0.5,
                        param_name='contamination')
    else:
        raise ValueError(
            "contamination should be float, got %s" % contamination)

    if not isinstance(shuffle, bool):
        raise ValueError("shuffle should be bool, got %s" % shuffle)

    # find the required number of outliers and inliers
    n_samples = n_train + n_test
    n_outliers = int(n_samples * contamination)
    n_inliers = n_samples - n_outliers

    if isinstance(n_category_in, int):
        check_parameter(n_category_in, low=0, high=n_inliers + 1,
                        param_name='n_category_in')
    else:
        raise ValueError("n_category_in should be int, got %s" % n_category_in)

    if isinstance(n_category_out, int):
        check_parameter(n_category_out, low=0, high=n_outliers + 1,
                        param_name='n_category_out')
    else:
        raise ValueError(
            "n_category_out should be int, got %s" % n_category_out)

    # Encapsulated function to generate feature names:
    # 1 -> 'A', 26 -> 'Z', 27 -> 'AA', ... (spreadsheet-column style)
    def _feature_name(f):
        if f == 0:
            return ''
        quot, rem = divmod(f - 1, 26)
        return _feature_name(quot) + chr(rem + ord('A'))

    # generate pool of features to be the base for naming the data points
    features = [_feature_name(i) for i in range(1, n_features + 1)]

    # find the required distributions of categories over inliers and outliers
    # (the last category absorbs the remainder of the integer division)
    temp_ = [int(n_inliers / n_category_in)] * (n_category_in - 1)
    dist_in = temp_ + [int(n_inliers - sum(temp_))]
    temp_ = [int(n_outliers / n_category_out)] * (n_category_out - 1)
    dist_out = temp_ + [int(n_outliers - sum(temp_))]

    # generate categorical data
    X = []
    for count, f in enumerate(features):
        inliers = np.hstack(
            [[f + str(i)] * dist_in[i] for i in range(n_category_in)])
        if shuffle:
            random_state.shuffle(inliers)
        if count < n_informative:
            # informative feature: outliers use category ids that never
            # occur among the inliers
            outliers = list(np.hstack(
                [[f + str((n_category_in * 2) + i)] * dist_out[i]
                 for i in range(n_category_out)]))
        else:
            # non-informative feature: outliers reuse randomly picked
            # inlier values
            outliers = list(inliers[random_state.randint(0, len(inliers),
                                                         size=n_outliers)])

        X.append(list(inliers) + outliers)

    # returns X_train, X_test, y_train, y_test
    return train_test_split(np.array(X).T,
                            np.array(([0] * n_inliers) + ([1] * n_outliers)),
                            test_size=n_test,
                            random_state=random_state)


def generate_ts_data(n_train=500, n_test=200, n_channels=1,
                     contamination=0.05, period=50, noise_std=0.3,
                     anomaly_type='point', random_state=None):
    """Generate synthetic time series data with injected anomalies.

    Creates a sinusoidal base signal with Gaussian noise and injects
    anomalies at random locations. Follows conventions from the TS-AD
    literature (e.g., TSB-AD benchmark).

    Parameters
    ----------
    n_train : int, optional (default=500)
        Length of training time series.
    n_test : int, optional (default=200)
        Length of test time series.
    n_channels : int, optional (default=1)
        Number of channels (univariate=1, multivariate>1).
    contamination : float, optional (default=0.05)
        Fraction of timestamps that are anomalous (approximately).
        For subsequence anomalies, the total labeled timestamps are
        controlled to stay near this fraction.
    period : int, optional (default=50)
        Period of the sinusoidal base signal.
    noise_std : float, optional (default=0.3)
        Standard deviation of Gaussian noise.
    anomaly_type : str, optional (default='point')
        Type of anomaly: 'point' (spikes), 'subsequence' (shape change),
        or 'both'.
    random_state : int, RandomState instance, or None (default=None)
        Random seed for reproducibility.

    Returns
    -------
    X_train : np.ndarray of shape (n_train,) or (n_train, n_channels)
        Training time series. Univariate returned as 1D.
    X_test : np.ndarray of shape (n_test,) or (n_test, n_channels)
        Test time series.
    y_train : np.ndarray of shape (n_train,)
        Binary labels (1=anomaly, 0=normal) for training.
    y_test : np.ndarray of shape (n_test,)
        Binary labels for test.

    Raises
    ------
    ValueError
        If ``n_train`` or ``n_test`` is below 20, ``n_channels`` is below 1,
        ``contamination`` is outside (0, 0.5) or ``anomaly_type`` is unknown.
    """
    rng = check_random_state(random_state)

    # Validate parameters
    if n_train < 20:
        raise ValueError("n_train must be >= 20, got %d" % n_train)
    if n_test < 20:
        raise ValueError("n_test must be >= 20, got %d" % n_test)
    if n_channels < 1:
        raise ValueError("n_channels must be >= 1, got %d" % n_channels)
    if not 0 < contamination < 0.5:
        raise ValueError("contamination must be in (0, 0.5), got %f"
                         % contamination)
    if anomaly_type not in ('point', 'subsequence', 'both'):
        raise ValueError("anomaly_type must be 'point', 'subsequence', "
                         "or 'both', got '%s'" % anomaly_type)

    def _make_series(length):
        # Noisy sinusoid; each extra channel gets its own phase and a
        # longer period.
        t = np.arange(length, dtype=np.float64)
        if n_channels == 1:
            base = np.sin(2 * np.pi * t / period)
            X = base + noise_std * rng.randn(length)
        else:
            X = np.empty((length, n_channels))
            for ch in range(n_channels):
                phase = 2 * np.pi * ch / n_channels
                freq = period * (1 + 0.2 * ch)
                X[:, ch] = np.sin(2 * np.pi * t / freq + phase) \
                    + noise_std * rng.randn(length)
        return X

    def _inject_anomalies(X, length):
        # Modifies X in place and returns it together with the label vector.
        target_n_anom_timestamps = max(1, int(length * contamination))
        y = np.zeros(length, dtype=np.int32)

        # Choose anomaly locations (avoid first/last 10%)
        margin = max(5, length // 10)
        candidates = np.arange(margin, length - margin)
        if len(candidates) == 0:
            candidates = np.arange(1, length - 1)

        # For subsequence anomalies, compute how many events we need
        # to approximately hit the target timestamp count
        subseq_len = max(3, period // 5)
        if anomaly_type == 'point':
            n_events = target_n_anom_timestamps
        elif anomaly_type == 'subsequence':
            n_events = max(1, target_n_anom_timestamps // subseq_len)
        else:  # both
            avg_len = (1 + subseq_len) / 2
            n_events = max(1, int(target_n_anom_timestamps / avg_len))

        n_events = min(n_events, len(candidates))
        anom_indices = rng.choice(candidates, size=n_events, replace=False)
        anom_indices.sort()

        for idx in anom_indices:
            if anomaly_type == 'point' or \
                    (anomaly_type == 'both' and rng.rand() > 0.5):
                # Point anomaly: spike
                magnitude = 4.0 + 2.0 * rng.rand()
                sign = 1 if rng.rand() > 0.5 else -1
                if n_channels == 1:
                    X[idx] += sign * magnitude
                else:
                    ch = rng.randint(n_channels)
                    X[idx, ch] += sign * magnitude
                y[idx] = 1
            else:
                # Subsequence anomaly: shape change
                end = min(idx + subseq_len, length)
                if n_channels == 1:
                    X[idx:end] = np.mean(X[idx:end]) + \
                        3.0 * noise_std * rng.randn(end - idx)
                else:
                    ch = rng.randint(n_channels)
                    X[idx:end, ch] = 3.0 * noise_std * rng.randn(end - idx)
                y[idx:end] = 1

        return X, y

    X_train = _make_series(n_train)
    X_test = _make_series(n_test)
    X_train, y_train = _inject_anomalies(X_train, n_train)
    X_test, y_test = _inject_anomalies(X_test, n_test)

    return X_train, X_test, y_train, y_test


def generate_graph_data(n_nodes=300, n_features=16, n_edges_per_node=5,
                        contamination=0.1, random_state=None):
    """Generate synthetic attributed graph data with planted anomalies.

    Normal nodes have features from N(0, 1). Anomaly nodes have features
    shifted by +5 standard deviations. Edges are generated via random
    neighbor selection (undirected, no self-loops, no duplicates).

    Parameters
    ----------
    n_nodes : int, default=300
        Number of nodes.

    n_features : int, default=16
        Dimensionality of node features.

    n_edges_per_node : int, default=5
        Average number of edges per node (Poisson-sampled per node).

    contamination : float, default=0.1
        Fraction of nodes that are anomalies. At least one anomaly is
        always planted.

    random_state : int, RandomState or None, default=None
        Seed for reproducibility.

    Returns
    -------
    X : np.ndarray of shape (n_nodes, n_features)
        Node feature matrix (float32).

    edge_index : np.ndarray of shape (2, n_edges)
        COO-format edge list (int64, undirected, no self-loops). Every
        undirected edge is stored in both directions.

    y : np.ndarray of shape (n_nodes,)
        Binary labels: 0 = normal, 1 = anomaly.
    """
    rng = check_random_state(random_state)

    n_anomalies = max(1, int(n_nodes * contamination))
    n_normal = n_nodes - n_anomalies

    # Features: normal from N(0,1), anomalies shifted by +5
    X_normal = rng.randn(n_normal, n_features).astype(np.float32)
    X_anomaly = (rng.randn(n_anomalies, n_features) + 5.0).astype(
        np.float32)
    X = np.vstack([X_normal, X_anomaly])
    y = np.concatenate([np.zeros(n_normal, dtype=np.int32),
                        np.ones(n_anomalies, dtype=np.int32)])

    # Shuffle
    perm = rng.permutation(n_nodes)
    X, y = X[perm], y[perm]

    # Generate edges via random neighbor selection
    edges = set()
    for i in range(n_nodes):
        n_nbrs = max(1, rng.poisson(n_edges_per_node))
        # draw one extra candidate because the node itself may be picked
        # and is skipped below
        candidates = rng.choice(n_nodes, size=min(n_nbrs + 1, n_nodes),
                                replace=False)
        for j in candidates:
            if i != j:
                # store each pair in canonical (small, large) order so the
                # set removes duplicates
                u, v = (i, j) if i < j else (j, i)
                edges.add((u, v))

    rows, cols = [], []
    for u, v in edges:
        rows.extend([u, v])
        cols.extend([v, u])

    edge_index = np.array([rows, cols], dtype=np.int64)
    return X, edge_index, y