alad.py
# -*- coding: utf-8 -*-
"""Using Adversarially Learned Anomaly Detection (ALAD)
"""
# Author: Michiel Bongaerts (but not author of the ALAD method)
# PyTorch version author: Jiaqi Li <jli77629@usc.edu>


import numpy as np
import pandas as pd

try:
    import torch
except ImportError:
    print('please install torch first')

import torch
import torch.nn as nn
import torch.optim as optim
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted

from .base import BaseDetector
from ..utils.utility import check_parameter


class ALAD(BaseDetector):
    """Adversarially Learned Anomaly Detection (ALAD).
    Paper: https://arxiv.org/pdf/1812.02288.pdf

    See :cite:`zenati2018adversarially` for details.

    Parameters
    ----------
    output_activation : str, optional (default=None)
        Activation function to use for the output layers of the encoder
        and decoder.

    activation_hidden_disc : str, optional (default='tanh')
        Activation function to use for the hidden layers of the
        discriminators.

    activation_hidden_gen : str, optional (default='tanh')
        Activation function to use for the hidden layers of the encoder and
        decoder (i.e. the generator).

    epochs : int, optional (default=200)
        Number of epochs to train the model.

    batch_size : int, optional (default=32)
        Number of samples per gradient update.

    dropout_rate : float in (0., 1), optional (default=0.2)
        The dropout rate used across all layers.

    latent_dim : int, optional (default=2)
        Dimension of the latent space.

    dec_layers : list, optional (default=[5, 10, 25])
        Number of nodes per hidden layer of the decoder network.
        Thus, [10, 10] indicates 2 hidden layers with 10 nodes each.

    enc_layers : list, optional (default=[25, 10, 5])
        Number of nodes per hidden layer of the encoder network.
        Thus, [10, 10] indicates 2 hidden layers with 10 nodes each.

    disc_xx_layers : list, optional (default=[25, 10, 5])
        Number of nodes per hidden layer of discriminator_xx.
        Thus, [10, 10] indicates 2 hidden layers with 10 nodes each.

    disc_zz_layers : list, optional (default=[25, 10, 5])
        Number of nodes per hidden layer of discriminator_zz.
        Thus, [10, 10] indicates 2 hidden layers with 10 nodes each.

    disc_xz_layers : list, optional (default=[25, 10, 5])
        Number of nodes per hidden layer of discriminator_xz.
        Thus, [10, 10] indicates 2 hidden layers with 10 nodes each.

    learning_rate_gen : float in (0., 1), optional (default=0.0001)
        Learning rate for training the encoder and decoder.

    learning_rate_disc : float in (0., 1), optional (default=0.0001)
        Learning rate for training the discriminators.

    add_recon_loss : bool, optional (default=False)
        If True, add an extra loss for the encoder and decoder based on the
        reconstruction error.

    lambda_recon_loss : float in (0., 1), optional (default=0.1)
        If ``add_recon_loss=True``, the reconstruction loss gets multiplied
        by ``lambda_recon_loss`` and added to the total loss for the
        generator (i.e. encoder and decoder).

    add_disc_zz_loss : bool, optional (default=True)
        If True, include the discriminator_zz loss terms in both the
        discriminator and generator objectives.

    spectral_normalization : bool, optional (default=False)
        If True, apply spectral normalization to the discriminator layers
        (requires torch>=1.0.0).

    preprocessing : bool, optional (default=False)
        If True, apply standardization on the data.

    verbose : int, optional (default=0)
        Verbosity mode.

        - 0 = silent
        - 1 = print training progress every 50 epochs

    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set, i.e.
        the proportion of outliers in the data set. When fitting, this is
        used to define the threshold on the decision function.

    device : str or None, optional (default=None)
        The device to use for computation. If None, CUDA is used when
        available, otherwise the CPU. Possible values include 'cpu' and
        'cuda'.

    Attributes
    ----------
    decision_scores_ : numpy array of shape (n_samples,)
        The outlier scores of the training data [0,1].
        The higher, the more abnormal. Outliers tend to have higher
        scores. This value is available once the detector is
        fitted.

    threshold_ : float
        The threshold is based on ``contamination``. It is the
        ``n_samples * contamination`` most abnormal samples in
        ``decision_scores_``. The threshold is calculated for generating
        binary outlier labels.

    labels_ : int, either 0 or 1
        The binary labels of the training data. 0 stands for inliers
        and 1 for outliers/anomalies. It is generated by applying
        ``threshold_`` on ``decision_scores_``.
    """

    def __init__(self, activation_hidden_gen='tanh',
                 activation_hidden_disc='tanh',
                 output_activation=None,
                 dropout_rate=0.2,
                 latent_dim=2,
                 dec_layers=[5, 10, 25],
                 enc_layers=[25, 10, 5],
                 disc_xx_layers=[25, 10, 5],
                 disc_zz_layers=[25, 10, 5],
                 disc_xz_layers=[25, 10, 5],
                 learning_rate_gen=0.0001, learning_rate_disc=0.0001,
                 add_recon_loss=False, lambda_recon_loss=0.1,
                 epochs=200,
                 verbose=0,
                 preprocessing=False,
                 add_disc_zz_loss=True, spectral_normalization=False,
                 batch_size=32, contamination=0.1, device=None):
        super(ALAD, self).__init__(contamination=contamination)

        self.device = device if device else torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.activation_hidden_disc = activation_hidden_disc
        self.activation_hidden_gen = activation_hidden_gen
        self.output_activation = output_activation
        self.dropout_rate = dropout_rate
        self.latent_dim = latent_dim
        self.dec_layers = dec_layers
        self.enc_layers = enc_layers

        self.disc_xx_layers = disc_xx_layers
        self.disc_zz_layers = disc_zz_layers
        self.disc_xz_layers = disc_xz_layers

        self.add_recon_loss = add_recon_loss
        self.lambda_recon_loss = lambda_recon_loss
        self.add_disc_zz_loss = add_disc_zz_loss

        self.contamination = contamination
        self.epochs = epochs
        self.learning_rate_gen = learning_rate_gen
        self.learning_rate_disc = learning_rate_disc
        self.preprocessing = preprocessing
        self.batch_size = batch_size
        self.verbose = verbose
        self.spectral_normalization = spectral_normalization

        if self.spectral_normalization:
            try:
                # ``spectral_norm`` is a function, so it must be imported
                # with ``from ... import``, not ``import ... as``.
                from torch.nn.utils import spectral_norm
                self.spectral_norm = spectral_norm
            except ImportError:
                print('Spectral normalization not available. '
                      'Install torch>=1.0.0.')
                self.spectral_normalization = False

        check_parameter(dropout_rate, 0, 1, param_name='dropout_rate',
                        include_left=True)

    def _build_model(self):
        def get_activation(name):
            if name == 'tanh':
                return nn.Tanh()
            elif name == 'sigmoid':
                return nn.Sigmoid()
            elif name == 'relu':
                return nn.ReLU()
            else:
                raise ValueError(
                    "Unsupported activation function: {}".format(name))

        # Create the decoder: latent_dim -> hidden layers -> n_features_
        dec_layers = []
        input_dim = self.latent_dim
        for l_dim in self.dec_layers:
            dec_layers.append(nn.Linear(input_dim, l_dim))
            dec_layers.append(nn.Dropout(self.dropout_rate))
            dec_layers.append(get_activation(self.activation_hidden_gen))
            input_dim = l_dim
        dec_layers.append(nn.Linear(input_dim, self.n_features_))
        if self.output_activation:
            dec_layers.append(get_activation(self.output_activation))
        self.dec = nn.Sequential(*dec_layers).to(self.device)

        # Create the encoder: n_features_ -> hidden layers -> latent_dim
        enc_layers = []
        input_dim = self.n_features_
        for l_dim in self.enc_layers:
            enc_layers.append(nn.Linear(input_dim, l_dim))
            enc_layers.append(nn.Dropout(self.dropout_rate))
            enc_layers.append(get_activation(self.activation_hidden_gen))
            input_dim = l_dim
        enc_layers.append(nn.Linear(input_dim, self.latent_dim))
        if self.output_activation:
            enc_layers.append(get_activation(self.output_activation))
        self.enc = nn.Sequential(*enc_layers).to(self.device)

        # Create the discriminators
        def create_discriminator(layers, input_dim):
            disc_layers = []
            for l_dim in layers:
                disc_layers.append(nn.Linear(input_dim, l_dim))
                if self.spectral_normalization:
                    disc_layers[-1] = nn.utils.spectral_norm(disc_layers[-1])
                disc_layers.append(nn.Dropout(self.dropout_rate))
                disc_layers.append(
                    get_activation(self.activation_hidden_disc))
                input_dim = l_dim
            disc_layers.append(nn.Linear(input_dim, 1))
            disc_layers.append(nn.Sigmoid())
            return nn.Sequential(*disc_layers).to(self.device)

        self.disc_xx = create_discriminator(self.disc_xx_layers,
                                            2 * self.n_features_)
        self.disc_zz = create_discriminator(self.disc_zz_layers,
                                            2 * self.latent_dim)
        self.disc_xz = create_discriminator(self.disc_xz_layers,
                                            self.n_features_ +
                                            self.latent_dim)

        # Optimizers
        self.opt_gen = optim.Adam(
            list(self.enc.parameters()) + list(self.dec.parameters()),
            lr=self.learning_rate_gen)
        self.opt_disc = optim.Adam(
            list(self.disc_xx.parameters()) +
            list(self.disc_xz.parameters()) +
            list(self.disc_zz.parameters()),
            lr=self.learning_rate_disc)

        self.hist_loss_disc = []
        self.hist_loss_gen = []

    def train_step(self, data):
        x_real, z_real = data

        x_real = torch.FloatTensor(x_real).to(self.device)
        z_real = torch.FloatTensor(z_real).to(self.device)

        bce = nn.BCELoss()

        # --- Discriminator update ---
        self.opt_disc.zero_grad()
        x_gen = self.dec(z_real)
        z_gen = self.enc(x_real)

        out_true_xz = self.disc_xz(torch.cat((x_real, z_gen), dim=1))
        out_fake_xz = self.disc_xz(torch.cat((x_gen, z_real), dim=1))

        out_true_xx = self.disc_xx(torch.cat((x_real, x_real), dim=1))
        out_fake_xx = self.disc_xx(torch.cat((x_real, x_gen), dim=1))

        loss_dxz = (bce(out_true_xz, torch.ones_like(out_true_xz)) +
                    bce(out_fake_xz, torch.zeros_like(out_fake_xz)))
        loss_dxx = (bce(out_true_xx, torch.ones_like(out_true_xx)) +
                    bce(out_fake_xx, torch.zeros_like(out_fake_xx)))

        if self.add_disc_zz_loss:
            out_true_zz = self.disc_zz(torch.cat((z_real, z_real), dim=1))
            out_fake_zz = self.disc_zz(torch.cat((z_real, z_gen), dim=1))
            loss_dzz = (bce(out_true_zz, torch.ones_like(out_true_zz)) +
                        bce(out_fake_zz, torch.zeros_like(out_fake_zz)))
            loss_disc = loss_dxz + loss_dzz + loss_dxx
        else:
            loss_disc = loss_dxz + loss_dxx

        loss_disc.backward()
        self.opt_disc.step()

        # --- Generator (encoder + decoder) update ---
        self.opt_gen.zero_grad()
        x_gen = self.dec(z_real)
        z_gen = self.enc(x_real)

        out_true_xz = self.disc_xz(torch.cat((x_real, z_gen), dim=1))
        out_fake_xz = self.disc_xz(torch.cat((x_gen, z_real), dim=1))

        out_true_xx = self.disc_xx(torch.cat((x_real, x_real), dim=1))
        out_fake_xx = self.disc_xx(torch.cat((x_real, x_gen), dim=1))

        loss_gexz = (bce(out_fake_xz, torch.ones_like(out_fake_xz)) +
                     bce(out_true_xz, torch.zeros_like(out_true_xz)))
        loss_gexx = (bce(out_fake_xx, torch.ones_like(out_fake_xx)) +
                     bce(out_true_xx, torch.zeros_like(out_true_xx)))

        if self.add_disc_zz_loss:
            out_true_zz = self.disc_zz(torch.cat((z_real, z_real), dim=1))
            out_fake_zz = self.disc_zz(torch.cat((z_real, z_gen), dim=1))
            loss_gezz = (bce(out_fake_zz, torch.ones_like(out_fake_zz)) +
                         bce(out_true_zz, torch.zeros_like(out_true_zz)))
            cycle_consistency = loss_gezz + loss_gexx
            loss_gen = loss_gexz + cycle_consistency
        else:
            cycle_consistency = loss_gexx
            loss_gen = loss_gexz + cycle_consistency

        if self.add_recon_loss:
            # Optional extra reconstruction loss for the generator
            x_recon = self.dec(self.enc(x_real))
            loss_recon = torch.mean((x_real - x_recon) ** 2)
            loss_gen += loss_recon * self.lambda_recon_loss

        loss_gen.backward()
        self.opt_gen.step()

        self.hist_loss_disc.append(loss_disc.item())
        self.hist_loss_gen.append(loss_gen.item())

    def fit(self, X, y=None, noise_std=0.1):
        """Fit detector. y is ignored in unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : Ignored
            Not used, present for API consistency by convention.

        noise_std : float, optional (default=0.1)
            Standard deviation of the Gaussian noise added to the training
            samples in each iteration.

        Returns
        -------
        self : object
            Fitted estimator.
349 """ 350 # validate inputs X and y (optional) 351 X = check_array(X) 352 self._set_n_classes(y) 353 354 # Get number of sampels and features from train set 355 self.n_samples_, self.n_features_ = X.shape[0], X.shape[1] 356 self._build_model() 357 358 # Apply data scaling or not 359 if self.preprocessing: 360 self.scaler_ = StandardScaler() 361 X_norm = self.scaler_.fit_transform(X) 362 else: 363 X_norm = np.copy(X) 364 365 for n in range(self.epochs): 366 if n % 50 == 0 and n != 0 and self.verbose == 1: 367 print(f'Train iter: {n}') 368 369 # Shuffle train 370 np.random.shuffle(X_norm) 371 372 X_train_sel = X_norm[ 373 :min(self.batch_size, self.n_samples_)].astype( 374 np.float32) 375 latent_noise = np.random.normal(0, 1, ( 376 X_train_sel.shape[0], self.latent_dim)) 377 X_train_sel += np.random.normal(0, noise_std, 378 size=X_train_sel.shape) 379 self.train_step((X_train_sel, latent_noise)) 380 381 if self.preprocessing: 382 X_norm = self.scaler_.transform(X) 383 else: 384 X_norm = np.copy(X) 385 386 pred_scores = self.get_outlier_scores(X_norm) 387 self.decision_scores_ = pred_scores 388 self._process_decision_scores() 389 return self 390 391 def train_more(self, X, epochs=100, noise_std=0.1): 392 """This function allows the researcher to perform extra training 393 instead of the fixed number determined 394 by the fit() function. 395 """ 396 # fit() should have been called first 397 check_is_fitted(self, ['decision_scores_']) 398 399 # Apply data scaling or not 400 if self.preprocessing: 401 X_norm = self.scaler_.transform(X) 402 else: 403 X_norm = np.copy(X) 404 405 for n in range(epochs): 406 if n % 50 == 0 and n != 0 and self.verbose == 1: 407 print(f'Train iter: {n}') 408 409 # Shuffle train 410 np.random.shuffle(X_norm) 411 412 X_train_sel = X_norm[ 413 :min(self.batch_size, self.n_samples_)].astype( 414 np.float32) 415 latent_noise = np.random.normal(0, 1, ( 416 X_train_sel.shape[0], self.latent_dim)) 417 X_train_sel += np.random.normal(0, noise_std, 418 size=X_train_sel.shape) 419 self.train_step((X_train_sel, latent_noise)) 420 421 if self.preprocessing: 422 X_norm = self.scaler_.transform(X) 423 else: 424 X_norm = np.copy(X) 425 426 pred_scores = self.get_outlier_scores(X_norm) 427 self.decision_scores_ = pred_scores 428 self._process_decision_scores() 429 return self 430 431 def get_outlier_scores(self, X_norm): 432 X_norm = torch.FloatTensor(X_norm).to(self.device) 433 X_enc = self.enc(X_norm).detach().cpu().numpy() 434 X_enc_gen = self.dec( 435 torch.FloatTensor(X_enc).to(self.device)).detach().cpu().numpy() 436 437 out_true_xx = self.disc_xx( 438 torch.cat((X_norm, X_norm), dim=1)).detach().cpu().numpy() 439 out_fake_xx = self.disc_xx( 440 torch.cat((X_norm, torch.FloatTensor(X_enc_gen).to(self.device)), 441 dim=1)).detach().cpu().numpy() 442 443 outlier_scores = np.mean(np.abs((out_true_xx - out_fake_xx) ** 2), 444 axis=1) 445 return outlier_scores 446 447 def decision_function(self, X): 448 """Predict raw anomaly score of X using the fitted detector. 449 The anomaly score of an input sample is computed based on different 450 detector algorithms. For consistency, outliers are assigned with 451 larger anomaly scores. 452 Parameters 453 ---------- 454 X : numpy array of shape (n_samples, n_features) 455 The training input samples. Sparse matrices are accepted only 456 if they are supported by the base estimator. 457 Returns 458 ------- 459 anomaly_scores : numpy array of shape (n_samples,) 460 The anomaly score of the input samples. 
461 """ 462 check_is_fitted(self, ['decision_scores_']) 463 X = check_array(X) 464 465 if self.preprocessing: 466 X_norm = self.scaler_.transform(X) 467 else: 468 X_norm = np.copy(X) 469 470 X_norm = torch.FloatTensor(X_norm).to(self.device) 471 pred_scores = self.get_outlier_scores(X_norm.cpu().numpy()) 472 return pred_scores 473 474 def plot_learning_curves(self, start_ind=0, window_smoothening=10): 475 fig = plt.figure(figsize=(12, 5)) 476 477 l_gen = pd.Series(self.hist_loss_gen[start_ind:]).rolling( 478 window=window_smoothening).mean() 479 l_disc = pd.Series(self.hist_loss_disc[start_ind:]).rolling( 480 window=window_smoothening).mean() 481 482 ax = fig.add_subplot(1, 2, 1) 483 ax.plot(range(len(l_gen)), l_gen) 484 ax.set_title('Generator') 485 ax.set_ylabel('Loss') 486 ax.set_xlabel('Iter') 487 488 ax = fig.add_subplot(1, 2, 2) 489 ax.plot(range(len(l_disc)), l_disc) 490 ax.set_title('Discriminator(s)') 491 ax.set_ylabel('Loss') 492 ax.set_xlabel('Iter') 493 494 plt.show()