alad.py
# -*- coding: utf-8 -*-
"""Using Adversarially Learned Anomaly Detection (ALAD)
"""
# Author: Michiel Bongaerts (but not author of the ALAD method)
# PyTorch version author: Jiaqi Li <jli77629@usc.edu>


import numpy as np
import pandas as pd

try:
    import torch
except ImportError:
    print('please install torch first')

import torch
import torch.nn as nn
import torch.optim as optim
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted

from .base import BaseDetector
from ..utils.utility import check_parameter


class ALAD(BaseDetector):
    """Adversarially Learned Anomaly Detection (ALAD).
    Paper: https://arxiv.org/pdf/1812.02288.pdf

    See :cite:`zenati2018adversarially` for details.

    Parameters
    ----------
    output_activation : str, optional (default=None)
        Activation function to use for the output layers of the encoder
        and decoder.

    activation_hidden_disc : str, optional (default='tanh')
        Activation function to use for the hidden layers of the
        discriminators.

    activation_hidden_gen : str, optional (default='tanh')
        Activation function to use for the hidden layers of the encoder and
        decoder (i.e. the generator).

    epochs : int, optional (default=200)
        Number of epochs to train the model.

    batch_size : int, optional (default=32)
        Number of samples per gradient update.

    dropout_rate : float in (0., 1), optional (default=0.2)
        The dropout rate used across all layers.

    latent_dim : int, optional (default=2)
        Dimension of the latent space.

    dec_layers : list, optional (default=[5, 10, 25])
        Number of nodes per hidden layer of the decoder network.
        Thus, [10, 10] indicates 2 hidden layers with 10 nodes each.

    enc_layers : list, optional (default=[25, 10, 5])
        Number of nodes per hidden layer of the encoder network.
        Thus, [10, 10] indicates 2 hidden layers with 10 nodes each.

    disc_xx_layers : list, optional (default=[25, 10, 5])
        Number of nodes per hidden layer of discriminator_xx.
        Thus, [10, 10] indicates 2 hidden layers with 10 nodes each.

    disc_zz_layers : list, optional (default=[25, 10, 5])
        Number of nodes per hidden layer of discriminator_zz.
        Thus, [10, 10] indicates 2 hidden layers with 10 nodes each.

    disc_xz_layers : list, optional (default=[25, 10, 5])
        Number of nodes per hidden layer of discriminator_xz.
        Thus, [10, 10] indicates 2 hidden layers with 10 nodes each.

    learning_rate_gen : float in (0., 1), optional (default=0.0001)
        Learning rate for training the encoder and decoder.

    learning_rate_disc : float in (0., 1), optional (default=0.0001)
        Learning rate for training the discriminators.

    add_recon_loss : bool, optional (default=False)
        If True, add an extra loss for the encoder and decoder based on the
        reconstruction error.

    lambda_recon_loss : float in (0., 1), optional (default=0.1)
        If ``add_recon_loss=True``, the reconstruction loss gets multiplied
        by ``lambda_recon_loss`` and added to the total loss for the
        generator (i.e. encoder and decoder).

    add_disc_zz_loss : bool, optional (default=True)
        If True, include the discriminator_zz loss terms in both the
        discriminator and generator objectives.

    spectral_normalization : bool, optional (default=False)
        If True, apply spectral normalization to the discriminator layers
        (requires torch>=1.0.0).

    preprocessing : bool, optional (default=False)
        If True, apply standardization on the data.

    verbose : int, optional (default=0)
        Verbosity mode.

        - 0 = silent
        - 1 = print training progress every 50 epochs

    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set, i.e.
        the proportion of outliers in the data set. When fitting, this is
        used to define the threshold on the decision function.

    device : str or None, optional (default=None)
        The device to use for computation. If None, CUDA is used when
        available, otherwise the CPU. Possible values include 'cpu' and
        'cuda'.

    Attributes
    ----------
    decision_scores_ : numpy array of shape (n_samples,)
        The outlier scores of the training data [0,1].
        The higher, the more abnormal. Outliers tend to have higher
        scores. This value is available once the detector is
        fitted.

    threshold_ : float
        The threshold is based on ``contamination``. It is the
        ``n_samples * contamination`` most abnormal samples in
        ``decision_scores_``. The threshold is calculated for generating
        binary outlier labels.

    labels_ : int, either 0 or 1
        The binary labels of the training data. 0 stands for inliers
        and 1 for outliers/anomalies. It is generated by applying
        ``threshold_`` on ``decision_scores_``.
    """

    def __init__(self, activation_hidden_gen='tanh',
                 activation_hidden_disc='tanh',
                 output_activation=None,
                 dropout_rate=0.2,
                 latent_dim=2,
                 dec_layers=[5, 10, 25],
                 enc_layers=[25, 10, 5],
                 disc_xx_layers=[25, 10, 5],
                 disc_zz_layers=[25, 10, 5],
                 disc_xz_layers=[25, 10, 5],
                 learning_rate_gen=0.0001, learning_rate_disc=0.0001,
                 add_recon_loss=False, lambda_recon_loss=0.1,
                 epochs=200,
                 verbose=0,
                 preprocessing=False,
                 add_disc_zz_loss=True, spectral_normalization=False,
                 batch_size=32, contamination=0.1, device=None):
        super(ALAD, self).__init__(contamination=contamination)

        self.device = device if device else torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.activation_hidden_disc = activation_hidden_disc
        self.activation_hidden_gen = activation_hidden_gen
        self.output_activation = output_activation
        self.dropout_rate = dropout_rate
        self.latent_dim = latent_dim
        self.dec_layers = dec_layers
        self.enc_layers = enc_layers

        self.disc_xx_layers = disc_xx_layers
        self.disc_zz_layers = disc_zz_layers
        self.disc_xz_layers = disc_xz_layers

        self.add_recon_loss = add_recon_loss
        self.lambda_recon_loss = lambda_recon_loss
        self.add_disc_zz_loss = add_disc_zz_loss

        self.contamination = contamination
        self.epochs = epochs
        self.learning_rate_gen = learning_rate_gen
        self.learning_rate_disc = learning_rate_disc
        self.preprocessing = preprocessing
        self.batch_size = batch_size
        self.verbose = verbose
        self.spectral_normalization = spectral_normalization

        if self.spectral_normalization:
            try:
                # ``spectral_norm`` is a function, so it must be imported
                # with ``from ... import``, not ``import ... as``.
                from torch.nn.utils import spectral_norm
                self.spectral_norm = spectral_norm
            except ImportError:
                print('Spectral normalization not available. '
                      'Install torch>=1.0.0.')
                self.spectral_normalization = False

        check_parameter(dropout_rate, 0, 1, param_name='dropout_rate',
                        include_left=True)

    def _build_model(self):
        def get_activation(name):
            if name == 'tanh':
                return nn.Tanh()
            elif name == 'sigmoid':
                return nn.Sigmoid()
            elif name == 'relu':
                return nn.ReLU()
            else:
                raise ValueError(
                    "Unsupported activation function: {}".format(name))

        # Create the decoder: latent_dim -> hidden layers -> n_features_
        dec_layers = []
        input_dim = self.latent_dim
        for l_dim in self.dec_layers:
            dec_layers.append(nn.Linear(input_dim, l_dim))
            dec_layers.append(nn.Dropout(self.dropout_rate))
            dec_layers.append(get_activation(self.activation_hidden_gen))
            input_dim = l_dim
        dec_layers.append(nn.Linear(input_dim, self.n_features_))
        if self.output_activation:
            dec_layers.append(get_activation(self.output_activation))
        self.dec = nn.Sequential(*dec_layers).to(self.device)

        # Create the encoder: n_features_ -> hidden layers -> latent_dim
        enc_layers = []
        input_dim = self.n_features_
        for l_dim in self.enc_layers:
            enc_layers.append(nn.Linear(input_dim, l_dim))
            enc_layers.append(nn.Dropout(self.dropout_rate))
            enc_layers.append(get_activation(self.activation_hidden_gen))
            input_dim = l_dim
        enc_layers.append(nn.Linear(input_dim, self.latent_dim))
        if self.output_activation:
            enc_layers.append(get_activation(self.output_activation))
        self.enc = nn.Sequential(*enc_layers).to(self.device)

        # Create the discriminators
        def create_discriminator(layers, input_dim):
            disc_layers = []
            for l_dim in layers:
                disc_layers.append(nn.Linear(input_dim, l_dim))
                if self.spectral_normalization:
                    disc_layers[-1] = nn.utils.spectral_norm(disc_layers[-1])
                disc_layers.append(nn.Dropout(self.dropout_rate))
                disc_layers.append(
                    get_activation(self.activation_hidden_disc))
                input_dim = l_dim
            disc_layers.append(nn.Linear(input_dim, 1))
            disc_layers.append(nn.Sigmoid())
            return nn.Sequential(*disc_layers).to(self.device)

        self.disc_xx = create_discriminator(self.disc_xx_layers,
                                            2 * self.n_features_)
        self.disc_zz = create_discriminator(self.disc_zz_layers,
                                            2 * self.latent_dim)
        self.disc_xz = create_discriminator(self.disc_xz_layers,
                                            self.n_features_ +
                                            self.latent_dim)

        # Optimizers
        self.opt_gen = optim.Adam(
            list(self.enc.parameters()) + list(self.dec.parameters()),
            lr=self.learning_rate_gen)
        self.opt_disc = optim.Adam(
            list(self.disc_xx.parameters()) +
            list(self.disc_xz.parameters()) +
            list(self.disc_zz.parameters()),
            lr=self.learning_rate_disc)

        self.hist_loss_disc = []
        self.hist_loss_gen = []

    def train_step(self, data):
        x_real, z_real = data

        x_real = torch.FloatTensor(x_real).to(self.device)
        z_real = torch.FloatTensor(z_real).to(self.device)

        bce = nn.BCELoss()

        # --- Discriminator update ---
        self.opt_disc.zero_grad()
        x_gen = self.dec(z_real)
        z_gen = self.enc(x_real)

        out_true_xz = self.disc_xz(torch.cat((x_real, z_gen), dim=1))
        out_fake_xz = self.disc_xz(torch.cat((x_gen, z_real), dim=1))

        out_true_xx = self.disc_xx(torch.cat((x_real, x_real), dim=1))
        out_fake_xx = self.disc_xx(torch.cat((x_real, x_gen), dim=1))

        loss_dxz = (bce(out_true_xz, torch.ones_like(out_true_xz)) +
                    bce(out_fake_xz, torch.zeros_like(out_fake_xz)))
        loss_dxx = (bce(out_true_xx, torch.ones_like(out_true_xx)) +
                    bce(out_fake_xx, torch.zeros_like(out_fake_xx)))

        if self.add_disc_zz_loss:
            out_true_zz = self.disc_zz(torch.cat((z_real, z_real), dim=1))
            out_fake_zz = self.disc_zz(torch.cat((z_real, z_gen), dim=1))
            loss_dzz = (bce(out_true_zz, torch.ones_like(out_true_zz)) +
                        bce(out_fake_zz, torch.zeros_like(out_fake_zz)))
            loss_disc = loss_dxz + loss_dzz + loss_dxx
        else:
            loss_disc = loss_dxz + loss_dxx

        loss_disc.backward()
        self.opt_disc.step()

        # --- Generator (encoder + decoder) update ---
        self.opt_gen.zero_grad()
        x_gen = self.dec(z_real)
        z_gen = self.enc(x_real)

        out_true_xz = self.disc_xz(torch.cat((x_real, z_gen), dim=1))
        out_fake_xz = self.disc_xz(torch.cat((x_gen, z_real), dim=1))

        out_true_xx = self.disc_xx(torch.cat((x_real, x_real), dim=1))
        out_fake_xx = self.disc_xx(torch.cat((x_real, x_gen), dim=1))

        loss_gexz = (bce(out_fake_xz, torch.ones_like(out_fake_xz)) +
                     bce(out_true_xz, torch.zeros_like(out_true_xz)))
        loss_gexx = (bce(out_fake_xx, torch.ones_like(out_fake_xx)) +
                     bce(out_true_xx, torch.zeros_like(out_true_xx)))

        if self.add_disc_zz_loss:
            out_true_zz = self.disc_zz(torch.cat((z_real, z_real), dim=1))
            out_fake_zz = self.disc_zz(torch.cat((z_real, z_gen), dim=1))
            loss_gezz = (bce(out_fake_zz, torch.ones_like(out_fake_zz)) +
                         bce(out_true_zz, torch.zeros_like(out_true_zz)))
            cycle_consistency = loss_gezz + loss_gexx
            loss_gen = loss_gexz + cycle_consistency
        else:
            cycle_consistency = loss_gexx
            loss_gen = loss_gexz + cycle_consistency

        if self.add_recon_loss:
            # Optional extra reconstruction loss for the generator
            x_recon = self.dec(self.enc(x_real))
            loss_recon = torch.mean((x_real - x_recon) ** 2)
            loss_gen += loss_recon * self.lambda_recon_loss

        loss_gen.backward()
        self.opt_gen.step()

        self.hist_loss_disc.append(loss_disc.item())
        self.hist_loss_gen.append(loss_gen.item())

    def fit(self, X, y=None, noise_std=0.1):
        """Fit detector. y is ignored in unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : Ignored
            Not used, present for API consistency by convention.

        noise_std : float, optional (default=0.1)
            Standard deviation of the Gaussian noise added to the training
            samples in each iteration.

        Returns
        -------
        self : object
            Fitted estimator.
349 """ 350 # validate inputs X and y (optional) 351 X = check_array(X) 352 self._set_n_classes(y) 353 354 # Get number of sampels and features from train set 355 self.n_samples_, self.n_features_ = X.shape[0], X.shape[1] 356 self._build_model() 357 358 # Apply data scaling or not 359 if self.preprocessing: 360 self.scaler_ = StandardScaler() 361 X_norm = self.scaler_.fit_transform(X) 362 else: 363 X_norm = np.copy(X) 364 365 for n in range(self.epochs): 366 if n % 50 == 0 and n != 0 and self.verbose == 1: 367 print(f'Train iter: {n}') 368 369 # Shuffle train 370 np.random.shuffle(X_norm) 371 372 X_train_sel = X_norm[ 373 :min(self.batch_size, self.n_samples_)].astype( 374 np.float32) 375 latent_noise = np.random.normal(0, 1, ( 376 X_train_sel.shape[0], self.latent_dim)) 377 X_train_sel += np.random.normal(0, noise_std, 378 size=X_train_sel.shape) 379 self.train_step((X_train_sel, latent_noise)) 380 381 if self.preprocessing: 382 X_norm = self.scaler_.transform(X) 383 else: 384 X_norm = np.copy(X) 385 386 pred_scores = self.get_outlier_scores(X_norm) 387 self.decision_scores_ = pred_scores 388 self._process_decision_scores() 389 return self 390 391 def train_more(self, X, epochs=100, noise_std=0.1): 392 """This function allows the researcher to perform extra training 393 instead of the fixed number determined 394 by the fit() function. 395 """ 396 # fit() should have been called first 397 check_is_fitted(self, ['decision_scores_']) 398 399 # Apply data scaling or not 400 if self.preprocessing: 401 X_norm = self.scaler_.transform(X) 402 else: 403 X_norm = np.copy(X) 404 405 for n in range(epochs): 406 if n % 50 == 0 and n != 0 and self.verbose == 1: 407 print(f'Train iter: {n}') 408 409 # Shuffle train 410 np.random.shuffle(X_norm) 411 412 X_train_sel = X_norm[ 413 :min(self.batch_size, self.n_samples_)].astype( 414 np.float32) 415 latent_noise = np.random.normal(0, 1, ( 416 X_train_sel.shape[0], self.latent_dim)) 417 X_train_sel += np.random.normal(0, noise_std, 418 size=X_train_sel.shape) 419 self.train_step((X_train_sel, latent_noise)) 420 421 if self.preprocessing: 422 X_norm = self.scaler_.transform(X) 423 else: 424 X_norm = np.copy(X) 425 426 pred_scores = self.get_outlier_scores(X_norm) 427 self.decision_scores_ = pred_scores 428 self._process_decision_scores() 429 return self 430 431 def get_outlier_scores(self, X_norm): 432 X_norm = torch.FloatTensor(X_norm).to(self.device) 433 X_enc = self.enc(X_norm).detach().cpu().numpy() 434 X_enc_gen = self.dec( 435 torch.FloatTensor(X_enc).to(self.device)).detach().cpu().numpy() 436 437 out_true_xx = self.disc_xx( 438 torch.cat((X_norm, X_norm), dim=1)).detach().cpu().numpy() 439 out_fake_xx = self.disc_xx( 440 torch.cat((X_norm, torch.FloatTensor(X_enc_gen).to(self.device)), 441 dim=1)).detach().cpu().numpy() 442 443 outlier_scores = np.mean(np.abs((out_true_xx - out_fake_xx) ** 2), 444 axis=1) 445 return outlier_scores 446 447 def decision_function(self, X): 448 """Predict raw anomaly score of X using the fitted detector. 449 The anomaly score of an input sample is computed based on different 450 detector algorithms. For consistency, outliers are assigned with 451 larger anomaly scores. 452 Parameters 453 ---------- 454 X : numpy array of shape (n_samples, n_features) 455 The training input samples. Sparse matrices are accepted only 456 if they are supported by the base estimator. 457 Returns 458 ------- 459 anomaly_scores : numpy array of shape (n_samples,) 460 The anomaly score of the input samples. 
461 """ 462 check_is_fitted(self, ['decision_scores_']) 463 X = check_array(X) 464 465 if self.preprocessing: 466 X_norm = self.scaler_.transform(X) 467 else: 468 X_norm = np.copy(X) 469 470 X_norm = torch.FloatTensor(X_norm).to(self.device) 471 pred_scores = self.get_outlier_scores(X_norm.cpu().numpy()) 472 return pred_scores 473 474 def plot_learning_curves(self, start_ind=0, window_smoothening=10): 475 fig = plt.figure(figsize=(12, 5)) 476 477 l_gen = pd.Series(self.hist_loss_gen[start_ind:]).rolling( 478 window=window_smoothening).mean() 479 l_disc = pd.Series(self.hist_loss_disc[start_ind:]).rolling( 480 window=window_smoothening).mean() 481 482 ax = fig.add_subplot(1, 2, 1) 483 ax.plot(range(len(l_gen)), l_gen) 484 ax.set_title('Generator') 485 ax.set_ylabel('Loss') 486 ax.set_xlabel('Iter') 487 488 ax = fig.add_subplot(1, 2, 2) 489 ax.plot(range(len(l_disc)), l_disc) 490 ax.set_title('Discriminator(s)') 491 ax.set_ylabel('Loss') 492 ax.set_xlabel('Iter') 493 494 plt.show()