# pyod/models/pyg_guide.py
  1  # -*- coding: utf-8 -*-
  2  """GUIDE: Higher-order Structure Based Anomaly Detection.
  3  
  4  Dual GCN autoencoders: one on the original adjacency, one on a
  5  motif (triangle) adjacency. Anomaly score = weighted sum of
  6  reconstruction errors from both views.
  7  
  8  See :cite:`yuan2021guide` for details.
  9  
 10  Reference:
 11      Yuan, X., Zhou, N., Yu, S., Huang, H., Chen, Z. and Xia, F.,
 12      2021. Higher-order Structure Based Anomaly Detection on Attributed
 13      Networks. In IEEE BigData, pp. 2691-2700.
 14  """
 15  # Author: Yue Zhao <yzhao062@gmail.com>
 16  # License: BSD 2 clause
 17  
 18  import numpy as np
 19  from sklearn.utils.validation import check_is_fitted
 20  
 21  from .base import BaseDetector
 22  from ._pyg_utils import validate_graph_input, to_sparse_adj
 23  
 24  
 25  class GUIDE(BaseDetector):
 26      """GUIDE: Higher-order Structure Based Anomaly Detection.
 27  
 28      Constructs a motif adjacency from triangle participation
 29      (binarized in v1: edges in at least one triangle) and runs
 30      two GCN autoencoders in parallel. Score = ``alpha * err_orig
 31      + (1 - alpha) * err_motif``.
 32  
 33      This detector is **transductive**.
 34  
 35      Parameters
 36      ----------
 37      hidden_dim : int, default=64
 38          Hidden dimension of GCN layers.
 39  
 40      num_layers : int, default=2
 41          Number of GCN encoder layers.
 42  
 43      alpha : float, default=0.5
 44          Weight for original-graph reconstruction error.
 45  
 46      dropout : float, default=0.3
 47          Dropout rate.
 48  
 49      epochs : int, default=100
 50          Training epochs.
 51  
 52      lr : float, default=5e-3
 53          Learning rate.
 54  
 55      contamination : float, default=0.1
 56          Expected proportion of anomalies.
 57  
 58      Attributes
 59      ----------
 60      decision_scores_ : numpy array of shape (n_nodes,)
 61      labels_ : numpy array of shape (n_nodes,)
 62      threshold_ : float
 63      """
 64  
 65      def __init__(self, hidden_dim=64, num_layers=2, alpha=0.5,
 66                   dropout=0.3, epochs=100, lr=5e-3,
 67                   contamination=0.1):
 68          super(GUIDE, self).__init__(contamination=contamination)
 69          self.hidden_dim = hidden_dim
 70          self.num_layers = num_layers
 71          self.alpha = alpha
 72          self.dropout = dropout
 73          self.epochs = epochs
 74          self.lr = lr
 75  
 76      def fit(self, X, y=None, edge_index=None):
 77          """Fit the detector on graph data.
 78  
 79          Parameters
 80          ----------
 81          X : Data or array-like
 82          y : ignored
 83          edge_index : array-like or None
 84  
 85          Returns
 86          -------
 87          self
 88          """
 89          import torch
 90          import torch.nn as nn
 91          from torch_geometric.nn import GCNConv
 92  
 93          data = validate_graph_input(X, edge_index)
 94          n_nodes = data.num_nodes
 95          self._set_n_classes(y)
 96  
 97          if data.x is None:
 98              raise ValueError("GUIDE requires node features (data.x).")
 99  
100          in_dim = data.x.shape[1]
101  
102          ei = data.edge_index
103          ei_np = ei.cpu().numpy()
104  
105          # Build motif adjacency (triangle counts per edge)
106          adj_sp = to_sparse_adj(ei_np, n_nodes)
107          motif_adj = adj_sp.dot(adj_sp).multiply(adj_sp)
108          motif_coo = motif_adj.tocoo()
109          if motif_coo.nnz == 0:
110              raise ValueError(
111                  "GUIDE requires higher-order structures (triangles) "
112                  "in the graph. This graph has no triangles. Use "
113                  "DOMINANT or CoLA instead.")
114          ei_motif = torch.LongTensor(
115              np.array([motif_coo.row, motif_coo.col]))
116  
117          x = data.x
118  
119          # Dense adjacencies for loss
120          adj_dense = torch.zeros(n_nodes, n_nodes)
121          adj_dense[ei[0], ei[1]] = 1.0
122  
123          motif_dense = torch.zeros(n_nodes, n_nodes)
124          motif_dense[ei_motif[0], ei_motif[1]] = 1.0
125  
126          model = _GUIDEModel(
127              in_dim, self.hidden_dim, self.num_layers, self.dropout)
128          optimizer = torch.optim.Adam(model.parameters(), lr=self.lr)
129  
130          model.train()
131          for epoch in range(self.epochs):
132              x_hat_o, x_hat_m, a_hat_o, a_hat_m = model(
133                  x, ei, ei_motif)
134  
135              # Original graph errors
136              s_err_o = torch.sum(
137                  (adj_dense - a_hat_o) ** 2, dim=1)
138              a_err_o = torch.sum((x - x_hat_o) ** 2, dim=1)
139              err_orig = s_err_o + a_err_o
140  
141              # Motif graph errors
142              s_err_m = torch.sum(
143                  (motif_dense - a_hat_m) ** 2, dim=1)
144              a_err_m = torch.sum((x - x_hat_m) ** 2, dim=1)
145              err_motif = s_err_m + a_err_m
146  
147              loss = torch.mean(
148                  self.alpha * err_orig
149                  + (1 - self.alpha) * err_motif)
150  
151              optimizer.zero_grad()
152              loss.backward()
153              optimizer.step()
154  
155          model.eval()
156          with torch.no_grad():
157              x_hat_o, x_hat_m, a_hat_o, a_hat_m = model(
158                  x, ei, ei_motif)
159  
160              s_err_o = torch.sum(
161                  (adj_dense - a_hat_o) ** 2, dim=1)
162              a_err_o = torch.sum((x - x_hat_o) ** 2, dim=1)
163              err_orig = s_err_o + a_err_o
164  
165              s_err_m = torch.sum(
166                  (motif_dense - a_hat_m) ** 2, dim=1)
167              a_err_m = torch.sum((x - x_hat_m) ** 2, dim=1)
168              err_motif = s_err_m + a_err_m
169  
170              scores = (self.alpha * err_orig
171                        + (1 - self.alpha) * err_motif)
172  
173          self.decision_scores_ = scores.cpu().numpy()
174          self._process_decision_scores()
175          return self
176  
177      def decision_function(self, X):
178          """Not supported (transductive detector)."""
179          raise NotImplementedError(
180              "GUIDE is a transductive detector. Use decision_scores_ "
181              "after fit().")
182  
183      def predict(self, X, return_confidence=False):
184          """Not supported (transductive detector)."""
185          raise NotImplementedError(
186              "GUIDE is a transductive detector. Use labels_ "
187              "after fit().")
188  
189      def predict_proba(self, X, method="linear", return_confidence=False):
190          """Not supported (transductive detector)."""
191          raise NotImplementedError("GUIDE is a transductive detector.")
192  
193      def predict_confidence(self, X):
194          """Not supported (transductive detector)."""
195          raise NotImplementedError("GUIDE is a transductive detector.")
196  
197  
def _GUIDEModel(in_dim, hid_dim, num_layers, dropout):
    """Factory: returns torch.nn.Module for GUIDE dual AE."""
    import torch
    import torch.nn as nn
    from torch_geometric.nn import GCNConv

    def _make_encoder():
        # First conv maps features to hidden; remaining convs are
        # hidden-to-hidden. At least one layer is always created.
        dims = [in_dim] + [hid_dim] * max(num_layers, 1)
        return nn.ModuleList(
            GCNConv(d_in, d_out)
            for d_in, d_out in zip(dims[:-1], dims[1:]))

    class _Model(nn.Module):
        """Two parallel GCN autoencoders sharing the input features:
        one over the original edges, one over the motif edges."""

        def __init__(self):
            super().__init__()
            self.enc_orig = _make_encoder()
            self.enc_motif = _make_encoder()
            # Attribute decoders (one per view).
            self.dec_attr_orig = nn.Linear(hid_dim, in_dim)
            self.dec_attr_motif = nn.Linear(hid_dim, in_dim)
            self._dropout = dropout

        def _encode(self, x, edge_index, encoder):
            # ReLU + dropout between layers, but not after the last one.
            h = x
            last = len(encoder) - 1
            for depth, layer in enumerate(encoder):
                h = layer(h, edge_index)
                if depth != last:
                    h = torch.dropout(torch.relu(h),
                                      p=self._dropout,
                                      train=self.training)
            return h

        def forward(self, x, ei_orig, ei_motif):
            z_o = self._encode(x, ei_orig, self.enc_orig)
            z_m = self._encode(x, ei_motif, self.enc_motif)
            # Structure is decoded as a sigmoid inner-product score.
            return (self.dec_attr_orig(z_o),
                    self.dec_attr_motif(z_m),
                    torch.sigmoid(z_o @ z_o.t()),
                    torch.sigmoid(z_m @ z_m.t()))

    return _Model()