/ pyod / models / pyg_cola.py
pyg_cola.py
  1  # -*- coding: utf-8 -*-
  2  """CoLA: Contrastive Self-Supervised Learning for Anomaly Detection.
  3  
  4  Contrasts each node's embedding against its local neighbor context
  5  (mean of neighbors' embeddings). Nodes whose embeddings are
  6  indistinguishable from shuffled-feature embeddings are anomalous.
  7  Multi-round scoring for robustness.
  8  
  9  See :cite:`liu2022cola` for details.
 10  
 11  Reference:
 12      Liu, Y., Li, Z., Pan, S., Gong, C., Zhou, C. and Karypis, G., 2022.
 13      Anomaly Detection on Attributed Networks via Contrastive
 14      Self-Supervised Learning. IEEE TNNLS, 33(6), pp. 2378-2392.
 15  """
 16  # Author: Yue Zhao <yzhao062@gmail.com>
 17  # License: BSD 2 clause
 18  
 19  import numpy as np
 20  from sklearn.utils.validation import check_is_fitted
 21  
 22  from .base import BaseDetector
 23  from ._pyg_utils import validate_graph_input
 24  
 25  
 26  class CoLA(BaseDetector):
 27      """CoLA: Contrastive Anomaly Detection on Attributed Networks.
 28  
 29      GCN encoder maps nodes to embeddings. A bilinear discriminator
 30      scores how well a node's embedding matches its local neighbor
 31      context (mean of neighbors' embeddings).
 32      Nodes with low discriminator scores are anomalous.
 33  
 34      This detector is **transductive**.
 35  
 36      Parameters
 37      ----------
 38      hidden_dim : int, default=64
 39          Hidden dimension of GCN.
 40  
 41      num_layers : int, default=2
 42          Number of GCN layers.
 43  
 44      epochs : int, default=100
 45          Training epochs.
 46  
 47      lr : float, default=1e-3
 48          Learning rate.
 49  
 50      contamination : float, default=0.1
 51          Expected proportion of anomalies.
 52  
 53      Attributes
 54      ----------
 55      decision_scores_ : numpy array of shape (n_nodes,)
 56      labels_ : numpy array of shape (n_nodes,)
 57      threshold_ : float
 58      """
 59  
 60      def __init__(self, hidden_dim=64, num_layers=2, epochs=100,
 61                   lr=1e-3, contamination=0.1):
 62          super(CoLA, self).__init__(contamination=contamination)
 63          self.hidden_dim = hidden_dim
 64          self.num_layers = num_layers
 65          self.epochs = epochs
 66          self.lr = lr
 67  
 68      def fit(self, X, y=None, edge_index=None):
 69          """Fit the detector on graph data.
 70  
 71          Parameters
 72          ----------
 73          X : Data or array-like
 74          y : ignored
 75          edge_index : array-like or None
 76  
 77          Returns
 78          -------
 79          self
 80          """
 81          import torch
 82          import torch.nn as nn
 83          import torch.nn.functional as F
 84          from torch_geometric.nn import GCNConv
 85  
 86          data = validate_graph_input(X, edge_index)
 87          n_nodes = data.num_nodes
 88          self._set_n_classes(y)
 89  
 90          if data.x is None:
 91              raise ValueError("CoLA requires node features (data.x).")
 92  
 93          in_dim = data.x.shape[1]
 94  
 95          model = _CoLAModel(in_dim, self.hidden_dim, self.num_layers)
 96          optimizer = torch.optim.Adam(model.parameters(), lr=self.lr)
 97  
 98          x = data.x
 99          ei = data.edge_index
100  
101          # Sparse row-normalized adjacency for local context
102          from torch_geometric.utils import degree
103          row_deg = degree(ei[0], num_nodes=n_nodes)
104          row_deg = row_deg.clamp(min=1)
105          edge_weight = 1.0 / row_deg[ei[0]]
106          adj_norm = torch.sparse_coo_tensor(
107              ei, edge_weight, (n_nodes, n_nodes)).coalesce()
108  
109          model.train()
110          for epoch in range(self.epochs):
111              z = model.encode(x, ei)
112  
113              # Local context: mean of neighbors' embeddings
114              local_ctx = torch.sparse.mm(adj_norm, z)  # (n, hid)
115  
116              # Positive: (node, local_context) pairs
117              pos_scores = model.discriminate(z, local_ctx)
118  
119              # Negative: shuffle features, re-encode
120              perm = torch.randperm(n_nodes)
121              z_neg = model.encode(x[perm], ei)
122              neg_scores = model.discriminate(z_neg, local_ctx)
123  
124              pos_loss = F.binary_cross_entropy_with_logits(
125                  pos_scores, torch.ones(n_nodes))
126              neg_loss = F.binary_cross_entropy_with_logits(
127                  neg_scores, torch.zeros(n_nodes))
128              loss = pos_loss + neg_loss
129  
130              optimizer.zero_grad()
131              loss.backward()
132              optimizer.step()
133  
134          # Multi-round scoring with dropout stochasticity
135          model.train()  # keep dropout active for stochasticity
136          all_scores = []
137          for _ in range(5):
138              with torch.no_grad():
139                  z = model.encode(x, ei)
140                  local_ctx = torch.sparse.mm(adj_norm, z)
141                  s = -model.discriminate(z, local_ctx)
142                  all_scores.append(s.cpu().numpy())
143          model.eval()
144          scores = torch.FloatTensor(np.mean(all_scores, axis=0))
145  
146          self.decision_scores_ = scores.cpu().numpy()
147          self._process_decision_scores()
148          return self
149  
150      def decision_function(self, X):
151          """Not supported (transductive detector)."""
152          raise NotImplementedError(
153              "CoLA is a transductive detector. Use decision_scores_ "
154              "after fit().")
155  
156      def predict(self, X, return_confidence=False):
157          """Not supported (transductive detector)."""
158          raise NotImplementedError(
159              "CoLA is a transductive detector. Use labels_ after fit().")
160  
161      def predict_proba(self, X, method="linear", return_confidence=False):
162          """Not supported (transductive detector)."""
163          raise NotImplementedError("CoLA is a transductive detector.")
164  
165      def predict_confidence(self, X):
166          """Not supported (transductive detector)."""
167          raise NotImplementedError("CoLA is a transductive detector.")
168  
169  
def _CoLAModel(in_dim, hid_dim, num_layers):
    """Build the CoLA network and return it as a ``torch.nn.Module``.

    The module bundles a stack of GCN layers (``encode``) with a
    bilinear discriminator (``discriminate``) that scores
    (node_embedding, local_neighbor_context) pairs. torch and
    torch_geometric are imported lazily, inside this factory.
    """
    import torch
    import torch.nn as nn
    from torch_geometric.nn import GCNConv

    class _Model(nn.Module):
        def __init__(self):
            super().__init__()
            # Input width of each layer: the first consumes raw features,
            # every further layer consumes hidden embeddings.
            widths = [in_dim] + [hid_dim] * (num_layers - 1)
            self.convs = nn.ModuleList(
                [GCNConv(width, hid_dim) for width in widths])
            self.drop = nn.Dropout(0.3)
            self.disc = nn.Bilinear(hid_dim, hid_dim, 1)

        def encode(self, x, edge_index):
            """Run the GCN stack; ReLU + dropout follow all but the last layer."""
            last = len(self.convs) - 1
            h = x
            for idx, layer in enumerate(self.convs):
                h = layer(h, edge_index)
                if idx == last:
                    # Final layer output is returned without activation.
                    continue
                h = self.drop(torch.relu(h))
            return h

        def discriminate(self, z, local_ctx):
            """Score (node_embedding, local_context) pairs."""
            logits = self.disc(z, local_ctx)
            return logits.squeeze(-1)

    return _Model()