# pyg_guide.py
# -*- coding: utf-8 -*-
"""GUIDE: Higher-order Structure Based Anomaly Detection.

Dual GCN autoencoders: one on the original adjacency, one on a
motif (triangle) adjacency. Anomaly score = weighted sum of
reconstruction errors from both views.

See :cite:`yuan2021guide` for details.

Reference:
    Yuan, X., Zhou, N., Yu, S., Huang, H., Chen, Z. and Xia, F.,
    2021. Higher-order Structure Based Anomaly Detection on Attributed
    Networks. In IEEE BigData, pp. 2691-2700.
"""
# Author: Yue Zhao <yzhao062@gmail.com>
# License: BSD 2 clause

import numpy as np
from sklearn.utils.validation import check_is_fitted

from .base import BaseDetector
from ._pyg_utils import validate_graph_input, to_sparse_adj


class GUIDE(BaseDetector):
    """GUIDE: Higher-order Structure Based Anomaly Detection.

    Constructs a motif adjacency from triangle participation
    (binarized in v1: edges in at least one triangle) and runs
    two GCN autoencoders in parallel. Score = ``alpha * err_orig
    + (1 - alpha) * err_motif``.

    This detector is **transductive**.

    Parameters
    ----------
    hidden_dim : int, default=64
        Hidden dimension of GCN layers.

    num_layers : int, default=2
        Number of GCN encoder layers.

    alpha : float, default=0.5
        Weight for original-graph reconstruction error.

    dropout : float, default=0.3
        Dropout rate.

    epochs : int, default=100
        Training epochs.

    lr : float, default=5e-3
        Learning rate.

    contamination : float, default=0.1
        Expected proportion of anomalies.

    Attributes
    ----------
    decision_scores_ : numpy array of shape (n_nodes,)
    labels_ : numpy array of shape (n_nodes,)
    threshold_ : float
    """

    def __init__(self, hidden_dim=64, num_layers=2, alpha=0.5,
                 dropout=0.3, epochs=100, lr=5e-3,
                 contamination=0.1):
        super(GUIDE, self).__init__(contamination=contamination)
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.alpha = alpha
        self.dropout = dropout
        self.epochs = epochs
        self.lr = lr

    @staticmethod
    def _per_node_error(target_adj, adj_hat, x, x_hat):
        """Per-node reconstruction error for one graph view.

        Squared structure error (row-wise, against the dense target
        adjacency) plus squared attribute error, summed per node.
        Returns a 1-D tensor of shape (n_nodes,).
        """
        import torch
        s_err = torch.sum((target_adj - adj_hat) ** 2, dim=1)
        a_err = torch.sum((x - x_hat) ** 2, dim=1)
        return s_err + a_err

    def fit(self, X, y=None, edge_index=None):
        """Fit the detector on graph data.

        Parameters
        ----------
        X : Data or array-like
            A PyG ``Data`` object (with ``x`` and ``edge_index``) or a
            feature matrix combined with the ``edge_index`` argument;
            interpretation is delegated to ``validate_graph_input``.
        y : ignored
            Present for API consistency; only forwarded to
            ``_set_n_classes``.
        edge_index : array-like or None
            Edge list when ``X`` is a plain feature matrix.

        Returns
        -------
        self
        """
        import torch
        # Imported here (not only inside _GUIDEModel) so a missing
        # torch_geometric install fails fast, before the potentially
        # expensive motif computation below.
        from torch_geometric.nn import GCNConv  # noqa: F401

        data = validate_graph_input(X, edge_index)
        n_nodes = data.num_nodes
        self._set_n_classes(y)

        if data.x is None:
            raise ValueError("GUIDE requires node features (data.x).")

        in_dim = data.x.shape[1]

        ei = data.edge_index
        ei_np = ei.cpu().numpy()

        # Motif adjacency: (A @ A) * A counts, for each existing edge,
        # the number of triangles that edge participates in.
        adj_sp = to_sparse_adj(ei_np, n_nodes)
        motif_adj = adj_sp.dot(adj_sp).multiply(adj_sp)
        motif_coo = motif_adj.tocoo()
        if motif_coo.nnz == 0:
            raise ValueError(
                "GUIDE requires higher-order structures (triangles) "
                "in the graph. This graph has no triangles. Use "
                "DOMINANT or CoLA instead.")
        ei_motif = torch.LongTensor(
            np.array([motif_coo.row, motif_coo.col]))

        x = data.x

        # Dense adjacencies serve as reconstruction targets for the
        # structure decoders.
        adj_dense = torch.zeros(n_nodes, n_nodes)
        adj_dense[ei[0], ei[1]] = 1.0

        # v1 binarization: an entry is 1 iff the edge lies in at least
        # one triangle (counts are discarded).
        motif_dense = torch.zeros(n_nodes, n_nodes)
        motif_dense[ei_motif[0], ei_motif[1]] = 1.0

        model = _GUIDEModel(
            in_dim, self.hidden_dim, self.num_layers, self.dropout)
        optimizer = torch.optim.Adam(model.parameters(), lr=self.lr)

        model.train()
        for _ in range(self.epochs):
            x_hat_o, x_hat_m, a_hat_o, a_hat_m = model(
                x, ei, ei_motif)

            err_orig = self._per_node_error(
                adj_dense, a_hat_o, x, x_hat_o)
            err_motif = self._per_node_error(
                motif_dense, a_hat_m, x, x_hat_m)

            loss = torch.mean(
                self.alpha * err_orig
                + (1 - self.alpha) * err_motif)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Final transductive scoring pass over the same graph.
        model.eval()
        with torch.no_grad():
            x_hat_o, x_hat_m, a_hat_o, a_hat_m = model(
                x, ei, ei_motif)

            err_orig = self._per_node_error(
                adj_dense, a_hat_o, x, x_hat_o)
            err_motif = self._per_node_error(
                motif_dense, a_hat_m, x, x_hat_m)

            scores = (self.alpha * err_orig
                      + (1 - self.alpha) * err_motif)

        self.decision_scores_ = scores.cpu().numpy()
        self._process_decision_scores()
        return self

    def decision_function(self, X):
        """Not supported (transductive detector)."""
        raise NotImplementedError(
            "GUIDE is a transductive detector. Use decision_scores_ "
            "after fit().")

    def predict(self, X, return_confidence=False):
        """Not supported (transductive detector)."""
        raise NotImplementedError(
            "GUIDE is a transductive detector. Use labels_ "
            "after fit().")

    def predict_proba(self, X, method="linear", return_confidence=False):
        """Not supported (transductive detector)."""
        raise NotImplementedError("GUIDE is a transductive detector.")

    def predict_confidence(self, X):
        """Not supported (transductive detector)."""
        raise NotImplementedError("GUIDE is a transductive detector.")


def _GUIDEModel(in_dim, hid_dim, num_layers, dropout):
    """Factory: returns torch.nn.Module for GUIDE dual AE.

    Defined as a factory so torch / torch_geometric are only imported
    when a model is actually built, keeping module import cheap.
    """
    import torch
    import torch.nn as nn
    from torch_geometric.nn import GCNConv

    class _Model(nn.Module):
        def __init__(self):
            super().__init__()
            # Original-graph encoder: in_dim -> hid_dim, then
            # (num_layers - 1) hid_dim -> hid_dim layers.
            self.enc_orig = nn.ModuleList()
            self.enc_orig.append(GCNConv(in_dim, hid_dim))
            for _ in range(num_layers - 1):
                self.enc_orig.append(GCNConv(hid_dim, hid_dim))

            # Motif-graph encoder, same architecture, separate weights.
            self.enc_motif = nn.ModuleList()
            self.enc_motif.append(GCNConv(in_dim, hid_dim))
            for _ in range(num_layers - 1):
                self.enc_motif.append(GCNConv(hid_dim, hid_dim))

            # Linear attribute decoders (one per view).
            self.dec_attr_orig = nn.Linear(hid_dim, in_dim)
            self.dec_attr_motif = nn.Linear(hid_dim, in_dim)
            self._dropout = dropout

        def _encode(self, x, edge_index, encoder):
            """Run GCN stack; ReLU + dropout between (not after) layers."""
            z = x
            for i, conv in enumerate(encoder):
                z = conv(z, edge_index)
                if i < len(encoder) - 1:
                    z = torch.relu(z)
                    z = torch.dropout(
                        z, p=self._dropout, train=self.training)
            return z

        def forward(self, x, ei_orig, ei_motif):
            z_o = self._encode(x, ei_orig, self.enc_orig)
            z_m = self._encode(x, ei_motif, self.enc_motif)

            x_hat_o = self.dec_attr_orig(z_o)
            x_hat_m = self.dec_attr_motif(z_m)

            # Structure decoders: inner-product + sigmoid.
            a_hat_o = torch.sigmoid(z_o @ z_o.t())
            a_hat_m = torch.sigmoid(z_m @ z_m.t())

            return x_hat_o, x_hat_m, a_hat_o, a_hat_m

    return _Model()