pyg_scan.py
# -*- coding: utf-8 -*-
"""SCAN: Structural Clustering Algorithm for Networks.

Detects anomalous nodes based on structural similarity to neighbors.
Nodes with low average structural similarity to their neighbors are
scored as anomalous. This is a structure-only method — node features
are not used.

See :cite:`xu2007scan` for details.

Reference:
    Xu, X., Yuruk, N., Feng, Z. and Schweiger, T.A.J., 2007. SCAN:
    A Structural Clustering Algorithm for Networks. In KDD, pp. 824-833.
"""
# Author: Yue Zhao <yzhao062@gmail.com>
# License: BSD 2 clause

import numpy as np
from sklearn.utils.validation import check_is_fitted

from .base import BaseDetector
from ._pyg_utils import validate_graph_input


class SCAN(BaseDetector):
    """SCAN: Structural Clustering Algorithm for Networks.

    Implements the full SCAN procedure: computes structural
    similarity ``sigma(u,v) = |N[u] ∩ N[v]| / sqrt(|N[u]| *
    |N[v]|)`` between neighbors, identifies cores (nodes with
    at least ``mu`` epsilon-neighbors), clusters cores via
    BFS reachability on epsilon-edges, and classifies remaining
    nodes as hubs (adjacent to 2+ clusters) or outliers.

    Continuous anomaly score is derived from this classification:
    cores receive low scores (inversely proportional to their
    epsilon-neighbor count), hubs receive medium scores, and
    outliers receive the highest scores.

    This detector is **transductive**.

    Parameters
    ----------
    epsilon : float, default=0.5
        Structural similarity threshold. An edge (u,v) is an
        epsilon-edge if ``sigma(u,v) >= epsilon``.

    mu : int, default=2
        Minimum number of epsilon-neighbors for a node to be
        a core.

    contamination : float, default=0.1
        Expected proportion of anomalies in the dataset.

    Attributes
    ----------
    decision_scores_ : numpy array of shape (n_nodes,)
        Anomaly scores. Higher = more anomalous.

    labels_ : numpy array of shape (n_nodes,)
        Binary labels (0=inlier, 1=outlier).

    threshold_ : float
        Score threshold derived from contamination.

    Examples
    --------
    >>> from torch_geometric.data import Data
    >>> import torch
    >>> data = Data(edge_index=torch.tensor([[0,1,1,2],[1,0,2,1]]),
    ...             num_nodes=3)
    >>> clf = SCAN(contamination=0.3)
    >>> clf.fit(data)  # doctest: +SKIP
    >>> clf.decision_scores_  # doctest: +SKIP
    """

    def __init__(self, epsilon=0.5, mu=2, contamination=0.1):
        super(SCAN, self).__init__(contamination=contamination)
        self.epsilon = epsilon
        self.mu = mu

    def fit(self, X, y=None, edge_index=None):
        """Fit the detector on graph data.

        Parameters
        ----------
        X : Data or array-like
            PyG Data object, or node features (n_nodes, n_features).
            For SCAN, node features are ignored — only structure is
            used. When passing a structure-only graph as Data, set
            ``data.num_nodes`` explicitly.

        y : ignored

        edge_index : array-like or None
            COO edge list. Required when X is numpy.

        Returns
        -------
        self
        """
        data = validate_graph_input(X, edge_index)
        n_nodes = data.num_nodes
        self._set_n_classes(y)

        # Structure only: the edge list is all SCAN needs.
        ei = data.edge_index.cpu().numpy()
        scores = self._compute_scores(ei, n_nodes)

        self.decision_scores_ = scores
        self._process_decision_scores()
        return self

    def _compute_scores(self, edge_index, num_nodes):
        """Full SCAN procedure: similarity, cores, clustering, scoring.

        Parameters
        ----------
        edge_index : np.ndarray of shape (2, n_edges)
        num_nodes : int

        Returns
        -------
        scores : np.ndarray of shape (num_nodes,)
        """
        from scipy.sparse import csr_matrix, eye as sp_eye, diags
        from scipy.sparse.csgraph import connected_components

        if edge_index.shape[1] == 0:
            # No edges at all: every node is an isolated outlier.
            return np.ones(num_nodes)

        row, col = edge_index[0], edge_index[1]
        adj = csr_matrix(
            (np.ones(len(row), dtype=np.float64), (row, col)),
            shape=(num_nodes, num_nodes))
        # csr_matrix sums duplicate (row, col) coordinates, so repeated
        # edges in edge_index would otherwise inflate degrees and
        # similarity values. Binarize to make the adjacency 0/1.
        adj = (adj > 0).astype(np.float64)

        # --- Structural similarity ---
        # sigma(u,v) = |N[u] ∩ N[v]| / sqrt(|N[u]| * |N[v]|) over
        # closed neighborhoods (self-loops added, then re-binarized).
        adj_self = adj + sp_eye(num_nodes, dtype=np.float64)
        adj_self = (adj_self > 0).astype(np.float64)
        deg_self = np.asarray(adj_self.sum(axis=1)).ravel()
        intersection = adj_self.dot(adj_self.T)
        inv_sqrt = np.zeros_like(deg_self)
        nz = deg_self > 0
        inv_sqrt[nz] = 1.0 / np.sqrt(deg_self[nz])
        sim_matrix = diags(inv_sqrt).dot(intersection).dot(
            diags(inv_sqrt))

        # --- Epsilon-neighborhood ---
        # Restrict similarity to actual edges. Force CSR: ``multiply``
        # may return a COO matrix depending on the SciPy version, and
        # the in-place ``.data`` masking below requires CSR semantics.
        sim_edges = sim_matrix.multiply(adj).tocsr()
        eps_adj = sim_edges.copy()
        eps_adj.data[eps_adj.data < self.epsilon] = 0
        eps_adj.eliminate_zeros()
        n_eps = np.asarray(
            (eps_adj > 0).sum(axis=1)).ravel().astype(int)

        # --- Core detection ---
        is_core = n_eps >= self.mu

        # --- Cluster cores via connected components ---
        # Zero out rows/cols of non-cores so components are computed
        # on the core-core epsilon-edges only.
        core_mask = is_core.astype(np.float64)
        core_graph = diags(core_mask).dot(eps_adj).dot(
            diags(core_mask))
        _, comp_labels = connected_components(
            core_graph, directed=False)
        cluster_of = np.where(is_core, comp_labels, -1)

        # --- Classify non-cores: hub vs outlier ---
        node_deg = np.asarray(adj.sum(axis=1)).ravel()
        sim_sum = np.asarray(sim_edges.sum(axis=1)).ravel()
        mean_sim = np.where(
            node_deg > 0, sim_sum / node_deg, 0.0)

        # Direct CSR slicing: adj.indices[indptr[u]:indptr[u+1]] is the
        # neighbor list of u without materializing a 1xN sparse row.
        indptr, indices = adj.indptr, adj.indices
        scores = np.zeros(num_nodes)
        for u in range(num_nodes):
            if is_core[u]:
                # Core: low score, inversely proportional to
                # excess epsilon-neighbors
                scores[u] = max(
                    0.0, 1.0 - n_eps[u] / max(self.mu * 2, 1))
            else:
                # Count distinct clusters among neighbors.
                nbrs = indices[indptr[u]:indptr[u + 1]]
                adj_clusters = {cluster_of[v] for v in nbrs
                                if cluster_of[v] >= 0}
                if len(adj_clusters) >= 2:
                    # Hub: medium score
                    scores[u] = 0.5 + 0.5 * (1.0 - mean_sim[u])
                else:
                    # Outlier: high score
                    scores[u] = 1.0 + (1.0 - mean_sim[u])

        return scores

    def decision_function(self, X):
        """Not supported (transductive detector)."""
        raise NotImplementedError(
            "SCAN is a transductive detector. Use decision_scores_ "
            "after fit().")

    def predict(self, X, return_confidence=False):
        """Not supported (transductive detector)."""
        raise NotImplementedError(
            "SCAN is a transductive detector. Use labels_ after fit().")

    def predict_proba(self, X, method="linear", return_confidence=False):
        """Not supported (transductive detector)."""
        raise NotImplementedError(
            "SCAN is a transductive detector.")

    def predict_confidence(self, X):
        """Not supported (transductive detector)."""
        raise NotImplementedError(
            "SCAN is a transductive detector.")