# pyod/models/pyg_scan.py
  1  # -*- coding: utf-8 -*-
  2  """SCAN: Structural Clustering Algorithm for Networks.
  3  
  4  Detects anomalous nodes based on structural similarity to neighbors.
  5  Nodes with low average structural similarity to their neighbors are
  6  scored as anomalous. This is a structure-only method — node features
  7  are not used.
  8  
  9  See :cite:`xu2007scan` for details.
 10  
 11  Reference:
 12      Xu, X., Yuruk, N., Feng, Z. and Schweiger, T.A.J., 2007. SCAN:
 13      A Structural Clustering Algorithm for Networks. In KDD, pp. 824-833.
 14  """
 15  # Author: Yue Zhao <yzhao062@gmail.com>
 16  # License: BSD 2 clause
 17  
 18  import numpy as np
 19  from sklearn.utils.validation import check_is_fitted
 20  
 21  from .base import BaseDetector
 22  from ._pyg_utils import validate_graph_input
 23  
 24  
 25  class SCAN(BaseDetector):
 26      """SCAN: Structural Clustering Algorithm for Networks.
 27  
 28      Implements the full SCAN procedure: computes structural
 29      similarity ``sigma(u,v) = |N[u] ∩ N[v]| / sqrt(|N[u]| *
 30      |N[v]|)`` between neighbors, identifies cores (nodes with
 31      at least ``mu`` epsilon-neighbors), clusters cores via
 32      BFS reachability on epsilon-edges, and classifies remaining
 33      nodes as hubs (adjacent to 2+ clusters) or outliers.
 34  
 35      Continuous anomaly score is derived from this classification:
 36      cores receive low scores (inversely proportional to their
 37      epsilon-neighbor count), hubs receive medium scores, and
 38      outliers receive the highest scores.
 39  
 40      This detector is **transductive**.
 41  
 42      Parameters
 43      ----------
 44      epsilon : float, default=0.5
 45          Structural similarity threshold. An edge (u,v) is an
 46          epsilon-edge if ``sigma(u,v) >= epsilon``.
 47  
 48      mu : int, default=2
 49          Minimum number of epsilon-neighbors for a node to be
 50          a core.
 51  
 52      contamination : float, default=0.1
 53          Expected proportion of anomalies in the dataset.
 54  
 55      Attributes
 56      ----------
 57      decision_scores_ : numpy array of shape (n_nodes,)
 58          Anomaly scores. Higher = more anomalous.
 59  
 60      labels_ : numpy array of shape (n_nodes,)
 61          Binary labels (0=inlier, 1=outlier).
 62  
 63      threshold_ : float
 64          Score threshold derived from contamination.
 65  
 66      Examples
 67      --------
 68      >>> from torch_geometric.data import Data
 69      >>> import torch
 70      >>> data = Data(edge_index=torch.tensor([[0,1,1,2],[1,0,2,1]]),
 71      ...             num_nodes=3)
 72      >>> clf = SCAN(contamination=0.3)
 73      >>> clf.fit(data)  # doctest: +SKIP
 74      >>> clf.decision_scores_  # doctest: +SKIP
 75      """
 76  
 77      def __init__(self, epsilon=0.5, mu=2, contamination=0.1):
 78          super(SCAN, self).__init__(contamination=contamination)
 79          self.epsilon = epsilon
 80          self.mu = mu
 81  
 82      def fit(self, X, y=None, edge_index=None):
 83          """Fit the detector on graph data.
 84  
 85          Parameters
 86          ----------
 87          X : Data or array-like
 88              PyG Data object, or node features (n_nodes, n_features).
 89              For SCAN, node features are ignored — only structure is
 90              used. When passing a structure-only graph as Data, set
 91              ``data.num_nodes`` explicitly.
 92  
 93          y : ignored
 94  
 95          edge_index : array-like or None
 96              COO edge list. Required when X is numpy.
 97  
 98          Returns
 99          -------
100          self
101          """
102          data = validate_graph_input(X, edge_index)
103          n_nodes = data.num_nodes
104          self._set_n_classes(y)
105  
106          ei = data.edge_index.cpu().numpy()
107          scores = self._compute_scores(ei, n_nodes)
108  
109          self.decision_scores_ = scores
110          self._process_decision_scores()
111          return self
112  
113      def _compute_scores(self, edge_index, num_nodes):
114          """Full SCAN procedure: similarity, cores, clustering, scoring.
115  
116          Parameters
117          ----------
118          edge_index : np.ndarray of shape (2, n_edges)
119          num_nodes : int
120  
121          Returns
122          -------
123          scores : np.ndarray of shape (num_nodes,)
124          """
125          from scipy.sparse import csr_matrix, eye as sp_eye, diags
126          from scipy.sparse.csgraph import connected_components
127  
128          if edge_index.shape[1] == 0:
129              return np.ones(num_nodes)
130  
131          row, col = edge_index[0], edge_index[1]
132          adj = csr_matrix(
133              (np.ones(len(row), dtype=np.float64), (row, col)),
134              shape=(num_nodes, num_nodes))
135  
136          # --- Structural similarity ---
137          adj_self = adj + sp_eye(num_nodes, dtype=np.float64)
138          adj_self = (adj_self > 0).astype(np.float64)
139          deg_self = np.asarray(adj_self.sum(axis=1)).ravel()
140          intersection = adj_self.dot(adj_self.T)
141          inv_sqrt = np.zeros_like(deg_self)
142          nz = deg_self > 0
143          inv_sqrt[nz] = 1.0 / np.sqrt(deg_self[nz])
144          sim_matrix = diags(inv_sqrt).dot(intersection).dot(
145              diags(inv_sqrt))
146  
147          # --- Epsilon-neighborhood ---
148          sim_edges = sim_matrix.multiply(adj)
149          eps_adj = sim_edges.copy()
150          eps_adj.data[eps_adj.data < self.epsilon] = 0
151          eps_adj.eliminate_zeros()
152          n_eps = np.asarray(
153              (eps_adj > 0).sum(axis=1)).ravel().astype(int)
154  
155          # --- Core detection ---
156          is_core = n_eps >= self.mu
157  
158          # --- Cluster cores via connected components ---
159          core_mask = is_core.astype(np.float64)
160          core_graph = diags(core_mask).dot(eps_adj).dot(
161              diags(core_mask))
162          _, comp_labels = connected_components(
163              core_graph, directed=False)
164          cluster_of = np.where(is_core, comp_labels, -1)
165  
166          # --- Classify non-cores: hub vs outlier ---
167          node_deg = np.asarray(adj.sum(axis=1)).ravel()
168          sim_sum = np.asarray(sim_edges.sum(axis=1)).ravel()
169          mean_sim = np.where(
170              node_deg > 0, sim_sum / node_deg, 0.0)
171  
172          scores = np.zeros(num_nodes)
173          for u in range(num_nodes):
174              if is_core[u]:
175                  # Core: low score, inversely proportional to
176                  # excess epsilon-neighbors
177                  scores[u] = max(
178                      0.0, 1.0 - n_eps[u] / max(self.mu * 2, 1))
179              else:
180                  # Count adjacent clusters
181                  nbrs = adj[u].indices
182                  adj_clusters = set()
183                  for v in nbrs:
184                      if cluster_of[v] >= 0:
185                          adj_clusters.add(cluster_of[v])
186                  if len(adj_clusters) >= 2:
187                      # Hub: medium score
188                      scores[u] = 0.5 + 0.5 * (1.0 - mean_sim[u])
189                  else:
190                      # Outlier: high score
191                      scores[u] = 1.0 + (1.0 - mean_sim[u])
192  
193          return scores
194  
195      def decision_function(self, X):
196          """Not supported (transductive detector)."""
197          raise NotImplementedError(
198              "SCAN is a transductive detector. Use decision_scores_ "
199              "after fit().")
200  
201      def predict(self, X, return_confidence=False):
202          """Not supported (transductive detector)."""
203          raise NotImplementedError(
204              "SCAN is a transductive detector. Use labels_ after fit().")
205  
206      def predict_proba(self, X, method="linear", return_confidence=False):
207          """Not supported (transductive detector)."""
208          raise NotImplementedError(
209              "SCAN is a transductive detector.")
210  
211      def predict_confidence(self, X):
212          """Not supported (transductive detector)."""
213          raise NotImplementedError(
214              "SCAN is a transductive detector.")