# core/sharing/fingerprint.py
  1  """
  2  Pattern Fingerprint System
  3  
  4  Extracts structural signatures from graph topology without accessing content.
  5  Enables similarity detection, federated learning, and flock coordination.
  6  
  7  Architecture:
  8  - Graphs are analyzed for topological features only
  9  - Content (node labels, edge labels) is NEVER accessed
 10  - Fingerprints enable:
 11    - Similarity clustering across the flock
 12    - Federated learning contribution
 13    - Resonance beacon matching
 14    - Aggregate pattern detection
 15  
 16  Privacy Guarantee:
 17  - Fingerprints cannot be reversed to recover content
 18  - No identifying information is included
 19  - Individual fingerprints are never shared - only aggregates
 20  """
 21  
 22  from dataclasses import dataclass, field
 23  from typing import List, Dict, Optional, Set, Tuple
 24  from enum import Enum
 25  import hashlib
 26  import json
 27  import math
 28  from collections import defaultdict
 29  
 30  
@dataclass
class TopologyMetrics:
    """
    Topological features of a graph - content-free.

    These metrics describe the SHAPE of a knowledge graph
    without knowing what any node or edge actually contains.
    Produced by GraphFingerprinter.extract_topology; most values are
    read straight off the graph object, so their exact definitions
    (e.g. which community-detection algorithm) live in the graph
    implementation, not here.
    """
    # Size metrics
    node_count: int
    edge_count: int
    density: float                    # edges / possible edges (undirected)

    # Degree distribution
    avg_degree: float
    max_degree: int
    degree_variance: float            # population variance of degrees
    degree_entropy: float             # How uniform is the degree distribution

    # Clustering
    global_clustering: float          # Overall clustering coefficient
    local_clustering_avg: float       # Average local clustering
    local_clustering_variance: float

    # Path metrics
    avg_path_length: float            # Average shortest path
    diameter: int                     # Longest shortest path
    radius: int                       # Shortest eccentricity

    # Centrality distribution (entropy of the normalized value list:
    # high = influence spread evenly, low = a few hubs dominate)
    betweenness_entropy: float        # How distributed is betweenness
    closeness_entropy: float
    pagerank_entropy: float

    # Community structure
    modularity: float                 # How modular is the graph
    community_count: int              # Detected communities
    community_size_variance: float

    # Temporal patterns
    growth_rate: float                # Nodes added per time unit
    edge_formation_rate: float        # Edges added per time unit
    revision_frequency: float         # How often nodes are modified

    # Activity patterns
    activity_entropy: float           # How distributed is activity over time
    burst_coefficient: float          # Tendency for bursty activity
 78  
 79  
@dataclass
class AttentionPattern:
    """
    Patterns in how attention flows through the graph - content-free.

    This captures the RHYTHM of cognition without knowing the content.
    Produced by GraphFingerprinter.extract_attention from an
    AttentionLog of anonymized node IDs and timestamps.
    """
    # Traversal patterns
    avg_session_length: int           # Average nodes visited per session
    revisit_rate: float               # Fraction of visits that are re-visits
    jump_distance_avg: float          # Average hops between visited nodes

    # Temporal attention
    attention_entropy: float          # How distributed is attention
    recency_bias: float               # Preference for recent vs. old (0.5 = neutral)
    periodicity: Optional[float]      # Detected period length, or None if aperiodic

    # Depth patterns
    depth_preference: float           # -1 (shallow) to +1 (deep)
    breadth_preference: float         # -1 (focused) to +1 (exploratory)

    # Return patterns
    avg_return_time: float            # Time before returning to a node
    return_probability: float         # Probability of returning at all
104  
105  
@dataclass
class GraphFingerprint:
    """
    Shareable, content-free signature of a graph.

    Bundles the normalized topology/attention feature vectors together
    with their LSH buckets; this is the only artifact that is ever
    shared with the flock.
    """
    # Identity (anonymized)
    fingerprint_id: str               # Hash of the fingerprint itself
    blanket_id: str                   # Anonymized Markov blanket ID
    timestamp: str                    # When extracted

    # Structural fingerprint
    topology_vector: List[float]      # Normalized topology metrics
    attention_vector: List[float]     # Normalized attention patterns

    # LSH buckets for fast similarity
    topology_buckets: List[int]
    attention_buckets: List[int]

    # Version info
    version: str                      # Algorithm version

    @property
    def combined_vector(self) -> List[float]:
        """Topology features followed by attention features, as a new list."""
        return [*self.topology_vector, *self.attention_vector]
133  
134  
class GraphFingerprinter:
    """
    Extracts fingerprints from graph structures.

    CRITICAL: This class NEVER accesses content. Only topology and
    attention patterns derived from anonymized IDs and timestamps.
    """

    def __init__(self, version: str = "0.1.0"):
        """
        Args:
            version: Algorithm version stamped into every fingerprint so
                     peers can reject vectors from an incompatible extractor.
        """
        self.version = version
        # Number of LSH bands per feature family. More bands => smaller
        # bands => coarser (more permissive) candidate matching per band.
        self.topology_lsh_bands = 15
        self.attention_lsh_bands = 10

    def extract_topology(self, graph: 'Graph') -> 'TopologyMetrics':
        """
        Extract topological metrics from a graph.

        Args:
            graph: The graph structure (we only look at connections, not content)

        Returns:
            TopologyMetrics with structure-only features
        """
        n = graph.node_count
        e = graph.edge_count

        # Density: fraction of possible undirected edges that exist.
        max_edges = n * (n - 1) / 2 if n > 1 else 1
        density = e / max_edges if max_edges > 0 else 0

        # Degree distribution
        degrees = graph.degree_sequence
        avg_degree = sum(degrees) / len(degrees) if degrees else 0
        max_degree = max(degrees) if degrees else 0
        degree_variance = self._variance(degrees)
        degree_entropy = self._entropy(self._normalize_distribution(degrees))

        # Clustering
        global_clustering = graph.global_clustering_coefficient
        local_clusterings = graph.local_clustering_coefficients
        local_clustering_avg = sum(local_clusterings) / len(local_clusterings) if local_clusterings else 0
        local_clustering_variance = self._variance(local_clusterings)

        # Path metrics (assumed precomputed by the graph implementation)
        avg_path_length = graph.average_path_length
        diameter = graph.diameter
        radius = graph.radius

        # Centrality entropy: high entropy = influence spread evenly,
        # low entropy = a few hub nodes dominate.
        betweenness = graph.betweenness_centrality_values
        closeness = graph.closeness_centrality_values
        pagerank = graph.pagerank_values

        betweenness_entropy = self._entropy(self._normalize_distribution(betweenness))
        closeness_entropy = self._entropy(self._normalize_distribution(closeness))
        pagerank_entropy = self._entropy(self._normalize_distribution(pagerank))

        # Community structure
        modularity = graph.modularity
        community_sizes = graph.community_sizes
        community_count = len(community_sizes)
        community_size_variance = self._variance(community_sizes)

        # Temporal patterns
        growth_rate = graph.growth_rate
        edge_formation_rate = graph.edge_formation_rate
        revision_frequency = graph.revision_frequency

        # Activity patterns
        activity_distribution = graph.activity_distribution
        activity_entropy = self._entropy(activity_distribution)
        burst_coefficient = graph.burst_coefficient

        return TopologyMetrics(
            node_count=n,
            edge_count=e,
            density=density,
            avg_degree=avg_degree,
            max_degree=max_degree,
            degree_variance=degree_variance,
            degree_entropy=degree_entropy,
            global_clustering=global_clustering,
            local_clustering_avg=local_clustering_avg,
            local_clustering_variance=local_clustering_variance,
            avg_path_length=avg_path_length,
            diameter=diameter,
            radius=radius,
            betweenness_entropy=betweenness_entropy,
            closeness_entropy=closeness_entropy,
            pagerank_entropy=pagerank_entropy,
            modularity=modularity,
            community_count=community_count,
            community_size_variance=community_size_variance,
            growth_rate=growth_rate,
            edge_formation_rate=edge_formation_rate,
            revision_frequency=revision_frequency,
            activity_entropy=activity_entropy,
            burst_coefficient=burst_coefficient,
        )

    def extract_attention(self, attention_log: 'AttentionLog') -> 'AttentionPattern':
        """
        Extract attention patterns from usage logs.

        Args:
            attention_log: Log of what nodes were visited when
                          (we only look at IDs and timestamps, not content)

        Returns:
            AttentionPattern with attention-only features
        """
        sessions = attention_log.sessions

        # Traversal patterns
        session_lengths = [len(s.nodes_visited) for s in sessions]
        avg_session_length = sum(session_lengths) / len(session_lengths) if session_lengths else 0

        # revisit_rate = (visits beyond the first, per node) / total visits.
        revisit_counts = attention_log.revisit_counts
        total_visits = sum(revisit_counts.values())
        revisit_rate = sum(c - 1 for c in revisit_counts.values() if c > 1) / total_visits if total_visits else 0

        jump_distances = attention_log.jump_distances
        jump_distance_avg = sum(jump_distances) / len(jump_distances) if jump_distances else 0

        # Temporal attention
        visit_distribution = attention_log.visit_distribution
        attention_entropy = self._entropy(visit_distribution)

        # Default 0.5 = neutral recency preference when no data is available.
        recency_weights = attention_log.recency_weights
        recency_bias = sum(recency_weights) / len(recency_weights) if recency_weights else 0.5

        periodicity = self._detect_periodicity(attention_log.timestamps)

        # Depth/breadth preferences: rescale avg/max from 0..1 to -1..+1.
        # NOTE(review): with no data, max defaults to 1 and avg to 0, which
        # yields -1 (fully shallow/focused) rather than 0 — confirm intended.
        depths_visited = attention_log.depths_visited
        avg_depth = sum(depths_visited) / len(depths_visited) if depths_visited else 0
        max_depth = max(depths_visited) if depths_visited else 1
        depth_preference = (avg_depth / max_depth - 0.5) * 2 if max_depth > 0 else 0

        breadths_visited = attention_log.breadths_visited
        avg_breadth = sum(breadths_visited) / len(breadths_visited) if breadths_visited else 0
        max_breadth = max(breadths_visited) if breadths_visited else 1
        breadth_preference = (avg_breadth / max_breadth - 0.5) * 2 if max_breadth > 0 else 0

        # Return patterns
        return_times = attention_log.return_times
        avg_return_time = sum(return_times) / len(return_times) if return_times else 0
        return_probability = len([t for t in return_times if t > 0]) / len(return_times) if return_times else 0

        return AttentionPattern(
            avg_session_length=int(avg_session_length),
            revisit_rate=revisit_rate,
            jump_distance_avg=jump_distance_avg,
            attention_entropy=attention_entropy,
            recency_bias=recency_bias,
            periodicity=periodicity,
            depth_preference=depth_preference,
            breadth_preference=breadth_preference,
            avg_return_time=avg_return_time,
            return_probability=return_probability,
        )

    def compute_fingerprint(
        self,
        topology: 'TopologyMetrics',
        attention: 'AttentionPattern',
        blanket_id: str,
        timestamp: str
    ) -> 'GraphFingerprint':
        """
        Compute the graph fingerprint from extracted metrics.

        This is the value that can be shared with the flock.

        Args:
            topology: Metrics from extract_topology.
            attention: Patterns from extract_attention.
            blanket_id: Anonymized Markov blanket ID (caller-supplied).
            timestamp: Extraction time (caller-supplied string).

        Returns:
            GraphFingerprint whose id is the SHA-256 of the combined
            feature vector (content-free and deterministic).
        """
        # Normalize metrics into fixed-length feature vectors.
        topology_vector = self._topology_to_vector(topology)
        attention_vector = self._attention_to_vector(attention)

        # LSH buckets for fast similarity candidate lookup.
        topology_buckets = self._compute_lsh(topology_vector, self.topology_lsh_bands)
        attention_buckets = self._compute_lsh(attention_vector, self.attention_lsh_bands)

        # Fingerprint ID is a hash of the features themselves, so two
        # identical structures produce the same ID.
        combined = topology_vector + attention_vector
        fingerprint_id = hashlib.sha256(
            json.dumps(combined, sort_keys=True).encode()
        ).hexdigest()

        return GraphFingerprint(
            fingerprint_id=fingerprint_id,
            blanket_id=blanket_id,
            timestamp=timestamp,
            topology_vector=topology_vector,
            attention_vector=attention_vector,
            topology_buckets=topology_buckets,
            attention_buckets=attention_buckets,
            version=self.version,
        )

    def similarity(
        self,
        fp1: 'GraphFingerprint',
        fp2: 'GraphFingerprint',
        topology_weight: float = 0.6,
        attention_weight: float = 0.4
    ) -> float:
        """
        Compute similarity between two fingerprints.

        Weighted cosine similarity of the two feature families.
        Weights should sum to 1 for the result to stay within 0..1.

        Returns value between 0 (completely different) and 1 (identical).
        """
        topology_sim = self._cosine_similarity(fp1.topology_vector, fp2.topology_vector)
        attention_sim = self._cosine_similarity(fp1.attention_vector, fp2.attention_vector)
        return topology_weight * topology_sim + attention_weight * attention_sim

    def cluster_fingerprints(
        self,
        fingerprints: List['GraphFingerprint'],
        n_clusters: int = 10
    ) -> Dict[int, List['GraphFingerprint']]:
        """
        Cluster fingerprints by similarity.

        Used by the flock to identify patterns across users. This is a
        single-pass LSH bucketing, not iterative k-means: fingerprints
        with identical bucket signatures always land together, and the
        modulo folds signatures into at most n_clusters groups.
        """
        clusters: Dict[int, List['GraphFingerprint']] = defaultdict(list)

        for fp in fingerprints:
            # hash() of a tuple of ints is deterministic across runs
            # (PYTHONHASHSEED only randomizes str/bytes hashing).
            bucket_hash = hash(tuple(fp.topology_buckets + fp.attention_buckets))
            cluster_id = bucket_hash % n_clusters
            clusters[cluster_id].append(fp)

        return dict(clusters)

    # --- Private helper methods ---

    def _variance(self, values: List[float]) -> float:
        """Population variance; 0.0 for an empty list."""
        if not values:
            return 0.0
        mean = sum(values) / len(values)
        return sum((x - mean) ** 2 for x in values) / len(values)

    def _entropy(self, distribution: List[float]) -> float:
        """Shannon entropy (bits) of a distribution.

        The input is renormalized by its sum; zero-mass entries are
        skipped (0 * log 0 is treated as 0).
        """
        if not distribution:
            return 0.0
        total = sum(distribution)
        if total == 0:
            return 0.0
        probs = [x / total for x in distribution if x > 0]
        return -sum(p * math.log2(p) for p in probs if p > 0)

    def _normalize_distribution(self, values: List[float]) -> List[float]:
        """Scale values to sum to 1; an all-zero list becomes uniform."""
        if not values:
            return []
        total = sum(values)
        if total == 0:
            return [1.0 / len(values)] * len(values)
        return [x / total for x in values]

    def _detect_periodicity(self, timestamps: List[float]) -> Optional[float]:
        """Detect periodic patterns in timestamps.

        Returns the average interval when the intervals are regular
        (squared coefficient of variation < 0.1), else None. Needs at
        least 10 timestamps to say anything.

        NOTE(review): assumes timestamps are sorted ascending — confirm
        the AttentionLog producer guarantees this.
        """
        if len(timestamps) < 10:
            return None
        intervals = [timestamps[i+1] - timestamps[i] for i in range(len(timestamps)-1)]
        if not intervals:
            return None
        avg_interval = sum(intervals) / len(intervals)
        variance = self._variance(intervals)
        # Low variance relative to the mean interval => periodic.
        if avg_interval > 0 and variance / (avg_interval ** 2) < 0.1:
            return avg_interval
        return None

    def _topology_to_vector(self, t: 'TopologyMetrics') -> List[float]:
        """Convert topology metrics to a 24-dim vector clamped to 0..1.

        The divisors are heuristic scale caps (e.g. graphs beyond 10k
        nodes all map to 1.0); changing them changes fingerprint
        compatibility, hence the version field.
        """
        return [
            min(t.node_count / 10000, 1.0),
            min(t.edge_count / 50000, 1.0),
            t.density,
            min(t.avg_degree / 20, 1.0),
            min(t.max_degree / 100, 1.0),
            min(t.degree_variance / 100, 1.0),
            min(t.degree_entropy / 10, 1.0),
            t.global_clustering,
            t.local_clustering_avg,
            min(t.local_clustering_variance, 1.0),
            min(t.avg_path_length / 20, 1.0),
            min(t.diameter / 50, 1.0),
            min(t.radius / 25, 1.0),
            min(t.betweenness_entropy / 10, 1.0),
            min(t.closeness_entropy / 10, 1.0),
            min(t.pagerank_entropy / 10, 1.0),
            min(t.modularity, 1.0),
            min(t.community_count / 50, 1.0),
            min(t.community_size_variance / 1000, 1.0),
            min(t.growth_rate / 100, 1.0),
            min(t.edge_formation_rate / 500, 1.0),
            min(t.revision_frequency / 10, 1.0),
            min(t.activity_entropy / 10, 1.0),
            min(t.burst_coefficient, 1.0),
        ]

    def _attention_to_vector(self, a: 'AttentionPattern') -> List[float]:
        """Convert attention pattern to a 10-dim vector clamped to 0..1."""
        return [
            min(a.avg_session_length / 100, 1.0),
            a.revisit_rate,
            min(a.jump_distance_avg / 10, 1.0),
            min(a.attention_entropy / 10, 1.0),
            a.recency_bias,
            # Only presence of periodicity is encoded, not the period.
            1.0 if a.periodicity else 0.0,
            (a.depth_preference + 1) / 2,      # Normalize -1..1 to 0..1
            (a.breadth_preference + 1) / 2,
            min(a.avg_return_time / 1000, 1.0),
            a.return_probability,
        ]

    def _compute_lsh(self, vector: List[float], n_bands: int) -> List[int]:
        """Compute LSH bucket ids (one 16-bit bucket per band).

        Band boundaries are spread proportionally across the vector so
        that EVERY feature lands in exactly one band even when n_bands
        does not divide len(vector). (The previous fixed band size of
        len // n_bands hashed only the first n_bands features of the
        24-dim topology vector, silently ignoring the rest.)
        """
        buckets = []
        length = len(vector)

        for band in range(n_bands):
            # Proportional slicing: band k covers [k*L//B, (k+1)*L//B).
            start = (band * length) // n_bands
            end = ((band + 1) * length) // n_bands
            band_features = vector[start:end]

            band_bytes = json.dumps(band_features).encode()
            # md5 is acceptable here: buckets are a locality hint, not a
            # security boundary.
            band_hash = int(hashlib.md5(band_bytes).hexdigest(), 16)
            buckets.append(band_hash % (2 ** 16))

        return buckets

    def _cosine_similarity(self, v1: List[float], v2: List[float]) -> float:
        """Cosine similarity; 0.0 for mismatched lengths or zero vectors."""
        if len(v1) != len(v2) or not v1:
            return 0.0

        dot_product = sum(a * b for a, b in zip(v1, v2))
        norm1 = math.sqrt(sum(a * a for a in v1))
        norm2 = math.sqrt(sum(b * b for b in v2))

        if norm1 == 0 or norm2 == 0:
            return 0.0

        return dot_product / (norm1 * norm2)
492  
493  
494  # --- Placeholder types ---
495  
@dataclass
class Graph:
    """Placeholder for actual graph implementation.

    Exposes only structural and statistical views of the graph; node
    and edge content is deliberately absent (module privacy guarantee).
    All derived metrics (centralities, modularity, rates) are expected
    to be precomputed by the real implementation.
    """
    node_count: int = 0
    edge_count: int = 0
    degree_sequence: List[int] = field(default_factory=list)       # one entry per node
    global_clustering_coefficient: float = 0.0
    local_clustering_coefficients: List[float] = field(default_factory=list)
    average_path_length: float = 0.0
    diameter: int = 0
    radius: int = 0
    betweenness_centrality_values: List[float] = field(default_factory=list)
    closeness_centrality_values: List[float] = field(default_factory=list)
    pagerank_values: List[float] = field(default_factory=list)
    modularity: float = 0.0
    community_sizes: List[int] = field(default_factory=list)       # one entry per detected community
    growth_rate: float = 0.0
    edge_formation_rate: float = 0.0
    revision_frequency: float = 0.0
    activity_distribution: List[float] = field(default_factory=list)
    burst_coefficient: float = 0.0
517  
518  
@dataclass
class Session:
    """Placeholder for session data.

    Records only node identifiers in visit order — never node content.
    """
    nodes_visited: List[str] = field(default_factory=list)
523  
524  
@dataclass
class AttentionLog:
    """Placeholder for attention log.

    Holds only anonymized IDs, counts, and timestamps — no content.
    Consumed by GraphFingerprinter.extract_attention.
    """
    sessions: List[Session] = field(default_factory=list)
    revisit_counts: Dict[str, int] = field(default_factory=dict)   # node id -> total visit count
    jump_distances: List[float] = field(default_factory=list)      # presumably graph hops between consecutive visits — confirm with producer
    visit_distribution: List[float] = field(default_factory=list)  # visit mass over time buckets (fed to entropy)
    recency_weights: List[float] = field(default_factory=list)
    timestamps: List[float] = field(default_factory=list)          # assumed sorted ascending — TODO confirm
    depths_visited: List[int] = field(default_factory=list)
    breadths_visited: List[int] = field(default_factory=list)
    return_times: List[float] = field(default_factory=list)
537  
538  
539  # --- Demonstration ---
540  
if __name__ == "__main__":
    # Informational banner only — no fingerprinting is actually run here.
    banner = [
        "Graph Fingerprint System",
        "=" * 50,
        "",
        "This system extracts structural fingerprints from",
        "knowledge graphs WITHOUT accessing content.",
        "",
        "What we capture:",
        "  - Graph topology (density, clustering, paths)",
        "  - Attention patterns (session length, revisits)",
        "  - Temporal patterns (growth rate, periodicity)",
        "",
        "What we NEVER capture:",
        "  - Node content (what nodes say)",
        "  - Edge labels (why things connect)",
        "  - User identity (who you are)",
        "",
        "The fingerprint enables:",
        "  - Finding similar thinkers (by structure)",
        "  - Federated learning (improve shared models)",
        "  - Resonance beacons (signal relevance)",
        "",
        "Privacy guarantee: Structure only. Content never leaves.",
    ]
    print("\n".join(banner))