fingerprint.py
"""
Pattern Fingerprint System

Extracts structural signatures from graph topology without accessing content.
Enables similarity detection, federated learning, and flock coordination.

Architecture:
- Graphs are analyzed for topological features only
- Content (node labels, edge labels) is NEVER accessed
- Fingerprints enable:
  - Similarity clustering across the flock
  - Federated learning contribution
  - Resonance beacon matching
  - Aggregate pattern detection

Privacy Guarantee:
- Fingerprints cannot be reversed to recover content
- No identifying information is included
- Individual fingerprints are never shared - only aggregates
"""

from dataclasses import dataclass, field
from typing import List, Dict, Optional, Set, Tuple
from enum import Enum
import hashlib
import json
import math
from collections import defaultdict


@dataclass
class TopologyMetrics:
    """
    Topological features of a graph - content-free.

    These metrics describe the SHAPE of a knowledge graph
    without knowing what any node or edge actually contains.
    """
    # Size metrics
    node_count: int
    edge_count: int
    density: float  # edges / possible edges

    # Degree distribution
    avg_degree: float
    max_degree: int
    degree_variance: float
    degree_entropy: float  # How uniform is the degree distribution

    # Clustering
    global_clustering: float  # Overall clustering coefficient
    local_clustering_avg: float  # Average local clustering
    local_clustering_variance: float

    # Path metrics
    avg_path_length: float  # Average shortest path
    diameter: int  # Longest shortest path
    radius: int  # Shortest eccentricity

    # Centrality distribution
    betweenness_entropy: float  # How distributed is betweenness
    closeness_entropy: float
    pagerank_entropy: float

    # Community structure
    modularity: float  # How modular is the graph
    community_count: int  # Detected communities
    community_size_variance: float

    # Temporal patterns
    growth_rate: float  # Nodes added per time unit
    edge_formation_rate: float  # Edges added per time unit
    revision_frequency: float  # How often nodes are modified

    # Activity patterns
    activity_entropy: float  # How distributed is activity over time
    burst_coefficient: float  # Tendency for bursty activity


@dataclass
class AttentionPattern:
    """
    Patterns in how attention flows through the graph - content-free.

    This captures the RHYTHM of cognition without knowing the content.
    """
    # Traversal patterns
    avg_session_length: int  # Average nodes visited per session
    revisit_rate: float  # How often nodes are revisited
    jump_distance_avg: float  # Average hops between visited nodes

    # Temporal attention
    attention_entropy: float  # How distributed is attention
    recency_bias: float  # Preference for recent vs. old
    periodicity: Optional[float]  # Detected periodic patterns

    # Depth patterns
    depth_preference: float  # -1 (shallow) to +1 (deep)
    breadth_preference: float  # -1 (focused) to +1 (exploratory)

    # Return patterns
    avg_return_time: float  # Time before returning to a node
    return_probability: float  # Probability of returning at all


@dataclass
class GraphFingerprint:
    """
    The fingerprint of a graph's structure and attention patterns.

    This is what can be safely shared with the flock.
    """
    # Identity (anonymized)
    fingerprint_id: str  # Hash of the fingerprint itself
    blanket_id: str  # Anonymized Markov blanket ID
    timestamp: str  # When extracted

    # Structural fingerprint
    topology_vector: List[float]  # Normalized topology metrics
    attention_vector: List[float]  # Normalized attention patterns

    # LSH buckets for fast similarity
    topology_buckets: List[int]
    attention_buckets: List[int]

    # Version info
    version: str  # Algorithm version

    @property
    def combined_vector(self) -> List[float]:
        """Combined feature vector for overall similarity"""
        return self.topology_vector + self.attention_vector


class GraphFingerprinter:
    """
    Extracts fingerprints from graph structures.

    CRITICAL: This class NEVER accesses content. Only topology and patterns.
    """

    def __init__(self, version: str = "0.1.0"):
        self.version = version
        # Number of LSH bands per feature family; more bands -> finer buckets.
        self.topology_lsh_bands = 15
        self.attention_lsh_bands = 10

    def extract_topology(self, graph: 'Graph') -> TopologyMetrics:
        """
        Extract topological metrics from a graph.

        Args:
            graph: The graph structure (we only look at connections, not content)

        Returns:
            TopologyMetrics with structure-only features
        """
        n = graph.node_count
        e = graph.edge_count

        # Density (undirected convention: n*(n-1)/2 possible edges)
        max_edges = n * (n - 1) / 2 if n > 1 else 1
        density = e / max_edges if max_edges > 0 else 0

        # Degree distribution
        degrees = graph.degree_sequence
        avg_degree = sum(degrees) / len(degrees) if degrees else 0
        max_degree = max(degrees) if degrees else 0
        degree_variance = self._variance(degrees)
        degree_entropy = self._entropy(self._normalize_distribution(degrees))

        # Clustering
        global_clustering = graph.global_clustering_coefficient
        local_clusterings = graph.local_clustering_coefficients
        local_clustering_avg = sum(local_clusterings) / len(local_clusterings) if local_clusterings else 0
        local_clustering_variance = self._variance(local_clusterings)

        # Path metrics
        avg_path_length = graph.average_path_length
        diameter = graph.diameter
        radius = graph.radius

        # Centrality entropy: how concentrated influence is, not where it sits.
        betweenness = graph.betweenness_centrality_values
        closeness = graph.closeness_centrality_values
        pagerank = graph.pagerank_values

        betweenness_entropy = self._entropy(self._normalize_distribution(betweenness))
        closeness_entropy = self._entropy(self._normalize_distribution(closeness))
        pagerank_entropy = self._entropy(self._normalize_distribution(pagerank))

        # Community structure
        modularity = graph.modularity
        community_sizes = graph.community_sizes
        community_count = len(community_sizes)
        community_size_variance = self._variance(community_sizes)

        # Temporal patterns
        growth_rate = graph.growth_rate
        edge_formation_rate = graph.edge_formation_rate
        revision_frequency = graph.revision_frequency

        # Activity patterns
        activity_distribution = graph.activity_distribution
        activity_entropy = self._entropy(activity_distribution)
        burst_coefficient = graph.burst_coefficient

        return TopologyMetrics(
            node_count=n,
            edge_count=e,
            density=density,
            avg_degree=avg_degree,
            max_degree=max_degree,
            degree_variance=degree_variance,
            degree_entropy=degree_entropy,
            global_clustering=global_clustering,
            local_clustering_avg=local_clustering_avg,
            local_clustering_variance=local_clustering_variance,
            avg_path_length=avg_path_length,
            diameter=diameter,
            radius=radius,
            betweenness_entropy=betweenness_entropy,
            closeness_entropy=closeness_entropy,
            pagerank_entropy=pagerank_entropy,
            modularity=modularity,
            community_count=community_count,
            community_size_variance=community_size_variance,
            growth_rate=growth_rate,
            edge_formation_rate=edge_formation_rate,
            revision_frequency=revision_frequency,
            activity_entropy=activity_entropy,
            burst_coefficient=burst_coefficient,
        )

    def extract_attention(self, attention_log: 'AttentionLog') -> AttentionPattern:
        """
        Extract attention patterns from usage logs.

        Args:
            attention_log: Log of what nodes were visited when
                           (we only look at IDs and timestamps, not content)

        Returns:
            AttentionPattern with attention-only features
        """
        sessions = attention_log.sessions

        # Traversal patterns
        session_lengths = [len(s.nodes_visited) for s in sessions]
        avg_session_length = sum(session_lengths) / len(session_lengths) if session_lengths else 0

        revisit_counts = attention_log.revisit_counts
        total_visits = sum(revisit_counts.values())
        # Fraction of all visits that were repeat visits (first visits excluded).
        revisit_rate = sum(c - 1 for c in revisit_counts.values() if c > 1) / total_visits if total_visits else 0

        jump_distances = attention_log.jump_distances
        jump_distance_avg = sum(jump_distances) / len(jump_distances) if jump_distances else 0

        # Temporal attention
        visit_distribution = attention_log.visit_distribution
        attention_entropy = self._entropy(visit_distribution)

        recency_weights = attention_log.recency_weights
        recency_bias = sum(recency_weights) / len(recency_weights) if recency_weights else 0.5

        periodicity = self._detect_periodicity(attention_log.timestamps)

        # Depth patterns.  An empty log yields a neutral 0.0 preference;
        # the naive (avg/max - 0.5) * 2 formula would report a spurious
        # maximal "shallow" bias of -1.0 for no data at all.
        depths_visited = attention_log.depths_visited
        if depths_visited:
            avg_depth = sum(depths_visited) / len(depths_visited)
            max_depth = max(depths_visited)
            depth_preference = (avg_depth / max_depth - 0.5) * 2 if max_depth > 0 else 0.0
        else:
            depth_preference = 0.0

        # Breadth patterns: same neutral-on-empty handling as depth.
        breadths_visited = attention_log.breadths_visited
        if breadths_visited:
            avg_breadth = sum(breadths_visited) / len(breadths_visited)
            max_breadth = max(breadths_visited)
            breadth_preference = (avg_breadth / max_breadth - 0.5) * 2 if max_breadth > 0 else 0.0
        else:
            breadth_preference = 0.0

        # Return patterns
        return_times = attention_log.return_times
        avg_return_time = sum(return_times) / len(return_times) if return_times else 0
        return_probability = len([t for t in return_times if t > 0]) / len(return_times) if return_times else 0

        return AttentionPattern(
            avg_session_length=int(avg_session_length),
            revisit_rate=revisit_rate,
            jump_distance_avg=jump_distance_avg,
            attention_entropy=attention_entropy,
            recency_bias=recency_bias,
            periodicity=periodicity,
            depth_preference=depth_preference,
            breadth_preference=breadth_preference,
            avg_return_time=avg_return_time,
            return_probability=return_probability,
        )

    def compute_fingerprint(
        self,
        topology: TopologyMetrics,
        attention: AttentionPattern,
        blanket_id: str,
        timestamp: str
    ) -> GraphFingerprint:
        """
        Compute the graph fingerprint from extracted metrics.

        This is the value that can be shared with the flock.

        Args:
            topology: Structure-only metrics from extract_topology()
            attention: Usage-pattern metrics from extract_attention()
            blanket_id: Anonymized Markov blanket identifier
            timestamp: ISO timestamp of extraction

        Returns:
            A shareable GraphFingerprint (vectors, LSH buckets, hash ID).
        """
        # Normalize topology to vector
        topology_vector = self._topology_to_vector(topology)

        # Normalize attention to vector
        attention_vector = self._attention_to_vector(attention)

        # Compute LSH buckets
        topology_buckets = self._compute_lsh(topology_vector, self.topology_lsh_bands)
        attention_buckets = self._compute_lsh(attention_vector, self.attention_lsh_bands)

        # Fingerprint ID is a content-free hash of the normalized vectors.
        combined = topology_vector + attention_vector
        fingerprint_id = hashlib.sha256(
            json.dumps(combined, sort_keys=True).encode()
        ).hexdigest()

        return GraphFingerprint(
            fingerprint_id=fingerprint_id,
            blanket_id=blanket_id,
            timestamp=timestamp,
            topology_vector=topology_vector,
            attention_vector=attention_vector,
            topology_buckets=topology_buckets,
            attention_buckets=attention_buckets,
            version=self.version,
        )

    def similarity(
        self,
        fp1: GraphFingerprint,
        fp2: GraphFingerprint,
        topology_weight: float = 0.6,
        attention_weight: float = 0.4
    ) -> float:
        """
        Compute similarity between two fingerprints.

        Returns value between 0 (completely different) and 1 (identical),
        assuming the two weights sum to 1.
        """
        # Topology similarity
        topology_sim = self._cosine_similarity(fp1.topology_vector, fp2.topology_vector)

        # Attention similarity
        attention_sim = self._cosine_similarity(fp1.attention_vector, fp2.attention_vector)

        # Weighted combination
        return topology_weight * topology_sim + attention_weight * attention_sim

    def cluster_fingerprints(
        self,
        fingerprints: List[GraphFingerprint],
        n_clusters: int = 10
    ) -> Dict[int, List[GraphFingerprint]]:
        """
        Cluster fingerprints by similarity.

        Used by the flock to identify patterns across users.

        Args:
            fingerprints: Fingerprints to group
            n_clusters: Number of cluster buckets

        Returns:
            Mapping of cluster id -> fingerprints assigned to it.
        """
        # Simple k-means-like clustering using LSH buckets for speed
        clusters: Dict[int, List[GraphFingerprint]] = defaultdict(list)

        # Use LSH buckets to assign initial clusters.  hash() of a tuple of
        # ints is deterministic (no string-hash randomization involved).
        for fp in fingerprints:
            # Hash of combined buckets determines cluster
            bucket_hash = hash(tuple(fp.topology_buckets + fp.attention_buckets))
            cluster_id = bucket_hash % n_clusters
            clusters[cluster_id].append(fp)

        return dict(clusters)

    # --- Private helper methods ---

    def _variance(self, values: List[float]) -> float:
        """Population variance; 0.0 for an empty list."""
        if not values:
            return 0.0
        mean = sum(values) / len(values)
        return sum((x - mean) ** 2 for x in values) / len(values)

    def _entropy(self, distribution: List[float]) -> float:
        """Compute Shannon entropy (bits) of a distribution.

        The input is re-normalized internally, so unnormalized counts work too.
        """
        if not distribution:
            return 0.0
        total = sum(distribution)
        if total == 0:
            return 0.0
        probs = [x / total for x in distribution if x > 0]
        return -sum(p * math.log2(p) for p in probs if p > 0)

    def _normalize_distribution(self, values: List[float]) -> List[float]:
        """Scale values to sum to 1; all-zero input becomes uniform."""
        if not values:
            return []
        total = sum(values)
        if total == 0:
            return [1.0 / len(values)] * len(values)
        return [x / total for x in values]

    def _detect_periodicity(self, timestamps: List[float]) -> Optional[float]:
        """Detect periodic patterns in timestamps.

        Returns the average interval if the intervals are regular
        (coefficient of variation squared < 0.1), otherwise None.
        Needs at least 10 timestamps to avoid spurious detections.
        """
        if len(timestamps) < 10:
            return None
        # Simplified: compute intervals and check for regularity
        intervals = [timestamps[i + 1] - timestamps[i] for i in range(len(timestamps) - 1)]
        if not intervals:
            return None
        avg_interval = sum(intervals) / len(intervals)
        variance = self._variance(intervals)
        # If variance is low relative to mean, there's periodicity
        if avg_interval > 0 and variance / (avg_interval ** 2) < 0.1:
            return avg_interval
        return None

    def _topology_to_vector(self, t: TopologyMetrics) -> List[float]:
        """Convert topology metrics to a 24-element vector in [0, 1].

        Divisors are soft caps chosen so typical graphs spread across the
        range; values beyond a cap saturate at 1.0.
        """
        return [
            min(t.node_count / 10000, 1.0),
            min(t.edge_count / 50000, 1.0),
            t.density,
            min(t.avg_degree / 20, 1.0),
            min(t.max_degree / 100, 1.0),
            min(t.degree_variance / 100, 1.0),
            min(t.degree_entropy / 10, 1.0),
            t.global_clustering,
            t.local_clustering_avg,
            min(t.local_clustering_variance, 1.0),
            min(t.avg_path_length / 20, 1.0),
            min(t.diameter / 50, 1.0),
            min(t.radius / 25, 1.0),
            min(t.betweenness_entropy / 10, 1.0),
            min(t.closeness_entropy / 10, 1.0),
            min(t.pagerank_entropy / 10, 1.0),
            min(t.modularity, 1.0),
            min(t.community_count / 50, 1.0),
            min(t.community_size_variance / 1000, 1.0),
            min(t.growth_rate / 100, 1.0),
            min(t.edge_formation_rate / 500, 1.0),
            min(t.revision_frequency / 10, 1.0),
            min(t.activity_entropy / 10, 1.0),
            min(t.burst_coefficient, 1.0),
        ]

    def _attention_to_vector(self, a: AttentionPattern) -> List[float]:
        """Convert attention pattern to a 10-element vector in [0, 1]."""
        return [
            min(a.avg_session_length / 100, 1.0),
            a.revisit_rate,
            min(a.jump_distance_avg / 10, 1.0),
            min(a.attention_entropy / 10, 1.0),
            a.recency_bias,
            # Explicit None check: a (theoretical) period of 0.0 must not be
            # conflated with "no periodicity detected".
            0.0 if a.periodicity is None else 1.0,
            (a.depth_preference + 1) / 2,  # Normalize -1..1 to 0..1
            (a.breadth_preference + 1) / 2,
            min(a.avg_return_time / 1000, 1.0),
            a.return_probability,
        ]

    def _compute_lsh(self, vector: List[float], n_bands: int) -> List[int]:
        """Compute LSH buckets for similarity search.

        The vector is partitioned with proportional slicing so that EVERY
        feature lands in exactly one band.  The previous fixed band size
        (len // n_bands) silently dropped the trailing features whenever the
        vector length was not a multiple of n_bands - e.g. 9 of the 24
        topology features never influenced any bucket.
        """
        buckets = []
        length = len(vector)

        for band in range(n_bands):
            # Proportional split: slices are contiguous, non-overlapping,
            # and jointly cover the whole vector.
            start = band * length // n_bands
            end = (band + 1) * length // n_bands
            band_features = vector[start:end]

            # MD5 is used purely as a fast, stable bucketing hash here -
            # it carries no security function.
            band_bytes = json.dumps(band_features).encode()
            band_hash = int(hashlib.md5(band_bytes).hexdigest(), 16)
            buckets.append(band_hash % (2 ** 16))

        return buckets

    def _cosine_similarity(self, v1: List[float], v2: List[float]) -> float:
        """Compute cosine similarity between two vectors.

        Returns 0.0 for mismatched lengths, empty vectors, or zero norms.
        """
        if len(v1) != len(v2) or not v1:
            return 0.0

        dot_product = sum(a * b for a, b in zip(v1, v2))
        norm1 = math.sqrt(sum(a * a for a in v1))
        norm2 = math.sqrt(sum(b * b for b in v2))

        if norm1 == 0 or norm2 == 0:
            return 0.0

        return dot_product / (norm1 * norm2)


# --- Placeholder types ---

@dataclass
class Graph:
    """Placeholder for actual graph implementation"""
    node_count: int = 0
    edge_count: int = 0
    degree_sequence: List[int] = field(default_factory=list)
    global_clustering_coefficient: float = 0.0
    local_clustering_coefficients: List[float] = field(default_factory=list)
    average_path_length: float = 0.0
    diameter: int = 0
    radius: int = 0
    betweenness_centrality_values: List[float] = field(default_factory=list)
    closeness_centrality_values: List[float] = field(default_factory=list)
    pagerank_values: List[float] = field(default_factory=list)
    modularity: float = 0.0
    community_sizes: List[int] = field(default_factory=list)
    growth_rate: float = 0.0
    edge_formation_rate: float = 0.0
    revision_frequency: float = 0.0
    activity_distribution: List[float] = field(default_factory=list)
    burst_coefficient: float = 0.0


@dataclass
class Session:
    """Placeholder for session data"""
    nodes_visited: List[str] = field(default_factory=list)


@dataclass
class AttentionLog:
    """Placeholder for attention log"""
    sessions: List[Session] = field(default_factory=list)
    revisit_counts: Dict[str, int] = field(default_factory=dict)
    jump_distances: List[float] = field(default_factory=list)
    visit_distribution: List[float] = field(default_factory=list)
    recency_weights: List[float] = field(default_factory=list)
    timestamps: List[float] = field(default_factory=list)
    depths_visited: List[int] = field(default_factory=list)
    breadths_visited: List[int] = field(default_factory=list)
    return_times: List[float] = field(default_factory=list)


# --- Demonstration ---

if __name__ == "__main__":
    print("Graph Fingerprint System")
    print("=" * 50)
    print()
    print("This system extracts structural fingerprints from")
    print("knowledge graphs WITHOUT accessing content.")
    print()
    print("What we capture:")
    print("  - Graph topology (density, clustering, paths)")
    print("  - Attention patterns (session length, revisits)")
    print("  - Temporal patterns (growth rate, periodicity)")
    print()
    print("What we NEVER capture:")
    print("  - Node content (what nodes say)")
    print("  - Edge labels (why things connect)")
    print("  - User identity (who you are)")
    print()
    print("The fingerprint enables:")
    print("  - Finding similar thinkers (by structure)")
    print("  - Federated learning (improve shared models)")
    print("  - Resonance beacons (signal relevance)")
    print()
    print("Privacy guarantee: Structure only. Content never leaves.")