orphan_detector.py
#!/usr/bin/env python3
"""
Orphan Detector - Find unconnected pages in Sovereign_OS graph

Usage:
    python scripts/orphan_detector.py [--verbose] [--fix-suggestions]

This script:
1. Scans all markdown files in the repo
2. Extracts wiki-links [[like-this]]
3. Identifies orphan pages (no inbound links)
4. Reports pages missing principle/shape headers
5. Suggests connections based on content similarity

A4 Ergodicity Application:
Orphan pages are at risk of "ruin" - they can be forgotten,
lost in compression, or never discovered. This script prevents
page ruin by surfacing disconnection before it becomes permanent.
"""

import argparse
import os
import re
import sys
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Set, Tuple


# Directories to scan
SCAN_DIRS = [
    'docs',
    'patterns',
    'sessions',
    'dashboards',
]

# Directories to skip
SKIP_DIRS = {
    '.git',
    'node_modules',
    '__pycache__',
    'templates',  # Templates are meant to be orphans
}

# Files to skip (these are allowed to be orphans)
SKIP_FILES = {
    'README.md',
    'CLAUDE.md',  # Entry point, doesn't need inbound links
    'CHANGELOG.md',
}


def normalize_link(link: str) -> str:
    """Normalize a wiki-link target to a canonical, comparable form.

    Normalization: strip any ``#heading`` anchor, drop path components,
    lowercase, remove a trailing ``.md`` extension, and replace spaces
    with hyphens, so ``[[Docs/My Page.md#Intro]]`` and ``my-page`` compare
    equal.
    """
    # Drop an Obsidian-style anchor fragment ([[page#heading]] still
    # targets the page, and must not be flagged as a broken link).
    name = link.split('#', 1)[0]
    # Keep only the final path component: [[dir/page]] -> page
    name = name.split('/')[-1]
    name = name.lower()
    # Remove .md extension if present
    if name.endswith('.md'):
        name = name[:-3]
    # Spaces and hyphens are interchangeable in link names
    return name.replace(' ', '-')


def extract_wiki_links(content: str) -> List[str]:
    """Extract all normalized [[wiki-link]] targets from *content*.

    ``[[link|alias]]`` yields only the link part; targets are returned in
    document order, normalized via :func:`normalize_link`.
    """
    pattern = r'\[\[([^\]|]+)(?:\|[^\]]+)?\]\]'
    return [normalize_link(m) for m in re.findall(pattern, content)]


def file_to_link_name(filepath: Path) -> str:
    """Convert a filepath to its normalized wiki-link name."""
    return normalize_link(filepath.stem)  # stem = filename without extension


def has_principle_shape(content: str) -> Tuple[bool, bool]:
    """Return (has_principle, has_shape) header flags for *content*.

    Headers are list items of the form ``- **principle**`` / ``- **shape**``
    at the start of a line.
    """
    has_principle = bool(re.search(r'^- \*\*principle\*\*', content, re.MULTILINE))
    has_shape = bool(re.search(r'^- \*\*shape\*\*', content, re.MULTILINE))
    return has_principle, has_shape


def has_related_section(content: str) -> bool:
    """Check if content has a ``## Related`` section heading."""
    return bool(re.search(r'^## Related', content, re.MULTILINE))


def scan_directory(base_path: Path) -> Dict[str, dict]:
    """
    Scan SCAN_DIRS under *base_path* for markdown files and extract link info.

    Unreadable files are skipped with a warning on stderr. If two files
    normalize to the same link name, the later one wins and a warning is
    emitted (graph links cannot distinguish them).

    Returns:
        Dict mapping normalized filename to:
            {
                'path': full path,
                'outbound': set of outbound links,
                'has_principle': bool,
                'has_shape': bool,
                'has_related': bool,
                'title': first heading
            }
    """
    files: Dict[str, dict] = {}

    for scan_dir in SCAN_DIRS:
        dir_path = base_path / scan_dir
        if not dir_path.exists():
            continue

        for filepath in dir_path.rglob('*.md'):
            # Skip anything inside an excluded directory
            if any(skip in filepath.parts for skip in SKIP_DIRS):
                continue

            # Skip files that are allowed to be orphans
            if filepath.name in SKIP_FILES:
                continue

            try:
                content = filepath.read_text(encoding='utf-8')
            except Exception as e:
                print(f"Warning: Could not read {filepath}: {e}", file=sys.stderr)
                continue

            link_name = file_to_link_name(filepath)
            if link_name in files:
                # Name collision: last file scanned shadows the earlier one.
                print(
                    f"Warning: duplicate link name '{link_name}': "
                    f"{filepath} shadows {files[link_name]['path']}",
                    file=sys.stderr,
                )

            has_principle, has_shape = has_principle_shape(content)

            # Title = first '# ' heading, falling back to the filename stem
            title_match = re.search(r'^# (.+)$', content, re.MULTILINE)
            title = title_match.group(1) if title_match else filepath.stem

            files[link_name] = {
                'path': filepath,
                'outbound': set(extract_wiki_links(content)),
                'has_principle': has_principle,
                'has_shape': has_shape,
                'has_related': has_related_section(content),
                'title': title,
            }

    return files


def find_orphans(files: Dict[str, dict]) -> Set[str]:
    """Return the set of file names with no inbound links."""
    # Invert the outbound-link map to get inbound links per target
    inbound: Dict[str, Set[str]] = defaultdict(set)

    for source, data in files.items():
        for target in data['outbound']:
            inbound[target].add(source)

    # .get avoids inserting empty entries into the defaultdict
    return {name for name in files if not inbound.get(name)}


def find_broken_links(files: Dict[str, dict]) -> Dict[str, Set[str]]:
    """Map each source file to its outbound links that resolve to no scanned file.

    Note: links to files in SKIP_DIRS/SKIP_FILES count as broken because
    those files are never scanned.
    """
    all_files = set(files.keys())
    broken = {}

    for source, data in files.items():
        missing = data['outbound'] - all_files
        if missing:
            broken[source] = missing

    return broken


def find_missing_structure(files: Dict[str, dict]) -> Dict[str, List[str]]:
    """Map each file name to the list of structural sections it is missing."""
    missing = {}

    for filename, data in files.items():
        issues = []
        if not data['has_principle']:
            issues.append('principle')
        if not data['has_shape']:
            issues.append('shape')
        if not data['has_related']:
            issues.append('Related section')

        if issues:
            missing[filename] = issues

    return missing


def suggest_connections(files: Dict[str, dict], orphan: str) -> List[str]:
    """Suggest up to 5 pages that could link to *orphan*.

    Candidates share at least 2 outbound links with the orphan and are
    ranked by shared-link count (descending), then name, so the "top 5"
    is deterministic and actually the most similar pages.
    """
    if orphan not in files:
        return []

    orphan_outbound = files[orphan]['outbound']
    scored: List[Tuple[int, str]] = []

    for filename, data in files.items():
        if filename == orphan:
            continue
        shared = orphan_outbound & data['outbound']
        if len(shared) >= 2:  # At least 2 shared links
            scored.append((len(shared), filename))

    scored.sort(key=lambda item: (-item[0], item[1]))
    return [name for _, name in scored[:5]]  # Top 5 suggestions


def print_report(
    files: Dict[str, dict],
    orphans: Set[str],
    broken: Dict[str, Set[str]],
    missing_structure: Dict[str, List[str]],
    verbose: bool = False,
    fix_suggestions: bool = False
):
    """Print the orphan detection report to stdout.

    Sections: summary stats, orphan pages (with optional connection
    suggestions), broken links, missing structure (verbose only), and a
    composite 0-100% graph health score.
    """

    print("=" * 70)
    print("SOVEREIGN_OS ORPHAN DETECTOR")
    print("=" * 70)
    print()

    # Summary stats
    total_files = len(files)
    total_links = sum(len(d['outbound']) for d in files.values())

    print("📊 SUMMARY")
    print(f" Total files scanned: {total_files}")
    print(f" Total wiki-links: {total_links}")
    print(f" Orphan pages: {len(orphans)}")
    print(f" Files with broken links: {len(broken)}")
    print(f" Files missing structure: {len(missing_structure)}")
    print()

    # Orphans
    if orphans:
        print("=" * 70)
        print("🔴 ORPHAN PAGES (no inbound links)")
        print(" These pages cannot be discovered through graph navigation.")
        print(" A4 risk: One compression event away from oblivion.")
        print("=" * 70)
        print()

        for orphan in sorted(orphans):
            if orphan in files:
                data = files[orphan]
                print(f" 📄 {data['title']}")
                print(f" Path: {data['path']}")
                print(f" Outbound links: {len(data['outbound'])}")

                if fix_suggestions:
                    suggestions = suggest_connections(files, orphan)
                    if suggestions:
                        print(f" 💡 Could connect from: {', '.join(suggestions)}")
                print()
    else:
        print("✅ No orphan pages found!")
        print()

    # Broken links
    if broken:
        print("=" * 70)
        print("🟡 BROKEN LINKS (point to non-existent files)")
        print("=" * 70)
        print()

        for source, targets in sorted(broken.items()):
            if source in files:
                print(f" 📄 {files[source]['path']}")
                for target in sorted(targets):
                    print(f" → [[{target}]] (not found)")
                print()

    # Missing structure (only in verbose mode)
    if verbose and missing_structure:
        print("=" * 70)
        print("🟠 MISSING STRUCTURE")
        print("=" * 70)
        print()

        for filename, issues in sorted(missing_structure.items()):
            if filename in files:
                data = files[filename]
                print(f" 📄 {data['title']}")
                print(f" Path: {data['path']}")
                print(f" Missing: {', '.join(issues)}")
                print()

    # Graph health score
    print("=" * 70)
    print("📈 GRAPH HEALTH")
    print("=" * 70)
    print()

    orphan_rate = len(orphans) / total_files if total_files > 0 else 0
    structure_rate = len(missing_structure) / total_files if total_files > 0 else 0
    broken_rate = len(broken) / total_files if total_files > 0 else 0
    link_density = total_links / total_files if total_files > 0 else 0

    # Health score (0-1, higher is better): weighted penalty per problem class
    health = 1.0 - orphan_rate * 0.5 - structure_rate * 0.3 - broken_rate * 0.2
    health = max(0, min(1, health))

    print(f" Orphan rate: {orphan_rate:.1%} (target: <10%)")
    print(f" Structure compliance: {1 - structure_rate:.1%} (target: >90%)")
    print(f" Link density: {link_density:.1f} links/page (target: >3)")
    print(f" Broken link files: {len(broken)}")
    print()
    print(f" 🎯 GRAPH HEALTH SCORE: {health:.0%}")

    if health >= 0.9:
        print(" Status: Excellent - graph is well connected")
    elif health >= 0.7:
        print(" Status: Good - some orphans need attention")
    elif health >= 0.5:
        print(" Status: Fair - significant disconnection")
    else:
        print(" Status: Poor - major graph fragmentation")

    print()
    print("=" * 70)


def main():
    """Main entry point. Exits 1 when orphans exist (CI-friendly)."""
    parser = argparse.ArgumentParser(
        description='Detect orphan pages in Sovereign_OS graph'
    )
    parser.add_argument(
        '--verbose', '-v',
        action='store_true',
        help='Show detailed report including missing structure'
    )
    parser.add_argument(
        '--fix-suggestions', '-f',
        action='store_true',
        help='Suggest potential connections for orphans'
    )
    parser.add_argument(
        '--path', '-p',
        type=Path,
        default=Path(__file__).parent.parent,
        help='Path to Sovereign_OS repo (default: parent of scripts/)'
    )

    args = parser.parse_args()

    # Scan files
    print(f"Scanning {args.path}...\n")
    files = scan_directory(args.path)

    if not files:
        print("No markdown files found!", file=sys.stderr)
        sys.exit(1)

    # Analyze
    orphans = find_orphans(files)
    broken = find_broken_links(files)
    missing_structure = find_missing_structure(files)

    # Report
    print_report(
        files,
        orphans,
        broken,
        missing_structure,
        verbose=args.verbose,
        fix_suggestions=args.fix_suggestions
    )

    # Exit code based on orphan count
    if len(orphans) > 0:
        sys.exit(1)
    sys.exit(0)


if __name__ == '__main__':
    main()