#!/usr/bin/env python3
"""
Orphan Detector - Find unconnected pages in Sovereign_OS graph

Usage:
    python scripts/orphan_detector.py [--verbose] [--fix-suggestions] [--path PATH]

This script:
1. Scans all markdown files in the repo
2. Extracts wiki-links [[like-this]]
3. Identifies orphan pages (no inbound links)
4. Reports pages missing principle/shape headers
5. Suggests connections based on content similarity

A4 Ergodicity Application:
    Orphan pages are at risk of "ruin" - they can be forgotten,
    lost in compression, or never discovered. This script prevents
    page ruin by surfacing disconnection before it becomes permanent.
"""

import argparse
import re
import sys
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Set, Tuple


# Directories to scan
SCAN_DIRS = [
    'docs',
    'patterns',
    'sessions',
    'dashboards',
]

# Directories to skip
SKIP_DIRS = {
    '.git',
    'node_modules',
    '__pycache__',
    'templates',  # Templates are meant to be orphans
}

# Files to skip (these are allowed to be orphans)
SKIP_FILES = {
    'README.md',
    'CLAUDE.md',  # Entry point, doesn't need inbound links
    'CHANGELOG.md',
}


def normalize_link(link: str) -> str:
    """Normalize a wiki-link to a comparable format.
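
    Example (illustrative):
        >>> normalize_link('Docs/My Page.md')
        'my-page'
    """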
    # Remove any path components, get just the name
    name = link.split('/')[-1]
    # Convert to lowercase for comparison
    name = name.lower()
    # Remove .md extension if present
    if name.endswith('.md'):
        name = name[:-3]
    # Replace spaces with hyphens
    name = name.replace(' ', '-')
    return name


def extract_wiki_links(content: str) -> List[str]:
    """Extract all [[wiki-links]] from content.
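
    Example (illustrative; the alias in [[link|alias]] is dropped):
        >>> extract_wiki_links('See [[Page One]] and [[page-two|alias]].')
        ['page-one', 'page-two']
    """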
    # Match both [[link]] and [[link|alias]], capturing only the link part
    pattern = r'\[\[([^\]|]+)(?:\|[^\]]+)?\]\]'
    matches = re.findall(pattern, content)
    return [normalize_link(m) for m in matches]


def file_to_link_name(filepath: Path) -> str:
    """Convert a filepath to its wiki-link name.
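
    Example (illustrative):
        >>> file_to_link_name(Path('docs/My Page.md'))
        'my-page'
    """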
    name = filepath.stem  # filename without extension
    return normalize_link(name)


def has_principle_shape(content: str) -> Tuple[bool, bool]:
    """Check if content has principle and shape headers.
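
    Example (illustrative, matching the bullet format checked below):
        >>> has_principle_shape('- **principle**: x\\n- **shape**: y')
        (True, True)
    """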
    has_principle = bool(re.search(r'^- \*\*principle\*\*', content, re.MULTILINE))
    has_shape = bool(re.search(r'^- \*\*shape\*\*', content, re.MULTILINE))
    return has_principle, has_shape


def has_related_section(content: str) -> bool:
    """Check if content has a Related section."""
    return bool(re.search(r'^## Related', content, re.MULTILINE))


def scan_directory(base_path: Path) -> Dict[str, dict]:
    """
    Scan directory for markdown files and extract link information.

    Returns:
        Dict mapping normalized filename to:
        {
            'path': full path,
            'outbound': set of outbound links,
            'has_principle': bool,
            'has_shape': bool,
            'has_related': bool,
            'title': first heading
        }
    """
    files = {}

    for scan_dir in SCAN_DIRS:
        dir_path = base_path / scan_dir
        if not dir_path.exists():
            continue

        for filepath in dir_path.rglob('*.md'):
            # Skip files inside excluded directories
            if any(skip in filepath.parts for skip in SKIP_DIRS):
                continue

            # Skip specific files
            if filepath.name in SKIP_FILES:
                continue
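            # Note: links *from* skipped files are dropped too, so a page
            # referenced only from README.md or CLAUDE.md will still be
            # reported as an orphan.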

            try:
                content = filepath.read_text(encoding='utf-8')
            except Exception as e:
                print(f"Warning: Could not read {filepath}: {e}", file=sys.stderr)
                continue

            link_name = file_to_link_name(filepath)
            outbound = set(extract_wiki_links(content))
            has_principle, has_shape = has_principle_shape(content)
            has_related = has_related_section(content)

            # Extract title (first # heading)
            title_match = re.search(r'^# (.+)$', content, re.MULTILINE)
            title = title_match.group(1) if title_match else filepath.stem

            files[link_name] = {
                'path': filepath,
                'outbound': outbound,
                'has_principle': has_principle,
                'has_shape': has_shape,
                'has_related': has_related,
                'title': title,
            }

    return files


def find_orphans(files: Dict[str, dict]) -> Set[str]:
    """Find files with no inbound links.
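
    Example (illustrative; only the 'outbound' sets are consulted):
        >>> find_orphans({'a': {'outbound': {'b'}}, 'b': {'outbound': set()}})
        {'a'}
    """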
    # Build inbound link map
    inbound: Dict[str, Set[str]] = defaultdict(set)

    for source, data in files.items():
        for target in data['outbound']:
            inbound[target].add(source)

    # Find orphans (files with no inbound links)
    orphans = set()
    for filename in files:
        if not inbound.get(filename):
            orphans.add(filename)

    return orphans


def find_broken_links(files: Dict[str, dict]) -> Dict[str, Set[str]]:
    """Find links that point to non-existent files.
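
    Example (illustrative):
        >>> find_broken_links({'a': {'outbound': {'ghost'}}})
        {'a': {'ghost'}}
    """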
    all_files = set(files.keys())
    broken = {}

    for source, data in files.items():
        missing = data['outbound'] - all_files
        if missing:
            broken[source] = missing

    return broken


def find_missing_structure(files: Dict[str, dict]) -> Dict[str, List[str]]:
    """Find files missing principle/shape/related sections.
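
    Example (illustrative; keys mirror scan_directory's output):
        >>> find_missing_structure({'a': {'has_principle': True,
        ...                               'has_shape': False,
        ...                               'has_related': True}})
        {'a': ['shape']}
    """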
    missing = {}

    for filename, data in files.items():
        issues = []
        if not data['has_principle']:
            issues.append('principle')
        if not data['has_shape']:
            issues.append('shape')
        if not data['has_related']:
            issues.append('Related section')

        if issues:
            missing[filename] = issues

    return missing


def suggest_connections(files: Dict[str, dict], orphan: str) -> List[str]:
    """Suggest potential connections for an orphan based on content similarity.
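
    Heuristic: a page that shares at least two outbound links with the
    orphan (co-citation) is suggested as a likely place to link from.

    Example (illustrative):
        >>> files = {'orphan': {'outbound': {'x', 'y'}},
        ...          'peer': {'outbound': {'x', 'y', 'z'}},
        ...          'other': {'outbound': {'z'}}}
        >>> suggest_connections(files, 'orphan')
        ['peer']
    """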
    if orphan not in files:
        return []

    orphan_outbound = files[orphan]['outbound']
    suggestions = []

    # Find files that link to the same things the orphan links to
    for filename, data in files.items():
        if filename == orphan:
            continue

        # Check for shared outbound links
        shared = orphan_outbound & data['outbound']
        if len(shared) >= 2:  # At least 2 shared links
            suggestions.append(filename)

    return suggestions[:5]  # First five matches (unranked)


def print_report(
    files: Dict[str, dict],
    orphans: Set[str],
    broken: Dict[str, Set[str]],
    missing_structure: Dict[str, List[str]],
    verbose: bool = False,
    fix_suggestions: bool = False
):
    """Print the orphan detection report."""

    print("=" * 70)
    print("SOVEREIGN_OS ORPHAN DETECTOR")
    print("=" * 70)
    print()

    # Summary stats
    total_files = len(files)
    total_links = sum(len(d['outbound']) for d in files.values())

    print("📊 SUMMARY")
    print(f"   Total files scanned: {total_files}")
    print(f"   Total wiki-links: {total_links}")
    print(f"   Orphan pages: {len(orphans)}")
    print(f"   Files with broken links: {len(broken)}")
    print(f"   Files missing structure: {len(missing_structure)}")
    print()

    # Orphans
    if orphans:
        print("=" * 70)
        print("🔴 ORPHAN PAGES (no inbound links)")
        print("   These pages cannot be discovered through graph navigation.")
        print("   A4 risk: One compression event away from oblivion.")
        print("=" * 70)
        print()

        for orphan in sorted(orphans):
            if orphan in files:
                data = files[orphan]
                print(f"   📄 {data['title']}")
                print(f"      Path: {data['path']}")
                print(f"      Outbound links: {len(data['outbound'])}")

                if fix_suggestions:
                    suggestions = suggest_connections(files, orphan)
                    if suggestions:
                        print(f"      💡 Could connect from: {', '.join(suggestions)}")
                print()
    else:
        print("✅ No orphan pages found!")
        print()

    # Broken links
    if broken:
        print("=" * 70)
        print("🟡 BROKEN LINKS (point to non-existent files)")
        print("=" * 70)
        print()

        for source, targets in sorted(broken.items()):
            if source in files:
                print(f"   📄 {files[source]['path']}")
                for target in sorted(targets):
                    print(f"      → [[{target}]] (not found)")
                print()

    # Missing structure (only in verbose mode)
    if verbose and missing_structure:
        print("=" * 70)
        print("🟠 MISSING STRUCTURE")
        print("=" * 70)
        print()

        for filename, issues in sorted(missing_structure.items()):
            if filename in files:
                data = files[filename]
                print(f"   📄 {data['title']}")
                print(f"      Path: {data['path']}")
                print(f"      Missing: {', '.join(issues)}")
                print()

    # Graph health score
    print("=" * 70)
    print("📈 GRAPH HEALTH")
    print("=" * 70)
    print()

    orphan_rate = len(orphans) / total_files if total_files > 0 else 0
    structure_rate = len(missing_structure) / total_files if total_files > 0 else 0
    link_density = total_links / total_files if total_files > 0 else 0

    # Health score (0-1, higher is better)
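    # Weights are heuristic: orphans cost up to 0.5, missing structure
    # up to 0.3, and broken-link files up to 0.2.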
    broken_rate = len(broken) / total_files if total_files > 0 else 0
    health = 1.0 - (orphan_rate * 0.5) - (structure_rate * 0.3) - (broken_rate * 0.2)
    health = max(0, min(1, health))

    print(f"   Orphan rate: {orphan_rate:.1%} (target: <10%)")
    print(f"   Structure compliance: {1 - structure_rate:.1%} (target: >90%)")
    print(f"   Link density: {link_density:.1f} links/page (target: >3)")
    print(f"   Broken link files: {len(broken)}")
    print()
    print(f"   🎯 GRAPH HEALTH SCORE: {health:.0%}")

    if health >= 0.9:
        print("      Status: Excellent - graph is well connected")
    elif health >= 0.7:
        print("      Status: Good - some orphans need attention")
    elif health >= 0.5:
        print("      Status: Fair - significant disconnection")
    else:
        print("      Status: Poor - major graph fragmentation")

    print()
    print("=" * 70)


def main():
    """Main entry point."""
    parser = argparse.ArgumentParser(
        description='Detect orphan pages in Sovereign_OS graph'
    )
    parser.add_argument(
        '--verbose', '-v',
        action='store_true',
        help='Show detailed report including missing structure'
    )
    parser.add_argument(
        '--fix-suggestions', '-f',
        action='store_true',
        help='Suggest potential connections for orphans'
    )
    parser.add_argument(
        '--path', '-p',
        type=Path,
        default=Path(__file__).parent.parent,
        help='Path to Sovereign_OS repo (default: parent of scripts/)'
    )

    args = parser.parse_args()

    # Scan files
    print(f"Scanning {args.path}...\n")
    files = scan_directory(args.path)

    if not files:
        print("No markdown files found!", file=sys.stderr)
        sys.exit(1)

    # Analyze
    orphans = find_orphans(files)
    broken = find_broken_links(files)
    missing_structure = find_missing_structure(files)

    # Report
    print_report(
        files,
        orphans,
        broken,
        missing_structure,
        verbose=args.verbose,
        fix_suggestions=args.fix_suggestions
    )

    # Exit nonzero if any orphans were found
    sys.exit(1 if orphans else 0)


if __name__ == '__main__':
    main()