/ scripts / roam_backfill.py
roam_backfill.py
  1  #!/usr/bin/env python3
  2  """
  3  Roam Backfill Script - Extract high-gravity concepts and create Sovereign_OS stub pages
  4  
  5  Usage:
  6      python scripts/roam_backfill.py /path/to/roam-export.json
  7  
  8  This script:
  9  1. Parses a Roam Research JSON export
 10  2. Identifies high-gravity concepts (frequently referenced)
 11  3. Extracts content around those concepts
 12  4. Creates stub pages in Sovereign_OS with shapes and connections
 13  """
 14  
 15  import json
 16  import re
 17  import os
 18  import sys
 19  from collections import Counter, defaultdict
 20  from pathlib import Path
 21  
 22  
 23  # Noise to filter out
 24  NOISE_CONCEPTS = {
 25      'TODO', 'DONE', 'orange', 'pink', 'blue', 'green', 'yellow', 'red',
 26      'Quick Capture', 'Roam-Highlights', 'embed', 'pdf', 'favorite',
 27      'Highlights', 'Matter', 'Quick capture', 'youtube', 'call', 'Email',
 28      'highlights', 'resume', 'Tweet', 'Twitter', 'article', 'books',
 29      'calc', 'slider', 'cover letter', 'nft', 'crypto'
 30  }
 31  
 32  MONTHS = ['January', 'February', 'March', 'April', 'May', 'June',
 33            'July', 'August', 'September', 'October', 'November', 'December']
 34  
 35  
 36  def extract_links_from_block(obj, links=None, blocks=None, path=None):
 37      """Recursively extract [[wiki-links]] and their containing blocks."""
 38      if links is None:
 39          links = []
 40      if blocks is None:
 41          blocks = {}
 42      if path is None:
 43          path = []
 44  
 45      if isinstance(obj, dict):
 46          if 'string' in obj:
 47              text = obj.get('string', '')
 48              found = re.findall(r'\[\[([^\]]+)\]\]', text)
 49              for link in found:
 50                  links.append(link)
 51                  if link not in blocks:
 52                      blocks[link] = []
 53                  blocks[link].append({
 54                      'text': text,
 55                      'path': path.copy(),
 56                      'uid': obj.get('uid', '')
 57                  })
 58  
 59          if 'children' in obj:
 60              new_path = path + [obj.get('string', obj.get('title', ''))]
 61              for child in obj['children']:
 62                  extract_links_from_block(child, links, blocks, new_path)
 63  
 64      return links, blocks
 65  
 66  
 67  def is_concept(link):
 68      """Filter out noise, dates, and invalid concepts."""
 69      if link in NOISE_CONCEPTS:
 70          return False
 71      if any(month in link for month in MONTHS):
 72          return False
 73      if link.startswith('[['):  # nested links
 74          return False
 75      if '(highlights)' in link.lower():
 76          return False
 77      if len(link) < 3 or len(link) > 60:
 78          return False
 79      if link.isdigit():
 80          return False
 81      return True
 82  
 83  
 84  def extract_shape_from_blocks(blocks, concept):
 85      """Try to extract a shape/definition from the blocks mentioning a concept."""
 86      # Look for definitional patterns
 87      patterns = [
 88          r'::?\s*(.{20,200})',  # property syntax
 89          r'is\s+(.{20,150})',  # "X is ..."
 90          r'means?\s+(.{20,150})',  # "X means ..."
 91          r'-\s+(.{20,150})',  # bullet definition
 92      ]
 93  
 94      for block in blocks[:5]:  # Check first 5 blocks
 95          text = block['text']
 96          for pattern in patterns:
 97              match = re.search(f'{re.escape(concept)}.*{pattern}', text, re.IGNORECASE)
 98              if match:
 99                  shape = match.group(1).strip()
100                  # Clean up
101                  shape = re.sub(r'\[\[|\]\]', '', shape)  # Remove wiki-link brackets
102                  shape = shape[:200]  # Truncate
103                  if len(shape) > 30:
104                      return shape
105  
106      return None
107  
108  
def _stub_filename(concept):
    """Return a filesystem-safe ``.md`` file name for a concept."""
    safe_name = re.sub(r'[^\w\s-]', '', concept).strip()
    safe_name = re.sub(r'\s+', '-', safe_name)
    if not safe_name:
        # The concept was punctuation only; derive a deterministic name from
        # its code points instead of writing a hidden ".md" file.
        safe_name = 'concept-' + '-'.join(f'{ord(ch):x}' for ch in concept)
    return f"{safe_name}.md"


def create_stub_page(concept, blocks, link_count, output_dir):
    """Write a Markdown stub page for a concept and return its path.

    The page holds the reference count, an extracted shape when one is
    found, up to three sample blocks (each flattened to one line and cut
    at 300 characters) and a fixed "Related" section. An existing file of
    the same name is overwritten.

    Args:
        concept: Concept name; used as the page title and file name.
        blocks: List of block dicts (with a ``text`` entry) mentioning it.
        link_count: Number of references to the concept across the graph.
        output_dir: Directory to write into, as a ``Path`` or a string.
            It must already exist.

    Returns:
        ``Path`` of the file that was written.
    """
    shape = extract_shape_from_blocks(blocks, concept)

    parts = [f"""# {concept}

*Backfilled from Roam Research | {link_count} references*

---

- **source**
  - Roam Research graph backfill
  - referenced {link_count} times across graph

"""]

    if shape:
        parts.append(f"""- **shape**
  - {shape}

""")

    parts.append("""---

## Context from Roam

""")

    # Sample blocks for context
    for block in blocks[:3]:
        text = block['text'].replace('\n', ' ').strip()
        if len(text) > 300:
            text = text[:300] + '...'
        parts.append(f"- {text}\n")

    parts.append("""
---

## Related

- **needs**
  - shape definition
  - connections to axioms
  - integration with Sovereign_OS concepts

---

*Backfilled stub - requires human curation*
""")

    filepath = Path(output_dir) / _stub_filename(concept)

    # Roam text is frequently non-ASCII; do not rely on the locale encoding.
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(''.join(parts))

    return filepath
173  
174  
def main(roam_export_path, min_references=5, max_concepts=50, output_dir=None):
    """Run the backfill: load the export, rank concepts, write stubs and an index.

    Args:
        roam_export_path: Path to the Roam Research JSON export.
        min_references: Minimum number of link occurrences a concept needs
            to be kept.
        max_concepts: Maximum number of stub pages to create.
        output_dir: Directory for the stubs and the index. Defaults to
            ``sessions/backfill`` under the parent of this script's
            directory. It is created if missing.

    Returns:
        List of ``(concept, count, filepath)`` tuples, most referenced first.
    """

    print(f"Loading Roam export: {roam_export_path}")
    # Read as UTF-8 explicitly rather than with the platform's locale encoding.
    with open(roam_export_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    print(f"Found {len(data)} pages")

    # Extract all links and their blocks
    all_links = []
    all_blocks = defaultdict(list)

    for page in data:
        links, blocks = extract_links_from_block(page)
        all_links.extend(links)
        for concept, block_list in blocks.items():
            all_blocks[concept].extend(block_list)

    print(f"Found {len(all_links)} total link references")

    # Count and filter
    link_counts = Counter(all_links)
    concepts = {
        k: v for k, v in link_counts.items()
        if is_concept(k) and v >= min_references
    }

    print(f"Found {len(concepts)} concepts with >= {min_references} references")

    # Sort by count (stable, so ties keep first-seen order)
    sorted_concepts = sorted(concepts.items(), key=lambda x: -x[1])[:max_concepts]

    # Create output directory
    if output_dir is None:
        output_dir = Path(__file__).parent.parent / 'sessions' / 'backfill'
    else:
        output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"\nCreating stub pages in: {output_dir}")

    created = []
    for concept, count in sorted_concepts:
        # .get() avoids inserting empty entries into the defaultdict.
        blocks = all_blocks.get(concept, [])
        filepath = create_stub_page(concept, blocks, count, output_dir)
        created.append((concept, count, filepath))
        print(f"  Created: {concept} ({count}x)")

    # Create index
    index_header = """# Roam Backfill Index

*Concepts extracted from Roam Research for integration into Sovereign_OS*

---

## High-Gravity Concepts

| Concept | References | Status |
|---------|------------|--------|
"""

    index_rows = ''.join(
        f"| [[{concept}]] | {count}x | stub |\n"
        for concept, count, _ in created
    )

    index_footer = """
---

## Integration Process

1. Review each stub page
2. Add proper shape definition
3. Connect to relevant axioms (A0-A3)
4. Link to related Sovereign_OS protocols
5. Update status when curated

---

*Generated by roam_backfill.py*
"""

    index_path = output_dir / 'BACKFILL-INDEX.md'
    with open(index_path, 'w', encoding='utf-8') as f:
        f.write(index_header + index_rows + index_footer)

    print(f"\nCreated index: {index_path}")
    print(f"Total stubs created: {len(created)}")

    return created
261  
262  
if __name__ == '__main__':
    # Command-line entry point: the export path is the first positional
    # argument; any further arguments are ignored.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python roam_backfill.py /path/to/roam-export.json")
        sys.exit(1)

    main(cli_args[0])