# roam_backfill.py
#!/usr/bin/env python3
"""
Roam Backfill Script - Extract high-gravity concepts and create Sovereign_OS stub pages

Usage:
    python scripts/roam_backfill.py /path/to/roam-export.json

This script:
1. Parses a Roam Research JSON export
2. Identifies high-gravity concepts (frequently referenced)
3. Extracts content around those concepts
4. Creates stub pages in Sovereign_OS with shapes and connections
"""

import json
import re
import os
import sys
from collections import Counter, defaultdict
from pathlib import Path


# Noise to filter out
NOISE_CONCEPTS = {
    'TODO', 'DONE', 'orange', 'pink', 'blue', 'green', 'yellow', 'red',
    'Quick Capture', 'Roam-Highlights', 'embed', 'pdf', 'favorite',
    'Highlights', 'Matter', 'Quick capture', 'youtube', 'call', 'Email',
    'highlights', 'resume', 'Tweet', 'Twitter', 'article', 'books',
    'calc', 'slider', 'cover letter', 'nft', 'crypto'
}

MONTHS = ['January', 'February', 'March', 'April', 'May', 'June',
          'July', 'August', 'September', 'October', 'November', 'December']

# [[wiki-link]] matcher; captures the text between the double brackets.
_WIKI_LINK_RE = re.compile(r'\[\[([^\]]+)\]\]')

# Month names as whole words only, so "Maybe" or "Marching" are not
# mistaken for date pages.
_MONTH_RE = re.compile(r'\b(?:' + '|'.join(MONTHS) + r')\b')

# Definitional patterns tried, in order, after the concept name.
# Each captures the candidate definition in group 1.
_SHAPE_PATTERNS = (
    r'::?\s*(.{20,200})',       # property syntax
    r'\bis\s+(.{20,150})',      # "X is ..." (whole word, not "this")
    r'\bmeans?\s+(.{20,150})',  # "X means ..."
    r'-\s+(.{20,150})',         # bullet definition
)


def extract_links_from_block(obj, links=None, blocks=None, path=None):
    """Recursively extract [[wiki-links]] and their containing blocks.

    Args:
        obj: A node of the parsed export. Only dicts are inspected; a
            dict's 'string' value is scanned for links and its 'children'
            list is walked recursively. Anything else is ignored.
        links: Accumulator list of link names (one entry per occurrence).
            A fresh list is created when omitted.
        blocks: Accumulator mapping link name -> list of
            {'text', 'path', 'uid'} dicts describing each block that
            mentions the link. A fresh dict is created when omitted.
        path: List of ancestor 'string'/'title' values leading to ``obj``.

    Returns:
        The ``(links, blocks)`` accumulators.
    """
    if links is None:
        links = []
    if blocks is None:
        blocks = {}
    if path is None:
        path = []

    if isinstance(obj, dict):
        text = obj.get('string')
        # Guard against a null/non-text 'string' value, which would make
        # the regex scan raise TypeError.
        if isinstance(text, str):
            for link in _WIKI_LINK_RE.findall(text):
                links.append(link)
                blocks.setdefault(link, []).append({
                    'text': text,
                    'path': path.copy(),
                    'uid': obj.get('uid', ''),
                })

        children = obj.get('children')
        if children:  # also skips an explicit null / empty list
            new_path = path + [obj.get('string', obj.get('title', ''))]
            for child in children:
                extract_links_from_block(child, links, blocks, new_path)

    return links, blocks


def is_concept(link):
    """Filter out noise, dates, and invalid concepts.

    Returns False for entries of NOISE_CONCEPTS, links containing a month
    name as a whole word, links starting with '[[' (nested links),
    '(highlights)' pages, links shorter than 3 or longer than 60
    characters, and purely numeric links. Returns True otherwise.
    """
    if link in NOISE_CONCEPTS:
        return False
    if _MONTH_RE.search(link):
        return False
    if link.startswith('[['):  # nested links
        return False
    if '(highlights)' in link.lower():
        return False
    if len(link) < 3 or len(link) > 60:
        return False
    if link.isdigit():
        return False
    return True


def extract_shape_from_blocks(blocks, concept):
    """Try to extract a shape/definition from the blocks mentioning a concept.

    Only the first five blocks are examined. For each one, the patterns in
    _SHAPE_PATTERNS are tried in order (case-insensitively) on the text
    following the concept name.

    Args:
        blocks: List of block dicts, each with a 'text' entry.
        concept: The concept name to look for.

    Returns:
        The first captured definition longer than 30 characters, with
        wiki-link brackets removed and capped at 200 characters, or None
        when nothing suitable is found.
    """
    escaped = re.escape(concept)

    for block in blocks[:5]:  # Check first 5 blocks
        text = block['text']
        for pattern in _SHAPE_PATTERNS:
            match = re.search(f'{escaped}.*{pattern}', text, re.IGNORECASE)
            if not match:
                continue
            shape = match.group(1).strip()
            shape = re.sub(r'\[\[|\]\]', '', shape)  # Remove wiki-link brackets
            shape = shape[:200]  # Truncate
            if len(shape) > 30:
                return shape

    return None


def _safe_filename(concept):
    """Turn a concept name into a file-system-friendly stem."""
    safe_name = re.sub(r'[^\w\s-]', '', concept).strip()
    safe_name = re.sub(r'\s+', '-', safe_name)
    if not safe_name:
        # The concept had no word characters at all (e.g. "???"); fall
        # back to a deterministic name instead of writing to ".md".
        safe_name = 'concept-' + concept.encode('utf-8').hex()[:16]
    return safe_name


def create_stub_page(concept, blocks, link_count, output_dir):
    """Create a stub page for a concept in Sovereign_OS.

    Args:
        concept: Concept name; used as the page title and, sanitized, as
            the file name.
        blocks: Block dicts (with a 'text' entry) that mention the concept.
            The first three are quoted as context.
        link_count: Number of references, shown in the page header.
        output_dir: Directory the '<name>.md' file is written into.

    Returns:
        Path of the written file. An existing file with the same name is
        overwritten; distinct concepts that sanitize to the same name
        therefore share one file.
    """
    shape = extract_shape_from_blocks(blocks, concept)

    # Sample blocks for context
    sample_blocks = blocks[:3]

    content = f"""# {concept}

*Backfilled from Roam Research | {link_count} references*

---

- **source**
  - Roam Research graph backfill
  - referenced {link_count} times across graph

"""

    if shape:
        content += f"""- **shape**
  - {shape}

"""

    content += """---

## Context from Roam

"""

    for block in sample_blocks:
        # Flatten multi-line blocks and cap their length.
        text = block['text'].replace('\n', ' ').strip()
        if len(text) > 300:
            text = text[:300] + '...'
        content += f"- {text}\n"

    content += """
---

## Related

- **needs**
  - shape definition
  - connections to axioms
  - integration with Sovereign_OS concepts

---

*Backfilled stub - requires human curation*
"""

    filepath = Path(output_dir) / f"{_safe_filename(concept)}.md"

    # Explicit UTF-8: note text routinely contains non-ASCII characters
    # and the platform default encoding may not be able to represent them.
    filepath.write_text(content, encoding='utf-8')

    return filepath


def main(roam_export_path, min_references=5, max_concepts=50, output_dir=None):
    """Main backfill process.

    Args:
        roam_export_path: Path to the Roam JSON export (an iterable of
            page dicts once parsed).
        min_references: Minimum number of references a link needs to be
            kept as a concept.
        max_concepts: Maximum number of stub pages to create (most
            referenced first).
        output_dir: Directory for the stub pages and index. Defaults to
            '<script dir>/../sessions/backfill'.

    Returns:
        List of ``(concept, count, filepath)`` tuples for the created stubs.
    """
    print(f"Loading Roam export: {roam_export_path}")
    with open(roam_export_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    print(f"Found {len(data)} pages")

    # Extract all links and their blocks
    all_links = []
    all_blocks = defaultdict(list)

    for page in data:
        links, blocks = extract_links_from_block(page)
        all_links.extend(links)
        for concept, block_list in blocks.items():
            all_blocks[concept].extend(block_list)

    print(f"Found {len(all_links)} total link references")

    # Count and filter
    link_counts = Counter(all_links)
    concepts = {
        k: v for k, v in link_counts.items()
        if is_concept(k) and v >= min_references
    }

    print(f"Found {len(concepts)} concepts with >= {min_references} references")

    # Sort by count
    sorted_concepts = sorted(concepts.items(), key=lambda x: -x[1])[:max_concepts]

    # Create output directory
    if output_dir is None:
        output_dir = Path(__file__).parent.parent / 'sessions' / 'backfill'
    else:
        output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"\nCreating stub pages in: {output_dir}")

    created = []
    for concept, count in sorted_concepts:
        blocks = all_blocks.get(concept, [])
        filepath = create_stub_page(concept, blocks, count, output_dir)
        created.append((concept, count, filepath))
        print(f" Created: {concept} ({count}x)")

    # Create index
    index_content = """# Roam Backfill Index

*Concepts extracted from Roam Research for integration into Sovereign_OS*

---

## High-Gravity Concepts

| Concept | References | Status |
|---------|------------|--------|
"""

    index_content += ''.join(
        f"| [[{concept}]] | {count}x | stub |\n"
        for concept, count, _ in created
    )

    index_content += """
---

## Integration Process

1. Review each stub page
2. Add proper shape definition
3. Connect to relevant axioms (A0-A3)
4. Link to related Sovereign_OS protocols
5. Update status when curated

---

*Generated by roam_backfill.py*
"""

    index_path = output_dir / 'BACKFILL-INDEX.md'
    index_path.write_text(index_content, encoding='utf-8')

    print(f"\nCreated index: {index_path}")
    print(f"Total stubs created: {len(created)}")

    return created


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print("Usage: python roam_backfill.py /path/to/roam-export.json")
        sys.exit(1)

    roam_path = sys.argv[1]
    main(roam_path)