#!/usr/bin/env python3
"""
Sovereign OS - Health Monitor Daemon (health_monitor.py)

Continuously monitors daemon health and:
1. Restarts crashed daemons via LaunchAgents
2. Writes health status to file for other tools
3. Alerts via terminal-notifier if available

Run as: python3 scripts/health_monitor.py
"""

import os
import sys
import subprocess
import json
import time
from pathlib import Path
from datetime import datetime
from typing import Optional

# Configuration
CHECK_INTERVAL = 30  # seconds between checks
MAX_RESTART_ATTEMPTS = 3
RESTART_COOLDOWN = 300  # seconds before resetting restart counter
COMMAND_TIMEOUT = 5  # seconds allowed for each external helper command
STARTUP_GRACE = 3  # seconds to wait for a daemon to come up after a reload

LOGS_DIR = Path.home() / ".sovereign" / "logs"
LOGS_DIR.mkdir(parents=True, exist_ok=True)
HEALTH_STATUS_FILE = LOGS_DIR / "health-status.json"

# Daemons to monitor
DAEMONS = {
    "mission-control": {
        "patterns": ["mission_control", "run_mission_control"],
        "launch_agent": "com.sovereign.mission-control",
        "critical": True,
    },
    "first-officer": {
        "patterns": ["first_officer_local", "run_first_officer"],
        "launch_agent": "com.sovereign.first-officer",
        "critical": True,
    },
    "mesh": {
        "patterns": ["sos.js", "sovereign-mesh"],
        "launch_agent": "com.sovereign.mesh",
        "critical": False,  # System works without mesh
    },
}

# Track restart attempts. "alerted" remembers that the give-up notification
# was already sent for the current outage so it is not repeated every check.
restart_attempts = {
    name: {"count": 0, "last_attempt": 0, "alerted": False} for name in DAEMONS
}

# Reference point for "uptime_seconds". Initialised at import so that
# write_health_status() also works when main() was never called; main()
# resets it when the monitor actually starts.
start_time = time.time()

# Only failures of the helper process itself are tolerated (binary missing,
# not executable, timed out). Anything else is a programming error and must
# surface; in particular KeyboardInterrupt must never be swallowed.
_COMMAND_ERRORS = (OSError, subprocess.SubprocessError)


def log(msg: str, level: str = "INFO"):
    """Print *msg* to stdout prefixed with a timestamp and severity *level*."""
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # Flush so the line appears promptly even when stdout is redirected to a
    # log file (block-buffered) instead of a terminal.
    print(f"[{timestamp}] [{level}] {msg}", flush=True)


def check_daemon_running(patterns: list) -> bool:
    """Return True if ``pgrep -f`` finds a process for any of *patterns*.

    A pattern whose ``pgrep`` call cannot be run or times out is logged and
    treated as "no match"; the remaining patterns are still tried.
    """
    for pattern in patterns:
        try:
            result = subprocess.run(
                ["pgrep", "-f", pattern],
                capture_output=True,
                text=True,
                timeout=COMMAND_TIMEOUT,
            )
        except _COMMAND_ERRORS as e:
            log(f"pgrep failed for pattern {pattern!r}: {e}", "WARN")
            continue
        if result.returncode == 0:
            return True
    return False


def restart_daemon(name: str, config: dict) -> bool:
    """Attempt to restart a daemon by reloading its LaunchAgent.

    Args:
        name: Daemon name, used only in log messages.
        config: Daemon entry; ``config["launch_agent"]`` is the LaunchAgent
            label whose plist is looked up under ``~/Library/LaunchAgents``.

    Returns:
        True if ``launchctl load`` exited with status 0, False if the plist
        is missing, the load failed, or launchctl could not be run.
    """
    plist_path = (
        Path.home() / "Library" / "LaunchAgents" / f"{config['launch_agent']}.plist"
    )

    if not plist_path.exists():
        log(f"LaunchAgent not found: {plist_path}", "ERROR")
        return False

    try:
        # Unload first; its result is deliberately ignored because the agent
        # may simply not be loaded at the moment.
        subprocess.run(
            ["launchctl", "unload", str(plist_path)],
            capture_output=True,
            timeout=COMMAND_TIMEOUT,
        )
        time.sleep(1)
        # Load fresh
        result = subprocess.run(
            ["launchctl", "load", str(plist_path)],
            capture_output=True,
            text=True,
            timeout=COMMAND_TIMEOUT,
        )
    except _COMMAND_ERRORS as e:
        log(f"Failed to restart {name}: {e}", "ERROR")
        return False

    if result.returncode != 0:
        detail = (result.stderr or "").strip()
        log(f"launchctl load failed for {name}: {detail}", "ERROR")
        return False
    return True


def _applescript_quote(text: str) -> str:
    """Escape *text* for use inside a double-quoted AppleScript string."""
    return text.replace("\\", "\\\\").replace('"', '\\"')


def send_notification(title: str, message: str):
    """Send a macOS notification, best effort.

    Tries ``terminal-notifier`` first; if it cannot be run, falls back to
    ``osascript``. If neither can be run the notification is dropped.
    """
    try:
        subprocess.run(
            [
                "terminal-notifier",
                "-title", title,
                "-message", message,
                "-sound", "default",
            ],
            capture_output=True,
            timeout=COMMAND_TIMEOUT,
        )
        return
    except _COMMAND_ERRORS:
        pass  # terminal-notifier not installed or unusable, use osascript

    # Title and message are spliced into AppleScript source, so quotes and
    # backslashes must be escaped or they would terminate the string literal.
    script = (
        f'display notification "{_applescript_quote(message)}" '
        f'with title "{_applescript_quote(title)}"'
    )
    try:
        subprocess.run(
            ["osascript", "-e", script],
            capture_output=True,
            timeout=COMMAND_TIMEOUT,
        )
    except _COMMAND_ERRORS:
        pass  # No notification mechanism available


def write_health_status(status: dict, path: Optional[Path] = None):
    """Write *status* as JSON, adding ``timestamp`` and ``uptime_seconds``.

    Args:
        status: Health dictionary; it is updated in place with the two extra
            keys before being serialised.
        path: Destination file. Defaults to ``HEALTH_STATUS_FILE``.

    The data is written to a sibling ``.tmp`` file and then moved over the
    destination, so a concurrent reader never sees a half-written file.
    """
    target = HEALTH_STATUS_FILE if path is None else Path(path)
    status["timestamp"] = datetime.now().isoformat()
    status["uptime_seconds"] = time.time() - start_time

    tmp_path = target.with_name(target.name + ".tmp")
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(status, f, indent=2)
    os.replace(tmp_path, target)


def _record_failed_attempt(attempts: dict):
    """Count one failed restart and remember when it happened."""
    attempts["count"] += 1
    attempts["last_attempt"] = time.time()


def _try_restart(name: str, config: dict, attempts: dict) -> bool:
    """Run one restart attempt for *name*; return True if it is running after."""
    log(
        f"⚠ {name} is down, attempting restart "
        f"({attempts['count'] + 1}/{MAX_RESTART_ATTEMPTS})...",
        "WARN",
    )

    if not restart_daemon(name, config):
        _record_failed_attempt(attempts)
        return False

    time.sleep(STARTUP_GRACE)  # Wait for daemon to start
    if check_daemon_running(config["patterns"]):
        log(f"✓ {name} restarted successfully")
        send_notification("Sovereign OS", f"Restarted {name} daemon")
        return True

    log(f"✗ {name} failed to start after restart", "ERROR")
    _record_failed_attempt(attempts)
    return False


def check_health() -> dict:
    """Check every daemon in ``DAEMONS`` and restart the ones that are down.

    Returns:
        ``{"daemons": {name: state}, "healthy": bool}`` where state is
        ``"running"``, ``"restarted"`` or ``"down"``. ``healthy`` is False
        when at least one critical daemon is still down after this check.

    At most ``MAX_RESTART_ATTEMPTS`` failed restarts are made per daemon; the
    counter is cleared once the daemon is seen running more than
    ``RESTART_COOLDOWN`` seconds after the last failed attempt.
    """
    status = {"daemons": {}, "healthy": True}

    for name, config in DAEMONS.items():
        attempts = restart_attempts[name]

        if check_daemon_running(config["patterns"]):
            status["daemons"][name] = "running"
            attempts["alerted"] = False  # outage over; allow future alerts
            # Reset restart counter if running for a while
            if time.time() - attempts["last_attempt"] > RESTART_COOLDOWN:
                attempts["count"] = 0
            continue

        status["daemons"][name] = "down"
        recovered = False

        if attempts["count"] < MAX_RESTART_ATTEMPTS:
            recovered = _try_restart(name, config, attempts)
            if recovered:
                status["daemons"][name] = "restarted"
        else:
            log(f"✗ {name} exceeded max restart attempts", "ERROR")
            # Alert once per outage rather than on every check interval.
            if config["critical"] and not attempts["alerted"]:
                send_notification(
                    "⚠️ Sovereign OS Alert",
                    f"Critical daemon {name} is down and won't restart!",
                )
                attempts["alerted"] = True

        # Decide health after the restart attempt so that a daemon which came
        # back is not reported as unhealthy.
        if config["critical"] and not recovered:
            status["healthy"] = False

    return status


def main():
    """Print the banner, run an initial check, then check forever.

    Errors inside a periodic check are logged and followed by one extra
    ``CHECK_INTERVAL`` of back-off. Ctrl-C at any point shuts down cleanly.
    """
    global start_time
    start_time = time.time()

    print()
    print("=" * 50)
    print(" SOVEREIGN OS - HEALTH MONITOR")
    print("=" * 50)
    print(f" Monitoring: {', '.join(DAEMONS.keys())}")
    print(f" Check interval: {CHECK_INTERVAL}s")
    print(f" Status file: {HEALTH_STATUS_FILE}")
    print("=" * 50)
    print()

    try:
        # Initial check
        status = check_health()
        write_health_status(status)

        if status["healthy"]:
            log("✓ All daemons healthy")
        else:
            log("⚠ Some daemons unhealthy", "WARN")

        # Main loop
        while True:
            time.sleep(CHECK_INTERVAL)
            try:
                status = check_health()
                write_health_status(status)
            except Exception as e:
                log(f"Error in health check: {e}", "ERROR")
                time.sleep(CHECK_INTERVAL)  # back off before the next cycle
    except KeyboardInterrupt:
        log("Shutting down health monitor...")


if __name__ == "__main__":
    main()