/ scripts / health_monitor.py
health_monitor.py
  1  #!/usr/bin/env python3
  2  """
  3  Sovereign OS - Health Monitor Daemon
  4  
  5  Continuously monitors daemon health and:
  6  1. Restarts crashed daemons via LaunchAgents
  7  2. Writes health status to file for other tools
  8  3. Alerts via terminal-notifier if available
  9  
 10  Run as: python3 scripts/health_monitor.py
 11  """
 12  
 13  import os
 14  import sys
 15  import subprocess
 16  import json
 17  import time
 18  from pathlib import Path
 19  from datetime import datetime
 20  
 21  # Configuration
 22  CHECK_INTERVAL = 30  # seconds between checks
 23  MAX_RESTART_ATTEMPTS = 3
 24  RESTART_COOLDOWN = 300  # seconds before resetting restart counter
 25  
 26  LOGS_DIR = Path.home() / ".sovereign" / "logs"
 27  LOGS_DIR.mkdir(parents=True, exist_ok=True)
 28  HEALTH_STATUS_FILE = LOGS_DIR / "health-status.json"
 29  
 30  # Daemons to monitor
 31  DAEMONS = {
 32      "mission-control": {
 33          "patterns": ["mission_control", "run_mission_control"],
 34          "launch_agent": "com.sovereign.mission-control",
 35          "critical": True,
 36      },
 37      "first-officer": {
 38          "patterns": ["first_officer_local", "run_first_officer"],
 39          "launch_agent": "com.sovereign.first-officer",
 40          "critical": True,
 41      },
 42      "mesh": {
 43          "patterns": ["sos.js", "sovereign-mesh"],
 44          "launch_agent": "com.sovereign.mesh",
 45          "critical": False,  # System works without mesh
 46      },
 47  }
 48  
 49  # Track restart attempts
 50  restart_attempts = {name: {"count": 0, "last_attempt": 0} for name in DAEMONS}
 51  
 52  
 53  def log(msg: str, level: str = "INFO"):
 54      """Log with timestamp."""
 55      timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
 56      print(f"[{timestamp}] [{level}] {msg}")
 57  
 58  
 59  def check_daemon_running(patterns: list) -> bool:
 60      """Check if a daemon is running by any of the process patterns."""
 61      for pattern in patterns:
 62          try:
 63              result = subprocess.run(
 64                  ["pgrep", "-f", pattern],
 65                  capture_output=True, text=True
 66              )
 67              if result.returncode == 0:
 68                  return True
 69          except:
 70              pass
 71      return False
 72  
 73  
 74  def restart_daemon(name: str, config: dict) -> bool:
 75      """Attempt to restart a daemon via LaunchAgent."""
 76      plist_path = Path.home() / "Library" / "LaunchAgents" / f"{config['launch_agent']}.plist"
 77  
 78      if not plist_path.exists():
 79          log(f"LaunchAgent not found: {plist_path}", "ERROR")
 80          return False
 81  
 82      try:
 83          # Unload first
 84          subprocess.run(["launchctl", "unload", str(plist_path)],
 85                        capture_output=True, timeout=5)
 86          time.sleep(1)
 87          # Load fresh
 88          result = subprocess.run(["launchctl", "load", str(plist_path)],
 89                                 capture_output=True, timeout=5)
 90          return result.returncode == 0
 91      except Exception as e:
 92          log(f"Failed to restart {name}: {e}", "ERROR")
 93          return False
 94  
 95  
 96  def send_notification(title: str, message: str):
 97      """Send macOS notification if terminal-notifier is available."""
 98      try:
 99          subprocess.run([
100              "terminal-notifier",
101              "-title", title,
102              "-message", message,
103              "-sound", "default"
104          ], capture_output=True, timeout=5)
105      except:
106          # terminal-notifier not installed, use osascript
107          try:
108              subprocess.run([
109                  "osascript", "-e",
110                  f'display notification "{message}" with title "{title}"'
111              ], capture_output=True, timeout=5)
112          except:
113              pass  # No notification available
114  
115  
def write_health_status(status: dict):
    """Stamp ``status`` and write it as JSON to ``HEALTH_STATUS_FILE``.

    Two keys are added to ``status`` in place before writing:

    * ``timestamp`` - current local time in ISO 8601 format.
    * ``uptime_seconds`` - seconds since ``main()`` set the module-level
      ``start_time``; 0.0 if ``start_time`` has not been set.

    The JSON is written to a temporary file next to the target and then
    moved into place with ``os.replace``, so a concurrent reader sees either
    the previous file or the new one, never a partially written one.

    Args:
        status: Health report to persist; mutated as described above.

    Raises:
        OSError: The temporary file cannot be written or moved into place.
        TypeError: ``status`` contains a value that is not JSON serializable.
    """
    now = time.time()
    status["timestamp"] = datetime.now().isoformat()
    # start_time only exists after main() has run; fall back to "now" so a
    # call from elsewhere reports zero uptime instead of raising NameError.
    status["uptime_seconds"] = now - globals().get("start_time", now)

    tmp_path = HEALTH_STATUS_FILE.with_name(HEALTH_STATUS_FILE.name + ".tmp")
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(status, f, indent=2)
    os.replace(tmp_path, HEALTH_STATUS_FILE)
122  
123  
def _try_restart(name: str, config: dict) -> bool:
    """Reload ``name``'s LaunchAgent and report whether it runs afterwards."""
    if not restart_daemon(name, config):
        return False
    time.sleep(3)  # Wait for daemon to start
    if check_daemon_running(config["patterns"]):
        log(f"✓ {name} restarted successfully")
        return True
    log(f"✗ {name} failed to start after restart", "ERROR")
    return False


def check_health() -> dict:
    """Check every daemon in ``DAEMONS`` and try to restart those that are down.

    For a daemon that is down, a restart is attempted while its counter in
    ``restart_attempts`` is below ``MAX_RESTART_ATTEMPTS``. Only failed
    attempts increase the counter. The counter is reset once the daemon is
    seen running and more than ``RESTART_COOLDOWN`` seconds have passed since
    the last failed attempt.

    When the limit is reached for a critical daemon, one alert notification
    is sent per outage; the alert is re-armed the next time the daemon is
    seen running.

    Returns:
        A dict with two keys:

        * ``daemons`` - maps each daemon name to ``"running"``,
          ``"restarted"`` (was down, is running again after this check) or
          ``"down"``.
        * ``healthy`` - False if any critical daemon is still ``"down"`` at
          the end of this check, True otherwise.
    """
    status = {"daemons": {}, "healthy": True}
    now = time.time()

    for name, config in DAEMONS.items():
        attempts = restart_attempts[name]

        if check_daemon_running(config["patterns"]):
            status["daemons"][name] = "running"
            attempts["alerted"] = False  # re-arm the give-up alert
            # Reset restart counter if running for a while
            if now - attempts["last_attempt"] > RESTART_COOLDOWN:
                attempts["count"] = 0
            continue

        if attempts["count"] < MAX_RESTART_ATTEMPTS:
            log(f"⚠ {name} is down, attempting restart ({attempts['count'] + 1}/{MAX_RESTART_ATTEMPTS})...", "WARN")
            if _try_restart(name, config):
                # Running again: do not report the system as unhealthy.
                status["daemons"][name] = "restarted"
                attempts["alerted"] = False
                send_notification(
                    "Sovereign OS",
                    f"Restarted {name} daemon"
                )
                continue
            attempts["count"] += 1
            attempts["last_attempt"] = now
        else:
            log(f"✗ {name} exceeded max restart attempts", "ERROR")
            # Alert once per outage instead of on every check interval.
            # .get() because the "alerted" key is only created by this function.
            if config["critical"] and not attempts.get("alerted"):
                send_notification(
                    "⚠️ Sovereign OS Alert",
                    f"Critical daemon {name} is down and won't restart!"
                )
                attempts["alerted"] = True

        # Reached only when the daemon is still down after this check.
        status["daemons"][name] = "down"
        if config["critical"]:
            status["healthy"] = False

    return status
172  
173  
def main():
    """Run the health monitor until interrupted.

    Records the start time in the module-level ``start_time``, prints a
    banner, performs one health check immediately, and then repeats the check
    every ``CHECK_INTERVAL`` seconds. After each check the result is written
    with ``write_health_status``.

    An exception raised by the initial check propagates to the caller. An
    exception raised by a later check is logged and followed by one extra
    ``CHECK_INTERVAL`` of waiting. Ctrl-C at any point after the banner ends
    the loop with a shutdown log message.
    """
    global start_time
    start_time = time.time()

    rule = "=" * 50
    print()
    print(rule)
    print("  SOVEREIGN OS - HEALTH MONITOR")
    print(rule)
    print(f"  Monitoring: {', '.join(DAEMONS)}")
    print(f"  Check interval: {CHECK_INTERVAL}s")
    print(f"  Status file: {HEALTH_STATUS_FILE}")
    print(rule)
    print()

    # A single outer handler so that Ctrl-C is handled cleanly everywhere:
    # during the initial check, the regular sleep, a check, or the back-off.
    try:
        # Initial check
        status = check_health()
        write_health_status(status)

        if status["healthy"]:
            log("✓ All daemons healthy")
        else:
            log("⚠ Some daemons unhealthy", "WARN")

        # Main loop
        while True:
            time.sleep(CHECK_INTERVAL)
            try:
                status = check_health()
                write_health_status(status)
            except Exception as e:
                # Keep the monitor alive; wait one extra interval before the
                # next attempt (the loop sleeps again at the top).
                log(f"Error in health check: {e}", "ERROR")
                time.sleep(CHECK_INTERVAL)
    except KeyboardInterrupt:
        log("Shutting down health monitor...")
208              time.sleep(CHECK_INTERVAL)
209  
210  
# Start the monitor only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()