remediation.py
"""Home Keeper remediation actions — automated fixes for common problems.

Remediation is conservative:
- Restart crashed or restart-looping containers (allow-listed ones only)
- Prune dangling Docker images if disk usage > 80%
- Return a timestamped record of every action for audit

Destructive actions (NixOS rebuild, force-remove containers) are NOT automated.
They are flagged as recommendations in the health report.
"""

import json
import os
import subprocess
import sys
from collections.abc import Sequence
from datetime import datetime, timezone
from typing import Union

# Containers that are safe to auto-restart
SAFE_TO_RESTART = {
    "bob-home-keeper",
    "bob-morning-coordinator",
    "bob-evening-coordinator",
    "bob-agent-scheduler",
    "pipecat-agent",
    "kokoro-tts",
    "faster-whisper",
    "embeddings",
    "openwakeword",
    "ha-nats-bridge",
    "oxigraph",
    "grafana",
    "neo4j",
    "fish-speech",
}

# Containers that should NEVER be auto-restarted (data loss risk)
NEVER_RESTART = {
    "vllm",  # Model loading takes minutes
    "homeassistant",  # Automation state
}

# TrustGraph containers are managed by docker-compose, skip them
TRUSTGRAPH_PREFIX = "trustgraph-"

# A container reporting at least this many restarts is treated as restart-looping
RESTART_LOOP_THRESHOLD = 10

# Disk usage (percent) strictly above which a Docker cleanup is queued
DISK_CLEANUP_THRESHOLD_PERCENT = 80


def run_cmd(cmd: Union[str, Sequence[str]], timeout: int = 30) -> tuple[str, int]:
    """Run a command and return ``(output, returncode)``.

    Args:
        cmd: Either a shell command line (``str``, executed through the
            shell) or an argument vector (sequence of ``str``, executed
            directly without a shell).
        timeout: Seconds to wait before the command is abandoned.

    Returns:
        A ``(output, returncode)`` tuple. ``output`` is the stripped stdout.
        If the command failed and wrote nothing to stdout, the stripped
        stderr is returned instead so callers can report why it failed.
        A timeout yields ``("TIMEOUT after <n>s", 1)`` and any other failure
        to run the command yields ``("ERROR: <exc>", 1)``; nothing is raised.
    """
    try:
        result = subprocess.run(
            cmd,
            # Only plain strings need shell parsing; argv lists bypass it.
            shell=isinstance(cmd, str),
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        return f"TIMEOUT after {timeout}s", 1
    except Exception as e:
        # Deliberately broad: remediation is best-effort and must never
        # crash the caller; the failure is reported through the return value.
        return f"ERROR: {e}", 1

    output = result.stdout.strip()
    if result.returncode != 0 and not output:
        # Failing commands usually explain themselves on stderr only.
        output = result.stderr.strip()
    return output, result.returncode


def restart_container(name: str) -> dict:
    """Restart a Docker container if it's in the safe list.

    Args:
        name: Container name to restart.

    Returns:
        An action record with ``action``, ``target`` and ``timestamp`` (UTC,
        ISO 8601) plus ``status``:
        - ``"skipped"`` with a ``reason`` when the container is in
          ``NEVER_RESTART``, has the TrustGraph prefix, or is not in
          ``SAFE_TO_RESTART``;
        - ``"success"`` with the command ``output``;
        - ``"failed"`` with the command ``error`` text.
    """
    action = {
        "action": "restart_container",
        "target": name,
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }

    if name in NEVER_RESTART:
        action["status"] = "skipped"
        action["reason"] = "Container is in NEVER_RESTART list"
        return action

    if name.startswith(TRUSTGRAPH_PREFIX):
        action["status"] = "skipped"
        action["reason"] = "TrustGraph containers managed by docker-compose"
        return action

    if name not in SAFE_TO_RESTART:
        action["status"] = "skipped"
        action["reason"] = "Container not in SAFE_TO_RESTART list"
        return action

    # argv form: the name is passed to docker verbatim, never through a shell.
    output, rc = run_cmd(["docker", "restart", name])
    if rc == 0:
        action["status"] = "success"
        action["output"] = output
    else:
        action["status"] = "failed"
        action["error"] = output

    return action


def clean_docker_cache() -> dict:
    """Prune dangling Docker images and report the reclaimed space.

    NOTE(review): despite the function name, only ``docker image prune -f``
    is run; the build cache (``docker builder prune``) is not touched.
    Confirm whether build-cache pruning was intended.

    Returns:
        An action record with ``action``, ``timestamp`` (UTC, ISO 8601),
        ``status`` (``"success"`` or ``"failed"``) and ``reclaimed`` (the text
        following ``"Total reclaimed space:"`` in the command output, or
        ``"unknown"`` when that marker is absent). On failure an ``error``
        key carries the command output.
    """
    action = {
        "action": "docker_cleanup",
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }

    # Remove dangling images
    output, rc = run_cmd(["docker", "image", "prune", "-f"])

    marker = "Total reclaimed space:"
    reclaimed = "unknown"
    if marker in output:
        reclaimed = output.rpartition(marker)[2].strip()

    action["status"] = "success" if rc == 0 else "failed"
    action["reclaimed"] = reclaimed
    if rc != 0:
        action["error"] = output
    return action


def evaluate_remediation(health_result: dict) -> list:
    """Evaluate health check results and determine what remediation to take.

    Args:
        health_result: Health report. The fields read are
            ``details.docker.containers`` (a list of dicts with ``name``,
            ``up`` and optionally ``restarts``) and
            ``details.system.disk.percent``. Missing or ``None`` sections are
            treated as empty, and malformed container entries are ignored.

    Returns:
        A list of ``(action_type, target)`` tuples: ``("restart", name)`` for
        each ``SAFE_TO_RESTART`` container that is down or has reached
        ``RESTART_LOOP_THRESHOLD`` restarts (each name at most once), followed
        by ``("docker_cleanup", None)`` when disk usage is above
        ``DISK_CLEANUP_THRESHOLD_PERCENT``.
    """
    actions = []
    details = (health_result or {}).get("details") or {}

    # Check for crashed containers that can be restarted
    docker_info = details.get("docker") or {}
    queued = set()
    for container in docker_info.get("containers") or []:
        if not isinstance(container, dict):
            continue
        name = container.get("name")
        if not isinstance(name, str) or name not in SAFE_TO_RESTART:
            continue
        if name in queued:
            # Never restart the same container twice in one pass.
            continue

        # A record without an "up" field is assumed to be running, so a
        # partial report can never trigger a restart by itself.
        is_down = not container.get("up", True)
        restarts = container.get("restarts")
        restart_looping = (
            isinstance(restarts, (int, float))
            and restarts >= RESTART_LOOP_THRESHOLD
        )
        if is_down or restart_looping:
            queued.add(name)
            actions.append(("restart", name))

    # Check disk usage
    system = details.get("system") or {}
    disk = system.get("disk") or {}
    percent = disk.get("percent")
    if isinstance(percent, (int, float)) and percent > DISK_CLEANUP_THRESHOLD_PERCENT:
        actions.append(("docker_cleanup", None))

    return actions


def execute_remediation(actions: list) -> list:
    """Execute remediation actions and return results.

    Args:
        actions: ``(action_type, target)`` tuples as produced by
            ``evaluate_remediation``.

    Returns:
        One action record (dict) per input action, in order. Unknown action
        types are not executed; they produce a ``"skipped"`` record so the
        audit trail stays complete. A one-line summary is printed per action.
    """
    results = []
    for action_type, target in actions:
        if action_type == "restart":
            result = restart_container(target)
            results.append(result)
            print(f" Remediation: restart {target} -> {result['status']}")
        elif action_type == "docker_cleanup":
            result = clean_docker_cache()
            results.append(result)
            print(f" Remediation: docker cleanup -> {result['status']} (reclaimed: {result.get('reclaimed', '?')})")
        else:
            result = {
                "action": action_type,
                "target": target,
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "status": "skipped",
                "reason": "Unknown remediation action type",
            }
            results.append(result)
            print(f" Remediation: {action_type} -> skipped (unknown action type)")
    return results