/ services / home-keeper / remediation.py
remediation.py
  1  """Home Keeper remediation actions — automated fixes for common problems.
  2  
  3  Remediation is conservative:
  4  - Restart crashed containers (non-critical)
  5  - Prune dangling Docker images if disk > 80%
  6  - Log all actions for audit
  7  
  8  Destructive actions (NixOS rebuild, force-remove containers) are NOT automated.
  9  They are flagged as recommendations in the health report.
 10  """
 11  
 12  import json
 13  import os
 14  import subprocess
 15  import sys
 16  from datetime import datetime, timezone
 17  
 18  # Containers that are safe to auto-restart
 19  SAFE_TO_RESTART = {
 20      "bob-home-keeper",
 21      "bob-morning-coordinator",
 22      "bob-evening-coordinator",
 23      "bob-agent-scheduler",
 24      "pipecat-agent",
 25      "kokoro-tts",
 26      "faster-whisper",
 27      "embeddings",
 28      "openwakeword",
 29      "ha-nats-bridge",
 30      "oxigraph",
 31      "grafana",
 32      "neo4j",
 33      "fish-speech",
 34  }
 35  
 36  # Containers that should NEVER be auto-restarted (data loss risk)
 37  NEVER_RESTART = {
 38      "vllm",  # Model loading takes minutes
 39      "homeassistant",  # Automation state
 40  }
 41  
 42  # TrustGraph containers are managed by docker-compose, skip them
 43  TRUSTGRAPH_PREFIX = "trustgraph-"
 44  
 45  
 46  def run_cmd(cmd: str, timeout: int = 30) -> tuple[str, int]:
 47      """Run a command, return (stdout, returncode)."""
 48      try:
 49          result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=timeout)
 50          return result.stdout.strip(), result.returncode
 51      except subprocess.TimeoutExpired:
 52          return f"TIMEOUT after {timeout}s", 1
 53      except Exception as e:
 54          return f"ERROR: {e}", 1
 55  
 56  
 57  def restart_container(name: str) -> dict:
 58      """Restart a Docker container if it's in the safe list."""
 59      action = {
 60          "action": "restart_container",
 61          "target": name,
 62          "timestamp": datetime.now(timezone.utc).isoformat(),
 63      }
 64  
 65      if name in NEVER_RESTART:
 66          action["status"] = "skipped"
 67          action["reason"] = "Container is in NEVER_RESTART list"
 68          return action
 69  
 70      if name.startswith(TRUSTGRAPH_PREFIX):
 71          action["status"] = "skipped"
 72          action["reason"] = "TrustGraph containers managed by docker-compose"
 73          return action
 74  
 75      if name not in SAFE_TO_RESTART:
 76          action["status"] = "skipped"
 77          action["reason"] = "Container not in SAFE_TO_RESTART list"
 78          return action
 79  
 80      output, rc = run_cmd(f"docker restart {name}")
 81      if rc == 0:
 82          action["status"] = "success"
 83          action["output"] = output
 84      else:
 85          action["status"] = "failed"
 86          action["error"] = output
 87  
 88      return action
 89  
 90  
 91  def clean_docker_cache() -> dict:
 92      """Remove unused Docker build cache and dangling images."""
 93      action = {
 94          "action": "docker_cleanup",
 95          "timestamp": datetime.now(timezone.utc).isoformat(),
 96      }
 97  
 98      # Remove dangling images
 99      output, rc = run_cmd("docker image prune -f")
100      reclaimed = "unknown"
101      if "Total reclaimed space:" in output:
102          reclaimed = output.split("Total reclaimed space:")[-1].strip()
103  
104      action["status"] = "success" if rc == 0 else "failed"
105      action["reclaimed"] = reclaimed
106      return action
107  
108  
def evaluate_remediation(
    health_result: dict,
    *,
    restart_threshold: int = 10,
    disk_threshold: float = 80,
) -> list:
    """Decide which remediation actions a health check result calls for.

    The function reads these fields and tolerates any of them being
    missing or ``None``:

    * ``details.docker.containers``: list of dicts with ``name``, ``up``
      and optionally ``restarts``;
    * ``details.system.disk.percent``: numeric disk usage.

    Args:
        health_result: Health check result dictionary.
        restart_threshold: Restart count at or above which a container that
            is still up is restarted anyway.
        disk_threshold: Disk usage percentage above which a Docker cleanup
            is scheduled.

    Returns:
        A list of ``(action_type, target)`` tuples: ``("restart", name)``
        for each container in SAFE_TO_RESTART that is down or has reached
        ``restart_threshold``, followed by ``("docker_cleanup", None)`` when
        disk usage exceeds ``disk_threshold``.
    """
    actions = []
    details = (health_result or {}).get("details") or {}

    # Check for crashed containers that can be restarted
    docker_info = details.get("docker") or {}
    for container in docker_info.get("containers") or []:
        if not isinstance(container, dict):
            continue  # Malformed entry: nothing we can safely act on.
        name = container.get("name")
        if name not in SAFE_TO_RESTART:
            continue
        # A record without "up" is treated as running, so an unknown state
        # never triggers a restart.
        is_down = not container.get("up", True)
        restart_count = container.get("restarts") or 0
        if is_down or restart_count >= restart_threshold:
            actions.append(("restart", name))

    # Check disk usage
    system = details.get("system") or {}
    disk = system.get("disk") or {}
    percent = disk.get("percent")
    if isinstance(percent, (int, float)) and percent > disk_threshold:
        actions.append(("docker_cleanup", None))

    return actions
129  
130  
def execute_remediation(actions: list) -> list:
    """Run each planned remediation action and collect the action records.

    Args:
        actions: ``(action_type, target)`` tuples. ``"restart"`` restarts
            the container named by ``target``; ``"docker_cleanup"`` runs
            the Docker cleanup. Any other action type is ignored.

    Returns:
        The action records returned by the handlers, in execution order.
        A one-line summary is printed for every handled action.
    """
    outcomes = []
    for kind, name in actions:
        if kind == "docker_cleanup":
            outcome = clean_docker_cache()
            summary = f"  Remediation: docker cleanup -> {outcome['status']} (reclaimed: {outcome.get('reclaimed', '?')})"
        elif kind == "restart":
            outcome = restart_container(name)
            summary = f"  Remediation: restart {name} -> {outcome['status']}"
        else:
            # Unknown action types produce neither a record nor output.
            continue
        outcomes.append(outcome)
        print(summary)
    return outcomes