# monitoring.nix
# Prometheus + Alertmanager + node exporter (NixOS-native)
# Grafana runs as a Docker container (easier dashboard provisioning)
# Alert webhook bridge publishes to NATS for System Sentinel
#
# Module arguments: the standard NixOS module arguments. Only `config` is
# read (to look up the exporter and Alertmanager ports); `pkgs` and `lib`
# are accepted so the module signature stays conventional.

{ config, pkgs, lib, ... }:

let
  promCfg = config.services.prometheus;

  # PromQL fragments shared by the warning/critical alert pairs below.
  memUsedPct = "100 * (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)";
  rootDiskUsedPct = ''100 - (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100)'';

  # Build one Prometheus alerting rule.
  #   name     - value of the rule's `alert` field
  #   severity - value of the `severity` label
  #   holdFor  - value of the rule's `for` field (how long expr must hold)
  #   expr     - PromQL expression
  #   summary  - value of the `summary` annotation
  mkAlert = { name, severity, holdFor, expr, summary }: {
    alert = name;
    inherit expr;
    "for" = holdFor;
    labels.severity = severity;
    annotations.summary = summary;
  };
in
{
  # ── Prometheus ───────────────────────────────────────────────────────
  # NOTE(review): Prometheus and Alertmanager keep their default listen
  # addresses here. Grafana runs in Docker and may need to reach Prometheus
  # from a bridge network — confirm before binding them to loopback as well.
  services.prometheus = {
    enable = true;
    port = 9090;
    retentionTime = "30d";
    globalConfig = {
      scrape_interval = "15s";
    };
    scrapeConfigs = [
      {
        job_name = "node";
        static_configs = [{
          targets = [ "localhost:${toString promCfg.exporters.node.port}" ];
        }];
      }
      # NATS /varz is JSON, not Prometheus format — needs nats-exporter for proper metrics
      # TODO: Deploy nats-exporter when needed
    ];

    # Alert rules. The rule file is emitted as JSON via builtins.toJSON.
    rules = [
      (builtins.toJSON {
        groups = [
          {
            name = "bob-system";
            rules = map mkAlert [
              {
                name = "HighMemoryUsage";
                severity = "warning";
                holdFor = "5m";
                expr = "${memUsedPct} > 85";
                summary = "RAM usage above 85% for 5 minutes";
              }
              {
                name = "CriticalMemoryUsage";
                severity = "critical";
                holdFor = "2m";
                expr = "${memUsedPct} > 95";
                summary = "RAM usage above 95% — risk of OOM";
              }
              {
                name = "DiskSpaceWarning";
                severity = "warning";
                holdFor = "10m";
                expr = "${rootDiskUsedPct} > 80";
                summary = "Disk usage above 80%";
              }
              {
                name = "DiskSpaceCritical";
                severity = "critical";
                holdFor = "5m";
                expr = "${rootDiskUsedPct} > 90";
                summary = "Disk usage above 90% — immediate attention needed";
              }
              {
                name = "HighCPUUsage";
                severity = "warning";
                holdFor = "10m";
                expr = ''100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90'';
                summary = "CPU usage above 90% sustained for 10 minutes";
              }
              {
                name = "HighLoadAverage";
                severity = "warning";
                holdFor = "5m";
                # The threshold is hard-coded; per the summary it is meant
                # to be twice the CPU core count, so it must be revisited if
                # the hardware changes.
                expr = ''node_load15 > 16'';
                summary = "15-minute load average above 16 (2x CPU cores)";
              }
              {
                name = "SystemdServiceFailed";
                severity = "critical";
                holdFor = "1m";
                expr = ''node_systemd_unit_state{state="failed"} == 1'';
                # `{{ $labels.name }}` is left for Prometheus to expand; it
                # is not a Nix interpolation (that would be `''${...}`).
                summary = "Systemd service {{ $labels.name }} has failed";
              }
              {
                name = "NodeExporterDown";
                severity = "critical";
                holdFor = "1m";
                expr = ''up{job="node"} == 0'';
                summary = "Node exporter is not responding — metrics collection broken";
              }
            ];
          }
        ];
      })
    ];

    # Alertmanager integration
    alertmanagers = [{
      static_configs = [{
        targets = [ "localhost:${toString promCfg.alertmanager.port}" ];
      }];
    }];
  };

  # ── Alertmanager ─────────────────────────────────────────────────────
  services.prometheus.alertmanager = {
    enable = true;
    port = 9093;
    configuration = {
      route = {
        receiver = "bob-webhook";
        group_by = [ "alertname" "severity" ];
        group_wait = "30s";
        group_interval = "5m";
        repeat_interval = "4h";
      };
      receivers = [{
        name = "bob-webhook";
        webhook_configs = [{
          # Local webhook endpoint (per the file header, the bridge that
          # publishes alerts to NATS).
          url = "http://127.0.0.1:9095/webhook";
          send_resolved = true;
        }];
      }];
    };
  };

  # ── Node exporter ───────────────────────────────────────────────────
  services.prometheus.exporters.node = {
    enable = true;
    port = 9100;
    # Restrict the exporter to localhost: bind to loopback so it is only
    # reachable from this host. The only scraper configured in this file
    # targets localhost.
    listenAddress = "127.0.0.1";
    enabledCollectors = [
      "cpu" "diskstats" "filesystem" "loadavg" "meminfo"
      "netdev" "stat" "time" "uname" "systemd"
    ];
  };

  # This module opens no firewall ports. The option is a merged list, so the
  # empty list below is purely declarative: it neither opens nor closes
  # anything, and ports opened by other modules are unaffected.
  networking.firewall.allowedTCPPorts = [ ]; # Prometheus/exporters stay behind Caddy
}