/ nix / modules / monitoring.nix
monitoring.nix
  1  # Prometheus + Alertmanager + node exporter (NixOS-native)
  2  # Grafana runs as a Docker container (easier dashboard provisioning)
  3  # Alert webhook bridge publishes to NATS for System Sentinel
  4  
  5  { config, pkgs, lib, ... }:
  6  
  7  {
  8    # ── Prometheus ───────────────────────────────────────────────────────
  # Prometheus server: scrapes the local node exporter, evaluates the
  # "bob-system" alert group and forwards firing alerts to Alertmanager.
  # NOTE: this value must stay a plain attribute-set literal, because other
  # bindings in this file extend it via `services.prometheus.<name>` paths.
  services.prometheus = {
    enable = true;
    port = 9090;
    retentionTime = "30d";
    globalConfig.scrape_interval = "15s";

    # Scrape jobs. The exporter port is read back from the exporter's own
    # option so the target and the exporter cannot drift apart.
    scrapeConfigs =
      let
        nodeExporterPort = toString config.services.prometheus.exporters.node.port;
      in
      [
        {
          job_name = "node";
          static_configs = [
            { targets = [ "localhost:${nodeExporterPort}" ]; }
          ];
        }
        # NATS /varz is JSON, not Prometheus format — needs nats-exporter for proper metrics
        # TODO: Deploy nats-exporter when needed
      ];

    # Alert rules: one rule group, serialised with builtins.toJSON.
    rules =
      let
        # Percentage-used expressions shared by the warning/critical pairs.
        memUsedPct = "100 * (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)";
        rootDiskUsedPct = ''100 - (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100)'';

        # Build one alerting rule.
        #   name     – alert name
        #   expr     – PromQL condition
        #   hold     – how long the condition must hold (the rule's "for" field)
        #   severity – value of the `severity` label
        #   summary  – value of the `summary` annotation
        mkAlert = { name, expr, hold, severity, summary }: {
          alert = name;
          inherit expr;
          "for" = hold;
          labels = { inherit severity; };
          annotations = { inherit summary; };
        };

        systemAlerts = map mkAlert [
          # ── Memory ──
          {
            name = "HighMemoryUsage";
            expr = "${memUsedPct} > 85";
            hold = "5m";
            severity = "warning";
            summary = "RAM usage above 85% for 5 minutes";
          }
          {
            name = "CriticalMemoryUsage";
            expr = "${memUsedPct} > 95";
            hold = "2m";
            severity = "critical";
            summary = "RAM usage above 95% — risk of OOM";
          }
          # ── Root filesystem ──
          {
            name = "DiskSpaceWarning";
            expr = "${rootDiskUsedPct} > 80";
            hold = "10m";
            severity = "warning";
            summary = "Disk usage above 80%";
          }
          {
            name = "DiskSpaceCritical";
            expr = "${rootDiskUsedPct} > 90";
            hold = "5m";
            severity = "critical";
            summary = "Disk usage above 90% — immediate attention needed";
          }
          # ── CPU / load ──
          {
            name = "HighCPUUsage";
            expr = ''100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90'';
            hold = "10m";
            severity = "warning";
            summary = "CPU usage above 90% sustained for 10 minutes";
          }
          {
            name = "HighLoadAverage";
            expr = "node_load15 > 16";
            hold = "5m";
            severity = "warning";
            summary = "15-minute load average above 16 (2x CPU cores)";
          }
          # ── Service health ──
          {
            name = "SystemdServiceFailed";
            expr = ''node_systemd_unit_state{state="failed"} == 1'';
            hold = "1m";
            severity = "critical";
            summary = "Systemd service {{ $labels.name }} has failed";
          }
          {
            name = "NodeExporterDown";
            expr = ''up{job="node"} == 0'';
            hold = "1m";
            severity = "critical";
            summary = "Node exporter is not responding — metrics collection broken";
          }
        ];
      in
      [
        (builtins.toJSON {
          groups = [
            {
              name = "bob-system";
              rules = systemAlerts;
            }
          ];
        })
      ];

    # Alertmanager integration: send alerts to the local Alertmanager, using
    # the port configured on the Alertmanager service itself.
    alertmanagers = [
      {
        static_configs = [
          { targets = [ "localhost:${toString config.services.prometheus.alertmanager.port}" ]; }
        ];
      }
    ];
  };
103  
104    # ── Alertmanager ─────────────────────────────────────────────────────
105    services.prometheus.alertmanager = {
106      enable = true;
107      port = 9093;
108      configuration = {
109        route = {
110          receiver = "bob-webhook";
111          group_by = [ "alertname" "severity" ];
112          group_wait = "30s";
113          group_interval = "5m";
114          repeat_interval = "4h";
115        };
116        receivers = [{
117          name = "bob-webhook";
118          webhook_configs = [{
119            url = "http://127.0.0.1:9095/webhook";
120            send_resolved = true;
121          }];
122        }];
123      };
124    };
125  
126    # ── Node exporter ───────────────────────────────────────────────────
127    services.prometheus.exporters.node = {
128      enable = true;
129      port = 9100;
130      enabledCollectors = [
131        "cpu" "diskstats" "filesystem" "loadavg" "meminfo"
132        "netdev" "stat" "time" "uname" "systemd"
133      ];
134    };
135  
136    # Restrict exporter to localhost
137    networking.firewall.allowedTCPPorts = [ ]; # Prometheus/exporters stay behind Caddy
138  }