monitoring.cspec
# Monitoring Configuration
# Generated: 2026-01-06
# Purpose: Claude-readable monitoring spec

@overview:
  strategy: Lightweight Prometheus stack
  metrics_retention: 15 days (default)
  health_check_interval: 5 minutes

@components:

  @prometheus:
    host: ci.ac-dc.network
    port: 9090 (localhost only)
    config: /etc/prometheus/prometheus.yml
    storage: /var/lib/prometheus
    service: prometheus.service
    scrape_interval: 30s

  @node_exporter:
    version: 1.8.2
    servers:
      - host: source.ac-dc.network
        listen: 10.106.0.2:9100 (VPC only)
      - host: ci.ac-dc.network
        listen: 0.0.0.0:9100
    service: node_exporter.service

  @health_check:
    script: /usr/local/bin/health-check.sh
    cron: "*/5 * * * *"
    log: /var/log/health-check.log
    checks:
      - Forgejo (HTTPS)
      - Radicle node (port 8776)
      - Forgejo runner (systemd)
      - Prometheus
      - node_exporter (both servers)
      - Disk usage (>85% alert)

@scrape_targets:
  - job: prometheus
    target: localhost:9090
  - job: ci-server
    target: localhost:9100
  - job: source-server
    target: 10.106.0.2:9100

@alerts:
  disk_high: usage > 85%
  service_down: any monitored service unreachable

@access:
  prometheus_ui: ssh -L 9090:localhost:9090 devops@ci.ac-dc.network
  # Then open http://localhost:9090 in browser

@useful_queries:
  cpu_usage: 100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
  memory_used: node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes
  disk_used_pct: (node_filesystem_size_bytes - node_filesystem_avail_bytes) / node_filesystem_size_bytes * 100

@rebuild_commands:
  1: Install node_exporter on both servers
  2: Install Prometheus on CI server
  3: Copy prometheus.yml config
  4: Create systemd services
  5: Set up health-check cron