/ infra / machine / monitoring.cspec
monitoring.cspec
 1  # Monitoring Configuration
 2  # Generated: 2026-01-06
 3  # Purpose: Claude-readable monitoring spec
 4  
 5  @overview:
 6    strategy: Lightweight Prometheus stack
 7    metrics_retention: 15 days (default)
 8    health_check_interval: 5 minutes
 9  
10  @components:
11  
12    @prometheus:
13      host: ci.ac-dc.network
14      port: 9090 (localhost only)
15      config: /etc/prometheus/prometheus.yml
16      storage: /var/lib/prometheus
17      service: prometheus.service
18      scrape_interval: 30s
19  
20    @node_exporter:
21      version: 1.8.2
22      servers:
23        - host: source.ac-dc.network
24          listen: 10.106.0.2:9100 (VPC only)
25        - host: ci.ac-dc.network
26          listen: 0.0.0.0:9100 (all interfaces)
27      service: node_exporter.service
28  
29    @health_check:
30      script: /usr/local/bin/health-check.sh
31      cron: "*/5 * * * *"
32      log: /var/log/health-check.log
33      checks:
34        - Forgejo (HTTPS)
35        - Radicle node (port 8776)
36        - Forgejo runner (systemd)
37        - Prometheus
38        - node_exporter (both servers)
39        - Disk usage (>85% alert)
40  
41  @scrape_targets:
42    - job: prometheus
43      target: localhost:9090
44    - job: ci-server
45      target: localhost:9100
46    - job: source-server
47      target: 10.106.0.2:9100
48  
49  @alerts:
50    disk_high: usage > 85%
51    service_down: any monitored service unreachable
52  
53  @access:
54    prometheus_ui: ssh -L 9090:localhost:9090 devops@ci.ac-dc.network
55    # Then open http://localhost:9090 in a browser
56  
57  @useful_queries:
58    cpu_usage: 100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
59    memory_used: node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes
60    disk_used_pct: (node_filesystem_size_bytes - node_filesystem_avail_bytes) / node_filesystem_size_bytes * 100
61  
62  @rebuild_commands:
63    1: Install node_exporter on both servers
64    2: Install Prometheus on CI server
65    3: Copy prometheus.yml config
66    4: Create systemd services
67    5: Set up health-check cron