/ testnet / monitoring / alerting-rules.yml
alerting-rules.yml
  1  # Prometheus Alerting Rules for ALPHA/DELTA Testnet
  2  # ==================================================
  3  
  4  groups:
  5    # Node Health Alerts
  6    - name: node_health
  7      interval: 30s
  8      rules:
  9        - alert: NodeDown
 10          expr: up == 0
 11          for: 2m
 12          labels:
 13            severity: critical
 14          annotations:
 15            summary: "Node {{ $labels.instance }} is down"
 16            description: "Node {{ $labels.instance }} has been unreachable for more than 2 minutes."
 17  
 18        - alert: NodeHighCPU
 19          expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90
 20          for: 5m
 21          labels:
 22            severity: warning
 23          annotations:
 24            summary: "High CPU usage on {{ $labels.instance }}"
 25            description: "CPU usage is above 90% for more than 5 minutes on {{ $labels.instance }}."
 26  
 27        - alert: NodeHighMemory
 28          expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
 29          for: 5m
 30          labels:
 31            severity: warning
 32          annotations:
 33            summary: "High memory usage on {{ $labels.instance }}"
 34            description: "Memory usage is above 85% for more than 5 minutes on {{ $labels.instance }}."
 35  
 36        - alert: NodeDiskSpaceLow
 37          expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) * 100 < 15
 38          for: 5m
 39          labels:
 40            severity: warning
 41          annotations:
 42            summary: "Low disk space on {{ $labels.instance }}"
 43            description: "Disk space is below 15% on {{ $labels.instance }}:{{ $labels.mountpoint }}."
 44  
 45        - alert: NodeDiskSpaceCritical
 46          expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) * 100 < 5
 47          for: 2m
 48          labels:
 49            severity: critical
 50          annotations:
 51            summary: "Critical disk space on {{ $labels.instance }}"
 52            description: "Disk space is below 5% on {{ $labels.instance }}:{{ $labels.mountpoint }}."
 53  
 54    # Consensus Alerts
 55    - name: consensus
 56      interval: 30s
 57      rules:
 58        - alert: ConsensusStalledAlpha
 59          expr: increase(adnet_alpha_block_height[5m]) == 0
 60          for: 5m
 61          labels:
 62            severity: critical
 63            chain: alpha
 64          annotations:
 65            summary: "ALPHA chain consensus stalled"
 66            description: "No new blocks produced on ALPHA chain for 5 minutes."
 67  
 68        - alert: ConsensusStalledDelta
 69          expr: increase(adnet_delta_block_height[2m]) == 0
 70          for: 2m
 71          labels:
 72            severity: critical
 73            chain: delta
 74          annotations:
 75            summary: "DELTA chain consensus stalled"
 76            description: "No new blocks produced on DELTA chain for 2 minutes."
 77  
 78        - alert: ValidatorMissingBlocks
 79          expr: increase(adnet_validator_missed_blocks[10m]) > 100
 80          for: 5m
 81          labels:
 82            severity: warning
 83          annotations:
 84            summary: "Validator {{ $labels.instance }} missing blocks"
 85            description: "Validator {{ $labels.instance }} has missed more than 100 blocks in 10 minutes."
 86  
 87        - alert: LowPeerCount
 88          expr: adnet_peer_count < 3
 89          for: 5m
 90          labels:
 91            severity: warning
 92          annotations:
 93            summary: "Low peer count on {{ $labels.instance }}"
 94            description: "Node {{ $labels.instance }} has fewer than 3 peers connected."
 95  
 96    # Sync Alerts
 97    - name: sync
 98      interval: 60s
 99      rules:
100        - alert: NodeOutOfSync
101          expr: adnet_sync_lag_blocks > 100
102          for: 5m
103          labels:
104            severity: warning
105          annotations:
106            summary: "Node {{ $labels.instance }} is out of sync"
107            description: "Node {{ $labels.instance }} is {{ $value }} blocks behind the network."
108  
109        - alert: NodeSeverelyOutOfSync
110          expr: adnet_sync_lag_blocks > 1000
111          for: 5m
112          labels:
113            severity: critical
114          annotations:
115            summary: "Node {{ $labels.instance }} severely out of sync"
116            description: "Node {{ $labels.instance }} is {{ $value }} blocks behind the network."
117  
118    # DEX Alerts (DELTA chain)
119    - name: dex
120      interval: 30s
121      rules:
122        - alert: DexHighLatency
123          expr: histogram_quantile(0.99, rate(adnet_dex_order_latency_seconds_bucket[5m])) > 1
124          for: 5m
125          labels:
126            severity: warning
127            chain: delta
128          annotations:
129            summary: "High DEX order latency"
130            description: "99th percentile order latency is above 1 second."
131  
132        - alert: DexOrderBookEmpty
133          expr: adnet_dex_orderbook_depth == 0
134          for: 10m
135          labels:
136            severity: warning
137            chain: delta
138          annotations:
139            summary: "Empty order book for {{ $labels.market }}"
140            description: "Order book for market {{ $labels.market }} has been empty for 10 minutes."
141  
142        - alert: DexCircuitBreaker
143          expr: adnet_dex_circuit_breaker_triggered == 1
144          for: 1m
145          labels:
146            severity: critical
147            chain: delta
148          annotations:
149            summary: "DEX circuit breaker triggered"
150            description: "Circuit breaker has been triggered for market {{ $labels.market }}."
151  
152    # RPC Alerts
153    - name: rpc
154      interval: 30s
155      rules:
156        - alert: RPCHighErrorRate
157          expr: sum(rate(adnet_rpc_errors_total[5m])) / sum(rate(adnet_rpc_requests_total[5m])) > 0.05
158          for: 5m
159          labels:
160            severity: warning
161          annotations:
162            summary: "High RPC error rate on {{ $labels.instance }}"
163            description: "RPC error rate is above 5% on {{ $labels.instance }}."
164  
165        - alert: RPCHighLatency
166          expr: histogram_quantile(0.95, rate(adnet_rpc_latency_seconds_bucket[5m])) > 2
167          for: 5m
168          labels:
169            severity: warning
170          annotations:
171            summary: "High RPC latency on {{ $labels.instance }}"
172            description: "95th percentile RPC latency is above 2 seconds on {{ $labels.instance }}."
173  
174    # Faucet Alerts
175    - name: faucet
176      interval: 60s
177      rules:
178        - alert: FaucetLowBalance
179          expr: adnet_faucet_balance < 1000000000000
180          for: 5m
181          labels:
182            severity: warning
183          annotations:
184            summary: "Low faucet balance"
185            description: "Testnet faucet balance is below 1000 credits."
186  
187        - alert: FaucetHighRequestRate
188          expr: rate(adnet_faucet_requests_total[5m]) > 10
189          for: 10m
190          labels:
191            severity: warning
192          annotations:
193            summary: "High faucet request rate"
194            description: "Faucet is receiving more than 10 requests per second."