alerting-rules.yml
# Prometheus Alerting Rules for ALPHA/DELTA Testnet
# ==================================================

groups:
# Node Health Alerts
# Target availability plus CPU, memory and filesystem pressure. Evaluated every 30s.
- name: node_health
  interval: 30s
  rules:
  # Target has reported up == 0 continuously for 2 minutes.
  - alert: NodeDown
    expr: up == 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Node {{ $labels.instance }} is down"
      description: "Node {{ $labels.instance }} has been unreachable for more than 2 minutes."

  # Busy CPU percentage per instance: 100 minus the average idle-mode rate.
  # Uses rate() rather than irate(): irate() only considers the last two samples
  # in the window, so one short dip can clear the condition and reset the `for` timer.
  - alert: NodeHighCPU
    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High CPU usage on {{ $labels.instance }}"
      description: "CPU usage is above 90% for more than 5 minutes on {{ $labels.instance }}."

  # Used memory percentage, derived from MemAvailable / MemTotal.
  - alert: NodeHighMemory
    expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High memory usage on {{ $labels.instance }}"
      description: "Memory usage is above 85% for more than 5 minutes on {{ $labels.instance }}."

  # Free-space percentage per filesystem, skipping tmpfs and overlay mounts.
  # The expression is folded (>-) only to keep lines short; it parses to one line.
  - alert: NodeDiskSpaceLow
    expr: >-
      (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"}
      / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"})
      * 100 < 15
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Low disk space on {{ $labels.instance }}"
      description: "Disk space is below 15% on {{ $labels.instance }}:{{ $labels.mountpoint }}."

  # Same ratio as NodeDiskSpaceLow with a 5% threshold and a shorter hold time.
  # Below 5% both disk alerts match; any de-duplication is left to the alert router.
  - alert: NodeDiskSpaceCritical
    expr: >-
      (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"}
      / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"})
      * 100 < 5
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Critical disk space on {{ $labels.instance }}"
      description: "Disk space is below 5% on {{ $labels.instance }}:{{ $labels.mountpoint }}."
# Consensus Alerts
# Chain liveness, validator participation and peer connectivity. Evaluated every 30s.
- name: consensus
  interval: 30s
  rules:
  # ALPHA block height shows no increase over a trailing 5m window, held for 5m.
  # NOTE(review): increase() is intended for counters; if adnet_alpha_block_height
  # is exported as a gauge, changes() or delta() may fit better — confirm metric type.
  - alert: ConsensusStalledAlpha
    expr: increase(adnet_alpha_block_height[5m]) == 0
    for: 5m
    labels:
      severity: critical
      chain: alpha
    annotations:
      summary: "ALPHA chain consensus stalled"
      description: "No new blocks produced on ALPHA chain for 5 minutes."

  # DELTA counterpart of the rule above, with a 2m window and a 2m hold time.
  # NOTE(review): same counter-vs-gauge question applies to adnet_delta_block_height.
  - alert: ConsensusStalledDelta
    expr: increase(adnet_delta_block_height[2m]) == 0
    for: 2m
    labels:
      severity: critical
      chain: delta
    annotations:
      summary: "DELTA chain consensus stalled"
      description: "No new blocks produced on DELTA chain for 2 minutes."

  # More than 100 missed blocks counted within 10 minutes, held for 5m.
  - alert: ValidatorMissingBlocks
    expr: increase(adnet_validator_missed_blocks[10m]) > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Validator {{ $labels.instance }} missing blocks"
      description: "Validator {{ $labels.instance }} has missed more than 100 blocks in 10 minutes."

  # Reported peer count stays below 3 for 5 minutes.
  - alert: LowPeerCount
    expr: adnet_peer_count < 3
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Low peer count on {{ $labels.instance }}"
      description: "Node {{ $labels.instance }} has fewer than 3 peers connected."

# Sync Alerts
# Two severity tiers on the same lag metric. Evaluated every 60s.
- name: sync
  interval: 60s
  rules:
  # Warning tier: lag above 100 blocks for 5m. There is no upper bound, so this
  # rule keeps matching while NodeSeverelyOutOfSync is also active.
  - alert: NodeOutOfSync
    expr: adnet_sync_lag_blocks > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Node {{ $labels.instance }} is out of sync"
      description: "Node {{ $labels.instance }} is {{ $value }} blocks behind the network."

  # Critical tier: lag above 1000 blocks for 5m.
  - alert: NodeSeverelyOutOfSync
    expr: adnet_sync_lag_blocks > 1000
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Node {{ $labels.instance }} severely out of sync"
      description: "Node {{ $labels.instance }} is {{ $value }} blocks behind the network."
# DEX Alerts (DELTA chain)
# Order latency, order-book depth and circuit-breaker state. Evaluated every 30s.
- name: dex
  interval: 30s
  rules:
  # p99 order latency above 1s for 5m. No cross-series aggregation is applied,
  # so the quantile is computed separately for each label set of the histogram.
  - alert: DexHighLatency
    expr: histogram_quantile(0.99, rate(adnet_dex_order_latency_seconds_bucket[5m])) > 1
    for: 5m
    labels:
      severity: warning
      chain: delta
    annotations:
      summary: "High DEX order latency"
      description: "99th percentile order latency is above 1 second."

  # Order-book depth equals 0 for 10 minutes.
  # Annotations assume the series carries a `market` label — TODO confirm.
  - alert: DexOrderBookEmpty
    expr: adnet_dex_orderbook_depth == 0
    for: 10m
    labels:
      severity: warning
      chain: delta
    annotations:
      summary: "Empty order book for {{ $labels.market }}"
      description: "Order book for market {{ $labels.market }} has been empty for 10 minutes."

  # Circuit-breaker metric equals 1 for 1 minute.
  # Description assumes the series carries a `market` label — TODO confirm.
  - alert: DexCircuitBreaker
    expr: adnet_dex_circuit_breaker_triggered == 1
    for: 1m
    labels:
      severity: critical
      chain: delta
    annotations:
      summary: "DEX circuit breaker triggered"
      description: "Circuit breaker has been triggered for market {{ $labels.market }}."

# RPC Alerts
# Error ratio and latency of the RPC endpoint. Evaluated every 30s.
- name: rpc
  interval: 30s
  rules:
  # Error ratio (errors / requests) above 5% for 5m, computed per instance.
  # Both sides aggregate with `by (instance)`: a bare sum() drops every label,
  # which would leave {{ $labels.instance }} empty in the annotations below.
  - alert: RPCHighErrorRate
    expr: >-
      sum by (instance) (rate(adnet_rpc_errors_total[5m]))
      / sum by (instance) (rate(adnet_rpc_requests_total[5m]))
      > 0.05
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High RPC error rate on {{ $labels.instance }}"
      description: "RPC error rate is above 5% on {{ $labels.instance }}."

  # p95 RPC latency above 2s for 5m, computed per label set of the histogram.
  - alert: RPCHighLatency
    expr: histogram_quantile(0.95, rate(adnet_rpc_latency_seconds_bucket[5m])) > 2
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High RPC latency on {{ $labels.instance }}"
      description: "95th percentile RPC latency is above 2 seconds on {{ $labels.instance }}."
# Faucet Alerts
# Balance and request-rate checks for the testnet faucet. Evaluated every 60s.
- name: faucet
  interval: 60s
  rules:
  # Faucet balance below 1000000000000 for 5m.
  # NOTE(review): the description says "1000 credits", which implies 10^9 base
  # units per credit — confirm against the unit the metric is exported in.
  - alert: FaucetLowBalance
    expr: adnet_faucet_balance < 1000000000000
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Low faucet balance"
      description: "Testnet faucet balance is below 1000 credits."

  # Request rate above 10/s for 10m.
  # rate() is evaluated per series with no aggregation, so if the counter has
  # several label sets the threshold applies to each series, not to their total.
  - alert: FaucetHighRequestRate
    expr: rate(adnet_faucet_requests_total[5m]) > 10
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "High faucet request rate"
      description: "Faucet is receiving more than 10 requests per second."