issues.rs
1 //! Issue detection and diagnosis. 2 3 use crate::system::{self}; 4 use acdc_core::Result; 5 use serde::{Deserialize, Serialize}; 6 use std::process::Command; 7 8 /// Detected issue. 9 #[derive(Debug, Clone, Serialize, Deserialize)] 10 pub struct Issue { 11 /// Issue ID 12 pub id: String, 13 /// Issue category 14 pub category: IssueCategory, 15 /// Severity level 16 pub severity: IssueSeverity, 17 /// Short title 18 pub title: String, 19 /// Detailed description 20 pub description: String, 21 /// Suggested resolution 22 pub resolution: Option<String>, 23 /// Related component 24 pub component: Option<String>, 25 } 26 27 /// Issue category. 28 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] 29 pub enum IssueCategory { 30 /// Resource constraints (CPU, memory, disk) 31 Resource, 32 /// Network connectivity 33 Network, 34 /// Service status 35 Service, 36 /// Synchronization 37 Sync, 38 /// Configuration 39 Config, 40 /// Security 41 Security, 42 /// Performance 43 Performance, 44 } 45 46 impl std::fmt::Display for IssueCategory { 47 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 48 match self { 49 IssueCategory::Resource => write!(f, "Resource"), 50 IssueCategory::Network => write!(f, "Network"), 51 IssueCategory::Service => write!(f, "Service"), 52 IssueCategory::Sync => write!(f, "Sync"), 53 IssueCategory::Config => write!(f, "Config"), 54 IssueCategory::Security => write!(f, "Security"), 55 IssueCategory::Performance => write!(f, "Performance"), 56 } 57 } 58 } 59 60 /// Issue severity. 
61 #[derive(Debug, Clone, Copy, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)] 62 pub enum IssueSeverity { 63 /// Informational 64 Info, 65 /// Warning - may impact performance 66 Warning, 67 /// Error - functionality impacted 68 Error, 69 /// Critical - immediate action required 70 Critical, 71 } 72 73 impl std::fmt::Display for IssueSeverity { 74 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 75 match self { 76 IssueSeverity::Info => write!(f, "INFO"), 77 IssueSeverity::Warning => write!(f, "WARNING"), 78 IssueSeverity::Error => write!(f, "ERROR"), 79 IssueSeverity::Critical => write!(f, "CRITICAL"), 80 } 81 } 82 } 83 84 /// Issue detector. 85 pub struct IssueDetector { 86 checks: Vec<Box<dyn IssueCheck + Send + Sync>>, 87 } 88 89 impl IssueDetector { 90 /// Create a new issue detector with default checks. 91 pub fn new() -> Self { 92 Self { 93 checks: vec![ 94 Box::new(ResourceCheck), 95 Box::new(ServiceCheck), 96 Box::new(NetworkCheck), 97 Box::new(SyncCheck), 98 Box::new(ConfigCheck), 99 Box::new(SecurityCheck), 100 ], 101 } 102 } 103 104 /// Run all checks. 105 pub async fn run_all(&self, node_id: Option<&str>) -> Result<Vec<Issue>> { 106 let mut issues = Vec::new(); 107 108 for check in &self.checks { 109 match check.check(node_id).await { 110 Ok(mut found) => issues.append(&mut found), 111 Err(e) => { 112 tracing::warn!("Check {} failed: {}", check.name(), e); 113 } 114 } 115 } 116 117 // Sort by severity (most severe first) 118 issues.sort_by(|a, b| b.severity.cmp(&a.severity)); 119 120 Ok(issues) 121 } 122 } 123 124 impl Default for IssueDetector { 125 fn default() -> Self { 126 Self::new() 127 } 128 } 129 130 /// Trait for issue checks. 131 #[async_trait::async_trait] 132 trait IssueCheck { 133 fn name(&self) -> &'static str; 134 async fn check(&self, node_id: Option<&str>) -> Result<Vec<Issue>>; 135 } 136 137 /// Resource usage check. 
138 struct ResourceCheck; 139 140 #[async_trait::async_trait] 141 impl IssueCheck for ResourceCheck { 142 fn name(&self) -> &'static str { 143 "resource" 144 } 145 146 async fn check(&self, _node_id: Option<&str>) -> Result<Vec<Issue>> { 147 let mut issues = Vec::new(); 148 let diag = system::collect_diagnostics().await?; 149 150 // Check CPU 151 if diag.cpu.usage_percent >= 95.0 { 152 issues.push(Issue { 153 id: "resource-cpu-critical".to_string(), 154 category: IssueCategory::Resource, 155 severity: IssueSeverity::Critical, 156 title: "Critical CPU usage".to_string(), 157 description: format!( 158 "CPU usage is at {:.1}%, which may cause performance issues.", 159 diag.cpu.usage_percent 160 ), 161 resolution: Some( 162 "Consider upgrading CPU or reducing workload. Check for runaway processes." 163 .to_string(), 164 ), 165 component: None, 166 }); 167 } else if diag.cpu.usage_percent >= 80.0 { 168 issues.push(Issue { 169 id: "resource-cpu-high".to_string(), 170 category: IssueCategory::Resource, 171 severity: IssueSeverity::Warning, 172 title: "High CPU usage".to_string(), 173 description: format!("CPU usage is at {:.1}%.", diag.cpu.usage_percent), 174 resolution: Some( 175 "Monitor for sustained high usage. Consider scaling if this persists." 176 .to_string(), 177 ), 178 component: None, 179 }); 180 } 181 182 // Check memory 183 if diag.memory.usage_percent >= 95.0 { 184 issues.push(Issue { 185 id: "resource-memory-critical".to_string(), 186 category: IssueCategory::Resource, 187 severity: IssueSeverity::Critical, 188 title: "Critical memory usage".to_string(), 189 description: format!( 190 "Memory usage is at {:.1}%, risk of OOM.", 191 diag.memory.usage_percent 192 ), 193 resolution: Some( 194 "Free up memory or add more RAM. 
Check for memory leaks.".to_string(), 195 ), 196 component: None, 197 }); 198 } else if diag.memory.usage_percent >= 85.0 { 199 issues.push(Issue { 200 id: "resource-memory-high".to_string(), 201 category: IssueCategory::Resource, 202 severity: IssueSeverity::Warning, 203 title: "High memory usage".to_string(), 204 description: format!("Memory usage is at {:.1}%.", diag.memory.usage_percent), 205 resolution: Some( 206 "Monitor memory usage. Consider adding more RAM if this persists.".to_string(), 207 ), 208 component: None, 209 }); 210 } 211 212 // Check disk 213 for disk in &diag.disks { 214 if disk.mount_point == "/" || disk.mount_point.contains("data") { 215 if disk.usage_percent >= 95.0 { 216 issues.push(Issue { 217 id: format!( 218 "resource-disk-critical-{}", 219 disk.mount_point.replace('/', "_") 220 ), 221 category: IssueCategory::Resource, 222 severity: IssueSeverity::Critical, 223 title: format!("Critical disk usage on {}", disk.mount_point), 224 description: format!( 225 "Disk usage is at {:.1}%, only {} available.", 226 disk.usage_percent, 227 format_bytes(disk.available_bytes) 228 ), 229 resolution: Some( 230 "Free up disk space immediately. Consider expanding storage." 231 .to_string(), 232 ), 233 component: None, 234 }); 235 } else if disk.usage_percent >= 90.0 { 236 issues.push(Issue { 237 id: format!("resource-disk-high-{}", disk.mount_point.replace('/', "_")), 238 category: IssueCategory::Resource, 239 severity: IssueSeverity::Warning, 240 title: format!("High disk usage on {}", disk.mount_point), 241 description: format!("Disk usage is at {:.1}%.", disk.usage_percent), 242 resolution: Some("Plan for disk space expansion or cleanup.".to_string()), 243 component: None, 244 }); 245 } 246 } 247 } 248 249 Ok(issues) 250 } 251 } 252 253 /// Service status check. 
254 struct ServiceCheck; 255 256 #[async_trait::async_trait] 257 impl IssueCheck for ServiceCheck { 258 fn name(&self) -> &'static str { 259 "service" 260 } 261 262 async fn check(&self, node_id: Option<&str>) -> Result<Vec<Issue>> { 263 let mut issues = Vec::new(); 264 265 let services = if let Some(id) = node_id { 266 vec![format!("ac-dc-{}", id)] 267 } else { 268 // Check common services 269 vec![ 270 "ac-dc-validator".to_string(), 271 "ac-dc-prover".to_string(), 272 "ac-dc-client".to_string(), 273 ] 274 }; 275 276 for service in services { 277 let output = Command::new("systemctl") 278 .args(["is-active", &service]) 279 .output(); 280 281 match output { 282 Ok(out) => { 283 let status = String::from_utf8_lossy(&out.stdout).trim().to_string(); 284 if status != "active" { 285 issues.push(Issue { 286 id: format!("service-not-active-{}", service), 287 category: IssueCategory::Service, 288 severity: IssueSeverity::Error, 289 title: format!("Service {} not active", service), 290 description: format!("Service status is: {}", status), 291 resolution: Some(format!( 292 "Run 'systemctl start {}' to start the service.", 293 service 294 )), 295 component: Some(service.clone()), 296 }); 297 } 298 } 299 Err(_) => { 300 // Service doesn't exist or can't be checked 301 } 302 } 303 } 304 305 Ok(issues) 306 } 307 } 308 309 /// Network connectivity check. 
310 struct NetworkCheck; 311 312 #[async_trait::async_trait] 313 impl IssueCheck for NetworkCheck { 314 fn name(&self) -> &'static str { 315 "network" 316 } 317 318 async fn check(&self, _node_id: Option<&str>) -> Result<Vec<Issue>> { 319 let mut issues = Vec::new(); 320 321 // Check DNS resolution 322 let dns_ok = std::net::ToSocketAddrs::to_socket_addrs(&("google.com", 80)).is_ok(); 323 if !dns_ok { 324 issues.push(Issue { 325 id: "network-dns-failure".to_string(), 326 category: IssueCategory::Network, 327 severity: IssueSeverity::Error, 328 title: "DNS resolution failing".to_string(), 329 description: "Unable to resolve DNS queries.".to_string(), 330 resolution: Some("Check /etc/resolv.conf and network configuration.".to_string()), 331 component: None, 332 }); 333 } 334 335 // Check common ports 336 let ports = [ 337 (3030, "ALPHA REST API"), 338 (4030, "DELTA REST API"), 339 (4130, "ALPHA P2P"), 340 (4131, "DELTA P2P"), 341 ]; 342 343 for (port, name) in ports { 344 if system::check_port_in_use(port) { 345 // Port is in use - check if it's our service or something else 346 let output = Command::new("lsof") 347 .args(["-i", &format!(":{}", port), "-t"]) 348 .output(); 349 350 if let Ok(out) = output { 351 let pids = String::from_utf8_lossy(&out.stdout); 352 if pids.trim().is_empty() { 353 // Port in use but can't identify process 354 issues.push(Issue { 355 id: format!("network-port-unknown-{}", port), 356 category: IssueCategory::Network, 357 severity: IssueSeverity::Warning, 358 title: format!("Port {} ({}) in use by unknown process", port, name), 359 description: "Port is in use but process cannot be identified." 360 .to_string(), 361 resolution: Some(format!( 362 "Check what's using port {} with 'lsof -i :{}'", 363 port, port 364 )), 365 component: None, 366 }); 367 } 368 } 369 } 370 } 371 372 Ok(issues) 373 } 374 } 375 376 /// Sync status check. 
377 struct SyncCheck; 378 379 #[async_trait::async_trait] 380 impl IssueCheck for SyncCheck { 381 fn name(&self) -> &'static str { 382 "sync" 383 } 384 385 async fn check(&self, _node_id: Option<&str>) -> Result<Vec<Issue>> { 386 let mut issues = Vec::new(); 387 388 // Try to get sync status from node API 389 let client = reqwest::Client::new(); 390 391 // Check ALPHA node 392 if let Ok(resp) = client 393 .get("http://127.0.0.1:3030/testnet/latest/height") 394 .timeout(std::time::Duration::from_secs(5)) 395 .send() 396 .await 397 { 398 if !resp.status().is_success() { 399 issues.push(Issue { 400 id: "sync-alpha-api-error".to_string(), 401 category: IssueCategory::Sync, 402 severity: IssueSeverity::Warning, 403 title: "ALPHA node API not responding".to_string(), 404 description: "Cannot connect to ALPHA node REST API.".to_string(), 405 resolution: Some( 406 "Check if the node is running and the API port is accessible.".to_string(), 407 ), 408 component: Some("alphaos".to_string()), 409 }); 410 } 411 } 412 413 // Check DELTA node 414 if let Ok(resp) = client 415 .get("http://127.0.0.1:4030/testnet/latest/height") 416 .timeout(std::time::Duration::from_secs(5)) 417 .send() 418 .await 419 { 420 if !resp.status().is_success() { 421 issues.push(Issue { 422 id: "sync-delta-api-error".to_string(), 423 category: IssueCategory::Sync, 424 severity: IssueSeverity::Warning, 425 title: "DELTA node API not responding".to_string(), 426 description: "Cannot connect to DELTA node REST API.".to_string(), 427 resolution: Some( 428 "Check if the node is running and the API port is accessible.".to_string(), 429 ), 430 component: Some("deltaos".to_string()), 431 }); 432 } 433 } 434 435 Ok(issues) 436 } 437 } 438 439 /// Configuration check. 
440 struct ConfigCheck; 441 442 #[async_trait::async_trait] 443 impl IssueCheck for ConfigCheck { 444 fn name(&self) -> &'static str { 445 "config" 446 } 447 448 async fn check(&self, _node_id: Option<&str>) -> Result<Vec<Issue>> { 449 let mut issues = Vec::new(); 450 451 // Check config directory exists 452 let config_dir = dirs::config_dir() 453 .map(|p| p.join("ac-dc")) 454 .unwrap_or_else(|| std::path::PathBuf::from("/etc/ac-dc")); 455 456 if !config_dir.exists() { 457 issues.push(Issue { 458 id: "config-dir-missing".to_string(), 459 category: IssueCategory::Config, 460 severity: IssueSeverity::Warning, 461 title: "Configuration directory missing".to_string(), 462 description: format!( 463 "Configuration directory {} does not exist.", 464 config_dir.display() 465 ), 466 resolution: Some("Run 'ac-dc setup' to create configuration.".to_string()), 467 component: None, 468 }); 469 } 470 471 // Check data directory 472 let data_dir = std::path::Path::new("/var/lib/ac-dc"); 473 if !data_dir.exists() { 474 issues.push(Issue { 475 id: "data-dir-missing".to_string(), 476 category: IssueCategory::Config, 477 severity: IssueSeverity::Warning, 478 title: "Data directory missing".to_string(), 479 description: "Data directory /var/lib/ac-dc does not exist.".to_string(), 480 resolution: Some("Run 'ac-dc setup' to create data directory.".to_string()), 481 component: None, 482 }); 483 } 484 485 Ok(issues) 486 } 487 } 488 489 /// Security check. 
490 struct SecurityCheck; 491 492 #[async_trait::async_trait] 493 impl IssueCheck for SecurityCheck { 494 fn name(&self) -> &'static str { 495 "security" 496 } 497 498 async fn check(&self, _node_id: Option<&str>) -> Result<Vec<Issue>> { 499 let mut issues = Vec::new(); 500 501 // Check if running as root 502 if unsafe { libc::geteuid() } == 0 { 503 issues.push(Issue { 504 id: "security-running-as-root".to_string(), 505 category: IssueCategory::Security, 506 severity: IssueSeverity::Warning, 507 title: "Running as root".to_string(), 508 description: "AC/DC is running with root privileges.".to_string(), 509 resolution: Some( 510 "Consider running as a dedicated service user for security.".to_string(), 511 ), 512 component: None, 513 }); 514 } 515 516 // Check entropy 517 if let Some(entropy) = system::get_entropy_available() { 518 if entropy < 256 { 519 issues.push(Issue { 520 id: "security-low-entropy".to_string(), 521 category: IssueCategory::Security, 522 severity: IssueSeverity::Warning, 523 title: "Low system entropy".to_string(), 524 description: format!("System entropy is low ({} bits).", entropy), 525 resolution: Some("Consider installing haveged or rng-tools.".to_string()), 526 component: None, 527 }); 528 } 529 } 530 531 Ok(issues) 532 } 533 } 534 535 /// Detect all issues. 536 pub async fn detect_all(node_id: Option<&str>) -> Result<Vec<Issue>> { 537 let detector = IssueDetector::new(); 538 detector.run_all(node_id).await 539 } 540 541 /// Format bytes to human readable. 
fn format_bytes(bytes: u64) -> String {
    // Binary (1024-based) units, largest first so the first match wins.
    const UNITS: [(u64, &str); 4] = [
        (1u64 << 40, "TB"),
        (1u64 << 30, "GB"),
        (1u64 << 20, "MB"),
        (1u64 << 10, "KB"),
    ];

    for &(unit, suffix) in UNITS.iter() {
        if bytes >= unit {
            return format!("{:.1} {}", bytes as f64 / unit as f64, suffix);
        }
    }

    // Below 1 KB the exact byte count is printed without a fraction.
    format!("{} B", bytes)
}