/ crates / acdc-diagnostics / src / issues.rs
issues.rs
  1  //! Issue detection and diagnosis.
  2  
  3  use crate::system::{self};
  4  use acdc_core::Result;
  5  use serde::{Deserialize, Serialize};
  6  use std::process::Command;
  7  
  8  /// Detected issue.
  9  #[derive(Debug, Clone, Serialize, Deserialize)]
 10  pub struct Issue {
 11      /// Issue ID
 12      pub id: String,
 13      /// Issue category
 14      pub category: IssueCategory,
 15      /// Severity level
 16      pub severity: IssueSeverity,
 17      /// Short title
 18      pub title: String,
 19      /// Detailed description
 20      pub description: String,
 21      /// Suggested resolution
 22      pub resolution: Option<String>,
 23      /// Related component
 24      pub component: Option<String>,
 25  }
 26  
 27  /// Issue category.
 28  #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
 29  pub enum IssueCategory {
 30      /// Resource constraints (CPU, memory, disk)
 31      Resource,
 32      /// Network connectivity
 33      Network,
 34      /// Service status
 35      Service,
 36      /// Synchronization
 37      Sync,
 38      /// Configuration
 39      Config,
 40      /// Security
 41      Security,
 42      /// Performance
 43      Performance,
 44  }
 45  
 46  impl std::fmt::Display for IssueCategory {
 47      fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
 48          match self {
 49              IssueCategory::Resource => write!(f, "Resource"),
 50              IssueCategory::Network => write!(f, "Network"),
 51              IssueCategory::Service => write!(f, "Service"),
 52              IssueCategory::Sync => write!(f, "Sync"),
 53              IssueCategory::Config => write!(f, "Config"),
 54              IssueCategory::Security => write!(f, "Security"),
 55              IssueCategory::Performance => write!(f, "Performance"),
 56          }
 57      }
 58  }
 59  
 60  /// Issue severity.
 61  #[derive(Debug, Clone, Copy, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)]
 62  pub enum IssueSeverity {
 63      /// Informational
 64      Info,
 65      /// Warning - may impact performance
 66      Warning,
 67      /// Error - functionality impacted
 68      Error,
 69      /// Critical - immediate action required
 70      Critical,
 71  }
 72  
 73  impl std::fmt::Display for IssueSeverity {
 74      fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
 75          match self {
 76              IssueSeverity::Info => write!(f, "INFO"),
 77              IssueSeverity::Warning => write!(f, "WARNING"),
 78              IssueSeverity::Error => write!(f, "ERROR"),
 79              IssueSeverity::Critical => write!(f, "CRITICAL"),
 80          }
 81      }
 82  }
 83  
 84  /// Issue detector.
 85  pub struct IssueDetector {
 86      checks: Vec<Box<dyn IssueCheck + Send + Sync>>,
 87  }
 88  
 89  impl IssueDetector {
 90      /// Create a new issue detector with default checks.
 91      pub fn new() -> Self {
 92          Self {
 93              checks: vec![
 94                  Box::new(ResourceCheck),
 95                  Box::new(ServiceCheck),
 96                  Box::new(NetworkCheck),
 97                  Box::new(SyncCheck),
 98                  Box::new(ConfigCheck),
 99                  Box::new(SecurityCheck),
100              ],
101          }
102      }
103  
104      /// Run all checks.
105      pub async fn run_all(&self, node_id: Option<&str>) -> Result<Vec<Issue>> {
106          let mut issues = Vec::new();
107  
108          for check in &self.checks {
109              match check.check(node_id).await {
110                  Ok(mut found) => issues.append(&mut found),
111                  Err(e) => {
112                      tracing::warn!("Check {} failed: {}", check.name(), e);
113                  }
114              }
115          }
116  
117          // Sort by severity (most severe first)
118          issues.sort_by(|a, b| b.severity.cmp(&a.severity));
119  
120          Ok(issues)
121      }
122  }
123  
124  impl Default for IssueDetector {
125      fn default() -> Self {
126          Self::new()
127      }
128  }
129  
130  /// Trait for issue checks.
131  #[async_trait::async_trait]
132  trait IssueCheck {
133      fn name(&self) -> &'static str;
134      async fn check(&self, node_id: Option<&str>) -> Result<Vec<Issue>>;
135  }
136  
137  /// Resource usage check.
138  struct ResourceCheck;
139  
140  #[async_trait::async_trait]
141  impl IssueCheck for ResourceCheck {
142      fn name(&self) -> &'static str {
143          "resource"
144      }
145  
146      async fn check(&self, _node_id: Option<&str>) -> Result<Vec<Issue>> {
147          let mut issues = Vec::new();
148          let diag = system::collect_diagnostics().await?;
149  
150          // Check CPU
151          if diag.cpu.usage_percent >= 95.0 {
152              issues.push(Issue {
153                  id: "resource-cpu-critical".to_string(),
154                  category: IssueCategory::Resource,
155                  severity: IssueSeverity::Critical,
156                  title: "Critical CPU usage".to_string(),
157                  description: format!(
158                      "CPU usage is at {:.1}%, which may cause performance issues.",
159                      diag.cpu.usage_percent
160                  ),
161                  resolution: Some(
162                      "Consider upgrading CPU or reducing workload. Check for runaway processes."
163                          .to_string(),
164                  ),
165                  component: None,
166              });
167          } else if diag.cpu.usage_percent >= 80.0 {
168              issues.push(Issue {
169                  id: "resource-cpu-high".to_string(),
170                  category: IssueCategory::Resource,
171                  severity: IssueSeverity::Warning,
172                  title: "High CPU usage".to_string(),
173                  description: format!("CPU usage is at {:.1}%.", diag.cpu.usage_percent),
174                  resolution: Some(
175                      "Monitor for sustained high usage. Consider scaling if this persists."
176                          .to_string(),
177                  ),
178                  component: None,
179              });
180          }
181  
182          // Check memory
183          if diag.memory.usage_percent >= 95.0 {
184              issues.push(Issue {
185                  id: "resource-memory-critical".to_string(),
186                  category: IssueCategory::Resource,
187                  severity: IssueSeverity::Critical,
188                  title: "Critical memory usage".to_string(),
189                  description: format!(
190                      "Memory usage is at {:.1}%, risk of OOM.",
191                      diag.memory.usage_percent
192                  ),
193                  resolution: Some(
194                      "Free up memory or add more RAM. Check for memory leaks.".to_string(),
195                  ),
196                  component: None,
197              });
198          } else if diag.memory.usage_percent >= 85.0 {
199              issues.push(Issue {
200                  id: "resource-memory-high".to_string(),
201                  category: IssueCategory::Resource,
202                  severity: IssueSeverity::Warning,
203                  title: "High memory usage".to_string(),
204                  description: format!("Memory usage is at {:.1}%.", diag.memory.usage_percent),
205                  resolution: Some(
206                      "Monitor memory usage. Consider adding more RAM if this persists.".to_string(),
207                  ),
208                  component: None,
209              });
210          }
211  
212          // Check disk
213          for disk in &diag.disks {
214              if disk.mount_point == "/" || disk.mount_point.contains("data") {
215                  if disk.usage_percent >= 95.0 {
216                      issues.push(Issue {
217                          id: format!(
218                              "resource-disk-critical-{}",
219                              disk.mount_point.replace('/', "_")
220                          ),
221                          category: IssueCategory::Resource,
222                          severity: IssueSeverity::Critical,
223                          title: format!("Critical disk usage on {}", disk.mount_point),
224                          description: format!(
225                              "Disk usage is at {:.1}%, only {} available.",
226                              disk.usage_percent,
227                              format_bytes(disk.available_bytes)
228                          ),
229                          resolution: Some(
230                              "Free up disk space immediately. Consider expanding storage."
231                                  .to_string(),
232                          ),
233                          component: None,
234                      });
235                  } else if disk.usage_percent >= 90.0 {
236                      issues.push(Issue {
237                          id: format!("resource-disk-high-{}", disk.mount_point.replace('/', "_")),
238                          category: IssueCategory::Resource,
239                          severity: IssueSeverity::Warning,
240                          title: format!("High disk usage on {}", disk.mount_point),
241                          description: format!("Disk usage is at {:.1}%.", disk.usage_percent),
242                          resolution: Some("Plan for disk space expansion or cleanup.".to_string()),
243                          component: None,
244                      });
245                  }
246              }
247          }
248  
249          Ok(issues)
250      }
251  }
252  
253  /// Service status check.
254  struct ServiceCheck;
255  
256  #[async_trait::async_trait]
257  impl IssueCheck for ServiceCheck {
258      fn name(&self) -> &'static str {
259          "service"
260      }
261  
262      async fn check(&self, node_id: Option<&str>) -> Result<Vec<Issue>> {
263          let mut issues = Vec::new();
264  
265          let services = if let Some(id) = node_id {
266              vec![format!("ac-dc-{}", id)]
267          } else {
268              // Check common services
269              vec![
270                  "ac-dc-validator".to_string(),
271                  "ac-dc-prover".to_string(),
272                  "ac-dc-client".to_string(),
273              ]
274          };
275  
276          for service in services {
277              let output = Command::new("systemctl")
278                  .args(["is-active", &service])
279                  .output();
280  
281              match output {
282                  Ok(out) => {
283                      let status = String::from_utf8_lossy(&out.stdout).trim().to_string();
284                      if status != "active" {
285                          issues.push(Issue {
286                              id: format!("service-not-active-{}", service),
287                              category: IssueCategory::Service,
288                              severity: IssueSeverity::Error,
289                              title: format!("Service {} not active", service),
290                              description: format!("Service status is: {}", status),
291                              resolution: Some(format!(
292                                  "Run 'systemctl start {}' to start the service.",
293                                  service
294                              )),
295                              component: Some(service.clone()),
296                          });
297                      }
298                  }
299                  Err(_) => {
300                      // Service doesn't exist or can't be checked
301                  }
302              }
303          }
304  
305          Ok(issues)
306      }
307  }
308  
309  /// Network connectivity check.
310  struct NetworkCheck;
311  
312  #[async_trait::async_trait]
313  impl IssueCheck for NetworkCheck {
314      fn name(&self) -> &'static str {
315          "network"
316      }
317  
318      async fn check(&self, _node_id: Option<&str>) -> Result<Vec<Issue>> {
319          let mut issues = Vec::new();
320  
321          // Check DNS resolution
322          let dns_ok = std::net::ToSocketAddrs::to_socket_addrs(&("google.com", 80)).is_ok();
323          if !dns_ok {
324              issues.push(Issue {
325                  id: "network-dns-failure".to_string(),
326                  category: IssueCategory::Network,
327                  severity: IssueSeverity::Error,
328                  title: "DNS resolution failing".to_string(),
329                  description: "Unable to resolve DNS queries.".to_string(),
330                  resolution: Some("Check /etc/resolv.conf and network configuration.".to_string()),
331                  component: None,
332              });
333          }
334  
335          // Check common ports
336          let ports = [
337              (3030, "ALPHA REST API"),
338              (4030, "DELTA REST API"),
339              (4130, "ALPHA P2P"),
340              (4131, "DELTA P2P"),
341          ];
342  
343          for (port, name) in ports {
344              if system::check_port_in_use(port) {
345                  // Port is in use - check if it's our service or something else
346                  let output = Command::new("lsof")
347                      .args(["-i", &format!(":{}", port), "-t"])
348                      .output();
349  
350                  if let Ok(out) = output {
351                      let pids = String::from_utf8_lossy(&out.stdout);
352                      if pids.trim().is_empty() {
353                          // Port in use but can't identify process
354                          issues.push(Issue {
355                              id: format!("network-port-unknown-{}", port),
356                              category: IssueCategory::Network,
357                              severity: IssueSeverity::Warning,
358                              title: format!("Port {} ({}) in use by unknown process", port, name),
359                              description: "Port is in use but process cannot be identified."
360                                  .to_string(),
361                              resolution: Some(format!(
362                                  "Check what's using port {} with 'lsof -i :{}'",
363                                  port, port
364                              )),
365                              component: None,
366                          });
367                      }
368                  }
369              }
370          }
371  
372          Ok(issues)
373      }
374  }
375  
376  /// Sync status check.
377  struct SyncCheck;
378  
379  #[async_trait::async_trait]
380  impl IssueCheck for SyncCheck {
381      fn name(&self) -> &'static str {
382          "sync"
383      }
384  
385      async fn check(&self, _node_id: Option<&str>) -> Result<Vec<Issue>> {
386          let mut issues = Vec::new();
387  
388          // Try to get sync status from node API
389          let client = reqwest::Client::new();
390  
391          // Check ALPHA node
392          if let Ok(resp) = client
393              .get("http://127.0.0.1:3030/testnet/latest/height")
394              .timeout(std::time::Duration::from_secs(5))
395              .send()
396              .await
397          {
398              if !resp.status().is_success() {
399                  issues.push(Issue {
400                      id: "sync-alpha-api-error".to_string(),
401                      category: IssueCategory::Sync,
402                      severity: IssueSeverity::Warning,
403                      title: "ALPHA node API not responding".to_string(),
404                      description: "Cannot connect to ALPHA node REST API.".to_string(),
405                      resolution: Some(
406                          "Check if the node is running and the API port is accessible.".to_string(),
407                      ),
408                      component: Some("alphaos".to_string()),
409                  });
410              }
411          }
412  
413          // Check DELTA node
414          if let Ok(resp) = client
415              .get("http://127.0.0.1:4030/testnet/latest/height")
416              .timeout(std::time::Duration::from_secs(5))
417              .send()
418              .await
419          {
420              if !resp.status().is_success() {
421                  issues.push(Issue {
422                      id: "sync-delta-api-error".to_string(),
423                      category: IssueCategory::Sync,
424                      severity: IssueSeverity::Warning,
425                      title: "DELTA node API not responding".to_string(),
426                      description: "Cannot connect to DELTA node REST API.".to_string(),
427                      resolution: Some(
428                          "Check if the node is running and the API port is accessible.".to_string(),
429                      ),
430                      component: Some("deltaos".to_string()),
431                  });
432              }
433          }
434  
435          Ok(issues)
436      }
437  }
438  
439  /// Configuration check.
440  struct ConfigCheck;
441  
442  #[async_trait::async_trait]
443  impl IssueCheck for ConfigCheck {
444      fn name(&self) -> &'static str {
445          "config"
446      }
447  
448      async fn check(&self, _node_id: Option<&str>) -> Result<Vec<Issue>> {
449          let mut issues = Vec::new();
450  
451          // Check config directory exists
452          let config_dir = dirs::config_dir()
453              .map(|p| p.join("ac-dc"))
454              .unwrap_or_else(|| std::path::PathBuf::from("/etc/ac-dc"));
455  
456          if !config_dir.exists() {
457              issues.push(Issue {
458                  id: "config-dir-missing".to_string(),
459                  category: IssueCategory::Config,
460                  severity: IssueSeverity::Warning,
461                  title: "Configuration directory missing".to_string(),
462                  description: format!(
463                      "Configuration directory {} does not exist.",
464                      config_dir.display()
465                  ),
466                  resolution: Some("Run 'ac-dc setup' to create configuration.".to_string()),
467                  component: None,
468              });
469          }
470  
471          // Check data directory
472          let data_dir = std::path::Path::new("/var/lib/ac-dc");
473          if !data_dir.exists() {
474              issues.push(Issue {
475                  id: "data-dir-missing".to_string(),
476                  category: IssueCategory::Config,
477                  severity: IssueSeverity::Warning,
478                  title: "Data directory missing".to_string(),
479                  description: "Data directory /var/lib/ac-dc does not exist.".to_string(),
480                  resolution: Some("Run 'ac-dc setup' to create data directory.".to_string()),
481                  component: None,
482              });
483          }
484  
485          Ok(issues)
486      }
487  }
488  
489  /// Security check.
490  struct SecurityCheck;
491  
492  #[async_trait::async_trait]
493  impl IssueCheck for SecurityCheck {
494      fn name(&self) -> &'static str {
495          "security"
496      }
497  
498      async fn check(&self, _node_id: Option<&str>) -> Result<Vec<Issue>> {
499          let mut issues = Vec::new();
500  
501          // Check if running as root
502          if unsafe { libc::geteuid() } == 0 {
503              issues.push(Issue {
504                  id: "security-running-as-root".to_string(),
505                  category: IssueCategory::Security,
506                  severity: IssueSeverity::Warning,
507                  title: "Running as root".to_string(),
508                  description: "AC/DC is running with root privileges.".to_string(),
509                  resolution: Some(
510                      "Consider running as a dedicated service user for security.".to_string(),
511                  ),
512                  component: None,
513              });
514          }
515  
516          // Check entropy
517          if let Some(entropy) = system::get_entropy_available() {
518              if entropy < 256 {
519                  issues.push(Issue {
520                      id: "security-low-entropy".to_string(),
521                      category: IssueCategory::Security,
522                      severity: IssueSeverity::Warning,
523                      title: "Low system entropy".to_string(),
524                      description: format!("System entropy is low ({} bits).", entropy),
525                      resolution: Some("Consider installing haveged or rng-tools.".to_string()),
526                      component: None,
527                  });
528              }
529          }
530  
531          Ok(issues)
532      }
533  }
534  
535  /// Detect all issues.
536  pub async fn detect_all(node_id: Option<&str>) -> Result<Vec<Issue>> {
537      let detector = IssueDetector::new();
538      detector.run_all(node_id).await
539  }
540  
/// Renders a byte count using binary (1024-based) units.
///
/// Picks the largest unit among TB, GB, MB, and KB that does not exceed
/// `bytes` and prints the value with one decimal place (e.g. `1.5 KB`).
/// Counts below 1024 are printed as whole bytes (e.g. `512 B`).
fn format_bytes(bytes: u64) -> String {
    // Largest unit first so the first match is the best fit.
    const UNITS: [(&str, u64); 4] = [
        ("TB", 1 << 40),
        ("GB", 1 << 30),
        ("MB", 1 << 20),
        ("KB", 1 << 10),
    ];

    UNITS
        .iter()
        .find(|(_, size)| bytes >= *size)
        .map(|(unit, size)| format!("{:.1} {}", bytes as f64 / *size as f64, unit))
        .unwrap_or_else(|| format!("{} B", bytes))
}