/ infra / machine / ci-maintenance.cspec
ci-maintenance.cspec
  1  # CI Monitoring & Maintenance
  2  # phase: 6
  3  # human_doc: devops/human/ALPHA-DELTA_CI_Setup_Guide_v2.md
  4  # updated: 2026-01-22
  5  # status: INSTALLED
  6  
  7  # === RUNNER HEALTH SCRIPT (INSTALLED) ===
  8  health_script:
  9    path: /usr/local/bin/runner-health.sh
 10    installed: 2026-01-05
 11    run: sudo /usr/local/bin/runner-health.sh
 12    checks:
 13      - cpu_usage: top -bn1 | head -5
 14      - memory: free -h
 15      - disk: df -h /opt/ci
 16      - load_average: uptime
 17      - sccache_stats: sccache --show-stats
 18      - runner_status: systemctl is-active forgejo-runner
 19      - active_builds: pgrep -a cargo
 20  
 21  # === DAILY CLEANUP (INSTALLED) ===
 22  cleanup_cron:
 23    path: /etc/cron.daily/ci-cleanup
 24    installed: 2026-01-05
 25    log: /var/log/ci-cleanup.log
 26    tasks:
 27      - find_delete: /opt/ci/workspaces target dirs (mtime +7)
 28      - sccache_trim: if_over_25GB, delete atime +14
 29      - artifacts_cleanup: /opt/ci/artifacts (mtime +30)
 30      - cargo_registry: /home/devops/.cargo/registry/cache (atime +30)
 31  
 32  # === DISK CLEANUP WATCHER (INSTALLED) ===
 33  disk_cleanup_watcher:
 34    installed: 2026-01-22
 35    purpose: auto_cleanup_when_disk_reaches_threshold
 36    threshold: 85%
 37    check_interval: 300s  # 5 minutes
 38    script: /home/devops/scripts/disk-cleanup-watch.sh
 39    log: /var/log/disk-cleanup-watch.log
 40    service: disk-cleanup-watch.service
 41    behavior:
 42      - monitors_root_partition_usage
 43      - triggers_cargo_clean_on_all_repos_when_threshold_exceeded
 44      - cleans_cargo_registry_cache
 45      - logs_all_actions_with_timestamps
 46    repos_cleaned:
 47      - alphaos
 48      - deltaos
 49      - adl
 50      - alphavm
 51      - adnet
 52      - deltavm
 53      - ac-dc
 54      - acdc-core
 55      - wallet-core
 56    systemd:
 57      type: simple
 58      user: devops
 59      restart: always
 60      restart_sec: 10
 61      enabled: true  # starts on boot
 62    commands:
 63      status: sudo systemctl status disk-cleanup-watch
 64      restart: sudo systemctl restart disk-cleanup-watch
 65      stop: sudo systemctl stop disk-cleanup-watch
 66      logs_follow: journalctl -u disk-cleanup-watch -f
 67      logs_file: cat /var/log/disk-cleanup-watch.log
 68      manual_trigger: |
 69        # Force cleanup regardless of threshold
 70        for repo in alphaos deltaos adl alphavm adnet deltavm ac-dc acdc-core wallet-core; do
 71          cd /home/devops/working-repos/$repo && cargo clean
 72        done
 73  
 74  # === FORGEJO BACKUP ===
 75  backup_script:
 76    path: /usr/local/bin/backup-forgejo.sh
 77    dir: /var/backups/forgejo
 78    retention: 14_days
 79    tasks:
 80      - pg_dump: forgejo -> db-$DATE.sql.gz
 81      - rsync: /var/lib/forgejo/repositories -> repos-latest/
 82      - tar: app.ini, nginx, systemd -> config-$DATE.tar.gz
 83  
 84  # === MAINTENANCE CHECKLISTS ===
 85  weekly:
 86    - check_runner_disk: df -h /opt/ci
 87    - review_sccache_hit_rate: sccache --show-stats
 88    - check_forgejo_logs: journalctl -u forgejo --since "1 week ago" | grep -i error
 89    - verify_backup_integrity
 90  
 91  monthly:
 92    - update_forgejo_binary
 93    - update_forgejo_runner_binary
 94    - update_rust: rustup update
 95    - update_mold_if_available
 96    - rotate_secrets_tokens
 97    - test_cert_renewal: certbot renew --dry-run
 98  
 99  quarterly:
100    - security_audit: cargo audit + CVE review
101    - backup_restoration_test
102    - performance_baseline_comparison
103    - optimize_ci_pipeline_times
104    - cleanup_old_s3_artifacts
105  
106  # === EXPECTED BUILD TIMES (32-core) ===
107  build_times:
108    cargo_check: cold:45s, warm:15s, incremental:5s
109    cargo_test_all: cold:3min, warm:1min, incremental:30s
110    cargo_build_release_single: cold:2min, warm:45s, incremental:20s
111    full_release_alpha_delta: cold:5min, warm:2min, incremental:1min
112    integration_tests: cold:4min, warm:2min, incremental:1min
113    complete_ci: cold:12min, warm:6min, incremental:4min
114  
115  # === RESOURCE TARGETS ===
116  targets:
117    cpu_during_build: '80-95% (alert: <50%)'
118    memory: '40-70% (alert: >85%)'
119    disk_root: '<85% (auto_cleanup_at_85%, alert: >90%)'
120    disk_io: 'nvme_saturated_ok (alert: >95% for >5min)'
121    sccache_hit_rate: '>70% (alert: <50%)'
122    job_queue: '<3_pending (alert: >8_pending)'
123  
124  # === TROUBLESHOOTING ===
125  troubleshoot:
126    slow_builds:
127      - check_sccache: sccache --show-stats | grep "Cache hits"
128      - verify_mold: readelf -p .comment target/release/binary | grep mold
129      - check_io: iostat -x 1 5
130      - monitor_parallelism: watch -n1 'pgrep -c rustc'
131  
132    runner_not_picking_jobs:
133      - check_logs: journalctl -u forgejo-runner -n 50 | grep -i error
134      - verify_network: curl -I https://ci.yourdomain.com/api/v1/version
135      - re-register: stop, rm .runner, register, start
136  
137    oom_during_compilation:
138      - reduce_parallelism: CARGO_BUILD_JOBS=16 cargo build --release
139      - enable_swap: fallocate -l 16G /swapfile && chmod 600 /swapfile && mkswap /swapfile && swapon /swapfile
140  
141  # === COST ESTIMATE (Monthly) ===
142  cost:
143    forgejo_droplet: $12
144    runner_droplet: ~$336
145    do_spaces_100gb: ~$5
146    reserved_ip: $4
147    bandwidth_500gb: ~$45
148    total: ~$402/month
149  
150  # === PERFORMANCE IMPROVEMENTS ===
151  improvements:
152    full_ci_cycle: baseline:25min -> optimized:6min (4x)
153    parallel_jobs: baseline:2 -> optimized:8 (4x)
154    sccache_hit_rate: baseline:40% -> optimized:75%
155    release_build: baseline:15min -> optimized:2min (7x)