/ sessions / 2026-01-23-swap-oom-protection-added.cspec
2026-01-23-swap-oom-protection-added.cspec
  1  id: 2026-01-23-swap-oom-protection-added
  2  type: session
  3  date: 2026-01-23
  4  status: completed
  5  priority: high
  6  
  7  title: Add Swap Configuration and OOM Protection to All Server Setup Specifications
  8  
  9  summary: |
 10    Updated all server setup specifications and deployment scripts to include
 11    32GB swap configuration and OOM protection for critical services. This
 12    prevents the OOM killer from crashing user sessions and validator processes
 13    during memory pressure events.
 14  
 15  context:
 16    incident:
 17      date: 2026-01-23 13:38 UTC
 18      symptom: "Claude and tmux sessions crashed unexpectedly"
 19      root_cause: "OOM killer terminated user systemd (PID 1610)"
 20      trigger: "No swap configured + parallel Rust compilation consumed all 64GB RAM"
 21      impact: "All user sessions (tmux, Claude) terminated instantly"
 22  
 23    investigation:
 24      memory_available: 62GB RAM
 25      swap_before: 0GB
 26      oom_events:
 27        - killed_process: radicle-node (PID 1677)
 28        - killed_process: dbus-daemon (PID 2348)
 29        - killed_process: systemd (PID 1610)  # CRITICAL - killed all user sessions
 30      top_consumers:
 31        - deltavm_tests: 4.6GB
 32        - rustc_compilations: 3.5GB each (multiple parallel)
 33        - ci_test_runs: multiple parallel processes
 34  
 35    solution_applied:
 36      swap:
 37        size: 32GB
 38        file: /swapfile
 39        swappiness: 10  # Prefer RAM, use swap for emergency buffer
 40        persistent: /etc/fstab entry added
 41      oom_protection:
 42        systemd_user: oom_score_adj=-1000  # Never kill user systemd
 43        tmux_sessions: oom_score_adj=-800  # Protect development sessions
 44  
 45  objectives:
 46    - Document swap configuration requirements for all servers
 47    - Update infrastructure specifications with memory management
 48    - Update deployment scripts to configure swap automatically
 49    - Update MCP tools to include swap in server setup
 50    - Document OOM protection best practices
 51  
 52  deliverables:
 53    specifications_updated:
 54      ci_infrastructure:
 55        file: infra/machine/ci-infrastructure.cspec
 56        changes:
 57          - added: memory_config section
 58          - swap: 32GB for CI runner
 59          - swappiness: 10
 60          - oom_protection: systemd_user, tmux_sessions
 61  
 62      testnet_deployment:
 63        file: testnet-deployment-requirements.cspec
 64        changes:
 65          - added: memory_config to minimum_server_spec
 66          - swap: 16GB minimum, 32GB recommended
 67          - swappiness: 10
 68          - oom_protection: protect_critical_services
 69  
 70      deployment_procedures:
 71        file: infra/machine/deployment.cspec
 72        changes:
 73          - added: swap configuration to pre-deployment checklist
 74          - requirement: 16GB+ for validators, 32GB+ for CI
 75          - added: OOM protection check for critical services
 76  
 77    scripts_updated:
 78      setup_testnet_server:
 79        file: tools/setup-testnet-server.sh
 80        changes:
 81          - added: Step 6 - Configure swap memory (32GB)
 82          - creates: /swapfile with 32GB
 83          - configures: swappiness=10
 84          - makes_persistent: /etc/fstab entry
 85          - displays: swap status after configuration
 86        version: 1.1.0
 87  
 88    documentation_updated:
 89      testnet_setup:
 90        file: tools/TESTNET-SETUP.md
 91        changes:
 92          - updated: "What it does" section
 93          - added: swap configuration to setup steps
 94          - updated: "What's Protected" section
 95          - added: swap monitoring to recommendations
 96  
 97    mcp_tools_updated:
 98      setup_testnet_server:
 99        file: .claude/mcp/server.py
100        changes:
101          - updated: function docstring with swap step
102          - updated: tool description to mention swap
103          - updated: success message to include swap status
104  
105  configuration_details:
106    swap_settings:
107      size: 32GB
108      location: /swapfile
109      swappiness: 10
110      rationale: |
111        - 32GB provides buffer for 64GB RAM servers during compile spikes
112        - swappiness=10 prefers RAM but uses swap before OOM killer
113        - /swapfile easier to manage than partition-based swap
114  
115    oom_protection:
116      systemd_user: -1000  # Highest protection
117      tmux_sessions: -800  # High protection
118      validators: recommended (not automated yet)
119      rationale: |
120        - Negative oom_score_adj reduces likelihood of being killed
121        - systemd user must never be killed (terminates all sessions)
122        - tmux protects active development work
123        - Validator protection should be per-service (future work)
124  
125  deployment_requirements:
126    all_servers:
127      - swap_minimum: 16GB
128      - swap_recommended: 32GB
129      - swappiness: 10
130      - fstab_entry: required for persistence
131      - sysctl_conf: required for swappiness persistence
132  
133    ci_servers:
134      - swap_required: 32GB
135      - reason: parallel compilation + test execution
136      - oom_protection: systemd_user, tmux_sessions
137  
138    testnet_validators:
139      - swap_required: 16GB minimum, 32GB recommended
140      - reason: consensus load spikes, block processing bursts
141      - oom_protection: recommended for validator processes
142  
143  verification:
144    ci_server:
145      hostname: ci.ac-dc.network
146      swap_configured: 2026-01-23
147      size: 32GB
148      swappiness: 10
149      oom_protection_applied: tmux sessions
150      status: operational
151  
152    testnet_servers:
153      status: not_yet_applied
154      action_required: run setup-testnet-server.sh on all testnet nodes
155      servers_affected:
156        - testnet001.ac-dc.network
157        - testnet002.ac-dc.network
158        - testnet003.ac-dc.network
159        - testnet004.ac-dc.network
160        - testnet005.ac-dc.network
161  
162  files_modified:
163    - infra/machine/ci-infrastructure.cspec
164    - testnet-deployment-requirements.cspec
165    - infra/machine/deployment.cspec
166    - tools/setup-testnet-server.sh
167    - tools/TESTNET-SETUP.md
168    - .claude/mcp/server.py
169    - sessions/2026-01-23-swap-oom-protection-added.cspec
170  
171  testing:
172    manual_test:
173      command: sudo swapon --show && free -h && sysctl vm.swappiness
174      expected: 32GB swap active, swappiness=10
175      result: verified on ci.ac-dc.network
176  
177    stress_test_needed:
178      scenario: parallel cargo build across all repos
179      monitor: swap usage, no OOM events
180      status: future_validation
181  
182  monitoring:
183    recommended:
184      - swap_usage: should stay <10% under normal operation
185      - swap_usage_warning: alert if >50% for >5 minutes
186      - swap_usage_critical: alert if >80%
187      - oom_events: alert on any OOM killer activation
188      - memory_pressure: monitor PSI (pressure stall information)
189  
190  best_practices:
191    swap_sizing:
192      rule: 0.5x to 1x RAM size
193      ci_servers: 0.5x (32GB swap for 64GB RAM)
194      validators: 0.5x to 1x depending on load
195  
196    swappiness:
197      value: 10
198      reasoning: |
199        - 0 = swap only as a last resort (swap stays enabled; risks late OOM kills)
200        - 10 = prefer RAM, use swap as emergency buffer
201        - 60 = default (too aggressive swapping)
202        - 100 = swap aggressively (hurts performance)
203  
204    oom_protection:
205      critical_services: -1000 to -900
206      important_services: -800 to -500
207      normal_services: 0 (default)
208      expendable_services: +100 to +1000
209  
210  future_work:
211    validator_oom_protection:
212      action: add oom_score_adj to systemd service files
213      services:
214        - alphaos-validator.service
215        - deltaos-validator.service
216        - adnet.service
217      oom_score: -900 (high protection, not absolute)
218  
219    monitoring_alerts:
220      action: add Prometheus alerts for swap usage
221      thresholds:
222        warning: 50%
223        critical: 80%
224  
225    documentation:
226      action: add to runbooks
227      topics:
228        - monitoring swap usage
229        - investigating OOM events
230        - adjusting oom_score_adj
231  
232  lessons_learned:
233    no_swap_dangerous: |
234      Without swap, the OOM killer immediately terminates processes
235      during memory spikes. This can kill critical services like
236      user systemd, terminating all sessions instantly.
237  
238    systemd_user_critical: |
239      User systemd (PID 1610) manages all user sessions. If the OOM
240      killer terminates it, ALL tmux/screen/ssh sessions die.
241      Must be protected with oom_score_adj=-1000.
242  
243    swap_is_safety_net: |
244      Swap is not for performance, it's a safety net. With swap,
245      system slows down under pressure instead of crashing.
246      Better to swap than to kill processes.
247  
248    parallel_compilation_risk: |
249      Parallel Rust compilation (multiple rustc processes at 3-4GB
250      each) can consume 20-30GB+ RAM. CI servers need adequate
251      swap to handle these spikes.
252  
253  # status: completed  (duplicate of the top-level "status" key in the header; kept as a comment)
254  next_steps:
255    - Apply swap configuration to all 5 testnet servers
256    - Test setup-testnet-server.sh on fresh server
257    - Add validator OOM protection to systemd services
258    - Set up monitoring alerts for swap usage