worker-completion-contract.cspec
1 # Worker Completion Contract 2 # version: 1.7.0 3 # date: 2026-01-09 4 # updated: 2026-01-09 - Added SANDBOX POLICY: dangerouslyDisableSandbox:true on all Bash calls 5 # updated: 2026-01-09 - Added LAZY-LOAD RULE: Workers load CI commands only at PUSHED_TO_CI 6 # updated: 2026-01-09 - Added SUICIDE ON PERMISSION FAILURE rule (saves context on stuck workers) 7 # updated: 2026-01-09 - Added orchestrator before_spawn_checklist (incident fix) 8 # updated: 2026-01-09 - Added explicit radicle-push verification requirement 9 # updated: 2026-01-08 - Added checkpointing, timeouts, failure taxonomy, context handoff 10 # status: ACTIVE 11 # research_ref: See orchestrator pattern research in tasks/a86eef5.output 12 13 # === PURPOSE === 14 # Defines when a worker task is COMPLETE and milestone reporting requirements. 15 # Workers MUST follow this contract. Orchestrator MUST enforce it. 16 17 # === COMPLETION CRITERIA === 18 # A worker is NOT DONE until ALL of: 19 completion_requirements: 20 - code_committed: true 21 - pushed_to_forgejo: true 22 - ci_workflow_triggered: true 23 - ci_all_jobs_pass: true # check, format, audit, build, test, coverage 24 - radicle_push_job_success: true # MUST wait for radicle-push job to show "success" 25 26 # === RADICLE VERIFICATION (CRITICAL) === 27 # Workers MUST verify the radicle-push job specifically shows "success" 28 radicle_verification: 29 required_status: "success" 30 not_acceptable: ["skipped", "cancelled", "failure", "running", "waiting"] 31 32 why_skipped_not_ok: | 33 The radicle-push job may show "skipped" if: 34 - CI was triggered via workflow_dispatch instead of push 35 - Previous jobs in the chain failed 36 A "skipped" status means Radicle sync did NOT happen. 
37 38 if_radicle_skipped: 39 action: "Re-push to trigger a proper push event" 40 command: "git commit --allow-empty -m 'ci: trigger radicle sync' && git push" 41 then: "Wait for new CI run and verify radicle-push shows success" 42 43 verification_command: | 44 # Query CI tasks and find radicle-push/Radicle Sync job 45 curl -s -H "Authorization: token $FORGEJO_TOKEN" \ 46 "https://source.ac-dc.network/api/v1/repos/alpha-delta-network/{repo}/actions/tasks" | \ 47 python3 -c " 48 import json, sys 49 data = json.load(sys.stdin) 50 runs = data.get('workflow_runs', []) 51 # Find most recent radicle-push/Radicle Sync job for YOUR commit SHA 52 radicle_jobs = [r for r in runs if 'radicle' in r['name'].lower() or 'sync' in r['name'].lower()] 53 for job in sorted(radicle_jobs, key=lambda x: x['id'], reverse=True)[:3]: 54 print(f\"{job['name']}: {job['status']} (run #{job['run_number']}, sha={job['head_sha'][:8]})\") 55 " 56 57 # === MILESTONE STATES === 58 milestones: 59 WORKING: 60 description: "Worker is writing/modifying code" 61 report_format: "WORKING: {brief description of current task}" 62 example: "WORKING: Implementing AX token types in vm/types/ax.rs" 63 transitions_to: [PUSHED_TO_CI] 64 65 PUSHED_TO_CI: 66 description: "Code committed and pushed, awaiting CI result" 67 report_format: "PUSHED_TO_CI: {repo} commit {short_hash} - CI run #{run_id}" 68 example: "PUSHED_TO_CI: alphavm commit a1b2c3d - CI run #451" 69 transitions_to: [FIXING_CI, DONE] 70 required_info: 71 - repo_name 72 - commit_hash 73 - ci_run_id_or_url 74 75 FIXING_CI: 76 description: "CI failed, worker diagnosing and fixing" 77 report_format: "FIXING_CI: {failed_job} - {brief diagnosis}" 78 example: "FIXING_CI: test job - missing mock for CreditPool" 79 transitions_to: [PUSHED_TO_CI] 80 required_actions: 81 - fetch_ci_log 82 - diagnose_failure 83 - fix_code 84 - push_fix 85 - wait_for_new_ci_run 86 87 DONE: 88 description: "CI fully passed AND radicle-push job shows success" 89 report_format: "DONE: 
{repo} CI run #{run_id} all green, radicle-push: success" 90 example: "DONE: alphavm CI run #452 all green, radicle-push: success" 91 required_proof: 92 - ci_run_url_or_id 93 - all_jobs_passed: true 94 - radicle_push_job: success # MUST be "success", NOT "skipped" 95 96 not_done_if: 97 - radicle_push: "skipped" # Re-push needed to trigger radicle sync 98 - radicle_push: "cancelled" 99 - radicle_push: "failure" 100 - radicle_push: "running" # Still in progress 101 102 # === STATE MACHINE === 103 state_flow: 104 initial: WORKING 105 terminal: DONE 106 transitions: 107 - from: WORKING 108 to: PUSHED_TO_CI 109 trigger: git_push_complete 110 111 - from: PUSHED_TO_CI 112 to: FIXING_CI 113 trigger: ci_job_failed 114 115 - from: PUSHED_TO_CI 116 to: DONE 117 trigger: ci_all_passed # "all passed" INCLUDES radicle-push job status "success" (see radicle_verification / not_done_if) 118 119 - from: FIXING_CI 120 to: PUSHED_TO_CI 121 trigger: fix_pushed 122 123 # === SUICIDE ON PERMISSION FAILURE (CRITICAL) === 124 # Added: 2026-01-09 after Wave 3 incident where D004 worker got stuck in 107 bash permission denials 125 permission_failure_handling: 126 trigger_conditions: 127 - bash_permission_denied: "Permission to use Bash has been auto-denied" 128 - write_permission_denied: "'Permission denied' on Write/Edit tool" 129 - mkdir_permission_denied: "'Permission denied' on mkdir/file creation" 130 - repeated_failure: "Same tool denied 2+ times consecutively" 131 132 action: IMMEDIATE_TERMINATION 133 rationale: | 134 If a worker cannot execute bash commands or write files due to permission issues, 135 it CANNOT complete its task. Continuing will only waste context tokens in an 136 infinite retry loop. Worker should immediately terminate and report the issue. 
137 138 worker_behavior: 139 on_permission_denied: 140 1: "Report: BLOCKED_PERMISSION: {tool} - {error_message}" 141 2: "List files/operations attempted" 142 3: "TERMINATE IMMEDIATELY - do not retry" 143 4: "Do NOT attempt workarounds without explicit orchestrator guidance" 144 145 max_retries_before_suicide: 2 146 suicide_message: "TERMINATED: Permission failure on {tool}. Cannot proceed. Orchestrator must fix permissions or respawn with different instructions." 147 148 orchestrator_recovery: 149 on_worker_permission_suicide: 150 - diagnose_permission_issue: "Why did tool get denied?" 151 - options: 152 - fix_permissions: "If sandbox/permission config issue" 153 - respawn_with_alternatives: "Use Write/Edit instead of Bash mkdir" 154 - manual_intervention: "Create directories manually, then respawn" 155 - respawn_worker: "With updated instructions avoiding the blocked operation" 156 157 incident_reference: 158 date: 2026-01-09 159 wave: 3 160 worker: "D004 ada05fa" 161 symptom: "Worker stuck in loop trying mkdir 107 times" 162 cost: "~200k+ wasted tokens" 163 fix: "Respawned with explicit Write/Edit instructions (ac3120c)" 164 lesson: "Workers should suicide on permission failure, not retry infinitely" 165 166 # === SANDBOX POLICY (v1.1.0) === 167 # Added: 2026-01-09 - Skip permission prompts while preserving TERMINATE on denial 168 sandbox_policy: 169 principle: "Workers run unrestricted but TERMINATE on hard denial" 170 171 bash_calls: 172 parameter: "dangerouslyDisableSandbox: true" 173 apply_to: "ALL Bash tool invocations" 174 effect: "Skips interactive permission prompts" 175 176 behavior: 177 on_success: "Command executes normally" 178 on_hard_denial: "TERMINATE per permission_failure_handling rules" 179 rationale: | 180 Permission prompts interrupt workflow and burden the user. 181 Using dangerouslyDisableSandbox skips prompts but respects system-level denials. 182 If a command is truly blocked (not just prompting), worker should TERMINATE. 
183 184 worker_instruction: "BASH: Use dangerouslyDisableSandbox:true on all Bash calls" 185 186 # === LAZY-LOAD RULE (MANDATORY) === 187 # Added: 2026-01-09 - Token optimization for worker context 188 lazy_load_rule: 189 principle: "Workers load context ON DEMAND, not upfront" 190 rationale: "Saves ~50% tokens at spawn; CI context only needed after push" 191 192 mandatory_pattern: 193 at_spawn: 194 load: "worker-prompt-compact.cspec (~100 tokens)" 195 contains: "Task, scope, states, limits, permission rule" 196 does_not_contain: "CI commands, URLs, token generation" 197 198 at_PUSHED_TO_CI: 199 load: "infra/machine/commands/ci-worker.cspec (~150 tokens)" 200 contains: "Token gen, CI status checks, radicle verify, failure fixes" 201 trigger: "Worker reports PUSHED_TO_CI state" 202 203 at_FIXING_CI: 204 already_loaded: "ci-worker.cspec from previous state" 205 use: "on_failure section for fix commands" 206 207 files: 208 compact_prompt: "project/planning/worker-prompt-compact.cspec" 209 ci_commands: "infra/machine/commands/ci-worker.cspec" 210 rules_if_needed: "project/planning/worker-rules-minimal.cspec" 211 full_verbose: "project/planning/worker-completion-contract.cspec" 212 213 token_budget: 214 spawn_context: 100 # compact prompt only 215 ci_stage_addon: 150 # lazy-loaded at PUSHED_TO_CI 216 task_specific: 200 # file list, requirements 217 total_target: 450 # vs 800+ with verbose upfront 218 219 enforcement: 220 orchestrator_must: 221 - "Use compact_prompt template for all worker spawns" 222 - "NOT embed CI URLs/commands in spawn prompt" 223 - "Include lazy-load pointer: 'ON PUSHED_TO_CI: Load ci-worker.cspec'" 224 worker_must: 225 - "Load ci-worker.cspec when reaching PUSHED_TO_CI" 226 - "NOT ask orchestrator for CI commands" 227 - "Use runner logs (no token) before API (needs token)" 228 229 # === WORKER PROMPT TEMPLATE === 230 worker_prompt_template: | 231 TASK: Implement {component} in {repo} 232 233 MILESTONE REPORTING (report each state change): 234 - 
WORKING: {what you're doing} 235 - PUSHED_TO_CI: {repo} commit {hash} - CI run #{id} 236 - FIXING_CI: {failed_job} - {diagnosis} 237 - DONE: {repo} CI run #{id} all green, radicle-push: success 238 239 COMPLETION CONTRACT: 240 You are NOT DONE until CI fully passes. If CI fails: 241 1. Fetch the CI log 242 2. Diagnose the failure 243 3. Fix the code 244 4. Push the fix 245 5. Report: PUSHED_TO_CI with new commit 246 6. Wait for CI 247 7. REPEAT until all green 248 249 CI STATUS CHECK: 250 - Web: https://source.ac-dc.network/{org}/{repo}/actions 251 - API: curl -s "https://source.ac-dc.network/api/v1/repos/{org}/{repo}/actions/runs" 252 253 DO NOT report DONE until you have confirmed: 254 - All CI jobs passed (check, format, audit, build, test, coverage) 255 - radicle-push job status is "success" (NOT "skipped" or "cancelled") 256 257 IF radicle-push shows "skipped": 258 - This means Radicle sync did NOT happen 259 - Run: git commit --allow-empty -m "ci: trigger radicle sync" && git push 260 - Wait for new CI run 261 - Verify radicle-push shows "success" before reporting DONE 262 263 # === ORCHESTRATOR ENFORCEMENT === 264 orchestrator_rules: 265 # CRITICAL: Orchestrator MUST follow these before spawning ANY worker 266 before_spawn_checklist: 267 - use_compact_template: "Use compact_worker_template from worker-prompt-compact.cspec" 268 - add_task_specifics: "Add FILES TO CREATE, TECHNICAL REQUIREMENTS" 269 - include_lazy_load_pointer: "ON PUSHED_TO_CI: Load ci-worker.cspec" 270 - never_embed_ci: "Do NOT include CI URLs, commands, or token info in spawn prompt" 271 272 # DEPRECATED (2026-01-09): Do NOT use verbose templates 273 deprecated: 274 - enhanced_worker_prompt_template: "Too verbose (~400 tokens). 
Use compact (~100 tokens)" 275 - full_ci_embedding: "Workers lazy-load ci-worker.cspec when needed" 276 277 # Evolution of rules: 278 # - 2026-01-08: Created enhanced_worker_prompt_template (verbose) 279 # - 2026-01-09 AM: Workers wasted tokens on CI loops, permission retries 280 # - 2026-01-09 PM: Adopted compact_template + lazy-load CI rule 281 282 after_spawn: 283 - reject_done_without_ci_proof: true 284 - require_milestone_reports: true 285 - no_awaiting_ci_bucket: true # workers own CI, not orchestrator 286 - validate_done_report: 287 must_include: [ci_run_id, all_jobs_status, radicle_sync_status] 288 289 # === REPOSITORY DEPENDENCY GRAPH === 290 repo_dependencies: 291 # Format: repo -> [repos it depends on] 292 acdc-core: [] # ROOT - no dependencies 293 alphavm: [acdc-core] 294 deltavm: [acdc-core] 295 adnet: [alphavm, deltavm, acdc-core] 296 alphaos: [alphavm, acdc-core] 297 deltaos: [deltavm, acdc-core] 298 ac-dc: [adnet] # installer depends on binary 299 adl: [acdc-core] 300 adl-examples: [adl] 301 sdk: [] # TypeScript - independent 302 303 repo_dependents: 304 # Format: repo -> [repos that depend on it] (inverse of above) 305 acdc-core: [alphavm, deltavm, adnet, alphaos, deltaos, adl] 306 alphavm: [adnet, alphaos] 307 deltavm: [adnet, deltaos] 308 adnet: [ac-dc] 309 alphaos: [] 310 deltaos: [] 311 ac-dc: [] 312 adl: [adl-examples] 313 adl-examples: [] 314 sdk: [] 315 316 # === DEPENDENCY LOCKING RULES === 317 dependency_locking: 318 principle: | 319 Never have two workers that could conflict via dependencies. 320 A worker locks its repo AND the entire dependency chain (up and down). 
321 322 rules: 323 - name: no_downstream_while_upstream_active 324 description: "Don't start worker on repo X if any dependency of X has an active worker" 325 example: "Don't start adnet worker while alphavm worker is active" 326 reason: "adnet CI would fail or use stale alphavm" 327 328 - name: no_upstream_while_downstream_active 329 description: "Don't start worker on repo X if any repo depending on X has an active worker" 330 example: "Don't start alphavm worker while adnet worker is active" 331 reason: "alphavm changes would break in-flight adnet work" 332 333 lock_acquisition: 334 before_spawn: 335 - get_repo_dependencies: "all repos this repo depends on" 336 - get_repo_dependents: "all repos that depend on this repo" 337 - check_active_workers: "any worker active on dependencies or dependents?" 338 - if_conflict: WAIT 339 - if_clear: SPAWN_AND_LOCK 340 341 lock_release: 342 on_worker_done: 343 - worker_reports_DONE: true 344 - ci_confirmed_green: true 345 - release_lock: "repo and chain now available" 346 347 # === ORCHESTRATOR WORKER REGISTRY === 348 worker_registry: 349 purpose: "Track active workers and their repos" 350 structure: 351 active_workers: 352 # worker_id: { repo, state, milestone } 353 active_repos: 354 # Set of repos with active workers 355 356 operations: 357 can_spawn: 358 input: repo_name 359 logic: | 360 # Check 1: Is this repo already being worked on? 361 if repo_name in active_repos: return false 362 363 # Check 2: Is any of my DEPENDENCIES being worked on? 364 # (Can't build on unstable foundation) 365 for dep in repo_dependencies[repo_name]: 366 if dep in active_repos: return false 367 368 # Check 3: Is any of my DEPENDENTS being worked on? 
369 # (Can't change what someone else is building on) 370 for dependent in repo_dependents[repo_name]: 371 if dependent in active_repos: return false 372 373 return true 374 375 # KEY INSIGHT: Siblings with shared deps CAN run in parallel 376 # alphavm and deltavm both depend on acdc-core 377 # If alphavm worker is active, deltavm CAN spawn because: 378 # - deltavm's deps: [acdc-core] - not active 379 # - deltavm's dependents: [adnet, deltaos] - not active 380 # - They don't depend on each other 381 382 spawn_worker: 383 input: repo_name, task 384 precondition: can_spawn(repo_name) == true 385 steps: 386 - worker_id = generate_id() 387 - active_repos.add(repo_name) 388 - active_workers[worker_id] = { repo: repo_name, state: WORKING } 389 - return worker_id 390 391 update_milestone: 392 input: worker_id, new_state 393 steps: 394 - active_workers[worker_id].state = new_state 395 # States: WORKING, PUSHED_TO_CI, FIXING_CI, DONE 396 397 release_worker: 398 input: worker_id 399 precondition: worker reported DONE with CI proof 400 steps: 401 - repo = active_workers[worker_id].repo 402 - active_repos.remove(repo) 403 - delete active_workers[worker_id] 404 405 # === PARALLEL SPAWN RULES === 406 parallel_rules: 407 can_parallel: 408 - [alphavm, deltavm] # siblings, both depend on acdc-core 409 - [alphavm, adl] # siblings, both depend on acdc-core 410 - [deltavm, adl] # siblings, both depend on acdc-core 411 - [alphaos, deltaos] # no shared deps except acdc-core (if alphavm/deltavm done) 412 - [sdk, anything] # sdk is independent 413 414 cannot_parallel: 415 - [alphavm, adnet] # adnet depends on alphavm 416 - [alphavm, alphaos] # alphaos depends on alphavm 417 - [deltavm, adnet] # adnet depends on deltavm 418 - [deltavm, deltaos] # deltaos depends on deltavm 419 - [acdc-core, alphavm] # alphavm depends on acdc-core 420 - [acdc-core, deltavm] # deltavm depends on acdc-core 421 - [adnet, ac-dc] # ac-dc depends on adnet 422 423 # === SPAWN ORDER STRATEGY === 424 spawn_strategy: 
425 principle: "Process dependency graph roots first (dependencies before dependents)" 426 427 phases: 428 phase_1_roots: 429 repos: [acdc-core, sdk] 430 can_parallel: true 431 reason: "No dependencies, safe to work together" 432 433 phase_2_middle: 434 repos: [alphavm, deltavm, adl] 435 can_parallel: true # siblings: all depend on acdc-core but not on each other (see parallel_rules.can_parallel); wait_for guarantees acdc-core is done 436 wait_for: phase_1_roots 437 strategy: "Sequential OR parallel if acdc-core done" 438 439 phase_3_integration: 440 repos: [adnet, alphaos, deltaos, adl-examples] 441 can_parallel: false # complex dependencies 442 wait_for: phase_2_middle 443 strategy: "Sequential based on which deps are done" 444 445 phase_4_tooling: 446 repos: [ac-dc] 447 wait_for: phase_3_integration 448 reason: "Depends on adnet binary" 449 450 # === ANTI-PATTERNS === 451 forbidden: 452 - worker_exits_after_push_before_ci: "VIOLATION - must wait for CI" 453 - orchestrator_tracks_awaiting_ci: "VIOLATION - worker's responsibility" 454 - done_without_ci_url: "VIOLATION - must prove CI passed" 455 - assuming_radicle_synced: "VIOLATION - must verify radicle-push job" 456 - parallel_dependent_workers: "VIOLATION - locks prevent this" 457 - upstream_worker_while_downstream_active: "VIOLATION - would break in-flight work" 458 - downstream_worker_while_upstream_active: "VIOLATION - CI would fail on stale deps" 459 - querying_all_ci_runs: "VIOLATION - only track YOUR commit's run by SHA" 460 - caching_old_run_ids: "VIOLATION - forget old runs, track only current" 461 - push_all_repos_at_once: "VIOLATION - must follow dependency order" 462 - triggering_dependent_ci_before_dependency_green: "VIOLATION - wait for deps to pass first" 463 - infinite_permission_retry_loop: "VIOLATION - suicide after 2 permission denials, don't waste 200k+ tokens" 464 - embedding_ci_context_at_spawn: "VIOLATION - use lazy-load rule, workers load ci-worker.cspec at PUSHED_TO_CI" 465 - verbose_worker_prompt: "VIOLATION - use compact_prompt (~100 tokens), not verbose (~400 tokens)" 466 467 # === DEPENDENCY DIAGRAM === 468 
# Visual representation of repo dependencies 469 # 470 # ┌─────────┐ 471 # │ sdk │ (independent) 472 # └─────────┘ 473 # 474 # ┌───────────┐ 475 # │ acdc-core │ (ROOT) 476 # └─────┬─────┘ 477 # ┌───────────┬───┴───┬───────────┐ 478 # ▼ ▼ ▼ ▼ 479 # ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────┐ 480 # │ alphavm │ │ deltavm │ │ adl │ │ │ 481 # └────┬────┘ └────┬────┘ └────┬────┘ │ │ 482 # │ │ │ │ │ 483 # ┌────┴────┐ ┌────┴────┐ ┌────┴─────┐│ │ 484 # │ alphaos │ │ deltaos │ │adl-examples│ │ 485 # └─────────┘ └─────────┘ └──────────┘│ │ 486 # │ │ │ │ 487 # └─────┬─────┘ │ │ 488 # ▼ │ │ 489 # ┌─────────┐ │ │ 490 # │ adnet │◄───────────────────┘ │ 491 # └────┬────┘ │ 492 # ▼ │ 493 # ┌─────────┐ │ 494 # │ ac-dc │ │ 495 # └─────────┘ │ 496 # 497 # SPAWN BLOCKING EXAMPLE: 498 # If worker is active on alphavm: 499 # active_repos = [alphavm] 500 # 501 # can_spawn(deltavm)? 502 # - deltavm in active? NO 503 # - deltavm deps [acdc-core] in active? NO 504 # - deltavm dependents [adnet, deltaos] in active? NO 505 # - RESULT: YES - can spawn in parallel! 506 # 507 # can_spawn(adnet)? 508 # - adnet in active? NO 509 # - adnet deps [alphavm, deltavm, acdc-core] in active? YES (alphavm) 510 # - RESULT: NO - blocked until alphavm worker completes 511 # 512 # can_spawn(acdc-core)? 513 # - acdc-core in active? NO 514 # - acdc-core deps [] in active? NO 515 # - acdc-core dependents [alphavm, ...] in active? YES (alphavm) 516 # - RESULT: NO - blocked (can't change dep of in-flight work) 517 518 # === CI RUN TRACKING PROTOCOL === 519 ci_tracking: 520 principle: | 521 Track ONLY the CI run for YOUR commit. Forget all historical runs. 522 Never query for "all runs" - query for YOUR run by commit SHA. 
523 524 on_push: 525 steps: 526 - push_commit: "git push origin main" 527 - capture_sha: "SHA=$(git rev-parse HEAD)" 528 - store_tracking: "tracked_runs[repo] = { sha: SHA, run_id: null, status: PENDING }" 529 - poll_for_run: "Query API until run with head_sha == SHA appears" 530 - capture_run_id: "tracked_runs[repo].run_id = found_run_id" 531 532 query_my_run: 533 correct: | 534 # Query runs, find the one matching MY commit SHA 535 TOKEN="..." 536 SHA=$(git rev-parse HEAD) 537 curl -s -H "Authorization: token $TOKEN" \ 538 "https://source.ac-dc.network/api/v1/repos/alpha-delta-network/{repo}/actions/runs" | \ 539 jq '.workflow_runs[] | select(.head_sha == "'$SHA'")' 540 541 wrong: | 542 # DON'T just get "latest runs" - they may not be yours 543 curl ... "/actions/runs?limit=10" # WRONG - returns stale runs 544 545 tracking_state: 546 structure: 547 tracked_runs: 548 # repo_name: { sha, run_id, status, last_checked } 549 550 cleanup: 551 - on_done: "Remove from tracked_runs" 552 - on_new_push: "Replace old tracking with new SHA" 553 - never: "Accumulate old run IDs" 554 555 # === FORGEJO INFRASTRUCTURE ACCESS === 556 infrastructure: 557 forgejo_server: 558 address: "10.106.0.2" 559 ssh: "ssh devops@10.106.0.2" 560 service: "forgejo.service" 561 db_path: "/var/lib/forgejo/forgejo.db" 562 logs: "journalctl -u forgejo --no-pager" 563 564 ci_runners: 565 location: "localhost (same machine as orchestrator)" 566 services: "forgejo-runner-{1..6}.service" 567 count: 6 568 check_status: "systemctl status forgejo-runner-{1..6}" 569 check_logs: "journalctl -u forgejo-runner-1 --no-pager -n 50" 570 check_all_activity: "journalctl -u 'forgejo-runner-*' --since '5 minutes ago'" 571 572 debugging: 573 runner_not_picking_up_jobs: 574 - check: "systemctl status forgejo-runner-*" 575 - check: "journalctl -u forgejo-runner-1 -n 50" 576 - look_for: "502 Bad Gateway (Forgejo down)" 577 - look_for: "task XXXX repo is..." 
(runner is working) 578 579 forgejo_restart_recovery: 580 - symptom: "502 Bad Gateway in runner logs" 581 - cause: "Forgejo service restarted" 582 - recovery: "Wait for Forgejo to stabilize, runners auto-reconnect" 583 - verify: "journalctl -u forgejo-runner-1 | grep 'task.*repo is'" 584 585 find_my_ci_run: 586 - check_runner_logs: "journalctl -u 'forgejo-runner-*' | grep {repo_name}" 587 - find_task_id: "Look for 'task XXXX repo is alpha-delta-network/{repo}'" 588 - verify_sha: "Cross-reference with commit SHA" 589 590 # === CI RUN STATE MACHINE === 591 ci_run_states: 592 waiting: "Run queued, not yet picked up by runner" 593 running: "Runner is executing the workflow" 594 success: "All jobs passed" 595 failure: "One or more jobs failed" 596 cancelled: "Run was cancelled" 597 598 transitions: 599 waiting -> running: "Runner picks up job" 600 running -> success: "All jobs complete successfully" 601 running -> failure: "Any job fails" 602 running -> cancelled: "User or system cancels" 603 waiting -> cancelled: "Cancelled before starting" 604 605 # === ERROR HANDLING === 606 error_handling: 607 transient_errors: 608 502_bad_gateway: 609 cause: "Forgejo service temporarily unavailable" 610 action: "Wait and retry - runners auto-recover" 611 do_not: "Panic and re-trigger runs" 612 613 api_returns_empty_sha: 614 cause: "Forgejo API sometimes doesn't populate head_sha" 615 action: "Check runner logs directly for task info" 616 fallback: "Match by repo name and recent timestamp" 617 618 permanent_errors: 619 ci_job_failed: 620 action: "Worker must diagnose and fix" 621 do_not: "Mark as done, ignore, or escalate to orchestrator" 622 623 runner_offline: 624 check: "systemctl status forgejo-runner-*" 625 action: "Restart runner service" 626 command: "sudo systemctl restart forgejo-runner-{N}" 627 628 # === EXAMPLE WORKER SESSION === 629 example_session: 630 - report: "WORKING: Setting up fee calculation module in adnet/crates/adnet-runtime/src/fees/" 631 - report: "WORKING: 
Implementing BaseFee and DynamicFee structs" 632 - report: "PUSHED_TO_CI: adnet commit 7f3a2b1 - CI run #89" 633 - wait: "CI running..." 634 - report: "FIXING_CI: test job - assertion failed in fee_calculation_test" 635 - report: "PUSHED_TO_CI: adnet commit 8e4b3c2 - CI run #90" 636 - wait: "CI running..." 637 - report: "DONE: adnet CI run #90 all green, radicle-push: success" 638 639 # ============================================================ 640 # ENHANCEMENTS FROM ORCHESTRATOR PATTERN RESEARCH (2026-01-08) 641 # Based on: Anthropic, LangGraph, ccswarm, Claude-Flow patterns 642 # ============================================================ 643 644 # === CHECKPOINTING === 645 # Enables recovery from orchestrator/worker crashes 646 checkpointing: 647 enabled: true 648 storage_path: "sessions/{date}-{worker_id}.checkpoint.cspec" 649 650 checkpoint_triggers: 651 - milestone_transition # WORKING -> PUSHED_TO_CI, etc. 652 - commit_pushed # After each git push 653 - ci_result_received # After CI status update 654 - periodic_interval: 5m # Every 5 minutes during long operations 655 656 checkpoint_content: 657 required: 658 - worker_id 659 - repo_name 660 - component_id 661 - current_milestone 662 - timestamp 663 tracking: 664 - commits: ["sha1", "sha2"] # All commits made 665 - ci_runs: [{id, status, jobs_passed}] 666 - files_modified: ["path1", "path2"] 667 optional: 668 - context_summary: "Compressed state description" 669 - blockers: ["blocker1"] 670 - decisions_made: ["decision1"] 671 672 resumption: 673 on_orchestrator_restart: 674 steps: 675 - load_all_checkpoints_from: "sessions/" 676 - for_each_active_worker: 677 - check_ci_status: "May have completed while down" 678 - if_ci_passed: mark_done # only when the radicle-push job is also "success"; otherwise resume_at_PUSHED_TO_CI 679 - if_ci_failed: resume_at_FIXING_CI 680 - if_ci_running: resume_at_PUSHED_TO_CI 681 - if_no_ci_run: resume_at_WORKING 682 683 on_worker_crash: 684 steps: 685 - load_last_checkpoint 686 - spawn_replacement_worker 687 - inject_checkpoint_context 688 - 
resume_from_last_milestone 689 690 # === TIMEOUT HANDLING === 691 # Prevents stuck workers from blocking pipeline 692 timeouts: 693 default_worker_timeout: 30m 694 695 per_milestone: 696 WORKING: 697 timeout: 20m 698 warning_at: 15m 699 on_timeout: escalate 700 701 PUSHED_TO_CI: 702 timeout: 15m # CI should complete within this 703 warning_at: 10m 704 on_timeout: check_ci_then_escalate 705 706 FIXING_CI: 707 timeout: 15m # Per fix attempt 708 max_attempts: 5 709 warning_at: 10m 710 on_timeout: escalate_after_max_attempts 711 712 on_timeout_actions: 713 escalate: 714 - notify_orchestrator: true 715 - message: "Worker {worker_id} timed out in {milestone}" 716 - options: 717 - extend_if_active: "Worker showing progress in last 5m" 718 - terminate_if_stuck: "No progress detected" 719 - reassign_task: "Critical path, spawn new worker" 720 721 check_ci_then_escalate: 722 - check_ci_api: true 723 - if_ci_still_running: extend_timeout 724 - if_ci_stuck: restart_ci_and_wait 725 - if_ci_finished: transition_milestone 726 727 deadlock_detection: 728 check_interval: 5m 729 indicators: 730 - worker_idle_time: ">10m" 731 - ci_status: "stuck_in_pending" 732 - no_commits: ">15m after WORKING start" 733 - api_errors: ">3 consecutive failures" 734 735 # === CONTEXT HANDOFF === 736 # Formal specification for worker initialization 737 context_handoff: 738 orchestrator_provides: 739 required: 740 - objective: "Clear task description" 741 - component_id: "A001, D004, etc." 
742 - repo: "Target repository" 743 - branch_strategy: "feature_branch or main" 744 - plan_ref: "Path to .plan.cspec file" 745 746 recommended: 747 - output_format: "Expected deliverable structure" 748 - task_boundaries: "What IS and IS NOT in scope" 749 - related_context: 750 - component_specs: "Relevant .cspec files" 751 - recent_changes: "Last 3 commits in dep chain" 752 - known_issues: "Current blockers/gotchas" 753 - parallel_workers: "Other workers currently active" 754 755 context_budget: 756 max_tokens: 50000 # Keep worker context lean 757 priority_order: 758 1: task_objective_and_plan 759 2: relevant_code_files 760 3: test_files 761 4: spec_files 762 5: historical_context # Only if space permits 763 764 worker_returns: 765 on_completion: 766 - files_created: ["list"] 767 - files_modified: ["list"] 768 - tests_added: ["list"] 769 - ci_run_id: "final passing run" 770 - summary: "Brief description of changes" 771 772 # === FAILURE TAXONOMY === 773 # Structured error classification for appropriate handling 774 failure_taxonomy: 775 transient: 776 description: "Temporary issues that resolve with retry" 777 examples: 778 - network_timeout 779 - api_rate_limit 780 - ci_runner_busy 781 - 502_bad_gateway 782 action: retry_with_backoff 783 backoff: 784 initial: 5s 785 max: 60s 786 multiplier: 2 787 max_retries: 3 788 789 recoverable: 790 description: "Worker can fix without human help" 791 examples: 792 - test_failure 793 - lint_error 794 - build_error 795 - clippy_warning 796 - missing_import 797 action: worker_must_fix 798 escalate_after: 3_attempts 799 max_attempts: 5 800 801 blocking: 802 description: "Cannot proceed without external resolution" 803 examples: 804 - dependency_repo_broken 805 - infrastructure_down 806 - missing_credentials 807 - merge_conflict_with_main 808 action: pause_and_escalate 809 notify: orchestrator_immediately 810 worker_state: BLOCKED # New state for blocking issues 811 812 permission_denied: 813 description: "Worker cannot execute 
tools due to permission/sandbox issues" 814 examples: 815 - bash_permission_denied: "Permission to use Bash has been auto-denied" 816 - write_tool_denied: "Permission denied on file write" 817 - mkdir_denied: "Cannot create directory" 818 action: IMMEDIATE_SUICIDE 819 max_retries: 2 820 do_not: "Loop infinitely trying the same operation" 821 rationale: "Saves context tokens - worker cannot complete task without permissions" 822 report: "BLOCKED_PERMISSION: {tool} - {error}" 823 recovery: "Orchestrator diagnoses, fixes permissions or respawns with alternatives" 824 825 fatal: 826 description: "Serious issues requiring human intervention" 827 examples: 828 - security_vulnerability_introduced 829 - data_corruption_detected 830 - api_key_exposed_in_commit 831 - breaking_change_to_stable_interface 832 action: halt_all_workers_in_chain 833 require: human_intervention 834 rollback: consider_revert 835 836 # === CASCADE FAILURE PROTECTION === 837 cascade_protection: 838 on_upstream_failure: 839 description: "Dependency repo CI failed while downstream worker active" 840 detection: "Monitor CI status of locked dependency repos" 841 action: 842 - pause_downstream_workers: true 843 - notify: "Dependency {repo} failed, pausing until resolved" 844 - do_not: "Let downstream workers continue with broken dep" 845 846 on_downstream_breakage: 847 description: "Upstream change broke downstream repo" 848 detection: "Downstream CI failure after upstream push" 849 action: 850 - identify_breaking_commit: true 851 - options: 852 - revert_upstream_commit: "If isolated change" 853 - spawn_fix_worker_for_downstream: "If complex" 854 - coordinate_fix: "If both need changes" 855 856 # === MODEL ASSIGNMENT === 857 # Different models for different roles (cost/capability optimization) 858 model_assignment: 859 orchestrator: 860 model: "claude-opus-4" 861 role: "Task decomposition, conflict resolution, synthesis" 862 use_for: 863 - planning_and_decomposition 864 - dependency_analysis 865 - 
conflict_resolution 866 - final_review_and_synthesis 867 - complex_architectural_decisions 868 869 workers: 870 model: "claude-sonnet-4" 871 role: "Implementation, CI fixes, focused tasks" 872 use_for: 873 - code_implementation 874 - test_writing 875 - ci_fix_diagnosis 876 - documentation_updates 877 rationale: "Faster, cheaper, sufficient for focused implementation tasks" 878 879 override_to_opus: 880 conditions: 881 - task_complexity: "high" 882 - critical_path: true 883 - security_sensitive: true 884 - cross_repo_coordination: true 885 886 # === DETAILED PROGRESS SUBSTATES === 887 # Granular tracking within milestones for observability 888 detailed_progress: 889 WORKING: 890 substates: 891 - analyzing_requirements 892 - reading_existing_code 893 - writing_new_code 894 - writing_tests 895 - local_verification 896 - preparing_commit 897 progress_indicator: "files_modified / estimated_files" 898 report_substates: optional # Can include in WORKING reports 899 900 PUSHED_TO_CI: 901 substates: 902 - push_complete 903 - ci_queued 904 - ci_running: 905 current_job: "build" 906 jobs_passed: 2 907 jobs_total: 6 908 - ci_finishing 909 progress_indicator: "jobs_passed / total_jobs" 910 911 FIXING_CI: 912 substates: 913 - fetching_logs 914 - diagnosing: 915 error_category: "test_failure" 916 affected_files: ["src/foo.rs"] 917 - implementing_fix 918 - local_test 919 - preparing_fix_commit 920 attempt_count: 1 921 max_attempts: 5 922 progress_indicator: "diagnosis_complete / fix_applied / verified" 923 924 # === ENHANCED WORKER PROMPT TEMPLATE === 925 # DEPRECATED: Use compact_worker_template from worker-prompt-compact.cspec instead 926 # Kept for reference/debugging only. ~400 tokens vs ~100 for compact. 927 # See: lazy_load_rule section above for current standard. 
928 enhanced_worker_prompt_template_DEPRECATED: | 929 TASK: Implement {component} in {repo} 930 PLAN: {plan_ref} 931 BRANCH: feat/{component_id}-{worker_id} 932 933 CONTEXT: 934 - Component spec: {component_spec_path} 935 - Dependencies: {upstream_deps} 936 - Consumers: {downstream_deps} 937 - Parallel workers: {active_workers} 938 939 SCOPE: 940 - IN SCOPE: {in_scope_items} 941 - OUT OF SCOPE: {out_of_scope_items} 942 943 MILESTONE REPORTING (report each state change): 944 - WORKING: {what you're doing} [substate: {substate}] 945 - PUSHED_TO_CI: {repo} commit {hash} - CI run #{id} 946 - FIXING_CI: {failed_job} - {diagnosis} [attempt: {n}/{max}] 947 - DONE: {repo} CI run #{id} all green, Radicle synced 948 949 COMPLETION CONTRACT: 950 You are NOT DONE until CI fully passes. If CI fails: 951 1. Fetch the CI log 952 2. Classify failure: transient | recoverable | blocking | fatal 953 3. If transient: retry with backoff 954 4. If recoverable: diagnose, fix, push, report FIXING_CI 955 5. If blocking: report BLOCKED with details 956 6. If fatal: HALT and escalate immediately 957 958 CHECKPOINTING: 959 Your state is checkpointed on each milestone transition. 960 If interrupted, orchestrator will resume from last checkpoint. 961 962 TIMEOUTS: 963 - WORKING: 20m max 964 - PUSHED_TO_CI: 15m max (CI completion) 965 - FIXING_CI: 15m per attempt, 5 attempts max 966 967 PERMISSION FAILURE - SUICIDE RULE (CRITICAL): 968 If you encounter permission denied errors on Bash, Write, or Edit tools: 969 1. DO NOT retry more than 2 times 970 2. Report: "BLOCKED_PERMISSION: {tool} - {error}" 971 3. TERMINATE IMMEDIATELY 972 4. Do NOT loop trying the same operation 973 Rationale: Infinite retries waste tokens. Let orchestrator fix permissions and respawn. 
974 975 CI STATUS CHECK: 976 - Web: https://source.ac-dc.network/{org}/{repo}/actions 977 - API: curl -s "https://source.ac-dc.network/api/v1/repos/{org}/{repo}/actions/runs" 978 979 DO NOT report DONE until you have confirmed: 980 - All CI jobs passed (check, format, audit, build, test, coverage) 981 - radicle-push job status is "success" (NOT "skipped" or "cancelled") 982 983 IF radicle-push shows "skipped": 984 - This means Radicle sync did NOT happen 985 - Run: git commit --allow-empty -m "ci: trigger radicle sync" && git push 986 - Wait for new CI run 987 - Verify radicle-push shows "success" before reporting DONE 988 989 # === RESEARCH SOURCES === 990 # Enhancements based on industry research (2026-01-08) 991 research_sources: 992 anthropic: 993 - url: "https://www.anthropic.com/research/building-effective-agents" 994 insight: "Orchestrator-worker pattern for complex tasks" 995 - url: "https://www.anthropic.com/engineering/multi-agent-research-system" 996 insight: "Context management, model differentiation" 997 998 frameworks: 999 - name: LangGraph 1000 insight: "Checkpointing, state persistence, durable execution" 1001 - name: ccswarm 1002 insight: "Git worktree isolation, session persistence" 1003 - name: Claude-Flow 1004 insight: "Specialized worker pools, failure recovery" 1005 1006 patterns: 1007 - url: "https://learn.microsoft.com/en-us/azure/architecture/ai-ml/guide/ai-agent-design-patterns" 1008 insight: "Agent isolation, compute separation" 1009 - url: "https://galileo.ai/blog/multi-agent-coordination-strategies" 1010 insight: "Lock management, conflict avoidance"