Cradicle Explorer

/ project / planning / worker-completion-contract.cspec
worker-completion-contract.cspec
   1  # Worker Completion Contract
   2  # version: 1.7.0
   3  # date: 2026-01-09
   4  # updated: 2026-01-09 - Added SANDBOX POLICY: dangerouslyDisableSandbox:true on all Bash calls
   5  # updated: 2026-01-09 - Added LAZY-LOAD RULE: Workers load CI commands only at PUSHED_TO_CI
   6  # updated: 2026-01-09 - Added SUICIDE ON PERMISSION FAILURE rule (saves context on stuck workers)
   7  # updated: 2026-01-09 - Added orchestrator before_spawn_checklist (incident fix)
   8  # updated: 2026-01-09 - Added explicit radicle-push verification requirement
   9  # updated: 2026-01-08 - Added checkpointing, timeouts, failure taxonomy, context handoff
  10  # status: ACTIVE
  11  # research_ref: See orchestrator pattern research in tasks/a86eef5.output
  12  
  13  # === PURPOSE ===
  14  # Defines when a worker task is COMPLETE and milestone reporting requirements.
  15  # Workers MUST follow this contract. Orchestrator MUST enforce it.
  16  
  17  # === COMPLETION CRITERIA ===
  18  # A worker is NOT DONE until ALL of:
  19  completion_requirements:
  20    - code_committed: true
  21    - pushed_to_forgejo: true
  22    - ci_workflow_triggered: true
  23    - ci_all_jobs_pass: true  # check, format, audit, build, test, coverage
  24    - radicle_push_job_success: true  # MUST wait for radicle-push job to show "success"
  25  
  26  # === RADICLE VERIFICATION (CRITICAL) ===
  27  # Workers MUST verify the radicle-push job specifically shows "success"
  28  radicle_verification:
  29    required_status: "success"
  30    not_acceptable: ["skipped", "cancelled", "failure", "running", "waiting"]
  31  
  32    why_skipped_not_ok: |
  33      The radicle-push job may show "skipped" if:
  34      - CI was triggered via workflow_dispatch instead of push
  35      - Previous jobs in the chain failed
  36      A "skipped" status means Radicle sync did NOT happen.
  37  
  38    if_radicle_skipped:
  39      action: "Re-push to trigger a proper push event"
  40      command: "git commit --allow-empty -m 'ci: trigger radicle sync' && git push"
  41      then: "Wait for new CI run and verify radicle-push shows success"
  42  
  43    verification_command: |
  44      # Query CI tasks and find radicle-push/Radicle Sync job
  45      curl -s -H "Authorization: token $FORGEJO_TOKEN" \
  46        "https://source.ac-dc.network/api/v1/repos/alpha-delta-network/{repo}/actions/tasks" | \
  47        python3 -c "
  48      import json, sys
  49      data = json.load(sys.stdin)
  50      runs = data.get('workflow_runs', [])
  51      # List the most recent radicle-push/Radicle Sync jobs (no SHA filter is applied) - the sha printed below MUST match YOUR commit SHA
  52      radicle_jobs = [r for r in runs if 'radicle' in r['name'].lower() or 'sync' in r['name'].lower()]
  53      for job in sorted(radicle_jobs, key=lambda x: x['id'], reverse=True)[:3]:
  54          print(f\"{job['name']}: {job['status']} (run #{job['run_number']}, sha={job['head_sha'][:8]})\")
  55      "
  56  
  57  # === MILESTONE STATES ===
  58  milestones:
  59    WORKING:
  60      description: "Worker is writing/modifying code"
  61      report_format: "WORKING: {brief description of current task}"
  62      example: "WORKING: Implementing AX token types in vm/types/ax.rs"
  63      transitions_to: [PUSHED_TO_CI]
  64  
  65    PUSHED_TO_CI:
  66      description: "Code committed and pushed, awaiting CI result"
  67      report_format: "PUSHED_TO_CI: {repo} commit {short_hash} - CI run #{run_id}"
  68      example: "PUSHED_TO_CI: alphavm commit a1b2c3d - CI run #451"
  69      transitions_to: [FIXING_CI, DONE]
  70      required_info:
  71        - repo_name
  72        - commit_hash
  73        - ci_run_id_or_url
  74  
  75    FIXING_CI:
  76      description: "CI failed, worker diagnosing and fixing"
  77      report_format: "FIXING_CI: {failed_job} - {brief diagnosis}"
  78      example: "FIXING_CI: test job - missing mock for CreditPool"
  79      transitions_to: [PUSHED_TO_CI]
  80      required_actions:
  81        - fetch_ci_log
  82        - diagnose_failure
  83        - fix_code
  84        - push_fix
  85        - wait_for_new_ci_run
  86  
  87    DONE:
  88      description: "CI fully passed AND radicle-push job shows success"
  89      report_format: "DONE: {repo} CI run #{run_id} all green, radicle-push: success"
  90      example: "DONE: alphavm CI run #452 all green, radicle-push: success"
  91      required_proof:
  92        - ci_run_url_or_id
  93        - all_jobs_passed: true
  94        - radicle_push_job: success  # MUST be "success", NOT "skipped"
  95  
  96      not_done_if:
  97        - radicle_push: "skipped"   # Re-push needed to trigger radicle sync
  98        - radicle_push: "cancelled"
  99        - radicle_push: "failure"
 100        - radicle_push: "running"  # Still in progress ("waiting" is likewise not acceptable)
 101  
 102  # === STATE MACHINE ===
 103  state_flow:
 104    initial: WORKING
 105    terminal: DONE
 106    transitions:
 107      - from: WORKING
 108        to: PUSHED_TO_CI
 109        trigger: git_push_complete
 110  
 111      - from: PUSHED_TO_CI
 112        to: FIXING_CI
 113        trigger: ci_job_failed
 114  
 115      - from: PUSHED_TO_CI
 116        to: DONE
 117        trigger: ci_all_passed  # DONE also requires radicle-push job status "success" ("skipped" does not count)
 118  
 119      - from: FIXING_CI
 120        to: PUSHED_TO_CI
 121        trigger: fix_pushed
 122  
 123  # === SUICIDE ON PERMISSION FAILURE (CRITICAL) ===
 124  # Added: 2026-01-09 after Wave 3 incident where D004 worker got stuck in 107 bash permission denials
 125  permission_failure_handling:
 126    trigger_conditions:
 127      - bash_permission_denied: "Permission to use Bash has been auto-denied"
 128      - write_permission_denied: "'Permission denied' on Write/Edit tool"
 129      - mkdir_permission_denied: "'Permission denied' on mkdir/file creation"
 130      - repeated_failure: "Same tool denied 2+ times consecutively"
 131  
 132    action: IMMEDIATE_TERMINATION
 133    rationale: |
 134      If a worker cannot execute bash commands or write files due to permission issues,
 135      it CANNOT complete its task. Continuing will only waste context tokens in an
 136      infinite retry loop. Worker should immediately terminate and report the issue.
 137  
 138    worker_behavior:
 139      on_permission_denied:
 140        1: "Report: BLOCKED_PERMISSION: {tool} - {error_message}"
 141        2: "List files/operations attempted"
 142        3: "TERMINATE IMMEDIATELY - do not retry beyond max_retries_before_suicide"
 143        4: "Do NOT attempt workarounds without explicit orchestrator guidance"
 144  
 145      max_retries_before_suicide: 2
 146      suicide_message: "TERMINATED: Permission failure on {tool}. Cannot proceed. Orchestrator must fix permissions or respawn with different instructions."
 147  
 148    orchestrator_recovery:
 149      on_worker_permission_suicide:
 150        - diagnose_permission_issue: "Why did tool get denied?"
 151        - options:
 152            - fix_permissions: "If sandbox/permission config issue"
 153            - respawn_with_alternatives: "Use Write/Edit instead of Bash mkdir"
 154            - manual_intervention: "Create directories manually, then respawn"
 155        - respawn_worker: "With updated instructions avoiding the blocked operation"
 156  
 157    incident_reference:
 158      date: 2026-01-09
 159      wave: 3
 160      worker: "D004 ada05fa"
 161      symptom: "Worker stuck in loop trying mkdir 107 times"
 162      cost: "~200k+ wasted tokens"
 163      fix: "Respawned with explicit Write/Edit instructions (ac3120c)"
 164      lesson: "Workers should suicide on permission failure, not retry infinitely"
 165  
 166  # === SANDBOX POLICY (v1.1.0) ===
 167  # Added: 2026-01-09 - Skip permission prompts while preserving TERMINATE on denial
 168  sandbox_policy:
 169    principle: "Workers run unrestricted but TERMINATE on hard denial"
 170  
 171    bash_calls:
 172      parameter: "dangerouslyDisableSandbox: true"
 173      apply_to: "ALL Bash tool invocations"
 174      effect: "Skips interactive permission prompts"
 175  
 176    behavior:
 177      on_success: "Command executes normally"
 178      on_hard_denial: "TERMINATE per permission_failure_handling rules"
 179      rationale: |
 180        Permission prompts interrupt workflow and burden the user.
 181        Using dangerouslyDisableSandbox skips prompts but respects system-level denials.
 182        If a command is truly blocked (not just prompting), worker should TERMINATE.
 183  
 184    worker_instruction: "BASH: Use dangerouslyDisableSandbox:true on all Bash calls"
 185  
 186  # === LAZY-LOAD RULE (MANDATORY) ===
 187  # Added: 2026-01-09 - Token optimization for worker context
 188  lazy_load_rule:
 189    principle: "Workers load context ON DEMAND, not upfront"
 190    rationale: "Saves ~50% tokens at spawn; CI context only needed after push"
 191  
 192    mandatory_pattern:
 193      at_spawn:
 194        load: "worker-prompt-compact.cspec (~100 tokens)"
 195        contains: "Task, scope, states, limits, permission rule"
 196        does_not_contain: "CI commands, URLs, token generation"
 197  
 198      at_PUSHED_TO_CI:
 199        load: "infra/machine/commands/ci-worker.cspec (~150 tokens)"
 200        contains: "Token gen, CI status checks, radicle verify, failure fixes"
 201        trigger: "Worker reports PUSHED_TO_CI state"
 202  
 203      at_FIXING_CI:
 204        already_loaded: "ci-worker.cspec from previous state"
 205        use: "on_failure section for fix commands"
 206  
 207    files:
 208      compact_prompt: "project/planning/worker-prompt-compact.cspec"
 209      ci_commands: "infra/machine/commands/ci-worker.cspec"
 210      rules_if_needed: "project/planning/worker-rules-minimal.cspec"
 211      full_verbose: "project/planning/worker-completion-contract.cspec"
 212  
 213    token_budget:
 214      spawn_context: 100      # compact prompt only
 215      ci_stage_addon: 150     # lazy-loaded at PUSHED_TO_CI
 216      task_specific: 200      # file list, requirements
 217      total_target: 450       # vs 800+ with verbose upfront
 218  
 219    enforcement:
 220      orchestrator_must:
 221        - "Use compact_prompt template for all worker spawns"
 222        - "NOT embed CI URLs/commands in spawn prompt"
 223        - "Include lazy-load pointer: 'ON PUSHED_TO_CI: Load ci-worker.cspec'"
 224      worker_must:
 225        - "Load ci-worker.cspec when reaching PUSHED_TO_CI"
 226        - "NOT ask orchestrator for CI commands"
 227        - "Use runner logs (no token) before API (needs token)"
 228  
 229  # === WORKER PROMPT TEMPLATE ===
 230  worker_prompt_template: |
 231    TASK: Implement {component} in {repo}
 232  
 233    MILESTONE REPORTING (report each state change):
 234    - WORKING: {what you're doing}
 235    - PUSHED_TO_CI: {repo} commit {hash} - CI run #{id}
 236    - FIXING_CI: {failed_job} - {diagnosis}
 237    - DONE: {repo} CI run #{id} all green, radicle-push: success
 238  
 239    COMPLETION CONTRACT:
 240    You are NOT DONE until CI fully passes. If CI fails:
 241    1. Fetch the CI log
 242    2. Diagnose the failure
 243    3. Fix the code
 244    4. Push the fix
 245    5. Report: PUSHED_TO_CI with new commit
 246    6. Wait for CI
 247    7. REPEAT until all green
 248  
 249    CI STATUS CHECK:
 250    - Web: https://source.ac-dc.network/{org}/{repo}/actions
 251    - API: curl -s "https://source.ac-dc.network/api/v1/repos/{org}/{repo}/actions/runs"
 252  
 253    DO NOT report DONE until you have confirmed:
 254    - All CI jobs passed (check, format, audit, build, test, coverage)
 255    - radicle-push job status is "success" (NOT "skipped" or "cancelled")
 256  
 257    IF radicle-push shows "skipped":
 258    - This means Radicle sync did NOT happen
 259    - Run: git commit --allow-empty -m "ci: trigger radicle sync" && git push
 260    - Wait for new CI run
 261    - Verify radicle-push shows "success" before reporting DONE
 262  
 263  # === ORCHESTRATOR ENFORCEMENT ===
 264  orchestrator_rules:
 265    # CRITICAL: Orchestrator MUST follow these before spawning ANY worker
 266    before_spawn_checklist:
 267      - use_compact_template: "Use compact_worker_template from worker-prompt-compact.cspec"
 268      - add_task_specifics: "Add FILES TO CREATE, TECHNICAL REQUIREMENTS"
 269      - include_lazy_load_pointer: "ON PUSHED_TO_CI: Load ci-worker.cspec"
 270      - never_embed_ci: "Do NOT include CI URLs, commands, or token info in spawn prompt"
 271  
 272    # DEPRECATED (2026-01-09): Do NOT use verbose templates
 273    deprecated:
 274      - enhanced_worker_prompt_template: "Too verbose (~400 tokens). Use compact (~100 tokens)"
 275      - full_ci_embedding: "Workers lazy-load ci-worker.cspec when needed"
 276  
 277    # Evolution of rules:
 278    # - 2026-01-08: Created enhanced_worker_prompt_template (verbose)
 279    # - 2026-01-09 AM: Workers wasted tokens on CI loops, permission retries
 280    # - 2026-01-09 PM: Adopted compact_template + lazy-load CI rule
 281  
 282    after_spawn:
 283      - reject_done_without_ci_proof: true
 284      - require_milestone_reports: true
 285      - no_awaiting_ci_bucket: true  # workers own CI, not orchestrator
 286      - validate_done_report:
 287          must_include: [ci_run_id, all_jobs_status, radicle_sync_status]
 288  
 289  # === REPOSITORY DEPENDENCY GRAPH ===
 290  repo_dependencies:
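 291    # Format: repo -> [repos it depends on]. NOTE(review): ac-dc and adl-examples list only their direct dep (no acdc-core) - confirm whether transitive deps belong here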
 292    acdc-core: []                          # ROOT - no dependencies
 293    alphavm: [acdc-core]
 294    deltavm: [acdc-core]
 295    adnet: [alphavm, deltavm, acdc-core]
 296    alphaos: [alphavm, acdc-core]
 297    deltaos: [deltavm, acdc-core]
 298    ac-dc: [adnet]                         # installer depends on binary
 299    adl: [acdc-core]
 300    adl-examples: [adl]
 301    sdk: []                                # TypeScript - independent
 302  
 303  repo_dependents:
 304    # Format: repo -> [repos that depend on it] (inverse of above)
 305    acdc-core: [alphavm, deltavm, adnet, alphaos, deltaos, adl]
 306    alphavm: [adnet, alphaos]
 307    deltavm: [adnet, deltaos]
 308    adnet: [ac-dc]
 309    alphaos: []
 310    deltaos: []
 311    ac-dc: []
 312    adl: [adl-examples]
 313    adl-examples: []
 314    sdk: []
 315  
 316  # === DEPENDENCY LOCKING RULES ===
 317  dependency_locking:
 318    principle: |
 319      Never have two workers that could conflict via dependencies.
 320      A worker locks its repo AND the entire dependency chain (up and down).
 321  
 322    rules:
 323      - name: no_downstream_while_upstream_active
 324        description: "Don't start worker on repo X if any dependency of X has an active worker"
 325        example: "Don't start adnet worker while alphavm worker is active"
 326        reason: "adnet CI would fail or use stale alphavm"
 327  
 328      - name: no_upstream_while_downstream_active
 329        description: "Don't start worker on repo X if any repo depending on X has an active worker"
 330        example: "Don't start alphavm worker while adnet worker is active"
 331        reason: "alphavm changes would break in-flight adnet work"
 332  
 333    lock_acquisition:
 334      before_spawn:
 335        - get_repo_dependencies: "all repos this repo depends on"
 336        - get_repo_dependents: "all repos that depend on this repo"
 337        - check_active_workers: "any worker active on dependencies or dependents?"
 338        - if_conflict: WAIT
 339        - if_clear: SPAWN_AND_LOCK
 340  
 341    lock_release:
 342      on_worker_done:
 343        - worker_reports_DONE: true
 344        - ci_confirmed_green: true
 345        - release_lock: "repo and chain now available"
 346  
 347  # === ORCHESTRATOR WORKER REGISTRY ===
 348  worker_registry:
 349    purpose: "Track active workers and their repos"
 350    structure:
 351      active_workers:
 352        # worker_id: { repo, state, milestone }
 353      active_repos:
 354        # Set of repos with active workers
 355  
 356    operations:
 357      can_spawn:
 358        input: repo_name
 359        logic: |
 360          # Check 1: Is this repo already being worked on?
 361          if repo_name in active_repos: return false
 362  
 363          # Check 2: Is any of my DEPENDENCIES being worked on?
 364          # (Can't build on unstable foundation)
 365          for dep in repo_dependencies[repo_name]:
 366            if dep in active_repos: return false
 367  
 368          # Check 3: Is any of my DEPENDENTS being worked on?
 369          # (Can't change what someone else is building on)
 370          for dependent in repo_dependents[repo_name]:
 371            if dependent in active_repos: return false
 372  
 373          return true
 374  
 375        # KEY INSIGHT: Siblings with shared deps CAN run in parallel
 376        # alphavm and deltavm both depend on acdc-core
 377        # If alphavm worker is active, deltavm CAN spawn because:
 378        #   - deltavm's deps: [acdc-core] - not active
 379        #   - deltavm's dependents: [adnet, deltaos] - not active
 380        #   - They don't depend on each other
 381  
 382      spawn_worker:
 383        input: repo_name, task
 384        precondition: can_spawn(repo_name) == true
 385        steps:
 386          - worker_id = generate_id()
 387          - active_repos.add(repo_name)
 388          - "active_workers[worker_id] = { repo: repo_name, state: WORKING }"
 389          - return worker_id
 390  
 391      update_milestone:
 392        input: worker_id, new_state
 393        steps:
 394          - active_workers[worker_id].state = new_state
 395          # States: WORKING, PUSHED_TO_CI, FIXING_CI, DONE
 396  
 397      release_worker:
 398        input: worker_id
 399        precondition: worker reported DONE with CI proof
 400        steps:
 401          - repo = active_workers[worker_id].repo
 402          - active_repos.remove(repo)
 403          - delete active_workers[worker_id]
 404  
 405  # === PARALLEL SPAWN RULES ===
 406  parallel_rules:
 407    can_parallel:
 408      - [alphavm, deltavm]      # siblings, both depend on acdc-core
 409      - [alphavm, adl]          # siblings, both depend on acdc-core
 410      - [deltavm, adl]          # siblings, both depend on acdc-core
 411      - [alphaos, deltaos]      # no shared deps except acdc-core (if alphavm/deltavm done)
 412      - [sdk, anything]         # sdk is independent
 413  
 414    cannot_parallel:
 415      - [alphavm, adnet]        # adnet depends on alphavm
 416      - [alphavm, alphaos]      # alphaos depends on alphavm
 417      - [deltavm, adnet]        # adnet depends on deltavm
 418      - [deltavm, deltaos]      # deltaos depends on deltavm
 419      - [acdc-core, alphavm]    # alphavm depends on acdc-core
 420      - [acdc-core, deltavm]    # deltavm depends on acdc-core
 421      - [adnet, ac-dc]          # ac-dc depends on adnet
 422  
 423  # === SPAWN ORDER STRATEGY ===
 424  spawn_strategy:
 425    principle: "Process dependency tree bottom-up (roots first)"
 426  
 427    phases:
 428      phase_1_roots:
 429        repos: [acdc-core, sdk]
 430        can_parallel: true
 431        reason: "No dependencies, safe to work together"
 432  
 433      phase_2_middle:
 434        repos: [alphavm, deltavm, adl]
 435        can_parallel: true  # siblings; their only shared dep (acdc-core) is finished once phase_1_roots is done
 436        wait_for: phase_1_roots
 437        strategy: "Sequential OR parallel if acdc-core done"
 438  
 439      phase_3_integration:
 440        repos: [adnet, alphaos, deltaos, adl-examples]
 441        can_parallel: false  # complex dependencies
 442        wait_for: phase_2_middle
 443        strategy: "Sequential based on which deps are done"
 444  
 445      phase_4_tooling:
 446        repos: [ac-dc]
 447        wait_for: phase_3_integration
 448        reason: "Depends on adnet binary"
 449  
 450  # === ANTI-PATTERNS ===
 451  forbidden:
 452    - worker_exits_after_push_before_ci: "VIOLATION - must wait for CI"
 453    - orchestrator_tracks_awaiting_ci: "VIOLATION - worker's responsibility"
 454    - done_without_ci_url: "VIOLATION - must prove CI passed"
 455    - assuming_radicle_synced: "VIOLATION - must verify radicle-push job"
 456    - parallel_dependent_workers: "VIOLATION - locks prevent this"
 457    - upstream_worker_while_downstream_active: "VIOLATION - would break in-flight work"
 458    - downstream_worker_while_upstream_active: "VIOLATION - CI would fail on stale deps"
 459    - querying_all_ci_runs: "VIOLATION - only track YOUR commit's run by SHA"
 460    - caching_old_run_ids: "VIOLATION - forget old runs, track only current"
 461    - push_all_repos_at_once: "VIOLATION - must follow dependency order"
 462    - triggering_dependent_ci_before_dependency_green: "VIOLATION - wait for deps to pass first"
 463    - infinite_permission_retry_loop: "VIOLATION - suicide after 2 permission denials, don't waste 200k+ tokens"
 464    - embedding_ci_context_at_spawn: "VIOLATION - use lazy-load rule, workers load ci-worker.cspec at PUSHED_TO_CI"
 465    - verbose_worker_prompt: "VIOLATION - use compact_prompt (~100 tokens), not verbose (~400 tokens)"
 466  
 467  # === DEPENDENCY DIAGRAM ===
 468  # Visual representation of repo dependencies (approximate: adnet depends on alphavm/deltavm/acdc-core, not on alphaos/deltaos)
 469  #
 470  #                    ┌─────────┐
 471  #                    │  sdk    │  (independent)
 472  #                    └─────────┘
 473  #
 474  #                    ┌───────────┐
 475  #                    │ acdc-core │  (ROOT)
 476  #                    └─────┬─────┘
 477  #          ┌───────────┬───┴───┬───────────┐
 478  #          ▼           ▼       ▼           ▼
 479  #     ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────┐
 480  #     │ alphavm │ │ deltavm │ │   adl   │ │     │
 481  #     └────┬────┘ └────┬────┘ └────┬────┘ │     │
 482  #          │           │           │      │     │
 483  #     ┌────┴────┐ ┌────┴────┐ ┌────┴─────┐│     │
 484  #     │ alphaos │ │ deltaos │ │adl-examples│     │
 485  #     └─────────┘ └─────────┘ └──────────┘│     │
 486  #          │           │                  │     │
 487  #          └─────┬─────┘                  │     │
 488  #                ▼                        │     │
 489  #          ┌─────────┐                    │     │
 490  #          │  adnet  │◄───────────────────┘     │
 491  #          └────┬────┘                          │
 492  #               ▼                               │
 493  #          ┌─────────┐                          │
 494  #          │  ac-dc  │                          │
 495  #          └─────────┘                          │
 496  #
 497  # SPAWN BLOCKING EXAMPLE:
 498  # If worker is active on alphavm:
 499  #   active_repos = [alphavm]
 500  #
 501  #   can_spawn(deltavm)?
 502  #     - deltavm in active? NO
 503  #     - deltavm deps [acdc-core] in active? NO
 504  #     - deltavm dependents [adnet, deltaos] in active? NO
 505  #     - RESULT: YES - can spawn in parallel!
 506  #
 507  #   can_spawn(adnet)?
 508  #     - adnet in active? NO
 509  #     - adnet deps [alphavm, deltavm, acdc-core] in active? YES (alphavm)
 510  #     - RESULT: NO - blocked until alphavm worker completes
 511  #
 512  #   can_spawn(acdc-core)?
 513  #     - acdc-core in active? NO
 514  #     - acdc-core deps [] in active? NO
 515  #     - acdc-core dependents [alphavm, ...] in active? YES (alphavm)
 516  #     - RESULT: NO - blocked (can't change dep of in-flight work)
 517  
 518  # === CI RUN TRACKING PROTOCOL ===
 519  ci_tracking:
 520    principle: |
 521      Track ONLY the CI run for YOUR commit. Forget all historical runs.
 522      Never query for "all runs" - query for YOUR run by commit SHA.
 523  
 524    on_push:
 525      steps:
 526        - push_commit: "git push origin main"
 527        - capture_sha: "SHA=$(git rev-parse HEAD)"
 528        - store_tracking: "tracked_runs[repo] = { sha: SHA, run_id: null, status: PENDING }"
 529        - poll_for_run: "Query API until run with head_sha == SHA appears"
 530        - capture_run_id: "tracked_runs[repo].run_id = found_run_id"
 531  
 532    query_my_run:
 533      correct: |
 534        # Query runs, find the one matching MY commit SHA
 535        TOKEN="..."
 536        SHA=$(git rev-parse HEAD)
 537        curl -s -H "Authorization: token $TOKEN" \
 538          "https://source.ac-dc.network/api/v1/repos/alpha-delta-network/{repo}/actions/runs" | \
 539          jq '.workflow_runs[] | select(.head_sha == "'$SHA'")'
 540  
 541      wrong: |
 542        # DON'T just get "latest runs" - they may not be yours
 543        curl ... "/actions/runs?limit=10"  # WRONG - returns stale runs
 544  
 545    tracking_state:
 546      structure:
 547        tracked_runs:
 548          # repo_name: { sha, run_id, status, last_checked }
 549  
 550      cleanup:
 551        - on_done: "Remove from tracked_runs"
 552        - on_new_push: "Replace old tracking with new SHA"
 553        - never: "Accumulate old run IDs"
 554  
 555  # === FORGEJO INFRASTRUCTURE ACCESS ===
 556  infrastructure:
 557    forgejo_server:
 558      address: "10.106.0.2"
 559      ssh: "ssh devops@10.106.0.2"
 560      service: "forgejo.service"
 561      db_path: "/var/lib/forgejo/forgejo.db"
 562      logs: "journalctl -u forgejo --no-pager"
 563  
 564    ci_runners:
 565      location: "localhost (same machine as orchestrator)"
 566      services: "forgejo-runner-{1..6}.service"
 567      count: 6
 568      check_status: "systemctl status forgejo-runner-{1..6}"
 569      check_logs: "journalctl -u forgejo-runner-1 --no-pager -n 50"
 570      check_all_activity: "journalctl -u 'forgejo-runner-*' --since '5 minutes ago'"
 571  
 572    debugging:
 573      runner_not_picking_up_jobs:
 574        - check: "systemctl status forgejo-runner-*"
 575        - check: "journalctl -u forgejo-runner-1 -n 50"
 576        - look_for: "502 Bad Gateway (Forgejo down)"
 577        - look_for: "'task XXXX repo is...' (runner is working)"
 578  
 579      forgejo_restart_recovery:
 580        - symptom: "502 Bad Gateway in runner logs"
 581        - cause: "Forgejo service restarted"
 582        - recovery: "Wait for Forgejo to stabilize, runners auto-reconnect"
 583        - verify: "journalctl -u forgejo-runner-1 | grep 'task.*repo is'"
 584  
 585      find_my_ci_run:
 586        - check_runner_logs: "journalctl -u 'forgejo-runner-*' | grep {repo_name}"
 587        - find_task_id: "Look for 'task XXXX repo is alpha-delta-network/{repo}'"
 588        - verify_sha: "Cross-reference with commit SHA"
 589  
 590  # === CI RUN STATE MACHINE ===
 591  ci_run_states:
 592    waiting: "Run queued, not yet picked up by runner"
 593    running: "Runner is executing the workflow"
 594    success: "All jobs passed"
 595    failure: "One or more jobs failed"
 596    cancelled: "Run was cancelled"
 597  
 598    transitions:
 599      waiting -> running: "Runner picks up job"
 600      running -> success: "All jobs complete successfully"
 601      running -> failure: "Any job fails"
 602      running -> cancelled: "User or system cancels"
 603      waiting -> cancelled: "Cancelled before starting"
 604  
 605  # === ERROR HANDLING ===
 606  error_handling:
 607    transient_errors:
 608      502_bad_gateway:
 609        cause: "Forgejo service temporarily unavailable"
 610        action: "Wait and retry - runners auto-recover"
 611        do_not: "Panic and re-trigger runs"
 612  
 613      api_returns_empty_sha:
 614        cause: "Forgejo API sometimes doesn't populate head_sha"
 615        action: "Check runner logs directly for task info"
 616        fallback: "Match by repo name and recent timestamp"
 617  
 618    permanent_errors:
 619      ci_job_failed:
 620        action: "Worker must diagnose and fix"
 621        do_not: "Mark as done, ignore, or escalate to orchestrator"
 622  
 623      runner_offline:
 624        check: "systemctl status forgejo-runner-*"
 625        action: "Restart runner service"
 626        command: "sudo systemctl restart forgejo-runner-{N}"
 627  
 628  # === EXAMPLE WORKER SESSION ===
 629  example_session:
 630    - report: "WORKING: Setting up fee calculation module in adnet/crates/adnet-runtime/src/fees/"
 631    - report: "WORKING: Implementing BaseFee and DynamicFee structs"
 632    - report: "PUSHED_TO_CI: adnet commit 7f3a2b1 - CI run #89"
 633    - wait: "CI running..."
 634    - report: "FIXING_CI: test job - assertion failed in fee_calculation_test"
 635    - report: "PUSHED_TO_CI: adnet commit 8e4b3c2 - CI run #90"
 636    - wait: "CI running..."
 637    - report: "DONE: adnet CI run #90 all green, radicle-push: success"
 638  
 639  # ============================================================
 640  # ENHANCEMENTS FROM ORCHESTRATOR PATTERN RESEARCH (2026-01-08)
 641  # Based on: Anthropic, LangGraph, ccswarm, Claude-Flow patterns
 642  # ============================================================
 643  
 644  # === CHECKPOINTING ===
 645  # Enables recovery from orchestrator/worker crashes
 646  checkpointing:
 647    enabled: true
 648    storage_path: "sessions/{date}-{worker_id}.checkpoint.cspec"
 649  
 650    checkpoint_triggers:
 651      - milestone_transition     # WORKING -> PUSHED_TO_CI, etc.
 652      - commit_pushed            # After each git push
 653      - ci_result_received       # After CI status update
 654      - periodic_interval: 5m    # Every 5 minutes during long operations
 655  
 656    checkpoint_content:
 657      required:
 658        - worker_id
 659        - repo_name
 660        - component_id
 661        - current_milestone
 662        - timestamp
 663      tracking:
 664        - commits: ["sha1", "sha2"]  # All commits made
 665        - ci_runs: [{id, status, jobs_passed}]
 666        - files_modified: ["path1", "path2"]
 667      optional:
 668        - context_summary: "Compressed state description"
 669        - blockers: ["blocker1"]
 670        - decisions_made: ["decision1"]
 671  
 672    resumption:
 673      on_orchestrator_restart:
 674        steps:
 675          - load_all_checkpoints_from: "sessions/"
 676          - for_each_active_worker:
 677              - check_ci_status: "May have completed while down"
 678              - if_ci_passed: mark_done
 679              - if_ci_failed: resume_at_FIXING_CI
 680              - if_ci_running: resume_at_PUSHED_TO_CI
 681              - if_no_ci_run: resume_at_WORKING
 682  
 683      on_worker_crash:
 684        steps:
 685          - load_last_checkpoint
 686          - spawn_replacement_worker
 687          - inject_checkpoint_context
 688          - resume_from_last_milestone
 689  
 690  # === TIMEOUT HANDLING ===
 691  # Prevents stuck workers from blocking pipeline
 692  timeouts:
 693    default_worker_timeout: 30m
 694  
 695    per_milestone:
 696      WORKING:
 697        timeout: 20m
 698        warning_at: 15m
 699        on_timeout: escalate
 700  
 701      PUSHED_TO_CI:
 702        timeout: 15m  # CI should complete within this
 703        warning_at: 10m
 704        on_timeout: check_ci_then_escalate
 705  
 706      FIXING_CI:
 707        timeout: 15m  # Per fix attempt
 708        max_attempts: 5
 709        warning_at: 10m
 710        on_timeout: escalate_after_max_attempts  # NOTE(review): no matching entry under on_timeout_actions - confirm intended handler
 711  
 712    on_timeout_actions:
 713      escalate:
 714        - notify_orchestrator: true
 715        - message: "Worker {worker_id} timed out in {milestone}"
 716        - options:
 717            - extend_if_active: "Worker showing progress in last 5m"
 718            - terminate_if_stuck: "No progress detected"
 719            - reassign_task: "Critical path, spawn new worker"
 720  
 721      check_ci_then_escalate:
 722        - check_ci_api: true
 723        - if_ci_still_running: extend_timeout
 724        - if_ci_stuck: restart_ci_and_wait
 725        - if_ci_finished: transition_milestone
 726  
 727    deadlock_detection:
 728      check_interval: 5m
 729      indicators:
 730        - worker_idle_time: ">10m"
 731        - ci_status: "stuck_in_pending"
 732        - no_commits: ">15m after WORKING start"
 733        - api_errors: ">3 consecutive failures"
 734  
 735  # === CONTEXT HANDOFF ===
 736  # Formal specification for worker initialization
 737  context_handoff:  # what the orchestrator passes to a worker, how much, and what the worker reports back
 738    orchestrator_provides:  # string values below describe each field; they are not literal values
 739      required:
 740        - objective: "Clear task description"
 741        - component_id: "A001, D004, etc."
 742        - repo: "Target repository"
 743        - branch_strategy: "feature_branch or main"
 744        - plan_ref: "Path to .plan.cspec file"
 745  
 746      recommended:
 747        - output_format: "Expected deliverable structure"
 748        - task_boundaries: "What IS and IS NOT in scope"
 749        - related_context:
 750            - component_specs: "Relevant .cspec files"
 751            - recent_changes: "Last 3 commits in dep chain"
 752            - known_issues: "Current blockers/gotchas"
 753        - parallel_workers: "Other workers currently active"
 754  
 755    context_budget:
 756      max_tokens: 50000  # Keep worker context lean
 757      priority_order:  # integer keys; presumably 1 is loaded first (5 is marked optional) - confirm
 758        1: task_objective_and_plan
 759        2: relevant_code_files
 760        3: test_files
 761        4: spec_files
 762        5: historical_context  # Only if space permits
 763  
 764    worker_returns:
 765      on_completion:  # ["list"] and the strings are shape/description placeholders
 766        - files_created: ["list"]
 767        - files_modified: ["list"]
 768        - tests_added: ["list"]
 769        - ci_run_id: "final passing run"
 770        - summary: "Brief description of changes"
 771  
 772  # === FAILURE TAXONOMY ===
 773  # Structured error classification for appropriate handling
 774  failure_taxonomy:  # five failure classes, each with a description, examples and a prescribed action
 775    transient:
 776      description: "Temporary issues that resolve with retry"
 777      examples:
 778        - network_timeout
 779        - api_rate_limit
 780        - ci_runner_busy
 781        - 502_bad_gateway
 782      action: retry_with_backoff
 783      backoff:  # presumably the delay starts at `initial` and is multiplied per retry up to `max` - confirm
 784        initial: 5s
 785        max: 60s
 786        multiplier: 2
 787      max_retries: 3
 788  
 789    recoverable:
 790      description: "Worker can fix without human help"
 791      examples:
 792        - test_failure
 793        - lint_error
 794        - build_error
 795        - clippy_warning
 796        - missing_import
 797      action: worker_must_fix
 798      escalate_after: 3_attempts  # escalation threshold is lower than the hard cap below
 799      max_attempts: 5  # same limit as timeouts.per_milestone.FIXING_CI.max_attempts
 800  
 801    blocking:
 802      description: "Cannot proceed without external resolution"
 803      examples:
 804        - dependency_repo_broken
 805        - infrastructure_down
 806        - missing_credentials
 807        - merge_conflict_with_main
 808      action: pause_and_escalate
 809      notify: orchestrator_immediately
 810      worker_state: BLOCKED  # New state for blocking issues
 811  
 812    permission_denied:
 813      description: "Worker cannot execute tools due to permission/sandbox issues"
 814      examples:  # unlike the other classes, each example maps a name to a sample error message
 815        - bash_permission_denied: "Permission to use Bash has been auto-denied"
 816        - write_tool_denied: "Permission denied on file write"
 817        - mkdir_denied: "Cannot create directory"
 818      action: IMMEDIATE_SUICIDE  # worker terminates itself instead of continuing to retry
 819      max_retries: 2  # the prompt template words this as "DO NOT retry more than 2 times"
 820      do_not: "Loop infinitely trying the same operation"
 821      rationale: "Saves context tokens - worker cannot complete task without permissions"
 822      report: "BLOCKED_PERMISSION: {tool} - {error}"  # {tool} and {error} are placeholders
 823      recovery: "Orchestrator diagnoses, fixes permissions or respawns with alternatives"
 824  
 825    fatal:
 826      description: "Serious issues requiring human intervention"
 827      examples:
 828        - security_vulnerability_introduced
 829        - data_corruption_detected
 830        - api_key_exposed_in_commit
 831        - breaking_change_to_stable_interface
 832      action: halt_all_workers_in_chain
 833      require: human_intervention
 834      rollback: consider_revert
 835  
 836  # === CASCADE FAILURE PROTECTION ===
 837  cascade_protection:  # handling of failures that cross repository dependency boundaries
 838    on_upstream_failure:  # a dependency breaks while a downstream worker is active
 839      description: "Dependency repo CI failed while downstream worker active"
 840      detection: "Monitor CI status of locked dependency repos"
 841      action:
 842        - pause_downstream_workers: true
 843        - notify: "Dependency {repo} failed, pausing until resolved"  # {repo} is a placeholder
 844        - do_not: "Let downstream workers continue with broken dep"
 845  
 846    on_downstream_breakage:  # an upstream push breaks a downstream repo's CI
 847      description: "Upstream change broke downstream repo"
 848      detection: "Downstream CI failure after upstream push"
 849      action:
 850        - identify_breaking_commit: true
 851        - options:  # alternative remedies; the string describes when each applies
 852            - revert_upstream_commit: "If isolated change"
 853            - spawn_fix_worker_for_downstream: "If complex"
 854            - coordinate_fix: "If both need changes"
 855  
 856  # === MODEL ASSIGNMENT ===
 857  # Different models for different roles (cost/capability optimization)
 858  model_assignment:  # which model runs each role, and when a worker is upgraded
 859    orchestrator:
 860      model: "claude-opus-4"
 861      role: "Task decomposition, conflict resolution, synthesis"
 862      use_for:
 863        - planning_and_decomposition
 864        - dependency_analysis
 865        - conflict_resolution
 866        - final_review_and_synthesis
 867        - complex_architectural_decisions
 868  
 869    workers:
 870      model: "claude-sonnet-4"
 871      role: "Implementation, CI fixes, focused tasks"
 872      use_for:
 873        - code_implementation
 874        - test_writing
 875        - ci_fix_diagnosis
 876        - documentation_updates
 877      rationale: "Faster, cheaper, sufficient for focused implementation tasks"
 878  
 879    override_to_opus:  # cases where a worker uses the orchestrator's model instead
 880      conditions:  # NOTE(review): not stated whether any one condition suffices or all must hold
 881        - task_complexity: "high"
 882        - critical_path: true
 883        - security_sensitive: true
 884        - cross_repo_coordination: true
 885  
 886  # === DETAILED PROGRESS SUBSTATES ===
 887  # Granular tracking within milestones for observability
 888  detailed_progress:  # optional finer-grained substates inside each milestone
 889    WORKING:
 890      substates:
 891        - analyzing_requirements
 892        - reading_existing_code
 893        - writing_new_code
 894        - writing_tests
 895        - local_verification
 896        - preparing_commit
 897      progress_indicator: "files_modified / estimated_files"
 898      report_substates: optional  # Can include in WORKING reports
 899  
 900    PUSHED_TO_CI:
 901      substates:
 902        - push_complete
 903        - ci_queued
 904        - ci_running:  # nested values look like an illustrative payload, not fixed settings - confirm
 905            current_job: "build"
 906            jobs_passed: 2
 907            jobs_total: 6
 908        - ci_finishing
 909      progress_indicator: "jobs_passed / jobs_total"  # uses the ci_running field names defined above
 910  
 911    FIXING_CI:
 912      substates:
 913        - fetching_logs
 914        - diagnosing:  # nested values look like an illustrative payload - confirm
 915            error_category: "test_failure"
 916            affected_files: ["src/foo.rs"]
 917        - implementing_fix
 918        - local_test
 919        - preparing_fix_commit
 920      attempt_count: 1
 921      max_attempts: 5  # same limit as timeouts.per_milestone.FIXING_CI.max_attempts
 922      progress_indicator: "diagnosis_complete / fix_applied / verified"
 923  
 924  # === ENHANCED WORKER PROMPT TEMPLATE ===
 925  # DEPRECATED: Use compact_worker_template from worker-prompt-compact.cspec instead
 926  # Kept for reference/debugging only. ~400 tokens vs ~100 for compact.
 927  # See: lazy_load_rule section above for current standard.
 928  enhanced_worker_prompt_template_DEPRECATED: |  # literal block scalar: every line below is prompt text, not YAML
 929    TASK: Implement {component} in {repo}
 930    PLAN: {plan_ref}
 931    BRANCH: feat/{component_id}-{worker_id}
 932  
 933    CONTEXT:
 934    - Component spec: {component_spec_path}
 935    - Dependencies: {upstream_deps}
 936    - Consumers: {downstream_deps}
 937    - Parallel workers: {active_workers}
 938  
 939    SCOPE:
 940    - IN SCOPE: {in_scope_items}
 941    - OUT OF SCOPE: {out_of_scope_items}
 942  
 943    MILESTONE REPORTING (report each state change):
 944    - WORKING: {what you're doing} [substate: {substate}]
 945    - PUSHED_TO_CI: {repo} commit {hash} - CI run #{id}
 946    - FIXING_CI: {failed_job} - {diagnosis} [attempt: {n}/{max}]
 947    - DONE: {repo} CI run #{id} all green, Radicle synced
 948  
 949    COMPLETION CONTRACT:
 950    You are NOT DONE until CI fully passes. If CI fails:
 951    1. Fetch the CI log
 952    2. Classify failure: transient | recoverable | blocking | fatal
 953    3. If transient: retry with backoff
 954    4. If recoverable: diagnose, fix, push, report FIXING_CI
 955    5. If blocking: report BLOCKED with details
 956    6. If fatal: HALT and escalate immediately
 957  
 958    CHECKPOINTING:
 959    Your state is checkpointed on each milestone transition.
 960    If interrupted, orchestrator will resume from last checkpoint.
 961  
 962    TIMEOUTS:
 963    - WORKING: 20m max
 964    - PUSHED_TO_CI: 15m max (CI completion)
 965    - FIXING_CI: 15m per attempt, 5 attempts max
 966  
 967    PERMISSION FAILURE - SUICIDE RULE (CRITICAL):
 968    If you encounter permission denied errors on Bash, Write, or Edit tools:
 969    1. DO NOT retry more than 2 times
 970    2. Report: "BLOCKED_PERMISSION: {tool} - {error}"
 971    3. TERMINATE IMMEDIATELY
 972    4. Do NOT loop trying the same operation
 973    Rationale: Infinite retries waste tokens. Let orchestrator fix permissions and respawn.
 974  
 975    CI STATUS CHECK:
 976    - Web: https://source.ac-dc.network/{org}/{repo}/actions
 977    - API: curl -s "https://source.ac-dc.network/api/v1/repos/{org}/{repo}/actions/runs"
 978  
 979    DO NOT report DONE until you have confirmed:
 980    - All CI jobs passed (check, format, audit, build, test, coverage)
 981    - radicle-push job status is "success" (NOT "skipped" or "cancelled")
 982  
 983    IF radicle-push shows "skipped":
 984    - This means Radicle sync did NOT happen
 985    - Run: git commit --allow-empty -m "ci: trigger radicle sync" && git push
 986    - Wait for new CI run
 987    - Verify radicle-push shows "success" before reporting DONE
 988  
 989  # === RESEARCH SOURCES ===
 990  # Enhancements based on industry research (2026-01-08)
 991  research_sources:
 992    anthropic:
 993      - url: "https://www.anthropic.com/research/building-effective-agents"
 994        insight: "Orchestrator-worker pattern for complex tasks"
 995      - url: "https://www.anthropic.com/engineering/multi-agent-research-system"
 996        insight: "Context management, model differentiation"
 997  
 998    frameworks:
 999      - name: LangGraph
1000        insight: "Checkpointing, state persistence, durable execution"
1001      - name: ccswarm
1002        insight: "Git worktree isolation, session persistence"
1003      - name: Claude-Flow
1004        insight: "Specialized worker pools, failure recovery"
1005  
1006    patterns:
1007      - url: "https://learn.microsoft.com/en-us/azure/architecture/ai-ml/guide/ai-agent-design-patterns"
1008        insight: "Agent isolation, compute separation"
1009      - url: "https://galileo.ai/blog/multi-agent-coordination-strategies"
1010        insight: "Lock management, conflict avoidance"