Cradicle Explorer

/ scripts / claude-orchestrator.sh
claude-orchestrator.sh
   1  #!/bin/sh
   2  # Claude Orchestrator — Process batches using Claude Code (claude -p)
   3  #
   4  # Usage:
   5  #   scripts/claude-orchestrator.sh              # One batch per type, then exit
   6  #   scripts/claude-orchestrator.sh --loop       # Repeat until all queues empty
   7  #   scripts/claude-orchestrator.sh --type proposals_email  # Single type only
   8  #
   9  # Requires: claude CLI (Claude Max subscription)
  10  # Each batch runs as a separate `claude -p --model <model>` invocation:
  11  #   - Fresh context per batch (no RAM accumulation)
  12  #   - Loads only relevant prompt/context docs
  13  #   - Output piped to claude-store.js for DB persistence
  14  
  15  set -e
  16  
  17  SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
  18  PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
  19  cd "$PROJECT_ROOT"
  20  
  21  # Ensure node and claude are in PATH — NixOS systemd services may not inherit them.
  22  # Must match the same Node.js version that compiled node_modules (nodejs_22).
  23  if ! command -v node >/dev/null 2>&1; then
  24    for d in /nix/store/*-nodejs-22.*/bin; do
  25      if [ -x "$d/node" ]; then
  26        export PATH="$d:$PATH"
  27        break
  28      fi
  29    done
  30  fi
  31  # Resolve claude CLI — prefer the latest installed version directly to avoid
  32  # stale symlinks after updates (e.g. ~/.local/bin/claude -> deleted version).
  33  # Extracted to a function so --loop mode can re-resolve each cycle when
  34  # Claude auto-updates delete the previously resolved version.
  35  resolve_claude_bin() {
  36    _claude_bin=""
  37    if [ -d "$HOME/.local/share/claude/versions" ]; then
  38      # Pick the latest *executable binary* — skip wrapper scripts (<1MB) and broken files
  39      for _v in $(ls -1 "$HOME/.local/share/claude/versions" 2>/dev/null | sort -V -r); do
  40        _candidate="$HOME/.local/share/claude/versions/$_v"
  41        _sz=$(stat -c%s "$_candidate" 2>/dev/null || echo 0)
  42        if [ -x "$_candidate" ] && [ "$_sz" -gt 1000000 ]; then
  43          _claude_bin="$_candidate"
  44          break
  45        fi
  46      done
  47    fi
  48    if [ -z "$_claude_bin" ] || [ ! -x "$_claude_bin" ]; then
  49      # Fallback to whatever is on PATH (symlink or system install)
  50      _claude_bin=$(command -v claude 2>/dev/null || echo "")
  51    fi
  52    alias claude="$_claude_bin"
  53    export CLAUDE_BIN="$_claude_bin"
  54  }
  55  resolve_claude_bin
  56  
  57  # Self-managed log file when running non-interactively (e.g. systemd)
  58  if [ ! -t 1 ]; then
  59    mkdir -p "$PROJECT_ROOT/logs"
  60    LOG_FILE="$PROJECT_ROOT/logs/orchestrator-$(date +%Y-%m-%d).log"
  61    exec >> "$LOG_FILE" 2>&1
  62  fi
  63  
  64  # Log CLAUDE_BIN resolution after log redirect so it appears in the log file
  65  echo "[$(date '+%Y-%m-%d %H:%M:%S')] [orchestrator] CLAUDE_BIN=$CLAUDE_BIN (executable: $([ -x "$CLAUDE_BIN" ] && echo yes || echo no))"
  66  
  67  LOOP=false
  68  SINGLE_TYPE=""
  69  
  70  # ── Usage limit state ──────────────────────────────────────────────────────
  71  # Claude Max has two windows: 5-hour rolling window and a weekly cap.
  72  # Utilization percentages are read from ~/.claude/usage-cache.json (written
  73  # by the Claude Code TUI after polling api.anthropic.com/api/oauth/usage).
  74  #
  75  # Strategy:
  76  #   1. PROACTIVE: Before each batch cycle, read usage-cache.json. If 5h window
  77  #      is ≥ WARNING_PCT or weekly is ≥ WEEKLY_WARNING_PCT, enter CONSERVATION
  78  #      MODE before hitting the hard limit. This preserves tokens for critical work.
  79  #   2. REACTIVE: If claude -p returns is_error:true or a limit keyword, enter
  80  #      conservation mode immediately.
  81  #
  82  # Conservation mode defers Opus-heavy rewording/proposals; keeps:
  83  #   - reply_responses (Opus) — time-critical inbound replies can't wait
  84  #   - classify_replies / extract_names (Haiku) — cheap; extract_names gated to 15min intervals
  85  #   - oversee (Sonnet) — system health, always run
  86  #   - classify_errors (Haiku) — cheap, always run
  87  #
  88  # Once the window resets (resets_at timestamp passes), exits conservation mode.
  89  CONSECUTIVE_LIMIT_HITS=0
  90  LIMIT_HIT_SINCE=""
  91  CONSERVATION_MODE=false
  92  CONSERVATION_REASON=""
  93  WARNING_PCT=85           # Enter conservation when 5h window ≥ 85% used
  94  WEEKLY_WARNING_PCT=90    # Enter conservation when weekly window ≥ 90% used
  95  USAGE_CACHE="${HOME}/.claude/usage-cache.json"
  96  
  97  while [ $# -gt 0 ]; do
  98    case "$1" in
  99      --loop) LOOP=true; shift ;;
 100      --type) SINGLE_TYPE="$2"; shift 2 ;;
 101      *) echo "Unknown arg: $1"; exit 1 ;;
 102    esac
 103  done
 104  
 105  # Batch sizes per type
 106  PROPOSALS_EMAIL_BATCH=5
 107  PROPOSALS_SMS_BATCH=10
 108  CLASSIFY_BATCH=50
 109  NAMES_BATCH=50
 110  REPLIES_BATCH=10
 111  PROOFREAD_BATCH=50
 112  SCORE_SEMANTIC_BATCH=50
 113  SCORE_SITES_BATCH=10
 114  ENRICH_SITES_BATCH=15
 115  
 116  # Frequency-gated batch state file (tracks last-run timestamps for periodic batch types)
 117  GATE_FILE="$PROJECT_ROOT/logs/orchestrator-gates.json"
 118  
 119  # ── Canary check (runs once per cycle) ───────────────────────────────────────
 120  # Verify external tools can actually execute before attempting any batches.
 121  # Catches: missing binary, wrong architecture (glibc on NixOS), broken PATH,
 122  # missing node_modules, etc. Fails loudly so monitoring picks it up immediately
 123  # rather than silently returning empty results for every batch.
 124  # Called from process_all_batches() so it runs once per cycle in --loop mode too.
 125  run_canary() {
 126    _canary_ok=true
 127  
 128    # 1. node must be able to run a basic script
 129    if ! node -e "process.exit(0)" 2>/dev/null; then
 130      log "FATAL: node is not executable — $(node --version 2>&1 | head -1). All batches will fail."
 131      _canary_ok=false
 132    fi
 133  
 134    # 2. claude-batch.js must be loadable (catches missing node_modules, syntax errors)
 135    if $_canary_ok && ! node scripts/claude-batch.js --canary 2>/dev/null | grep -q "ok"; then
 136      # Tolerate if --canary flag isn't implemented — just check node can load it without crashing
 137      node scripts/claude-batch.js --canary 2>/tmp/orch-canary-err.txt || true
 138      _canary_err=$(cat /tmp/orch-canary-err.txt 2>/dev/null | head -1)
 139      if echo "$_canary_err" | grep -qiE "cannot find module|SyntaxError|ERR_"; then
 140        log "FATAL: claude-batch.js failed to load: $_canary_err. All batches will fail."
 141        _canary_ok=false
 142      fi
 143    fi
 144  
 145    # 3. CLAUDE_BIN must be executable (catches glibc/nix-ld issues, broken symlinks, wrapper loops)
 146    if $_canary_ok && [ -n "$CLAUDE_BIN" ]; then
 147      _canary_out=$(timeout 10s env -u CLAUDECODE "$CLAUDE_BIN" --version 2>/tmp/orch-canary-err.txt || true)
 148      _canary_err=$(cat /tmp/orch-canary-err.txt 2>/dev/null | head -1)
 149      if [ -z "$_canary_out" ]; then
 150        # Re-resolve — Claude may have auto-updated, deleting the old version
 151        log "WARN: CLAUDE_BIN=$CLAUDE_BIN failed (${_canary_err:-no output}). Re-resolving..."
 152        resolve_claude_bin
 153        if [ -n "$CLAUDE_BIN" ]; then
 154          _canary_out=$(timeout 10s env -u CLAUDECODE "$CLAUDE_BIN" --version 2>/tmp/orch-canary-err.txt || true)
 155          _canary_err=$(cat /tmp/orch-canary-err.txt 2>/dev/null | head -1)
 156        fi
 157        if [ -z "$_canary_out" ]; then
 158          log "FATAL: CLAUDE_BIN=$CLAUDE_BIN still failed after re-resolve: ${_canary_err:-no output}. All LLM batches will return empty."
 159          _canary_ok=false
 160        else
 161          log "Canary OK after re-resolve: node=$(node --version 2>/dev/null), claude=$_canary_out"
 162        fi
 163      else
 164        log "Canary OK: node=$(node --version 2>/dev/null), claude=$_canary_out"
 165      fi
 166    elif $_canary_ok; then
 167      # CLAUDE_BIN empty — try resolving (may have been installed since startup)
 168      resolve_claude_bin
 169      if [ -n "$CLAUDE_BIN" ]; then
 170        log "Resolved CLAUDE_BIN=$CLAUDE_BIN after initially empty"
 171      else
 172        log "WARN: CLAUDE_BIN is empty — LLM batches will fail. Check PATH and ~/.local/share/claude/versions/."
 173        _canary_ok=false
 174      fi
 175    fi
 176    rm -f /tmp/orch-canary-err.txt
 177  }
 178  
 179  # Effort level per batch type — matches Agency Agents plan.
 180  # Controls how much reasoning the model uses. Saves tokens on simple tasks.
 181  # Levels: low (classification/extraction), medium (analysis), high (reasoning/creative), max (extended thinking)
 182  effort_for_batch() {
 183    case "$1" in
 184      proposals_email)      echo "high" ;;
 185      proposals_sms)        echo "medium" ;;
 186      classify_replies)     echo "medium" ;;
 187      extract_names)        echo "low" ;;
 188      reply_responses)      echo "high" ;;
 189      proofread)            echo "medium" ;;
 190      score_sites)          echo "medium" ;;
 191      score_semantic)       echo "medium" ;;
 192      enrich_sites)         echo "low" ;;
 193      oversee)              echo "medium" ;;
 194      classify_errors)      echo "low" ;;
 195      code_review)          echo "medium" ;;
 196      monitor_health)       echo "low" ;;
 197      triage_errors)        echo "medium" ;;
 198      check_docs)           echo "low" ;;
 199      evidence_merge)       echo "low" ;;
 200      followup_generate)    echo "medium" ;;
 201      *)                    echo "medium" ;;
 202    esac
 203  }
 204  
 205  # Check if a frequency-gated batch type is due to run.
 206  # Args: gate_type interval_minutes
 207  # Returns 0 (run it) or 1 (skip — not due yet)
 208  is_due() {
 209    gate_type="$1"
 210    interval_mins="$2"
 211    if [ ! -f "$GATE_FILE" ]; then return 0; fi
 212    elapsed=$(node -e "
 213      try {
 214        const d = JSON.parse(require('fs').readFileSync('$GATE_FILE','utf8'));
 215        const ts = d['$gate_type'];
 216        process.stdout.write(ts ? String(Date.now() - new Date(ts).getTime()) : '0');
 217      } catch { process.stdout.write('0'); }
 218    " 2>/dev/null)
 219    [ -z "$elapsed" ] && elapsed=0
 220    threshold=$((interval_mins * 60 * 1000))
 221    [ "$elapsed" -ge "$threshold" ] 2>/dev/null || [ "$elapsed" = "0" ]
 222  }
 223  
 224  # Record that a gated batch type just ran
 225  mark_ran() {
 226    gate_type="$1"
 227    node -e "
 228      const fs = require('fs');
 229      let d = {};
 230      try { d = JSON.parse(fs.readFileSync('$GATE_FILE','utf8')); } catch {}
 231      d['$gate_type'] = new Date().toISOString();
 232      fs.writeFileSync('$GATE_FILE', JSON.stringify(d, null, 2));
 233    " 2>/dev/null
 234  }
 235  
 236  log() {
 237    echo "[$(date '+%Y-%m-%d %H:%M:%S')] [orchestrator] $*"
 238  }
 239  
 240  # Proactive: fetch live usage from Anthropic API, fall back to cache.
 241  # Reads OAuth token from ~/.claude/.credentials.json (Linux credential store).
 242  # Called once per batch cycle (before processing batches).
 243  check_usage_proactive() {
 244    CREDS_FILE="${HOME}/.claude/.credentials.json"
 245  
 246    # Try live fetch first (updates cache for TUI too)
 247    usage=$(node -e "
 248      const fs = require('fs');
 249      const https = require('https');
 250      let token = '';
 251      try {
 252        const creds = JSON.parse(fs.readFileSync('$CREDS_FILE','utf8'));
 253        token = creds.claudeAiOauth?.accessToken || '';
 254      } catch {}
 255  
 256      if (!token) {
 257        // Fall back to cache
 258        try {
 259          const d = JSON.parse(fs.readFileSync('$USAGE_CACHE','utf8'));
 260          process.stdout.write(JSON.stringify({...d, source:'cache_no_token'}));
 261        } catch { process.stdout.write('{\"five_hour\":0,\"seven_day\":0,\"source\":\"none\"}'); }
 262        return;
 263      }
 264  
 265      const req = https.request({
 266        hostname: 'api.anthropic.com',
 267        path: '/api/oauth/usage',
 268        method: 'GET',
 269        headers: {
 270          'Authorization': 'Bearer ' + token,
 271          'anthropic-beta': 'oauth-2025-04-20',
 272          'User-Agent': 'claude-orchestrator/1.0'
 273        },
 274        timeout: 8000
 275      }, (res) => {
 276        let body = '';
 277        res.on('data', c => body += c);
 278        res.on('end', () => {
 279          try {
 280            const raw = JSON.parse(body);
 281            const result = {
 282              five_hour: Math.round(raw.five_hour?.utilization || 0),
 283              seven_day: Math.round(raw.seven_day?.utilization || 0),
 284              five_hour_resets_at: raw.five_hour?.resets_at || '',
 285              seven_day_resets_at: raw.seven_day?.resets_at || '',
 286              source: 'live'
 287            };
 288            // Update cache file for TUI
 289            try { fs.writeFileSync('$USAGE_CACHE', JSON.stringify({...result, stale: false})); } catch {}
 290            process.stdout.write(JSON.stringify(result));
 291          } catch { process.stdout.write('{\"five_hour\":0,\"seven_day\":0,\"source\":\"parse_error\"}'); }
 292        });
 293      });
 294      req.on('error', () => {
 295        // Fall back to cache on network error
 296        try {
 297          const d = JSON.parse(fs.readFileSync('$USAGE_CACHE','utf8'));
 298          process.stdout.write(JSON.stringify({...d, source:'cache_fallback'}));
 299        } catch { process.stdout.write('{\"five_hour\":0,\"seven_day\":0,\"source\":\"error\"}'); }
 300      });
 301      req.on('timeout', () => { req.destroy(); });
 302      req.end();
 303    " 2>/dev/null || node -e "try{const d=JSON.parse(require('fs').readFileSync('$USAGE_CACHE','utf8'));d.source='node_error_cache';process.stdout.write(JSON.stringify(d));}catch{process.stdout.write('{\"five_hour\":0,\"seven_day\":0,\"source\":\"node_error\"}');}" 2>/dev/null || echo '{"five_hour":0,"seven_day":0,"source":"node_error"}')
 304  
 305    five_hour=$(echo "$usage" | node -e "let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{try{process.stdout.write(String(Math.round(JSON.parse(d).five_hour||0)));}catch{process.stdout.write('0');}})" 2>/dev/null || echo "0")
 306    seven_day=$(echo "$usage" | node -e "let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{try{process.stdout.write(String(Math.round(JSON.parse(d).seven_day||0)));}catch{process.stdout.write('0');}})" 2>/dev/null || echo "0")
 307    five_resets=$(echo "$usage" | node -e "let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{try{process.stdout.write(JSON.parse(d).five_hour_resets_at||'');}catch{process.stdout.write('');}})" 2>/dev/null || echo "")
 308  
 309    source=$(echo "$usage" | node -e "let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{try{process.stdout.write(JSON.parse(d).source||'?');}catch{process.stdout.write('?');}})" 2>/dev/null || echo "?")
 310    log "Usage [${source}]: 5h=${five_hour}% weekly=${seven_day}%  (warn at ${WARNING_PCT}%/${WEEKLY_WARNING_PCT}%)"
 311  
 312    # Check if the 5h window has reset since we entered conservation
 313    if [ "$CONSERVATION_MODE" = "true" ] && [ -n "$five_resets" ] && [ -n "$LIMIT_HIT_SINCE" ]; then
 314      resets_epoch=$(date -d "$five_resets" +%s 2>/dev/null || echo "0")
 315      hit_epoch=$(date -d "$LIMIT_HIT_SINCE" +%s 2>/dev/null || echo "0")
 316      if [ "$resets_epoch" -gt 0 ] && [ "$resets_epoch" -gt "$hit_epoch" ]; then
 317        CONSERVATION_MODE=false
 318        CONSECUTIVE_LIMIT_HITS=0
 319        LIMIT_HIT_SINCE=""
 320        CONSERVATION_REASON=""
 321        node -e "const { clearRateLimit } = require('$PROJECT_ROOT/src/utils/rate-limit-scheduler.js'); clearRateLimit('claude_cli');" 2>/dev/null || true
 322        log "CONSERVATION MODE OFF — 5h window reset at $five_resets. Usage now ${five_hour}%. Resuming full operation."
 323        return
 324      fi
 325      # Also exit if usage has dropped back below warning threshold
 326      if [ "$five_hour" -lt "$((WARNING_PCT - 10))" ] && [ "$seven_day" -lt "$((WEEKLY_WARNING_PCT - 10))" ]; then
 327        CONSERVATION_MODE=false
 328        CONSECUTIVE_LIMIT_HITS=0
 329        LIMIT_HIT_SINCE=""
 330        CONSERVATION_REASON=""
 331        node -e "const { clearRateLimit } = require('$PROJECT_ROOT/src/utils/rate-limit-scheduler.js'); clearRateLimit('claude_cli');" 2>/dev/null || true
 332        log "CONSERVATION MODE OFF — usage recovered (5h=${five_hour}% weekly=${seven_day}%). Resuming full operation."
 333        return
 334      fi
 335    fi
 336  
 337    # Enter conservation proactively
 338    if [ "$CONSERVATION_MODE" = "false" ]; then
 339      if [ "$five_hour" -ge "$WARNING_PCT" ]; then
 340        CONSERVATION_MODE=true
 341        LIMIT_HIT_SINCE=$(date '+%Y-%m-%d %H:%M:%S')
 342        CONSERVATION_REASON="5h=${five_hour}% (>=${WARNING_PCT}%)"
 343        log "CONSERVATION MODE ON (proactive) — 5h window at ${five_hour}%. Deferring Opus rewording/proposals. Keeping replies+Haiku. Resets: $five_resets"
 344      elif [ "$seven_day" -ge "$WEEKLY_WARNING_PCT" ]; then
 345        CONSERVATION_MODE=true
 346        LIMIT_HIT_SINCE=$(date '+%Y-%m-%d %H:%M:%S')
 347        CONSERVATION_REASON="weekly=${seven_day}% (>=${WEEKLY_WARNING_PCT}%)"
 348        log "CONSERVATION MODE ON (proactive) — weekly window at ${seven_day}%. Deferring Opus rewording/proposals. Resets: $(echo "$usage" | node -e "let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{try{process.stdout.write(JSON.parse(d).seven_day_resets_at||'');}catch{process.stdout.write('');}});")"
 349      fi
 350    fi
 351  }
 352  
 353  # Reactive: detect hard limit errors from claude -p envelope.
 354  # Args: raw_result (full JSON envelope string), batch_type
 355  check_limit_signal() {
 356    raw="$1"
 357    btype="$2"
 358  
 359    [ -z "$raw" ] && return
 360  
 361    is_limit=$(echo "$raw" | node -e "
 362      let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{
 363        try {
 364          const env = JSON.parse(d);
 365          // Only scan envelope-level metadata for limit signals — NOT env.result (LLM content).
 366          // LLM-generated text (proposals, emails, rewrites) may mention "rate limit" etc.
 367          const meta = [
 368            env.is_error ? 'is_error' : '',
 369            typeof env.stop_reason === 'string' ? env.stop_reason : '',
 370            typeof env.error === 'string' ? env.error : '',
 371            typeof env.error?.message === 'string' ? env.error.message : '',
 372            env.is_error && typeof env.result === 'string' && env.result.length < 500 ? env.result : '',
 373          ].join(' ').toLowerCase();
 374          const hasKeyword = env.is_error && /rate.?limit|usage.?limit|claude.?max|overloaded|quota|529|too.?many.?request/.test(meta);
 375          process.stdout.write(hasKeyword ? '1' : '0');
 376        } catch { process.stdout.write('0'); }
 377      });
 378    " 2>/dev/null || echo "0")
 379  
 380    if [ "$is_limit" = "1" ]; then
 381      CONSECUTIVE_LIMIT_HITS=$((CONSECUTIVE_LIMIT_HITS + 1))
 382      [ -z "$LIMIT_HIT_SINCE" ] && LIMIT_HIT_SINCE=$(date '+%Y-%m-%d %H:%M:%S')
 383      CONSERVATION_REASON="hard limit on $btype"
 384      if [ "$CONSERVATION_MODE" = "false" ]; then
 385        CONSERVATION_MODE=true
 386        log "CONSERVATION MODE ON (reactive) — hard limit on $btype (hit #$CONSECUTIVE_LIMIT_HITS). Deferring Opus tasks."
 387      else
 388        log "Usage limit still active on $btype (hit #$CONSECUTIVE_LIMIT_HITS since $LIMIT_HIT_SINCE)"
 389      fi
 390  
 391      # Persist to rate-limits.json so the state survives orchestrator restarts
 392      # and is visible to should_skip_for_rate_limits() on next cycle.
 393      # Uses 'hourly' reset window (1h backoff) — conservation mode recovery
 394      # logic may clear it sooner if usage drops below threshold.
 395      node -e "
 396        const { setRateLimit } = require('$PROJECT_ROOT/src/utils/rate-limit-scheduler.js');
 397        setRateLimit('claude_cli', {
 398          limitType: 'hourly',
 399          reason: 'Claude Max hard limit on $btype (hit #$CONSECUTIVE_LIMIT_HITS)',
 400        });
 401      " 2>/dev/null || log "WARN: Failed to persist claude_cli rate limit to rate-limits.json"
 402    fi
 403  }
 404  
 405  # Query pipeline backlog counts for intelligent batch skipping.
 406  # All counts are ACTIONABLE — filtered to only items that can reach a sent outreach.
 407  # The core filter is ACTIVE_COUNTRY_CODES = all countries NOT in OUTREACH_BLOCKED_COUNTRIES.
 408  # This filter is inherited by every gate so that blocking a country upstream stops all work
 409  # for that country at every stage (scoring, enriching, proposals, reword).
 410  #
 411  # Sets global vars:
 412  #   BACKLOG_ELIGIBLE_OUTREACH  — approved email/sms, delivery_status IS NULL, past 3d cooldown, gdpr_verified where required
 413  #   BACKLOG_PROPOSALS          — proposals_drafted sites with unsent email/sms in active countries
 414  #   BACKLOG_ENRICHED           — enriched sites in active countries, score < LOW_SCORE_CUTOFF
 415  #   BACKLOG_ACTIVE_COUNTRIES   — count of active (non-blocked) country codes in pipeline
 416  #   BACKLOG_APPROVED_UNSENT    — all approved+unsent (any channel, for display only)
 417  #   BACKLOG_PENDING_APPROVAL   — pending approval count (display only)
 418  refresh_backlog() {
 419    # Read OUTREACH_SKIP_METHODS, OUTREACH_BLOCKED_COUNTRIES, and SMS-blocked countries from .env
 420    _skip_methods=$(grep '^OUTREACH_SKIP_METHODS=' "$PROJECT_ROOT/.env" 2>/dev/null | tail -1 | cut -d= -f2- | sed 's/#.*//' | tr -d '"' | tr -d "'" | tr '[:upper:]' '[:lower:]' | tr -d ' ')
 421    _blocked_countries=$(grep '^OUTREACH_BLOCKED_COUNTRIES=' "$PROJECT_ROOT/.env" 2>/dev/null | tail -1 | cut -d= -f2- | sed 's/#.*//' | tr -d '"' | tr -d "'" | tr '[:upper:]' '[:lower:]' | tr -d ' ')
 422    _sms_blocked_countries=$(grep '^OUTREACH_BLOCKED_SMS_COUNTRIES=' "$PROJECT_ROOT/.env" 2>/dev/null | tail -1 | cut -d= -f2- | sed 's/#.*//' | tr -d '"' | tr -d "'" | tr '[:upper:]' '[:lower:]' | tr -d ' ')
 423  
 424    eval "$(node "$PROJECT_ROOT/scripts/refresh-backlog.js" \
 425      --skip-methods="$_skip_methods" \
 426      --blocked-countries="$_blocked_countries" \
 427      --sms-blocked-countries="$_sms_blocked_countries" \
 428      --low-score-cutoff="$LOW_SCORE_CUTOFF" \
 429      --project-root="$PROJECT_ROOT" 2>/dev/null)"
 430  
 431    # ── Data-loss canary: halt if site count dropped >50% ──
 432    if [ "${CANARY_PREVIOUS_COUNT:-0}" -gt 100 ] && [ "${CANARY_SITE_COUNT:-0}" -gt 0 ]; then
 433      _drop_pct=$(( (CANARY_PREVIOUS_COUNT - CANARY_SITE_COUNT) * 100 / CANARY_PREVIOUS_COUNT ))
 434      if [ "$_drop_pct" -gt 50 ]; then
 435        log "CRITICAL: Site count dropped ${_drop_pct}% (${CANARY_PREVIOUS_COUNT} → ${CANARY_SITE_COUNT}) — HALTING ORCHESTRATOR"
 436        log "ACTION REQUIRED: Check PostgreSQL mmo database integrity. Restore from backup if needed."
 437        log "To resume: rm logs/site-count-canary.json && restart orchestrator"
 438        exit 1
 439      fi
 440    elif [ "${CANARY_PREVIOUS_COUNT:-0}" -gt 100 ] && [ "${CANARY_SITE_COUNT:-0}" -eq 0 ]; then
 441      log "CRITICAL: Site count is ZERO (was ${CANARY_PREVIOUS_COUNT}) — HALTING ORCHESTRATOR"
 442      log "ACTION REQUIRED: Database has been wiped. Restore from backup immediately."
 443      log "To resume: rm logs/site-count-canary.json && restart orchestrator"
 444      exit 1
 445    fi
 446  }
 447  
 448  # Load STAGE_THROTTLE_MULTIPLIER and LOW_SCORE_CUTOFF from .env (defaults: 3 and 82).
 449  # Read once at startup — restart orchestrator to pick up changes.
 450  STAGE_THROTTLE_MULTIPLIER=$(grep '^STAGE_THROTTLE_MULTIPLIER=' "$PROJECT_ROOT/.env" 2>/dev/null | tail -1 | cut -d= -f2- | sed 's/#.*//' | tr -d '"' | tr -d "'" | tr -d ' ')
 451  STAGE_THROTTLE_MULTIPLIER=${STAGE_THROTTLE_MULTIPLIER:-3}
 452  LOW_SCORE_CUTOFF=$(grep '^LOW_SCORE_CUTOFF=' "$PROJECT_ROOT/.env" 2>/dev/null | tail -1 | cut -d= -f2- | sed 's/#.*//' | tr -d '"' | tr -d "'" | tr -d ' ')
 453  LOW_SCORE_CUTOFF=${LOW_SCORE_CUTOFF:-82}
 454  
 455  # Load SKIP_STAGES from .env on every loop iteration so changes take effect without restart.
 456  # Always reads .env — edit the file to change skip behaviour live.
 457  load_skip_stages() {
 458    if [ -f "$PROJECT_ROOT/.env" ]; then
 459      val=$(grep '^SKIP_STAGES=' "$PROJECT_ROOT/.env" 2>/dev/null | tail -1 | cut -d= -f2- | sed 's/#.*//' | tr -d '"' | tr -d "'")
 460      SKIP_STAGES="$val"
 461    fi
 462    # Normalise: lowercase, strip spaces
 463    SKIP_STAGES=$(echo "${SKIP_STAGES:-}" | tr '[:upper:]' '[:lower:]' | tr -d ' ')
 464  }
 465  
 466  # Check if a batch type is blocked by the pipeline's rate-limit-scheduler state.
 467  # Reads logs/rate-limits.json (written by circuit-breaker.js when an API quota/rate-limit hits).
 468  # This ensures the orchestrator respects the same automatic backoffs as the pipeline —
 469  # e.g. if ZenRows hits its daily quota and blocks 'serps', both systems honour that pause.
 470  # Returns 0 = skip (rate-limited), 1 = run
 471  should_skip_for_rate_limits() {
 472    batch_type="$1"
 473    rate_limits_file="$PROJECT_ROOT/logs/rate-limits.json"
 474    [ -f "$rate_limits_file" ] || return 1  # No file = no active rate limits
 475  
 476    # Map batch_type → stage name to check in rate-limits.json
 477    case "$batch_type" in
 478      proposals_email|proposals_sms)                               stage="proposals" ;;
 479      reply_responses)                                              stage="outreach" ;;
 480      classify_replies)                                             stage="replies" ;;
 481      score_semantic|score_sites)                                   stage="scoring" ;;
 482      enrich_sites)                                                 stage="enrich" ;;
 483      followup_generate)                                            stage="outreach" ;;
 484      *)                                                            stage="" ;;
 485    esac
 486  
 487    now_ms=$(date +%s%3N 2>/dev/null || echo "0")
 488  
 489    # Check each entry: if resetAt > now and stages includes our stage, skip.
 490    # Also check 'claude_cli' — when Claude Max credits are exhausted, ALL orchestrator
 491    # batches are blocked regardless of stage mapping (they all use claude -p).
 492    matched=$(node -e "
 493      try {
 494        const data = JSON.parse(require('fs').readFileSync('$rate_limits_file', 'utf8'));
 495        const stage = '$stage';
 496        const now = $now_ms;
 497  
 498        // Global claude_cli gate — blocks ALL orchestrator batches
 499        if (data.claude_cli && Number(data.claude_cli.resetAt) > now) {
 500          const resetAt = new Date(Number(data.claude_cli.resetAt)).toISOString();
 501          process.stdout.write('claude_cli:' + resetAt + ':' + (data.claude_cli.reason || ''));
 502          process.exit(0);
 503        }
 504  
 505        // Stage-specific gate (e.g. openrouter scoring, zenrows serps)
 506        if (stage) {
 507          for (const [api, info] of Object.entries(data)) {
 508            if (Array.isArray(info.stages) && info.stages.includes(stage) && Number(info.resetAt) > now) {
 509              const resetAt = new Date(Number(info.resetAt)).toISOString();
 510              process.stdout.write(api + ':' + resetAt + ':' + (info.reason || ''));
 511              process.exit(0);
 512            }
 513          }
 514        }
 515      } catch(e) {}
 516    " 2>/dev/null)
 517  
 518    if [ -n "$matched" ]; then
 519      api=$(echo "$matched" | cut -d: -f1)
 520      reset_at=$(echo "$matched" | cut -d: -f2)
 521      reason=$(echo "$matched" | cut -d: -f3-)
 522      log "$batch_type: SKIP (rate-limited by $api until $reset_at — $reason)"
 523      return 0
 524    fi
 525    return 1
 526  }
 527  
 528  # Check if a batch type is blocked by SKIP_STAGES.
 529  # Returns 0 = skip (stage paused), 1 = run
 530  should_skip_for_stage_flag() {
 531    batch_type="$1"
 532    [ -z "$SKIP_STAGES" ] && return 1  # Nothing skipped
 533  
 534    # Map batch types to pipeline stage names
 535    case "$batch_type" in
 536      proposals_email|proposals_sms)
 537        echo "$SKIP_STAGES" | grep -q 'proposals' && {
 538          log "$batch_type: SKIP (SKIP_STAGES=proposals)"; return 0; } ;;
 539      reply_responses)
 540        echo "$SKIP_STAGES" | grep -q 'outreach\|replies\|reply' && {
 541          log "$batch_type: SKIP (SKIP_STAGES includes outreach/replies)"; return 0; } ;;
 542      classify_replies)
 543        echo "$SKIP_STAGES" | grep -q 'replies\|classify' && {
 544          log "$batch_type: SKIP (SKIP_STAGES=replies/classify)"; return 0; } ;;
 545      extract_names)
 546        echo "$SKIP_STAGES" | grep -q 'extract_names\|extract' && {
 547          log "$batch_type: SKIP (SKIP_STAGES=extract_names)"; return 0; } ;;
 548      proofread)
 549        echo "$SKIP_STAGES" | grep -q 'proofread' && {
 550          log "$batch_type: SKIP (SKIP_STAGES=proofread)"; return 0; } ;;
 551      score_semantic|score_sites)
 552        echo "$SKIP_STAGES" | grep -q 'scoring' && {
 553          log "$batch_type: SKIP (SKIP_STAGES=scoring)"; return 0; } ;;
 554      enrich_sites)
 555        echo "$SKIP_STAGES" | grep -q 'enrich' && {
 556          log "$batch_type: SKIP (SKIP_STAGES=enrich)"; return 0; } ;;
 557      followup_generate)
 558        echo "$SKIP_STAGES" | grep -q 'followup' && {
 559          log "$batch_type: SKIP (SKIP_STAGES=followup)"; return 0; } ;;
 560    esac
 561    return 1  # Not skipped
 562  }
 563  
 564  # Check if a batch type should be skipped because its downstream queue is already saturated.
 565  # Uses STAGE_THROTTLE_MULTIPLIER (default 3) × the relevant batch size as the threshold.
 566  #
 567  # Stage chain (each gate pauses all earlier stages):
 568  #   enriched → [proposals_*] → proposals_drafted → [proofread] → approved outreach
 569  #
 570  # Gate 1 (outreach gate):  if eligible outreach > 3× rolling 7-day avg daily send rate
 571  #                           (floor: 3× proofread_batch, so the gate isn't too permissive
 572  #                           when historical send rate is very low or zero)
 573  #                           pause: proofread, proposals_*, enrich_sites, score_semantic
 574  # Gate 2 (proposals gate): if proposals_drafted > 3× (proposals_email_batch + proposals_sms_batch)
 575  #                           pause: proposals_*, enrich_sites
 576  # Gate 3 (enrich gate):    if enriched > 3× enrich_sites_batch
 577  #                           pause: enrich_sites, score_semantic
 578  #
 579  # Exception: serps is never paused (ZenRows daily quota must be used).
 580  # Returns 0 = skip, 1 = run
 581  should_skip_for_backlog() {
 582    batch_type="$1"
 583  
 584    # Gate 1: outreach queue saturated — pause everything feeding into it.
 585    # Use 3× the rolling 7-day daily send average as the threshold, with a floor
 586    # of 3× PROOFREAD_BATCH to prevent the gate from being too permissive when
 587    # the send rate is very low (e.g. pipeline just started, GDPR blocking, etc).
 588    _static_floor=$((PROOFREAD_BATCH * STAGE_THROTTLE_MULTIPLIER))
 589    _send_rate_threshold=$((${BACKLOG_DAILY_SEND_AVG:-0} * STAGE_THROTTLE_MULTIPLIER))
 590    _outreach_threshold=$(( _send_rate_threshold > _static_floor ? _send_rate_threshold : _static_floor ))
 591  
 592    case "$batch_type" in
 593      proofread|proposals_email|proposals_sms|enrich_sites|score_semantic)
 594        if [ "${BACKLOG_ELIGIBLE_OUTREACH:-0}" -gt "$_outreach_threshold" ]; then
 595          log "$batch_type: SKIP — eligible_outreach=${BACKLOG_ELIGIBLE_OUTREACH} > threshold=${_outreach_threshold} (${STAGE_THROTTLE_MULTIPLIER}×max(daily_avg=${BACKLOG_DAILY_SEND_AVG:-0},floor=${PROOFREAD_BATCH}))"
 596          return 0
 597        fi
 598        ;;
 599    esac
 600  
 601    # Gate 2: proposals_drafted queue saturated — pause proposal generation and earlier stages.
 602    # Threshold based on PROOFREAD throughput (the bottleneck), not proposal generation batch size.
 603    # Proofread clears PROOFREAD_BATCH per cycle; allow 3× that buffer before pausing upstream.
 604    _proposals_threshold=$(( PROOFREAD_BATCH * STAGE_THROTTLE_MULTIPLIER ))
 605  
 606    case "$batch_type" in
 607      proposals_email|proposals_sms|enrich_sites)
 608        if [ "${BACKLOG_PROPOSALS:-0}" -gt "$_proposals_threshold" ]; then
 609          log "$batch_type: SKIP — proposals_drafted=${BACKLOG_PROPOSALS} > threshold=${_proposals_threshold} (${STAGE_THROTTLE_MULTIPLIER}×${_proposals_batch_max})"
 610          return 0
 611        fi
 612        ;;
 613    esac
 614  
 615    # Gate 3: enriched queue saturated — pause enrich and scoring.
 616    # Threshold based on proposals throughput (PROPOSALS_EMAIL + PROPOSALS_SMS per cycle),
 617    # not enrich batch size. Enriched sites are consumed by proposal generation.
 618    _proposals_batch_max=$(( PROPOSALS_EMAIL_BATCH + PROPOSALS_SMS_BATCH ))
 619    _enrich_threshold=$(( _proposals_batch_max * STAGE_THROTTLE_MULTIPLIER * 5 ))
 620  
 621    case "$batch_type" in
 622      enrich_sites|score_semantic)
 623        if [ "${BACKLOG_ENRICHED:-0}" -gt "$_enrich_threshold" ]; then
 624          log "$batch_type: SKIP — enriched=${BACKLOG_ENRICHED} > threshold=${_enrich_threshold} (${STAGE_THROTTLE_MULTIPLIER}×${ENRICH_SITES_BATCH})"
 625          return 0
 626        fi
 627        ;;
 628    esac
 629  
 630    return 1  # Don't skip — run normally
 631  }
 632  
 633  # Run a single batch type
 634  # Args: batch_type model batch_size context_files...
 635  run_batch() {
 636    batch_type="$1"
 637    model="$2"
 638    batch_size="$3"
 639    shift 3
 640    # Remaining args are context file paths
 641  
 642    log "Pulling $batch_type batch (limit=$batch_size)..."
 643    batch_json=$(node scripts/claude-batch.js "$batch_type" "$batch_size" 2>/dev/null || echo '{"count":0,"items":[]}')
 644  
 645    count=$(echo "$batch_json" | node -e "
 646      let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{
 647        try { process.stdout.write(String(JSON.parse(d).count||0)); }
 648        catch { process.stdout.write('0'); }
 649      });
 650    " 2>/dev/null || echo "0")
 651    [ -z "$count" ] && count=0
 652  
 653    if [ "$count" = "0" ]; then
 654      log "$batch_type: no items to process"
 655      return 1  # Signal "empty" to caller
 656    fi
 657  
 658    _effort_log=$(effort_for_batch "$batch_type")
 659    log "$batch_type: processing $count items with $model (effort=$_effort_log)..."
 660  
 661    # Build the prompt — include context files and the batch data
 662    context=""
 663    for f in "$@"; do
 664      if [ -f "$f" ]; then
 665        context="$context
 666  
 667  --- FILE: $f ---
 668  $(cat "$f")
 669  --- END FILE ---"
 670      fi
 671    done
 672  
 673    # Build task-specific instructions
 674    case "$batch_type" in
 675      proposals_email)
 676        task_prompt="Generate email proposals for each site below. Follow the PROPOSAL.md and email best practices. Output JSON: {\"batch_type\":\"proposals_email\",\"results\":[{\"site_id\":N,\"contact_method\":\"email\",\"contact_uri\":\"...\",\"country_code\":\"XX\",\"message_body\":\"...\",\"subject_line\":\"...\"}]}"
 677        ;;
 678      proposals_sms)
 679        task_prompt="Generate SMS proposals (<160 chars) for each site below. Follow PROPOSAL.md and SMS best practices. TCPA opt-out required for US/CA only. Output JSON: {\"batch_type\":\"proposals_sms\",\"results\":[{\"site_id\":N,\"contact_method\":\"sms\",\"contact_uri\":\"...\",\"country_code\":\"XX\",\"message_body\":\"...\"}]}"
 680        ;;
 681      classify_replies)
 682        task_prompt="$(cat prompts/CLASSIFY-REPLIES.md)"
 683        ;;
 684      extract_names)
 685        task_prompt="Extract the person's first name from these email addresses and domains. If no name can be inferred, leave null. Output JSON: {\"batch_type\":\"extract_names\",\"results\":[{\"site_id\":N,\"contacts\":[{\"uri\":\"...\",\"name\":\"FirstName\"}]}]}"
 686        ;;
 687      reply_responses)
 688        task_prompt="Follow the reply guidelines in prompts/REPLIES.md. Respond to every inbound in the batch. Output ONLY valid JSON — no markdown, no explanation."
 689        ;;
 690      oversee)
 691        task_prompt="You are the senior overseer of an automated lead-generation pipeline (333 Method). Analyze the system health snapshot and decide what corrective actions to take. The structured JSON is the authoritative source of truth — previous_overseer_findings is historical context only.
 692  KNOWN SELF-HEALING BEHAVIOURS (do NOT escalate): cron_timer_dead is auto-fixed by processGuardian every minute. 1-minute cron gaps are expected.
 693  COST MODEL (do NOT flag as expensive): All LLM batches run via `claude -p --model <name>` which uses the Claude Max subscription — zero marginal cost per call regardless of model (opus/sonnet/haiku). Do NOT flag opus or sonnet model usage as a cost issue. Only OpenRouter direct-API calls (gather_evidence, scoring, replies) count against the $200/day LLM_DAILY_BUDGET.
 694  VALID PIPELINE STATUSES (ALL are intentional — do NOT flag as invalid): found, assets_captured, prog_scored, semantic_scored, vision_scored, enriched_regex, enriched_llm, enriched, proposals_drafted, outreach_partial, outreach_sent, rescored, skipped, error. enriched_regex is an intermediate enrichment step (browser pass complete, LLM pass pending) — it is expected and normal.
 695  Available actions: RESTART_PIPELINE (if stopped/crash-looping), CLEAR_STALE_TASKS (params: min_age_hours), RESET_STUCK_SITES (params: stage, min_stuck_hours ≥ 4 — never lower), LOG_ONLY (params: note). Do NOT reset a stage when its target already has a large backlog (found>5000: don't reset assets_captured or scored; rescored>500: don't reset enriched or rescored). Sites with recapture_count≥2 are auto-excluded from resets. NEVER reset 'enriched' stage — enriched is a deliberate evidence-collection queue (gather_evidence cron picks sites up every 5 min); large enriched backlogs are normal and expected.
 696  Output JSON: {\"batch_type\":\"oversee\",\"results\":[{\"summary\":\"1-2 sentences\",\"severity\":\"ok|warn|critical\",\"findings\":[{\"issue\":\"...\",\"severity\":\"low|medium|high\",\"stage\":\"pipeline|agents|cron|sites\"}],\"actions\":[{\"code\":\"ACTION_CODE\",\"description\":\"...\",\"params\":{}}]}]}"
 697        ;;
 698      classify_errors)
 699        task_prompt="Analyze these pipeline error messages that don't match any existing patterns. Propose regex patterns for categorization using JavaScript regex syntax (no leading/trailing slashes). Group similar errors into one pattern where possible. Terminal = permanent failure; Retriable = transient or fixable.
 700  Output JSON: {\"batch_type\":\"classify_errors\",\"results\":[{\"pattern\":\"regex\",\"label\":\"Short Label\",\"group\":\"terminal|retriable\",\"context\":\"site|outreach\",\"example_errors\":[\"...\"],\"occurrence_count\":N}]}"
 701        ;;
 702      score_sites)
 703        task_prompt="You are a CRO specialist scoring local business websites for conversion effectiveness. For each site in BATCH DATA, analyze the provided HTML and score it using the CRO scoring system defined in the context above.
 704  
 705  CRITICAL RULES:
 706  - Evaluate each site independently using ONLY its provided HTML and http_headers
 707  - Always return COMPLETE factor_scores for every site — never omit any factor
 708  - Do NOT include conversion_score or letter_grade — these are computed programmatically
 709  - Output ONLY valid JSON, no markdown fences
 710  
 711  Output JSON: {\"batch_type\":\"score_sites\",\"results\":[{\"site_id\":N,\"factor_scores\":{\"headline_quality\":{\"score\":N,\"reasoning\":\"...\",\"evidence\":\"...\"},\"value_proposition\":{\"score\":N,\"reasoning\":\"...\",\"evidence\":\"...\"},\"unique_selling_proposition\":{\"score\":N,\"reasoning\":\"...\",\"evidence\":\"...\"},\"call_to_action\":{\"score\":N,\"reasoning\":\"...\",\"evidence\":\"...\"},\"urgency_messaging\":{\"score\":N,\"reasoning\":\"...\",\"evidence\":\"...\"},\"hook_engagement\":{\"score\":N,\"reasoning\":\"...\",\"evidence\":\"...\"},\"trust_signals\":{\"score\":N,\"reasoning\":\"...\",\"evidence\":\"...\"},\"imagery_design\":{\"score\":N,\"reasoning\":\"...\",\"evidence\":\"...\"},\"offer_clarity\":{\"score\":N,\"reasoning\":\"...\",\"evidence\":\"...\"},\"contextual_appropriateness\":{\"score\":N,\"reasoning\":\"...\",\"industry_context\":\"...\"}},\"overall_calculation\":{\"grade_interpretation\":\"...\",\"city\":\"...\",\"country_code\":\"AU\",\"state\":\"...\",\"country_detection_confidence\":\"high|medium|low\",\"country_detection_evidence\":[\"...\"],\"is_business_directory\":false,\"is_local_business\":true,\"is_law_firm\":false,\"industry_classification\":\"plumbing\",\"is_error_page\":false,\"error_type\":null,\"error_description\":null,\"is_broken_site\":false,\"broken_site_details\":[]},\"contact_details\":{\"city\":\"...\",\"country_code\":\"AU\",\"state\":\"...\",\"email_addresses\":[],\"phone_numbers\":[],\"social_profiles\":[],\"key_pages\":[]}}]}"
 712        ;;
 713      enrich_sites)
 714        task_prompt="You are a contact data extraction specialist. For each site in BATCH DATA, extract contact information from the provided HTML using the enrichment guidelines in the context above.
 715  
 716  CRITICAL RULES:
 717  - Extract ONLY information explicitly visible in the HTML — do NOT fabricate data
 718  - If current_contacts already has a field, only add NEW items not already present
 719  - Preserve phone numbers EXACTLY as displayed (do not normalize format)
 720  - Return an empty object {} for sites where no new contacts are found
 721  
 722  Output JSON: {\"batch_type\":\"enrich_sites\",\"results\":[{\"site_id\":N,\"domain\":\"example.com\",\"business_name\":\"...\",\"city\":\"...\",\"country_code\":\"AU\",\"state\":\"NSW\",\"email_addresses\":[{\"email\":\"info@example.com\",\"label\":\"General\"}],\"phone_numbers\":[{\"number\":\"+61 2 1234 5678\",\"label\":\"Office\"}],\"social_profiles\":[{\"url\":\"https://facebook.com/example\",\"label\":\"Facebook\"}],\"key_pages\":[\"https://example.com/contact\"],\"primary_contact_form\":null}]}"
 723        ;;
 724      score_semantic)
 725        task_prompt="You are scoring local business websites for conversion effectiveness. For each site, evaluate these 3 semantic factors from the extracted text:
 726  
 727  1. **headline_quality** (0-10): Is the H1/title compelling? Does it promise a benefit, address the visitor, create curiosity? Generic 'Welcome' or just a business name = 2-4. Specific benefit + audience = 7-9.
 728  2. **value_proposition** (0-10): Does the page clearly articulate WHAT they do, WHO they help, and WHY choose them? Look for quantified claims, benefit language, customer-focused framing (you/your > we/our). Vague 'quality service' = 3-4. Specific benefits with proof = 7-9.
 729  3. **unique_selling_proposition** (0-10): What differentiates them? Look for 'only', 'exclusive', years of experience, number of customers served, awards, patents, specific methodologies. Generic = 3-4. Clear differentiators = 7-9.
 730  
 731  Score conservatively. Most local business sites score 4-7. A 9-10 is exceptional (clear benefit headline + quantified proof + genuine differentiators). A 0-2 means almost no content or completely generic.
 732  
 733  Output JSON: {\"batch_type\":\"score_semantic\",\"results\":[{\"site_id\":N,\"headline_quality\":{\"score\":N,\"reasoning\":\"...\"},\"value_proposition\":{\"score\":N,\"reasoning\":\"...\"},\"unique_selling_proposition\":{\"score\":N,\"reasoning\":\"...\"}}]}"
 734        ;;
 735      proofread)
 736        task_prompt="Review each message in BATCH DATA against the proofreading rules in the context above. Apply every rule. Output a decision for every item — no omissions.
 737  
 738  Output JSON: {\"batch_type\":\"proofread\",\"results\":[{\"message_id\":N,\"decision\":\"approve\"},{\"message_id\":N,\"decision\":\"rework\",\"rework_instructions\":\"...\"},{\"message_id\":N,\"decision\":\"reject\",\"reject_reason\":\"...\"}]}"
 739        ;;
 740      code_review)
 741        task_prompt="You are a senior code reviewer (Code Reviewer agent). Review the source file in BATCH DATA for security vulnerabilities, bugs, performance issues, and maintainability problems. Follow the review criteria in the context above.
 742  
 743  For each finding include: severity (1-10), category (security/bug/performance/style), line number, description, and a concrete suggestion.
 744  
 745  Only report real issues with concrete fixes. Do not hallucinate problems.
 746  
 747  Output JSON: {\"batch_type\":\"code_review\",\"results\":[{\"file_path\":\"...\",\"findings\":[{\"severity\":N,\"category\":\"...\",\"line\":N,\"description\":\"...\",\"suggestion\":\"...\"}],\"summary\":\"...\"}]}"
 748        ;;
 749      monitor_health)
 750        task_prompt="You are an SRE monitoring a Node.js outreach pipeline. Analyze the system health data in BATCH DATA. Follow the monitoring criteria in the context above.
 751  
 752  Identify any critical issues, warnings, or degraded conditions. If the system is healthy, say so explicitly.
 753  
 754  Output JSON: {\"batch_type\":\"monitor_health\",\"results\":[{\"severity\":\"info|warn|critical\",\"summary\":\"...\",\"findings\":[{\"severity\":\"...\",\"component\":\"...\",\"issue\":\"...\",\"detail\":\"...\"}],\"recommended_actions\":[\"...\"]}]}"
 755        ;;
 756      triage_errors)
 757        task_prompt="You are an Incident Response Commander triaging pipeline errors. Classify each failed task in BATCH DATA. Follow the triage criteria in the context above.
 758  
 759  For each error: determine type, severity, and action. Only flag create_fix_task for critical/high severity issues with a clear code fix needed.
 760  
 761  Output JSON: {\"batch_type\":\"triage_errors\",\"results\":[{\"error_message\":\"...\",\"type\":\"pipeline_bug|infrastructure|external_api|data_quality|config|known_transient\",\"severity\":\"critical|high|medium|low\",\"action\":\"create_fix_task|monitor|ignore\",\"summary\":\"...\",\"suggested_fix\":\"...\"}]}"
 762        ;;
 763      check_docs)
 764        task_prompt="You are a Technical Writer checking documentation freshness. Review the recently changed source files in BATCH DATA. Follow the checking criteria in the context above.
 765  
 766  For each file, identify any stale documentation (comments, JSDoc, CLAUDE.md references, README mentions). Only report genuinely stale content — not missing docs for new code (that's a separate concern).
 767  
 768  Output JSON: {\"batch_type\":\"check_docs\",\"results\":[{\"file_path\":\"...\",\"stale_docs\":[{\"location\":\"...\",\"issue\":\"...\",\"suggested_update\":\"...\"}],\"summary\":\"...\"}]}"
 769        ;;
 770      evidence_merge)
 771        task_prompt="You are a CRO analyst merging evidence-based findings into HTML-scored website evaluations. Follow the merge criteria in the context above.
 772  
 773  For each site in BATCH DATA: compare score_json factors against evidence_pass1 and evidence_pass2 findings. Revise scores only where evidence is concrete and specific. Surface additional findings that HTML scoring could not catch (CSS-confirmed issues, stock photo filenames, wrong-state links, above-fold placement).
 774  
 775  Output JSON: {\"batch_type\":\"evidence_merge\",\"results\":[{\"site_id\":N,\"revised_scores\":{\"factor_name\":{\"original\":N,\"revised\":N,\"reason\":\"...\"}},\"additional_findings\":[{\"factor\":\"...\",\"finding\":\"...\",\"severity\":\"critical|major|minor\"}],\"evidence_summary\":\"...\"}]}"
 776        ;;
 777      followup_generate)
 778        task_prompt="You are writing follow-up outreach messages for a website conversion audit service. Each site in BATCH DATA has already been contacted once (or more) with no reply. Generate the NEXT follow-up touch using a DIFFERENT value angle than previous messages.
 779  
 780  FOLLOW-UP CADENCE (8 touches over 42 days):
 781  - Touch 2 (Day 3): Different weakness + social proof (\"other businesses in your area\")
 782  - Touch 3 (Day 7): ROI framing (\"this could be costing you \$X/month\")
 783  - Touch 4 (Day 14): Case study + sample report link (auditandfix.com)
 784  - Touch 5 (Day 21): Quick win — give one specific free fix to demonstrate value
 785  - Touch 6 (Day 28): Ad waste angle (\"paying for ads with a low-scoring site?\") / competitor gap
 786  - Touch 7 (Day 35): Authority — \"scored 43,000+ sites, pattern is always the same\"
 787  - Touch 8 (Day 42): Breakup email (\"closing the file, offer stands if you need it later\")
 788  
 789  RULES:
 790  - Each result must include ALL channels from the site's channels array (generate one message per channel per site)
 791  - Email: conversational, 3-5 short paragraphs, include subject line
 792  - SMS: under 160 chars, punchy, no subject line needed
 793  - NEVER repeat the same angle or wording as previous_messages
 794  - Use the site's weaknesses for personalisation but pick DIFFERENT weaknesses than touch 1
 795  - Touch 5: lead with a specific free fix — demonstrate competence, not just pitch
 796  - Touch 6: if site has is_running_ads=true, lean into ad waste angle; otherwise use competitor gap
 797  - Touch 8 (breakup): be respectful, leave the door open, mention auditandfix.com
 798  - Match the tone to the country (G'day for AU, casual for US/CA, formal for UK/EU)
 799  
 800  Output JSON: {\"batch_type\":\"followup_generate\",\"results\":[{\"site_id\":N,\"contact_method\":\"email|sms\",\"contact_uri\":\"...\",\"country_code\":\"XX\",\"sequence_step\":N,\"message_body\":\"...\",\"subject_line\":\"...(email only)\"}]}"
 801        ;;
 802    esac
 803  
 804    full_prompt="$task_prompt
 805  
 806  $context
 807  
 808  BATCH DATA:
 809  $batch_json
 810  
 811  IMPORTANT: Output ONLY valid JSON. No markdown, no explanation, no code fences."
 812  
 813    # Call Claude and pipe result to store
 814    # Write prompt to a temp file to avoid shell arg-length limits with large batches.
 815    # Unset CLAUDECODE so claude doesn't refuse to run if invoked inside another Claude Code session.
 816    _prompt_file=$(mktemp /tmp/orch-prompt-XXXXXX)
 817    printf '%s' "$full_prompt" > "$_prompt_file"
 818    _prompt_bytes=$(wc -c < "$_prompt_file" 2>/dev/null || echo 0)
 819    _prompt_kb=$(( _prompt_bytes / 1024 ))
 820    log "$batch_type: context=${_prompt_kb}kB (prompt+context+batch)"
 821    # Claude Code (Electron) requires XDG_RUNTIME_DIR to run. Systemd user services may
 822    # not inherit it — set a fallback so claude doesn't silently exit with empty output.
 823    _xdg="${XDG_RUNTIME_DIR:-/run/user/$(id -u)}"
 824    [ -d "$_xdg" ] || _xdg="/tmp/runtime-$(id -u)" && mkdir -p "$_xdg" 2>/dev/null || true
 825  
 826    # Write claude output DIRECTLY to a file — never capture in $() bash variable.
 827    # bash $() in non-UTF8 locales (NixOS systemd env) corrupts multi-byte characters
 828    # (Japanese, Chinese, emoji), making the extracted JSON invalid at random positions.
 829    # File I/O preserves all bytes exactly regardless of locale.
 830    _raw_file="$PROJECT_ROOT/logs/.orch-raw-$$.json"
 831    _claude_stderr=$(mktemp /tmp/orch-stderr-XXXXXX)
 832    _effort=$(effort_for_batch "$batch_type")
 833    XDG_RUNTIME_DIR="$_xdg" env -u CLAUDECODE "$CLAUDE_BIN" -p --model "$model" --effort "$_effort" --output-format json \
 834      < "$_prompt_file" > "$_raw_file" 2>"$_claude_stderr" || true
 835    rm -f "$_prompt_file"
 836    _stderr_content=$(cat "$_claude_stderr"); rm -f "$_claude_stderr"
 837  
 838    # Always check for usage limit signals (reads only ASCII metadata, file-safe)
 839    check_limit_signal "$(cat "$_raw_file" 2>/dev/null || true)" "$batch_type"
 840  
 841    # Debug: log raw file size and first 200 ASCII chars for diagnosis
 842    _raw_size=$(wc -c < "$_raw_file" 2>/dev/null || echo "missing")
 843    _raw_preview=$(cat "$_raw_file" 2>/dev/null | tr -cd '[:print:]' | head -c 200 || echo "")
 844    log "$batch_type: raw_file=${_raw_size}b preview=$(echo "$_raw_preview" | head -c 150)"
 845  
 846    if [ ! -s "$_raw_file" ]; then
 847      rm -f "$_raw_file"
 848      # Track consecutive empty results per batch type in the gate file.
 849      # Pattern: external tool failure looks identical to "no work" — consecutive
 850      # empties on a batch that had items is a strong signal something is broken.
 851      _empty_key="empty_streak_${batch_type}"
 852      _streak=$(node -e "try{const d=JSON.parse(require('fs').readFileSync('$GATE_FILE','utf8'));process.stdout.write(String(d['$_empty_key']||0));}catch{process.stdout.write('0');}" 2>/dev/null || echo "0")
 853      _streak=$(( _streak + 1 ))
 854      node -e "
 855        const fs=require('fs');
 856        let d={};try{d=JSON.parse(fs.readFileSync('$GATE_FILE','utf8'));}catch{}
 857        d['$_empty_key']=$_streak;
 858        fs.writeFileSync('$GATE_FILE',JSON.stringify(d));
 859      " 2>/dev/null || true
 860  
 861      _empty_msg="$batch_type: claude returned empty — skipping store${_stderr_content:+ (stderr: $(echo "$_stderr_content" | head -1))}"
 862      if [ "$_streak" -ge 5 ]; then
 863        log "ALERT: $_empty_msg [consecutive_empty=$_streak — likely tool failure, not empty queue]"
 864      elif [ "$_streak" -ge 3 ]; then
 865        log "WARN: $_empty_msg [consecutive_empty=$_streak]"
 866      else
 867        log "$_empty_msg"
 868      fi
 869      return 0
 870    fi
 871  
 872    # Success — reset the consecutive-empty streak for this batch type
 873    node -e "
 874      const fs=require('fs');
 875      let d={};try{d=JSON.parse(fs.readFileSync('$GATE_FILE','utf8'));}catch{}
 876      delete d['empty_streak_${batch_type}'];
 877      fs.writeFileSync('$GATE_FILE',JSON.stringify(d));
 878    " 2>/dev/null || true
 879  
 880    store_summary=$(node "$PROJECT_ROOT/scripts/claude-store-wrapper.js" "$_raw_file" "$batch_type" 2>&1 || echo '{"error":"store_node_failed"}')
 881  
 882    if [ -z "$store_summary" ] || echo "$store_summary" | grep -q '"error"'; then
 883      # On error: preserve raw file for inspection
 884      _saved_raw="$PROJECT_ROOT/logs/orch-raw-${batch_type}-saved.json"
 885      cp "$_raw_file" "$_saved_raw" 2>/dev/null || true
 886      log "$batch_type: saved raw to $_saved_raw for inspection"
 887    fi
 888    rm -f "$_raw_file"
 889  
 890    if [ -z "$store_summary" ]; then
 891      log "$batch_type: [WARN] store_summary empty — node produced no output. stderr=$(echo "$_stderr_content" | head -c 300)"
 892    else
 893      log "$batch_type: $store_summary"
 894      [ -n "$_stderr_content" ] && echo "$store_summary" | grep -q '"error"' && log "$batch_type: claude_stderr=$(echo "$_stderr_content" | head -c 200)"
 895    fi
 896    return 0
 897  }
 898  
 899  # Returns 0 if the batch type is skipped in conservation mode, 1 if it should run.
 900  # Time-critical and Haiku tasks always run. Opus-heavy generation deferred.
 901  should_skip_for_conservation() {
 902    batch_type="$1"
 903    [ "$CONSERVATION_MODE" = "false" ] && return 1  # Not in conservation mode — don't skip
 904  
 905    case "$batch_type" in
 906      # Always run — time-critical (inbound replies can't wait) or cheap (Haiku)
 907      reply_responses|classify_replies|extract_names|oversee|classify_errors)
 908        return 1 ;;
 909      # Defer — Opus-heavy or pipeline-upstream work that can wait
 910      proposals_email|proposals_sms|score_semantic|score_sites|enrich_sites|proofread|followup_generate)
 911        log "$batch_type: SKIP (conservation mode — Claude usage limit active since $LIMIT_HIT_SINCE)"
 912        return 0 ;;
 913    esac
 914    return 1
 915  }
 916  
 917  # Run a batch with all standard skip checks (rate_limits, stage_flag, conservation, backlog).
 918  # The skip functions themselves know which batch types they apply to — unknown types pass through.
 919  # Usage: run_checked <type> <model> <batch_size> [context_files...]
 920  run_checked() {
 921    _rc_type="$1"; _rc_model="$2"; _rc_size="$3"; shift 3
 922    [ -n "$SINGLE_TYPE" ] && [ "$SINGLE_TYPE" != "$_rc_type" ] && return 0
 923    should_skip_for_rate_limits "$_rc_type" && return 0
 924    should_skip_for_stage_flag "$_rc_type" && return 0
 925    should_skip_for_conservation "$_rc_type" && return 0
 926    should_skip_for_backlog "$_rc_type" && return 0
 927    run_batch "$_rc_type" "$_rc_model" "$_rc_size" "$@" && had_work=true
 928  }
 929  
 930  # Run a time-gated batch with all standard skip checks + is_due gate.
 931  # Usage: run_checked_gated <type> <model> <batch_size> <interval_minutes> [context_files...]
 932  run_checked_gated() {
 933    _rg_type="$1"; _rg_model="$2"; _rg_size="$3"; _rg_interval="$4"; shift 4
 934    [ -n "$SINGLE_TYPE" ] && [ "$SINGLE_TYPE" != "$_rg_type" ] && return 0
 935    should_skip_for_rate_limits "$_rg_type" && return 0
 936    should_skip_for_stage_flag "$_rg_type" && return 0
 937    should_skip_for_conservation "$_rg_type" && return 0
 938    should_skip_for_backlog "$_rg_type" && return 0
 939    if is_due "$_rg_type" "$_rg_interval"; then
 940      run_batch "$_rg_type" "$_rg_model" "$_rg_size" "$@" && had_work=true
 941      mark_ran "$_rg_type"
 942    else
 943      log "$_rg_type: not due yet (runs every ${_rg_interval}min)"
 944    fi
 945  }
 946  
 947  # Check if any Tier C (interactive) agents are overdue.
 948  # Writes logs/agents-due.json with overdue agents and their session start prompts.
 949  # Called once per cycle so the daily report and AFK monitor can surface them.
 950  check_tier_c_agents() {
 951    _due_agents="[]"
 952  
 953    # Security Engineer: quarterly (129600 min) + major refactor
 954    if is_due "tier_c_security_audit" 129600 || [ "${MAJOR_REFACTOR:-false}" = "true" ]; then
 955      _due_agents=$(echo "$_due_agents" | node -e "
 956        let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{
 957          const a=JSON.parse(d);
 958          a.push({agent:'security_audit',schedule:'quarterly',
 959            prompt:'Run a Security Engineer audit on 333Method and 2Step. Review: (1) API key handling in load-env.js and .env files, (2) Twilio webhook authentication in src/inbound/sms.js, (3) PayPal integration in src/payment/paypal.js, (4) unsigned unsubscribe tokens in 2Step src/outreach/email.js, (5) stealth-browser SSRF risk, (6) SQL injection review of all DB queries. Output findings as JSON with severity/file/line/description.'});
 960          process.stdout.write(JSON.stringify(a));
 961        });
 962      " 2>/dev/null || echo "$_due_agents")
 963      log "[AGENT DUE] security_audit — run Security Engineer (Opus, max, thinking on)"
 964    fi
 965  
 966    # Compliance Auditor: quarterly (129600 min) + major refactor
 967    if is_due "tier_c_compliance_audit" 129600 || [ "${MAJOR_REFACTOR:-false}" = "true" ]; then
 968      _due_agents=$(echo "$_due_agents" | node -e "
 969        let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{
 970          const a=JSON.parse(d);
 971          a.push({agent:'compliance_audit',schedule:'quarterly',
 972            prompt:'Run a Compliance Auditor review on 333Method outreach system. Check: (1) CAN-SPAM compliance in email templates, (2) TCPA compliance in SMS sending (business hours, opt-out), (3) GDPR country blocking logic in src/config/countries.js, (4) unsubscribe honoring in src/outreach/, (5) 72-hour cooldown enforcement. Also audit 2Step CAN-SPAM compliance in src/outreach/email.js.'});
 973          process.stdout.write(JSON.stringify(a));
 974        });
 975      " 2>/dev/null || echo "$_due_agents")
 976      log "[AGENT DUE] compliance_audit — run Compliance Auditor (Opus, high, thinking on)"
 977    fi
 978  
 979    # SEO Specialist: monthly (43200 min) for colorcraft-ai.com
 980    if is_due "tier_c_seo_audit" 43200; then
 981      _due_agents=$(echo "$_due_agents" | node -e "
 982        let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{
 983          const a=JSON.parse(d);
 984          a.push({agent:'seo_specialist',schedule:'monthly',
 985            prompt:'Run an SEO Specialist audit on colorcraft-ai.com. Fetch the site, analyze technical SEO (meta tags, page speed indicators, structured data, sitemap, robots.txt), keyword opportunities, content gaps, and link building potential. Output a prioritized report of changes for Base44 to implement.'});
 986          process.stdout.write(JSON.stringify(a));
 987        });
 988      " 2>/dev/null || echo "$_due_agents")
 989      log "[AGENT DUE] seo_specialist — run SEO Specialist (Sonnet, medium)"
 990    fi
 991  
 992    # Automation Governance: quarterly (129600 min) + major refactor
 993    if is_due "tier_c_automation_audit" 129600 || [ "${MAJOR_REFACTOR:-false}" = "true" ]; then
 994      _due_agents=$(echo "$_due_agents" | node -e "
 995        let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{
 996          const a=JSON.parse(d);
 997          a.push({agent:'automation_audit',schedule:'quarterly',
 998            prompt:'Run an Automation Governance audit. Read the cron_jobs table (sqlite3 db/sites.db \"SELECT * FROM cron_jobs\"), review all cron jobs in src/cron/ and the orchestrator in scripts/claude-orchestrator.sh. Identify: redundant jobs, poor sequencing, missing error handling, jobs that should be consolidated. Output a ranked list of changes.'});
 999          process.stdout.write(JSON.stringify(a));
1000        });
1001      " 2>/dev/null || echo "$_due_agents")
1002      log "[AGENT DUE] automation_audit — run Automation Governance Architect (Sonnet, medium)"
1003    fi
1004  
1005    # Code review completion check — surface Phase 5 reminder when queue is done
1006    if [ -f "logs/code-review-queue.json" ]; then
1007      _cr_done=$(node -e "
1008        try {
1009          const q = JSON.parse(require('fs').readFileSync('logs/code-review-queue.json','utf8'));
1010          process.stdout.write(q.next_index >= q.files.length ? 'yes' : 'no');
1011        } catch { process.stdout.write('no'); }
1012      " 2>/dev/null || echo "no")
1013      if [ "$_cr_done" = "yes" ] && is_due "tier_c_phase5_reminder" 43200; then
1014        _due_agents=$(echo "$_due_agents" | node -e "
1015          let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{
1016            const a=JSON.parse(d);
1017            a.push({agent:'phase5_sessions',schedule:'one-shot',
1018              prompt:'Code review of all 150 files is complete. Time to run Phase 5 Tier C sessions:\n1. Security audit: ./scripts/run-agent-session.sh security_audit\n2. 2Step CI/CD: ./scripts/run-agent-session.sh automation_audit\n3. colorcraft-ai.com report: ./scripts/run-agent-session.sh colorcraft_report\n4. Update distributed-agent-system plan (manual task in docs/plans/)'});
1019            process.stdout.write(JSON.stringify(a));
1020          });
1021        " 2>/dev/null || echo "$_due_agents")
1022        log "[REMINDER] Code review complete — Phase 5 Tier C sessions ready to run"
1023        log "  Run: ./scripts/run-agent-session.sh security_audit"
1024        log "  Run: ./scripts/run-agent-session.sh colorcraft_report"
1025        log "  Run: ./scripts/run-agent-session.sh automation_audit"
1026        log "  Update: docs/plans/distributed-agent-system.md"
1027      fi
1028    fi
1029  
1030    # Write results to logs/agents-due.json
1031    printf '%s\n' "$_due_agents" > logs/agents-due.json 2>/dev/null || true
1032  }
1033  
1034  # Detect major refactor: check recent commits for patterns that trigger agent reviews.
1035  # Sets MAJOR_REFACTOR=true if criteria met. Called once per cycle.
1036  check_major_refactor() {
1037    MAJOR_REFACTOR=false
1038    _changed=$(git diff --name-only HEAD~10 2>/dev/null | head -200 || echo "")
1039    _count=$(echo "$_changed" | grep -c '.' 2>/dev/null) || _count=0
1040  
1041    # 10+ files changed
1042    [ "$_count" -ge 10 ] && MAJOR_REFACTOR=true && return 0
1043  
1044    # Security-sensitive files
1045    echo "$_changed" | grep -qE '^src/stages/|^scripts/claude-orchestrator\.sh$|^db/migrations/|^src/outreach/|^src/payment/|^src/inbound/|^load-env\.js$|^src/utils/load-env\.js$|^src/utils/stealth-browser' && MAJOR_REFACTOR=true && return 0
1046  
1047    return 0
1048  }
1049  
1050  # Define batch order — priority: outreach-adjacent first, upstream last
1051  process_all_batches() {
1052    had_work=false
1053  
1054    # Re-resolve CLAUDE_BIN each cycle — auto-updates delete old versions
1055    resolve_claude_bin
1056  
1057    # Run canary once per cycle (verifies node, claude-batch.js, CLAUDE_BIN are usable)
1058    run_canary
1059  
1060    # Detect major refactor (sets MAJOR_REFACTOR env var for this cycle)
1061    check_major_refactor
1062    [ "$MAJOR_REFACTOR" = "true" ] && log "MAJOR_REFACTOR detected — Tier C agents flagged as due"
1063  
1064    # Check Tier C agent schedule — writes logs/agents-due.json
1065    check_tier_c_agents
1066  
1067    # Reload SKIP_STAGES from .env each cycle (respects live changes without restart)
1068    load_skip_stages
1069    [ -n "$SKIP_STAGES" ] && log "SKIP_STAGES=$SKIP_STAGES"
1070  
1071    # Check Claude Max usage limits proactively (live API fetch, ~1s)
1072    check_usage_proactive
1073  
1074    # Refresh pipeline backlog counts for intelligent skipping
1075    refresh_backlog
1076  
1077    # Scale batch sizes up when backlogs are large so we clear them faster.
1078    # Caps are conservative — LLM context window is the real ceiling.
1079    if [ "${BACKLOG_PENDING_APPROVAL:-0}" -gt 5000 ]; then
1080      PROOFREAD_BATCH=200
1081    elif [ "${BACKLOG_PENDING_APPROVAL:-0}" -gt 1000 ]; then
1082      PROOFREAD_BATCH=100
1083    fi
1084  
1085    if [ "${BACKLOG_PROPOSALS:-0}" -gt 5000 ]; then
1086      # Cap at 20 SMS / 10 email — 50 SMS exceeded 32k output token limit (29min run, hard error)
1087      REWORD_EMAIL_BATCH=10; REWORD_SMS_BATCH=20; REWORD_FORM_BATCH=30
1088      REWORD_LINKEDIN_BATCH=30; REWORD_X_BATCH=30
1089    elif [ "${BACKLOG_PROPOSALS:-0}" -gt 1000 ]; then
1090      REWORD_EMAIL_BATCH=10; REWORD_SMS_BATCH=30; REWORD_FORM_BATCH=30
1091    fi
1092  
1093    conservation_note=""
1094    [ "$CONSERVATION_MODE" = "true" ] && conservation_note=" [CONSERVATION MODE: $CONSERVATION_REASON]"
1095    log "Backlog: total_sites=${CANARY_SITE_COUNT:-?} eligible_outreach=$BACKLOG_ELIGIBLE_OUTREACH actionable_proposals=$BACKLOG_PROPOSALS actionable_enriched=$BACKLOG_ENRICHED approved_unsent=$BACKLOG_APPROVED_UNSENT pending=$BACKLOG_PENDING_APPROVAL active_countries=$BACKLOG_ACTIVE_COUNTRIES daily_send_avg=${BACKLOG_DAILY_SEND_AVG:-0}${conservation_note}"
1096    log "Effective batch sizes: proofread=$PROOFREAD_BATCH score_semantic=$SCORE_SEMANTIC_BATCH enrich=$ENRICH_SITES_BATCH"
1097  
1098    # --- Pipeline batches: outreach-adjacent first, upstream last ---
1099    run_checked proposals_email  opus   "$PROPOSALS_EMAIL_BATCH"  prompts/PROPOSAL.md docs/05-outreach/email-best-practices.md
1100    run_checked proposals_sms    opus   "$PROPOSALS_SMS_BATCH"    prompts/PROPOSAL.md docs/05-outreach/sms-best-practices.md
1101    # Semantic scoring: Sonnet evaluates headline, value prop, USP. Accuracy > cost.
1102    run_checked score_semantic   sonnet "$SCORE_SEMANTIC_BATCH"
1103  
1104    # score_sites: HTML-only LLM scoring (requires ENABLE_LLM_SCORING=true + ENABLE_VISION=false)
1105    if [ -z "$SINGLE_TYPE" ] || [ "$SINGLE_TYPE" = "score_sites" ]; then
1106      _llm_scoring=$(node -e "process.stdout.write(process.env.ENABLE_LLM_SCORING !== 'false' ? '1' : '0')" 2>/dev/null || echo "0")
1107      _enable_vision=$(node -e "process.stdout.write(process.env.ENABLE_VISION !== 'false' ? '1' : '0')" 2>/dev/null || echo "1")
1108      if [ "$_llm_scoring" = "1" ] && [ "$_enable_vision" = "0" ]; then
1109        run_checked score_sites sonnet "$SCORE_SITES_BATCH" prompts/CONVERSION-SCORING-NOVIS.md
1110      else
1111        [ -z "$SINGLE_TYPE" ] || log "score_sites: SKIP (requires ENABLE_LLM_SCORING=true + ENABLE_VISION=false)"
1112      fi
1113    fi
1114  
1115    # enrich_sites: LLM contact enrichment (requires ENABLE_ENRICHMENT_LLM=true)
1116    if [ -z "$SINGLE_TYPE" ] || [ "$SINGLE_TYPE" = "enrich_sites" ]; then
1117      _enrich_llm=$(node -e "process.stdout.write(process.env.ENABLE_ENRICHMENT_LLM !== 'false' ? '1' : '0')" 2>/dev/null || echo "0")
1118      if [ "$_enrich_llm" = "1" ]; then
1119        run_checked enrich_sites haiku "$ENRICH_SITES_BATCH" prompts/ENRICHMENT.md
1120      else
1121        [ -z "$SINGLE_TYPE" ] || log "enrich_sites: SKIP (ENABLE_ENRICHMENT_LLM=false)"
1122      fi
1123    fi
1124  
1125    # gather_evidence: moved to standalone cron job (5-min interval, migration 112).
1126    # No longer runs inline — the cron system handles execution independently.
1127    # The pending count is still logged here for orchestrator visibility.
1128    # Also reports sites awaiting merge (pass1+pass2 done, evidence_json still NULL).
1129    if [ -z "$SINGLE_TYPE" ] || [ "$SINGLE_TYPE" = "gather_evidence" ]; then
1130      _pending_evidence=$(node -e "
1131        const Database = require('./node_modules/better-sqlite3');
1132        const db = new Database(process.env.DATABASE_PATH || './db/sites.db', { readonly: true });
1133        const r = db.prepare(\"SELECT COUNT(*) as c FROM sites WHERE status IN ('enriched','enriched_llm') AND score IS NOT NULL AND score < 82 AND (evidence_pass1_json IS NULL OR evidence_pass2_json IS NULL)\").get();
1134        const m = db.prepare(\"SELECT COUNT(*) as c FROM sites WHERE evidence_pass1_json IS NOT NULL AND evidence_pass2_json IS NOT NULL AND evidence_json IS NULL\").get();
1135        process.stdout.write(JSON.stringify({ pending: r.c || 0, awaiting_merge: m.c || 0 }));
1136        db.close();
1137      " 2>/dev/null || echo '{"pending":0,"awaiting_merge":0}')
1138      _ev_pending=$(echo "$_pending_evidence" | node -e "let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{try{process.stdout.write(String(JSON.parse(d).pending||0));}catch{process.stdout.write('0');}})" 2>/dev/null || echo "0")
1139      _ev_merge=$(echo "$_pending_evidence" | node -e "let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{try{process.stdout.write(String(JSON.parse(d).awaiting_merge||0));}catch{process.stdout.write('0');}})" 2>/dev/null || echo "0")
1140      if [ "$_ev_pending" -gt 0 ] 2>/dev/null || [ "$_ev_merge" -gt 0 ] 2>/dev/null; then
1141        log "gather_evidence: ${_ev_pending} sites pending collection, ${_ev_merge} awaiting merge (proposals gate requires evidence_json)"
1142        # Detect evidence-blocked state: outreach pipeline is empty and evidence is the critical path.
1143        # Log clearly so the overseer and monitoring can see this is expected, not a crash.
1144        if [ "${BACKLOG_ELIGIBLE_OUTREACH:-0}" -eq 0 ] && [ "${BACKLOG_PROPOSALS:-0}" -eq 0 ] 2>/dev/null; then
1145          log "EVIDENCE-BLOCKED: eligible_outreach=0 proposals=0 — evidence collection is the critical path (pending=${_ev_pending} awaiting_merge=${_ev_merge})"
1146        fi
1147      else
1148        log "gather_evidence: no sites pending"
1149      fi
1150    fi
1151  
1152    run_checked proofread         opus   "$PROOFREAD_BATCH"   prompts/PROOFREAD.md
1153    run_checked followup_generate sonnet 20                    prompts/FOLLOWUP.md
1154    run_checked classify_replies  haiku  "$CLASSIFY_BATCH"    prompts/CLASSIFY-REPLIES.md
1155    run_checked reply_responses   opus   "$REPLIES_BATCH"     prompts/REPLIES.md
1156  
1157    # --- Time-gated batches ---
1158    run_checked_gated extract_names    haiku  "$NAMES_BATCH"  15
1159    run_checked_gated oversee          sonnet 1               30
1160    run_checked_gated classify_errors  haiku  1               240
1161    run_checked_gated monitor_health   haiku  1               60   prompts/agents/MONITOR-HEALTH.md
1162    run_checked_gated triage_errors    haiku  1               120  prompts/agents/TRIAGE-ERRORS.md
1163    run_checked_gated check_docs       haiku  1               240  prompts/agents/CHECK-DOCS.md
1164    # evidence_merge: bypass time gate when sites are awaiting merge — run every cycle until clear.
1165    # Falls back to 15min gate when backlog is empty (avoids unnecessary idle runs).
1166    if [ "${_ev_merge:-0}" -gt 0 ] 2>/dev/null; then
1167      run_checked evidence_merge haiku 1 prompts/EVIDENCE-MERGE.md
1168    else
1169      run_checked_gated evidence_merge haiku 1 15 prompts/EVIDENCE-MERGE.md
1170    fi
1171  
1172    # --- Code review (1 file per cycle, security-first order) ---
1173    # Gate: pause if pending code_review_fix tasks > 30 (backpressure — don't flood the fix queue)
1174    if [ -z "$SINGLE_TYPE" ] || [ "$SINGLE_TYPE" = "code_review" ]; then
1175      _pending_fixes=$(node -e "
1176        const Database = require('./node_modules/better-sqlite3');
1177        const db = new Database(process.env.DATABASE_PATH || './db/sites.db', { readonly: true });
1178        const r = db.prepare(\"SELECT COUNT(*) as c FROM agent_tasks WHERE task_type='code_review_fix' AND status IN ('pending','running')\").get();
1179        process.stdout.write(String(r.c || 0));
1180        db.close();
1181      " 2>/dev/null || echo "0")
1182      if [ "${_pending_fixes:-0}" -gt 30 ]; then
1183        log "code_review: SKIP — pending_fixes=${_pending_fixes} > 30 (fix backlog too large)"
1184      else
1185        run_checked code_review sonnet 1 prompts/agents/CODE-REVIEW.md
1186      fi
1187    fi
1188  
1189    [ "$had_work" = true ]
1190  }
1191  
1192  # Main
1193  log "Starting orchestrator (loop=$LOOP, type=${SINGLE_TYPE:-all})"
1194  
1195  if [ "$LOOP" = true ]; then
1196    while true; do
1197      if ! process_all_batches; then
1198        if [ "$CONSERVATION_MODE" = "true" ]; then
1199          # Queues appear empty for deferred tasks, but we're still in conservation mode.
1200          # Keep looping — time-critical tasks (replies, classify) may arrive.
1201          log "Queues empty in conservation mode — waiting 5min before retry..."
1202          sleep 300
1203          continue
1204        fi
1205        log "All queues empty — exiting loop"
1206        break
1207      fi
1208      # In conservation mode, pause between cycles to avoid hammering the quota check
1209      if [ "$CONSERVATION_MODE" = "true" ]; then
1210        log "Conservation mode: pausing 5min before next cycle..."
1211        sleep 300
1212      fi
1213      log "Batch complete, starting next cycle..."
1214    done
1215  else
1216    process_all_batches || log "No work to process"
1217  fi
1218  
1219  log "Orchestrator finished"