# scripts/claude-orchestrator.sh
   1  #!/bin/sh
   2  # Claude Orchestrator — Process batches using Claude Code (claude -p)
   3  #
   4  # Usage:
   5  #   scripts/claude-orchestrator.sh              # One batch per type, then exit
   6  #   scripts/claude-orchestrator.sh --loop       # Repeat until all queues empty
   7  #   scripts/claude-orchestrator.sh --type proposals_email  # Single type only
   8  #
   9  # Requires: claude CLI (Claude Max subscription)
  10  # Each batch runs as a separate `claude -p --model <model>` invocation:
  11  #   - Fresh context per batch (no RAM accumulation)
  12  #   - Loads only relevant prompt/context docs
  13  #   - Output piped to claude-store.js for DB persistence
  14  
# Abort on the first unhandled command failure.
set -e

# Resolve this script's directory, then treat its parent as the project root
# so relative paths (logs/, scripts/, .env) work regardless of invocation CWD.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
cd "$PROJECT_ROOT"

# Ensure node and claude are in PATH — NixOS systemd services may not inherit them.
# Must match the same Node.js version that compiled node_modules (nodejs_22).
if ! command -v node >/dev/null 2>&1; then
  # Scan the Nix store for any nodejs-22 install; first executable match wins.
  for d in /nix/store/*-nodejs-22.*/bin; do
    if [ -x "$d/node" ]; then
      export PATH="$d:$PATH"
      break
    fi
  done
fi
  31  # Resolve claude CLI — prefer the latest installed version directly to avoid
  32  # stale symlinks after updates (e.g. ~/.local/bin/claude -> deleted version).
  33  # Extracted to a function so --loop mode can re-resolve each cycle when
  34  # Claude auto-updates delete the previously resolved version.
  35  resolve_claude_bin() {
  36    _claude_bin=""
  37    if [ -d "$HOME/.local/share/claude/versions" ]; then
  38      # Pick the latest *executable binary* — skip wrapper scripts (<1MB) and broken files
  39      for _v in $(ls -1 "$HOME/.local/share/claude/versions" 2>/dev/null | sort -V -r); do
  40        _candidate="$HOME/.local/share/claude/versions/$_v"
  41        _sz=$(stat -c%s "$_candidate" 2>/dev/null || echo 0)
  42        if [ -x "$_candidate" ] && [ "$_sz" -gt 1000000 ]; then
  43          _claude_bin="$_candidate"
  44          break
  45        fi
  46      done
  47    fi
  48    if [ -z "$_claude_bin" ] || [ ! -x "$_claude_bin" ]; then
  49      # Fallback to whatever is on PATH (symlink or system install)
  50      _claude_bin=$(command -v claude 2>/dev/null || echo "")
  51    fi
  52    alias claude="$_claude_bin"
  53    export CLAUDE_BIN="$_claude_bin"
  54  }
resolve_claude_bin

# Self-managed log file when running non-interactively (e.g. systemd).
# "[ ! -t 1 ]" = stdout is not a terminal; from here on, append both stdout
# and stderr to a per-day log file.
if [ ! -t 1 ]; then
  mkdir -p "$PROJECT_ROOT/logs"
  LOG_FILE="$PROJECT_ROOT/logs/orchestrator-$(date +%Y-%m-%d).log"
  exec >> "$LOG_FILE" 2>&1
fi

# Log CLAUDE_BIN resolution after log redirect so it appears in the log file
echo "[$(date '+%Y-%m-%d %H:%M:%S')] [orchestrator] CLAUDE_BIN=$CLAUDE_BIN (executable: $([ -x "$CLAUDE_BIN" ] && echo yes || echo no))"
  66  
  67  LOOP=false
  68  SINGLE_TYPE=""
  69  
  70  # ── Usage limit state ──────────────────────────────────────────────────────
  71  # Claude Max has two windows: 5-hour rolling window and a weekly cap.
  72  # Utilization percentages are read from ~/.claude/usage-cache.json (written
  73  # by the Claude Code TUI after polling api.anthropic.com/api/oauth/usage).
  74  #
  75  # Strategy:
  76  #   1. PROACTIVE: Before each batch cycle, read usage-cache.json. If 5h window
  77  #      is ≥ WARNING_PCT or weekly is ≥ WEEKLY_WARNING_PCT, enter CONSERVATION
  78  #      MODE before hitting the hard limit. This preserves tokens for critical work.
  79  #   2. REACTIVE: If claude -p returns is_error:true or a limit keyword, enter
  80  #      conservation mode immediately.
  81  #
  82  # Conservation mode defers Opus-heavy rewording/proposals; keeps:
  83  #   - reply_responses (Opus) — time-critical inbound replies can't wait
  84  #   - classify_replies / extract_names (Haiku) — cheap; extract_names gated to 15min intervals
  85  #   - oversee (Sonnet) — system health, always run
  86  #   - classify_errors (Haiku) — cheap, always run
  87  #
  88  # Once the window resets (resets_at timestamp passes), exits conservation mode.
  89  CONSECUTIVE_LIMIT_HITS=0
  90  LIMIT_HIT_SINCE=""
  91  CONSERVATION_MODE=false
  92  CONSERVATION_REASON=""
  93  WARNING_PCT=85           # Enter conservation when 5h window ≥ 85% used
  94  WEEKLY_WARNING_PCT=90    # Enter conservation when weekly window ≥ 90% used
  95  USAGE_CACHE="${HOME}/.claude/usage-cache.json"
  96  
  97  while [ $# -gt 0 ]; do
  98    case "$1" in
  99      --loop) LOOP=true; shift ;;
 100      --type) SINGLE_TYPE="$2"; shift 2 ;;
 101      *) echo "Unknown arg: $1"; exit 1 ;;
 102    esac
 103  done
 104  
 105  # Batch sizes per type
 106  PROPOSALS_EMAIL_BATCH=5
 107  PROPOSALS_SMS_BATCH=10
 108  CLASSIFY_BATCH=50
 109  NAMES_BATCH=50
 110  REPLIES_BATCH=10
 111  PROOFREAD_BATCH=50
 112  SCORE_SEMANTIC_BATCH=50
 113  SCORE_SITES_BATCH=10
 114  ENRICH_SITES_BATCH=15
 115  
 116  # Frequency-gated batch state file (tracks last-run timestamps for periodic batch types)
 117  GATE_FILE="$PROJECT_ROOT/logs/orchestrator-gates.json"
 118  
 119  # ── Canary check (runs once per cycle) ───────────────────────────────────────
 120  # Verify external tools can actually execute before attempting any batches.
 121  # Catches: missing binary, wrong architecture (glibc on NixOS), broken PATH,
 122  # missing node_modules, etc. Fails loudly so monitoring picks it up immediately
 123  # rather than silently returning empty results for every batch.
 124  # Called from process_all_batches() so it runs once per cycle in --loop mode too.
run_canary() {
  # Tracks overall health across the three checks below. NOTE(review): this is
  # a global flag set here but not read within this chunk — presumably consumed
  # by the batch loop elsewhere in the file; confirm.
  _canary_ok=true

  # 1. node must be able to run a basic script
  if ! node -e "process.exit(0)" 2>/dev/null; then
    log "FATAL: node is not executable — $(node --version 2>&1 | head -1). All batches will fail."
    _canary_ok=false
  fi

  # 2. claude-batch.js must be loadable (catches missing node_modules, syntax errors)
  if $_canary_ok && ! node scripts/claude-batch.js --canary 2>/dev/null | grep -q "ok"; then
    # Tolerate if --canary flag isn't implemented — just check node can load it without crashing
    node scripts/claude-batch.js --canary 2>/tmp/orch-canary-err.txt || true
    _canary_err=$(cat /tmp/orch-canary-err.txt 2>/dev/null | head -1)
    # Only treat load-time failures (module resolution / syntax / Node ERR_ codes)
    # as fatal; any other stderr output is ignored.
    if echo "$_canary_err" | grep -qiE "cannot find module|SyntaxError|ERR_"; then
      log "FATAL: claude-batch.js failed to load: $_canary_err. All batches will fail."
      _canary_ok=false
    fi
  fi

  # 3. CLAUDE_BIN must be executable (catches glibc/nix-ld issues, broken symlinks, wrapper loops)
  # `env -u CLAUDECODE` unsets CLAUDECODE before invoking the CLI — presumably
  # so the binary doesn't treat this as a nested Claude Code session; confirm.
  if $_canary_ok && [ -n "$CLAUDE_BIN" ]; then
    _canary_out=$(timeout 10s env -u CLAUDECODE "$CLAUDE_BIN" --version 2>/tmp/orch-canary-err.txt || true)
    _canary_err=$(cat /tmp/orch-canary-err.txt 2>/dev/null | head -1)
    if [ -z "$_canary_out" ]; then
      # Re-resolve — Claude may have auto-updated, deleting the old version
      log "WARN: CLAUDE_BIN=$CLAUDE_BIN failed (${_canary_err:-no output}). Re-resolving..."
      resolve_claude_bin
      if [ -n "$CLAUDE_BIN" ]; then
        # Retry --version once against the freshly resolved binary.
        _canary_out=$(timeout 10s env -u CLAUDECODE "$CLAUDE_BIN" --version 2>/tmp/orch-canary-err.txt || true)
        _canary_err=$(cat /tmp/orch-canary-err.txt 2>/dev/null | head -1)
      fi
      if [ -z "$_canary_out" ]; then
        log "FATAL: CLAUDE_BIN=$CLAUDE_BIN still failed after re-resolve: ${_canary_err:-no output}. All LLM batches will return empty."
        _canary_ok=false
      else
        log "Canary OK after re-resolve: node=$(node --version 2>/dev/null), claude=$_canary_out"
      fi
    else
      log "Canary OK: node=$(node --version 2>/dev/null), claude=$_canary_out"
    fi
  elif $_canary_ok; then
    # CLAUDE_BIN empty — try resolving (may have been installed since startup)
    resolve_claude_bin
    if [ -n "$CLAUDE_BIN" ]; then
      log "Resolved CLAUDE_BIN=$CLAUDE_BIN after initially empty"
    else
      log "WARN: CLAUDE_BIN is empty — LLM batches will fail. Check PATH and ~/.local/share/claude/versions/."
      _canary_ok=false
    fi
  fi
  # Scratch file used by checks 2 and 3 above.
  rm -f /tmp/orch-canary-err.txt
}
 178  
 179  # Effort level per batch type — matches Agency Agents plan.
 180  # Controls how much reasoning the model uses. Saves tokens on simple tasks.
 181  # Levels: low (classification/extraction), medium (analysis), high (reasoning/creative), max (extended thinking)
 182  effort_for_batch() {
 183    case "$1" in
 184      proposals_email)      echo "high" ;;
 185      proposals_sms)        echo "medium" ;;
 186      classify_replies)     echo "medium" ;;
 187      extract_names)        echo "low" ;;
 188      reply_responses)      echo "high" ;;
 189      proofread)            echo "medium" ;;
 190      score_sites)          echo "medium" ;;
 191      score_semantic)       echo "medium" ;;
 192      enrich_sites)         echo "low" ;;
 193      oversee)              echo "medium" ;;
 194      classify_errors)      echo "low" ;;
 195      code_review)          echo "medium" ;;
 196      monitor_health)       echo "low" ;;
 197      triage_errors)        echo "medium" ;;
 198      check_docs)           echo "low" ;;
 199      evidence_merge)       echo "low" ;;
 200      followup_generate)    echo "medium" ;;
 201      *)                    echo "medium" ;;
 202    esac
 203  }
 204  
 205  # Check if a frequency-gated batch type is due to run.
 206  # Args: gate_type interval_minutes
 207  # Returns 0 (run it) or 1 (skip — not due yet)
is_due() {
  gate_type="$1"       # key in GATE_FILE (JSON object of ISO timestamps)
  interval_mins="$2"   # minimum minutes between runs
  # No gate file yet → nothing has ever run → due.
  if [ ! -f "$GATE_FILE" ]; then return 0; fi
  # Milliseconds elapsed since this gate's recorded timestamp. Emits '0' when
  # the gate has no entry or the file is unreadable/corrupt.
  elapsed=$(node -e "
    try {
      const d = JSON.parse(require('fs').readFileSync('$GATE_FILE','utf8'));
      const ts = d['$gate_type'];
      process.stdout.write(ts ? String(Date.now() - new Date(ts).getTime()) : '0');
    } catch { process.stdout.write('0'); }
  " 2>/dev/null)
  [ -z "$elapsed" ] && elapsed=0
  threshold=$((interval_mins * 60 * 1000))
  # Due when the interval has elapsed. elapsed='0' (missing entry / parse
  # failure) also counts as due. Non-numeric garbage makes the -ge test fail
  # silently (stderr suppressed) → treated as not due unless exactly '0'.
  [ "$elapsed" -ge "$threshold" ] 2>/dev/null || [ "$elapsed" = "0" ]
}
 223  
 224  # Record that a gated batch type just ran
mark_ran() {
  gate_type="$1"   # gate key to stamp
  # Merge-write: preserve all other gates' timestamps, set this one to now
  # (ISO 8601). A corrupt/missing gate file is silently replaced.
  node -e "
    const fs = require('fs');
    let d = {};
    try { d = JSON.parse(fs.readFileSync('$GATE_FILE','utf8')); } catch {}
    d['$gate_type'] = new Date().toISOString();
    fs.writeFileSync('$GATE_FILE', JSON.stringify(d, null, 2));
  " 2>/dev/null
}
 235  
 236  log() {
 237    echo "[$(date '+%Y-%m-%d %H:%M:%S')] [orchestrator] $*"
 238  }
 239  
 240  # Proactive: fetch live usage from Anthropic API, fall back to cache.
 241  # Reads OAuth token from ~/.claude/.credentials.json (Linux credential store).
 242  # Called once per batch cycle (before processing batches).
check_usage_proactive() {
  CREDS_FILE="${HOME}/.claude/.credentials.json"

  # Try live fetch first (updates cache for TUI too). Fallback chain:
  # live API → usage-cache.json → hard-coded zero-usage JSON (fails open).
  usage=$(node -e "
    const fs = require('fs');
    const https = require('https');
    let token = '';
    try {
      const creds = JSON.parse(fs.readFileSync('$CREDS_FILE','utf8'));
      token = creds.claudeAiOauth?.accessToken || '';
    } catch {}

    if (!token) {
      // Fall back to cache
      try {
        const d = JSON.parse(fs.readFileSync('$USAGE_CACHE','utf8'));
        process.stdout.write(JSON.stringify({...d, source:'cache_no_token'}));
      } catch { process.stdout.write('{\"five_hour\":0,\"seven_day\":0,\"source\":\"none\"}'); }
      return;
    }

    const req = https.request({
      hostname: 'api.anthropic.com',
      path: '/api/oauth/usage',
      method: 'GET',
      headers: {
        'Authorization': 'Bearer ' + token,
        'anthropic-beta': 'oauth-2025-04-20',
        'User-Agent': 'claude-orchestrator/1.0'
      },
      timeout: 8000
    }, (res) => {
      let body = '';
      res.on('data', c => body += c);
      res.on('end', () => {
        try {
          const raw = JSON.parse(body);
          const result = {
            five_hour: Math.round(raw.five_hour?.utilization || 0),
            seven_day: Math.round(raw.seven_day?.utilization || 0),
            five_hour_resets_at: raw.five_hour?.resets_at || '',
            seven_day_resets_at: raw.seven_day?.resets_at || '',
            source: 'live'
          };
          // Update cache file for TUI
          try { fs.writeFileSync('$USAGE_CACHE', JSON.stringify({...result, stale: false})); } catch {}
          process.stdout.write(JSON.stringify(result));
        } catch { process.stdout.write('{\"five_hour\":0,\"seven_day\":0,\"source\":\"parse_error\"}'); }
      });
    });
    req.on('error', () => {
      // Fall back to cache on network error
      try {
        const d = JSON.parse(fs.readFileSync('$USAGE_CACHE','utf8'));
        process.stdout.write(JSON.stringify({...d, source:'cache_fallback'}));
      } catch { process.stdout.write('{\"five_hour\":0,\"seven_day\":0,\"source\":\"error\"}'); }
    });
    req.on('timeout', () => { req.destroy(); });
    req.end();
  " 2>/dev/null || node -e "try{const d=JSON.parse(require('fs').readFileSync('$USAGE_CACHE','utf8'));d.source='node_error_cache';process.stdout.write(JSON.stringify(d));}catch{process.stdout.write('{\"five_hour\":0,\"seven_day\":0,\"source\":\"node_error\"}');}" 2>/dev/null || echo '{"five_hour":0,"seven_day":0,"source":"node_error"}')

  # Extract individual fields from the usage JSON. Each extraction is its own
  # tiny node stdin parser and falls back to a safe default on any failure.
  five_hour=$(echo "$usage" | node -e "let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{try{process.stdout.write(String(Math.round(JSON.parse(d).five_hour||0)));}catch{process.stdout.write('0');}})" 2>/dev/null || echo "0")
  seven_day=$(echo "$usage" | node -e "let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{try{process.stdout.write(String(Math.round(JSON.parse(d).seven_day||0)));}catch{process.stdout.write('0');}})" 2>/dev/null || echo "0")
  five_resets=$(echo "$usage" | node -e "let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{try{process.stdout.write(JSON.parse(d).five_hour_resets_at||'');}catch{process.stdout.write('');}})" 2>/dev/null || echo "")

  source=$(echo "$usage" | node -e "let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{try{process.stdout.write(JSON.parse(d).source||'?');}catch{process.stdout.write('?');}})" 2>/dev/null || echo "?")
  log "Usage [${source}]: 5h=${five_hour}% weekly=${seven_day}%  (warn at ${WARNING_PCT}%/${WEEKLY_WARNING_PCT}%)"

  # Check if the 5h window has reset since we entered conservation.
  # NOTE(review): `date -d` is GNU-specific — fine on NixOS, not portable to BSD.
  if [ "$CONSERVATION_MODE" = "true" ] && [ -n "$five_resets" ] && [ -n "$LIMIT_HIT_SINCE" ]; then
    resets_epoch=$(date -d "$five_resets" +%s 2>/dev/null || echo "0")
    hit_epoch=$(date -d "$LIMIT_HIT_SINCE" +%s 2>/dev/null || echo "0")
    if [ "$resets_epoch" -gt 0 ] && [ "$resets_epoch" -gt "$hit_epoch" ]; then
      # Window reset after we entered conservation → fully clear state and the
      # persisted claude_cli rate limit (best-effort; module may be absent).
      CONSERVATION_MODE=false
      CONSECUTIVE_LIMIT_HITS=0
      LIMIT_HIT_SINCE=""
      CONSERVATION_REASON=""
      node -e "const { clearRateLimit } = require('$PROJECT_ROOT/src/utils/rate-limit-scheduler.js'); clearRateLimit('claude_cli');" 2>/dev/null || true
      log "CONSERVATION MODE OFF — 5h window reset at $five_resets. Usage now ${five_hour}%. Resuming full operation."
      return
    fi
    # Also exit if usage has dropped back below warning threshold
    # (10-point hysteresis so we don't flap around the threshold).
    if [ "$five_hour" -lt "$((WARNING_PCT - 10))" ] && [ "$seven_day" -lt "$((WEEKLY_WARNING_PCT - 10))" ]; then
      CONSERVATION_MODE=false
      CONSECUTIVE_LIMIT_HITS=0
      LIMIT_HIT_SINCE=""
      CONSERVATION_REASON=""
      node -e "const { clearRateLimit } = require('$PROJECT_ROOT/src/utils/rate-limit-scheduler.js'); clearRateLimit('claude_cli');" 2>/dev/null || true
      log "CONSERVATION MODE OFF — usage recovered (5h=${five_hour}% weekly=${seven_day}%). Resuming full operation."
      return
    fi
  fi

  # Enter conservation proactively when either window crosses its threshold.
  if [ "$CONSERVATION_MODE" = "false" ]; then
    if [ "$five_hour" -ge "$WARNING_PCT" ]; then
      CONSERVATION_MODE=true
      LIMIT_HIT_SINCE=$(date '+%Y-%m-%d %H:%M:%S')
      CONSERVATION_REASON="5h=${five_hour}% (>=${WARNING_PCT}%)"
      log "CONSERVATION MODE ON (proactive) — 5h window at ${five_hour}%. Deferring Opus rewording/proposals. Keeping replies+Haiku. Resets: $five_resets"
    elif [ "$seven_day" -ge "$WEEKLY_WARNING_PCT" ]; then
      CONSERVATION_MODE=true
      LIMIT_HIT_SINCE=$(date '+%Y-%m-%d %H:%M:%S')
      CONSERVATION_REASON="weekly=${seven_day}% (>=${WEEKLY_WARNING_PCT}%)"
      log "CONSERVATION MODE ON (proactive) — weekly window at ${seven_day}%. Deferring Opus rewording/proposals. Resets: $(echo "$usage" | node -e "let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{try{process.stdout.write(JSON.parse(d).seven_day_resets_at||'');}catch{process.stdout.write('');}});")"
    fi
  fi
}
 352  
 353  # Reactive: detect hard limit errors from claude -p envelope.
 354  # Args: raw_result (full JSON envelope string), batch_type
 355  check_limit_signal() {
 356    raw="$1"
 357    btype="$2"
 358  
 359    [ -z "$raw" ] && return
 360  
 361    is_limit=$(echo "$raw" | node -e "
 362      let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{
 363        try {
 364          const env = JSON.parse(d);
 365          // Only scan envelope-level metadata for limit signals — NOT env.result (LLM content).
 366          // LLM-generated text (proposals, emails, rewrites) may mention "rate limit" etc.
 367          const meta = [
 368            env.is_error ? 'is_error' : '',
 369            typeof env.stop_reason === 'string' ? env.stop_reason : '',
 370            typeof env.error === 'string' ? env.error : '',
 371            typeof env.error?.message === 'string' ? env.error.message : '',
 372            env.is_error && typeof env.result === 'string' && env.result.length < 500 ? env.result : '',
 373          ].join(' ').toLowerCase();
 374          const hasKeyword = env.is_error && /rate.?limit|usage.?limit|claude.?max|overloaded|quota|529|too.?many.?request/.test(meta);
 375          process.stdout.write(hasKeyword ? '1' : '0');
 376        } catch { process.stdout.write('0'); }
 377      });
 378    " 2>/dev/null || echo "0")
 379  
 380    if [ "$is_limit" = "1" ]; then
 381      CONSECUTIVE_LIMIT_HITS=$((CONSECUTIVE_LIMIT_HITS + 1))
 382      [ -z "$LIMIT_HIT_SINCE" ] && LIMIT_HIT_SINCE=$(date '+%Y-%m-%d %H:%M:%S')
 383      CONSERVATION_REASON="hard limit on $btype"
 384      if [ "$CONSERVATION_MODE" = "false" ]; then
 385        CONSERVATION_MODE=true
 386        log "CONSERVATION MODE ON (reactive) — hard limit on $btype (hit #$CONSECUTIVE_LIMIT_HITS). Deferring Opus tasks."
 387      else
 388        log "Usage limit still active on $btype (hit #$CONSECUTIVE_LIMIT_HITS since $LIMIT_HIT_SINCE)"
 389      fi
 390  
 391      # Persist to rate-limits.json so the state survives orchestrator restarts
 392      # and is visible to should_skip_for_rate_limits() on next cycle.
 393      # Uses 'hourly' reset window (1h backoff) — conservation mode recovery
 394      # logic may clear it sooner if usage drops below threshold.
 395      node -e "
 396        const { setRateLimit } = require('$PROJECT_ROOT/src/utils/rate-limit-scheduler.js');
 397        setRateLimit('claude_cli', {
 398          limitType: 'hourly',
 399          reason: 'Claude Max hard limit on $btype (hit #$CONSECUTIVE_LIMIT_HITS)',
 400        });
 401      " 2>/dev/null || log "WARN: Failed to persist claude_cli rate limit to rate-limits.json"
 402    fi
 403  }
 404  
 405  # Query pipeline backlog counts for intelligent batch skipping.
 406  # All counts are ACTIONABLE — filtered to only items that can reach a sent outreach.
 407  # The core filter is ACTIVE_COUNTRY_CODES = all countries NOT in OUTREACH_BLOCKED_COUNTRIES.
 408  # This filter is inherited by every gate so that blocking a country upstream stops all work
 409  # for that country at every stage (scoring, enriching, proposals, reword).
 410  #
 411  # Sets global vars:
 412  #   BACKLOG_ELIGIBLE_OUTREACH  — approved email/sms, delivery_status IS NULL, past 3d cooldown, gdpr_verified where required
 413  #   BACKLOG_PROPOSALS          — proposals_drafted sites with unsent email/sms in active countries
 414  #   BACKLOG_ENRICHED           — enriched sites in active countries, score < LOW_SCORE_CUTOFF
 415  #   BACKLOG_ACTIVE_COUNTRIES   — count of active (non-blocked) country codes in pipeline
 416  #   BACKLOG_APPROVED_UNSENT    — all approved+unsent (any channel, for display only)
 417  #   BACKLOG_PENDING_APPROVAL   — pending approval count (display only)
refresh_backlog() {
  # Read OUTREACH_SKIP_METHODS, OUTREACH_BLOCKED_COUNTRIES, and SMS-blocked countries from .env.
  # Parsing: last assignment wins (tail -1), inline comments stripped, quotes
  # removed, lowercased, all spaces deleted.
  _skip_methods=$(grep '^OUTREACH_SKIP_METHODS=' "$PROJECT_ROOT/.env" 2>/dev/null | tail -1 | cut -d= -f2- | sed 's/#.*//' | tr -d '"' | tr -d "'" | tr '[:upper:]' '[:lower:]' | tr -d ' ')
  _blocked_countries=$(grep '^OUTREACH_BLOCKED_COUNTRIES=' "$PROJECT_ROOT/.env" 2>/dev/null | tail -1 | cut -d= -f2- | sed 's/#.*//' | tr -d '"' | tr -d "'" | tr '[:upper:]' '[:lower:]' | tr -d ' ')
  _sms_blocked_countries=$(grep '^OUTREACH_BLOCKED_SMS_COUNTRIES=' "$PROJECT_ROOT/.env" 2>/dev/null | tail -1 | cut -d= -f2- | sed 's/#.*//' | tr -d '"' | tr -d "'" | tr '[:upper:]' '[:lower:]' | tr -d ' ')

  # refresh-backlog.js is expected to print shell variable assignments
  # (BACKLOG_*, CANARY_SITE_COUNT, CANARY_PREVIOUS_COUNT — see header comment
  # and the canary below) which are eval'd into this shell.
  # NOTE(review): eval trusts that script's stdout completely — confirm it can
  # only ever emit KEY=VALUE lines.
  eval "$(node "$PROJECT_ROOT/scripts/refresh-backlog.js" \
    --skip-methods="$_skip_methods" \
    --blocked-countries="$_blocked_countries" \
    --sms-blocked-countries="$_sms_blocked_countries" \
    --low-score-cutoff="$LOW_SCORE_CUTOFF" \
    --project-root="$PROJECT_ROOT" 2>/dev/null)"

  # ── Data-loss canary: halt if site count dropped >50% ──
  # Only armed once the previous count exceeds 100, so a fresh/small DB never
  # trips it.
  if [ "${CANARY_PREVIOUS_COUNT:-0}" -gt 100 ] && [ "${CANARY_SITE_COUNT:-0}" -gt 0 ]; then
    _drop_pct=$(( (CANARY_PREVIOUS_COUNT - CANARY_SITE_COUNT) * 100 / CANARY_PREVIOUS_COUNT ))
    if [ "$_drop_pct" -gt 50 ]; then
      log "CRITICAL: Site count dropped ${_drop_pct}% (${CANARY_PREVIOUS_COUNT} → ${CANARY_SITE_COUNT}) — HALTING ORCHESTRATOR"
      log "ACTION REQUIRED: Check PostgreSQL mmo database integrity. Restore from backup if needed."
      log "To resume: rm logs/site-count-canary.json && restart orchestrator"
      exit 1
    fi
  elif [ "${CANARY_PREVIOUS_COUNT:-0}" -gt 100 ] && [ "${CANARY_SITE_COUNT:-0}" -eq 0 ]; then
    # Zero sites where there used to be >100 → treat as a wiped database.
    log "CRITICAL: Site count is ZERO (was ${CANARY_PREVIOUS_COUNT}) — HALTING ORCHESTRATOR"
    log "ACTION REQUIRED: Database has been wiped. Restore from backup immediately."
    log "To resume: rm logs/site-count-canary.json && restart orchestrator"
    exit 1
  fi
}
 447  
 448  # Load STAGE_THROTTLE_MULTIPLIER and LOW_SCORE_CUTOFF from .env (defaults: 3 and 82).
 449  # Read once at startup — restart orchestrator to pick up changes.
 450  STAGE_THROTTLE_MULTIPLIER=$(grep '^STAGE_THROTTLE_MULTIPLIER=' "$PROJECT_ROOT/.env" 2>/dev/null | tail -1 | cut -d= -f2- | sed 's/#.*//' | tr -d '"' | tr -d "'" | tr -d ' ')
 451  STAGE_THROTTLE_MULTIPLIER=${STAGE_THROTTLE_MULTIPLIER:-3}
 452  LOW_SCORE_CUTOFF=$(grep '^LOW_SCORE_CUTOFF=' "$PROJECT_ROOT/.env" 2>/dev/null | tail -1 | cut -d= -f2- | sed 's/#.*//' | tr -d '"' | tr -d "'" | tr -d ' ')
 453  LOW_SCORE_CUTOFF=${LOW_SCORE_CUTOFF:-82}
 454  
 455  # Load SKIP_STAGES from .env on every loop iteration so changes take effect without restart.
 456  # Always reads .env — edit the file to change skip behaviour live.
 457  load_skip_stages() {
 458    if [ -f "$PROJECT_ROOT/.env" ]; then
 459      val=$(grep '^SKIP_STAGES=' "$PROJECT_ROOT/.env" 2>/dev/null | tail -1 | cut -d= -f2- | sed 's/#.*//' | tr -d '"' | tr -d "'")
 460      SKIP_STAGES="$val"
 461    fi
 462    # Normalise: lowercase, strip spaces
 463    SKIP_STAGES=$(echo "${SKIP_STAGES:-}" | tr '[:upper:]' '[:lower:]' | tr -d ' ')
 464  }
 465  
 466  # Check if a batch type is blocked by the pipeline's rate-limit-scheduler state.
 467  # Reads logs/rate-limits.json (written by circuit-breaker.js when an API quota/rate-limit hits).
 468  # This ensures the orchestrator respects the same automatic backoffs as the pipeline —
 469  # e.g. if ZenRows hits its daily quota and blocks 'serps', both systems honour that pause.
 470  # Returns 0 = skip (rate-limited), 1 = run
 471  should_skip_for_rate_limits() {
 472    batch_type="$1"
 473    rate_limits_file="$PROJECT_ROOT/logs/rate-limits.json"
 474    [ -f "$rate_limits_file" ] || return 1  # No file = no active rate limits
 475  
 476    # Map batch_type → stage name to check in rate-limits.json
 477    case "$batch_type" in
 478      proposals_email|proposals_sms)                               stage="proposals" ;;
 479      reply_responses)                                              stage="outreach" ;;
 480      classify_replies)                                             stage="replies" ;;
 481      score_semantic|score_sites)                                   stage="scoring" ;;
 482      enrich_sites)                                                 stage="enrich" ;;
 483      followup_generate)                                            stage="outreach" ;;
 484      *)                                                            stage="" ;;
 485    esac
 486  
 487    now_ms=$(date +%s%3N 2>/dev/null || echo "0")
 488  
 489    # Check each entry: if resetAt > now and stages includes our stage, skip.
 490    # Also check 'claude_cli' — when Claude Max credits are exhausted, ALL orchestrator
 491    # batches are blocked regardless of stage mapping (they all use claude -p).
 492    matched=$(node -e "
 493      try {
 494        const data = JSON.parse(require('fs').readFileSync('$rate_limits_file', 'utf8'));
 495        const stage = '$stage';
 496        const now = $now_ms;
 497  
 498        // Global claude_cli gate — blocks ALL orchestrator batches
 499        if (data.claude_cli && Number(data.claude_cli.resetAt) > now) {
 500          const resetAt = new Date(Number(data.claude_cli.resetAt)).toISOString();
 501          process.stdout.write('claude_cli:' + resetAt + ':' + (data.claude_cli.reason || ''));
 502          process.exit(0);
 503        }
 504  
 505        // Stage-specific gate (e.g. openrouter scoring, zenrows serps)
 506        if (stage) {
 507          for (const [api, info] of Object.entries(data)) {
 508            if (Array.isArray(info.stages) && info.stages.includes(stage) && Number(info.resetAt) > now) {
 509              const resetAt = new Date(Number(info.resetAt)).toISOString();
 510              process.stdout.write(api + ':' + resetAt + ':' + (info.reason || ''));
 511              process.exit(0);
 512            }
 513          }
 514        }
 515      } catch(e) {}
 516    " 2>/dev/null)
 517  
 518    if [ -n "$matched" ]; then
 519      api=$(echo "$matched" | cut -d: -f1)
 520      reset_at=$(echo "$matched" | cut -d: -f2)
 521      reason=$(echo "$matched" | cut -d: -f3-)
 522      log "$batch_type: SKIP (rate-limited by $api until $reset_at — $reason)"
 523      return 0
 524    fi
 525    return 1
 526  }
 527  
 528  # Check if a batch type is blocked by SKIP_STAGES.
 529  # Returns 0 = skip (stage paused), 1 = run
 530  should_skip_for_stage_flag() {
 531    batch_type="$1"
 532    [ -z "$SKIP_STAGES" ] && return 1  # Nothing skipped
 533  
 534    # Map batch types to pipeline stage names
 535    case "$batch_type" in
 536      proposals_email|proposals_sms)
 537        echo "$SKIP_STAGES" | grep -q 'proposals' && {
 538          log "$batch_type: SKIP (SKIP_STAGES=proposals)"; return 0; } ;;
 539      reply_responses)
 540        echo "$SKIP_STAGES" | grep -q 'outreach\|replies\|reply' && {
 541          log "$batch_type: SKIP (SKIP_STAGES includes outreach/replies)"; return 0; } ;;
 542      classify_replies)
 543        echo "$SKIP_STAGES" | grep -q 'replies\|classify' && {
 544          log "$batch_type: SKIP (SKIP_STAGES=replies/classify)"; return 0; } ;;
 545      extract_names)
 546        echo "$SKIP_STAGES" | grep -q 'extract_names\|extract' && {
 547          log "$batch_type: SKIP (SKIP_STAGES=extract_names)"; return 0; } ;;
 548      proofread)
 549        echo "$SKIP_STAGES" | grep -q 'proofread' && {
 550          log "$batch_type: SKIP (SKIP_STAGES=proofread)"; return 0; } ;;
 551      score_semantic|score_sites)
 552        echo "$SKIP_STAGES" | grep -q 'scoring' && {
 553          log "$batch_type: SKIP (SKIP_STAGES=scoring)"; return 0; } ;;
 554      enrich_sites)
 555        echo "$SKIP_STAGES" | grep -q 'enrich' && {
 556          log "$batch_type: SKIP (SKIP_STAGES=enrich)"; return 0; } ;;
 557      followup_generate)
 558        echo "$SKIP_STAGES" | grep -q 'followup' && {
 559          log "$batch_type: SKIP (SKIP_STAGES=followup)"; return 0; } ;;
 560    esac
 561    return 1  # Not skipped
 562  }
 563  
 564  # Check if a batch type should be skipped because its downstream queue is already saturated.
 565  # Uses STAGE_THROTTLE_MULTIPLIER (default 3) × the relevant batch size as the threshold.
 566  #
 567  # Stage chain (each gate pauses all earlier stages):
 568  #   enriched → [proposals_*] → proposals_drafted → [proofread] → approved outreach
 569  #
 570  # Gate 1 (outreach gate):  if eligible outreach > 3× rolling 7-day avg daily send rate
 571  #                           (floor: 3× proofread_batch, so the gate isn't too permissive
 572  #                           when historical send rate is very low or zero)
 573  #                           pause: proofread, proposals_*, enrich_sites, score_semantic
 574  # Gate 2 (proposals gate): if proposals_drafted > 3× (proposals_email_batch + proposals_sms_batch)
 575  #                           pause: proposals_*, enrich_sites
 576  # Gate 3 (enrich gate):    if enriched > 3× enrich_sites_batch
 577  #                           pause: enrich_sites, score_semantic
 578  #
 579  # Exception: serps is never paused (ZenRows daily quota must be used).
 580  # Returns 0 = skip, 1 = run
 581  should_skip_for_backlog() {
 582    batch_type="$1"
 583  
 584    # Gate 1: outreach queue saturated — pause everything feeding into it.
 585    # Use 3× the rolling 7-day daily send average as the threshold, with a floor
 586    # of 3× PROOFREAD_BATCH to prevent the gate from being too permissive when
 587    # the send rate is very low (e.g. pipeline just started, GDPR blocking, etc).
 588    _static_floor=$((PROOFREAD_BATCH * STAGE_THROTTLE_MULTIPLIER))
 589    _send_rate_threshold=$((${BACKLOG_DAILY_SEND_AVG:-0} * STAGE_THROTTLE_MULTIPLIER))
 590    _outreach_threshold=$(( _send_rate_threshold > _static_floor ? _send_rate_threshold : _static_floor ))
 591  
 592    case "$batch_type" in
 593      proofread|proposals_email|proposals_sms|enrich_sites|score_semantic)
 594        if [ "${BACKLOG_ELIGIBLE_OUTREACH:-0}" -gt "$_outreach_threshold" ]; then
 595          log "$batch_type: SKIP — eligible_outreach=${BACKLOG_ELIGIBLE_OUTREACH} > threshold=${_outreach_threshold} (${STAGE_THROTTLE_MULTIPLIER}×max(daily_avg=${BACKLOG_DAILY_SEND_AVG:-0},floor=${PROOFREAD_BATCH}))"
 596          return 0
 597        fi
 598        ;;
 599    esac
 600  
 601    # Gate 2: proposals_drafted queue saturated — pause proposal generation and earlier stages.
 602    # Threshold based on PROOFREAD throughput (the bottleneck), not proposal generation batch size.
 603    # Proofread clears PROOFREAD_BATCH per cycle; allow 3× that buffer before pausing upstream.
 604    _proposals_threshold=$(( PROOFREAD_BATCH * STAGE_THROTTLE_MULTIPLIER ))
 605  
 606    case "$batch_type" in
 607      proposals_email|proposals_sms|enrich_sites)
 608        if [ "${BACKLOG_PROPOSALS:-0}" -gt "$_proposals_threshold" ]; then
 609          log "$batch_type: SKIP — proposals_drafted=${BACKLOG_PROPOSALS} > threshold=${_proposals_threshold} (${STAGE_THROTTLE_MULTIPLIER}×${_proposals_batch_max})"
 610          return 0
 611        fi
 612        ;;
 613    esac
 614  
 615    # Gate 3: enriched queue saturated — pause enrich and scoring.
 616    # Threshold based on proposals throughput (PROPOSALS_EMAIL + PROPOSALS_SMS per cycle),
 617    # not enrich batch size. Enriched sites are consumed by proposal generation.
 618    _proposals_batch_max=$(( PROPOSALS_EMAIL_BATCH + PROPOSALS_SMS_BATCH ))
 619    _enrich_threshold=$(( _proposals_batch_max * STAGE_THROTTLE_MULTIPLIER * 5 ))
 620  
 621    case "$batch_type" in
 622      enrich_sites|score_semantic)
 623        if [ "${BACKLOG_ENRICHED:-0}" -gt "$_enrich_threshold" ]; then
 624          log "$batch_type: SKIP — enriched=${BACKLOG_ENRICHED} > threshold=${_enrich_threshold} (${STAGE_THROTTLE_MULTIPLIER}×${ENRICH_SITES_BATCH})"
 625          return 0
 626        fi
 627        ;;
 628    esac
 629  
 630    return 1  # Don't skip — run normally
 631  }
 632  
 633  # Run a single batch type
 634  # Args: batch_type model batch_size context_files...
 635  run_batch() {
 636    batch_type="$1"
 637    model="$2"
 638    batch_size="$3"
 639    shift 3
 640    # Remaining args are context file paths
 641  
 642    log "Pulling $batch_type batch (limit=$batch_size)..."
 643    batch_json=$(node scripts/claude-batch.js "$batch_type" "$batch_size" 2>/dev/null || echo '{"count":0,"items":[]}')
 644  
 645    count=$(echo "$batch_json" | node -e "
 646      let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{
 647        try { process.stdout.write(String(JSON.parse(d).count||0)); }
 648        catch { process.stdout.write('0'); }
 649      });
 650    " 2>/dev/null || echo "0")
 651    [ -z "$count" ] && count=0
 652  
 653    if [ "$count" = "0" ]; then
 654      log "$batch_type: no items to process"
 655      return 1  # Signal "empty" to caller
 656    fi
 657  
 658    _effort_log=$(effort_for_batch "$batch_type")
 659    log "$batch_type: processing $count items with $model (effort=$_effort_log)..."
 660  
 661    # Build the prompt — include context files and the batch data
 662    # Inject brand/persona env vars into prompt templates (bash expansion, safe with special chars)
 663    context=""
 664    for f in "$@"; do
 665      if [ -f "$f" ]; then
 666        _file_content=$(cat "$f")
 667        _file_content="${_file_content//\[BRAND_NAME\]/${BRAND_NAME:-the brand}}"
 668        _file_content="${_file_content//BRAND_DOMAIN/${BRAND_DOMAIN:-the site}}"
 669        _file_content="${_file_content//BRAND_URL/${BRAND_URL:-the site}}"
 670        context="$context
 671  
 672  --- FILE: $f ---
 673  ${_file_content}
 674  --- END FILE ---"
 675      fi
 676    done
 677  
 678    # Build task-specific instructions
 679    case "$batch_type" in
 680      proposals_email)
 681        task_prompt="Generate email proposals for each site below. Follow the PROPOSAL.md and email best practices. Output JSON: {\"batch_type\":\"proposals_email\",\"results\":[{\"site_id\":N,\"contact_method\":\"email\",\"contact_uri\":\"...\",\"country_code\":\"XX\",\"message_body\":\"...\",\"subject_line\":\"...\"}]}"
 682        ;;
 683      proposals_sms)
 684        task_prompt="Generate SMS proposals (<160 chars) for each site below. Follow PROPOSAL.md and SMS best practices. TCPA opt-out required for US/CA only. Output JSON: {\"batch_type\":\"proposals_sms\",\"results\":[{\"site_id\":N,\"contact_method\":\"sms\",\"contact_uri\":\"...\",\"country_code\":\"XX\",\"message_body\":\"...\"}]}"
 685        ;;
 686      classify_replies)
 687        task_prompt="$(cat prompts/CLASSIFY-REPLIES.md)"
 688        ;;
 689      extract_names)
 690        task_prompt="Extract the person's first name from these email addresses and domains. If no name can be inferred, leave null. Output JSON: {\"batch_type\":\"extract_names\",\"results\":[{\"site_id\":N,\"contacts\":[{\"uri\":\"...\",\"name\":\"FirstName\"}]}]}"
 691        ;;
 692      reply_responses)
 693        task_prompt="Follow the reply guidelines in prompts/REPLIES.md. Respond to every inbound in the batch. Output ONLY valid JSON — no markdown, no explanation."
 694        ;;
 695      oversee)
 696        task_prompt="You are the senior overseer of an automated lead-generation pipeline (333 Method). Analyze the system health snapshot and decide what corrective actions to take. The structured JSON is the authoritative source of truth — previous_overseer_findings is historical context only.
 697  KNOWN SELF-HEALING BEHAVIOURS (do NOT escalate): cron_timer_dead is auto-fixed by processGuardian every minute. 1-minute cron gaps are expected.
 698  COST MODEL (do NOT flag as expensive): All LLM batches run via `claude -p --model <name>` which uses the Claude Max subscription — zero marginal cost per call regardless of model (opus/sonnet/haiku). Do NOT flag opus or sonnet model usage as a cost issue. Only OpenRouter direct-API calls (gather_evidence, scoring, replies) count against the $200/day LLM_DAILY_BUDGET.
 699  VALID PIPELINE STATUSES (ALL are intentional — do NOT flag as invalid): found, assets_captured, prog_scored, semantic_scored, vision_scored, enriched_regex, enriched_llm, enriched, proposals_drafted, outreach_partial, outreach_sent, rescored, skipped, error. enriched_regex is an intermediate enrichment step (browser pass complete, LLM pass pending) — it is expected and normal.
 700  Available actions: RESTART_PIPELINE (if stopped/crash-looping), CLEAR_STALE_TASKS (params: min_age_hours), RESET_STUCK_SITES (params: stage, min_stuck_hours ≥ 4 — never lower), LOG_ONLY (params: note). Do NOT reset a stage when its target already has a large backlog (found>5000: don't reset assets_captured or scored; rescored>500: don't reset enriched or rescored). Sites with recapture_count≥2 are auto-excluded from resets. NEVER reset 'enriched' stage — enriched is a deliberate evidence-collection queue (gather_evidence cron picks sites up every 5 min); large enriched backlogs are normal and expected.
 701  Output JSON: {\"batch_type\":\"oversee\",\"results\":[{\"summary\":\"1-2 sentences\",\"severity\":\"ok|warn|critical\",\"findings\":[{\"issue\":\"...\",\"severity\":\"low|medium|high\",\"stage\":\"pipeline|agents|cron|sites\"}],\"actions\":[{\"code\":\"ACTION_CODE\",\"description\":\"...\",\"params\":{}}]}]}"
 702        ;;
 703      classify_errors)
 704        task_prompt="Analyze these pipeline error messages that don't match any existing patterns. Propose regex patterns for categorization using JavaScript regex syntax (no leading/trailing slashes). Group similar errors into one pattern where possible. Terminal = permanent failure; Retriable = transient or fixable.
 705  Output JSON: {\"batch_type\":\"classify_errors\",\"results\":[{\"pattern\":\"regex\",\"label\":\"Short Label\",\"group\":\"terminal|retriable\",\"context\":\"site|outreach\",\"example_errors\":[\"...\"],\"occurrence_count\":N}]}"
 706        ;;
 707      score_sites)
 708        task_prompt="You are a CRO specialist scoring local business websites for conversion effectiveness. For each site in BATCH DATA, analyze the provided HTML and score it using the CRO scoring system defined in the context above.
 709  
 710  CRITICAL RULES:
 711  - Evaluate each site independently using ONLY its provided HTML and http_headers
 712  - Always return COMPLETE factor_scores for every site — never omit any factor
 713  - Do NOT include conversion_score or letter_grade — these are computed programmatically
 714  - Output ONLY valid JSON, no markdown fences
 715  
 716  Output JSON: {\"batch_type\":\"score_sites\",\"results\":[{\"site_id\":N,\"factor_scores\":{\"headline_quality\":{\"score\":N,\"reasoning\":\"...\",\"evidence\":\"...\"},\"value_proposition\":{\"score\":N,\"reasoning\":\"...\",\"evidence\":\"...\"},\"unique_selling_proposition\":{\"score\":N,\"reasoning\":\"...\",\"evidence\":\"...\"},\"call_to_action\":{\"score\":N,\"reasoning\":\"...\",\"evidence\":\"...\"},\"urgency_messaging\":{\"score\":N,\"reasoning\":\"...\",\"evidence\":\"...\"},\"hook_engagement\":{\"score\":N,\"reasoning\":\"...\",\"evidence\":\"...\"},\"trust_signals\":{\"score\":N,\"reasoning\":\"...\",\"evidence\":\"...\"},\"imagery_design\":{\"score\":N,\"reasoning\":\"...\",\"evidence\":\"...\"},\"offer_clarity\":{\"score\":N,\"reasoning\":\"...\",\"evidence\":\"...\"},\"contextual_appropriateness\":{\"score\":N,\"reasoning\":\"...\",\"industry_context\":\"...\"}},\"overall_calculation\":{\"grade_interpretation\":\"...\",\"city\":\"...\",\"country_code\":\"AU\",\"state\":\"...\",\"country_detection_confidence\":\"high|medium|low\",\"country_detection_evidence\":[\"...\"],\"is_business_directory\":false,\"is_local_business\":true,\"is_law_firm\":false,\"industry_classification\":\"plumbing\",\"is_error_page\":false,\"error_type\":null,\"error_description\":null,\"is_broken_site\":false,\"broken_site_details\":[]},\"contact_details\":{\"city\":\"...\",\"country_code\":\"AU\",\"state\":\"...\",\"email_addresses\":[],\"phone_numbers\":[],\"social_profiles\":[],\"key_pages\":[]}}]}"
 717        ;;
 718      enrich_sites)
 719        task_prompt="You are a contact data extraction specialist. For each site in BATCH DATA, extract contact information from the provided HTML using the enrichment guidelines in the context above.
 720  
 721  CRITICAL RULES:
 722  - Extract ONLY information explicitly visible in the HTML — do NOT fabricate data
 723  - If current_contacts already has a field, only add NEW items not already present
 724  - Preserve phone numbers EXACTLY as displayed (do not normalize format)
 725  - Return an empty object {} for sites where no new contacts are found
 726  
 727  Output JSON: {\"batch_type\":\"enrich_sites\",\"results\":[{\"site_id\":N,\"domain\":\"example.com\",\"business_name\":\"...\",\"city\":\"...\",\"country_code\":\"AU\",\"state\":\"NSW\",\"email_addresses\":[{\"email\":\"info@example.com\",\"label\":\"General\"}],\"phone_numbers\":[{\"number\":\"+61 2 1234 5678\",\"label\":\"Office\"}],\"social_profiles\":[{\"url\":\"https://facebook.com/example\",\"label\":\"Facebook\"}],\"key_pages\":[\"https://example.com/contact\"],\"primary_contact_form\":null}]}"
 728        ;;
 729      score_semantic)
 730        task_prompt="You are scoring local business websites for conversion effectiveness. For each site, evaluate these 3 semantic factors from the extracted text:
 731  
 732  1. **headline_quality** (0-10): Is the H1/title compelling? Does it promise a benefit, address the visitor, create curiosity? Generic 'Welcome' or just a business name = 2-4. Specific benefit + audience = 7-9.
 733  2. **value_proposition** (0-10): Does the page clearly articulate WHAT they do, WHO they help, and WHY choose them? Look for quantified claims, benefit language, customer-focused framing (you/your > we/our). Vague 'quality service' = 3-4. Specific benefits with proof = 7-9.
 734  3. **unique_selling_proposition** (0-10): What differentiates them? Look for 'only', 'exclusive', years of experience, number of customers served, awards, patents, specific methodologies. Generic = 3-4. Clear differentiators = 7-9.
 735  
 736  Score conservatively. Most local business sites score 4-7. A 9-10 is exceptional (clear benefit headline + quantified proof + genuine differentiators). A 0-2 means almost no content or completely generic.
 737  
 738  Output JSON: {\"batch_type\":\"score_semantic\",\"results\":[{\"site_id\":N,\"headline_quality\":{\"score\":N,\"reasoning\":\"...\"},\"value_proposition\":{\"score\":N,\"reasoning\":\"...\"},\"unique_selling_proposition\":{\"score\":N,\"reasoning\":\"...\"}}]}"
 739        ;;
 740      proofread)
 741        task_prompt="Review each message in BATCH DATA against the proofreading rules in the context above. Apply every rule. Output a decision for every item — no omissions.
 742  
 743  Output JSON: {\"batch_type\":\"proofread\",\"results\":[{\"message_id\":N,\"decision\":\"approve\"},{\"message_id\":N,\"decision\":\"rework\",\"rework_instructions\":\"...\"},{\"message_id\":N,\"decision\":\"reject\",\"reject_reason\":\"...\"}]}"
 744        ;;
 745      code_review)
 746        task_prompt="You are a senior code reviewer (Code Reviewer agent). Review the source file in BATCH DATA for security vulnerabilities, bugs, performance issues, and maintainability problems. Follow the review criteria in the context above.
 747  
 748  For each finding include: severity (1-10), category (security/bug/performance/style), line number, description, and a concrete suggestion.
 749  
 750  Only report real issues with concrete fixes. Do not hallucinate problems.
 751  
 752  Output JSON: {\"batch_type\":\"code_review\",\"results\":[{\"file_path\":\"...\",\"findings\":[{\"severity\":N,\"category\":\"...\",\"line\":N,\"description\":\"...\",\"suggestion\":\"...\"}],\"summary\":\"...\"}]}"
 753        ;;
 754      monitor_health)
 755        task_prompt="You are an SRE monitoring a Node.js outreach pipeline. Analyze the system health data in BATCH DATA. Follow the monitoring criteria in the context above.
 756  
 757  Identify any critical issues, warnings, or degraded conditions. If the system is healthy, say so explicitly.
 758  
 759  Output JSON: {\"batch_type\":\"monitor_health\",\"results\":[{\"severity\":\"info|warn|critical\",\"summary\":\"...\",\"findings\":[{\"severity\":\"...\",\"component\":\"...\",\"issue\":\"...\",\"detail\":\"...\"}],\"recommended_actions\":[\"...\"]}]}"
 760        ;;
 761      triage_errors)
 762        task_prompt="You are an Incident Response Commander triaging pipeline errors. Classify each failed task in BATCH DATA. Follow the triage criteria in the context above.
 763  
 764  For each error: determine type, severity, and action. Only flag create_fix_task for critical/high severity issues with a clear code fix needed.
 765  
 766  Output JSON: {\"batch_type\":\"triage_errors\",\"results\":[{\"error_message\":\"...\",\"type\":\"pipeline_bug|infrastructure|external_api|data_quality|config|known_transient\",\"severity\":\"critical|high|medium|low\",\"action\":\"create_fix_task|monitor|ignore\",\"summary\":\"...\",\"suggested_fix\":\"...\"}]}"
 767        ;;
 768      check_docs)
 769        task_prompt="You are a Technical Writer checking documentation freshness. Review the recently changed source files in BATCH DATA. Follow the checking criteria in the context above.
 770  
 771  For each file, identify any stale documentation (comments, JSDoc, CLAUDE.md references, README mentions). Only report genuinely stale content — not missing docs for new code (that's a separate concern).
 772  
 773  Output JSON: {\"batch_type\":\"check_docs\",\"results\":[{\"file_path\":\"...\",\"stale_docs\":[{\"location\":\"...\",\"issue\":\"...\",\"suggested_update\":\"...\"}],\"summary\":\"...\"}]}"
 774        ;;
 775      evidence_merge)
 776        task_prompt="You are a CRO analyst merging evidence-based findings into HTML-scored website evaluations. Follow the merge criteria in the context above.
 777  
 778  For each site in BATCH DATA: compare score_json factors against evidence_pass1 and evidence_pass2 findings. Revise scores only where evidence is concrete and specific. Surface additional findings that HTML scoring could not catch (CSS-confirmed issues, stock photo filenames, wrong-state links, above-fold placement).
 779  
 780  Output JSON: {\"batch_type\":\"evidence_merge\",\"results\":[{\"site_id\":N,\"revised_scores\":{\"factor_name\":{\"original\":N,\"revised\":N,\"reason\":\"...\"}},\"additional_findings\":[{\"factor\":\"...\",\"finding\":\"...\",\"severity\":\"critical|major|minor\"}],\"evidence_summary\":\"...\"}]}"
 781        ;;
 782      followup_generate)
 783        task_prompt="You are writing follow-up outreach messages for a website conversion audit service. Each site in BATCH DATA has already been contacted once (or more) with no reply. Generate the NEXT follow-up touch using a DIFFERENT value angle than previous messages.
 784  
 785  FOLLOW-UP CADENCE (8 touches over 42 days):
 786  - Touch 2 (Day 3): Different weakness + social proof (\"other businesses in your area\")
 787  - Touch 3 (Day 7): ROI framing (\"this could be costing you \$X/month\")
 788  - Touch 4 (Day 14): Case study + sample report link (auditandfix.com)
 789  - Touch 5 (Day 21): Quick win — give one specific free fix to demonstrate value
 790  - Touch 6 (Day 28): Ad waste angle (\"paying for ads with a low-scoring site?\") / competitor gap
 791  - Touch 7 (Day 35): Authority — \"scored 43,000+ sites, pattern is always the same\"
 792  - Touch 8 (Day 42): Breakup email (\"closing the file, offer stands if you need it later\")
 793  
 794  RULES:
 795  - Each result must include ALL channels from the site's channels array (generate one message per channel per site)
 796  - Email: conversational, 3-5 short paragraphs, include subject line
 797  - SMS: under 160 chars, punchy, no subject line needed
 798  - NEVER repeat the same angle or wording as previous_messages
 799  - Use the site's weaknesses for personalisation but pick DIFFERENT weaknesses than touch 1
 800  - Touch 5: lead with a specific free fix — demonstrate competence, not just pitch
 801  - Touch 6: if site has is_running_ads=true, lean into ad waste angle; otherwise use competitor gap
 802  - Touch 8 (breakup): be respectful, leave the door open, mention auditandfix.com
 803  - Match the tone to the country (G'day for AU, casual for US/CA, formal for UK/EU)
 804  
 805  Output JSON: {\"batch_type\":\"followup_generate\",\"results\":[{\"site_id\":N,\"contact_method\":\"email|sms\",\"contact_uri\":\"...\",\"country_code\":\"XX\",\"sequence_step\":N,\"message_body\":\"...\",\"subject_line\":\"...(email only)\"}]}"
 806        ;;
 807    esac
 808  
 809    full_prompt="$task_prompt
 810  
 811  $context
 812  
 813  BATCH DATA:
 814  $batch_json
 815  
 816  IMPORTANT: Output ONLY valid JSON. No markdown, no explanation, no code fences."
 817  
 818    # Call Claude and pipe result to store
 819    # Write prompt to a temp file to avoid shell arg-length limits with large batches.
 820    # Unset CLAUDECODE so claude doesn't refuse to run if invoked inside another Claude Code session.
 821    _prompt_file=$(mktemp /tmp/orch-prompt-XXXXXX)
 822    printf '%s' "$full_prompt" > "$_prompt_file"
 823    _prompt_bytes=$(wc -c < "$_prompt_file" 2>/dev/null || echo 0)
 824    _prompt_kb=$(( _prompt_bytes / 1024 ))
 825    log "$batch_type: context=${_prompt_kb}kB (prompt+context+batch)"
 826    # Claude Code (Electron) requires XDG_RUNTIME_DIR to run. Systemd user services may
 827    # not inherit it — set a fallback so claude doesn't silently exit with empty output.
 828    _xdg="${XDG_RUNTIME_DIR:-/run/user/$(id -u)}"
 829    [ -d "$_xdg" ] || _xdg="/tmp/runtime-$(id -u)" && mkdir -p "$_xdg" 2>/dev/null || true
 830  
 831    # Write claude output DIRECTLY to a file — never capture in $() bash variable.
 832    # bash $() in non-UTF8 locales (NixOS systemd env) corrupts multi-byte characters
 833    # (Japanese, Chinese, emoji), making the extracted JSON invalid at random positions.
 834    # File I/O preserves all bytes exactly regardless of locale.
 835    _raw_file="$PROJECT_ROOT/logs/.orch-raw-$$.json"
 836    _claude_stderr=$(mktemp /tmp/orch-stderr-XXXXXX)
 837    _effort=$(effort_for_batch "$batch_type")
 838    XDG_RUNTIME_DIR="$_xdg" env -u CLAUDECODE "$CLAUDE_BIN" -p --model "$model" --effort "$_effort" --output-format json \
 839      < "$_prompt_file" > "$_raw_file" 2>"$_claude_stderr" || true
 840    rm -f "$_prompt_file"
 841    _stderr_content=$(cat "$_claude_stderr"); rm -f "$_claude_stderr"
 842  
 843    # Always check for usage limit signals (reads only ASCII metadata, file-safe)
 844    check_limit_signal "$(cat "$_raw_file" 2>/dev/null || true)" "$batch_type"
 845  
 846    # Debug: log raw file size and first 200 ASCII chars for diagnosis
 847    _raw_size=$(wc -c < "$_raw_file" 2>/dev/null || echo "missing")
 848    _raw_preview=$(cat "$_raw_file" 2>/dev/null | tr -cd '[:print:]' | head -c 200 || echo "")
 849    log "$batch_type: raw_file=${_raw_size}b preview=$(echo "$_raw_preview" | head -c 150)"
 850  
 851    if [ ! -s "$_raw_file" ]; then
 852      rm -f "$_raw_file"
 853      # Track consecutive empty results per batch type in the gate file.
 854      # Pattern: external tool failure looks identical to "no work" — consecutive
 855      # empties on a batch that had items is a strong signal something is broken.
 856      _empty_key="empty_streak_${batch_type}"
 857      _streak=$(node -e "try{const d=JSON.parse(require('fs').readFileSync('$GATE_FILE','utf8'));process.stdout.write(String(d['$_empty_key']||0));}catch{process.stdout.write('0');}" 2>/dev/null || echo "0")
 858      _streak=$(( _streak + 1 ))
 859      node -e "
 860        const fs=require('fs');
 861        let d={};try{d=JSON.parse(fs.readFileSync('$GATE_FILE','utf8'));}catch{}
 862        d['$_empty_key']=$_streak;
 863        fs.writeFileSync('$GATE_FILE',JSON.stringify(d));
 864      " 2>/dev/null || true
 865  
 866      _empty_msg="$batch_type: claude returned empty — skipping store${_stderr_content:+ (stderr: $(echo "$_stderr_content" | head -1))}"
 867      if [ "$_streak" -ge 5 ]; then
 868        log "ALERT: $_empty_msg [consecutive_empty=$_streak — likely tool failure, not empty queue]"
 869      elif [ "$_streak" -ge 3 ]; then
 870        log "WARN: $_empty_msg [consecutive_empty=$_streak]"
 871      else
 872        log "$_empty_msg"
 873      fi
 874      return 0
 875    fi
 876  
 877    # Success — reset the consecutive-empty streak for this batch type
 878    node -e "
 879      const fs=require('fs');
 880      let d={};try{d=JSON.parse(fs.readFileSync('$GATE_FILE','utf8'));}catch{}
 881      delete d['empty_streak_${batch_type}'];
 882      fs.writeFileSync('$GATE_FILE',JSON.stringify(d));
 883    " 2>/dev/null || true
 884  
 885    store_summary=$(node "$PROJECT_ROOT/scripts/claude-store-wrapper.js" "$_raw_file" "$batch_type" 2>&1 || echo '{"error":"store_node_failed"}')
 886  
 887    if [ -z "$store_summary" ] || echo "$store_summary" | grep -q '"error"'; then
 888      # On error: preserve raw file for inspection
 889      _saved_raw="$PROJECT_ROOT/logs/orch-raw-${batch_type}-saved.json"
 890      cp "$_raw_file" "$_saved_raw" 2>/dev/null || true
 891      log "$batch_type: saved raw to $_saved_raw for inspection"
 892    fi
 893    rm -f "$_raw_file"
 894  
 895    if [ -z "$store_summary" ]; then
 896      log "$batch_type: [WARN] store_summary empty — node produced no output. stderr=$(echo "$_stderr_content" | head -c 300)"
 897    else
 898      log "$batch_type: $store_summary"
 899      [ -n "$_stderr_content" ] && echo "$store_summary" | grep -q '"error"' && log "$batch_type: claude_stderr=$(echo "$_stderr_content" | head -c 200)"
 900    fi
 901    return 0
 902  }
 903  
 904  # Returns 0 if the batch type is skipped in conservation mode, 1 if it should run.
 905  # Time-critical and Haiku tasks always run. Opus-heavy generation deferred.
 906  should_skip_for_conservation() {
 907    batch_type="$1"
 908    [ "$CONSERVATION_MODE" = "false" ] && return 1  # Not in conservation mode — don't skip
 909  
 910    case "$batch_type" in
 911      # Always run — time-critical (inbound replies can't wait) or cheap (Haiku)
 912      reply_responses|classify_replies|extract_names|oversee|classify_errors)
 913        return 1 ;;
 914      # Defer — Opus-heavy or pipeline-upstream work that can wait
 915      proposals_email|proposals_sms|score_semantic|score_sites|enrich_sites|proofread|followup_generate)
 916        log "$batch_type: SKIP (conservation mode — Claude usage limit active since $LIMIT_HIT_SINCE)"
 917        return 0 ;;
 918    esac
 919    return 1
 920  }
 921  
 922  # Run a batch with all standard skip checks (rate_limits, stage_flag, conservation, backlog).
 923  # The skip functions themselves know which batch types they apply to — unknown types pass through.
 924  # Usage: run_checked <type> <model> <batch_size> [context_files...]
 925  run_checked() {
 926    _rc_type="$1"; _rc_model="$2"; _rc_size="$3"; shift 3
 927    [ -n "$SINGLE_TYPE" ] && [ "$SINGLE_TYPE" != "$_rc_type" ] && return 0
 928    should_skip_for_rate_limits "$_rc_type" && return 0
 929    should_skip_for_stage_flag "$_rc_type" && return 0
 930    should_skip_for_conservation "$_rc_type" && return 0
 931    should_skip_for_backlog "$_rc_type" && return 0
 932    run_batch "$_rc_type" "$_rc_model" "$_rc_size" "$@" && had_work=true
 933  }
 934  
 935  # Run a time-gated batch with all standard skip checks + is_due gate.
 936  # Usage: run_checked_gated <type> <model> <batch_size> <interval_minutes> [context_files...]
 937  run_checked_gated() {
 938    _rg_type="$1"; _rg_model="$2"; _rg_size="$3"; _rg_interval="$4"; shift 4
 939    [ -n "$SINGLE_TYPE" ] && [ "$SINGLE_TYPE" != "$_rg_type" ] && return 0
 940    should_skip_for_rate_limits "$_rg_type" && return 0
 941    should_skip_for_stage_flag "$_rg_type" && return 0
 942    should_skip_for_conservation "$_rg_type" && return 0
 943    should_skip_for_backlog "$_rg_type" && return 0
 944    if is_due "$_rg_type" "$_rg_interval"; then
 945      run_batch "$_rg_type" "$_rg_model" "$_rg_size" "$@" && had_work=true
 946      mark_ran "$_rg_type"
 947    else
 948      log "$_rg_type: not due yet (runs every ${_rg_interval}min)"
 949    fi
 950  }
 951  
# Check if any Tier C (interactive) agents are overdue.
# Writes logs/agents-due.json with overdue agents and their session start prompts.
# Called once per cycle so the daily report and AFK monitor can surface them.
# Reads:  MAJOR_REFACTOR (set by check_major_refactor), logs/code-review-queue.json.
# Relies on is_due/log defined earlier in this file; node used for all JSON
# manipulation so prompt strings are JSON-escaped safely (no sed/printf quoting).
check_tier_c_agents() {
  # Accumulated as a JSON array *string*; each overdue agent is appended by
  # piping the current array through a small node script.
  _due_agents="[]"

  # Security Engineer: quarterly (129600 min) + major refactor
  if is_due "tier_c_security_audit" 129600 || [ "${MAJOR_REFACTOR:-false}" = "true" ]; then
    _due_agents=$(echo "$_due_agents" | node -e "
      let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{
        const a=JSON.parse(d);
        a.push({agent:'security_audit',schedule:'quarterly',
          prompt:'Run a Security Engineer audit on 333Method and 2Step. Review: (1) API key handling in load-env.js and .env files, (2) Twilio webhook authentication in src/inbound/sms.js, (3) PayPal integration in src/payment/paypal.js, (4) unsigned unsubscribe tokens in 2Step src/outreach/email.js, (5) stealth-browser SSRF risk, (6) SQL injection review of all DB queries. Output findings as JSON with severity/file/line/description.'});
        process.stdout.write(JSON.stringify(a));
      });
    " 2>/dev/null || echo "$_due_agents")
    log "[AGENT DUE] security_audit — run Security Engineer (Opus, max, thinking on)"
  fi

  # Compliance Auditor: quarterly (129600 min) + major refactor
  if is_due "tier_c_compliance_audit" 129600 || [ "${MAJOR_REFACTOR:-false}" = "true" ]; then
    _due_agents=$(echo "$_due_agents" | node -e "
      let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{
        const a=JSON.parse(d);
        a.push({agent:'compliance_audit',schedule:'quarterly',
          prompt:'Run a Compliance Auditor review on 333Method outreach system. Check: (1) CAN-SPAM compliance in email templates, (2) TCPA compliance in SMS sending (business hours, opt-out), (3) GDPR country blocking logic in src/config/countries.js, (4) unsubscribe honoring in src/outreach/, (5) 72-hour cooldown enforcement. Also audit 2Step CAN-SPAM compliance in src/outreach/email.js.'});
        process.stdout.write(JSON.stringify(a));
      });
    " 2>/dev/null || echo "$_due_agents")
    log "[AGENT DUE] compliance_audit — run Compliance Auditor (Opus, high, thinking on)"
  fi

  # SEO Specialist: monthly (43200 min) for colorcraft-ai.com
  if is_due "tier_c_seo_audit" 43200; then
    _due_agents=$(echo "$_due_agents" | node -e "
      let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{
        const a=JSON.parse(d);
        a.push({agent:'seo_specialist',schedule:'monthly',
          prompt:'Run an SEO Specialist audit on colorcraft-ai.com. Fetch the site, analyze technical SEO (meta tags, page speed indicators, structured data, sitemap, robots.txt), keyword opportunities, content gaps, and link building potential. Output a prioritized report of changes for Base44 to implement.'});
        process.stdout.write(JSON.stringify(a));
      });
    " 2>/dev/null || echo "$_due_agents")
    log "[AGENT DUE] seo_specialist — run SEO Specialist (Sonnet, medium)"
  fi

  # Automation Governance: quarterly (129600 min) + major refactor
  if is_due "tier_c_automation_audit" 129600 || [ "${MAJOR_REFACTOR:-false}" = "true" ]; then
    _due_agents=$(echo "$_due_agents" | node -e "
      let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{
        const a=JSON.parse(d);
        a.push({agent:'automation_audit',schedule:'quarterly',
          prompt:'Run an Automation Governance audit. Read the cron_jobs table (sqlite3 db/sites.db \"SELECT * FROM cron_jobs\"), review all cron jobs in src/cron/ and the orchestrator in scripts/claude-orchestrator.sh. Identify: redundant jobs, poor sequencing, missing error handling, jobs that should be consolidated. Output a ranked list of changes.'});
        process.stdout.write(JSON.stringify(a));
      });
    " 2>/dev/null || echo "$_due_agents")
    log "[AGENT DUE] automation_audit — run Automation Governance Architect (Sonnet, medium)"
  fi

  # Code review completion check — surface Phase 5 reminder when queue is done.
  # "Done" = next_index has advanced past the end of the files array.
  if [ -f "logs/code-review-queue.json" ]; then
    _cr_done=$(node -e "
      try {
        const q = JSON.parse(require('fs').readFileSync('logs/code-review-queue.json','utf8'));
        process.stdout.write(q.next_index >= q.files.length ? 'yes' : 'no');
      } catch { process.stdout.write('no'); }
    " 2>/dev/null || echo "no")
    # The 43200-min (30-day) is_due gate keeps this reminder from firing every cycle.
    if [ "$_cr_done" = "yes" ] && is_due "tier_c_phase5_reminder" 43200; then
      _due_agents=$(echo "$_due_agents" | node -e "
        let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{
          const a=JSON.parse(d);
          a.push({agent:'phase5_sessions',schedule:'one-shot',
            prompt:'Code review of all 150 files is complete. Time to run Phase 5 Tier C sessions:\n1. Security audit: ./scripts/run-agent-session.sh security_audit\n2. 2Step CI/CD: ./scripts/run-agent-session.sh automation_audit\n3. colorcraft-ai.com report: ./scripts/run-agent-session.sh colorcraft_report\n4. Update distributed-agent-system plan (manual task in docs/plans/)'});
          process.stdout.write(JSON.stringify(a));
        });
      " 2>/dev/null || echo "$_due_agents")
      log "[REMINDER] Code review complete — Phase 5 Tier C sessions ready to run"
      log "  Run: ./scripts/run-agent-session.sh security_audit"
      log "  Run: ./scripts/run-agent-session.sh colorcraft_report"
      log "  Run: ./scripts/run-agent-session.sh automation_audit"
      log "  Update: docs/plans/distributed-agent-system.md"
    fi
  fi

  # Write results to logs/agents-due.json (best-effort; consumers tolerate a stale file)
  printf '%s\n' "$_due_agents" > logs/agents-due.json 2>/dev/null || true
}
1038  
# Detect major refactor: check recent commits for patterns that trigger agent reviews.
# Sets MAJOR_REFACTOR=true if either criterion is met:
#   1. 10+ files changed over the last 10 commits, or
#   2. any change touches a security-sensitive path (outreach, payment, migrations, …).
# Always returns 0 so a no-match grep never trips `set -e` in the caller.
check_major_refactor() {
  MAJOR_REFACTOR=false
  # NOTE: the `|| echo ""` guards the pipeline as a whole — `head` exits 0 even
  # when git fails (shallow clone, <10 commits, not a repo), so _changed simply
  # ends up empty in those cases and both criteria below fall through to false.
  _changed=$(git diff --name-only HEAD~10 2>/dev/null | head -200 || echo "")
  # printf, not echo: file names are arbitrary data (echo's backslash/option
  # handling is non-portable under /bin/sh). grep -c '.' counts non-empty lines;
  # it exits 1 on zero matches, hence the `|| _count=0` fallback.
  _count=$(printf '%s\n' "$_changed" | grep -c '.' 2>/dev/null) || _count=0

  # Criterion 1: 10+ files changed
  [ "$_count" -ge 10 ] && MAJOR_REFACTOR=true && return 0

  # Criterion 2: security-sensitive files touched
  printf '%s\n' "$_changed" | grep -qE '^src/stages/|^scripts/claude-orchestrator\.sh$|^db/migrations/|^src/outreach/|^src/payment/|^src/inbound/|^load-env\.js$|^src/utils/load-env\.js$|^src/utils/stealth-browser' && MAJOR_REFACTOR=true && return 0

  return 0
}
1054  
# Define batch order — priority: outreach-adjacent first, upstream last
#
# Runs one full orchestration cycle: refresh per-cycle state (claude binary,
# canary, refactor detection, Tier C schedule, SKIP_STAGES, usage limits,
# backlog counts), scale batch sizes from backlog depth, then dispatch each
# batch type via run_checked / run_checked_gated (helpers defined above,
# outside this section).
#
# Globals read:  SINGLE_TYPE, CONSERVATION_MODE, CONSERVATION_REASON,
#                BACKLOG_* counts, *_BATCH sizes, CANARY_SITE_COUNT
# Globals set:   MAJOR_REFACTOR, PROOFREAD_BATCH, REWORD_*_BATCH, had_work
# Returns:       0 if at least one batch did work this cycle, non-zero
#                otherwise — the trailing `[ "$had_work" = true ]` is the
#                function's exit status, which drives the --loop exit logic.
process_all_batches() {
  # Presumably flipped to true by run_checked/run_checked_gated when a batch
  # actually runs (helper defined outside this view — confirm).
  had_work=false

  # Re-resolve CLAUDE_BIN each cycle — auto-updates delete old versions
  resolve_claude_bin

  # Run canary once per cycle (verifies node, claude-batch.js, CLAUDE_BIN are usable)
  run_canary

  # Detect major refactor (sets MAJOR_REFACTOR env var for this cycle)
  check_major_refactor
  [ "$MAJOR_REFACTOR" = "true" ] && log "MAJOR_REFACTOR detected — Tier C agents flagged as due"

  # Check Tier C agent schedule — writes logs/agents-due.json
  check_tier_c_agents

  # Reload SKIP_STAGES from .env each cycle (respects live changes without restart)
  load_skip_stages
  [ -n "$SKIP_STAGES" ] && log "SKIP_STAGES=$SKIP_STAGES"

  # Check Claude Max usage limits proactively (live API fetch, ~1s)
  check_usage_proactive

  # Refresh pipeline backlog counts for intelligent skipping
  refresh_backlog

  # Scale batch sizes up when backlogs are large so we clear them faster.
  # Caps are conservative — LLM context window is the real ceiling.
  # ${VAR:-0} defaults keep the numeric tests safe when refresh_backlog
  # left a count unset.
  if [ "${BACKLOG_PENDING_APPROVAL:-0}" -gt 5000 ]; then
    PROOFREAD_BATCH=200
  elif [ "${BACKLOG_PENDING_APPROVAL:-0}" -gt 1000 ]; then
    PROOFREAD_BATCH=100
  fi

  if [ "${BACKLOG_PROPOSALS:-0}" -gt 5000 ]; then
    # Cap at 20 SMS / 10 email — 50 SMS exceeded 32k output token limit (29min run, hard error)
    REWORD_EMAIL_BATCH=10; REWORD_SMS_BATCH=20; REWORD_FORM_BATCH=30
    REWORD_LINKEDIN_BATCH=30; REWORD_X_BATCH=30
  elif [ "${BACKLOG_PROPOSALS:-0}" -gt 1000 ]; then
    REWORD_EMAIL_BATCH=10; REWORD_SMS_BATCH=30; REWORD_FORM_BATCH=30
  fi

  # One-line cycle summary for the log — all values come from refresh_backlog
  # and the canary above.
  conservation_note=""
  [ "$CONSERVATION_MODE" = "true" ] && conservation_note=" [CONSERVATION MODE: $CONSERVATION_REASON]"
  log "Backlog: total_sites=${CANARY_SITE_COUNT:-?} eligible_outreach=$BACKLOG_ELIGIBLE_OUTREACH actionable_proposals=$BACKLOG_PROPOSALS actionable_enriched=$BACKLOG_ENRICHED approved_unsent=$BACKLOG_APPROVED_UNSENT pending=$BACKLOG_PENDING_APPROVAL active_countries=$BACKLOG_ACTIVE_COUNTRIES daily_send_avg=${BACKLOG_DAILY_SEND_AVG:-0}${conservation_note}"
  log "Effective batch sizes: proofread=$PROOFREAD_BATCH score_semantic=$SCORE_SEMANTIC_BATCH enrich=$ENRICH_SITES_BATCH"

  # --- Pipeline batches: outreach-adjacent first, upstream last ---
  run_checked proposals_email  opus   "$PROPOSALS_EMAIL_BATCH"  prompts/PROPOSAL.md docs/05-outreach/email-best-practices.md
  run_checked proposals_sms    opus   "$PROPOSALS_SMS_BATCH"    prompts/PROPOSAL.md docs/05-outreach/sms-best-practices.md
  # Semantic scoring: Sonnet evaluates headline, value prop, USP. Accuracy > cost.
  run_checked score_semantic   sonnet "$SCORE_SEMANTIC_BATCH"

  # score_sites: HTML-only LLM scoring (requires ENABLE_LLM_SCORING=true + ENABLE_VISION=false)
  if [ -z "$SINGLE_TYPE" ] || [ "$SINGLE_TYPE" = "score_sites" ]; then
    # NOTE(review): `!== 'false'` means "enabled unless explicitly false" — an
    # UNSET env var counts as enabled, despite the "requires ...=true" wording
    # above. On node failure, scoring falls back to "0" (disabled) while vision
    # falls back to "1" (assumed on) — both fail-safe toward skipping score_sites.
    _llm_scoring=$(node -e "process.stdout.write(process.env.ENABLE_LLM_SCORING !== 'false' ? '1' : '0')" 2>/dev/null || echo "0")
    _enable_vision=$(node -e "process.stdout.write(process.env.ENABLE_VISION !== 'false' ? '1' : '0')" 2>/dev/null || echo "1")
    if [ "$_llm_scoring" = "1" ] && [ "$_enable_vision" = "0" ]; then
      run_checked score_sites sonnet "$SCORE_SITES_BATCH" prompts/CONVERSION-SCORING-NOVIS.md
    else
      # Log the skip only in single-type mode (where the user explicitly asked for it).
      [ -z "$SINGLE_TYPE" ] || log "score_sites: SKIP (requires ENABLE_LLM_SCORING=true + ENABLE_VISION=false)"
    fi
  fi

  # enrich_sites: LLM contact enrichment (requires ENABLE_ENRICHMENT_LLM=true)
  if [ -z "$SINGLE_TYPE" ] || [ "$SINGLE_TYPE" = "enrich_sites" ]; then
    # Same `!== 'false'` default-on semantics as above; node failure → disabled.
    _enrich_llm=$(node -e "process.stdout.write(process.env.ENABLE_ENRICHMENT_LLM !== 'false' ? '1' : '0')" 2>/dev/null || echo "0")
    if [ "$_enrich_llm" = "1" ]; then
      run_checked enrich_sites haiku "$ENRICH_SITES_BATCH" prompts/ENRICHMENT.md
    else
      [ -z "$SINGLE_TYPE" ] || log "enrich_sites: SKIP (ENABLE_ENRICHMENT_LLM=false)"
    fi
  fi

  # gather_evidence: moved to standalone cron job (5-min interval, migration 112).
  # No longer runs inline — the cron system handles execution independently.
  # The pending count is still logged here for orchestrator visibility.
  # Also reports sites awaiting merge (pass1+pass2 done, evidence_json still NULL).
  if [ -z "$SINGLE_TYPE" ] || [ "$SINGLE_TYPE" = "gather_evidence" ]; then
    # Read-only DB probe; any failure (missing DB, missing module) degrades to
    # zero counts via the fallback JSON.
    _pending_evidence=$(node -e "
      const Database = require('./node_modules/better-sqlite3');
      const db = new Database(process.env.DATABASE_PATH || './db/sites.db', { readonly: true });
      const r = db.prepare(\"SELECT COUNT(*) as c FROM sites WHERE status IN ('enriched','enriched_llm') AND score IS NOT NULL AND score < 82 AND (evidence_pass1_json IS NULL OR evidence_pass2_json IS NULL)\").get();
      const m = db.prepare(\"SELECT COUNT(*) as c FROM sites WHERE evidence_pass1_json IS NOT NULL AND evidence_pass2_json IS NOT NULL AND evidence_json IS NULL\").get();
      process.stdout.write(JSON.stringify({ pending: r.c || 0, awaiting_merge: m.c || 0 }));
      db.close();
    " 2>/dev/null || echo '{"pending":0,"awaiting_merge":0}')
    # Extract each field with a tiny JSON parse; malformed input → "0".
    _ev_pending=$(echo "$_pending_evidence" | node -e "let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{try{process.stdout.write(String(JSON.parse(d).pending||0));}catch{process.stdout.write('0');}})" 2>/dev/null || echo "0")
    _ev_merge=$(echo "$_pending_evidence" | node -e "let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{try{process.stdout.write(String(JSON.parse(d).awaiting_merge||0));}catch{process.stdout.write('0');}})" 2>/dev/null || echo "0")
    # The `2>/dev/null` on the tests silences "integer expression expected"
    # if a value is somehow non-numeric, making the comparison simply false.
    if [ "$_ev_pending" -gt 0 ] 2>/dev/null || [ "$_ev_merge" -gt 0 ] 2>/dev/null; then
      log "gather_evidence: ${_ev_pending} sites pending collection, ${_ev_merge} awaiting merge (proposals gate requires evidence_json)"
      # Detect evidence-blocked state: outreach pipeline is empty and evidence is the critical path.
      # Log clearly so the overseer and monitoring can see this is expected, not a crash.
      if [ "${BACKLOG_ELIGIBLE_OUTREACH:-0}" -eq 0 ] && [ "${BACKLOG_PROPOSALS:-0}" -eq 0 ] 2>/dev/null; then
        log "EVIDENCE-BLOCKED: eligible_outreach=0 proposals=0 — evidence collection is the critical path (pending=${_ev_pending} awaiting_merge=${_ev_merge})"
      fi
    else
      log "gather_evidence: no sites pending"
    fi
  fi

  run_checked proofread         opus   "$PROOFREAD_BATCH"   prompts/PROOFREAD.md
  run_checked followup_generate sonnet 20                    prompts/FOLLOWUP.md
  run_checked classify_replies  haiku  "$CLASSIFY_BATCH"    prompts/CLASSIFY-REPLIES.md
  run_checked reply_responses   opus   "$REPLIES_BATCH"     prompts/REPLIES.md

  # --- Time-gated batches (last numeric arg is presumably the gate interval
  # in minutes — run_checked_gated is defined outside this view; confirm) ---
  run_checked_gated extract_names    haiku  "$NAMES_BATCH"  15
  run_checked_gated oversee          sonnet 1               30
  run_checked_gated classify_errors  haiku  1               240
  run_checked_gated monitor_health   haiku  1               60   prompts/agents/MONITOR-HEALTH.md
  run_checked_gated triage_errors    haiku  1               120  prompts/agents/TRIAGE-ERRORS.md
  run_checked_gated check_docs       haiku  1               240  prompts/agents/CHECK-DOCS.md
  # evidence_merge: bypass time gate when sites are awaiting merge — run every cycle until clear.
  # Falls back to 15min gate when backlog is empty (avoids unnecessary idle runs).
  if [ "${_ev_merge:-0}" -gt 0 ] 2>/dev/null; then
    run_checked evidence_merge haiku 1 prompts/EVIDENCE-MERGE.md
  else
    run_checked_gated evidence_merge haiku 1 15 prompts/EVIDENCE-MERGE.md
  fi

  # --- Code review (1 file per cycle, security-first order) ---
  # Gate: pause if pending code_review_fix tasks > 30 (backpressure — don't flood the fix queue)
  if [ -z "$SINGLE_TYPE" ] || [ "$SINGLE_TYPE" = "code_review" ]; then
    # Count open fix tasks; any DB error degrades to "0" (gate stays open).
    _pending_fixes=$(node -e "
      const Database = require('./node_modules/better-sqlite3');
      const db = new Database(process.env.DATABASE_PATH || './db/sites.db', { readonly: true });
      const r = db.prepare(\"SELECT COUNT(*) as c FROM agent_tasks WHERE task_type='code_review_fix' AND status IN ('pending','running')\").get();
      process.stdout.write(String(r.c || 0));
      db.close();
    " 2>/dev/null || echo "0")
    if [ "${_pending_fixes:-0}" -gt 30 ]; then
      log "code_review: SKIP — pending_fixes=${_pending_fixes} > 30 (fix backlog too large)"
    else
      run_checked code_review sonnet 1 prompts/agents/CODE-REVIEW.md
    fi
  fi

  # Function's exit status: success only if some batch reported work.
  [ "$had_work" = true ]
}
1196  
# Main entry point — one-shot by default, continuous when --loop was given.
log "Starting orchestrator (loop=$LOOP, type=${SINGLE_TYPE:-all})"

if [ "$LOOP" = true ]; then
  while :; do
    if process_all_batches; then
      # A batch ran this cycle. In conservation mode, pause between cycles
      # so we don't hammer the quota check.
      if [ "$CONSERVATION_MODE" = "true" ]; then
        log "Conservation mode: pausing 5min before next cycle..."
        sleep 300
      fi
      log "Batch complete, starting next cycle..."
    else
      # Nothing to do. Conservation mode keeps the loop alive — time-critical
      # work (replies, classify) may still arrive while deferred tasks wait.
      if [ "$CONSERVATION_MODE" = "true" ]; then
        log "Queues empty in conservation mode — waiting 5min before retry..."
        sleep 300
        continue
      fi
      log "All queues empty — exiting loop"
      break
    fi
  done
else
  process_all_batches || log "No work to process"
fi

log "Orchestrator finished"