claude-orchestrator.sh
1 #!/bin/sh 2 # Claude Orchestrator — Process batches using Claude Code (claude -p) 3 # 4 # Usage: 5 # scripts/claude-orchestrator.sh # One batch per type, then exit 6 # scripts/claude-orchestrator.sh --loop # Repeat until all queues empty 7 # scripts/claude-orchestrator.sh --type proposals_email # Single type only 8 # 9 # Requires: claude CLI (Claude Max subscription) 10 # Each batch runs as a separate `claude -p --model <model>` invocation: 11 # - Fresh context per batch (no RAM accumulation) 12 # - Loads only relevant prompt/context docs 13 # - Output piped to claude-store.js for DB persistence 14 15 set -e 16 17 SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" 18 PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" 19 cd "$PROJECT_ROOT" 20 21 # Ensure node and claude are in PATH — NixOS systemd services may not inherit them. 22 # Must match the same Node.js version that compiled node_modules (nodejs_22). 23 if ! command -v node >/dev/null 2>&1; then 24 for d in /nix/store/*-nodejs-22.*/bin; do 25 if [ -x "$d/node" ]; then 26 export PATH="$d:$PATH" 27 break 28 fi 29 done 30 fi 31 # Resolve claude CLI — prefer the latest installed version directly to avoid 32 # stale symlinks after updates (e.g. ~/.local/bin/claude -> deleted version). 33 # Extracted to a function so --loop mode can re-resolve each cycle when 34 # Claude auto-updates delete the previously resolved version. 35 resolve_claude_bin() { 36 _claude_bin="" 37 if [ -d "$HOME/.local/share/claude/versions" ]; then 38 # Pick the latest *executable binary* — skip wrapper scripts (<1MB) and broken files 39 for _v in $(ls -1 "$HOME/.local/share/claude/versions" 2>/dev/null | sort -V -r); do 40 _candidate="$HOME/.local/share/claude/versions/$_v" 41 _sz=$(stat -c%s "$_candidate" 2>/dev/null || echo 0) 42 if [ -x "$_candidate" ] && [ "$_sz" -gt 1000000 ]; then 43 _claude_bin="$_candidate" 44 break 45 fi 46 done 47 fi 48 if [ -z "$_claude_bin" ] || [ ! -x "$_claude_bin" ]; then 49 # Fallback to whatever is on PATH (symlink or system install) 50 _claude_bin=$(command -v claude 2>/dev/null || echo "") 51 fi 52 alias claude="$_claude_bin" 53 export CLAUDE_BIN="$_claude_bin" 54 } 55 resolve_claude_bin 56 57 # Self-managed log file when running non-interactively (e.g. systemd) 58 if [ ! -t 1 ]; then 59 mkdir -p "$PROJECT_ROOT/logs" 60 LOG_FILE="$PROJECT_ROOT/logs/orchestrator-$(date +%Y-%m-%d).log" 61 exec >> "$LOG_FILE" 2>&1 62 fi 63 64 # Log CLAUDE_BIN resolution after log redirect so it appears in the log file 65 echo "[$(date '+%Y-%m-%d %H:%M:%S')] [orchestrator] CLAUDE_BIN=$CLAUDE_BIN (executable: $([ -x "$CLAUDE_BIN" ] && echo yes || echo no))" 66 67 LOOP=false 68 SINGLE_TYPE="" 69 70 # ── Usage limit state ────────────────────────────────────────────────────── 71 # Claude Max has two windows: 5-hour rolling window and a weekly cap. 72 # Utilization percentages are read from ~/.claude/usage-cache.json (written 73 # by the Claude Code TUI after polling api.anthropic.com/api/oauth/usage). 74 # 75 # Strategy: 76 # 1. PROACTIVE: Before each batch cycle, read usage-cache.json. If 5h window 77 # is ≥ WARNING_PCT or weekly is ≥ WEEKLY_WARNING_PCT, enter CONSERVATION 78 # MODE before hitting the hard limit. This preserves tokens for critical work. 79 # 2. REACTIVE: If claude -p returns is_error:true or a limit keyword, enter 80 # conservation mode immediately. 81 # 82 # Conservation mode defers Opus-heavy rewording/proposals; keeps: 83 # - reply_responses (Opus) — time-critical inbound replies can't wait 84 # - classify_replies / extract_names (Haiku) — cheap; extract_names gated to 15min intervals 85 # - oversee (Sonnet) — system health, always run 86 # - classify_errors (Haiku) — cheap, always run 87 # 88 # Once the window resets (resets_at timestamp passes), exits conservation mode. 89 CONSECUTIVE_LIMIT_HITS=0 90 LIMIT_HIT_SINCE="" 91 CONSERVATION_MODE=false 92 CONSERVATION_REASON="" 93 WARNING_PCT=85 # Enter conservation when 5h window ≥ 85% used 94 WEEKLY_WARNING_PCT=90 # Enter conservation when weekly window ≥ 90% used 95 USAGE_CACHE="${HOME}/.claude/usage-cache.json" 96 97 while [ $# -gt 0 ]; do 98 case "$1" in 99 --loop) LOOP=true; shift ;; 100 --type) SINGLE_TYPE="$2"; shift 2 ;; 101 *) echo "Unknown arg: $1"; exit 1 ;; 102 esac 103 done 104 105 # Batch sizes per type 106 PROPOSALS_EMAIL_BATCH=5 107 PROPOSALS_SMS_BATCH=10 108 CLASSIFY_BATCH=50 109 NAMES_BATCH=50 110 REPLIES_BATCH=10 111 PROOFREAD_BATCH=50 112 SCORE_SEMANTIC_BATCH=50 113 SCORE_SITES_BATCH=10 114 ENRICH_SITES_BATCH=15 115 116 # Frequency-gated batch state file (tracks last-run timestamps for periodic batch types) 117 GATE_FILE="$PROJECT_ROOT/logs/orchestrator-gates.json" 118 119 # ── Canary check (runs once per cycle) ─────────────────────────────────────── 120 # Verify external tools can actually execute before attempting any batches. 121 # Catches: missing binary, wrong architecture (glibc on NixOS), broken PATH, 122 # missing node_modules, etc. Fails loudly so monitoring picks it up immediately 123 # rather than silently returning empty results for every batch. 124 # Called from process_all_batches() so it runs once per cycle in --loop mode too. 125 run_canary() { 126 _canary_ok=true 127 128 # 1. node must be able to run a basic script 129 if ! node -e "process.exit(0)" 2>/dev/null; then 130 log "FATAL: node is not executable — $(node --version 2>&1 | head -1). All batches will fail." 131 _canary_ok=false 132 fi 133 134 # 2. claude-batch.js must be loadable (catches missing node_modules, syntax errors) 135 if $_canary_ok && ! node scripts/claude-batch.js --canary 2>/dev/null | grep -q "ok"; then 136 # Tolerate if --canary flag isn't implemented — just check node can load it without crashing 137 node scripts/claude-batch.js --canary 2>/tmp/orch-canary-err.txt || true 138 _canary_err=$(cat /tmp/orch-canary-err.txt 2>/dev/null | head -1) 139 if echo "$_canary_err" | grep -qiE "cannot find module|SyntaxError|ERR_"; then 140 log "FATAL: claude-batch.js failed to load: $_canary_err. All batches will fail." 141 _canary_ok=false 142 fi 143 fi 144 145 # 3. CLAUDE_BIN must be executable (catches glibc/nix-ld issues, broken symlinks, wrapper loops) 146 if $_canary_ok && [ -n "$CLAUDE_BIN" ]; then 147 _canary_out=$(timeout 10s env -u CLAUDECODE "$CLAUDE_BIN" --version 2>/tmp/orch-canary-err.txt || true) 148 _canary_err=$(cat /tmp/orch-canary-err.txt 2>/dev/null | head -1) 149 if [ -z "$_canary_out" ]; then 150 # Re-resolve — Claude may have auto-updated, deleting the old version 151 log "WARN: CLAUDE_BIN=$CLAUDE_BIN failed (${_canary_err:-no output}). Re-resolving..." 152 resolve_claude_bin 153 if [ -n "$CLAUDE_BIN" ]; then 154 _canary_out=$(timeout 10s env -u CLAUDECODE "$CLAUDE_BIN" --version 2>/tmp/orch-canary-err.txt || true) 155 _canary_err=$(cat /tmp/orch-canary-err.txt 2>/dev/null | head -1) 156 fi 157 if [ -z "$_canary_out" ]; then 158 log "FATAL: CLAUDE_BIN=$CLAUDE_BIN still failed after re-resolve: ${_canary_err:-no output}. All LLM batches will return empty." 159 _canary_ok=false 160 else 161 log "Canary OK after re-resolve: node=$(node --version 2>/dev/null), claude=$_canary_out" 162 fi 163 else 164 log "Canary OK: node=$(node --version 2>/dev/null), claude=$_canary_out" 165 fi 166 elif $_canary_ok; then 167 # CLAUDE_BIN empty — try resolving (may have been installed since startup) 168 resolve_claude_bin 169 if [ -n "$CLAUDE_BIN" ]; then 170 log "Resolved CLAUDE_BIN=$CLAUDE_BIN after initially empty" 171 else 172 log "WARN: CLAUDE_BIN is empty — LLM batches will fail. Check PATH and ~/.local/share/claude/versions/." 173 _canary_ok=false 174 fi 175 fi 176 rm -f /tmp/orch-canary-err.txt 177 } 178 179 # Effort level per batch type — matches Agency Agents plan. 180 # Controls how much reasoning the model uses. Saves tokens on simple tasks. 181 # Levels: low (classification/extraction), medium (analysis), high (reasoning/creative), max (extended thinking) 182 effort_for_batch() { 183 case "$1" in 184 proposals_email) echo "high" ;; 185 proposals_sms) echo "medium" ;; 186 classify_replies) echo "medium" ;; 187 extract_names) echo "low" ;; 188 reply_responses) echo "high" ;; 189 proofread) echo "medium" ;; 190 score_sites) echo "medium" ;; 191 score_semantic) echo "medium" ;; 192 enrich_sites) echo "low" ;; 193 oversee) echo "medium" ;; 194 classify_errors) echo "low" ;; 195 code_review) echo "medium" ;; 196 monitor_health) echo "low" ;; 197 triage_errors) echo "medium" ;; 198 check_docs) echo "low" ;; 199 evidence_merge) echo "low" ;; 200 followup_generate) echo "medium" ;; 201 *) echo "medium" ;; 202 esac 203 } 204 205 # Check if a frequency-gated batch type is due to run. 206 # Args: gate_type interval_minutes 207 # Returns 0 (run it) or 1 (skip — not due yet) 208 is_due() { 209 gate_type="$1" 210 interval_mins="$2" 211 if [ ! -f "$GATE_FILE" ]; then return 0; fi 212 elapsed=$(node -e " 213 try { 214 const d = JSON.parse(require('fs').readFileSync('$GATE_FILE','utf8')); 215 const ts = d['$gate_type']; 216 process.stdout.write(ts ? String(Date.now() - new Date(ts).getTime()) : '0'); 217 } catch { process.stdout.write('0'); } 218 " 2>/dev/null) 219 [ -z "$elapsed" ] && elapsed=0 220 threshold=$((interval_mins * 60 * 1000)) 221 [ "$elapsed" -ge "$threshold" ] 2>/dev/null || [ "$elapsed" = "0" ] 222 } 223 224 # Record that a gated batch type just ran 225 mark_ran() { 226 gate_type="$1" 227 node -e " 228 const fs = require('fs'); 229 let d = {}; 230 try { d = JSON.parse(fs.readFileSync('$GATE_FILE','utf8')); } catch {} 231 d['$gate_type'] = new Date().toISOString(); 232 fs.writeFileSync('$GATE_FILE', JSON.stringify(d, null, 2)); 233 " 2>/dev/null 234 } 235 236 log() { 237 echo "[$(date '+%Y-%m-%d %H:%M:%S')] [orchestrator] $*" 238 } 239 240 # Proactive: fetch live usage from Anthropic API, fall back to cache. 241 # Reads OAuth token from ~/.claude/.credentials.json (Linux credential store). 242 # Called once per batch cycle (before processing batches). 243 check_usage_proactive() { 244 CREDS_FILE="${HOME}/.claude/.credentials.json" 245 246 # Try live fetch first (updates cache for TUI too) 247 usage=$(node -e " 248 const fs = require('fs'); 249 const https = require('https'); 250 let token = ''; 251 try { 252 const creds = JSON.parse(fs.readFileSync('$CREDS_FILE','utf8')); 253 token = creds.claudeAiOauth?.accessToken || ''; 254 } catch {} 255 256 if (!token) { 257 // Fall back to cache 258 try { 259 const d = JSON.parse(fs.readFileSync('$USAGE_CACHE','utf8')); 260 process.stdout.write(JSON.stringify({...d, source:'cache_no_token'})); 261 } catch { process.stdout.write('{\"five_hour\":0,\"seven_day\":0,\"source\":\"none\"}'); } 262 return; 263 } 264 265 const req = https.request({ 266 hostname: 'api.anthropic.com', 267 path: '/api/oauth/usage', 268 method: 'GET', 269 headers: { 270 'Authorization': 'Bearer ' + token, 271 'anthropic-beta': 'oauth-2025-04-20', 272 'User-Agent': 'claude-orchestrator/1.0' 273 }, 274 timeout: 8000 275 }, (res) => { 276 let body = ''; 277 res.on('data', c => body += c); 278 res.on('end', () => { 279 try { 280 const raw = JSON.parse(body); 281 const result = { 282 five_hour: Math.round(raw.five_hour?.utilization || 0), 283 seven_day: Math.round(raw.seven_day?.utilization || 0), 284 five_hour_resets_at: raw.five_hour?.resets_at || '', 285 seven_day_resets_at: raw.seven_day?.resets_at || '', 286 source: 'live' 287 }; 288 // Update cache file for TUI 289 try { fs.writeFileSync('$USAGE_CACHE', JSON.stringify({...result, stale: false})); } catch {} 290 process.stdout.write(JSON.stringify(result)); 291 } catch { process.stdout.write('{\"five_hour\":0,\"seven_day\":0,\"source\":\"parse_error\"}'); } 292 }); 293 }); 294 req.on('error', () => { 295 // Fall back to cache on network error 296 try { 297 const d = JSON.parse(fs.readFileSync('$USAGE_CACHE','utf8')); 298 process.stdout.write(JSON.stringify({...d, source:'cache_fallback'})); 299 } catch { process.stdout.write('{\"five_hour\":0,\"seven_day\":0,\"source\":\"error\"}'); } 300 }); 301 req.on('timeout', () => { req.destroy(); }); 302 req.end(); 303 " 2>/dev/null || node -e "try{const d=JSON.parse(require('fs').readFileSync('$USAGE_CACHE','utf8'));d.source='node_error_cache';process.stdout.write(JSON.stringify(d));}catch{process.stdout.write('{\"five_hour\":0,\"seven_day\":0,\"source\":\"node_error\"}');}" 2>/dev/null || echo '{"five_hour":0,"seven_day":0,"source":"node_error"}') 304 305 five_hour=$(echo "$usage" | node -e "let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{try{process.stdout.write(String(Math.round(JSON.parse(d).five_hour||0)));}catch{process.stdout.write('0');}})" 2>/dev/null || echo "0") 306 seven_day=$(echo "$usage" | node -e "let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{try{process.stdout.write(String(Math.round(JSON.parse(d).seven_day||0)));}catch{process.stdout.write('0');}})" 2>/dev/null || echo "0") 307 five_resets=$(echo "$usage" | node -e "let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{try{process.stdout.write(JSON.parse(d).five_hour_resets_at||'');}catch{process.stdout.write('');}})" 2>/dev/null || echo "") 308 309 source=$(echo "$usage" | node -e "let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{try{process.stdout.write(JSON.parse(d).source||'?');}catch{process.stdout.write('?');}})" 2>/dev/null || echo "?") 310 log "Usage [${source}]: 5h=${five_hour}% weekly=${seven_day}% (warn at ${WARNING_PCT}%/${WEEKLY_WARNING_PCT}%)" 311 312 # Check if the 5h window has reset since we entered conservation 313 if [ "$CONSERVATION_MODE" = "true" ] && [ -n "$five_resets" ] && [ -n "$LIMIT_HIT_SINCE" ]; then 314 resets_epoch=$(date -d "$five_resets" +%s 2>/dev/null || echo "0") 315 hit_epoch=$(date -d "$LIMIT_HIT_SINCE" +%s 2>/dev/null || echo "0") 316 if [ "$resets_epoch" -gt 0 ] && [ "$resets_epoch" -gt "$hit_epoch" ]; then 317 CONSERVATION_MODE=false 318 CONSECUTIVE_LIMIT_HITS=0 319 LIMIT_HIT_SINCE="" 320 CONSERVATION_REASON="" 321 node -e "const { clearRateLimit } = require('$PROJECT_ROOT/src/utils/rate-limit-scheduler.js'); clearRateLimit('claude_cli');" 2>/dev/null || true 322 log "CONSERVATION MODE OFF — 5h window reset at $five_resets. Usage now ${five_hour}%. Resuming full operation." 323 return 324 fi 325 # Also exit if usage has dropped back below warning threshold 326 if [ "$five_hour" -lt "$((WARNING_PCT - 10))" ] && [ "$seven_day" -lt "$((WEEKLY_WARNING_PCT - 10))" ]; then 327 CONSERVATION_MODE=false 328 CONSECUTIVE_LIMIT_HITS=0 329 LIMIT_HIT_SINCE="" 330 CONSERVATION_REASON="" 331 node -e "const { clearRateLimit } = require('$PROJECT_ROOT/src/utils/rate-limit-scheduler.js'); clearRateLimit('claude_cli');" 2>/dev/null || true 332 log "CONSERVATION MODE OFF — usage recovered (5h=${five_hour}% weekly=${seven_day}%). Resuming full operation." 333 return 334 fi 335 fi 336 337 # Enter conservation proactively 338 if [ "$CONSERVATION_MODE" = "false" ]; then 339 if [ "$five_hour" -ge "$WARNING_PCT" ]; then 340 CONSERVATION_MODE=true 341 LIMIT_HIT_SINCE=$(date '+%Y-%m-%d %H:%M:%S') 342 CONSERVATION_REASON="5h=${five_hour}% (>=${WARNING_PCT}%)" 343 log "CONSERVATION MODE ON (proactive) — 5h window at ${five_hour}%. Deferring Opus rewording/proposals. Keeping replies+Haiku. Resets: $five_resets" 344 elif [ "$seven_day" -ge "$WEEKLY_WARNING_PCT" ]; then 345 CONSERVATION_MODE=true 346 LIMIT_HIT_SINCE=$(date '+%Y-%m-%d %H:%M:%S') 347 CONSERVATION_REASON="weekly=${seven_day}% (>=${WEEKLY_WARNING_PCT}%)" 348 log "CONSERVATION MODE ON (proactive) — weekly window at ${seven_day}%. Deferring Opus rewording/proposals. Resets: $(echo "$usage" | node -e "let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{try{process.stdout.write(JSON.parse(d).seven_day_resets_at||'');}catch{process.stdout.write('');}});")" 349 fi 350 fi 351 } 352 353 # Reactive: detect hard limit errors from claude -p envelope. 354 # Args: raw_result (full JSON envelope string), batch_type 355 check_limit_signal() { 356 raw="$1" 357 btype="$2" 358 359 [ -z "$raw" ] && return 360 361 is_limit=$(echo "$raw" | node -e " 362 let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{ 363 try { 364 const env = JSON.parse(d); 365 // Only scan envelope-level metadata for limit signals — NOT env.result (LLM content). 366 // LLM-generated text (proposals, emails, rewrites) may mention "rate limit" etc. 367 const meta = [ 368 env.is_error ? 'is_error' : '', 369 typeof env.stop_reason === 'string' ? env.stop_reason : '', 370 typeof env.error === 'string' ? env.error : '', 371 typeof env.error?.message === 'string' ? env.error.message : '', 372 env.is_error && typeof env.result === 'string' && env.result.length < 500 ? env.result : '', 373 ].join(' ').toLowerCase(); 374 const hasKeyword = env.is_error && /rate.?limit|usage.?limit|claude.?max|overloaded|quota|529|too.?many.?request/.test(meta); 375 process.stdout.write(hasKeyword ? '1' : '0'); 376 } catch { process.stdout.write('0'); } 377 }); 378 " 2>/dev/null || echo "0") 379 380 if [ "$is_limit" = "1" ]; then 381 CONSECUTIVE_LIMIT_HITS=$((CONSECUTIVE_LIMIT_HITS + 1)) 382 [ -z "$LIMIT_HIT_SINCE" ] && LIMIT_HIT_SINCE=$(date '+%Y-%m-%d %H:%M:%S') 383 CONSERVATION_REASON="hard limit on $btype" 384 if [ "$CONSERVATION_MODE" = "false" ]; then 385 CONSERVATION_MODE=true 386 log "CONSERVATION MODE ON (reactive) — hard limit on $btype (hit #$CONSECUTIVE_LIMIT_HITS). Deferring Opus tasks." 387 else 388 log "Usage limit still active on $btype (hit #$CONSECUTIVE_LIMIT_HITS since $LIMIT_HIT_SINCE)" 389 fi 390 391 # Persist to rate-limits.json so the state survives orchestrator restarts 392 # and is visible to should_skip_for_rate_limits() on next cycle. 393 # Uses 'hourly' reset window (1h backoff) — conservation mode recovery 394 # logic may clear it sooner if usage drops below threshold. 395 node -e " 396 const { setRateLimit } = require('$PROJECT_ROOT/src/utils/rate-limit-scheduler.js'); 397 setRateLimit('claude_cli', { 398 limitType: 'hourly', 399 reason: 'Claude Max hard limit on $btype (hit #$CONSECUTIVE_LIMIT_HITS)', 400 }); 401 " 2>/dev/null || log "WARN: Failed to persist claude_cli rate limit to rate-limits.json" 402 fi 403 } 404 405 # Query pipeline backlog counts for intelligent batch skipping. 406 # All counts are ACTIONABLE — filtered to only items that can reach a sent outreach. 407 # The core filter is ACTIVE_COUNTRY_CODES = all countries NOT in OUTREACH_BLOCKED_COUNTRIES. 408 # This filter is inherited by every gate so that blocking a country upstream stops all work 409 # for that country at every stage (scoring, enriching, proposals, reword). 410 # 411 # Sets global vars: 412 # BACKLOG_ELIGIBLE_OUTREACH — approved email/sms, delivery_status IS NULL, past 3d cooldown, gdpr_verified where required 413 # BACKLOG_PROPOSALS — proposals_drafted sites with unsent email/sms in active countries 414 # BACKLOG_ENRICHED — enriched sites in active countries, score < LOW_SCORE_CUTOFF 415 # BACKLOG_ACTIVE_COUNTRIES — count of active (non-blocked) country codes in pipeline 416 # BACKLOG_APPROVED_UNSENT — all approved+unsent (any channel, for display only) 417 # BACKLOG_PENDING_APPROVAL — pending approval count (display only) 418 refresh_backlog() { 419 # Read OUTREACH_SKIP_METHODS, OUTREACH_BLOCKED_COUNTRIES, and SMS-blocked countries from .env 420 _skip_methods=$(grep '^OUTREACH_SKIP_METHODS=' "$PROJECT_ROOT/.env" 2>/dev/null | tail -1 | cut -d= -f2- | sed 's/#.*//' | tr -d '"' | tr -d "'" | tr '[:upper:]' '[:lower:]' | tr -d ' ') 421 _blocked_countries=$(grep '^OUTREACH_BLOCKED_COUNTRIES=' "$PROJECT_ROOT/.env" 2>/dev/null | tail -1 | cut -d= -f2- | sed 's/#.*//' | tr -d '"' | tr -d "'" | tr '[:upper:]' '[:lower:]' | tr -d ' ') 422 _sms_blocked_countries=$(grep '^OUTREACH_BLOCKED_SMS_COUNTRIES=' "$PROJECT_ROOT/.env" 2>/dev/null | tail -1 | cut -d= -f2- | sed 's/#.*//' | tr -d '"' | tr -d "'" | tr '[:upper:]' '[:lower:]' | tr -d ' ') 423 424 eval "$(node "$PROJECT_ROOT/scripts/refresh-backlog.js" \ 425 --skip-methods="$_skip_methods" \ 426 --blocked-countries="$_blocked_countries" \ 427 --sms-blocked-countries="$_sms_blocked_countries" \ 428 --low-score-cutoff="$LOW_SCORE_CUTOFF" \ 429 --project-root="$PROJECT_ROOT" 2>/dev/null)" 430 431 # ── Data-loss canary: halt if site count dropped >50% ── 432 if [ "${CANARY_PREVIOUS_COUNT:-0}" -gt 100 ] && [ "${CANARY_SITE_COUNT:-0}" -gt 0 ]; then 433 _drop_pct=$(( (CANARY_PREVIOUS_COUNT - CANARY_SITE_COUNT) * 100 / CANARY_PREVIOUS_COUNT )) 434 if [ "$_drop_pct" -gt 50 ]; then 435 log "CRITICAL: Site count dropped ${_drop_pct}% (${CANARY_PREVIOUS_COUNT} → ${CANARY_SITE_COUNT}) — HALTING ORCHESTRATOR" 436 log "ACTION REQUIRED: Check PostgreSQL mmo database integrity. Restore from backup if needed." 437 log "To resume: rm logs/site-count-canary.json && restart orchestrator" 438 exit 1 439 fi 440 elif [ "${CANARY_PREVIOUS_COUNT:-0}" -gt 100 ] && [ "${CANARY_SITE_COUNT:-0}" -eq 0 ]; then 441 log "CRITICAL: Site count is ZERO (was ${CANARY_PREVIOUS_COUNT}) — HALTING ORCHESTRATOR" 442 log "ACTION REQUIRED: Database has been wiped. Restore from backup immediately." 443 log "To resume: rm logs/site-count-canary.json && restart orchestrator" 444 exit 1 445 fi 446 } 447 448 # Load STAGE_THROTTLE_MULTIPLIER and LOW_SCORE_CUTOFF from .env (defaults: 3 and 82). 449 # Read once at startup — restart orchestrator to pick up changes. 450 STAGE_THROTTLE_MULTIPLIER=$(grep '^STAGE_THROTTLE_MULTIPLIER=' "$PROJECT_ROOT/.env" 2>/dev/null | tail -1 | cut -d= -f2- | sed 's/#.*//' | tr -d '"' | tr -d "'" | tr -d ' ') 451 STAGE_THROTTLE_MULTIPLIER=${STAGE_THROTTLE_MULTIPLIER:-3} 452 LOW_SCORE_CUTOFF=$(grep '^LOW_SCORE_CUTOFF=' "$PROJECT_ROOT/.env" 2>/dev/null | tail -1 | cut -d= -f2- | sed 's/#.*//' | tr -d '"' | tr -d "'" | tr -d ' ') 453 LOW_SCORE_CUTOFF=${LOW_SCORE_CUTOFF:-82} 454 455 # Load SKIP_STAGES from .env on every loop iteration so changes take effect without restart. 456 # Always reads .env — edit the file to change skip behaviour live. 457 load_skip_stages() { 458 if [ -f "$PROJECT_ROOT/.env" ]; then 459 val=$(grep '^SKIP_STAGES=' "$PROJECT_ROOT/.env" 2>/dev/null | tail -1 | cut -d= -f2- | sed 's/#.*//' | tr -d '"' | tr -d "'") 460 SKIP_STAGES="$val" 461 fi 462 # Normalise: lowercase, strip spaces 463 SKIP_STAGES=$(echo "${SKIP_STAGES:-}" | tr '[:upper:]' '[:lower:]' | tr -d ' ') 464 } 465 466 # Check if a batch type is blocked by the pipeline's rate-limit-scheduler state. 467 # Reads logs/rate-limits.json (written by circuit-breaker.js when an API quota/rate-limit hits). 468 # This ensures the orchestrator respects the same automatic backoffs as the pipeline — 469 # e.g. if ZenRows hits its daily quota and blocks 'serps', both systems honour that pause. 470 # Returns 0 = skip (rate-limited), 1 = run 471 should_skip_for_rate_limits() { 472 batch_type="$1" 473 rate_limits_file="$PROJECT_ROOT/logs/rate-limits.json" 474 [ -f "$rate_limits_file" ] || return 1 # No file = no active rate limits 475 476 # Map batch_type → stage name to check in rate-limits.json 477 case "$batch_type" in 478 proposals_email|proposals_sms) stage="proposals" ;; 479 reply_responses) stage="outreach" ;; 480 classify_replies) stage="replies" ;; 481 score_semantic|score_sites) stage="scoring" ;; 482 enrich_sites) stage="enrich" ;; 483 followup_generate) stage="outreach" ;; 484 *) stage="" ;; 485 esac 486 487 now_ms=$(date +%s%3N 2>/dev/null || echo "0") 488 489 # Check each entry: if resetAt > now and stages includes our stage, skip. 490 # Also check 'claude_cli' — when Claude Max credits are exhausted, ALL orchestrator 491 # batches are blocked regardless of stage mapping (they all use claude -p). 492 matched=$(node -e " 493 try { 494 const data = JSON.parse(require('fs').readFileSync('$rate_limits_file', 'utf8')); 495 const stage = '$stage'; 496 const now = $now_ms; 497 498 // Global claude_cli gate — blocks ALL orchestrator batches 499 if (data.claude_cli && Number(data.claude_cli.resetAt) > now) { 500 const resetAt = new Date(Number(data.claude_cli.resetAt)).toISOString(); 501 process.stdout.write('claude_cli:' + resetAt + ':' + (data.claude_cli.reason || '')); 502 process.exit(0); 503 } 504 505 // Stage-specific gate (e.g. openrouter scoring, zenrows serps) 506 if (stage) { 507 for (const [api, info] of Object.entries(data)) { 508 if (Array.isArray(info.stages) && info.stages.includes(stage) && Number(info.resetAt) > now) { 509 const resetAt = new Date(Number(info.resetAt)).toISOString(); 510 process.stdout.write(api + ':' + resetAt + ':' + (info.reason || '')); 511 process.exit(0); 512 } 513 } 514 } 515 } catch(e) {} 516 " 2>/dev/null) 517 518 if [ -n "$matched" ]; then 519 api=$(echo "$matched" | cut -d: -f1) 520 reset_at=$(echo "$matched" | cut -d: -f2) 521 reason=$(echo "$matched" | cut -d: -f3-) 522 log "$batch_type: SKIP (rate-limited by $api until $reset_at — $reason)" 523 return 0 524 fi 525 return 1 526 } 527 528 # Check if a batch type is blocked by SKIP_STAGES. 529 # Returns 0 = skip (stage paused), 1 = run 530 should_skip_for_stage_flag() { 531 batch_type="$1" 532 [ -z "$SKIP_STAGES" ] && return 1 # Nothing skipped 533 534 # Map batch types to pipeline stage names 535 case "$batch_type" in 536 proposals_email|proposals_sms) 537 echo "$SKIP_STAGES" | grep -q 'proposals' && { 538 log "$batch_type: SKIP (SKIP_STAGES=proposals)"; return 0; } ;; 539 reply_responses) 540 echo "$SKIP_STAGES" | grep -q 'outreach\|replies\|reply' && { 541 log "$batch_type: SKIP (SKIP_STAGES includes outreach/replies)"; return 0; } ;; 542 classify_replies) 543 echo "$SKIP_STAGES" | grep -q 'replies\|classify' && { 544 log "$batch_type: SKIP (SKIP_STAGES=replies/classify)"; return 0; } ;; 545 extract_names) 546 echo "$SKIP_STAGES" | grep -q 'extract_names\|extract' && { 547 log "$batch_type: SKIP (SKIP_STAGES=extract_names)"; return 0; } ;; 548 proofread) 549 echo "$SKIP_STAGES" | grep -q 'proofread' && { 550 log "$batch_type: SKIP (SKIP_STAGES=proofread)"; return 0; } ;; 551 score_semantic|score_sites) 552 echo "$SKIP_STAGES" | grep -q 'scoring' && { 553 log "$batch_type: SKIP (SKIP_STAGES=scoring)"; return 0; } ;; 554 enrich_sites) 555 echo "$SKIP_STAGES" | grep -q 'enrich' && { 556 log "$batch_type: SKIP (SKIP_STAGES=enrich)"; return 0; } ;; 557 followup_generate) 558 echo "$SKIP_STAGES" | grep -q 'followup' && { 559 log "$batch_type: SKIP (SKIP_STAGES=followup)"; return 0; } ;; 560 esac 561 return 1 # Not skipped 562 } 563 564 # Check if a batch type should be skipped because its downstream queue is already saturated. 565 # Uses STAGE_THROTTLE_MULTIPLIER (default 3) × the relevant batch size as the threshold. 566 # 567 # Stage chain (each gate pauses all earlier stages): 568 # enriched → [proposals_*] → proposals_drafted → [proofread] → approved outreach 569 # 570 # Gate 1 (outreach gate): if eligible outreach > 3× rolling 7-day avg daily send rate 571 # (floor: 3× proofread_batch, so the gate isn't too permissive 572 # when historical send rate is very low or zero) 573 # pause: proofread, proposals_*, enrich_sites, score_semantic 574 # Gate 2 (proposals gate): if proposals_drafted > 3× (proposals_email_batch + proposals_sms_batch) 575 # pause: proposals_*, enrich_sites 576 # Gate 3 (enrich gate): if enriched > 3× enrich_sites_batch 577 # pause: enrich_sites, score_semantic 578 # 579 # Exception: serps is never paused (ZenRows daily quota must be used). 580 # Returns 0 = skip, 1 = run 581 should_skip_for_backlog() { 582 batch_type="$1" 583 584 # Gate 1: outreach queue saturated — pause everything feeding into it. 585 # Use 3× the rolling 7-day daily send average as the threshold, with a floor 586 # of 3× PROOFREAD_BATCH to prevent the gate from being too permissive when 587 # the send rate is very low (e.g. pipeline just started, GDPR blocking, etc). 588 _static_floor=$((PROOFREAD_BATCH * STAGE_THROTTLE_MULTIPLIER)) 589 _send_rate_threshold=$((${BACKLOG_DAILY_SEND_AVG:-0} * STAGE_THROTTLE_MULTIPLIER)) 590 _outreach_threshold=$(( _send_rate_threshold > _static_floor ? _send_rate_threshold : _static_floor )) 591 592 case "$batch_type" in 593 proofread|proposals_email|proposals_sms|enrich_sites|score_semantic) 594 if [ "${BACKLOG_ELIGIBLE_OUTREACH:-0}" -gt "$_outreach_threshold" ]; then 595 log "$batch_type: SKIP — eligible_outreach=${BACKLOG_ELIGIBLE_OUTREACH} > threshold=${_outreach_threshold} (${STAGE_THROTTLE_MULTIPLIER}×max(daily_avg=${BACKLOG_DAILY_SEND_AVG:-0},floor=${PROOFREAD_BATCH}))" 596 return 0 597 fi 598 ;; 599 esac 600 601 # Gate 2: proposals_drafted queue saturated — pause proposal generation and earlier stages. 602 # Threshold based on PROOFREAD throughput (the bottleneck), not proposal generation batch size. 603 # Proofread clears PROOFREAD_BATCH per cycle; allow 3× that buffer before pausing upstream. 604 _proposals_threshold=$(( PROOFREAD_BATCH * STAGE_THROTTLE_MULTIPLIER )) 605 606 case "$batch_type" in 607 proposals_email|proposals_sms|enrich_sites) 608 if [ "${BACKLOG_PROPOSALS:-0}" -gt "$_proposals_threshold" ]; then 609 log "$batch_type: SKIP — proposals_drafted=${BACKLOG_PROPOSALS} > threshold=${_proposals_threshold} (${STAGE_THROTTLE_MULTIPLIER}×${_proposals_batch_max})" 610 return 0 611 fi 612 ;; 613 esac 614 615 # Gate 3: enriched queue saturated — pause enrich and scoring. 616 # Threshold based on proposals throughput (PROPOSALS_EMAIL + PROPOSALS_SMS per cycle), 617 # not enrich batch size. Enriched sites are consumed by proposal generation. 618 _proposals_batch_max=$(( PROPOSALS_EMAIL_BATCH + PROPOSALS_SMS_BATCH )) 619 _enrich_threshold=$(( _proposals_batch_max * STAGE_THROTTLE_MULTIPLIER * 5 )) 620 621 case "$batch_type" in 622 enrich_sites|score_semantic) 623 if [ "${BACKLOG_ENRICHED:-0}" -gt "$_enrich_threshold" ]; then 624 log "$batch_type: SKIP — enriched=${BACKLOG_ENRICHED} > threshold=${_enrich_threshold} (${STAGE_THROTTLE_MULTIPLIER}×${ENRICH_SITES_BATCH})" 625 return 0 626 fi 627 ;; 628 esac 629 630 return 1 # Don't skip — run normally 631 } 632 633 # Run a single batch type 634 # Args: batch_type model batch_size context_files... 635 run_batch() { 636 batch_type="$1" 637 model="$2" 638 batch_size="$3" 639 shift 3 640 # Remaining args are context file paths 641 642 log "Pulling $batch_type batch (limit=$batch_size)..." 643 batch_json=$(node scripts/claude-batch.js "$batch_type" "$batch_size" 2>/dev/null || echo '{"count":0,"items":[]}') 644 645 count=$(echo "$batch_json" | node -e " 646 let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{ 647 try { process.stdout.write(String(JSON.parse(d).count||0)); } 648 catch { process.stdout.write('0'); } 649 }); 650 " 2>/dev/null || echo "0") 651 [ -z "$count" ] && count=0 652 653 if [ "$count" = "0" ]; then 654 log "$batch_type: no items to process" 655 return 1 # Signal "empty" to caller 656 fi 657 658 _effort_log=$(effort_for_batch "$batch_type") 659 log "$batch_type: processing $count items with $model (effort=$_effort_log)..." 660 661 # Build the prompt — include context files and the batch data 662 context="" 663 for f in "$@"; do 664 if [ -f "$f" ]; then 665 context="$context 666 667 --- FILE: $f --- 668 $(cat "$f") 669 --- END FILE ---" 670 fi 671 done 672 673 # Build task-specific instructions 674 case "$batch_type" in 675 proposals_email) 676 task_prompt="Generate email proposals for each site below. Follow the PROPOSAL.md and email best practices. Output JSON: {\"batch_type\":\"proposals_email\",\"results\":[{\"site_id\":N,\"contact_method\":\"email\",\"contact_uri\":\"...\",\"country_code\":\"XX\",\"message_body\":\"...\",\"subject_line\":\"...\"}]}" 677 ;; 678 proposals_sms) 679 task_prompt="Generate SMS proposals (<160 chars) for each site below. Follow PROPOSAL.md and SMS best practices. TCPA opt-out required for US/CA only. Output JSON: {\"batch_type\":\"proposals_sms\",\"results\":[{\"site_id\":N,\"contact_method\":\"sms\",\"contact_uri\":\"...\",\"country_code\":\"XX\",\"message_body\":\"...\"}]}" 680 ;; 681 classify_replies) 682 task_prompt="$(cat prompts/CLASSIFY-REPLIES.md)" 683 ;; 684 extract_names) 685 task_prompt="Extract the person's first name from these email addresses and domains. If no name can be inferred, leave null. Output JSON: {\"batch_type\":\"extract_names\",\"results\":[{\"site_id\":N,\"contacts\":[{\"uri\":\"...\",\"name\":\"FirstName\"}]}]}" 686 ;; 687 reply_responses) 688 task_prompt="Follow the reply guidelines in prompts/REPLIES.md. Respond to every inbound in the batch. Output ONLY valid JSON — no markdown, no explanation." 689 ;; 690 oversee) 691 task_prompt="You are the senior overseer of an automated lead-generation pipeline (333 Method). Analyze the system health snapshot and decide what corrective actions to take. The structured JSON is the authoritative source of truth — previous_overseer_findings is historical context only. 692 KNOWN SELF-HEALING BEHAVIOURS (do NOT escalate): cron_timer_dead is auto-fixed by processGuardian every minute. 1-minute cron gaps are expected. 693 COST MODEL (do NOT flag as expensive): All LLM batches run via `claude -p --model <name>` which uses the Claude Max subscription — zero marginal cost per call regardless of model (opus/sonnet/haiku). Do NOT flag opus or sonnet model usage as a cost issue. Only OpenRouter direct-API calls (gather_evidence, scoring, replies) count against the $200/day LLM_DAILY_BUDGET. 694 VALID PIPELINE STATUSES (ALL are intentional — do NOT flag as invalid): found, assets_captured, prog_scored, semantic_scored, vision_scored, enriched_regex, enriched_llm, enriched, proposals_drafted, outreach_partial, outreach_sent, rescored, skipped, error. enriched_regex is an intermediate enrichment step (browser pass complete, LLM pass pending) — it is expected and normal. 695 Available actions: RESTART_PIPELINE (if stopped/crash-looping), CLEAR_STALE_TASKS (params: min_age_hours), RESET_STUCK_SITES (params: stage, min_stuck_hours ≥ 4 — never lower), LOG_ONLY (params: note). Do NOT reset a stage when its target already has a large backlog (found>5000: don't reset assets_captured or scored; rescored>500: don't reset enriched or rescored). Sites with recapture_count≥2 are auto-excluded from resets. NEVER reset 'enriched' stage — enriched is a deliberate evidence-collection queue (gather_evidence cron picks sites up every 5 min); large enriched backlogs are normal and expected. 696 Output JSON: {\"batch_type\":\"oversee\",\"results\":[{\"summary\":\"1-2 sentences\",\"severity\":\"ok|warn|critical\",\"findings\":[{\"issue\":\"...\",\"severity\":\"low|medium|high\",\"stage\":\"pipeline|agents|cron|sites\"}],\"actions\":[{\"code\":\"ACTION_CODE\",\"description\":\"...\",\"params\":{}}]}]}" 697 ;; 698 classify_errors) 699 task_prompt="Analyze these pipeline error messages that don't match any existing patterns. Propose regex patterns for categorization using JavaScript regex syntax (no leading/trailing slashes). Group similar errors into one pattern where possible. Terminal = permanent failure; Retriable = transient or fixable. 700 Output JSON: {\"batch_type\":\"classify_errors\",\"results\":[{\"pattern\":\"regex\",\"label\":\"Short Label\",\"group\":\"terminal|retriable\",\"context\":\"site|outreach\",\"example_errors\":[\"...\"],\"occurrence_count\":N}]}" 701 ;; 702 score_sites) 703 task_prompt="You are a CRO specialist scoring local business websites for conversion effectiveness. For each site in BATCH DATA, analyze the provided HTML and score it using the CRO scoring system defined in the context above. 704 705 CRITICAL RULES: 706 - Evaluate each site independently using ONLY its provided HTML and http_headers 707 - Always return COMPLETE factor_scores for every site — never omit any factor 708 - Do NOT include conversion_score or letter_grade — these are computed programmatically 709 - Output ONLY valid JSON, no markdown fences 710 711 Output JSON: {\"batch_type\":\"score_sites\",\"results\":[{\"site_id\":N,\"factor_scores\":{\"headline_quality\":{\"score\":N,\"reasoning\":\"...\",\"evidence\":\"...\"},\"value_proposition\":{\"score\":N,\"reasoning\":\"...\",\"evidence\":\"...\"},\"unique_selling_proposition\":{\"score\":N,\"reasoning\":\"...\",\"evidence\":\"...\"},\"call_to_action\":{\"score\":N,\"reasoning\":\"...\",\"evidence\":\"...\"},\"urgency_messaging\":{\"score\":N,\"reasoning\":\"...\",\"evidence\":\"...\"},\"hook_engagement\":{\"score\":N,\"reasoning\":\"...\",\"evidence\":\"...\"},\"trust_signals\":{\"score\":N,\"reasoning\":\"...\",\"evidence\":\"...\"},\"imagery_design\":{\"score\":N,\"reasoning\":\"...\",\"evidence\":\"...\"},\"offer_clarity\":{\"score\":N,\"reasoning\":\"...\",\"evidence\":\"...\"},\"contextual_appropriateness\":{\"score\":N,\"reasoning\":\"...\",\"industry_context\":\"...\"}},\"overall_calculation\":{\"grade_interpretation\":\"...\",\"city\":\"...\",\"country_code\":\"AU\",\"state\":\"...\",\"country_detection_confidence\":\"high|medium|low\",\"country_detection_evidence\":[\"...\"],\"is_business_directory\":false,\"is_local_business\":true,\"is_law_firm\":false,\"industry_classification\":\"plumbing\",\"is_error_page\":false,\"error_type\":null,\"error_description\":null,\"is_broken_site\":false,\"broken_site_details\":[]},\"contact_details\":{\"city\":\"...\",\"country_code\":\"AU\",\"state\":\"...\",\"email_addresses\":[],\"phone_numbers\":[],\"social_profiles\":[],\"key_pages\":[]}}]}" 712 ;; 713 enrich_sites) 714 task_prompt="You are a contact data extraction specialist. For each site in BATCH DATA, extract contact information from the provided HTML using the enrichment guidelines in the context above. 715 716 CRITICAL RULES: 717 - Extract ONLY information explicitly visible in the HTML — do NOT fabricate data 718 - If current_contacts already has a field, only add NEW items not already present 719 - Preserve phone numbers EXACTLY as displayed (do not normalize format) 720 - Return an empty object {} for sites where no new contacts are found 721 722 Output JSON: {\"batch_type\":\"enrich_sites\",\"results\":[{\"site_id\":N,\"domain\":\"example.com\",\"business_name\":\"...\",\"city\":\"...\",\"country_code\":\"AU\",\"state\":\"NSW\",\"email_addresses\":[{\"email\":\"info@example.com\",\"label\":\"General\"}],\"phone_numbers\":[{\"number\":\"+61 2 1234 5678\",\"label\":\"Office\"}],\"social_profiles\":[{\"url\":\"https://facebook.com/example\",\"label\":\"Facebook\"}],\"key_pages\":[\"https://example.com/contact\"],\"primary_contact_form\":null}]}" 723 ;; 724 score_semantic) 725 task_prompt="You are scoring local business websites for conversion effectiveness. For each site, evaluate these 3 semantic factors from the extracted text: 726 727 1. **headline_quality** (0-10): Is the H1/title compelling? Does it promise a benefit, address the visitor, create curiosity? Generic 'Welcome' or just a business name = 2-4. Specific benefit + audience = 7-9. 728 2. **value_proposition** (0-10): Does the page clearly articulate WHAT they do, WHO they help, and WHY choose them? Look for quantified claims, benefit language, customer-focused framing (you/your > we/our). Vague 'quality service' = 3-4. Specific benefits with proof = 7-9. 729 3. **unique_selling_proposition** (0-10): What differentiates them? Look for 'only', 'exclusive', years of experience, number of customers served, awards, patents, specific methodologies. Generic = 3-4. Clear differentiators = 7-9. 730 731 Score conservatively. Most local business sites score 4-7. A 9-10 is exceptional (clear benefit headline + quantified proof + genuine differentiators). A 0-2 means almost no content or completely generic. 732 733 Output JSON: {\"batch_type\":\"score_semantic\",\"results\":[{\"site_id\":N,\"headline_quality\":{\"score\":N,\"reasoning\":\"...\"},\"value_proposition\":{\"score\":N,\"reasoning\":\"...\"},\"unique_selling_proposition\":{\"score\":N,\"reasoning\":\"...\"}}]}" 734 ;; 735 proofread) 736 task_prompt="Review each message in BATCH DATA against the proofreading rules in the context above. Apply every rule. Output a decision for every item — no omissions. 737 738 Output JSON: {\"batch_type\":\"proofread\",\"results\":[{\"message_id\":N,\"decision\":\"approve\"},{\"message_id\":N,\"decision\":\"rework\",\"rework_instructions\":\"...\"},{\"message_id\":N,\"decision\":\"reject\",\"reject_reason\":\"...\"}]}" 739 ;; 740 code_review) 741 task_prompt="You are a senior code reviewer (Code Reviewer agent). Review the source file in BATCH DATA for security vulnerabilities, bugs, performance issues, and maintainability problems. Follow the review criteria in the context above. 742 743 For each finding include: severity (1-10), category (security/bug/performance/style), line number, description, and a concrete suggestion. 744 745 Only report real issues with concrete fixes. Do not hallucinate problems. 746 747 Output JSON: {\"batch_type\":\"code_review\",\"results\":[{\"file_path\":\"...\",\"findings\":[{\"severity\":N,\"category\":\"...\",\"line\":N,\"description\":\"...\",\"suggestion\":\"...\"}],\"summary\":\"...\"}]}" 748 ;; 749 monitor_health) 750 task_prompt="You are an SRE monitoring a Node.js outreach pipeline. Analyze the system health data in BATCH DATA. Follow the monitoring criteria in the context above. 751 752 Identify any critical issues, warnings, or degraded conditions. If the system is healthy, say so explicitly. 753 754 Output JSON: {\"batch_type\":\"monitor_health\",\"results\":[{\"severity\":\"info|warn|critical\",\"summary\":\"...\",\"findings\":[{\"severity\":\"...\",\"component\":\"...\",\"issue\":\"...\",\"detail\":\"...\"}],\"recommended_actions\":[\"...\"]}]}" 755 ;; 756 triage_errors) 757 task_prompt="You are an Incident Response Commander triaging pipeline errors. Classify each failed task in BATCH DATA. Follow the triage criteria in the context above. 758 759 For each error: determine type, severity, and action. Only flag create_fix_task for critical/high severity issues with a clear code fix needed. 760 761 Output JSON: {\"batch_type\":\"triage_errors\",\"results\":[{\"error_message\":\"...\",\"type\":\"pipeline_bug|infrastructure|external_api|data_quality|config|known_transient\",\"severity\":\"critical|high|medium|low\",\"action\":\"create_fix_task|monitor|ignore\",\"summary\":\"...\",\"suggested_fix\":\"...\"}]}" 762 ;; 763 check_docs) 764 task_prompt="You are a Technical Writer checking documentation freshness. Review the recently changed source files in BATCH DATA. Follow the checking criteria in the context above. 765 766 For each file, identify any stale documentation (comments, JSDoc, CLAUDE.md references, README mentions). Only report genuinely stale content — not missing docs for new code (that's a separate concern). 767 768 Output JSON: {\"batch_type\":\"check_docs\",\"results\":[{\"file_path\":\"...\",\"stale_docs\":[{\"location\":\"...\",\"issue\":\"...\",\"suggested_update\":\"...\"}],\"summary\":\"...\"}]}" 769 ;; 770 evidence_merge) 771 task_prompt="You are a CRO analyst merging evidence-based findings into HTML-scored website evaluations. Follow the merge criteria in the context above. 772 773 For each site in BATCH DATA: compare score_json factors against evidence_pass1 and evidence_pass2 findings. Revise scores only where evidence is concrete and specific. Surface additional findings that HTML scoring could not catch (CSS-confirmed issues, stock photo filenames, wrong-state links, above-fold placement). 774 775 Output JSON: {\"batch_type\":\"evidence_merge\",\"results\":[{\"site_id\":N,\"revised_scores\":{\"factor_name\":{\"original\":N,\"revised\":N,\"reason\":\"...\"}},\"additional_findings\":[{\"factor\":\"...\",\"finding\":\"...\",\"severity\":\"critical|major|minor\"}],\"evidence_summary\":\"...\"}]}" 776 ;; 777 followup_generate) 778 task_prompt="You are writing follow-up outreach messages for a website conversion audit service. Each site in BATCH DATA has already been contacted once (or more) with no reply. Generate the NEXT follow-up touch using a DIFFERENT value angle than previous messages. 779 780 FOLLOW-UP CADENCE (8 touches over 42 days): 781 - Touch 2 (Day 3): Different weakness + social proof (\"other businesses in your area\") 782 - Touch 3 (Day 7): ROI framing (\"this could be costing you \$X/month\") 783 - Touch 4 (Day 14): Case study + sample report link (auditandfix.com) 784 - Touch 5 (Day 21): Quick win — give one specific free fix to demonstrate value 785 - Touch 6 (Day 28): Ad waste angle (\"paying for ads with a low-scoring site?\") / competitor gap 786 - Touch 7 (Day 35): Authority — \"scored 43,000+ sites, pattern is always the same\" 787 - Touch 8 (Day 42): Breakup email (\"closing the file, offer stands if you need it later\") 788 789 RULES: 790 - Each result must include ALL channels from the site's channels array (generate one message per channel per site) 791 - Email: conversational, 3-5 short paragraphs, include subject line 792 - SMS: under 160 chars, punchy, no subject line needed 793 - NEVER repeat the same angle or wording as previous_messages 794 - Use the site's weaknesses for personalisation but pick DIFFERENT weaknesses than touch 1 795 - Touch 5: lead with a specific free fix — demonstrate competence, not just pitch 796 - Touch 6: if site has is_running_ads=true, lean into ad waste angle; otherwise use competitor gap 797 - Touch 8 (breakup): be respectful, leave the door open, mention auditandfix.com 798 - Match the tone to the country (G'day for AU, casual for US/CA, formal for UK/EU) 799 800 Output JSON: {\"batch_type\":\"followup_generate\",\"results\":[{\"site_id\":N,\"contact_method\":\"email|sms\",\"contact_uri\":\"...\",\"country_code\":\"XX\",\"sequence_step\":N,\"message_body\":\"...\",\"subject_line\":\"...(email only)\"}]}" 801 ;; 802 esac 803 804 full_prompt="$task_prompt 805 806 $context 807 808 BATCH DATA: 809 $batch_json 810 811 IMPORTANT: Output ONLY valid JSON. No markdown, no explanation, no code fences." 812 813 # Call Claude and pipe result to store 814 # Write prompt to a temp file to avoid shell arg-length limits with large batches. 815 # Unset CLAUDECODE so claude doesn't refuse to run if invoked inside another Claude Code session. 816 _prompt_file=$(mktemp /tmp/orch-prompt-XXXXXX) 817 printf '%s' "$full_prompt" > "$_prompt_file" 818 _prompt_bytes=$(wc -c < "$_prompt_file" 2>/dev/null || echo 0) 819 _prompt_kb=$(( _prompt_bytes / 1024 )) 820 log "$batch_type: context=${_prompt_kb}kB (prompt+context+batch)" 821 # Claude Code (Electron) requires XDG_RUNTIME_DIR to run. Systemd user services may 822 # not inherit it — set a fallback so claude doesn't silently exit with empty output. 823 _xdg="${XDG_RUNTIME_DIR:-/run/user/$(id -u)}" 824 [ -d "$_xdg" ] || _xdg="/tmp/runtime-$(id -u)" && mkdir -p "$_xdg" 2>/dev/null || true 825 826 # Write claude output DIRECTLY to a file — never capture in $() bash variable. 827 # bash $() in non-UTF8 locales (NixOS systemd env) corrupts multi-byte characters 828 # (Japanese, Chinese, emoji), making the extracted JSON invalid at random positions. 829 # File I/O preserves all bytes exactly regardless of locale. 830 _raw_file="$PROJECT_ROOT/logs/.orch-raw-$$.json" 831 _claude_stderr=$(mktemp /tmp/orch-stderr-XXXXXX) 832 _effort=$(effort_for_batch "$batch_type") 833 XDG_RUNTIME_DIR="$_xdg" env -u CLAUDECODE "$CLAUDE_BIN" -p --model "$model" --effort "$_effort" --output-format json \ 834 < "$_prompt_file" > "$_raw_file" 2>"$_claude_stderr" || true 835 rm -f "$_prompt_file" 836 _stderr_content=$(cat "$_claude_stderr"); rm -f "$_claude_stderr" 837 838 # Always check for usage limit signals (reads only ASCII metadata, file-safe) 839 check_limit_signal "$(cat "$_raw_file" 2>/dev/null || true)" "$batch_type" 840 841 # Debug: log raw file size and first 200 ASCII chars for diagnosis 842 _raw_size=$(wc -c < "$_raw_file" 2>/dev/null || echo "missing") 843 _raw_preview=$(cat "$_raw_file" 2>/dev/null | tr -cd '[:print:]' | head -c 200 || echo "") 844 log "$batch_type: raw_file=${_raw_size}b preview=$(echo "$_raw_preview" | head -c 150)" 845 846 if [ ! -s "$_raw_file" ]; then 847 rm -f "$_raw_file" 848 # Track consecutive empty results per batch type in the gate file. 849 # Pattern: external tool failure looks identical to "no work" — consecutive 850 # empties on a batch that had items is a strong signal something is broken. 851 _empty_key="empty_streak_${batch_type}" 852 _streak=$(node -e "try{const d=JSON.parse(require('fs').readFileSync('$GATE_FILE','utf8'));process.stdout.write(String(d['$_empty_key']||0));}catch{process.stdout.write('0');}" 2>/dev/null || echo "0") 853 _streak=$(( _streak + 1 )) 854 node -e " 855 const fs=require('fs'); 856 let d={};try{d=JSON.parse(fs.readFileSync('$GATE_FILE','utf8'));}catch{} 857 d['$_empty_key']=$_streak; 858 fs.writeFileSync('$GATE_FILE',JSON.stringify(d)); 859 " 2>/dev/null || true 860 861 _empty_msg="$batch_type: claude returned empty — skipping store${_stderr_content:+ (stderr: $(echo "$_stderr_content" | head -1))}" 862 if [ "$_streak" -ge 5 ]; then 863 log "ALERT: $_empty_msg [consecutive_empty=$_streak — likely tool failure, not empty queue]" 864 elif [ "$_streak" -ge 3 ]; then 865 log "WARN: $_empty_msg [consecutive_empty=$_streak]" 866 else 867 log "$_empty_msg" 868 fi 869 return 0 870 fi 871 872 # Success — reset the consecutive-empty streak for this batch type 873 node -e " 874 const fs=require('fs'); 875 let d={};try{d=JSON.parse(fs.readFileSync('$GATE_FILE','utf8'));}catch{} 876 delete d['empty_streak_${batch_type}']; 877 fs.writeFileSync('$GATE_FILE',JSON.stringify(d)); 878 " 2>/dev/null || true 879 880 store_summary=$(node "$PROJECT_ROOT/scripts/claude-store-wrapper.js" "$_raw_file" "$batch_type" 2>&1 || echo '{"error":"store_node_failed"}') 881 882 if [ -z "$store_summary" ] || echo "$store_summary" | grep -q '"error"'; then 883 # On error: preserve raw file for inspection 884 _saved_raw="$PROJECT_ROOT/logs/orch-raw-${batch_type}-saved.json" 885 cp "$_raw_file" "$_saved_raw" 2>/dev/null || true 886 log "$batch_type: saved raw to $_saved_raw for inspection" 887 fi 888 rm -f "$_raw_file" 889 890 if [ -z "$store_summary" ]; then 891 log "$batch_type: [WARN] store_summary empty — node produced no output. stderr=$(echo "$_stderr_content" | head -c 300)" 892 else 893 log "$batch_type: $store_summary" 894 [ -n "$_stderr_content" ] && echo "$store_summary" | grep -q '"error"' && log "$batch_type: claude_stderr=$(echo "$_stderr_content" | head -c 200)" 895 fi 896 return 0 897 } 898 899 # Returns 0 if the batch type is skipped in conservation mode, 1 if it should run. 900 # Time-critical and Haiku tasks always run. Opus-heavy generation deferred. 901 should_skip_for_conservation() { 902 batch_type="$1" 903 [ "$CONSERVATION_MODE" = "false" ] && return 1 # Not in conservation mode — don't skip 904 905 case "$batch_type" in 906 # Always run — time-critical (inbound replies can't wait) or cheap (Haiku) 907 reply_responses|classify_replies|extract_names|oversee|classify_errors) 908 return 1 ;; 909 # Defer — Opus-heavy or pipeline-upstream work that can wait 910 proposals_email|proposals_sms|score_semantic|score_sites|enrich_sites|proofread|followup_generate) 911 log "$batch_type: SKIP (conservation mode — Claude usage limit active since $LIMIT_HIT_SINCE)" 912 return 0 ;; 913 esac 914 return 1 915 } 916 917 # Run a batch with all standard skip checks (rate_limits, stage_flag, conservation, backlog). 918 # The skip functions themselves know which batch types they apply to — unknown types pass through. 919 # Usage: run_checked <type> <model> <batch_size> [context_files...] 920 run_checked() { 921 _rc_type="$1"; _rc_model="$2"; _rc_size="$3"; shift 3 922 [ -n "$SINGLE_TYPE" ] && [ "$SINGLE_TYPE" != "$_rc_type" ] && return 0 923 should_skip_for_rate_limits "$_rc_type" && return 0 924 should_skip_for_stage_flag "$_rc_type" && return 0 925 should_skip_for_conservation "$_rc_type" && return 0 926 should_skip_for_backlog "$_rc_type" && return 0 927 run_batch "$_rc_type" "$_rc_model" "$_rc_size" "$@" && had_work=true 928 } 929 930 # Run a time-gated batch with all standard skip checks + is_due gate. 931 # Usage: run_checked_gated <type> <model> <batch_size> <interval_minutes> [context_files...] 932 run_checked_gated() { 933 _rg_type="$1"; _rg_model="$2"; _rg_size="$3"; _rg_interval="$4"; shift 4 934 [ -n "$SINGLE_TYPE" ] && [ "$SINGLE_TYPE" != "$_rg_type" ] && return 0 935 should_skip_for_rate_limits "$_rg_type" && return 0 936 should_skip_for_stage_flag "$_rg_type" && return 0 937 should_skip_for_conservation "$_rg_type" && return 0 938 should_skip_for_backlog "$_rg_type" && return 0 939 if is_due "$_rg_type" "$_rg_interval"; then 940 run_batch "$_rg_type" "$_rg_model" "$_rg_size" "$@" && had_work=true 941 mark_ran "$_rg_type" 942 else 943 log "$_rg_type: not due yet (runs every ${_rg_interval}min)" 944 fi 945 } 946 947 # Check if any Tier C (interactive) agents are overdue. 948 # Writes logs/agents-due.json with overdue agents and their session start prompts. 949 # Called once per cycle so the daily report and AFK monitor can surface them. 950 check_tier_c_agents() { 951 _due_agents="[]" 952 953 # Security Engineer: quarterly (129600 min) + major refactor 954 if is_due "tier_c_security_audit" 129600 || [ "${MAJOR_REFACTOR:-false}" = "true" ]; then 955 _due_agents=$(echo "$_due_agents" | node -e " 956 let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{ 957 const a=JSON.parse(d); 958 a.push({agent:'security_audit',schedule:'quarterly', 959 prompt:'Run a Security Engineer audit on 333Method and 2Step. Review: (1) API key handling in load-env.js and .env files, (2) Twilio webhook authentication in src/inbound/sms.js, (3) PayPal integration in src/payment/paypal.js, (4) unsigned unsubscribe tokens in 2Step src/outreach/email.js, (5) stealth-browser SSRF risk, (6) SQL injection review of all DB queries. Output findings as JSON with severity/file/line/description.'}); 960 process.stdout.write(JSON.stringify(a)); 961 }); 962 " 2>/dev/null || echo "$_due_agents") 963 log "[AGENT DUE] security_audit — run Security Engineer (Opus, max, thinking on)" 964 fi 965 966 # Compliance Auditor: quarterly (129600 min) + major refactor 967 if is_due "tier_c_compliance_audit" 129600 || [ "${MAJOR_REFACTOR:-false}" = "true" ]; then 968 _due_agents=$(echo "$_due_agents" | node -e " 969 let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{ 970 const a=JSON.parse(d); 971 a.push({agent:'compliance_audit',schedule:'quarterly', 972 prompt:'Run a Compliance Auditor review on 333Method outreach system. Check: (1) CAN-SPAM compliance in email templates, (2) TCPA compliance in SMS sending (business hours, opt-out), (3) GDPR country blocking logic in src/config/countries.js, (4) unsubscribe honoring in src/outreach/, (5) 72-hour cooldown enforcement. Also audit 2Step CAN-SPAM compliance in src/outreach/email.js.'}); 973 process.stdout.write(JSON.stringify(a)); 974 }); 975 " 2>/dev/null || echo "$_due_agents") 976 log "[AGENT DUE] compliance_audit — run Compliance Auditor (Opus, high, thinking on)" 977 fi 978 979 # SEO Specialist: monthly (43200 min) for colorcraft-ai.com 980 if is_due "tier_c_seo_audit" 43200; then 981 _due_agents=$(echo "$_due_agents" | node -e " 982 let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{ 983 const a=JSON.parse(d); 984 a.push({agent:'seo_specialist',schedule:'monthly', 985 prompt:'Run an SEO Specialist audit on colorcraft-ai.com. Fetch the site, analyze technical SEO (meta tags, page speed indicators, structured data, sitemap, robots.txt), keyword opportunities, content gaps, and link building potential. Output a prioritized report of changes for Base44 to implement.'}); 986 process.stdout.write(JSON.stringify(a)); 987 }); 988 " 2>/dev/null || echo "$_due_agents") 989 log "[AGENT DUE] seo_specialist — run SEO Specialist (Sonnet, medium)" 990 fi 991 992 # Automation Governance: quarterly (129600 min) + major refactor 993 if is_due "tier_c_automation_audit" 129600 || [ "${MAJOR_REFACTOR:-false}" = "true" ]; then 994 _due_agents=$(echo "$_due_agents" | node -e " 995 let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{ 996 const a=JSON.parse(d); 997 a.push({agent:'automation_audit',schedule:'quarterly', 998 prompt:'Run an Automation Governance audit. Read the cron_jobs table (sqlite3 db/sites.db \"SELECT * FROM cron_jobs\"), review all cron jobs in src/cron/ and the orchestrator in scripts/claude-orchestrator.sh. Identify: redundant jobs, poor sequencing, missing error handling, jobs that should be consolidated. Output a ranked list of changes.'}); 999 process.stdout.write(JSON.stringify(a)); 1000 }); 1001 " 2>/dev/null || echo "$_due_agents") 1002 log "[AGENT DUE] automation_audit — run Automation Governance Architect (Sonnet, medium)" 1003 fi 1004 1005 # Code review completion check — surface Phase 5 reminder when queue is done 1006 if [ -f "logs/code-review-queue.json" ]; then 1007 _cr_done=$(node -e " 1008 try { 1009 const q = JSON.parse(require('fs').readFileSync('logs/code-review-queue.json','utf8')); 1010 process.stdout.write(q.next_index >= q.files.length ? 'yes' : 'no'); 1011 } catch { process.stdout.write('no'); } 1012 " 2>/dev/null || echo "no") 1013 if [ "$_cr_done" = "yes" ] && is_due "tier_c_phase5_reminder" 43200; then 1014 _due_agents=$(echo "$_due_agents" | node -e " 1015 let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{ 1016 const a=JSON.parse(d); 1017 a.push({agent:'phase5_sessions',schedule:'one-shot', 1018 prompt:'Code review of all 150 files is complete. Time to run Phase 5 Tier C sessions:\n1. Security audit: ./scripts/run-agent-session.sh security_audit\n2. 2Step CI/CD: ./scripts/run-agent-session.sh automation_audit\n3. colorcraft-ai.com report: ./scripts/run-agent-session.sh colorcraft_report\n4. Update distributed-agent-system plan (manual task in docs/plans/)'}); 1019 process.stdout.write(JSON.stringify(a)); 1020 }); 1021 " 2>/dev/null || echo "$_due_agents") 1022 log "[REMINDER] Code review complete — Phase 5 Tier C sessions ready to run" 1023 log " Run: ./scripts/run-agent-session.sh security_audit" 1024 log " Run: ./scripts/run-agent-session.sh colorcraft_report" 1025 log " Run: ./scripts/run-agent-session.sh automation_audit" 1026 log " Update: docs/plans/distributed-agent-system.md" 1027 fi 1028 fi 1029 1030 # Write results to logs/agents-due.json 1031 printf '%s\n' "$_due_agents" > logs/agents-due.json 2>/dev/null || true 1032 } 1033 1034 # Detect major refactor: check recent commits for patterns that trigger agent reviews. 1035 # Sets MAJOR_REFACTOR=true if criteria met. Called once per cycle. 1036 check_major_refactor() { 1037 MAJOR_REFACTOR=false 1038 _changed=$(git diff --name-only HEAD~10 2>/dev/null | head -200 || echo "") 1039 _count=$(echo "$_changed" | grep -c '.' 2>/dev/null) || _count=0 1040 1041 # 10+ files changed 1042 [ "$_count" -ge 10 ] && MAJOR_REFACTOR=true && return 0 1043 1044 # Security-sensitive files 1045 echo "$_changed" | grep -qE '^src/stages/|^scripts/claude-orchestrator\.sh$|^db/migrations/|^src/outreach/|^src/payment/|^src/inbound/|^load-env\.js$|^src/utils/load-env\.js$|^src/utils/stealth-browser' && MAJOR_REFACTOR=true && return 0 1046 1047 return 0 1048 } 1049 1050 # Define batch order — priority: outreach-adjacent first, upstream last 1051 process_all_batches() { 1052 had_work=false 1053 1054 # Re-resolve CLAUDE_BIN each cycle — auto-updates delete old versions 1055 resolve_claude_bin 1056 1057 # Run canary once per cycle (verifies node, claude-batch.js, CLAUDE_BIN are usable) 1058 run_canary 1059 1060 # Detect major refactor (sets MAJOR_REFACTOR env var for this cycle) 1061 check_major_refactor 1062 [ "$MAJOR_REFACTOR" = "true" ] && log "MAJOR_REFACTOR detected — Tier C agents flagged as due" 1063 1064 # Check Tier C agent schedule — writes logs/agents-due.json 1065 check_tier_c_agents 1066 1067 # Reload SKIP_STAGES from .env each cycle (respects live changes without restart) 1068 load_skip_stages 1069 [ -n "$SKIP_STAGES" ] && log "SKIP_STAGES=$SKIP_STAGES" 1070 1071 # Check Claude Max usage limits proactively (live API fetch, ~1s) 1072 check_usage_proactive 1073 1074 # Refresh pipeline backlog counts for intelligent skipping 1075 refresh_backlog 1076 1077 # Scale batch sizes up when backlogs are large so we clear them faster. 1078 # Caps are conservative — LLM context window is the real ceiling. 1079 if [ "${BACKLOG_PENDING_APPROVAL:-0}" -gt 5000 ]; then 1080 PROOFREAD_BATCH=200 1081 elif [ "${BACKLOG_PENDING_APPROVAL:-0}" -gt 1000 ]; then 1082 PROOFREAD_BATCH=100 1083 fi 1084 1085 if [ "${BACKLOG_PROPOSALS:-0}" -gt 5000 ]; then 1086 # Cap at 20 SMS / 10 email — 50 SMS exceeded 32k output token limit (29min run, hard error) 1087 REWORD_EMAIL_BATCH=10; REWORD_SMS_BATCH=20; REWORD_FORM_BATCH=30 1088 REWORD_LINKEDIN_BATCH=30; REWORD_X_BATCH=30 1089 elif [ "${BACKLOG_PROPOSALS:-0}" -gt 1000 ]; then 1090 REWORD_EMAIL_BATCH=10; REWORD_SMS_BATCH=30; REWORD_FORM_BATCH=30 1091 fi 1092 1093 conservation_note="" 1094 [ "$CONSERVATION_MODE" = "true" ] && conservation_note=" [CONSERVATION MODE: $CONSERVATION_REASON]" 1095 log "Backlog: total_sites=${CANARY_SITE_COUNT:-?} eligible_outreach=$BACKLOG_ELIGIBLE_OUTREACH actionable_proposals=$BACKLOG_PROPOSALS actionable_enriched=$BACKLOG_ENRICHED approved_unsent=$BACKLOG_APPROVED_UNSENT pending=$BACKLOG_PENDING_APPROVAL active_countries=$BACKLOG_ACTIVE_COUNTRIES daily_send_avg=${BACKLOG_DAILY_SEND_AVG:-0}${conservation_note}" 1096 log "Effective batch sizes: proofread=$PROOFREAD_BATCH score_semantic=$SCORE_SEMANTIC_BATCH enrich=$ENRICH_SITES_BATCH" 1097 1098 # --- Pipeline batches: outreach-adjacent first, upstream last --- 1099 run_checked proposals_email opus "$PROPOSALS_EMAIL_BATCH" prompts/PROPOSAL.md docs/05-outreach/email-best-practices.md 1100 run_checked proposals_sms opus "$PROPOSALS_SMS_BATCH" prompts/PROPOSAL.md docs/05-outreach/sms-best-practices.md 1101 # Semantic scoring: Sonnet evaluates headline, value prop, USP. Accuracy > cost. 1102 run_checked score_semantic sonnet "$SCORE_SEMANTIC_BATCH" 1103 1104 # score_sites: HTML-only LLM scoring (requires ENABLE_LLM_SCORING=true + ENABLE_VISION=false) 1105 if [ -z "$SINGLE_TYPE" ] || [ "$SINGLE_TYPE" = "score_sites" ]; then 1106 _llm_scoring=$(node -e "process.stdout.write(process.env.ENABLE_LLM_SCORING !== 'false' ? '1' : '0')" 2>/dev/null || echo "0") 1107 _enable_vision=$(node -e "process.stdout.write(process.env.ENABLE_VISION !== 'false' ? '1' : '0')" 2>/dev/null || echo "1") 1108 if [ "$_llm_scoring" = "1" ] && [ "$_enable_vision" = "0" ]; then 1109 run_checked score_sites sonnet "$SCORE_SITES_BATCH" prompts/CONVERSION-SCORING-NOVIS.md 1110 else 1111 [ -z "$SINGLE_TYPE" ] || log "score_sites: SKIP (requires ENABLE_LLM_SCORING=true + ENABLE_VISION=false)" 1112 fi 1113 fi 1114 1115 # enrich_sites: LLM contact enrichment (requires ENABLE_ENRICHMENT_LLM=true) 1116 if [ -z "$SINGLE_TYPE" ] || [ "$SINGLE_TYPE" = "enrich_sites" ]; then 1117 _enrich_llm=$(node -e "process.stdout.write(process.env.ENABLE_ENRICHMENT_LLM !== 'false' ? '1' : '0')" 2>/dev/null || echo "0") 1118 if [ "$_enrich_llm" = "1" ]; then 1119 run_checked enrich_sites haiku "$ENRICH_SITES_BATCH" prompts/ENRICHMENT.md 1120 else 1121 [ -z "$SINGLE_TYPE" ] || log "enrich_sites: SKIP (ENABLE_ENRICHMENT_LLM=false)" 1122 fi 1123 fi 1124 1125 # gather_evidence: moved to standalone cron job (5-min interval, migration 112). 1126 # No longer runs inline — the cron system handles execution independently. 1127 # The pending count is still logged here for orchestrator visibility. 1128 # Also reports sites awaiting merge (pass1+pass2 done, evidence_json still NULL). 1129 if [ -z "$SINGLE_TYPE" ] || [ "$SINGLE_TYPE" = "gather_evidence" ]; then 1130 _pending_evidence=$(node -e " 1131 const Database = require('./node_modules/better-sqlite3'); 1132 const db = new Database(process.env.DATABASE_PATH || './db/sites.db', { readonly: true }); 1133 const r = db.prepare(\"SELECT COUNT(*) as c FROM sites WHERE status IN ('enriched','enriched_llm') AND score IS NOT NULL AND score < 82 AND (evidence_pass1_json IS NULL OR evidence_pass2_json IS NULL)\").get(); 1134 const m = db.prepare(\"SELECT COUNT(*) as c FROM sites WHERE evidence_pass1_json IS NOT NULL AND evidence_pass2_json IS NOT NULL AND evidence_json IS NULL\").get(); 1135 process.stdout.write(JSON.stringify({ pending: r.c || 0, awaiting_merge: m.c || 0 })); 1136 db.close(); 1137 " 2>/dev/null || echo '{"pending":0,"awaiting_merge":0}') 1138 _ev_pending=$(echo "$_pending_evidence" | node -e "let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{try{process.stdout.write(String(JSON.parse(d).pending||0));}catch{process.stdout.write('0');}})" 2>/dev/null || echo "0") 1139 _ev_merge=$(echo "$_pending_evidence" | node -e "let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{try{process.stdout.write(String(JSON.parse(d).awaiting_merge||0));}catch{process.stdout.write('0');}})" 2>/dev/null || echo "0") 1140 if [ "$_ev_pending" -gt 0 ] 2>/dev/null || [ "$_ev_merge" -gt 0 ] 2>/dev/null; then 1141 log "gather_evidence: ${_ev_pending} sites pending collection, ${_ev_merge} awaiting merge (proposals gate requires evidence_json)" 1142 # Detect evidence-blocked state: outreach pipeline is empty and evidence is the critical path. 1143 # Log clearly so the overseer and monitoring can see this is expected, not a crash. 1144 if [ "${BACKLOG_ELIGIBLE_OUTREACH:-0}" -eq 0 ] && [ "${BACKLOG_PROPOSALS:-0}" -eq 0 ] 2>/dev/null; then 1145 log "EVIDENCE-BLOCKED: eligible_outreach=0 proposals=0 — evidence collection is the critical path (pending=${_ev_pending} awaiting_merge=${_ev_merge})" 1146 fi 1147 else 1148 log "gather_evidence: no sites pending" 1149 fi 1150 fi 1151 1152 run_checked proofread opus "$PROOFREAD_BATCH" prompts/PROOFREAD.md 1153 run_checked followup_generate sonnet 20 prompts/FOLLOWUP.md 1154 run_checked classify_replies haiku "$CLASSIFY_BATCH" prompts/CLASSIFY-REPLIES.md 1155 run_checked reply_responses opus "$REPLIES_BATCH" prompts/REPLIES.md 1156 1157 # --- Time-gated batches --- 1158 run_checked_gated extract_names haiku "$NAMES_BATCH" 15 1159 run_checked_gated oversee sonnet 1 30 1160 run_checked_gated classify_errors haiku 1 240 1161 run_checked_gated monitor_health haiku 1 60 prompts/agents/MONITOR-HEALTH.md 1162 run_checked_gated triage_errors haiku 1 120 prompts/agents/TRIAGE-ERRORS.md 1163 run_checked_gated check_docs haiku 1 240 prompts/agents/CHECK-DOCS.md 1164 # evidence_merge: bypass time gate when sites are awaiting merge — run every cycle until clear. 1165 # Falls back to 15min gate when backlog is empty (avoids unnecessary idle runs). 1166 if [ "${_ev_merge:-0}" -gt 0 ] 2>/dev/null; then 1167 run_checked evidence_merge haiku 1 prompts/EVIDENCE-MERGE.md 1168 else 1169 run_checked_gated evidence_merge haiku 1 15 prompts/EVIDENCE-MERGE.md 1170 fi 1171 1172 # --- Code review (1 file per cycle, security-first order) --- 1173 # Gate: pause if pending code_review_fix tasks > 30 (backpressure — don't flood the fix queue) 1174 if [ -z "$SINGLE_TYPE" ] || [ "$SINGLE_TYPE" = "code_review" ]; then 1175 _pending_fixes=$(node -e " 1176 const Database = require('./node_modules/better-sqlite3'); 1177 const db = new Database(process.env.DATABASE_PATH || './db/sites.db', { readonly: true }); 1178 const r = db.prepare(\"SELECT COUNT(*) as c FROM agent_tasks WHERE task_type='code_review_fix' AND status IN ('pending','running')\").get(); 1179 process.stdout.write(String(r.c || 0)); 1180 db.close(); 1181 " 2>/dev/null || echo "0") 1182 if [ "${_pending_fixes:-0}" -gt 30 ]; then 1183 log "code_review: SKIP — pending_fixes=${_pending_fixes} > 30 (fix backlog too large)" 1184 else 1185 run_checked code_review sonnet 1 prompts/agents/CODE-REVIEW.md 1186 fi 1187 fi 1188 1189 [ "$had_work" = true ] 1190 } 1191 1192 # Main 1193 log "Starting orchestrator (loop=$LOOP, type=${SINGLE_TYPE:-all})" 1194 1195 if [ "$LOOP" = true ]; then 1196 while true; do 1197 if ! process_all_batches; then 1198 if [ "$CONSERVATION_MODE" = "true" ]; then 1199 # Queues appear empty for deferred tasks, but we're still in conservation mode. 1200 # Keep looping — time-critical tasks (replies, classify) may arrive. 1201 log "Queues empty in conservation mode — waiting 5min before retry..." 1202 sleep 300 1203 continue 1204 fi 1205 log "All queues empty — exiting loop" 1206 break 1207 fi 1208 # In conservation mode, pause between cycles to avoid hammering the quota check 1209 if [ "$CONSERVATION_MODE" = "true" ]; then 1210 log "Conservation mode: pausing 5min before next cycle..." 1211 sleep 300 1212 fi 1213 log "Batch complete, starting next cycle..." 1214 done 1215 else 1216 process_all_batches || log "No work to process" 1217 fi 1218 1219 log "Orchestrator finished"