claude-orchestrator.sh
#!/bin/sh
# Claude Orchestrator — Process batches using Claude Code (claude -p)
#
# Usage:
#   scripts/claude-orchestrator.sh                          # One batch per type, then exit
#   scripts/claude-orchestrator.sh --loop                   # Repeat until all queues empty
#   scripts/claude-orchestrator.sh --type proposals_email   # Single type only
#
# Requires: claude CLI (Claude Max subscription)
# Each batch runs as a separate `claude -p --model <model>` invocation:
#   - Fresh context per batch (no RAM accumulation)
#   - Loads only relevant prompt/context docs
#   - Output piped to claude-store.js for DB persistence

set -e

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
cd "$PROJECT_ROOT"

# Ensure node and claude are in PATH — NixOS systemd services may not inherit them.
# Must match the same Node.js version that compiled node_modules (nodejs_22).
if ! command -v node >/dev/null 2>&1; then
  for d in /nix/store/*-nodejs-22.*/bin; do
    if [ -x "$d/node" ]; then
      export PATH="$d:$PATH"
      break
    fi
  done
fi
# Resolve claude CLI — prefer the latest installed version directly to avoid
# stale symlinks after updates (e.g. ~/.local/bin/claude -> deleted version).
# Extracted to a function so --loop mode can re-resolve each cycle when
# Claude auto-updates delete the previously resolved version.
resolve_claude_bin() {
  _claude_bin=""
  if [ -d "$HOME/.local/share/claude/versions" ]; then
    # Pick the latest *executable binary* — skip wrapper scripts (<1MB) and broken files
    for _v in $(ls -1 "$HOME/.local/share/claude/versions" 2>/dev/null | sort -V -r); do
      _candidate="$HOME/.local/share/claude/versions/$_v"
      _sz=$(stat -c%s "$_candidate" 2>/dev/null || echo 0)
      if [ -x "$_candidate" ] && [ "$_sz" -gt 1000000 ]; then
        _claude_bin="$_candidate"
        break
      fi
    done
  fi
  if [ -z "$_claude_bin" ] || [ ! -x "$_claude_bin" ]; then
    # Fallback to whatever is on PATH (symlink or system install)
    _claude_bin=$(command -v claude 2>/dev/null || echo "")
  fi
  alias claude="$_claude_bin"
  export CLAUDE_BIN="$_claude_bin"
}
resolve_claude_bin

# Self-managed log file when running non-interactively (e.g. systemd)
if [ ! -t 1 ]; then
  mkdir -p "$PROJECT_ROOT/logs"
  LOG_FILE="$PROJECT_ROOT/logs/orchestrator-$(date +%Y-%m-%d).log"
  exec >> "$LOG_FILE" 2>&1
fi

# Log CLAUDE_BIN resolution after log redirect so it appears in the log file
echo "[$(date '+%Y-%m-%d %H:%M:%S')] [orchestrator] CLAUDE_BIN=$CLAUDE_BIN (executable: $([ -x "$CLAUDE_BIN" ] && echo yes || echo no))"

LOOP=false
SINGLE_TYPE=""

# ── Usage limit state ──────────────────────────────────────────────────────
# Claude Max has two windows: a 5-hour rolling window and a weekly cap.
# Utilization percentages are read from ~/.claude/usage-cache.json (written
# by the Claude Code TUI after polling api.anthropic.com/api/oauth/usage).
#
# Strategy:
#   1. PROACTIVE: Before each batch cycle, read usage-cache.json. If the 5h window
#      is ≥ WARNING_PCT or weekly is ≥ WEEKLY_WARNING_PCT, enter CONSERVATION
#      MODE before hitting the hard limit. This preserves tokens for critical work.
#   2. REACTIVE: If claude -p returns is_error:true or a limit keyword, enter
#      conservation mode immediately.
#
# Conservation mode defers Opus-heavy rewording/proposals; keeps:
#   - reply_responses (Opus) — time-critical inbound replies can't wait
#   - classify_replies / extract_names (Haiku) — cheap; extract_names gated to 15min intervals
#   - oversee (Sonnet) — system health, always run
#   - classify_errors (Haiku) — cheap, always run
#
# Once the window resets (resets_at timestamp passes), exits conservation mode.
CONSECUTIVE_LIMIT_HITS=0
LIMIT_HIT_SINCE=""
CONSERVATION_MODE=false
CONSERVATION_REASON=""
WARNING_PCT=85          # Enter conservation when 5h window ≥ 85% used
WEEKLY_WARNING_PCT=90   # Enter conservation when weekly window ≥ 90% used
USAGE_CACHE="${HOME}/.claude/usage-cache.json"
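# Illustrative cache shape (assumed — the real file is owned by the Claude Code TUI
# and may carry extra keys; field names below match what check_usage_proactive writes,
# values are made up):
#   {"five_hour":42,"seven_day":13,"five_hour_resets_at":"2025-01-01T17:00:00Z",
#    "seven_day_resets_at":"2025-01-06T00:00:00Z","source":"live","stale":false}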

while [ $# -gt 0 ]; do
  case "$1" in
    --loop) LOOP=true; shift ;;
    --type) SINGLE_TYPE="$2"; shift 2 ;;
    *) echo "Unknown arg: $1"; exit 1 ;;
  esac
done

# Batch sizes per type
PROPOSALS_EMAIL_BATCH=5
PROPOSALS_SMS_BATCH=10
CLASSIFY_BATCH=50
NAMES_BATCH=50
REPLIES_BATCH=10
PROOFREAD_BATCH=50
SCORE_SEMANTIC_BATCH=50
SCORE_SITES_BATCH=10
ENRICH_SITES_BATCH=15

# Frequency-gated batch state file (tracks last-run timestamps for periodic batch types)
GATE_FILE="$PROJECT_ROOT/logs/orchestrator-gates.json"
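# Illustrative gate-file shape (assumed — keys are whatever gate names get passed to
# mark_ran(), plus the empty_streak_<batch_type> counters maintained in run_batch();
# the example entries below are hypothetical):
#   {"tier_c_seo_audit":"2025-01-01T03:00:00.000Z","empty_streak_proofread":2}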

# ── Canary check (runs once per cycle) ───────────────────────────────────────
# Verify external tools can actually execute before attempting any batches.
# Catches: missing binary, wrong architecture (glibc on NixOS), broken PATH,
# missing node_modules, etc. Fails loudly so monitoring picks it up immediately
# rather than silently returning empty results for every batch.
# Called from process_all_batches() so it runs once per cycle in --loop mode too.
run_canary() {
  _canary_ok=true

  # 1. node must be able to run a basic script
  if ! node -e "process.exit(0)" 2>/dev/null; then
    log "FATAL: node is not executable — $(node --version 2>&1 | head -1). All batches will fail."
    _canary_ok=false
  fi

  # 2. claude-batch.js must be loadable (catches missing node_modules, syntax errors)
  if $_canary_ok && ! node scripts/claude-batch.js --canary 2>/dev/null | grep -q "ok"; then
    # Tolerate if --canary flag isn't implemented — just check node can load it without crashing
    node scripts/claude-batch.js --canary 2>/tmp/orch-canary-err.txt || true
    _canary_err=$(cat /tmp/orch-canary-err.txt 2>/dev/null | head -1)
    if echo "$_canary_err" | grep -qiE "cannot find module|SyntaxError|ERR_"; then
      log "FATAL: claude-batch.js failed to load: $_canary_err. All batches will fail."
      _canary_ok=false
    fi
  fi

  # 3. CLAUDE_BIN must be executable (catches glibc/nix-ld issues, broken symlinks, wrapper loops)
  if $_canary_ok && [ -n "$CLAUDE_BIN" ]; then
    _canary_out=$(timeout 10s env -u CLAUDECODE "$CLAUDE_BIN" --version 2>/tmp/orch-canary-err.txt || true)
    _canary_err=$(cat /tmp/orch-canary-err.txt 2>/dev/null | head -1)
    if [ -z "$_canary_out" ]; then
      # Re-resolve — Claude may have auto-updated, deleting the old version
      log "WARN: CLAUDE_BIN=$CLAUDE_BIN failed (${_canary_err:-no output}). Re-resolving..."
      resolve_claude_bin
      if [ -n "$CLAUDE_BIN" ]; then
        _canary_out=$(timeout 10s env -u CLAUDECODE "$CLAUDE_BIN" --version 2>/tmp/orch-canary-err.txt || true)
        _canary_err=$(cat /tmp/orch-canary-err.txt 2>/dev/null | head -1)
      fi
      if [ -z "$_canary_out" ]; then
        log "FATAL: CLAUDE_BIN=$CLAUDE_BIN still failed after re-resolve: ${_canary_err:-no output}. All LLM batches will return empty."
        _canary_ok=false
      else
        log "Canary OK after re-resolve: node=$(node --version 2>/dev/null), claude=$_canary_out"
      fi
    else
      log "Canary OK: node=$(node --version 2>/dev/null), claude=$_canary_out"
    fi
  elif $_canary_ok; then
    # CLAUDE_BIN empty — try resolving (may have been installed since startup)
    resolve_claude_bin
    if [ -n "$CLAUDE_BIN" ]; then
      log "Resolved CLAUDE_BIN=$CLAUDE_BIN after initially empty"
    else
      log "WARN: CLAUDE_BIN is empty — LLM batches will fail. Check PATH and ~/.local/share/claude/versions/."
      _canary_ok=false
    fi
  fi
  rm -f /tmp/orch-canary-err.txt
}

# Effort level per batch type — matches Agency Agents plan.
# Controls how much reasoning the model uses. Saves tokens on simple tasks.
# Levels: low (classification/extraction), medium (analysis), high (reasoning/creative), max (extended thinking)
effort_for_batch() {
  case "$1" in
    proposals_email)    echo "high" ;;
    proposals_sms)      echo "medium" ;;
    classify_replies)   echo "medium" ;;
    extract_names)      echo "low" ;;
    reply_responses)    echo "high" ;;
    proofread)          echo "medium" ;;
    score_sites)        echo "medium" ;;
    score_semantic)     echo "medium" ;;
    enrich_sites)       echo "low" ;;
    oversee)            echo "medium" ;;
    classify_errors)    echo "low" ;;
    code_review)        echo "medium" ;;
    monitor_health)     echo "low" ;;
    triage_errors)      echo "medium" ;;
    check_docs)         echo "low" ;;
    evidence_merge)     echo "low" ;;
    followup_generate)  echo "medium" ;;
    *)                  echo "medium" ;;
  esac
}

# Check if a frequency-gated batch type is due to run.
# Args: gate_type interval_minutes
# Returns 0 (run it) or 1 (skip — not due yet)
is_due() {
  gate_type="$1"
  interval_mins="$2"
  if [ ! -f "$GATE_FILE" ]; then return 0; fi
  elapsed=$(node -e "
    try {
      const d = JSON.parse(require('fs').readFileSync('$GATE_FILE','utf8'));
      const ts = d['$gate_type'];
      process.stdout.write(ts ? String(Date.now() - new Date(ts).getTime()) : '0');
    } catch { process.stdout.write('0'); }
  " 2>/dev/null)
  [ -z "$elapsed" ] && elapsed=0
  threshold=$((interval_mins * 60 * 1000))
  [ "$elapsed" -ge "$threshold" ] 2>/dev/null || [ "$elapsed" = "0" ]
}

# Record that a gated batch type just ran
mark_ran() {
  gate_type="$1"
  node -e "
    const fs = require('fs');
    let d = {};
    try { d = JSON.parse(fs.readFileSync('$GATE_FILE','utf8')); } catch {}
    d['$gate_type'] = new Date().toISOString();
    fs.writeFileSync('$GATE_FILE', JSON.stringify(d, null, 2));
  " 2>/dev/null
}

log() {
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] [orchestrator] $*"
}

# Proactive: fetch live usage from the Anthropic API, fall back to the cache.
# Reads the OAuth token from ~/.claude/.credentials.json (Linux credential store).
# Called once per batch cycle (before processing batches).
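# The credentials file is only expected to expose the one path read below — roughly
# (sketch; everything except claudeAiOauth.accessToken is ignored here):
#   {"claudeAiOauth":{"accessToken":"<oauth token>", ...}}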
check_usage_proactive() {
  CREDS_FILE="${HOME}/.claude/.credentials.json"

  # Try live fetch first (updates cache for TUI too)
  usage=$(node -e "
    const fs = require('fs');
    const https = require('https');
    let token = '';
    try {
      const creds = JSON.parse(fs.readFileSync('$CREDS_FILE','utf8'));
      token = creds.claudeAiOauth?.accessToken || '';
    } catch {}

    if (!token) {
      // Fall back to cache
      try {
        const d = JSON.parse(fs.readFileSync('$USAGE_CACHE','utf8'));
        process.stdout.write(JSON.stringify({...d, source:'cache_no_token'}));
      } catch { process.stdout.write('{\"five_hour\":0,\"seven_day\":0,\"source\":\"none\"}'); }
      return;
    }

    const req = https.request({
      hostname: 'api.anthropic.com',
      path: '/api/oauth/usage',
      method: 'GET',
      headers: {
        'Authorization': 'Bearer ' + token,
        'anthropic-beta': 'oauth-2025-04-20',
        'User-Agent': 'claude-orchestrator/1.0'
      },
      timeout: 8000
    }, (res) => {
      let body = '';
      res.on('data', c => body += c);
      res.on('end', () => {
        try {
          const raw = JSON.parse(body);
          const result = {
            five_hour: Math.round(raw.five_hour?.utilization || 0),
            seven_day: Math.round(raw.seven_day?.utilization || 0),
            five_hour_resets_at: raw.five_hour?.resets_at || '',
            seven_day_resets_at: raw.seven_day?.resets_at || '',
            source: 'live'
          };
          // Update cache file for TUI
          try { fs.writeFileSync('$USAGE_CACHE', JSON.stringify({...result, stale: false})); } catch {}
          process.stdout.write(JSON.stringify(result));
        } catch { process.stdout.write('{\"five_hour\":0,\"seven_day\":0,\"source\":\"parse_error\"}'); }
      });
    });
    req.on('error', () => {
      // Fall back to cache on network error
      try {
        const d = JSON.parse(fs.readFileSync('$USAGE_CACHE','utf8'));
        process.stdout.write(JSON.stringify({...d, source:'cache_fallback'}));
      } catch { process.stdout.write('{\"five_hour\":0,\"seven_day\":0,\"source\":\"error\"}'); }
    });
    req.on('timeout', () => { req.destroy(); });
    req.end();
  " 2>/dev/null || node -e "try{const d=JSON.parse(require('fs').readFileSync('$USAGE_CACHE','utf8'));d.source='node_error_cache';process.stdout.write(JSON.stringify(d));}catch{process.stdout.write('{\"five_hour\":0,\"seven_day\":0,\"source\":\"node_error\"}');}" 2>/dev/null || echo '{"five_hour":0,"seven_day":0,"source":"node_error"}')

  five_hour=$(echo "$usage" | node -e "let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{try{process.stdout.write(String(Math.round(JSON.parse(d).five_hour||0)));}catch{process.stdout.write('0');}})" 2>/dev/null || echo "0")
  seven_day=$(echo "$usage" | node -e "let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{try{process.stdout.write(String(Math.round(JSON.parse(d).seven_day||0)));}catch{process.stdout.write('0');}})" 2>/dev/null || echo "0")
  five_resets=$(echo "$usage" | node -e "let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{try{process.stdout.write(JSON.parse(d).five_hour_resets_at||'');}catch{process.stdout.write('');}})" 2>/dev/null || echo "")

  source=$(echo "$usage" | node -e "let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{try{process.stdout.write(JSON.parse(d).source||'?');}catch{process.stdout.write('?');}})" 2>/dev/null || echo "?")
  log "Usage [${source}]: 5h=${five_hour}% weekly=${seven_day}% (warn at ${WARNING_PCT}%/${WEEKLY_WARNING_PCT}%)"

  # Check if the 5h window has reset since we entered conservation
  if [ "$CONSERVATION_MODE" = "true" ] && [ -n "$five_resets" ] && [ -n "$LIMIT_HIT_SINCE" ]; then
    resets_epoch=$(date -d "$five_resets" +%s 2>/dev/null || echo "0")
    hit_epoch=$(date -d "$LIMIT_HIT_SINCE" +%s 2>/dev/null || echo "0")
    if [ "$resets_epoch" -gt 0 ] && [ "$resets_epoch" -gt "$hit_epoch" ]; then
      CONSERVATION_MODE=false
      CONSECUTIVE_LIMIT_HITS=0
      LIMIT_HIT_SINCE=""
      CONSERVATION_REASON=""
      node -e "const { clearRateLimit } = require('$PROJECT_ROOT/src/utils/rate-limit-scheduler.js'); clearRateLimit('claude_cli');" 2>/dev/null || true
      log "CONSERVATION MODE OFF — 5h window reset at $five_resets. Usage now ${five_hour}%. Resuming full operation."
      return
    fi
    # Also exit if usage has dropped back below warning threshold
    if [ "$five_hour" -lt "$((WARNING_PCT - 10))" ] && [ "$seven_day" -lt "$((WEEKLY_WARNING_PCT - 10))" ]; then
      CONSERVATION_MODE=false
      CONSECUTIVE_LIMIT_HITS=0
      LIMIT_HIT_SINCE=""
      CONSERVATION_REASON=""
      node -e "const { clearRateLimit } = require('$PROJECT_ROOT/src/utils/rate-limit-scheduler.js'); clearRateLimit('claude_cli');" 2>/dev/null || true
      log "CONSERVATION MODE OFF — usage recovered (5h=${five_hour}% weekly=${seven_day}%). Resuming full operation."
      return
    fi
  fi

  # Enter conservation proactively
  if [ "$CONSERVATION_MODE" = "false" ]; then
    if [ "$five_hour" -ge "$WARNING_PCT" ]; then
      CONSERVATION_MODE=true
      LIMIT_HIT_SINCE=$(date '+%Y-%m-%d %H:%M:%S')
      CONSERVATION_REASON="5h=${five_hour}% (>=${WARNING_PCT}%)"
      log "CONSERVATION MODE ON (proactive) — 5h window at ${five_hour}%. Deferring Opus rewording/proposals. Keeping replies+Haiku. Resets: $five_resets"
    elif [ "$seven_day" -ge "$WEEKLY_WARNING_PCT" ]; then
      CONSERVATION_MODE=true
      LIMIT_HIT_SINCE=$(date '+%Y-%m-%d %H:%M:%S')
      CONSERVATION_REASON="weekly=${seven_day}% (>=${WEEKLY_WARNING_PCT}%)"
      log "CONSERVATION MODE ON (proactive) — weekly window at ${seven_day}%. Deferring Opus rewording/proposals. Resets: $(echo "$usage" | node -e "let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{try{process.stdout.write(JSON.parse(d).seven_day_resets_at||'');}catch{process.stdout.write('');}});")"
    fi
  fi
}

# Reactive: detect hard limit errors from the claude -p envelope.
# Args: raw_result (full JSON envelope string), batch_type
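# A limit hit typically surfaces as an envelope roughly like the sketch below (assumed —
# exact wording varies; detection only trusts is_error plus the keyword regex inside):
#   {"is_error":true,"result":"Claude Max usage limit reached ...","stop_reason":"..."}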
check_limit_signal() {
  raw="$1"
  btype="$2"

  [ -z "$raw" ] && return

  is_limit=$(echo "$raw" | node -e "
    let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{
      try {
        const env = JSON.parse(d);
        // Only scan envelope-level metadata for limit signals — NOT env.result (LLM content).
        // LLM-generated text (proposals, emails, rewrites) may mention \"rate limit\" etc.
        const meta = [
          env.is_error ? 'is_error' : '',
          typeof env.stop_reason === 'string' ? env.stop_reason : '',
          typeof env.error === 'string' ? env.error : '',
          typeof env.error?.message === 'string' ? env.error.message : '',
          env.is_error && typeof env.result === 'string' && env.result.length < 500 ? env.result : '',
        ].join(' ').toLowerCase();
        const hasKeyword = env.is_error && /rate.?limit|usage.?limit|claude.?max|overloaded|quota|529|too.?many.?request/.test(meta);
        process.stdout.write(hasKeyword ? '1' : '0');
      } catch { process.stdout.write('0'); }
    });
  " 2>/dev/null || echo "0")

  if [ "$is_limit" = "1" ]; then
    CONSECUTIVE_LIMIT_HITS=$((CONSECUTIVE_LIMIT_HITS + 1))
    [ -z "$LIMIT_HIT_SINCE" ] && LIMIT_HIT_SINCE=$(date '+%Y-%m-%d %H:%M:%S')
    CONSERVATION_REASON="hard limit on $btype"
    if [ "$CONSERVATION_MODE" = "false" ]; then
      CONSERVATION_MODE=true
      log "CONSERVATION MODE ON (reactive) — hard limit on $btype (hit #$CONSECUTIVE_LIMIT_HITS). Deferring Opus tasks."
    else
      log "Usage limit still active on $btype (hit #$CONSECUTIVE_LIMIT_HITS since $LIMIT_HIT_SINCE)"
    fi

    # Persist to rate-limits.json so the state survives orchestrator restarts
    # and is visible to should_skip_for_rate_limits() on next cycle.
    # Uses 'hourly' reset window (1h backoff) — conservation mode recovery
    # logic may clear it sooner if usage drops below threshold.
    node -e "
      const { setRateLimit } = require('$PROJECT_ROOT/src/utils/rate-limit-scheduler.js');
      setRateLimit('claude_cli', {
        limitType: 'hourly',
        reason: 'Claude Max hard limit on $btype (hit #$CONSECUTIVE_LIMIT_HITS)',
      });
    " 2>/dev/null || log "WARN: Failed to persist claude_cli rate limit to rate-limits.json"
  fi
}

# Query pipeline backlog counts for intelligent batch skipping.
# All counts are ACTIONABLE — filtered to only items that can reach a sent outreach.
# The core filter is ACTIVE_COUNTRY_CODES = all countries NOT in OUTREACH_BLOCKED_COUNTRIES.
# This filter is inherited by every gate so that blocking a country upstream stops all work
# for that country at every stage (scoring, enriching, proposals, reword).
#
# Sets global vars:
#   BACKLOG_ELIGIBLE_OUTREACH — approved email/sms, delivery_status IS NULL, past 3d cooldown, gdpr_verified where required
#   BACKLOG_PROPOSALS         — proposals_drafted sites with unsent email/sms in active countries
#   BACKLOG_ENRICHED          — enriched sites in active countries, score < LOW_SCORE_CUTOFF
#   BACKLOG_ACTIVE_COUNTRIES  — count of active (non-blocked) country codes in pipeline
#   BACKLOG_APPROVED_UNSENT   — all approved+unsent (any channel, for display only)
#   BACKLOG_PENDING_APPROVAL  — pending approval count (display only)
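# Because the output of refresh-backlog.js is eval'd below, it is expected to print plain
# shell assignments, roughly like this (variable names from the list above plus the
# canary counts; numbers are illustrative only):
#   BACKLOG_ELIGIBLE_OUTREACH=120
#   BACKLOG_PROPOSALS=340
#   BACKLOG_ENRICHED=2100
#   BACKLOG_ACTIVE_COUNTRIES=14
#   BACKLOG_APPROVED_UNSENT=450
#   BACKLOG_PENDING_APPROVAL=75
#   BACKLOG_DAILY_SEND_AVG=80
#   CANARY_SITE_COUNT=43210
#   CANARY_PREVIOUS_COUNT=43180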
refresh_backlog() {
  # Read OUTREACH_SKIP_METHODS, OUTREACH_BLOCKED_COUNTRIES, and SMS-blocked countries from .env
  _skip_methods=$(grep '^OUTREACH_SKIP_METHODS=' "$PROJECT_ROOT/.env" 2>/dev/null | tail -1 | cut -d= -f2- | sed 's/#.*//' | tr -d '"' | tr -d "'" | tr '[:upper:]' '[:lower:]' | tr -d ' ')
  _blocked_countries=$(grep '^OUTREACH_BLOCKED_COUNTRIES=' "$PROJECT_ROOT/.env" 2>/dev/null | tail -1 | cut -d= -f2- | sed 's/#.*//' | tr -d '"' | tr -d "'" | tr '[:upper:]' '[:lower:]' | tr -d ' ')
  _sms_blocked_countries=$(grep '^OUTREACH_BLOCKED_SMS_COUNTRIES=' "$PROJECT_ROOT/.env" 2>/dev/null | tail -1 | cut -d= -f2- | sed 's/#.*//' | tr -d '"' | tr -d "'" | tr '[:upper:]' '[:lower:]' | tr -d ' ')

  eval "$(node "$PROJECT_ROOT/scripts/refresh-backlog.js" \
    --skip-methods="$_skip_methods" \
    --blocked-countries="$_blocked_countries" \
    --sms-blocked-countries="$_sms_blocked_countries" \
    --low-score-cutoff="$LOW_SCORE_CUTOFF" \
    --project-root="$PROJECT_ROOT" 2>/dev/null)"

  # ── Data-loss canary: halt if site count dropped >50% ──
  if [ "${CANARY_PREVIOUS_COUNT:-0}" -gt 100 ] && [ "${CANARY_SITE_COUNT:-0}" -gt 0 ]; then
    _drop_pct=$(( (CANARY_PREVIOUS_COUNT - CANARY_SITE_COUNT) * 100 / CANARY_PREVIOUS_COUNT ))
    if [ "$_drop_pct" -gt 50 ]; then
      log "CRITICAL: Site count dropped ${_drop_pct}% (${CANARY_PREVIOUS_COUNT} → ${CANARY_SITE_COUNT}) — HALTING ORCHESTRATOR"
      log "ACTION REQUIRED: Check PostgreSQL mmo database integrity. Restore from backup if needed."
      log "To resume: rm logs/site-count-canary.json && restart orchestrator"
      exit 1
    fi
  elif [ "${CANARY_PREVIOUS_COUNT:-0}" -gt 100 ] && [ "${CANARY_SITE_COUNT:-0}" -eq 0 ]; then
    log "CRITICAL: Site count is ZERO (was ${CANARY_PREVIOUS_COUNT}) — HALTING ORCHESTRATOR"
    log "ACTION REQUIRED: Database has been wiped. Restore from backup immediately."
    log "To resume: rm logs/site-count-canary.json && restart orchestrator"
    exit 1
  fi
}

# Load STAGE_THROTTLE_MULTIPLIER and LOW_SCORE_CUTOFF from .env (defaults: 3 and 82).
# Read once at startup — restart orchestrator to pick up changes.
STAGE_THROTTLE_MULTIPLIER=$(grep '^STAGE_THROTTLE_MULTIPLIER=' "$PROJECT_ROOT/.env" 2>/dev/null | tail -1 | cut -d= -f2- | sed 's/#.*//' | tr -d '"' | tr -d "'" | tr -d ' ')
STAGE_THROTTLE_MULTIPLIER=${STAGE_THROTTLE_MULTIPLIER:-3}
LOW_SCORE_CUTOFF=$(grep '^LOW_SCORE_CUTOFF=' "$PROJECT_ROOT/.env" 2>/dev/null | tail -1 | cut -d= -f2- | sed 's/#.*//' | tr -d '"' | tr -d "'" | tr -d ' ')
LOW_SCORE_CUTOFF=${LOW_SCORE_CUTOFF:-82}

# Load SKIP_STAGES from .env on every loop iteration so changes take effect without restart.
# Always reads .env — edit the file to change skip behaviour live.
load_skip_stages() {
  if [ -f "$PROJECT_ROOT/.env" ]; then
    val=$(grep '^SKIP_STAGES=' "$PROJECT_ROOT/.env" 2>/dev/null | tail -1 | cut -d= -f2- | sed 's/#.*//' | tr -d '"' | tr -d "'")
    SKIP_STAGES="$val"
  fi
  # Normalise: lowercase, strip spaces
  SKIP_STAGES=$(echo "${SKIP_STAGES:-}" | tr '[:upper:]' '[:lower:]' | tr -d ' ')
}

# Check if a batch type is blocked by the pipeline's rate-limit-scheduler state.
# Reads logs/rate-limits.json (written by circuit-breaker.js when an API quota/rate-limit hits).
# This ensures the orchestrator respects the same automatic backoffs as the pipeline —
# e.g. if ZenRows hits its daily quota and blocks 'serps', both systems honour that pause.
# Returns 0 = skip (rate-limited), 1 = run
should_skip_for_rate_limits() {
  batch_type="$1"
  rate_limits_file="$PROJECT_ROOT/logs/rate-limits.json"
  [ -f "$rate_limits_file" ] || return 1  # No file = no active rate limits

  # Map batch_type → stage name to check in rate-limits.json
  case "$batch_type" in
    proposals_email|proposals_sms) stage="proposals" ;;
    reply_responses)               stage="outreach" ;;
    classify_replies)              stage="replies" ;;
    score_semantic|score_sites)    stage="scoring" ;;
    enrich_sites)                  stage="enrich" ;;
    followup_generate)             stage="outreach" ;;
    *)                             stage="" ;;
  esac

  now_ms=$(date +%s%3N 2>/dev/null || echo "0")

  # Check each entry: if resetAt > now and stages includes our stage, skip.
  # Also check 'claude_cli' — when Claude Max credits are exhausted, ALL orchestrator
  # batches are blocked regardless of stage mapping (they all use claude -p).
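  # Illustrative rate-limits.json entries (assumed from the fields read below — resetAt is
  # epoch ms; the 'zenrows'/'serps' entry is a hypothetical example of a stage-scoped limit):
  #   {"claude_cli":{"resetAt":1735689600000,"reason":"Claude Max hard limit on proposals_email (hit #1)"},
  #    "zenrows":{"stages":["serps"],"resetAt":1735693200000,"reason":"daily quota"}}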
  matched=$(node -e "
    try {
      const data = JSON.parse(require('fs').readFileSync('$rate_limits_file', 'utf8'));
      const stage = '$stage';
      const now = $now_ms;

      // Global claude_cli gate — blocks ALL orchestrator batches
      if (data.claude_cli && Number(data.claude_cli.resetAt) > now) {
        const resetAt = new Date(Number(data.claude_cli.resetAt)).toISOString();
        process.stdout.write('claude_cli:' + resetAt + ':' + (data.claude_cli.reason || ''));
        process.exit(0);
      }

      // Stage-specific gate (e.g. openrouter scoring, zenrows serps)
      if (stage) {
        for (const [api, info] of Object.entries(data)) {
          if (Array.isArray(info.stages) && info.stages.includes(stage) && Number(info.resetAt) > now) {
            const resetAt = new Date(Number(info.resetAt)).toISOString();
            process.stdout.write(api + ':' + resetAt + ':' + (info.reason || ''));
            process.exit(0);
          }
        }
      }
    } catch(e) {}
  " 2>/dev/null)

  if [ -n "$matched" ]; then
    api=$(echo "$matched" | cut -d: -f1)
    reset_at=$(echo "$matched" | cut -d: -f2)
    reason=$(echo "$matched" | cut -d: -f3-)
    log "$batch_type: SKIP (rate-limited by $api until $reset_at — $reason)"
    return 0
  fi
  return 1
}

# Check if a batch type is blocked by SKIP_STAGES.
# Returns 0 = skip (stage paused), 1 = run
should_skip_for_stage_flag() {
  batch_type="$1"
  [ -z "$SKIP_STAGES" ] && return 1  # Nothing skipped

  # Map batch types to pipeline stage names
  case "$batch_type" in
    proposals_email|proposals_sms)
      echo "$SKIP_STAGES" | grep -q 'proposals' && {
        log "$batch_type: SKIP (SKIP_STAGES=proposals)"; return 0; } ;;
    reply_responses)
      echo "$SKIP_STAGES" | grep -q 'outreach\|replies\|reply' && {
        log "$batch_type: SKIP (SKIP_STAGES includes outreach/replies)"; return 0; } ;;
    classify_replies)
      echo "$SKIP_STAGES" | grep -q 'replies\|classify' && {
        log "$batch_type: SKIP (SKIP_STAGES=replies/classify)"; return 0; } ;;
    extract_names)
      echo "$SKIP_STAGES" | grep -q 'extract_names\|extract' && {
        log "$batch_type: SKIP (SKIP_STAGES=extract_names)"; return 0; } ;;
    proofread)
      echo "$SKIP_STAGES" | grep -q 'proofread' && {
        log "$batch_type: SKIP (SKIP_STAGES=proofread)"; return 0; } ;;
    score_semantic|score_sites)
      echo "$SKIP_STAGES" | grep -q 'scoring' && {
        log "$batch_type: SKIP (SKIP_STAGES=scoring)"; return 0; } ;;
    enrich_sites)
      echo "$SKIP_STAGES" | grep -q 'enrich' && {
        log "$batch_type: SKIP (SKIP_STAGES=enrich)"; return 0; } ;;
    followup_generate)
      echo "$SKIP_STAGES" | grep -q 'followup' && {
        log "$batch_type: SKIP (SKIP_STAGES=followup)"; return 0; } ;;
  esac
  return 1  # Not skipped
}

# Check if a batch type should be skipped because its downstream queue is already saturated.
# Uses STAGE_THROTTLE_MULTIPLIER (default 3) × the relevant downstream throughput as the threshold.
#
# Stage chain (each gate pauses all earlier stages):
#   enriched → [proposals_*] → proposals_drafted → [proofread] → approved outreach
#
# Gate 1 (outreach gate):  if eligible outreach > 3× rolling 7-day avg daily send rate
#                          (floor: 3× proofread_batch, so the gate isn't too permissive
#                          when historical send rate is very low or zero)
#                          pause: proofread, proposals_*, enrich_sites, score_semantic
# Gate 2 (proposals gate): if proposals_drafted > 3× proofread_batch (proofread is the bottleneck)
#                          pause: proposals_*, enrich_sites
# Gate 3 (enrich gate):    if enriched > 3× 5× (proposals_email_batch + proposals_sms_batch)
#                          pause: enrich_sites, score_semantic
#
# Exception: serps is never paused (ZenRows daily quota must be used).
# Returns 0 = skip, 1 = run
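# Worked example with the defaults above (multiplier 3, PROOFREAD_BATCH=50, email/sms batch
# sizes 5/10) and an assumed 7-day send average of 80/day:
#   Gate 1 threshold = max(3×80, 3×50) = 240 eligible outreach
#   Gate 2 threshold = 3×50            = 150 drafted proposals
#   Gate 3 threshold = (5+10)×3×5      = 225 enriched sites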
should_skip_for_backlog() {
  batch_type="$1"

  # Gate 1: outreach queue saturated — pause everything feeding into it.
  # Use 3× the rolling 7-day daily send average as the threshold, with a floor
  # of 3× PROOFREAD_BATCH to prevent the gate from being too permissive when
  # the send rate is very low (e.g. pipeline just started, GDPR blocking, etc).
  _static_floor=$((PROOFREAD_BATCH * STAGE_THROTTLE_MULTIPLIER))
  _send_rate_threshold=$((${BACKLOG_DAILY_SEND_AVG:-0} * STAGE_THROTTLE_MULTIPLIER))
  _outreach_threshold=$(( _send_rate_threshold > _static_floor ? _send_rate_threshold : _static_floor ))

  case "$batch_type" in
    proofread|proposals_email|proposals_sms|enrich_sites|score_semantic)
      if [ "${BACKLOG_ELIGIBLE_OUTREACH:-0}" -gt "$_outreach_threshold" ]; then
        log "$batch_type: SKIP — eligible_outreach=${BACKLOG_ELIGIBLE_OUTREACH} > threshold=${_outreach_threshold} (${STAGE_THROTTLE_MULTIPLIER}×max(daily_avg=${BACKLOG_DAILY_SEND_AVG:-0},floor=${PROOFREAD_BATCH}))"
        return 0
      fi
      ;;
  esac

  # Gate 2: proposals_drafted queue saturated — pause proposal generation and earlier stages.
  # Threshold based on PROOFREAD throughput (the bottleneck), not proposal generation batch size.
  # Proofread clears PROOFREAD_BATCH per cycle; allow 3× that buffer before pausing upstream.
  _proposals_threshold=$(( PROOFREAD_BATCH * STAGE_THROTTLE_MULTIPLIER ))

  case "$batch_type" in
    proposals_email|proposals_sms|enrich_sites)
      if [ "${BACKLOG_PROPOSALS:-0}" -gt "$_proposals_threshold" ]; then
        log "$batch_type: SKIP — proposals_drafted=${BACKLOG_PROPOSALS} > threshold=${_proposals_threshold} (${STAGE_THROTTLE_MULTIPLIER}×${PROOFREAD_BATCH})"
        return 0
      fi
      ;;
  esac

  # Gate 3: enriched queue saturated — pause enrich and scoring.
  # Threshold based on proposals throughput (PROPOSALS_EMAIL + PROPOSALS_SMS per cycle),
  # not enrich batch size. Enriched sites are consumed by proposal generation.
  _proposals_batch_max=$(( PROPOSALS_EMAIL_BATCH + PROPOSALS_SMS_BATCH ))
  _enrich_threshold=$(( _proposals_batch_max * STAGE_THROTTLE_MULTIPLIER * 5 ))

  case "$batch_type" in
    enrich_sites|score_semantic)
      if [ "${BACKLOG_ENRICHED:-0}" -gt "$_enrich_threshold" ]; then
        log "$batch_type: SKIP — enriched=${BACKLOG_ENRICHED} > threshold=${_enrich_threshold} (${STAGE_THROTTLE_MULTIPLIER}×5×${_proposals_batch_max})"
        return 0
      fi
      ;;
  esac

  return 1  # Don't skip — run normally
}

# Run a single batch type
# Args: batch_type model batch_size context_files...
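# Example call (mirrors the proposals_email wiring in process_all_batches below, with the
# default batch size of 5 written out):
#   run_batch proposals_email opus 5 prompts/PROPOSAL.md docs/05-outreach/email-best-practices.md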
run_batch() {
  batch_type="$1"
  model="$2"
  batch_size="$3"
  shift 3
  # Remaining args are context file paths

  log "Pulling $batch_type batch (limit=$batch_size)..."
  batch_json=$(node scripts/claude-batch.js "$batch_type" "$batch_size" 2>/dev/null || echo '{"count":0,"items":[]}')

  count=$(echo "$batch_json" | node -e "
    let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{
      try { process.stdout.write(String(JSON.parse(d).count||0)); }
      catch { process.stdout.write('0'); }
    });
  " 2>/dev/null || echo "0")
  [ -z "$count" ] && count=0

  if [ "$count" = "0" ]; then
    log "$batch_type: no items to process"
    return 1  # Signal "empty" to caller
  fi

  _effort_log=$(effort_for_batch "$batch_type")
  log "$batch_type: processing $count items with $model (effort=$_effort_log)..."

  # Build the prompt — include context files and the batch data.
  # Inject brand/persona env vars into prompt templates (bash expansion, safe with special chars).
  context=""
  for f in "$@"; do
    if [ -f "$f" ]; then
      _file_content=$(cat "$f")
      _file_content="${_file_content//\[BRAND_NAME\]/${BRAND_NAME:-the brand}}"
      _file_content="${_file_content//BRAND_DOMAIN/${BRAND_DOMAIN:-the site}}"
      _file_content="${_file_content//BRAND_URL/${BRAND_URL:-the site}}"
      context="$context

--- FILE: $f ---
${_file_content}
--- END FILE ---"
    fi
  done

  # Build task-specific instructions
  case "$batch_type" in
    proposals_email)
      task_prompt="Generate email proposals for each site below. Follow PROPOSAL.md and the email best practices. Output JSON: {\"batch_type\":\"proposals_email\",\"results\":[{\"site_id\":N,\"contact_method\":\"email\",\"contact_uri\":\"...\",\"country_code\":\"XX\",\"message_body\":\"...\",\"subject_line\":\"...\"}]}"
      ;;
    proposals_sms)
      task_prompt="Generate SMS proposals (<160 chars) for each site below. Follow PROPOSAL.md and SMS best practices. TCPA opt-out required for US/CA only. Output JSON: {\"batch_type\":\"proposals_sms\",\"results\":[{\"site_id\":N,\"contact_method\":\"sms\",\"contact_uri\":\"...\",\"country_code\":\"XX\",\"message_body\":\"...\"}]}"
      ;;
    classify_replies)
      task_prompt="$(cat prompts/CLASSIFY-REPLIES.md)"
      ;;
    extract_names)
      task_prompt="Extract the person's first name from these email addresses and domains. If no name can be inferred, leave null. Output JSON: {\"batch_type\":\"extract_names\",\"results\":[{\"site_id\":N,\"contacts\":[{\"uri\":\"...\",\"name\":\"FirstName\"}]}]}"
      ;;
    reply_responses)
      task_prompt="Follow the reply guidelines in prompts/REPLIES.md. Respond to every inbound in the batch. Output ONLY valid JSON — no markdown, no explanation."
      ;;
    oversee)
      task_prompt="You are the senior overseer of an automated lead-generation pipeline (333 Method). Analyze the system health snapshot and decide what corrective actions to take. The structured JSON is the authoritative source of truth — previous_overseer_findings is historical context only.
KNOWN SELF-HEALING BEHAVIOURS (do NOT escalate): cron_timer_dead is auto-fixed by processGuardian every minute. 1-minute cron gaps are expected.
COST MODEL (do NOT flag as expensive): All LLM batches run via \`claude -p --model <name>\` which uses the Claude Max subscription — zero marginal cost per call regardless of model (opus/sonnet/haiku). Do NOT flag opus or sonnet model usage as a cost issue. Only OpenRouter direct-API calls (gather_evidence, scoring, replies) count against the \$200/day LLM_DAILY_BUDGET.
VALID PIPELINE STATUSES (ALL are intentional — do NOT flag as invalid): found, assets_captured, prog_scored, semantic_scored, vision_scored, enriched_regex, enriched_llm, enriched, proposals_drafted, outreach_partial, outreach_sent, rescored, skipped, error. enriched_regex is an intermediate enrichment step (browser pass complete, LLM pass pending) — it is expected and normal.
Available actions: RESTART_PIPELINE (if stopped/crash-looping), CLEAR_STALE_TASKS (params: min_age_hours), RESET_STUCK_SITES (params: stage, min_stuck_hours ≥ 4 — never lower), LOG_ONLY (params: note). Do NOT reset a stage when its target already has a large backlog (found>5000: don't reset assets_captured or scored; rescored>500: don't reset enriched or rescored). Sites with recapture_count≥2 are auto-excluded from resets. NEVER reset 'enriched' stage — enriched is a deliberate evidence-collection queue (gather_evidence cron picks sites up every 5 min); large enriched backlogs are normal and expected.
Output JSON: {\"batch_type\":\"oversee\",\"results\":[{\"summary\":\"1-2 sentences\",\"severity\":\"ok|warn|critical\",\"findings\":[{\"issue\":\"...\",\"severity\":\"low|medium|high\",\"stage\":\"pipeline|agents|cron|sites\"}],\"actions\":[{\"code\":\"ACTION_CODE\",\"description\":\"...\",\"params\":{}}]}]}"
      ;;
    classify_errors)
      task_prompt="Analyze these pipeline error messages that don't match any existing patterns. Propose regex patterns for categorization using JavaScript regex syntax (no leading/trailing slashes). Group similar errors into one pattern where possible. Terminal = permanent failure; Retriable = transient or fixable.
Output JSON: {\"batch_type\":\"classify_errors\",\"results\":[{\"pattern\":\"regex\",\"label\":\"Short Label\",\"group\":\"terminal|retriable\",\"context\":\"site|outreach\",\"example_errors\":[\"...\"],\"occurrence_count\":N}]}"
      ;;
    score_sites)
      task_prompt="You are a CRO specialist scoring local business websites for conversion effectiveness. For each site in BATCH DATA, analyze the provided HTML and score it using the CRO scoring system defined in the context above.

CRITICAL RULES:
- Evaluate each site independently using ONLY its provided HTML and http_headers
- Always return COMPLETE factor_scores for every site — never omit any factor
- Do NOT include conversion_score or letter_grade — these are computed programmatically
- Output ONLY valid JSON, no markdown fences

Output JSON: {\"batch_type\":\"score_sites\",\"results\":[{\"site_id\":N,\"factor_scores\":{\"headline_quality\":{\"score\":N,\"reasoning\":\"...\",\"evidence\":\"...\"},\"value_proposition\":{\"score\":N,\"reasoning\":\"...\",\"evidence\":\"...\"},\"unique_selling_proposition\":{\"score\":N,\"reasoning\":\"...\",\"evidence\":\"...\"},\"call_to_action\":{\"score\":N,\"reasoning\":\"...\",\"evidence\":\"...\"},\"urgency_messaging\":{\"score\":N,\"reasoning\":\"...\",\"evidence\":\"...\"},\"hook_engagement\":{\"score\":N,\"reasoning\":\"...\",\"evidence\":\"...\"},\"trust_signals\":{\"score\":N,\"reasoning\":\"...\",\"evidence\":\"...\"},\"imagery_design\":{\"score\":N,\"reasoning\":\"...\",\"evidence\":\"...\"},\"offer_clarity\":{\"score\":N,\"reasoning\":\"...\",\"evidence\":\"...\"},\"contextual_appropriateness\":{\"score\":N,\"reasoning\":\"...\",\"industry_context\":\"...\"}},\"overall_calculation\":{\"grade_interpretation\":\"...\",\"city\":\"...\",\"country_code\":\"AU\",\"state\":\"...\",\"country_detection_confidence\":\"high|medium|low\",\"country_detection_evidence\":[\"...\"],\"is_business_directory\":false,\"is_local_business\":true,\"is_law_firm\":false,\"industry_classification\":\"plumbing\",\"is_error_page\":false,\"error_type\":null,\"error_description\":null,\"is_broken_site\":false,\"broken_site_details\":[]},\"contact_details\":{\"city\":\"...\",\"country_code\":\"AU\",\"state\":\"...\",\"email_addresses\":[],\"phone_numbers\":[],\"social_profiles\":[],\"key_pages\":[]}}]}"
      ;;
    enrich_sites)
      task_prompt="You are a contact data extraction specialist. For each site in BATCH DATA, extract contact information from the provided HTML using the enrichment guidelines in the context above.

CRITICAL RULES:
- Extract ONLY information explicitly visible in the HTML — do NOT fabricate data
- If current_contacts already has a field, only add NEW items not already present
- Preserve phone numbers EXACTLY as displayed (do not normalize format)
- Return an empty object {} for sites where no new contacts are found

Output JSON: {\"batch_type\":\"enrich_sites\",\"results\":[{\"site_id\":N,\"domain\":\"example.com\",\"business_name\":\"...\",\"city\":\"...\",\"country_code\":\"AU\",\"state\":\"NSW\",\"email_addresses\":[{\"email\":\"info@example.com\",\"label\":\"General\"}],\"phone_numbers\":[{\"number\":\"+61 2 1234 5678\",\"label\":\"Office\"}],\"social_profiles\":[{\"url\":\"https://facebook.com/example\",\"label\":\"Facebook\"}],\"key_pages\":[\"https://example.com/contact\"],\"primary_contact_form\":null}]}"
      ;;
    score_semantic)
      task_prompt="You are scoring local business websites for conversion effectiveness. For each site, evaluate these 3 semantic factors from the extracted text:

1. **headline_quality** (0-10): Is the H1/title compelling? Does it promise a benefit, address the visitor, create curiosity? Generic 'Welcome' or just a business name = 2-4. Specific benefit + audience = 7-9.
2. **value_proposition** (0-10): Does the page clearly articulate WHAT they do, WHO they help, and WHY choose them? Look for quantified claims, benefit language, customer-focused framing (you/your > we/our). Vague 'quality service' = 3-4. Specific benefits with proof = 7-9.
3. **unique_selling_proposition** (0-10): What differentiates them? Look for 'only', 'exclusive', years of experience, number of customers served, awards, patents, specific methodologies. Generic = 3-4. Clear differentiators = 7-9.

Score conservatively. Most local business sites score 4-7. A 9-10 is exceptional (clear benefit headline + quantified proof + genuine differentiators). A 0-2 means almost no content or completely generic.

Output JSON: {\"batch_type\":\"score_semantic\",\"results\":[{\"site_id\":N,\"headline_quality\":{\"score\":N,\"reasoning\":\"...\"},\"value_proposition\":{\"score\":N,\"reasoning\":\"...\"},\"unique_selling_proposition\":{\"score\":N,\"reasoning\":\"...\"}}]}"
      ;;
    proofread)
      task_prompt="Review each message in BATCH DATA against the proofreading rules in the context above. Apply every rule. Output a decision for every item — no omissions.

Output JSON: {\"batch_type\":\"proofread\",\"results\":[{\"message_id\":N,\"decision\":\"approve\"},{\"message_id\":N,\"decision\":\"rework\",\"rework_instructions\":\"...\"},{\"message_id\":N,\"decision\":\"reject\",\"reject_reason\":\"...\"}]}"
      ;;
    code_review)
      task_prompt="You are a senior code reviewer (Code Reviewer agent). Review the source file in BATCH DATA for security vulnerabilities, bugs, performance issues, and maintainability problems. Follow the review criteria in the context above.

For each finding include: severity (1-10), category (security/bug/performance/style), line number, description, and a concrete suggestion.

Only report real issues with concrete fixes. Do not hallucinate problems.

Output JSON: {\"batch_type\":\"code_review\",\"results\":[{\"file_path\":\"...\",\"findings\":[{\"severity\":N,\"category\":\"...\",\"line\":N,\"description\":\"...\",\"suggestion\":\"...\"}],\"summary\":\"...\"}]}"
      ;;
    monitor_health)
      task_prompt="You are an SRE monitoring a Node.js outreach pipeline. Analyze the system health data in BATCH DATA. Follow the monitoring criteria in the context above.

Identify any critical issues, warnings, or degraded conditions. If the system is healthy, say so explicitly.

Output JSON: {\"batch_type\":\"monitor_health\",\"results\":[{\"severity\":\"info|warn|critical\",\"summary\":\"...\",\"findings\":[{\"severity\":\"...\",\"component\":\"...\",\"issue\":\"...\",\"detail\":\"...\"}],\"recommended_actions\":[\"...\"]}]}"
      ;;
    triage_errors)
      task_prompt="You are an Incident Response Commander triaging pipeline errors. Classify each failed task in BATCH DATA. Follow the triage criteria in the context above.

For each error: determine type, severity, and action. Only flag create_fix_task for critical/high severity issues with a clear code fix needed.

Output JSON: {\"batch_type\":\"triage_errors\",\"results\":[{\"error_message\":\"...\",\"type\":\"pipeline_bug|infrastructure|external_api|data_quality|config|known_transient\",\"severity\":\"critical|high|medium|low\",\"action\":\"create_fix_task|monitor|ignore\",\"summary\":\"...\",\"suggested_fix\":\"...\"}]}"
      ;;
    check_docs)
      task_prompt="You are a Technical Writer checking documentation freshness. Review the recently changed source files in BATCH DATA. Follow the checking criteria in the context above.

For each file, identify any stale documentation (comments, JSDoc, CLAUDE.md references, README mentions). Only report genuinely stale content — not missing docs for new code (that's a separate concern).

Output JSON: {\"batch_type\":\"check_docs\",\"results\":[{\"file_path\":\"...\",\"stale_docs\":[{\"location\":\"...\",\"issue\":\"...\",\"suggested_update\":\"...\"}],\"summary\":\"...\"}]}"
      ;;
    evidence_merge)
      task_prompt="You are a CRO analyst merging evidence-based findings into HTML-scored website evaluations. Follow the merge criteria in the context above.

For each site in BATCH DATA: compare score_json factors against evidence_pass1 and evidence_pass2 findings. Revise scores only where evidence is concrete and specific. Surface additional findings that HTML scoring could not catch (CSS-confirmed issues, stock photo filenames, wrong-state links, above-fold placement).

Output JSON: {\"batch_type\":\"evidence_merge\",\"results\":[{\"site_id\":N,\"revised_scores\":{\"factor_name\":{\"original\":N,\"revised\":N,\"reason\":\"...\"}},\"additional_findings\":[{\"factor\":\"...\",\"finding\":\"...\",\"severity\":\"critical|major|minor\"}],\"evidence_summary\":\"...\"}]}"
      ;;
    followup_generate)
      task_prompt="You are writing follow-up outreach messages for a website conversion audit service. Each site in BATCH DATA has already been contacted once (or more) with no reply. Generate the NEXT follow-up touch using a DIFFERENT value angle than previous messages.

FOLLOW-UP CADENCE (8 touches over 42 days):
- Touch 2 (Day 3): Different weakness + social proof (\"other businesses in your area\")
- Touch 3 (Day 7): ROI framing (\"this could be costing you \$X/month\")
- Touch 4 (Day 14): Case study + sample report link (auditandfix.com)
- Touch 5 (Day 21): Quick win — give one specific free fix to demonstrate value
- Touch 6 (Day 28): Ad waste angle (\"paying for ads with a low-scoring site?\") / competitor gap
- Touch 7 (Day 35): Authority — \"scored 43,000+ sites, pattern is always the same\"
- Touch 8 (Day 42): Breakup email (\"closing the file, offer stands if you need it later\")

RULES:
- Each result must include ALL channels from the site's channels array (generate one message per channel per site)
- Email: conversational, 3-5 short paragraphs, include subject line
- SMS: under 160 chars, punchy, no subject line needed
- NEVER repeat the same angle or wording as previous_messages
- Use the site's weaknesses for personalisation but pick DIFFERENT weaknesses than touch 1
- Touch 5: lead with a specific free fix — demonstrate competence, not just pitch
- Touch 6: if site has is_running_ads=true, lean into ad waste angle; otherwise use competitor gap
- Touch 8 (breakup): be respectful, leave the door open, mention auditandfix.com
- Match the tone to the country (G'day for AU, casual for US/CA, formal for UK/EU)

Output JSON: {\"batch_type\":\"followup_generate\",\"results\":[{\"site_id\":N,\"contact_method\":\"email|sms\",\"contact_uri\":\"...\",\"country_code\":\"XX\",\"sequence_step\":N,\"message_body\":\"...\",\"subject_line\":\"...(email only)\"}]}"
      ;;
  esac

  full_prompt="$task_prompt

$context

BATCH DATA:
$batch_json

IMPORTANT: Output ONLY valid JSON. No markdown, no explanation, no code fences."

  # Call Claude and pipe the result to the store.
  # Write the prompt to a temp file to avoid shell arg-length limits with large batches.
  # Unset CLAUDECODE so claude doesn't refuse to run if invoked inside another Claude Code session.
  _prompt_file=$(mktemp /tmp/orch-prompt-XXXXXX)
  printf '%s' "$full_prompt" > "$_prompt_file"
  _prompt_bytes=$(wc -c < "$_prompt_file" 2>/dev/null || echo 0)
  _prompt_kb=$(( _prompt_bytes / 1024 ))
  log "$batch_type: context=${_prompt_kb}kB (prompt+context+batch)"
  # Claude Code (Electron) requires XDG_RUNTIME_DIR to run. Systemd user services may
  # not inherit it — set a fallback so claude doesn't silently exit with empty output.
  _xdg="${XDG_RUNTIME_DIR:-/run/user/$(id -u)}"
  [ -d "$_xdg" ] || _xdg="/tmp/runtime-$(id -u)" && mkdir -p "$_xdg" 2>/dev/null || true

  # Write claude output DIRECTLY to a file — never capture it in a $() bash variable.
  # bash $() in non-UTF8 locales (NixOS systemd env) corrupts multi-byte characters
  # (Japanese, Chinese, emoji), making the extracted JSON invalid at random positions.
  # File I/O preserves all bytes exactly regardless of locale.
  _raw_file="$PROJECT_ROOT/logs/.orch-raw-$$.json"
  _claude_stderr=$(mktemp /tmp/orch-stderr-XXXXXX)
  _effort=$(effort_for_batch "$batch_type")
  XDG_RUNTIME_DIR="$_xdg" env -u CLAUDECODE "$CLAUDE_BIN" -p --model "$model" --effort "$_effort" --output-format json \
    < "$_prompt_file" > "$_raw_file" 2>"$_claude_stderr" || true
  rm -f "$_prompt_file"
  _stderr_content=$(cat "$_claude_stderr"); rm -f "$_claude_stderr"

  # Always check for usage limit signals (reads only ASCII metadata, file-safe)
  check_limit_signal "$(cat "$_raw_file" 2>/dev/null || true)" "$batch_type"

  # Debug: log raw file size and the first printable chars for diagnosis
  _raw_size=$(wc -c < "$_raw_file" 2>/dev/null || echo "missing")
  _raw_preview=$(cat "$_raw_file" 2>/dev/null | tr -cd '[:print:]' | head -c 200 || echo "")
  log "$batch_type: raw_file=${_raw_size}b preview=$(echo "$_raw_preview" | head -c 150)"

  if [ ! -s "$_raw_file" ]; then
    rm -f "$_raw_file"
    # Track consecutive empty results per batch type in the gate file.
    # Pattern: external tool failure looks identical to "no work" — consecutive
    # empties on a batch that had items is a strong signal something is broken.
    _empty_key="empty_streak_${batch_type}"
    _streak=$(node -e "try{const d=JSON.parse(require('fs').readFileSync('$GATE_FILE','utf8'));process.stdout.write(String(d['$_empty_key']||0));}catch{process.stdout.write('0');}" 2>/dev/null || echo "0")
    _streak=$(( _streak + 1 ))
    node -e "
      const fs=require('fs');
      let d={};try{d=JSON.parse(fs.readFileSync('$GATE_FILE','utf8'));}catch{}
      d['$_empty_key']=$_streak;
      fs.writeFileSync('$GATE_FILE',JSON.stringify(d));
    " 2>/dev/null || true

    _empty_msg="$batch_type: claude returned empty — skipping store${_stderr_content:+ (stderr: $(echo "$_stderr_content" | head -1))}"
    if [ "$_streak" -ge 5 ]; then
      log "ALERT: $_empty_msg [consecutive_empty=$_streak — likely tool failure, not empty queue]"
    elif [ "$_streak" -ge 3 ]; then
      log "WARN: $_empty_msg [consecutive_empty=$_streak]"
    else
      log "$_empty_msg"
    fi
    return 0
  fi

  # Success — reset the consecutive-empty streak for this batch type
  node -e "
    const fs=require('fs');
    let d={};try{d=JSON.parse(fs.readFileSync('$GATE_FILE','utf8'));}catch{}
    delete d['empty_streak_${batch_type}'];
    fs.writeFileSync('$GATE_FILE',JSON.stringify(d));
  " 2>/dev/null || true

  store_summary=$(node "$PROJECT_ROOT/scripts/claude-store-wrapper.js" "$_raw_file" "$batch_type" 2>&1 || echo '{"error":"store_node_failed"}')

  if [ -z "$store_summary" ] || echo "$store_summary" | grep -q '"error"'; then
    # On error: preserve raw file for inspection
    _saved_raw="$PROJECT_ROOT/logs/orch-raw-${batch_type}-saved.json"
    cp "$_raw_file" "$_saved_raw" 2>/dev/null || true
    log "$batch_type: saved raw to $_saved_raw for inspection"
  fi
  rm -f "$_raw_file"

  if [ -z "$store_summary" ]; then
    log "$batch_type: [WARN] store_summary empty — node produced no output. stderr=$(echo "$_stderr_content" | head -c 300)"
  else
    log "$batch_type: $store_summary"
    [ -n "$_stderr_content" ] && echo "$store_summary" | grep -q '"error"' && log "$batch_type: claude_stderr=$(echo "$_stderr_content" | head -c 200)"
  fi
  return 0
}

# Returns 0 if the batch type is skipped in conservation mode, 1 if it should run.
# Time-critical and Haiku tasks always run. Opus-heavy generation is deferred.
should_skip_for_conservation() {
  batch_type="$1"
  [ "$CONSERVATION_MODE" = "false" ] && return 1  # Not in conservation mode — don't skip

  case "$batch_type" in
    # Always run — time-critical (inbound replies can't wait) or cheap (Haiku)
    reply_responses|classify_replies|extract_names|oversee|classify_errors)
      return 1 ;;
    # Defer — Opus-heavy or pipeline-upstream work that can wait
    proposals_email|proposals_sms|score_semantic|score_sites|enrich_sites|proofread|followup_generate)
      log "$batch_type: SKIP (conservation mode — Claude usage limit active since $LIMIT_HIT_SINCE)"
      return 0 ;;
  esac
  return 1
}

# Run a batch with all standard skip checks (rate_limits, stage_flag, conservation, backlog).
# The skip functions themselves know which batch types they apply to — unknown types pass through.
# Usage: run_checked <type> <model> <batch_size> [context_files...]
run_checked() {
  _rc_type="$1"; _rc_model="$2"; _rc_size="$3"; shift 3
  [ -n "$SINGLE_TYPE" ] && [ "$SINGLE_TYPE" != "$_rc_type" ] && return 0
  should_skip_for_rate_limits "$_rc_type" && return 0
  should_skip_for_stage_flag "$_rc_type" && return 0
  should_skip_for_conservation "$_rc_type" && return 0
  should_skip_for_backlog "$_rc_type" && return 0
  run_batch "$_rc_type" "$_rc_model" "$_rc_size" "$@" && had_work=true
}

# Run a time-gated batch with all standard skip checks + is_due gate.
# Usage: run_checked_gated <type> <model> <batch_size> <interval_minutes> [context_files...]
run_checked_gated() {
  _rg_type="$1"; _rg_model="$2"; _rg_size="$3"; _rg_interval="$4"; shift 4
  [ -n "$SINGLE_TYPE" ] && [ "$SINGLE_TYPE" != "$_rg_type" ] && return 0
  should_skip_for_rate_limits "$_rg_type" && return 0
  should_skip_for_stage_flag "$_rg_type" && return 0
  should_skip_for_conservation "$_rg_type" && return 0
  should_skip_for_backlog "$_rg_type" && return 0
  if is_due "$_rg_type" "$_rg_interval"; then
    run_batch "$_rg_type" "$_rg_model" "$_rg_size" "$@" && had_work=true
    mark_ran "$_rg_type"
  else
    log "$_rg_type: not due yet (runs every ${_rg_interval}min)"
  fi
}
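# Illustrative gated call (hypothetical wiring — real call sites live in process_all_batches;
# the 15-minute interval for extract_names comes from the conservation notes near the top):
#   run_checked_gated extract_names haiku "$NAMES_BATCH" 15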

# Check if any Tier C (interactive) agents are overdue.
# Writes logs/agents-due.json with overdue agents and their session start prompts.
# Called once per cycle so the daily report and AFK monitor can surface them.
check_tier_c_agents() {
  _due_agents="[]"

  # Security Engineer: quarterly (129600 min) + major refactor
  if is_due "tier_c_security_audit" 129600 || [ "${MAJOR_REFACTOR:-false}" = "true" ]; then
    _due_agents=$(echo "$_due_agents" | node -e "
      let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{
        const a=JSON.parse(d);
        a.push({agent:'security_audit',schedule:'quarterly',
          prompt:'Run a Security Engineer audit on 333Method and 2Step. Review: (1) API key handling in load-env.js and .env files, (2) Twilio webhook authentication in src/inbound/sms.js, (3) PayPal integration in src/payment/paypal.js, (4) unsigned unsubscribe tokens in 2Step src/outreach/email.js, (5) stealth-browser SSRF risk, (6) SQL injection review of all DB queries. Output findings as JSON with severity/file/line/description.'});
        process.stdout.write(JSON.stringify(a));
      });
    " 2>/dev/null || echo "$_due_agents")
    log "[AGENT DUE] security_audit — run Security Engineer (Opus, max, thinking on)"
  fi

  # Compliance Auditor: quarterly (129600 min) + major refactor
  if is_due "tier_c_compliance_audit" 129600 || [ "${MAJOR_REFACTOR:-false}" = "true" ]; then
    _due_agents=$(echo "$_due_agents" | node -e "
      let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{
        const a=JSON.parse(d);
        a.push({agent:'compliance_audit',schedule:'quarterly',
          prompt:'Run a Compliance Auditor review on 333Method outreach system. Check: (1) CAN-SPAM compliance in email templates, (2) TCPA compliance in SMS sending (business hours, opt-out), (3) GDPR country blocking logic in src/config/countries.js, (4) unsubscribe honoring in src/outreach/, (5) 72-hour cooldown enforcement. Also audit 2Step CAN-SPAM compliance in src/outreach/email.js.'});
        process.stdout.write(JSON.stringify(a));
      });
    " 2>/dev/null || echo "$_due_agents")
    log "[AGENT DUE] compliance_audit — run Compliance Auditor (Opus, high, thinking on)"
  fi

  # SEO Specialist: monthly (43200 min) for colorcraft-ai.com
  if is_due "tier_c_seo_audit" 43200; then
    _due_agents=$(echo "$_due_agents" | node -e "
      let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{
        const a=JSON.parse(d);
        a.push({agent:'seo_specialist',schedule:'monthly',
          prompt:'Run an SEO Specialist audit on colorcraft-ai.com. Fetch the site, analyze technical SEO (meta tags, page speed indicators, structured data, sitemap, robots.txt), keyword opportunities, content gaps, and link building potential. Output a prioritized report of changes for Base44 to implement.'});
        process.stdout.write(JSON.stringify(a));
      });
    " 2>/dev/null || echo "$_due_agents")
    log "[AGENT DUE] seo_specialist — run SEO Specialist (Sonnet, medium)"
  fi

  # Automation Governance: quarterly (129600 min) + major refactor
  if is_due "tier_c_automation_audit" 129600 || [ "${MAJOR_REFACTOR:-false}" = "true" ]; then
    _due_agents=$(echo "$_due_agents" | node -e "
      let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{
        const a=JSON.parse(d);
        a.push({agent:'automation_audit',schedule:'quarterly',
          prompt:'Run an Automation Governance audit. Read the cron_jobs table (sqlite3 db/sites.db \"SELECT * FROM cron_jobs\"), review all cron jobs in src/cron/ and the orchestrator in scripts/claude-orchestrator.sh. Identify: redundant jobs, poor sequencing, missing error handling, jobs that should be consolidated. Output a ranked list of changes.'});
        process.stdout.write(JSON.stringify(a));
      });
    " 2>/dev/null || echo "$_due_agents")
    log "[AGENT DUE] automation_audit — run Automation Governance Architect (Sonnet, medium)"
  fi

  # Code review completion check — surface Phase 5 reminder when queue is done
  if [ -f "logs/code-review-queue.json" ]; then
    _cr_done=$(node -e "
      try {
        const q = JSON.parse(require('fs').readFileSync('logs/code-review-queue.json','utf8'));
        process.stdout.write(q.next_index >= q.files.length ? 'yes' : 'no');
      } catch { process.stdout.write('no'); }
    " 2>/dev/null || echo "no")
    if [ "$_cr_done" = "yes" ] && is_due "tier_c_phase5_reminder" 43200; then
      _due_agents=$(echo "$_due_agents" | node -e "
        let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{
          const a=JSON.parse(d);
          a.push({agent:'phase5_sessions',schedule:'one-shot',
            prompt:'Code review of all 150 files is complete. Time to run Phase 5 Tier C sessions:\n1. Security audit: ./scripts/run-agent-session.sh security_audit\n2. 2Step CI/CD: ./scripts/run-agent-session.sh automation_audit\n3. colorcraft-ai.com report: ./scripts/run-agent-session.sh colorcraft_report\n4. Update distributed-agent-system plan (manual task in docs/plans/)'});
Update distributed-agent-system plan (manual task in docs/plans/)'}); 1024 process.stdout.write(JSON.stringify(a)); 1025 }); 1026 " 2>/dev/null || echo "$_due_agents") 1027 log "[REMINDER] Code review complete — Phase 5 Tier C sessions ready to run" 1028 log " Run: ./scripts/run-agent-session.sh security_audit" 1029 log " Run: ./scripts/run-agent-session.sh colorcraft_report" 1030 log " Run: ./scripts/run-agent-session.sh automation_audit" 1031 log " Update: docs/plans/distributed-agent-system.md" 1032 fi 1033 fi 1034 1035 # Write results to logs/agents-due.json 1036 printf '%s\n' "$_due_agents" > logs/agents-due.json 2>/dev/null || true 1037 } 1038 1039 # Detect major refactor: check recent commits for patterns that trigger agent reviews. 1040 # Sets MAJOR_REFACTOR=true if criteria met. Called once per cycle. 1041 check_major_refactor() { 1042 MAJOR_REFACTOR=false 1043 _changed=$(git diff --name-only HEAD~10 2>/dev/null | head -200 || echo "") 1044 _count=$(echo "$_changed" | grep -c '.' 2>/dev/null) || _count=0 1045 1046 # 10+ files changed 1047 [ "$_count" -ge 10 ] && MAJOR_REFACTOR=true && return 0 1048 1049 # Security-sensitive files 1050 echo "$_changed" | grep -qE '^src/stages/|^scripts/claude-orchestrator\.sh$|^db/migrations/|^src/outreach/|^src/payment/|^src/inbound/|^load-env\.js$|^src/utils/load-env\.js$|^src/utils/stealth-browser' && MAJOR_REFACTOR=true && return 0 1051 1052 return 0 1053 } 1054 1055 # Define batch order — priority: outreach-adjacent first, upstream last 1056 process_all_batches() { 1057 had_work=false 1058 1059 # Re-resolve CLAUDE_BIN each cycle — auto-updates delete old versions 1060 resolve_claude_bin 1061 1062 # Run canary once per cycle (verifies node, claude-batch.js, CLAUDE_BIN are usable) 1063 run_canary 1064 1065 # Detect major refactor (sets MAJOR_REFACTOR env var for this cycle) 1066 check_major_refactor 1067 [ "$MAJOR_REFACTOR" = "true" ] && log "MAJOR_REFACTOR detected — Tier C agents flagged as due" 1068 1069 # Check Tier C agent schedule — writes logs/agents-due.json 1070 check_tier_c_agents 1071 1072 # Reload SKIP_STAGES from .env each cycle (respects live changes without restart) 1073 load_skip_stages 1074 [ -n "$SKIP_STAGES" ] && log "SKIP_STAGES=$SKIP_STAGES" 1075 1076 # Check Claude Max usage limits proactively (live API fetch, ~1s) 1077 check_usage_proactive 1078 1079 # Refresh pipeline backlog counts for intelligent skipping 1080 refresh_backlog 1081 1082 # Scale batch sizes up when backlogs are large so we clear them faster. 1083 # Caps are conservative — LLM context window is the real ceiling. 
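  # Illustrative example (assumed counts, not from a real run): with
  # BACKLOG_PENDING_APPROVAL=1200 the elif below sets PROOFREAD_BATCH=100;
  # above 5000 it jumps to 200. The reword caps below follow the same tiering.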
  if [ "${BACKLOG_PENDING_APPROVAL:-0}" -gt 5000 ]; then
    PROOFREAD_BATCH=200
  elif [ "${BACKLOG_PENDING_APPROVAL:-0}" -gt 1000 ]; then
    PROOFREAD_BATCH=100
  fi

  if [ "${BACKLOG_PROPOSALS:-0}" -gt 5000 ]; then
    # Cap at 20 SMS / 10 email — 50 SMS exceeded 32k output token limit (29min run, hard error)
    REWORD_EMAIL_BATCH=10; REWORD_SMS_BATCH=20; REWORD_FORM_BATCH=30
    REWORD_LINKEDIN_BATCH=30; REWORD_X_BATCH=30
  elif [ "${BACKLOG_PROPOSALS:-0}" -gt 1000 ]; then
    REWORD_EMAIL_BATCH=10; REWORD_SMS_BATCH=30; REWORD_FORM_BATCH=30
  fi

  conservation_note=""
  [ "$CONSERVATION_MODE" = "true" ] && conservation_note=" [CONSERVATION MODE: $CONSERVATION_REASON]"
  log "Backlog: total_sites=${CANARY_SITE_COUNT:-?} eligible_outreach=$BACKLOG_ELIGIBLE_OUTREACH actionable_proposals=$BACKLOG_PROPOSALS actionable_enriched=$BACKLOG_ENRICHED approved_unsent=$BACKLOG_APPROVED_UNSENT pending=$BACKLOG_PENDING_APPROVAL active_countries=$BACKLOG_ACTIVE_COUNTRIES daily_send_avg=${BACKLOG_DAILY_SEND_AVG:-0}${conservation_note}"
  log "Effective batch sizes: proofread=$PROOFREAD_BATCH score_semantic=$SCORE_SEMANTIC_BATCH enrich=$ENRICH_SITES_BATCH"

  # --- Pipeline batches: outreach-adjacent first, upstream last ---
  run_checked proposals_email opus "$PROPOSALS_EMAIL_BATCH" prompts/PROPOSAL.md docs/05-outreach/email-best-practices.md
  run_checked proposals_sms opus "$PROPOSALS_SMS_BATCH" prompts/PROPOSAL.md docs/05-outreach/sms-best-practices.md
  # Semantic scoring: Sonnet evaluates headline, value prop, USP. Accuracy > cost.
  run_checked score_semantic sonnet "$SCORE_SEMANTIC_BATCH"

  # score_sites: HTML-only LLM scoring (requires ENABLE_LLM_SCORING=true + ENABLE_VISION=false)
  if [ -z "$SINGLE_TYPE" ] || [ "$SINGLE_TYPE" = "score_sites" ]; then
    _llm_scoring=$(node -e "process.stdout.write(process.env.ENABLE_LLM_SCORING !== 'false' ? '1' : '0')" 2>/dev/null || echo "0")
    _enable_vision=$(node -e "process.stdout.write(process.env.ENABLE_VISION !== 'false' ? '1' : '0')" 2>/dev/null || echo "1")
    if [ "$_llm_scoring" = "1" ] && [ "$_enable_vision" = "0" ]; then
      run_checked score_sites sonnet "$SCORE_SITES_BATCH" prompts/CONVERSION-SCORING-NOVIS.md
    else
      [ -z "$SINGLE_TYPE" ] || log "score_sites: SKIP (requires ENABLE_LLM_SCORING=true + ENABLE_VISION=false)"
    fi
  fi

  # enrich_sites: LLM contact enrichment (requires ENABLE_ENRICHMENT_LLM=true)
  if [ -z "$SINGLE_TYPE" ] || [ "$SINGLE_TYPE" = "enrich_sites" ]; then
    _enrich_llm=$(node -e "process.stdout.write(process.env.ENABLE_ENRICHMENT_LLM !== 'false' ? '1' : '0')" 2>/dev/null || echo "0")
    if [ "$_enrich_llm" = "1" ]; then
      run_checked enrich_sites haiku "$ENRICH_SITES_BATCH" prompts/ENRICHMENT.md
    else
      [ -z "$SINGLE_TYPE" ] || log "enrich_sites: SKIP (ENABLE_ENRICHMENT_LLM=false)"
    fi
  fi

  # gather_evidence: moved to standalone cron job (5-min interval, migration 112).
  # No longer runs inline — the cron system handles execution independently.
  # The pending count is still logged here for orchestrator visibility.
  # Also reports sites awaiting merge (pass1+pass2 done, evidence_json still NULL).
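  # Example of the JSON the inline query below emits (counts are illustrative):
  #   {"pending":143,"awaiting_merge":7}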
  if [ -z "$SINGLE_TYPE" ] || [ "$SINGLE_TYPE" = "gather_evidence" ]; then
    _pending_evidence=$(node -e "
      const Database = require('./node_modules/better-sqlite3');
      const db = new Database(process.env.DATABASE_PATH || './db/sites.db', { readonly: true });
      const r = db.prepare(\"SELECT COUNT(*) as c FROM sites WHERE status IN ('enriched','enriched_llm') AND score IS NOT NULL AND score < 82 AND (evidence_pass1_json IS NULL OR evidence_pass2_json IS NULL)\").get();
      const m = db.prepare(\"SELECT COUNT(*) as c FROM sites WHERE evidence_pass1_json IS NOT NULL AND evidence_pass2_json IS NOT NULL AND evidence_json IS NULL\").get();
      process.stdout.write(JSON.stringify({ pending: r.c || 0, awaiting_merge: m.c || 0 }));
      db.close();
    " 2>/dev/null || echo '{"pending":0,"awaiting_merge":0}')
    _ev_pending=$(echo "$_pending_evidence" | node -e "let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{try{process.stdout.write(String(JSON.parse(d).pending||0));}catch{process.stdout.write('0');}})" 2>/dev/null || echo "0")
    _ev_merge=$(echo "$_pending_evidence" | node -e "let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{try{process.stdout.write(String(JSON.parse(d).awaiting_merge||0));}catch{process.stdout.write('0');}})" 2>/dev/null || echo "0")
    if [ "$_ev_pending" -gt 0 ] 2>/dev/null || [ "$_ev_merge" -gt 0 ] 2>/dev/null; then
      log "gather_evidence: ${_ev_pending} sites pending collection, ${_ev_merge} awaiting merge (proposals gate requires evidence_json)"
      # Detect evidence-blocked state: outreach pipeline is empty and evidence is the critical path.
      # Log clearly so the overseer and monitoring can see this is expected, not a crash.
      if [ "${BACKLOG_ELIGIBLE_OUTREACH:-0}" -eq 0 ] && [ "${BACKLOG_PROPOSALS:-0}" -eq 0 ] 2>/dev/null; then
        log "EVIDENCE-BLOCKED: eligible_outreach=0 proposals=0 — evidence collection is the critical path (pending=${_ev_pending} awaiting_merge=${_ev_merge})"
      fi
    else
      log "gather_evidence: no sites pending"
    fi
  fi

  run_checked proofread opus "$PROOFREAD_BATCH" prompts/PROOFREAD.md
  run_checked followup_generate sonnet 20 prompts/FOLLOWUP.md
  run_checked classify_replies haiku "$CLASSIFY_BATCH" prompts/CLASSIFY-REPLIES.md
  run_checked reply_responses opus "$REPLIES_BATCH" prompts/REPLIES.md

  # --- Time-gated batches ---
  run_checked_gated extract_names haiku "$NAMES_BATCH" 15
  run_checked_gated oversee sonnet 1 30
  run_checked_gated classify_errors haiku 1 240
  run_checked_gated monitor_health haiku 1 60 prompts/agents/MONITOR-HEALTH.md
  run_checked_gated triage_errors haiku 1 120 prompts/agents/TRIAGE-ERRORS.md
  run_checked_gated check_docs haiku 1 240 prompts/agents/CHECK-DOCS.md
  # evidence_merge: bypass time gate when sites are awaiting merge — run every cycle until clear.
  # Falls back to 15min gate when backlog is empty (avoids unnecessary idle runs).
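  # One-off manual pass for this queue (illustrative invocation):
  #   scripts/claude-orchestrator.sh --type evidence_merge
  # Note: with --type set, the gather_evidence probe above is skipped, so _ev_merge
  # defaults to 0 and the 15min-gated path below is taken.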
  if [ "${_ev_merge:-0}" -gt 0 ] 2>/dev/null; then
    run_checked evidence_merge haiku 1 prompts/EVIDENCE-MERGE.md
  else
    run_checked_gated evidence_merge haiku 1 15 prompts/EVIDENCE-MERGE.md
  fi

  # --- Code review (1 file per cycle, security-first order) ---
  # Gate: pause if pending code_review_fix tasks > 30 (backpressure — don't flood the fix queue)
  if [ -z "$SINGLE_TYPE" ] || [ "$SINGLE_TYPE" = "code_review" ]; then
    _pending_fixes=$(node -e "
      const Database = require('./node_modules/better-sqlite3');
      const db = new Database(process.env.DATABASE_PATH || './db/sites.db', { readonly: true });
      const r = db.prepare(\"SELECT COUNT(*) as c FROM agent_tasks WHERE task_type='code_review_fix' AND status IN ('pending','running')\").get();
      process.stdout.write(String(r.c || 0));
      db.close();
    " 2>/dev/null || echo "0")
    if [ "${_pending_fixes:-0}" -gt 30 ]; then
      log "code_review: SKIP — pending_fixes=${_pending_fixes} > 30 (fix backlog too large)"
    else
      run_checked code_review sonnet 1 prompts/agents/CODE-REVIEW.md
    fi
  fi

  [ "$had_work" = true ]
}

# Main
log "Starting orchestrator (loop=$LOOP, type=${SINGLE_TYPE:-all})"

if [ "$LOOP" = true ]; then
  while true; do
    if ! process_all_batches; then
      if [ "$CONSERVATION_MODE" = "true" ]; then
        # Queues appear empty for deferred tasks, but we're still in conservation mode.
        # Keep looping — time-critical tasks (replies, classify) may arrive.
        log "Queues empty in conservation mode — waiting 5min before retry..."
        sleep 300
        continue
      fi
      log "All queues empty — exiting loop"
      break
    fi
    # In conservation mode, pause between cycles to avoid hammering the quota check
    if [ "$CONSERVATION_MODE" = "true" ]; then
      log "Conservation mode: pausing 5min before next cycle..."
      sleep 300
    fi
    log "Batch complete, starting next cycle..."
  done
else
  process_all_batches || log "No work to process"
fi

log "Orchestrator finished"
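
# Note on failure isolation (descriptive only, no behavior change): process_all_batches
# is always invoked inside an `if !` or `||` condition, so `set -e` is suppressed for
# the whole cycle; a failing batch makes run_checked return nonzero, but the cycle
# continues to the next batch instead of aborting the orchestrator.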