monitoring-checks.sh
#!/bin/sh
# Monitoring Checks — System state collector for Claude Code (Tier 3)
# Replaces afk-checks.sh. Runs every ~30 minutes during AFK sessions.
# Outputs structured system state with inline analysis reminders.
# Also includes sonnet overseer health checks and Claude batch processor status.
# POSIX sh for NixOS compatibility. Uses sqlite3 CLI for speed (<10s).
#
# Usage:
#   bash scripts/monitoring-checks.sh          # run check cycle
#   bash scripts/monitoring-checks.sh --reset  # delete snapshot and start fresh

# No set -e: this is a data collector; we want ALL sections to run even if one query fails

# ── Config ──────────────────────────────────────────────────────────────────
PROJECT_DIR="${PROJECT_DIR:-/home/jason/code/333Method}"
DB="$PROJECT_DIR/db/sites.db"
OPS_DB="$PROJECT_DIR/db/ops.db"
TEL_DB="$PROJECT_DIR/db/telemetry.db"
LOGS="$PROJECT_DIR/logs"
# Wrapper: run sqlite3 with a 5s busy timeout so we never hang when pipeline holds the lock.
# ATTACHes ops.db (as "ops") and telemetry.db (as "tel") so split-schema queries work.
# Usage: sq "SELECT ..." (replaces bare sqlite3 "$DB" "..." calls)
# .timeout is a sqlite3 dot-command that sets busy_timeout without producing any output.
sq() {
  sqlite3 "$DB" ".timeout 5000" \
    "ATTACH DATABASE '$OPS_DB' AS ops;" \
    "ATTACH DATABASE '$TEL_DB' AS tel;" \
    "$@" 2>/dev/null
}
SNAPSHOT="/tmp/afk-check-snapshot.txt"
NOW_UTC=$(date -u +%Y-%m-%dT%H:%M:%SZ)
TODAY=$(date -u +%Y-%m-%d)
HALF_HOUR_AGO=$(date -u -d '30 minutes ago' +%Y-%m-%dT%H:%M:%S 2>/dev/null || date -u -v-30M +%Y-%m-%dT%H:%M:%S 2>/dev/null || echo "")

if [ "$1" = "--reset" ]; then
  rm -f "$SNAPSHOT"
  echo "Snapshot cleared. Next run will be baseline."
  exit 0
fi

# ── Load previous snapshot (if any) ─────────────────────────────────────────
PREV_EXISTS=false
if [ -f "$SNAPSHOT" ]; then
  PREV_EXISTS=true
  # shellcheck disable=SC1090
  . "$SNAPSHOT"
fi

# Helper: compute delta (current - previous). Returns "NEW" if no previous.
delta() {
  curr="$1"
  prev_var="prev_$2"
  eval prev_val="\${$prev_var:-}"
  if [ -z "$prev_val" ]; then
    echo "NEW"
  else
    d=$((curr - prev_val))
    if [ "$d" -gt 0 ]; then echo "+$d"
    elif [ "$d" -lt 0 ]; then echo "$d"
    else echo "=0 (STALL?)"
    fi
  fi
}
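
# For orientation, the snapshot sourced above is a flat sh file of variable
# assignments written at the end of each run (hypothetical values):
#   prev_snap_site_found=4120
#   prev_agent_pending=7
# A call like `delta 4200 snap_site_found` then resolves prev_snap_site_found
# via eval and prints "+80"; a missing variable prints "NEW", and an unchanged
# count prints "=0 (STALL?)".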
82 cd "$PROJECT_DIR" && npm run status --silent 2>/dev/null || echo " (npm run status failed)" 83 echo "" 84 85 echo "── Agent Task Status (snapshot delta vs last check) ──" 86 sq "SELECT status, COUNT(*) FROM tel.agent_tasks GROUP BY status ORDER BY COUNT(*) DESC;" | while IFS='|' read -r status count; do 87 d=$(delta "$count" "agent_$status") 88 printf " %-25s %7s (delta: %s)\n" "$status" "$count" "$d" 89 done 90 echo "" 91 92 echo "── Snapshot deltas (raw counts vs last check — cross-reference with new24h above) ──" 93 echo "# These are simple current-minus-previous count deltas, not time-windowed." 94 echo "# Use to spot movement between THIS check and the PREVIOUS one (~30 min apart)." 95 site_found=$(sq "SELECT COUNT(*) FROM sites WHERE status='found';") 96 site_assets=$(sq "SELECT COUNT(*) FROM sites WHERE status='assets_captured';") 97 site_scored=$(sq "SELECT COUNT(*) FROM sites WHERE status='prog_scored';") 98 site_rescored=$(sq "SELECT COUNT(*) FROM sites WHERE status IN ('semantic_scored','vision_scored');") 99 site_enriched=$(sq "SELECT COUNT(*) FROM sites WHERE status='enriched';") 100 site_proposals=$(sq "SELECT COUNT(*) FROM sites WHERE status='proposals_drafted';") 101 site_sent=$(sq "SELECT COUNT(*) FROM sites WHERE status='outreach_sent';") 102 outreach_approved=$(sq "SELECT COUNT(*) FROM messages WHERE direction='outbound' AND approval_status='approved' AND delivery_status IS NULL;") 103 outreach_sent=$(sq "SELECT COUNT(*) FROM messages WHERE direction='outbound' AND delivery_status IN ('sent','delivered','bounced');") 104 printf " %-25s %7s (delta: %s)\n" "sites:found" "${site_found:-0}" "$(delta "${site_found:-0}" "snap_site_found")" 105 printf " %-25s %7s (delta: %s)\n" "sites:assets_captured" "${site_assets:-0}" "$(delta "${site_assets:-0}" "snap_site_assets")" 106 printf " %-25s %7s (delta: %s)\n" "sites:prog_scored" "${site_scored:-0}" "$(delta "${site_scored:-0}" "snap_site_scored")" 107 printf " %-25s %7s (delta: %s)\n" "sites:semantic/vision" "${site_rescored:-0}" "$(delta "${site_rescored:-0}" "snap_site_rescored")" 108 printf " %-25s %7s (delta: %s)\n" "sites:enriched" "${site_enriched:-0}" "$(delta "${site_enriched:-0}" "snap_site_enriched")" 109 printf " %-25s %7s (delta: %s)\n" "sites:proposals_drafted" "${site_proposals:-0}" "$(delta "${site_proposals:-0}" "snap_site_proposals")" 110 printf " %-25s %7s (delta: %s)\n" "sites:outreach_sent" "${site_sent:-0}" "$(delta "${site_sent:-0}" "snap_site_sent")" 111 printf " %-25s %7s (delta: %s)\n" "outreach:approved" "${outreach_approved:-0}" "$(delta "${outreach_approved:-0}" "snap_out_approved")" 112 printf " %-25s %7s (delta: %s)\n" "outreach:sent+bounced" "${outreach_sent:-0}" "$(delta "${outreach_sent:-0}" "snap_out_sent")" 113 echo "" 114 115 # ════════════════════════════════════════════════════════════════════════════ 116 # SECTION A2: AGENT MONITOR HEALTH (last orchestrator finding) 117 # ════════════════════════════════════════════════════════════════════════════ 118 echo "━━━ A2. AGENT MONITOR HEALTH (last Haiku finding) ━━━" 119 echo "# Source: orchestrator monitor_health batch (runs every 60min via claude-orchestrator.sh)" 120 echo "# ACTION REQUIRED if severity=critical — do not wait for next check cycle." 121 echo "" 122 123 ORCH_LOG_TODAY="$LOGS/orchestrator-$(date +%Y-%m-%d).log" 124 [ ! 
-f "$ORCH_LOG_TODAY" ] && ORCH_LOG_TODAY="$LOGS/orchestrator-$(date -u +%Y-%m-%d).log" 125 126 if [ -f "$ORCH_LOG_TODAY" ]; then 127 # Strip ANSI codes, find the last "Health check [" line and capture it plus 128 # the indented lines that follow it (findings and actions). 129 last_health=$(sed 's/\x1b\[[0-9;]*m//g' "$ORCH_LOG_TODAY" 2>/dev/null \ 130 | grep -n "Health check \[" | tail -1) 131 132 if [ -n "$last_health" ]; then 133 line_num=$(echo "$last_health" | cut -d: -f1) 134 sed 's/\x1b\[[0-9;]*m//g' "$ORCH_LOG_TODAY" 2>/dev/null \ 135 | sed -n "${line_num},$((line_num + 20))p" \ 136 | while IFS= read -r line; do 137 case "$line" in 138 " "*) echo " $line" ;; 139 *"Health check ["*) echo " $line" ;; 140 *) break ;; 141 esac 142 done 143 144 # Check when this finding was made 145 health_ts=$(sed 's/\x1b\[[0-9;]*m//g' "$ORCH_LOG_TODAY" 2>/dev/null \ 146 | grep "Health check \[" | tail -1 \ 147 | grep -oE '[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}' | head -1) 148 if [ -n "$health_ts" ]; then 149 health_epoch=$(date -d "$health_ts" +%s 2>/dev/null || echo "0") 150 now_epoch=$(date +%s) 151 if [ "$health_epoch" -gt 0 ]; then 152 health_age_min=$(( (now_epoch - health_epoch) / 60 )) 153 printf "\n (Finding is %dmin old — next due in ~%dmin)\n" \ 154 "$health_age_min" "$((60 - health_age_min > 0 ? 60 - health_age_min : 0))" 155 if [ "$health_age_min" -gt 75 ]; then 156 echo " WARN: No monitor_health run in ${health_age_min}min — orchestrator may be stopped." 157 fi 158 fi 159 fi 160 else 161 echo " No monitor_health results found in today's orchestrator log." 162 echo " (batch is gated to run every 60min — may not have fired yet today)" 163 fi 164 else 165 echo " No orchestrator log for today." 166 fi 167 echo "" 168 169 # ════════════════════════════════════════════════════════════════════════════ 170 # SECTION B: BROWSER LOOP HEALTH 171 # ════════════════════════════════════════════════════════════════════════════ 172 echo "━━━ B. BROWSER LOOP HEALTH ━━━" 173 echo "# LOOK FOR: Browser loop stale >30min while API loop is recent = hung Playwright page." 174 echo "# The checkBrowserLoopHung() watchdog in Tier 1 should auto-restart, but verify it did." 
175 echo "" 176 177 pc_row=$(sq "SELECT last_browser_loop_at, last_api_loop_at FROM ops.pipeline_control WHERE id=1;") 178 browser_ts=$(echo "$pc_row" | cut -d'|' -f1) 179 api_ts=$(echo "$pc_row" | cut -d'|' -f2) 180 if [ -n "$browser_ts" ] && [ -n "$api_ts" ]; then 181 echo " Browser loop last cycle: $browser_ts" 182 echo " API loop last cycle: $api_ts" 183 browser_epoch=$(date -u -d "${browser_ts}Z" +%s 2>/dev/null || echo "0") 184 api_epoch=$(date -u -d "${api_ts}Z" +%s 2>/dev/null || echo "0") 185 now_epoch=$(date -u +%s) 186 if [ "$browser_epoch" -gt 0 ] && [ "$api_epoch" -gt 0 ]; then 187 browser_age_min=$(( (now_epoch - browser_epoch) / 60 )) 188 api_age_min=$(( (now_epoch - api_epoch) / 60 )) 189 echo " Browser age: ${browser_age_min}min API age: ${api_age_min}min" 190 enable_vision=$(grep "^ENABLE_VISION=" .env 2>/dev/null | head -1 | cut -d= -f2 | tr -d '"' | tr '[:upper:]' '[:lower:]') 191 if [ "$enable_vision" = "false" ]; then 192 echo " ℹ️ ENABLE_VISION=false — browser loop not expected to run (HTML-only mode)" 193 elif [ "$browser_age_min" -gt 30 ] && [ "$api_age_min" -lt 30 ]; then 194 echo " ⚠️ BROWSER LOOP HUNG — stale ${browser_age_min}min while API active ${api_age_min}min" 195 else 196 echo " ✓ Both loops active" 197 fi 198 fi 199 else 200 echo " (no pipeline_control timestamps yet)" 201 fi 202 echo "" 203 204 # ════════════════════════════════════════════════════════════════════════════ 205 # SECTION C: API ERROR RATES (ALL APIS, LAST 30 MIN) 206 # ════════════════════════════════════════════════════════════════════════════ 207 echo "━━━ C. API ERROR RATES (last 30 min) ━━━" 208 echo "# LOOK FOR: >10% error rate for any API = investigate. >30% = circuit breaker territory." 209 echo "# Check ALL APIs: OpenRouter (scoring/enrich), ZenRows (serps), Twilio/Resend (outreach)." 210 echo "# Burst errors at same timestamp = concurrency issue (rate limiter not wired)." 211 echo "" 212 213 # Compute 30-min window cutoff timestamps (current and 30 min ago) 214 # Log format: [2026-03-02T18:30:00 — filter by recent hour:minute prefixes 215 CUTOFF_EPOCH=$(( $(date +%s) - 1800 )) 216 # Compute 30-min cutoff as an epoch for later use in sqlite queries 217 # For log file filtering we use tail-1000 (approx 30min of log activity per file) 218 # rather than timestamp filtering, to avoid timezone offset mismatches between 219 # UTC container clock and AEST log timestamps. 220 CUTOFF_TS="" # unused — tail fallback used instead 221 222 # Build a grep pattern for lines timestamped in the last 30 min. 223 # Log lines use ISO-8601 with local offset e.g. [2026-03-04T01:15:00.123+11:00] 224 # We extract the last 30 lines-worth of unique minute prefixes (UTC) to match. 225 # Strategy: collect the last 35 minutes worth of HH:MM prefixes in UTC, then 226 # match log lines whose bracketed timestamp contains any of those strings. 227 # This is simpler and more reliable than timezone-converting the full timestamp. 
228 _ts_filter="" 229 _i=0 230 while [ "$_i" -le 35 ]; do 231 _ts_filter="$_ts_filter $(date -u -d "${_i} minutes ago" +%Y-%m-%dT%H:%M 2>/dev/null || date -u -v-${_i}M +%Y-%m-%dT%H:%M 2>/dev/null)" 232 _i=$((_i + 1)) 233 done 234 # Also accept local-offset timestamps by including AEST/AEDT offset prefixes 235 _tz_filter="" 236 _i=0 237 while [ "$_i" -le 35 ]; do 238 _tz_filter="$_tz_filter $(TZ=Australia/Sydney date -d "${_i} minutes ago" +%Y-%m-%dT%H:%M 2>/dev/null || TZ=Australia/Sydney date -v-${_i}M +%Y-%m-%dT%H:%M 2>/dev/null)" 239 _i=$((_i + 1)) 240 done 241 242 # Build awk pattern from collected minute strings 243 _all_minutes="$_ts_filter $_tz_filter" 244 _awk_pat=$(echo "$_all_minutes" | tr ' ' '\n' | grep -v '^$' | sort -u | tr '\n' '|' | sed 's/|$//') 245 246 for logprefix in pipeline scoring rescoring enrich serps outreach proposals; do 247 # Use the most recently modified log for each prefix (handles NZT day-boundary log rotation) 248 logfile=$(ls -t "$LOGS"/"$logprefix"-*.log 2>/dev/null | head -1) 249 if [ -f "$logfile" ]; then 250 basename=$logprefix 251 # Filter to actual last-30-min lines using timestamp prefixes 252 if [ -n "$_awk_pat" ]; then 253 recent=$(grep -E "$_awk_pat" "$logfile" 2>/dev/null) 254 else 255 recent=$(tail -1000 "$logfile" 2>/dev/null) 256 fi 257 errors=$(echo "$recent" | grep -c 'status code [45][0-9][0-9]\|Timed out\|ETIMEDOUT\|ECONNRESET\|ECONNREFUSED\|Breaker is open' 2>/dev/null || echo "0") 258 total=$(echo "$recent" | wc -l | tr -d ' ') 259 if [ "$total" -gt 0 ] 2>/dev/null && [ "$errors" -gt 0 ] 2>/dev/null; then 260 pct=$((errors * 100 / total)) 261 printf " %-15s errors: %5s / %7s lines (%d%%)\n" "$basename" "$errors" "$total" "$pct" 262 else 263 printf " %-15s errors: 0\n" "$basename" 264 fi 265 fi 266 done 267 echo "" 268 269 # Show actual error lines from last 30 min (sample) — uses _awk_pat for timestamp filtering 270 echo "── Recent error samples (last 10 unique, last 30 min) ──" 271 for logprefix in pipeline scoring rescoring enrich serps outreach proposals app cron; do 272 logfile=$(ls -t "$LOGS"/"$logprefix"-*.log 2>/dev/null | head -1) 273 if [ -f "$logfile" ]; then 274 if [ -n "$_awk_pat" ]; then 275 grep -E "$_awk_pat" "$logfile" 2>/dev/null | grep -i 'ERROR\|status code [45]' | tail -5 276 else 277 tail -200 "$logfile" 2>/dev/null | grep -i 'ERROR\|status code [45]' | tail -5 278 fi 279 fi 280 done | sort -u | tail -10 281 echo "" 282 283 # ════════════════════════════════════════════════════════════════════════════ 284 # SECTION D: OUTREACH FUNNEL DETAIL 285 # ════════════════════════════════════════════════════════════════════════════ 286 echo "━━━ D. OUTREACH FUNNEL DETAIL ━━━" 287 echo "# LOOK FOR: Approved snapshot delta not decreasing between checks = outreach stage blocked." 288 echo "# Verify OUTREACH_SKIP_METHODS is correctly excluded in SQL (not just code-level skip)." 289 echo "# Check if last_outreach_at 3-day cooldown is blocking eligible sites." 290 echo "# Note: npm run status (Section A) shows new24h/new1h for all outreach statuses." 
291 echo "" 292 293 echo "── Approved outreaches by channel (snapshot delta) ──" 294 sq "SELECT contact_method, COUNT(*) FROM messages WHERE direction='outbound' AND approval_status='approved' AND delivery_status IS NULL GROUP BY contact_method ORDER BY contact_method;" | while IFS='|' read -r method count; do 295 d=$(delta "$count" "approved_$method") 296 printf " %-12s %7s (delta: %s)\n" "$method" "$count" "$d" 297 done 298 echo "" 299 300 echo "── Actually delivered (ground truth: delivered_at in last 24h, any terminal status) ──" 301 echo "# This is the real 'how many went out' — includes sent+bounced, excludes pending/approved" 302 sq "SELECT contact_method, delivery_status, COUNT(*) FROM messages WHERE direction='outbound' AND delivered_at > datetime('now','-24 hours') GROUP BY contact_method, delivery_status ORDER BY contact_method;" | while IFS='|' read -r method status count; do 303 printf " %-12s %-12s %s\n" "$method" "$status" "$count" 304 done 305 delivered_total=$(sq "SELECT COUNT(*) FROM messages WHERE direction='outbound' AND delivered_at > datetime('now','-24 hours');") 306 printf " Total delivered_at in 24h: %s (sent + bounced combined)\n" "${delivered_total:-0}" 307 echo "" 308 309 echo "── SMS fail rate (1d) — ALERT if >10% ──" 310 sms_sent_1d=$(sq "SELECT SUM(CASE WHEN delivery_status IN ('sent','delivered') THEN 1 ELSE 0 END) FROM messages WHERE direction='outbound' AND contact_method='sms' AND updated_at > datetime('now','-1 day');") 311 sms_failed_1d=$(sq "SELECT SUM(CASE WHEN delivery_status='failed' THEN 1 ELSE 0 END) FROM messages WHERE direction='outbound' AND contact_method='sms' AND updated_at > datetime('now','-1 day');") 312 sms_sent_1d="${sms_sent_1d:-0}" 313 sms_failed_1d="${sms_failed_1d:-0}" 314 sms_denom_1d=$((sms_sent_1d + sms_failed_1d)) 315 if [ "$sms_denom_1d" -gt 0 ]; then 316 sms_fail_pct_1d=$(( sms_failed_1d * 100 / sms_denom_1d )) 317 printf " SMS 1d: %s sent, %s failed (%s%%)\n" "$sms_sent_1d" "$sms_failed_1d" "$sms_fail_pct_1d" 318 if [ "$sms_fail_pct_1d" -gt 10 ]; then 319 printf " *** HIGH SMS FAIL RATE: %s%% — check error_message breakdown ***\n" "$sms_fail_pct_1d" 320 sq "SELECT error_message, COUNT(*) FROM messages WHERE direction='outbound' AND contact_method='sms' AND delivery_status='failed' AND updated_at > datetime('now','-1 day') GROUP BY error_message ORDER BY COUNT(*) DESC LIMIT 5;" | while IFS='|' read -r errmsg cnt; do 321 printf " [%s] %s\n" "$cnt" "$errmsg" 322 done 323 fi 324 else 325 printf " SMS 1d: no attempts\n" 326 fi 327 echo "" 328 329 # ════════════════════════════════════════════════════════════════════════════ 330 # SECTION E: OVERSEER RESET DETECTION 331 # ════════════════════════════════════════════════════════════════════════════ 332 echo "━━━ E. OVERSEER RESET DETECTION ━━━" 333 echo "# LOOK FOR: Count not decreasing between checks = overseer resetting sites that get stuck again." 334 echo "# This is a loop: reset → fail → reset → fail. Need anti-loop detection in overseer." 
335 echo "" 336 337 overseer_resets=$(sq "SELECT COUNT(*) FROM sites WHERE error_message LIKE '%reset by sonnet overseer%';") 338 d=$(delta "$overseer_resets" "overseer_resets") 339 echo " Sites with overseer reset error: $overseer_resets (delta: $d)" 340 341 echo "── By stage ──" 342 sq "SELECT status, COUNT(*) FROM sites WHERE error_message LIKE '%reset by sonnet overseer%' GROUP BY status;" | while IFS='|' read -r status count; do 343 printf " %-25s %s\n" "$status" "$count" 344 done 345 echo "" 346 347 # ════════════════════════════════════════════════════════════════════════════ 348 # SECTION F: AGENT TASKS (last 2 hours) 349 # ════════════════════════════════════════════════════════════════════════════ 350 echo "━━━ F. AGENT TASKS (last 2h) ━━━" 351 echo "# LOOK FOR: >3 tasks of same type blocked with similar error = systemic agent failure." 352 echo "# Fix root cause, don't just clear tasks. Check if a fix_bug task already exists." 353 echo "" 354 355 echo "── Pending triage_fix tasks (Haiku-validated, high/critical errors) ──" 356 echo "# These were created by triage_errors batch. Each represents a classified bug or infra issue." 357 echo "# Priority 9 = critical, priority 7 = high." 358 echo "" 359 triage_pending=$(sq "SELECT COUNT(*) FROM tel.agent_tasks WHERE task_type='triage_fix' AND status='pending';") 360 if [ "${triage_pending:-0}" -gt 0 ]; then 361 echo " *** ${triage_pending} pending triage_fix task(s) — review and action: ***" 362 sq ".mode list 363 .separator | 364 SELECT id, priority, created_at, 365 json_extract(context_json,'\$.severity') as sev, 366 json_extract(context_json,'\$.error_type') as etype, 367 substr(json_extract(context_json,'\$.summary'),1,80) as summary, 368 substr(json_extract(context_json,'\$.suggested_fix'),1,100) as fix 369 FROM tel.agent_tasks 370 WHERE task_type='triage_fix' AND status='pending' 371 ORDER BY priority DESC, created_at ASC 372 LIMIT 10;" | while IFS='|' read -r tid prio cat sev etype summary fix; do 373 printf " [id=%s pri=%s sev=%s type=%s]\n" "$tid" "$prio" "$sev" "$etype" 374 printf " Summary: %s\n" "$summary" 375 printf " Suggested: %s\n" "$fix" 376 printf " Created: %s\n" "$cat" 377 echo "" 378 done 379 else 380 echo " No pending triage_fix tasks." 381 fi 382 echo "" 383 384 sq ".mode column 385 .headers on 386 SELECT id, task_type, assigned_to, status, substr(error_message,1,60) as error, created_at 387 FROM tel.agent_tasks 388 WHERE created_at > datetime('now','-2 hours') 389 ORDER BY created_at DESC 390 LIMIT 30;" 391 echo "" 392 393 echo "── Blocked task patterns ──" 394 sq "SELECT task_type, substr(error_message,1,50), COUNT(*) as n 395 FROM tel.agent_tasks 396 WHERE status='blocked' AND created_at > datetime('now','-24 hours') 397 GROUP BY task_type, substr(error_message,1,50) 398 HAVING n > 1 399 ORDER BY n DESC;" | while IFS='|' read -r type err count; do 400 printf " [%sx] %-15s %s\n" "$count" "$type" "$err" 401 done 402 echo "" 403 404 # ════════════════════════════════════════════════════════════════════════════ 405 # SECTION G: CRON HEALTH 406 # ════════════════════════════════════════════════════════════════════════════ 407 echo "━━━ G. CRON HEALTH ━━━" 408 echo "# LOOK FOR: Any cron job not run in >2x its interval = cron itself is broken." 
409 echo "# Check: systemctl --user status mmo-cron.timer" 410 echo "" 411 412 sq ".mode column 413 .headers on 414 SELECT task_key, 415 interval_value || ' ' || interval_unit as interval, 416 last_run_at, 417 CASE WHEN last_run_at IS NULL THEN 'NEVER' 418 ELSE CAST(ROUND((julianday('now') - julianday(last_run_at)) * 24 * 60) AS INTEGER) || 'min ago' 419 END as age, 420 enabled 421 FROM ops.cron_jobs 422 WHERE enabled = 1 423 ORDER BY last_run_at ASC 424 LIMIT 25;" 425 echo "" 426 427 # Flag overdue jobs. 428 # For minute-interval jobs: use 6x threshold to avoid false positives from the 429 # NixOS OnUnitActiveSec bug (Check 6 resets timer every ~5min — so 1-min jobs 430 # can legitimately gap up to 5min between runs under the workaround). 431 echo "── Overdue cron jobs ──" 432 sq "SELECT task_key, interval_value, interval_unit, last_run_at 433 FROM ops.cron_jobs 434 WHERE enabled = 1 435 AND last_run_at IS NOT NULL 436 AND ( 437 (interval_unit = 'minutes' AND interval_value = 1 AND (julianday('now') - julianday(last_run_at)) * 24 * 60 > 6) 438 OR (interval_unit = 'minutes' AND interval_value > 1 AND (julianday('now') - julianday(last_run_at)) * 24 * 60 > interval_value * 2) 439 OR (interval_unit = 'hours' AND (julianday('now') - julianday(last_run_at)) * 24 > interval_value * 2) 440 OR (interval_unit = 'days' AND (julianday('now') - julianday(last_run_at)) > interval_value * 2) 441 );" | while IFS='|' read -r key iv iu last; do 442 echo " ⚠️ OVERDUE: $key (every $iv $iu, last ran $last)" 443 done 444 echo "" 445 446 # ════════════════════════════════════════════════════════════════════════════ 447 # SECTION H: CIRCUIT BREAKERS / RATE LIMITS 448 # ════════════════════════════════════════════════════════════════════════════ 449 echo "━━━ H. RATE LIMITS & CIRCUIT BREAKERS ━━━" 450 echo "# LOOK FOR: Any entry with resetAt in the past = auto-clear failed." 451 echo "# Fix: npm run rate-limits:clear or restart pipeline." 452 echo "" 453 454 if [ -f "$LOGS/rate-limits.json" ]; then 455 cat "$LOGS/rate-limits.json" 456 else 457 echo " (no rate-limits.json — no active rate limits)" 458 fi 459 echo "" 460 461 # ════════════════════════════════════════════════════════════════════════════ 462 # SECTION I: ZOMBIES & RUNAWAY PROCESSES 463 # ════════════════════════════════════════════════════════════════════════════ 464 echo "━━━ I. ZOMBIES & PROCESSES ━━━" 465 echo "# LOOK FOR: >50 zombies = container restart needed (tell user)." 466 echo "# Any chromium/node process using >30min CPU or >50% CPU = likely hung." 467 echo "" 468 469 zombie_count=$(ps -eo stat --no-headers 2>/dev/null | grep -c '^Z' || echo "0") 470 echo " Zombie processes: $zombie_count" 471 echo "" 472 473 echo "── Top chromium/node/npm processes (by CPU) ──" 474 printf " %-7s %5s %5s %10s %s\n" "PID" "CPU%" "MEM%" "TIME" "CMD" 475 ps -eo pid,pcpu,pmem,time,args --sort=-pcpu --no-headers 2>/dev/null | grep -E 'chromium|node|npm' | grep -v grep | head -15 | while read -r pid cpu mem time cmd; do 476 short_cmd=$(echo "$cmd" | cut -c1-60) 477 printf " %-7s %5s %5s %10s %s\n" "$pid" "$cpu" "$mem" "$time" "$short_cmd" 478 done 479 echo "" 480 481 # ════════════════════════════════════════════════════════════════════════════ 482 # SECTION J: SYSTEM LOAD 483 # ════════════════════════════════════════════════════════════════════════════ 484 echo "━━━ J. SYSTEM LOAD ━━━" 485 echo "# LOOK FOR: <500MB free memory = reduce BROWSER_CONCURRENCY." 486 echo "# >90% disk = run npm run logs:rotate. High load avg = too many concurrent processes." 
487 echo "" 488 489 echo "── Uptime & Load ──" 490 uptime 2>/dev/null 491 echo "" 492 493 echo "── Memory ──" 494 free -m 2>/dev/null 495 echo "" 496 497 echo "── Disk ──" 498 df -h "$PROJECT_DIR" /tmp 2>/dev/null 499 echo "" 500 501 # ════════════════════════════════════════════════════════════════════════════ 502 # SECTION K: ALL LOGS (last 30 min, tail of each) 503 # ════════════════════════════════════════════════════════════════════════════ 504 echo "━━━ K. RECENT LOGS (last 30 min) ━━━" 505 echo "# LOOK FOR: Unhandled exceptions, repeated error patterns, novel error types." 506 echo "# This is raw data for higher-order analysis — find what the monitors missed." 507 echo "" 508 509 # Find all logs modified in last 30 min (regardless of date suffix — handles NZT day boundary) 510 find "$LOGS" -name "*.log" -mmin -30 -type f 2>/dev/null | grep -v "pipeline-status\|pipeline-2\|collect-metrics" | sort | while read -r f; do 511 bn=$(basename "$f") 512 lines=$(wc -l < "$f" 2>/dev/null || echo "0") 513 echo "── $bn ($lines lines total, last 50) ──" 514 tail -50 "$f" 2>/dev/null 515 echo "" 516 done 517 518 # Also check pipeline-status.txt for recent Tier 2 output 519 if [ -f "$LOGS/pipeline-status.txt" ]; then 520 echo "── pipeline-status.txt (last 30 lines) ──" 521 tail -30 "$LOGS/pipeline-status.txt" 2>/dev/null 522 echo "" 523 fi 524 525 # ════════════════════════════════════════════════════════════════════════════ 526 # SECTION L: SITES PROCESSED REPEATEDLY 527 # ════════════════════════════════════════════════════════════════════════════ 528 echo "━━━ L. SITES PROCESSED REPEATEDLY ━━━" 529 echo "# LOOK FOR: recapture_count >5 = site in a loop. Investigate why it fails after recapture." 530 echo "# Consider marking as 'failing' if root cause is permanent (site down, blocked, etc)." 531 echo "" 532 533 sq ".mode column 534 .headers on 535 SELECT id, domain, status, recapture_count, substr(error_message,1,50) as error 536 FROM sites 537 WHERE recapture_count > 3 538 AND status NOT IN ('ignored','failing','high_score','outreach_sent') 539 ORDER BY recapture_count DESC 540 LIMIT 20;" 541 echo "" 542 543 # ════════════════════════════════════════════════════════════════════════════ 544 # SECTION M: PERFORMANCE METRICS (last 30 min) 545 # ════════════════════════════════════════════════════════════════════════════ 546 echo "━━━ M. PERFORMANCE METRICS (last 30 min) ━━━" 547 echo "# LOOK FOR: Success rate <90% = investigate that stage." 548 echo "# Avg duration increasing vs last check = resource exhaustion or payload issue." 549 echo "# Throughput (sites/min) declining = bottleneck forming." 
550 echo "" 551 552 sq ".mode column 553 .headers on 554 SELECT stage_name, 555 COUNT(*) as runs, 556 SUM(sites_succeeded) as ok, 557 SUM(sites_failed) as fail, 558 ROUND(AVG(duration_ms)/1000.0, 1) as avg_sec, 559 ROUND(CAST(SUM(sites_succeeded) AS REAL) / NULLIF(SUM(sites_succeeded) + SUM(sites_failed), 0) * 100, 1) as success_pct 560 FROM tel.pipeline_metrics 561 WHERE started_at > datetime('now','-30 minutes') 562 GROUP BY stage_name 563 ORDER BY stage_name;" 564 echo "" 565 566 echo "── Stage duration trend (last 2h, avg per 30min window) ──" 567 sq "SELECT stage_name, 568 CASE WHEN started_at > datetime('now','-30 minutes') THEN '0-30min' 569 WHEN started_at > datetime('now','-60 minutes') THEN '30-60min' 570 WHEN started_at > datetime('now','-90 minutes') THEN '60-90min' 571 ELSE '90-120min' 572 END as window, 573 ROUND(AVG(duration_ms)/1000.0,1) as avg_sec, 574 SUM(sites_succeeded) as ok 575 FROM tel.pipeline_metrics 576 WHERE started_at > datetime('now','-2 hours') 577 GROUP BY stage_name, window 578 ORDER BY stage_name, window;" | while IFS='|' read -r stage window avg ok; do 579 printf " %-12s %-10s avg: %6ss sites: %s\n" "$stage" "$window" "$avg" "$ok" 580 done 581 echo "" 582 583 # ════════════════════════════════════════════════════════════════════════════ 584 # SECTION R: BACKUP HEALTH 585 # ════════════════════════════════════════════════════════════════════════════ 586 echo "═══ R. BACKUP HEALTH ═══" 587 588 BACKUP_DIR="$PROJECT_DIR/db/backup" 589 if [ -L "$BACKUP_DIR" ]; then 590 BACKUP_TARGET=$(readlink -f "$BACKUP_DIR" 2>/dev/null) 591 if [ ! -d "$BACKUP_TARGET" ]; then 592 echo " ⚠️ CRITICAL: Backup symlink target not accessible: $BACKUP_TARGET" 593 echo " External drive may be unmounted. NO BACKUPS ARE RUNNING." 594 else 595 # Find newest backup file 596 NEWEST_BACKUP=$(ls -t "$BACKUP_DIR"/sites-backup-*.db 2>/dev/null | head -1) 597 if [ -z "$NEWEST_BACKUP" ]; then 598 echo " ⚠️ CRITICAL: No backup files found in $BACKUP_DIR" 599 else 600 BACKUP_SIZE=$(stat -c%s "$NEWEST_BACKUP" 2>/dev/null || echo "0") 601 BACKUP_AGE_H=$(( ($(date +%s) - $(stat -c%Y "$NEWEST_BACKUP" 2>/dev/null || echo "0")) / 3600 )) 602 BACKUP_NAME=$(basename "$NEWEST_BACKUP") 603 604 if [ "$BACKUP_SIZE" -lt 1024 ]; then 605 echo " ⚠️ CRITICAL: Newest backup is ZERO/CORRUPT: $BACKUP_NAME (${BACKUP_SIZE} bytes)" 606 echo " Backups may have been failing silently. Check backupDatabase cron logs." 
        # Check how many recent backups are zero-sized
        ZERO_COUNT=$(find "$BACKUP_DIR" -name "sites-backup-*.db" -size -1k 2>/dev/null | wc -l)
        TOTAL_COUNT=$(find "$BACKUP_DIR" -name "sites-backup-*.db" 2>/dev/null | wc -l)
        echo "  Zero-sized backups: $ZERO_COUNT / $TOTAL_COUNT total"
      elif [ "$BACKUP_AGE_H" -gt 36 ]; then
        echo "  ⚠️ WARNING: Newest backup is ${BACKUP_AGE_H}h old: $BACKUP_NAME"
      else
        echo "  ✓ Latest backup: $BACKUP_NAME ($(numfmt --to=iec "$BACKUP_SIZE"), ${BACKUP_AGE_H}h ago)"
      fi
    fi
  fi
elif [ -d "$BACKUP_DIR" ]; then
  echo "  Backup dir is a real directory (not symlink to external drive)"
else
  echo "  ⚠️ CRITICAL: Backup directory does not exist: $BACKUP_DIR"
fi
echo ""

# ════════════════════════════════════════════════════════════════════════════
# SECTION N: SNAPSHOT MANAGEMENT
# ════════════════════════════════════════════════════════════════════════════

# ── Site count canary: detect catastrophic data loss ──
CURRENT_TOTAL_SITES=$(sq "SELECT COUNT(*) FROM sites;" 2>/dev/null || echo "0")
if [ "$PREV_EXISTS" = "true" ] && [ -n "${prev_snap_total_sites:-}" ]; then
  if [ "${prev_snap_total_sites}" -gt 100 ] && [ "$CURRENT_TOTAL_SITES" -eq 0 ]; then
    echo "  🚨 CRITICAL: SITE COUNT IS ZERO (was ${prev_snap_total_sites})"
    echo "  DATABASE HAS BEEN WIPED. Restore from backup immediately."
    echo "  Stop all services: systemctl --user stop 333method-orchestrator.timer mmo-cron.timer 333method-pipeline.service"
  elif [ "${prev_snap_total_sites}" -gt 100 ]; then
    _canary_drop=$(( (prev_snap_total_sites - CURRENT_TOTAL_SITES) * 100 / prev_snap_total_sites ))
    if [ "$_canary_drop" -gt 20 ]; then
      echo "  🚨 CRITICAL: Site count dropped ${_canary_drop}% (${prev_snap_total_sites} → ${CURRENT_TOTAL_SITES})"
      echo "  Possible data loss. Investigate immediately."
    fi
  fi
fi
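
# Manual restore sketch if the canary fires (assumptions: services stopped
# first, and the chosen backup verified non-zero with stat/sqlite3 beforehand):
#   systemctl --user stop 333method-orchestrator.timer mmo-cron.timer 333method-pipeline.service
#   cp "$BACKUP_DIR/sites-backup-<date>.db" "$DB"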

# Write new snapshot (overwrite previous)
{
  echo "# monitoring-checks snapshot $NOW_UTC"

  # Snapshot counts for Section A inter-check deltas
  snap_found=$(sq "SELECT COUNT(*) FROM sites WHERE status='found';")
  snap_assets=$(sq "SELECT COUNT(*) FROM sites WHERE status='assets_captured';")
  snap_scored=$(sq "SELECT COUNT(*) FROM sites WHERE status='prog_scored';")
  snap_rescored=$(sq "SELECT COUNT(*) FROM sites WHERE status IN ('semantic_scored','vision_scored');")
  snap_enriched=$(sq "SELECT COUNT(*) FROM sites WHERE status='enriched';")
  snap_proposals=$(sq "SELECT COUNT(*) FROM sites WHERE status='proposals_drafted';")
  snap_sent=$(sq "SELECT COUNT(*) FROM sites WHERE status='outreach_sent';")
  snap_out_approved=$(sq "SELECT COUNT(*) FROM messages WHERE direction='outbound' AND approval_status='approved' AND delivery_status IS NULL;")
  snap_out_sent=$(sq "SELECT COUNT(*) FROM messages WHERE direction='outbound' AND delivery_status IN ('sent','delivered','bounced');")

  echo "prev_snap_site_found=${snap_found:-0}"
  echo "prev_snap_site_assets=${snap_assets:-0}"
  echo "prev_snap_site_scored=${snap_scored:-0}"
  echo "prev_snap_site_rescored=${snap_rescored:-0}"
  echo "prev_snap_site_enriched=${snap_enriched:-0}"
  echo "prev_snap_site_proposals=${snap_proposals:-0}"
  echo "prev_snap_site_sent=${snap_sent:-0}"
  echo "prev_snap_out_approved=${snap_out_approved:-0}"
  echo "prev_snap_out_sent=${snap_out_sent:-0}"
  echo "prev_snap_total_sites=${CURRENT_TOTAL_SITES:-0}"

  # Approved by channel (for Section D)
  sq "SELECT contact_method, COUNT(*) FROM messages WHERE direction='outbound' AND approval_status='approved' AND delivery_status IS NULL GROUP BY contact_method;" | while IFS='|' read -r m c; do
    echo "prev_approved_${m}=$c"
  done

  # Agent task counts
  sq "SELECT status, COUNT(*) FROM tel.agent_tasks GROUP BY status;" | while IFS='|' read -r s c; do
    safe=$(echo "$s" | sed 's/[^a-zA-Z0-9]/_/g')
    echo "prev_agent_${safe}=$c"
  done

  # Overseer resets
  echo "prev_overseer_resets=$overseer_resets"

  # Gate timestamps for staleness detection (Change 3: SRE monitoring integration)
  GATE_FILE_PATH="$PROJECT_DIR/logs/orchestrator-gates.json"
  if [ -f "$GATE_FILE_PATH" ]; then
    mh_ts=$(node -e "try{const d=JSON.parse(require('fs').readFileSync('$GATE_FILE_PATH','utf8'));process.stdout.write(d.monitor_health||'never');}catch{process.stdout.write('never');}" 2>/dev/null || echo "never")
    te_ts=$(node -e "try{const d=JSON.parse(require('fs').readFileSync('$GATE_FILE_PATH','utf8'));process.stdout.write(d.triage_errors||'never');}catch{process.stdout.write('never');}" 2>/dev/null || echo "never")
    echo "prev_monitor_health_ts=$mh_ts"
    echo "prev_triage_errors_ts=$te_ts"
  fi

} > "$SNAPSHOT"

if [ "$PREV_EXISTS" = "true" ]; then
  echo "(Snapshot updated — deltas computed against previous run)"
else
  echo "(First run — baseline snapshot written. Deltas available on next run.)"
fi
echo ""
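
# For reference, orchestrator-gates.json (read above and again in Section 16)
# is assumed to be a flat object of batch-name → last-run timestamp, e.g.:
#   {"oversee":"2026-03-02T18:00:00Z","monitor_health":"2026-03-02T17:30:00Z",
#    "triage_errors":"2026-03-02T17:45:00Z","classify_errors":"2026-03-02T16:00:00Z"}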

# ════════════════════════════════════════════════════════════════════════════
# TIER C AGENT DUE NOTIFICATIONS
# ════════════════════════════════════════════════════════════════════════════
AGENTS_DUE_FILE="$LOGS/agents-due.json"
if [ -f "$AGENTS_DUE_FILE" ]; then
  due_count=$(node -e "
    try {
      const d = JSON.parse(require('fs').readFileSync('$AGENTS_DUE_FILE','utf8'));
      console.log(Object.keys(d).length);
    } catch { console.log(0); }
  " 2>/dev/null || echo 0)
  if [ "$due_count" -gt 0 ] 2>/dev/null; then
    echo ""
    echo "━━━ TIER C AGENTS DUE ━━━"
    echo "# ACTION REQUIRED: The following interactive agent sessions are ready to run."
    echo "# These cannot run automatically — they need tool access (WebFetch, Read, Edit)."
    echo "# Run each from 333Method/: ./scripts/run-agent-session.sh <agent>"
    echo ""
    node -e "
      try {
        const d = JSON.parse(require('fs').readFileSync('$AGENTS_DUE_FILE','utf8'));
        for (const [key, val] of Object.entries(d)) {
          if (val && typeof val === 'object') {
            const overdue = val.days_overdue ? ' (' + val.days_overdue + 'd overdue)' : '';
            console.log('  [DUE] ' + key + overdue);
            if (val.prompt) console.log('    Prompt: ' + val.prompt.slice(0, 100) + '...');
          } else {
            console.log('  [DUE] ' + key);
          }
        }
      } catch(e) { console.log('  (error reading agents-due.json: ' + e.message + ')'); }
    " 2>/dev/null || cat "$AGENTS_DUE_FILE"
    echo ""
  fi
fi
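
# agents-due.json (parsed above) maps agent key → either a bare value or an
# object with optional days_overdue and prompt fields; hypothetical shape:
#   {"seo-audit":{"days_overdue":2,"prompt":"Review the latest audit and ..."}}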
766 echo "" 767 768 # Active reword queue depth per channel (approved outreach not yet sent) 769 echo "── Approved outreach awaiting send (per channel) ──" 770 sq "SELECT contact_method, COUNT(*) FROM messages WHERE direction='outbound' AND message_type='outreach' AND approval_status='approved' AND sent_at IS NULL AND delivery_status IS NULL GROUP BY contact_method ORDER BY contact_method;" | while IFS='|' read -r method count; do 771 printf " %-12s %7s\n" "$method" "$count" 772 done 773 total_approved_unsent=$(sq "SELECT COUNT(*) FROM messages WHERE direction='outbound' AND message_type='outreach' AND approval_status='approved' AND sent_at IS NULL AND delivery_status IS NULL;") 774 printf " %-12s %7s\n" "TOTAL" "${total_approved_unsent:-0}" 775 echo "" 776 777 # Parked queue depth 778 parked=$(sq "SELECT COUNT(*) FROM messages WHERE approval_status='parked';") 779 echo " Parked messages (awaiting orchestrator): ${parked:-0}" 780 781 # Pending reply responses (generated by Claude, awaiting cron send) 782 pending_replies=$(sq "SELECT COUNT(*) FROM messages WHERE direction='outbound' AND message_type='reply' AND sent_at IS NULL;" 2>/dev/null || echo "0") 783 echo " Pending reply responses (unsent): ${pending_replies:-0}" 784 785 # Recently generated replies (last 1h) 786 recent_replies=$(sq "SELECT COUNT(*) FROM messages WHERE direction='outbound' AND message_type='reply' AND created_at > datetime('now','-1 hour');" 2>/dev/null || echo "0") 787 echo " Replies generated (last 1h): ${recent_replies:-0}" 788 789 # Unclassified inbound messages 790 unclassified=$(sq "SELECT COUNT(*) FROM messages WHERE direction='inbound' AND intent IS NULL AND message_body IS NOT NULL;" 2>/dev/null || echo "0") 791 echo " Unclassified inbound (no intent): ${unclassified:-0}" 792 793 # Classified but unresponded (not opt-out/autoresponder) — likely need a reply 794 unresponded=$(sq "SELECT COUNT(*) FROM messages m WHERE m.direction='inbound' AND m.intent IS NOT NULL AND m.intent NOT IN ('opt-out','autoresponder') AND NOT EXISTS (SELECT 1 FROM messages m2 WHERE m2.site_id=m.site_id AND m2.direction='outbound' AND m2.message_type='reply' AND m2.created_at > m.created_at);" 2>/dev/null || echo "0") 795 echo " Classified inbound without reply: ${unresponded:-0}" 796 797 # ──────────────────────────────────────────────────────────────────────────── 798 # Orchestrator health 799 # ──────────────────────────────────────────────────────────────────────────── 800 echo "" 801 echo "── Orchestrator health ──" 802 echo "# LOOK FOR: NOT running + systemd failed = timer not firing. Conservation mode = usage limit." 803 echo "# reply_responses store failures = inbound data issue in claude-batch.js." 804 echo "# Reword queue draining confirms orchestrator is making progress." 
805 echo "" 806 807 orch_pid=$(pgrep -f "claude-orchestrator" 2>/dev/null || echo "") 808 if [ -n "$orch_pid" ]; then 809 echo " Process: running (PID $orch_pid)" 810 else 811 echo " Process: NOT running (expected if systemd timer fires per-invocation)" 812 fi 813 814 # Systemd timer/service status 815 orch_timer=$(systemctl --user is-active claude-orchestrator.timer 2>/dev/null || echo "unknown") 816 orch_service=$(systemctl --user is-active claude-orchestrator.service 2>/dev/null || echo "unknown") 817 echo " systemd timer: $orch_timer" 818 echo " systemd service: $orch_service (inactive/dead = normal between runs)" 819 if systemctl --user is-failed claude-orchestrator.service >/dev/null 2>&1; then 820 echo " WARNING: claude-orchestrator.service is in FAILED state — check: systemctl --user status claude-orchestrator.service" 821 fi 822 823 # Last orchestrator log activity 824 # Log file is named by local date (orchestrator uses date +%Y-%m-%d without -u). 825 # Fall back to UTC date if local-date log doesn't exist (e.g. checked before midnight local). 826 ORCH_LOG="$LOGS/orchestrator-$(date +%Y-%m-%d).log" 827 [ ! -f "$ORCH_LOG" ] && ORCH_LOG="$LOGS/orchestrator-$(date -u +%Y-%m-%d).log" 828 if [ -f "$ORCH_LOG" ]; then 829 last_orch=$(tail -1 "$ORCH_LOG" 2>/dev/null) 830 echo " Last log entry: $last_orch" 831 832 # Check how long ago last activity was 833 last_ts=$(tail -20 "$ORCH_LOG" 2>/dev/null | grep -oE '[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}' | tail -1) 834 if [ -n "$last_ts" ]; then 835 last_epoch=$(date -d "$last_ts" +%s 2>/dev/null || echo "0") 836 now_epoch=$(date +%s) 837 idle_min=$(( (now_epoch - last_epoch) / 60 )) 838 [ "$idle_min" -lt 0 ] && idle_min=0 839 echo " Idle for: ${idle_min}min" 840 if [ "$idle_min" -gt 10 ]; then 841 echo " WARN: Orchestrator idle >10min — is systemd timer running? Any errors in log?" 842 fi 843 fi 844 845 # Conservation mode signal 846 if grep -q "CONSERVATION MODE ON" "$ORCH_LOG" 2>/dev/null; then 847 cons_line=$(grep "CONSERVATION MODE ON" "$ORCH_LOG" | tail -1) 848 echo " CONSERVATION MODE: $cons_line" 849 fi 850 851 # LLM output health — did any batch actually produce stored results recently? 852 # "claude returned empty" on a batch with items = tool failure, not empty queue. 853 # ALERT/WARN lines from consecutive-empty tracking are the key signal. 854 echo "" 855 echo " LLM batch output health (last 2h):" 856 _orch_stored=$(grep -Ec "stored successfully|store: " "$ORCH_LOG" 2>/dev/null); _orch_stored=${_orch_stored:-0} 857 _orch_empty=$(grep -Fc "claude returned empty" "$ORCH_LOG" 2>/dev/null); _orch_empty=${_orch_empty:-0} 858 _orch_alert=$(grep -Ec "ALERT:|consecutive_empty=[5-9]|consecutive_empty=[0-9][0-9]" "$ORCH_LOG" 2>/dev/null); _orch_alert=${_orch_alert:-0} 859 _orch_warn=$(grep -Ec "WARN:.*consecutive_empty" "$ORCH_LOG" 2>/dev/null); _orch_warn=${_orch_warn:-0} 860 _orch_fatal=$(grep -Fc "FATAL:" "$ORCH_LOG" 2>/dev/null); _orch_fatal=${_orch_fatal:-0} 861 printf " Batches stored: %s | Empty returns: %s | Streak WARNs: %s | ALERTs: %s | FATALs: %s\n" \ 862 "$_orch_stored" "$_orch_empty" "$_orch_warn" "$_orch_alert" "$_orch_fatal" 863 if [ "${_orch_fatal:-0}" -gt 0 ]; then 864 echo " ⛔ FATAL errors detected — tool canary failed. LLM batches non-functional." 865 grep "FATAL:" "$ORCH_LOG" | tail -2 | while read -r line; do echo " $line"; done 866 elif [ "${_orch_alert:-0}" -gt 0 ]; then 867 echo " ⚠️ ALERT: consecutive empty streak detected — likely tool failure, not empty queue." 
868 grep "ALERT:" "$ORCH_LOG" | tail -2 | while read -r line; do echo " $line"; done 869 elif [ "${_orch_stored:-0}" -eq 0 ] && [ "${_orch_empty:-0}" -gt 5 ]; then 870 echo " ⚠️ No batches stored today but many empty returns — orchestrator may be non-functional." 871 fi 872 873 # Recent errors/failures 874 echo "" 875 echo " Recent errors (last 30 log lines):" 876 tail -30 "$ORCH_LOG" 2>/dev/null | grep -i "error\|WARN\|ALERT\|FATAL\|failed\|CONSERVATION\|usage limit\|store failed" | while read -r line; do 877 echo " $line" 878 done || echo " (none)" 879 else 880 echo " No orchestrator log found for today ($ORCH_LOG)" 881 fi 882 883 # Claude Max usage (from cache — live fetch happens inside orchestrator) 884 USAGE_CACHE="${HOME}/.claude/usage-cache.json" 885 if [ -f "$USAGE_CACHE" ]; then 886 five_pct=$(node -e "try{const d=JSON.parse(require('fs').readFileSync('$USAGE_CACHE','utf8'));process.stdout.write(String(Math.round(d.five_hour||0)));}catch{process.stdout.write('?');}" 2>/dev/null || echo "?") 887 seven_pct=$(node -e "try{const d=JSON.parse(require('fs').readFileSync('$USAGE_CACHE','utf8'));process.stdout.write(String(Math.round(d.seven_day||0)));}catch{process.stdout.write('?');}" 2>/dev/null || echo "?") 888 five_resets=$(node -e "try{const d=JSON.parse(require('fs').readFileSync('$USAGE_CACHE','utf8'));process.stdout.write(d.five_hour_resets_at||'');}catch{}" 2>/dev/null || echo "") 889 is_stale=$(node -e "try{const d=JSON.parse(require('fs').readFileSync('$USAGE_CACHE','utf8'));process.stdout.write(d.stale?'yes':'no');}catch{process.stdout.write('unknown');}" 2>/dev/null || echo "unknown") 890 echo " Claude Max usage: 5h=${five_pct}% weekly=${seven_pct}% (stale=${is_stale}, resets: $five_resets)" 891 if [ "$five_pct" != "?" ] && [ "$five_pct" -ge 80 ] 2>/dev/null; then 892 echo " WARNING: 5h window at ${five_pct}% — orchestrator may be in conservation mode" 893 fi 894 fi 895 echo "" 896 897 # Gates file — last run times 898 GATE_FILE="$PROJECT_DIR/logs/orchestrator-gates.json" 899 if [ -f "$GATE_FILE" ]; then 900 echo "" 901 echo " Frequency gates (last run):" 902 oversee_ts=$(node -e "try{const d=JSON.parse(require('fs').readFileSync('$GATE_FILE','utf8'));process.stdout.write(d.oversee||'never');}catch{process.stdout.write('never');}" 2>/dev/null || echo "unknown") 903 classify_ts=$(node -e "try{const d=JSON.parse(require('fs').readFileSync('$GATE_FILE','utf8'));process.stdout.write(d.classify_errors||'never');}catch{process.stdout.write('never');}" 2>/dev/null || echo "unknown") 904 echo " oversee: $oversee_ts" 905 echo " classify_errors: $classify_ts" 906 fi 907 908 # Reword progress (remaining vs total) 909 reword_remaining=$(sq "SELECT COUNT(*) FROM messages WHERE direction='outbound' AND message_type='outreach' AND approval_status='approved' AND sent_at IS NULL AND delivery_status IS NULL AND reworded_at IS NULL;") 910 reword_done=$(sq "SELECT COUNT(*) FROM messages WHERE direction='outbound' AND message_type='outreach' AND approval_status='approved' AND reworded_at IS NOT NULL;") 911 echo "" 912 echo " Reword progress: ${reword_done:-0} done, ${reword_remaining:-0} remaining" 913 echo "" 914 915 # ════════════════════════════════════════════════════════════════════════════ 916 # 16. OVERSEER HEALTH (runs via orchestrator `oversee` batch every 30min) 917 # ════════════════════════════════════════════════════════════════════════════ 918 echo "" 919 echo "━━━ 16. OVERSEER HEALTH ━━━" 920 echo "# LOOK FOR: oversee gate >35min old = orchestrator not running or stuck." 
921 echo "# Actions: RESTART_PIPELINE, CLEAR_STALE_TASKS, RESET_STUCK_SITES." 922 echo "# Overseer runs inside orchestrator loop (not a cron job) — if orchestrator stopped, overseer stops too." 923 echo "" 924 925 # Primary: check orchestrator gate file for last oversee run 926 if [ -f "$GATE_FILE" ]; then 927 oversee_gate=$(node -e "try{const d=JSON.parse(require('fs').readFileSync('$GATE_FILE','utf8'));process.stdout.write(d.oversee||'');}catch{}" 2>/dev/null || echo "") 928 if [ -n "$oversee_gate" ]; then 929 gate_epoch=$(date -d "$oversee_gate" +%s 2>/dev/null || echo "0") 930 now_epoch=$(date +%s) 931 gate_age_min=$(( (now_epoch - gate_epoch) / 60 )) 932 echo " Last oversee (gate file): $oversee_gate (${gate_age_min}min ago)" 933 if [ "$gate_age_min" -gt 35 ]; then 934 echo " WARNING: Overseer last ran ${gate_age_min}min ago — orchestrator may be stopped or in conservation mode." 935 else 936 echo " Status: on schedule" 937 fi 938 else 939 echo " Oversee gate: never run (orchestrator hasn't completed a 30min cycle yet)" 940 fi 941 else 942 echo " Gate file not found: $GATE_FILE" 943 fi 944 945 # Secondary: legacy cron_jobs row (may still exist from old sonnetOverseer cron) 946 overseer_last=$(sq "SELECT last_run_at FROM ops.cron_jobs WHERE task_key='sonnetOverseer';") 947 if [ -n "$overseer_last" ]; then 948 echo " Legacy cron_jobs row: $overseer_last (historical — cron overseer is now retired)" 949 fi 950 951 echo "" 952 953 # Actions from system_health table (last 24h) 954 echo "── Overseer actions (system_health, last 24h) ──" 955 sq "SELECT action_taken, COUNT(*) FROM tel.system_health WHERE check_type='sonnet_overseer' AND created_at > datetime('now','-1 day') GROUP BY action_taken ORDER BY COUNT(*) DESC;" | while IFS='|' read -r action count; do 956 printf " %-30s %s\n" "$action" "$count" 957 done 958 echo "" 959 960 # Recent orchestrator log entries mentioning overseer outcomes 961 ORCH_LOG_TODAY="$LOGS/orchestrator-$(date -u +%Y-%m-%d).log" 962 if [ -f "$ORCH_LOG_TODAY" ]; then 963 echo "── Recent overseer log entries (today) ──" 964 grep -i "oversee\|RESTART_PIPELINE\|CLEAR_STALE\|RESET_STUCK\|severity" "$ORCH_LOG_TODAY" 2>/dev/null | tail -10 | while read -r line; do 965 echo " $line" 966 done || echo " (none)" 967 fi 968 echo "" 969 970 # ════════════════════════════════════════════════════════════════════════════ 971 # 17. PIPELINE PROGRESS SUMMARY (replaces manual status checks) 972 # ════════════════════════════════════════════════════════════════════════════ 973 echo "" 974 echo "━━━ 17. PIPELINE PROGRESS SUMMARY ━━━" 975 echo "# High-level funnel health. LOOK FOR: evidence collection advancing, proposals generating," 976 echo "# eligible_outreach>0, cooldown expiry dates. Action required if blocked >2 cycles." 
977 echo "" 978 979 echo "── Evidence pipeline ──" 980 evidence_ready=$(sq "SELECT COUNT(*) FROM sites WHERE status IN ('enriched','enriched_llm') AND evidence_json IS NOT NULL AND score < 82;") 981 both_passes=$(sq "SELECT COUNT(*) FROM sites WHERE status IN ('enriched','enriched_llm') AND evidence_pass1_json IS NOT NULL AND evidence_pass2_json IS NOT NULL AND evidence_json IS NULL;") 982 evidence_blocked=$(sq " 983 SELECT COUNT(*) FROM sites 984 WHERE status IN ('enriched','enriched_llm') AND score IS NOT NULL AND score < 82 985 AND (evidence_pass1_json IS NULL OR evidence_pass2_json IS NULL) 986 AND UPPER(country_code) NOT IN ( 987 SELECT UPPER(TRIM(value)) FROM json_each( 988 '[\"' || REPLACE(COALESCE( 989 (SELECT value FROM (SELECT value FROM pragma_compile_options LIMIT 0) LIMIT 1), '' 990 ), ',', '\",\"') || '\"]' 991 ) 992 );") 993 # Simpler: just do the count without dynamic blocked list 994 evidence_backlog=$(sq "SELECT COUNT(*) FROM sites WHERE status IN ('enriched','enriched_llm') AND score < 82 AND (evidence_pass1_json IS NULL OR evidence_pass2_json IS NULL);") 995 printf " Evidence ready (proposals can generate): %s\n" "${evidence_ready:-0}" 996 printf " Both passes done, awaiting merge: %s\n" "${both_passes:-0}" 997 printf " Backlog (needs collection): %s\n" "${evidence_backlog:-0}" 998 echo "" 999 1000 echo "── Outreach eligibility ──" 1001 # Past-cooldown, approved, non-GDPR English countries 1002 eligible_au_us=$(sq " 1003 SELECT COUNT(*) FROM messages m JOIN sites s ON m.site_id=s.id 1004 WHERE m.direction='outbound' AND m.approval_status='approved' 1005 AND m.delivery_status IS NULL AND m.sent_at IS NULL 1006 AND m.contact_method IN ('email','sms') 1007 AND UPPER(s.country_code) IN ('AU','US','NZ','CA','IN','ZA') 1008 AND (s.last_outreach_at IS NULL OR s.last_outreach_at < datetime('now','-3 days'));") 1009 eligible_uk=$(sq " 1010 SELECT COUNT(*) FROM messages m JOIN sites s ON m.site_id=s.id 1011 WHERE m.direction='outbound' AND m.approval_status='approved' 1012 AND m.delivery_status IS NULL AND m.sent_at IS NULL 1013 AND m.contact_method IN ('email','sms') 1014 AND UPPER(s.country_code) IN ('UK','GB','IE') 1015 AND s.gdpr_verified = 1 1016 AND (s.last_outreach_at IS NULL OR s.last_outreach_at < datetime('now','-3 days'));") 1017 cooldown_total=$(sq " 1018 SELECT COUNT(*) FROM messages m JOIN sites s ON m.site_id=s.id 1019 WHERE m.direction='outbound' AND m.approval_status='approved' 1020 AND m.delivery_status IS NULL AND m.sent_at IS NULL 1021 AND m.contact_method IN ('email','sms') 1022 AND s.last_outreach_at >= datetime('now','-3 days');") 1023 earliest_expire=$(sq " 1024 SELECT MIN(datetime(s.last_outreach_at, '+3 days')) 1025 FROM messages m JOIN sites s ON m.site_id=s.id 1026 WHERE m.direction='outbound' AND m.approval_status='approved' 1027 AND m.delivery_status IS NULL AND m.sent_at IS NULL 1028 AND m.contact_method IN ('email','sms') 1029 AND s.last_outreach_at >= datetime('now','-3 days') 1030 AND UPPER(s.country_code) IN ('AU','US','NZ','CA');") 1031 printf " Eligible to send now (AU/US/NZ/CA/IN/ZA): %s\n" "${eligible_au_us:-0}" 1032 printf " Eligible to send now (UK/GB/IE gdpr_verified): %s\n" "${eligible_uk:-0}" 1033 printf " In 3-day cooldown: %s (earliest AU/US expires: %s)\n" "${cooldown_total:-0}" "${earliest_expire:-(none)}" 1034 echo "" 1035 1036 echo "── Last 30min sends (ground truth) ──" 1037 sent_30m=$(sq "SELECT COUNT(*) FROM messages WHERE direction='outbound' AND sent_at > datetime('now','-30 minutes');") 1038 proposals_30m=$(sq "SELECT 

echo "── Last 30min sends (ground truth) ──"
sent_30m=$(sq "SELECT COUNT(*) FROM messages WHERE direction='outbound' AND sent_at > datetime('now','-30 minutes');")
proposals_30m=$(sq "SELECT COUNT(*) FROM messages WHERE direction='outbound' AND message_type='outreach' AND created_at > datetime('now','-30 minutes');")
printf "  Sent in last 30min: %s | New proposals drafted: %s\n" "${sent_30m:-0}" "${proposals_30m:-0}"
echo ""