#!/bin/sh
# Monitoring Checks — System state collector for Claude Code (Tier 3)
# Replaces afk-checks.sh. Runs every ~30 minutes during AFK sessions.
# Outputs structured system state with inline analysis reminders.
# Also includes sonnet overseer health checks and Claude batch processor status.
# POSIX sh for NixOS compatibility. Uses sqlite3 CLI for speed (<10s).
#
# Usage:
#   bash scripts/monitoring-checks.sh           # run check cycle
#   bash scripts/monitoring-checks.sh --reset   # delete snapshot and start fresh

# No set -e: this is a data collector; we want ALL sections to run even if one query fails

# ── Config ──────────────────────────────────────────────────────────────────
PROJECT_DIR="${PROJECT_DIR:-/home/jason/code/333Method}"
DB="$PROJECT_DIR/db/sites.db"
OPS_DB="$PROJECT_DIR/db/ops.db"
TEL_DB="$PROJECT_DIR/db/telemetry.db"
LOGS="$PROJECT_DIR/logs"
# Wrapper: run sqlite3 with a 5s busy timeout so we never hang when the pipeline holds the lock.
# ATTACHes ops.db (as "ops") and telemetry.db (as "tel") so split-schema queries work.
# Usage: sq "SELECT ..."  (replaces bare sqlite3 "$DB" "..." calls)
# .timeout is a sqlite3 dot-command that sets busy_timeout without producing any output.
sq() {
  sqlite3 "$DB" ".timeout 5000" \
    "ATTACH DATABASE '$OPS_DB' AS ops;" \
    "ATTACH DATABASE '$TEL_DB' AS tel;" \
    "$@" 2>/dev/null
}
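# Example (illustrative — the join column below is an assumption, check the real schema):
#   sq "SELECT s.domain, t.status FROM sites s JOIN tel.agent_tasks t ON t.site_id = s.id LIMIT 5;"
# Note: passing several SQL/dot-command arguments after the filename like this
# requires a sqlite3 CLI new enough to execute each argument in order (3.34+).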
SNAPSHOT="/tmp/afk-check-snapshot.txt"
NOW_UTC=$(date -u +%Y-%m-%dT%H:%M:%SZ)
TODAY=$(date -u +%Y-%m-%d)
HALF_HOUR_AGO=$(date -u -d '30 minutes ago' +%Y-%m-%dT%H:%M:%S 2>/dev/null || date -u -v-30M +%Y-%m-%dT%H:%M:%S 2>/dev/null || echo "")
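# Portability note: GNU date takes -d '30 minutes ago'; BSD date uses -v-30M.
# Trying both (falling back to "") keeps this runnable on either userland.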

if [ "$1" = "--reset" ]; then
  rm -f "$SNAPSHOT"
  echo "Snapshot cleared. Next run will be baseline."
  exit 0
fi

# ── Load previous snapshot (if any) ────────────────────────────────────────
PREV_EXISTS=false
if [ -f "$SNAPSHOT" ]; then
  PREV_EXISTS=true
  # shellcheck disable=SC1090
  . "$SNAPSHOT"
fi
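# The snapshot is a file of plain sh assignments written at the end of this
# script (values here are illustrative):
#   prev_snap_site_found=120
#   prev_agent_pending=4
# Sourcing it makes each prev_* value visible to delta() below.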

# Helper: compute delta (current - previous). Returns "NEW" if no previous.
delta() {
  curr="$1"
  prev_var="prev_$2"
  eval prev_val="\${$prev_var:-}"
  if [ -z "$prev_val" ]; then
    echo "NEW"
  else
    d=$((curr - prev_val))
    if [ "$d" -gt 0 ]; then echo "+$d"
    elif [ "$d" -lt 0 ]; then echo "$d"
    else echo "=0 (STALL?)"
    fi
  fi
}
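# Example: with prev_snap_site_found=120 sourced from the snapshot,
#   delta 125 snap_site_found   # prints "+5"
#   delta 120 snap_site_found   # prints "=0 (STALL?)"
#   delta 7   never_seen_key    # prints "NEW"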

echo "╔══════════════════════════════════════════════════════════════════════╗"
echo "║            MONITORING CHECK CYCLE — $NOW_UTC             ║"
echo "╚══════════════════════════════════════════════════════════════════════╝"
echo ""

# ════════════════════════════════════════════════════════════════════════════
# SECTION A: PIPELINE / OUTREACH / INBOUND STATUS (canonical: npm run status)
# ════════════════════════════════════════════════════════════════════════════
echo "━━━ A. STATUS (canonical npm run status output) ━━━"
echo "# LOOK FOR: new24h and new1h columns show sites that arrived at a stage recently."
echo "# →24h shows all that entered (including ones that moved on). new24h = still here."
echo "# Outreach new24h/new1h now uses updated_at (trigger 070) — accurate since 2026-03-02."
echo "# Agent tasks at bottom: piling up = agent system struggling."
echo ""

# npm run status is the single source of truth — it uses the site_status transition log
# for pipeline deltas (not inflated updated_at), and the updated_at trigger for outreach deltas.
cd "$PROJECT_DIR" && npm run status --silent 2>/dev/null || echo "  (npm run status failed)"
echo ""

echo "── Agent Task Status (snapshot delta vs last check) ──"
sq "SELECT status, COUNT(*) FROM tel.agent_tasks GROUP BY status ORDER BY COUNT(*) DESC;" | while IFS='|' read -r status count; do
  # Sanitize status the same way the snapshot writer does, so the prev_agent_*
  # key actually matches (raw statuses with non-alphanumeric chars would miss).
  safe=$(echo "$status" | sed 's/[^a-zA-Z0-9]/_/g')
  d=$(delta "$count" "agent_$safe")
  printf "  %-25s %7s  (delta: %s)\n" "$status" "$count" "$d"
done
echo ""

echo "── Snapshot deltas (raw counts vs last check — cross-reference with new24h above) ──"
echo "# These are simple current-minus-previous count deltas, not time-windowed."
echo "# Use to spot movement between THIS check and the PREVIOUS one (~30 min apart)."
site_found=$(sq "SELECT COUNT(*) FROM sites WHERE status='found';")
site_assets=$(sq "SELECT COUNT(*) FROM sites WHERE status='assets_captured';")
site_scored=$(sq "SELECT COUNT(*) FROM sites WHERE status='prog_scored';")
site_rescored=$(sq "SELECT COUNT(*) FROM sites WHERE status IN ('semantic_scored','vision_scored');")
site_enriched=$(sq "SELECT COUNT(*) FROM sites WHERE status='enriched';")
site_proposals=$(sq "SELECT COUNT(*) FROM sites WHERE status='proposals_drafted';")
site_sent=$(sq "SELECT COUNT(*) FROM sites WHERE status='outreach_sent';")
outreach_approved=$(sq "SELECT COUNT(*) FROM messages WHERE direction='outbound' AND approval_status='approved' AND delivery_status IS NULL;")
outreach_sent=$(sq "SELECT COUNT(*) FROM messages WHERE direction='outbound' AND delivery_status IN ('sent','delivered','bounced');")
printf "  %-25s %7s  (delta: %s)\n" "sites:found" "${site_found:-0}" "$(delta "${site_found:-0}" "snap_site_found")"
printf "  %-25s %7s  (delta: %s)\n" "sites:assets_captured" "${site_assets:-0}" "$(delta "${site_assets:-0}" "snap_site_assets")"
printf "  %-25s %7s  (delta: %s)\n" "sites:prog_scored" "${site_scored:-0}" "$(delta "${site_scored:-0}" "snap_site_scored")"
printf "  %-25s %7s  (delta: %s)\n" "sites:semantic/vision" "${site_rescored:-0}" "$(delta "${site_rescored:-0}" "snap_site_rescored")"
printf "  %-25s %7s  (delta: %s)\n" "sites:enriched" "${site_enriched:-0}" "$(delta "${site_enriched:-0}" "snap_site_enriched")"
printf "  %-25s %7s  (delta: %s)\n" "sites:proposals_drafted" "${site_proposals:-0}" "$(delta "${site_proposals:-0}" "snap_site_proposals")"
printf "  %-25s %7s  (delta: %s)\n" "sites:outreach_sent" "${site_sent:-0}" "$(delta "${site_sent:-0}" "snap_site_sent")"
printf "  %-25s %7s  (delta: %s)\n" "outreach:approved" "${outreach_approved:-0}" "$(delta "${outreach_approved:-0}" "snap_out_approved")"
printf "  %-25s %7s  (delta: %s)\n" "outreach:sent+bounced" "${outreach_sent:-0}" "$(delta "${outreach_sent:-0}" "snap_out_sent")"
echo ""

# ════════════════════════════════════════════════════════════════════════════
# SECTION A2: AGENT MONITOR HEALTH (last orchestrator finding)
# ════════════════════════════════════════════════════════════════════════════
echo "━━━ A2. AGENT MONITOR HEALTH (last Haiku finding) ━━━"
echo "# Source: orchestrator monitor_health batch (runs every 60min via claude-orchestrator.sh)"
echo "# ACTION REQUIRED if severity=critical — do not wait for next check cycle."
echo ""

ORCH_LOG_TODAY="$LOGS/orchestrator-$(date +%Y-%m-%d).log"
[ ! -f "$ORCH_LOG_TODAY" ] && ORCH_LOG_TODAY="$LOGS/orchestrator-$(date -u +%Y-%m-%d).log"
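# Expected log shape (illustrative): a "Health check [severity]" line followed
# by two-space-indented finding/action lines. The parser below grabs the last
# such line plus up to 20 following lines, stopping at the first non-indented one.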

if [ -f "$ORCH_LOG_TODAY" ]; then
  # Strip ANSI codes, find the last "Health check [" line and capture it plus
  # the indented lines that follow it (findings and actions).
  last_health=$(sed 's/\x1b\[[0-9;]*m//g' "$ORCH_LOG_TODAY" 2>/dev/null \
    | grep -n "Health check \[" | tail -1)

  if [ -n "$last_health" ]; then
    line_num=$(echo "$last_health" | cut -d: -f1)
    sed 's/\x1b\[[0-9;]*m//g' "$ORCH_LOG_TODAY" 2>/dev/null \
      | sed -n "${line_num},$((line_num + 20))p" \
      | while IFS= read -r line; do
          case "$line" in
            "  "*)  echo "  $line" ;;
            *"Health check ["*)  echo "  $line" ;;
            *)  break ;;
          esac
        done

    # Check when this finding was made
    health_ts=$(sed 's/\x1b\[[0-9;]*m//g' "$ORCH_LOG_TODAY" 2>/dev/null \
      | grep "Health check \[" | tail -1 \
      | grep -oE '[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}' | head -1)
    if [ -n "$health_ts" ]; then
      health_epoch=$(date -d "$health_ts" +%s 2>/dev/null || echo "0")
      now_epoch=$(date +%s)
      if [ "$health_epoch" -gt 0 ]; then
        health_age_min=$(( (now_epoch - health_epoch) / 60 ))
        printf "\n  (Finding is %dmin old — next due in ~%dmin)\n" \
          "$health_age_min" "$((60 - health_age_min > 0 ? 60 - health_age_min : 0))"
        if [ "$health_age_min" -gt 75 ]; then
          echo "  WARN: No monitor_health run in ${health_age_min}min — orchestrator may be stopped."
        fi
      fi
    fi
  else
    echo "  No monitor_health results found in today's orchestrator log."
    echo "  (batch is gated to run every 60min — may not have fired yet today)"
  fi
else
  echo "  No orchestrator log for today."
fi
echo ""

# ════════════════════════════════════════════════════════════════════════════
# SECTION B: BROWSER LOOP HEALTH
# ════════════════════════════════════════════════════════════════════════════
echo "━━━ B. BROWSER LOOP HEALTH ━━━"
echo "# LOOK FOR: Browser loop stale >30min while API loop is recent = hung Playwright page."
echo "# The checkBrowserLoopHung() watchdog in Tier 1 should auto-restart, but verify it did."
echo ""

pc_row=$(sq "SELECT last_browser_loop_at, last_api_loop_at FROM ops.pipeline_control WHERE id=1;")
browser_ts=$(echo "$pc_row" | cut -d'|' -f1)
api_ts=$(echo "$pc_row" | cut -d'|' -f2)
if [ -n "$browser_ts" ] && [ -n "$api_ts" ]; then
  echo "  Browser loop last cycle: $browser_ts"
  echo "  API loop last cycle:     $api_ts"
  browser_epoch=$(date -u -d "${browser_ts}Z" +%s 2>/dev/null || echo "0")
  api_epoch=$(date -u -d "${api_ts}Z" +%s 2>/dev/null || echo "0")
  now_epoch=$(date -u +%s)
  if [ "$browser_epoch" -gt 0 ] && [ "$api_epoch" -gt 0 ]; then
    browser_age_min=$(( (now_epoch - browser_epoch) / 60 ))
    api_age_min=$(( (now_epoch - api_epoch) / 60 ))
    echo "  Browser age: ${browser_age_min}min    API age: ${api_age_min}min"
    # Read from the project .env explicitly so this works even if the earlier cd failed
    enable_vision=$(grep "^ENABLE_VISION=" "$PROJECT_DIR/.env" 2>/dev/null | head -1 | cut -d= -f2 | tr -d '"' | tr '[:upper:]' '[:lower:]')
    if [ "$enable_vision" = "false" ]; then
      echo "  ℹ️  ENABLE_VISION=false — browser loop not expected to run (HTML-only mode)"
    elif [ "$browser_age_min" -gt 30 ] && [ "$api_age_min" -lt 30 ]; then
      echo "  ⚠️  BROWSER LOOP HUNG — stale ${browser_age_min}min while API active ${api_age_min}min"
    else
      echo "  ✓ Both loops active"
    fi
  fi
else
  echo "  (no pipeline_control timestamps yet)"
fi
echo ""

# ════════════════════════════════════════════════════════════════════════════
# SECTION C: API ERROR RATES (ALL APIS, LAST 30 MIN)
# ════════════════════════════════════════════════════════════════════════════
echo "━━━ C. API ERROR RATES (last 30 min) ━━━"
echo "# LOOK FOR: >10% error rate for any API = investigate. >30% = circuit breaker territory."
echo "# Check ALL APIs: OpenRouter (scoring/enrich), ZenRows (serps), Twilio/Resend (outreach)."
echo "# Burst errors at same timestamp = concurrency issue (rate limiter not wired)."
echo ""

# Build a grep -E alternation of minute prefixes covering the last ~35 minutes,
# in both UTC and Australia/Sydney local time, so we match log lines regardless
# of whether their bracketed ISO-8601 timestamp is UTC or local-offset, e.g.
# [2026-03-04T01:15:00.123+11:00]. Matching minute prefixes is simpler and more
# reliable than timezone-converting every timestamp. If the pattern can't be
# built (no working date variant), we fall back to tail -1000 per file
# (roughly 30 minutes of log activity).
_ts_filter=""
_i=0
while [ "$_i" -le 35 ]; do
  _ts_filter="$_ts_filter $(date -u -d "${_i} minutes ago" +%Y-%m-%dT%H:%M 2>/dev/null || date -u -v-${_i}M +%Y-%m-%dT%H:%M 2>/dev/null)"
  _i=$((_i + 1))
done
# Same minute prefixes in Sydney local time (covers AEST/AEDT log timestamps)
_tz_filter=""
_i=0
while [ "$_i" -le 35 ]; do
  _tz_filter="$_tz_filter $(TZ=Australia/Sydney date -d "${_i} minutes ago" +%Y-%m-%dT%H:%M 2>/dev/null || TZ=Australia/Sydney date -v-${_i}M +%Y-%m-%dT%H:%M 2>/dev/null)"
  _i=$((_i + 1))
done

# Collapse the collected minute strings into one grep -E alternation pattern
_all_minutes="$_ts_filter $_tz_filter"
_awk_pat=$(echo "$_all_minutes" | tr ' ' '\n' | grep -v '^$' | sort -u | tr '\n' '|' | sed 's/|$//')
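# Resulting pattern is a sorted alternation like (illustrative values):
#   2026-03-04T01:10|2026-03-04T01:11|...|2026-03-04T12:15
# grep -E with it keeps only lines whose timestamp contains one of the last
# ~35 minute prefixes, in either timezone.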

for logprefix in pipeline scoring rescoring enrich serps outreach proposals; do
  # Use the most recently modified log for each prefix (handles local-time day-boundary log rotation)
  logfile=$(ls -t "$LOGS"/"$logprefix"-*.log 2>/dev/null | head -1)
  if [ -f "$logfile" ]; then
    # Filter to actual last-30-min lines using timestamp prefixes
    if [ -n "$_awk_pat" ]; then
      recent=$(grep -E "$_awk_pat" "$logfile" 2>/dev/null)
    else
      recent=$(tail -1000 "$logfile" 2>/dev/null)
    fi
    # grep -c always prints a count (exiting 1 on zero matches), so default the
    # variable with :-0 rather than chaining "|| echo 0", which would capture
    # a mangled two-line value.
    errors=$(echo "$recent" | grep -c 'status code [45][0-9][0-9]\|Timed out\|ETIMEDOUT\|ECONNRESET\|ECONNREFUSED\|Breaker is open' 2>/dev/null)
    errors=${errors:-0}
    total=$(echo "$recent" | wc -l | tr -d ' ')
    if [ "$total" -gt 0 ] 2>/dev/null && [ "$errors" -gt 0 ] 2>/dev/null; then
      pct=$((errors * 100 / total))
      printf "  %-15s  errors: %5s / %7s lines  (%d%%)\n" "$logprefix" "$errors" "$total" "$pct"
    else
      printf "  %-15s  errors: 0\n" "$logprefix"
    fi
  fi
done
echo ""

# Show actual error lines from last 30 min (sample) — uses _awk_pat for timestamp filtering
echo "── Recent error samples (last 10 unique, last 30 min) ──"
for logprefix in pipeline scoring rescoring enrich serps outreach proposals app cron; do
  logfile=$(ls -t "$LOGS"/"$logprefix"-*.log 2>/dev/null | head -1)
  if [ -f "$logfile" ]; then
    if [ -n "$_awk_pat" ]; then
      grep -E "$_awk_pat" "$logfile" 2>/dev/null | grep -i 'ERROR\|status code [45]' | tail -5
    else
      tail -200 "$logfile" 2>/dev/null | grep -i 'ERROR\|status code [45]' | tail -5
    fi
  fi
done | sort -u | tail -10
echo ""

# ════════════════════════════════════════════════════════════════════════════
# SECTION D: OUTREACH FUNNEL DETAIL
# ════════════════════════════════════════════════════════════════════════════
echo "━━━ D. OUTREACH FUNNEL DETAIL ━━━"
echo "# LOOK FOR: Approved snapshot delta not decreasing between checks = outreach stage blocked."
echo "# Verify OUTREACH_SKIP_METHODS is correctly excluded in SQL (not just code-level skip)."
echo "# Check if last_outreach_at 3-day cooldown is blocking eligible sites."
echo "# Note: npm run status (Section A) shows new24h/new1h for all outreach statuses."
echo ""

echo "── Approved outreaches by channel (snapshot delta) ──"
sq "SELECT contact_method, COUNT(*) FROM messages WHERE direction='outbound' AND approval_status='approved' AND delivery_status IS NULL GROUP BY contact_method ORDER BY contact_method;" | while IFS='|' read -r method count; do
  d=$(delta "$count" "approved_$method")
  printf "  %-12s %7s  (delta: %s)\n" "$method" "$count" "$d"
done
echo ""

echo "── Actually delivered (ground truth: delivered_at in last 24h, any terminal status) ──"
echo "# This is the real 'how many went out' — includes sent+bounced, excludes pending/approved"
sq "SELECT contact_method, delivery_status, COUNT(*) FROM messages WHERE direction='outbound' AND delivered_at > datetime('now','-24 hours') GROUP BY contact_method, delivery_status ORDER BY contact_method;" | while IFS='|' read -r method status count; do
  printf "  %-12s %-12s %s\n" "$method" "$status" "$count"
done
delivered_total=$(sq "SELECT COUNT(*) FROM messages WHERE direction='outbound' AND delivered_at > datetime('now','-24 hours');")
printf "  Total delivered_at in 24h: %s (sent + bounced combined)\n" "${delivered_total:-0}"
echo ""

echo "── SMS fail rate (1d) — ALERT if >10% ──"
sms_sent_1d=$(sq "SELECT SUM(CASE WHEN delivery_status IN ('sent','delivered') THEN 1 ELSE 0 END) FROM messages WHERE direction='outbound' AND contact_method='sms' AND updated_at > datetime('now','-1 day');")
sms_failed_1d=$(sq "SELECT SUM(CASE WHEN delivery_status='failed' THEN 1 ELSE 0 END) FROM messages WHERE direction='outbound' AND contact_method='sms' AND updated_at > datetime('now','-1 day');")
sms_sent_1d="${sms_sent_1d:-0}"
sms_failed_1d="${sms_failed_1d:-0}"
sms_denom_1d=$((sms_sent_1d + sms_failed_1d))
if [ "$sms_denom_1d" -gt 0 ]; then
  sms_fail_pct_1d=$(( sms_failed_1d * 100 / sms_denom_1d ))
  printf "  SMS 1d: %s sent, %s failed (%s%%)\n" "$sms_sent_1d" "$sms_failed_1d" "$sms_fail_pct_1d"
  if [ "$sms_fail_pct_1d" -gt 10 ]; then
    printf "  *** HIGH SMS FAIL RATE: %s%% — check error_message breakdown ***\n" "$sms_fail_pct_1d"
    sq "SELECT error_message, COUNT(*) FROM messages WHERE direction='outbound' AND contact_method='sms' AND delivery_status='failed' AND updated_at > datetime('now','-1 day') GROUP BY error_message ORDER BY COUNT(*) DESC LIMIT 5;" | while IFS='|' read -r errmsg cnt; do
      printf "    [%s] %s\n" "$cnt" "$errmsg"
    done
  fi
else
  printf "  SMS 1d: no attempts\n"
fi
echo ""

# ════════════════════════════════════════════════════════════════════════════
# SECTION E: OVERSEER RESET DETECTION
# ════════════════════════════════════════════════════════════════════════════
echo "━━━ E. OVERSEER RESET DETECTION ━━━"
echo "# LOOK FOR: Count not decreasing between checks = overseer resetting sites that get stuck again."
echo "# This is a loop: reset → fail → reset → fail. Need anti-loop detection in overseer."
echo ""

overseer_resets=$(sq "SELECT COUNT(*) FROM sites WHERE error_message LIKE '%reset by sonnet overseer%';")
d=$(delta "$overseer_resets" "overseer_resets")
echo "  Sites with overseer reset error: $overseer_resets  (delta: $d)"

echo "── By stage ──"
sq "SELECT status, COUNT(*) FROM sites WHERE error_message LIKE '%reset by sonnet overseer%' GROUP BY status;" | while IFS='|' read -r status count; do
  printf "  %-25s %s\n" "$status" "$count"
done
echo ""

# ════════════════════════════════════════════════════════════════════════════
# SECTION F: AGENT TASKS (last 2 hours)
# ════════════════════════════════════════════════════════════════════════════
echo "━━━ F. AGENT TASKS (last 2h) ━━━"
echo "# LOOK FOR: >3 tasks of same type blocked with similar error = systemic agent failure."
echo "# Fix root cause, don't just clear tasks. Check if a fix_bug task already exists."
echo ""

echo "── Pending triage_fix tasks (Haiku-validated, high/critical errors) ──"
echo "# These were created by the triage_errors batch. Each represents a classified bug or infra issue."
echo "# Priority 9 = critical, priority 7 = high."
echo ""
triage_pending=$(sq "SELECT COUNT(*) FROM tel.agent_tasks WHERE task_type='triage_fix' AND status='pending';")
if [ "${triage_pending:-0}" -gt 0 ]; then
  echo "  *** ${triage_pending} pending triage_fix task(s) — review and action: ***"
  sq ".mode list
.separator |
SELECT id, priority, created_at,
    json_extract(context_json,'\$.severity') as sev,
    json_extract(context_json,'\$.error_type') as etype,
    substr(json_extract(context_json,'\$.summary'),1,80) as summary,
    substr(json_extract(context_json,'\$.suggested_fix'),1,100) as fix
FROM tel.agent_tasks
WHERE task_type='triage_fix' AND status='pending'
ORDER BY priority DESC, created_at ASC
LIMIT 10;" | while IFS='|' read -r tid prio created sev etype summary fix; do
    printf "  [id=%s pri=%s sev=%s type=%s]\n" "$tid" "$prio" "$sev" "$etype"
    printf "    Summary:     %s\n" "$summary"
    printf "    Suggested:   %s\n" "$fix"
    printf "    Created:     %s\n" "$created"
    echo ""
  done
else
  echo "  No pending triage_fix tasks."
fi
echo ""

sq ".mode column
.headers on
SELECT id, task_type, assigned_to, status, substr(error_message,1,60) as error, created_at
FROM tel.agent_tasks
WHERE created_at > datetime('now','-2 hours')
ORDER BY created_at DESC
LIMIT 30;"
echo ""

echo "── Blocked task patterns ──"
sq "SELECT task_type, substr(error_message,1,50), COUNT(*) as n
FROM tel.agent_tasks
WHERE status='blocked' AND created_at > datetime('now','-24 hours')
GROUP BY task_type, substr(error_message,1,50)
HAVING n > 1
ORDER BY n DESC;" | while IFS='|' read -r type err count; do
  printf "  [%sx] %-15s %s\n" "$count" "$type" "$err"
done
echo ""

# ════════════════════════════════════════════════════════════════════════════
# SECTION G: CRON HEALTH
# ════════════════════════════════════════════════════════════════════════════
echo "━━━ G. CRON HEALTH ━━━"
echo "# LOOK FOR: Any cron job not run in >2x its interval = cron itself is broken."
echo "# Check: systemctl --user status mmo-cron.timer"
echo ""
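# Age math below: julianday() returns fractional days, so
# (julianday('now') - julianday(last_run_at)) * 24 * 60 is minutes since last run.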

sq ".mode column
.headers on
SELECT task_key,
       interval_value || ' ' || interval_unit as interval,
       last_run_at,
       CASE WHEN last_run_at IS NULL THEN 'NEVER'
            ELSE CAST(ROUND((julianday('now') - julianday(last_run_at)) * 24 * 60) AS INTEGER) || 'min ago'
       END as age,
       enabled
FROM ops.cron_jobs
WHERE enabled = 1
ORDER BY last_run_at ASC
LIMIT 25;"
echo ""

# Flag overdue jobs.
# For minute-interval jobs: use 6x threshold to avoid false positives from the
# NixOS OnUnitActiveSec bug (Check 6 resets timer every ~5min — so 1-min jobs
# can legitimately gap up to 5min between runs under the workaround).
echo "── Overdue cron jobs ──"
sq "SELECT task_key, interval_value, interval_unit, last_run_at
FROM ops.cron_jobs
WHERE enabled = 1
  AND last_run_at IS NOT NULL
  AND (
    (interval_unit = 'minutes' AND interval_value = 1 AND (julianday('now') - julianday(last_run_at)) * 24 * 60 > 6)
    OR (interval_unit = 'minutes' AND interval_value > 1 AND (julianday('now') - julianday(last_run_at)) * 24 * 60 > interval_value * 2)
    OR (interval_unit = 'hours' AND (julianday('now') - julianday(last_run_at)) * 24 > interval_value * 2)
    OR (interval_unit = 'days' AND (julianday('now') - julianday(last_run_at)) > interval_value * 2)
  );" | while IFS='|' read -r key iv iu last; do
  echo "  ⚠️  OVERDUE: $key (every $iv $iu, last ran $last)"
done
echo ""

# ════════════════════════════════════════════════════════════════════════════
# SECTION H: CIRCUIT BREAKERS / RATE LIMITS
# ════════════════════════════════════════════════════════════════════════════
echo "━━━ H. RATE LIMITS & CIRCUIT BREAKERS ━━━"
echo "# LOOK FOR: Any entry with resetAt in the past = auto-clear failed."
echo "# Fix: npm run rate-limits:clear or restart pipeline."
echo ""

if [ -f "$LOGS/rate-limits.json" ]; then
  cat "$LOGS/rate-limits.json"
else
  echo "  (no rate-limits.json — no active rate limits)"
fi
echo ""

# ════════════════════════════════════════════════════════════════════════════
# SECTION I: ZOMBIES & RUNAWAY PROCESSES
# ════════════════════════════════════════════════════════════════════════════
echo "━━━ I. ZOMBIES & PROCESSES ━━━"
echo "# LOOK FOR: >50 zombies = container restart needed (tell user)."
echo "# Any chromium/node process using >30min CPU or >50% CPU = likely hung."
echo ""
# grep -c prints "0" (and exits 1) on zero matches, so default with :-0
# rather than "|| echo 0", which would capture a two-line value.
zombie_count=$(ps -eo stat --no-headers 2>/dev/null | grep -c '^Z')
zombie_count=${zombie_count:-0}
echo "  Zombie processes: $zombie_count"
echo ""

echo "── Top chromium/node/npm processes (by CPU) ──"
printf "  %-7s %5s %5s %10s  %s\n" "PID" "CPU%" "MEM%" "TIME" "CMD"
ps -eo pid,pcpu,pmem,time,args --sort=-pcpu --no-headers 2>/dev/null | grep -E 'chromium|node|npm' | grep -v grep | head -15 | while read -r pid cpu mem time cmd; do
  short_cmd=$(echo "$cmd" | cut -c1-60)
  printf "  %-7s %5s %5s %10s  %s\n" "$pid" "$cpu" "$mem" "$time" "$short_cmd"
done
echo ""

# ════════════════════════════════════════════════════════════════════════════
# SECTION J: SYSTEM LOAD
# ════════════════════════════════════════════════════════════════════════════
echo "━━━ J. SYSTEM LOAD ━━━"
echo "# LOOK FOR: <500MB free memory = reduce BROWSER_CONCURRENCY."
echo "# >90% disk = run npm run logs:rotate. High load avg = too many concurrent processes."
echo ""

echo "── Uptime & Load ──"
uptime 2>/dev/null
echo ""

echo "── Memory ──"
free -m 2>/dev/null
echo ""

echo "── Disk ──"
df -h "$PROJECT_DIR" /tmp 2>/dev/null
echo ""

# ════════════════════════════════════════════════════════════════════════════
# SECTION K: ALL LOGS (last 30 min, tail of each)
# ════════════════════════════════════════════════════════════════════════════
echo "━━━ K. RECENT LOGS (last 30 min) ━━━"
echo "# LOOK FOR: Unhandled exceptions, repeated error patterns, novel error types."
echo "# This is raw data for higher-order analysis — find what the monitors missed."
echo ""
# Find all logs modified in last 30 min (regardless of date suffix — handles local-time day-boundary rotation)
find "$LOGS" -name "*.log" -mmin -30 -type f 2>/dev/null | grep -v "pipeline-status\|pipeline-2\|collect-metrics" | sort | while read -r f; do
  bn=$(basename "$f")
  lines=$(wc -l < "$f" 2>/dev/null || echo "0")
  echo "── $bn ($lines lines total, last 50) ──"
  tail -50 "$f" 2>/dev/null
  echo ""
done

# Also check pipeline-status.txt for recent Tier 2 output
if [ -f "$LOGS/pipeline-status.txt" ]; then
  echo "── pipeline-status.txt (last 30 lines) ──"
  tail -30 "$LOGS/pipeline-status.txt" 2>/dev/null
  echo ""
fi

# ════════════════════════════════════════════════════════════════════════════
# SECTION L: SITES PROCESSED REPEATEDLY
# ════════════════════════════════════════════════════════════════════════════
echo "━━━ L. SITES PROCESSED REPEATEDLY ━━━"
echo "# LOOK FOR: recapture_count >5 = site in a loop. Investigate why it fails after recapture."
echo "# Consider marking as 'failing' if root cause is permanent (site down, blocked, etc)."
echo ""

sq ".mode column
.headers on
SELECT id, domain, status, recapture_count, substr(error_message,1,50) as error
FROM sites
WHERE recapture_count > 3
  AND status NOT IN ('ignored','failing','high_score','outreach_sent')
ORDER BY recapture_count DESC
LIMIT 20;"
echo ""

# ════════════════════════════════════════════════════════════════════════════
# SECTION M: PERFORMANCE METRICS (last 30 min)
# ════════════════════════════════════════════════════════════════════════════
echo "━━━ M. PERFORMANCE METRICS (last 30 min) ━━━"
echo "# LOOK FOR: Success rate <90% = investigate that stage."
echo "# Avg duration increasing vs last check = resource exhaustion or payload issue."
echo "# Throughput (sites/min) declining = bottleneck forming."
echo ""

sq ".mode column
.headers on
SELECT stage_name,
       COUNT(*) as runs,
       SUM(sites_succeeded) as ok,
       SUM(sites_failed) as fail,
       ROUND(AVG(duration_ms)/1000.0, 1) as avg_sec,
       ROUND(CAST(SUM(sites_succeeded) AS REAL) / NULLIF(SUM(sites_succeeded) + SUM(sites_failed), 0) * 100, 1) as success_pct
FROM tel.pipeline_metrics
WHERE started_at > datetime('now','-30 minutes')
GROUP BY stage_name
ORDER BY stage_name;"
echo ""

echo "── Stage duration trend (last 2h, avg per 30min window) ──"
sq "SELECT stage_name,
       CASE WHEN started_at > datetime('now','-30 minutes') THEN '0-30min'
            WHEN started_at > datetime('now','-60 minutes') THEN '30-60min'
            WHEN started_at > datetime('now','-90 minutes') THEN '60-90min'
            ELSE '90-120min'
       END as window,
       ROUND(AVG(duration_ms)/1000.0,1) as avg_sec,
       SUM(sites_succeeded) as ok
FROM tel.pipeline_metrics
WHERE started_at > datetime('now','-2 hours')
GROUP BY stage_name, window
ORDER BY stage_name, window;" | while IFS='|' read -r stage window avg ok; do
  printf "  %-12s %-10s  avg: %6ss  sites: %s\n" "$stage" "$window" "$avg" "$ok"
done
echo ""

# ════════════════════════════════════════════════════════════════════════════
# SECTION R: BACKUP HEALTH
# ════════════════════════════════════════════════════════════════════════════
echo "━━━ R. BACKUP HEALTH ━━━"
echo "# LOOK FOR: zero-sized or stale backups, or an unmounted symlink target = backups silently failing."
echo ""

BACKUP_DIR="$PROJECT_DIR/db/backup"
if [ -L "$BACKUP_DIR" ]; then
  BACKUP_TARGET=$(readlink -f "$BACKUP_DIR" 2>/dev/null)
  if [ ! -d "$BACKUP_TARGET" ]; then
    echo "  ⚠️  CRITICAL: Backup symlink target not accessible: $BACKUP_TARGET"
    echo "     External drive may be unmounted. NO BACKUPS ARE RUNNING."
  else
    # Find newest backup file
    NEWEST_BACKUP=$(ls -t "$BACKUP_DIR"/sites-backup-*.db 2>/dev/null | head -1)
    if [ -z "$NEWEST_BACKUP" ]; then
      echo "  ⚠️  CRITICAL: No backup files found in $BACKUP_DIR"
    else
      BACKUP_SIZE=$(stat -c%s "$NEWEST_BACKUP" 2>/dev/null || echo "0")
      BACKUP_AGE_H=$(( ($(date +%s) - $(stat -c%Y "$NEWEST_BACKUP" 2>/dev/null || echo "0")) / 3600 ))
      BACKUP_NAME=$(basename "$NEWEST_BACKUP")

      if [ "$BACKUP_SIZE" -lt 1024 ]; then
        echo "  ⚠️  CRITICAL: Newest backup is ZERO/CORRUPT: $BACKUP_NAME (${BACKUP_SIZE} bytes)"
        echo "     Backups may have been failing silently. Check backupDatabase cron logs."
        # Check how many recent backups are zero-sized
        ZERO_COUNT=$(find "$BACKUP_DIR" -name "sites-backup-*.db" -size -1k 2>/dev/null | wc -l)
        TOTAL_COUNT=$(find "$BACKUP_DIR" -name "sites-backup-*.db" 2>/dev/null | wc -l)
        echo "     Zero-sized backups: $ZERO_COUNT / $TOTAL_COUNT total"
      elif [ "$BACKUP_AGE_H" -gt 36 ]; then
        echo "  ⚠️  WARNING: Newest backup is ${BACKUP_AGE_H}h old: $BACKUP_NAME"
      else
        echo "  ✓ Latest backup: $BACKUP_NAME ($(numfmt --to=iec "$BACKUP_SIZE"), ${BACKUP_AGE_H}h ago)"
      fi
    fi
  fi
elif [ -d "$BACKUP_DIR" ]; then
  echo "  Backup dir is a real directory (not a symlink to the external drive)"
else
  echo "  ⚠️  CRITICAL: Backup directory does not exist: $BACKUP_DIR"
fi
echo ""

# ════════════════════════════════════════════════════════════════════════════
# SECTION N: SNAPSHOT MANAGEMENT
# ════════════════════════════════════════════════════════════════════════════

# ── Site count canary: detect catastrophic data loss ──
CURRENT_TOTAL_SITES=$(sq "SELECT COUNT(*) FROM sites;" 2>/dev/null || echo "0")
CURRENT_TOTAL_SITES=${CURRENT_TOTAL_SITES:-0}  # guard empty output so the -eq tests below can't error
if [ "$PREV_EXISTS" = "true" ] && [ -n "${prev_snap_total_sites:-}" ]; then
  if [ "${prev_snap_total_sites}" -gt 100 ] && [ "$CURRENT_TOTAL_SITES" -eq 0 ]; then
    echo "  🚨 CRITICAL: SITE COUNT IS ZERO (was ${prev_snap_total_sites})"
    echo "     DATABASE HAS BEEN WIPED. Restore from backup immediately."
    echo "     Stop all services: systemctl --user stop 333method-orchestrator.timer mmo-cron.timer 333method-pipeline.service"
  elif [ "${prev_snap_total_sites}" -gt 100 ]; then
    _canary_drop=$(( (prev_snap_total_sites - CURRENT_TOTAL_SITES) * 100 / prev_snap_total_sites ))
    if [ "$_canary_drop" -gt 20 ]; then
      echo "  🚨 CRITICAL: Site count dropped ${_canary_drop}% (${prev_snap_total_sites} → ${CURRENT_TOTAL_SITES})"
      echo "     Possible data loss. Investigate immediately."
    fi
  fi
fi

# Write new snapshot (overwrite previous)
{
  echo "# monitoring-checks snapshot $NOW_UTC"

  # Snapshot counts for Section A inter-check deltas
  snap_found=$(sq "SELECT COUNT(*) FROM sites WHERE status='found';")
  snap_assets=$(sq "SELECT COUNT(*) FROM sites WHERE status='assets_captured';")
  snap_scored=$(sq "SELECT COUNT(*) FROM sites WHERE status='prog_scored';")
  snap_rescored=$(sq "SELECT COUNT(*) FROM sites WHERE status IN ('semantic_scored','vision_scored');")
  snap_enriched=$(sq "SELECT COUNT(*) FROM sites WHERE status='enriched';")
  snap_proposals=$(sq "SELECT COUNT(*) FROM sites WHERE status='proposals_drafted';")
  snap_sent=$(sq "SELECT COUNT(*) FROM sites WHERE status='outreach_sent';")
  snap_out_approved=$(sq "SELECT COUNT(*) FROM messages WHERE direction='outbound' AND approval_status='approved' AND delivery_status IS NULL;")
  snap_out_sent=$(sq "SELECT COUNT(*) FROM messages WHERE direction='outbound' AND delivery_status IN ('sent','delivered','bounced');")

  echo "prev_snap_site_found=${snap_found:-0}"
  echo "prev_snap_site_assets=${snap_assets:-0}"
  echo "prev_snap_site_scored=${snap_scored:-0}"
  echo "prev_snap_site_rescored=${snap_rescored:-0}"
  echo "prev_snap_site_enriched=${snap_enriched:-0}"
  echo "prev_snap_site_proposals=${snap_proposals:-0}"
  echo "prev_snap_site_sent=${snap_sent:-0}"
  echo "prev_snap_out_approved=${snap_out_approved:-0}"
  echo "prev_snap_out_sent=${snap_out_sent:-0}"
  echo "prev_snap_total_sites=${CURRENT_TOTAL_SITES:-0}"

  # Approved by channel (for Section D)
  sq "SELECT contact_method, COUNT(*) FROM messages WHERE direction='outbound' AND approval_status='approved' AND delivery_status IS NULL GROUP BY contact_method;" | while IFS='|' read -r m c; do
    echo "prev_approved_${m}=$c"
  done

  # Agent task counts
  sq "SELECT status, COUNT(*) FROM tel.agent_tasks GROUP BY status;" | while IFS='|' read -r s c; do
    safe=$(echo "$s" | sed 's/[^a-zA-Z0-9]/_/g')
    echo "prev_agent_${safe}=$c"
  done

  # Overseer resets
  echo "prev_overseer_resets=$overseer_resets"

  # Gate timestamps for staleness detection (Change 3: SRE monitoring integration)
  GATE_FILE_PATH="$PROJECT_DIR/logs/orchestrator-gates.json"
  if [ -f "$GATE_FILE_PATH" ]; then
    mh_ts=$(node -e "try{const d=JSON.parse(require('fs').readFileSync('$GATE_FILE_PATH','utf8'));process.stdout.write(d.monitor_health||'never');}catch{process.stdout.write('never');}" 2>/dev/null || echo "never")
    te_ts=$(node -e "try{const d=JSON.parse(require('fs').readFileSync('$GATE_FILE_PATH','utf8'));process.stdout.write(d.triage_errors||'never');}catch{process.stdout.write('never');}" 2>/dev/null || echo "never")
    echo "prev_monitor_health_ts=$mh_ts"
    echo "prev_triage_errors_ts=$te_ts"
  fi

} > "$SNAPSHOT"

if [ "$PREV_EXISTS" = "true" ]; then
  echo "(Snapshot updated — deltas computed against previous run)"
else
  echo "(First run — baseline snapshot written. Deltas available on next run.)"
fi
echo ""

# ════════════════════════════════════════════════════════════════════════════
# TIER C AGENT DUE NOTIFICATIONS
# ════════════════════════════════════════════════════════════════════════════
AGENTS_DUE_FILE="$LOGS/agents-due.json"
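# agents-due.json is expected to map agent key → details object (shape inferred
# from the reads below; keys/values here are illustrative):
#   {"seo-audit": {"days_overdue": 2, "prompt": "..."}}
# Primitive values are tolerated and printed as bare [DUE] entries.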
if [ -f "$AGENTS_DUE_FILE" ]; then
  due_count=$(node -e "
    try {
      const d = JSON.parse(require('fs').readFileSync('$AGENTS_DUE_FILE','utf8'));
      console.log(Object.keys(d).length);
    } catch { console.log(0); }
  " 2>/dev/null || echo 0)
  if [ "$due_count" -gt 0 ] 2>/dev/null; then
    echo ""
    echo "━━━ TIER C AGENTS DUE ━━━"
    echo "# ACTION REQUIRED: The following interactive agent sessions are ready to run."
    echo "# These cannot run automatically — they need tool access (WebFetch, Read, Edit)."
    echo "# Run each from 333Method/: ./scripts/run-agent-session.sh <agent>"
    echo ""
    node -e "
      try {
        const d = JSON.parse(require('fs').readFileSync('$AGENTS_DUE_FILE','utf8'));
        for (const [key, val] of Object.entries(d)) {
          if (val && typeof val === 'object') {
            const overdue = val.days_overdue ? ' (' + val.days_overdue + 'd overdue)' : '';
            console.log('  [DUE] ' + key + overdue);
            if (val.prompt) console.log('       Prompt: ' + val.prompt.slice(0, 100) + '...');
          } else {
            console.log('  [DUE] ' + key);
          }
        }
      } catch(e) { console.log('  (error reading agents-due.json: ' + e.message + ')'); }
    " 2>/dev/null || cat "$AGENTS_DUE_FILE"
    echo ""
  fi
fi

# ════════════════════════════════════════════════════════════════════════════
# FINAL REMINDER
# ════════════════════════════════════════════════════════════════════════════
echo "╔══════════════════════════════════════════════════════════════════════╗"
echo "║                       CLAUDE CODE AFK REMINDER                       ║"
echo "╠══════════════════════════════════════════════════════════════════════╣"
echo "║ Your job: find issues that Tier 1 (cron) and Tier 2 (agents)         ║"
echo "║ MISSED. Don't duplicate their work — find their BLIND SPOTS.         ║"
echo "║                                                                      ║"
echo "║ For each issue found:                                                ║"
echo "║   1. Investigate root cause (read code, query DB)                    ║"
echo "║   2. Fix the code                                                    ║"
echo "║   3. Modify Tier 1/2 monitoring to catch it automatically next       ║"
echo "║      time (update cron scripts, agent monitors, or add watchdogs)    ║"
echo "║   4. Commit with explanation of the blind spot closed                ║"
echo "║                                                                      ║"
echo "║ Then start next check cycle:                                         ║"
echo "║   sleep 1800 && bash scripts/monitoring-checks.sh                    ║"
echo "╚══════════════════════════════════════════════════════════════════════╝"

# ════════════════════════════════════════════════════════════════════════════
# 15. CLAUDE BATCH PROCESSOR STATUS
# ════════════════════════════════════════════════════════════════════════════
echo ""
echo "━━━ 15. CLAUDE BATCH PROCESSOR STATUS ━━━"
echo "# LOOK FOR: Large approved/pending outreach queue not draining = outreach stage blocked."
echo "# Large parked queue = orchestrator not processing. Unclassified inbound = reply pipeline stuck."
echo ""

# Active reword queue depth per channel (approved outreach not yet sent)
echo "── Approved outreach awaiting send (per channel) ──"
sq "SELECT contact_method, COUNT(*) FROM messages WHERE direction='outbound' AND message_type='outreach' AND approval_status='approved' AND sent_at IS NULL AND delivery_status IS NULL GROUP BY contact_method ORDER BY contact_method;" | while IFS='|' read -r method count; do
  printf "  %-12s %7s\n" "$method" "$count"
done
total_approved_unsent=$(sq "SELECT COUNT(*) FROM messages WHERE direction='outbound' AND message_type='outreach' AND approval_status='approved' AND sent_at IS NULL AND delivery_status IS NULL;")
printf "  %-12s %7s\n" "TOTAL" "${total_approved_unsent:-0}"
echo ""

# Parked queue depth
parked=$(sq "SELECT COUNT(*) FROM messages WHERE approval_status='parked';")
echo "  Parked messages (awaiting orchestrator): ${parked:-0}"

# Pending reply responses (generated by Claude, awaiting cron send)
pending_replies=$(sq "SELECT COUNT(*) FROM messages WHERE direction='outbound' AND message_type='reply' AND sent_at IS NULL;" 2>/dev/null || echo "0")
echo "  Pending reply responses (unsent): ${pending_replies:-0}"

# Recently generated replies (last 1h)
recent_replies=$(sq "SELECT COUNT(*) FROM messages WHERE direction='outbound' AND message_type='reply' AND created_at > datetime('now','-1 hour');" 2>/dev/null || echo "0")
echo "  Replies generated (last 1h): ${recent_replies:-0}"

# Unclassified inbound messages
unclassified=$(sq "SELECT COUNT(*) FROM messages WHERE direction='inbound' AND intent IS NULL AND message_body IS NOT NULL;" 2>/dev/null || echo "0")
echo "  Unclassified inbound (no intent): ${unclassified:-0}"

# Classified but unresponded (not opt-out/autoresponder) — likely need a reply
unresponded=$(sq "SELECT COUNT(*) FROM messages m WHERE m.direction='inbound' AND m.intent IS NOT NULL AND m.intent NOT IN ('opt-out','autoresponder') AND NOT EXISTS (SELECT 1 FROM messages m2 WHERE m2.site_id=m.site_id AND m2.direction='outbound' AND m2.message_type='reply' AND m2.created_at > m.created_at);" 2>/dev/null || echo "0")
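# (The NOT EXISTS clause treats any later outbound reply on the same site as "responded".)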
echo "  Classified inbound without reply: ${unresponded:-0}"

# ────────────────────────────────────────────────────────────────────────────
# Orchestrator health
# ────────────────────────────────────────────────────────────────────────────
echo ""
echo "── Orchestrator health ──"
echo "# LOOK FOR: NOT running + systemd failed = timer not firing. Conservation mode = usage limit."
echo "# reply_responses store failures = inbound data issue in claude-batch.js."
echo "# Reword queue draining confirms orchestrator is making progress."
echo ""

orch_pid=$(pgrep -f "claude-orchestrator" 2>/dev/null || echo "")
if [ -n "$orch_pid" ]; then
  echo "  Process: running (PID $orch_pid)"
else
  echo "  Process: NOT running (expected if systemd timer fires per-invocation)"
fi

# Systemd timer/service status
orch_timer=$(systemctl --user is-active claude-orchestrator.timer 2>/dev/null || echo "unknown")
orch_service=$(systemctl --user is-active claude-orchestrator.service 2>/dev/null || echo "unknown")
echo "  systemd timer:   $orch_timer"
echo "  systemd service: $orch_service (inactive/dead = normal between runs)"
if systemctl --user is-failed claude-orchestrator.service >/dev/null 2>&1; then
  echo "  WARNING: claude-orchestrator.service is in FAILED state — check: systemctl --user status claude-orchestrator.service"
fi

# Last orchestrator log activity
# Log file is named by local date (orchestrator uses date +%Y-%m-%d without -u).
# Fall back to UTC date if local-date log doesn't exist (e.g. checked before midnight local).
ORCH_LOG="$LOGS/orchestrator-$(date +%Y-%m-%d).log"
[ ! -f "$ORCH_LOG" ] && ORCH_LOG="$LOGS/orchestrator-$(date -u +%Y-%m-%d).log"
if [ -f "$ORCH_LOG" ]; then
  last_orch=$(tail -1 "$ORCH_LOG" 2>/dev/null)
  echo "  Last log entry: $last_orch"

  # Check how long ago last activity was
  last_ts=$(tail -20 "$ORCH_LOG" 2>/dev/null | grep -oE '[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}' | tail -1)
  if [ -n "$last_ts" ]; then
    last_epoch=$(date -d "$last_ts" +%s 2>/dev/null || echo "0")
    now_epoch=$(date +%s)
    idle_min=$(( (now_epoch - last_epoch) / 60 ))
    [ "$idle_min" -lt 0 ] && idle_min=0
    echo "  Idle for: ${idle_min}min"
    if [ "$idle_min" -gt 10 ]; then
      echo "  WARN: Orchestrator idle >10min — is systemd timer running? Any errors in log?"
    fi
  fi

  # Conservation mode signal
  if grep -q "CONSERVATION MODE ON" "$ORCH_LOG" 2>/dev/null; then
    cons_line=$(grep "CONSERVATION MODE ON" "$ORCH_LOG" | tail -1)
    echo "  CONSERVATION MODE: $cons_line"
  fi

  # LLM output health — did any batch actually produce stored results?
  # Counts below cover today's whole log (not just a recent window).
  # "claude returned empty" on a batch with items = tool failure, not empty queue.
  # ALERT/WARN lines from consecutive-empty tracking are the key signal.
  echo ""
  echo "  LLM batch output health (today's log):"
  _orch_stored=$(grep -Ec "stored successfully|store: " "$ORCH_LOG" 2>/dev/null); _orch_stored=${_orch_stored:-0}
  _orch_empty=$(grep -Fc "claude returned empty" "$ORCH_LOG" 2>/dev/null); _orch_empty=${_orch_empty:-0}
  _orch_alert=$(grep -Ec "ALERT:|consecutive_empty=[5-9]|consecutive_empty=[0-9][0-9]" "$ORCH_LOG" 2>/dev/null); _orch_alert=${_orch_alert:-0}
  _orch_warn=$(grep -Ec "WARN:.*consecutive_empty" "$ORCH_LOG" 2>/dev/null); _orch_warn=${_orch_warn:-0}
  _orch_fatal=$(grep -Fc "FATAL:" "$ORCH_LOG" 2>/dev/null); _orch_fatal=${_orch_fatal:-0}
  printf "    Batches stored: %s  |  Empty returns: %s  |  Streak WARNs: %s  |  ALERTs: %s  |  FATALs: %s\n" \
    "$_orch_stored" "$_orch_empty" "$_orch_warn" "$_orch_alert" "$_orch_fatal"
  if [ "${_orch_fatal:-0}" -gt 0 ]; then
    echo "    ⛔ FATAL errors detected — tool canary failed. LLM batches non-functional."
    grep "FATAL:" "$ORCH_LOG" | tail -2 | while read -r line; do echo "      $line"; done
  elif [ "${_orch_alert:-0}" -gt 0 ]; then
    echo "    ⚠️  ALERT: consecutive empty streak detected — likely tool failure, not empty queue."
    grep "ALERT:" "$ORCH_LOG" | tail -2 | while read -r line; do echo "      $line"; done
  elif [ "${_orch_stored:-0}" -eq 0 ] && [ "${_orch_empty:-0}" -gt 5 ]; then
    echo "    ⚠️  No batches stored today but many empty returns — orchestrator may be non-functional."
  fi

  # Recent errors/failures. A while loop over empty input still exits 0, so
  # test the captured text to decide whether to print "(none)".
  echo ""
  echo "  Recent errors (last 30 log lines):"
  _recent_errs=$(tail -30 "$ORCH_LOG" 2>/dev/null | grep -i "error\|WARN\|ALERT\|FATAL\|failed\|CONSERVATION\|usage limit\|store failed")
  if [ -n "$_recent_errs" ]; then
    printf '%s\n' "$_recent_errs" | while IFS= read -r line; do echo "    $line"; done
  else
    echo "    (none)"
  fi
else
  echo "  No orchestrator log found for today ($ORCH_LOG)"
fi

# Claude Max usage (from cache — live fetch happens inside orchestrator)
USAGE_CACHE="${HOME}/.claude/usage-cache.json"
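# Expected cache shape (inferred from the fields read below; values illustrative):
#   {"five_hour": 42.0, "seven_day": 17.5, "five_hour_resets_at": "...", "stale": false}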
if [ -f "$USAGE_CACHE" ]; then
  five_pct=$(node -e "try{const d=JSON.parse(require('fs').readFileSync('$USAGE_CACHE','utf8'));process.stdout.write(String(Math.round(d.five_hour||0)));}catch{process.stdout.write('?');}" 2>/dev/null || echo "?")
  seven_pct=$(node -e "try{const d=JSON.parse(require('fs').readFileSync('$USAGE_CACHE','utf8'));process.stdout.write(String(Math.round(d.seven_day||0)));}catch{process.stdout.write('?');}" 2>/dev/null || echo "?")
  five_resets=$(node -e "try{const d=JSON.parse(require('fs').readFileSync('$USAGE_CACHE','utf8'));process.stdout.write(d.five_hour_resets_at||'');}catch{}" 2>/dev/null || echo "")
  is_stale=$(node -e "try{const d=JSON.parse(require('fs').readFileSync('$USAGE_CACHE','utf8'));process.stdout.write(d.stale?'yes':'no');}catch{process.stdout.write('unknown');}" 2>/dev/null || echo "unknown")
  echo "  Claude Max usage: 5h=${five_pct}%  weekly=${seven_pct}%  (stale=${is_stale}, resets: $five_resets)"
  if [ "$five_pct" != "?" ] && [ "$five_pct" -ge 80 ] 2>/dev/null; then
    echo "  WARNING: 5h window at ${five_pct}% — orchestrator may be in conservation mode"
  fi
fi
echo ""

# Gates file — last run times
GATE_FILE="$PROJECT_DIR/logs/orchestrator-gates.json"
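# Gates file is a flat JSON map of batch name → ISO timestamp of its last run
# (shape inferred from the keys read here and above: oversee, classify_errors,
# monitor_health, triage_errors).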
if [ -f "$GATE_FILE" ]; then
  echo ""
  echo "  Frequency gates (last run):"
  oversee_ts=$(node -e "try{const d=JSON.parse(require('fs').readFileSync('$GATE_FILE','utf8'));process.stdout.write(d.oversee||'never');}catch{process.stdout.write('never');}" 2>/dev/null || echo "unknown")
  classify_ts=$(node -e "try{const d=JSON.parse(require('fs').readFileSync('$GATE_FILE','utf8'));process.stdout.write(d.classify_errors||'never');}catch{process.stdout.write('never');}" 2>/dev/null || echo "unknown")
  echo "    oversee:         $oversee_ts"
  echo "    classify_errors: $classify_ts"
fi

# Reword progress (remaining vs total)
reword_remaining=$(sq "SELECT COUNT(*) FROM messages WHERE direction='outbound' AND message_type='outreach' AND approval_status='approved' AND sent_at IS NULL AND delivery_status IS NULL AND reworded_at IS NULL;")
reword_done=$(sq "SELECT COUNT(*) FROM messages WHERE direction='outbound' AND message_type='outreach' AND approval_status='approved' AND reworded_at IS NOT NULL;")
echo ""
echo "  Reword progress: ${reword_done:-0} done, ${reword_remaining:-0} remaining"
echo ""

# ════════════════════════════════════════════════════════════════════════════
# 16. OVERSEER HEALTH (runs via orchestrator `oversee` batch every 30min)
# ════════════════════════════════════════════════════════════════════════════
echo ""
echo "━━━ 16. OVERSEER HEALTH ━━━"
echo "# LOOK FOR: oversee gate >35min old = orchestrator not running or stuck."
echo "# Actions: RESTART_PIPELINE, CLEAR_STALE_TASKS, RESET_STUCK_SITES."
echo "# Overseer runs inside orchestrator loop (not a cron job) — if orchestrator stopped, overseer stops too."
echo ""

# Primary: check orchestrator gate file for last oversee run
if [ -f "$GATE_FILE" ]; then
  oversee_gate=$(node -e "try{const d=JSON.parse(require('fs').readFileSync('$GATE_FILE','utf8'));process.stdout.write(d.oversee||'');}catch{}" 2>/dev/null || echo "")
  if [ -n "$oversee_gate" ]; then
    gate_epoch=$(date -d "$oversee_gate" +%s 2>/dev/null || echo "0")
    now_epoch=$(date +%s)
    gate_age_min=$(( (now_epoch - gate_epoch) / 60 ))
    echo "  Last oversee (gate file): $oversee_gate  (${gate_age_min}min ago)"
    if [ "$gate_age_min" -gt 35 ]; then
      echo "  WARNING: Overseer last ran ${gate_age_min}min ago — orchestrator may be stopped or in conservation mode."
    else
      echo "  Status: on schedule"
    fi
  else
    echo "  Oversee gate: never run (orchestrator hasn't completed a 30min cycle yet)"
  fi
else
  echo "  Gate file not found: $GATE_FILE"
fi

# Secondary: legacy cron_jobs row (may still exist from old sonnetOverseer cron)
overseer_last=$(sq "SELECT last_run_at FROM ops.cron_jobs WHERE task_key='sonnetOverseer';")
if [ -n "$overseer_last" ]; then
  echo "  Legacy cron_jobs row: $overseer_last (historical — cron overseer is now retired)"
fi

echo ""

# Actions from system_health table (last 24h)
echo "── Overseer actions (system_health, last 24h) ──"
sq "SELECT action_taken, COUNT(*) FROM tel.system_health WHERE check_type='sonnet_overseer' AND created_at > datetime('now','-1 day') GROUP BY action_taken ORDER BY COUNT(*) DESC;" | while IFS='|' read -r action count; do
  printf "  %-30s %s\n" "$action" "$count"
done
echo ""

# Recent orchestrator log entries mentioning overseer outcomes
# (same local-date-first log lookup as the orchestrator health section above)
ORCH_LOG_TODAY="$LOGS/orchestrator-$(date +%Y-%m-%d).log"
[ ! -f "$ORCH_LOG_TODAY" ] && ORCH_LOG_TODAY="$LOGS/orchestrator-$(date -u +%Y-%m-%d).log"
if [ -f "$ORCH_LOG_TODAY" ]; then
  echo "── Recent overseer log entries (today) ──"
  # A while loop over empty input still exits 0, so test the captured text
  # to decide whether to print "(none)".
  _overseer_lines=$(grep -i "oversee\|RESTART_PIPELINE\|CLEAR_STALE\|RESET_STUCK\|severity" "$ORCH_LOG_TODAY" 2>/dev/null | tail -10)
  if [ -n "$_overseer_lines" ]; then
    printf '%s\n' "$_overseer_lines" | while IFS= read -r line; do echo "  $line"; done
  else
    echo "  (none)"
  fi
fi
echo ""

# ════════════════════════════════════════════════════════════════════════════
# 17. PIPELINE PROGRESS SUMMARY (replaces manual status checks)
# ════════════════════════════════════════════════════════════════════════════
echo ""
echo "━━━ 17. PIPELINE PROGRESS SUMMARY ━━━"
echo "# High-level funnel health. LOOK FOR: evidence collection advancing, proposals generating,"
echo "# eligible_outreach>0, cooldown expiry dates. Action required if blocked >2 cycles."
echo ""

echo "── Evidence pipeline ──"
evidence_ready=$(sq "SELECT COUNT(*) FROM sites WHERE status IN ('enriched','enriched_llm') AND evidence_json IS NOT NULL AND score < 82;")
both_passes=$(sq "SELECT COUNT(*) FROM sites WHERE status IN ('enriched','enriched_llm') AND evidence_pass1_json IS NOT NULL AND evidence_pass2_json IS NOT NULL AND evidence_json IS NULL;")
# Backlog that still needs evidence collection (straight count, no per-country exclusions)
evidence_backlog=$(sq "SELECT COUNT(*) FROM sites WHERE status IN ('enriched','enriched_llm') AND score < 82 AND (evidence_pass1_json IS NULL OR evidence_pass2_json IS NULL);")
printf "  Evidence ready (proposals can generate): %s\n" "${evidence_ready:-0}"
printf "  Both passes done, awaiting merge:        %s\n" "${both_passes:-0}"
printf "  Backlog (needs collection):              %s\n" "${evidence_backlog:-0}"
echo ""

echo "── Outreach eligibility ──"
# Past-cooldown, approved, non-GDPR English countries
eligible_au_us=$(sq "
  SELECT COUNT(*) FROM messages m JOIN sites s ON m.site_id=s.id
  WHERE m.direction='outbound' AND m.approval_status='approved'
    AND m.delivery_status IS NULL AND m.sent_at IS NULL
    AND m.contact_method IN ('email','sms')
    AND UPPER(s.country_code) IN ('AU','US','NZ','CA','IN','ZA')
    AND (s.last_outreach_at IS NULL OR s.last_outreach_at < datetime('now','-3 days'));")
eligible_uk=$(sq "
  SELECT COUNT(*) FROM messages m JOIN sites s ON m.site_id=s.id
  WHERE m.direction='outbound' AND m.approval_status='approved'
    AND m.delivery_status IS NULL AND m.sent_at IS NULL
    AND m.contact_method IN ('email','sms')
    AND UPPER(s.country_code) IN ('UK','GB','IE')
    AND s.gdpr_verified = 1
    AND (s.last_outreach_at IS NULL OR s.last_outreach_at < datetime('now','-3 days'));")
cooldown_total=$(sq "
  SELECT COUNT(*) FROM messages m JOIN sites s ON m.site_id=s.id
  WHERE m.direction='outbound' AND m.approval_status='approved'
    AND m.delivery_status IS NULL AND m.sent_at IS NULL
    AND m.contact_method IN ('email','sms')
    AND s.last_outreach_at >= datetime('now','-3 days');")
earliest_expire=$(sq "
  SELECT MIN(datetime(s.last_outreach_at, '+3 days'))
  FROM messages m JOIN sites s ON m.site_id=s.id
  WHERE m.direction='outbound' AND m.approval_status='approved'
    AND m.delivery_status IS NULL AND m.sent_at IS NULL
    AND m.contact_method IN ('email','sms')
    AND s.last_outreach_at >= datetime('now','-3 days')
    AND UPPER(s.country_code) IN ('AU','US','NZ','CA');")
printf "  Eligible to send now (AU/US/NZ/CA/IN/ZA): %s\n" "${eligible_au_us:-0}"
printf "  Eligible to send now (UK/GB/IE gdpr_verified): %s\n" "${eligible_uk:-0}"
printf "  In 3-day cooldown: %s  (earliest AU/US expires: %s)\n" "${cooldown_total:-0}" "${earliest_expire:-(none)}"
echo ""

echo "── Last 30min sends (ground truth) ──"
sent_30m=$(sq "SELECT COUNT(*) FROM messages WHERE direction='outbound' AND sent_at > datetime('now','-30 minutes');")
proposals_30m=$(sq "SELECT COUNT(*) FROM messages WHERE direction='outbound' AND message_type='outreach' AND created_at > datetime('now','-30 minutes');")
printf "  Sent in last 30min: %s  |  New proposals drafted: %s\n" "${sent_30m:-0}" "${proposals_30m:-0}"
echo ""