cache_bench.sh
#!/usr/bin/env bash
# cache_bench.sh — fixture-based cache benchmark for before/after comparison.
#
# Usage:
#   scripts/cache_bench.sh [runs]   # default 3 runs per fixture
#   scripts/cache_bench.sh 1        # quick sanity (1 run per fixture)
#
# Requires SHANNON_CACHE_DEBUG=1 (auto-set). Assumes `shan` is on PATH and
# the daemon is running. Reads ~/.shannon/logs/cache-debug.log, aggregates
# entries added during this run's window, groups by session_id, and reports
# CHR, CER, and double-write rate.

set -euo pipefail

RUNS=${1:-3}
LOG="$HOME/.shannon/logs/cache-debug.log"

# Ensure the log exists so tail won't fail
mkdir -p "$(dirname "$LOG")"
touch "$LOG"

# Record the pre-run byte offset so we only aggregate new entries
BEFORE_SIZE=$(wc -c < "$LOG" 2>/dev/null | tr -d ' ') || true
: "${BEFORE_SIZE:=0}"

export SHANNON_CACHE_DEBUG=1

FIXTURES=(
  "short: read README.md and the first file in internal/agent/ that looks like the main loop, then report the filename"
  "research: use x_search to find 3 recent Japanese SaaS discussions on Twitter and summarize them in 3 bullets"
)

# Multi-turn fixture exercises rolling cache_control: turns 2/3 must read the
# accumulated prefix from earlier turns. Sent through the daemon HTTP API so
# session_id is reused (one-shot CLI always forks a new session).
#
# FIX: the probe must not abort the script under `set -euo pipefail` when
# daemon.pid is missing or lsof fails — without `|| true` the failing pipeline
# kills the script and the `: "${DAEMON_PORT:=7533}"` fallback is unreachable.
DAEMON_PORT=$(lsof -p "$(cat ~/.shannon/daemon.pid 2>/dev/null)" 2>/dev/null \
  | awk '/LISTEN/ && /localhost/ {split($9,a,":"); print a[2]; exit}') || true
: "${DAEMON_PORT:=7533}"

MULTI_TURN_PROMPTS=(
  "list the top-level files in this repo (use directory_list)"
  "now read README.md and tell me the project name in one line"
  "what's the main programming language? answer in one word"
)

echo "=== cache_bench: $RUNS runs per fixture, $(date -u +%Y-%m-%dT%H:%M:%SZ) ===" >&2

for i in $(seq 1 "$RUNS"); do
  for f in "${FIXTURES[@]}"; do
    label="${f:0:40}"
    echo "  run $i: $label..." >&2
    # -y auto-approves tools. Suppress shan's output — we only care about log writes.
    shan -y "$f" >/dev/null 2>&1 || true
  done

  # Multi-turn run: same session_id across 3 turns via daemon POST /message.
  # Auto-approval is implicit on the daemon HTTP API (localhost-trusted).
  sid="bench-mt-$(date +%s)-$i"
  echo "  run $i: multi-turn session=$sid..." >&2
  for prompt in "${MULTI_TURN_PROMPTS[@]}"; do
    # Build the entire JSON payload in python3 (argv -> json.dumps) instead of
    # splicing an encoded fragment into a hand-written literal: one encoder owns
    # the whole document, so no shell-quoting/injection edge cases remain.
    payload=$(python3 -c \
      'import json,sys;print(json.dumps({"text":sys.argv[1],"session_id":sys.argv[2],"source":"cache_bench"}))' \
      "$prompt" "$sid") || continue
    curl -fsS -X POST "http://localhost:${DAEMON_PORT}/message" \
      -H "Content-Type: application/json" \
      -d "$payload" \
      >/dev/null 2>&1 || true
  done
done

echo >&2
echo "=== aggregating new log entries (offset > $BEFORE_SIZE bytes) ===" >&2

# tail -c +N takes a 1-based byte index. We want everything after BEFORE_SIZE,
# so start at BEFORE_SIZE + 1. Pipe new lines to python3's stdin for aggregation.
# Use python3 -c with a captured heredoc so the heredoc-as-stdin doesn't
# override the pipe (the bug that "python3 <<HEREDOC" on a pipeline causes).
AGG_SCRIPT=$(cat <<'PY'
import sys, json
from collections import defaultdict

# Group resp entries by session_id (fall back to req_id for entries without
# session_id — e.g. when SHANNON_CACHE_DEBUG was enabled pre-Task-5.0).
sessions = defaultdict(list)
for line in sys.stdin:
    line = line.strip()
    if not line:
        continue
    try:
        d = json.loads(line)
    except json.JSONDecodeError:
        continue
    if d.get("dir") != "resp":
        continue
    key = d.get("session_id") or d.get("req_id") or "unknown"
    sessions[key].append(d)

if not sessions:
    print("no resp entries captured — did the runs actually hit the gateway?")
    sys.exit(0)

totals = {"cc": 0, "cr": 0, "in": 0, "out": 0, "calls": 0}
first_call_cc = []        # cold-write size (turn-1 cc)
subsequent_cc = []        # rolling-write sizes (turn-2+ cc)
subsequent_cr = []        # rolling-read sizes (turn-2+ cr)
warm_starts = 0           # sessions whose first call was pure read (cc=0, cr>0)
multi_turn_sessions = []  # sessions with >=3 calls (proxy for our 3-turn fixture)

for sid, entries in sessions.items():
    entries.sort(key=lambda e: e.get("ts", ""))
    totals["calls"] += len(entries)
    if len(entries) >= 3:
        multi_turn_sessions.append((sid, entries))
    first = entries[0]
    if first.get("cc", 0) == 0 and first.get("cr", 0) > 0:
        warm_starts += 1
    first_call_cc.append(first.get("cc", 0))
    for e in entries[1:]:
        subsequent_cc.append(e.get("cc", 0))
        subsequent_cr.append(e.get("cr", 0))
    for e in entries:
        totals["cc"] += e.get("cc", 0)
        totals["cr"] += e.get("cr", 0)
        totals["in"] += e.get("in", 0)
        totals["out"] += e.get("out", 0)

denom = totals["cr"] + totals["cc"] + totals["in"]
chr_val = totals["cr"] / denom if denom else 0.0
cer = totals["cr"] / totals["cc"] if totals["cc"] else float("inf")
warm_rate = warm_starts / len(sessions) if sessions else 0.0
avg_first_cc = sum(first_call_cc) / len(first_call_cc) if first_call_cc else 0
avg_sub_cc = sum(subsequent_cc) / len(subsequent_cc) if subsequent_cc else 0
avg_sub_cr = sum(subsequent_cr) / len(subsequent_cr) if subsequent_cr else 0
# Rolling efficiency on subsequent calls: each rolling write should be a tiny
# delta while reads accumulate the full prefix. Amplification = cr/cc per
# subsequent call. Healthy rolling marker: amplification >> 10.
# Note: ratio-vs-first-call is unreliable because warm starts make first_cc=0,
# so we report absolute subsequent values + amplification instead.
amplification = avg_sub_cr / avg_sub_cc if avg_sub_cc else float("inf")

print(f"sessions={len(sessions)} calls={totals['calls']} "
      f"multi_turn_sessions={len(multi_turn_sessions)}")
print(f"CHR={chr_val:.3f} CER={cer:.2f} warm_start_rate={warm_rate:.2%}")
print(f"  read_tokens={totals['cr']} creation_tokens={totals['cc']}"
      f" input_tokens={totals['in']} output_tokens={totals['out']}")
print(f"rolling: avg_first_cc={avg_first_cc:.0f} avg_subsequent_cc={avg_sub_cc:.0f} "
      f"avg_subsequent_cr={avg_sub_cr:.0f} amplification={amplification:.1f}x")

# Per-turn breakdown for multi-turn sessions — the strongest rolling signal
if multi_turn_sessions:
    print(f"\nmulti-turn detail ({len(multi_turn_sessions)} sessions):")
    max_turns = max(len(e) for _, e in multi_turn_sessions)
    for turn_idx in range(max_turns):
        ccs = [e[turn_idx].get("cc", 0) for _, e in multi_turn_sessions if turn_idx < len(e)]
        crs = [e[turn_idx].get("cr", 0) for _, e in multi_turn_sessions if turn_idx < len(e)]
        if not ccs:
            continue
        print(f"  call#{turn_idx+1}: avg_cc={sum(ccs)/len(ccs):.0f} avg_cr={sum(crs)/len(crs):.0f} n={len(ccs)}")

# Sanity warnings
if all(len(e) == 1 for e in sessions.values()):
    print("WARN: every session had exactly 1 call — fixtures may be too simple "
          "or session_id plumbing isn't active (check commit 43749b5 is on this binary)")
if multi_turn_sessions and amplification < 5:
    print(f"WARN: read/write amplification is only {amplification:.1f}x on subsequent calls — "
          "rolling cache_control marker may not be reaching the gateway "
          "(check commit 0f944e29 is deployed on shannon-cloud → Shannon → llm-service container)")
elif multi_turn_sessions:
    print(f"OK: amplification={amplification:.1f}x — each rolling write is amortized over "
          "many reads, marker is healthy")
PY
)
tail -c "+$((BEFORE_SIZE + 1))" "$LOG" | python3 -c "$AGG_SCRIPT"