cache_bench.sh
#!/usr/bin/env bash
# cache_bench.sh — fixture-based cache benchmark for before/after comparison.
#
# Usage:
#   scripts/cache_bench.sh [runs]   # default 3 runs per fixture
#   scripts/cache_bench.sh 1        # quick sanity (1 run per fixture)
#
# Requires SHANNON_CACHE_DEBUG=1 (auto-set). Assumes `shan` is on PATH and
# the daemon is running. Reads ~/.shannon/logs/cache-debug.log, aggregates
# entries added during this run's window, groups by session_id, and reports
# CHR, CER, and double-write rate.

set -euo pipefail

RUNS=${1:-3}
LOG="$HOME/.shannon/logs/cache-debug.log"

# Ensure the log exists so tail won't fail
mkdir -p "$(dirname "$LOG")"
touch "$LOG"

# Record the pre-run byte offset so we only aggregate new entries
BEFORE_SIZE=$(wc -c < "$LOG" 2>/dev/null | tr -d ' ') || true
: "${BEFORE_SIZE:=0}"

export SHANNON_CACHE_DEBUG=1

FIXTURES=(
  "short: read README.md and the first file in internal/agent/ that looks like the main loop, then report the filename"
  "research: use x_search to find 3 recent Japanese SaaS discussions on Twitter and summarize them in 3 bullets"
)

# Multi-turn fixture exercises rolling cache_control: turns 2/3 must read the
# accumulated prefix from earlier turns. Sent through the daemon HTTP API so
# session_id is reused (one-shot CLI always forks a new session).
#
# FIX: the probe must not abort the script under `set -euo pipefail` when
# daemon.pid is missing or lsof fails — without `|| true` the failing pipeline
# kills the script and the `: "${DAEMON_PORT:=7533}"` fallback is unreachable.
DAEMON_PORT=$(lsof -p "$(cat ~/.shannon/daemon.pid 2>/dev/null)" 2>/dev/null \
  | awk '/LISTEN/ && /localhost/ {split($9,a,":"); print a[2]; exit}') || true
: "${DAEMON_PORT:=7533}"

MULTI_TURN_PROMPTS=(
  "list the top-level files in this repo (use directory_list)"
  "now read README.md and tell me the project name in one line"
  "what's the main programming language? answer in one word"
)

echo "=== cache_bench: $RUNS runs per fixture, $(date -u +%Y-%m-%dT%H:%M:%SZ) ===" >&2

for i in $(seq 1 "$RUNS"); do
  for f in "${FIXTURES[@]}"; do
    label="${f:0:40}"
    echo "  run $i: $label..." >&2
    # -y auto-approves tools. Suppress shan's output — we only care about log writes.
    shan -y "$f" >/dev/null 2>&1 || true
  done

  # Multi-turn run: same session_id across 3 turns via daemon POST /message.
  # Auto-approval is implicit on the daemon HTTP API (localhost-trusted).
  sid="bench-mt-$(date +%s)-$i"
  echo "  run $i: multi-turn session=$sid..." >&2
  for prompt in "${MULTI_TURN_PROMPTS[@]}"; do
    # Build the entire JSON payload in python3 (argv -> json.dumps) instead of
    # splicing an encoded fragment into a hand-written literal: one encoder owns
    # the whole document, so no shell-quoting/injection edge cases remain.
    payload=$(python3 -c \
      'import json,sys;print(json.dumps({"text":sys.argv[1],"session_id":sys.argv[2],"source":"cache_bench"}))' \
      "$prompt" "$sid") || continue
    curl -fsS -X POST "http://localhost:${DAEMON_PORT}/message" \
      -H "Content-Type: application/json" \
      -d "$payload" \
      >/dev/null 2>&1 || true
  done
done

echo >&2
echo "=== aggregating new log entries (offset > $BEFORE_SIZE bytes) ===" >&2

# tail -c +N takes a 1-based byte index. We want everything after BEFORE_SIZE,
# so start at BEFORE_SIZE + 1. Pipe new lines to python3's stdin for aggregation.
# Use python3 -c with a captured heredoc so the heredoc-as-stdin doesn't
# override the pipe (the bug that "python3 <<HEREDOC" on a pipeline causes).
AGG_SCRIPT=$(cat <<'PY'
import sys, json
from collections import defaultdict

# Group resp entries by session_id (fall back to req_id for entries without
# session_id — e.g. when SHANNON_CACHE_DEBUG was enabled pre-Task-5.0).
sessions = defaultdict(list)
for line in sys.stdin:
    line = line.strip()
    if not line:
        continue
    try:
        d = json.loads(line)
    except json.JSONDecodeError:
        continue
    if d.get("dir") != "resp":
        continue
    key = d.get("session_id") or d.get("req_id") or "unknown"
    sessions[key].append(d)

if not sessions:
    print("no resp entries captured — did the runs actually hit the gateway?")
    sys.exit(0)

totals = {"cc": 0, "cr": 0, "in": 0, "out": 0, "calls": 0}
first_call_cc = []        # cold-write size (turn-1 cc)
subsequent_cc = []        # rolling-write sizes (turn-2+ cc)
subsequent_cr = []        # rolling-read sizes (turn-2+ cr)
warm_starts = 0           # sessions whose first call was pure read (cc=0, cr>0)
multi_turn_sessions = []  # sessions with >=3 calls (proxy for our 3-turn fixture)

for sid, entries in sessions.items():
    entries.sort(key=lambda e: e.get("ts", ""))
    totals["calls"] += len(entries)
    if len(entries) >= 3:
        multi_turn_sessions.append((sid, entries))
    first = entries[0]
    if first.get("cc", 0) == 0 and first.get("cr", 0) > 0:
        warm_starts += 1
    first_call_cc.append(first.get("cc", 0))
    for e in entries[1:]:
        subsequent_cc.append(e.get("cc", 0))
        subsequent_cr.append(e.get("cr", 0))
    for e in entries:
        totals["cc"] += e.get("cc", 0)
        totals["cr"] += e.get("cr", 0)
        totals["in"] += e.get("in", 0)
        totals["out"] += e.get("out", 0)

denom = totals["cr"] + totals["cc"] + totals["in"]
chr_val = totals["cr"] / denom if denom else 0.0
cer = totals["cr"] / totals["cc"] if totals["cc"] else float("inf")
warm_rate = warm_starts / len(sessions) if sessions else 0.0
avg_first_cc = sum(first_call_cc) / len(first_call_cc) if first_call_cc else 0
avg_sub_cc = sum(subsequent_cc) / len(subsequent_cc) if subsequent_cc else 0
avg_sub_cr = sum(subsequent_cr) / len(subsequent_cr) if subsequent_cr else 0
# Rolling efficiency on subsequent calls: each rolling write should be a tiny
# delta while reads accumulate the full prefix. Amplification = cr/cc per
# subsequent call. Healthy rolling marker: amplification >> 10.
# Note: ratio-vs-first-call is unreliable because warm starts make first_cc=0,
# so we report absolute subsequent values + amplification instead.
amplification = avg_sub_cr / avg_sub_cc if avg_sub_cc else float("inf")

print(f"sessions={len(sessions)} calls={totals['calls']} "
      f"multi_turn_sessions={len(multi_turn_sessions)}")
print(f"CHR={chr_val:.3f} CER={cer:.2f} warm_start_rate={warm_rate:.2%}")
print(f"  read_tokens={totals['cr']} creation_tokens={totals['cc']}"
      f" input_tokens={totals['in']} output_tokens={totals['out']}")
print(f"rolling: avg_first_cc={avg_first_cc:.0f} avg_subsequent_cc={avg_sub_cc:.0f} "
      f"avg_subsequent_cr={avg_sub_cr:.0f} amplification={amplification:.1f}x")

# Per-turn breakdown for multi-turn sessions — the strongest rolling signal
if multi_turn_sessions:
    print(f"\nmulti-turn detail ({len(multi_turn_sessions)} sessions):")
    max_turns = max(len(e) for _, e in multi_turn_sessions)
    for turn_idx in range(max_turns):
        ccs = [e[turn_idx].get("cc", 0) for _, e in multi_turn_sessions if turn_idx < len(e)]
        crs = [e[turn_idx].get("cr", 0) for _, e in multi_turn_sessions if turn_idx < len(e)]
        if not ccs:
            continue
        print(f"  call#{turn_idx+1}: avg_cc={sum(ccs)/len(ccs):.0f} avg_cr={sum(crs)/len(crs):.0f} n={len(ccs)}")

# Sanity warnings
if all(len(e) == 1 for e in sessions.values()):
    print("WARN: every session had exactly 1 call — fixtures may be too simple "
          "or session_id plumbing isn't active (check commit 43749b5 is on this binary)")
if multi_turn_sessions and amplification < 5:
    print(f"WARN: read/write amplification is only {amplification:.1f}x on subsequent calls — "
          "rolling cache_control marker may not be reaching the gateway "
          "(check commit 0f944e29 is deployed on shannon-cloud → Shannon → llm-service container)")
elif multi_turn_sessions:
    print(f"OK: amplification={amplification:.1f}x — each rolling write is amortized over "
          "many reads, marker is healthy")
PY
)
tail -c "+$((BEFORE_SIZE + 1))" "$LOG" | python3 -c "$AGG_SCRIPT"