driver.sh
#!/bin/bash
# Run 8 test tasks sequentially, capturing session_id + stdout.
# Modifying tasks (2, 5) run in throwaway git worktrees.
#
# Environment:
#   BENCHMARK_RESULTS_DIR   directory for driver.log and per-task artifacts
#                           (default: /tmp/maxiter_tests/results)
#   BENCHMARK_TASK_TIMEOUT  per-task wall-clock limit in seconds (default: 300)
set -u

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" || {
  echo "cannot resolve repo root from $SCRIPT_DIR" >&2
  exit 1
}
cd "$REPO_ROOT" || { echo "cannot cd to $REPO_ROOT" >&2; exit 1; }

RESULTS="${BENCHMARK_RESULTS_DIR:-/tmp/maxiter_tests/results}"
mkdir -p "$RESULTS" || { echo "cannot create $RESULTS" >&2; exit 1; }

SESSIONS_DIR="$HOME/.shannon/sessions"
TASK_TIMEOUT="${BENCHMARK_TASK_TIMEOUT:-300}"

# GNU stat prints the mtime with -c '%Y'; BSD/macOS stat uses -f '%m'.
# With GNU stat, -f means "filesystem status", so the BSD spelling must not
# be tried blindly. Probe once and remember the working form.
if stat -c '%Y' / >/dev/null 2>&1; then
  STAT_MTIME=(stat -c '%Y')
else
  STAT_MTIME=(stat -f '%m')
fi

# log_tee copies stdin to stdout and appends it to the driver log.
log_tee() {
  tee -a "$RESULTS/driver.log"
}

# session_set writes the basenames (without .json) of every session file
# that currently exists to stdout, one per line, sorted. Used to diff
# before/after so we can attribute the new session(s) to the current task
# even when another `shan` process races us, or when the task exits
# before persisting (in which case the diff is empty).
session_set() {
  local path name
  for path in "$SESSIONS_DIR"/*.json; do
    # An unmatched glob stays literal; skip it so an empty dir yields nothing.
    [[ -e "$path" ]] || continue
    name=${path##*/}
    printf '%s\n' "${name%.json}"
  done | LC_ALL=C sort
}

# run_task runs one `shan -y` prompt under a timeout and records its results.
# Arguments:
#   $1 - task number (used in artifact file names)
#   $2 - short description for the log
#   $3 - prompt passed to `shan -y`
#   $4 - optional working directory (a worktree) to run the task in
# Outputs:
#   $RESULTS/task<N>.stdout            combined stdout+stderr of the task
#   $RESULTS/task<N>.sessions.before   session ids present before the run
#   $RESULTS/task<N>.sessions.after    session ids present after the run
#   $RESULTS/task<N>.session_id        attributed session id or NO_NEW_SESSION
#   progress lines on stdout, also appended to $RESULTS/driver.log
run_task() {
  local num=$1
  local desc=$2
  local prompt=$3
  local workdir=${4:-}
  echo "=== Task $num: $desc ===" | log_tee
  date +"%Y-%m-%d %H:%M:%S" | log_tee

  local stdout_file="$RESULTS/task${num}.stdout"
  local before_file="$RESULTS/task${num}.sessions.before"
  local after_file="$RESULTS/task${num}.sessions.after"
  session_set > "$before_file"

  # Run in a subshell so an optional cd (modifying tasks run in a worktree)
  # never leaks into the driver. The redirection is on the subshell, so the
  # stdout file exists, and records the error, even when the cd fails.
  # perl's alarm delivers SIGALRM to the exec'd process after the timeout.
  local rc
  (
    if [[ -n "$workdir" ]]; then
      cd "$workdir" || exit 1
    fi
    exec perl -e 'alarm shift; exec @ARGV' "$TASK_TIMEOUT" shan -y "$prompt"
  ) >"$stdout_file" 2>&1
  rc=$?

  # New sessions attributable to this run = set-after minus set-before.
  # Take the most recent by mtime among the new ones so concurrent `shan`
  # processes on the same machine can't silently steal our session id.
  session_set > "$after_file"
  local new_session sid mtime
  new_session=$(
    # comm needs the same collation that session_set sorted with.
    LC_ALL=C comm -13 "$before_file" "$after_file" |
      while IFS= read -r sid; do
        mtime=$("${STAT_MTIME[@]}" "$SESSIONS_DIR/$sid.json" 2>/dev/null) || mtime=0
        printf '%s\t%s\n' "$mtime" "$sid"
      done |
      LC_ALL=C sort -n |
      tail -1 |
      cut -f2-
  )
  if [[ -z "$new_session" ]]; then
    new_session="NO_NEW_SESSION"
  fi
  echo "session_id=$new_session (exit=$rc)" | log_tee
  echo "$new_session" > "$RESULTS/task${num}.session_id"
  tail -5 "$stdout_file" | sed 's/^/ /' | log_tee
  echo "" | log_tee
  sleep 2
}

# prepare_worktree (re)creates a throwaway worktree of HEAD at the given path.
# A leftover worktree from a previous run is removed first; that removal is
# best-effort, since the path usually does not exist yet.
# Arguments: $1 - worktree path
# Returns:   0 on success, 1 if the worktree could not be created
prepare_worktree() {
  local wt_path=$1
  git worktree remove --force "$wt_path" 2>/dev/null
  if ! git worktree add "$wt_path" HEAD >/dev/null 2>&1; then
    echo "WARNING: could not create worktree $wt_path; its task will fail" | log_tee
    return 1
  fi
}

# Tasks
run_task 1 "grep-trace runForceStopTurn" \
  "找到 runForceStopTurn 函数在 internal/agent/loop.go 里被哪些代码路径触发(具体调用位置),每条路径调用时传的 fallback 文案是什么,总结为一个 markdown 表格"

# Task 2: modifying — use worktree
WT2=/tmp/maxiter_tests/wt_task2
prepare_worktree "$WT2"
run_task 2 "batch edit context.Background in memory/" \
  "把 internal/memory/ 目录下所有 .go 文件里的 context.Background() 调用改成 context.TODO(),改完跑 go vet ./internal/memory/... 验证通过" \
  "$WT2"

run_task 3 "audit log failure analysis" \
  "分析 ~/.shannon/logs/audit.log 最近 500 条记录,给出失败率最高的 3 个工具(output_summary 包含 error 或 failed 视为失败),每个工具给出 1 条代表性失败记录的 input 和 output 摘要"

run_task 4 "compare context compaction" \
  "对比 ShanClaw 当前 repo 和 ~/Desktop/projects/study/claude-code-source 两个代码库的 context 压缩策略差异(包括压缩触发点、压缩方式、内存持久化),写到 /tmp/compare.md"

# Task 5: modifying — use worktree
WT5=/tmp/maxiter_tests/wt_task5
prepare_worktree "$WT5"
run_task 5 "add Info.Name tests" \
  "给 internal/tools/ 下每个工具文件(非 _test.go)加一个对应的 _test.go 测试文件,每个测试验证 Info().Name 非空。如果已存在测试文件就追加一个 TestInfoNameNonEmpty 测试。最后跑 go test ./internal/tools/ 验证全部通过" \
  "$WT5"

run_task 6 "memory_recall fallback trace" \
  "在 internal/tools/memory.go 和 internal/memory/ 里追踪:memory_recall 工具的降级路径是什么?从代码证明,给出 3 个失败场景(sidecar 未启动、sidecar 报错、query 超时)下的实际工具行为"

run_task 7 "decision trace: list /etc" \
  "我要你列出 /etc 目录下的文件。完成后告诉我你选了哪个工具、为什么这么选(不用其他工具的原因)"

run_task 8 "TODO/FIXME priority" \
  "在整个 ShanClaw repo 里找 TODO/FIXME/XXX 标记,按优先级(安全/正确性 > 性能 > 可读性)分类,给每条一句修复建议。用表格呈现前 15 条"

echo "=== ALL DONE ===" | log_tee
date +"%Y-%m-%d %H:%M:%S" | log_tee

# Cleanup worktrees (best-effort: a worktree that failed to create is absent).
git worktree remove --force "$WT2" 2>/dev/null
git worktree remove --force "$WT5" 2>/dev/null
echo "worktrees removed"