/ analysis.ipynb
analysis.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ohuujbmsz7",
   "metadata": {},
   "source": [
    "# Autoresearch Experiment Analysis\n",
    "\n",
    "Analysis of autonomous hyperparameter tuning results from `results.tsv`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "v3r8c77lxhs",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "\n",
    "# Load the TSV (tab-separated, 5 columns: commit, val_bpb, memory_gb, status, description)\n",
    "df = pd.read_csv(\"results.tsv\", sep=\"\\t\")\n",
    "\n",
    "# Fail early with a readable message if a column used below is absent\n",
    "required_columns = [\"val_bpb\", \"memory_gb\", \"status\", \"description\"]\n",
    "missing_columns = [c for c in required_columns if c not in df.columns]\n",
    "assert not missing_columns, f\"results.tsv is missing columns: {missing_columns}\"\n",
    "\n",
    "# Non-numeric metric values are coerced to NaN instead of raising\n",
    "df[\"val_bpb\"] = pd.to_numeric(df[\"val_bpb\"], errors=\"coerce\")\n",
    "df[\"memory_gb\"] = pd.to_numeric(df[\"memory_gb\"], errors=\"coerce\")\n",
    "# Normalise status so the comparisons below (KEEP / DISCARD / CRASH) ignore case and padding\n",
    "df[\"status\"] = df[\"status\"].str.strip().str.upper()\n",
    "\n",
    "print(f\"Total experiments: {len(df)}\")\n",
    "print(f\"Columns: {list(df.columns)}\")\n",
    "df.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0v37bji707o",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Outcome counts per status, plus the keep rate among runs that reached a decision\n",
    "counts = df[\"status\"].value_counts()\n",
    "print(\"Experiment outcomes:\")\n",
    "print(counts.to_string())\n",
    "\n",
    "n_keep = counts.get(\"KEEP\", 0)\n",
    "n_discard = counts.get(\"DISCARD\", 0)\n",
    "# Crashes are excluded from the denominator: only KEEP and DISCARD count as decided\n",
    "n_decided = n_keep + n_discard\n",
    "if n_decided > 0:\n",
    "    print(f\"\\nKeep rate: {n_keep}/{n_decided} = {n_keep / n_decided:.1%}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "j887idiuu5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show all KEPT experiments (the improvements that stuck)\n",
    "kept = df[df[\"status\"] == \"KEEP\"].copy()\n",
    "print(f\"KEPT experiments ({len(kept)} total):\\n\")\n",
    "# `i` is the row label in df, i.e. the experiment's position in results.tsv\n",
    "for i, row in kept.iterrows():\n",
    "    bpb = row[\"val_bpb\"]\n",
    "    desc = row[\"description\"]\n",
    "    print(f\"  #{i:3d}  bpb={bpb:.6f}  mem={row['memory_gb']:.1f}GB  {desc}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "99l0xlw0lv",
   "metadata": {},
   "source": [
    "## Val BPB Over Time\n",
    "\n",
    "Track how the best (kept) val_bpb evolves as experiments progress. The running minimum shows the \"frontier\" -- the best result achieved so far."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "79jh74veqg9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Points up to this much above the baseline are still drawn\n",
    "BASELINE_TOLERANCE = 0.0005\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(16, 8))\n",
    "\n",
    "# Filter out crashes for plotting. After reset_index the x-axis is the position\n",
    "# among non-crashed runs, not the row number in results.tsv.\n",
    "valid = df[df[\"status\"] != \"CRASH\"].copy()\n",
    "valid = valid.reset_index(drop=True)\n",
    "\n",
    "baseline_bpb = valid.loc[0, \"val_bpb\"]\n",
    "\n",
    "# Only plot points at or just above baseline (the interesting region)\n",
    "below = valid[valid[\"val_bpb\"] <= baseline_bpb + BASELINE_TOLERANCE]\n",
    "\n",
    "# Plot discarded as faint background dots\n",
    "disc = below[below[\"status\"] == \"DISCARD\"]\n",
    "ax.scatter(disc.index, disc[\"val_bpb\"],\n",
    "           c=\"#cccccc\", s=12, alpha=0.5, zorder=2, label=\"Discarded\")\n",
    "\n",
    "# Plot kept experiments as prominent green dots\n",
    "kept_v = below[below[\"status\"] == \"KEEP\"]\n",
    "ax.scatter(kept_v.index, kept_v[\"val_bpb\"],\n",
    "           c=\"#2ecc71\", s=50, zorder=4, label=\"Kept\", edgecolors=\"black\", linewidths=0.5)\n",
    "\n",
    "# Running minimum step line\n",
    "kept_mask = valid[\"status\"] == \"KEEP\"\n",
    "kept_idx = valid.index[kept_mask]\n",
    "kept_bpb = valid.loc[kept_mask, \"val_bpb\"]\n",
    "running_min = kept_bpb.cummin()\n",
    "ax.step(kept_idx, running_min, where=\"post\", color=\"#27ae60\",\n",
    "        linewidth=2, alpha=0.7, zorder=3, label=\"Running best\")\n",
    "\n",
    "# Label each kept experiment with its description\n",
    "for idx, bpb in zip(kept_idx, kept_bpb):\n",
    "    desc = str(valid.loc[idx, \"description\"]).strip()\n",
    "    # Truncate long descriptions so labels stay readable\n",
    "    if len(desc) > 45:\n",
    "        desc = desc[:42] + \"...\"\n",
    "\n",
    "    ax.annotate(desc, (idx, bpb),\n",
    "                textcoords=\"offset points\",\n",
    "                xytext=(6, 6), fontsize=8.0,\n",
    "                color=\"#1a7a3a\", alpha=0.9,\n",
    "                rotation=30, ha=\"left\", va=\"bottom\")\n",
    "\n",
    "n_total = len(df)\n",
    "n_kept = len(df[df[\"status\"] == \"KEEP\"])\n",
    "ax.set_xlabel(\"Experiment #\", fontsize=12)\n",
    "ax.set_ylabel(\"Validation BPB (lower is better)\", fontsize=12)\n",
    "ax.set_title(f\"Autoresearch Progress: {n_total} Experiments, {n_kept} Kept Improvements\", fontsize=14)\n",
    "ax.legend(loc=\"upper right\", fontsize=9)\n",
    "ax.grid(True, alpha=0.2)\n",
    "\n",
    "# Y-axis: from just below best to just above baseline.\n",
    "# `best` is the lowest kept val_bpb; it was previously never defined, so this\n",
    "# cell raised NameError on a fresh kernel.\n",
    "best = running_min.min()\n",
    "if np.isfinite(best) and np.isfinite(baseline_bpb):\n",
    "    lo, hi = min(best, baseline_bpb), max(best, baseline_bpb)\n",
    "    # Fall back to a fixed margin when nothing beat the baseline (zero-height range)\n",
    "    margin = (hi - lo) * 0.15 if hi > lo else BASELINE_TOLERANCE\n",
    "    ax.set_ylim(lo - margin, hi + margin)\n",
    "# Otherwise (no kept rows, or NaN baseline) keep matplotlib's automatic limits\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig(\"progress.png\", dpi=150, bbox_inches=\"tight\")\n",
    "plt.show()\n",
    "print(\"Saved to progress.png\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ce48phivyou",
   "metadata": {},
   "source": [
    "## Summary Statistics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "re1f8za8oj9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Summary stats\n",
    "kept = df[df[\"status\"] == \"KEEP\"].copy()\n",
    "# Baseline here is the first row of results.tsv (crashes included), whereas the\n",
    "# plot above takes the first non-crashed row; they differ only if row 0 crashed.\n",
    "baseline_bpb = df.iloc[0][\"val_bpb\"]\n",
    "best_bpb = kept[\"val_bpb\"].min()\n",
    "best_row = kept.loc[kept[\"val_bpb\"].idxmin()]\n",
    "\n",
    "print(f\"Baseline val_bpb:  {baseline_bpb:.6f}\")\n",
    "print(f\"Best val_bpb:      {best_bpb:.6f}\")\n",
    "print(f\"Total improvement: {baseline_bpb - best_bpb:.6f} ({(baseline_bpb - best_bpb) / baseline_bpb * 100:.2f}%)\")\n",
    "print(f\"Best experiment:   {best_row['description']}\")\n",
    "print()\n",
    "\n",
    "# How many experiments to find each improvement\n",
    "print(\"Cumulative effort per improvement:\")\n",
    "# The row label is the experiment's position in results.tsv\n",
    "for exp_idx, row in kept.iterrows():\n",
    "    desc = str(row[\"description\"]).strip()\n",
    "    print(f\"  Experiment #{exp_idx:3d}: bpb={row['val_bpb']:.6f}  {desc}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "oxri9h5c9gs",
   "metadata": {},
   "source": [
    "## Top Hits (Kept Experiments by Improvement)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "q86hxu10djk",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Each kept experiment's delta is measured vs the previous kept experiment's bpb\n",
    "# (since experiments are cumulative -- each one builds on the last kept state)\n",
    "kept = df[df[\"status\"] == \"KEEP\"].copy()\n",
    "kept[\"prev_bpb\"] = kept[\"val_bpb\"].shift(1)\n",
    "kept[\"delta\"] = kept[\"prev_bpb\"] - kept[\"val_bpb\"]\n",
    "\n",
    "# Drop the first kept row (treated as the baseline): it has no previous row, so no delta\n",
    "hits = kept.iloc[1:].copy()\n",
    "\n",
    "# Sort by delta improvement (biggest first)\n",
    "hits = hits.sort_values(\"delta\", ascending=False)\n",
    "\n",
    "print(f\"{'Rank':>4}  {'Delta':>8}  {'BPB':>10}  Description\")\n",
    "print(\"-\" * 80)\n",
    "for rank, (_, row) in enumerate(hits.iterrows(), 1):\n",
    "    print(f\"{rank:4d}  {row['delta']:+.6f}  {row['val_bpb']:.6f}  {row['description']}\")\n",
    "\n",
    "# Sum of consecutive deltas = first kept bpb minus last kept bpb\n",
    "print(f\"\\n{'':>4}  {hits['delta'].sum():+.6f}  {'':>10}  TOTAL improvement over baseline\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}