/ analysis.ipynb
analysis.ipynb
  1  {
  2   "cells": [
  3    {
  4     "cell_type": "markdown",
  5     "id": "ohuujbmsz7",
  6     "metadata": {},
  7     "source": [
  8      "# Autoresearch Experiment Analysis\n",
  9      "\n",
 10      "Analysis of autonomous hyperparameter tuning results from `results.tsv`."
 11     ]
 12    },
 13    {
 14     "cell_type": "code",
 15     "execution_count": null,
 16     "id": "v3r8c77lxhs",
 17     "metadata": {},
 18     "outputs": [],
 19     "source": [
 20      "import pandas as pd\n",
 21      "import matplotlib.pyplot as plt\n",
 22      "import numpy as np\n",
 23      "\n",
 24      "# Read the experiment log: tab-separated, 5 columns (commit, val_bpb, memory_gb, status, description).\n",
 25      "# Metric values that fail to parse become NaN; status labels are trimmed and upper-cased.\n",
 26      "df = pd.read_csv(\"results.tsv\", sep=\"\\t\")\n",
 27      "for metric_col in (\"val_bpb\", \"memory_gb\"):\n",
 28      "    df[metric_col] = pd.to_numeric(df[metric_col], errors=\"coerce\")\n",
 29      "df[\"status\"] = df[\"status\"].str.strip().str.upper()\n",
 30      "\n",
 31      "print(f\"Total experiments: {len(df)}\\nColumns: {list(df.columns)}\")\n",
 32      "df.head(10)"
 33     ]
 34    },
 35    {
 36     "cell_type": "code",
 37     "execution_count": null,
 38     "id": "0v37bji707o",
 39     "metadata": {},
 40     "outputs": [],
 41     "source": [
 42      "# Tally experiments per status label, then report the share of decided (keep/discard) runs kept.\n",
 43      "counts = df[\"status\"].value_counts()\n",
 44      "print(\"Experiment outcomes:\", counts.to_string(), sep=\"\\n\")\n",
 45      "\n",
 46      "# A label that never occurs counts as zero.\n",
 47      "n_keep, n_discard, n_crash = (counts.get(label, 0) for label in (\"KEEP\", \"DISCARD\", \"CRASH\"))\n",
 48      "n_decided = n_keep + n_discard\n",
 49      "\n",
 50      "if n_decided:\n",
 51      "    print(f\"\\nKeep rate: {n_keep}/{n_decided} = {n_keep / n_decided:.1%}\")"
 52     ]
 53    },
 54    {
 55     "cell_type": "code",
 56     "execution_count": null,
 57     "id": "j887idiuu5",
 58     "metadata": {},
 59     "outputs": [],
 60     "source": [
 61      "# List every experiment whose status is KEEP, one line each, labelled with its row index in df.\n",
 62      "kept = df.loc[df[\"status\"].eq(\"KEEP\")].copy()\n",
 63      "print(f\"KEPT experiments ({len(kept)} total):\\n\")\n",
 64      "\n",
 65      "kept_fields = zip(kept.index, kept[\"val_bpb\"], kept[\"memory_gb\"], kept[\"description\"])\n",
 66      "for exp_no, bpb, mem_gb, desc in kept_fields:\n",
 67      "    print(f\"  #{exp_no:3d}  bpb={bpb:.6f}  mem={mem_gb:.1f}GB  {desc}\")"
 68     ]
 69    },
 70    {
 71     "cell_type": "markdown",
 72     "id": "99l0xlw0lv",
 73     "metadata": {},
 74     "source": [
 75      "## Val BPB Over Time\n",
 76      "\n",
 77      "Track how the best (kept) val_bpb evolves as experiments progress. The running minimum shows the \"frontier\" -- the best result achieved so far."
 78     ]
 79    },
 80    {
 81     "cell_type": "code",
 82     "execution_count": null,
 83     "id": "79jh74veqg9",
 84     "metadata": {},
 85     "outputs": [],
 86     "source": [
 87      "# Progress chart: val_bpb of each non-crashed experiment, with the running best over kept runs.\n",
 88      "fig, ax = plt.subplots(figsize=(16, 8))\n",
 89      "\n",
 90      "# Drop crashes and renumber so the x axis counts only the remaining experiments\n",
 91      "valid = df[df[\"status\"] != \"CRASH\"].reset_index(drop=True)\n",
 92      "baseline_bpb = valid.loc[0, \"val_bpb\"]\n",
 93      "\n",
 94      "# Only plot points at or below baseline, with a 0.0005 tolerance (the interesting region)\n",
 95      "below = valid[valid[\"val_bpb\"] <= baseline_bpb + 0.0005]\n",
 96      "\n",
 97      "# Plot discarded as faint background dots\n",
 98      "disc = below[below[\"status\"] == \"DISCARD\"]\n",
 99      "ax.scatter(disc.index, disc[\"val_bpb\"],\n",
100      "           c=\"#cccccc\", s=12, alpha=0.5, zorder=2, label=\"Discarded\")\n",
101      "\n",
102      "# Plot kept experiments as prominent green dots\n",
103      "kept_v = below[below[\"status\"] == \"KEEP\"]\n",
104      "ax.scatter(kept_v.index, kept_v[\"val_bpb\"],\n",
105      "           c=\"#2ecc71\", s=50, zorder=4, label=\"Kept\", edgecolors=\"black\", linewidths=0.5)\n",
106      "\n",
107      "# Running minimum step line, taken over every kept run in `valid`\n",
108      "kept_mask = valid[\"status\"] == \"KEEP\"\n",
109      "kept_idx = valid.index[kept_mask]\n",
110      "kept_bpb = valid.loc[kept_mask, \"val_bpb\"]\n",
111      "running_min = kept_bpb.cummin()\n",
112      "ax.step(kept_idx, running_min, where=\"post\", color=\"#27ae60\",\n",
113      "        linewidth=2, alpha=0.7, zorder=3, label=\"Running best\")\n",
114      "\n",
115      "# Label each kept experiment with its description (truncated to 45 characters)\n",
116      "for idx, bpb in zip(kept_idx, kept_bpb):\n",
117      "    desc = str(valid.loc[idx, \"description\"]).strip()\n",
118      "    if len(desc) > 45:\n",
119      "        desc = desc[:42] + \"...\"\n",
120      "    ax.annotate(desc, (idx, bpb),\n",
121      "                textcoords=\"offset points\",\n",
122      "                xytext=(6, 6), fontsize=8.0,\n",
123      "                color=\"#1a7a3a\", alpha=0.9,\n",
124      "                rotation=30, ha=\"left\", va=\"bottom\")\n",
125      "\n",
126      "n_total = len(df)\n",
127      "n_kept = len(df[df[\"status\"] == \"KEEP\"])\n",
128      "ax.set_xlabel(\"Experiment #\", fontsize=12)\n",
129      "ax.set_ylabel(\"Validation BPB (lower is better)\", fontsize=12)\n",
130      "ax.set_title(f\"Autoresearch Progress: {n_total} Experiments, {n_kept} Kept Improvements\", fontsize=14)\n",
131      "ax.legend(loc=\"upper right\", fontsize=9)\n",
132      "ax.grid(True, alpha=0.2)\n",
133      "\n",
134      "# Y-axis: from just below best to just above baseline (15% of the range as padding)\n",
135      "best = running_min.min()\n",
136      "margin = (baseline_bpb - best) * 0.15\n",
137      "if np.isfinite(margin) and margin > 0:  # otherwise keep autoscale: limits would be NaN or degenerate\n",
138      "    ax.set_ylim(best - margin, baseline_bpb + margin)\n",
139      "\n",
140      "plt.tight_layout()\n",
141      "plt.savefig(\"progress.png\", dpi=150, bbox_inches=\"tight\")\n",
142      "plt.show()\n",
143      "print(\"Saved to progress.png\")"
144     ]
145    },
146    {
147     "cell_type": "markdown",
148     "id": "ce48phivyou",
149     "metadata": {},
150     "source": [
151      "## Summary Statistics"
152     ]
153    },
154    {
155     "cell_type": "code",
156     "execution_count": null,
157     "id": "re1f8za8oj9",
158     "metadata": {},
159     "outputs": [],
160     "source": [
161      "# Headline numbers: first-row baseline, best kept val_bpb, and the absolute / relative gain.\n",
162      "kept = df.loc[df[\"status\"].eq(\"KEEP\")].copy()\n",
163      "baseline_bpb = df.iloc[0][\"val_bpb\"]\n",
164      "best_bpb = kept[\"val_bpb\"].min()\n",
165      "best_row = kept.loc[kept[\"val_bpb\"].idxmin()]\n",
166      "improvement = baseline_bpb - best_bpb\n",
167      "\n",
168      "print(f\"Baseline val_bpb:  {baseline_bpb:.6f}\")\n",
169      "print(f\"Best val_bpb:      {best_bpb:.6f}\")\n",
170      "print(f\"Total improvement: {improvement:.6f} ({improvement / baseline_bpb * 100:.2f}%)\")\n",
171      "print(f\"Best experiment:   {best_row['description']}\\n\")\n",
172      "\n",
173      "# One line per kept experiment, labelled with its original row number in df\n",
174      "print(\"Cumulative effort per improvement:\")\n",
175      "kept_sorted = kept.reset_index()\n",
176      "for exp_no, bpb, raw_desc in zip(kept_sorted[\"index\"], kept_sorted[\"val_bpb\"], kept_sorted[\"description\"]):\n",
177      "    desc = str(raw_desc).strip()\n",
178      "    print(f\"  Experiment #{exp_no:3d}: bpb={bpb:.6f}  {desc}\")"
179     ]
180    },
181    {
182     "cell_type": "markdown",
183     "id": "oxri9h5c9gs",
184     "metadata": {},
185     "source": [
186      "## Top Hits (Kept Experiments by Improvement)"
187     ]
188    },
189    {
190     "cell_type": "code",
191     "execution_count": null,
192     "id": "q86hxu10djk",
193     "metadata": {},
194     "outputs": [],
195     "source": [
196      "# Rank kept experiments by how much each lowered val_bpb relative to the kept run before it\n",
197      "# (runs are cumulative: each kept experiment builds on the previous kept state).\n",
198      "kept = df.loc[df[\"status\"].eq(\"KEEP\")].copy()\n",
199      "kept[\"prev_bpb\"] = kept[\"val_bpb\"].shift(1)\n",
200      "kept[\"delta\"] = kept[\"prev_bpb\"] - kept[\"val_bpb\"]\n",
201      "\n",
202      "# The first kept row has no predecessor, so it is excluded; largest improvement first.\n",
203      "hits = kept.iloc[1:].sort_values(\"delta\", ascending=False)\n",
204      "\n",
205      "header = f\"{'Rank':>4}  {'Delta':>8}  {'BPB':>10}  Description\"\n",
206      "print(header, \"-\" * 80, sep=\"\\n\")\n",
207      "ranked_rows = zip(hits[\"delta\"], hits[\"val_bpb\"], hits[\"description\"])\n",
208      "for rank, (delta, bpb, desc) in enumerate(ranked_rows, start=1):\n",
209      "    print(f\"{rank:4d}  {delta:+.6f}  {bpb:.6f}  {desc}\")\n",
210      "\n",
211      "# Footer row: the sum of the step-wise deltas listed above\n",
212      "total_delta = hits[\"delta\"].sum()\n",
213      "print(f\"\\n{'':>4}  {total_delta:+.6f}  {'':>10}  TOTAL improvement over baseline\")"
214     ]
215    },
216    {
217     "cell_type": "code",
218     "execution_count": null,
219     "id": "f9bffe89",
220     "metadata": {},
221     "outputs": [],
222     "source": []
223    }
224   ],
225   "metadata": {
226    "kernelspec": {
227     "display_name": ".venv",
228     "language": "python",
229     "name": "python3"
230    },
231    "language_info": {
232     "codemirror_mode": {
233      "name": "ipython",
234      "version": 3
235     },
236     "file_extension": ".py",
237     "mimetype": "text/x-python",
238     "name": "python",
239     "nbconvert_exporter": "python",
240     "pygments_lexer": "ipython3",
241     "version": "3.10.12"
242    }
243   },
244   "nbformat": 4,
245   "nbformat_minor": 5
246  }