Cradicle Explorer

/ analysis / mixed_effects_analysis.py
mixed_effects_analysis.py
  1  # EXTENDS: DES-c386de6567f0
  2  #!/usr/bin/env python3
  3  """
  4  Mixed-Effects Model Analysis — Reviewer Response (Lab as Random Effect)
  5  =======================================================================
  6  
  7  Addresses reviewer objection that Cohen's d=-5.80 (flagship vs budget/specialized)
  8  treats each model as independent, ignoring that models from the same lab share
  9  training infrastructure. A mixed-effects model with lab as a random effect is
 10  more appropriate.
 11  
 12  Analyses:
 13    1. Mixed-effects model: hubris ~ is_flagship + (1|lab) via statsmodels MixedLM
 14    2. Sensitivity checks: excluding Phi-4 (missing D1) and excluding reasoning models
 15    3. D1–D2 Pearson and Spearman correlation
 16    4. Outputs JSON results + formatted console summary + LaTeX appendix snippet
 17  
 18  Usage:
 19    .venv/bin/python3 research/scripts/mixed_effects_analysis.py
 20  """
 21  
 22  from __future__ import annotations
 23  
 24  import json
 25  import math
 26  import sys
 27  from pathlib import Path
 28  
 29  import warnings
 30  
 31  import numpy as np
 32  from scipy import stats
 33  import statsmodels.formula.api as smf
 34  import pandas as pd
 35  
 36  # ---- Paths ------------------------------------------------------------------
 37  
 38  REPO_ROOT = Path(__file__).resolve().parent.parent.parent
 39  OUTPUT_DIR = REPO_ROOT / "research" / "results" / "mixed_effects"
 40  OUTPUT_JSON = OUTPUT_DIR / "mixed_effects_results.json"
 41  
 42  # ---- Data -------------------------------------------------------------------
 43  
 44  # (model, lab, category, hubris, d1, d2)
 45  # d1=None means not available (Phi-4)
 46  RAW_MODELS = [
 47      # Anthropic (Constitutional AI)
 48      ("Claude Opus 4.6",    "Anthropic", "flagship",  0.027, 0.000, 0.011),
 49      ("Claude 3 Haiku",     "Anthropic", "flagship",  0.044, 0.000, 0.030),
 50      ("Claude Sonnet 4",    "Anthropic", "flagship",  0.048, 0.000, 0.030),
 51      ("Claude 3.5 Haiku",   "Anthropic", "flagship",  0.053, 0.000, 0.064),
 52      ("Claude Haiku 4.5",   "Anthropic", "flagship",  0.057, 0.000, 0.014),
 53      ("Claude Sonnet 4.5",  "Anthropic", "flagship",  0.125, 0.000, 0.003),
 54      # OpenAI
 55      ("GPT-4o",             "OpenAI",    "flagship",  0.055, 0.000, 0.028),
 56      ("GPT-4o Mini",        "OpenAI",    "flagship",  0.060, 0.000, 0.017),
 57      ("GPT-3.5 Turbo",      "OpenAI",    "flagship",  0.073, 0.000, 0.089),
 58      ("o3-mini",            "OpenAI",    "reasoning", 0.383, 1.000, 0.005),
 59      # Google
 60      ("Gemini 2.5 Pro",     "Google",    "flagship",  0.053, 0.000, 0.012),
 61      ("Gemini 2.5 Flash",   "Google",    "flagship",  0.062, 0.000, 0.001),
 62      ("Gemini 2.0 Flash",   "Google",    "flagship",  0.078, 0.000, 0.018),
 63      ("Gemini Flash Lite",  "Google",    "budget",    0.400, 1.000, 0.150),
 64      ("Gemma 2 9B",         "Google",    "budget",    0.522, 1.000, 0.380),
 65      # Meta
 66      ("Llama 4 Scout",      "Meta",      "flagship",  0.063, 0.000, 0.025),
 67      ("Llama 4 Maverick",   "Meta",      "flagship",  0.070, 0.000, 0.032),
 68      ("Llama 3.3 70B",      "Meta",      "flagship",  0.081, 0.000, 0.048),
 69      ("Llama 3.1 8B",       "Meta",      "budget",    0.489, 1.000, 0.280),
 70      # Mistral
 71      ("Mistral Large 3",    "Mistral",   "flagship",  0.061, 0.000, 0.020),
 72      ("Mistral Small 24B",  "Mistral",   "budget",    0.414, 1.000, 0.095),
 73      ("Mixtral 8x7B",       "Mistral",   "budget",    0.466, 1.000, 0.180),
 74      ("Mistral 7B",         "Mistral",   "budget",    0.504, 1.000, 0.320),
 75      # Single-model labs + multi-model labs
 76      ("Grok 3",             "xAI",       "flagship",  0.054, 0.000, 0.015),
 77      ("Command R+",         "Cohere",    "flagship",  0.077, 0.000, 0.038),
 78      ("Seed 2.0 Lite",      "ByteDance", "flagship",  0.087, 0.000, 0.055),
 79      ("DeepSeek V3.1",      "DeepSeek",  "flagship",  0.049, 0.000, 0.028),
 80      ("DeepSeek R1",        "DeepSeek",  "reasoning", 0.849, 1.000, 0.975),
 81      ("Qwen 3 235B",        "Alibaba",   "flagship",  0.047, 0.000, 0.022),
 82      ("Qwen 2.5 7B",        "Alibaba",   "budget",    0.517, 1.000, 0.285),
 83      ("Phi-4",              "Microsoft", "budget",    0.187,  None, 0.110),
 84  ]
 85  
 86  REASONING_MODELS = {"o3-mini", "DeepSeek R1"}
 87  
 88  
 89  # ---- DataFrame construction -------------------------------------------------
 90  
 91  def build_df(exclude_phi4: bool = False, exclude_reasoning: bool = False) -> pd.DataFrame:
 92      rows = []
 93      for name, lab, category, hubris, d1, d2 in RAW_MODELS:
 94          if exclude_phi4 and name == "Phi-4":
 95              continue
 96          if exclude_reasoning and name in REASONING_MODELS:
 97              continue
 98          rows.append({
 99              "model":       name,
100              "lab":         lab,
101              "category":    category,
102              "hubris":      hubris,
103              "d1":          d1,
104              "d2":          d2,
105              "is_flagship": 1 if category == "flagship" else 0,
106          })
107      return pd.DataFrame(rows)
108  
109  
110  # ---- Naive Cohen's d --------------------------------------------------------
111  
112  def naive_cohens_d(df: pd.DataFrame) -> dict:
113      flagship = df[df["category"] == "flagship"]["hubris"].values
114      non_flagship = df[df["category"] != "flagship"]["hubris"].values
115  
116      # Pooled SD
117      n1, n2 = len(flagship), len(non_flagship)
118      pooled_sd = math.sqrt(
119          ((n1 - 1) * flagship.std(ddof=1) ** 2 + (n2 - 1) * non_flagship.std(ddof=1) ** 2)
120          / (n1 + n2 - 2)
121      )
122      d = (flagship.mean() - non_flagship.mean()) / pooled_sd
123  
124      t_stat, p_value = stats.ttest_ind(flagship, non_flagship)
125      return {
126          "flagship_mean": float(flagship.mean()),
127          "non_flagship_mean": float(non_flagship.mean()),
128          "flagship_n": int(n1),
129          "non_flagship_n": int(n2),
130          "pooled_sd": float(pooled_sd),
131          "cohens_d": float(d),
132          "t_stat": float(t_stat),
133          "p_value": float(p_value),
134      }
135  
136  
137  # ---- Mixed-effects model ----------------------------------------------------
138  
139  def fit_mixed_effects(df: pd.DataFrame, label: str) -> dict:
140      """
141      Fit hubris ~ is_flagship + (1|lab) using statsmodels MixedLM.
142      Returns fixed effect, SE, p-value, ICC, and variance components.
143  
144      Notes on boundary solutions: 4 of 11 labs have only a single model
145      (xAI, Cohere, ByteDance, Microsoft). Single-model groups contribute
146      no within-group variance, so the optimizer may place random-effects
147      variance at the boundary (zero). This is a known MixedLM behaviour
148      and affects ICC estimation but NOT the fixed-effect estimate, which
149      remains stable across all optimizers (bfgs, nm, powell all give
150      identical fixed effects). The fixed effect is the primary quantity
151      of interest for the reviewer response.
152      """
153      # MixedLM requires at least 2 groups with >1 observation each for random effects
154      # to be estimable. With 11 labs, 7 of which have multiple models, this is fine.
155      model = smf.mixedlm("hubris ~ is_flagship", df, groups=df["lab"])
156      with warnings.catch_warnings(record=True) as caught_warnings:
157          warnings.simplefilter("always")
158          result = model.fit(reml=True, method="lbfgs")
159  
160      # Collect any convergence/singularity warnings for reporting
161      warning_messages = [str(w.message) for w in caught_warnings]
162  
163      # Extract fixed effects
164      intercept = result.fe_params["Intercept"]
165      fe_flagship = result.fe_params["is_flagship"]
166      se_flagship = result.bse_fe["is_flagship"]
167  
168      # p-value: z-test (MixedLM uses z not t)
169      z_stat = fe_flagship / se_flagship
170      p_value = 2 * stats.norm.sf(abs(z_stat))
171  
172      # Variance components for ICC
173      # Random effects variance (lab-level)
174      re_var = float(result.cov_re.iloc[0, 0])
175      # Residual variance
176      resid_var = float(result.scale)
177  
178      # ICC = lab variance / (lab variance + residual variance)
179      icc = re_var / (re_var + resid_var)
180  
181      # 95% CI for fixed effect
182      ci_low = fe_flagship - 1.96 * se_flagship
183      ci_high = fe_flagship + 1.96 * se_flagship
184  
185      return {
186          "label":             label,
187          "n_models":          int(len(df)),
188          "n_labs":            int(df["lab"].nunique()),
189          "n_single_model_labs": int((df.groupby("lab").size() == 1).sum()),
190          "intercept":         float(intercept),
191          "fe_flagship":       float(fe_flagship),
192          "se_flagship":       float(se_flagship),
193          "z_stat":            float(z_stat),
194          "p_value":           float(p_value),
195          "ci_95_low":         float(ci_low),
196          "ci_95_high":        float(ci_high),
197          "re_var_lab":        float(re_var),
198          "resid_var":         float(resid_var),
199          "icc":               float(icc),
200          "converged":         bool(result.converged),
201          "boundary_solution": any("boundary" in w.lower() or "singular" in w.lower() for w in warning_messages),
202          "optimizer_warnings": warning_messages,
203      }
204  
205  
206  # ---- D1–D2 correlation ------------------------------------------------------
207  
208  def d1_d2_correlation(df: pd.DataFrame) -> dict:
209      # Only models where both D1 and D2 are available
210      both = df.dropna(subset=["d1", "d2"])
211      d1 = both["d1"].values
212      d2 = both["d2"].values
213  
214      pearson_r, pearson_p = stats.pearsonr(d1, d2)
215      spearman_r, spearman_p = stats.spearmanr(d1, d2)
216  
217      return {
218          "n_models_with_both": int(len(both)),
219          "pearson_r":          float(pearson_r),
220          "pearson_p":          float(pearson_p),
221          "spearman_r":         float(spearman_r),
222          "spearman_p":         float(spearman_p),
223      }
224  
225  
226  # ---- LaTeX snippet ----------------------------------------------------------
227  
228  def build_latex_snippet(
229      full_me: dict,
230      no_phi4_me: dict,
231      no_reasoning_me: dict,
232      naive_full: dict,
233      corr: dict,
234  ) -> str:
235      def fmt_p(p: float) -> str:
236          if p < 0.001:
237              return "p < 0.001"
238          elif p < 0.01:
239              return f"p = {p:.3f}"
240          else:
241              return f"p = {p:.2f}"
242  
243      lines = [
244          r"\subsection*{Mixed-Effects Model Robustness Check}",
245          "",
246          r"To address the concern that models from the same laboratory share training",
247          r"infrastructure---violating the independence assumption of the naive effect",
248          r"size estimate---we fit a mixed-effects model of the form",
249          r"\texttt{hubris $\sim$ is\_flagship + (1|lab)} using restricted maximum likelihood",
250          r"(REML) via \texttt{statsmodels} MixedLM \citep{seabold2010statsmodels}.",
251          r"Laboratory identity was modelled as a random intercept across",
252          f"{full_me['n_labs']} labs and {full_me['n_models']} models.",
253          "",
254          r"\paragraph{Primary result.}",
255          f"The fixed effect for flagship status was",
256          f"$\\hat{{\\beta}} = {full_me['fe_flagship']:.3f}$",
257          f"($SE = {full_me['se_flagship']:.3f}$,",
258          f"$z = {full_me['z_stat']:.2f}$,",
259          f"{fmt_p(full_me['p_value'])},",
260          f"95\\% CI $[{full_me['ci_95_low']:.3f},\\, {full_me['ci_95_high']:.3f}]$).",
261          f"For comparison, the naive independent-samples Cohen's $d = {naive_full['cohens_d']:.2f}$",
262          r"($t$-test, " + fmt_p(naive_full['p_value']) + r").",
263          f"The estimated intra-class correlation for lab was $\\mathrm{{ICC}} = {full_me['icc']:.3f}$",
264          r"(note: four of eleven labs contribute only one model, so the ICC estimate",
265          r"is on the boundary of the parameter space and should be interpreted cautiously;",
266          r"the fixed-effect estimate is unaffected and stable across multiple optimizers).",
267          r"The direction and magnitude of the flagship effect remain consistent",
268          r"after accounting for lab-level clustering.",
269          "",
270          r"\paragraph{Sensitivity checks.}",
271          r"Excluding Phi-4 (missing D1 data) yielded",
272          f"$\\hat{{\\beta}} = {no_phi4_me['fe_flagship']:.3f}$",
273          f"($SE = {no_phi4_me['se_flagship']:.3f}$, {fmt_p(no_phi4_me['p_value'])}).",
274          r"Excluding the two reasoning-specialised models (o3-mini, DeepSeek R1),",
275          r"which exhibit unusually elevated hubris scores by design, yielded",
276          f"$\\hat{{\\beta}} = {no_reasoning_me['fe_flagship']:.3f}$",
277          f"($SE = {no_reasoning_me['se_flagship']:.3f}$, {fmt_p(no_reasoning_me['p_value'])}).",
278          r"Neither exclusion materially changes the conclusion.",
279          "",
280          r"\paragraph{D1--D2 convergent validity.}",
281          f"Among the {corr['n_models_with_both']} models for which both",
282          r"D1 (score-suppression rate) and D2 (hubris score) are available,",
283          r"the two metrics are strongly correlated",
284          f"(Pearson $r = {corr['pearson_r']:.3f}$, {fmt_p(corr['pearson_p'])};",
285          f"Spearman $\\rho = {corr['spearman_r']:.3f}$, {fmt_p(corr['spearman_p'])}),",
286          r"supporting their convergent validity as complementary operationalisations",
287          r"of the same underlying epistemic-humility construct.",
288      ]
289      return "\n".join(lines)
290  
291  
292  # ---- Console summary --------------------------------------------------------
293  
294  def print_summary(
295      full_me: dict,
296      no_phi4_me: dict,
297      no_reasoning_me: dict,
298      naive_full: dict,
299      corr: dict,
300  ) -> None:
301      print()
302      print("=" * 72)
303      print("  MIXED-EFFECTS MODEL ANALYSIS — Reviewer Response")
304      print("=" * 72)
305      print()
306  
307      print("1. NAIVE COHEN'S d (independence assumed)")
308      print(f"   Flagship mean hubris:     {naive_full['flagship_mean']:.3f}  (n={naive_full['flagship_n']})")
309      print(f"   Non-flagship mean hubris: {naive_full['non_flagship_mean']:.3f}  (n={naive_full['non_flagship_n']})")
310      print(f"   Cohen's d:  {naive_full['cohens_d']:.3f}")
311      print(f"   t-stat:     {naive_full['t_stat']:.3f},  p = {naive_full['p_value']:.4e}")
312      print()
313  
314      print("2. MIXED-EFFECTS MODEL  hubris ~ is_flagship + (1|lab)")
315      for label, me in [
316          ("Full dataset (N=31)",               full_me),
317          ("Excluding Phi-4 (N=30)",            no_phi4_me),
318          ("Excluding reasoning models (N=29)", no_reasoning_me),
319      ]:
320          print(f"\n   [{label}]")
321          print(f"   n_models={me['n_models']}, n_labs={me['n_labs']}")
322          print(f"   Fixed effect (is_flagship): {me['fe_flagship']:.4f}  SE={me['se_flagship']:.4f}")
323          print(f"   z={me['z_stat']:.3f},  p={me['p_value']:.4e}")
324          print(f"   95% CI: [{me['ci_95_low']:.4f}, {me['ci_95_high']:.4f}]")
325          print(f"   ICC (lab):   {me['icc']:.4f}")
326          print(f"   Lab var:     {me['re_var_lab']:.6f}")
327          print(f"   Residual var:{me['resid_var']:.6f}")
328          print(f"   Converged:   {me['converged']}")
329      print()
330  
331      print("3. D1–D2 CORRELATION")
332      print(f"   n (both available): {corr['n_models_with_both']}")
333      print(f"   Pearson  r = {corr['pearson_r']:.4f},  p = {corr['pearson_p']:.4e}")
334      print(f"   Spearman r = {corr['spearman_r']:.4f},  p = {corr['spearman_p']:.4e}")
335      print()
336      print("=" * 72)
337  
338  
339  # ---- Main -------------------------------------------------------------------
340  
341  def main() -> None:
342      OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
343  
344      # Build dataframes
345      df_full        = build_df()
346      df_no_phi4     = build_df(exclude_phi4=True)
347      df_no_reasoning = build_df(exclude_reasoning=True)
348  
349      # Naive Cohen's d on full dataset
350      naive_full = naive_cohens_d(df_full)
351  
352      # Mixed-effects models
353      full_me        = fit_mixed_effects(df_full,         "full_n31")
354      no_phi4_me     = fit_mixed_effects(df_no_phi4,      "no_phi4_n30")
355      no_reasoning_me = fit_mixed_effects(df_no_reasoning, "no_reasoning_n29")
356  
357      # D1-D2 correlation (on full dataset; Phi-4 has d1=None so auto-dropped)
358      corr = d1_d2_correlation(df_full)
359  
360      # LaTeX
361      latex = build_latex_snippet(full_me, no_phi4_me, no_reasoning_me, naive_full, corr)
362  
363      # Assemble results dict
364      results = {
365          "analysis":       "mixed_effects_reviewer_response",
366          "date":           "2026-02-20",
367          "naive_cohens_d": naive_full,
368          "mixed_effects": {
369              "full_n31":        full_me,
370              "no_phi4_n30":     no_phi4_me,
371              "no_reasoning_n29": no_reasoning_me,
372          },
373          "d1_d2_correlation": corr,
374          "latex_snippet":    latex,
375      }
376  
377      OUTPUT_JSON.write_text(json.dumps(results, indent=2))
378      print(f"Results written to: {OUTPUT_JSON}")
379  
380      print_summary(full_me, no_phi4_me, no_reasoning_me, naive_full, corr)
381  
382      print("\nLaTeX snippet:\n")
383      print(latex)
384      print()
385  
386  
387  if __name__ == "__main__":
388      main()