# paper_figures.py
# EXTENDS: DES-c386de6567f0
"""
Paper Figure Generation — The Externalization Boundary (COLM 2026)

Generates publication-quality figures from benchmark data:
  Fig 1: D1 Bimodality — The Externalization Boundary (Section 4.1)
  Fig 2: Cross-Lab Flagship Convergence (Section 4.5)
  Fig 3: Reasoning Model Discipline Profiles (Section 4.2)
  Fig 4: Format Sensitivity Heatmap (Section 4.6)
  Fig 5: Composite Hubris Landscape (Section 4.5)

Data source: benchmarks/capability_hubris/results/sensitivity_analysis_COLM.json
Output:      research/results/paper_figures/
"""

import json
import sys
from pathlib import Path

import matplotlib
matplotlib.use('Agg')  # headless backend: figures are written to disk, never shown
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np

# Paths (repo root is three levels above this file)
REPO = Path(__file__).resolve().parent.parent.parent
DATA_FILE = REPO / "benchmarks" / "capability_hubris" / "results" / "sensitivity_analysis_COLM.json"
OUTPUT_DIR = REPO / "research" / "results" / "paper_figures"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Publication style: serif fonts, 300 dpi, tight bounding boxes
plt.rcParams.update({
    'font.size': 11,
    'font.family': 'serif',
    'axes.titlesize': 13,
    'axes.labelsize': 12,
    'xtick.labelsize': 9,
    'ytick.labelsize': 10,
    'legend.fontsize': 9,
    'figure.dpi': 300,
    'savefig.dpi': 300,
    'savefig.bbox': 'tight',
})

# Category colors (fallback gray is applied at use sites for unknown categories)
CAT_COLORS = {
    'Flagship': '#2ecc71',
    'Flagship (legacy)': '#27ae60',
    'Budget/distilled': '#e74c3c',
    'Reasoning-specialized': '#9b59b6',
    'Local/quantized': '#f39c12',
}

# Lab markers (reserved for scatter-style figures)
LAB_MARKERS = {
    'Anthropic': 'o', 'OpenAI': 's', 'Google': '^', 'Meta': 'D',
    'DeepSeek': 'v', 'Mistral': 'P', 'Alibaba': '*', 'xAI': 'X',
    'Cohere': 'h', 'ByteDance': 'p', 'AI21': '8',
}


def load_data():
    """Load the benchmark JSON and return its list of model records.

    Each record is expected to carry 'name', 'lab', 'category',
    'hubris_equal', and a 'dimensions' dict with keys D1..D5.
    """
    with open(DATA_FILE) as f:
        data = json.load(f)
    return data['models']


def fig1_d1_bimodality(models):
    """Fig 1: D1 Bimodality — The Externalization Boundary.

    Horizontal bar chart splitting models into externalizers (D1 == 0)
    and internalizers (D1 == 1), separated by a labeled boundary line.
    """
    fig, ax = plt.subplots(figsize=(10, 5))

    # Partition on the claimed-bimodal D1 score: externalizers first,
    # then internalizers. Exact float comparison is intentional — the
    # paper's claim is that trial-level D1 is exactly 0.0 or 1.0.
    ext = [m for m in models if m['dimensions']['D1'] == 0.0]
    inter = [m for m in models if m['dimensions']['D1'] == 1.0]

    # Models with an intermediate D1 would silently vanish from the
    # figure; surface that instead of hiding it.
    dropped = len(models) - len(ext) - len(inter)
    if dropped:
        print(f"  WARNING: Fig 1 excludes {dropped} model(s) with non-bimodal D1")

    # Sort each group by composite hubris so bars read low-to-high
    ext.sort(key=lambda m: m['hubris_equal'])
    inter.sort(key=lambda m: m['hubris_equal'])

    all_models = ext + inter
    names = [m['name'] for m in all_models]
    d1_vals = [m['dimensions']['D1'] for m in all_models]
    colors = [CAT_COLORS.get(m['category'], '#95a5a6') for m in all_models]

    ax.barh(range(len(names)), d1_vals, color=colors, edgecolor='white', linewidth=0.5)

    ax.set_yticks(range(len(names)))
    ax.set_yticklabels(names, fontsize=7)
    ax.set_xlabel('D1 (State Externalization)')
    ax.set_title('The Externalization Boundary: D1 is Bimodal at Trial Level (31-Model Core Study)')
    ax.set_xlim(-0.05, 1.1)

    # Boundary line between the two populations
    boundary_y = len(ext) - 0.5
    ax.axhline(y=boundary_y, color='red', linewidth=2, linestyle='--', alpha=0.7)
    ax.text(0.5, boundary_y + 0.3, 'EXTERNALIZATION BOUNDARY', ha='center',
            fontsize=10, fontweight='bold', color='red', alpha=0.8)

    # Group size annotations on either side of the boundary
    ax.text(0.02, boundary_y / 2, f'Externalizers\n(N={len(ext)})',
            ha='left', va='center', fontsize=9, fontstyle='italic', color='#2ecc71')
    ax.text(0.98, boundary_y + (len(inter) / 2) + 0.5, f'Internalizers\n(N={len(inter)})',
            ha='right', va='center', fontsize=9, fontstyle='italic', color='#e74c3c')

    # Legend: only the categories that actually appear in the data
    patches = [mpatches.Patch(color=c, label=l) for l, c in CAT_COLORS.items()
               if any(m['category'] == l for m in models)]
    ax.legend(handles=patches, loc='lower right', framealpha=0.9)

    ax.invert_yaxis()
    plt.tight_layout()
    plt.savefig(OUTPUT_DIR / 'fig1_d1_bimodality.png')
    plt.savefig(OUTPUT_DIR / 'fig1_d1_bimodality.pdf')
    plt.close()
    print("  Fig 1: D1 bimodality saved")


def fig2_flagship_convergence(models):
    """Fig 2: Cross-Lab Flagship Convergence.

    Horizontal bars of composite hubris for externalizing (D1 == 0)
    flagship models, colored by lab, with mean line and range band.
    """
    flagships = [m for m in models if m['category'] == 'Flagship'
                 and m['dimensions']['D1'] == 0.0]
    # Guard: min()/max()/mean() below would raise on an empty group
    if not flagships:
        print("  Fig 2: Skipped (no externalizing flagship models)")
        return
    flagships.sort(key=lambda m: m['hubris_equal'])

    fig, ax = plt.subplots(figsize=(10, 5))

    labs = [m['lab'] for m in flagships]
    hubris_vals = [m['hubris_equal'] for m in flagships]
    names = [m['name'] for m in flagships]

    # One distinct color per lab, drawn from a qualitative colormap
    unique_labs = sorted(set(labs))
    lab_colors = plt.cm.Set3(np.linspace(0, 1, len(unique_labs)))
    lab_color_map = dict(zip(unique_labs, lab_colors))

    colors = [lab_color_map[l] for l in labs]

    ax.barh(range(len(names)), hubris_vals, color=colors, edgecolor='gray', linewidth=0.5)

    mean_h = np.mean(hubris_vals)
    lo, hi = min(hubris_vals), max(hubris_vals)

    ax.set_yticks(range(len(names)))
    ax.set_yticklabels(names, fontsize=8)
    ax.set_xlabel('Composite Hubris (equal weighting)')
    # FIX: title previously hardcoded "Mean 0.063, Range [0.027, 0.087]",
    # which would silently disagree with the data; compute it instead.
    ax.set_title(f'Cross-Lab Flagship Convergence: Mean {mean_h:.3f}, Range [{lo:.3f}, {hi:.3f}]')

    # Mean line
    ax.axvline(x=mean_h, color='red', linewidth=1.5, linestyle='--', alpha=0.7,
               label=f'Mean: {mean_h:.3f}')

    # Convergence band spanning the observed range
    ax.axvspan(lo, hi, alpha=0.1, color='green',
               label=f'Range: {hi - lo:.3f}')

    patches = [mpatches.Patch(color=lab_color_map[l], label=l) for l in unique_labs]
    ax.legend(handles=patches, loc='lower right', framealpha=0.9, ncol=2, fontsize=7)

    ax.invert_yaxis()
    ax.set_xlim(0, hi * 1.3)
    plt.tight_layout()
    plt.savefig(OUTPUT_DIR / 'fig2_flagship_convergence.png')
    plt.savefig(OUTPUT_DIR / 'fig2_flagship_convergence.pdf')
    plt.close()
    print("  Fig 2: Flagship convergence saved")


def fig3_reasoning_profiles(models):
    """Fig 3: Reasoning Model Discipline Profiles (radar chart).

    Compares D1–D5 profiles for a fixed set of flagship vs.
    reasoning-specialized models; skipped if fewer than two are found.
    """
    target_names = ['GPT-4o', 'o3-mini', 'DeepSeek V3.1', 'DeepSeek R1']
    # First matching record per name (preserves target_names ordering)
    targets = []
    for name in target_names:
        match = next((m for m in models if m['name'] == name), None)
        if match is not None:
            targets.append(match)

    if len(targets) < 2:
        print("  Fig 3: Skipped (insufficient reasoning model data)")
        return

    dims = ['D1', 'D2', 'D3', 'D4', 'D5']
    dim_labels = ['D1\nState Ext.', 'D2\nOverconf.', 'D3\nTool Disc.', 'D4\nInstruct.', 'D5\nContext']

    fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))

    angles = np.linspace(0, 2 * np.pi, len(dims), endpoint=False).tolist()
    angles += angles[:1]  # close polygon

    colors_radar = ['#2ecc71', '#9b59b6', '#3498db', '#e74c3c']
    for i, m in enumerate(targets):
        values = [m['dimensions'][d] for d in dims]
        values += values[:1]  # close polygon
        ax.plot(angles, values, 'o-', linewidth=2, color=colors_radar[i % len(colors_radar)],
                label=f"{m['name']} (H={m['hubris_equal']:.3f})", markersize=6)
        ax.fill(angles, values, alpha=0.1, color=colors_radar[i % len(colors_radar)])

    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(dim_labels, fontsize=10)
    ax.set_ylim(0, 1.05)
    ax.set_yticks([0, 0.25, 0.5, 0.75, 1.0])
    ax.set_yticklabels(['0.0', '0.25', '0.5', '0.75', '1.0'], fontsize=8)
    ax.set_title('Reasoning Model Discipline Profiles\n(Flagship vs. Reasoning-Specialized)',
                 fontsize=13, pad=20)
    ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1), framealpha=0.9)

    plt.tight_layout()
    plt.savefig(OUTPUT_DIR / 'fig3_reasoning_profiles.png')
    plt.savefig(OUTPUT_DIR / 'fig3_reasoning_profiles.pdf')
    plt.close()
    print("  Fig 3: Reasoning profiles saved")


def fig4_format_sensitivity_heatmap():
    """Fig 4: Format Sensitivity Heatmap from N>=30 dead-trial-filtered data.

    Data is inlined (from n30_summary.json, 1,383 live trials) rather than
    loaded, because it comes from a different pipeline than DATA_FILE.
    None means the model/format combination was not measured.
    """
    # Updated from n30_summary.json (dead-trial-filtered, 1,383 live trials)
    data = {
        # Format-invariant (25 models)
        'GPT-4o': {'native': 0.00, 'text': 0.00, 'pythonic': 0.00, 'cluster': 'Format-inv.'},
        'GPT-4.1': {'native': 0.00, 'text': 0.00, 'pythonic': 0.00, 'cluster': 'Format-inv.'},
        'GPT-4o-mini': {'native': 0.07, 'text': 0.00, 'pythonic': 0.00, 'cluster': 'Format-inv.'},
        'Gemini Flash 2.0': {'native': 0.00, 'text': 0.00, 'pythonic': 0.00, 'cluster': 'Format-inv.'},
        'Kimi K2': {'native': 0.00, 'text': 0.00, 'pythonic': 0.00, 'cluster': 'Format-inv.'},
        'Mistral Large': {'native': 0.00, 'text': 0.00, 'pythonic': None, 'cluster': 'Format-inv.'},
        'Mixtral 8x22B': {'native': 0.00, 'text': 0.00, 'pythonic': None, 'cluster': 'Format-inv.'},
        'Qwen 2.5 72B': {'native': 0.00, 'text': 0.00, 'pythonic': None, 'cluster': 'Format-inv.'},
        'Qwen 3.5 397B': {'native': 0.00, 'text': 0.00, 'pythonic': 0.00, 'cluster': 'Format-inv.'},
        'MiMo Flash': {'native': 0.00, 'text': 0.00, 'pythonic': None, 'cluster': 'Format-inv.'},
        'Seed 2.0 Lite': {'native': 0.00, 'text': 0.00, 'pythonic': None, 'cluster': 'Format-inv.'},
        'Seed 2.0 Pro': {'native': 0.00, 'text': 0.00, 'pythonic': 0.10, 'cluster': 'Format-inv.'},
        'Grok 3': {'native': 0.00, 'text': 0.00, 'pythonic': 0.04, 'cluster': 'Format-inv.'},
        'GLM-4.7': {'native': 0.00, 'text': None, 'pythonic': 0.00, 'cluster': 'Format-inv.'},
        'DeepSeek R1': {'native': 0.00, 'text': 0.13, 'pythonic': None, 'cluster': 'Format-inv.'},
        'DeepSeek V3': {'native': 0.04, 'text': 0.09, 'pythonic': None, 'cluster': 'Format-inv.'},
        'ERNIE 4.5': {'native': None, 'text': 0.00, 'pythonic': None, 'cluster': 'Format-inv.'},
        'Longcat Flash': {'native': None, 'text': 0.00, 'pythonic': None, 'cluster': 'Format-inv.'},
        'Phi-4': {'native': None, 'text': 0.00, 'pythonic': None, 'cluster': 'Format-inv.'},
        'Qwen 3 235B': {'native': 0.10, 'text': 0.17, 'pythonic': 0.09, 'cluster': 'Format-inv.'},
        'MiniMax M2.5': {'native': 0.00, 'text': 0.13, 'pythonic': 0.17, 'cluster': 'Format-inv.'},
        'Seed 1.6': {'native': 0.33, 'text': 0.10, 'pythonic': None, 'cluster': 'Format-inv.'},
        'Nova Pro': {'native': 0.00, 'text': 0.00, 'pythonic': 0.67, 'cluster': 'Format-inv.'},
        'Hunyuan T1': {'native': 0.40, 'text': 0.67, 'pythonic': None, 'cluster': 'Format-inv.'},
        'Gemma 3 27B': {'native': None, 'text': 0.50, 'pythonic': 0.40, 'cluster': 'Format-inv.'},
        # API-channel-only (3 models)
        'Claude 3.5 Haiku': {'native': 0.00, 'text': 1.00, 'pythonic': 1.00, 'cluster': 'API-only'},
        'GPT-4.1 Mini': {'native': 0.00, 'text': 0.88, 'pythonic': 0.98, 'cluster': 'API-only'},
        'GLM-4.5': {'native': 0.00, 'text': 1.00, 'pythonic': 0.00, 'cluster': 'API-only'},
        # Text-channel (5 models)
        'Claude Sonnet 4': {'native': 1.00, 'text': 0.00, 'pythonic': 0.00, 'cluster': 'Text-ch.'},
        'o3-mini': {'native': 0.96, 'text': 0.20, 'pythonic': None, 'cluster': 'Text-ch.'},
        'Jamba Large': {'native': 1.00, 'text': 0.00, 'pythonic': None, 'cluster': 'Text-ch.'},
        'Llama 4 Maverick': {'native': 0.80, 'text': 0.00, 'pythonic': 0.17, 'cluster': 'Text-ch.'},
        'GLM-5': {'native': 0.88, 'text': 0.80, 'pythonic': 0.09, 'cluster': 'Text-ch.'},
        # Stochastic (6 models)
        'Claude 3.7 Sonnet': {'native': 0.50, 'text': 0.04, 'pythonic': 0.00, 'cluster': 'Stochastic'},
        'Llama 4 Scout': {'native': 0.60, 'text': 0.93, 'pythonic': 0.27, 'cluster': 'Stochastic'},
        'Llama 3.3 70B': {'native': 0.38, 'text': None, 'pythonic': None, 'cluster': 'Stochastic'},
        'Command R+': {'native': 0.40, 'text': None, 'pythonic': 1.00, 'cluster': 'Stochastic'},
        'Gemini 2.5 Pro': {'native': 1.00, 'text': 0.59, 'pythonic': 0.18, 'cluster': 'Stochastic'},
        'R1-Distill-70B': {'native': None, 'text': 0.30, 'pythonic': None, 'cluster': 'Stochastic'},
        # Tool-incompatible (5 models)
        'R1-Distill-Qwen-32B': {'native': None, 'text': 1.00, 'pythonic': None, 'cluster': 'Tool-inc.'},
        'Hunyuan': {'native': 0.00, 'text': 1.00, 'pythonic': 1.00, 'cluster': 'Tool-inc.'},
        'Gemma 2 27B': {'native': None, 'text': 0.93, 'pythonic': 1.00, 'cluster': 'Tool-inc.'},
        'Step Flash': {'native': 1.00, 'text': None, 'pythonic': None, 'cluster': 'Tool-inc.'},
        'QwQ-32B': {'native': 0.33, 'text': None, 'pythonic': None, 'cluster': 'Tool-inc.'},
    }

    cluster_order = ['Format-inv.', 'API-only', 'Text-ch.', 'Stochastic', 'Tool-inc.']
    cluster_colors = {
        'Format-inv.': '#2ecc71', 'API-only': '#3498db',
        'Text-ch.': '#f39c12', 'Stochastic': '#9b59b6', 'Tool-inc.': '#e74c3c'
    }

    # Sort by cluster then name so cluster rows are contiguous
    sorted_models = sorted(data.keys(),
                           key=lambda m: (cluster_order.index(data[m]['cluster']), m))

    formats = ['native', 'text', 'pythonic']
    format_labels = ['Native API', 'Text XML', 'Pythonic']

    fig, ax = plt.subplots(figsize=(6, 12))

    # Build heatmap matrix; NaN marks unmeasured cells
    matrix = np.full((len(sorted_models), len(formats)), np.nan)
    for i, model in enumerate(sorted_models):
        for j, fmt in enumerate(formats):
            val = data[model][fmt]
            if val is not None:
                matrix[i, j] = val

    # Custom colormap: green (0) -> yellow (0.5) -> red (1)
    from matplotlib.colors import LinearSegmentedColormap
    cmap = LinearSegmentedColormap.from_list('d1', ['#2ecc71', '#f1c40f', '#e74c3c'])
    cmap.set_bad(color='#ecf0f1')  # light gray for NaN (unmeasured) cells

    im = ax.imshow(matrix, cmap=cmap, aspect='auto', vmin=0, vmax=1)

    ax.set_xticks(range(len(format_labels)))
    ax.set_xticklabels(format_labels, fontsize=10)
    ax.set_yticks(range(len(sorted_models)))
    ax.set_yticklabels(sorted_models, fontsize=7)

    # Horizontal separators between clusters
    cluster_boundaries = []
    current_cluster = data[sorted_models[0]]['cluster']
    for i, model in enumerate(sorted_models):
        if data[model]['cluster'] != current_cluster:
            cluster_boundaries.append(i - 0.5)
            current_cluster = data[model]['cluster']

    for b in cluster_boundaries:
        ax.axhline(y=b, color='black', linewidth=1.5)

    # Per-cell value labels; em-dash marks missing data
    for i in range(len(sorted_models)):
        for j in range(len(formats)):
            val = matrix[i, j]
            if not np.isnan(val):
                # white text on dark (high-value) cells for contrast
                text_color = 'white' if val > 0.6 else 'black'
                ax.text(j, i, f'{val:.2f}', ha='center', va='center',
                        fontsize=7, color=text_color, fontweight='bold')
            else:
                ax.text(j, i, '—', ha='center', va='center',
                        fontsize=8, color='#bdc3c7')

    # Cluster labels on right, centered on each cluster's row span
    cluster_ranges = {}
    current_cluster = data[sorted_models[0]]['cluster']
    start = 0
    for i, model in enumerate(sorted_models):
        if data[model]['cluster'] != current_cluster:
            cluster_ranges[current_cluster] = (start, i - 1)
            current_cluster = data[model]['cluster']
            start = i
    cluster_ranges[current_cluster] = (start, len(sorted_models) - 1)

    for cluster, (s, e) in cluster_ranges.items():
        mid = (s + e) / 2
        ax.text(3.2, mid, cluster, ha='left', va='center', fontsize=8,
                color=cluster_colors[cluster], fontweight='bold')

    ax.set_title('Format Sensitivity of the Externalization Boundary\n(44 Models, N=10-45, Dead-Trial-Filtered)',
                 fontsize=12, pad=10)

    plt.colorbar(im, ax=ax, label='D1 Score', shrink=0.5)
    plt.tight_layout()
    plt.savefig(OUTPUT_DIR / 'fig4_format_sensitivity.png')
    plt.savefig(OUTPUT_DIR / 'fig4_format_sensitivity.pdf')
    plt.close()
    print("  Fig 4: Format sensitivity heatmap saved")


def fig5_hubris_landscape(models):
    """Fig 5: Composite Hubris Landscape — all models with a hubris score.

    Vertical bars sorted ascending by composite hubris, with flagship
    ceiling and mean reference lines when flagship data exists.
    """
    models_sorted = sorted([m for m in models if m.get('hubris_equal') is not None],
                           key=lambda m: m['hubris_equal'])

    fig, ax = plt.subplots(figsize=(12, 7))

    names = [m['name'] for m in models_sorted]
    hubris = [m['hubris_equal'] for m in models_sorted]
    colors = [CAT_COLORS.get(m['category'], '#95a5a6') for m in models_sorted]

    ax.bar(range(len(names)), hubris, color=colors, edgecolor='white', linewidth=0.5)

    ax.set_xticks(range(len(names)))
    ax.set_xticklabels(names, rotation=75, ha='right', fontsize=7)
    ax.set_ylabel('Composite Hubris (equal weighting)')
    ax.set_title('The Cross-Laboratory Distortion Field: 31-Model Discipline Landscape')

    # Flagship reference lines — guard against an empty flagship group,
    # where max()/mean() would raise.
    flagship_hubris = [m['hubris_equal'] for m in models if m['category'] == 'Flagship']
    if flagship_hubris:
        flagship_max = max(flagship_hubris)
        ax.axhline(y=flagship_max, color='green', linewidth=1, linestyle='--',
                   alpha=0.6, label=f'Flagship ceiling: {flagship_max:.3f}')

        flagship_mean = np.mean(flagship_hubris)
        ax.axhline(y=flagship_mean, color='green', linewidth=1.5, linestyle='-',
                   alpha=0.4, label=f'Flagship mean: {flagship_mean:.3f}')

    patches = [mpatches.Patch(color=c, label=l) for l, c in CAT_COLORS.items()
               if any(m['category'] == l for m in models)]
    ax.legend(handles=patches, loc='upper left', framealpha=0.9)

    plt.tight_layout()
    plt.savefig(OUTPUT_DIR / 'fig5_hubris_landscape.png')
    plt.savefig(OUTPUT_DIR / 'fig5_hubris_landscape.pdf')
    plt.close()
    print("  Fig 5: Hubris landscape saved")


def main():
    """Load benchmark data and generate all five paper figures."""
    print("Loading data...")
    models = load_data()
    print(f"  {len(models)} models loaded")

    print("\nGenerating figures...")
    fig1_d1_bimodality(models)
    fig2_flagship_convergence(models)
    fig3_reasoning_profiles(models)
    fig4_format_sensitivity_heatmap()
    fig5_hubris_landscape(models)

    print(f"\nAll figures saved to {OUTPUT_DIR}/")
    print("Files generated:")
    for f in sorted(OUTPUT_DIR.glob('fig*')):
        print(f"  {f.name}")


if __name__ == '__main__':
    main()