# analysis/paper_figures.py
  1  # EXTENDS: DES-c386de6567f0
  2  """
  3  Paper Figure Generation — The Externalization Boundary (COLM 2026)
  4  
  5  Generates publication-quality figures from benchmark data:
  6    Fig 1: D1 Bimodality — The Externalization Boundary (Section 4.1)
  7    Fig 2: Cross-Lab Flagship Convergence (Section 4.5)
  8    Fig 3: Reasoning Model Discipline Profiles (Section 4.2)
  9    Fig 4: Format Sensitivity Heatmap (Section 4.6)
 10    Fig 5: Composite Hubris Landscape (Section 4.5)
 11  
 12  Data source: benchmarks/capability_hubris/results/sensitivity_analysis_COLM.json
 13  Output: research/results/paper_figures/
 14  """
 15  
 16  import json
 17  import sys
 18  from pathlib import Path
 19  
 20  import matplotlib
 21  matplotlib.use('Agg')
 22  import matplotlib.pyplot as plt
 23  import matplotlib.patches as mpatches
 24  import numpy as np
 25  
 26  # Paths
 27  REPO = Path(__file__).resolve().parent.parent.parent
 28  DATA_FILE = REPO / "benchmarks" / "capability_hubris" / "results" / "sensitivity_analysis_COLM.json"
 29  OUTPUT_DIR = REPO / "research" / "results" / "paper_figures"
 30  OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
 31  
 32  # Style
 33  plt.rcParams.update({
 34      'font.size': 11,
 35      'font.family': 'serif',
 36      'axes.titlesize': 13,
 37      'axes.labelsize': 12,
 38      'xtick.labelsize': 9,
 39      'ytick.labelsize': 10,
 40      'legend.fontsize': 9,
 41      'figure.dpi': 300,
 42      'savefig.dpi': 300,
 43      'savefig.bbox': 'tight',
 44  })
 45  
 46  # Category colors
 47  CAT_COLORS = {
 48      'Flagship': '#2ecc71',
 49      'Flagship (legacy)': '#27ae60',
 50      'Budget/distilled': '#e74c3c',
 51      'Reasoning-specialized': '#9b59b6',
 52      'Local/quantized': '#f39c12',
 53  }
 54  
 55  # Lab markers
 56  LAB_MARKERS = {
 57      'Anthropic': 'o', 'OpenAI': 's', 'Google': '^', 'Meta': 'D',
 58      'DeepSeek': 'v', 'Mistral': 'P', 'Alibaba': '*', 'xAI': 'X',
 59      'Cohere': 'h', 'ByteDance': 'p', 'AI21': '8',
 60  }
 61  
 62  
 63  def load_data():
 64      with open(DATA_FILE) as f:
 65          data = json.load(f)
 66      return data['models']
 67  
 68  
 69  def fig1_d1_bimodality(models):
 70      """Fig 1: D1 Bimodality — The Externalization Boundary"""
 71      fig, ax = plt.subplots(figsize=(10, 5))
 72  
 73      # Sort: externalizers first, then internalizers
 74      ext = [m for m in models if m['dimensions']['D1'] == 0.0]
 75      inter = [m for m in models if m['dimensions']['D1'] == 1.0]
 76  
 77      # Sort each group by hubris
 78      ext.sort(key=lambda m: m['hubris_equal'])
 79      inter.sort(key=lambda m: m['hubris_equal'])
 80  
 81      all_models = ext + inter
 82      names = [m['name'] for m in all_models]
 83      d1_vals = [m['dimensions']['D1'] for m in all_models]
 84      colors = [CAT_COLORS.get(m['category'], '#95a5a6') for m in all_models]
 85  
 86      bars = ax.barh(range(len(names)), d1_vals, color=colors, edgecolor='white', linewidth=0.5)
 87  
 88      ax.set_yticks(range(len(names)))
 89      ax.set_yticklabels(names, fontsize=7)
 90      ax.set_xlabel('D1 (State Externalization)')
 91      ax.set_title('The Externalization Boundary: D1 is Bimodal at Trial Level (31-Model Core Study)')
 92      ax.set_xlim(-0.05, 1.1)
 93  
 94      # Boundary line
 95      boundary_y = len(ext) - 0.5
 96      ax.axhline(y=boundary_y, color='red', linewidth=2, linestyle='--', alpha=0.7)
 97      ax.text(0.5, boundary_y + 0.3, 'EXTERNALIZATION BOUNDARY', ha='center',
 98              fontsize=10, fontweight='bold', color='red', alpha=0.8)
 99  
100      # Labels for sides
101      ax.text(0.02, boundary_y / 2, f'Externalizers\n(N={len(ext)})',
102              ha='left', va='center', fontsize=9, fontstyle='italic', color='#2ecc71')
103      ax.text(0.98, boundary_y + (len(inter) / 2) + 0.5, f'Internalizers\n(N={len(inter)})',
104              ha='right', va='center', fontsize=9, fontstyle='italic', color='#e74c3c')
105  
106      # Legend
107      patches = [mpatches.Patch(color=c, label=l) for l, c in CAT_COLORS.items()
108                 if any(m['category'] == l for m in models)]
109      ax.legend(handles=patches, loc='lower right', framealpha=0.9)
110  
111      ax.invert_yaxis()
112      plt.tight_layout()
113      plt.savefig(OUTPUT_DIR / 'fig1_d1_bimodality.png')
114      plt.savefig(OUTPUT_DIR / 'fig1_d1_bimodality.pdf')
115      plt.close()
116      print("  Fig 1: D1 bimodality saved")
117  
118  
119  def fig2_flagship_convergence(models):
120      """Fig 2: Cross-Lab Flagship Convergence"""
121      flagships = [m for m in models if m['category'] == 'Flagship'
122                   and m['dimensions']['D1'] == 0.0]
123      flagships.sort(key=lambda m: m['hubris_equal'])
124  
125      fig, ax = plt.subplots(figsize=(10, 5))
126  
127      labs = [m['lab'] for m in flagships]
128      hubris_vals = [m['hubris_equal'] for m in flagships]
129      names = [m['name'] for m in flagships]
130  
131      # Color by lab
132      unique_labs = sorted(set(labs))
133      lab_colors = plt.cm.Set3(np.linspace(0, 1, len(unique_labs)))
134      lab_color_map = dict(zip(unique_labs, lab_colors))
135  
136      colors = [lab_color_map[l] for l in labs]
137  
138      bars = ax.barh(range(len(names)), hubris_vals, color=colors, edgecolor='gray', linewidth=0.5)
139  
140      ax.set_yticks(range(len(names)))
141      ax.set_yticklabels(names, fontsize=8)
142      ax.set_xlabel('Composite Hubris (equal weighting)')
143      ax.set_title('Cross-Lab Flagship Convergence: Mean 0.063, Range [0.027, 0.087]')
144  
145      # Mean line
146      mean_h = np.mean(hubris_vals)
147      ax.axvline(x=mean_h, color='red', linewidth=1.5, linestyle='--', alpha=0.7,
148                 label=f'Mean: {mean_h:.3f}')
149  
150      # Convergence band
151      ax.axvspan(min(hubris_vals), max(hubris_vals), alpha=0.1, color='green',
152                 label=f'Range: {max(hubris_vals) - min(hubris_vals):.3f}')
153  
154      patches = [mpatches.Patch(color=lab_color_map[l], label=l) for l in unique_labs]
155      ax.legend(handles=patches, loc='lower right', framealpha=0.9, ncol=2, fontsize=7)
156  
157      ax.invert_yaxis()
158      ax.set_xlim(0, max(hubris_vals) * 1.3)
159      plt.tight_layout()
160      plt.savefig(OUTPUT_DIR / 'fig2_flagship_convergence.png')
161      plt.savefig(OUTPUT_DIR / 'fig2_flagship_convergence.pdf')
162      plt.close()
163      print("  Fig 2: Flagship convergence saved")
164  
165  
166  def fig3_reasoning_profiles(models):
167      """Fig 3: Reasoning Model Discipline Profiles (Radar Chart)"""
168      target_names = ['GPT-4o', 'o3-mini', 'DeepSeek V3.1', 'DeepSeek R1']
169      targets = []
170      for name in target_names:
171          for m in models:
172              if m['name'] == name:
173                  targets.append(m)
174                  break
175  
176      if len(targets) < 2:
177          print("  Fig 3: Skipped (insufficient reasoning model data)")
178          return
179  
180      dims = ['D1', 'D2', 'D3', 'D4', 'D5']
181      dim_labels = ['D1\nState Ext.', 'D2\nOverconf.', 'D3\nTool Disc.', 'D4\nInstruct.', 'D5\nContext']
182  
183      fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))
184  
185      angles = np.linspace(0, 2 * np.pi, len(dims), endpoint=False).tolist()
186      angles += angles[:1]  # close polygon
187  
188      colors_radar = ['#2ecc71', '#9b59b6', '#3498db', '#e74c3c']
189      for i, m in enumerate(targets):
190          values = [m['dimensions'][d] for d in dims]
191          values += values[:1]  # close polygon
192          ax.plot(angles, values, 'o-', linewidth=2, color=colors_radar[i % len(colors_radar)],
193                  label=f"{m['name']} (H={m['hubris_equal']:.3f})", markersize=6)
194          ax.fill(angles, values, alpha=0.1, color=colors_radar[i % len(colors_radar)])
195  
196      ax.set_xticks(angles[:-1])
197      ax.set_xticklabels(dim_labels, fontsize=10)
198      ax.set_ylim(0, 1.05)
199      ax.set_yticks([0, 0.25, 0.5, 0.75, 1.0])
200      ax.set_yticklabels(['0.0', '0.25', '0.5', '0.75', '1.0'], fontsize=8)
201      ax.set_title('Reasoning Model Discipline Profiles\n(Flagship vs. Reasoning-Specialized)',
202                   fontsize=13, pad=20)
203      ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1), framealpha=0.9)
204  
205      plt.tight_layout()
206      plt.savefig(OUTPUT_DIR / 'fig3_reasoning_profiles.png')
207      plt.savefig(OUTPUT_DIR / 'fig3_reasoning_profiles.pdf')
208      plt.close()
209      print("  Fig 3: Reasoning profiles saved")
210  
211  
212  def fig4_format_sensitivity_heatmap():
213      """Fig 4: Format Sensitivity Heatmap from N>=30 dead-trial-filtered data"""
214      # Updated from n30_summary.json (dead-trial-filtered, 1,383 live trials)
215      data = {
216          # Format-invariant (25 models)
217          'GPT-4o':             {'native': 0.00, 'text': 0.00, 'pythonic': 0.00, 'cluster': 'Format-inv.'},
218          'GPT-4.1':            {'native': 0.00, 'text': 0.00, 'pythonic': 0.00, 'cluster': 'Format-inv.'},
219          'GPT-4o-mini':        {'native': 0.07, 'text': 0.00, 'pythonic': 0.00, 'cluster': 'Format-inv.'},
220          'Gemini Flash 2.0':   {'native': 0.00, 'text': 0.00, 'pythonic': 0.00, 'cluster': 'Format-inv.'},
221          'Kimi K2':            {'native': 0.00, 'text': 0.00, 'pythonic': 0.00, 'cluster': 'Format-inv.'},
222          'Mistral Large':      {'native': 0.00, 'text': 0.00, 'pythonic': None, 'cluster': 'Format-inv.'},
223          'Mixtral 8x22B':      {'native': 0.00, 'text': 0.00, 'pythonic': None, 'cluster': 'Format-inv.'},
224          'Qwen 2.5 72B':       {'native': 0.00, 'text': 0.00, 'pythonic': None, 'cluster': 'Format-inv.'},
225          'Qwen 3.5 397B':      {'native': 0.00, 'text': 0.00, 'pythonic': 0.00, 'cluster': 'Format-inv.'},
226          'MiMo Flash':         {'native': 0.00, 'text': 0.00, 'pythonic': None, 'cluster': 'Format-inv.'},
227          'Seed 2.0 Lite':      {'native': 0.00, 'text': 0.00, 'pythonic': None, 'cluster': 'Format-inv.'},
228          'Seed 2.0 Pro':       {'native': 0.00, 'text': 0.00, 'pythonic': 0.10, 'cluster': 'Format-inv.'},
229          'Grok 3':             {'native': 0.00, 'text': 0.00, 'pythonic': 0.04, 'cluster': 'Format-inv.'},
230          'GLM-4.7':            {'native': 0.00, 'text': None, 'pythonic': 0.00, 'cluster': 'Format-inv.'},
231          'DeepSeek R1':        {'native': 0.00, 'text': 0.13, 'pythonic': None, 'cluster': 'Format-inv.'},
232          'DeepSeek V3':        {'native': 0.04, 'text': 0.09, 'pythonic': None, 'cluster': 'Format-inv.'},
233          'ERNIE 4.5':          {'native': None, 'text': 0.00, 'pythonic': None, 'cluster': 'Format-inv.'},
234          'Longcat Flash':      {'native': None, 'text': 0.00, 'pythonic': None, 'cluster': 'Format-inv.'},
235          'Phi-4':              {'native': None, 'text': 0.00, 'pythonic': None, 'cluster': 'Format-inv.'},
236          'Qwen 3 235B':        {'native': 0.10, 'text': 0.17, 'pythonic': 0.09, 'cluster': 'Format-inv.'},
237          'MiniMax M2.5':       {'native': 0.00, 'text': 0.13, 'pythonic': 0.17, 'cluster': 'Format-inv.'},
238          'Seed 1.6':           {'native': 0.33, 'text': 0.10, 'pythonic': None, 'cluster': 'Format-inv.'},
239          'Nova Pro':           {'native': 0.00, 'text': 0.00, 'pythonic': 0.67, 'cluster': 'Format-inv.'},
240          'Hunyuan T1':         {'native': 0.40, 'text': 0.67, 'pythonic': None, 'cluster': 'Format-inv.'},
241          'Gemma 3 27B':        {'native': None, 'text': 0.50, 'pythonic': 0.40, 'cluster': 'Format-inv.'},
242          # API-channel-only (3 models)
243          'Claude 3.5 Haiku':   {'native': 0.00, 'text': 1.00, 'pythonic': 1.00, 'cluster': 'API-only'},
244          'GPT-4.1 Mini':       {'native': 0.00, 'text': 0.88, 'pythonic': 0.98, 'cluster': 'API-only'},
245          'GLM-4.5':            {'native': 0.00, 'text': 1.00, 'pythonic': 0.00, 'cluster': 'API-only'},
246          # Text-channel (5 models)
247          'Claude Sonnet 4':    {'native': 1.00, 'text': 0.00, 'pythonic': 0.00, 'cluster': 'Text-ch.'},
248          'o3-mini':            {'native': 0.96, 'text': 0.20, 'pythonic': None, 'cluster': 'Text-ch.'},
249          'Jamba Large':        {'native': 1.00, 'text': 0.00, 'pythonic': None, 'cluster': 'Text-ch.'},
250          'Llama 4 Maverick':   {'native': 0.80, 'text': 0.00, 'pythonic': 0.17, 'cluster': 'Text-ch.'},
251          'GLM-5':              {'native': 0.88, 'text': 0.80, 'pythonic': 0.09, 'cluster': 'Text-ch.'},
252          # Stochastic (6 models)
253          'Claude 3.7 Sonnet':  {'native': 0.50, 'text': 0.04, 'pythonic': 0.00, 'cluster': 'Stochastic'},
254          'Llama 4 Scout':      {'native': 0.60, 'text': 0.93, 'pythonic': 0.27, 'cluster': 'Stochastic'},
255          'Llama 3.3 70B':      {'native': 0.38, 'text': None, 'pythonic': None, 'cluster': 'Stochastic'},
256          'Command R+':         {'native': 0.40, 'text': None, 'pythonic': 1.00, 'cluster': 'Stochastic'},
257          'Gemini 2.5 Pro':     {'native': 1.00, 'text': 0.59, 'pythonic': 0.18, 'cluster': 'Stochastic'},
258          'R1-Distill-70B':     {'native': None, 'text': 0.30, 'pythonic': None, 'cluster': 'Stochastic'},
259          # Tool-incompatible (5 models)
260          'R1-Distill-Qwen-32B':{'native': None, 'text': 1.00, 'pythonic': None, 'cluster': 'Tool-inc.'},
261          'Hunyuan':            {'native': 0.00, 'text': 1.00, 'pythonic': 1.00, 'cluster': 'Tool-inc.'},
262          'Gemma 2 27B':        {'native': None, 'text': 0.93, 'pythonic': 1.00, 'cluster': 'Tool-inc.'},
263          'Step Flash':         {'native': 1.00, 'text': None, 'pythonic': None, 'cluster': 'Tool-inc.'},
264          'QwQ-32B':            {'native': 0.33, 'text': None, 'pythonic': None, 'cluster': 'Tool-inc.'},
265      }
266  
267      cluster_order = ['Format-inv.', 'API-only', 'Text-ch.', 'Stochastic', 'Tool-inc.']
268      cluster_colors = {
269          'Format-inv.': '#2ecc71', 'API-only': '#3498db',
270          'Text-ch.': '#f39c12', 'Stochastic': '#9b59b6', 'Tool-inc.': '#e74c3c'
271      }
272  
273      # Sort by cluster then name
274      sorted_models = sorted(data.keys(),
275                             key=lambda m: (cluster_order.index(data[m]['cluster']), m))
276  
277      formats = ['native', 'text', 'pythonic']
278      format_labels = ['Native API', 'Text XML', 'Pythonic']
279  
280      fig, ax = plt.subplots(figsize=(6, 12))
281  
282      # Build heatmap matrix
283      matrix = np.full((len(sorted_models), len(formats)), np.nan)
284      for i, model in enumerate(sorted_models):
285          for j, fmt in enumerate(formats):
286              val = data[model][fmt]
287              if val is not None:
288                  matrix[i, j] = val
289  
290      # Custom colormap: green (0) -> yellow (0.5) -> red (1)
291      from matplotlib.colors import LinearSegmentedColormap
292      cmap = LinearSegmentedColormap.from_list('d1', ['#2ecc71', '#f1c40f', '#e74c3c'])
293      cmap.set_bad(color='#ecf0f1')  # gray for None
294  
295      im = ax.imshow(matrix, cmap=cmap, aspect='auto', vmin=0, vmax=1)
296  
297      ax.set_xticks(range(len(format_labels)))
298      ax.set_xticklabels(format_labels, fontsize=10)
299      ax.set_yticks(range(len(sorted_models)))
300      ax.set_yticklabels(sorted_models, fontsize=7)
301  
302      # Add cluster separators
303      cluster_boundaries = []
304      current_cluster = data[sorted_models[0]]['cluster']
305      for i, model in enumerate(sorted_models):
306          if data[model]['cluster'] != current_cluster:
307              cluster_boundaries.append(i - 0.5)
308              current_cluster = data[model]['cluster']
309  
310      for b in cluster_boundaries:
311          ax.axhline(y=b, color='black', linewidth=1.5)
312  
313      # Add value labels
314      for i in range(len(sorted_models)):
315          for j in range(len(formats)):
316              val = matrix[i, j]
317              if not np.isnan(val):
318                  text_color = 'white' if val > 0.6 else 'black'
319                  ax.text(j, i, f'{val:.2f}', ha='center', va='center',
320                          fontsize=7, color=text_color, fontweight='bold')
321              else:
322                  ax.text(j, i, '—', ha='center', va='center',
323                          fontsize=8, color='#bdc3c7')
324  
325      # Cluster labels on right
326      cluster_ranges = {}
327      current_cluster = data[sorted_models[0]]['cluster']
328      start = 0
329      for i, model in enumerate(sorted_models):
330          if data[model]['cluster'] != current_cluster:
331              cluster_ranges[current_cluster] = (start, i - 1)
332              current_cluster = data[model]['cluster']
333              start = i
334      cluster_ranges[current_cluster] = (start, len(sorted_models) - 1)
335  
336      for cluster, (s, e) in cluster_ranges.items():
337          mid = (s + e) / 2
338          ax.text(3.2, mid, cluster, ha='left', va='center', fontsize=8,
339                  color=cluster_colors[cluster], fontweight='bold')
340  
341      ax.set_title('Format Sensitivity of the Externalization Boundary\n(44 Models, N=10-45, Dead-Trial-Filtered)',
342                   fontsize=12, pad=10)
343  
344      plt.colorbar(im, ax=ax, label='D1 Score', shrink=0.5)
345      plt.tight_layout()
346      plt.savefig(OUTPUT_DIR / 'fig4_format_sensitivity.png')
347      plt.savefig(OUTPUT_DIR / 'fig4_format_sensitivity.pdf')
348      plt.close()
349      print("  Fig 4: Format sensitivity heatmap saved")
350  
351  
352  def fig5_hubris_landscape(models):
353      """Fig 5: Composite Hubris Landscape — All Models"""
354      models_sorted = sorted([m for m in models if m.get('hubris_equal') is not None],
355                             key=lambda m: m['hubris_equal'])
356  
357      fig, ax = plt.subplots(figsize=(12, 7))
358  
359      names = [m['name'] for m in models_sorted]
360      hubris = [m['hubris_equal'] for m in models_sorted]
361      colors = [CAT_COLORS.get(m['category'], '#95a5a6') for m in models_sorted]
362  
363      bars = ax.bar(range(len(names)), hubris, color=colors, edgecolor='white', linewidth=0.5)
364  
365      ax.set_xticks(range(len(names)))
366      ax.set_xticklabels(names, rotation=75, ha='right', fontsize=7)
367      ax.set_ylabel('Composite Hubris (equal weighting)')
368      ax.set_title('The Cross-Laboratory Distortion Field: 31-Model Discipline Landscape')
369  
370      # Flagship ceiling
371      flagship_max = max(m['hubris_equal'] for m in models if m['category'] == 'Flagship')
372      ax.axhline(y=flagship_max, color='green', linewidth=1, linestyle='--',
373                 alpha=0.6, label=f'Flagship ceiling: {flagship_max:.3f}')
374  
375      # Mean lines
376      flagship_mean = np.mean([m['hubris_equal'] for m in models if m['category'] == 'Flagship'])
377      ax.axhline(y=flagship_mean, color='green', linewidth=1.5, linestyle='-',
378                 alpha=0.4, label=f'Flagship mean: {flagship_mean:.3f}')
379  
380      patches = [mpatches.Patch(color=c, label=l) for l, c in CAT_COLORS.items()
381                 if any(m['category'] == l for m in models)]
382      ax.legend(handles=patches, loc='upper left', framealpha=0.9)
383  
384      plt.tight_layout()
385      plt.savefig(OUTPUT_DIR / 'fig5_hubris_landscape.png')
386      plt.savefig(OUTPUT_DIR / 'fig5_hubris_landscape.pdf')
387      plt.close()
388      print("  Fig 5: Hubris landscape saved")
389  
390  
391  def main():
392      print("Loading data...")
393      models = load_data()
394      print(f"  {len(models)} models loaded")
395  
396      print("\nGenerating figures...")
397      fig1_d1_bimodality(models)
398      fig2_flagship_convergence(models)
399      fig3_reasoning_profiles(models)
400      fig4_format_sensitivity_heatmap()
401      fig5_hubris_landscape(models)
402  
403      print(f"\nAll figures saved to {OUTPUT_DIR}/")
404      print("Files generated:")
405      for f in sorted(OUTPUT_DIR.glob('fig*')):
406          print(f"  {f.name}")
407  
408  
409  if __name__ == '__main__':
410      main()