/ paper / generated_macros.tex
generated_macros.tex
  1  % Auto-generated by research/scripts/generate_paper_data.py
  2  % Generated: 2026-02-21T15:29:35.504460
  3  % DO NOT EDIT MANUALLY. Regenerate with:
  4  %   .venv/bin/python3 research/scripts/generate_paper_data.py
  5  
  6  % ---- Aggregate statistics ----
  7  \newcommand{\nModels}{48}
  8  \newcommand{\nLabs}{22}
  9  \newcommand{\nValidTrials}{2,101}
 10  \newcommand{\nLiveTrials}{2,058}
 11  \newcommand{\nInternalizerTrials}{43}
 12  \newcommand{\nExcludedTrials}{3,804}
 13  \newcommand{\nRawTrials}{5,905}
 14  \newcommand{\exclusionRatePct}{64}
 15  \newcommand{\nFisherSigFDR}{14}
 16  \newcommand{\nFisherTestable}{44}
 17  
 18  % ---- Per-model D1 (weighted mean across formats) ----
 19  \newcommand{\dclaude37sonnetMean}{0.130}
 20  \newcommand{\dclaude37sonnetN}{115}
 21  \newcommand{\dclaude37sonnetnativeapiMean}{0.467}
 22  \newcommand{\dclaude37sonnetnativeapiN}{30}
 23  \newcommand{\dclaude37sonnettextxmlMean}{0.022}
 24  \newcommand{\dclaude37sonnettextxmlN}{45}
 25  \newcommand{\dclaude37sonnetpythonictextMean}{0.000}
 26  \newcommand{\dclaude37sonnetpythonictextN}{40}
 27  \newcommand{\dclaudehaikuMean}{0.668}
 28  \newcommand{\dclaudehaikuN}{90}
 29  \newcommand{\dclaudehaikunativeapiMean}{0.003}
 30  \newcommand{\dclaudehaikunativeapiN}{30}
 31  \newcommand{\dclaudehaikutextxmlMean}{1.000}
 32  \newcommand{\dclaudehaikutextxmlN}{30}
 33  \newcommand{\dclaudehaikupythonictextMean}{1.000}
 34  \newcommand{\dclaudehaikupythonictextN}{30}
 35  \newcommand{\dclaudeopus46Mean}{0.667}
 36  \newcommand{\dclaudeopus46N}{30}
 37  \newcommand{\dclaudeopus46nativeapiMean}{0.000}
 38  \newcommand{\dclaudeopus46nativeapiN}{10}
 39  \newcommand{\dclaudeopus46textxmlMean}{1.000}
 40  \newcommand{\dclaudeopus46textxmlN}{10}
 41  \newcommand{\dclaudeopus46pythonictextMean}{1.000}
 42  \newcommand{\dclaudeopus46pythonictextN}{10}
 43  \newcommand{\dclaudesonnetMean}{0.333}
 44  \newcommand{\dclaudesonnetN}{90}
 45  \newcommand{\dclaudesonnetnativeapiMean}{1.000}
 46  \newcommand{\dclaudesonnetnativeapiN}{30}
 47  \newcommand{\dclaudesonnettextxmlMean}{0.000}
 48  \newcommand{\dclaudesonnettextxmlN}{30}
 49  \newcommand{\dclaudesonnetpythonictextMean}{0.000}
 50  \newcommand{\dclaudesonnetpythonictextN}{30}
 51  \newcommand{\dclaudesonnet46Mean}{0.300}
 52  \newcommand{\dclaudesonnet46N}{30}
 53  \newcommand{\dclaudesonnet46nativeapiMean}{0.000}
 54  \newcommand{\dclaudesonnet46nativeapiN}{10}
 55  \newcommand{\dclaudesonnet46textxmlMean}{0.900}
 56  \newcommand{\dclaudesonnet46textxmlN}{10}
 57  \newcommand{\dclaudesonnet46pythonictextMean}{0.000}
 58  \newcommand{\dclaudesonnet46pythonictextN}{10}
 59  \newcommand{\dcommandrplusMean}{0.595}
 60  \newcommand{\dcommandrplusN}{42}
 61  \newcommand{\dcommandrplusnativeapiMean}{0.500}
 62  \newcommand{\dcommandrplusnativeapiN}{20}
 63  \newcommand{\dcommandrplustextxmlMean}{1.000}
 64  \newcommand{\dcommandrplustextxmlN}{10}
 65  \newcommand{\dcommandrpluspythonictextMean}{0.417}
 66  \newcommand{\dcommandrpluspythonictextN}{12}
 67  \newcommand{\ddeepseekr1Mean}{0.077}
 68  \newcommand{\ddeepseekr1N}{26}
 69  \newcommand{\ddeepseekr1nativeapiMean}{0.000}
 70  \newcommand{\ddeepseekr1nativeapiN}{10}
 71  \newcommand{\ddeepseekr1textxmlMean}{0.125}
 72  \newcommand{\ddeepseekr1textxmlN}{16}
 73  \newcommand{\ddeepseekr1distillllamaMean}{0.300}
 74  \newcommand{\ddeepseekr1distillllamaN}{10}
 75  \newcommand{\ddeepseekr1distillllamatextxmlMean}{0.300}
 76  \newcommand{\ddeepseekr1distillllamatextxmlN}{10}
 77  \newcommand{\ddeepseekr1distillqwenMean}{1.000}
 78  \newcommand{\ddeepseekr1distillqwenN}{16}
 79  \newcommand{\ddeepseekr1distillqwentextxmlMean}{1.000}
 80  \newcommand{\ddeepseekr1distillqwentextxmlN}{16}
 81  \newcommand{\ddeepseekv3Mean}{0.067}
 82  \newcommand{\ddeepseekv3N}{90}
 83  \newcommand{\ddeepseekv3nativeapiMean}{0.044}
 84  \newcommand{\ddeepseekv3nativeapiN}{45}
 85  \newcommand{\ddeepseekv3textxmlMean}{0.089}
 86  \newcommand{\ddeepseekv3textxmlN}{45}
 87  \newcommand{\dernie45Mean}{0.032}
 88  \newcommand{\dernie45N}{31}
 89  \newcommand{\dernie45textxmlMean}{0.000}
 90  \newcommand{\dernie45textxmlN}{20}
 91  \newcommand{\dernie45pythonictextMean}{0.091}
 92  \newcommand{\dernie45pythonictextN}{11}
 93  \newcommand{\dgemini25proMean}{0.592}
 94  \newcommand{\dgemini25proN}{49}
 95  \newcommand{\dgemini25pronativeapiMean}{1.000}
 96  \newcommand{\dgemini25pronativeapiN}{10}
 97  \newcommand{\dgemini25protextxmlMean}{0.593}
 98  \newcommand{\dgemini25protextxmlN}{27}
 99  \newcommand{\dgemini25propythonictextMean}{0.250}
100  \newcommand{\dgemini25propythonictextN}{12}
101  \newcommand{\dgeminiflashMean}{0.000}
102  \newcommand{\dgeminiflashN}{45}
103  \newcommand{\dgeminiflashnativeapiMean}{0.000}
104  \newcommand{\dgeminiflashnativeapiN}{15}
105  \newcommand{\dgeminiflashtextxmlMean}{0.000}
106  \newcommand{\dgeminiflashtextxmlN}{15}
107  \newcommand{\dgeminiflashpythonictextMean}{0.000}
108  \newcommand{\dgeminiflashpythonictextN}{15}
109  \newcommand{\dgemma327bMean}{0.400}
110  \newcommand{\dgemma327bN}{35}
111  \newcommand{\dgemma327btextxmlMean}{0.400}
112  \newcommand{\dgemma327btextxmlN}{20}
113  \newcommand{\dgemma327bpythonictextMean}{0.400}
114  \newcommand{\dgemma327bpythonictextN}{15}
115  \newcommand{\dgemma27bMean}{0.967}
116  \newcommand{\dgemma27bN}{30}
117  \newcommand{\dgemma27btextxmlMean}{0.933}
118  \newcommand{\dgemma27btextxmlN}{15}
119  \newcommand{\dgemma27bpythonictextMean}{1.000}
120  \newcommand{\dgemma27bpythonictextN}{15}
121  \newcommand{\dglm45Mean}{0.405}
122  \newcommand{\dglm45N}{89}
123  \newcommand{\dglm45nativeapiMean}{0.233}
124  \newcommand{\dglm45nativeapiN}{30}
125  \newcommand{\dglm45textxmlMean}{1.000}
126  \newcommand{\dglm45textxmlN}{29}
127  \newcommand{\dglm45pythonictextMean}{0.000}
128  \newcommand{\dglm45pythonictextN}{30}
129  \newcommand{\dglm47Mean}{0.333}
130  \newcommand{\dglm47N}{30}
131  \newcommand{\dglm47nativeapiMean}{0.000}
132  \newcommand{\dglm47nativeapiN}{10}
133  \newcommand{\dglm47textxmlMean}{1.000}
134  \newcommand{\dglm47textxmlN}{10}
135  \newcommand{\dglm47pythonictextMean}{0.000}
136  \newcommand{\dglm47pythonictextN}{10}
137  \newcommand{\dglm5Mean}{0.726}
138  \newcommand{\dglm5N}{51}
139  \newcommand{\dglm5nativeapiMean}{0.882}
140  \newcommand{\dglm5nativeapiN}{17}
141  \newcommand{\dglm5textxmlMean}{0.913}
142  \newcommand{\dglm5textxmlN}{23}
143  \newcommand{\dglm5pythonictextMean}{0.091}
144  \newcommand{\dglm5pythonictextN}{11}
145  \newcommand{\dgpt41Mean}{0.033}
146  \newcommand{\dgpt41N}{30}
147  \newcommand{\dgpt41nativeapiMean}{0.000}
148  \newcommand{\dgpt41nativeapiN}{10}
149  \newcommand{\dgpt41textxmlMean}{0.000}
150  \newcommand{\dgpt41textxmlN}{10}
151  \newcommand{\dgpt41pythonictextMean}{0.100}
152  \newcommand{\dgpt41pythonictextN}{10}
153  \newcommand{\dgpt41miniMean}{0.617}
154  \newcommand{\dgpt41miniN}{120}
155  \newcommand{\dgpt41mininativeapiMean}{0.000}
156  \newcommand{\dgpt41mininativeapiN}{40}
157  \newcommand{\dgpt41minitextxmlMean}{0.875}
158  \newcommand{\dgpt41minitextxmlN}{40}
159  \newcommand{\dgpt41minipythonictextMean}{0.975}
160  \newcommand{\dgpt41minipythonictextN}{40}
161  \newcommand{\dgpt4oMean}{0.000}
162  \newcommand{\dgpt4oN}{30}
163  \newcommand{\dgpt4onativeapiMean}{0.000}
164  \newcommand{\dgpt4onativeapiN}{10}
165  \newcommand{\dgpt4otextxmlMean}{0.000}
166  \newcommand{\dgpt4otextxmlN}{10}
167  \newcommand{\dgpt4opythonictextMean}{0.000}
168  \newcommand{\dgpt4opythonictextN}{10}
169  \newcommand{\dgpt4ominiMean}{0.022}
170  \newcommand{\dgpt4ominiN}{45}
171  \newcommand{\dgpt4omininativeapiMean}{0.067}
172  \newcommand{\dgpt4omininativeapiN}{15}
173  \newcommand{\dgpt4ominitextxmlMean}{0.000}
174  \newcommand{\dgpt4ominitextxmlN}{15}
175  \newcommand{\dgpt4ominipythonictextMean}{0.000}
176  \newcommand{\dgpt4ominipythonictextN}{15}
177  \newcommand{\dgrok3Mean}{0.022}
178  \newcommand{\dgrok3N}{45}
179  \newcommand{\dgrok3nativeapiMean}{0.000}
180  \newcommand{\dgrok3nativeapiN}{10}
181  \newcommand{\dgrok3textxmlMean}{0.000}
182  \newcommand{\dgrok3textxmlN}{10}
183  \newcommand{\dgrok3pythonictextMean}{0.040}
184  \newcommand{\dgrok3pythonictextN}{25}
185  \newcommand{\dhunyuanMean}{0.857}
186  \newcommand{\dhunyuanN}{70}
187  \newcommand{\dhunyuannativeapiMean}{0.000}
188  \newcommand{\dhunyuannativeapiN}{10}
189  \newcommand{\dhunyuantextxmlMean}{1.000}
190  \newcommand{\dhunyuantextxmlN}{30}
191  \newcommand{\dhunyuanpythonictextMean}{1.000}
192  \newcommand{\dhunyuanpythonictextN}{30}
193  \newcommand{\dhunyuant1Mean}{0.700}
194  \newcommand{\dhunyuant1N}{20}
195  \newcommand{\dhunyuant1nativeapiMean}{0.700}
196  \newcommand{\dhunyuant1nativeapiN}{10}
197  \newcommand{\dhunyuant1textxmlMean}{0.700}
198  \newcommand{\dhunyuant1textxmlN}{10}
199  \newcommand{\djambalargeMean}{0.500}
200  \newcommand{\djambalargeN}{60}
201  \newcommand{\djambalargenativeapiMean}{1.000}
202  \newcommand{\djambalargenativeapiN}{30}
203  \newcommand{\djambalargetextxmlMean}{0.000}
204  \newcommand{\djambalargetextxmlN}{30}
205  \newcommand{\dkimik2Mean}{0.000}
206  \newcommand{\dkimik2N}{30}
207  \newcommand{\dkimik2nativeapiMean}{0.000}
208  \newcommand{\dkimik2nativeapiN}{10}
209  \newcommand{\dkimik2textxmlMean}{0.000}
210  \newcommand{\dkimik2textxmlN}{10}
211  \newcommand{\dkimik2pythonictextMean}{0.000}
212  \newcommand{\dkimik2pythonictextN}{10}
213  \newcommand{\dllama3370bMean}{0.179}
214  \newcommand{\dllama3370bN}{39}
215  \newcommand{\dllama3370bnativeapiMean}{0.278}
216  \newcommand{\dllama3370bnativeapiN}{18}
217  \newcommand{\dllama3370btextxmlMean}{0.182}
218  \newcommand{\dllama3370btextxmlN}{11}
219  \newcommand{\dllama3370bpythonictextMean}{0.000}
220  \newcommand{\dllama3370bpythonictextN}{10}
221  \newcommand{\dllama4maverickMean}{0.315}
222  \newcommand{\dllama4maverickN}{92}
223  \newcommand{\dllama4mavericknativeapiMean}{0.900}
224  \newcommand{\dllama4mavericknativeapiN}{30}
225  \newcommand{\dllama4mavericktextxmlMean}{0.000}
226  \newcommand{\dllama4mavericktextxmlN}{30}
227  \newcommand{\dllama4maverickpythonictextMean}{0.062}
228  \newcommand{\dllama4maverickpythonictextN}{32}
229  \newcommand{\dllama4scoutMean}{0.600}
230  \newcommand{\dllama4scoutN}{45}
231  \newcommand{\dllama4scoutnativeapiMean}{0.600}
232  \newcommand{\dllama4scoutnativeapiN}{15}
233  \newcommand{\dllama4scouttextxmlMean}{0.933}
234  \newcommand{\dllama4scouttextxmlN}{15}
235  \newcommand{\dllama4scoutpythonictextMean}{0.267}
236  \newcommand{\dllama4scoutpythonictextN}{15}
237  \newcommand{\dlongcatflashMean}{0.000}
238  \newcommand{\dlongcatflashN}{10}
239  \newcommand{\dlongcatflashtextxmlMean}{0.000}
240  \newcommand{\dlongcatflashtextxmlN}{10}
241  \newcommand{\dmimoflashMean}{0.000}
242  \newcommand{\dmimoflashN}{20}
243  \newcommand{\dmimoflashnativeapiMean}{0.000}
244  \newcommand{\dmimoflashnativeapiN}{10}
245  \newcommand{\dmimoflashtextxmlMean}{0.000}
246  \newcommand{\dmimoflashtextxmlN}{10}
247  \newcommand{\dminimaxm25Mean}{0.118}
248  \newcommand{\dminimaxm25N}{51}
249  \newcommand{\dminimaxm25nativeapiMean}{0.000}
250  \newcommand{\dminimaxm25nativeapiN}{10}
251  \newcommand{\dminimaxm25textxmlMean}{0.130}
252  \newcommand{\dminimaxm25textxmlN}{23}
253  \newcommand{\dminimaxm25pythonictextMean}{0.167}
254  \newcommand{\dminimaxm25pythonictextN}{18}
255  \newcommand{\dmistral7bMean}{0.000}
256  \newcommand{\dmistral7bN}{20}
257  \newcommand{\dmistral7btextxmlMean}{0.000}
258  \newcommand{\dmistral7btextxmlN}{10}
259  \newcommand{\dmistral7bpythonictextMean}{0.000}
260  \newcommand{\dmistral7bpythonictextN}{10}
261  \newcommand{\dmistrallargeMean}{0.000}
262  \newcommand{\dmistrallargeN}{30}
263  \newcommand{\dmistrallargenativeapiMean}{0.000}
264  \newcommand{\dmistrallargenativeapiN}{15}
265  \newcommand{\dmistrallargetextxmlMean}{0.000}
266  \newcommand{\dmistrallargetextxmlN}{15}
267  \newcommand{\dmixtral8x22bMean}{0.000}
268  \newcommand{\dmixtral8x22bN}{20}
269  \newcommand{\dmixtral8x22bnativeapiMean}{0.000}
270  \newcommand{\dmixtral8x22bnativeapiN}{10}
271  \newcommand{\dmixtral8x22btextxmlMean}{0.000}
272  \newcommand{\dmixtral8x22btextxmlN}{10}
273  \newcommand{\dnovaproMean}{0.061}
274  \newcommand{\dnovaproN}{33}
275  \newcommand{\dnovapronativeapiMean}{0.000}
276  \newcommand{\dnovapronativeapiN}{10}
277  \newcommand{\dnovaprotextxmlMean}{0.000}
278  \newcommand{\dnovaprotextxmlN}{10}
279  \newcommand{\dnovapropythonictextMean}{0.154}
280  \newcommand{\dnovapropythonictextN}{13}
281  \newcommand{\do3miniMean}{0.650}
282  \newcommand{\do3miniN}{120}
283  \newcommand{\do3mininativeapiMean}{0.956}
284  \newcommand{\do3mininativeapiN}{45}
285  \newcommand{\do3minitextxmlMean}{0.200}
286  \newcommand{\do3minitextxmlN}{45}
287  \newcommand{\do3minipythonictextMean}{0.867}
288  \newcommand{\do3minipythonictextN}{30}
289  \newcommand{\dphi4Mean}{0.000}
290  \newcommand{\dphi4N}{10}
291  \newcommand{\dphi4textxmlMean}{0.000}
292  \newcommand{\dphi4textxmlN}{10}
293  \newcommand{\dqwen257bMean}{0.200}
294  \newcommand{\dqwen257bN}{30}
295  \newcommand{\dqwen257bnativeapiMean}{0.600}
296  \newcommand{\dqwen257bnativeapiN}{10}
297  \newcommand{\dqwen257btextxmlMean}{0.000}
298  \newcommand{\dqwen257btextxmlN}{10}
299  \newcommand{\dqwen257bpythonictextMean}{0.000}
300  \newcommand{\dqwen257bpythonictextN}{10}
301  \newcommand{\dqwen35397bMean}{0.000}
302  \newcommand{\dqwen35397bN}{30}
303  \newcommand{\dqwen35397bnativeapiMean}{0.000}
304  \newcommand{\dqwen35397bnativeapiN}{10}
305  \newcommand{\dqwen35397btextxmlMean}{0.000}
306  \newcommand{\dqwen35397btextxmlN}{10}
307  \newcommand{\dqwen35397bpythonictextMean}{0.000}
308  \newcommand{\dqwen35397bpythonictextN}{10}
309  \newcommand{\dqwen3235bMean}{0.121}
310  \newcommand{\dqwen3235bN}{33}
311  \newcommand{\dqwen3235bnativeapiMean}{0.100}
312  \newcommand{\dqwen3235bnativeapiN}{10}
313  \newcommand{\dqwen3235btextxmlMean}{0.167}
314  \newcommand{\dqwen3235btextxmlN}{12}
315  \newcommand{\dqwen3235bpythonictextMean}{0.091}
316  \newcommand{\dqwen3235bpythonictextN}{11}
317  \newcommand{\dqwen72bMean}{0.000}
318  \newcommand{\dqwen72bN}{30}
319  \newcommand{\dqwen72bnativeapiMean}{0.000}
320  \newcommand{\dqwen72bnativeapiN}{15}
321  \newcommand{\dqwen72btextxmlMean}{0.000}
322  \newcommand{\dqwen72btextxmlN}{15}
323  \newcommand{\dqwq32bMean}{0.043}
324  \newcommand{\dqwq32bN}{23}
325  \newcommand{\dqwq32bnativeapiMean}{0.077}
326  \newcommand{\dqwq32bnativeapiN}{13}
327  \newcommand{\dqwq32btextxmlMean}{0.000}
328  \newcommand{\dqwq32btextxmlN}{10}
329  \newcommand{\dseed16Mean}{0.065}
330  \newcommand{\dseed16N}{46}
331  \newcommand{\dseed16nativeapiMean}{0.125}
332  \newcommand{\dseed16nativeapiN}{16}
333  \newcommand{\dseed16textxmlMean}{0.050}
334  \newcommand{\dseed16textxmlN}{20}
335  \newcommand{\dseed16pythonictextMean}{0.000}
336  \newcommand{\dseed16pythonictextN}{10}
337  \newcommand{\dseed20liteMean}{0.000}
338  \newcommand{\dseed20liteN}{20}
339  \newcommand{\dseed20litenativeapiMean}{0.000}
340  \newcommand{\dseed20litenativeapiN}{10}
341  \newcommand{\dseed20litetextxmlMean}{0.000}
342  \newcommand{\dseed20litetextxmlN}{10}
343  \newcommand{\dseed20proMean}{0.033}
344  \newcommand{\dseed20proN}{30}
345  \newcommand{\dseed20pronativeapiMean}{0.000}
346  \newcommand{\dseed20pronativeapiN}{10}
347  \newcommand{\dseed20protextxmlMean}{0.000}
348  \newcommand{\dseed20protextxmlN}{10}
349  \newcommand{\dseed20propythonictextMean}{0.100}
350  \newcommand{\dseed20propythonictextN}{10}
351  \newcommand{\dstepflashMean}{1.000}
352  \newcommand{\dstepflashN}{20}
353  \newcommand{\dstepflashnativeapiMean}{1.000}
354  \newcommand{\dstepflashnativeapiN}{10}
355  \newcommand{\dstepflashtextxmlMean}{1.000}
356  \newcommand{\dstepflashtextxmlN}{10}