% generated_macros.tex
% Auto-generated by research/scripts/generate_paper_data.py
% Generated: 2026-02-21T15:29:35.504460
% DO NOT EDIT MANUALLY. Regenerate with:
%   .venv/bin/python3 research/scripts/generate_paper_data.py
%
% Layout: one definition per line. Only comments and definitions belong in
% this file; any other text would be typeset (or raise an error if the file
% is \input in the preamble).
%
% Naming: a TeX control word consists of letters only, so a key containing a
% digit (e.g. dclaude37sonnetMean) cannot be written as \dclaude37sonnetMean.
% Such keys are defined through \csname and must be read the same way:
%   \csname dclaude37sonnetMean\endcsname
% Letter-only keys (e.g. \nModels, \dclaudehaikuMean) are used directly.
% NOTE(review): apply the same \csname form in generate_paper_data.py,
% otherwise the next regeneration will bring the invalid names back.

% ---- Aggregate statistics ----
\newcommand{\nModels}{48}
\newcommand{\nLabs}{22}
\newcommand{\nValidTrials}{2,101}
\newcommand{\nLiveTrials}{2,058}
\newcommand{\nInternalizerTrials}{43}
\newcommand{\nExcludedTrials}{3,804}
\newcommand{\nRawTrials}{5,905}
\newcommand{\exclusionRatePct}{64}
\newcommand{\nFisherSigFDR}{14}
\newcommand{\nFisherTestable}{44}

% ---- Per-model D1 (weighted mean across formats) ----
% Key pattern: d<model>[<format>]Mean and d<model>[<format>]N, where <format>
% is nativeapi, textxml or pythonictext; keys without a format are the
% per-model aggregate. N is presumably the number of trials behind the
% matching mean -- confirm against the generator.
%
% claude37sonnet
\expandafter\newcommand\csname dclaude37sonnetMean\endcsname{0.130}
\expandafter\newcommand\csname dclaude37sonnetN\endcsname{115}
\expandafter\newcommand\csname dclaude37sonnetnativeapiMean\endcsname{0.467}
\expandafter\newcommand\csname dclaude37sonnetnativeapiN\endcsname{30}
\expandafter\newcommand\csname dclaude37sonnettextxmlMean\endcsname{0.022}
\expandafter\newcommand\csname dclaude37sonnettextxmlN\endcsname{45}
\expandafter\newcommand\csname dclaude37sonnetpythonictextMean\endcsname{0.000}
\expandafter\newcommand\csname dclaude37sonnetpythonictextN\endcsname{40}
% claudehaiku
\newcommand{\dclaudehaikuMean}{0.668}
\newcommand{\dclaudehaikuN}{90}
\newcommand{\dclaudehaikunativeapiMean}{0.003}
\newcommand{\dclaudehaikunativeapiN}{30}
\newcommand{\dclaudehaikutextxmlMean}{1.000}
\newcommand{\dclaudehaikutextxmlN}{30}
\newcommand{\dclaudehaikupythonictextMean}{1.000}
\newcommand{\dclaudehaikupythonictextN}{30}
% claudeopus46
\expandafter\newcommand\csname dclaudeopus46Mean\endcsname{0.667}
\expandafter\newcommand\csname dclaudeopus46N\endcsname{30}
\expandafter\newcommand\csname dclaudeopus46nativeapiMean\endcsname{0.000}
\expandafter\newcommand\csname dclaudeopus46nativeapiN\endcsname{10}
\expandafter\newcommand\csname dclaudeopus46textxmlMean\endcsname{1.000}
\expandafter\newcommand\csname dclaudeopus46textxmlN\endcsname{10}
\expandafter\newcommand\csname dclaudeopus46pythonictextMean\endcsname{1.000}
\expandafter\newcommand\csname dclaudeopus46pythonictextN\endcsname{10}
% claudesonnet
\newcommand{\dclaudesonnetMean}{0.333}
\newcommand{\dclaudesonnetN}{90}
\newcommand{\dclaudesonnetnativeapiMean}{1.000}
\newcommand{\dclaudesonnetnativeapiN}{30}
\newcommand{\dclaudesonnettextxmlMean}{0.000}
% Per-model D1 values, one definition per line.
% Keys containing digits are defined via \csname because a TeX control word
% may contain letters only; read them as \csname <key>\endcsname.
%
% claudesonnet (continued)
\newcommand{\dclaudesonnettextxmlN}{30}
\newcommand{\dclaudesonnetpythonictextMean}{0.000}
\newcommand{\dclaudesonnetpythonictextN}{30}
% claudesonnet46
\expandafter\newcommand\csname dclaudesonnet46Mean\endcsname{0.300}
\expandafter\newcommand\csname dclaudesonnet46N\endcsname{30}
\expandafter\newcommand\csname dclaudesonnet46nativeapiMean\endcsname{0.000}
\expandafter\newcommand\csname dclaudesonnet46nativeapiN\endcsname{10}
\expandafter\newcommand\csname dclaudesonnet46textxmlMean\endcsname{0.900}
\expandafter\newcommand\csname dclaudesonnet46textxmlN\endcsname{10}
\expandafter\newcommand\csname dclaudesonnet46pythonictextMean\endcsname{0.000}
\expandafter\newcommand\csname dclaudesonnet46pythonictextN\endcsname{10}
% commandrplus
\newcommand{\dcommandrplusMean}{0.595}
\newcommand{\dcommandrplusN}{42}
\newcommand{\dcommandrplusnativeapiMean}{0.500}
\newcommand{\dcommandrplusnativeapiN}{20}
\newcommand{\dcommandrplustextxmlMean}{1.000}
\newcommand{\dcommandrplustextxmlN}{10}
\newcommand{\dcommandrpluspythonictextMean}{0.417}
\newcommand{\dcommandrpluspythonictextN}{12}
% deepseekr1
\expandafter\newcommand\csname ddeepseekr1Mean\endcsname{0.077}
\expandafter\newcommand\csname ddeepseekr1N\endcsname{26}
\expandafter\newcommand\csname ddeepseekr1nativeapiMean\endcsname{0.000}
\expandafter\newcommand\csname ddeepseekr1nativeapiN\endcsname{10}
\expandafter\newcommand\csname ddeepseekr1textxmlMean\endcsname{0.125}
\expandafter\newcommand\csname ddeepseekr1textxmlN\endcsname{16}
% deepseekr1distillllama
\expandafter\newcommand\csname ddeepseekr1distillllamaMean\endcsname{0.300}
\expandafter\newcommand\csname ddeepseekr1distillllamaN\endcsname{10}
\expandafter\newcommand\csname ddeepseekr1distillllamatextxmlMean\endcsname{0.300}
\expandafter\newcommand\csname ddeepseekr1distillllamatextxmlN\endcsname{10}
% deepseekr1distillqwen
\expandafter\newcommand\csname ddeepseekr1distillqwenMean\endcsname{1.000}
\expandafter\newcommand\csname ddeepseekr1distillqwenN\endcsname{16}
\expandafter\newcommand\csname ddeepseekr1distillqwentextxmlMean\endcsname{1.000}
\expandafter\newcommand\csname ddeepseekr1distillqwentextxmlN\endcsname{16}
% deepseekv3
\expandafter\newcommand\csname ddeepseekv3Mean\endcsname{0.067}
\expandafter\newcommand\csname ddeepseekv3N\endcsname{90}
\expandafter\newcommand\csname ddeepseekv3nativeapiMean\endcsname{0.044}
\expandafter\newcommand\csname ddeepseekv3nativeapiN\endcsname{45}
\expandafter\newcommand\csname ddeepseekv3textxmlMean\endcsname{0.089}
\expandafter\newcommand\csname ddeepseekv3textxmlN\endcsname{45}
% ernie45
\expandafter\newcommand\csname dernie45Mean\endcsname{0.032}
\expandafter\newcommand\csname dernie45N\endcsname{31}
\expandafter\newcommand\csname dernie45textxmlMean\endcsname{0.000}
\expandafter\newcommand\csname dernie45textxmlN\endcsname{20}
% Per-model D1 values, one definition per line.
% Keys containing digits are defined via \csname because a TeX control word
% may contain letters only; read them as \csname <key>\endcsname.
%
% ernie45 (continued)
\expandafter\newcommand\csname dernie45pythonictextMean\endcsname{0.091}
\expandafter\newcommand\csname dernie45pythonictextN\endcsname{11}
% gemini25pro
\expandafter\newcommand\csname dgemini25proMean\endcsname{0.592}
\expandafter\newcommand\csname dgemini25proN\endcsname{49}
\expandafter\newcommand\csname dgemini25pronativeapiMean\endcsname{1.000}
\expandafter\newcommand\csname dgemini25pronativeapiN\endcsname{10}
\expandafter\newcommand\csname dgemini25protextxmlMean\endcsname{0.593}
\expandafter\newcommand\csname dgemini25protextxmlN\endcsname{27}
\expandafter\newcommand\csname dgemini25propythonictextMean\endcsname{0.250}
\expandafter\newcommand\csname dgemini25propythonictextN\endcsname{12}
% geminiflash
\newcommand{\dgeminiflashMean}{0.000}
\newcommand{\dgeminiflashN}{45}
\newcommand{\dgeminiflashnativeapiMean}{0.000}
\newcommand{\dgeminiflashnativeapiN}{15}
\newcommand{\dgeminiflashtextxmlMean}{0.000}
\newcommand{\dgeminiflashtextxmlN}{15}
\newcommand{\dgeminiflashpythonictextMean}{0.000}
\newcommand{\dgeminiflashpythonictextN}{15}
% gemma327b
\expandafter\newcommand\csname dgemma327bMean\endcsname{0.400}
\expandafter\newcommand\csname dgemma327bN\endcsname{35}
\expandafter\newcommand\csname dgemma327btextxmlMean\endcsname{0.400}
\expandafter\newcommand\csname dgemma327btextxmlN\endcsname{20}
\expandafter\newcommand\csname dgemma327bpythonictextMean\endcsname{0.400}
\expandafter\newcommand\csname dgemma327bpythonictextN\endcsname{15}
% gemma27b
\expandafter\newcommand\csname dgemma27bMean\endcsname{0.967}
\expandafter\newcommand\csname dgemma27bN\endcsname{30}
\expandafter\newcommand\csname dgemma27btextxmlMean\endcsname{0.933}
\expandafter\newcommand\csname dgemma27btextxmlN\endcsname{15}
\expandafter\newcommand\csname dgemma27bpythonictextMean\endcsname{1.000}
\expandafter\newcommand\csname dgemma27bpythonictextN\endcsname{15}
% glm45
\expandafter\newcommand\csname dglm45Mean\endcsname{0.405}
\expandafter\newcommand\csname dglm45N\endcsname{89}
\expandafter\newcommand\csname dglm45nativeapiMean\endcsname{0.233}
\expandafter\newcommand\csname dglm45nativeapiN\endcsname{30}
\expandafter\newcommand\csname dglm45textxmlMean\endcsname{1.000}
\expandafter\newcommand\csname dglm45textxmlN\endcsname{29}
\expandafter\newcommand\csname dglm45pythonictextMean\endcsname{0.000}
\expandafter\newcommand\csname dglm45pythonictextN\endcsname{30}
% glm47
\expandafter\newcommand\csname dglm47Mean\endcsname{0.333}
\expandafter\newcommand\csname dglm47N\endcsname{30}
\expandafter\newcommand\csname dglm47nativeapiMean\endcsname{0.000}
\expandafter\newcommand\csname dglm47nativeapiN\endcsname{10}
\expandafter\newcommand\csname dglm47textxmlMean\endcsname{1.000}
\expandafter\newcommand\csname dglm47textxmlN\endcsname{10}
\expandafter\newcommand\csname dglm47pythonictextMean\endcsname{0.000}
\expandafter\newcommand\csname dglm47pythonictextN\endcsname{10}
% Per-model D1 values, one definition per line.
% Keys containing digits are defined via \csname because a TeX control word
% may contain letters only; read them as \csname <key>\endcsname.
%
% glm5
\expandafter\newcommand\csname dglm5Mean\endcsname{0.726}
\expandafter\newcommand\csname dglm5N\endcsname{51}
\expandafter\newcommand\csname dglm5nativeapiMean\endcsname{0.882}
\expandafter\newcommand\csname dglm5nativeapiN\endcsname{17}
\expandafter\newcommand\csname dglm5textxmlMean\endcsname{0.913}
\expandafter\newcommand\csname dglm5textxmlN\endcsname{23}
\expandafter\newcommand\csname dglm5pythonictextMean\endcsname{0.091}
\expandafter\newcommand\csname dglm5pythonictextN\endcsname{11}
% gpt41
\expandafter\newcommand\csname dgpt41Mean\endcsname{0.033}
\expandafter\newcommand\csname dgpt41N\endcsname{30}
\expandafter\newcommand\csname dgpt41nativeapiMean\endcsname{0.000}
\expandafter\newcommand\csname dgpt41nativeapiN\endcsname{10}
\expandafter\newcommand\csname dgpt41textxmlMean\endcsname{0.000}
\expandafter\newcommand\csname dgpt41textxmlN\endcsname{10}
\expandafter\newcommand\csname dgpt41pythonictextMean\endcsname{0.100}
\expandafter\newcommand\csname dgpt41pythonictextN\endcsname{10}
% gpt41mini
\expandafter\newcommand\csname dgpt41miniMean\endcsname{0.617}
\expandafter\newcommand\csname dgpt41miniN\endcsname{120}
\expandafter\newcommand\csname dgpt41mininativeapiMean\endcsname{0.000}
\expandafter\newcommand\csname dgpt41mininativeapiN\endcsname{40}
\expandafter\newcommand\csname dgpt41minitextxmlMean\endcsname{0.875}
\expandafter\newcommand\csname dgpt41minitextxmlN\endcsname{40}
\expandafter\newcommand\csname dgpt41minipythonictextMean\endcsname{0.975}
\expandafter\newcommand\csname dgpt41minipythonictextN\endcsname{40}
% gpt4o
\expandafter\newcommand\csname dgpt4oMean\endcsname{0.000}
\expandafter\newcommand\csname dgpt4oN\endcsname{30}
\expandafter\newcommand\csname dgpt4onativeapiMean\endcsname{0.000}
\expandafter\newcommand\csname dgpt4onativeapiN\endcsname{10}
\expandafter\newcommand\csname dgpt4otextxmlMean\endcsname{0.000}
\expandafter\newcommand\csname dgpt4otextxmlN\endcsname{10}
\expandafter\newcommand\csname dgpt4opythonictextMean\endcsname{0.000}
\expandafter\newcommand\csname dgpt4opythonictextN\endcsname{10}
% gpt4omini
\expandafter\newcommand\csname dgpt4ominiMean\endcsname{0.022}
\expandafter\newcommand\csname dgpt4ominiN\endcsname{45}
\expandafter\newcommand\csname dgpt4omininativeapiMean\endcsname{0.067}
\expandafter\newcommand\csname dgpt4omininativeapiN\endcsname{15}
\expandafter\newcommand\csname dgpt4ominitextxmlMean\endcsname{0.000}
\expandafter\newcommand\csname dgpt4ominitextxmlN\endcsname{15}
\expandafter\newcommand\csname dgpt4ominipythonictextMean\endcsname{0.000}
\expandafter\newcommand\csname dgpt4ominipythonictextN\endcsname{15}
% grok3
\expandafter\newcommand\csname dgrok3Mean\endcsname{0.022}
\expandafter\newcommand\csname dgrok3N\endcsname{45}
\expandafter\newcommand\csname dgrok3nativeapiMean\endcsname{0.000}
\expandafter\newcommand\csname dgrok3nativeapiN\endcsname{10}
\expandafter\newcommand\csname dgrok3textxmlMean\endcsname{0.000}
\expandafter\newcommand\csname dgrok3textxmlN\endcsname{10}
\expandafter\newcommand\csname dgrok3pythonictextMean\endcsname{0.040}
\expandafter\newcommand\csname dgrok3pythonictextN\endcsname{25}
% Per-model D1 values, one definition per line.
% Keys containing digits are defined via \csname because a TeX control word
% may contain letters only; read them as \csname <key>\endcsname.
%
% hunyuan
\newcommand{\dhunyuanMean}{0.857}
\newcommand{\dhunyuanN}{70}
\newcommand{\dhunyuannativeapiMean}{0.000}
\newcommand{\dhunyuannativeapiN}{10}
\newcommand{\dhunyuantextxmlMean}{1.000}
\newcommand{\dhunyuantextxmlN}{30}
\newcommand{\dhunyuanpythonictextMean}{1.000}
\newcommand{\dhunyuanpythonictextN}{30}
% hunyuant1
\expandafter\newcommand\csname dhunyuant1Mean\endcsname{0.700}
\expandafter\newcommand\csname dhunyuant1N\endcsname{20}
\expandafter\newcommand\csname dhunyuant1nativeapiMean\endcsname{0.700}
\expandafter\newcommand\csname dhunyuant1nativeapiN\endcsname{10}
\expandafter\newcommand\csname dhunyuant1textxmlMean\endcsname{0.700}
\expandafter\newcommand\csname dhunyuant1textxmlN\endcsname{10}
% jambalarge
\newcommand{\djambalargeMean}{0.500}
\newcommand{\djambalargeN}{60}
\newcommand{\djambalargenativeapiMean}{1.000}
\newcommand{\djambalargenativeapiN}{30}
\newcommand{\djambalargetextxmlMean}{0.000}
\newcommand{\djambalargetextxmlN}{30}
% kimik2
\expandafter\newcommand\csname dkimik2Mean\endcsname{0.000}
\expandafter\newcommand\csname dkimik2N\endcsname{30}
\expandafter\newcommand\csname dkimik2nativeapiMean\endcsname{0.000}
\expandafter\newcommand\csname dkimik2nativeapiN\endcsname{10}
\expandafter\newcommand\csname dkimik2textxmlMean\endcsname{0.000}
\expandafter\newcommand\csname dkimik2textxmlN\endcsname{10}
\expandafter\newcommand\csname dkimik2pythonictextMean\endcsname{0.000}
\expandafter\newcommand\csname dkimik2pythonictextN\endcsname{10}
% llama3370b
\expandafter\newcommand\csname dllama3370bMean\endcsname{0.179}
\expandafter\newcommand\csname dllama3370bN\endcsname{39}
\expandafter\newcommand\csname dllama3370bnativeapiMean\endcsname{0.278}
\expandafter\newcommand\csname dllama3370bnativeapiN\endcsname{18}
\expandafter\newcommand\csname dllama3370btextxmlMean\endcsname{0.182}
\expandafter\newcommand\csname dllama3370btextxmlN\endcsname{11}
\expandafter\newcommand\csname dllama3370bpythonictextMean\endcsname{0.000}
\expandafter\newcommand\csname dllama3370bpythonictextN\endcsname{10}
% llama4maverick
\expandafter\newcommand\csname dllama4maverickMean\endcsname{0.315}
\expandafter\newcommand\csname dllama4maverickN\endcsname{92}
\expandafter\newcommand\csname dllama4mavericknativeapiMean\endcsname{0.900}
\expandafter\newcommand\csname dllama4mavericknativeapiN\endcsname{30}
\expandafter\newcommand\csname dllama4mavericktextxmlMean\endcsname{0.000}
\expandafter\newcommand\csname dllama4mavericktextxmlN\endcsname{30}
\expandafter\newcommand\csname dllama4maverickpythonictextMean\endcsname{0.062}
\expandafter\newcommand\csname dllama4maverickpythonictextN\endcsname{32}
% llama4scout
\expandafter\newcommand\csname dllama4scoutMean\endcsname{0.600}
% Per-model D1 values, one definition per line.
% Keys containing digits are defined via \csname because a TeX control word
% may contain letters only; read them as \csname <key>\endcsname.
%
% llama4scout (continued)
\expandafter\newcommand\csname dllama4scoutN\endcsname{45}
\expandafter\newcommand\csname dllama4scoutnativeapiMean\endcsname{0.600}
\expandafter\newcommand\csname dllama4scoutnativeapiN\endcsname{15}
\expandafter\newcommand\csname dllama4scouttextxmlMean\endcsname{0.933}
\expandafter\newcommand\csname dllama4scouttextxmlN\endcsname{15}
\expandafter\newcommand\csname dllama4scoutpythonictextMean\endcsname{0.267}
\expandafter\newcommand\csname dllama4scoutpythonictextN\endcsname{15}
% longcatflash
\newcommand{\dlongcatflashMean}{0.000}
\newcommand{\dlongcatflashN}{10}
\newcommand{\dlongcatflashtextxmlMean}{0.000}
\newcommand{\dlongcatflashtextxmlN}{10}
% mimoflash
\newcommand{\dmimoflashMean}{0.000}
\newcommand{\dmimoflashN}{20}
\newcommand{\dmimoflashnativeapiMean}{0.000}
\newcommand{\dmimoflashnativeapiN}{10}
\newcommand{\dmimoflashtextxmlMean}{0.000}
\newcommand{\dmimoflashtextxmlN}{10}
% minimaxm25
\expandafter\newcommand\csname dminimaxm25Mean\endcsname{0.118}
\expandafter\newcommand\csname dminimaxm25N\endcsname{51}
\expandafter\newcommand\csname dminimaxm25nativeapiMean\endcsname{0.000}
\expandafter\newcommand\csname dminimaxm25nativeapiN\endcsname{10}
\expandafter\newcommand\csname dminimaxm25textxmlMean\endcsname{0.130}
\expandafter\newcommand\csname dminimaxm25textxmlN\endcsname{23}
\expandafter\newcommand\csname dminimaxm25pythonictextMean\endcsname{0.167}
\expandafter\newcommand\csname dminimaxm25pythonictextN\endcsname{18}
% mistral7b
\expandafter\newcommand\csname dmistral7bMean\endcsname{0.000}
\expandafter\newcommand\csname dmistral7bN\endcsname{20}
\expandafter\newcommand\csname dmistral7btextxmlMean\endcsname{0.000}
\expandafter\newcommand\csname dmistral7btextxmlN\endcsname{10}
\expandafter\newcommand\csname dmistral7bpythonictextMean\endcsname{0.000}
\expandafter\newcommand\csname dmistral7bpythonictextN\endcsname{10}
% mistrallarge
\newcommand{\dmistrallargeMean}{0.000}
\newcommand{\dmistrallargeN}{30}
\newcommand{\dmistrallargenativeapiMean}{0.000}
\newcommand{\dmistrallargenativeapiN}{15}
\newcommand{\dmistrallargetextxmlMean}{0.000}
\newcommand{\dmistrallargetextxmlN}{15}
% mixtral8x22b
\expandafter\newcommand\csname dmixtral8x22bMean\endcsname{0.000}
\expandafter\newcommand\csname dmixtral8x22bN\endcsname{20}
\expandafter\newcommand\csname dmixtral8x22bnativeapiMean\endcsname{0.000}
\expandafter\newcommand\csname dmixtral8x22bnativeapiN\endcsname{10}
\expandafter\newcommand\csname dmixtral8x22btextxmlMean\endcsname{0.000}
\expandafter\newcommand\csname dmixtral8x22btextxmlN\endcsname{10}
% novapro
\newcommand{\dnovaproMean}{0.061}
\newcommand{\dnovaproN}{33}
% Per-model D1 values, one definition per line.
% Keys containing digits are defined via \csname because a TeX control word
% may contain letters only; read them as \csname <key>\endcsname.
% (Written as \do3miniMean, the o3mini keys would even target the kernel
% macro \do.)
%
% novapro (continued)
\newcommand{\dnovapronativeapiMean}{0.000}
\newcommand{\dnovapronativeapiN}{10}
\newcommand{\dnovaprotextxmlMean}{0.000}
\newcommand{\dnovaprotextxmlN}{10}
\newcommand{\dnovapropythonictextMean}{0.154}
\newcommand{\dnovapropythonictextN}{13}
% o3mini
\expandafter\newcommand\csname do3miniMean\endcsname{0.650}
\expandafter\newcommand\csname do3miniN\endcsname{120}
\expandafter\newcommand\csname do3mininativeapiMean\endcsname{0.956}
\expandafter\newcommand\csname do3mininativeapiN\endcsname{45}
\expandafter\newcommand\csname do3minitextxmlMean\endcsname{0.200}
\expandafter\newcommand\csname do3minitextxmlN\endcsname{45}
\expandafter\newcommand\csname do3minipythonictextMean\endcsname{0.867}
\expandafter\newcommand\csname do3minipythonictextN\endcsname{30}
% phi4
\expandafter\newcommand\csname dphi4Mean\endcsname{0.000}
\expandafter\newcommand\csname dphi4N\endcsname{10}
\expandafter\newcommand\csname dphi4textxmlMean\endcsname{0.000}
\expandafter\newcommand\csname dphi4textxmlN\endcsname{10}
% qwen257b
\expandafter\newcommand\csname dqwen257bMean\endcsname{0.200}
\expandafter\newcommand\csname dqwen257bN\endcsname{30}
\expandafter\newcommand\csname dqwen257bnativeapiMean\endcsname{0.600}
\expandafter\newcommand\csname dqwen257bnativeapiN\endcsname{10}
\expandafter\newcommand\csname dqwen257btextxmlMean\endcsname{0.000}
\expandafter\newcommand\csname dqwen257btextxmlN\endcsname{10}
\expandafter\newcommand\csname dqwen257bpythonictextMean\endcsname{0.000}
\expandafter\newcommand\csname dqwen257bpythonictextN\endcsname{10}
% qwen35397b
\expandafter\newcommand\csname dqwen35397bMean\endcsname{0.000}
\expandafter\newcommand\csname dqwen35397bN\endcsname{30}
\expandafter\newcommand\csname dqwen35397bnativeapiMean\endcsname{0.000}
\expandafter\newcommand\csname dqwen35397bnativeapiN\endcsname{10}
\expandafter\newcommand\csname dqwen35397btextxmlMean\endcsname{0.000}
\expandafter\newcommand\csname dqwen35397btextxmlN\endcsname{10}
\expandafter\newcommand\csname dqwen35397bpythonictextMean\endcsname{0.000}
\expandafter\newcommand\csname dqwen35397bpythonictextN\endcsname{10}
% qwen3235b
\expandafter\newcommand\csname dqwen3235bMean\endcsname{0.121}
\expandafter\newcommand\csname dqwen3235bN\endcsname{33}
\expandafter\newcommand\csname dqwen3235bnativeapiMean\endcsname{0.100}
\expandafter\newcommand\csname dqwen3235bnativeapiN\endcsname{10}
\expandafter\newcommand\csname dqwen3235btextxmlMean\endcsname{0.167}
\expandafter\newcommand\csname dqwen3235btextxmlN\endcsname{12}
\expandafter\newcommand\csname dqwen3235bpythonictextMean\endcsname{0.091}
\expandafter\newcommand\csname dqwen3235bpythonictextN\endcsname{11}
% qwen72b
\expandafter\newcommand\csname dqwen72bMean\endcsname{0.000}
\expandafter\newcommand\csname dqwen72bN\endcsname{30}
\expandafter\newcommand\csname dqwen72bnativeapiMean\endcsname{0.000}
\expandafter\newcommand\csname dqwen72bnativeapiN\endcsname{15}
\expandafter\newcommand\csname dqwen72btextxmlMean\endcsname{0.000}
% Per-model D1 values, one definition per line.
% Keys containing digits are defined via \csname because a TeX control word
% may contain letters only; read them as \csname <key>\endcsname.
%
% qwen72b (continued)
\expandafter\newcommand\csname dqwen72btextxmlN\endcsname{15}
% qwq32b
\expandafter\newcommand\csname dqwq32bMean\endcsname{0.043}
\expandafter\newcommand\csname dqwq32bN\endcsname{23}
\expandafter\newcommand\csname dqwq32bnativeapiMean\endcsname{0.077}
\expandafter\newcommand\csname dqwq32bnativeapiN\endcsname{13}
\expandafter\newcommand\csname dqwq32btextxmlMean\endcsname{0.000}
\expandafter\newcommand\csname dqwq32btextxmlN\endcsname{10}
% seed16
\expandafter\newcommand\csname dseed16Mean\endcsname{0.065}
\expandafter\newcommand\csname dseed16N\endcsname{46}
\expandafter\newcommand\csname dseed16nativeapiMean\endcsname{0.125}
\expandafter\newcommand\csname dseed16nativeapiN\endcsname{16}
\expandafter\newcommand\csname dseed16textxmlMean\endcsname{0.050}
\expandafter\newcommand\csname dseed16textxmlN\endcsname{20}
\expandafter\newcommand\csname dseed16pythonictextMean\endcsname{0.000}
\expandafter\newcommand\csname dseed16pythonictextN\endcsname{10}
% seed20lite
\expandafter\newcommand\csname dseed20liteMean\endcsname{0.000}
\expandafter\newcommand\csname dseed20liteN\endcsname{20}
\expandafter\newcommand\csname dseed20litenativeapiMean\endcsname{0.000}
\expandafter\newcommand\csname dseed20litenativeapiN\endcsname{10}
\expandafter\newcommand\csname dseed20litetextxmlMean\endcsname{0.000}
\expandafter\newcommand\csname dseed20litetextxmlN\endcsname{10}
% seed20pro
\expandafter\newcommand\csname dseed20proMean\endcsname{0.033}
\expandafter\newcommand\csname dseed20proN\endcsname{30}
\expandafter\newcommand\csname dseed20pronativeapiMean\endcsname{0.000}
\expandafter\newcommand\csname dseed20pronativeapiN\endcsname{10}
\expandafter\newcommand\csname dseed20protextxmlMean\endcsname{0.000}
\expandafter\newcommand\csname dseed20protextxmlN\endcsname{10}
\expandafter\newcommand\csname dseed20propythonictextMean\endcsname{0.100}
\expandafter\newcommand\csname dseed20propythonictextN\endcsname{10}
% stepflash
\newcommand{\dstepflashMean}{1.000}
\newcommand{\dstepflashN}{20}
\newcommand{\dstepflashnativeapiMean}{1.000}
\newcommand{\dstepflashnativeapiN}{10}
\newcommand{\dstepflashtextxmlMean}{1.000}
\newcommand{\dstepflashtextxmlN}{10}