1 |
grok-4-0709 |
9 |
1.64 |
0.9319 |
▶ |
Task-by-Task Performance for grok-4-0709
|
2 |
claude-opus-4-1-20250805 |
8 |
1.91 |
0.9228 |
▶ |
Task-by-Task Performance for claude-opus-4-1-20250805
|
3 |
gpt-5-2025-08-07 |
8 |
2.00 |
0.9239 |
▶ |
Task-by-Task Performance for gpt-5-2025-08-07
|
4 |
claude-opus-4-20250514 |
7 |
2.27 |
0.9249 |
▶ |
Task-by-Task Performance for claude-opus-4-20250514
|
5 |
o3 |
7 |
2.82 |
0.9192 |
▶ |
Task-by-Task Performance for o3
|
6 |
grok-3-mini |
7 |
3.36 |
0.8901 |
▶ |
Task-by-Task Performance for grok-3-mini
|
7 |
o4-mini |
4 |
5.18 |
0.9059 |
▶ |
Task-by-Task Performance for o4-mini
|
8 |
claude-sonnet-4-20250514 |
4 |
6.36 |
0.8952 |
▶ |
Task-by-Task Performance for claude-sonnet-4-20250514
|
9 |
o3-mini |
6 |
7.00 |
0.8566 |
▶ |
Task-by-Task Performance for o3-mini
|
10 |
openai/gpt-oss-120b |
3 |
8.45 |
0.8884 |
▶ |
Task-by-Task Performance for openai/gpt-oss-120b
|
11 |
claude-3-5-haiku-20241022 |
2 |
9.00 |
0.8803 |
▶ |
Task-by-Task Performance for claude-3-5-haiku-20241022
|
12 |
deepseek-ai/DeepSeek-V3 |
2 |
10.55 |
0.8384 |
▶ |
Task-by-Task Performance for deepseek-ai/DeepSeek-V3
|
13 |
meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo |
1 |
10.64 |
0.8457 |
▶ |
Task-by-Task Performance for meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo
|
14 |
deepseek-ai/DeepSeek-R1 |
2 |
13.27 |
0.6825 |
▶ |
Task-by-Task Performance for deepseek-ai/DeepSeek-R1
|
15 |
gpt-4o-mini |
0 |
14.09 |
0.7972 |
▶ |
Task-by-Task Performance for gpt-4o-mini
|
16 |
meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo |
0 |
14.18 |
0.7707 |
▶ |
Task-by-Task Performance for meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo
|
17 |
google/gemma-2-27b-it |
0 |
14.64 |
0.7673 |
▶ |
Task-by-Task Performance for google/gemma-2-27b-it
|
18 |
claude-3-haiku-20240307 |
0 |
16.55 |
0.6789 |
▶ |
Task-by-Task Performance for claude-3-haiku-20240307
|
19 |
gpt-4.1-nano |
0 |
17.82 |
0.5911 |
▶ |
Task-by-Task Performance for gpt-4.1-nano
|