---
base_model:
- mistralai/Mistral-Small-24B-Instruct-2501
---
vllm (pretrained=/root/autodl-tmp/Mistral-Small-24B-Instruct-2501,add_bos_token=true,max_model_len=2048,tensor_parallel_size=2,dtype=bfloat16), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter           | n-shot | Metric      |   | Value |   | Stderr |
|-------|--------:|------------------|-------:|-------------|---|------:|---|-------:|
| gsm8k |       3 | flexible-extract |      5 | exact_match | ↑ | 0.928 | ± | 0.0164 |
|       |         | strict-match     |      5 | exact_match | ↑ | 0.916 | ± | 0.0176 |

vllm (pretrained=/root/autodl-tmp/Mistral-Small-24B-Instruct-2501,add_bos_token=true,max_model_len=2048,tensor_parallel_size=2,dtype=bfloat16), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter           | n-shot | Metric      |   | Value |   | Stderr |
|-------|--------:|------------------|-------:|-------------|---|------:|---|-------:|
| gsm8k |       3 | flexible-extract |      5 | exact_match | ↑ | 0.910 | ± | 0.0128 |
|       |         | strict-match     |      5 | exact_match | ↑ | 0.904 | ± | 0.0132 |

vllm (pretrained=/root/autodl-tmp/Mistral-Small-24B-Instruct-2501,add_bos_token=true,max_model_len=700,tensor_parallel_size=2,dtype=bfloat16), gen_kwargs: (None), limit: 15.0, num_fewshot: None, batch_size: 1

| Groups            | Version | Filter | n-shot | Metric |   | Value  |   | Stderr |
|-------------------|--------:|--------|--------|--------|---|-------:|---|-------:|
| mmlu              |       2 | none   |        | acc    | ↑ | 0.7977 | ± | 0.0130 |
| - humanities      |       2 | none   |        | acc    | ↑ | 0.8205 | ± | 0.0267 |
| - other           |       2 | none   |        | acc    | ↑ | 0.8154 | ± | 0.0266 |
| - social sciences |       2 | none   |        | acc    | ↑ | 0.8667 | ± | 0.0247 |
| - stem            |       2 | none   |        | acc    | ↑ | 0.7263 | ± | 0.0249 |

vllm (pretrained=/root/autodl-tmp/86-512,add_bos_token=true,max_model_len=2048,tensor_parallel_size=2,dtype=bfloat16), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter           | n-shot | Metric      |   | Value |   | Stderr |
|-------|--------:|------------------|-------:|-------------|---|------:|---|-------:|
| gsm8k |       3 | flexible-extract |      5 | exact_match | ↑ | 0.936 | ± | 0.0155 |
|       |         | strict-match     |      5 | exact_match | ↑ | 0.928 | ± | 0.0164 |

vllm (pretrained=/root/autodl-tmp/86-512,add_bos_token=true,max_model_len=2048,tensor_parallel_size=2,dtype=bfloat16), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter           | n-shot | Metric      |   | Value |   | Stderr |
|-------|--------:|------------------|-------:|-------------|---|------:|---|-------:|
| gsm8k |       3 | flexible-extract |      5 | exact_match | ↑ | 0.92  | ± | 0.0121 |
|       |         | strict-match     |      5 | exact_match | ↑ | 0.91  | ± | 0.0128 |

vllm (pretrained=/root/autodl-tmp/86-512,add_bos_token=true,max_model_len=700,tensor_parallel_size=2,dtype=bfloat16), gen_kwargs: (None), limit: 15.0, num_fewshot: None, batch_size: 1

| Groups            | Version | Filter | n-shot | Metric |   | Value  |   | Stderr |
|-------------------|--------:|--------|--------|--------|---|-------:|---|-------:|
| mmlu              |       2 | none   |        | acc    | ↑ | 0.7942 | ± | 0.0131 |
| - humanities      |       2 | none   |        | acc    | ↑ | 0.8308 | ± | 0.0255 |
| - other           |       2 | none   |        | acc    | ↑ | 0.8000 | ± | 0.0275 |
| - social sciences |       2 | none   |        | acc    | ↑ | 0.8556 | ± | 0.0253 |
| - stem            |       2 | none   |        | acc    | ↑ | 0.7263 | ± | 0.0252 |

vllm (pretrained=/root/autodl-tmp/876-512,add_bos_token=true,max_model_len=2048,tensor_parallel_size=2,dtype=bfloat16), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter           | n-shot | Metric      |   | Value |   | Stderr |
|-------|--------:|------------------|-------:|-------------|---|------:|---|-------:|
| gsm8k |       3 | flexible-extract |      5 | exact_match | ↑ | 0.904 | ± | 0.0187 |
|       |         | strict-match     |      5 | exact_match | ↑ | 0.904 | ± | 0.0187 |

vllm (pretrained=/root/autodl-tmp/876-512,add_bos_token=true,max_model_len=2048,tensor_parallel_size=2,dtype=bfloat16), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter           | n-shot | Metric      |   | Value |   | Stderr |
|-------|--------:|------------------|-------:|-------------|---|------:|---|-------:|
| gsm8k |       3 | flexible-extract |      5 | exact_match | ↑ | 0.904 | ± | 0.0132 |
|       |         | strict-match     |      5 | exact_match | ↑ | 0.900 | ± | 0.0134 |

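
vllm (pretrained=/root/autodl-tmp/876-512,add_bos_token=true,max_model_len=700,tensor_parallel_size=2,dtype=bfloat16), gen_kwargs: (None), limit: 15.0, num_fewshot: None, batch_size: 1

| Groups            | Version | Filter | n-shot | Metric |   | Value  |   | Stderr |
|-------------------|--------:|--------|--------|--------|---|-------:|---|-------:|
| mmlu              |       2 | none   |        | acc    | ↑ | 0.7942 | ± | 0.0130 |
| - humanities      |       2 | none   |        | acc    | ↑ | 0.8256 | ± | 0.0259 |
| - other           |       2 | none   |        | acc    | ↑ | 0.8051 | ± | 0.0266 |
| - social sciences |       2 | none   |        | acc    | ↑ | 0.8556 | ± | 0.0255 |
| - stem            |       2 | none   |        | acc    | ↑ | 0.7263 | ± | 0.0248 |
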
vllm (pretrained=/root/autodl-tmp/883-2048,add_bos_token=true,max_model_len=2048,tensor_parallel_size=2,dtype=bfloat16), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter           | n-shot | Metric      |   | Value |   | Stderr |
|-------|--------:|------------------|-------:|-------------|---|------:|---|-------:|
| gsm8k |       3 | flexible-extract |      5 | exact_match | ↑ | 0.904 | ± | 0.0187 |
|       |         | strict-match     |      5 | exact_match | ↑ | 0.904 | ± | 0.0187 |

vllm (pretrained=/root/autodl-tmp/883-2048,add_bos_token=true,max_model_len=2048,tensor_parallel_size=2,dtype=bfloat16), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter           | n-shot | Metric      |   | Value |   | Stderr |
|-------|--------:|------------------|-------:|-------------|---|------:|---|-------:|
| gsm8k |       3 | flexible-extract |      5 | exact_match | ↑ | 0.900 | ± | 0.0134 |
|       |         | strict-match     |      5 | exact_match | ↑ | 0.898 | ± | 0.0135 |

vllm (pretrained=/root/autodl-tmp/883-2048,add_bos_token=true,max_model_len=700,tensor_parallel_size=2,dtype=bfloat16), gen_kwargs: (None), limit: 15.0, num_fewshot: None, batch_size: 1

| Groups            | Version | Filter | n-shot | Metric |   | Value  |   | Stderr |
|-------------------|--------:|--------|--------|--------|---|-------:|---|-------:|
| mmlu              |       2 | none   |        | acc    | ↑ | 0.7930 | ± | 0.0131 |
| - humanities      |       2 | none   |        | acc    | ↑ | 0.8256 | ± | 0.0259 |
| - other           |       2 | none   |        | acc    | ↑ | 0.7897 | ± | 0.0279 |
| - social sciences |       2 | none   |        | acc    | ↑ | 0.8667 | ± | 0.0246 |
| - stem            |       2 | none   |        | acc    | ↑ | 0.7263 | ± | 0.0249 |

vllm (pretrained=/root/autodl-tmp/89-1024,add_bos_token=true,max_model_len=2048,tensor_parallel_size=2,dtype=bfloat16), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter           | n-shot | Metric      |   | Value |   | Stderr |
|-------|--------:|------------------|-------:|-------------|---|------:|---|-------:|
| gsm8k |       3 | flexible-extract |      5 | exact_match | ↑ | 0.908 | ± | 0.0183 |
|       |         | strict-match     |      5 | exact_match | ↑ | 0.904 | ± | 0.0187 |

vllm (pretrained=/root/autodl-tmp/89-1024,add_bos_token=true,max_model_len=2048,tensor_parallel_size=2,dtype=bfloat16), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter           | n-shot | Metric      |   | Value |   | Stderr |
|-------|--------:|------------------|-------:|-------------|---|------:|---|-------:|
| gsm8k |       3 | flexible-extract |      5 | exact_match | ↑ | 0.906 | ± | 0.0131 |
|       |         | strict-match     |      5 | exact_match | ↑ | 0.900 | ± | 0.0134 |

vllm (pretrained=/root/autodl-tmp/89-1024,add_bos_token=true,max_model_len=700,tensor_parallel_size=2,dtype=bfloat16), gen_kwargs: (None), limit: 15.0, num_fewshot: None, batch_size: 1

| Groups            | Version | Filter | n-shot | Metric |   | Value  |   | Stderr |
|-------------------|--------:|--------|--------|--------|---|-------:|---|-------:|
| mmlu              |       2 | none   |        | acc    | ↑ | 0.7906 | ± | 0.0131 |
| - humanities      |       2 | none   |        | acc    | ↑ | 0.8205 | ± | 0.0261 |
| - other           |       2 | none   |        | acc    | ↑ | 0.7949 | ± | 0.0275 |
| - social sciences |       2 | none   |        | acc    | ↑ | 0.8667 | ± | 0.0246 |
| - stem            |       2 | none   |        | acc    | ↑ | 0.7193 | ± | 0.0249 |
