Create README.md

70cf80c verified 10 months ago

8.34 kB

metadata

base_model:
  - mistralai/Mistral-Small-24B-Instruct-2501

vllm (pretrained=/root/autodl-tmp/Mistral-Small-24B-Instruct-2501,add_bos_token=true,max_model_len=2048,tensor_parallel_size=2,dtype=bfloat16), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

Tasks	Version	Filter	n-shot	Metric		Value		Stderr
gsm8k	3	flexible-extract	5	exact_match	↑	0.928	±	0.0164
		strict-match	5	exact_match	↑	0.916	±	0.0176

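vllm (pretrained=/root/autodl-tmp/Mistral-Small-24B-Instruct-2501,add_bos_token=true,max_model_len=2048,tensor_parallel_size=2,dtype=bfloat16), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

Tasks	Version	Filter	n-shot	Metric		Value		Stderr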
gsm8k	3	flexible-extract	5	exact_match	↑	0.910	±	0.0128
		strict-match	5	exact_match	↑	0.904	±	0.0132

vllm (pretrained=/root/autodl-tmp/Mistral-Small-24B-Instruct-2501,add_bos_token=true,max_model_len=700,tensor_parallel_size=2,dtype=bfloat16), gen_kwargs: (None), limit: 15.0, num_fewshot: None, batch_size: 1

Groups	Version	Filter	Metric		Value		Stderr
mmlu	2	none	acc	↑	0.7977	±	0.0130
- humanities	2	none	acc	↑	0.8205	±	0.0267
- other	2	none	acc	↑	0.8154	±	0.0266
- social sciences	2	none	acc	↑	0.8667	±	0.0247
- stem	2	none	acc	↑	0.7263	±	0.0249

vllm (pretrained=/root/autodl-tmp/86-512,add_bos_token=true,max_model_len=2048,tensor_parallel_size=2,dtype=bfloat16), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

Tasks	Version	Filter	n-shot	Metric		Value		Stderr
gsm8k	3	flexible-extract	5	exact_match	↑	0.936	±	0.0155
		strict-match	5	exact_match	↑	0.928	±	0.0164

vllm (pretrained=/root/autodl-tmp/86-512,add_bos_token=true,max_model_len=2048,tensor_parallel_size=2,dtype=bfloat16), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

Tasks	Version	Filter	n-shot	Metric		Value		Stderr
gsm8k	3	flexible-extract	5	exact_match	↑	0.920	±	0.0121
		strict-match	5	exact_match	↑	0.910	±	0.0128

vllm (pretrained=/root/autodl-tmp/86-512,add_bos_token=true,max_model_len=700,tensor_parallel_size=2,dtype=bfloat16), gen_kwargs: (None), limit: 15.0, num_fewshot: None, batch_size: 1

Groups	Version	Filter	Metric		Value		Stderr
mmlu	2	none	acc	↑	0.7942	±	0.0131
- humanities	2	none	acc	↑	0.8308	±	0.0255
- other	2	none	acc	↑	0.8000	±	0.0275
- social sciences	2	none	acc	↑	0.8556	±	0.0253
- stem	2	none	acc	↑	0.7263	±	0.0252

vllm (pretrained=/root/autodl-tmp/876-512,add_bos_token=true,max_model_len=2048,tensor_parallel_size=2,dtype=bfloat16), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

Tasks	Version	Filter	n-shot	Metric		Value		Stderr
gsm8k	3	flexible-extract	5	exact_match	↑	0.904	±	0.0187
		strict-match	5	exact_match	↑	0.904	±	0.0187

vllm (pretrained=/root/autodl-tmp/876-512,add_bos_token=true,max_model_len=2048,tensor_parallel_size=2,dtype=bfloat16), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

Tasks	Version	Filter	n-shot	Metric		Value		Stderr
gsm8k	3	flexible-extract	5	exact_match	↑	0.904	±	0.0132
		strict-match	5	exact_match	↑	0.900	±	0.0134

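vllm (pretrained=/root/autodl-tmp/876-512,add_bos_token=true,max_model_len=700,tensor_parallel_size=2,dtype=bfloat16), gen_kwargs: (None), limit: 15.0, num_fewshot: None, batch_size: 1

Groups	Version	Filter	Metric		Value		Stderr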
mmlu	2	none	acc	↑	0.7942	±	0.0130
- humanities	2	none	acc	↑	0.8256	±	0.0259
- other	2	none	acc	↑	0.8051	±	0.0266
- social sciences	2	none	acc	↑	0.8556	±	0.0255
- stem	2	none	acc	↑	0.7263	±	0.0248

vllm (pretrained=/root/autodl-tmp/883-2048,add_bos_token=true,max_model_len=2048,tensor_parallel_size=2,dtype=bfloat16), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

Tasks	Version	Filter	n-shot	Metric		Value		Stderr
gsm8k	3	flexible-extract	5	exact_match	↑	0.904	±	0.0187
		strict-match	5	exact_match	↑	0.904	±	0.0187

vllm (pretrained=/root/autodl-tmp/883-2048,add_bos_token=true,max_model_len=2048,tensor_parallel_size=2,dtype=bfloat16), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

Tasks	Version	Filter	n-shot	Metric		Value		Stderr
gsm8k	3	flexible-extract	5	exact_match	↑	0.900	±	0.0134
		strict-match	5	exact_match	↑	0.898	±	0.0135

vllm (pretrained=/root/autodl-tmp/883-2048,add_bos_token=true,max_model_len=700,tensor_parallel_size=2,dtype=bfloat16), gen_kwargs: (None), limit: 15.0, num_fewshot: None, batch_size: 1

Groups	Version	Filter	Metric		Value		Stderr
mmlu	2	none	acc	↑	0.7930	±	0.0131
- humanities	2	none	acc	↑	0.8256	±	0.0259
- other	2	none	acc	↑	0.7897	±	0.0279
- social sciences	2	none	acc	↑	0.8667	±	0.0246
- stem	2	none	acc	↑	0.7263	±	0.0249

vllm (pretrained=/root/autodl-tmp/89-1024,add_bos_token=true,max_model_len=2048,tensor_parallel_size=2,dtype=bfloat16), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

Tasks	Version	Filter	n-shot	Metric		Value		Stderr
gsm8k	3	flexible-extract	5	exact_match	↑	0.908	±	0.0183
		strict-match	5	exact_match	↑	0.904	±	0.0187

vllm (pretrained=/root/autodl-tmp/89-1024,add_bos_token=true,max_model_len=2048,tensor_parallel_size=2,dtype=bfloat16), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

Tasks	Version	Filter	n-shot	Metric		Value		Stderr
gsm8k	3	flexible-extract	5	exact_match	↑	0.906	±	0.0131
		strict-match	5	exact_match	↑	0.900	±	0.0134

vllm (pretrained=/root/autodl-tmp/89-1024,add_bos_token=true,max_model_len=700,tensor_parallel_size=2,dtype=bfloat16), gen_kwargs: (None), limit: 15.0, num_fewshot: None, batch_size: 1

Groups	Version	Filter	Metric		Value		Stderr
mmlu	2	none	acc	↑	0.7906	±	0.0131
- humanities	2	none	acc	↑	0.8205	±	0.0261
- other	2	none	acc	↑	0.7949	±	0.0275
- social sciences	2	none	acc	↑	0.8667	±	0.0246
- stem	2	none	acc	↑	0.7193	±	0.0249