Update README.md
README.md (CHANGED)
   <td><strong>Recovery</strong>
   </td>
  </tr>
  <tr>
   <td>MMLU (5-shot)
   </td>
   <td>62.98
   </td>
   <td>62.95
   </td>
   <td>100.0%
   </td>
  </tr>
  <tr>
   <td>MMLU-cot (0-shot)
   </td>
   <td>65.40
   </td>
   <td>65.23
   </td>
   <td>99.7%
   </td>
  </tr>
  <tr>
   <td>ARC Challenge (0-shot)
   </td>
   <td>77.13
   </td>
   <td>76.71
   </td>
   <td>99.4%
   </td>
  </tr>
  <tr>
   <td>GSM-8K-cot (8-shot, strict-match)
   </td>
   <td>77.94
   </td>
   <td>76.72
   </td>
   <td>98.4%
   </td>
  </tr>
  <tr>
   <td>Winogrande (5-shot)
   </td>
   <td>71.11
   </td>
   <td>71.11
   </td>
   <td>100.0%
   </td>
  </tr>
  <tr>
   <td>Hellaswag (10-shot)
   </td>
   <td>73.62
   </td>
   <td>73.54
   </td>
   <td>99.9%
   </td>
  </tr>
  <tr>
   <td>TruthfulQA (0-shot, mc2)
   </td>
   <td>51.47
   </td>
   <td>51.06
   </td>
   <td>99.2%
   </td>
  </tr>
  <tr>
   <td><strong>Average</strong>
   </td>
   <td><strong>68.52</strong>
   </td>
   <td><strong>68.19</strong>
   </td>
   <td><strong>99.5%</strong>
   </td>
  </tr>
</table>
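For reference, the Recovery column is the FP8 model's score expressed as a percentage of the unquantized baseline's score, and the Average row is the unweighted mean over the seven benchmarks. A minimal sketch of that arithmetic, with the scores copied from the table above (rounding of the final digit may differ slightly from the table):

```
# Sketch of how the Recovery and Average figures above are derived.
# Pairs are (baseline score, FP8-dynamic score) per benchmark.
scores = {
    "MMLU (5-shot)":            (62.98, 62.95),
    "MMLU-cot (0-shot)":        (65.40, 65.23),
    "ARC Challenge (0-shot)":   (77.13, 76.71),
    "GSM-8K-cot (8-shot)":      (77.94, 76.72),
    "Winogrande (5-shot)":      (71.11, 71.11),
    "Hellaswag (10-shot)":      (73.62, 73.54),
    "TruthfulQA (0-shot, mc2)": (51.47, 51.06),
}

for name, (baseline, fp8) in scores.items():
    # Recovery = quantized score / baseline score, as a percentage.
    print(f"{name}: recovery = {100 * fp8 / baseline:.1f}%")

base_avg = sum(b for b, _ in scores.values()) / len(scores)
fp8_avg = sum(q for _, q in scores.values()) / len(scores)
print(f"Average: {base_avg:.2f} vs {fp8_avg:.2f} "
      f"(recovery = {100 * fp8_avg / base_avg:.1f}%)")  # ~68.52 vs 68.19, 99.5%
```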
The results were obtained using the following commands:
#### MMLU
```
lm_eval \
  --model vllm \
  --model_args pretrained="neuralmagic/Llama-3.2-3B-Instruct-FP8-dynamic",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
  --tasks mmlu_llama_3.1_instruct \
  --fewshot_as_multiturn \
  --apply_chat_template \
  --num_fewshot 5 \
  --batch_size auto
```
#### MMLU-CoT
```
lm_eval \
  --model vllm \
  --model_args pretrained="neuralmagic/Llama-3.2-3B-Instruct-FP8-dynamic",dtype=auto,max_model_len=4064,max_gen_toks=1024,tensor_parallel_size=1 \
  --tasks mmlu_cot_0shot_llama_3.1_instruct \
  --apply_chat_template \
  --num_fewshot 0 \
  --batch_size auto
```
#### ARC-Challenge
```
lm_eval \
  --model vllm \
  --model_args pretrained="neuralmagic/Llama-3.2-3B-Instruct-FP8-dynamic",dtype=auto,max_model_len=3940,max_gen_toks=100,tensor_parallel_size=1 \
  --tasks arc_challenge_llama_3.1_instruct \
  --apply_chat_template \
  --num_fewshot 0 \
  --batch_size auto
```
#### GSM-8K
```
lm_eval \
  --model vllm \
  --model_args pretrained="neuralmagic/Llama-3.2-3B-Instruct-FP8-dynamic",dtype=auto,max_model_len=4096,max_gen_toks=1024,tensor_parallel_size=1 \
  --tasks gsm8k_cot_llama_3.1_instruct \
  --fewshot_as_multiturn \
  --apply_chat_template \
  --num_fewshot 8 \
  --batch_size auto
```
#### Hellaswag
```
lm_eval \
  --model vllm \
  --model_args pretrained="neuralmagic/Llama-3.2-3B-Instruct-FP8-dynamic",dtype=auto,add_bos_token=True,max_model_len=4096,tensor_parallel_size=1 \
  --tasks hellaswag \
  --num_fewshot 10 \
  --batch_size auto
```
#### Winogrande
```
lm_eval \
  --model vllm \
  --model_args pretrained="neuralmagic/Llama-3.2-3B-Instruct-FP8-dynamic",dtype=auto,add_bos_token=True,max_model_len=4096,tensor_parallel_size=1 \
  --tasks winogrande \
  --num_fewshot 5 \
  --batch_size auto
```
#### TruthfulQA
```
lm_eval \
  --model vllm \
  --model_args pretrained="neuralmagic/Llama-3.2-3B-Instruct-FP8-dynamic",dtype=auto,add_bos_token=True,max_model_len=4096,tensor_parallel_size=1 \
  --tasks truthfulqa \
  --num_fewshot 0 \
  --batch_size auto
```
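The same evaluations can also be driven from Python rather than the CLI. Below is a minimal sketch using the harness's `simple_evaluate` entry point, shown for the MMLU-CoT setup above; the keyword names assume a recent lm-evaluation-harness v0.4.x with vLLM installed:

```
import lm_eval

# Mirrors the MMLU-CoT CLI command above; assumes lm-eval v0.4.x,
# where simple_evaluate accepts these keyword arguments.
results = lm_eval.simple_evaluate(
    model="vllm",
    model_args=(
        "pretrained=neuralmagic/Llama-3.2-3B-Instruct-FP8-dynamic,"
        "dtype=auto,max_model_len=4064,max_gen_toks=1024,tensor_parallel_size=1"
    ),
    tasks=["mmlu_cot_0shot_llama_3.1_instruct"],
    num_fewshot=0,
    apply_chat_template=True,
    batch_size="auto",
)
print(results["results"])  # per-task metrics
```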
