Commit e53d2e0 (verified) by qgallouedec (HF Staff)
Parent: b52ad44

Training in progress, step 500

README.md CHANGED

````diff
@@ -1,18 +1,18 @@
 ---
-base_model: Qwen/Qwen2-0.5B
-datasets: trl-lib/prm800k
+base_model: Qwen/Qwen2-0.5B-Instruct
 library_name: transformers
 model_name: Qwen2-0.5B-Reward
 tags:
 - generated_from_trainer
+- prm
 - trl
-- stepwise-reward-trainer
+- hf_jobs
 licence: license
 ---
 
 # Model Card for Qwen2-0.5B-Reward
 
-This model is a fine-tuned version of [Qwen/Qwen2-0.5B](https://huggingface.co/Qwen/Qwen2-0.5B) on the [trl-lib/prm800k](https://huggingface.co/datasets/trl-lib/prm800k) dataset.
+This model is a fine-tuned version of [Qwen/Qwen2-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct).
 It has been trained using [TRL](https://github.com/huggingface/trl).
 
 ## Quick start
@@ -28,25 +28,26 @@ print(output["generated_text"])
 
 ## Training procedure
 
-[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/huggingface/huggingface/runs/zvj1dih4)
+
 
-This model was trained with Stepwise Reward.
+
+This model was trained with PRM.
 
 ### Framework versions
 
-- TRL: 0.13.0.dev0
-- Transformers: 4.47.0.dev0
-- Pytorch: 2.5.0
-- Datasets: 3.1.0
-- Tokenizers: 0.20.3
+- TRL: 0.24.0.dev0
+- Transformers: 4.56.1
+- Pytorch: 2.8.0
+- Datasets: 4.0.0
+- Tokenizers: 0.22.0
 
 ## Citations
 
-Cite Stepwise Reward as:
+Cite PRM as:
 
 ```bibtex
 @article{uesato2022solving,
-    title = {Solving Math Word Problems With Process- and Outcome-Based Feedback},
+    title = {{Solving Math Word Problems With Process- and Outcome-Based Feedback}},
     author = {Uesato, Jonathan and Kushman, Nate and Kumar, Ramana and Song, Francis and Siegel, Noah and Wang, Lisa and Creswell, Antonia and Irving, Geoffrey and Higgins, Irina},
     year = 2022,
     journal = {arXiv preprint arXiv:2211.14275}
@@ -58,7 +59,7 @@ Cite TRL as:
 ```bibtex
 @misc{vonwerra2022trl,
     title = {{TRL: Transformer Reinforcement Learning}},
-    author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
+    author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
     year = 2020,
     journal = {GitHub repository},
     publisher = {GitHub},
````
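The updated card says the model was trained with PRM, i.e. TRL's process reward modeling trainer. For orientation only, a minimal training sketch under that assumption; the dataset and hyperparameters below are illustrative, not read from this commit:

```python
from datasets import load_dataset
from transformers import AutoModelForTokenClassification, AutoTokenizer
from trl import PRMConfig, PRMTrainer

model_id = "Qwen/Qwen2-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# A process reward model scores each reasoning step, hence the token-classification
# head (matching the Qwen2ForTokenClassification architecture in this repo's config.json).
model = AutoModelForTokenClassification.from_pretrained(model_id, num_labels=2)

# Illustrative stepwise dataset with "prompt", "completions", and "labels" columns;
# the dataset actually used for this checkpoint is not recorded in the updated card.
train_dataset = load_dataset("trl-lib/math_shepherd", split="train")

trainer = PRMTrainer(
    model=model,
    args=PRMConfig(output_dir="Qwen2-0.5B-Reward"),
    train_dataset=train_dataset,
    processing_class=tokenizer,
)
trainer.train()
```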
chat_template.jinja ADDED

```diff
@@ -0,0 +1,6 @@
+{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system
+You are a helpful assistant.<|im_end|>
+' }}{% endif %}{{'<|im_start|>' + message['role'] + '
+' + message['content'] + '<|im_end|>' + '
+'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
+' }}{% endif %}
```
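The template implements the standard ChatML format with a default system prompt. Since `transformers` picks up a standalone `chat_template.jinja` automatically, the rendering can be checked with the stock tokenizer API (the repo path below is a placeholder):

```python
from transformers import AutoTokenizer

# Placeholder path; use this repo's id or a local checkout.
tokenizer = AutoTokenizer.from_pretrained("qgallouedec/Qwen2-0.5B-Reward")

messages = [{"role": "user", "content": "What is 2 + 2?"}]
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
print(prompt)
# <|im_start|>system
# You are a helpful assistant.<|im_end|>
# <|im_start|>user
# What is 2 + 2?<|im_end|>
# <|im_start|>assistant
```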
config.json CHANGED

```diff
@@ -1,16 +1,41 @@
 {
-  "_name_or_path": "Qwen/Qwen2-0.5B",
   "architectures": [
     "Qwen2ForTokenClassification"
   ],
   "attention_dropout": 0.0,
-  "bos_token_id": 151643,
-  "eos_token_id": 151643,
+  "dtype": "float32",
+  "eos_token_id": 151645,
   "hidden_act": "silu",
   "hidden_size": 896,
   "initializer_range": 0.02,
   "intermediate_size": 4864,
-  "max_position_embeddings": 131072,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 32768,
   "max_window_layers": 24,
   "model_type": "qwen2",
   "num_attention_heads": 14,
@@ -22,8 +47,7 @@
   "rope_theta": 1000000.0,
   "sliding_window": null,
   "tie_word_embeddings": true,
-  "torch_dtype": "float32",
-  "transformers_version": "4.47.0.dev0",
+  "transformers_version": "4.56.1",
   "use_cache": false,
   "use_sliding_window": false,
   "vocab_size": 151936
```
model.safetensors CHANGED

```diff
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b613f904affd73904f07247944d0372d2b2a5d614b24cad27465fef9dc5e499f
+oid sha256:19fb7634844ba4e2157a323aeb507a10f2a9bc977f797dda04130811ba4db2da
 size 1976170816
```
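`model.safetensors` is stored through Git LFS, so the diff only swaps the pointer's object hash; the payload size is unchanged. A downloaded file can be verified against the new pointer like this:

```python
import hashlib

# Stream the ~1.98 GB file in 1 MiB chunks to avoid loading it all into memory.
h = hashlib.sha256()
with open("model.safetensors", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)

expected = "19fb7634844ba4e2157a323aeb507a10f2a9bc977f797dda04130811ba4db2da"
print(h.hexdigest() == expected)
```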
special_tokens_map.json CHANGED

```diff
@@ -4,7 +4,7 @@
     "<|im_end|>"
   ],
   "eos_token": {
-    "content": "<|endoftext|>",
+    "content": "<|im_end|>",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
```
tokenizer_config.json CHANGED

```diff
@@ -31,9 +31,8 @@
     "<|im_end|>"
   ],
   "bos_token": null,
-  "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
   "clean_up_tokenization_spaces": false,
-  "eos_token": "<|endoftext|>",
+  "eos_token": "<|im_end|>",
   "errors": "replace",
   "extra_special_tokens": {},
   "model_max_length": 32768,
```
training_args.bin CHANGED

```diff
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c29057260b37ba464a5874dd5ab5403968e08f2f16311a22fc1d82f3356a84ca
-size 5496
+oid sha256:aebad9bdc8da61ae130c99fd20ae762b915956475209baf2ed6c414c18f14e04
+size 6033
```
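`training_args.bin` is a pickled trainer config rather than a tensor file, which is why its size changed with the TRL upgrade. The exact hyperparameters behind this checkpoint can be inspected locally; note that `weights_only=False` executes pickle code, so only do this for repos you trust:

```python
import torch

# training_args.bin holds a pickled TrainingArguments subclass (e.g. a TRL config).
args = torch.load("training_args.bin", weights_only=False)
print(type(args).__name__)
print(args.learning_rate, args.num_train_epochs)
```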