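# PPO training configuration for the FSDP backend. It composes with the base
# `ppo_trainer` config through Hydra: the searchpath below tells Hydra where to
# find `ppo_trainer`, and placing `_self_` after it in `defaults` makes the
# values in this file override the base defaults.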
hydra:
  searchpath:
    - file://verl/trainer/config

defaults:
  - ppo_trainer
  - _self_

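# Data settings. `train_files`/`val_files` are parquet files and `prompt_key`
# names the column that holds the prompt. `max_prompt_length` and
# `max_response_length` are limits in tokens; `train_batch_size` is the global
# number of prompts sampled per training iteration.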
data:
  tokenizer: null
  train_files: ~/data/rlhf/gsm8k/train.parquet
  val_files: ~/data/rlhf/gsm8k/test.parquet
  prompt_key: prompt
  max_prompt_length: 512
  max_response_length: 512
  train_batch_size: 1024
  val_batch_size: null
  return_raw_input_ids: False
  return_raw_chat: False
  return_full_prompt: False
  shuffle: True

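# Actor, reference policy and rollout run in one worker group; with
# `hybrid_engine` the FSDP training engine and the vLLM generation engine are
# colocated on the same GPUs and the actor weights are synced between them.
# Batch sizes in the actor: each rollout batch is split into
# `ppo_mini_batch_size` chunks (one optimizer step each), which are further
# split into per-GPU micro batches for gradient accumulation; with
# `use_dynamic_bsz` sequences are instead packed up to
# `ppo_max_token_len_per_gpu` tokens per forward pass.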
actor_rollout_ref:
  hybrid_engine: True
  model:
    path: ~/models/deepseek-llm-7b-chat
    external_lib: null
    override_config: { }
    enable_gradient_checkpointing: True
    use_remove_padding: False
  actor:
    strategy: fsdp
    ppo_mini_batch_size: 256
    ppo_micro_batch_size: null
    ppo_micro_batch_size_per_gpu: null
    use_dynamic_bsz: False
    ppo_max_token_len_per_gpu: 16384
    grad_clip: 1.0
    clip_ratio: 0.2
    entropy_coeff: 0.0
    use_kl_loss: False
    kl_loss_coef: 0.001
    kl_loss_type: low_var_kl
    ppo_epochs: 1
    shuffle: False
    ulysses_sequence_parallel_size: 1
    optim:
      lr: 1e-6
      lr_warmup_steps: -1
      lr_warmup_steps_ratio: 0.
      min_lr_ratio: null
      warmup_style: constant
      total_training_steps: -1
    fsdp_config:
      wrap_policy:
        min_num_params: 0
      param_offload: False
      optimizer_offload: False
      fsdp_size: -1
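  # Reference policy: a frozen copy of the initial model that only runs
  # log-prob forward passes; it is needed when a KL term is used
  # (`use_kl_loss` in the actor or `use_kl_in_reward` in the algorithm).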
  ref:
    fsdp_config:
      param_offload: False
      wrap_policy:
        min_num_params: 0
    log_prob_micro_batch_size: null
    log_prob_micro_batch_size_per_gpu: null
    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
    ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size}
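  # Rollout (generation) with vLLM. `tensor_model_parallel_size` is the TP
  # size of the inference engine; `gpu_memory_utilization` caps the fraction
  # of GPU memory vLLM may reserve (kept low because training shares the
  # GPUs); `load_format: dummy_dtensor` builds the engine with dummy weights
  # and then syncs the real actor weights into it; `free_cache_engine`
  # releases the KV cache after each generation phase. `n` is the number of
  # responses sampled per prompt.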
  rollout:
    name: vllm
    temperature: 1.0
    top_k: -1
    top_p: 1
    prompt_length: ${data.max_prompt_length}
    response_length: ${data.max_response_length}
    dtype: bfloat16
    gpu_memory_utilization: 0.5
    ignore_eos: False
    enforce_eager: True
    free_cache_engine: True
    load_format: dummy_dtensor
    tensor_model_parallel_size: 2
    max_num_batched_tokens: 8192
    max_num_seqs: 1024
    log_prob_micro_batch_size: null
    log_prob_micro_batch_size_per_gpu: null
    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
    disable_log_stats: True
    enable_chunked_prefill: True
    do_sample: True
    n: 1

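# Critic (value model), required because `adv_estimator` is gae. It uses a
# higher learning rate than the actor (1e-5 vs 1e-6) and inherits the actor's
# mini-batch, sequence-packing and epoch settings via interpolation;
# `cliprange_value` clips the value-function update.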
critic:
  strategy: fsdp
  optim:
    lr: 1e-5
    lr_warmup_steps_ratio: 0.
    min_lr_ratio: null
    warmup_style: constant
    total_training_steps: -1
  model:
    path: ~/models/deepseek-llm-7b-chat
    tokenizer_path: ${actor_rollout_ref.model.path}
    override_config: { }
    external_lib: ${actor_rollout_ref.model.external_lib}
    enable_gradient_checkpointing: True
    use_remove_padding: False
    fsdp_config:
      param_offload: False
      optimizer_offload: False
      wrap_policy:
        min_num_params: 0
      fsdp_size: -1
  ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
  ppo_micro_batch_size: null
  ppo_micro_batch_size_per_gpu: null
  forward_micro_batch_size: ${critic.ppo_micro_batch_size}
  forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
  use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
  ppo_max_token_len_per_gpu: 32768
  forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
  ulysses_sequence_parallel_size: 1
  ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
  shuffle: ${actor_rollout_ref.actor.shuffle}
  grad_clip: 1.0
  cliprange_value: 0.5

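# Reward model. With `enable: False` no model-based RM is loaded and rewards
# come from the `reward_manager`'s rule-based scoring (e.g. answer matching
# for GSM8K). `input_tokenizer` is the policy tokenizer, used to decode
# responses before re-tokenizing them for an RM with a different tokenizer.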
reward_model:
  enable: False
  strategy: fsdp
  model:
    input_tokenizer: ${actor_rollout_ref.model.path}
    path: ~/models/FsfairX-LLaMA3-RM-v0.1
    external_lib: ${actor_rollout_ref.model.external_lib}
    use_remove_padding: False
    fsdp_config:
      min_num_params: 0
      param_offload: False
      fsdp_size: -1
  micro_batch_size: null
  micro_batch_size_per_gpu: null
  max_length: null
  ulysses_sequence_parallel_size: 1
  use_dynamic_bsz: ${critic.use_dynamic_bsz}
  forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
  reward_manager: naive

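# Advantage estimation and KL control. `gamma` and `lam` are the GAE discount
# and lambda; `use_kl_in_reward` applies a per-token KL penalty to the reward,
# with `kl_penalty` selecting the estimator and `kl_ctrl` the coefficient
# schedule (`fixed` keeps `kl_coef` constant).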
algorithm:
  gamma: 1.0
  lam: 1.0
  adv_estimator: gae
  use_kl_in_reward: False
  kl_penalty: kl
  kl_ctrl:
    type: fixed
    kl_coef: 0.001

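# Training loop, logging and checkpointing. The run lasts `total_epochs` over
# the training set unless `total_training_steps` is set; `save_freq` and
# `test_freq` of -1 disable periodic checkpointing and validation;
# `critic_warmup` is the number of iterations that update only the critic
# before the actor starts training; `resume_mode: auto` resumes from the
# latest checkpoint in `default_local_dir` if one exists.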
trainer:
  total_epochs: 30
  total_training_steps: null
  project_name: verl_examples
  experiment_name: gsm8k
  logger: [ 'console', 'wandb' ]
  log_val_generations: 0
  nnodes: 1
  n_gpus_per_node: 8
  save_freq: -1
  resume_mode: auto
  resume_from_path: null
  test_freq: -1
  critic_warmup: 0
  default_hdfs_dir: null
  default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}

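# Ray initialization. `num_cpus: null` lets Ray auto-detect the available CPUs.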
ray_init:
  num_cpus: null