IvanHU's picture
Upload folder using huggingface_hub
001abb2 verified
# Launch the 340M GLA-hybrid training runs sequentially via train.sh.
#
# The runs differ only in the layout tag (12-1, 3-1, 24-1, 12-2), which picks
# both the model config file (gla_<tag with '_'>_340M.json) and the dump
# folder (340M-120B-GLA-hybrid-<tag>-8K). Every other flag is shared, so it is
# declared once below instead of being copy-pasted per run; this keeps the
# runs comparable and avoids per-line typos such as a misspelled split name.
#
# No `set -e` on purpose: as with the original one-command-per-line form, a
# failing run does not prevent the following runs from starting.

# Root directory that holds the flame checkout, tokenizer and datasets.
base_dir=/volume/pt-train/users/txie/hyw

# Flags shared by every run, in the order they are passed to train.sh.
common_args=(
  --model.tokenizer_path "${base_dir}/tokenizer/gated_delta_net_6_1"

  # Optimizer and learning-rate schedule.
  --optimizer.name AdamW
  --optimizer.eps 1e-8
  --optimizer.lr 3e-4
  --lr_scheduler.warmup_steps 100
  --lr_scheduler.lr_min 0.01
  --lr_scheduler.decay_type cosine

  # Batch shape and training length.
  --training.batch_size 16
  --training.seq_len 8192
  --training.context_len 8192
  --training.gradient_accumulation_steps 1
  --training.steps 11444
  --training.max_norm 1.0
  --training.skip_nan_inf

  # Data mixture: the comma-separated lists below are positional, i.e. the
  # i-th entry of data_probs / dataset_split / dataset_name belongs to the
  # i-th dataset path, so each list must have exactly two entries.
  --training.dataset "${base_dir}/dataset/fineweb-edu-sample,${base_dir}/dataset/small_repos_20B_sample_merged"
  --training.data_probs 0.7,0.3
  --training.dataset_split train,train
  --training.dataset_name default,default
  --training.streaming
  --training.num_workers 32
  --training.prefetch_factor 2
  --training.seed 42
  --training.compile

  # Checkpointing and metrics.
  --checkpoint.interval 8192
  --checkpoint.load_step 0
  --checkpoint.keep_latest_k 100
  --metrics.log_freq 1
  --metrics.enable_tensorboard
)

for layout in 12-1 3-1 24-1 12-2; do
  # Dump folders spell the tag with '-', config filenames with '_'.
  bash train.sh \
    --job.config_file flame/models/fla.toml \
    --job.dump_folder "${base_dir}/flame/exp/hybrid/340M-120B-GLA-hybrid-${layout}-8K" \
    --model.config "${base_dir}/flame/configs/gla_${layout//-/_}_340M.json" \
    "${common_args[@]}"
done