#!/usr/bin/env bash
#
# Launches a training run through flame's train.sh.
#
# Steps:
#   1. change into the flame checkout,
#   2. activate the Python virtual environment,
#   3. invoke train.sh with the job/model/optimizer/training/checkpoint flags
#      listed below.
#
# Usage: bash <this script>

# Abort on the first failing command so that training is never started from
# the wrong directory or without the virtual environment.
# `-u` is intentionally left off: the sourced activate script is not under
# our control and may read variables that are unset.
set -eo pipefail

# All absolute paths used by this run hang off one root.
root=/volume/pt-train/users/txie/hyw
flame_dir="${root}/flame"
data_dir="${root}/dataset"

cd "${flame_dir}"

# shellcheck source=/dev/null
source "${root}/.venv/bin/activate"

# Arguments are kept in an array so they can be grouped and commented without
# fragile backslash line continuations.
train_args=(
  # --- job ---------------------------------------------------------------
  --job.config_file flame/models/fla.toml # relative to ${flame_dir}
  --job.dump_folder "${flame_dir}/exp/hybrid/340M-100B-GDN-hybrid-6-1-8K"

  # --- model -------------------------------------------------------------
  --model.config "${flame_dir}/configs/gdn_6_1_340M.json"
  --model.tokenizer_path "${root}/tokenizer/gated_delta_net_6_1"

  # --- optimizer / LR schedule -------------------------------------------
  --optimizer.name AdamW
  --optimizer.eps 1e-8
  --optimizer.lr 3e-4
  --lr_scheduler.warmup_steps 100
  # NOTE(review): lr_min is 1 and decay_ratio is 0; values are kept as-is.
  # Confirm how train.sh interprets them (ratio vs. absolute value).
  --lr_scheduler.lr_min 1
  --lr_scheduler.decay_type cosine
  --lr_scheduler.decay_ratio 0

  # --- training ----------------------------------------------------------
  --training.batch_size 16
  --training.seq_len 8192
  --training.context_len 8192
  --training.gradient_accumulation_steps 1
  --training.steps 95366
  --training.max_norm 1.0
  --training.skip_nan_inf
  --training.seed 42
  --training.compile

  # --- data --------------------------------------------------------------
  # Three comma-separated datasets; data_probs, dataset_split and
  # dataset_name each carry one entry per dataset, in the same order.
  --training.dataset "${data_dir}/fineweb-edu-sample,${data_dir}/small_repos_20B_sample_merged,${data_dir}/MegaMath/megamath-web-pro"
  --training.data_probs 0.55,0.3,0.15
  --training.dataset_split train,train,train
  --training.dataset_name default,default,default
  --training.streaming # passed once; a second copy of this flag was redundant
  --training.num_workers 32
  --training.prefetch_factor 2

  # --- checkpointing -----------------------------------------------------
  --checkpoint.interval 8192
  --checkpoint.load_step -1
  --checkpoint.keep_latest_k 100

  # --- metrics -----------------------------------------------------------
  --metrics.log_freq 1
  --metrics.enable_tensorboard
)

bash train.sh "${train_args[@]}"