{
"training_metadata": {
"timestamp": "20251115_102317",
"training_date": "2025-11-15",
"training_time": "10:23:17",
"final_epoch": 3,
"total_steps": null,
"status": "completed",
"run_name": "GLM-4.6_lr0.0002_20251114_083347"
},
"model_config": {
"base_model": "zai-org/GLM-4.6",
"model_type": "moe_causal_lm",
"architecture": "Glm4MoeForCausalLM",
"total_parameters": 44104060416,
"trainable_parameters": 4333568,
"trainable_percentage": "0.0098%"
},
"lora_config": {
"r": 8,
"lora_alpha": 16,
"lora_dropout": 0.05,
"target_modules": [
"q_proj",
"k_proj",
"v_proj",
"o_proj"
],
"exclude_modules": [
"block_sparse_moe",
"w1",
"w2",
"w3",
"gate"
],
"bias": "none",
"use_rslora": true
},
"training_config": {
"num_epochs": 3,
"per_device_train_batch_size": 1,
"per_device_eval_batch_size": 1,
"gradient_accumulation_steps": 64,
"effective_batch_size": 512,
"learning_rate": 0.0002,
"lr_scheduler_type": "cosine",
"warmup_ratio": 0.03,
"weight_decay": 0.01,
"max_grad_norm": 1.0,
"bf16": true,
"gradient_checkpointing": true,
"optim": "adafactor",
"logging_steps": 10,
"save_steps": 50,
"eval_steps": 50
},
"dataset_info": {
"train_samples": 16450,
"eval_samples": 100,
"max_seq_length": 512,
"data_source": "hyperswitch"
},
"hardware_config": {
"num_gpus": 8,
"gpu_model": "NVIDIA H200",
"gpu_memory_per_device_gb": 141,
"distributed_strategy": "FSDP (Fully Sharded Data Parallel)",
"fsdp_sharding_strategy": "FULL_SHARD",
"flash_attention": "2.8.3"
},
"moe_config": {
"use_auxiliary_loss": true,
"auxiliary_loss_weight": 0.001,
"freeze_router": false,
"num_experts_per_token": 2,
"monitor_expert_usage": true
},
"performance_metrics": {
"final_train_loss": 0.7243273908441717,
"final_train_runtime": 91094.0431,
"final_train_samples_per_second": 0.542,
"final_train_steps_per_second": 0.001,
"final_train_perplexity": 2.0633428098649813
},
"framework_versions": {
"torch": "2.5.1+cu121",
"transformers": "4.57.1",
"peft": "0.17.1",
"accelerate": "1.11.0",
"python": "3.12.3",
"flash_attn": "2.8.3"
},
"special_features": {
"flash_attention_2": true,
"gradient_checkpointing": true,
"bf16_training": true,
"fsdp_training": true,
"attention_only_lora": true,
"frozen_experts": true,
"eval_accumulation": true
}
}