{
  "training_metadata": {
    "timestamp": "20251115_102317",
    "training_date": "2025-11-15",
    "training_time": "10:23:17",
    "final_epoch": 3,
    "total_steps": null,
    "status": "completed",
    "run_name": "GLM-4.6_lr0.0002_20251114_083347"
  },
  "model_config": {
    "base_model": "zai-org/GLM-4.6",
    "model_type": "moe_causal_lm",
    "architecture": "Glm4MoeForCausalLM",
    "total_parameters": 44104060416,
    "trainable_parameters": 4333568,
    "trainable_percentage": "0.0098%"
  },
  "lora_config": {
    "r": 8,
    "lora_alpha": 16,
    "lora_dropout": 0.05,
    "target_modules": [
      "q_proj",
      "k_proj",
      "v_proj",
      "o_proj"
    ],
    "exclude_modules": [
      "block_sparse_moe",
      "w1",
      "w2",
      "w3",
      "gate"
    ],
    "bias": "none",
    "use_rslora": true
  },
  "training_config": {
    "num_epochs": 3,
    "per_device_train_batch_size": 1,
    "per_device_eval_batch_size": 1,
    "gradient_accumulation_steps": 64,
    "effective_batch_size": 512,
    "learning_rate": 0.0002,
    "lr_scheduler_type": "cosine",
    "warmup_ratio": 0.03,
    "weight_decay": 0.01,
    "max_grad_norm": 1.0,
    "bf16": true,
    "gradient_checkpointing": true,
    "optim": "adafactor",
    "logging_steps": 10,
    "save_steps": 50,
    "eval_steps": 50
  },
  "dataset_info": {
    "train_samples": 16450,
    "eval_samples": 100,
    "max_seq_length": 512,
    "data_source": "hyperswitch"
  },
  "hardware_config": {
    "num_gpus": 8,
    "gpu_model": "NVIDIA H200",
    "gpu_memory_per_device_gb": 141,
    "distributed_strategy": "FSDP (Fully Sharded Data Parallel)",
    "fsdp_sharding_strategy": "FULL_SHARD",
    "flash_attention": "2.8.3"
  },
  "moe_config": {
    "use_auxiliary_loss": true,
    "auxiliary_loss_weight": 0.001,
    "freeze_router": false,
    "num_experts_per_token": 2,
    "monitor_expert_usage": true
  },
  "performance_metrics": {
    "final_train_loss": 0.7243273908441717,
    "final_train_runtime": 91094.0431,
    "final_train_samples_per_second": 0.542,
    "final_train_steps_per_second": 0.001,
    "final_train_perplexity": 2.0633428098649813
  },
  "framework_versions": {
    "torch": "2.5.1+cu121",
    "transformers": "4.57.1",
    "peft": "0.17.1",
    "accelerate": "1.11.0",
    "python": "3.12.3",
    "flash_attn": "2.8.3"
  },
  "special_features": {
    "flash_attention_2": true,
    "gradient_checkpointing": true,
    "bf16_training": true,
    "fsdp_training": true,
    "attention_only_lora": true,
    "frozen_experts": true,
    "eval_accumulation": true
  }
}
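
A minimal sketch of how the derived fields above fit together, assuming the metadata is saved to a file such as `training_metadata.json` (the filename is an assumption, not part of the record): the effective batch size is per-device batch × gradient-accumulation steps × GPU count (1 × 64 × 8 = 512), and the reported perplexity is simply exp(final_train_loss).

```python
import json
import math

# Assumed filename; point this at wherever the run writes its metadata.
with open("training_metadata.json") as f:
    meta = json.load(f)

train_cfg = meta["training_config"]
hw_cfg = meta["hardware_config"]
perf = meta["performance_metrics"]

# Effective batch size = per-device batch * grad-accumulation steps * num GPUs
# (1 * 64 * 8 = 512, matching the recorded value).
effective_batch = (
    train_cfg["per_device_train_batch_size"]
    * train_cfg["gradient_accumulation_steps"]
    * hw_cfg["num_gpus"]
)
assert effective_batch == train_cfg["effective_batch_size"]

# Perplexity is exp(cross-entropy loss): exp(0.7243) ~= 2.0633.
ppl = math.exp(perf["final_train_loss"])
assert math.isclose(ppl, perf["final_train_perplexity"], rel_tol=1e-6)

print(f"effective batch size: {effective_batch}, train perplexity: {ppl:.4f}")
```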