{
  "training_metadata": {
    "timestamp": "20251115_102317",
    "training_date": "2025-11-15",
    "training_time": "10:23:17",
    "final_epoch": 3,
    "total_steps": null,
    "status": "completed",
    "run_name": "GLM-4.6_lr0.0002_20251114_083347"
  },
  "model_config": {
    "base_model": "zai-org/GLM-4.6",
    "model_type": "moe_causal_lm",
    "architecture": "Glm4MoeForCausalLM",
    "total_parameters": 44104060416,
    "trainable_parameters": 4333568,
    "trainable_percentage": "0.0098%"
  },
  "lora_config": {
    "r": 8,
    "lora_alpha": 16,
    "lora_dropout": 0.05,
    "target_modules": [
      "q_proj",
      "k_proj",
      "v_proj",
      "o_proj"
    ],
    "exclude_modules": [
      "block_sparse_moe",
      "w1",
      "w2",
      "w3",
      "gate"
    ],
    "bias": "none",
    "use_rslora": true
  },
  "training_config": {
    "num_epochs": 3,
    "per_device_train_batch_size": 1,
    "per_device_eval_batch_size": 1,
    "gradient_accumulation_steps": 64,
    "effective_batch_size": 512,
    "learning_rate": 0.0002,
    "lr_scheduler_type": "cosine",
    "warmup_ratio": 0.03,
    "weight_decay": 0.01,
    "max_grad_norm": 1.0,
    "bf16": true,
    "gradient_checkpointing": true,
    "optim": "adafactor",
    "logging_steps": 10,
    "save_steps": 50,
    "eval_steps": 50
  },
  "dataset_info": {
    "train_samples": 16450,
    "eval_samples": 100,
    "max_seq_length": 512,
    "data_source": "hyperswitch"
  },
  "hardware_config": {
    "num_gpus": 8,
    "gpu_model": "NVIDIA H200",
    "gpu_memory_per_device_gb": 141,
    "distributed_strategy": "FSDP (Fully Sharded Data Parallel)",
    "fsdp_sharding_strategy": "FULL_SHARD",
    "flash_attention": "2.8.3"
  },
  "moe_config": {
    "use_auxiliary_loss": true,
    "auxiliary_loss_weight": 0.001,
    "freeze_router": false,
    "num_experts_per_token": 2,
    "monitor_expert_usage": true
  },
  "performance_metrics": {
    "final_train_loss": 0.7243273908441717,
    "final_train_runtime": 91094.0431,
    "final_train_samples_per_second": 0.542,
    "final_train_steps_per_second": 0.001,
    "final_train_perplexity": 2.0633428098649813
  },
  "framework_versions": {
    "torch": "2.5.1+cu121",
    "transformers": "4.57.1",
    "peft": "0.17.1",
    "accelerate": "1.11.0",
    "python": "3.12.3",
    "flash_attn": "2.8.3"
  },
  "special_features": {
    "flash_attention_2": true,
    "gradient_checkpointing": true,
    "bf16_training": true,
    "fsdp_training": true,
    "attention_only_lora": true,
    "frozen_experts": true,
    "eval_accumulation": true
  }
}
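
A minimal Python sketch of how this metadata might be consumed: it rebuilds an equivalent peft LoraConfig from "lora_config" and re-derives the reported effective batch size and perplexity. The file name "training_metadata.json" and the use of peft's LoraConfig are assumptions for illustration, not code recorded with the run; "exclude_modules" requires peft >= 0.14, which the pinned 0.17.1 above satisfies.

import json
import math

from peft import LoraConfig  # peft 0.17.1 per framework_versions above

# Load the metadata; the filename is an assumption, not stored in the JSON.
with open("training_metadata.json") as f:
    meta = json.load(f)

lora = meta["lora_config"]

# Rebuild an equivalent attention-only LoRA config. Both `exclude_modules`
# and `use_rslora` are supported by the pinned peft version.
peft_config = LoraConfig(
    r=lora["r"],
    lora_alpha=lora["lora_alpha"],
    lora_dropout=lora["lora_dropout"],
    target_modules=lora["target_modules"],
    exclude_modules=lora["exclude_modules"],
    bias=lora["bias"],
    use_rslora=lora["use_rslora"],
    task_type="CAUSAL_LM",
)

train = meta["training_config"]
hw = meta["hardware_config"]

# effective_batch_size = per-device batch * grad accumulation * GPU count
assert (
    train["per_device_train_batch_size"]
    * train["gradient_accumulation_steps"]
    * hw["num_gpus"]
    == train["effective_batch_size"]  # 1 * 64 * 8 = 512
)

# Perplexity is exp(final cross-entropy loss): exp(0.7243...) ~= 2.0633
perf = meta["performance_metrics"]
assert math.isclose(
    math.exp(perf["final_train_loss"]),
    perf["final_train_perplexity"],
    rel_tol=1e-9,
)

Both assertions pass against the values above, which is a quick sanity check that the derived fields in this file were computed from the primary ones rather than entered by hand.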