# ZamAI-Mistral-7B-Pashto Training Configuration

# Model Configuration
model:
  name: "mistralai/Mistral-7B-Instruct-v0.1"
  type: "causal-lm"
  use_flash_attention_2: true
  load_in_8bit: false
  load_in_4bit: true  # Use 4-bit quantization for efficiency
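
# A minimal sketch of the bitsandbytes settings that load_in_4bit usually
# implies. Key names follow transformers' BitsAndBytesConfig; whether this
# particular training script reads them from the config is an assumption:
# model:
#   bnb_4bit_quant_type: "nf4"
#   bnb_4bit_compute_dtype: "bfloat16"
#   bnb_4bit_use_double_quant: true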

# Dataset Configuration
dataset:
  name: "tasal9/ZamAI_Pashto_Training"
  text_column: "text"  # or "instruction" for instruction-response format
  train_split: "train"
  validation_split: "test"
  max_seq_length: 2048
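
# If the instruction-response format is used instead (see text_column above),
# the mapping might look like this sketch; response_column is a hypothetical
# key, since the original config only defines text_column:
# dataset:
#   text_column: "instruction"
#   response_column: "response"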

# Training Parameters
training:
  num_train_epochs: 3
  per_device_train_batch_size: 4
  per_device_eval_batch_size: 4
  gradient_accumulation_steps: 4
  gradient_checkpointing: true
  learning_rate: 2e-4
  lr_scheduler_type: "cosine"
  warmup_ratio: 0.03
  weight_decay: 0.001
  max_grad_norm: 0.3
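
# Worked out from the values above, the effective batch size per optimizer
# step is per_device_train_batch_size (4) x gradient_accumulation_steps (4)
# = 16 per device, multiplied by the number of GPUs in multi-GPU training.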

# LoRA Configuration
lora:
  r: 16              # LoRA attention dimension
  lora_alpha: 32     # LoRA scaling parameter
  lora_dropout: 0.05 # LoRA dropout
  bias: "none"
  task_type: "CAUSAL_LM"
  target_modules:
    - "q_proj"
    - "k_proj"
    - "v_proj"
    - "o_proj"
    - "gate_proj"
    - "up_proj"
    - "down_proj"
    - "lm_head"

# Optimization
optimization:
  optim: "paged_adamw_32bit"
  fp16: false
  bf16: true  # Use bfloat16 for better stability
  tf32: true
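
# bf16 and tf32 both require NVIDIA Ampere (e.g. A100) or newer GPUs; on
# older hardware, set bf16 and tf32 to false and fp16 to true instead.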

# Logging and Saving
output:
  output_dir: "./results"
  logging_dir: "./logs"
  logging_steps: 10
  save_steps: 500
  save_total_limit: 3
  evaluation_strategy: "steps"
  eval_steps: 500
  save_strategy: "steps"
  load_best_model_at_end: true
  metric_for_best_model: "eval_loss"
  greater_is_better: false
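
# With load_best_model_at_end, transformers requires save_strategy to match
# evaluation_strategy and save_steps to be a round multiple of eval_steps;
# both are 500 here, so checkpoints line up with evaluations.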

# Hub Configuration
hub:
  push_to_hub: true
  hub_model_id: "tasal9/ZamAI-Mistral-7B-Pashto"
  hub_strategy: "every_save"
  hub_token: "YOUR_HF_TOKEN"  # Replace with actual token
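
# Avoid committing a real token: the Hugging Face libraries also read the
# HF_TOKEN environment variable or the cached credentials from
# `huggingface-cli login`, so hub_token can usually be left unset.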

# Advanced Settings
advanced:
  seed: 42
  data_seed: 42
  dataloader_num_workers: 4
  remove_unused_columns: false
  label_names: ["labels"]
  report_to: ["tensorboard", "wandb"]  # Optional: remove wandb if not using
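
# remove_unused_columns: false keeps every dataset column visible to the data
# collator, which matters for custom collators that read extra fields; with
# the default collator the pruning is harmless either way.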

# Inference Configuration (for testing)
inference:
  max_new_tokens: 256
  temperature: 0.7
  top_p: 0.9
  top_k: 50
  do_sample: true
  repetition_penalty: 1.1
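
# For deterministic test outputs, a common alternative is greedy decoding:
# set do_sample: false, in which case temperature, top_p, and top_k are
# ignored by transformers' generate().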