# ZamAI-Mistral-7B-Pashto Training Configuration

# Model Configuration
model:
  name: "mistralai/Mistral-7B-Instruct-v0.1"
  type: "causal-lm"
  use_flash_attention_2: true
  load_in_8bit: false
  load_in_4bit: true  # Use 4-bit quantization for efficiency

# Dataset Configuration
dataset:
  name: "tasal9/ZamAI_Pashto_Training"
  text_column: "text"  # or "instruction" for instruction-response format
  train_split: "train"
  validation_split: "test"
  max_seq_length: 2048

# Training Parameters
training:
  num_train_epochs: 3
  per_device_train_batch_size: 4
  per_device_eval_batch_size: 4
  gradient_accumulation_steps: 4
  gradient_checkpointing: true
  learning_rate: 2e-4
  lr_scheduler_type: "cosine"
  warmup_ratio: 0.03
  weight_decay: 0.001
  max_grad_norm: 0.3

# LoRA Configuration
lora:
  r: 16              # LoRA attention dimension
  lora_alpha: 32     # LoRA scaling parameter
  lora_dropout: 0.05 # LoRA dropout
  bias: "none"
  task_type: "CAUSAL_LM"
  target_modules:
    - "q_proj"
    - "k_proj"
    - "v_proj"
    - "o_proj"
    - "gate_proj"
    - "up_proj"
    - "down_proj"
    - "lm_head"

# Optimization
optimization:
  optim: "paged_adamw_32bit"
  fp16: false
  bf16: true  # Use bfloat16 for better stability
  tf32: true

# Logging and Saving
output:
  output_dir: "./results"
  logging_dir: "./logs"
  logging_steps: 10
  save_steps: 500
  save_total_limit: 3
  evaluation_strategy: "steps"
  eval_steps: 500
  save_strategy: "steps"
  load_best_model_at_end: true
  metric_for_best_model: "eval_loss"
  greater_is_better: false

# Hub Configuration
hub:
  push_to_hub: true
  hub_model_id: "tasal9/ZamAI-Mistral-7B-Pashto"
  hub_strategy: "every_save"
  hub_token: "YOUR_HF_TOKEN"  # Replace with actual token

# Advanced Settings
advanced:
  seed: 42
  data_seed: 42
  dataloader_num_workers: 4
  remove_unused_columns: false
  label_names: ["labels"]
  report_to: ["tensorboard", "wandb"]  # Optional: remove wandb if not using

# Inference Configuration (for testing)
inference:
  max_new_tokens: 256
  temperature: 0.7
  top_p: 0.9
  top_k: 50
  do_sample: true
  repetition_penalty: 1.1
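
# ---------------------------------------------------------------------------
# Usage sketch (comment only, not part of the config itself): a minimal example
# of how a loader script might map the sections above onto Hugging Face
# `TrainingArguments` and PEFT `LoraConfig`. The file name
# "training_config.yaml" and the variable names are illustrative assumptions,
# not names defined elsewhere in this repo. Note that some YAML loaders parse
# values like 2e-4 as strings, so the learning rate is cast to float.
#
#   import yaml
#   from transformers import TrainingArguments
#   from peft import LoraConfig
#
#   with open("training_config.yaml") as f:
#       cfg = yaml.safe_load(f)
#
#   lora_config = LoraConfig(
#       r=cfg["lora"]["r"],
#       lora_alpha=cfg["lora"]["lora_alpha"],
#       lora_dropout=cfg["lora"]["lora_dropout"],
#       bias=cfg["lora"]["bias"],
#       task_type=cfg["lora"]["task_type"],
#       target_modules=cfg["lora"]["target_modules"],
#   )
#
#   training_args = TrainingArguments(
#       output_dir=cfg["output"]["output_dir"],
#       num_train_epochs=cfg["training"]["num_train_epochs"],
#       per_device_train_batch_size=cfg["training"]["per_device_train_batch_size"],
#       per_device_eval_batch_size=cfg["training"]["per_device_eval_batch_size"],
#       gradient_accumulation_steps=cfg["training"]["gradient_accumulation_steps"],
#       gradient_checkpointing=cfg["training"]["gradient_checkpointing"],
#       learning_rate=float(cfg["training"]["learning_rate"]),
#       lr_scheduler_type=cfg["training"]["lr_scheduler_type"],
#       warmup_ratio=cfg["training"]["warmup_ratio"],
#       weight_decay=cfg["training"]["weight_decay"],
#       max_grad_norm=cfg["training"]["max_grad_norm"],
#       optim=cfg["optimization"]["optim"],
#       fp16=cfg["optimization"]["fp16"],
#       bf16=cfg["optimization"]["bf16"],
#       tf32=cfg["optimization"]["tf32"],
#       logging_steps=cfg["output"]["logging_steps"],
#       save_steps=cfg["output"]["save_steps"],
#       eval_steps=cfg["output"]["eval_steps"],
#       seed=cfg["advanced"]["seed"],
#   )
# ---------------------------------------------------------------------------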