# ZamAI-Mistral-7B-Pashto Training Configuration

# Model Configuration
model:
  name: "mistralai/Mistral-7B-Instruct-v0.1"
  type: "causal-lm"
  use_flash_attention_2: true
  load_in_8bit: false
  load_in_4bit: true  # Use 4-bit quantization for efficiency

# Dataset Configuration
dataset:
  name: "tasal9/ZamAI_Pashto_Training"
  text_column: "text"  # or "instruction" for instruction-response format
  train_split: "train"
  validation_split: "test"
  max_seq_length: 2048

# Training Parameters
training:
  num_train_epochs: 3
  per_device_train_batch_size: 4
  per_device_eval_batch_size: 4
  gradient_accumulation_steps: 4
  gradient_checkpointing: true
  learning_rate: 2e-4
  lr_scheduler_type: "cosine"
  warmup_ratio: 0.03
  weight_decay: 0.001
  max_grad_norm: 0.3

# LoRA Configuration
lora:
  r: 16              # LoRA attention dimension
  lora_alpha: 32     # LoRA scaling parameter
  lora_dropout: 0.05 # LoRA dropout
  bias: "none"
  task_type: "CAUSAL_LM"
  target_modules:
    - "q_proj"
    - "k_proj"
    - "v_proj"
    - "o_proj"
    - "gate_proj"
    - "up_proj"
    - "down_proj"
    - "lm_head"

# Optimization
optimization:
  optim: "paged_adamw_32bit"
  fp16: false
  bf16: true  # Use bfloat16 for better stability
  tf32: true

# Logging and Saving
output:
  output_dir: "./results"
  logging_dir: "./logs"
  logging_steps: 10
  save_steps: 500
  save_total_limit: 3
  evaluation_strategy: "steps"
  eval_steps: 500
  save_strategy: "steps"
  load_best_model_at_end: true
  metric_for_best_model: "eval_loss"
  greater_is_better: false

# Hub Configuration
hub:
  push_to_hub: true
  hub_model_id: "tasal9/ZamAI-Mistral-7B-Pashto"
  hub_strategy: "every_save"
  hub_token: "YOUR_HF_TOKEN"  # Replace with actual token

# Advanced Settings
advanced:
  seed: 42
  data_seed: 42
  dataloader_num_workers: 4
  remove_unused_columns: false
  label_names: ["labels"]
  report_to: ["tensorboard", "wandb"]  # Optional: remove wandb if not using

# Inference Configuration (for testing)
inference:
  max_new_tokens: 256
  temperature: 0.7
  top_p: 0.9
  top_k: 50
  do_sample: true
  repetition_penalty: 1.1
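
# ---------------------------------------------------------------------------
# Usage sketch (comment only, not part of the config itself): a minimal example
# of how a loader script might map the sections above onto Hugging Face
# `TrainingArguments` and PEFT `LoraConfig`. The file name
# "training_config.yaml" and the variable names are illustrative assumptions,
# not names defined elsewhere in this repo. Note that some YAML loaders parse
# values like 2e-4 as strings, so the learning rate is cast to float.
#
#   import yaml
#   from transformers import TrainingArguments
#   from peft import LoraConfig
#
#   with open("training_config.yaml") as f:
#       cfg = yaml.safe_load(f)
#
#   lora_config = LoraConfig(
#       r=cfg["lora"]["r"],
#       lora_alpha=cfg["lora"]["lora_alpha"],
#       lora_dropout=cfg["lora"]["lora_dropout"],
#       bias=cfg["lora"]["bias"],
#       task_type=cfg["lora"]["task_type"],
#       target_modules=cfg["lora"]["target_modules"],
#   )
#
#   training_args = TrainingArguments(
#       output_dir=cfg["output"]["output_dir"],
#       num_train_epochs=cfg["training"]["num_train_epochs"],
#       per_device_train_batch_size=cfg["training"]["per_device_train_batch_size"],
#       per_device_eval_batch_size=cfg["training"]["per_device_eval_batch_size"],
#       gradient_accumulation_steps=cfg["training"]["gradient_accumulation_steps"],
#       gradient_checkpointing=cfg["training"]["gradient_checkpointing"],
#       learning_rate=float(cfg["training"]["learning_rate"]),
#       lr_scheduler_type=cfg["training"]["lr_scheduler_type"],
#       warmup_ratio=cfg["training"]["warmup_ratio"],
#       weight_decay=cfg["training"]["weight_decay"],
#       max_grad_norm=cfg["training"]["max_grad_norm"],
#       optim=cfg["optimization"]["optim"],
#       fp16=cfg["optimization"]["fp16"],
#       bf16=cfg["optimization"]["bf16"],
#       tf32=cfg["optimization"]["tf32"],
#       logging_steps=cfg["output"]["logging_steps"],
#       save_steps=cfg["output"]["save_steps"],
#       eval_steps=cfg["output"]["eval_steps"],
#       seed=cfg["advanced"]["seed"],
#   )
# ---------------------------------------------------------------------------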