# ZamAI-Mistral-7B-Pashto Training Configuration

# Model Configuration
model:
  name: "mistralai/Mistral-7B-Instruct-v0.1"
  type: "causal-lm"
  use_flash_attention_2: true
  load_in_8bit: false
  load_in_4bit: true  # Use 4-bit quantization for efficiency
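
# A minimal, hedged sketch (an assumption, not tooling confirmed by this repo)
# of how the model block above maps onto 4-bit loading with transformers +
# bitsandbytes; flash attention additionally requires the flash-attn package:
#
#   import torch
#   from transformers import AutoModelForCausalLM, BitsAndBytesConfig
#
#   bnb = BitsAndBytesConfig(
#       load_in_4bit=True,                      # mirrors load_in_4bit above
#       bnb_4bit_compute_dtype=torch.bfloat16,  # consistent with bf16 below
#   )
#   model = AutoModelForCausalLM.from_pretrained(
#       "mistralai/Mistral-7B-Instruct-v0.1",
#       quantization_config=bnb,
#       attn_implementation="flash_attention_2",  # mirrors use_flash_attention_2
#       device_map="auto",
#   )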

# Dataset Configuration
dataset:
  name: "tasal9/ZamAI_Pashto_Training"
  text_column: "text"  # or "instruction" for instruction-response format
  train_split: "train"
  validation_split: "test"
  max_seq_length: 2048
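
# Hedged sketch of loading the splits named above with the datasets library
# (tokenization and truncation to max_seq_length are left to the training script):
#
#   from datasets import load_dataset
#
#   ds = load_dataset("tasal9/ZamAI_Pashto_Training")
#   train_ds = ds["train"]  # train_split
#   eval_ds = ds["test"]    # validation_split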

# Training Parameters
# (these keys map onto transformers.TrainingArguments; a consolidated sketch
# follows the Advanced Settings section below)
training:
  num_train_epochs: 3
  per_device_train_batch_size: 4
  per_device_eval_batch_size: 4
  gradient_accumulation_steps: 4
  gradient_checkpointing: true
  learning_rate: 2e-4
  lr_scheduler_type: "cosine"
  warmup_ratio: 0.03
  weight_decay: 0.001
  max_grad_norm: 0.3

# LoRA Configuration
lora:
  r: 16               # LoRA attention dimension
  lora_alpha: 32      # LoRA scaling parameter
  lora_dropout: 0.05  # LoRA dropout
  bias: "none"
  task_type: "CAUSAL_LM"
  target_modules:
    - "q_proj"
    - "k_proj"
    - "v_proj"
    - "o_proj"
    - "gate_proj"
    - "up_proj"
    - "down_proj"
    - "lm_head"

# Optimization
# (also consumed via TrainingArguments; see the sketch after Advanced Settings)
optimization:
  optim: "paged_adamw_32bit"
  fp16: false
  bf16: true  # Use bfloat16 for better stability
  tf32: true

# Logging and Saving
output:
  output_dir: "./results"
  logging_dir: "./logs"
  logging_steps: 10
  save_steps: 500
  save_total_limit: 3
  evaluation_strategy: "steps"  # renamed to eval_strategy in newer transformers releases
  eval_steps: 500
  save_strategy: "steps"
  load_best_model_at_end: true
  metric_for_best_model: "eval_loss"
  greater_is_better: false

# Hub Configuration
hub:
  push_to_hub: true
  hub_model_id: "tasal9/ZamAI-Mistral-7B-Pashto"
  hub_strategy: "every_save"
  hub_token: "YOUR_HF_TOKEN"  # placeholder; never commit a real token, prefer `huggingface-cli login` or the HF_TOKEN env var

# Advanced Settings
advanced:
  seed: 42
  data_seed: 42
  dataloader_num_workers: 4
  remove_unused_columns: false
  label_names: ["labels"]
  report_to: ["tensorboard", "wandb"]  # Optional: drop "wandb" if not using Weights & Biases
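
# Hedged sketch consolidating the training, optimization, output, hub, and
# advanced blocks into transformers.TrainingArguments, as referenced above.
# A real run would also need a tokenizer and a data collator appropriate to
# the dataset; those are omitted here:
#
#   from transformers import Trainer, TrainingArguments
#
#   args = TrainingArguments(
#       output_dir="./results",
#       logging_dir="./logs",
#       num_train_epochs=3,
#       per_device_train_batch_size=4,
#       per_device_eval_batch_size=4,
#       gradient_accumulation_steps=4,
#       gradient_checkpointing=True,
#       learning_rate=2e-4,
#       lr_scheduler_type="cosine",
#       warmup_ratio=0.03,
#       weight_decay=0.001,
#       max_grad_norm=0.3,
#       optim="paged_adamw_32bit",
#       bf16=True,
#       tf32=True,
#       logging_steps=10,
#       evaluation_strategy="steps",  # eval_strategy on newer transformers
#       eval_steps=500,
#       save_strategy="steps",
#       save_steps=500,
#       save_total_limit=3,
#       load_best_model_at_end=True,
#       metric_for_best_model="eval_loss",
#       greater_is_better=False,
#       push_to_hub=True,
#       hub_model_id="tasal9/ZamAI-Mistral-7B-Pashto",
#       hub_strategy="every_save",
#       seed=42,
#       data_seed=42,
#       dataloader_num_workers=4,
#       remove_unused_columns=False,
#       label_names=["labels"],
#       report_to=["tensorboard", "wandb"],
#   )
#   trainer = Trainer(model=model, args=args,
#                     train_dataset=train_ds, eval_dataset=eval_ds)
#   trainer.train()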

# Inference Configuration (for testing)
inference:
  max_new_tokens: 256
  temperature: 0.7
  top_p: 0.9
  top_k: 50
  do_sample: true
  repetition_penalty: 1.1
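
# Hedged sketch of the inference block with model.generate; the prompt string
# is a hypothetical placeholder:
#
#   from transformers import AutoTokenizer
#
#   tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")
#   inputs = tokenizer("YOUR_PASHTO_PROMPT", return_tensors="pt").to(model.device)
#   out = model.generate(
#       **inputs,
#       max_new_tokens=256,
#       temperature=0.7,
#       top_p=0.9,
#       top_k=50,
#       do_sample=True,
#       repetition_penalty=1.1,
#   )
#   print(tokenizer.decode(out[0], skip_special_tokens=True))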