# ZamAI-Mistral-7B-Pashto Training Configuration

# Model Configuration
model:
  name: "mistralai/Mistral-7B-Instruct-v0.1"
  type: "causal-lm"
  use_flash_attention_2: true
  load_in_8bit: false
  load_in_4bit: true  # Use 4-bit quantization for efficiency
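
# A minimal sketch of the bitsandbytes settings that load_in_4bit usually
# implies. Key names follow transformers' BitsAndBytesConfig; whether this
# particular training script reads them from the config is an assumption:
# model:
#   bnb_4bit_quant_type: "nf4"
#   bnb_4bit_compute_dtype: "bfloat16"
#   bnb_4bit_use_double_quant: true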

# Dataset Configuration
dataset:
  name: "tasal9/ZamAI_Pashto_Training"
  text_column: "text"  # or "instruction" for instruction-response format
  train_split: "train"
  validation_split: "test"
  max_seq_length: 2048
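
# If the instruction-response format is used instead (see text_column above),
# the mapping might look like this sketch; response_column is a hypothetical
# key, since the original config only defines text_column:
# dataset:
#   text_column: "instruction"
#   response_column: "response"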

# Training Parameters
training:
  num_train_epochs: 3
  per_device_train_batch_size: 4
  per_device_eval_batch_size: 4
  gradient_accumulation_steps: 4
  gradient_checkpointing: true
  learning_rate: 2e-4
  lr_scheduler_type: "cosine"
  warmup_ratio: 0.03
  weight_decay: 0.001
  max_grad_norm: 0.3
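
# Worked out from the values above, the effective batch size per optimizer
# step is per_device_train_batch_size (4) x gradient_accumulation_steps (4)
# = 16 per device, multiplied by the number of GPUs in multi-GPU training.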

# LoRA Configuration
lora:
  r: 16              # LoRA attention dimension
  lora_alpha: 32     # LoRA scaling parameter
  lora_dropout: 0.05 # LoRA dropout
  bias: "none"
  task_type: "CAUSAL_LM"
  target_modules:
    - "q_proj"
    - "k_proj"
    - "v_proj"
    - "o_proj"
    - "gate_proj"
    - "up_proj"
    - "down_proj"
    - "lm_head"

# Optimization
optimization:
  optim: "paged_adamw_32bit"
  fp16: false
  bf16: true  # Use bfloat16 for better stability
  tf32: true
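
# bf16 and tf32 both require NVIDIA Ampere (e.g. A100) or newer GPUs; on
# older hardware, set bf16 and tf32 to false and fp16 to true instead.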

# Logging and Saving
output:
  output_dir: "./results"
  logging_dir: "./logs"
  logging_steps: 10
  save_steps: 500
  save_total_limit: 3
  evaluation_strategy: "steps"
  eval_steps: 500
  save_strategy: "steps"
  load_best_model_at_end: true
  metric_for_best_model: "eval_loss"
  greater_is_better: false
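
# With load_best_model_at_end, transformers requires save_strategy to match
# evaluation_strategy and save_steps to be a round multiple of eval_steps;
# both are 500 here, so checkpoints line up with evaluations.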

# Hub Configuration
hub:
  push_to_hub: true
  hub_model_id: "tasal9/ZamAI-Mistral-7B-Pashto"
  hub_strategy: "every_save"
  hub_token: "YOUR_HF_TOKEN"  # Replace with actual token
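
# Avoid committing a real token: the Hugging Face libraries also read the
# HF_TOKEN environment variable or the cached credentials from
# `huggingface-cli login`, so hub_token can usually be left unset.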

# Advanced Settings
advanced:
  seed: 42
  data_seed: 42
  dataloader_num_workers: 4
  remove_unused_columns: false
  label_names: ["labels"]
  report_to: ["tensorboard", "wandb"]  # Optional: remove wandb if not using
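
# remove_unused_columns: false keeps every dataset column visible to the data
# collator, which matters for custom collators that read extra fields; with
# the default collator the pruning is harmless either way.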

# Inference Configuration (for testing)
inference:
  max_new_tokens: 256
  temperature: 0.7
  top_p: 0.9
  top_k: 50
  do_sample: true
  repetition_penalty: 1.1
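
# For deterministic test outputs, a common alternative is greedy decoding:
# set do_sample: false, in which case temperature, top_p, and top_k are
# ignored by transformers' generate().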