Adds harmony format support, configurable GPT-OSS parameters, launch.sh logic, improved templates for legml GPT-OSS training, a dynamic results directory, and improved model pushing.

- config/train_gpt_oss_custom.py +388 -0
- config/train_gpt_oss_openhermes_fr.py +174 -0
- config/train_gpt_oss_openhermes_fr_memory_optimized.py +233 -0
- docs/output.svg +1 -0
- launch.sh +328 -11
- scripts/model_tonic/push_gpt_oss_to_huggingface.py +79 -5
- scripts/model_tonic/push_to_huggingface.py +83 -5
- scripts/training/train_gpt_oss.py +313 -24
- templates/spaces/demo_gpt/README.md +1 -1
    	
        config/train_gpt_oss_custom.py
    ADDED
    
@@ -0,0 +1,388 @@
"""
GPT-OSS Custom Training Configuration
Based on OpenAI's GPT-OSS fine-tuning tutorial
Fully customizable configuration for any dataset format

Supports specialized datasets like:
- legmlai/openhermes-fr (French instruction dataset)
- HuggingFaceH4/Multilingual-Thinking
- Custom prompt/completion formats
"""
import os
from dataclasses import dataclass
from typing import Optional, Dict, List, Union

@dataclass
class GPTOSSEnhancedCustomConfig:
    """Enhanced custom configuration for GPT-OSS fine-tuning with maximum flexibility"""

    # ========================================================================
    # CORE MODEL CONFIGURATION
    # ========================================================================
    trainer_type: str = "sft"  # "sft" or "dpo"
    model_name: str = "openai/gpt-oss-20b"
    max_seq_length: int = 2048  # Customizable: 512, 1024, 2048, 4096, 8192
    use_flash_attention: bool = True
    use_gradient_checkpointing: bool = True

    # ========================================================================
    # TRAINING HYPERPARAMETERS - FULLY CUSTOMIZABLE
    # ========================================================================
    # Batch Configuration
    batch_size: int = 4  # Per-device batch size (1-32 depending on GPU memory)
    gradient_accumulation_steps: int = 4  # Effective batch = batch_size * accumulation * num_gpus
    eval_batch_size: Optional[int] = None  # If None, uses batch_size

    # Learning Rate Configuration
    learning_rate: float = 2e-4  # Main learning rate (1e-5 to 5e-4 typical range)
    min_lr: float = 2e-5  # Minimum learning rate for scheduler
    warmup_ratio: float = 0.03  # Fraction of steps for warmup (0.01-0.1)
    warmup_steps: Optional[int] = None  # If set, overrides warmup_ratio

    # Training Duration
    num_train_epochs: float = 1.0  # Number of epochs (0.5, 1.0, 2.0, 3.0)
    max_steps: Optional[int] = None  # If set, overrides num_train_epochs
    max_iters: Optional[int] = None  # Legacy compatibility

    # Regularization
    weight_decay: float = 0.01  # L2 regularization (0.0-0.1)
    max_grad_norm: float = 1.0  # Gradient clipping (0.5-2.0)

    # ========================================================================
    # OPTIMIZER CONFIGURATION
    # ========================================================================
    optimizer: str = "adamw_torch"  # "adamw_torch", "adamw_hf", "sgd"
    beta1: float = 0.9  # Adam beta1 parameter
    beta2: float = 0.95  # Adam beta2 parameter (0.95-0.999)
    eps: float = 1e-8  # Adam epsilon

    # ========================================================================
    # SCHEDULER CONFIGURATION
    # ========================================================================
    scheduler: str = "cosine_with_min_lr"  # "linear", "cosine", "cosine_with_min_lr", "constant"
    lr_scheduler_kwargs: Optional[Dict] = None

    # ========================================================================
    # MIXED PRECISION & DISTRIBUTED TRAINING
    # ========================================================================
    fp16: bool = False  # Use FP16 (not recommended for GPT-OSS)
    bf16: bool = True  # Use BF16 (recommended for GPT-OSS)
    tf32: Optional[bool] = None  # Use TF32 on A100/H100
    ddp_backend: str = "nccl"
    ddp_find_unused_parameters: bool = False

    # ========================================================================
    # LOGGING, EVALUATION & CHECKPOINTING
    # ========================================================================
    # Logging
    logging_steps: int = 10  # Log every N steps
    log_level: str = "info"  # "debug", "info", "warning", "error"

    # Evaluation
    eval_strategy: str = "steps"  # "no", "steps", "epoch"
    eval_steps: int = 100  # Evaluate every N steps
    eval_delay: float = 0  # Delay evaluation for N steps/epochs
    eval_accumulation_steps: Optional[int] = None  # Accumulate eval outputs

    # Checkpointing
    save_strategy: str = "steps"  # "no", "steps", "epoch"
    save_steps: int = 500  # Save checkpoint every N steps
    save_total_limit: Optional[int] = 3  # Keep only N best checkpoints
    save_only_model: bool = False  # Save only model weights

    # Model Selection
    metric_for_best_model: str = "eval_loss"
    greater_is_better: bool = False
    load_best_model_at_end: bool = True

    # ========================================================================
    # DATASET CONFIGURATION - ENHANCED FOR CUSTOM FORMATS
    # ========================================================================
    # Dataset Source
    dataset_name: str = "legmlai/openhermes-fr"  # Default to French OpenHermes
    dataset_split: str = "train"  # Dataset split to use
    dataset_config: Optional[str] = None  # Dataset configuration name

    # Field Mapping - Customize for your dataset format
    input_field: str = "prompt"  # Field containing the input/prompt
    target_field: str = "accepted_completion"  # Field containing the target/completion

    # OpenHermes-FR specific fields
    filter_bad_entries: bool = True  # Filter entries marked as bad
    bad_entry_field: str = "bad_entry"  # Field indicating bad entries
    bad_prompt_field: str = "bad_prompt_detected"  # Field for bad prompts
    bad_response_field: str = "bad_response_detected"  # Field for bad responses

    # Data Processing Options
    concatenate_fields: bool = True  # Combine input and target fields for training
    field_separator: str = "\n\n### Response:\n"  # Separator between input and target
    add_eos_token: bool = True  # Add EOS token at the end

    # Dataset Filtering & Sampling
    max_samples: Optional[int] = None  # Limit dataset size (e.g., 100000 for testing)
    min_length: int = 10  # Minimum sequence length
    max_length: Optional[int] = None  # Maximum sequence length (None = use max_seq_length)

    # Custom Dataset Formats Support
    dataset_format: str = "openhermes_fr"  # "openhermes_fr", "messages", "text", "custom"

    # GPT-OSS Harmony Format Configuration
    use_harmony_format: bool = True  # Enable GPT-OSS harmony format
    use_chat_template: bool = False  # Set to True for messages format
    chat_template_kwargs: Optional[Dict] = None

    # ========================================================================
    # TRACKIO MONITORING CONFIGURATION
    # ========================================================================
    enable_tracking: bool = True
    trackio_url: Optional[str] = None
    trackio_token: Optional[str] = None
    log_artifacts: bool = True
    log_metrics: bool = True
    log_config: bool = True
    experiment_name: Optional[str] = None

    # ========================================================================
    # HUGGING FACE INTEGRATION
    # ========================================================================
    hf_token: Optional[str] = None
    dataset_repo: Optional[str] = None
    push_to_hub: bool = False  # Push model to HF Hub after training
    hub_model_id: Optional[str] = None  # HF Hub model ID
    hub_private_repo: bool = False  # Make HF repo private

    # ========================================================================
    # GPT-OSS SPECIFIC CONFIGURATIONS
    # ========================================================================
    # LoRA Configuration
    use_lora: bool = True
    lora_config: Optional[Dict] = None

    # Quantization Configuration
    use_quantization: bool = True
    quantization_config: Optional[Dict] = None

    # Model Loading Configuration
    model_kwargs: Optional[Dict] = None

    # Generation Configuration (for evaluation/testing)
    generation_config: Optional[Dict] = None

    # ========================================================================
    # MULTILINGUAL & DOMAIN SPECIFIC SETTINGS
    # ========================================================================
    # Language Support (for multilingual datasets)
    primary_language: str = "fr"  # Primary language code
    reasoning_languages: Optional[List[str]] = None  # Supported languages for reasoning

    # Domain-specific settings
    domain_focus: Optional[str] = None  # "reasoning", "conversation", "instruction", "general"

    # ========================================================================
    # PERFORMANCE & MEMORY OPTIMIZATION
    # ========================================================================
    # Data Loading
    dataloader_num_workers: int = 4  # Number of data loading workers
    dataloader_pin_memory: bool = True  # Pin memory for faster GPU transfer
    dataloader_prefetch_factor: int = 2  # Prefetch factor for data loading

    # Memory Management
    max_memory_per_gpu: Optional[str] = None  # e.g., "80GB", "40GB"
    low_cpu_mem_usage: bool = True  # Use low CPU memory loading

    # Performance Optimizations
    group_by_length: bool = True  # Group sequences by length
    length_column_name: str = "length"  # Column name for sequence lengths
    remove_unused_columns: bool = True  # Remove unused dataset columns

    def __post_init__(self):
        """Initialize default values and validate configuration"""

        # ====================================================================
        # LORA CONFIGURATION DEFAULTS
        # ====================================================================
        if self.lora_config is None:
            self.lora_config = {
                "r": 16,  # Rank (4, 8, 16, 32, 64) - higher = more parameters
                "lora_alpha": 32,  # Scaling factor (usually 2*r)
                "target_modules": "all-linear",  # Apply LoRA to all linear layers
                "target_parameters": [
                    "7.mlp.experts.gate_up_proj",
                    "7.mlp.experts.down_proj",
                    "15.mlp.experts.gate_up_proj",
                    "15.mlp.experts.down_proj",
                    "23.mlp.experts.gate_up_proj",
                    "23.mlp.experts.down_proj",
                ],
                "bias": "none",  # "none", "all", "lora_only"
                "task_type": "CAUSAL_LM",
                "lora_dropout": 0.05,  # LoRA dropout rate
            }

        # ====================================================================
        # QUANTIZATION CONFIGURATION DEFAULTS
        # ====================================================================
        if self.quantization_config is None:
            self.quantization_config = {
                "dequantize": True,  # Use Mxfp4Config as per GPT-OSS tutorial
                "load_in_4bit": False,  # Set to True for extreme memory optimization
                "bnb_4bit_compute_dtype": "bfloat16",  # For 4-bit quantization
                "bnb_4bit_use_double_quant": True,  # Double quantization
                "bnb_4bit_quant_type": "nf4",  # Quantization type
            }

        # ====================================================================
        # MODEL LOADING CONFIGURATION DEFAULTS
        # ====================================================================
        if self.model_kwargs is None:
            self.model_kwargs = {
                "attn_implementation": "eager",  # "eager", "flash_attention_2"
                "torch_dtype": "auto",  # "auto", "bfloat16", "float16"
                "use_cache": False,  # Disable KV cache for training
                "device_map": "auto",  # Automatic device mapping
                "low_cpu_mem_usage": self.low_cpu_mem_usage,
            }

            # Add memory constraints if specified
            if self.max_memory_per_gpu:
                self.model_kwargs["max_memory"] = {0: self.max_memory_per_gpu}

        # ====================================================================
        # GENERATION CONFIGURATION DEFAULTS
        # ====================================================================
        if self.generation_config is None:
            self.generation_config = {
                "max_new_tokens": 512,  # Maximum tokens to generate
                "do_sample": True,  # Use sampling
                "temperature": 0.7,  # Sampling temperature
                "top_p": 0.9,  # Nucleus sampling
                "top_k": 50,  # Top-k sampling
                "repetition_penalty": 1.1,  # Repetition penalty
                "pad_token_id": None,  # Will be set from tokenizer
                "eos_token_id": None,  # Will be set from tokenizer
            }

        # ====================================================================
        # LANGUAGE CONFIGURATION DEFAULTS
        # ====================================================================
        if self.reasoning_languages is None:
            if self.primary_language == "fr":
                self.reasoning_languages = [
                    "French", "English", "Spanish", "Italian", "German"
                ]
            else:
                self.reasoning_languages = [
                    "English", "Spanish", "French", "Italian", "German",
                    "Chinese", "Hindi", "Japanese", "Korean", "Arabic"
                ]

        # ====================================================================
        # SCHEDULER CONFIGURATION DEFAULTS
        # ====================================================================
        if self.lr_scheduler_kwargs is None:
            self.lr_scheduler_kwargs = {"min_lr_rate": 0.1}

        # ====================================================================
        # CHAT TEMPLATE CONFIGURATION DEFAULTS (GPT-OSS Harmony Format)
        # ====================================================================
        if self.chat_template_kwargs is None:
            self.chat_template_kwargs = {
                "add_generation_prompt": True,
                "tokenize": False,
                "auto_insert_role": True,
                # GPT-OSS Harmony Format specific settings
                "reasoning_effort": "medium",  # low, medium, high
                "model_identity": "You are GPT-Tonic, a large language model trained by TonicAI.",
                "builtin_tools": [],  # Can include "browser" and/or "python"
            }

        # ====================================================================
        # VALIDATION AND COMPUTED VALUES
        # ====================================================================
        # Compute effective batch size
        effective_batch_size = self.batch_size * self.gradient_accumulation_steps

        # Set warmup steps if not provided
        if self.warmup_steps is None and self.max_steps:
            self.warmup_steps = int(self.max_steps * self.warmup_ratio)

        # Set max_length for dataset filtering
        if self.max_length is None:
            self.max_length = self.max_seq_length

        # Validate configuration
        self._validate_config()

        # Print comprehensive configuration summary
        self._print_config_summary(effective_batch_size)

    def _validate_config(self):
        """Validate configuration parameters"""

        # Validate batch configuration
        if self.batch_size < 1:
            raise ValueError("batch_size must be >= 1")
        if self.gradient_accumulation_steps < 1:
            raise ValueError("gradient_accumulation_steps must be >= 1")

        # Validate learning rate
        if self.learning_rate <= 0:
            raise ValueError("learning_rate must be > 0")
        if self.min_lr >= self.learning_rate:
            raise ValueError("min_lr must be < learning_rate")

        # Validate sequence length
        if self.max_seq_length < 1:
            raise ValueError("max_seq_length must be >= 1")

        # Validate dataset format
        valid_formats = ["openhermes_fr", "messages", "text", "custom"]
        if self.dataset_format not in valid_formats:
            raise ValueError(f"dataset_format must be one of {valid_formats}")

    def _print_config_summary(self, effective_batch_size):
        """Print detailed configuration summary"""

        print("\n" + "=" * 80)
        print("🚀 GPT-OSS ENHANCED CUSTOM CONFIGURATION")
        print("=" * 80)

        print(f"📊 Model & Training:")
        print(f"   • Model: {self.model_name}")
        print(f"   • Dataset: {self.dataset_name} ({self.dataset_format})")
        print(f"   • Primary Language: {self.primary_language}")
        print(f"   • Sequence Length: {self.max_seq_length}")
        print(f"   • Epochs: {self.num_train_epochs}")

        print(f"\n🔄 Batch Configuration:")
        print(f"   • Per-device Batch Size: {self.batch_size}")
        print(f"   • Gradient Accumulation: {self.gradient_accumulation_steps}")
        print(f"   • Effective Batch Size: {effective_batch_size}")

        print(f"\n📈 Learning Configuration:")
        print(f"   • Learning Rate: {self.learning_rate}")
        print(f"   • Min Learning Rate: {self.min_lr}")
        print(f"   • Weight Decay: {self.weight_decay}")
        print(f"   • Warmup Ratio: {self.warmup_ratio}")

        print(f"\n🎛️ LoRA Configuration:")
        print(f"   • Rank: {self.lora_config['r']}")
        print(f"   • Alpha: {self.lora_config['lora_alpha']}")
        print(f"   • Target Modules: {self.lora_config['target_modules']}")

        print(f"\n📁 Dataset Configuration:")
        print(f"   • Input Field: {self.input_field}")
        print(f"   • Target Field: {self.target_field}")
        print(f"   • Filter Bad Entries: {self.filter_bad_entries}")
        print(f"   • Max Samples: {self.max_samples or 'All'}")

        print(f"\n💾 Memory & Performance:")
        print(f"   • Mixed Precision: {'BF16' if self.bf16 else 'FP32'}")
        print(f"   • Gradient Checkpointing: {self.use_gradient_checkpointing}")
        print(f"   • Data Workers: {self.dataloader_num_workers}")
        print(f"   • Group by Length: {self.group_by_length}")

        print("=" * 80 + "\n")

# Create the config instance with OpenHermes-FR optimized defaults
config = GPTOSSEnhancedCustomConfig()
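For orientation, a minimal usage sketch (not part of the commit): every field above is a plain dataclass attribute, so a run can override just the values it needs and let __post_init__ fill in the LoRA, quantization, and generation defaults. The override values below are illustrative only.

from config.train_gpt_oss_custom import GPTOSSEnhancedCustomConfig

# Hypothetical smoke-test overrides; any field not passed keeps the defaults
# defined above. __post_init__ builds the nested default dicts, validates the
# values, and prints the configuration summary on construction.
config = GPTOSSEnhancedCustomConfig(
    dataset_name="legmlai/openhermes-fr",
    max_samples=1_000,              # subsample the dataset for a dry run
    num_train_epochs=0.1,           # a fraction of an epoch
    batch_size=1,
    gradient_accumulation_steps=4,  # effective batch size: 4
    push_to_hub=False,
)
assert config.max_length == config.max_seq_length  # auto-filled in __post_init__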
    	
        config/train_gpt_oss_openhermes_fr.py
    ADDED
    
    | @@ -0,0 +1,174 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            """
         | 
| 2 | 
            +
            GPT-OSS OpenHermes-FR Optimized Configuration
         | 
| 3 | 
            +
            Specifically optimized for the legmlai/openhermes-fr dataset
         | 
| 4 | 
            +
            800K French instruction-response pairs with quality filtering
         | 
| 5 | 
            +
            """
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            from config.train_gpt_oss_custom import GPTOSSEnhancedCustomConfig
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            # OpenHermes-FR optimized configuration
         | 
| 10 | 
            +
            config = GPTOSSEnhancedCustomConfig(
         | 
| 11 | 
            +
                # ============================================================================
         | 
| 12 | 
            +
                # DATASET CONFIGURATION - OpenHermes-FR Specific
         | 
| 13 | 
            +
                # ============================================================================
         | 
| 14 | 
            +
                dataset_name="legmlai/openhermes-fr",
         | 
| 15 | 
            +
                dataset_split="train",
         | 
| 16 | 
            +
                dataset_format="openhermes_fr",
         | 
| 17 | 
            +
                
         | 
| 18 | 
            +
                # OpenHermes-FR field mapping
         | 
| 19 | 
            +
                input_field="prompt",                    # French prompts
         | 
| 20 | 
            +
                target_field="accepted_completion",      # GPT-4o generated completions
         | 
| 21 | 
            +
                
         | 
| 22 | 
            +
                # Quality filtering using OpenHermes-FR metadata
         | 
| 23 | 
            +
                filter_bad_entries=True,                 # Use built-in quality flags
         | 
| 24 | 
            +
                bad_entry_field="bad_entry",
         | 
| 25 | 
            +
                bad_prompt_field="bad_prompt_detected",
         | 
| 26 | 
            +
                bad_response_field="bad_response_detected",
         | 
| 27 | 
            +
                
         | 
| 28 | 
            +
                # Data processing optimized for French with GPT-OSS Harmony Format
         | 
| 29 | 
            +
                concatenate_fields=True,
         | 
| 30 | 
            +
                field_separator="\n\n### Réponse:\n",   # Fallback separator (harmony format takes precedence)
         | 
| 31 | 
            +
                add_eos_token=True,
         | 
| 32 | 
            +
                use_harmony_format=True,                 # Enable GPT-OSS harmony format
         | 
| 33 | 
            +
                
         | 
| 34 | 
            +
                # Dataset sampling (use all 800K examples by default)
         | 
| 35 | 
            +
                max_samples=None,                        # Use full dataset
         | 
| 36 | 
            +
                min_length=20,                          # Minimum for meaningful French text
         | 
| 37 | 
            +
                max_length=None,                        # Auto-set to max_seq_length
         | 
| 38 | 
            +
                
         | 
| 39 | 
            +
                # ============================================================================
         | 
| 40 | 
            +
                # TRAINING HYPERPARAMETERS - French Language Optimized
         | 
| 41 | 
            +
                # ============================================================================
         | 
| 42 | 
            +
                num_train_epochs=1.5,                   # 1.5 epochs optimal for large dataset
         | 
| 43 | 
            +
                batch_size=6,                           # Balanced for most GPUs
         | 
| 44 | 
            +
                gradient_accumulation_steps=6,          # Effective batch size: 36
         | 
| 45 | 
            +
                
         | 
| 46 | 
            +
                # Learning rate schedule optimized for French fine-tuning
         | 
| 47 | 
            +
                learning_rate=2.5e-4,                   # Slightly higher for multilingual
         | 
| 48 | 
            +
                min_lr=2.5e-5,                          # 10% of max learning rate
         | 
| 49 | 
            +
                warmup_ratio=0.05,                      # 5% warmup for stability
         | 
| 50 | 
            +
                weight_decay=0.01,                      # Standard L2 regularization
         | 
| 51 | 
            +
                max_grad_norm=1.0,                      # Gradient clipping
         | 
| 52 | 
            +
                
         | 
| 53 | 
            +
                # ============================================================================
         | 
| 54 | 
            +
                # MODEL CONFIGURATION - Optimized for French
         | 
| 55 | 
            +
                # ============================================================================
         | 
| 56 | 
            +
                model_name="openai/gpt-oss-20b",
         | 
| 57 | 
            +
                max_seq_length=3072,                    # Balanced length for French
         | 
| 58 | 
            +
                use_flash_attention=True,
         | 
| 59 | 
            +
                use_gradient_checkpointing=True,
         | 
| 60 | 
            +
                
         | 
| 61 | 
            +
                # Mixed precision for efficiency
         | 
| 62 | 
            +
                fp16=False,
         | 
| 63 | 
            +
                bf16=True,                              # Better for GPT-OSS
         | 
| 64 | 
            +
                
         | 
| 65 | 
            +
                # ============================================================================
         | 
| 66 | 
            +
                # LORA CONFIGURATION - Optimized for French Language Learning
         | 
| 67 | 
            +
                # ============================================================================
         | 
| 68 | 
            +
                use_lora=True,
         | 
| 69 | 
            +
                lora_config={
         | 
| 70 | 
            +
                    "r": 24,                            # Higher rank for language adaptation
         | 
| 71 | 
            +
                    "lora_alpha": 48,                   # 2x rank scaling
         | 
| 72 | 
            +
                    "lora_dropout": 0.05,               # Light regularization
         | 
| 73 | 
            +
                    "target_modules": "all-linear",
         | 
| 74 | 
            +
                    "target_parameters": [
         | 
| 75 | 
            +
                        "7.mlp.experts.gate_up_proj",
         | 
| 76 | 
            +
                        "7.mlp.experts.down_proj",
         | 
| 77 | 
            +
                        "15.mlp.experts.gate_up_proj", 
         | 
| 78 | 
            +
                        "15.mlp.experts.down_proj",
         | 
| 79 | 
            +
                        "23.mlp.experts.gate_up_proj",
         | 
| 80 | 
            +
                        "23.mlp.experts.down_proj",
         | 
| 81 | 
            +
                    ],
         | 
| 82 | 
            +
                    "bias": "none",
         | 
| 83 | 
            +
                    "task_type": "CAUSAL_LM",
         | 
| 84 | 
            +
                },
         | 
| 85 | 
            +
                
         | 
| 86 | 
            +
                # ============================================================================
         | 
| 87 | 
            +
                # QUANTIZATION - Balanced Performance/Memory
         | 
| 88 | 
            +
                # ============================================================================
         | 
| 89 | 
            +
                use_quantization=True,
         | 
| 90 | 
            +
                quantization_config={
         | 
| 91 | 
            +
                    "dequantize": True,                 # MXFP4 as per GPT-OSS tutorial
         | 
| 92 | 
            +
                    "load_in_4bit": False,              # Standard precision for quality
         | 
| 93 | 
            +
                },
         | 
| 94 | 
            +
                
         | 
| 95 | 
            +
                # ============================================================================
         | 
| 96 | 
            +
                # PERFORMANCE OPTIMIZATION
         | 
| 97 | 
            +
                # ============================================================================
         | 
| 98 | 
            +
                # Data loading optimized for large dataset
         | 
| 99 | 
            +
                dataloader_num_workers=6,               # More workers for large dataset
         | 
| 100 | 
            +
                dataloader_pin_memory=True,
         | 
| 101 | 
            +
                dataloader_prefetch_factor=3,           # Higher prefetch for efficiency
         | 
| 102 | 
            +
                
         | 
| 103 | 
            +
                # Memory management
         | 
| 104 | 
            +
                low_cpu_mem_usage=True,
         | 
| 105 | 
            +
                group_by_length=True,                   # Efficient batching
         | 
| 106 | 
            +
                remove_unused_columns=True,
         | 
| 107 | 
            +
                
         | 
| 108 | 
            +
                # ============================================================================
         | 
| 109 | 
            +
                # EVALUATION & LOGGING
         | 
| 110 | 
            +
                # ============================================================================
         | 
| 111 | 
            +
                eval_strategy="steps",
         | 
| 112 | 
            +
                eval_steps=200,                         # Evaluate every 200 steps
         | 
| 113 | 
            +
                logging_steps=20,                       # Log every 20 steps
         | 
| 114 | 
            +
                
         | 
| 115 | 
            +
                save_strategy="steps", 
         | 
| 116 | 
            +
                save_steps=500,                         # Save every 500 steps
         | 
| 117 | 
            +
                save_total_limit=3,                     # Keep 3 best checkpoints
         | 
| 118 | 
            +
                
         | 
| 119 | 
            +
                metric_for_best_model="eval_loss",
         | 
| 120 | 
            +
                greater_is_better=False,
         | 
| 121 | 
            +
                load_best_model_at_end=True,
         | 
| 122 | 
            +
                
         | 
| 123 | 
            +
                # ============================================================================
         | 
| 124 | 
            +
                # MULTILINGUAL & FRENCH SPECIFIC SETTINGS
         | 
| 125 | 
            +
                # ============================================================================
         | 
| 126 | 
            +
                primary_language="fr",                  # French as primary language
         | 
| 127 | 
            +
                reasoning_languages=["French", "English"],  # Bilingual reasoning
+    domain_focus="instruction",             # Instruction following
+
+    # ============================================================================
+    # GENERATION CONFIG FOR EVALUATION - GPT-OSS Harmony Format
+    # ============================================================================
+    generation_config={
+        "max_new_tokens": 512,
+        "do_sample": True,
+        "temperature": 0.7,
+        "top_p": 0.9,
+        "top_k": 50,
+        "repetition_penalty": 1.1,
+        "pad_token_id": None,
+        "eos_token_id": None,
+        # GPT-OSS Harmony Format specific settings
+        "reasoning_effort": "medium",           # Configurable reasoning level
+        "use_harmony_format": True,             # Ensure harmony format in generation
+    },
+
+    # ============================================================================
+    # HF HUB INTEGRATION
+    # ============================================================================
+    push_to_hub=False,                      # Set to True to auto-push
+    hub_model_id=None,                      # Will be set by launch script
+    hub_private_repo=False,
+
+    # ============================================================================
+    # MONITORING
+    # ============================================================================
+    enable_tracking=True,                   # Trackio monitoring
+    log_artifacts=True,
+    log_metrics=True,
+    log_config=True,
+)
+
+# Print configuration summary on import
+print("\n🇫🇷 OpenHermes-FR Configuration Loaded")
+print("=" * 50)
+print(f"📊 Dataset: {config.dataset_name}")
+print(f"🗣️  Language: French (with {config.dataset_format} format)")
+print(f"📈 Training: {config.num_train_epochs} epochs")
+print(f"🔄 Effective Batch Size: {config.batch_size * config.gradient_accumulation_steps}")
+print(f"🧠 LoRA Rank: {config.lora_config['r']}")
+print(f"📏 Sequence Length: {config.max_seq_length}")
+print(f"🔍 Quality Filtering: {'Enabled' if config.filter_bad_entries else 'Disabled'}")
+print(f"🎵 GPT-OSS Harmony Format: {'Enabled' if config.use_harmony_format else 'Disabled'}")
+print("=" * 50)
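A quick note on the summary block above: the "Effective Batch Size" line is simply the per-device batch size multiplied by the gradient accumulation steps. A minimal consumption sketch (hedged: it assumes the repo root is on sys.path, and mirrors this config's batch_size=6 and gradient_accumulation_steps=6 as set by launch.sh):

    # Hedged sketch: import the ready-made `config` object this module exposes.
    from config.train_gpt_oss_openhermes_fr import config

    # Effective batch size = per-device batch * accumulation steps (6 * 6 = 36).
    effective_bs = config.batch_size * config.gradient_accumulation_steps
    assert effective_bs == 36

    # Rough optimizer-step count for one pass over N usable rows; OpenHermes-FR
    # starts at ~800K rows and shrinks after bad-entry filtering, so this is an
    # upper bound.
    steps_per_epoch = 800_000 // effective_bs  # ~22,222 steps
    print(effective_bs, steps_per_epoch)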
    	
config/train_gpt_oss_openhermes_fr_memory_optimized.py ADDED

@@ -0,0 +1,233 @@
+"""
+GPT-OSS OpenHermes-FR Memory-Optimized Configuration
+Combines memory optimization best practices with OpenHermes-FR dataset
+Optimized for GPT-OSS harmony format and MXFP4 quantization
+Based on OpenAI GPT-OSS specifications and memory optimization principles
+"""
+
+from config.train_gpt_oss_custom import GPTOSSEnhancedCustomConfig
+
+# Memory-optimized OpenHermes-FR configuration for GPT-OSS
+config = GPTOSSEnhancedCustomConfig(
+    # ============================================================================
+    # DATASET CONFIGURATION - OpenHermes-FR with Harmony Format
+    # ============================================================================
+    dataset_name="legmlai/openhermes-fr",
+    dataset_split="train",
+    dataset_format="openhermes_fr",
+
+    # OpenHermes-FR field mapping optimized for harmony format
+    input_field="prompt",                    # French prompts
+    target_field="accepted_completion",      # GPT-4o generated completions
+
+    # Enhanced quality filtering for memory-constrained training
+    filter_bad_entries=True,                 # Critical for memory efficiency
+    bad_entry_field="bad_entry",
+    bad_prompt_field="bad_prompt_detected",
+    bad_response_field="bad_response_detected",
+
+    # Memory-optimized data processing with GPT-OSS Harmony Format
+    concatenate_fields=True,
+    field_separator="\n\n### Réponse:\n",   # Fallback separator (harmony format takes precedence)
+    add_eos_token=True,                      # Required for proper training
+    use_harmony_format=True,                 # Enable GPT-OSS harmony format
+
+    # Dataset sampling optimized for memory constraints
+    max_samples=200000,                      # Reduced from 800K for memory efficiency
+    min_length=15,                           # Slightly higher minimum for quality
+    max_length=2048,                         # Explicit max length for memory control
+
+    # ============================================================================
+    # MEMORY-OPTIMIZED TRAINING HYPERPARAMETERS
+    # ============================================================================
+    # Batch configuration following memory optimization principles
+    num_train_epochs=1.0,                   # Single epoch to reduce memory pressure
+    batch_size=2,                           # Reduced from 6 for memory efficiency
+    gradient_accumulation_steps=16,         # Increased to maintain effective batch size 32
+
+    # Learning rate optimized for single epoch + memory constraints
+    learning_rate=2e-4,                     # Standard GPT-OSS learning rate
+    min_lr=2e-5,                            # 10% of max learning rate
+    warmup_ratio=0.03,                      # Reduced warmup for memory efficiency
+    weight_decay=0.01,                      # Standard L2 regularization
+    max_grad_norm=1.0,                      # Gradient clipping for stability
+
+    # ============================================================================
+    # MODEL CONFIGURATION - Memory Optimized for GPT-OSS
+    # ============================================================================
+    model_name="openai/gpt-oss-20b",
+    max_seq_length=1024,                    # Reduced from 3072 for memory optimization
+    use_flash_attention=True,               # Critical for memory efficiency
+    use_gradient_checkpointing=True,        # Essential for memory optimization
+
+    # Mixed precision optimized for GPT-OSS MXFP4
+    fp16=False,                             # Not recommended for GPT-OSS
+    bf16=True,                              # Required for GPT-OSS stability
+    tf32=True,                              # Enable TF32 for A100/H100 efficiency
+
+    # ============================================================================
+    # LORA CONFIGURATION - Memory Optimized for GPT-OSS MoE
+    # ============================================================================
+    use_lora=True,
+    lora_config={
+        "r": 8,                             # Reduced rank for memory efficiency
+        "lora_alpha": 16,                   # 2x rank scaling (memory optimized)
+        "lora_dropout": 0.1,                # Higher dropout for better generalization
+        "target_modules": "all-linear",     # Apply to all linear layers
+        "target_parameters": [
+            # GPT-OSS specific MoE expert targeting
+            "7.mlp.experts.gate_up_proj",
+            "7.mlp.experts.down_proj",
+            "15.mlp.experts.gate_up_proj",
+            "15.mlp.experts.down_proj",
+            "23.mlp.experts.gate_up_proj",
+            "23.mlp.experts.down_proj",
+        ],
+        "bias": "none",                     # No bias adaptation for memory efficiency
+        "task_type": "CAUSAL_LM",
+        "modules_to_save": [],              # Don't save additional modules for memory
+    },
+
+    # ============================================================================
+    # QUANTIZATION - GPT-OSS Native MXFP4 Optimization
+    # ============================================================================
+    use_quantization=True,
+    quantization_config={
+        "dequantize": True,                 # Use native MXFP4 as per GPT-OSS specs
+        "load_in_4bit": False,              # Don't use BNB 4-bit with MXFP4
+        "mxfp4_config": {                   # Native GPT-OSS MXFP4 settings
+            "enabled": True,
+            "block_size": 32,               # Optimized block size for MoE
+        }
+    },
+
+    # ============================================================================
+    # MEMORY OPTIMIZATION CONFIGURATION
+    # ============================================================================
+    # Model loading with memory constraints
+    model_kwargs={
+        "attn_implementation": "eager",     # Memory-safe attention
+        "torch_dtype": "auto",              # Let model decide (MXFP4 compatible)
+        "use_cache": False,                 # Disable KV cache for training
+        "device_map": "auto",               # Automatic device mapping
+        "low_cpu_mem_usage": True,          # Critical for memory optimization
+        "max_memory": {0: "75GB"},          # Reserve memory for other processes
+    },
+
+    # Data loading optimized for memory efficiency
+    dataloader_num_workers=2,               # Reduced workers to save memory
+    dataloader_pin_memory=False,            # Disable to save memory
+    dataloader_prefetch_factor=1,           # Minimal prefetch for memory
+
+    # Memory management optimizations
+    max_memory_per_gpu="75GB",              # Explicit memory limit
+    low_cpu_mem_usage=True,                 # Essential for large models
+    group_by_length=True,                   # Efficient batching for memory
+    remove_unused_columns=True,             # Remove unnecessary data
+
+    # ============================================================================
+    # EVALUATION & LOGGING - Memory Efficient
+    # ============================================================================
+    eval_strategy="steps",
+    eval_steps=500,                         # Less frequent evaluation for memory
+    logging_steps=50,                       # Reduced logging frequency
+
+    save_strategy="steps",
+    save_steps=1000,                        # Less frequent saves for memory/storage
+    save_total_limit=2,                     # Keep only 2 checkpoints for memory
+    save_only_model=True,                   # Save only model weights
+
+    metric_for_best_model="eval_loss",
+    greater_is_better=False,
+    load_best_model_at_end=True,
+
+    # Evaluation memory optimization
+    eval_accumulation_steps=4,              # Accumulate eval outputs to save memory
+    eval_batch_size=1,                      # Smaller eval batch size
+
+    # ============================================================================
+    # GPT-OSS HARMONY FORMAT OPTIMIZATION
+    # ============================================================================
+    # Chat template for harmony format compatibility (following exact template)
+    use_chat_template=False,                # Use custom harmony format instead
+    chat_template_kwargs={
+        "add_generation_prompt": True,
+        "tokenize": False,
+        # GPT-OSS Harmony Format specific settings (exact template format)
+        "reasoning_effort": "medium",       # low, medium, high
+        "model_identity": "You are GPT-Tonic, a large language model trained by TonicAI.",
+        "builtin_tools": [],                # Can include "browser" and/or "python"
+    },
+
+    # Generation config optimized for GPT-OSS harmony format (exact template compliance)
+    generation_config={
+        "max_new_tokens": 256,              # Reduced for memory efficiency
+        "do_sample": True,
+        "temperature": 0.6,                 # Slightly lower for more focused training
+        "top_p": 0.9,
+        "top_k": 40,                        # Reduced for memory efficiency
+        "repetition_penalty": 1.1,
+        "pad_token_id": None,
+        "eos_token_id": None,
+        # GPT-OSS Harmony Format specific settings (exact template format)
+        "reasoning_effort": "medium",       # Configurable reasoning level
+        "use_harmony_format": True,         # Ensure harmony format in generation
+    },
+
+    # ============================================================================
+    # MULTILINGUAL & REASONING OPTIMIZATION
+    # ============================================================================
+    primary_language="fr",                  # French as primary language
+    reasoning_languages=["French", "English"],  # Bilingual reasoning capability
+    domain_focus="reasoning",               # Align with GPT-OSS reasoning focus
+
+    # ============================================================================
+    # OPTIMIZER & SCHEDULER - Memory Optimized
+    # ============================================================================
+    optimizer="adamw_torch",                # Memory-efficient optimizer
+    beta1=0.9,
+    beta2=0.95,                             # GPT-OSS optimized beta2
+    eps=1e-8,
+
+    scheduler="cosine_with_min_lr",         # Stable scheduler for single epoch
+    lr_scheduler_kwargs={
+        "min_lr_rate": 0.1,
+        "warmup_steps": None,               # Use warmup_ratio instead
+    },
+
+    # ============================================================================
+    # MONITORING & HUB INTEGRATION
+    # ============================================================================
+    enable_tracking=True,                   # Trackio monitoring
+    log_artifacts=False,                    # Disable to save memory/storage
+    log_metrics=True,
+    log_config=True,
+
+    push_to_hub=False,                      # Set to True after successful training
+    hub_model_id=None,
+    hub_private_repo=False,
+)
+
+# Configuration validation and optimization tips
+print("\n🔧 GPT-OSS Memory-Optimized OpenHermes-FR Configuration")
+print("=" * 60)
+print(f"📊 Dataset: {config.dataset_name} (200K samples)")
+print(f"🗣️  Language: French with GPT-OSS Harmony Format")
+print(f"📈 Training: {config.num_train_epochs} epoch (memory optimized)")
+print(f"🔄 Effective Batch Size: {config.batch_size * config.gradient_accumulation_steps}")
+print(f"🧠 LoRA Rank: {config.lora_config['r']} (memory optimized)")
+print(f"📏 Sequence Length: {config.max_seq_length} (memory optimized)")
+print(f"💾 Memory Limit: {config.max_memory_per_gpu}")
+print(f"⚡ Quantization: MXFP4 (GPT-OSS native)")
+print(f"🔍 Quality Filtering: Enabled")
+print(f"🎵 GPT-OSS Harmony Format: {'Enabled' if config.use_harmony_format else 'Disabled'}")
+print("=" * 60)
+print("\n💡 Memory Optimization Features:")
+print("  • Native MXFP4 quantization for GPT-OSS MoE layers")
+print("  • Reduced batch size with increased gradient accumulation")
+print("  • Limited sequence length for memory efficiency")
+print("  • Reduced LoRA rank while maintaining effectiveness")
+print("  • Dataset sampling (200K from 800K) for faster training")
+print("  • Gradient checkpointing and efficient data loading")
+print("  • Exact GPT-OSS Harmony format with <|return|> tokens")
+print("=" * 60)
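For reference, the lora_config dict above maps onto peft's LoraConfig: target_modules="all-linear" adapts every linear layer, while target_parameters (a newer peft option; older releases only accept target_modules) attaches adapters directly to the MoE expert weight tensors of layers 7, 15 and 23. A hedged standalone sketch of the equivalent peft object:

    from peft import LoraConfig

    # Sketch only: requires a peft release that supports `target_parameters`.
    peft_config = LoraConfig(
        r=8,                          # low rank keeps adapter memory small
        lora_alpha=16,                # 2x-rank scaling, as in the dict above
        lora_dropout=0.1,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules="all-linear",  # every linear layer gets an adapter
        target_parameters=[           # plus the expert tensors of layers 7/15/23
            "7.mlp.experts.gate_up_proj",
            "7.mlp.experts.down_proj",
            "15.mlp.experts.gate_up_proj",
            "15.mlp.experts.down_proj",
            "23.mlp.experts.gate_up_proj",
            "23.mlp.experts.down_proj",
        ],
    )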
    	
docs/output.svg ADDED

(SVG image asset; no text diff rendered)
launch.sh CHANGED

@@ -234,7 +234,34 @@ show_training_configs() {
     echo "   - 4-bit quantization + reduced LoRA"
     echo "   - Optimized for limited GPU memory"
     echo ""
-    echo "9. Custom Configuration"
+    echo "9. GPT-OSS OpenHermes-FR (Recommended)"
+    echo "   - Model: openai/gpt-oss-20b"
+    echo "   - Dataset: legmlai/openhermes-fr (800K French examples)"
+    echo "   - Epochs: 1.5"
+    echo "   - Batch Size: 6 (effective 36 with accumulation)"
+    echo "   - Learning Rate: 2.5e-4"
+    echo "   - Optimized for French language training"
+    echo "   - Quality filtering enabled"
+    echo ""
+    echo "10. GPT-OSS OpenHermes-FR Memory Optimized"
+    echo "   - Model: openai/gpt-oss-20b"
+    echo "   - Dataset: legmlai/openhermes-fr (200K samples)"
+    echo "   - Epochs: 1"
+    echo "   - Batch Size: 2 (effective 32 with accumulation)"
+    echo "   - Learning Rate: 2e-4"
+    echo "   - Native MXFP4 quantization"
+    echo "   - Memory optimized for 40-80GB GPUs"
+    echo "   - Harmony format compatible"
+    echo ""
+    echo "11. GPT-OSS Custom Dataset"
+    echo "   - Model: openai/gpt-oss-20b"
+    echo "   - Dataset: User-defined (fully customizable)"
+    echo "   - Epochs: Configurable"
+    echo "   - Batch Size: Configurable"
+    echo "   - Learning Rate: Configurable"
+    echo "   - Maximum flexibility with all parameters"
+    echo ""
+    echo "12. Custom Configuration"
     echo "   - User-defined parameters"
     echo ""
 }

@@ -325,12 +352,142 @@ get_training_config() {
             MAX_SEQ_LENGTH=1024
             CONFIG_FILE="config/train_gpt_oss_memory_optimized.py"
             ;;
+        "GPT-OSS OpenHermes-FR (Recommended)")
+            MODEL_NAME="openai/gpt-oss-20b"
+            DATASET_NAME="legmlai/openhermes-fr"
+            MAX_EPOCHS=1.5
+            BATCH_SIZE=6
+            GRADIENT_ACCUMULATION_STEPS=6
+            LEARNING_RATE=2.5e-4
+            MAX_SEQ_LENGTH=3072
+            CONFIG_FILE="config/train_gpt_oss_openhermes_fr.py"
+            ;;
+        "GPT-OSS OpenHermes-FR Memory Optimized")
+            MODEL_NAME="openai/gpt-oss-20b"
+            DATASET_NAME="legmlai/openhermes-fr"
+            MAX_EPOCHS=1
+            BATCH_SIZE=2
+            GRADIENT_ACCUMULATION_STEPS=16
+            LEARNING_RATE=2e-4
+            MAX_SEQ_LENGTH=1024
+            CONFIG_FILE="config/train_gpt_oss_openhermes_fr_memory_optimized.py"
+            ;;
+        "GPT-OSS Custom Dataset")
+            MODEL_NAME="openai/gpt-oss-20b"
+            DATASET_NAME="legmlai/openhermes-fr"  # Will be customizable
+            MAX_EPOCHS=1
+            BATCH_SIZE=4
+            GRADIENT_ACCUMULATION_STEPS=4
+            LEARNING_RATE=2e-4
+            MAX_SEQ_LENGTH=2048
+            CONFIG_FILE="config/train_gpt_oss_custom.py"
+            get_custom_dataset_config
+            ;;
         "Custom Configuration")
             get_custom_config
             ;;
     esac
 }

+# Function to get custom dataset configuration
+get_custom_dataset_config() {
+    print_step "GPT-OSS Custom Configuration"
+    echo "======================================"
+
+    echo "Configure your GPT-OSS training:"
+    echo ""
+
+    # Dataset Configuration
+    print_info "📊 Dataset Configuration"
+    get_input "Dataset name (HuggingFace format: username/dataset)" "legmlai/openhermes-fr" DATASET_NAME
+    get_input "Dataset split" "train" DATASET_SPLIT
+
+    echo ""
+    echo "Dataset format options:"
+    echo "1. OpenHermes-FR (prompt + accepted_completion fields)"
+    echo "2. Messages format (chat conversations)"
+    echo "3. Text format (plain text field)"
+    echo "4. Custom format (specify field names)"
+    echo ""
+
+    select_option "Select dataset format:" "OpenHermes-FR" "Messages format" "Text format" "Custom format" DATASET_FORMAT
+
+    case "$DATASET_FORMAT" in
+        "OpenHermes-FR")
+            INPUT_FIELD="prompt"
+            TARGET_FIELD="accepted_completion"
+            DATASET_FORMAT_CODE="openhermes_fr"
+            FILTER_BAD_ENTRIES="true"
+            ;;
+        "Messages format")
+            INPUT_FIELD="messages"
+            TARGET_FIELD=""
+            DATASET_FORMAT_CODE="messages"
+            FILTER_BAD_ENTRIES="false"
+            ;;
+        "Text format")
+            INPUT_FIELD="text"
+            TARGET_FIELD=""
+            DATASET_FORMAT_CODE="text"
+            FILTER_BAD_ENTRIES="false"
+            ;;
+        "Custom format")
+            get_input "Input field name" "prompt" INPUT_FIELD
+            get_input "Target field name (leave empty if not needed)" "accepted_completion" TARGET_FIELD
+            DATASET_FORMAT_CODE="custom"
+            get_input "Filter bad entries? (true/false)" "false" FILTER_BAD_ENTRIES
+            ;;
+    esac
+
+    # Dataset Filtering Options
+    echo ""
+    print_info "🔍 Dataset Filtering Options"
+    get_input "Maximum samples to use (leave empty for all)" "" MAX_SAMPLES
+    get_input "Minimum sequence length" "10" MIN_LENGTH
+    get_input "Maximum sequence length (leave empty for auto)" "" MAX_LENGTH
+
+    # Training Hyperparameters
+    echo ""
+    print_info "⚙️ Training Hyperparameters"
+    get_input "Number of epochs" "1.0" NUM_EPOCHS
+    get_input "Batch size per device" "4" BATCH_SIZE
+    get_input "Gradient accumulation steps" "4" GRAD_ACCUM_STEPS
+    get_input "Learning rate" "2e-4" LEARNING_RATE
+    get_input "Minimum learning rate" "2e-5" MIN_LR
+    get_input "Weight decay" "0.01" WEIGHT_DECAY
+    get_input "Warmup ratio" "0.03" WARMUP_RATIO
+
+    # Sequence Length
+    echo ""
+    print_info "📏 Sequence Configuration"
+    get_input "Maximum sequence length" "2048" MAX_SEQ_LENGTH
+
+    # LoRA Configuration
+    echo ""
+    print_info "🎛️ LoRA Configuration"
+    get_input "LoRA rank" "16" LORA_RANK
+    get_input "LoRA alpha" "32" LORA_ALPHA
+    get_input "LoRA dropout" "0.05" LORA_DROPOUT
+
+    # Memory & Performance
+    echo ""
+    print_info "💾 Memory & Performance"
+    select_option "Mixed precision:" "BF16 (recommended)" "FP16" "FP32" MIXED_PRECISION
+    get_input "Data loading workers" "4" NUM_WORKERS
+    select_option "Quantization:" "MXFP4 (default)" "4-bit BNB" "None" QUANTIZATION_TYPE
+
+    # Advanced Options
+    echo ""
+    echo "Advanced options (press Enter for defaults):"
+    get_input "Max gradient norm" "1.0" MAX_GRAD_NORM
+    get_input "Logging steps" "10" LOGGING_STEPS
+    get_input "Evaluation steps" "100" EVAL_STEPS
+    get_input "Save steps" "500" SAVE_STEPS
+
+    # Update the custom config file with user's choices
+    update_enhanced_gpt_oss_config
+}
+
 # Function to get custom configuration
 get_custom_config() {
     print_step "Custom Configuration Setup"

@@ -352,6 +509,136 @@ get_custom_config() {
     fi
 }

+# Function to update enhanced GPT-OSS config with user choices
+update_enhanced_gpt_oss_config() {
+    print_info "Generating enhanced custom GPT-OSS configuration..."
+
+    # Process mixed precision setting
+    case "$MIXED_PRECISION" in
+        "BF16 (recommended)")
+            FP16="False"
+            BF16="True"
+            ;;
+        "FP16")
+            FP16="True"
+            BF16="False"
+            ;;
+        "FP32")
+            FP16="False"
+            BF16="False"
+            ;;
+    esac
+
+    # Process quantization setting
+    case "$QUANTIZATION_TYPE" in
+        "MXFP4 (default)")
+            USE_QUANTIZATION="True"
+            QUANTIZATION_CONFIG='{"dequantize": True, "load_in_4bit": False}'
+            ;;
+        "4-bit BNB")
+            USE_QUANTIZATION="True"
+            QUANTIZATION_CONFIG='{"dequantize": False, "load_in_4bit": True, "bnb_4bit_compute_dtype": "bfloat16", "bnb_4bit_use_double_quant": True, "bnb_4bit_quant_type": "nf4"}'
+            ;;
+        "None")
+            USE_QUANTIZATION="False"
+            QUANTIZATION_CONFIG='{"dequantize": False, "load_in_4bit": False}'
+            ;;
+    esac
+
+    # Create enhanced config file with all user choices
+    cat > "$CONFIG_FILE" << EOF
+"""
+GPT-OSS Enhanced Custom Training Configuration - Generated by launch.sh
+Dataset: $DATASET_NAME ($DATASET_FORMAT)
+Optimized for: ${DATASET_FORMAT} format with full customization
+"""
+
+from config.train_gpt_oss_custom import GPTOSSEnhancedCustomConfig
+
+# Create enhanced config with all customizations
+config = GPTOSSEnhancedCustomConfig(
+    # ============================================================================
+    # DATASET CONFIGURATION
+    # ============================================================================
+    dataset_name="$DATASET_NAME",
+    dataset_split="$DATASET_SPLIT",
+    dataset_format="$DATASET_FORMAT_CODE",
+    input_field="$INPUT_FIELD",
+    target_field=$(if [ -n "$TARGET_FIELD" ]; then echo "\"$TARGET_FIELD\""; else echo "None"; fi),
+    filter_bad_entries=$FILTER_BAD_ENTRIES,
+    max_samples=$(if [ -n "$MAX_SAMPLES" ]; then echo "$MAX_SAMPLES"; else echo "None"; fi),
+    min_length=$MIN_LENGTH,
+    max_length=$(if [ -n "$MAX_LENGTH" ]; then echo "$MAX_LENGTH"; else echo "None"; fi),
+
+    # ============================================================================
+    # TRAINING HYPERPARAMETERS
+    # ============================================================================
+    num_train_epochs=$NUM_EPOCHS,
+    batch_size=$BATCH_SIZE,
+    gradient_accumulation_steps=$GRAD_ACCUM_STEPS,
+    learning_rate=$LEARNING_RATE,
+    min_lr=$MIN_LR,
+    weight_decay=$WEIGHT_DECAY,
+    warmup_ratio=$WARMUP_RATIO,
+    max_grad_norm=$MAX_GRAD_NORM,
+
+    # ============================================================================
+    # MODEL CONFIGURATION
+    # ============================================================================
+    max_seq_length=$MAX_SEQ_LENGTH,
+
+    # ============================================================================
+    # MIXED PRECISION
+    # ============================================================================
+    fp16=$FP16,
+    bf16=$BF16,
+
+    # ============================================================================
+    # LORA CONFIGURATION
+    # ============================================================================
+    lora_config={
+        "r": $LORA_RANK,
+        "lora_alpha": $LORA_ALPHA,
+        "lora_dropout": $LORA_DROPOUT,
+        "target_modules": "all-linear",
+        "bias": "none",
+        "task_type": "CAUSAL_LM",
+    },
+
+    # ============================================================================
+    # QUANTIZATION CONFIGURATION
+    # ============================================================================
+    use_quantization=$USE_QUANTIZATION,
+    quantization_config=$QUANTIZATION_CONFIG,
+
+    # ============================================================================
+    # PERFORMANCE CONFIGURATION
+    # ============================================================================
+    dataloader_num_workers=$NUM_WORKERS,
+    dataloader_pin_memory=True,
[remaining added lines of this hunk not rendered in this view]
 # Function to create training configuration file
 create_training_config() {
     local config_file="$1"

@@ -499,7 +786,7 @@ print_step "Step 2: Training Configuration"
 echo "=================================="

 show_training_configs
-select_option "Select training configuration:" "Basic Training" "H100 Lightweight (Rapid)" "A100 Large Scale" "Multiple Passes" "GPT-OSS Basic Training" "GPT-OSS H100 Optimized" "GPT-OSS Multilingual Reasoning" "GPT-OSS Memory Optimized" "Custom Configuration" TRAINING_CONFIG_TYPE
[replacement line not rendered in this view]

 get_training_config "$TRAINING_CONFIG_TYPE"

@@ -836,13 +1123,25 @@ print_info "Dataset: $DATASET_NAME"
 print_info "Batch size: $BATCH_SIZE"
 print_info "Learning rate: $LEARNING_RATE"

[added lines not rendered in this view]
 # Step 15: Start training
 print_step "Step 15: Starting Training"
 echo "=============================="

 print_info "Starting training with configuration: $CONFIG_FILE"
 print_info "Experiment: $EXPERIMENT_NAME"
-print_info "Output: …
[replacement line not rendered in this view]
 print_info "Trackio: $TRACKIO_URL"

 # Ensure environment variables are available for training

@@ -852,6 +1151,7 @@ export HF_TOKEN="$HF_TOKEN"
 export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
 export HF_USERNAME="$HF_USERNAME"
 export TRACKIO_DATASET_REPO="$TRACKIO_DATASET_REPO"
[added line not rendered in this view]

 # Run the appropriate training script based on model type
 if [[ "$MODEL_NAME" == *"gpt-oss"* ]]; then

@@ -859,7 +1159,7 @@ if [[ "$MODEL_NAME" == *"gpt-oss"* ]]; then
     python scripts/training/train_gpt_oss.py \
         --config "$CONFIG_FILE" \
         --experiment-name "$EXPERIMENT_NAME" \
-        --output-dir …
[replacement line not rendered in this view]
         --trackio-url "$TRACKIO_URL" \
         --trainer-type "$TRAINER_TYPE_LOWER"
 else

@@ -867,7 +1167,7 @@ else
     python scripts/training/train.py \
         --config "$CONFIG_FILE" \
         --experiment-name "$EXPERIMENT_NAME" \
-        --output-dir …
[replacement line not rendered in this view]
         --trackio-url "$TRACKIO_URL" \
         --trainer-type "$TRAINER_TYPE_LOWER"
 fi

@@ -877,7 +1177,7 @@ print_step "Step 16: Pushing Model to HF Hub"
 echo "====================================="

 print_info "Pushing model to: $REPO_NAME"
-print_info "Checkpoint: …
[replacement line not rendered in this view]

 # Ensure environment variables are available for model push
 export HF_WRITE_TOKEN="$HF_WRITE_TOKEN"

@@ -886,26 +1186,43 @@ export HF_TOKEN="$HF_TOKEN"
 export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
 export HF_USERNAME="$HF_USERNAME"
 export TRACKIO_DATASET_REPO="$TRACKIO_DATASET_REPO"
[added line not rendered in this view]

 # Run the appropriate push script based on model type
 if [[ "$MODEL_NAME" == *"gpt-oss"* ]]; then
     print_info "Using GPT-OSS specialized push script..."
-    python scripts/model_tonic/push_gpt_oss_to_huggingface.py …
[replacement line not rendered in this view]
         --token "$HF_TOKEN" \
         --trackio-url "$TRACKIO_URL" \
         --experiment-name "$EXPERIMENT_NAME" \
         --dataset-repo "$TRACKIO_DATASET_REPO" \
         --author-name "$AUTHOR_NAME" \
-        --model-description "$MODEL_DESCRIPTION"
[added lines not rendered in this view]
 else
     print_info "Using standard SmolLM3 push script..."
-    python scripts/model_tonic/push_to_huggingface.py …
[replacement line not rendered in this view]
         --token "$HF_TOKEN" \
         --trackio-url "$TRACKIO_URL" \
         --experiment-name "$EXPERIMENT_NAME" \
         --dataset-repo "$TRACKIO_DATASET_REPO" \
         --author-name "$AUTHOR_NAME" \
-        --model-description "$MODEL_DESCRIPTION"
[added lines not rendered in this view]
 fi

 # Step 16.5: Switch Trackio Space to Read Token (Security)

@@ -1018,7 +1335,7 @@ fi)

 ## Files Created
 - Training configuration: \`$CONFIG_FILE\`
-- Model checkpoint: …
[replacement line not rendered in this view]
 - Training logs: \`training.log\`
 - Summary report: \`training_summary.md\`
 EOF
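The QUANTIZATION_CONFIG strings above land in the generated Python config as plain dicts. A hedged sketch of how a loader might translate them into transformers quantization objects (Mxfp4Config ships with recent transformers releases that support GPT-OSS; build_quant_config is an illustrative name, not the repo's actual loader):

    import torch
    from transformers import BitsAndBytesConfig, Mxfp4Config

    def build_quant_config(cfg: dict):
        """Map the generated quantization_config dict to a quantization object."""
        if cfg.get("load_in_4bit"):
            # "4-bit BNB" branch: NF4 double quantization with bf16 compute
            return BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_use_double_quant=True,
                bnb_4bit_compute_dtype=torch.bfloat16,
            )
        if cfg.get("dequantize"):
            # "MXFP4 (default)" branch: load the native MXFP4 checkpoint and
            # dequantize to higher precision so LoRA training can run on it
            return Mxfp4Config(dequantize=True)
        return None  # "None" branch: no quantization wrapper

    # e.g. build_quant_config({"dequantize": True, "load_in_4bit": False})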
         | 
| 614 | 
            +
                # ============================================================================
         | 
| 615 | 
            +
                # PERFORMANCE CONFIGURATION
         | 
| 616 | 
            +
                # ============================================================================
         | 
| 617 | 
            +
                dataloader_num_workers=$NUM_WORKERS,
         | 
| 618 | 
            +
                dataloader_pin_memory=True,
         | 
| 619 | 
            +
                group_by_length=True,
         | 
| 620 | 
            +
                
         | 
| 621 | 
            +
                # ============================================================================
         | 
| 622 | 
            +
                # LOGGING & EVALUATION
         | 
| 623 | 
            +
                # ============================================================================
         | 
| 624 | 
            +
                logging_steps=$LOGGING_STEPS,
         | 
| 625 | 
            +
                eval_steps=$EVAL_STEPS,
         | 
| 626 | 
            +
                save_steps=$SAVE_STEPS,
         | 
| 627 | 
            +
                
         | 
| 628 | 
            +
                # ============================================================================
         | 
| 629 | 
            +
                # RUNTIME CONFIGURATION
         | 
| 630 | 
            +
                # ============================================================================
         | 
| 631 | 
            +
                experiment_name="$EXPERIMENT_NAME",
         | 
| 632 | 
            +
                trackio_url="$TRACKIO_URL",
         | 
| 633 | 
            +
                dataset_repo="$TRACKIO_DATASET_REPO",
         | 
| 634 | 
            +
                enable_tracking=True,
         | 
| 635 | 
            +
            )
         | 
| 636 | 
            +
            EOF
         | 
| 637 | 
            +
                
         | 
| 638 | 
            +
                print_status "Enhanced GPT-OSS configuration generated successfully!"
         | 
| 639 | 
            +
                print_info "Configuration saved to: $CONFIG_FILE"
         | 
| 640 | 
            +
            }
         | 
| 641 | 
            +
             | 
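For reference, the heredoc above is unquoted, so every `$VAR` and `$(...)` expands at generation time and the resulting `$CONFIG_FILE` is plain Python. An abridged sketch of what it might contain after substitution, with purely illustrative values (dataset repo, experiment name, and all numbers are hypothetical, not project defaults):

    """
    GPT-OSS Enhanced Custom Training Configuration - Generated by launch.sh
    Dataset: legmlai/openhermes-fr (OpenHermes-FR)
    Optimized for: OpenHermes-FR format with full customization
    """

    from config.train_gpt_oss_custom import GPTOSSEnhancedCustomConfig

    # Illustrative substitutions only; launch.sh writes whatever the user chose.
    config = GPTOSSEnhancedCustomConfig(
        dataset_name="legmlai/openhermes-fr",
        dataset_split="train",
        dataset_format="openhermes_fr",
        input_field="prompt",
        target_field="accepted_completion",
        filter_bad_entries=True,
        max_samples=None,          # an empty MAX_SAMPLES renders as None
        num_train_epochs=1,
        batch_size=4,
        gradient_accumulation_steps=4,
        learning_rate=2e-4,
        max_seq_length=2048,
        bf16=True,
        fp16=False,
        lora_config={"r": 16, "lora_alpha": 32, "lora_dropout": 0.05,
                     "target_modules": "all-linear", "bias": "none",
                     "task_type": "CAUSAL_LM"},
        use_quantization=True,
        quantization_config={"dequantize": False, "load_in_4bit": True,
                             "bnb_4bit_compute_dtype": "bfloat16",
                             "bnb_4bit_use_double_quant": True,
                             "bnb_4bit_quant_type": "nf4"},
        experiment_name="gpt-oss-openhermes-fr-custom",
        enable_tracking=True,
    )

The quantization dict mirrors the bitsandbytes-style fields selected in the case statement above, so the "MXFP4/None" choice flows straight into the generated config.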
 # Function to create training configuration file
 create_training_config() {
     local config_file="$1"
@@ … @@
 echo "=================================="
 
 show_training_configs
+select_option "Select training configuration:" "Basic Training" "H100 Lightweight (Rapid)" "A100 Large Scale" "Multiple Passes" "GPT-OSS Basic Training" "GPT-OSS H100 Optimized" "GPT-OSS Multilingual Reasoning" "GPT-OSS Memory Optimized" "GPT-OSS OpenHermes-FR (Recommended)" "GPT-OSS OpenHermes-FR Memory Optimized" "GPT-OSS Custom Dataset" "Custom Configuration" TRAINING_CONFIG_TYPE
 
 get_training_config "$TRAINING_CONFIG_TYPE"
 
@@ … @@
 print_info "Batch size: $BATCH_SIZE"
 print_info "Learning rate: $LEARNING_RATE"
 
+# Step 14.5: Define Output Directory
+print_step "Step 14.5: Output Directory Configuration"
+echo "============================================="
+
+# Define the output directory for training results
+OUTPUT_DIR="./outputs/${EXPERIMENT_NAME}_$(date +%Y%m%d_%H%M%S)"
+print_info "Training output directory: $OUTPUT_DIR"
+
+# Create output directory
+mkdir -p "$OUTPUT_DIR"
+print_status "Output directory created: $OUTPUT_DIR"
+
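The same dynamic results directory expressed in Python, for anyone replicating the layout outside launch.sh (a sketch, not part of this commit; the experiment name is hypothetical):

    from datetime import datetime
    from pathlib import Path

    experiment_name = "gpt-oss-custom"  # stand-in for $EXPERIMENT_NAME
    # Mirrors OUTPUT_DIR="./outputs/${EXPERIMENT_NAME}_$(date +%Y%m%d_%H%M%S)"
    output_dir = Path("outputs") / f"{experiment_name}_{datetime.now():%Y%m%d_%H%M%S}"
    output_dir.mkdir(parents=True, exist_ok=True)  # mirrors `mkdir -p "$OUTPUT_DIR"`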
 # Step 15: Start training
 print_step "Step 15: Starting Training"
 echo "=============================="
 
 print_info "Starting training with configuration: $CONFIG_FILE"
 print_info "Experiment: $EXPERIMENT_NAME"
+print_info "Output: $OUTPUT_DIR"
 print_info "Trackio: $TRACKIO_URL"
 
 # Ensure environment variables are available for training
@@ … @@
 export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
 export HF_USERNAME="$HF_USERNAME"
 export TRACKIO_DATASET_REPO="$TRACKIO_DATASET_REPO"
+export OUTPUT_DIR="$OUTPUT_DIR"
 
 # Run the appropriate training script based on model type
 if [[ "$MODEL_NAME" == *"gpt-oss"* ]]; then
@@ … @@
     python scripts/training/train_gpt_oss.py \
         --config "$CONFIG_FILE" \
         --experiment-name "$EXPERIMENT_NAME" \
+        --output-dir "$OUTPUT_DIR" \
         --trackio-url "$TRACKIO_URL" \
         --trainer-type "$TRAINER_TYPE_LOWER"
 else
@@ … @@
     python scripts/training/train.py \
         --config "$CONFIG_FILE" \
         --experiment-name "$EXPERIMENT_NAME" \
+        --output-dir "$OUTPUT_DIR" \
         --trackio-url "$TRACKIO_URL" \
         --trainer-type "$TRAINER_TYPE_LOWER"
 fi
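Both branches pass the same five flags, so the two training scripts presumably share a CLI along these lines. A hypothetical argparse sketch only, since the actual parser setup is not shown in this diff:

    import argparse

    # Assumed shape of the CLI that launch.sh invokes; names match the flags above,
    # but required/default settings are guesses.
    parser = argparse.ArgumentParser(description="Fine-tuning entry point")
    parser.add_argument("--config", required=True, help="Config file generated by launch.sh")
    parser.add_argument("--experiment-name", required=True)
    parser.add_argument("--output-dir", required=True, help="Timestamped results directory")
    parser.add_argument("--trackio-url", default=None)
    parser.add_argument("--trainer-type", default="sft", help="e.g. sft or dpo (assumed values)")
    args = parser.parse_args()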
@@ … @@
 echo "====================================="
 
 print_info "Pushing model to: $REPO_NAME"
+print_info "Checkpoint: $OUTPUT_DIR"
 
 # Ensure environment variables are available for model push
 export HF_WRITE_TOKEN="$HF_WRITE_TOKEN"
@@ … @@
 export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
 export HF_USERNAME="$HF_USERNAME"
 export TRACKIO_DATASET_REPO="$TRACKIO_DATASET_REPO"
+export OUTPUT_DIR="$OUTPUT_DIR"
 
 # Run the appropriate push script based on model type
 if [[ "$MODEL_NAME" == *"gpt-oss"* ]]; then
     print_info "Using GPT-OSS specialized push script..."
+    python scripts/model_tonic/push_gpt_oss_to_huggingface.py "$OUTPUT_DIR" "$REPO_NAME" \
         --token "$HF_TOKEN" \
         --trackio-url "$TRACKIO_URL" \
         --experiment-name "$EXPERIMENT_NAME" \
         --dataset-repo "$TRACKIO_DATASET_REPO" \
         --author-name "$AUTHOR_NAME" \
+        --model-description "$MODEL_DESCRIPTION" \
+        --training-config-type "$TRAINING_CONFIG_TYPE" \
+        --model-name "$MODEL_NAME" \
+        --dataset-name "$DATASET_NAME" \
+        --batch-size "$BATCH_SIZE" \
+        --learning-rate "$LEARNING_RATE" \
+        --max-epochs "$MAX_EPOCHS" \
+        --max-seq-length "$MAX_SEQ_LENGTH" \
+        --trainer-type "$TRAINER_TYPE"
 else
     print_info "Using standard SmolLM3 push script..."
+    python scripts/model_tonic/push_to_huggingface.py "$OUTPUT_DIR" "$REPO_NAME" \
         --token "$HF_TOKEN" \
         --trackio-url "$TRACKIO_URL" \
         --experiment-name "$EXPERIMENT_NAME" \
         --dataset-repo "$TRACKIO_DATASET_REPO" \
         --author-name "$AUTHOR_NAME" \
+        --model-description "$MODEL_DESCRIPTION" \
+        --training-config-type "$TRAINING_CONFIG_TYPE" \
+        --model-name "$MODEL_NAME" \
+        --dataset-name "$DATASET_NAME" \
+        --batch-size "$BATCH_SIZE" \
+        --learning-rate "$LEARNING_RATE" \
+        --max-epochs "$MAX_EPOCHS" \
+        --max-seq-length "$MAX_SEQ_LENGTH" \
+        --trainer-type "$TRAINER_TYPE"
 fi
 
 # Step 16.5: Switch Trackio Space to Read Token (Security)
@@ … @@
 
 ## Files Created
 - Training configuration: \`$CONFIG_FILE\`
+- Model checkpoint: \`$OUTPUT_DIR/\`
 - Training logs: \`training.log\`
 - Summary report: \`training_summary.md\`
 EOF
scripts/model_tonic/push_gpt_oss_to_huggingface.py
CHANGED

@@ -43,8 +43,59 @@ def merge_lora_weights(checkpoint_path, base_model_name, output_path):
 
     return model, tokenizer
 
-def create_gpt_oss_model_card(model_name, experiment_name, trackio_url, dataset_repo, author_name, model_description):
-    """Create a comprehensive model card for GPT-OSS models"""
+def create_gpt_oss_model_card(model_name, experiment_name, trackio_url, dataset_repo, author_name, model_description, training_config_type=None, dataset_name=None, batch_size=None, learning_rate=None, max_epochs=None, max_seq_length=None, trainer_type=None):
+    """Create a comprehensive model card for GPT-OSS models using generate_model_card.py"""
+
+    try:
+        # Import the model card generator
+        import sys
+        import os
+        sys.path.append(os.path.join(os.path.dirname(__file__)))
+        from generate_model_card import ModelCardGenerator, create_default_variables
+
+        # Create generator
+        generator = ModelCardGenerator()
+
+        # Create variables for the model card
+        variables = create_default_variables()
+
+        # Update with GPT-OSS specific values
+        variables.update({
+            "repo_name": model_name,
+            "model_name": model_name.split('/')[-1],
+            "experiment_name": experiment_name or "gpt_oss_finetune",
+            "dataset_repo": dataset_repo,
+            "author_name": author_name or "GPT-OSS Fine-tuner",
+            "model_description": model_description or "A fine-tuned version of OpenAI's GPT-OSS-20B model for multilingual reasoning tasks.",
+            "training_config_type": training_config_type or "GPT-OSS Configuration",
+            "base_model": "openai/gpt-oss-20b",
+            "dataset_name": dataset_name or "HuggingFaceH4/Multilingual-Thinking",
+            "trainer_type": trainer_type or "SFTTrainer",
+            "batch_size": str(batch_size) if batch_size else "4",
+            "learning_rate": str(learning_rate) if learning_rate else "2e-4",
+            "max_epochs": str(max_epochs) if max_epochs else "1",
+            "max_seq_length": str(max_seq_length) if max_seq_length else "2048",
+            "hardware_info": "GPU (H100/A100)",
+            "trackio_url": trackio_url or "N/A",
+            "training_loss": "N/A",
+            "validation_loss": "N/A",
+            "perplexity": "N/A",
+            "quantized_models": False
+        })
+
+        # Generate the model card
+        model_card_content = generator.generate_model_card(variables)
+
+        print("✅ Model card generated using generate_model_card.py")
+        return model_card_content
+
+    except Exception as e:
+        print(f"❌ Failed to generate model card with generator: {e}")
+        print("🔄 Falling back to original GPT-OSS model card")
+        return _create_original_gpt_oss_model_card(model_name, experiment_name, trackio_url, dataset_repo, author_name, model_description)
+
+def _create_original_gpt_oss_model_card(model_name, experiment_name, trackio_url, dataset_repo, author_name, model_description):
+    """Create the original GPT-OSS model card as fallback"""
 
     card_content = f"""---
 language:
@@ -196,7 +247,7 @@ This model is licensed under the MIT License.
 
     return card_content
 
-def push_gpt_oss_model(checkpoint_path, repo_name, hf_token, trackio_url, experiment_name, dataset_repo, author_name, model_description):
+def push_gpt_oss_model(checkpoint_path, repo_name, hf_token, trackio_url, experiment_name, dataset_repo, author_name, model_description, training_config_type=None, model_name=None, dataset_name=None, batch_size=None, learning_rate=None, max_epochs=None, max_seq_length=None, trainer_type=None):
     """Push GPT-OSS model to Hugging Face Hub"""
 
     print("=== GPT-OSS Model Push Pipeline ===")
@@ -230,7 +281,14 @@ def push_gpt_oss_model(checkpoint_path, repo_name, hf_token, trackio_url, experi
             trackio_url=trackio_url,
             dataset_repo=dataset_repo,
             author_name=author_name,
-            model_description=model_description
+            model_description=model_description,
+            training_config_type=training_config_type,
+            dataset_name=dataset_name,
+            batch_size=batch_size,
+            learning_rate=learning_rate,
+            max_epochs=max_epochs,
+            max_seq_length=max_seq_length,
+            trainer_type=trainer_type
         )
 
         # Save model card
@@ -291,6 +349,14 @@ def main():
     parser.add_argument("--dataset-repo", help="Dataset repository")
     parser.add_argument("--author-name", help="Author name")
     parser.add_argument("--model-description", help="Model description")
+    parser.add_argument("--training-config-type", help="Training configuration type")
+    parser.add_argument("--model-name", help="Base model name")
+    parser.add_argument("--dataset-name", help="Dataset name")
+    parser.add_argument("--batch-size", help="Batch size")
+    parser.add_argument("--learning-rate", help="Learning rate")
+    parser.add_argument("--max-epochs", help="Maximum epochs")
+    parser.add_argument("--max-seq-length", help="Maximum sequence length")
+    parser.add_argument("--trainer-type", help="Trainer type")
 
     args = parser.parse_args()
 
@@ -308,7 +374,15 @@ def main():
         experiment_name=experiment_name,
         dataset_repo=dataset_repo,
         author_name=author_name,
-        model_description=model_description
+        model_description=model_description,
+        training_config_type=args.training_config_type,
+        model_name=args.model_name,
+        dataset_name=args.dataset_name,
+        batch_size=args.batch_size,
+        learning_rate=args.learning_rate,
+        max_epochs=args.max_epochs,
+        max_seq_length=args.max_seq_length,
+        trainer_type=args.trainer_type
     )
 
     sys.exit(0 if success else 1)
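Called directly, the widened signature looks like the sketch below (every value is illustrative; if generate_model_card.py cannot be imported, the function falls back to _create_original_gpt_oss_model_card as shown above):

    card = create_gpt_oss_model_card(
        model_name="your-username/gpt-oss-20b-openhermes-fr",  # hypothetical repo
        experiment_name="gpt_oss_finetune",
        trackio_url="https://your-username-trackio.hf.space",  # hypothetical Space
        dataset_repo="your-username/trackio-experiments",
        author_name="Your Name",
        model_description="GPT-OSS-20B fine-tuned on OpenHermes-FR.",
        training_config_type="GPT-OSS OpenHermes-FR (Recommended)",
        dataset_name="legmlai/openhermes-fr",
        batch_size="4",
        learning_rate="2e-4",
        max_epochs="1",
        max_seq_length="2048",
        trainer_type="SFTTrainer",
    )
    with open("MODEL_CARD.md", "w", encoding="utf-8") as f:
        f.write(card)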
    	
scripts/model_tonic/push_to_huggingface.py
CHANGED

@@ -62,7 +62,15 @@ class HuggingFacePusher:
         dataset_repo: Optional[str] = None,
         hf_token: Optional[str] = None,
         author_name: Optional[str] = None,
-        model_description: Optional[str] = None
+        model_description: Optional[str] = None,
+        training_config_type: Optional[str] = None,
+        model_name: Optional[str] = None,
+        dataset_name: Optional[str] = None,
+        batch_size: Optional[str] = None,
+        learning_rate: Optional[str] = None,
+        max_epochs: Optional[str] = None,
+        max_seq_length: Optional[str] = None,
+        trainer_type: Optional[str] = None
     ):
         self.model_path = Path(model_path)
         self.repo_name = repo_name
@@ -73,6 +81,16 @@ class HuggingFacePusher:
         self.author_name = author_name
         self.model_description = model_description
 
+        # Training configuration details for model card generation
+        self.training_config_type = training_config_type
+        self.model_name = model_name
+        self.dataset_name = dataset_name
+        self.batch_size = batch_size
+        self.learning_rate = learning_rate
+        self.max_epochs = max_epochs
+        self.max_seq_length = max_seq_length
+        self.trainer_type = trainer_type
+
         # HF Datasets configuration
         self.dataset_repo = dataset_repo or os.getenv('TRACKIO_DATASET_REPO', 'tonic/trackio-experiments')
         self.hf_token = hf_token or os.getenv('HF_TOKEN')
@@ -156,9 +174,53 @@ class HuggingFacePusher:
         return True
 
     def create_model_card(self, training_config: Dict[str, Any], results: Dict[str, Any]) -> str:
-        """Create a comprehensive model card using the 
-
-
+        """Create a comprehensive model card using the generate_model_card.py script"""
+        try:
+            # Import the model card generator
+            import sys
+            sys.path.append(os.path.join(os.path.dirname(__file__)))
+            from generate_model_card import ModelCardGenerator, create_default_variables
+
+            # Create generator
+            generator = ModelCardGenerator()
+
+            # Create variables for the model card
+            variables = create_default_variables()
+
+            # Update with actual values
+            variables.update({
+                "repo_name": self.repo_name,
+                "model_name": self.repo_name.split('/')[-1],
+                "experiment_name": self.experiment_name or "model_push",
+                "dataset_repo": self.dataset_repo,
+                "author_name": self.author_name or "Model Author",
+                "model_description": self.model_description or "A fine-tuned version of SmolLM3-3B for improved text generation capabilities.",
+                "training_config_type": self.training_config_type or "Custom Configuration",
+                "base_model": self.model_name or "HuggingFaceTB/SmolLM3-3B",
+                "dataset_name": self.dataset_name or "Custom Dataset",
+                "trainer_type": self.trainer_type or "SFTTrainer",
+                "batch_size": str(self.batch_size) if self.batch_size else "8",
+                "learning_rate": str(self.learning_rate) if self.learning_rate else "5e-6",
+                "max_epochs": str(self.max_epochs) if self.max_epochs else "3",
+                "max_seq_length": str(self.max_seq_length) if self.max_seq_length else "2048",
+                "hardware_info": self._get_hardware_info(),
+                "trackio_url": self.trackio_url or "N/A",
+                "training_loss": str(results.get('train_loss', 'N/A')),
+                "validation_loss": str(results.get('eval_loss', 'N/A')),
+                "perplexity": str(results.get('perplexity', 'N/A')),
+                "quantized_models": False  # Set to True if quantized models are available
+            })
+
+            # Generate the model card
+            model_card_content = generator.generate_model_card(variables)
+
+            logger.info("✅ Model card generated using generate_model_card.py")
+            return model_card_content
+
+        except Exception as e:
+            logger.error(f"❌ Failed to generate model card with generator: {e}")
+            logger.info("🔄 Falling back to simple model card")
+            return self._create_simple_model_card(training_config, results)
 
     def _create_simple_model_card(self, training_config: Dict[str, Any], results: Dict[str, Any]) -> str:
         """Create a simple model card without complex YAML to avoid formatting issues"""
@@ -531,6 +593,14 @@ def parse_args():
     parser.add_argument('--dataset-repo', type=str, default=None, help='HF Dataset repository for experiment storage')
     parser.add_argument('--author-name', type=str, default=None, help='Author name for model card')
     parser.add_argument('--model-description', type=str, default=None, help='Model description for model card')
+    parser.add_argument('--training-config-type', type=str, default=None, help='Training configuration type')
+    parser.add_argument('--model-name', type=str, default=None, help='Base model name')
+    parser.add_argument('--dataset-name', type=str, default=None, help='Dataset name')
+    parser.add_argument('--batch-size', type=str, default=None, help='Batch size')
+    parser.add_argument('--learning-rate', type=str, default=None, help='Learning rate')
+    parser.add_argument('--max-epochs', type=str, default=None, help='Maximum epochs')
+    parser.add_argument('--max-seq-length', type=str, default=None, help='Maximum sequence length')
+    parser.add_argument('--trainer-type', type=str, default=None, help='Trainer type')
 
     return parser.parse_args()
 
@@ -558,7 +628,15 @@ def main():
         dataset_repo=args.dataset_repo,
         hf_token=args.hf_token,
         author_name=args.author_name,
-        model_description=args.model_description
+        model_description=args.model_description,
+        training_config_type=args.training_config_type,
+        model_name=args.model_name,
+        dataset_name=args.dataset_name,
+        batch_size=args.batch_size,
+        learning_rate=args.learning_rate,
+        max_epochs=args.max_epochs,
+        max_seq_length=args.max_seq_length,
+        trainer_type=args.trainer_type
     )
 
     # Push model
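Programmatic use of the extended constructor might look like the sketch below. Values are illustrative, and parameters not visible in these hunks (for example a token or experiment-name argument) may also exist; only the keywords shown in the diff are assumed here:

    pusher = HuggingFacePusher(
        model_path="./outputs/smollm3_finetune_20250101_120000",  # hypothetical checkpoint dir
        repo_name="your-username/smollm3-finetuned",
        author_name="Your Name",
        model_description="SmolLM3-3B fine-tuned on a custom dataset.",
        training_config_type="Basic Training",
        model_name="HuggingFaceTB/SmolLM3-3B",
        dataset_name="Custom Dataset",
        batch_size="8",
        learning_rate="5e-6",
        max_epochs="3",
        max_seq_length="2048",
        trainer_type="SFTTrainer",
    )

Note that the hyperparameters are passed as strings straight from the shell; the model card generator stringifies them anyway, so no numeric parsing is needed.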
    	
scripts/training/train_gpt_oss.py
CHANGED

@@ -95,12 +95,215 @@ def setup_lora_for_gpt_oss(model, config):
 
     return peft_model
 
-def 
-    """Load 
 
-
-
-
 
     return dataset
 
@@ -127,25 +330,111 @@ def setup_trackio_tracking(config):
 
     return trackio_client
 
-def create_sft_config(config):
-    """Create SFTConfig for GPT-OSS training"""
-
-    print("Creating SFT configuration...")
 
     sft_config = SFTConfig(
-
-
-
-
-
-
-
-
-
-
-
-
-
     )
 
     return sft_config
@@ -193,13 +482,13 @@ def train_gpt_oss(config_path, experiment_name, output_dir, trackio_url, trainer
     peft_model = setup_lora_for_gpt_oss(model, config)
 
     # Load dataset
-    dataset = 
 
     # Setup Trackio tracking
     trackio_client = setup_trackio_tracking(config)
 
     # Create SFT configuration
-    sft_config = create_sft_config(config)
 
     # Create trainer
     print("Creating SFT trainer...")
 
     return peft_model
 
+def load_dataset_from_config(config):
+    """Load dataset based on configuration"""
+
+    dataset_name = getattr(config, 'dataset_name', 'HuggingFaceH4/Multilingual-Thinking')
+    dataset_split = getattr(config, 'dataset_split', 'train')
+    dataset_config = getattr(config, 'dataset_config', None)
+
+    print(f"Loading dataset: {dataset_name}")
+    print(f"Dataset split: {dataset_split}")
+    if dataset_config:
+        print(f"Dataset config: {dataset_config}")
+
+    # Load the dataset
+    if dataset_config:
+        dataset = load_dataset(dataset_name, dataset_config, split=dataset_split)
+    else:
+        dataset = load_dataset(dataset_name, split=dataset_split)
+
+    print(f"Original dataset size: {len(dataset)} examples")
+
+    # Apply filtering based on configuration
+    dataset = apply_dataset_filtering(dataset, config)
+
+    # Apply dataset processing based on format
+    dataset = process_dataset_format(dataset, config)
+
+    print(f"Final dataset size: {len(dataset)} examples")
+
+    return dataset
+
+def apply_dataset_filtering(dataset, config):
+    """Apply filtering based on configuration"""
+
+    # Filter bad entries if specified
+    if getattr(config, 'filter_bad_entries', False):
+        bad_entry_field = getattr(config, 'bad_entry_field', 'bad_entry')
+        bad_prompt_field = getattr(config, 'bad_prompt_field', 'bad_prompt_detected')
+        bad_response_field = getattr(config, 'bad_response_field', 'bad_response_detected')
+
+        original_size = len(dataset)
+
+        # Filter out bad entries
+        if bad_entry_field in dataset.column_names:
+            dataset = dataset.filter(lambda x: not x.get(bad_entry_field, False))
+            print(f"Filtered {original_size - len(dataset)} bad entries")
+
+        # Filter out bad prompts
+        if bad_prompt_field in dataset.column_names:
+            dataset = dataset.filter(lambda x: not x.get(bad_prompt_field, False))
+            print(f"Filtered bad prompts, remaining: {len(dataset)} examples")
+
+        # Filter out bad responses
+        if bad_response_field in dataset.column_names:
+            dataset = dataset.filter(lambda x: not x.get(bad_response_field, False))
+            print(f"Filtered bad responses, remaining: {len(dataset)} examples")
+
+    # Apply length filtering
+    min_length = getattr(config, 'min_length', 10)
+    max_length = getattr(config, 'max_length', None)
+
+    input_field = getattr(config, 'input_field', 'prompt')
+    target_field = getattr(config, 'target_field', 'accepted_completion')
+
+    if min_length > 0 or max_length:
+        def length_filter(example):
+            input_len = len(example.get(input_field, ''))
+            target_len = len(example.get(target_field, ''))
+            total_len = input_len + target_len
+
+            if total_len < min_length:
+                return False
+            if max_length and total_len > max_length:
+                return False
+            return True
+
+        original_size = len(dataset)
+        dataset = dataset.filter(length_filter)
+        print(f"Length filtering: {original_size} -> {len(dataset)} examples")
+
+    # Apply sampling if specified
+    max_samples = getattr(config, 'max_samples', None)
+    if max_samples and len(dataset) > max_samples:
+        dataset = dataset.shuffle(seed=42).select(range(max_samples))
+        print(f"Sampled {max_samples} examples from dataset")
+
+    return dataset
+
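Because every option is read with getattr, any object exposing the right attributes can stand in for the config. A quick local check of the filtering logic, with hypothetical rows and the OpenHermes-FR field defaults used above:

    from types import SimpleNamespace
    from datasets import Dataset

    cfg = SimpleNamespace(filter_bad_entries=True, min_length=10,
                          max_length=None, max_samples=None,
                          input_field="prompt", target_field="accepted_completion")
    ds = Dataset.from_list([
        {"prompt": "Bonjour, explique la photosynthèse.",
         "accepted_completion": "La photosynthèse est...", "bad_entry": False},
        {"prompt": "x", "accepted_completion": "", "bad_entry": False},   # fails min_length
        {"prompt": "Question valide", "accepted_completion": "Réponse", "bad_entry": True},
    ])
    filtered = apply_dataset_filtering(ds, cfg)
    assert len(filtered) == 1  # only the first row survives both filters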
+def format_gpt_oss_harmony(prompt, completion, add_eos_token=True):
+    """
+    Format data for GPT-OSS Harmony format following the exact template structure.
+    Based on: https://huggingface.co/openai/gpt-oss-20b/raw/main/chat_template.jinja
+    """
+    # GPT-OSS Harmony format structure (exact template compliance)
+    # User message: <|start|>user<|message|>content<|end|>
+    # Assistant message: <|start|>assistant<|channel|>final<|message|>content<|end|> (inference)
+    # Assistant message: <|start|>assistant<|channel|>final<|message|>content<|return|> (training)
+
+    harmony_text = f"<|start|>user<|message|>{prompt}<|end|><|start|>assistant<|channel|>final<|message|>{completion}"
+
+    if add_eos_token:
+        # Use <|return|> for training as per template specification
+        # This indicates the end of generation in training
+        harmony_text += "<|return|>"
+    else:
+        # Use <|end|> for inference
+        harmony_text += "<|end|>"
+
+    return harmony_text
+
| 207 | 
            +
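To make the token layout concrete, this is the string the function above produces for a toy prompt/completion pair (the French sample text is illustrative):

# --- usage sketch, not part of the diff ---
text = format_gpt_oss_harmony("Bonjour !", "Salut, comment puis-je aider ?", add_eos_token=True)
print(text)
# <|start|>user<|message|>Bonjour !<|end|><|start|>assistant<|channel|>final<|message|>Salut, comment puis-je aider ?<|return|>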
def process_dataset_format(dataset, config):
    """Process dataset based on format configuration with exact GPT-OSS Harmony compliance"""

    dataset_format = getattr(config, 'dataset_format', 'openhermes_fr')
    input_field = getattr(config, 'input_field', 'prompt')
    target_field = getattr(config, 'target_field', 'accepted_completion')
    concatenate_fields = getattr(config, 'concatenate_fields', True)
    field_separator = getattr(config, 'field_separator', '\n\n### Response:\n')
    add_eos_token = getattr(config, 'add_eos_token', True)
    use_harmony_format = getattr(config, 'use_harmony_format', True)

    print(f"Processing dataset format: {dataset_format}")
    print(f"Input field: {input_field}, Target field: {target_field}")
    print(f"GPT-OSS Harmony Format: {'Enabled' if use_harmony_format else 'Disabled'}")

    if dataset_format == "openhermes_fr":
        # Process OpenHermes-FR format: prompt + accepted_completion
        def format_openhermes_fr(example):
            prompt = example.get(input_field, '')
            completion = example.get(target_field, '')

            if concatenate_fields:
                if use_harmony_format:
                    # Use exact GPT-OSS Harmony format from template
                    text = format_gpt_oss_harmony(prompt, completion, add_eos_token)
                else:
                    # Fallback to standard format with separator
                    text = prompt + field_separator + completion
                    if add_eos_token:
                        text += "</s>"

                return {"text": text}
            else:
                # Keep separate for more advanced training setups
                return {
                    "input": prompt,
                    "output": completion
                }

        dataset = dataset.map(format_openhermes_fr, remove_columns=dataset.column_names)

    elif dataset_format == "messages":
        # Process messages format (like HuggingFaceH4/Multilingual-Thinking)
        def format_messages(example):
            messages = example.get(input_field, [])

            if use_harmony_format and len(messages) >= 2:
                # Extract user and assistant messages for harmony format
                user_message = ""
                assistant_message = ""

                for message in messages:
                    role = message.get("role", "")
                    content = message.get("content", "")

                    if role == "user":
                        user_message = content
                    elif role == "assistant":
                        assistant_message = content

                if user_message and assistant_message:
                    # Use GPT-OSS Harmony format
                    text = format_gpt_oss_harmony(user_message, assistant_message, add_eos_token)
                else:
                    # Fallback to simple concatenation
                    text = ""
                    for message in messages:
                        role = message.get("role", "")
                        content = message.get("content", "")
                        text += f"{role}: {content}\n"
                    if add_eos_token:
                        text += "</s>"
            else:
                # Standard format - convert messages to simple text
                text = ""
                for message in messages:
                    role = message.get("role", "")
                    content = message.get("content", "")
                    text += f"{role}: {content}\n"
                if add_eos_token:
                    text += "</s>"

            return {"text": text}

        dataset = dataset.map(format_messages, remove_columns=dataset.column_names)

    elif dataset_format == "text":
        # Process plain text format
        text_field = input_field
        def format_text(example):
            text = example.get(text_field, '')
            if add_eos_token:
                text += "</s>"
            return {"text": text}

        dataset = dataset.map(format_text, remove_columns=dataset.column_names)

    elif dataset_format == "custom":
        # Custom format - user handles this in their config
        print("Using custom dataset format - no automatic processing")

    return dataset
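End to end, the formatter can be exercised with an in-memory dataset and a bare-bones config object; here `SimpleNamespace` stands in for the project's real config class, and the sample record is invented for illustration:

# --- usage sketch, not part of the diff ---
from types import SimpleNamespace
from datasets import Dataset

cfg = SimpleNamespace(
    dataset_format="openhermes_fr",
    input_field="prompt",
    target_field="accepted_completion",
    concatenate_fields=True,
    add_eos_token=True,
    use_harmony_format=True,
)

raw = Dataset.from_list([
    {"prompt": "Qu'est-ce que LoRA ?", "accepted_completion": "Une méthode de fine-tuning efficace."}
])

processed = process_dataset_format(raw, cfg)
print(processed[0]["text"])  # single "text" column in harmony format

Because the function only reads attributes via getattr with defaults, any missing fields fall back to the OpenHermes-FR defaults shown above.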
…

    return trackio_client
def create_sft_config(config, output_dir):
    """Create enhanced SFTConfig for GPT-OSS training"""

    print("Creating enhanced SFT configuration...")

    # Extract training parameters from config with enhanced defaults
    num_train_epochs = getattr(config, 'num_train_epochs', 1.0)
    # TrainingArguments expects an int here; -1 (not None) means "no step limit"
    max_steps = getattr(config, 'max_steps', None) or -1
    warmup_ratio = getattr(config, 'warmup_ratio', 0.03)
    # warmup_steps must also be an int; 0 defers to warmup_ratio
    warmup_steps = getattr(config, 'warmup_steps', None) or 0

    # Learning rate configuration
    learning_rate = config.learning_rate
    lr_scheduler_type = getattr(config, 'scheduler', 'cosine_with_min_lr')
    lr_scheduler_kwargs = getattr(config, 'lr_scheduler_kwargs', {"min_lr_rate": 0.1})

    # Batch configuration
    per_device_train_batch_size = config.batch_size
    per_device_eval_batch_size = getattr(config, 'eval_batch_size', config.batch_size)
    gradient_accumulation_steps = config.gradient_accumulation_steps

    # Evaluation and logging
    eval_strategy = getattr(config, 'eval_strategy', 'steps')
    eval_steps = getattr(config, 'eval_steps', 100)
    logging_steps = getattr(config, 'logging_steps', 10)

    # Saving configuration
    save_strategy = getattr(config, 'save_strategy', 'steps')
    save_steps = getattr(config, 'save_steps', 500)
    save_total_limit = getattr(config, 'save_total_limit', 3)

    # Mixed precision
    fp16 = getattr(config, 'fp16', False)
    bf16 = getattr(config, 'bf16', True)

    # Regularization
    weight_decay = getattr(config, 'weight_decay', 0.01)
    max_grad_norm = getattr(config, 'max_grad_norm', 1.0)

    # HuggingFace Hub integration
    push_to_hub = getattr(config, 'push_to_hub', False)

    print(f"  • Epochs: {num_train_epochs}")
    print(f"  • Learning rate: {learning_rate}")
    print(f"  • Batch size: {per_device_train_batch_size}")
    print(f"  • Gradient accumulation: {gradient_accumulation_steps}")
    print(f"  • Effective batch size: {per_device_train_batch_size * gradient_accumulation_steps}")

    sft_config = SFTConfig(
        # Training duration
        num_train_epochs=num_train_epochs,
        max_steps=max_steps,

        # Learning rate
        learning_rate=learning_rate,
        lr_scheduler_type=lr_scheduler_type,
        lr_scheduler_kwargs=lr_scheduler_kwargs,
        warmup_ratio=warmup_ratio,
        warmup_steps=warmup_steps,

        # Batch configuration
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,

        # Model configuration
        max_seq_length=config.max_seq_length,
        gradient_checkpointing=getattr(config, 'use_gradient_checkpointing', True),

        # Mixed precision
        fp16=fp16,
        bf16=bf16,

        # Regularization
        weight_decay=weight_decay,
        max_grad_norm=max_grad_norm,

        # Evaluation (note: 'evaluation_strategy' was renamed to 'eval_strategy'
        # in newer transformers releases)
        evaluation_strategy=eval_strategy,
        eval_steps=eval_steps,

        # Logging
        logging_steps=logging_steps,

        # Saving
        save_strategy=save_strategy,
        save_steps=save_steps,
        save_total_limit=save_total_limit,

        # Output
        output_dir=output_dir,

        # Data loading
        dataloader_num_workers=getattr(config, 'dataloader_num_workers', 4),
        dataloader_pin_memory=getattr(config, 'dataloader_pin_memory', True),

        # Performance
        group_by_length=getattr(config, 'group_by_length', True),
        remove_unused_columns=getattr(config, 'remove_unused_columns', True),

        # HuggingFace Hub
        push_to_hub=push_to_hub,

        # Monitoring
        report_to="trackio" if getattr(config, 'enable_tracking', False) else None,
    )

    return sft_config
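Since create_sft_config only reads attributes, any object exposing the four required fields will do. A minimal sketch, assuming the TRL version pinned by this repo accepts the keyword names above; `SimpleNamespace` and the attribute values are illustrative, not the project's shipped config:

# --- usage sketch, not part of the diff ---
from types import SimpleNamespace

cfg = SimpleNamespace(
    learning_rate=2e-4,
    batch_size=4,
    gradient_accumulation_steps=4,
    max_seq_length=2048,
)

sft_config = create_sft_config(cfg, output_dir="./outputs/gpt-oss-test")
# Everything else falls back to the getattr defaults; effective batch size is 4 * 4 = 16
print(sft_config.learning_rate, sft_config.per_device_train_batch_size)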
…

    peft_model = setup_lora_for_gpt_oss(model, config)

    # Load dataset
    dataset = load_dataset_from_config(config)

    # Setup Trackio tracking
    trackio_client = setup_trackio_tracking(config)

    # Create SFT configuration
    sft_config = create_sft_config(config, output_dir)

    # Create trainer
    print("Creating SFT trainer...")
templates/spaces/demo_gpt/README.md CHANGED

@@ -6,7 +6,7 @@ colorTo: pink
 sdk: gradio
 sdk_version: 5.40.0
 app_file: app.py
-pinned:
+pinned: false
 short_description: GPT-OSS-20B Multilingual Reasoner LoRA adapter
 ---