| """ | |
| GPT-OSS OpenHermes-FR Optimized Configuration | |
| Specifically optimized for the legmlai/openhermes-fr dataset | |
| 800K French instruction-response pairs with quality filtering | |
| """ | |
| from config.train_gpt_oss_custom import GPTOSSEnhancedCustomConfig | |
| # OpenHermes-FR optimized configuration | |
| config = GPTOSSEnhancedCustomConfig( | |
| # ============================================================================ | |
| # DATASET CONFIGURATION - OpenHermes-FR Specific | |
| # ============================================================================ | |
| dataset_name="legmlai/openhermes-fr", | |
| dataset_split="train", | |
| dataset_format="openhermes_fr", | |
| # OpenHermes-FR field mapping | |
| input_field="prompt", # French prompts | |
| target_field="accepted_completion", # GPT-4o generated completions | |
| # Quality filtering using OpenHermes-FR metadata | |
| filter_bad_entries=True, # Use built-in quality flags | |
| bad_entry_field="bad_entry", | |
| bad_prompt_field="bad_prompt_detected", | |
| bad_response_field="bad_response_detected", | |
| # Data processing optimized for French with GPT-OSS Harmony Format | |
| concatenate_fields=True, | |
| field_separator="\n\n### RΓ©ponse:\n", # Fallback separator (harmony format takes precedence) | |
| add_eos_token=True, | |
| use_harmony_format=True, # Enable GPT-OSS harmony format | |
| # Dataset sampling (use all 800K examples by default) | |
| max_samples=None, # Use full dataset | |
| min_length=20, # Minimum for meaningful French text | |
| max_length=None, # Auto-set to max_seq_length | |
| # ============================================================================ | |
| # TRAINING HYPERPARAMETERS - French Language Optimized | |
| # ============================================================================ | |
| num_train_epochs=1.5, # 1.5 epochs optimal for large dataset | |
| batch_size=6, # Balanced for most GPUs | |
| gradient_accumulation_steps=6, # Effective batch size: 36 | |
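    # Rough step-count arithmetic (before quality filtering, which removes
    # some rows): 800,000 examples / 36 effective batch size gives roughly
    # 22,200 optimizer steps per epoch, so about 33,300 steps over 1.5 epochs.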
    # Learning-rate schedule optimized for French fine-tuning
    learning_rate=2.5e-4,  # Slightly higher for multilingual adaptation
    min_lr=2.5e-5,  # 10% of the peak learning rate
    warmup_ratio=0.05,  # 5% warmup for stability
    weight_decay=0.01,  # Standard L2 regularization
    max_grad_norm=1.0,  # Gradient clipping
    # ============================================================================
    # MODEL CONFIGURATION - Optimized for French
    # ============================================================================
    model_name="openai/gpt-oss-20b",
    max_seq_length=3072,  # Balanced sequence length for French
    use_flash_attention=True,
    use_gradient_checkpointing=True,
    # Mixed precision for efficiency
    fp16=False,
    bf16=True,  # Better suited to GPT-OSS
    # ============================================================================
    # LORA CONFIGURATION - Optimized for French Language Learning
    # ============================================================================
    use_lora=True,
    lora_config={
        "r": 24,  # Higher rank for language adaptation
        "lora_alpha": 48,  # 2x rank scaling
        "lora_dropout": 0.05,  # Light regularization
        "target_modules": "all-linear",
        "target_parameters": [
            "7.mlp.experts.gate_up_proj",
            "7.mlp.experts.down_proj",
            "15.mlp.experts.gate_up_proj",
            "15.mlp.experts.down_proj",
            "23.mlp.experts.gate_up_proj",
            "23.mlp.experts.down_proj",
        ],
        "bias": "none",
        "task_type": "CAUSAL_LM",
    },
    # ============================================================================
    # QUANTIZATION - Balanced Performance/Memory
    # ============================================================================
    use_quantization=True,
    quantization_config={
        "dequantize": True,  # Dequantize MXFP4 weights, as in the GPT-OSS tutorial
        "load_in_4bit": False,  # Standard precision for quality
    },
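    # (For reference: in recent `transformers` releases this corresponds to
    # `Mxfp4Config(dequantize=True)`, which unpacks the MXFP4 MoE weights
    # into higher precision for training; that the pipeline follows the
    # official GPT-OSS fine-tuning recipe here is an assumption.)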
    # ============================================================================
    # PERFORMANCE OPTIMIZATION
    # ============================================================================
    # Data loading optimized for a large dataset
    dataloader_num_workers=6,  # More workers for the large dataset
    dataloader_pin_memory=True,
    dataloader_prefetch_factor=3,  # Higher prefetch for efficiency
    # Memory management
    low_cpu_mem_usage=True,
    group_by_length=True,  # Efficient batching of similar-length sequences
    remove_unused_columns=True,
    # ============================================================================
    # EVALUATION & LOGGING
    # ============================================================================
    eval_strategy="steps",
    eval_steps=200,  # Evaluate every 200 steps
    logging_steps=20,  # Log every 20 steps
    save_strategy="steps",
    save_steps=500,  # Save every 500 steps
    save_total_limit=3,  # Retain at most 3 checkpoints
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    load_best_model_at_end=True,
    # Split ratios for automatic validation/test creation
    eval_ratio=0.01,
    test_ratio=0.01,
    # ============================================================================
    # MULTILINGUAL & FRENCH-SPECIFIC SETTINGS
    # ============================================================================
    primary_language="fr",  # French as the primary language
    reasoning_languages=["French", "English"],  # Bilingual reasoning
    domain_focus="instruction",  # Instruction following
    # ============================================================================
    # GENERATION CONFIG FOR EVALUATION - GPT-OSS Harmony Format
    # ============================================================================
    generation_config={
        "max_new_tokens": 512,
        "do_sample": True,
        "temperature": 0.7,
        "top_p": 0.9,
        "top_k": 50,
        "repetition_penalty": 1.1,
        "pad_token_id": None,
        "eos_token_id": None,
        # GPT-OSS harmony-format specific settings
        "reasoning_effort": "medium",  # Configurable reasoning level
        "use_harmony_format": True,  # Ensure harmony format in generation
    },
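    # Note: `reasoning_effort` and `use_harmony_format` are not standard
    # `transformers` generation kwargs; presumably the evaluation code pops
    # them from this dict and applies them via the harmony chat template.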
    # ============================================================================
    # HF HUB INTEGRATION
    # ============================================================================
    push_to_hub=False,  # Set to True to auto-push
    hub_model_id=None,  # Will be set by the launch script
    hub_private_repo=False,
    # ============================================================================
    # MONITORING
    # ============================================================================
    enable_tracking=True,  # Trackio monitoring
    log_artifacts=True,
    log_metrics=True,
    log_config=True,
)
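
# Illustration only (not invoked by the training pipeline): the filtering
# requested by `filter_bad_entries=True` can be reproduced directly with the
# `datasets` library. The column names come from the config above; that they
# match the dataset's actual schema is an assumption of this sketch.
def load_filtered_openhermes_fr():
    from datasets import load_dataset

    ds = load_dataset("legmlai/openhermes-fr", split="train")
    # Keep only rows that no quality flag marked as bad.
    return ds.filter(
        lambda ex: not (
            ex["bad_entry"]
            or ex["bad_prompt_detected"]
            or ex["bad_response_detected"]
        )
    )
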
# Print a configuration summary on import
print("\n🇫🇷 OpenHermes-FR Configuration Loaded")
print("=" * 50)
print(f"Dataset: {config.dataset_name}")
print(f"Language: French (with {config.dataset_format} format)")
print(f"Training: {config.num_train_epochs} epochs")
print(f"Effective Batch Size: {config.batch_size * config.gradient_accumulation_steps}")
print(f"LoRA Rank: {config.lora_config['r']}")
print(f"Sequence Length: {config.max_seq_length}")
print(f"Quality Filtering: {'Enabled' if config.filter_bad_entries else 'Disabled'}")
print(f"GPT-OSS Harmony Format: {'Enabled' if config.use_harmony_format else 'Disabled'}")
print("=" * 50)