""" Inference Script für trainiertes MoE Modell Lädt automatisch den neuesten Checkpoint und testet verschiedene Sampling Strategien """ import os import sys import torch from transformers import AutoTokenizer from moe_config import MoEGPTConfig from moe_model import MoEGPTForCausalLM # Force UTF-8 encoding for Windows console if sys.platform == 'win32': sys.stdout.reconfigure(encoding='utf-8') def find_latest_checkpoint(checkpoint_dir="./moe_checkpoints_v8_clean"): """ Findet den neuesten Checkpoint automatisch (v8 OPUS Edition!) Returns: str: Pfad zum neuesten Checkpoint oder None """ if not os.path.exists(checkpoint_dir): return None checkpoints = [ os.path.join(checkpoint_dir, d) for d in os.listdir(checkpoint_dir) if d.startswith("checkpoint-") ] if not checkpoints: return None # Neuesten Checkpoint finden (nach creation time) latest = max(checkpoints, key=os.path.getctime) # Step Number extrahieren step = latest.split("checkpoint-")[-1] print(f"\n🔍 Neuester Checkpoint gefunden: Step {step}") return latest def load_model(model_path=None, device="cuda"): """ Lädt trainiertes MoE Modell Wenn model_path=None, wird automatisch der neueste Checkpoint geladen Args: model_path: Pfad zum gespeicherten Modell (None = auto-find) device: Device für Inference (cuda/cpu) Returns: model: Geladenes Modell config: Model Config """ # Auto-find neuesten Checkpoint if model_path is None: model_path = find_latest_checkpoint() if model_path is None: # Fallback: Versuche finales Modell (v8) model_path = "./moe_final_v8_clean" if not os.path.exists(model_path): raise ValueError("Kein Checkpoint gefunden! Trainiere zuerst ein Modell.") print(f"\n📥 Lade Modell von: {model_path}") config = MoEGPTConfig.from_pretrained(model_path) model = MoEGPTForCausalLM.from_pretrained(model_path) # Auf Device verschieben if device == "cuda" and torch.cuda.is_available(): model = model.cuda() print(f"✅ Modell geladen auf GPU") else: model = model.cpu() print(f"✅ Modell geladen auf CPU") model.eval() total_params = sum(p.numel() for p in model.parameters()) print(f" 📊 Parameter: {total_params:,} ({total_params/1e6:.1f}M)") print(f" 🧠 Experten: {config.total_experts}") print(f" ⚡ Aktive Params: {config.active_parameters_ratio:.1%}") return model, config def generate_text( model, tokenizer, prompt, max_new_tokens=400, temperature=0.8, top_k=50, top_p=0.95, repetition_penalty=1.0, device="cuda", ): """ Generiert Text mit dem MoE Modell Args: model: MoE Modell tokenizer: Tokenizer prompt: Input Prompt (String) max_new_tokens: Maximale neue Tokens (400!) 
temperature: Sampling Temperature top_k: Top-k Sampling top_p: Nucleus Sampling repetition_penalty: Penalty für Wiederholungen device: Device Returns: generated_text: Generierter Text """ # Tokenize prompt input_ids = tokenizer.encode(prompt, return_tensors="pt") if device == "cuda": input_ids = input_ids.cuda() # Generieren with torch.no_grad(): output_ids = model.generate( input_ids, max_new_tokens=max_new_tokens, temperature=temperature, top_k=top_k, top_p=top_p, repetition_penalty=repetition_penalty, do_sample=True, pad_token_id=tokenizer.eos_token_id, ) # Decode generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True) return generated_text def test_sampling_strategies(model, tokenizer, prompts, device="cuda"): """ Testet verschiedene Sampling Strategien Args: model: MoE Modell tokenizer: Tokenizer prompts: Liste von Test-Prompts device: Device """ # Optimale Strategien (basierend auf umfangreichen Tests) strategies = { "Standard (temp=0.7, rep=1.2, top_k=50, top_p=0.8)": { "temperature": 0.7, "top_k": 50, "top_p": 0.7, "repetition_penalty": 1.2, }, "Focused (temp=0.7, rep=1.4, #top_k=30, top_p=0.7)": { "temperature": 0.7, "top_k": 20, "top_p": 0.7, "repetition_penalty": 1.4, }, } print("\n" + "=" * 80) print("🧪 TESTING SAMPLING STRATEGIES") print("=" * 80) for prompt in prompts: print(f"\n{'='*80}") print(f"PROMPT: '{prompt}'") print(f"{'='*80}\n") for strategy_name, params in strategies.items(): print(f"\n🎯 Strategy: {strategy_name}") print("-" * 80) try: generated = generate_text( model=model, tokenizer=tokenizer, prompt=prompt, max_new_tokens=400, # 400 Tokens! device=device, **params ) print(f"{generated}") print() except Exception as e: print(f"❌ Error: {str(e)}\n") print("\n" + "=" * 80) print("💡 EMPFEHLUNG") print("=" * 80) print(""" """) def main(): # Device device = "cuda" if torch.cuda.is_available() else "cpu" print(f"\n🖥️ Device: {device}") # Modell laden (automatisch neuester Checkpoint!) model, config = load_model(model_path=None, device=device) # Tokenizer laden print("\n📚 Lade Tokenizer...") tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B") tokenizer.pad_token = tokenizer.eos_token print("✅ Llama 3.2 Tokenizer geladen") print(f" - Vocab Size: {tokenizer.vocab_size:,}") print(f" - EOS Token: {tokenizer.eos_token}") # ==================== SAMPLING STRATEGY TESTS ==================== # Test Prompts (diverse!) test_prompts = [ "Gestern bin ich ", # Narrativ "Der Mond ", # Poetisch "Im Labor ", # Wissenschaftlich "Hast du auch das Gefühl, dass", # Persönlich/Forum "Die Zeit", "Was ist die Definition von Philosophie?" ] # Teste verschiedene Sampling Strategien test_sampling_strategies(model, tokenizer, test_prompts, device) if __name__ == "__main__": main()
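
# Usage sketch for reusing the helpers from another module. This assumes the
# file is saved as inference.py; adjust the module name to your layout. The
# parameter values mirror the "Standard" strategy defined above.
#
#   from inference import load_model, generate_text
#   from transformers import AutoTokenizer
#
#   model, config = load_model()  # auto-loads the latest checkpoint
#   tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
#   text = generate_text(
#       model, tokenizer, "Der Mond ",
#       temperature=0.7, top_k=50, top_p=0.7, repetition_penalty=1.2,
#       device="cuda",
#   )
#   print(text)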