import gradio as gr
from transformers import pipeline, AutoTokenizer
from typing import List, Dict, Any, Tuple
import torch

# CPU-friendly models (small, chat-ready)
MODELS = {
    "Qwen2.5-0.5B": "Qwen/Qwen2.5-0.5B-Instruct",
    "Qwen2.5-1.5B": "Qwen/Qwen2.5-1.5B-Instruct",
    "Phi-3-mini": "microsoft/Phi-3-mini-4k-instruct"
}


def load_model(model_key: str):
    """Lazily load a text-generation pipeline for the selected model."""
    model_id = MODELS[model_key]
    print(f"🚀 Loading {model_id}...")

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    pipe = pipeline(
        "text-generation",
        model=model_id,
        tokenizer=tokenizer,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        device_map="auto" if torch.cuda.is_available() else None,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id
    )
    print(f"✅ {model_id} loaded!")
    return pipe


# Global cache so each model is loaded only once
model_cache = {}


def respond(message: str, history: List[Dict[str, str]], model_key: str,
            system_prompt: str) -> Tuple[List[Dict[str, str]], str, Dict[str, Any]]:
    """Local chat turn using the cached pipeline."""
    try:
        if model_key not in model_cache:
            model_cache[model_key] = load_model(model_key)
        pipe = model_cache[model_key]

        print(f"🚀 Generating: {model_key}, Msg='{message[:30]}...'")

        # Chat format (system + history + user)
        messages = []
        if system_prompt.strip():
            messages.append({"role": "system", "content": system_prompt})
        messages.extend(history)
        messages.append({"role": "user", "content": message})

        # Apply the chat template (for instruct models)
        tokenizer = pipe.tokenizer
        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

        # Generate and strip the echoed prompt from the output
        outputs = pipe(prompt, max_new_tokens=512, do_sample=True, temperature=0.7)
        bot_reply = outputs[0]["generated_text"][len(prompt):].strip()

        print(f"✅ Reply: {bot_reply[:50]}...")

        new_history = history + [
            {"role": "user", "content": message},
            {"role": "assistant", "content": bot_reply}
        ]
        return new_history, "", gr.update(value="")

    except Exception as e:
        error_msg = f"❌ {model_key}: {str(e)}"
        print(f"💥 {error_msg}")
        new_history = history + [
            {"role": "user", "content": message},
            {"role": "assistant", "content": error_msg}
        ]
        return new_history, error_msg, gr.update(value="")


# UI
with gr.Blocks(title="🚀 Local HF Chat (even on a weak CPU!)", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        "# Local Inference (no API!)\n"
        "**Small models** take roughly 1-3 s per reply on CPU; larger ones take much longer. "
        "No rate limits or token costs. Provided as an example."
    )

    with gr.Row(variant="compact"):
        model_dropdown = gr.Dropdown(choices=list(MODELS.keys()), value="Qwen2.5-0.5B", label="🧠 Model")
        system_prompt = gr.Textbox(label="📝 System", placeholder="You are a cheerful AI.", lines=2)

    chatbot = gr.Chatbot(type="messages", height=500)

    with gr.Row():
        msg_input = gr.Textbox(placeholder="Hi! (Enter to send)", scale=7)
        send_btn = gr.Button("📤", variant="primary", scale=1)

    with gr.Row():
        clear_btn = gr.Button("🗑️ Clear")
        retry_btn = gr.Button("🔄 Retry")

    status = gr.Textbox(label="Logs", interactive=False, lines=4)

    # Events
    send_btn.click(fn=respond, inputs=[msg_input, chatbot, model_dropdown, system_prompt],
                   outputs=[chatbot, status, msg_input])
    msg_input.submit(fn=respond, inputs=[msg_input, chatbot, model_dropdown, system_prompt],
                     outputs=[chatbot, status, msg_input])

    def clear():
        return [], "", gr.update(value="")

    clear_btn.click(clear, outputs=[chatbot, status, msg_input])

    def retry(history):
        # Re-fill the input box with the last user message so it can be resent
        if len(history) >= 2 and history[-2]["role"] == "user":
            return history[-2]["content"]
        return ""

    retry_btn.click(retry, inputs=[chatbot], outputs=[msg_input])


if __name__ == "__main__":
    demo.queue(max_size=10).launch(debug=True)