import gradio as gr
from transformers import pipeline, AutoTokenizer
from typing import List, Dict, Any, Tuple
import torch

# CPU-friendly models (small, chat-ready)
MODELS = {
    "Qwen2.5-0.5B": "Qwen/Qwen2.5-0.5B-Instruct",
    "Qwen2.5-1.5B": "Qwen/Qwen2.5-1.5B-Instruct",
    "Phi-3-mini": "microsoft/Phi-3-mini-4k-instruct"
}


def load_model(model_key: str):
    """Lazily load a text-generation pipeline for the selected model."""
    model_id = MODELS[model_key]
    print(f"🚀 Loading {model_id}...")

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    pipe = pipeline(
        "text-generation",
        model=model_id,
        tokenizer=tokenizer,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        device_map="auto" if torch.cuda.is_available() else None,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id
    )
    print(f"✅ {model_id} loaded!")
    return pipe


# Global cache so each model is loaded only once
model_cache = {}


def respond(message: str, history: List[Dict[str, str]], model_key: str,
            system_prompt: str) -> Tuple[List[Dict[str, str]], str, Dict[str, Any]]:
    """Local chat turn using the cached pipeline."""
    try:
        if model_key not in model_cache:
            model_cache[model_key] = load_model(model_key)
        pipe = model_cache[model_key]

        print(f"🚀 Generating: {model_key}, Msg='{message[:30]}...'")

        # Chat format (system + history + user)
        messages = []
        if system_prompt.strip():
            messages.append({"role": "system", "content": system_prompt})
        messages.extend(history)
        messages.append({"role": "user", "content": message})

        # Apply the chat template (for instruct models)
        tokenizer = pipe.tokenizer
        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

        # Generate and strip the echoed prompt from the output
        outputs = pipe(prompt, max_new_tokens=512, do_sample=True, temperature=0.7)
        bot_reply = outputs[0]["generated_text"][len(prompt):].strip()

        print(f"✅ Reply: {bot_reply[:50]}...")

        new_history = history + [
            {"role": "user", "content": message},
            {"role": "assistant", "content": bot_reply}
        ]
        return new_history, "", gr.update(value="")

    except Exception as e:
        error_msg = f"❌ {model_key}: {str(e)}"
        print(f"💥 {error_msg}")
        new_history = history + [
            {"role": "user", "content": message},
            {"role": "assistant", "content": error_msg}
        ]
        return new_history, error_msg, gr.update(value="")


# UI
with gr.Blocks(title="🚀 Local HF Chat (even on a weak CPU!)", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        "# Local Inference (no API!)\n"
        "**Small models** take roughly 1-3 s per reply on CPU; larger ones take much longer. "
        "No rate limits or token costs. Provided as an example."
    )

    with gr.Row(variant="compact"):
        model_dropdown = gr.Dropdown(choices=list(MODELS.keys()), value="Qwen2.5-0.5B", label="🧠 Model")
        system_prompt = gr.Textbox(label="📝 System", placeholder="You are a cheerful AI.", lines=2)

    chatbot = gr.Chatbot(type="messages", height=500)

    with gr.Row():
        msg_input = gr.Textbox(placeholder="Hi! (Enter to send)", scale=7)
        send_btn = gr.Button("📤", variant="primary", scale=1)

    with gr.Row():
        clear_btn = gr.Button("🗑️ Clear")
        retry_btn = gr.Button("🔄 Retry")

    status = gr.Textbox(label="Logs", interactive=False, lines=4)

    # Events
    send_btn.click(fn=respond, inputs=[msg_input, chatbot, model_dropdown, system_prompt],
                   outputs=[chatbot, status, msg_input])
    msg_input.submit(fn=respond, inputs=[msg_input, chatbot, model_dropdown, system_prompt],
                     outputs=[chatbot, status, msg_input])

    def clear():
        return [], "", gr.update(value="")

    clear_btn.click(clear, outputs=[chatbot, status, msg_input])

    def retry(history):
        # Re-fill the input box with the last user message so it can be resent
        if len(history) >= 2 and history[-2]["role"] == "user":
            return history[-2]["content"]
        return ""

    retry_btn.click(retry, inputs=[chatbot], outputs=[msg_input])


if __name__ == "__main__":
    demo.queue(max_size=10).launch(debug=True)