import gradio as gr
from transformers import pipeline, AutoTokenizer
from typing import List, Dict, Any, Tuple
import torch
# CPU-friendly models (small, chat-ready)
MODELS = {
    "Qwen2.5-0.5B": "Qwen/Qwen2.5-0.5B-Instruct",
    "Qwen2.5-1.5B": "Qwen/Qwen2.5-1.5B-Instruct",
    "Phi-3-mini": "microsoft/Phi-3-mini-4k-instruct"
}
def load_model(model_key: str):
    """Lazily load a text-generation pipeline for the selected model."""
    model_id = MODELS[model_key]
    print(f"🚀 Loading {model_id}...")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    pipe = pipeline(
        "text-generation",
        model=model_id,
        tokenizer=tokenizer,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        device_map="auto" if torch.cuda.is_available() else None,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id
    )
    print(f"✅ {model_id} loaded!")
    return pipe
# Global cache of loaded pipelines
model_cache = {}
def respond(message: str,
            history: List[Dict[str, str]],
            model_key: str,
            system_prompt: str) -> Tuple[List[Dict[str, str]], str, Dict[str, Any]]:
    """Run one local chat turn with the cached pipeline."""
    try:
        if model_key not in model_cache:
            model_cache[model_key] = load_model(model_key)
        pipe = model_cache[model_key]
        print(f"🚀 Generating: {model_key}, Msg='{message[:30]}...'")
        # Chat format (system + history + user)
        messages = []
        if system_prompt.strip():
            messages.append({"role": "system", "content": system_prompt})
        messages.extend(history)
        messages.append({"role": "user", "content": message})
        # Apply the chat template (for instruct models)
        tokenizer = pipe.tokenizer
        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        # Generate, then strip the prompt from the returned text
        outputs = pipe(prompt, max_new_tokens=512, do_sample=True, temperature=0.7)
        bot_reply = outputs[0]["generated_text"][len(prompt):].strip()
        print(f"✅ Reply: {bot_reply[:50]}...")
        new_history = history + [
            {"role": "user", "content": message},
            {"role": "assistant", "content": bot_reply}
        ]
        return new_history, "", gr.update(value="")
    except Exception as e:
        error_msg = f"❌ {model_key}: {str(e)}"
        print(f"💥 {error_msg}")
        new_history = history + [
            {"role": "user", "content": message},
            {"role": "assistant", "content": error_msg}
        ]
        return new_history, error_msg, gr.update(value="")
# UI
with gr.Blocks(title="🚀 Local HF Chat (runs on a modest CPU!)", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Local Inference (no API!)\n**Small models** answer in 1-3 s on CPU; larger ones take far longer. No rate limits or API tokens. Provided as an example.")
    with gr.Row(variant="compact"):
        model_dropdown = gr.Dropdown(choices=list(MODELS.keys()), value="Qwen2.5-0.5B", label="🧠 Model")
        system_prompt = gr.Textbox(label="📝 System", placeholder="You are a cheerful AI.", lines=2)
    chatbot = gr.Chatbot(type="messages", height=500)
    with gr.Row():
        msg_input = gr.Textbox(placeholder="Hello! (press Enter)", scale=7)
        send_btn = gr.Button("📤", variant="primary", scale=1)
    with gr.Row():
        clear_btn = gr.Button("🗑️ Clear")
        retry_btn = gr.Button("🔄 Retry")
    status = gr.Textbox(label="Logs", interactive=False, lines=4)
    # Events
    send_btn.click(fn=respond, inputs=[msg_input, chatbot, model_dropdown, system_prompt], outputs=[chatbot, status, msg_input])
    msg_input.submit(fn=respond, inputs=[msg_input, chatbot, model_dropdown, system_prompt], outputs=[chatbot, status, msg_input])

    def clear():
        """Reset the chat history, status log, and input box."""
        return [], "", gr.update(value="")

    clear_btn.click(clear, outputs=[chatbot, status, msg_input])

    def retry(history):
        """Put the last user message back into the input box for resending."""
        if len(history) >= 2 and history[-2]["role"] == "user":
            return history[-2]["content"]
        return ""

    retry_btn.click(retry, inputs=[chatbot], outputs=[msg_input])
if __name__ == "__main__":
    demo.queue(max_size=10).launch(debug=True)