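"""Local CPU chat demo: a Gradio Blocks UI on top of a Hugging Face
transformers text-generation pipeline with lazily loaded, cached
instruct models."""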
import gradio as gr
from transformers import pipeline, AutoTokenizer
from typing import List, Dict, Any, Tuple
import torch
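
# Assumed runtime dependencies: gradio (recent enough for the `type="messages"`
# Chatbot), transformers, and torch. A CPU-only torch build is sufficient;
# CUDA is picked up automatically when available.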

# CPU-friendly models (small, chat-ready)
MODELS = {
    "Qwen2.5-0.5B": "Qwen/Qwen2.5-0.5B-Instruct",
    "Qwen2.5-1.5B": "Qwen/Qwen2.5-1.5B-Instruct",
    "Phi-3-mini": "microsoft/Phi-3-mini-4k-instruct"
}

def load_model(model_key: str):
    """Lazy load pipeline."""
    model_id = MODELS[model_key]
    print(f"🚀 Loading {model_id}...")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    # Some models ship without a pad token; fall back to EOS so generation does not warn or fail
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    
    pipe = pipeline(
        "text-generation",
        model=model_id,
        tokenizer=tokenizer,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        device_map="auto" if torch.cuda.is_available() else None,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id
    )
    print(f"✅ {model_id} loaded!")
    return pipe

# Global cache: keep each loaded pipeline in memory so a model is only loaded once
model_cache = {}

def respond(message: str, 
            history: List[Dict[str, str]], 
            model_key: str, 
            system_prompt: str) -> Tuple[List[Dict[str, str]], str, Dict[str, Any]]:
    """Локальный чат с pipeline."""
    try:
        if model_key not in model_cache:
            model_cache[model_key] = load_model(model_key)
        pipe = model_cache[model_key]
        
        print(f"🚀 Generating: {model_key}, Msg='{message[:30]}...'")
        
        # Chat format (system + history + user)
        messages = []
        if system_prompt.strip():
            messages.append({"role": "system", "content": system_prompt})
        messages.extend(history)
        messages.append({"role": "user", "content": message})
        
        # Apply the chat template (needed for instruct models)
        tokenizer = pipe.tokenizer
        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        
        # Generate; the pipeline returns the prompt plus the completion,
        # so slice off the prompt prefix to keep only the new reply
        outputs = pipe(prompt, max_new_tokens=512, do_sample=True, temperature=0.7)
        bot_reply = outputs[0]["generated_text"][len(prompt):].strip()
        
        print(f"✅ Reply: {bot_reply[:50]}...")
        
        new_history = history + [
            {"role": "user", "content": message},
            {"role": "assistant", "content": bot_reply}
        ]
        return new_history, "", gr.update(value="")
    
    except Exception as e:
        error_msg = f"❌ {model_key}: {str(e)}"
        print(f"💥 {error_msg}")
        new_history = history + [
            {"role": "user", "content": message},
            {"role": "assistant", "content": error_msg}
        ]
        return new_history, error_msg, gr.update(value="")

# UI
with gr.Blocks(title="🚀 Local HF Chat (even on a weak CPU!)", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Local Inference (no API!)\n**Small models** reply in 1-3 s on CPU. Larger ones take much longer. No rate limits or per-token costs. Provided as an example.")
    
    with gr.Row(variant="compact"):
        model_dropdown = gr.Dropdown(choices=list(MODELS.keys()), value="Qwen2.5-0.5B", label="🧠 Model")
        system_prompt = gr.Textbox(label="📝 System", placeholder="You are a cheerful AI.", lines=2)
    
    chatbot = gr.Chatbot(type="messages", height=500)
    
    with gr.Row():
        msg_input = gr.Textbox(placeholder="Hi! (press Enter)", scale=7)
        send_btn = gr.Button("📤", variant="primary", scale=1)
    
    with gr.Row():
        clear_btn = gr.Button("🗑️ Clear")
        retry_btn = gr.Button("🔄 Retry")
    
    status = gr.Textbox(label="Logs", interactive=False, lines=4)
    
    # Events
    send_btn.click(fn=respond, inputs=[msg_input, chatbot, model_dropdown, system_prompt], outputs=[chatbot, status, msg_input])
    msg_input.submit(fn=respond, inputs=[msg_input, chatbot, model_dropdown, system_prompt], outputs=[chatbot, status, msg_input])
    
    def clear():
        return [], "", gr.update(value="")
    clear_btn.click(clear, outputs=[chatbot, status, msg_input])
    
    def retry(history):
        # Put the last user message back into the input box so it can be resent
        if len(history) >= 2 and history[-2]["role"] == "user":
            return history[-2]["content"]
        return ""
    retry_btn.click(retry, inputs=[chatbot], outputs=[msg_input])

if __name__ == "__main__":
    demo.queue(max_size=10).launch(debug=True)
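
# Usage (assumption: this file is saved as app.py): run `python app.py` and open
# the local URL that Gradio prints. The first request per model downloads its
# weights from the Hugging Face Hub, so the initial reply can take a while.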