"""Gradio app serving a Qwen3-4B Claude-Sonnet reasoning distill (GGUF, Q8_0)
via llama-cpp-python, prompted in ChatML style (<|im_start|>/<|im_end|>)."""

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

REPO_ID = "mradermacher/Qwen3-4B-Claude-Sonnet-4-Reasoning-Distill-Safetensor-GGUF"
MODEL_FILENAME = "qwen3-4b-claude-sonnet-4-reasoning-distill.Q8_0.gguf"

# Fetch the quantized weights; hf_hub_download caches, so reruns are cheap.
model_path = hf_hub_download(
    repo_id=REPO_ID,
    filename=MODEL_FILENAME,
    local_dir="/home/user/app/models",
)

# NOTE(review): sampling knobs (temperature, repeat_penalty) are call-time
# parameters, not constructor parameters — Llama.__init__ silently ignores
# unknown kwargs, so passing them here had no effect. They are now supplied
# on the inference call inside generate_response().
llm = Llama(
    model_path=model_path,
    n_ctx=4096,
    n_threads=4,
)


def generate_response(user_input: str) -> str:
    """Generate one assistant turn for *user_input*.

    Builds a ChatML-formatted prompt (system / user / assistant roles) and
    returns the raw completion text, truncated at the first <|im_end|> marker.
    """
    # Fix: the original chained .format(user_input=...) onto an f-string that
    # had already interpolated the value — a no-op at best, and a crash
    # (KeyError/IndexError) whenever the user's text contained '{' or '}'.
    prompt = (
        "<|im_start|>system\nYou are a helpful assistant.\n<|im_end|>\n"
        f"<|im_start|>user\n{user_input}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )
    output = llm(
        prompt,
        max_tokens=512,
        temperature=0.4,
        repeat_penalty=1.1,
        stop=["<|im_end|>"],
    )
    return output["choices"][0]["text"]


gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(label="Prompt", lines=4),
    outputs=gr.Textbox(label="Claude-Sonnet Response"),
    title="Claude Reasoning Chat - Qwen3-4B",
    description="Uses Claude-style system/user/assistant prompting with Qwen3-4B Reasoning Distill model.",
).launch()