"""Gradio app serving a Qwen3-4B Claude-Sonnet reasoning distill (GGUF, Q8_0)
via llama-cpp-python, prompted in ChatML style (<|im_start|>/<|im_end|>)."""

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

REPO_ID = "mradermacher/Qwen3-4B-Claude-Sonnet-4-Reasoning-Distill-Safetensor-GGUF"
MODEL_FILENAME = "qwen3-4b-claude-sonnet-4-reasoning-distill.Q8_0.gguf"

# Fetch the quantized weights; hf_hub_download caches, so reruns are cheap.
model_path = hf_hub_download(
    repo_id=REPO_ID,
    filename=MODEL_FILENAME,
    local_dir="/home/user/app/models",
)

# NOTE(review): sampling knobs (temperature, repeat_penalty) are call-time
# parameters, not constructor parameters — Llama.__init__ silently ignores
# unknown kwargs, so passing them here had no effect. They are now supplied
# on the inference call inside generate_response().
llm = Llama(
    model_path=model_path,
    n_ctx=4096,
    n_threads=4,
)


def generate_response(user_input: str) -> str:
    """Generate one assistant turn for *user_input*.

    Builds a ChatML-formatted prompt (system / user / assistant roles) and
    returns the raw completion text, truncated at the first <|im_end|> marker.
    """
    # Fix: the original chained .format(user_input=...) onto an f-string that
    # had already interpolated the value — a no-op at best, and a crash
    # (KeyError/IndexError) whenever the user's text contained '{' or '}'.
    prompt = (
        "<|im_start|>system\nYou are a helpful assistant.\n<|im_end|>\n"
        f"<|im_start|>user\n{user_input}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )
    output = llm(
        prompt,
        max_tokens=512,
        temperature=0.4,
        repeat_penalty=1.1,
        stop=["<|im_end|>"],
    )
    return output["choices"][0]["text"]


gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(label="Prompt", lines=4),
    outputs=gr.Textbox(label="Claude-Sonnet Response"),
    title="Claude Reasoning Chat - Qwen3-4B",
    description="Uses Claude-style system/user/assistant prompting with Qwen3-4B Reasoning Distill model.",
).launch()