Update app.py
app.py
CHANGED
@@ -15,6 +15,17 @@ import httpx
 API_BASE_URL = os.getenv("API_BASE_URL", "http://localhost:9999/v1")
 MODEL_NAME = os.getenv("MODEL_NAME", "Step-Audio-R1")

 def process_audio(audio_path):
     """
     Process audio: convert to wav, split if > 25s.
@@ -63,19 +74,28 @@ def format_messages(system, history, user_text, audio_data_list=None):

     # Process the conversation history
     for item in history:
-        # Filter out thinking process messages
-        metadata = item.get("metadata") if isinstance(item, dict) else getattr(item, "metadata", None)
-        if metadata and isinstance(metadata, dict) and metadata.get("title") == "⏳ Thinking Process":
-            continue
-
         role = item.get("role") if isinstance(item, dict) else getattr(item, "role", None)
         content = item.get("content") if isinstance(item, dict) else getattr(item, "content", None)

         if not role or content is None:
             continue

         # Check for Audio
-        is_audio =

         if is_audio:
             audio_path = content["value"]["path"]
@@ -97,11 +117,14 @@ def format_messages(system, history, user_text, audio_data_list=None):
         elif isinstance(content, str):
             messages.append({"role": role, "content": content})
         elif isinstance(content, list):
-            #
-
             for c in content:
                 # Check for Audio in list
-                is_c_audio = c.get('component'

                 if is_c_audio:
                     audio_path = c["value"]["path"]
@@ -109,7 +132,7 @@ def format_messages(system, history, user_text, audio_data_list=None):
                     try:
                         item_audio_data_list = process_audio(audio_path)
                         for audio_data in item_audio_data_list:
-
                                 "type": "input_audio",
                                 "input_audio": {
                                     "data": audio_data,
@@ -118,15 +141,31 @@ def format_messages(system, history, user_text, audio_data_list=None):
                             })
                     except Exception as e:
                         print(f"[ERROR] Failed to process history audio in list: {e}")
-                elif isinstance(c, dict):
-                    safe_content.append(c)
                 elif isinstance(c, str):
-
-

-    #
     if user_text and audio_data_list:
         content = []
         for audio_data in audio_data_list:
             content.append({
                 "type": "input_audio",
@@ -135,10 +174,6 @@ def format_messages(system, history, user_text, audio_data_list=None):
                     "format": "wav"
                 }
             })
-        content.append({
-            "type": "text",
-            "text": user_text
-        })

         messages.append({
             "role": "user",
@@ -148,10 +183,6 @@ def format_messages(system, history, user_text, audio_data_list=None):
         messages.append({"role": "user", "content": user_text})
     elif audio_data_list:
         content = []
-        messages.append({
-            "role": "user",
-            "content": content
-        })
         for audio_data in audio_data_list:
             content.append({
                 "type": "input_audio",
@@ -160,17 +191,21 @@ def format_messages(system, history, user_text, audio_data_list=None):
                     "format": "wav"
                 }
             })

     return messages

-def chat(system_prompt, user_text, audio_file, history, max_tokens, temperature, top_p, model_name=None):
     """Chat function"""
     # If model is not specified, use global configuration
     if model_name is None:
         model_name = MODEL_NAME

     if not user_text and not audio_file:
-        yield history or []
         return

     # Ensure history is a list and formatted correctly
@@ -191,7 +226,7 @@ def chat(system_prompt, user_text, audio_file, history, max_tokens, temperature,

     messages = format_messages(system_prompt, history, user_text, audio_data_list)
     if not messages:
-        yield history or []
         return

     # Debug: Print message format
@@ -218,149 +253,474 @@ def chat(system_prompt, user_text, audio_file, history, max_tokens, temperature,

     print(f"[DEBUG] Messages to API: {json.dumps(debug_messages, ensure_ascii=False, indent=2)}")

-    # Update history with user message immediately
-    if audio_file:
-        # 1. Add
         history.append({"role": "user", "content": gr.Audio(audio_file)})
-
-        # 2. If text exists, add text message
-        if user_text:
-            history.append({"role": "user", "content": user_text})
-    else:
         # Text only
         history.append({"role": "user", "content": user_text})

     # Add thinking placeholder
-
-
-
-
-
-
-

     try:
-
-
-
             "model": model_name,
             "messages": messages,
             "max_tokens": max_tokens,
             "temperature": temperature,
             "top_p": top_p,
             "stream": True,
-            "repetition_penalty": 1.
             "stop_token_ids": [151665]
-        })
-
-        if response.status_code != 200:
-            error_msg = f"❌ API Error {response.status_code}"
-            if response.status_code == 404:
-                error_msg += " - vLLM service not ready"
-            elif response.status_code == 400:
-                error_msg += " - Bad request"
-            elif response.status_code == 500:
-                error_msg += " - Model error"
-            yield history, error_msg
-            return
-
-        # Process streaming response
-        buffer = ""
-        is_thinking = True
-
-        print("[DEBUG] Start receiving stream...")
-        for line in response.iter_lines():
-            if not line:
-                continue
-            # Ensure line is string format
-            if isinstance(line, bytes):
-                line = line.decode('utf-8')
-            else:
-                line = str(line)

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
                     else:
-                        #
                         current_think = buffer
                         if current_think.startswith("<think>"):
-                            current_think = current_think[len("<think>"):]
-
-
-
-
-

-

-
-

     except httpx.ConnectError:
-
     except Exception as e:
-

 # Gradio Interface
-with gr.Blocks(title="Step Audio R1") as demo:
-

     with gr.Row():
-        # Left
-        with gr.Column(scale=1):
-
             system_prompt = gr.Textbox(
                 label="System Prompt",
                 lines=2,
-                value="
             )
-
-
-
-
-

-        # Right
         with gr.Column(scale=2):
-            chatbot = gr.Chatbot(
-
-
-
-
-
-

         submit_btn.click(
             fn=chat,
-            inputs=[system_prompt, user_text, audio_file, chatbot, max_tokens, temperature, top_p],
-            outputs=[chatbot
         )

         clear_btn.click(
@@ -372,9 +732,15 @@ if __name__ == "__main__":
     import argparse
     parser = argparse.ArgumentParser()
     parser.add_argument("--host", default="0.0.0.0")
-    parser.add_argument("--port", type=int, default=
     parser.add_argument("--model", default=MODEL_NAME)
     args = parser.parse_args()

     # Update the global model name
     if args.model:
| 15 |
API_BASE_URL = os.getenv("API_BASE_URL", "http://localhost:9999/v1")
|
| 16 |
MODEL_NAME = os.getenv("MODEL_NAME", "Step-Audio-R1")
|
| 17 |
|
| 18 |
+
def escape_html(text):
|
| 19 |
+
"""Escape HTML special characters to prevent XSS"""
|
| 20 |
+
if not isinstance(text, str):
|
| 21 |
+
return text
|
| 22 |
+
return (text
|
| 23 |
+
.replace("&", "&")
|
| 24 |
+
.replace("<", "<")
|
| 25 |
+
.replace(">", ">")
|
| 26 |
+
.replace('"', """)
|
| 27 |
+
.replace("'", "'"))
|
| 28 |
+
|
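Note: the escaping above covers the same characters as Python's html.escape; a quick, self-contained sanity check (hypothetical snippet, not part of app.py; html.escape encodes the apostrophe as &#x27; rather than &#39;):

import html

sample = '<script>alert("hi")</script>'
print(escape_html(sample))               # &lt;script&gt;alert(&quot;hi&quot;)&lt;/script&gt;
print(html.escape(sample, quote=True))   # identical output for this input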
| 29 |
def process_audio(audio_path):
|
| 30 |
"""
|
| 31 |
Process audio: convert to wav, split if > 25s.
|
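The body of process_audio is not shown in this diff. A minimal sketch of the documented behaviour (convert to wav, split into chunks of at most 25 s, base64-encode each chunk), assuming pydub is available; the Space's actual implementation may differ:

import base64
import io
from pydub import AudioSegment  # assumption: any wav-capable audio library would do

def split_and_encode(audio_path, chunk_seconds=25):
    segment = AudioSegment.from_file(audio_path)
    step = chunk_seconds * 1000  # pydub slices in milliseconds
    encoded = []
    for start in range(0, len(segment), step):
        buf = io.BytesIO()
        segment[start:start + step].export(buf, format="wav")
        encoded.append(base64.b64encode(buf.getvalue()).decode("utf-8"))
    return encoded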
|
|
|
| 74 |
|
| 75 |
# Process the conversation history
|
| 76 |
for item in history:
|
| 77 |
role = item.get("role") if isinstance(item, dict) else getattr(item, "role", None)
|
| 78 |
content = item.get("content") if isinstance(item, dict) else getattr(item, "content", None)
|
| 79 |
|
| 80 |
if not role or content is None:
|
| 81 |
continue
|
| 82 |
+
|
| 83 |
+
# If content contains thinking process (with thinking-block div), extract only the response part
|
| 84 |
+
if role == "assistant" and isinstance(content, str) and '<div class="thinking-block">' in content:
|
| 85 |
+
# Find the end of the thinking block and extract what comes after
|
| 86 |
+
# Match the entire thinking block
|
| 87 |
+
pattern = r'<div class="thinking-block">.*?</div>\s*</div>\s*'
|
| 88 |
+
remaining_content = re.sub(pattern, '', content, flags=re.DOTALL).strip()
|
| 89 |
+
|
| 90 |
+
# If there's meaningful content after the thinking block, use it
|
| 91 |
+
if remaining_content and not remaining_content.startswith('<'):
|
| 92 |
+
content = remaining_content
|
| 93 |
+
else:
|
| 94 |
+
# Still in thinking phase or no response yet, skip
|
| 95 |
+
continue
|
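For illustration, the stripping step above behaves like this on a rendered assistant message (the sample string is invented):

import re

sample = ('<div class="thinking-block">\n'
          '<div class="thinking-header">💭 Thinking Process</div>\n'
          '<div class="thinking-content">reasoning...</div>\n'
          '</div>\n\n'
          'Final answer text.')
pattern = r'<div class="thinking-block">.*?</div>\s*</div>\s*'
print(re.sub(pattern, '', sample, flags=re.DOTALL).strip())  # -> Final answer text.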
| 96 |
|
| 97 |
# Check for Audio
|
| 98 |
+
is_audio = isinstance(content, dict) and content.get("component") == "audio"
|
| 99 |
|
| 100 |
if is_audio:
|
| 101 |
audio_path = content["value"]["path"]
|
| 117 |
elif isinstance(content, str):
|
| 118 |
messages.append({"role": role, "content": content})
|
| 119 |
elif isinstance(content, list):
|
| 120 |
+
# Process list items and ensure text comes before audio
|
| 121 |
+
text_items = []
|
| 122 |
+
audio_items = []
|
| 123 |
+
other_items = []
|
| 124 |
+
|
| 125 |
for c in content:
|
| 126 |
# Check for Audio in list
|
| 127 |
+
is_c_audio = isinstance(c, dict) and c.get('component') == "audio"
|
| 128 |
|
| 129 |
if is_c_audio:
|
| 130 |
audio_path = c["value"]["path"]
|
| 132 |
try:
|
| 133 |
item_audio_data_list = process_audio(audio_path)
|
| 134 |
for audio_data in item_audio_data_list:
|
| 135 |
+
audio_items.append({
|
| 136 |
"type": "input_audio",
|
| 137 |
"input_audio": {
|
| 138 |
"data": audio_data,
|
| 141 |
})
|
| 142 |
except Exception as e:
|
| 143 |
print(f"[ERROR] Failed to process history audio in list: {e}")
|
| 144 |
elif isinstance(c, str):
|
| 145 |
+
text_items.append({"type": "text", "text": c})
|
| 146 |
+
elif isinstance(c, dict):
|
| 147 |
+
# Distinguish between text and audio types
|
| 148 |
+
if c.get("type") == "text":
|
| 149 |
+
text_items.append(c)
|
| 150 |
+
elif c.get("type") == "input_audio":
|
| 151 |
+
audio_items.append(c)
|
| 152 |
+
else:
|
| 153 |
+
other_items.append(c)
|
| 154 |
+
|
| 155 |
+
# Combine: text first, then audio, then others
|
| 156 |
+
safe_content = text_items + audio_items + other_items
|
| 157 |
+
if safe_content:
|
| 158 |
+
messages.append({"role": role, "content": safe_content})
|
| 159 |
|
| 160 |
+
# Add the current user message (text first, then audio)
|
| 161 |
if user_text and audio_data_list:
|
| 162 |
content = []
|
| 163 |
+
# Add the text first
|
| 164 |
+
content.append({
|
| 165 |
+
"type": "text",
|
| 166 |
+
"text": user_text
|
| 167 |
+
})
|
| 168 |
+
# Then add the audio
|
| 169 |
for audio_data in audio_data_list:
|
| 170 |
content.append({
|
| 171 |
"type": "input_audio",
|
| 174 |
"format": "wav"
|
| 175 |
}
|
| 176 |
})
|
| 177 |
|
| 178 |
messages.append({
|
| 179 |
"role": "user",
|
| 183 |
messages.append({"role": "user", "content": user_text})
|
| 184 |
elif audio_data_list:
|
| 185 |
content = []
|
| 186 |
for audio_data in audio_data_list:
|
| 187 |
content.append({
|
| 188 |
"type": "input_audio",
|
| 191 |
"format": "wav"
|
| 192 |
}
|
| 193 |
})
|
| 194 |
+
messages.append({
|
| 195 |
+
"role": "user",
|
| 196 |
+
"content": content
|
| 197 |
+
})
|
| 198 |
|
| 199 |
return messages
|
| 200 |
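For reference, a request built from one text prompt plus one audio chunk ends up shaped roughly like this (illustrative values; the base64 payload is abbreviated, and the system message is assumed to be prepended by format_messages):

messages = [
    {"role": "system", "content": "You are a voice assistant..."},
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "What is said in this clip?"},
            {"type": "input_audio", "input_audio": {"data": "<base64 wav>", "format": "wav"}},
        ],
    },
]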
|
| 201 |
+
def chat(system_prompt, user_text, audio_file, history, max_tokens, temperature, top_p, show_thinking=True, model_name=None):
|
| 202 |
"""Chat function"""
|
| 203 |
# If model is not specified, use global configuration
|
| 204 |
if model_name is None:
|
| 205 |
model_name = MODEL_NAME
|
| 206 |
|
| 207 |
if not user_text and not audio_file:
|
| 208 |
+
yield history or []
|
| 209 |
return
|
| 210 |
|
| 211 |
# Ensure history is a list and formatted correctly
|
| 226 |
|
| 227 |
messages = format_messages(system_prompt, history, user_text, audio_data_list)
|
| 228 |
if not messages:
|
| 229 |
+
yield history or []
|
| 230 |
return
|
| 231 |
|
| 232 |
# Debug: Print message format
|
| 253 |
|
| 254 |
print(f"[DEBUG] Messages to API: {json.dumps(debug_messages, ensure_ascii=False, indent=2)}")
|
| 255 |
|
| 256 |
+
# Update history with user message immediately (text first, then audio)
|
| 257 |
+
if user_text and audio_file:
|
| 258 |
+
# 1. Add text message first
|
| 259 |
+
history.append({"role": "user", "content": user_text})
|
| 260 |
+
# 2. Add audio message second
|
| 261 |
history.append({"role": "user", "content": gr.Audio(audio_file)})
|
| 262 |
+
elif user_text:
|
| 263 |
# Text only
|
| 264 |
history.append({"role": "user", "content": user_text})
|
| 265 |
+
elif audio_file:
|
| 266 |
+
# Audio only
|
| 267 |
+
history.append({"role": "user", "content": gr.Audio(audio_file)})
|
| 268 |
|
| 269 |
# Add thinking placeholder
|
| 270 |
+
if show_thinking:
|
| 271 |
+
history.append({
|
| 272 |
+
"role": "assistant",
|
| 273 |
+
"content": (
|
| 274 |
+
'<div class="thinking-block">\n'
|
| 275 |
+
'<div class="thinking-header">💭 Thinking...</div>\n'
|
| 276 |
+
'<div class="thinking-content">Processing your request...</div>\n'
|
| 277 |
+
'</div>'
|
| 278 |
+
)
|
| 279 |
+
})
|
| 280 |
+
yield history
|
| 281 |
+
else:
|
| 282 |
+
history.append({
|
| 283 |
+
"role": "assistant",
|
| 284 |
+
"content": "⏳ Generating response..."
|
| 285 |
+
})
|
| 286 |
+
yield history
|
| 287 |
|
| 288 |
try:
|
| 289 |
+
# Disable proxies so the internal API is reachable
|
| 290 |
+
with httpx.Client(base_url=API_BASE_URL, timeout=120, proxies={}) as client:
|
| 291 |
+
response = client.post("/chat/completions", json={
|
| 292 |
"model": model_name,
|
| 293 |
"messages": messages,
|
| 294 |
"max_tokens": max_tokens,
|
| 295 |
"temperature": temperature,
|
| 296 |
"top_p": top_p,
|
| 297 |
"stream": True,
|
| 298 |
+
"repetition_penalty": 1.0,
|
| 299 |
"stop_token_ids": [151665]
|
| 300 |
+
})
|
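Note that httpx reads the whole body of a plain client.post() call before returning, so the iter_lines() loop below only sees data once generation has finished. For true incremental streaming, the same request can be issued with client.stream() instead; a sketch under that assumption, where payload stands for the same JSON dict passed to client.post() above (hypothetical variable name):

with httpx.Client(base_url=API_BASE_URL, timeout=120) as client:
    with client.stream("POST", "/chat/completions", json=payload) as response:
        for line in response.iter_lines():
            if line.startswith("data: "):
                print(line[6:])  # raw SSE chunk, parsed the same way as below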
| 301 |
|
| 302 |
+
if response.status_code != 200:
|
| 303 |
+
error_msg = f"❌ API Error {response.status_code}"
|
| 304 |
+
if response.status_code == 404:
|
| 305 |
+
error_msg += " - vLLM service not ready"
|
| 306 |
+
elif response.status_code == 400:
|
| 307 |
+
error_msg += " - Bad request"
|
| 308 |
+
elif response.status_code == 500:
|
| 309 |
+
error_msg += " - Model error"
|
| 310 |
+
# Update the last message with error
|
| 311 |
+
history[-1]["content"] = error_msg
|
| 312 |
+
yield history
|
| 313 |
+
return
|
| 314 |
+
|
| 315 |
+
# Process streaming response
|
| 316 |
+
buffer = ""
|
| 317 |
+
is_thinking = True
|
| 318 |
+
|
| 319 |
+
for line in response.iter_lines():
|
| 320 |
+
if not line:
|
| 321 |
+
continue
|
| 322 |
+
# Ensure line is string format
|
| 323 |
+
if isinstance(line, bytes):
|
| 324 |
+
line = line.decode('utf-8')
|
| 325 |
+
else:
|
| 326 |
+
line = str(line)
|
| 327 |
+
|
| 328 |
+
if line.startswith('data: '):
|
| 329 |
+
data_str = line[6:]
|
| 330 |
+
if data_str.strip() == '[DONE]':
|
| 331 |
+
break
|
| 332 |
+
try:
|
| 333 |
+
data = json.loads(data_str)
|
| 334 |
+
if 'choices' in data and len(data['choices']) > 0:
|
| 335 |
+
delta = data['choices'][0].get('delta', {})
|
| 336 |
+
if 'content' in delta:
|
| 337 |
+
content = delta['content']
|
| 338 |
+
buffer += content
|
| 339 |
+
|
| 340 |
+
if is_thinking:
|
| 341 |
+
if "</think>" in buffer:
|
| 342 |
+
is_thinking = False
|
| 343 |
+
parts = buffer.split("</think>", 1)
|
| 344 |
+
think_content = parts[0]
|
| 345 |
+
response_content = parts[1]
|
| 346 |
+
|
| 347 |
+
if think_content.startswith("<think>"):
|
| 348 |
+
think_content = think_content[len("<think>"):].strip()
|
| 349 |
+
|
| 350 |
+
if show_thinking:
|
| 351 |
+
# Format thinking with custom styled block (escape HTML for safety)
|
| 352 |
+
escaped_think = escape_html(think_content)
|
| 353 |
+
formatted_content = (
|
| 354 |
+
f'<div class="thinking-block">\n'
|
| 355 |
+
f'<div class="thinking-header">💭 Thinking Process</div>\n'
|
| 356 |
+
f'<div class="thinking-content">{escaped_think}</div>\n'
|
| 357 |
+
f'</div>\n\n'
|
| 358 |
+
f'{response_content}'
|
| 359 |
+
)
|
| 360 |
+
history[-1]["content"] = formatted_content
|
| 361 |
else:
|
| 362 |
+
# Don't show thinking, replace with response message directly
|
| 363 |
+
history[-1]["content"] = response_content
|
| 364 |
+
else:
|
| 365 |
+
# Update thinking message with collapsible format (only if showing)
|
| 366 |
+
if show_thinking:
|
| 367 |
current_think = buffer
|
| 368 |
if current_think.startswith("<think>"):
|
| 369 |
+
current_think = current_think[len("<think>"):].strip()
|
| 370 |
+
escaped_think = escape_html(current_think)
|
| 371 |
+
formatted_content = (
|
| 372 |
+
f'<div class="thinking-block">\n'
|
| 373 |
+
f'<div class="thinking-header">💭 Thinking...</div>\n'
|
| 374 |
+
f'<div class="thinking-content">{escaped_think}</div>\n'
|
| 375 |
+
f'</div>'
|
| 376 |
+
)
|
| 377 |
+
history[-1]["content"] = formatted_content
|
| 378 |
+
else:
|
| 379 |
+
# Already split, update the combined message
|
| 380 |
+
parts = buffer.split("</think>", 1)
|
| 381 |
+
think_content = parts[0]
|
| 382 |
+
response_content = parts[1]
|
| 383 |
|
| 384 |
+
if think_content.startswith("<think>"):
|
| 385 |
+
think_content = think_content[len("<think>"):].strip()
|
| 386 |
|
| 387 |
+
if show_thinking:
|
| 388 |
+
# Update with formatted thinking + response
|
| 389 |
+
escaped_think = escape_html(think_content)
|
| 390 |
+
formatted_content = (
|
| 391 |
+
f'<div class="thinking-block">\n'
|
| 392 |
+
f'<div class="thinking-header">💭 Thinking Process</div>\n'
|
| 393 |
+
f'<div class="thinking-content">{escaped_think}</div>\n'
|
| 394 |
+
f'</div>\n\n'
|
| 395 |
+
f'{response_content}'
|
| 396 |
+
)
|
| 397 |
+
history[-1]["content"] = formatted_content
|
| 398 |
+
else:
|
| 399 |
+
# Only show response
|
| 400 |
+
history[-1]["content"] = response_content
|
| 401 |
+
|
| 402 |
+
yield history
|
| 403 |
+
|
| 404 |
+
except json.JSONDecodeError:
|
| 405 |
+
continue
|
| 406 |
|
| 407 |
except httpx.ConnectError:
|
| 408 |
+
history[-1]["content"] = "❌ Cannot connect to vLLM API"
|
| 409 |
+
yield history
|
| 410 |
except Exception as e:
|
| 411 |
+
history[-1]["content"] = f"❌ Error: {str(e)}"
|
| 412 |
+
yield history
|
| 413 |
+
|
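Each streamed line follows the OpenAI-compatible SSE shape, which the loop above unpacks; a minimal standalone illustration (the sample chunk is invented):

import json

line = 'data: {"choices": [{"delta": {"content": "<think>The user asks"}}]}'
if line.startswith('data: ') and line[6:].strip() != '[DONE]':
    chunk = json.loads(line[6:])
    delta = chunk['choices'][0].get('delta', {})
    print(delta.get('content', ''))  # -> <think>The user asks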
| 414 |
+
# Custom CSS for better UI
|
| 415 |
+
custom_css = """
|
| 416 |
+
/* Global styles */
|
| 417 |
+
.gradio-container {
|
| 418 |
+
max-width: 100% !important;
|
| 419 |
+
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
|
| 420 |
+
}
|
| 421 |
+
|
| 422 |
+
/* Header styles */
|
| 423 |
+
.app-header {
|
| 424 |
+
text-align: center;
|
| 425 |
+
padding: 2.5rem 1.5rem;
|
| 426 |
+
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
| 427 |
+
position: relative;
|
| 428 |
+
overflow: hidden;
|
| 429 |
+
border-radius: 16px;
|
| 430 |
+
margin-bottom: 1.5rem;
|
| 431 |
+
box-shadow: 0 8px 24px rgba(102, 126, 234, 0.35);
|
| 432 |
+
}
|
| 433 |
+
|
| 434 |
+
/* Header background decoration */
|
| 435 |
+
.app-header::before {
|
| 436 |
+
content: '';
|
| 437 |
+
position: absolute;
|
| 438 |
+
top: -50%;
|
| 439 |
+
right: -50%;
|
| 440 |
+
width: 200%;
|
| 441 |
+
height: 200%;
|
| 442 |
+
background: radial-gradient(circle, rgba(255, 255, 255, 0.1) 0%, transparent 70%);
|
| 443 |
+
animation: rotate 20s linear infinite;
|
| 444 |
+
}
|
| 445 |
+
|
| 446 |
+
@keyframes rotate {
|
| 447 |
+
from { transform: rotate(0deg); }
|
| 448 |
+
to { transform: rotate(360deg); }
|
| 449 |
+
}
|
| 450 |
+
|
| 451 |
+
.app-header h1 {
|
| 452 |
+
margin: 0;
|
| 453 |
+
font-size: 2.8rem;
|
| 454 |
+
font-weight: 700;
|
| 455 |
+
color: white !important;
|
| 456 |
+
text-shadow: 0 3px 6px rgba(0, 0, 0, 0.25);
|
| 457 |
+
letter-spacing: 1px;
|
| 458 |
+
position: relative;
|
| 459 |
+
z-index: 1;
|
| 460 |
+
}
|
| 461 |
+
|
| 462 |
+
.app-header p {
|
| 463 |
+
color: rgba(255, 255, 255, 0.95) !important;
|
| 464 |
+
text-shadow: 0 2px 4px rgba(0, 0, 0, 0.2);
|
| 465 |
+
position: relative;
|
| 466 |
+
z-index: 1;
|
| 467 |
+
line-height: 1.5;
|
| 468 |
+
}
|
| 469 |
+
|
| 470 |
+
/* Chat box styles */
|
| 471 |
+
.chatbot-container {
|
| 472 |
+
border-radius: 12px;
|
| 473 |
+
box-shadow: 0 2px 8px rgba(0, 0, 0, 0.08);
|
| 474 |
+
overflow: hidden;
|
| 475 |
+
}
|
| 476 |
+
|
| 477 |
+
/* Thinking-process styles, modeled on the Claude/ChatGPT look */
|
| 478 |
+
.thinking-block {
|
| 479 |
+
background: linear-gradient(135deg, #f5f7fa 0%, #eef2f7 100%);
|
| 480 |
+
border-left: 4px solid #667eea;
|
| 481 |
+
padding: 16px 20px;
|
| 482 |
+
margin: 12px 0;
|
| 483 |
+
border-radius: 8px;
|
| 484 |
+
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.05);
|
| 485 |
+
}
|
| 486 |
+
|
| 487 |
+
.thinking-header {
|
| 488 |
+
display: flex;
|
| 489 |
+
align-items: center;
|
| 490 |
+
font-weight: 600;
|
| 491 |
+
color: #667eea;
|
| 492 |
+
margin-bottom: 10px;
|
| 493 |
+
font-size: 0.95rem;
|
| 494 |
+
}
|
| 495 |
+
|
| 496 |
+
.thinking-content {
|
| 497 |
+
background: #ffffff;
|
| 498 |
+
padding: 12px 16px;
|
| 499 |
+
border-radius: 6px;
|
| 500 |
+
font-family: 'SF Mono', Monaco, 'Cascadia Code', 'Roboto Mono', Consolas, 'Courier New', monospace;
|
| 501 |
+
font-size: 0.9rem;
|
| 502 |
+
line-height: 1.6;
|
| 503 |
+
color: #374151;
|
| 504 |
+
white-space: pre-wrap;
|
| 505 |
+
word-wrap: break-word;
|
| 506 |
+
border: 1px solid #e5e7eb;
|
| 507 |
+
}
|
| 508 |
+
|
| 509 |
+
/* Response divider */
|
| 510 |
+
.response-divider {
|
| 511 |
+
border: none;
|
| 512 |
+
height: 2px;
|
| 513 |
+
background: linear-gradient(to right, transparent, #e5e7eb, transparent);
|
| 514 |
+
margin: 20px 0;
|
| 515 |
+
}
|
| 516 |
+
|
| 517 |
+
/* Button styles */
|
| 518 |
+
.primary-btn {
|
| 519 |
+
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
|
| 520 |
+
border: none !important;
|
| 521 |
+
transition: all 0.3s ease !important;
|
| 522 |
+
}
|
| 523 |
+
|
| 524 |
+
.primary-btn:hover {
|
| 525 |
+
transform: translateY(-2px);
|
| 526 |
+
box-shadow: 0 4px 12px rgba(102, 126, 234, 0.4) !important;
|
| 527 |
+
}
|
| 528 |
+
|
| 529 |
+
/* Left panel styles */
|
| 530 |
+
.left-panel {
|
| 531 |
+
background: #f9fafb;
|
| 532 |
+
border-radius: 12px;
|
| 533 |
+
padding: 1rem;
|
| 534 |
+
height: 100%;
|
| 535 |
+
}
|
| 536 |
+
|
| 537 |
+
/* Input box styles */
|
| 538 |
+
.input-box textarea {
|
| 539 |
+
border-radius: 8px !important;
|
| 540 |
+
border: 2px solid #e5e7eb !important;
|
| 541 |
+
transition: border-color 0.3s ease !important;
|
| 542 |
+
}
|
| 543 |
+
|
| 544 |
+
.input-box textarea:focus {
|
| 545 |
+
border-color: #667eea !important;
|
| 546 |
+
box-shadow: 0 0 0 3px rgba(102, 126, 234, 0.1) !important;
|
| 547 |
+
}
|
| 548 |
+
|
| 549 |
+
/* Input area headings */
|
| 550 |
+
h3 {
|
| 551 |
+
color: #374151;
|
| 552 |
+
font-size: 1.1rem;
|
| 553 |
+
margin: 1rem 0 0.5rem 0;
|
| 554 |
+
}
|
| 555 |
+
|
| 556 |
+
/* Chat message style tweaks */
|
| 557 |
+
.message-wrap {
|
| 558 |
+
padding: 1rem !important;
|
| 559 |
+
}
|
| 560 |
+
|
| 561 |
+
.message {
|
| 562 |
+
padding: 1rem !important;
|
| 563 |
+
border-radius: 12px !important;
|
| 564 |
+
line-height: 1.6 !important;
|
| 565 |
+
}
|
| 566 |
+
|
| 567 |
+
/* User messages */
|
| 568 |
+
.message.user {
|
| 569 |
+
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
|
| 570 |
+
color: white !important;
|
| 571 |
+
}
|
| 572 |
+
|
| 573 |
+
/* Assistant messages */
|
| 574 |
+
.message.bot {
|
| 575 |
+
background: #f9fafb !important;
|
| 576 |
+
border: 1px solid #e5e7eb !important;
|
| 577 |
+
}
|
| 578 |
+
|
| 579 |
+
/* Overall left panel styles */
|
| 580 |
+
.left-column {
|
| 581 |
+
background: linear-gradient(to bottom, #ffffff 0%, #f9fafb 100%);
|
| 582 |
+
border-radius: 12px;
|
| 583 |
+
padding: 1rem;
|
| 584 |
+
box-shadow: 0 2px 8px rgba(0, 0, 0, 0.05);
|
| 585 |
+
}
|
| 586 |
+
|
| 587 |
+
/* Button container styles */
|
| 588 |
+
.button-row {
|
| 589 |
+
margin-top: 1rem;
|
| 590 |
+
gap: 0.5rem;
|
| 591 |
+
}
|
| 592 |
+
|
| 593 |
+
/* Scrollbar styling */
|
| 594 |
+
::-webkit-scrollbar {
|
| 595 |
+
width: 8px;
|
| 596 |
+
height: 8px;
|
| 597 |
+
}
|
| 598 |
+
|
| 599 |
+
::-webkit-scrollbar-track {
|
| 600 |
+
background: #f1f1f1;
|
| 601 |
+
border-radius: 4px;
|
| 602 |
+
}
|
| 603 |
+
|
| 604 |
+
::-webkit-scrollbar-thumb {
|
| 605 |
+
background: #888;
|
| 606 |
+
border-radius: 4px;
|
| 607 |
+
}
|
| 608 |
+
|
| 609 |
+
::-webkit-scrollbar-thumb:hover {
|
| 610 |
+
background: #555;
|
| 611 |
+
}
|
| 612 |
+
"""
|
| 613 |
|
| 614 |
# Gradio Interface
|
| 615 |
+
with gr.Blocks(title="Step Audio R1", css=custom_css, theme=gr.themes.Soft()) as demo:
|
| 616 |
+
# Header
|
| 617 |
+
gr.HTML("""
|
| 618 |
+
<div class="app-header">
|
| 619 |
+
<h1 style="color: white;">🔊 Step-Audio-R1</h1>
|
| 620 |
+
<p style="color: white; margin: 0.8rem 0 0 0; opacity: 0.95; font-size: 1.15rem; font-weight: 500;">
|
| 621 |
+
Advanced Audio-Language Model with Reasoning
|
| 622 |
+
</p>
|
| 623 |
+
<p style="color: white; margin: 0.5rem 0 0 0; opacity: 0.85; font-size: 0.95rem;">
|
| 624 |
+
Comprehensive audio understanding: Speech, Sound, Music & Lyrics
|
| 625 |
+
</p>
|
| 626 |
+
</div>
|
| 627 |
+
""")
|
| 628 |
|
| 629 |
with gr.Row():
|
| 630 |
+
# Left Panel - Input Area
|
| 631 |
+
with gr.Column(scale=1, min_width=350):
|
| 632 |
+
# Configuration
|
| 633 |
+
with gr.Accordion("⚙️ Configuration", open=False):
|
| 634 |
system_prompt = gr.Textbox(
|
| 635 |
label="System Prompt",
|
| 636 |
lines=2,
|
| 637 |
+
value="You are a voice assistant with extensive experience in audio processing.",
|
| 638 |
+
placeholder="Enter system prompt...",
|
| 639 |
+
elem_classes=["input-box"]
|
| 640 |
)
|
| 641 |
+
|
| 642 |
+
max_tokens = gr.Slider(
|
| 643 |
+
1, 7192,
|
| 644 |
+
value=6400,
|
| 645 |
+
label="Max Tokens",
|
| 646 |
+
info="Maximum tokens to generate"
|
| 647 |
+
)
|
| 648 |
+
temperature = gr.Slider(
|
| 649 |
+
0.0, 2.0,
|
| 650 |
+
value=0.7,
|
| 651 |
+
label="Temperature",
|
| 652 |
+
info="Higher = more random"
|
| 653 |
+
)
|
| 654 |
+
top_p = gr.Slider(
|
| 655 |
+
0.0, 1.0,
|
| 656 |
+
value=0.9,
|
| 657 |
+
label="Top P",
|
| 658 |
+
info="Nucleus sampling"
|
| 659 |
+
)
|
| 660 |
+
show_thinking = gr.Checkbox(
|
| 661 |
+
label="💭 Show Thinking Process",
|
| 662 |
+
value=True,
|
| 663 |
+
info="Display reasoning steps"
|
| 664 |
+
)
|
| 665 |
+
|
| 666 |
+
# Input Area
|
| 667 |
+
gr.Markdown("### 📝 Your Input")
|
| 668 |
+
user_text = gr.Textbox(
|
| 669 |
+
label="Text Message",
|
| 670 |
+
lines=4,
|
| 671 |
+
placeholder="Type your message here...",
|
| 672 |
+
elem_classes=["input-box"],
|
| 673 |
+
show_label=False
|
| 674 |
+
)
|
| 675 |
+
|
| 676 |
+
audio_file = gr.Audio(
|
| 677 |
+
label="🎤 Audio Input",
|
| 678 |
+
type="filepath",
|
| 679 |
+
sources=["microphone", "upload"],
|
| 680 |
+
show_label=True
|
| 681 |
+
)
|
| 682 |
+
|
| 683 |
+
# Buttons
|
| 684 |
+
with gr.Row():
|
| 685 |
+
clear_btn = gr.Button("🗑️ Clear", scale=1, size="lg")
|
| 686 |
+
submit_btn = gr.Button(
|
| 687 |
+
"🚀 Send",
|
| 688 |
+
variant="primary",
|
| 689 |
+
scale=2,
|
| 690 |
+
size="lg",
|
| 691 |
+
elem_classes=["primary-btn"]
|
| 692 |
+
)
|
| 693 |
+
|
| 694 |
+
# Usage Guide at bottom
|
| 695 |
+
with gr.Accordion("📖 Quick Guide", open=False):
|
| 696 |
+
gr.Markdown("""
|
| 697 |
+
**Usage:**
|
| 698 |
+
- Type text, upload audio, or both
|
| 699 |
+
- Audio > 25s auto-splits
|
| 700 |
+
- Toggle thinking process display
|
| 701 |
+
|
| 702 |
+
**Tips:**
|
| 703 |
+
- Thinking shown in blue gradient block
|
| 704 |
+
- History auto-cleaned for API
|
| 705 |
+
- Adjust params in Configuration
|
| 706 |
+
""")
|
| 707 |
|
| 708 |
+
# Right Panel - Conversation Area
|
| 709 |
with gr.Column(scale=2):
|
| 710 |
+
chatbot = gr.Chatbot(
|
| 711 |
+
label="💬 Conversation",
|
| 712 |
+
height=700,
|
| 713 |
+
type="messages",
|
| 714 |
+
elem_classes=["chatbot-container"],
|
| 715 |
+
show_label=True,
|
| 716 |
+
avatar_images=(None, None),
|
| 717 |
+
bubble_full_width=False
|
| 718 |
+
)
|
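With type="messages", the chatbot consumes plain role/content dicts, which is also what chat() yields; an illustrative history entry (values invented):

history = [
    {"role": "user", "content": "Transcribe the attached clip."},
    {"role": "assistant", "content": '<div class="thinking-block">...</div>\n\nSure, the clip says ...'},
]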
| 719 |
|
| 720 |
submit_btn.click(
|
| 721 |
fn=chat,
|
| 722 |
+
inputs=[system_prompt, user_text, audio_file, chatbot, max_tokens, temperature, top_p, show_thinking],
|
| 723 |
+
outputs=[chatbot]
|
| 724 |
)
|
| 725 |
|
| 726 |
clear_btn.click(
|
| 732 |
import argparse
|
| 733 |
parser = argparse.ArgumentParser()
|
| 734 |
parser.add_argument("--host", default="0.0.0.0")
|
| 735 |
+
parser.add_argument("--port", type=int, default=6008)
|
| 736 |
parser.add_argument("--model", default=MODEL_NAME)
|
| 737 |
args = parser.parse_args()
|
| 738 |
+
import os
|
| 739 |
+
# Unset proxy settings
|
| 740 |
+
os.environ.update({
|
| 741 |
+
'http_proxy': '', 'https_proxy': '', 'all_proxy': '',
|
| 742 |
+
'HTTP_PROXY': '', 'HTTPS_PROXY': '', 'ALL_PROXY': ''
|
| 743 |
+
})
|
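Clearing the variables globally works; the same effect can also be scoped to the API client alone by telling httpx to ignore environment proxies (a sketch, not what the Space currently does):

client = httpx.Client(base_url=API_BASE_URL, timeout=120, trust_env=False)  # ignores http_proxy/https_proxy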
| 744 |
|
| 745 |
# Update the global model name
|
| 746 |
if args.model:
|