moevis committed on
Commit e8b5c5e · verified · 1 Parent(s): 6b94939

Update app.py

Files changed (1)
  1. app.py +132 -23
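
This commit adds a per-conversation audio budget to the demo: each session may send at most 10MB of audio, measured after conversion to WAV, tracked in a gr.State value, and surfaced to the user as a live usage line. It also makes the debug log print human-readable base64 payload sizes and adds timing logs around the vLLM API call.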
app.py CHANGED
@@ -7,6 +7,7 @@ import base64
 import json
 import os
 import io
+import time
 from pydub import AudioSegment
 import re
 
@@ -17,6 +18,46 @@ API_BASE_URL = os.getenv("API_BASE_URL", "http://localhost:9999/v1")
 MODEL_NAME = os.getenv("MODEL_NAME", "Step-Audio-R1")
 SECRET = os.getenv("API_SECRET", "")
 
+# Audio size limit (10MB)
+MAX_AUDIO_SIZE_MB = 10
+MAX_AUDIO_SIZE_BYTES = MAX_AUDIO_SIZE_MB * 1024 * 1024
+
+def get_wav_size(audio_path):
+    """Calculate the size of audio after converting to wav (in bytes)"""
+    if not audio_path or not os.path.exists(audio_path):
+        return 0
+    try:
+        audio = AudioSegment.from_file(audio_path)
+        buffer = io.BytesIO()
+        audio.export(buffer, format="wav")
+        return len(buffer.getvalue())
+    except Exception as e:
+        print(f"[ERROR] Failed to calculate wav size: {e}")
+        return 0
+
+def get_audio_size_info(used_size_bytes, current_audio_path=None):
+    """Get audio size usage info message"""
+    current_size = 0
+    if current_audio_path and os.path.exists(current_audio_path):
+        current_size = get_wav_size(current_audio_path)
+
+    remaining = MAX_AUDIO_SIZE_BYTES - used_size_bytes
+
+    used_mb = used_size_bytes / (1024 * 1024)
+    remaining_mb = remaining / (1024 * 1024)
+    current_mb = current_size / (1024 * 1024)
+
+    if used_size_bytes == 0 and current_size == 0:
+        return f"📊 Audio limit: {MAX_AUDIO_SIZE_MB}MB total available"
+    elif current_size > 0:
+        new_remaining = remaining - current_size
+        new_remaining_mb = new_remaining / (1024 * 1024)
+        if new_remaining < 0:
+            return f"📊 ⚠️ Current audio ({current_mb:.2f}MB) exceeds remaining limit ({remaining_mb:.2f}MB)"
+        return f"📊 Audio: {used_mb:.2f}MB used + {current_mb:.2f}MB pending = {new_remaining_mb:.2f}MB remaining"
+    else:
+        return f"📊 Audio limit: {used_mb:.2f}MB used, {remaining_mb:.2f}MB remaining (max {MAX_AUDIO_SIZE_MB}MB)"
+
 def escape_html(text):
     """Escape HTML special characters to prevent XSS"""
     if not isinstance(text, str):
@@ -200,16 +241,35 @@ def format_messages(system, history, user_text, audio_data_list=None):
 
     return messages
 
-def chat(system_prompt, user_text, audio_file, history, max_tokens, temperature, top_p, show_thinking=True, model_name=None):
+def chat(system_prompt, user_text, audio_file, history, used_audio_size, max_tokens, temperature, top_p, show_thinking=True, model_name=None):
     """Chat function"""
     # If model is not specified, use global configuration
     if model_name is None:
         model_name = MODEL_NAME
 
+    # Initialize the used audio size
+    if used_audio_size is None:
+        used_audio_size = 0
+
     if not user_text and not audio_file:
-        yield history or []
+        yield history or [], used_audio_size, get_audio_size_info(used_audio_size, None)
         return
 
+    # Check the audio size limit
+    current_audio_size = 0
+    if audio_file:
+        current_audio_size = get_wav_size(audio_file)
+        total_size = used_audio_size + current_audio_size
+
+        if total_size > MAX_AUDIO_SIZE_BYTES:
+            history = history or []
+            remaining_mb = (MAX_AUDIO_SIZE_BYTES - used_audio_size) / (1024 * 1024)
+            current_mb = current_audio_size / (1024 * 1024)
+            error_msg = f"❌ Audio size limit exceeded! Current audio is {current_mb:.2f}MB, but only {max(0, remaining_mb):.2f}MB remaining (max {MAX_AUDIO_SIZE_MB}MB)"
+            history.append({"role": "assistant", "content": error_msg})
+            yield history, used_audio_size, get_audio_size_info(used_audio_size, None)
+            return
+
     # Ensure history is a list and formatted correctly
     history = history or []
     clean_history = []
@@ -228,7 +288,7 @@ def chat(system_prompt, user_text, audio_file, history, max_tokens, temperature,
 
     messages = format_messages(system_prompt, history, user_text, audio_data_list)
     if not messages:
-        yield history or []
+        yield history or [], used_audio_size, get_audio_size_info(used_audio_size, None)
         return
 
     # Debug: Print message format
@@ -242,7 +302,14 @@ def chat(system_prompt, user_text, audio_file, history, max_tokens, temperature,
             if "input_audio" in item_copy:
                 audio_info = item_copy["input_audio"].copy()
                 if "data" in audio_info:
-                    audio_info["data"] = f"[BASE64_AUDIO_DATA_LEN_{len(audio_info['data'])}]"
+                    data_len = len(audio_info['data'])
+                    if data_len >= 1024 * 1024:
+                        human_size = f"{data_len / (1024 * 1024):.2f} MB"
+                    elif data_len >= 1024:
+                        human_size = f"{data_len / 1024:.2f} KB"
+                    else:
+                        human_size = f"{data_len} B"
+                    audio_info["data"] = f"[BASE64_AUDIO_DATA: {human_size} ({data_len} bytes)]"
                 item_copy["input_audio"] = audio_info
                 new_content.append(item_copy)
             else:
@@ -268,6 +335,9 @@ def chat(system_prompt, user_text, audio_file, history, max_tokens, temperature,
         # Audio only
         history.append({"role": "user", "content": gr.Audio(audio_file)})
 
+    # Update the used audio size
+    new_used_audio_size = used_audio_size + current_audio_size
+
     # Add thinking placeholder
     if show_thinking:
         history.append({
@@ -279,16 +349,19 @@ def chat(system_prompt, user_text, audio_file, history, max_tokens, temperature,
                 '</div>'
             )
         })
-        yield history
+        yield history, new_used_audio_size, get_audio_size_info(new_used_audio_size, None)
     else:
        history.append({
             "role": "assistant",
             "content": "⏳ Generating response..."
         })
-        yield history
+        yield history, new_used_audio_size, get_audio_size_info(new_used_audio_size, None)
 
     try:
         # Disable proxy to access the internal API
+        start_time = time.time()
+        print(f"[API] Starting request to {API_BASE_URL}/chat/completions ...")
+
         with httpx.Client(base_url=API_BASE_URL, timeout=120) as client:
             response = client.post("/chat/completions", json={
                 "model": model_name,
@@ -304,6 +377,8 @@ def chat(system_prompt, user_text, audio_file, history, max_tokens, temperature,
             })
 
             if response.status_code != 200:
+                elapsed_time = time.time() - start_time
+                print(f"[API] ❌ FAILED - Status: {response.status_code}, Time: {elapsed_time:.2f}s")
                 error_msg = f"❌ API Error {response.status_code}"
                 if response.status_code == 404:
                     error_msg += " - vLLM service not ready"
@@ -313,7 +388,7 @@ def chat(system_prompt, user_text, audio_file, history, max_tokens, temperature,
                     error_msg += f" - Model error ({response.text})"
                 # Update the last message with error
                 history[-1]["content"] = error_msg
-                yield history
+                yield history, new_used_audio_size, get_audio_size_info(new_used_audio_size, None)
                 return
 
             # Process streaming response
@@ -340,17 +415,17 @@ def chat(system_prompt, user_text, audio_file, history, max_tokens, temperature,
                         if 'content' in delta:
                             content = delta['content']
                             buffer += content
-
+
                             if is_thinking:
                                 if "</think>" in buffer:
                                     is_thinking = False
                                     parts = buffer.split("</think>", 1)
                                     think_content = parts[0]
                                     response_content = parts[1]
-
+
                                     if think_content.startswith("<think>"):
                                         think_content = think_content[len("<think>"):].strip()
-
+
                                     if show_thinking:
                                         # Format thinking with custom styled block (escape HTML for safety)
                                         escaped_think = escape_html(think_content)
@@ -384,10 +459,10 @@ def chat(system_prompt, user_text, audio_file, history, max_tokens, temperature,
                                 parts = buffer.split("</think>", 1)
                                 think_content = parts[0]
                                 response_content = parts[1]
-
+
                                 if think_content.startswith("<think>"):
                                     think_content = think_content[len("<think>"):].strip()
-
+
                                 if show_thinking:
                                     # Update with formatted thinking + response
                                     escaped_think = escape_html(think_content)
@@ -402,18 +477,26 @@ def chat(system_prompt, user_text, audio_file, history, max_tokens, temperature,
                                 else:
                                     # Only show response
                                     history[-1]["content"] = response_content
-
-                            yield history
-
+
+                            yield history, new_used_audio_size, get_audio_size_info(new_used_audio_size, None)
+
                     except json.JSONDecodeError:
                         continue
 
+            # Request completed successfully
+            elapsed_time = time.time() - start_time
+            print(f"[API] ✅ SUCCESS - Time: {elapsed_time:.2f}s")
+
     except httpx.ConnectError:
+        elapsed_time = time.time() - start_time
+        print(f"[API] ❌ FAILED - Connection error, Time: {elapsed_time:.2f}s")
         history[-1]["content"] = "❌ Cannot connect to vLLM API"
-        yield history
+        yield history, new_used_audio_size, get_audio_size_info(new_used_audio_size, None)
     except Exception as e:
+        elapsed_time = time.time() - start_time
+        print(f"[API] ❌ FAILED - Error: {str(e)}, Time: {elapsed_time:.2f}s")
        history[-1]["content"] = f"❌ Error: {str(e)}"
-        yield history
+        yield history, new_used_audio_size, get_audio_size_info(new_used_audio_size, None)
 
 # Custom CSS for better UI
 custom_css = """
@@ -571,7 +654,6 @@ h3 {
     margin-top: 1rem;
     gap: 0.5rem;
 }
-
 /* Dark Mode Support */
 .dark .message.bot {
     background: #1f2937 !important;
@@ -604,7 +686,6 @@ h3 {
 .dark h3 {
     color: #e5e7eb;
 }
-
 /* Scrollbar styling */
 ::-webkit-scrollbar {
     width: 8px;
@@ -692,6 +773,12 @@ with gr.Blocks(title="Step Audio R1", css=custom_css, theme=gr.themes.Soft()) as
                 show_label=True
             )
 
+            # Audio size limit info
+            audio_size_info = gr.Markdown(
+                value=f"📊 Audio limit: {MAX_AUDIO_SIZE_MB}MB total available",
+                elem_classes=["audio-size-info"]
+            )
+
             # Buttons
             with gr.Row():
                 clear_btn = gr.Button("🗑️ Clear", scale=1, size="lg")
@@ -729,15 +816,37 @@ def chat(system_prompt, user_text, audio_file, history, max_tokens, temperature,
                 bubble_full_width=False
             )
 
+    # State to track used audio size (in bytes)
+    used_audio_size = gr.State(value=0)
+
     submit_btn.click(
         fn=chat,
-        inputs=[system_prompt, user_text, audio_file, chatbot, max_tokens, temperature, top_p, show_thinking],
-        outputs=[chatbot]
+        inputs=[system_prompt, user_text, audio_file, chatbot, used_audio_size, max_tokens, temperature, top_p, show_thinking],
+        outputs=[chatbot, used_audio_size, audio_size_info]
     )
 
     clear_btn.click(
-        fn=lambda: ([], "", None),
-        outputs=[chatbot, user_text, audio_file]
+        fn=lambda: ([], 0, "", None, f"📊 Audio limit: {MAX_AUDIO_SIZE_MB}MB total available"),
+        outputs=[chatbot, used_audio_size, user_text, audio_file, audio_size_info]
+    )
+
+    # Update the audio size info when the audio file changes
+    audio_file.change(
+        fn=lambda audio, used_size: get_audio_size_info(used_size, audio),
+        inputs=[audio_file, used_audio_size],
+        outputs=[audio_size_info]
+    )
+
+    # Also listen to upload and stop_recording events
+    audio_file.upload(
+        fn=lambda audio, used_size: get_audio_size_info(used_size, audio),
+        inputs=[audio_file, used_audio_size],
+        outputs=[audio_size_info]
    )
+    audio_file.stop_recording(
+        fn=lambda audio, used_size: get_audio_size_info(used_size, audio),
+        inputs=[audio_file, used_audio_size],
+        outputs=[audio_size_info]
     )
 
 if __name__ == "__main__":
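Why the limit is measured after WAV conversion rather than on the uploaded file: get_wav_size decodes the clip with pydub and re-exports it as WAV, which is what actually gets base64-encoded and sent to the API. A minimal sketch of the same measurement, assuming only pydub and the standard library (the sample path is hypothetical):

import io
from pydub import AudioSegment

def wav_size(path):
    # Decode whatever container the user uploaded, then re-encode as WAV
    # and measure the encoded bytes - mirrors get_wav_size in this commit.
    audio = AudioSegment.from_file(path)
    buf = io.BytesIO()
    audio.export(buf, format="wav")
    return len(buf.getvalue())

print(wav_size("clip.mp3"))  # hypothetical file; the PCM WAV is typically much larger than the MP3

A compressed upload can expand several-fold as PCM, so checking os.path.getsize on the original file would under-count against the 10MB budget.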
 
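chat() now threads a running byte count through the conversation and rejects a turn before calling the API when the new clip would push the total past MAX_AUDIO_SIZE_BYTES. Condensed to a pure function (check_fits is a hypothetical name; the commit inlines this logic):

MAX_AUDIO_SIZE_BYTES = 10 * 1024 * 1024

def check_fits(used_bytes, new_bytes):
    # Mirrors the guard in chat(): a turn is rejected only when the
    # running total would exceed the cap (total_size > MAX_AUDIO_SIZE_BYTES).
    return used_bytes + new_bytes <= MAX_AUDIO_SIZE_BYTES

assert check_fits(0, 4 * 1024 * 1024)                    # first 4MB clip fits
assert not check_fits(8 * 1024 * 1024, 3 * 1024 * 1024)  # 8MB used + 3MB is rejected

Note that the counter only advances (new_used_audio_size = used_audio_size + current_audio_size) once the turn is accepted, so a rejected upload does not consume budget.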
 
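The debug log no longer prints a bare byte count for base64 audio payloads; it formats the length as B/KB/MB. Factored out of the inlined branch, the formatting rule is (format_bytes is a hypothetical helper name):

def format_bytes(n):
    # Same thresholds as the commit's inlined debug formatting.
    if n >= 1024 * 1024:
        return f"{n / (1024 * 1024):.2f} MB"
    elif n >= 1024:
        return f"{n / 1024:.2f} KB"
    return f"{n} B"

print(format_bytes(512))      # 512 B
print(format_bytes(2048))     # 2.00 KB
print(format_bytes(3145728))  # 3.00 MB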
 
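Every yield in chat() grows from one value to three: the chat history, the updated byte counter, and the human-readable usage string. This matches the new outputs=[chatbot, used_audio_size, audio_size_info] wiring; a Gradio generator must yield one value per declared output on every yield, including early exits and error paths, which is why the commit touches every yield site. A stripped-down sketch of that contract (names are illustrative, not the app's):

def chat_sketch(history, used_audio_size):
    history = (history or []) + [{"role": "assistant", "content": "⏳ ..."}]
    new_used = used_audio_size  # would grow by the accepted clip's WAV size
    info = f"{new_used / (1024 * 1024):.2f}MB used"
    yield history, new_used, info  # placeholder frame: all three outputs
    history[-1]["content"] = "done"
    yield history, new_used, info  # final frame: still all three outputs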
 
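The streaming handler buffers tokens and, once </think> appears, splits the buffer into hidden reasoning and the visible answer, stripping a leading <think> tag. The core split, isolated from the streaming loop (the sample buffer is made up):

buffer = "<think>inspect the audio first</think>The clip says hello."
if "</think>" in buffer:
    think_content, response_content = buffer.split("</think>", 1)
    if think_content.startswith("<think>"):
        think_content = think_content[len("<think>"):].strip()
print(think_content)     # inspect the audio first
print(response_content)  # The clip says hello.

split("</think>", 1) caps the split at one, so a literal tag appearing later in the answer text is left intact.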
 
 
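The running byte count lives in gr.State, so it persists across turns within one session without a module-level global, and it is both an input to and an output of the chat handler. A minimal, self-contained sketch of the same round-trip pattern, assuming standard Gradio APIs (component names are illustrative, not the app's):

import gradio as gr

MB = 1024 * 1024

with gr.Blocks() as demo:
    used = gr.State(value=0)        # like used_audio_size = gr.State(value=0)
    info = gr.Markdown("0.00MB used")
    btn = gr.Button("Consume 1MB")

    def bump(used_bytes):
        # State arrives as a plain Python value and goes back out updated.
        used_bytes += MB
        return used_bytes, f"{used_bytes / MB:.2f}MB used"

    btn.click(fn=bump, inputs=[used], outputs=[used, info])

if __name__ == "__main__":
    demo.launch()

In the commit, the clear handler resets this state to 0 alongside the chatbot, and the change/upload/stop_recording listeners recompute the preview string whenever the pending clip changes.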