moevis committed on
Commit
621db7d
·
verified ·
1 Parent(s): e6f110c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +500 -134
app.py CHANGED
@@ -15,6 +15,17 @@ import httpx
15
  API_BASE_URL = os.getenv("API_BASE_URL", "http://localhost:9999/v1")
16
  MODEL_NAME = os.getenv("MODEL_NAME", "Step-Audio-R1")
17
 
 
 
 
 
 
 
 
 
 
 
 
18
  def process_audio(audio_path):
19
  """
20
  Process audio: convert to wav, split if > 25s.
@@ -63,19 +74,28 @@ def format_messages(system, history, user_text, audio_data_list=None):
63
 
64
  # 处理历史记录
65
  for item in history:
66
- # Filter out thinking process messages
67
- metadata = item.get("metadata") if isinstance(item, dict) else getattr(item, "metadata", None)
68
- if metadata and isinstance(metadata, dict) and metadata.get("title") == "⏳ Thinking Process":
69
- continue
70
-
71
  role = item.get("role") if isinstance(item, dict) else getattr(item, "role", None)
72
  content = item.get("content") if isinstance(item, dict) else getattr(item, "content", None)
73
 
74
  if not role or content is None:
75
  continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
 
77
  # Check for Audio
78
- is_audio = not isinstance(content, list) and content.get("component", None) == "audio"
79
 
80
  if is_audio:
81
  audio_path = content["value"]["path"]
@@ -97,11 +117,14 @@ def format_messages(system, history, user_text, audio_data_list=None):
97
  elif isinstance(content, str):
98
  messages.append({"role": role, "content": content})
99
  elif isinstance(content, list):
100
- # Assume it's already a list of parts or mixed
101
- safe_content = []
 
 
 
102
  for c in content:
103
  # Check for Audio in list
104
- is_c_audio = c.get('component', None) == "audio"
105
 
106
  if is_c_audio:
107
  audio_path = c["value"]["path"]
@@ -109,7 +132,7 @@ def format_messages(system, history, user_text, audio_data_list=None):
109
  try:
110
  item_audio_data_list = process_audio(audio_path)
111
  for audio_data in item_audio_data_list:
112
- safe_content.append({
113
  "type": "input_audio",
114
  "input_audio": {
115
  "data": audio_data,
@@ -118,15 +141,31 @@ def format_messages(system, history, user_text, audio_data_list=None):
118
  })
119
  except Exception as e:
120
  print(f"[ERROR] Failed to process history audio in list: {e}")
121
- elif isinstance(c, dict):
122
- safe_content.append(c)
123
  elif isinstance(c, str):
124
- safe_content.append({"type": "text", "text": c})
125
- messages.append({"role": role, "content": safe_content})
 
 
 
 
 
 
 
 
 
 
 
 
126
 
127
- # 添加当前用户消息
128
  if user_text and audio_data_list:
129
  content = []
 
 
 
 
 
 
130
  for audio_data in audio_data_list:
131
  content.append({
132
  "type": "input_audio",
@@ -135,10 +174,6 @@ def format_messages(system, history, user_text, audio_data_list=None):
135
  "format": "wav"
136
  }
137
  })
138
- content.append({
139
- "type": "text",
140
- "text": user_text
141
- })
142
 
143
  messages.append({
144
  "role": "user",
@@ -148,10 +183,6 @@ def format_messages(system, history, user_text, audio_data_list=None):
148
  messages.append({"role": "user", "content": user_text})
149
  elif audio_data_list:
150
  content = []
151
- messages.append({
152
- "role": "user",
153
- "content": content
154
- })
155
  for audio_data in audio_data_list:
156
  content.append({
157
  "type": "input_audio",
@@ -160,17 +191,21 @@ def format_messages(system, history, user_text, audio_data_list=None):
160
  "format": "wav"
161
  }
162
  })
 
 
 
 
163
 
164
  return messages
165
 
166
- def chat(system_prompt, user_text, audio_file, history, max_tokens, temperature, top_p, model_name=None):
167
  """Chat function"""
168
  # If model is not specified, use global configuration
169
  if model_name is None:
170
  model_name = MODEL_NAME
171
 
172
  if not user_text and not audio_file:
173
- yield history or [], "Please enter text or upload audio"
174
  return
175
 
176
  # Ensure history is a list and formatted correctly
@@ -191,7 +226,7 @@ def chat(system_prompt, user_text, audio_file, history, max_tokens, temperature,
191
 
192
  messages = format_messages(system_prompt, history, user_text, audio_data_list)
193
  if not messages:
194
- yield history or [], "Invalid input"
195
  return
196
 
197
  # Debug: Print message format
@@ -218,149 +253,474 @@ def chat(system_prompt, user_text, audio_file, history, max_tokens, temperature,
218
 
219
  print(f"[DEBUG] Messages to API: {json.dumps(debug_messages, ensure_ascii=False, indent=2)}")
220
 
221
- # Update history with user message immediately
222
- if audio_file:
223
- # 1. Add audio message
 
 
224
  history.append({"role": "user", "content": gr.Audio(audio_file)})
225
-
226
- # 2. If text exists, add text message
227
- if user_text:
228
- history.append({"role": "user", "content": user_text})
229
- else:
230
  # Text only
231
  history.append({"role": "user", "content": user_text})
 
 
 
232
 
233
  # Add thinking placeholder
234
- history.append(gr.ChatMessage(
235
- role="assistant",
236
- content="",
237
- metadata={"title": "⏳ Thinking Process"}
238
- ))
239
-
240
- yield history, "Generating..."
 
 
 
 
 
 
 
 
 
 
241
 
242
  try:
243
- with httpx.Client(base_url=API_BASE_URL, timeout=120) as client:
244
- # Use client.stream for better streaming control
245
- with client.stream("POST", "/chat/completions", json={
246
  "model": model_name,
247
  "messages": messages,
248
  "max_tokens": max_tokens,
249
  "temperature": temperature,
250
  "top_p": top_p,
251
  "stream": True,
252
- "repetition_penalty": 1.07,
253
  "stop_token_ids": [151665]
254
- }) as response:
255
-
256
- if response.status_code != 200:
257
- error_msg = f"❌ API Error {response.status_code}"
258
- if response.status_code == 404:
259
- error_msg += " - vLLM service not ready"
260
- elif response.status_code == 400:
261
- error_msg += " - Bad request"
262
- elif response.status_code == 500:
263
- error_msg += " - Model error"
264
- yield history, error_msg
265
- return
266
-
267
- # Process streaming response
268
- buffer = ""
269
- is_thinking = True
270
-
271
- print("[DEBUG] Start receiving stream...")
272
- for line in response.iter_lines():
273
- if not line:
274
- continue
275
- # Ensure line is string format
276
- if isinstance(line, bytes):
277
- line = line.decode('utf-8')
278
- else:
279
- line = str(line)
280
 
281
- if line.startswith('data: '):
282
- data_str = line[6:]
283
- if data_str.strip() == '[DONE]':
284
- print("[DEBUG] Stream finished [DONE]")
285
- break
286
- try:
287
- data = json.loads(data_str)
288
- if 'choices' in data and len(data['choices']) > 0:
289
- delta = data['choices'][0].get('delta', {})
290
- if 'content' in delta:
291
- content = delta['content']
292
- buffer += content
293
-
294
- if is_thinking:
295
- if "</think>" in buffer:
296
- is_thinking = False
297
- parts = buffer.split("</think>", 1)
298
- think_content = parts[0]
299
- response_content = parts[1]
300
-
301
- if think_content.startswith("<think>"):
302
- think_content = think_content[len("<think>"):].strip()
303
-
304
- # Update thinking message
305
- history[-1].content = think_content
306
-
307
- # Add response message
308
- history.append({"role": "assistant", "content": response_content})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
309
  else:
310
- # Update thinking message
 
 
 
 
311
  current_think = buffer
312
  if current_think.startswith("<think>"):
313
- current_think = current_think[len("<think>"):]
314
- history[-1].content = current_think
315
- else:
316
- # Already split, just update response message
317
- parts = buffer.split("</think>", 1)
318
- response_content = parts[1]
319
- history[-1]["content"] = response_content
 
 
 
 
 
 
 
320
 
321
- yield history, ""
 
322
 
323
- except json.JSONDecodeError:
324
- continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
325
 
326
  except httpx.ConnectError:
327
- yield history, "❌ Cannot connect to vLLM API"
 
328
  except Exception as e:
329
- yield history, f"❌ Error: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
330
 
331
  # Gradio Interface
332
- with gr.Blocks(title="Step Audio R1") as demo:
333
- gr.Markdown("# Step Audio R1 Chat")
 
 
 
 
 
 
 
 
 
 
 
334
 
335
  with gr.Row():
336
- # Left Configuration
337
- with gr.Column(scale=1):
338
- with gr.Accordion("Configuration", open=True):
 
339
  system_prompt = gr.Textbox(
340
  label="System Prompt",
341
  lines=2,
342
- value="你是一个语音助手,你有非常丰富的音频处理经验。"
 
 
343
  )
344
- max_tokens = gr.Slider(1, 7192, value=1024, label="Max Tokens")
345
- temperature = gr.Slider(0.0, 2.0, value=0.7, label="Temperature")
346
- top_p = gr.Slider(0.0, 1.0, value=0.9, label="Top P")
347
-
348
- status = gr.Textbox(label="Status", interactive=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
349
 
350
- # Right Chat
351
  with gr.Column(scale=2):
352
- chatbot = gr.Chatbot(label="Chat History", height=450)
353
- user_text = gr.Textbox(label="Input", lines=2, placeholder="Enter message...")
354
- audio_file = gr.Audio(label="Audio", type="filepath", sources=["microphone", "upload"])
355
-
356
- with gr.Row():
357
- submit_btn = gr.Button("Send", variant="primary", scale=2)
358
- clear_btn = gr.Button("Clear", scale=1)
 
 
359
 
360
  submit_btn.click(
361
  fn=chat,
362
- inputs=[system_prompt, user_text, audio_file, chatbot, max_tokens, temperature, top_p],
363
- outputs=[chatbot, status]
364
  )
365
 
366
  clear_btn.click(
@@ -372,9 +732,15 @@ if __name__ == "__main__":
372
  import argparse
373
  parser = argparse.ArgumentParser()
374
  parser.add_argument("--host", default="0.0.0.0")
375
- parser.add_argument("--port", type=int, default=7860)
376
  parser.add_argument("--model", default=MODEL_NAME)
377
  args = parser.parse_args()
 
 
 
 
 
 
378
 
379
  # 更新全局模型名称
380
  if args.model:
 
15
  API_BASE_URL = os.getenv("API_BASE_URL", "http://localhost:9999/v1")
16
  MODEL_NAME = os.getenv("MODEL_NAME", "Step-Audio-R1")
17
 
18
def escape_html(text):
    """Escape HTML special characters in ``text`` to prevent XSS.

    The characters ``&``, ``<``, ``>``, ``"`` and ``'`` are replaced with
    ``&amp;``, ``&lt;``, ``&gt;``, ``&quot;`` and ``&#x27;`` respectively.

    Args:
        text: The value to escape.

    Returns:
        The escaped string when ``text`` is a ``str``; otherwise ``text``
        itself, returned unchanged.
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    import html

    # Non-string values are passed through untouched.
    if not isinstance(text, str):
        return text

    # quote=True also escapes both quote characters, producing exactly the
    # entities the previous hand-written replace chain emitted.
    return html.escape(text, quote=True)
28
+
29
  def process_audio(audio_path):
30
  """
31
  Process audio: convert to wav, split if > 25s.
 
74
 
75
  # 处理历史记录
76
  for item in history:
 
 
 
 
 
77
  role = item.get("role") if isinstance(item, dict) else getattr(item, "role", None)
78
  content = item.get("content") if isinstance(item, dict) else getattr(item, "content", None)
79
 
80
  if not role or content is None:
81
  continue
82
+
83
+ # If content contains thinking process (with thinking-block div), extract only the response part
84
+ if role == "assistant" and isinstance(content, str) and '<div class="thinking-block">' in content:
85
+ # Find the end of the thinking block and extract what comes after
86
+ # Match the entire thinking block
87
+ pattern = r'<div class="thinking-block">.*?</div>\s*</div>\s*'
88
+ remaining_content = re.sub(pattern, '', content, flags=re.DOTALL).strip()
89
+
90
+ # If there's meaningful content after the thinking block, use it
91
+ if remaining_content and not remaining_content.startswith('<'):
92
+ content = remaining_content
93
+ else:
94
+ # Still in thinking phase or no response yet, skip
95
+ continue
96
 
97
  # Check for Audio
98
+ is_audio = isinstance(content, dict) and content.get("component") == "audio"
99
 
100
  if is_audio:
101
  audio_path = content["value"]["path"]
 
117
  elif isinstance(content, str):
118
  messages.append({"role": role, "content": content})
119
  elif isinstance(content, list):
120
+ # Process list items and ensure text comes before audio
121
+ text_items = []
122
+ audio_items = []
123
+ other_items = []
124
+
125
  for c in content:
126
  # Check for Audio in list
127
+ is_c_audio = isinstance(c, dict) and c.get('component') == "audio"
128
 
129
  if is_c_audio:
130
  audio_path = c["value"]["path"]
 
132
  try:
133
  item_audio_data_list = process_audio(audio_path)
134
  for audio_data in item_audio_data_list:
135
+ audio_items.append({
136
  "type": "input_audio",
137
  "input_audio": {
138
  "data": audio_data,
 
141
  })
142
  except Exception as e:
143
  print(f"[ERROR] Failed to process history audio in list: {e}")
 
 
144
  elif isinstance(c, str):
145
+ text_items.append({"type": "text", "text": c})
146
+ elif isinstance(c, dict):
147
+ # Distinguish between text and audio types
148
+ if c.get("type") == "text":
149
+ text_items.append(c)
150
+ elif c.get("type") == "input_audio":
151
+ audio_items.append(c)
152
+ else:
153
+ other_items.append(c)
154
+
155
+ # Combine: text first, then audio, then others
156
+ safe_content = text_items + audio_items + other_items
157
+ if safe_content:
158
+ messages.append({"role": role, "content": safe_content})
159
 
160
+ # 添加当前用户消息(文本在前,音频在后)
161
  if user_text and audio_data_list:
162
  content = []
163
+ # 先添加文本
164
+ content.append({
165
+ "type": "text",
166
+ "text": user_text
167
+ })
168
+ # 再添加音频
169
  for audio_data in audio_data_list:
170
  content.append({
171
  "type": "input_audio",
 
174
  "format": "wav"
175
  }
176
  })
 
 
 
 
177
 
178
  messages.append({
179
  "role": "user",
 
183
  messages.append({"role": "user", "content": user_text})
184
  elif audio_data_list:
185
  content = []
 
 
 
 
186
  for audio_data in audio_data_list:
187
  content.append({
188
  "type": "input_audio",
 
191
  "format": "wav"
192
  }
193
  })
194
+ messages.append({
195
+ "role": "user",
196
+ "content": content
197
+ })
198
 
199
  return messages
200
 
201
+ def chat(system_prompt, user_text, audio_file, history, max_tokens, temperature, top_p, show_thinking=True, model_name=None):
202
  """Chat function"""
203
  # If model is not specified, use global configuration
204
  if model_name is None:
205
  model_name = MODEL_NAME
206
 
207
  if not user_text and not audio_file:
208
+ yield history or []
209
  return
210
 
211
  # Ensure history is a list and formatted correctly
 
226
 
227
  messages = format_messages(system_prompt, history, user_text, audio_data_list)
228
  if not messages:
229
+ yield history or []
230
  return
231
 
232
  # Debug: Print message format
 
253
 
254
  print(f"[DEBUG] Messages to API: {json.dumps(debug_messages, ensure_ascii=False, indent=2)}")
255
 
256
+ # Update history with user message immediately (text first, then audio)
257
+ if user_text and audio_file:
258
+ # 1. Add text message first
259
+ history.append({"role": "user", "content": user_text})
260
+ # 2. Add audio message second
261
  history.append({"role": "user", "content": gr.Audio(audio_file)})
262
+ elif user_text:
 
 
 
 
263
  # Text only
264
  history.append({"role": "user", "content": user_text})
265
+ elif audio_file:
266
+ # Audio only
267
+ history.append({"role": "user", "content": gr.Audio(audio_file)})
268
 
269
  # Add thinking placeholder
270
+ if show_thinking:
271
+ history.append({
272
+ "role": "assistant",
273
+ "content": (
274
+ '<div class="thinking-block">\n'
275
+ '<div class="thinking-header">💭 Thinking...</div>\n'
276
+ '<div class="thinking-content">Processing your request...</div>\n'
277
+ '</div>'
278
+ )
279
+ })
280
+ yield history
281
+ else:
282
+ history.append({
283
+ "role": "assistant",
284
+ "content": "⏳ Generating response..."
285
+ })
286
+ yield history
287
 
288
  try:
289
+ # 禁用代理以访问内网 API
290
+ with httpx.Client(base_url=API_BASE_URL, timeout=120, proxies={}) as client:
291
+ response = client.post("/chat/completions", json={
292
  "model": model_name,
293
  "messages": messages,
294
  "max_tokens": max_tokens,
295
  "temperature": temperature,
296
  "top_p": top_p,
297
  "stream": True,
298
+ "repetition_penalty": 1.0,
299
  "stop_token_ids": [151665]
300
+ })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
301
 
302
+ if response.status_code != 200:
303
+ error_msg = f"❌ API Error {response.status_code}"
304
+ if response.status_code == 404:
305
+ error_msg += " - vLLM service not ready"
306
+ elif response.status_code == 400:
307
+ error_msg += " - Bad request"
308
+ elif response.status_code == 500:
309
+ error_msg += " - Model error"
310
+ # Update the last message with error
311
+ history[-1]["content"] = error_msg
312
+ yield history
313
+ return
314
+
315
+ # Process streaming response
316
+ buffer = ""
317
+ is_thinking = True
318
+
319
+ for line in response.iter_lines():
320
+ if not line:
321
+ continue
322
+ # Ensure line is string format
323
+ if isinstance(line, bytes):
324
+ line = line.decode('utf-8')
325
+ else:
326
+ line = str(line)
327
+
328
+ if line.startswith('data: '):
329
+ data_str = line[6:]
330
+ if data_str.strip() == '[DONE]':
331
+ break
332
+ try:
333
+ data = json.loads(data_str)
334
+ if 'choices' in data and len(data['choices']) > 0:
335
+ delta = data['choices'][0].get('delta', {})
336
+ if 'content' in delta:
337
+ content = delta['content']
338
+ buffer += content
339
+
340
+ if is_thinking:
341
+ if "</think>" in buffer:
342
+ is_thinking = False
343
+ parts = buffer.split("</think>", 1)
344
+ think_content = parts[0]
345
+ response_content = parts[1]
346
+
347
+ if think_content.startswith("<think>"):
348
+ think_content = think_content[len("<think>"):].strip()
349
+
350
+ if show_thinking:
351
+ # Format thinking with custom styled block (escape HTML for safety)
352
+ escaped_think = escape_html(think_content)
353
+ formatted_content = (
354
+ f'<div class="thinking-block">\n'
355
+ f'<div class="thinking-header">💭 Thinking Process</div>\n'
356
+ f'<div class="thinking-content">{escaped_think}</div>\n'
357
+ f'</div>\n\n'
358
+ f'{response_content}'
359
+ )
360
+ history[-1]["content"] = formatted_content
361
  else:
362
+ # Don't show thinking, replace with response message directly
363
+ history[-1]["content"] = response_content
364
+ else:
365
+ # Update thinking message with collapsible format (only if showing)
366
+ if show_thinking:
367
  current_think = buffer
368
  if current_think.startswith("<think>"):
369
+ current_think = current_think[len("<think>"):].strip()
370
+ escaped_think = escape_html(current_think)
371
+ formatted_content = (
372
+ f'<div class="thinking-block">\n'
373
+ f'<div class="thinking-header">💭 Thinking...</div>\n'
374
+ f'<div class="thinking-content">{escaped_think}</div>\n'
375
+ f'</div>'
376
+ )
377
+ history[-1]["content"] = formatted_content
378
+ else:
379
+ # Already split, update the combined message
380
+ parts = buffer.split("</think>", 1)
381
+ think_content = parts[0]
382
+ response_content = parts[1]
383
 
384
+ if think_content.startswith("<think>"):
385
+ think_content = think_content[len("<think>"):].strip()
386
 
387
+ if show_thinking:
388
+ # Update with formatted thinking + response
389
+ escaped_think = escape_html(think_content)
390
+ formatted_content = (
391
+ f'<div class="thinking-block">\n'
392
+ f'<div class="thinking-header">💭 Thinking Process</div>\n'
393
+ f'<div class="thinking-content">{escaped_think}</div>\n'
394
+ f'</div>\n\n'
395
+ f'{response_content}'
396
+ )
397
+ history[-1]["content"] = formatted_content
398
+ else:
399
+ # Only show response
400
+ history[-1]["content"] = response_content
401
+
402
+ yield history
403
+
404
+ except json.JSONDecodeError:
405
+ continue
406
 
407
  except httpx.ConnectError:
408
+ history[-1]["content"] = "❌ Cannot connect to vLLM API"
409
+ yield history
410
  except Exception as e:
411
+ history[-1]["content"] = f"❌ Error: {str(e)}"
412
+ yield history
413
+
414
+ # Custom CSS for better UI
415
+ custom_css = """
416
+ /* 全局样式 */
417
+ .gradio-container {
418
+ max-width: 100% !important;
419
+ font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
420
+ }
421
+
422
+ /* 标题样式 */
423
+ .app-header {
424
+ text-align: center;
425
+ padding: 2.5rem 1.5rem;
426
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
427
+ position: relative;
428
+ overflow: hidden;
429
+ border-radius: 16px;
430
+ margin-bottom: 1.5rem;
431
+ box-shadow: 0 8px 24px rgba(102, 126, 234, 0.35);
432
+ }
433
+
434
+ /* 标题背景装饰 */
435
+ .app-header::before {
436
+ content: '';
437
+ position: absolute;
438
+ top: -50%;
439
+ right: -50%;
440
+ width: 200%;
441
+ height: 200%;
442
+ background: radial-gradient(circle, rgba(255, 255, 255, 0.1) 0%, transparent 70%);
443
+ animation: rotate 20s linear infinite;
444
+ }
445
+
446
+ @keyframes rotate {
447
+ from { transform: rotate(0deg); }
448
+ to { transform: rotate(360deg); }
449
+ }
450
+
451
+ .app-header h1 {
452
+ margin: 0;
453
+ font-size: 2.8rem;
454
+ font-weight: 700;
455
+ color: white !important;
456
+ text-shadow: 0 3px 6px rgba(0, 0, 0, 0.25);
457
+ letter-spacing: 1px;
458
+ position: relative;
459
+ z-index: 1;
460
+ }
461
+
462
+ .app-header p {
463
+ color: rgba(255, 255, 255, 0.95) !important;
464
+ text-shadow: 0 2px 4px rgba(0, 0, 0, 0.2);
465
+ position: relative;
466
+ z-index: 1;
467
+ line-height: 1.5;
468
+ }
469
+
470
+ /* 聊天框样式 */
471
+ .chatbot-container {
472
+ border-radius: 12px;
473
+ box-shadow: 0 2px 8px rgba(0, 0, 0, 0.08);
474
+ overflow: hidden;
475
+ }
476
+
477
+ /* 思考过程样式 - 模仿Claude/ChatGPT的风格 */
478
+ .thinking-block {
479
+ background: linear-gradient(135deg, #f5f7fa 0%, #eef2f7 100%);
480
+ border-left: 4px solid #667eea;
481
+ padding: 16px 20px;
482
+ margin: 12px 0;
483
+ border-radius: 8px;
484
+ box-shadow: 0 2px 4px rgba(0, 0, 0, 0.05);
485
+ }
486
+
487
+ .thinking-header {
488
+ display: flex;
489
+ align-items: center;
490
+ font-weight: 600;
491
+ color: #667eea;
492
+ margin-bottom: 10px;
493
+ font-size: 0.95rem;
494
+ }
495
+
496
+ .thinking-content {
497
+ background: #ffffff;
498
+ padding: 12px 16px;
499
+ border-radius: 6px;
500
+ font-family: 'SF Mono', Monaco, 'Cascadia Code', 'Roboto Mono', Consolas, 'Courier New', monospace;
501
+ font-size: 0.9rem;
502
+ line-height: 1.6;
503
+ color: #374151;
504
+ white-space: pre-wrap;
505
+ word-wrap: break-word;
506
+ border: 1px solid #e5e7eb;
507
+ }
508
+
509
+ /* 回复分隔线 */
510
+ .response-divider {
511
+ border: none;
512
+ height: 2px;
513
+ background: linear-gradient(to right, transparent, #e5e7eb, transparent);
514
+ margin: 20px 0;
515
+ }
516
+
517
+ /* 按钮样式 */
518
+ .primary-btn {
519
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
520
+ border: none !important;
521
+ transition: all 0.3s ease !important;
522
+ }
523
+
524
+ .primary-btn:hover {
525
+ transform: translateY(-2px);
526
+ box-shadow: 0 4px 12px rgba(102, 126, 234, 0.4) !important;
527
+ }
528
+
529
+ /* 左侧面板样式 */
530
+ .left-panel {
531
+ background: #f9fafb;
532
+ border-radius: 12px;
533
+ padding: 1rem;
534
+ height: 100%;
535
+ }
536
+
537
+ /* 输入框样式 */
538
+ .input-box textarea {
539
+ border-radius: 8px !important;
540
+ border: 2px solid #e5e7eb !important;
541
+ transition: border-color 0.3s ease !important;
542
+ }
543
+
544
+ .input-box textarea:focus {
545
+ border-color: #667eea !important;
546
+ box-shadow: 0 0 0 3px rgba(102, 126, 234, 0.1) !important;
547
+ }
548
+
549
+ /* 输入区域标题 */
550
+ h3 {
551
+ color: #374151;
552
+ font-size: 1.1rem;
553
+ margin: 1rem 0 0.5rem 0;
554
+ }
555
+
556
+ /* 聊天消息样式优化 */
557
+ .message-wrap {
558
+ padding: 1rem !important;
559
+ }
560
+
561
+ .message {
562
+ padding: 1rem !important;
563
+ border-radius: 12px !important;
564
+ line-height: 1.6 !important;
565
+ }
566
+
567
+ /* 用户消息 */
568
+ .message.user {
569
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
570
+ color: white !important;
571
+ }
572
+
573
+ /* 助手消息 */
574
+ .message.bot {
575
+ background: #f9fafb !important;
576
+ border: 1px solid #e5e7eb !important;
577
+ }
578
+
579
+ /* 左侧面板整体样式 */
580
+ .left-column {
581
+ background: linear-gradient(to bottom, #ffffff 0%, #f9fafb 100%);
582
+ border-radius: 12px;
583
+ padding: 1rem;
584
+ box-shadow: 0 2px 8px rgba(0, 0, 0, 0.05);
585
+ }
586
+
587
+ /* 按钮容器样式 */
588
+ .button-row {
589
+ margin-top: 1rem;
590
+ gap: 0.5rem;
591
+ }
592
+
593
+ /* 滚动条美化 */
594
+ ::-webkit-scrollbar {
595
+ width: 8px;
596
+ height: 8px;
597
+ }
598
+
599
+ ::-webkit-scrollbar-track {
600
+ background: #f1f1f1;
601
+ border-radius: 4px;
602
+ }
603
+
604
+ ::-webkit-scrollbar-thumb {
605
+ background: #888;
606
+ border-radius: 4px;
607
+ }
608
+
609
+ ::-webkit-scrollbar-thumb:hover {
610
+ background: #555;
611
+ }
612
+ """
613
 
614
  # Gradio Interface
615
+ with gr.Blocks(title="Step Audio R1", css=custom_css, theme=gr.themes.Soft()) as demo:
616
+ # Header
617
+ gr.HTML("""
618
+ <div class="app-header">
619
+ <h1 style="color: white;">🔊 Step-Audio-R1</h1>
620
+ <p style="color: white; margin: 0.8rem 0 0 0; opacity: 0.95; font-size: 1.15rem; font-weight: 500;">
621
+ Advanced Audio-Language Model with Reasoning
622
+ </p>
623
+ <p style="color: white; margin: 0.5rem 0 0 0; opacity: 0.85; font-size: 0.95rem;">
624
+ Comprehensive audio understanding: Speech, Sound, Music & Lyrics
625
+ </p>
626
+ </div>
627
+ """)
628
 
629
  with gr.Row():
630
+ # Left Panel - Input Area
631
+ with gr.Column(scale=1, min_width=350):
632
+ # Configuration
633
+ with gr.Accordion("⚙️ Configuration", open=False):
634
  system_prompt = gr.Textbox(
635
  label="System Prompt",
636
  lines=2,
637
+ value="You are a voice assistant with extensive experience in audio processing.",
638
+ placeholder="Enter system prompt...",
639
+ elem_classes=["input-box"]
640
  )
641
+
642
+ max_tokens = gr.Slider(
643
+ 1, 7192,
644
+ value=6400,
645
+ label="Max Tokens",
646
+ info="Maximum tokens to generate"
647
+ )
648
+ temperature = gr.Slider(
649
+ 0.0, 2.0,
650
+ value=0.7,
651
+ label="Temperature",
652
+ info="Higher = more random"
653
+ )
654
+ top_p = gr.Slider(
655
+ 0.0, 1.0,
656
+ value=0.9,
657
+ label="Top P",
658
+ info="Nucleus sampling"
659
+ )
660
+ show_thinking = gr.Checkbox(
661
+ label="💭 Show Thinking Process",
662
+ value=True,
663
+ info="Display reasoning steps"
664
+ )
665
+
666
+ # Input Area
667
+ gr.Markdown("### 📝 Your Input")
668
+ user_text = gr.Textbox(
669
+ label="Text Message",
670
+ lines=4,
671
+ placeholder="Type your message here...",
672
+ elem_classes=["input-box"],
673
+ show_label=False
674
+ )
675
+
676
+ audio_file = gr.Audio(
677
+ label="🎤 Audio Input",
678
+ type="filepath",
679
+ sources=["microphone", "upload"],
680
+ show_label=True
681
+ )
682
+
683
+ # Buttons
684
+ with gr.Row():
685
+ clear_btn = gr.Button("🗑️ Clear", scale=1, size="lg")
686
+ submit_btn = gr.Button(
687
+ "🚀 Send",
688
+ variant="primary",
689
+ scale=2,
690
+ size="lg",
691
+ elem_classes=["primary-btn"]
692
+ )
693
+
694
+ # Usage Guide at bottom
695
+ with gr.Accordion("📖 Quick Guide", open=False):
696
+ gr.Markdown("""
697
+ **Usage:**
698
+ - Type text, upload audio, or both
699
+ - Audio > 25s auto-splits
700
+ - Toggle thinking process display
701
+
702
+ **Tips:**
703
+ - Thinking shown in blue gradient block
704
+ - History auto-cleaned for API
705
+ - Adjust params in Configuration
706
+ """)
707
 
708
+ # Right Panel - Conversation Area
709
  with gr.Column(scale=2):
710
+ chatbot = gr.Chatbot(
711
+ label="💬 Conversation",
712
+ height=700,
713
+ type="messages",
714
+ elem_classes=["chatbot-container"],
715
+ show_label=True,
716
+ avatar_images=(None, None),
717
+ bubble_full_width=False
718
+ )
719
 
720
  submit_btn.click(
721
  fn=chat,
722
+ inputs=[system_prompt, user_text, audio_file, chatbot, max_tokens, temperature, top_p, show_thinking],
723
+ outputs=[chatbot]
724
  )
725
 
726
  clear_btn.click(
 
732
  import argparse
733
  parser = argparse.ArgumentParser()
734
  parser.add_argument("--host", default="0.0.0.0")
735
+ parser.add_argument("--port", type=int, default=6008)
736
  parser.add_argument("--model", default=MODEL_NAME)
737
  args = parser.parse_args()
738
+ import os
739
+ # 取消代理设置
740
+ os.environ.update({
741
+ 'http_proxy': '', 'https_proxy': '', 'all_proxy': '',
742
+ 'HTTP_PROXY': '', 'HTTPS_PROXY': '', 'ALL_PROXY': ''
743
+ })
744
 
745
  # 更新全局模型名称
746
  if args.model: