moevis committed
Commit e6f110c · verified · 1 Parent(s): d78daff

Update app.py

Files changed (1)
app.py +72 -69
app.py CHANGED
@@ -75,7 +75,7 @@ def format_messages(system, history, user_text, audio_data_list=None):
                 continue
 
             # Check for Audio
-            is_audio = not isinstance(content, list) and content["component"] == "audio"
+            is_audio = not isinstance(content, list) and content.get("component", None) == "audio"
 
             if is_audio:
                 audio_path = content["value"]["path"]
@@ -241,7 +241,8 @@ def chat(system_prompt, user_text, audio_file, history, max_tokens, temperature,
 
     try:
         with httpx.Client(base_url=API_BASE_URL, timeout=120) as client:
-            response = client.post("/chat/completions", json={
+            # Use client.stream for better streaming control
+            with client.stream("POST", "/chat/completions", json={
                 "model": model_name,
                 "messages": messages,
                 "max_tokens": max_tokens,
@@ -250,75 +251,77 @@ def chat(system_prompt, user_text, audio_file, history, max_tokens, temperature,
                 "stream": True,
                 "repetition_penalty": 1.07,
                 "stop_token_ids": [151665]
-            })
-
-            if response.status_code != 200:
-                error_msg = f"❌ API Error {response.status_code}"
-                if response.status_code == 404:
-                    error_msg += " - vLLM service not ready"
-                elif response.status_code == 400:
-                    error_msg += " - Bad request"
-                elif response.status_code == 500:
-                    error_msg += " - Model error"
-                yield history, error_msg
-                return
-
-            # Process streaming response
-            buffer = ""
-            is_thinking = True
-
-            for line in response.iter_lines():
-                if not line:
-                    continue
-                # Ensure line is string format
-                if isinstance(line, bytes):
-                    line = line.decode('utf-8')
-                else:
-                    line = str(line)
-
-                if line.startswith('data: '):
-                    data_str = line[6:]
-                    if data_str.strip() == '[DONE]':
-                        break
-                    try:
-                        data = json.loads(data_str)
-                        if 'choices' in data and len(data['choices']) > 0:
-                            delta = data['choices'][0].get('delta', {})
-                            if 'content' in delta:
-                                content = delta['content']
-                                buffer += content
-
-                                if is_thinking:
-                                    if "</think>" in buffer:
-                                        is_thinking = False
+            }) as response:
+
+                if response.status_code != 200:
+                    error_msg = f"❌ API Error {response.status_code}"
+                    if response.status_code == 404:
+                        error_msg += " - vLLM service not ready"
+                    elif response.status_code == 400:
+                        error_msg += " - Bad request"
+                    elif response.status_code == 500:
+                        error_msg += " - Model error"
+                    yield history, error_msg
+                    return
+
+                # Process streaming response
+                buffer = ""
+                is_thinking = True
+
+                print("[DEBUG] Start receiving stream...")
+                for line in response.iter_lines():
+                    if not line:
+                        continue
+                    # Ensure line is string format
+                    if isinstance(line, bytes):
+                        line = line.decode('utf-8')
+                    else:
+                        line = str(line)
+
+                    if line.startswith('data: '):
+                        data_str = line[6:]
+                        if data_str.strip() == '[DONE]':
+                            print("[DEBUG] Stream finished [DONE]")
+                            break
+                        try:
+                            data = json.loads(data_str)
+                            if 'choices' in data and len(data['choices']) > 0:
+                                delta = data['choices'][0].get('delta', {})
+                                if 'content' in delta:
+                                    content = delta['content']
+                                    buffer += content
+
+                                    if is_thinking:
+                                        if "</think>" in buffer:
+                                            is_thinking = False
+                                            parts = buffer.split("</think>", 1)
+                                            think_content = parts[0]
+                                            response_content = parts[1]
+
+                                            if think_content.startswith("<think>"):
+                                                think_content = think_content[len("<think>"):].strip()
+
+                                            # Update thinking message
+                                            history[-1].content = think_content
+
+                                            # Add response message
+                                            history.append({"role": "assistant", "content": response_content})
+                                        else:
+                                            # Update thinking message
+                                            current_think = buffer
+                                            if current_think.startswith("<think>"):
+                                                current_think = current_think[len("<think>"):]
+                                            history[-1].content = current_think
+                                    else:
+                                        # Already split, just update response message
                                         parts = buffer.split("</think>", 1)
-                                        think_content = parts[0]
                                         response_content = parts[1]
-
-                                        if think_content.startswith("<think>"):
-                                            think_content = think_content[len("<think>"):].strip()
-
-                                        # Update thinking message
-                                        history[-1].content = think_content
-
-                                        # Add response message
-                                        history.append({"role": "assistant", "content": response_content})
-                                    else:
-                                        # Update thinking message
-                                        current_think = buffer
-                                        if current_think.startswith("<think>"):
-                                            current_think = current_think[len("<think>"):]
-                                        history[-1].content = current_think
-                                else:
-                                    # Already split, just update response message
-                                    parts = buffer.split("</think>", 1)
-                                    response_content = parts[1]
-                                    history[-1]["content"] = response_content
-
-                                yield history, ""
-
-                    except json.JSONDecodeError:
-                        continue
+                                        history[-1]["content"] = response_content
+
+                                    yield history, ""
+
+                        except json.JSONDecodeError:
+                            continue
 
     except httpx.ConnectError:
         yield history, "❌ Cannot connect to vLLM API"
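
Note on the diff: the substantive change is issuing the request with httpx.Client.stream() instead of client.post(). With client.post(), httpx reads the entire response body before returning, so iter_lines() only runs after generation has finished; client.stream() keeps the response open and yields SSE lines as the server emits them, which is what the added comment "Use client.stream for better streaming control" refers to. Below is a minimal, self-contained sketch of that pattern; the base URL, model name, and prompt are placeholder values for illustration, not values taken from this Space.

import json
import httpx

API_BASE_URL = "http://localhost:8000/v1"   # placeholder; app.py defines its own API_BASE_URL

def stream_chat(prompt: str) -> str:
    """Collect a streamed OpenAI-style chat completion into a single string."""
    buffer = ""
    with httpx.Client(base_url=API_BASE_URL, timeout=120) as client:
        # client.stream() returns a context manager and defers reading the body,
        # so each SSE line can be processed as soon as the server sends it.
        with client.stream("POST", "/chat/completions", json={
            "model": "placeholder-model",
            "messages": [{"role": "user", "content": prompt}],
            "stream": True,
        }) as response:
            response.raise_for_status()
            for line in response.iter_lines():        # httpx yields decoded str lines
                if not line.startswith("data: "):
                    continue
                data_str = line[len("data: "):]
                if data_str.strip() == "[DONE]":       # SSE terminator used by OpenAI-compatible servers
                    break
                try:
                    delta = json.loads(data_str)["choices"][0].get("delta", {})
                except (json.JSONDecodeError, IndexError, KeyError):
                    continue
                buffer += delta.get("content", "")
    return buffer

The other edit in the commit, content["component"] → content.get("component", None), is the usual dict-access trade-off: .get() returns None instead of raising KeyError when a history entry has no "component" field, so the audio check simply evaluates to False for plain text messages.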