import gradio as gr from fastrtc import ReplyOnPause, AlgoOptions, SileroVadOptions, AdditionalOutputs, WebRTC, get_cloudflare_turn_credentials_async, get_cloudflare_turn_credentials #get_hf_turn_credentials, import os from dotenv import load_dotenv import time import numpy as np import sys sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) from backend.tts import synthesize_text from backend.asr import transcribe_audio from backend.utils import preprocess_audio, is_valid_turn from backend.main import stream_chat_response from pydub import AudioSegment load_dotenv(override=True) phone_waiting_sound = AudioSegment.from_mp3("frontend/phone-ringing-382734.mp3")[:1000] sound_samples = np.array(phone_waiting_sound.get_array_of_samples(), dtype=np.int16) if phone_waiting_sound.channels > 1: sound_samples = sound_samples.reshape((-1, phone_waiting_sound.channels)).mean(axis=1) sound_samples = sound_samples.astype(np.float32) / 32768.0 # Normalize to [-1, def startup(_): yield (phone_waiting_sound.frame_rate, sound_samples) STARTUP_MESSAGE = "สวัสดีค่ะ มีข้อมูลสอบถามด้านใดคะ?" yield from synthesize_text(STARTUP_MESSAGE) time.sleep(2) yield AdditionalOutputs([{"role": "assistant", "content": STARTUP_MESSAGE}]) custom_css = """ /* Overall Gradio page styling: hot pink background */ body { /* background-color: #ff69b4; /* Hot pink */ margin: 0; padding: 0; font-family: sans-serif;} /* Title styling */ h1 { color: #fff; text-shadow: 1px 1px 2px #ff85a2; font-size: 2.5em; margin-bottom: 20px; text-align: center; } /* Style the column holding the telephone interface */ .phone-column { max-width: 350px !important; /* Limit the width of the phone column */ margin: 0 auto; /* Center the column */ border-radius: 20px; background-color: #ff69b4; /* Lighter pink for telephone interface */ box-shadow: 0 0 15px rgba(0, 0, 0, 0.2); padding: 20px; } /* Conversation history box styling */ #conversation-history-chatbot { background-color: #ffc0cb; /* Lighter pink for conversation history */ border: 1px solid #ccc; border-radius: 10px; padding: 10px; box-shadow: 0 0 15px rgba(0, 0, 0, 0.2); } """ def response(audio: tuple[int, np.ndarray] | None, conversation_history): """ Handles user audio input, transcribes it, streams LLM text via backend.main, and synthesizes chunks to audio while updating the conversation history. """ print(f"--- Latency Breakdown ---") start_time = time.time() if conversation_history is None: conversation_history = [] previous_history = list(conversation_history) if not audio or audio[1] is None or not np.any(audio[1]): print("No audio input detected; skipping response generation.") print(f"------------------------") return import soundfile as sf sample_rate, audio_array = audio try: processed_audio = preprocess_audio((sample_rate, audio_array), target_frame_rate=16000) except Exception as audio_err: print(f"Audio preprocessing failed: {audio_err}") print(f"------------------------") return t0 = time.time() transcription = transcribe_audio( processed_audio) t_asr = time.time() - t0 print(f"ASR: {t_asr:.4f}s") if not transcription.strip(): print("No valid transcription; skipping response generation.") print(f"------------------------") return user_turn = {"role": "user", "content": transcription} print(f"User: {transcription}") if is_valid_turn(user_turn): conversation_history.append(user_turn) yield AdditionalOutputs(conversation_history) print("Conversation history:", conversation_history) assistant_turn = {"role": "assistant", "content": ""} conversation_history.append(assistant_turn) history_for_stream = [dict(turn) for turn in previous_history if is_valid_turn(turn)] text_buffer = "" full_response = "" delimiter_count = 0 n_threshold = 3 max_n_threshold = 5 lang = "th" chunk_count = 0 first_chunk_sent = False start_llm_stream = time.time() try: for text_chunk in stream_chat_response(history_for_stream, transcription): if not isinstance(text_chunk, str): text_chunk = str(text_chunk) i = 0 while i < len(text_chunk): char = text_chunk[i] text_buffer += char full_response += char assistant_turn["content"] = full_response.strip() is_delimiter = False if char in {' ', '\n'}: is_delimiter = True delimiter_count += 1 if i + 1 < len(text_chunk) and text_chunk[i + 1] == 'ๆ': text_buffer += text_chunk[i + 1] full_response += text_chunk[i + 1] i += 1 send_now = False if not first_chunk_sent: if is_delimiter and text_buffer.strip(): send_now = True else: if delimiter_count >= n_threshold and text_buffer.strip(): send_now = True if n_threshold < max_n_threshold: n_threshold += 1 if send_now: buffer_to_send = text_buffer.strip() try: if buffer_to_send and buffer_to_send.endswith('วันที่'): buffer_to_send = buffer_to_send[:-len('วันที่')] if buffer_to_send and first_chunk_sent and buffer_to_send.endswith('ค่ะ'): buffer_to_send = buffer_to_send[:-len('ค่ะ')] except Exception: buffer_to_send = buffer_to_send.replace('ค่ะ', '') if buffer_to_send: chunk_count += 1 if chunk_count == 1: first_llm_chunk_time = time.time() t_llm_first_token = first_llm_chunk_time - start_llm_stream print(f"LLM TTFC: {t_llm_first_token:.4f}s (Time To First Chunk)") yield from synthesize_text(buffer_to_send, lang=lang) first_chunk_sent = True text_buffer = "" delimiter_count = 0 yield AdditionalOutputs(conversation_history) i += 1 if text_buffer.strip(): buffer_to_send = text_buffer.strip() try: if buffer_to_send and buffer_to_send.endswith('วันที่'): buffer_to_send = buffer_to_send[:-len('วันที่')] if buffer_to_send and first_chunk_sent and buffer_to_send.endswith('ค่ะ'): buffer_to_send = buffer_to_send[:-len('ค่ะ')] except Exception: buffer_to_send = buffer_to_send.replace('ค่ะ', '') if buffer_to_send: chunk_count += 1 if chunk_count == 1: first_llm_chunk_time = time.time() t_llm_first_token = first_llm_chunk_time - start_llm_stream print(f"LLM TTFC: {t_llm_first_token:.4f}s (Time To First Chunk)") yield from synthesize_text(buffer_to_send, lang=lang) first_chunk_sent = True text_buffer = "" delimiter_count = 0 yield AdditionalOutputs(conversation_history) except Exception as e: print(f"An error occurred during response generation or synthesis: {e}") error_message = "ขออภัยค่ะ เกิดข้อผิดพลาดบางอย่าง" try: yield from synthesize_text(error_message, lang=lang) except Exception as synth_error: print(f"Could not synthesize error message: {synth_error}") assistant_turn["content"] = (assistant_turn.get("content", "") + f" [Error: {e}]").strip() yield AdditionalOutputs(conversation_history) total_latency = time.time() - start_time print(f"Total: {total_latency:.4f}s") print(f"------------------------") async def get_credentials(): return await get_cloudflare_turn_credentials_async(hf_token=os.getenv('HF_TOKEN')) with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="pink", secondary_hue="pink")) as demo: gr.HTML("""