import gradio as gr
from fastrtc import ReplyOnPause, AlgoOptions, SileroVadOptions, AdditionalOutputs, WebRTC, get_cloudflare_turn_credentials_async, get_cloudflare_turn_credentials #get_hf_turn_credentials, 
import os
from dotenv import load_dotenv
import time
import numpy as np
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from backend.tts import synthesize_text
from backend.asr import transcribe_audio
from backend.utils import preprocess_audio, is_valid_turn
from backend.main import stream_chat_response

from pydub import AudioSegment


# Load variables from a .env file; override=True lets them replace values that
# are already present in the process environment.
load_dotenv(override=True)

# Ring tone played when a call starts: the first 1000 ms of the clip (pydub
# slices are expressed in milliseconds).
phone_waiting_sound = AudioSegment.from_mp3("frontend/phone-ringing-382734.mp3")[:1000]
_ring_channels = phone_waiting_sound.channels
_ring_pcm = np.array(phone_waiting_sound.get_array_of_samples(), dtype=np.int16)
if _ring_channels > 1:
    # Regroup the flat sample array as (frames, channels) and average to mono.
    _ring_pcm = _ring_pcm.reshape((-1, _ring_channels)).mean(axis=1)
# Scale 16-bit PCM values to float32 in the range [-1.0, 1.0).
sound_samples = _ring_pcm.astype(np.float32) / 32768.0
def startup(_):
    """Play the ring tone and the spoken greeting when a call starts.

    The single positional argument is ignored.

    Yields:
        1. ``(frame_rate, samples)`` for the pre-loaded ring tone.
        2. Everything ``synthesize_text`` yields for the greeting.
        3. After a 2 second pause, an ``AdditionalOutputs`` carrying a
           one-message history that contains the greeting as an assistant turn.
    """
    greeting = "สวัสดีค่ะ มีข้อมูลสอบถามด้านใดคะ?"
    ring_tone = (phone_waiting_sound.frame_rate, sound_samples)

    yield ring_tone
    yield from synthesize_text(greeting)

    # Hold back the transcript update for two seconds after the greeting audio
    # has been handed over.
    time.sleep(2)
    opening_history = [{"role": "assistant", "content": greeting}]
    yield AdditionalOutputs(opening_history)

# Extra CSS passed to gr.Blocks(css=...). It styles the page body, the <h1>
# title, the `.phone-column` container (attached through elem_classes on the
# phone column) and an element with id `conversation-history-chatbot`.
# NOTE(review): the body `background-color` declaration sits inside a CSS
# comment (CSS comments do not nest, so the comment runs to the first `*/`),
# which means no page background colour is applied by this stylesheet.
# NOTE(review): no component in the visible code is given
# elem_id="conversation-history-chatbot", so that rule may match nothing --
# confirm whether the Chatbot was meant to carry this id.
custom_css = """
/* Overall Gradio page styling: hot pink background */
body {
/*    background-color: #ff69b4; /* Hot pink */
    margin: 0;
    padding: 0;
    font-family: sans-serif;}
/* Title styling */
h1 {
    color: #fff;
    text-shadow: 1px 1px 2px #ff85a2;
    font-size: 2.5em;
    margin-bottom: 20px;
    text-align: center;
}
/* Style the column holding the telephone interface */
.phone-column {
    max-width: 350px !important; /* Limit the width of the phone column */
    margin: 0 auto;              /* Center the column */
    border-radius: 20px;
    background-color: #ff69b4;   /* Lighter pink for telephone interface */
    box-shadow: 0 0 15px rgba(0, 0, 0, 0.2);
    padding: 20px;
}
/* Conversation history box styling */
#conversation-history-chatbot {
    background-color: #ffc0cb;   /* Lighter pink for conversation history */
    border: 1px solid #ccc;
    border-radius: 10px;
    padding: 10px;
    box-shadow: 0 0 15px rgba(0, 0, 0, 0.2);
}
"""
def response(audio: tuple[int, np.ndarray] | None, conversation_history):
    """
    Handles user audio input, transcribes it, streams LLM text via backend.main,
    and synthesizes chunks to audio while updating the conversation history.
    """
    print(f"--- Latency Breakdown ---")
    start_time = time.time()
    if conversation_history is None:
        conversation_history = []

    previous_history = list(conversation_history)

    if not audio or audio[1] is None or not np.any(audio[1]):
        print("No audio input detected; skipping response generation.")
        print(f"------------------------")
        return
    import soundfile as sf

    sample_rate, audio_array = audio

    try:
        processed_audio = preprocess_audio((sample_rate, audio_array), target_frame_rate=16000)
    except Exception as audio_err:
        print(f"Audio preprocessing failed: {audio_err}")
        print(f"------------------------")
        return

    t0 = time.time()
    transcription = transcribe_audio( processed_audio)
    t_asr = time.time() - t0
    print(f"ASR:        {t_asr:.4f}s")

    if not transcription.strip():
        print("No valid transcription; skipping response generation.")
        print(f"------------------------")
        return

    user_turn = {"role": "user", "content": transcription}
    print(f"User: {transcription}")
    if is_valid_turn(user_turn):
        conversation_history.append(user_turn)
        yield AdditionalOutputs(conversation_history)

    print("Conversation history:", conversation_history)

    assistant_turn = {"role": "assistant", "content": ""}
    conversation_history.append(assistant_turn)

    history_for_stream = [dict(turn) for turn in previous_history if is_valid_turn(turn)]

    text_buffer = ""
    full_response = ""
    delimiter_count = 0
    n_threshold = 3
    max_n_threshold = 5
    lang = "th"
    chunk_count = 0
    first_chunk_sent = False
    start_llm_stream = time.time()

    try:
        for text_chunk in stream_chat_response(history_for_stream, transcription):
            if not isinstance(text_chunk, str):
                text_chunk = str(text_chunk)

            i = 0
            while i < len(text_chunk):
                char = text_chunk[i]
                text_buffer += char
                full_response += char

                assistant_turn["content"] = full_response.strip()

                is_delimiter = False
                if char in {' ', '\n'}:
                    is_delimiter = True
                    delimiter_count += 1
                    if i + 1 < len(text_chunk) and text_chunk[i + 1] == 'ๆ':
                        text_buffer += text_chunk[i + 1]
                        full_response += text_chunk[i + 1]
                        i += 1

                send_now = False
                if not first_chunk_sent:
                    if is_delimiter and text_buffer.strip():
                        send_now = True
                else:
                    if delimiter_count >= n_threshold and text_buffer.strip():
                        send_now = True
                        if n_threshold < max_n_threshold:
                            n_threshold += 1

                if send_now:
                    buffer_to_send = text_buffer.strip()
                    try:
                        if buffer_to_send and buffer_to_send.endswith('วันที่'):
                            buffer_to_send = buffer_to_send[:-len('วันที่')]
                        if buffer_to_send and first_chunk_sent and buffer_to_send.endswith('ค่ะ'):
                            buffer_to_send = buffer_to_send[:-len('ค่ะ')]
                    except Exception:
                        buffer_to_send = buffer_to_send.replace('ค่ะ', '')

                    if buffer_to_send:
                        chunk_count += 1
                        if chunk_count == 1:
                            first_llm_chunk_time = time.time()
                            t_llm_first_token = first_llm_chunk_time - start_llm_stream
                            print(f"LLM TTFC:   {t_llm_first_token:.4f}s (Time To First Chunk)")
                        yield from synthesize_text(buffer_to_send, lang=lang)
                        first_chunk_sent = True
                        text_buffer = ""
                        delimiter_count = 0
                        yield AdditionalOutputs(conversation_history)

                i += 1

        if text_buffer.strip():
            buffer_to_send = text_buffer.strip()
            try:
                if buffer_to_send and buffer_to_send.endswith('วันที่'):
                    buffer_to_send = buffer_to_send[:-len('วันที่')]
                if buffer_to_send and first_chunk_sent and buffer_to_send.endswith('ค่ะ'):
                    buffer_to_send = buffer_to_send[:-len('ค่ะ')]
            except Exception:
                buffer_to_send = buffer_to_send.replace('ค่ะ', '')

            if buffer_to_send:
                chunk_count += 1
                if chunk_count == 1:
                    first_llm_chunk_time = time.time()
                    t_llm_first_token = first_llm_chunk_time - start_llm_stream
                    print(f"LLM TTFC:   {t_llm_first_token:.4f}s (Time To First Chunk)")
                yield from synthesize_text(buffer_to_send, lang=lang)
                first_chunk_sent = True
                text_buffer = ""
                delimiter_count = 0
                yield AdditionalOutputs(conversation_history)

    except Exception as e:
        print(f"An error occurred during response generation or synthesis: {e}")
        error_message = "ขออภัยค่ะ เกิดข้อผิดพลาดบางอย่าง"
        try:
            yield from synthesize_text(error_message, lang=lang)
        except Exception as synth_error:
            print(f"Could not synthesize error message: {synth_error}")
        assistant_turn["content"] = (assistant_turn.get("content", "") + f" [Error: {e}]").strip()
        yield AdditionalOutputs(conversation_history)

    total_latency = time.time() - start_time
    print(f"Total:      {total_latency:.4f}s")
    print(f"------------------------")


async def get_credentials():
    """Return the awaited result of ``get_cloudflare_turn_credentials_async``.

    The ``HF_TOKEN`` environment variable (``None`` when unset) is passed as
    ``hf_token``.
    """
    hf_token = os.environ.get('HF_TOKEN')
    turn_credentials = await get_cloudflare_turn_credentials_async(hf_token=hf_token)
    return turn_credentials
   
# GIF used both as the call-button icon and as the assistant avatar.
_ASSISTANT_GIF_URL = "https://i.pinimg.com/originals/0c/67/5a/0c675a8e1061478d2b7b21b330093444.gif"

# Passed to the WebRTC component as `track_constraints`.
_MIC_TRACK_CONSTRAINTS = {
    "echoCancellation": True,
    "noiseSuppression": {"exact": True},
    "autoGainControl": {"exact": True},
}

_pink_theme = gr.themes.Soft(primary_hue="pink", secondary_hue="pink")

with gr.Blocks(css=custom_css, theme=_pink_theme) as demo:
    gr.HTML("""<h1 style='text-align: center'>1157 Voicebot Demo</h1>""")

    with gr.Row():
        # Left column: the "telephone" (styled through the phone-column class).
        with gr.Column(scale=1, elem_classes=["phone-column"]):
            audio = WebRTC(
                mode="send-receive",
                modality="audio",
                track_constraints=_MIC_TRACK_CONSTRAINTS,
                # The coroutine function itself is passed here, while the
                # server-side configuration below is fetched once, right now.
                rtc_configuration=get_credentials,
                server_rtc_configuration=get_cloudflare_turn_credentials(ttl=360_000),
                icon=_ASSISTANT_GIF_URL,
                icon_button_color="#17dbaa",
                pulse_color="#b0f83b",
                button_labels={"start": "Call", "stop": "Hang up", "waiting": "Connecting…"},
                icon_radius=45,
                height="650px",
                width="100%",
                container=False,
                elem_id="phone-call-webrtc",
            )

        # Right column: running transcript of the call.
        # NOTE(review): custom_css targets #conversation-history-chatbot, but no
        # elem_id is set on this component -- confirm whether that is intended.
        with gr.Column():
            conversation_history = gr.Chatbot(
                label="Conversation History",
                type="messages",
                value=[],
                height="675px",
                resizable=True,
                avatar_images=(None, _ASSISTANT_GIF_URL),
            )

    gr.DeepLinkButton()

    # Turn-taking settings handed to ReplyOnPause.
    pause_options = AlgoOptions(
        audio_chunk_duration=0.6,
        started_talking_threshold=0.3,
        speech_threshold=0.6,
    )
    vad_options = SileroVadOptions(
        threshold=0.8,
        min_speech_duration_ms=300,
        max_speech_duration_s=float("inf"),
        min_silence_duration_ms=1200,
    )
    reply_handler = ReplyOnPause(
        response,
        algo_options=pause_options,
        model_options=vad_options,
        can_interrupt=True,
        startup_fn=startup,
    )

    audio.stream(
        fn=reply_handler,
        inputs=[audio, conversation_history],
        outputs=[audio],
        concurrency_limit=1000,
        time_limit=8192,
    )

    # Every AdditionalOutputs payload yielded by the handlers is written to the
    # chatbot unchanged.
    audio.on_additional_outputs(
        lambda history: history,
        outputs=[conversation_history],
        queue=True,
        show_progress="hidden",
    )

demo.queue(default_concurrency_limit=1000)
demo.launch(debug=True, show_error=True, share=True)