import gradio as gr
import spaces
import torch
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer
from src.mimo_audio.modeling_mimo_audio import MiMoAudioArguments, MiMoAudioForCausalLM
from peft import PeftModel
from src.mimo_audio.mimo_audio import MimoAudio
import tempfile
import os

# Download base models from Hugging Face
print("Downloading MiMo-Audio base models from Hugging Face...")
base_model_path = snapshot_download(repo_id="XiaomiMiMo/MiMo-Audio-7B-Instruct")
tokenizer_path = snapshot_download(repo_id="XiaomiMiMo/MiMo-Audio-Tokenizer")
print(f"Base models downloaded to: {base_model_path}")

# Download LoRA weights (HF_TOKEN is passed in case the repo requires authentication)
print("Downloading EmoAct-MiMo LoRA weights...")
hf_token = os.environ.get("HF_TOKEN")
lora_path = snapshot_download(repo_id="mrfakename/EmoAct-MiMo", token=hf_token)
print(f"LoRA weights downloaded to: {lora_path}")

# Load the tokenizer and look up the special-token ids that MiMoAudioArguments expects
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(base_model_path)
sosp_idx = tokenizer.convert_tokens_to_ids("<|sosp|>")
eosp_idx = tokenizer.convert_tokens_to_ids("<|eosp|>")
empty_idx = tokenizer.convert_tokens_to_ids("<|empty|>")
sostm_idx = tokenizer.convert_tokens_to_ids("<|sostm|>")
eostm_idx = tokenizer.convert_tokens_to_ids("<|eostm|>")
eot_idx = tokenizer.convert_tokens_to_ids("<|eot|>")

# Create model args
model_args = MiMoAudioArguments(
    model_name_or_path=base_model_path,
    sosp_idx=sosp_idx,
    eosp_idx=eosp_idx,
    empty_idx=empty_idx,
    sostm_idx=sostm_idx,
    eostm_idx=eostm_idx,
    eot_idx=eot_idx,
)

# Load base model
print("Loading base MiMo-Audio model...")
base_model = MiMoAudioForCausalLM.from_pretrained(
    base_model_path,
    args=model_args,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
print("Base model loaded")

# Load the LoRA adapter and merge it into the base weights
print("Loading LoRA adapter...")
model_with_lora = PeftModel.from_pretrained(base_model, lora_path)
print("Merging LoRA weights...")
merged_model = model_with_lora.merge_and_unload()
print("LoRA weights merged!")

# Save the merged model to a temporary directory so MimoAudio can load it from disk
print("Saving merged model...")
merged_model_path = "/tmp/merged_mimo_audio"
os.makedirs(merged_model_path, exist_ok=True)
merged_model.save_pretrained(merged_model_path)
tokenizer.save_pretrained(merged_model_path)
print(f"Merged model saved to {merged_model_path}")

# Initialize the MimoAudio inference wrapper with the merged model
print("Initializing MimoAudio wrapper...")
model = MimoAudio(
    model_path=merged_model_path,
    mimo_audio_tokenizer_path=tokenizer_path,
)
print("Model ready!")


@spaces.GPU
def generate_speech(emotion, text):
    """Generate emotional speech from text using EmoAct-MiMo."""
    if not emotion or not emotion.strip():
        return None, "Please enter an emotion description."
    if not text or not text.strip():
        return None, "Please enter text to convert to speech."

    print("Generating:", text)
    print("With emotion:", emotion)

    try:
        # Create a temporary file for the output audio
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            output_path = tmp_file.name

        # Generate TTS, passing the emotion description as the instruction
        model.tts_sft(
            text=text.strip(),
            output_path=output_path,
            instruct=emotion.strip(),
        )

        return output_path, "✅ Speech generated successfully!"
    except Exception as e:
        return None, f"❌ Error: {str(e)}"


# Create Gradio interface
with gr.Blocks(title="EmoAct-MiMo TTS") as demo:
    gr.Markdown("""
    # 🎭 EmoAct-MiMo: Emotion-Controllable Text-to-Speech

    Generate intensely emotional speech using the [EmoAct-MiMo model](https://huggingface.co/mrfakename/EmoAct-MiMo).

    This is still a very early experiment from very early in the training run; I need to change a few settings and retrain, but the model turned out quite nicely! It may hallucinate, so try a few times to get good results. Voice cloning is not supported yet.
    """)

    with gr.Row():
        with gr.Column():
            emotion_input = gr.Textbox(
                label="Emotion",
                placeholder="e.g., 'intense anger, rage, fury, hatred, and annoyance, speaking without any accent'",
                lines=3,
            )
            text_input = gr.Textbox(
                label="Text",
                placeholder="Enter the text to speak with emotion...",
                lines=5,
            )
            generate_btn = gr.Button("Generate Emotional Speech", variant="primary")
        with gr.Column():
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath",
            )
            status_output = gr.Textbox(
                label="Status",
                interactive=False,
            )

    # Intense emotion examples
    gr.Examples(
        examples=[
            [
                "intense anger, rage, fury, hatred, and annoyance, speaking without any accent",
                "You know what? I'm done. I'm done with your excuses. (sharp exhale) Every single time, it's the same, and I actually believed you'd change. (voice cracks slightly) God, I'm such an idiot for trusting you again.",
            ],
            [
                "overwhelming grief, deep sorrow, heartbreak, and devastating sadness, speaking without any accent",
                "I can't... I can't believe they're gone. (trembling voice) It doesn't feel real. I keep expecting them to walk through that door, and... (chokes up) ...and they never will. How am I supposed to go on without them?",
            ],
            [
                "extreme fear, terror, panic, dread, and anxiety, speaking without any accent",
                "(breathing heavily) Did you hear that? Something's out there. (whispers urgently) We need to hide, NOW. Oh god, oh god, it's getting closer. I don't want to die. Please, please let us make it out of here alive.",
            ],
            [
                "intense joy, euphoria, excitement, elation, and overwhelming happiness, speaking without any accent",
                "YES! YES! I DID IT! (laughs breathlessly) I can't believe it actually worked! This is... this is everything I've ever dreamed of! I'm so happy I could cry!",
            ],
            [
                "crushing despair, hopelessness, depression, and deep emotional pain, speaking without any accent",
                "(quietly, numbly) What's the point anymore? I've tried everything. Nothing changes. Nothing ever gets better. I'm so tired of pretending I'm okay when I'm falling apart inside.",
            ],
            [
                "bitter jealousy, envy, resentment, and seething frustration, speaking without any accent",
                "Of course they chose you. They always choose you. (bitter laugh) Must be nice, having everything handed to you while the rest of us break our backs. You don't even appreciate what you have.",
            ],
        ],
        inputs=[emotion_input, text_input],
    )

    # Wire the button to the generation function
    generate_btn.click(
        fn=generate_speech,
        inputs=[emotion_input, text_input],
        outputs=[audio_output, status_output],
    )

if __name__ == "__main__":
    demo.launch()
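# A minimal sketch of calling the pipeline directly, bypassing the UI (the
# emotion/text values below are hypothetical, not part of the Space). Outside
# Hugging Face ZeroGPU the @spaces.GPU decorator is effectively a passthrough,
# so the same call should also work on a local GPU machine:
#
#   audio_path, status = generate_speech(
#       "calm, warm narration, speaking without any accent",
#       "Hello! This is a quick smoke test of EmoAct-MiMo.",
#   )
#   print(status, audio_path)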