import gradio as gr
import os
import time
import tempfile
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from huggingface_hub import hf_hub_download
import scipy.io.wavfile
import soundfile as sf

# Directory inside the Space where Hugging Face downloads should be cached.
HF_CACHE_DIR = '/app/.cache/huggingface/hub'
# Kept for any code that reads the variable later. huggingface_hub itself
# resolves its cache location when it is first imported (which has already
# happened above), so the directory is also passed explicitly via `cache_dir`
# in the downloads below.
os.environ['HUGGINGFACE_HUB_CACHE'] = HF_CACHE_DIR
# required for agreeing to license terms for non-commercial purposes of coqui
os.environ['COQUI_TOS_AGREED'] = '1'

# Hugging Face repository that hosts the fine-tuned checkpoint files.
REPO_ID = "16pramodh/coqui-hindi"


def _usable_cache_dir(path):
    """Return ``path`` if it can be created and written to, otherwise ``None``.

    ``None`` makes ``hf_hub_download`` fall back to its default cache
    location, so start-up still succeeds when ``path`` is not writable.
    """
    try:
        os.makedirs(path, exist_ok=True)
    except OSError as err:
        print(f"Cannot use cache directory {path!r} ({err}); using the default cache.")
        return None
    if not os.access(path, os.W_OK):
        print(f"Cache directory {path!r} is not writable; using the default cache.")
        return None
    return path


_cache_dir = _usable_cache_dir(HF_CACHE_DIR)

# Download (or reuse from cache) the three checkpoint files.
config_path = hf_hub_download(REPO_ID, "config.json", cache_dir=_cache_dir)
model_path = hf_hub_download(REPO_ID, "model.pth", cache_dir=_cache_dir)
# vocab_path is not referenced again below; presumably load_checkpoint looks
# for vocab.json next to model.pth inside checkpoint_dir -- TODO confirm.
vocab_path = hf_hub_download(REPO_ID, "vocab.json", cache_dir=_cache_dir)
checkpoint_dir = os.path.dirname(model_path)

# Build the XTTS model from its JSON config and load the weights once at
# import time so every request reuses the same model instance.
config = XttsConfig()
config.load_json(config_path)
model = Xtts.init_from_config(config)
model.load_checkpoint(config, checkpoint_dir=checkpoint_dir, eval=True)

# Define the speech-generation function that Gradio will expose as an API
def speech_gen(audio_file_path, hindi_text):
    """Synthesize Hindi speech for ``hindi_text`` using a reference recording.

    Args:
        audio_file_path: Path to the reference speaker recording, passed to
            the model as ``speaker_wav``; ``None`` when no audio was supplied.
        hindi_text: Text to speak; synthesized with ``language="hi"``.

    Returns:
        Path to a newly created ``.wav`` file containing the generated audio.
        The file is created with ``delete=False`` and is not removed here.

    Raises:
        gr.Error: If the reference audio or the text is missing, or if
            synthesis or writing the output file fails.
    """
    if audio_file_path is None:
        # The result feeds an audio output that expects a file path, so a
        # plain message string cannot be returned; report it as an error.
        raise gr.Error("No audio file provided.")
    if not hindi_text or not hindi_text.strip():
        raise gr.Error("No text provided.")

    try:
        outputs = model.synthesize(
            hindi_text,
            config,
            speaker_wav=audio_file_path,
            gpt_cond_len=3,
            language="hi",
        )

        # Reserve a unique path and close the handle straight away, so the
        # descriptor is not leaked and soundfile can open the path itself.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            output_path = tmp.name

        # 24000 Hz: presumably the model's output sample rate -- TODO confirm
        # against the loaded config.
        sf.write(output_path, outputs["wav"], 24000)
    except Exception as e:
        # Keep the full traceback in the server log, and surface the failure
        # to the user instead of silently returning an empty result.
        import traceback

        traceback.print_exc()
        raise gr.Error(f"Speech generation failed: {e}") from e

    return output_path

# Describe the Gradio UI: reference audio and text in, generated audio out
_interface_options = {
    "fn": speech_gen,
    "inputs": [gr.Audio(type="filepath"), "text"],
    "outputs": gr.Audio(type="filepath"),
    "title": "Coqui TTS model space",
    "description": "converts Hindi text to Hindi speech",
}
iface = gr.Interface(**_interface_options)

# Start serving only when this file is run as a script, not when imported
if __name__ == "__main__":
    iface.launch()