import os
import tempfile
import time
from datetime import datetime

import gradio as gr
import spaces
import torch
import torchaudio
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_audio, load_voice, load_voices
from tortoise.utils.text import split_and_recombine_text
# Single module-level TTS model instance shared by all requests; kv_cache=True
# enables key/value caching in the autoregressive decoder (presumably to speed
# up generation — confirm against the tortoise-tts docs). Constructed at import
# time so model weights are loaded once, not per call.
tts = TextToSpeech(kv_cache=True)
@spaces.GPU
def inference(
    text,
    reference_audio,
):
    """Synthesize *text* as speech in the voice of *reference_audio*.

    Parameters
    ----------
    text : str
        Text to synthesize. Long inputs are split into sentence-sized
        chunks by ``split_and_recombine_text`` and generated sequentially.
    reference_audio : str
        Filepath to a reference voice clip (the Gradio input uses
        ``type="filepath"``).

    Returns
    -------
    str
        Path to a temporary WAV file (saved at 24000 Hz) containing the
        concatenated generated speech.
    """
    # Fix: tempfile.mktemp is deprecated and race-prone; create the file
    # securely and keep only its name (delete=False so torchaudio can
    # write to it after the handle is closed).
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_wav_path = tmp.name

    texts = split_and_recombine_text(text)

    # Fix: the original referenced an undefined ``init_audio_file`` and
    # ignored the ``reference_audio`` parameter entirely. Load the clip
    # once, outside the loop, at tortoise's conditioning sample rate
    # (22050 Hz); tts_with_preset expects a *list* of conditioning clips.
    voice_samples = [load_audio(reference_audio, 22050)]

    all_parts = []
    for chunk in texts:  # renamed: the loop variable shadowed ``text``
        for audio_frame in tts.tts_with_preset(
            chunk,
            voice_samples=voice_samples,
            preset="fast",
        ):
            all_parts.append(audio_frame)

    # Concatenate all generated frames into one (1, samples) tensor.
    wav = torch.cat(all_parts, dim=0).unsqueeze(0)
    torchaudio.save(output_wav_path, wav.cpu(), 24000)
    return output_wav_path
def main():
    """Assemble the Gradio interface for the Tortoise TTS demo and launch it."""
    app_title = "Tortoise TTS 🐢"
    app_description = """
    A text-to-speech system which powers lot of organizations in Speech synthesis domain.
    a model with strong multi-voice capabilities, highly realistic prosody and intonation.
    for faster inference, use the 'ultra_fast' preset and duplicate space if you don't want to wait in a queue.
    """

    # Input widgets: the text to speak and a reference clip for voice cloning.
    text_box = gr.Textbox(lines=1, label="Text")
    voice_clip = gr.Audio(label="Reference Audio", type="filepath")

    # Output widget: the synthesized speech.
    speech_out = gr.Audio(label="Generated Speech")

    demo = gr.Interface(
        fn=inference,
        title=app_title,
        description=app_description,
        inputs=[text_box, voice_clip],
        outputs=[speech_out],
    )
    # Queue requests so concurrent users wait rather than overload the GPU.
    demo.queue().launch()
if __name__ == "__main__":
with open("Tortoise_TTS_Runs_Scripts.log", "a") as f:
f.write(
f"\n\n-------------------------Tortoise TTS Scripts Logs, {datetime.now()}-------------------------\n"
)
main()