tortoise-tts / app.py
cdminix's picture
Update app.py
0509612 verified
import os
import tempfile
import time
from datetime import datetime

import gradio as gr
import spaces
import torch
import torchaudio

from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_audio, load_voice, load_voices
from tortoise.utils.text import split_and_recombine_text
# Shared TextToSpeech instance, constructed once at import time; `inference`
# reads this module-level name on every call instead of building a new model.
tts = TextToSpeech(kv_cache=True)
@spaces.GPU
def inference(
    text,
    reference_audio,
):
    """Synthesize ``text`` using ``reference_audio`` as the voice and save a WAV.

    Args:
        text: The text to speak. It is split into chunks with
            ``split_and_recombine_text`` and each chunk is synthesized in turn.
        reference_audio: Filesystem path of the reference clip (the Gradio
            input feeding this function is declared with ``type="filepath"``).

    Returns:
        Path of a temporary ``.wav`` file containing all generated audio
        concatenated in order, saved with a sample rate of 24000.

    Raises:
        gr.Error: If no reference audio or no usable text was supplied, or
            if synthesis produced no audio at all.
    """
    if not reference_audio:
        raise gr.Error("Please provide a reference audio clip.")

    chunks = split_and_recombine_text(text or "")
    if not chunks:
        raise gr.Error("Please enter some text to synthesize.")

    # Load the reference clip once rather than once per text chunk.
    # NOTE(review): upstream tortoise declares load_audio(audiopath,
    # sampling_rate), loads conditioning clips at 22050 Hz, and expects
    # `voice_samples` to be a list of clips -- confirm against the installed
    # tortoise version.
    voice_samples = [load_audio(reference_audio, 22050)]

    all_parts = []
    for chunk in chunks:
        # tts_with_preset is consumed as an iterable of audio frames here.
        all_parts.extend(
            tts.tts_with_preset(
                chunk,
                voice_samples=voice_samples,
                preset="fast",
            )
        )

    if not all_parts:
        raise gr.Error("No audio was generated for the given text.")

    # Join the frames end to end and add a leading dimension for saving
    # (assumes each frame is 1-D -- TODO confirm).
    wav = torch.cat(all_parts, dim=0).unsqueeze(0)
    print(wav.shape)

    # Reserve the output path only after synthesis succeeded, and let tempfile
    # create the file atomically instead of the race-prone, deprecated mktemp.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
        output_wav_path = tmp_file.name

    torchaudio.save(output_wav_path, wav.cpu(), 24000)
    return output_wav_path
def main():
    """Build the Gradio interface around ``inference`` and launch it."""
    # Input widgets: free text plus a reference clip handed over as a file path.
    text_input = gr.Textbox(label="Text")
    voice_input = gr.Audio(label="Reference Audio", type="filepath")

    # Output widget that receives the value returned by `inference`.
    speech_output = gr.Audio(label="Generated Speech")

    demo = gr.Interface(
        fn=inference,
        inputs=[text_input, voice_input],
        outputs=speech_output,
        title="Tortoise TTS 🐢",
    )
    demo.launch()
if __name__ == "__main__":
    # Append a timestamped banner to the run log, then start the app.
    banner = f"\n\n-------------------------Tortoise TTS Scripts Logs, {datetime.now()}-------------------------\n"
    with open("Tortoise_TTS_Runs_Scripts.log", "a") as log_file:
        log_file.write(banner)

    main()