"""Gradio front end that streams Tortoise TTS speech cloned from a reference clip."""

import os
import time
from datetime import datetime

import gradio as gr
import torch
import torchaudio

from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_audio, load_voice, load_voices
from tortoise.utils.text import split_and_recombine_text

# Sample rate used for every chunk streamed to the UI and for the saved file.
OUTPUT_SAMPLE_RATE = 24000
# NOTE(review): Tortoise's own voice loader reads conditioning clips at
# 22050 Hz; confirm this matches the installed tortoise version.
REFERENCE_SAMPLE_RATE = 22050
# The full utterance is written here once streaming has finished.
OUTPUT_PATH = "output.wav"


def inference(
    text,
    reference_audio,
    seed=None,
):
    """Stream synthesized speech for ``text`` using ``reference_audio`` as the voice.

    This is a generator intended to be used as a Gradio event handler.

    Args:
        text: The text to synthesize. It is split into chunks with
            ``split_and_recombine_text`` and each chunk is synthesized in turn.
        reference_audio: Filesystem path of the reference clip (the Gradio
            ``Audio`` input is configured with ``type="filepath"``).
        seed: Currently unused. It defaults to ``None`` because the Gradio
            interface only supplies ``text`` and ``reference_audio``.

    Yields:
        ``(OUTPUT_SAMPLE_RATE, numpy_array)`` for every audio frame produced by
        the model, followed by the path of the saved WAV file containing the
        whole utterance (only when at least one frame was produced).

    Raises:
        gr.Error: If no text or no reference clip was provided.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")
    if reference_audio is None:
        raise gr.Error("Please provide a reference audio clip.")

    # Load the reference clip once; it is the same for every text chunk.
    voice_samples = [load_audio(reference_audio, REFERENCE_SAMPLE_RATE)]

    all_parts = []
    for chunk in split_and_recombine_text(text):
        # `tts` is the module-level model created in the __main__ guard.
        for audio_frame in tts.tts_with_preset(
            chunk,
            voice_samples=voice_samples,
            preset="fast",
        ):
            all_parts.append(audio_frame)
            yield (OUTPUT_SAMPLE_RATE, audio_frame.cpu().detach().numpy())

    if not all_parts:
        # Nothing was generated, so there is nothing to concatenate or save.
        return

    # Assumes each frame is a 1-D tensor, so the result is (1, num_samples)
    # as expected by torchaudio.save -- TODO confirm against the model output.
    wav = torch.cat(all_parts, dim=0).unsqueeze(0)
    print(wav.shape)
    torchaudio.save(OUTPUT_PATH, wav.detach().cpu(), OUTPUT_SAMPLE_RATE)

    # The single Audio output accepts a file path, so hand back the full clip.
    yield OUTPUT_PATH


def main():
    """Build the Gradio interface around ``inference`` and launch it with queuing."""
    title = "Tortoise TTS 🐢"
    description = """ A text-to-speech system which powers lot of organizations in Speech synthesis domain.
a model with strong multi-voice capabilities, highly realistic prosody and intonation.
for faster inference, use the 'ultra_fast' preset and duplicate space if you don't want to wait in a queue.
"""
    text = gr.Textbox(
        lines=1,
        label="Text",
    )
    reference_audio = gr.Audio(label="Reference Audio", type="filepath")
    output_audio = gr.Audio(label="Generated Speech")

    interface = gr.Interface(
        fn=inference,
        inputs=[
            text,
            reference_audio,
        ],
        title=title,
        description=description,
        outputs=[output_audio],
    )
    # queue() is required for generator (streaming) handlers.
    interface.queue().launch()


if __name__ == "__main__":
    # `tts` is read as a global by inference(); it only exists when this file
    # is executed as a script.
    tts = TextToSpeech(kv_cache=True, use_deepspeed=True, half=True)

    # Append a run header to the log file so separate launches can be told apart.
    with open("Tortoise_TTS_Runs_Scripts.log", "a") as f:
        f.write(
            f"\n\n-------------------------Tortoise TTS Scripts Logs, {datetime.now()}-------------------------\n"
        )

    main()