"""Gradio front end for Tortoise TTS voice cloning.

Takes a text prompt and a reference audio clip, synthesizes speech in the
reference voice with the Tortoise "fast" preset, and returns the path of the
generated WAV file.
"""

import os
import tempfile
import time
from datetime import datetime

import gradio as gr
import spaces
import torch
import torchaudio
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_audio, load_voice, load_voices
from tortoise.utils.text import split_and_recombine_text

# Sample rate of the audio written to disk (value used by the original code).
OUTPUT_SAMPLE_RATE = 24000
# Rate at which the reference clip is loaded. Tortoise's own voice loader uses
# 22050 Hz for conditioning clips.
# NOTE(review): confirm against the installed tortoise version.
REFERENCE_SAMPLE_RATE = 22050

# Built once at import time so every request reuses the same model instance.
tts = TextToSpeech(kv_cache=True)


@spaces.GPU
def inference(
    text,
    reference_audio,
):
    """Synthesize ``text`` in the voice of ``reference_audio``.

    Args:
        text: The text to speak. It is split into chunks with
            ``split_and_recombine_text`` and each chunk is synthesized in turn.
        reference_audio: File path of the reference voice clip (the Gradio
            audio input is configured with ``type="filepath"``).

    Returns:
        Path of a temporary ``.wav`` file containing the generated speech.

    Raises:
        gr.Error: If the text is empty, no reference clip was supplied, or the
            model produced no audio.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")
    if not reference_audio:
        raise gr.Error("Please provide a reference audio clip.")

    # The reference clip is the same for every chunk, so load it only once.
    voice_samples = [load_audio(reference_audio, REFERENCE_SAMPLE_RATE)]

    start_time = time.time()
    all_parts = []
    for chunk in split_and_recombine_text(text):
        # The loop treats tts_with_preset as an iterable of audio frames;
        # collect them all and join at the end.
        all_parts.extend(
            tts.tts_with_preset(
                chunk,
                voice_samples=voice_samples,
                preset="fast",
            )
        )

    if not all_parts:
        raise gr.Error("No audio was generated for the given text.")

    wav = torch.cat(all_parts, dim=0).unsqueeze(0)
    print(wav.shape)
    print("Time taken: ", time.time() - start_time)

    # NamedTemporaryFile creates the file atomically, unlike the deprecated
    # tempfile.mktemp. delete=False keeps it on disk so Gradio can serve it.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_wav_path = tmp.name
    torchaudio.save(output_wav_path, wav.cpu(), OUTPUT_SAMPLE_RATE)
    return output_wav_path


def main():
    """Build the Gradio interface and launch it."""
    title = "Tortoise TTS 🐢"

    text = gr.Textbox(
        label="Text",
    )
    reference_audio = gr.Audio(label="Reference Audio", type="filepath")
    output_audio = gr.Audio(label="Generated Speech")

    interface = gr.Interface(
        fn=inference,
        inputs=[
            text,
            reference_audio,
        ],
        title=title,
        outputs=output_audio,
    )
    interface.launch()


if __name__ == "__main__":
    # Append a run marker to the log so separate launches can be told apart.
    with open("Tortoise_TTS_Runs_Scripts.log", "a", encoding="utf-8") as f:
        f.write(
            f"\n\n-------------------------Tortoise TTS Scripts Logs, {datetime.now()}-------------------------\n"
        )

    main()