tortoise-tts

Sleeping

App Files Files Community

cdminix committed 17 days ago

Commit

5f95221

verified ·

1 Parent(s): e991413

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -90

app.py CHANGED Viewed

@@ -8,88 +8,32 @@ from tortoise.api import TextToSpeech
 from tortoise.utils.text import split_and_recombine_text
 from tortoise.utils.audio import load_audio, load_voice, load_voices
-VOICE_OPTIONS = [
-    "angie",
-    "deniro",
-    "freeman",
-    "halle",
-    "lj",
-    "myself",
-    "pat2",
-    "snakes",
-    "tom",
-    "daws",
-    "dreams",
-    "grace",
-    "lescault",
-    "weaver",
-    "applejack",
-    "daniel",
-    "emma",
-    "geralt",
-    "jlaw",
-    "mol",
-    "pat",
-    "rainbow",
-    "tim_reynolds",
-    "atkins",
-    "dortice",
-    "empire",
-    "kennard",
-    "mouse",
-    "william",
-    "jane_eyre",
-    "random",  # special option for random voice
-]
 def inference(
     text,
-    script,
-    voice,
-    voice_b,
     seed,
-    split_by_newline,
 ):
-    if text is None or text.strip() == "":
-        with open(script.name) as f:
-            text = f.read()
-        if text.strip() == "":
-            raise gr.Error("Please provide either text or script file with content.")
-    if split_by_newline == "Yes":
-        texts = list(filter(lambda x: x.strip() != "", text.split("\n")))
-    else:
-        texts = split_and_recombine_text(text)
-    voices = [voice]
-    if voice_b != "disabled":
-        voices.append(voice_b)
-    if len(voices) == 1:
-        voice_samples, conditioning_latents = load_voice(voice)
-    else:
-        voice_samples, conditioning_latents = load_voices(voices)
     start_time = time.time()
-    # all_parts = []
     for j, text in enumerate(texts):
         for audio_frame in tts.tts_with_preset(
             text,
-            voice_samples=voice_samples,
-            conditioning_latents=conditioning_latents,
-            preset="ultra_fast",
-            k=1
         ):
             # print("Time taken: ", time.time() - start_time)
-            # all_parts.append(audio_frame)
             yield (24000, audio_frame.cpu().detach().numpy())
-    # wav = torch.cat(all_parts, dim=0).unsqueeze(0)
-    # print(wav.shape)
-    # torchaudio.save("output.wav", wav.cpu(), 24000)
-    # yield (None, gr.make_waveform(audio="output.wav",))
 def main():
     title = "Tortoise TTS 🐢"
     description = """
@@ -101,37 +45,19 @@ def main():
     <br/>
     """
     text = gr.Textbox(
-        lines=4,
-        label="Text (Provide either text, or upload a newline separated text file below):",
     )
-    script = gr.File(label="Upload a text file")
-    voice = gr.Dropdown(
-        VOICE_OPTIONS, value="jane_eyre", label="Select voice:", type="value"
-    )
-    voice_b = gr.Dropdown(
-        VOICE_OPTIONS,
-        value="disabled",
-        label="(Optional) Select second voice:",
-        type="value",
-    )
-    split_by_newline = gr.Radio(
-        ["Yes", "No"],
-        label="Split by newline (If [No], it will automatically try to find relevant splits):",
-        type="value",
-        value="No",
-    )
-    output_audio = gr.Audio(label="streaming audio:", streaming=True, autoplay=True)
     # download_audio = gr.Audio(label="dowanload audio:")
     interface = gr.Interface(
         fn=inference,
         inputs=[
             text,
-            script,
-            voice,
-            voice_b,
-            split_by_newline,
         ],
         title=title,
         description=description,

 from tortoise.utils.text import split_and_recombine_text
 from tortoise.utils.audio import load_audio, load_voice, load_voices
 def inference(
     text,
     reference_audio,
     seed=None,
 ):
     """Stream synthesized speech for *text*, conditioned on a reference clip.

     Args:
         text: The text to synthesize. It is split into chunks with
             ``split_and_recombine_text`` and each chunk is synthesized in turn.
         reference_audio: Path to the audio file used as the voice sample.
         seed: Currently unused. It defaults to ``None`` so the function can
             be called with only ``text`` and ``reference_audio``.

     Yields:
         ``(24000, samples)`` for every audio frame produced by the model,
         where ``samples`` is the frame converted to a NumPy array. After the
         last frame, all frames are concatenated and written to
         ``output.wav``, and a final ``(None, waveform)`` item built by
         ``gr.make_waveform`` is yielded.

     Raises:
         gr.Error: If no reference audio was provided.
     """
     if not reference_audio:
         raise gr.Error("Please provide a reference audio clip.")

     texts = split_and_recombine_text(text)

     # Load the reference clip once instead of once per text chunk.
     # NOTE(review): Tortoise's own voice loader reads conditioning clips at
     # 22050 Hz and passes them as a list -- confirm against the installed
     # tortoise version.
     voice_samples = [load_audio(reference_audio, 22050)]

     all_parts = []
     for chunk in texts:
         for audio_frame in tts.tts_with_preset(
             chunk,
             voice_samples=voice_samples,
             preset="fast",
         ):
             all_parts.append(audio_frame)
             yield (24000, audio_frame.cpu().detach().numpy())

     # torch.cat raises on an empty list; nothing to save if no frame was made.
     if not all_parts:
         return

     wav = torch.cat(all_parts, dim=0).unsqueeze(0)
     print(wav.shape)
     torchaudio.save("output.wav", wav.cpu(), 24000)
     # NOTE(review): this final item has a different shape than the streamed
     # (rate, samples) tuples -- confirm the output component accepts it.
     yield (None, gr.make_waveform(audio="output.wav",))
 def main():
     title = "Tortoise TTS 🐢"
     description = """
     <br/>
     """
     text = gr.Textbox(
+        lines=1,
+        label="Text",
     )
+    reference_audio = gr.Audio(label="Reference Audio", type="filepath"),
+    output_audio = gr.Audio(label="Audio:")
     # download_audio = gr.Audio(label="dowanload audio:")
     interface = gr.Interface(
         fn=inference,
         inputs=[
             text,
+            reference_audio,
         ],
         title=title,
         description=description,