cdminix committed on
Commit
5f95221
·
verified ·
1 Parent(s): e991413

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -90
app.py CHANGED
@@ -8,88 +8,32 @@ from tortoise.api import TextToSpeech
8
  from tortoise.utils.text import split_and_recombine_text
9
  from tortoise.utils.audio import load_audio, load_voice, load_voices
10
 
11
- VOICE_OPTIONS = [
12
- "angie",
13
- "deniro",
14
- "freeman",
15
- "halle",
16
- "lj",
17
- "myself",
18
- "pat2",
19
- "snakes",
20
- "tom",
21
- "daws",
22
- "dreams",
23
- "grace",
24
- "lescault",
25
- "weaver",
26
- "applejack",
27
- "daniel",
28
- "emma",
29
- "geralt",
30
- "jlaw",
31
- "mol",
32
- "pat",
33
- "rainbow",
34
- "tim_reynolds",
35
- "atkins",
36
- "dortice",
37
- "empire",
38
- "kennard",
39
- "mouse",
40
- "william",
41
- "jane_eyre",
42
- "random", # special option for random voice
43
- ]
44
-
45
 
46
  def inference(
47
  text,
48
- script,
49
- voice,
50
- voice_b,
51
  seed,
52
- split_by_newline,
53
  ):
54
- if text is None or text.strip() == "":
55
- with open(script.name) as f:
56
- text = f.read()
57
- if text.strip() == "":
58
- raise gr.Error("Please provide either text or script file with content.")
59
-
60
- if split_by_newline == "Yes":
61
- texts = list(filter(lambda x: x.strip() != "", text.split("\n")))
62
- else:
63
- texts = split_and_recombine_text(text)
64
-
65
- voices = [voice]
66
- if voice_b != "disabled":
67
- voices.append(voice_b)
68
-
69
- if len(voices) == 1:
70
- voice_samples, conditioning_latents = load_voice(voice)
71
- else:
72
- voice_samples, conditioning_latents = load_voices(voices)
73
 
74
  start_time = time.time()
75
 
76
- # all_parts = []
77
  for j, text in enumerate(texts):
78
  for audio_frame in tts.tts_with_preset(
79
  text,
80
- voice_samples=voice_samples,
81
- conditioning_latents=conditioning_latents,
82
- preset="ultra_fast",
83
- k=1
84
  ):
85
  # print("Time taken: ", time.time() - start_time)
86
- # all_parts.append(audio_frame)
87
  yield (24000, audio_frame.cpu().detach().numpy())
88
 
89
- # wav = torch.cat(all_parts, dim=0).unsqueeze(0)
90
- # print(wav.shape)
91
- # torchaudio.save("output.wav", wav.cpu(), 24000)
92
- # yield (None, gr.make_waveform(audio="output.wav",))
 
93
  def main():
94
  title = "Tortoise TTS 🐢"
95
  description = """
@@ -101,37 +45,19 @@ def main():
101
  <br/>
102
  """
103
  text = gr.Textbox(
104
- lines=4,
105
- label="Text (Provide either text, or upload a newline separated text file below):",
106
  )
107
- script = gr.File(label="Upload a text file")
108
 
109
- voice = gr.Dropdown(
110
- VOICE_OPTIONS, value="jane_eyre", label="Select voice:", type="value"
111
- )
112
- voice_b = gr.Dropdown(
113
- VOICE_OPTIONS,
114
- value="disabled",
115
- label="(Optional) Select second voice:",
116
- type="value",
117
- )
118
- split_by_newline = gr.Radio(
119
- ["Yes", "No"],
120
- label="Split by newline (If [No], it will automatically try to find relevant splits):",
121
- type="value",
122
- value="No",
123
- )
124
 
125
- output_audio = gr.Audio(label="streaming audio:", streaming=True, autoplay=True)
126
  # download_audio = gr.Audio(label="dowanload audio:")
127
  interface = gr.Interface(
128
  fn=inference,
129
  inputs=[
130
  text,
131
- script,
132
- voice,
133
- voice_b,
134
- split_by_newline,
135
  ],
136
  title=title,
137
  description=description,
 
8
  from tortoise.utils.text import split_and_recombine_text
9
  from tortoise.utils.audio import load_audio, load_voice, load_voices
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
  def inference(
13
  text,
14
+ reference_audio,
 
 
15
  seed,
 
16
  ):
17
+ texts = split_and_recombine_text(text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
  start_time = time.time()
20
 
21
+ all_parts = []
22
  for j, text in enumerate(texts):
23
  for audio_frame in tts.tts_with_preset(
24
  text,
25
+ voice_samples=load_audio(init_audio_file),
26
+ preset="fast",
 
 
27
  ):
28
  # print("Time taken: ", time.time() - start_time)
29
+ all_parts.append(audio_frame)
30
  yield (24000, audio_frame.cpu().detach().numpy())
31
 
32
+ wav = torch.cat(all_parts, dim=0).unsqueeze(0)
33
+ print(wav.shape)
34
+ torchaudio.save("output.wav", wav.cpu(), 24000)
35
+ yield (None, gr.make_waveform(audio="output.wav",))
36
+
37
  def main():
38
  title = "Tortoise TTS 🐢"
39
  description = """
 
45
  <br/>
46
  """
47
  text = gr.Textbox(
48
+ lines=1,
49
+ label="Text",
50
  )
 
51
 
52
+ reference_audio = gr.Audio(label="Reference Audio", type="filepath"),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
+ output_audio = gr.Audio(label="Audio:")
55
  # download_audio = gr.Audio(label="dowanload audio:")
56
  interface = gr.Interface(
57
  fn=inference,
58
  inputs=[
59
  text,
60
+ reference_audio,
 
 
 
61
  ],
62
  title=title,
63
  description=description,