n/c
- api.py +14 -6
- models/autoregressive.py +1 -1
api.py
CHANGED
@@ -6,6 +6,7 @@ from urllib import request
 import torch
 import torch.nn.functional as F
 import progressbar
+import torchaudio
 
 from models.cvvp import CVVP
 from models.diffusion_decoder import DiffusionTts
@@ -118,29 +119,36 @@ def fix_autoregressive_output(codes, stop_token, complain=True):
     return codes
 
 
-def do_spectrogram_diffusion(diffusion_model, diffuser, …
+def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_samples, temperature=1):
     """
     Uses the specified diffusion model to convert discrete codes into a spectrogram.
     """
     with torch.no_grad():
         cond_mels = []
         for sample in conditioning_samples:
+            # The diffuser operates at a sample rate of 24000 (except for the latent inputs)
+            sample = torchaudio.functional.resample(sample, 22050, 24000)
             sample = pad_or_truncate(sample, 102400)
-            cond_mel = wav_to_univnet_mel(sample.to(…
+            cond_mel = wav_to_univnet_mel(sample.to(latents.device), do_normalization=False)
             cond_mels.append(cond_mel)
         cond_mels = torch.stack(cond_mels, dim=1)
 
-        output_seq_len = …
-        output_shape = (…
-        precomputed_embeddings = diffusion_model.timestep_independent(…
+        output_seq_len = latents.shape[1] * 4 * 24000 // 22050  # This diffusion model converts from 22kHz spectrogram codes to a 24kHz spectrogram signal.
+        output_shape = (latents.shape[0], 100, output_seq_len)
+        precomputed_embeddings = diffusion_model.timestep_independent(latents, cond_mels, output_seq_len, False)
 
-        noise = torch.randn(output_shape, device=…
+        noise = torch.randn(output_shape, device=latents.device) * temperature
         mel = diffuser.p_sample_loop(diffusion_model, output_shape, noise=noise,
                                      model_kwargs={'precomputed_aligned_embeddings': precomputed_embeddings})
         return denormalize_tacotron_mel(mel)[:,:,:output_seq_len]
 
 
 class TextToSpeech:
+    """
+    Main entry point into Tortoise.
+    :param autoregressive_batch_size: Specifies how many samples to generate per batch. Lower this if you are seeing
+                                      GPU OOM errors. Larger numbers generate slightly faster.
+    """
     def __init__(self, autoregressive_batch_size=16):
         self.autoregressive_batch_size = autoregressive_batch_size
         self.tokenizer = VoiceBpeTokenizer()
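The conditioning path now resamples each clip from 22.05 kHz to the 24 kHz rate the diffuser operates at, then fixes its length at 102400 samples (about 4.27 s at 24 kHz) before computing the univnet mel. A minimal sketch of that preprocessing, using a stand-in for the pad_or_truncate helper defined elsewhere in api.py and a random tensor in place of a real clip:

    import torch
    import torch.nn.functional as F
    import torchaudio

    def pad_or_truncate(t, length):
        # Stand-in for the api.py helper: right-pad with zeros, or clip, to a fixed length.
        if t.shape[-1] < length:
            return F.pad(t, (0, length - t.shape[-1]))
        return t[..., :length]

    clip = torch.randn(1, 22050 * 3)                           # 3 s of fake audio at 22.05 kHz
    clip = torchaudio.functional.resample(clip, 22050, 24000)  # -> exactly 72000 samples at 24 kHz
    clip = pad_or_truncate(clip, 102400)                       # fixed ~4.27 s window for the mel frontend
    print(clip.shape)                                          # torch.Size([1, 102400])

The ordering matters: the 102400-sample window is measured at 24 kHz, so resampling first ensures the pad/crop keeps the intended amount of real audio.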
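The new output_seq_len line is pure rate bookkeeping: each latent frame expands to four spectrogram frames at the 22.05 kHz code rate, and that frame count is then rescaled to the 24 kHz output rate. A sketch of the arithmetic with a fabricated latent tensor (batch, sequence, and channel sizes here are illustrative; the 100 mel bins come from the diff):

    import torch

    latents = torch.randn(1, 500, 1024)  # fabricated (batch, sequence, model_dim) latents
    output_seq_len = latents.shape[1] * 4 * 24000 // 22050
    print(output_seq_len)                # 500 latent frames -> 2000 frames at 22.05 kHz -> 2176 at 24 kHz

    output_shape = (latents.shape[0], 100, output_seq_len)  # 100 mel bins, per the diff
    temperature = 0.8
    # The Gaussian starting noise is scaled by temperature; values below 1 shrink the
    # variance the reverse diffusion starts from, which in practice trades sample
    # diversity for more conservative spectrograms.
    noise = torch.randn(output_shape, device=latents.device) * temperature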
models/autoregressive.py
CHANGED
@@ -356,7 +356,7 @@ class UnifiedVoice(nn.Module):
         preformatting to create a working TTS model.
         """
         # Set padding areas within MEL (currently it is coded with the MEL code for <zero>).
-        mel_lengths = wav_lengths …
+        mel_lengths = torch.div(wav_lengths, self.mel_length_compression, rounding_mode='trunc')
         for b in range(len(mel_lengths)):
             actual_end = mel_lengths[b] + 1  # Due to the convolutional nature of how these tokens are generated, it would be best if the model predicts a token past the actual last token.
             if actual_end < mel_input_tokens.shape[-1]:
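The autoregressive.py change swaps tensor floor division for torch.div with an explicit rounding_mode, the form PyTorch recommended when it deprecated the old truncating floor_divide behavior. A quick sketch of the equivalence for the non-negative lengths involved here; the compression factor of 1024 is an assumption for illustration (the real value is the model's mel_length_compression attribute):

    import torch

    wav_lengths = torch.tensor([66150, 102400, 220500])  # made-up waveform lengths in samples
    mel_length_compression = 1024                        # assumed; really self.mel_length_compression

    mel_lengths = torch.div(wav_lengths, mel_length_compression, rounding_mode='trunc')
    print(mel_lengths)                                   # tensor([ 64, 100, 215])
    # For non-negative operands, truncation and floor division agree, so this is a
    # warning-free equivalent of wav_lengths // mel_length_compression.
    assert torch.equal(mel_lengths, wav_lengths // mel_length_compression)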
|