del unused vits / --soundscape omit=None specified=Fjord
- Modules/vits/README.md +0 -58
- Modules/vits/losses.py +0 -61
- Modules/vits/models.py +4 -216
- Modules/vits/monotonic_align/__init__.py +0 -19
- Modules/vits/monotonic_align/core.pyx +0 -42
- Modules/vits/monotonic_align/setup.py +0 -9
- Modules/vits/preprocess.py +0 -25
- tts.py +3 -3
Modules/vits/README.md
DELETED
@@ -1,58 +0,0 @@
-# VITS: Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech
-
-### Jaehyeon Kim, Jungil Kong, and Juhee Son
-
-In our recent [paper](https://arxiv.org/abs/2106.06103), we propose VITS: Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech.
-
-Several recent end-to-end text-to-speech (TTS) models enabling single-stage training and parallel sampling have been proposed, but their sample quality does not match that of two-stage TTS systems. In this work, we present a parallel end-to-end TTS method that generates more natural-sounding audio than current two-stage models. Our method adopts variational inference augmented with normalizing flows and an adversarial training process, which improves the expressive power of generative modeling. We also propose a stochastic duration predictor to synthesize speech with diverse rhythms from input text. With the uncertainty modeling over latent variables and the stochastic duration predictor, our method expresses the natural one-to-many relationship in which a text input can be spoken in multiple ways with different pitches and rhythms. A subjective human evaluation (mean opinion score, or MOS) on LJ Speech, a single-speaker dataset, shows that our method outperforms the best publicly available TTS systems and achieves a MOS comparable to ground truth.
-
-Visit our [demo](https://jaywalnut310.github.io/vits-demo/index.html) for audio samples.
-
-We also provide the [pretrained models](https://drive.google.com/drive/folders/1ksarh-cJf3F5eKJjLVWY0X1j1qsQqiS2?usp=sharing).
-
-** Update note: Thanks to [Rishikesh (ऋषिकेश)](https://github.com/jaywalnut310/vits/issues/1), our interactive TTS demo is now available on [Colab Notebook](https://colab.research.google.com/drive/1CO61pZizDj7en71NQG_aqqKdGaA_SaBf?usp=sharing).
-
-<table style="width:100%">
-  <tr>
-    <th>VITS at training</th>
-    <th>VITS at inference</th>
-  </tr>
-  <tr>
-    <td><img src="resources/fig_1a.png" alt="VITS at training" height="400"></td>
-    <td><img src="resources/fig_1b.png" alt="VITS at inference" height="400"></td>
-  </tr>
-</table>
-
-
-## Pre-requisites
-0. Python >= 3.6
-0. Clone this repository
-0. Install python requirements. Please refer to [requirements.txt](requirements.txt)
-    1. You may need to install espeak first: `apt-get install espeak`
-0. Download datasets
-    1. Download and extract the LJ Speech dataset, then rename or create a link to the dataset folder: `ln -s /path/to/LJSpeech-1.1/wavs DUMMY1`
-    1. For the multi-speaker setting, download and extract the VCTK dataset, and downsample the wav files to 22050 Hz. Then rename or create a link to the dataset folder: `ln -s /path/to/VCTK-Corpus/downsampled_wavs DUMMY2`
-0. Build Monotonic Alignment Search and run preprocessing if you use your own datasets.
-```sh
-# Cython-version Monotonic Alignment Search
-cd monotonic_align
-python setup.py build_ext --inplace
-
-# Preprocessing (g2p) for your own datasets. Preprocessed phonemes for LJ Speech and VCTK have already been provided.
-# python preprocess.py --text_index 1 --filelists filelists/ljs_audio_text_train_filelist.txt filelists/ljs_audio_text_val_filelist.txt filelists/ljs_audio_text_test_filelist.txt
-# python preprocess.py --text_index 2 --filelists filelists/vctk_audio_sid_text_train_filelist.txt filelists/vctk_audio_sid_text_val_filelist.txt filelists/vctk_audio_sid_text_test_filelist.txt
-```
-
-
-## Training Example
-```sh
-# LJ Speech
-python train.py -c configs/ljs_base.json -m ljs_base
-
-# VCTK
-python train_ms.py -c configs/vctk_base.json -m vctk_base
-```
-
-
-## Inference Example
-See [inference.ipynb](inference.ipynb)
Modules/vits/losses.py
DELETED
@@ -1,61 +0,0 @@
-import torch
-from torch.nn import functional as F
-
-import commons
-
-
-def feature_loss(fmap_r, fmap_g):
-  loss = 0
-  for dr, dg in zip(fmap_r, fmap_g):
-    for rl, gl in zip(dr, dg):
-      rl = rl.float().detach()
-      gl = gl.float()
-      loss += torch.mean(torch.abs(rl - gl))
-
-  return loss * 2
-
-
-def discriminator_loss(disc_real_outputs, disc_generated_outputs):
-  loss = 0
-  r_losses = []
-  g_losses = []
-  for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
-    dr = dr.float()
-    dg = dg.float()
-    r_loss = torch.mean((1-dr)**2)
-    g_loss = torch.mean(dg**2)
-    loss += (r_loss + g_loss)
-    r_losses.append(r_loss.item())
-    g_losses.append(g_loss.item())
-
-  return loss, r_losses, g_losses
-
-
-def generator_loss(disc_outputs):
-  loss = 0
-  gen_losses = []
-  for dg in disc_outputs:
-    dg = dg.float()
-    l = torch.mean((1-dg)**2)
-    gen_losses.append(l)
-    loss += l
-
-  return loss, gen_losses
-
-
-def kl_loss(z_p, logs_q, m_p, logs_p, z_mask):
-  """
-  z_p, logs_q: [b, h, t_t]
-  m_p, logs_p: [b, h, t_t]
-  """
-  z_p = z_p.float()
-  logs_q = logs_q.float()
-  m_p = m_p.float()
-  logs_p = logs_p.float()
-  z_mask = z_mask.float()
-
-  kl = logs_p - logs_q - 0.5
-  kl += 0.5 * ((z_p - m_p)**2) * torch.exp(-2. * logs_p)
-  kl = torch.sum(kl * z_mask)
-  l = kl / torch.sum(z_mask)
-  return l
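Aside (not part of the diff): the deleted `kl_loss` is a single-sample estimate of the KL divergence between diagonal Gaussians, assuming `z_p` is a sample from the posterior `N(m_q, exp(logs_q))`. A minimal sanity-check sketch against PyTorch's closed form, with all tensors made up for illustration:

```python
import torch

m_q, logs_q = torch.randn(4), 0.1 * torch.randn(4)
m_p, logs_p = torch.randn(4), 0.1 * torch.randn(4)

z_p = m_q + torch.randn(100000, 4) * torch.exp(logs_q)  # samples from q
# per-sample estimator used by the deleted kl_loss (before masking/averaging)
kl_est = logs_p - logs_q - 0.5 + 0.5 * (z_p - m_p) ** 2 * torch.exp(-2.0 * logs_p)

q = torch.distributions.Normal(m_q, torch.exp(logs_q))
p = torch.distributions.Normal(m_p, torch.exp(logs_p))
print(kl_est.mean(0))                        # ~ matches the closed form below
print(torch.distributions.kl_divergence(q, p))
```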
Modules/vits/models.py
CHANGED
@@ -7,7 +7,6 @@ from torch.nn import functional as F
 import commons
 import modules
 import attentions
-import monotonic_align
 
 from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
 from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
@@ -94,44 +93,6 @@ class StochasticDurationPredictor(nn.Module):
       logw = z0
       return logw
 
-
-class DurationPredictor(nn.Module):
-  def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0):
-    super().__init__()
-
-    self.in_channels = in_channels
-    self.filter_channels = filter_channels
-    self.kernel_size = kernel_size
-    self.p_dropout = p_dropout
-    self.gin_channels = gin_channels
-
-    self.drop = nn.Dropout(p_dropout)
-    self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size//2)
-    self.norm_1 = modules.LayerNorm(filter_channels)
-    self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size//2)
-    self.norm_2 = modules.LayerNorm(filter_channels)
-    self.proj = nn.Conv1d(filter_channels, 1, 1)
-
-    if gin_channels != 0:
-      self.cond = nn.Conv1d(gin_channels, in_channels, 1)
-
-  def forward(self, x, x_mask, g=None):
-    x = torch.detach(x)
-    if g is not None:
-      g = torch.detach(g)
-      x = x + self.cond(g)
-    x = self.conv_1(x * x_mask)
-    x = torch.relu(x)
-    x = self.norm_1(x)
-    x = self.drop(x)
-    x = self.conv_2(x * x_mask)
-    x = torch.relu(x)
-    x = self.norm_2(x)
-    x = self.drop(x)
-    x = self.proj(x * x_mask)
-    return x * x_mask
-
-
 class TextEncoder(nn.Module):
   def __init__(self,
       n_vocab,
@@ -208,39 +169,6 @@ class ResidualCouplingBlock(nn.Module):
       x = flow(x, x_mask, g=g, reverse=reverse)
     return x
 
-
-class PosteriorEncoder(nn.Module):
-  def __init__(self,
-      in_channels,
-      out_channels,
-      hidden_channels,
-      kernel_size,
-      dilation_rate,
-      n_layers,
-      gin_channels=0):
-    super().__init__()
-    self.in_channels = in_channels
-    self.out_channels = out_channels
-    self.hidden_channels = hidden_channels
-    self.kernel_size = kernel_size
-    self.dilation_rate = dilation_rate
-    self.n_layers = n_layers
-    self.gin_channels = gin_channels
-
-    self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
-    self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
-    self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
-
-  def forward(self, x, x_lengths, g=None):
-    x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
-    x = self.pre(x) * x_mask
-    x = self.enc(x, x_mask, g=g)
-    stats = self.proj(x) * x_mask
-    m, logs = torch.split(stats, self.out_channels, dim=1)
-    z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
-    return z, m, logs, x_mask
-
-
 class Generator(torch.nn.Module):
   def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0):
     super(Generator, self).__init__()
@@ -296,97 +224,6 @@ class Generator(torch.nn.Module):
       l.remove_weight_norm()
 
 
-class DiscriminatorP(torch.nn.Module):
-  def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
-    super(DiscriminatorP, self).__init__()
-    self.period = period
-    self.use_spectral_norm = use_spectral_norm
-    norm_f = weight_norm if use_spectral_norm == False else spectral_norm
-    self.convs = nn.ModuleList([
-      norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
-      norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
-      norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
-      norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
-      norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))),
-    ])
-    self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
-
-  def forward(self, x):
-    fmap = []
-
-    # 1d to 2d
-    b, c, t = x.shape
-    if t % self.period != 0: # pad first
-      n_pad = self.period - (t % self.period)
-      x = F.pad(x, (0, n_pad), "reflect")
-      t = t + n_pad
-    x = x.view(b, c, t // self.period, self.period)
-
-    for l in self.convs:
-      x = l(x)
-      x = F.leaky_relu(x, modules.LRELU_SLOPE)
-      fmap.append(x)
-    x = self.conv_post(x)
-    fmap.append(x)
-    x = torch.flatten(x, 1, -1)
-
-    return x, fmap
-
-
-class DiscriminatorS(torch.nn.Module):
-  def __init__(self, use_spectral_norm=False):
-    super(DiscriminatorS, self).__init__()
-    norm_f = weight_norm if use_spectral_norm == False else spectral_norm
-    self.convs = nn.ModuleList([
-      norm_f(Conv1d(1, 16, 15, 1, padding=7)),
-      norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
-      norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
-      norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
-      norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
-      norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
-    ])
-    self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
-
-  def forward(self, x):
-    fmap = []
-
-    for l in self.convs:
-      x = l(x)
-      x = F.leaky_relu(x, modules.LRELU_SLOPE)
-      fmap.append(x)
-    x = self.conv_post(x)
-    fmap.append(x)
-    x = torch.flatten(x, 1, -1)
-
-    return x, fmap
-
-
-class MultiPeriodDiscriminator(torch.nn.Module):
-  def __init__(self, use_spectral_norm=False):
-    super(MultiPeriodDiscriminator, self).__init__()
-    periods = [2,3,5,7,11]
-
-    discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
-    discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods]
-    self.discriminators = nn.ModuleList(discs)
-
-  def forward(self, y, y_hat):
-    y_d_rs = []
-    y_d_gs = []
-    fmap_rs = []
-    fmap_gs = []
-    for i, d in enumerate(self.discriminators):
-      y_d_r, fmap_r = d(y)
-      y_d_g, fmap_g = d(y_hat)
-      y_d_rs.append(y_d_r)
-      y_d_gs.append(y_d_g)
-      fmap_rs.append(fmap_r)
-      fmap_gs.append(fmap_g)
-
-    return y_d_rs, y_d_gs, fmap_rs, fmap_gs
-
-
 class SynthesizerTrn(nn.Module):
   """
   Synthesizer for Training
@@ -445,57 +282,19 @@ class SynthesizerTrn(nn.Module):
         kernel_size,
         p_dropout)
     self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels)
-
+
     self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
 
     if use_sdp:
+      # raise ValueError
       self.dp = StochasticDurationPredictor(hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels)
     else:
-
+      raise ValueError
+      # self.dp = DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels)
 
     if n_speakers > 1:
      self.emb_g = nn.Embedding(n_speakers, gin_channels)
 
-  def forward(self, x, x_lengths, y, y_lengths, sid=None):
-
-    x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)
-    if self.n_speakers > 0:
-      g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
-    else:
-      g = None
-
-    z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
-    z_p = self.flow(z, y_mask, g=g)
-
-    with torch.no_grad():
-      # negative cross-entropy
-      s_p_sq_r = torch.exp(-2 * logs_p) # [b, d, t]
-      neg_cent1 = torch.sum(-0.5 * math.log(2 * math.pi) - logs_p, [1], keepdim=True) # [b, 1, t_s]
-      neg_cent2 = torch.matmul(-0.5 * (z_p ** 2).transpose(1, 2), s_p_sq_r) # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s]
-      neg_cent3 = torch.matmul(z_p.transpose(1, 2), (m_p * s_p_sq_r)) # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s]
-      neg_cent4 = torch.sum(-0.5 * (m_p ** 2) * s_p_sq_r, [1], keepdim=True) # [b, 1, t_s]
-      neg_cent = neg_cent1 + neg_cent2 + neg_cent3 + neg_cent4
-
-      attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
-      attn = monotonic_align.maximum_path(neg_cent, attn_mask.squeeze(1)).unsqueeze(1).detach()
-
-    w = attn.sum(2)
-    if self.use_sdp:
-      l_length = self.dp(x, x_mask, w, g=g)
-      l_length = l_length / torch.sum(x_mask)
-    else:
-      logw_ = torch.log(w + 1e-6) * x_mask
-      logw = self.dp(x, x_mask, g=g)
-      l_length = torch.sum((logw - logw_)**2, [1,2]) / torch.sum(x_mask) # for averaging
-
-    # expand prior
-    m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2)
-    logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2)
-
-    z_slice, ids_slice = commons.rand_slice_segments(z, y_lengths, self.segment_size)
-    o = self.dec(z_slice, g=g)
-    return o, l_length, attn, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
-
   def infer(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None):
     x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)
     if self.n_speakers > 0:
@@ -521,14 +320,3 @@ class SynthesizerTrn(nn.Module):
     z = self.flow(z_p, y_mask, g=g, reverse=True)
     o = self.dec((z * y_mask)[:,:,:max_len], g=g)
     return o, attn, y_mask, (z, z_p, m_p, logs_p)
-
-  def voice_conversion(self, y, y_lengths, sid_src, sid_tgt):
-    assert self.n_speakers > 0, "n_speakers have to be larger than 0."
-    g_src = self.emb_g(sid_src).unsqueeze(-1)
-    g_tgt = self.emb_g(sid_tgt).unsqueeze(-1)
-    z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src)
-    z_p = self.flow(z, y_mask, g=g_src)
-    z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True)
-    o_hat = self.dec(z_hat * y_mask, g=g_tgt)
-    return o_hat, y_mask, (z, z_p, z_hat)
-
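Aside (not part of the diff): the `neg_cent` terms in the deleted `forward()` factor the per-frame Gaussian log-likelihood log N(z_p[:, :, t]; m_p[:, :, s], exp(logs_p[:, :, s])) into four matrix operations, which is what `monotonic_align.maximum_path` then aligned. A small self-contained check of that identity, with toy shapes chosen for illustration:

```python
import math
import torch

b, d, t_t, t_s = 1, 2, 5, 3
z_p = torch.randn(b, d, t_t)
m_p, logs_p = torch.randn(b, d, t_s), 0.1 * torch.randn(b, d, t_s)

s_p_sq_r = torch.exp(-2 * logs_p)  # reciprocal variances, [b, d, t_s]
neg_cent = (
    torch.sum(-0.5 * math.log(2 * math.pi) - logs_p, [1], keepdim=True)  # [b, 1, t_s]
    + torch.matmul(-0.5 * (z_p ** 2).transpose(1, 2), s_p_sq_r)          # [b, t_t, t_s]
    + torch.matmul(z_p.transpose(1, 2), m_p * s_p_sq_r)                  # [b, t_t, t_s]
    + torch.sum(-0.5 * (m_p ** 2) * s_p_sq_r, [1], keepdim=True)         # [b, 1, t_s]
)

# reference: channel-summed Gaussian log-density for every (text, spec) frame pair
ref = torch.stack([
    torch.distributions.Normal(m_p[..., s].unsqueeze(1), torch.exp(logs_p[..., s]).unsqueeze(1))
        .log_prob(z_p.transpose(1, 2)).sum(-1)  # [b, t_t]
    for s in range(t_s)
], dim=-1)                                       # [b, t_t, t_s]
print(torch.allclose(neg_cent, ref, atol=1e-5))  # True
```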
Modules/vits/monotonic_align/__init__.py
DELETED
@@ -1,19 +0,0 @@
-import numpy as np
-import torch
-from .monotonic_align.core import maximum_path_c
-
-
-def maximum_path(neg_cent, mask):
-  """ Cython optimized version.
-  neg_cent: [b, t_t, t_s]
-  mask: [b, t_t, t_s]
-  """
-  device = neg_cent.device
-  dtype = neg_cent.dtype
-  neg_cent = neg_cent.data.cpu().numpy().astype(np.float32)
-  path = np.zeros(neg_cent.shape, dtype=np.int32)
-
-  t_t_max = mask.sum(1)[:, 0].data.cpu().numpy().astype(np.int32)
-  t_s_max = mask.sum(2)[:, 0].data.cpu().numpy().astype(np.int32)
-  maximum_path_c(path, neg_cent, t_t_max, t_s_max)
-  return torch.from_numpy(path).to(device=device, dtype=dtype)
Modules/vits/monotonic_align/core.pyx
DELETED
@@ -1,42 +0,0 @@
-cimport cython
-from cython.parallel import prange
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-cdef void maximum_path_each(int[:,::1] path, float[:,::1] value, int t_y, int t_x, float max_neg_val=-1e9) nogil:
-  cdef int x
-  cdef int y
-  cdef float v_prev
-  cdef float v_cur
-  cdef float tmp
-  cdef int index = t_x - 1
-
-  for y in range(t_y):
-    for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)):
-      if x == y:
-        v_cur = max_neg_val
-      else:
-        v_cur = value[y-1, x]
-      if x == 0:
-        if y == 0:
-          v_prev = 0.
-        else:
-          v_prev = max_neg_val
-      else:
-        v_prev = value[y-1, x-1]
-      value[y, x] += max(v_prev, v_cur)
-
-  for y in range(t_y - 1, -1, -1):
-    path[y, index] = 1
-    if index != 0 and (index == y or value[y-1, index] < value[y-1, index-1]):
-      index = index - 1
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-cpdef void maximum_path_c(int[:,:,::1] paths, float[:,:,::1] values, int[::1] t_ys, int[::1] t_xs) nogil:
-  cdef int b = paths.shape[0]
-  cdef int i
-  for i in prange(b, nogil=True):
-    maximum_path_each(paths[i], values[i], t_ys[i], t_xs[i])
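Aside (not part of the diff): the deleted Cython kernel is a Viterbi-style dynamic program that picks the best monotonic alignment path through a log-likelihood matrix. A direct NumPy transcription of the same logic (slow, illustrative only; the repo used the compiled version):

```python
import numpy as np

def maximum_path_numpy(value, t_y, t_x, max_neg_val=-1e9):
    """Pure-Python port of maximum_path_each: value is a [t_y, t_x] log-likelihood
    matrix, modified in place by the forward pass; returns a 0/1 path of that shape."""
    # forward pass: accumulate the best monotonic-path score into value
    for y in range(t_y):
        for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)):
            v_cur = max_neg_val if x == y else value[y - 1, x]  # stay on column x
            if x == 0:
                v_prev = 0.0 if y == 0 else max_neg_val
            else:
                v_prev = value[y - 1, x - 1]                    # advance one column
            value[y, x] += max(v_prev, v_cur)
    # backward pass: trace the argmax path back from the last column
    path = np.zeros((t_y, t_x), dtype=np.int32)
    index = t_x - 1
    for y in range(t_y - 1, -1, -1):
        path[y, index] = 1
        if index != 0 and (index == y or value[y - 1, index] < value[y - 1, index - 1]):
            index -= 1
    return path

# toy usage: align 6 spectrogram frames to 3 text tokens
scores = np.log(np.random.rand(6, 3).astype(np.float32))
print(maximum_path_numpy(scores, 6, 3))  # one 1 per row, columns non-decreasing
```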
Modules/vits/monotonic_align/setup.py
DELETED
@@ -1,9 +0,0 @@
-from distutils.core import setup
-from Cython.Build import cythonize
-import numpy
-
-setup(
-  name = 'monotonic_align',
-  ext_modules = cythonize("core.pyx"),
-  include_dirs=[numpy.get_include()]
-)
Modules/vits/preprocess.py
DELETED
@@ -1,25 +0,0 @@
-import argparse
-import text
-from utils import load_filepaths_and_text
-
-if __name__ == '__main__':
-  parser = argparse.ArgumentParser()
-  parser.add_argument("--out_extension", default="cleaned")
-  parser.add_argument("--text_index", default=1, type=int)
-  parser.add_argument("--filelists", nargs="+", default=["filelists/ljs_audio_text_val_filelist.txt", "filelists/ljs_audio_text_test_filelist.txt"])
-  parser.add_argument("--text_cleaners", nargs="+", default=["english_cleaners2"])
-
-  args = parser.parse_args()
-
-
-  for filelist in args.filelists:
-    print("START:", filelist)
-    filepaths_and_text = load_filepaths_and_text(filelist)
-    for i in range(len(filepaths_and_text)):
-      original_text = filepaths_and_text[i][args.text_index]
-      cleaned_text = text._clean_text(original_text, args.text_cleaners)
-      filepaths_and_text[i][args.text_index] = cleaned_text
-
-    new_filelist = filelist + "." + args.out_extension
-    with open(new_filelist, "w", encoding="utf-8") as f:
-      f.writelines(["|".join(x) + "\n" for x in filepaths_and_text])
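Aside (not part of the diff): `preprocess.py` implies a pipe-separated filelist format, with `--text_index` selecting the column to clean (1 for `path|text` LJ Speech lists, 2 for `path|speaker_id|text` VCTK lists, judging by the filelist names). A hypothetical round trip of one row:

```python
# hypothetical row; the real lists live under filelists/ and use DUMMY1/DUMMY2 symlinks
row = "DUMMY2/p225/p225_001.wav|0|Please call Stella.".split("|")
text_index = 2                       # matches --text_index 2 for the VCTK lists
row[text_index] = "phonemized text"  # stand-in for text._clean_text(...)
print("|".join(row))                 # one line of the <filelist>.cleaned output
```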
tts.py
CHANGED
@@ -45,10 +45,10 @@ def command_line_args():
     parser.add_argument(
         '--soundscape',
         help='soundscape - MUST BE IN BRACKETS: "forest"',
-        default='wind fjord',
+        default=None,  # 'wind fjord'
         nargs='?',
         type=str,
-        const=
+        const='wind fjord',
         )
     parser.add_argument(
         '--native',
@@ -175,4 +175,4 @@ if __name__ == '__main__':
     cli()
 
 # assume also video and text for video we have to write some classes for video for audiocraft
-# then call tts.py on this video with nonempty labels - thus calls audiocraft
+# then call tts.py on this video with nonempty labels - thus calls audiocraft
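Aside (not part of the diff): the commit title's "omit=None specified=Fjord" behavior follows from argparse's `nargs='?'` semantics, where `default` applies when the flag is absent and `const` when it is given with no value. A minimal standalone demonstration, not the project's actual parser:

```python
import argparse

p = argparse.ArgumentParser()
p.add_argument('--soundscape', nargs='?', type=str,
               default=None,        # flag omitted entirely -> None (no soundscape)
               const='wind fjord')  # bare --soundscape     -> 'wind fjord'

print(p.parse_args([]).soundscape)                          # None
print(p.parse_args(['--soundscape']).soundscape)            # wind fjord
print(p.parse_args(['--soundscape', 'forest']).soundscape)  # forest
```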