DEL DIFFUSION

Files changed:
- Modules/diffusion/sampler.py  +0 -181
- models.py  +5 -23
- msinference.py  +3 -12
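Summary: this commit removes the diffusion-based style sampler. Modules/diffusion/sampler.py is deleted outright, models.py drops the KDiffusion wiring from build_model (and the diffusion entry from the nets Munch), and msinference.py replaces the sampled style vector with slices taken directly from the reference embedding ref_s.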
Modules/diffusion/sampler.py
DELETED (181 lines). Full contents of the removed file:
from math import sqrt
from functools import reduce

import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
from torch import Tensor


def default(val, d):
    if val is not None:
        return val
    return d


class LogNormalDistribution:
    def __init__(self, mean: float, std: float):
        self.mean = mean
        self.std = std

    def __call__(
        self, num_samples: int, device: torch.device = torch.device("cpu")
    ) -> Tensor:
        normal = self.mean + self.std * torch.randn((num_samples,), device=device)
        return normal.exp()


class UniformDistribution:
    def __call__(self, num_samples: int, device: torch.device = torch.device("cpu")):
        return torch.rand(num_samples, device=device)


def to_batch(batch_size: int, device: torch.device, x=None, xs=None):
    # Either x or xs must be provided; if the scalar x is given,
    # broadcast it to the whole batch.
    if x is not None:
        xs = torch.full(size=(batch_size,), fill_value=x).to(device)
    return xs


class KDiffusion(nn.Module):
    """Elucidated Diffusion (Karras et al. 2022): https://arxiv.org/abs/2206.00364"""

    alias = "k"

    def __init__(
        self,
        net: nn.Module,
        *,
        sigma_distribution,
        sigma_data: float,  # data distribution standard deviation
        dynamic_threshold: float = 0.0,
    ):
        super().__init__()
        # sigma_distribution and dynamic_threshold are accepted for API
        # compatibility but unused in this inference-only version.
        self.net = net
        self.sigma_data = sigma_data

    def get_scale_weights(self, sigmas):
        sigma_data = self.sigma_data
        c_noise = torch.log(sigmas) * 0.25
        sigmas = rearrange(sigmas, "b -> b 1 1")
        c_skip = (sigma_data ** 2) / (sigmas ** 2 + sigma_data ** 2)
        c_out = sigmas * sigma_data * (sigma_data ** 2 + sigmas ** 2) ** -0.5
        c_in = (sigmas ** 2 + sigma_data ** 2) ** -0.5
        return c_skip, c_out, c_in, c_noise

    def denoise_fn(self, x_noisy, sigmas=None, sigma=None, **kwargs):
        batch_size, device = x_noisy.shape[0], x_noisy.device
        sigmas = to_batch(x=sigma, xs=sigmas, batch_size=batch_size, device=device)

        # Predict network output and add skip connection; kwargs carries the
        # conditioning tensors (dict_keys(['embedding', 'features'])).
        c_skip, c_out, c_in, c_noise = self.get_scale_weights(sigmas)
        x_pred = self.net(c_in * x_noisy, c_noise, **kwargs)
        x_denoised = c_skip * x_noisy + c_out * x_pred

        return x_denoised


class KarrasSchedule(nn.Module):
    """https://arxiv.org/abs/2206.00364 equation 5"""

    def __init__(self, sigma_min: float, sigma_max: float, rho: float = 7.0):
        super().__init__()
        self.sigma_min = sigma_min
        self.sigma_max = sigma_max
        self.rho = rho

    def forward(self, num_steps: int, device):
        rho_inv = 1.0 / self.rho
        steps = torch.arange(num_steps, device=device, dtype=torch.float32)
        sigmas = (
            self.sigma_max ** rho_inv
            + (steps / (num_steps - 1))
            * (self.sigma_min ** rho_inv - self.sigma_max ** rho_inv)
        ) ** self.rho
        sigmas = F.pad(sigmas, pad=(0, 1), value=0.0)
        return sigmas


class ADPM2Sampler(nn.Module):
    """https://www.desmos.com/calculator/jbxjlqd9mb"""

    diffusion_types = [KDiffusion]

    def __init__(self, rho: float = 1.0):
        super().__init__()
        self.rho = rho

    def get_sigmas(self, sigma, sigma_next):
        r = self.rho
        sigma_up = sqrt(sigma_next ** 2 * (sigma ** 2 - sigma_next ** 2) / sigma ** 2)
        sigma_down = sqrt(sigma_next ** 2 - sigma_up ** 2)
        sigma_mid = ((sigma ** (1 / r) + sigma_down ** (1 / r)) / 2) ** r
        return sigma_up, sigma_down, sigma_mid

    def step(self, x, fn, sigma, sigma_next):
        sigma_up, sigma_down, sigma_mid = self.get_sigmas(sigma, sigma_next)
        # Derivative at sigma (dx/dsigma)
        d = (x - fn(x, sigma=sigma)) / sigma
        # Denoise to midpoint
        x_mid = x + d * (sigma_mid - sigma)
        # Derivative at sigma_mid (dx_mid/dsigma_mid)
        d_mid = (x_mid - fn(x_mid, sigma=sigma_mid)) / sigma_mid
        # Denoise to next
        x = x + d_mid * (sigma_down - sigma)
        # Add randomness
        x_next = x + torch.randn_like(x) * sigma_up
        return x_next

    def forward(self, noise, fn, sigmas, num_steps):
        x = sigmas[0] * noise
        # Denoise to sample
        for i in range(num_steps - 1):
            x = self.step(x, fn=fn, sigma=sigmas[i], sigma_next=sigmas[i + 1])
        return x


class DiffusionSampler(nn.Module):

    def __init__(
        self,
        diffusion=None,
        num_steps=None,
        clamp=True,
    ):
        super().__init__()
        self.denoise_fn = diffusion.denoise_fn
        self.sampler = ADPM2Sampler()
        self.sigma_schedule = KarrasSchedule(sigma_min=0.0001, sigma_max=3.0, rho=9.0)
        self.num_steps = num_steps
        self.clamp = clamp

    def forward(self, noise, num_steps=None, **kwargs):
        device = noise.device
        num_steps = default(num_steps, self.num_steps)

        # Compute sigmas using schedule
        sigmas = self.sigma_schedule(num_steps, device)

        # Forward the conditioning kwargs ('embedding', 'features') into denoise_fn
        fn = lambda *a, **ka: self.denoise_fn(*a, **{**ka, **kwargs})  # noqa
        # Sample using sampler
        x = self.sampler(noise, fn=fn, sigmas=sigmas, num_steps=num_steps)
        x = x.clamp(-1.0, 1.0) if self.clamp else x
        return x
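For reference, the deleted module was fully self-contained: KarrasSchedule implements equation 5 of Karras et al., sigma_i = (sigma_max^(1/rho) + i/(N-1) * (sigma_min^(1/rho) - sigma_max^(1/rho)))^rho for i = 0..N-1 (padded with a trailing 0), and ADPM2Sampler integrates those noise levels with a midpoint step plus ancestral re-noising. A minimal smoke test of the deleted classes; ToyNet is a hypothetical stand-in for the real StyleTransformer1d, and sigma_data and the latent shape are assumptions, while everything else is the code above:

import torch
import torch.nn as nn

class ToyNet(nn.Module):
    # Hypothetical stand-in for StyleTransformer1d: same call signature
    # (scaled input, noise embedding, conditioning kwargs). Returning zeros
    # means each step shrinks x through the c_skip connection alone.
    def forward(self, x, c_noise, **kwargs):
        return torch.zeros_like(x)

diffusion = KDiffusion(net=ToyNet(), sigma_distribution=UniformDistribution(),
                       sigma_data=0.2)   # sigma_data = 0.2 is an assumption
sampler = DiffusionSampler(diffusion=diffusion, num_steps=5)
out = sampler(torch.randn(2, 1, 256))    # (batch, 1, 2 * style_dim) is an assumption
print(out.shape)                         # torch.Size([2, 1, 256]), clamped to [-1, 1]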
models.py
CHANGED
@@ -11,20 +11,19 @@ import torch.nn.functional as F
 from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
 from Utils.ASR.models import ASRCNN
 from Utils.JDC.model import JDCNet
-
+
 from Modules.diffusion.modules import StyleTransformer1d
-
+
 from munch import Munch
 import yaml
 from math import pi
 from random import randint
-
+
 import torch
 from einops import rearrange
 from torch import Tensor, nn
 from tqdm import tqdm
-
-# from Modules.diffusion.sampler import *
+
 
 
@@ -623,23 +622,7 @@ def build_model(args, text_aligner, pitch_extractor, bert):
     else:
         raise NotImplementedError
 
-
-        in_channels=1,
-        embedding_max_length=bert.config.max_position_embeddings,
-        embedding_features=bert.config.hidden_size,
-        embedding_mask_proba=args.diffusion.embedding_mask_proba, # Conditional dropout of batch elements,
-        channels=args.style_dim*2,
-        context_features=args.style_dim*2,
-    )
-    # this initialises self.diffusion for AudioDiffusionConditional
-    diffusion.diffusion = KDiffusion(
-        net=diffusion.unet,
-        sigma_distribution=LogNormalDistribution(mean=args.diffusion.dist.mean, std=args.diffusion.dist.std),
-        sigma_data=args.diffusion.dist.sigma_data, # a placeholder, will be changed dynamically when start training diffusion model
-        dynamic_threshold=0.0
-    )
-    diffusion.diffusion.net = transformer
-    diffusion.unet = transformer
+
 
 
     nets = Munch(
@@ -652,7 +635,6 @@ def build_model(args, text_aligner, pitch_extractor, bert):
 
         predictor_encoder=predictor_encoder,
         style_encoder=style_encoder,
-        diffusion=diffusion,
 
         text_aligner=text_aligner,
         pitch_extractor=pitch_extractor
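The removed block was the tail of the AudioDiffusionConditional setup plus the KDiffusion wiring; its opening line is already missing from the hunk, so it is not reconstructed here. For orientation, the preconditioning weights that KDiffusion computed in get_scale_weights reduce, at sigma = sigma_data, to c_skip = 1/2, c_out = sigma_data/sqrt(2), c_in = 1/(sigma_data*sqrt(2)). A quick numeric check, assuming sigma_data = 0.2 (the real value came from args.diffusion.dist.sigma_data):

import torch

sigma_data = 0.2                     # assumed; args.diffusion.dist.sigma_data in the repo
sigmas = torch.tensor([sigma_data])  # evaluate exactly at sigma == sigma_data
c_skip = sigma_data ** 2 / (sigmas ** 2 + sigma_data ** 2)
c_out = sigmas * sigma_data * (sigma_data ** 2 + sigmas ** 2) ** -0.5
c_in = (sigmas ** 2 + sigma_data ** 2) ** -0.5
print(c_skip.item(), c_out.item(), c_in.item())  # 0.5, 0.1414..., 3.5355...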
msinference.py
CHANGED
@@ -160,9 +160,7 @@ for key in model:
     # _load(params[key], model[key])
 _ = [model[key].eval() for key in model]
 
-from Modules.diffusion.sampler import DiffusionSampler
 
-sampler = DiffusionSampler(diffusion=model.diffusion.diffusion)
 
 def inference(text,
               ref_s,
@@ -205,17 +203,10 @@ def inference(text,
     # print('BERTdu', bert_dur.shape, tokens.shape, '\n') # bert what is the 768 per token -> IS USED in sampler
     # BERTdu torch.Size([1, 11, 768]) torch.Size([1, 11])
 
-
-                     embedding=bert_dur,
-                     features=ref_s, # reference from the same speaker as the embedding
-                     num_steps=diffusion_steps).squeeze(1)
-
-
-    s = s_pred[:, 128:]
-    ref = s_pred[:, :128]
+
 
-    ref =
-    s =
+    ref = ref_s[:, :128]
+    s = ref_s[:, 128:]
 
     d = model.predictor.text_encoder(d_en,
                                      s, input_lengths, text_mask)