del diffusion [unused]
- audiocraft/audiogen.py +0 -129
- audiocraft/builders.py +2 -24
- audiocraft/diffusion_schedule.py +0 -272
- audiocraft/loaders.py +0 -24
- audiocraft/rope.py +0 -125
- audiocraft/unet.py +0 -214
audiocraft/audiogen.py
DELETED
```diff
@@ -1,129 +0,0 @@
-import typing as tp
-import torch
-from audiocraft.loaders import load_compression_model, load_lm_model
-import typing as tp
-import omegaconf
-import torch
-import numpy as np
-from .lm import LMModel
-from .conditioners import ConditioningAttributes
-from .utils.autocast import TorchAutocast
-
-
-
-def _shift(x):
-    n = x.shape[2]
-    i = np.random.randint(.24 * n, max(1, .74 * n))  # high should be above >= 0 TBD do we have very short segments
-    x = torch.roll(x, i, dims=2)
-    return x
-
-
-class AudioGen():
-    """Base generative model with convenient generation API.
-
-    Args:
-        name (str)
-        compression_model (CompressionModel): Encodec with Seanet Decoder
-        lm
-        max_duration (float, optional): As is using top250 token draw() we can gen xN sequences
-    """
-    def __init__(self,
-                 name,
-                 compression_model,
-                 lm,
-                 max_duration=None):
-        self.name = name
-        self.compression_model = compression_model
-        self.lm = lm
-        self.cfg: tp.Optional[omegaconf.DictConfig] = None
-        # Just to be safe, let's put everything in eval mode.
-        self.compression_model.eval()
-        self.lm.eval()
-
-        if hasattr(lm, 'cfg'):
-            cfg = lm.cfg
-            assert isinstance(cfg, omegaconf.DictConfig)
-            self.cfg = cfg
-
-        if max_duration is None:
-            if self.cfg is not None:
-                max_duration = lm.cfg.dataset.segment_duration  # type: ignore
-            else:
-                raise ValueError("You must provide max_duration when building directly your GenModel")
-        assert max_duration is not None
-
-        self.max_duration: float = max_duration
-        self.duration = self.max_duration
-        self.device = next(iter(lm.parameters())).device
-        self.generation_params = {}
-
-        if self.device.type == 'cpu':
-            self.autocast = TorchAutocast(enabled=False)
-        else:
-            self.autocast = TorchAutocast(
-                enabled=True,
-                device_type=self.device.type,
-                dtype=torch.float16)
-
-    @property
-    def frame_rate(self) -> float:
-        """Roughly the number of AR steps per seconds."""
-        return self.compression_model.frame_rate
-
-    @property
-    def sample_rate(self) -> int:
-        """Sample rate of the generated audio."""
-        return self.compression_model.sample_rate
-
-
-
-
-
-    def generate(self, descriptions):
-        attributes = [
-            ConditioningAttributes(text={'description': d}) for d in descriptions]
-        tokens = self._generate_tokens(attributes)
-        print(f'\n{tokens.shape=}\n{tokens=} FINAL 5 AUD')
-        return self.generate_audio(tokens)
-
-    def _generate_tokens(self, attributes):
-
-        total_gen_len = int(self.duration * self.frame_rate)
-
-        if self.duration <= self.max_duration:
-            # generate by sampling from LM, simple case.
-
-            with self.autocast:
-                gen_tokens = self.lm.generate(conditions=attributes, max_gen_len=total_gen_len)
-        else:
-            print('<>Long gen ?<>')
-        # print(f'{gen_tokens.shape=}')  # [5,4,35]
-        # FLATTEN BATCH AS EXTRA SEQUENCE (BATCH IS VIRTUAL JUST MULTINOMIAL SAMPLING OF N_DRAW TOKENS)
-        gen_tokens = gen_tokens.transpose(0, 1).reshape(4, -1)[None, :, :]
-        for _ in range(3):
-            print(gen_tokens.shape)
-            gen_tokens = _shift(gen_tokens)
-        return gen_tokens
-
-    def generate_audio(self, gen_tokens: torch.Tensor) -> torch.Tensor:
-        """Generate Audio from tokens."""
-        assert gen_tokens.dim() == 3
-        with torch.no_grad():
-            gen_audio = self.compression_model.decode(gen_tokens, None)
-        return gen_audio
-
-
-def get_pretrained(name='facebook/audiogen-medium',
-                   device=None):
-    """Return pretrained model, we provide a single model for now:
-    - facebook/audiogen-medium (1.5B), text to sound,
-    # see: https://huggingface.co/facebook/audiogen-medium
-    """
-    compression_model = load_compression_model(name, device=device)
-    lm = load_lm_model(name, device=device)
-    assert 'self_wav' not in lm.condition_provider.conditioners, \
-        "AudioGen do not support waveform conditioning for now"
-    return AudioGen(name, compression_model, lm)
-
-
-
```
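Note: for reference, a minimal usage sketch of the entry point deleted above, reconstructed from the removed code (`get_pretrained` followed by `generate`). It only applies to checkouts that still ship `audiocraft/audiogen.py`, and the prompt string is purely illustrative.

```python
import torch
from audiocraft.audiogen import get_pretrained  # module removed by this commit

# Build the pretrained text-to-sound model; device string chosen here, not by the library.
model = get_pretrained('facebook/audiogen-medium',
                       device='cuda' if torch.cuda.is_available() else 'cpu')

# generate() takes a list of text descriptions and returns decoded audio [batch, channels, samples].
wav = model.generate(['dog barking in the rain'])
print(wav.shape, model.sample_rate)
```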
audiocraft/builders.py
CHANGED
```diff
@@ -16,10 +16,10 @@ from .conditioners import (
     ConditioningProvider,
     T5Conditioner,
 )
-
+
 from .vq import ResidualVectorQuantizer
 
-
+
 
 def dict_from_config(cfg):
     dct = omegaconf.OmegaConf.to_container(cfg, resolve=True)
@@ -155,25 +155,3 @@ def get_codebooks_pattern_provider(n_q, cfg):
 
     klass = pattern_providers[name]
     return klass(n_q, **kwargs)
-
-
-
-
-
-def get_diffusion_model(cfg: omegaconf.DictConfig):
-    # TODO Find a way to infer the channels from dset
-    channels = cfg.channels
-    num_steps = cfg.schedule.num_steps
-    return DiffusionUnet(
-        chin=channels, num_steps=num_steps, **cfg.diffusion_unet)
-
-
-def get_processor(cfg, sample_rate: int = 24000):
-    sample_processor = SampleProcessor()
-    if cfg.use:
-        kw = dict(cfg)
-        kw.pop('use')
-        kw.pop('name')
-        if cfg.name == "multi_band_processor":
-            sample_processor = MultiBandProcessor(sample_rate=sample_rate, **kw)
-    return sample_processor
```
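Note: a sketch of the config shape the removed `get_diffusion_model` expected, inferred from its body (`cfg.channels`, `cfg.schedule.num_steps`, `cfg.diffusion_unet`). The field values below are made up for illustration; they are not taken from the repository configs.

```python
import omegaconf

# Hypothetical hydra-style config matching the fields the removed builder read.
cfg = omegaconf.OmegaConf.create({
    'channels': 1,
    'schedule': {'num_steps': 1000},
    'diffusion_unet': {'hidden': 24, 'depth': 3, 'transformer': True},
})

# The removed helper expanded to:
#   DiffusionUnet(chin=cfg.channels, num_steps=cfg.schedule.num_steps, **cfg.diffusion_unet)
print(cfg.channels, cfg.schedule.num_steps, dict(cfg.diffusion_unet))
```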
audiocraft/diffusion_schedule.py
DELETED
```diff
@@ -1,272 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-
-"""
-Functions for Noise Schedule, defines diffusion process, reverse process and data processor.
-"""
-
-from collections import namedtuple
-import random
-import typing as tp
-import julius
-import torch
-
-TrainingItem = namedtuple("TrainingItem", "noisy noise step")
-
-
-def betas_from_alpha_bar(alpha_bar):
-    alphas = torch.cat([torch.Tensor([alpha_bar[0]]), alpha_bar[1:]/alpha_bar[:-1]])
-    return 1 - alphas
-
-
-class SampleProcessor(torch.nn.Module):
-    def project_sample(self, x: torch.Tensor):
-        """Project the original sample to the 'space' where the diffusion will happen."""
-        return x
-
-    def return_sample(self, z: torch.Tensor):
-        """Project back from diffusion space to the actual sample space."""
-        return z
-
-
-class MultiBandProcessor(SampleProcessor):
-    """
-    MultiBand sample processor. The input audio is splitted across
-    frequency bands evenly distributed in mel-scale.
-
-    Each band will be rescaled to match the power distribution
-    of Gaussian noise in that band, using online metrics
-    computed on the first few samples.
-
-    Args:
-        n_bands (int): Number of mel-bands to split the signal over.
-        sample_rate (int): Sample rate of the audio.
-        num_samples (int): Number of samples to use to fit the rescaling
-            for each band. The processor won't be stable
-            until it has seen that many samples.
-        power_std (float or list/tensor): The rescaling factor computed to match the
-            power of Gaussian noise in each band is taken to
-            that power, i.e. `1.` means full correction of the energy
-            in each band, and values less than `1` means only partial
-            correction. Can be used to balance the relative importance
-            of low vs. high freq in typical audio signals.
-    """
-    def __init__(self, n_bands: int = 8, sample_rate: float = 24_000,
-                 num_samples: int = 10_000, power_std: tp.Union[float, tp.List[float], torch.Tensor] = 1.):
-        super().__init__()
-        self.n_bands = n_bands
-        self.split_bands = julius.SplitBands(sample_rate, n_bands=n_bands)
-        self.num_samples = num_samples
-        self.power_std = power_std
-        if isinstance(power_std, list):
-            assert len(power_std) == n_bands
-            power_std = torch.tensor(power_std)
-        self.register_buffer('counts', torch.zeros(1))
-        self.register_buffer('sum_x', torch.zeros(n_bands))
-        self.register_buffer('sum_x2', torch.zeros(n_bands))
-        self.register_buffer('sum_target_x2', torch.zeros(n_bands))
-        self.counts: torch.Tensor
-        self.sum_x: torch.Tensor
-        self.sum_x2: torch.Tensor
-        self.sum_target_x2: torch.Tensor
-
-    @property
-    def mean(self):
-        mean = self.sum_x / self.counts
-        return mean
-
-    @property
-    def std(self):
-        std = (self.sum_x2 / self.counts - self.mean**2).clamp(min=0).sqrt()
-        return std
-
-    @property
-    def target_std(self):
-        target_std = self.sum_target_x2 / self.counts
-        return target_std
-
-    def project_sample(self, x: torch.Tensor):
-        assert x.dim() == 3
-        bands = self.split_bands(x)
-        if self.counts.item() < self.num_samples:
-            ref_bands = self.split_bands(torch.randn_like(x))
-            self.counts += len(x)
-            self.sum_x += bands.mean(dim=(2, 3)).sum(dim=1)
-            self.sum_x2 += bands.pow(2).mean(dim=(2, 3)).sum(dim=1)
-            self.sum_target_x2 += ref_bands.pow(2).mean(dim=(2, 3)).sum(dim=1)
-        rescale = (self.target_std / self.std.clamp(min=1e-12)) ** self.power_std  # same output size
-        bands = (bands - self.mean.view(-1, 1, 1, 1)) * rescale.view(-1, 1, 1, 1)
-        return bands.sum(dim=0)
-
-    def return_sample(self, x: torch.Tensor):
-        assert x.dim() == 3
-        bands = self.split_bands(x)
-        rescale = (self.std / self.target_std) ** self.power_std
-        bands = bands * rescale.view(-1, 1, 1, 1) + self.mean.view(-1, 1, 1, 1)
-        return bands.sum(dim=0)
-
-
-class NoiseSchedule:
-    """Noise schedule for diffusion.
-
-    Args:
-        beta_t0 (float): Variance of the first diffusion step.
-        beta_t1 (float): Variance of the last diffusion step.
-        beta_exp (float): Power schedule exponent
-        num_steps (int): Number of diffusion step.
-        variance (str): choice of the sigma value for the denoising eq. Choices: "beta" or "beta_tilde"
-        clip (float): clipping value for the denoising steps
-        rescale (float): rescaling value to avoid vanishing signals unused by default (i.e 1)
-        repartition (str): shape of the schedule only power schedule is supported
-        sample_processor (SampleProcessor): Module that normalize data to match better the gaussian distribution
-        noise_scale (float): Scaling factor for the noise
-    """
-    def __init__(self, beta_t0: float = 1e-4, beta_t1: float = 0.02, num_steps: int = 1000, variance: str = 'beta',
-                 clip: float = 5., rescale: float = 1., device='cuda', beta_exp: float = 1,
-                 repartition: str = "power", alpha_sigmoid: dict = {}, n_bands: tp.Optional[int] = None,
-                 sample_processor: SampleProcessor = SampleProcessor(), noise_scale: float = 1.0, **kwargs):
-
-        self.beta_t0 = beta_t0
-        self.beta_t1 = beta_t1
-        self.variance = variance
-        self.num_steps = num_steps
-        self.clip = clip
-        self.sample_processor = sample_processor
-        self.rescale = rescale
-        self.n_bands = n_bands
-        self.noise_scale = noise_scale
-        assert n_bands is None
-        if repartition == "power":
-            self.betas = torch.linspace(beta_t0 ** (1 / beta_exp), beta_t1 ** (1 / beta_exp), num_steps,
-                                        device=device, dtype=torch.float) ** beta_exp
-        else:
-            raise RuntimeError('Not implemented')
-        self.rng = random.Random(1234)
-
-    def get_beta(self, step: tp.Union[int, torch.Tensor]):
-        if self.n_bands is None:
-            return self.betas[step]
-        else:
-            return self.betas[:, step]  # [n_bands, len(step)]
-
-    def get_initial_noise(self, x: torch.Tensor):
-        if self.n_bands is None:
-            return torch.randn_like(x)
-        return torch.randn((x.size(0), self.n_bands, x.size(2)))
-
-    def get_alpha_bar(self, step: tp.Optional[tp.Union[int, torch.Tensor]] = None) -> torch.Tensor:
-        """Return 'alpha_bar', either for a given step, or as a tensor with its value for each step."""
-        if step is None:
-            return (1 - self.betas).cumprod(dim=-1)  # works for simgle and multi bands
-        if type(step) is int:
-            return (1 - self.betas[:step + 1]).prod()
-        else:
-            return (1 - self.betas).cumprod(dim=0)[step].view(-1, 1, 1)
-
-    def get_training_item(self, x: torch.Tensor, tensor_step: bool = False) -> TrainingItem:
-        """Create a noisy data item for diffusion model training:
-
-        Args:
-            x (torch.Tensor): clean audio data torch.tensor(bs, 1, T)
-            tensor_step (bool): If tensor_step = false, only one step t is sample,
-                the whole batch is diffused to the same step and t is int.
-                If tensor_step = true, t is a tensor of size (x.size(0),)
-                every element of the batch is diffused to a independently sampled.
-        """
-        step: tp.Union[int, torch.Tensor]
-        if tensor_step:
-            bs = x.size(0)
-            step = torch.randint(0, self.num_steps, size=(bs,), device=x.device)
-        else:
-            step = self.rng.randrange(self.num_steps)
-        alpha_bar = self.get_alpha_bar(step)  # [batch_size, n_bands, 1]
-
-        x = self.sample_processor.project_sample(x)
-        noise = torch.randn_like(x)
-        noisy = (alpha_bar.sqrt() / self.rescale) * x + (1 - alpha_bar).sqrt() * noise * self.noise_scale
-        return TrainingItem(noisy, noise, step)
-
-    def generate(self, model: torch.nn.Module, initial: tp.Optional[torch.Tensor] = None,
-                 condition: tp.Optional[torch.Tensor] = None, return_list: bool = False):
-        """Full ddpm reverse process.
-
-        Args:
-            model (nn.Module): Diffusion model.
-            initial (tensor): Initial Noise.
-            condition (tensor): Input conditionning Tensor (e.g. encodec compressed representation).
-            return_list (bool): Whether to return the whole process or only the sampled point.
-        """
-        alpha_bar = self.get_alpha_bar(step=self.num_steps - 1)
-        current = initial
-        iterates = [initial]
-        for step in range(self.num_steps)[::-1]:
-            with torch.no_grad():
-                estimate = model(current, step, condition=condition).sample
-            alpha = 1 - self.betas[step]
-            previous = (current - (1 - alpha) / (1 - alpha_bar).sqrt() * estimate) / alpha.sqrt()
-            previous_alpha_bar = self.get_alpha_bar(step=step - 1)
-            if step == 0:
-                sigma2 = 0
-            elif self.variance == 'beta':
-                sigma2 = 1 - alpha
-            elif self.variance == 'beta_tilde':
-                sigma2 = (1 - previous_alpha_bar) / (1 - alpha_bar) * (1 - alpha)
-            elif self.variance == 'none':
-                sigma2 = 0
-            else:
-                raise ValueError(f'Invalid variance type {self.variance}')
-
-            if sigma2 > 0:
-                previous += sigma2**0.5 * torch.randn_like(previous) * self.noise_scale
-            if self.clip:
-                previous = previous.clamp(-self.clip, self.clip)
-            current = previous
-            alpha_bar = previous_alpha_bar
-            if step == 0:
-                previous *= self.rescale
-            if return_list:
-                iterates.append(previous.cpu())
-
-        if return_list:
-            return iterates
-        else:
-            return self.sample_processor.return_sample(previous)
-
-    def generate_subsampled(self, model: torch.nn.Module, initial: torch.Tensor, step_list: tp.Optional[list] = None,
-                            condition: tp.Optional[torch.Tensor] = None, return_list: bool = False):
-        """Reverse process that only goes through Markov chain states in step_list."""
-        if step_list is None:
-            step_list = list(range(1000))[::-50] + [0]
-        alpha_bar = self.get_alpha_bar(step=self.num_steps - 1)
-        alpha_bars_subsampled = (1 - self.betas).cumprod(dim=0)[list(reversed(step_list))].cpu()
-        betas_subsampled = betas_from_alpha_bar(alpha_bars_subsampled)
-        current = initial * self.noise_scale
-        iterates = [current]
-        for idx, step in enumerate(step_list[:-1]):
-            with torch.no_grad():
-                estimate = model(current, step, condition=condition).sample * self.noise_scale
-            alpha = 1 - betas_subsampled[-1 - idx]
-            previous = (current - (1 - alpha) / (1 - alpha_bar).sqrt() * estimate) / alpha.sqrt()
-            previous_alpha_bar = self.get_alpha_bar(step_list[idx + 1])
-            if step == step_list[-2]:
-                sigma2 = 0
-                previous_alpha_bar = torch.tensor(1.0)
-            else:
-                sigma2 = (1 - previous_alpha_bar) / (1 - alpha_bar) * (1 - alpha)
-            if sigma2 > 0:
-                previous += sigma2**0.5 * torch.randn_like(previous) * self.noise_scale
-            if self.clip:
-                previous = previous.clamp(-self.clip, self.clip)
-            current = previous
-            alpha_bar = previous_alpha_bar
-            if step == 0:
-                previous *= self.rescale
-            if return_list:
-                iterates.append(previous.cpu())
-        if return_list:
-            return iterates
-        else:
-            return self.sample_processor.return_sample(previous)
```
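Note: the core of the removed `NoiseSchedule.get_training_item` is the standard DDPM forward-noising step, `noisy = sqrt(alpha_bar_t) * x + sqrt(1 - alpha_bar_t) * eps` with `alpha_bar_t = prod_{s<=t}(1 - beta_s)`. A self-contained sketch using the file's default schedule values (`beta_t0=1e-4`, `beta_t1=0.02`, 1000 steps, `rescale=1`, `noise_scale=1`); the waveform batch here is random placeholder data.

```python
import torch

num_steps = 1000
betas = torch.linspace(1e-4, 0.02, num_steps)     # linear schedule, power exponent 1
alpha_bar = (1 - betas).cumprod(dim=-1)           # cumulative product over steps

x = torch.randn(2, 1, 24000)                      # stand-in for a clean audio batch [bs, 1, T]
step = 500
eps = torch.randn_like(x)

# Diffuse the whole batch to the same step, as in the int-step branch of get_training_item.
noisy = alpha_bar[step].sqrt() * x + (1 - alpha_bar[step]).sqrt() * eps
print(noisy.shape, float(alpha_bar[step]))
```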
audiocraft/loaders.py
CHANGED
```diff
@@ -1,33 +1,9 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-
-"""
-Utility functions to load from the checkpoints.
-Each checkpoint is a torch.saved dict with the following keys:
-- 'xp.cfg': the hydra config as dumped during training. This should be used
-    to rebuild the object using the audiocraft.models.builders functions,
-- 'model_best_state': a readily loadable best state for the model, including
-    the conditioner. The model obtained from `xp.cfg` should be compatible
-    with this state dict. In the case of a LM, the encodec model would not be
-    bundled along but instead provided separately.
-
-Those functions also support loading from a remote location with the Torch Hub API.
-They also support overriding some parameters, in particular the device and dtype
-of the returned model.
-"""
-
 from pathlib import Path
 from huggingface_hub import hf_hub_download
 import typing as tp
 import os
-
 from omegaconf import OmegaConf, DictConfig
 import torch
-
-import audiocraft
 from . import builders
 from .encodec import EncodecModel
```
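Note: the removed module docstring documented the checkpoint layout (a torch-saved dict with an `'xp.cfg'` hydra config and a `'model_best_state'` state dict). A toy round-trip illustrating that layout; the field names follow the docstring, but the values and the file name are stand-ins, not a real checkpoint.

```python
import torch
from omegaconf import OmegaConf

# Write and read back a toy checkpoint with the layout the docstring described.
pkg = {
    'xp.cfg': {'sample_rate': 16000, 'channels': 1},        # stand-in for the dumped hydra config
    'model_best_state': {'weight': torch.zeros(4, 4)},      # stand-in for the best model state dict
}
torch.save(pkg, 'toy_checkpoint.bin')

loaded = torch.load('toy_checkpoint.bin', map_location='cpu')
cfg = OmegaConf.create(loaded['xp.cfg'])                    # config used to rebuild the model via builders
print(cfg.sample_rate, list(loaded['model_best_state'].keys()))
```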
audiocraft/rope.py
DELETED
```diff
@@ -1,125 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-
-import typing as tp
-
-from torch import nn
-import torch
-
-
-class XPos(nn.Module):
-    """Length-extrapolatable positional embedding (xPos) from [Sun et al 2022](https://arxiv.org/abs/2212.10554v1).
-    This applies an exponential decay to the RoPE rotation matrix.
-
-    Args:
-        dim (int): Embedding dimension.
-        smoothing (float): Smoothing factor applied to the decay rates.
-        base_scale (int): Base decay rate, given in terms of scaling time.
-        device (torch.device, optional): Device on which to initialize the module.
-        dtype (torch.dtype): dtype to use to generate the embedding.
-    """
-    def __init__(self, dim: int, smoothing: float = 0.4, base_scale: int = 512,
-                 device=None, dtype: torch.dtype = torch.float32):
-        super().__init__()
-        assert dim % 2 == 0
-        assert dtype in [torch.float64, torch.float32]
-        self.dtype = dtype
-        self.base_scale = base_scale
-
-        half_dim = dim // 2
-        adim = torch.arange(half_dim, device=device, dtype=dtype)
-        decay_rates = (adim / half_dim + smoothing) / (1.0 + smoothing)
-        self.register_buffer("decay_rates", decay_rates)
-        self.decay: tp.Optional[torch.Tensor] = None
-
-    def get_decay(self, start: int, end: int):
-        """Create complex decay tensor, cache values for fast computation."""
-        if self.decay is None or end > self.decay.shape[0]:
-            assert isinstance(self.decay_rates, torch.Tensor)  # Satisfy type checker.
-            idx = torch.arange(end, device=self.decay_rates.device, dtype=self.dtype)
-            power = idx / self.base_scale
-            scale = self.decay_rates ** power.unsqueeze(-1)
-            self.decay = torch.polar(scale, torch.zeros_like(scale))
-        return self.decay[start:end]  # [T, C/2]
-
-
-class RotaryEmbedding(nn.Module):
-    """Rotary positional embedding (RoPE) from [Su et al 2022](https://arxiv.org/abs/2104.09864).
-
-    Args:
-        dim (int): Embedding dimension (twice the number of frequencies).
-        max_period (float): Maximum period of the rotation frequencies.
-        xpos (bool): Use xPos, applies an exponential decay to rotation matrix.
-        scale (float): Scale of positional embedding, set to 0 to deactivate.
-        device (torch.device, optional): Device on which to initialize the module.
-        dtype (torch.dtype): dtype to use to generate the embedding.
-    """
-    def __init__(self, dim: int, max_period: float = 10000.0, xpos: bool = False,
-                 scale: float = 1.0, device=None, dtype: torch.dtype = torch.float32):
-        super().__init__()
-        assert dim % 2 == 0
-        self.scale = scale
-        assert dtype in [torch.float64, torch.float32]
-        self.dtype = dtype
-
-        adim = torch.arange(0, dim, 2, device=device, dtype=dtype)[: (dim // 2)]
-        frequencies = 1.0 / (max_period ** (adim / dim))
-        self.register_buffer("frequencies", frequencies)
-        self.rotation: tp.Optional[torch.Tensor] = None
-
-        self.xpos = XPos(dim, device=device, dtype=dtype) if xpos else None
-
-    def get_rotation(self, start: int, end: int):
-        """Create complex rotation tensor, cache values for fast computation."""
-        if self.rotation is None or end > self.rotation.shape[0]:
-            assert isinstance(self.frequencies, torch.Tensor)  # Satisfy type checker.
-            idx = torch.arange(end, device=self.frequencies.device, dtype=self.dtype)
-            angles = torch.outer(idx, self.frequencies)
-            self.rotation = torch.polar(torch.ones_like(angles), angles)
-        return self.rotation[start:end]
-
-    def rotate(self, x: torch.Tensor, start: int = 0, time_dim: int = 1, invert_decay: bool = False):
-        """Apply rope rotation to query or key tensor."""
-        T = x.shape[time_dim]
-        target_shape = [1] * x.dim()
-        target_shape[time_dim] = T
-        target_shape[-1] = -1
-        rotation = self.get_rotation(start, start + T).view(target_shape)
-
-        if self.xpos:
-            decay = self.xpos.get_decay(start, start + T).view(target_shape)
-        else:
-            decay = 1.0
-
-        if invert_decay:
-            decay = decay ** -1
-
-        x_complex = torch.view_as_complex(x.to(self.dtype).reshape(*x.shape[:-1], -1, 2))
-        scaled_rotation = (rotation * decay) * self.scale + (1.0 - self.scale)
-        x_out = torch.view_as_real(x_complex * scaled_rotation).view_as(x)
-
-        return x_out.type_as(x)
-
-    def rotate_qk(self, query: torch.Tensor, key: torch.Tensor, start: int = 0, time_dim: int = 1):
-        """ Apply rope rotation to both query and key tensors.
-        Supports streaming mode, in which query and key are not expected to have the same shape.
-        In streaming mode, key will be of length [P + C] with P the cached past timesteps, but
-        query will be [C] (typically C == 1).
-
-        Args:
-            query (torch.Tensor): Query to rotate.
-            key (torch.Tensor): Key to rotate.
-            start (int): Start index of the sequence for time offset.
-            time_dim (int): which dimension represent the time steps.
-        """
-        query_timesteps = query.shape[time_dim]
-        key_timesteps = key.shape[time_dim]
-        streaming_offset = key_timesteps - query_timesteps
-
-        query_out = self.rotate(query, start + streaming_offset, time_dim)
-        key_out = self.rotate(key, start, time_dim, invert_decay=True)
-
-        return query_out, key_out
```
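Note: a stand-alone sketch of the rotation performed by the deleted `RotaryEmbedding.rotate`: pair up adjacent feature dimensions as complex numbers and multiply by unit-modulus `e^{i*theta}` with per-frequency angles. The helper name, shapes, and inputs are illustrative; the xPos decay and the `scale` blending are omitted for brevity.

```python
import torch

def rope_rotate(x: torch.Tensor, max_period: float = 10000.0, start: int = 0) -> torch.Tensor:
    """x: [B, T, D] with D even; returns the RoPE-rotated tensor of the same shape."""
    B, T, D = x.shape
    adim = torch.arange(0, D, 2, dtype=torch.float32)[: D // 2]
    frequencies = 1.0 / (max_period ** (adim / D))               # one frequency per feature pair
    positions = torch.arange(start, start + T, dtype=torch.float32)
    angles = torch.outer(positions, frequencies)                 # [T, D/2]
    rotation = torch.polar(torch.ones_like(angles), angles)      # complex numbers on the unit circle
    x_complex = torch.view_as_complex(x.float().reshape(B, T, -1, 2))
    rotated = x_complex * rotation.view(1, T, -1)                # complex multiply = 2D rotation per pair
    return torch.view_as_real(rotated).reshape(B, T, D).type_as(x)

q = torch.randn(2, 10, 64)
print(rope_rotate(q).shape)   # torch.Size([2, 10, 64])
```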
audiocraft/unet.py
DELETED
```diff
@@ -1,214 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-
-"""
-Pytorch Unet Module used for diffusion.
-"""
-
-from dataclasses import dataclass
-import typing as tp
-
-import torch
-from torch import nn
-from torch.nn import functional as F
-from .transformer import StreamingTransformer, create_sin_embedding
-
-
-@dataclass
-class Output:
-    sample: torch.Tensor
-
-
-def get_model(cfg, channels: int, side: int, num_steps: int):
-    if cfg.model == 'unet':
-        return DiffusionUnet(
-            chin=channels, num_steps=num_steps, **cfg.diffusion_unet)
-    else:
-        raise RuntimeError('Not Implemented')
-
-
-class ResBlock(nn.Module):
-    def __init__(self, channels: int, kernel: int = 3, norm_groups: int = 4,
-                 dilation: int = 1, activation: tp.Type[nn.Module] = nn.ReLU,
-                 dropout: float = 0.):
-        super().__init__()
-        stride = 1
-        padding = dilation * (kernel - stride) // 2
-        Conv = nn.Conv1d
-        Drop = nn.Dropout1d
-        self.norm1 = nn.GroupNorm(norm_groups, channels)
-        self.conv1 = Conv(channels, channels, kernel, 1, padding, dilation=dilation)
-        self.activation1 = activation()
-        self.dropout1 = Drop(dropout)
-
-        self.norm2 = nn.GroupNorm(norm_groups, channels)
-        self.conv2 = Conv(channels, channels, kernel, 1, padding, dilation=dilation)
-        self.activation2 = activation()
-        self.dropout2 = Drop(dropout)
-
-    def forward(self, x):
-        h = self.dropout1(self.conv1(self.activation1(self.norm1(x))))
-        h = self.dropout2(self.conv2(self.activation2(self.norm2(h))))
-        return x + h
-
-
-class DecoderLayer(nn.Module):
-    def __init__(self, chin: int, chout: int, kernel: int = 4, stride: int = 2,
-                 norm_groups: int = 4, res_blocks: int = 1, activation: tp.Type[nn.Module] = nn.ReLU,
-                 dropout: float = 0.):
-        super().__init__()
-        padding = (kernel - stride) // 2
-        self.res_blocks = nn.Sequential(
-            *[ResBlock(chin, norm_groups=norm_groups, dilation=2**idx, dropout=dropout)
-              for idx in range(res_blocks)])
-        self.norm = nn.GroupNorm(norm_groups, chin)
-        ConvTr = nn.ConvTranspose1d
-        self.convtr = ConvTr(chin, chout, kernel, stride, padding, bias=False)
-        self.activation = activation()
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x = self.res_blocks(x)
-        x = self.norm(x)
-        x = self.activation(x)
-        x = self.convtr(x)
-        return x
-
-
-class EncoderLayer(nn.Module):
-    def __init__(self, chin: int, chout: int, kernel: int = 4, stride: int = 2,
-                 norm_groups: int = 4, res_blocks: int = 1, activation: tp.Type[nn.Module] = nn.ReLU,
-                 dropout: float = 0.):
-        super().__init__()
-        padding = (kernel - stride) // 2
-        Conv = nn.Conv1d
-        self.conv = Conv(chin, chout, kernel, stride, padding, bias=False)
-        self.norm = nn.GroupNorm(norm_groups, chout)
-        self.activation = activation()
-        self.res_blocks = nn.Sequential(
-            *[ResBlock(chout, norm_groups=norm_groups, dilation=2**idx, dropout=dropout)
-              for idx in range(res_blocks)])
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        B, C, T = x.shape
-        stride, = self.conv.stride
-        pad = (stride - (T % stride)) % stride
-        x = F.pad(x, (0, pad))
-
-        x = self.conv(x)
-        x = self.norm(x)
-        x = self.activation(x)
-        x = self.res_blocks(x)
-        return x
-
-
-class BLSTM(nn.Module):
-    """BiLSTM with same hidden units as input dim.
-    """
-    def __init__(self, dim, layers=2):
-        super().__init__()
-        self.lstm = nn.LSTM(bidirectional=True, num_layers=layers, hidden_size=dim, input_size=dim)
-        self.linear = nn.Linear(2 * dim, dim)
-
-    def forward(self, x):
-        x = x.permute(2, 0, 1)
-        x = self.lstm(x)[0]
-        x = self.linear(x)
-        x = x.permute(1, 2, 0)
-        return x
-
-
-class DiffusionUnet(nn.Module):
-    def __init__(self, chin: int = 3, hidden: int = 24, depth: int = 3, growth: float = 2.,
-                 max_channels: int = 10_000, num_steps: int = 1000, emb_all_layers=False, cross_attention: bool = False,
-                 bilstm: bool = False, transformer: bool = False,
-                 codec_dim: tp.Optional[int] = None, **kwargs):
-        super().__init__()
-        self.encoders = nn.ModuleList()
-        self.decoders = nn.ModuleList()
-        self.embeddings: tp.Optional[nn.ModuleList] = None
-        self.embedding = nn.Embedding(num_steps, hidden)
-        if emb_all_layers:
-            self.embeddings = nn.ModuleList()
-        self.condition_embedding: tp.Optional[nn.Module] = None
-        for d in range(depth):
-            encoder = EncoderLayer(chin, hidden, **kwargs)
-            decoder = DecoderLayer(hidden, chin, **kwargs)
-            self.encoders.append(encoder)
-            self.decoders.insert(0, decoder)
-            if emb_all_layers and d > 0:
-                assert self.embeddings is not None
-                self.embeddings.append(nn.Embedding(num_steps, hidden))
-            chin = hidden
-            hidden = min(int(chin * growth), max_channels)
-        self.bilstm: tp.Optional[nn.Module]
-        if bilstm:
-            self.bilstm = BLSTM(chin)
-        else:
-            self.bilstm = None
-        self.use_transformer = transformer
-        self.cross_attention = False
-        if transformer:
-            self.cross_attention = cross_attention
-            self.transformer = StreamingTransformer(chin, 8, 6, bias_ff=False, bias_attn=False,
-                                                    cross_attention=cross_attention)
-
-        self.use_codec = False
-        if codec_dim is not None:
-            self.conv_codec = nn.Conv1d(codec_dim, chin, 1)
-            self.use_codec = True
-
-    def forward(self, x: torch.Tensor, step: tp.Union[int, torch.Tensor], condition: tp.Optional[torch.Tensor] = None):
-        skips = []
-        bs = x.size(0)
-        z = x
-        view_args = [1]
-        if type(step) is torch.Tensor:
-            step_tensor = step
-        else:
-            step_tensor = torch.tensor([step], device=x.device, dtype=torch.long).expand(bs)
-
-        for idx, encoder in enumerate(self.encoders):
-            z = encoder(z)
-            if idx == 0:
-                z = z + self.embedding(step_tensor).view(bs, -1, *view_args).expand_as(z)
-            elif self.embeddings is not None:
-                z = z + self.embeddings[idx - 1](step_tensor).view(bs, -1, *view_args).expand_as(z)
-
-            skips.append(z)
-
-        if self.use_codec:  # insert condition in the bottleneck
-            assert condition is not None, "Model defined for conditionnal generation"
-            condition_emb = self.conv_codec(condition)  # reshape to the bottleneck dim
-            assert condition_emb.size(-1) <= 2 * z.size(-1), \
-                f"You are downsampling the conditionning with factor >=2 : {condition_emb.size(-1)=} and {z.size(-1)=}"
-            if not self.cross_attention:
-
-                condition_emb = torch.nn.functional.interpolate(condition_emb, z.size(-1))
-                assert z.size() == condition_emb.size()
-                z += condition_emb
-                cross_attention_src = None
-            else:
-                cross_attention_src = condition_emb.permute(0, 2, 1)  # B, T, C
-                B, T, C = cross_attention_src.shape
-                positions = torch.arange(T, device=x.device).view(1, -1, 1)
-                pos_emb = create_sin_embedding(positions, C, max_period=10_000, dtype=cross_attention_src.dtype)
-                cross_attention_src = cross_attention_src + pos_emb
-        if self.use_transformer:
-            z = self.transformer(z.permute(0, 2, 1), cross_attention_src=cross_attention_src).permute(0, 2, 1)
-        else:
-            if self.bilstm is None:
-                z = torch.zeros_like(z)
-            else:
-                z = self.bilstm(z)
-
-        for decoder in self.decoders:
-            s = skips.pop(-1)
-            z = z[:, :, :s.shape[2]]
-            z = z + s
-            z = decoder(z)
-
-        z = z[:, :, :x.shape[2]]
-        return Output(z)
```
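Note: a toy sketch of the shape bookkeeping in the deleted `DiffusionUnet.forward`: each encoder pads the time axis to a multiple of its stride, each decoder crops back to the matching skip length before adding it, and the output is cropped to the input length. This is a simplified stand-in (no step embedding, normalization, residual blocks, or transformer bottleneck), not the removed model.

```python
import torch
from torch import nn
from torch.nn import functional as F

class TinyUnet(nn.Module):
    def __init__(self, chin=1, hidden=8, depth=3, kernel=4, stride=2):
        super().__init__()
        self.encoders, self.decoders = nn.ModuleList(), nn.ModuleList()
        pad = (kernel - stride) // 2
        ch = chin
        for _ in range(depth):
            self.encoders.append(nn.Conv1d(ch, hidden, kernel, stride, pad))
            # decoders are prepended so they mirror the encoders in reverse order
            self.decoders.insert(0, nn.ConvTranspose1d(hidden, ch, kernel, stride, pad))
            ch, hidden = hidden, hidden * 2
        self.stride = stride

    def forward(self, x):
        skips, z = [], x
        for enc in self.encoders:
            T = z.shape[-1]
            z = F.pad(z, (0, (self.stride - T % self.stride) % self.stride))  # pad to stride multiple
            z = enc(z)
            skips.append(z)
        for dec in self.decoders:
            s = skips.pop()
            z = dec(z[:, :, :s.shape[-1]] + s)        # crop to skip length, add, upsample
        return z[:, :, :x.shape[-1]]                  # crop back to the input length

print(TinyUnet()(torch.randn(2, 1, 1001)).shape)      # torch.Size([2, 1, 1001])
```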