debug special token
- audiocraft/builders.py +26 -20
- audiocraft/conditioners.py +1 -8
- audiocraft/encodec.py +9 -51
- audiocraft/lm.py +34 -70
- audiocraft/loaders.py +1 -1
- audiocraft/seanet.py +14 -125
- audiocraft/utils/utils.py +11 -136
audiocraft/builders.py
CHANGED
@@ -17,7 +17,7 @@ import torch

 from .encodec import CompressionModel, EncodecModel
 from .lm import LMModel
-from .seanet import SEANetEncoder, SEANetDecoder
+from .seanet import SEANetDecoder
 from .codebooks_patterns import (
     CodebooksPatternProvider,
     DelayedPatternProvider,
@@ -49,34 +49,40 @@ def get_quantizer(quantizer: str, cfg: omegaconf.DictConfig, dimension: int) ->
     return klass(**kwargs)


-def get_encodec_autoencoder(encoder_name: str, cfg: omegaconf.DictConfig):
-    if encoder_name == 'seanet':
-        kwargs = dict_from_config(getattr(cfg, 'seanet'))
-        encoder_override_kwargs = kwargs.pop('encoder')
-        decoder_override_kwargs = kwargs.pop('decoder')
-        encoder_kwargs = {**kwargs, **encoder_override_kwargs}
-        decoder_kwargs = {**kwargs, **decoder_override_kwargs}
-        encoder = SEANetEncoder(**encoder_kwargs)
-        decoder = SEANetDecoder(**decoder_kwargs)
-        return encoder, decoder
-    else:
-        raise KeyError(f"Unexpected compression model {cfg.compression_model}")
+def get_encodec_autoencoder(cfg):
+    kwargs = dict_from_config(getattr(cfg, 'seanet'))
+    _ = kwargs.pop('encoder')
+    decoder_override_kwargs = kwargs.pop('decoder')
+    decoder_kwargs = {**kwargs, **decoder_override_kwargs}
+    decoder = SEANetDecoder(**decoder_kwargs)
+    return decoder


-def get_compression_model(cfg: omegaconf.DictConfig) -> CompressionModel:
+def get_compression_model(cfg):
     """Instantiate a compression model."""
     if cfg.compression_model == 'encodec':
         kwargs = dict_from_config(getattr(cfg, 'encodec'))
-        encoder_name = kwargs.pop('autoencoder')
         quantizer_name = kwargs.pop('quantizer')
-        encoder, decoder = get_encodec_autoencoder(encoder_name, cfg)
-        quantizer = get_quantizer(quantizer_name, cfg, encoder.dimension)
-        frame_rate = kwargs['sample_rate'] // encoder.hop_length
+        decoder = get_encodec_autoencoder(cfg)
+        quantizer = get_quantizer(quantizer_name, cfg, 128)
         renormalize = kwargs.pop('renormalize', False)
         # deprecated params
+        # print(f'{frame_rate=} {encoder.dimension=}') frame_rate=50 encoder.dimension=128
         kwargs.pop('renorm', None)
-        return EncodecModel(encoder, decoder, quantizer,
-                            frame_rate=frame_rate, renormalize=renormalize, **kwargs).to(cfg.device)
+        # print('\n______!____________\n', kwargs, '\n______!____________\n')
+        # ______!____________
+        # {'autoencoder': 'seanet', 'sample_rate': 16000, 'channels': 1, 'causal': False}
+        # ______!____________
+
+        return EncodecModel(decoder=decoder,
+                            quantizer=quantizer,
+                            frame_rate=50,
+                            renormalize=renormalize,
+                            sample_rate=16000,
+                            channels=1,
+                            causal=False
+                            ).to(cfg.device)
     else:
         raise KeyError(f"Unexpected compression model {cfg.compression_model}")
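Note: the hardcoded frame_rate=50 and the quantizer dimension 128 stand in for values that were previously derived from the now-deleted encoder. A minimal sanity check of where they come from, assuming the SEANet defaults visible in seanet.py below (ratios [8, 5, 4, 2], dimension 128) and the 16 kHz sample rate shown in the debug print:

    import numpy as np

    ratios = [8, 5, 4, 2]                  # SEANet default stride ratios
    hop_length = int(np.prod(ratios))      # 320 audio samples per latent frame
    sample_rate = 16000
    frame_rate = sample_rate // hop_length
    assert frame_rate == 50                # matches the hardcoded frame_rate=50
    # dimension=128 is likewise the SEANetDecoder default, matching
    # get_quantizer(quantizer_name, cfg, 128) above.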
audiocraft/conditioners.py
CHANGED
@@ -1,11 +1,7 @@
 from collections import defaultdict
 from dataclasses import dataclass, field
-from itertools import chain
 import logging
-import math
-from pathlib import Path
 import random
-import re
 import typing as tp
 import warnings
 import soundfile
@@ -14,11 +10,8 @@ import torch
 from torch import nn
 from .streaming import StreamingModule

-
-from .quantization import ResidualVectorQuantizer
 from .utils.autocast import TorchAutocast
-
-from .utils.utils import collate, hash_trick, length_to_mask, load_clap_state_dict, warn_once
+


 logger = logging.getLogger(__name__)
audiocraft/encodec.py
CHANGED
@@ -30,14 +30,7 @@ class CompressionModel(ABC, nn.Module):
     with a language model.
     """

-    @abstractmethod
-    def forward(self, x: torch.Tensor) -> qt.QuantizedResult:
-        ...
-
-    @abstractmethod
-    def encode(self, x: torch.Tensor) -> tp.Tuple[torch.Tensor, tp.Optional[torch.Tensor]]:
-        """See `EncodecModel.encode`."""
-        ...
+

     @abstractmethod
     def decode(self, codes: torch.Tensor, scale: tp.Optional[torch.Tensor] = None):
@@ -142,16 +135,15 @@ class EncodecModel(CompressionModel):
     channels: int = 0

     def __init__(self,
-                 encoder: nn.Module,
-                 decoder: nn.Module,
-                 quantizer: qt.BaseQuantizer,
-                 frame_rate: int,
-                 sample_rate: int,
-                 channels: int,
-                 causal: bool = False,
-                 renormalize: bool = False):
+                 decoder=None,
+                 quantizer=None,
+                 frame_rate=None,
+                 sample_rate=None,
+                 channels=None,
+                 causal=False,
+                 renormalize=False):
         super().__init__()
-        self.encoder = encoder
+
         self.decoder = decoder
         self.quantizer = quantizer
         self.frame_rate = frame_rate
@@ -203,40 +195,6 @@ class EncodecModel(CompressionModel):
             x = x * scale.view(-1, 1, 1)
         return x

-    def forward(self, x: torch.Tensor) -> qt.QuantizedResult:
-        assert x.dim() == 3
-        length = x.shape[-1]
-        x, scale = self.preprocess(x)
-
-        emb = self.encoder(x)
-        q_res = self.quantizer(emb, self.frame_rate)
-        out = self.decoder(q_res.x)
-
-        # remove extra padding added by the encoder and decoder
-        assert out.shape[-1] >= length, (out.shape[-1], length)
-        out = out[..., :length]
-
-        q_res.x = self.postprocess(out, scale)
-
-        return q_res
-
-    def encode(self, x: torch.Tensor) -> tp.Tuple[torch.Tensor, tp.Optional[torch.Tensor]]:
-        """Encode the given input tensor to quantized representation along with scale parameter.
-
-        Args:
-            x (torch.Tensor): Float tensor of shape [B, C, T]
-
-        Returns:
-            codes, scale (tuple of torch.Tensor, torch.Tensor): Tuple composed of:
-                codes: a float tensor of shape [B, K, T] with K the number of codebooks used and T the timestep.
-                scale: a float tensor containing the scale for audio renormalization.
-        """
-        assert x.dim() == 3
-        x, scale = self.preprocess(x)
-        emb = self.encoder(x)
-        codes = self.quantizer.encode(emb)
-        return codes, scale
-
     def decode(self, codes: torch.Tensor, scale: tp.Optional[torch.Tensor] = None):
         """Decode the given codes to a reconstructed representation, using the scale to perform
         audio denormalization if needed.
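Note: with forward() and encode() deleted, EncodecModel only supports the token-to-waveform direction. A hypothetical usage sketch (the checkpoint path, the 4-codebook shape, and the exact output length are assumptions, not taken from this diff):

    import torch
    from audiocraft.loaders import load_compression_model

    model = load_compression_model('checkpoint.th', device='cpu')  # placeholder path
    codes = torch.zeros(1, 4, 50, dtype=torch.long)  # [B, K, T]: 4 codebooks, 1 s at frame_rate=50
    with torch.no_grad():
        audio = model.decode(codes)   # scale=None, since renormalize is disabled
    print(audio.shape)                # expected on the order of [1, 1, 16000] at 16 kHz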
audiocraft/lm.py
CHANGED
@@ -14,16 +14,14 @@ import warnings
 import einops
 from num2words import num2words
 import spacy
-from transformers import
+from transformers import T5EncoderModel, T5Tokenizer  # type: ignore
 import torch
 import torch.nn.functional as F
 from torch.nn.utils.rnn import pad_sequence
 from audiocraft.streaming import StreamingModule
 from audiocraft.transformer import create_sin_embedding
-from audiocraft.utils.audio_utils import convert_audio
 from audiocraft.utils.autocast import TorchAutocast
-from audiocraft.utils.
-from audiocraft.utils.utils import collate, hash_trick, length_to_mask, load_clap_state_dict, warn_once
+from audiocraft.utils.utils import collate, length_to_mask
 from audiocraft.transformer import StreamingTransformer, create_norm_fn
 from dataclasses import dataclass
 from functools import partial
@@ -297,13 +295,7 @@ class BaseConditioner(nn.Module):
         self.output_dim = output_dim
         self.output_proj = nn.Linear(dim, output_dim)

-    def tokenize(self, *args, **kwargs) -> tp.Any:
-        """Should be any part of the processing that will lead to a synchronization
-        point, e.g. BPE tokenization with transfer to the GPU.
-
-        The returned value will be saved and return later when calling forward().
-        """
-        raise NotImplementedError()
+

     def forward(self, inputs: tp.Any) -> ConditionType:
         """Gets input that should be used as conditioning (e.g, genre, description or a waveform).
@@ -530,34 +522,6 @@ class ConditioningProvider(nn.Module):
     def has_wav_condition(self):
         return len(self.wav_conditions) > 0

-    def tokenize(self, inputs: tp.List[ConditioningAttributes]) -> tp.Dict[str, tp.Any]:
-        """Match attributes/wavs with existing conditioners in self, and compute tokenize them accordingly.
-        This should be called before starting any real GPU work to avoid synchronization points.
-        This will return a dict matching conditioner names to their arbitrary tokenized representations.
-
-        Args:
-            inputs (list[ConditioningAttributes]): List of ConditioningAttributes objects containing
-                text and wav conditions.
-        """
-        assert all([isinstance(x, ConditioningAttributes) for x in inputs]), (
-            "Got unexpected types input for conditioner! should be tp.List[ConditioningAttributes]",
-            f" but types were {set([type(x) for x in inputs])}"
-        )
-
-        output = {}
-        text = self._collate_text(inputs)
-        wavs = self._collate_wavs(inputs)
-        joint_embeds = self._collate_joint_embeds(inputs)
-
-        assert set(text.keys() | wavs.keys() | joint_embeds.keys()).issubset(set(self.conditioners.keys())), (
-            f"Got an unexpected attribute! Expected {self.conditioners.keys()}, ",
-            f"got {text.keys(), wavs.keys(), joint_embeds.keys()}"
-        )
-
-        for attribute, batch in chain(text.items(), wavs.items(), joint_embeds.items()):
-            output[attribute] = self.conditioners[attribute].tokenize(batch)
-        return output
-
     def forward(self, tokenized: tp.Dict[str, tp.Any]) -> tp.Dict[str, ConditionType]:
         """Compute pairs of `(embedding, mask)` using the configured conditioners and the tokenized representations.
         The output is for example:
@@ -780,6 +744,7 @@ class ConditionFuser(StreamingModule):
             raise ValueError(f"unknown op ({op})")

         if self.cross_attention_pos_emb and cross_attention_output is not None:
+            print('SIN EMBED')
             positions = torch.arange(
                 cross_attention_output.shape[1],
                 device=cross_attention_output.device
@@ -925,7 +890,7 @@ class LMModel(StreamingModule):

         self.condition_provider = condition_provider
         self.fuser = fuser
-        self.card = card
+        self.card = card  # 2048 ?
         embed_dim = self.card + 1
         self.n_q = n_q
         self.dim = dim
@@ -1030,6 +995,7 @@ class LMModel(StreamingModule):
         # remove the prefix from the model outputs
         if len(self.fuser.fuse2cond['prepend']) > 0:
             logits = logits[:, :, -S:]
+            print('PRESFIX')

         return logits  # [B, K, S, card]

@@ -1067,6 +1033,8 @@ class LMModel(StreamingModule):
         B, K, T = codes.shape
         codes = codes.contiguous()
         # map codes [B, K, T] into pattern sequence [B, K, S] using special_token_id for masked tokens
+        # what is the T is it 2048 ?
+        # and then what is pattern -> another function?
         pattern = self.pattern_provider.get_pattern(T)
         sequence_codes, sequence_indexes, sequence_mask = pattern.build_pattern_sequence(
             codes, self.special_token_id, keep_only_valid_steps=keep_only_valid_steps,
@@ -1118,35 +1086,33 @@ class LMModel(StreamingModule):
         model = self if self._fsdp is None else self._fsdp
         two_step_cfg = self.two_step_cfg if two_step_cfg is None else two_step_cfg
         if two_step_cfg and cfg_conditions != {}:
-            assert isinstance(cfg_conditions, tuple), type(cfg_conditions)
-            condition_tensors, null_condition_tensors = cfg_conditions
-            cond_logits = model(sequence, conditions=[], condition_tensors=condition_tensors)
-            state = self.get_streaming_state()
-            self.set_streaming_state(unconditional_state)
-            uncond_logits = model(sequence, conditions=[], condition_tensors=null_condition_tensors)
-            unconditional_state.update(self.get_streaming_state())
-            self.set_streaming_state(state)
-            logits = uncond_logits + (cond_logits - uncond_logits) * self.cfg_coef
+            print('\nNOT HERE\n')
         else:
+            print('C')
             assert isinstance(cfg_conditions, dict)
             condition_tensors = cfg_conditions
             if condition_tensors:
+                # print('\nD\n')
                 # Preparing for CFG, predicting both conditional and unconditional logits.
                 sequence = torch.cat([sequence, sequence], dim=0)
             all_logits = model(
                 sequence,
                 conditions=[], condition_tensors=condition_tensors)
             if condition_tensors:
-                cond_logits, uncond_logits = all_logits.split(B, dim=0)
-                logits = uncond_logits + (cond_logits - uncond_logits) * cfg_coef
+                cond_logits, uncond_logits = all_logits.split(B, dim=0)  # torch.Size([2, 4, 1, 2048])
+                # logits = uncond_logits + (cond_logits - uncond_logits) * cfg_coef
+                # logits = 3 * cond_logits - 2.4 * uncond_logits
+                logits = 2 * cond_logits - 1.4 * uncond_logits
             else:
-                logits = all_logits
+                print('\nF!\n')
+

         logits = logits.permute(0, 1, 3, 2)  # [B, K, card, T]
         logits = logits[..., -1]  # [B x K x card]

         # Apply softmax for sampling if temp > 0. Else, do greedy sampling to avoid zero division error.
         if use_sampling and temp > 0.0:
+            # print(f'\nR {temp=} {top_p=} {top_k=}\n') -------------> R temp=1.0 top_p=0.0 top_k=250
             probs = torch.softmax(logits / temp, dim=-1)
             if top_p > 0.0:
                 next_token = utils.sample_top_p(probs, p=top_p)
@@ -1155,7 +1121,9 @@ class LMModel(StreamingModule):
             else:
                 next_token = utils.multinomial(probs, num_samples=1)
         else:
-            next_token = torch.argmax(logits, dim=-1, keepdim=True)
+            #
+            print('\nNeverHere\n')
+

         return next_token

@@ -1249,9 +1217,9 @@ class LMModel(StreamingModule):
         # this token is used as default value for codes that are not generated yet
         unknown_token = -1

-        # we generate codes up to the max_gen_len that will be mapped to the pattern sequence
+
         gen_codes = torch.full((B, K, max_gen_len), unknown_token, dtype=torch.long, device=device)
-        # filling the gen_codes with the prompt if needed
+
        gen_codes[..., :start_offset] = prompt
         # create the gen_sequence with proper interleaving from the pattern: [B, K, S]
         gen_sequence, indexes, mask = pattern.build_pattern_sequence(gen_codes, self.special_token_id)
@@ -1280,9 +1248,17 @@ class LMModel(StreamingModule):
             # ensure the tokens that should be masked are properly set to special_token_id
             # as the model never output special_token_id
             valid_mask = mask[..., offset:offset+1].expand(B, -1, -1)
-            next_token[~valid_mask] = self.special_token_id
+
+            # next_token[~valid_mask] = self.special_token_id
+
+            # print(f'{unconditional_state=} \n
+            # print('Set All to Special')
+            # next_token[:] = self.special_token_id
+
+
+
             # ensure we don't overwrite prompt tokens, we only write over unknown tokens
-            # (then mask tokens should be left as is as well, which is correct)
+
             gen_sequence[..., offset:offset+1] = torch.where(
                 gen_sequence[..., offset:offset+1] == unknown_token,
                 next_token, gen_sequence[..., offset:offset+1]
@@ -1292,23 +1268,11 @@ class LMModel(StreamingModule):
                 callback(1 + offset - start_offset_sequence, gen_sequence_len - start_offset_sequence)
         unconditional_state.clear()

-        # ensure sequence has been entirely filled
-        assert not (gen_sequence == unknown_token).any()
-        # ensure gen_sequence pattern and mask are matching
-        # which means the gen_sequence is valid according to the pattern
-        assert (
-            gen_sequence == torch.where(mask[None, ...].expand(B, -1, -1), gen_sequence, self.special_token_id)
-        ).all()
-        # get back the codes, trimming the prompt if needed and cutting potentially incomplete timesteps
         out_codes, out_indexes, out_mask = pattern.revert_pattern_sequence(gen_sequence, special_token=unknown_token)

-        # sanity checks over the returned codes and corresponding masks
-        assert (out_codes[..., :max_gen_len] != unknown_token).all()
-        assert (out_mask[..., :max_gen_len] == 1).all()
-
         out_start_offset = start_offset if remove_prompts else 0
         out_codes = out_codes[..., out_start_offset:max_gen_len]

         # ensure the returned codes are all valid
-        assert (out_codes >= 0).all() and (out_codes <= self.card).all()
+        # assert (out_codes >= 0).all() and (out_codes <= self.card).all()
         return out_codes
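Note: the replaced guidance line changes the classifier-free guidance weights. The removed form keeps the coefficients summing to 1; the hardcoded 2 * cond_logits - 1.4 * uncond_logits keeps the guidance direction but its weights sum to 0.6, which also rescales the logits (a temperature-like side effect). A small sketch of the algebra, with cond/uncond standing in for the split logits:

    import torch

    def cfg_logits(cond: torch.Tensor, uncond: torch.Tensor, cfg_coef: float) -> torch.Tensor:
        # standard classifier-free guidance, as in the commented-out line
        return uncond + (cond - uncond) * cfg_coef  # == cfg_coef*cond + (1 - cfg_coef)*uncond

    cond = torch.randn(1, 4, 1, 2048)
    uncond = torch.randn(1, 4, 1, 2048)
    standard = cfg_logits(cond, uncond, cfg_coef=2.0)  # 2*cond - 1*uncond, weights sum to 1.0
    debug = 2 * cond - 1.4 * uncond                    # hardcoded variant, weights sum to 0.6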
audiocraft/loaders.py
CHANGED
@@ -79,7 +79,7 @@ def load_compression_model(file_or_url_or_id: tp.Union[Path, str], device='cpu',
     cfg = OmegaConf.create(pkg['xp.cfg'])
     cfg.device = str(device)
     model = builders.get_compression_model(cfg)
-    model.load_state_dict(pkg['best_state'])
+    model.load_state_dict(pkg['best_state'], strict=False)  # ckpt contains uninstantiated encoder
     model.eval()
     return model
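Note: strict=False is what makes the decoder-only model loadable from the full checkpoint: the encoder.* weights in best_state no longer have a matching submodule, and non-strict loading ignores them instead of raising a RuntimeError. A self-contained sketch of that behaviour (the two-module setup is hypothetical):

    import torch
    from torch import nn

    full = nn.ModuleDict({'encoder': nn.Linear(4, 4), 'decoder': nn.Linear(4, 4)})
    decoder_only = nn.ModuleDict({'decoder': nn.Linear(4, 4)})

    state = full.state_dict()  # plays the role of pkg['best_state']
    result = decoder_only.load_state_dict(state, strict=False)
    print(result.unexpected_keys)  # ['encoder.weight', 'encoder.bias'] -> silently skipped
    print(result.missing_keys)     # [] -> every decoder weight was found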
audiocraft/seanet.py
CHANGED
@@ -60,136 +60,25 @@ class SEANetResnetBlock(nn.Module):
         return self.shortcut(x) + self.block(x)


-class SEANetEncoder(nn.Module):
-    """SEANet encoder.
-
-    Args:
-        channels (int): Audio channels.
-        dimension (int): Intermediate representation dimension.
-        n_filters (int): Base width for the model.
-        n_residual_layers (int): nb of residual layers.
-        ratios (Sequence[int]): kernel size and stride ratios. The encoder uses downsampling ratios instead of
-            upsampling ratios, hence it will use the ratios in the reverse order to the ones specified here
-            that must match the decoder order. We use the decoder order as some models may only employ the decoder.
-        activation (str): Activation function.
-        activation_params (dict): Parameters to provide to the activation function.
-        norm (str): Normalization method.
-        norm_params (dict): Parameters to provide to the underlying normalization used along with the convolution.
-        kernel_size (int): Kernel size for the initial convolution.
-        last_kernel_size (int): Kernel size for the initial convolution.
-        residual_kernel_size (int): Kernel size for the residual layers.
-        dilation_base (int): How much to increase the dilation with each layer.
-        causal (bool): Whether to use fully causal convolution.
-        pad_mode (str): Padding mode for the convolutions.
-        true_skip (bool): Whether to use true skip connection or a simple
-            (streamable) convolution as the skip connection in the residual network blocks.
-        compress (int): Reduced dimensionality in residual branches (from Demucs v3).
-        lstm (int): Number of LSTM layers at the end of the encoder.
-        disable_norm_outer_blocks (int): Number of blocks for which we don't apply norm.
-            For the encoder, it corresponds to the N first blocks.
-    """
-    def __init__(self, channels: int = 1, dimension: int = 128, n_filters: int = 32, n_residual_layers: int = 3,
-                 ratios: tp.List[int] = [8, 5, 4, 2], activation: str = 'ELU', activation_params: dict = {'alpha': 1.0},
-                 norm: str = 'none', norm_params: tp.Dict[str, tp.Any] = {}, kernel_size: int = 7,
-                 last_kernel_size: int = 7, residual_kernel_size: int = 3, dilation_base: int = 2, causal: bool = False,
-                 pad_mode: str = 'reflect', true_skip: bool = True, compress: int = 2, lstm: int = 0,
-                 disable_norm_outer_blocks: int = 0):
-        super().__init__()
-        self.channels = channels
-        self.dimension = dimension
-        self.n_filters = n_filters
-        self.ratios = list(reversed(ratios))
-        del ratios
-        self.n_residual_layers = n_residual_layers
-        self.hop_length = np.prod(self.ratios)
-        self.n_blocks = len(self.ratios) + 2  # first and last conv + residual blocks
-        self.disable_norm_outer_blocks = disable_norm_outer_blocks
-        assert self.disable_norm_outer_blocks >= 0 and self.disable_norm_outer_blocks <= self.n_blocks, \
-            "Number of blocks for which to disable norm is invalid." \
-            "It should be lower or equal to the actual number of blocks in the network and greater or equal to 0."
-
-        act = getattr(nn, activation)
-        mult = 1
-        model: tp.List[nn.Module] = [
-            StreamableConv1d(channels, mult * n_filters, kernel_size,
-                             norm='none' if self.disable_norm_outer_blocks >= 1 else norm,
-                             norm_kwargs=norm_params, causal=causal, pad_mode=pad_mode)
-        ]
-        # Downsample to raw audio scale
-        for i, ratio in enumerate(self.ratios):
-            block_norm = 'none' if self.disable_norm_outer_blocks >= i + 2 else norm
-            # Add residual layers
-            for j in range(n_residual_layers):
-                model += [
-                    SEANetResnetBlock(mult * n_filters, kernel_sizes=[residual_kernel_size, 1],
-                                      dilations=[dilation_base ** j, 1],
-                                      norm=block_norm, norm_params=norm_params,
-                                      activation=activation, activation_params=activation_params,
-                                      causal=causal, pad_mode=pad_mode, compress=compress, true_skip=true_skip)]
-
-            # Add downsampling layers
-            model += [
-                act(**activation_params),
-                StreamableConv1d(mult * n_filters, mult * n_filters * 2,
-                                 kernel_size=ratio * 2, stride=ratio,
-                                 norm=block_norm, norm_kwargs=norm_params,
-                                 causal=causal, pad_mode=pad_mode),
-            ]
-            mult *= 2
-
-        if lstm:
-            model += [StreamableLSTM(mult * n_filters, num_layers=lstm)]
-
-        model += [
-            act(**activation_params),
-            StreamableConv1d(mult * n_filters, dimension, last_kernel_size,
-                             norm='none' if self.disable_norm_outer_blocks == self.n_blocks else norm,
-                             norm_kwargs=norm_params, causal=causal, pad_mode=pad_mode)
-        ]
-
-        self.model = nn.Sequential(*model)
-
-    def forward(self, x):
-        return self.model(x)
+


 class SEANetDecoder(nn.Module):
-    """SEANet decoder.
-
-    Args:
-        channels (int): Audio channels.
-        dimension (int): Intermediate representation dimension.
-        n_filters (int): Base width for the model.
-        n_residual_layers (int): nb of residual layers.
-        ratios (Sequence[int]): kernel size and stride ratios.
-        activation (str): Activation function.
-        activation_params (dict): Parameters to provide to the activation function.
-        final_activation (str): Final activation function after all convolutions.
-        final_activation_params (dict): Parameters to provide to the activation function.
-        norm (str): Normalization method.
-        norm_params (dict): Parameters to provide to the underlying normalization used along with the convolution.
-        kernel_size (int): Kernel size for the initial convolution.
-        last_kernel_size (int): Kernel size for the initial convolution.
-        residual_kernel_size (int): Kernel size for the residual layers.
-        dilation_base (int): How much to increase the dilation with each layer.
-        causal (bool): Whether to use fully causal convolution.
-        pad_mode (str): Padding mode for the convolutions.
-        true_skip (bool): Whether to use true skip connection or a simple.
-            (streamable) convolution as the skip connection in the residual network blocks.
-        compress (int): Reduced dimensionality in residual branches (from Demucs v3).
-        lstm (int): Number of LSTM layers at the end of the encoder.
-        disable_norm_outer_blocks (int): Number of blocks for which we don't apply norm.
-            For the decoder, it corresponds to the N last blocks.
-        trim_right_ratio (float): Ratio for trimming at the right of the transposed convolution under the causal setup.
-            If equal to 1.0, it means that all the trimming is done at the right.
-    """
-    def __init__(self, channels: int = 1, dimension: int = 128, n_filters: int = 32, n_residual_layers: int = 3,
-                 ratios: tp.List[int] = [8, 5, 4, 2], activation: str = 'ELU', activation_params: dict = {'alpha': 1.0},
-                 final_activation: tp.Optional[str] = None, final_activation_params: tp.Optional[dict] = None,
-                 norm: str = 'none', norm_params: tp.Dict[str, tp.Any] = {}, kernel_size: int = 7,
-                 last_kernel_size: int = 7, residual_kernel_size: int = 3, dilation_base: int = 2, causal: bool = False,
-                 pad_mode: str = 'reflect', true_skip: bool = True, compress: int = 2, lstm: int = 0,
-                 disable_norm_outer_blocks: int = 0, trim_right_ratio: float = 1.0):
+
+    def __init__(self, channels: int = 1,
+                 dimension: int = 128, n_filters: int = 32, n_residual_layers: int = 3,
+                 ratios: tp.List[int] = [8, 5, 4, 2], activation: str = 'ELU',
+                 activation_params: dict = {'alpha': 1.0},
+                 final_activation: tp.Optional[str] = None,
+                 final_activation_params: tp.Optional[dict] = None,
+                 norm: str = 'none', norm_params: tp.Dict[str, tp.Any] = {},
+                 kernel_size: int = 7,
+                 last_kernel_size: int = 7, residual_kernel_size: int = 3,
+                 dilation_base: int = 2, causal: bool = False,
+                 pad_mode: str = 'reflect', true_skip: bool = True,
+                 compress: int = 2, lstm: int = 0,
+                 disable_norm_outer_blocks: int = 0,
+                 trim_right_ratio: float = 1.0):
         super().__init__()
         self.dimension = dimension
         self.channels = channels
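Note: every keyword in the reflowed SEANetDecoder signature can be driven from the cfg.seanet config through the merge in builders.get_encodec_autoencoder: in {**kwargs, **decoder_override_kwargs} the decoder sub-config wins on conflicts. A tiny sketch with made-up values:

    kwargs = {'dimension': 128, 'n_filters': 32, 'ratios': [8, 5, 4, 2]}   # shared 'seanet' section
    decoder_override_kwargs = {'trim_right_ratio': 1.0, 'n_filters': 64}   # 'decoder' sub-section
    decoder_kwargs = {**kwargs, **decoder_override_kwargs}                 # later keys win
    assert decoder_kwargs['n_filters'] == 64
    # -> SEANetDecoder(dimension=128, n_filters=64, ratios=[8, 5, 4, 2], trim_right_ratio=1.0)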
audiocraft/utils/utils.py
CHANGED
@@ -4,7 +4,7 @@
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.

-from concurrent.futures import ProcessPoolExecutor
+
 from contextlib import contextmanager
 from functools import wraps, lru_cache
 import hashlib
@@ -103,6 +103,9 @@ def multinomial(input: torch.Tensor, num_samples: int, replacement=False, *, gen
     input_ = input.reshape(-1, input.shape[-1])
     output_ = torch.multinomial(input_, num_samples=num_samples, replacement=replacement, generator=generator)
     output = output_.reshape(*list(input.shape[:-1]), -1)
+
+    # print('MULTINOmial', input.shape, output.shape)  # MULTINOmial torch.Size([1, 4, 2048]) torch.Size([1, 4, 1])
+    # output = input[..., 0:1]
     return output

@@ -115,61 +118,18 @@ def sample_top_k(probs: torch.Tensor, k: int) -> torch.Tensor:
     Returns:
         torch.Tensor: Sampled tokens.
     """
-    top_k_value, _ = torch.topk(probs, k, dim=-1)
-    min_value_top_k = top_k_value[..., [-1]]
-    probs *= (probs >= min_value_top_k).float()
-    probs.div_(probs.sum(dim=-1, keepdim=True))
+    top_k_value, i250 = torch.topk(probs, k, dim=-1)  # probs: [1, 4, 2048]
+    min_value_top_k = top_k_value[..., [-1]]  #
+    probs *= (probs >= min_value_top_k).float()  # multiply all being > of min_topk with 1 thus zeroing others
+    probs.div_(probs.sum(dim=-1, keepdim=True))  # why normalize by the sum ? oh in order to choose mult
     next_token = multinomial(probs, num_samples=1)
+    # so instead of chooose multinomial what happens if we take all 250 topk tokens
+    # probs.shape=torch.Size([1, 4, 2048]) <, print(next_token,f'{probs.shape=}', 'h')  # 1,4,1 next token is 4tok
+    # next_token = i250
     return next_token


-def sample_top_p(probs: torch.Tensor, p: float) -> torch.Tensor:
-    """Sample next token from top P probabilities along the last dimension of the input probs tensor.
-
-    Args:
-        probs (torch.Tensor): Input probabilities with token candidates on the last dimension.
-        p (int): The p in “top-p”.
-    Returns:
-        torch.Tensor: Sampled tokens.
-    """
-    probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
-    probs_sum = torch.cumsum(probs_sort, dim=-1)
-    mask = probs_sum - probs_sort > p
-    probs_sort *= (~mask).float()
-    probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
-    next_token = multinomial(probs_sort, num_samples=1)
-    next_token = torch.gather(probs_idx, -1, next_token)
-    return next_token
-
-
-class DummyPoolExecutor:
-    """Dummy pool executor to use when we actually have only 1 worker.
-    (e.g. instead of ProcessPoolExecutor).
-    """
-    class DummyResult:
-        def __init__(self, func, *args, **kwargs):
-            self.func = func
-            self.args = args
-            self.kwargs = kwargs
-
-        def result(self):
-            return self.func(*self.args, **self.kwargs)
-
-    def __init__(self, workers, mp_context=None):
-        pass
-
-    def submit(self, func, *args, **kwargs):
-        return DummyPoolExecutor.DummyResult(func, *args, **kwargs)
-
-    def __enter__(self):
-        return self
-
-    def __exit__(self, exc_type, exc_value, exc_tb):
-        return
-
-
-def get_pool_executor(num_workers: int, mp_context=None):
-    return ProcessPoolExecutor(num_workers, mp_context) if num_workers > 1 else DummyPoolExecutor(1)


 def length_to_mask(lengths: torch.Tensor, max_len: tp.Optional[int] = None) -> torch.Tensor:
@@ -188,42 +148,6 @@ def length_to_mask(lengths: torch.Tensor, max_len: tp.Optional[int] = None) -> t
     return torch.arange(final_length, device=lengths.device)[None, :] < lengths[:, None]


-def hash_trick(word: str, vocab_size: int) -> int:
-    """Hash trick to pair each word with an index
-
-    Args:
-        word (str): word we wish to convert to an index
-        vocab_size (int): size of the vocabulary
-    Returns:
-        int: index of the word in the embedding LUT
-    """
-    hash = int(hashlib.sha256(word.encode("utf-8")).hexdigest(), 16)
-    return hash % vocab_size
-
-
-def with_rank_rng(base_seed: int = 1234):
-    """Decorator for a function so that the function will use a Random Number Generator
-    whose state depend on the GPU rank. The original RNG state is restored upon returning.
-
-    Args:
-        base_seed (int): Random seed.
-    """
-    def _decorator(fun: tp.Callable):
-        @wraps(fun)
-        def _decorated(*args, **kwargs):
-            state = torch.get_rng_state()
-            seed = base_seed ^ flashy.distrib.rank()
-            torch.manual_seed(seed)
-            logger.debug('Rank dependent seed set to %d', seed)
-            try:
-                return fun(*args, **kwargs)
-            finally:
-                torch.set_rng_state(state)
-                logger.debug('RNG state restored.')
-        return _decorated
-    return _decorator
-
-
 def collate(tensors: tp.List[torch.Tensor], dim: int = 0) -> tp.Tuple[torch.Tensor, torch.Tensor]:
     """Get a list of tensors and collate them to a single tensor. according to the following logic:
     - `dim` specifies the time dimension which will be stacked and padded.
@@ -247,52 +171,3 @@ def collate(tensors: tp.List[torch.Tensor], dim: int = 0) -> tp.Tuple[torch.Tens
     return padded_tensors, lens


-# TODO: Move to flashy?
-def copy_state(state: tp.Any, device: tp.Union[torch.device, str] = 'cpu',
-               dtype: tp.Optional[torch.dtype] = None) -> tp.Any:
-    if isinstance(state, torch.Tensor):
-        if dtype is None or not state.is_floating_point():
-            dtype = state.dtype
-        return state.detach().to(device=device, dtype=dtype, copy=True)
-    elif isinstance(state, dict):
-        return {k: copy_state(v, device, dtype) for k, v in state.items()}
-    elif isinstance(state, list):
-        return [copy_state(v, device, dtype) for v in state]
-
-
-# TODO: Move to flashy?
-@contextmanager
-def swap_state(model, state, **kwargs):
-    old_state = copy_state(model.state_dict())
-    model.load_state_dict(state, **kwargs)
-    try:
-        yield
-    finally:
-        model.load_state_dict(old_state)
-
-
-@lru_cache(None)
-def warn_once(logger, msg):
-    """Warn about a given message only once."""
-    logger.warning(msg)
-
-
-def is_jsonable(x: tp.Any):
-    """Check if an object can be serialized into a json:"""
-    try:
-        json.dumps(x)
-        return True
-    except (TypeError, OverflowError):
-        return False
-
-
-def load_clap_state_dict(clap_model, path: tp.Union[str, Path]):
-    """Wrapper around state dict loading of CLAP model
-    addressing compatibility issues between CLAP and AudioCraft
-    HuggingFace transformer version.
-    See: https://github.com/LAION-AI/CLAP/issues/118
-    """
-    from clap_module.factory import load_state_dict  # type: ignore
-    pkg = load_state_dict(path)
-    pkg.pop('text_branch.embeddings.position_ids', None)
-    clap_model.model.load_state_dict(pkg)
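Note: the question left in sample_top_k ("why normalize by the sum?") has a short answer: torch.multinomial only needs non-negative relative weights, so the renormalization mainly keeps probs a proper distribution after everything below the k-th value has been zeroed. A worked example with k=2:

    import torch

    probs = torch.tensor([0.50, 0.30, 0.15, 0.05])
    top_k_value, _ = torch.topk(probs, k=2, dim=-1)      # values [0.50, 0.30]
    min_value_top_k = top_k_value[..., [-1]]             # 0.30, the smallest surviving prob
    probs = probs * (probs >= min_value_top_k).float()   # [0.50, 0.30, 0.00, 0.00]
    probs = probs / probs.sum(dim=-1, keepdim=True)      # [0.625, 0.375, 0.0, 0.0]
    next_token = torch.multinomial(probs, num_samples=1) # samples index 0 or 1 only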