distortion on barging DEBUG

Browse files

Files changed (4) hide show

audiocraft/lm.py +4 -6
audiocraft/seanet.py +1 -0
audiocraft/transformer.py +7 -36
demo.py +21 -41

audiocraft/lm.py CHANGED Viewed

@@ -1,12 +1,10 @@
-from dataclasses import dataclass, field
-from itertools import chain
 import logging
 import math
-import re
 import typing as tp
 import torch
 import torch.nn.functional as F
-from audiocraft.transformer import StreamingTransformer, create_norm_fn
 from dataclasses import dataclass
 from functools import partial
 from torch import nn
@@ -173,7 +171,7 @@ class LMModel(nn.Module):
         super().__init__()
         self.cfg_coef = cfg_coef
-        self.n_draw = 1
         self.condition_provider = condition_provider
         self.fuser = fuser
         self.card = card  # 2048 ?
@@ -207,7 +205,7 @@ class LMModel(nn.Module):
             norm_first=norm_first, **kwargs)
         self.out_norm: tp.Optional[nn.Module] = None
         if norm_first:
-            self.out_norm = create_norm_fn(norm, dim)
         self.linears = nn.ModuleList([nn.Linear(dim, self.card, bias=bias_proj) for _ in range(n_q)])
         self._init_weights(weight_init, depthwise_init, zero_bias_init)
         self._fsdp: tp.Optional[nn.Module]

+from dataclasses import dataclass
 import logging
 import math
 import typing as tp
 import torch
 import torch.nn.functional as F
+from audiocraft.transformer import StreamingTransformer
 from dataclasses import dataclass
 from functools import partial
 from torch import nn
         super().__init__()
         self.cfg_coef = cfg_coef
+        self.n_draw = 3
         self.condition_provider = condition_provider
         self.fuser = fuser
         self.card = card  # 2048 ?
             norm_first=norm_first, **kwargs)
         self.out_norm: tp.Optional[nn.Module] = None
         if norm_first:
+            self.out_norm = nn.LayerNorm(dim, eps=1e-5)
         self.linears = nn.ModuleList([nn.Linear(dim, self.card, bias=bias_proj) for _ in range(n_q)])
         self._init_weights(weight_init, depthwise_init, zero_bias_init)
         self._fsdp: tp.Optional[nn.Module]

audiocraft/seanet.py CHANGED Viewed

@@ -102,6 +102,7 @@ class SEANetDecoder(nn.Module):
         ]
         if lstm:
             model += [StreamableLSTM(mult * n_filters, num_layers=lstm)]
         # Upsample to raw audio scale

         ]
         if lstm:
+            print('\n\n\n\nLSTM IN SEANET\n\n\n\n')
             model += [StreamableLSTM(mult * n_filters, num_layers=lstm)]
         # Upsample to raw audio scale

audiocraft/transformer.py CHANGED Viewed

@@ -21,24 +21,6 @@ def _get_attention_time_dimension(memory_efficient: bool) -> int:
-def create_norm_fn(norm_type: str, dim: int, **kwargs) -> nn.Module:
-    """Create normalization module for transformer encoder layer.
-    Args:
-        norm_type (str): Normalization method.
-        dim (int): Dimension of the normalized layer.
-        **kwargs (dict): Additional parameters for normalization layer.
-    Returns:
-        nn.Module: Normalization module.
-    """
-    if norm_type == 'layer_norm':
-        return nn.LayerNorm(dim, eps=1e-5, **kwargs)
-    else:
-        raise ValueError(f"Unknown norm type: {norm_type}")
 def create_sin_embedding(positions: torch.Tensor, dim: int, max_period: float = 10000,
                          dtype: torch.dtype = torch.float32) -> torch.Tensor:
     """Create sinusoidal positional embedding, with shape `[B, T, C]`.
@@ -105,7 +87,7 @@ class StreamingMultiheadAttention(nn.Module):
         self.v_history = None  # clean up IN LM after finishing GENERATION - Each 1...47 mha has different kv history
         self.memory_efficient = memory_efficient
-        self.attention_as_float32 = attention_as_float32
         self.cross_attention = cross_attention
@@ -227,20 +209,9 @@ class StreamingMultiheadAttention(nn.Module):
                 # KV COMPLETION ONLY ON SELF ATTENTION
-                #======================================================
-                # so the previous layer passes you here the k,v having concatenated all previous
-                #
-                # also return those 2 for the next transformer layer
-                #
-                # also clean up after ending the transformer? NOOOOOOOOOOOOO is goes along tokens
-                #
-                # also why completekv does not grow longer during the 47 transformers but changes sum
-                # k, v = self._complete_kv(k, v)
-                # print(k.sum(), v.sum(), k.shape, v.shape,'ATTNext')
-            print(f'{self.attention_as_float32=}')
             if self.memory_efficient:
                 # print('EVER IN MEMORY EFFICIENT A')
@@ -319,14 +290,14 @@ class StreamingTransformerLayer(nn.Module): #nn.TransformerEncoderLayer):
             self.dropout_cross = nn.Dropout(dropout)
             self.norm_cross = nn.LayerNorm(d_model, eps=1e-5, **factory_kwargs)
-        self.norm1 = create_norm_fn(norm, d_model, **factory_kwargs)  # type: ignore
-        self.norm2 = create_norm_fn(norm, d_model, **factory_kwargs)  # type: ignore
     def forward(self,
                 src,
                 cross_attention_src=None):  # txtcond
-        '''T layer'''
         x = src
@@ -412,7 +383,7 @@ class StreamingTransformer(nn.Module):
         for j, lay in enumerate(self.layers):
-            print(f'_________________________{j}___________________')
             x = lay(x, cross_attention_src=kwargs["cross_attention_src"])  # txt cond
             # each layer (mha) keeps history of its own k,v for all tokens
         return x

 def create_sin_embedding(positions: torch.Tensor, dim: int, max_period: float = 10000,
                          dtype: torch.dtype = torch.float32) -> torch.Tensor:
     """Create sinusoidal positional embedding, with shape `[B, T, C]`.
         self.v_history = None  # clean up IN LM after finishing GENERATION - Each 1...47 mha has different kv history
         self.memory_efficient = memory_efficient
         self.cross_attention = cross_attention
                 # KV COMPLETION ONLY ON SELF ATTENTION
             if self.memory_efficient:
                 # print('EVER IN MEMORY EFFICIENT A')
             self.dropout_cross = nn.Dropout(dropout)
             self.norm_cross = nn.LayerNorm(d_model, eps=1e-5, **factory_kwargs)
+        self.norm1 = nn.LayerNorm(d_model, eps=1e-5)
+        self.norm2 = nn.LayerNorm(d_model, eps=1e-5)
     def forward(self,
                 src,
                 cross_attention_src=None):  # txtcond
+        '''T is saved float16 weights - should we cast src to float16'''
         x = src
         for j, lay in enumerate(self.layers):
+            # print(f'_________________________{j}___________________')
             x = lay(x, cross_attention_src=kwargs["cross_attention_src"])  # txt cond
             # each layer (mha) keeps history of its own k,v for all tokens
         return x

demo.py CHANGED Viewed

@@ -1,10 +1,7 @@
 import audiofile
 import numpy as np
-import typing as tp
 import torch
 from audiocraft.loaders import load_compression_model, load_lm_model
-from audiocraft.lm import LMModel
 from audiocraft.conditioners import ConditioningAttributes
@@ -15,57 +12,40 @@ class AudioGen():
     def __init__(self,
                  compression_model=None,
                  lm=None,
-                 duration=.04,
-                 top_k=249):
         self.compression_model = compression_model
         self.lm = lm
-        self.top_k = top_k
-        self.compression_model.eval()
-        self.lm.eval()
         self.duration = duration
-        self.device = next(iter(lm.parameters())).device
     @property
-    def frame_rate(self) -> float:
-        """Roughly the number of AR steps per seconds."""
         return self.compression_model.frame_rate
-    @property
-    def sample_rate(self) -> int:
-        """Sample rate of the generated audio."""
-        return self.compression_model.sample_rate
-    def generate(self, descriptions):
-        attributes = [
-            ConditioningAttributes(text={'description': d}) for d in descriptions]
-        tokens = self._generate_tokens(attributes)
-        print(f'\n{tokens.shape=}\n{tokens=}  FINAL 5 AUD')
-        return self.generate_audio(tokens)
-    def _generate_tokens(self, attributes):
-        total_gen_len = int(self.duration * self.frame_rate)
-        gen_tokens = self.lm.generate(conditions=attributes,
-                                          max_gen_len=total_gen_len)
-        gen_tokens = gen_tokens.transpose(0, 1).reshape(4, -1)[None, :, :]
-        return gen_tokens
-    def generate_audio(self, gen_tokens: torch.Tensor) -> torch.Tensor:
-        """Generate Audio from tokens."""
-        assert gen_tokens.dim() == 3
         with torch.no_grad():
-            gen_audio = self.compression_model.decode(gen_tokens, None)
-        return gen_audio
 device = 'cuda:0'
   # https://huggingface.co/facebook/audiogen-medium
 sound_generator = AudioGen(
-    compression_model=load_compression_model('facebook/audiogen-medium', device=device),
-    lm=load_lm_model('facebook/audiogen-medium', device=device).to(torch.float),
-    duration=.04,
-    top_k=1)
@@ -79,7 +59,7 @@ print('\n\n\n\n___________________')
 txt = 'dogs barging in the street'
-x = sound_generator.generate([txt])[0].detach().cpu().numpy()[0, :]
 x /= np.abs(x).max() + 1e-7
 audiofile.write('del_seane.wav', x, 16000)

 import audiofile
 import numpy as np
 import torch
 from audiocraft.loaders import load_compression_model, load_lm_model
 from audiocraft.conditioners import ConditioningAttributes
     def __init__(self,
                  compression_model=None,
                  lm=None,
+                 duration=.74):
         self.compression_model = compression_model
         self.lm = lm
         self.duration = duration
     @property
+    def frame_rate(self):
         return self.compression_model.frame_rate
+    def generate(self,
+                 descriptions):
         with torch.no_grad():
+            attributes = [
+                ConditioningAttributes(text={'description': d}) for d in descriptions]
+            gen_tokens = self.lm.generate(
+                conditions=attributes,
+                max_gen_len=int(self.duration * self.frame_rate)) #[n_draw, 4, 37]
+            x = self.compression_model.decode(gen_tokens, None)   #[n_draw, 1, 11840]
+            n_draw, _, n_time_samples = x.shape
+            x = x.reshape(1, n_draw * n_time_samples)  # linearise n_draw
+        return x
 device = 'cuda:0'
   # https://huggingface.co/facebook/audiogen-medium
 sound_generator = AudioGen(
+    compression_model=load_compression_model('facebook/audiogen-medium', device=device).eval(),
+    lm=load_lm_model('facebook/audiogen-medium', device=device).to(torch.float).eval(),
+    duration=.74)
 txt = 'dogs barging in the street'
+x = sound_generator.generate([txt])[0].detach().cpu().numpy()
 x /= np.abs(x).max() + 1e-7
 audiofile.write('del_seane.wav', x, 16000)