apply null cond lm.forw()
Changed files:
- audiocraft/builders.py    +1  -1
- audiocraft/lm.py          +54 -46
- audiocraft/transformer.py +10 -7
- demo.py                   +2  -1
audiocraft/builders.py  CHANGED

@@ -79,7 +79,7 @@ class AudioGen(nn.Module):
                                      conditions=attributes,
                                      max_gen_len=int(self.duration * self.frame_rate))  # [bs, 4, 37 * self.lm.n_draw]
         x = self.compression_model.decode(gen_tokens, None)  # [bs, 1, 11840]
-        print('______________\nGENTOk 5', gen_tokens
+        print('______________\nGENTOk 5', gen_tokens)
         print('GENAUD 5', x.sum())
         return x
 
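The shape comments in this hunk pin down the pipeline's bookkeeping. A small sketch of that arithmetic (the 50 Hz token rate and 16 kHz sample rate are assumptions inferred from the in-line comments, not stated anywhere in the diff):

# Sketch of the shape bookkeeping implied by the comments above.
# Assumptions (not in the diff): 50 Hz token rate, 16 kHz decoded audio.
duration = 0.74                                     # seconds, as set in demo.py
frame_rate = 50                                     # assumed EnCodec token rate
sample_rate = 16_000                                # assumed output sample rate
n_draw = 14                                         # parallel draws per text, see lm.py

max_gen_len = int(duration * frame_rate)            # 37 tokens per codebook
tokens_with_draws = max_gen_len * n_draw            # 518 = 37 * n_draw time steps after concatenation
samples_per_segment = int(duration * sample_rate)   # 11840, matching the decode comment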
audiocraft/lm.py  CHANGED

@@ -9,8 +9,13 @@ from dataclasses import dataclass
 from functools import partial
 from torch import nn
 from audiocraft.activations import get_activation_fn
+import numpy as np
 
-
+def _shift(x):
+    n = x.shape[0]
+    i = np.random.randint(.24 * n, max(1, .74 * n))  # high should be above >= 0 TBD do we have very short segments
+    x = torch.roll(x, i, dims=2)
+    return x
 
 
 

@@ -141,7 +146,7 @@ class LMModel(nn.Module):
                  dim: int = 128,
                  num_heads: int = 8,
                  hidden_scale: int = 4,
-                 norm: str = 'layer_norm',
+                 norm: str = 'layer_norm',
                  norm_first: bool = False,
                  emb_lr: tp.Optional[float] = None,
                  bias_proj: bool = True,

@@ -155,7 +160,7 @@ class LMModel(nn.Module):
         self.cfg_coef = cfg_coef
         self.condition_provider = condition_provider
         self.card = card  # 2048 ?
-        self.n_draw =
+        self.n_draw = 14  # replicate so many times the generation of each text in batch
         embed_dim = self.card + 1
         self.n_q = n_q
         self.dim = dim

@@ -233,71 +238,73 @@ class LMModel(nn.Module):
     def special_token_id(self) -> int:
         return self.card
 
-    def sample_top_k(self, p, k=249):
-        bs, _, _, hidden = p.shape  # logits [3, 4, 1, 2048]
-
-        p = torch.softmax(p, dim=3)
-        top_k_value, i250 = torch.topk(p, k, dim=3)  # [3, 4, 1, k]
-        min_value_top_k = top_k_value[:, :, :, -1:]
-        p *= (p >= min_value_top_k).float()  # zero low probs
-        p.div_(p.sum(dim=-1, keepdim=True))  # renormalise on non-zero probs
-
 
-        # BRING THE nq = 4 IN BATCH
-        p = p.reshape(bs * self.n_q, hidden)
-        out = torch.multinomial(p,  # p=[bs,2048], out=[bs, num_samples]
-                                num_samples=self.n_draw,
-                                replacement=False)  # [bs*4, self.n_draw]
-        return out.reshape(bs, self.n_q, self.n_draw).transpose(1, 2)  # [bs, self.n_draw, 4]
+
 
     def forward(self,
                 sequence,
                 condition_tensors=None,
                 token_count=None):
+        # takes bs=3 duplicates null condition to bs=6 splits logits to cfg returns bs=3
+
+        bs, _, _ = sequence.shape  # sequence [bs, n_draw, 4]
 
         input_ = sum([self.emb[k](sequence[:, k]) for k in range(self.n_q)])
-        out = self.transformer(input_,
-                               cross_attention_src=condition_tensors
+        out = self.transformer(torch.cat([input_, input_], 0),
+                               cross_attention_src=condition_tensors,
                                token_count=token_count)
         if self.out_norm:
             out = self.out_norm(out)
 
-        logits = torch.stack([self.linears[k](out) for k in range(self.n_q)], dim=1)
-
-
+        logits = torch.stack([self.linears[k](out) for k in range(self.n_q)], dim=1)  # [2*bs, 4, 1, 2048]
+
+        logits = 3 * logits[:bs, :, :, :] - 2 * logits[bs:, :, :, :]  # [3, 4, 1, 2048]
+
+        # SAMPLE TOP K
+        k = 250
+        p = torch.softmax(logits, dim=3)
+        top_k_value, _ = torch.topk(p, k, dim=3)  # [3, 4, 1, k]
+        min_value_top_k = top_k_value[:, :, :, -1:]
+        p *= (p >= min_value_top_k).float()  # zero low probs
+        p.div_(p.sum(dim=-1, keepdim=True))  # renormalise on non-zero probs
 
+
+        # BRING THE nq = 4 IN BATCH
+        p = p.reshape(bs * self.n_q, 2048)
+        out = torch.multinomial(p,  # p=[bs,2048], out=[bs, num_samples]
+                                num_samples=self.n_draw,
+                                replacement=True)  # [bs*4, self.n_draw]
+        return out.reshape(bs, self.n_q, self.n_draw).transpose(1, 2)  # [bs=3not6, self.n_draw, 4]
 
-    # GENERATE class revert_codebook_patterns()
     @torch.no_grad()
-    def generate(self,
-                 prompt = None,
-                 conditions = [],
+    def generate(self, conditions = [],
                  max_gen_len=256):
 
-        print(f'{prompt=} {conditions=}')
-        first_param = next(iter(self.parameters()))
-        device = first_param.device
-
-
 
         tokenized = self.condition_provider.tokenize(conditions)
 
-        # print(f'TOKENIZ, {tokenized.keys()=}, {tokenized=}')  # 'description'
-        # TOKENIZ {'description': {'input_ids': tensor([[3887, 16, 2815, 1],
-        #          [3887, 16, 2815, 1]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1],
-        #          [1, 1, 1, 1]], device='cuda:0')}}
 
         cfg_conditions = self.condition_provider(tokenized)
 
 
-
-        #
-
+
+        # NULL CONDITION
+        text_condition = cfg_conditions['description'][0]
+        bs, _, _ = text_condition.shape
+        text_condition = torch.cat(
+            [
+                text_condition,
+                torch.zeros_like(text_condition)
+            ], 0)
+
+
+
+
         pattern = self.pattern_provider.get_pattern(max_gen_len)
         gen_codes = torch.full((bs,
                                 self.n_q,
-                                max_gen_len), -1, dtype=torch.long,
-
+                                max_gen_len), -1, dtype=torch.long,
+                               device=text_condition.device)
+
         gen_sequence, _, mask = pattern.build_pattern_sequence(gen_codes, self.special_token_id)
         _, _, audiodur = gen_sequence.shape  # bs, 4, 7=audiodur
 

@@ -317,13 +324,13 @@ class LMModel(nn.Module):
 
         for offset in range(1, audiodur):
 
-            #
-
-
-
+            # forward duplicates the query to nullcond - then cfg & returns deduplicate token
+            next_token = self.forward(gen_sequence[:, 0, :, offset-1:offset],
+                                      condition_tensors=text_condition,
+                                      token_count=offset-1)  # [bs, 4, 1, 2048]
 
 
-
+
 
             # MASK is not full 1---- HAS 4 x audioduration PATTERN
             m = mask[:, :, :, offset]

@@ -346,6 +353,7 @@ class LMModel(nn.Module):
         out_codes = out_codes.reshape(bs, self.n_draw, 4, new_len)
         out_codes = out_codes.transpose(1, 2).reshape(bs, 4, self.n_draw * new_len)
         print(out_codes.shape, 'o')
+        out_codes = _shift(out_codes)
 
         # Clear Transformer k/v history (Different history is kept by 48x selfattn)
         for lay in self.transformer.layers:
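The rewritten forward() folds classifier-free guidance and top-k sampling into a single pass: the token embeddings are duplicated, the second half of the batch attends to a zeroed text condition, and the two halves of the logits are recombined before n_draw candidate tokens are drawn per codebook. A minimal, self-contained sketch of that sampling step (the guidance weights 3/-2, k=250 and n_draw=14 come from the diff; the function name and the random inputs in the usage comment are illustrative only):

import torch

def cfg_top_k_sample(cond_logits, uncond_logits, k=250, n_draw=14):
    # classifier-free guidance: push towards the text-conditional distribution
    logits = 3 * cond_logits - 2 * uncond_logits            # [bs, n_q, 1, card]
    bs, n_q, _, card = logits.shape
    p = torch.softmax(logits, dim=3)
    top_k_value, _ = torch.topk(p, k, dim=3)                # [bs, n_q, 1, k]
    min_value_top_k = top_k_value[:, :, :, -1:]
    p *= (p >= min_value_top_k).float()                     # zero low probs
    p.div_(p.sum(dim=-1, keepdim=True))                     # renormalise on non-zero probs
    p = p.reshape(bs * n_q, card)                           # bring the n_q codebooks into the batch
    out = torch.multinomial(p, num_samples=n_draw, replacement=True)
    return out.reshape(bs, n_q, n_draw).transpose(1, 2)     # [bs, n_draw, n_q]

# e.g. with bs=3, n_q=4, card=2048:
# tokens = cfg_top_k_sample(torch.randn(3, 4, 1, 2048), torch.randn(3, 4, 1, 2048))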
audiocraft/transformer.py  CHANGED

@@ -175,6 +175,7 @@ class StreamingMultiheadAttention(nn.Module):
             v = nn.functional.linear(value, self.in_proj_weight[2 * dim:], bias_v)
 
             q, k, v = [rearrange(x, f"b t (h d) -> {layout}", h=self.num_heads) for x in [q, k, v]]
+            print(q.shape, k.shape, v.shape, q.sum(), k.sum(), v.sum(), 'CROSS A5')
         else:
             # 1st projected makes k,v (instantaneous)
             # 2nd cat

@@ -190,6 +191,7 @@
             # bound_layout = "b t p h d"
             packed = rearrange(projected, f"b t (p h d) -> {bound_layout}", p=3, h=self.num_heads)
             q, k, v = ops.unbind(packed, dim=2)
+
 
             if self.k_history is not None:
                 #

@@ -198,8 +200,10 @@
                 # 24 heads 64 dimofh
                 self.k_history = torch.cat([self.k_history, k], 2)
                 self.v_history = torch.cat([self.v_history, v], 2)
+
             else:
                 # init on 1st token (for all 47 transf layers)
+                print(f'else skip')
                 self.k_history = k
                 self.v_history = v
 

@@ -209,7 +213,7 @@
 
 
             # KV COMPLETION ONLY ON SELF ATTENTION
-
+            print('KV5', self.k_history.sum(), self.v_history.sum(), self.k_history.shape, self.v_history.shape)
 
 
         if self.memory_efficient:

@@ -327,18 +331,17 @@ class StreamingTransformer(nn.Module):
                  attention_as_float32: bool = False,
                  cross_attention: bool = False,
                  positional_embedding: str = 'sin',
-                 max_period: float = 10_000,
-                 positional_scale: float = 1,
+                 max_period: float = 10_000,
                  layer_class=StreamingTransformerLayer,
                  checkpointing: str = 'none',
                  device=None,
-                 dtype=None,
+                 dtype=None,
+                 **kwargs):
         super().__init__()
         assert d_model % num_heads == 0
 
         self.positional_embedding = positional_embedding
         self.max_period = max_period
-        self.positional_scale = positional_scale
 
 
 

@@ -378,12 +381,12 @@
         positions = torch.arange(T, device=x.device).view(1, -1, 1)
         positions = positions + kwargs['token_count']  # offsets.view(-1, 1, 1)
         pos_emb = create_sin_embedding(positions, C, max_period=self.max_period, dtype=x.dtype)
-        x = x +
+        x = x + pos_emb
 
 
 
         for j, lay in enumerate(self.layers):
-
+            print(f'5_________________________{j} {pos_emb.sum()=} {pos_emb.shape=}{x.shape=}___________________')
             x = lay(x, cross_attention_src=kwargs["cross_attention_src"])  # txt cond
             # each layer (mha) keeps history of its own k,v for all tokens
         return x
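The debug prints above sit around the streaming k/v cache: each self-attention layer keeps its own k_history/v_history, initialised on the first generated token and extended along the time dimension on every later call, while the sinusoidal positional embedding is offset by token_count. A self-contained sketch of that caching pattern (the class name is illustrative, not part of audiocraft; the 24 heads x 64 dims follow the in-line comment):

import torch

class KVHistory:
    def __init__(self):
        self.k_history = None
        self.v_history = None

    def update(self, k, v):
        # k, v: [batch, heads, new_tokens, head_dim]
        if self.k_history is not None:
            # later tokens: append along the time dimension (dim=2)
            self.k_history = torch.cat([self.k_history, k], 2)
            self.v_history = torch.cat([self.v_history, v], 2)
        else:
            # first token: initialise the cache (the "else skip" branch above)
            self.k_history = k
            self.v_history = v
        return self.k_history, self.v_history

# cache = KVHistory()
# for step in range(3):
#     k_all, v_all = cache.update(torch.randn(2, 24, 1, 64), torch.randn(2, 24, 1, 64))
# k_all.shape -> torch.Size([2, 24, 3, 64])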
demo.py  CHANGED

@@ -1,7 +1,8 @@
 import audiofile
 import numpy as np
 from audiocraft import AudioGen
-text_list = ['dogs barging in the street',
+text_list = ['dogs barging in the street',
+             'music']
 
 sound_generator = AudioGen(duration=.74,
                            device='cuda:0').to('cuda:0').eval()