soundscape: discard last 1s from AudioGen - avoids splash sound

Browse files

Files changed (4) hide show

api.py +41 -13
audiocraft/builders.py +5 -1
audiocraft/lm.py +1 -1
audiocraft/transformer.py +4 -4

api.py CHANGED Viewed

@@ -20,7 +20,7 @@ from audiocraft.builders import AudioGen
 CACHE_DIR = 'flask_cache/'
 NUM_SOUND_GENERATIONS = 1  # batch size to generate same text (same soundscape for long video)
-sound_generator = AudioGen(duration=.74, device='cuda:0').to('cuda:0').eval()
 Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
@@ -87,11 +87,11 @@ def overlay(x, soundscape=None):
     if soundscape is not None:
         # SOUNDS
-        print(f'AudioGen {NUM_SOUND_GENERATIONS} x {soundscape}')
         background = sound_generator.generate(
                                         [soundscape] * NUM_SOUND_GENERATIONS
-                                        ).reshape(-1).detach().cpu().numpy() # bs, 11400
         # upsample 16 kHz AudioGen to 24kHZ StyleTTS
         print('Resampling')
@@ -100,20 +100,48 @@ def overlay(x, soundscape=None):
         background = audresample.resample(
             background,
             original_rate=16000, # sound_generator.sample_rate,
-            target_rate=24000)[0, :]
         # background /= np.abs(background).max() + 1e-7  Apply in sound_generator()
-        # replicat audiogen to match TTS
-        n_repeat = len(x) // background.shape[0] + 2
-        # Reach the full length of TTS by cloning
-        print(f'Additional Repeat {n_repeat=}')
-        background = np.concatenate(n_repeat * [background])
         # background = _shift(background)
-        print(f'\n====SOUND BACKGROUND SHAPE\n{background.shape=}',
-              f'{np.abs(background.max())=}\n{x.shape=}')
-        x = .6 * x + .4 * background[:len(x)]
     else:
         print('sound_background = None')
     return x

 CACHE_DIR = 'flask_cache/'
 NUM_SOUND_GENERATIONS = 1  # batch size to generate same text (same soundscape for long video)
+sound_generator = AudioGen(duration=4.74, device='cuda:0').to('cuda:0').eval()
 Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
     if soundscape is not None:
         # SOUNDS
         background = sound_generator.generate(
                                         [soundscape] * NUM_SOUND_GENERATIONS
+                                        ).reshape(-1).detach().cpu().numpy() # bs, 11400 @.74s
+        # sound_generator._flush()  # already done in lm.generate(); the Encodec does not seem to have transformers, thus no kv cache to clean up from the previous soundscape
         # upsample 16 kHz AudioGen to 24kHZ StyleTTS
         print('Resampling')
         background = audresample.resample(
             background,
             original_rate=16000, # sound_generator.sample_rate,
+            target_rate=24000)[0, :-25000]  # discard last samples as they have the splash sound / polarity change;
         # background /= np.abs(background).max() + 1e-7  Apply in sound_generator()
+        k = background.shape[0]
+        hop = int(.7 * k)  # consecutive copies overlap by the remaining 30% of k
+        n_repeat = len(x) // hop
+        total = np.zeros( hop * (n_repeat + 2))  # add some extra pad space for last frame to fit
+        m = np.ones(k)
+        overlap = k - hop
+        m[hop:] = np.linspace(1, 0, overlap)  # tril mask for avg sound in the interpolated hop
+        # m[:overlap] = np.linspace(0, 1, overlap)
+        for j in range(n_repeat):
+            # total[j*k + hop:(j+1)*k + hop] += background
+            # total[j*k + hop:(j+1)*k + hop] = total[j*k + hop:(j+1)*k + hop] + m *background  # the total is already smoothly falling due to the previous mask. It is only the newly added signal that needs to rise smoothly
+            # total[j * (k+hop):(j+1) * k + j*hop] =background
+            total[j*hop:j*hop + k] += m * background # the total is already smoothly falling due to the previous mask. It is only the newly added signal that needs to rise smoothly
+        # total = total.clip(-1, 1)  # if too many signals were added on top of each other
+        # print(total[40000:70000].tolist())
+        print(np.logical_and(total > .1, total < .9).sum(), total.shape, 'ev')
+        # background = np.concatenate(n_repeat * [background])
         # background = _shift(background)
+        # print(f'\n====SOUND BACKGROUND SHAPE\n{background.shape=}',
+        #       f'{np.abs(background.max())=}\n{x.shape=}')
+        total /= np.abs(total).max() + 1e-7  # normalize the overlap-added background to roughly [-1, 1]
+        x = .4 * x + .6 * total[:len(x)]
     else:
         print('sound_background = None')
     return x

audiocraft/builders.py CHANGED Viewed

@@ -252,4 +252,8 @@ class AudioGen(nn.Module):
         model.load_state_dict(pkg['best_state'])
         model.cfg = cfg
         # return model
-        self.lm = model.to(torch.float)

         model.load_state_dict(pkg['best_state'])
         model.cfg = cfg
         # return model
+        self.lm = model.to(torch.float)
+    # def _flush(self):
+        # self.lm._flush()  # already done in lm generate at end

audiocraft/lm.py CHANGED Viewed

@@ -164,7 +164,7 @@ class LMModel(nn.Module):
         self.cfg_coef = cfg_coef
         self.condition_provider = condition_provider
         self.card = card  # 2048 ?
-        self.n_draw = 2  # replicate so many times the generation of each text in batch
         embed_dim = self.card + 1
         self.n_q = n_q
         self.dim = dim

         self.cfg_coef = cfg_coef
         self.condition_provider = condition_provider
         self.card = card  # 2048 ?
+        self.n_draw = 1  # replicate so many times the generation of each text in batch
         embed_dim = self.card + 1
         self.n_q = n_q
         self.dim = dim

audiocraft/transformer.py CHANGED Viewed

@@ -175,7 +175,7 @@ class StreamingMultiheadAttention(nn.Module):
                 v = nn.functional.linear(value, self.in_proj_weight[2 * dim:], bias_v)
                 q, k, v = [rearrange(x, f"b t (h d) -> {layout}", h=self.num_heads) for x in [q, k, v]]
-                print(q.shape, k.shape, v.shape, q.sum(), k.sum(), v.sum(),'CROSS A5')
             else:
                 # 1st projected makes k,v (instantaneous)
                 # 2nd cat
@@ -213,7 +213,7 @@ class StreamingMultiheadAttention(nn.Module):
                 # KV COMPLETION ONLY ON SELF ATTENTION
-                print('KV5', self.k_history.sum(), self.v_history.sum(), self.k_history.shape, self.v_history.shape)
             if self.memory_efficient:
@@ -386,7 +386,7 @@ class StreamingTransformer(nn.Module):
         for j, lay in enumerate(self.layers):
-            print(f'5_________________________{j} {pos_emb.sum()=} {pos_emb.shape=}{x.shape=}___________________')
-            x = lay(x, cross_attention_src=kwargs["cross_attention_src"])  # txt cond
             # each layer (mha) keeps history of its own k,v for all tokens
         return x

                 v = nn.functional.linear(value, self.in_proj_weight[2 * dim:], bias_v)
                 q, k, v = [rearrange(x, f"b t (h d) -> {layout}", h=self.num_heads) for x in [q, k, v]]
+                # print(q.shape, k.shape, v.shape, q.sum(), k.sum(), v.sum(),'CROSS A5')
             else:
                 # 1st projected makes k,v (instantaneous)
                 # 2nd cat
                 # KV COMPLETION ONLY ON SELF ATTENTION
+                # print('KV5', self.k_history.sum(), self.v_history.sum(), self.k_history.shape, self.v_history.shape)
             if self.memory_efficient:
         for j, lay in enumerate(self.layers):
+            # print(f'Transf Layer{j}      {pos_emb.sum()=} {pos_emb.shape=}{x.shape=}___________________')
+            x = lay(x, cross_attention_src=kwargs["cross_attention_src"])  # cross_attention_src = txt-cond
             # each layer (mha) keeps history of its own k,v for all tokens
         return x