soundscape: discard last 1s from AudioGen - avoids splash sound
Browse files- api.py +41 -13
- audiocraft/builders.py +5 -1
- audiocraft/lm.py +1 -1
- audiocraft/transformer.py +4 -4
api.py
CHANGED
|
@@ -20,7 +20,7 @@ from audiocraft.builders import AudioGen
|
|
| 20 |
CACHE_DIR = 'flask_cache/'
|
| 21 |
NUM_SOUND_GENERATIONS = 1 # batch size to generate same text (same soundscape for long video)
|
| 22 |
|
| 23 |
-
sound_generator = AudioGen(duration
|
| 24 |
|
| 25 |
|
| 26 |
Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
|
|
@@ -87,11 +87,11 @@ def overlay(x, soundscape=None):
|
|
| 87 |
if soundscape is not None:
|
| 88 |
|
| 89 |
# SOUNDS
|
| 90 |
-
|
| 91 |
background = sound_generator.generate(
|
| 92 |
[soundscape] * NUM_SOUND_GENERATIONS
|
| 93 |
-
).reshape(-1).detach().cpu().numpy() # bs, 11400
|
| 94 |
-
|
| 95 |
# upsample 16 kHz AudioGen to 24kHZ StyleTTS
|
| 96 |
|
| 97 |
print('Resampling')
|
|
@@ -100,20 +100,48 @@ def overlay(x, soundscape=None):
|
|
| 100 |
background = audresample.resample(
|
| 101 |
background,
|
| 102 |
original_rate=16000, # sound_generator.sample_rate,
|
| 103 |
-
target_rate=24000)[0,
|
| 104 |
|
| 105 |
# background /= np.abs(background).max() + 1e-7 Apply in sound_generator()
|
| 106 |
|
| 107 |
-
# replicate the AudioGen output to match the TTS length
|
| 108 |
-
n_repeat = len(x) // background.shape[0] + 2
|
| 109 |
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
# background = _shift(background)
|
| 114 |
-
print(f'\n====SOUND BACKGROUND SHAPE\n{background.shape=}',
|
| 115 |
-
|
| 116 |
-
|
|
|
|
|
|
|
| 117 |
else:
|
| 118 |
print('sound_background = None')
|
| 119 |
return x
|
|
|
|
| 20 |
CACHE_DIR = 'flask_cache/'
|
| 21 |
NUM_SOUND_GENERATIONS = 1 # batch size to generate same text (same soundscape for long video)
|
| 22 |
|
| 23 |
+
sound_generator = AudioGen(duration=4.74, device='cuda:0').to('cuda:0').eval()
|
| 24 |
|
| 25 |
|
| 26 |
Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
|
|
|
|
| 87 |
if soundscape is not None:
|
| 88 |
|
| 89 |
# SOUNDS
|
| 90 |
+
|
| 91 |
background = sound_generator.generate(
|
| 92 |
[soundscape] * NUM_SOUND_GENERATIONS
|
| 93 |
+
).reshape(-1).detach().cpu().numpy() # bs, 11400 @.74s
|
| 94 |
+
# sound_generator._flush()  # already done in lm.generate(); the Encodec does not seem to have transformers, thus there is no kv cache from the previous soundscape to clean up
|
| 95 |
# upsample 16 kHz AudioGen to 24kHZ StyleTTS
|
| 96 |
|
| 97 |
print('Resampling')
|
|
|
|
| 100 |
background = audresample.resample(
|
| 101 |
background,
|
| 102 |
original_rate=16000, # sound_generator.sample_rate,
|
| 103 |
+
target_rate=24000)[0, :-25000] # discard last samples as they have the splash sound / polarity change;
|
| 104 |
|
| 105 |
# background /= np.abs(background).max() + 1e-7 Apply in sound_generator()
|
| 106 |
|
|
|
|
|
|
|
| 107 |
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
k = background.shape[0]
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
hop = int(.7 * k) # consecutive copies start 70% of k apart, so they overlap by ~30% (overlap = k - hop)
|
| 120 |
+
n_repeat = len(x) // hop
|
| 121 |
+
total = np.zeros( hop * (n_repeat + 2)) # add some extra pad space for last frame to fit
|
| 122 |
+
|
| 123 |
+
m = np.ones(k)
|
| 124 |
+
overlap = k - hop
|
| 125 |
+
m[hop:] = np.linspace(1, 0, overlap) # linear fade-out ramp (1 -> 0) applied over the overlapping tail of each copy
|
| 126 |
+
# m[:overlap] = np.linspace(0, 1, overlap)
|
| 127 |
+
|
| 128 |
+
for j in range(n_repeat):
|
| 129 |
+
# total[j*k + hop:(j+1)*k + hop] += background
|
| 130 |
+
# total[j*k + hop:(j+1)*k + hop] = total[j*k + hop:(j+1)*k + hop] + m *background # the total is already smoothly falling due to the previous mask. Is only the new added signal that needs to rise smoothl
|
| 131 |
+
# total[j * (k+hop):(j+1) * k + j*hop] =background
|
| 132 |
+
total[j*hop:j*hop + k] += m * background # the total is already smoothly falling due to the previous copy's mask; it is only the newly added signal that would need to rise smoothly (the fade-in ramp is currently commented out)
|
| 133 |
+
# total = total.clip(-1, 1) # if too many signals were added on top of each other
|
| 134 |
+
# print(total[40000:70000].tolist())
|
| 135 |
+
print(np.logical_and(total > .1, total < .9).sum(), total.shape, 'ev')
|
| 136 |
+
|
| 137 |
+
# background = np.concatenate(n_repeat * [background])
|
| 138 |
+
|
| 139 |
# background = _shift(background)
|
| 140 |
+
# print(f'\n====SOUND BACKGROUND SHAPE\n{background.shape=}',
|
| 141 |
+
# f'{np.abs(background.max())=}\n{x.shape=}')
|
| 142 |
+
total /= np.abs(total).max() + 1e-7 # peak-normalize the tiled soundscape to [-1, 1]; 1e-7 avoids division by zero
|
| 143 |
+
x = .4 * x + .6 * total[:len(x)]
|
| 144 |
+
|
| 145 |
else:
|
| 146 |
print('sound_background = None')
|
| 147 |
return x
|
audiocraft/builders.py
CHANGED
|
@@ -252,4 +252,8 @@ class AudioGen(nn.Module):
|
|
| 252 |
model.load_state_dict(pkg['best_state'])
|
| 253 |
model.cfg = cfg
|
| 254 |
# return model
|
| 255 |
-
self.lm = model.to(torch.float)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 252 |
model.load_state_dict(pkg['best_state'])
|
| 253 |
model.cfg = cfg
|
| 254 |
# return model
|
| 255 |
+
self.lm = model.to(torch.float)
|
| 256 |
+
|
| 257 |
+
# def _flush(self):
|
| 258 |
+
# self.lm._flush() # already done in lm generate at end
|
| 259 |
+
|
audiocraft/lm.py
CHANGED
|
@@ -164,7 +164,7 @@ class LMModel(nn.Module):
|
|
| 164 |
self.cfg_coef = cfg_coef
|
| 165 |
self.condition_provider = condition_provider
|
| 166 |
self.card = card # 2048 ?
|
| 167 |
-
self.n_draw =
|
| 168 |
embed_dim = self.card + 1
|
| 169 |
self.n_q = n_q
|
| 170 |
self.dim = dim
|
|
|
|
| 164 |
self.cfg_coef = cfg_coef
|
| 165 |
self.condition_provider = condition_provider
|
| 166 |
self.card = card # 2048 ?
|
| 167 |
+
self.n_draw = 1 # number of times the generation of each text in the batch is replicated
|
| 168 |
embed_dim = self.card + 1
|
| 169 |
self.n_q = n_q
|
| 170 |
self.dim = dim
|
audiocraft/transformer.py
CHANGED
|
@@ -175,7 +175,7 @@ class StreamingMultiheadAttention(nn.Module):
|
|
| 175 |
v = nn.functional.linear(value, self.in_proj_weight[2 * dim:], bias_v)
|
| 176 |
|
| 177 |
q, k, v = [rearrange(x, f"b t (h d) -> {layout}", h=self.num_heads) for x in [q, k, v]]
|
| 178 |
-
print(q.shape, k.shape, v.shape, q.sum(), k.sum(), v.sum(),'CROSS A5')
|
| 179 |
else:
|
| 180 |
# 1st projected makes k,v (instantaneous)
|
| 181 |
# 2nd cat
|
|
@@ -213,7 +213,7 @@ class StreamingMultiheadAttention(nn.Module):
|
|
| 213 |
|
| 214 |
|
| 215 |
# KV COMPLETION ONLY ON SELF ATTENTION
|
| 216 |
-
print('KV5', self.k_history.sum(), self.v_history.sum(), self.k_history.shape, self.v_history.shape)
|
| 217 |
|
| 218 |
|
| 219 |
if self.memory_efficient:
|
|
@@ -386,7 +386,7 @@ class StreamingTransformer(nn.Module):
|
|
| 386 |
|
| 387 |
|
| 388 |
for j, lay in enumerate(self.layers):
|
| 389 |
-
print(f'
|
| 390 |
-
x = lay(x, cross_attention_src=kwargs["cross_attention_src"]) # txt
|
| 391 |
# each layer (mha) keeps history of its own k,v for all tokens
|
| 392 |
return x
|
|
|
|
| 175 |
v = nn.functional.linear(value, self.in_proj_weight[2 * dim:], bias_v)
|
| 176 |
|
| 177 |
q, k, v = [rearrange(x, f"b t (h d) -> {layout}", h=self.num_heads) for x in [q, k, v]]
|
| 178 |
+
# print(q.shape, k.shape, v.shape, q.sum(), k.sum(), v.sum(),'CROSS A5')
|
| 179 |
else:
|
| 180 |
# 1st projected makes k,v (instantaneous)
|
| 181 |
# 2nd cat
|
|
|
|
| 213 |
|
| 214 |
|
| 215 |
# KV COMPLETION ONLY ON SELF ATTENTION
|
| 216 |
+
# print('KV5', self.k_history.sum(), self.v_history.sum(), self.k_history.shape, self.v_history.shape)
|
| 217 |
|
| 218 |
|
| 219 |
if self.memory_efficient:
|
|
|
|
| 386 |
|
| 387 |
|
| 388 |
for j, lay in enumerate(self.layers):
|
| 389 |
+
# print(f'Transf Layer{j} {pos_emb.sum()=} {pos_emb.shape=}{x.shape=}___________________')
|
| 390 |
+
x = lay(x, cross_attention_src=kwargs["cross_attention_src"]) # cross_attention_src = txt-cond
|
| 391 |
# each layer (mha) keeps history of its own k,v for all tokens
|
| 392 |
return x
|