revert transformer.py

Files changed:
- audiocraft/lm.py +20 -42
- audiocraft/transformer.py +204 -39
- audiocraft/utils/utils.py +7 -64
- demo.py +2 -2
- live_api.py +8 -6
audiocraft/lm.py
CHANGED

@@ -246,32 +246,29 @@ class LMModel(StreamingModule):
     def _sample_next_token(self,
                            sequence,
                            cfg_conditions,
-                           unconditional_state,
-                           use_sampling=False,
-                           temp: float = 1.0,
-                           top_k: int = 0,
-                           top_p: float = 0.0,
-                           cfg_coef: tp.Optional[float] = None,
-                           two_step_cfg: tp.Optional[bool] = None) -> torch.Tensor:
+                           unconditional_state):
         """self.n_draw"""
         B = sequence.shape[0]
-        cfg_coef = self.cfg_coef if cfg_coef is None else cfg_coef
-        model = self if self._fsdp is None else self._fsdp
-        two_step_cfg = self.two_step_cfg if two_step_cfg is None else two_step_cfg
-        condition_tensors = cfg_conditions
-
+        model = self if self._fsdp is None else self._fsdp
+        condition_tensors = cfg_conditions
+        # logits = [2, 4, 1, 2048]
         logits = model(
             sequence,  # cond_logits = wav condition
             conditions=[], condition_tensors=condition_tensors)  # uncond_logits already see the text

-        # print(f'{logits.shape=} L')
-        logits = logits[0, :, :, :].transpose(1, 0)  # sample expects [1, 4, 2048]
-        # logits = [2, 4, 1, 2048]
-        # print(f'{B=}, {logits.shape=} SAMPLER {top_k=}')
-        next_token = utils.sample_top_k(logits, k=top_k, n_draw=self.n_draw)  # [1, 4, 2048] logits
+        # use cfg
+        # logits = (3 * logits[1, :, :, :] - 2.4 * logits[0, :, :, :]).transpose(1, 0)
+
+        # or use 1 of logits
+        logits = logits[0, :, :, :].transpose(1, 0)  # [2, 4, 1, 2048] -> [1, 4, 2048]
+
+        # print(f'{B=}, {logits.shape=} SAMPLER {top_k=}')
+        next_token = utils.sample_top_k(logits, n_draw=self.n_draw)  # [1, 4, 2048] logits
         return next_token

         # GENERATE class revert_codebook_patterns()

@@ -282,15 +279,7 @@ class LMModel(StreamingModule):
                  num_samples=1,  # THIS IS HOW MANY GENERATIONS - A SAMPLE IS A FULL WAV
                  max_gen_len=256,  # unduplicated sequence length - actual len will be n_draw * max_gen_len
                  use_sampling: bool = True,
-                 top_k: int = 250,
-                 top_p: float = 0.0,
-                 cfg_coef: tp.Optional[float] = None,
-                 two_step_cfg: tp.Optional[bool] = None,
-                 remove_prompts: bool = False,
-                 check: bool = False,
-                 callback: tp.Optional[tp.Callable[[int, int], None]] = None,
-                 **kwargs) -> torch.Tensor:
+                 **kwargs):

         print(f'{num_samples=}')
         first_param = next(iter(self.parameters()))

@@ -365,32 +354,21 @@ class LMModel(StreamingModule):
                 next_token = self._sample_next_token(
                     curr_sequence,
                     cfg_conditions,
-                    unconditional_state,
-                    use_sampling,
-                    temp, top_k, top_p,
-                    cfg_coef=cfg_coef,
-                    two_step_cfg=two_step_cfg)  # [5, 4, 1]
-                print(f'{next_token.shape=}')
-                # replicate the sequence to hold 5 or more sequences as we generate 5 tokens or more
+                    unconditional_state)  # [5, 4, 1]

-                # ensure the tokens that should be masked are properly set to special_token_id
-                # as the model never output special_token_id
-                # valid_mask = mask[..., offset:offset+1].expand(B, -1, -1)
-                # next_token[~valid_mask] = self.special_token_id
-                # print(f'{unconditional_state=} \n
-                # print('Set All to Special')

+                # RUNS with = 2047 just different of self.special_token_id = 2047 = alwayssingletoken = drill noise
                 # special_token_id is filler for CODEBOOK_PATTERN ?

                 # next_token[:] = self.special_token_id  # seanet.embed torch.embedding does not have this - out of bounds in detokenize

                 _gen_sequence[..., offset:offset+1] = next_token[0, :, :]  # gen_sequence.shape=torch.Size([1, 4, 39])
+
                 duplicate_draw.append(next_token)

                 prev_offset = offset
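For reference, the commented-out `# use cfg` line above is a hand-tuned variant of the usual classifier-free guidance blend, while the reverted code keeps only one of the two stacked logit sets. A minimal sketch of the standard formulation follows; which slice is conditional vs. unconditional is an assumption here (this commit's comments are ambiguous about the ordering), and `cfg_coef=3.0` is only an example value:

```python
import torch

def cfg_blend(logits: torch.Tensor, cfg_coef: float = 3.0) -> torch.Tensor:
    # `logits` assumed shaped [2, K, T, card]: index 0 = conditional pass,
    # index 1 = unconditional pass (an assumption, not confirmed by this commit).
    cond, uncond = logits[0], logits[1]
    return uncond + cfg_coef * (cond - uncond)   # guided logits, [K, T, card]

# The reverted code instead keeps a single pass:
#   logits = logits[0, :, :, :].transpose(1, 0)   # [K, T, card] -> [T, K, card]
```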
audiocraft/transformer.py
CHANGED

@@ -86,6 +86,7 @@ def create_sin_embedding(positions: torch.Tensor, dim: int, max_period: float =
     adim = torch.arange(half_dim, device=positions.device, dtype=dtype).view(1, 1, -1)
     max_period_tensor = torch.full([], max_period, device=positions.device, dtype=dtype)  # avoid sync point
     phase = positions / (max_period_tensor ** (adim / (half_dim - 1)))
+    print('==============CONCAT 3 ============')
     return torch.cat([torch.cos(phase), torch.sin(phase)], dim=-1)

@@ -177,7 +178,7 @@ class StreamingMultiheadAttention(StreamingModule):
         self.past_context = past_context
         self.memory_efficient = memory_efficient
         self.attention_as_float32 = attention_as_float32
-
+        self.rope = rope
         self.cross_attention = cross_attention
         self.safe_streaming = safe_streaming
         self.num_heads = num_heads

@@ -230,8 +231,41 @@ class StreamingMultiheadAttention(StreamingModule):
                 state_dict[prefix + "mha." + key] = state_dict.pop(prefix + key)
         super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)

+    def _get_mask(self, current_steps: int, device: torch.device, dtype: torch.dtype):
+        # Return a causal mask, accounting for potentially stored past keys/values
+        # We actually return a bias for the attention score, as this has the same
+        # convention both in the builtin MHA in Pytorch, and Xformers functions.
+        time_dim = _get_attention_time_dimension(self.memory_efficient)
+        if self.memory_efficient:
+            from xformers.ops import LowerTriangularMask
+            if current_steps == 1:
+                # If we only have one step, then we do not need a mask.
+                return None
+            elif 'past_keys' in self._streaming_state:
+                raise RuntimeError("Not supported at the moment")
+            else:
+                # Then we can safely use a lower triangular mask
+                return LowerTriangularMask()
+        if self._streaming_state:
+            past_keys = self._streaming_state['past_keys']
+            past_steps = past_keys.shape[time_dim]
+        else:
+            past_steps = 0
+
+        queries_pos = torch.arange(
+            past_steps, current_steps + past_steps, device=device).view(-1, 1)
+        keys_pos = torch.arange(past_steps + current_steps, device=device).view(1, -1)
+        delta = queries_pos - keys_pos
+        valid = delta >= 0
+        if self.past_context is not None:
+            valid &= (delta <= self.past_context)
+        return torch.where(
+            valid,
+            torch.zeros([], device=device, dtype=dtype),
+            torch.full([], float('-inf'), device=device, dtype=dtype))

     def _complete_kv(self, k, v):
         time_dim = _get_attention_time_dimension(self.memory_efficient)
         if self.cross_attention:
             # With cross attention we assume all keys and values

@@ -240,16 +274,15 @@ class StreamingMultiheadAttention(StreamingModule):
             return k, v
         # Complete the key/value pair using the streaming state.
         if self._streaming_state:
-            # print('{self._streaming_state.keys()=}') EMPTY - ALTHOUGH WE HAVE STREAMING STATE
             pk = self._streaming_state['past_keys']
             nk = torch.cat([pk, k], dim=time_dim)
+            print('==============CONCAT 1===============')
             if v is k:
                 nv = nk
             else:
                 pv = self._streaming_state['past_values']
                 nv = torch.cat([pv, v], dim=time_dim)
+            print('==============CONCAT 2================')
         else:
             nk = k
             nv = v

@@ -257,28 +290,35 @@ class StreamingMultiheadAttention(StreamingModule):
         assert nk.shape[time_dim] == nv.shape[time_dim]
         offset = 0
         if self.past_context is not None:
             offset = max(0, nk.shape[time_dim] - self.past_context)
         if self._is_streaming:
             self._streaming_state['past_keys'] = nk[:, offset:]
             if v is not k:
                 self._streaming_state['past_values'] = nv[:, offset:]
             if 'offset' in self._streaming_state:
                 self._streaming_state['offset'] += offset
             else:
                 self._streaming_state['offset'] = torch.tensor(0)
         return nk, nv

+    def _apply_rope(self, query: torch.Tensor, key: torch.Tensor):
+        time_dim = _get_attention_time_dimension(self.memory_efficient)
+        # Apply rope embeddings to query and key tensors.
+        assert self.rope is not None
+        if 'past_keys' in self._streaming_state:
+            past_keys_offset = self._streaming_state['past_keys'].shape[1]
+        else:
+            past_keys_offset = 0
+        if 'offset' in self._streaming_state:
+            past_context_offset = int(self._streaming_state['offset'].item())
+        else:
+            past_context_offset = 0
+        streaming_offset = past_context_offset + past_keys_offset
+        return self.rope.rotate_qk(query, key, start=streaming_offset, time_dim=time_dim)

     def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor,
                 key_padding_mask=None, need_weights=False, attn_mask=None,
                 average_attn_weights=True, is_causal=False):
         assert not is_causal, ("New param added in torch 2.0.1 not supported, "
                                "use the causal args in the constructor.")

@@ -292,22 +332,29 @@ class StreamingMultiheadAttention(StreamingModule):
             assert self.causal or self.cross_attention, \
                 "Streaming only available for causal or cross attention"

+        custom_attn_mask = attn_mask is not None

+        if self.causal:
+            assert attn_mask is None
+            # At the moment we specialize only for the self-attention case.
+            assert query.shape[1] == key.shape[1], "Causal only for same length query / key / value"
+            assert value.shape[1] == key.shape[1], "Causal only for same length query / key / value"
+            attn_mask = self._get_mask(query.shape[1], query.device, query.dtype)

         if self.custom:
+            # custom implementation
+            assert need_weights is False
+            assert key_padding_mask is None
             if self.cross_attention:
                 # Different queries, keys, values, we have to spit manually the weights
                 # before applying the linear.
                 dim = self.in_proj_weight.shape[0] // 3
+                if self.in_proj_bias is None:
+                    bias_q, bias_k, bias_v = None, None, None
+                else:
+                    bias_q = self.in_proj_bias[:dim]
+                    bias_k = self.in_proj_bias[dim: 2 * dim]
+                    bias_v = self.in_proj_bias[2 * dim:]
                 q = nn.functional.linear(query, self.in_proj_weight[:dim], bias_q)
                 # todo: when streaming, we could actually save k, v and check the shape actually match.
                 k = nn.functional.linear(key, self.in_proj_weight[dim: 2 * dim], bias_k)

@@ -323,31 +370,125 @@ class StreamingMultiheadAttention(StreamingModule):
                 assert value is key, "specialized implementation"
                 projected = nn.functional.linear(query, self.in_proj_weight, self.in_proj_bias)
                 if self.kv_repeat == 1:
                     if time_dim == 2:
                         bound_layout = "b h p t d"
                     else:
                         bound_layout = "b t p h d"
                     packed = rearrange(projected, f"b t (p h d) -> {bound_layout}", p=3, h=self.num_heads)
                     q, k, v = ops.unbind(packed, dim=2)
+                else:
+                    embed_dim = self.embed_dim
+                    per_head_dim = (embed_dim // self.num_heads)
+                    kv_heads = self.num_heads // self.kv_repeat
+                    q = projected[:, :, :embed_dim]
+                    start = embed_dim
+                    end = start + per_head_dim * kv_heads
+                    k = projected[:, :, start: end]
+                    v = projected[:, :, end:]
+                    q = rearrange(q, f"b t (h d) -> {layout}", h=self.num_heads)
+                    k = rearrange(k, f"b t (h d) -> {layout}", h=kv_heads)
+                    v = rearrange(v, f"b t (h d) -> {layout}", h=kv_heads)

+            if self.qk_layer_norm is True:
+                assert self.kv_repeat == 1
+                q, k = [rearrange(x, f"{layout} -> b t (h d)") for x in [q, k]]
+                q = self.q_layer_norm(q)
+                k = self.k_layer_norm(k)
+                q, k = [rearrange(x, f"b t (h d) -> {layout}", h=self.num_heads) for x in [q, k]]
+            if self.rope:
+                q, k = self._apply_rope(q, k)
             k, v = self._complete_kv(k, v)
-            q, k, v
+            if self.kv_repeat > 1:
+                k = expand_repeated_kv(k, self.kv_repeat, self.memory_efficient)
+                v = expand_repeated_kv(v, self.kv_repeat, self.memory_efficient)
+            if self.attention_as_float32:
+                q, k, v = [x.float() for x in [q, k, v]]
+            if self.memory_efficient:
+                if custom_attn_mask:
+                    # When using a custom attn mask:
+                    # Move to query's device, repeat for each sample, remove align8 padding
+                    seq_len = query.shape[1]
+                    attn_mask = attn_mask.to(q.dtype)
+                    attn_mask = attn_mask.repeat((q.shape[0], 1, 1, 1))
+                    attn_mask = attn_mask[..., :seq_len, :seq_len]
+
+                p = self.dropout if self.training else 0
+                if _efficient_attention_backend == 'torch':
+                    x = torch.nn.functional.scaled_dot_product_attention(
+                        q, k, v, is_causal=attn_mask is not None, dropout_p=p)
+                else:
+                    x = ops.memory_efficient_attention(q, k, v, attn_mask, p=p)
+            else:
+                # We include the dot product as float32, for consistency
+                # with the other implementations that include that step
+                # as part of the attention. Note that when using `autocast`,
+                # the einsums would be done as bfloat16, but the softmax
+                # would be done as bfloat16, so `attention_as_float32` will
+                # extend a bit the range of operations done in float32,
+                # although this should make no difference.
+                q = q / q.shape[-1] ** 0.5
+                key_layout = layout.replace('t', 'k')
+                query_layout = layout
+                if self._is_streaming and self.safe_streaming and q.device.type == 'cuda':
+                    with torch.autocast(device_type=q.device.type, dtype=torch.float32):
+                        pre_w = torch.einsum(f"{query_layout},{key_layout}-> b h t k", q, k)
+                else:
+                    pre_w = torch.einsum(f"{query_layout},{key_layout}-> b h t k", q, k)
+                if attn_mask is not None:
+                    pre_w = pre_w + attn_mask
+                w = torch.softmax(pre_w, dim=-1)
+                w = F.dropout(w, self.dropout, training=self.training).to(v)
+                # Key and value have the same format.
+                x = torch.einsum(f"b h t k, {key_layout} -> {layout}", w, v)
             x = x.to(dtype)
             x = rearrange(x, f"{layout} -> b t (h d)", h=self.num_heads)
             x = self.out_proj(x)
+        else:
+            key, value = self._complete_kv(key, value)
+            if self.attention_as_float32:
+                query, key, value = [x.float() for x in [query, key, value]]
+            x, _ = self.mha(
+                query, key, value, key_padding_mask,
+                need_weights, attn_mask, average_attn_weights)
+            x = x.to(dtype)
+
         return x, None


 class StreamingTransformerLayer(nn.TransformerEncoderLayer):
+    """TransformerLayer with Streaming / Causal support.
+    This also integrates cross_attention, when passing `cross_attention=True`,
+    rather than having two separate classes like in PyTorch.

+    Args:
+        d_model (int): Dimension of the data.
+        num_heads (int): Number of heads.
+        dim_feedforward (int): Intermediate dimension of FF module.
+        dropout (float): Dropout both for MHA and FF.
+        bias_ff (bool): Use bias for FF.
+        bias_attn (bool): Use bias for MHA.
+        causal (bool): Causal mask applied automatically.
+        past_context (int, optional): Receptive field for the causal mask, infinite if None.
+        custom (bool): Use custom MHA implementation, for testing / benchmarking.
+        memory_efficient (bool): Use xformers based memory efficient attention.
+        attention_as_float32 (bool): Perform the attention as float32
+            (especially important with memory_efficient as autocast won't do this automatically).
+        qk_layer_norm (bool): Layer normalization applied to queries and keys before dot product in attention.
+        qk_layer_norm_cross (bool): Same for the cross attention.
+        cross_attention (bool): If True, expect to get secondary input for cross-attention.
+            Cross attention will use the default MHA, as it typically won't require
+            special treatment.
+        layer_scale (float, optional): If not None, LayerScale will be used with
+            the given value as initial scale.
+        rope (`RotaryEmbedding`, optional): Rope embedding to use.
+        attention_dropout (float, optional): If not None, separate the value of the dimension dropout
+            in FFN and of the attention dropout.
+        kv_repeat (int): If > 1, will repeat keys and queries multiple times (need to divide num_heads).
+            This will lead to faster decoding time on A100 or other GPUs with tensorcore.
+        device (torch.device, optional): Device on which to initialize.
+        dtype (torch.dtype, optional): dtype to use.
+        **kwargs: See `nn.TransformerEncoderLayer`.
+    """
     def __init__(self, d_model: int, num_heads: int, dim_feedforward: int = 2048, dropout: float = 0.1,
                  bias_ff: bool = True, bias_attn: bool = True, causal: bool = False,
                  past_context: tp.Optional[int] = None, custom: bool = False,

@@ -495,7 +636,6 @@ class StreamingTransformer(StreamingModule):
         assert positional_embedding in ['sin', 'rope', 'sin_rope']
         self.rope: tp.Optional[RotaryEmbedding] = None
         if self.positional_embedding in ['rope', 'sin_rope']:
-            print('ROPE\nL')
             assert _is_custom(custom, memory_efficient)
             self.rope = RotaryEmbedding(d_model // num_heads, max_period=max_period,
                                         xpos=xpos, scale=positional_scale, device=device)

@@ -523,11 +663,39 @@ class StreamingTransformer(StreamingModule):
                 # backward hook inside of FSDP...
                 layer._magma_checkpointed = True  # type: ignore

+    def _apply_layer(self, layer, *args, **kwargs):
+        method = self.checkpointing
+        if method == 'none':
+            return layer(*args, **kwargs)
+        elif method == 'torch':
+            return torch_checkpoint(layer, *args, use_reentrant=False, **kwargs)
+        elif method.startswith('xformers'):
+            from xformers.checkpoint_fairinternal import checkpoint, _get_default_policy
+            if method == 'xformers_default':
+                # those operations will be saved, and not recomputed.
+                # According to Francisco we can get smarter policies but this is a good start.
+                allow_list = [
+                    "xformers.efficient_attention_forward_cutlass.default",
+                    "xformers_flash.flash_fwd.default",
+                    "aten.addmm.default",
+                    "aten.mm.default",
+                ]
+            elif method == 'xformers_mm':
+                # those operations will be saved, and not recomputed.
+                # According to Francisco we can get smarter policies but this is a good start.
+                allow_list = [
+                    "aten.addmm.default",
+                    "aten.mm.default",
+                ]
+            else:
+                raise ValueError(f"xformers checkpointing xformers policy {method} is not known.")
+            policy_fn = _get_default_policy(allow_list)
+            return checkpoint(layer, *args, policy_fn=policy_fn, **kwargs)
+        else:
+            raise ValueError(f"Checkpointing method {method} is unknown.")

     def forward(self, x: torch.Tensor, *args, **kwargs):
-        # Output x: [2, 1, 1536] how is batch expanded to 2
         B, T, C = x.shape

         if 'offsets' in self._streaming_state:

@@ -536,20 +704,17 @@ class StreamingTransformer(StreamingModule):
             offsets = torch.zeros(B, dtype=torch.long, device=x.device)

         if self.positional_embedding in ['sin', 'sin_rope']:
-            # print(f'{self.positional_embedding=}\n') 'sin'
             positions = torch.arange(T, device=x.device).view(1, -1, 1)
             positions = positions + offsets.view(-1, 1, 1)
             pos_emb = create_sin_embedding(positions, C, max_period=self.max_period, dtype=x.dtype)
             x = x + self.positional_scale * pos_emb

         for layer in self.layers:
-            # # kwargs=() kwargs={'cross_attention_src', 'src_mask'}
-            x = layer(x, **kwargs)
+            x = self._apply_layer(layer, x, *args, **kwargs)

         if self._is_streaming:
             self._streaming_state['offsets'] = offsets + T
+
         return x

     def make_optim_group(self):

@@ -592,4 +757,4 @@ def _verify_xformers_internal_compat():


 def _is_custom(custom: bool, memory_efficient: bool):
-    return custom or memory_efficient
+    return custom or memory_efficient
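The restored `_get_mask` builds an additive attention bias rather than a boolean mask: 0 where a query may attend, -inf where it may not, with an optional `past_context` window limiting how far back attention reaches. A standalone sketch of that logic (toy sizes, no streaming state or xformers path):

```python
import typing as tp
import torch

def causal_bias(current_steps: int, past_steps: int = 0,
                past_context: tp.Optional[int] = None,
                dtype: torch.dtype = torch.float32) -> torch.Tensor:
    # Queries cover only the new steps; keys cover past + new steps.
    queries_pos = torch.arange(past_steps, past_steps + current_steps).view(-1, 1)
    keys_pos = torch.arange(past_steps + current_steps).view(1, -1)
    delta = queries_pos - keys_pos
    valid = delta >= 0                    # never attend to future positions
    if past_context is not None:
        valid &= delta <= past_context    # bounded receptive field
    return torch.where(valid,
                       torch.zeros([], dtype=dtype),
                       torch.full([], float('-inf'), dtype=dtype))

# 3 new steps on top of 2 cached ones, limited to 2 past positions:
print(causal_bias(3, past_steps=2, past_context=2))
```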
audiocraft/utils/utils.py
CHANGED

@@ -1,23 +1,11 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-
-
-from contextlib import contextmanager
-from functools import wraps, lru_cache
 import hashlib
 import json
 import logging
-from pathlib import Path
 import typing as tp
-
 import flashy
 import flashy.distrib
 import omegaconf
 import torch
-from torch.nn.utils.rnn import pad_sequence


 logger = logging.getLogger(__name__)

@@ -46,13 +34,7 @@ def dict_from_config(cfg: omegaconf.DictConfig) -> dict:
     return dct


-def random_subset(dataset, max_samples: int, seed: int = 42) -> torch.utils.data.Subset:
-    if max_samples >= len(dataset):
-        return dataset
-
-    generator = torch.Generator().manual_seed(seed)
-    perm = torch.randperm(len(dataset), generator=generator)
-    return torch.utils.data.Subset(dataset, perm[:max_samples].tolist())


 def get_loader(dataset, num_samples: tp.Optional[int], batch_size: int,

@@ -89,67 +71,28 @@ def get_dataset_from_loader(dataloader):


-def sample_top_k(p, k, n_draw=None):
+def sample_top_k(p, k=250, n_draw=None):
     """
     p probabs 2048 ?
     num_draw : how many tokens to sample (for duplicate elongation)
     """

-    p = torch.softmax(p
+    p = torch.softmax(p, dim=-1)  # p/temp

     top_k_value, i250 = torch.topk(p, k, dim=-1)  # probs: [1, 4, 2048]
+    # print('\n_____TOPK________\n', top_k_value.shape, top_k_value[0, 0, :10], '\n___________END_TOPK____________\n')
     min_value_top_k = top_k_value[..., [-1]]  #
     p *= (p >= min_value_top_k).float()
     p.div_(p.sum(dim=-1, keepdim=True))
     # -- next_token = multinomial(probs, num_samples=num_draw)
+
+    # RESHAPED into bs, 4, 250
     p_ = p.reshape(-1, p.shape[-1])
     out = torch.multinomial(p_,
                             num_samples=n_draw,
                             replacement=False)  # [4, num_draw]
     return out.transpose(0, 1)[:, :, None]  # [num_draw, 4, 1]
-
-
-def length_to_mask(lengths: torch.Tensor, max_len: tp.Optional[int] = None) -> torch.Tensor:
-    """Utility function to convert a tensor of sequence lengths to a mask (useful when working on padded sequences).
-    For example: [3, 5] => [[1, 1, 1, 0, 0], [1, 1, 1, 1, 1]]
-
-    Args:
-        lengths (torch.Tensor): tensor with lengths
-        max_len (int): can set the max length manually. Defaults to None.
-    Returns:
-        torch.Tensor: mask with 0s where there is pad tokens else 1s
-    """
-    assert len(lengths.shape) == 1, "Length shape should be 1 dimensional."
-    final_length = lengths.max().item() if not max_len else max_len
-    final_length = max(final_length, 1)  # if all seqs are of len zero we don't want a zero-size tensor
-    return torch.arange(final_length, device=lengths.device)[None, :] < lengths[:, None]
-
-
-def collate(tensors: tp.List[torch.Tensor], dim: int = 0) -> tp.Tuple[torch.Tensor, torch.Tensor]:
-    """Get a list of tensors and collate them to a single tensor. according to the following logic:
-    - `dim` specifies the time dimension which will be stacked and padded.
-    - The output will contain 1 new dimension (dimension index 0) which will be the size of
-      of the original list.
-
-    Args:
-        tensors (tp.List[torch.Tensor]): List of tensors to collate.
-        dim (int): Dimension which will be stacked and padded.
-    Returns:
-        tp.Tuple[torch.Tensor, torch.Tensor]:
-            torch.Tensor: Stacked and padded tensor. The output will contain 1 new dimension
-                (dimension index 0) which will be the size of the original list.
-            torch.Tensor: Tensor containing length of original tensor sizes (without padding).
-    """
-    tensors = [x.transpose(0, dim) for x in tensors]
-    lens = torch.LongTensor([len(x) for x in tensors])
-    padded_tensors = pad_sequence(tensors)
-    padded_tensors = padded_tensors.transpose(0, 1)
-    padded_tensors = padded_tensors.transpose(1, dim + 1)
-    return padded_tensors, lens
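A quick usage sketch of this fork's `sample_top_k`, with dummy shapes matching the comments above; `n_draw=5` is an arbitrary example value, and the import assumes the fork is installed as `audiocraft`:

```python
import torch
from audiocraft.utils.utils import sample_top_k  # this fork's version; k now defaults to 250

logits = torch.randn(1, 4, 2048)          # [batch, K codebooks, card], as in the comments above
tokens = sample_top_k(logits, n_draw=5)   # draw 5 candidate tokens per codebook in one call
print(tokens.shape)                       # torch.Size([5, 4, 1]) -> [n_draw, K, 1]
```

This is what lm.py relies on when it appends each draw to `duplicate_draw` while writing only `next_token[0]` back into the generated sequence.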
demo.py
CHANGED

@@ -4,10 +4,10 @@ import numpy as np

 print('\n\n\n\n___________________')

-txt = '
+txt = 'dogs in street'

 sound_generator = AudioGen.get_pretrained('facebook/audiogen-medium')
-sound_generator.set_generation_params(duration
+sound_generator.set_generation_params(duration=1.7)  # why is generating so long at 14 seconds

 x = sound_generator.generate([txt])[0].detach().cpu().numpy()[0, :]
 x /= np.abs(x).max() + 1e-7
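The demo normalizes `x` but, as shown here, does nothing further with it. A hedged follow-up for listening to the result, using `soundfile` (an extra dependency, not something this commit adds):

```python
import soundfile as sf

# Assumes `x` (1-D float array) and `sound_generator` from demo.py above.
sf.write('demo_out.wav', x, samplerate=sound_generator.sample_rate)
```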
live_api.py
CHANGED

@@ -17,7 +17,7 @@ from flask_cors import CORS
 from audiocraft.audiogen import AudioGen  #, audio_write

 sound_generator = AudioGen.get_pretrained('facebook/audiogen-medium')
-sound_generator.set_generation_params(duration
+sound_generator.set_generation_params(duration=.7)


 # ====STYLE VECTOR====

@@ -51,11 +51,13 @@ def tts_multi_sentence(scene=None):
         x = sound_generator.generate([scene])[0].detach().cpu().numpy()[0, :]

         x /= np.abs(x).max() + 1e-7
-        #
+        # is 16kHz - AUdiogen Fs
+        x = audresample.resample(x,
+                                 original_rate=sound_generator.sample_rate,  # 16000
+                                 target_rate=24000)[0, :]
+
+        #
         print(f'Craft Finished for: {scene}\n\n\n\n____{x.shape}')
     else:
         print(scene, '\nDrop\n')
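The new `audresample` call explains the trailing `[0, :]`: `audresample.resample` returns a 2-D `(channels, samples)` array even for mono input. A small standalone check, assuming the same keyword arguments as the call above:

```python
import numpy as np
import audresample

x = np.random.randn(16000).astype(np.float32)   # 1 s of mono audio at 16 kHz
y = audresample.resample(x, original_rate=16000, target_rate=24000)
print(y.shape)   # (1, 24000): channels first, hence the [0, :] in live_api.py
y = y[0, :]      # back to a 1-D signal, now at 24 kHz
```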