fx sounds batch inference
Files changed:
- README.md +2 -2
- api.py +7 -15
- audiocraft/builders.py +10 -40
- audiocraft/conditioners.py +24 -198
- audiocraft/lm.py +7 -23
- audiocraft/transformer.py +1 -4
- models.py +5 -1
- requirements.txt +19 -0
README.md
CHANGED
@@ -62,7 +62,7 @@ pip install -r requirements.txt
 Flask `tmux-session`

 ```
-CUDA_DEVICE_ORDER=PCI_BUS_ID HF_HOME
+CUDA_DEVICE_ORDER=PCI_BUS_ID HF_HOME=/data/dkounadis/.hf7/ CUDA_VISIBLE_DEVICES=0 python api.py
 ```

 Following examples need `api.py` to be running. [Set this IP](https://huggingface.co/dkounadis/artificial-styletts2/blob/main/tts.py#L85) to the IP shown when starting `api.py`.
@@ -127,5 +127,5 @@ Create audiobook from `.docx`. Listen to it - YouTube [male voice](https://www.y

 ```python
 # audiobook will be saved in ./tts_audiobooks
-python audiobook.py
+python audiobook.py
 ```
api.py
CHANGED
@@ -9,12 +9,11 @@ import re
 import srt
 import subprocess
 import cv2
-import markdown
 from pathlib import Path
 from types import SimpleNamespace
 from flask import Flask, request, send_from_directory
-from
-from moviepy.
+from moviepy.video.io.VideoFileClip import VideoFileClip
+from moviepy.video.VideoClip import ImageClip
 from audiocraft.builders import AudioGen
 CACHE_DIR = 'flask_cache/'
 NUM_SOUND_GENERATIONS = 3  # batch size to generate same text (same soundscape for long video)
@@ -79,10 +78,10 @@ def overlay(x, soundscape=None):
     background = sound_generator.generate(
         [soundscape] * NUM_SOUND_GENERATIONS
     ).reshape(-1).detach().cpu().numpy()  # bs, 11400 @.74s
-
-    # upsample 16 kHz AudioGen to 24kHZ
+
+    # upsample 16 kHz AudioGen to 24kHZ of VITS/StyleTTS2

-    print('Resampling')
+    print('Resampling')  # soundscape each generation in batch differs from the other generations thus clone/shift each element in batch, finally concat w/o shift


     background = audresample.resample(
@@ -178,14 +177,6 @@ def tts_multi_sentence(precomputed_style_vector=None,
 # global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)

 app = Flask(__name__)
-cors = CORS(app)
-
-
-@app.route("/")
-def index():
-    with open('README.md', 'r') as f:
-        return markdown.markdown(f.read())
-

 @app.route("/", methods=['GET', 'POST', 'PUT'])
 def serve_wav():
@@ -460,7 +451,8 @@ def serve_wav():

     # SILENT CLIP

-    clip_silent = ImageClip(STATIC_FRAME
+    clip_silent = ImageClip(img=STATIC_FRAME,
+                            duration=5)  # ffmpeg continues this silent video for duration of TTS
     clip_silent.write_videofile(SILENT_VIDEO, fps=24)

     x = tts_multi_sentence(text=text,
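The `overlay` hunk above is the core of the batch-inference fix: the same soundscape prompt is generated `NUM_SOUND_GENERATIONS` times in one batch, flattened into a single long background track, and resampled from AudioGen's 16 kHz to the 24 kHz TTS rate. A minimal sketch of that flow, assuming a `sound_generator` whose `generate()` returns a torch tensor as in the diff (the helper name `make_background` is made up for illustration):

```python
import audresample

NUM_SOUND_GENERATIONS = 3  # variants of the same prompt drawn in one forward pass

def make_background(sound_generator, soundscape, fs_in=16000, fs_out=24000):
    # One batched call yields NUM_SOUND_GENERATIONS takes of the same soundscape;
    # flattening them end-to-end gives a longer, non-repetitive background.
    batch = sound_generator.generate(
        [soundscape] * NUM_SOUND_GENERATIONS
    ).reshape(-1).detach().cpu().numpy()
    # AudioGen decodes at 16 kHz; the TTS pipeline runs at 24 kHz, so resample before overlay.
    return audresample.resample(batch, fs_in, fs_out)
```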
audiocraft/builders.py
CHANGED
@@ -10,11 +10,7 @@ from .encodec import EncodecModel
 from .lm import LMModel
 from .seanet import SEANetDecoder
 from .codebooks_patterns import DelayedPatternProvider
-from .conditioners import (
-    ConditioningProvider,
-    T5Conditioner,
-    ConditioningAttributes
-)
+from .conditioners import T5Conditioner
 from .vq import ResidualVectorQuantizer


@@ -73,10 +69,8 @@ class AudioGen(nn.Module):
     def generate(self,
                  descriptions):
         with torch.no_grad():
-            attributes = [
-                ConditioningAttributes(text={'description': d}) for d in descriptions]
             gen_tokens = self.lm.generate(
-
+                descriptions=descriptions,
                 max_gen_len=int(self.duration * self.frame_rate))  # [bs, 4, 37 * self.lm.n_draw]
             x = self.compression_model.decode(gen_tokens, None)  # [bs, 1, 11840]
             # print('______________\nAudioGen Tokens', gen_tokens)
@@ -144,10 +138,8 @@ class AudioGen(nn.Module):
         codebooks_pattern_cfg = getattr(cfg, 'codebooks_pattern')
         attribute_dropout = dict_from_config(getattr(cfg, 'attribute_dropout'))
         cls_free_guidance = dict_from_config(getattr(cfg, 'classifier_free_guidance'))
-        cfg_prob, cfg_coef = cls_free_guidance['training_dropout'], cls_free_guidance['inference_coef']
+        cfg_prob, cfg_coef = cls_free_guidance['training_dropout'], cls_free_guidance['inference_coef']

-        condition_provider = self.get_conditioner_provider(kwargs["dim"], cfg
-                                                           ).to(self.device)


         # if len(fuser.fuse2cond['cross']) > 0:  # enforce cross-att programmatically
@@ -163,7 +155,7 @@ class AudioGen(nn.Module):
         pattern_provider = self.get_codebooks_pattern_provider(n_q, codebooks_pattern_cfg)
         return LMModel(
             pattern_provider=pattern_provider,
-            condition_provider=
+            condition_provider=T5Conditioner(name='t5-large', output_dim=kwargs["dim"], device=self.device),
             cfg_dropout=cfg_prob,
             cfg_coef=cfg_coef,
             attribute_dropout=attribute_dropout,
@@ -173,34 +165,8 @@ class AudioGen(nn.Module):
             ).to(cfg.device)
         else:
             raise KeyError(f"Unexpected LM model {cfg.lm_model}")
-
-
-    def get_conditioner_provider(self, output_dim,
-                                 cfg):
-        """Instantiate T5 text"""
-        cfg = getattr(cfg, 'conditioners')
-        dict_cfg = {} if cfg is None else dict_from_config(cfg)
-        conditioners = {}
-        condition_provider_args = dict_cfg.pop('args', {})
-        condition_provider_args.pop('merge_text_conditions_p', None)
-        condition_provider_args.pop('drop_desc_p', None)
-
-        for cond, cond_cfg in dict_cfg.items():
-            model_type = cond_cfg['model']
-            model_args = cond_cfg[model_type]
-            if model_type == 't5':
-                conditioners[str(cond)] = T5Conditioner(output_dim=output_dim,
-                                                        device=self.device,
-                                                        **model_args)
-            else:
-                raise ValueError(f"Unrecognized conditioning model: {model_type}")
-
-        # print(f'{condition_provider_args=}')
-        return ConditioningProvider(conditioners)
-
-
-
-
+
+
     def get_codebooks_pattern_provider(self, n_q, cfg):
         pattern_providers = {
             'delay': DelayedPatternProvider,  # THIS
@@ -249,6 +215,10 @@ class AudioGen(nn.Module):
         _delete_param(cfg, 'conditioners.args.merge_text_conditions_p')
         _delete_param(cfg, 'conditioners.args.drop_desc_p')
         model = self.get_lm_model(cfg)
+
+        _best = pkg['best_state']
+        _best['condition_provider.output_proj.weight'] = _best.pop('condition_provider.conditioners.description.output_proj.weight')
+        _best['condition_provider.output_proj.bias'] = _best.pop('condition_provider.conditioners.description.output_proj.bias')
         model.load_state_dict(pkg['best_state'])
         model.cfg = cfg
         # return model
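The last hunk is what lets the old checkpoint load into the refactored model: since `ConditioningProvider` is gone and `T5Conditioner` is now the `condition_provider` itself, the two `output_proj` keys in `best_state` must be renamed before `load_state_dict`. A small sketch of that idea (the generic `remap_keys` helper is illustrative, not part of the repo):

```python
def remap_keys(state_dict, renames):
    # Rename checkpoint entries in place so weights saved against the old module
    # hierarchy load into the new, flatter one.
    for old_key, new_key in renames.items():
        if old_key in state_dict:
            state_dict[new_key] = state_dict.pop(old_key)
    return state_dict

# Mirrors the diff: strip the removed ConditioningProvider level from the key path.
RENAMES = {
    'condition_provider.conditioners.description.output_proj.weight':
        'condition_provider.output_proj.weight',
    'condition_provider.conditioners.description.output_proj.bias':
        'condition_provider.output_proj.bias',
}
# pkg['best_state'] = remap_keys(pkg['best_state'], RENAMES)
# model.load_state_dict(pkg['best_state'])
```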
audiocraft/conditioners.py
CHANGED
@@ -1,82 +1,9 @@
-from collections import defaultdict
-from dataclasses import dataclass, field
-import logging
-import random
-import typing as tp
 import warnings
 from transformers import T5EncoderModel, T5Tokenizer  # type: ignore
 import torch
 from torch import nn
-logger = logging.getLogger(__name__)
-TextCondition = tp.Optional[str]  # a text condition can be a string or None (if doesn't exist)
-ConditionType = tp.Tuple[torch.Tensor, torch.Tensor]  # condition, mask


-
-
-class JointEmbedCondition(tp.NamedTuple):
-    wav: torch.Tensor
-    text: tp.List[tp.Optional[str]]
-    length: torch.Tensor
-    sample_rate: tp.List[int]
-    path: tp.List[tp.Optional[str]] = []
-    seek_time: tp.List[tp.Optional[float]] = []
-
-
-@dataclass
-class ConditioningAttributes:
-    text: tp.Dict[str, tp.Optional[str]] = field(default_factory=dict)
-    wav: tp.Dict[str, tp.Optional[str]] = field(default_factory=dict)
-    joint_embed: tp.Dict[str, JointEmbedCondition] = field(default_factory=dict)
-
-    def __getitem__(self, item):
-        return getattr(self, item)
-
-    @property
-    def text_attributes(self):
-        return self.text.keys()
-
-    @property
-    def wav_attributes(self):
-        return self.wav.keys()
-
-    @property
-    def joint_embed_attributes(self):
-        return self.joint_embed.keys()
-
-    @property
-    def attributes(self):
-        return {
-            "text": self.text_attributes,
-            "wav": self.wav_attributes,
-            "joint_embed": self.joint_embed_attributes,
-        }
-
-    def to_flat_dict(self):
-        return {
-            **{f"text.{k}": v for k, v in self.text.items()},
-            **{f"wav.{k}": v for k, v in self.wav.items()},
-            **{f"joint_embed.{k}": v for k, v in self.joint_embed.items()}
-        }
-
-    @classmethod
-    def from_flat_dict(cls, x):
-        out = cls()
-        for k, v in x.items():
-            kind, att = k.split(".")
-            out[kind][att] = v
-        return out
-
-
-class Tokenizer:
-    """Base tokenizer implementation
-    (in case we want to introduce more advances tokenizers in the future).
-    """
-    def __call__(self, texts: tp.List[tp.Optional[str]]) -> tp.Tuple[torch.Tensor, torch.Tensor]:
-        raise NotImplementedError()
-
-
-
 class T5Conditioner(nn.Module):

     MODELS = ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b",
@@ -95,12 +22,10 @@ class T5Conditioner(nn.Module):
               "google/flan-t5-11b": 1024,
               }

-    def __init__(self,
-                 name
-                 output_dim
-                 device
-                 word_dropout: float = 0.,
-                 normalize_text: bool = False,
+    def __init__(self,
+                 name,
+                 output_dim,
+                 device,
                  finetune=False):
         print(f'{finetune=}')
         assert name in self.MODELS, f"Unrecognized t5 model name (should in {self.MODELS})"
@@ -110,19 +35,9 @@ class T5Conditioner(nn.Module):
         self.output_proj = nn.Linear(self.dim, output_dim)
         self.device = device
         self.name = name
-
-
-
-        # thanks https://gist.github.com/simon-weber/7853144
-        previous_level = logging.root.manager.disable
-        logging.disable(logging.ERROR)
-        with warnings.catch_warnings():
-            warnings.simplefilter("ignore")
-            try:
-                self.t5_tokenizer = T5Tokenizer.from_pretrained(name)
-                t5 = T5EncoderModel.from_pretrained(name).eval()  #.train(mode=finetune)
-            finally:
-                logging.disable(previous_level)
+
+        self.t5_tokenizer = T5Tokenizer.from_pretrained(name)
+        t5 = T5EncoderModel.from_pretrained(name).eval()  #.train(mode=finetune)
         if finetune:
             self.t5 = t5
         else:
@@ -130,116 +45,27 @@ class T5Conditioner(nn.Module):
             # of the saved checkpoint
             self.__dict__['t5'] = t5.to(device)

-        self.normalize_text = normalize_text
-        if normalize_text:
-            self.text_normalizer = WhiteSpaceTokenizer(1, lemma=True, stopwords=True)

-    def tokenize(self, x
-
-        entries
-        if self.normalize_text:
-            _, _, entries = self.text_normalizer(entries, return_text=True)
-        if self.word_dropout > 0. and self.training:
-            new_entries = []
-            for entry in entries:
-                words = [word for word in entry.split(" ") if random.random() >= self.word_dropout]
-                new_entries.append(" ".join(words))
-            entries = new_entries
+    def tokenize(self, x):
+
+        entries = [xi if xi is not None else "" for xi in x]

-        empty_idx = torch.LongTensor([i for i, xi in enumerate(entries) if xi == ""])

-        inputs = self.t5_tokenizer(entries,
-
-
-
+        inputs = self.t5_tokenizer(entries,
+                                   return_tensors='pt',
+                                   padding=True).to(self.device)
+
+        return inputs  # 'input_ids' 'attentio mask'

-    def forward(self,
-
+    def forward(self, descriptions):
+
+        d = self.tokenize(descriptions)
+
         with torch.no_grad():
-            embeds = self.t5(
+            embeds = self.t5(input_ids=d['input_ids'],
+                             attention_mask=d['attention_mask']
+                             ).last_hidden_state  # no kvcache for txt conditioning
             embeds = self.output_proj(embeds.to(self.output_proj.weight))
-            embeds = (embeds *
-
-            # T5 torch.Size([2, 4, 1536]) dict_keys(['input_ids', 'attention_mask'])
-            print(f'{embeds.dtype=}')  # inputs["input_ids"].shape=torch.Size([2, 4])
-            return embeds, mask
-
-
-
-
-
-
-
-
-class ConditioningProvider(nn.Module):
-
-    def __init__(self,
-                 conditioners):
-        super().__init__()
-        self.conditioners = nn.ModuleDict(conditioners)
-
-    @property
-    def text_conditions(self):
-        return [k for k, v in self.conditioners.items() if isinstance(v, T5Conditioner)]
-
-
-
-    def tokenize(self, inputs: tp.List[ConditioningAttributes]) -> tp.Dict[str, tp.Any]:
-        output = {}
-        text = self._collate_text(inputs)
-        # wavs = self._collate_wavs(inputs)
-        # joint_embeds = self._collate_joint_embeds(inputs)
-
-        # assert set(text.keys() | wavs.keys() | joint_embeds.keys()).issubset(set(self.conditioners.keys())), (
-        #     f"Got an unexpected attribute! Expected {self.conditioners.keys()}, ",
-        #     f"got {text.keys(), wavs.keys(), joint_embeds.keys()}"
-        # )
-        for attribute, batch in text.items():  #, joint_embeds.items()):
-            output[attribute] = self.conditioners[attribute].tokenize(batch)
-        print(f'COndProvToknz {output=}\n==')
-        return output
-
-    def forward(self, tokenized: tp.Dict[str, tp.Any]) -> tp.Dict[str, ConditionType]:
-        """Compute pairs of `(embedding, mask)` using the configured conditioners and the tokenized representations.
-        The output is for example:
-        {
-            "genre": (torch.Tensor([B, 1, D_genre]), torch.Tensor([B, 1])),
-            "description": (torch.Tensor([B, T_desc, D_desc]), torch.Tensor([B, T_desc])),
-            ...
-        }
-
-        Args:
-            tokenized (dict): Dict of tokenized representations as returned by `tokenize()`.
-        """
-        output = {}
-        for attribute, inputs in tokenized.items():
-            condition, mask = self.conditioners[attribute](inputs)
-            output[attribute] = (condition, mask)
-        return output
-
-    def _collate_text(self, samples):
-        """Given a list of ConditioningAttributes objects, compile a dictionary where the keys
-        are the attributes and the values are the aggregated input per attribute.
-        For example:
-        Input:
-        [
-            ConditioningAttributes(text={"genre": "Rock", "description": "A rock song with a guitar solo"}, wav=...),
-            ConditioningAttributes(text={"genre": "Hip-hop", "description": "A hip-hop verse"}, wav=...),
-        ]
-        Output:
-        {
-            "genre": ["Rock", "Hip-hop"],
-            "description": ["A rock song with a guitar solo", "A hip-hop verse"]
-        }

-
-        samples (list of ConditioningAttributes): List of ConditioningAttributes samples.
-        Returns:
-            dict[str, list[str, optional]]: A dictionary mapping an attribute name to text batch.
-        """
-        out: tp.Dict[str, tp.List[tp.Optional[str]]] = defaultdict(list)
-        texts = [x.text for x in samples]
-        for text in texts:
-            for condition in self.text_conditions:
-                out[condition].append(text[condition])
-        return out
+            embeds = (embeds * d['attention_mask'].unsqueeze(-1))

+        return embeds  # , d['attention_mask']
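After this commit `T5Conditioner` is self-contained: `forward()` tokenizes the raw description strings itself and returns the projected, mask-zeroed embeddings directly instead of a `(condition, mask)` tuple. A usage sketch (instantiation downloads the T5 checkpoint on first run; `output_dim=1536` is an assumption taken from the debug comment `# T5 torch.Size([2, 4, 1536])` in the removed code):

```python
from audiocraft.conditioners import T5Conditioner

# output_dim=1536 is assumed; in builders.py it is kwargs["dim"] from the model config.
cond = T5Conditioner(name='t5-large', output_dim=1536, device='cpu')

# forward() now tokenizes internally; padded positions are already zeroed via the attention mask.
embeds = cond(['windy day', 'rain storm'])
print(embeds.shape)  # [2, seq_len, 1536]
```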
audiocraft/lm.py
CHANGED
@@ -23,17 +23,6 @@ def _shift(x):



-# ============================================== From LM.py
-
-
-logger = logging.getLogger(__name__)
-TextCondition = tp.Optional[str]  # a text condition can be a string or None (if doesn't exist)
-ConditionType = tp.Tuple[torch.Tensor, torch.Tensor]  # condition, mask
-
-ConditionTensors = tp.Dict[str, ConditionType]
-CFGConditions = tp.Union[ConditionTensors, tp.Tuple[ConditionTensors, ConditionTensors]]
-
-
 def get_init_fn(method: str, input_dim: int, init_depth: tp.Optional[int] = None):
     """LM layer initialization.
     Inspired from xlformers: https://github.com/fairinternal/xlformers
@@ -280,19 +269,14 @@ class LMModel(nn.Module):
         return out.reshape(bs, self.n_q, self.n_draw).transpose(1,2)  # [bs=3not6, self.n_draw, 4]

     @torch.no_grad()
-    def generate(self,
-
-
-
-        tokenized = self.condition_provider.tokenize(conditions)
-
-
-        cfg_conditions = self.condition_provider(tokenized)
-
-
+    def generate(self,
+                 descriptions = ['windy day', 'rain storm'],
+                 max_gen_len = 256):

+        text_condition = self.condition_provider(descriptions)
+
         # NULL CONDITION
-        text_condition = cfg_conditions['description'][0]
+        # text_condition = cfg_conditions['description'][0]
         bs, _, _ = text_condition.shape
         text_condition = torch.cat(
             [
@@ -330,7 +314,7 @@ class LMModel(nn.Module):

         # forward duplicates the query to nullcond - then cfg & returns deduplicate token
         next_token = self.forward(gen_sequence[:, 0, :, offset-1:offset],
-                                  condition_tensors=text_condition,
+                                  condition_tensors=text_condition,  # utilisation of the attention mask of txt condition ?
                                   token_count=offset-1)  # [bs, 4, 1, 2048]


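`generate()` now encodes the descriptions once and immediately builds the null condition for classifier-free guidance; the `torch.cat([` above is truncated, but the usual pattern (an assumption here, not confirmed by the visible diff) is to stack an all-zeros copy of the text condition under the conditional one so a single forward pass scores both branches. A sketch of that assumed construction:

```python
import torch

def with_null_condition(text_condition):
    # text_condition: [bs, seq_len, dim] from the T5 conditioner.
    null_condition = torch.zeros_like(text_condition)  # assumed form of the null branch
    # Conditional rows first, unconditional (null) rows second; forward() can then
    # apply CFG and return only the conditional half of the tokens.
    return torch.cat([text_condition, null_condition], dim=0)  # [2*bs, seq_len, dim]
```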
audiocraft/transformer.py
CHANGED
@@ -4,7 +4,6 @@ import torch
 import torch.nn as nn
 from torch.nn import functional as F
 from torch.utils.checkpoint import checkpoint as torch_checkpoint
-from xformers import ops


 _efficient_attention_backend: str = 'torch'
@@ -12,7 +11,6 @@ _efficient_attention_backend: str = 'torch'



-
 def _get_attention_time_dimension(memory_efficient: bool) -> int:
     if _efficient_attention_backend == 'torch' and memory_efficient:
         return 2
@@ -190,7 +188,7 @@ class StreamingMultiheadAttention(nn.Module):
             # else:
             #     bound_layout = "b t p h d"
             packed = rearrange(projected, f"b t (p h d) -> {bound_layout}", p=3, h=self.num_heads)
-            q, k, v =
+            q, k, v = packed.unbind(dim=2)


             if self.k_history is not None:
@@ -222,7 +220,6 @@ class StreamingMultiheadAttention(nn.Module):

         p = self.dropout if self.training else 0
         if _efficient_attention_backend == 'torch':
-            # print(q.shape, k.shape, v.shape, q.sum(), k.sum(), v.sum(), 'CROSSopen')
             x = torch.nn.functional.scaled_dot_product_attention(
                 q, k, v, is_causal=False, dropout_p=p
             )
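The restored `q, k, v = packed.unbind(dim=2)` completes the packed in-projection path: a single linear layer emits query, key and value stacked on one axis, einops moves that axis to position 2, and `unbind` splits it before `scaled_dot_product_attention`. A self-contained sketch of the same pattern (the layout `"b h p t d"` is assumed from the torch-backend branch; function and variable names are illustrative):

```python
import torch
from einops import rearrange

def packed_self_attention(projected, num_heads, dropout_p=0.0):
    # projected: [batch, time, 3 * num_heads * head_dim], output of one in_proj Linear.
    packed = rearrange(projected, "b t (p h d) -> b h p t d", p=3, h=num_heads)
    q, k, v = packed.unbind(dim=2)  # each [batch, heads, time, head_dim]
    return torch.nn.functional.scaled_dot_product_attention(
        q, k, v, is_causal=False, dropout_p=dropout_p)

x = torch.randn(2, 10, 3 * 8 * 64)           # batch=2, time=10, heads=8, head_dim=64
out = packed_self_attention(x, num_heads=8)  # [2, 8, 10, 64]
```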
models.py
CHANGED
@@ -511,7 +511,11 @@ def load_ASR_models(ASR_MODEL_PATH, ASR_MODEL_CONFIG):

 def _load_model(model_config, model_path):
     model = ASRCNN(**model_config)
-    params = torch.load(
+    params = torch.load(
+        model_path,
+        map_location='cpu',
+        weights_only=False
+    )['model']
     model.load_state_dict(params)
     return model

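For context on the `_load_model` change: recent PyTorch releases (2.6+) default `torch.load` to `weights_only=True`, which rejects checkpoints that pickle anything beyond plain tensors, so the flag is now passed explicitly, and `map_location='cpu'` keeps loading independent of which device the checkpoint was saved on. A minimal sketch of the same loading pattern (the helper name is illustrative):

```python
import torch

def load_params(model, model_path):
    # weights_only=False: the checkpoint stores a full dict, not just tensors.
    # map_location='cpu': deserialize on CPU first, move to GPU later if needed.
    params = torch.load(model_path, map_location='cpu', weights_only=False)['model']
    model.load_state_dict(params)
    return model
```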
requirements.txt
ADDED
@@ -0,0 +1,19 @@
+torch
+torchaudio
+numpy
+audiofile
+audresample
+cached_path
+einops
+flask
+librosa
+moviepy
+sentencepiece
+omegaconf
+opencv-python
+soundfile
+transformers
+munch
+srt
+nltk
+phonemizer