Upload UltravoxPipeline
- README.md +9 -9
- config.json +2 -2
- generation_config.json +1 -1
- tokenizer.json +2 -2
- ultravox_config.py +1 -3
- ultravox_model.py +32 -24
- ultravox_processing.py +6 -1
README.md
CHANGED
@@ -1,4 +1,12 @@
 ---
+datasets:
+- fixie-ai/librispeech_asr
+- fixie-ai/common_voice_17_0
+- fixie-ai/peoples_speech
+- fixie-ai/gigaspeech
+- fixie-ai/multilingual_librispeech
+- fixie-ai/wenetspeech
+- fixie-ai/covost2
 language:
 - ar
 - de
@@ -15,16 +23,8 @@ language:
 - tr
 - uk
 - zh
-license: mit
 library_name: transformers
-datasets:
-- fixie-ai/librispeech_asr
-- fixie-ai/common_voice_17_0
-- fixie-ai/peoples_speech
-- fixie-ai/gigaspeech
-- fixie-ai/multilingual_librispeech
-- fixie-ai/wenetspeech
-- fixie-ai/covost2
+license: mit
 metrics:
 - bleu
 ---
config.json
CHANGED
@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "/…",
+  "_name_or_path": "/Users/zhuang/expts/2024-10-09-v0_4_1/stacking-4b/ultravox/artifacts/model-zhuang.2024-10-09-v0_4_1.stacking-4b.8c44a2e:v8",
   "architectures": [
     "UltravoxModel"
   ],
@@ -28,6 +28,6 @@
   "stack_factor": 8,
   "text_model_id": "meta-llama/Meta-Llama-3.1-8B-Instruct",
   "torch_dtype": "bfloat16",
-  "transformers_version": "4.…",
+  "transformers_version": "4.44.0",
   "vocab_size": 128256
 }
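Since the commit is titled "Upload UltravoxPipeline", here is a minimal sketch of how a checkpoint described by this config is typically loaded. The repo id below is an illustrative assumption, not taken from this commit; Ultravox ships custom code, so `trust_remote_code=True` is required:

```python
# Sketch only: the repo id is an assumed placeholder for an Ultravox checkpoint.
import torch
import transformers

pipe = transformers.pipeline(
    model="fixie-ai/ultravox-v0_4_1-llama-3_1-8b",  # assumption for illustration
    trust_remote_code=True,      # pulls in ultravox_model.py / ultravox_processing.py
    torch_dtype=torch.bfloat16,  # matches "torch_dtype" in config.json
)
```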
generation_config.json
CHANGED
@@ -7,5 +7,5 @@
     128009
   ],
   "pad_token_id": 128009,
-  "transformers_version": "4.…"
+  "transformers_version": "4.44.0"
 }
tokenizer.json
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:…
-size …
+oid sha256:79e3e522635f3171300913bb421464a87de6222182a0570b9b2ccba2a964b2b4
+size 9085657
ultravox_config.py
CHANGED
@@ -19,8 +19,6 @@ class LoraConfigSimplified:
     target_modules: Optional[List[str]] = dataclasses.field(
         default_factory=lambda: ["k_proj", "q_proj", "linear_k", "linear_q"]
     )
-    # A list of module names regex patterns to unfreeze. Only used if r == 0.
-    unfreeze_layers: Optional[List[str]] = None


 class LossFunction(str, Enum):
@@ -30,7 +28,7 @@ class LossFunction(str, Enum):

 @dataclasses.dataclass
 class LossConfig:
-    loss_function: LossFunction = LossFunction.…
+    loss_function: LossFunction = LossFunction.KL_Divergence
     kl_temperature: float = 2.0

     @property
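`LossConfig` now defaults to KL divergence, paired with `kl_temperature: 2.0`. For reference, a generic temperature-scaled distillation KL of the kind this config parameterizes looks like the sketch below; it is illustrative, not necessarily the exact implementation in the ultravox training code:

```python
import torch.nn.functional as F

def kl_loss(student_logits, teacher_logits, kl_temperature=2.0):
    # Soften both distributions with the temperature, then measure how far
    # the student's predictions drift from the teacher's.
    return F.kl_div(
        F.log_softmax(student_logits / kl_temperature, dim=-1),
        F.softmax(teacher_logits / kl_temperature, dim=-1),
        reduction="batchmean",
    )
```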
ultravox_model.py
CHANGED
@@ -1,5 +1,4 @@
 import logging
-import re
 from typing import Any, Dict, Optional, Set, Tuple, Union

 import peft
@@ -35,14 +34,8 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):

     config_class = UltravoxConfig
     config: UltravoxConfig  # for type hinting
-    # …
-    # …
-    # As such we have to tell is to ignore some keys that are not always in the model
-    _keys_to_ignore_on_load_unexpected = ["audio_tower.*", "language_model.*"]
-    # Usually we load encoder weights from a pretrained model, so we don't want to load the decoder weights
-    # Technically we never hit this issue because these keys are already removed from state_dict() however,
-    # but there's no harm in keeping it here for when we change that behavior.
-    _keys_to_ignore_on_load_missing = ["audio_tower.*"]
+    # Usually we load encoder and LLM weights from a pretrained model separately, so they are allowed to be missing
+    _keys_to_ignore_on_load_missing = ["audio_tower.*", "language_model.*"]

     def __init__(self, config: UltravoxConfig):
         super().__init__(config)
@@ -155,6 +148,7 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
         labels: Optional[torch.Tensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
         audio_token_start_idx: Optional[torch.Tensor] = None,
+        audio_len: Optional[torch.Tensor] = None,
         audio_token_len: Optional[torch.Tensor] = None,
         past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]] = None,
         # the alt_* fields are needed for KL divergence loss
@@ -196,7 +190,8 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):

         # B x A/3200 x D
         audio_tower_output = self.audio_tower.forward(
-            audio_values.to(self.audio_tower.dtype)
+            audio_values.to(self.audio_tower.dtype),
+            audio_len=audio_len,
         ).last_hidden_state
         audio_tower_output = audio_tower_output.to(inputs_embeds.dtype)

@@ -242,6 +237,7 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
         audio_values: Optional[torch.FloatTensor] = None,
         audio_token_start_idx: Optional[torch.Tensor] = None,
         audio_token_len: Optional[torch.Tensor] = None,
+        audio_len: Optional[torch.Tensor] = None,
         past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]] = None,
         attention_mask: Optional[torch.Tensor] = None,
         inputs_embeds: Optional[torch.Tensor] = None,
@@ -270,6 +266,7 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
                 audio_token_start_idx - prefill_start_idx
            )
            model_input["audio_token_len"] = audio_token_len
+           model_input["audio_len"] = audio_len

        return model_input
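The last four hunks above thread a new `audio_len` argument (the unpadded length of each audio clip, as computed in `ultravox_processing.py`) from the `forward()` and `generate()` inputs down into the audio tower. A hypothetical call sketch, with illustrative shapes and values only:

```python
# Hypothetical usage sketch; tensor shapes and values are illustrative only.
import torch

batch = {
    "input_ids": torch.tensor([[128000, 1, 2, 3]]),
    "audio_values": torch.randn(1, 80, 3000),   # padded log-mel features
    "audio_token_start_idx": torch.tensor([1]),
    "audio_token_len": torch.tensor([2]),
    "audio_len": torch.tensor([1234]),          # unpadded length, used for masking
}
# outputs = model(**batch)  # model: a loaded UltravoxModel instance
```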
@@ -373,6 +370,7 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):

     def push_to_hub(self, *args, **kwargs):
         self.merge_and_unload()
+        self.to(self.language_model.dtype)
         return super().push_to_hub(*args, **kwargs)

     def save_pretrained(
@@ -424,7 +422,6 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
     )


-# TODO: refactor common parts to a shared module
 def is_cache_empty(
     past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]]
 ) -> bool:
@@ -442,18 +439,12 @@ def apply_lora(model: torch.nn.Module, lora_config: dict) -> torch.nn.Module:
     """
     Applies LoRA finetuning to the model. If the `r` parameter is set to 0, the model is frozen instead.
     """
-    unfreeze_layers = lora_config.pop("unfreeze_layers", None)
     lora_config = peft.LoraConfig(**lora_config or {})

     if lora_config.r == 0:
-        # freeze the model entirely
-        for name, param in model.named_parameters():
-            if unfreeze_layers is None or not any(
-                re.match(layer, name) for layer in unfreeze_layers
-            ):
-                param.requires_grad = False
-            else:
-                logging.info(f"Unfreezing layer: {name} with #{param.numel()} params")
+        # freeze the model entirely
+        for param in model.parameters():
+            param.requires_grad = False
     else:
         model = peft.get_peft_model(model, lora_config)
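With the regex-based `unfreeze_layers` escape hatch removed (which is also why the now-unused `re` import was dropped at the top of the file), `r == 0` simply freezes every parameter. A toy usage sketch of `apply_lora` as reconstructed above, assuming the freezing mutates the module in place:

```python
# Toy usage sketch; the Sequential stands in for the real model.
import torch

toy = torch.nn.Sequential(torch.nn.Linear(8, 8))

apply_lora(toy, {"r": 0})  # r == 0: freeze everything, attach no adapters
assert all(not p.requires_grad for p in toy.parameters())

# r > 0 instead wraps the module with PEFT LoRA adapters on the target modules.
```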
@@ -521,7 +512,7 @@ class UltravoxProjector(nn.Sequential):
         return hidden_states


-class ModifiedWhisperEncoder(whisper.WhisperEncoder):
+class ModifiedWhisperEncoder(whisper.WhisperEncoder, transformers.modeling_utils.ModuleUtilsMixin):
     """
     Encoder portion of OpenAI's Whisper model.

@@ -540,7 +531,7 @@ class ModifiedWhisperEncoder(whisper.WhisperEncoder):
     def forward(
         self,
         input_features,
-        attention_mask=None,
+        audio_len=None,
         head_mask=None,
         output_attentions=None,
         output_hidden_states=None,
@@ -583,6 +574,23 @@ class ModifiedWhisperEncoder(whisper.WhisperEncoder):
         encoder_states = () if output_hidden_states else None
         all_attentions = () if output_attentions else None

+        attention_mask = None
+        if audio_len is not None:
+            audio_feature_len = self._get_feat_extract_output_lengths(audio_len)
+            batch_size = hidden_states.shape[0]
+            max_seq_len = hidden_states.shape[1]
+            attention_mask = (
+                torch.arange(max_seq_len, device=hidden_states.device)[None, :]
+                .expand(batch_size, -1)
+                .lt(audio_feature_len.view(batch_size, 1))
+            )
+            attention_mask = self.get_extended_attention_mask(
+                attention_mask,
+                None,
+                device=hidden_states.device,
+                dtype=hidden_states.dtype,
+            )
+
         # check if head_mask has a correct number of layers specified if desired
         if head_mask is not None:
             assert head_mask.size()[0] == (
@@ -606,14 +614,14 @@ class ModifiedWhisperEncoder(whisper.WhisperEncoder):
                 layer_outputs = self._gradient_checkpointing_func(
                     encoder_layer.__call__,
                     hidden_states,
-                    None,
+                    attention_mask,
                     (head_mask[idx] if head_mask is not None else None),
                     output_attentions,
                 )
             else:
                 layer_outputs = encoder_layer(
                     hidden_states,
-                    None,
+                    attention_mask,
                     layer_head_mask=(
                         head_mask[idx] if head_mask is not None else None
                     ),
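The new masking block is also why `ModuleUtilsMixin` was added as a base class: it provides `get_extended_attention_mask()`. A standalone sketch of the mask semantics, with toy lengths:

```python
# Standalone illustration of the boolean mask built above (toy numbers).
import torch

audio_feature_len = torch.tensor([3, 5])  # valid encoder frames per clip
batch_size, max_seq_len = 2, 6

bool_mask = (
    torch.arange(max_seq_len)[None, :]
    .expand(batch_size, -1)
    .lt(audio_feature_len.view(batch_size, 1))
)
# tensor([[ True,  True,  True, False, False, False],
#         [ True,  True,  True,  True,  True, False]])
#
# get_extended_attention_mask() then converts this into an additive float mask
# (0.0 on valid positions, a large negative value on padding) that broadcasts
# across attention heads inside each encoder layer.
```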
ultravox_processing.py
CHANGED
@@ -62,7 +62,7 @@ class UltravoxProcessor(transformers.ProcessorMixin):
         super().__init__(audio_processor=audio_processor, tokenizer=tokenizer)

     @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+    def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs):
         config: UltravoxConfig = transformers.AutoConfig.from_pretrained(
             pretrained_model_name_or_path, **kwargs
         )
@@ -154,12 +154,17 @@ class UltravoxProcessor(transformers.ProcessorMixin):
                 sampling_rate=sampling_rate,
                 padding="longest",
                 max_length=audio_len,
+                return_attention_mask=True,
                 **kwargs,
             )
             if "input_features" in x:
                 data["audio_values"] = x.input_features
             else:
                 data["audio_values"] = x.input_values
+            if self.audio_padding == "max_length":
+                data["audio_len"] = x.attention_mask.sum(-1) - 1
+            else:
+                data["audio_len"] = [data["audio_values"].shape[-1]]

         if text is not None:
             assert isinstance(
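A toy illustration of the new `audio_len` bookkeeping: with `max_length` padding, the feature extractor's attention mask marks the valid frames, so summing it recovers each clip's unpadded length (the `- 1` mirrors the code above):

```python
import torch

attention_mask = torch.tensor([[1, 1, 1, 0, 0],
                               [1, 1, 1, 1, 1]])
audio_len = attention_mask.sum(-1) - 1  # tensor([2, 4])
```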