fixie-ai
/

ultravox-v0_3

@@ -2,6 +2,7 @@ import logging
 import re
 from typing import Any, Dict, Generator, Optional, Set, Tuple, TypeVar, Union
 import peft
 import torch
 import torch.nn as nn
@@ -19,6 +20,15 @@ from .ultravox_config import LossConfig
 from .ultravox_config import LossFunction
 from .ultravox_config import UltravoxConfig
 class UltravoxModel(transformers.LlamaPreTrainedModel, GenerationMixin):
     """
@@ -69,44 +79,29 @@ class UltravoxModel(transformers.LlamaPreTrainedModel, GenerationMixin):
         self.loss_config = LossConfig()
         self.post_init()
     @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
-        model = super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
-        model._load_child_model_weights(*args, **kwargs)
         return model
-    def _load_child_model_weights(self, *args, **kwargs) -> "UltravoxModel":
-        if "torch_dtype" in kwargs:
-            self.config.torch_dtype = kwargs.pop("torch_dtype")
-        kwargs.pop("config", None)
-        if (
-            self.config.text_model_id is not None
-            and self.language_model.device.type == "meta"
-        ):
-            # Load the language model weights
-            self.language_model = transformers.AutoModelForCausalLM.from_pretrained(
-                self.config.text_model_id,
-                torch_dtype=self.config.torch_dtype,
-                *args,
-                **kwargs,
-            )
-        if (
-            self.config.audio_model_id is not None
-            and self.audio_tower.device.type == "meta"
-        ):
-            # Load the audio tower weights
-            self.audio_tower = transformers.AutoModel.from_pretrained(
-                self.config.audio_model_id,
-                torch_dtype=self.config.torch_dtype,
-                *args,
-                **kwargs,
-            )
-        return self
     def get_input_embeddings(self):
         return self.language_model.get_input_embeddings()
@@ -153,21 +148,29 @@ class UltravoxModel(transformers.LlamaPreTrainedModel, GenerationMixin):
         self.vocab_size = model_embeds.num_embeddings
         return model_embeds
-    def _get_prediction_mask(self, labels: Optional[torch.Tensor]) -> torch.Tensor:
-        """Get a boolean mask for positions where we want to compute KL divergence.
         For each label position, we want the position before it since that's where
         the model makes the prediction for that label.
         Args:
             labels: Tensor of shape (B, T) where B is batch size and T is sequence length,
                    with -100 for masked positions and token ids for label positions
         Returns:
-            Boolean tensor of shape (B, T) that's True for positions where we want to compute KL divergence
         """
         if labels is None:
             raise ValueError("labels must be provided")
         # Shift the label mask left by 1 along the sequence dimension, so that
         # position t is marked when position t + 1 holds a label (t predicts t + 1)
         label_mask = labels != -100
@@ -175,7 +178,19 @@ class UltravoxModel(transformers.LlamaPreTrainedModel, GenerationMixin):
         pred_mask[:, :-1] = label_mask[
             :, 1:
         ]  # shift left by 1 along sequence dimension
-        return pred_mask
     def _compute_kl_loss(
         self,
@@ -198,21 +213,38 @@ class UltravoxModel(transformers.LlamaPreTrainedModel, GenerationMixin):
                 past_key_values=past_key_values,
                 **kwargs,
             )
-        # compute the KL divergence loss between the two models
         kl_loss = F.kl_div(
             F.log_softmax(
-                lm_output.logits[self._get_prediction_mask(labels)]
-                / self.loss_config.kl_temperature,
                 dim=-1,
             ),
             F.softmax(
-                alt_lm_output.logits[self._get_prediction_mask(alt_labels)]
-                / self.loss_config.kl_temperature,
                 dim=-1,
             ),
             reduction="batchmean",
         )
-        return {"loss": kl_loss}
     def _audio_iter(
         self, audio_batch_size: torch.Tensor
@@ -380,18 +412,27 @@ class UltravoxModel(transformers.LlamaPreTrainedModel, GenerationMixin):
         cls, config: UltravoxConfig
     ) -> "UltravoxProjector":
         projector = UltravoxProjector(config)
-        projector.to(config.torch_dtype)
         return projector
     @classmethod
     def _create_audio_tower(
         cls, config: UltravoxConfig
     ) -> Union[transformers.Wav2Vec2Model, "ModifiedWhisperEncoder"]:
-        with transformers.modeling_utils.no_init_weights():
-            # we only ever use from_config if the weights are retrained, hence initializing is not
-            # required. This makes the model quite creation faster since init on CPU is quite slow.
-            if "whisper" in config.audio_config._name_or_path.lower():
-                audio_tower = ModifiedWhisperEncoder(config.audio_config)
                 audio_tower.init_latency_mask(
                     config.audio_latency_block_size, dtype=config.torch_dtype
                 )
@@ -400,7 +441,27 @@ class UltravoxModel(transformers.LlamaPreTrainedModel, GenerationMixin):
                     None,
                     0,
                 ), "only whisper audio tower supports audio latency masking, got non-zero value for 'audio_latency_block_size'"
-                audio_tower = transformers.AutoModel.from_config(config.audio_config)
         if isinstance(
             audio_tower,
@@ -418,14 +479,27 @@ class UltravoxModel(transformers.LlamaPreTrainedModel, GenerationMixin):
     def _create_language_model(
         cls, config: UltravoxConfig
     ) -> transformers.LlamaForCausalLM:
-        with transformers.modeling_utils.no_init_weights():
-            # we only ever use from_config if the weights are retrained, hence initializing is not
-            # required. This makes the model quite creation faster since init on CPU is quite slow.
-            language_model = transformers.AutoModelForCausalLM.from_config(
-                config.text_config,
-                attn_implementation=config.text_config._attn_implementation,
-                torch_dtype=config.torch_dtype,
             )
         language_model = apply_lora(language_model, config.text_model_lora_config)
         return language_model
@@ -525,6 +599,39 @@ class UltravoxModel(transformers.LlamaPreTrainedModel, GenerationMixin):
         )
 # TODO: refactor common parts to a shared module
 def is_cache_empty(
     past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]],

 import re
 from typing import Any, Dict, Generator, Optional, Set, Tuple, TypeVar, Union
+import accelerate
 import peft
 import torch
 import torch.nn as nn
 from .ultravox_config import LossFunction
 from .ultravox_config import UltravoxConfig
+FROM_PRETRAINED_KWARGS = {}
+SHARED_PRETRAINED_KWARGS = [
+    "tp_plan",
+    "device_map",
+    "torch_dtype",
+    "attn_implementation",
+    "use_flash_attention_2",
+]
 class UltravoxModel(transformers.LlamaPreTrainedModel, GenerationMixin):
     """
         self.loss_config = LossConfig()
         self.post_init()
+    def _init_weights(self, module):
+        """Initialize `module` while leaving the two wrapped sub-models alone.
+        When called on the top-level model itself, the language model and the
+        audio tower are re-created through `_create_language_model` and
+        `_create_audio_tower`, each only if the matching model id is set in the
+        config. Modules that belong to either sub-model are skipped; any other
+        module is handed to the parent class' `_init_weights`.
+        """
+        if module is not self:
+            in_language_model = module in self.language_model.modules()
+            if in_language_model or module in self.audio_tower.modules():
+                # sub-model modules are deliberately not re-initialized here
+                return
+            super()._init_weights(module)
+            return
+        config = self.config
+        if config.text_model_id is not None:
+            self.language_model = self._create_language_model(config)
+        if config.audio_model_id is not None:
+            self.audio_tower = self._create_audio_tower(config)
     @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        """Load the model, sharing selected loading kwargs with sub-model loading.
+        The keyword arguments named in `SHARED_PRETRAINED_KWARGS` are copied into
+        the module-level `FROM_PRETRAINED_KWARGS` for the duration of the call
+        (`_create_language_model` merges that global into its own
+        `from_pretrained` call), and the global is cleared again afterwards.
+        Args:
+            *args: Positional arguments forwarded unchanged to the parent
+                `from_pretrained`.
+            **kwargs: Keyword arguments forwarded unchanged to the parent
+                `from_pretrained`.
+        Returns:
+            The model returned by the parent `from_pretrained`.
+        """
+        global FROM_PRETRAINED_KWARGS
+        FROM_PRETRAINED_KWARGS = {
+            k: v for k, v in kwargs.items() if k in SHARED_PRETRAINED_KWARGS
+        }
+        try:
+            model = super().from_pretrained(*args, **kwargs)
+        finally:
+            # Always clear the shared kwargs, even when loading raises, so a
+            # failed load cannot leak stale settings into a later load.
+            FROM_PRETRAINED_KWARGS = {}
         return model
     def get_input_embeddings(self):
         return self.language_model.get_input_embeddings()
         self.vocab_size = model_embeds.num_embeddings
         return model_embeds
+    def _get_prediction_mask(
+        self, labels: Optional[torch.Tensor]
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Get boolean masks for positions where we want to compute KL divergence.
         For each label position, we want the position before it since that's where
         the model makes the prediction for that label.
+        Additionally, we want to identify the position right before the EOT token
+        (the last token with label != -100).
         Args:
             labels: Tensor of shape (B, T) where B is batch size and T is sequence length,
                    with -100 for masked positions and token ids for label positions
         Returns:
+            Tuple containing:
+            - pred_mask: Boolean tensor of shape (B, T) that's True for positions where we want to compute KL divergence
+            - eot_mask: Boolean tensor of shape (B, T) that's True only for the last prediction position in each sequence
         """
         if labels is None:
             raise ValueError("labels must be provided")
         # Shift the label mask left by 1 along the sequence dimension, so that
         # position t is marked when position t + 1 holds a label (t predicts t + 1)
         label_mask = labels != -100
         pred_mask[:, :-1] = label_mask[
             :, 1:
         ]  # shift left by 1 along sequence dimension
+        # Create EOT mask - identify only the last prediction position in each sequence
+        eot_mask = torch.zeros_like(pred_mask)
+        batch_size = labels.shape[0]
+        for i in range(batch_size):
+            # Find positions where we make predictions
+            pred_positions = torch.where(pred_mask[i])[0]
+            if len(pred_positions) > 0:
+                # Only mark the last prediction position
+                eot_mask[i, pred_positions[-1]] = True
+        return pred_mask, eot_mask
     def _compute_kl_loss(
         self,
                 past_key_values=past_key_values,
                 **kwargs,
             )
+        # Get prediction masks for regular tokens and EOT tokens
+        pred_mask, eot_mask = self._get_prediction_mask(labels)
+        alt_pred_mask, alt_eot_mask = self._get_prediction_mask(alt_labels)
+        # compute the KL divergence loss between the two models for regular tokens
         kl_loss = F.kl_div(
             F.log_softmax(
+                lm_output.logits[pred_mask] / self.loss_config.kl_temperature,
+                dim=-1,
+            ),
+            F.softmax(
+                alt_lm_output.logits[alt_pred_mask] / self.loss_config.kl_temperature,
+                dim=-1,
+            ),
+            reduction="batchmean",
+        )
+        # Compute the KL divergence loss at the EOT prediction positions (the last prediction position of each sequence)
+        eot_loss = F.kl_div(
+            F.log_softmax(
+                lm_output.logits[eot_mask] / self.loss_config.kl_temperature,
                 dim=-1,
             ),
             F.softmax(
+                alt_lm_output.logits[alt_eot_mask] / self.loss_config.kl_temperature,
                 dim=-1,
             ),
             reduction="batchmean",
         )
+        return {"loss": kl_loss + self.loss_config.eot_loss_weight * eot_loss}
     def _audio_iter(
         self, audio_batch_size: torch.Tensor
         cls, config: UltravoxConfig
     ) -> "UltravoxProjector":
         projector = UltravoxProjector(config)
+        dtype = config.torch_dtype
+        if isinstance(dtype, str):
+            dtype = getattr(torch, dtype)
+        projector.to(dtype)
         return projector
     @classmethod
     def _create_audio_tower(
         cls, config: UltravoxConfig
     ) -> Union[transformers.Wav2Vec2Model, "ModifiedWhisperEncoder"]:
+        # We probably don't want to pass tp_plan or device_map to the audio tower
+        # But potentially other kwargs can be passed in. TODO
+        kwargs = {"torch_dtype": config.torch_dtype}
+        if (
+            transformers.modeling_utils._init_weights
+            and config.audio_model_id is not None
+        ):
+            if "whisper" in config.audio_model_id.lower():
+                audio_tower = ModifiedWhisperEncoder.from_pretrained(
+                    config.audio_model_id, **kwargs
+                )
                 audio_tower.init_latency_mask(
                     config.audio_latency_block_size, dtype=config.torch_dtype
                 )
                     None,
                     0,
                 ), "only whisper audio tower supports audio latency masking, got non-zero value for 'audio_latency_block_size'"
+                audio_tower = transformers.AutoModel.from_pretrained(
+                    config.audio_model_id, **kwargs
+                )
+        else:
+            with accelerate.init_empty_weights():
+                if "whisper" in config.audio_config._name_or_path.lower():
+                    audio_tower = ModifiedWhisperEncoder(config.audio_config)
+                    audio_tower.init_latency_mask(
+                        config.audio_latency_block_size,
+                        dtype=config.torch_dtype,
+                    )
+                else:
+                    assert config.audio_latency_block_size in (
+                        None,
+                        0,
+                    ), "only whisper audio tower supports audio latency masking, got non-zero value for 'audio_latency_block_size'"
+                    # we only ever use from_config if the weights are retrained, hence initializing is not
+                    # required. This makes model creation quite a bit faster since init on CPU is quite slow.
+                    audio_tower = transformers.AutoModel.from_config(
+                        config.audio_config, **kwargs
+                    )
         if isinstance(
             audio_tower,
     def _create_language_model(
         cls, config: UltravoxConfig
     ) -> transformers.LlamaForCausalLM:
+        """Build the text model described by `config` and apply its LoRA config.
+        If `transformers.modeling_utils._init_weights` is truthy and
+        `config.text_model_id` is set, the model is loaded with `from_pretrained`;
+        entries of the module-level `FROM_PRETRAINED_KWARGS` are merged last and
+        therefore override the defaults derived from `config`. Otherwise the model
+        is built with `from_config` inside `accelerate.init_empty_weights()`.
+        Args:
+            config: Ultravox configuration providing `text_model_id`,
+                `text_config`, `torch_dtype` and `text_model_lora_config`.
+        Returns:
+            The result of `apply_lora` on the created language model.
+        """
+        base_kwargs = {
+            "attn_implementation": config.text_config._attn_implementation,
+            "torch_dtype": config.torch_dtype,
+        }
+        load_pretrained = (
+            transformers.modeling_utils._init_weights
+            and config.text_model_id is not None
+        )
+        if load_pretrained:
+            language_model = transformers.AutoModelForCausalLM.from_pretrained(
+                config.text_model_id,
+                **{**base_kwargs, **FROM_PRETRAINED_KWARGS},
+            )
+        else:
+            # from_config is only used when the weights get loaded afterwards, so
+            # initializing them is not required; skipping it makes model creation
+            # much faster, since init on CPU is quite slow.
+            with accelerate.init_empty_weights():
+                language_model = transformers.AutoModelForCausalLM.from_config(
+                    config.text_config, **base_kwargs
+                )
+        return apply_lora(language_model, config.text_model_lora_config)
         )
+def get_checkpoint_files(
+    model_id: str,
+) -> tuple[list[str], dict | None, list[str]]:
+    """Resolve the safetensors checkpoint file(s) of `model_id` and list their keys.
+    A single-file checkpoint (`SAFE_WEIGHTS_NAME`) is tried first; if it cannot
+    be resolved, the sharded index (`SAFE_WEIGHTS_INDEX_NAME`) is used instead.
+    Args:
+        model_id: Model identifier or path passed to `transformers.utils.cached_file`.
+    Returns:
+        A tuple `(files, sharded_metadata, keys)`: the resolved checkpoint
+        file(s) as a list, the shard metadata (`None` for a single-file
+        checkpoint), and the parameter names found in the checkpoint.
+    """
+    single_file = transformers.utils.cached_file(
+        model_id,
+        transformers.utils.SAFE_WEIGHTS_NAME,
+        _raise_exceptions_for_missing_entries=False,
+    )
+    if single_file is None:
+        # sharded: resolve the index file, then the shard files it refers to
+        index_file = transformers.utils.cached_file(
+            model_id, transformers.utils.SAFE_WEIGHTS_INDEX_NAME
+        )
+        checkpoint_files, sharded_metadata = (
+            transformers.modeling_utils.get_checkpoint_shard_files(
+                model_id,
+                index_file,
+            )
+        )
+        checkpoint_keys = sharded_metadata["all_checkpoint_keys"]
+    else:
+        # not sharded: the keys are read from the loaded state dict
+        checkpoint_files = single_file
+        sharded_metadata = None
+        checkpoint_keys = list(
+            transformers.modeling_utils.load_state_dict(single_file).keys()
+        )
+    if isinstance(checkpoint_files, str):
+        checkpoint_files = [checkpoint_files]
+    return checkpoint_files, sharded_metadata, checkpoint_keys
 # TODO: refactor common parts to a shared module
 def is_cache_empty(
     past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]],