2toINF committed on
Commit cb94537 · verified · 1 Parent(s): 84abbcf

Initial upload for X-VLA-Google-Robot

action_hub.py ADDED
@@ -0,0 +1,275 @@
1
+ # ------------------------------------------------------------------------------
2
+ # Copyright 2025 2toINF (https://github.com/2toINF)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # ------------------------------------------------------------------------------
16
+
17
+ from __future__ import annotations
18
+ from typing import Iterable, Tuple, Dict, Type
19
+ import torch
20
+ import torch.nn as nn
21
+
22
+ # =============================================================================
23
+ # Registry
24
+ # =============================================================================
25
+ ACTION_REGISTRY: Dict[str, Type["BaseActionSpace"]] = {}
26
+
27
+
28
+ def register_action(name: str):
29
+ """Decorator for registering a new action space."""
30
+ def _wrap(cls):
31
+ key = name.lower()
32
+ if key in ACTION_REGISTRY:
33
+ raise KeyError(f"ActionSpace '{key}' already registered -> {ACTION_REGISTRY[key]}")
34
+ ACTION_REGISTRY[key] = cls
35
+ cls.name = key
36
+ return cls
37
+ return _wrap
38
+
39
+
40
+ def build_action_space(name: str, **kwargs) -> "BaseActionSpace":
41
+ """Instantiate a registered action space by name."""
42
+ key = name.lower()
43
+ if key not in ACTION_REGISTRY:
44
+ raise KeyError(f"Unknown action space '{name}'. Available: {list(ACTION_REGISTRY.keys())}")
45
+ return ACTION_REGISTRY[key](**kwargs)
46
+
47
+
48
+ # =============================================================================
49
+ # Base class
50
+ # =============================================================================
51
+ class BaseActionSpace(nn.Module):
52
+ """
53
+ Abstract base class for all action-space definitions.
54
+
55
+ Each subclass defines:
56
+ - `dim_action`: dimension of the action vector.
57
+ - `gripper_idx`: indices of gripper channels.
58
+ - `compute_loss(pred, target)`: supervised loss for this space.
59
+ - `preprocess(proprio, action, mode)`: pre-step modifications.
60
+ - `postprocess(action)`: post-step corrections (e.g. apply sigmoid).
61
+ """
62
+
63
+ name: str = "base"
64
+ dim_action: int = 0
65
+ gripper_idx: Tuple[int, ...] = ()
66
+
67
+ def __init__(self):
68
+ super().__init__()
69
+
70
+ # ---------------------------------------------------------------------
71
+ # Core supervised loss
72
+ # ---------------------------------------------------------------------
73
+ def compute_loss(self, pred: torch.Tensor, target: torch.Tensor) -> Dict[str, torch.Tensor]:
74
+ raise NotImplementedError
75
+
76
+ def forward(self, pred: torch.Tensor, target: torch.Tensor) -> Dict[str, torch.Tensor]:
77
+ """Alias for compute_loss."""
78
+ return self.compute_loss(pred, target)
79
+
80
+ # ---------------------------------------------------------------------
81
+ # Space-level hooks
82
+ # ---------------------------------------------------------------------
83
+ def preprocess(
84
+ self,
85
+ proprio: torch.Tensor,
86
+ action: torch.Tensor,
87
+ mode: str = "train",
88
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
89
+ """Default: return unchanged."""
90
+ return proprio, action
91
+
92
+ def postprocess(self, action: torch.Tensor) -> torch.Tensor:
93
+ """Default: return unchanged."""
94
+ return action
95
+
96
+
97
+ # =============================================================================
98
+ # Utilities
99
+ # =============================================================================
100
+ def _ensure_indices_valid(D: int, idx: Iterable[int], name: str) -> None:
101
+ bad = [i for i in idx if i < 0 or i >= D]
102
+ if bad:
103
+ raise IndexError(f"{name} contains out-of-range indices {bad} for action dim D={D}")
104
+
105
+
106
+ # =============================================================================
107
+ # Implementations
108
+ # =============================================================================
109
+ @register_action("ee6d")
110
+ class EE6DActionSpace(BaseActionSpace):
111
+ """End-effector layout with xyz, 6D rotation, and gripper channels."""
112
+
113
+ dim_action = 20
114
+ gripper_idx = (9, 19)
115
+ GRIPPER_SCALE = 1.0
116
+ XYZ_SCALE = 500.0
117
+ ROT_SCALE = 10.0
118
+
119
+ POS_IDX_1 = (0, 1, 2)
120
+ POS_IDX_2 = (10, 11, 12)
121
+ ROT_IDX_1 = (3, 4, 5, 6, 7, 8)
122
+ ROT_IDX_2 = (13, 14, 15, 16, 17, 18)
123
+
124
+ def __init__(self):
125
+ super().__init__()
126
+ self.mse = nn.MSELoss()
127
+ self.bce = nn.BCEWithLogitsLoss()
128
+
129
+ def compute_loss(self, pred, target):
130
+ assert pred.shape == target.shape, "pred/target shapes must match"
131
+ B, T, D = pred.shape
132
+ _ensure_indices_valid(D, self.gripper_idx, "gripper_idx")
133
+
134
+ # Gripper BCE
135
+ g_losses = [self.bce(pred[:, :, gi], target[:, :, gi]) for gi in self.gripper_idx]
136
+ gripper_loss = sum(g_losses) / len(self.gripper_idx) * self.GRIPPER_SCALE
137
+
138
+ # XYZ position
139
+ pos_loss = (
140
+ self.mse(pred[:, :, self.POS_IDX_1], target[:, :, self.POS_IDX_1]) +
141
+ self.mse(pred[:, :, self.POS_IDX_2], target[:, :, self.POS_IDX_2])
142
+ ) * self.XYZ_SCALE
143
+
144
+ # Rotation 6D
145
+ rot_loss = (
146
+ self.mse(pred[:, :, self.ROT_IDX_1], target[:, :, self.ROT_IDX_1]) +
147
+ self.mse(pred[:, :, self.ROT_IDX_2], target[:, :, self.ROT_IDX_2])
148
+ ) * self.ROT_SCALE
149
+
150
+ return {
151
+ "position_loss": pos_loss,
152
+ "rotate6D_loss": rot_loss,
153
+ "gripper_loss": gripper_loss,
154
+ }
155
+
156
+ def preprocess(self, proprio, action, mode="train"):
157
+ """Zero-out gripper channels in proprio/action."""
158
+ proprio_m = proprio.clone()
159
+ action_m = action.clone()
160
+ proprio_m[..., self.gripper_idx] = 0.0
161
+ action_m[..., self.gripper_idx] = 0.0
162
+ return proprio_m, action_m
163
+
164
+ def postprocess(self, action: torch.Tensor) -> torch.Tensor:
165
+ """Apply sigmoid to gripper logits."""
166
+ if action.size(-1) > max(self.gripper_idx):
167
+ action[..., self.gripper_idx] = torch.sigmoid(action[..., self.gripper_idx])
168
+ return action
169
+
170
+
171
+ @register_action("joint")
172
+ class JointActionSpace(BaseActionSpace):
173
+ """Joint-space layout with joints + gripper only."""
174
+
175
+ dim_action = 14
176
+ gripper_idx = (6, 13)
177
+ GRIPPER_SCALE = 0.1
178
+ JOINTS_SCALE = 1.0
179
+
180
+ def __init__(self):
181
+ super().__init__()
182
+ self.mse = nn.MSELoss()
183
+ self.bce = nn.BCEWithLogitsLoss()
184
+
185
+ def compute_loss(self, pred, target):
186
+ assert pred.shape == target.shape
187
+ B, T, D = pred.shape
188
+ _ensure_indices_valid(D, self.gripper_idx, "gripper_idx")
189
+
190
+ g_losses = [self.bce(pred[:, :, gi], target[:, :, gi]) for gi in self.gripper_idx]
191
+ gripper_loss = sum(g_losses) / len(self.gripper_idx) * self.GRIPPER_SCALE
192
+
193
+ joints_idx = tuple(i for i in range(D) if i not in set(self.gripper_idx))
194
+ joints_loss = self.mse(pred[:, :, joints_idx], target[:, :, joints_idx]) * self.JOINTS_SCALE
195
+
196
+ return {
197
+ "joints_loss": joints_loss,
198
+ "gripper_loss": gripper_loss,
199
+ }
200
+
201
+ def preprocess(self, proprio, action, mode="train"):
202
+ """Zero-out gripper channels in proprio/action."""
203
+ proprio_m = proprio.clone()
204
+ action_m = action.clone()
205
+ proprio_m[..., self.gripper_idx] = 0.0
206
+ action_m[..., self.gripper_idx] = 0.0
207
+ return proprio_m, action_m
208
+
209
+ def postprocess(self, action: torch.Tensor) -> torch.Tensor:
210
+ """Apply sigmoid to gripper logits."""
211
+ if action.size(-1) > max(self.gripper_idx):
212
+ action[..., self.gripper_idx] = torch.sigmoid(action[..., self.gripper_idx])
213
+ return action
214
+
215
+
216
+ @register_action("agibot_ee6d")
217
+ class AGIBOTEE6DActionSpace(BaseActionSpace):
218
+ """AGI-bot variant of EE6DActionSpace using MSE for all components."""
219
+
220
+ dim_action = 20
221
+ gripper_idx = (9, 19)
222
+ GRIPPER_SCALE = 10.0
223
+ XYZ_SCALE = 500.0
224
+ ROT_SCALE = 10.0
225
+ POS_IDX_1 = (0, 1, 2)
226
+ POS_IDX_2 = (10, 11, 12)
227
+ ROT_IDX_1 = (3, 4, 5, 6, 7, 8)
228
+ ROT_IDX_2 = (13, 14, 15, 16, 17, 18)
229
+
230
+ def __init__(self):
231
+ super().__init__()
232
+ self.mse = nn.MSELoss()
233
+
234
+ def compute_loss(self, pred, target):
235
+ assert pred.shape == target.shape
236
+ B, T, D = pred.shape
237
+ _ensure_indices_valid(D, self.gripper_idx, "gripper_idx")
238
+
239
+ gripper_loss = self.mse(pred[:, :, self.gripper_idx], target[:, :, self.gripper_idx]) * self.GRIPPER_SCALE
240
+ pos_loss = (
241
+ self.mse(pred[:, :, self.POS_IDX_1], target[:, :, self.POS_IDX_1]) +
242
+ self.mse(pred[:, :, self.POS_IDX_2], target[:, :, self.POS_IDX_2])
243
+ ) * self.XYZ_SCALE
244
+ rot_loss = (
245
+ self.mse(pred[:, :, self.ROT_IDX_1], target[:, :, self.ROT_IDX_1]) +
246
+ self.mse(pred[:, :, self.ROT_IDX_2], target[:, :, self.ROT_IDX_2])
247
+ ) * self.ROT_SCALE
248
+
249
+ return {
250
+ "position_loss": pos_loss,
251
+ "rotate6D_loss": rot_loss,
252
+ "gripper_loss": gripper_loss,
253
+ }
254
+
255
+ def preprocess(self, proprio, action, mode="train"):
256
+ """No preprocessing applied in AGIBOT variant."""
257
+ return proprio, action
258
+
259
+ def postprocess(self, action: torch.Tensor) -> torch.Tensor:
260
+ """AGIBOT does not postprocess."""
261
+ return action
262
+
263
+
264
+ # =============================================================================
265
+ # Exports
266
+ # =============================================================================
267
+ __all__ = [
268
+ "BaseActionSpace",
269
+ "build_action_space",
270
+ "register_action",
271
+ "EE6DActionSpace",
272
+ "JointActionSpace",
273
+ "AGIBOTEE6DActionSpace",
274
+ "ACTION_REGISTRY",
275
+ ]
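A minimal usage sketch of the registry above, assuming `action_hub.py` is importable as a flat module named `action_hub` (the import path and the "toy" space are assumptions, not part of the repo): it registers a hypothetical 7-DoF layout and exercises the loss and `forward` alias.

```python
# Sketch only: the `action_hub` import path and the "toy" space are assumptions.
import torch
import torch.nn as nn
from action_hub import BaseActionSpace, register_action, build_action_space

@register_action("toy")
class ToyActionSpace(BaseActionSpace):
    """Hypothetical 7-DoF layout: 6 joints + 1 gripper logit."""
    dim_action = 7
    gripper_idx = (6,)

    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()
        self.bce = nn.BCEWithLogitsLoss()

    def compute_loss(self, pred, target):
        gripper_loss = self.bce(pred[..., 6], target[..., 6])
        joints_loss = self.mse(pred[..., :6], target[..., :6])
        return {"joints_loss": joints_loss, "gripper_loss": gripper_loss}

space = build_action_space("toy")
pred = torch.randn(2, 30, space.dim_action)   # [B, T, D]
target = torch.rand_like(pred)                # gripper targets in [0, 1] for BCE
losses = space(pred, target)                  # forward() aliases compute_loss()
print({k: float(v) for k, v in losses.items()})
```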
config.json ADDED
@@ -0,0 +1,78 @@
1
+ {
2
+ "_name_or_path": "xvla",
3
+ "model_type": "xvla",
4
+ "architectures": ["XVLA"],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_xvla.XVLAConfig",
7
+ "AutoModel": "modeling_xvla.XVLA"
8
+ },
9
+
10
+ "action_mode": "ee6d",
11
+ "use_proprio": true,
12
+ "num_actions": 30,
13
+
14
+ "hidden_size": 1024,
15
+ "depth": 24,
16
+ "num_heads": 16,
17
+ "mlp_ratio": 4.0,
18
+ "num_domains": 30,
19
+ "len_soft_prompts": 32,
20
+ "dim_time": 32,
21
+ "max_len_seq": 512,
22
+ "use_hetero_proj": false,
23
+ "soft_prompt_length": 32,
24
+
25
+ "florence_config": {
26
+ "model_type": "florence2",
27
+ "bos_token_id": 0,
28
+ "eos_token_id": 2,
29
+ "ignore_index": -100,
30
+ "pad_token_id": 1,
31
+ "projection_dim": 1024,
32
+
33
+ "text_config": {
34
+ "vocab_size": 51289,
35
+ "activation_dropout": 0.1,
36
+ "activation_function": "gelu",
37
+ "attention_dropout": 0.1,
38
+ "d_model": 1024,
39
+ "decoder_attention_heads": 16,
40
+ "decoder_layers": 12,
41
+ "encoder_attention_heads": 16,
42
+ "encoder_layers": 12,
43
+ "dropout": 0.1,
44
+ "max_position_embeddings": 4096,
45
+ "num_hidden_layers": 12,
46
+ "num_beams": 3
47
+ },
48
+
49
+ "vision_config": {
50
+ "model_type": "davit",
51
+ "drop_path_rate": 0.1,
52
+ "patch_size": [7, 3, 3, 3],
53
+ "patch_stride": [4, 2, 2, 2],
54
+ "patch_padding": [3, 1, 1, 1],
55
+ "patch_prenorm": [false, true, true, true],
56
+ "enable_checkpoint": false,
57
+ "dim_embed": [256, 512, 1024, 2048],
58
+ "num_heads": [8, 16, 32, 64],
59
+ "num_groups": [8, 16, 32, 64],
60
+ "depths": [1, 1, 9, 1],
61
+ "window_size": 12,
62
+ "projection_dim": 1024,
63
+ "visual_temporal_embedding": {
64
+ "type": "COSINE",
65
+ "max_temporal_embeddings": 100
66
+ },
67
+ "image_pos_embed": {
68
+ "type": "learned_abs_2d",
69
+ "max_pos_embeddings": 50
70
+ },
71
+ "image_feature_source": ["spatial_avg_pool", "temporal_avg_pool"]
72
+ },
73
+
74
+ "vocab_size": 51289,
75
+ "torch_dtype": "float16",
76
+ "is_encoder_decoder": true
77
+ }
78
+ }
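Because `auto_map` above points `AutoConfig`/`AutoModel` at the custom classes in this repo, the checkpoint can be loaded with `trust_remote_code=True`. A short sketch follows; the repo id is a placeholder assumption, substitute the actual Hub path.

```python
# Repo id below is a placeholder assumption.
from transformers import AutoConfig, AutoModel

repo = "2toINF/X-VLA-Google-Robot"
config = AutoConfig.from_pretrained(repo, trust_remote_code=True)
print(config.action_mode, config.num_actions, config.hidden_size)  # ee6d 30 1024

model = AutoModel.from_pretrained(repo, trust_remote_code=True)
```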
configuration_florence2.py ADDED
@@ -0,0 +1,340 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import warnings
15
+ """ Florence-2 configuration"""
16
+
17
+ from typing import Optional
18
+
19
+ from transformers import AutoConfig
20
+ from transformers.configuration_utils import PretrainedConfig
21
+ from transformers.utils import logging
22
+
23
+ logger = logging.get_logger(__name__)
24
+
25
+ class Florence2VisionConfig(PretrainedConfig):
26
+ r"""
27
+ This is the configuration class to store the configuration of a [`Florence2VisionModel`]. It is used to instantiate a Florence2VisionModel
28
+ according to the specified arguments, defining the model architecture. Instantiating a configuration with the
29
+ defaults will yield a similar configuration to that of the Florence2VisionModel architecture.
30
+
31
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
32
+ documentation from [`PretrainedConfig`] for more information.
33
+
34
+ Args:
35
+ drop_path_rate (`float`, *optional*, defaults to 0.1):
36
+ The dropout rate of the drop path layer.
37
+ patch_size (`List[int]`, *optional*, defaults to [7, 3, 3, 3]):
38
+ The patch size of the image.
39
+ patch_stride (`List[int]`, *optional*, defaults to [4, 2, 2, 2]):
40
+ The patch stride of the image.
41
+ patch_padding (`List[int]`, *optional*, defaults to [3, 1, 1, 1]):
42
+ The patch padding of the image.
43
+ patch_prenorm (`List[bool]`, *optional*, defaults to [false, true, true, true]):
44
+ Whether to apply layer normalization before the patch embedding layer.
45
+ enable_checkpoint (`bool`, *optional*, defaults to False):
46
+ Whether to enable checkpointing.
47
+ dim_embed (`List[int]`, *optional*, defaults to [256, 512, 1024, 2048]):
48
+ The dimension of the embedding layer.
49
+ num_heads (`List[int]`, *optional*, defaults to [8, 16, 32, 64]):
50
+ The number of attention heads.
51
+ num_groups (`List[int]`, *optional*, defaults to [8, 16, 32, 64]):
52
+ The number of groups.
53
+ depths (`List[int]`, *optional*, defaults to [1, 1, 9, 1]):
54
+ The depth of the model.
55
+ window_size (`int`, *optional*, defaults to 12):
56
+ The window size of the model.
57
+ projection_dim (`int`, *optional*, defaults to 1024):
58
+ The dimension of the projection layer.
59
+ visual_temporal_embedding (`dict`, *optional*):
60
+ The configuration of the visual temporal embedding.
61
+ image_pos_embed (`dict`, *optional*):
62
+ The configuration of the image position embedding.
63
+ image_feature_source (`List[str]`, *optional*, defaults to ["spatial_avg_pool", "temporal_avg_pool"]):
64
+ The source of the image feature.
65
+ Example:
66
+
67
+ ```python
68
+ >>> from transformers import Florence2VisionConfig, Florence2VisionModel
69
+
70
+ >>> # Initializing a Florence2 Vision style configuration
71
+ >>> configuration = Florence2VisionConfig()
72
+
73
+ >>> # Initializing a model (with random weights)
74
+ >>> model = Florence2VisionModel(configuration)
75
+
76
+ >>> # Accessing the model configuration
77
+ >>> configuration = model.config
78
+ ```"""
79
+
80
+ model_type = "davit"
81
+ keys_to_ignore_at_inference = ["past_key_values"]
82
+
83
+ def __init__(
84
+ self,
85
+ drop_path_rate=0.1,
86
+ patch_size=[7, 3, 3, 3],
87
+ patch_stride=[4, 2, 2, 2],
88
+ patch_padding=[3, 1, 1, 1],
89
+ patch_prenorm=[False, True, True, True],
90
+ enable_checkpoint=False,
91
+ dim_embed=[256, 512, 1024, 2048],
92
+ num_heads=[8, 16, 32, 64],
93
+ num_groups=[8, 16, 32, 64],
94
+ depths=[1, 1, 9, 1],
95
+ window_size=12,
96
+ projection_dim=1024,
97
+ visual_temporal_embedding=None,
98
+ image_pos_embed=None,
99
+ image_feature_source=["spatial_avg_pool", "temporal_avg_pool"],
100
+ **kwargs,
101
+ ):
102
+ self.drop_path_rate = drop_path_rate
103
+ self.patch_size = patch_size
104
+ self.patch_stride = patch_stride
105
+ self.patch_padding = patch_padding
106
+ self.patch_prenorm = patch_prenorm
107
+ self.enable_checkpoint = enable_checkpoint
108
+ self.dim_embed = dim_embed
109
+ self.num_heads = num_heads
110
+ self.num_groups = num_groups
111
+ self.depths = depths
112
+ self.window_size = window_size
113
+ self.projection_dim = projection_dim
114
+ self.visual_temporal_embedding = visual_temporal_embedding
115
+ self.image_pos_embed = image_pos_embed
116
+ self.image_feature_source = image_feature_source
117
+
118
+ super().__init__(**kwargs)
119
+
120
+
121
+
122
+ class Florence2LanguageConfig(PretrainedConfig):
123
+ r"""
124
+ This is the configuration class to store the configuration of a [`Florence2LanguagePreTrainedModel`]. It is used to instantiate a BART
125
+ model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
126
+ defaults will yield a similar configuration to that of the BART
127
+ [facebook/bart-large](https://huggingface.co/facebook/bart-large) architecture.
128
+
129
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
130
+ documentation from [`PretrainedConfig`] for more information.
131
+
132
+
133
+ Args:
134
+ vocab_size (`int`, *optional*, defaults to 51289):
135
+ Vocabulary size of the Florence2Language model. Defines the number of different tokens that can be represented by the
136
+ `input_ids` passed when calling [`Florence2LanguageModel`].
137
+ d_model (`int`, *optional*, defaults to 1024):
138
+ Dimensionality of the layers and the pooler layer.
139
+ encoder_layers (`int`, *optional*, defaults to 12):
140
+ Number of encoder layers.
141
+ decoder_layers (`int`, *optional*, defaults to 12):
142
+ Number of decoder layers.
143
+ encoder_attention_heads (`int`, *optional*, defaults to 16):
144
+ Number of attention heads for each attention layer in the Transformer encoder.
145
+ decoder_attention_heads (`int`, *optional*, defaults to 16):
146
+ Number of attention heads for each attention layer in the Transformer decoder.
147
+ decoder_ffn_dim (`int`, *optional*, defaults to 4096):
148
+ Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
149
+ encoder_ffn_dim (`int`, *optional*, defaults to 4096):
150
+ Dimensionality of the "intermediate" (often named feed-forward) layer in encoder.
151
+ activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
152
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
153
+ `"relu"`, `"silu"` and `"gelu_new"` are supported.
154
+ dropout (`float`, *optional*, defaults to 0.1):
155
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
156
+ attention_dropout (`float`, *optional*, defaults to 0.0):
157
+ The dropout ratio for the attention probabilities.
158
+ activation_dropout (`float`, *optional*, defaults to 0.0):
159
+ The dropout ratio for activations inside the fully connected layer.
160
+ classifier_dropout (`float`, *optional*, defaults to 0.0):
161
+ The dropout ratio for classifier.
162
+ max_position_embeddings (`int`, *optional*, defaults to 1024):
163
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
164
+ just in case (e.g., 512 or 1024 or 2048).
165
+ init_std (`float`, *optional*, defaults to 0.02):
166
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
167
+ encoder_layerdrop (`float`, *optional*, defaults to 0.0):
168
+ The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
169
+ for more details.
170
+ decoder_layerdrop (`float`, *optional*, defaults to 0.0):
171
+ The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
172
+ for more details.
173
+ scale_embedding (`bool`, *optional*, defaults to `False`):
174
+ Scale embeddings by dividing by sqrt(d_model).
175
+ use_cache (`bool`, *optional*, defaults to `True`):
176
+ Whether or not the model should return the last key/values attentions (not used by all models).
177
+ num_labels (`int`, *optional*, defaults to 3):
178
+ The number of labels to use in [`Florence2LanguageForSequenceClassification`].
179
+ forced_eos_token_id (`int`, *optional*, defaults to 2):
180
+ The id of the token to force as the last generated token when `max_length` is reached. Usually set to
181
+ `eos_token_id`.
182
+
183
+ Example:
184
+
185
+ ```python
186
+ >>> from transformers import Florence2LanguageConfig, Florence2LanguageModel
187
+
188
+ >>> # Initializing a Florence2 Language style configuration
189
+ >>> configuration = Florence2LanguageConfig()
190
+
191
+ >>> # Initializing a model (with random weights)
192
+ >>> model = Florence2LanguageModel(configuration)
193
+
194
+ >>> # Accessing the model configuration
195
+ >>> configuration = model.config
196
+ ```"""
197
+
198
+ model_type = "florence2_language"
199
+ keys_to_ignore_at_inference = ["past_key_values"]
200
+ attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
201
+
202
+ def __init__(
203
+ self,
204
+ vocab_size=51289,
205
+ max_position_embeddings=1024,
206
+ encoder_layers=12,
207
+ encoder_ffn_dim=4096,
208
+ encoder_attention_heads=16,
209
+ decoder_layers=12,
210
+ decoder_ffn_dim=4096,
211
+ decoder_attention_heads=16,
212
+ encoder_layerdrop=0.0,
213
+ decoder_layerdrop=0.0,
214
+ activation_function="gelu",
215
+ d_model=1024,
216
+ dropout=0.1,
217
+ attention_dropout=0.0,
218
+ activation_dropout=0.0,
219
+ init_std=0.02,
220
+ classifier_dropout=0.0,
221
+ scale_embedding=False,
222
+ use_cache=True,
223
+ num_labels=3,
224
+ pad_token_id=1,
225
+ bos_token_id=0,
226
+ eos_token_id=2,
227
+ is_encoder_decoder=True,
228
+ decoder_start_token_id=2,
229
+ forced_eos_token_id=2,
230
+ **kwargs,
231
+ ):
232
+ self.vocab_size = vocab_size
233
+ self.max_position_embeddings = max_position_embeddings
234
+ self.d_model = d_model
235
+ self.encoder_ffn_dim = encoder_ffn_dim
236
+ self.encoder_layers = encoder_layers
237
+ self.encoder_attention_heads = encoder_attention_heads
238
+ self.decoder_ffn_dim = decoder_ffn_dim
239
+ self.decoder_layers = decoder_layers
240
+ self.decoder_attention_heads = decoder_attention_heads
241
+ self.dropout = dropout
242
+ self.attention_dropout = attention_dropout
243
+ self.activation_dropout = activation_dropout
244
+ self.activation_function = activation_function
245
+ self.init_std = init_std
246
+ self.encoder_layerdrop = encoder_layerdrop
247
+ self.decoder_layerdrop = decoder_layerdrop
248
+ self.classifier_dropout = classifier_dropout
249
+ self.use_cache = use_cache
250
+ self.num_hidden_layers = encoder_layers
251
+ self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
252
+
253
+ super().__init__(
254
+ num_labels=num_labels,
255
+ pad_token_id=pad_token_id,
256
+ bos_token_id=bos_token_id,
257
+ eos_token_id=eos_token_id,
258
+ is_encoder_decoder=is_encoder_decoder,
259
+ decoder_start_token_id=decoder_start_token_id,
260
+ forced_eos_token_id=forced_eos_token_id,
261
+ **kwargs,
262
+ )
263
+
264
+ # ensure backward compatibility for BART CNN models
265
+ if self.forced_bos_token_id is None and kwargs.get("force_bos_token_to_be_generated", False):
266
+ self.forced_bos_token_id = self.bos_token_id
267
+ warnings.warn(
268
+ f"Please make sure the config includes `forced_bos_token_id={self.bos_token_id}` in future versions. "
269
+ "The config can simply be saved and uploaded again to be fixed."
270
+ )
271
+
272
+ class Florence2Config(PretrainedConfig):
273
+ r"""
274
+ This is the configuration class to store the configuration of a [`Florence2ForConditionalGeneration`]. It is used to instantiate an
275
+ Florence-2 model according to the specified arguments, defining the model architecture.
276
+
277
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
278
+ documentation from [`PretrainedConfig`] for more information.
279
+
280
+ Args:
281
+ vision_config (`Florence2VisionConfig`, *optional*):
282
+ Custom vision config or dict
283
+ text_config (`Union[AutoConfig, dict]`, *optional*):
284
+ The config object of the text backbone.
285
+ ignore_index (`int`, *optional*, defaults to -100):
286
+ The ignore index for the loss function.
287
+ vocab_size (`int`, *optional*, defaults to 51289):
288
+ Vocabulary size of the Florence2 model. Defines the number of different tokens that can be represented by the
289
+ `input_ids` passed when calling [`~Florence2ForConditionalGeneration`]
290
+ projection_dim (`int`, *optional*, defaults to 1024):
291
+ Dimension of the multimodal projection space.
292
+
293
+ Example:
294
+
295
+ ```python
296
+ >>> from transformers import Florence2ForConditionalGeneration, Florence2Config, CLIPVisionConfig, BartConfig
297
+
298
+ >>> # Initializing a clip-like vision config
299
+ >>> vision_config = CLIPVisionConfig()
300
+
301
+ >>> # Initializing a Bart config
302
+ >>> text_config = BartConfig()
303
+
304
+ >>> # Initializing a Florence-2 configuration
305
+ >>> configuration = Florence2Config(vision_config, text_config)
306
+
307
+ >>> # Initializing a model from the florence-2 configuration
308
+ >>> model = Florence2ForConditionalGeneration(configuration)
309
+
310
+ >>> # Accessing the model configuration
311
+ >>> configuration = model.config
312
+ ```"""
313
+
314
+ model_type = "florence2"
315
+ is_composition = False
316
+
317
+ def __init__(
318
+ self,
319
+ vision_config=None,
320
+ text_config=None,
321
+ ignore_index=-100,
322
+ vocab_size=51289,
323
+ projection_dim=1024,
324
+ **kwargs,
325
+ ):
326
+ self.ignore_index = ignore_index
327
+ self.vocab_size = vocab_size
328
+ self.projection_dim = projection_dim
329
+ if vision_config is not None:
330
+ vision_config = Florence2VisionConfig(**vision_config)
331
+ self.vision_config = vision_config
332
+ self.vocab_size = self.vocab_size
333
+
334
+ self.text_config = text_config
335
+ if text_config is not None:
336
+ self.text_config = Florence2LanguageConfig(**text_config)
337
+
338
+
339
+ super().__init__(**kwargs)
340
+
configuration_xvla.py ADDED
@@ -0,0 +1,95 @@
1
+ # ------------------------------------------------------------------------------
2
+ # Copyright 2025 2toINF (https://github.com/2toINF)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # ------------------------------------------------------------------------------
16
+
17
+ from .configuration_florence2 import Florence2Config
18
+ from transformers.configuration_utils import PretrainedConfig
19
+
20
+
21
+ class XVLAConfig(PretrainedConfig):
22
+ """
23
+ Configuration class for the **XVLA (Extended Vision-Language-Action)** model.
24
+
25
+ This configuration defines all submodules of XVLA in a single place:
26
+ - The visual-language backbone (Florence2)
27
+ - The temporal/action transformer
28
+ - The action/proprio setup
29
+ """
30
+
31
+ model_type = "xvla"
32
+
33
+ def __init__(
34
+ self,
35
+ # === Florence backbone ===
36
+ florence_config: dict | None = None,
37
+
38
+ # === Transformer head ===
39
+ hidden_size: int = 1024,
40
+ depth: int = 24,
41
+ num_heads: int = 16,
42
+ mlp_ratio: float = 4.0,
43
+ num_domains: int = 30,
44
+ len_soft_prompts: int = 32,
45
+ dim_time: int = 32,
46
+ max_len_seq: int = 512,
47
+ use_hetero_proj: bool = False,
48
+ soft_prompt_length: int = 32,
49
+
50
+ # === Action & proprio ===
51
+ num_actions: int = 30,
52
+ action_mode: str = "ee6d",
53
+ use_proprio: bool = True,
54
+
55
+ **kwargs,
56
+ ):
57
+ # Florence2 backbone configuration
58
+ if isinstance(florence_config, dict):
59
+ self.florence_config = Florence2Config(**florence_config)
60
+ elif isinstance(florence_config, Florence2Config):
61
+ self.florence_config = florence_config
62
+ else:
63
+ self.florence_config = Florence2Config()
64
+
65
+ # Transformer hyperparameters
66
+ self.hidden_size = hidden_size
67
+ self.depth = depth
68
+ self.num_heads = num_heads
69
+ self.mlp_ratio = mlp_ratio
70
+ self.num_domains = num_domains
71
+ self.len_soft_prompts = len_soft_prompts
72
+ self.dim_time = dim_time
73
+ self.max_len_seq = max_len_seq
74
+ self.use_hetero_proj = use_hetero_proj
75
+ self.soft_prompt_length = soft_prompt_length
76
+
77
+ # Action/proprioception settings
78
+ self.num_actions = num_actions
79
+ self.action_mode = action_mode
80
+ self.use_proprio = use_proprio
81
+
82
+ # Initialize base HF config attributes (e.g. name_or_path)
83
+ super().__init__(**kwargs)
84
+
85
+ # -------------------------------------------------------------------------
86
+ # Serialization helpers
87
+ # -------------------------------------------------------------------------
88
+ def to_dict(self):
89
+ """
90
+ Convert this configuration (and its Florence sub-config)
91
+ into a fully serializable dictionary for HF save/load.
92
+ """
93
+ output = super().to_dict()
94
+ output["florence_config"] = self.florence_config.to_dict()
95
+ return output
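A small round-trip sketch of how the nested Florence2 sub-config is handled, assuming these files are importable as a package (here called `xvla_model`, an assumed name): passing a dict for `florence_config` re-instantiates `Florence2Config`, and the `to_dict()` override flattens it back for saving.

```python
# Package name `xvla_model` is an assumption for illustration.
from xvla_model.configuration_xvla import XVLAConfig

cfg = XVLAConfig(action_mode="ee6d", num_actions=30, hidden_size=1024)
d = cfg.to_dict()
assert isinstance(d["florence_config"], dict)            # sub-config is serialized too

cfg2 = XVLAConfig(
    florence_config=d["florence_config"],                # dict -> Florence2Config again
    action_mode=d["action_mode"],
    num_actions=d["num_actions"],
)
print(cfg2.florence_config.projection_dim)               # 1024 by default
```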
model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ea279d74b9a5878da79f7dae949a1d8e92cead2cf0f58612f9d11e4ba89788e
3
+ size 3519068172
modeling_florence2.py ADDED
The diff for this file is too large to render. See raw diff
 
modeling_xvla.py ADDED
@@ -0,0 +1,287 @@
1
+ # ------------------------------------------------------------------------------
2
+ # Copyright 2025 2toINF (https://github.com/2toINF)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # ------------------------------------------------------------------------------
16
+
17
+ from __future__ import annotations
18
+
19
+ import logging
20
+ import traceback
21
+ from typing import Any, Dict
22
+
23
+ import numpy as np
24
+ import torch
25
+ from fastapi import FastAPI
26
+ from fastapi.responses import JSONResponse
27
+ from PIL import Image
28
+ import uvicorn
29
+ import json_numpy
30
+ import cv2
31
+
32
+ from transformers import PreTrainedModel
33
+ from .modeling_florence2 import Florence2ForConditionalGeneration
34
+ from .transformer import SoftPromptedTransformer
35
+ from .action_hub import build_action_space
36
+ from .configuration_xvla import XVLAConfig
37
+
38
+
39
+ class XVLA(PreTrainedModel):
40
+ """
41
+ XVLA: HuggingFace-compatible Vision-Language-Action policy.
42
+
43
+ Components:
44
+ • Florence2 encoder-only backbone (vision-language)
45
+ • SoftPromptedTransformer (temporal/action head)
46
+ • Action space (pre/post-processing + loss)
47
+ """
48
+ config_class = XVLAConfig
49
+ base_model_prefix = "xvla"
50
+ supports_gradient_checkpointing = True
51
+
52
+ def __init__(self, config: XVLAConfig, *args, **kwargs):
53
+ super().__init__(config, *args, **kwargs)
54
+
55
+ # Core settings
56
+ self.num_actions: int = config.num_actions
57
+ self.use_proprio: bool = config.use_proprio
58
+ self.action_mode: str = config.action_mode.lower()
59
+ # Action space (dimensions + hooks)
60
+ self.action_space = build_action_space(config.action_mode.lower())
61
+ dim_action = self.action_space.dim_action
62
+ dim_proprio = getattr(self.action_space, "dim_proprio", dim_action)
63
+
64
+ # Florence2 backbone (encoder only)
65
+ self.vlm = Florence2ForConditionalGeneration(config.florence_config)
66
+ if hasattr(self.vlm, "language_model"):
67
+ lm = self.vlm.language_model
68
+ if hasattr(lm, "model") and hasattr(lm.model, "decoder"):
69
+ del lm.model.decoder
70
+ if hasattr(lm, "lm_head"):
71
+ del lm.lm_head
72
+
73
+ projection_dim = getattr(self.vlm.config, "projection_dim", None)
74
+ if projection_dim is None:
75
+ raise ValueError("Florence2 config must provide `projection_dim` for multimodal fusion.")
76
+
77
+ # Temporal/action head
78
+ self.transformer = SoftPromptedTransformer(
79
+ hidden_size=config.hidden_size,
80
+ multi_modal_input_size=projection_dim,
81
+ depth=config.depth,
82
+ num_heads=config.num_heads,
83
+ mlp_ratio=config.mlp_ratio,
84
+ num_domains=config.num_domains,
85
+ dim_action=dim_action,
86
+ dim_propio=dim_proprio,
87
+ len_soft_prompts=config.len_soft_prompts,
88
+ dim_time=config.dim_time,
89
+ max_len_seq=config.max_len_seq,
90
+ use_hetero_proj=config.use_hetero_proj,
91
+ )
92
+
93
+ # Deferred FastAPI app
94
+ self.app: FastAPI | None = None
95
+
96
+ # ============================= Florence2 encoder =============================
97
+ def forward_vlm(
98
+ self,
99
+ input_ids: torch.LongTensor, # [B, L]
100
+ pixel_values: torch.FloatTensor, # [B, V, C, H, W]
101
+ image_mask: torch.Tensor, # [B, V] (bool or 0/1)
102
+ ) -> Dict[str, torch.Tensor]:
103
+ """
104
+ Encode text + multi-view images via Florence2 encoder.
105
+
106
+ Returns:
107
+ { "vlm_features": [B, T_enc, D], "aux_visual_inputs": [B, (V-1)*N, D] }
108
+ """
109
+ B, V = pixel_values.shape[:2]
110
+ flat_mask = image_mask.view(-1).to(torch.bool) # [B*V]
111
+ flat_images = pixel_values.flatten(0, 1) # [B*V, C, H, W]
112
+
113
+ num_valid = int(flat_mask.sum().item())
114
+ if num_valid == 0:
115
+ raise ValueError("At least one image view must be valid per batch.")
116
+
117
+ valid_images = flat_images[flat_mask] # [#valid, C, H, W]
118
+ valid_feats = self.vlm._encode_image(valid_images) # [#valid, N, D]
119
+ N, D = valid_feats.shape[1:]
120
+
121
+ image_features = valid_feats.new_zeros((B * V, N, D))
122
+ image_features[flat_mask] = valid_feats
123
+ image_features = image_features.view(B, V, N, D) # [B, V, N, D]
124
+
125
+ inputs_embeds = self.vlm.get_input_embeddings()(input_ids) # [B, L, D]
126
+
127
+ merged_embeds, attention_mask = self.vlm._merge_input_ids_with_image_features(
128
+ image_features[:, 0], # first view: [B, N, D]
129
+ inputs_embeds, # [B, L, D]
130
+ )
131
+
132
+ enc_out = self.vlm.language_model.model.encoder(
133
+ attention_mask=attention_mask,
134
+ inputs_embeds=merged_embeds,
135
+ )[0] # [B, T_enc, D]
136
+
137
+ aux_visual_inputs = image_features[:, 1:].reshape(B, -1, D) # remaining views flattened
138
+ return {"vlm_features": enc_out, "aux_visual_inputs": aux_visual_inputs}
139
+
140
+ # ================================= training =================================
141
+ def forward(
142
+ self,
143
+ input_ids: torch.LongTensor,
144
+ image_input: torch.FloatTensor,
145
+ image_mask: torch.Tensor,
146
+ domain_id: torch.LongTensor,
147
+ proprio: torch.Tensor,
148
+ action: torch.Tensor, # [B, T=num_actions, D=dim_action]
149
+ ) -> Dict[str, torch.Tensor]:
150
+ """
151
+ 1) Encode multimodal inputs.
152
+ 2) Diffusion-style noisy mixture of actions: x_t = t*noise + (1-t)*gt.
153
+ 3) Space-specific preprocessing, prediction, and supervised loss.
154
+ """
155
+ enc = self.forward_vlm(input_ids, image_input, image_mask)
156
+
157
+ B = input_ids.shape[0]
158
+ t = (torch.rand(1, device=input_ids.device)
159
+ + torch.arange(B, device=input_ids.device) / B) % (1 - 1e-5)
160
+
161
+ action_noisy = torch.randn_like(action) * t.view(-1, 1, 1) + action * (1 - t).view(-1, 1, 1)
162
+ proprio_m, action_noisy_m = self.action_space.preprocess(proprio, action_noisy)
163
+
164
+ pred_action = self.transformer(
165
+ domain_id=domain_id,
166
+ action_with_noise=action_noisy_m,
167
+ t=t,
168
+ proprio=proprio_m,
169
+ **enc,
170
+ )
171
+ return self.action_space.compute_loss(pred_action, action)
172
+
173
+ # ================================= inference =================================
174
+ @torch.no_grad()
175
+ def generate_actions(
176
+ self,
177
+ input_ids: torch.LongTensor,
178
+ image_input: torch.FloatTensor,
179
+ image_mask: torch.Tensor,
180
+ domain_id: torch.LongTensor,
181
+ proprio: torch.Tensor,
182
+ steps: int = 10,
183
+ ) -> torch.Tensor:
184
+ """
185
+ Iterative denoising (linear schedule).
186
+ Applies action_space.postprocess at the end (e.g., sigmoid on gripper).
187
+ """
188
+ self.eval()
189
+ enc = self.forward_vlm(input_ids, image_input, image_mask)
190
+
191
+ B = input_ids.shape[0]
192
+ D = self.action_space.dim_action
193
+
194
+ x1 = torch.randn(B, self.num_actions, D, device=proprio.device, dtype=proprio.dtype)
195
+ action = torch.zeros_like(x1)
196
+
197
+ steps = max(1, int(steps))
198
+ for i in range(steps, 0, -1):
199
+ t = torch.full((B,), i / steps, device=proprio.device, dtype=proprio.dtype)
200
+ x_t = x1 * t.view(-1, 1, 1) + action * (1 - t).view(-1, 1, 1)
201
+ proprio_m, x_t_m = self.action_space.preprocess(proprio, x_t)
202
+ action = self.transformer(
203
+ domain_id=domain_id,
204
+ action_with_noise=x_t_m,
205
+ proprio=proprio_m,
206
+ t=t,
207
+ **enc,
208
+ )
209
+ return self.action_space.postprocess(action)
210
+
211
+ # =============================== FastAPI service =============================
212
+ def _build_app(self, processor):
213
+ """
214
+ Minimal FastAPI app for XVLA inference.
215
+
216
+ Args:
217
+ processor: callable(images, text) -> Dict[str, torch.Tensor]
218
+ expected keys: "input_ids", "image_input", "image_mask"
219
+ """
220
+ if self.app is not None:
221
+ return
222
+
223
+ app = FastAPI()
224
+
225
+ @app.post("/act")
226
+ def act(payload: Dict[str, Any]):
227
+ try:
228
+ self.eval()
229
+ # Decode up to 3 image inputs
230
+ images = []
231
+ for key in ("image0", "image1", "image2"):
232
+ if key not in payload: continue
233
+ v = json_numpy.loads(payload[key])
234
+ if isinstance(v, np.ndarray):
235
+ if v.ndim == 1: # encoded bytes
236
+ v = cv2.imdecode(v, cv2.IMREAD_COLOR)
237
+ images.append(Image.fromarray(v))
238
+ elif isinstance(v, (list, tuple)):
239
+ images.append(Image.fromarray(np.array(v)))
240
+ elif isinstance(v, str):
241
+ images.append(Image.open(v))
242
+ if not images:
243
+ return JSONResponse({"error": "No valid images found."}, status_code=400)
244
+
245
+ # Multimodal preprocessing by processor
246
+ inputs = processor(images, payload["language_instruction"])
247
+ if not {"input_ids", "image_input", "image_mask"}.issubset(inputs):
248
+ return JSONResponse({"error": "Processor returned incomplete inputs."}, status_code=400)
249
+
250
+ # Build proprio/domain tensors
251
+ proprio = torch.as_tensor(np.asarray(json_numpy.loads(payload["proprio"])))
252
+ domain_id = torch.tensor([int(payload["domain_id"])], dtype=torch.long)
253
+
254
+ # Align to model's device/dtype
255
+ device = next(self.parameters()).device
256
+ dtype = next(self.parameters()).dtype
257
+
258
+ def to_model(t: torch.Tensor) -> torch.Tensor:
259
+ if not isinstance(t, torch.Tensor):
260
+ t = torch.as_tensor(t)
261
+ # cast floats to model dtype, keep integral/bool as-is
262
+ return t.to(device=device, dtype=dtype) if t.is_floating_point() else t.to(device=device)
263
+
264
+ inputs = {k: to_model(v) for k, v in inputs.items()}
265
+ inputs.update({
266
+ "proprio": to_model(proprio.unsqueeze(0)),
267
+ "domain_id": domain_id.to(device),
268
+ })
269
+
270
+ # Inference
271
+ steps = int(payload.get("steps", 10))
272
+ action = self.generate_actions(**inputs, steps=steps).squeeze(0).float().cpu().numpy()
273
+ return JSONResponse({"action": action.tolist()})
274
+
275
+ except Exception:
276
+ logging.error(traceback.format_exc())
277
+ return JSONResponse({"error": "Request failed"}, status_code=400)
278
+
279
+ self.app = app
280
+
281
+ def run(self, processor, host: str = "0.0.0.0", port: int = 8000):
282
+ """
283
+ Launch the FastAPI service.
284
+ """
285
+ self._build_app(processor)
286
+ assert self.app is not None
287
+ uvicorn.run(self.app, host=host, port=port)
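A shapes-only inference sketch for `generate_actions`, under several assumptions: the repo id is a placeholder, the weights are loaded in float32 to match the random float32 placeholder inputs, and the proprio vector is assumed to share the action dimensionality (matching the `dim_proprio` fallback above).

```python
# All inputs are random placeholders; this only illustrates the expected shapes.
import torch
from transformers import AutoModel

model = AutoModel.from_pretrained(
    "2toINF/X-VLA-Google-Robot",          # placeholder repo id
    trust_remote_code=True,
    torch_dtype=torch.float32,
).eval()

B, V, L = 1, 3, 50                         # batch, camera views, token length
D = model.action_space.dim_action          # 20 for the "ee6d" space
inputs = {
    "input_ids": torch.ones(B, L, dtype=torch.long),   # pad tokens only
    "image_input": torch.randn(B, V, 3, 224, 224),
    "image_mask": torch.ones(B, V, dtype=torch.bool),
    "domain_id": torch.zeros(B, dtype=torch.long),
    "proprio": torch.zeros(B, D),                       # assumed [B, dim_action]
}
with torch.no_grad():
    actions = model.generate_actions(**inputs, steps=10)
print(actions.shape)                        # [B, num_actions, D] = [1, 30, 20]
```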
preprocessor_config.json ADDED
@@ -0,0 +1,38 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_xvla.XVLAProcessor"
4
+ },
5
+ "_valid_processor_keys": [
6
+ "images",
7
+ "do_resize",
8
+ "size",
9
+ "resample",
10
+ "do_rescale",
11
+ "rescale_factor",
12
+ "do_normalize",
13
+ "image_mean",
14
+ "image_std",
15
+ "return_tensors",
16
+ "data_format",
17
+ "input_data_format",
18
+ "do_convert_rgb"
19
+ ],
20
+ "do_convert_rgb": null,
21
+ "do_normalize": true,
22
+ "do_rescale": true,
23
+ "do_resize": true,
24
+ "do_center_crop": false,
25
+ "image_processor_type": "CLIPImageProcessor",
26
+ "image_mean": [0.485, 0.456, 0.406],
27
+ "image_std": [0.229, 0.224, 0.225],
28
+ "processor_class": "XVLAProcessor",
29
+ "resample": 3,
30
+ "size": {
31
+ "height": 224,
32
+ "width": 224
33
+ },
34
+ "crop_size": {
35
+ "height": 224,
36
+ "width": 224
37
+ }
38
+ }
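For reference, the settings above describe a standard `CLIPImageProcessor` resize-and-normalize pipeline; the short sketch below (values copied from this file) shows the resulting tensor shape.

```python
# Standalone sketch using transformers' CLIPImageProcessor with the values above.
import numpy as np
from PIL import Image
from transformers import CLIPImageProcessor

ip = CLIPImageProcessor(
    do_resize=True,
    size={"height": 224, "width": 224},
    resample=3,                      # PIL bicubic
    do_center_crop=False,
    do_rescale=True,
    do_normalize=True,
    image_mean=[0.485, 0.456, 0.406],
    image_std=[0.229, 0.224, 0.225],
)
img = Image.fromarray(np.full((480, 640, 3), 128, dtype=np.uint8))
pixel_values = ip(img, return_tensors="pt")["pixel_values"]
print(pixel_values.shape)            # torch.Size([1, 3, 224, 224])
```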
processing_xvla.py ADDED
@@ -0,0 +1,206 @@
1
+ # ------------------------------------------------------------------------------
2
+ # Copyright 2025 2toINF (https://github.com/2toINF)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # ------------------------------------------------------------------------------
16
+
17
+ from transformers import ProcessorMixin
18
+ from typing import List, Union, Dict, Any, Optional
19
+ import torch
20
+
21
+
22
+ class XVLAProcessor(ProcessorMixin):
23
+ """
24
+ XVLAProcessor: Unified multimodal processor for XVLA models.
25
+
26
+ Handles:
27
+ - Multi-view image inputs (e.g., from multiple cameras).
28
+ - Batch processing for multiple samples.
29
+ - Joint tokenization and image tensor preparation.
30
+
31
+ This processor combines an image processor and a tokenizer under a single interface
32
+ so that users can call it directly like:
33
+
34
+ >>> processor = XVLAProcessor.from_pretrained("path/to/xvla")
35
+ >>> inputs = processor(images=batch_images, language_instruction=batch_texts)
36
+
37
+ It is fully compatible with the Hugging Face AutoProcessor API.
38
+
39
+ Attributes
40
+ ----------
41
+ num_views : int, default=3
42
+ Expected number of image views per sample. Missing views will be padded with zeros.
43
+ language_max_length : int, default=50
44
+ Maximum token length for text encoding.
45
+ attributes : list
46
+ Required by ProcessorMixin to know which submodules are stored and reloaded.
47
+ image_processor_class : str
48
+ The name of the associated image processor class.
49
+ tokenizer_class : tuple(str)
50
+ The names of compatible tokenizer classes.
51
+ """
52
+
53
+ num_views: int = 3
54
+ language_max_length: int = 50
55
+
56
+ # Hugging Face ProcessorMixin-required metadata
57
+ attributes = ["image_processor", "tokenizer"]
58
+ image_processor_class = "AutoImageProcessor"
59
+ tokenizer_class = ("BartTokenizer", "BartTokenizerFast")
60
+
61
+ def __init__(self, image_processor=None, tokenizer=None):
62
+ """
63
+ Initialize XVLAProcessor.
64
+
65
+ Parameters
66
+ ----------
67
+ image_processor : PreTrainedImageProcessor, optional
68
+ The image processor used to normalize/resize images.
69
+ tokenizer : PreTrainedTokenizer, optional
70
+ The tokenizer used for text tokenization.
71
+ """
72
+ # ProcessorMixin automatically saves these under self.image_processor / self.tokenizer
73
+ super().__init__(image_processor, tokenizer)
74
+
75
+ # ================== LANGUAGE ENCODING ==================
76
+ def encode_language(self, language_instruction: Union[str, List[str]]) -> Dict[str, torch.Tensor]:
77
+ """
78
+ Tokenize one or more language instructions.
79
+
80
+ Parameters
81
+ ----------
82
+ language_instruction : str or List[str]
83
+ A single instruction or a batch of instructions.
84
+
85
+ Returns
86
+ -------
87
+ Dict[str, torch.Tensor]
88
+ {
89
+ "input_ids": tensor of shape [B, L]
90
+ }
91
+ """
92
+ if isinstance(language_instruction, str):
93
+ language_instruction = [language_instruction]
94
+
95
+ inputs = self.tokenizer(
96
+ language_instruction,
97
+ return_tensors="pt",
98
+ padding="max_length",
99
+ max_length=self.language_max_length,
100
+ truncation=True,
101
+ )
102
+ return {"input_ids": inputs["input_ids"]}
103
+
104
+ # ================== IMAGE ENCODING ==================
105
+ def encode_image(
106
+ self,
107
+ images: Union[List, List[List]],
108
+ **kwargs
109
+ ) -> Dict[str, torch.Tensor]:
110
+ """
111
+ Preprocess one or more sets of multi-view images.
112
+
113
+ Parameters
114
+ ----------
115
+ images : List or List[List]
116
+ Single sample: [img1, img2, ...]
117
+ Batch: [[img1a, img1b], [img2a, img2b, img2c], ...]
118
+ Each image may be a PIL.Image, NumPy array, or torch.Tensor.
119
+
120
+ kwargs : dict
121
+ Extra arguments passed to the underlying image processor
122
+ (e.g., `do_resize=False`, `size=(224,224)`).
123
+
124
+ Returns
125
+ -------
126
+ Dict[str, torch.Tensor]
127
+ {
128
+ "image_input": tensor [B, num_views, C, H, W],
129
+ "image_mask": tensor [B, num_views]
130
+ }
131
+ """
132
+ # Normalize to batch form
133
+ if not isinstance(images[0], (list, tuple)):
134
+ images = [images] # convert single sample to batch of size 1
135
+
136
+ batch_imgs, batch_masks = [], []
137
+
138
+ for sample_imgs in images:
139
+ processed = self.image_processor(sample_imgs, return_tensors="pt", **kwargs)["pixel_values"]
140
+ V_exist = processed.size(0)
141
+
142
+ # Pad to self.num_views
143
+ if V_exist < self.num_views:
144
+ processed = torch.cat(
145
+ [processed,
146
+ processed.new_zeros(self.num_views - V_exist, *processed.shape[1:])],
147
+ dim=0,
148
+ )
149
+
150
+ # Mask: True for valid slots, False for padding
151
+ image_mask = torch.zeros(self.num_views, dtype=torch.bool, device=processed.device)
152
+ image_mask[:V_exist] = True
153
+
154
+ batch_imgs.append(processed)
155
+ batch_masks.append(image_mask)
156
+
157
+ image_input = torch.stack(batch_imgs, dim=0) # [B, num_views, C, H, W]
158
+ image_mask = torch.stack(batch_masks, dim=0) # [B, num_views]
159
+
160
+ return {"image_input": image_input, "image_mask": image_mask}
161
+
162
+ # ================== COMBINED CALL ==================
163
+ def __call__(
164
+ self,
165
+ images: Optional[Union[List, List[List]]] = None,
166
+ language_instruction: Optional[Union[str, List[str]]] = None,
167
+ **kwargs
168
+ ) -> Dict[str, torch.Tensor]:
169
+ """
170
+ Combine image and text encoding into a unified multimodal input.
171
+
172
+ Parameters
173
+ ----------
174
+ images : List or List[List], optional
175
+ Single-sample or batched multi-view images.
176
+ language_instruction : str or List[str], optional
177
+ Corresponding text instructions.
178
+ kwargs : dict
179
+ Extra args passed to image processor.
180
+
181
+ Returns
182
+ -------
183
+ Dict[str, torch.Tensor]
184
+ {
185
+ "input_ids": [B, L], optional,
186
+ "image_input": [B, num_views, C, H, W], optional,
187
+ "image_mask": [B, num_views], optional
188
+ }
189
+ """
190
+ outputs: Dict[str, Any] = {}
191
+
192
+ # Encode language if provided
193
+ if language_instruction is not None:
194
+ outputs.update(self.encode_language(language_instruction))
195
+
196
+ # Encode image if provided
197
+ if images is not None:
198
+ outputs.update(self.encode_image(images, **kwargs))
199
+
200
+ # Sanity check for batch alignment
201
+ if "input_ids" in outputs and "image_input" in outputs:
202
+ assert outputs["input_ids"].size(0) == outputs["image_input"].size(0), (
203
+ f"Batch mismatch: text batch {outputs['input_ids'].size(0)} "
204
+ f"!= image batch {outputs['image_input'].size(0)}"
205
+ )
206
+ return outputs
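A usage sketch for the processor, assuming it is loaded through the `auto_map` entry in `preprocessor_config.json` (placeholder repo id, synthetic images). It shows the padding behaviour when a sample provides fewer than `num_views` images.

```python
# Placeholder repo id; images are synthetic.
import numpy as np
from PIL import Image
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("2toINF/X-VLA-Google-Robot", trust_remote_code=True)

views = [Image.fromarray(np.zeros((480, 640, 3), dtype=np.uint8)) for _ in range(2)]
batch = processor(images=[views], language_instruction=["pick up the coke can"])

print(batch["input_ids"].shape)    # [1, 50]  (language_max_length)
print(batch["image_input"].shape)  # [1, 3, 3, 224, 224]  (padded to num_views=3)
print(batch["image_mask"])         # tensor([[ True,  True, False]])
```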
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,4 @@
1
+ {
2
+ "model_max_length": 1024
3
+ }
4
+
transformer.py ADDED
@@ -0,0 +1,403 @@
1
+ # ------------------------------------------------------------------------------
2
+ # Copyright 2025 2toINF (https://github.com/2toINF)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # ------------------------------------------------------------------------------
16
+
17
+ from __future__ import annotations
18
+
19
+ import math
20
+ from functools import partial
21
+ from typing import Final, Iterable, Tuple
22
+
23
+ import torch
24
+ import torch.nn as nn
25
+ import torch.nn.functional as F
26
+
27
+
28
+ # ------------------------------- Small utils ----------------------------------
29
+
30
+ def _to_2tuple(x) -> Tuple:
31
+ """Minimal replacement for timm.layers.to_2tuple."""
32
+ if isinstance(x, Iterable) and not isinstance(x, (str, bytes)):
33
+ t = tuple(x)
34
+ return (t[0], t[1]) if len(t) >= 2 else (t[0], t[0])
35
+ return (x, x)
36
+
37
+
38
+ def _has_sdp_attention() -> bool:
39
+ """Check if we can use PyTorch fused scaled_dot_product_attention."""
40
+ return hasattr(F, "scaled_dot_product_attention")
41
+
42
+
43
+ # ---------------------------------- MLP --------------------------------------
44
+
45
+ class Mlp(nn.Module):
46
+ """
47
+ MLP used in ViT-style blocks.
48
+
49
+ Supports Linear or 1x1 Conv 'linear_layer' for token/channel mixing.
50
+ """
51
+
52
+ def __init__(
53
+ self,
54
+ in_features: int,
55
+ hidden_features: int | None = None,
56
+ out_features: int | None = None,
57
+ norm_layer: type[nn.Module] | None = None,
58
+ bias: bool | Tuple[bool, bool] = True,
59
+ drop: float | Tuple[float, float] = 0.0,
60
+ use_conv: bool = False,
61
+ ) -> None:
62
+ super().__init__()
63
+ out_features = out_features or in_features
64
+ hidden_features = hidden_features or in_features
65
+ bias = _to_2tuple(bias)
66
+ drop_probs = _to_2tuple(drop)
67
+ linear_layer = partial(nn.Conv2d, kernel_size=1) if use_conv else nn.Linear
68
+
69
+ self.fc1 = linear_layer(in_features, hidden_features, bias=bias[0])
70
+ self.act = nn.GELU(approximate="tanh")
71
+ self.drop1 = nn.Dropout(drop_probs[0])
72
+ self.norm = norm_layer(hidden_features) if norm_layer is not None else nn.Identity()
73
+ self.fc2 = linear_layer(hidden_features, out_features, bias=bias[1])
74
+ self.drop2 = nn.Dropout(drop_probs[1])
75
+
76
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
77
+ # Expect [B, T, C] for Linear variant; caller is responsible for shapes.
78
+ x = self.fc1(x)
79
+ x = self.act(x)
80
+ x = self.drop1(x)
81
+ x = self.norm(x)
82
+ x = self.fc2(x)
83
+ x = self.drop2(x)
84
+ return x
85
+
+
+ # -------------------------------- Attention ----------------------------------
+
+ class Attention(nn.Module):
+ """
+ Multi-Head Self-Attention with optional fused SDPA fallback.
+
+ If PyTorch provides `scaled_dot_product_attention`, it will be used
+ (usually faster and more stable); otherwise we use a manual implementation.
+ """
+
+ fused_attn: Final[bool]
+
+ def __init__(
+ self,
+ dim: int,
+ num_heads: int = 8,
+ qkv_bias: bool = False,
+ qk_norm: bool = False,
+ attn_drop: float = 0.0,
+ proj_drop: float = 0.0,
+ norm_layer: type[nn.Module] = nn.LayerNorm,
+ ) -> None:
+ super().__init__()
+ assert dim % num_heads == 0, "dim should be divisible by num_heads"
+ self.num_heads = num_heads
+ self.head_dim = dim // num_heads
+ self.scale = self.head_dim ** -0.5
+ self.fused_attn = _has_sdp_attention()
+
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+ self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
+ self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
+ self.attn_drop = nn.Dropout(attn_drop)
+ self.proj = nn.Linear(dim, dim)
+ self.proj_drop = nn.Dropout(proj_drop)
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ """
+ Parameters
+ ----------
+ x : Tensor, shape [B, T, C]
+ Input sequence.
+
+ Returns
+ -------
+ Tensor, shape [B, T, C]
+ Output sequence after MHSA + projection.
+ """
+ B, T, C = x.shape
+ qkv = (
+ self.qkv(x)
+ .reshape(B, T, 3, self.num_heads, self.head_dim)
+ .permute(2, 0, 3, 1, 4) # 3 x [B, H, T, Dh]
+ )
+ q, k, v = qkv.unbind(0) # each: [B, H, T, Dh]
+ q, k = self.q_norm(q), self.k_norm(k)
+
+ if self.fused_attn:
+ x = F.scaled_dot_product_attention(
+ q, k, v,
+ dropout_p=self.attn_drop.p if self.training else 0.0,
+ ) # [B, H, T, Dh]
+ else:
+ q = q * self.scale
+ attn = q @ k.transpose(-2, -1) # [B, H, T, T]
+ attn = attn.softmax(dim=-1)
+ attn = self.attn_drop(attn)
+ x = attn @ v # [B, H, T, Dh]
+
+ x = x.transpose(1, 2).reshape(B, T, C) # [B, T, C]
+ x = self.proj(x)
+ x = self.proj_drop(x)
+ return x
+
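Illustrative usage sketch (not part of the committed module), with hypothetical sizes: `Attention` preserves the [B, T, C] shape and transparently uses the fused SDPA kernel when the installed PyTorch provides it.

import torch

attn = Attention(dim=768, num_heads=16, qkv_bias=True).eval()  # eval() disables dropout paths
x = torch.randn(2, 10, 768)                                    # [B, T, C]
with torch.no_grad():
    y = attn(x)
assert y.shape == (2, 10, 768)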
+
+ # ------------------------------- Utilities -----------------------------------
+
+ def basic_init(module: nn.Module) -> None:
+ """
+ Apply a basic initialization scheme to Linear layers.
+
+ - Weight: Xavier uniform initialization.
+ - Bias: Set to zero.
+ """
+ if isinstance(module, nn.Linear):
+ nn.init.xavier_uniform_(module.weight)
+ if module.bias is not None:
+ nn.init.constant_(module.bias, 0.0)
+
+
+ def timestep_embedding(t: torch.Tensor, dim: int, max_period: int = 100) -> torch.Tensor:
+ """
+ Create sinusoidal timestep embeddings.
+
+ Parameters
+ ----------
+ t : torch.Tensor
+ Shape [B]. Each element is a timestep index; values may be fractional.
+ dim : int
+ Dimensionality of the output embedding.
+ max_period : int, default=100
+ Controls the minimum frequency of the sinusoids.
+
+ Returns
+ -------
+ torch.Tensor
+ Shape [B, dim]. Sinusoidal embeddings.
+ """
+ half = dim // 2
+ freqs = torch.exp(
+ -math.log(max_period)
+ * torch.arange(start=0, end=half, dtype=t.dtype, device=t.device)
+ / half
+ )
+ args = t[:, None] * freqs[None]
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+ if dim % 2 == 1:
+ embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
+ return embedding
+
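Illustrative sketch (not part of the committed module): `timestep_embedding` packs cosines into the first `dim // 2` channels and sines into the rest, with a default `max_period` of 100 rather than the 10,000 used in classic language-model positional encodings; the timesteps below are hypothetical.

import torch

t = torch.tensor([0.0, 0.25, 1.0])      # fractional timesteps, e.g. denoising steps
emb = timestep_embedding(t, dim=32)     # -> [3, 32]
assert emb.shape == (3, 32)
assert torch.allclose(emb[0, :16], torch.ones(16))   # cos(0) = 1 in the first half
assert torch.allclose(emb[0, 16:], torch.zeros(16))  # sin(0) = 0 in the second half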
+
+ # ------------------------------- Core Layers ----------------------------------
+
+ class DomainAwareLinear(nn.Module):
+ """
+ Linear layer with domain-conditioned parameters (per-sample).
+
+ Each domain has its own flattened weight matrix and bias vector, stored in embedding tables.
+ """
+
+ def __init__(self, input_size: int, output_size: int, num_domains: int = 20) -> None:
+ super().__init__()
+ self.input_size = input_size
+ self.output_size = output_size
+ self.fc = nn.Embedding(num_domains, output_size * input_size)
+ self.bias = nn.Embedding(num_domains, output_size)
+ nn.init.xavier_uniform_(self.fc.weight)
+ nn.init.zeros_(self.bias.weight)
+
+ def forward(self, x: torch.Tensor, domain_id: torch.LongTensor) -> torch.Tensor:
+ """
+ Parameters
+ ----------
+ x : Tensor
+ [B, I] or [B, T, I]
+ domain_id : LongTensor
+ [B], domain indices.
+
+ Returns
+ -------
+ Tensor
+ [B, O] or [B, T, O]
+ """
+ B = domain_id.shape[0]
+ squeeze_T = False
+ if x.dim() == 2:
+ x = x.unsqueeze(1)
+ squeeze_T = True
+ W = self.fc(domain_id).view(B, self.input_size, self.output_size)
+ b = self.bias(domain_id).view(B, self.output_size)
+ y = torch.matmul(x, W) + b.view(B, 1, self.output_size)
+ if squeeze_T:
+ y = y.squeeze(1)
+ return y
+
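Illustrative usage sketch (not part of the committed module), with hypothetical sizes: each sample pulls its own weight matrix and bias out of the embedding tables according to `domain_id`, so two samples in the same batch can be transformed by different parameters.

import torch

layer = DomainAwareLinear(input_size=16, output_size=8, num_domains=4)
x = torch.randn(2, 5, 16)                 # [B, T, I]
domain_id = torch.tensor([0, 3])          # one domain index per sample
y = layer(x, domain_id)
assert y.shape == (2, 5, 8)               # [B, T, O]
assert layer(torch.randn(2, 16), domain_id).shape == (2, 8)   # 2-D inputs are supported too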
+
+ class TransformerBlock(nn.Module):
+ """
+ Standard Transformer block (pre-LN): LN → MHSA → residual, LN → MLP → residual.
+ """
+
+ def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float = 4.0) -> None:
+ super().__init__()
+ self.norm1 = nn.LayerNorm(hidden_size)
+ self.norm2 = nn.LayerNorm(hidden_size)
+ self.attn = Attention(hidden_size, num_heads=num_heads, qkv_bias=True, attn_drop=0.1)
+ self.mlp = Mlp(
+ in_features=hidden_size,
+ hidden_features=int(hidden_size * mlp_ratio),
+ drop=0.1,
+ )
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ """
+ Parameters
+ ----------
+ x : Tensor, [B, T, H]
+
+ Returns
+ -------
+ Tensor, [B, T, H]
+ """
+ x = x + self.attn(self.norm1(x))
+ x = x + self.mlp(self.norm2(x))
+ return x
+
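Illustrative usage sketch (not part of the committed module): the pre-LN block keeps the [B, T, H] shape; since both the attention and MLP sub-layers carry 0.1 dropout, `.eval()` is needed for deterministic outputs.

import torch

block = TransformerBlock(hidden_size=768, num_heads=16).eval()
x = torch.randn(2, 48, 768)               # [B, T, H]
with torch.no_grad():
    assert block(x).shape == x.shape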
+
+ # --------------------------- Main Model ---------------------------------------
+
+ class SoftPromptedTransformer(nn.Module):
+ """
+ Multi-modal, domain-aware Transformer with optional soft prompts.
+
+ See the `__init__` parameters and the `forward` docstring for I/O details.
+ """
+
+ def __init__(
+ self,
+ hidden_size: int = 768,
+ multi_modal_input_size: int = 768,
+ depth: int = 24,
+ num_heads: int = 16,
+ mlp_ratio: float = 4.0,
+ num_domains: int = 20,
+ dim_action: int = 20,
+ dim_propio: int = 20,
+ dim_time: int = 32,
+ len_soft_prompts: int = 32,
+ max_len_seq: int = 512,
+ use_hetero_proj: bool = False,
+ ) -> None:
+ super().__init__()
+ self.hidden_size = hidden_size
+ self.dim_action = dim_action
+ self.dim_time = dim_time
+ self.len_soft_prompts = len_soft_prompts
+ self.use_hetero_proj = use_hetero_proj
+
+ self.blocks = nn.ModuleList(
+ [TransformerBlock(hidden_size, num_heads, mlp_ratio=mlp_ratio) for _ in range(depth)]
+ )
+
+ if use_hetero_proj:
+ self.vlm_proj = DomainAwareLinear(multi_modal_input_size, hidden_size, num_domains=num_domains)
+ self.aux_visual_proj = DomainAwareLinear(multi_modal_input_size, hidden_size, num_domains=num_domains)
+ else:
+ self.vlm_proj = nn.Linear(multi_modal_input_size, hidden_size)
+ self.aux_visual_proj = nn.Linear(multi_modal_input_size, hidden_size)
+
+ self.pos_emb = nn.Parameter(torch.zeros(1, max_len_seq, hidden_size), requires_grad=True)
+ nn.init.normal_(self.pos_emb, std=0.02)
+
+ self.norm = nn.LayerNorm(hidden_size)
+ self.action_encoder = DomainAwareLinear(
+ dim_action + dim_time + dim_propio, hidden_size, num_domains=num_domains
+ )
+ self.action_decoder = DomainAwareLinear(hidden_size, dim_action, num_domains=num_domains)
+
+ if len_soft_prompts > 0:
+ self.soft_prompt_hub = nn.Embedding(num_domains, len_soft_prompts * hidden_size)
+ nn.init.normal_(self.soft_prompt_hub.weight, std=0.02)
+
+ self.apply(basic_init)
+
+ def forward(
+ self,
+ domain_id: torch.LongTensor,
+ vlm_features: torch.Tensor,
+ aux_visual_inputs: torch.Tensor,
+ action_with_noise: torch.Tensor,
+ proprio: torch.Tensor,
+ t: torch.Tensor,
+ ) -> torch.Tensor:
+ """
+ Forward pass.
+
+ Inputs
+ ------
+ domain_id : [B]
+ vlm_features : [B, T_vlm, D]
+ aux_visual_inputs : [B, T_aux, D]
+ action_with_noise : [B, T_action, dim_action]
+ proprio : [B, dim_propio]
+ t : [B]
+
+ Returns
+ -------
+ Tensor
+ Predicted actions, [B, T_action, dim_action]
+ """
+ B, num_actions = action_with_noise.shape[:2]
+
+ # Encode (action + proprio + time) → tokens
+ time_emb = timestep_embedding(t, self.dim_time) # [B, dim_time]
+ time_tokens = time_emb.unsqueeze(1).expand(B, num_actions, self.dim_time)
+ proprio_tokens = proprio.unsqueeze(1).expand(B, num_actions, proprio.shape[-1])
+ action_tokens = torch.cat([action_with_noise, proprio_tokens, time_tokens], dim=-1)
+ x = self.action_encoder(action_tokens, domain_id) # [B, T_action, H]
+
+ # Project visual streams and concatenate
+ if self.use_hetero_proj:
+ x = torch.cat(
+ [x, self.vlm_proj(vlm_features, domain_id), self.aux_visual_proj(aux_visual_inputs, domain_id)],
+ dim=1,
+ )
+ else:
+ x = torch.cat([x, self.vlm_proj(vlm_features), self.aux_visual_proj(aux_visual_inputs)], dim=1)
+
+ # Add positional embeddings (error if the sequence exceeds max_len_seq)
+ seq_len = x.shape[1]
+ if seq_len > self.pos_emb.shape[1]:
+ raise ValueError(
+ f"Sequence length {seq_len} exceeds max_len_seq={self.pos_emb.shape[1]}."
+ )
+ x = x + self.pos_emb[:, :seq_len, :]
+
+ # Append soft prompts
+ if self.len_soft_prompts > 0:
+ soft_prompts = self.soft_prompt_hub(domain_id).view(B, self.len_soft_prompts, self.hidden_size)
+ x = torch.cat([x, soft_prompts], dim=1)
+
+ # Transformer backbone
+ for block in self.blocks:
+ x = block(x)
+
+ # Decode only the action segment
+ return self.action_decoder(self.norm(x[:, :num_actions]), domain_id)
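Illustrative end-to-end sketch (not part of the committed module): a forward pass with hypothetical batch size and token counts, using the constructor defaults except for a reduced `depth` to keep the example light.

import torch

model = SoftPromptedTransformer(depth=2).eval()   # depth reduced only for this sketch
B, T_action, T_vlm, T_aux = 2, 8, 64, 16
with torch.no_grad():
    pred = model(
        domain_id=torch.randint(0, 20, (B,)),
        vlm_features=torch.randn(B, T_vlm, 768),
        aux_visual_inputs=torch.randn(B, T_aux, 768),
        action_with_noise=torch.randn(B, T_action, 20),   # noised action chunk
        proprio=torch.randn(B, 20),
        t=torch.rand(B),                                  # one denoising timestep per sample
    )
assert pred.shape == (B, T_action, 20)                    # [B, T_action, dim_action]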
vocab.json ADDED
The diff for this file is too large to render. See raw diff