Xiangtai committed on
Commit fa5f0ed · verified · 1 Parent(s): 5301f61

Upload folder using huggingface_hub

Browse files
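For reference, an upload like this one can be reproduced with the `huggingface_hub` Python client. The sketch below is illustrative only; the local folder path is a placeholder and the target repo id is an assumption.

```python
from huggingface_hub import HfApi

api = HfApi()  # picks up the token from `huggingface-cli login` by default
api.upload_folder(
    folder_path="/PATH/TO/LOCAL/FOLDER",    # placeholder: local checkpoint folder
    repo_id="ByteDance/Sa2VA-Qwen3-VL-4B",  # assumption: this repository
    repo_type="model",
)
```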
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,162 @@
1
+ ---
2
+ license: apache-2.0
3
+ pipeline_tag: image-text-to-text
4
+ library_name: transformers
5
+ base_model:
6
+ - OpenGVLab/InternVL3-8B
7
+ base_model_relation: merge
8
+ language:
9
+ - multilingual
10
+ tags:
11
+ - Sa2VA
12
+ - custom_code
13
+ ---
14
+
15
+ # Sa2VA: Marrying SAM2 with LLaVA for Dense Grounded Understanding of Images and Videos
16
+
17
+ [\[📂 GitHub\]](https://github.com/bytedance/Sa2VA)
18
+ [\[📜 Sa2VA paper\]](https://arxiv.org/abs/2501.04001)
19
+ [\[🚀 Quick Start\]](#quick-start)
20
+
21
+
22
+
23
+ ## Introduction
24
+
25
+ Sa2VA is an MLLM capable of question answering, visual prompt understanding, and dense object segmentation at both the image and video level. It achieves performance comparable to SOTA MLLMs such as Qwen2.5-VL and InternVL3 on question-answering benchmarks, while also offering the visual prompt understanding and dense object segmentation capabilities that those models lack, and it reaches SOTA performance on both image and video grounding and segmentation benchmarks.
26
+
27
+ ## Sa2VA Family
28
+
29
+ We built the Sa2VA series on Qwen2.5-VL/Qwen3-VL and InternVL2.5/InternVL3. The table below lists the released Sa2VA models built on Qwen2.5/3-VL and InternVL3.
30
+
31
+ | Model Name | Base MLLM | Language Part | HF Link |
32
+ |:----------:|:------------------------------------------------------------------:|:---------------------------------------------------------------------------:|:-----------------------------------------------------:|
33
+ | Sa2VA-InternVL3-2B | [InternVL3-2B](https://huggingface.co/OpenGVLab/InternVL3-2B) | [Qwen2.5-1.5B](https://huggingface.co/Qwen/Qwen2.5-1.5B) | [🤗 link](https://huggingface.co/ByteDance/Sa2VA-InternVL3-2B) |
34
+ | Sa2VA-InternVL3-8B | [InternVL3-8B](https://huggingface.co/OpenGVLab/InternVL3-8B) | [Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B) | [🤗 link](https://huggingface.co/ByteDance/Sa2VA-InternVL3-8B) |
35
+ | Sa2VA-InternVL3-14B | [InternVL3-14B](https://huggingface.co/OpenGVLab/InternVL3-14B) | [Qwen2.5-14B](https://huggingface.co/Qwen/Qwen2.5-14B) | [🤗 link](https://huggingface.co/ByteDance/Sa2VA-InternVL3-14B) |
36
+ | Sa2VA-Qwen2_5-VL-3B | [Qwen2.5-VL-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct) | [Qwen2.5-3B](https://huggingface.co/Qwen/Qwen2.5-3B) | [🤗 link](https://huggingface.co/ByteDance/Sa2VA-Qwen2_5-VL-3B) |
37
+ | Sa2VA-Qwen2_5-VL-7B | [Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct) | [Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B) | [🤗 link](https://huggingface.co/ByteDance/Sa2VA-Qwen2_5-VL-7B) |
38
+ | Sa2VA-Qwen3-VL-4B | [Qwen3-VL-4B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-4B-Instruct) | [Qwen3-4B](https://huggingface.co/Qwen/Qwen3-4B) | [🤗 link](https://huggingface.co/ByteDance/Sa2VA-Qwen3-VL-4B) |
39
+
40
+ ## Sa2VA Performance
41
+ | Model Name | MME | MMBench | RefCOCO | RefCOCO+ | RefCOCOg | MeViS (val_u) | DAVIS |
42
+ |:----------:|:--------:|:----:|:-------:|:--------:|:--------:|:-------------:|:-----:|
43
+ | Sa2VA-InternVL3-2B | 1631/559 | 79.8 | 81.4 | 75.7 | 80.3 | 53.9 | 74.5 |
44
+ | Sa2VA-InternVL3-8B | 1743/633 | 83.0 | 83.3 | 78.9 | 81.8 | 56.4 | 76.3 |
45
+ | Sa2VA-InternVL3-14B | 1746/724 | 84.3 | 83.6 | 79.9 | 83.6 | 59.2 | 76.6 |
46
+ | Sa2VA-Qwen2_5-VL-3B | 1533/572 | 78.4 | 79.6 | 74.0 | 77.1 | 51.6 | 73.4 |
47
+ | Sa2VA-Qwen2_5-VL-7B | 1552/676 | 84.5 | 82.4 | 77.5 | 81.5 | 56.4 | 79.4 |
48
+ | Sa2VA-Qwen3-VL-4B | 1660/655 | 86.3 | 81.7 | 77.4 | 80.0 | 57.1 | 75.9 |
49
+
50
+ ## Quick Start
51
+
52
+ We provide example code for running `Sa2VA` with `transformers`.
53
+
54
+ ```python
55
+ import torch
56
+ from transformers import AutoProcessor, AutoModel
57
+ from PIL import Image
58
+ import numpy as np
59
+ import os
60
+
61
+ # load the model and processor
62
+ path = "ByteDance/Sa2VA-Qwen3-VL-4B"
63
+ model = AutoModel.from_pretrained(
64
+ path,
65
+ torch_dtype=torch.bfloat16,
66
+ low_cpu_mem_usage=True,
67
+ use_flash_attn=True,
68
+ trust_remote_code=True).eval().cuda()
69
+ processor = AutoProcessor.from_pretrained(path, trust_remote_code=True, use_fast=False)
70
+
71
+ # for image chat
72
+ image_path = "/PATH/TO/IMAGE"
73
+ text_prompts = "<image>Please describe the image."
74
+ image = Image.open(image_path).convert('RGB')
75
+ input_dict = {
76
+ 'image': image,
77
+ 'text': text_prompts,
78
+ 'past_text': '',
79
+ 'mask_prompts': None,
80
+ 'processor': processor,
81
+ }
82
+ return_dict = model.predict_forward(**input_dict)
83
+ answer = return_dict["prediction"] # the text format answer
84
+
85
+ # for image chat with segmentation output
86
+ image_path = "/PATH/TO/IMAGE"
87
+ text_prompts = "<image>Could you please give me a brief description of the image? Please respond with interleaved segmentation masks for the corresponding parts of the answer."
88
+ image = Image.open(image_path).convert('RGB')
89
+ input_dict = {
90
+ 'image': image,
91
+ 'text': text_prompts,
92
+ 'past_text': '',
93
+ 'mask_prompts': None,
94
+ 'processor': processor,
95
+ }
96
+ return_dict = model.predict_forward(**input_dict)
97
+ answer = return_dict["prediction"] # the text format answer
98
+ masks = return_dict['prediction_masks'] # segmentation masks, list(np.array(1, h, w), ...)
99
+
100
+ # for chat with visual prompt (mask format) input
101
+ mask_prompts = np.load('/PATH/TO/pred_masks.npy') # np.array(n_prompts, h, w)
102
+ image_path = "/PATH/TO/IMAGE"
103
+ text_prompts = "<image>Can you provide me with a detailed description of the region in the picture marked by region1."
104
+ image = Image.open(image_path).convert('RGB')
105
+ input_dict = {
106
+ 'image': image,
107
+ 'text': text_prompts,
108
+ 'past_text': '',
109
+ 'mask_prompts': mask_prompts,
110
+ 'processor': processor,
111
+ }
112
+ return_dict = model.predict_forward(**input_dict)
113
+ answer = return_dict["prediction"] # the text format answer
114
+
115
+ # for video chat
116
+ video_folder = "/PATH/TO/VIDEO_FOLDER"
117
+ images_paths = os.listdir(video_folder)
118
+ images_paths = [os.path.join(video_folder, image_name) for image_name in sorted(images_paths)]  # sort to keep frame order
119
+ if len(images_paths) > 5: # uniformly sample 5 frames
120
+     step = (len(images_paths) - 1) // (5 - 1)
121
+     images_paths = [images_paths[0]] + images_paths[1:-1][::step][1:] + [images_paths[-1]]
122
+ text_prompts = "<image>Please describe the video."
123
+ input_dict = {
124
+ 'video': images_paths,
125
+ 'text': text_prompts,
126
+ 'past_text': '',
127
+ 'mask_prompts': None,
128
+ 'processor': processor,
129
+ }
130
+ return_dict = model.predict_forward(**input_dict)
131
+ answer = return_dict["prediction"] # the text format answer
132
+
133
+
134
+ # for video chat with segmentation mask output
135
+ video_folder = "/PATH/TO/VIDEO_FOLDER"
136
+ images_paths = os.listdir(video_folder)
137
+ images_paths = [os.path.join(video_folder, image_name) for image_name in sorted(images_paths)]  # sort to keep frame order
138
+ text_prompts = "<image>Please segment the person."
139
+ input_dict = {
140
+ 'video': images_paths,
141
+ 'text': text_prompts,
142
+ 'past_text': '',
143
+ 'mask_prompts': None,
144
+ 'processor': processor,
145
+ }
146
+ return_dict = model.predict_forward(**input_dict)
147
+ answer = return_dict["prediction"] # the text format answer
148
+ masks = return_dict['prediction_masks'] # segmentation masks, list(np.array(n_frames, h, w), ...)
149
+ ```
150
+
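+ The following snippet is a minimal sketch (not part of the official example) of one way to overlay the returned masks on the input image. It assumes the image example with segmentation output above has already produced `image` and `masks`, where each entry of `masks` is a boolean array of shape `(1, h, w)` at the original image resolution.
+
+ ```python
+ import numpy as np
+ from PIL import Image
+
+ overlay = np.array(image).copy()  # H x W x 3 uint8 copy of the original image
+ for mask in masks:
+     m = mask[0].astype(bool)  # (h, w) boolean mask for one object
+     color = np.random.randint(0, 256, size=3)  # illustrative random color per object
+     overlay[m] = (0.5 * overlay[m] + 0.5 * color).astype(np.uint8)  # alpha-blend the masked region
+ Image.fromarray(overlay).save("masks_overlay.png")
+ ```
+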
151
+ ## Citation
152
+
153
+ If you find this project useful in your research, please consider citing:
154
+
155
+ ```BibTeX
156
+ @article{sa2va,
157
+ title={Sa2VA: Marrying SAM2 with LLaVA for Dense Grounded Understanding of Images and Videos},
158
+ author={Yuan, Haobo and Li, Xiangtai and Zhang, Tao and Huang, Zilong and Xu, Shilin and Ji, Shunping and Tong, Yunhai and Qi, Lu and Feng, Jiashi and Yang, Ming-Hsuan},
159
+ journal={arXiv preprint arXiv:2501.04001},
160
+ year={2025}
161
+ }
162
+ ```
added_tokens.json ADDED
@@ -0,0 +1,33 @@
1
+ {
2
+ "</p>": 151671,
3
+ "</think>": 151668,
4
+ "</tool_call>": 151658,
5
+ "</tool_response>": 151666,
6
+ "</vp>": 151673,
7
+ "<p>": 151670,
8
+ "<think>": 151667,
9
+ "<tool_call>": 151657,
10
+ "<tool_response>": 151665,
11
+ "<vp>": 151672,
12
+ "<|box_end|>": 151649,
13
+ "<|box_start|>": 151648,
14
+ "<|endoftext|>": 151643,
15
+ "<|file_sep|>": 151664,
16
+ "<|fim_middle|>": 151660,
17
+ "<|fim_pad|>": 151662,
18
+ "<|fim_prefix|>": 151659,
19
+ "<|fim_suffix|>": 151661,
20
+ "<|im_end|>": 151645,
21
+ "<|im_start|>": 151644,
22
+ "<|image_pad|>": 151655,
23
+ "<|object_ref_end|>": 151647,
24
+ "<|object_ref_start|>": 151646,
25
+ "<|quad_end|>": 151651,
26
+ "<|quad_start|>": 151650,
27
+ "<|repo_name|>": 151663,
28
+ "<|video_pad|>": 151656,
29
+ "<|vision_end|>": 151653,
30
+ "<|vision_pad|>": 151654,
31
+ "<|vision_start|>": 151652,
32
+ "[SEG]": 151669
33
+ }
chat_template.jinja ADDED
@@ -0,0 +1,120 @@
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0].role == 'system' %}
4
+ {%- if messages[0].content is string %}
5
+ {{- messages[0].content }}
6
+ {%- else %}
7
+ {%- for content in messages[0].content %}
8
+ {%- if 'text' in content %}
9
+ {{- content.text }}
10
+ {%- endif %}
11
+ {%- endfor %}
12
+ {%- endif %}
13
+ {{- '\n\n' }}
14
+ {%- endif %}
15
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
16
+ {%- for tool in tools %}
17
+ {{- "\n" }}
18
+ {{- tool | tojson }}
19
+ {%- endfor %}
20
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
21
+ {%- else %}
22
+ {%- if messages[0].role == 'system' %}
23
+ {{- '<|im_start|>system\n' }}
24
+ {%- if messages[0].content is string %}
25
+ {{- messages[0].content }}
26
+ {%- else %}
27
+ {%- for content in messages[0].content %}
28
+ {%- if 'text' in content %}
29
+ {{- content.text }}
30
+ {%- endif %}
31
+ {%- endfor %}
32
+ {%- endif %}
33
+ {{- '<|im_end|>\n' }}
34
+ {%- endif %}
35
+ {%- endif %}
36
+ {%- set image_count = namespace(value=0) %}
37
+ {%- set video_count = namespace(value=0) %}
38
+ {%- for message in messages %}
39
+ {%- if message.role == "user" %}
40
+ {{- '<|im_start|>' + message.role + '\n' }}
41
+ {%- if message.content is string %}
42
+ {{- message.content }}
43
+ {%- else %}
44
+ {%- for content in message.content %}
45
+ {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}
46
+ {%- set image_count.value = image_count.value + 1 %}
47
+ {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}
48
+ <|vision_start|><|image_pad|><|vision_end|>
49
+ {%- elif content.type == 'video' or 'video' in content %}
50
+ {%- set video_count.value = video_count.value + 1 %}
51
+ {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}
52
+ <|vision_start|><|video_pad|><|vision_end|>
53
+ {%- elif 'text' in content %}
54
+ {{- content.text }}
55
+ {%- endif %}
56
+ {%- endfor %}
57
+ {%- endif %}
58
+ {{- '<|im_end|>\n' }}
59
+ {%- elif message.role == "assistant" %}
60
+ {{- '<|im_start|>' + message.role + '\n' }}
61
+ {%- if message.content is string %}
62
+ {{- message.content }}
63
+ {%- else %}
64
+ {%- for content_item in message.content %}
65
+ {%- if 'text' in content_item %}
66
+ {{- content_item.text }}
67
+ {%- endif %}
68
+ {%- endfor %}
69
+ {%- endif %}
70
+ {%- if message.tool_calls %}
71
+ {%- for tool_call in message.tool_calls %}
72
+ {%- if (loop.first and message.content) or (not loop.first) %}
73
+ {{- '\n' }}
74
+ {%- endif %}
75
+ {%- if tool_call.function %}
76
+ {%- set tool_call = tool_call.function %}
77
+ {%- endif %}
78
+ {{- '<tool_call>\n{"name": "' }}
79
+ {{- tool_call.name }}
80
+ {{- '", "arguments": ' }}
81
+ {%- if tool_call.arguments is string %}
82
+ {{- tool_call.arguments }}
83
+ {%- else %}
84
+ {{- tool_call.arguments | tojson }}
85
+ {%- endif %}
86
+ {{- '}\n</tool_call>' }}
87
+ {%- endfor %}
88
+ {%- endif %}
89
+ {{- '<|im_end|>\n' }}
90
+ {%- elif message.role == "tool" %}
91
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
92
+ {{- '<|im_start|>user' }}
93
+ {%- endif %}
94
+ {{- '\n<tool_response>\n' }}
95
+ {%- if message.content is string %}
96
+ {{- message.content }}
97
+ {%- else %}
98
+ {%- for content in message.content %}
99
+ {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}
100
+ {%- set image_count.value = image_count.value + 1 %}
101
+ {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}
102
+ <|vision_start|><|image_pad|><|vision_end|>
103
+ {%- elif content.type == 'video' or 'video' in content %}
104
+ {%- set video_count.value = video_count.value + 1 %}
105
+ {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}
106
+ <|vision_start|><|video_pad|><|vision_end|>
107
+ {%- elif 'text' in content %}
108
+ {{- content.text }}
109
+ {%- endif %}
110
+ {%- endfor %}
111
+ {%- endif %}
112
+ {{- '\n</tool_response>' }}
113
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
114
+ {{- '<|im_end|>\n' }}
115
+ {%- endif %}
116
+ {%- endif %}
117
+ {%- endfor %}
118
+ {%- if add_generation_prompt %}
119
+ {{- '<|im_start|>assistant\n' }}
120
+ {%- endif %}
config.json ADDED
@@ -0,0 +1,69 @@
1
+ {
2
+ "architectures": [
3
+ "Sa2VAChatModelQwen"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_sa2va_chat.Sa2VAChatConfigQwen",
7
+ "AutoModel": "modeling_sa2va_qwen.Sa2VAChatModelQwen",
8
+ "AutoModelForCausalLM": "modeling_sa2va_qwen.Sa2VAChatModelQwen"
9
+ },
10
+ "dtype": "float32",
11
+ "image_token_id": 151655,
12
+ "model_type": "sa2va_chat",
13
+ "template": "qwen_chat",
14
+ "text_config": {
15
+ "attention_bias": false,
16
+ "attention_dropout": 0.0,
17
+ "bos_token_id": 151643,
18
+ "dtype": "bfloat16",
19
+ "eos_token_id": 151645,
20
+ "head_dim": 128,
21
+ "hidden_act": "silu",
22
+ "hidden_size": 2560,
23
+ "initializer_range": 0.02,
24
+ "intermediate_size": 9728,
25
+ "max_position_embeddings": 262144,
26
+ "model_type": "qwen3_vl_text",
27
+ "num_attention_heads": 32,
28
+ "num_hidden_layers": 36,
29
+ "num_key_value_heads": 8,
30
+ "rms_norm_eps": 1e-06,
31
+ "rope_scaling": {
32
+ "mrope_interleaved": true,
33
+ "mrope_section": [
34
+ 24,
35
+ 20,
36
+ 20
37
+ ],
38
+ "rope_type": "default"
39
+ },
40
+ "rope_theta": 5000000,
41
+ "use_cache": true,
42
+ "vocab_size": 151674
43
+ },
44
+ "tie_word_embeddings": false,
45
+ "transformers_version": "4.57.0",
46
+ "video_token_id": 151656,
47
+ "vision_config": {
48
+ "deepstack_visual_indexes": [
49
+ 5,
50
+ 11,
51
+ 17
52
+ ],
53
+ "depth": 24,
54
+ "hidden_act": "gelu_pytorch_tanh",
55
+ "hidden_size": 1024,
56
+ "in_channels": 3,
57
+ "initializer_range": 0.02,
58
+ "intermediate_size": 4096,
59
+ "model_type": "qwen3_vl",
60
+ "num_heads": 16,
61
+ "num_position_embeddings": 2304,
62
+ "out_hidden_size": 2560,
63
+ "patch_size": 16,
64
+ "spatial_merge_size": 2,
65
+ "temporal_patch_size": 2
66
+ },
67
+ "vision_end_token_id": 151653,
68
+ "vision_start_token_id": 151652
69
+ }
configuration_sa2va_chat.py ADDED
@@ -0,0 +1,34 @@
1
+ import copy
2
+
3
+ import transformers
4
+ from transformers import Qwen2Config
5
+ from transformers.configuration_utils import PretrainedConfig
6
+ from transformers.utils import logging
7
+
8
+ from transformers.models.qwen3_vl.configuration_qwen3_vl import Qwen3VLConfig
9
+
10
+ logger = logging.get_logger(__name__)
11
+
12
+ class Sa2VAChatConfigQwen(Qwen3VLConfig):
13
+ model_type = 'sa2va_chat'
14
+
15
+ def __init__(
16
+ self,
17
+ template=None,
18
+ **kwargs
19
+ ):
20
+ super().__init__(**kwargs)
21
+ self.template = template
22
+
23
+ def to_dict(self):
24
+ """
25
+ Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
26
+
27
+ Returns:
28
+ `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
29
+ """
30
+
31
+ output = super().to_dict()
32
+ output["template"] = self.template
33
+
34
+ return output
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed2f72ade8566319cf154c12f4765166fa4bc2524614aa446aef2a123dd28f09
3
+ size 4934328696
model-00002-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11aa35d66d696c6e4ec0b4246a9edb0f81049bd59e94cf4a376dd8713eddec71
3
+ size 4944311840
model-00003-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:991fc96652cddd8992e32aa1cfe61b4c153cca335b72c2c26341ff185735649c
3
+ size 4944311896
model-00004-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:600c3ea1ea7fe5421a30ef8b67bba119eeb10168818bfe57a9852a8b9572543b
3
+ size 4998026056
model-00005-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:35642f38b2551f8a32c3ee8bf9b579ebc71c9e7366601d4250e2407a20157ff4
3
+ size 407536000
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
modeling_sa2va_qwen.py ADDED
@@ -0,0 +1,260 @@
1
+ import torch
2
+ from torch import nn
3
+ from transformers import (AutoModel, GenerationConfig, Qwen3VLForConditionalGeneration,
4
+ Qwen2ForCausalLM)
5
+ from transformers.modeling_utils import PreTrainedModel
6
+
7
+ from .configuration_sa2va_chat import Sa2VAChatConfigQwen
8
+
9
+ from .sam2 import SAM2
10
+
11
+ import numpy as np
12
+ from torchvision.transforms.functional import to_pil_image
13
+
14
+ import torch.nn.functional as F
15
+
16
+ from qwen_vl_utils import process_vision_info
17
+
18
+
19
+
20
+ class DirectResize:
21
+ def __init__(self, target_length: int) -> None:
22
+ self.target_length = target_length
23
+
24
+ def apply_image(self, image: np.ndarray) -> np.ndarray:
25
+ """
26
+ Expects a numpy array with shape HxWxC in uint8 format.
27
+ """
28
+ img = to_pil_image(image, mode='RGB')
29
+ return np.array(img.resize((self.target_length, self.target_length)))
30
+
31
+ class Sa2VAChatModelQwen(PreTrainedModel):
32
+ config_class = Sa2VAChatConfigQwen
33
+ main_input_name = 'pixel_values'
34
+ base_model_prefix = 'language_model'
35
+ _no_split_modules = ['Qwen3VisionTransformerPretrainedModel', 'Qwen3VLDecoderLayer', 'SAM2']
36
+ _supports_flash_attn_2 = True
37
+ supports_gradient_checkpointing = True
38
+
39
+
40
+
41
+ def __init__(self, config: Sa2VAChatConfigQwen, model=None, use_flash_attn=True):
42
+ super().__init__(config)
43
+ self.extra_image_processor = DirectResize(target_length=1024, )
44
+
45
+ self.min_pixels = 512 * 28 * 28
46
+ self.max_pixels = 2048 * 28 * 28
47
+
48
+ self.torch_dtype = torch.bfloat16
49
+
50
+ if model is not None:
51
+ self.model=model
52
+ else:
53
+ self.model = Qwen3VLForConditionalGeneration(config)
54
+
55
+ llm_hidden_size = config.text_config.hidden_size
56
+
57
+ self.grounding_encoder = SAM2()
58
+ out_dim = self.grounding_encoder.hidden_dim
59
+ in_dim = llm_hidden_size
60
+ self.text_hidden_fcs = nn.Sequential(
61
+ nn.Linear(in_dim, in_dim), nn.ReLU(inplace=True),
62
+ nn.Linear(in_dim, out_dim), nn.Dropout(0.0)
63
+ )
64
+
65
+ @property
66
+ def lm_head(self):
67
+ return self.model.lm_head
68
+
69
+ def get_input_embeddings(self):
70
+ return self.model.get_input_embeddings()
71
+
72
+ def get_output_embeddings(self):
73
+ return self.model.get_output_embeddings()
74
+
75
+ def predict_forward(
76
+ self,
77
+ image=None,
78
+ video=None,
79
+ text=None,
80
+ past_text='',
81
+ mask_prompts=None,
82
+ tokenizer=None,
83
+ processor=None,
84
+ ):
85
+ assert processor is not None
86
+ self.processor = processor
87
+
88
+ self.seg_token_idx = self.processor.tokenizer.convert_tokens_to_ids('[SEG]')
89
+
90
+ text = text.replace('<image>', "")
91
+
92
+ if image is None and video is None and '<image>' not in past_text:
93
+
94
+ messages = [
95
+ {
96
+ "role": "user",
97
+ "content": [
98
+ {"type": "text", "text": past_text + text},
99
+ ],
100
+ }
101
+ ]
102
+
103
+ # Preparation for inference
104
+ processsed_text = self.processor.apply_chat_template(
105
+ messages, tokenize=False, add_generation_prompt=True
106
+ )
107
+
108
+ mm_inputs = self.processor(
109
+ text=[processsed_text],
110
+ images=None,
111
+ videos=None,
112
+ padding=True,
113
+ return_tensors="pt",
114
+ )
115
+ mm_inputs = mm_inputs.to(self.device)
116
+
117
+ ret_masks = []
118
+ else:
119
+ input_dict = {}
120
+ if video is not None:
121
+ pixel_values = []
122
+ extra_pixel_values = []
123
+ images = []
124
+ content = []
125
+ ori_image_size = video[0].size
126
+ for frame_idx, frame_image in enumerate(video):
127
+ # assert ori_image_size == frame_image.size
128
+ g_image = np.array(frame_image) # for grounding
129
+ g_image = self.extra_image_processor.apply_image(g_image)
130
+ g_image = torch.from_numpy(g_image).permute(2, 0, 1).contiguous()
131
+ extra_pixel_values.append(g_image)
132
+ if frame_idx < 5:
133
+ content.append({"type": "image", "image": frame_image},)
134
+
135
+
136
+ content.append({"type": "text", "text": text})
137
+ messages = [
138
+ {
139
+ "role": "user",
140
+ "content": content,
141
+ }
142
+ ]
143
+
144
+ # Preparation for inference
145
+ processsed_text = self.processor.apply_chat_template(
146
+ messages, tokenize=False, add_generation_prompt=True
147
+ )
148
+
149
+ image_inputs, video_inputs = process_vision_info(messages)
150
+ mm_inputs = self.processor(
151
+ text=[processsed_text],
152
+ images=image_inputs,
153
+ videos=video_inputs,
154
+ padding=True,
155
+ return_tensors="pt",
156
+ min_pixels=self.min_pixels,
157
+ max_pixels=self.max_pixels
158
+ )
159
+ mm_inputs = mm_inputs.to(self.device)
160
+
161
+ g_pixel_values = torch.stack([
162
+ self.grounding_encoder.preprocess_image(pixel) for pixel in extra_pixel_values
163
+ ]).to(self.torch_dtype)
164
+
165
+ num_frames = min(5, len(video))
166
+
167
+ else:
168
+ ori_image_size = image.size
169
+
170
+ # prepare grounding images
171
+ g_image = np.array(image) # for grounding
172
+ g_image = self.extra_image_processor.apply_image(g_image)
173
+ g_pixel_values = torch.from_numpy(g_image).permute(2, 0, 1).contiguous().to(self.torch_dtype)
174
+ extra_pixel_values = [g_pixel_values]
175
+ g_pixel_values = torch.stack([
176
+ self.grounding_encoder.preprocess_image(pixel) for pixel in extra_pixel_values
177
+ ]).to(self.torch_dtype)
178
+
179
+ messages = [
180
+ {
181
+ "role": "user",
182
+ "content": [
183
+ {
184
+ "type": "image",
185
+ "image": image,
186
+ },
187
+ {"type": "text", "text": text},
188
+ ],
189
+ }
190
+ ]
191
+
192
+ # Preparation for inference
193
+ processsed_text = self.processor.apply_chat_template(
194
+ messages, tokenize=False, add_generation_prompt=True
195
+ )
196
+
197
+ image_inputs, video_inputs = process_vision_info(messages)
198
+ mm_inputs = self.processor(
199
+ text=[processsed_text],
200
+ images=image_inputs,
201
+ videos=video_inputs,
202
+ padding=True,
203
+ return_tensors="pt",
204
+ min_pixels=self.min_pixels,
205
+ max_pixels=self.max_pixels
206
+ )
207
+ mm_inputs = mm_inputs.to(self.device)
208
+
209
+ num_frames = 1
210
+
211
+ input_dict['g_pixel_values'] = g_pixel_values
212
+ ret_masks = []
213
+
214
+ generate_output = self.model.generate(
215
+ **mm_inputs,
216
+ max_new_tokens=2048,
217
+ do_sample=False,
218
+ output_hidden_states=True,
219
+ return_dict_in_generate=True
220
+ )
221
+
222
+ generate_output_trimmed = [
223
+ out_ids[len(in_ids) :] for in_ids, out_ids in zip(mm_inputs.input_ids, generate_output.sequences)
224
+ ]
225
+
226
+ predict = self.processor.batch_decode(generate_output_trimmed, skip_special_tokens=False)[0].strip()
227
+
228
+ if image is None and video is None and '<image>' not in past_text:
229
+ return {'prediction': predict, 'prediction_masks': ret_masks, }
230
+
231
+ # if have seg result, find the seg hidden states
232
+ hidden_states = generate_output.hidden_states
233
+ last_hidden_states = [item[-1][0] for item in hidden_states]
234
+ last_hidden_states = torch.cat(last_hidden_states, dim=0)
235
+ seg_hidden_states = get_seg_hidden_states(
236
+ last_hidden_states, generate_output.sequences[0][:-1],
237
+ seg_id=self.seg_token_idx
238
+ )
239
+ all_seg_hidden_states = self.text_hidden_fcs(seg_hidden_states)
240
+
241
+ for seg_hidden_states in all_seg_hidden_states:
242
+ seg_hidden_states = seg_hidden_states.unsqueeze(0)
243
+ g_pixel_values = input_dict['g_pixel_values']
244
+ sam_states = self.grounding_encoder.get_sam2_embeddings(g_pixel_values)
245
+ pred_masks = self.grounding_encoder.language_embd_inference(sam_states, [seg_hidden_states] * num_frames)
246
+ w, h = ori_image_size
247
+ masks = F.interpolate(pred_masks, size=(h, w), mode='bilinear', align_corners=False)
248
+ masks = masks[:, 0]
249
+ masks = masks.sigmoid() > 0.5
250
+ masks = masks.cpu().numpy()
251
+ ret_masks.append(masks)
252
+
253
+ return {'prediction': predict, 'prediction_masks': ret_masks,}
254
+
255
+ def get_seg_hidden_states(hidden_states, output_ids, seg_id):
256
+ seg_mask = output_ids == seg_id
257
+ n_out = len(seg_mask)
258
+ if n_out == 0:
259
+ return hidden_states[0:0]
260
+ return hidden_states[-n_out:][seg_mask]
preprocessor_config.json ADDED
@@ -0,0 +1,39 @@
1
+ {
2
+ "crop_size": null,
3
+ "data_format": "channels_first",
4
+ "default_to_square": true,
5
+ "device": null,
6
+ "disable_grouping": null,
7
+ "do_center_crop": null,
8
+ "do_convert_rgb": true,
9
+ "do_normalize": true,
10
+ "do_pad": null,
11
+ "do_rescale": true,
12
+ "do_resize": true,
13
+ "image_mean": [
14
+ 0.5,
15
+ 0.5,
16
+ 0.5
17
+ ],
18
+ "image_processor_type": "Qwen2VLImageProcessorFast",
19
+ "image_std": [
20
+ 0.5,
21
+ 0.5,
22
+ 0.5
23
+ ],
24
+ "input_data_format": null,
25
+ "max_pixels": null,
26
+ "merge_size": 2,
27
+ "min_pixels": null,
28
+ "pad_size": null,
29
+ "patch_size": 16,
30
+ "processor_class": "Qwen3VLProcessor",
31
+ "resample": 3,
32
+ "rescale_factor": 0.00392156862745098,
33
+ "return_tensors": null,
34
+ "size": {
35
+ "longest_edge": 16777216,
36
+ "shortest_edge": 65536
37
+ },
38
+ "temporal_patch_size": 2
39
+ }
sam2.py ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
templates.py ADDED
@@ -0,0 +1,170 @@
1
+
2
+ PROMPT_TEMPLATE = dict(
3
+ default=dict(
4
+ SYSTEM='<|System|>:{system}\n',
5
+ INSTRUCTION='<|User|>:{input}\n<|Bot|>:',
6
+ SEP='\n'),
7
+ zephyr=dict(
8
+ SYSTEM='<|system|>\n{system}\n',
9
+ INSTRUCTION='<|user|>\n{input}\n<|assistant|>\n',
10
+ SEP='\n'),
11
+ internlm_chat=dict(
12
+ SYSTEM='<|System|>:{system}\n',
13
+ INSTRUCTION='<|User|>:{input}<eoh>\n<|Bot|>:',
14
+ SUFFIX='<eoa>',
15
+ SUFFIX_AS_EOS=True,
16
+ SEP='\n',
17
+ STOP_WORDS=['<eoa>']),
18
+ internlm2_chat=dict(
19
+ SYSTEM='<|im_start|>system\n{system}<|im_end|>\n',
20
+ INSTRUCTION=('<|im_start|>user\n{input}<|im_end|>\n'
21
+ '<|im_start|>assistant\n'),
22
+ SUFFIX='<|im_end|>',
23
+ SUFFIX_AS_EOS=True,
24
+ SEP='\n',
25
+ STOP_WORDS=['<|im_end|>']),
26
+ moss_sft=dict(
27
+ SYSTEM='{system}\n',
28
+ INSTRUCTION='<|Human|>: {input}<eoh>\n',
29
+ SEP='\n',
30
+ STOP_WORDS=['<eoc>', '<eom>']),
31
+ llama2_chat=dict(
32
+ SYSTEM=(
33
+ '[INST] <<SYS>>\n You are a helpful, respectful and honest '
34
+ 'assistant. Always answer as helpfully as possible, while being '
35
+ 'safe. Your answers should not include any harmful, unethical, '
36
+ 'racist, sexist, toxic, dangerous, or illegal content. Please '
37
+ 'ensure that your responses are socially unbiased and positive in '
38
+ 'nature.\n{system}\n<</SYS>>\n [/INST] '),
39
+ INSTRUCTION='[INST] {input} [/INST]',
40
+ SEP='\n'),
41
+ code_llama_chat=dict(
42
+ SYSTEM='{system}\n', INSTRUCTION='[INST] {input} [/INST]'),
43
+ chatglm2=dict(
44
+ SYSTEM='{system}\n',
45
+ INSTRUCTION='[Round {round}]\n\n问:{input}\n\n答:',
46
+ SEP='\n\n'),
47
+ chatglm3=dict(
48
+ SYSTEM='<|system|>\n{system}',
49
+ INSTRUCTION='<|user|>\n{input}<|assistant|>\n',
50
+ SEP='\n'),
51
+ qwen_chat=dict(
52
+ SYSTEM=('<|im_start|>system\n{system}<|im_end|>\n'),
53
+ INSTRUCTION=('<|im_start|>user\n{input}<|im_end|>\n'
54
+ '<|im_start|>assistant\n'),
55
+ SUFFIX='<|im_end|>',
56
+ SUFFIX_AS_EOS=True,
57
+ SEP='\n',
58
+ STOP_WORDS=['<|im_end|>', '<|endoftext|>']),
59
+ baichuan_chat=dict(
60
+ SYSTEM='{system}\n',
61
+ INSTRUCTION='<reserved_102>{input}<reserved_103>',
62
+ SEP='\n'),
63
+ baichuan2_chat=dict(
64
+ SYSTEM='{system}\n',
65
+ INSTRUCTION='<reserved_106>{input}<reserved_107>',
66
+ SEP='\n'),
67
+ wizardlm=dict(
68
+ SYSTEM=('A chat between a curious user and an artificial '
69
+ 'intelligence assistant. The assistant gives '
70
+ 'helpful, detailed, and polite answers to the '
71
+ 'user\'s questions. {system}\n '),
72
+ INSTRUCTION=('USER: {input} ASSISTANT:'),
73
+ SEP='\n'),
74
+ wizardcoder=dict(
75
+ SYSTEM=(
76
+ 'Below is an instruction that describes a task. '
77
+ 'Write a response that appropriately completes the request.\n\n'
78
+ '{system}\n '),
79
+ INSTRUCTION=('### Instruction:\n{input}\n\n### Response:'),
80
+ SEP='\n\n'),
81
+ vicuna=dict(
82
+ SYSTEM=('A chat between a curious user and an artificial '
83
+ 'intelligence assistant. The assistant gives '
84
+ 'helpful, detailed, and polite answers to the '
85
+ 'user\'s questions. {system}\n '),
86
+ INSTRUCTION=('USER: {input} ASSISTANT:'),
87
+ SEP='\n'),
88
+ deepseek_coder=dict(
89
+ SYSTEM=('You are an AI programming assistant, utilizing '
90
+ 'the DeepSeek Coder model, developed by DeepSeek'
91
+ 'Company, and you only answer questions related '
92
+ 'to computer science. For politically sensitive '
93
+ 'questions, security and privacy issues, and '
94
+ 'other non-computer science questions, you will '
95
+ 'refuse to answer. {system}\n'),
96
+ INSTRUCTION=('### Instruction:\n{input}\n### Response:\n'),
97
+ SEP='\n'),
98
+ # TODO: deprecation, v0.2.0
99
+ deepseekcoder=dict(
100
+ SYSTEM=('You are an AI programming assistant, utilizing '
101
+ 'the DeepSeek Coder model, developed by DeepSeek'
102
+ 'Company, and you only answer questions related '
103
+ 'to computer science. For politically sensitive '
104
+ 'questions, security and privacy issues, and '
105
+ 'other non-computer science questions, you will '
106
+ 'refuse to answer. {system}\n'),
107
+ INSTRUCTION=('### Instruction:\n{input}\n### Response:\n'),
108
+ SEP='\n'),
109
+ deepseek_moe=dict(
110
+ SYSTEM=('[INST] {system} [/INST]\n'),
111
+ INSTRUCTION=('[INST] {input} [/INST]'),
112
+ SEP='\n'),
113
+ deepseek_v2=dict(
114
+ SYSTEM='{system}\n\n',
115
+ INSTRUCTION='User: {input}\n\nAssistant: ',
116
+ SUFFIX='<|end▁of▁sentence|>',
117
+ SUFFIX_AS_EOS=True,
118
+ STOP_WORDS=['<|end▁of▁sentence|>']),
119
+ mistral=dict(
120
+ SYSTEM=('[INST] {system} [/INST]\n'),
121
+ INSTRUCTION=('[INST] {input} [/INST]'),
122
+ SEP='\n'),
123
+ mixtral=dict(
124
+ SYSTEM=('[INST] {system} [/INST]\n'),
125
+ INSTRUCTION=('[INST] {input} [/INST]'),
126
+ SEP='\n'),
127
+ minicpm=dict(INSTRUCTION=('<用户> {input} <AI>'), SEP='\n'),
128
+ minicpm3=dict(
129
+ SYSTEM=('<|im_start|>system\n{system}<|im_end|>\n'),
130
+ INSTRUCTION=('<|im_start|>user\n{input}<|im_end|>\n'
131
+ '<|im_start|>assistant\n'),
132
+ SUFFIX='<|im_end|>',
133
+ SUFFIX_AS_EOS=True,
134
+ SEP='\n',
135
+ STOP_WORDS=['<|im_end|>', '<|endoftext|>']),
136
+ gemma=dict(
137
+ # `system` field is extended by xtuner
138
+ SYSTEM=('<start_of_turn>system\n{system}<end_of_turn>\n'),
139
+ INSTRUCTION=('<start_of_turn>user\n{input}<end_of_turn>\n'
140
+ '<start_of_turn>model\n'),
141
+ SUFFIX='<end_of_turn>',
142
+ SUFFIX_AS_EOS=False,
143
+ SEP='\n',
144
+ STOP_WORDS=['<end_of_turn>']),
145
+ cohere_chat=dict(
146
+ SYSTEM=('<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{system}'
147
+ '<|END_OF_TURN_TOKEN|>'),
148
+ INSTRUCTION=(
149
+ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{input}<|END_OF_TURN_TOKEN|>'
150
+ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>'),
151
+ SUFFIX='<|END_OF_TURN_TOKEN|>',
152
+ SUFFIX_AS_EOS=True,
153
+ STOP_WORDS=['<|END_OF_TURN_TOKEN|>']),
154
+ llama3_chat=dict(
155
+ SYSTEM=('<|start_header_id|>system<|end_header_id|>\n\n'
156
+ '{system}<|eot_id|>'),
157
+ INSTRUCTION=(
158
+ '<|start_header_id|>user<|end_header_id|>\n\n{input}<|eot_id|>'
159
+ '<|start_header_id|>assistant<|end_header_id|>\n\n'),
160
+ SUFFIX='<|eot_id|>',
161
+ SUFFIX_AS_EOS=True,
162
+ STOP_WORDS=['<|eot_id|>']),
163
+ phi3_chat=dict(
164
+ SYSTEM='<|system|>\n{system}<|end|>\n',
165
+ INSTRUCTION='<|user|>\n{input}<|end|>\n<|assistant|>\n',
166
+ SUFFIX='<|end|>',
167
+ SUFFIX_AS_EOS=True,
168
+ SEP='\n',
169
+ STOP_WORDS=['<|end|>']),
170
+ )
tokenizer.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ade4f08c34dfcbd3b4d11082162940b785e27af18f916549aa4bc223155c91b8
3
+ size 11423560
tokenizer_config.json ADDED
@@ -0,0 +1,281 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ },
213
+ "151669": {
214
+ "content": "[SEG]",
215
+ "lstrip": false,
216
+ "normalized": false,
217
+ "rstrip": false,
218
+ "single_word": false,
219
+ "special": true
220
+ },
221
+ "151670": {
222
+ "content": "<p>",
223
+ "lstrip": false,
224
+ "normalized": false,
225
+ "rstrip": false,
226
+ "single_word": false,
227
+ "special": true
228
+ },
229
+ "151671": {
230
+ "content": "</p>",
231
+ "lstrip": false,
232
+ "normalized": false,
233
+ "rstrip": false,
234
+ "single_word": false,
235
+ "special": true
236
+ },
237
+ "151672": {
238
+ "content": "<vp>",
239
+ "lstrip": false,
240
+ "normalized": false,
241
+ "rstrip": false,
242
+ "single_word": false,
243
+ "special": true
244
+ },
245
+ "151673": {
246
+ "content": "</vp>",
247
+ "lstrip": false,
248
+ "normalized": false,
249
+ "rstrip": false,
250
+ "single_word": false,
251
+ "special": true
252
+ }
253
+ },
254
+ "additional_special_tokens": [
255
+ "<|im_start|>",
256
+ "<|im_end|>",
257
+ "<|object_ref_start|>",
258
+ "<|object_ref_end|>",
259
+ "<|box_start|>",
260
+ "<|box_end|>",
261
+ "<|quad_start|>",
262
+ "<|quad_end|>",
263
+ "<|vision_start|>",
264
+ "<|vision_end|>",
265
+ "<|vision_pad|>",
266
+ "<|image_pad|>",
267
+ "<|video_pad|>"
268
+ ],
269
+ "bos_token": null,
270
+ "clean_up_tokenization_spaces": false,
271
+ "eos_token": "<|im_end|>",
272
+ "errors": "replace",
273
+ "extra_special_tokens": {},
274
+ "model_max_length": 262144,
275
+ "pad_token": "<|endoftext|>",
276
+ "padding_side": "right",
277
+ "processor_class": "Qwen3VLProcessor",
278
+ "split_special_tokens": false,
279
+ "tokenizer_class": "Qwen2Tokenizer",
280
+ "unk_token": null
281
+ }
video_preprocessor_config.json ADDED
@@ -0,0 +1,41 @@
1
+ {
2
+ "crop_size": null,
3
+ "data_format": "channels_first",
4
+ "default_to_square": true,
5
+ "device": null,
6
+ "do_center_crop": null,
7
+ "do_convert_rgb": true,
8
+ "do_normalize": true,
9
+ "do_rescale": true,
10
+ "do_resize": true,
11
+ "do_sample_frames": true,
12
+ "fps": 2,
13
+ "image_mean": [
14
+ 0.5,
15
+ 0.5,
16
+ 0.5
17
+ ],
18
+ "image_std": [
19
+ 0.5,
20
+ 0.5,
21
+ 0.5
22
+ ],
23
+ "input_data_format": null,
24
+ "max_frames": 768,
25
+ "merge_size": 2,
26
+ "min_frames": 4,
27
+ "num_frames": null,
28
+ "pad_size": null,
29
+ "patch_size": 16,
30
+ "processor_class": "Qwen3VLProcessor",
31
+ "resample": 3,
32
+ "rescale_factor": 0.00392156862745098,
33
+ "return_metadata": false,
34
+ "size": {
35
+ "longest_edge": 25165824,
36
+ "shortest_edge": 4096
37
+ },
38
+ "temporal_patch_size": 2,
39
+ "video_metadata": null,
40
+ "video_processor_type": "Qwen3VLVideoProcessor"
41
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff