zhouyik committed (verified)
Commit 4ee9c8f · 1 Parent(s): ef577d4

Upload folder using huggingface_hub
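For reference, an upload like this is typically produced with huggingface_hub's folder-upload API. A minimal sketch of that kind of call (the repo id is a placeholder, and the local folder path is taken from the script below):

from huggingface_hub import HfApi

api = HfApi()
api.upload_folder(
    folder_path="./work_dirs/hf_pano_vlm",   # local folder exported for upload
    repo_id="zhouyik/hf_pano_vlm",           # placeholder repo id, not part of this commit
    repo_type="model",
    commit_message="Upload folder using huggingface_hub",
)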
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
__pycache__/configuration_intern_vit.cpython-310.pyc ADDED
Binary file (5.03 kB)

__pycache__/configuration_internlm2.cpython-310.pyc ADDED
Binary file (5.54 kB)

__pycache__/configuration_mask2former.cpython-310.pyc ADDED
Binary file (10.5 kB)

__pycache__/configuration_phi3.cpython-310.pyc ADDED
Binary file (8.67 kB)

__pycache__/configuration_sa2va_chat.cpython-310.pyc ADDED
Binary file (3.57 kB)

__pycache__/constants.cpython-310.pyc ADDED
Binary file (555 Bytes)

__pycache__/flash_attention.cpython-310.pyc ADDED
Binary file (2.71 kB)

__pycache__/mask2former.cpython-310.pyc ADDED
Binary file (17.5 kB)

__pycache__/modeling_intern_vit.cpython-310.pyc ADDED
Binary file (12.9 kB)

__pycache__/modeling_internlm2.cpython-310.pyc ADDED
Binary file (42.9 kB)

__pycache__/modeling_phi3.cpython-310.pyc ADDED
Binary file (44.2 kB)

__pycache__/modeling_sa2va_chat.cpython-310.pyc ADDED
Binary file (27.1 kB)

__pycache__/templates.cpython-310.pyc ADDED
Binary file (3.86 kB)
 
added_tokens.json ADDED
@@ -0,0 +1,140 @@
+ {
+ "</box>": 151673,
+ "</img>": 151666,
+ "</obj>": 151679,
+ "</p>": 151675,
+ "</quad>": 151669,
+ "</ref>": 151671,
+ "</tool_call>": 151658,
+ "<IMG_CONTEXT>": 151667,
+ "<OBJ_CONTEXT>": 151680,
+ "<box>": 151672,
+ "<img>": 151665,
+ "<obj>": 151678,
+ "<p>": 151674,
+ "<quad>": 151668,
+ "<ref>": 151670,
+ "<tool_call>": 151657,
+ "<|box_end|>": 151649,
+ "<|box_start|>": 151648,
+ "<|endoftext|>": 151643,
+ "<|file_sep|>": 151664,
+ "<|fim_middle|>": 151660,
+ "<|fim_pad|>": 151662,
+ "<|fim_prefix|>": 151659,
+ "<|fim_suffix|>": 151661,
+ "<|im_end|>": 151645,
+ "<|im_start|>": 151644,
+ "<|image_pad|>": 151655,
+ "<|object_ref_end|>": 151647,
+ "<|object_ref_start|>": 151646,
+ "<|quad_end|>": 151651,
+ "<|quad_start|>": 151650,
+ "<|repo_name|>": 151663,
+ "<|video_pad|>": 151656,
+ "<|vision_end|>": 151653,
+ "<|vision_pad|>": 151654,
+ "<|vision_start|>": 151652,
+ "[BG_CLS]": 151677,
+ "[CLS]": 151676,
+ "[SEG000]": 151681,
+ "[SEG001]": 151682,
+ "[SEG002]": 151683,
+ "[SEG003]": 151684,
+ "[SEG004]": 151685,
+ "[SEG005]": 151686,
+ "[SEG006]": 151687,
+ "[SEG007]": 151688,
+ "[SEG008]": 151689,
+ "[SEG009]": 151690,
+ "[SEG010]": 151691,
+ "[SEG011]": 151692,
+ "[SEG012]": 151693,
+ "[SEG013]": 151694,
+ "[SEG014]": 151695,
+ "[SEG015]": 151696,
+ "[SEG016]": 151697,
+ "[SEG017]": 151698,
+ "[SEG018]": 151699,
+ "[SEG019]": 151700,
+ "[SEG020]": 151701,
+ "[SEG021]": 151702,
+ "[SEG022]": 151703,
+ "[SEG023]": 151704,
+ "[SEG024]": 151705,
+ "[SEG025]": 151706,
+ "[SEG026]": 151707,
+ "[SEG027]": 151708,
+ "[SEG028]": 151709,
+ "[SEG029]": 151710,
+ "[SEG030]": 151711,
+ "[SEG031]": 151712,
+ "[SEG032]": 151713,
+ "[SEG033]": 151714,
+ "[SEG034]": 151715,
+ "[SEG035]": 151716,
+ "[SEG036]": 151717,
+ "[SEG037]": 151718,
+ "[SEG038]": 151719,
+ "[SEG039]": 151720,
+ "[SEG040]": 151721,
+ "[SEG041]": 151722,
+ "[SEG042]": 151723,
+ "[SEG043]": 151724,
+ "[SEG044]": 151725,
+ "[SEG045]": 151726,
+ "[SEG046]": 151727,
+ "[SEG047]": 151728,
+ "[SEG048]": 151729,
+ "[SEG049]": 151730,
+ "[SEG050]": 151731,
+ "[SEG051]": 151732,
+ "[SEG052]": 151733,
+ "[SEG053]": 151734,
+ "[SEG054]": 151735,
+ "[SEG055]": 151736,
+ "[SEG056]": 151737,
+ "[SEG057]": 151738,
+ "[SEG058]": 151739,
+ "[SEG059]": 151740,
+ "[SEG060]": 151741,
+ "[SEG061]": 151742,
+ "[SEG062]": 151743,
+ "[SEG063]": 151744,
+ "[SEG064]": 151745,
+ "[SEG065]": 151746,
+ "[SEG066]": 151747,
+ "[SEG067]": 151748,
+ "[SEG068]": 151749,
+ "[SEG069]": 151750,
+ "[SEG070]": 151751,
+ "[SEG071]": 151752,
+ "[SEG072]": 151753,
+ "[SEG073]": 151754,
+ "[SEG074]": 151755,
+ "[SEG075]": 151756,
+ "[SEG076]": 151757,
+ "[SEG077]": 151758,
+ "[SEG078]": 151759,
+ "[SEG079]": 151760,
+ "[SEG080]": 151761,
+ "[SEG081]": 151762,
+ "[SEG082]": 151763,
+ "[SEG083]": 151764,
+ "[SEG084]": 151765,
+ "[SEG085]": 151766,
+ "[SEG086]": 151767,
+ "[SEG087]": 151768,
+ "[SEG088]": 151769,
+ "[SEG089]": 151770,
+ "[SEG090]": 151771,
+ "[SEG091]": 151772,
+ "[SEG092]": 151773,
+ "[SEG093]": 151774,
+ "[SEG094]": 151775,
+ "[SEG095]": 151776,
+ "[SEG096]": 151777,
+ "[SEG097]": 151778,
+ "[SEG098]": 151779,
+ "[SEG099]": 151780
+ }
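
Once the tokenizer shipped in this upload is loaded, the new entries can be sanity-checked against the ids above. A small sketch, assuming the repo has been downloaded to a local folder (the path is a placeholder):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./hf_pano_vlm", trust_remote_code=True, use_fast=False)

# Ids taken from added_tokens.json above.
assert tokenizer.convert_tokens_to_ids("[SEG000]") == 151681
assert tokenizer.convert_tokens_to_ids("[CLS]") == 151676
assert tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>") == 151667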
chat_with_sa2va.py ADDED
@@ -0,0 +1,140 @@
+ import torch
+ import numpy as np
+ from PIL import Image
+
+ from transformers import AutoModel, AutoTokenizer, AutoImageProcessor
+
+ from types import MethodType
+ import matplotlib.colors as mplc
+ from detectron2.data import MetadataCatalog
+ from detectron2.data.detection_utils import read_image
+ from detectron2.utils.visualizer import ColorMode, GenericMask, Visualizer
+
+
+ def draw_instance_predictions_cache(self, labels, np_masks, jittering: bool = True):
+     """
+     Draw instance-level prediction results on an image.
+
+     Args:
+         labels: one text label per mask.
+         np_masks: binary masks (numpy arrays), one per instance.
+         jittering: if True, in color mode SEGMENTATION, randomly jitter the colors per class
+             to distinguish instances from the same class.
+
+     Returns:
+         output (VisImage): image object with visualizations.
+     """
+     boxes = None
+     scores = None
+     classes = None
+     keypoints = None
+
+     masks = [GenericMask(x, self.output.height, self.output.width) for x in np_masks]
+
+     if self._instance_mode == ColorMode.SEGMENTATION and self.metadata.get("thing_colors"):
+         colors = (
+             [self._jitter([x / 255 for x in self.metadata.thing_colors[c]]) for c in classes]
+             if jittering
+             else [
+                 tuple(mplc.to_rgb([x / 255 for x in self.metadata.thing_colors[c]]))
+                 for c in classes
+             ]
+         )
+         alpha = 0.8
+     else:
+         colors = None
+         alpha = 0.5
+
+     self.overlay_instances(
+         masks=masks,
+         boxes=boxes,
+         labels=labels,
+         keypoints=keypoints,
+         assigned_colors=colors,
+         alpha=alpha,
+     )
+     return self.output
+
+
+ def visualize(image_path, cat_masks, out_path, tags):
+     if tags is None:
+         left_tags = [f'{i}' for i in range(len(cat_masks))]
+     else:
+         left_tags = tags
+
+     unique_tags = list(set(left_tags))
+     text_prompt = ','.join(unique_tags)
+     metadata = MetadataCatalog.get("__unused_ape_" + text_prompt)
+     metadata.thing_classes = unique_tags
+     metadata.stuff_classes = unique_tags
+
+     result_masks = cat_masks
+     input_image = read_image(image_path, format="BGR")
+     visualizer = Visualizer(input_image[:, :, ::-1], metadata, instance_mode=ColorMode.IMAGE)
+     # Swap in the patched drawing method so plain numpy masks and text labels can be rendered.
+     visualizer.draw_instance_predictions = MethodType(draw_instance_predictions_cache, visualizer)
+     vis_output = visualizer.draw_instance_predictions(labels=left_tags, np_masks=result_masks)
+     output_image = vis_output.get_image()
+     output_image = Image.fromarray(output_image)
+
+     output_image.save(out_path)
+
+
+ path = "./work_dirs/hf_pano_vlm"
+ model = AutoModel.from_pretrained(
+     path,
+     torch_dtype=torch.bfloat16,
+     low_cpu_mem_usage=True,
+     use_flash_attn=True,
+     trust_remote_code=True).eval().cuda()
+
+ tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
+
+ image_path = "./FRAME02_ORI.jpg"
+ image = Image.open(image_path)
+ width, height = image.size
+
+ # Build a class prompt from the full COCO category list (kept for reference),
+ # then use a shorter hand-picked prompt for this demo.
+ from projects.llava_sam2.datasets.coco_category import COCO_CATEGORIES
+ coco_category_names = ""
+ for item in COCO_CATEGORIES:
+     class_name = item['name']
+     coco_category_names += f"<p>{class_name}</p> [CLS], "
+ coco_category_names = coco_category_names[:-2]
+ # question = f"<image>\nSegment from the class prompt: {coco_category_names}."
+ question = f"<image>\nSegment from the class prompt: <p>person</p> [CLS], <p>car</p> [CLS], <p>road</p> [CLS], <p>tree</p> [CLS], <p>building</p> [CLS], <p>ground</p> [CLS]."
+
+ m2f_processor = AutoImageProcessor.from_pretrained("./facebook/mask2former-swin-large-coco-panoptic", trust_remote_code=True)
+
+ chat_outputs = model.predict_forward(text=question, image=image, tokenizer=tokenizer, m2f_processor=m2f_processor)
+ answer = chat_outputs['prediction']
+ masks = chat_outputs['prediction_masks']
+
+ m2f_outputs = chat_outputs['m2f_outputs']
+ label_id_to_text = m2f_outputs['label_id_to_text']
+
+ post_m2f_outputs = model.post_process_panoptic_segmentation(
+     m2f_outputs['class_queries_logits'],
+     m2f_outputs['masks_queries_logits'],
+     target_sizes=[(height, width)],
+ )
+
+ print(f"user: {question}")
+ print(f"assistant: {answer}")
+
+ segmentation = post_m2f_outputs[0]['segmentation']
+ segments_info = post_m2f_outputs[0]['segments_info']
+ pano_masks, pano_tags = [], []
+ for item in segments_info:
+     mask = segmentation == item['id']
+     pano_masks.append(mask.unsqueeze(0).cpu().numpy())
+     pano_tags.append(label_id_to_text[item['label_id']])
+
+ pano_masks = np.concatenate(pano_masks, axis=0)
+
+ visualize(image_path, pano_masks, "./visualize_test_4.jpg", pano_tags)
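
As a follow-up to the script above, the per-segment results it builds (pano_masks, an (N, H, W) boolean array, paired one-to-one with pano_tags) can also be written out individually for inspection. A minimal sketch with PIL, assuming those two variables as produced above:

import numpy as np
from PIL import Image

# pano_masks: (N, H, W) bool array; pano_tags: list of N label strings (see script above).
for i, (mask, tag) in enumerate(zip(pano_masks, pano_tags)):
    mask_img = Image.fromarray(mask.astype(np.uint8) * 255)  # 0/255 grayscale mask
    safe_tag = tag.replace(" ", "_").replace(",", "")
    mask_img.save(f"./mask_{i:02d}_{safe_tag}.png")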
config.json ADDED
@@ -0,0 +1,2677 @@
1
+ {
2
+ "_commit_hash": null,
3
+ "_name_or_path": "./OpenGVLab/InternVL2_5-4B",
4
+ "architectures": [
5
+ "Sa2VAChatModel"
6
+ ],
7
+ "auto_map": {
8
+ "AutoConfig": "configuration_sa2va_chat.Sa2VAChatConfig",
9
+ "AutoModel": "modeling_sa2va_chat.Sa2VAChatModel",
10
+ "AutoModelForCausalLM": "modeling_sa2va_chat.Sa2VAChatModel"
11
+ },
12
+ "downsample_ratio": 0.5,
13
+ "dynamic_image_size": true,
14
+ "force_image_size": 448,
15
+ "hidden_size": 2048,
16
+ "llm_config": {
17
+ "_attn_implementation_autoset": false,
18
+ "_name_or_path": "Qwen/Qwen2.5-3B-Instruct",
19
+ "add_cross_attention": false,
20
+ "architectures": [
21
+ "Qwen2ForCausalLM"
22
+ ],
23
+ "attention_dropout": 0.0,
24
+ "bad_words_ids": null,
25
+ "begin_suppress_tokens": null,
26
+ "bos_token_id": 151643,
27
+ "chunk_size_feed_forward": 0,
28
+ "cross_attention_hidden_size": null,
29
+ "decoder_start_token_id": null,
30
+ "diversity_penalty": 0.0,
31
+ "do_sample": false,
32
+ "early_stopping": false,
33
+ "encoder_no_repeat_ngram_size": 0,
34
+ "eos_token_id": 151645,
35
+ "exponential_decay_length_penalty": null,
36
+ "finetuning_task": null,
37
+ "forced_bos_token_id": null,
38
+ "forced_eos_token_id": null,
39
+ "hidden_act": "silu",
40
+ "hidden_size": 2048,
41
+ "id2label": {
42
+ "0": "LABEL_0",
43
+ "1": "LABEL_1"
44
+ },
45
+ "initializer_range": 0.02,
46
+ "intermediate_size": 11008,
47
+ "is_decoder": false,
48
+ "is_encoder_decoder": false,
49
+ "label2id": {
50
+ "LABEL_0": 0,
51
+ "LABEL_1": 1
52
+ },
53
+ "length_penalty": 1.0,
54
+ "max_length": 20,
55
+ "max_position_embeddings": 32768,
56
+ "max_window_layers": 70,
57
+ "min_length": 0,
58
+ "model_type": "qwen2",
59
+ "no_repeat_ngram_size": 0,
60
+ "num_attention_heads": 16,
61
+ "num_beam_groups": 1,
62
+ "num_beams": 1,
63
+ "num_hidden_layers": 36,
64
+ "num_key_value_heads": 2,
65
+ "num_return_sequences": 1,
66
+ "output_attentions": false,
67
+ "output_hidden_states": false,
68
+ "output_scores": false,
69
+ "pad_token_id": null,
70
+ "prefix": null,
71
+ "problem_type": null,
72
+ "pruned_heads": {},
73
+ "remove_invalid_values": false,
74
+ "repetition_penalty": 1.0,
75
+ "return_dict": true,
76
+ "return_dict_in_generate": false,
77
+ "rms_norm_eps": 1e-06,
78
+ "rope_scaling": null,
79
+ "rope_theta": 1000000.0,
80
+ "sep_token_id": null,
81
+ "sliding_window": null,
82
+ "suppress_tokens": null,
83
+ "task_specific_params": null,
84
+ "temperature": 1.0,
85
+ "tf_legacy_loss": false,
86
+ "tie_encoder_decoder": false,
87
+ "tie_word_embeddings": false,
88
+ "tokenizer_class": null,
89
+ "top_k": 50,
90
+ "top_p": 1.0,
91
+ "torch_dtype": "bfloat16",
92
+ "torchscript": false,
93
+ "transformers_version": "4.47.0",
94
+ "typical_p": 1.0,
95
+ "use_bfloat16": true,
96
+ "use_cache": true,
97
+ "use_sliding_window": false,
98
+ "vocab_size": 151781
99
+ },
100
+ "m2f_config": {
101
+ "_attn_implementation_autoset": true,
102
+ "_name_or_path": "",
103
+ "activation_function": "relu",
104
+ "add_cross_attention": false,
105
+ "architectures": [
106
+ "Mask2FormerForUniversalSegmentation"
107
+ ],
108
+ "backbone": null,
109
+ "backbone_config": {
110
+ "_attn_implementation_autoset": false,
111
+ "_name_or_path": "",
112
+ "add_cross_attention": false,
113
+ "architectures": [
114
+ "SwinForImageClassification"
115
+ ],
116
+ "attention_probs_dropout_prob": 0.0,
117
+ "bad_words_ids": null,
118
+ "begin_suppress_tokens": null,
119
+ "bos_token_id": null,
120
+ "chunk_size_feed_forward": 0,
121
+ "cross_attention_hidden_size": null,
122
+ "decoder_start_token_id": null,
123
+ "depths": [
124
+ 2,
125
+ 2,
126
+ 18,
127
+ 2
128
+ ],
129
+ "diversity_penalty": 0.0,
130
+ "do_sample": false,
131
+ "drop_path_rate": 0.3,
132
+ "early_stopping": false,
133
+ "embed_dim": 192,
134
+ "encoder_no_repeat_ngram_size": 0,
135
+ "encoder_stride": 32,
136
+ "eos_token_id": null,
137
+ "exponential_decay_length_penalty": null,
138
+ "finetuning_task": null,
139
+ "forced_bos_token_id": null,
140
+ "forced_eos_token_id": null,
141
+ "hidden_act": "gelu",
142
+ "hidden_dropout_prob": 0.0,
143
+ "hidden_size": 1536,
144
+ "id2label": {
145
+ "0": "tench, Tinca tinca",
146
+ "1": "goldfish, Carassius auratus",
147
+ "2": "great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias",
148
+ "3": "tiger shark, Galeocerdo cuvieri",
149
+ "4": "hammerhead, hammerhead shark",
150
+ "5": "electric ray, crampfish, numbfish, torpedo",
151
+ "6": "stingray",
152
+ "7": "cock",
153
+ "8": "hen",
154
+ "9": "ostrich, Struthio camelus",
155
+ "10": "brambling, Fringilla montifringilla",
156
+ "11": "goldfinch, Carduelis carduelis",
157
+ "12": "house finch, linnet, Carpodacus mexicanus",
158
+ "13": "junco, snowbird",
159
+ "14": "indigo bunting, indigo finch, indigo bird, Passerina cyanea",
160
+ "15": "robin, American robin, Turdus migratorius",
161
+ "16": "bulbul",
162
+ "17": "jay",
163
+ "18": "magpie",
164
+ "19": "chickadee",
165
+ "20": "water ouzel, dipper",
166
+ "21": "kite",
167
+ "22": "bald eagle, American eagle, Haliaeetus leucocephalus",
168
+ "23": "vulture",
169
+ "24": "great grey owl, great gray owl, Strix nebulosa",
170
+ "25": "European fire salamander, Salamandra salamandra",
171
+ "26": "common newt, Triturus vulgaris",
172
+ "27": "eft",
173
+ "28": "spotted salamander, Ambystoma maculatum",
174
+ "29": "axolotl, mud puppy, Ambystoma mexicanum",
175
+ "30": "bullfrog, Rana catesbeiana",
176
+ "31": "tree frog, tree-frog",
177
+ "32": "tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui",
178
+ "33": "loggerhead, loggerhead turtle, Caretta caretta",
179
+ "34": "leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea",
180
+ "35": "mud turtle",
181
+ "36": "terrapin",
182
+ "37": "box turtle, box tortoise",
183
+ "38": "banded gecko",
184
+ "39": "common iguana, iguana, Iguana iguana",
185
+ "40": "American chameleon, anole, Anolis carolinensis",
186
+ "41": "whiptail, whiptail lizard",
187
+ "42": "agama",
188
+ "43": "frilled lizard, Chlamydosaurus kingi",
189
+ "44": "alligator lizard",
190
+ "45": "Gila monster, Heloderma suspectum",
191
+ "46": "green lizard, Lacerta viridis",
192
+ "47": "African chameleon, Chamaeleo chamaeleon",
193
+ "48": "Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis",
194
+ "49": "African crocodile, Nile crocodile, Crocodylus niloticus",
195
+ "50": "American alligator, Alligator mississipiensis",
196
+ "51": "triceratops",
197
+ "52": "thunder snake, worm snake, Carphophis amoenus",
198
+ "53": "ringneck snake, ring-necked snake, ring snake",
199
+ "54": "hognose snake, puff adder, sand viper",
200
+ "55": "green snake, grass snake",
201
+ "56": "king snake, kingsnake",
202
+ "57": "garter snake, grass snake",
203
+ "58": "water snake",
204
+ "59": "vine snake",
205
+ "60": "night snake, Hypsiglena torquata",
206
+ "61": "boa constrictor, Constrictor constrictor",
207
+ "62": "rock python, rock snake, Python sebae",
208
+ "63": "Indian cobra, Naja naja",
209
+ "64": "green mamba",
210
+ "65": "sea snake",
211
+ "66": "horned viper, cerastes, sand viper, horned asp, Cerastes cornutus",
212
+ "67": "diamondback, diamondback rattlesnake, Crotalus adamanteus",
213
+ "68": "sidewinder, horned rattlesnake, Crotalus cerastes",
214
+ "69": "trilobite",
215
+ "70": "harvestman, daddy longlegs, Phalangium opilio",
216
+ "71": "scorpion",
217
+ "72": "black and gold garden spider, Argiope aurantia",
218
+ "73": "barn spider, Araneus cavaticus",
219
+ "74": "garden spider, Aranea diademata",
220
+ "75": "black widow, Latrodectus mactans",
221
+ "76": "tarantula",
222
+ "77": "wolf spider, hunting spider",
223
+ "78": "tick",
224
+ "79": "centipede",
225
+ "80": "black grouse",
226
+ "81": "ptarmigan",
227
+ "82": "ruffed grouse, partridge, Bonasa umbellus",
228
+ "83": "prairie chicken, prairie grouse, prairie fowl",
229
+ "84": "peacock",
230
+ "85": "quail",
231
+ "86": "partridge",
232
+ "87": "African grey, African gray, Psittacus erithacus",
233
+ "88": "macaw",
234
+ "89": "sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita",
235
+ "90": "lorikeet",
236
+ "91": "coucal",
237
+ "92": "bee eater",
238
+ "93": "hornbill",
239
+ "94": "hummingbird",
240
+ "95": "jacamar",
241
+ "96": "toucan",
242
+ "97": "drake",
243
+ "98": "red-breasted merganser, Mergus serrator",
244
+ "99": "goose",
245
+ "100": "black swan, Cygnus atratus",
246
+ "101": "tusker",
247
+ "102": "echidna, spiny anteater, anteater",
248
+ "103": "platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus",
249
+ "104": "wallaby, brush kangaroo",
250
+ "105": "koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus",
251
+ "106": "wombat",
252
+ "107": "jellyfish",
253
+ "108": "sea anemone, anemone",
254
+ "109": "brain coral",
255
+ "110": "flatworm, platyhelminth",
256
+ "111": "nematode, nematode worm, roundworm",
257
+ "112": "conch",
258
+ "113": "snail",
259
+ "114": "slug",
260
+ "115": "sea slug, nudibranch",
261
+ "116": "chiton, coat-of-mail shell, sea cradle, polyplacophore",
262
+ "117": "chambered nautilus, pearly nautilus, nautilus",
263
+ "118": "Dungeness crab, Cancer magister",
264
+ "119": "rock crab, Cancer irroratus",
265
+ "120": "fiddler crab",
266
+ "121": "king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica",
267
+ "122": "American lobster, Northern lobster, Maine lobster, Homarus americanus",
268
+ "123": "spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish",
269
+ "124": "crayfish, crawfish, crawdad, crawdaddy",
270
+ "125": "hermit crab",
271
+ "126": "isopod",
272
+ "127": "white stork, Ciconia ciconia",
273
+ "128": "black stork, Ciconia nigra",
274
+ "129": "spoonbill",
275
+ "130": "flamingo",
276
+ "131": "little blue heron, Egretta caerulea",
277
+ "132": "American egret, great white heron, Egretta albus",
278
+ "133": "bittern",
279
+ "134": "crane",
280
+ "135": "limpkin, Aramus pictus",
281
+ "136": "European gallinule, Porphyrio porphyrio",
282
+ "137": "American coot, marsh hen, mud hen, water hen, Fulica americana",
283
+ "138": "bustard",
284
+ "139": "ruddy turnstone, Arenaria interpres",
285
+ "140": "red-backed sandpiper, dunlin, Erolia alpina",
286
+ "141": "redshank, Tringa totanus",
287
+ "142": "dowitcher",
288
+ "143": "oystercatcher, oyster catcher",
289
+ "144": "pelican",
290
+ "145": "king penguin, Aptenodytes patagonica",
291
+ "146": "albatross, mollymawk",
292
+ "147": "grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus",
293
+ "148": "killer whale, killer, orca, grampus, sea wolf, Orcinus orca",
294
+ "149": "dugong, Dugong dugon",
295
+ "150": "sea lion",
296
+ "151": "Chihuahua",
297
+ "152": "Japanese spaniel",
298
+ "153": "Maltese dog, Maltese terrier, Maltese",
299
+ "154": "Pekinese, Pekingese, Peke",
300
+ "155": "Shih-Tzu",
301
+ "156": "Blenheim spaniel",
302
+ "157": "papillon",
303
+ "158": "toy terrier",
304
+ "159": "Rhodesian ridgeback",
305
+ "160": "Afghan hound, Afghan",
306
+ "161": "basset, basset hound",
307
+ "162": "beagle",
308
+ "163": "bloodhound, sleuthhound",
309
+ "164": "bluetick",
310
+ "165": "black-and-tan coonhound",
311
+ "166": "Walker hound, Walker foxhound",
312
+ "167": "English foxhound",
313
+ "168": "redbone",
314
+ "169": "borzoi, Russian wolfhound",
315
+ "170": "Irish wolfhound",
316
+ "171": "Italian greyhound",
317
+ "172": "whippet",
318
+ "173": "Ibizan hound, Ibizan Podenco",
319
+ "174": "Norwegian elkhound, elkhound",
320
+ "175": "otterhound, otter hound",
321
+ "176": "Saluki, gazelle hound",
322
+ "177": "Scottish deerhound, deerhound",
323
+ "178": "Weimaraner",
324
+ "179": "Staffordshire bullterrier, Staffordshire bull terrier",
325
+ "180": "American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier",
326
+ "181": "Bedlington terrier",
327
+ "182": "Border terrier",
328
+ "183": "Kerry blue terrier",
329
+ "184": "Irish terrier",
330
+ "185": "Norfolk terrier",
331
+ "186": "Norwich terrier",
332
+ "187": "Yorkshire terrier",
333
+ "188": "wire-haired fox terrier",
334
+ "189": "Lakeland terrier",
335
+ "190": "Sealyham terrier, Sealyham",
336
+ "191": "Airedale, Airedale terrier",
337
+ "192": "cairn, cairn terrier",
338
+ "193": "Australian terrier",
339
+ "194": "Dandie Dinmont, Dandie Dinmont terrier",
340
+ "195": "Boston bull, Boston terrier",
341
+ "196": "miniature schnauzer",
342
+ "197": "giant schnauzer",
343
+ "198": "standard schnauzer",
344
+ "199": "Scotch terrier, Scottish terrier, Scottie",
345
+ "200": "Tibetan terrier, chrysanthemum dog",
346
+ "201": "silky terrier, Sydney silky",
347
+ "202": "soft-coated wheaten terrier",
348
+ "203": "West Highland white terrier",
349
+ "204": "Lhasa, Lhasa apso",
350
+ "205": "flat-coated retriever",
351
+ "206": "curly-coated retriever",
352
+ "207": "golden retriever",
353
+ "208": "Labrador retriever",
354
+ "209": "Chesapeake Bay retriever",
355
+ "210": "German short-haired pointer",
356
+ "211": "vizsla, Hungarian pointer",
357
+ "212": "English setter",
358
+ "213": "Irish setter, red setter",
359
+ "214": "Gordon setter",
360
+ "215": "Brittany spaniel",
361
+ "216": "clumber, clumber spaniel",
362
+ "217": "English springer, English springer spaniel",
363
+ "218": "Welsh springer spaniel",
364
+ "219": "cocker spaniel, English cocker spaniel, cocker",
365
+ "220": "Sussex spaniel",
366
+ "221": "Irish water spaniel",
367
+ "222": "kuvasz",
368
+ "223": "schipperke",
369
+ "224": "groenendael",
370
+ "225": "malinois",
371
+ "226": "briard",
372
+ "227": "kelpie",
373
+ "228": "komondor",
374
+ "229": "Old English sheepdog, bobtail",
375
+ "230": "Shetland sheepdog, Shetland sheep dog, Shetland",
376
+ "231": "collie",
377
+ "232": "Border collie",
378
+ "233": "Bouvier des Flandres, Bouviers des Flandres",
379
+ "234": "Rottweiler",
380
+ "235": "German shepherd, German shepherd dog, German police dog, alsatian",
381
+ "236": "Doberman, Doberman pinscher",
382
+ "237": "miniature pinscher",
383
+ "238": "Greater Swiss Mountain dog",
384
+ "239": "Bernese mountain dog",
385
+ "240": "Appenzeller",
386
+ "241": "EntleBucher",
387
+ "242": "boxer",
388
+ "243": "bull mastiff",
389
+ "244": "Tibetan mastiff",
390
+ "245": "French bulldog",
391
+ "246": "Great Dane",
392
+ "247": "Saint Bernard, St Bernard",
393
+ "248": "Eskimo dog, husky",
394
+ "249": "malamute, malemute, Alaskan malamute",
395
+ "250": "Siberian husky",
396
+ "251": "dalmatian, coach dog, carriage dog",
397
+ "252": "affenpinscher, monkey pinscher, monkey dog",
398
+ "253": "basenji",
399
+ "254": "pug, pug-dog",
400
+ "255": "Leonberg",
401
+ "256": "Newfoundland, Newfoundland dog",
402
+ "257": "Great Pyrenees",
403
+ "258": "Samoyed, Samoyede",
404
+ "259": "Pomeranian",
405
+ "260": "chow, chow chow",
406
+ "261": "keeshond",
407
+ "262": "Brabancon griffon",
408
+ "263": "Pembroke, Pembroke Welsh corgi",
409
+ "264": "Cardigan, Cardigan Welsh corgi",
410
+ "265": "toy poodle",
411
+ "266": "miniature poodle",
412
+ "267": "standard poodle",
413
+ "268": "Mexican hairless",
414
+ "269": "timber wolf, grey wolf, gray wolf, Canis lupus",
415
+ "270": "white wolf, Arctic wolf, Canis lupus tundrarum",
416
+ "271": "red wolf, maned wolf, Canis rufus, Canis niger",
417
+ "272": "coyote, prairie wolf, brush wolf, Canis latrans",
418
+ "273": "dingo, warrigal, warragal, Canis dingo",
419
+ "274": "dhole, Cuon alpinus",
420
+ "275": "African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus",
421
+ "276": "hyena, hyaena",
422
+ "277": "red fox, Vulpes vulpes",
423
+ "278": "kit fox, Vulpes macrotis",
424
+ "279": "Arctic fox, white fox, Alopex lagopus",
425
+ "280": "grey fox, gray fox, Urocyon cinereoargenteus",
426
+ "281": "tabby, tabby cat",
427
+ "282": "tiger cat",
428
+ "283": "Persian cat",
429
+ "284": "Siamese cat, Siamese",
430
+ "285": "Egyptian cat",
431
+ "286": "cougar, puma, catamount, mountain lion, painter, panther, Felis concolor",
432
+ "287": "lynx, catamount",
433
+ "288": "leopard, Panthera pardus",
434
+ "289": "snow leopard, ounce, Panthera uncia",
435
+ "290": "jaguar, panther, Panthera onca, Felis onca",
436
+ "291": "lion, king of beasts, Panthera leo",
437
+ "292": "tiger, Panthera tigris",
438
+ "293": "cheetah, chetah, Acinonyx jubatus",
439
+ "294": "brown bear, bruin, Ursus arctos",
440
+ "295": "American black bear, black bear, Ursus americanus, Euarctos americanus",
441
+ "296": "ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus",
442
+ "297": "sloth bear, Melursus ursinus, Ursus ursinus",
443
+ "298": "mongoose",
444
+ "299": "meerkat, mierkat",
445
+ "300": "tiger beetle",
446
+ "301": "ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle",
447
+ "302": "ground beetle, carabid beetle",
448
+ "303": "long-horned beetle, longicorn, longicorn beetle",
449
+ "304": "leaf beetle, chrysomelid",
450
+ "305": "dung beetle",
451
+ "306": "rhinoceros beetle",
452
+ "307": "weevil",
453
+ "308": "fly",
454
+ "309": "bee",
455
+ "310": "ant, emmet, pismire",
456
+ "311": "grasshopper, hopper",
457
+ "312": "cricket",
458
+ "313": "walking stick, walkingstick, stick insect",
459
+ "314": "cockroach, roach",
460
+ "315": "mantis, mantid",
461
+ "316": "cicada, cicala",
462
+ "317": "leafhopper",
463
+ "318": "lacewing, lacewing fly",
464
+ "319": "dragonfly, darning needle, devil's darning needle, sewing needle, snake feeder, snake doctor, mosquito hawk, skeeter hawk",
465
+ "320": "damselfly",
466
+ "321": "admiral",
467
+ "322": "ringlet, ringlet butterfly",
468
+ "323": "monarch, monarch butterfly, milkweed butterfly, Danaus plexippus",
469
+ "324": "cabbage butterfly",
470
+ "325": "sulphur butterfly, sulfur butterfly",
471
+ "326": "lycaenid, lycaenid butterfly",
472
+ "327": "starfish, sea star",
473
+ "328": "sea urchin",
474
+ "329": "sea cucumber, holothurian",
475
+ "330": "wood rabbit, cottontail, cottontail rabbit",
476
+ "331": "hare",
477
+ "332": "Angora, Angora rabbit",
478
+ "333": "hamster",
479
+ "334": "porcupine, hedgehog",
480
+ "335": "fox squirrel, eastern fox squirrel, Sciurus niger",
481
+ "336": "marmot",
482
+ "337": "beaver",
483
+ "338": "guinea pig, Cavia cobaya",
484
+ "339": "sorrel",
485
+ "340": "zebra",
486
+ "341": "hog, pig, grunter, squealer, Sus scrofa",
487
+ "342": "wild boar, boar, Sus scrofa",
488
+ "343": "warthog",
489
+ "344": "hippopotamus, hippo, river horse, Hippopotamus amphibius",
490
+ "345": "ox",
491
+ "346": "water buffalo, water ox, Asiatic buffalo, Bubalus bubalis",
492
+ "347": "bison",
493
+ "348": "ram, tup",
494
+ "349": "bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis canadensis",
495
+ "350": "ibex, Capra ibex",
496
+ "351": "hartebeest",
497
+ "352": "impala, Aepyceros melampus",
498
+ "353": "gazelle",
499
+ "354": "Arabian camel, dromedary, Camelus dromedarius",
500
+ "355": "llama",
501
+ "356": "weasel",
502
+ "357": "mink",
503
+ "358": "polecat, fitch, foulmart, foumart, Mustela putorius",
504
+ "359": "black-footed ferret, ferret, Mustela nigripes",
505
+ "360": "otter",
506
+ "361": "skunk, polecat, wood pussy",
507
+ "362": "badger",
508
+ "363": "armadillo",
509
+ "364": "three-toed sloth, ai, Bradypus tridactylus",
510
+ "365": "orangutan, orang, orangutang, Pongo pygmaeus",
511
+ "366": "gorilla, Gorilla gorilla",
512
+ "367": "chimpanzee, chimp, Pan troglodytes",
513
+ "368": "gibbon, Hylobates lar",
514
+ "369": "siamang, Hylobates syndactylus, Symphalangus syndactylus",
515
+ "370": "guenon, guenon monkey",
516
+ "371": "patas, hussar monkey, Erythrocebus patas",
517
+ "372": "baboon",
518
+ "373": "macaque",
519
+ "374": "langur",
520
+ "375": "colobus, colobus monkey",
521
+ "376": "proboscis monkey, Nasalis larvatus",
522
+ "377": "marmoset",
523
+ "378": "capuchin, ringtail, Cebus capucinus",
524
+ "379": "howler monkey, howler",
525
+ "380": "titi, titi monkey",
526
+ "381": "spider monkey, Ateles geoffroyi",
527
+ "382": "squirrel monkey, Saimiri sciureus",
528
+ "383": "Madagascar cat, ring-tailed lemur, Lemur catta",
529
+ "384": "indri, indris, Indri indri, Indri brevicaudatus",
530
+ "385": "Indian elephant, Elephas maximus",
531
+ "386": "African elephant, Loxodonta africana",
532
+ "387": "lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens",
533
+ "388": "giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca",
534
+ "389": "barracouta, snoek",
535
+ "390": "eel",
536
+ "391": "coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch",
537
+ "392": "rock beauty, Holocanthus tricolor",
538
+ "393": "anemone fish",
539
+ "394": "sturgeon",
540
+ "395": "gar, garfish, garpike, billfish, Lepisosteus osseus",
541
+ "396": "lionfish",
542
+ "397": "puffer, pufferfish, blowfish, globefish",
543
+ "398": "abacus",
544
+ "399": "abaya",
545
+ "400": "academic gown, academic robe, judge's robe",
546
+ "401": "accordion, piano accordion, squeeze box",
547
+ "402": "acoustic guitar",
548
+ "403": "aircraft carrier, carrier, flattop, attack aircraft carrier",
549
+ "404": "airliner",
550
+ "405": "airship, dirigible",
551
+ "406": "altar",
552
+ "407": "ambulance",
553
+ "408": "amphibian, amphibious vehicle",
554
+ "409": "analog clock",
555
+ "410": "apiary, bee house",
556
+ "411": "apron",
557
+ "412": "ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin",
558
+ "413": "assault rifle, assault gun",
559
+ "414": "backpack, back pack, knapsack, packsack, rucksack, haversack",
560
+ "415": "bakery, bakeshop, bakehouse",
561
+ "416": "balance beam, beam",
562
+ "417": "balloon",
563
+ "418": "ballpoint, ballpoint pen, ballpen, Biro",
564
+ "419": "Band Aid",
565
+ "420": "banjo",
566
+ "421": "bannister, banister, balustrade, balusters, handrail",
567
+ "422": "barbell",
568
+ "423": "barber chair",
569
+ "424": "barbershop",
570
+ "425": "barn",
571
+ "426": "barometer",
572
+ "427": "barrel, cask",
573
+ "428": "barrow, garden cart, lawn cart, wheelbarrow",
574
+ "429": "baseball",
575
+ "430": "basketball",
576
+ "431": "bassinet",
577
+ "432": "bassoon",
578
+ "433": "bathing cap, swimming cap",
579
+ "434": "bath towel",
580
+ "435": "bathtub, bathing tub, bath, tub",
581
+ "436": "beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon",
582
+ "437": "beacon, lighthouse, beacon light, pharos",
583
+ "438": "beaker",
584
+ "439": "bearskin, busby, shako",
585
+ "440": "beer bottle",
586
+ "441": "beer glass",
587
+ "442": "bell cote, bell cot",
588
+ "443": "bib",
589
+ "444": "bicycle-built-for-two, tandem bicycle, tandem",
590
+ "445": "bikini, two-piece",
591
+ "446": "binder, ring-binder",
592
+ "447": "binoculars, field glasses, opera glasses",
593
+ "448": "birdhouse",
594
+ "449": "boathouse",
595
+ "450": "bobsled, bobsleigh, bob",
596
+ "451": "bolo tie, bolo, bola tie, bola",
597
+ "452": "bonnet, poke bonnet",
598
+ "453": "bookcase",
599
+ "454": "bookshop, bookstore, bookstall",
600
+ "455": "bottlecap",
601
+ "456": "bow",
602
+ "457": "bow tie, bow-tie, bowtie",
603
+ "458": "brass, memorial tablet, plaque",
604
+ "459": "brassiere, bra, bandeau",
605
+ "460": "breakwater, groin, groyne, mole, bulwark, seawall, jetty",
606
+ "461": "breastplate, aegis, egis",
607
+ "462": "broom",
608
+ "463": "bucket, pail",
609
+ "464": "buckle",
610
+ "465": "bulletproof vest",
611
+ "466": "bullet train, bullet",
612
+ "467": "butcher shop, meat market",
613
+ "468": "cab, hack, taxi, taxicab",
614
+ "469": "caldron, cauldron",
615
+ "470": "candle, taper, wax light",
616
+ "471": "cannon",
617
+ "472": "canoe",
618
+ "473": "can opener, tin opener",
619
+ "474": "cardigan",
620
+ "475": "car mirror",
621
+ "476": "carousel, carrousel, merry-go-round, roundabout, whirligig",
622
+ "477": "carpenter's kit, tool kit",
623
+ "478": "carton",
624
+ "479": "car wheel",
625
+ "480": "cash machine, cash dispenser, automated teller machine, automatic teller machine, automated teller, automatic teller, ATM",
626
+ "481": "cassette",
627
+ "482": "cassette player",
628
+ "483": "castle",
629
+ "484": "catamaran",
630
+ "485": "CD player",
631
+ "486": "cello, violoncello",
632
+ "487": "cellular telephone, cellular phone, cellphone, cell, mobile phone",
633
+ "488": "chain",
634
+ "489": "chainlink fence",
635
+ "490": "chain mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour",
636
+ "491": "chain saw, chainsaw",
637
+ "492": "chest",
638
+ "493": "chiffonier, commode",
639
+ "494": "chime, bell, gong",
640
+ "495": "china cabinet, china closet",
641
+ "496": "Christmas stocking",
642
+ "497": "church, church building",
643
+ "498": "cinema, movie theater, movie theatre, movie house, picture palace",
644
+ "499": "cleaver, meat cleaver, chopper",
645
+ "500": "cliff dwelling",
646
+ "501": "cloak",
647
+ "502": "clog, geta, patten, sabot",
648
+ "503": "cocktail shaker",
649
+ "504": "coffee mug",
650
+ "505": "coffeepot",
651
+ "506": "coil, spiral, volute, whorl, helix",
652
+ "507": "combination lock",
653
+ "508": "computer keyboard, keypad",
654
+ "509": "confectionery, confectionary, candy store",
655
+ "510": "container ship, containership, container vessel",
656
+ "511": "convertible",
657
+ "512": "corkscrew, bottle screw",
658
+ "513": "cornet, horn, trumpet, trump",
659
+ "514": "cowboy boot",
660
+ "515": "cowboy hat, ten-gallon hat",
661
+ "516": "cradle",
662
+ "517": "crane",
663
+ "518": "crash helmet",
664
+ "519": "crate",
665
+ "520": "crib, cot",
666
+ "521": "Crock Pot",
667
+ "522": "croquet ball",
668
+ "523": "crutch",
669
+ "524": "cuirass",
670
+ "525": "dam, dike, dyke",
671
+ "526": "desk",
672
+ "527": "desktop computer",
673
+ "528": "dial telephone, dial phone",
674
+ "529": "diaper, nappy, napkin",
675
+ "530": "digital clock",
676
+ "531": "digital watch",
677
+ "532": "dining table, board",
678
+ "533": "dishrag, dishcloth",
679
+ "534": "dishwasher, dish washer, dishwashing machine",
680
+ "535": "disk brake, disc brake",
681
+ "536": "dock, dockage, docking facility",
682
+ "537": "dogsled, dog sled, dog sleigh",
683
+ "538": "dome",
684
+ "539": "doormat, welcome mat",
685
+ "540": "drilling platform, offshore rig",
686
+ "541": "drum, membranophone, tympan",
687
+ "542": "drumstick",
688
+ "543": "dumbbell",
689
+ "544": "Dutch oven",
690
+ "545": "electric fan, blower",
691
+ "546": "electric guitar",
692
+ "547": "electric locomotive",
693
+ "548": "entertainment center",
694
+ "549": "envelope",
695
+ "550": "espresso maker",
696
+ "551": "face powder",
697
+ "552": "feather boa, boa",
698
+ "553": "file, file cabinet, filing cabinet",
699
+ "554": "fireboat",
700
+ "555": "fire engine, fire truck",
701
+ "556": "fire screen, fireguard",
702
+ "557": "flagpole, flagstaff",
703
+ "558": "flute, transverse flute",
704
+ "559": "folding chair",
705
+ "560": "football helmet",
706
+ "561": "forklift",
707
+ "562": "fountain",
708
+ "563": "fountain pen",
709
+ "564": "four-poster",
710
+ "565": "freight car",
711
+ "566": "French horn, horn",
712
+ "567": "frying pan, frypan, skillet",
713
+ "568": "fur coat",
714
+ "569": "garbage truck, dustcart",
715
+ "570": "gasmask, respirator, gas helmet",
716
+ "571": "gas pump, gasoline pump, petrol pump, island dispenser",
717
+ "572": "goblet",
718
+ "573": "go-kart",
719
+ "574": "golf ball",
720
+ "575": "golfcart, golf cart",
721
+ "576": "gondola",
722
+ "577": "gong, tam-tam",
723
+ "578": "gown",
724
+ "579": "grand piano, grand",
725
+ "580": "greenhouse, nursery, glasshouse",
726
+ "581": "grille, radiator grille",
727
+ "582": "grocery store, grocery, food market, market",
728
+ "583": "guillotine",
729
+ "584": "hair slide",
730
+ "585": "hair spray",
731
+ "586": "half track",
732
+ "587": "hammer",
733
+ "588": "hamper",
734
+ "589": "hand blower, blow dryer, blow drier, hair dryer, hair drier",
735
+ "590": "hand-held computer, hand-held microcomputer",
736
+ "591": "handkerchief, hankie, hanky, hankey",
737
+ "592": "hard disc, hard disk, fixed disk",
738
+ "593": "harmonica, mouth organ, harp, mouth harp",
739
+ "594": "harp",
740
+ "595": "harvester, reaper",
741
+ "596": "hatchet",
742
+ "597": "holster",
743
+ "598": "home theater, home theatre",
744
+ "599": "honeycomb",
745
+ "600": "hook, claw",
746
+ "601": "hoopskirt, crinoline",
747
+ "602": "horizontal bar, high bar",
748
+ "603": "horse cart, horse-cart",
749
+ "604": "hourglass",
750
+ "605": "iPod",
751
+ "606": "iron, smoothing iron",
752
+ "607": "jack-o'-lantern",
753
+ "608": "jean, blue jean, denim",
754
+ "609": "jeep, landrover",
755
+ "610": "jersey, T-shirt, tee shirt",
756
+ "611": "jigsaw puzzle",
757
+ "612": "jinrikisha, ricksha, rickshaw",
758
+ "613": "joystick",
759
+ "614": "kimono",
760
+ "615": "knee pad",
761
+ "616": "knot",
762
+ "617": "lab coat, laboratory coat",
763
+ "618": "ladle",
764
+ "619": "lampshade, lamp shade",
765
+ "620": "laptop, laptop computer",
766
+ "621": "lawn mower, mower",
767
+ "622": "lens cap, lens cover",
768
+ "623": "letter opener, paper knife, paperknife",
769
+ "624": "library",
770
+ "625": "lifeboat",
771
+ "626": "lighter, light, igniter, ignitor",
772
+ "627": "limousine, limo",
773
+ "628": "liner, ocean liner",
774
+ "629": "lipstick, lip rouge",
775
+ "630": "Loafer",
776
+ "631": "lotion",
777
+ "632": "loudspeaker, speaker, speaker unit, loudspeaker system, speaker system",
778
+ "633": "loupe, jeweler's loupe",
779
+ "634": "lumbermill, sawmill",
780
+ "635": "magnetic compass",
781
+ "636": "mailbag, postbag",
782
+ "637": "mailbox, letter box",
783
+ "638": "maillot",
784
+ "639": "maillot, tank suit",
785
+ "640": "manhole cover",
786
+ "641": "maraca",
787
+ "642": "marimba, xylophone",
788
+ "643": "mask",
789
+ "644": "matchstick",
790
+ "645": "maypole",
791
+ "646": "maze, labyrinth",
792
+ "647": "measuring cup",
793
+ "648": "medicine chest, medicine cabinet",
794
+ "649": "megalith, megalithic structure",
795
+ "650": "microphone, mike",
796
+ "651": "microwave, microwave oven",
797
+ "652": "military uniform",
798
+ "653": "milk can",
799
+ "654": "minibus",
800
+ "655": "miniskirt, mini",
801
+ "656": "minivan",
802
+ "657": "missile",
803
+ "658": "mitten",
804
+ "659": "mixing bowl",
805
+ "660": "mobile home, manufactured home",
806
+ "661": "Model T",
807
+ "662": "modem",
808
+ "663": "monastery",
809
+ "664": "monitor",
810
+ "665": "moped",
811
+ "666": "mortar",
812
+ "667": "mortarboard",
813
+ "668": "mosque",
814
+ "669": "mosquito net",
815
+ "670": "motor scooter, scooter",
816
+ "671": "mountain bike, all-terrain bike, off-roader",
817
+ "672": "mountain tent",
818
+ "673": "mouse, computer mouse",
819
+ "674": "mousetrap",
820
+ "675": "moving van",
821
+ "676": "muzzle",
822
+ "677": "nail",
823
+ "678": "neck brace",
824
+ "679": "necklace",
825
+ "680": "nipple",
826
+ "681": "notebook, notebook computer",
827
+ "682": "obelisk",
828
+ "683": "oboe, hautboy, hautbois",
829
+ "684": "ocarina, sweet potato",
830
+ "685": "odometer, hodometer, mileometer, milometer",
831
+ "686": "oil filter",
832
+ "687": "organ, pipe organ",
833
+ "688": "oscilloscope, scope, cathode-ray oscilloscope, CRO",
834
+ "689": "overskirt",
835
+ "690": "oxcart",
836
+ "691": "oxygen mask",
837
+ "692": "packet",
838
+ "693": "paddle, boat paddle",
839
+ "694": "paddlewheel, paddle wheel",
840
+ "695": "padlock",
841
+ "696": "paintbrush",
842
+ "697": "pajama, pyjama, pj's, jammies",
843
+ "698": "palace",
844
+ "699": "panpipe, pandean pipe, syrinx",
845
+ "700": "paper towel",
846
+ "701": "parachute, chute",
847
+ "702": "parallel bars, bars",
848
+ "703": "park bench",
849
+ "704": "parking meter",
850
+ "705": "passenger car, coach, carriage",
851
+ "706": "patio, terrace",
852
+ "707": "pay-phone, pay-station",
853
+ "708": "pedestal, plinth, footstall",
854
+ "709": "pencil box, pencil case",
855
+ "710": "pencil sharpener",
856
+ "711": "perfume, essence",
857
+ "712": "Petri dish",
858
+ "713": "photocopier",
859
+ "714": "pick, plectrum, plectron",
860
+ "715": "pickelhaube",
861
+ "716": "picket fence, paling",
862
+ "717": "pickup, pickup truck",
863
+ "718": "pier",
864
+ "719": "piggy bank, penny bank",
865
+ "720": "pill bottle",
866
+ "721": "pillow",
867
+ "722": "ping-pong ball",
868
+ "723": "pinwheel",
869
+ "724": "pirate, pirate ship",
870
+ "725": "pitcher, ewer",
871
+ "726": "plane, carpenter's plane, woodworking plane",
872
+ "727": "planetarium",
873
+ "728": "plastic bag",
874
+ "729": "plate rack",
875
+ "730": "plow, plough",
876
+ "731": "plunger, plumber's helper",
877
+ "732": "Polaroid camera, Polaroid Land camera",
878
+ "733": "pole",
879
+ "734": "police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria",
880
+ "735": "poncho",
881
+ "736": "pool table, billiard table, snooker table",
882
+ "737": "pop bottle, soda bottle",
883
+ "738": "pot, flowerpot",
884
+ "739": "potter's wheel",
885
+ "740": "power drill",
886
+ "741": "prayer rug, prayer mat",
887
+ "742": "printer",
888
+ "743": "prison, prison house",
889
+ "744": "projectile, missile",
890
+ "745": "projector",
891
+ "746": "puck, hockey puck",
892
+ "747": "punching bag, punch bag, punching ball, punchball",
893
+ "748": "purse",
894
+ "749": "quill, quill pen",
895
+ "750": "quilt, comforter, comfort, puff",
896
+ "751": "racer, race car, racing car",
897
+ "752": "racket, racquet",
898
+ "753": "radiator",
899
+ "754": "radio, wireless",
900
+ "755": "radio telescope, radio reflector",
901
+ "756": "rain barrel",
902
+ "757": "recreational vehicle, RV, R.V.",
903
+ "758": "reel",
904
+ "759": "reflex camera",
905
+ "760": "refrigerator, icebox",
906
+ "761": "remote control, remote",
907
+ "762": "restaurant, eating house, eating place, eatery",
908
+ "763": "revolver, six-gun, six-shooter",
909
+ "764": "rifle",
910
+ "765": "rocking chair, rocker",
911
+ "766": "rotisserie",
912
+ "767": "rubber eraser, rubber, pencil eraser",
913
+ "768": "rugby ball",
914
+ "769": "rule, ruler",
915
+ "770": "running shoe",
916
+ "771": "safe",
917
+ "772": "safety pin",
918
+ "773": "saltshaker, salt shaker",
919
+ "774": "sandal",
920
+ "775": "sarong",
921
+ "776": "sax, saxophone",
922
+ "777": "scabbard",
923
+ "778": "scale, weighing machine",
924
+ "779": "school bus",
925
+ "780": "schooner",
926
+ "781": "scoreboard",
927
+ "782": "screen, CRT screen",
928
+ "783": "screw",
929
+ "784": "screwdriver",
930
+ "785": "seat belt, seatbelt",
931
+ "786": "sewing machine",
932
+ "787": "shield, buckler",
933
+ "788": "shoe shop, shoe-shop, shoe store",
934
+ "789": "shoji",
935
+ "790": "shopping basket",
936
+ "791": "shopping cart",
937
+ "792": "shovel",
938
+ "793": "shower cap",
939
+ "794": "shower curtain",
940
+ "795": "ski",
941
+ "796": "ski mask",
942
+ "797": "sleeping bag",
943
+ "798": "slide rule, slipstick",
944
+ "799": "sliding door",
945
+ "800": "slot, one-armed bandit",
946
+ "801": "snorkel",
947
+ "802": "snowmobile",
948
+ "803": "snowplow, snowplough",
949
+ "804": "soap dispenser",
950
+ "805": "soccer ball",
951
+ "806": "sock",
952
+ "807": "solar dish, solar collector, solar furnace",
953
+ "808": "sombrero",
954
+ "809": "soup bowl",
955
+ "810": "space bar",
956
+ "811": "space heater",
957
+ "812": "space shuttle",
958
+ "813": "spatula",
959
+ "814": "speedboat",
960
+ "815": "spider web, spider's web",
961
+ "816": "spindle",
962
+ "817": "sports car, sport car",
963
+ "818": "spotlight, spot",
964
+ "819": "stage",
965
+ "820": "steam locomotive",
966
+ "821": "steel arch bridge",
967
+ "822": "steel drum",
968
+ "823": "stethoscope",
969
+ "824": "stole",
970
+ "825": "stone wall",
971
+ "826": "stopwatch, stop watch",
972
+ "827": "stove",
973
+ "828": "strainer",
974
+ "829": "streetcar, tram, tramcar, trolley, trolley car",
975
+ "830": "stretcher",
976
+ "831": "studio couch, day bed",
977
+ "832": "stupa, tope",
978
+ "833": "submarine, pigboat, sub, U-boat",
979
+ "834": "suit, suit of clothes",
980
+ "835": "sundial",
981
+ "836": "sunglass",
982
+ "837": "sunglasses, dark glasses, shades",
983
+ "838": "sunscreen, sunblock, sun blocker",
984
+ "839": "suspension bridge",
985
+ "840": "swab, swob, mop",
986
+ "841": "sweatshirt",
987
+ "842": "swimming trunks, bathing trunks",
988
+ "843": "swing",
989
+ "844": "switch, electric switch, electrical switch",
990
+ "845": "syringe",
991
+ "846": "table lamp",
992
+ "847": "tank, army tank, armored combat vehicle, armoured combat vehicle",
993
+ "848": "tape player",
994
+ "849": "teapot",
995
+ "850": "teddy, teddy bear",
996
+ "851": "television, television system",
997
+ "852": "tennis ball",
998
+ "853": "thatch, thatched roof",
999
+ "854": "theater curtain, theatre curtain",
1000
+ "855": "thimble",
1001
+ "856": "thresher, thrasher, threshing machine",
1002
+ "857": "throne",
1003
+ "858": "tile roof",
1004
+ "859": "toaster",
1005
+ "860": "tobacco shop, tobacconist shop, tobacconist",
1006
+ "861": "toilet seat",
1007
+ "862": "torch",
1008
+ "863": "totem pole",
1009
+ "864": "tow truck, tow car, wrecker",
1010
+ "865": "toyshop",
1011
+ "866": "tractor",
1012
+ "867": "trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi",
1013
+ "868": "tray",
1014
+ "869": "trench coat",
1015
+ "870": "tricycle, trike, velocipede",
1016
+ "871": "trimaran",
1017
+ "872": "tripod",
1018
+ "873": "triumphal arch",
1019
+ "874": "trolleybus, trolley coach, trackless trolley",
1020
+ "875": "trombone",
1021
+ "876": "tub, vat",
1022
+ "877": "turnstile",
1023
+ "878": "typewriter keyboard",
1024
+ "879": "umbrella",
1025
+ "880": "unicycle, monocycle",
1026
+ "881": "upright, upright piano",
1027
+ "882": "vacuum, vacuum cleaner",
1028
+ "883": "vase",
1029
+ "884": "vault",
1030
+ "885": "velvet",
1031
+ "886": "vending machine",
1032
+ "887": "vestment",
1033
+ "888": "viaduct",
1034
+ "889": "violin, fiddle",
1035
+ "890": "volleyball",
1036
+ "891": "waffle iron",
1037
+ "892": "wall clock",
1038
+ "893": "wallet, billfold, notecase, pocketbook",
1039
+ "894": "wardrobe, closet, press",
1040
+ "895": "warplane, military plane",
1041
+ "896": "washbasin, handbasin, washbowl, lavabo, wash-hand basin",
1042
+ "897": "washer, automatic washer, washing machine",
1043
+ "898": "water bottle",
1044
+ "899": "water jug",
1045
+ "900": "water tower",
1046
+ "901": "whiskey jug",
1047
+ "902": "whistle",
1048
+ "903": "wig",
1049
+ "904": "window screen",
1050
+ "905": "window shade",
1051
+ "906": "Windsor tie",
1052
+ "907": "wine bottle",
1053
+ "908": "wing",
1054
+ "909": "wok",
1055
+ "910": "wooden spoon",
1056
+ "911": "wool, woolen, woollen",
1057
+ "912": "worm fence, snake fence, snake-rail fence, Virginia fence",
1058
+ "913": "wreck",
1059
+ "914": "yawl",
1060
+ "915": "yurt",
1061
+ "916": "web site, website, internet site, site",
1062
+ "917": "comic book",
1063
+ "918": "crossword puzzle, crossword",
1064
+ "919": "street sign",
1065
+ "920": "traffic light, traffic signal, stoplight",
1066
+ "921": "book jacket, dust cover, dust jacket, dust wrapper",
1067
+ "922": "menu",
1068
+ "923": "plate",
1069
+ "924": "guacamole",
1070
+ "925": "consomme",
1071
+ "926": "hot pot, hotpot",
1072
+ "927": "trifle",
1073
+ "928": "ice cream, icecream",
1074
+ "929": "ice lolly, lolly, lollipop, popsicle",
1075
+ "930": "French loaf",
1076
+ "931": "bagel, beigel",
1077
+ "932": "pretzel",
1078
+ "933": "cheeseburger",
1079
+ "934": "hotdog, hot dog, red hot",
1080
+ "935": "mashed potato",
1081
+ "936": "head cabbage",
1082
+ "937": "broccoli",
1083
+ "938": "cauliflower",
1084
+ "939": "zucchini, courgette",
1085
+ "940": "spaghetti squash",
1086
+ "941": "acorn squash",
1087
+ "942": "butternut squash",
1088
+ "943": "cucumber, cuke",
1089
+ "944": "artichoke, globe artichoke",
1090
+ "945": "bell pepper",
1091
+ "946": "cardoon",
1092
+ "947": "mushroom",
1093
+ "948": "Granny Smith",
1094
+ "949": "strawberry",
1095
+ "950": "orange",
1096
+ "951": "lemon",
1097
+ "952": "fig",
1098
+ "953": "pineapple, ananas",
1099
+ "954": "banana",
1100
+ "955": "jackfruit, jak, jack",
1101
+ "956": "custard apple",
1102
+ "957": "pomegranate",
1103
+ "958": "hay",
1104
+ "959": "carbonara",
1105
+ "960": "chocolate sauce, chocolate syrup",
1106
+ "961": "dough",
1107
+ "962": "meat loaf, meatloaf",
1108
+ "963": "pizza, pizza pie",
1109
+ "964": "potpie",
1110
+ "965": "burrito",
1111
+ "966": "red wine",
1112
+ "967": "espresso",
1113
+ "968": "cup",
1114
+ "969": "eggnog",
1115
+ "970": "alp",
1116
+ "971": "bubble",
1117
+ "972": "cliff, drop, drop-off",
1118
+ "973": "coral reef",
1119
+ "974": "geyser",
1120
+ "975": "lakeside, lakeshore",
1121
+ "976": "promontory, headland, head, foreland",
1122
+ "977": "sandbar, sand bar",
1123
+ "978": "seashore, coast, seacoast, sea-coast",
1124
+ "979": "valley, vale",
1125
+ "980": "volcano",
1126
+ "981": "ballplayer, baseball player",
1127
+ "982": "groom, bridegroom",
1128
+ "983": "scuba diver",
1129
+ "984": "rapeseed",
1130
+ "985": "daisy",
1131
+ "986": "yellow lady's slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium parviflorum",
1132
+ "987": "corn",
1133
+ "988": "acorn",
1134
+ "989": "hip, rose hip, rosehip",
1135
+ "990": "buckeye, horse chestnut, conker",
1136
+ "991": "coral fungus",
1137
+ "992": "agaric",
1138
+ "993": "gyromitra",
1139
+ "994": "stinkhorn, carrion fungus",
1140
+ "995": "earthstar",
1141
+ "996": "hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa",
1142
+ "997": "bolete",
1143
+ "998": "ear, spike, capitulum",
1144
+ "999": "toilet tissue, toilet paper, bathroom tissue"
1145
+ },
1146
+ "image_size": 384,
1147
+ "initializer_range": 0.02,
1148
+ "is_decoder": false,
1149
+ "is_encoder_decoder": false,
1150
+ "label2id": {
1151
+ "Afghan hound, Afghan": 160,
1152
+ "African chameleon, Chamaeleo chamaeleon": 47,
1153
+ "African crocodile, Nile crocodile, Crocodylus niloticus": 49,
1154
+ "African elephant, Loxodonta africana": 386,
1155
+ "African grey, African gray, Psittacus erithacus": 87,
1156
+ "African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus": 275,
1157
+ "Airedale, Airedale terrier": 191,
1158
+ "American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier": 180,
1159
+ "American alligator, Alligator mississipiensis": 50,
1160
+ "American black bear, black bear, Ursus americanus, Euarctos americanus": 295,
1161
+ "American chameleon, anole, Anolis carolinensis": 40,
1162
+ "American coot, marsh hen, mud hen, water hen, Fulica americana": 137,
1163
+ "American egret, great white heron, Egretta albus": 132,
1164
+ "American lobster, Northern lobster, Maine lobster, Homarus americanus": 122,
1165
+ "Angora, Angora rabbit": 332,
1166
+ "Appenzeller": 240,
1167
+ "Arabian camel, dromedary, Camelus dromedarius": 354,
1168
+ "Arctic fox, white fox, Alopex lagopus": 279,
1169
+ "Australian terrier": 193,
1170
+ "Band Aid": 419,
1171
+ "Bedlington terrier": 181,
1172
+ "Bernese mountain dog": 239,
1173
+ "Blenheim spaniel": 156,
1174
+ "Border collie": 232,
1175
+ "Border terrier": 182,
1176
+ "Boston bull, Boston terrier": 195,
1177
+ "Bouvier des Flandres, Bouviers des Flandres": 233,
1178
+ "Brabancon griffon": 262,
1179
+ "Brittany spaniel": 215,
1180
+ "CD player": 485,
1181
+ "Cardigan, Cardigan Welsh corgi": 264,
1182
+ "Chesapeake Bay retriever": 209,
1183
+ "Chihuahua": 151,
1184
+ "Christmas stocking": 496,
1185
+ "Crock Pot": 521,
1186
+ "Dandie Dinmont, Dandie Dinmont terrier": 194,
1187
+ "Doberman, Doberman pinscher": 236,
1188
+ "Dungeness crab, Cancer magister": 118,
1189
+ "Dutch oven": 544,
1190
+ "Egyptian cat": 285,
1191
+ "English foxhound": 167,
1192
+ "English setter": 212,
1193
+ "English springer, English springer spaniel": 217,
1194
+ "EntleBucher": 241,
1195
+ "Eskimo dog, husky": 248,
1196
+ "European fire salamander, Salamandra salamandra": 25,
1197
+ "European gallinule, Porphyrio porphyrio": 136,
1198
+ "French bulldog": 245,
1199
+ "French horn, horn": 566,
1200
+ "French loaf": 930,
1201
+ "German shepherd, German shepherd dog, German police dog, alsatian": 235,
1202
+ "German short-haired pointer": 210,
1203
+ "Gila monster, Heloderma suspectum": 45,
1204
+ "Gordon setter": 214,
1205
+ "Granny Smith": 948,
1206
+ "Great Dane": 246,
1207
+ "Great Pyrenees": 257,
1208
+ "Greater Swiss Mountain dog": 238,
1209
+ "Ibizan hound, Ibizan Podenco": 173,
1210
+ "Indian cobra, Naja naja": 63,
1211
+ "Indian elephant, Elephas maximus": 385,
1212
+ "Irish setter, red setter": 213,
1213
+ "Irish terrier": 184,
1214
+ "Irish water spaniel": 221,
1215
+ "Irish wolfhound": 170,
1216
+ "Italian greyhound": 171,
1217
+ "Japanese spaniel": 152,
1218
+ "Kerry blue terrier": 183,
1219
+ "Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis": 48,
1220
+ "Labrador retriever": 208,
1221
+ "Lakeland terrier": 189,
1222
+ "Leonberg": 255,
1223
+ "Lhasa, Lhasa apso": 204,
1224
+ "Loafer": 630,
1225
+ "Madagascar cat, ring-tailed lemur, Lemur catta": 383,
1226
+ "Maltese dog, Maltese terrier, Maltese": 153,
1227
+ "Mexican hairless": 268,
1228
+ "Model T": 661,
1229
+ "Newfoundland, Newfoundland dog": 256,
1230
+ "Norfolk terrier": 185,
1231
+ "Norwegian elkhound, elkhound": 174,
1232
+ "Norwich terrier": 186,
1233
+ "Old English sheepdog, bobtail": 229,
1234
+ "Pekinese, Pekingese, Peke": 154,
1235
+ "Pembroke, Pembroke Welsh corgi": 263,
1236
+ "Persian cat": 283,
1237
+ "Petri dish": 712,
1238
+ "Polaroid camera, Polaroid Land camera": 732,
1239
+ "Pomeranian": 259,
1240
+ "Rhodesian ridgeback": 159,
1241
+ "Rottweiler": 234,
1242
+ "Saint Bernard, St Bernard": 247,
1243
+ "Saluki, gazelle hound": 176,
1244
+ "Samoyed, Samoyede": 258,
1245
+ "Scotch terrier, Scottish terrier, Scottie": 199,
1246
+ "Scottish deerhound, deerhound": 177,
1247
+ "Sealyham terrier, Sealyham": 190,
1248
+ "Shetland sheepdog, Shetland sheep dog, Shetland": 230,
1249
+ "Shih-Tzu": 155,
1250
+ "Siamese cat, Siamese": 284,
1251
+ "Siberian husky": 250,
1252
+ "Staffordshire bullterrier, Staffordshire bull terrier": 179,
1253
+ "Sussex spaniel": 220,
1254
+ "Tibetan mastiff": 244,
1255
+ "Tibetan terrier, chrysanthemum dog": 200,
1256
+ "Walker hound, Walker foxhound": 166,
1257
+ "Weimaraner": 178,
1258
+ "Welsh springer spaniel": 218,
1259
+ "West Highland white terrier": 203,
1260
+ "Windsor tie": 906,
1261
+ "Yorkshire terrier": 187,
1262
+ "abacus": 398,
1263
+ "abaya": 399,
1264
+ "academic gown, academic robe, judge's robe": 400,
1265
+ "accordion, piano accordion, squeeze box": 401,
1266
+ "acorn": 988,
1267
+ "acorn squash": 941,
1268
+ "acoustic guitar": 402,
1269
+ "admiral": 321,
1270
+ "affenpinscher, monkey pinscher, monkey dog": 252,
1271
+ "agama": 42,
1272
+ "agaric": 992,
1273
+ "aircraft carrier, carrier, flattop, attack aircraft carrier": 403,
1274
+ "airliner": 404,
1275
+ "airship, dirigible": 405,
1276
+ "albatross, mollymawk": 146,
1277
+ "alligator lizard": 44,
1278
+ "alp": 970,
1279
+ "altar": 406,
1280
+ "ambulance": 407,
1281
+ "amphibian, amphibious vehicle": 408,
1282
+ "analog clock": 409,
1283
+ "anemone fish": 393,
1284
+ "ant, emmet, pismire": 310,
1285
+ "apiary, bee house": 410,
1286
+ "apron": 411,
1287
+ "armadillo": 363,
1288
+ "artichoke, globe artichoke": 944,
1289
+ "ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin": 412,
1290
+ "assault rifle, assault gun": 413,
1291
+ "axolotl, mud puppy, Ambystoma mexicanum": 29,
1292
+ "baboon": 372,
1293
+ "backpack, back pack, knapsack, packsack, rucksack, haversack": 414,
1294
+ "badger": 362,
1295
+ "bagel, beigel": 931,
1296
+ "bakery, bakeshop, bakehouse": 415,
1297
+ "balance beam, beam": 416,
1298
+ "bald eagle, American eagle, Haliaeetus leucocephalus": 22,
1299
+ "balloon": 417,
1300
+ "ballplayer, baseball player": 981,
1301
+ "ballpoint, ballpoint pen, ballpen, Biro": 418,
1302
+ "banana": 954,
1303
+ "banded gecko": 38,
1304
+ "banjo": 420,
1305
+ "bannister, banister, balustrade, balusters, handrail": 421,
1306
+ "barbell": 422,
1307
+ "barber chair": 423,
1308
+ "barbershop": 424,
1309
+ "barn": 425,
1310
+ "barn spider, Araneus cavaticus": 73,
1311
+ "barometer": 426,
1312
+ "barracouta, snoek": 389,
1313
+ "barrel, cask": 427,
1314
+ "barrow, garden cart, lawn cart, wheelbarrow": 428,
1315
+ "baseball": 429,
1316
+ "basenji": 253,
1317
+ "basketball": 430,
1318
+ "basset, basset hound": 161,
1319
+ "bassinet": 431,
1320
+ "bassoon": 432,
1321
+ "bath towel": 434,
1322
+ "bathing cap, swimming cap": 433,
1323
+ "bathtub, bathing tub, bath, tub": 435,
1324
+ "beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon": 436,
1325
+ "beacon, lighthouse, beacon light, pharos": 437,
1326
+ "beagle": 162,
1327
+ "beaker": 438,
1328
+ "bearskin, busby, shako": 439,
1329
+ "beaver": 337,
1330
+ "bee": 309,
1331
+ "bee eater": 92,
1332
+ "beer bottle": 440,
1333
+ "beer glass": 441,
1334
+ "bell cote, bell cot": 442,
1335
+ "bell pepper": 945,
1336
+ "bib": 443,
1337
+ "bicycle-built-for-two, tandem bicycle, tandem": 444,
1338
+ "bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis canadensis": 349,
1339
+ "bikini, two-piece": 445,
1340
+ "binder, ring-binder": 446,
1341
+ "binoculars, field glasses, opera glasses": 447,
1342
+ "birdhouse": 448,
1343
+ "bison": 347,
1344
+ "bittern": 133,
1345
+ "black and gold garden spider, Argiope aurantia": 72,
1346
+ "black grouse": 80,
1347
+ "black stork, Ciconia nigra": 128,
1348
+ "black swan, Cygnus atratus": 100,
1349
+ "black widow, Latrodectus mactans": 75,
1350
+ "black-and-tan coonhound": 165,
1351
+ "black-footed ferret, ferret, Mustela nigripes": 359,
1352
+ "bloodhound, sleuthhound": 163,
1353
+ "bluetick": 164,
1354
+ "boa constrictor, Constrictor constrictor": 61,
1355
+ "boathouse": 449,
1356
+ "bobsled, bobsleigh, bob": 450,
1357
+ "bolete": 997,
1358
+ "bolo tie, bolo, bola tie, bola": 451,
1359
+ "bonnet, poke bonnet": 452,
1360
+ "book jacket, dust cover, dust jacket, dust wrapper": 921,
1361
+ "bookcase": 453,
1362
+ "bookshop, bookstore, bookstall": 454,
1363
+ "borzoi, Russian wolfhound": 169,
1364
+ "bottlecap": 455,
1365
+ "bow": 456,
1366
+ "bow tie, bow-tie, bowtie": 457,
1367
+ "box turtle, box tortoise": 37,
1368
+ "boxer": 242,
1369
+ "brain coral": 109,
1370
+ "brambling, Fringilla montifringilla": 10,
1371
+ "brass, memorial tablet, plaque": 458,
1372
+ "brassiere, bra, bandeau": 459,
1373
+ "breakwater, groin, groyne, mole, bulwark, seawall, jetty": 460,
1374
+ "breastplate, aegis, egis": 461,
1375
+ "briard": 226,
1376
+ "broccoli": 937,
1377
+ "broom": 462,
1378
+ "brown bear, bruin, Ursus arctos": 294,
1379
+ "bubble": 971,
1380
+ "bucket, pail": 463,
1381
+ "buckeye, horse chestnut, conker": 990,
1382
+ "buckle": 464,
1383
+ "bulbul": 16,
1384
+ "bull mastiff": 243,
1385
+ "bullet train, bullet": 466,
1386
+ "bulletproof vest": 465,
1387
+ "bullfrog, Rana catesbeiana": 30,
1388
+ "burrito": 965,
1389
+ "bustard": 138,
1390
+ "butcher shop, meat market": 467,
1391
+ "butternut squash": 942,
1392
+ "cab, hack, taxi, taxicab": 468,
1393
+ "cabbage butterfly": 324,
1394
+ "cairn, cairn terrier": 192,
1395
+ "caldron, cauldron": 469,
1396
+ "can opener, tin opener": 473,
1397
+ "candle, taper, wax light": 470,
1398
+ "cannon": 471,
1399
+ "canoe": 472,
1400
+ "capuchin, ringtail, Cebus capucinus": 378,
1401
+ "car mirror": 475,
1402
+ "car wheel": 479,
1403
+ "carbonara": 959,
1404
+ "cardigan": 474,
1405
+ "cardoon": 946,
1406
+ "carousel, carrousel, merry-go-round, roundabout, whirligig": 476,
1407
+ "carpenter's kit, tool kit": 477,
1408
+ "carton": 478,
1409
+ "cash machine, cash dispenser, automated teller machine, automatic teller machine, automated teller, automatic teller, ATM": 480,
1410
+ "cassette": 481,
1411
+ "cassette player": 482,
1412
+ "castle": 483,
1413
+ "catamaran": 484,
1414
+ "cauliflower": 938,
1415
+ "cello, violoncello": 486,
1416
+ "cellular telephone, cellular phone, cellphone, cell, mobile phone": 487,
1417
+ "centipede": 79,
1418
+ "chain": 488,
1419
+ "chain mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour": 490,
1420
+ "chain saw, chainsaw": 491,
1421
+ "chainlink fence": 489,
1422
+ "chambered nautilus, pearly nautilus, nautilus": 117,
1423
+ "cheeseburger": 933,
1424
+ "cheetah, chetah, Acinonyx jubatus": 293,
1425
+ "chest": 492,
1426
+ "chickadee": 19,
1427
+ "chiffonier, commode": 493,
1428
+ "chime, bell, gong": 494,
1429
+ "chimpanzee, chimp, Pan troglodytes": 367,
1430
+ "china cabinet, china closet": 495,
1431
+ "chiton, coat-of-mail shell, sea cradle, polyplacophore": 116,
1432
+ "chocolate sauce, chocolate syrup": 960,
1433
+ "chow, chow chow": 260,
1434
+ "church, church building": 497,
1435
+ "cicada, cicala": 316,
1436
+ "cinema, movie theater, movie theatre, movie house, picture palace": 498,
1437
+ "cleaver, meat cleaver, chopper": 499,
1438
+ "cliff dwelling": 500,
1439
+ "cliff, drop, drop-off": 972,
1440
+ "cloak": 501,
1441
+ "clog, geta, patten, sabot": 502,
1442
+ "clumber, clumber spaniel": 216,
1443
+ "cock": 7,
1444
+ "cocker spaniel, English cocker spaniel, cocker": 219,
1445
+ "cockroach, roach": 314,
1446
+ "cocktail shaker": 503,
1447
+ "coffee mug": 504,
1448
+ "coffeepot": 505,
1449
+ "coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch": 391,
1450
+ "coil, spiral, volute, whorl, helix": 506,
1451
+ "collie": 231,
1452
+ "colobus, colobus monkey": 375,
1453
+ "combination lock": 507,
1454
+ "comic book": 917,
1455
+ "common iguana, iguana, Iguana iguana": 39,
1456
+ "common newt, Triturus vulgaris": 26,
1457
+ "computer keyboard, keypad": 508,
1458
+ "conch": 112,
1459
+ "confectionery, confectionary, candy store": 509,
1460
+ "consomme": 925,
1461
+ "container ship, containership, container vessel": 510,
1462
+ "convertible": 511,
1463
+ "coral fungus": 991,
1464
+ "coral reef": 973,
1465
+ "corkscrew, bottle screw": 512,
1466
+ "corn": 987,
1467
+ "cornet, horn, trumpet, trump": 513,
1468
+ "coucal": 91,
1469
+ "cougar, puma, catamount, mountain lion, painter, panther, Felis concolor": 286,
1470
+ "cowboy boot": 514,
1471
+ "cowboy hat, ten-gallon hat": 515,
1472
+ "coyote, prairie wolf, brush wolf, Canis latrans": 272,
1473
+ "cradle": 516,
1474
+ "crane": 517,
1475
+ "crash helmet": 518,
1476
+ "crate": 519,
1477
+ "crayfish, crawfish, crawdad, crawdaddy": 124,
1478
+ "crib, cot": 520,
1479
+ "cricket": 312,
1480
+ "croquet ball": 522,
1481
+ "crossword puzzle, crossword": 918,
1482
+ "crutch": 523,
1483
+ "cucumber, cuke": 943,
1484
+ "cuirass": 524,
1485
+ "cup": 968,
1486
+ "curly-coated retriever": 206,
1487
+ "custard apple": 956,
1488
+ "daisy": 985,
1489
+ "dalmatian, coach dog, carriage dog": 251,
1490
+ "dam, dike, dyke": 525,
1491
+ "damselfly": 320,
1492
+ "desk": 526,
1493
+ "desktop computer": 527,
1494
+ "dhole, Cuon alpinus": 274,
1495
+ "dial telephone, dial phone": 528,
1496
+ "diamondback, diamondback rattlesnake, Crotalus adamanteus": 67,
1497
+ "diaper, nappy, napkin": 529,
1498
+ "digital clock": 530,
1499
+ "digital watch": 531,
1500
+ "dingo, warrigal, warragal, Canis dingo": 273,
1501
+ "dining table, board": 532,
1502
+ "dishrag, dishcloth": 533,
1503
+ "dishwasher, dish washer, dishwashing machine": 534,
1504
+ "disk brake, disc brake": 535,
1505
+ "dock, dockage, docking facility": 536,
1506
+ "dogsled, dog sled, dog sleigh": 537,
1507
+ "dome": 538,
1508
+ "doormat, welcome mat": 539,
1509
+ "dough": 961,
1510
+ "dowitcher": 142,
1511
+ "dragonfly, darning needle, devil's darning needle, sewing needle, snake feeder, snake doctor, mosquito hawk, skeeter hawk": 319,
1512
+ "drake": 97,
1513
+ "drilling platform, offshore rig": 540,
1514
+ "drum, membranophone, tympan": 541,
1515
+ "drumstick": 542,
1516
+ "dugong, Dugong dugon": 149,
1517
+ "dumbbell": 543,
1518
+ "dung beetle": 305,
1519
+ "ear, spike, capitulum": 998,
1520
+ "earthstar": 995,
1521
+ "echidna, spiny anteater, anteater": 102,
1522
+ "eel": 390,
1523
+ "eft": 27,
1524
+ "eggnog": 969,
1525
+ "electric fan, blower": 545,
1526
+ "electric guitar": 546,
1527
+ "electric locomotive": 547,
1528
+ "electric ray, crampfish, numbfish, torpedo": 5,
1529
+ "entertainment center": 548,
1530
+ "envelope": 549,
1531
+ "espresso": 967,
1532
+ "espresso maker": 550,
1533
+ "face powder": 551,
1534
+ "feather boa, boa": 552,
1535
+ "fiddler crab": 120,
1536
+ "fig": 952,
1537
+ "file, file cabinet, filing cabinet": 553,
1538
+ "fire engine, fire truck": 555,
1539
+ "fire screen, fireguard": 556,
1540
+ "fireboat": 554,
1541
+ "flagpole, flagstaff": 557,
1542
+ "flamingo": 130,
1543
+ "flat-coated retriever": 205,
1544
+ "flatworm, platyhelminth": 110,
1545
+ "flute, transverse flute": 558,
1546
+ "fly": 308,
1547
+ "folding chair": 559,
1548
+ "football helmet": 560,
1549
+ "forklift": 561,
1550
+ "fountain": 562,
1551
+ "fountain pen": 563,
1552
+ "four-poster": 564,
1553
+ "fox squirrel, eastern fox squirrel, Sciurus niger": 335,
1554
+ "freight car": 565,
1555
+ "frilled lizard, Chlamydosaurus kingi": 43,
1556
+ "frying pan, frypan, skillet": 567,
1557
+ "fur coat": 568,
1558
+ "gar, garfish, garpike, billfish, Lepisosteus osseus": 395,
1559
+ "garbage truck, dustcart": 569,
1560
+ "garden spider, Aranea diademata": 74,
1561
+ "garter snake, grass snake": 57,
1562
+ "gas pump, gasoline pump, petrol pump, island dispenser": 571,
1563
+ "gasmask, respirator, gas helmet": 570,
1564
+ "gazelle": 353,
1565
+ "geyser": 974,
1566
+ "giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca": 388,
1567
+ "giant schnauzer": 197,
1568
+ "gibbon, Hylobates lar": 368,
1569
+ "go-kart": 573,
1570
+ "goblet": 572,
1571
+ "golden retriever": 207,
1572
+ "goldfinch, Carduelis carduelis": 11,
1573
+ "goldfish, Carassius auratus": 1,
1574
+ "golf ball": 574,
1575
+ "golfcart, golf cart": 575,
1576
+ "gondola": 576,
1577
+ "gong, tam-tam": 577,
1578
+ "goose": 99,
1579
+ "gorilla, Gorilla gorilla": 366,
1580
+ "gown": 578,
1581
+ "grand piano, grand": 579,
1582
+ "grasshopper, hopper": 311,
1583
+ "great grey owl, great gray owl, Strix nebulosa": 24,
1584
+ "great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias": 2,
1585
+ "green lizard, Lacerta viridis": 46,
1586
+ "green mamba": 64,
1587
+ "green snake, grass snake": 55,
1588
+ "greenhouse, nursery, glasshouse": 580,
1589
+ "grey fox, gray fox, Urocyon cinereoargenteus": 280,
1590
+ "grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus": 147,
1591
+ "grille, radiator grille": 581,
1592
+ "grocery store, grocery, food market, market": 582,
1593
+ "groenendael": 224,
1594
+ "groom, bridegroom": 982,
1595
+ "ground beetle, carabid beetle": 302,
1596
+ "guacamole": 924,
1597
+ "guenon, guenon monkey": 370,
1598
+ "guillotine": 583,
1599
+ "guinea pig, Cavia cobaya": 338,
1600
+ "gyromitra": 993,
1601
+ "hair slide": 584,
1602
+ "hair spray": 585,
1603
+ "half track": 586,
1604
+ "hammer": 587,
1605
+ "hammerhead, hammerhead shark": 4,
1606
+ "hamper": 588,
1607
+ "hamster": 333,
1608
+ "hand blower, blow dryer, blow drier, hair dryer, hair drier": 589,
1609
+ "hand-held computer, hand-held microcomputer": 590,
1610
+ "handkerchief, hankie, hanky, hankey": 591,
1611
+ "hard disc, hard disk, fixed disk": 592,
1612
+ "hare": 331,
1613
+ "harmonica, mouth organ, harp, mouth harp": 593,
1614
+ "harp": 594,
1615
+ "hartebeest": 351,
1616
+ "harvester, reaper": 595,
1617
+ "harvestman, daddy longlegs, Phalangium opilio": 70,
1618
+ "hatchet": 596,
1619
+ "hay": 958,
1620
+ "head cabbage": 936,
1621
+ "hen": 8,
1622
+ "hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa": 996,
1623
+ "hermit crab": 125,
1624
+ "hip, rose hip, rosehip": 989,
1625
+ "hippopotamus, hippo, river horse, Hippopotamus amphibius": 344,
1626
+ "hog, pig, grunter, squealer, Sus scrofa": 341,
1627
+ "hognose snake, puff adder, sand viper": 54,
1628
+ "holster": 597,
1629
+ "home theater, home theatre": 598,
1630
+ "honeycomb": 599,
1631
+ "hook, claw": 600,
1632
+ "hoopskirt, crinoline": 601,
1633
+ "horizontal bar, high bar": 602,
1634
+ "hornbill": 93,
1635
+ "horned viper, cerastes, sand viper, horned asp, Cerastes cornutus": 66,
1636
+ "horse cart, horse-cart": 603,
1637
+ "hot pot, hotpot": 926,
1638
+ "hotdog, hot dog, red hot": 934,
1639
+ "hourglass": 604,
1640
+ "house finch, linnet, Carpodacus mexicanus": 12,
1641
+ "howler monkey, howler": 379,
1642
+ "hummingbird": 94,
1643
+ "hyena, hyaena": 276,
1644
+ "iPod": 605,
1645
+ "ibex, Capra ibex": 350,
1646
+ "ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus": 296,
1647
+ "ice cream, icecream": 928,
1648
+ "ice lolly, lolly, lollipop, popsicle": 929,
1649
+ "impala, Aepyceros melampus": 352,
1650
+ "indigo bunting, indigo finch, indigo bird, Passerina cyanea": 14,
1651
+ "indri, indris, Indri indri, Indri brevicaudatus": 384,
1652
+ "iron, smoothing iron": 606,
1653
+ "isopod": 126,
1654
+ "jacamar": 95,
1655
+ "jack-o'-lantern": 607,
1656
+ "jackfruit, jak, jack": 955,
1657
+ "jaguar, panther, Panthera onca, Felis onca": 290,
1658
+ "jay": 17,
1659
+ "jean, blue jean, denim": 608,
1660
+ "jeep, landrover": 609,
1661
+ "jellyfish": 107,
1662
+ "jersey, T-shirt, tee shirt": 610,
1663
+ "jigsaw puzzle": 611,
1664
+ "jinrikisha, ricksha, rickshaw": 612,
1665
+ "joystick": 613,
1666
+ "junco, snowbird": 13,
1667
+ "keeshond": 261,
1668
+ "kelpie": 227,
1669
+ "killer whale, killer, orca, grampus, sea wolf, Orcinus orca": 148,
1670
+ "kimono": 614,
1671
+ "king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica": 121,
1672
+ "king penguin, Aptenodytes patagonica": 145,
1673
+ "king snake, kingsnake": 56,
1674
+ "kit fox, Vulpes macrotis": 278,
1675
+ "kite": 21,
1676
+ "knee pad": 615,
1677
+ "knot": 616,
1678
+ "koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus": 105,
1679
+ "komondor": 228,
1680
+ "kuvasz": 222,
1681
+ "lab coat, laboratory coat": 617,
1682
+ "lacewing, lacewing fly": 318,
1683
+ "ladle": 618,
1684
+ "ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle": 301,
1685
+ "lakeside, lakeshore": 975,
1686
+ "lampshade, lamp shade": 619,
1687
+ "langur": 374,
1688
+ "laptop, laptop computer": 620,
1689
+ "lawn mower, mower": 621,
1690
+ "leaf beetle, chrysomelid": 304,
1691
+ "leafhopper": 317,
1692
+ "leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea": 34,
1693
+ "lemon": 951,
1694
+ "lens cap, lens cover": 622,
1695
+ "leopard, Panthera pardus": 288,
1696
+ "lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens": 387,
1697
+ "letter opener, paper knife, paperknife": 623,
1698
+ "library": 624,
1699
+ "lifeboat": 625,
1700
+ "lighter, light, igniter, ignitor": 626,
1701
+ "limousine, limo": 627,
1702
+ "limpkin, Aramus pictus": 135,
1703
+ "liner, ocean liner": 628,
1704
+ "lion, king of beasts, Panthera leo": 291,
1705
+ "lionfish": 396,
1706
+ "lipstick, lip rouge": 629,
1707
+ "little blue heron, Egretta caerulea": 131,
1708
+ "llama": 355,
1709
+ "loggerhead, loggerhead turtle, Caretta caretta": 33,
1710
+ "long-horned beetle, longicorn, longicorn beetle": 303,
1711
+ "lorikeet": 90,
1712
+ "lotion": 631,
1713
+ "loudspeaker, speaker, speaker unit, loudspeaker system, speaker system": 632,
1714
+ "loupe, jeweler's loupe": 633,
1715
+ "lumbermill, sawmill": 634,
1716
+ "lycaenid, lycaenid butterfly": 326,
1717
+ "lynx, catamount": 287,
1718
+ "macaque": 373,
1719
+ "macaw": 88,
1720
+ "magnetic compass": 635,
1721
+ "magpie": 18,
1722
+ "mailbag, postbag": 636,
1723
+ "mailbox, letter box": 637,
1724
+ "maillot": 638,
1725
+ "maillot, tank suit": 639,
1726
+ "malamute, malemute, Alaskan malamute": 249,
1727
+ "malinois": 225,
1728
+ "manhole cover": 640,
1729
+ "mantis, mantid": 315,
1730
+ "maraca": 641,
1731
+ "marimba, xylophone": 642,
1732
+ "marmoset": 377,
1733
+ "marmot": 336,
1734
+ "mashed potato": 935,
1735
+ "mask": 643,
1736
+ "matchstick": 644,
1737
+ "maypole": 645,
1738
+ "maze, labyrinth": 646,
1739
+ "measuring cup": 647,
1740
+ "meat loaf, meatloaf": 962,
1741
+ "medicine chest, medicine cabinet": 648,
1742
+ "meerkat, mierkat": 299,
1743
+ "megalith, megalithic structure": 649,
1744
+ "menu": 922,
1745
+ "microphone, mike": 650,
1746
+ "microwave, microwave oven": 651,
1747
+ "military uniform": 652,
1748
+ "milk can": 653,
1749
+ "miniature pinscher": 237,
1750
+ "miniature poodle": 266,
1751
+ "miniature schnauzer": 196,
1752
+ "minibus": 654,
1753
+ "miniskirt, mini": 655,
1754
+ "minivan": 656,
1755
+ "mink": 357,
1756
+ "missile": 657,
1757
+ "mitten": 658,
1758
+ "mixing bowl": 659,
1759
+ "mobile home, manufactured home": 660,
1760
+ "modem": 662,
1761
+ "monarch, monarch butterfly, milkweed butterfly, Danaus plexippus": 323,
1762
+ "monastery": 663,
1763
+ "mongoose": 298,
1764
+ "monitor": 664,
1765
+ "moped": 665,
1766
+ "mortar": 666,
1767
+ "mortarboard": 667,
1768
+ "mosque": 668,
1769
+ "mosquito net": 669,
1770
+ "motor scooter, scooter": 670,
1771
+ "mountain bike, all-terrain bike, off-roader": 671,
1772
+ "mountain tent": 672,
1773
+ "mouse, computer mouse": 673,
1774
+ "mousetrap": 674,
1775
+ "moving van": 675,
1776
+ "mud turtle": 35,
1777
+ "mushroom": 947,
1778
+ "muzzle": 676,
1779
+ "nail": 677,
1780
+ "neck brace": 678,
1781
+ "necklace": 679,
1782
+ "nematode, nematode worm, roundworm": 111,
1783
+ "night snake, Hypsiglena torquata": 60,
1784
+ "nipple": 680,
1785
+ "notebook, notebook computer": 681,
1786
+ "obelisk": 682,
1787
+ "oboe, hautboy, hautbois": 683,
1788
+ "ocarina, sweet potato": 684,
1789
+ "odometer, hodometer, mileometer, milometer": 685,
1790
+ "oil filter": 686,
1791
+ "orange": 950,
1792
+ "orangutan, orang, orangutang, Pongo pygmaeus": 365,
1793
+ "organ, pipe organ": 687,
1794
+ "oscilloscope, scope, cathode-ray oscilloscope, CRO": 688,
1795
+ "ostrich, Struthio camelus": 9,
1796
+ "otter": 360,
1797
+ "otterhound, otter hound": 175,
1798
+ "overskirt": 689,
1799
+ "ox": 345,
1800
+ "oxcart": 690,
1801
+ "oxygen mask": 691,
1802
+ "oystercatcher, oyster catcher": 143,
1803
+ "packet": 692,
1804
+ "paddle, boat paddle": 693,
1805
+ "paddlewheel, paddle wheel": 694,
1806
+ "padlock": 695,
1807
+ "paintbrush": 696,
1808
+ "pajama, pyjama, pj's, jammies": 697,
1809
+ "palace": 698,
1810
+ "panpipe, pandean pipe, syrinx": 699,
1811
+ "paper towel": 700,
1812
+ "papillon": 157,
1813
+ "parachute, chute": 701,
1814
+ "parallel bars, bars": 702,
1815
+ "park bench": 703,
1816
+ "parking meter": 704,
1817
+ "partridge": 86,
1818
+ "passenger car, coach, carriage": 705,
1819
+ "patas, hussar monkey, Erythrocebus patas": 371,
1820
+ "patio, terrace": 706,
1821
+ "pay-phone, pay-station": 707,
1822
+ "peacock": 84,
1823
+ "pedestal, plinth, footstall": 708,
1824
+ "pelican": 144,
1825
+ "pencil box, pencil case": 709,
1826
+ "pencil sharpener": 710,
1827
+ "perfume, essence": 711,
1828
+ "photocopier": 713,
1829
+ "pick, plectrum, plectron": 714,
1830
+ "pickelhaube": 715,
1831
+ "picket fence, paling": 716,
1832
+ "pickup, pickup truck": 717,
1833
+ "pier": 718,
1834
+ "piggy bank, penny bank": 719,
1835
+ "pill bottle": 720,
1836
+ "pillow": 721,
1837
+ "pineapple, ananas": 953,
1838
+ "ping-pong ball": 722,
1839
+ "pinwheel": 723,
1840
+ "pirate, pirate ship": 724,
1841
+ "pitcher, ewer": 725,
1842
+ "pizza, pizza pie": 963,
1843
+ "plane, carpenter's plane, woodworking plane": 726,
1844
+ "planetarium": 727,
1845
+ "plastic bag": 728,
1846
+ "plate": 923,
1847
+ "plate rack": 729,
1848
+ "platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus": 103,
1849
+ "plow, plough": 730,
1850
+ "plunger, plumber's helper": 731,
1851
+ "pole": 733,
1852
+ "polecat, fitch, foulmart, foumart, Mustela putorius": 358,
1853
+ "police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria": 734,
1854
+ "pomegranate": 957,
1855
+ "poncho": 735,
1856
+ "pool table, billiard table, snooker table": 736,
1857
+ "pop bottle, soda bottle": 737,
1858
+ "porcupine, hedgehog": 334,
1859
+ "pot, flowerpot": 738,
1860
+ "potpie": 964,
1861
+ "potter's wheel": 739,
1862
+ "power drill": 740,
1863
+ "prairie chicken, prairie grouse, prairie fowl": 83,
1864
+ "prayer rug, prayer mat": 741,
1865
+ "pretzel": 932,
1866
+ "printer": 742,
1867
+ "prison, prison house": 743,
1868
+ "proboscis monkey, Nasalis larvatus": 376,
1869
+ "projectile, missile": 744,
1870
+ "projector": 745,
1871
+ "promontory, headland, head, foreland": 976,
1872
+ "ptarmigan": 81,
1873
+ "puck, hockey puck": 746,
1874
+ "puffer, pufferfish, blowfish, globefish": 397,
1875
+ "pug, pug-dog": 254,
1876
+ "punching bag, punch bag, punching ball, punchball": 747,
1877
+ "purse": 748,
1878
+ "quail": 85,
1879
+ "quill, quill pen": 749,
1880
+ "quilt, comforter, comfort, puff": 750,
1881
+ "racer, race car, racing car": 751,
1882
+ "racket, racquet": 752,
1883
+ "radiator": 753,
1884
+ "radio telescope, radio reflector": 755,
1885
+ "radio, wireless": 754,
1886
+ "rain barrel": 756,
1887
+ "ram, tup": 348,
1888
+ "rapeseed": 984,
1889
+ "recreational vehicle, RV, R.V.": 757,
1890
+ "red fox, Vulpes vulpes": 277,
1891
+ "red wine": 966,
1892
+ "red wolf, maned wolf, Canis rufus, Canis niger": 271,
1893
+ "red-backed sandpiper, dunlin, Erolia alpina": 140,
1894
+ "red-breasted merganser, Mergus serrator": 98,
1895
+ "redbone": 168,
1896
+ "redshank, Tringa totanus": 141,
1897
+ "reel": 758,
1898
+ "reflex camera": 759,
1899
+ "refrigerator, icebox": 760,
1900
+ "remote control, remote": 761,
1901
+ "restaurant, eating house, eating place, eatery": 762,
1902
+ "revolver, six-gun, six-shooter": 763,
1903
+ "rhinoceros beetle": 306,
1904
+ "rifle": 764,
1905
+ "ringlet, ringlet butterfly": 322,
1906
+ "ringneck snake, ring-necked snake, ring snake": 53,
1907
+ "robin, American robin, Turdus migratorius": 15,
1908
+ "rock beauty, Holocanthus tricolor": 392,
1909
+ "rock crab, Cancer irroratus": 119,
1910
+ "rock python, rock snake, Python sebae": 62,
1911
+ "rocking chair, rocker": 765,
1912
+ "rotisserie": 766,
1913
+ "rubber eraser, rubber, pencil eraser": 767,
1914
+ "ruddy turnstone, Arenaria interpres": 139,
1915
+ "ruffed grouse, partridge, Bonasa umbellus": 82,
1916
+ "rugby ball": 768,
1917
+ "rule, ruler": 769,
1918
+ "running shoe": 770,
1919
+ "safe": 771,
1920
+ "safety pin": 772,
1921
+ "saltshaker, salt shaker": 773,
1922
+ "sandal": 774,
1923
+ "sandbar, sand bar": 977,
1924
+ "sarong": 775,
1925
+ "sax, saxophone": 776,
1926
+ "scabbard": 777,
1927
+ "scale, weighing machine": 778,
1928
+ "schipperke": 223,
1929
+ "school bus": 779,
1930
+ "schooner": 780,
1931
+ "scoreboard": 781,
1932
+ "scorpion": 71,
1933
+ "screen, CRT screen": 782,
1934
+ "screw": 783,
1935
+ "screwdriver": 784,
1936
+ "scuba diver": 983,
1937
+ "sea anemone, anemone": 108,
1938
+ "sea cucumber, holothurian": 329,
1939
+ "sea lion": 150,
1940
+ "sea slug, nudibranch": 115,
1941
+ "sea snake": 65,
1942
+ "sea urchin": 328,
1943
+ "seashore, coast, seacoast, sea-coast": 978,
1944
+ "seat belt, seatbelt": 785,
1945
+ "sewing machine": 786,
1946
+ "shield, buckler": 787,
1947
+ "shoe shop, shoe-shop, shoe store": 788,
1948
+ "shoji": 789,
1949
+ "shopping basket": 790,
1950
+ "shopping cart": 791,
1951
+ "shovel": 792,
1952
+ "shower cap": 793,
1953
+ "shower curtain": 794,
1954
+ "siamang, Hylobates syndactylus, Symphalangus syndactylus": 369,
1955
+ "sidewinder, horned rattlesnake, Crotalus cerastes": 68,
1956
+ "silky terrier, Sydney silky": 201,
1957
+ "ski": 795,
1958
+ "ski mask": 796,
1959
+ "skunk, polecat, wood pussy": 361,
1960
+ "sleeping bag": 797,
1961
+ "slide rule, slipstick": 798,
1962
+ "sliding door": 799,
1963
+ "slot, one-armed bandit": 800,
1964
+ "sloth bear, Melursus ursinus, Ursus ursinus": 297,
1965
+ "slug": 114,
1966
+ "snail": 113,
1967
+ "snorkel": 801,
1968
+ "snow leopard, ounce, Panthera uncia": 289,
1969
+ "snowmobile": 802,
1970
+ "snowplow, snowplough": 803,
1971
+ "soap dispenser": 804,
1972
+ "soccer ball": 805,
1973
+ "sock": 806,
1974
+ "soft-coated wheaten terrier": 202,
1975
+ "solar dish, solar collector, solar furnace": 807,
1976
+ "sombrero": 808,
1977
+ "sorrel": 339,
1978
+ "soup bowl": 809,
1979
+ "space bar": 810,
1980
+ "space heater": 811,
1981
+ "space shuttle": 812,
1982
+ "spaghetti squash": 940,
1983
+ "spatula": 813,
1984
+ "speedboat": 814,
1985
+ "spider monkey, Ateles geoffroyi": 381,
1986
+ "spider web, spider's web": 815,
1987
+ "spindle": 816,
1988
+ "spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish": 123,
1989
+ "spoonbill": 129,
1990
+ "sports car, sport car": 817,
1991
+ "spotlight, spot": 818,
1992
+ "spotted salamander, Ambystoma maculatum": 28,
1993
+ "squirrel monkey, Saimiri sciureus": 382,
1994
+ "stage": 819,
1995
+ "standard poodle": 267,
1996
+ "standard schnauzer": 198,
1997
+ "starfish, sea star": 327,
1998
+ "steam locomotive": 820,
1999
+ "steel arch bridge": 821,
2000
+ "steel drum": 822,
2001
+ "stethoscope": 823,
2002
+ "stingray": 6,
2003
+ "stinkhorn, carrion fungus": 994,
2004
+ "stole": 824,
2005
+ "stone wall": 825,
2006
+ "stopwatch, stop watch": 826,
2007
+ "stove": 827,
2008
+ "strainer": 828,
2009
+ "strawberry": 949,
2010
+ "street sign": 919,
2011
+ "streetcar, tram, tramcar, trolley, trolley car": 829,
2012
+ "stretcher": 830,
2013
+ "studio couch, day bed": 831,
2014
+ "stupa, tope": 832,
2015
+ "sturgeon": 394,
2016
+ "submarine, pigboat, sub, U-boat": 833,
2017
+ "suit, suit of clothes": 834,
2018
+ "sulphur butterfly, sulfur butterfly": 325,
2019
+ "sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita": 89,
2020
+ "sundial": 835,
2021
+ "sunglass": 836,
2022
+ "sunglasses, dark glasses, shades": 837,
2023
+ "sunscreen, sunblock, sun blocker": 838,
2024
+ "suspension bridge": 839,
2025
+ "swab, swob, mop": 840,
2026
+ "sweatshirt": 841,
2027
+ "swimming trunks, bathing trunks": 842,
2028
+ "swing": 843,
2029
+ "switch, electric switch, electrical switch": 844,
2030
+ "syringe": 845,
2031
+ "tabby, tabby cat": 281,
2032
+ "table lamp": 846,
2033
+ "tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui": 32,
2034
+ "tank, army tank, armored combat vehicle, armoured combat vehicle": 847,
2035
+ "tape player": 848,
2036
+ "tarantula": 76,
2037
+ "teapot": 849,
2038
+ "teddy, teddy bear": 850,
2039
+ "television, television system": 851,
2040
+ "tench, Tinca tinca": 0,
2041
+ "tennis ball": 852,
2042
+ "terrapin": 36,
2043
+ "thatch, thatched roof": 853,
2044
+ "theater curtain, theatre curtain": 854,
2045
+ "thimble": 855,
2046
+ "three-toed sloth, ai, Bradypus tridactylus": 364,
2047
+ "thresher, thrasher, threshing machine": 856,
2048
+ "throne": 857,
2049
+ "thunder snake, worm snake, Carphophis amoenus": 52,
2050
+ "tick": 78,
2051
+ "tiger beetle": 300,
2052
+ "tiger cat": 282,
2053
+ "tiger shark, Galeocerdo cuvieri": 3,
2054
+ "tiger, Panthera tigris": 292,
2055
+ "tile roof": 858,
2056
+ "timber wolf, grey wolf, gray wolf, Canis lupus": 269,
2057
+ "titi, titi monkey": 380,
2058
+ "toaster": 859,
2059
+ "tobacco shop, tobacconist shop, tobacconist": 860,
2060
+ "toilet seat": 861,
2061
+ "toilet tissue, toilet paper, bathroom tissue": 999,
2062
+ "torch": 862,
2063
+ "totem pole": 863,
2064
+ "toucan": 96,
2065
+ "tow truck, tow car, wrecker": 864,
2066
+ "toy poodle": 265,
2067
+ "toy terrier": 158,
2068
+ "toyshop": 865,
2069
+ "tractor": 866,
2070
+ "traffic light, traffic signal, stoplight": 920,
2071
+ "trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi": 867,
2072
+ "tray": 868,
2073
+ "tree frog, tree-frog": 31,
2074
+ "trench coat": 869,
2075
+ "triceratops": 51,
2076
+ "tricycle, trike, velocipede": 870,
2077
+ "trifle": 927,
2078
+ "trilobite": 69,
2079
+ "trimaran": 871,
2080
+ "tripod": 872,
2081
+ "triumphal arch": 873,
2082
+ "trolleybus, trolley coach, trackless trolley": 874,
2083
+ "trombone": 875,
2084
+ "tub, vat": 876,
2085
+ "turnstile": 877,
2086
+ "tusker": 101,
2087
+ "typewriter keyboard": 878,
2088
+ "umbrella": 879,
2089
+ "unicycle, monocycle": 880,
2090
+ "upright, upright piano": 881,
2091
+ "vacuum, vacuum cleaner": 882,
2092
+ "valley, vale": 979,
2093
+ "vase": 883,
2094
+ "vault": 884,
2095
+ "velvet": 885,
2096
+ "vending machine": 886,
2097
+ "vestment": 887,
2098
+ "viaduct": 888,
2099
+ "vine snake": 59,
2100
+ "violin, fiddle": 889,
2101
+ "vizsla, Hungarian pointer": 211,
2102
+ "volcano": 980,
2103
+ "volleyball": 890,
2104
+ "vulture": 23,
2105
+ "waffle iron": 891,
2106
+ "walking stick, walkingstick, stick insect": 313,
2107
+ "wall clock": 892,
2108
+ "wallaby, brush kangaroo": 104,
2109
+ "wallet, billfold, notecase, pocketbook": 893,
2110
+ "wardrobe, closet, press": 894,
2111
+ "warplane, military plane": 895,
2112
+ "warthog": 343,
2113
+ "washbasin, handbasin, washbowl, lavabo, wash-hand basin": 896,
2114
+ "washer, automatic washer, washing machine": 897,
2115
+ "water bottle": 898,
2116
+ "water buffalo, water ox, Asiatic buffalo, Bubalus bubalis": 346,
2117
+ "water jug": 899,
2118
+ "water ouzel, dipper": 20,
2119
+ "water snake": 58,
2120
+ "water tower": 900,
2121
+ "weasel": 356,
2122
+ "web site, website, internet site, site": 916,
2123
+ "weevil": 307,
2124
+ "whippet": 172,
2125
+ "whiptail, whiptail lizard": 41,
2126
+ "whiskey jug": 901,
2127
+ "whistle": 902,
2128
+ "white stork, Ciconia ciconia": 127,
2129
+ "white wolf, Arctic wolf, Canis lupus tundrarum": 270,
2130
+ "wig": 903,
2131
+ "wild boar, boar, Sus scrofa": 342,
2132
+ "window screen": 904,
2133
+ "window shade": 905,
2134
+ "wine bottle": 907,
2135
+ "wing": 908,
2136
+ "wire-haired fox terrier": 188,
2137
+ "wok": 909,
2138
+ "wolf spider, hunting spider": 77,
2139
+ "wombat": 106,
2140
+ "wood rabbit, cottontail, cottontail rabbit": 330,
2141
+ "wooden spoon": 910,
2142
+ "wool, woolen, woollen": 911,
2143
+ "worm fence, snake fence, snake-rail fence, Virginia fence": 912,
2144
+ "wreck": 913,
2145
+ "yawl": 914,
2146
+ "yellow lady's slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium parviflorum": 986,
2147
+ "yurt": 915,
2148
+ "zebra": 340,
2149
+ "zucchini, courgette": 939
2150
+ },
2151
+ "layer_norm_eps": 1e-05,
2152
+ "length_penalty": 1.0,
2153
+ "max_length": 20,
2154
+ "min_length": 0,
2155
+ "mlp_ratio": 4.0,
2156
+ "model_type": "swin",
2157
+ "no_repeat_ngram_size": 0,
2158
+ "num_beam_groups": 1,
2159
+ "num_beams": 1,
2160
+ "num_channels": 3,
2161
+ "num_heads": [
2162
+ 6,
2163
+ 12,
2164
+ 24,
2165
+ 48
2166
+ ],
2167
+ "num_layers": 4,
2168
+ "num_return_sequences": 1,
2169
+ "out_features": [
2170
+ "stage1",
2171
+ "stage2",
2172
+ "stage3",
2173
+ "stage4"
2174
+ ],
2175
+ "out_indices": [
2176
+ 1,
2177
+ 2,
2178
+ 3,
2179
+ 4
2180
+ ],
2181
+ "output_attentions": false,
2182
+ "output_hidden_states": false,
2183
+ "output_scores": false,
2184
+ "pad_token_id": null,
2185
+ "patch_size": 4,
2186
+ "path_norm": true,
2187
+ "prefix": null,
2188
+ "problem_type": null,
2189
+ "pruned_heads": {},
2190
+ "qkv_bias": true,
2191
+ "remove_invalid_values": false,
2192
+ "repetition_penalty": 1.0,
2193
+ "return_dict": true,
2194
+ "return_dict_in_generate": false,
2195
+ "sep_token_id": null,
2196
+ "stage_names": [
2197
+ "stem",
2198
+ "stage1",
2199
+ "stage2",
2200
+ "stage3",
2201
+ "stage4"
2202
+ ],
2203
+ "suppress_tokens": null,
2204
+ "task_specific_params": null,
2205
+ "temperature": 1.0,
2206
+ "tf_legacy_loss": false,
2207
+ "tie_encoder_decoder": false,
2208
+ "tie_word_embeddings": true,
2209
+ "tokenizer_class": null,
2210
+ "top_k": 50,
2211
+ "top_p": 1.0,
2212
+ "torch_dtype": "float32",
2213
+ "torchscript": false,
2214
+ "typical_p": 1.0,
2215
+ "use_absolute_embeddings": false,
2216
+ "use_bfloat16": false,
2217
+ "window_size": 12
2218
+ },
2219
+ "backbone_kwargs": null,
2220
+ "bad_words_ids": null,
2221
+ "begin_suppress_tokens": null,
2222
+ "bos_token_id": null,
2223
+ "chunk_size_feed_forward": 0,
2224
+ "class_weight": 2.0,
2225
+ "common_stride": 4,
2226
+ "cross_attention_hidden_size": null,
2227
+ "decoder_layers": 10,
2228
+ "decoder_start_token_id": null,
2229
+ "dice_weight": 5.0,
2230
+ "dim_feedforward": 2048,
2231
+ "diversity_penalty": 0.0,
2232
+ "do_sample": false,
2233
+ "dropout": 0.0,
2234
+ "early_stopping": false,
2235
+ "encoder_feedforward_dim": 1024,
2236
+ "encoder_layers": 6,
2237
+ "encoder_no_repeat_ngram_size": 0,
2238
+ "enforce_input_proj": false,
2239
+ "enforce_input_projection": false,
2240
+ "eos_token_id": null,
2241
+ "exponential_decay_length_penalty": null,
2242
+ "feature_size": 256,
2243
+ "feature_strides": [
2244
+ 4,
2245
+ 8,
2246
+ 16,
2247
+ 32
2248
+ ],
2249
+ "finetuning_task": null,
2250
+ "forced_bos_token_id": null,
2251
+ "forced_eos_token_id": null,
2252
+ "hidden_dim": 256,
2253
+ "id2label": {
2254
+ "0": "person",
2255
+ "1": "bicycle",
2256
+ "2": "car",
2257
+ "3": "motorcycle",
2258
+ "4": "airplane",
2259
+ "5": "bus",
2260
+ "6": "train",
2261
+ "7": "truck",
2262
+ "8": "boat",
2263
+ "9": "traffic light",
2264
+ "10": "fire hydrant",
2265
+ "11": "stop sign",
2266
+ "12": "parking meter",
2267
+ "13": "bench",
2268
+ "14": "bird",
2269
+ "15": "cat",
2270
+ "16": "dog",
2271
+ "17": "horse",
2272
+ "18": "sheep",
2273
+ "19": "cow",
2274
+ "20": "elephant",
2275
+ "21": "bear",
2276
+ "22": "zebra",
2277
+ "23": "giraffe",
2278
+ "24": "backpack",
2279
+ "25": "umbrella",
2280
+ "26": "handbag",
2281
+ "27": "tie",
2282
+ "28": "suitcase",
2283
+ "29": "frisbee",
2284
+ "30": "skis",
2285
+ "31": "snowboard",
2286
+ "32": "sports ball",
2287
+ "33": "kite",
2288
+ "34": "baseball bat",
2289
+ "35": "baseball glove",
2290
+ "36": "skateboard",
2291
+ "37": "surfboard",
2292
+ "38": "tennis racket",
2293
+ "39": "bottle",
2294
+ "40": "wine glass",
2295
+ "41": "cup",
2296
+ "42": "fork",
2297
+ "43": "knife",
2298
+ "44": "spoon",
2299
+ "45": "bowl",
2300
+ "46": "banana",
2301
+ "47": "apple",
2302
+ "48": "sandwich",
2303
+ "49": "orange",
2304
+ "50": "broccoli",
2305
+ "51": "carrot",
2306
+ "52": "hot dog",
2307
+ "53": "pizza",
2308
+ "54": "donut",
2309
+ "55": "cake",
2310
+ "56": "chair",
2311
+ "57": "couch",
2312
+ "58": "potted plant",
2313
+ "59": "bed",
2314
+ "60": "dining table",
2315
+ "61": "toilet",
2316
+ "62": "tv",
2317
+ "63": "laptop",
2318
+ "64": "mouse",
2319
+ "65": "remote",
2320
+ "66": "keyboard",
2321
+ "67": "cell phone",
2322
+ "68": "microwave",
2323
+ "69": "oven",
2324
+ "70": "toaster",
2325
+ "71": "sink",
2326
+ "72": "refrigerator",
2327
+ "73": "book",
2328
+ "74": "clock",
2329
+ "75": "vase",
2330
+ "76": "scissors",
2331
+ "77": "teddy bear",
2332
+ "78": "hair drier",
2333
+ "79": "toothbrush",
2334
+ "80": "banner",
2335
+ "81": "blanket",
2336
+ "82": "bridge",
2337
+ "83": "cardboard",
2338
+ "84": "counter",
2339
+ "85": "curtain",
2340
+ "86": "door-stuff",
2341
+ "87": "floor-wood",
2342
+ "88": "flower",
2343
+ "89": "fruit",
2344
+ "90": "gravel",
2345
+ "91": "house",
2346
+ "92": "light",
2347
+ "93": "mirror-stuff",
2348
+ "94": "net",
2349
+ "95": "pillow",
2350
+ "96": "platform",
2351
+ "97": "playingfield",
2352
+ "98": "railroad",
2353
+ "99": "river",
2354
+ "100": "road",
2355
+ "101": "roof",
2356
+ "102": "sand",
2357
+ "103": "sea",
2358
+ "104": "shelf",
2359
+ "105": "snow",
2360
+ "106": "stairs",
2361
+ "107": "tent",
2362
+ "108": "towel",
2363
+ "109": "wall-brick",
2364
+ "110": "wall-stone",
2365
+ "111": "wall-tile",
2366
+ "112": "wall-wood",
2367
+ "113": "water-other",
2368
+ "114": "window-blind",
2369
+ "115": "window-other",
2370
+ "116": "tree-merged",
2371
+ "117": "fence-merged",
2372
+ "118": "ceiling-merged",
2373
+ "119": "sky-other-merged",
2374
+ "120": "cabinet-merged",
2375
+ "121": "table-merged",
2376
+ "122": "floor-other-merged",
2377
+ "123": "pavement-merged",
2378
+ "124": "mountain-merged",
2379
+ "125": "grass-merged",
2380
+ "126": "dirt-merged",
2381
+ "127": "paper-merged",
2382
+ "128": "food-other-merged",
2383
+ "129": "building-other-merged",
2384
+ "130": "rock-merged",
2385
+ "131": "wall-other-merged",
2386
+ "132": "rug-merged"
2387
+ },
2388
+ "ignore_value": 255,
2389
+ "importance_sample_ratio": 0.75,
2390
+ "init_std": 0.02,
2391
+ "init_xavier_std": 1.0,
2392
+ "is_decoder": false,
2393
+ "is_encoder_decoder": false,
2394
+ "label2id": {
2395
+ "airplane": 4,
2396
+ "apple": 47,
2397
+ "backpack": 24,
2398
+ "banana": 46,
2399
+ "banner": 80,
2400
+ "baseball bat": 34,
2401
+ "baseball glove": 35,
2402
+ "bear": 21,
2403
+ "bed": 59,
2404
+ "bench": 13,
2405
+ "bicycle": 1,
2406
+ "bird": 14,
2407
+ "blanket": 81,
2408
+ "boat": 8,
2409
+ "book": 73,
2410
+ "bottle": 39,
2411
+ "bowl": 45,
2412
+ "bridge": 82,
2413
+ "broccoli": 50,
2414
+ "building-other-merged": 129,
2415
+ "bus": 5,
2416
+ "cabinet-merged": 120,
2417
+ "cake": 55,
2418
+ "car": 2,
2419
+ "cardboard": 83,
2420
+ "carrot": 51,
2421
+ "cat": 15,
2422
+ "ceiling-merged": 118,
2423
+ "cell phone": 67,
2424
+ "chair": 56,
2425
+ "clock": 74,
2426
+ "couch": 57,
2427
+ "counter": 84,
2428
+ "cow": 19,
2429
+ "cup": 41,
2430
+ "curtain": 85,
2431
+ "dining table": 60,
2432
+ "dirt-merged": 126,
2433
+ "dog": 16,
2434
+ "donut": 54,
2435
+ "door-stuff": 86,
2436
+ "elephant": 20,
2437
+ "fence-merged": 117,
2438
+ "fire hydrant": 10,
2439
+ "floor-other-merged": 122,
2440
+ "floor-wood": 87,
2441
+ "flower": 88,
2442
+ "food-other-merged": 128,
2443
+ "fork": 42,
2444
+ "frisbee": 29,
2445
+ "fruit": 89,
2446
+ "giraffe": 23,
2447
+ "grass-merged": 125,
2448
+ "gravel": 90,
2449
+ "hair drier": 78,
2450
+ "handbag": 26,
2451
+ "horse": 17,
2452
+ "hot dog": 52,
2453
+ "house": 91,
2454
+ "keyboard": 66,
2455
+ "kite": 33,
2456
+ "knife": 43,
2457
+ "laptop": 63,
2458
+ "light": 92,
2459
+ "microwave": 68,
2460
+ "mirror-stuff": 93,
2461
+ "motorcycle": 3,
2462
+ "mountain-merged": 124,
2463
+ "mouse": 64,
2464
+ "net": 94,
2465
+ "orange": 49,
2466
+ "oven": 69,
2467
+ "paper-merged": 127,
2468
+ "parking meter": 12,
2469
+ "pavement-merged": 123,
2470
+ "person": 0,
2471
+ "pillow": 95,
2472
+ "pizza": 53,
2473
+ "platform": 96,
2474
+ "playingfield": 97,
2475
+ "potted plant": 58,
2476
+ "railroad": 98,
2477
+ "refrigerator": 72,
2478
+ "remote": 65,
2479
+ "river": 99,
2480
+ "road": 100,
2481
+ "rock-merged": 130,
2482
+ "roof": 101,
2483
+ "rug-merged": 132,
2484
+ "sand": 102,
2485
+ "sandwich": 48,
2486
+ "scissors": 76,
2487
+ "sea": 103,
2488
+ "sheep": 18,
2489
+ "shelf": 104,
2490
+ "sink": 71,
2491
+ "skateboard": 36,
2492
+ "skis": 30,
2493
+ "sky-other-merged": 119,
2494
+ "snow": 105,
2495
+ "snowboard": 31,
2496
+ "spoon": 44,
2497
+ "sports ball": 32,
2498
+ "stairs": 106,
2499
+ "stop sign": 11,
2500
+ "suitcase": 28,
2501
+ "surfboard": 37,
2502
+ "table-merged": 121,
2503
+ "teddy bear": 77,
2504
+ "tennis racket": 38,
2505
+ "tent": 107,
2506
+ "tie": 27,
2507
+ "toaster": 70,
2508
+ "toilet": 61,
2509
+ "toothbrush": 79,
2510
+ "towel": 108,
2511
+ "traffic light": 9,
2512
+ "train": 6,
2513
+ "tree-merged": 116,
2514
+ "truck": 7,
2515
+ "tv": 62,
2516
+ "umbrella": 25,
2517
+ "vase": 75,
2518
+ "wall-brick": 109,
2519
+ "wall-other-merged": 131,
2520
+ "wall-stone": 110,
2521
+ "wall-tile": 111,
2522
+ "wall-wood": 112,
2523
+ "water-other": 113,
2524
+ "window-blind": 114,
2525
+ "window-other": 115,
2526
+ "wine glass": 40,
2527
+ "zebra": 22
2528
+ },
2529
+ "length_penalty": 1.0,
2530
+ "mask_feature_size": 256,
2531
+ "mask_weight": 5.0,
2532
+ "max_length": 20,
2533
+ "min_length": 0,
2534
+ "model_type": "mask2former",
2535
+ "no_object_weight": 0.1,
2536
+ "no_repeat_ngram_size": 0,
2537
+ "num_attention_heads": 8,
2538
+ "num_beam_groups": 1,
2539
+ "num_beams": 1,
2540
+ "num_hidden_layers": 10,
2541
+ "num_queries": 200,
2542
+ "num_return_sequences": 1,
2543
+ "output_attentions": false,
2544
+ "output_auxiliary_logits": null,
2545
+ "output_hidden_states": false,
2546
+ "output_scores": false,
2547
+ "oversample_ratio": 3.0,
2548
+ "pad_token_id": null,
2549
+ "pre_norm": false,
2550
+ "prefix": null,
2551
+ "problem_type": null,
2552
+ "pruned_heads": {},
2553
+ "remove_invalid_values": false,
2554
+ "repetition_penalty": 1.0,
2555
+ "return_dict": true,
2556
+ "return_dict_in_generate": false,
2557
+ "sep_token_id": null,
2558
+ "suppress_tokens": null,
2559
+ "task_specific_params": null,
2560
+ "temperature": 1.0,
2561
+ "tf_legacy_loss": false,
2562
+ "tie_encoder_decoder": false,
2563
+ "tie_word_embeddings": true,
2564
+ "tokenizer_class": null,
2565
+ "top_k": 50,
2566
+ "top_p": 1.0,
2567
+ "torch_dtype": "float32",
2568
+ "torchscript": false,
2569
+ "train_num_points": 12544,
2570
+ "transformers_version": "4.47.0",
2571
+ "typical_p": 1.0,
2572
+ "use_auxiliary_loss": true,
2573
+ "use_bfloat16": false,
2574
+ "use_pretrained_backbone": false,
2575
+ "use_timm_backbone": false
2576
+ },
2577
+ "max_dynamic_patch": 12,
2578
+ "min_dynamic_patch": 1,
2579
+ "model_type": "sa2va_chat",
2580
+ "num_m2f_proposals": 100,
2581
+ "num_m2f_queries": 200,
2582
+ "pad2square": false,
2583
+ "ps_version": "v2",
2584
+ "select_layer": -1,
2585
+ "template": "internlm2_chat",
2586
+ "tie_word_embeddings": false,
2587
+ "torch_dtype": "bfloat16",
2588
+ "transformers_version": null,
2589
+ "use_backbone_lora": 0,
2590
+ "use_llm_lora": 0,
2591
+ "use_thumbnail": true,
2592
+ "vision_config": {
2593
+ "_attn_implementation_autoset": false,
2594
+ "_name_or_path": "",
2595
+ "add_cross_attention": false,
2596
+ "architectures": [
2597
+ "InternVisionModel"
2598
+ ],
2599
+ "attention_dropout": 0.0,
2600
+ "bad_words_ids": null,
2601
+ "begin_suppress_tokens": null,
2602
+ "bos_token_id": null,
2603
+ "chunk_size_feed_forward": 0,
2604
+ "cross_attention_hidden_size": null,
2605
+ "decoder_start_token_id": null,
2606
+ "diversity_penalty": 0.0,
2607
+ "do_sample": false,
2608
+ "drop_path_rate": 0.0,
2609
+ "dropout": 0.0,
2610
+ "early_stopping": false,
2611
+ "encoder_no_repeat_ngram_size": 0,
2612
+ "eos_token_id": null,
2613
+ "exponential_decay_length_penalty": null,
2614
+ "finetuning_task": null,
2615
+ "forced_bos_token_id": null,
2616
+ "forced_eos_token_id": null,
2617
+ "hidden_act": "gelu",
2618
+ "hidden_size": 1024,
2619
+ "id2label": {
2620
+ "0": "LABEL_0",
2621
+ "1": "LABEL_1"
2622
+ },
2623
+ "image_size": 448,
2624
+ "initializer_factor": 1.0,
2625
+ "initializer_range": 0.02,
2626
+ "intermediate_size": 4096,
2627
+ "is_decoder": false,
2628
+ "is_encoder_decoder": false,
2629
+ "label2id": {
2630
+ "LABEL_0": 0,
2631
+ "LABEL_1": 1
2632
+ },
2633
+ "layer_norm_eps": 1e-06,
2634
+ "length_penalty": 1.0,
2635
+ "max_length": 20,
2636
+ "min_length": 0,
2637
+ "model_type": "intern_vit_6b",
2638
+ "no_repeat_ngram_size": 0,
2639
+ "norm_type": "layer_norm",
2640
+ "num_attention_heads": 16,
2641
+ "num_beam_groups": 1,
2642
+ "num_beams": 1,
2643
+ "num_channels": 3,
2644
+ "num_hidden_layers": 24,
2645
+ "num_return_sequences": 1,
2646
+ "output_attentions": false,
2647
+ "output_hidden_states": false,
2648
+ "output_scores": false,
2649
+ "pad_token_id": null,
2650
+ "patch_size": 14,
2651
+ "prefix": null,
2652
+ "problem_type": null,
2653
+ "pruned_heads": {},
2654
+ "qk_normalization": false,
2655
+ "qkv_bias": true,
2656
+ "remove_invalid_values": false,
2657
+ "repetition_penalty": 1.0,
2658
+ "return_dict": true,
2659
+ "return_dict_in_generate": false,
2660
+ "sep_token_id": null,
2661
+ "suppress_tokens": null,
2662
+ "task_specific_params": null,
2663
+ "temperature": 1.0,
2664
+ "tf_legacy_loss": false,
2665
+ "tie_encoder_decoder": false,
2666
+ "tie_word_embeddings": true,
2667
+ "tokenizer_class": null,
2668
+ "top_k": 50,
2669
+ "top_p": 1.0,
2670
+ "torch_dtype": "bfloat16",
2671
+ "torchscript": false,
2672
+ "transformers_version": "4.47.0",
2673
+ "typical_p": 1.0,
2674
+ "use_bfloat16": true,
2675
+ "use_flash_attn": true
2676
+ }
2677
+ }
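A minimal sketch of how a config like the one above is typically loaded. The repository id below is a placeholder, and `trust_remote_code=True` is assumed because `sa2va_chat` is a custom `model_type` implemented by the Python files added in this commit:

import torch
from transformers import AutoConfig, AutoModel

repo_id = "path/to/this/sa2va/repo"  # placeholder -- substitute the actual repo id or a local path

# The custom "sa2va_chat" model_type is resolved via the remote code shipped with the repo.
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
print(config.model_type)                # "sa2va_chat"
print(config.vision_config.image_size)  # 448, per the vision_config block above

# Loading the model follows the same pattern; bfloat16 matches the config's torch_dtype.
model = AutoModel.from_pretrained(repo_id, torch_dtype=torch.bfloat16, trust_remote_code=True)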
configuration_intern_vit.py ADDED
@@ -0,0 +1,120 @@
1
+ # --------------------------------------------------------
2
+ # InternVL
3
+ # Copyright (c) 2024 OpenGVLab
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+
7
+ import os
8
+ from typing import Union
9
+
10
+ from transformers.configuration_utils import PretrainedConfig
11
+ from transformers.utils import logging
12
+
13
+ logger = logging.get_logger(__name__)
14
+
15
+
16
+ class InternVisionConfig(PretrainedConfig):
17
+ r"""
18
+ This is the configuration class to store the configuration of a [`InternVisionModel`]. It is used to
19
+ instantiate a vision encoder according to the specified arguments, defining the model architecture.
20
+
21
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
22
+ documentation from [`PretrainedConfig`] for more information.
23
+
24
+ Args:
25
+ num_channels (`int`, *optional*, defaults to 3):
26
+ Number of color channels in the input images (e.g., 3 for RGB).
27
+ patch_size (`int`, *optional*, defaults to 14):
28
+ The size (resolution) of each patch.
29
+ image_size (`int`, *optional*, defaults to 224):
30
+ The size (resolution) of each image.
31
+ qkv_bias (`bool`, *optional*, defaults to `False`):
32
+ Whether to add a bias to the query, key and value projections in the self-attention layers.
33
+ hidden_size (`int`, *optional*, defaults to 3200):
34
+ Dimensionality of the encoder layers and the pooler layer.
35
+ num_attention_heads (`int`, *optional*, defaults to 25):
36
+ Number of attention heads for each attention layer in the Transformer encoder.
37
+ intermediate_size (`int`, *optional*, defaults to 12800):
38
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
39
+ qk_normalization (`bool`, *optional*, defaults to `True`):
40
+ Whether to normalize the queries and keys in the self-attention layers.
41
+ num_hidden_layers (`int`, *optional*, defaults to 48):
42
+ Number of hidden layers in the Transformer encoder.
43
+ use_flash_attn (`bool`, *optional*, defaults to `True`):
44
+ Whether to use flash attention mechanism.
45
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
46
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
47
+ `"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported.
48
+ layer_norm_eps (`float`, *optional*, defaults to 1e-6):
49
+ The epsilon used by the layer normalization layers.
50
+ dropout (`float`, *optional*, defaults to 0.0):
51
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
52
+ drop_path_rate (`float`, *optional*, defaults to 0.0):
53
+ Dropout rate for stochastic depth.
54
+ attention_dropout (`float`, *optional*, defaults to 0.0):
55
+ The dropout ratio for the attention probabilities.
56
+ initializer_range (`float`, *optional*, defaults to 0.02):
57
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
58
+ initializer_factor (`float`, *optional*, defaults to 0.1):
59
+ A factor for layer scale.
60
+ """
61
+
62
+ model_type = 'intern_vit_6b'
63
+
64
+ def __init__(
65
+ self,
66
+ num_channels=3,
67
+ patch_size=14,
68
+ image_size=224,
69
+ qkv_bias=False,
70
+ hidden_size=3200,
71
+ num_attention_heads=25,
72
+ intermediate_size=12800,
73
+ qk_normalization=True,
74
+ num_hidden_layers=48,
75
+ use_flash_attn=True,
76
+ hidden_act='gelu',
77
+ norm_type='rms_norm',
78
+ layer_norm_eps=1e-6,
79
+ dropout=0.0,
80
+ drop_path_rate=0.0,
81
+ attention_dropout=0.0,
82
+ initializer_range=0.02,
83
+ initializer_factor=0.1,
84
+ **kwargs,
85
+ ):
86
+ super().__init__(**kwargs)
87
+
88
+ self.hidden_size = hidden_size
89
+ self.intermediate_size = intermediate_size
90
+ self.dropout = dropout
91
+ self.drop_path_rate = drop_path_rate
92
+ self.num_hidden_layers = num_hidden_layers
93
+ self.num_attention_heads = num_attention_heads
94
+ self.num_channels = num_channels
95
+ self.patch_size = patch_size
96
+ self.image_size = image_size
97
+ self.initializer_range = initializer_range
98
+ self.initializer_factor = initializer_factor
99
+ self.attention_dropout = attention_dropout
100
+ self.layer_norm_eps = layer_norm_eps
101
+ self.hidden_act = hidden_act
102
+ self.norm_type = norm_type
103
+ self.qkv_bias = qkv_bias
104
+ self.qk_normalization = qk_normalization
105
+ self.use_flash_attn = use_flash_attn
106
+
107
+ @classmethod
108
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> 'PretrainedConfig':
109
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
110
+
111
+ if 'vision_config' in config_dict:
112
+ config_dict = config_dict['vision_config']
113
+
114
+ if 'model_type' in config_dict and hasattr(cls, 'model_type') and config_dict['model_type'] != cls.model_type:
115
+ logger.warning(
116
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
117
+ f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
118
+ )
119
+
120
+ return cls.from_dict(config_dict, **kwargs)
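
A minimal usage sketch (not part of the uploaded files): instantiating `InternVisionConfig` directly and overriding a couple of the defaults documented above. The plain `import` assumes the script sits next to these files; inside the package the relative import `.configuration_intern_vit` is used, as elsewhere in this upload.

```python
# Minimal sketch: construct the vision config with a few overrides; every other
# field falls back to the defaults listed in the docstring above.
from configuration_intern_vit import InternVisionConfig

config = InternVisionConfig(image_size=448, use_flash_attn=False)
print(config.model_type)    # intern_vit_6b
print(config.hidden_size)   # 3200 (default)
print(config.image_size)    # 448 (override)
```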
configuration_internlm2.py ADDED
@@ -0,0 +1,150 @@
1
+ # Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # This code is based on transformers/src/transformers/models/llama/configuration_llama.py
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """ InternLM2 model configuration"""
17
+
18
+ from transformers.configuration_utils import PretrainedConfig
19
+ from transformers.utils import logging
20
+
21
+ logger = logging.get_logger(__name__)
22
+
23
+ INTERNLM2_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
24
+
25
+
26
+ # Modified from transformers.model.llama.configuration_llama.LlamaConfig
27
+ class InternLM2Config(PretrainedConfig):
28
+ r"""
29
+ This is the configuration class to store the configuration of a [`InternLM2Model`]. It is used to instantiate
30
+ an InternLM2 model according to the specified arguments, defining the model architecture. Instantiating a
31
+ configuration with the defaults will yield a similar configuration to that of the InternLM2-7B.
32
+
33
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
34
+ documentation from [`PretrainedConfig`] for more information.
35
+
36
+
37
+ Args:
38
+ vocab_size (`int`, *optional*, defaults to 32000):
39
+ Vocabulary size of the InternLM2 model. Defines the number of different tokens that can be represented by the
40
+ `inputs_ids` passed when calling [`InternLM2Model`]
41
+ hidden_size (`int`, *optional*, defaults to 4096):
42
+ Dimension of the hidden representations.
43
+ intermediate_size (`int`, *optional*, defaults to 11008):
44
+ Dimension of the MLP representations.
45
+ num_hidden_layers (`int`, *optional*, defaults to 32):
46
+ Number of hidden layers in the Transformer encoder.
47
+ num_attention_heads (`int`, *optional*, defaults to 32):
48
+ Number of attention heads for each attention layer in the Transformer encoder.
49
+ num_key_value_heads (`int`, *optional*):
50
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
51
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
52
+ `num_key_value_heads=1` the model will use Multi Query Attention (MQA), otherwise GQA is used. When
53
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
54
+ by meanpooling all the original heads within that group. For more details checkout [this
55
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
56
+ `num_attention_heads`.
57
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
58
+ The non-linear activation function (function or string) in the decoder.
59
+ max_position_embeddings (`int`, *optional*, defaults to 2048):
60
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
61
+ just in case (e.g., 512 or 1024 or 2048).
62
+ initializer_range (`float`, *optional*, defaults to 0.02):
63
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
64
+ rms_norm_eps (`float`, *optional*, defaults to 1e-6):
65
+ The epsilon used by the rms normalization layers.
66
+ use_cache (`bool`, *optional*, defaults to `True`):
67
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
68
+ relevant if `config.is_decoder=True`.
69
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
70
+ Whether to tie weight embeddings
71
+ Example:
72
+
73
+ """
74
+ model_type = 'internlm2'
75
+ _auto_class = 'AutoConfig'
76
+
77
+ def __init__( # pylint: disable=W0102
78
+ self,
79
+ vocab_size=103168,
80
+ hidden_size=4096,
81
+ intermediate_size=11008,
82
+ num_hidden_layers=32,
83
+ num_attention_heads=32,
84
+ num_key_value_heads=None,
85
+ hidden_act='silu',
86
+ max_position_embeddings=2048,
87
+ initializer_range=0.02,
88
+ rms_norm_eps=1e-6,
89
+ use_cache=True,
90
+ pad_token_id=0,
91
+ bos_token_id=1,
92
+ eos_token_id=2,
93
+ tie_word_embeddings=False,
94
+ bias=True,
95
+ rope_theta=10000,
96
+ rope_scaling=None,
97
+ attn_implementation='eager',
98
+ **kwargs,
99
+ ):
100
+ self.vocab_size = vocab_size
101
+ self.max_position_embeddings = max_position_embeddings
102
+ self.hidden_size = hidden_size
103
+ self.intermediate_size = intermediate_size
104
+ self.num_hidden_layers = num_hidden_layers
105
+ self.num_attention_heads = num_attention_heads
106
+ self.bias = bias
107
+
108
+ if num_key_value_heads is None:
109
+ num_key_value_heads = num_attention_heads
110
+ self.num_key_value_heads = num_key_value_heads
111
+
112
+ self.hidden_act = hidden_act
113
+ self.initializer_range = initializer_range
114
+ self.rms_norm_eps = rms_norm_eps
115
+ self.use_cache = use_cache
116
+ self.rope_theta = rope_theta
117
+ self.rope_scaling = rope_scaling
118
+ self._rope_scaling_validation()
119
+
120
+ self.attn_implementation = attn_implementation
121
+ if self.attn_implementation is None:
122
+ self.attn_implementation = 'eager'
123
+ super().__init__(
124
+ pad_token_id=pad_token_id,
125
+ bos_token_id=bos_token_id,
126
+ eos_token_id=eos_token_id,
127
+ tie_word_embeddings=tie_word_embeddings,
128
+ **kwargs,
129
+ )
130
+
131
+ def _rope_scaling_validation(self):
132
+ """
133
+ Validate the `rope_scaling` configuration.
134
+ """
135
+ if self.rope_scaling is None:
136
+ return
137
+
138
+ if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
139
+ raise ValueError(
140
+ '`rope_scaling` must be a dictionary with two fields, `type` and `factor`, '
141
+ f'got {self.rope_scaling}'
142
+ )
143
+ rope_scaling_type = self.rope_scaling.get('type', None)
144
+ rope_scaling_factor = self.rope_scaling.get('factor', None)
145
+ if rope_scaling_type is None or rope_scaling_type not in ['linear', 'dynamic']:
146
+ raise ValueError(
147
+ f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
148
+ )
149
+ if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor < 1.0:
150
+ raise ValueError(f"`rope_scaling`'s factor field must be a float >= 1, got {rope_scaling_factor}")
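
A small sketch of what `_rope_scaling_validation` accepts, assuming the file is importable as `configuration_internlm2` and using illustrative values only:

```python
# Minimal sketch: a rope_scaling dict that passes _rope_scaling_validation --
# exactly two keys, a supported type, and a float factor >= 1.0.
from configuration_internlm2 import InternLM2Config

cfg = InternLM2Config(
    max_position_embeddings=8192,                      # illustrative override
    rope_scaling={'type': 'dynamic', 'factor': 2.0},   # {'type': 'linear', ...} also passes
)

# Both of these would raise ValueError in _rope_scaling_validation:
#   rope_scaling={'type': 'yarn', 'factor': 2.0}    -> unsupported type
#   rope_scaling={'type': 'linear', 'factor': 2}    -> int, not float
```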
configuration_phi3.py ADDED
@@ -0,0 +1,211 @@
1
+ # Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """ Phi-3 model configuration"""
16
+
17
+
18
+ from transformers.configuration_utils import PretrainedConfig
19
+ from transformers.utils import logging
20
+
21
+ logger = logging.get_logger(__name__)
22
+
23
+ PHI3_PRETRAINED_CONFIG_ARCHIVE_MAP = {
24
+ 'microsoft/Phi-3-mini-4k-instruct': 'https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/resolve/main/config.json',
25
+ 'microsoft/Phi-3-mini-128k-instruct': 'https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/resolve/main/config.json',
26
+ }
27
+
28
+
29
+ class Phi3Config(PretrainedConfig):
30
+ r"""
31
+ This is the configuration class to store the configuration of a [`Phi3Model`]. It is used to instantiate a Phi-3
32
+ model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
33
+ defaults will yield a similar configuration to that of the
34
+ [microsoft/Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct).
35
+
36
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
37
+ documentation from [`PretrainedConfig`] for more information.
38
+
39
+ Args:
40
+ vocab_size (`int`, *optional*, defaults to 32064):
41
+ Vocabulary size of the Phi-3 model. Defines the number of different tokens that can be represented by the
42
+ `inputs_ids` passed when calling [`Phi3Model`].
43
+ hidden_size (`int`, *optional*, defaults to 3072):
44
+ Dimension of the hidden representations.
45
+ intermediate_size (`int`, *optional*, defaults to 8192):
46
+ Dimension of the MLP representations.
47
+ num_hidden_layers (`int`, *optional*, defaults to 32):
48
+ Number of hidden layers in the Transformer decoder.
49
+ num_attention_heads (`int`, *optional*, defaults to 32):
50
+ Number of attention heads for each attention layer in the Transformer decoder.
51
+ num_key_value_heads (`int`, *optional*):
52
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
53
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
54
+ `num_key_value_heads=1` the model will use Multi Query Attention (MQA), otherwise GQA is used. When
55
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
56
+ by meanpooling all the original heads within that group. For more details checkout [this
57
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
58
+ `num_attention_heads`.
59
+ resid_pdrop (`float`, *optional*, defaults to 0.0):
60
+ Dropout probability for mlp outputs.
61
+ embd_pdrop (`float`, *optional*, defaults to 0.0):
62
+ The dropout ratio for the embeddings.
63
+ attention_dropout (`float`, *optional*, defaults to 0.0):
64
+ The dropout ratio after computing the attention scores.
65
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
66
+ The non-linear activation function (function or string) in the decoder.
67
+ max_position_embeddings (`int`, *optional*, defaults to 4096):
68
+ The maximum sequence length that this model might ever be used with.
69
+ original_max_position_embeddings (`int`, *optional*, defaults to 4096):
70
+ The maximum sequence length that this model was trained with. This is used to determine the size of the
71
+ original RoPE embeddings when using long scaling.
72
+ initializer_range (`float`, *optional*, defaults to 0.02):
73
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
74
+ rms_norm_eps (`float`, *optional*, defaults to 1e-05):
75
+ The epsilon value used for the RMSNorm.
76
+ use_cache (`bool`, *optional*, defaults to `True`):
77
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
78
+ relevant if `config.is_decoder=True`.
79
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
80
+ Whether to tie weight embeddings
81
+ rope_theta (`float`, *optional*, defaults to 10000.0):
82
+ The base period of the RoPE embeddings.
83
+ rope_scaling (`dict`, *optional*):
84
+ The scaling strategy for the RoPE embeddings. If `None`, no scaling is applied. If a dictionary, it must
85
+ contain the following keys: `type`, `short_factor` and `long_factor`. The `type` must be either `su` or `yarn` and
86
+ the `short_factor` and `long_factor` must be lists of numbers with the same length as the hidden size
87
+ divided by the number of attention heads divided by 2.
88
+ bos_token_id (`int`, *optional*, defaults to 1):
89
+ The id of the "beginning-of-sequence" token.
90
+ eos_token_id (`int`, *optional*, defaults to 32000):
91
+ The id of the "end-of-sequence" token.
92
+ pad_token_id (`int`, *optional*, defaults to 32000):
93
+ The id of the padding token.
94
+ sliding_window (`int`, *optional*):
95
+ Sliding window attention window size. If `None`, no sliding window is applied.
96
+
97
+ Example:
98
+
99
+ ```python
100
+ >>> from transformers import Phi3Model, Phi3Config
101
+
102
+ >>> # Initializing a Phi-3 style configuration
103
+ >>> configuration = Phi3Config.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
104
+
105
+ >>> # Initializing a model from the configuration
106
+ >>> model = Phi3Model(configuration)
107
+
108
+ >>> # Accessing the model configuration
109
+ >>> configuration = model.config
110
+ ```"""
111
+
112
+ model_type = 'phi3'
113
+ keys_to_ignore_at_inference = ['past_key_values']
114
+
115
+ def __init__(
116
+ self,
117
+ vocab_size=32064,
118
+ hidden_size=3072,
119
+ intermediate_size=8192,
120
+ num_hidden_layers=32,
121
+ num_attention_heads=32,
122
+ num_key_value_heads=None,
123
+ resid_pdrop=0.0,
124
+ embd_pdrop=0.0,
125
+ attention_dropout=0.0,
126
+ hidden_act='silu',
127
+ max_position_embeddings=4096,
128
+ original_max_position_embeddings=4096,
129
+ initializer_range=0.02,
130
+ rms_norm_eps=1e-5,
131
+ use_cache=True,
132
+ tie_word_embeddings=False,
133
+ rope_theta=10000.0,
134
+ rope_scaling=None,
135
+ bos_token_id=1,
136
+ eos_token_id=32000,
137
+ pad_token_id=32000,
138
+ sliding_window=None,
139
+ **kwargs,
140
+ ):
141
+ self.vocab_size = vocab_size
142
+ self.hidden_size = hidden_size
143
+ self.intermediate_size = intermediate_size
144
+ self.num_hidden_layers = num_hidden_layers
145
+ self.num_attention_heads = num_attention_heads
146
+
147
+ if num_key_value_heads is None:
148
+ num_key_value_heads = num_attention_heads
149
+
150
+ self.num_key_value_heads = num_key_value_heads
151
+ self.resid_pdrop = resid_pdrop
152
+ self.embd_pdrop = embd_pdrop
153
+ self.attention_dropout = attention_dropout
154
+ self.hidden_act = hidden_act
155
+ self.max_position_embeddings = max_position_embeddings
156
+ self.original_max_position_embeddings = original_max_position_embeddings
157
+ self.initializer_range = initializer_range
158
+ self.rms_norm_eps = rms_norm_eps
159
+ self.use_cache = use_cache
160
+ self.rope_theta = rope_theta
161
+ self.rope_scaling = rope_scaling
162
+ self._rope_scaling_validation()
163
+ self.sliding_window = sliding_window
164
+
165
+ super().__init__(
166
+ bos_token_id=bos_token_id,
167
+ eos_token_id=eos_token_id,
168
+ pad_token_id=pad_token_id,
169
+ tie_word_embeddings=tie_word_embeddings,
170
+ **kwargs,
171
+ )
172
+
173
+ def _rope_scaling_validation(self):
174
+ """
175
+ Validate the `rope_scaling` configuration.
176
+ """
177
+ if self.rope_scaling is None:
178
+ return
179
+
180
+ if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 3:
181
+ raise ValueError(
182
+ '`rope_scaling` must be a dictionary with three fields, `type`, `short_factor` and `long_factor`, '
183
+ f'got {self.rope_scaling}'
184
+ )
185
+ rope_scaling_type = self.rope_scaling.get('type', None)
186
+ rope_scaling_short_factor = self.rope_scaling.get('short_factor', None)
187
+ rope_scaling_long_factor = self.rope_scaling.get('long_factor', None)
188
+ if rope_scaling_type is None or rope_scaling_type not in ['su', 'yarn']:
189
+ raise ValueError(f"`rope_scaling`'s type field must be one of ['su', 'yarn'], got {rope_scaling_type}")
190
+ if not (
191
+ isinstance(rope_scaling_short_factor, list)
192
+ and all(isinstance(x, (int, float)) for x in rope_scaling_short_factor)
193
+ ):
194
+ raise ValueError(
195
+ f"`rope_scaling`'s short_factor field must be a list of numbers, got {rope_scaling_short_factor}"
196
+ )
197
+ if not len(rope_scaling_short_factor) == self.hidden_size // self.num_attention_heads // 2:
198
+ raise ValueError(
199
+ f"`rope_scaling`'s short_factor field must have length {self.hidden_size // self.num_attention_heads // 2}, got {len(rope_scaling_short_factor)}"
200
+ )
201
+ if not (
202
+ isinstance(rope_scaling_long_factor, list)
203
+ and all(isinstance(x, (int, float)) for x in rope_scaling_long_factor)
204
+ ):
205
+ raise ValueError(
206
+ f"`rope_scaling`'s long_factor field must be a list of numbers, got {rope_scaling_long_factor}"
207
+ )
208
+ if not len(rope_scaling_long_factor) == self.hidden_size // self.num_attention_heads // 2:
209
+ raise ValueError(
210
+ f"`rope_scaling`'s long_factor field must have length {self.hidden_size // self.num_attention_heads // 2}, got {len(rope_scaling_long_factor)}"
211
+ )
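
A small sketch of the constraint enforced above, assuming this class's defaults: each factor list must have `hidden_size // num_attention_heads // 2` entries, i.e. 3072 // 32 // 2 = 48.

```python
# Minimal sketch: a rope_scaling dict that satisfies _rope_scaling_validation
# with the default hidden_size=3072 and num_attention_heads=32.
from configuration_phi3 import Phi3Config

half_head_dim = 3072 // 32 // 2  # 48 entries required per factor list
cfg = Phi3Config(
    max_position_embeddings=131072,   # illustrative long-context override
    rope_scaling={
        'type': 'su',                            # 'yarn' is the other accepted type
        'short_factor': [1.0] * half_head_dim,
        'long_factor': [1.0] * half_head_dim,
    },
)
```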
configuration_sa2va_chat.py ADDED
@@ -0,0 +1,122 @@
1
+ # --------------------------------------------------------
2
+ # InternVL
3
+ # Copyright (c) 2024 OpenGVLab
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+
7
+ import copy
8
+
9
+ from .configuration_internlm2 import InternLM2Config
10
+ from .configuration_phi3 import Phi3Config
11
+ from transformers import AutoConfig, LlamaConfig, Qwen2Config, Mask2FormerConfig
12
+ from transformers.configuration_utils import PretrainedConfig
13
+ from transformers.utils import logging
14
+
15
+ from .configuration_intern_vit import InternVisionConfig
16
+
17
+ logger = logging.get_logger(__name__)
18
+
19
+
20
+ class Sa2VAChatConfig(PretrainedConfig):
21
+ model_type = 'sa2va_chat'
22
+ is_composition = True
23
+
24
+ def __init__(
25
+ self,
26
+ vision_config=None,
27
+ llm_config=None,
28
+ m2f_config=None,
29
+ use_backbone_lora=0,
30
+ use_llm_lora=0,
31
+ pad2square=False,
32
+ select_layer=-1,
33
+ force_image_size=None,
34
+ downsample_ratio=0.5,
35
+ template=None,
36
+ dynamic_image_size=False,
37
+ use_thumbnail=False,
38
+ ps_version='v1',
39
+ min_dynamic_patch=1,
40
+ max_dynamic_patch=6,
41
+ # mask2former
42
+ num_m2f_queries=300,
43
+ num_m2f_proposals=100,
44
+ **kwargs):
45
+ super().__init__(**kwargs)
46
+ if vision_config is None:
47
+ vision_config = {"architectures": ["InternVisionModel"]}
48
+ logger.info('vision_config is None. Initializing the InternVisionConfig with default values.')
49
+
50
+ if llm_config is None:
51
+ llm_config = {'architectures': ['Qwen2ForCausalLM']}
52
+ logger.info('llm_config is None. Initializing the llm config with default values (`Qwen2Config`).')
53
+
54
+ if m2f_config is None:
55
+ m2f_config = {"architectures": ["SwinForImageClassification"]}
56
+ logger.info('m2f_config is None. Initializing the Mask2FormerConfig with default values.')
57
+
58
+ self.vision_config = InternVisionConfig(**vision_config)
59
+ self.m2f_config = Mask2FormerConfig(**m2f_config)
60
+
61
+ if llm_config['architectures'][0] == 'LlamaForCausalLM':
62
+ self.llm_config = LlamaConfig(**llm_config)
63
+ elif llm_config['architectures'][0] == 'InternLM2ForCausalLM':
64
+ self.llm_config = InternLM2Config(**llm_config)
65
+ elif llm_config['architectures'][0] == 'Phi3ForCausalLM':
66
+ self.llm_config = Phi3Config(**llm_config)
67
+ elif llm_config['architectures'][0] == 'Qwen2ForCausalLM':
68
+ self.llm_config = Qwen2Config(**llm_config)
69
+ else:
70
+ raise ValueError('Unsupported architecture: {}'.format(llm_config['architectures'][0]))
71
+ self.use_backbone_lora = use_backbone_lora
72
+ self.use_llm_lora = use_llm_lora
73
+ self.pad2square = pad2square
74
+ self.select_layer = select_layer
75
+ self.force_image_size = force_image_size
76
+ self.downsample_ratio = downsample_ratio
77
+ self.template = template
78
+ self.dynamic_image_size = dynamic_image_size
79
+ self.use_thumbnail = use_thumbnail
80
+ self.ps_version = ps_version # pixel shuffle version
81
+ self.min_dynamic_patch = min_dynamic_patch
82
+ self.max_dynamic_patch = max_dynamic_patch
83
+ # mask2former
84
+ self.num_m2f_queries=num_m2f_queries
85
+ self.num_m2f_proposals=num_m2f_proposals
86
+
87
+ self.hidden_size = self.llm_config.hidden_size
88
+ self.tie_word_embeddings = False
89
+
90
+ logger.info(f'vision_select_layer: {self.select_layer}')
91
+ logger.info(f'ps_version: {self.ps_version}')
92
+ logger.info(f'min_dynamic_patch: {self.min_dynamic_patch}')
93
+ logger.info(f'max_dynamic_patch: {self.max_dynamic_patch}')
94
+
95
+ def to_dict(self):
96
+ """
97
+ Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
98
+
99
+ Returns:
100
+ `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance.
101
+ """
102
+ output = copy.deepcopy(self.__dict__)
103
+ output['vision_config'] = self.vision_config.to_dict()
104
+ output['llm_config'] = self.llm_config.to_dict()
105
+ output['m2f_config'] = self.m2f_config.to_dict()
106
+ output['model_type'] = self.__class__.model_type
107
+ output['use_backbone_lora'] = self.use_backbone_lora
108
+ output['use_llm_lora'] = self.use_llm_lora
109
+ output['pad2square'] = self.pad2square
110
+ output['select_layer'] = self.select_layer
111
+ output['force_image_size'] = self.force_image_size
112
+ output['downsample_ratio'] = self.downsample_ratio
113
+ output['template'] = self.template
114
+ output['dynamic_image_size'] = self.dynamic_image_size
115
+ output['use_thumbnail'] = self.use_thumbnail
116
+ output['ps_version'] = self.ps_version
117
+ output['min_dynamic_patch'] = self.min_dynamic_patch
118
+ output['max_dynamic_patch'] = self.max_dynamic_patch
119
+ output['num_m2f_queries'] = self.num_m2f_queries
120
+ output['num_m2f_proposals'] = self.num_m2f_proposals
121
+
122
+ return output
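
A minimal composition sketch (illustrative values, not the shipped config.json): the LLM sub-config class is picked from `llm_config['architectures'][0]`, and `to_dict()` re-serializes every sub-config.

```python
# Minimal sketch: build the composed config from plain dicts and round-trip it.
from configuration_sa2va_chat import Sa2VAChatConfig

cfg = Sa2VAChatConfig(
    vision_config={'architectures': ['InternVisionModel']},
    llm_config={'architectures': ['Qwen2ForCausalLM'], 'hidden_size': 3584},  # dispatches to Qwen2Config
    m2f_config={'architectures': ['SwinForImageClassification']},
)

d = cfg.to_dict()
assert d['model_type'] == 'sa2va_chat'
assert d['llm_config']['hidden_size'] == cfg.hidden_size == 3584
```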
constants.py ADDED
@@ -0,0 +1,13 @@
1
+ IMG_CONTEXT_TOKEN = '<IMG_CONTEXT>'
2
+ IMG_START_TOKEN = '<img>'
3
+ IMG_END_TOKEN = '</img>'
4
+ PHRASE_START_TOKEN = '<p>'
5
+ PHRASE_END_TOKEN = '</p>'
6
+ SEG_TOKEN = '[SEG{id}]'
7
+ CLS_TOKEN = '[CLS]'
8
+ BG_CLS_TOKEN = '[BG_CLS]'
9
+ # PROPOSAL_TOKENS = [f'[SEG{str(i).zfill(3)}]' for i in range(300)]
10
+ OBJ_START_TOKEN = '<obj>'
11
+ OBJ_END_TOKEN = '</obj>'
12
+ OBJ_CONTEXT_TOKEN = '<OBJ_CONTEXT>'
13
+ DEFAULT_OBJ_TOKEN = '<obj_tokens>'
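
A short sketch of how the templated tokens expand; the phrase-tag pairing in the last line is illustrative only.

```python
# Minimal sketch: SEG_TOKEN uses a zero-padded id, matching the commented-out
# PROPOSAL_TOKENS example above.
from constants import SEG_TOKEN, PHRASE_START_TOKEN, PHRASE_END_TOKEN

seg_tokens = [SEG_TOKEN.format(id=str(i).zfill(3)) for i in range(3)]
print(seg_tokens)  # ['[SEG000]', '[SEG001]', '[SEG002]']
print(f"{PHRASE_START_TOKEN}a red car{PHRASE_END_TOKEN} {seg_tokens[0]}")  # <p>a red car</p> [SEG000]
```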
flash_attention.py ADDED
@@ -0,0 +1,76 @@
1
+ # https://github.com/Dao-AILab/flash-attention/blob/v0.2.8/flash_attn/flash_attention.py
2
+ import torch
3
+ import torch.nn as nn
4
+ from einops import rearrange
5
+
6
+ try: # v1
7
+ from flash_attn.flash_attn_interface import \
8
+ flash_attn_unpadded_qkvpacked_func
9
+ except ImportError: # v2
10
+ from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func as flash_attn_unpadded_qkvpacked_func
11
+
12
+ from flash_attn.bert_padding import pad_input, unpad_input
13
+
14
+
15
+ class FlashAttention(nn.Module):
16
+ """Implement the scaled dot product attention with softmax.
17
+ Arguments
18
+ ---------
19
+ softmax_scale: The temperature to use for the softmax attention.
20
+ (default: 1/sqrt(d_keys) where d_keys is computed at
21
+ runtime)
22
+ attention_dropout: The dropout rate to apply to the attention
23
+ (default: 0.0)
24
+ """
25
+
26
+ def __init__(self, softmax_scale=None, attention_dropout=0.0, device=None, dtype=None):
27
+ super().__init__()
28
+ self.softmax_scale = softmax_scale
29
+ self.dropout_p = attention_dropout
30
+
31
+ def forward(self, qkv, key_padding_mask=None, causal=False, cu_seqlens=None,
32
+ max_s=None, need_weights=False):
33
+ """Implements the multihead softmax attention.
34
+ Arguments
35
+ ---------
36
+ qkv: The tensor containing the query, key, and value. (B, S, 3, H, D) if key_padding_mask is None
37
+ if unpadded: (nnz, 3, h, d)
38
+ key_padding_mask: a bool tensor of shape (B, S)
39
+ """
40
+ assert not need_weights
41
+ assert qkv.dtype in [torch.float16, torch.bfloat16]
42
+ assert qkv.is_cuda
43
+
44
+ if cu_seqlens is None:
45
+ batch_size = qkv.shape[0]
46
+ seqlen = qkv.shape[1]
47
+ if key_padding_mask is None:
48
+ qkv = rearrange(qkv, 'b s ... -> (b s) ...')
49
+ max_s = seqlen
50
+ cu_seqlens = torch.arange(0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32,
51
+ device=qkv.device)
52
+ output = flash_attn_unpadded_qkvpacked_func(
53
+ qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
54
+ softmax_scale=self.softmax_scale, causal=causal
55
+ )
56
+ output = rearrange(output, '(b s) ... -> b s ...', b=batch_size)
57
+ else:
58
+ nheads = qkv.shape[-2]
59
+ x = rearrange(qkv, 'b s three h d -> b s (three h d)')
60
+ x_unpad, indices, cu_seqlens, max_s = unpad_input(x, key_padding_mask)
61
+ x_unpad = rearrange(x_unpad, 'nnz (three h d) -> nnz three h d', three=3, h=nheads)
62
+ output_unpad = flash_attn_unpadded_qkvpacked_func(
63
+ x_unpad, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
64
+ softmax_scale=self.softmax_scale, causal=causal
65
+ )
66
+ output = rearrange(pad_input(rearrange(output_unpad, 'nnz h d -> nnz (h d)'),
67
+ indices, batch_size, seqlen),
68
+ 'b s (h d) -> b s h d', h=nheads)
69
+ else:
70
+ assert max_s is not None
71
+ output = flash_attn_unpadded_qkvpacked_func(
72
+ qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
73
+ softmax_scale=self.softmax_scale, causal=causal
74
+ )
75
+
76
+ return output, None
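
A usage sketch for the padded, fixed-length path (`key_padding_mask=None`, `cu_seqlens=None`); it assumes a CUDA device and the `flash-attn` package, and uses illustrative tensor sizes.

```python
# Minimal sketch: packed qkv of shape (batch, seqlen, 3, heads, head_dim) in
# half precision on GPU, which is what the asserts in forward() require.
import torch
from flash_attention import FlashAttention

attn = FlashAttention(attention_dropout=0.0)
qkv = torch.randn(2, 128, 3, 16, 64, dtype=torch.float16, device='cuda')
out, _ = attn(qkv, causal=False)   # second return value is always None
print(out.shape)                   # torch.Size([2, 128, 16, 64])
```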
generation_config.json ADDED
@@ -0,0 +1,4 @@
1
+ {
2
+ "_from_model_config": true,
3
+ "transformers_version": "4.47.0"
4
+ }
mask2former.py ADDED
@@ -0,0 +1,834 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import numpy as np
4
+ from typing import Dict, List, Optional, Tuple, Union
5
+ from transformers.models.mask2former.modeling_mask2former import (
6
+ Mask2FormerMaskedAttentionDecoderOutput, Mask2FormerModelOutput,
7
+ Mask2FormerForUniversalSegmentationOutput, Mask2FormerMLPPredictionHead,
8
+ sample_point, pair_wise_sigmoid_cross_entropy_loss, pair_wise_dice_loss,
9
+ sigmoid_cross_entropy_loss, dice_loss)
10
+ from torch import Tensor
11
+ import torch.nn.functional as F
12
+
13
+ from transformers.file_utils import is_scipy_available
14
+
15
+ if is_scipy_available():
16
+ from scipy.optimize import linear_sum_assignment
17
+
18
+
19
+ def get_classification_logits(x, text_classifier, logit_scale):
20
+ # x in shape of [B, *, C]
21
+ # text_classifier in shape of [num_classes, C]
22
+ # logit_scale is a learnable scalar https://github.com/mlfoundations/open_clip/blob/main/src/open_clip/model.py#L201
23
+ # return: [B, *, num_classes]
24
+ x = F.normalize(x, dim=-1)
25
+ text_classifier = F.normalize(text_classifier, dim=-1)
26
+ logit_scale = torch.clamp(logit_scale.exp(), max=100)
27
+ pred_logits = logit_scale * x @ text_classifier.T # B, *, N + 1
28
+ return pred_logits
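
A shape-only sketch of the call above (illustrative sizes; the scale value mirrors the `np.log(1 / 0.07)` initialization used in `_post_init` below):

```python
# Minimal sketch: cosine-similarity logits against a text classifier, scaled by
# a clamped, learnable temperature.
import torch
from mask2former import get_classification_logits  # this file

x = torch.randn(2, 100, 256)               # (batch, num_queries, channels)
text_classifier = torch.randn(151, 256)    # (num_classes + background, channels)
logit_scale = torch.tensor(2.6593)         # ~log(1 / 0.07)
logits = get_classification_logits(x, text_classifier, logit_scale)
print(logits.shape)                        # torch.Size([2, 100, 151])
```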
29
+
30
+
31
+ def _post_init(self):
32
+ self.class_embed = Mask2FormerMLPPredictionHead(self.config.hidden_dim, self.config.hidden_dim, self.config.hidden_dim, 3)
33
+ self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
34
+
35
+
36
+ def ov_class_predictor(self, x, text_classifier):
37
+ x = self.class_embed(x)
38
+ all_pred_logits = []
39
+ for per_x, per_text_classifier in zip(x, text_classifier):
40
+ per_pred_logits = get_classification_logits(per_x.unsqueeze(0), per_text_classifier, self.logit_scale)
41
+ all_pred_logits.append(per_pred_logits.squeeze(0))
42
+
43
+ return all_pred_logits
44
+
45
+
46
+
47
+ def Mask2FormerLoss_loss_labels(
48
+ self, class_queries_logits: Tensor, class_labels: List[Tensor], indices: Tuple[np.array]
49
+ ) -> Dict[str, Tensor]:
50
+ batch_size = len(class_queries_logits)
51
+ num_queries = class_queries_logits[0].shape[0]
52
+ all_ce_loss = []
53
+ for i in range(batch_size):
54
+ num_labels_plus1 = class_queries_logits[i].shape[-1]
55
+ empty_weight = torch.ones(num_labels_plus1)
56
+ empty_weight[-1] = self.eos_coef
57
+ empty_weight = empty_weight.to(class_queries_logits[i].device).to(class_queries_logits[i].dtype)
58
+ criterion = nn.CrossEntropyLoss(weight=empty_weight, reduction='none')
59
+ target_classes_o = class_labels[i][indices[i][1]]
60
+ target_classes = torch.full(
61
+ (num_queries, ), fill_value=num_labels_plus1-1, dtype=torch.int64, device=class_queries_logits[i].device)
62
+ target_classes[indices[i][0]] = target_classes_o.to(class_queries_logits[i].device)
63
+ target_classes = target_classes.unsqueeze(0)
64
+ pred_logits = class_queries_logits[i].unsqueeze(0).transpose(1, 2)
65
+ loss_ce = criterion(pred_logits, target_classes)
66
+ all_ce_loss.append(loss_ce)
67
+ losses = {"loss_cross_entropy": torch.cat(all_ce_loss, dim=-1).mean()}
68
+ return losses
69
+
70
+ def Mask2FormerLoss_loss_masks(
71
+ self,
72
+ masks_queries_logits: torch.Tensor,
73
+ mask_labels: List[torch.Tensor],
74
+ indices: Tuple[np.array],
75
+ num_masks: int
76
+ ) -> Dict[str, torch.Tensor]:
77
+ src_idx = self._get_predictions_permutation_indices(indices)
78
+ tgt_idx = self._get_targets_permutation_indices(indices)
79
+ # shape (batch_size * num_queries, height, width)
80
+ pred_masks = masks_queries_logits[src_idx]
81
+ # shape (batch_size, num_queries, height, width)
82
+ # pad all and stack the targets to the num_labels dimension
83
+ target_masks, _ = self._pad_images_to_max_in_batch(mask_labels)
84
+ target_masks = target_masks[tgt_idx]
85
+
86
+ # No need to upsample predictions as we are using normalized coordinates
87
+ pred_masks = pred_masks[:, None]
88
+ target_masks = target_masks[:, None]
89
+
90
+ # Sample point coordinates
91
+ with torch.no_grad():
92
+ point_coordinates = self.sample_points_using_uncertainty(
93
+ pred_masks,
94
+ lambda logits: self.calculate_uncertainty(logits),
95
+ self.num_points,
96
+ self.oversample_ratio,
97
+ self.importance_sample_ratio,
98
+ )
99
+ point_labels = sample_point(target_masks.to(torch.bfloat16), point_coordinates.to(torch.bfloat16), align_corners=False).squeeze(1)
100
+
101
+ point_logits = sample_point(pred_masks, point_coordinates.to(pred_masks.dtype), align_corners=False).squeeze(1)
102
+
103
+ losses = {
104
+ "loss_mask": sigmoid_cross_entropy_loss(point_logits, point_labels, num_masks),
105
+ "loss_dice": dice_loss(point_logits, point_labels, num_masks),
106
+ }
107
+
108
+ del pred_masks
109
+ del target_masks
110
+ return losses
111
+
112
+ def Mask2FormerLoss_sample_points_using_uncertainty(
113
+ self,
114
+ logits: torch.Tensor,
115
+ uncertainty_function,
116
+ num_points: int,
117
+ oversample_ratio: int,
118
+ importance_sample_ratio: float,
119
+ ) -> torch.Tensor:
120
+
121
+ num_boxes = logits.shape[0]
122
+ num_points_sampled = int(num_points * oversample_ratio)
123
+
124
+ # Get random point coordinates
125
+ point_coordinates = torch.rand(num_boxes, num_points_sampled, 2, device=logits.device)
126
+ # Get sampled prediction value for the point coordinates
127
+ point_logits = sample_point(logits, point_coordinates.to(logits.dtype), align_corners=False)
128
+ # Calculate the uncertainties based on the sampled prediction values of the points
129
+ point_uncertainties = uncertainty_function(point_logits)
130
+
131
+ num_uncertain_points = int(importance_sample_ratio * num_points)
132
+ num_random_points = num_points - num_uncertain_points
133
+
134
+ idx = torch.topk(point_uncertainties[:, 0, :], k=num_uncertain_points, dim=1)[1]
135
+ shift = num_points_sampled * torch.arange(num_boxes, dtype=torch.long, device=logits.device)
136
+ idx += shift[:, None]
137
+ point_coordinates = point_coordinates.view(-1, 2)[idx.view(-1), :].view(num_boxes, num_uncertain_points, 2)
138
+
139
+ if num_random_points > 0:
140
+ point_coordinates = torch.cat(
141
+ [point_coordinates, torch.rand(num_boxes, num_random_points, 2, device=logits.device)],
142
+ dim=1,
143
+ )
144
+ return point_coordinates
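
A small arithmetic sketch of the bookkeeping above, assuming values commonly used with Mask2Former (`num_points=12544`, `oversample_ratio=3.0`, `importance_sample_ratio=0.75`):

```python
# Minimal sketch: how many candidate, uncertainty-ranked and random points the
# sampler ends up with for the assumed hyper-parameters.
num_points, oversample_ratio, importance_sample_ratio = 12544, 3.0, 0.75

num_points_sampled = int(num_points * oversample_ratio)           # 37632 candidates
num_uncertain_points = int(importance_sample_ratio * num_points)  # 9408 kept by top-k uncertainty
num_random_points = num_points - num_uncertain_points             # 3136 re-drawn uniformly
print(num_points_sampled, num_uncertain_points, num_random_points)
```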
145
+
146
+
147
+
148
+ @torch.no_grad()
149
+ def Mask2FormerHungarianMatcher_forward(
150
+ self,
151
+ masks_queries_logits: torch.Tensor,
152
+ class_queries_logits: torch.Tensor,
153
+ mask_labels: torch.Tensor,
154
+ class_labels: torch.Tensor,
155
+ ) -> List[Tuple[Tensor]]:
156
+ indices: List[Tuple[np.array]] = []
157
+
158
+ # iterate through batch size
159
+ batch_size = masks_queries_logits.shape[0]
160
+ for i in range(batch_size):
161
+ pred_probs = class_queries_logits[i].softmax(-1)
162
+ pred_mask = masks_queries_logits[i]
163
+
164
+ # Compute the classification cost. Contrary to the loss, we don't use the NLL, but approximate it by 1 - proba[target class]. The 1 is a constant that doesn't change the matching, so it can be omitted.
165
+ cost_class = -pred_probs[:, class_labels[i]]
166
+ target_mask = mask_labels[i].to(pred_mask)
167
+ target_mask = target_mask[:, None]
168
+ pred_mask = pred_mask[:, None]
169
+
170
+ # Sample ground truth and predicted masks
171
+ point_coordinates = torch.rand(1, self.num_points, 2, device=pred_mask.device)
172
+
173
+ target_coordinates = point_coordinates.repeat(target_mask.shape[0], 1, 1).to(target_mask.dtype)
174
+ target_mask = sample_point(target_mask, target_coordinates, align_corners=False).squeeze(1)
175
+
176
+ pred_coordinates = point_coordinates.repeat(pred_mask.shape[0], 1, 1).to(pred_mask.dtype)
177
+ pred_mask = sample_point(pred_mask, pred_coordinates, align_corners=False).squeeze(1)
178
+
179
+ # compute the cross entropy loss between each pair of masks -> shape (num_queries, num_labels)
180
+ cost_mask = pair_wise_sigmoid_cross_entropy_loss(pred_mask, target_mask)
181
+ # Compute the dice loss between each pair of masks -> shape (num_queries, num_labels)
182
+ cost_dice = pair_wise_dice_loss(pred_mask, target_mask)
183
+ # final cost matrix
184
+ cost_matrix = self.cost_mask * cost_mask + self.cost_class * cost_class + self.cost_dice * cost_dice
185
+ # eliminate infinite values in cost_matrix to avoid the error ``ValueError: cost matrix is infeasible``
186
+ cost_matrix = torch.minimum(cost_matrix, torch.tensor(1e10))
187
+ cost_matrix = torch.maximum(cost_matrix, torch.tensor(-1e10))
188
+ cost_matrix = torch.nan_to_num(cost_matrix, 0)
189
+ # do the assignment using the Hungarian algorithm in scipy
190
+ assigned_indices: Tuple[np.array] = linear_sum_assignment(cost_matrix.to(torch.float32).cpu())
191
+ indices.append(assigned_indices)
192
+
193
+ # It could be stacked in one tensor
194
+ matched_indices = [
195
+ (torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices
196
+ ]
197
+ return matched_indices
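
A tiny sketch of the assignment step at the end of the matcher (toy cost matrix; rows are predicted queries, columns are ground-truth masks):

```python
# Minimal sketch: scipy's linear_sum_assignment returns the minimum-cost
# one-to-one matching used to pair queries with targets.
import numpy as np
from scipy.optimize import linear_sum_assignment

cost_matrix = np.array([[0.9, 0.1, 0.5],
                        [0.2, 0.8, 0.4]])          # (num_queries=2, num_targets=3)
pred_idx, target_idx = linear_sum_assignment(cost_matrix)
print(pred_idx, target_idx)                        # [0 1] [1 0]: query 0 <-> gt 1, query 1 <-> gt 0
```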
198
+
199
+
200
+
201
+
202
+ def Mask2FormerMaskedAttentionDecoder_forward_first3layers(
203
+ self,
204
+ inputs_embeds: torch.Tensor = None,
205
+ multi_stage_positional_embeddings: torch.Tensor = None,
206
+ pixel_embeddings: torch.Tensor = None,
207
+ encoder_hidden_states: torch.Tensor = None,
208
+ query_position_embeddings: torch.Tensor = None,
209
+ feature_size_list: List = None,
210
+ output_attentions: Optional[bool] = None,
211
+ output_hidden_states: Optional[bool] = None,
212
+ return_dict: Optional[bool] = None,
213
+ ):
214
+ r"""
215
+ Args:
216
+ inputs_embeds (`torch.FloatTensor` of shape `(num_queries, batch_size, hidden_size)`):
217
+ The query embeddings that are passed into the decoder.
218
+ multi_stage_positional_embeddings (`torch.FloatTensor` of shape `(height*width, batch_size, num_channels)`):
219
+ Position embeddings that are added to the keys in each cross(masked)-attention layer.
220
+ pixel_embeddings (`torch.FloatTensor`):
221
+ Tensor of shape `(batch_size, num_channels, height, width)`, 1/4 scale features from the last Pixel
222
+ Decoder.
223
+ query_position_embeddings (`torch.FloatTensor` of shape `(num_queries, batch_size, hidden_size)`, *optional*):
224
+ Position embeddings that are added to the queries and keys in each self-attention layer.
225
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`):
226
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the
227
+ cross(masked)-attention of the decoder.
228
+ feature_size_list (`List[torch.Size]`):
229
+ This is a list containing shapes (height & width) of multi-scale features from the Pixel Decoder.
230
+ output_attentions (`bool`, *optional*):
231
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
232
+ returned tensors for more detail.
233
+ output_hidden_states (`bool`, *optional*):
234
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
235
+ for more detail.
236
+ return_dict (`bool`, *optional*):
237
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
238
+ """
239
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
240
+ output_hidden_states = (
241
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
242
+ )
243
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
244
+
245
+ if inputs_embeds is not None:
246
+ hidden_states = inputs_embeds
247
+
248
+ # intermediate hidden states with layernorm applied - required for predicting class logits
249
+ intermediate = ()
250
+
251
+ # decoder layers
252
+ all_hidden_states = () if output_hidden_states else None
253
+ attentions = () if output_attentions else None
254
+
255
+ # intermediate mask predictions from transformer decoder layers
256
+ intermediate_mask_predictions = ()
257
+
258
+ intermediate_hidden_states = self.layernorm(inputs_embeds)
259
+ intermediate += (intermediate_hidden_states,)
260
+
261
+ predicted_mask, attention_mask = self.mask_predictor(
262
+ intermediate_hidden_states, pixel_embeddings, feature_size_list[0]
263
+ )
264
+ intermediate_mask_predictions += (predicted_mask,)
265
+
266
+ for idx, decoder_layer in enumerate(self.layers[:3]):
267
+ if output_hidden_states:
268
+ all_hidden_states += (hidden_states,)
269
+
270
+ dropout_probability = torch.rand([])
271
+
272
+ if self.training and (dropout_probability < self.layerdrop):
273
+ continue
274
+
275
+ if self.gradient_checkpointing and self.training:
276
+ layer_outputs = self._gradient_checkpointing_func(
277
+ decoder_layer.__call__,
278
+ hidden_states,
279
+ attention_mask,
280
+ encoder_hidden_states,
281
+ None,
282
+ None,
283
+ output_attentions,
284
+ )
285
+
286
+ else:
287
+ level_index = idx % self.num_feature_levels
288
+
289
+ where = (attention_mask.sum(-1) != attention_mask.shape[-1]).to(attention_mask.dtype)
290
+ # Multiply the attention mask instead of indexing to avoid issue in torch.export.
291
+ attention_mask = attention_mask * where.unsqueeze(-1)
292
+
293
+ layer_outputs = decoder_layer(
294
+ hidden_states,
295
+ level_index=level_index,
296
+ position_embeddings=multi_stage_positional_embeddings,
297
+ query_position_embeddings=query_position_embeddings,
298
+ encoder_hidden_states=encoder_hidden_states,
299
+ encoder_attention_mask=attention_mask,
300
+ output_attentions=output_attentions,
301
+ )
302
+
303
+ intermediate_hidden_states = self.layernorm(layer_outputs[0])
304
+
305
+ predicted_mask, attention_mask = self.mask_predictor(
306
+ intermediate_hidden_states,
307
+ pixel_embeddings,
308
+ feature_size_list[(idx + 1) % self.num_feature_levels],
309
+ )
310
+
311
+ intermediate_mask_predictions += (predicted_mask,)
312
+
313
+ # add intermediate hidden states with layer norm applied which will be used for predicting class logits
314
+ intermediate += (intermediate_hidden_states,)
315
+
316
+ hidden_states = layer_outputs[0]
317
+
318
+ if output_attentions:
319
+ attentions += (layer_outputs[1],)
320
+
321
+ # add hidden states from the last decoder layer
322
+ if output_hidden_states:
323
+ all_hidden_states += (hidden_states,)
324
+
325
+ hidden_states = hidden_states.transpose(1, 0)
326
+ if not return_dict:
327
+ outputs = [hidden_states, all_hidden_states, attentions, intermediate, intermediate_mask_predictions]
328
+ return tuple(v for v in outputs if v is not None)
329
+
330
+ return Mask2FormerMaskedAttentionDecoderOutput(
331
+ last_hidden_state=hidden_states,
332
+ hidden_states=all_hidden_states,
333
+ attentions=attentions,
334
+ intermediate_hidden_states=intermediate,
335
+ masks_queries_logits=intermediate_mask_predictions,
336
+ )
337
+
338
+
339
+ def Mask2FormerMaskedAttentionDecoder_forward_last3layers(
340
+ self,
341
+ inputs_embeds: torch.Tensor = None,
342
+ multi_stage_positional_embeddings: torch.Tensor = None,
343
+ pixel_embeddings: torch.Tensor = None,
344
+ encoder_hidden_states: torch.Tensor = None,
345
+ query_position_embeddings: torch.Tensor = None,
346
+ feature_size_list: List = None,
347
+ output_attentions: Optional[bool] = None,
348
+ output_hidden_states: Optional[bool] = None,
349
+ return_dict: Optional[bool] = None,
350
+ ):
351
+ r"""
352
+ Args:
353
+ inputs_embeds (`torch.FloatTensor` of shape `(num_queries, batch_size, hidden_size)`):
354
+ The query embeddings that are passed into the decoder.
355
+ multi_stage_positional_embeddings (`torch.FloatTensor` of shape `(height*width, batch_size, num_channels)`):
356
+ Position embeddings that are added to the keys in each cross(masked)-attention layer.
357
+ pixel_embeddings (`torch.FloatTensor`):
358
+ Tensor of shape `(batch_size, num_channels, height, width)`, 1/4 scale features from the last Pixel
359
+ Decoder.
360
+ query_position_embeddings (`torch.FloatTensor` of shape `(num_queries, batch_size, hidden_size)`, *optional*):
361
+ Position embeddings that are added to the queries and keys in each self-attention layer.
362
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`):
363
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the
364
+ cross(masked)-attention of the decoder.
365
+ feature_size_list (`List[torch.Size]`):
366
+ This is a list containing shapes (height & width) of multi-scale features from the Pixel Decoder.
367
+ output_attentions (`bool`, *optional*):
368
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
369
+ returned tensors for more detail.
370
+ output_hidden_states (`bool`, *optional*):
371
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
372
+ for more detail.
373
+ return_dict (`bool`, *optional*):
374
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
375
+ """
376
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
377
+ output_hidden_states = (
378
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
379
+ )
380
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
381
+
382
+ if inputs_embeds is not None:
383
+ hidden_states = inputs_embeds
384
+
385
+ # intermediate hidden states with layernorm applied - required for predicting class logits
386
+ intermediate = ()
387
+
388
+ # decoder layers
389
+ all_hidden_states = () if output_hidden_states else None
390
+ attentions = () if output_attentions else None
391
+
392
+ # intermediate mask predictions from transformer decoder layers
393
+ intermediate_mask_predictions = ()
394
+
395
+ intermediate_hidden_states = self.layernorm(inputs_embeds)
396
+ intermediate += (intermediate_hidden_states,)
397
+
398
+ predicted_mask, attention_mask = self.mask_predictor(
399
+ intermediate_hidden_states, pixel_embeddings, feature_size_list[0]
400
+ )
401
+ intermediate_mask_predictions += (predicted_mask,)
402
+
403
+ for _idx, decoder_layer in enumerate(self.layers[3:]):
404
+ idx = _idx + 3
405
+ if output_hidden_states:
406
+ all_hidden_states += (hidden_states,)
407
+
408
+ dropout_probability = torch.rand([])
409
+
410
+ if self.training and (dropout_probability < self.layerdrop):
411
+ continue
412
+
413
+ if self.gradient_checkpointing and self.training:
414
+ layer_outputs = self._gradient_checkpointing_func(
415
+ decoder_layer.__call__,
416
+ hidden_states,
417
+ attention_mask,
418
+ encoder_hidden_states,
419
+ None,
420
+ None,
421
+ output_attentions,
422
+ )
423
+
424
+ else:
425
+ level_index = idx % self.num_feature_levels
426
+
427
+ where = (attention_mask.sum(-1) != attention_mask.shape[-1]).to(attention_mask.dtype)
428
+ # Multiply the attention mask instead of indexing to avoid issue in torch.export.
429
+ attention_mask = attention_mask * where.unsqueeze(-1)
430
+
431
+ layer_outputs = decoder_layer(
432
+ hidden_states,
433
+ level_index=level_index,
434
+ position_embeddings=multi_stage_positional_embeddings,
435
+ query_position_embeddings=query_position_embeddings,
436
+ encoder_hidden_states=encoder_hidden_states,
437
+ encoder_attention_mask=attention_mask,
438
+ output_attentions=output_attentions,
439
+ )
440
+
441
+ intermediate_hidden_states = self.layernorm(layer_outputs[0])
442
+
443
+ predicted_mask, attention_mask = self.mask_predictor(
444
+ intermediate_hidden_states,
445
+ pixel_embeddings,
446
+ feature_size_list[(idx + 1) % self.num_feature_levels],
447
+ )
448
+
449
+ intermediate_mask_predictions += (predicted_mask,)
450
+
451
+ # add intermediate hidden states with layer norm applied which will be used for predicting class logits
452
+ intermediate += (intermediate_hidden_states,)
453
+
454
+ hidden_states = layer_outputs[0]
455
+
456
+ if output_attentions:
457
+ attentions += (layer_outputs[1],)
458
+
459
+ # add hidden states from the last decoder layer
460
+ if output_hidden_states:
461
+ all_hidden_states += (hidden_states,)
462
+
463
+ hidden_states = hidden_states.transpose(1, 0)
464
+ if not return_dict:
465
+ outputs = [hidden_states, all_hidden_states, attentions, intermediate, intermediate_mask_predictions]
466
+ return tuple(v for v in outputs if v is not None)
467
+
468
+ return Mask2FormerMaskedAttentionDecoderOutput(
469
+ last_hidden_state=hidden_states,
470
+ hidden_states=all_hidden_states,
471
+ attentions=attentions,
472
+ intermediate_hidden_states=intermediate,
473
+ masks_queries_logits=intermediate_mask_predictions,
474
+ )
475
+
476
+
477
+ def Mask2FormerTransformerModule_forward_first_part(
478
+ self,
479
+ multi_scale_features: List[Tensor],
480
+ mask_features: Tensor,
481
+ output_hidden_states: bool = False,
482
+ output_attentions: bool = False,
483
+ ) -> Mask2FormerMaskedAttentionDecoderOutput:
484
+ multi_stage_features = []
485
+ multi_stage_positional_embeddings = []
486
+ size_list = []
487
+
488
+ for i in range(self.num_feature_levels):
489
+ size_list.append(multi_scale_features[i].shape[-2:])
490
+ multi_stage_positional_embeddings.append(self.position_embedder(multi_scale_features[i], None).flatten(2))
491
+ multi_stage_features.append(
492
+ self.input_projections[i](multi_scale_features[i]).flatten(2)
493
+ + self.level_embed.weight[i][None, :, None]
494
+ )
495
+
496
+ # Flatten (batch_size, num_channels, height, width) -> (height*width, batch_size, num_channels)
497
+ multi_stage_positional_embeddings[-1] = multi_stage_positional_embeddings[-1].permute(2, 0, 1)
498
+ multi_stage_features[-1] = multi_stage_features[-1].permute(2, 0, 1)
499
+
500
+ _, batch_size, _ = multi_stage_features[0].shape
501
+
502
+ # [num_queries, batch_size, num_channels]
503
+ query_embeddings = self.queries_embedder.weight.unsqueeze(1).repeat(1, batch_size, 1)
504
+ query_features = self.queries_features.weight.unsqueeze(1).repeat(1, batch_size, 1)
505
+
506
+ decoder_output = self.decoder.Mask2FormerMaskedAttentionDecoder_forward_first3layers(
507
+ inputs_embeds=query_features,
508
+ multi_stage_positional_embeddings=multi_stage_positional_embeddings,
509
+ pixel_embeddings=mask_features,
510
+ encoder_hidden_states=multi_stage_features,
511
+ query_position_embeddings=query_embeddings,
512
+ feature_size_list=size_list,
513
+ output_hidden_states=output_hidden_states,
514
+ output_attentions=output_attentions,
515
+ return_dict=True,
516
+ )
517
+
518
+ return decoder_output
519
+
520
+
521
+ def Mask2FormerTransformerModule_forward_second_part(
522
+ self,
523
+ query_features: Tensor,
524
+ query_embeddings: Tensor,
525
+ multi_scale_features: List[Tensor],
526
+ mask_features: Tensor,
527
+ output_hidden_states: bool = False,
528
+ output_attentions: bool = False,
529
+ ) -> Mask2FormerMaskedAttentionDecoderOutput:
530
+ multi_stage_features = []
531
+ multi_stage_positional_embeddings = []
532
+ size_list = []
533
+
534
+ for i in range(self.num_feature_levels):
535
+ size_list.append(multi_scale_features[i].shape[-2:])
536
+ multi_stage_positional_embeddings.append(self.position_embedder(multi_scale_features[i], None).flatten(2))
537
+ multi_stage_features.append(
538
+ self.input_projections[i](multi_scale_features[i]).flatten(2)
539
+ + self.level_embed.weight[i][None, :, None]
540
+ )
541
+
542
+ # Flatten (batch_size, num_channels, height, width) -> (height*width, batch_size, num_channels)
543
+ multi_stage_positional_embeddings[-1] = multi_stage_positional_embeddings[-1].permute(2, 0, 1)
544
+ multi_stage_features[-1] = multi_stage_features[-1].permute(2, 0, 1)
545
+
546
+ _, batch_size, _ = multi_stage_features[0].shape
547
+
548
+ # [num_queries, batch_size, num_channels]
549
+ # query_embeddings = self.queries_embedder.weight.unsqueeze(1).repeat(1, batch_size, 1)
550
+ # query_features = self.queries_features.weight.unsqueeze(1).repeat(1, batch_size, 1)
551
+
552
+ decoder_output = self.decoder.Mask2FormerMaskedAttentionDecoder_forward_last3layers(
553
+ inputs_embeds=query_features,
554
+ multi_stage_positional_embeddings=multi_stage_positional_embeddings,
555
+ pixel_embeddings=mask_features,
556
+ encoder_hidden_states=multi_stage_features,
557
+ query_position_embeddings=query_embeddings,
558
+ feature_size_list=size_list,
559
+ output_hidden_states=output_hidden_states,
560
+ output_attentions=output_attentions,
561
+ return_dict=True,
562
+ )
563
+
564
+ return decoder_output
565
+
566
+
567
+ def Mask2FormerModel_forward_first_part(
568
+ self,
569
+ pixel_values: Tensor,
570
+ pixel_mask: Optional[Tensor] = None,
571
+ output_hidden_states: Optional[bool] = None,
572
+ output_attentions: Optional[bool] = None,
573
+ return_dict: Optional[bool] = None,
574
+ ) -> Mask2FormerModelOutput:
575
+ r"""
576
+ Returns:
577
+ `Mask2FormerModelOutput`
578
+
579
+ Examples:
580
+ ```python
581
+ >>> import torch
582
+ >>> from PIL import Image
583
+ >>> import requests
584
+ >>> from transformers import AutoImageProcessor, Mask2FormerModel
585
+
586
+ >>> # load image
587
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
588
+ >>> image = Image.open(requests.get(url, stream=True).raw)
589
+
590
+ >>> # load image preprocessor and Mask2FormerModel trained on COCO instance segmentation dataset
591
+ >>> image_processor = AutoImageProcessor.from_pretrained("facebook/mask2former-swin-small-coco-instance")
592
+ >>> model = Mask2FormerModel.from_pretrained("facebook/mask2former-swin-small-coco-instance")
593
+ >>> inputs = image_processor(image, return_tensors="pt")
594
+
595
+ >>> # forward pass
596
+ >>> with torch.no_grad():
597
+ ... outputs = model(**inputs)
598
+
599
+ >>> # model outputs last hidden states of shape (batch_size, num_queries, hidden_size)
600
+ >>> print(outputs.transformer_decoder_last_hidden_state.shape)
601
+ torch.Size([1, 100, 256])
602
+ ```
603
+ """
604
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
605
+ output_hidden_states = (
606
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
607
+ )
608
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
609
+
610
+ batch_size, _, height, width = pixel_values.shape
611
+
612
+ if pixel_mask is None:
613
+ pixel_mask = torch.ones((batch_size, height, width), device=pixel_values.device)
614
+
615
+ pixel_level_module_output = self.pixel_level_module(
616
+ pixel_values=pixel_values, output_hidden_states=output_hidden_states
617
+ )
618
+
619
+ transformer_module_output = self.transformer_module.Mask2FormerTransformerModule_forward_first_part(
620
+ multi_scale_features=pixel_level_module_output.decoder_hidden_states,
621
+ mask_features=pixel_level_module_output.decoder_last_hidden_state,
622
+ output_hidden_states=True,
623
+ output_attentions=output_attentions,
624
+ )
625
+
626
+ query_features = transformer_module_output.last_hidden_state
627
+ return query_features, pixel_level_module_output
628
+
629
+
630
+ def Mask2FormerModel_forward_second_part(
631
+ self,
632
+ query_features: Tensor,
633
+ query_embeddings: Tensor,
634
+ pixel_level_module_output,
635
+ pixel_values: Tensor,
636
+ pixel_mask: Optional[Tensor] = None,
637
+ output_hidden_states: Optional[bool] = None,
638
+ output_attentions: Optional[bool] = None,
639
+ return_dict: Optional[bool] = None,
640
+ ) -> Mask2FormerModelOutput:
641
+ r"""
642
+ Returns:
643
+ `Mask2FormerModelOutput`
644
+
645
+ Examples:
646
+ ```python
647
+ >>> import torch
648
+ >>> from PIL import Image
649
+ >>> import requests
650
+ >>> from transformers import AutoImageProcessor, Mask2FormerModel
651
+
652
+ >>> # load image
653
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
654
+ >>> image = Image.open(requests.get(url, stream=True).raw)
655
+
656
+ >>> # load image preprocessor and Mask2FormerModel trained on COCO instance segmentation dataset
657
+ >>> image_processor = AutoImageProcessor.from_pretrained("facebook/mask2former-swin-small-coco-instance")
658
+ >>> model = Mask2FormerModel.from_pretrained("facebook/mask2former-swin-small-coco-instance")
659
+ >>> inputs = image_processor(image, return_tensors="pt")
660
+
661
+ >>> # forward pass
662
+ >>> with torch.no_grad():
663
+ ... outputs = model(**inputs)
664
+
665
+ >>> # model outputs last hidden states of shape (batch_size, num_queries, hidden_size)
666
+ >>> print(outputs.transformer_decoder_last_hidden_state.shape)
667
+ torch.Size([1, 100, 256])
668
+ ```
669
+ """
670
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
671
+ output_hidden_states = (
672
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
673
+ )
674
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
675
+
676
+ batch_size, _, height, width = pixel_values.shape
677
+
678
+ if pixel_mask is None:
679
+ pixel_mask = torch.ones((batch_size, height, width), device=pixel_values.device)
680
+
681
+ transformer_module_output = self.transformer_module.Mask2FormerTransformerModule_forward_second_part(
682
+ query_features=query_features,
683
+ query_embeddings=query_embeddings,
684
+ multi_scale_features=pixel_level_module_output.decoder_hidden_states,
685
+ mask_features=pixel_level_module_output.decoder_last_hidden_state,
686
+ output_hidden_states=True,
687
+ output_attentions=output_attentions,
688
+ )
689
+
690
+ encoder_hidden_states = None
691
+ pixel_decoder_hidden_states = None
692
+ transformer_decoder_hidden_states = None
693
+ transformer_decoder_intermediate_states = None
694
+
695
+ if output_hidden_states:
696
+ encoder_hidden_states = pixel_level_module_output.encoder_hidden_states
697
+ pixel_decoder_hidden_states = pixel_level_module_output.decoder_hidden_states
698
+ transformer_decoder_hidden_states = transformer_module_output.hidden_states
699
+ transformer_decoder_intermediate_states = transformer_module_output.intermediate_hidden_states
700
+
701
+ output = Mask2FormerModelOutput(
702
+ encoder_last_hidden_state=pixel_level_module_output.encoder_last_hidden_state,
703
+ pixel_decoder_last_hidden_state=pixel_level_module_output.decoder_last_hidden_state,
704
+ transformer_decoder_last_hidden_state=transformer_module_output.last_hidden_state,
705
+ encoder_hidden_states=encoder_hidden_states,
706
+ pixel_decoder_hidden_states=pixel_decoder_hidden_states,
707
+ transformer_decoder_hidden_states=transformer_decoder_hidden_states,
708
+ transformer_decoder_intermediate_states=transformer_decoder_intermediate_states,
709
+ attentions=transformer_module_output.attentions,
710
+ masks_queries_logits=transformer_module_output.masks_queries_logits,
711
+ )
712
+
713
+ if not return_dict:
714
+ output = tuple(v for v in output.values() if v is not None)
715
+
716
+ return output
717
+
718
+
719
+ def Mask2FormerForUniversalSegmentation_forward_first_part(
720
+ self,
721
+ pixel_values: Tensor,
722
+ mask_labels: Optional[List[Tensor]] = None,
723
+ class_labels: Optional[List[Tensor]] = None,
724
+ pixel_mask: Optional[Tensor] = None,
725
+ output_hidden_states: Optional[bool] = None,
726
+ output_auxiliary_logits: Optional[bool] = None,
727
+ output_attentions: Optional[bool] = None,
728
+ return_dict: Optional[bool] = None,
729
+ ) -> Mask2FormerForUniversalSegmentationOutput:
730
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
731
+ output_hidden_states = (
732
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
733
+ )
734
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
735
+
736
+ query_features, pixel_level_module_output = self.model.Mask2FormerModel_forward_first_part(
737
+ pixel_values=pixel_values,
738
+ pixel_mask=pixel_mask,
739
+ output_hidden_states=output_hidden_states or self.config.use_auxiliary_loss,
740
+ output_attentions=output_attentions,
741
+ return_dict=True,
742
+ )
743
+
744
+ return query_features, pixel_level_module_output
745
+
746
+
747
+ def Mask2FormerForUniversalSegmentation_forward_second_part(
748
+ self,
749
+ query_features,
750
+ query_embeddings,
751
+ pixel_level_module_output,
752
+ text_classifier,
753
+ pixel_values: Tensor,
754
+ mask_labels: Optional[List[Tensor]] = None,
755
+ class_labels: Optional[List[Tensor]] = None,
756
+ pixel_mask: Optional[Tensor] = None,
757
+ output_hidden_states: Optional[bool] = None,
758
+ output_auxiliary_logits: Optional[bool] = None,
759
+ output_attentions: Optional[bool] = None,
760
+ return_dict: Optional[bool] = None,
761
+ ) -> Mask2FormerForUniversalSegmentationOutput:
762
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
763
+ output_hidden_states = (
764
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
765
+ )
766
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
767
+
768
+ outputs = self.model.Mask2FormerModel_forward_second_part(
769
+ query_features=query_features,
770
+ query_embeddings=query_embeddings,
771
+ pixel_level_module_output=pixel_level_module_output,
772
+ pixel_values=pixel_values,
773
+ pixel_mask=pixel_mask,
774
+ output_hidden_states=output_hidden_states or self.config.use_auxiliary_loss,
775
+ output_attentions=output_attentions,
776
+ return_dict=True,
777
+ )
778
+
779
+ loss, loss_dict, auxiliary_logits = None, None, None
780
+ class_queries_logits = ()
781
+
782
+ for decoder_output in outputs.transformer_decoder_intermediate_states:
783
+ class_prediction = self.ov_class_predictor(decoder_output.transpose(0, 1), text_classifier)
784
+ # class_prediction = self.class_predictor(decoder_output.transpose(0, 1))
785
+ class_queries_logits += (class_prediction,)
786
+
787
+ masks_queries_logits = outputs.masks_queries_logits
788
+
789
+ auxiliary_logits = self.get_auxiliary_logits(class_queries_logits, masks_queries_logits)
790
+
791
+ if mask_labels is not None and class_labels is not None:
792
+ loss_dict = self.get_loss_dict(
793
+ masks_queries_logits=masks_queries_logits[-1],
794
+ class_queries_logits=class_queries_logits[-1],
795
+ mask_labels=mask_labels,
796
+ class_labels=class_labels,
797
+ auxiliary_predictions=auxiliary_logits,
798
+ )
799
+ loss = self.get_loss(loss_dict)
800
+
801
+ encoder_hidden_states = None
802
+ pixel_decoder_hidden_states = None
803
+ transformer_decoder_hidden_states = None
804
+
805
+ if output_hidden_states:
806
+ encoder_hidden_states = outputs.encoder_hidden_states
807
+ pixel_decoder_hidden_states = outputs.pixel_decoder_hidden_states
808
+ transformer_decoder_hidden_states = outputs.transformer_decoder_hidden_states
809
+
810
+ output_auxiliary_logits = (
811
+ self.config.output_auxiliary_logits if output_auxiliary_logits is None else output_auxiliary_logits
812
+ )
813
+ if not output_auxiliary_logits:
814
+ auxiliary_logits = None
815
+
816
+ output = Mask2FormerForUniversalSegmentationOutput(
817
+ loss=loss,
818
+ class_queries_logits=class_queries_logits[-1],
819
+ masks_queries_logits=masks_queries_logits[-1],
820
+ auxiliary_logits=auxiliary_logits,
821
+ encoder_last_hidden_state=outputs.encoder_last_hidden_state,
822
+ pixel_decoder_last_hidden_state=outputs.pixel_decoder_last_hidden_state,
823
+ transformer_decoder_last_hidden_state=outputs.transformer_decoder_last_hidden_state,
824
+ encoder_hidden_states=encoder_hidden_states,
825
+ pixel_decoder_hidden_states=pixel_decoder_hidden_states,
826
+ transformer_decoder_hidden_states=transformer_decoder_hidden_states,
827
+ attentions=outputs.attentions,
828
+ )
829
+
830
+ if not return_dict:
831
+ output = tuple(v for v in output.values() if v is not None)
832
+ if loss is not None:
833
+ output = (loss,) + output
834
+ return output
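The split forward above exists so that externally produced `query_embeddings` and a `text_classifier` can be injected between the encoder stage and the mask decoder stage, with `ov_class_predictor` scoring each decoder query against the text embeddings instead of a fixed classification head. A minimal sketch of what such an open-vocabulary scoring step typically computes; the normalization and `logit_scale` below are illustrative assumptions, not necessarily this repository's exact implementation:

```python
import torch
import torch.nn.functional as F

def ov_classify(decoder_output: torch.Tensor, text_classifier: torch.Tensor, logit_scale: float = 100.0):
    """Score each query embedding against a set of text embeddings (scaled cosine similarity)."""
    q = F.normalize(decoder_output, dim=-1)        # (batch, num_queries, hidden)
    t = F.normalize(text_classifier, dim=-1)       # (num_classes, hidden)
    return logit_scale * q @ t.t()                 # (batch, num_queries, num_classes)

queries = torch.randn(1, 100, 256)                 # 100 decoder queries, hidden size 256
text_embeds = torch.randn(20, 256)                 # 20 open-vocabulary class prompts
print(ov_classify(queries, text_embeds).shape)     # torch.Size([1, 100, 20])
```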
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
modeling_intern_vit.py ADDED
@@ -0,0 +1,364 @@
1
+ # --------------------------------------------------------
2
+ # InternVL
3
+ # Copyright (c) 2024 OpenGVLab
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+
7
+ from typing import Optional, Tuple, Union
8
+
9
+ import torch
10
+ import torch.nn.functional as F
11
+ import torch.utils.checkpoint
12
+ from einops import rearrange
13
+ from timm.models.layers import DropPath
14
+ from torch import nn
15
+ from transformers.activations import ACT2FN
16
+ from transformers.modeling_outputs import (BaseModelOutput,
17
+ BaseModelOutputWithPooling)
18
+ from transformers.modeling_utils import PreTrainedModel
19
+ from transformers.utils import logging
20
+
21
+ from .configuration_intern_vit import InternVisionConfig
22
+
23
+ try:
24
+ from .flash_attention import FlashAttention
25
+ has_flash_attn = True
26
+ except:
27
+ print('FlashAttention is not installed.')
28
+ has_flash_attn = False
29
+
30
+ logger = logging.get_logger(__name__)
31
+
32
+
33
+ class InternRMSNorm(nn.Module):
34
+ def __init__(self, hidden_size, eps=1e-6):
35
+ super().__init__()
36
+ self.weight = nn.Parameter(torch.ones(hidden_size))
37
+ self.variance_epsilon = eps
38
+
39
+ def forward(self, hidden_states):
40
+ input_dtype = hidden_states.dtype
41
+ hidden_states = hidden_states.to(torch.float32)
42
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
43
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
44
+ return self.weight * hidden_states.to(input_dtype)
45
+
46
+
47
+ try:
48
+ from apex.normalization import FusedRMSNorm
49
+
50
+ InternRMSNorm = FusedRMSNorm # noqa
51
+
52
+ logger.info('Discovered apex.normalization.FusedRMSNorm - will use it instead of InternRMSNorm')
53
+ except ImportError:
54
+ # using the normal InternRMSNorm
55
+ pass
56
+ except Exception:
57
+ logger.warning('discovered apex but it failed to load, falling back to InternRMSNorm')
58
+ pass
59
+
60
+
61
+ NORM2FN = {
62
+ 'rms_norm': InternRMSNorm,
63
+ 'layer_norm': nn.LayerNorm,
64
+ }
65
+
66
+
67
+ class InternVisionEmbeddings(nn.Module):
68
+ def __init__(self, config: InternVisionConfig):
69
+ super().__init__()
70
+ self.config = config
71
+ self.embed_dim = config.hidden_size
72
+ self.image_size = config.image_size
73
+ self.patch_size = config.patch_size
74
+
75
+ self.class_embedding = nn.Parameter(
76
+ torch.randn(1, 1, self.embed_dim),
77
+ )
78
+
79
+ self.patch_embedding = nn.Conv2d(
80
+ in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size
81
+ )
82
+
83
+ self.num_patches = (self.image_size // self.patch_size) ** 2
84
+ self.num_positions = self.num_patches + 1
85
+
86
+ self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim))
87
+
88
+ def _get_pos_embed(self, pos_embed, H, W):
89
+ target_dtype = pos_embed.dtype
90
+ pos_embed = pos_embed.float().reshape(
91
+ 1, self.image_size // self.patch_size, self.image_size // self.patch_size, -1).permute(0, 3, 1, 2)
92
+ pos_embed = F.interpolate(pos_embed, size=(H, W), mode='bicubic', align_corners=False). \
93
+ reshape(1, -1, H * W).permute(0, 2, 1).to(target_dtype)
94
+ return pos_embed
95
+
96
+ def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
97
+ target_dtype = self.patch_embedding.weight.dtype
98
+ patch_embeds = self.patch_embedding(pixel_values) # shape = [*, channel, width, height]
99
+ batch_size, _, height, width = patch_embeds.shape
100
+ patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
101
+ class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype)
102
+ embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
103
+ position_embedding = torch.cat([
104
+ self.position_embedding[:, :1, :],
105
+ self._get_pos_embed(self.position_embedding[:, 1:, :], height, width)
106
+ ], dim=1)
107
+ embeddings = embeddings + position_embedding.to(target_dtype)
108
+ return embeddings
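`_get_pos_embed` lets the embeddings handle inputs whose patch grid differs from the training resolution by bicubically resampling the learned position table. A small self-contained sketch of that resampling step (the grid sizes and embedding dimension below are arbitrary examples):

```python
import torch
import torch.nn.functional as F

embed_dim, old_grid, new_grid = 64, 16, 32        # e.g. trained on a 16x16 patch grid, run on 32x32
pos_embed = torch.randn(1, old_grid * old_grid, embed_dim)

pos = pos_embed.reshape(1, old_grid, old_grid, embed_dim).permute(0, 3, 1, 2)   # to NCHW for interpolate
pos = F.interpolate(pos, size=(new_grid, new_grid), mode='bicubic', align_corners=False)
pos = pos.reshape(1, embed_dim, new_grid * new_grid).permute(0, 2, 1)           # back to (1, tokens, dim)
print(pos.shape)  # torch.Size([1, 1024, 64])
```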
109
+
110
+
111
+ class InternAttention(nn.Module):
112
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
113
+
114
+ def __init__(self, config: InternVisionConfig):
115
+ super().__init__()
116
+ self.config = config
117
+ self.embed_dim = config.hidden_size
118
+ self.num_heads = config.num_attention_heads
119
+ self.use_flash_attn = config.use_flash_attn and has_flash_attn
120
+ if config.use_flash_attn and not has_flash_attn:
121
+ print('Warning: Flash Attention is not available, use_flash_attn is set to False.')
122
+ self.head_dim = self.embed_dim // self.num_heads
123
+ if self.head_dim * self.num_heads != self.embed_dim:
124
+ raise ValueError(
125
+ f'embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:'
126
+ f' {self.num_heads}).'
127
+ )
128
+
129
+ self.scale = self.head_dim ** -0.5
130
+ self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias=config.qkv_bias)
131
+ self.attn_drop = nn.Dropout(config.attention_dropout)
132
+ self.proj_drop = nn.Dropout(config.dropout)
133
+
134
+ self.qk_normalization = config.qk_normalization
135
+
136
+ if self.qk_normalization:
137
+ self.q_norm = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
138
+ self.k_norm = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
139
+
140
+ if self.use_flash_attn:
141
+ self.inner_attn = FlashAttention(attention_dropout=config.attention_dropout)
142
+ self.proj = nn.Linear(self.embed_dim, self.embed_dim)
143
+
144
+ def _naive_attn(self, x):
145
+ B, N, C = x.shape
146
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
147
+ q, k, v = qkv.unbind(0) # make torchscript happy (cannot use tensor as tuple)
148
+
149
+ if self.qk_normalization:
150
+ B_, H_, N_, D_ = q.shape
151
+ q = self.q_norm(q.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2)
152
+ k = self.k_norm(k.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2)
153
+
154
+ attn = ((q * self.scale) @ k.transpose(-2, -1))
155
+ attn = attn.softmax(dim=-1)
156
+ attn = self.attn_drop(attn)
157
+
158
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
159
+ x = self.proj(x)
160
+ x = self.proj_drop(x)
161
+ return x
162
+
163
+ def _flash_attn(self, x, key_padding_mask=None, need_weights=False):
164
+ qkv = self.qkv(x)
165
+ qkv = rearrange(qkv, 'b s (three h d) -> b s three h d', three=3, h=self.num_heads)
166
+
167
+ if self.qk_normalization:
168
+ q, k, v = qkv.unbind(2)
169
+ q = self.q_norm(q.flatten(-2, -1)).view(q.shape)
170
+ k = self.k_norm(k.flatten(-2, -1)).view(k.shape)
171
+ qkv = torch.stack([q, k, v], dim=2)
172
+
173
+ context, _ = self.inner_attn(
174
+ qkv, key_padding_mask=key_padding_mask, need_weights=need_weights, causal=False
175
+ )
176
+ outs = self.proj(rearrange(context, 'b s h d -> b s (h d)'))
177
+ outs = self.proj_drop(outs)
178
+ return outs
179
+
180
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
181
+ x = self._naive_attn(hidden_states) if not self.use_flash_attn else self._flash_attn(hidden_states)
182
+ return x
183
+
184
+
185
+ class InternMLP(nn.Module):
186
+ def __init__(self, config: InternVisionConfig):
187
+ super().__init__()
188
+ self.config = config
189
+ self.act = ACT2FN[config.hidden_act]
190
+ self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
191
+ self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
192
+
193
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
194
+ hidden_states = self.fc1(hidden_states)
195
+ hidden_states = self.act(hidden_states)
196
+ hidden_states = self.fc2(hidden_states)
197
+ return hidden_states
198
+
199
+
200
+ class InternVisionEncoderLayer(nn.Module):
201
+ def __init__(self, config: InternVisionConfig, drop_path_rate: float):
202
+ super().__init__()
203
+ self.embed_dim = config.hidden_size
204
+ self.intermediate_size = config.intermediate_size
205
+ self.norm_type = config.norm_type
206
+
207
+ self.attn = InternAttention(config)
208
+ self.mlp = InternMLP(config)
209
+ self.norm1 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps)
210
+ self.norm2 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps)
211
+
212
+ self.ls1 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim))
213
+ self.ls2 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim))
214
+ self.drop_path1 = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
215
+ self.drop_path2 = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
216
+
217
+ def forward(
218
+ self,
219
+ hidden_states: torch.Tensor,
220
+ ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor], Optional[Tuple[torch.FloatTensor]]]:
221
+ """
222
+ Args:
223
+ hidden_states (`Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]`): input to the layer of shape `(batch, seq_len, embed_dim)`
224
+ """
225
+ hidden_states = hidden_states + self.drop_path1(self.attn(self.norm1(hidden_states)) * self.ls1)
226
+
227
+ hidden_states = hidden_states + self.drop_path2(self.mlp(self.norm2(hidden_states)) * self.ls2)
228
+
229
+ return hidden_states
230
+
231
+
232
+ class InternVisionEncoder(nn.Module):
233
+ """
234
+ Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
235
+ [`InternEncoderLayer`].
236
+
237
+ Args:
238
+ config (`InternConfig`):
239
+ The corresponding vision configuration for the `InternEncoder`.
240
+ """
241
+
242
+ def __init__(self, config: InternVisionConfig):
243
+ super().__init__()
244
+ self.config = config
245
+ # stochastic depth decay rule
246
+ dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers)]
247
+ self.layers = nn.ModuleList([
248
+ InternVisionEncoderLayer(config, dpr[idx]) for idx in range(config.num_hidden_layers)])
249
+ self.gradient_checkpointing = True
250
+
251
+ def forward(
252
+ self,
253
+ inputs_embeds,
254
+ output_hidden_states: Optional[bool] = None,
255
+ return_dict: Optional[bool] = None,
256
+ ) -> Union[Tuple, BaseModelOutput]:
257
+ r"""
258
+ Args:
259
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
260
+ Embedded representation of the inputs. Should be float, not int tokens.
261
+ output_hidden_states (`bool`, *optional*):
262
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
263
+ for more detail.
264
+ return_dict (`bool`, *optional*):
265
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
266
+ """
267
+ output_hidden_states = (
268
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
269
+ )
270
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
271
+
272
+ encoder_states = () if output_hidden_states else None
273
+ hidden_states = inputs_embeds
274
+
275
+ for idx, encoder_layer in enumerate(self.layers):
276
+ if output_hidden_states:
277
+ encoder_states = encoder_states + (hidden_states,)
278
+ if self.gradient_checkpointing and self.training:
279
+ layer_outputs = torch.utils.checkpoint.checkpoint(
280
+ encoder_layer,
281
+ hidden_states)
282
+ else:
283
+ layer_outputs = encoder_layer(
284
+ hidden_states,
285
+ )
286
+ hidden_states = layer_outputs
287
+
288
+ if output_hidden_states:
289
+ encoder_states = encoder_states + (hidden_states,)
290
+
291
+ if not return_dict:
292
+ return tuple(v for v in [hidden_states, encoder_states] if v is not None)
293
+ return BaseModelOutput(
294
+ last_hidden_state=hidden_states, hidden_states=encoder_states
295
+ )
296
+
297
+
298
+ class InternVisionModel(PreTrainedModel):
299
+ main_input_name = 'pixel_values'
300
+ _supports_flash_attn_2 = True
301
+ config_class = InternVisionConfig
302
+ _no_split_modules = ['InternVisionEncoderLayer']
303
+
304
+ def __init__(self, config: InternVisionConfig):
305
+ super().__init__(config)
306
+ self.config = config
307
+
308
+ self.embeddings = InternVisionEmbeddings(config)
309
+ self.encoder = InternVisionEncoder(config)
310
+
311
+ def resize_pos_embeddings(self, old_size, new_size, patch_size):
312
+ pos_emb = self.embeddings.position_embedding
313
+ _, num_positions, embed_dim = pos_emb.shape
314
+ cls_emb = pos_emb[:, :1, :]
315
+ pos_emb = pos_emb[:, 1:, :].reshape(1, old_size // patch_size, old_size // patch_size, -1).permute(0, 3, 1, 2)
316
+ pos_emb = F.interpolate(pos_emb.float(), size=new_size // patch_size, mode='bicubic', align_corners=False)
317
+ pos_emb = pos_emb.to(cls_emb.dtype).reshape(1, embed_dim, -1).permute(0, 2, 1)
318
+ pos_emb = torch.cat([cls_emb, pos_emb], dim=1)
319
+ self.embeddings.position_embedding = nn.Parameter(pos_emb)
320
+ self.embeddings.image_size = new_size
321
+ logger.info('Resized position embeddings from {} to {}'.format(old_size, new_size))
322
+
323
+ def get_input_embeddings(self):
324
+ return self.embeddings
325
+
326
+ def forward(
327
+ self,
328
+ pixel_values: Optional[torch.FloatTensor] = None,
329
+ output_hidden_states: Optional[bool] = None,
330
+ return_dict: Optional[bool] = None,
331
+ pixel_embeds: Optional[torch.FloatTensor] = None,
332
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
333
+ output_hidden_states = (
334
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
335
+ )
336
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
337
+
338
+ if pixel_values is None and pixel_embeds is None:
339
+ raise ValueError('You have to specify pixel_values or pixel_embeds')
340
+
341
+ if pixel_embeds is not None:
342
+ hidden_states = pixel_embeds
343
+ else:
344
+ if len(pixel_values.shape) == 4:
345
+ hidden_states = self.embeddings(pixel_values)
346
+ else:
347
+ raise ValueError(f'wrong pixel_values size: {pixel_values.shape}')
348
+ encoder_outputs = self.encoder(
349
+ inputs_embeds=hidden_states,
350
+ output_hidden_states=output_hidden_states,
351
+ return_dict=return_dict,
352
+ )
353
+ last_hidden_state = encoder_outputs.last_hidden_state
354
+ pooled_output = last_hidden_state[:, 0, :]
355
+
356
+ if not return_dict:
357
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
358
+
359
+ return BaseModelOutputWithPooling(
360
+ last_hidden_state=last_hidden_state,
361
+ pooler_output=pooled_output,
362
+ hidden_states=encoder_outputs.hidden_states,
363
+ attentions=encoder_outputs.attentions,
364
+ )
modeling_internlm2.py ADDED
@@ -0,0 +1,1429 @@
1
+ # Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # This code is based on transformers/src/transformers/models/llama/modeling_llama.py
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """ PyTorch InternLM2 model."""
17
+ import math
18
+ import queue
19
+ import threading
20
+ import warnings
21
+ from typing import List, Optional, Tuple, Union
22
+
23
+ import torch
24
+ import torch.nn.functional as F
25
+ import torch.utils.checkpoint
26
+ from einops import rearrange
27
+ from torch import nn
28
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
29
+ from transformers.activations import ACT2FN
30
+ from transformers.modeling_outputs import (BaseModelOutputWithPast,
31
+ CausalLMOutputWithPast,
32
+ SequenceClassifierOutputWithPast)
33
+ from transformers.modeling_utils import PreTrainedModel
34
+ from transformers.utils import (add_start_docstrings,
35
+ add_start_docstrings_to_model_forward, logging,
36
+ replace_return_docstrings)
37
+
38
+ try:
39
+ from transformers.generation.streamers import BaseStreamer
40
+ except: # noqa # pylint: disable=bare-except
41
+ BaseStreamer = None
42
+
43
+ from .configuration_internlm2 import InternLM2Config
44
+
45
+ logger = logging.get_logger(__name__)
46
+
47
+ _CONFIG_FOR_DOC = 'InternLM2Config'
48
+
49
+ flash_attn_func, flash_attn_varlen_func = None, None
50
+ pad_input, index_first_axis, unpad_input = None, None, None
51
+ try:
52
+ from flash_attn import flash_attn_func as _flash_attn_func
53
+ from flash_attn import flash_attn_varlen_func as _flash_attn_varlen_func
54
+ from flash_attn.bert_padding import index_first_axis as _index_first_axis
55
+ from flash_attn.bert_padding import pad_input as _pad_input
56
+ from flash_attn.bert_padding import unpad_input as _unpad_input
57
+
58
+ flash_attn_func, flash_attn_varlen_func = _flash_attn_func, _flash_attn_varlen_func
59
+ pad_input, index_first_axis, unpad_input = _pad_input, _index_first_axis, _unpad_input
60
+ has_flash_attn = True
61
+ except:
62
+ has_flash_attn = False
63
+
64
+
65
+ def _import_flash_attn():
66
+ global flash_attn_func, flash_attn_varlen_func
67
+ global pad_input, index_first_axis, unpad_input
68
+ try:
69
+ from flash_attn import flash_attn_func as _flash_attn_func
70
+ from flash_attn import \
71
+ flash_attn_varlen_func as _flash_attn_varlen_func
72
+ from flash_attn.bert_padding import \
73
+ index_first_axis as _index_first_axis
74
+ from flash_attn.bert_padding import pad_input as _pad_input
75
+ from flash_attn.bert_padding import unpad_input as _unpad_input
76
+ flash_attn_func, flash_attn_varlen_func = _flash_attn_func, _flash_attn_varlen_func
77
+ pad_input, index_first_axis, unpad_input = _pad_input, _index_first_axis, _unpad_input
78
+ except ImportError:
79
+ raise ImportError('flash_attn is not installed.')
80
+
81
+
82
+ # Copied from transformers.models.llama.modeling_llama._get_unpad_data
83
+ def _get_unpad_data(attention_mask):
84
+ seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
85
+ indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
86
+ max_seqlen_in_batch = seqlens_in_batch.max().item()
87
+ cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
88
+ return (
89
+ indices,
90
+ cu_seqlens,
91
+ max_seqlen_in_batch,
92
+ )
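A quick worked example of the bookkeeping `_get_unpad_data` produces for the flash-attn varlen path, reproduced inline so it runs on its own: two sequences of lengths 3 and 1, right-padded to length 4.

```python
import torch
import torch.nn.functional as F

attention_mask = torch.tensor([[1, 1, 1, 0],
                               [1, 0, 0, 0]], dtype=torch.int32)
seqlens = attention_mask.sum(dim=-1, dtype=torch.int32)
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
cu_seqlens = F.pad(torch.cumsum(seqlens, dim=0, dtype=torch.int32), (1, 0))
print(indices.tolist())     # [0, 1, 2, 4] -> flattened positions of the real (non-padding) tokens
print(cu_seqlens.tolist())  # [0, 3, 4]    -> cumulative lengths handed to flash_attn_varlen_func
print(int(seqlens.max()))   # 3            -> max_seqlen_in_batch
```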
93
+
94
+
95
+ # Copied from transformers.models.bart.modeling_bart._make_causal_mask
96
+ def _make_causal_mask(
97
+ input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
98
+ ):
99
+ """
100
+ Make causal mask used for bi-directional self-attention.
101
+ """
102
+ bsz, tgt_len = input_ids_shape
103
+ mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min, device=device), device=device)
104
+ mask_cond = torch.arange(mask.size(-1), device=device)
105
+ mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
106
+ mask = mask.to(dtype)
107
+
108
+ if past_key_values_length > 0:
109
+ mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
110
+ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
111
+
112
+
113
+ # Copied from transformers.models.bart.modeling_bart._expand_mask
114
+ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
115
+ """
116
+ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
117
+ """
118
+ bsz, src_len = mask.size()
119
+ tgt_len = tgt_len if tgt_len is not None else src_len
120
+
121
+ expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
122
+
123
+ inverted_mask = 1.0 - expanded_mask
124
+
125
+ return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
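A toy usage of the two mask helpers, combining the causal mask with the expanded padding mask the way the decoder does; this snippet relies on the `_make_causal_mask` and `_expand_mask` functions defined above and is illustrative only.

```python
import torch

mask = torch.tensor([[1, 1, 0]])                                   # one sequence, last position padded
causal = _make_causal_mask(mask.shape, torch.float32, device=mask.device)
padding = _expand_mask(mask, torch.float32, tgt_len=3)
combined = causal + padding                                        # additive bias: 0 = attend, large negative = blocked
print((combined == 0).squeeze())                                   # True only where attention is allowed
```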
126
+
127
+
128
+ # Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->InternLM2
129
+ class InternLM2RMSNorm(nn.Module):
130
+ def __init__(self, hidden_size, eps=1e-6):
131
+ """
132
+ InternLM2RMSNorm is equivalent to T5LayerNorm
133
+ """
134
+ super().__init__()
135
+ self.weight = nn.Parameter(torch.ones(hidden_size))
136
+ self.variance_epsilon = eps
137
+
138
+ def forward(self, hidden_states):
139
+ input_dtype = hidden_states.dtype
140
+ hidden_states = hidden_states.to(torch.float32)
141
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
142
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
143
+ return self.weight * hidden_states.to(input_dtype)
144
+
145
+
146
+ try:
147
+ from functools import partial
148
+
149
+ from apex.normalization import FusedRMSNorm
150
+ InternLM2RMSNorm = partial(FusedRMSNorm, eps=1e-6) # noqa
151
+ print('Discovered apex.normalization.FusedRMSNorm - will use it instead of InternLM2RMSNorm')
152
+ except ImportError:
153
+ # using the normal InternLM2RMSNorm
154
+ pass
155
+ except Exception:
156
+ print('discovered apex but it failed to load, falling back to InternLM2RMSNorm')
157
+ pass
158
+
159
+
160
+ # Copied from transformers.model.llama.modeling_llama.LlamaRotaryEmbedding with Llama->InternLM2
161
+ class InternLM2RotaryEmbedding(nn.Module):
162
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
163
+ super().__init__()
164
+
165
+ self.dim = dim
166
+ self.max_position_embeddings = max_position_embeddings
167
+ self.base = base
168
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
169
+ self.register_buffer('inv_freq', inv_freq, persistent=False)
170
+
171
+ # Build here to make `torch.jit.trace` work.
172
+ self._set_cos_sin_cache(
173
+ seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
174
+ )
175
+
176
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
177
+ self.max_seq_len_cached = seq_len
178
+ t = torch.arange(self.max_seq_len_cached, device=device).to(dtype=self.inv_freq.dtype)
179
+
180
+ freqs = torch.einsum('i,j->ij', t, self.inv_freq)
181
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
182
+ emb = torch.cat((freqs, freqs), dim=-1)
183
+ self.register_buffer('cos_cached', emb.cos().to(dtype), persistent=False)
184
+ self.register_buffer('sin_cached', emb.sin().to(dtype), persistent=False)
185
+
186
+ def forward(self, x, seq_len=None):
187
+ # x: [bs, num_attention_heads, seq_len, head_size]
188
+ if seq_len > self.max_seq_len_cached:
189
+ self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=torch.float32)
190
+
191
+ return (
192
+ self.cos_cached[:seq_len].to(dtype=x.dtype),
193
+ self.sin_cached[:seq_len].to(dtype=x.dtype),
194
+ )
195
+
196
+
197
+ # Copied from transformers.model.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->InternLM2
198
+ class InternLM2LinearScalingRotaryEmbedding(InternLM2RotaryEmbedding):
199
+ """InternLM2RotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
200
+
201
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
202
+ self.scaling_factor = scaling_factor
203
+ super().__init__(dim, max_position_embeddings, base, device)
204
+
205
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
206
+ self.max_seq_len_cached = seq_len
207
+ t = torch.arange(self.max_seq_len_cached, device=device).to(dtype=self.inv_freq.dtype)
208
+ t = t / self.scaling_factor
209
+
210
+ freqs = torch.einsum('i,j->ij', t, self.inv_freq)
211
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
212
+ emb = torch.cat((freqs, freqs), dim=-1)
213
+ self.register_buffer('cos_cached', emb.cos().to(dtype), persistent=False)
214
+ self.register_buffer('sin_cached', emb.sin().to(dtype), persistent=False)
215
+
216
+
217
+ # Copied from transformers.model.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->InternLM2
218
+ class InternLM2DynamicNTKScalingRotaryEmbedding(InternLM2RotaryEmbedding):
219
+ """InternLM2RotaryEmbedding extended with Dynamic NTK scaling.
220
+ Credits to the Reddit users /u/bloc97 and /u/emozilla.
221
+ """
222
+
223
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
224
+ self.scaling_factor = scaling_factor
225
+ super().__init__(dim, max_position_embeddings, base, device)
226
+
227
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
228
+ self.max_seq_len_cached = seq_len
229
+
230
+ if seq_len > self.max_position_embeddings:
231
+ base = self.base * (
232
+ (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
233
+ ) ** (self.dim / (self.dim - 2))
234
+ inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
235
+ self.register_buffer('inv_freq', inv_freq, persistent=False)
236
+
237
+ t = torch.arange(self.max_seq_len_cached, device=device).to(dtype=self.inv_freq.dtype)
238
+
239
+ freqs = torch.einsum('i,j->ij', t, self.inv_freq)
240
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
241
+ emb = torch.cat((freqs, freqs), dim=-1)
242
+ self.register_buffer('cos_cached', emb.cos().to(dtype), persistent=False)
243
+ self.register_buffer('sin_cached', emb.sin().to(dtype), persistent=False)
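For intuition, the rescaling applied when `seq_len` exceeds `max_position_embeddings` can be worked out by hand; with the default `scaling_factor=1.0`, a 2x longer context roughly doubles the rotary base. The numbers below are illustrative only:

```python
# head_dim=128, base=10000, trained context 2048, prompt of 4096 tokens
dim, base, max_pos, factor, seq_len = 128, 10000.0, 2048, 1.0, 4096
new_base = base * ((factor * seq_len / max_pos) - (factor - 1)) ** (dim / (dim - 2))
print(round(new_base))  # ~20221 -- lower frequencies stretch so positions beyond 2048 stay distinguishable
```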
244
+
245
+
246
+ # Copied from transformers.model.llama.modeling_llama.rotate_half
247
+ def rotate_half(x):
248
+ """Rotates half the hidden dims of the input."""
249
+ x1 = x[..., : x.shape[-1] // 2]
250
+ x2 = x[..., x.shape[-1] // 2:]
251
+ return torch.cat((-x2, x1), dim=-1)
252
+
253
+
254
+ # Copied from transformers.model.llama.modeling_llama.apply_rotary_pos_emb
255
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
256
+ """Applies Rotary Position Embedding to the query and key tensors."""
257
+ cos = cos[position_ids].unsqueeze(unsqueeze_dim)
258
+ sin = sin[position_ids].unsqueeze(unsqueeze_dim)
259
+ q_embed = (q * cos) + (rotate_half(q) * sin)
260
+ k_embed = (k * cos) + (rotate_half(k) * sin)
261
+ return q_embed, k_embed
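A tiny end-to-end check of the rotary helpers (with a standalone copy of `rotate_half` so the snippet runs on its own): build the duplicated cos/sin layout for a 4-dimensional head and rotate a random query in the `(bsz, n_heads, seq_len, head_dim)` layout used by the attention layers.

```python
import torch

def rotate_half(x):
    x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2:]
    return torch.cat((-x2, x1), dim=-1)

dim, seq_len = 4, 3
inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
freqs = torch.einsum('i,j->ij', torch.arange(seq_len).float(), inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)                 # same duplicated layout as the cached cos/sin
cos, sin = emb.cos(), emb.sin()

q = torch.randn(1, 1, seq_len, dim)                     # (bsz, n_heads, seq_len, head_dim)
position_ids = torch.arange(seq_len)[None]              # (bsz, seq_len)
cos_sel = cos[position_ids].unsqueeze(1)                # unsqueeze_dim=1 broadcasts over heads
sin_sel = sin[position_ids].unsqueeze(1)
q_rot = (q * cos_sel) + (rotate_half(q) * sin_sel)
print(q_rot.shape)                                      # torch.Size([1, 1, 3, 4])
```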
262
+
263
+
264
+ class InternLM2MLP(nn.Module):
265
+ def __init__(self, config):
266
+ super().__init__()
267
+ self.config = config
268
+ self.hidden_size = config.hidden_size
269
+ self.intermediate_size = config.intermediate_size
270
+ self.w1 = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
271
+ self.w3 = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
272
+ self.w2 = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
273
+ self.act_fn = ACT2FN[config.hidden_act]
274
+
275
+ def forward(self, x):
276
+ down_proj = self.w2(self.act_fn(self.w1(x)) * self.w3(x))
277
+
278
+ return down_proj
279
+
280
+
281
+ # Copied from transformers.model.llama.modeling_llama.repeat_kv
282
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
283
+ """
284
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
285
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
286
+ """
287
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
288
+ if n_rep == 1:
289
+ return hidden_states
290
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
291
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
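The docstring's claim that this matches `torch.repeat_interleave` along the key/value head dimension is easy to verify with a quick check:

```python
import torch

kv = torch.randn(2, 4, 5, 8)      # (batch, num_key_value_heads, seq_len, head_dim)
n_rep = 3
expanded = kv[:, :, None, :, :].expand(2, 4, n_rep, 5, 8).reshape(2, 4 * n_rep, 5, 8)
reference = torch.repeat_interleave(kv, repeats=n_rep, dim=1)
print(torch.equal(expanded, reference))  # True
```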
292
+
293
+
294
+ # Modified from transformers.model.llama.modeling_llama.LlamaAttention
295
+ class InternLM2Attention(nn.Module):
296
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
297
+
298
+ def __init__(self, config: InternLM2Config):
299
+ super().__init__()
300
+ self.config = config
301
+ self.hidden_size = config.hidden_size
302
+ self.num_heads = config.num_attention_heads
303
+ self.head_dim = self.hidden_size // self.num_heads
304
+ self.num_key_value_heads = config.num_key_value_heads
305
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
306
+ self.max_position_embeddings = config.max_position_embeddings
307
+ self.is_causal = True
308
+
309
+ if (self.head_dim * self.num_heads) != self.hidden_size:
310
+ raise ValueError(
311
+ f'hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}'
312
+ f' and `num_heads`: {self.num_heads}).'
313
+ )
314
+
315
+ self.wqkv = nn.Linear(
316
+ self.hidden_size,
317
+ (self.num_heads + 2 * self.num_key_value_heads) * self.head_dim,
318
+ bias=config.bias,
319
+ )
320
+
321
+ self.wo = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.bias)
322
+ self._init_rope()
323
+
324
+ def _init_rope(self):
325
+ if self.config.rope_scaling is None:
326
+ self.rotary_emb = InternLM2RotaryEmbedding(
327
+ self.head_dim,
328
+ max_position_embeddings=self.max_position_embeddings,
329
+ base=self.config.rope_theta,
330
+ )
331
+ else:
332
+ scaling_type = self.config.rope_scaling['type']
333
+ scaling_factor = self.config.rope_scaling['factor']
334
+ if scaling_type == 'dynamic':
335
+ self.rotary_emb = InternLM2DynamicNTKScalingRotaryEmbedding(
336
+ self.head_dim,
337
+ max_position_embeddings=self.max_position_embeddings,
338
+ base=self.config.rope_theta,
339
+ scaling_factor=scaling_factor,
340
+ )
341
+ elif scaling_type == 'linear':
342
+ self.rotary_emb = InternLM2LinearScalingRotaryEmbedding(
343
+ self.head_dim,
344
+ max_position_embeddings=self.max_position_embeddings,
345
+ base=self.config.rope_theta,
346
+ scaling_factor=scaling_factor,
347
+ )
348
+ else:
349
+ raise ValueError("Currently we only support rotary embedding's type being 'dynamic' or 'linear'.")
350
+ return self.rotary_emb
351
+
352
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
353
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
354
+
355
+ def forward(
356
+ self,
357
+ hidden_states: torch.Tensor,
358
+ attention_mask: Optional[torch.Tensor] = None,
359
+ position_ids: Optional[torch.LongTensor] = None,
360
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
361
+ output_attentions: bool = False,
362
+ use_cache: bool = False,
363
+ **kwargs,
364
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
365
+ if 'padding_mask' in kwargs:
366
+ warnings.warn(
367
+ 'Passing `padding_mask` is deprecated and will be removed in v4.37. '
368
+ 'Please make sure to use `attention_mask` instead.'
369
+ )
370
+
371
+ bsz, q_len, _ = hidden_states.size()
372
+
373
+ qkv_states = self.wqkv(hidden_states)
374
+
375
+ qkv_states = rearrange(
376
+ qkv_states,
377
+ 'b q (h gs d) -> b q h gs d',
378
+ gs=2 + self.num_key_value_groups,
379
+ d=self.head_dim,
380
+ )
381
+
382
+ query_states = qkv_states[..., : self.num_key_value_groups, :]
383
+ query_states = rearrange(query_states, 'b q h gs d -> b q (h gs) d')
384
+ key_states = qkv_states[..., -2, :]
385
+ value_states = qkv_states[..., -1, :]
386
+
387
+ query_states = query_states.transpose(1, 2)
388
+ key_states = key_states.transpose(1, 2)
389
+ value_states = value_states.transpose(1, 2)
390
+
391
+ kv_seq_len = key_states.shape[-2]
392
+ if past_key_value is not None:
393
+ kv_seq_len += past_key_value[0].shape[-2]
394
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
395
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
396
+
397
+ if past_key_value is not None:
398
+ # reuse k, v, self_attention
399
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
400
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
401
+
402
+ past_key_value = (key_states, value_states) if use_cache else None
403
+
404
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
405
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
406
+
407
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
408
+
409
+ if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
410
+ raise ValueError(
411
+ f'Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is'
412
+ f' {attn_weights.size()}'
413
+ )
414
+
415
+ if attention_mask is not None:
416
+ if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
417
+ raise ValueError(
418
+ f'Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}'
419
+ )
420
+ attn_weights = attn_weights + attention_mask
421
+
422
+ # upcast attention to fp32
423
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
424
+ attn_output = torch.matmul(attn_weights, value_states)
425
+
426
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
427
+ raise ValueError(
428
+ f'`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is'
429
+ f' {attn_output.size()}'
430
+ )
431
+
432
+ attn_output = attn_output.transpose(1, 2).contiguous()
433
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
434
+
435
+ attn_output = self.wo(attn_output)
436
+
437
+ if not output_attentions:
438
+ attn_weights = None
439
+
440
+ return attn_output, attn_weights, past_key_value
441
+
442
+
443
+ # Modified from transformers.model.llama.modeling_llama.InternLM2FlashAttention2
444
+ class InternLM2FlashAttention2(InternLM2Attention):
445
+ """
446
+ InternLM2 flash attention module. This module inherits from `InternLM2Attention`, as the weights of the module stay
447
+ untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
448
+ flash attention and deal with padding tokens in case the input contains any of them.
449
+ """
450
+
451
+ def forward(
452
+ self,
453
+ hidden_states: torch.Tensor,
454
+ attention_mask: Optional[torch.LongTensor] = None,
455
+ position_ids: Optional[torch.LongTensor] = None,
456
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
457
+ output_attentions: bool = False,
458
+ use_cache: bool = False,
459
+ **kwargs,
460
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
461
+ # InternLM2FlashAttention2 attention does not support output_attentions
462
+ if 'padding_mask' in kwargs:
463
+ warnings.warn(
464
+ 'Passing `padding_mask` is deprecated and will be removed in v4.37. '
465
+ 'Please make sure to use `attention_mask` instead.'
466
+ )
467
+
468
+ # overwrite attention_mask with padding_mask
469
+ attention_mask = kwargs.pop('padding_mask')
470
+
471
+ output_attentions = False
472
+
473
+ bsz, q_len, _ = hidden_states.size()
474
+
475
+ qkv_states = self.wqkv(hidden_states)
476
+
477
+ qkv_states = rearrange(
478
+ qkv_states,
479
+ 'b q (h gs d) -> b q h gs d',
480
+ gs=2 + self.num_key_value_groups,
481
+ d=self.head_dim,
482
+ )
483
+
484
+ query_states = qkv_states[..., : self.num_key_value_groups, :]
485
+ query_states = rearrange(query_states, 'b q h gs d -> b q (h gs) d')
486
+ key_states = qkv_states[..., -2, :]
487
+ value_states = qkv_states[..., -1, :]
488
+
489
+ query_states = query_states.transpose(1, 2)
490
+ key_states = key_states.transpose(1, 2)
491
+ value_states = value_states.transpose(1, 2)
492
+
493
+ kv_seq_len = key_states.shape[-2]
494
+ if past_key_value is not None:
495
+ kv_seq_len += past_key_value[0].shape[-2]
496
+
497
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
498
+
499
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
500
+
501
+ if past_key_value is not None:
502
+ # reuse k, v, self_attention
503
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
504
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
505
+
506
+ past_key_value = (key_states, value_states) if use_cache else None
507
+
508
+ query_states = query_states.transpose(1, 2)
509
+ key_states = key_states.transpose(1, 2)
510
+ value_states = value_states.transpose(1, 2)
511
+
512
+ attn_output = self._flash_attention_forward(
513
+ query_states, key_states, value_states, attention_mask, q_len
514
+ )
515
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
516
+ attn_output = self.wo(attn_output)
517
+
518
+ if not output_attentions:
519
+ attn_weights = None
520
+
521
+ return attn_output, attn_weights, past_key_value
522
+
523
+ def _flash_attention_forward(
524
+ self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
525
+ ):
526
+ """
527
+ Calls the forward method of Flash Attention. If the input hidden states contain at least one padding token,
528
+ first unpad the input, then compute the attention scores, and finally pad the attention scores back.
529
+
530
+ Args:
531
+ query_states (`torch.Tensor`):
532
+ Input query states to be passed to Flash Attention API
533
+ key_states (`torch.Tensor`):
534
+ Input key states to be passed to Flash Attention API
535
+ value_states (`torch.Tensor`):
536
+ Input value states to be passed to Flash Attention API
537
+ attention_mask (`torch.Tensor`):
538
+ The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
539
+ position of padding tokens and 1 for the position of non-padding tokens.
540
+ dropout (`float`, *optional*):
541
+ Attention dropout
542
+ softmax_scale (`float`, *optional*):
543
+ The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
544
+ """
545
+ # Contains at least one padding token in the sequence
546
+ causal = self.is_causal and query_length != 1
547
+ if attention_mask is not None:
548
+ batch_size = query_states.shape[0]
549
+ query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._unpad_input(
550
+ query_states, key_states, value_states, attention_mask, query_length
551
+ )
552
+
553
+ cu_seqlens_q, cu_seqlens_k = cu_seq_lens
554
+ max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
555
+
556
+ attn_output_unpad = flash_attn_varlen_func(
557
+ query_states,
558
+ key_states,
559
+ value_states,
560
+ cu_seqlens_q=cu_seqlens_q,
561
+ cu_seqlens_k=cu_seqlens_k,
562
+ max_seqlen_q=max_seqlen_in_batch_q,
563
+ max_seqlen_k=max_seqlen_in_batch_k,
564
+ dropout_p=dropout,
565
+ softmax_scale=softmax_scale,
566
+ causal=causal,
567
+ )
568
+
569
+ attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
570
+ else:
571
+ attn_output = flash_attn_func(
572
+ query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
573
+ )
574
+
575
+ return attn_output
576
+
577
+ def _unpad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
578
+ indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
579
+ batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
580
+
581
+ key_layer = index_first_axis(
582
+ key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
583
+ )
584
+ value_layer = index_first_axis(
585
+ value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
586
+ )
587
+
588
+ if query_length == kv_seq_len:
589
+ query_layer = index_first_axis(
590
+ query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
591
+ )
592
+ cu_seqlens_q = cu_seqlens_k
593
+ max_seqlen_in_batch_q = max_seqlen_in_batch_k
594
+ indices_q = indices_k
595
+ elif query_length == 1:
596
+ max_seqlen_in_batch_q = 1
597
+ cu_seqlens_q = torch.arange(
598
+ batch_size + 1, dtype=torch.int32, device=query_layer.device
599
+ ) # There is a memcpy here, that is very bad.
600
+ indices_q = cu_seqlens_q[:-1]
601
+ query_layer = query_layer.squeeze(1)
602
+ else:
603
+ # The -q_len: slice assumes left padding.
604
+ attention_mask = attention_mask[:, -query_length:]
605
+ query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
606
+
607
+ return (
608
+ query_layer,
609
+ key_layer,
610
+ value_layer,
611
+ indices_q.to(torch.int64),
612
+ (cu_seqlens_q, cu_seqlens_k),
613
+ (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
614
+ )
615
+
616
+
617
+ INTERNLM2_ATTENTION_CLASSES = {
618
+ 'eager': InternLM2Attention,
619
+ 'flash_attention_2': InternLM2FlashAttention2,
620
+ }
621
+
622
+
623
+ # Modified from transformers.model.llama.modeling_llama.LlamaDecoderLayer
624
+ class InternLM2DecoderLayer(nn.Module):
625
+ def __init__(self, config: InternLM2Config):
626
+ super().__init__()
627
+ self.hidden_size = config.hidden_size
628
+
629
+ self.attention = INTERNLM2_ATTENTION_CLASSES[config.attn_implementation](config=config)
630
+
631
+ self.feed_forward = InternLM2MLP(config)
632
+ self.attention_norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
633
+ self.ffn_norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
634
+
635
+ def forward(
636
+ self,
637
+ hidden_states: torch.Tensor,
638
+ attention_mask: Optional[torch.Tensor] = None,
639
+ position_ids: Optional[torch.LongTensor] = None,
640
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
641
+ output_attentions: Optional[bool] = False,
642
+ use_cache: Optional[bool] = False,
643
+ **kwargs,
644
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
645
+ """
646
+ Args:
647
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
648
+ attention_mask (`torch.FloatTensor`, *optional*):
649
+ attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
650
+ query_sequence_length, key_sequence_length)` if default attention is used.
651
+ output_attentions (`bool`, *optional*):
652
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
653
+ returned tensors for more detail.
654
+ use_cache (`bool`, *optional*):
655
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
656
+ (see `past_key_values`).
657
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
658
+ """
659
+ if 'padding_mask' in kwargs:
660
+ warnings.warn(
661
+ 'Passing `padding_mask` is deprecated and will be removed in v4.37. '
662
+ 'Please make sure to use `attention_mask` instead.'
663
+ )
664
+
665
+ residual = hidden_states
666
+
667
+ hidden_states = self.attention_norm(hidden_states)
668
+
669
+ # Self Attention
670
+ hidden_states, self_attn_weights, present_key_value = self.attention(
671
+ hidden_states=hidden_states,
672
+ attention_mask=attention_mask,
673
+ position_ids=position_ids,
674
+ past_key_value=past_key_value,
675
+ output_attentions=output_attentions,
676
+ use_cache=use_cache,
677
+ **kwargs,
678
+ )
679
+ hidden_states = residual + hidden_states
680
+
681
+ # Fully Connected
682
+ residual = hidden_states
683
+ hidden_states = self.ffn_norm(hidden_states)
684
+ hidden_states = self.feed_forward(hidden_states)
685
+ hidden_states = residual + hidden_states
686
+
687
+ outputs = (hidden_states,)
688
+
689
+ if output_attentions:
690
+ outputs += (self_attn_weights,)
691
+
692
+ if use_cache:
693
+ outputs += (present_key_value,)
694
+
695
+ return outputs
696
+
697
+
698
+ InternLM2_START_DOCSTRING = r"""
699
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
700
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
701
+ etc.)
702
+
703
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
704
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
705
+ and behavior.
706
+
707
+ Parameters:
708
+ config ([`InternLM2Config`]):
709
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
710
+ load the weights associated with the model, only the configuration. Check out the
711
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
712
+ """
713
+
714
+
715
+ # Copied from transformers.models.llama.modeling_llama.LlamaPreTrainedModel with Llama->InternLM2
716
+ @add_start_docstrings(
717
+ 'The bare InternLM2 Model outputting raw hidden-states without any specific head on top.',
718
+ InternLM2_START_DOCSTRING,
719
+ )
720
+ class InternLM2PreTrainedModel(PreTrainedModel):
721
+ config_class = InternLM2Config
722
+ base_model_prefix = 'model'
723
+ supports_gradient_checkpointing = True
724
+ _no_split_modules = ['InternLM2DecoderLayer']
725
+ _skip_keys_device_placement = 'past_key_values'
726
+ _supports_flash_attn_2 = True
727
+
728
+ def _init_weights(self, module):
729
+ std = self.config.initializer_range
730
+ if isinstance(module, nn.Linear):
731
+ module.weight.data.normal_(mean=0.0, std=std)
732
+ if module.bias is not None:
733
+ module.bias.data.zero_()
734
+ elif isinstance(module, nn.Embedding):
735
+ module.weight.data.normal_(mean=0.0, std=std)
736
+ if module.padding_idx is not None:
737
+ module.weight.data[module.padding_idx].zero_()
738
+
739
+
740
+ InternLM2_INPUTS_DOCSTRING = r"""
741
+ Args:
742
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
743
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
744
+ it.
745
+
746
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
747
+ [`PreTrainedTokenizer.__call__`] for details.
748
+
749
+ [What are input IDs?](../glossary#input-ids)
750
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
751
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
752
+
753
+ - 1 for tokens that are **not masked**,
754
+ - 0 for tokens that are **masked**.
755
+
756
+ [What are attention masks?](../glossary#attention-mask)
757
+
758
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
759
+ [`PreTrainedTokenizer.__call__`] for details.
760
+
761
+ If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
762
+ `past_key_values`).
763
+
764
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
765
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
766
+ information on the default strategy.
767
+
768
+ - 1 indicates the head is **not masked**,
769
+ - 0 indicates the head is **masked**.
770
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
771
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
772
+ config.n_positions - 1]`.
773
+
774
+ [What are position IDs?](../glossary#position-ids)
775
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or
776
+ when `config.use_cache=True`):
777
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
778
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
779
+ `(batch_size, num_heads, decoder_sequence_length, embed_size_per_head)`.
780
+
781
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
782
+ blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
783
+
784
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
785
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
786
+ of shape `(batch_size, sequence_length)`.
787
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
788
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
789
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
790
+ model's internal embedding lookup matrix.
791
+ use_cache (`bool`, *optional*):
792
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
793
+ `past_key_values`).
794
+ output_attentions (`bool`, *optional*):
795
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
796
+ tensors for more detail.
797
+ output_hidden_states (`bool`, *optional*):
798
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
799
+ more detail.
800
+ return_dict (`bool`, *optional*):
801
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
802
+ """
803
+
804
+
805
+ # Modified from transformers.model.llama.modeling_llama.LlamaModel
806
+ @add_start_docstrings(
807
+ 'The bare InternLM2 Model outputting raw hidden-states without any specific head on top.',
808
+ InternLM2_START_DOCSTRING,
809
+ )
810
+ class InternLM2Model(InternLM2PreTrainedModel):
811
+ """
812
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`InternLM2DecoderLayer`]
813
+
814
+ Args:
815
+ config: InternLM2Config
816
+ """
817
+
818
+ _auto_class = 'AutoModel'
819
+
820
+ def __init__(self, config: InternLM2Config):
821
+ super().__init__(config)
822
+ self.padding_idx = config.pad_token_id
823
+ self.vocab_size = config.vocab_size
824
+ self.config = config
825
+ if not has_flash_attn:
826
+ self.config.attn_implementation = 'eager'
827
+ print('Warning: Flash attention is not available, using eager attention instead.')
828
+
829
+ self.tok_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
830
+
831
+ self.layers = nn.ModuleList([InternLM2DecoderLayer(config) for _ in range(config.num_hidden_layers)])
832
+ self.norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
833
+
834
+ self.gradient_checkpointing = False
835
+ # Initialize weights and apply final processing
836
+ self.post_init()
837
+
838
+ def get_input_embeddings(self):
839
+ return self.tok_embeddings
840
+
841
+ def set_input_embeddings(self, value):
842
+ self.tok_embeddings = value
843
+
844
+ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
845
+ # create causal mask
846
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
847
+ combined_attention_mask = None
848
+ if input_shape[-1] > 1:
849
+ combined_attention_mask = _make_causal_mask(
850
+ input_shape,
851
+ inputs_embeds.dtype,
852
+ device=inputs_embeds.device,
853
+ past_key_values_length=past_key_values_length,
854
+ )
855
+
856
+ if attention_mask is not None:
857
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
858
+ expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
859
+ inputs_embeds.device
860
+ )
861
+ combined_attention_mask = (
862
+ expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
863
+ )
864
+
865
+ return combined_attention_mask
866
+
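A minimal sketch of what `_prepare_decoder_attention_mask` produces: an additive `[bsz, 1, tgt_len, src_len]` mask in which future and padded positions receive a large negative value. The helper below is illustrative only; the real `_make_causal_mask`/`_expand_mask` also handle `past_key_values_length` and dtype details.

```python
import torch

def toy_combined_mask(attention_mask, dtype=torch.float32):
    """Illustrative only: combine a causal mask and a padding mask additively."""
    bsz, seq_len = attention_mask.shape
    min_val = torch.finfo(dtype).min
    causal = torch.triu(torch.full((seq_len, seq_len), min_val, dtype=dtype), diagonal=1)
    causal = causal[None, None, :, :]                                        # [1, 1, tgt, src]
    padding = (1.0 - attention_mask[:, None, None, :].to(dtype)) * min_val   # [bsz, 1, 1, src]
    return causal + padding

print(toy_combined_mask(torch.tensor([[1, 1, 1, 0]])).shape)  # torch.Size([1, 1, 4, 4])
```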
867
+ @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING)
868
+ def forward(
869
+ self,
870
+ input_ids: torch.LongTensor = None,
871
+ attention_mask: Optional[torch.Tensor] = None,
872
+ position_ids: Optional[torch.LongTensor] = None,
873
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
874
+ inputs_embeds: Optional[torch.FloatTensor] = None,
875
+ use_cache: Optional[bool] = None,
876
+ output_attentions: Optional[bool] = None,
877
+ output_hidden_states: Optional[bool] = None,
878
+ return_dict: Optional[bool] = None,
879
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
880
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
881
+ output_hidden_states = (
882
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
883
+ )
884
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
885
+
886
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
887
+
888
+ if self.config.attn_implementation == 'flash_attention_2':
889
+ _import_flash_attn()
890
+
891
+ # retrieve input_ids and inputs_embeds
892
+ if input_ids is not None and inputs_embeds is not None:
893
+ raise ValueError('You cannot specify both input_ids and inputs_embeds at the same time')
894
+ elif input_ids is not None:
895
+ batch_size, seq_length = input_ids.shape[:2]
896
+ elif inputs_embeds is not None:
897
+ batch_size, seq_length = inputs_embeds.shape[:2]
898
+ else:
899
+ raise ValueError('You have to specify either input_ids or inputs_embeds')
900
+
901
+ seq_length_with_past = seq_length
902
+ past_key_values_length = 0
903
+ if past_key_values is not None:
904
+ past_key_values_length = past_key_values[0][0].shape[2]
905
+ seq_length_with_past = seq_length_with_past + past_key_values_length
906
+
907
+ if position_ids is None:
908
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
909
+ position_ids = torch.arange(
910
+ past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
911
+ )
912
+ position_ids = position_ids.unsqueeze(0)
913
+
914
+ if inputs_embeds is None:
915
+ inputs_embeds = self.tok_embeddings(input_ids)
916
+
917
+ if self.config.attn_implementation == 'flash_attention_2':
918
+ # 2d mask is passed through the layers
919
+ attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
920
+ else:
921
+ if attention_mask is None:
922
+ attention_mask = torch.ones(
923
+ (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
924
+ )
925
+ attention_mask = self._prepare_decoder_attention_mask(
926
+ attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
927
+ )
928
+
929
+ # embed positions
930
+ hidden_states = inputs_embeds
931
+
932
+ if self.gradient_checkpointing and self.training:
933
+ if use_cache:
934
+ logger.warning_once(
935
+ '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...'
936
+ )
937
+ use_cache = False
938
+
939
+ # decoder layers
940
+ all_hidden_states = () if output_hidden_states else None
941
+ all_self_attns = () if output_attentions else None
942
+ next_decoder_cache = () if use_cache else None
943
+
944
+ for idx, decoder_layer in enumerate(self.layers):
945
+ if output_hidden_states:
946
+ all_hidden_states += (hidden_states,)
947
+
948
+ past_key_value = past_key_values[idx] if past_key_values is not None else None
949
+
950
+ if self.gradient_checkpointing and self.training:
951
+
952
+ def create_custom_forward(module):
953
+ def custom_forward(*inputs):
954
+ # None for past_key_value
955
+ return module(*inputs, output_attentions, None)
956
+
957
+ return custom_forward
958
+
959
+ layer_outputs = torch.utils.checkpoint.checkpoint(
960
+ create_custom_forward(decoder_layer),
961
+ hidden_states,
962
+ attention_mask,
963
+ position_ids,
964
+ None,
965
+ )
966
+ else:
967
+ layer_outputs = decoder_layer(
968
+ hidden_states,
969
+ attention_mask=attention_mask,
970
+ position_ids=position_ids,
971
+ past_key_value=past_key_value,
972
+ output_attentions=output_attentions,
973
+ use_cache=use_cache,
974
+ )
975
+
976
+ hidden_states = layer_outputs[0]
977
+
978
+ if use_cache:
979
+ next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
980
+
981
+ if output_attentions:
982
+ all_self_attns += (layer_outputs[1],)
983
+
984
+ hidden_states = self.norm(hidden_states)
985
+
986
+ # add hidden states from the last decoder layer
987
+ if output_hidden_states:
988
+ all_hidden_states += (hidden_states,)
989
+
990
+ next_cache = next_decoder_cache if use_cache else None
991
+ if not return_dict:
992
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
993
+ return BaseModelOutputWithPast(
994
+ last_hidden_state=hidden_states,
995
+ past_key_values=next_cache,
996
+ hidden_states=all_hidden_states,
997
+ attentions=all_self_attns,
998
+ )
999
+
1000
+
1001
+ # Modified from transformers.model.llama.modeling_llama.LlamaForCausalLM
1002
+ class InternLM2ForCausalLM(InternLM2PreTrainedModel):
1003
+ _auto_class = 'AutoModelForCausalLM'
1004
+
1005
+ _tied_weights_keys = ['output.weight']
1006
+
1007
+ def __init__(self, config):
1008
+ super().__init__(config)
1009
+ self.model = InternLM2Model(config)
1010
+ self.vocab_size = config.vocab_size
1011
+ self.output = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
1012
+
1013
+ # Initialize weights and apply final processing
1014
+ self.post_init()
1015
+
1016
+ def get_input_embeddings(self):
1017
+ return self.model.tok_embeddings
1018
+
1019
+ def set_input_embeddings(self, value):
1020
+ self.model.tok_embeddings = value
1021
+
1022
+ def get_output_embeddings(self):
1023
+ return self.output
1024
+
1025
+ def set_output_embeddings(self, new_embeddings):
1026
+ self.output = new_embeddings
1027
+
1028
+ def set_decoder(self, decoder):
1029
+ self.model = decoder
1030
+
1031
+ def get_decoder(self):
1032
+ return self.model
1033
+
1034
+ @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING)
1035
+ @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
1036
+ def forward(
1037
+ self,
1038
+ input_ids: torch.LongTensor = None,
1039
+ attention_mask: Optional[torch.Tensor] = None,
1040
+ position_ids: Optional[torch.LongTensor] = None,
1041
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1042
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1043
+ labels: Optional[torch.LongTensor] = None,
1044
+ use_cache: Optional[bool] = None,
1045
+ output_attentions: Optional[bool] = None,
1046
+ output_hidden_states: Optional[bool] = None,
1047
+ return_dict: Optional[bool] = None,
1048
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
1049
+ r"""
1050
+ Args:
1051
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1052
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
1053
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
1054
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
1055
+
1056
+ Returns:
1057
+
1058
+ Example:
1059
+
1060
+ ```python
1061
+ >>> from transformers import AutoTokenizer, InternLM2ForCausalLM
1062
+
1063
+ >>> model = InternLM2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
1064
+ >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
1065
+
1066
+ >>> prompt = "Hey, are you conscious? Can you talk to me?"
1067
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
1068
+
1069
+ >>> # Generate
1070
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
1071
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
1072
+ "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
1073
+ ```"""
1074
+
1075
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1076
+ output_hidden_states = (
1077
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1078
+ )
1079
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1080
+
1081
+ # decoder outputs consist of (dec_features, layer_state, dec_hidden, dec_attn)
1082
+ outputs = self.model(
1083
+ input_ids=input_ids,
1084
+ attention_mask=attention_mask,
1085
+ position_ids=position_ids,
1086
+ past_key_values=past_key_values,
1087
+ inputs_embeds=inputs_embeds,
1088
+ use_cache=use_cache,
1089
+ output_attentions=output_attentions,
1090
+ output_hidden_states=output_hidden_states,
1091
+ return_dict=return_dict,
1092
+ )
1093
+
1094
+ hidden_states = outputs[0]
1095
+ logits = self.output(hidden_states)
1096
+ logits = logits.float()
1097
+
1098
+ loss = None
1099
+ if labels is not None:
1100
+ # Shift so that tokens < n predict n
1101
+ shift_logits = logits[..., :-1, :].contiguous()
1102
+ shift_labels = labels[..., 1:].contiguous()
1103
+ # Flatten the tokens
1104
+ loss_fct = CrossEntropyLoss()
1105
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
1106
+ shift_labels = shift_labels.view(-1)
1107
+ # Enable model parallelism
1108
+ shift_labels = shift_labels.to(shift_logits.device)
1109
+ loss = loss_fct(shift_logits, shift_labels)
1110
+
1111
+ if not return_dict:
1112
+ output = (logits,) + outputs[1:]
1113
+ return (loss,) + output if loss is not None else output
1114
+
1115
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
1116
+ output = CausalLMOutputWithPast(
1117
+ loss=loss,
1118
+ logits=logits,
1119
+ past_key_values=outputs.past_key_values,
1120
+ hidden_states=outputs.hidden_states,
1121
+ attentions=outputs.attentions,
1122
+ )
1123
+ output['logits'] = output['logits'].to(device)
1124
+ return output
1125
+
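The label shift in the loss above pairs the logits at position t with the token at position t+1. A tiny self-contained sketch of that computation (random tensors, purely illustrative):

```python
import torch
from torch.nn import CrossEntropyLoss

vocab_size = 11
logits = torch.randn(2, 5, vocab_size)          # [batch, seq_len, vocab]
labels = torch.randint(0, vocab_size, (2, 5))   # positions set to -100 would be ignored

shift_logits = logits[..., :-1, :].contiguous().view(-1, vocab_size)
shift_labels = labels[..., 1:].contiguous().view(-1)
print(CrossEntropyLoss()(shift_logits, shift_labels))
```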
1126
+ def prepare_inputs_for_generation(
1127
+ self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
1128
+ ):
1129
+ if past_key_values is not None:
1130
+ past_length = past_key_values[0][0].shape[2]
1131
+
1132
+ # Some generation methods already pass only the last input ID
1133
+ if input_ids.shape[1] > past_length:
1134
+ remove_prefix_length = past_length
1135
+ else:
1136
+ # Default to old behavior: keep only final ID
1137
+ remove_prefix_length = input_ids.shape[1] - 1
1138
+
1139
+ input_ids = input_ids[:, remove_prefix_length:]
1140
+
1141
+ position_ids = kwargs.get('position_ids', None)
1142
+ if attention_mask is not None and position_ids is None:
1143
+ # create position_ids on the fly for batch generation
1144
+ position_ids = attention_mask.long().cumsum(-1) - 1
1145
+ position_ids.masked_fill_(attention_mask == 0, 1)
1146
+ if past_key_values:
1147
+ position_ids = position_ids[:, -input_ids.shape[1]:]
1148
+
1149
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
1150
+ if inputs_embeds is not None and past_key_values is None:
1151
+ model_inputs = {'inputs_embeds': inputs_embeds}
1152
+ else:
1153
+ model_inputs = {'input_ids': input_ids}
1154
+
1155
+ model_inputs.update(
1156
+ {
1157
+ 'position_ids': position_ids,
1158
+ 'past_key_values': past_key_values,
1159
+ 'use_cache': kwargs.get('use_cache'),
1160
+ 'attention_mask': attention_mask,
1161
+ }
1162
+ )
1163
+ return model_inputs
1164
+
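A worked example of the on-the-fly `position_ids` construction above, for a left-padded batch (illustrative values only):

```python
import torch

attention_mask = torch.tensor([[0, 0, 1, 1, 1],
                               [1, 1, 1, 1, 1]])
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
print(position_ids)
# tensor([[1, 1, 0, 1, 2],
#         [0, 1, 2, 3, 4]])
```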
1165
+ @staticmethod
1166
+ def _reorder_cache(past_key_values, beam_idx):
1167
+ reordered_past = ()
1168
+ for layer_past in past_key_values:
1169
+ reordered_past += (
1170
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
1171
+ )
1172
+ return reordered_past
1173
+
1174
+ def build_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = [], meta_instruction=''):
1175
+ if tokenizer.add_bos_token:
1176
+ prompt = ''
1177
+ else:
1178
+ prompt = tokenizer.bos_token
1179
+ if meta_instruction:
1180
+ prompt += f"""<|im_start|>system\n{meta_instruction}<|im_end|>\n"""
1181
+ for record in history:
1182
+ prompt += f"""<|im_start|>user\n{record[0]}<|im_end|>\n<|im_start|>assistant\n{record[1]}<|im_end|>\n"""
1183
+ prompt += f"""<|im_start|>user\n{query}<|im_end|>\n<|im_start|>assistant\n"""
1184
+ return tokenizer([prompt], return_tensors='pt')
1185
+
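For a single-turn query, the string assembled by `build_inputs` follows a ChatML-style layout roughly like the one below (a BOS token may be prepended, depending on `tokenizer.add_bos_token`):

```
<|im_start|>system
{meta_instruction}<|im_end|>
<|im_start|>user
{query}<|im_end|>
<|im_start|>assistant
```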
1186
+ @torch.no_grad()
1187
+ def chat(
1188
+ self,
1189
+ tokenizer,
1190
+ query: str,
1191
+ history: List[Tuple[str, str]] = [],
1192
+ streamer: Optional[BaseStreamer] = None,
1193
+ max_new_tokens: int = 1024,
1194
+ do_sample: bool = True,
1195
+ temperature: float = 0.8,
1196
+ top_p: float = 0.8,
1197
+ meta_instruction: str = 'You are an AI assistant whose name is InternLM (书生·浦语).\n'
1198
+ '- InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.\n'
1199
+ '- InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文.',
1200
+ **kwargs,
1201
+ ):
1202
+ inputs = self.build_inputs(tokenizer, query, history, meta_instruction)
1203
+ inputs = {k: v.to(self.device) for k, v in inputs.items() if torch.is_tensor(v)}
1204
+ # also add the end-of-assistant token to the eos token ids to avoid unnecessary generation
1205
+ eos_token_id = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids(['<|im_end|>'])[0]]
1206
+ outputs = self.generate(
1207
+ **inputs,
1208
+ streamer=streamer,
1209
+ max_new_tokens=max_new_tokens,
1210
+ do_sample=do_sample,
1211
+ temperature=temperature,
1212
+ top_p=top_p,
1213
+ eos_token_id=eos_token_id,
1214
+ **kwargs,
1215
+ )
1216
+ outputs = outputs[0].cpu().tolist()[len(inputs['input_ids'][0]):]
1217
+ response = tokenizer.decode(outputs, skip_special_tokens=True)
1218
+ response = response.split('<|im_end|>')[0]
1219
+ history = history + [(query, response)]
1220
+ return response, history
1221
+
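A hedged usage sketch for `chat()`; the checkpoint path is a placeholder, `trust_remote_code=True` is assumed so this modeling file is loaded, and a CUDA device is assumed to be available.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

path = 'internlm/internlm2-chat-7b'  # placeholder checkpoint that ships this remote code
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    path, torch_dtype=torch.bfloat16, trust_remote_code=True
).cuda().eval()  # assumes a CUDA device is available

response, history = model.chat(tokenizer, 'Hello! Who are you?')
print(response)
response, history = model.chat(tokenizer, 'Can you say that more briefly?', history=history)
```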
1222
+ @torch.no_grad()
1223
+ def stream_chat(
1224
+ self,
1225
+ tokenizer,
1226
+ query: str,
1227
+ history: List[Tuple[str, str]] = [],
1228
+ max_new_tokens: int = 1024,
1229
+ do_sample: bool = True,
1230
+ temperature: float = 0.8,
1231
+ top_p: float = 0.8,
1232
+ **kwargs,
1233
+ ):
1234
+ """
1235
+ Return a generator that yields tuples in the format (response, history).
1236
+ E.g.
1237
+ ('你好,有什么可以帮助您的吗', [('你好', '你好,有什么可以帮助您的吗')])
1238
+ ('你好,有什么可以帮助您的吗?', [('你好', '你好,有什么可以帮助您的吗?')])
1239
+ """
1240
+ if BaseStreamer is None:
1241
+ raise ModuleNotFoundError(
1242
+ 'The version of `transformers` is too low. Please make sure '
1243
+ 'that you have installed `transformers>=4.28.0`.'
1244
+ )
1245
+
1246
+ response_queue = queue.Queue(maxsize=20)
1247
+
1248
+ class ChatStreamer(BaseStreamer):
1249
+ def __init__(self, tokenizer) -> None:
1250
+ super().__init__()
1251
+ self.tokenizer = tokenizer
1252
+ self.queue = response_queue
1253
+ self.query = query
1254
+ self.history = history
1255
+ self.response = ''
1256
+ self.cache = []
1257
+ self.received_inputs = False
1258
+ self.queue.put((self.response, history + [(self.query, self.response)]))
1259
+
1260
+ def put(self, value):
1261
+ if len(value.shape) > 1 and value.shape[0] > 1:
1262
+ raise ValueError('ChatStreamer only supports batch size 1')
1263
+ elif len(value.shape) > 1:
1264
+ value = value[0]
1265
+
1266
+ if not self.received_inputs:
1267
+ # The first received value is input_ids, ignore here
1268
+ self.received_inputs = True
1269
+ return
1270
+
1271
+ self.cache.extend(value.tolist())
1272
+ token = self.tokenizer.decode(self.cache, skip_special_tokens=True)
1273
+ if token.strip() != '<|im_end|>':
1274
+ self.response = self.response + token
1275
+ history = self.history + [(self.query, self.response)]
1276
+ self.queue.put((self.response, history))
1277
+ self.cache = []
1278
+ else:
1279
+ self.end()
1280
+
1281
+ def end(self):
1282
+ self.queue.put(None)
1283
+
1284
+ def stream_producer():
1285
+ return self.chat(
1286
+ tokenizer=tokenizer,
1287
+ query=query,
1288
+ streamer=ChatStreamer(tokenizer=tokenizer),
1289
+ history=history,
1290
+ max_new_tokens=max_new_tokens,
1291
+ do_sample=do_sample,
1292
+ temperature=temperature,
1293
+ top_p=top_p,
1294
+ **kwargs,
1295
+ )
1296
+
1297
+ def consumer():
1298
+ producer = threading.Thread(target=stream_producer)
1299
+ producer.start()
1300
+ while True:
1301
+ res = response_queue.get()
1302
+ if res is None:
1303
+ return
1304
+ yield res
1305
+
1306
+ return consumer()
1307
+
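A short usage sketch for `stream_chat()`: it returns a generator of cumulative `(response, history)` tuples, assuming `model` and `tokenizer` were loaded as in the `chat()` example above.

```python
for response, history in model.stream_chat(tokenizer, 'Hello!'):
    print(response, end='\r')   # each yield contains the response generated so far
print()
```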
1308
+
1309
+ # Copied from transformers.model.llama.modeling_llama.LlamaForSequenceClassification with Llama->InternLM2
1310
+ @add_start_docstrings(
1311
+ """
1312
+ The InternLM2 Model transformer with a sequence classification head on top (linear layer).
1313
+
1314
+ [`InternLM2ForSequenceClassification`] uses the last token in order to do the classification,
1315
+ as other causal models (e.g. GPT-2) do.
1316
+
1317
+ Since it does classification on the last token, it needs to know the position of the last token. If a
1318
+ `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
1319
+ no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
1320
+ padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
1321
+ each row of the batch).
1322
+ """,
1323
+ InternLM2_START_DOCSTRING,
1324
+ )
1325
+ class InternLM2ForSequenceClassification(InternLM2PreTrainedModel):
1326
+ def __init__(self, config):
1327
+ super().__init__(config)
1328
+ self.num_labels = config.num_labels
1329
+ self.model = InternLM2Model(config)
1330
+ self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
1331
+
1332
+ # Initialize weights and apply final processing
1333
+ self.post_init()
1334
+
1335
+ def get_input_embeddings(self):
1336
+ return self.model.tok_embeddings
1337
+
1338
+ def set_input_embeddings(self, value):
1339
+ self.model.tok_embeddings = value
1340
+
1341
+ @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING)
1342
+ def forward(
1343
+ self,
1344
+ input_ids: torch.LongTensor = None,
1345
+ attention_mask: Optional[torch.Tensor] = None,
1346
+ position_ids: Optional[torch.LongTensor] = None,
1347
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1348
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1349
+ labels: Optional[torch.LongTensor] = None,
1350
+ use_cache: Optional[bool] = None,
1351
+ output_attentions: Optional[bool] = None,
1352
+ output_hidden_states: Optional[bool] = None,
1353
+ return_dict: Optional[bool] = None,
1354
+ ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
1355
+ r"""
1356
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1357
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1358
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
1359
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1360
+ """
1361
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1362
+
1363
+ transformer_outputs = self.model(
1364
+ input_ids,
1365
+ attention_mask=attention_mask,
1366
+ position_ids=position_ids,
1367
+ past_key_values=past_key_values,
1368
+ inputs_embeds=inputs_embeds,
1369
+ use_cache=use_cache,
1370
+ output_attentions=output_attentions,
1371
+ output_hidden_states=output_hidden_states,
1372
+ return_dict=return_dict,
1373
+ )
1374
+ hidden_states = transformer_outputs[0]
1375
+ logits = self.score(hidden_states)
1376
+
1377
+ if input_ids is not None:
1378
+ batch_size = input_ids.shape[0]
1379
+ else:
1380
+ batch_size = inputs_embeds.shape[0]
1381
+
1382
+ if self.config.pad_token_id is None and batch_size != 1:
1383
+ raise ValueError('Cannot handle batch sizes > 1 if no padding token is defined.')
1384
+ if self.config.pad_token_id is None:
1385
+ sequence_lengths = -1
1386
+ else:
1387
+ if input_ids is not None:
1388
+ sequence_lengths = (torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1).to(
1389
+ logits.device
1390
+ )
1391
+ else:
1392
+ sequence_lengths = -1
1393
+
1394
+ pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
1395
+
1396
+ loss = None
1397
+ if labels is not None:
1398
+ labels = labels.to(logits.device)
1399
+ if self.config.problem_type is None:
1400
+ if self.num_labels == 1:
1401
+ self.config.problem_type = 'regression'
1402
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
1403
+ self.config.problem_type = 'single_label_classification'
1404
+ else:
1405
+ self.config.problem_type = 'multi_label_classification'
1406
+
1407
+ if self.config.problem_type == 'regression':
1408
+ loss_fct = MSELoss()
1409
+ if self.num_labels == 1:
1410
+ loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
1411
+ else:
1412
+ loss = loss_fct(pooled_logits, labels)
1413
+ elif self.config.problem_type == 'single_label_classification':
1414
+ loss_fct = CrossEntropyLoss()
1415
+ loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
1416
+ elif self.config.problem_type == 'multi_label_classification':
1417
+ loss_fct = BCEWithLogitsLoss()
1418
+ loss = loss_fct(pooled_logits, labels)
1419
+ if not return_dict:
1420
+ output = (pooled_logits,) + transformer_outputs[1:]
1421
+ return ((loss,) + output) if loss is not None else output
1422
+
1423
+ return SequenceClassifierOutputWithPast(
1424
+ loss=loss,
1425
+ logits=pooled_logits,
1426
+ past_key_values=transformer_outputs.past_key_values,
1427
+ hidden_states=transformer_outputs.hidden_states,
1428
+ attentions=transformer_outputs.attentions,
1429
+ )
modeling_phi3.py ADDED
@@ -0,0 +1,1610 @@
 
1
+ # Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """ PyTorch Phi-3 model."""
16
+
17
+ import inspect
18
+ import math
19
+ import warnings
20
+ from typing import List, Optional, Tuple, Union
21
+
22
+ import torch
23
+ import torch.nn.functional as F
24
+ import torch.utils.checkpoint
25
+ from torch import nn
26
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
27
+ from transformers.activations import ACT2FN
28
+ from transformers.cache_utils import Cache, DynamicCache
29
+ from transformers.modeling_attn_mask_utils import \
30
+ _prepare_4d_causal_attention_mask
31
+ from transformers.modeling_outputs import (BaseModelOutputWithPast,
32
+ CausalLMOutputWithPast,
33
+ SequenceClassifierOutputWithPast,
34
+ TokenClassifierOutput)
35
+ from transformers.modeling_utils import PreTrainedModel
36
+ from transformers.utils import (add_code_sample_docstrings,
37
+ add_start_docstrings,
38
+ add_start_docstrings_to_model_forward,
39
+ is_flash_attn_2_available,
40
+ is_flash_attn_greater_or_equal_2_10, logging,
41
+ replace_return_docstrings)
42
+
43
+ from .configuration_phi3 import Phi3Config
44
+
45
+ logger = logging.get_logger(__name__)
46
+
47
+ # Transformers scans dependencies in the modeling file, which causes issues with conditional loading. The scan only ignores try/except blocks, not if statements
48
+ # if is_flash_attn_2_available():
49
+ _flash_supports_window_size = False
50
+ try:
51
+ from flash_attn import flash_attn_func, flash_attn_varlen_func
52
+ from flash_attn.bert_padding import (index_first_axis, pad_input, # noqa
53
+ unpad_input)
54
+
55
+ _flash_supports_window_size = 'window_size' in list(inspect.signature(flash_attn_func).parameters)
56
+ has_flash_attn = True
57
+ except ImportError as error:
58
+ logger.warning(
59
+ f'`flash-attention` package not found, consider installing for better performance: {error}.'
60
+ )
61
+ if not _flash_supports_window_size:
62
+ logger.warning(
63
+ "Current `flash-attenton` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`."
64
+ )
65
+ has_flash_attn = False
66
+
67
+ _CHECKPOINT_FOR_DOC = 'microsoft/Phi-3-mini-4k-instruct'
68
+ _CONFIG_FOR_DOC = 'Phi3Config'
69
+
70
+ PHI3_PRETRAINED_MODEL_ARCHIVE_LIST = [
71
+ 'microsoft/Phi-3-mini-4k-instruct',
72
+ 'microsoft/Phi-3-mini-128k-instruct',
73
+ # See all Phi-3 models at https://huggingface.co/models?filter=Phi-3
74
+ ]
75
+
76
+
77
+ # Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Phi3
78
+ class Phi3RMSNorm(nn.Module):
79
+ def __init__(self, hidden_size, eps=1e-6):
80
+ """
81
+ Phi3RMSNorm is equivalent to T5LayerNorm
82
+ """
83
+ super().__init__()
84
+ self.weight = nn.Parameter(torch.ones(hidden_size))
85
+ self.variance_epsilon = eps
86
+
87
+ def forward(self, hidden_states):
88
+ input_dtype = hidden_states.dtype
89
+ hidden_states = hidden_states.to(torch.float32)
90
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
91
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
92
+ return self.weight * hidden_states.to(input_dtype)
93
+
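A minimal reference check for the normalization above, y = weight * x / sqrt(mean(x^2) + eps), computed in float32 and cast back to the input dtype (standalone sketch, not importing the class):

```python
import torch

def rms_norm_ref(x, weight, eps=1e-6):
    x32 = x.float()
    y = x32 * torch.rsqrt(x32.pow(2).mean(-1, keepdim=True) + eps)
    return weight * y.to(x.dtype)

x = torch.randn(2, 4, 8, dtype=torch.bfloat16)
# with weight initialized to ones, this matches Phi3RMSNorm(8)(x)
print(rms_norm_ref(x, torch.ones(8, dtype=torch.bfloat16)).shape)  # torch.Size([2, 4, 8])
```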
94
+
95
+ # Copied from transformers.models.llama.modeling_llama._get_unpad_data
96
+ def _get_unpad_data(attention_mask):
97
+ seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
98
+ indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
99
+ max_seqlen_in_batch = seqlens_in_batch.max().item()
100
+ cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
101
+ return (
102
+ indices,
103
+ cu_seqlens,
104
+ max_seqlen_in_batch,
105
+ )
106
+
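A worked example of `_get_unpad_data` on a small padded batch: `indices` locate the non-padding tokens in the flattened batch, and `cu_seqlens` are the cumulative sequence lengths that flash-attn's varlen kernels expect.

```python
import torch
import torch.nn.functional as F

attention_mask = torch.tensor([[1, 1, 1, 0],
                               [1, 1, 0, 0]])
seqlens = attention_mask.sum(dim=-1, dtype=torch.int32)             # tensor([3, 2])
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
cu_seqlens = F.pad(torch.cumsum(seqlens, 0, dtype=torch.int32), (1, 0))
print(indices.tolist())     # [0, 1, 2, 4, 5]
print(cu_seqlens.tolist())  # [0, 3, 5]
```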
107
+
108
+ # Copied from transformers.models.gemma.modeling_gemma.GemmaRotaryEmbedding with gemma->phi3, Gemma->Phi3
109
+ class Phi3RotaryEmbedding(nn.Module):
110
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
111
+ super().__init__()
112
+
113
+ self.dim = dim
114
+ self.max_position_embeddings = max_position_embeddings
115
+ self.base = base
116
+ self.register_buffer('inv_freq', None, persistent=False)
117
+
118
+ @torch.no_grad()
119
+ def forward(self, x, position_ids, seq_len=None):
120
+ # x: [bs, num_attention_heads, seq_len, head_size]
121
+ if self.inv_freq is None:
122
+ self.inv_freq = 1.0 / (
123
+ self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64, device=x.device).float() / self.dim)
124
+ )
125
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
126
+ position_ids_expanded = position_ids[:, None, :].float()
127
+ # Force float32 since bfloat16 loses precision on long contexts
128
+ # See https://github.com/huggingface/transformers/pull/29285
129
+ device_type = x.device.type
130
+ device_type = device_type if isinstance(device_type, str) and device_type != 'mps' else 'cpu'
131
+ with torch.autocast(device_type=device_type, enabled=False):
132
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
133
+ emb = torch.cat((freqs, freqs), dim=-1)
134
+ cos = emb.cos()
135
+ sin = emb.sin()
136
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
137
+
138
+
139
+ class Phi3SuScaledRotaryEmbedding(Phi3RotaryEmbedding):
140
+ def __init__(self, dim, config, device=None):
141
+ super().__init__(dim, config.max_position_embeddings, config.rope_theta, device)
142
+
143
+ self.short_factor = config.rope_scaling['short_factor']
144
+ self.long_factor = config.rope_scaling['long_factor']
145
+ self.original_max_position_embeddings = config.original_max_position_embeddings
146
+
147
+ @torch.no_grad()
148
+ def forward(self, x, position_ids, seq_len=None):
149
+ seq_len = torch.max(position_ids) + 1
150
+ if seq_len > self.original_max_position_embeddings:
151
+ ext_factors = torch.tensor(self.long_factor, dtype=torch.float32, device=x.device)
152
+ else:
153
+ ext_factors = torch.tensor(self.short_factor, dtype=torch.float32, device=x.device)
154
+
155
+ inv_freq_shape = torch.arange(0, self.dim, 2, dtype=torch.int64, device=x.device).float() / self.dim
156
+ self.inv_freq = 1.0 / (ext_factors * self.base**inv_freq_shape)
157
+
158
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
159
+ position_ids_expanded = position_ids[:, None, :].float()
160
+
161
+ # Force float32 since bfloat16 loses precision on long contexts
162
+ # See https://github.com/huggingface/transformers/pull/29285
163
+ device_type = x.device.type
164
+ device_type = device_type if isinstance(device_type, str) and device_type != 'mps' else 'cpu'
165
+ with torch.autocast(device_type=device_type, enabled=False):
166
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
167
+ emb = torch.cat((freqs, freqs), dim=-1)
168
+
169
+ scale = self.max_position_embeddings / self.original_max_position_embeddings
170
+ if scale <= 1.0:
171
+ scaling_factor = 1.0
172
+ else:
173
+ scaling_factor = math.sqrt(1 + math.log(scale) / math.log(self.original_max_position_embeddings))
174
+
175
+ cos = emb.cos() * scaling_factor
176
+ sin = emb.sin() * scaling_factor
177
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
178
+
179
+
180
+ class Phi3YarnScaledRotaryEmbedding(Phi3RotaryEmbedding):
181
+ def __init__(self, dim, config, device=None):
182
+ super().__init__(dim, config.max_position_embeddings, config.rope_theta, device)
183
+
184
+ self.short_factor = config.rope_scaling['short_factor']
185
+ self.long_factor = config.rope_scaling['long_factor']
186
+ self.original_max_position_embeddings = config.original_max_position_embeddings
187
+
188
+ @torch.no_grad()
189
+ def forward(self, x, position_ids, seq_len=None):
190
+ seq_len = torch.max(position_ids) + 1
191
+ if seq_len > self.original_max_position_embeddings:
192
+ ext_factors = torch.tensor(self.long_factor, dtype=torch.float32, device=x.device)
193
+ else:
194
+ ext_factors = torch.tensor(self.short_factor, dtype=torch.float32, device=x.device)
195
+
196
+ inv_freq_shape = torch.arange(0, self.dim, 2, dtype=torch.int64, device=x.device).float() / self.dim
197
+ self.inv_freq = 1.0 / (ext_factors * self.base**inv_freq_shape)
198
+
199
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
200
+ position_ids_expanded = position_ids[:, None, :].float()
201
+
202
+ # Force float32 since bfloat16 loses precision on long contexts
203
+ # See https://github.com/huggingface/transformers/pull/29285
204
+ device_type = x.device.type
205
+ device_type = device_type if isinstance(device_type, str) and device_type != 'mps' else 'cpu'
206
+ with torch.autocast(device_type=device_type, enabled=False):
207
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
208
+ emb = torch.cat((freqs, freqs), dim=-1)
209
+
210
+ scale = self.max_position_embeddings / self.original_max_position_embeddings
211
+ if scale <= 1.0:
212
+ scaling_factor = 1.0
213
+ else:
214
+ scaling_factor = 0.1 * math.log(scale) + 1.0
215
+
216
+ cos = emb.cos() * scaling_factor
217
+ sin = emb.sin() * scaling_factor
218
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
219
+
220
+
221
+ # Copied from transformers.models.llama.modeling_llama.rotate_half
222
+ def rotate_half(x):
223
+ """Rotates half the hidden dims of the input."""
224
+ x1 = x[..., : x.shape[-1] // 2]
225
+ x2 = x[..., x.shape[-1] // 2 :]
226
+ return torch.cat((-x2, x1), dim=-1)
227
+
228
+
229
+ # Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
230
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
231
+ """Applies Rotary Position Embedding to the query and key tensors.
232
+
233
+ Args:
234
+ q (`torch.Tensor`): The query tensor.
235
+ k (`torch.Tensor`): The key tensor.
236
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
237
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
238
+ position_ids (`torch.Tensor`, *optional*):
239
+ Deprecated and unused.
240
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
241
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
242
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
243
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
244
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
245
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
246
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
247
+ Returns:
248
+ `tuple(torch.Tensor)` comprising the query and key tensors rotated using the Rotary Position Embedding.
249
+ """
250
+ cos = cos.unsqueeze(unsqueeze_dim)
251
+ sin = sin.unsqueeze(unsqueeze_dim)
252
+ q_embed = (q * cos) + (rotate_half(q) * sin)
253
+ k_embed = (k * cos) + (rotate_half(k) * sin)
254
+ return q_embed, k_embed
255
+
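A small shape sketch for `apply_rotary_pos_emb` as defined above, using random cos/sin tensors purely to illustrate the broadcasting (real values come from the rotary embedding modules):

```python
import torch

bsz, heads, seq_len, head_dim = 1, 4, 6, 16
q = torch.randn(bsz, heads, seq_len, head_dim)
k = torch.randn(bsz, heads, seq_len, head_dim)
cos = torch.randn(bsz, seq_len, head_dim)   # [batch, seq_len, head_dim]
sin = torch.randn(bsz, seq_len, head_dim)

q_rot, k_rot = apply_rotary_pos_emb(q, k, cos, sin)  # unsqueeze_dim=1 broadcasts over heads
print(q_rot.shape, k_rot.shape)  # torch.Size([1, 4, 6, 16]) torch.Size([1, 4, 6, 16])
```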
256
+
257
+ class Phi3MLP(nn.Module):
258
+ def __init__(self, config):
259
+ super().__init__()
260
+
261
+ self.config = config
262
+ self.gate_up_proj = nn.Linear(config.hidden_size, 2 * config.intermediate_size, bias=False)
263
+ self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)
264
+
265
+ self.activation_fn = ACT2FN[config.hidden_act]
266
+
267
+ def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
268
+ up_states = self.gate_up_proj(hidden_states)
269
+
270
+ gate, up_states = up_states.chunk(2, dim=-1)
271
+ up_states = up_states * self.activation_fn(gate)
272
+
273
+ return self.down_proj(up_states)
274
+
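A standalone sketch of the fused gate/up projection used above: a single Linear produces both halves, which are split with `chunk(2, dim=-1)` and recombined as `up * act(gate)` before the down projection (toy sizes, not the Phi-3 config).

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

hidden_size, intermediate_size = 8, 32
gate_up_proj = nn.Linear(hidden_size, 2 * intermediate_size, bias=False)
down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)

x = torch.randn(2, 5, hidden_size)
gate, up = gate_up_proj(x).chunk(2, dim=-1)
print(down_proj(up * F.silu(gate)).shape)  # torch.Size([2, 5, 8])
```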
275
+
276
+ # Copied from transformers.models.llama.modeling_llama.repeat_kv with llama->phi
277
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
278
+ """
279
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
280
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
281
+ """
282
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
283
+ if n_rep == 1:
284
+ return hidden_states
285
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
286
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
287
+
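A quick check that `repeat_kv` matches `torch.repeat_interleave` along the head dimension, as the docstring states:

```python
import torch

kv = torch.randn(2, 8, 16, 32)   # [batch, num_key_value_heads, seq_len, head_dim]
assert torch.equal(repeat_kv(kv, 4), torch.repeat_interleave(kv, repeats=4, dim=1))
```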
288
+
289
+ class Phi3Attention(nn.Module):
290
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
291
+
292
+ def __init__(self, config: Phi3Config, layer_idx: Optional[int] = None):
293
+ super().__init__()
294
+ self.config = config
295
+ self.layer_idx = layer_idx
296
+ if layer_idx is None:
297
+ logger.warning_once(
298
+ f'Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will '
299
+ 'lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` '
300
+ 'when creating this class.'
301
+ )
302
+
303
+ self.attention_dropout = config.attention_dropout
304
+ self.hidden_size = config.hidden_size
305
+ self.num_heads = config.num_attention_heads
306
+ self.head_dim = self.hidden_size // self.num_heads
307
+ self.num_key_value_heads = config.num_key_value_heads
308
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
309
+ self.max_position_embeddings = config.max_position_embeddings
310
+ self.original_max_position_embeddings = config.original_max_position_embeddings
311
+ self.rope_theta = config.rope_theta
312
+ self.rope_scaling = config.rope_scaling
313
+ self.is_causal = True
314
+
315
+ if (self.head_dim * self.num_heads) != self.hidden_size:
316
+ raise ValueError(
317
+ f'hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}'
318
+ f' and `num_heads`: {self.num_heads}).'
319
+ )
320
+
321
+ op_size = self.num_heads * self.head_dim + 2 * (self.num_key_value_heads * self.head_dim)
322
+ self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
323
+ self.qkv_proj = nn.Linear(self.hidden_size, op_size, bias=False)
324
+ self._init_rope()
325
+
326
+ def _init_rope(self):
327
+ if self.rope_scaling is None:
328
+ self.rotary_emb = Phi3RotaryEmbedding(
329
+ self.head_dim,
330
+ max_position_embeddings=self.max_position_embeddings,
331
+ base=self.rope_theta,
332
+ )
333
+ else:
334
+ scaling_type = self.config.rope_scaling['type']
335
+ if scaling_type == 'su':
336
+ self.rotary_emb = Phi3SuScaledRotaryEmbedding(self.head_dim, self.config)
337
+ elif scaling_type == 'yarn':
338
+ self.rotary_emb = Phi3YarnScaledRotaryEmbedding(self.head_dim, self.config)
339
+ else:
340
+ raise ValueError(f'Unknown RoPE scaling type {scaling_type}')
341
+
342
+ def forward(
343
+ self,
344
+ hidden_states: torch.Tensor,
345
+ attention_mask: Optional[torch.Tensor] = None,
346
+ position_ids: Optional[torch.LongTensor] = None,
347
+ past_key_value: Optional[Cache] = None,
348
+ output_attentions: bool = False,
349
+ use_cache: bool = False,
350
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
351
+ logger.warning_once('You are not running the flash-attention implementation, expect numerical differences.')
352
+
353
+ bsz, q_len, _ = hidden_states.size()
354
+
355
+ qkv = self.qkv_proj(hidden_states)
356
+ query_pos = self.num_heads * self.head_dim
357
+ query_states = qkv[..., :query_pos]
358
+ key_states = qkv[..., query_pos : query_pos + self.num_key_value_heads * self.head_dim]
359
+ value_states = qkv[..., query_pos + self.num_key_value_heads * self.head_dim :]
360
+
361
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
362
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
363
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
364
+
365
+ kv_seq_len = key_states.shape[-2]
366
+ if past_key_value is not None:
367
+ if self.layer_idx is None:
368
+ raise ValueError(
369
+ f'The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} '
370
+ 'for auto-regressive decoding with k/v caching, please make sure to initialize the attention class '
371
+ 'with a layer index.'
372
+ )
373
+ kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
374
+ cos, sin = self.rotary_emb(value_states, position_ids, seq_len=kv_seq_len)
375
+
376
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
377
+
378
+ if past_key_value is not None:
379
+ cache_kwargs = {'sin': sin, 'cos': cos} # Specific to RoPE models
380
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
381
+
382
+ # repeat k/v heads if n_kv_heads < n_heads
383
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
384
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
385
+
386
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
387
+
388
+ if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
389
+ raise ValueError(
390
+ f'Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is'
391
+ f' {attn_weights.size()}'
392
+ )
393
+
394
+ if attention_mask is not None:
395
+ if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
396
+ raise ValueError(
397
+ f'Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}'
398
+ )
399
+ attn_weights = attn_weights + attention_mask
400
+
401
+ # upcast attention to fp32
402
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(value_states.dtype)
403
+ attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
404
+
405
+ attn_output = torch.matmul(attn_weights, value_states)
406
+
407
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
408
+ raise ValueError(
409
+ f'`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is'
410
+ f' {attn_output.size()}'
411
+ )
412
+
413
+ attn_output = attn_output.transpose(1, 2).contiguous()
414
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
415
+
416
+ attn_output = self.o_proj(attn_output)
417
+
418
+ if not output_attentions:
419
+ attn_weights = None
420
+
421
+ return attn_output, attn_weights, past_key_value
422
+
423
+
424
+ class Phi3FlashAttention2(Phi3Attention):
425
+ """
426
+ Phi-3 flash attention module. This module inherits from `Phi3Attention` as the weights of the module stay
427
+ untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
428
+ flash attention and deal with padding tokens in case the input contains any of them.
429
+ """
430
+
431
+ # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
432
+ def __init__(self, *args, **kwargs):
433
+ super().__init__(*args, **kwargs)
434
+
435
+ # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
436
+ # flash_attn<2.1 generates a top-left aligned causal mask, while what is needed here is bottom-right alignment, which was made the default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
437
+ # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
438
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
439
+
440
+ def forward(
441
+ self,
442
+ hidden_states: torch.Tensor,
443
+ attention_mask: Optional[torch.LongTensor] = None,
444
+ position_ids: Optional[torch.LongTensor] = None,
445
+ past_key_value: Optional[Cache] = None,
446
+ output_attentions: bool = False,
447
+ use_cache: bool = False,
448
+ **kwargs,
449
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
450
+ # Phi3FlashAttention2 attention does not support output_attentions
451
+
452
+ if not _flash_supports_window_size:
453
+ logger.warning_once(
454
+ "The current flash attention version does not support sliding window attention. Please use `attn_implementation='eager'` or upgrade flash-attn library."
455
+ )
456
+ raise ValueError('The current flash attention version does not support sliding window attention.')
457
+
458
+ output_attentions = False
459
+
460
+ if 'padding_mask' in kwargs:
461
+ warnings.warn(
462
+ 'Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure to use `attention_mask` instead.'
463
+ )
464
+
465
+ # overwrite attention_mask with padding_mask
466
+ attention_mask = kwargs.pop('padding_mask')
467
+
468
+ bsz, q_len, _ = hidden_states.size()
469
+
470
+ qkv = self.qkv_proj(hidden_states)
471
+ query_pos = self.num_heads * self.head_dim
472
+ query_states = qkv[..., :query_pos]
473
+ key_states = qkv[..., query_pos : query_pos + self.num_key_value_heads * self.head_dim]
474
+ value_states = qkv[..., query_pos + self.num_key_value_heads * self.head_dim :]
475
+
476
+ # Flash attention requires the input to have the shape
477
+ # batch_size x seq_length x num_heads x head_dim
478
+ # therefore we just need to keep the original shape
479
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
480
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
481
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
482
+
483
+ kv_seq_len = key_states.shape[-2]
484
+ if past_key_value is not None:
485
+ if self.layer_idx is None:
486
+ raise ValueError(
487
+ f'The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} '
488
+ 'for auto-regressive decoding with k/v caching, please make sure to initialize the attention class '
489
+ 'with a layer index.'
490
+ )
491
+ kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
492
+
493
+ # Because the input can be padded, the absolute sequence length depends on the max position id.
494
+ rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1
495
+ cos, sin = self.rotary_emb(value_states, position_ids, seq_len=rotary_seq_len)
496
+
497
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
498
+
499
+ use_sliding_windows = (
500
+ _flash_supports_window_size
501
+ and getattr(self.config, 'sliding_window', None) is not None
502
+ and kv_seq_len > self.config.sliding_window
503
+ )
504
+
505
+ if past_key_value is not None:
506
+ # Activate cache slicing only if the config has a `sliding_window` attribute
507
+ cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0
508
+ if (
509
+ getattr(self.config, 'sliding_window', None) is not None
510
+ and kv_seq_len > self.config.sliding_window
511
+ and cache_has_contents
512
+ ):
513
+ slicing_tokens = 1 - self.config.sliding_window
514
+
515
+ past_key = past_key_value[self.layer_idx][0]
516
+ past_value = past_key_value[self.layer_idx][1]
517
+
518
+ past_key = past_key[:, :, slicing_tokens:, :].contiguous()
519
+ past_value = past_value[:, :, slicing_tokens:, :].contiguous()
520
+
521
+ if past_key.shape[-2] != self.config.sliding_window - 1:
522
+ raise ValueError(
523
+ f'past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got'
524
+ f' {past_key.shape}'
525
+ )
526
+
527
+ if attention_mask is not None:
528
+ attention_mask = attention_mask[:, slicing_tokens:]
529
+ attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1)
530
+
531
+ cache_kwargs = {'sin': sin, 'cos': cos} # Specific to RoPE models
532
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
533
+
534
+ # repeat k/v heads if n_kv_heads < n_heads
535
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
536
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
537
+
538
+ attn_dropout = self.attention_dropout if self.training else 0.0
539
+
540
+ # In PEFT, usually we cast the layer norms in float32 for training stability reasons
541
+ # therefore the input hidden states get silently cast to float32. Hence, we need to
542
+ # cast them back to the correct dtype just to be sure everything works as expected.
543
+ # This might slow down training & inference, so it is recommended not to cast the LayerNorms
544
+ # in fp32.
545
+
546
+ if query_states.dtype == torch.float32:
547
+ if torch.is_autocast_enabled():
548
+ target_dtype = torch.get_autocast_gpu_dtype()
549
+ # Handle the case where the model is quantized
550
+ elif hasattr(self.config, '_pre_quantization_dtype'):
551
+ target_dtype = self.config._pre_quantization_dtype
552
+ else:
553
+ target_dtype = self.qkv_proj.weight.dtype
554
+
555
+ logger.warning_once(
556
+ f'The input hidden states seem to have been silently cast to float32; this might be related to'
557
+ f' the fact that you have upcast embedding or layer norm layers to float32. We will cast the input back to'
558
+ f' {target_dtype}.'
559
+ )
560
+
561
+ query_states = query_states.to(target_dtype)
562
+ key_states = key_states.to(target_dtype)
563
+ value_states = value_states.to(target_dtype)
564
+
565
+ # Reshape to the expected shape for Flash Attention
566
+ query_states = query_states.transpose(1, 2)
567
+ key_states = key_states.transpose(1, 2)
568
+ value_states = value_states.transpose(1, 2)
569
+
570
+ attn_output = self._flash_attention_forward(
571
+ query_states,
572
+ key_states,
573
+ value_states,
574
+ attention_mask,
575
+ q_len,
576
+ dropout=attn_dropout,
577
+ use_sliding_windows=use_sliding_windows,
578
+ )
579
+
580
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
581
+ attn_output = self.o_proj(attn_output)
582
+
583
+ if not output_attentions:
584
+ attn_weights = None
585
+
586
+ return attn_output, attn_weights, past_key_value
587
+
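For readers unfamiliar with the rotary-embedding step applied in the forward pass above, the following is a minimal stand-in sketch of the rotate-half RoPE form that `apply_rotary_pos_emb` implements; the helper name, shapes, and cos/sin layout here are illustrative assumptions, not this file's own definitions.

import torch

def rotate_half(x):
    # split the head dimension in two and rotate: (x1, x2) -> (-x2, x1)
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

def apply_rope_sketch(q, k, cos, sin):
    # cos/sin: (batch, seq_len, head_dim), broadcast over the head axis
    cos, sin = cos.unsqueeze(1), sin.unsqueeze(1)
    return q * cos + rotate_half(q) * sin, k * cos + rotate_half(k) * sin

q = torch.randn(1, 4, 6, 8)   # (batch, num_heads, seq_len, head_dim)
k = torch.randn(1, 4, 6, 8)
cos, sin = torch.randn(1, 6, 8), torch.randn(1, 6, 8)
q_rot, k_rot = apply_rope_sketch(q, k, cos, sin)
print(q_rot.shape, k_rot.shape)  # torch.Size([1, 4, 6, 8]) torch.Size([1, 4, 6, 8])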
588
+ # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._flash_attention_forward
589
+ def _flash_attention_forward(
590
+ self,
591
+ query_states,
592
+ key_states,
593
+ value_states,
594
+ attention_mask,
595
+ query_length,
596
+ dropout=0.0,
597
+ softmax_scale=None,
598
+ use_sliding_windows=False,
599
+ ):
600
+ """
601
+ Calls the forward method of Flash Attention. If the input hidden states contain at least one padding token,
602
+ the input is first unpadded, the attention scores are computed, and the output is then padded back.
603
+
604
+ Args:
605
+ query_states (`torch.Tensor`):
606
+ Input query states to be passed to Flash Attention API
607
+ key_states (`torch.Tensor`):
608
+ Input key states to be passed to Flash Attention API
609
+ value_states (`torch.Tensor`):
610
+ Input value states to be passed to Flash Attention API
611
+ attention_mask (`torch.Tensor`):
612
+ The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
613
+ position of padding tokens and 1 for the position of non-padding tokens.
614
+ dropout (`float`):
615
+ Attention dropout
616
+ softmax_scale (`float`, *optional*):
617
+ The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
618
+ use_sliding_windows (`bool`, *optional*):
619
+ Whether to activate sliding window attention.
620
+ """
621
+ if not self._flash_attn_uses_top_left_mask:
622
+ causal = self.is_causal
623
+ else:
624
+ # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
625
+ causal = self.is_causal and query_length != 1
626
+
627
+ # Contains at least one padding token in the sequence
628
+ if attention_mask is not None:
629
+ batch_size = query_states.shape[0]
630
+ query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
631
+ query_states, key_states, value_states, attention_mask, query_length
632
+ )
633
+
634
+ cu_seqlens_q, cu_seqlens_k = cu_seq_lens
635
+ max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
636
+
637
+ if not use_sliding_windows:
638
+ attn_output_unpad = flash_attn_varlen_func(
639
+ query_states,
640
+ key_states,
641
+ value_states,
642
+ cu_seqlens_q=cu_seqlens_q,
643
+ cu_seqlens_k=cu_seqlens_k,
644
+ max_seqlen_q=max_seqlen_in_batch_q,
645
+ max_seqlen_k=max_seqlen_in_batch_k,
646
+ dropout_p=dropout,
647
+ softmax_scale=softmax_scale,
648
+ causal=causal,
649
+ )
650
+ else:
651
+ attn_output_unpad = flash_attn_varlen_func(
652
+ query_states,
653
+ key_states,
654
+ value_states,
655
+ cu_seqlens_q=cu_seqlens_q,
656
+ cu_seqlens_k=cu_seqlens_k,
657
+ max_seqlen_q=max_seqlen_in_batch_q,
658
+ max_seqlen_k=max_seqlen_in_batch_k,
659
+ dropout_p=dropout,
660
+ softmax_scale=softmax_scale,
661
+ causal=causal,
662
+ window_size=(self.config.sliding_window, self.config.sliding_window),
663
+ )
664
+
665
+ attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
666
+ else:
667
+ if not use_sliding_windows:
668
+ attn_output = flash_attn_func(
669
+ query_states,
670
+ key_states,
671
+ value_states,
672
+ dropout,
673
+ softmax_scale=softmax_scale,
674
+ causal=causal,
675
+ )
676
+ else:
677
+ attn_output = flash_attn_func(
678
+ query_states,
679
+ key_states,
680
+ value_states,
681
+ dropout,
682
+ softmax_scale=softmax_scale,
683
+ causal=causal,
684
+ window_size=(self.config.sliding_window, self.config.sliding_window),
685
+ )
686
+
687
+ return attn_output
688
+
689
+ # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._upad_input
690
+ def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
691
+ batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape
692
+
693
+ # On the first iteration we need to properly re-create the padding mask
694
+ # by slicing it on the proper place
695
+ if kv_seq_len != attention_mask.shape[-1]:
696
+ attention_mask_num_tokens = attention_mask.shape[-1]
697
+ attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :]
698
+
699
+ indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
700
+
701
+ key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)
702
+ value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)
703
+
704
+ if query_length == kv_seq_len:
705
+ query_layer = index_first_axis(
706
+ query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k
707
+ )
708
+ cu_seqlens_q = cu_seqlens_k
709
+ max_seqlen_in_batch_q = max_seqlen_in_batch_k
710
+ indices_q = indices_k
711
+ elif query_length == 1:
712
+ max_seqlen_in_batch_q = 1
713
+ cu_seqlens_q = torch.arange(
714
+ batch_size + 1, dtype=torch.int32, device=query_layer.device
715
+ ) # There is a memcpy here, that is very bad.
716
+ indices_q = cu_seqlens_q[:-1]
717
+ query_layer = query_layer.squeeze(1)
718
+ else:
719
+ # The -q_len: slice assumes left padding.
720
+ attention_mask = attention_mask[:, -query_length:]
721
+ query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
722
+
723
+ return (
724
+ query_layer,
725
+ key_layer,
726
+ value_layer,
727
+ indices_q,
728
+ (cu_seqlens_q, cu_seqlens_k),
729
+ (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
730
+ )
731
+
732
+
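As a companion to `_upad_input` above, here is a torch-only sketch of the metadata (flattened token indices, cumulative sequence lengths, and the longest sequence in the batch) consumed by the varlen flash-attention path. The function name below is hypothetical and stands in for the `_get_unpad_data` helper defined elsewhere in this file.

import torch

def get_unpad_metadata(attention_mask: torch.Tensor):
    # attention_mask: (batch_size, seq_len) with 1 = real token, 0 = padding
    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)             # tokens per sample
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()  # positions of real tokens
    max_seqlen_in_batch = int(seqlens_in_batch.max())
    cu_seqlens = torch.nn.functional.pad(
        torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)
    )  # prepend 0 -> cumulative offsets expected by flash_attn_varlen_func
    return indices, cu_seqlens, max_seqlen_in_batch

mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])
indices, cu_seqlens, max_len = get_unpad_metadata(mask)
# indices -> tensor([0, 1, 2, 4, 5]); cu_seqlens -> tensor([0, 3, 5]); max_len -> 3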
733
+ # copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Phi3
734
+ # TODO @Arthur no longer copied from LLama after static cache
735
+ class Phi3SdpaAttention(Phi3Attention):
736
+ """
737
+ Phi3 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
738
+ `Phi3Attention`, as the weights of the module stay untouched. The only changes are on the forward pass to adapt to
739
+ SDPA API.
740
+ """
741
+
742
+ # Adapted from Phi3Attention.forward
743
+ def forward(
744
+ self,
745
+ hidden_states: torch.Tensor,
746
+ attention_mask: Optional[torch.Tensor] = None,
747
+ position_ids: Optional[torch.LongTensor] = None,
748
+ past_key_value: Optional[Cache] = None,
749
+ output_attentions: bool = False,
750
+ use_cache: bool = False,
751
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
752
+ if output_attentions:
753
+ # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
754
+ logger.warning_once(
755
+ 'Phi3Model is using Phi3SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, '
756
+ 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
757
+ )
758
+ return super().forward(
759
+ hidden_states=hidden_states,
760
+ attention_mask=attention_mask,
761
+ position_ids=position_ids,
762
+ past_key_value=past_key_value,
763
+ output_attentions=output_attentions,
764
+ use_cache=use_cache,
765
+ )
766
+
767
+ bsz, q_len, _ = hidden_states.size()
768
+
769
+ qkv = self.qkv_proj(hidden_states)
770
+ query_pos = self.num_heads * self.head_dim
771
+ query_states = qkv[..., :query_pos]
772
+ key_states = qkv[..., query_pos : query_pos + self.num_key_value_heads * self.head_dim]
773
+ value_states = qkv[..., query_pos + self.num_key_value_heads * self.head_dim :]
774
+
775
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
776
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
777
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
778
+
779
+ kv_seq_len = key_states.shape[-2]
780
+ if past_key_value is not None:
781
+ kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
782
+ cos, sin = self.rotary_emb(value_states, position_ids, seq_len=kv_seq_len)
783
+
784
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
785
+
786
+ if past_key_value is not None:
787
+ cache_kwargs = {'sin': sin, 'cos': cos} # Specific to RoPE models
788
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
789
+
790
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
791
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
792
+
793
+ if attention_mask is not None:
794
+ if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
795
+ raise ValueError(
796
+ f'Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}'
797
+ )
798
+
799
+ # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
800
+ # Reference: https://github.com/pytorch/pytorch/issues/112577.
801
+ if query_states.device.type == 'cuda' and attention_mask is not None:
802
+ query_states = query_states.contiguous()
803
+ key_states = key_states.contiguous()
804
+ value_states = value_states.contiguous()
805
+
806
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
807
+ query_states,
808
+ key_states,
809
+ value_states,
810
+ attn_mask=attention_mask,
811
+ dropout_p=self.attention_dropout if self.training else 0.0,
812
+ # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
813
+ is_causal=self.is_causal and attention_mask is None and q_len > 1,
814
+ )
815
+
816
+ attn_output = attn_output.transpose(1, 2).contiguous()
817
+ attn_output = attn_output.view(bsz, q_len, self.hidden_size)
818
+
819
+ attn_output = self.o_proj(attn_output)
820
+
821
+ return attn_output, None, past_key_value
822
+
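A self-contained illustration (torch >= 2.0) of the `scaled_dot_product_attention` call used above: with no explicit mask, the causal flag reproduces causal-LM masking, while a 4D additive mask requires `is_causal=False`, exactly as the branch above encodes. Shapes are arbitrary.

import torch
import torch.nn.functional as F

bsz, n_heads, q_len, head_dim = 1, 2, 4, 8
q = torch.randn(bsz, n_heads, q_len, head_dim)
k = torch.randn(bsz, n_heads, q_len, head_dim)
v = torch.randn(bsz, n_heads, q_len, head_dim)

# With no explicit mask, is_causal=True applies the lower-triangular causal mask;
# when a 4D additive mask is supplied instead, is_causal must be False.
out = F.scaled_dot_product_attention(q, k, v, attn_mask=None, is_causal=True)
print(out.shape)  # torch.Size([1, 2, 4, 8])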
823
+
824
+ PHI3_ATTENTION_CLASSES = {
825
+ 'eager': Phi3Attention,
826
+ 'flash_attention_2': Phi3FlashAttention2,
827
+ 'sdpa': Phi3SdpaAttention,
828
+ }
829
+
830
+
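All three attention classes registered above expand grouped key/value heads before the attention kernel. The sketch below re-implements that expansion for illustration only; the file defines its own `repeat_kv`.

import torch

def repeat_kv_sketch(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    # (batch, num_key_value_heads, seq_len, head_dim) -> (batch, num_attention_heads, seq_len, head_dim)
    batch, num_kv_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_kv_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_kv_heads * n_rep, slen, head_dim)

kv = torch.randn(1, 2, 5, 8)          # 2 KV heads
print(repeat_kv_sketch(kv, 4).shape)  # torch.Size([1, 8, 5, 8]) -> 8 query heads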
831
+ class Phi3DecoderLayer(nn.Module):
832
+ def __init__(self, config: Phi3Config, layer_idx: int):
833
+ super().__init__()
834
+
835
+ self.config = config
836
+ self.self_attn = PHI3_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx)
837
+
838
+ self.mlp = Phi3MLP(config)
839
+ self.input_layernorm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
840
+
841
+ self.resid_attn_dropout = nn.Dropout(config.resid_pdrop)
842
+ self.resid_mlp_dropout = nn.Dropout(config.resid_pdrop)
843
+ self.post_attention_layernorm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
844
+
845
+ def forward(
846
+ self,
847
+ hidden_states: torch.Tensor,
848
+ attention_mask: Optional[torch.Tensor] = None,
849
+ position_ids: Optional[torch.LongTensor] = None,
850
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
851
+ output_attentions: Optional[bool] = False,
852
+ use_cache: Optional[bool] = False,
853
+ **kwargs,
854
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
855
+ if 'padding_mask' in kwargs:
856
+ warnings.warn(
857
+ 'Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure to use `attention_mask` instead.'
858
+ )
859
+ """
860
+ Args:
861
+ hidden_states (`torch.FloatTensor`):
862
+ input to the layer of shape `(batch, seq_len, embed_dim)`
863
+ attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
864
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
865
+ position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
866
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range
867
+ `[0, config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
868
+ output_attentions (`bool`, *optional*):
869
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
870
+ returned tensors for more detail.
871
+ use_cache (`bool`, *optional*):
872
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
873
+ (see `past_key_values`).
874
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
875
+ """
876
+
877
+ residual = hidden_states
878
+
879
+ hidden_states = self.input_layernorm(hidden_states)
880
+
881
+ # Self Attention
882
+ attn_outputs, self_attn_weights, present_key_value = self.self_attn(
883
+ hidden_states=hidden_states,
884
+ attention_mask=attention_mask,
885
+ position_ids=position_ids,
886
+ past_key_value=past_key_value,
887
+ output_attentions=output_attentions,
888
+ use_cache=use_cache,
889
+ )
890
+
891
+ hidden_states = residual + self.resid_attn_dropout(attn_outputs)
892
+
893
+ residual = hidden_states
894
+ hidden_states = self.post_attention_layernorm(hidden_states)
895
+ hidden_states = self.mlp(hidden_states)
896
+ hidden_states = residual + self.resid_mlp_dropout(hidden_states)
897
+
898
+ outputs = (hidden_states,)
899
+
900
+ if output_attentions:
901
+ outputs += (self_attn_weights,)
902
+
903
+ if use_cache:
904
+ outputs += (present_key_value,)
905
+
906
+ return outputs
907
+
908
+
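`Phi3DecoderLayer` follows a pre-norm residual layout: normalize, attend, add the residual, then normalize, run the MLP, and add the residual again. The condensed sketch below captures that control flow only; plain `nn.LayerNorm` and generic linear modules stand in for `Phi3RMSNorm`, the attention block, and `Phi3MLP`.

import torch
from torch import nn

class PreNormBlockSketch(nn.Module):
    def __init__(self, dim: int, mixer: nn.Module, mlp: nn.Module, pdrop: float = 0.0):
        super().__init__()
        self.norm1, self.norm2 = nn.LayerNorm(dim), nn.LayerNorm(dim)
        self.mixer, self.mlp = mixer, mlp
        self.drop1, self.drop2 = nn.Dropout(pdrop), nn.Dropout(pdrop)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x + self.drop1(self.mixer(self.norm1(x)))   # attention sub-block
        x = x + self.drop2(self.mlp(self.norm2(x)))     # feed-forward sub-block
        return x

blk = PreNormBlockSketch(16, nn.Linear(16, 16), nn.Linear(16, 16), pdrop=0.1)
print(blk(torch.randn(2, 4, 16)).shape)  # torch.Size([2, 4, 16])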
909
+ PHI3_START_DOCSTRING = r"""
910
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
911
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
912
+ etc.)
913
+
914
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
915
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
916
+ and behavior.
917
+
918
+ Parameters:
919
+ config ([`Phi3Config`]):
920
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
921
+ load the weights associated with the model, only the configuration. Check out the
922
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
923
+ """
924
+
925
+
926
+ @add_start_docstrings(
927
+ 'The bare Phi-3 model outputting raw hidden-states without any specific head on top.',
928
+ PHI3_START_DOCSTRING,
929
+ )
930
+ class Phi3PreTrainedModel(PreTrainedModel):
931
+ config_class = Phi3Config
932
+ base_model_prefix = 'model'
933
+ supports_gradient_checkpointing = True
934
+ _no_split_modules = ['Phi3DecoderLayer']
935
+ _skip_keys_device_placement = 'past_key_values'
936
+ _supports_flash_attn_2 = True
937
+ _supports_sdpa = False
938
+ _supports_cache_class = True
939
+
940
+ _version = '0.0.5'
941
+
942
+ def __init__(self, config: Phi3Config):
943
+ if not has_flash_attn:
944
+ config._attn_implementation = 'eager'
945
+ print('Warning: Flash attention is not available, using eager attention instead.')
946
+ super().__init__(config)
947
+
948
+ def _init_weights(self, module):
949
+ std = self.config.initializer_range
950
+ if isinstance(module, nn.Linear):
951
+ module.weight.data.normal_(mean=0.0, std=std)
952
+ if module.bias is not None:
953
+ module.bias.data.zero_()
954
+ elif isinstance(module, nn.Embedding):
955
+ module.weight.data.normal_(mean=0.0, std=std)
956
+ if module.padding_idx is not None:
957
+ module.weight.data[module.padding_idx].zero_()
958
+
959
+
960
+ PHI3_INPUTS_DOCSTRING = r"""
961
+ Args:
962
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
963
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
964
+ it.
965
+
966
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
967
+ [`PreTrainedTokenizer.__call__`] for details.
968
+
969
+ [What are input IDs?](../glossary#input-ids)
970
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
971
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
972
+
973
+ - 1 for tokens that are **not masked**,
974
+ - 0 for tokens that are **masked**.
975
+
976
+ [What are attention masks?](../glossary#attention-mask)
977
+
978
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
979
+ [`PreTrainedTokenizer.__call__`] for details.
980
+
981
+ If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
982
+ `past_key_values`).
983
+
984
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
985
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
986
+ information on the default strategy.
987
+
988
+ - 1 indicates the head is **not masked**,
989
+ - 0 indicates the head is **masked**.
990
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
991
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
992
+ config.n_positions - 1]`.
993
+
994
+ [What are position IDs?](../glossary#position-ids)
995
+ past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
996
+ Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
997
+ blocks) that can be used to speed up sequential decoding. This typically consists of the `past_key_values`
998
+ returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
999
+
1000
+ Two formats are allowed:
1001
+ - a [`~cache_utils.Cache`] instance;
1002
+ - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
1003
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
1004
+ cache format.
1005
+
1006
+ The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
1007
+ legacy cache format will be returned.
1008
+
1009
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
1010
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
1011
+ of shape `(batch_size, sequence_length)`.
1012
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
1013
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
1014
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
1015
+ model's internal embedding lookup matrix.
1016
+ use_cache (`bool`, *optional*):
1017
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
1018
+ `past_key_values`).
1019
+ output_attentions (`bool`, *optional*):
1020
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
1021
+ tensors for more detail.
1022
+ output_hidden_states (`bool`, *optional*):
1023
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
1024
+ more detail.
1025
+ return_dict (`bool`, *optional*):
1026
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
1027
+ """
1028
+
1029
+
1030
+ @add_start_docstrings(
1031
+ 'The bare Phi-3 model outputting raw hidden-states without any specific head on top.',
1032
+ PHI3_START_DOCSTRING,
1033
+ )
1034
+ class Phi3Model(Phi3PreTrainedModel):
1035
+ """
1036
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Phi3DecoderLayer`]
1037
+
1038
+ Args:
1039
+ config: Phi3Config
1040
+ """
1041
+
1042
+ def __init__(self, config: Phi3Config):
1043
+ super().__init__(config)
1044
+ self.padding_idx = config.pad_token_id
1045
+ self.vocab_size = config.vocab_size
1046
+
1047
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
1048
+ self.embed_dropout = nn.Dropout(config.embd_pdrop)
1049
+ self.layers = nn.ModuleList(
1050
+ [Phi3DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
1051
+ )
1052
+ self._attn_implementation = config._attn_implementation
1053
+
1054
+ self.norm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1055
+
1056
+ self.gradient_checkpointing = False
1057
+ # Initialize weights and apply final processing
1058
+ self.post_init()
1059
+
1060
+ def get_input_embeddings(self):
1061
+ return self.embed_tokens
1062
+
1063
+ def set_input_embeddings(self, value):
1064
+ self.embed_tokens = value
1065
+
1066
+ @add_start_docstrings_to_model_forward(PHI3_INPUTS_DOCSTRING)
1067
+ def forward(
1068
+ self,
1069
+ input_ids: torch.LongTensor = None,
1070
+ attention_mask: Optional[torch.Tensor] = None,
1071
+ position_ids: Optional[torch.LongTensor] = None,
1072
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1073
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1074
+ use_cache: Optional[bool] = None,
1075
+ output_attentions: Optional[bool] = None,
1076
+ output_hidden_states: Optional[bool] = None,
1077
+ return_dict: Optional[bool] = None,
1078
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
1079
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1080
+ output_hidden_states = (
1081
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1082
+ )
1083
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
1084
+
1085
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1086
+
1087
+ # retrieve input_ids and inputs_embeds
1088
+ if input_ids is not None and inputs_embeds is not None:
1089
+ raise ValueError('You cannot specify both input_ids and inputs_embeds at the same time')
1090
+ elif input_ids is not None:
1091
+ batch_size, seq_length = input_ids.shape[:2]
1092
+ elif inputs_embeds is not None:
1093
+ batch_size, seq_length = inputs_embeds.shape[:2]
1094
+ else:
1095
+ raise ValueError('You have to specify either input_ids or inputs_embeds')
1096
+
1097
+ past_key_values_length = 0
1098
+
1099
+ if self.gradient_checkpointing and self.training:
1100
+ if use_cache:
1101
+ logger.warning_once(
1102
+ '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...'
1103
+ )
1104
+ use_cache = False
1105
+
1106
+ if use_cache:
1107
+ use_legacy_cache = not isinstance(past_key_values, Cache)
1108
+ if use_legacy_cache:
1109
+ past_key_values = DynamicCache.from_legacy_cache(past_key_values)
1110
+ past_key_values_length = past_key_values.get_usable_length(seq_length)
1111
+
1112
+ if position_ids is None:
1113
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
1114
+ position_ids = torch.arange(
1115
+ past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
1116
+ )
1117
+ position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
1118
+ else:
1119
+ position_ids = position_ids.view(-1, seq_length).long()
1120
+
1121
+ if inputs_embeds is None:
1122
+ inputs_embeds = self.embed_tokens(input_ids)
1123
+
1124
+ if attention_mask is not None and self._attn_implementation == 'flash_attention_2' and use_cache:
1125
+ is_padding_right = attention_mask[:, -1].sum().item() != batch_size
1126
+ if is_padding_right:
1127
+ raise ValueError(
1128
+ "You are attempting to perform batched generation with padding_side='right'"
1129
+ ' this may lead to unexpected behaviour for Flash Attention version of Phi3. Make sure to '
1130
+ " call `tokenizer.padding_side = 'left'` before tokenizing the input. "
1131
+ )
1132
+
1133
+ if self._attn_implementation == 'flash_attention_2':
1134
+ # 2d mask is passed through the layers
1135
+ attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
1136
+ else:
1137
+ # 4d mask is passed through the layers
1138
+ attention_mask = _prepare_4d_causal_attention_mask(
1139
+ attention_mask,
1140
+ (batch_size, seq_length),
1141
+ inputs_embeds,
1142
+ past_key_values_length,
1143
+ sliding_window=self.config.sliding_window,
1144
+ )
1145
+
1146
+ hidden_states = inputs_embeds
1147
+
1148
+ # decoder layers
1149
+ all_hidden_states = () if output_hidden_states else None
1150
+ all_self_attns = () if output_attentions else None
1151
+ next_decoder_cache = None
1152
+
1153
+ for decoder_layer in self.layers:
1154
+ if output_hidden_states:
1155
+ all_hidden_states += (hidden_states,)
1156
+
1157
+ if self.gradient_checkpointing and self.training:
1158
+ layer_outputs = self._gradient_checkpointing_func(
1159
+ decoder_layer.__call__,
1160
+ hidden_states,
1161
+ attention_mask,
1162
+ position_ids,
1163
+ past_key_values,
1164
+ output_attentions,
1165
+ use_cache,
1166
+ )
1167
+ else:
1168
+ layer_outputs = decoder_layer(
1169
+ hidden_states,
1170
+ attention_mask=attention_mask,
1171
+ position_ids=position_ids,
1172
+ past_key_value=past_key_values,
1173
+ output_attentions=output_attentions,
1174
+ use_cache=use_cache,
1175
+ )
1176
+
1177
+ hidden_states = layer_outputs[0]
1178
+
1179
+ if use_cache:
1180
+ next_decoder_cache = layer_outputs[2 if output_attentions else 1]
1181
+
1182
+ if output_attentions:
1183
+ all_self_attns += (layer_outputs[1],)
1184
+
1185
+ hidden_states = self.norm(hidden_states)
1186
+
1187
+ # add hidden states from the last decoder layer
1188
+ if output_hidden_states:
1189
+ all_hidden_states += (hidden_states,)
1190
+
1191
+ next_cache = None
1192
+ if use_cache:
1193
+ next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
1194
+ if not return_dict:
1195
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
1196
+ return BaseModelOutputWithPast(
1197
+ last_hidden_state=hidden_states,
1198
+ past_key_values=next_cache,
1199
+ hidden_states=all_hidden_states,
1200
+ attentions=all_self_attns,
1201
+ )
1202
+
1203
+
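`Phi3Model.forward` transparently converts legacy tuple caches into a `DynamicCache` and back when returning. The snippet below shows that round trip in isolation; tensor sizes are arbitrary, and it assumes a transformers version that exposes `DynamicCache` (consistent with the cache utilities this file already imports).

import torch
from transformers import DynamicCache

legacy = tuple(
    (torch.zeros(1, 4, 3, 8), torch.zeros(1, 4, 3, 8))  # (key, value) per layer
    for _ in range(2)
)
cache = DynamicCache.from_legacy_cache(legacy)
print(cache.get_usable_length(5), len(cache.to_legacy_cache()))  # 3 2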
1204
+ class Phi3ForCausalLM(Phi3PreTrainedModel):
1205
+ _tied_weights_keys = ['lm_head.weight']
1206
+
1207
+ # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.__init__ with Llama->Phi3
1208
+ def __init__(self, config):
1209
+ super().__init__(config)
1210
+ self.model = Phi3Model(config)
1211
+ self.vocab_size = config.vocab_size
1212
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
1213
+
1214
+ # Initialize weights and apply final processing
1215
+ self.post_init()
1216
+
1217
+ # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_input_embeddings
1218
+ def get_input_embeddings(self):
1219
+ return self.model.embed_tokens
1220
+
1221
+ # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_input_embeddings
1222
+ def set_input_embeddings(self, value):
1223
+ self.model.embed_tokens = value
1224
+
1225
+ # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_output_embeddings
1226
+ def get_output_embeddings(self):
1227
+ return self.lm_head
1228
+
1229
+ # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_output_embeddings
1230
+ def set_output_embeddings(self, new_embeddings):
1231
+ self.lm_head = new_embeddings
1232
+
1233
+ # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_decoder
1234
+ def set_decoder(self, decoder):
1235
+ self.model = decoder
1236
+
1237
+ # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_decoder
1238
+ def get_decoder(self):
1239
+ return self.model
1240
+
1241
+ # Ignore copy
1242
+ @add_start_docstrings_to_model_forward(PHI3_INPUTS_DOCSTRING)
1243
+ @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
1244
+ def forward(
1245
+ self,
1246
+ input_ids: torch.LongTensor = None,
1247
+ attention_mask: Optional[torch.Tensor] = None,
1248
+ position_ids: Optional[torch.LongTensor] = None,
1249
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1250
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1251
+ labels: Optional[torch.LongTensor] = None,
1252
+ use_cache: Optional[bool] = None,
1253
+ output_attentions: Optional[bool] = None,
1254
+ output_hidden_states: Optional[bool] = None,
1255
+ return_dict: Optional[bool] = None,
1256
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
1257
+ r"""
1258
+ Args:
1259
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1260
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
1261
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
1262
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
1263
+
1264
+ Returns:
1265
+
1266
+ Example:
1267
+
1268
+ ```python
1269
+ >>> from transformers import AutoTokenizer, Phi3ForCausalLM
1270
+
1271
+ >>> model = Phi3ForCausalLM.from_pretrained("microsoft/phi-3-mini-4k-instruct")
1272
+ >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-3-mini-4k-instruct")
1273
+
1274
+ >>> prompt = "This is an example script ."
1275
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
1276
+
1277
+ >>> # Generate
1278
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
1279
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
1280
+ 'This is an example script .\n Certainly! Below is a sample script that demonstrates a simple task, such as calculating the sum'
1281
+ ```"""
1282
+
1283
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1284
+ output_hidden_states = (
1285
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1286
+ )
1287
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1288
+
1289
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
1290
+ outputs = self.model(
1291
+ input_ids=input_ids,
1292
+ attention_mask=attention_mask,
1293
+ position_ids=position_ids,
1294
+ past_key_values=past_key_values,
1295
+ inputs_embeds=inputs_embeds,
1296
+ use_cache=use_cache,
1297
+ output_attentions=output_attentions,
1298
+ output_hidden_states=output_hidden_states,
1299
+ return_dict=return_dict,
1300
+ )
1301
+
1302
+ hidden_states = outputs[0]
1303
+ logits = self.lm_head(hidden_states)
1304
+ logits = logits.float()
1305
+
1306
+ loss = None
1307
+ if labels is not None:
1308
+ # Shift so that tokens < n predict n
1309
+ shift_logits = logits[..., :-1, :].contiguous()
1310
+ shift_labels = labels[..., 1:].contiguous()
1311
+ # Flatten the tokens
1312
+ loss_fct = CrossEntropyLoss()
1313
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
1314
+ shift_labels = shift_labels.view(-1)
1315
+ # Enable model parallelism
1316
+ shift_labels = shift_labels.to(shift_logits.device)
1317
+ loss = loss_fct(shift_logits, shift_labels)
1318
+
1319
+ if not return_dict:
1320
+ output = (logits,) + outputs[1:]
1321
+ return (loss,) + output if loss is not None else output
1322
+
1323
+ return CausalLMOutputWithPast(
1324
+ loss=loss,
1325
+ logits=logits,
1326
+ past_key_values=outputs.past_key_values,
1327
+ hidden_states=outputs.hidden_states,
1328
+ attentions=outputs.attentions,
1329
+ )
1330
+
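The loss computation above implements the standard causal-LM one-token shift: position t's logits are scored against token t+1. A tiny numeric illustration with made-up values:

import torch

labels = torch.tensor([[10, 11, 12, 13]])
logits = torch.randn(1, 4, 50)                 # (batch, seq_len, vocab)
shift_logits = logits[..., :-1, :]             # positions 0..2
shift_labels = labels[..., 1:]                 # targets 11, 12, 13
loss = torch.nn.functional.cross_entropy(
    shift_logits.reshape(-1, 50), shift_labels.reshape(-1)
)
print(shift_logits.shape, shift_labels.shape, loss.shape)
# torch.Size([1, 3, 50]) torch.Size([1, 3]) torch.Size([])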
1331
+ # Copied from transformers.models.persimmon.modeling_persimmon.PersimmonForCausalLM.prepare_inputs_for_generation
1332
+ def prepare_inputs_for_generation(
1333
+ self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
1334
+ ):
1335
+ if past_key_values is not None:
1336
+ if isinstance(past_key_values, Cache):
1337
+ cache_length = past_key_values.get_seq_length()
1338
+ past_length = past_key_values.seen_tokens
1339
+ max_cache_length = past_key_values.get_max_length()
1340
+ else:
1341
+ cache_length = past_length = past_key_values[0][0].shape[2]
1342
+ max_cache_length = None
1343
+
1344
+ # Keep only the unprocessed tokens:
1345
+ # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
1346
+ # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
1347
+ # input)
1348
+ if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
1349
+ input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
1350
+ # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
1351
+ # input_ids based on the past_length.
1352
+ elif past_length < input_ids.shape[1]:
1353
+ input_ids = input_ids[:, past_length:]
1354
+ # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
1355
+
1356
+ # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
1357
+ if (
1358
+ max_cache_length is not None
1359
+ and attention_mask is not None
1360
+ and cache_length + input_ids.shape[1] > max_cache_length
1361
+ ):
1362
+ attention_mask = attention_mask[:, -max_cache_length:]
1363
+
1364
+ position_ids = kwargs.get('position_ids', None)
1365
+ if attention_mask is not None and position_ids is None:
1366
+ # create position_ids on the fly for batch generation
1367
+ position_ids = attention_mask.long().cumsum(-1) - 1
1368
+ position_ids.masked_fill_(attention_mask == 0, 1)
1369
+ if past_key_values:
1370
+ position_ids = position_ids[:, -input_ids.shape[1] :]
1371
+
1372
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
1373
+ if (inputs_embeds is not None and past_key_values is None) or (inputs_embeds is not None and len(past_key_values) == 0):
1374
+ model_inputs = {'inputs_embeds': inputs_embeds}
1375
+ else:
1376
+ model_inputs = {'input_ids': input_ids}
1377
+
1378
+ model_inputs.update(
1379
+ {
1380
+ 'position_ids': position_ids,
1381
+ 'past_key_values': past_key_values,
1382
+ 'use_cache': kwargs.get('use_cache'),
1383
+ 'attention_mask': attention_mask,
1384
+ }
1385
+ )
1386
+ return model_inputs
1387
+
1388
+ @staticmethod
1389
+ # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM._reorder_cache
1390
+ def _reorder_cache(past_key_values, beam_idx):
1391
+ reordered_past = ()
1392
+ for layer_past in past_key_values:
1393
+ reordered_past += (
1394
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
1395
+ )
1396
+ return reordered_past
1397
+
1398
+
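`_reorder_cache` simply re-indexes the batch dimension of every cached tensor so that each beam keeps following the hypothesis it was assigned. A standalone illustration of that `index_select`:

import torch

past_state = torch.arange(6).view(3, 2)      # per-layer cache slice, batch size 3
beam_idx = torch.tensor([2, 0, 0])           # beams 0/1/2 now continue samples 2/0/0
print(past_state.index_select(0, beam_idx))
# tensor([[4, 5],
#         [0, 1],
#         [0, 1]])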
1399
+ @add_start_docstrings(
1400
+ """
1401
+ The [`Phi3Model`] with a sequence classification head on top (linear layer).
1402
+
1403
+ [`Phi3ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
1404
+ (e.g. GPT-2) do.
1405
+
1406
+ Since it does classification on the last token, it needs to know the position of the last token. If a
1407
+ `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
1408
+ no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
1409
+ padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
1410
+ each row of the batch).
1411
+ """,
1412
+ PHI3_START_DOCSTRING,
1413
+ )
1414
+ # Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Phi3, LLAMA->PHI3, self.transformer->self.model, transformer_outputs->model_outputs
1415
+ class Phi3ForSequenceClassification(Phi3PreTrainedModel):
1416
+ def __init__(self, config):
1417
+ super().__init__(config)
1418
+ self.num_labels = config.num_labels
1419
+ self.model = Phi3Model(config)
1420
+ self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
1421
+
1422
+ # Initialize weights and apply final processing
1423
+ self.post_init()
1424
+
1425
+ def get_input_embeddings(self):
1426
+ return self.model.embed_tokens
1427
+
1428
+ def set_input_embeddings(self, value):
1429
+ self.model.embed_tokens = value
1430
+
1431
+ @add_start_docstrings_to_model_forward(PHI3_INPUTS_DOCSTRING)
1432
+ def forward(
1433
+ self,
1434
+ input_ids: torch.LongTensor = None,
1435
+ attention_mask: Optional[torch.Tensor] = None,
1436
+ position_ids: Optional[torch.LongTensor] = None,
1437
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1438
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1439
+ labels: Optional[torch.LongTensor] = None,
1440
+ use_cache: Optional[bool] = None,
1441
+ output_attentions: Optional[bool] = None,
1442
+ output_hidden_states: Optional[bool] = None,
1443
+ return_dict: Optional[bool] = None,
1444
+ ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
1445
+ r"""
1446
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1447
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1448
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
1449
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1450
+ """
1451
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1452
+
1453
+ model_outputs = self.model(
1454
+ input_ids,
1455
+ attention_mask=attention_mask,
1456
+ position_ids=position_ids,
1457
+ past_key_values=past_key_values,
1458
+ inputs_embeds=inputs_embeds,
1459
+ use_cache=use_cache,
1460
+ output_attentions=output_attentions,
1461
+ output_hidden_states=output_hidden_states,
1462
+ return_dict=return_dict,
1463
+ )
1464
+ hidden_states = model_outputs[0]
1465
+ logits = self.score(hidden_states)
1466
+
1467
+ if input_ids is not None:
1468
+ batch_size = input_ids.shape[0]
1469
+ else:
1470
+ batch_size = inputs_embeds.shape[0]
1471
+
1472
+ if self.config.pad_token_id is None and batch_size != 1:
1473
+ raise ValueError('Cannot handle batch sizes > 1 if no padding token is defined.')
1474
+ if self.config.pad_token_id is None:
1475
+ sequence_lengths = -1
1476
+ else:
1477
+ if input_ids is not None:
1478
+ # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility
1479
+ sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
1480
+ sequence_lengths = sequence_lengths % input_ids.shape[-1]
1481
+ sequence_lengths = sequence_lengths.to(logits.device)
1482
+ else:
1483
+ sequence_lengths = -1
1484
+
1485
+ pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
1486
+
1487
+ loss = None
1488
+ if labels is not None:
1489
+ labels = labels.to(logits.device)
1490
+ if self.config.problem_type is None:
1491
+ if self.num_labels == 1:
1492
+ self.config.problem_type = 'regression'
1493
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
1494
+ self.config.problem_type = 'single_label_classification'
1495
+ else:
1496
+ self.config.problem_type = 'multi_label_classification'
1497
+
1498
+ if self.config.problem_type == 'regression':
1499
+ loss_fct = MSELoss()
1500
+ if self.num_labels == 1:
1501
+ loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
1502
+ else:
1503
+ loss = loss_fct(pooled_logits, labels)
1504
+ elif self.config.problem_type == 'single_label_classification':
1505
+ loss_fct = CrossEntropyLoss()
1506
+ loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
1507
+ elif self.config.problem_type == 'multi_label_classification':
1508
+ loss_fct = BCEWithLogitsLoss()
1509
+ loss = loss_fct(pooled_logits, labels)
1510
+ if not return_dict:
1511
+ output = (pooled_logits,) + model_outputs[1:]
1512
+ return ((loss,) + output) if loss is not None else output
1513
+
1514
+ return SequenceClassifierOutputWithPast(
1515
+ loss=loss,
1516
+ logits=pooled_logits,
1517
+ past_key_values=model_outputs.past_key_values,
1518
+ hidden_states=model_outputs.hidden_states,
1519
+ attentions=model_outputs.attentions,
1520
+ )
1521
+
1522
+
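The sequence-classification head above pools the logits of the last non-padding token per row, using an argmax-over-pad-mask trick that stays ONNX-friendly. The following torch-only sketch reproduces that indexing with made-up values:

import torch

pad_token_id = 0
input_ids = torch.tensor([[5, 6, 7, 0], [8, 9, 0, 0]])          # right-padded batch
logits = torch.randn(2, 4, 3)                                   # (batch, seq_len, num_labels)

sequence_lengths = torch.eq(input_ids, pad_token_id).int().argmax(-1) - 1
sequence_lengths = sequence_lengths % input_ids.shape[-1]        # handles rows with no padding
pooled = logits[torch.arange(2), sequence_lengths]               # last real token per row
print(sequence_lengths.tolist(), pooled.shape)                   # [2, 1] torch.Size([2, 3])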
1523
+ @add_start_docstrings(
1524
+ """
1525
+ [`Phi3Model`] with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
1526
+ Named-Entity-Recognition (NER) tasks.
1527
+ """,
1528
+ PHI3_START_DOCSTRING,
1529
+ )
1530
+ # Copied from transformers.models.mpt.modeling_mpt.MptForTokenClassification with Mpt->Phi3,MPT->PHI3,self.transformer->self.model,transformer_outputs->model_outputs
1531
+ class Phi3ForTokenClassification(Phi3PreTrainedModel):
1532
+ def __init__(self, config: Phi3Config):
1533
+ super().__init__(config)
1534
+ self.num_labels = config.num_labels
1535
+
1536
+ self.model = Phi3Model(config)
1537
+ if hasattr(config, 'classifier_dropout') and config.classifier_dropout is not None:
1538
+ classifier_dropout = config.classifier_dropout
1539
+ elif hasattr(config, 'hidden_dropout') and config.hidden_dropout is not None:
1540
+ classifier_dropout = config.hidden_dropout
1541
+ else:
1542
+ classifier_dropout = 0.1
1543
+ self.dropout = nn.Dropout(classifier_dropout)
1544
+ self.classifier = nn.Linear(config.hidden_size, config.num_labels)
1545
+
1546
+ # Initialize weights and apply final processing
1547
+ self.post_init()
1548
+
1549
+ @add_start_docstrings_to_model_forward(PHI3_INPUTS_DOCSTRING)
1550
+ @add_code_sample_docstrings(
1551
+ checkpoint=_CHECKPOINT_FOR_DOC,
1552
+ output_type=TokenClassifierOutput,
1553
+ config_class=_CONFIG_FOR_DOC,
1554
+ )
1555
+ def forward(
1556
+ self,
1557
+ input_ids: Optional[torch.LongTensor] = None,
1558
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
1559
+ attention_mask: Optional[torch.Tensor] = None,
1560
+ inputs_embeds: Optional[torch.Tensor] = None,
1561
+ labels: Optional[torch.Tensor] = None,
1562
+ use_cache: Optional[bool] = None,
1563
+ output_attentions: Optional[bool] = None,
1564
+ output_hidden_states: Optional[bool] = None,
1565
+ return_dict: Optional[bool] = None,
1566
+ **deprecated_arguments,
1567
+ ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
1568
+ r"""
1569
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1570
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1571
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
1572
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1573
+ """
1574
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1575
+
1576
+ model_outputs = self.model(
1577
+ input_ids,
1578
+ past_key_values=past_key_values,
1579
+ attention_mask=attention_mask,
1580
+ inputs_embeds=inputs_embeds,
1581
+ use_cache=use_cache,
1582
+ output_attentions=output_attentions,
1583
+ output_hidden_states=output_hidden_states,
1584
+ return_dict=return_dict,
1585
+ )
1586
+
1587
+ hidden_states = model_outputs[0]
1588
+ hidden_states = self.dropout(hidden_states)
1589
+ logits = self.classifier(hidden_states)
1590
+
1591
+ loss = None
1592
+ if labels is not None:
1593
+ # move labels to correct device to enable model parallelism
1594
+ labels = labels.to(logits.device)
1595
+ batch_size, seq_length = labels.shape
1596
+ loss_fct = CrossEntropyLoss()
1597
+ loss = loss_fct(
1598
+ logits.view(batch_size * seq_length, self.num_labels), labels.view(batch_size * seq_length)
1599
+ )
1600
+
1601
+ if not return_dict:
1602
+ output = (logits,) + model_outputs[2:]
1603
+ return ((loss,) + output) if loss is not None else output
1604
+
1605
+ return TokenClassifierOutput(
1606
+ loss=loss,
1607
+ logits=logits,
1608
+ hidden_states=model_outputs.hidden_states,
1609
+ attentions=model_outputs.attentions,
1610
+ )
modeling_sa2va_chat.py ADDED
@@ -0,0 +1,1100 @@
1
+ # --------------------------------------------------------
2
+ # InternVL
3
+ # Copyright (c) 2024 OpenGVLab
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+
7
+ import warnings
8
+ from typing import Any, List, Optional, Tuple, Union, Dict, Set
9
+ from PIL import Image
10
+ import re
11
+
12
+ import torchvision.transforms as T
13
+ from torchvision.transforms.functional import InterpolationMode
14
+
15
+ import torch.utils.checkpoint
16
+ import transformers
17
+
18
+ from .modeling_internlm2 import InternLM2ForCausalLM
19
+ from .modeling_phi3 import Phi3ForCausalLM
20
+ from peft import LoraConfig, get_peft_model
21
+ from torch import nn
22
+ from torch.nn import CrossEntropyLoss
23
+ from transformers import (AutoModel, GenerationConfig, LlamaForCausalLM,
24
+ LlamaTokenizer, Qwen2ForCausalLM)
25
+ from transformers.modeling_outputs import CausalLMOutputWithPast
26
+ from transformers.modeling_utils import PreTrainedModel
27
+ from transformers.utils import ModelOutput, logging, TensorType
28
+ from transformers import StoppingCriteriaList, StoppingCriteria
29
+ from transformers.models.mask2former.image_processing_mask2former import (
30
+ remove_low_and_no_objects, check_segment_validity)
31
+
32
+ from .configuration_sa2va_chat import Sa2VAChatConfig
33
+ from .modeling_intern_vit import InternVisionModel, has_flash_attn
34
+
35
+ from .templates import PROMPT_TEMPLATE
36
+
37
+ import numpy as np
38
+ from torchvision.transforms.functional import resize, to_pil_image
39
+
40
+ from types import MethodType
41
+ import torch.nn.functional as F
42
+
43
+ from transformers import Mask2FormerForUniversalSegmentation
44
+
45
+ from .mask2former import (
46
+ Mask2FormerMaskedAttentionDecoder_forward_first3layers,
47
+ Mask2FormerMaskedAttentionDecoder_forward_last3layers,
48
+ Mask2FormerTransformerModule_forward_first_part,
49
+ Mask2FormerTransformerModule_forward_second_part,
50
+ Mask2FormerModel_forward_first_part,
51
+ Mask2FormerModel_forward_second_part,
52
+ Mask2FormerForUniversalSegmentation_forward_first_part,
53
+ Mask2FormerForUniversalSegmentation_forward_second_part,
54
+ _post_init,
55
+ ov_class_predictor,
56
+ Mask2FormerLoss_loss_labels,
57
+ Mask2FormerLoss_loss_masks,
58
+ Mask2FormerLoss_sample_points_using_uncertainty,
59
+ Mask2FormerHungarianMatcher_forward,
60
+ )
61
+
62
+ from .constants import (
63
+ IMG_CONTEXT_TOKEN, OBJ_CONTEXT_TOKEN, SEG_TOKEN, CLS_TOKEN, BG_CLS_TOKEN, OBJ_START_TOKEN, OBJ_END_TOKEN)
64
+
65
+
66
+
67
+ try:
68
+ from .flash_attention import FlashAttention
69
+ has_flash_attn = True
70
+ except ImportError:
71
+ print('FlashAttention is not installed.')
72
+ has_flash_attn = False
73
+
74
+ logger = logging.get_logger(__name__)
75
+
76
+ def version_cmp(v1, v2, op='eq'):
77
+ import operator
78
+
79
+ from packaging import version
80
+ op_func = getattr(operator, op)
81
+ return op_func(version.parse(v1), version.parse(v2))
82
+
83
+ class StopWordStoppingCriteria(StoppingCriteria):
84
+ """StopWord stopping criteria."""
85
+
86
+ def __init__(self, tokenizer, stop_word):
87
+ self.tokenizer = tokenizer
88
+ self.stop_word = stop_word
89
+ self.length = len(self.stop_word)
90
+
91
+ def __call__(self, input_ids, *args, **kwargs) -> bool:
92
+ cur_text = self.tokenizer.decode(input_ids[0])
93
+ cur_text = cur_text.replace('\r', '').replace('\n', '')
94
+ return cur_text[-self.length:] == self.stop_word
95
+
96
+ def get_stop_criteria(
97
+ tokenizer,
98
+ stop_words=[],
99
+ ):
100
+ stop_criteria = StoppingCriteriaList()
101
+ for word in stop_words:
102
+ stop_criteria.append(StopWordStoppingCriteria(tokenizer, word))
103
+ return stop_criteria
104
+
105
+ class DirectResize:
106
+ def __init__(self, target_length: int) -> None:
107
+ self.target_length = target_length
108
+
109
+ def apply_image(self, image: np.ndarray) -> np.ndarray:
110
+ """
111
+ Expects a numpy array with shape HxWxC in uint8 format.
112
+ """
113
+ img = to_pil_image(image, mode='RGB')
114
+ return np.array(img.resize((self.target_length, self.target_length)))
115
+
116
+ class Sa2VAChatModel(PreTrainedModel):
117
+ config_class = Sa2VAChatConfig
118
+ main_input_name = 'pixel_values'
119
+ base_model_prefix = 'language_model'
120
+ _no_split_modules = ['InternVisionModel', 'LlamaDecoderLayer', 'InternLM2DecoderLayer',
121
+ 'Phi3DecoderLayer', 'Qwen2DecoderLayer', 'Mask2FormerForUniversalSegmentation']
122
+ _supports_flash_attn_2 = True
123
+ supports_gradient_checkpointing = True
124
+
125
+ def __init__(self, config: Sa2VAChatConfig, vision_model=None, language_model=None, mask2former=None, use_flash_attn=True):
126
+ super().__init__(config)
127
+
128
+ assert version_cmp(transformers.__version__, '4.37.0', 'ge')
129
+ image_size = config.force_image_size or config.vision_config.image_size
130
+ patch_size = config.vision_config.patch_size
131
+ self.patch_size = patch_size
132
+ self.select_layer = config.select_layer
133
+ self.template = config.template
134
+ self.template = self.template.replace('-', '_')
135
+ self.num_image_token = int((image_size // patch_size) ** 2 * (config.downsample_ratio ** 2))
136
+ self.downsample_ratio = config.downsample_ratio
137
+ self.ps_version = config.ps_version
138
+ self.llm_arch_name = config.llm_config.architectures[0]
139
+
140
+ use_flash_attn = use_flash_attn if has_flash_attn else False
141
+ config.vision_config.use_flash_attn = True if use_flash_attn else False
142
+ config.llm_config._attn_implementation = 'flash_attention_2' if use_flash_attn else 'eager'
143
+
144
+ logger.info(f'num_image_token: {self.num_image_token}')
145
+ logger.info(f'ps_version: {self.ps_version}')
146
+ if vision_model is not None:
147
+ self.vision_model = vision_model
148
+ else:
149
+ self.vision_model = InternVisionModel(config.vision_config)
150
+ if language_model is not None:
151
+ self.language_model = language_model
152
+ else:
153
+ if config.llm_config.architectures[0] == 'LlamaForCausalLM':
154
+ self.language_model = LlamaForCausalLM(config.llm_config)
155
+ elif config.llm_config.architectures[0] == 'InternLM2ForCausalLM':
156
+ self.language_model = InternLM2ForCausalLM(config.llm_config)
157
+ elif config.llm_config.architectures[0] == 'Phi3ForCausalLM':
158
+ self.language_model = Phi3ForCausalLM(config.llm_config)
159
+ elif config.llm_config.architectures[0] == 'Qwen2ForCausalLM':
160
+ self.language_model = Qwen2ForCausalLM(config.llm_config)
161
+ else:
162
+ raise NotImplementedError(f'{config.llm_config.architectures[0]} is not implemented.')
163
+
164
+ vit_hidden_size = config.vision_config.hidden_size
165
+ llm_hidden_size = config.llm_config.hidden_size
166
+
167
+ self.mlp1 = nn.Sequential(
168
+ nn.LayerNorm(vit_hidden_size * int(1 / self.downsample_ratio) ** 2),
169
+ nn.Linear(vit_hidden_size * int(1 / self.downsample_ratio) ** 2, llm_hidden_size),
170
+ nn.GELU(),
171
+ nn.Linear(llm_hidden_size, llm_hidden_size)
172
+ )
173
+
174
+ self.img_context_token_id = None
175
+ self.conv_template = PROMPT_TEMPLATE[self.template]
176
+ self.template = self.conv_template
177
+ if hasattr(config, 'system_message'):
178
+ self.system_message = config.system_message
179
+ self.num_samples = 0
180
+
181
+ if config.use_backbone_lora:
182
+ self.wrap_backbone_lora(r=config.use_backbone_lora, lora_alpha=2 * config.use_backbone_lora)
183
+
184
+ if config.use_llm_lora:
185
+ self.wrap_llm_lora(r=config.use_llm_lora, lora_alpha=2 * config.use_llm_lora)
186
+
187
+ # mask2former
188
+ if mask2former is None:
189
+ self.mask2former = Mask2FormerForUniversalSegmentation(config.m2f_config)
190
+ else:
191
+ self.mask2former = mask2former
192
+ assert self.mask2former.config.num_queries == config.num_m2f_queries
193
+ self.num_m2f_queries = config.num_m2f_queries
194
+ self.num_m2f_proposals = config.num_m2f_proposals
195
+ self.m2f_input_size = 1024
196
+
197
+ # register functions
198
+ self.mask2former._post_init = MethodType(_post_init, self.mask2former)
199
+ self.mask2former.ov_class_predictor = MethodType(ov_class_predictor, self.mask2former)
200
+ self.mask2former.criterion.loss_labels = MethodType(Mask2FormerLoss_loss_labels, self.mask2former.criterion)
201
+ self.mask2former.criterion.loss_masks = MethodType(Mask2FormerLoss_loss_masks, self.mask2former.criterion)
202
+ self.mask2former.criterion.sample_points_using_uncertainty = MethodType(
203
+ Mask2FormerLoss_sample_points_using_uncertainty, self.mask2former.criterion)
204
+ self.mask2former.forward_first_part = MethodType(Mask2FormerForUniversalSegmentation_forward_first_part, self.mask2former)
205
+ self.mask2former.forward_second_part = MethodType(Mask2FormerForUniversalSegmentation_forward_second_part, self.mask2former)
206
+ self.mask2former.model.Mask2FormerModel_forward_first_part = MethodType(
207
+ Mask2FormerModel_forward_first_part, self.mask2former.model)
208
+ self.mask2former.model.Mask2FormerModel_forward_second_part = MethodType(
209
+ Mask2FormerModel_forward_second_part, self.mask2former.model)
210
+ self.mask2former.model.transformer_module.Mask2FormerTransformerModule_forward_first_part = MethodType(
211
+ Mask2FormerTransformerModule_forward_first_part, self.mask2former.model.transformer_module
212
+ )
213
+ self.mask2former.model.transformer_module.Mask2FormerTransformerModule_forward_second_part = MethodType(
214
+ Mask2FormerTransformerModule_forward_second_part, self.mask2former.model.transformer_module
215
+ )
216
+ self.mask2former.model.transformer_module.decoder.Mask2FormerMaskedAttentionDecoder_forward_first3layers = MethodType(
217
+ Mask2FormerMaskedAttentionDecoder_forward_first3layers, self.mask2former.model.transformer_module.decoder
218
+ )
219
+ self.mask2former.model.transformer_module.decoder.Mask2FormerMaskedAttentionDecoder_forward_last3layers = MethodType(
220
+ Mask2FormerMaskedAttentionDecoder_forward_last3layers, self.mask2former.model.transformer_module.decoder
221
+ )
222
+ self.mask2former.criterion.matcher.forward = MethodType(Mask2FormerHungarianMatcher_forward, self.mask2former.criterion.matcher)
223
+
224
+ # post_init of mask2former
225
+ self.mask2former._post_init()
226
+
227
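Aside (illustration, not part of the uploaded file): the MethodType assignments above graft standalone functions onto specific Mask2Former instances so they behave like bound methods. A tiny self-contained sketch of the pattern with a made-up class:

from types import MethodType

class Toy:
    def __init__(self):
        self.value = 1

def double(self):
    # receives the patched instance as `self`, like a regular method
    return self.value * 2

toy = Toy()
toy.double = MethodType(double, toy)  # patches this instance only, not the class
print(toy.double())  # 2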
+ out_dim = config.m2f_config.hidden_dim
228
+ in_dim = config.llm_config.hidden_size
229
+
230
+ self.m2f_to_llm = nn.Sequential(
231
+ nn.LayerNorm(out_dim),
232
+ nn.Linear(out_dim, in_dim),
233
+ nn.GELU(),
234
+ nn.Linear(in_dim, in_dim)
235
+ )
236
+
237
+ self.llm_to_m2f = nn.Sequential(
238
+ nn.LayerNorm(in_dim),
239
+ nn.Linear(in_dim, out_dim * 2),
240
+ nn.GELU(),
241
+ nn.Linear(out_dim * 2, out_dim * 2)
242
+ )
243
+
244
+ self.llm_to_cls = nn.Sequential(
245
+ nn.LayerNorm(in_dim),
246
+ nn.Linear(in_dim, out_dim),
247
+ nn.GELU(),
248
+ nn.Linear(out_dim, out_dim)
249
+ )
250
+
251
+ self.init_prediction_config = False
252
+
253
+ def wrap_backbone_lora(self, r=128, lora_alpha=256, lora_dropout=0.05):
254
+ lora_config = LoraConfig(
255
+ r=r,
256
+ target_modules=['attn.qkv', 'attn.proj', 'mlp.fc1', 'mlp.fc2'],
257
+ lora_alpha=lora_alpha,
258
+ lora_dropout=lora_dropout,
259
+ )
260
+ self.vision_model = get_peft_model(self.vision_model, lora_config)
261
+ self.vision_model.print_trainable_parameters()
262
+
263
+ def wrap_llm_lora(self, r=128, lora_alpha=256, lora_dropout=0.05):
264
+ # Determine the target modules based on the architecture of the language model
265
+ if self.llm_arch_name == 'InternLM2ForCausalLM':
266
+ target_modules = ['attention.wqkv', 'attention.wo', 'feed_forward.w1', 'feed_forward.w2', 'feed_forward.w3']
267
+ elif self.llm_arch_name == 'Phi3ForCausalLM':
268
+ target_modules = ['mlp.down_proj', 'mlp.gate_up_proj', 'self_attn.o_proj', 'self_attn.qkv_proj']
269
+ elif self.llm_arch_name in ['Qwen2ForCausalLM', 'LlamaForCausalLM']:
270
+ target_modules = ['self_attn.q_proj', 'self_attn.k_proj', 'self_attn.v_proj', 'self_attn.o_proj',
271
+ 'mlp.gate_proj', 'mlp.down_proj', 'mlp.up_proj']
272
+ else:
273
+ raise NotImplementedError(f'{self.llm_arch_name} is not implemented.')
274
+ lora_config = LoraConfig(
275
+ r=r,
276
+ target_modules=target_modules,
277
+ lora_alpha=lora_alpha,
278
+ lora_dropout=lora_dropout,
279
+ task_type='CAUSAL_LM'
280
+ )
281
+ self.language_model = get_peft_model(self.language_model, lora_config)
282
+ self.language_model.enable_input_require_grads()
283
+ self.language_model.print_trainable_parameters()
284
+
285
+ def pixel_shuffle(self, x, scale_factor=0.5):
286
+ n, w, h, c = x.size()
287
+ # N, W, H, C --> N, W, H * scale, C // scale
288
+ x = x.view(n, w, int(h * scale_factor), int(c / scale_factor))
289
+ # N, W, H * scale, C // scale --> N, H * scale, W, C // scale
290
+ x = x.permute(0, 2, 1, 3).contiguous()
291
+ # N, H * scale, W, C // scale --> N, H * scale, W * scale, C // (scale ** 2)
292
+ x = x.view(n, int(h * scale_factor), int(w * scale_factor),
293
+ int(c / (scale_factor * scale_factor)))
294
+ if self.ps_version == 'v1':
295
+ warnings.warn("In ps_version 'v1', the height and width have not been swapped back, "
296
+ 'which results in a transposed image.')
297
+ else:
298
+ x = x.permute(0, 2, 1, 3).contiguous()
299
+ return x
300
+
301
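Aside (illustration, not part of the uploaded file): a standalone copy of the ps_version 'v2' branch above, handy for checking shapes; a 32x32 token grid (448 / 14) with 1024 channels becomes a 16x16 grid with 4096 channels:

import torch

def pixel_shuffle_demo(x, scale_factor=0.5):
    n, w, h, c = x.size()
    x = x.view(n, w, int(h * scale_factor), int(c / scale_factor))
    x = x.permute(0, 2, 1, 3).contiguous()
    x = x.view(n, int(h * scale_factor), int(w * scale_factor), int(c / (scale_factor ** 2)))
    return x.permute(0, 2, 1, 3).contiguous()  # v2: swap height and width back

feat = torch.randn(1, 32, 32, 1024)
print(pixel_shuffle_demo(feat).shape)  # torch.Size([1, 16, 16, 4096])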
+ def extract_feature(self, pixel_values):
302
+ if self.select_layer == -1:
303
+ vit_embeds = self.vision_model(
304
+ pixel_values=pixel_values,
305
+ output_hidden_states=False,
306
+ return_dict=True).last_hidden_state
307
+ else:
308
+ vit_embeds = self.vision_model(
309
+ pixel_values=pixel_values,
310
+ output_hidden_states=True,
311
+ return_dict=True).hidden_states[self.select_layer]
312
+ vit_embeds = vit_embeds[:, 1:, :]
313
+
314
+ h = w = int(vit_embeds.shape[1] ** 0.5)
315
+ vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
316
+ vit_embeds = self.pixel_shuffle(vit_embeds, scale_factor=self.downsample_ratio)
317
+ vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1])
318
+ vit_embeds = self.mlp1(vit_embeds)
319
+ return vit_embeds
320
+
321
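Aside (illustration, not part of the uploaded file): the number of tokens produced per tile by extract_feature matches the constructor's num_image_token formula; with the defaults set later in preparing_for_generation (image_size=448, patch_size=14, downsample_ratio=0.5) each 448x448 tile yields 256 <IMG_CONTEXT> tokens:

image_size, patch_size, downsample_ratio = 448, 14, 0.5
num_image_token = int((image_size // patch_size) ** 2 * downsample_ratio ** 2)
print(num_image_token)  # 256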
+ @property
322
+ def lm_head(self):
323
+ return self.language_model.get_output_embeddings()
324
+
325
+ def get_input_embeddings(self):
326
+ return self.language_model.get_input_embeddings()
327
+
328
+ def get_output_embeddings(self):
329
+ return self.language_model.get_output_embeddings()
330
+
331
+ def forward(self, data, data_samples=None, mode='loss'):
332
+ pixel_values = data['pixel_values']
333
+
334
+ if type(pixel_values) is list or pixel_values.ndim == 5:
335
+ if type(pixel_values) is list:
336
+ pixel_values = [
337
+ x.unsqueeze(0) if x.ndim == 3 else x for x in pixel_values
338
+ ]
339
+ # b*n, c, h, w
340
+ concat_images = torch.cat(
341
+ [image.to(self.vision_model.dtype) for image in pixel_values], dim=0)
342
+ else:
343
+ raise NotImplementedError()
344
+
345
+ input_ids = data['input_ids']
346
+ position_ids = data['position_ids']
347
+ attention_mask = data['attention_mask']
348
+ # frames whose pixel values sum to 0 are text-only placeholders
349
+ image_flags = torch.sum(concat_images, dim=(1, 2, 3)) != 0
350
+ image_flags = image_flags.long()
351
+
352
+ labels = data['labels']
353
+ use_cache = False
354
+
355
+ if 'vp_overall_mask' not in data.keys():
356
+ vp_overall_mask = None
357
+ else:
358
+ vp_overall_mask = data['vp_overall_mask']
359
+
360
+ if 'prompt_masks' in data.keys():
361
+ prompt_masks = data['prompt_masks']
362
+ else:
363
+ prompt_masks = None
364
+
365
+ outputs = self._llm_forward(
366
+ input_ids=input_ids,
367
+ position_ids=position_ids,
368
+ attention_mask=attention_mask,
369
+ image_flags=image_flags,
370
+ pixel_values=concat_images,
371
+ labels=labels,
372
+ use_cache=use_cache,
373
+ output_hidden_states=True,
374
+ vp_overall_mask=vp_overall_mask,
375
+ prompt_masks=prompt_masks,
376
+ )
377
+
378
+ return outputs
379
+
380
+ def _llm_forward(
381
+ self,
382
+ pixel_values: torch.FloatTensor,
383
+ input_ids: torch.LongTensor = None,
384
+ attention_mask: Optional[torch.Tensor] = None,
385
+ position_ids: Optional[torch.LongTensor] = None,
386
+ image_flags: Optional[torch.LongTensor] = None,
387
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
388
+ labels: Optional[torch.LongTensor] = None,
389
+ use_cache: Optional[bool] = None,
390
+ output_attentions: Optional[bool] = None,
391
+ output_hidden_states: Optional[bool] = None,
392
+ return_dict: Optional[bool] = None,
393
+ vp_overall_mask=None,
394
+ prompt_masks=None,
395
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
396
+ return_dict = return_dict if return_dict is not None \
397
+ else self.config.use_return_dict
398
+
399
+ image_flags = image_flags.squeeze(-1)
400
+ # Clone so that the in-place token replacement below does not raise an autograd error.
401
+ input_embeds = self.language_model.get_input_embeddings()(
402
+ input_ids).clone()
403
+
404
+ vit_embeds = self.extract_feature(pixel_values)
405
+ vit_embeds = vit_embeds.to(input_embeds.dtype) # FIXME: why is vit_embeds float16?
406
+ fast_vit_embeds = None
407
+
408
+ vit_embeds = vit_embeds[image_flags == 1]
409
+ vit_batch_size = pixel_values.shape[0]
410
+
411
+ B, N, C = input_embeds.shape
412
+ input_embeds = input_embeds.reshape(B * N, C)
413
+
414
+ self._count += 1
415
+
416
+ if vp_overall_mask is not None and prompt_masks is not None:
417
+ vp_embeds = []
418
+ vp_overall_mask = vp_overall_mask.to(vit_embeds.device).bool()
419
+ prompt_masks = [item.to(vit_embeds.device).bool() for item in prompt_masks]
420
+
421
+ vp_overall_mask = vp_overall_mask[image_flags == 1]
422
+ overall_tile_vit_embeds = vit_embeds[vp_overall_mask] # (n_img, hw, c)
423
+
424
+ i_vp_img = 0
425
+ for i_img in range(len(vit_embeds)):
426
+ vp_embeds.append(vit_embeds[i_img].reshape(-1, C))
427
+ if vp_overall_mask[i_img]:
428
+ tile_vit_embeds = overall_tile_vit_embeds[i_vp_img].reshape(-1, C) # (hw, C)
429
+ objects_prompt_masks = prompt_masks[i_vp_img]
430
+ n_obj = len(objects_prompt_masks)
431
+ tile_vit_embeds = tile_vit_embeds.unsqueeze(0).repeat(n_obj, 1, 1)
432
+ objects_prompt_masks = objects_prompt_masks.reshape(n_obj, -1)
433
+ vp_embeds.append(tile_vit_embeds[objects_prompt_masks])
434
+ i_vp_img += 1
435
+ vp_embeds = torch.cat(vp_embeds, dim=0)
436
+ else:
437
+ vp_embeds = None
438
+
439
+ input_ids = input_ids.reshape(B * N)
440
+ selected = (input_ids == self.img_context_token_id)
441
+
442
+ if vp_embeds is None:
443
+ try:
444
+ input_embeds[selected] = vit_embeds.reshape(-1, C)
445
+ except Exception as e:
446
+ vit_embeds = vit_embeds.reshape(-1, C)
447
+ print(f'warning: {e}, input_embeds[selected].shape='
448
+ f'{input_embeds[selected].shape}, '
449
+ f'vit_embeds.shape={vit_embeds.shape}')
450
+ n_token = selected.sum()
451
+ if n_token > len(vit_embeds):
452
+ print(f"Wrong !!! {n_token} image tokens in text but only {len(vit_embeds)} vit embeds !!!")
453
+ expand_ratio = n_token // len(vit_embeds) + 1
454
+ vit_embeds = torch.cat([vit_embeds] * expand_ratio, dim=0)
455
+
456
+ input_embeds[selected] = vit_embeds[:n_token]
457
+ else:
458
+ try:
459
+ input_embeds[selected] = vp_embeds.reshape(-1, C)
460
+ except Exception as e:
461
+ vp_embeds = vp_embeds.reshape(-1, C)
462
+ print(f'warning: {e}, input_embeds[selected].shape='
463
+ f'{input_embeds[selected].shape}, '
464
+ f'vp_embeds.shape={vp_embeds.shape}')
465
+ n_token = selected.sum()
466
+ if n_token > len(vp_embeds):
467
+ print(f"Wrong !!! {n_token} image tokens in text but only {len(vp_embeds)} vit embeds !!!")
468
+ expand_ratio = n_token // len(vp_embeds) + 1
469
+ vp_embeds = torch.cat([vp_embeds] * expand_ratio, dim=0)
470
+
471
+ input_embeds[selected] = vp_embeds[:n_token]
472
+
473
+ input_embeds = input_embeds.reshape(B, N, C)
474
+
475
+ outputs = self.language_model(
476
+ inputs_embeds=input_embeds,
477
+ attention_mask=attention_mask,
478
+ position_ids=position_ids,
479
+ past_key_values=past_key_values,
480
+ use_cache=use_cache,
481
+ output_attentions=output_attentions,
482
+ output_hidden_states=output_hidden_states,
483
+ return_dict=return_dict,
484
+ )
485
+ logits = outputs.logits
486
+
487
+ loss = None
488
+ if labels is not None:
489
+ # Shift so that tokens < n predict n
490
+ shift_logits = logits[..., :-1, :].contiguous()
491
+ shift_labels = labels[..., 1:].contiguous()
492
+ # Flatten the tokens
493
+ loss_fct = CrossEntropyLoss()
494
+ shift_logits = shift_logits.view(
495
+ -1, self.language_model.config.vocab_size)
496
+ shift_labels = shift_labels.view(-1)
497
+ # Enable model parallelism
498
+ shift_labels = shift_labels.to(shift_logits.device)
499
+ loss = loss_fct(shift_logits, shift_labels)
500
+
501
+ if not return_dict:
502
+ output = (logits,) + outputs[1:]
503
+ return (loss,) + output if loss is not None else output
504
+
505
+ return CausalLMOutputWithPast(
506
+ loss=loss,
507
+ logits=logits,
508
+ past_key_values=outputs.past_key_values,
509
+ hidden_states=outputs.hidden_states,
510
+ attentions=outputs.attentions,
511
+ )
512
+
513
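Aside (illustration, not part of the uploaded file): the loss block in _llm_forward is the standard shift-by-one language-modelling objective; a toy check with random logits, where -100 marks positions ignored by CrossEntropyLoss:

import torch
from torch.nn import CrossEntropyLoss

vocab_size = 32
logits = torch.randn(1, 5, vocab_size)
labels = torch.tensor([[-100, 7, 3, -100, 9]])
shift_logits = logits[..., :-1, :].contiguous().view(-1, vocab_size)
shift_labels = labels[..., 1:].contiguous().view(-1)
print(CrossEntropyLoss()(shift_logits, shift_labels))  # -100 labels are skipped by default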
+ @torch.no_grad()
514
+ def generate(
515
+ self,
516
+ pixel_values: Optional[torch.FloatTensor] = None,
517
+ input_ids: Optional[torch.LongTensor] = None,
518
+ attention_mask: Optional[torch.LongTensor] = None,
519
+ visual_features: Optional[torch.FloatTensor] = None,
520
+ generation_config: Optional[GenerationConfig] = None,
521
+ output_hidden_states: Optional[bool] = None,
522
+ return_dict: Optional[bool] = None,
523
+ prompt_masks=None,
524
+ vp_overall_mask=None,
525
+ query_embeds=None,
526
+ **generate_kwargs,
527
+ ) -> torch.LongTensor:
528
+ device = self.device
529
+ assert self.img_context_token_id is not None
530
+
531
+ if pixel_values is not None:
532
+ if visual_features is not None:
533
+ vit_embeds = visual_features
534
+ else:
535
+ if type(pixel_values) is list or pixel_values.ndim == 5:
536
+ if type(pixel_values) is list:
537
+ pixel_values = [
538
+ x.unsqueeze(0) if x.ndim == 3 else x for x in pixel_values
539
+ ]
540
+ # b*n, c, h, w
541
+ pixel_values = torch.cat(
542
+ [image.to(self.vision_model.dtype) for image in pixel_values], dim=0)
543
+
544
+ vit_embeds = self.extract_feature(pixel_values.to(device))
545
+ image_flags = torch.sum(pixel_values, dim=(1, 2, 3)) != 0
546
+ image_flags = image_flags.long()
547
+ vit_embeds = vit_embeds[image_flags == 1]
548
+
549
+ input_embeds = self.language_model.get_input_embeddings()(input_ids.to(device))
550
+ B, N, C = input_embeds.shape
551
+ input_embeds = input_embeds.reshape(B * N, C)
552
+
553
+ input_ids = input_ids.reshape(B * N)
554
+ selected = (input_ids == self.img_context_token_id)
555
+ assert selected.sum() != 0
556
+ input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device)
557
+
558
+ # object queries
559
+ query_embeds = query_embeds.to(input_embeds.dtype)
560
+ selected = (input_ids == self.obj_context_token_id)
561
+ input_embeds[selected] = query_embeds.reshape(-1, C)
562
+
563
+ input_embeds = input_embeds.reshape(B, N, C)
564
+ else:
565
+ input_embeds = self.language_model.get_input_embeddings()(input_ids)
566
+
567
+ outputs = self.language_model.generate(
568
+ inputs_embeds=input_embeds,
569
+ attention_mask=attention_mask.to(device),
570
+ generation_config=generation_config,
571
+ output_hidden_states=output_hidden_states,
572
+ # return_dict=return_dict,
573
+ use_cache=True,
574
+ **generate_kwargs,
575
+ )
576
+
577
+ return outputs
578
+
579
+ def preparing_for_generation(self, tokenizer, max_new_tokens=2048, torch_dtype=torch.bfloat16):
580
+ # set stop criteria and generation configs for model
581
+ if not hasattr(self, 'tokenizer'):
582
+ self.tokenizer = tokenizer
583
+ self.bot_name = 'BOT'
584
+ stop_words = []
585
+ stop_words += self.template.get('STOP_WORDS', [])
586
+ stop_criteria = get_stop_criteria(
587
+ tokenizer=self.tokenizer, stop_words=stop_words)
588
+ self.stop_criteria = stop_criteria
589
+
590
+ default_generation_kwargs = dict(
591
+ max_new_tokens=max_new_tokens,
592
+ do_sample=False,
593
+ eos_token_id=self.tokenizer.eos_token_id,
594
+ pad_token_id=(
595
+ self.tokenizer.pad_token_id
596
+ if self.tokenizer.pad_token_id is not None
597
+ else self.tokenizer.eos_token_id
598
+ ),
599
+ )
600
+
601
+ self.gen_config = GenerationConfig(**default_generation_kwargs)
602
+ self.init_prediction_config = True
603
+ self.torch_dtype = torch_dtype
604
+ self.to(torch_dtype)
605
+ self.extra_image_processor = DirectResize(target_length=1024)
606
+ # for multi image process
607
+ self.min_dynamic_patch = 1
608
+ self.max_dynamic_patch = 12
609
+ self.downsample_ratio = 0.5
610
+ self.image_size = 448
611
+ self.use_thumbnail = True
612
+ patch_size = 14
613
+ self.patch_size = patch_size
614
+
615
+ self.patch_token = int((self.image_size // patch_size) ** 2 * (self.downsample_ratio ** 2))
616
+ self.IMAGENET_MEAN = (0.485, 0.456, 0.406)
617
+ self.IMAGENET_STD = (0.229, 0.224, 0.225)
618
+ self.IMG_CONTEXT_TOKEN = '<IMG_CONTEXT>'
619
+ self.IMG_START_TOKEN = '<img>'
620
+ self.IMG_END_TOKEN = '</img>'
621
+
622
+ self.transformer = T.Compose([
623
+ T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
624
+ T.Resize((self.image_size, self.image_size), interpolation=InterpolationMode.BICUBIC),
625
+ T.ToTensor(),
626
+ T.Normalize(mean=self.IMAGENET_MEAN, std=self.IMAGENET_STD)
627
+ ])
628
+
629
+ # override the Phi3 prepare_inputs_for_generation function
630
+ if self.config.llm_config.architectures[0] == 'Phi3ForCausalLM':
631
+ self.language_model.prepare_inputs_for_generation = MethodType(prepare_inputs_for_generation_phi3, self.language_model)
632
+
633
+ img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
634
+ self.img_context_token_id = img_context_token_id
635
+ obj_context_token_id = tokenizer.convert_tokens_to_ids(OBJ_CONTEXT_TOKEN)
636
+ self.obj_context_token_id = obj_context_token_id
637
+
638
+ self.PROPOSAL_TOKENS = [SEG_TOKEN.format(id=str(i).zfill(3)) for i in range(self.num_m2f_proposals)]
639
+ self.the_first_seg_token_idx = self.tokenizer(self.PROPOSAL_TOKENS[0], add_special_tokens=False).input_ids[0]
640
+ self.the_last_seg_token_idx = self.tokenizer(self.PROPOSAL_TOKENS[-1], add_special_tokens=False).input_ids[0]
641
+ self.cls_token_idx = self.tokenizer(CLS_TOKEN, add_special_tokens=False).input_ids[0]
642
+ self.bg_cls_token_idx = self.tokenizer(BG_CLS_TOKEN, add_special_tokens=False).input_ids[0]
643
+
644
+ return
645
+
646
+ def predict_forward(
647
+ self,
648
+ image=None,
649
+ video=None,
650
+ text=None,
651
+ past_text='',
652
+ mask_prompts=None,
653
+ tokenizer=None,
654
+ m2f_processor=None,
655
+ ):
656
+ if not self.init_prediction_config:
657
+ assert tokenizer
658
+ self.preparing_for_generation(tokenizer=tokenizer)
659
+
660
+ if image is None and video is None and '<image>' not in past_text:
661
+ text = text.replace('<image>', "")
662
+ input_text = ''
663
+ input_text += self.template['INSTRUCTION'].format(
664
+ input=text, round=1, bot_name=self.bot_name)
665
+ input_text = past_text + input_text
666
+ ids = self.tokenizer.encode(input_text)
667
+ ids = torch.tensor(ids).cuda().unsqueeze(0)
668
+
669
+ attention_mask = torch.ones_like(ids, dtype=torch.bool)
670
+
671
+ mm_inputs = {
672
+ 'pixel_values': None,
673
+ 'input_ids': ids,
674
+ 'attention_mask': attention_mask,
675
+ 'position_ids': None,
676
+ 'past_key_values': None,
677
+ 'labels': None,
678
+ 'prompt_masks': None,
679
+ 'vp_overall_mask': None,
680
+ 'm2f_inputs': None,
681
+ }
682
+ else:
683
+ input_dict = {}
684
+ if video is not None:
685
+ pixel_values = []
686
+ ori_image_size = video[0].size
687
+ for frame_idx, frame_image in enumerate(video):
688
+ assert ori_image_size == frame_image.size
689
+ img = self.transformer(frame_image)
690
+ pixel_values.append(img)
691
+
692
+ pixel_values = torch.stack(pixel_values, dim=0).to(self.torch_dtype) # (n_f, 3, h, w)
693
+ num_image_tokens = self.patch_token
694
+ num_frames = len(pixel_values)
695
+
696
+ # prepare mask2former inputs
697
+ m2f_pixel_values, m2f_pixel_masks = [], []
698
+ for frame_idx, frame_image in enumerate(video):
699
+ assert ori_image_size == frame_image.size
700
+ w, h = frame_image.size
701
+ if w > h:
702
+ target_size = (self.m2f_input_size, int(h/w*self.m2f_input_size))
703
+ else:
704
+ target_size = (int(w/h*self.m2f_input_size), self.m2f_input_size)
705
+
706
+ resized_frame_image = frame_image.resize(target_size)
707
+ cur_w, cur_h = resized_frame_image.size
708
+ padded_frame_image = np.ones(shape=(self.m2f_input_size, self.m2f_input_size, 3), dtype=np.uint8) * 255
709
+ padded_frame_image[:cur_h, :cur_w, :] = np.array(resized_frame_image)
710
+ m2f_inputs_i = m2f_processor(images=Image.fromarray(padded_frame_image), return_tensors="pt", do_resize=False)
711
+ m2f_pixel_values.append(m2f_inputs_i['pixel_values'])
712
+ m2f_pixel_masks.append(m2f_inputs_i['pixel_mask'])
713
+ m2f_inputs = {
714
+ 'pixel_values': torch.cat(m2f_pixel_values, dim=0),
715
+ 'pixel_mask': torch.cat(m2f_pixel_masks, dim=0)}
716
+ else:
717
+ ori_image_size = image.size
718
+
719
+ images = dynamic_preprocess(image, self.min_dynamic_patch,
720
+ self.max_dynamic_patch,
721
+ self.image_size, self.use_thumbnail)
722
+
723
+ pixel_values = [self.transformer(patch) for patch in images]
724
+ pixel_values = torch.stack(pixel_values).to(self.torch_dtype)
725
+ num_image_tokens = pixel_values.shape[0] * self.patch_token
726
+ num_frames = 1
727
+
728
+ w, h = image.size
729
+ if w > h:
730
+ target_size = (self.m2f_input_size, int(h/w*self.m2f_input_size))
731
+ else:
732
+ target_size = (int(w/h*self.m2f_input_size), self.m2f_input_size)
733
+
734
+ resized_image = image.resize(target_size)
735
+ cur_w, cur_h = resized_image.size
736
+ padded_image = np.ones(shape=(self.m2f_input_size, self.m2f_input_size, 3), dtype=np.uint8) * 255
737
+ padded_image[:cur_h, :cur_w, :] = np.array(resized_image)
738
+ m2f_inputs = m2f_processor(images=Image.fromarray(padded_image), return_tensors="pt", do_resize=False)
739
+
740
+ input_dict['pixel_values'] = pixel_values
741
+
742
+ # TODO: add a frame tag to indicate the frame order
743
+ image_token_str = f'{self.IMG_START_TOKEN}' \
744
+ f'{self.IMG_CONTEXT_TOKEN * num_image_tokens}' \
745
+ f'{self.IMG_END_TOKEN}'
746
+ object_token_str = f"{OBJ_START_TOKEN}"\
747
+ f"{OBJ_CONTEXT_TOKEN * self.num_m2f_queries}"\
748
+ f"{OBJ_END_TOKEN}"
749
+ image_token_str = image_token_str + '\n' + object_token_str + '\n'
750
+ image_token_str = image_token_str * num_frames
751
+ image_token_str = image_token_str.strip()
752
+
753
+ if '<image>' in text or mask_prompts is not None:
754
+ assert past_text is None or len(past_text) == 0
755
+ text = text.replace('<image>', image_token_str)
756
+ input_text = ''
757
+ input_text += self.template['INSTRUCTION'].format(
758
+ input=text, round=1, bot_name=self.bot_name)
759
+ input_text = past_text + input_text
760
+ ids = self.tokenizer.encode(input_text)
761
+ ids = torch.tensor(ids).cuda().unsqueeze(0)
762
+
763
+ attention_mask = torch.ones_like(ids, dtype=torch.bool)
764
+
765
+ # encode multi-scale visual features into 100~300 queries
766
+ m2f_inputs['pixel_values'] = m2f_inputs['pixel_values'].to(self.mask2former.dtype).to(self.mask2former.device)
767
+ m2f_inputs['pixel_mask'] = m2f_inputs['pixel_mask'].to(self.mask2former.dtype).to(self.mask2former.device)
768
+ query_features, pixel_level_module_output = \
769
+ self.mask2former.forward_first_part(**m2f_inputs)
770
+ query_embeds = self.m2f_to_llm(query_features) # BS, m2f_NQ, 2048
771
+
772
+
773
+ mm_inputs = {
774
+ 'pixel_values': input_dict['pixel_values'],
775
+ 'input_ids': ids,
776
+ 'attention_mask': attention_mask,
777
+ 'position_ids': None,
778
+ 'past_key_values': None,
779
+ 'labels': None,
780
+ 'query_embeds': query_embeds,
781
+ # 'prompt_masks': mask_prompts,
782
+ # 'vp_overall_mask': input_dict['vp_overall_mask'],
783
+ }
784
+
785
+ generate_output = self.generate(
786
+ **mm_inputs,
787
+ generation_config=self.gen_config,
788
+ streamer=None,
789
+ bos_token_id=self.tokenizer.bos_token_id,
790
+ stopping_criteria=self.stop_criteria,
791
+ output_hidden_states=True,
792
+ return_dict_in_generate=True
793
+ )
794
+ predict = self.tokenizer.decode(
795
+ generate_output.sequences[0], skip_special_tokens=False).strip()
796
+
797
+ ret_masks = []
798
+ if image is None and video is None and '<image>' not in past_text:
799
+ return {'prediction': predict, 'prediction_masks': ret_masks, 'm2f_outputs': None}
800
+
801
+ # if there is a seg result, find the seg hidden states
802
+ hidden_states = generate_output.hidden_states
803
+ last_hidden_states = [item[-1][0] for item in hidden_states]
804
+ last_hidden_states = torch.cat(last_hidden_states, dim=0)
805
+
806
+ # get cls tokens
807
+ bg_cls_token_id = torch.as_tensor([self.bg_cls_token_idx,], dtype=ids.dtype, device=ids.device)
808
+ bg_cls_embedding = self.language_model.get_input_embeddings()(bg_cls_token_id).clone()
809
+ output_ids = generate_output.sequences[0][:-1]
810
+ cls_token_mask = ids == self.cls_token_idx
811
+
812
+ # get seg tokens
813
+ seg_token_mask = (output_ids >= self.the_first_seg_token_idx) & (output_ids <= self.the_last_seg_token_idx)
814
+
815
+ do_pano_seg = torch.any(cls_token_mask) & torch.any(seg_token_mask)
816
+
817
+ reason_cls_token_mask = output_ids == self.cls_token_idx
818
+
819
+ do_reason_seg = torch.any(reason_cls_token_mask) & torch.any(seg_token_mask)
820
+
821
+ if not do_pano_seg and not do_reason_seg:
822
+ return {'prediction': predict, 'prediction_masks': ret_masks, 'm2f_outputs': None}
823
+
824
+ # get seg tokens
825
+ seg_hidden_states = last_hidden_states[-len(seg_token_mask):][seg_token_mask].unsqueeze(0)
826
+ seg_hidden_states = self.llm_to_m2f(seg_hidden_states)
827
+
828
+ if do_pano_seg:
829
+ cls_hidden_states = last_hidden_states[:len(cls_token_mask)][cls_token_mask]
830
+ text_classifier = self.llm_to_cls(torch.cat([cls_hidden_states, bg_cls_embedding], dim=0))
831
+ seg_hidden_states = seg_hidden_states.transpose(0, 1)
832
+
833
+ # proposals go through mask2former decoder layers
834
+ m2f_outputs = self.mask2former.forward_second_part(
835
+ query_features=seg_hidden_states[:, :, :self.mask2former.config.hidden_dim], # q, b, c
836
+ query_embeddings=seg_hidden_states[:, :, self.mask2former.config.hidden_dim:], # q, b, c
837
+ pixel_level_module_output=pixel_level_module_output,
838
+ text_classifier=[text_classifier, ],
839
+ mask_labels=None,
840
+ class_labels=None,
841
+ **m2f_inputs
842
+ )
843
+
844
+ tags = re.findall(r'<p>(.*?)</p>', predict)
845
+ label_id_to_text = {id: tag for id, tag in enumerate(tags)}
846
+
847
+ class_queries_logits = m2f_outputs.class_queries_logits
848
+ masks_queries_logits = m2f_outputs.masks_queries_logits
849
+
850
+ m2f_masks = {'label_id_to_text': label_id_to_text,
851
+ 'class_queries_logits': class_queries_logits,
852
+ 'masks_queries_logits': masks_queries_logits}
853
+
854
+ return {'prediction': predict, 'prediction_masks': ret_masks, 'm2f_outputs': m2f_masks}
855
+ elif do_reason_seg:
856
+ raise NotImplementedError
857
+ else:
858
+ raise NotImplementedError
859
+
860
+ def post_process_panoptic_segmentation(
861
+ self,
862
+ class_queries_logits,
863
+ masks_queries_logits,
864
+ threshold: float = 0.5,
865
+ mask_threshold: float = 0.5,
866
+ overlap_mask_area_threshold: float = 0.8,
867
+ label_ids_to_fuse: Optional[Set[int]] = None,
868
+ target_sizes: Optional[List[Tuple[int, int]]] = None,
869
+ ) -> List[Dict]:
870
+
871
+ if label_ids_to_fuse is None:
872
+ logger.warning("`label_ids_to_fuse` unset. No instance will be fused.")
873
+ label_ids_to_fuse = set()
874
+
875
+ batch_size = len(class_queries_logits)
876
+
877
+ # Loop over items in the batch
878
+ results: List[Dict[str, TensorType]] = []
879
+
880
+ for i in range(batch_size):
881
+ height, width = target_sizes[i]
882
+ long_edge = height if height > width else width
883
+ masks_queries_logits_i = torch.nn.functional.interpolate(
884
+ masks_queries_logits[i:i+1], size=(long_edge, long_edge), mode="bilinear", align_corners=False
885
+ )
886
+
887
+ mask_probs = masks_queries_logits_i[0].sigmoid()
888
+
889
+ num_labels = class_queries_logits[i].shape[-1] - 1
890
+
891
+ pred_scores, pred_labels = nn.functional.softmax(class_queries_logits[i], dim=-1).max(-1)
892
+
893
+ mask_probs_item, pred_scores_item, pred_labels_item = remove_low_and_no_objects(
894
+ mask_probs, pred_scores, pred_labels, threshold, num_labels
895
+ )
896
+
897
+ # No mask found
898
+ if mask_probs_item.shape[0] <= 0:
899
+ segmentation = torch.zeros((height, width)) - 1
900
+ results.append({"segmentation": segmentation, "segments_info": []})
901
+ continue
902
+
903
+ # Get segmentation map and segment information of batch item
904
+ target_size = target_sizes[i] if target_sizes is not None else None
905
+ segmentation, segments = compute_segments(
906
+ mask_probs=mask_probs_item,
907
+ pred_scores=pred_scores_item,
908
+ pred_labels=pred_labels_item,
909
+ mask_threshold=mask_threshold,
910
+ overlap_mask_area_threshold=overlap_mask_area_threshold,
911
+ label_ids_to_fuse=label_ids_to_fuse,
912
+ target_size=target_size,
913
+ )
914
+
915
+ results.append({"segmentation": segmentation, "segments_info": segments})
916
+
917
+ return results
918
+
919
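Aside (illustration, not part of the uploaded file): a hedged usage sketch of the post-processing above; `model`, `tokenizer`, `m2f_processor` and `pil_image` are assumed to be prepared by the caller, and the prompt wording is only illustrative:

pred = model.predict_forward(image=pil_image,
                             text='<image>\nPlease segment everything.',
                             tokenizer=tokenizer,
                             m2f_processor=m2f_processor)
m2f = pred['m2f_outputs']
if m2f is not None:
    panoptic = model.post_process_panoptic_segmentation(
        m2f['class_queries_logits'],
        m2f['masks_queries_logits'],
        target_sizes=[pil_image.size[::-1]],  # (height, width)
    )
    seg_map = panoptic[0]['segmentation']    # integer segment-id map
    segments = panoptic[0]['segments_info']  # label_id indexes into m2f['label_id_to_text']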
+ def get_seg_hidden_states(hidden_states, output_ids, seg_id):
920
+ seg_mask = output_ids == seg_id
921
+ n_out = len(seg_mask)
922
+ if n_out == 0:
923
+ return hidden_states[0:0]
924
+ return hidden_states[-n_out:][seg_mask]
925
+
926
+ def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height,
927
+ image_size):
928
+ best_ratio_diff = float('inf')
929
+ best_ratio = (1, 1)
930
+ area = width * height
931
+ for ratio in target_ratios:
932
+ target_aspect_ratio = ratio[0] / ratio[1]
933
+ ratio_diff = abs(aspect_ratio - target_aspect_ratio)
934
+ if ratio_diff < best_ratio_diff:
935
+ best_ratio_diff = ratio_diff
936
+ best_ratio = ratio
937
+ elif ratio_diff == best_ratio_diff:
938
+ if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
939
+ best_ratio = ratio
940
+ return best_ratio
941
+
942
+ def dynamic_preprocess(image,
943
+ min_num=1,
944
+ max_num=6,
945
+ image_size=448,
946
+ use_thumbnail=False):
947
+ orig_width, orig_height = image.size
948
+ aspect_ratio = orig_width / orig_height
949
+
950
+ # calculate the existing image aspect ratio
951
+ target_ratios = {(i, j)
952
+ for n in range(min_num, max_num + 1)
953
+ for i in range(1, n + 1) for j in range(1, n + 1)
954
+ if i * j <= max_num and i * j >= min_num}
955
+ target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
956
+
957
+ # find the closest aspect ratio to the target
958
+ target_aspect_ratio = find_closest_aspect_ratio(aspect_ratio,
959
+ target_ratios, orig_width,
960
+ orig_height, image_size)
961
+
962
+ # calculate the target width and height
963
+ target_width = image_size * target_aspect_ratio[0]
964
+ target_height = image_size * target_aspect_ratio[1]
965
+ blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
966
+
967
+ # resize the image
968
+ resized_img = image.resize((target_width, target_height))
969
+ processed_images = []
970
+ for i in range(blocks):
971
+ box = ((i % (target_width // image_size)) * image_size,
972
+ (i // (target_width // image_size)) * image_size,
973
+ ((i % (target_width // image_size)) + 1) * image_size,
974
+ ((i // (target_width // image_size)) + 1) * image_size)
975
+ # split the image
976
+ split_img = resized_img.crop(box)
977
+ processed_images.append(split_img)
978
+ assert len(processed_images) == blocks
979
+ if use_thumbnail and len(processed_images) != 1:
980
+ thumbnail_img = image.resize((image_size, image_size))
981
+ processed_images.append(thumbnail_img)
982
+ return processed_images
983
+
984
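Aside (illustration, not part of the uploaded file): how dynamic_preprocess tiles an image, using a dummy PIL image; the tile count depends on the aspect ratio picked by find_closest_aspect_ratio:

from PIL import Image

img = Image.new('RGB', (1280, 720))
tiles = dynamic_preprocess(img, min_num=1, max_num=12, image_size=448, use_thumbnail=True)
print(len(tiles), tiles[0].size)  # N 448x448 crops, plus a 448x448 thumbnail when N > 1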
+
985
+ from transformers.cache_utils import Cache, DynamicCache
986
+
987
+ def prepare_inputs_for_generation_phi3(
988
+ self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
989
+ ):
990
+ if past_key_values is not None:
991
+ if isinstance(past_key_values, Cache):
992
+ cache_length = past_key_values.get_seq_length()
993
+ past_length = past_key_values.seen_tokens
994
+ max_cache_length = past_key_values.get_max_length()
995
+ else:
996
+ cache_length = past_length = past_key_values[0][0].shape[2]
997
+ max_cache_length = None
998
+
999
+ # Keep only the unprocessed tokens:
1000
+ # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
1001
+ # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
1002
+ # input)
1003
+ if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
1004
+ input_ids = input_ids[:, -(attention_mask.shape[1] - past_length):]
1005
+ # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
1006
+ # input_ids based on the past_length.
1007
+ elif past_length < input_ids.shape[1]:
1008
+ input_ids = input_ids[:, past_length:]
1009
+ # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
1010
+
1011
+ # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
1012
+ if (
1013
+ max_cache_length is not None
1014
+ and attention_mask is not None
1015
+ and cache_length + input_ids.shape[1] > max_cache_length
1016
+ ):
1017
+ attention_mask = attention_mask[:, -max_cache_length:]
1018
+
1019
+ position_ids = kwargs.get('position_ids', None)
1020
+ if attention_mask is not None and position_ids is None:
1021
+ # create position_ids on the fly for batch generation
1022
+ position_ids = attention_mask.long().cumsum(-1) - 1
1023
+ position_ids.masked_fill_(attention_mask == 0, 1)
1024
+ if past_key_values:
1025
+ position_ids = position_ids[:, -input_ids.shape[1]:]
1026
+
1027
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
1028
+ if inputs_embeds is not None and (past_key_values is None or len(past_key_values)==0):
1029
+ model_inputs = {'inputs_embeds': inputs_embeds}
1030
+ else:
1031
+ model_inputs = {'input_ids': input_ids}
1032
+
1033
+ model_inputs.update(
1034
+ {
1035
+ 'position_ids': position_ids,
1036
+ 'past_key_values': past_key_values,
1037
+ 'use_cache': kwargs.get('use_cache'),
1038
+ 'attention_mask': attention_mask,
1039
+ }
1040
+ )
1041
+ return model_inputs
1042
+
1043
+
1044
+ # Copied from transformers.models.detr.image_processing_detr.compute_segments
1045
+ def compute_segments(
1046
+ mask_probs,
1047
+ pred_scores,
1048
+ pred_labels,
1049
+ mask_threshold: float = 0.5,
1050
+ overlap_mask_area_threshold: float = 0.8,
1051
+ label_ids_to_fuse: Optional[Set[int]] = None,
1052
+ target_size: Tuple[int, int] = None,
1053
+ ):
1054
+ height = mask_probs.shape[1] if target_size is None else target_size[0]
1055
+ width = mask_probs.shape[2] if target_size is None else target_size[1]
1056
+
1057
+ segmentation = torch.zeros((height, width), dtype=torch.int32, device=mask_probs.device)
1058
+ segments: List[Dict] = []
1059
+
1060
+ if target_size is not None:
1061
+ mask_probs = mask_probs[..., :height, :width]
1062
+
1063
+ current_segment_id = 0
1064
+
1065
+ # Weigh each mask by its prediction score
1066
+ mask_probs *= pred_scores.view(-1, 1, 1)
1067
+ mask_labels = mask_probs.argmax(0) # [height, width]
1068
+
1069
+ # Keep track of instances of each class
1070
+ stuff_memory_list: Dict[str, int] = {}
1071
+ for k in range(pred_labels.shape[0]):
1072
+ pred_class = pred_labels[k].item()
1073
+ should_fuse = pred_class in label_ids_to_fuse
1074
+
1075
+ # Check if mask exists and is large enough to be a segment
1076
+ mask_exists, mask_k = check_segment_validity(
1077
+ mask_labels, mask_probs, k, mask_threshold, overlap_mask_area_threshold
1078
+ )
1079
+
1080
+ if mask_exists:
1081
+ if pred_class in stuff_memory_list:
1082
+ current_segment_id = stuff_memory_list[pred_class]
1083
+ else:
1084
+ current_segment_id += 1
1085
+
1086
+ # Add current object segment to final segmentation map
1087
+ segmentation[mask_k] = current_segment_id
1088
+ segment_score = round(pred_scores[k].item(), 6)
1089
+ segments.append(
1090
+ {
1091
+ "id": current_segment_id,
1092
+ "label_id": pred_class,
1093
+ "was_fused": should_fuse,
1094
+ "score": segment_score,
1095
+ }
1096
+ )
1097
+ if should_fuse:
1098
+ stuff_memory_list[pred_class] = current_segment_id
1099
+
1100
+ return segmentation, segments
sam2.py ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,40 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>",
16
+ "<img>",
17
+ "</img>",
18
+ "<IMG_CONTEXT>",
19
+ "<quad>",
20
+ "</quad>",
21
+ "<ref>",
22
+ "</ref>",
23
+ "<box>",
24
+ "</box>"
25
+ ],
26
+ "eos_token": {
27
+ "content": "<|im_end|>",
28
+ "lstrip": false,
29
+ "normalized": false,
30
+ "rstrip": false,
31
+ "single_word": false
32
+ },
33
+ "pad_token": {
34
+ "content": "<|endoftext|>",
35
+ "lstrip": false,
36
+ "normalized": false,
37
+ "rstrip": false,
38
+ "single_word": false
39
+ }
40
+ }
templates.py ADDED
@@ -0,0 +1,170 @@
1
+
2
+ PROMPT_TEMPLATE = dict(
3
+ default=dict(
4
+ SYSTEM='<|System|>:{system}\n',
5
+ INSTRUCTION='<|User|>:{input}\n<|Bot|>:',
6
+ SEP='\n'),
7
+ zephyr=dict(
8
+ SYSTEM='<|system|>\n{system}\n',
9
+ INSTRUCTION='<|user|>\n{input}\n<|assistant|>\n',
10
+ SEP='\n'),
11
+ internlm_chat=dict(
12
+ SYSTEM='<|System|>:{system}\n',
13
+ INSTRUCTION='<|User|>:{input}<eoh>\n<|Bot|>:',
14
+ SUFFIX='<eoa>',
15
+ SUFFIX_AS_EOS=True,
16
+ SEP='\n',
17
+ STOP_WORDS=['<eoa>']),
18
+ internlm2_chat=dict(
19
+ SYSTEM='<|im_start|>system\n{system}<|im_end|>\n',
20
+ INSTRUCTION=('<|im_start|>user\n{input}<|im_end|>\n'
21
+ '<|im_start|>assistant\n'),
22
+ SUFFIX='<|im_end|>',
23
+ SUFFIX_AS_EOS=True,
24
+ SEP='\n',
25
+ STOP_WORDS=['<|im_end|>']),
26
+ moss_sft=dict(
27
+ SYSTEM='{system}\n',
28
+ INSTRUCTION='<|Human|>: {input}<eoh>\n',
29
+ SEP='\n',
30
+ STOP_WORDS=['<eoc>', '<eom>']),
31
+ llama2_chat=dict(
32
+ SYSTEM=(
33
+ '[INST] <<SYS>>\n You are a helpful, respectful and honest '
34
+ 'assistant. Always answer as helpfully as possible, while being '
35
+ 'safe. Your answers should not include any harmful, unethical, '
36
+ 'racist, sexist, toxic, dangerous, or illegal content. Please '
37
+ 'ensure that your responses are socially unbiased and positive in '
38
+ 'nature.\n{system}\n<</SYS>>\n [/INST] '),
39
+ INSTRUCTION='[INST] {input} [/INST]',
40
+ SEP='\n'),
41
+ code_llama_chat=dict(
42
+ SYSTEM='{system}\n', INSTRUCTION='[INST] {input} [/INST]'),
43
+ chatglm2=dict(
44
+ SYSTEM='{system}\n',
45
+ INSTRUCTION='[Round {round}]\n\n问:{input}\n\n答:',
46
+ SEP='\n\n'),
47
+ chatglm3=dict(
48
+ SYSTEM='<|system|>\n{system}',
49
+ INSTRUCTION='<|user|>\n{input}<|assistant|>\n',
50
+ SEP='\n'),
51
+ qwen_chat=dict(
52
+ SYSTEM=('<|im_start|>system\n{system}<|im_end|>\n'),
53
+ INSTRUCTION=('<|im_start|>user\n{input}<|im_end|>\n'
54
+ '<|im_start|>assistant\n'),
55
+ SUFFIX='<|im_end|>',
56
+ SUFFIX_AS_EOS=True,
57
+ SEP='\n',
58
+ STOP_WORDS=['<|im_end|>', '<|endoftext|>']),
59
+ baichuan_chat=dict(
60
+ SYSTEM='{system}\n',
61
+ INSTRUCTION='<reserved_102>{input}<reserved_103>',
62
+ SEP='\n'),
63
+ baichuan2_chat=dict(
64
+ SYSTEM='{system}\n',
65
+ INSTRUCTION='<reserved_106>{input}<reserved_107>',
66
+ SEP='\n'),
67
+ wizardlm=dict(
68
+ SYSTEM=('A chat between a curious user and an artificial '
69
+ 'intelligence assistant. The assistant gives '
70
+ 'helpful, detailed, and polite answers to the '
71
+ 'user\'s questions. {system}\n '),
72
+ INSTRUCTION=('USER: {input} ASSISTANT:'),
73
+ SEP='\n'),
74
+ wizardcoder=dict(
75
+ SYSTEM=(
76
+ 'Below is an instruction that describes a task. '
77
+ 'Write a response that appropriately completes the request.\n\n'
78
+ '{system}\n '),
79
+ INSTRUCTION=('### Instruction:\n{input}\n\n### Response:'),
80
+ SEP='\n\n'),
81
+ vicuna=dict(
82
+ SYSTEM=('A chat between a curious user and an artificial '
83
+ 'intelligence assistant. The assistant gives '
84
+ 'helpful, detailed, and polite answers to the '
85
+ 'user\'s questions. {system}\n '),
86
+ INSTRUCTION=('USER: {input} ASSISTANT:'),
87
+ SEP='\n'),
88
+ deepseek_coder=dict(
89
+ SYSTEM=('You are an AI programming assistant, utilizing '
90
+ 'the DeepSeek Coder model, developed by DeepSeek'
91
+ 'Company, and you only answer questions related '
92
+ 'to computer science. For politically sensitive '
93
+ 'questions, security and privacy issues, and '
94
+ 'other non-computer science questions, you will '
95
+ 'refuse to answer. {system}\n'),
96
+ INSTRUCTION=('### Instruction:\n{input}\n### Response:\n'),
97
+ SEP='\n'),
98
+ # TODO: deprecation, v0.2.0
99
+ deepseekcoder=dict(
100
+ SYSTEM=('You are an AI programming assistant, utilizing '
101
+ 'the DeepSeek Coder model, developed by DeepSeek'
102
+ 'Company, and you only answer questions related '
103
+ 'to computer science. For politically sensitive '
104
+ 'questions, security and privacy issues, and '
105
+ 'other non-computer science questions, you will '
106
+ 'refuse to answer. {system}\n'),
107
+ INSTRUCTION=('### Instruction:\n{input}\n### Response:\n'),
108
+ SEP='\n'),
109
+ deepseek_moe=dict(
110
+ SYSTEM=('[INST] {system} [/INST]\n'),
111
+ INSTRUCTION=('[INST] {input} [/INST]'),
112
+ SEP='\n'),
113
+ deepseek_v2=dict(
114
+ SYSTEM='{system}\n\n',
115
+ INSTRUCTION='User: {input}\n\nAssistant: ',
116
+ SUFFIX='<|end▁of▁sentence|>',
117
+ SUFFIX_AS_EOS=True,
118
+ STOP_WORDS=['<|end▁of▁sentence|>']),
119
+ mistral=dict(
120
+ SYSTEM=('[INST] {system} [/INST]\n'),
121
+ INSTRUCTION=('[INST] {input} [/INST]'),
122
+ SEP='\n'),
123
+ mixtral=dict(
124
+ SYSTEM=('[INST] {system} [/INST]\n'),
125
+ INSTRUCTION=('[INST] {input} [/INST]'),
126
+ SEP='\n'),
127
+ minicpm=dict(INSTRUCTION=('<用户> {input} <AI>'), SEP='\n'),
128
+ minicpm3=dict(
129
+ SYSTEM=('<|im_start|>system\n{system}<|im_end|>\n'),
130
+ INSTRUCTION=('<|im_start|>user\n{input}<|im_end|>\n'
131
+ '<|im_start|>assistant\n'),
132
+ SUFFIX='<|im_end|>',
133
+ SUFFIX_AS_EOS=True,
134
+ SEP='\n',
135
+ STOP_WORDS=['<|im_end|>', '<|endoftext|>']),
136
+ gemma=dict(
137
+ # `system` field is extended by xtuner
138
+ SYSTEM=('<start_of_turn>system\n{system}<end_of_turn>\n'),
139
+ INSTRUCTION=('<start_of_turn>user\n{input}<end_of_turn>\n'
140
+ '<start_of_turn>model\n'),
141
+ SUFFIX='<end_of_turn>',
142
+ SUFFIX_AS_EOS=False,
143
+ SEP='\n',
144
+ STOP_WORDS=['<end_of_turn>']),
145
+ cohere_chat=dict(
146
+ SYSTEM=('<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{system}'
147
+ '<|END_OF_TURN_TOKEN|>'),
148
+ INSTRUCTION=(
149
+ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{input}<|END_OF_TURN_TOKEN|>'
150
+ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>'),
151
+ SUFFIX='<|END_OF_TURN_TOKEN|>',
152
+ SUFFIX_AS_EOS=True,
153
+ STOP_WORDS=['<|END_OF_TURN_TOKEN|>']),
154
+ llama3_chat=dict(
155
+ SYSTEM=('<|start_header_id|>system<|end_header_id|>\n\n'
156
+ '{system}<|eot_id|>'),
157
+ INSTRUCTION=(
158
+ '<|start_header_id|>user<|end_header_id|>\n\n{input}<|eot_id|>'
159
+ '<|start_header_id|>assistant<|end_header_id|>\n\n'),
160
+ SUFFIX='<|eot_id|>',
161
+ SUFFIX_AS_EOS=True,
162
+ STOP_WORDS=['<|eot_id|>']),
163
+ phi3_chat=dict(
164
+ SYSTEM='<|system|>\n{system}<|end|>\n',
165
+ INSTRUCTION='<|user|>\n{input}<|end|>\n<|assistant|>\n',
166
+ SUFFIX='<|end|>',
167
+ SUFFIX_AS_EOS=True,
168
+ SEP='\n',
169
+ STOP_WORDS=['<|end|>']),
170
+ )
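Aside (illustration, not part of the uploaded file): how predict_forward formats one of these templates; qwen_chat is used here since the added special tokens are Qwen-style, and the extra round/bot_name kwargs are simply ignored by str.format:

template = PROMPT_TEMPLATE['qwen_chat']
prompt = template['INSTRUCTION'].format(input='<image>\nWhat is in this picture?',
                                        round=1, bot_name='BOT')
print(prompt)
# <|im_start|>user
# <image>
# What is in this picture?<|im_end|>
# <|im_start|>assistant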
tokenization_internlm2.py ADDED
@@ -0,0 +1,235 @@
 
1
+ # Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # This code is based on transformers/src/transformers/models/llama/tokenization_llama.py
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ """Tokenization classes for InternLM."""
18
+ import os
19
+ from shutil import copyfile
20
+ from typing import Any, Dict, List, Optional, Tuple
21
+
22
+ import sentencepiece as spm
23
+ from transformers.tokenization_utils import PreTrainedTokenizer
24
+ from transformers.utils import logging
25
+
26
+ logger = logging.get_logger(__name__)
27
+
28
+ VOCAB_FILES_NAMES = {'vocab_file': './tokenizer.model'}
29
+
30
+ PRETRAINED_VOCAB_FILES_MAP = {}
31
+
32
+
33
+ # Modified from transformers.model.llama.tokenization_llama.LlamaTokenizer
34
+ class InternLM2Tokenizer(PreTrainedTokenizer):
35
+ """
36
+ Construct a InternLM2 tokenizer. Based on byte-level Byte-Pair-Encoding.
37
+
38
+ Args:
39
+ vocab_file (`str`):
40
+ Path to the vocabulary file.
41
+ """
42
+
43
+ vocab_files_names = VOCAB_FILES_NAMES
44
+ pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
45
+ model_input_names = ['input_ids', 'attention_mask']
46
+ _auto_class = 'AutoTokenizer'
47
+
48
+ def __init__(
49
+ self,
50
+ vocab_file,
51
+ unk_token='<unk>',
52
+ bos_token='<s>',
53
+ eos_token='</s>',
54
+ pad_token='</s>',
55
+ sp_model_kwargs: Optional[Dict[str, Any]] = None,
56
+ add_bos_token=True,
57
+ add_eos_token=False,
58
+ decode_with_prefix_space=False,
59
+ clean_up_tokenization_spaces=False,
60
+ **kwargs,
61
+ ):
62
+ self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
63
+ self.vocab_file = vocab_file
64
+ self.add_bos_token = add_bos_token
65
+ self.add_eos_token = add_eos_token
66
+ self.decode_with_prefix_space = decode_with_prefix_space
67
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
68
+ self.sp_model.Load(vocab_file)
69
+ self._no_prefix_space_tokens = None
70
+ super().__init__(
71
+ bos_token=bos_token,
72
+ eos_token=eos_token,
73
+ unk_token=unk_token,
74
+ pad_token=pad_token,
75
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
76
+ **kwargs,
77
+ )
78
+
79
+ @property
80
+ def no_prefix_space_tokens(self):
81
+ if self._no_prefix_space_tokens is None:
82
+ vocab = self.convert_ids_to_tokens(list(range(self.vocab_size)))
83
+ self._no_prefix_space_tokens = {i for i, tok in enumerate(vocab) if not tok.startswith('▁')}
84
+ return self._no_prefix_space_tokens
85
+
86
+ @property
87
+ def vocab_size(self):
88
+ """Returns vocab size"""
89
+ return self.sp_model.get_piece_size()
90
+
91
+ @property
92
+ def bos_token_id(self) -> Optional[int]:
93
+ return self.sp_model.bos_id()
94
+
95
+ @property
96
+ def eos_token_id(self) -> Optional[int]:
97
+ return self.sp_model.eos_id()
98
+
99
+ def get_vocab(self):
100
+ """Returns vocab as a dict"""
101
+ vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
102
+ vocab.update(self.added_tokens_encoder)
103
+ return vocab
104
+
105
+ def _tokenize(self, text):
106
+ """Returns a tokenized string."""
107
+ return self.sp_model.encode(text, out_type=str)
108
+
109
+ def _convert_token_to_id(self, token):
110
+ """Converts a token (str) in an id using the vocab."""
111
+ return self.sp_model.piece_to_id(token)
112
+
113
+ def _convert_id_to_token(self, index):
114
+ """Converts an index (integer) in a token (str) using the vocab."""
115
+ token = self.sp_model.IdToPiece(index)
116
+ return token
117
+
118
+ def _maybe_add_prefix_space(self, tokens, decoded):
119
+ if tokens and tokens[0] not in self.no_prefix_space_tokens:
120
+ return ' ' + decoded
121
+ else:
122
+ return decoded
123
+
124
+ def convert_tokens_to_string(self, tokens):
125
+ """Converts a sequence of tokens (string) in a single string."""
126
+ current_sub_tokens = []
127
+ out_string = ''
128
+ prev_is_special = False
129
+ for token in tokens:
130
+ # make sure that special tokens are not decoded using sentencepiece model
131
+ if token in self.all_special_tokens:
132
+ if not prev_is_special:
133
+ out_string += ' '
134
+ out_string += self.sp_model.decode(current_sub_tokens) + token
135
+ prev_is_special = True
136
+ current_sub_tokens = []
137
+ else:
138
+ current_sub_tokens.append(token)
139
+ prev_is_special = False
140
+ out_string += self.sp_model.decode(current_sub_tokens)
141
+ out_string = self.clean_up_tokenization(out_string)
142
+ out_string = self._maybe_add_prefix_space(tokens=tokens, decoded=out_string)
143
+ return out_string[1:]
144
+
145
+ def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
146
+ """
147
+ Save the vocabulary and special tokens file to a directory.
148
+
149
+ Args:
150
+ save_directory (`str`):
151
+ The directory in which to save the vocabulary.
152
+
153
+ Returns:
154
+ `Tuple(str)`: Paths to the files saved.
155
+ """
156
+ if not os.path.isdir(save_directory):
157
+ logger.error(f'Vocabulary path ({save_directory}) should be a directory')
158
+ return
159
+ out_vocab_file = os.path.join(
160
+ save_directory, (filename_prefix + '-' if filename_prefix else '') + VOCAB_FILES_NAMES['vocab_file']
161
+ )
162
+
163
+ if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
164
+ copyfile(self.vocab_file, out_vocab_file)
165
+ elif not os.path.isfile(self.vocab_file):
166
+ with open(out_vocab_file, 'wb') as fi:
167
+ content_spiece_model = self.sp_model.serialized_model_proto()
168
+ fi.write(content_spiece_model)
169
+
170
+ return (out_vocab_file,)
171
+
172
+ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
173
+ if self.add_bos_token:
174
+ bos_token_ids = [self.bos_token_id]
175
+ else:
176
+ bos_token_ids = []
177
+
178
+ output = bos_token_ids + token_ids_0
179
+
180
+ if token_ids_1 is not None:
181
+ output = output + token_ids_1
182
+
183
+ if self.add_eos_token:
184
+ output = output + [self.eos_token_id]
185
+
186
+ return output
187
+
188
+ def get_special_tokens_mask(
189
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
190
+ ) -> List[int]:
191
+ """
192
+ Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
193
+ special tokens using the tokenizer `prepare_for_model` method.
194
+
195
+ Args:
196
+ token_ids_0 (`List[int]`):
197
+ List of IDs.
198
+ token_ids_1 (`List[int]`, *optional*):
199
+ Optional second list of IDs for sequence pairs.
200
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
201
+ Whether or not the token list is already formatted with special tokens for the model.
202
+
203
+ Returns:
204
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
205
+ """
206
+ if already_has_special_tokens:
207
+ return super().get_special_tokens_mask(
208
+ token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
209
+ )
210
+
211
+ if token_ids_1 is None:
212
+ return [1] + ([0] * len(token_ids_0)) + [1]
213
+ return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
214
+
215
+ def create_token_type_ids_from_sequences(
216
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
217
+ ) -> List[int]:
218
+ """
219
+ Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make
220
+ use of token type ids, therefore a list of zeros is returned.
221
+
222
+ Args:
223
+ token_ids_0 (`List[int]`):
224
+ List of IDs.
225
+ token_ids_1 (`List[int]`, *optional*):
226
+ Optional second list of IDs for sequence pairs.
227
+
228
+ Returns:
229
+ `List[int]`: List of zeros.
230
+ """
231
+ eos = [self.eos_token_id]
232
+
233
+ if token_ids_1 is None:
234
+ return len(token_ids_0 + eos) * [0]
235
+ return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
tokenization_internlm2_fast.py ADDED
@@ -0,0 +1,211 @@
 
 
1
+ # Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # This code is based on transformers/src/transformers/models/llama/tokenization_llama_fast.py
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ """Tokenization Fast class for InternLM."""
18
+ import os
19
+ from shutil import copyfile
20
+ from typing import Any, Dict, Optional, Tuple
21
+
22
+ from tokenizers import Tokenizer, decoders, normalizers, processors
23
+ from tokenizers.models import BPE
24
+ from transformers.convert_slow_tokenizer import (SLOW_TO_FAST_CONVERTERS,
25
+ SentencePieceExtractor,
26
+ SpmConverter)
27
+ from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
28
+ from transformers.utils import logging
29
+
30
+ from .tokenization_internlm2 import InternLM2Tokenizer
31
+
32
+ logger = logging.get_logger(__name__)
33
+
34
+ VOCAB_FILES_NAMES = {'vocab_file': './tokenizer.model'}
35
+
36
+
37
+ # Modified from transformers.convert_slow_tokenizer.LlamaConverter
38
+ class InternLM2Converter(SpmConverter):
39
+ handle_byte_fallback = True
40
+
41
+ def vocab(self, proto):
42
+ vocab = [
43
+ ('<unk>', 0.0),
44
+ ('<s>', 0.0),
45
+ ('</s>', 0.0),
46
+ ]
47
+ vocab += [(piece.piece, piece.score) for piece in proto.pieces[3:]]
48
+ return vocab
49
+
50
+ def unk_id(self, proto):
51
+ unk_id = 0
52
+ return unk_id
53
+
54
+ def decoder(self, replacement, add_prefix_space):
55
+ return decoders.Sequence(
56
+ [
57
+ decoders.Replace('▁', ' '),
58
+ decoders.ByteFallback(),
59
+ decoders.Fuse(),
60
+ decoders.Strip(content=' ', left=1),
61
+ ]
62
+ )
63
+
64
+ def tokenizer(self, proto):
65
+ model_type = proto.trainer_spec.model_type
66
+ vocab_scores = self.vocab(proto)
67
+ # special tokens
68
+ added_tokens = self.original_tokenizer.added_tokens_decoder
69
+ for i in range(len(vocab_scores)):
70
+ piece, score = vocab_scores[i]
71
+ if i in added_tokens:
72
+ vocab_scores[i] = (added_tokens[i].content, score)
73
+ if model_type == 1:
74
+ raise RuntimeError('InternLM2 is supposed to be a BPE model!')
75
+
76
+ elif model_type == 2:
77
+ _, merges = SentencePieceExtractor(self.original_tokenizer.vocab_file).extract(vocab_scores)
78
+ bpe_vocab = {word: i for i, (word, _score) in enumerate(vocab_scores)}
79
+ tokenizer = Tokenizer(
80
+ BPE(bpe_vocab, merges, unk_token=proto.trainer_spec.unk_piece, fuse_unk=True, byte_fallback=True)
81
+ )
82
+ tokenizer.add_special_tokens(
83
+ [added_token for index, added_token in added_tokens.items()]
84
+ )
85
+ else:
86
+ raise Exception(
87
+ "You're trying to run a `Unigram` model but you're file was trained with a different algorithm"
88
+ )
89
+
90
+ return tokenizer
91
+
92
+ def normalizer(self, proto):
93
+ normalizers_list = []
94
+ if proto.normalizer_spec.add_dummy_prefix:
95
+ normalizers_list.append(normalizers.Prepend(prepend='▁'))
96
+ normalizers_list.append(normalizers.Replace(pattern=' ', content='▁'))
97
+ return normalizers.Sequence(normalizers_list)
98
+
99
+ def pre_tokenizer(self, replacement, add_prefix_space):
100
+ return None
101
+
102
+
103
+ SLOW_TO_FAST_CONVERTERS['InternLM2Tokenizer'] = InternLM2Converter
104
+
105
+
106
+ # Modified from transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast -> InternLM2TokenizerFast
107
+ class InternLM2TokenizerFast(PreTrainedTokenizerFast):
108
+ vocab_files_names = VOCAB_FILES_NAMES
109
+ slow_tokenizer_class = InternLM2Tokenizer
110
+ padding_side = 'left'
111
+ model_input_names = ['input_ids', 'attention_mask']
112
+ _auto_class = 'AutoTokenizer'
113
+
114
+ def __init__(
115
+ self,
116
+ vocab_file,
117
+ unk_token='<unk>',
118
+ bos_token='<s>',
119
+ eos_token='</s>',
120
+ pad_token='</s>',
121
+ sp_model_kwargs: Optional[Dict[str, Any]] = None,
122
+ add_bos_token=True,
123
+ add_eos_token=False,
124
+ decode_with_prefix_space=False,
125
+ clean_up_tokenization_spaces=False,
126
+ **kwargs,
127
+ ):
128
+ super().__init__(
129
+ vocab_file=vocab_file,
130
+ unk_token=unk_token,
131
+ bos_token=bos_token,
132
+ eos_token=eos_token,
133
+ pad_token=pad_token,
134
+ sp_model_kwargs=sp_model_kwargs,
135
+ add_bos_token=add_bos_token,
136
+ add_eos_token=add_eos_token,
137
+ decode_with_prefix_space=decode_with_prefix_space,
138
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
139
+ **kwargs,
140
+ )
141
+ self._add_bos_token = add_bos_token
142
+ self._add_eos_token = add_eos_token
143
+ self.update_post_processor()
144
+ self.vocab_file = vocab_file
145
+
146
+ @property
147
+ def can_save_slow_tokenizer(self) -> bool:
148
+ return os.path.isfile(self.vocab_file) if self.vocab_file else False
149
+
150
+ def update_post_processor(self):
151
+ """
152
+ Updates the underlying post processor with the current `bos_token` and `eos_token`.
153
+ """
154
+ bos = self.bos_token
155
+ bos_token_id = self.bos_token_id
156
+ if bos is None and self.add_bos_token:
157
+ raise ValueError('add_bos_token = True but bos_token = None')
158
+
159
+ eos = self.eos_token
160
+ eos_token_id = self.eos_token_id
161
+ if eos is None and self.add_eos_token:
162
+ raise ValueError('add_eos_token = True but eos_token = None')
163
+
164
+ single = f"{(bos+':0 ') if self.add_bos_token else ''}$A:0{(' '+eos+':0') if self.add_eos_token else ''}"
165
+ pair = f"{single}{(' '+bos+':1') if self.add_bos_token else ''} $B:1{(' '+eos+':1') if self.add_eos_token else ''}"
166
+
167
+ special_tokens = []
168
+ if self.add_bos_token:
169
+ special_tokens.append((bos, bos_token_id))
170
+ if self.add_eos_token:
171
+ special_tokens.append((eos, eos_token_id))
172
+ self._tokenizer.post_processor = processors.TemplateProcessing(
173
+ single=single, pair=pair, special_tokens=special_tokens
174
+ )
175
+
176
+ @property
177
+ def add_eos_token(self):
178
+ return self._add_eos_token
179
+
180
+ @property
181
+ def add_bos_token(self):
182
+ return self._add_bos_token
183
+
184
+ @add_eos_token.setter
185
+ def add_eos_token(self, value):
186
+ self._add_eos_token = value
187
+ self.update_post_processor()
188
+
189
+ @add_bos_token.setter
190
+ def add_bos_token(self, value):
191
+ self._add_bos_token = value
192
+ self.update_post_processor()
193
+
194
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
195
+ if not self.can_save_slow_tokenizer:
196
+ raise ValueError(
197
+ 'Your fast tokenizer does not have the necessary information to save the vocabulary for a slow '
198
+ 'tokenizer.'
199
+ )
200
+
201
+ if not os.path.isdir(save_directory):
202
+ logger.error(f'Vocabulary path ({save_directory}) should be a directory')
203
+ return
204
+ out_vocab_file = os.path.join(
205
+ save_directory, (filename_prefix + '-' if filename_prefix else '') + VOCAB_FILES_NAMES['vocab_file']
206
+ )
207
+
208
+ if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
209
+ copyfile(self.vocab_file, out_vocab_file)
210
+
211
+ return (out_vocab_file,)
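A minimal usage sketch of what the converter registration and fast-tokenizer class above enable (not part of the uploaded file; the checkpoint path is a placeholder/assumption). Note that this repository's own tokenizer_config.json selects Qwen2Tokenizer, so the InternLM2 classes are only exercised by InternLM2-based variants that ship a ./tokenizer.model.

# Minimal sketch; "path/to/internlm2-style-checkpoint" is a placeholder, not taken from this diff.
from transformers import AutoTokenizer

fast_tok = AutoTokenizer.from_pretrained(
    "path/to/internlm2-style-checkpoint",
    trust_remote_code=True,  # imports the custom tokenizer classes registered via _auto_class
    use_fast=True,           # if no prebuilt tokenizer.json exists, conversion goes through
                             # SLOW_TO_FAST_CONVERTERS['InternLM2Tokenizer'] defined above
)
print(type(fast_tok).__name__)  # typically InternLM2TokenizerFast for such checkpoints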
tokenizer.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d257d75be50ec94137a76982b1ba699695a69d25a660733e8d0e2073bf50328b
3
+ size 11443325
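The three lines above are a Git LFS pointer rather than the tokenizer itself: `oid` is the SHA-256 of the real tokenizer.json and `size` is its byte count (about 11 MB). A minimal sketch of fetching the resolved file with huggingface_hub (the repo id below is a placeholder, not taken from this diff):

from huggingface_hub import hf_hub_download

# hf_hub_download resolves the LFS pointer transparently and returns a local path to the full file.
local_path = hf_hub_download(repo_id="namespace/this-repo", filename="tokenizer.json")
print(local_path)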
tokenizer_config.json ADDED
@@ -0,0 +1,1147 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": false,
5
+ "added_tokens_decoder": {
6
+ "151643": {
7
+ "content": "<|endoftext|>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "151644": {
15
+ "content": "<|im_start|>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "151645": {
23
+ "content": "<|im_end|>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ "151646": {
31
+ "content": "<|object_ref_start|>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ },
38
+ "151647": {
39
+ "content": "<|object_ref_end|>",
40
+ "lstrip": false,
41
+ "normalized": false,
42
+ "rstrip": false,
43
+ "single_word": false,
44
+ "special": true
45
+ },
46
+ "151648": {
47
+ "content": "<|box_start|>",
48
+ "lstrip": false,
49
+ "normalized": false,
50
+ "rstrip": false,
51
+ "single_word": false,
52
+ "special": true
53
+ },
54
+ "151649": {
55
+ "content": "<|box_end|>",
56
+ "lstrip": false,
57
+ "normalized": false,
58
+ "rstrip": false,
59
+ "single_word": false,
60
+ "special": true
61
+ },
62
+ "151650": {
63
+ "content": "<|quad_start|>",
64
+ "lstrip": false,
65
+ "normalized": false,
66
+ "rstrip": false,
67
+ "single_word": false,
68
+ "special": true
69
+ },
70
+ "151651": {
71
+ "content": "<|quad_end|>",
72
+ "lstrip": false,
73
+ "normalized": false,
74
+ "rstrip": false,
75
+ "single_word": false,
76
+ "special": true
77
+ },
78
+ "151652": {
79
+ "content": "<|vision_start|>",
80
+ "lstrip": false,
81
+ "normalized": false,
82
+ "rstrip": false,
83
+ "single_word": false,
84
+ "special": true
85
+ },
86
+ "151653": {
87
+ "content": "<|vision_end|>",
88
+ "lstrip": false,
89
+ "normalized": false,
90
+ "rstrip": false,
91
+ "single_word": false,
92
+ "special": true
93
+ },
94
+ "151654": {
95
+ "content": "<|vision_pad|>",
96
+ "lstrip": false,
97
+ "normalized": false,
98
+ "rstrip": false,
99
+ "single_word": false,
100
+ "special": true
101
+ },
102
+ "151655": {
103
+ "content": "<|image_pad|>",
104
+ "lstrip": false,
105
+ "normalized": false,
106
+ "rstrip": false,
107
+ "single_word": false,
108
+ "special": true
109
+ },
110
+ "151656": {
111
+ "content": "<|video_pad|>",
112
+ "lstrip": false,
113
+ "normalized": false,
114
+ "rstrip": false,
115
+ "single_word": false,
116
+ "special": true
117
+ },
118
+ "151657": {
119
+ "content": "<tool_call>",
120
+ "lstrip": false,
121
+ "normalized": false,
122
+ "rstrip": false,
123
+ "single_word": false,
124
+ "special": false
125
+ },
126
+ "151658": {
127
+ "content": "</tool_call>",
128
+ "lstrip": false,
129
+ "normalized": false,
130
+ "rstrip": false,
131
+ "single_word": false,
132
+ "special": false
133
+ },
134
+ "151659": {
135
+ "content": "<|fim_prefix|>",
136
+ "lstrip": false,
137
+ "normalized": false,
138
+ "rstrip": false,
139
+ "single_word": false,
140
+ "special": false
141
+ },
142
+ "151660": {
143
+ "content": "<|fim_middle|>",
144
+ "lstrip": false,
145
+ "normalized": false,
146
+ "rstrip": false,
147
+ "single_word": false,
148
+ "special": false
149
+ },
150
+ "151661": {
151
+ "content": "<|fim_suffix|>",
152
+ "lstrip": false,
153
+ "normalized": false,
154
+ "rstrip": false,
155
+ "single_word": false,
156
+ "special": false
157
+ },
158
+ "151662": {
159
+ "content": "<|fim_pad|>",
160
+ "lstrip": false,
161
+ "normalized": false,
162
+ "rstrip": false,
163
+ "single_word": false,
164
+ "special": false
165
+ },
166
+ "151663": {
167
+ "content": "<|repo_name|>",
168
+ "lstrip": false,
169
+ "normalized": false,
170
+ "rstrip": false,
171
+ "single_word": false,
172
+ "special": false
173
+ },
174
+ "151664": {
175
+ "content": "<|file_sep|>",
176
+ "lstrip": false,
177
+ "normalized": false,
178
+ "rstrip": false,
179
+ "single_word": false,
180
+ "special": false
181
+ },
182
+ "151665": {
183
+ "content": "<img>",
184
+ "lstrip": false,
185
+ "normalized": false,
186
+ "rstrip": false,
187
+ "single_word": false,
188
+ "special": true
189
+ },
190
+ "151666": {
191
+ "content": "</img>",
192
+ "lstrip": false,
193
+ "normalized": false,
194
+ "rstrip": false,
195
+ "single_word": false,
196
+ "special": true
197
+ },
198
+ "151667": {
199
+ "content": "<IMG_CONTEXT>",
200
+ "lstrip": false,
201
+ "normalized": false,
202
+ "rstrip": false,
203
+ "single_word": false,
204
+ "special": true
205
+ },
206
+ "151668": {
207
+ "content": "<quad>",
208
+ "lstrip": false,
209
+ "normalized": false,
210
+ "rstrip": false,
211
+ "single_word": false,
212
+ "special": true
213
+ },
214
+ "151669": {
215
+ "content": "</quad>",
216
+ "lstrip": false,
217
+ "normalized": false,
218
+ "rstrip": false,
219
+ "single_word": false,
220
+ "special": true
221
+ },
222
+ "151670": {
223
+ "content": "<ref>",
224
+ "lstrip": false,
225
+ "normalized": false,
226
+ "rstrip": false,
227
+ "single_word": false,
228
+ "special": true
229
+ },
230
+ "151671": {
231
+ "content": "</ref>",
232
+ "lstrip": false,
233
+ "normalized": false,
234
+ "rstrip": false,
235
+ "single_word": false,
236
+ "special": true
237
+ },
238
+ "151672": {
239
+ "content": "<box>",
240
+ "lstrip": false,
241
+ "normalized": false,
242
+ "rstrip": false,
243
+ "single_word": false,
244
+ "special": true
245
+ },
246
+ "151673": {
247
+ "content": "</box>",
248
+ "lstrip": false,
249
+ "normalized": false,
250
+ "rstrip": false,
251
+ "single_word": false,
252
+ "special": true
253
+ },
254
+ "151674": {
255
+ "content": "<p>",
256
+ "lstrip": false,
257
+ "normalized": false,
258
+ "rstrip": false,
259
+ "single_word": false,
260
+ "special": true
261
+ },
262
+ "151675": {
263
+ "content": "</p>",
264
+ "lstrip": false,
265
+ "normalized": false,
266
+ "rstrip": false,
267
+ "single_word": false,
268
+ "special": true
269
+ },
270
+ "151676": {
271
+ "content": "[CLS]",
272
+ "lstrip": false,
273
+ "normalized": false,
274
+ "rstrip": false,
275
+ "single_word": false,
276
+ "special": true
277
+ },
278
+ "151677": {
279
+ "content": "[BG_CLS]",
280
+ "lstrip": false,
281
+ "normalized": false,
282
+ "rstrip": false,
283
+ "single_word": false,
284
+ "special": true
285
+ },
286
+ "151678": {
287
+ "content": "<obj>",
288
+ "lstrip": false,
289
+ "normalized": false,
290
+ "rstrip": false,
291
+ "single_word": false,
292
+ "special": true
293
+ },
294
+ "151679": {
295
+ "content": "</obj>",
296
+ "lstrip": false,
297
+ "normalized": false,
298
+ "rstrip": false,
299
+ "single_word": false,
300
+ "special": true
301
+ },
302
+ "151680": {
303
+ "content": "<OBJ_CONTEXT>",
304
+ "lstrip": false,
305
+ "normalized": false,
306
+ "rstrip": false,
307
+ "single_word": false,
308
+ "special": true
309
+ },
310
+ "151681": {
311
+ "content": "[SEG000]",
312
+ "lstrip": false,
313
+ "normalized": false,
314
+ "rstrip": false,
315
+ "single_word": false,
316
+ "special": true
317
+ },
318
+ "151682": {
319
+ "content": "[SEG001]",
320
+ "lstrip": false,
321
+ "normalized": false,
322
+ "rstrip": false,
323
+ "single_word": false,
324
+ "special": true
325
+ },
326
+ "151683": {
327
+ "content": "[SEG002]",
328
+ "lstrip": false,
329
+ "normalized": false,
330
+ "rstrip": false,
331
+ "single_word": false,
332
+ "special": true
333
+ },
334
+ "151684": {
335
+ "content": "[SEG003]",
336
+ "lstrip": false,
337
+ "normalized": false,
338
+ "rstrip": false,
339
+ "single_word": false,
340
+ "special": true
341
+ },
342
+ "151685": {
343
+ "content": "[SEG004]",
344
+ "lstrip": false,
345
+ "normalized": false,
346
+ "rstrip": false,
347
+ "single_word": false,
348
+ "special": true
349
+ },
350
+ "151686": {
351
+ "content": "[SEG005]",
352
+ "lstrip": false,
353
+ "normalized": false,
354
+ "rstrip": false,
355
+ "single_word": false,
356
+ "special": true
357
+ },
358
+ "151687": {
359
+ "content": "[SEG006]",
360
+ "lstrip": false,
361
+ "normalized": false,
362
+ "rstrip": false,
363
+ "single_word": false,
364
+ "special": true
365
+ },
366
+ "151688": {
367
+ "content": "[SEG007]",
368
+ "lstrip": false,
369
+ "normalized": false,
370
+ "rstrip": false,
371
+ "single_word": false,
372
+ "special": true
373
+ },
374
+ "151689": {
375
+ "content": "[SEG008]",
376
+ "lstrip": false,
377
+ "normalized": false,
378
+ "rstrip": false,
379
+ "single_word": false,
380
+ "special": true
381
+ },
382
+ "151690": {
383
+ "content": "[SEG009]",
384
+ "lstrip": false,
385
+ "normalized": false,
386
+ "rstrip": false,
387
+ "single_word": false,
388
+ "special": true
389
+ },
390
+ "151691": {
391
+ "content": "[SEG010]",
392
+ "lstrip": false,
393
+ "normalized": false,
394
+ "rstrip": false,
395
+ "single_word": false,
396
+ "special": true
397
+ },
398
+ "151692": {
399
+ "content": "[SEG011]",
400
+ "lstrip": false,
401
+ "normalized": false,
402
+ "rstrip": false,
403
+ "single_word": false,
404
+ "special": true
405
+ },
406
+ "151693": {
407
+ "content": "[SEG012]",
408
+ "lstrip": false,
409
+ "normalized": false,
410
+ "rstrip": false,
411
+ "single_word": false,
412
+ "special": true
413
+ },
414
+ "151694": {
415
+ "content": "[SEG013]",
416
+ "lstrip": false,
417
+ "normalized": false,
418
+ "rstrip": false,
419
+ "single_word": false,
420
+ "special": true
421
+ },
422
+ "151695": {
423
+ "content": "[SEG014]",
424
+ "lstrip": false,
425
+ "normalized": false,
426
+ "rstrip": false,
427
+ "single_word": false,
428
+ "special": true
429
+ },
430
+ "151696": {
431
+ "content": "[SEG015]",
432
+ "lstrip": false,
433
+ "normalized": false,
434
+ "rstrip": false,
435
+ "single_word": false,
436
+ "special": true
437
+ },
438
+ "151697": {
439
+ "content": "[SEG016]",
440
+ "lstrip": false,
441
+ "normalized": false,
442
+ "rstrip": false,
443
+ "single_word": false,
444
+ "special": true
445
+ },
446
+ "151698": {
447
+ "content": "[SEG017]",
448
+ "lstrip": false,
449
+ "normalized": false,
450
+ "rstrip": false,
451
+ "single_word": false,
452
+ "special": true
453
+ },
454
+ "151699": {
455
+ "content": "[SEG018]",
456
+ "lstrip": false,
457
+ "normalized": false,
458
+ "rstrip": false,
459
+ "single_word": false,
460
+ "special": true
461
+ },
462
+ "151700": {
463
+ "content": "[SEG019]",
464
+ "lstrip": false,
465
+ "normalized": false,
466
+ "rstrip": false,
467
+ "single_word": false,
468
+ "special": true
469
+ },
470
+ "151701": {
471
+ "content": "[SEG020]",
472
+ "lstrip": false,
473
+ "normalized": false,
474
+ "rstrip": false,
475
+ "single_word": false,
476
+ "special": true
477
+ },
478
+ "151702": {
479
+ "content": "[SEG021]",
480
+ "lstrip": false,
481
+ "normalized": false,
482
+ "rstrip": false,
483
+ "single_word": false,
484
+ "special": true
485
+ },
486
+ "151703": {
487
+ "content": "[SEG022]",
488
+ "lstrip": false,
489
+ "normalized": false,
490
+ "rstrip": false,
491
+ "single_word": false,
492
+ "special": true
493
+ },
494
+ "151704": {
495
+ "content": "[SEG023]",
496
+ "lstrip": false,
497
+ "normalized": false,
498
+ "rstrip": false,
499
+ "single_word": false,
500
+ "special": true
501
+ },
502
+ "151705": {
503
+ "content": "[SEG024]",
504
+ "lstrip": false,
505
+ "normalized": false,
506
+ "rstrip": false,
507
+ "single_word": false,
508
+ "special": true
509
+ },
510
+ "151706": {
511
+ "content": "[SEG025]",
512
+ "lstrip": false,
513
+ "normalized": false,
514
+ "rstrip": false,
515
+ "single_word": false,
516
+ "special": true
517
+ },
518
+ "151707": {
519
+ "content": "[SEG026]",
520
+ "lstrip": false,
521
+ "normalized": false,
522
+ "rstrip": false,
523
+ "single_word": false,
524
+ "special": true
525
+ },
526
+ "151708": {
527
+ "content": "[SEG027]",
528
+ "lstrip": false,
529
+ "normalized": false,
530
+ "rstrip": false,
531
+ "single_word": false,
532
+ "special": true
533
+ },
534
+ "151709": {
535
+ "content": "[SEG028]",
536
+ "lstrip": false,
537
+ "normalized": false,
538
+ "rstrip": false,
539
+ "single_word": false,
540
+ "special": true
541
+ },
542
+ "151710": {
543
+ "content": "[SEG029]",
544
+ "lstrip": false,
545
+ "normalized": false,
546
+ "rstrip": false,
547
+ "single_word": false,
548
+ "special": true
549
+ },
550
+ "151711": {
551
+ "content": "[SEG030]",
552
+ "lstrip": false,
553
+ "normalized": false,
554
+ "rstrip": false,
555
+ "single_word": false,
556
+ "special": true
557
+ },
558
+ "151712": {
559
+ "content": "[SEG031]",
560
+ "lstrip": false,
561
+ "normalized": false,
562
+ "rstrip": false,
563
+ "single_word": false,
564
+ "special": true
565
+ },
566
+ "151713": {
567
+ "content": "[SEG032]",
568
+ "lstrip": false,
569
+ "normalized": false,
570
+ "rstrip": false,
571
+ "single_word": false,
572
+ "special": true
573
+ },
574
+ "151714": {
575
+ "content": "[SEG033]",
576
+ "lstrip": false,
577
+ "normalized": false,
578
+ "rstrip": false,
579
+ "single_word": false,
580
+ "special": true
581
+ },
582
+ "151715": {
583
+ "content": "[SEG034]",
584
+ "lstrip": false,
585
+ "normalized": false,
586
+ "rstrip": false,
587
+ "single_word": false,
588
+ "special": true
589
+ },
590
+ "151716": {
591
+ "content": "[SEG035]",
592
+ "lstrip": false,
593
+ "normalized": false,
594
+ "rstrip": false,
595
+ "single_word": false,
596
+ "special": true
597
+ },
598
+ "151717": {
599
+ "content": "[SEG036]",
600
+ "lstrip": false,
601
+ "normalized": false,
602
+ "rstrip": false,
603
+ "single_word": false,
604
+ "special": true
605
+ },
606
+ "151718": {
607
+ "content": "[SEG037]",
608
+ "lstrip": false,
609
+ "normalized": false,
610
+ "rstrip": false,
611
+ "single_word": false,
612
+ "special": true
613
+ },
614
+ "151719": {
615
+ "content": "[SEG038]",
616
+ "lstrip": false,
617
+ "normalized": false,
618
+ "rstrip": false,
619
+ "single_word": false,
620
+ "special": true
621
+ },
622
+ "151720": {
623
+ "content": "[SEG039]",
624
+ "lstrip": false,
625
+ "normalized": false,
626
+ "rstrip": false,
627
+ "single_word": false,
628
+ "special": true
629
+ },
630
+ "151721": {
631
+ "content": "[SEG040]",
632
+ "lstrip": false,
633
+ "normalized": false,
634
+ "rstrip": false,
635
+ "single_word": false,
636
+ "special": true
637
+ },
638
+ "151722": {
639
+ "content": "[SEG041]",
640
+ "lstrip": false,
641
+ "normalized": false,
642
+ "rstrip": false,
643
+ "single_word": false,
644
+ "special": true
645
+ },
646
+ "151723": {
647
+ "content": "[SEG042]",
648
+ "lstrip": false,
649
+ "normalized": false,
650
+ "rstrip": false,
651
+ "single_word": false,
652
+ "special": true
653
+ },
654
+ "151724": {
655
+ "content": "[SEG043]",
656
+ "lstrip": false,
657
+ "normalized": false,
658
+ "rstrip": false,
659
+ "single_word": false,
660
+ "special": true
661
+ },
662
+ "151725": {
663
+ "content": "[SEG044]",
664
+ "lstrip": false,
665
+ "normalized": false,
666
+ "rstrip": false,
667
+ "single_word": false,
668
+ "special": true
669
+ },
670
+ "151726": {
671
+ "content": "[SEG045]",
672
+ "lstrip": false,
673
+ "normalized": false,
674
+ "rstrip": false,
675
+ "single_word": false,
676
+ "special": true
677
+ },
678
+ "151727": {
679
+ "content": "[SEG046]",
680
+ "lstrip": false,
681
+ "normalized": false,
682
+ "rstrip": false,
683
+ "single_word": false,
684
+ "special": true
685
+ },
686
+ "151728": {
687
+ "content": "[SEG047]",
688
+ "lstrip": false,
689
+ "normalized": false,
690
+ "rstrip": false,
691
+ "single_word": false,
692
+ "special": true
693
+ },
694
+ "151729": {
695
+ "content": "[SEG048]",
696
+ "lstrip": false,
697
+ "normalized": false,
698
+ "rstrip": false,
699
+ "single_word": false,
700
+ "special": true
701
+ },
702
+ "151730": {
703
+ "content": "[SEG049]",
704
+ "lstrip": false,
705
+ "normalized": false,
706
+ "rstrip": false,
707
+ "single_word": false,
708
+ "special": true
709
+ },
710
+ "151731": {
711
+ "content": "[SEG050]",
712
+ "lstrip": false,
713
+ "normalized": false,
714
+ "rstrip": false,
715
+ "single_word": false,
716
+ "special": true
717
+ },
718
+ "151732": {
719
+ "content": "[SEG051]",
720
+ "lstrip": false,
721
+ "normalized": false,
722
+ "rstrip": false,
723
+ "single_word": false,
724
+ "special": true
725
+ },
726
+ "151733": {
727
+ "content": "[SEG052]",
728
+ "lstrip": false,
729
+ "normalized": false,
730
+ "rstrip": false,
731
+ "single_word": false,
732
+ "special": true
733
+ },
734
+ "151734": {
735
+ "content": "[SEG053]",
736
+ "lstrip": false,
737
+ "normalized": false,
738
+ "rstrip": false,
739
+ "single_word": false,
740
+ "special": true
741
+ },
742
+ "151735": {
743
+ "content": "[SEG054]",
744
+ "lstrip": false,
745
+ "normalized": false,
746
+ "rstrip": false,
747
+ "single_word": false,
748
+ "special": true
749
+ },
750
+ "151736": {
751
+ "content": "[SEG055]",
752
+ "lstrip": false,
753
+ "normalized": false,
754
+ "rstrip": false,
755
+ "single_word": false,
756
+ "special": true
757
+ },
758
+ "151737": {
759
+ "content": "[SEG056]",
760
+ "lstrip": false,
761
+ "normalized": false,
762
+ "rstrip": false,
763
+ "single_word": false,
764
+ "special": true
765
+ },
766
+ "151738": {
767
+ "content": "[SEG057]",
768
+ "lstrip": false,
769
+ "normalized": false,
770
+ "rstrip": false,
771
+ "single_word": false,
772
+ "special": true
773
+ },
774
+ "151739": {
775
+ "content": "[SEG058]",
776
+ "lstrip": false,
777
+ "normalized": false,
778
+ "rstrip": false,
779
+ "single_word": false,
780
+ "special": true
781
+ },
782
+ "151740": {
783
+ "content": "[SEG059]",
784
+ "lstrip": false,
785
+ "normalized": false,
786
+ "rstrip": false,
787
+ "single_word": false,
788
+ "special": true
789
+ },
790
+ "151741": {
791
+ "content": "[SEG060]",
792
+ "lstrip": false,
793
+ "normalized": false,
794
+ "rstrip": false,
795
+ "single_word": false,
796
+ "special": true
797
+ },
798
+ "151742": {
799
+ "content": "[SEG061]",
800
+ "lstrip": false,
801
+ "normalized": false,
802
+ "rstrip": false,
803
+ "single_word": false,
804
+ "special": true
805
+ },
806
+ "151743": {
807
+ "content": "[SEG062]",
808
+ "lstrip": false,
809
+ "normalized": false,
810
+ "rstrip": false,
811
+ "single_word": false,
812
+ "special": true
813
+ },
814
+ "151744": {
815
+ "content": "[SEG063]",
816
+ "lstrip": false,
817
+ "normalized": false,
818
+ "rstrip": false,
819
+ "single_word": false,
820
+ "special": true
821
+ },
822
+ "151745": {
823
+ "content": "[SEG064]",
824
+ "lstrip": false,
825
+ "normalized": false,
826
+ "rstrip": false,
827
+ "single_word": false,
828
+ "special": true
829
+ },
830
+ "151746": {
831
+ "content": "[SEG065]",
832
+ "lstrip": false,
833
+ "normalized": false,
834
+ "rstrip": false,
835
+ "single_word": false,
836
+ "special": true
837
+ },
838
+ "151747": {
839
+ "content": "[SEG066]",
840
+ "lstrip": false,
841
+ "normalized": false,
842
+ "rstrip": false,
843
+ "single_word": false,
844
+ "special": true
845
+ },
846
+ "151748": {
847
+ "content": "[SEG067]",
848
+ "lstrip": false,
849
+ "normalized": false,
850
+ "rstrip": false,
851
+ "single_word": false,
852
+ "special": true
853
+ },
854
+ "151749": {
855
+ "content": "[SEG068]",
856
+ "lstrip": false,
857
+ "normalized": false,
858
+ "rstrip": false,
859
+ "single_word": false,
860
+ "special": true
861
+ },
862
+ "151750": {
863
+ "content": "[SEG069]",
864
+ "lstrip": false,
865
+ "normalized": false,
866
+ "rstrip": false,
867
+ "single_word": false,
868
+ "special": true
869
+ },
870
+ "151751": {
871
+ "content": "[SEG070]",
872
+ "lstrip": false,
873
+ "normalized": false,
874
+ "rstrip": false,
875
+ "single_word": false,
876
+ "special": true
877
+ },
878
+ "151752": {
879
+ "content": "[SEG071]",
880
+ "lstrip": false,
881
+ "normalized": false,
882
+ "rstrip": false,
883
+ "single_word": false,
884
+ "special": true
885
+ },
886
+ "151753": {
887
+ "content": "[SEG072]",
888
+ "lstrip": false,
889
+ "normalized": false,
890
+ "rstrip": false,
891
+ "single_word": false,
892
+ "special": true
893
+ },
894
+ "151754": {
895
+ "content": "[SEG073]",
896
+ "lstrip": false,
897
+ "normalized": false,
898
+ "rstrip": false,
899
+ "single_word": false,
900
+ "special": true
901
+ },
902
+ "151755": {
903
+ "content": "[SEG074]",
904
+ "lstrip": false,
905
+ "normalized": false,
906
+ "rstrip": false,
907
+ "single_word": false,
908
+ "special": true
909
+ },
910
+ "151756": {
911
+ "content": "[SEG075]",
912
+ "lstrip": false,
913
+ "normalized": false,
914
+ "rstrip": false,
915
+ "single_word": false,
916
+ "special": true
917
+ },
918
+ "151757": {
919
+ "content": "[SEG076]",
920
+ "lstrip": false,
921
+ "normalized": false,
922
+ "rstrip": false,
923
+ "single_word": false,
924
+ "special": true
925
+ },
926
+ "151758": {
927
+ "content": "[SEG077]",
928
+ "lstrip": false,
929
+ "normalized": false,
930
+ "rstrip": false,
931
+ "single_word": false,
932
+ "special": true
933
+ },
934
+ "151759": {
935
+ "content": "[SEG078]",
936
+ "lstrip": false,
937
+ "normalized": false,
938
+ "rstrip": false,
939
+ "single_word": false,
940
+ "special": true
941
+ },
942
+ "151760": {
943
+ "content": "[SEG079]",
944
+ "lstrip": false,
945
+ "normalized": false,
946
+ "rstrip": false,
947
+ "single_word": false,
948
+ "special": true
949
+ },
950
+ "151761": {
951
+ "content": "[SEG080]",
952
+ "lstrip": false,
953
+ "normalized": false,
954
+ "rstrip": false,
955
+ "single_word": false,
956
+ "special": true
957
+ },
958
+ "151762": {
959
+ "content": "[SEG081]",
960
+ "lstrip": false,
961
+ "normalized": false,
962
+ "rstrip": false,
963
+ "single_word": false,
964
+ "special": true
965
+ },
966
+ "151763": {
967
+ "content": "[SEG082]",
968
+ "lstrip": false,
969
+ "normalized": false,
970
+ "rstrip": false,
971
+ "single_word": false,
972
+ "special": true
973
+ },
974
+ "151764": {
975
+ "content": "[SEG083]",
976
+ "lstrip": false,
977
+ "normalized": false,
978
+ "rstrip": false,
979
+ "single_word": false,
980
+ "special": true
981
+ },
982
+ "151765": {
983
+ "content": "[SEG084]",
984
+ "lstrip": false,
985
+ "normalized": false,
986
+ "rstrip": false,
987
+ "single_word": false,
988
+ "special": true
989
+ },
990
+ "151766": {
991
+ "content": "[SEG085]",
992
+ "lstrip": false,
993
+ "normalized": false,
994
+ "rstrip": false,
995
+ "single_word": false,
996
+ "special": true
997
+ },
998
+ "151767": {
999
+ "content": "[SEG086]",
1000
+ "lstrip": false,
1001
+ "normalized": false,
1002
+ "rstrip": false,
1003
+ "single_word": false,
1004
+ "special": true
1005
+ },
1006
+ "151768": {
1007
+ "content": "[SEG087]",
1008
+ "lstrip": false,
1009
+ "normalized": false,
1010
+ "rstrip": false,
1011
+ "single_word": false,
1012
+ "special": true
1013
+ },
1014
+ "151769": {
1015
+ "content": "[SEG088]",
1016
+ "lstrip": false,
1017
+ "normalized": false,
1018
+ "rstrip": false,
1019
+ "single_word": false,
1020
+ "special": true
1021
+ },
1022
+ "151770": {
1023
+ "content": "[SEG089]",
1024
+ "lstrip": false,
1025
+ "normalized": false,
1026
+ "rstrip": false,
1027
+ "single_word": false,
1028
+ "special": true
1029
+ },
1030
+ "151771": {
1031
+ "content": "[SEG090]",
1032
+ "lstrip": false,
1033
+ "normalized": false,
1034
+ "rstrip": false,
1035
+ "single_word": false,
1036
+ "special": true
1037
+ },
1038
+ "151772": {
1039
+ "content": "[SEG091]",
1040
+ "lstrip": false,
1041
+ "normalized": false,
1042
+ "rstrip": false,
1043
+ "single_word": false,
1044
+ "special": true
1045
+ },
1046
+ "151773": {
1047
+ "content": "[SEG092]",
1048
+ "lstrip": false,
1049
+ "normalized": false,
1050
+ "rstrip": false,
1051
+ "single_word": false,
1052
+ "special": true
1053
+ },
1054
+ "151774": {
1055
+ "content": "[SEG093]",
1056
+ "lstrip": false,
1057
+ "normalized": false,
1058
+ "rstrip": false,
1059
+ "single_word": false,
1060
+ "special": true
1061
+ },
1062
+ "151775": {
1063
+ "content": "[SEG094]",
1064
+ "lstrip": false,
1065
+ "normalized": false,
1066
+ "rstrip": false,
1067
+ "single_word": false,
1068
+ "special": true
1069
+ },
1070
+ "151776": {
1071
+ "content": "[SEG095]",
1072
+ "lstrip": false,
1073
+ "normalized": false,
1074
+ "rstrip": false,
1075
+ "single_word": false,
1076
+ "special": true
1077
+ },
1078
+ "151777": {
1079
+ "content": "[SEG096]",
1080
+ "lstrip": false,
1081
+ "normalized": false,
1082
+ "rstrip": false,
1083
+ "single_word": false,
1084
+ "special": true
1085
+ },
1086
+ "151778": {
1087
+ "content": "[SEG097]",
1088
+ "lstrip": false,
1089
+ "normalized": false,
1090
+ "rstrip": false,
1091
+ "single_word": false,
1092
+ "special": true
1093
+ },
1094
+ "151779": {
1095
+ "content": "[SEG098]",
1096
+ "lstrip": false,
1097
+ "normalized": false,
1098
+ "rstrip": false,
1099
+ "single_word": false,
1100
+ "special": true
1101
+ },
1102
+ "151780": {
1103
+ "content": "[SEG099]",
1104
+ "lstrip": false,
1105
+ "normalized": false,
1106
+ "rstrip": false,
1107
+ "single_word": false,
1108
+ "special": true
1109
+ }
1110
+ },
1111
+ "additional_special_tokens": [
1112
+ "<|im_start|>",
1113
+ "<|im_end|>",
1114
+ "<|object_ref_start|>",
1115
+ "<|object_ref_end|>",
1116
+ "<|box_start|>",
1117
+ "<|box_end|>",
1118
+ "<|quad_start|>",
1119
+ "<|quad_end|>",
1120
+ "<|vision_start|>",
1121
+ "<|vision_end|>",
1122
+ "<|vision_pad|>",
1123
+ "<|image_pad|>",
1124
+ "<|video_pad|>",
1125
+ "<img>",
1126
+ "</img>",
1127
+ "<IMG_CONTEXT>",
1128
+ "<quad>",
1129
+ "</quad>",
1130
+ "<ref>",
1131
+ "</ref>",
1132
+ "<box>",
1133
+ "</box>"
1134
+ ],
1135
+ "bos_token": null,
1136
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
1137
+ "clean_up_tokenization_spaces": false,
1138
+ "eos_token": "<|im_end|>",
1139
+ "errors": "replace",
1140
+ "extra_special_tokens": {},
1141
+ "model_max_length": 16384,
1142
+ "pad_token": "<|endoftext|>",
1143
+ "padding_side": "right",
1144
+ "split_special_tokens": false,
1145
+ "tokenizer_class": "Qwen2Tokenizer",
1146
+ "unk_token": null
1147
+ }
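Since the config selects Qwen2Tokenizer with the ChatML-style chat_template above, the standard transformers chat-template API applies directly. A minimal sketch, assuming the tokenizer is loaded from this repository (placeholder path):

# Minimal sketch; "path/to/this-repo" is a placeholder for this repository.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("path/to/this-repo", trust_remote_code=True)
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Please segment the person in the image."},
]
prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)  # ends with '<|im_start|>assistant\n' because add_generation_prompt=True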
vocab.json ADDED
The diff for this file is too large to render. See raw diff
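vocab.json carries the base BPE vocabulary; the task-specific markers (<img>, <IMG_CONTEXT>, <p>/</p>, and the [SEG000]–[SEG099] placeholders) live in the added_tokens_decoder above. A minimal sketch of checking that the segmentation placeholders resolve to the ids listed there (placeholder path again):

# Minimal sketch; "path/to/this-repo" is a placeholder for this repository.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("path/to/this-repo", trust_remote_code=True)
seg_tokens = [f"[SEG{i:03d}]" for i in range(100)]      # [SEG000] ... [SEG099]
seg_ids = tok.convert_tokens_to_ids(seg_tokens)
assert seg_ids[0] == 151681 and seg_ids[-1] == 151780   # matches added_tokens_decoder above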