MVRL
/

scalemae-vitlarge-800

PyTorch

Model card Files Files and versions

xet

Community

Srikumar26 commited on May 23, 2024

Commit

54bf4fb

verified ·

1 Parent(s): 90dd44f

Create model.py

Browse files

Files changed (1) hide show

model.py +302 -0

model.py ADDED Viewed

	@@ -0,0 +1,302 @@

+# Adapted from: https://github.com/bair-climate-initiative/scale-mae/blob/main/mae/main_finetune.py
+import torch
+from timm.models.layers import trunc_normal_
+from functools import partial
+import timm.models.vision_transformer
+import torch.nn as nn
+from timm.models.vision_transformer import Block, PatchEmbed
+import os
+from torchvision.io import read_image
+import numpy as np
+import sys
+import random
+import pytorch_lightning as pl
+import torch.nn.functional as F
+from huggingface_hub import PyTorchModelHubMixin
+def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
+    """
+    grid_size: int of the grid height and width
+    return:
+    pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
+    """
+    grid_h = np.arange(grid_size, dtype=np.float32)
+    grid_w = np.arange(grid_size, dtype=np.float32)
+    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
+    grid = np.stack(grid, axis=0)
+    grid = grid.reshape([2, 1, grid_size, grid_size])
+    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
+    if cls_token:
+        pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
+    return pos_embed
+def get_2d_sincos_pos_embed_with_resolution(
+    embed_dim, grid_size, res, cls_token=False, device="cpu"
+):
+    """
+    grid_size: int of the grid height and width
+    res: array of size n, representing the resolution of a pixel (say, in meters),
+    return:
+    pos_embed: [n,grid_size*grid_size, embed_dim] or [n,1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
+    """
+    # res = torch.FloatTensor(res).to(device)
+    res = res.to(device)
+    grid_h = torch.arange(grid_size, dtype=torch.float32, device=device)
+    grid_w = torch.arange(grid_size, dtype=torch.float32, device=device)
+    grid = torch.meshgrid(
+        grid_w, grid_h, indexing="xy"
+    )  # here h goes first,direction reversed for numpy
+    grid = torch.stack(grid, dim=0)  # 2 x h x w
+    # grid = grid.reshape([2, 1, grid_size, grid_size])
+    grid = torch.einsum("chw,n->cnhw", grid, res)  # 2 x n x h x w
+    _, n, h, w = grid.shape
+    pos_embed = get_2d_sincos_pos_embed_from_grid_torch(
+        embed_dim, grid
+    )  #  # (nxH*W, D/2)
+    pos_embed = pos_embed.reshape(n, h * w, embed_dim)
+    if cls_token:
+        pos_embed = torch.cat(
+            [
+                torch.zeros(
+                    [n, 1, embed_dim], dtype=torch.float32, device=pos_embed.device
+                ),
+                pos_embed,
+            ],
+            dim=1,
+        )
+    return pos_embed
+def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
+    assert embed_dim % 2 == 0
+    # use half of dimensions to encode grid_h
+    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
+    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)
+    emb = np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)
+    return emb
+def get_2d_sincos_pos_embed_from_grid_torch(embed_dim, grid):
+    assert embed_dim % 2 == 0
+    # use half of dimensions to encode grid_h
+    emb_h = get_1d_sincos_pos_embed_from_grid_torch(
+        embed_dim // 2, grid[0]
+    )  # (H*W, D/2)
+    emb_w = get_1d_sincos_pos_embed_from_grid_torch(
+        embed_dim // 2, grid[1]
+    )  # (H*W, D/2)
+    emb = torch.cat([emb_h, emb_w], dim=1)  # (H*W, D)
+    return emb
+def get_1d_sincos_pos_embed_from_grid_torch(embed_dim, pos):
+    """
+    embed_dim: output dimension for each position
+    pos: a list of positions to be encoded: size (M,)
+    out: (M, D)
+    """
+    assert embed_dim % 2 == 0
+    old_shape = pos
+    omega = torch.arange(embed_dim // 2, dtype=torch.float32, device=pos.device)
+    omega /= embed_dim / 2.0
+    omega = 1.0 / 10000**omega  # (D/2,)
+    pos = pos.reshape(-1)  # (M,)
+    out = torch.einsum("m,d->md", pos, omega)  # (M, D/2), outer product
+    emb_sin = torch.sin(out)  # (M, D/2)
+    emb_cos = torch.cos(out)  # (M, D/2)
+    emb = torch.cat([emb_sin, emb_cos], dim=1)  # (M, D)
+    return emb
+def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
+    """
+    embed_dim: output dimension for each position
+    pos: a list of positions to be encoded: size (M,)
+    out: (M, D)
+    """
+    assert embed_dim % 2 == 0
+    omega = np.arange(embed_dim // 2, dtype=np.float32)
+    omega /= embed_dim / 2.0
+    omega = 1.0 / 10000**omega  # (D/2,)
+    pos = pos.reshape(-1)  # (M,)
+    out = np.einsum("m,d->md", pos, omega)  # (M, D/2), outer product
+    emb_sin = np.sin(out)  # (M, D/2)
+    emb_cos = np.cos(out)  # (M, D/2)
+    emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
+    return emb
+# --------------------------------------------------------
+# Interpolate position embeddings for high-resolution
+# References:
+# DeiT: https://github.com/facebookresearch/deit
+# --------------------------------------------------------
+def interpolate_pos_embed(model, checkpoint_model):
+    if "pos_embed" in checkpoint_model:
+        pos_embed_checkpoint = checkpoint_model["pos_embed"]
+        embedding_size = pos_embed_checkpoint.shape[-1]
+        num_patches = model.patch_embed.num_patches
+        num_extra_tokens = model.pos_embed.shape[-2] - num_patches
+        # height (== width) for the checkpoint position embedding
+        orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
+        # height (== width) for the new position embedding
+        new_size = int(num_patches**0.5)
+        # class_token and dist_token are kept unchanged
+        if orig_size != new_size:
+            print(
+                "Position interpolate from %dx%d to %dx%d"
+                % (orig_size, orig_size, new_size, new_size)
+            )
+            extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
+            # only the position tokens are interpolated
+            pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
+            pos_tokens = pos_tokens.reshape(
+                -1, orig_size, orig_size, embedding_size
+            ).permute(0, 3, 1, 2)
+            pos_tokens = torch.nn.functional.interpolate(
+                pos_tokens,
+                size=(new_size, new_size),
+                mode="bicubic",
+                align_corners=False,
+            )
+            pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
+            new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
+            checkpoint_model["pos_embed"] = new_pos_embed
+class PatchEmbedUnSafe(PatchEmbed):
+    """Image to Patch Embedding"""
+    def forward(self, x):
+        B, C, H, W = x.shape
+        # Dropped size check in timm
+        # assert H == self.img_size[0] and W == self.img_size[1], \
+        #     f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
+        x = self.proj(x).flatten(2).transpose(1, 2)
+        return x
+class VisionTransformer(timm.models.vision_transformer.VisionTransformer):
+    """Vision Transformer with support for global average pooling"""
+    def __init__(
+        self, cls_token_flag=False, global_pool=False, patch_size=16, in_chans=3, embed_dim=1024, **kwargs
+    ):
+        super().__init__(embed_dim=embed_dim, **kwargs)
+        self.cls_token_flag = cls_token_flag
+        self.patch_embed = PatchEmbedUnSafe(
+            img_size=224,
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim,
+        )
+        self.global_pool = global_pool
+        if self.global_pool:
+            norm_layer = kwargs["norm_layer"]
+            embed_dim = embed_dim
+            self.fc_norm = norm_layer(embed_dim)
+            del self.norm  # remove the original norm
+        del self.head
+        if self.cls_token_flag == False:
+            del self.cls_token
+        del self.pos_embed
+    def forward_features(self, x, input_res=None):
+        B, _, h, w = x.shape
+        x = self.patch_embed(x)
+        input_res = input_res.cpu()
+        num_patches = int(
+            (h * w) / (self.patch_embed.patch_size[0] * self.patch_embed.patch_size[1])
+        )
+        pos_embed = get_2d_sincos_pos_embed_with_resolution(
+            x.shape[-1],
+            int(num_patches**0.5),
+            input_res,
+            cls_token=self.cls_token_flag,
+            device=x.device,
+        )
+        if self.cls_token_flag:
+            cls_tokens = self.cls_token.expand(
+                B, -1, -1
+            )  # stole cls_tokens impl from Phil Wang, thanks
+            x = torch.cat((cls_tokens, x), dim=1)
+        x = x + pos_embed
+        x = self.pos_drop(x)
+        for blk in self.blocks:
+            x = blk(x)
+        #x = x[:, 1:, :].mean(dim=1)  # global pool without cls token
+        outcome = self.fc_norm(x)
+        return outcome
+    def forward(self, x, input_res=None):
+        x = self.forward_features(x, input_res=input_res)
+        return x
+def vit_large_patch16(**kwargs):
+    model = VisionTransformer(
+        patch_size=16,
+        embed_dim=1024,
+        depth=24,
+        num_heads=16,
+        mlp_ratio=4,
+        qkv_bias=True,
+        norm_layer=partial(nn.LayerNorm, eps=1e-6),
+        **kwargs
+    )
+    return model
+def get_ScaleMAE_model(global_pool=True, cls_token=True):
+    model = vit_large_patch16(
+            num_classes=1000,
+            drop_path_rate=0.1,
+            global_pool=global_pool,
+            cls_token_flag = cls_token
+        )
+    if global_pool:
+        assert set(msg.missing_keys) == {
+            "head.weight",
+            "head.bias",
+            "fc_norm.weight",
+            "fc_norm.bias",
+        }
+    else:
+        pass
+    return model
+class ScaleMAE_baseline(pl.LightningModule, PyTorchModelHubMixin):
+    def __init__(self, feat_dim=1024, fc_dim=1024, global_pool=False, cls_token_flag=True):
+        super().__init__()
+        self.model = get_ScaleMAE_model(global_pool= global_pool,cls_token = cls_token_flag)
+    def forward(self,x,patch_size,input_res=10.0):
+        input_res = torch.tensor([10.0]).to(x.device)
+        x = self.model(x,input_res=input_res)
+        return x