lorebianchi98 committed
Commit e7d7e74 · 1 Parent(s): 367e473

Removed MMCV dependency

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.png filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -31,7 +31,7 @@ Talking to DINO: Bridging Self-Supervised Vision Backbones with Language for Ope
 
  <div align="center">
  <figure>
- <img alt="Overview of Talk2DINO" src="./assets/overview.png" width="40%">
  </figure>
  </div>
 
@@ -43,75 +43,58 @@ Open-Vocabulary Segmentation (OVS) aims at segmenting images from free-form text
  ### Mapping CLIP Text Embeddings to DINOv2 space with Talk2DINO
  We can use Talk2DINO to map CLIP text embeddings into the DINOv2 patch embedding space.
  ```python
- import clip
- from src.model import ProjectionLayer
- import torch
- import os
  # Device setup
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
- # Configuration and weights
- proj_name = 'vitb_mlp_infonce'
- config_path = os.path.join("configs", f"{proj_name}.yaml")
- weights_path = os.path.join("weights", f"{proj_name}.pth")
- # Load Talk2DINO projection layer
- talk2dino = ProjectionLayer.from_config(config_path)
- talk2dino.load_state_dict(torch.load(weights_path, map_location=device))
- talk2dino.to(device)
- # Load CLIP model
- clip_model, clip_preprocess = clip.load("ViT-L/14", device=device, jit=False)
- tokenizer = clip.tokenize
- # Example: Tokenize and project text features
- texts = ["a cat"]
- text_tokens = tokenizer(texts).to(device)
- text_features = clip_model.encode_text(text_tokens)
- projected_text_features = talk2dino.project_clip_txt(text_features)
- ```
 
- ### Demo
- In `demo.py` we provide a simple example on how to use Talk2DINO for inference on a given image with custom textual categories. Run
 
- ```bash
- python demo.py --input custom_input_image --output custom_output_seg [--with_background] --textual_categories category_1,category_2,..
- ```
 
- Example:
- ```bash
- python demo.py --input assets/pikachu.png --output pikachu_seg.png --textual_categories pikachu,traffic_sign,forest,route
  ```
 
  Result:
  <div align="center">
  <table><tr><td><figure>
  <img alt="" src="./assets/pikachu.png" width=300>
  </figure></td><td><figure>
- <img alt="" src="./pikachu_seg.png" width=300>
  </figure></td></tr></table>
  </div>
 
  ## Installation
  ```bash
- # Create a new environment with Python 3.10
- conda create --name talk2dino python=3.10 -c conda-forge
- conda activate talk2dino
- # Install compilers for C++/CUDA extensions
- conda install -c conda-forge "gxx_linux-64=11.*" "gcc_linux-64=11.*"
- # Install CUDA toolkit and cuDNN
- conda install -c nvidia/label/cuda-11.7.0 cuda
- conda install -c nvidia/label/cuda-11.7.0 cuda-nvcc
- conda install -c conda-forge cudnn cudatoolkit=11.7.0
- # Install PyTorch 2.1 with CUDA 11.8 support
- # Note: This is crucial, as it matches the requirements of mmcv-full 1.7.2
- pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu118
- # Install other dependencies
  pip install -r requirements.txt
- pip install -U openmim
- mim install mmengine
- # Install a compatible version of mmcv-full (1.7.2) for PyTorch 2.1
- pip install mmcv-full==1.7.2 -f https://download.openmmlab.com/mmcv/dist/cu118/torch2.1.0/index.html
- # Install mmsegmentation
- pip install mmsegmentation==0.30.0
  ```
 
  <details>
  <summary>Qualitative Results</summary>
 
 
  <div align="center">
  <figure>
+ <img alt="Overview of Talk2DINO" src="./assets/overview.png" width="90%">
  </figure>
  </div>
 
  ### Mapping CLIP Text Embeddings to DINOv2 space with Talk2DINO
  We can use Talk2DINO to map CLIP text embeddings into the DINOv2 patch embedding space.
  ```python
+ import torch
+ from hf_model.talk2dino import Talk2DINO
+ from torchvision.io import read_image
+
  # Device setup
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
 
+ # Model loading
+ model = Talk2DINO.from_pretrained("lorebianchi98/Talk2DINO-ViTL").to(device).eval()
+
+ # Load the input image (resizing/normalization as in hf_demo.ipynb may be required)
+ image = read_image("assets/pikachu.png").float().unsqueeze(0).to(device) / 255.0
 
+ # Embedding generation
+ with torch.no_grad():
+     text_embed = model.encode_text("a pikachu")
+     image_embed = model.encode_image(image)
 
+ # Normalize the features to perform cosine similarity
+ text_embed = text_embed / text_embed.norm(dim=-1, keepdim=True)
+ image_embed = image_embed / image_embed.norm(dim=-1, keepdim=True)
+
+ similarity = (image_embed @ text_embed.T).squeeze(0, -1).cpu().numpy()
  ```
 
+ ### Demo
+ In `hf_demo.ipynb` we provide a simple example of how to use Talk2DINO for inference on a given image with custom textual categories.
  Result:
  <div align="center">
  <table><tr><td><figure>
  <img alt="" src="./assets/pikachu.png" width=300>
  </figure></td><td><figure>
+ <img alt="" src="./assets/pikachu_seg.png" width=300>
  </figure></td></tr></table>
  </div>
 
  ## Installation
+
+ To use the **Hugging Face interface** for inference:
+
  ```bash
+ # Clone the repository
+ git clone https://huggingface.co/lorebianchi98/Talk2DINO-ViTL
+ cd Talk2DINO-ViTL
+
+ # Install dependencies
  pip install -r requirements.txt
+
+ # Install PyTorch and torchvision with the appropriate CUDA version
+ pip install torch torchvision --index-url https://download.pytorch.org/whl/cu126
  ```
 
+ > For the **full MMCV interface** to perform evaluation on segmentation benchmarks, please refer to the [original Talk2DINO repository](https://github.com/lorebianchi98/Talk2DINO).
+
+
+
  <details>
  <summary>Qualitative Results</summary>
 
assets/pikachu_seg.png ADDED

Git LFS Details

  • SHA256: 53eb872bee3c849aeca202853fc8d38019916f2a465f8620542647c4a8baa852
  • Pointer size: 131 Bytes
  • Size of remote file: 220 kB
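The newly added `assets/pikachu_seg.png` is the kind of visual the README example can produce. As a rough illustration, the per-patch `similarity` vector from that example can be reshaped onto the DINOv2 patch grid and upsampled into a coarse mask; a minimal sketch continuing from that snippet, assuming the input side lengths are multiples of the 14-pixel patch size and that patch embeddings are returned in row-major order:

```python
# Continues from the README example above (`image`, `similarity`); illustrative post-processing only.
import torch
import torch.nn.functional as F

_, _, H, W = image.shape
h, w = H // 14, W // 14                                    # ViT-L/14 patch-grid resolution

sim_map = torch.from_numpy(similarity).float().reshape(1, 1, h, w)
sim_map = F.interpolate(sim_map, size=(H, W), mode="bilinear", align_corners=False)
mask = sim_map[0, 0] > sim_map.mean()                      # crude foreground mask for "a pikachu"
```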
hf_demo.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
hf_model/__init__.py ADDED
File without changes
hf_model/hooks.py ADDED
@@ -0,0 +1,52 @@
1
+ import torch
2
+ feats = {}
3
+ def get_self_attention(module, input, output):
4
+ feats['self_attn'] = output
5
+
6
+ def process_self_attention(output, batch_size, num_tokens, num_attn_heads, embed_dim, scale, num_global_tokens, ret_self_attn_maps=False):
7
+ qkv = output.reshape(batch_size, num_tokens, 3, num_attn_heads, embed_dim // num_attn_heads).permute(2, 0, 3, 1, 4)
8
+ q, k, v = qkv[0] * scale, qkv[1], qkv[2]
9
+ attn = q @ k.transpose(-2, -1)
10
+ self_attn_maps = attn[:, : , 0, num_global_tokens:]
11
+ self_attn = self_attn_maps.mean(dim=1)
12
+ self_attn = self_attn.softmax(dim=-1)
13
+ if ret_self_attn_maps:
14
+ return self_attn, self_attn_maps
15
+ else:
16
+ return self_attn
17
+
18
+ def get_vit_out(model: torch.nn.Module, input: torch.Tensor, output: torch.Tensor):
19
+ feats['vit_out'] = output
20
+
21
+ def get_second_last_out(model: torch.nn.Module, input: torch.Tensor, output: torch.Tensor):
22
+ feats['second_last_out'] = output
23
+
24
+ def get_all_out_tokens(model: torch.nn.Module, input: torch.Tensor, output: torch.Tensor):
25
+ feats['clip_txt_out_tokens'] = output
26
+
27
+ def get_clip_second_last_dense_out(model: torch.nn.Module, input: torch.Tensor, output: torch.Tensor):
28
+ feats['clip_second_last_out'] = output.permute(1,0,2)
29
+
30
+ def get_dinov1_patches(model: torch.nn.Module, input: torch.Tensor, output: torch.Tensor):
31
+ feats['dinov1_patches'] = output
32
+
33
+ def get_all_out_tokens(model: torch.nn.Module, input: torch.Tensor, output: torch.Tensor):
34
+ feats['clip_txt_out_tokens'] = output
35
+
36
+ def average_text_tokens(text_embeddings, mask, keep_cls=False, keep_end_seq=False):
37
+ if not keep_end_seq:
38
+ mask[torch.arange(mask.shape[0]), mask.sum(dim=1) - 1] = False # excluding end of sequence
39
+ if not keep_cls:
40
+ mask[:, 0] = False # excluding CLS token
41
+
42
+
43
+ masked_embeddings = text_embeddings * mask.unsqueeze(-1) # shape: [BS, SEQ_LEN, 512]
44
+
45
+ sum_embeddings = masked_embeddings.sum(dim=1) # shape: [BS, 512]
46
+
47
+ valid_elements = mask.sum(dim=1, keepdim=True) # shape: [BS, 1]
48
+
49
+ mean_embeddings = sum_embeddings / valid_elements # shape: [BS, 512]
50
+
51
+ return mean_embeddings
52
+
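These hooks are consumed by `DinoText.encode_image` in `hf_model/model.py`: `get_self_attention` is registered on the last DINOv2 attention block, and `process_self_attention` turns the captured qkv output into a CLS-to-patch attention map used to average the patch tokens. A minimal sketch of that wiring, assuming a DINOv2 ViT-L/14 backbone with registers and an input whose sides are multiples of 14:

```python
import torch
from hf_model.hooks import feats, get_self_attention, process_self_attention

# DINOv2 ViT-L/14 with registers: 16 heads, 1024-dim tokens, 1 CLS + 4 register tokens
backbone = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitl14_reg')
backbone.blocks[-1].attn.qkv.register_forward_hook(get_self_attention)

images = torch.randn(1, 3, 518, 518)                  # dummy batch, 518 = 37 * 14
with torch.no_grad():
    out = backbone(images, is_training=True)          # keeps per-patch tokens

bs, num_patches, embed_dim = out['x_norm_patchtokens'].shape
self_attn = process_self_attention(
    feats['self_attn'], bs, num_patches + 5,          # +5 global tokens (CLS + registers)
    num_attn_heads=16, embed_dim=embed_dim, scale=0.125, num_global_tokens=5,
)
# Self-attention-weighted average of the patch tokens (Talk2DINO's visual-side embedding)
avg_token = (self_attn.unsqueeze(-1) * out['x_norm_patchtokens']).mean(dim=1)
```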
hf_model/masker.py ADDED
@@ -0,0 +1,246 @@
1
+ # ------------------------------------------------------------------------------
2
+ # Talk2DINO
3
+ # ------------------------------------------------------------------------------
4
+ import copy
5
+ from collections import OrderedDict
6
+ import numpy as np
7
+ import torch
8
+ import torch.distributed as dist
9
+ import torch.nn as nn
10
+ import torch.nn.functional as F
11
+ import hf_model.us as us
12
+ from einops import rearrange, repeat
13
+
14
+ # from models.dinotext.gumbel import gumbel_sigmoid
15
+ from hf_model.modules import FeatureEncoder
16
+ from omegaconf import OmegaConf
17
+
18
+
19
+ def build_model(config):
20
+ model = OmegaConf.to_container(config, resolve=True)
21
+ return model
22
+
23
+ class Sim2Mask(nn.Module):
24
+ def __init__(self, init_w=1.0, init_b=0.0, gumbel_tau=1.0, learnable=True):
25
+ super().__init__()
26
+ self.init_w = init_w
27
+ self.init_b = init_b
28
+ self.gumbel_tau = gumbel_tau
29
+ self.learnable = learnable
30
+
31
+ assert not ((init_w is None) ^ (init_b is None))
32
+ if learnable:
33
+ self.w = nn.Parameter(torch.full([], float(init_w)))
34
+ self.b = nn.Parameter(torch.full([], float(init_b)))
35
+ else:
36
+ self.w = init_w
37
+ self.b = init_b
38
+
39
+ def forward(self, x, deterministic=False):
40
+ logits = x * self.w + self.b
41
+
42
+ soft_mask = torch.sigmoid(logits)
43
+ if deterministic:
44
+ hard_mask = soft_mask.gt(0.5).type(logits.dtype)
45
+ else:
46
+ hard_mask = gumbel_sigmoid(logits, hard=True, tau=self.gumbel_tau)
47
+
48
+ return hard_mask, soft_mask
49
+
50
+ def extra_repr(self):
51
+ return f'init_w={self.init_w}, init_b={self.init_b}, learnable={self.learnable}, gumbel_tau={self.gumbel_tau}'
52
+
53
+
54
+ class MaskerBackbone(nn.Module):
55
+ """Masker image encoder backbone.
56
+ """
57
+ def __init__(self, clip_visual, freeze_idx):
58
+ super().__init__()
59
+ self.transformer = copy.deepcopy(clip_visual.transformer)
60
+ self.transformer.resblocks = self.transformer.resblocks[freeze_idx:]
61
+
62
+ for block in self.transformer.resblocks:
63
+ if hasattr(block, "hook_handler"):
64
+ block.hook_handler.remove()
65
+
66
+ self.ln_post = copy.deepcopy(clip_visual.ln_post)
67
+ self.proj = copy.deepcopy(clip_visual.proj)
68
+
69
+ self.layers = len(self.transformer.resblocks)
70
+ self.patch_size = clip_visual.patch_size
71
+
72
+ self.output_dim = clip_visual.output_dim if self.proj is not None else clip_visual.width
73
+
74
+ def forward(self, x, spatial=True, ignore_last_attn=True):
75
+ if self.layers:
76
+ x = self.transformer(x, ignore_last_attn=ignore_last_attn)
77
+
78
+ x = x.permute(1, 0, 2) # LND -> NLD
79
+
80
+ if spatial:
81
+ x = self.ln_post(x)
82
+ else:
83
+ x = self.ln_post(x[:, 0, :])
84
+
85
+ if self.proj is not None:
86
+ x = x @ self.proj
87
+
88
+ return x
89
+
90
+ class MaskerImageFeatureEncoder(FeatureEncoder):
91
+ def __init__(self, backbone: nn.Module, decoder: nn.Module, ignore_last_attn: bool = True):
92
+ super().__init__()
93
+ self.ignore_last_attn = ignore_last_attn
94
+ self.patch_size = backbone.patch_size
95
+ self.backbone = backbone
96
+ self.decoder = decoder
97
+
98
+ for resblock in self.backbone.transformer.resblocks:
99
+ resblock.hook_handler = resblock.register_forward_hook(self.hook)
100
+
101
+ def _encode(self, image, image_feat):
102
+ H, W = image.shape[-2:]
103
+ h = H // self.patch_size
104
+ w = W // self.patch_size
105
+
106
+ x = self.backbone(image_feat, spatial=True, ignore_last_attn=self.ignore_last_attn) # BLC
107
+ x = rearrange(x[:, 1:], "B (H W) C -> B C H W", H=h, W=w)
108
+ x = self.decoder(x)
109
+
110
+ return x
111
+
112
+ class Masker(nn.Module):
113
+ def __init__(self, backbone, decoder, image_proj, sim2mask, ignore_last_attn, **kwargs):
114
+ super().__init__()
115
+ self.ignore_last_attn = ignore_last_attn
116
+
117
+ decoder["C"] = backbone.output_dim
118
+ decoder = MODELS.build(decoder)
119
+ decoder = nn.Sequential(OrderedDict([
120
+ ("decoder", decoder),
121
+ ("image_proj", image_proj)
122
+ ]))
123
+
124
+ self.image_encoder = MaskerImageFeatureEncoder(backbone, decoder, ignore_last_attn=ignore_last_attn)
125
+
126
+ self.sim2mask = Sim2Mask(**sim2mask)
127
+
128
+ def forward(self, image, image_feat, text_emb, deterministic=False):
129
+ B = image.size(0)
130
+ image_emb, feats = self.image_encoder(image, image_feat, ret_feats=True) # [BCHW]
131
+
132
+ image_emb_norm = us.normalize(image_emb, dim=1)
133
+ text_emb_norm = us.normalize(text_emb, dim=-1)
134
+
135
+ H, W = image_emb.shape[2:]
136
+ D = dist.get_world_size()
137
+
138
+ # simmap [B, B*D, H, W] where D is #devices
139
+ all_text_emb_norm = us.gather_cat(text_emb_norm, grad=True, contiguous_grad=True)
140
+ simmap = torch.einsum("bchw,nc->bnhw", image_emb_norm, all_text_emb_norm)
141
+ mask, soft_mask = self.sim2mask(simmap, deterministic=deterministic)
142
+
143
+ # mask [B, B*D, H, W] where D is #devices
144
+ # positive global label
145
+ pos_indices = torch.arange(B, dtype=torch.long, device=image_emb.device) + B * dist.get_rank()
146
+ pos_mask = mask[torch.arange(B), pos_indices].unsqueeze(1) # [B, 1, H, W]
147
+
148
+ offdiag = torch.ones(B, B*D, dtype=torch.bool, device=mask.device)
149
+ offdiag[torch.arange(B), pos_indices] = False
150
+
151
+ soft_pos_mask = soft_mask[torch.arange(B), pos_indices].unsqueeze(1)
152
+ soft_neg_mask = soft_mask.masked_select(offdiag[..., None, None]).view(B, B*D-1, H, W)
153
+
154
+ masks = {
155
+ "pos": pos_mask, # [B, 1, H, W]
156
+
157
+ "soft_pos": soft_pos_mask,
158
+ "soft_neg": soft_neg_mask,
159
+ "soft_all": soft_mask, # [B, N, H, W]
160
+ }
161
+
162
+ return masks, image_emb, text_emb, feats
163
+
164
+ @torch.no_grad()
165
+ def forward_seg(self, image, image_feat, text_emb, deterministic=True, hard=False):
166
+ """Make mask by 1:N matching
167
+
168
+ Args:
169
+ image [B, 3, H, W]
170
+ image_feat [L, B, C]: CLIP features
171
+ text_emb [N, C]
172
+ deterministic (bool): deterministic inference flag for gumbel noise
173
+ hard (bool): decide hard or soft returning segmentation mask.
174
+ Note that soft mask is required for proper evaluation
175
+
176
+ Return:
177
+ mask [B, N, H', W'] (H' and W' are downsampled H/W)
178
+ """
179
+ image_emb = self.image_encoder(image, image_feat) # [BCHW]
180
+
181
+ image_emb = us.normalize(image_emb, dim=1) # BCHW
182
+ text_emb = us.normalize(text_emb, dim=-1) # NC
183
+
184
+ simmap = torch.einsum("b c h w, n c -> b n h w", image_emb, text_emb)
185
+
186
+ hard_mask, soft_mask = self.sim2mask(simmap, deterministic=deterministic)
187
+ mask = hard_mask if hard else soft_mask
188
+
189
+ return mask, simmap
190
+
191
+ class DINOTextMasker(nn.Module):
192
+ def __init__(self, similarity_type="cosine"):
193
+ super().__init__()
194
+ self.sim2mask = DINOTextSim2Mask()
195
+ self.sim2mask = self.sim2mask.eval()
196
+ self.similarity_type = similarity_type
197
+
198
+ def forward(self, image, image_feat, text_emb, deterministic=False):
199
+ pass
200
+
201
+ @torch.no_grad()
202
+ def forward_seg(self, image_feat, text_emb, deterministic=True, hard=False):
203
+ """Make mask by 1:N matching
204
+
205
+ Args:
206
+ image [B, 3, H, W]
207
+ image_feat [L, B, C]: CLIP features
208
+ text_emb [N, K, C]
209
+ deterministic (bool): deterministic inference flag for gumbel noise
210
+ hard (bool): decide hard or soft returning segmentation mask.
211
+ Note that soft mask is required for proper evaluation
212
+ use_k_nn (bool): use kNN to segment
213
+ k_nn (int): number of nearest neighbors for kNN segmentation
214
+
215
+ Return:
216
+ mask [B, N, H', W'] (H' and W' are downsampled H/W)
217
+ """
218
+ b, c, h, w = image_feat.shape
219
+ n, c = text_emb.shape
220
+
221
+ if self.similarity_type == "cosine":
222
+ image_feat = us.normalize(image_feat, dim=1) # BCHW
223
+ # text_emb = us.normalize(text_emb, dim=-1) # NKC
224
+ simmap = torch.einsum("b c h w, n c -> b n h w", image_feat, text_emb)
225
+ else:
226
+ raise NotImplementedError("similarity type {} not implemented".format(self.similarity_type))
227
+
228
+ hard_mask, soft_mask = self.sim2mask(simmap, deterministic=deterministic)
229
+ mask = hard_mask if hard else soft_mask
230
+
231
+ return mask, simmap
232
+
233
+
234
+ class DINOTextSim2Mask(nn.Module):
235
+ def __init__(self, gumbel_tau=1.0):
236
+ super().__init__()
237
+ self.gumbel_tau = gumbel_tau
238
+
239
+ def forward(self, x, deterministic=False):
240
+ soft_mask = torch.sigmoid(x)
241
+ if deterministic:
242
+ hard_mask = soft_mask.gt(0.5).type(x.dtype)
243
+ else:
244
+ hard_mask = gumbel_sigmoid(x, hard=True, tau=self.gumbel_tau)
245
+
246
+ return hard_mask, soft_mask
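`DINOTextMasker.forward_seg` converts a dense feature map and a set of text embeddings living in the same space into per-category masks. A minimal sketch with illustrative shapes only; in the real pipeline the inputs are projected DINOv2 patch features and Talk2DINO text embeddings:

```python
import torch
from hf_model.masker import DINOTextMasker

masker = DINOTextMasker(similarity_type="cosine").eval()

image_feat = torch.randn(1, 1024, 37, 37)   # [B, C, H', W'] dense patch-level image features
text_emb = torch.randn(3, 1024)             # [N, C] one embedding per textual category

mask, simmap = masker.forward_seg(image_feat, text_emb, deterministic=True, hard=False)
print(mask.shape)                           # torch.Size([1, 3, 37, 37]) -- soft per-category masks
```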
hf_model/model.py ADDED
@@ -0,0 +1,757 @@
1
+ import clip
2
+ import yaml
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+
7
+ from hf_model.hooks import get_self_attention, process_self_attention, feats
8
+
9
+ class VisualProjectionLayer(nn.Module):
10
+ """
11
+ Creates a projection layer on top of the DINO encoder.
12
+ The forward method calculate the similarity between the projected DINO token and the CLIP textual CLS token.
13
+ """
14
+ def __init__(self, act=nn.Tanh(), hidden_layer=False, cosine=True, hidden_embed_dim=None, dino_embed_dim=1024, clip_embed_dim=512):
15
+ # mlp_dims list of mlp dimensions
16
+ super().__init__()
17
+ if hidden_embed_dim is None:
18
+ hidden_embed_dim = clip_embed_dim
19
+
20
+ self.linear_layer = nn.Linear(dino_embed_dim, hidden_embed_dim)
21
+ if hidden_layer:
22
+ self.linear_layer2 = nn.Linear(hidden_embed_dim, clip_embed_dim)
23
+ self.act = act
24
+ self.cosine = cosine
25
+
26
+ @classmethod
27
+ def from_config(cls, config):
28
+ if type(config) is str:
29
+ # if the configuration is a string, we treat it as a file path
30
+ with open(config, 'r') as f:
31
+ config = yaml.safe_load(f)['model']
32
+
33
+ # loading the activation function
34
+ act = config.get('act', None)
35
+ if act == 'tanh':
36
+ act = nn.Tanh()
37
+ elif act == 'relu':
38
+ act = nn.ReLU()
39
+ elif act == 'sigmoid':
40
+ act = nn.Sigmoid()
41
+ elif act is not None:
42
+ raise Exception("Unknown activation function")
43
+
44
+ model = cls(
45
+ act=act,
46
+ hidden_layer=config.get('hidden_layer', False),
47
+ cosine=config.get('cosine', True),
48
+ hidden_embed_dim=config.get('hidden_embed_dim', None) if config.get('hidden_layer', False) else None,
49
+ dino_embed_dim=config.get('dino_embed_dim', 1024),
50
+ clip_embed_dim=config.get('clip_embed_dim', 512)
51
+
52
+ )
53
+ return model
54
+
55
+
56
+ def forward(self, visual_embedding, textual_embedding, ret_similarity_matrix=True, ret_embeds=False):
57
+ visual_embedding = self.project_dino(visual_embedding)
58
+ textual_embedding = textual_embedding.float()
59
+
60
+ if self.cosine:
61
+ textual_embedding = F.normalize(textual_embedding, p=2, dim=1)
62
+ visual_embedding = F.normalize(visual_embedding, p=2, dim=1)
63
+ if ret_embeds:
64
+ return textual_embedding, visual_embedding
65
+ x = textual_embedding @ visual_embedding.transpose(1, 0)
66
+ if not ret_similarity_matrix:
67
+ x = x[torch.eye(len(x)) > 0.5] # only diagonal elements
68
+
69
+ return x
70
+
71
+ def project_dino(self, visual_embedding):
72
+ visual_embedding = visual_embedding.float()
73
+
74
+ x = self.linear_layer(visual_embedding)
75
+ if self.act:
76
+ x = self.act(x)
77
+ if hasattr(self, 'linear_layer2'):
78
+ x = self.linear_layer2(x)
79
+
80
+ return x
81
+
82
+ def __len__(self):
83
+ return sum(p.numel() for p in self.parameters())
84
+
85
+
86
+
87
+ class ProjectionLayer(nn.Module):
88
+ """
89
+ Creates a projection layer on top of the CLIP-text encoder.
90
+ The forward method calculate the similarity between the DINO CLS token and the projected CLIP textual CLS token.
91
+ """
92
+ def __init__(self, act=nn.Tanh(), hidden_layer=False, cosine=True, dino_embed_dim=1024, clip_embed_dim=512, num_attn_head=16, weight_attn_heads=None,
93
+ alignment_strategy='max_score', alpha=0.6, keep_cls=False, keep_end_seq=False):
94
+ # mlp_dims list of mlp dimensions
95
+ super().__init__()
96
+ self.num_attn_head = num_attn_head
97
+
98
+ self.linear_layer = nn.Linear(clip_embed_dim, dino_embed_dim)
99
+ if hidden_layer:
100
+ hidden_layer = 1 if hidden_layer is True else hidden_layer # ensuring compatibility with old code
101
+ # self.linear_layer2 = nn.Linear(dino_embed_dim, dino_embed_dim)
102
+ self.hidden_layers = nn.ModuleList([nn.Linear(dino_embed_dim, dino_embed_dim) for _ in range(hidden_layer)])
103
+ self.act = act
104
+ self.cosine = cosine
105
+
106
+ self.weight_attn_heads = weight_attn_heads
107
+ if weight_attn_heads == 'static':
108
+ self.attn_weights = nn.Parameter(torch.rand(self.num_attn_head))
109
+ elif weight_attn_heads == 'conditioned':
110
+ self.weight_layer1 = nn.Linear(dino_embed_dim, dino_embed_dim)
111
+ self.weight_layer2 = nn.Linear(dino_embed_dim, self.num_attn_head)
112
+
113
+ self.alignment_strategy = alignment_strategy # relevant only if we use disentangled_self_attn
114
+ self.keep_cls = keep_cls # relevant only if we use clip_txt_tokens_out
115
+ self.keep_end_seq = keep_end_seq # relevant only if we use clip_txt_tokens_out
116
+ self.alpha = alpha
117
+
118
+ @classmethod
119
+ def from_config(cls, config):
120
+ if type(config) is str:
121
+ # if the configuration is a string, we treat it as a file path
122
+ with open(config, 'r') as f:
123
+ config = yaml.safe_load(f)['model']
124
+
125
+ # loading the activation function
126
+ act = config.get('act', None)
127
+ if act == 'tanh':
128
+ act = nn.Tanh()
129
+ elif act == 'relu':
130
+ act = nn.ReLU()
131
+ elif act == 'sigmoid':
132
+ act = nn.Sigmoid()
133
+ elif act is not None:
134
+ raise Exception("Unknown activation function")
135
+
136
+ model = cls(
137
+ act=act,
138
+ hidden_layer=config.get('hidden_layer', False),
139
+ cosine=config.get('cosine', True),
140
+ dino_embed_dim=config.get('dino_embed_dim', 1024),
141
+ num_attn_head=config.get('num_attn_head', 16),
142
+ clip_embed_dim=config.get('clip_embed_dim', 512),
143
+ weight_attn_heads=config.get('weight_attn_heads', None),
144
+ alignment_strategy=config.get('alignment_strategy', 'max_score'),
145
+ alpha=config.get('alpha', 0.6),
146
+ keep_cls=config.get('keep_cls', None),
147
+ keep_end_seq=config.get('keep_end_seq', None),
148
+ )
149
+ if config.get('starting_checkpoint', None) is not None:
150
+ model.load_state_dict(torch.load(config['starting_checkpoint'], 'cpu'))
151
+
152
+ return model
153
+
154
+ def compute_similarity(self, visual_embedding, textual_embedding, text_input_mask=None, return_index=False):
155
+ if len(visual_embedding.shape) == 3 or len(textual_embedding.shape) == 3:
156
+ # at least one embedding is decomposed: either we have all textual tokens or we have all the attention head tokens
157
+
158
+ if self.alignment_strategy == 'weighted_avg':
159
+ if len(visual_embedding.shape) != 3 or len(textual_embedding.shape) != 2:
160
+ raise Exception("Alignment strategy not implemented for this type of embeddings!")
161
+ sims = torch.einsum('ik,ijk->ij', textual_embedding, visual_embedding)
162
+ sims = sims.softmax(dim=-1)
163
+ # in this case, we keep as visual_embedding the averaged token weighted by the text similarities
164
+ visual_embedding = (visual_embedding * sims.unsqueeze(dim=-1)).mean(dim=1)
165
+ sims = textual_embedding @ visual_embedding.transpose(1, 0)
166
+
167
+ # in this case we sample the visual embedding from the softmax similarities of attention heads tokens and the textual tokens
168
+ elif self.alignment_strategy == 'sampled_attn_map':
169
+ if len(visual_embedding.shape) != 3 or len(textual_embedding.shape) != 2:
170
+ raise Exception("Alignment strategy not implemented for this type of embeddings!")
171
+ sims = torch.einsum('ik,ijk->ij', textual_embedding, visual_embedding)
172
+ sims = sims.softmax(dim=-1)
173
+ # in this case, we sample from the distribution given byt text2attn-maps similarities the attention map to align
174
+ index = torch.multinomial(sims, 1).view(-1, 1, 1).expand(-1, 1, visual_embedding.shape[-1])
175
+ visual_embedding = torch.gather(visual_embedding, 1, index).squeeze(1)
176
+ sims = textual_embedding @ visual_embedding.transpose(1, 0)
177
+
178
+ elif self.alignment_strategy == 'max_score':
179
+ sims = torch.einsum('ik,ijk->ij', textual_embedding, visual_embedding)
180
+ sims = sims.softmax(dim=-1)
181
+ index = sims.argmax(dim=-1)
182
+ index_reshaped = sims.argmax(dim=-1).view(-1, 1, 1).expand(-1, 1, visual_embedding.shape[-1])
183
+ visual_embedding = torch.gather(visual_embedding, 1, index_reshaped).squeeze(1)
184
+ sims = textual_embedding @ visual_embedding.transpose(1, 0)
185
+ else:
186
+ # in this case we construct a similarity matrix between attention head tokens and textual tokens
187
+
188
+ # we ensure that both the batch embeddings have the same number of dimensions
189
+ textual_embedding = textual_embedding.unsqueeze(1) if len(textual_embedding.shape) == 2 else textual_embedding
190
+ visual_embedding = visual_embedding.unsqueeze(1) if len(visual_embedding.shape) == 2 else visual_embedding
191
+ if textual_embedding.shape[1] > 1:
192
+ assert text_input_mask is not None, "If we use all the textual embeddings, we need the input mask"
193
+ if not self.keep_end_seq:
194
+ # we take the last True value of the mask and we set it to False
195
+ text_input_mask[torch.arange(text_input_mask.shape[0]), torch.sum(text_input_mask, dim=1) - 1] = False
196
+ if not self.keep_cls:
197
+ text_input_mask[:, 0] = False
198
+
199
+ # do not consider cls and eos tokens
200
+ im_set = visual_embedding
201
+ s_seq = textual_embedding
202
+
203
+ im_set_batch = im_set.size(0)
204
+ im_set_len = im_set.size(1)
205
+ s_seq_batch = s_seq.size(0)
206
+ s_seq_len = s_seq.size(1)
207
+
208
+ im_set = im_set.unsqueeze(1).expand(-1, s_seq_batch, -1, -1) # B x B x S_im x dim
209
+ s_seq = s_seq.unsqueeze(0).expand(im_set_batch, -1, -1, -1) # B x B x S_s x dim
210
+ alignments = torch.matmul(im_set, s_seq.permute(0, 1, 3, 2)) # B x B x S_im x S_s
211
+
212
+ # compute mask for the alignments tensor
213
+ if text_input_mask is not None:
214
+ alignment_mask = text_input_mask.unsqueeze(1).unsqueeze(0).expand(im_set_batch, -1, im_set_len, -1).logical_not()
215
+
216
+ alignments.masked_fill_(alignment_mask, value=0)
217
+ # alignments = F.relu(alignments)
218
+ # alignments = F.normalize(alignments,p=2, dim=2)
219
+
220
+ if self.alignment_strategy == 'sum':
221
+ sims = alignments.sum(dim=(2,3))
222
+ elif self.alignment_strategy == 'mean':
223
+ sims = alignments.mean(dim=(2,3))
224
+ elif self.alignment_strategy == 'max-row_sum':
225
+ sims = alignments.max(2)[0].sum(2)
226
+ elif self.alignment_strategy == 'nucleus-sampling':
227
+ max_alignments = alignments.max(2)[0]
228
+ sorted_alignments = max_alignments.sort(dim=2, descending=True)[0]
229
+ # min-max normalization
230
+ mins = sorted_alignments.min(2)[0].unsqueeze(-1).expand(-1, -1, s_seq_len)
231
+ maxs = sorted_alignments.max(2)[0].unsqueeze(-1).expand(-1, -1, s_seq_len)
232
+ norm_alignments = ((sorted_alignments - mins) / (maxs - mins))
233
+ # transform values in percentage
234
+ sums = norm_alignments.sum(dim=-1).unsqueeze(-1).expand(-1, -1, s_seq_len)
235
+ norm_alignments = norm_alignments / sums
236
+ # finding the element indices which surpasses alpha
237
+ cumsums = norm_alignments.cumsum(2)
238
+ indices = torch.argmax((cumsums > self.alpha).int() + 1, dim=2)
239
+
240
+ mask = torch.arange(s_seq_len).unsqueeze(0).unsqueeze(0).expand(s_seq_batch, s_seq_batch, s_seq_len).to(indices.device) < indices.unsqueeze(-1).expand(-1, -1, s_seq_len) + 1
241
+ relevant_alignments = (sorted_alignments * mask)
242
+ sims = relevant_alignments.sum(dim=2)
243
+ else:
244
+ # default case: dot-product
245
+ sims = textual_embedding @ visual_embedding.transpose(1, 0)
246
+
247
+ if not return_index:
248
+ return sims
249
+ else:
250
+ return sims, index
251
+
252
+
253
+
254
+ def forward(self, visual_embedding, textual_embedding, ret_similarity_matrix=True, ret_embeds=False, self_attn_maps=None, cls=None, text_input_mask=None, return_index=False):
255
+ if self.weight_attn_heads is not None:
256
+ assert self_attn_maps is not None, "In case we have attention maps weights, we have to weight patch tokens mean by the weighted self-attention maps"
257
+ visual_embedding = self.get_visual_embed(visual_embedding, self_attn_maps=self_attn_maps, cls=cls)
258
+
259
+ textual_embedding = self.project_clip_txt(textual_embedding)
260
+
261
+ if self.cosine:
262
+ textual_embedding = F.normalize(textual_embedding, p=2, dim=-1)
263
+ visual_embedding = F.normalize(visual_embedding, p=2, dim=-1)
264
+
265
+
266
+ if ret_embeds:
267
+ return textual_embedding, visual_embedding
268
+
269
+ if not return_index:
270
+ x = self.compute_similarity(visual_embedding, textual_embedding, text_input_mask, return_index)
271
+ else:
272
+ x, index = self.compute_similarity(visual_embedding, textual_embedding, text_input_mask, return_index)
273
+
274
+ if not ret_similarity_matrix:
275
+ x = x[torch.eye(len(x)) > 0.5] # only diagonal elements
276
+
277
+ if not return_index:
278
+ return x
279
+ else:
280
+ return x, index
281
+
282
+ def get_visual_embed(self, visual_embedding, self_attn_maps=None, cls=None):
283
+ if self_attn_maps is not None:
284
+ # we weight each attention head to obtain a weighted self-attention map
285
+ assert len(visual_embedding.shape) == 3, "In case we have attention maps weights, the visual_embedding should contain patch embeddings, with shape BS x NUM_PATCHES x EMBED_DIM"
286
+ if self.weight_attn_heads == 'conditioned':
287
+ assert cls is not None, "cls must be set in case of dynamic attention weighting"
288
+ x = self.weight_layer1(cls)
289
+ x = self.act(x)
290
+ x = self.weight_layer2(x)
291
+ normalized_attn_weights = x.softmax(dim=1)
292
+ self_attn = (self_attn_maps * normalized_attn_weights.unsqueeze(dim=-1)).mean(dim=1)
293
+ else:
294
+ normalized_attn_weights = self.attn_weights.softmax(dim=0)
295
+ self_attn = (self_attn_maps * normalized_attn_weights.view(1, normalized_attn_weights.shape[0], 1)).mean(dim=1)
296
+ self_attn = self_attn.softmax(dim=-1)
297
+
298
+ # then we perform the weighted mean of patches
299
+ visual_embedding = (self_attn.unsqueeze(-1) * visual_embedding).mean(dim=1)
300
+ return visual_embedding
301
+
302
+ def project_clip_txt(self, textual_embedding):
303
+ textual_embedding = textual_embedding.float()
304
+ x = self.linear_layer(textual_embedding)
305
+
306
+ if hasattr(self, 'hidden_layers'):
307
+ for hidden_layer in self.hidden_layers:
308
+ if self.act:
309
+ x = self.act(x)
310
+ x = hidden_layer(x)
311
+
312
+ return x
313
+ def load_state_dict(self, state_dict, strict=True):
314
+ # compatibility with old code
315
+ if 'linear_layer2.weight' in state_dict:
316
+ state_dict['hidden_layers.0.weight'] = state_dict.pop('linear_layer2.weight')
317
+ state_dict['hidden_layers.0.bias'] = state_dict.pop('linear_layer2.bias')
318
+ # Call the parent class's load_state_dict with the modified state_dict
319
+ super(ProjectionLayer, self).load_state_dict(state_dict, strict)
320
+
321
+ def set_alignment_strategy(self, alignment_strategy):
322
+ self.alignment_strategy = alignment_strategy
323
+ return
324
+
325
+ def __len__(self):
326
+ return sum(p.numel() for p in self.parameters())
327
+
328
+ class DoubleMLP(nn.Module):
329
+ def __init__(self, act=nn.Tanh(), hidden_layer=False, cosine=True, dino_embed_dim=1024, clip_embed_dim=512, num_attn_head=16, weight_attn_heads=None,
330
+ alignment_strategy='max_score', alpha=0.6, keep_cls=False, keep_end_seq=False):
331
+ super().__init__()
332
+ self.num_attn_head = num_attn_head
333
+
334
+ self.linear_layer = nn.Linear(clip_embed_dim, dino_embed_dim)
335
+ if hidden_layer:
336
+ hidden_layer = 1 if hidden_layer is True else hidden_layer # ensuring compatibility with old code
337
+ # self.linear_layer2 = nn.Linear(dino_embed_dim, dino_embed_dim)
338
+ self.hidden_layers = nn.ModuleList([nn.Linear(dino_embed_dim, dino_embed_dim) for _ in range(hidden_layer)])
339
+ self.act = act
340
+ self.cosine = cosine
341
+
342
+ self.weight_attn_heads = weight_attn_heads
343
+ if weight_attn_heads == 'static':
344
+ self.attn_weights = nn.Parameter(torch.rand(self.num_attn_head))
345
+ elif weight_attn_heads == 'conditioned':
346
+ self.weight_layer1 = nn.Linear(dino_embed_dim, dino_embed_dim)
347
+ self.weight_layer2 = nn.Linear(dino_embed_dim, self.num_attn_head)
348
+
349
+ self.alignment_strategy = alignment_strategy # relevant only if we use disentangled_self_attn
350
+ self.keep_cls = keep_cls # relevant only if we use clip_txt_tokens_out
351
+ self.keep_end_seq = keep_end_seq # relevant only if we use clip_txt_tokens_out
352
+ self.alpha = alpha
353
+
354
+ self.visual_linear = nn.Linear(dino_embed_dim, dino_embed_dim)
355
+ if hidden_layer:
356
+ hidden_layer = 1 if hidden_layer is True else hidden_layer # ensuring compatibility with old code
357
+ self.visual_hidden_layers = nn.ModuleList([nn.Linear(dino_embed_dim, dino_embed_dim) for _ in range(hidden_layer)])
358
+
359
+ @classmethod
360
+ def from_config(cls, config):
361
+ if type(config) is str:
362
+ # if the configuration is a string, we treat it as a file path
363
+ with open(config, 'r') as f:
364
+ config = yaml.safe_load(f)['model']
365
+
366
+ # loading the activation function
367
+ act = config.get('act', None)
368
+ if act == 'tanh':
369
+ act = nn.Tanh()
370
+ elif act == 'relu':
371
+ act = nn.ReLU()
372
+ elif act == 'sigmoid':
373
+ act = nn.Sigmoid()
374
+ elif act is not None:
375
+ raise Exception("Unknown activation function")
376
+
377
+ model = cls(
378
+ act=act,
379
+ hidden_layer=config.get('hidden_layer', False),
380
+ cosine=config.get('cosine', True),
381
+ dino_embed_dim=config.get('dino_embed_dim', 1024),
382
+ num_attn_head=config.get('num_attn_head', 16),
383
+ clip_embed_dim=config.get('clip_embed_dim', 512),
384
+ weight_attn_heads=config.get('weight_attn_heads', None),
385
+ alignment_strategy=config.get('alignment_strategy', 'max_score'),
386
+ alpha=config.get('alpha', 0.6),
387
+ keep_cls=config.get('keep_cls', None),
388
+ keep_end_seq=config.get('keep_end_seq', None),
389
+ )
390
+ if config.get('starting_checkpoint', None) is not None:
391
+ model.load_state_dict(torch.load(config['starting_checkpoint'], 'cpu'))
392
+
393
+ return model
394
+
395
+ def compute_similarity(self, visual_embedding, textual_embedding, text_input_mask=None):
396
+ if len(visual_embedding.shape) == 3 or len(textual_embedding.shape) == 3:
397
+ # at least one embedding is decomposed: either we have all textual tokens or we have all the attention head tokens
398
+
399
+ if self.alignment_strategy == 'weighted_avg':
400
+ if len(visual_embedding.shape) != 3 or len(textual_embedding.shape) != 2:
401
+ raise Exception("Alignment strategy not implemented for this type of embeddings!")
402
+ sims = torch.einsum('ik,ijk->ij', textual_embedding, visual_embedding)
403
+ sims = sims.softmax(dim=-1)
404
+ # in this case, we keep as visual_embedding the averaged token weighted by the text similarities
405
+ visual_embedding = (visual_embedding * sims.unsqueeze(dim=-1)).mean(dim=1)
406
+ sims = textual_embedding @ visual_embedding.transpose(1, 0)
407
+
408
+ # in this case we sample the visual embedding from the softmax similarities of attention heads tokens and the textual tokens
409
+ elif self.alignment_strategy == 'sampled_attn_map':
410
+ if len(visual_embedding.shape) != 3 or len(textual_embedding.shape) != 2:
411
+ raise Exception("Alignment strategy not implemented for this type of embeddings!")
412
+ sims = torch.einsum('ik,ijk->ij', textual_embedding, visual_embedding)
413
+ sims = sims.softmax(dim=-1)
414
+ # in this case, we sample from the distribution given byt text2attn-maps similarities the attention map to align
415
+ index = torch.multinomial(sims, 1).view(-1, 1, 1).expand(-1, 1, visual_embedding.shape[-1])
416
+ visual_embedding = torch.gather(visual_embedding, 1, index).squeeze(1)
417
+ sims = textual_embedding @ visual_embedding.transpose(1, 0)
418
+
419
+ elif self.alignment_strategy == 'max_score':
420
+ sims = torch.einsum('ik,ijk->ij', textual_embedding, visual_embedding)
421
+ sims = sims.softmax(dim=-1)
422
+ index = sims.argmax(dim=-1).view(-1, 1, 1).expand(-1, 1, visual_embedding.shape[-1])
423
+ visual_embedding = torch.gather(visual_embedding, 1, index).squeeze(1)
424
+ sims = textual_embedding @ visual_embedding.transpose(1, 0)
425
+ else:
426
+ # in this case we construct a similarity matrix between attention head tokens and textual tokens
427
+
428
+ # we ensure that both the batch embeddings have the same number of dimensions
429
+ textual_embedding = textual_embedding.unsqueeze(1) if len(textual_embedding.shape) == 2 else textual_embedding
430
+ visual_embedding = visual_embedding.unsqueeze(1) if len(visual_embedding.shape) == 2 else visual_embedding
431
+ if textual_embedding.shape[1] > 1:
432
+ assert text_input_mask is not None, "If we use all the textual embeddings, we need the input mask"
433
+ if not self.keep_end_seq:
434
+ # we take the last True value of the mask and we set it to False
435
+ text_input_mask[torch.arange(text_input_mask.shape[0]), torch.sum(text_input_mask, dim=1) - 1] = False
436
+ if not self.keep_cls:
437
+ text_input_mask[:, 0] = False
438
+
439
+ # do not consider cls and eos tokens
440
+ im_set = visual_embedding
441
+ s_seq = textual_embedding
442
+
443
+ im_set_batch = im_set.size(0)
444
+ im_set_len = im_set.size(1)
445
+ s_seq_batch = s_seq.size(0)
446
+ s_seq_len = s_seq.size(1)
447
+
448
+ im_set = im_set.unsqueeze(1).expand(-1, s_seq_batch, -1, -1) # B x B x S_im x dim
449
+ s_seq = s_seq.unsqueeze(0).expand(im_set_batch, -1, -1, -1) # B x B x S_s x dim
450
+ alignments = torch.matmul(im_set, s_seq.permute(0, 1, 3, 2)) # B x B x S_im x S_s
451
+
452
+ # compute mask for the alignments tensor
453
+ if text_input_mask is not None:
454
+ alignment_mask = text_input_mask.unsqueeze(1).unsqueeze(0).expand(im_set_batch, -1, im_set_len, -1).logical_not()
455
+
456
+ alignments.masked_fill_(alignment_mask, value=0)
457
+ # alignments = F.relu(alignments)
458
+ # alignments = F.normalize(alignments,p=2, dim=2)
459
+
460
+ if self.alignment_strategy == 'sum':
461
+ sims = alignments.sum(dim=(2,3))
462
+ elif self.alignment_strategy == 'mean':
463
+ sims = alignments.mean(dim=(2,3))
464
+ elif self.alignment_strategy == 'max-row_sum':
465
+ sims = alignments.max(2)[0].sum(2)
466
+ elif self.alignment_strategy == 'nucleus-sampling':
467
+ max_alignments = alignments.max(2)[0]
468
+ sorted_alignments = max_alignments.sort(dim=2, descending=True)[0]
469
+ # min-max normalization
470
+ mins = sorted_alignments.min(2)[0].unsqueeze(-1).expand(-1, -1, s_seq_len)
471
+ maxs = sorted_alignments.max(2)[0].unsqueeze(-1).expand(-1, -1, s_seq_len)
472
+ norm_alignments = ((sorted_alignments - mins) / (maxs - mins))
473
+ # transform values in percentage
474
+ sums = norm_alignments.sum(dim=-1).unsqueeze(-1).expand(-1, -1, s_seq_len)
475
+ norm_alignments = norm_alignments / sums
476
+ # finding the element indices which surpasses alpha
477
+ cumsums = norm_alignments.cumsum(2)
478
+ indices = torch.argmax((cumsums > self.alpha).int() + 1, dim=2)
479
+
480
+ mask = torch.arange(s_seq_len).unsqueeze(0).unsqueeze(0).expand(s_seq_batch, s_seq_batch, s_seq_len).to(indices.device) < indices.unsqueeze(-1).expand(-1, -1, s_seq_len) + 1
481
+ relevant_alignments = (sorted_alignments * mask)
482
+ sims = relevant_alignments.sum(dim=2)
483
+ else:
484
+ # default case: dot-product
485
+ sims = textual_embedding @ visual_embedding.transpose(1, 0)
486
+
487
+ return sims
488
+
489
+
490
+
491
+ def forward(self, visual_embedding, textual_embedding, ret_similarity_matrix=True, ret_embeds=False, self_attn_maps=None, cls=None, text_input_mask=None):
492
+ if self.weight_attn_heads is not None:
493
+ assert self_attn_maps is not None, "In case we have attention maps weights, we have to weight patch tokens mean by the weighted self-attention maps"
494
+ visual_embedding = self.get_visual_embed(visual_embedding, self_attn_maps=self_attn_maps, cls=cls)
495
+
496
+ visual_embedding = self.project_visual(visual_embedding)
497
+
498
+ textual_embedding = self.project_clip_txt(textual_embedding)
499
+
500
+ if self.cosine:
501
+ textual_embedding = F.normalize(textual_embedding, p=2, dim=-1)
502
+ visual_embedding = F.normalize(visual_embedding, p=2, dim=-1)
503
+
504
+
505
+ if ret_embeds:
506
+ return textual_embedding, visual_embedding
507
+
508
+ x = self.compute_similarity(visual_embedding, textual_embedding, text_input_mask)
509
+ if not ret_similarity_matrix:
510
+ x = x[torch.eye(len(x)) > 0.5] # only diagonal elements
511
+
512
+ return x
513
+
514
+ def get_visual_embed(self, visual_embedding, self_attn_maps=None, cls=None):
515
+ if self_attn_maps is not None:
516
+ # we weight each attention head to obtain a weighted self-attention map
517
+ assert len(visual_embedding.shape) == 3, "In case we have attention maps weights, the visual_embedding should contain patch embeddings, with shape BS x NUM_PATCHES x EMBED_DIM"
518
+ if self.weight_attn_heads == 'conditioned':
519
+ assert cls is not None, "cls must be set in case of dynamic attention weighting"
520
+ x = self.weight_layer1(cls)
521
+ x = self.act(x)
522
+ x = self.weight_layer2(x)
523
+ normalized_attn_weights = x.softmax(dim=1)
524
+ self_attn = (self_attn_maps * normalized_attn_weights.unsqueeze(dim=-1)).mean(dim=1)
525
+ else:
526
+ normalized_attn_weights = self.attn_weights.softmax(dim=0)
527
+ self_attn = (self_attn_maps * normalized_attn_weights.view(1, normalized_attn_weights.shape[0], 1)).mean(dim=1)
528
+ self_attn = self_attn.softmax(dim=-1)
529
+
530
+ # then we perform the weighted mean of patches
531
+ visual_embedding = (self_attn.unsqueeze(-1) * visual_embedding).mean(dim=1)
532
+ return visual_embedding
533
+
534
+ def project_clip_txt(self, textual_embedding):
535
+ textual_embedding = textual_embedding.float()
536
+ x = self.linear_layer(textual_embedding)
537
+
538
+ for hidden_layer in self.hidden_layers:
539
+ if self.act:
540
+ x = self.act(x)
541
+ x = hidden_layer(x)
542
+
543
+ return x
544
+
545
+ def project_visual(self, visual_embedding):
546
+ visual_embedding = visual_embedding.float()
547
+ x = self.visual_linear(visual_embedding)
548
+
549
+ for hidden_layer in self.visual_hidden_layers:
550
+ if self.act:
551
+ x = self.act(x)
552
+ x = hidden_layer(x)
553
+
554
+ return x
555
+
556
+ def load_state_dict(self, state_dict, strict=True):
557
+ # compatibility with old code
558
+ if 'linear_layer2.weight' in state_dict:
559
+ state_dict['hidden_layers.0.weight'] = state_dict.pop('linear_layer2.weight')
560
+ state_dict['hidden_layers.0.bias'] = state_dict.pop('linear_layer2.bias')
561
+ # Call the parent class's load_state_dict with the modified state_dict
562
+ super(DoubleMLP, self).load_state_dict(state_dict, strict)
563
+
564
+ def set_alignment_strategy(self, alignment_strategy):
565
+ self.alignment_strategy = alignment_strategy
566
+ return
567
+
568
+ def __len__(self):
569
+ return sum(p.numel() for p in self.parameters())
570
+
571
+
572
+ class CLIPLastLayer(nn.Module):
573
+ def __init__(self, act=nn.Tanh(), hidden_layer=False, cosine=True, dino_embed_dim=1024, clip_embed_dim=512, weight_attn_heads=None, alignment_strategy='max_score', clip_model='ViT-B/16', text_input_mask=None, projection_weights=None):
574
+ import clip
575
+ super().__init__()
576
+ self.clip_model, _ = clip.load(clip_model)
577
+ self.clip_model.to(dtype=torch.float32)
578
+ # self.last_resblock = copy.deepcopy(self.clip_model.transformer.resblocks[-1])
579
+ self.last_resblock = self.clip_model.transformer.resblocks[-1]
580
+ # self.last_resblock.requires_grad_(False)
581
+ # self.last_ln = copy.deepcopy(self.clip_model.ln_final)
582
+ self.last_ln = self.clip_model.ln_final
583
+ # self.last_ln.requires_grad_(False)
584
+ # self.clip_text_proj = copy.deepcopy(self.clip_model.text_projection)
585
+ self.clip_text_proj = self.clip_model.text_projection
586
+ # self.clip_text_proj.requires_grad_(False)
587
+ self.clip_dtype = self.clip_model.dtype
588
+ del self.clip_model
589
+
590
+ self.projection_layer = ProjectionLayer(act=act, hidden_layer=hidden_layer, cosine=cosine, dino_embed_dim=dino_embed_dim,
591
+ clip_embed_dim=clip_embed_dim, weight_attn_heads=weight_attn_heads, alignment_strategy=alignment_strategy)
592
+
593
+ if projection_weights is not None:
594
+ self.projection_layer.load_state_dict(torch.load(projection_weights, 'cpu'))
595
+
596
+ def forward(self, visual_embedding, textual_embedding, ret_similarity_matrix=True, ret_embeds=False, self_attn_maps=None, cls=None, text_argmax=None, text_input_mask=None):
597
+ x = self.last_resblock(textual_embedding.permute(1, 0, 2))
598
+ x = x.permute(1, 0, 2)
599
+ x = self.last_ln(x).type(self.clip_dtype)
600
+ x = x[torch.arange(x.shape[0]), text_argmax] @ self.clip_text_proj
601
+ if ret_embeds:
602
+ textual_embedding, visual_embedding = self.projection_layer(visual_embedding, x, ret_similarity_matrix=ret_similarity_matrix, ret_embeds=ret_embeds, self_attn_maps=self_attn_maps, cls=cls)
603
+ return textual_embedding, visual_embedding
604
+ x = self.projection_layer(visual_embedding, x, ret_similarity_matrix=ret_similarity_matrix, ret_embeds=ret_embeds, self_attn_maps=self_attn_maps, cls=cls)
605
+ return x
606
+
607
+ def project_clip_txt(self, textual_embedding, text_argmax):
608
+ x = self.last_resblock(textual_embedding.permute(1, 0, 2))
609
+ x = x.permute(1, 0, 2)
610
+ x = self.last_ln(x).type(self.clip_dtype)
611
+ x = x[torch.arange(x.shape[0]), text_argmax] @ self.clip_text_proj
612
+ x = self.projection_layer.project_clip_txt(x)
613
+ return x
614
+
615
+ @classmethod
616
+ def from_config(cls, config):
617
+ if type(config) is str:
618
+ # if the configuration is a string, we treat it as a file path
619
+ with open(config, 'r') as f:
620
+ config = yaml.safe_load(f)['model']
621
+
622
+ # loading the activation function
623
+ act = config.get('act', None)
624
+ if act == 'tanh':
625
+ act = nn.Tanh()
626
+ elif act == 'relu':
627
+ act = nn.ReLU()
628
+ elif act == 'sigmoid':
629
+ act = nn.Sigmoid()
630
+ elif act is not None:
631
+ raise Exception("Unknown activation function")
632
+
633
+ model = cls(
634
+ act=act,
635
+ hidden_layer=config.get('hidden_layer', False),
636
+ cosine=config.get('cosine', True),
637
+ dino_embed_dim=config.get('dino_embed_dim', 1024),
638
+ clip_embed_dim=config.get('clip_embed_dim', 512),
639
+ weight_attn_heads=config.get('weight_attn_heads', None),
640
+ alignment_strategy=config.get('alignment_strategy', 'max_score'),
641
+ clip_model=config.get('clip_model', 'ViT-B/16'),
642
+ projection_weights=config.get('projection_weights', None),
643
+
644
+ )
645
+ if config.get('starting_checkpoint', None) is not None:
646
+ model.load_state_dict(torch.load(config['starting_checkpoint'], 'cpu'))
647
+
648
+ return model
649
+
650
+ def __len__(self):
651
+ return sum(p.numel() for p in self.parameters())
652
+
653
+ class DinoText(nn.Module):
654
+ """
655
+ Project images and texts into DINOv2 latent space.
656
+ """
657
+ def __init__(self, dino_cfg="dinov2_vitl14_reg", clip_cfg="ViT-B/16", projection_cfg="configs/linear.yaml", projection_weights="weights/linear_avg_self_attn_out.pth", freeze_text_encoder=True, avg_self_attn_token=True, use_disentangled_self_attn=False):
658
+ super().__init__()
659
+ # DINO parameters
660
+ self.num_global_tokens = 1 if "reg" not in dino_cfg else 5
661
+ self.embed_dim = 1024 if "vitl" in dino_cfg else 768
662
+ self.num_attn_heads = 16
663
+ self.scale = 0.125
664
+
665
+ self.visual_backbone = torch.hub.load('facebookresearch/dinov2', dino_cfg)
666
+ self.text_backbone, _ = clip.load(clip_cfg)
667
+ self.clip2dino_proj = ProjectionLayer.from_config(projection_cfg)
668
+ if projection_weights is not None:
669
+ self.clip2dino_proj.load_state_dict(torch.load(projection_weights, 'cpu'))
670
+ self.use_avg_self_attn = avg_self_attn_token
671
+ self.use_disentangled_self_attn = use_disentangled_self_attn
672
+ if self.use_avg_self_attn or self.use_disentangled_self_attn:
673
+ self.visual_backbone.blocks[-1].attn.qkv.register_forward_hook(get_self_attention)
674
+ if self.use_disentangled_self_attn:
675
+ self.visual_backbone.blocks[-1].attn.qkv.register_forward_hook(get_self_attention)
676
+ if freeze_text_encoder:
677
+ self.text_backbone.eval()
678
+ self.text_backbone.requires_grad_(False)
679
+ self.avg_self_attn_token = avg_self_attn_token
680
+ if self.avg_self_attn_token or self.use_disentangled_self_attn:
681
+ self.visual_backbone.blocks[-1].attn.qkv.register_forward_hook(self.get_self_attention)
682
+ self.feats = {}
683
+ self.num_global_tokens = 1 if "reg" not in dino_cfg else 5
684
+ self.num_attn_heads = 16
685
+ self.scale = 0.125
686
+
687
+
688
+ @classmethod
689
+ def from_config(cls, cfg):
690
+ if type(cfg) is str:
691
+ # if the configuration is a string, we treat it as a file path
692
+ with open(cfg, 'r') as f:
693
+ cfg = yaml.safe_load(f)['model']
694
+
695
+ model = cls(
696
+ dino_cfg=cfg.get('dino_cfg', "dinov2_vitl14_reg"),
697
+ clip_cfg=cfg.get('clip_cfg', "ViT-B/16"),
698
+ projection_cfg=cfg.get('projection_cfg', "configs/linear.yaml"),
699
+ projection_weights=cfg.get('projection_weights', None),
700
+ avg_self_attn_token=cfg.get('use_avg_self_attn', False),
701
+ use_disentangled_self_attn=cfg.get('use_disentangled_self_attn', False),
702
+ )
703
+ return model
704
+
705
+ def encode_text(self, tokenized_texts):
706
+ x = self.text_backbone.encode_text(tokenized_texts)
707
+ x = self.clip2dino_proj.project_clip_txt(x)
708
+ return x
709
+
710
+ def encode_image(self, images):
711
+ batch_size, _, _, _ = images.shape
712
+ x = self.visual_backbone(images, is_training=self.avg_self_attn_token or self.use_disentangled_self_attn)
713
+ if self.avg_self_attn_token:
714
+ batch_size, num_tokens, embed_dim = x['x_norm_patchtokens'].shape
715
+ num_tokens = num_tokens + self.num_global_tokens
716
+ self_attn = self.process_self_attention(self.feats['self_attn'], batch_size, num_tokens, self.num_attn_heads, embed_dim, self.scale, self.num_global_tokens)
717
+ x = (self_attn.unsqueeze(-1) * x['x_norm_patchtokens']).mean(dim=1)
718
+ if self.use_disentangled_self_attn:
719
+ batch_size, num_tokens, embed_dim = x['x_norm_patchtokens'].shape
720
+ num_tokens = num_tokens + self.num_global_tokens
721
+ self_attn, self_attn_maps = self.process_self_attention(self.feats['self_attn'], batch_size, num_tokens, self.num_attn_heads, embed_dim, self.scale, self.num_global_tokens, ret_self_attn_maps=True)
722
+ self_attn_maps = self_attn_maps.softmax(dim=-1)
723
+ x = (x['x_norm_patchtokens'].unsqueeze(1) * self_attn_maps.unsqueeze(-1)).mean(dim=2)
724
+ return x
725
+
726
+ def get_self_attention(self, module, input, output):
727
+ self.feats['self_attn'] = output
728
+
729
+ def process_self_attention(self, output, batch_size, num_tokens, num_attn_heads, embed_dim, scale, num_global_tokens, ret_self_attn_maps=False):
730
+ qkv = output.reshape(batch_size, num_tokens, 3, num_attn_heads, embed_dim // num_attn_heads).permute(2, 0, 3, 1, 4)
731
+ q, k, v = qkv[0] * scale, qkv[1], qkv[2]
732
+ attn = q @ k.transpose(-2, -1)
733
+ self_attn_maps = attn[:, : , 0, num_global_tokens:]
734
+ self_attn = self_attn_maps.mean(dim=1)
735
+ self_attn = self_attn.softmax(dim=-1)
736
+ if ret_self_attn_maps:
737
+ return self_attn, self_attn_maps
738
+ else:
739
+ return self_attn
740
+
741
+ def forward(self, images, tokenized_texts, cosine=True, ret_similarity_matrix=True):
742
+ img_embed = self.encode_image(images)
743
+ txt_embed = self.encode_text(tokenized_texts)
744
+
745
+ if cosine:
746
+ img_embed = F.normalize(img_embed, p=2, dim=1)
747
+ txt_embed = F.normalize(txt_embed, p=2, dim=1)
748
+ x = img_embed @ txt_embed.transpose(1, 0)
749
+ if not ret_similarity_matrix:
750
+ x = x[torch.eye(len(x)) > 0.5] # only diagonal elements
751
+
752
+ return x
753
+
754
+ def __len__(self):
755
+ def count_parameters(model):
756
+ return sum(p.numel() for p in model.parameters())
757
+ return count_parameters(self.visual_backbone) + count_parameters(self.clip2dino_proj) + count_parameters(self.text_backbone.transformer)
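`ProjectionLayer.from_config` accepts either a YAML path or a plain dict, so the CLIP-to-DINOv2 text projection can also be instantiated on its own. A minimal sketch; the hyper-parameters below are illustrative, the real values come from the repository's configuration and checkpoint:

```python
import torch
from hf_model.model import ProjectionLayer

# Illustrative configuration: project 512-d CLIP text embeddings into the 1024-d DINOv2 space
proj = ProjectionLayer.from_config({
    "act": "tanh",
    "hidden_layer": True,
    "clip_embed_dim": 512,
    "dino_embed_dim": 1024,
})

clip_text_features = torch.randn(2, 512)     # e.g. the output of clip_model.encode_text(...)
dino_space_features = proj.project_clip_txt(clip_text_features)
print(dino_space_features.shape)             # torch.Size([2, 1024])
```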
hf_model/modules.py ADDED
@@ -0,0 +1,243 @@
1
+ # ------------------------------------------------------------------------------
2
+ # FreeDA
3
+ # ------------------------------------------------------------------------------
4
+ from functools import partial
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+ from einops import rearrange, repeat
9
+
10
+
11
+ class BLCModuleCompatibleBCHW(nn.Module):
12
+ def forward_blc(self, x):
13
+ raise NotImplementedError()
14
+
15
+ def forward(self, x):
16
+ is2d = x.ndim == 4
17
+ if is2d:
18
+ _, _, H, W = x.shape
19
+ x = rearrange(x, "B C H W -> B (H W) C")
20
+
21
+ x = self.forward_blc(x)
22
+
23
+ if is2d:
24
+ x = rearrange(x, "B (H W) C -> B C H W", H=H, W=W)
25
+
26
+ return x
27
+
28
+
29
+ class FeatureEncoder(nn.Module):
30
+ """Encoder + Feature extractor
31
+ """
32
+ def __init__(self, safe=True):
33
+ super().__init__()
34
+ self.safe = safe  # clone returned features to protect them from later in-place modification
35
+ self._features = []
36
+
37
+ def hook(self, module, input, output):
38
+ self._features.append(output)
39
+
40
+ def clear_features(self):
41
+ self._features.clear()
42
+
43
+ def _encode(self, x):
44
+ raise NotImplementedError()
45
+
46
+ def forward(self, *args, ret_feats=False, **kwargs):
47
+ self.clear_features()
48
+
49
+ x = self._encode(*args, **kwargs)
50
+
51
+ if ret_feats:
52
+ if self.safe:
53
+ features = [t.clone() for t in self._features]
54
+ self.clear_features()
55
+ else:
56
+ features = self._features
57
+ return x, features
58
+ else:
59
+ self.clear_features()
60
+ return x
61
+
62
+
63
+ class Project2d(nn.Module):
64
+ """2d projection by 1x1 conv
65
+
66
+ Args:
67
+ p: [C_in, C_out]
68
+ """
69
+ def __init__(self, p):
70
+ # convert to 1x1 conv weight
71
+ super().__init__()
72
+ p = rearrange(p, "Cin Cout -> Cout Cin 1 1")
73
+ self.p = nn.Parameter(p.detach().clone())
74
+
75
+ def forward(self, x):
76
+ return F.conv2d(x, self.p) # 1x1 conv
77
+
78
+
79
+ def dispatcher(dispatch_fn):
80
+ def decorated(key, *args):
81
+ if callable(key):
82
+ return key
83
+
84
+ if key is None:
85
+ key = "none"
86
+
87
+ return dispatch_fn(key, *args)
88
+
89
+ return decorated
90
+
91
+
92
+ @dispatcher
93
+ def activ_dispatch(activ):
94
+ return {
95
+ "none": nn.Identity,
96
+ "relu": nn.ReLU,
97
+ "lrelu": partial(nn.LeakyReLU, negative_slope=0.2),
98
+ "gelu": nn.GELU,
99
+ }[activ.lower()]
100
+
101
+
102
+ def get_norm_fn(norm, C):
103
+ """2d normalization layers
104
+ """
105
+ if norm is None or norm == "none":
106
+ return nn.Identity()
107
+
108
+ return {
109
+ "bn": nn.BatchNorm2d(C),
110
+ "syncbn": nn.SyncBatchNorm(C),
111
+ "ln": LayerNorm2d(C),
112
+ "gn": nn.GroupNorm(32, C),
113
+ }[norm]
114
+
115
+
116
+ class LayerNorm2d(nn.LayerNorm):
117
+ def __init__(self, num_channels, eps=1e-5, affine=True):
118
+ super().__init__(num_channels, eps=eps, elementwise_affine=affine)
119
+
120
+ def forward(self, x):
121
+ return F.layer_norm(
122
+ x.permute(0, 2, 3, 1),
123
+ self.normalized_shape,
124
+ self.weight,
125
+ self.bias,
126
+ self.eps
127
+ ).permute(0, 3, 1, 2)
128
+
129
+
130
+ class Gate(nn.Module):
131
+ """Tanh gate"""
132
+ def __init__(self, init=0.0):
133
+ super().__init__()
134
+ self.gate = nn.Parameter(torch.as_tensor(init))
135
+
136
+ def forward(self, x):
137
+ return torch.tanh(self.gate) * x
138
+
139
+
140
+ class ConvBlock(nn.Module):
141
+ def __init__(
142
+ self,
143
+ C_in,
144
+ C_out,
145
+ kernel_size=3,
146
+ stride=1,
147
+ padding=1,
148
+ norm="none",
149
+ activ="relu",
150
+ bias=True,
151
+ upsample=False,
152
+ downsample=False,
153
+ pad_type="zeros",
154
+ dropout=0.0,
155
+ gate=False,
156
+ ):
157
+ super().__init__()
158
+ if kernel_size == 1:
159
+ assert padding == 0
160
+ self.C_in = C_in
161
+ self.C_out = C_out
162
+
163
+ activ = activ_dispatch(activ)
164
+ self.upsample = upsample
165
+ self.downsample = downsample
166
+
167
+ self.norm = get_norm_fn(norm, C_in)
168
+ self.activ = activ()
169
+ if dropout > 0.0:
170
+ self.dropout = nn.Dropout2d(p=dropout)
171
+ self.conv = nn.Conv2d(
172
+ C_in, C_out, kernel_size, stride, padding,
173
+ bias=bias, padding_mode=pad_type
174
+ )
175
+
176
+ self.gate = Gate() if gate else None
177
+
178
+ def forward(self, x):
179
+ # pre-act
180
+ x = self.norm(x)
181
+ x = self.activ(x)
182
+ if self.upsample:
183
+ x = F.interpolate(x, scale_factor=2)
184
+ if hasattr(self, "dropout"):
185
+ x = self.dropout(x)
186
+ x = self.conv(x)
187
+ if self.downsample:
188
+ x = F.avg_pool2d(x, 2)
189
+
190
+ if self.gate is not None:
191
+ x = self.gate(x)
192
+
193
+ return x
194
+
195
+
196
+ class ResConv(nn.Module):
197
+ """Pre-activate residual block with single or double conv block"""
198
+
199
+ def __init__(
200
+ self,
201
+ C_in,
202
+ C_out,
203
+ kernel_size=3,
204
+ stride=1,
205
+ padding=1,
206
+ norm="none",
207
+ activ="relu",
208
+ upsample=False,
209
+ pad_type="zeros",
210
+ dropout=0.0,
211
+ gate=True, # if True, use zero-init gate
212
+ double=False,
213
+ # norm2 and activ2 are only used when double is True
214
+ norm2=None, # if given, apply it to second conv
215
+ activ2=None # if given, apply it to second conv
216
+ ):
217
+ super().__init__()
218
+
219
+ self.C_in = C_in
220
+ self.C_out = C_out
221
+ self.upsample = upsample
222
+ self.double = double
223
+ self.conv = ConvBlock(
224
+ C_in, C_out, kernel_size, stride, padding, norm, activ,
225
+ pad_type=pad_type, dropout=dropout, gate=gate,
226
+ )
227
+ if double:
228
+ norm2 = norm2 or norm
229
+ activ2 = activ2 or activ
230
+ self.conv2 = ConvBlock(
231
+ C_out, C_out, kernel_size, stride, padding, norm2, activ2,
232
+ pad_type=pad_type, dropout=dropout, gate=gate
233
+ )
234
+
235
+ def forward(self, x):
236
+ if self.upsample:
237
+ x = F.interpolate(x, scale_factor=2)
238
+ x = x + self.conv(x)
239
+
240
+ if self.double:
241
+ x = x + self.conv2(x)
242
+
243
+ return x
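The blocks above are pre-activation convolutional modules with an optional zero-initialized tanh gate. A minimal usage sketch (channel sizes and options are illustrative assumptions):

```python
import torch
from hf_model.modules import ConvBlock, ResConv

x = torch.randn(2, 64, 32, 32)   # [B, C, H, W]

block = ConvBlock(64, 64, norm="ln", activ="gelu", gate=True)
res = ResConv(64, 64, norm="ln", activ="gelu", gate=True, double=True)

y = block(x)   # pre-activation: norm -> activation -> conv, then tanh gate
z = res(x)     # x + gated residual branch; the gate starts at 0, so z equals x at initialization
print(y.shape, z.shape)
```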
hf_model/pamr.py ADDED
@@ -0,0 +1,146 @@
1
+ # Copyright 2020 TU Darmstadt
2
+ # License: Apache 2.0.
3
+ # https://github.com/visinf/1-stage-wseg/blob/master/models/mods/pamr.py
4
+ import torch
5
+ import torch.nn.functional as F
6
+ import torch.nn as nn
7
+
8
+ from functools import partial
9
+
10
+ #
11
+ # Helper modules
12
+ #
13
+ class LocalAffinity(nn.Module):
14
+
15
+ def __init__(self, dilations=[1]):
16
+ super(LocalAffinity, self).__init__()
17
+ self.dilations = dilations
18
+ weight = self._init_aff()
19
+ self.register_buffer('kernel', weight)
20
+
21
+ def _init_aff(self):
22
+ # initialising the shift kernel
23
+ weight = torch.zeros(8, 1, 3, 3)
24
+
25
+ for i in range(weight.size(0)):
26
+ weight[i, 0, 1, 1] = 1
27
+
28
+ weight[0, 0, 0, 0] = -1
29
+ weight[1, 0, 0, 1] = -1
30
+ weight[2, 0, 0, 2] = -1
31
+
32
+ weight[3, 0, 1, 0] = -1
33
+ weight[4, 0, 1, 2] = -1
34
+
35
+ weight[5, 0, 2, 0] = -1
36
+ weight[6, 0, 2, 1] = -1
37
+ weight[7, 0, 2, 2] = -1
38
+
39
+ self.weight_check = weight.clone()
40
+
41
+ return weight
42
+
43
+ def forward(self, x):
44
+
45
+ self.weight_check = self.weight_check.type_as(x)
46
+ assert torch.all(self.weight_check.eq(self.kernel))
47
+
48
+ B,K,H,W = x.size()
49
+ x = x.view(B*K,1,H,W)
50
+
51
+ x_affs = []
52
+ for d in self.dilations:
53
+ x_pad = F.pad(x, [d]*4, mode='replicate')
54
+ x_aff = F.conv2d(x_pad, self.kernel, dilation=d)
55
+ x_affs.append(x_aff)
56
+
57
+ x_aff = torch.cat(x_affs, 1)
58
+ return x_aff.view(B,K,-1,H,W)
59
+
60
+ class LocalAffinityCopy(LocalAffinity):
61
+
62
+ def _init_aff(self):
63
+ # initialising the shift kernel
64
+ weight = torch.zeros(8, 1, 3, 3)
65
+
66
+ weight[0, 0, 0, 0] = 1
67
+ weight[1, 0, 0, 1] = 1
68
+ weight[2, 0, 0, 2] = 1
69
+
70
+ weight[3, 0, 1, 0] = 1
71
+ weight[4, 0, 1, 2] = 1
72
+
73
+ weight[5, 0, 2, 0] = 1
74
+ weight[6, 0, 2, 1] = 1
75
+ weight[7, 0, 2, 2] = 1
76
+
77
+ self.weight_check = weight.clone()
78
+ return weight
79
+
80
+ class LocalStDev(LocalAffinity):
81
+
82
+ def _init_aff(self):
83
+ weight = torch.zeros(9, 1, 3, 3)
84
+ weight.zero_()
85
+
86
+ weight[0, 0, 0, 0] = 1
87
+ weight[1, 0, 0, 1] = 1
88
+ weight[2, 0, 0, 2] = 1
89
+
90
+ weight[3, 0, 1, 0] = 1
91
+ weight[4, 0, 1, 1] = 1
92
+ weight[5, 0, 1, 2] = 1
93
+
94
+ weight[6, 0, 2, 0] = 1
95
+ weight[7, 0, 2, 1] = 1
96
+ weight[8, 0, 2, 2] = 1
97
+
98
+ self.weight_check = weight.clone()
99
+ return weight
100
+
101
+ def forward(self, x):
102
+ # returns (B,K,P,H,W), where P is the number
103
+ # of locations
104
+ x = super(LocalStDev, self).forward(x)
105
+
106
+ return x.std(2, keepdim=True)
107
+
108
+ class LocalAffinityAbs(LocalAffinity):
109
+
110
+ def forward(self, x):
111
+ x = super(LocalAffinityAbs, self).forward(x)
112
+ return torch.abs(x)
113
+
114
+ #
115
+ # PAMR module
116
+ #
117
+ class PAMR(nn.Module):
118
+
119
+ def __init__(self, num_iter=1, dilations=[1]):
120
+ super(PAMR, self).__init__()
121
+
122
+ self.num_iter = num_iter
123
+ self.aff_x = LocalAffinityAbs(dilations)
124
+ self.aff_m = LocalAffinityCopy(dilations)
125
+ self.aff_std = LocalStDev(dilations)
126
+
127
+ def forward(self, x, mask):
128
+ mask = F.interpolate(mask, size=x.size()[-2:], mode="bilinear", align_corners=True)
129
+
130
+ # x: [BxKxHxW]
131
+ # mask: [BxCxHxW]
132
+ B,K,H,W = x.size()
133
+ _,C,_,_ = mask.size()
134
+
135
+ x_std = self.aff_std(x)
136
+
137
+ x = -self.aff_x(x) / (1e-8 + 0.1 * x_std)
138
+ x = x.mean(1, keepdim=True)
139
+ x = F.softmax(x, 2)
140
+
141
+ for _ in range(self.num_iter):
142
+ m = self.aff_m(mask) # [BxCxPxHxW]
143
+ mask = (m * x).sum(2)
144
+
145
+ # xvals: [BxCxHxW]
146
+ return mask
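PAMR refines coarse soft masks using low-level pixel affinities. A minimal sketch with the same settings later used by `apply_pamr()` in `hf_model/talk2dino.py` (input shapes are illustrative assumptions):

```python
import torch
from hf_model.pamr import PAMR

pamr = PAMR(num_iter=10, dilations=[1, 2, 4, 8, 12, 24]).eval()

image = torch.rand(1, 3, 224, 224)   # [B, 3, H, W] input image
masks = torch.rand(1, 5, 224, 224)   # [B, C, H, W] one soft mask per class

with torch.no_grad():
    refined = pamr(image, masks)     # affinity-guided refinement of the masks
print(refined.shape)                 # torch.Size([1, 5, 224, 224])
```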
hf_model/talk2dino.py ADDED
@@ -0,0 +1,432 @@
1
+ import itertools
2
+ import os
3
+ import pickle
4
+ from math import sqrt
5
+ import re
6
+ import yaml
7
+
8
+ import numpy as np
9
+ import timm
10
+ import torch
11
+ import torch.nn as nn
12
+ import torch.nn.functional as F
13
+ import torchvision
14
+ from einops import rearrange
15
+ from transformers import BertModel, AutoTokenizer
16
+ import torchvision.transforms as T
17
+ import clip
18
+ import importlib
19
+ import hf_model.us as us
20
+
21
+ from hf_model.pamr import PAMR
22
+ from hf_model.masker import DINOTextMasker
23
+ from hf_model.templates import get_template
24
+
25
+ from hf_model.model import ProjectionLayer, VisualProjectionLayer, CLIPLastLayer, DoubleMLP
26
+ from hf_model.hooks import average_text_tokens, get_vit_out, feats
27
+
28
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
29
+
30
+
31
+
32
+ class DINOText(nn.Module):
33
+
34
+ def get_self_attention(self, module, input, output):
35
+ self.feats['self_attn'] = output
36
+
37
+ def get_clip_second_last_dense_out(self, model: torch.nn.Module, input: torch.Tensor, output: torch.Tensor):
38
+ self.feats['clip_second_last_out'] = output
39
+ self.feats['clip_second_last_out'] = self.feats['clip_second_last_out'].to(dtype=torch.float32)  # .to() is not in-place, so assign the result
40
+
41
+ def get_all_out_tokens(self, model: torch.nn.Module, input: torch.Tensor, output: torch.Tensor):
42
+ self.feats['clip_txt_out_tokens'] = output
43
+
44
+ def __init__(
45
+ self, model_name, resize_dim, clip_model_name, proj_class, proj_name, proj_model, avg_self_attn_token=False, disentangled_self_attn_token=True, loss=None, pre_trained=True,
46
+ unfreeze_last_text_layer=False, unfreeze_last_image_layer=False, is_eval=True, use_avg_text_token=False, keep_cls=False, keep_end_seq=False, with_bg_clean=False, **kwargs
47
+ ):
48
+ super().__init__()
49
+ self.feats = {}
50
+ self.model_name = model_name
51
+ # loading the model
52
+
53
+ if 'dinov2' in model_name:
54
+ self.model_family = 'facebookresearch/dinov2' if 'dinov2' in model_name else 'facebookresearch/dino:main'
55
+ self.model = torch.hub.load(self.model_family, model_name)
56
+ elif 'dinov3' in model_name:
57
+ def extract_dinov3_name(path, n_parts=2):
58
+ filename = os.path.basename(path)
59
+ parts = filename.split("_")
60
+ return "_".join(parts[:n_parts])
61
+ self.model = torch.hub.load('src/dinov3', extract_dinov3_name(model_name), source='local', weights=model_name)
62
+
63
+
64
+ elif 'mae' in model_name or 'sam' in model_name or 'clip' in model_name or 'dino' in model_name:
65
+ self.model = timm.create_model(
66
+ model_name,
67
+ pretrained=True,
68
+ num_classes=0, # remove classifier nn.Linear
69
+ img_size=resize_dim
70
+ )
71
+
72
+ if 'sam' in model_name:
73
+ self.model.blocks[-1].register_forward_hook(get_vit_out)
74
+ else:
75
+ raise Exception("Unknown ViT model")
76
+ # self.model.eval()
77
+ mean = (0.485, 0.456, 0.406) if not 'clip' in model_name else (0.4815, 0.4578, 0.4082)
78
+ std = (0.229, 0.224, 0.225) if not 'clip' in model_name else (0.2686, 0.2613, 0.2758)
79
+ self.image_transforms = T.Compose([
80
+ T.Resize((resize_dim, resize_dim)),
81
+ lambda x: T.ToTensor()(x) if not isinstance(x, torch.Tensor) else x / 255.0, # ensure tensor
82
+ T.Normalize(mean, std),
83
+ ])
84
+
85
+ self.model.to(device)
86
+ self.model.requires_grad_(False)
87
+
88
+ self.clip_model_name = clip_model_name
89
+ if 'bert' in self.clip_model_name:
90
+ self.clip_model = BertModel.from_pretrained(self.clip_model_name, output_hidden_states = False)
91
+ # load the corresponding wordtokenizer
92
+ self.tokenizer = AutoTokenizer.from_pretrained(self.clip_model_name)
93
+ else:
94
+ self.clip_model, _ = clip.load(clip_model_name, device=device)
95
+ self.clip_model.eval()
96
+ self.clip_model.requires_grad_(False)
97
+ if unfreeze_last_text_layer:
98
+ for param in self.clip_model.transformer.resblocks[-1].parameters():
99
+ param.requires_grad = True
100
+ for param in self.clip_model.ln_final.parameters():
101
+ param.requires_grad = True
102
+ self.clip_model.text_projection.requires_grad = True
103
+ self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
104
+
105
+ # with open(os.path.join('configs', f"{proj_class}.yaml"), 'r') as config_file:
106
+ # config = yaml.safe_load(config_file)['model']
107
+ if 'vitb_mlp_infonce' in proj_class:
108
+ config = {
109
+ 'act': 'tanh', # None, tanh, relu or sigmoid
110
+ 'hidden_layer': True,
111
+ 'dino_embed_dim': 768
112
+ }
113
+ elif 'vitl_mlp_infonce' in proj_class:
114
+ config = {
115
+ 'act': 'tanh', # None, tanh, relu or sigmoid
116
+ 'hidden_layer': True,
117
+ 'dino_embed_dim': 1024
118
+ }
119
+
120
+ self.proj = ProjectionLayer.from_config(config)
121
+ if type(self.proj) == CLIPLastLayer:
122
+ self.clip_model.transformer.resblocks[-2].register_forward_hook(self.get_clip_second_last_dense_out)
123
+
124
+
125
+ # if pre_trained:
126
+ # self.proj.load_state_dict(torch.load(os.path.join("weights", f"{proj_name}.pth"), 'cpu'))
127
+ self.proj.to(device)
128
+
129
+ self.masker = DINOTextMasker(similarity_type="cosine")
130
+ self.masker = self.masker.eval()
131
+
132
+ self.pamr = None
133
+
134
+ self.avg_self_attn_token = avg_self_attn_token
135
+ self.disentangled_self_attn_token = disentangled_self_attn_token
136
+
137
+ if self.avg_self_attn_token or self.disentangled_self_attn_token or is_eval:
138
+ self.model.blocks[-1].attn.qkv.register_forward_hook(self.get_self_attention)
139
+ self.num_global_tokens = 5 if 'reg' in model_name or 'dinov3' in model_name else 1
140
+ if 'sam' in self.model_name:
141
+ self.num_global_tokens = 0
142
+ self.num_attn_heads = self.model.num_heads
143
+ self.scale = 0.125
144
+
145
+ self.use_avg_text_token = use_avg_text_token
146
+ if self.use_avg_text_token:
147
+ self.feats = {}
148
+ # in this case we register a forward hook with the aim of getting all the tokens and not only the cls
149
+ self.clip_model.ln_final.register_forward_hook(self.get_all_out_tokens)
150
+ self.keep_cls = keep_cls
151
+ self.keep_end_seq = keep_end_seq
152
+
153
+ self.with_bg_clean = with_bg_clean
154
+
155
+
156
+ def process_self_attention(self, output, batch_size, num_tokens, num_attn_heads, embed_dim, scale, num_global_tokens, ret_self_attn_maps=False):
157
+ qkv = output.reshape(batch_size, num_tokens, 3, num_attn_heads, embed_dim // num_attn_heads).permute(2, 0, 3, 1, 4)
158
+ q, k, v = qkv[0] * scale, qkv[1], qkv[2]
159
+ attn = q @ k.transpose(-2, -1)
160
+ self_attn_maps = attn[:, : , 0, num_global_tokens:]
161
+ self_attn = self_attn_maps.mean(dim=1)
162
+ self_attn = self_attn.softmax(dim=-1)
163
+ if ret_self_attn_maps:
164
+ return self_attn, self_attn_maps
165
+ else:
166
+ return self_attn
167
+
168
+ def encode_text(self, tokenized_texts):
169
+ if type(self.proj) == CLIPLastLayer:
170
+ self.clip_model.encode_text(tokenized_texts)
171
+ x = self.feats['clip_second_last_out']
172
+ x = x.to(dtype=torch.float32)
173
+ else:
174
+ x = self.clip_model.encode_text(tokenized_texts)
175
+ return x
176
+
177
+ def encode_image(self, images):
178
+ batch_size, _, _, _ = images.shape
179
+ self_attn_maps = None
180
+ x = self.model(images, is_training=(self.avg_self_attn_token or self.disentangled_self_attn_token))
181
+ batch_size, num_tokens, embed_dim = x['x_norm_patchtokens'].shape
182
+ num_tokens = num_tokens + self.num_global_tokens
183
+ if self.avg_self_attn_token or self.disentangled_self_attn_token:
184
+ self_attn, self_attn_maps = self.process_self_attention(self.feats['self_attn'], batch_size, num_tokens, self.num_attn_heads, embed_dim, self.scale, self.num_global_tokens, ret_self_attn_maps=True)
185
+ if self.avg_self_attn_token:
186
+ x = (self_attn.unsqueeze(-1) * x['x_norm_patchtokens']).mean(dim=1)
187
+ elif self.disentangled_self_attn_token:
188
+ self_attn_maps = self_attn_maps.softmax(dim=-1)
189
+ x = (x['x_norm_patchtokens'].unsqueeze(1) * self_attn_maps.unsqueeze(-1)).mean(dim=2)
190
+
191
+ return x, self_attn_maps
192
+
193
+ def forward(self, image, text, return_logit_scale=False):
194
+ with torch.no_grad():
195
+ txt_embed = self.encode_text(text)
196
+
197
+ img_embed, self_attn_maps = self.encode_image(image)
198
+
199
+ if type(self.proj) == CLIPLastLayer:
200
+ img_embed, txt_embed = self.proj(img_embed, txt_embed, ret_embeds=True, self_attn_maps=self_attn_maps, text_argmax=text.argmax(dim=-1))
201
+ else:
202
+ img_embed, txt_embed = self.proj(img_embed, txt_embed, ret_embeds=True, self_attn_maps=self_attn_maps)
203
+
204
+ if return_logit_scale:
205
+ return txt_embed, img_embed, self.logit_scale
206
+
207
+ return txt_embed, img_embed
208
+
209
+ def compute_loss(self, image, text, cosine=True, ret_similarity_matrix=True):
210
+ ret = {}
+ # the embeddings used below must be computed from the inputs first
+ txt_embed, img_embed = self.forward(image, text)
211
+ if cosine:
212
+ img_embed = F.normalize(img_embed, p=2, dim=1)
213
+ txt_embed = F.normalize(txt_embed, p=2, dim=1)
214
+ sim = img_embed @ txt_embed.transpose(1, 0)
215
+ if not ret_similarity_matrix:
216
+ sim = sim[torch.eye(len(sim)) > 0.5] # only diagonal elements
217
+
218
+ ret['contrastive_loss'] = self.contrastive_loss.compute_contrastive_loss(sim)
219
+
220
+ return ret
221
+
222
+
223
+ @torch.no_grad()
224
+ def build_dataset_class_tokens(self, template_set, classnames):
225
+ tokens = []
226
+ templates = get_template(template_set)
227
+ for classname in classnames:
228
+ if 'bert' not in self.clip_model_name:
229
+ tokens.append(
230
+ clip.tokenize([template.format(classname) for template in templates])
231
+ )
232
+ else:
233
+ tokens.append(self.tokenizer([template.format(classname) for template in templates], return_tensors='pt', padding='max_length')['input_ids'])
234
+ # [N, T, L], N: number of instances, T: number of captions (including ensembled ones), L: sequence length
235
+ tokens = torch.stack(tokens)
236
+
237
+ return tokens
238
+
239
+ @torch.no_grad()
240
+ def build_text_embedding(self, text):
241
+ """
242
+ Args:
243
+ text (torch.Tensor): [NUM_CLASSES, NUM_TEMPLATES, CONTEXT_LENGTH] text tokens
244
+
245
+ Returns:
246
+ text_embs
247
+ """
248
+ text = text.to(next(self.parameters()).device)
249
+ num_classes, num_templates = text.shape[:2]
250
+ text_argmax = text.argmax(dim=-1)
251
+ text_argmax = rearrange(text_argmax, 'n t -> (n t)', n=num_classes, t=num_templates)
252
+ text = rearrange(text, 'n t l -> (n t) l', n=num_classes, t=num_templates)
253
+ # chunked inference for memory limitation
254
+ chunk_size = 32
255
+ N = text.size(0)
256
+ if type(self.proj) == CLIPLastLayer:
257
+ text_embs = torch.cat([
258
+ self.proj.project_clip_txt(self.encode_text(text[i:i + chunk_size]).permute(1, 0, 2), text_argmax=text_argmax[i:i + chunk_size])
259
+ for i in range(0, N, chunk_size)
260
+ ])
261
+ else:
262
+ if not self.use_avg_text_token:
263
+ # performing classification using CLS textual token
264
+ if 'bert' not in self.clip_model_name:
265
+ text_embs = torch.cat([
266
+ self.clip_model.encode_text(text[i:i + chunk_size])
267
+ for i in range(0, N, chunk_size)
268
+ ])
269
+ else:
270
+ # encoding with BERT
271
+ text_embs = []
272
+ for i in range(0, N, chunk_size):
273
+ outputs = self.clip_model(text[i:i + chunk_size])
274
+ text_embs.append(outputs['pooler_output'])
275
+ text_embs = torch.cat(text_embs)
276
+ else:
277
+ # using text token average
278
+ text_embs = []
279
+ for i in range(0, N, chunk_size):
280
+ self.clip_model.encode_text(text[i:i + chunk_size])
281
+ text_embs.append(average_text_tokens(self.feats['clip_txt_out_tokens'] @ self.clip_model.text_projection, text[i:i + chunk_size] > 0, self.keep_cls, self.keep_end_seq))
282
+ text_embs = torch.cat(text_embs)
283
+ # [N, T, C]
284
+ text_embs = rearrange(text_embs, '(n t) c -> n t c', n=num_classes, t=num_templates)
285
+ # [N, C]
286
+ text_embs = text_embs.mean(dim=1).float()
287
+ if type(self.proj) == ProjectionLayer or type(self.proj) == DoubleMLP:
288
+ text_embs = self.proj.project_clip_txt(text_embs)
289
+ text_embs = us.normalize(text_embs, dim=-1)
290
+
291
+ return text_embs
292
+
293
+ def apply_pamr(self, image, mask):
294
+ image = F.interpolate(image, mask.shape[-2:], mode="bilinear", align_corners=True)
295
+ if self.pamr is None:
296
+ pamr_iter = 10
297
+ pamr_kernel = [1, 2, 4, 8, 12, 24]
298
+ self.pamr = PAMR(pamr_iter, pamr_kernel)
299
+ self.pamr.eval()
300
+ self.pamr.to(next(self.parameters()).device)
301
+
302
+ mask = self.pamr(image, mask)
303
+ return mask
304
+
305
+ def compute_padsize(self, H: int, W: int, patch_size: int):
306
+ l, r, t, b = 0, 0, 0, 0
307
+ if W % patch_size:
308
+ lr = patch_size - (W % patch_size)
309
+ l = lr // 2
310
+ r = lr - l
311
+
312
+ if H % patch_size:
313
+ tb = patch_size - (H % patch_size)
314
+ t = tb // 2
315
+ b = tb - t
316
+
317
+ return l, r, t, b
318
+
319
+ @torch.no_grad()
320
+ def generate_masks(
321
+ self, image, img_metas, text_emb, classnames, text_is_token=False, apply_pamr=False, background_func="weighted_average_sigmoid", lambda_bg=0.2,
322
+ # kp_w=0.3,
323
+ ):
324
+ """Generate masks for each text embeddings
325
+
326
+ Args:
327
+ image [B, 3, H, W]
328
+
329
+ Returns:
330
+ softmask [B, N, H, W]: soft masks, one per text embedding
331
+ """
332
+
333
+ H, W = image.shape[2:] # original image shape
334
+
335
+ # padded image size
336
+ pH, pW = image.shape[2:]
337
+ num_classes = text_emb.shape[0]
338
+ batch_size = image.shape[0]
339
+
340
+ image = image[:, [2, 1, 0], :, :] # BGR to RGB
341
+ ori_image = image.clone()
342
+
343
+ img_preprocessed = self.image_transforms(image).to(next(self.parameters()).device)
344
+ if 'dinov2' in self.model_name or 'dinov3' in self.model_name:
345
+ image_feat = self.model.forward_features(img_preprocessed)['x_norm_patchtokens']
346
+ elif 'mae' in self.model_name or 'clip' in self.model_name or 'dino' in self.model_name:
347
+ image_feat = self.model.forward_features(img_preprocessed)[:, 1:, :]
348
+ elif 'sam' in self.model_name:
349
+ self.model.forward_features(img_preprocessed)
350
+ image_feat = feats['vit_out'].reshape(feats['vit_out'].shape[0], feats['vit_out'].shape[1]**2, feats['vit_out'].shape[-1]) # BS x N_PATCHES x EMBED_DIM
351
+
352
+ batch_size, num_tokens, embed_dim = image_feat.shape
353
+ if type(self.proj) == VisualProjectionLayer:
354
+ image_feat = self.proj.project_dino(image_feat.float())
355
+ if type(self.proj) == DoubleMLP:
356
+ image_feat = self.proj.project_visual(image_feat.float())
357
+ b, np, c = image_feat.shape
358
+ np_h = np_w = int(sqrt(np))
359
+ image_feat = image_feat.reshape(b, np_h, np_w, c).permute(0, 3, 1, 2)
360
+
361
+ self_attn, self_attn_maps = self.process_self_attention(self.feats['self_attn'], batch_size, num_tokens + self.num_global_tokens, self.num_attn_heads, embed_dim, self.scale, self.num_global_tokens, ret_self_attn_maps=True)
362
+ mask, simmap = self.masker.forward_seg(image_feat, text_emb, hard=False) # [B, N, H', W']
363
+
364
+ if self.with_bg_clean:
365
+ mask = self.similarity_assignment_weighted(mask, image_feat, self_attn_maps, text_emb, lambda_bg)
366
+
367
+ # resize
368
+ mask = F.interpolate(mask, (pH, pW), mode='bilinear', align_corners=True) # [B, N, H, W]
369
+
370
+ if apply_pamr:
371
+ for c in range(0, mask.shape[1], 30):
372
+ mask[:, c:c + 30] = self.apply_pamr(ori_image, mask[:, c:c + 30])
373
+
374
+ assert mask.shape[2] == H and mask.shape[3] == W, f"shape mismatch: ({H}, {W}) / {mask.shape}"
375
+
376
+ return mask, simmap
377
+
378
+ def similarity_assignment_weighted(self, mask, image_feat, self_attn_maps, text_emb, lambda_bg=0.2):
379
+ bs, c, h, w = image_feat.shape
380
+ bs, num_classes, h, w = mask.shape
381
+ bs, num_heads, hw = self_attn_maps.shape
382
+ image_feat = image_feat.reshape(bs, c, hw)
383
+ num_classes, c = text_emb.shape
384
+ avg_head_embed = (self_attn_maps.unsqueeze(2) * image_feat.unsqueeze(1)).mean(dim=-1)
385
+ avg_head_embed = avg_head_embed / avg_head_embed.norm(dim=-1, keepdim=True)
386
+ avg_head_embed = avg_head_embed.permute(0, 2, 1) # [B, C, M]
387
+ head_text_sim = text_emb.unsqueeze(0) @ avg_head_embed # [B, M, N]
388
+ head_text_sim = (head_text_sim).softmax(dim=-1)
389
+ head_text_sim_sum = head_text_sim.sum(dim=-1)
390
+
391
+ self_attn_maps_repeat = self_attn_maps.unsqueeze(1).repeat(1, num_classes, 1, 1)
392
+ head_text_sim_repeat = head_text_sim.unsqueeze(-1).repeat(1, 1, 1, hw)
393
+ avg_self_attn_per_class = (self_attn_maps_repeat * head_text_sim_repeat).sum(dim=2) / head_text_sim_sum.unsqueeze(-1).repeat(1, 1, hw)
394
+ avg_self_attn_per_class = avg_self_attn_per_class.softmax(dim=-1)
395
+
396
+ min_self_attn = avg_self_attn_per_class.min().item()
397
+ max_self_attn = avg_self_attn_per_class.max().item()
398
+ max_self_attn = max(max_self_attn, max_self_attn - min_self_attn)
399
+ avg_self_attn_per_class = avg_self_attn_per_class - min_self_attn
400
+ avg_self_attn_per_class = avg_self_attn_per_class / max_self_attn
401
+ avg_self_attn_per_class = avg_self_attn_per_class * (mask.max() - mask.min()) + mask.min()
402
+ mask = mask.reshape(num_classes, hw) # [N, P]
403
+ mask_output = (mask + lambda_bg * avg_self_attn_per_class).reshape(bs, num_classes, h, w) / (1 + lambda_bg)
404
+ return mask_output
405
+
406
+
407
+ from huggingface_hub import PyTorchModelHubMixin
408
+
409
+ class Talk2DINO(DINOText, PyTorchModelHubMixin):
410
+ def encode_text(self, texts):
411
+ """ texts: string or list of strings
412
+ returns: text embeddings (N, D) where N is the number of texts, D is the embedding dimension
413
+ """
414
+ text_tokens = clip.tokenize(texts).to(next(self.parameters()).device)
415
+ txt_embed = self.clip_model.encode_text(text_tokens)
416
+ txt_embed = self.proj.project_clip_txt(txt_embed)
417
+ return txt_embed
418
+
419
+ def encode_image(self, images):
420
+ """ images: PIL image or list of PIL images
421
+ returns: image embeddings (N, L, D) where N is the number of images, L is the number of patches, D is the embedding dimension
422
+ """
423
+ if type(images) is not list:
424
+ images = [images]
425
+ img_preprocessed = [self.image_transforms(img).to(next(self.parameters()).device) for img in images]
426
+ img_preprocessed = torch.stack(img_preprocessed)
427
+ if 'dinov2' in self.model_name or 'dinov3' in self.model_name:
428
+ img_embed = self.model.forward_features(img_preprocessed)['x_norm_patchtokens']
429
+ elif 'mae' in self.model_name or 'clip' in self.model_name or 'dino' in self.model_name:
430
+ img_embed = self.model.forward_features(img_preprocessed)[:, 1:, :]
431
+
432
+ return img_embed
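`Talk2DINO` wraps `DINOText` with `PyTorchModelHubMixin`, so it can be loaded from the Hub and used to embed raw strings and PIL images. A hypothetical usage sketch; the repository id, image path, and prompts are assumptions, not taken from this commit:

```python
import torch
import torch.nn.functional as F
from PIL import Image
from hf_model.talk2dino import Talk2DINO

model = Talk2DINO.from_pretrained("lorebianchi98/Talk2DINO-ViTB")  # assumed repo id
model.eval()

image = Image.open("example.jpg").convert("RGB")        # any RGB image
with torch.no_grad():
    img_embed = model.encode_image(image)               # [1, L, D] DINOv2 patch embeddings
    txt_embed = model.encode_text(["a dog", "grass"])   # [2, D] CLIP text projected to DINOv2 space

# patch-level cosine similarity between image patches and each prompt
sim = F.normalize(img_embed, dim=-1) @ F.normalize(txt_embed, dim=-1).t()
print(sim.shape)                                        # [1, L, 2]
```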
hf_model/templates.py ADDED
@@ -0,0 +1,148 @@
1
+ # ------------------------------------------------------------------------------
2
+ # FreeDA
3
+ # ------------------------------------------------------------------------------
4
+ # Modified from CLIP (https://github.com/openai/CLIP)
5
+ # Copyright (c) 2021 OpenAI. All Rights Reserved.
6
+ # ------------------------------------------------------------------------------
7
+
8
+ full_imagenet_templates = [
9
+ "a bad photo of a {}.",
10
+ "a photo of many {}.",
11
+ "a sculpture of a {}.",
12
+ "a photo of the hard to see {}.",
13
+ "a low resolution photo of the {}.",
14
+ "a rendering of a {}.",
15
+ "graffiti of a {}.",
16
+ "a bad photo of the {}.",
17
+ "a cropped photo of the {}.",
18
+ "a tattoo of a {}.",
19
+ "the embroidered {}.",
20
+ "a photo of a hard to see {}.",
21
+ "a bright photo of a {}.",
22
+ "a photo of a clean {}.",
23
+ "a photo of a dirty {}.",
24
+ "a dark photo of the {}.",
25
+ "a drawing of a {}.",
26
+ "a photo of my {}.",
27
+ "the plastic {}.",
28
+ "a photo of the cool {}.",
29
+ "a close-up photo of a {}.",
30
+ "a black and white photo of the {}.",
31
+ "a painting of the {}.",
32
+ "a painting of a {}.",
33
+ "a pixelated photo of the {}.",
34
+ "a sculpture of the {}.",
35
+ "a bright photo of the {}.",
36
+ "a cropped photo of a {}.",
37
+ "a plastic {}.",
38
+ "a photo of the dirty {}.",
39
+ "a jpeg corrupted photo of a {}.",
40
+ "a blurry photo of the {}.",
41
+ "a photo of the {}.",
42
+ "a good photo of the {}.",
43
+ "a rendering of the {}.",
44
+ "a {} in a video game.",
45
+ "a photo of one {}.",
46
+ "a doodle of a {}.",
47
+ "a close-up photo of the {}.",
48
+ "a photo of a {}.",
49
+ "the origami {}.",
50
+ "the {} in a video game.",
51
+ "a sketch of a {}.",
52
+ "a doodle of the {}.",
53
+ "a origami {}.",
54
+ "a low resolution photo of a {}.",
55
+ "the toy {}.",
56
+ "a rendition of the {}.",
57
+ "a photo of the clean {}.",
58
+ "a photo of a large {}.",
59
+ "a rendition of a {}.",
60
+ "a photo of a nice {}.",
61
+ "a photo of a weird {}.",
62
+ "a blurry photo of a {}.",
63
+ "a cartoon {}.",
64
+ "art of a {}.",
65
+ "a sketch of the {}.",
66
+ "a embroidered {}.",
67
+ "a pixelated photo of a {}.",
68
+ "itap of the {}.",
69
+ "a jpeg corrupted photo of the {}.",
70
+ "a good photo of a {}.",
71
+ "a plushie {}.",
72
+ "a photo of the nice {}.",
73
+ "a photo of the small {}.",
74
+ "a photo of the weird {}.",
75
+ "the cartoon {}.",
76
+ "art of the {}.",
77
+ "a drawing of the {}.",
78
+ "a photo of the large {}.",
79
+ "a black and white photo of a {}.",
80
+ "the plushie {}.",
81
+ "a dark photo of a {}.",
82
+ "itap of a {}.",
83
+ "graffiti of the {}.",
84
+ "a toy {}.",
85
+ "itap of my {}.",
86
+ "a photo of a cool {}.",
87
+ "a photo of a small {}.",
88
+ "a tattoo of the {}.",
89
+ ]
90
+
91
+ maskclip_templates = [
92
+ "there is a {} in the scene.",
93
+ "there is the {} in the scene.",
94
+ "this is a {} in the scene.",
95
+ "this is the {} in the scene.",
96
+ "this is one {} in the scene.", # maskclip
97
+ ]
98
+
99
+ sub_imagenet_template = [
100
+ "itap of a {}.",
101
+ "a bad photo of a {}.",
102
+ "a origami {}.",
103
+ "a photo of the large {}.",
104
+ "a {} in a video game.",
105
+ "art of the {}.",
106
+ "a photo of the small {}.",
107
+ ]
108
+
109
+ simple_imagenet_template = [
110
+ "a photo of a {}.",
111
+ ]
112
+
113
+ plural_template = [
114
+ "a photo of {}s.",
115
+ ]
116
+
117
+ identity_template = [
118
+ "{}",
119
+ ]
120
+
121
+ template_meta = {
122
+ "full": full_imagenet_templates,
123
+ "full+maskclip": full_imagenet_templates + maskclip_templates, # templates used in maskclip paper
124
+ "subset": sub_imagenet_template,
125
+ "subset+maskclip": sub_imagenet_template + maskclip_templates,
126
+ "maskclip": maskclip_templates,
127
+ "simple": simple_imagenet_template,
128
+ "plural": plural_template,
129
+ "identity": identity_template,
130
+ }
131
+
132
+
133
+ def get_template(key):
134
+ if key in template_meta:
135
+ return template_meta[key]
136
+
137
+ gdic = globals()
138
+ if key in gdic:
139
+ return gdic[key]
140
+
141
+ raise ValueError(key)
142
+
143
+
144
+ # custom template boosts performance a little.
145
+ custom = sub_imagenet_template + [
146
+ "a photo of many {}.",
147
+ "a photo of {}s.",
148
+ ]
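The template sets above are expanded per class name before CLIP tokenization; `build_dataset_class_tokens()` in `hf_model/talk2dino.py` follows the same pattern. A minimal sketch (the class name is an arbitrary example):

```python
import clip
from hf_model.templates import get_template

templates = get_template("subset")              # 7 ImageNet-style prompt templates
prompts = [t.format("dog") for t in templates]  # e.g. "itap of a dog."
tokens = clip.tokenize(prompts)                 # [7, 77] CLIP token ids
print(prompts[0], tuple(tokens.shape))
```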
hf_model/us.py ADDED
@@ -0,0 +1,119 @@
1
+ # ------------------------------------------------------------------------------
2
+ # FreeDA
3
+ # ------------------------------------------------------------------------------
4
+ from typing import Dict, List, Any
5
+ from datetime import datetime
6
+ from itertools import chain
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+ import torch.nn.functional as F
11
+ import torch.distributed as dist
12
+ import numpy as np
13
+
14
+ # ImageNet mean/std (from timm)
15
+
16
+ IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
17
+ IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)
18
+
19
+ DEFAULT_MEAN = IMAGENET_DEFAULT_MEAN
20
+ DEFAULT_STD = IMAGENET_DEFAULT_STD
21
+
22
+ # NOTE Originally CLIP statistics should be used, but the legacy of ImageNet statistics
23
+ # from GroupViT is applied. Fortunately, CLIP is quite robust to slightly different
24
+ # normalization constants (https://github.com/openai/CLIP/issues/20#issuecomment-764985771).
25
+
26
+
27
+ def unnorm(x):
28
+ mean = torch.as_tensor(DEFAULT_MEAN, device=x.device)[None, ..., None, None]
29
+ std = torch.as_tensor(DEFAULT_STD, device=x.device)[None, ..., None, None]
30
+ return x.mul(std).add(mean)
31
+
32
+
33
+ # DEBUG NaN
34
+ def check_nonfinite(x, name=""):
35
+ rank = dist.get_rank()
36
+ n_nan = x.isnan().sum()
37
+ n_inf = x.isinf().sum()
38
+ if n_nan or n_inf:
39
+ print(f"[RANK {rank}] {name} is not finite: #nan={n_nan}, #inf={n_inf}")
40
+ return True
41
+
42
+ print(f"[RANK {rank}] {name} is OK ...")
43
+ return False
44
+
45
+
46
+ def normalize(t, dim, eps=1e-6):
47
+ """Large default eps for fp16"""
48
+ return F.normalize(t, dim=dim, eps=eps)
49
+
50
+
51
+ def timestamp(fmt="%y%m%d-%H%M%S"):
52
+ return datetime.now().strftime(fmt)
53
+
54
+
55
+ def merge_dicts_by_key(dics: List[Dict]) -> Dict[Any, List]:
56
+ """Merge dictionaries by key. All of dicts must have same keys."""
57
+ ret = {key: [] for key in dics[0].keys()}
58
+ for dic in dics:
59
+ for key, value in dic.items():
60
+ ret[key].append(value)
61
+
62
+ return ret
63
+
64
+
65
+ def flatten_2d_list(list2d):
66
+ return list(chain.from_iterable(list2d))
67
+
68
+
69
+ def num_params(module):
70
+ return sum(p.numel() for p in module.parameters())
71
+
72
+
73
+ def param_trace(name, module, depth=0, max_depth=999, threshold=0, printf=print):
74
+ if depth > max_depth:
75
+ return
76
+ prefix = " " * depth
77
+ n_params = num_params(module)
78
+ if n_params > threshold:
79
+ printf("{:60s}\t{:10.3f}M".format(prefix + name, n_params / 1024 / 1024))
80
+ for n, m in module.named_children():
81
+ if depth == 0:
82
+ child_name = n
83
+ else:
84
+ child_name = "{}.{}".format(name, n)
85
+ param_trace(child_name, m, depth + 1, max_depth, threshold, printf)
86
+
87
+
88
+ @torch.no_grad()
89
+ def hash_bn(module):
90
+ summary = []
91
+ for m in module.modules():
92
+ if isinstance(m, (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)):
93
+ w = m.weight.detach().mean().item()
94
+ b = m.bias.detach().mean().item()
95
+ rm = m.running_mean.detach().mean().item()
96
+ rv = m.running_var.detach().mean().item()
97
+ summary.append((w, b, rm, rv))
98
+
99
+ if not summary:
100
+ return 0.0, 0.0
101
+
102
+ w, b, rm, rv = [np.mean(col) for col in zip(*summary)]
103
+ p = np.mean([w, b])
104
+ s = np.mean([rm, rv])
105
+
106
+ return p, s
107
+
108
+
109
+ @torch.no_grad()
110
+ def hash_params(module):
111
+ return torch.as_tensor([p.mean() for p in module.parameters()]).mean().item()
112
+
113
+
114
+ @torch.no_grad()
115
+ def hashm(module):
116
+ p = hash_params(module)
117
+ _, s = hash_bn(module)
118
+
119
+ return p, s
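A short sketch of the helpers above: `normalize` is an fp16-safe wrapper around `F.normalize`, and `param_trace` prints a per-module parameter-count breakdown (the toy model is an illustrative assumption):

```python
import torch
import torch.nn as nn
import hf_model.us as us

feats = torch.randn(4, 768)
feats = us.normalize(feats, dim=-1)          # L2 normalization with eps=1e-6

model = nn.Sequential(nn.Linear(768, 512), nn.ReLU(), nn.Linear(512, 10))
print(us.num_params(model))                  # total parameter count
us.param_trace("toy", model, threshold=0)    # per-child parameter counts (in "M" = 2**20)
```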