fixed GLU implementation, added conversion of layer norms
Files changed:
- convert_v2_weights.py (+19 -1)
- mlp.py (+3 -2)
convert_v2_weights.py

@@ -1,6 +1,6 @@
 import re
 from collections import OrderedDict
-from transformers import AutoModel
+from transformers import AutoModel, AutoTokenizer
 from .configuration_bert import JinaBertConfig
 import torch
 from .modeling_bert import BertModel

@@ -115,6 +115,12 @@ def remap_state_dict(state_dict, config: JinaBertConfig):
         decoder_bias, (0, config.vocab_size - decoder_bias.shape[0]), value=-100.0
     )
 
+    # LayerNorm
+    def key_mapping_layernorm(key):
+        return re.sub(r'^encoder.layers.(\d+).mlp.layernorm.(weight|bias)', r"encoder.layers.\1.norm2.\2", key)
+
+    state_dict = OrderedDict((key_mapping_layernorm(k), v) for k, v in state_dict.items())
+
     return state_dict
 
 
@@ -124,3 +130,15 @@ state_dict = v2_model.state_dict()
 new_state_dict = remap_state_dict(state_dict, config)
 flash_model = BertModel(config)
 flash_model.load_state_dict(new_state_dict)
+
+tokenizer = AutoTokenizer.from_pretrained('jinaai/jina-embeddings-v2-base-en')
+inp = tokenizer.batch_encode_plus(['Hello world', 'How is the weather today?', 'It is raining a lot in Berlin'], return_tensors='pt', padding=True).to('cuda')
+v2_model.eval()
+flash_model.eval()
+v2_model = v2_model.to('cuda', torch.float16)
+flash_model = flash_model.to('cuda', torch.float16)
+output_v2 = v2_model(**inp)
+output_flash = flash_model(**inp)
+x = output_v2.last_hidden_state
+y = output_flash.last_hidden_state
+print(torch.abs(x - y))

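For reference, a minimal sketch (not part of the commit) of what the new layer norm remapping does: the regex is taken from the diff above, while the toy key names and tensor shapes below are made up purely for illustration.

import re
from collections import OrderedDict

import torch

def key_mapping_layernorm(key):
    # rename the v2 checkpoint's MLP layer norm keys to the flash model's norm2 keys
    return re.sub(
        r'^encoder.layers.(\d+).mlp.layernorm.(weight|bias)',
        r"encoder.layers.\1.norm2.\2",
        key,
    )

# toy state dict with made-up shapes, just to show the renaming
toy = OrderedDict({
    'encoder.layers.0.mlp.layernorm.weight': torch.ones(4),
    'encoder.layers.0.mlp.layernorm.bias': torch.zeros(4),
    'encoder.layers.0.mlp.gated_layers.weight': torch.randn(8, 4),
})
remapped = OrderedDict((key_mapping_layernorm(k), v) for k, v in toy.items())
print(list(remapped.keys()))
# ['encoder.layers.0.norm2.weight', 'encoder.layers.0.norm2.bias',
#  'encoder.layers.0.mlp.gated_layers.weight']

The check at the end of the conversion script prints the elementwise difference between the two models as an eyeball test; if an automated check is preferred, something like torch.testing.assert_close(x, y, atol=..., rtol=...) with fp16-sized tolerances could replace the final print.
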
mlp.py

@@ -37,6 +37,7 @@ class GLUMLP(nn.Module):
         hidden_dropout_prob=0.1
     ):
         super().__init__()
+        self.hidden_features = hidden_features
         self.gated_layers = nn.Linear(
             in_features, hidden_features * 2, bias=False
         )

@@ -57,8 +58,8 @@
         residual_connection = hidden_states
         # compute the activation
         hidden_states = self.gated_layers(hidden_states)
-        gated = hidden_states[:,
-        non_gated = hidden_states[:,
+        gated = hidden_states[:, : self.hidden_features]
+        non_gated = hidden_states[:, self.hidden_features :]
         hidden_states = self.act(gated) * non_gated
         hidden_states = self.dropout(hidden_states)
         # multiply by the second matrix