fix: use proper initialization for embedding layer
modeling_lora.py  CHANGED  (+28 -11)
@@ -11,20 +11,37 @@ from torch.nn import Parameter
 from .modeling_bert import BertModel, BertPreTrainedModel, JinaBertConfig
 
 
+def initialized_weights(shape, num_adaptions, init='kaiming'):
+    weight_data = []
+    for _ in range(num_adaptions):
+        new_adaption = torch.zeros(shape)
+        if init == 'kaiming':
+            nn.init.kaiming_uniform_(new_adaption, a=math.sqrt(5))
+        elif init == 'normal':
+            nn.init.normal_(new_adaption)
+        else:
+            raise NotImplementedError
+        weight_data.append(new_adaption)
+    return torch.stack(weight_data, dim=0)
+
+
 class LoRAParametrization(nn.Module):
-    def __init__(self, fan_in, fan_out, …
+    def __init__(self, fan_in, fan_out, layer_type='linear', num_adaptions=1, rank=4, lora_dropout_p=0.0, lora_alpha=1):
         super().__init__()
         # if weight is stored as (fan_out, fan_in), the memory layout of A & B follows (W + BA)x
         # otherwise, it's x(W + AB). This allows us to tie the weights between linear layers and embeddings
+        fan_in_fan_out = (layer_type == 'embedding')
         self.swap = (lambda x: (x[1], x[0])) if fan_in_fan_out else (lambda x: x)
-        …  # previous lora_A / lora_B initialization (eight removed lines, not legible in this capture)
+
+        if layer_type == 'linear':
+            self.lora_A = nn.Parameter(initialized_weights((rank, fan_in), num_adaptions, init='kaiming'))
+            self.lora_B = nn.Parameter(torch.zeros((num_adaptions, fan_out, rank)))
+        elif layer_type == 'embedding':
+            self.lora_A = nn.Parameter(torch.zeros((num_adaptions, fan_in, rank)))
+            self.lora_B = nn.Parameter(initialized_weights((rank, fan_out), num_adaptions=num_adaptions, init='normal'))
+        else:
+            raise NotImplementedError
+
         self.lora_alpha, self.rank = lora_alpha, rank
         self.scaling = lora_alpha / rank
         self.lora_dropout = nn.Dropout(p=lora_dropout_p) if lora_dropout_p > 0 else lambda x: x
@@ -55,14 +72,14 @@ class LoRAParametrization(nn.Module):
     def from_linear(cls, layer, num_adaptions=1, rank=4, lora_dropout_p=0.0, lora_alpha=1):
         fan_out, fan_in = layer.weight.shape
         return cls(
-            fan_in, fan_out, num_adaptions=num_adaptions, …
+            fan_in, fan_out, num_adaptions=num_adaptions, layer_type='linear', rank=rank, lora_dropout_p=lora_dropout_p, lora_alpha=lora_alpha
         )
 
     @classmethod
     def from_embedding(cls, layer, num_adaptions=1, rank=4, lora_dropout_p=0.0, lora_alpha=1):
         fan_in, fan_out = layer.weight.shape
         return cls(
-            fan_in, fan_out, num_adaptions=num_adaptions, …
+            fan_in, fan_out, num_adaptions=num_adaptions, layer_type='embedding', rank=rank, lora_dropout_p=lora_dropout_p, lora_alpha=lora_alpha
         )
 
     @classmethod
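For context (an illustration, not part of the commit): the new scheme follows the usual LoRA convention that exactly one of the two low-rank factors starts at zero, so the scaled update (lora_alpha / rank times the product of lora_B and lora_A) contributes nothing at step 0 and the adapted layer initially behaves exactly like the pretrained one. A minimal standalone sketch of that property, with made-up shapes:

import math
import torch

rank, fan_in, fan_out = 4, 16, 32                    # made-up sizes for illustration

# linear layers: lora_A is Kaiming-initialized, lora_B starts at zero
lora_A = torch.zeros(rank, fan_in)
torch.nn.init.kaiming_uniform_(lora_A, a=math.sqrt(5))
lora_B = torch.zeros(fan_out, rank)
print(torch.count_nonzero(lora_B @ lora_A))          # tensor(0): delta W = B @ A is all zeros

# embeddings: lora_A starts at zero, lora_B is normal-initialized
emb_A = torch.zeros(fan_in, rank)
emb_B = torch.zeros(rank, fan_out)
torch.nn.init.normal_(emb_B)
print(torch.count_nonzero(emb_A @ emb_B))            # tensor(0): delta W = A @ B is all zeros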
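A related detail behind the from_linear / from_embedding pair (again illustrative, using plain torch modules rather than anything from this repository): the two classmethods unpack layer.weight.shape in opposite order because PyTorch stores nn.Linear weights as (out_features, in_features) but nn.Embedding weights as (num_embeddings, embedding_dim); that transposed layout is also why the embedding branch selects the x(W + AB) memory layout mentioned in the comment.

import torch.nn as nn

linear = nn.Linear(in_features=16, out_features=32)
embedding = nn.Embedding(num_embeddings=1000, embedding_dim=64)

print(linear.weight.shape)     # torch.Size([32, 16])   -> unpacked as fan_out, fan_in
print(embedding.weight.shape)  # torch.Size([1000, 64]) -> unpacked as fan_in, fan_out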