feat: added encode method
- configuration_bert.py  +2 -0
- modeling_bert.py  +165 -0
configuration_bert.py CHANGED

@@ -84,6 +84,7 @@ class JinaBertConfig(PretrainedConfig):
         num_tasks=0,
         use_flash_attn=True,
         use_qk_norm=True,
+        emb_pooler=None,
         **kwargs,
     ):
         assert 'position_embedding_type' not in kwargs
@@ -112,3 +113,4 @@ class JinaBertConfig(PretrainedConfig):
         self.num_tasks = num_tasks
         self.use_flash_attn = use_flash_attn
         self.use_qk_norm = use_qk_norm
+        self.emb_pooler = emb_pooler
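For illustration only, a minimal sketch of setting the new field; the import path assumes you are working inside this repository checkout, and the remaining constructor defaults are left untouched:

from configuration_bert import JinaBertConfig  # local module from this repo

# 'mean' is the only pooling strategy that the encode() method below implements;
# any other value makes encode() raise NotImplementedError.
config = JinaBertConfig(emb_pooler='mean')
print(config.emb_pooler)  # 'mean' -- serialized into config.json by save_pretrained()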
modeling_bert.py CHANGED
@@ -15,7 +15,10 @@ and made modifications to use ALiBi.
 import logging
 from collections.abc import Sequence
 from functools import partial
+from typing import Union, List, Optional
+import warnings

+import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -54,6 +57,10 @@ try:
 except ImportError:
     CrossEntropyLoss = None

+try:
+    from tqdm.autonotebook import trange
+except ImportError:
+    trange = None

 logger = logging.getLogger(__name__)

@@ -346,6 +353,15 @@ class BertModel(BertPreTrainedModel):
         self.pooler = BertPooler(config) if add_pooling_layer else None
         self.task_type_embeddings = nn.Embedding(config.num_tasks, config.hidden_size)

+        self.emb_pooler = config.emb_pooler
+        self._name_or_path = config._name_or_path
+        if self.emb_pooler is not None:
+            from transformers import AutoTokenizer
+
+            self.tokenizer = AutoTokenizer.from_pretrained(config._name_or_path)
+        else:
+            self.tokenizer = None
+
         # We now initialize the task embeddings to 0; We do not use task types during
         # pretraining. When we start using task types during embedding training,
         # we want the model to behave exactly as in pretraining (i.e. task types
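In other words, whether the checkpoint's config.json sets `emb_pooler` decides if a tokenizer is loaded eagerly at model construction time. A hedged sketch, assuming a hypothetical local checkpoint directory `./my-jina-bert` that exports this custom code via `auto_map`:

from transformers import AutoConfig, AutoModel

ckpt = "./my-jina-bert"  # placeholder path, not part of this commit

cfg = AutoConfig.from_pretrained(ckpt, trust_remote_code=True)
model = AutoModel.from_pretrained(ckpt, trust_remote_code=True)

# If config.json contains "emb_pooler": "mean", __init__ loads the tokenizer eagerly;
# otherwise model.tokenizer stays None until encode() falls back to mean pooling.
print(cfg.emb_pooler, model.tokenizer is not None)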
@@ -419,6 +435,155 @@
         )


+    @torch.inference_mode()
+    def encode(
+        self: 'BertModel',
+        sentences: Union[str, List[str]],
+        batch_size: int = 32,
+        show_progress_bar: Optional[bool] = None,
+        output_value: str = 'sentence_embedding',
+        convert_to_numpy: bool = True,
+        convert_to_tensor: bool = False,
+        device: Optional[torch.device] = None,
+        normalize_embeddings: bool = False,
+        **tokenizer_kwargs,
+    ) -> Union[List[torch.Tensor], np.ndarray, torch.Tensor]:
+        """
+        Computes sentence embeddings.
+        Args:
+            sentences(`str` or `List[str]`):
+                Sentence or sentences to be encoded
+            batch_size(`int`, *optional*, defaults to 32):
+                Batch size for the computation
+            show_progress_bar(`bool`, *optional*, defaults to None):
+                Show a progress bar when encoding sentences.
+                If set to None, the progress bar is only shown when `logger.level == logging.INFO` or `logger.level == logging.DEBUG`.
+            output_value(`str`, *optional*, defaults to 'sentence_embedding'):
+                The default, 'sentence_embedding', returns sentence embeddings.
+                Can be set to 'token_embeddings' to get wordpiece token embeddings.
+                Set to None to get all output values.
+            convert_to_numpy(`bool`, *optional*, defaults to True):
+                If True, the output is a list of numpy vectors.
+                Else, it is a list of pytorch tensors.
+            convert_to_tensor(`bool`, *optional*, defaults to False):
+                If True, one large tensor is returned.
+                Overwrites any setting from `convert_to_numpy`.
+            device(`torch.device`, *optional*, defaults to None):
+                Which torch.device to use for the computation.
+            normalize_embeddings(`bool`, *optional*, defaults to False):
+                If set to True, returned vectors will have length 1. In that case, the faster dot-product (util.dot_score) instead of cosine similarity can be used.
+            tokenizer_kwargs(`Dict[str, Any]`, *optional*, defaults to {}):
+                Keyword arguments for the tokenizer.
+        Returns:
+            By default, a list of tensors is returned.
+            If `convert_to_tensor` is set, a stacked tensor is returned.
+            If `convert_to_numpy` is set, a numpy matrix is returned.
+        """
+        if self.emb_pooler is None:
+            warnings.warn("No emb_pooler specified, defaulting to mean pooling.")
+            self.emb_pooler = 'mean'
+            from transformers import AutoTokenizer
+
+            self.tokenizer = AutoTokenizer.from_pretrained(self._name_or_path)
+        if self.emb_pooler != 'mean':
+            raise NotImplementedError
+
+        is_training = self.training
+        self.eval()
+
+        if show_progress_bar is None:
+            show_progress_bar = (
+                logger.getEffectiveLevel() == logging.INFO
+                or logger.getEffectiveLevel() == logging.DEBUG
+            )
+
+        if convert_to_tensor:
+            convert_to_numpy = False
+
+        if output_value != 'sentence_embedding':
+            convert_to_tensor = False
+            convert_to_numpy = False
+
+        input_was_string = False
+        if isinstance(sentences, str) or not hasattr(sentences, '__len__'):
+            sentences = [sentences]
+            input_was_string = True
+
+        if device is not None:
+            self.to(device)
+
+        # TODO: Maybe use better length heuristic?
+        permutation = np.argsort([-len(i) for i in sentences])
+        inverse_permutation = np.argsort(permutation)
+        sentences = [sentences[idx] for idx in permutation]
+
+        tokenizer_kwargs['padding'] = tokenizer_kwargs.get('padding', True)
+        tokenizer_kwargs['max_length'] = tokenizer_kwargs.get('max_length', 8192)
+        tokenizer_kwargs['truncation'] = tokenizer_kwargs.get('truncation', True)
+
+        all_embeddings = []
+
+        if trange is not None:
+            range_iter = trange(
+                0,
+                len(sentences),
+                batch_size,
+                desc="Encoding",
+                disable=not show_progress_bar,
+            )
+        else:
+            range_iter = range(0, len(sentences), batch_size)
+
+        for i in range_iter:
+            encoded_input = self.tokenizer(
+                sentences[i : i + batch_size],
+                return_tensors='pt',
+                **tokenizer_kwargs,
+            ).to(self.device)
+            token_embs = self.forward(**encoded_input)[0]
+
+            # Accumulate in fp32 to avoid overflow
+            token_embs = token_embs.float()
+
+            if output_value == 'token_embeddings':
+                raise NotImplementedError
+            elif output_value is None:
+                raise NotImplementedError
+            else:
+                embeddings = self.mean_pooling(
+                    token_embs, encoded_input['attention_mask']
+                )
+
+                if normalize_embeddings:
+                    embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
+
+                if convert_to_numpy:
+                    embeddings = embeddings.cpu()
+            all_embeddings.extend(embeddings)
+
+        all_embeddings = [all_embeddings[idx] for idx in inverse_permutation]
+
+        if convert_to_tensor:
+            all_embeddings = torch.stack(all_embeddings)
+        elif convert_to_numpy:
+            all_embeddings = np.asarray([emb.numpy() for emb in all_embeddings])
+
+        if input_was_string:
+            all_embeddings = all_embeddings[0]
+
+        self.train(is_training)
+        return all_embeddings
+
+    def mean_pooling(
+        self, token_embeddings: torch.Tensor, attention_mask: torch.Tensor
+    ):
+        input_mask_expanded = (
+            attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+        )
+        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
+            input_mask_expanded.sum(1), min=1e-9
+        )
+
 class BertForPreTraining(BertPreTrainedModel):
     def __init__(self, config: JinaBertConfig):
         super().__init__(config)
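To round this off, a usage sketch for the new method; the checkpoint id is a placeholder and the call assumes the checkpoint maps AutoModel to this BertModel class via `auto_map` and sets `emb_pooler` in its config.json (otherwise the first `encode()` call warns and falls back to mean pooling):

import numpy as np
from transformers import AutoModel

# Placeholder checkpoint id; substitute any checkpoint that ships this custom code.
model = AutoModel.from_pretrained("org/jina-bert-checkpoint", trust_remote_code=True)

embeddings = model.encode(
    ["How is the weather today?", "What is the current weather like today?"],
    batch_size=32,
    normalize_embeddings=True,  # unit-length outputs, so dot product == cosine similarity
    convert_to_numpy=True,
)
print(embeddings.shape)                             # (2, hidden_size)
print(float(np.dot(embeddings[0], embeddings[1])))  # similarity of the two sentences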