jinaai
/

jina-bert-flash-implementation

🇪🇺 Region: EU

Model card Files Files and versions

michael-guenther committed on Mar 4, 2024

Commit

6343db7

·

1 Parent(s): 32458be

add tokenizer

Files changed (1) hide show

tokenizer.py +62 -0

tokenizer.py ADDED Viewed

	@@ -0,0 +1,62 @@

+import torch
+import numpy as np
+from transformers import RobertaTokenizer, BatchEncoding
+import warnings
class JinaTokenizer(RobertaTokenizer):
    """RoBERTa tokenizer that can attach ``task_type_ids`` to its encodings.

    When a ``task_type`` is passed to ``__call__``, ``_encode_plus`` or
    ``_batch_encode_plus``, the returned ``BatchEncoding`` gains a
    ``task_type_ids`` entry that mirrors ``input_ids`` and is filled with
    that value. When ``task_type`` is ``None`` (the default) the parent
    tokenizer's encoding is returned unchanged.
    """

    def __init__(self, *args, task_type_vocab_size=6, **kwargs):
        """Initialise the parent tokenizer and store ``task_type_vocab_size``.

        Args:
            *args: Forwarded to ``RobertaTokenizer.__init__``.
            task_type_vocab_size: Stored on the instance; this class does
                not validate ``task_type`` against it.
                NOTE(review): presumably the number of distinct task
                types the model supports -- confirm against the model.
            **kwargs: Forwarded to ``RobertaTokenizer.__init__``.
        """
        super().__init__(*args, **kwargs)
        self.task_type_vocab_size = task_type_vocab_size

    def __call__(self, *args, task_type=None, **kwargs):
        """Tokenize and optionally add ``task_type_ids``.

        Args:
            *args: Forwarded to the parent ``__call__``.
            task_type: Value used to fill ``task_type_ids``; ``None``
                leaves the encoding untouched.
            **kwargs: Forwarded to the parent ``__call__``;
                ``return_tensors`` is also used as the tensor type of the
                returned ``BatchEncoding``.

        Returns:
            The ``BatchEncoding`` from the parent, with ``task_type_ids``
            added when ``task_type`` is not ``None``.
        """
        # task_type is deliberately not forwarded, so the parent never sees
        # an unknown keyword; the ids are attached once, here.
        batch_encoding = super().__call__(*args, **kwargs)
        return self._with_task_type_ids(
            batch_encoding, task_type, kwargs.get('return_tensors')
        )

    def _batch_encode_plus(self, *args, task_type=None, **kwargs):
        """Batch-encode and optionally add ``task_type_ids``."""
        batch_encoding = super()._batch_encode_plus(*args, **kwargs)
        return self._with_task_type_ids(
            batch_encoding, task_type, kwargs.get('return_tensors')
        )

    def _encode_plus(self, *args, task_type=None, **kwargs):
        """Encode a single input and optionally add ``task_type_ids``."""
        batch_encoding = super()._encode_plus(*args, **kwargs)
        return self._with_task_type_ids(
            batch_encoding, task_type, kwargs.get('return_tensors')
        )

    def _with_task_type_ids(self, batch_encoding, task_type, return_tensors=None):
        """Return ``batch_encoding`` extended with ``task_type_ids``.

        The encoding is returned as-is when ``task_type`` is ``None``;
        multiplying the id tensor by ``None`` would otherwise raise.
        """
        if task_type is None:
            return batch_encoding
        return BatchEncoding(
            {
                'task_type_ids': self._get_task_type_ids(batch_encoding, task_type),
                **batch_encoding,
            },
            tensor_type=return_tensors,
        )

    @staticmethod
    def _get_task_type_ids(batch_encoding: BatchEncoding, task_type: int):
        """Build ``task_type_ids`` matching ``batch_encoding['input_ids']``.

        Args:
            batch_encoding: Mapping that contains an ``input_ids`` entry.
            task_type: Value every position is filled with.

        Returns:
            A container of the same kind as ``input_ids``: a ``torch.long``
            tensor for tensors, nested lists for lists (including ragged,
            unpadded batches), an ``int64`` array for NumPy arrays. Any
            other type triggers a warning and yields a torch tensor.
        """
        input_ids = batch_encoding['input_ids']

        if isinstance(input_ids, torch.Tensor):
            return torch.ones(input_ids.shape, dtype=torch.long) * task_type

        if isinstance(input_ids, list):
            # Mirror the nesting element by element instead of going through
            # torch.tensor(), which rejects rows of unequal length.
            def fill(ids):
                if isinstance(ids, (list, tuple)):
                    return [fill(item) for item in ids]
                return task_type

            return fill(input_ids)

        # np.ndarray is the array type; np.array is a factory function and
        # is not a valid second argument to isinstance().
        if isinstance(input_ids, np.ndarray):
            return np.full(input_ids.shape, task_type, dtype=np.int64)

        warnings.warn(
            'input_ids is not a torch tensor, numpy array, or list. Returning torch tensor'
        )
        shape = torch.tensor(input_ids).shape
        return torch.ones(shape, dtype=torch.long) * task_type