jinaai
/

jina-bert-flash-implementation

Transformers

bert

custom_code

🇪🇺 Region: EU

Model card Files Files and versions

xet

Community

michael-guenther committed on Mar 5, 2024

Commit

6170b43

1 Parent(s): 326b1c4

support multiple task ids

Browse files

Files changed (1) hide show

tokenizer.py +31 -13

tokenizer.py CHANGED Viewed

@@ -11,13 +11,14 @@ class JinaTokenizer(RobertaTokenizer):
     def __call__(self, *args, task_type=None, **kwargs):
         batch_encoding = super().__call__(*args, **kwargs)
-        batch_encoding = BatchEncoding(
-            {
-                'task_type_ids': self._get_task_type_ids(batch_encoding, task_type),
-                **batch_encoding,
-            },
-            tensor_type=kwargs.get('return_tensors'),
-        )
         return batch_encoding
     def _batch_encode_plus(self, *args, task_type=None, **kwargs):
@@ -45,18 +46,35 @@ class JinaTokenizer(RobertaTokenizer):
         return batch_encoding
     @staticmethod
-    def _get_task_type_ids(batch_encoding: BatchEncoding, task_type: int):
         if isinstance(batch_encoding['input_ids'], torch.Tensor):
             shape = batch_encoding['input_ids'].shape
-            return torch.ones(shape, dtype=torch.long) * task_type
         else:
-            shape = torch.tensor(batch_encoding['input_ids']).shape
             if isinstance(batch_encoding['input_ids'], list):
-                return (torch.ones(shape, dtype=torch.long) * task_type).tolist()
             elif isinstance(batch_encoding['input_ids'], np.array):
-                return (torch.ones(shape, dtype=torch.long) * task_type).numpy()
             else:
                 warnings.warn(
                     'input_ids is not a torch tensor, numpy array, or list. Returning torch tensor'
                 )
-                return torch.ones(shape, dtype=torch.long) * task_type

     def __call__(self, *args, task_type=None, **kwargs):
         batch_encoding = super().__call__(*args, **kwargs)
+        if task_type is not None:
+            batch_encoding = BatchEncoding(
+                {
+                    'task_type_ids': self._get_task_type_ids(batch_encoding, task_type),
+                    **batch_encoding,
+                },
+                tensor_type=kwargs.get('return_tensors'),
+            )
         return batch_encoding
     def _batch_encode_plus(self, *args, task_type=None, **kwargs):
         return batch_encoding
     @staticmethod
+    def _get_task_type_ids(batch_encoding: BatchEncoding, task_type):
+        def apply_task_type(m, x):
+            x = torch.tensor(x)
+            return m * x if len(x.shape) == 0 else m * x[:, None]
         if isinstance(batch_encoding['input_ids'], torch.Tensor):
             shape = batch_encoding['input_ids'].shape
+            return apply_task_type(torch.ones(shape, dtype=torch.long), task_type)
         else:
+            try:
+                shape = torch.tensor(batch_encoding['input_ids']).shape
+            except:
+                raise ValueError(
+                    "Unable to create tensor, you should probably "
+                    "activate truncation and/or padding with "
+                    "'padding=True' 'truncation=True' to have batched "
+                    "tensors with the same length."
+                )
             if isinstance(batch_encoding['input_ids'], list):
+                return (
+                    apply_task_type(torch.ones(shape, dtype=torch.long), task_type)
+                ).tolist()
             elif isinstance(batch_encoding['input_ids'], np.array):
+                return (
+                    apply_task_type(torch.ones(shape, dtype=torch.long), task_type)
+                ).numpy()
             else:
                 warnings.warn(
                     'input_ids is not a torch tensor, numpy array, or list. Returning torch tensor'
                 )
+                return apply_task_type(torch.ones(shape, dtype=torch.long), task_type)