feat-add-constant-for-task-type-ids (#10)

Browse files

- feat: add enum for task type ids (db57d383793c47c7e6f6487d68c20311be3bf20d)

Files changed (1) hide show

tokenizer.py +30 -11

tokenizer.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import torch
 import numpy as np
 from transformers import RobertaTokenizer, BatchEncoding, RobertaTokenizerFast
 import warnings
@@ -6,6 +7,14 @@ import warnings
 def get_tokenizer(parent_class):
     class TokenizerClass(parent_class):
         def __init__(self, *args, **kwargs):
             """
             This class dynamically extends a given tokenizer class from the HF
@@ -16,26 +25,34 @@ def get_tokenizer(parent_class):
             """
             super().__init__(*args, **kwargs)
-        def __call__(self, *args, task_type=None, **kwargs):
             batch_encoding = super().__call__(*args, **kwargs)
             if task_type is not None:
-                batch_encoding = self._add_task_type_ids(batch_encoding, task_type, kwargs.get('return_tensors'))
             return batch_encoding
-        def _batch_encode_plus(self, *args, task_type=None, **kwargs):
             batch_encoding = super()._batch_encode_plus(*args, **kwargs)
             if task_type is not None:
-                batch_encoding = self._add_task_type_ids(batch_encoding, task_type, kwargs.get('return_tensors'))
             return batch_encoding
-        def _encode_plus(self, *args, task_type=None, **kwargs):
             batch_encoding = super()._encode_plus(*args, **kwargs)
             if task_type is not None:
-                batch_encoding = self._add_task_type_ids(batch_encoding, task_type, kwargs.get('return_tensors'))
             return batch_encoding
         @classmethod
-        def _add_task_type_ids(cls, batch_encoding, task_type, tensor_type):
             return BatchEncoding(
                 {
                     'task_type_ids': cls._get_task_type_ids(batch_encoding, task_type),
@@ -45,12 +62,11 @@ def get_tokenizer(parent_class):
             )
         @staticmethod
-        def _get_task_type_ids(batch_encoding: BatchEncoding, task_type):
             def apply_task_type(m, x):
                 x = torch.tensor(x)
                 assert (
-                        len(x.shape) == 0 or x.shape[0] == m.shape[0]
                 ), 'The shape of task_type does not match the size of the batch.'
                 return m * x if len(x.shape) == 0 else m * x[:, None]
@@ -79,10 +95,13 @@ def get_tokenizer(parent_class):
                     warnings.warn(
                         'input_ids is not a torch tensor, numpy array, or list. Returning torch tensor'
                     )
-                    return apply_task_type(torch.ones(shape, dtype=torch.long), task_type)
     return TokenizerClass
 JinaTokenizer = get_tokenizer(RobertaTokenizer)
 JinaTokenizerFast = get_tokenizer(RobertaTokenizerFast)

 import torch
+from enum import IntEnum
 import numpy as np
 from transformers import RobertaTokenizer, BatchEncoding, RobertaTokenizerFast
 import warnings
 def get_tokenizer(parent_class):
     class TokenizerClass(parent_class):
+        class TaskTypes(IntEnum):
+            NULL = (0,)
+            QUERY = 1
+            DOCUMENT = 2
+            STS = 3
+            CLUSTERING = (4,)
+            CLASSIFICATION = 5
         def __init__(self, *args, **kwargs):
             """
             This class dynamically extends a given tokenizer class from the HF
             """
             super().__init__(*args, **kwargs)
+        def __call__(self, *args, task_type: TaskTypes = None, **kwargs):
             batch_encoding = super().__call__(*args, **kwargs)
             if task_type is not None:
+                batch_encoding = self._add_task_type_ids(
+                    batch_encoding, task_type, kwargs.get('return_tensors')
+                )
             return batch_encoding
+        def _batch_encode_plus(self, *args, task_type: TaskTypes = None, **kwargs):
             batch_encoding = super()._batch_encode_plus(*args, **kwargs)
             if task_type is not None:
+                batch_encoding = self._add_task_type_ids(
+                    batch_encoding, task_type, kwargs.get('return_tensors')
+                )
             return batch_encoding
+        def _encode_plus(self, *args, task_type: TaskTypes = None, **kwargs):
             batch_encoding = super()._encode_plus(*args, **kwargs)
             if task_type is not None:
+                batch_encoding = self._add_task_type_ids(
+                    batch_encoding, task_type, kwargs.get('return_tensors')
+                )
             return batch_encoding
         @classmethod
+        def _add_task_type_ids(
+            cls, batch_encoding: BatchEncoding, task_type: TaskTypes, tensor_type: str
+        ):
             return BatchEncoding(
                 {
                     'task_type_ids': cls._get_task_type_ids(batch_encoding, task_type),
             )
         @staticmethod
+        def _get_task_type_ids(batch_encoding: BatchEncoding, task_type: TaskTypes):
             def apply_task_type(m, x):
                 x = torch.tensor(x)
                 assert (
+                    len(x.shape) == 0 or x.shape[0] == m.shape[0]
                 ), 'The shape of task_type does not match the size of the batch.'
                 return m * x if len(x.shape) == 0 else m * x[:, None]
                     warnings.warn(
                         'input_ids is not a torch tensor, numpy array, or list. Returning torch tensor'
                     )
+                    return apply_task_type(
+                        torch.ones(shape, dtype=torch.long), task_type
+                    )
     return TokenizerClass
 JinaTokenizer = get_tokenizer(RobertaTokenizer)
 JinaTokenizerFast = get_tokenizer(RobertaTokenizerFast)