jinaai
/

jina-bert-flash-implementation

🇪🇺 Region: EU

Model card Files Files and versions

michael-guenther committed on Mar 5, 2024

Commit

4b66519

·

1 Parent(s): 6170b43

add assertions and docs

Files changed (1) hide show

tokenizer.py +11 -2

tokenizer.py CHANGED Viewed

@@ -5,9 +5,15 @@ import warnings
 class JinaTokenizer(RobertaTokenizer):
-    def __init__(self, *args, task_type_vocab_size=6, **kwargs):
         super().__init__(*args, **kwargs)
-        self.task_type_vocab_size = task_type_vocab_size
     def __call__(self, *args, task_type=None, **kwargs):
         batch_encoding = super().__call__(*args, **kwargs)
@@ -50,6 +56,9 @@ class JinaTokenizer(RobertaTokenizer):
         def apply_task_type(m, x):
             x = torch.tensor(x)
             return m * x if len(x.shape) == 0 else m * x[:, None]
         if isinstance(batch_encoding['input_ids'], torch.Tensor):

 class JinaTokenizer(RobertaTokenizer):
+    def __init__(self, *args, **kwargs):
+        """
+        JinaTokenizer extends the RobertaTokenizer class to include task_type_ids in
+        the batch encoding.
+        The task_type_ids are used to pass instruction information to the model.
+        A task_type should either be an integer or a sequence of integers with the same
+        length as the batch size.
+        """
         super().__init__(*args, **kwargs)
     def __call__(self, *args, task_type=None, **kwargs):
         batch_encoding = super().__call__(*args, **kwargs)
         def apply_task_type(m, x):
             x = torch.tensor(x)
+            assert (
+                len(x.shape) == 0 or x.shape[0] == m.shape[0]
+            ), 'The shape of task_type does not match the size of the batch.'
             return m * x if len(x.shape) == 0 else m * x[:, None]
         if isinstance(batch_encoding['input_ids'], torch.Tensor):