fix out-of-vocab token
tokenization_interns1.py  +3 −1
@@ -893,7 +893,9 @@ class InternS1Tokenizer(Qwen2Tokenizer):
 
     def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
-        text = "".join(tokens)
+        text = ""
+        for token in tokens:
+            text += token if token else ""
         text = text.replace(
             "▁", "Ġ"
         )  # This discrepancy stems from differing whitespace treatment in SentencePiece versus BPE tokenization.
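
For context, a minimal standalone sketch of the failure mode this commit addresses: when an id falls outside the vocabulary, converting ids back to tokens can yield None in the token list, and "".join(tokens) then raises TypeError, while the new loop simply skips falsy entries. The sample tokens list below is an illustrative assumption, not data from the repo.

# Minimal sketch (assumed input): an out-of-vocab id decoded to None.
tokens = ["Hello", None, "▁world"]

# Old behavior: str.join requires every element to be a str,
# so a single None token crashes the whole decode.
try:
    text = "".join(tokens)
except TypeError as err:
    print(f"old code fails: {err}")

# New behavior: falsy tokens (None or "") are silently dropped.
text = ""
for token in tokens:
    text += token if token else ""
text = text.replace("▁", "Ġ")  # SentencePiece vs. BPE whitespace marker
print(text)  # -> HelloĠworld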