# centurio.py
import json
import os
from collections import Counter
from typing import List, Optional, Tuple

from transformers import PreTrainedTokenizer


class CenturioTokenizer(PreTrainedTokenizer):
    """Word-level tokenizer for Hugging Face transformers.

    Splits text into alphanumeric runs and single punctuation characters,
    with spaces mapped to an explicit space token ("▁").
    """

    vocab_files_names = {"vocab_file": "centurio_vocab.json"}
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file=None,
        unk_token="<unk>",
        bos_token="<s>",
        eos_token="</s>",
        pad_token="<pad>",
        sep_token="<sep>",
        cls_token="<cls>",
        mask_token="<mask>",
        space_token="▁",
        **kwargs,
    ):
        self.space_token = space_token
        self._vocab = {}
        self._inv_vocab = {}
        # Load (or seed) the vocabulary *before* calling the base constructor:
        # PreTrainedTokenizer.__init__ registers the special tokens and needs
        # working get_vocab() / _convert_token_to_id() implementations to do so.
        if vocab_file is not None:
            self._load_vocab(vocab_file)
        else:
            self._build_default_vocab([
                unk_token, bos_token, eos_token, pad_token,
                sep_token, cls_token, mask_token, space_token,
            ])
        super().__init__(
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            sep_token=sep_token,
            cls_token=cls_token,
            mask_token=mask_token,
            **kwargs,
        )

    def _build_default_vocab(self, special_tokens: List[str]):
        # Seed the vocabulary with the special tokens only; real entries are
        # added later via _load_vocab() or build_vocab_from_corpus(). The
        # tokens are passed in explicitly because this runs before
        # super().__init__() has set self.unk_token and friends.
        self._vocab = {token: i for i, token in enumerate(special_tokens)}
        self._inv_vocab = {i: token for token, i in self._vocab.items()}

def _load_vocab(self, vocab_file):
with open(vocab_file, "r", encoding="utf-8") as f:
self._vocab = json.load(f)
self._inv_vocab = {v: k for k, v in self._vocab.items()}
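
    # The vocab file is plain JSON mapping token -> id, for example
    # (illustrative values only):
    #   {"<unk>": 0, "<s>": 1, "</s>": 2, "▁": 7, "Привет": 8, ",": 9}
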
    def get_vocab(self):
        # Usual PreTrainedTokenizer convention: the base vocab plus any tokens
        # registered at runtime (added_tokens_encoder).
        vocab = dict(self._vocab)
        vocab.update(self.added_tokens_encoder)
        return vocab

    @property
    def vocab_size(self):
        # Size of the base vocabulary, excluding runtime-added tokens.
        return len(self._vocab)

    def _tokenize(self, text: str) -> List[str]:
        # Make spaces explicit, then split into alphanumeric runs and single
        # non-alphanumeric characters. str.isalnum() already covers Cyrillic
        # (and other Unicode) letters, so no explicit alphabet is needed.
        text = text.replace(" ", self.space_token)
        tokens = []
        current = ""
        for ch in text:
            if ch.isalnum():
                current += ch
            else:
                if current:
                    tokens.append(current)
                    current = ""
                tokens.append(ch)
        if current:
            tokens.append(current)
        return tokens
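
    # For example (illustrative of the logic above):
    #   _tokenize("Привет, как дела!")
    #   -> ["Привет", ",", "▁", "как", "▁", "дела", "!"]
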
    def _convert_token_to_id(self, token: str) -> int:
        # Unknown tokens fall back to the <unk> id (or 0 if <unk> is absent).
        return self._vocab.get(token, self._vocab.get(self.unk_token, 0))

    def _convert_id_to_token(self, index: int) -> str:
        return self._inv_vocab.get(index, self.unk_token)
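
    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        # A minimal sketch: the base class joins tokens with spaces, so
        # decode() would print "Привет , ▁ как ...". Concatenating and mapping
        # the space token back to " " restores the original spacing instead.
        return "".join(tokens).replace(self.space_token, " ")
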
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        os.makedirs(save_directory, exist_ok=True)
        vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") + self.vocab_files_names["vocab_file"],
        )
        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self._vocab, f, ensure_ascii=False, indent=2)
        return (vocab_file,)

    def build_vocab_from_corpus(self, corpus: List[str], min_freq: int = 2):
        """Rebuild the vocabulary from raw texts, keeping tokens seen at least min_freq times."""
        token_counter = Counter()
        for text in corpus:
            token_counter.update(self._tokenize(text))
        # Special tokens always occupy the first ids, in a fixed order.
        special_tokens = [
            self.unk_token, self.bos_token, self.eos_token,
            self.pad_token, self.sep_token, self.cls_token,
            self.mask_token, self.space_token,
        ]
        new_vocab = {token: i for i, token in enumerate(special_tokens)}
        idx = len(new_vocab)
        for token, freq in token_counter.items():
            if freq >= min_freq and token not in new_vocab:
                new_vocab[token] = idx
                idx += 1
        self._vocab = new_vocab
        self._inv_vocab = {v: k for k, v in self._vocab.items()}
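

# Usage note: with the default min_freq=2, tokens seen only once in the corpus
# are dropped and will encode to <unk>; the demo below passes min_freq=1 to
# keep every token.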
if __name__ == "__main__":
    # Demo on three Russian sentences: "Hello, how are you!", "I am learning
    # German.", "Morphemes help to understand the structure of words."
    corpus = [
        "Привет, как дела!",
        "Я учу немецкий язык.",
        "Морфемы помогают понять структуру слов.",
    ]
    tokenizer = CenturioTokenizer()
    tokenizer.build_vocab_from_corpus(corpus, min_freq=1)
    tokenizer.save_pretrained("./centurio_model")
    for text in corpus:
        tokens = tokenizer.tokenize(text)
        ids = tokenizer.encode(text)
        back = tokenizer.decode(ids)
        print(f"\nTEXT   : {text}")
        print(f"TOKENS : {tokens}")
        print(f"IDS    : {ids}")
        print(f"BACK   : {back}")
    print(f"VOCAB  : {tokenizer.vocab_size}")