| | import json
|
| | import os
|
| | from typing import List, Optional
|
| | from transformers import PreTrainedTokenizer
|
| |
|
| |
|
class CenturioTokenizer(PreTrainedTokenizer):
    """Word-level tokenizer with a JSON vocabulary and ``▁`` as the space marker.

    Text is split into alphanumeric runs and single non-alphanumeric
    characters; unknown tokens map to ``unk_token``. The vocabulary is
    either loaded from ``centurio_vocab.json`` or built from a corpus via
    :meth:`build_vocab_from_corpus`.
    """

    vocab_files_names = {"vocab_file": "centurio_vocab.json"}
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file=None,
        unk_token="<unk>",
        bos_token="<s>",
        eos_token="</s>",
        pad_token="<pad>",
        sep_token="<sep>",
        cls_token="<cls>",
        mask_token="<mask>",
        space_token="▁",
        **kwargs,
    ):
        self.space_token = space_token
        self._vocab = {}
        self._inv_vocab = {}

        # BUG FIX: the vocabulary must exist *before* super().__init__()
        # runs. PreTrainedTokenizer's constructor registers the special
        # tokens through get_vocab()/_convert_token_to_id(); with an empty
        # vocab every special token would resolve to id 0 and be re-added
        # as an "added token" with a conflicting index. The token list is
        # passed explicitly because self.unk_token etc. are only set by the
        # parent constructor.
        if vocab_file is not None:
            self._load_vocab(vocab_file)
        else:
            self._build_default_vocab(
                [
                    unk_token, bos_token, eos_token, pad_token,
                    sep_token, cls_token, mask_token, space_token,
                ]
            )

        super().__init__(
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            sep_token=sep_token,
            cls_token=cls_token,
            mask_token=mask_token,
            **kwargs,
        )

    def _special_tokens(self) -> List[str]:
        # Canonical ordering of the reserved tokens (ids 0..7 in a fresh vocab).
        return [
            self.unk_token, self.bos_token, self.eos_token,
            self.pad_token, self.sep_token, self.cls_token,
            self.mask_token, self.space_token,
        ]

    def _build_default_vocab(self, special_tokens: Optional[List[str]] = None):
        """Initialize the vocabulary with only the special tokens.

        ``special_tokens`` may be supplied explicitly (needed during
        ``__init__``, before the parent class has set ``self.unk_token``
        etc.); when ``None``, the instance attributes are used — keeping
        the original zero-argument call signature working.
        """
        if special_tokens is None:
            special_tokens = self._special_tokens()
        self._vocab = {token: i for i, token in enumerate(special_tokens)}
        self._inv_vocab = {i: token for token, i in self._vocab.items()}

    def _load_vocab(self, vocab_file):
        """Load a ``token -> id`` mapping from a UTF-8 JSON file."""
        with open(vocab_file, "r", encoding="utf-8") as f:
            self._vocab = json.load(f)
        self._inv_vocab = {v: k for k, v in self._vocab.items()}

    def get_vocab(self):
        # Return a copy so callers cannot mutate the internal mapping.
        return self._vocab.copy()

    @property
    def vocab_size(self):
        return len(self._vocab)

    def _tokenize(self, text: str) -> List[str]:
        """Split ``text`` into alphanumeric runs and single other characters.

        Spaces are first replaced with ``space_token`` so word boundaries
        survive the round trip through the vocabulary.
        """
        text = text.replace(" ", self.space_token)
        tokens: List[str] = []
        current = ""
        for ch in text:
            # str.isalnum() is Unicode-aware, so it already covers Cyrillic
            # (and every other script's) letters — the former explicit
            # Cyrillic-alphabet membership test was redundant.
            if ch.isalnum():
                current += ch
            else:
                if current:
                    tokens.append(current)
                    current = ""
                # The former `ch if ch != self.space_token else
                # self.space_token` ternary was a no-op; append directly.
                tokens.append(ch)
        if current:
            tokens.append(current)
        return tokens

    def _convert_token_to_id(self, token: str) -> int:
        # Unknown tokens fall back to the unk id; 0 is a last resort in case
        # the unk token itself is missing from a user-supplied vocab file.
        return self._vocab.get(token, self._vocab.get(self.unk_token, 0))

    def _convert_id_to_token(self, index: int) -> str:
        return self._inv_vocab.get(index, self.unk_token)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple:
        """Write the vocabulary to ``<prefix->centurio_vocab.json``.

        Returns a one-tuple with the written file path, per the
        tokenizer-saving convention of the base class.
        """
        # exist_ok avoids a race between the isdir check and makedirs.
        os.makedirs(save_directory, exist_ok=True)
        vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") + "centurio_vocab.json",
        )
        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self._vocab, f, ensure_ascii=False, indent=2)
        return (vocab_file,)

    def build_vocab_from_corpus(self, corpus: List[str], min_freq: int = 2):
        """Rebuild the vocabulary from raw texts.

        Special tokens always occupy the first ids; corpus tokens seen at
        least ``min_freq`` times follow in first-seen order.
        """
        from collections import Counter

        token_counter = Counter()
        for text in corpus:
            token_counter.update(self._tokenize(text))

        # Shared helper keeps the special-token ordering consistent with
        # _build_default_vocab instead of duplicating the list.
        new_vocab = {token: i for i, token in enumerate(self._special_tokens())}
        idx = len(new_vocab)

        for token, freq in token_counter.items():
            if freq >= min_freq and token not in new_vocab:
                new_vocab[token] = idx
                idx += 1

        self._vocab = new_vocab
        self._inv_vocab = {v: k for k, v in self._vocab.items()}
|
if __name__ == "__main__":
    # Small smoke-test: build a vocab from three Russian sentences,
    # persist it, and show the tokenize/encode/decode round trip.
    demo_corpus = [
        "Привет, как дела!",
        "Я учу немецкий язык.",
        "Морфемы помогают понять структуру слов.",
    ]

    tok = CenturioTokenizer()
    tok.build_vocab_from_corpus(demo_corpus, min_freq=1)
    tok.save_pretrained("./centurio_model")

    for sample in demo_corpus:
        pieces = tok.tokenize(sample)
        encoded = tok.encode(sample)
        decoded = tok.decode(encoded)
        print(f"\nTEXT : {sample}")
        print(f"TOKENS : {pieces}")
        print(f"IDS : {encoded}")
        print(f"BACK : {decoded}")
        print(f"VOCAB : {tok.vocab_size}")
|