add_model_src_code (#1)

Browse files

- add model src code (6f8733e4a99d15adc3c3f5794ceb9f442ca289b2)
- rename and documentation (fbc3442307396d208fb11eae95ceef4d7b81ed89)

Co-authored-by: Nir Raviv <[email protected]>

Files changed (4) hide show

requirements.txt +3 -0
src/config.py +44 -0
src/inference.py +174 -0
src/models.py +24 -0

requirements.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+numpy==1.23.5
+torch==2.2.2
+transformers==4.44.2

src/config.py ADDED Viewed

	@@ -0,0 +1,44 @@

+from transformers import BertConfig
+class PunctuationBertConfig(BertConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`BertForPunctuation`] model. It extends
+    [`BertConfig`] with the punctuation-specific arguments listed below; any other keyword argument is forwarded
+    unchanged to [`BertConfig`].
+     Args:
+        backward_context (`int`, *optional*, defaults to 15):
+            size of backward context window
+        forward_context (`int`, *optional*, defaults to 16):
+            size of forward context window
+        output_size (`int`, *optional*, defaults to 4):
+            number of punctuation classes
+        dropout (`float`, *optional*, defaults to 0.3):
+            dropout rate
+    Examples:
+        ```python
+    >>> from src.config import PunctuationBertConfig
+    >>> from src.models import BertForPunctuation
+    >>> # Initializing a configuration with the default BertConfig values and the default punctuation arguments
+    >>> configuration = PunctuationBertConfig()
+    >>> # Initializing a model (with random weights) from that configuration
+    >>> model = BertForPunctuation(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    def __init__(
+        self,
+        backward_context: int = 15,
+        forward_context: int = 16,
+        output_size: int = 4,
+        dropout: float = 0.3,
+        **kwargs,
+    ):
+        # every argument not named above is handled by BertConfig
+        super().__init__(**kwargs)
+        self.backward_context = backward_context
+        self.forward_context = forward_context
+        self.output_size = output_size
+        self.dropout = dropout

src/inference.py ADDED Viewed

	@@ -0,0 +1,174 @@

+from typing import List, Optional, Tuple
+import numpy as np
+import torch
+import torch.nn.functional as F
+from transformers import BertTokenizer
+from src.models import BertForPunctuation
+# punctuation string for each predicted class index; the empty string means "no punctuation after the word"
+PUNCTUATION_SIGNS = ['', ',', '.', '?']
+# id inserted right after the predicted token in every model input segment
+PAUSE_TOKEN = 0
+# identifier passed to `from_pretrained` for both the model and the tokenizer
+MODEL_NAME = "verbit/hebrew_punctuation"
+def tokenize_text(
+    word_list: List[str], pause_list: List[float], tokenizer: BertTokenizer
+) -> Tuple[List[int], List[int], List[float]]:
+    """
+    Tokenizes text and generates pause list for each word
+    Args:
+        word_list: list of words
+        pause_list: list of pauses after each word in seconds
+        tokenizer: tokenizer
+    Returns:
+        original_word_idx: list of indexes of original words
+        x: list of indexed words
+        pause: list of pauses after each word in seconds
+    """
+    assert len(word_list) == len(pause_list), "word_list and pause_list should have the same length"
+    x, pause = [], []
+    # when we do tokenization the number of tokens might be more than one for single word, so we need to keep
+    # mapping tokens into real words
+    original_word_idx = []
+    for w, p in zip(word_list, pause_list):
+        tokens = tokenizer.tokenize(w)
+        p = [p]
+        # converting tokens to idx, if we have no token for current word then just pad it with 0 to be safe
+        _x = tokenizer.convert_tokens_to_ids(tokens) if tokens else [0]
+        if len(_x) > 1:
+            p = (len(_x) - 1) * [0] + p
+        x += _x
+        original_word_idx.append(len(x) - 1)
+        pause += p
+    return original_word_idx, x, pause
+def gen_model_inputs(
+    x: List[int],
+    pause: List[float],
+    forward_context: int,
+    backward_context: int,
+) -> torch.Tensor:
+    """
+    Generates inputs for model out of list of indexed words.
+    Inserts a pause token into the segment
+    Args:
+        x: list of indexed words
+        pause: list of corresponding pauses
+        forward_context: size of the forward context window
+        backward_context: size of the backward context window (without the predicted token)`
+    Returns:
+        A tensor of model inputs for each indexed word in x
+    """
+    model_input = []
+    tokenized_pause = [PAUSE_TOKEN] * len(pause)
+    x_pad = [0] * backward_context + x + [0] * forward_context
+    for i in range(len(x)):
+        segment = x_pad[i : i + backward_context + forward_context + 1]
+        segment.insert(backward_context + 1, tokenized_pause[i])
+        model_input.append(segment)
+    return torch.tensor(model_input)
+def add_punctuation_to_text(text: str, punct_prob: np.ndarray) -> str:
+    """
+    Inserts punctuation to text on provided punctuation string for every word
+    Args:
+        text: text to insert punctuation to
+        punct_prob: matrix of probabilities for each punctuation
+    Returns:
+        text with punctuation
+    """
+    words = text.split()
+    new_words = list()
+    punctuation_idx = np.argmax(punct_prob, axis=1)
+    punctuation_list = [PUNCTUATION_SIGNS[i] for i in punctuation_idx]
+    for word, punctuation_str in zip(words, punctuation_list):
+        if punctuation_str:
+            new_words.append(word + punctuation_str)
+        else:
+            new_words.append(word)
+    punct_text = ' '.join(new_words)
+    return punct_text
+def get_prediction(
+    model: BertForPunctuation,
+    text: str,
+    tokenizer: BertTokenizer,
+    batch_size: int = 16,
+    backward_context: int = 15,
+    forward_context: int = 16,
+    pause_list: Optional[List[float]] = None,
+    device: str = 'cpu',
+) -> str:
+    """
+    Generates predictions for given list of words.
+    Args:
+        model: punctuation model
+        text: text to predict punctuation for
+        tokenizer: tokenizer
+        batch_size: batch size
+        backward_context: size of the backward context window
+        forward_context: size of the forward context window
+        pause_list: list of pauses after each word in seconds
+        device: device to run model on
+    Returns:
+        text with punctuation
+    """
+    word_list = text.split()
+    if not pause_list:
+        # make default pauses if pauses are not provided
+        pause_list = [0.0] * len(word_list)
+    word_idx, x, pause = tokenize_text(word_list=word_list, pause_list=pause_list, tokenizer=tokenizer)
+    model_inputs = gen_model_inputs(x, pause, forward_context, backward_context)
+    model_inputs = model_inputs.index_select(0, torch.LongTensor(word_idx)).to(device)
+    inputs_length = len(model_inputs)
+    output = []
+    with torch.no_grad():
+        for ndx in range(0, inputs_length, batch_size):
+            o = model(model_inputs[ndx : min(ndx + batch_size, inputs_length)])
+            o = F.softmax(o, dim=1)
+            output.append(o.cpu().data.numpy())
+    punct_probabilities_matrix = np.concatenate(output, axis=0)
+    punct_text = add_punctuation_to_text(text, punct_probabilities_matrix)
+    return punct_text
+def main():
+    """Loads the pretrained model and tokenizer and prints a punctuated sample text."""
+    model = BertForPunctuation.from_pretrained(MODEL_NAME)
+    tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
+    # switch the model to evaluation mode before inference
+    model.eval()
+    # unpunctuated sample input
+    text = """חברת ורביט פיתחה מערכת לתמלול המבוססת על בינה מלאכותית וגורם אנושי ושוקדת על תמלול עדויות ניצולי שואה
+    את התוצאות אפשר לראות כבר ברשת בהן חלקים מעדותו של טוביה ביילסקי שהיה מפקד גדוד הפרטיזנים היהודים בביילורוסיה"""
+    # the context sizes are taken from the loaded model's config rather than the function defaults
+    punct_text = get_prediction(
+        model=model,
+        text=text,
+        tokenizer=tokenizer,
+        backward_context=model.config.backward_context,
+        forward_context=model.config.forward_context,
+    )
+    print(punct_text)
+# run the demo only when the file is executed as a script
+if __name__ == "__main__":
+    main()

src/models.py ADDED Viewed

	@@ -0,0 +1,24 @@

+from torch import nn
+from transformers import BertForMaskedLM, PreTrainedModel
+from src.config import PunctuationBertConfig
+class BertForPunctuation(PreTrainedModel):
+    config_class = PunctuationBertConfig
+    def __init__(self, config):
+        super().__init__(config)
+        # segment_size equal backward_context + forward_context + predicted token + pause token
+        segment_size = config.backward_context + config.forward_context + 2
+        bert_vocab_size = config.vocab_size
+        self.bert = BertForMaskedLM(config)
+        self.bn = nn.BatchNorm1d(segment_size * bert_vocab_size)
+        self.fc = nn.Linear(segment_size * bert_vocab_size, config.output_size)
+        self.dropout = nn.Dropout(config.dropout)
+    def forward(self, x):
+        x = self.bert(x)[0]
+        x = x.view(x.shape[0], -1)
+        x = self.fc(self.dropout(self.bn(x)))
+        return x