Spaces:

VRLLab
/

TurkishBERTweet-SA-LoRA

Sleeping

App Files Files Community

AliNajafi committed on Jan 29, 2024

Commit

d856fda

1 Parent(s): 2307b31

Add application files

Browse files

Files changed (6) hide show

.gitignore +129 -0
Preprocessor/__init__.py +2 -0
Preprocessor/demojize.py +91 -0
Preprocessor/emojis_tr_twitter.json +0 -0
Preprocessor/preprocessor.py +75 -0
app.py +45 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,129 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+.python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/

Preprocessor/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ from .demojize import demojize
2	+ from .preprocessor import preprocess

Preprocessor/demojize.py ADDED Viewed

	@@ -0,0 +1,91 @@

+import json
+import os
# Resolve the data file relative to this module so loading does not depend on
# the process's current working directory.
EMOJI_DATA_PATH = os.path.join(os.path.dirname(__file__), "emojis_tr_twitter.json")
# Decode explicitly as UTF-8: without it, open() falls back to the locale's
# preferred encoding, which may fail on (or mis-decode) non-ASCII content.
with open(EMOJI_DATA_PATH, "r", encoding="utf-8") as f:
    # Mapping consumed by _get_search_tree(): key -> per-emoji data dict.
    emojis = json.load(f)
# Lazily built character trie over the keys of ``emojis``; None until first use.
_SEARCH_TREE = None


def _get_search_tree():
    """Return the character trie built from ``emojis``, building it on first call.

    Each key of ``emojis`` is inserted one character at a time; the node reached
    after the final character gets a ``"data"`` entry holding ``emojis[key]``.
    The trie is stored in the module-level ``_SEARCH_TREE`` and reused afterwards.
    """
    global _SEARCH_TREE
    if _SEARCH_TREE is not None:
        return _SEARCH_TREE
    _SEARCH_TREE = root = {}
    for sequence, payload in emojis.items():
        node = root
        for code_point in sequence:
            node = node.setdefault(code_point, {})
        # An empty key never reaches a node of its own, so it stores nothing.
        if sequence:
            node["data"] = payload
    return _SEARCH_TREE
def demojize(
    string,
    delimiters=("<emoji> ", " </emoji>"),
    language="tr",
    version=None,
    handle_version=None,
):
    """Replace emoji sequences in ``string`` with delimited textual names.

    The input is scanned left to right against the trie returned by
    ``_get_search_tree()``. At each position the *longest* sequence that has a
    ``"data"`` entry is taken as the match, so a shorter emoji is still
    recognised when the following characters only start a longer sequence
    that never completes.

    Args:
        string: Text to process.
        delimiters: Pair ``(prefix, suffix)`` placed around each emitted name.
        language: Key looked up in the emoji's data for its name. The special
            value ``"alias"`` uses the ``"alias"`` entry when the emoji has
            one and otherwise behaves like ``"tr"``.
        version: When not None, emojis whose ``"E"`` field is greater than
            this value are passed to ``handle_version`` instead of being named.
        handle_version: Either a callable ``(emoji, data) -> str`` (``data`` is
            a copy of the emoji's data with ``match_start``/``match_end``
            added), or any non-None value that is converted with ``str()``.
            With None, such emojis are removed from the output.

    Returns:
        The processed string. Variation selectors U+FE0E / U+FE0F that are not
        part of a matched sequence are dropped.
    """
    use_aliases = language == "alias"
    if use_aliases:
        language = "tr"
    variation_selectors = ("\ufe0e", "\ufe0f")
    tree = _get_search_tree()
    result = []
    length = len(string)
    i = 0
    while i < length:
        char = string[i]

        # Walk the trie as far as the input allows, remembering the last node
        # that carried emoji data (the longest complete match so far).
        emj_data = None
        match_end = i
        node = tree.get(char)
        j = i + 1
        while node is not None:
            if "data" in node:
                emj_data = node["data"]
                match_end = j
            if j == length:
                break
            node = node.get(string[j])
            j += 1

        if emj_data is None:
            # Not an emoji: copy the character through, minus stray selectors.
            if char not in variation_selectors:
                result.append(char)
            i += 1
            continue

        code_points = string[i:match_end]
        replace_str = None
        if version is not None and emj_data["E"] > version:
            if callable(handle_version):
                emj_data = emj_data.copy()
                emj_data["match_start"] = i
                emj_data["match_end"] = match_end
                replace_str = handle_version(code_points, emj_data)
            elif handle_version is not None:
                replace_str = str(handle_version)
        elif language in emj_data:
            if use_aliases and "alias" in emj_data:
                # NOTE(review): only the last character is trimmed here while
                # the branch below trims both ends; kept as-is, confirm against
                # the alias format in the data file.
                name = emj_data["alias"][0][:-1]
            else:
                name = emj_data[language][1:-1]
            replace_str = delimiters[0] + name + delimiters[1]
        else:
            # The emoji exists, but it is not translated, so we keep the emoji
            replace_str = code_points

        # A None or empty replacement removes the emoji from the output.
        if replace_str:
            result.append(replace_str)
        i = match_end
    return "".join(result)

Preprocessor/emojis_tr_twitter.json ADDED Viewed

The diff for this file is too large to render. See raw diff

Preprocessor/preprocessor.py ADDED Viewed

	@@ -0,0 +1,75 @@

+import urllib
+import html
+import re
+from urlextract import URLExtract
+from unicodedata import normalize
+from .demojize import demojize
def hashtag_handler(text: str):
    """Rewrite each ``#`` followed by non-whitespace as `` <hashtag> ... </hashtag> ``.

    The ``#`` itself is dropped; the run of non-whitespace characters after it
    is kept between the two tags.
    """
    return re.sub(r"(#([^\s]+))", " <hashtag> \\2 </hashtag> ", text)
def cashtag_handler(text: str):
    """Rewrite each ``$`` followed by non-whitespace as `` <cashtag> ... </cashtag> ``.

    The ``$`` itself is dropped; the run of non-whitespace characters after it
    is kept between the two tags.
    """
    return re.sub(r"(\$([^\s]+))", " <cashtag> \\2 </cashtag> ", text)
def mention_handler(text: str):
    """Replace each ``@`` followed by non-whitespace with the placeholder `` @user ``."""
    return re.sub(r"(@([^\s]+))", " @user ", text)
# Shared extractor instance, created once at import time and reused per call.
url_extractor = URLExtract()


def url_handler(text: str) -> str:
    """Replace every URL found in ``text`` with `` <http> <domain> </http> ``.

    Each URL reported by ``url_extractor`` is replaced by the network location
    of that same URL. URLs that do not contain ``"http"`` are prefixed with
    ``https://`` before parsing so that ``urlparse`` can find the domain.

    Args:
        text: Input text.

    Returns:
        The text with all occurrences of each extracted URL replaced.
    """
    # Imported here so the function does not rely on ``urllib.parse`` having
    # been loaded as a side effect of some other import.
    from urllib.parse import urlparse

    handled = set()
    for url in url_extractor.gen_urls(text):
        # str.replace below already rewrites every occurrence of this URL.
        if url in handled:
            continue
        handled.add(url)
        # Domain is derived per URL, so the pairing cannot drift the way two
        # separately ordered lists can.
        absolute = url if "http" in url else f"https://{url}"
        domain = urlparse(absolute).netloc
        text = text.replace(url, f" <http> {domain} </http> ")
    return text
def email_handler(text: str) -> str:
    """Replace every e-mail address in ``text`` with the placeholder `` <email> ``.

    All matches are substituted in one regex pass, so an address that happens
    to contain another matched address as a substring is replaced as a whole
    rather than being partially rewritten.

    Args:
        text: Input text.

    Returns:
        The text with addresses replaced and surrounding whitespace stripped.
        When no address is found, ``text`` is returned unchanged (not stripped).
    """
    pattern = r"[\w.+-]+@[\w-]+\.[\w.-]+"
    replaced, count = re.subn(pattern, " <email> ", text)
    return replaced.strip() if count else text
def emoji_handler(text: str):
    """Run ``demojize`` on ``text`` with the ``"tr"`` names and space-padded emoji tags."""
    tag_pair = (" <emoji> ", " </emoji> ")
    return demojize(text, delimiters=tag_pair, language="tr")
def normalize_text(text: str):
    """Return ``text`` converted to Unicode normalization form NFC."""
    form = "NFC"
    return normalize(form, text)
def preprocess(text: str):
    """Apply the full cleaning pipeline to ``text`` and return the result.

    Steps, in order: HTML-entity unescaping, NFC normalization, e-mail, URL,
    hashtag, cashtag, mention and emoji rewriting, whitespace collapsing,
    lower-casing, and stripping of leading/trailing whitespace.
    """
    output = normalize_text(html.unescape(text))
    # Order matters: e-mails are handled before mentions (both contain "@"),
    # and URLs before hashtags.
    pipeline = (
        email_handler,
        url_handler,
        hashtag_handler,
        cashtag_handler,
        mention_handler,
        emoji_handler,
    )
    for step in pipeline:
        output = step(output)
    collapsed = re.sub(r"\s+", " ", output)
    return collapsed.lower().strip()
if __name__ == "__main__":
    # Manual smoke run: preprocess an (empty) sample string and show the result.
    sample_text = ""
    print(preprocess(sample_text))

app.py ADDED Viewed

	@@ -0,0 +1,45 @@

+import streamlit as st
+import torch
+from peft import (
+    PeftModel,
+    PeftConfig,
+)
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+from Preprocessor import preprocess
# Hub id of the LoRA adapter; the base model id is read from its config.
peft_model = "VRLLab/TurkishBERTweet-Lora-SA"
# Class index -> human-readable sentiment label.
id2label_sa = {0: "negative", 2: "positive", 1: "neutral"}


@st.cache_resource
def _load_tokenizer_and_model():
    """Load the tokenizer and the LoRA-adapted classifier.

    Streamlit re-executes this script on every widget interaction; caching the
    loaded objects as a resource avoids reloading the weights on each rerun.

    Returns:
        Tuple ``(tokenizer, model)``.
    """
    peft_config = PeftConfig.from_pretrained(peft_model)
    # loading Tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        peft_config.base_model_name_or_path, padding_side="right"
    )
    # Fall back to the EOS token when the tokenizer defines no padding token.
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id
    base_model = AutoModelForSequenceClassification.from_pretrained(
        peft_config.base_model_name_or_path,
        return_dict=True,
        num_labels=len(id2label_sa),
        id2label=id2label_sa,
    )
    model = PeftModel.from_pretrained(base_model, peft_model)
    # Inference only: make sure dropout and similar layers are disabled.
    model.eval()
    return tokenizer, model


tokenizer, turkishBERTweet_sa = _load_tokenizer_and_model()

st.title("Sentiment Analysis with HuggingFace Spaces")
st.write("Enter a sentence to analyze its sentiment:")
# A non-empty label keeps the widget accessible; it is collapsed so the page
# looks the same as with the previous empty label.
user_input = st.text_input("Sentence", label_visibility="collapsed")
if user_input:
    ids = tokenizer.encode_plus(preprocess(user_input), return_tensors="pt")
    # No gradients are needed for prediction.
    with torch.no_grad():
        logits = turkishBERTweet_sa(**ids).logits
    label_id = logits.argmax(-1).item()
    confidence = logits.softmax(-1)[0, label_id].item()
    st.write(f"Sentiment: {id2label_sa[label_id]}")
    st.write(f"Confidence: {confidence:.2f}")