|
|
import gradio as gr |
|
|
import pandas as pd |
|
|
import torch |
|
|
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments |
|
|
from sklearn.preprocessing import LabelEncoder |
|
|
from sklearn.model_selection import train_test_split |
|
|
from datasets import Dataset |
|
|
from underthesea import word_tokenize |
|
|
import os |
|
|
import pickle |
|
|
|
|
|
|
|
|
# Pretrained Vietnamese PhoBERT checkpoint used as the fine-tuning base.
MODEL_NAME = "vinai/phobert-base"


# Shared tokenizer for preprocessing during training.
# use_fast=False — presumably the fast tokenizer is unsuitable for this
# checkpoint; TODO confirm against the PhoBERT model card.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)


# Fitted in train_model() and persisted to label_encoder.pkl;
# lazily restored in predict_sentiment() when None.
label_encoder = None
|
|
|
|
|
|
|
|
def preprocess_function(example):
    """Word-segment one example's ``comment`` and encode it for PhoBERT.

    The Vietnamese text is first segmented with underthesea, then passed
    through the module-level tokenizer with truncation enabled. Padding is
    deferred to the Trainer's collator.
    """
    segmented = word_tokenize(example["comment"], format="text")
    return tokenizer(segmented, truncation=True)
|
|
|
|
|
|
|
|
def train_model(file):
    """Fine-tune PhoBERT on an uploaded CSV of labelled Vietnamese comments.

    Args:
        file: Gradio file object; ``file.name`` is the path to a CSV that
            must contain the columns ``comment`` (text) and ``label``
            (class name).

    Returns:
        A status string: success, or a user-facing error message.

    Side effects:
        Writes ``label_encoder.pkl`` and saves the fine-tuned model and
        tokenizer under ``finetuned_phobert/``.
    """
    df = pd.read_csv(file.name)

    # Validate the upload before any expensive work; a malformed CSV would
    # otherwise surface as a raw KeyError/ValueError traceback in the UI.
    missing = {"comment", "label"} - set(df.columns)
    if missing:
        return f"❌ Lỗi: file CSV thiếu cột: {', '.join(sorted(missing))}."
    if df.empty:
        return "❌ Lỗi: file CSV không có dữ liệu."

    global label_encoder
    label_encoder = LabelEncoder()
    df["label"] = label_encoder.fit_transform(df["label"])

    # Persist the fitted encoder so predict_sentiment() can map class
    # indices back to label names in a later session.
    with open("label_encoder.pkl", "wb") as f:
        pickle.dump(label_encoder, f)

    dataset = Dataset.from_pandas(df[["comment", "label"]])
    tokenized_dataset = dataset.map(preprocess_function)

    # 80/20 train/eval split.
    tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.2)

    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME, num_labels=len(label_encoder.classes_)
    )

    args = TrainingArguments(
        output_dir="./results",
        # NOTE(review): renamed to `eval_strategy` in newer transformers
        # releases — update if the pinned version is >= 4.41.
        evaluation_strategy="epoch",
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        save_strategy="no",  # final weights are saved explicitly below
        logging_steps=10,
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
        tokenizer=tokenizer,
    )

    try:
        trainer.train()
    except IndexError as e:
        # Typically triggered when a label index exceeds num_labels.
        return f"❌ Lỗi: Có thể nhãn (label) vượt quá số lượng cho phép. Chi tiết: {str(e)}"

    model.save_pretrained("finetuned_phobert")
    tokenizer.save_pretrained("finetuned_phobert")

    return "✅ Huấn luyện thành công!"
|
|
|
|
|
|
|
|
def predict_sentiment(text):
    """Classify the sentiment of one Vietnamese sentence with the fine-tuned model.

    Args:
        text: Raw user input from the UI textbox.

    Returns:
        A formatted result string (predicted label + probability), or a
        user-facing error message when prerequisites are missing.
    """
    if not os.path.exists("finetuned_phobert"):
        return "❌ Chưa có mô hình được huấn luyện."
    # Guard against empty input, which would otherwise be tokenized as an
    # empty sequence.
    if not text or not text.strip():
        return "❌ Vui lòng nhập câu đánh giá."

    model = AutoModelForSequenceClassification.from_pretrained("finetuned_phobert")
    tokenizer = AutoTokenizer.from_pretrained("finetuned_phobert", use_fast=False)

    # Lazily restore the encoder saved by train_model() so predictions work
    # in a fresh session. Fail with a message — not a FileNotFoundError
    # traceback — if the pickle is missing.
    global label_encoder
    if label_encoder is None:
        if not os.path.exists("label_encoder.pkl"):
            return "❌ Không tìm thấy label_encoder.pkl — hãy huấn luyện lại mô hình."
        with open("label_encoder.pkl", "rb") as f:
            label_encoder = pickle.load(f)

    # Same preprocessing as training: word-segment before tokenizing.
    tokens = word_tokenize(text, format="text")
    inputs = tokenizer(tokens, return_tensors="pt", truncation=True)

    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
        pred = torch.argmax(probs, dim=1).item()

    return f"🔎 Dự đoán: {label_encoder.inverse_transform([pred])[0]} (xác suất: {probs[0][pred]:.2f})"
|
|
|
|
|
|
|
|
# Two-tab UI: tab 1 uploads a CSV and fine-tunes; tab 2 runs inference.
with gr.Blocks() as demo:
    gr.Markdown("# 🔥 Fine-tune cảm xúc tiếng Việt với PhoBERT")

    # Training tab: upload a labelled CSV and start fine-tuning.
    with gr.Tab("1️⃣ Huấn luyện"):
        csv_upload = gr.File(label="Tải lên file CSV")
        start_training = gr.Button("Huấn luyện mô hình")
        training_status = gr.Textbox(label="Kết quả")
        start_training.click(fn=train_model, inputs=csv_upload, outputs=training_status)

    # Inference tab: type a sentence, get the predicted sentiment.
    with gr.Tab("2️⃣ Dự đoán"):
        review_box = gr.Textbox(label="Nhập câu đánh giá")
        run_prediction = gr.Button("Dự đoán")
        prediction_result = gr.Textbox(label="Kết quả")
        run_prediction.click(fn=predict_sentiment, inputs=review_box, outputs=prediction_result)

demo.launch()
|
|
|