# vnanhtuan's picture
# Update app.py
# 18faad3 verified
import gradio as gr
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from datasets import Dataset
from underthesea import word_tokenize
import os
import pickle
# ---- Load PhoBERT ----
# Base PhoBERT checkpoint used for both training and as the tokenizer source.
MODEL_NAME = "vinai/phobert-base"
# NOTE(review): use_fast=False — presumably because PhoBERT ships only a
# slow (Python) tokenizer; confirm against the installed transformers version.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
# ---- Label encoder (fitted and pickled during training) ----
# Module-level cache; train_model() fits it, predict_sentiment() lazily
# reloads it from label_encoder.pkl after a process restart.
label_encoder = None
# ---- Tokenization ----
def preprocess_function(example):
    """Tokenize a single dataset example for PhoBERT.

    PhoBERT expects word-segmented Vietnamese input, so the raw comment is
    first passed through underthesea's word_tokenize before subword encoding.
    """
    segmented = word_tokenize(example["comment"], format="text")
    encoded = tokenizer(segmented, truncation=True)
    return encoded
# ---- Train function ----
def train_model(file):
    """Fine-tune PhoBERT on an uploaded CSV of labeled Vietnamese comments.

    The CSV must contain a "comment" column (raw text) and a "label" column
    (arbitrary class names).  Side effects: fits and pickles the global
    label_encoder to label_encoder.pkl, and saves the fine-tuned model and
    tokenizer to finetuned_phobert/.

    Args:
        file: Gradio file upload object; ``file.name`` is the temp-file path.

    Returns:
        A status string (success or error message) shown in the Gradio UI.
    """
    # Fail with a UI message instead of an unhandled exception on a bad upload.
    try:
        df = pd.read_csv(file.name)
    except Exception as e:
        return f"❌ Không đọc được file CSV: {e}"
    # Validate the expected schema up front so the user gets a clear message
    # rather than a raw KeyError from the training pipeline.
    missing = {"comment", "label"} - set(df.columns)
    if missing:
        return f"❌ File CSV thiếu cột: {', '.join(sorted(missing))}"
    global label_encoder
    label_encoder = LabelEncoder()
    df["label"] = label_encoder.fit_transform(df["label"])
    # Persist the encoder so predictions survive a process restart.
    with open("label_encoder.pkl", "wb") as f:
        pickle.dump(label_encoder, f)
    # Convert to a Hugging Face Dataset and tokenize (word-segmented Vietnamese).
    dataset = Dataset.from_pandas(df[["comment", "label"]])
    tokenized_dataset = dataset.map(preprocess_function)
    # 80/20 train/eval split.
    tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.2)
    # Fresh classification head sized to the number of distinct labels.
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(label_encoder.classes_))
    # NOTE(review): `evaluation_strategy` was renamed `eval_strategy` in newer
    # transformers releases — confirm against the pinned version.
    args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="epoch",
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        save_strategy="no",
        logging_steps=10
    )
    # Passing the tokenizer lets Trainer's default collator pad dynamically.
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
        tokenizer=tokenizer,
    )
    try:
        trainer.train()
    except IndexError as e:
        return f"❌ Lỗi: Có thể nhãn (label) vượt quá số lượng cho phép. Chi tiết: {str(e)}"
    # Save model + tokenizer for predict_sentiment() to reload.
    model.save_pretrained("finetuned_phobert")
    tokenizer.save_pretrained("finetuned_phobert")
    return "✅ Huấn luyện thành công!"
# ---- Dự đoán ----
def predict_sentiment(text):
    """Predict the sentiment label of a Vietnamese sentence.

    Loads the fine-tuned model/tokenizer saved by train_model() and the
    pickled LabelEncoder, then returns a formatted prediction string with
    the softmax probability of the predicted class.

    Args:
        text: Raw Vietnamese sentence from the UI textbox.

    Returns:
        A status/result string shown in the Gradio UI.
    """
    # Guard: nothing to classify.
    if not text or not text.strip():
        return "❌ Vui lòng nhập câu cần dự đoán."
    if not os.path.exists("finetuned_phobert"):
        return "❌ Chưa có mô hình được huấn luyện."
    # Lazily restore the label encoder after a process restart; bail out
    # cleanly if the pickle is missing instead of raising FileNotFoundError.
    global label_encoder
    if label_encoder is None:
        if not os.path.exists("label_encoder.pkl"):
            return "❌ Chưa có mô hình được huấn luyện."
        with open("label_encoder.pkl", "rb") as f:
            label_encoder = pickle.load(f)
    model = AutoModelForSequenceClassification.from_pretrained("finetuned_phobert")
    tokenizer = AutoTokenizer.from_pretrained("finetuned_phobert", use_fast=False)
    # Same word segmentation as training (PhoBERT expects segmented input).
    tokens = word_tokenize(text, format="text")
    inputs = tokenizer(tokens, return_tensors="pt", truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    pred = torch.argmax(probs, dim=1).item()
    return f"🔎 Dự đoán: {label_encoder.inverse_transform([pred])[0]} (xác suất: {probs[0][pred]:.2f})"
# ---- Giao diện Gradio ----
# Two-tab Gradio UI: tab 1 uploads a CSV and fine-tunes, tab 2 runs inference.
with gr.Blocks() as demo:
    gr.Markdown("# 🔥 Fine-tune cảm xúc tiếng Việt với PhoBERT")
    # Training tab: CSV upload -> train_model -> status textbox.
    with gr.Tab("1️⃣ Huấn luyện"):
        file_input = gr.File(label="Tải lên file CSV")
        train_button = gr.Button("Huấn luyện mô hình")
        train_output = gr.Textbox(label="Kết quả")
        train_button.click(fn=train_model, inputs=file_input, outputs=train_output)
    # Prediction tab: free-text input -> predict_sentiment -> result textbox.
    with gr.Tab("2️⃣ Dự đoán"):
        text_input = gr.Textbox(label="Nhập câu đánh giá")
        predict_button = gr.Button("Dự đoán")
        predict_output = gr.Textbox(label="Kết quả")
        predict_button.click(fn=predict_sentiment, inputs=text_input, outputs=predict_output)
demo.launch()