import gradio as gr
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from datasets import Dataset
from underthesea import word_tokenize
import os
import pickle

# ---- Load PhoBERT ----
MODEL_NAME = "vinai/phobert-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)

# ---- Label encoder (saved to disk after training) ----
label_encoder = None

# ---- Tokenization ----
def preprocess_function(example):
    # Word-segment the Vietnamese text, since PhoBERT expects word-segmented input
    tokens = word_tokenize(example["comment"], format="text")
    return tokenizer(tokens, truncation=True)

# ---- Train function ----
def train_model(file):
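    # Expected input: a CSV with a "comment" column (Vietnamese review text)
    # and a "label" column (sentiment class name, e.g. positive/negative;
    # these class names are illustrative, LabelEncoder accepts any values).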
    df = pd.read_csv(file.name)

    global label_encoder
    label_encoder = LabelEncoder()
    df["label"] = label_encoder.fit_transform(df["label"])

    # Save encoder
    with open("label_encoder.pkl", "wb") as f:
        pickle.dump(label_encoder, f)

    # Convert to a Hugging Face Dataset
    dataset = Dataset.from_pandas(df[["comment", "label"]])
    tokenized_dataset = dataset.map(preprocess_function)

    # 80/20 train/test split
    tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.2)

    # Load model
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(label_encoder.classes_))

    # Training
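    # Note: recent transformers releases rename "evaluation_strategy" to
    # "eval_strategy"; adjust if your installed version rejects the old name.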
    args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="epoch",
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        save_strategy="no",
        logging_steps=10
    )

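    # Passing the tokenizer lets Trainer default to DataCollatorWithPadding,
    # so the variable-length tokenized inputs are padded per batch.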
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
        tokenizer=tokenizer,
    )

    try:
        trainer.train()
    except IndexError as e:
        return f"❌ Error: a label value may exceed the allowed number of classes. Details: {str(e)}"

    # Save the fine-tuned model and tokenizer
    model.save_pretrained("finetuned_phobert")
    tokenizer.save_pretrained("finetuned_phobert")

    return "✅ Training completed successfully!"

# ---- Prediction ----
def predict_sentiment(text):
    if not os.path.exists("finetuned_phobert"):
        return "❌ No model has been trained yet."

    model = AutoModelForSequenceClassification.from_pretrained("finetuned_phobert")
    tokenizer = AutoTokenizer.from_pretrained("finetuned_phobert", use_fast=False)

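    # Restore the label encoder saved during training if it is not already in
    # memory (e.g. after the process restarts between training and prediction).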
    global label_encoder
    if label_encoder is None:
        with open("label_encoder.pkl", "rb") as f:
            label_encoder = pickle.load(f)

    tokens = word_tokenize(text, format="text")
    inputs = tokenizer(tokens, return_tensors="pt", truncation=True)

    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
        pred = torch.argmax(probs, dim=1).item()

    return f"🔎 Prediction: {label_encoder.inverse_transform([pred])[0]} (probability: {probs[0][pred]:.2f})"

# ---- Gradio interface ----
with gr.Blocks() as demo:
    gr.Markdown("# 🔥 Fine-tune Vietnamese sentiment analysis with PhoBERT")

    with gr.Tab("1️⃣ Train"):
        file_input = gr.File(label="Upload a CSV file")
        train_button = gr.Button("Train model")
        train_output = gr.Textbox(label="Result")

        train_button.click(fn=train_model, inputs=file_input, outputs=train_output)

    with gr.Tab("2️⃣ Predict"):
        text_input = gr.Textbox(label="Enter a review sentence")
        predict_button = gr.Button("Predict")
        predict_output = gr.Textbox(label="Result")

        predict_button.click(fn=predict_sentiment, inputs=text_input, outputs=predict_output)

demo.launch()