# absa-indobert-web / model_utils.py
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import joblib
from transformers import AutoModel, AutoTokenizer
from huggingface_hub import hf_hub_download
from torch.utils.data import Dataset, DataLoader
from config import CONFIG
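# CONFIG (imported from config.py) is expected to provide at least the keys that
# this module reads: "hf_model_repo", "hf_model_subfolder", "hf_tokenizer_subfolder",
# "model_name", and "dropout_rate", plus the optional "batch_size" and "num_workers".
# A purely illustrative sketch with placeholder values (not this project's actual config):
#
#   CONFIG = {
#       "hf_model_repo": "username/absa-indobert",        # placeholder repo id
#       "hf_model_subfolder": "model",                     # placeholder subfolder
#       "hf_tokenizer_subfolder": "tokenizer",             # placeholder subfolder
#       "model_name": "indobenchmark/indobert-base-p1",    # assumed base checkpoint
#       "dropout_rate": 0.3,                               # placeholder value
#       "batch_size": 32,
#       "num_workers": 0,
#   }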
# Import scikit-learn for sentiment label encoding
try:
from sklearn.preprocessing import LabelEncoder
except ImportError:
    print("⚠️ scikit-learn is not installed, attempting to install it...")
    import subprocess
    import sys
    # Install into the current interpreter's environment
    subprocess.check_call([sys.executable, "-m", "pip", "install", "scikit-learn"])
from sklearn.preprocessing import LabelEncoder
class ABSADataset(Dataset):
"""
    Custom Dataset for ABSA batch processing.
    Pairs every sentence with every aspect for prediction.
"""
def __init__(self, sentences, aspects, tokenizer, max_len):
"""
Args:
            sentences (list): List of input sentences.
            aspects (list): List of aspects to predict.
            tokenizer (AutoTokenizer): IndoBERT tokenizer.
            max_len (int): Maximum token length.
"""
self.sentences = sentences
self.aspects = aspects
self.tokenizer = tokenizer
self.max_len = max_len
def __len__(self):
        # Total combinations = number of sentences × number of aspects
return len(self.sentences) * len(self.aspects)
def __getitem__(self, idx):
"""
        Returns the encoded input for a single sentence-aspect pair.
"""
        # Derive the sentence and aspect indices from the global index
sent_idx = idx // len(self.aspects)
aspect_idx = idx % len(self.aspects)
sentence = self.sentences[sent_idx]
aspect = self.aspects[aspect_idx]
        # Combine the aspect and the sentence using the special input format
combined = f"[ASPEK] {aspect} [TEXT] {sentence}"
        # Tokenize and encode the text
encoded = self.tokenizer.encode_plus(
combined,
            add_special_tokens=True,   # Add [CLS] and [SEP] tokens
            padding="max_length",      # Pad to max_length
            max_length=self.max_len,
            truncation=True,           # Truncate inputs longer than max_length
return_attention_mask=True,
return_tensors="pt",
)
return {
'input_ids': encoded['input_ids'].flatten(),
'attention_mask': encoded['attention_mask'].flatten(),
            'sent_idx': sent_idx,  # Keep indices to map results back
'aspect_idx': aspect_idx
}
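# A minimal usage sketch for ABSADataset (illustrative only: the checkpoint name
# and the toy sentences/aspects below are placeholders, not values from this repo):
#
#   tok = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1")
#   ds = ABSADataset(["makanannya enak"], ["makanan", "harga"], tok, max_len=64)
#   len(ds)                  # -> 2 (1 sentence × 2 aspects)
#   item = ds[1]             # global index 1 -> sentence 0 paired with aspect "harga"
#   item["input_ids"].shape  # -> torch.Size([64])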
def load_model_and_tokenizer():
"""
    Loads the IndoBERT ABSA model, tokenizer, and label encoder.
    If the files are not present locally, they are downloaded from the HuggingFace Hub.
    Returns:
        model (nn.Module): The loaded ABSA model.
        tokenizer (AutoTokenizer): Tokenizer for IndoBERT.
        label_encoder (LabelEncoder): Encoder for the sentiment labels.
        device (torch.device): The device (cuda/cpu) in use.
"""
    # Set up the model and tokenizer directory paths
base_path = os.path.abspath(os.path.dirname(__file__))
model_dir = os.path.join(base_path, "assets", "model")
tokenizer_dir = os.path.join(base_path, "assets", "tokenizer")
    # Create the directories if they do not exist yet
os.makedirs(model_dir, exist_ok=True)
os.makedirs(tokenizer_dir, exist_ok=True)
model_path = os.path.join(model_dir, "indobert_absa_model.pth")
label_path = os.path.join(model_dir, "label_encoder.joblib")
    # === DOWNLOAD MODEL IF NOT PRESENT ===
if not os.path.exists(model_path):
        print("📥 Downloading model from the HuggingFace Hub...")
try:
            # Download from the HuggingFace Hub
downloaded_model = hf_hub_download(
repo_id=CONFIG["hf_model_repo"],
filename="indobert_absa_model.pth",
subfolder=CONFIG["hf_model_subfolder"],
cache_dir=None
)
            # Copy to the local directory
import shutil
shutil.copy(downloaded_model, model_path)
            print(f"✅ Model downloaded to {model_path}")
except Exception as e:
print(f"❌ Error downloading model: {e}")
raise
    # === DOWNLOAD LABEL ENCODER IF NOT PRESENT ===
if not os.path.exists(label_path):
        print("📥 Downloading label encoder from the HuggingFace Hub...")
try:
downloaded_label = hf_hub_download(
repo_id=CONFIG["hf_model_repo"],
filename="label_encoder.joblib",
subfolder=CONFIG["hf_model_subfolder"],
cache_dir=None
)
import shutil
shutil.copy(downloaded_label, label_path)
            print(f"✅ Label encoder downloaded to {label_path}")
except Exception as e:
print(f"❌ Error downloading label encoder: {e}")
raise
    # === DOWNLOAD TOKENIZER IF NOT PRESENT ===
tokenizer_files = ["special_tokens_map.json", "tokenizer.json",
"tokenizer_config.json", "vocab.txt"]
    # Check whether all tokenizer files are already present
all_tokenizer_exists = all(
os.path.exists(os.path.join(tokenizer_dir, f)) for f in tokenizer_files
)
if not all_tokenizer_exists:
        print("📥 Downloading tokenizer from the HuggingFace Hub...")
try:
for file in tokenizer_files:
if not os.path.exists(os.path.join(tokenizer_dir, file)):
                    # Download each tokenizer file
downloaded_file = hf_hub_download(
repo_id=CONFIG["hf_model_repo"],
filename=file,
subfolder=CONFIG["hf_tokenizer_subfolder"],
cache_dir=None
)
import shutil
shutil.copy(downloaded_file, os.path.join(
tokenizer_dir, file))
            print(f"✅ Tokenizer downloaded to {tokenizer_dir}")
except Exception as e:
print(f"❌ Error downloading tokenizer: {e}")
            # Fall back to the pretrained tokenizer below if the download fails
            pass
# === LOAD TOKENIZER ===
try:
        # Try loading from the local directory first
tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
        print("✅ Tokenizer loaded from local files")
except Exception as e:
        # Fallback: load the pretrained tokenizer instead
        print(
            f"⚠️ Failed to load the local tokenizer ({e}), using the pretrained one...")
tokenizer = AutoTokenizer.from_pretrained(CONFIG["model_name"])
# === LOAD LABEL ENCODER ===
try:
label_encoder = joblib.load(label_path)
print("✅ Label encoder loaded successfully")
except Exception as e:
print(f"❌ Error loading label encoder: {e}")
        raise RuntimeError(
            f"Failed to load label_encoder.joblib. "
            f"Make sure the file is valid and scikit-learn is installed. Error: {e}"
)
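    # Illustrative note (the actual class names depend on the shipped encoder):
    # if label_encoder.classes_ were e.g. ["negatif", "netral", "positif"], then
    # label_encoder.inverse_transform([2]) would return array(["positif"]).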
    # === MODEL ARCHITECTURE DEFINITION ===
class IndoBertForABSA(nn.Module):
"""
        Aspect-based sentiment classification model built on IndoBERT.
        Architecture: IndoBERT -> LayerNorm -> Dropout -> Linear classifier
"""
def __init__(self, num_labels):
super().__init__()
# Load pretrained IndoBERT
self.bert = AutoModel.from_pretrained(
CONFIG["model_name"], trust_remote_code=True, use_safetensors=True
)
            # Layer normalization for training stability
self.norm = nn.LayerNorm(self.bert.config.hidden_size)
            # Dropout to reduce overfitting
self.dropout = nn.Dropout(CONFIG["dropout_rate"])
            # Linear layer for sentiment classification
self.classifier = nn.Linear(
self.bert.config.hidden_size, num_labels)
def forward(self, input_ids, attention_mask):
"""
            Forward pass of the ABSA model.
            Args:
                input_ids (torch.Tensor): Input token ID tensor.
                attention_mask (torch.Tensor): Attention mask tensor.
            Returns:
                torch.Tensor: Prediction logits.
"""
            # Get the BERT outputs
output = self.bert(input_ids=input_ids,
attention_mask=attention_mask)
            # Use the pooler output (the [CLS] token representation)
pooled = output.pooler_output
            # Normalize
normed = self.norm(pooled)
# Dropout
dropped = self.dropout(normed)
            # Classify
return self.classifier(dropped)
    # === SET UP DEVICE AND LOAD MODEL ===
    # Use the GPU if available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Initialize the model with the number of labels from the label encoder
model = IndoBertForABSA(num_labels=len(label_encoder.classes_))
try:
        # Load the trained model weights
model.load_state_dict(torch.load(model_path, map_location=device))
print("✅ Model state dict loaded successfully")
except Exception as e:
print(f"❌ Error loading model state dict: {e}")
raise
    # Move the model to the device (GPU/CPU)
model.to(device)
    # Put the model in evaluation mode (disables dropout, etc.)
model.eval()
return model, tokenizer, label_encoder, device
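# Usage sketch for load_model_and_tokenizer (downloads only happen on the first
# call, when the files are not yet cached under assets/):
#
#   model, tokenizer, label_encoder, device = load_model_and_tokenizer()
#   # `model` is already on `device` and in eval mode; pass all four return
#   # values to the prediction helpers defined below.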
def predict_multi_aspect(model, tokenizer, sentence, aspek_list, label_encoder, device, max_len):
"""
    Predicts the sentiment of each aspect for a single sentence.
    Predictions are made one at a time (non-batched).
    Args:
        model (nn.Module): The loaded ABSA model.
        tokenizer (AutoTokenizer): IndoBERT tokenizer.
        sentence (str): Input sentence.
        aspek_list (list): List of aspects to predict.
        label_encoder (LabelEncoder): Label encoder.
        device (torch.device): Device (cuda/cpu).
        max_len (int): Maximum token length.
    Returns:
        dict: Prediction results as {aspect: sentiment_label}.
"""
results = {}
    # Loop over each aspect
for aspek in aspek_list:
        # Combine the aspect and the sentence
combined = f"[ASPEK] {aspek} [TEXT] {sentence}"
        # Tokenize the input
encoded = tokenizer.encode_plus(
combined,
add_special_tokens=True,
padding="max_length",
max_length=max_len,
truncation=True,
return_attention_mask=True,
return_tensors="pt",
)
        # Move the tensors to the device
input_ids = encoded["input_ids"].to(device)
attention_mask = encoded["attention_mask"].to(device)
        # Run inference without computing gradients
with torch.no_grad():
# Forward pass
outputs = model(input_ids, attention_mask)
            # Convert logits to probabilities with softmax
probs = F.softmax(outputs, dim=1).squeeze()
            # Take the index with the highest probability
idx = torch.argmax(probs).item()
            # Convert the index to a sentiment label
label = label_encoder.inverse_transform([idx])[0]
            # Store the result
results[aspek] = label
return results
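# Usage sketch for predict_multi_aspect (the sentence, aspect names, max_len, and
# the returned labels shown here are illustrative placeholders only):
#
#   results = predict_multi_aspect(
#       model, tokenizer, "Makanannya enak tapi harganya mahal.",
#       ["makanan", "harga"], label_encoder, device, max_len=128,
#   )
#   # -> e.g. {"makanan": "positif", "harga": "negatif"}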
def predict_multi_aspect_batch(model, tokenizer, sentences, aspek_list, label_encoder, device, max_len, batch_size=None):
"""
    Predicts the sentiment of each aspect for multiple sentences using batch processing.
    More efficient when processing many sentences at once.
    Args:
        model (nn.Module): The loaded ABSA model.
        tokenizer (AutoTokenizer): IndoBERT tokenizer.
        sentences (list): List of input sentences.
        aspek_list (list): List of aspects to predict.
        label_encoder (LabelEncoder): Label encoder.
        device (torch.device): Device (cuda/cpu).
        max_len (int): Maximum token length.
        batch_size (int, optional): Batch size. If None, the value from CONFIG is used.
    Returns:
        list: List of prediction dicts [{aspect: sentiment_label}, ...].
"""
    # Take the batch size from CONFIG when not provided
if batch_size is None:
batch_size = CONFIG.get("batch_size", 32)
    # === BUILD DATASET AND DATALOADER ===
    # The dataset enumerates every sentence × aspect combination
dataset = ABSADataset(sentences, aspek_list, tokenizer, max_len)
dataloader = DataLoader(
dataset,
        batch_size=batch_size,  # Process in batches for efficiency
        shuffle=False,          # Do not shuffle, so ordering is preserved
num_workers=CONFIG.get("num_workers", 0)
)
    # === INITIALIZE RESULT CONTAINERS ===
num_sentences = len(sentences)
num_aspects = len(aspek_list)
    # Matrix for storing predictions, shaped [num_sentences x num_aspects]
all_predictions = [[None] * num_aspects for _ in range(num_sentences)]
# === BATCH PREDICTION ===
    model.eval()  # Put the model in evaluation mode
    with torch.no_grad():  # Disable gradient calculation
for batch in dataloader:
            # Move the batch to the device
input_ids = batch['input_ids'].to(device)
attention_mask = batch['attention_mask'].to(device)
sent_indices = batch['sent_idx'].numpy()
aspect_indices = batch['aspect_idx'].numpy()
            # Forward pass over the whole batch
outputs = model(input_ids, attention_mask)
            # Convert logits to probabilities
probs = F.softmax(outputs, dim=1)
            # Take the highest-probability prediction index
pred_indices = torch.argmax(probs, dim=1).cpu().numpy()
            # Convert the indices to sentiment labels
labels = label_encoder.inverse_transform(pred_indices)
            # Store each result in the matrix at its original indices
            for sent_idx, aspect_idx, label in zip(sent_indices, aspect_indices, labels):
                all_predictions[sent_idx][aspect_idx] = label
    # === CONVERT TO DICTIONARY FORMAT ===
results = []
for predictions in all_predictions:
        # Build an {aspect: label} dict for each sentence
        result_dict = {aspek: label for aspek, label in zip(aspek_list, predictions)}
results.append(result_dict)
return results
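# A minimal manual smoke test, guarded so it never runs when this module is
# imported by the app. The sample sentence, aspect names, and max_len value are
# illustrative placeholders, not values taken from this project's config.
if __name__ == "__main__":
    model, tokenizer, label_encoder, device = load_model_and_tokenizer()
    demo_sentences = ["Makanannya enak tapi pelayanannya lambat."]
    demo_aspects = ["makanan", "pelayanan"]  # placeholder aspect names
    predictions = predict_multi_aspect_batch(
        model, tokenizer, demo_sentences, demo_aspects,
        label_encoder, device, max_len=128,
    )
    print(predictions)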