import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime
import json
import torch
import os

# Redirect the HuggingFace/transformers cache into /tmp.
# NOTE(review): the original comment said "Disable caching", but these
# variables only relocate the cache directory — they do not disable caching.
os.environ['TRANSFORMERS_CACHE'] = '/tmp'
os.environ['HF_HOME'] = '/tmp'


# ====================== Utility Functions ======================

def simulate_topic_prediction(text):
    """Run the multi-label topic model on ``text``.

    Returns a dict mapping each topic name ('product', 'customer_service',
    'shipping') to its predicted probability as a float.

    Relies on the module-level ``pipeline`` module (imported further down,
    with error handling) exposing ``tokenizer`` and ``aspect_model``.
    """
    topics = ['product', 'customer_service', 'shipping']
    predictions = {}

    token_result = pipeline.tokenizer(text, return_tensors="pt")
    input_ids = token_result['input_ids']
    attention_mask = token_result['attention_mask']

    with torch.no_grad():
        outputs = pipeline.aspect_model(input_ids=input_ids,
                                        attention_mask=attention_mask)

    # The model output is assumed to already be sigmoid-activated, so the
    # values are used directly as per-topic probabilities — TODO confirm
    # against the model definition in pipeline.py.
    if hasattr(outputs, 'logits'):
        probs = outputs.logits.squeeze().numpy()
    else:
        probs = outputs.squeeze().numpy()

    # Map each probability to its topic. If the output squeezed down to a
    # 0-d scalar, fall back to that single value for every topic.
    for i, topic in enumerate(topics):
        predictions[topic] = float(probs[i]) if len(probs.shape) > 0 else float(probs)

    return predictions


def simulate_sentiment_prediction(text):
    """Run the single-label sentiment model on ``text``.

    Returns a dict with keys:
      - 'sentiment':  the argmax class ('positive' / 'neutral' / 'negative')
      - 'confidence': probability of the argmax class
      - 'all_probs':  dict of class name -> probability
    """
    sentiments = ['positive', 'neutral', 'negative']

    token_result = pipeline.tokenizer(text, return_tensors="pt")
    input_ids = token_result['input_ids']
    attention_mask = token_result['attention_mask']

    with torch.no_grad():
        outputs = pipeline.sentiment_model(input_ids=input_ids,
                                           attention_mask=attention_mask)

    # The model output is assumed to already be softmax-activated — TODO
    # confirm against the model definition in pipeline.py.
    if hasattr(outputs, 'logits'):
        probs = outputs.logits.squeeze().numpy()
    else:
        probs = outputs.squeeze().numpy()

    predicted_idx = np.argmax(probs)
    predicted_sentiment = sentiments[predicted_idx]
    confidence = float(probs[predicted_idx])

    return {
        'sentiment': predicted_sentiment,
        'confidence': confidence,
        'all_probs': {sentiments[i]: float(probs[i]) for i in range(len(sentiments))},
    }


def display_predictions(text, topic_predictions, sentiment_prediction):
    """Render the classification results for a single text.

    Shows the input, per-topic results above the 0.5 threshold, the
    sentiment verdict, and a chart for each model. Also appends the result
    to ``st.session_state.classification_history`` for the sidebar stats.
    """
    st.markdown("---")
    st.subheader("🎯 Classification Results")

    st.markdown("**Input Text:**")
    st.info(text)

    col_topic, col_sentiment = st.columns(2)

    with col_topic:
        st.markdown("**🏷️ Topic Classification (Multi-label):**")
        for topic, prob in topic_predictions.items():
            if prob >= 0.5:  # Fixed threshold
                confidence_class = "topic-positive" if prob > 0.7 else "topic-neutral"
                emoji = "✅" if prob > 0.7 else "⚠️"
                # NOTE(review): the original HTML markup inside this f-string
                # was stripped during extraction; reconstructed as a minimal
                # div using the computed CSS class.
                result_html = f"""
                <div class="{confidence_class}">
                    {emoji} <b>{topic.replace('_', ' ').title()}</b><br>
                    Confidence: {prob:.2%}
                </div>
                """
                st.markdown(result_html, unsafe_allow_html=True)

        fig_topic = create_topic_chart(topic_predictions)
        st.plotly_chart(fig_topic, use_container_width=True)

    with col_sentiment:
        st.markdown("**😊 Sentiment Analysis:**")
        sentiment = sentiment_prediction['sentiment']
        confidence = sentiment_prediction['confidence']
        sentiment_emoji = {"positive": "😊", "neutral": "😐", "negative": "😞"}
        sentiment_class = f"topic-{sentiment}"
        # NOTE(review): markup reconstructed — see note above.
        result_html = f"""
        <div class="{sentiment_class}">
            {sentiment_emoji[sentiment]} <b>{sentiment.title()}</b><br>
            Confidence: {confidence:.2%}
        </div>
        """
        st.markdown(result_html, unsafe_allow_html=True)

        fig_sentiment = create_sentiment_chart(sentiment_prediction)
        st.plotly_chart(fig_sentiment, use_container_width=True)

    # Store in session state for the sidebar statistics.
    if 'classification_history' not in st.session_state:
        st.session_state.classification_history = []

    st.session_state.classification_history.append({
        'text': text,
        'topics': topic_predictions,
        'sentiment': sentiment_prediction,
        'confidence': np.mean(list(topic_predictions.values())
                              + [sentiment_prediction['confidence']]),
        'timestamp': datetime.now(),
    })


def create_topic_chart(predictions):
    """Build a bar chart of per-topic probabilities with the 0.5 threshold line."""
    topics = list(predictions.keys())
    probabilities = list(predictions.values())

    fig = go.Figure(data=[
        go.Bar(
            x=[t.replace('_', ' ').title() for t in topics],
            y=probabilities,
            # Green for topics above the decision threshold, grey otherwise.
            marker_color=['#28a745' if p >= 0.5 else '#6c757d' for p in probabilities],
        )
    ])

    fig.update_layout(
        title="Topic Classification Probabilities",
        xaxis_title="Topics",
        yaxis_title="Probability",
        height=300,
        showlegend=False,
    )
    fig.add_hline(y=0.5, line_dash="dash", line_color="red",
                  annotation_text="Threshold (0.5)")

    return fig


def create_sentiment_chart(prediction):
    """Build a donut chart of sentiment class probabilities.

    Uses ``prediction['all_probs']`` when available; otherwise approximates
    the distribution from the single predicted class and its confidence
    (the remainder is split evenly between the other two classes).
    """
    sentiments = ['positive', 'neutral', 'negative']

    if 'all_probs' in prediction:
        probs = [prediction['all_probs'][s] for s in sentiments]
    else:
        # Fallback: reconstruct an approximate distribution.
        current_sentiment = prediction['sentiment']
        confidence = prediction['confidence']
        probs = [0.1, 0.1, 0.1]
        idx = sentiments.index(current_sentiment)
        probs[idx] = confidence
        remaining = (1.0 - confidence) / 2
        for i, _ in enumerate(probs):
            if i != idx:
                probs[i] = remaining

    colors = ['#28a745', '#ffc107', '#dc3545']

    fig = go.Figure(data=[go.Pie(
        labels=[s.title() for s in sentiments],
        values=probs,
        hole=0.3,  # Creates donut effect
        marker_colors=colors,
        textinfo='label+percent',
        textposition='auto',
    )])

    fig.update_layout(
        title="Sentiment Analysis Probabilities",
        height=300,
        showlegend=False,
        margin=dict(t=50, b=20, l=20, r=20),
    )

    return fig


def process_batch_classification(df, text_column, max_results=10):
    """Classify up to ``max_results`` rows of ``df[text_column]``.

    Shows a progress bar, a results table, aggregate charts, and a CSV
    download button. Rows whose value is not a non-blank string are skipped;
    per-row model errors are reported and processing continues.
    """
    st.subheader("🔄 Batch Processing Results")

    progress_bar = st.progress(0)
    results = []
    all_topic_predictions = []
    all_sentiment_predictions = []

    for i, text in enumerate(df[text_column].values[:max_results]):
        if isinstance(text, str) and text.strip():
            try:
                topic_pred = simulate_topic_prediction(text)
                sentiment_pred = simulate_sentiment_prediction(text)

                # Keep raw predictions for the aggregate visualizations.
                all_topic_predictions.append(topic_pred)
                all_sentiment_predictions.append(sentiment_pred)

                results.append({
                    'text': text[:100] + '...' if len(text) > 100 else text,
                    'topics': ', '.join([t for t, p in topic_pred.items() if p >= 0.5]),
                    'sentiment': sentiment_pred['sentiment'],
                    'sentiment_confidence': sentiment_pred['confidence'],
                })
            except Exception as e:
                st.error(f"Error processing text {i+1}: {str(e)}")
                continue

        progress_bar.progress((i + 1) / min(len(df), max_results))

    if results:
        results_df = pd.DataFrame(results)
        st.dataframe(results_df, use_container_width=True)

        st.markdown("---")
        st.subheader("📊 Batch Analysis Visualization")

        col_topic_viz, col_sentiment_viz = st.columns(2)

        with col_topic_viz:
            st.markdown("**Topic Distribution**")
            create_batch_topic_chart(all_topic_predictions)

        with col_sentiment_viz:
            st.markdown("**Sentiment Distribution**")
            create_batch_sentiment_chart(all_sentiment_predictions)

        csv = results_df.to_csv(index=False)
        st.download_button(
            label="📥 Download Results",
            data=csv,
            file_name=f"classification_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
            mime="text/csv",
        )


def create_batch_topic_chart(all_predictions):
    """Render a bar chart of the share of texts assigned to each topic.

    A text counts toward a topic when its probability is >= 0.5.
    """
    topics = ['product', 'customer_service', 'shipping']
    topic_counts = {topic: 0 for topic in topics}
    total_texts = len(all_predictions)

    # Fix: guard against an empty batch (every row skipped/failed), which
    # previously caused a ZeroDivisionError below.
    if total_texts == 0:
        st.info("No predictions available to visualize.")
        return

    # Count how many texts were classified for each topic (above threshold).
    for pred in all_predictions:
        for topic, prob in pred.items():
            if prob >= 0.5:
                topic_counts[topic] += 1

    # Convert counts to percentages of the processed texts.
    topic_percentages = {topic: (count / total_texts) * 100
                         for topic, count in topic_counts.items()}

    fig = go.Figure(data=[
        go.Bar(
            x=[t.replace('_', ' ').title() for t in topics],
            y=list(topic_percentages.values()),
            marker_color=['#28a745', '#17a2b8', '#ffc107'],
            text=[f'{v:.1f}%' for v in topic_percentages.values()],
            textposition='auto',
        )
    ])

    fig.update_layout(
        title=f"Topic Distribution Across {total_texts} Texts",
        xaxis_title="Topics",
        yaxis_title="Percentage of Texts (%)",
        height=400,
        showlegend=False,
    )

    st.plotly_chart(fig, use_container_width=True)


def create_batch_sentiment_chart(all_predictions):
    """Render a donut chart of the sentiment class distribution over a batch."""
    sentiments = ['positive', 'neutral', 'negative']
    sentiment_counts = {sentiment: 0 for sentiment in sentiments}
    total_texts = len(all_predictions)

    # Fix: guard against an empty batch (ZeroDivisionError otherwise).
    if total_texts == 0:
        st.info("No predictions available to visualize.")
        return

    for pred in all_predictions:
        sentiment = pred['sentiment']
        sentiment_counts[sentiment] += 1

    sentiment_percentages = [(count / total_texts) * 100
                             for count in sentiment_counts.values()]

    colors = ['#28a745', '#ffc107', '#dc3545']

    fig = go.Figure(data=[go.Pie(
        labels=[s.title() for s in sentiments],
        values=sentiment_percentages,
        hole=0.4,  # Creates donut effect
        marker_colors=colors,
        textinfo='label+percent',
        textposition='auto',
    )])

    fig.update_layout(
        title=f"Sentiment Distribution Across {total_texts} Texts",
        height=400,
        showlegend=True,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=-0.1,
            xanchor="center",
            x=0.5,
        ),
    )

    st.plotly_chart(fig, use_container_width=True)


# ====================== Main Application ======================

# Page configuration
st.set_page_config(
    page_title="Text Classification System",
    page_icon="🔍",
    layout="wide",
    initial_sidebar_state="expanded",
)

# Custom CSS for better styling.
# NOTE(review): the original stylesheet content appears to have been lost
# during extraction (the markdown body was empty); minimal rules for the
# classes referenced by display_predictions() are reconstructed here.
st.markdown("""
<style>
.topic-positive { background-color: #d4edda; padding: 0.5rem; border-radius: 0.5rem; margin-bottom: 0.5rem; }
.topic-neutral  { background-color: #fff3cd; padding: 0.5rem; border-radius: 0.5rem; margin-bottom: 0.5rem; }
.topic-negative { background-color: #f8d7da; padding: 0.5rem; border-radius: 0.5rem; margin-bottom: 0.5rem; }
</style>
""", unsafe_allow_html=True)

# Title
# NOTE(review): the original heading markup was stripped during extraction;
# reconstructed as a simple centered <h1>.
st.markdown(
    '<h1 style="text-align: center;">🔍 Ecommerce Product Review Analysis'
    ' - Indonesian Language</h1>',
    unsafe_allow_html=True,
)
st.markdown("---")

# Sidebar for configuration
with st.sidebar:
    st.header("📊 Model Information")

    # Topic Model Info
    with st.expander("🏷️ Topic Classification Model", expanded=True):
        st.markdown("""
**Model Type:** Multi-label Classification

**Categories:**
- 📦 Product
- 🎧 Customer Service
- 🚚 Shipping

**Note:** Text can belong to multiple categories
""")

    # Sentiment Model Info
    with st.expander("😊 Sentiment Analysis Model", expanded=True):
        st.markdown("""
**Model Type:** Single-label Classification

**Categories:**
- 😊 Positive
- 😐 Neutral
- 😞 Negative
""")

    # Statistics (if available)
    if 'classification_history' in st.session_state:
        st.header("📈 Session Statistics")
        history = st.session_state.classification_history

        col_stat1, col_stat2 = st.columns(2)
        with col_stat1:
            st.metric("Texts Classified", len(history))
        with col_stat2:
            avg_confidence = np.mean([h['confidence'] for h in history])
            st.metric("Avg Confidence", f"{avg_confidence:.2f}")

# Import pipeline module with error handling.
# This must succeed before any classification can run: the utility functions
# above call pipeline.tokenizer / aspect_model / sentiment_model.
try:
    import pipeline
    # st.success("✅ Models loaded successfully!")
except ImportError as e:
    st.error(f"❌ Error importing pipeline module: {str(e)}")
    st.info("Please make sure your pipeline.py file is in the same directory and contains the required models.")
    st.stop()
except Exception as e:
    st.error(f"❌ Error loading models: {str(e)}")
    st.stop()

# Main content
st.header("📝 Text Input")

# Input methods
input_method = st.radio("Choose input method:",
                        ["Single Text", "Batch Upload", "Example Texts"])

if input_method == "Single Text":
    user_text = st.text_area(
        "Enter text to classify:",
        placeholder="Type or paste your text here...",
        height=150,
    )

    if st.button("🚀 Classify Text", type="primary"):
        if user_text.strip():
            try:
                # Call the model prediction functions.
                topic_predictions = simulate_topic_prediction(user_text)
                sentiment_prediction = simulate_sentiment_prediction(user_text)
                display_predictions(user_text, topic_predictions, sentiment_prediction)
            except Exception as e:
                st.error(f"Error during classification: {str(e)}")
        else:
            st.warning("Please enter some text to classify!")

elif input_method == "Batch Upload":
    uploaded_file = st.file_uploader("Upload CSV file", type=['csv'])

    if uploaded_file is not None:
        # Delimiter options
        col_delim, col_encoding = st.columns(2)

        with col_delim:
            delimiter = st.selectbox(
                "Select delimiter:",
                options=[",", ";", "\t", "|", " "],
                format_func=lambda x: {",": "Comma (,)",
                                       ";": "Semicolon (;)",
                                       "\t": "Tab",
                                       "|": "Pipe (|)",
                                       " ": "Space"}[x],
                index=0,
            )

        with col_encoding:
            encoding = st.selectbox(
                "Select encoding:",
                options=["utf-8", "latin-1", "cp1252", "ascii"],
                index=0,
            )

        try:
            df = pd.read_csv(uploaded_file, delimiter=delimiter, encoding=encoding)
            st.write("Preview of uploaded data:")
            st.dataframe(df.head())

            text_column = st.selectbox("Select text column:", df.columns)

            # Maximum number of rows to process.
            max_rows = st.slider(
                "Maximum rows to process:",
                min_value=1,
                max_value=len(df),
                value=min(100, len(df)),
                step=1,
            )

            if st.button("🔄 Process Batch", type="primary"):
                process_batch_classification(df, text_column, max_results=max_rows)
        except Exception as e:
            st.error(f"Error reading CSV file: {str(e)}")
            st.info("Try different delimiter or encoding options if the file doesn't load correctly.")

else:  # Example Texts
    st.subheader("Try these example texts:")

    # Example type selection
    example_type = st.radio(
        "Choose example type:",
        ["Single Examples", "CSV Examples"],
        horizontal=True,
    )

    if example_type == "Single Examples":
        examples = [
            "Pengiriman terlambat 3 hari dan paketnya rusak.",
            "Pelayanan pelanggan sangat baik! Tim support sangat membantu dan responsif.",
            "Kualitas produknya sangat bagus, sesuai dengan yang saya harapkan.",
            "Saya kesulitan dengan proses pengembalian barang, sangat membingungkan.",
            "Pengiriman cepat dan barang sampai dalam kondisi sempurna!",
        ]

        # Track which example was classified so results survive the rerun.
        if 'selected_example' not in st.session_state:
            st.session_state.selected_example = None
            st.session_state.example_results = None

        for i, example in enumerate(examples):
            col_ex1, col_ex2 = st.columns([4, 1])
            with col_ex1:
                st.text(f"{i+1}. {example}")
            with col_ex2:
                if st.button(f"Classify", key=f"example_{i}"):
                    try:
                        topic_predictions = simulate_topic_prediction(example)
                        sentiment_prediction = simulate_sentiment_prediction(example)
                        # Store results in session state, then rerun so the
                        # results render below the full example list.
                        st.session_state.selected_example = i
                        st.session_state.example_results = {
                            'text': example,
                            'topic_predictions': topic_predictions,
                            'sentiment_prediction': sentiment_prediction,
                        }
                        st.rerun()
                    except Exception as e:
                        st.error(f"Error during classification: {str(e)}")

        # Display results below all examples if any example was classified.
        if st.session_state.selected_example is not None and st.session_state.example_results:
            results = st.session_state.example_results
            display_predictions(
                results['text'],
                results['topic_predictions'],
                results['sentiment_prediction'],
            )

    else:  # CSV Examples
        st.markdown("**Pre-prepared CSV datasets for testing:**")

        # Predefined CSV options
        csv_options = {
            "Sample E-commerce Reviews": {
                "data": {
                    "review_text": [
                        "Produk bagus tapi pengiriman lama",
                        "Customer service tidak responsif",
                        "Barang sesuai deskripsi, packing aman",
                        "Pengiriman cepat tapi produk cacat",
                        "Pelayanan memuaskan, akan order lagi",
                        "Kualitas produk mengecewakan",
                        "Pengiriman sangat cepat dan aman",
                        "Tim support sangat membantu menyelesaikan masalah",
                        "Produk original dan sesuai gambar",
                        "Proses refund sangat lambat dan rumit",
                    ],
                    "rating": [4, 2, 5, 3, 5, 1, 5, 5, 4, 2],
                    "category": ["Electronics", "Fashion", "Books", "Electronics",
                                 "Fashion", "Electronics", "Books", "Fashion",
                                 "Electronics", "Fashion"],
                },
                "description": "Indonesian e-commerce reviews with mixed sentiments and topics",
            },
            "Product Reviews Dataset": {
                "data": {
                    "review_text": [
                        "Laptop ini performanya sangat bagus untuk gaming",
                        "Baju ini bahannya halus dan nyaman dipakai",
                        "Buku ini sangat informatif dan mudah dipahami",
                        "Handphone rusak setelah 2 minggu pemakaian",
                        "Sepatu ini sangat nyaman untuk jogging",
                        "Kamera foto hasil jelek, tidak sesuai harga",
                        "Pelayanan toko online ini sangat memuaskan",
                        "Pengiriman terlambat tapi barang aman",
                        "Produk tidak sesuai dengan deskripsi",
                        "Kualitas packaging sangat baik dan rapi",
                    ],
                    "product_type": ["Laptop", "Clothing", "Book", "Phone", "Shoes",
                                     "Camera", "Service", "Shipping", "General",
                                     "Packaging"],
                    "sentiment_label": ["positive", "positive", "positive", "negative",
                                        "positive", "negative", "positive", "neutral",
                                        "negative", "positive"],
                },
                "description": "Product-focused reviews with pre-labeled sentiments",
            },
            "Customer Service Reviews": {
                "data": {
                    "review_text": [
                        "CS sangat ramah dan membantu menyelesaikan komplain",
                        "Susah menghubungi customer service via telepon",
                        "Live chat responsive tapi solusi kurang tepat",
                        "Tim support email sangat profesional",
                        "Customer service tidak memberikan solusi yang jelas",
                        "Pelayanan 24/7 sangat membantu customer",
                        "CS galak dan tidak sabar melayani customer",
                        "Support ticket dijawab dengan cepat dan tepat",
                    ],
                    "channel": ["Phone", "Phone", "Chat", "Email", "Phone",
                                "24/7", "Phone", "Ticket"],
                    "resolution": ["Resolved", "Unresolved", "Partial", "Resolved",
                                   "Unresolved", "Resolved", "Unresolved", "Resolved"],
                },
                "description": "Customer service specific reviews and interactions",
            },
        }

        # CSV selection
        selected_csv = st.selectbox(
            "Choose a pre-prepared dataset:",
            options=list(csv_options.keys()),
            help="Select from curated datasets for testing different scenarios",
        )

        if selected_csv:
            csv_info = csv_options[selected_csv]
            sample_df = pd.DataFrame(csv_info["data"])

            # Display info and preview
            st.info(f"📋 **{selected_csv}**: {csv_info['description']}")

            col_preview, col_actions = st.columns([3, 1])

            with col_preview:
                st.dataframe(sample_df, use_container_width=True)

            with col_actions:
                # Download button
                csv_data = sample_df.to_csv(index=False)
                st.download_button(
                    label="📥 Download CSV",
                    data=csv_data,
                    file_name=f"{selected_csv.lower().replace(' ', '_')}.csv",
                    mime="text/csv",
                    help="Download this dataset to test batch processing",
                )

                # Quick test button
                if st.button("🚀 Quick Test", help="Automatically process this dataset"):
                    st.session_state['quick_test_df'] = sample_df
                    st.session_state['quick_test_column'] = 'review_text'
                    st.rerun()

            # Handle quick test (set on the previous run, consumed here).
            if 'quick_test_df' in st.session_state:
                st.markdown("---")
                st.subheader("🔄 Quick Test Results")
                process_batch_classification(
                    st.session_state['quick_test_df'],
                    st.session_state['quick_test_column'],
                    len(st.session_state['quick_test_df']),
                )
                # Clear session state so the test doesn't rerun forever.
                del st.session_state['quick_test_df']
                del st.session_state['quick_test_column']

        st.info("💡 **Tip:** Download any dataset above and upload it in the 'Batch Upload' section, or use 'Quick Test' for immediate processing!")

# Footer
st.markdown("---")