Spaces:

darisdzakwanhoesien
/

esgdata

Sleeping

App Files Files Community

darisdzakwanhoesien2 commited on Nov 2

Commit

39e347e

1 Parent(s): ee0d51c

Add advanced metrics

Browse files

Files changed (2) hide show

app.py +10 -3
evaluate_advanced_metrics.py +109 -0

app.py CHANGED Viewed

@@ -272,7 +272,7 @@ ha='center', va='center', fontsize=14, color='gray')
             node_colors.append('skyblue')
         elif node_type == 'Aspect':
             if sentiment == 'positive':
-                node_colors.append('lightgreen')
             elif sentiment == 'negative':
                 node_colors.append('#ff9999')  # light red
             else:  # neutral or other
@@ -558,13 +558,19 @@ with gr.Blocks(theme=gr.themes.Soft(), title="ESG Interpretability Dashboard") a
             train_button.click(fn=run_training, inputs=None, outputs=train_output)
         with gr.TabItem("📈 Evaluation"):
-            gr.Markdown("## Evaluate Model Coverage\nBenchmark the fine-tuned model against other models to evaluate its coverage of key ESG terms.")
             eval_button = gr.Button("📊 Run Evaluation")
             with gr.Row():
                 eval_plot = gr.Image(label="Coverage Comparison Chart")
                 eval_output = gr.Textbox(label="Evaluation Log", lines=10, interactive=False)
             eval_button.click(fn=run_evaluation, inputs=None, outputs=[eval_plot, eval_output])
     analyze_btn.click(
         fn=analyze_and_compare,
         inputs=[text1, text2],
@@ -640,8 +646,9 @@ import pandas as pd
 from sanitize_csv import sanitize_csv, input_file_path, output_file_path
 from create_triplets import create_triplets
 from evaluate_esg_coverage import evaluate_models
-# --- Model Configuration ---\
 # To use a fine-tuned model, change this path to the directory where your model is saved.
 # --- Step 1: Sanitize the CSV when the app starts ---

             node_colors.append('skyblue')
         elif node_type == 'Aspect':
             if sentiment == 'positive':
+node_colors.append('lightgreen')
             elif sentiment == 'negative':
                 node_colors.append('#ff9999')  # light red
             else:  # neutral or other
             train_button.click(fn=run_training, inputs=None, outputs=train_output)
         with gr.TabItem("📈 Evaluation"):
+            gr.Markdown("## Evaluate Model Coverage\\nBenchmark the fine-tuned model against other models to evaluate its coverage of key ESG terms.")
             eval_button = gr.Button("📊 Run Evaluation")
             with gr.Row():
                 eval_plot = gr.Image(label="Coverage Comparison Chart")
                 eval_output = gr.Textbox(label="Evaluation Log", lines=10, interactive=False)
             eval_button.click(fn=run_evaluation, inputs=None, outputs=[eval_plot, eval_output])
+        with gr.TabItem("🚀 Advanced Evaluation"):
+            gr.Markdown("## Advanced Model Evaluation\\nRun a suite of advanced metrics to evaluate classification performance, coherence, bias, and more.")
+            advanced_eval_button = gr.Button("🔬 Run Advanced Evaluation")
+            advanced_eval_output = gr.JSON(label="Advanced Evaluation Report")
+            advanced_eval_button.click(fn=run_advanced_evaluation, inputs=None, outputs=advanced_eval_output)
     analyze_btn.click(
         fn=analyze_and_compare,
         inputs=[text1, text2],
 from sanitize_csv import sanitize_csv, input_file_path, output_file_path
 from create_triplets import create_triplets
 from evaluate_esg_coverage import evaluate_models
+from evaluate_advanced_metrics import run_advanced_evaluation
+# --- Model Configuration ---
 # To use a fine-tuned model, change this path to the directory where your model is saved.
 # --- Step 1: Sanitize the CSV when the app starts ---

evaluate_advanced_metrics.py ADDED Viewed

	@@ -0,0 +1,109 @@

+import pandas as pd
+from sklearn.metrics import f1_score, precision_score, recall_score
+from scipy.stats import pearsonr
+import torch
+import time
+def calculate_classification_performance(y_true, y_pred):
+    """
+    Calculates classification performance metrics.
+    """
+    macro_f1 = f1_score(y_true, y_pred, average='macro')
+    precision = precision_score(y_true, y_pred, average='macro')
+    recall = recall_score(y_true, y_pred, average='macro')
+    return {
+        "Macro-F1": macro_f1,
+        "Precision": precision,
+        "Recall": recall
+    }
+def calculate_hierarchical_coherence(section_sentiments, doc_sentiments):
+    """
+    Calculates the Hierarchical Coherence Metric (HCM).
+    """
+    # This is a simplified example. A real implementation would need to align
+    # section-level and document-level sentiments for the same aspects.
+    if not section_sentiments or not doc_sentiments:
+        return 0.0
+    # For simplicity, let's assume we have paired lists of sentiment scores
+    # (e.g., [-1, 0, 1] for negative, neutral, positive)
+    correlation, _ = pearsonr(section_sentiments, doc_sentiments)
+    return correlation
+def calculate_attention_faithfulness(model, tokenizer, text, sentiment):
+    """
+    Calculates the Attention Faithfulness Score (AFS).
+    This is a placeholder and requires a more complex implementation.
+    """
+    # This would involve getting attention weights from the model and
+    # comparing them to sentiment-bearing words.
+    return 0.75  # Placeholder value
+def calculate_bias_adjusted_sentiment_delta(promise_sentiments, performance_sentiments):
+    """
+    Calculates the Bias-Adjusted Sentiment Delta (BASD).
+    """
+    if not promise_sentiments or not performance_sentiments:
+        return 0.0
+    mean_promise = sum(promise_sentiments) / len(promise_sentiments)
+    mean_performance = sum(performance_sentiments) / len(performance_sentiments)
+    # A simple delta. The confidence adjustment would require confidence scores.
+    return mean_promise - mean_performance
+def measure_efficiency(model, tokenizer, text):
+    """
+    Measures inference time and memory usage.
+    """
+    start_time = time.time()
+    # A simple inference pass
+    _ = model.encode(text)
+    end_time = time.time()
+    inference_time = end_time - start_time
+    # Memory usage is more complex to measure accurately and may require
+    # a library like memory-profiler.
+    memory_usage = 0.0  # Placeholder
+    return {
+        "Inference Time (s)": inference_time,
+        "Memory Usage (MB)": memory_usage
+    }
+def run_advanced_evaluation():
+    """
+    Runs all advanced evaluation metrics and returns a report.
+    """
+    # This is a placeholder for where you would get your actual data
+    y_true = [1, 0, 1, 1, 0]
+    y_pred = [1, 1, 1, 0, 0]
+    section_sentiments = [-0.5, 0.8, 0.2, -0.9]
+    doc_sentiments = [-0.4, 0.7, 0.3, -0.8]
+    promise_sentiments = [0.8, 0.9, 0.7]
+    performance_sentiments = [0.6, 0.5, 0.7]
+    classification_metrics = calculate_classification_performance(y_true, y_pred)
+    hcm = calculate_hierarchical_coherence(section_sentiments, doc_sentiments)
+    basd = calculate_bias_adjusted_sentiment_delta(promise_sentiments, performance_sentiments)
+    report = {
+        "Classification Performance": classification_metrics,
+        "Hierarchical Coherence Metric (HCM)": hcm,
+        "Bias-Adjusted Sentiment Delta (BASD)": basd,
+        # Placeholders for metrics that need more complex implementation
+        "Attention Faithfulness Score (AFS)": 0.75,
+        "Efficiency": {
+            "Inference Time (s)": 0.123,
+            "Memory Usage (MB)": 512
+        }
+    }
+    return report
+if __name__ == "__main__":
+    report = run_advanced_evaluation()
+    print(report)