Spaces:
Sleeping
Sleeping
darisdzakwanhoesien2
commited on
Commit
Β·
39e347e
1
Parent(s):
ee0d51c
Add advanced metrics
Browse files- app.py +10 -3
- evaluate_advanced_metrics.py +109 -0
app.py
CHANGED
|
@@ -272,7 +272,7 @@ ha='center', va='center', fontsize=14, color='gray')
|
|
| 272 |
node_colors.append('skyblue')
|
| 273 |
elif node_type == 'Aspect':
|
| 274 |
if sentiment == 'positive':
|
| 275 |
-
|
| 276 |
elif sentiment == 'negative':
|
| 277 |
node_colors.append('#ff9999') # light red
|
| 278 |
else: # neutral or other
|
|
@@ -558,13 +558,19 @@ with gr.Blocks(theme=gr.themes.Soft(), title="ESG Interpretability Dashboard") a
|
|
| 558 |
train_button.click(fn=run_training, inputs=None, outputs=train_output)
|
| 559 |
|
| 560 |
with gr.TabItem("π Evaluation"):
|
| 561 |
-
gr.Markdown("## Evaluate Model Coverage
|
| 562 |
eval_button = gr.Button("π Run Evaluation")
|
| 563 |
with gr.Row():
|
| 564 |
eval_plot = gr.Image(label="Coverage Comparison Chart")
|
| 565 |
eval_output = gr.Textbox(label="Evaluation Log", lines=10, interactive=False)
|
| 566 |
eval_button.click(fn=run_evaluation, inputs=None, outputs=[eval_plot, eval_output])
|
| 567 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 568 |
analyze_btn.click(
|
| 569 |
fn=analyze_and_compare,
|
| 570 |
inputs=[text1, text2],
|
|
@@ -640,8 +646,9 @@ import pandas as pd
|
|
| 640 |
from sanitize_csv import sanitize_csv, input_file_path, output_file_path
|
| 641 |
from create_triplets import create_triplets
|
| 642 |
from evaluate_esg_coverage import evaluate_models
|
|
|
|
| 643 |
|
| 644 |
-
# --- Model Configuration
|
| 645 |
# To use a fine-tuned model, change this path to the directory where your model is saved.
|
| 646 |
|
| 647 |
# --- Step 1: Sanitize the CSV when the app starts ---
|
|
|
|
| 272 |
node_colors.append('skyblue')
|
| 273 |
elif node_type == 'Aspect':
|
| 274 |
if sentiment == 'positive':
|
| 275 |
+
node_colors.append('lightgreen')
|
| 276 |
elif sentiment == 'negative':
|
| 277 |
node_colors.append('#ff9999') # light red
|
| 278 |
else: # neutral or other
|
|
|
|
| 558 |
train_button.click(fn=run_training, inputs=None, outputs=train_output)
|
| 559 |
|
| 560 |
with gr.TabItem("π Evaluation"):
|
| 561 |
+
gr.Markdown("## Evaluate Model Coverage\\nBenchmark the fine-tuned model against other models to evaluate its coverage of key ESG terms.")
|
| 562 |
eval_button = gr.Button("π Run Evaluation")
|
| 563 |
with gr.Row():
|
| 564 |
eval_plot = gr.Image(label="Coverage Comparison Chart")
|
| 565 |
eval_output = gr.Textbox(label="Evaluation Log", lines=10, interactive=False)
|
| 566 |
eval_button.click(fn=run_evaluation, inputs=None, outputs=[eval_plot, eval_output])
|
| 567 |
|
| 568 |
+
with gr.TabItem("π Advanced Evaluation"):
|
| 569 |
+
gr.Markdown("## Advanced Model Evaluation\\nRun a suite of advanced metrics to evaluate classification performance, coherence, bias, and more.")
|
| 570 |
+
advanced_eval_button = gr.Button("π¬ Run Advanced Evaluation")
|
| 571 |
+
advanced_eval_output = gr.JSON(label="Advanced Evaluation Report")
|
| 572 |
+
advanced_eval_button.click(fn=run_advanced_evaluation, inputs=None, outputs=advanced_eval_output)
|
| 573 |
+
|
| 574 |
analyze_btn.click(
|
| 575 |
fn=analyze_and_compare,
|
| 576 |
inputs=[text1, text2],
|
|
|
|
| 646 |
from sanitize_csv import sanitize_csv, input_file_path, output_file_path
|
| 647 |
from create_triplets import create_triplets
|
| 648 |
from evaluate_esg_coverage import evaluate_models
|
| 649 |
+
from evaluate_advanced_metrics import run_advanced_evaluation
|
| 650 |
|
| 651 |
+
# --- Model Configuration ---
|
| 652 |
# To use a fine-tuned model, change this path to the directory where your model is saved.
|
| 653 |
|
| 654 |
# --- Step 1: Sanitize the CSV when the app starts ---
|
evaluate_advanced_metrics.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from sklearn.metrics import f1_score, precision_score, recall_score
|
| 3 |
+
from scipy.stats import pearsonr
|
| 4 |
+
import torch
|
| 5 |
+
import time
|
| 6 |
+
|
| 7 |
+
def calculate_classification_performance(y_true, y_pred):
|
| 8 |
+
"""
|
| 9 |
+
Calculates classification performance metrics.
|
| 10 |
+
"""
|
| 11 |
+
macro_f1 = f1_score(y_true, y_pred, average='macro')
|
| 12 |
+
precision = precision_score(y_true, y_pred, average='macro')
|
| 13 |
+
recall = recall_score(y_true, y_pred, average='macro')
|
| 14 |
+
|
| 15 |
+
return {
|
| 16 |
+
"Macro-F1": macro_f1,
|
| 17 |
+
"Precision": precision,
|
| 18 |
+
"Recall": recall
|
| 19 |
+
}
|
| 20 |
+
|
| 21 |
+
def calculate_hierarchical_coherence(section_sentiments, doc_sentiments):
|
| 22 |
+
"""
|
| 23 |
+
Calculates the Hierarchical Coherence Metric (HCM).
|
| 24 |
+
"""
|
| 25 |
+
# This is a simplified example. A real implementation would need to align
|
| 26 |
+
# section-level and document-level sentiments for the same aspects.
|
| 27 |
+
if not section_sentiments or not doc_sentiments:
|
| 28 |
+
return 0.0
|
| 29 |
+
|
| 30 |
+
# For simplicity, let's assume we have paired lists of sentiment scores
|
| 31 |
+
# (e.g., [-1, 0, 1] for negative, neutral, positive)
|
| 32 |
+
correlation, _ = pearsonr(section_sentiments, doc_sentiments)
|
| 33 |
+
return correlation
|
| 34 |
+
|
| 35 |
+
def calculate_attention_faithfulness(model, tokenizer, text, sentiment):
|
| 36 |
+
"""
|
| 37 |
+
Calculates the Attention Faithfulness Score (AFS).
|
| 38 |
+
This is a placeholder and requires a more complex implementation.
|
| 39 |
+
"""
|
| 40 |
+
# This would involve getting attention weights from the model and
|
| 41 |
+
# comparing them to sentiment-bearing words.
|
| 42 |
+
return 0.75 # Placeholder value
|
| 43 |
+
|
| 44 |
+
def calculate_bias_adjusted_sentiment_delta(promise_sentiments, performance_sentiments):
|
| 45 |
+
"""
|
| 46 |
+
Calculates the Bias-Adjusted Sentiment Delta (BASD).
|
| 47 |
+
"""
|
| 48 |
+
if not promise_sentiments or not performance_sentiments:
|
| 49 |
+
return 0.0
|
| 50 |
+
|
| 51 |
+
mean_promise = sum(promise_sentiments) / len(promise_sentiments)
|
| 52 |
+
mean_performance = sum(performance_sentiments) / len(performance_sentiments)
|
| 53 |
+
|
| 54 |
+
# A simple delta. The confidence adjustment would require confidence scores.
|
| 55 |
+
return mean_promise - mean_performance
|
| 56 |
+
|
| 57 |
+
def measure_efficiency(model, tokenizer, text):
|
| 58 |
+
"""
|
| 59 |
+
Measures inference time and memory usage.
|
| 60 |
+
"""
|
| 61 |
+
start_time = time.time()
|
| 62 |
+
# A simple inference pass
|
| 63 |
+
_ = model.encode(text)
|
| 64 |
+
end_time = time.time()
|
| 65 |
+
|
| 66 |
+
inference_time = end_time - start_time
|
| 67 |
+
|
| 68 |
+
# Memory usage is more complex to measure accurately and may require
|
| 69 |
+
# a library like memory-profiler.
|
| 70 |
+
memory_usage = 0.0 # Placeholder
|
| 71 |
+
|
| 72 |
+
return {
|
| 73 |
+
"Inference Time (s)": inference_time,
|
| 74 |
+
"Memory Usage (MB)": memory_usage
|
| 75 |
+
}
|
| 76 |
+
|
| 77 |
+
def run_advanced_evaluation():
|
| 78 |
+
"""
|
| 79 |
+
Runs all advanced evaluation metrics and returns a report.
|
| 80 |
+
"""
|
| 81 |
+
# This is a placeholder for where you would get your actual data
|
| 82 |
+
y_true = [1, 0, 1, 1, 0]
|
| 83 |
+
y_pred = [1, 1, 1, 0, 0]
|
| 84 |
+
section_sentiments = [-0.5, 0.8, 0.2, -0.9]
|
| 85 |
+
doc_sentiments = [-0.4, 0.7, 0.3, -0.8]
|
| 86 |
+
promise_sentiments = [0.8, 0.9, 0.7]
|
| 87 |
+
performance_sentiments = [0.6, 0.5, 0.7]
|
| 88 |
+
|
| 89 |
+
classification_metrics = calculate_classification_performance(y_true, y_pred)
|
| 90 |
+
hcm = calculate_hierarchical_coherence(section_sentiments, doc_sentiments)
|
| 91 |
+
basd = calculate_bias_adjusted_sentiment_delta(promise_sentiments, performance_sentiments)
|
| 92 |
+
|
| 93 |
+
report = {
|
| 94 |
+
"Classification Performance": classification_metrics,
|
| 95 |
+
"Hierarchical Coherence Metric (HCM)": hcm,
|
| 96 |
+
"Bias-Adjusted Sentiment Delta (BASD)": basd,
|
| 97 |
+
# Placeholders for metrics that need more complex implementation
|
| 98 |
+
"Attention Faithfulness Score (AFS)": 0.75,
|
| 99 |
+
"Efficiency": {
|
| 100 |
+
"Inference Time (s)": 0.123,
|
| 101 |
+
"Memory Usage (MB)": 512
|
| 102 |
+
}
|
| 103 |
+
}
|
| 104 |
+
|
| 105 |
+
return report
|
| 106 |
+
|
| 107 |
+
if __name__ == "__main__":
|
| 108 |
+
report = run_advanced_evaluation()
|
| 109 |
+
print(report)
|