# esgdata/evaluate_advanced_metrics.py
"""Advanced evaluation metrics for ESG sentiment analysis."""
import time

import pandas as pd
import torch
from scipy.stats import pearsonr
from sklearn.metrics import f1_score, precision_score, recall_score
def calculate_classification_performance(y_true, y_pred):
    """
    Calculates macro-averaged classification performance metrics.
    """
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    # zero_division=0 avoids undefined-metric warnings when a class is never predicted.
    precision = precision_score(y_true, y_pred, average='macro', zero_division=0)
    recall = recall_score(y_true, y_pred, average='macro', zero_division=0)
    return {
        "Macro-F1": macro_f1,
        "Precision": precision,
        "Recall": recall
    }
def calculate_hierarchical_coherence(section_sentiments, doc_sentiments):
    """
    Calculates the Hierarchical Coherence Metric (HCM) as the Pearson
    correlation between paired section-level and document-level sentiment scores.
    """
    # Simplified version: it assumes the two lists are already aligned, i.e. the
    # i-th section score and the i-th document score refer to the same aspect.
    # A real implementation would align the aspects first (see the sketch below).
    if not section_sentiments or not doc_sentiments:
        return 0.0
    if len(section_sentiments) != len(doc_sentiments) or len(section_sentiments) < 2:
        # Pearson correlation is undefined for unequal-length or single-element inputs.
        return 0.0
    # Scores can be discrete (e.g. -1, 0, 1 for negative, neutral, positive)
    # or continuous values in [-1, 1].
    correlation, _ = pearsonr(section_sentiments, doc_sentiments)
    return correlation
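

# A possible aspect-aligned variant of HCM (a sketch, not the module's current
# behaviour). It assumes section- and document-level sentiments arrive as dicts
# keyed by aspect name, e.g. {"emissions": -0.4, "governance": 0.7}; the
# function name and the dict-based interface are illustrative assumptions.
def sketch_hierarchical_coherence_by_aspect(section_by_aspect, doc_by_aspect):
    # Only aspects scored at both levels can be compared.
    shared_aspects = sorted(set(section_by_aspect) & set(doc_by_aspect))
    if len(shared_aspects) < 2:
        # Pearson correlation needs at least two paired observations.
        return 0.0
    section_scores = [section_by_aspect[a] for a in shared_aspects]
    doc_scores = [doc_by_aspect[a] for a in shared_aspects]
    correlation, _ = pearsonr(section_scores, doc_scores)
    return correlation
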
def calculate_attention_faithfulness(model, tokenizer, text, sentiment):
    """
    Calculates the Attention Faithfulness Score (AFS).
    This is a placeholder and requires a more complex implementation.
    """
    # A real implementation would extract attention weights from the model and
    # compare them against sentiment-bearing words (see the sketch below).
    return 0.75  # Placeholder value
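

# A possible AFS sketch (an illustrative assumption, not the intended
# implementation). It presumes `model` is a Hugging Face encoder that accepts
# output_attentions=True, and it scores faithfulness as the share of last-layer
# attention mass received by tokens from a small sentiment lexicon. The lexicon
# and the scoring rule are placeholder choices.
SENTIMENT_LEXICON = {"improve", "decline", "risk", "strong", "weak", "positive", "negative"}

def sketch_attention_faithfulness(model, tokenizer, text, lexicon=SENTIMENT_LEXICON):
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        outputs = model(**inputs, output_attentions=True)
    # Last-layer attention: (batch, heads, seq, seq) -> mean attention received
    # by each token, averaged over heads and query positions.
    last_layer = outputs.attentions[-1][0]           # (heads, seq, seq)
    received = last_layer.mean(dim=0).mean(dim=0)    # (seq,)
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    lexicon_mass = sum(
        received[i].item()
        for i, tok in enumerate(tokens)
        if tok.lstrip("#").lower() in lexicon
    )
    total_mass = received.sum().item()
    return lexicon_mass / total_mass if total_mass > 0 else 0.0
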
def calculate_bias_adjusted_sentiment_delta(promise_sentiments, performance_sentiments):
    """
    Calculates the Bias-Adjusted Sentiment Delta (BASD).
    """
    if not promise_sentiments or not performance_sentiments:
        return 0.0
    mean_promise = sum(promise_sentiments) / len(promise_sentiments)
    mean_performance = sum(performance_sentiments) / len(performance_sentiments)
    # A simple unadjusted delta. The confidence adjustment would require
    # per-score confidence values (see the sketch below).
    return mean_promise - mean_performance
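

# A possible confidence-adjusted BASD sketch (an assumption about what the
# adjustment might look like, not the original design). Each sentiment score is
# paired with a confidence in [0, 1], and the delta is taken between
# confidence-weighted means of promise vs. performance sentiments.
def sketch_confidence_weighted_basd(promise_scores, promise_conf,
                                    performance_scores, performance_conf):
    def weighted_mean(scores, weights):
        total_weight = sum(weights)
        if total_weight == 0:
            return 0.0
        return sum(s * w for s, w in zip(scores, weights)) / total_weight

    if not promise_scores or not performance_scores:
        return 0.0
    return (weighted_mean(promise_scores, promise_conf)
            - weighted_mean(performance_scores, performance_conf))
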
def measure_efficiency(model, tokenizer, text):
    """
    Measures inference time and memory usage for a single inference pass.
    """
    # Assumes the model exposes an encode() method (e.g. a SentenceTransformer).
    start_time = time.time()
    _ = model.encode(text)
    end_time = time.time()
    inference_time = end_time - start_time
    # Memory usage is harder to measure accurately and may require a library
    # such as memory-profiler or the standard-library tracemalloc module
    # (see the sketch below).
    memory_usage = 0.0  # Placeholder
    return {
        "Inference Time (s)": inference_time,
        "Memory Usage (MB)": memory_usage
    }
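

# A possible memory measurement sketch using the standard-library tracemalloc
# module (an illustrative alternative to memory-profiler, not the intended
# implementation). tracemalloc only tracks Python-level allocations; GPU memory
# would need e.g. torch.cuda.max_memory_allocated() instead.
def sketch_measure_peak_memory(model, text):
    import tracemalloc

    tracemalloc.start()
    _ = model.encode(text)                        # assumes an encode() method
    _, peak_bytes = tracemalloc.get_traced_memory()
    tracemalloc.stop()
    return peak_bytes / (1024 * 1024)             # peak usage in MB
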
def run_advanced_evaluation():
    """
    Runs all advanced evaluation metrics and returns a report.
    """
    # Placeholder data; in practice these would come from model predictions
    # and annotated documents.
    y_true = [1, 0, 1, 1, 0]
    y_pred = [1, 1, 1, 0, 0]
    section_sentiments = [-0.5, 0.8, 0.2, -0.9]
    doc_sentiments = [-0.4, 0.7, 0.3, -0.8]
    promise_sentiments = [0.8, 0.9, 0.7]
    performance_sentiments = [0.6, 0.5, 0.7]

    classification_metrics = calculate_classification_performance(y_true, y_pred)
    hcm = calculate_hierarchical_coherence(section_sentiments, doc_sentiments)
    basd = calculate_bias_adjusted_sentiment_delta(promise_sentiments, performance_sentiments)

    report = {
        "Classification Performance": classification_metrics,
        "Hierarchical Coherence Metric (HCM)": hcm,
        "Bias-Adjusted Sentiment Delta (BASD)": basd,
        # Placeholders for metrics that need a more complex implementation.
        "Attention Faithfulness Score (AFS)": 0.75,
        "Efficiency": {
            "Inference Time (s)": 0.123,
            "Memory Usage (MB)": 512
        }
    }
    return report
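

# A possible reporting helper (a sketch; the function name and the tabular
# layout are assumptions). The module imports pandas but does not use it yet;
# one natural use is to flatten the nested report into a one-row DataFrame
# for logging or export.
def sketch_report_to_dataframe(report):
    flat = {}
    for key, value in report.items():
        if isinstance(value, dict):
            # Prefix nested metric names with their group, e.g. "Efficiency / ...".
            for sub_key, sub_value in value.items():
                flat[f"{key} / {sub_key}"] = sub_value
        else:
            flat[key] = value
    return pd.DataFrame([flat])
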
if __name__ == "__main__":
    report = run_advanced_evaluation()
    print(report)