# GeoLLM/Task2/calculate_metrics.py
import pandas as pd
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
import os
from bert_score import score as score_bert
from nltk.translate.meteor_score import meteor_score
from nltk.translate.meteor_score import single_meteor_score
import nltk
import jieba
from collections import Counter
# Download the required NLTK data (uncomment on first run)
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('omw-1.4')  # multilingual WordNet support
def calculate_metrics(model_results_paths, data_path='./data/data.xlsx'):
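    """Score yes/no answers against the 'Yes or No Train' sheet and report classification metrics."""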
    # Load the ground-truth labels
true_labels = pd.read_excel(data_path, sheet_name='Yes or No Train')
for model_results_path in model_results_paths:
        # Load the model-generated results
        model_results = pd.read_json(model_results_path)
        # Extract predicted and true labels
        predicted = model_results['answer'].apply(lambda x: 1 if x == 'Yes' else 0)
        true = true_labels['Answer'].apply(lambda x: 1 if x == 'Yes' else 0)
        # Compute classification metrics
        accuracy = accuracy_score(true, predicted)
        recall = recall_score(true, predicted)
        precision = precision_score(true, predicted)
        f1 = f1_score(true, predicted)
        # Compute AUROC (the hard labels are used as scores here; adjust if probabilities are available)
        predicted_prob = predicted
        auroc = roc_auc_score(true, predicted_prob)
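        # NOTE (assumption): AUROC computed above from hard 0/1 predictions is not very
        # informative. If the results JSON also carried a confidence column (hypothetical
        # name 'prob'), a more meaningful score could be obtained along these lines:
        #   auroc = roc_auc_score(true, model_results['prob'])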
        # Format the results
        results = (
            f'Metrics for model {model_results_path} on the dataset:\n'
            f'Accuracy: {accuracy:.4f}\n'
            f'Recall: {recall:.4f}\n'
            f'Precision: {precision:.4f}\n'
            f'F1 Score: {f1:.4f}\n'
            f'AUROC: {auroc:.4f}\n'
            '---\n'
        )
        save_path = r'F:\GeoLLM\output\output_result\Task2'  # raw string avoids invalid escape sequences
results_file_path = os.path.join(save_path, 'results_yes_or_no.txt')
        with open(results_file_path, 'a', encoding='utf-8') as f:  # open in append mode
            f.write(results)
        print(results)  # also print to the console
def evaluate_fill_in_the_blank(predicted, true):
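    """Return True if every character of the reference answer appears in the prediction."""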
    # Handle NaN and coerce to string
    predicted = str(predicted) if not pd.isna(predicted) else ""
    true = str(true) if not pd.isna(true) else ""
    # Character-containment check: every character of the reference must appear in the prediction
    return all(char in predicted for char in true)
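# Illustration of the containment criterion above (hypothetical inputs, not taken from the dataset):
#   evaluate_fill_in_the_blank('黄河流域', '黄河')  -> True   (both reference characters appear)
#   evaluate_fill_in_the_blank('长江', '黄河')      -> False  ('黄' does not appear in the prediction)
#   evaluate_fill_in_the_blank('任意文本', '')      -> True   (an empty reference is trivially contained)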
def calculate_metrics_f(model_results_paths, data_path='./data/data.xlsx'):
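    """Score factoid answers as correct/incorrect via character containment and report classification metrics."""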
    # Load the ground-truth labels
true_labels = pd.read_excel(data_path, sheet_name='Factoid Train')
for model_results_path in model_results_paths:
        # Load the model-generated results
        model_results = pd.read_json(model_results_path)
        # Build binary labels (1 = correct, 0 = incorrect)
        y_true = []
        y_pred = []
        for pred, tru in zip(model_results['answer'], true_labels['Answer']):
            # Handle NaN and coerce to string
            pred_clean = str(pred) if not pd.isna(pred) else ""
            tru_clean = str(tru) if not pd.isna(tru) else ""
            # Ground-truth label (1 = should be answered correctly, 0 = should not)
            # NOTE: adjust this to the actual data; currently every sample is assumed answerable
            y_true.append(1)
            # Predicted label from the character-containment check
            is_correct = all(char in pred_clean for char in tru_clean)
            y_pred.append(1 if is_correct else 0)
        # Handle the single-class case (AUROC is undefined when y_true contains only one class)
if len(set(y_true)) < 2:
auroc = None
else:
try:
auroc = roc_auc_score(y_true, y_pred)
except ValueError:
auroc = None
        # Compute classification metrics
accuracy = accuracy_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
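        # With y_true all ones, recall equals accuracy and precision is 1 whenever at least
        # one positive is predicted (there are no negatives in the ground truth to misclassify).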
        # Format the results
        results = (
            f'Evaluation results for model {model_results_path}:\n'
            f'Correct / total: {sum(y_pred)}/{len(y_pred)}\n'
f'Accuracy: {accuracy:.4f}\n'
f'Recall: {recall:.4f}\n'
f'Precision: {precision:.4f}\n'
f'F1 Score: {f1:.4f}\n'
f'AUROC: {auroc if auroc is None else f"{auroc:.4f}"}\n'
'---\n'
)
        print(results)  # also print to the console
        # Optionally save the results to results_f.txt
        # output_dir = os.path.dirname(model_results_path)  # directory of the model results file
        # results_file_path = os.path.join(output_dir, 'results_f.txt')
        # with open(results_file_path, 'a', encoding='utf-8') as f:  # open in append mode
        #     f.write(results)
def calculate_metrics_Factoid(model_results_paths, data_path='./data/data.xlsx'):
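    """Evaluate factoid answers with BERTScore and METEOR-style precision, recall, Fmean and penalty."""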
    # Load the ground-truth labels
true_labels = pd.read_excel(data_path, sheet_name='Factoid Train')
for model_results_path in model_results_paths:
        # Load the model-generated results
        model_results = pd.read_json(model_results_path)
        # Preprocess the answers into parallel lists
        predictions = []
        references = []
        for pred, ref in zip(model_results['answer'], true_labels['Answer']):
            # Handle NaN, coerce to string, and strip surrounding whitespace
            pred = str(pred).strip() if not pd.isna(pred) else ""
            ref = str(ref).strip() if not pd.isna(ref) else ""
            predictions.append(pred)
            references.append(ref)
        # 1. BERTScore
        P, R, F1 = score_bert(predictions, references, lang='zh', verbose=False)  # verbose=True prints progress
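        # Note: score_bert loads a pretrained model on first use (lang='zh' selects the default
        # Chinese model), so the first call may be slow and needs network access to fetch weights.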
bert_precision = P.mean().item()
bert_recall = R.mean().item()
bert_f1 = F1.mean().item()
        # 2. METEOR score and related quantities
        meteor_scores = []
        meteor_precision_scores = []
        meteor_recall_scores = []
        meteor_penalty_scores = []  # per-sample implied penalty terms
        weighted_harmonic_means = []
        # METEOR parameters
        ALPHA = 0.9  # precision weight in the Fmean
        BETA = 3.0   # fragmentation penalty weight
        GAMMA = 0.5  # penalty factor
        empty_pred = []  # collect empty predictions (initialized once, outside the per-sample loop)
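        # The quantities below follow the standard METEOR decomposition (a reconstruction, not
        # NLTK internals): Fmean = P * R / (ALPHA * P + (1 - ALPHA) * R), and the implied
        # fragmentation penalty is recovered as 1 - METEOR / Fmean, since METEOR = Fmean * (1 - Penalty).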
for pred, ref in zip(predictions, references):
            # Flag empty predictions
            if not pred:
                empty_pred.append(pred)
                print(f"Warning: empty prediction encountered. Prediction: {pred}, reference: {ref}")
            # Tokenize with jieba
            pred_tokens = list(jieba.cut(pred))
            ref_tokens = list(jieba.cut(ref))
            # Drop whitespace-only tokens
            pred_tokens = [token for token in pred_tokens if token.strip()]
            ref_tokens = [token for token in ref_tokens if token.strip()]
            # Base METEOR score
meteor = single_meteor_score(ref_tokens, pred_tokens)
# print(f"meteor: {meteor}")
            # Use Counter to handle repeated tokens when counting matches
pred_counter = Counter(pred_tokens)
ref_counter = Counter(ref_tokens)
matched_count = sum((pred_counter & ref_counter).values())
precision = matched_count / len(pred_tokens) if pred_tokens else 0
recall = matched_count / len(ref_tokens) if ref_tokens else 0
            # print(f"Precision: {precision}, Recall: {recall}")  # debug output
            # print(pred)
            # Weighted harmonic mean (METEOR Fmean)
if precision > 0 and recall > 0:
weighted_harmonic_mean = (precision * recall) / (ALPHA * precision + (1 - ALPHA) * recall)
else:
weighted_harmonic_mean = 0
# print(f"weighted_harmonic_mean: {weighted_harmonic_mean}")
if weighted_harmonic_mean != 0:
meteor_penalty_score = 1 - (meteor / weighted_harmonic_mean)
else:
meteor_penalty_score = 1
meteor_penalty_scores.append(meteor_penalty_score)
weighted_harmonic_means.append(weighted_harmonic_mean)
meteor_precision_scores.append(precision)
meteor_recall_scores.append(recall)
meteor_scores.append(meteor)
        # Average the per-sample scores
avg_meteor_precision = sum(meteor_precision_scores) / len(meteor_precision_scores) if meteor_precision_scores else 0
avg_meteor_recall = sum(meteor_recall_scores) / len(meteor_recall_scores) if meteor_recall_scores else 0
ave_Fmean = sum(weighted_harmonic_means) / len(weighted_harmonic_means) if weighted_harmonic_means else 0
avg_meteor_penalty = sum(meteor_penalty_scores) / len(meteor_penalty_scores) if meteor_penalty_scores else 0
avg_meteor = sum(meteor_scores) / len(meteor_scores) if meteor_scores else 0
        # Empty-prediction rate
empty_pred_rate = len(empty_pred) / len(predictions)
        # Format the results
        results = (
            f'Evaluation results for model {model_results_path}:\n'
            f'\nBERT Score results:\n'
            f'BERT Precision: {bert_precision:.4f}\n'
            f'BERT Recall: {bert_recall:.4f}\n'
            f'BERT F1: {bert_f1:.4f}\n'
            f'\nMETEOR Score results:\n'
            f'METEOR Precision: {avg_meteor_precision:.4f}\n'
            f'METEOR Recall: {avg_meteor_recall:.4f}\n'
            f'METEOR Fmean: {ave_Fmean:.4f}\n'
            f'METEOR Penalty (Gamma={GAMMA:.1f}, Beta={BETA:.1f}): {avg_meteor_penalty:.4f}\n'
            f'METEOR Score: {avg_meteor:.4f}\n'
            f'Empty prediction rate: {empty_pred_rate:.4f}\n'
'---\n'
)
        # save_path = r'F:\GeoLLM\output\output_result\Task2'
        # results_file_path = os.path.join(save_path, 'results_factoid.txt')
        # with open(results_file_path, 'a', encoding='utf-8') as f:  # open in append mode
        #     f.write(results)
        print(results)  # also print to the console
if __name__ == '__main__':
    # # Example invocation (factoid metrics)
# model_results_paths = [
    # # # eight runs for gpt-3.5-turbo
# # 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/gpt-3.5-turbo_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/gpt-3.5-turbo_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/gpt-3.5-turbo_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/gpt-3.5-turbo_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/gpt-3.5-turbo_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/gpt-3.5-turbo_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/gpt-3.5-turbo_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/cot/cot/gpt-3.5-turbo_f.json',
    # # # eight runs for gpt-4o
# # 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/gpt-4o_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/gpt-4o_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/gpt-4o_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/gpt-4o_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/gpt-4o_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/gpt-4o_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/gpt-4o_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/cot/cot/gpt-4o_f.json',
    # # # eight runs for gemini-1.5-pro-002
# # 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/gemini-1.5-pro-002_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/gemini-1.5-pro-002_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/gemini-1.5-pro-002_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/gemini-1.5-pro-002_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/gemini-1.5-pro-002_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/gemini-1.5-pro-002_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/gemini-1.5-pro-002_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/cot/cot/gemini-1.5-pro-002_f.json',
    # # # eight runs for claude-3-5-haiku-20241022
# # 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/claude-3-5-haiku-20241022_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/claude-3-5-haiku-20241022_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/claude-3-5-haiku-20241022_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/claude-3-5-haiku-20241022_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/claude-3-5-haiku-20241022_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/claude-3-5-haiku-20241022_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/claude-3-5-haiku-20241022_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/cot/cot/claude-3-5-haiku-20241022_f.json',
    # # # eight runs for deepseek-ai/DeepSeek-V3
# # 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/deepseek-ai/DeepSeek-V3_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/deepseek-ai/DeepSeek-V3_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/deepseek-ai/DeepSeek-V3_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/deepseek-ai/DeepSeek-V3_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/deepseek-ai/DeepSeek-V3_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/deepseek-ai/DeepSeek-V3_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/deepseek-ai/DeepSeek-V3_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/cot/cot/deepseek-ai/DeepSeek-V3_f.json',
    # # # eight runs for deepseek-ai/DeepSeek-R1
# # 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/deepseek-ai/DeepSeek-R1_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/deepseek-ai/DeepSeek-R1_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/deepseek-ai/DeepSeek-R1_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/deepseek-ai/DeepSeek-R1_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/deepseek-ai/DeepSeek-R1_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/deepseek-ai/DeepSeek-R1_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/deepseek-ai/DeepSeek-R1_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/cot/cot/deepseek-ai/DeepSeek-R1_f.json',
    # # # eight runs for meta-llama/Meta-Llama-3.1-405B-Instruct
# # 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/meta-llama/Meta-Llama-3.1-405B-Instruct_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/meta-llama/Meta-Llama-3.1-405B-Instruct_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/meta-llama/Meta-Llama-3.1-405B-Instruct_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/meta-llama/Meta-Llama-3.1-405B-Instruct_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/meta-llama/Meta-Llama-3.1-405B-Instruct_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/meta-llama/Meta-Llama-3.1-405B-Instruct_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/meta-llama/Meta-Llama-3.1-405B-Instruct_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/cot/cot/meta-llama/Meta-Llama-3.1-405B-Instruct_f.json',
    # # # eight runs for Qwen/Qwen2.5-72B-Instruct
# # 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/Qwen/Qwen2.5-72B-Instruct_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/Qwen/Qwen2.5-72B-Instruct_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/Qwen/Qwen2.5-72B-Instruct_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/Qwen/Qwen2.5-72B-Instruct_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/Qwen/Qwen2.5-72B-Instruct_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/Qwen/Qwen2.5-72B-Instruct_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/Qwen/Qwen2.5-72B-Instruct_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/cot/cot/Qwen/Qwen2.5-72B-Instruct_f.json',
    # # eight runs for cot_new
# # 'F:/GeoLLM/output/output_result/Task2/cot/cot_new/gpt-3.5-turbo_f_processed.json',
# # 'F:/GeoLLM/output/output_result/Task2/cot/cot_new/gpt-4o_f_processed.json',
# 'F:/GeoLLM/output/output_result/Task2/cot/cot_new/gemini-1.5-pro-002_f_processed.json',
# # 'F:/GeoLLM/output/output_result/Task2/cot/cot_new/claude-3-5-haiku-20241022_f_processed.json',
# # 'F:/GeoLLM/output/output_result/Task2/cot/cot_new/DeepSeek-V3_f_processed.json',
# # 'F:/GeoLLM/output/output_result/Task2/cot/cot_new/DeepSeek-R1_f_processed.json',
# # 'F:/GeoLLM/output/output_result/Task2/cot/cot_new/Meta-Llama-3.1-405B-Instruct_f_processed.json',
# # 'F:/GeoLLM/output/output_result/Task2/cot/cot_new/Qwen2.5-72B-Instruct_f_processed.json',
# ]
# data_path = './data/data.xlsx'
# calculate_metrics_Factoid(model_results_paths, data_path)
    # Example invocation (yes/no metrics)
model_results_paths = [
        # # eight runs for gpt-3.5-turbo
# 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/gpt-3.5-turbo.json',
# 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/gpt-3.5-turbo.json',
# 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/gpt-3.5-turbo.json',
# 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/gpt-3.5-turbo.json',
# 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/gpt-3.5-turbo.json',
# 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/gpt-3.5-turbo.json',
# 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/gpt-3.5-turbo.json',
# 'F:/GeoLLM/output/output_result/Task2/cot/cot/gpt-3.5-turbo.json',
        # # eight runs for gpt-4o
# 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/gpt-4o.json',
# 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/gpt-4o.json',
# 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/gpt-4o.json',
# 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/gpt-4o.json',
# 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/gpt-4o.json',
# 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/gpt-4o.json',
# 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/gpt-4o.json',
# 'F:/GeoLLM/output/output_result/Task2/cot/cot/gpt-4o.json',
        # # eight runs for gemini-1.5-pro-002
# 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/gemini-1.5-pro-002.json',
# 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/gemini-1.5-pro-002.json',
# 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/gemini-1.5-pro-002.json',
# 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/gemini-1.5-pro-002.json',
# 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/gemini-1.5-pro-002.json',
# 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/gemini-1.5-pro-002.json',
# 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/gemini-1.5-pro-002.json',
# 'F:/GeoLLM/output/output_result/Task2/cot/cot/gemini-1.5-pro-002.json',
        # # eight runs for claude-3-5-haiku-20241022
# 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/claude-3-5-haiku-20241022.json',
# 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/claude-3-5-haiku-20241022.json',
# 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/claude-3-5-haiku-20241022.json',
# 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/claude-3-5-haiku-20241022.json',
# 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/claude-3-5-haiku-20241022.json',
# 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/claude-3-5-haiku-20241022.json',
# 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/claude-3-5-haiku-20241022.json',
# 'F:/GeoLLM/output/output_result/Task2/cot/cot/claude-3-5-haiku-20241022.json',
        # # eight runs for deepseek-ai/DeepSeek-V3
# 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/deepseek-ai/DeepSeek-V3.json',
# 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/deepseek-ai/DeepSeek-V3.json',
# 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/deepseek-ai/DeepSeek-V3.json',
# 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/deepseek-ai/DeepSeek-V3.json',
# 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/deepseek-ai/DeepSeek-V3.json',
# 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/deepseek-ai/DeepSeek-V3.json',
# 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/deepseek-ai/DeepSeek-V3.json',
# 'F:/GeoLLM/output/output_result/Task2/cot/cot/deepseek-ai/DeepSeek-V3.json',
        # # eight runs for deepseek-ai/DeepSeek-R1
# 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/deepseek-ai/DeepSeek-R1.json',
# 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/deepseek-ai/DeepSeek-R1.json',
# 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/deepseek-ai/DeepSeek-R1.json',
# 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/deepseek-ai/DeepSeek-R1.json',
# 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/deepseek-ai/DeepSeek-R1.json',
# 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/deepseek-ai/DeepSeek-R1.json',
# 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/deepseek-ai/DeepSeek-R1.json',
        # # eight runs for meta-llama/Meta-Llama-3.1-405B-Instruct
# 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/meta-llama/Meta-Llama-3.1-405B-Instruct.json',
# 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/meta-llama/Meta-Llama-3.1-405B-Instruct.json',
# 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/meta-llama/Meta-Llama-3.1-405B-Instruct.json',
# 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/meta-llama/Meta-Llama-3.1-405B-Instruct.json',
# 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/meta-llama/Meta-Llama-3.1-405B-Instruct.json',
# 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/meta-llama/Meta-Llama-3.1-405B-Instruct.json',
# 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/meta-llama/Meta-Llama-3.1-405B-Instruct.json',
# 'F:/GeoLLM/output/output_result/Task2/cot/cot/meta-llama/Meta-Llama-3.1-405B-Instruct.json',
        # # eight runs for Qwen/Qwen2.5-72B-Instruct
# 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/Qwen/Qwen2.5-72B-Instruct.json',
# 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/Qwen/Qwen2.5-72B-Instruct.json',
# 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/Qwen/Qwen2.5-72B-Instruct.json',
# 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/Qwen/Qwen2.5-72B-Instruct.json',
# 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/Qwen/Qwen2.5-72B-Instruct.json',
# 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/Qwen/Qwen2.5-72B-Instruct.json',
# 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/Qwen/Qwen2.5-72B-Instruct.json',
# 'F:/GeoLLM/output/output_result/Task2/cot/cot/Qwen/Qwen2.5-72B-Instruct.json',
'F:/GeoLLM/Task2/output/nomal1/zero_shot/gpt-3.5-turbo.json',
'F:/GeoLLM/Task2/output/nomal1/one_shot/gpt-3.5-turbo.json',
'F:/GeoLLM/Task2/output/nomal1/two_shot/gpt-3.5-turbo.json',
'F:/GeoLLM/Task2/output/nomal1/three_shot/gpt-3.5-turbo.json',
'F:/GeoLLM/Task2/output/knn1/one_shot/gpt-3.5-turbo.json',
'F:/GeoLLM/Task2/output/knn1/two_shot/gpt-3.5-turbo.json',
'F:/GeoLLM/Task2/output/knn1/three_shot/gpt-3.5-turbo.json',
'F:/GeoLLM/Task2/output/knn1/three_shot/gpt-3.5-turbo_old2.json',
'F:/GeoLLM/Task2/output/knn1/three_shot/gpt-3.5-turbo_old1.json',
'F:/GeoLLM/Task2/output/cot1/cot/gpt-3.5-turbo.json',
'F:/GeoLLM/Task2/output/cot1/cot/gpt-3.5-turbo_old.json',
]
data_path = './data/data.xlsx'
calculate_metrics(model_results_paths, data_path)