# GeoLLM/Task2/calculate_metrics.py
import pandas as pd
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
import os
from bert_score import score as score_bert
from nltk.translate.meteor_score import meteor_score
from nltk.translate.meteor_score import single_meteor_score
import nltk
import jieba
from collections import Counter
# Download the required NLTK data (uncomment on first run)
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('omw-1.4')  # multilingual WordNet support
def calculate_metrics(model_results_paths, data_path='./data/data.xlsx'):
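    """Score yes/no answers against the 'Yes or No Train' sheet and report classification metrics."""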
    # Load the ground-truth labels
true_labels = pd.read_excel(data_path, sheet_name='Yes or No Train')
for model_results_path in model_results_paths:
        # Load the model-generated results
        model_results = pd.read_json(model_results_path)
        # Extract predicted and true labels
        predicted = model_results['answer'].apply(lambda x: 1 if x == 'Yes' else 0)
        true = true_labels['Answer'].apply(lambda x: 1 if x == 'Yes' else 0)
        # Compute classification metrics
        accuracy = accuracy_score(true, predicted)
        recall = recall_score(true, predicted)
        precision = precision_score(true, predicted)
        f1 = f1_score(true, predicted)
        # Compute AUROC (the hard labels are used as scores here; adjust if probabilities are available)
        predicted_prob = predicted
        auroc = roc_auc_score(true, predicted_prob)
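        # NOTE (assumption): AUROC computed above from hard 0/1 predictions is not very
        # informative. If the results JSON also carried a confidence column (hypothetical
        # name 'prob'), a more meaningful score could be obtained along these lines:
        #   auroc = roc_auc_score(true, model_results['prob'])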
        # Format the results
        results = (
            f'Metrics for model {model_results_path} on the dataset:\n'
            f'Accuracy: {accuracy:.4f}\n'
            f'Recall: {recall:.4f}\n'
            f'Precision: {precision:.4f}\n'
            f'F1 Score: {f1:.4f}\n'
            f'AUROC: {auroc:.4f}\n'
            '---\n'
        )
        save_path = r'F:\GeoLLM\output\output_result\Task2'  # raw string avoids invalid escape sequences
results_file_path = os.path.join(save_path, 'results_yes_or_no.txt')
        with open(results_file_path, 'a', encoding='utf-8') as f:  # open in append mode
            f.write(results)
        print(results)  # also print to the console
def evaluate_fill_in_the_blank(predicted, true):
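    """Return True if every character of the reference answer appears in the prediction."""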
    # Handle NaN and coerce to string
    predicted = str(predicted) if not pd.isna(predicted) else ""
    true = str(true) if not pd.isna(true) else ""
    # Character-containment check: every character of the reference must appear in the prediction
    return all(char in predicted for char in true)
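# Illustration of the containment criterion above (hypothetical inputs, not taken from the dataset):
#   evaluate_fill_in_the_blank('黄河流域', '黄河')  -> True   (both reference characters appear)
#   evaluate_fill_in_the_blank('长江', '黄河')      -> False  ('黄' does not appear in the prediction)
#   evaluate_fill_in_the_blank('任意文本', '')      -> True   (an empty reference is trivially contained)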
def calculate_metrics_f(model_results_paths, data_path='./data/data.xlsx'):
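    """Score factoid answers as correct/incorrect via character containment and report classification metrics."""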
    # Load the ground-truth labels
true_labels = pd.read_excel(data_path, sheet_name='Factoid Train')
for model_results_path in model_results_paths:
        # Load the model-generated results
        model_results = pd.read_json(model_results_path)
        # Build binary labels (1 = correct, 0 = incorrect)
        y_true = []
        y_pred = []
        for pred, tru in zip(model_results['answer'], true_labels['Answer']):
            # Handle NaN and coerce to string
            pred_clean = str(pred) if not pd.isna(pred) else ""
            tru_clean = str(tru) if not pd.isna(tru) else ""
            # Ground-truth label (1 = should be answered correctly, 0 = should not)
            # NOTE: adjust this to the actual data; currently every sample is assumed answerable
            y_true.append(1)
            # Predicted label from the character-containment check
            is_correct = all(char in pred_clean for char in tru_clean)
            y_pred.append(1 if is_correct else 0)
        # Handle the single-class case (AUROC is undefined when y_true contains only one class)
if len(set(y_true)) < 2:
auroc = None
else:
try:
auroc = roc_auc_score(y_true, y_pred)
except ValueError:
auroc = None
        # Compute classification metrics
accuracy = accuracy_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
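        # With y_true all ones, recall equals accuracy and precision is 1 whenever at least
        # one positive is predicted (there are no negatives in the ground truth to misclassify).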
        # Format the results
        results = (
            f'Evaluation results for model {model_results_path}:\n'
            f'Correct / total: {sum(y_pred)}/{len(y_pred)}\n'
f'Accuracy: {accuracy:.4f}\n'
f'Recall: {recall:.4f}\n'
f'Precision: {precision:.4f}\n'
f'F1 Score: {f1:.4f}\n'
f'AUROC: {auroc if auroc is None else f"{auroc:.4f}"}\n'
'---\n'
)
        print(results)  # also print to the console
        # Optionally save the results to results_f.txt
        # output_dir = os.path.dirname(model_results_path)  # directory of the model results file
        # results_file_path = os.path.join(output_dir, 'results_f.txt')
        # with open(results_file_path, 'a', encoding='utf-8') as f:  # open in append mode
        #     f.write(results)
def calculate_metrics_Factoid(model_results_paths, data_path='./data/data.xlsx'):
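    """Evaluate factoid answers with BERTScore and METEOR-style precision, recall, Fmean and penalty."""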
    # Load the ground-truth labels
true_labels = pd.read_excel(data_path, sheet_name='Factoid Train')
for model_results_path in model_results_paths:
        # Load the model-generated results
        model_results = pd.read_json(model_results_path)
        # Preprocess the answers into parallel lists
        predictions = []
        references = []
        for pred, ref in zip(model_results['answer'], true_labels['Answer']):
            # Handle NaN, coerce to string, and strip surrounding whitespace
            pred = str(pred).strip() if not pd.isna(pred) else ""
            ref = str(ref).strip() if not pd.isna(ref) else ""
            predictions.append(pred)
            references.append(ref)
        # 1. BERTScore
        P, R, F1 = score_bert(predictions, references, lang='zh', verbose=False)  # verbose=True prints progress
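        # Note: score_bert loads a pretrained model on first use (lang='zh' selects the default
        # Chinese model), so the first call may be slow and needs network access to fetch weights.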
bert_precision = P.mean().item()
bert_recall = R.mean().item()
bert_f1 = F1.mean().item()
        # 2. METEOR score and related quantities
        meteor_scores = []
        meteor_precision_scores = []
        meteor_recall_scores = []
        meteor_penalty_scores = []  # per-sample implied penalty terms
        weighted_harmonic_means = []
        # METEOR parameters
        ALPHA = 0.9  # precision weight in the Fmean
        BETA = 3.0   # fragmentation penalty weight
        GAMMA = 0.5  # penalty factor
        empty_pred = []  # collect empty predictions (initialized once, outside the per-sample loop)
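        # The quantities below follow the standard METEOR decomposition (a reconstruction, not
        # NLTK internals): Fmean = P * R / (ALPHA * P + (1 - ALPHA) * R), and the implied
        # fragmentation penalty is recovered as 1 - METEOR / Fmean, since METEOR = Fmean * (1 - Penalty).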
for pred, ref in zip(predictions, references):
            # Flag empty predictions
            if not pred:
                empty_pred.append(pred)
                print(f"Warning: empty prediction encountered. Prediction: {pred}, reference: {ref}")
            # Tokenize with jieba
            pred_tokens = list(jieba.cut(pred))
            ref_tokens = list(jieba.cut(ref))
            # Drop whitespace-only tokens
            pred_tokens = [token for token in pred_tokens if token.strip()]
            ref_tokens = [token for token in ref_tokens if token.strip()]
            # Base METEOR score
meteor = single_meteor_score(ref_tokens, pred_tokens)
# print(f"meteor: {meteor}")
            # Use Counter to handle repeated tokens when counting matches
pred_counter = Counter(pred_tokens)
ref_counter = Counter(ref_tokens)
matched_count = sum((pred_counter & ref_counter).values())
precision = matched_count / len(pred_tokens) if pred_tokens else 0
recall = matched_count / len(ref_tokens) if ref_tokens else 0
            # print(f"Precision: {precision}, Recall: {recall}")  # debug output
            # print(pred)
            # Weighted harmonic mean (METEOR Fmean)
if precision > 0 and recall > 0:
weighted_harmonic_mean = (precision * recall) / (ALPHA * precision + (1 - ALPHA) * recall)
else:
weighted_harmonic_mean = 0
# print(f"weighted_harmonic_mean: {weighted_harmonic_mean}")
if weighted_harmonic_mean != 0:
meteor_penalty_score = 1 - (meteor / weighted_harmonic_mean)
else:
meteor_penalty_score = 1
meteor_penalty_scores.append(meteor_penalty_score)
weighted_harmonic_means.append(weighted_harmonic_mean)
meteor_precision_scores.append(precision)
meteor_recall_scores.append(recall)
meteor_scores.append(meteor)
        # Average the per-sample scores
avg_meteor_precision = sum(meteor_precision_scores) / len(meteor_precision_scores) if meteor_precision_scores else 0
avg_meteor_recall = sum(meteor_recall_scores) / len(meteor_recall_scores) if meteor_recall_scores else 0
ave_Fmean = sum(weighted_harmonic_means) / len(weighted_harmonic_means) if weighted_harmonic_means else 0
avg_meteor_penalty = sum(meteor_penalty_scores) / len(meteor_penalty_scores) if meteor_penalty_scores else 0
avg_meteor = sum(meteor_scores) / len(meteor_scores) if meteor_scores else 0
        # Empty-prediction rate
empty_pred_rate = len(empty_pred) / len(predictions)
        # Format the results
        results = (
            f'Evaluation results for model {model_results_path}:\n'
            f'\nBERT Score results:\n'
            f'BERT Precision: {bert_precision:.4f}\n'
            f'BERT Recall: {bert_recall:.4f}\n'
            f'BERT F1: {bert_f1:.4f}\n'
            f'\nMETEOR Score results:\n'
            f'METEOR Precision: {avg_meteor_precision:.4f}\n'
            f'METEOR Recall: {avg_meteor_recall:.4f}\n'
            f'METEOR Fmean: {ave_Fmean:.4f}\n'
            f'METEOR Penalty (Gamma={GAMMA:.1f}, Beta={BETA:.1f}): {avg_meteor_penalty:.4f}\n'
            f'METEOR Score: {avg_meteor:.4f}\n'
            f'Empty prediction rate: {empty_pred_rate:.4f}\n'
'---\n'
)
        # save_path = r'F:\GeoLLM\output\output_result\Task2'
        # results_file_path = os.path.join(save_path, 'results_factoid.txt')
        # with open(results_file_path, 'a', encoding='utf-8') as f:  # open in append mode
        #     f.write(results)
        print(results)  # also print to the console
if __name__ == '__main__':
    # # Example invocation (factoid metrics)
# model_results_paths = [
    # # # eight runs for gpt-3.5-turbo
# # 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/gpt-3.5-turbo_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/gpt-3.5-turbo_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/gpt-3.5-turbo_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/gpt-3.5-turbo_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/gpt-3.5-turbo_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/gpt-3.5-turbo_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/gpt-3.5-turbo_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/cot/cot/gpt-3.5-turbo_f.json',
    # # # eight runs for gpt-4o
# # 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/gpt-4o_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/gpt-4o_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/gpt-4o_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/gpt-4o_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/gpt-4o_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/gpt-4o_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/gpt-4o_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/cot/cot/gpt-4o_f.json',
    # # # eight runs for gemini-1.5-pro-002
# # 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/gemini-1.5-pro-002_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/gemini-1.5-pro-002_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/gemini-1.5-pro-002_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/gemini-1.5-pro-002_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/gemini-1.5-pro-002_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/gemini-1.5-pro-002_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/gemini-1.5-pro-002_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/cot/cot/gemini-1.5-pro-002_f.json',
    # # # eight runs for claude-3-5-haiku-20241022
# # 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/claude-3-5-haiku-20241022_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/claude-3-5-haiku-20241022_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/claude-3-5-haiku-20241022_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/claude-3-5-haiku-20241022_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/claude-3-5-haiku-20241022_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/claude-3-5-haiku-20241022_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/claude-3-5-haiku-20241022_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/cot/cot/claude-3-5-haiku-20241022_f.json',
    # # # eight runs for deepseek-ai/DeepSeek-V3
# # 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/deepseek-ai/DeepSeek-V3_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/deepseek-ai/DeepSeek-V3_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/deepseek-ai/DeepSeek-V3_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/deepseek-ai/DeepSeek-V3_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/deepseek-ai/DeepSeek-V3_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/deepseek-ai/DeepSeek-V3_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/deepseek-ai/DeepSeek-V3_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/cot/cot/deepseek-ai/DeepSeek-V3_f.json',
    # # # eight runs for deepseek-ai/DeepSeek-R1
# # 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/deepseek-ai/DeepSeek-R1_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/deepseek-ai/DeepSeek-R1_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/deepseek-ai/DeepSeek-R1_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/deepseek-ai/DeepSeek-R1_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/deepseek-ai/DeepSeek-R1_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/deepseek-ai/DeepSeek-R1_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/deepseek-ai/DeepSeek-R1_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/cot/cot/deepseek-ai/DeepSeek-R1_f.json',
    # # # eight runs for meta-llama/Meta-Llama-3.1-405B-Instruct
# # 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/meta-llama/Meta-Llama-3.1-405B-Instruct_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/meta-llama/Meta-Llama-3.1-405B-Instruct_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/meta-llama/Meta-Llama-3.1-405B-Instruct_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/meta-llama/Meta-Llama-3.1-405B-Instruct_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/meta-llama/Meta-Llama-3.1-405B-Instruct_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/meta-llama/Meta-Llama-3.1-405B-Instruct_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/meta-llama/Meta-Llama-3.1-405B-Instruct_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/cot/cot/meta-llama/Meta-Llama-3.1-405B-Instruct_f.json',
    # # # eight runs for Qwen/Qwen2.5-72B-Instruct
# # 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/Qwen/Qwen2.5-72B-Instruct_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/Qwen/Qwen2.5-72B-Instruct_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/Qwen/Qwen2.5-72B-Instruct_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/Qwen/Qwen2.5-72B-Instruct_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/Qwen/Qwen2.5-72B-Instruct_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/Qwen/Qwen2.5-72B-Instruct_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/Qwen/Qwen2.5-72B-Instruct_f.json',
# # 'F:/GeoLLM/output/output_result/Task2/cot/cot/Qwen/Qwen2.5-72B-Instruct_f.json',
    # # eight runs for cot_new
# # 'F:/GeoLLM/output/output_result/Task2/cot/cot_new/gpt-3.5-turbo_f_processed.json',
# # 'F:/GeoLLM/output/output_result/Task2/cot/cot_new/gpt-4o_f_processed.json',
# 'F:/GeoLLM/output/output_result/Task2/cot/cot_new/gemini-1.5-pro-002_f_processed.json',
# # 'F:/GeoLLM/output/output_result/Task2/cot/cot_new/claude-3-5-haiku-20241022_f_processed.json',
# # 'F:/GeoLLM/output/output_result/Task2/cot/cot_new/DeepSeek-V3_f_processed.json',
# # 'F:/GeoLLM/output/output_result/Task2/cot/cot_new/DeepSeek-R1_f_processed.json',
# # 'F:/GeoLLM/output/output_result/Task2/cot/cot_new/Meta-Llama-3.1-405B-Instruct_f_processed.json',
# # 'F:/GeoLLM/output/output_result/Task2/cot/cot_new/Qwen2.5-72B-Instruct_f_processed.json',
# ]
# data_path = './data/data.xlsx'
# calculate_metrics_Factoid(model_results_paths, data_path)
    # Example invocation (yes/no metrics)
model_results_paths = [
        # # eight runs for gpt-3.5-turbo
# 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/gpt-3.5-turbo.json',
# 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/gpt-3.5-turbo.json',
# 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/gpt-3.5-turbo.json',
# 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/gpt-3.5-turbo.json',
# 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/gpt-3.5-turbo.json',
# 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/gpt-3.5-turbo.json',
# 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/gpt-3.5-turbo.json',
# 'F:/GeoLLM/output/output_result/Task2/cot/cot/gpt-3.5-turbo.json',
        # # eight runs for gpt-4o
# 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/gpt-4o.json',
# 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/gpt-4o.json',
# 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/gpt-4o.json',
# 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/gpt-4o.json',
# 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/gpt-4o.json',
# 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/gpt-4o.json',
# 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/gpt-4o.json',
# 'F:/GeoLLM/output/output_result/Task2/cot/cot/gpt-4o.json',
        # # eight runs for gemini-1.5-pro-002
# 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/gemini-1.5-pro-002.json',
# 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/gemini-1.5-pro-002.json',
# 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/gemini-1.5-pro-002.json',
# 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/gemini-1.5-pro-002.json',
# 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/gemini-1.5-pro-002.json',
# 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/gemini-1.5-pro-002.json',
# 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/gemini-1.5-pro-002.json',
# 'F:/GeoLLM/output/output_result/Task2/cot/cot/gemini-1.5-pro-002.json',
        # # eight runs for claude-3-5-haiku-20241022
# 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/claude-3-5-haiku-20241022.json',
# 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/claude-3-5-haiku-20241022.json',
# 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/claude-3-5-haiku-20241022.json',
# 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/claude-3-5-haiku-20241022.json',
# 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/claude-3-5-haiku-20241022.json',
# 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/claude-3-5-haiku-20241022.json',
# 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/claude-3-5-haiku-20241022.json',
# 'F:/GeoLLM/output/output_result/Task2/cot/cot/claude-3-5-haiku-20241022.json',
        # # eight runs for deepseek-ai/DeepSeek-V3
# 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/deepseek-ai/DeepSeek-V3.json',
# 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/deepseek-ai/DeepSeek-V3.json',
# 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/deepseek-ai/DeepSeek-V3.json',
# 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/deepseek-ai/DeepSeek-V3.json',
# 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/deepseek-ai/DeepSeek-V3.json',
# 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/deepseek-ai/DeepSeek-V3.json',
# 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/deepseek-ai/DeepSeek-V3.json',
# 'F:/GeoLLM/output/output_result/Task2/cot/cot/deepseek-ai/DeepSeek-V3.json',
        # # eight runs for deepseek-ai/DeepSeek-R1
# 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/deepseek-ai/DeepSeek-R1.json',
# 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/deepseek-ai/DeepSeek-R1.json',
# 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/deepseek-ai/DeepSeek-R1.json',
# 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/deepseek-ai/DeepSeek-R1.json',
# 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/deepseek-ai/DeepSeek-R1.json',
# 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/deepseek-ai/DeepSeek-R1.json',
# 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/deepseek-ai/DeepSeek-R1.json',
        # # eight runs for meta-llama/Meta-Llama-3.1-405B-Instruct
# 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/meta-llama/Meta-Llama-3.1-405B-Instruct.json',
# 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/meta-llama/Meta-Llama-3.1-405B-Instruct.json',
# 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/meta-llama/Meta-Llama-3.1-405B-Instruct.json',
# 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/meta-llama/Meta-Llama-3.1-405B-Instruct.json',
# 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/meta-llama/Meta-Llama-3.1-405B-Instruct.json',
# 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/meta-llama/Meta-Llama-3.1-405B-Instruct.json',
# 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/meta-llama/Meta-Llama-3.1-405B-Instruct.json',
# 'F:/GeoLLM/output/output_result/Task2/cot/cot/meta-llama/Meta-Llama-3.1-405B-Instruct.json',
        # # eight runs for Qwen/Qwen2.5-72B-Instruct
# 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/Qwen/Qwen2.5-72B-Instruct.json',
# 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/Qwen/Qwen2.5-72B-Instruct.json',
# 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/Qwen/Qwen2.5-72B-Instruct.json',
# 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/Qwen/Qwen2.5-72B-Instruct.json',
# 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/Qwen/Qwen2.5-72B-Instruct.json',
# 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/Qwen/Qwen2.5-72B-Instruct.json',
# 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/Qwen/Qwen2.5-72B-Instruct.json',
# 'F:/GeoLLM/output/output_result/Task2/cot/cot/Qwen/Qwen2.5-72B-Instruct.json',
'F:/GeoLLM/Task2/output/nomal1/zero_shot/gpt-3.5-turbo.json',
'F:/GeoLLM/Task2/output/nomal1/one_shot/gpt-3.5-turbo.json',
'F:/GeoLLM/Task2/output/nomal1/two_shot/gpt-3.5-turbo.json',
'F:/GeoLLM/Task2/output/nomal1/three_shot/gpt-3.5-turbo.json',
'F:/GeoLLM/Task2/output/knn1/one_shot/gpt-3.5-turbo.json',
'F:/GeoLLM/Task2/output/knn1/two_shot/gpt-3.5-turbo.json',
'F:/GeoLLM/Task2/output/knn1/three_shot/gpt-3.5-turbo.json',
'F:/GeoLLM/Task2/output/knn1/three_shot/gpt-3.5-turbo_old2.json',
'F:/GeoLLM/Task2/output/knn1/three_shot/gpt-3.5-turbo_old1.json',
'F:/GeoLLM/Task2/output/cot1/cot/gpt-3.5-turbo.json',
'F:/GeoLLM/Task2/output/cot1/cot/gpt-3.5-turbo_old.json',
]
data_path = './data/data.xlsx'
calculate_metrics(model_results_paths, data_path)