import os
from collections import Counter

import jieba
import nltk
import pandas as pd
from bert_score import score as score_bert
from nltk.translate.meteor_score import single_meteor_score
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score

# Download the required NLTK data (only needed once)
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('omw-1.4')  # multilingual WordNet support


def calculate_metrics(model_results_paths, data_path='./data/data.xlsx'):
    # Load the ground-truth labels
    true_labels = pd.read_excel(data_path, sheet_name='Yes or No Train')

    for model_results_path in model_results_paths:
        # Load the model-generated results
        model_results = pd.read_json(model_results_path)

        # Map predictions and ground truth to binary labels
        predicted = model_results['answer'].apply(lambda x: 1 if x == 'Yes' else 0)
        true = true_labels['Answer'].apply(lambda x: 1 if x == 'Yes' else 0)

        # Compute the classification metrics
        accuracy = accuracy_score(true, predicted)
        recall = recall_score(true, predicted)
        precision = precision_score(true, predicted)
        f1 = f1_score(true, predicted)

        # Compute AUROC. The hard 0/1 predictions are used as scores here;
        # adjust this if per-sample probabilities are available.
        predicted_prob = predicted
        auroc = roc_auc_score(true, predicted_prob)

        # Format the report
        results = (
            f'Metrics for model {model_results_path} on the dataset:\n'
            f'Accuracy: {accuracy:.4f}\n'
            f'Recall: {recall:.4f}\n'
            f'Precision: {precision:.4f}\n'
            f'F1 Score: {f1:.4f}\n'
            f'AUROC: {auroc:.4f}\n'
            '---\n'
        )

        save_path = r'F:\GeoLLM\output\output_result\Task2'  # raw string avoids invalid escape sequences
        results_file_path = os.path.join(save_path, 'results_yes_or_no.txt')
        with open(results_file_path, 'a', encoding='utf-8') as f:  # append mode
            f.write(results)

        print(results)  # also print to the console


def evaluate_fill_in_the_blank(predicted, true):
    # Handle missing values and cast to string
    predicted = str(predicted) if not pd.isna(predicted) else ""
    true = str(true) if not pd.isna(true) else ""
    # Treat the answer as correct if every character of the reference appears in the prediction
    return all(char in predicted for char in true)


def calculate_metrics_f(model_results_paths, data_path='./data/data.xlsx'):
    # Load the ground-truth labels
    true_labels = pd.read_excel(data_path, sheet_name='Factoid Train')

    for model_results_path in model_results_paths:
        # Load the model-generated results
        model_results = pd.read_json(model_results_path)

        # Build binary labels (1 = correct, 0 = incorrect)
        y_true = []
        y_pred = []
        for pred, tru in zip(model_results['answer'], true_labels['Answer']):
            # Handle missing values and cast to string
            pred_clean = str(pred) if not pd.isna(pred) else ""
            tru_clean = str(tru) if not pd.isna(tru) else ""

            # Ground-truth label (1 = should be answered correctly).
            # Note: adjust this to the actual data; currently every sample is assumed answerable.
            y_true.append(1)

            # Predicted label: correct if every reference character appears in the prediction
            is_correct = all(char in pred_clean for char in tru_clean)
            y_pred.append(1 if is_correct else 0)

        # AUROC is undefined when only one class is present
        if len(set(y_true)) < 2:
            auroc = None
        else:
            try:
                auroc = roc_auc_score(y_true, y_pred)
            except ValueError:
                auroc = None

        # Compute the classification metrics
        accuracy = accuracy_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)

        # Format the report
        results = (
            f'Evaluation results for model {model_results_path}:\n'
            f'Correct/Total: {sum(y_pred)}/{len(y_pred)}\n'
            f'Accuracy: {accuracy:.4f}\n'
            f'Recall: {recall:.4f}\n'
            f'Precision: {precision:.4f}\n'
            f'F1 Score: {f1:.4f}\n'
            f'AUROC: {auroc if auroc is None else f"{auroc:.4f}"}\n'
            '---\n'
        )
        print(results)  # also print to the console

        # Save the results to results_f.txt
        # output_dir = os.path.dirname(model_results_path)  # directory of the model result file
        # results_file_path = os.path.join(output_dir, 'results_f.txt')
        # with open(results_file_path, 'a', encoding='utf-8') as f:  # append mode
        #     f.write(results)
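
# Both evaluators above feed hard 0/1 predictions into roc_auc_score, so the AUROC reduces to a
# single operating point. A minimal sketch of using continuous scores instead, assuming the
# result JSON also carried a hypothetical per-sample confidence column named 'prob' (this column
# is not present in the current outputs):
def auroc_from_probabilities(model_results, true):
    """Return AUROC computed from a hypothetical 'prob' column, or None if it is absent."""
    if 'prob' not in model_results.columns:
        return None
    return roc_auc_score(true, model_results['prob'])
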
def calculate_metrics_Factoid(model_results_paths, data_path='./data/data.xlsx'):
    # Load the ground-truth labels
    true_labels = pd.read_excel(data_path, sheet_name='Factoid Train')

    for model_results_path in model_results_paths:
        # Load the model-generated results
        model_results = pd.read_json(model_results_path)

        # Preprocess the answers into parallel lists
        predictions = []
        references = []
        for pred, ref in zip(model_results['answer'], true_labels['Answer']):
            # Handle missing values, cast to string, and strip surrounding whitespace
            pred = str(pred).strip() if not pd.isna(pred) else ""
            ref = str(ref).strip() if not pd.isna(ref) else ""
            predictions.append(pred)
            references.append(ref)

        # 1. BERT Score
        P, R, F1 = score_bert(predictions, references, lang='zh', verbose=False)  # verbose=True prints progress
        bert_precision = P.mean().item()
        bert_recall = R.mean().item()
        bert_f1 = F1.mean().item()

        # 2. METEOR score and its components
        meteor_scores = []
        meteor_precision_scores = []
        meteor_recall_scores = []
        meteor_penalty_scores = []
        weighted_harmonic_means = []

        # METEOR parameters
        ALPHA = 0.9   # precision weight in the harmonic mean
        BETA = 3.0    # fragmentation penalty exponent (reported only)
        GAMMA = 0.5   # fragmentation penalty weight (reported only)

        empty_pred = []  # collect empty predictions for this result file

        for pred, ref in zip(predictions, references):
            # Flag empty predictions
            if not pred:
                empty_pred.append(pred)
                print(f"Warning: empty prediction encountered. Prediction: {pred!r}, reference: {ref!r}")

            # Tokenise with jieba and drop whitespace-only tokens
            pred_tokens = [token for token in jieba.cut(pred) if token.strip()]
            ref_tokens = [token for token in jieba.cut(ref) if token.strip()]

            # Base METEOR score
            meteor = single_meteor_score(ref_tokens, pred_tokens)

            # Unigram precision/recall, with duplicate words handled via Counter
            pred_counter = Counter(pred_tokens)
            ref_counter = Counter(ref_tokens)
            matched_count = sum((pred_counter & ref_counter).values())
            precision = matched_count / len(pred_tokens) if pred_tokens else 0
            recall = matched_count / len(ref_tokens) if ref_tokens else 0

            # Weighted harmonic mean (METEOR Fmean)
            if precision > 0 and recall > 0:
                weighted_harmonic_mean = (precision * recall) / (ALPHA * precision + (1 - ALPHA) * recall)
            else:
                weighted_harmonic_mean = 0

            # Back out the fragmentation penalty from METEOR = Fmean * (1 - Penalty)
            if weighted_harmonic_mean != 0:
                meteor_penalty_score = 1 - (meteor / weighted_harmonic_mean)
            else:
                meteor_penalty_score = 1

            meteor_penalty_scores.append(meteor_penalty_score)
            weighted_harmonic_means.append(weighted_harmonic_mean)
            meteor_precision_scores.append(precision)
            meteor_recall_scores.append(recall)
            meteor_scores.append(meteor)

        # Average the per-sample scores
        avg_meteor_precision = sum(meteor_precision_scores) / len(meteor_precision_scores) if meteor_precision_scores else 0
        avg_meteor_recall = sum(meteor_recall_scores) / len(meteor_recall_scores) if meteor_recall_scores else 0
        ave_Fmean = sum(weighted_harmonic_means) / len(weighted_harmonic_means) if weighted_harmonic_means else 0
        avg_meteor_penalty = sum(meteor_penalty_scores) / len(meteor_penalty_scores) if meteor_penalty_scores else 0
        avg_meteor = sum(meteor_scores) / len(meteor_scores) if meteor_scores else 0

        # Fraction of empty predictions
        empty_pred_rate = len(empty_pred) / len(predictions) if predictions else 0

        # Format the report
        results = (
            f'Evaluation results for model {model_results_path}:\n'
            f'\nBERT Score:\n'
            f'BERT Precision: {bert_precision:.4f}\n'
            f'BERT Recall: {bert_recall:.4f}\n'
            f'BERT F1: {bert_f1:.4f}\n'
            f'\nMETEOR Score:\n'
            f'METEOR Precision: {avg_meteor_precision:.4f}\n'
            f'METEOR Recall: {avg_meteor_recall:.4f}\n'
            f'METEOR Fmean: {ave_Fmean:.4f}\n'
            f'METEOR Penalty (Gamma={GAMMA:.1f}, Beta={BETA:.1f}): {avg_meteor_penalty:.4f}\n'
            f'METEOR Score: {avg_meteor:.4f}\n'
            f'Empty prediction rate: {empty_pred_rate:.4f}\n'
            '---\n'
        )

        # save_path = r'F:\GeoLLM\output\output_result\Task2'
        # results_file_path = os.path.join(save_path, 'results_factoid.txt')
        # with open(results_file_path, 'a', encoding='utf-8') as f:  # append mode
        #     f.write(results)

        print(results)  # also print to the console
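
# A small, self-contained sketch of the unigram-overlap arithmetic used above, on a toy pair
# (the strings below are illustrative only, not taken from the dataset). With prediction tokens
# ['金矿', '位于', '山区'] and reference tokens ['金矿', '在', '山区'], the Counter intersection
# matches 2 tokens, so precision = recall = 2/3 and, with ALPHA = 0.9,
# Fmean = (2/3 * 2/3) / (0.9 * 2/3 + 0.1 * 2/3) = 2/3.
def _demo_unigram_overlap():
    pred_tokens = ['金矿', '位于', '山区']
    ref_tokens = ['金矿', '在', '山区']
    matched = sum((Counter(pred_tokens) & Counter(ref_tokens)).values())
    precision = matched / len(pred_tokens)
    recall = matched / len(ref_tokens)
    fmean = (precision * recall) / (0.9 * precision + 0.1 * recall)
    print(matched, precision, recall, fmean)  # 2 0.666... 0.666... 0.666...
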
if __name__ == '__main__':
    # # Example call (Factoid evaluation)
    # model_results_paths = [
    #     # # eight tests for gpt-3.5-turbo
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/gpt-3.5-turbo_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/gpt-3.5-turbo_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/gpt-3.5-turbo_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/gpt-3.5-turbo_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/gpt-3.5-turbo_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/gpt-3.5-turbo_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/gpt-3.5-turbo_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/cot/cot/gpt-3.5-turbo_f.json',
    #     # # eight tests for gpt-4o
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/gpt-4o_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/gpt-4o_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/gpt-4o_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/gpt-4o_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/gpt-4o_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/gpt-4o_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/gpt-4o_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/cot/cot/gpt-4o_f.json',
    #     # # eight tests for gemini-1.5-pro-002
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/gemini-1.5-pro-002_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/gemini-1.5-pro-002_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/gemini-1.5-pro-002_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/gemini-1.5-pro-002_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/gemini-1.5-pro-002_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/gemini-1.5-pro-002_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/gemini-1.5-pro-002_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/cot/cot/gemini-1.5-pro-002_f.json',
    #     # # eight tests for claude-3-5-haiku-20241022
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/claude-3-5-haiku-20241022_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/claude-3-5-haiku-20241022_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/claude-3-5-haiku-20241022_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/claude-3-5-haiku-20241022_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/claude-3-5-haiku-20241022_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/claude-3-5-haiku-20241022_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/claude-3-5-haiku-20241022_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/cot/cot/claude-3-5-haiku-20241022_f.json',
    #     # # eight tests for deepseek-ai/DeepSeek-V3
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/deepseek-ai/DeepSeek-V3_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/deepseek-ai/DeepSeek-V3_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/deepseek-ai/DeepSeek-V3_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/deepseek-ai/DeepSeek-V3_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/deepseek-ai/DeepSeek-V3_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/deepseek-ai/DeepSeek-V3_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/deepseek-ai/DeepSeek-V3_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/cot/cot/deepseek-ai/DeepSeek-V3_f.json',
    #     # # eight tests for deepseek-ai/DeepSeek-R1
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/deepseek-ai/DeepSeek-R1_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/deepseek-ai/DeepSeek-R1_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/deepseek-ai/DeepSeek-R1_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/deepseek-ai/DeepSeek-R1_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/deepseek-ai/DeepSeek-R1_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/deepseek-ai/DeepSeek-R1_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/deepseek-ai/DeepSeek-R1_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/cot/cot/deepseek-ai/DeepSeek-R1_f.json',
    #     # # eight tests for meta-llama/Meta-Llama-3.1-405B-Instruct
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/meta-llama/Meta-Llama-3.1-405B-Instruct_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/meta-llama/Meta-Llama-3.1-405B-Instruct_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/meta-llama/Meta-Llama-3.1-405B-Instruct_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/meta-llama/Meta-Llama-3.1-405B-Instruct_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/meta-llama/Meta-Llama-3.1-405B-Instruct_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/meta-llama/Meta-Llama-3.1-405B-Instruct_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/meta-llama/Meta-Llama-3.1-405B-Instruct_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/cot/cot/meta-llama/Meta-Llama-3.1-405B-Instruct_f.json',
    #     # # eight tests for Qwen/Qwen2.5-72B-Instruct
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/Qwen/Qwen2.5-72B-Instruct_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/Qwen/Qwen2.5-72B-Instruct_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/Qwen/Qwen2.5-72B-Instruct_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/Qwen/Qwen2.5-72B-Instruct_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/Qwen/Qwen2.5-72B-Instruct_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/Qwen/Qwen2.5-72B-Instruct_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/Qwen/Qwen2.5-72B-Instruct_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/cot/cot/Qwen/Qwen2.5-72B-Instruct_f.json',
    #     # eight tests for cot_new
    #     # 'F:/GeoLLM/output/output_result/Task2/cot/cot_new/gpt-3.5-turbo_f_processed.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/cot/cot_new/gpt-4o_f_processed.json',
    #     'F:/GeoLLM/output/output_result/Task2/cot/cot_new/gemini-1.5-pro-002_f_processed.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/cot/cot_new/claude-3-5-haiku-20241022_f_processed.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/cot/cot_new/DeepSeek-V3_f_processed.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/cot/cot_new/DeepSeek-R1_f_processed.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/cot/cot_new/Meta-Llama-3.1-405B-Instruct_f_processed.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/cot/cot_new/Qwen2.5-72B-Instruct_f_processed.json',
    # ]
    # data_path = './data/data.xlsx'
    # calculate_metrics_Factoid(model_results_paths, data_path)

    # Example call (Yes/No evaluation)
    model_results_paths = [
        # # eight tests for gpt-3.5-turbo
        # 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/gpt-3.5-turbo.json',
        # 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/gpt-3.5-turbo.json',
        # 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/gpt-3.5-turbo.json',
        # 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/gpt-3.5-turbo.json',
        # 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/gpt-3.5-turbo.json',
        # 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/gpt-3.5-turbo.json',
        # 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/gpt-3.5-turbo.json',
        # 'F:/GeoLLM/output/output_result/Task2/cot/cot/gpt-3.5-turbo.json',
        # # eight tests for gpt-4o
        # 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/gpt-4o.json',
        # 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/gpt-4o.json',
        # 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/gpt-4o.json',
        # 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/gpt-4o.json',
        # 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/gpt-4o.json',
        # 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/gpt-4o.json',
        # 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/gpt-4o.json',
        # 'F:/GeoLLM/output/output_result/Task2/cot/cot/gpt-4o.json',
        # # eight tests for gemini-1.5-pro-002
        # 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/gemini-1.5-pro-002.json',
        # 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/gemini-1.5-pro-002.json',
        # 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/gemini-1.5-pro-002.json',
        # 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/gemini-1.5-pro-002.json',
        # 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/gemini-1.5-pro-002.json',
        # 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/gemini-1.5-pro-002.json',
        # 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/gemini-1.5-pro-002.json',
        # 'F:/GeoLLM/output/output_result/Task2/cot/cot/gemini-1.5-pro-002.json',
        # # eight tests for claude-3-5-haiku-20241022
        # 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/claude-3-5-haiku-20241022.json',
        # 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/claude-3-5-haiku-20241022.json',
        # 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/claude-3-5-haiku-20241022.json',
        # 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/claude-3-5-haiku-20241022.json',
        # 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/claude-3-5-haiku-20241022.json',
        # 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/claude-3-5-haiku-20241022.json',
        # 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/claude-3-5-haiku-20241022.json',
        # 'F:/GeoLLM/output/output_result/Task2/cot/cot/claude-3-5-haiku-20241022.json',
        # # eight tests for deepseek-ai/DeepSeek-V3
        # 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/deepseek-ai/DeepSeek-V3.json',
        # 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/deepseek-ai/DeepSeek-V3.json',
        # 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/deepseek-ai/DeepSeek-V3.json',
        # 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/deepseek-ai/DeepSeek-V3.json',
        # 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/deepseek-ai/DeepSeek-V3.json',
        # 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/deepseek-ai/DeepSeek-V3.json',
        # 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/deepseek-ai/DeepSeek-V3.json',
        # 'F:/GeoLLM/output/output_result/Task2/cot/cot/deepseek-ai/DeepSeek-V3.json',
        # # eight tests for deepseek-ai/DeepSeek-R1
        # 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/deepseek-ai/DeepSeek-R1.json',
        # 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/deepseek-ai/DeepSeek-R1.json',
        # 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/deepseek-ai/DeepSeek-R1.json',
        # 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/deepseek-ai/DeepSeek-R1.json',
        # 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/deepseek-ai/DeepSeek-R1.json',
        # 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/deepseek-ai/DeepSeek-R1.json',
        # 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/deepseek-ai/DeepSeek-R1.json',
        # # eight tests for meta-llama/Meta-Llama-3.1-405B-Instruct
        # 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/meta-llama/Meta-Llama-3.1-405B-Instruct.json',
        # 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/meta-llama/Meta-Llama-3.1-405B-Instruct.json',
        # 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/meta-llama/Meta-Llama-3.1-405B-Instruct.json',
        # 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/meta-llama/Meta-Llama-3.1-405B-Instruct.json',
        # 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/meta-llama/Meta-Llama-3.1-405B-Instruct.json',
        # 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/meta-llama/Meta-Llama-3.1-405B-Instruct.json',
        # 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/meta-llama/Meta-Llama-3.1-405B-Instruct.json',
        # 'F:/GeoLLM/output/output_result/Task2/cot/cot/meta-llama/Meta-Llama-3.1-405B-Instruct.json',
        # # eight tests for Qwen/Qwen2.5-72B-Instruct
        # 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/Qwen/Qwen2.5-72B-Instruct.json',
        # 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/Qwen/Qwen2.5-72B-Instruct.json',
        # 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/Qwen/Qwen2.5-72B-Instruct.json',
        # 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/Qwen/Qwen2.5-72B-Instruct.json',
        # 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/Qwen/Qwen2.5-72B-Instruct.json',
        # 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/Qwen/Qwen2.5-72B-Instruct.json',
        # 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/Qwen/Qwen2.5-72B-Instruct.json',
        # 'F:/GeoLLM/output/output_result/Task2/cot/cot/Qwen/Qwen2.5-72B-Instruct.json',
        'F:/GeoLLM/Task2/output/nomal1/zero_shot/gpt-3.5-turbo.json',
        'F:/GeoLLM/Task2/output/nomal1/one_shot/gpt-3.5-turbo.json',
        'F:/GeoLLM/Task2/output/nomal1/two_shot/gpt-3.5-turbo.json',
        'F:/GeoLLM/Task2/output/nomal1/three_shot/gpt-3.5-turbo.json',
        'F:/GeoLLM/Task2/output/knn1/one_shot/gpt-3.5-turbo.json',
        'F:/GeoLLM/Task2/output/knn1/two_shot/gpt-3.5-turbo.json',
        'F:/GeoLLM/Task2/output/knn1/three_shot/gpt-3.5-turbo.json',
        'F:/GeoLLM/Task2/output/knn1/three_shot/gpt-3.5-turbo_old2.json',
        'F:/GeoLLM/Task2/output/knn1/three_shot/gpt-3.5-turbo_old1.json',
        'F:/GeoLLM/Task2/output/cot1/cot/gpt-3.5-turbo.json',
        'F:/GeoLLM/Task2/output/cot1/cot/gpt-3.5-turbo_old.json',
    ]
    data_path = './data/data.xlsx'
    calculate_metrics(model_results_paths, data_path)
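
    # A minimal smoke-test sketch of the input format assumed by the evaluators: each result
    # file is a JSON array of records with an 'answer' field aligned row-by-row with the Excel
    # sheet. The file name, question text, and 'Yes' answer below are hypothetical and for
    # illustration only.
    #
    #   import json
    #   records = [{'question': '...', 'answer': 'Yes'}]
    #   with open('dummy_results.json', 'w', encoding='utf-8') as f:
    #       json.dump(records, f, ensure_ascii=False)
    #   calculate_metrics(['dummy_results.json'], data_path='./data/data.xlsx')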