Spaces:
Runtime error
Runtime error
| import pandas as pd | |
| from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score | |
| import os | |
| from bert_score import score as score_bert | |
| from nltk.translate.meteor_score import meteor_score | |
| from nltk.translate.meteor_score import single_meteor_score | |
| import nltk | |
| import jieba | |
| from collections import Counter | |
| # 下载必要的NLTK数据 | |
| # nltk.download('punkt') | |
| # nltk.download('wordnet') | |
| # nltk.download('omw-1.4') # 用于支持多语言WordNet | |
def calculate_metrics(model_results_paths, data_path='./data/data.xlsx',
                      save_path=r'F:\GeoLLM\output\output_result\Task2'):
    """Score yes/no predictions of several models against the gold labels.

    For every result file the function computes accuracy, recall, precision,
    F1 and AUROC, appends a formatted report to
    ``<save_path>/results_yes_or_no.txt`` and prints the same report.

    Args:
        model_results_paths: Iterable of paths to JSON files readable by
            ``pandas.read_json`` that contain an ``answer`` column.
        data_path: Excel workbook holding the gold labels in the
            ``Answer`` column of the ``Yes or No Train`` sheet.
        save_path: Directory that receives ``results_yes_or_no.txt``. It is
            created when missing. The default is the original hard-coded
            location, now written as a raw string so the backslashes are
            not parsed as (invalid) escape sequences.

    Raises:
        ValueError: Propagated from scikit-learn, e.g. when the number of
            predictions differs from the number of labels or when the gold
            labels contain a single class (AUROC is undefined then).
    """
    # Read the gold labels and binarize them once; they do not depend on
    # the model being evaluated.
    true_labels = pd.read_excel(data_path, sheet_name='Yes or No Train')
    # NOTE(review): only the exact string 'Yes' counts as positive; answers
    # such as 'yes' or 'Yes.' are scored as negative - confirm this is intended.
    true = true_labels['Answer'].apply(lambda x: 1 if x == 'Yes' else 0)

    # Make sure the report directory exists before doing any expensive work.
    os.makedirs(save_path, exist_ok=True)
    results_file_path = os.path.join(save_path, 'results_yes_or_no.txt')

    for model_results_path in model_results_paths:
        # Load the answers generated by the model.
        model_results = pd.read_json(model_results_path)
        predicted = model_results['answer'].apply(lambda x: 1 if x == 'Yes' else 0)

        # Classification metrics.
        accuracy = accuracy_score(true, predicted)
        recall = recall_score(true, predicted)
        precision = precision_score(true, predicted)
        f1 = f1_score(true, predicted)

        # AUROC is computed from the hard 0/1 predictions because no
        # probability scores are available in the result files.
        predicted_prob = predicted
        auroc = roc_auc_score(true, predicted_prob)

        results = (
            f'模型 {model_results_path} 在数据集上的各项指标如下:\n'
            f'Accuracy: {accuracy:.4f}\n'
            f'Recall: {recall:.4f}\n'
            f'Precision: {precision:.4f}\n'
            f'F1 Score: {f1:.4f}\n'
            f'AUROC: {auroc:.4f}\n'
            '---\n'
        )

        # Append so that reports of successive models/runs accumulate.
        with open(results_file_path, 'a', encoding='utf-8') as f:
            f.write(results)
        print(results)
def evaluate_fill_in_the_blank(predicted, true):
    """Return True when every character of ``true`` occurs in ``predicted``.

    Missing values (as detected by ``pandas.isna``) are treated as empty
    strings; all other values are converted with ``str``. The check is a
    per-character containment test, so character order and repetition are
    ignored, and an empty reference always yields True.
    """
    answer_text = "" if pd.isna(predicted) else str(predicted)
    reference_text = "" if pd.isna(true) else str(true)

    # Fail on the first reference character missing from the answer.
    for char in reference_text:
        if char not in answer_text:
            return False
    return True
def calculate_metrics_f(model_results_paths, data_path='./data/data.xlsx'):
    """Score factoid answers with the character-containment criterion.

    An answer counts as correct when every character of the reference
    answer occurs in the model answer (see ``evaluate_fill_in_the_blank``).
    For every result file the function prints the number of correct
    answers together with accuracy, recall, precision, F1 and AUROC.

    Args:
        model_results_paths: Iterable of paths to JSON files readable by
            ``pandas.read_json`` that contain an ``answer`` column.
        data_path: Excel workbook holding the reference answers in the
            ``Answer`` column of the ``Factoid Train`` sheet.
    """
    # Read the reference answers once; they are shared by all models.
    true_labels = pd.read_excel(data_path, sheet_name='Factoid Train')
    references = true_labels['Answer']

    for model_results_path in model_results_paths:
        # Load the answers generated by the model.
        answers = pd.read_json(model_results_path)['answer']

        # zip() below stops at the shorter sequence, so make a length
        # mismatch visible instead of silently dropping samples.
        if len(answers) != len(references):
            print(
                f'警告:{model_results_path} 的预测数量({len(answers)})'
                f'与参考答案数量({len(references)})不一致,'
                f'仅评估前 {min(len(answers), len(references))} 条。'
            )

        # Predicted label: 1 when the answer satisfies the containment check.
        y_pred = [
            1 if evaluate_fill_in_the_blank(pred, tru) else 0
            for pred, tru in zip(answers, references)
        ]
        # Ground-truth label: every sample is expected to be answered
        # correctly. NOTE: adjust this if the data ever contains samples
        # that should be rejected.
        y_true = [1] * len(y_pred)

        # AUROC needs both classes in y_true; with the all-ones labels above
        # it is therefore reported as None.
        auroc = None
        if len(set(y_true)) >= 2:
            try:
                auroc = roc_auc_score(y_true, y_pred)
            except ValueError:
                auroc = None

        # zero_division=0 keeps the value scikit-learn would return anyway
        # when no positive is predicted, but without emitting a warning.
        accuracy = accuracy_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred, zero_division=0)
        precision = precision_score(y_true, y_pred, zero_division=0)
        f1 = f1_score(y_true, y_pred, zero_division=0)

        auroc_text = 'None' if auroc is None else f'{auroc:.4f}'
        results = (
            f'模型 {model_results_path} 评估结果:\n'
            f'正确数/总数: {sum(y_pred)}/{len(y_pred)}\n'
            f'Accuracy: {accuracy:.4f}\n'
            f'Recall: {recall:.4f}\n'
            f'Precision: {precision:.4f}\n'
            f'F1 Score: {f1:.4f}\n'
            f'AUROC: {auroc_text}\n'
            '---\n'
        )
        print(results)

        # Writing the report to a file is currently disabled:
        # output_dir = os.path.dirname(model_results_path)  # directory of the result file
        # results_file_path = os.path.join(output_dir, 'results_f.txt')
        # with open(results_file_path, 'a', encoding='utf-8') as f:  # append mode
        #     f.write(results)
def calculate_metrics_Factoid(model_results_paths, data_path='./data/data.xlsx'):
    """Score factoid answers with BERTScore and METEOR-style metrics.

    For every result file the function prints:

    * mean BERTScore precision / recall / F1 (``lang='zh'``),
    * mean token-overlap precision and recall on jieba tokens,
    * their mean weighted harmonic mean (METEOR "Fmean"),
    * the mean fragmentation penalty derived from the METEOR score,
    * the mean METEOR score itself, and
    * the share of empty predictions.

    Args:
        model_results_paths: Iterable of paths to JSON files readable by
            ``pandas.read_json`` that contain an ``answer`` column.
        data_path: Excel workbook holding the reference answers in the
            ``Answer`` column of the ``Factoid Train`` sheet.
    """
    # METEOR parameters. They equal NLTK's defaults and are passed to
    # single_meteor_score explicitly so that the values reported below are
    # guaranteed to be the ones actually used.
    ALPHA = 0.9  # relative weight of precision vs. recall
    BETA = 3.0   # shape of the penalty as a function of fragmentation
    GAMMA = 0.5  # relative weight of the fragmentation penalty

    def _mean(values):
        """Return the arithmetic mean of ``values``, or 0 when it is empty."""
        return sum(values) / len(values) if values else 0

    # Read the reference answers once; they are shared by all models.
    true_labels = pd.read_excel(data_path, sheet_name='Factoid Train')

    for model_results_path in model_results_paths:
        # Load the answers generated by the model.
        model_results = pd.read_json(model_results_path)

        # Normalise answers: missing values become "", everything else is
        # converted to a stripped string. Tokenisation happens later, once.
        predictions = []
        references = []
        for pred, ref in zip(model_results['answer'], true_labels['Answer']):
            predictions.append(str(pred).strip() if not pd.isna(pred) else "")
            references.append(str(ref).strip() if not pd.isna(ref) else "")

        # 1. BERTScore (set verbose=True for detailed progress output).
        P, R, F1 = score_bert(predictions, references, lang='zh', verbose=False)
        bert_precision = P.mean().item()
        bert_recall = R.mean().item()
        bert_f1 = F1.mean().item()

        # 2. METEOR score and related per-sample quantities.
        meteor_scores = []
        meteor_precision_scores = []
        meteor_recall_scores = []
        meteor_penalty_scores = []
        weighted_harmonic_means = []
        empty_pred_count = 0

        for pred, ref in zip(predictions, references):
            # Report empty predictions; they are still scored (as 0).
            if not pred:
                empty_pred_count += 1
                print(f"警告:模型发现一个空的预测对。原始预测是:{pred},参考是:{ref}")

            # Tokenise with jieba and drop whitespace-only tokens.
            pred_tokens = [token for token in jieba.cut(pred) if token.strip()]
            ref_tokens = [token for token in jieba.cut(ref) if token.strip()]

            meteor = single_meteor_score(
                ref_tokens, pred_tokens, alpha=ALPHA, beta=BETA, gamma=GAMMA
            )

            # Multiset intersection so that repeated tokens are matched at
            # most as often as they occur on both sides.
            matched_count = sum((Counter(pred_tokens) & Counter(ref_tokens)).values())
            precision = matched_count / len(pred_tokens) if pred_tokens else 0
            recall = matched_count / len(ref_tokens) if ref_tokens else 0

            # Weighted harmonic mean of precision and recall (METEOR Fmean).
            if precision > 0 and recall > 0:
                weighted_harmonic_mean = (
                    (precision * recall) / (ALPHA * precision + (1 - ALPHA) * recall)
                )
            else:
                weighted_harmonic_mean = 0

            # Invert meteor = Fmean * (1 - penalty). Precision/recall here are
            # exact-token overlaps, which may differ from NLTK's internal
            # alignment, so this penalty is an approximation.
            if weighted_harmonic_mean != 0:
                meteor_penalty_score = 1 - (meteor / weighted_harmonic_mean)
            else:
                meteor_penalty_score = 1

            meteor_penalty_scores.append(meteor_penalty_score)
            weighted_harmonic_means.append(weighted_harmonic_mean)
            meteor_precision_scores.append(precision)
            meteor_recall_scores.append(recall)
            meteor_scores.append(meteor)

        # Averages over all samples (0 when there are none).
        avg_meteor_precision = _mean(meteor_precision_scores)
        avg_meteor_recall = _mean(meteor_recall_scores)
        ave_Fmean = _mean(weighted_harmonic_means)
        avg_meteor_penalty = _mean(meteor_penalty_scores)
        avg_meteor = _mean(meteor_scores)

        # Share of empty predictions, guarded like the averages above.
        empty_pred_rate = empty_pred_count / len(predictions) if predictions else 0

        results = (
            f'模型 {model_results_path} 评估结果:\n'
            f'\nBERT Score 评估结果:\n'
            f'BERT Precision: {bert_precision:.4f}\n'
            f'BERT Recall: {bert_recall:.4f}\n'
            f'BERT F1: {bert_f1:.4f}\n'
            f'\nMETEOR Score 评估结果:\n'
            f'METEOR Precision: {avg_meteor_precision:.4f}\n'
            f'METEOR Recall: {avg_meteor_recall:.4f}\n'
            f'METEOR Fmean: {ave_Fmean:.4f}\n'
            f'METEOR Penalty (Gamma={GAMMA:.1f},β={BETA:.1f}): {avg_meteor_penalty:.4f}\n'
            f'METEOR Score: {avg_meteor:.4f}\n'
            f'空预测率: {empty_pred_rate:.4f}\n'
            '---\n'
        )

        # Writing the report to a file is currently disabled:
        # save_path = r'F:\GeoLLM\output\output_result\Task2'
        # results_file_path = os.path.join(save_path, 'results_factoid.txt')
        # with open(results_file_path, 'a', encoding='utf-8') as f:  # append mode
        #     f.write(results)
        print(results)
if __name__ == '__main__':
    # Script entry point: evaluate the yes/no result files listed below.
    #
    # Earlier experiment batches (previously kept here as long commented-out
    # lists) followed this naming scheme under
    # 'F:/GeoLLM/output/output_result/Task2/':
    #   settings: nomal/zero_shot, nomal/one_shot, nomal/two_shot,
    #             nomal/three_shot, knn/one_shot, knn/two_shot,
    #             knn/three_shot, cot/cot
    #   models:   gpt-3.5-turbo, gpt-4o, gemini-1.5-pro-002,
    #             claude-3-5-haiku-20241022, deepseek-ai/DeepSeek-V3,
    #             deepseek-ai/DeepSeek-R1,
    #             meta-llama/Meta-Llama-3.1-405B-Instruct,
    #             Qwen/Qwen2.5-72B-Instruct
    #   yes/no results:  <setting>/<model>.json
    #                    (no cot/cot entry was listed for DeepSeek-R1)
    #   factoid results: <setting>/<model>_f.json, plus
    #                    cot/cot_new/<model without org prefix>_f_processed.json
    # Factoid files were evaluated with
    #   calculate_metrics_Factoid(model_results_paths, data_path)
    # using the same data_path as below.

    # Yes/no result files of gpt-3.5-turbo for the current run.
    model_results_paths = [
        'F:/GeoLLM/Task2/output/nomal1/zero_shot/gpt-3.5-turbo.json',
        'F:/GeoLLM/Task2/output/nomal1/one_shot/gpt-3.5-turbo.json',
        'F:/GeoLLM/Task2/output/nomal1/two_shot/gpt-3.5-turbo.json',
        'F:/GeoLLM/Task2/output/nomal1/three_shot/gpt-3.5-turbo.json',
        'F:/GeoLLM/Task2/output/knn1/one_shot/gpt-3.5-turbo.json',
        'F:/GeoLLM/Task2/output/knn1/two_shot/gpt-3.5-turbo.json',
        'F:/GeoLLM/Task2/output/knn1/three_shot/gpt-3.5-turbo.json',
        'F:/GeoLLM/Task2/output/knn1/three_shot/gpt-3.5-turbo_old2.json',
        'F:/GeoLLM/Task2/output/knn1/three_shot/gpt-3.5-turbo_old1.json',
        'F:/GeoLLM/Task2/output/cot1/cot/gpt-3.5-turbo.json',
        'F:/GeoLLM/Task2/output/cot1/cot/gpt-3.5-turbo_old.json',
    ]

    # Workbook that holds the gold labels.
    data_path = './data/data.xlsx'

    calculate_metrics(model_results_paths, data_path)