# Source: Hugging Face Space "Ciallo0d00/GeoLLM" (revision badcf3c).
# Space status at capture time: Runtime error. File size: 25,187 bytes.

import pandas as pd
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
import os
from bert_score import score as score_bert
from nltk.translate.meteor_score import meteor_score
from nltk.translate.meteor_score import single_meteor_score
import nltk
import jieba
from collections import Counter

# Download the required NLTK data
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('omw-1.4')  # adds multilingual WordNet support

def calculate_metrics(model_results_paths, data_path='./data/data.xlsx',
                      save_path=r'F:\GeoLLM\output\output_result\Task2'):
    """Evaluate Yes/No predictions of one or more models against the gold labels.

    For every result file, the ``answer`` column is binarised (``'Yes'`` -> 1,
    anything else -> 0) and compared with the ``Answer`` column of the
    ``'Yes or No Train'`` sheet. Accuracy, recall, precision, F1 and AUROC are
    printed and appended to ``results_yes_or_no.txt`` inside ``save_path``.

    Args:
        model_results_paths: Iterable of paths to JSON files readable by
            ``pandas.read_json`` that contain an ``answer`` column.
        data_path: Excel workbook holding the ``'Yes or No Train'`` sheet.
        save_path: Directory that receives ``results_yes_or_no.txt``. It is
            created if missing. The default is the original hard-coded
            location, now written as a raw string so the backslashes are not
            parsed as (invalid) escape sequences.

    Raises:
        ValueError: Propagated from scikit-learn when the number of
            predictions differs from the number of gold labels.
    """
    true_labels = pd.read_excel(data_path, sheet_name='Yes or No Train')
    # The gold labels are identical for every model, so binarise them once.
    # NOTE(review): the comparison is an exact match on 'Yes'; answers such as
    # 'yes' or 'Yes.' are counted as negatives - confirm this is intended.
    true = true_labels['Answer'].apply(lambda x: 1 if x == 'Yes' else 0)

    results_file_path = os.path.join(save_path, 'results_yes_or_no.txt')

    for model_results_path in model_results_paths:
        model_results = pd.read_json(model_results_path)
        predicted = model_results['answer'].apply(lambda x: 1 if x == 'Yes' else 0)

        accuracy = accuracy_score(true, predicted)
        recall = recall_score(true, predicted)
        precision = precision_score(true, predicted)
        f1 = f1_score(true, predicted)

        # AUROC is computed on the hard 0/1 predictions because no
        # probabilities are available. It is undefined when the gold labels
        # contain a single class; report None instead of aborting the run,
        # as calculate_metrics_f does.
        try:
            auroc = roc_auc_score(true, predicted)
        except ValueError:
            auroc = None
        auroc_text = 'None' if auroc is None else f'{auroc:.4f}'

        results = (
            f'模型 {model_results_path} 在数据集上的各项指标如下：\n'
            f'Accuracy: {accuracy:.4f}\n'
            f'Recall: {recall:.4f}\n'
            f'Precision: {precision:.4f}\n'
            f'F1 Score: {f1:.4f}\n'
            f'AUROC: {auroc_text}\n'
            '---\n'
        )

        # Create the output directory on demand so the append below does not
        # fail after all metrics have already been computed.
        os.makedirs(save_path, exist_ok=True)
        with open(results_file_path, 'a', encoding='utf-8') as f:  # append mode
            f.write(results)
        print(results)  # echo to the console

def evaluate_fill_in_the_blank(predicted, true):
    """Return True when every character of *true* occurs in *predicted*.

    Missing values (as detected by ``pandas.isna``) are treated as empty
    strings; all other values are converted with ``str``. The check is
    character-based and ignores order, and an empty reference always
    yields True.
    """
    answer_text = "" if pd.isna(predicted) else str(predicted)
    expected_text = "" if pd.isna(true) else str(true)

    # Fail on the first reference character that the answer lacks.
    for ch in expected_text:
        if ch not in answer_text:
            return False
    return True

def calculate_metrics_f(model_results_paths, data_path='./data/data.xlsx'):
    """Score fill-in-the-blank answers with character-containment matching.

    Each prediction from a result file is paired positionally with the
    ``Answer`` column of the ``'Factoid Train'`` sheet. A prediction counts as
    correct when ``evaluate_fill_in_the_blank`` accepts it, i.e. every
    character of the reference occurs in the prediction. The resulting 0/1
    vector is scored against an all-ones target and the metrics are printed.

    Args:
        model_results_paths: Iterable of paths to JSON files readable by
            ``pandas.read_json`` that contain an ``answer`` column.
        data_path: Excel workbook holding the ``'Factoid Train'`` sheet.
    """
    true_labels = pd.read_excel(data_path, sheet_name='Factoid Train')
    reference_answers = true_labels['Answer']

    for model_results_path in model_results_paths:
        model_results = pd.read_json(model_results_path)
        model_answers = model_results['answer']

        # zip() below stops at the shorter sequence; make that visible instead
        # of silently evaluating only a prefix of the data.
        if len(model_answers) != len(reference_answers):
            print(
                f'警告：{model_results_path} 的预测数量({len(model_answers)})'
                f'与参考答案数量({len(reference_answers)})不一致，'
                f'仅评估前 {min(len(model_answers), len(reference_answers))} 条。'
            )

        # Binary labels: 1 = correct, 0 = incorrect.
        y_true = []
        y_pred = []
        for pred, tru in zip(model_answers, reference_answers):
            # Every sample is expected to be answered correctly, so the target
            # is always 1. Adjust here if the data ever contains samples that
            # should be rejected.
            y_true.append(1)
            # Reuse the shared helper rather than duplicating its NaN handling
            # and character-containment check.
            y_pred.append(1 if evaluate_fill_in_the_blank(pred, tru) else 0)

        # AUROC needs both classes in y_true; with the all-ones target above it
        # is currently always reported as None.
        if len(set(y_true)) < 2:
            auroc = None
        else:
            try:
                auroc = roc_auc_score(y_true, y_pred)
            except ValueError:
                auroc = None
        auroc_text = 'None' if auroc is None else f'{auroc:.4f}'

        # zero_division=0 keeps the value scikit-learn would return anyway
        # when a metric is undefined, but without the warning.
        accuracy = accuracy_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred, zero_division=0)
        precision = precision_score(y_true, y_pred, zero_division=0)
        f1 = f1_score(y_true, y_pred, zero_division=0)

        results = (
            f'模型 {model_results_path} 评估结果：\n'
            f'正确数/总数: {sum(y_pred)}/{len(y_pred)}\n'
            f'Accuracy: {accuracy:.4f}\n'
            f'Recall: {recall:.4f}\n'
            f'Precision: {precision:.4f}\n'
            f'F1 Score: {f1:.4f}\n'
            f'AUROC: {auroc_text}\n'
            '---\n'
        )

        print(results)  # echo to the console

        # Saving to results_f.txt next to the model output is disabled:
        # output_dir = os.path.dirname(model_results_path)  # directory of the result file
        # results_file_path = os.path.join(output_dir, 'results_f.txt')

        # with open(results_file_path, 'a', encoding='utf-8') as f:  # append mode
        #     f.write(results)

def _mean_or_zero(values):
    """Return the arithmetic mean of *values*, or 0 for an empty sequence."""
    return sum(values) / len(values) if values else 0


def calculate_metrics_Factoid(model_results_paths, data_path='./data/data.xlsx'):
    """Report BERTScore and METEOR-style metrics for factoid answers.

    Predictions from each result file are paired positionally with the
    ``Answer`` column of the ``'Factoid Train'`` sheet. BERTScore is computed
    on the raw strings; METEOR and the token-overlap precision/recall are
    computed on jieba tokens. The averaged metrics and the share of empty
    predictions are printed per model.

    Args:
        model_results_paths: Iterable of paths to JSON files readable by
            ``pandas.read_json`` that contain an ``answer`` column.
        data_path: Excel workbook holding the ``'Factoid Train'`` sheet.
    """
    true_labels = pd.read_excel(data_path, sheet_name='Factoid Train')
    reference_answers = true_labels['Answer']

    # METEOR parameters. They are passed to single_meteor_score explicitly so
    # the values printed in the report are the ones actually used.
    ALPHA = 0.9  # precision weight in the harmonic mean
    BETA = 3.0   # exponent of the fragmentation penalty
    GAMMA = 0.5  # weight of the fragmentation penalty

    for model_results_path in model_results_paths:
        model_results = pd.read_json(model_results_path)
        model_answers = model_results['answer']

        # zip() below stops at the shorter sequence; make that visible instead
        # of silently evaluating only a prefix of the data.
        if len(model_answers) != len(reference_answers):
            print(
                f'警告：{model_results_path} 的预测数量({len(model_answers)})'
                f'与参考答案数量({len(reference_answers)})不一致，'
                f'仅评估前 {min(len(model_answers), len(reference_answers))} 条。'
            )

        # Normalise answers: missing values become "", everything else is
        # stringified and stripped. Tokenisation happens once, in the METEOR
        # loop below.
        predictions = []
        references = []
        for pred, ref in zip(model_answers, reference_answers):
            predictions.append(str(pred).strip() if not pd.isna(pred) else "")
            references.append(str(ref).strip() if not pd.isna(ref) else "")

        # 1. BERTScore on the raw strings.
        P, R, F1 = score_bert(predictions, references, lang='zh', verbose=False)
        bert_precision = P.mean().item()
        bert_recall = R.mean().item()
        bert_f1 = F1.mean().item()

        # 2. METEOR and related per-sample quantities.
        meteor_scores = []
        meteor_precision_scores = []
        meteor_recall_scores = []
        meteor_penalty_scores = []
        weighted_harmonic_means = []
        empty_pred = []

        for pred, ref in zip(predictions, references):
            if not pred:
                empty_pred.append(pred)
                print(f"警告：模型发现一个空的预测对。原始预测是：{pred}，参考是：{ref}")

            # Tokenise with jieba and drop whitespace-only tokens.
            pred_tokens = [token for token in jieba.cut(pred) if token.strip()]
            ref_tokens = [token for token in jieba.cut(ref) if token.strip()]

            meteor = single_meteor_score(
                ref_tokens, pred_tokens, alpha=ALPHA, beta=BETA, gamma=GAMMA
            )

            # Multiset intersection, so a repeated token is only matched as
            # often as it occurs on both sides.
            matched_count = sum((Counter(pred_tokens) & Counter(ref_tokens)).values())
            precision = matched_count / len(pred_tokens) if pred_tokens else 0
            recall = matched_count / len(ref_tokens) if ref_tokens else 0

            # Weighted harmonic mean of precision and recall (METEOR Fmean).
            if precision > 0 and recall > 0:
                weighted_harmonic_mean = (precision * recall) / (ALPHA * precision + (1 - ALPHA) * recall)
            else:
                weighted_harmonic_mean = 0

            # Back out the penalty from METEOR = Fmean * (1 - penalty); a
            # sample without any overlap is assigned the full penalty of 1.
            if weighted_harmonic_mean != 0:
                meteor_penalty_score = 1 - (meteor / weighted_harmonic_mean)
            else:
                meteor_penalty_score = 1

            meteor_penalty_scores.append(meteor_penalty_score)
            weighted_harmonic_means.append(weighted_harmonic_mean)
            meteor_precision_scores.append(precision)
            meteor_recall_scores.append(recall)
            meteor_scores.append(meteor)

        # Averages over all samples (0 when there are none).
        avg_meteor_precision = _mean_or_zero(meteor_precision_scores)
        avg_meteor_recall = _mean_or_zero(meteor_recall_scores)
        ave_Fmean = _mean_or_zero(weighted_harmonic_means)
        avg_meteor_penalty = _mean_or_zero(meteor_penalty_scores)
        avg_meteor = _mean_or_zero(meteor_scores)
        # Share of empty predictions, guarded like the averages above.
        empty_pred_rate = len(empty_pred) / len(predictions) if predictions else 0

        results = (
            f'模型 {model_results_path} 评估结果：\n'
            f'\nBERT Score 评估结果：\n'
            f'BERT Precision: {bert_precision:.4f}\n'
            f'BERT Recall: {bert_recall:.4f}\n'
            f'BERT F1: {bert_f1:.4f}\n'
            f'\nMETEOR Score 评估结果：\n'
            f'METEOR Precision: {avg_meteor_precision:.4f}\n'
            f'METEOR Recall: {avg_meteor_recall:.4f}\n'
            f'METEOR Fmean: {ave_Fmean:.4f}\n'
            f'METEOR Penalty (Gamma={GAMMA:.1f},β={BETA:.1f}): {avg_meteor_penalty:.4f}\n'
            f'METEOR Score: {avg_meteor:.4f}\n'
            f'空预测率: {empty_pred_rate:.4f}\n'
            '---\n'
        )
        # Saving to results_factoid.txt is disabled:
        # save_path = 'F:\GeoLLM\output\output_result\Task2'
        # results_file_path = os.path.join(save_path, 'results_factoid.txt')
        # with open(results_file_path, 'a', encoding='utf-8') as f:  # append mode
        #     f.write(results)
        print(results)  # echo to the console

if __name__ == '__main__':
    # Script entry point: evaluates the Yes/No result files listed in
    # model_results_paths with calculate_metrics(). The commented-out blocks
    # are alternative file lists kept for reference: first the "_f" result
    # files used with calculate_metrics_Factoid(), then other Yes/No result
    # files that can be enabled by uncommenting them.
    # # Example invocation
    # model_results_paths = [
    #     # # Eight test runs for gpt-3.5-turbo
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/gpt-3.5-turbo_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/gpt-3.5-turbo_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/gpt-3.5-turbo_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/gpt-3.5-turbo_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/gpt-3.5-turbo_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/gpt-3.5-turbo_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/gpt-3.5-turbo_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/cot/cot/gpt-3.5-turbo_f.json',
    #     # # Eight test runs for gpt-4o
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/gpt-4o_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/gpt-4o_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/gpt-4o_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/gpt-4o_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/gpt-4o_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/gpt-4o_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/gpt-4o_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/cot/cot/gpt-4o_f.json',
    #     # # Eight test runs for gemini-1.5-pro-002
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/gemini-1.5-pro-002_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/gemini-1.5-pro-002_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/gemini-1.5-pro-002_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/gemini-1.5-pro-002_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/gemini-1.5-pro-002_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/gemini-1.5-pro-002_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/gemini-1.5-pro-002_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/cot/cot/gemini-1.5-pro-002_f.json',
    #     # # Eight test runs for claude-3-5-haiku-20241022
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/claude-3-5-haiku-20241022_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/claude-3-5-haiku-20241022_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/claude-3-5-haiku-20241022_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/claude-3-5-haiku-20241022_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/claude-3-5-haiku-20241022_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/claude-3-5-haiku-20241022_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/claude-3-5-haiku-20241022_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/cot/cot/claude-3-5-haiku-20241022_f.json',
    #     # # Eight test runs for deepseek-ai/DeepSeek-V3
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/deepseek-ai/DeepSeek-V3_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/deepseek-ai/DeepSeek-V3_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/deepseek-ai/DeepSeek-V3_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/deepseek-ai/DeepSeek-V3_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/deepseek-ai/DeepSeek-V3_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/deepseek-ai/DeepSeek-V3_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/deepseek-ai/DeepSeek-V3_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/cot/cot/deepseek-ai/DeepSeek-V3_f.json',
    #     # # Eight test runs for deepseek-ai/DeepSeek-R1
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/deepseek-ai/DeepSeek-R1_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/deepseek-ai/DeepSeek-R1_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/deepseek-ai/DeepSeek-R1_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/deepseek-ai/DeepSeek-R1_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/deepseek-ai/DeepSeek-R1_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/deepseek-ai/DeepSeek-R1_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/deepseek-ai/DeepSeek-R1_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/cot/cot/deepseek-ai/DeepSeek-R1_f.json',
    #     # # Eight test runs for meta-llama/Meta-Llama-3.1-405B-Instruct
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/meta-llama/Meta-Llama-3.1-405B-Instruct_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/meta-llama/Meta-Llama-3.1-405B-Instruct_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/meta-llama/Meta-Llama-3.1-405B-Instruct_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/meta-llama/Meta-Llama-3.1-405B-Instruct_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/meta-llama/Meta-Llama-3.1-405B-Instruct_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/meta-llama/Meta-Llama-3.1-405B-Instruct_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/meta-llama/Meta-Llama-3.1-405B-Instruct_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/cot/cot/meta-llama/Meta-Llama-3.1-405B-Instruct_f.json',
    #     # # Eight test runs for Qwen/Qwen2.5-72B-Instruct
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/Qwen/Qwen2.5-72B-Instruct_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/Qwen/Qwen2.5-72B-Instruct_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/Qwen/Qwen2.5-72B-Instruct_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/Qwen/Qwen2.5-72B-Instruct_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/Qwen/Qwen2.5-72B-Instruct_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/Qwen/Qwen2.5-72B-Instruct_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/Qwen/Qwen2.5-72B-Instruct_f.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/cot/cot/Qwen/Qwen2.5-72B-Instruct_f.json',
    #     # Eight test runs for cot_new
    #     # 'F:/GeoLLM/output/output_result/Task2/cot/cot_new/gpt-3.5-turbo_f_processed.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/cot/cot_new/gpt-4o_f_processed.json',
    #     'F:/GeoLLM/output/output_result/Task2/cot/cot_new/gemini-1.5-pro-002_f_processed.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/cot/cot_new/claude-3-5-haiku-20241022_f_processed.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/cot/cot_new/DeepSeek-V3_f_processed.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/cot/cot_new/DeepSeek-R1_f_processed.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/cot/cot_new/Meta-Llama-3.1-405B-Instruct_f_processed.json',
    #     # 'F:/GeoLLM/output/output_result/Task2/cot/cot_new/Qwen2.5-72B-Instruct_f_processed.json',
    # ]
    # data_path = './data/data.xlsx'
    # calculate_metrics_Factoid(model_results_paths, data_path)
        # Example invocation
    model_results_paths = [
        # # Eight test runs for gpt-3.5-turbo
        # 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/gpt-3.5-turbo.json',
        # 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/gpt-3.5-turbo.json',
        # 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/gpt-3.5-turbo.json',
        # 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/gpt-3.5-turbo.json',
        # 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/gpt-3.5-turbo.json',
        # 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/gpt-3.5-turbo.json',
        # 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/gpt-3.5-turbo.json',
        # 'F:/GeoLLM/output/output_result/Task2/cot/cot/gpt-3.5-turbo.json',
        # # Eight test runs for gpt-4o
        # 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/gpt-4o.json',
        # 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/gpt-4o.json',
        # 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/gpt-4o.json',
        # 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/gpt-4o.json',
        # 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/gpt-4o.json',
        # 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/gpt-4o.json',
        # 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/gpt-4o.json',
        # 'F:/GeoLLM/output/output_result/Task2/cot/cot/gpt-4o.json',
        # # Eight test runs for gemini-1.5-pro-002
        # 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/gemini-1.5-pro-002.json',
        # 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/gemini-1.5-pro-002.json',
        # 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/gemini-1.5-pro-002.json',
        # 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/gemini-1.5-pro-002.json',
        # 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/gemini-1.5-pro-002.json',
        # 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/gemini-1.5-pro-002.json',
        # 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/gemini-1.5-pro-002.json',
        # 'F:/GeoLLM/output/output_result/Task2/cot/cot/gemini-1.5-pro-002.json',
        # # Eight test runs for claude-3-5-haiku-20241022
        # 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/claude-3-5-haiku-20241022.json',
        # 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/claude-3-5-haiku-20241022.json',
        # 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/claude-3-5-haiku-20241022.json',
        # 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/claude-3-5-haiku-20241022.json',
        # 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/claude-3-5-haiku-20241022.json',
        # 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/claude-3-5-haiku-20241022.json',
        # 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/claude-3-5-haiku-20241022.json',
        # 'F:/GeoLLM/output/output_result/Task2/cot/cot/claude-3-5-haiku-20241022.json',
        # # Eight test runs for deepseek-ai/DeepSeek-V3
        # 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/deepseek-ai/DeepSeek-V3.json',
        # 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/deepseek-ai/DeepSeek-V3.json',
        # 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/deepseek-ai/DeepSeek-V3.json',
        # 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/deepseek-ai/DeepSeek-V3.json',
        # 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/deepseek-ai/DeepSeek-V3.json',
        # 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/deepseek-ai/DeepSeek-V3.json',
        # 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/deepseek-ai/DeepSeek-V3.json',
        # 'F:/GeoLLM/output/output_result/Task2/cot/cot/deepseek-ai/DeepSeek-V3.json',
        # # Eight test runs for deepseek-ai/DeepSeek-R1
        # 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/deepseek-ai/DeepSeek-R1.json',
        # 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/deepseek-ai/DeepSeek-R1.json',
        # 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/deepseek-ai/DeepSeek-R1.json',
        # 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/deepseek-ai/DeepSeek-R1.json',
        # 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/deepseek-ai/DeepSeek-R1.json',
        # 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/deepseek-ai/DeepSeek-R1.json',
        # 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/deepseek-ai/DeepSeek-R1.json',
        # # Eight test runs for meta-llama/Meta-Llama-3.1-405B-Instruct
        # 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/meta-llama/Meta-Llama-3.1-405B-Instruct.json',
        # 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/meta-llama/Meta-Llama-3.1-405B-Instruct.json',
        # 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/meta-llama/Meta-Llama-3.1-405B-Instruct.json',
        # 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/meta-llama/Meta-Llama-3.1-405B-Instruct.json',
        # 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/meta-llama/Meta-Llama-3.1-405B-Instruct.json',
        # 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/meta-llama/Meta-Llama-3.1-405B-Instruct.json',
        # 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/meta-llama/Meta-Llama-3.1-405B-Instruct.json',
        # 'F:/GeoLLM/output/output_result/Task2/cot/cot/meta-llama/Meta-Llama-3.1-405B-Instruct.json',
        # # Eight test runs for Qwen/Qwen2.5-72B-Instruct
        # 'F:/GeoLLM/output/output_result/Task2/nomal/zero_shot/Qwen/Qwen2.5-72B-Instruct.json',
        # 'F:/GeoLLM/output/output_result/Task2/nomal/one_shot/Qwen/Qwen2.5-72B-Instruct.json',
        # 'F:/GeoLLM/output/output_result/Task2/nomal/two_shot/Qwen/Qwen2.5-72B-Instruct.json',
        # 'F:/GeoLLM/output/output_result/Task2/nomal/three_shot/Qwen/Qwen2.5-72B-Instruct.json',
        # 'F:/GeoLLM/output/output_result/Task2/knn/one_shot/Qwen/Qwen2.5-72B-Instruct.json',
        # 'F:/GeoLLM/output/output_result/Task2/knn/two_shot/Qwen/Qwen2.5-72B-Instruct.json',
        # 'F:/GeoLLM/output/output_result/Task2/knn/three_shot/Qwen/Qwen2.5-72B-Instruct.json',
        # 'F:/GeoLLM/output/output_result/Task2/cot/cot/Qwen/Qwen2.5-72B-Instruct.json',

        'F:/GeoLLM/Task2/output/nomal1/zero_shot/gpt-3.5-turbo.json',
        'F:/GeoLLM/Task2/output/nomal1/one_shot/gpt-3.5-turbo.json',
        'F:/GeoLLM/Task2/output/nomal1/two_shot/gpt-3.5-turbo.json',
        'F:/GeoLLM/Task2/output/nomal1/three_shot/gpt-3.5-turbo.json',
        'F:/GeoLLM/Task2/output/knn1/one_shot/gpt-3.5-turbo.json',
        'F:/GeoLLM/Task2/output/knn1/two_shot/gpt-3.5-turbo.json',
        'F:/GeoLLM/Task2/output/knn1/three_shot/gpt-3.5-turbo.json',
        'F:/GeoLLM/Task2/output/knn1/three_shot/gpt-3.5-turbo_old2.json',     
        'F:/GeoLLM/Task2/output/knn1/three_shot/gpt-3.5-turbo_old1.json',     
        'F:/GeoLLM/Task2/output/cot1/cot/gpt-3.5-turbo.json',
        'F:/GeoLLM/Task2/output/cot1/cot/gpt-3.5-turbo_old.json',


    ]
    data_path = './data/data.xlsx'
    calculate_metrics(model_results_paths, data_path)