#!/usr/bin/env python3
"""Build one summary table per (model, language, configuration) from three prompt-level Excel score files.

Usage:
    python Gen_llm_eval_output.py --p1 csv_files/llm_scores_p1.xlsx \
        --p2 csv_files/llm_scores_p2.xlsx --p3 csv_files/llm_scores_p3.xlsx \
        --output-dir csv_files/outputs
"""
import argparse
import math
import os
import re

import numpy as np
import pandas as pd

REQUIRED_COLS = ["model", "task", "language", "configuration", "prompts", "f1"]


def read_scores(path: str) -> pd.DataFrame:
    """Read one Excel file of prompt-level scores and return the required columns with numeric f1."""
    df = pd.read_excel(path)
    # Normalize column names (strip whitespace, lowercase) and accept "prompt" as an alias for "prompts".
    df.columns = [c.strip().lower() for c in df.columns]
    if "prompts" not in df.columns and "prompt" in df.columns:
        df["prompts"] = df["prompt"]
    missing = [c for c in REQUIRED_COLS if c not in df.columns]
    if missing:
        raise ValueError(f"{path} is missing required columns: {missing}")
    # Keep only the required columns, coerce f1 to numeric, and drop rows without a usable score.
    df = df[REQUIRED_COLS].copy()
    df["f1"] = pd.to_numeric(df["f1"], errors="coerce")
    df = df.dropna(subset=["f1"])
    return df


def sanitize_filename(s: str) -> str:
    """Replace any run of characters outside [0-9A-Za-z._-+] with an underscore so the value is filename-safe."""
    return re.sub(r"[^0-9A-Za-z._\-+]+", "_", str(s).strip())


def format_float(x) -> str:
    """Render a score with four decimals; None, NaN, and inf become the literal string 'nan'."""
    if x is None or (isinstance(x, float) and (math.isnan(x) or math.isinf(x))):
        return "nan"
    return f"{x:.4f}"


def prompt_order_key(label: str):
    """Sort key: by the number embedded in labels like 'prompt-2' when present; otherwise fall back to the string."""
    m = re.search(r"(\d+)", str(label))
    return (0, int(m.group(1))) if m else (1, str(label))
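# Illustrative check of prompt_order_key (these labels are assumptions for the example, not values
# taken from the score files):
#     sorted(["prompt-10", "prompt-2", "baseline"], key=prompt_order_key)
# returns ["prompt-2", "prompt-10", "baseline"]: labels containing a number sort by that number,
# and labels without one follow in plain string order.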
def render_group_table(g: pd.DataFrame, model: str, language: str, configuration: str) -> str:
    """Render one markdown-style results table for a (model, language, configuration) group."""
    # Collect all prompt-level f1 values (across tasks and prompts) for the overall mean / standard error.
    prompt_values = g["f1"].to_numpy(dtype=float)
    if prompt_values.size > 0:
        gen_value = float(np.mean(prompt_values))
        gen_stderr = (
            float(np.std(prompt_values, ddof=1) / math.sqrt(len(prompt_values)))
            if len(prompt_values) > 1
            else 0.0
        )
    else:
        gen_value, gen_stderr = float("nan"), 0.0

    # Report the few-shot count as a bare number in the header ("0shot" -> "0", "10shot" -> "10").
    if configuration == "0shot":
        configuration = "0"
    if configuration == "10shot":
        configuration = "10"
    # Convert on-disk model ids like "org__name" back to the Hugging Face form "org/name";
    # ids without the "__" separator are left unchanged instead of raising an IndexError.
    parts = model.split("__", 1)
    if len(parts) == 2:
        model = parts[0] + "/" + parts[1]

    header = f"hf (pretrained={model} ), num_fewshot: {configuration}, batch_size: 1"
    lines = [
        "|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|",
        "|-------|-------|------|------|------|----|------|---|------|",
        # Optional overall row, kept for reference:
        # f"|Gen | | | |f1 | |{format_float(gen_value)} |---| {format_float(gen_stderr)} |",
    ]

    # For each task, add the task row (mean over prompts), then the prompt-level rows
    # sorted by prompt number if available.
    for task, df_task in g.groupby("task", sort=False):
        f1s = df_task["f1"].to_numpy(dtype=float)
        task_mean = float(np.mean(f1s)) if f1s.size else float("nan")
        lines.append(f"| - {task.upper()} | | | |f1 | | {format_float(task_mean)} | |0 |")
        df_task = df_task.copy()
        df_task["_order"] = df_task["prompts"].map(prompt_order_key)
        df_task = df_task.sort_values("_order")
        for _, r in df_task.iterrows():
            prompt_label = str(r["prompts"])
            lines.append(f"| - {prompt_label} | | | |f1 | | {format_float(r['f1'])} | | 0 |")

    return header + "\n" + "\n".join(lines) + "\n"


def main():
    ap = argparse.ArgumentParser(
        description="Build per-(model, language, configuration) summaries from three prompt Excel files."
    )
    ap.add_argument("--p1", required=True, help="Path to llm_scores_p1.xlsx")
    ap.add_argument("--p2", required=True, help="Path to llm_scores_p2.xlsx")
    ap.add_argument("--p3", required=True, help="Path to llm_scores_p3.xlsx")
    ap.add_argument("--output-dir", required=True, help="Directory to write output files")
    args = ap.parse_args()

    os.makedirs(args.output_dir, exist_ok=True)

    df = pd.concat(
        [read_scores(args.p1), read_scores(args.p2), read_scores(args.p3)],
        ignore_index=True,
    )

    # Write one file per (model, language, configuration) group.
    for (model, language, config), g in df.groupby(["model", "language", "configuration"], sort=False):
        content = render_group_table(g, model, language, config)
        fname = f"{sanitize_filename(model)}__{sanitize_filename(language)}__{sanitize_filename(config)}.txt"
        out_path = os.path.join(args.output_dir, fname)
        with open(out_path, "w", encoding="utf-8") as f:
            f.write(content)


if __name__ == "__main__":
    main()
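# Illustrative output (the row below is an assumed toy input, not a real score): a single row with
# model="org__demo-7b", task="ner", language="en", configuration="0shot", prompts="prompt-1", f1=0.8123
# would be written to <output-dir>/org__demo-7b__en__0shot.txt roughly as:
#
#   hf (pretrained=org/demo-7b ), num_fewshot: 0, batch_size: 1
#   |Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
#   |-------|-------|------|------|------|----|------|---|------|
#   | - NER | | | |f1 | | 0.8123 | |0 |
#   | - prompt-1 | | | |f1 | | 0.8123 | | 0 |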