Commit a69553b
Parent: 79aba72
support hf leaderboard format and my format
src/leaderboard/read_evals.py CHANGED
@@ -75,7 +75,6 @@ class EvalResult:
         tasks = ORIGINAL_TASKS
         for task in tasks:
             benchmark, metric = task
-            metric = metric + ',all'
 
             # We skip old mmlu entries
             wrong_mmlu_version = False
@@ -92,12 +91,21 @@ class EvalResult:
                 if math.isnan(float(data["results"]["harness|truthfulqa:mc|0"][metric])):
                     results[benchmark] = 0.0
                     continue
-
+
+            def get_metric(v):
+                res = v.get(metric, None)
+                if res is None:
+                    res = v.get(metric + ',all', None)
+                if res is None:
+                    res = v.get(metric + ',None', None)
+                if res is None:
+                    res = v.get('main_score', None)
+                return res
+
             # We average all scores of a given metric (mostly for mmlu)
-            accs = np.array([v.get(metric, None) for k, v in data["results"].items() if benchmark in k])
+            accs = np.array([get_metric(v) for k, v in data["results"].items() if benchmark in k])
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
-
 
             mean_acc = np.mean(accs) * 100.0
             results[benchmark] = mean_acc
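The old code unconditionally appended ',all' to every metric name, which only matches keys written by the HF leaderboard harness. The new get_metric helper instead tries each known key format in turn and falls back to 'main_score', so result files in either format resolve. A minimal standalone sketch of that lookup, with metric passed as a parameter rather than captured from the enclosing loop; the sample result dicts are hypothetical, for illustration only:

# Standalone sketch of the fallback lookup added in this commit; the
# sample result dicts below are hypothetical, for illustration only.

def get_metric(v, metric):
    res = v.get(metric, None)                # plain key ("my format")
    if res is None:
        res = v.get(metric + ',all', None)   # HF leaderboard key
    if res is None:
        res = v.get(metric + ',None', None)  # newer harness key
    if res is None:
        res = v.get('main_score', None)      # generic fallback
    return res

print(get_metric({"acc,all": 0.61}, "acc"))     # 0.61 (HF leaderboard format)
print(get_metric({"acc": 0.58}, "acc"))         # 0.58 (plain format)
print(get_metric({"main_score": 0.70}, "acc"))  # 0.7  (fallback key)
print(get_metric({}, "acc"))                    # None (entry skipped later)

Returning None instead of raising lets the caller's existing `any([acc is None for acc in accs])` check quietly skip entries whose result file lacks the metric under any of these keys.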