Commit a69553b
Parent: 79aba72
support hf leaderboard format and my format
src/leaderboard/read_evals.py CHANGED
@@ -75,7 +75,6 @@ class EvalResult:
         tasks = ORIGINAL_TASKS
         for task in tasks:
             benchmark, metric = task
-            metric = metric + ',all'
 
             # We skip old mmlu entries
             wrong_mmlu_version = False
@@ -92,12 +91,21 @@ class EvalResult:
                 if math.isnan(float(data["results"]["harness|truthfulqa:mc|0"][metric])):
                     results[benchmark] = 0.0
                     continue
-
+
+            def get_metric(v):
+                res = v.get(metric, None)
+                if res is None:
+                    res = v.get(metric + ',all', None)
+                if res is None:
+                    res = v.get(metric + ',None', None)
+                if res is None:
+                    res = v.get('main_score', None)
+                return res
+
             # We average all scores of a given metric (mostly for mmlu)
-            accs = np.array([v.get(metric, None) for k, v in data["results"].items() if benchmark in k])
+            accs = np.array([get_metric(v) for k, v in data["results"].items() if benchmark in k])
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
-
 
             mean_acc = np.mean(accs) * 100.0
             results[benchmark] = mean_acc
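The old code unconditionally appended ',all' to every metric name, which only matches keys written by the HF leaderboard harness. The new get_metric helper instead tries each known key format in turn and falls back to 'main_score', so result files in either format resolve. A minimal standalone sketch of that lookup, with metric passed as a parameter rather than captured from the enclosing loop; the sample result dicts are hypothetical, for illustration only:

# Standalone sketch of the fallback lookup added in this commit; the
# sample result dicts below are hypothetical, for illustration only.

def get_metric(v, metric):
    res = v.get(metric, None)                # plain key ("my format")
    if res is None:
        res = v.get(metric + ',all', None)   # HF leaderboard key
    if res is None:
        res = v.get(metric + ',None', None)  # newer harness key
    if res is None:
        res = v.get('main_score', None)      # generic fallback
    return res

print(get_metric({"acc,all": 0.61}, "acc"))     # 0.61 (HF leaderboard format)
print(get_metric({"acc": 0.58}, "acc"))         # 0.58 (plain format)
print(get_metric({"main_score": 0.70}, "acc"))  # 0.7  (fallback key)
print(get_metric({}, "acc"))                    # None (entry skipped later)

Returning None instead of raising lets the caller's existing `any([acc is None for acc in accs])` check quietly skip entries whose result file lacks the metric under any of these keys.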