yangzhitao committed · Commit 6b6ce23 · Parent(s): cd1b5e8
refactor: update about.py to comment out task definitions and modify metric handling in read_evals.py to support multiple metrics
Files changed:
- src/about.py (+36 −40)
- src/leaderboard/read_evals.py (+4 −3)
src/about.py
CHANGED

@@ -1,55 +1,51 @@
-from enum import Enum
from functools import lru_cache
from textwrap import dedent
-from typing import Annotated
-
-from pydantic import BaseModel, Field

from src.prepare import load_meta_toml, prepare_space

prepare_space()


-class _Task(BaseModel):
-    benchmark: Annotated[str, Field(description="The benchmark name")]
-    metric: Annotated[str, Field(description="The metric name")]
-    col_name: Annotated[str, Field(description="The column name")]
+# class _Task(BaseModel):
+#     benchmark: Annotated[str, Field(description="The benchmark name")]
+#     metric: Annotated[str, Field(description="The metric name")]
+#     col_name: Annotated[str, Field(description="The column name")]


# Select your tasks here
# ---------------------------------------------------
-class _Tasks(Enum):
-    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-
-    # acc
-    task1_1 = _Task(benchmark="MindCube", metric="acc", col_name="MindCube(acc)")
-    task2_1 = _Task(benchmark="MMSI", metric="acc", col_name="MMSI(acc)")
-    task3_1 = _Task(benchmark="Omni", metric="acc", col_name="Omni(acc)")
-    task4_1 = _Task(benchmark="Core", metric="acc", col_name="Core(acc)")
-    task5_1 = _Task(benchmark="SpatialViz", metric="acc", col_name="SpatialViz(acc)")
-    task6_1 = _Task(benchmark="STARE", metric="acc", col_name="STARE(acc)")
-    task7_1 = _Task(benchmark="SITEBench", metric="acc", col_name="SITEBench(acc)")
-    task8_1 = _Task(benchmark="VSI (MCQ)", metric="acc", col_name="VSI (MCQ)(acc)")
-
-    # caa
-    task1_2 = _Task(benchmark="MindCube", metric="caa", col_name="MindCube(caa)")
-    task2_2 = _Task(benchmark="MMSI", metric="caa", col_name="MMSI(caa)")
-    task3_2 = _Task(benchmark="Omni", metric="caa", col_name="Omni(caa)")
-    task4_2 = _Task(benchmark="Core", metric="caa", col_name="Core(caa)")
-    task5_2 = _Task(benchmark="SpatialViz", metric="caa", col_name="SpatialViz(caa)")
-    task6_2 = _Task(benchmark="STARE", metric="caa", col_name="STARE(caa)")
-    task7_2 = _Task(benchmark="SITEBench", metric="caa", col_name="SITEBench(caa)")
-    task8_2 = _Task(benchmark="VSI (MCQ)", metric="caa", col_name="VSI (MCQ)(caa)")
-
-    # rand
-    task1_3 = _Task(benchmark="MindCube", metric="rand", col_name="MindCube(rand)")
-    task2_3 = _Task(benchmark="MMSI", metric="rand", col_name="MMSI(rand)")
-    task3_3 = _Task(benchmark="Omni", metric="rand", col_name="Omni(rand)")
-    task4_3 = _Task(benchmark="Core", metric="rand", col_name="Core(rand)")
-    task5_3 = _Task(benchmark="SpatialViz", metric="rand", col_name="SpatialViz(rand)")
-    task6_3 = _Task(benchmark="STARE", metric="rand", col_name="STARE(rand)")
-    task7_3 = _Task(benchmark="SITEBench", metric="rand", col_name="SITEBench(rand)")
-    task8_3 = _Task(benchmark="VSI (MCQ)", metric="rand", col_name="VSI (MCQ)(rand)")
+# class _Tasks(Enum):
+#     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+
+#     # acc
+#     task1_1 = _Task(benchmark="MindCube", metric="acc", col_name="MindCube(acc)")
+#     task2_1 = _Task(benchmark="MMSI", metric="acc", col_name="MMSI(acc)")
+#     task3_1 = _Task(benchmark="Omni", metric="acc", col_name="Omni(acc)")
+#     task4_1 = _Task(benchmark="Core", metric="acc", col_name="Core(acc)")
+#     task5_1 = _Task(benchmark="SpatialViz", metric="acc", col_name="SpatialViz(acc)")
+#     task6_1 = _Task(benchmark="STARE", metric="acc", col_name="STARE(acc)")
+#     task7_1 = _Task(benchmark="SITEBench", metric="acc", col_name="SITEBench(acc)")
+#     task8_1 = _Task(benchmark="VSI (MCQ)", metric="acc", col_name="VSI (MCQ)(acc)")
+
+#     # caa
+#     task1_2 = _Task(benchmark="MindCube", metric="caa", col_name="MindCube(caa)")
+#     task2_2 = _Task(benchmark="MMSI", metric="caa", col_name="MMSI(caa)")
+#     task3_2 = _Task(benchmark="Omni", metric="caa", col_name="Omni(caa)")
+#     task4_2 = _Task(benchmark="Core", metric="caa", col_name="Core(caa)")
+#     task5_2 = _Task(benchmark="SpatialViz", metric="caa", col_name="SpatialViz(caa)")
+#     task6_2 = _Task(benchmark="STARE", metric="caa", col_name="STARE(caa)")
+#     task7_2 = _Task(benchmark="SITEBench", metric="caa", col_name="SITEBench(caa)")
+#     task8_2 = _Task(benchmark="VSI (MCQ)", metric="caa", col_name="VSI (MCQ)(caa)")
+
+#     # rand
+#     task1_3 = _Task(benchmark="MindCube", metric="rand", col_name="MindCube(rand)")
+#     task2_3 = _Task(benchmark="MMSI", metric="rand", col_name="MMSI(rand)")
+#     task3_3 = _Task(benchmark="Omni", metric="rand", col_name="Omni(rand)")
+#     task4_3 = _Task(benchmark="Core", metric="rand", col_name="Core(rand)")
+#     task5_3 = _Task(benchmark="SpatialViz", metric="rand", col_name="SpatialViz(rand)")
+#     task6_3 = _Task(benchmark="STARE", metric="rand", col_name="STARE(rand)")
+#     task7_3 = _Task(benchmark="SITEBench", metric="rand", col_name="SITEBench(rand)")
+#     task8_3 = _Task(benchmark="VSI (MCQ)", metric="rand", col_name="VSI (MCQ)(rand)")


# BENCHMARKS = {m.value.benchmark for m in Tasks}
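For context, the block commented out above followed the usual leaderboard-template pattern: a small pydantic model describes one (benchmark, metric) column, an Enum lists every column to display, and the set of benchmark names is derived from the enum members (the commented-out BENCHMARKS line). Below is a minimal runnable sketch of that pattern, reconstructed from the commented-out code; the two enum members and the Tasks alias are illustrative assumptions, not the full original list.

from enum import Enum
from typing import Annotated

from pydantic import BaseModel, Field


class _Task(BaseModel):
    benchmark: Annotated[str, Field(description="The benchmark name")]
    metric: Annotated[str, Field(description="The metric name")]
    col_name: Annotated[str, Field(description="The column name")]


class _Tasks(Enum):
    # One member per (benchmark, metric) pair shown as a leaderboard column.
    task1_1 = _Task(benchmark="MindCube", metric="acc", col_name="MindCube(acc)")
    task1_2 = _Task(benchmark="MindCube", metric="caa", col_name="MindCube(caa)")


Tasks = _Tasks  # assumed alias: the commented-out derivation refers to `Tasks`
BENCHMARKS = {m.value.benchmark for m in Tasks}  # {"MindCube"} for the two members above

After this commit the whole block is commented out, so BENCHMARKS presumably comes from elsewhere (for example the metadata loaded via load_meta_toml), since read_evals.py below still iterates over BENCHMARKS.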
src/leaderboard/read_evals.py
CHANGED

@@ -107,12 +107,13 @@ class EvalResult(BaseModel):
for task in BENCHMARKS:
    # We average all scores of a given metric (not all metrics are present in all files)
    # TODO: support multiple metrics
-    metric_keys = ["caa"]
+    metric_keys = ["caa", "acc"]
    accs = np.array([
-        v.get(
+        v.get(metric, np.nan)
        for k, v in data.results.items()
        if task.key == k
-        for
+        for metric in metric_keys
+        if metric in v
    ])
    if accs.size == 0 or any(np.isnan(acc) for acc in accs):
        continue
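The behavioral change is in the nested comprehension: instead of a single hard-coded metric key, the values list is now built from every metric in metric_keys that is actually present in the matching result dict. The `if metric in v` guard skips missing metrics rather than letting the np.nan default through, so the NaN check afterwards only drops an entry when nothing matched or a stored score is itself NaN. Here is a self-contained sketch of that collection logic with toy inputs; the shape of data.results and the task.key value are illustrative assumptions, not taken from this repo.

import numpy as np

# Illustrative stand-ins for the objects used in read_evals.py (assumed shapes).
metric_keys = ["caa", "acc"]
task_key = "MindCube"                          # stands in for task.key
results = {
    "MindCube": {"acc": 0.61, "caa": 0.58},    # both metrics present
    "MMSI": {"acc": 0.40},                     # different task, filtered out
}

# Same nested-comprehension shape as the updated code: match the task first,
# then pull every listed metric that the result dict actually contains.
accs = np.array([
    v.get(metric, np.nan)
    for k, v in results.items()
    if task_key == k
    for metric in metric_keys
    if metric in v
])

print(accs)  # [0.58 0.61] -> caa and acc for MindCube, in metric_keys order
if accs.size == 0 or any(np.isnan(acc) for acc in accs):
    print("entry skipped")  # only if nothing matched or a stored score is NaN
else:
    print(float(accs.mean()))  # ~0.595 -- the kept metric scores are averaged

Note that with the `if metric in v` guard the np.nan default on v.get is effectively unreachable; it appears to be kept simply to mirror the original line.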