yangzhitao committed
Commit 6b6ce23 · 1 Parent(s): cd1b5e8

refactor: update about.py to comment out task definitions and modify metric handling in read_evals.py to support multiple metrics

Files changed (2)
  1. src/about.py +36 -40
  2. src/leaderboard/read_evals.py +4 -3
src/about.py CHANGED
@@ -1,55 +1,51 @@
-from enum import Enum
 from functools import lru_cache
 from textwrap import dedent
-from typing import Annotated
-
-from pydantic import BaseModel, Field
 
 from src.prepare import load_meta_toml, prepare_space
 
 prepare_space()
 
 
-class _Task(BaseModel):
-    benchmark: Annotated[str, Field(description="The benchmark name")]
-    metric: Annotated[str, Field(description="The metric name")]
-    col_name: Annotated[str, Field(description="The column name")]
+# class _Task(BaseModel):
+#     benchmark: Annotated[str, Field(description="The benchmark name")]
+#     metric: Annotated[str, Field(description="The metric name")]
+#     col_name: Annotated[str, Field(description="The column name")]
 
 
 # Select your tasks here
 # ---------------------------------------------------
-class _Tasks(Enum):
-    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-
-    # acc
-    task1_1 = _Task(benchmark="MindCube", metric="acc", col_name="MindCube(acc)")
-    task2_1 = _Task(benchmark="MMSI", metric="acc", col_name="MMSI(acc)")
-    task3_1 = _Task(benchmark="Omni", metric="acc", col_name="Omni(acc)")
-    task4_1 = _Task(benchmark="Core", metric="acc", col_name="Core(acc)")
-    task5_1 = _Task(benchmark="SpatialViz", metric="acc", col_name="SpatialViz(acc)")
-    task6_1 = _Task(benchmark="STARE", metric="acc", col_name="STARE(acc)")
-    task7_1 = _Task(benchmark="SITEBench", metric="acc", col_name="SITEBench(acc)")
-    task8_1 = _Task(benchmark="VSI (MCQ)", metric="acc", col_name="VSI (MCQ)(acc)")
-
-    # caa
-    task1_2 = _Task(benchmark="MindCube", metric="caa", col_name="MindCube(caa)")
-    task2_2 = _Task(benchmark="MMSI", metric="caa", col_name="MMSI(caa)")
-    task3_2 = _Task(benchmark="Omni", metric="caa", col_name="Omni(caa)")
-    task4_2 = _Task(benchmark="Core", metric="caa", col_name="Core(caa)")
-    task5_2 = _Task(benchmark="SpatialViz", metric="caa", col_name="SpatialViz(caa)")
-    task6_2 = _Task(benchmark="STARE", metric="caa", col_name="STARE(caa)")
-    task7_2 = _Task(benchmark="SITEBench", metric="caa", col_name="SITEBench(caa)")
-    task8_2 = _Task(benchmark="VSI (MCQ)", metric="caa", col_name="VSI (MCQ)(caa)")
-
-    # rand
-    task1_3 = _Task(benchmark="MindCube", metric="rand", col_name="MindCube(rand)")
-    task2_3 = _Task(benchmark="MMSI", metric="rand", col_name="MMSI(rand)")
-    task3_3 = _Task(benchmark="Omni", metric="rand", col_name="Omni(rand)")
-    task4_3 = _Task(benchmark="Core", metric="rand", col_name="Core(rand)")
-    task5_3 = _Task(benchmark="SpatialViz", metric="rand", col_name="SpatialViz(rand)")
-    task6_3 = _Task(benchmark="STARE", metric="rand", col_name="STARE(rand)")
-    task7_3 = _Task(benchmark="SITEBench", metric="rand", col_name="SITEBench(rand)")
-    task8_3 = _Task(benchmark="VSI (MCQ)", metric="rand", col_name="VSI (MCQ)(rand)")
+# class _Tasks(Enum):
+#     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+
+#     # acc
+#     task1_1 = _Task(benchmark="MindCube", metric="acc", col_name="MindCube(acc)")
+#     task2_1 = _Task(benchmark="MMSI", metric="acc", col_name="MMSI(acc)")
+#     task3_1 = _Task(benchmark="Omni", metric="acc", col_name="Omni(acc)")
+#     task4_1 = _Task(benchmark="Core", metric="acc", col_name="Core(acc)")
+#     task5_1 = _Task(benchmark="SpatialViz", metric="acc", col_name="SpatialViz(acc)")
+#     task6_1 = _Task(benchmark="STARE", metric="acc", col_name="STARE(acc)")
+#     task7_1 = _Task(benchmark="SITEBench", metric="acc", col_name="SITEBench(acc)")
+#     task8_1 = _Task(benchmark="VSI (MCQ)", metric="acc", col_name="VSI (MCQ)(acc)")
+
+#     # caa
+#     task1_2 = _Task(benchmark="MindCube", metric="caa", col_name="MindCube(caa)")
+#     task2_2 = _Task(benchmark="MMSI", metric="caa", col_name="MMSI(caa)")
+#     task3_2 = _Task(benchmark="Omni", metric="caa", col_name="Omni(caa)")
+#     task4_2 = _Task(benchmark="Core", metric="caa", col_name="Core(caa)")
+#     task5_2 = _Task(benchmark="SpatialViz", metric="caa", col_name="SpatialViz(caa)")
+#     task6_2 = _Task(benchmark="STARE", metric="caa", col_name="STARE(caa)")
+#     task7_2 = _Task(benchmark="SITEBench", metric="caa", col_name="SITEBench(caa)")
+#     task8_2 = _Task(benchmark="VSI (MCQ)", metric="caa", col_name="VSI (MCQ)(caa)")
+
+#     # rand
+#     task1_3 = _Task(benchmark="MindCube", metric="rand", col_name="MindCube(rand)")
+#     task2_3 = _Task(benchmark="MMSI", metric="rand", col_name="MMSI(rand)")
+#     task3_3 = _Task(benchmark="Omni", metric="rand", col_name="Omni(rand)")
+#     task4_3 = _Task(benchmark="Core", metric="rand", col_name="Core(rand)")
+#     task5_3 = _Task(benchmark="SpatialViz", metric="rand", col_name="SpatialViz(rand)")
+#     task6_3 = _Task(benchmark="STARE", metric="rand", col_name="STARE(rand)")
+#     task7_3 = _Task(benchmark="SITEBench", metric="rand", col_name="SITEBench(rand)")
+#     task8_3 = _Task(benchmark="VSI (MCQ)", metric="rand", col_name="VSI (MCQ)(rand)")
 
 
 # BENCHMARKS = {m.value.benchmark for m in Tasks}
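Note on the removed definitions: the `_Task`/`_Tasks` pair was the only place the per-metric display columns ("MindCube(acc)", "MMSI(caa)", ...) were spelled out. If that mapping is ever needed again without pydantic, a plain-Python sketch like the one below could rebuild it from the benchmark and metric names visible in the old enum. `Task` and `build_tasks` are illustrative names, not code from this repository.

# Hypothetical sketch (not repository code): rebuild the benchmark x metric
# column mapping that the commented-out _Tasks enum used to provide.
from dataclasses import dataclass

@dataclass(frozen=True)
class Task:
    benchmark: str
    metric: str
    col_name: str

BENCHMARK_NAMES = [
    "MindCube", "MMSI", "Omni", "Core",
    "SpatialViz", "STARE", "SITEBench", "VSI (MCQ)",
]
METRIC_NAMES = ["acc", "caa", "rand"]

def build_tasks() -> list[Task]:
    # One display column per (metric, benchmark) pair, e.g. "MindCube(acc)".
    return [
        Task(benchmark=b, metric=m, col_name=f"{b}({m})")
        for m in METRIC_NAMES
        for b in BENCHMARK_NAMES
    ]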
src/leaderboard/read_evals.py CHANGED
@@ -107,12 +107,13 @@ class EvalResult(BaseModel):
         for task in BENCHMARKS:
             # We average all scores of a given metric (not all metrics are present in all files)
             # TODO: support multiple metrics
-            metric_keys = ["caa"]
+            metric_keys = ["caa", "acc"]
             accs = np.array([
-                v.get(metric_key, np.nan)
+                v.get(metric, np.nan)
                 for k, v in data.results.items()
                 if task.key == k
-                for metric_key in metric_keys
+                for metric in metric_keys
+                if metric in v
             ])
             if accs.size == 0 or any(np.isnan(acc) for acc in accs):
                 continue
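The updated comprehension now collects whichever of the listed metrics a result file actually reports and averages them, instead of reading only "caa". Below is a minimal, standalone sketch of that behaviour; the `results` dict, task keys, and scores are made up for illustration and do not reflect the leaderboard's real schema.

import numpy as np

# Illustrative results payload: task key -> {metric name: score}.
results = {
    "mindcube": {"acc": 0.41, "caa": 0.38},
    "mmsi": {"acc": 0.52},  # no "caa" entry in this file
}

metric_keys = ["caa", "acc"]

def mean_score(task_key: str) -> float | None:
    # Keep only the metrics actually present for this task,
    # mirroring the `if metric in v` guard added by this commit.
    scores = np.array([
        v.get(metric, np.nan)
        for k, v in results.items()
        if k == task_key
        for metric in metric_keys
        if metric in v
    ])
    if scores.size == 0 or np.isnan(scores).any():
        return None
    return float(scores.mean())

print(mean_score("mindcube"))  # mean of caa and acc, roughly 0.395
print(mean_score("mmsi"))      # only acc is present, so 0.52

With the `if metric in v` guard in place, the `np.nan` default in `v.get` is effectively unreachable, so the subsequent NaN check mainly guards against NaN scores stored in the result files themselves.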