yangzhitao committed
Commit c2c3c10 · 1 Parent(s): 79bb0de

feat: integrate display configuration loading and enhance leaderboard data retrieval with versioning support

Files changed (6):
  1. app.py +11 -7
  2. src/about.py +19 -3
  3. src/envs.py +13 -8
  4. src/leaderboard/read_evals.py +53 -32
  5. src/populate.py +14 -6
  6. src/prepare.py +34 -1
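Taken together, the six files thread a single results_version string, read from display.toml, through the leaderboard build: get_leaderboard_df now selects a versioned bench_{version}.json file instead of walking the results tree. Below is a minimal sketch of the resulting call chain inside this Space's codebase; the column lists are placeholders, not the real BASE_COLS/BENCHMARK_COLS from app.py.

# Hypothetical end-to-end flow after this commit (names mirror the diff,
# placeholder column lists stand in for the real ones).
from src.envs import settings
from src.populate import get_leaderboard_df
from src.prepare import load_display_toml

results_version = load_display_toml().version   # read once from display.toml
benchmark_df = get_leaderboard_df(
    settings.EVAL_RESULTS_VERSIONS_DIR,          # .../eval-results/leaderboard/versions
    settings.EVAL_REQUESTS_PATH,
    results_version=results_version,             # picks bench_{results_version}.json
    cols=["Model", "Average"],                   # placeholder display columns
    benchmark_cols=["Average"],                  # placeholder benchmark columns
)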
app.py CHANGED
@@ -37,7 +37,7 @@ from src.display.utils import (
 )
 from src.envs import API, settings
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
-from src.prepare import prepare_space
+from src.prepare import load_display_toml, prepare_space
 from src.submission.submit import add_new_submit
 
 prepare_space()
@@ -282,6 +282,8 @@ def init_leaderboard_tabs(
 
 
 def main():
+    results_version = load_display_toml().version
+
     demo = gr.Blocks(css_paths=[custom_css, backend_status_indicator_css])
     with demo:
         gr.HTML(TITLE)
@@ -293,10 +295,11 @@ def main():
     print("benchmark_cols:", benchmark_cols)
     cols = BASE_COLS + benchmark_cols
     benchmark_df = get_leaderboard_df(
-        settings.EVAL_RESULTS_PATH,
+        settings.EVAL_RESULTS_VERSIONS_DIR,
         settings.EVAL_REQUESTS_PATH,
-        cols,
-        benchmark_cols,
+        results_version=results_version,
+        cols=cols,
+        benchmark_cols=benchmark_cols,
     )
     _leaderboard = init_leaderboard_tabs(benchmark_df, benchmark_cols, NOT_SUPPORTED_COLS)
 
@@ -308,10 +311,11 @@ def main():
         benchmark_cols = [col for col in BENCHMARK_COLS if col.startswith(benchmark.title)]
         cols = BASE_COLS + benchmark_cols
         benchmark_df = get_leaderboard_df(
-            settings.EVAL_RESULTS_PATH,
+            settings.EVAL_RESULTS_VERSIONS_DIR,
            settings.EVAL_REQUESTS_PATH,
-            cols,
-            benchmark_cols,
+            results_version=results_version,
+            cols=cols,
+            benchmark_cols=benchmark_cols,
         )
         _leaderboard = init_leaderboard_tabs(benchmark_df, benchmark_cols, NOT_SUPPORTED_COLS)
 
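Reading the version inside main() is cheap: load_display_toml() is wrapped in lru_cache(maxsize=1) (see src/prepare.py below), so main() and get_benchmarks() share the same parsed DisplayToml instance. A tiny stand-in demo of that caching behaviour (load_config is illustrative, not project code):

from functools import lru_cache

@lru_cache(maxsize=1)
def load_config() -> dict:
    print("parsing config...")   # executes only on the first call
    return {"version": "v1"}     # placeholder payload

a = load_config()  # prints "parsing config..."
b = load_config()  # served from the cache
assert a is b      # same object both times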
src/about.py CHANGED
@@ -1,7 +1,13 @@
+import typing
 from functools import lru_cache
 from textwrap import dedent
 
-from src.prepare import load_meta_toml, prepare_space
+from loguru import logger
+
+from src.prepare import load_display_toml, load_meta_toml, prepare_space
+
+if typing.TYPE_CHECKING:
+    from src.prepare import MetaToml_Benchmark
 
 prepare_space()
 
@@ -52,9 +58,19 @@ prepare_space()
 # METRICS = {m.value.metric for m in Tasks}
 # COL_NAMES = {m.value.col_name for m in Tasks}
 @lru_cache(maxsize=1)
-def get_benchmarks():
+def get_benchmarks() -> list["MetaToml_Benchmark"]:
     meta_toml = load_meta_toml()
-    return [b for b in meta_toml.benchmarks if not b.disabled]
+    display_toml = load_display_toml()
+    benchmarks_map = {b.key: b for b in meta_toml.benchmarks if not b.disabled}
+    benchmarks = []
+    # Sort benchmarks by display order
+    for key in display_toml.benchmarks_order:
+        b = benchmarks_map.pop(key, None)
+        if b is not None:
+            benchmarks.append(b)
+    benchmarks.extend(benchmarks_map.values())
+    logger.info(f"Loaded {len(benchmarks)} benchmarks: titles={[b.title for b in benchmarks]!r}")
+    return benchmarks
 
 
 NUM_FEWSHOT = 0 # Change with your few shot
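The reordering in get_benchmarks is the behavioural change here: benchmarks whose key appears in display.toml's benchmarks_order come first, in that order, and anything not listed keeps its original position at the end. A standalone sketch of that rule, with illustrative keys and titles:

# Same ordering rule as get_benchmarks, applied to a plain dict.
def order_by_preference(items: dict[str, str], preferred: list[str]) -> list[str]:
    remaining = dict(items)              # don't mutate the caller's mapping
    ordered = [remaining.pop(k) for k in preferred if k in remaining]
    ordered.extend(remaining.values())   # leftovers keep insertion order
    return ordered

print(order_by_preference(
    {"site": "SITE", "vsi_bench": "VSI-Bench", "new_bench": "New"},
    ["vsi_bench", "mmsi_bench", "site"],
))
# ['VSI-Bench', 'SITE', 'New']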
src/envs.py CHANGED
@@ -54,23 +54,28 @@ class Settings(BaseSettings):
 
     @computed_field
     @cached_property
-    def EVAL_REQUESTS_PATH(self) -> str:
-        return (self.HF_HOME / "eval-queue").as_posix()
+    def EVAL_REQUESTS_PATH(self) -> Path:
+        return self.HF_HOME / "eval-queue"
 
     @computed_field
     @cached_property
-    def EVAL_RESULTS_PATH(self) -> str:
-        return (self.HF_HOME / "eval-results").as_posix()
+    def EVAL_RESULTS_PATH(self) -> Path:
+        return self.HF_HOME / "eval-results"
 
     @computed_field
     @cached_property
-    def EVAL_REQUESTS_PATH_BACKUP(self) -> str:
-        return (self.HF_HOME / "eval-queue-bk").as_posix()
+    def EVAL_RESULTS_VERSIONS_DIR(self) -> Path:
+        return self.EVAL_RESULTS_PATH / "leaderboard/versions"
 
     @computed_field
     @cached_property
-    def EVAL_RESULTS_PATH_BACKUP(self) -> str:
-        return (self.HF_HOME / "eval-results-bk").as_posix()
+    def EVAL_REQUESTS_PATH_BACKUP(self) -> Path:
+        return self.HF_HOME / "eval-queue-bk"
+
+    @computed_field
+    @cached_property
+    def EVAL_RESULTS_PATH_BACKUP(self) -> Path:
+        return self.HF_HOME / "eval-results-bk"
 
     ENABLE_BENCHMARK_TABS: bool = False
     ENABLE_SUBMISSION: bool = False
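The settings now return Path objects, and EVAL_RESULTS_VERSIONS_DIR is derived from EVAL_RESULTS_PATH. Assuming HF_HOME resolves to /data (the real value comes from the Space's environment), the directories line up as follows; /data and the "v1" version are assumptions, only the relative layout comes from the diff.

from pathlib import Path

hf_home = Path("/data")                                     # assumed HF_HOME
eval_results_path = hf_home / "eval-results"                # EVAL_RESULTS_PATH
versions_dir = eval_results_path / "leaderboard/versions"   # EVAL_RESULTS_VERSIONS_DIR

print(versions_dir)                     # /data/eval-results/leaderboard/versions
print(versions_dir / "bench_v1.json")   # file get_raw_eval_results would load for version "v1"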
src/leaderboard/read_evals.py CHANGED
@@ -10,9 +10,10 @@ import warnings
 from pathlib import Path
 from typing import Annotated, Any
 
-import dateutil.parser
 import numpy as np
+from loguru import logger
 from pydantic import BaseModel, ConfigDict, Field, computed_field
+from pydantic_core import from_json
 from typing_extensions import Self
 
 from src.about import get_benchmarks
@@ -39,6 +40,7 @@ class EvalResultJson_Config(BaseModel):
     model_config: ConfigDict = ConfigDict(extra="allow", frozen=True)
 
     model_name: Annotated[str, Field(..., description="The model name. e.g. Qwen/Qwen2.5-3B")]
+    model_key: Annotated[str, Field(..., description="The model key. e.g. 'qwen2.5_3b'")]
     model_dtype: Annotated[str | None, Field(description="The model precision. e.g. torch.bfloat16")] = None
     model_sha: Annotated[str, Field(description="The model sha. e.g. 3aab1f1954e9cc14eb9509a215f9e5ca08227a9b")] = ""
     model_args: Annotated[str | None, Field(description="The model args.")] = None
@@ -47,8 +49,20 @@ class EvalResultJson_Config(BaseModel):
 class EvalResult(BaseModel):
     """Represents one full evaluation. Built from a combination of the result and request file for a given run."""
 
-    eval_name: str  # org_model_precision (uid)
-    full_model: str  # org/model (path on hub)
+    eval_name: Annotated[
+        str,
+        Field(
+            ...,
+            description="The evaluation name. e.g. '{model_key}_{precision}', '{org}_{model_key}_{precision}' (unique identifier)",
+        ),
+    ]
+    full_model: Annotated[
+        str,
+        Field(
+            ...,
+            description="The full model name. e.g. '{org}/{model_title}' (path on hub)",
+        ),
+    ]
     org: str | None
     model: str
     link_url: str | None = None
@@ -77,10 +91,25 @@ class EvalResult(BaseModel):
         return None
 
     @classmethod
-    def init_from_json_file(cls, json_filepath: str) -> Self:
+    def init_from_json_file(cls, json_path: Path) -> Self:
         """Inits the result from the specific model result file"""
-        json_content = Path(json_filepath).read_text(encoding="utf-8")
-        data = EvalResultJson.model_validate_json(json_content)
+        json_data = json_path.read_bytes()
+        return cls.init_from_json(json_data)
+
+    @classmethod
+    def init_from_json(cls, json_data: str | bytes | bytearray) -> Self:
+        """Inits the result from the specific json data"""
+        data = EvalResultJson.model_validate_json(json_data)
+        return cls.init_from_model(data)
+
+    @classmethod
+    def init_from_dict(cls, raw_model: dict[str, Any]) -> Self:
+        """Inits the result from the specific json content"""
+        data = EvalResultJson.model_validate(raw_model)
+        return cls.init_from_model(data)
+
+    @classmethod
+    def init_from_model(cls, data: EvalResultJson) -> Self:
         config = data.config
 
         # Precision
@@ -89,21 +118,20 @@ class EvalResult(BaseModel):
         meta_toml = load_meta_toml()
 
         # Get model and org
-        model_key = config.model_name or config.model_args or ""
-        model = model_key
+        model_key: str = config.model_key or config.model_args or ""
         org = None
         link_url = None
         m_repo = meta_toml.model_key_to_repo.get(model_key)
         if m_repo is not None:
             if m_repo.repo_id:
-                org, _, model = m_repo.repo_id.rpartition("/")
+                org, _, model_key = m_repo.repo_id.rpartition("/")
                 org = org or None
             if m_repo.link:
                 link_url = m_repo.link
         if not org:
-            result_key = f"{model}_{precision.value.name}"
+            result_key = f"{model_key}_{precision.value.name}"
         else:
-            result_key = f"{org}_{model}_{precision.value.name}"
+            result_key = f"{org}_{model_key}_{precision.value.name}"
 
         model_title = model_key
         m_meta = meta_toml.model_key_to_model.get(model_key)
@@ -112,7 +140,7 @@ class EvalResult(BaseModel):
 
         if org:
             still_on_hub, _, model_config = is_model_on_hub(
-                f"{org}/{model}",
+                f"{org}/{model_key}",
                 config.model_sha or "main",
                 trust_remote_code=True,
                 test_tokenizer=False,
@@ -150,7 +178,7 @@ class EvalResult(BaseModel):
             "eval_name": result_key,
             "full_model": model_title,
             "org": org or None,
-            "model": model,
+            "model": model_key,
             "link_url": link_url or None,
             "results": results,
             "precision": precision,
@@ -159,7 +187,7 @@ class EvalResult(BaseModel):
             "architecture": architecture,
         })
 
-    def update_with_request_file(self, requests_path: str) -> None:
+    def update_with_request_file(self, requests_path: Path | str) -> None:
         """Finds the relevant request file for the current model and updates info with it"""
         # TODO: do nothing for now
         return
@@ -223,28 +251,21 @@ def get_request_file_for_model(requests_path, model_name, precision) -> str:
     return request_file
 
 
-def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
+def get_raw_eval_results(results_versions_dir: Path, requests_path: Path, *, results_version: str) -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
-    model_result_filepaths: list[str] = []
-
-    for root, _, files in os.walk(results_path):
-        # We should only have json files in model results
-        if len(files) == 0 or any(not f.endswith(".json") for f in files):
-            continue
-
-        # Sort the files by date
-        try:
-            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
-        except dateutil.parser.ParserError:
-            files = [files[-1]]
-
-        for file in files:
-            model_result_filepaths.append(os.path.join(root, file))
+    versioned_result_file = results_versions_dir / f"bench_{results_version}.json"
+    if not versioned_result_file.exists():
+        raise FileNotFoundError(
+            f"version={results_version!r} results file not found: {versioned_result_file.as_posix()!r}"
+        )
+    logger.info(f"Loading results from: {versioned_result_file.as_posix()!r}")
+    raw_results_model: dict[str, dict[str, Any]] = from_json(versioned_result_file.read_bytes())
+    logger.info(f"Loaded {len(raw_results_model)} results")
 
     eval_results: dict[str, EvalResult] = {}
-    for model_result_filepath in model_result_filepaths:
+    for _model_key, model_data in raw_results_model.items():
         # Creation of result
-        eval_result = EvalResult.init_from_json_file(model_result_filepath)
+        eval_result = EvalResult.init_from_dict(model_data)
        eval_result.update_with_request_file(requests_path)
 
         # Store results of same eval together
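get_raw_eval_results no longer walks per-model result directories; it reads a single bench_{version}.json whose top level maps a model key to that model's payload, then builds each entry via EvalResult.init_from_dict. A sketch of that shape: only config.model_name and config.model_key are confirmed by EvalResultJson_Config above, the rest of each payload is whatever EvalResultJson expects, and the values below are made up.

# Illustrative bench_{version}.json payload, parsed the same way as in the
# diff (pydantic_core.from_json on raw bytes).
from pydantic_core import from_json

raw_results = from_json(b'''
{
  "qwen2.5_3b": {
    "config": {"model_name": "Qwen/Qwen2.5-3B", "model_key": "qwen2.5_3b"}
  }
}
''')
for model_key, payload in raw_results.items():
    print(model_key, "->", payload["config"]["model_name"])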
src/populate.py CHANGED
@@ -15,6 +15,7 @@ applies formatting transformations, and filters data based on completion status.
 
 import json
 import os
+from pathlib import Path
 
 import pandas as pd
 
@@ -24,8 +25,10 @@ from src.leaderboard.read_evals import get_raw_eval_results
 
 
 def get_leaderboard_df(
-    results_path: str,
-    requests_path: str,
+    results_versions_dir: Path,
+    requests_path: Path,
+    *,
+    results_version: str,
     cols: list[str],
     benchmark_cols: list[str],
 ) -> pd.DataFrame:
@@ -38,8 +41,9 @@ def get_leaderboard_df(
     evaluations.
 
     Args:
-        results_path (str): Path to the directory containing evaluation result files
-        requests_path (str): Path to the directory containing evaluation request files
+        results_versions_dir (Path): Path to the directory containing evaluation result files
+        requests_path (Path): Path to the directory containing evaluation request files
+        results_version (str): Version of the results
         cols (list): List of column names to include in the final DataFrame
         benchmark_cols (list): List of benchmark column names used for filtering
 
@@ -52,7 +56,11 @@ def get_leaderboard_df(
     The function automatically truncates numeric values to 1 decimal place and
     filters out any entries that have NaN values in the specified benchmark columns.
     """
-    raw_data = get_raw_eval_results(results_path, requests_path)
+    raw_data = get_raw_eval_results(
+        results_versions_dir,
+        requests_path,
+        results_version=results_version,
+    )
     all_data_json = [v.to_dict() for v in raw_data]
 
     df = pd.DataFrame.from_records(all_data_json)
@@ -64,7 +72,7 @@ def get_leaderboard_df(
     return df
 
 
-def get_evaluation_queue_df(save_path: str, cols: list[str]) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+def get_evaluation_queue_df(save_path: Path, cols: list[str]) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
     """
     Creates separate DataFrames for different evaluation queue statuses.
 
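The bare * in the new signature makes results_version, cols, and benchmark_cols keyword-only, so an old positional call fails immediately instead of silently binding cols to results_version. A generic illustration of that contract (f is a stand-in, not the project's function):

# Keyword-only arguments after a bare "*": positional misuse raises TypeError.
def f(results_dir, requests_path, *, results_version, cols, benchmark_cols):
    return results_version

f("res", "req", results_version="v1", cols=[], benchmark_cols=[])   # OK
# f("res", "req", "v1", [], [])
#   -> TypeError: f() takes 2 positional arguments but 5 were given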
src/prepare.py CHANGED
@@ -2,10 +2,11 @@ import os
 import sys
 from functools import cached_property, lru_cache
 from pathlib import Path
+from typing import Annotated
 
 from huggingface_hub import snapshot_download
 from loguru import logger
-from pydantic import BaseModel, ConfigDict
+from pydantic import BaseModel, ConfigDict, Field
 from typing_extensions import Self
 
 from src.envs import API, settings
@@ -54,6 +55,7 @@ def prepare_space():
     PREPARED_FLAG = True
 
     load_meta_toml()
+    load_display_toml()
 
 
 class MetaToml(BaseModel):
@@ -158,3 +160,34 @@ def load_meta_toml() -> MetaToml:
     logger.info("Loaded meta.toml")
     assert meta_toml is not None, f"Failed to load meta.toml: {meta_toml_path.as_posix()!r}"
     return meta_toml
+
+
+class DisplayToml(BaseModel):
+    model_config = ConfigDict(extra="allow", frozen=True)
+
+    version: Annotated[str, Field(..., description="The version of the results.")]
+    benchmarks_order: Annotated[
+        list[str],
+        Field(
+            default_factory=lambda: [
+                "vsi_bench",
+                "mmsi_bench",
+                "mindcube_tiny",
+                "viewspatial",
+                "site",
+            ],
+            description="The predefined order of the benchmarks.",
+        ),
+    ]
+
+
+@lru_cache(maxsize=1)
+def load_display_toml() -> DisplayToml:
+    display_toml_path = Path(settings.EVAL_RESULTS_PATH) / "leaderboard" / "display.toml"
+    logger.info(f'Loading display.toml from: {display_toml_path.as_posix()!r}')
+    with display_toml_path.open("rb") as f:
+        data = toml_load(f)
+    display_toml = DisplayToml.model_validate(data)
+    logger.info("Loaded display.toml")
+    assert display_toml is not None, f"Failed to load display.toml: {display_toml_path.as_posix()!r}"
+    return display_toml
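DisplayToml expects a version string plus an optional benchmarks_order list, loaded from <EVAL_RESULTS_PATH>/leaderboard/display.toml. A sketch of a file that would validate; the version value is illustrative, and tomllib stands in for the project's toml_load helper.

# Parse a display.toml-shaped document; the keys come from DisplayToml above,
# the concrete version string is a placeholder.
import tomllib  # Python 3.11+ standard library

sample = """
version = "v1"
benchmarks_order = ["vsi_bench", "mmsi_bench", "mindcube_tiny", "viewspatial", "site"]
"""
data = tomllib.loads(sample)
print(data["version"])           # "v1"
print(data["benchmarks_order"])  # preferred display order for the leaderboard tabs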