yangzhitao committed on
Commit
12947f9
·
1 Parent(s): fbc528a
Files changed (2) hide show
  1. app.py +38 -36
  2. src/leaderboard/read_evals.py +5 -1
app.py CHANGED
@@ -6,26 +6,25 @@ from huggingface_hub import snapshot_download
6
  from rich import print
7
 
8
  from src.about import (
 
9
  CITATION_BUTTON_LABEL,
10
  CITATION_BUTTON_TEXT,
11
  EVALUATION_QUEUE_TEXT,
12
  INTRODUCTION_TEXT,
13
  LLM_BENCHMARKS_TEXT,
14
  TITLE,
15
- BENCHMARKS,
16
  )
17
  from src.display.css_html_js import custom_css
18
  from src.display.utils import (
 
19
  BENCHMARK_COLS,
20
  COLS,
21
- BASE_COLS,
22
  EVAL_COLS,
23
  EVAL_TYPES,
24
  AutoEvalColumn,
25
  ModelType,
26
  Precision,
27
  WeightType,
28
- fields,
29
  )
30
  from src.envs import API, settings
31
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
@@ -84,18 +83,18 @@ def filter_dataframe_by_columns(selected_cols: list[str], original_df: pd.DataFr
84
  # 始终包含基础列 'T' 和 'Model'
85
  base_cols = ['T', 'Model']
86
  all_selected_cols = [col for col in base_cols if col in original_df.columns]
87
-
88
  # 添加用户选择的列(排除已存在的基础列)
89
  for col in selected_cols:
90
  if col in original_df.columns and col not in all_selected_cols:
91
  all_selected_cols.append(col)
92
-
93
  # 确保列的顺序:基础列在前,然后是按原始顺序的选中列
94
  ordered_cols = []
95
  for col in original_df.columns:
96
  if col in all_selected_cols:
97
  ordered_cols.append(col)
98
-
99
  # 确保总是返回 DataFrame,即使是单列也使用 [[]] 来保持 DataFrame 类型
100
  if ordered_cols:
101
  filtered_df = original_df.loc[:, ordered_cols]
@@ -111,11 +110,11 @@ def filter_dataframe_by_precision(selected_precisions: list[str], df: pd.DataFra
111
  """
112
  if not selected_precisions:
113
  return df.iloc[0:0].copy() # 返回相同结构但为空的 DataFrame
114
-
115
  precision_col = AutoEvalColumn.precision.name
116
  if precision_col not in df.columns:
117
  return df
118
-
119
  # 筛选包含任一选定 precision 的行
120
  mask = df[precision_col].isin(selected_precisions)
121
  filtered_df = df.loc[mask, :]
@@ -129,26 +128,26 @@ def search_models_in_dataframe(search_text: str, df: pd.DataFrame) -> pd.DataFra
129
  """
130
  if not search_text or not search_text.strip():
131
  return df
132
-
133
  # 分割逗号,去除空白并转换为小写用于匹配
134
  import re
135
-
136
  keywords = [keyword.strip().lower() for keyword in search_text.split(',') if keyword.strip()]
137
  if not keywords:
138
  return df
139
-
140
  if 'Model' not in df.columns:
141
  return df
142
-
143
  # 匹配函数:从 HTML 中提取纯文本并检查是否包含关键词
144
  def matches_search(model_cell):
145
  if pd.isna(model_cell):
146
  return False
147
-
148
  # 从 HTML 链接中提取纯文本(model_name)
149
  # 格式: <a ...>model_name</a> 或直接是文本
150
  text = str(model_cell)
151
-
152
  # 提取 HTML 标签内的文本
153
  # 匹配 <a>...</a> 标签内的内容,或直接使用文本
154
  match = re.search(r'<a[^>]*>([^<]+)</a>', text, re.IGNORECASE)
@@ -156,10 +155,10 @@ def search_models_in_dataframe(search_text: str, df: pd.DataFrame) -> pd.DataFra
156
  model_name = match.group(1).lower()
157
  else:
158
  model_name = text.lower()
159
-
160
  # 检查是否包含任一关键词
161
  return any(keyword in model_name for keyword in keywords)
162
-
163
  # 应用搜索过滤
164
  mask = df['Model'].apply(matches_search)
165
  filtered_df = df.loc[mask, :]
@@ -169,18 +168,22 @@ def search_models_in_dataframe(search_text: str, df: pd.DataFrame) -> pd.DataFra
169
  def init_leaderboard_tabs(dataframe: pd.DataFrame, cols: list[str]):
170
  # 存储原始 DataFrame 以便后续过滤使用(使用闭包保存)
171
  original_df = dataframe.copy()
172
-
173
  available_precisions = sorted(original_df["Precision"].dropna().unique().tolist())
174
- default_precision = ['bfloat16'] if 'bfloat16' in available_precisions else (available_precisions[:1] if available_precisions else [])
175
-
 
 
 
 
176
  # 初始化显示的列(包含基础列和默认选中的列)
177
  default_selected = [col for col in dataframe.columns if col in cols] + ['Average ⬆️']
178
-
179
  # 先按 precision 筛选 original_df
180
  precision_filtered_df = filter_dataframe_by_precision(default_precision, original_df)
181
  # 根据默认选择再筛选一次 DataFrame
182
  initial_filtered_df = filter_dataframe_by_columns(default_selected, precision_filtered_df)
183
-
184
  with gr.Row():
185
  with gr.Column(scale=1):
186
  search = gr.Textbox(label="Search", placeholder="Separate multiple queries with commas")
@@ -191,7 +194,7 @@ def init_leaderboard_tabs(dataframe: pd.DataFrame, cols: list[str]):
191
  interactive=True,
192
  )
193
  with gr.Column(scale=1):
194
- model_type = gr.CheckboxGroup(
195
  [],
196
  label="Model Type",
197
  value=[],
@@ -202,7 +205,7 @@ def init_leaderboard_tabs(dataframe: pd.DataFrame, cols: list[str]):
202
  value=default_precision,
203
  interactive=True,
204
  )
205
- hide_models = gr.CheckboxGroup(
206
  ['Deleted/incomplete'],
207
  label="Hide Models",
208
  value=['Deleted/incomplete'],
@@ -218,7 +221,7 @@ def init_leaderboard_tabs(dataframe: pd.DataFrame, cols: list[str]):
218
  datatype='markdown',
219
  elem_id="auto-width-dataframe",
220
  )
221
-
222
  # 统一的更新函数:同时处理 precision、列筛选和搜索
223
  def update_dataframe(search_text: str, selected_cols: list[str], selected_precisions: list[str]):
224
  # 先按 precision 筛选 original_df
@@ -228,26 +231,26 @@ def init_leaderboard_tabs(dataframe: pd.DataFrame, cols: list[str]):
228
  # 最后按搜索关键词筛选
229
  final_df = search_models_in_dataframe(search_text, column_filtered_df)
230
  return final_df
231
-
232
  # 绑定搜索、列选择和 precision 的变化事件,动态更新 DataFrame
233
  search.change(
234
  fn=update_dataframe,
235
  inputs=[search, show_columns, precision],
236
  outputs=leaderboard,
237
  )
238
-
239
  show_columns.change(
240
  fn=update_dataframe,
241
  inputs=[search, show_columns, precision],
242
  outputs=leaderboard,
243
  )
244
-
245
  precision.change(
246
  fn=update_dataframe,
247
  inputs=[search, show_columns, precision],
248
  outputs=leaderboard,
249
  )
250
-
251
  return leaderboard
252
 
253
 
@@ -257,10 +260,11 @@ with demo:
257
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
258
 
259
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
260
-
261
  for i, benchmark in enumerate[str](sorted(BENCHMARKS)):
262
  with gr.TabItem(f"🏅 {benchmark}", elem_id="llm-benchmark-tab-table", id=i):
263
- benchmark_cols = [BENCHMARK_COL for BENCHMARK_COL in BENCHMARK_COLS if BENCHMARK_COL.startswith(benchmark)]
 
 
264
  cols = BASE_COLS + benchmark_cols
265
  BENCHMARK_DF = get_leaderboard_df(
266
  settings.EVAL_RESULTS_PATH,
@@ -339,24 +343,22 @@ with demo:
339
  value=None,
340
  interactive=True,
341
  )
 
342
  def search_models(query):
343
  if not query.strip():
344
  return []
345
  models = API.list_models(search=query, limit=10)
346
  results = []
347
  for m in models:
348
- results.append([
349
- m.id,
350
- m.pipeline_tag or "N/A",
351
- m.downloads or 0,
352
- m.likes or 0
353
- ])
354
  return results
 
355
  def on_select(evt: gr.SelectData, data):
356
  row_idx = evt.index[0] # 获取点击行号
357
  if row_idx < len(data):
358
  return data.iloc[row_idx, 0] # 返回模型名
359
  return ""
 
360
  search_name.change(fn=search_models, inputs=search_name, outputs=table)
361
  table.select(fn=on_select, inputs=table, outputs=model_name_textbox)
362
 
 
6
  from rich import print
7
 
8
  from src.about import (
9
+ BENCHMARKS,
10
  CITATION_BUTTON_LABEL,
11
  CITATION_BUTTON_TEXT,
12
  EVALUATION_QUEUE_TEXT,
13
  INTRODUCTION_TEXT,
14
  LLM_BENCHMARKS_TEXT,
15
  TITLE,
 
16
  )
17
  from src.display.css_html_js import custom_css
18
  from src.display.utils import (
19
+ BASE_COLS,
20
  BENCHMARK_COLS,
21
  COLS,
 
22
  EVAL_COLS,
23
  EVAL_TYPES,
24
  AutoEvalColumn,
25
  ModelType,
26
  Precision,
27
  WeightType,
 
28
  )
29
  from src.envs import API, settings
30
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
 
83
  # 始终包含基础列 'T' 和 'Model'
84
  base_cols = ['T', 'Model']
85
  all_selected_cols = [col for col in base_cols if col in original_df.columns]
86
+
87
  # 添加用户选择的列(排除已存在的基础列)
88
  for col in selected_cols:
89
  if col in original_df.columns and col not in all_selected_cols:
90
  all_selected_cols.append(col)
91
+
92
  # 确保列的顺序:基础列在前,然后是按原始顺序的选中列
93
  ordered_cols = []
94
  for col in original_df.columns:
95
  if col in all_selected_cols:
96
  ordered_cols.append(col)
97
+
98
  # 确保总是返回 DataFrame,即使是单列也使用 [[]] 来保持 DataFrame 类型
99
  if ordered_cols:
100
  filtered_df = original_df.loc[:, ordered_cols]
 
110
  """
111
  if not selected_precisions:
112
  return df.iloc[0:0].copy() # 返回相同结构但为空的 DataFrame
113
+
114
  precision_col = AutoEvalColumn.precision.name
115
  if precision_col not in df.columns:
116
  return df
117
+
118
  # 筛选包含任一选定 precision 的行
119
  mask = df[precision_col].isin(selected_precisions)
120
  filtered_df = df.loc[mask, :]
 
128
  """
129
  if not search_text or not search_text.strip():
130
  return df
131
+
132
  # 分割逗号,去除空白并转换为小写用于匹配
133
  import re
134
+
135
  keywords = [keyword.strip().lower() for keyword in search_text.split(',') if keyword.strip()]
136
  if not keywords:
137
  return df
138
+
139
  if 'Model' not in df.columns:
140
  return df
141
+
142
  # 匹配函数:从 HTML 中提取纯文本并检查是否包含关键词
143
  def matches_search(model_cell):
144
  if pd.isna(model_cell):
145
  return False
146
+
147
  # 从 HTML 链接中提取纯文本(model_name)
148
  # 格式: <a ...>model_name</a> 或直接是文本
149
  text = str(model_cell)
150
+
151
  # 提取 HTML 标签内的文本
152
  # 匹配 <a>...</a> 标签内的内容,或直接使用文本
153
  match = re.search(r'<a[^>]*>([^<]+)</a>', text, re.IGNORECASE)
 
155
  model_name = match.group(1).lower()
156
  else:
157
  model_name = text.lower()
158
+
159
  # 检查是否包含任一关键词
160
  return any(keyword in model_name for keyword in keywords)
161
+
162
  # 应用搜索过滤
163
  mask = df['Model'].apply(matches_search)
164
  filtered_df = df.loc[mask, :]
 
168
  def init_leaderboard_tabs(dataframe: pd.DataFrame, cols: list[str]):
169
  # 存储原始 DataFrame 以便后续过滤使用(使用闭包保存)
170
  original_df = dataframe.copy()
171
+
172
  available_precisions = sorted(original_df["Precision"].dropna().unique().tolist())
173
+ default_precision = (
174
+ ['bfloat16']
175
+ if 'bfloat16' in available_precisions
176
+ else (available_precisions[:1] if available_precisions else [])
177
+ )
178
+
179
  # 初始化显示的列(包含基础列和默认选中的列)
180
  default_selected = [col for col in dataframe.columns if col in cols] + ['Average ⬆️']
181
+
182
  # 先按 precision 筛选 original_df
183
  precision_filtered_df = filter_dataframe_by_precision(default_precision, original_df)
184
  # 根据默认选择再筛选一次 DataFrame
185
  initial_filtered_df = filter_dataframe_by_columns(default_selected, precision_filtered_df)
186
+
187
  with gr.Row():
188
  with gr.Column(scale=1):
189
  search = gr.Textbox(label="Search", placeholder="Separate multiple queries with commas")
 
194
  interactive=True,
195
  )
196
  with gr.Column(scale=1):
197
+ _model_type = gr.CheckboxGroup(
198
  [],
199
  label="Model Type",
200
  value=[],
 
205
  value=default_precision,
206
  interactive=True,
207
  )
208
+ _hide_models = gr.CheckboxGroup(
209
  ['Deleted/incomplete'],
210
  label="Hide Models",
211
  value=['Deleted/incomplete'],
 
221
  datatype='markdown',
222
  elem_id="auto-width-dataframe",
223
  )
224
+
225
  # 统一的更新函数:同时处理 precision、列筛选和搜索
226
  def update_dataframe(search_text: str, selected_cols: list[str], selected_precisions: list[str]):
227
  # 先按 precision 筛选 original_df
 
231
  # 最后按搜索关键词筛选
232
  final_df = search_models_in_dataframe(search_text, column_filtered_df)
233
  return final_df
234
+
235
  # 绑定搜索、列选择和 precision 的变化事件,动态更新 DataFrame
236
  search.change(
237
  fn=update_dataframe,
238
  inputs=[search, show_columns, precision],
239
  outputs=leaderboard,
240
  )
241
+
242
  show_columns.change(
243
  fn=update_dataframe,
244
  inputs=[search, show_columns, precision],
245
  outputs=leaderboard,
246
  )
247
+
248
  precision.change(
249
  fn=update_dataframe,
250
  inputs=[search, show_columns, precision],
251
  outputs=leaderboard,
252
  )
253
+
254
  return leaderboard
255
 
256
 
 
260
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
261
 
262
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
 
263
  for i, benchmark in enumerate[str](sorted(BENCHMARKS)):
264
  with gr.TabItem(f"🏅 {benchmark}", elem_id="llm-benchmark-tab-table", id=i):
265
+ benchmark_cols = [
266
+ BENCHMARK_COL for BENCHMARK_COL in BENCHMARK_COLS if BENCHMARK_COL.startswith(benchmark)
267
+ ]
268
  cols = BASE_COLS + benchmark_cols
269
  BENCHMARK_DF = get_leaderboard_df(
270
  settings.EVAL_RESULTS_PATH,
 
343
  value=None,
344
  interactive=True,
345
  )
346
+
347
  def search_models(query):
348
  if not query.strip():
349
  return []
350
  models = API.list_models(search=query, limit=10)
351
  results = []
352
  for m in models:
353
+ results.append([m.id, m.pipeline_tag or "N/A", m.downloads or 0, m.likes or 0])
 
 
 
 
 
354
  return results
355
+
356
  def on_select(evt: gr.SelectData, data):
357
  row_idx = evt.index[0] # 获取点击行号
358
  if row_idx < len(data):
359
  return data.iloc[row_idx, 0] # 返回模型名
360
  return ""
361
+
362
  search_name.change(fn=search_models, inputs=search_name, outputs=table)
363
  table.select(fn=on_select, inputs=table, outputs=model_name_textbox)
364
 
src/leaderboard/read_evals.py CHANGED
@@ -6,6 +6,7 @@ Enhanced with Pydantic models.
6
  import glob
7
  import json
8
  import os
 
9
  from pathlib import Path
10
  from typing import Annotated, Any
11
 
@@ -179,6 +180,8 @@ def get_request_file_for_model(requests_path, model_name, precision) -> str:
179
 
180
  def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
181
  """From the path of the results folder root, extract all needed info for results"""
 
 
182
  model_result_filepaths: list[str] = []
183
 
184
  for root, _, files in os.walk(results_path):
@@ -213,7 +216,8 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
213
  try:
214
  v.to_dict() # we test if the dict version is complete
215
  results.append(v)
216
- except KeyError: # not all eval values present
 
217
  continue
218
 
219
  return results
 
6
  import glob
7
  import json
8
  import os
9
+ import warnings
10
  from pathlib import Path
11
  from typing import Annotated, Any
12
 
 
180
 
181
  def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
182
  """From the path of the results folder root, extract all needed info for results"""
183
+ from rich import print as rprint # FIXME: DEBUG
184
+
185
  model_result_filepaths: list[str] = []
186
 
187
  for root, _, files in os.walk(results_path):
 
216
  try:
217
  v.to_dict() # we test if the dict version is complete
218
  results.append(v)
219
+ except KeyError as e: # not all eval values present
220
+ warnings.warn(f"Not all eval values present for {v.eval_name}: {e}", stacklevel=2)
221
  continue
222
 
223
  return results