Spaces:
Running
Running
yangzhitao
committed on
Commit
·
12947f9
1
Parent(s):
fbc528a
reformat
Browse files
- app.py +38 -36
- src/leaderboard/read_evals.py +5 -1
app.py
CHANGED
|
@@ -6,26 +6,25 @@ from huggingface_hub import snapshot_download
|
|
| 6 |
from rich import print
|
| 7 |
|
| 8 |
from src.about import (
|
|
|
|
| 9 |
CITATION_BUTTON_LABEL,
|
| 10 |
CITATION_BUTTON_TEXT,
|
| 11 |
EVALUATION_QUEUE_TEXT,
|
| 12 |
INTRODUCTION_TEXT,
|
| 13 |
LLM_BENCHMARKS_TEXT,
|
| 14 |
TITLE,
|
| 15 |
-
BENCHMARKS,
|
| 16 |
)
|
| 17 |
from src.display.css_html_js import custom_css
|
| 18 |
from src.display.utils import (
|
|
|
|
| 19 |
BENCHMARK_COLS,
|
| 20 |
COLS,
|
| 21 |
-
BASE_COLS,
|
| 22 |
EVAL_COLS,
|
| 23 |
EVAL_TYPES,
|
| 24 |
AutoEvalColumn,
|
| 25 |
ModelType,
|
| 26 |
Precision,
|
| 27 |
WeightType,
|
| 28 |
-
fields,
|
| 29 |
)
|
| 30 |
from src.envs import API, settings
|
| 31 |
from src.populate import get_evaluation_queue_df, get_leaderboard_df
|
|
@@ -84,18 +83,18 @@ def filter_dataframe_by_columns(selected_cols: list[str], original_df: pd.DataFr
|
|
| 84 |
# 始终包含基础列 'T' 和 'Model'
|
| 85 |
base_cols = ['T', 'Model']
|
| 86 |
all_selected_cols = [col for col in base_cols if col in original_df.columns]
|
| 87 |
-
|
| 88 |
# 添加用户选择的列(排除已存在的基础列)
|
| 89 |
for col in selected_cols:
|
| 90 |
if col in original_df.columns and col not in all_selected_cols:
|
| 91 |
all_selected_cols.append(col)
|
| 92 |
-
|
| 93 |
# 确保列的顺序:基础列在前,然后是按原始顺序的选中列
|
| 94 |
ordered_cols = []
|
| 95 |
for col in original_df.columns:
|
| 96 |
if col in all_selected_cols:
|
| 97 |
ordered_cols.append(col)
|
| 98 |
-
|
| 99 |
# 确保总是返回 DataFrame,即使是单列也使用 [[]] 来保持 DataFrame 类型
|
| 100 |
if ordered_cols:
|
| 101 |
filtered_df = original_df.loc[:, ordered_cols]
|
|
@@ -111,11 +110,11 @@ def filter_dataframe_by_precision(selected_precisions: list[str], df: pd.DataFra
|
|
| 111 |
"""
|
| 112 |
if not selected_precisions:
|
| 113 |
return df.iloc[0:0].copy() # 返回相同结构但为空的 DataFrame
|
| 114 |
-
|
| 115 |
precision_col = AutoEvalColumn.precision.name
|
| 116 |
if precision_col not in df.columns:
|
| 117 |
return df
|
| 118 |
-
|
| 119 |
# 筛选包含任一选定 precision 的行
|
| 120 |
mask = df[precision_col].isin(selected_precisions)
|
| 121 |
filtered_df = df.loc[mask, :]
|
|
@@ -129,26 +128,26 @@ def search_models_in_dataframe(search_text: str, df: pd.DataFrame) -> pd.DataFra
|
|
| 129 |
"""
|
| 130 |
if not search_text or not search_text.strip():
|
| 131 |
return df
|
| 132 |
-
|
| 133 |
# 分割逗号,去除空白并转换为小写用于匹配
|
| 134 |
import re
|
| 135 |
-
|
| 136 |
keywords = [keyword.strip().lower() for keyword in search_text.split(',') if keyword.strip()]
|
| 137 |
if not keywords:
|
| 138 |
return df
|
| 139 |
-
|
| 140 |
if 'Model' not in df.columns:
|
| 141 |
return df
|
| 142 |
-
|
| 143 |
# 匹配函数:从 HTML 中提取纯文本并检查是否包含关键词
|
| 144 |
def matches_search(model_cell):
|
| 145 |
if pd.isna(model_cell):
|
| 146 |
return False
|
| 147 |
-
|
| 148 |
# 从 HTML 链接中提取纯文本(model_name)
|
| 149 |
# 格式: <a ...>model_name</a> 或直接是文本
|
| 150 |
text = str(model_cell)
|
| 151 |
-
|
| 152 |
# 提取 HTML 标签内的文本
|
| 153 |
# 匹配 <a>...</a> 标签内的内容,或直接使用文本
|
| 154 |
match = re.search(r'<a[^>]*>([^<]+)</a>', text, re.IGNORECASE)
|
|
@@ -156,10 +155,10 @@ def search_models_in_dataframe(search_text: str, df: pd.DataFrame) -> pd.DataFra
|
|
| 156 |
model_name = match.group(1).lower()
|
| 157 |
else:
|
| 158 |
model_name = text.lower()
|
| 159 |
-
|
| 160 |
# 检查是否包含任一关键词
|
| 161 |
return any(keyword in model_name for keyword in keywords)
|
| 162 |
-
|
| 163 |
# 应用搜索过滤
|
| 164 |
mask = df['Model'].apply(matches_search)
|
| 165 |
filtered_df = df.loc[mask, :]
|
|
@@ -169,18 +168,22 @@ def search_models_in_dataframe(search_text: str, df: pd.DataFrame) -> pd.DataFra
|
|
| 169 |
def init_leaderboard_tabs(dataframe: pd.DataFrame, cols: list[str]):
|
| 170 |
# 存储原始 DataFrame 以便后续过滤使用(使用闭包保存)
|
| 171 |
original_df = dataframe.copy()
|
| 172 |
-
|
| 173 |
available_precisions = sorted(original_df["Precision"].dropna().unique().tolist())
|
| 174 |
-
default_precision =
|
| 175 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
# 初始化显示的列(包含基础列和默认选中的列)
|
| 177 |
default_selected = [col for col in dataframe.columns if col in cols] + ['Average ⬆️']
|
| 178 |
-
|
| 179 |
# 先按 precision 筛选 original_df
|
| 180 |
precision_filtered_df = filter_dataframe_by_precision(default_precision, original_df)
|
| 181 |
# 根据默认选择再筛选一次 DataFrame
|
| 182 |
initial_filtered_df = filter_dataframe_by_columns(default_selected, precision_filtered_df)
|
| 183 |
-
|
| 184 |
with gr.Row():
|
| 185 |
with gr.Column(scale=1):
|
| 186 |
search = gr.Textbox(label="Search", placeholder="Separate multiple queries with commas")
|
|
@@ -191,7 +194,7 @@ def init_leaderboard_tabs(dataframe: pd.DataFrame, cols: list[str]):
|
|
| 191 |
interactive=True,
|
| 192 |
)
|
| 193 |
with gr.Column(scale=1):
|
| 194 |
-
|
| 195 |
[],
|
| 196 |
label="Model Type",
|
| 197 |
value=[],
|
|
@@ -202,7 +205,7 @@ def init_leaderboard_tabs(dataframe: pd.DataFrame, cols: list[str]):
|
|
| 202 |
value=default_precision,
|
| 203 |
interactive=True,
|
| 204 |
)
|
| 205 |
-
|
| 206 |
['Deleted/incomplete'],
|
| 207 |
label="Hide Models",
|
| 208 |
value=['Deleted/incomplete'],
|
|
@@ -218,7 +221,7 @@ def init_leaderboard_tabs(dataframe: pd.DataFrame, cols: list[str]):
|
|
| 218 |
datatype='markdown',
|
| 219 |
elem_id="auto-width-dataframe",
|
| 220 |
)
|
| 221 |
-
|
| 222 |
# 统一的更新函数:同时处理 precision、列筛选和搜索
|
| 223 |
def update_dataframe(search_text: str, selected_cols: list[str], selected_precisions: list[str]):
|
| 224 |
# 先按 precision 筛选 original_df
|
|
@@ -228,26 +231,26 @@ def init_leaderboard_tabs(dataframe: pd.DataFrame, cols: list[str]):
|
|
| 228 |
# 最后按搜索关键词筛选
|
| 229 |
final_df = search_models_in_dataframe(search_text, column_filtered_df)
|
| 230 |
return final_df
|
| 231 |
-
|
| 232 |
# 绑定搜索、列选择和 precision 的变化事件,动态更新 DataFrame
|
| 233 |
search.change(
|
| 234 |
fn=update_dataframe,
|
| 235 |
inputs=[search, show_columns, precision],
|
| 236 |
outputs=leaderboard,
|
| 237 |
)
|
| 238 |
-
|
| 239 |
show_columns.change(
|
| 240 |
fn=update_dataframe,
|
| 241 |
inputs=[search, show_columns, precision],
|
| 242 |
outputs=leaderboard,
|
| 243 |
)
|
| 244 |
-
|
| 245 |
precision.change(
|
| 246 |
fn=update_dataframe,
|
| 247 |
inputs=[search, show_columns, precision],
|
| 248 |
outputs=leaderboard,
|
| 249 |
)
|
| 250 |
-
|
| 251 |
return leaderboard
|
| 252 |
|
| 253 |
|
|
@@ -257,10 +260,11 @@ with demo:
|
|
| 257 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
| 258 |
|
| 259 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
| 260 |
-
|
| 261 |
for i, benchmark in enumerate[str](sorted(BENCHMARKS)):
|
| 262 |
with gr.TabItem(f"🏅 {benchmark}", elem_id="llm-benchmark-tab-table", id=i):
|
| 263 |
-
benchmark_cols = [
|
|
|
|
|
|
|
| 264 |
cols = BASE_COLS + benchmark_cols
|
| 265 |
BENCHMARK_DF = get_leaderboard_df(
|
| 266 |
settings.EVAL_RESULTS_PATH,
|
|
@@ -339,24 +343,22 @@ with demo:
|
|
| 339 |
value=None,
|
| 340 |
interactive=True,
|
| 341 |
)
|
|
|
|
| 342 |
def search_models(query):
|
| 343 |
if not query.strip():
|
| 344 |
return []
|
| 345 |
models = API.list_models(search=query, limit=10)
|
| 346 |
results = []
|
| 347 |
for m in models:
|
| 348 |
-
results.append([
|
| 349 |
-
m.id,
|
| 350 |
-
m.pipeline_tag or "N/A",
|
| 351 |
-
m.downloads or 0,
|
| 352 |
-
m.likes or 0
|
| 353 |
-
])
|
| 354 |
return results
|
|
|
|
| 355 |
def on_select(evt: gr.SelectData, data):
|
| 356 |
row_idx = evt.index[0] # 获取点击行号
|
| 357 |
if row_idx < len(data):
|
| 358 |
return data.iloc[row_idx, 0] # 返回模型名
|
| 359 |
return ""
|
|
|
|
| 360 |
search_name.change(fn=search_models, inputs=search_name, outputs=table)
|
| 361 |
table.select(fn=on_select, inputs=table, outputs=model_name_textbox)
|
| 362 |
|
|
|
|
| 6 |
from rich import print
|
| 7 |
|
| 8 |
from src.about import (
|
| 9 |
+
BENCHMARKS,
|
| 10 |
CITATION_BUTTON_LABEL,
|
| 11 |
CITATION_BUTTON_TEXT,
|
| 12 |
EVALUATION_QUEUE_TEXT,
|
| 13 |
INTRODUCTION_TEXT,
|
| 14 |
LLM_BENCHMARKS_TEXT,
|
| 15 |
TITLE,
|
|
|
|
| 16 |
)
|
| 17 |
from src.display.css_html_js import custom_css
|
| 18 |
from src.display.utils import (
|
| 19 |
+
BASE_COLS,
|
| 20 |
BENCHMARK_COLS,
|
| 21 |
COLS,
|
|
|
|
| 22 |
EVAL_COLS,
|
| 23 |
EVAL_TYPES,
|
| 24 |
AutoEvalColumn,
|
| 25 |
ModelType,
|
| 26 |
Precision,
|
| 27 |
WeightType,
|
|
|
|
| 28 |
)
|
| 29 |
from src.envs import API, settings
|
| 30 |
from src.populate import get_evaluation_queue_df, get_leaderboard_df
|
|
|
|
| 83 |
# 始终包含基础列 'T' 和 'Model'
|
| 84 |
base_cols = ['T', 'Model']
|
| 85 |
all_selected_cols = [col for col in base_cols if col in original_df.columns]
|
| 86 |
+
|
| 87 |
# 添加用户选择的列(排除已存在的基础列)
|
| 88 |
for col in selected_cols:
|
| 89 |
if col in original_df.columns and col not in all_selected_cols:
|
| 90 |
all_selected_cols.append(col)
|
| 91 |
+
|
| 92 |
# 确保列的顺序:基础列在前,然后是按原始顺序的选中列
|
| 93 |
ordered_cols = []
|
| 94 |
for col in original_df.columns:
|
| 95 |
if col in all_selected_cols:
|
| 96 |
ordered_cols.append(col)
|
| 97 |
+
|
| 98 |
# 确保总是返回 DataFrame,即使是单列也使用 [[]] 来保持 DataFrame 类型
|
| 99 |
if ordered_cols:
|
| 100 |
filtered_df = original_df.loc[:, ordered_cols]
|
|
|
|
| 110 |
"""
|
| 111 |
if not selected_precisions:
|
| 112 |
return df.iloc[0:0].copy() # 返回相同结构但为空的 DataFrame
|
| 113 |
+
|
| 114 |
precision_col = AutoEvalColumn.precision.name
|
| 115 |
if precision_col not in df.columns:
|
| 116 |
return df
|
| 117 |
+
|
| 118 |
# 筛选包含任一选定 precision 的行
|
| 119 |
mask = df[precision_col].isin(selected_precisions)
|
| 120 |
filtered_df = df.loc[mask, :]
|
|
|
|
| 128 |
"""
|
| 129 |
if not search_text or not search_text.strip():
|
| 130 |
return df
|
| 131 |
+
|
| 132 |
# 分割逗号,去除空白并转换为小写用于匹配
|
| 133 |
import re
|
| 134 |
+
|
| 135 |
keywords = [keyword.strip().lower() for keyword in search_text.split(',') if keyword.strip()]
|
| 136 |
if not keywords:
|
| 137 |
return df
|
| 138 |
+
|
| 139 |
if 'Model' not in df.columns:
|
| 140 |
return df
|
| 141 |
+
|
| 142 |
# 匹配函数:从 HTML 中提取纯文本并检查是否包含关键词
|
| 143 |
def matches_search(model_cell):
|
| 144 |
if pd.isna(model_cell):
|
| 145 |
return False
|
| 146 |
+
|
| 147 |
# 从 HTML 链接中提取纯文本(model_name)
|
| 148 |
# 格式: <a ...>model_name</a> 或直接是文本
|
| 149 |
text = str(model_cell)
|
| 150 |
+
|
| 151 |
# 提取 HTML 标签内的文本
|
| 152 |
# 匹配 <a>...</a> 标签内的内容,或直接使用文本
|
| 153 |
match = re.search(r'<a[^>]*>([^<]+)</a>', text, re.IGNORECASE)
|
|
|
|
| 155 |
model_name = match.group(1).lower()
|
| 156 |
else:
|
| 157 |
model_name = text.lower()
|
| 158 |
+
|
| 159 |
# 检查是否包含任一关键词
|
| 160 |
return any(keyword in model_name for keyword in keywords)
|
| 161 |
+
|
| 162 |
# 应用搜索过滤
|
| 163 |
mask = df['Model'].apply(matches_search)
|
| 164 |
filtered_df = df.loc[mask, :]
|
|
|
|
| 168 |
def init_leaderboard_tabs(dataframe: pd.DataFrame, cols: list[str]):
|
| 169 |
# 存储原始 DataFrame 以便后续过滤使用(使用闭包保存)
|
| 170 |
original_df = dataframe.copy()
|
| 171 |
+
|
| 172 |
available_precisions = sorted(original_df["Precision"].dropna().unique().tolist())
|
| 173 |
+
default_precision = (
|
| 174 |
+
['bfloat16']
|
| 175 |
+
if 'bfloat16' in available_precisions
|
| 176 |
+
else (available_precisions[:1] if available_precisions else [])
|
| 177 |
+
)
|
| 178 |
+
|
| 179 |
# 初始化显示的列(包含基础列和默认选中的列)
|
| 180 |
default_selected = [col for col in dataframe.columns if col in cols] + ['Average ⬆️']
|
| 181 |
+
|
| 182 |
# 先按 precision 筛选 original_df
|
| 183 |
precision_filtered_df = filter_dataframe_by_precision(default_precision, original_df)
|
| 184 |
# 根据默认选择再筛选一次 DataFrame
|
| 185 |
initial_filtered_df = filter_dataframe_by_columns(default_selected, precision_filtered_df)
|
| 186 |
+
|
| 187 |
with gr.Row():
|
| 188 |
with gr.Column(scale=1):
|
| 189 |
search = gr.Textbox(label="Search", placeholder="Separate multiple queries with commas")
|
|
|
|
| 194 |
interactive=True,
|
| 195 |
)
|
| 196 |
with gr.Column(scale=1):
|
| 197 |
+
_model_type = gr.CheckboxGroup(
|
| 198 |
[],
|
| 199 |
label="Model Type",
|
| 200 |
value=[],
|
|
|
|
| 205 |
value=default_precision,
|
| 206 |
interactive=True,
|
| 207 |
)
|
| 208 |
+
_hide_models = gr.CheckboxGroup(
|
| 209 |
['Deleted/incomplete'],
|
| 210 |
label="Hide Models",
|
| 211 |
value=['Deleted/incomplete'],
|
|
|
|
| 221 |
datatype='markdown',
|
| 222 |
elem_id="auto-width-dataframe",
|
| 223 |
)
|
| 224 |
+
|
| 225 |
# 统一的更新函数:同时处理 precision、列筛选和搜索
|
| 226 |
def update_dataframe(search_text: str, selected_cols: list[str], selected_precisions: list[str]):
|
| 227 |
# 先按 precision 筛选 original_df
|
|
|
|
| 231 |
# 最后按搜索关键词筛选
|
| 232 |
final_df = search_models_in_dataframe(search_text, column_filtered_df)
|
| 233 |
return final_df
|
| 234 |
+
|
| 235 |
# 绑定搜索、列选择和 precision 的变化事件,动态更新 DataFrame
|
| 236 |
search.change(
|
| 237 |
fn=update_dataframe,
|
| 238 |
inputs=[search, show_columns, precision],
|
| 239 |
outputs=leaderboard,
|
| 240 |
)
|
| 241 |
+
|
| 242 |
show_columns.change(
|
| 243 |
fn=update_dataframe,
|
| 244 |
inputs=[search, show_columns, precision],
|
| 245 |
outputs=leaderboard,
|
| 246 |
)
|
| 247 |
+
|
| 248 |
precision.change(
|
| 249 |
fn=update_dataframe,
|
| 250 |
inputs=[search, show_columns, precision],
|
| 251 |
outputs=leaderboard,
|
| 252 |
)
|
| 253 |
+
|
| 254 |
return leaderboard
|
| 255 |
|
| 256 |
|
|
|
|
| 260 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
| 261 |
|
| 262 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
|
|
|
| 263 |
for i, benchmark in enumerate[str](sorted(BENCHMARKS)):
|
| 264 |
with gr.TabItem(f"🏅 {benchmark}", elem_id="llm-benchmark-tab-table", id=i):
|
| 265 |
+
benchmark_cols = [
|
| 266 |
+
BENCHMARK_COL for BENCHMARK_COL in BENCHMARK_COLS if BENCHMARK_COL.startswith(benchmark)
|
| 267 |
+
]
|
| 268 |
cols = BASE_COLS + benchmark_cols
|
| 269 |
BENCHMARK_DF = get_leaderboard_df(
|
| 270 |
settings.EVAL_RESULTS_PATH,
|
|
|
|
| 343 |
value=None,
|
| 344 |
interactive=True,
|
| 345 |
)
|
| 346 |
+
|
| 347 |
def search_models(query):
|
| 348 |
if not query.strip():
|
| 349 |
return []
|
| 350 |
models = API.list_models(search=query, limit=10)
|
| 351 |
results = []
|
| 352 |
for m in models:
|
| 353 |
+
results.append([m.id, m.pipeline_tag or "N/A", m.downloads or 0, m.likes or 0])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 354 |
return results
|
| 355 |
+
|
| 356 |
def on_select(evt: gr.SelectData, data):
|
| 357 |
row_idx = evt.index[0] # 获取点击行号
|
| 358 |
if row_idx < len(data):
|
| 359 |
return data.iloc[row_idx, 0] # 返回模型名
|
| 360 |
return ""
|
| 361 |
+
|
| 362 |
search_name.change(fn=search_models, inputs=search_name, outputs=table)
|
| 363 |
table.select(fn=on_select, inputs=table, outputs=model_name_textbox)
|
| 364 |
|
src/leaderboard/read_evals.py
CHANGED
|
@@ -6,6 +6,7 @@ Enhanced with Pydantic models.
|
|
| 6 |
import glob
|
| 7 |
import json
|
| 8 |
import os
|
|
|
|
| 9 |
from pathlib import Path
|
| 10 |
from typing import Annotated, Any
|
| 11 |
|
|
@@ -179,6 +180,8 @@ def get_request_file_for_model(requests_path, model_name, precision) -> str:
|
|
| 179 |
|
| 180 |
def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
|
| 181 |
"""From the path of the results folder root, extract all needed info for results"""
|
|
|
|
|
|
|
| 182 |
model_result_filepaths: list[str] = []
|
| 183 |
|
| 184 |
for root, _, files in os.walk(results_path):
|
|
@@ -213,7 +216,8 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
|
|
| 213 |
try:
|
| 214 |
v.to_dict() # we test if the dict version is complete
|
| 215 |
results.append(v)
|
| 216 |
-
except KeyError: # not all eval values present
|
|
|
|
| 217 |
continue
|
| 218 |
|
| 219 |
return results
|
|
|
|
| 6 |
import glob
|
| 7 |
import json
|
| 8 |
import os
|
| 9 |
+
import warnings
|
| 10 |
from pathlib import Path
|
| 11 |
from typing import Annotated, Any
|
| 12 |
|
|
|
|
| 180 |
|
| 181 |
def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
|
| 182 |
"""From the path of the results folder root, extract all needed info for results"""
|
| 183 |
+
from rich import print as rprint # FIXME: DEBUG
|
| 184 |
+
|
| 185 |
model_result_filepaths: list[str] = []
|
| 186 |
|
| 187 |
for root, _, files in os.walk(results_path):
|
|
|
|
| 216 |
try:
|
| 217 |
v.to_dict() # we test if the dict version is complete
|
| 218 |
results.append(v)
|
| 219 |
+
except KeyError as e: # not all eval values present
|
| 220 |
+
warnings.warn(f"Not all eval values present for {v.eval_name}: {e}", stacklevel=2)
|
| 221 |
continue
|
| 222 |
|
| 223 |
return results
|