e-mon committed on
Commit a79a88d · verified · 1 Parent(s): 09629d4

Upload folder using huggingface_hub

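The commit message says the folder was pushed with `huggingface_hub`. A minimal sketch of such an upload, assuming a local checkout and a hypothetical Space ID (neither is spelled out in this commit):

```python
from huggingface_hub import HfApi

api = HfApi()  # picks up the token from `huggingface-cli login` or HF_TOKEN

# Hypothetical values: replace with the real local path and Space ID.
api.upload_folder(
    folder_path=".",                                  # local folder to push
    repo_id="llm-jp/open-japanese-llm-leaderboard",   # assumed Space name
    repo_type="space",                                # target is a Space
    commit_message="Upload folder using huggingface_hub",
)
```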
.gitattributes CHANGED
@@ -25,7 +25,6 @@
  *.safetensors filter=lfs diff=lfs merge=lfs -text
  saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
  *.tflite filter=lfs diff=lfs merge=lfs -text
  *.tgz filter=lfs diff=lfs merge=lfs -text
  *.wasm filter=lfs diff=lfs merge=lfs -text
@@ -33,3 +32,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ scale-hf-logo.png filter=lfs diff=lfs merge=lfs -text
+ src/Logos-HQ/B-Test-1-D-Top-Logo.png filter=lfs diff=lfs merge=lfs -text
+ src/Logos-HQ/B-Test-2-Bottom-Logo-B.png filter=lfs diff=lfs merge=lfs -text
+ src/Logos-HQ/HuggingFace-Logo-Oct-2024.png filter=lfs diff=lfs merge=lfs -text
+ src/Logos-HQ/MDX-Logo-Oct-2024.jpg filter=lfs diff=lfs merge=lfs -text
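The rules above route the new logo images through Git LFS, so the repository itself only stores small pointer files (the "Git LFS Details" entries further down in this commit show the resulting SHA256 hashes and pointer sizes). A small helper, offered only as a sketch, that checks whether a local file is such a pointer rather than the real binary:

```python
def is_lfs_pointer(path: str) -> bool:
    """Return True if `path` looks like a Git LFS pointer file.

    Pointer files are tiny text files whose first line names the LFS spec,
    followed by `oid sha256:<hash>` and `size <bytes>` lines.
    """
    try:
        with open(path, "rb") as f:
            head = f.read(512)
    except OSError:
        return False
    return head.startswith(b"version https://git-lfs.github.com/spec/v1")


# Hypothetical check against one of the files added in this commit:
# print(is_lfs_pointer("src/Logos-HQ/MDX-Logo-Oct-2024.jpg"))
```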
.gitignore ADDED
@@ -0,0 +1,14 @@
1
+ auto_evals/
2
+ venv/
3
+ __pycache__/
4
+ .env
5
+ .ipynb_checkpoints
6
+ *ipynb
7
+ .venv
8
+ .ruff_cache
9
+
10
+ eval-queue/
11
+ eval-results/
12
+ eval-queue-bk/
13
+ eval-results-bk/
14
+ logs/
.pre-commit-config.yaml ADDED
@@ -0,0 +1,19 @@
1
+ repos:
2
+ - repo: https://github.com/pre-commit/pre-commit-hooks
3
+ rev: v5.0.0
4
+ hooks:
5
+ - id: check-yaml
6
+ - id: check-case-conflict
7
+ - id: detect-private-key
8
+ - id: check-added-large-files
9
+ args: ["--maxkb=1000"]
10
+ - id: requirements-txt-fixer
11
+ - id: end-of-file-fixer
12
+ - id: trailing-whitespace
13
+ - repo: https://github.com/astral-sh/ruff-pre-commit
14
+ rev: v0.8.4
15
+ hooks:
16
+ - id: ruff
17
+ args: ["--select", "E,F,I,UP,W", "--ignore", "E501", "--fix"]
18
+ - id: ruff-format
19
+ args: ["--line-length", "119"]
.python-version ADDED
@@ -0,0 +1 @@
1
+ 3.10.15
.vscode/settings.json ADDED
@@ -0,0 +1,15 @@
1
+ {
2
+ "editor.formatOnSave": true,
3
+ "files.insertFinalNewline": false,
4
+ "[python]": {
5
+ "editor.defaultFormatter": "charliermarsh.ruff",
6
+ "editor.formatOnType": true,
7
+ "editor.codeActionsOnSave": {
8
+ "source.fixAll.ruff": "explicit",
9
+ "source.organizeImports": "explicit"
10
+ }
11
+ },
12
+ "flake8.args": [
13
+ "--max-line-length=119"
14
+ ],
15
+ }
Makefile ADDED
@@ -0,0 +1,13 @@
1
+ .PHONY: style quality
2
+
3
+
4
+ style:
5
+ python -m black --line-length 119 .
6
+ python -m isort .
7
+ ruff check --fix .
8
+
9
+
10
+ quality:
11
+ python -m black --check --line-length 119 .
12
+ python -m isort --check-only .
13
+ ruff check .
README.md CHANGED
@@ -1,12 +1,22 @@
  ---
- title: Open Japanese Llm Leaderboard V2
- emoji:
- colorFrom: red
- colorTo: blue
+ title: Open Japanese LLM Leaderboard
+ emoji: 🌸
+ colorFrom: gray
+ colorTo: gray
  sdk: gradio
- sdk_version: 5.49.1
  app_file: app.py
- pinned: false
+ pinned: true
+ license: apache-2.0
+ sdk_version: 5.9.1
+ fullWidth: true
+ datasets:
+ - llm-jp/leaderboard-requests
+ - llm-jp/leaderboard-results
+ - llm-jp/leaderboard-contents
+ tags:
+ - 日本語
+ - Japanese
+ - leaderboard
+ - language:日本語
+ - language:Japanese
  ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
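The new front matter lists three dataset repositories the Space relies on (requests, results, contents). A hedged sketch of reading one of them with `datasets`; the split and column names are not part of this commit, so treat them as assumptions:

```python
from datasets import load_dataset

# Assumed: the contents repo is public and exposes a default "train" split.
contents = load_dataset("llm-jp/leaderboard-contents", split="train")
print(contents.column_names)  # inspect what the leaderboard actually stores
```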
app.py ADDED
@@ -0,0 +1,706 @@
1
+ import os
2
+
3
+ import gradio as gr
4
+ import pandas as pd
5
+ import plotly.express as px
6
+ import plotly.graph_objects as go
7
+ from apscheduler.schedulers.background import BackgroundScheduler
8
+ from huggingface_hub import snapshot_download
9
+
10
+ from src.about import (
11
+ BOTTOM_LOGO,
12
+ CITATION_BUTTON_LABEL,
13
+ CITATION_BUTTON_LABEL_JA,
14
+ CITATION_BUTTON_TEXT,
15
+ EVALUATION_QUEUE_TEXT,
16
+ EVALUATION_QUEUE_TEXT_JA,
17
+ INTRODUCTION_TEXT,
18
+ INTRODUCTION_TEXT_JA,
19
+ LLM_BENCHMARKS_TEXT,
20
+ LLM_BENCHMARKS_TEXT_JA,
21
+ TITLE,
22
+ TaskType,
23
+ )
24
+ from src.display.utils import (
25
+ BENCHMARK_COLS,
26
+ COLS,
27
+ EVAL_COLS,
28
+ EVAL_TYPES,
29
+ NUMERIC_INTERVALS,
30
+ TYPES,
31
+ AddSpecialTokens,
32
+ AutoEvalColumn,
33
+ LLMJpEvalVersion,
34
+ ModelType,
35
+ NumFewShots,
36
+ Precision,
37
+ VllmVersion,
38
+ fields,
39
+ )
40
+ from src.envs import API, CONTENTS_REPO, EVAL_REQUESTS_PATH, QUEUE_REPO, REPO_ID
41
+ from src.i18n import (
42
+ CITATION_ACCORDION_LABEL,
43
+ CITATION_ACCORDION_LABEL_JA,
44
+ SELECT_ALL_BUTTON_LABEL,
45
+ SELECT_ALL_BUTTON_LABEL_JA,
46
+ SELECT_AVG_ONLY_BUTTON_LABEL,
47
+ SELECT_AVG_ONLY_BUTTON_LABEL_JA,
48
+ SELECT_NONE_BUTTON_LABEL,
49
+ SELECT_NONE_BUTTON_LABEL_JA,
50
+ )
51
+ from src.populate import get_evaluation_queue_df, get_leaderboard_df
52
+ from src.submission.submit import add_new_eval
53
+
54
+
55
+ def restart_space() -> None:
56
+ API.restart_space(repo_id=REPO_ID)
57
+
58
+
59
+ # Space initialization
60
+ try:
61
+ snapshot_download(
62
+ repo_id=QUEUE_REPO,
63
+ local_dir=EVAL_REQUESTS_PATH,
64
+ repo_type="dataset",
65
+ tqdm_class=None,
66
+ etag_timeout=30,
67
+ )
68
+ except Exception:
69
+ restart_space()
70
+
71
+
72
+ # Get dataframes
73
+
74
+ (
75
+ FINISHED_EVAL_QUEUE_DF,
76
+ RUNNING_EVAL_QUEUE_DF,
77
+ PENDING_EVAL_QUEUE_DF,
78
+ FAILED_EVAL_QUEUE_DF,
79
+ ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
80
+
81
+ try:
82
+ ORIGINAL_DF = get_leaderboard_df(CONTENTS_REPO, COLS, BENCHMARK_COLS)
83
+ except Exception as e:
84
+ print(f"Error getting leaderboard df: {e}")
85
+ ORIGINAL_DF = pd.DataFrame()
86
+
87
+
88
+ # Searching and filtering
89
+
90
+
91
+ def filter_models(
92
+ df: pd.DataFrame,
93
+ type_query: list[str],
94
+ size_query: list[str],
95
+ precision_query: list[str],
96
+ add_special_tokens_query: list[str],
97
+ num_few_shots_query: list[int],
98
+ version_query: list[str],
99
+ vllm_query: list[str],
100
+ ) -> pd.DataFrame:
101
+ # Filter by model type
102
+ type_emoji = [t.split()[0] for t in type_query]
103
+ df = df[df["T"].isin(type_emoji)]
104
+
105
+ # Filter by precision
106
+ df = df[df["Precision"].isin(precision_query)]
107
+
108
+ # Filter by model size
109
+ # Note: When `df` is empty, `size_mask` is empty, and the shape of `df[size_mask]` becomes (0, 0),
110
+ # so we need to check the length of `df` before applying the filter.
111
+ if len(df) > 0:
112
+ size_mask = df["#Params (B)"].apply(
113
+ lambda x: any(x in NUMERIC_INTERVALS[s] for s in size_query if s != "Unknown")
114
+ )
115
+ if "Unknown" in size_query:
116
+ size_mask |= df["#Params (B)"].isna() | (df["#Params (B)"] == 0)
117
+ df = df[size_mask]
118
+
119
+ # Filter by special tokens setting
120
+ df = df[df["Add Special Tokens"].isin(add_special_tokens_query)]
121
+
122
+ # Filter by number of few-shot examples
123
+ df = df[df["Few-shot"].isin(num_few_shots_query)]
124
+
125
+ # Filter by evaluator version
126
+ df = df[df["llm-jp-eval version"].isin(version_query)]
127
+
128
+ # Filter by vLLM version
129
+ df = df[df["vllm version"].isin(vllm_query)]
130
+
131
+ return df
132
+
133
+
134
+ def search_model_by_name(df: pd.DataFrame, model_name: str) -> pd.DataFrame:
135
+ return df[df[AutoEvalColumn.dummy.name].str.contains(model_name, case=False)]
136
+
137
+
138
+ def search_models_by_multiple_names(df: pd.DataFrame, search_text: str) -> pd.DataFrame:
139
+ if not search_text:
140
+ return df
141
+ model_names = [name.strip() for name in search_text.split(";")]
142
+ dfs = [search_model_by_name(df, name) for name in model_names if name]
143
+ return pd.concat(dfs).drop_duplicates(subset=AutoEvalColumn.row_id.name)
144
+
145
+
146
+ def select_columns(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
147
+ always_here_cols = [
148
+ AutoEvalColumn.model_type_symbol.name, # 'T'
149
+ AutoEvalColumn.model.name, # 'Model'
150
+ ]
151
+
152
+ # Remove 'always_here_cols' from 'columns' to avoid duplicates
153
+ columns = [c for c in columns if c not in always_here_cols]
154
+ new_columns = (
155
+ always_here_cols + [c for c in COLS if c in df.columns and c in columns] + [AutoEvalColumn.row_id.name]
156
+ )
157
+
158
+ # Maintain order while removing duplicates
159
+ seen = set()
160
+ unique_columns = []
161
+ for c in new_columns:
162
+ if c not in seen:
163
+ unique_columns.append(c)
164
+ seen.add(c)
165
+
166
+ # Create DataFrame with filtered columns
167
+ filtered_df = df[unique_columns]
168
+ return filtered_df
169
+
170
+
171
+ def update_table(
172
+ type_query: list[str],
173
+ precision_query: list[str],
174
+ size_query: list[str],
175
+ add_special_tokens_query: list[str],
176
+ num_few_shots_query: list[int],
177
+ version_query: list[str],
178
+ vllm_query: list[str],
179
+ query: str,
180
+ *columns,
181
+ ) -> pd.DataFrame:
182
+ columns = [item for column in columns for item in column]
183
+ df = filter_models(
184
+ ORIGINAL_DF,
185
+ type_query,
186
+ size_query,
187
+ precision_query,
188
+ add_special_tokens_query,
189
+ num_few_shots_query,
190
+ version_query,
191
+ vllm_query,
192
+ )
193
+ df = search_models_by_multiple_names(df, query)
194
+ df = select_columns(df, columns)
195
+ return df
196
+
197
+
198
+ # Prepare the dataframes
199
+
200
+
201
+ INITIAL_COLUMNS = ["T"] + [
202
+ c.name for c in fields(AutoEvalColumn) if (c.never_hidden or c.displayed_by_default) and c.name != "T"
203
+ ]
204
+ leaderboard_df = ORIGINAL_DF.copy()
205
+ if len(leaderboard_df) > 0:
206
+ leaderboard_df = filter_models(
207
+ leaderboard_df,
208
+ [t.to_str(" : ") for t in ModelType],
209
+ list(NUMERIC_INTERVALS.keys()),
210
+ [i.value.name for i in Precision],
211
+ [i.value.name for i in AddSpecialTokens],
212
+ [i.value for i in NumFewShots],
213
+ [i.value.name for i in LLMJpEvalVersion],
214
+ [i.value.name for i in VllmVersion],
215
+ )
216
+ leaderboard_df = select_columns(leaderboard_df, INITIAL_COLUMNS)
217
+ else:
218
+ leaderboard_df = pd.DataFrame(columns=INITIAL_COLUMNS)
219
+
220
+ # Leaderboard demo
221
+
222
+
223
+ def toggle_all_categories(action: str) -> list[gr.CheckboxGroup]:
224
+ """Function to control all category checkboxes at once"""
225
+ results = []
226
+ for task_type in TaskType:
227
+ if task_type == TaskType.NotTask:
228
+ # Maintain existing selection for Model details
229
+ results.append(gr.CheckboxGroup())
230
+ elif action == "all":
231
+ # Select all
232
+ results.append(
233
+ gr.CheckboxGroup(
234
+ value=[
235
+ c.name
236
+ for c in fields(AutoEvalColumn)
237
+ if not c.hidden and not c.never_hidden and not c.dummy and c.task_type == task_type
238
+ ]
239
+ )
240
+ )
241
+ elif action == "none":
242
+ # Deselect all
243
+ results.append(gr.CheckboxGroup(value=[]))
244
+ elif action == "avg_only":
245
+ # Select only AVG metrics
246
+ results.append(
247
+ gr.CheckboxGroup(
248
+ value=[
249
+ c.name
250
+ for c in fields(AutoEvalColumn)
251
+ if not c.hidden
252
+ and not c.never_hidden
253
+ and c.task_type == task_type
254
+ and ((task_type == TaskType.AVG) or (task_type != TaskType.AVG and c.average))
255
+ ]
256
+ )
257
+ )
258
+ return results
259
+
260
+
261
+ TASK_AVG_NAME_MAP = {
262
+ c.name: c.task_type.name for c in fields(AutoEvalColumn) if c.average and c.task_type != TaskType.AVG
263
+ }
264
+ AVG_COLUMNS = ["AVG"] + list(TASK_AVG_NAME_MAP.keys())
265
+
266
+
267
+ def plot_size_vs_score(df_filtered: pd.DataFrame) -> go.Figure:
268
+ df = ORIGINAL_DF[ORIGINAL_DF[AutoEvalColumn.row_id.name].isin(df_filtered[AutoEvalColumn.row_id.name])]
269
+ df = df[df["#Params (B)"] > 0]
270
+ df = df[["model_name_for_query", "#Params (B)", "Few-shot"] + AVG_COLUMNS]
271
+ df = df.rename(columns={"model_name_for_query": "Model", "Few-shot": "n-shot"})
272
+ df["model_name_without_org_name"] = df["Model"].str.split("/").str[-1] + " (" + df["n-shot"].astype(str) + "-shot)"
273
+ df = pd.melt(
274
+ df,
275
+ id_vars=["Model", "model_name_without_org_name", "#Params (B)", "n-shot"],
276
+ value_vars=AVG_COLUMNS,
277
+ var_name="Category",
278
+ value_name="Score",
279
+ )
280
+ max_model_size = df["#Params (B)"].max()
281
+ fig = px.scatter(
282
+ df,
283
+ x="#Params (B)",
284
+ y="Score",
285
+ text="model_name_without_org_name",
286
+ color="Category",
287
+ hover_data=["Model", "n-shot", "Category"],
288
+ )
289
+ fig.update_traces(
290
+ hovertemplate="<b>%{customdata[0]}</b><br>#Params: %{x:.2f}B<br>n-shot: %{customdata[1]}<br>%{customdata[2]}: %{y:.4f}<extra></extra>",
291
+ textposition="top right",
292
+ mode="markers",
293
+ )
294
+ for trace in fig.data:
295
+ if trace.name != "AVG":
296
+ trace.visible = "legendonly"
297
+ fig.update_layout(xaxis_range=[0, max_model_size * 1.2], yaxis_range=[0, 1])
298
+ fig.update_layout(
299
+ updatemenus=[
300
+ dict(
301
+ type="buttons",
302
+ direction="left",
303
+ showactive=True,
304
+ buttons=[
305
+ dict(label="Hide Labels", method="update", args=[{"mode": ["markers"]}]),
306
+ dict(label="Show Labels", method="update", args=[{"mode": ["markers+text"]}]),
307
+ ],
308
+ x=0.5,
309
+ y=-0.2,
310
+ xanchor="center",
311
+ yanchor="top",
312
+ )
313
+ ]
314
+ )
315
+ return fig
316
+
317
+
318
+ def plot_average_scores(df_filtered: pd.DataFrame) -> go.Figure:
319
+ df = ORIGINAL_DF[ORIGINAL_DF[AutoEvalColumn.row_id.name].isin(df_filtered[AutoEvalColumn.row_id.name])]
320
+ df = df[["model_name_for_query", "Few-shot"] + list(TASK_AVG_NAME_MAP.keys())]
321
+ df = df.rename(columns={"model_name_for_query": "Model", "Few-shot": "n-shot"})
322
+ df = df.rename(columns=TASK_AVG_NAME_MAP)
323
+ df = df.set_index(["Model", "n-shot"])
324
+
325
+ fig = go.Figure()
326
+ for i, ((name, n_shot), row) in enumerate(df.iterrows()):
327
+ visible = True if i < 2 else "legendonly" # Display only the first 2 models
328
+ fig.add_trace(
329
+ go.Scatterpolar(
330
+ r=row.values,
331
+ theta=row.index,
332
+ fill="toself",
333
+ name=f"{name} ({n_shot}-shot)",
334
+ hovertemplate="%{theta}: %{r}",
335
+ visible=visible,
336
+ )
337
+ )
338
+ fig.update_layout(
339
+ polar={
340
+ "radialaxis": {"range": [0, 1]},
341
+ },
342
+ showlegend=True,
343
+ )
344
+ return fig
345
+
346
+
347
+ shown_columns_dict: dict[str, gr.CheckboxGroup] = {}
348
+ checkboxes: list[gr.CheckboxGroup] = []
349
+
350
+ with gr.Blocks() as demo_leaderboard:
351
+ with gr.Row():
352
+ search_bar = gr.Textbox(
353
+ placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
354
+ show_label=False,
355
+ elem_id="search-bar",
356
+ )
357
+ with gr.Accordion("Column Filter", open=True):
358
+ with gr.Row():
359
+ with gr.Row():
360
+ select_all_button = gr.Button(SELECT_ALL_BUTTON_LABEL_JA, size="sm")
361
+ select_none_button = gr.Button(SELECT_NONE_BUTTON_LABEL_JA, size="sm")
362
+ select_avg_only_button = gr.Button(SELECT_AVG_ONLY_BUTTON_LABEL_JA, size="sm")
363
+
364
+ for task_type in TaskType:
365
+ label = "Model details" if task_type == TaskType.NotTask else task_type.value
366
+ with gr.Accordion(label, open=True, elem_classes="accordion"):
367
+ with gr.Row(height=110):
368
+ shown_column = gr.CheckboxGroup(
369
+ show_label=False,
370
+ choices=[
371
+ c.name
372
+ for c in fields(AutoEvalColumn)
373
+ if not c.hidden and not c.never_hidden and not c.dummy and c.task_type == task_type
374
+ ],
375
+ value=[
376
+ c.name
377
+ for c in fields(AutoEvalColumn)
378
+ if c.displayed_by_default
379
+ and not c.hidden
380
+ and not c.never_hidden
381
+ and c.task_type == task_type
382
+ ],
383
+ elem_id="column-select",
384
+ container=False,
385
+ )
386
+ shown_columns_dict[task_type.name] = shown_column
387
+ checkboxes.append(shown_column)
388
+
389
+ with gr.Accordion("Model Filter", open=True):
390
+ with gr.Row():
391
+ filter_columns_type = gr.CheckboxGroup(
392
+ label="Model types",
393
+ choices=[t.to_str() for t in ModelType],
394
+ value=[t.to_str() for t in ModelType],
395
+ elem_id="filter-columns-type",
396
+ )
397
+ filter_columns_precision = gr.CheckboxGroup(
398
+ label="Precision",
399
+ choices=[i.value.name for i in Precision],
400
+ value=[i.value.name for i in Precision],
401
+ elem_id="filter-columns-precision",
402
+ )
403
+ filter_columns_size = gr.CheckboxGroup(
404
+ label="Model sizes (in billions of parameters)",
405
+ choices=list(NUMERIC_INTERVALS.keys()),
406
+ value=list(NUMERIC_INTERVALS.keys()),
407
+ elem_id="filter-columns-size",
408
+ )
409
+ filter_columns_add_special_tokens = gr.CheckboxGroup(
410
+ label="Add Special Tokens",
411
+ choices=[i.value.name for i in AddSpecialTokens],
412
+ value=[i.value.name for i in AddSpecialTokens],
413
+ elem_id="filter-columns-add-special-tokens",
414
+ )
415
+ filter_columns_num_few_shots = gr.CheckboxGroup(
416
+ label="Num Few Shots",
417
+ choices=[i.value for i in NumFewShots],
418
+ value=[i.value for i in NumFewShots],
419
+ elem_id="filter-columns-num-few-shots",
420
+ )
421
+ filter_columns_version = gr.CheckboxGroup(
422
+ label="llm-jp-eval version",
423
+ choices=[i.value.name for i in LLMJpEvalVersion],
424
+ value=[i.value.name for i in LLMJpEvalVersion],
425
+ elem_id="filter-columns-version",
426
+ )
427
+ filter_columns_vllm = gr.CheckboxGroup(
428
+ label="vllm version",
429
+ choices=[i.value.name for i in VllmVersion],
430
+ value=[i.value.name for i in VllmVersion],
431
+ elem_id="filter-columns-vllm",
432
+ )
433
+
434
+ leaderboard_table = gr.Dataframe(
435
+ value=leaderboard_df,
436
+ headers=INITIAL_COLUMNS,
437
+ datatype=TYPES,
438
+ elem_id="leaderboard-table",
439
+ interactive=False,
440
+ visible=True,
441
+ )
442
+
443
+ graph_size_vs_score = gr.Plot(label="Size vs. Score")
444
+ graph_average_scores = gr.Plot(label="Performance across Task Categories")
445
+
446
+ select_all_button.click(
447
+ fn=lambda: toggle_all_categories("all"),
448
+ outputs=checkboxes,
449
+ api_name=False,
450
+ queue=False,
451
+ )
452
+ select_none_button.click(
453
+ fn=lambda: toggle_all_categories("none"),
454
+ outputs=checkboxes,
455
+ api_name=False,
456
+ queue=False,
457
+ )
458
+ select_avg_only_button.click(
459
+ fn=lambda: toggle_all_categories("avg_only"),
460
+ outputs=checkboxes,
461
+ api_name=False,
462
+ queue=False,
463
+ )
464
+
465
+ gr.on(
466
+ triggers=[
467
+ filter_columns_type.change,
468
+ filter_columns_precision.change,
469
+ filter_columns_size.change,
470
+ filter_columns_add_special_tokens.change,
471
+ filter_columns_num_few_shots.change,
472
+ filter_columns_version.change,
473
+ filter_columns_vllm.change,
474
+ search_bar.submit,
475
+ ]
476
+ + [shown_columns.change for shown_columns in shown_columns_dict.values()],
477
+ fn=update_table,
478
+ inputs=[
479
+ filter_columns_type,
480
+ filter_columns_precision,
481
+ filter_columns_size,
482
+ filter_columns_add_special_tokens,
483
+ filter_columns_num_few_shots,
484
+ filter_columns_version,
485
+ filter_columns_vllm,
486
+ search_bar,
487
+ ]
488
+ + list(shown_columns_dict.values()),
489
+ outputs=leaderboard_table,
490
+ )
491
+
492
+ leaderboard_table.change(
493
+ fn=plot_size_vs_score,
494
+ inputs=leaderboard_table,
495
+ outputs=graph_size_vs_score,
496
+ api_name=False,
497
+ queue=False,
498
+ )
499
+
500
+ leaderboard_table.change(
501
+ fn=plot_average_scores,
502
+ inputs=leaderboard_table,
503
+ outputs=graph_average_scores,
504
+ api_name=False,
505
+ queue=False,
506
+ )
507
+
508
+
509
+ # Submission demo
510
+
511
+ with gr.Blocks() as demo_submission:
512
+ with gr.Column():
513
+ with gr.Row():
514
+ evaluation_queue_text = gr.Markdown(EVALUATION_QUEUE_TEXT_JA, elem_classes="markdown-text")
515
+
516
+ with gr.Column():
517
+ with gr.Accordion(
518
+ f"✅ Finished Evaluations ({len(FINISHED_EVAL_QUEUE_DF)})",
519
+ open=False,
520
+ ):
521
+ with gr.Row():
522
+ finished_eval_table = gr.Dataframe(
523
+ value=FINISHED_EVAL_QUEUE_DF,
524
+ headers=EVAL_COLS,
525
+ datatype=EVAL_TYPES,
526
+ row_count=5,
527
+ )
528
+ with gr.Accordion(
529
+ f"🔄 Running Evaluation Queue ({len(RUNNING_EVAL_QUEUE_DF)})",
530
+ open=False,
531
+ ):
532
+ with gr.Row():
533
+ running_eval_table = gr.Dataframe(
534
+ value=RUNNING_EVAL_QUEUE_DF,
535
+ headers=EVAL_COLS,
536
+ datatype=EVAL_TYPES,
537
+ row_count=5,
538
+ )
539
+
540
+ with gr.Accordion(
541
+ f"⏳ Pending Evaluation Queue ({len(PENDING_EVAL_QUEUE_DF)})",
542
+ open=False,
543
+ ):
544
+ with gr.Row():
545
+ pending_eval_table = gr.Dataframe(
546
+ value=PENDING_EVAL_QUEUE_DF,
547
+ headers=EVAL_COLS,
548
+ datatype=EVAL_TYPES,
549
+ row_count=5,
550
+ )
551
+ with gr.Accordion(
552
+ f"❎ Failed Evaluation Queue ({len(FAILED_EVAL_QUEUE_DF)})",
553
+ open=False,
554
+ ):
555
+ with gr.Row():
556
+ failed_eval_table = gr.Dataframe(
557
+ value=FAILED_EVAL_QUEUE_DF,
558
+ headers=EVAL_COLS,
559
+ datatype=EVAL_TYPES,
560
+ row_count=5,
561
+ )
562
+ with gr.Row():
563
+ gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
564
+
565
+ with gr.Row():
566
+ with gr.Column():
567
+ model_name_textbox = gr.Textbox(label="Model name")
568
+ revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
569
+ model_type = gr.Dropdown(
570
+ label="Model type",
571
+ choices=[t.to_str(" : ") for t in ModelType],
572
+ multiselect=False,
573
+ value=None,
574
+ )
575
+
576
+ with gr.Column():
577
+ precision = gr.Dropdown(
578
+ label="Precision",
579
+ choices=[i.value.name for i in Precision] + ["auto"],
580
+ multiselect=False,
581
+ value="auto",
582
+ )
583
+ add_special_tokens = gr.Dropdown(
584
+ label="AddSpecialTokens",
585
+ choices=[i.value.name for i in AddSpecialTokens],
586
+ multiselect=False,
587
+ value="False",
588
+ )
589
+
590
+ submit_button = gr.Button("Submit Eval")
591
+ submission_result = gr.Markdown()
592
+ submit_button.click(
593
+ fn=add_new_eval,
594
+ inputs=[
595
+ model_name_textbox,
596
+ revision_name_textbox,
597
+ precision,
598
+ model_type,
599
+ add_special_tokens,
600
+ ],
601
+ outputs=submission_result,
602
+ )
603
+
604
+
605
+ # Main demo
606
+
607
+
608
+ def set_default_language(request: gr.Request) -> gr.Radio:
609
+ if request.headers["Accept-Language"].split(",")[0].lower().startswith("ja"):
610
+ return gr.Radio(value="🇯🇵 JA")
611
+ else:
612
+ return gr.Radio(value="🇺🇸 EN")
613
+
614
+
615
+ def update_language(
616
+ language: str,
617
+ ) -> tuple[
618
+ gr.Markdown, # introduction_text
619
+ gr.Markdown, # llm_benchmarks_text
620
+ gr.Markdown, # evaluation_queue_text
621
+ gr.Textbox, # citation_button
622
+ gr.Button, # select_all_button
623
+ gr.Button, # select_none_button
624
+ gr.Button, # select_avg_only_button
625
+ gr.Accordion, # citation_accordion
626
+ ]:
627
+ if language == "🇯🇵 JA":
628
+ return (
629
+ gr.Markdown(value=INTRODUCTION_TEXT_JA),
630
+ gr.Markdown(value=LLM_BENCHMARKS_TEXT_JA),
631
+ gr.Markdown(value=EVALUATION_QUEUE_TEXT_JA),
632
+ gr.Textbox(label=CITATION_BUTTON_LABEL_JA),
633
+ gr.Button(value=SELECT_ALL_BUTTON_LABEL_JA),
634
+ gr.Button(value=SELECT_NONE_BUTTON_LABEL_JA),
635
+ gr.Button(value=SELECT_AVG_ONLY_BUTTON_LABEL_JA),
636
+ gr.Accordion(label=CITATION_ACCORDION_LABEL_JA),
637
+ )
638
+ else:
639
+ return (
640
+ gr.Markdown(value=INTRODUCTION_TEXT),
641
+ gr.Markdown(value=LLM_BENCHMARKS_TEXT),
642
+ gr.Markdown(value=EVALUATION_QUEUE_TEXT),
643
+ gr.Textbox(label=CITATION_BUTTON_LABEL),
644
+ gr.Button(value=SELECT_ALL_BUTTON_LABEL),
645
+ gr.Button(value=SELECT_NONE_BUTTON_LABEL),
646
+ gr.Button(value=SELECT_AVG_ONLY_BUTTON_LABEL),
647
+ gr.Accordion(label=CITATION_ACCORDION_LABEL),
648
+ )
649
+
650
+
651
+ with gr.Blocks(css_paths="style.css", theme=gr.themes.Glass()) as demo:
652
+ gr.HTML(TITLE)
653
+ introduction_text = gr.Markdown(INTRODUCTION_TEXT_JA, elem_classes="markdown-text")
654
+
655
+ with gr.Tabs() as tabs:
656
+ with gr.Tab("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table"):
657
+ demo_leaderboard.render()
658
+
659
+ with gr.Tab("📝 About", elem_id="llm-benchmark-tab-about"):
660
+ llm_benchmarks_text = gr.Markdown(LLM_BENCHMARKS_TEXT_JA, elem_classes="markdown-text")
661
+
662
+ with gr.Tab("🚀 Submit here! ", elem_id="llm-benchmark-tab-submit"):
663
+ demo_submission.render()
664
+
665
+ with gr.Row():
666
+ with gr.Accordion(CITATION_ACCORDION_LABEL_JA, open=False) as citation_accordion:
667
+ citation_button = gr.Textbox(
668
+ label=CITATION_BUTTON_LABEL_JA,
669
+ value=CITATION_BUTTON_TEXT,
670
+ lines=20,
671
+ elem_id="citation-button",
672
+ show_copy_button=True,
673
+ )
674
+ gr.HTML(BOTTOM_LOGO)
675
+
676
+ language = gr.Radio(
677
+ choices=["🇯🇵 JA", "🇺🇸 EN"],
678
+ value="🇯🇵 JA",
679
+ elem_classes="language-selector",
680
+ show_label=False,
681
+ container=False,
682
+ )
683
+
684
+ demo.load(fn=set_default_language, outputs=language)
685
+ language.change(
686
+ fn=update_language,
687
+ inputs=language,
688
+ outputs=[
689
+ introduction_text,
690
+ llm_benchmarks_text,
691
+ evaluation_queue_text,
692
+ citation_button,
693
+ select_all_button,
694
+ select_none_button,
695
+ select_avg_only_button,
696
+ citation_accordion,
697
+ ],
698
+ api_name=False,
699
+ )
700
+
701
+ if __name__ == "__main__":
702
+ if os.getenv("SPACE_ID"):
703
+ scheduler = BackgroundScheduler()
704
+ scheduler.add_job(restart_space, "interval", seconds=1800)
705
+ scheduler.start()
706
+ demo.queue(default_concurrency_limit=40).launch()
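One detail of app.py worth calling out: the search box accepts several model names separated by `;`, and `search_models_by_multiple_names` splits the query, runs a case-insensitive substring match per name, then de-duplicates the concatenated hits on the row id. A stand-alone toy reproduction of that logic (the column names below are placeholders, not the leaderboard's real `AutoEvalColumn` fields):

```python
import pandas as pd

# Toy frame standing in for the leaderboard table.
df = pd.DataFrame(
    {
        "row_id": [0, 1, 2, 3],
        "model": ["llm-jp/llm-jp-3-13b", "org/awesome-7b", "org/awesome-13b", "other/base-7b"],
    }
)


def search(table: pd.DataFrame, query: str) -> pd.DataFrame:
    """Mimic search_models_by_multiple_names: split on ';', match each name
    case-insensitively, concatenate the hits, drop duplicate rows."""
    if not query:
        return table
    names = [n.strip() for n in query.split(";")]
    hits = [table[table["model"].str.contains(n, case=False)] for n in names if n]
    return pd.concat(hits).drop_duplicates(subset="row_id")


print(search(df, "awesome; llm-jp"))  # matches rows 1, 2 and 0, no duplicates
```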
pyproject.toml ADDED
@@ -0,0 +1,41 @@
1
+ [project]
2
+ name = "open-japanese-llm-leaderboard"
3
+ version = "0.1.0"
4
+ description = ""
5
+ readme = "README.md"
6
+ requires-python = ">=3.10"
7
+ dependencies = [
8
+ "apscheduler>=3.10.4",
9
+ "datasets>=3.2.0",
10
+ "gradio>=5.9.1",
11
+ "hf-transfer>=0.1.8",
12
+ "plotly>=5.24.1",
13
+ "torch>=2.5.1",
14
+ "transformers>=4.47.1",
15
+ ]
16
+
17
+ [tool.ruff]
18
+ line-length = 119
19
+
20
+ [tool.ruff.lint]
21
+ select = [
22
+ "ARG", # Check function argument usage
23
+ "B", # Common bugs and design problems (from flake8-bugbear)
24
+ "C", # Complexity checks (from mccabe)
25
+ "E", # PEP 8 errors (from pycodestyle)
26
+ "F", # Pyflakes errors (basic Python errors)
27
+ "I", # Import sorting and formatting
28
+ "N", # Naming conventions (from pep8-naming)
29
+ "PL", # Pylint rules
30
+ "S101", # Use of assert statements (from flake8-bandit)
31
+ "SIM", # Code simplification suggestions
32
+ "UP", # Python upgrade suggestions
33
+ "W", # PEP 8 warnings (from pycodestyle)
34
+ ]
35
+ ignore = [
36
+ "E501", # Line too long (> 79 characters)
37
+ "SIM117", # Use a single 'with' statement with multiple contexts instead of nested 'with' statements
38
+ ]
39
+
40
+ [tool.ruff.format]
41
+ docstring-code-format = true
requirements.txt ADDED
@@ -0,0 +1,277 @@
1
+ # This file was autogenerated by uv via the following command:
2
+ # uv pip compile pyproject.toml -o requirements.txt
3
+ aiofiles==23.2.1
4
+ # via gradio
5
+ aiohappyeyeballs==2.4.4
6
+ # via aiohttp
7
+ aiohttp==3.11.11
8
+ # via
9
+ # datasets
10
+ # fsspec
11
+ aiosignal==1.3.2
12
+ # via aiohttp
13
+ annotated-types==0.7.0
14
+ # via pydantic
15
+ anyio==4.7.0
16
+ # via
17
+ # gradio
18
+ # httpx
19
+ # starlette
20
+ apscheduler==3.11.0
21
+ # via open-japanese-llm-leaderboard (pyproject.toml)
22
+ async-timeout==5.0.1
23
+ # via aiohttp
24
+ attrs==24.3.0
25
+ # via aiohttp
26
+ certifi==2024.12.14
27
+ # via
28
+ # httpcore
29
+ # httpx
30
+ # requests
31
+ charset-normalizer==3.4.0
32
+ # via requests
33
+ click==8.1.8
34
+ # via
35
+ # typer
36
+ # uvicorn
37
+ datasets==3.2.0
38
+ # via open-japanese-llm-leaderboard (pyproject.toml)
39
+ dill==0.3.8
40
+ # via
41
+ # datasets
42
+ # multiprocess
43
+ exceptiongroup==1.2.2
44
+ # via anyio
45
+ fastapi==0.115.6
46
+ # via gradio
47
+ ffmpy==0.5.0
48
+ # via gradio
49
+ filelock==3.16.1
50
+ # via
51
+ # datasets
52
+ # huggingface-hub
53
+ # torch
54
+ # transformers
55
+ # triton
56
+ frozenlist==1.5.0
57
+ # via
58
+ # aiohttp
59
+ # aiosignal
60
+ fsspec==2024.9.0
61
+ # via
62
+ # datasets
63
+ # gradio-client
64
+ # huggingface-hub
65
+ # torch
66
+ gradio==5.9.1
67
+ # via open-japanese-llm-leaderboard (pyproject.toml)
68
+ gradio-client==1.5.2
69
+ # via gradio
70
+ h11==0.14.0
71
+ # via
72
+ # httpcore
73
+ # uvicorn
74
+ hf-transfer==0.1.8
75
+ # via open-japanese-llm-leaderboard (pyproject.toml)
76
+ httpcore==1.0.7
77
+ # via httpx
78
+ httpx==0.28.1
79
+ # via
80
+ # gradio
81
+ # gradio-client
82
+ # safehttpx
83
+ huggingface-hub==0.27.0
84
+ # via
85
+ # datasets
86
+ # gradio
87
+ # gradio-client
88
+ # tokenizers
89
+ # transformers
90
+ idna==3.10
91
+ # via
92
+ # anyio
93
+ # httpx
94
+ # requests
95
+ # yarl
96
+ jinja2==3.1.5
97
+ # via
98
+ # gradio
99
+ # torch
100
+ markdown-it-py==3.0.0
101
+ # via rich
102
+ markupsafe==2.1.5
103
+ # via
104
+ # gradio
105
+ # jinja2
106
+ mdurl==0.1.2
107
+ # via markdown-it-py
108
+ mpmath==1.3.0
109
+ # via sympy
110
+ multidict==6.1.0
111
+ # via
112
+ # aiohttp
113
+ # yarl
114
+ multiprocess==0.70.16
115
+ # via datasets
116
+ networkx==3.4.2
117
+ # via torch
118
+ numpy==2.2.1
119
+ # via
120
+ # datasets
121
+ # gradio
122
+ # pandas
123
+ # transformers
124
+ nvidia-cublas-cu12==12.4.5.8
125
+ # via
126
+ # nvidia-cudnn-cu12
127
+ # nvidia-cusolver-cu12
128
+ # torch
129
+ nvidia-cuda-cupti-cu12==12.4.127
130
+ # via torch
131
+ nvidia-cuda-nvrtc-cu12==12.4.127
132
+ # via torch
133
+ nvidia-cuda-runtime-cu12==12.4.127
134
+ # via torch
135
+ nvidia-cudnn-cu12==9.1.0.70
136
+ # via torch
137
+ nvidia-cufft-cu12==11.2.1.3
138
+ # via torch
139
+ nvidia-curand-cu12==10.3.5.147
140
+ # via torch
141
+ nvidia-cusolver-cu12==11.6.1.9
142
+ # via torch
143
+ nvidia-cusparse-cu12==12.3.1.170
144
+ # via
145
+ # nvidia-cusolver-cu12
146
+ # torch
147
+ nvidia-nccl-cu12==2.21.5
148
+ # via torch
149
+ nvidia-nvjitlink-cu12==12.4.127
150
+ # via
151
+ # nvidia-cusolver-cu12
152
+ # nvidia-cusparse-cu12
153
+ # torch
154
+ nvidia-nvtx-cu12==12.4.127
155
+ # via torch
156
+ orjson==3.10.12
157
+ # via gradio
158
+ packaging==24.2
159
+ # via
160
+ # datasets
161
+ # gradio
162
+ # gradio-client
163
+ # huggingface-hub
164
+ # plotly
165
+ # transformers
166
+ pandas==2.2.3
167
+ # via
168
+ # datasets
169
+ # gradio
170
+ pillow==11.0.0
171
+ # via gradio
172
+ plotly==5.24.1
173
+ # via open-japanese-llm-leaderboard (pyproject.toml)
174
+ propcache==0.2.1
175
+ # via
176
+ # aiohttp
177
+ # yarl
178
+ pyarrow==18.1.0
179
+ # via datasets
180
+ pydantic==2.10.4
181
+ # via
182
+ # fastapi
183
+ # gradio
184
+ pydantic-core==2.27.2
185
+ # via pydantic
186
+ pydub==0.25.1
187
+ # via gradio
188
+ pygments==2.18.0
189
+ # via rich
190
+ python-dateutil==2.9.0.post0
191
+ # via pandas
192
+ python-multipart==0.0.20
193
+ # via gradio
194
+ pytz==2024.2
195
+ # via pandas
196
+ pyyaml==6.0.2
197
+ # via
198
+ # datasets
199
+ # gradio
200
+ # huggingface-hub
201
+ # transformers
202
+ regex==2024.11.6
203
+ # via transformers
204
+ requests==2.32.3
205
+ # via
206
+ # datasets
207
+ # huggingface-hub
208
+ # transformers
209
+ rich==13.9.4
210
+ # via typer
211
+ ruff==0.8.4
212
+ # via gradio
213
+ safehttpx==0.1.6
214
+ # via gradio
215
+ safetensors==0.4.5
216
+ # via transformers
217
+ semantic-version==2.10.0
218
+ # via gradio
219
+ shellingham==1.5.4
220
+ # via typer
221
+ six==1.17.0
222
+ # via python-dateutil
223
+ sniffio==1.3.1
224
+ # via anyio
225
+ starlette==0.41.3
226
+ # via
227
+ # fastapi
228
+ # gradio
229
+ sympy==1.13.1
230
+ # via torch
231
+ tenacity==9.0.0
232
+ # via plotly
233
+ tokenizers==0.21.0
234
+ # via transformers
235
+ tomlkit==0.13.2
236
+ # via gradio
237
+ torch==2.5.1
238
+ # via open-japanese-llm-leaderboard (pyproject.toml)
239
+ tqdm==4.67.1
240
+ # via
241
+ # datasets
242
+ # huggingface-hub
243
+ # transformers
244
+ transformers==4.47.1
245
+ # via open-japanese-llm-leaderboard (pyproject.toml)
246
+ triton==3.1.0
247
+ # via torch
248
+ typer==0.15.1
249
+ # via gradio
250
+ typing-extensions==4.12.2
251
+ # via
252
+ # anyio
253
+ # fastapi
254
+ # gradio
255
+ # gradio-client
256
+ # huggingface-hub
257
+ # multidict
258
+ # pydantic
259
+ # pydantic-core
260
+ # rich
261
+ # torch
262
+ # typer
263
+ # uvicorn
264
+ tzdata==2024.2
265
+ # via pandas
266
+ tzlocal==5.2
267
+ # via apscheduler
268
+ urllib3==2.3.0
269
+ # via requests
270
+ uvicorn==0.34.0
271
+ # via gradio
272
+ websockets==14.1
273
+ # via gradio-client
274
+ xxhash==3.5.0
275
+ # via datasets
276
+ yarl==1.18.3
277
+ # via aiohttp
src/Logos-HQ/B-Test-1-D-Top-Logo.png ADDED

Git LFS Details

  • SHA256: b78bd6b0b76cd0223e3a17b12c454c31eaee0e557cd3c862ccbfaea71c18ba46
  • Pointer size: 131 Bytes
  • Size of remote file: 315 kB
src/Logos-HQ/B-Test-2-Bottom-Logo-B.png ADDED

Git LFS Details

  • SHA256: 6e251a49c7d32bf56fa6da3219c15ba8590af3e55aafa7ac518e953d7ef4eae3
  • Pointer size: 131 Bytes
  • Size of remote file: 303 kB
src/Logos-HQ/HuggingFace-Logo-Oct-2024.png ADDED

Git LFS Details

  • SHA256: 9cf16f4f32604eaf76dabbdf47701eea5a768ebcc7296acc1d1758181f71db73
  • Pointer size: 131 Bytes
  • Size of remote file: 185 kB
src/Logos-HQ/LLM-jp-Logo-Oct-2024.png ADDED
src/Logos-HQ/MDX-Logo-Oct-2024.jpg ADDED

Git LFS Details

  • SHA256: fa4ead210b2ccadc0a10d677782a714a7d5f4f6897b741031fea5acad3efb2ed
  • Pointer size: 131 Bytes
  • Size of remote file: 676 kB
src/about.py ADDED
@@ -0,0 +1,478 @@
1
+ from dataclasses import dataclass
2
+ from enum import Enum
3
+
4
+
5
+ class TaskType(Enum):
6
+ AVG = "Average - 平均"
7
+ NLI = "NLI - 自然言語推論"
8
+ QA = "QA - 質問応答"
9
+ RC = "RC - 読解力"
10
+ CR = "CR - コモンセンス推論"
11
+ EL = "EL - エンティティリンキング"
12
+ FA = "FA - 基礎分析"
13
+ MR = "MR - 数学的推論"
14
+ MT = "MT - 機械翻訳"
15
+ STS = "STS - 意味的類似度"
16
+ HE_EN = "HE-EN - 英語試験問題"
17
+ HE_JA = "HE-JA - 日本語試験問題"
18
+ CG = "CG - コード生成"
19
+ SUM = "SUM - 要約"
20
+ BBH = "BBH - Big-Bench Hard"
21
+ IF = "IF - 指示追従"
22
+ NotTask = "?"
23
+
24
+
25
+ @dataclass
26
+ class Task:
27
+ benchmark: str
28
+ metric: str
29
+ col_name: str
30
+ task_type: TaskType
31
+ average: bool = False
32
+
33
+
34
+ # Select your tasks here
35
+ # ---------------------------------------------------
36
+ class Tasks(Enum):
37
+ AVG = Task("scores", "AVG", "AVG", TaskType.AVG, True)
38
+ NLI = Task("scores", "NLI", "AVG (NLI)", TaskType.NLI, True) # Natural Language Inference - 自然言語推論
39
+ QA = Task("scores", "QA", "AVG (QA)", TaskType.QA, True) # Question Answering - 質問応答
40
+ RC = Task("scores", "RC", "AVG (RC)", TaskType.RC, True) # Reading Comprehension - 文章読解
41
+ EL = Task("scores", "EL", "AVG (EL)", TaskType.EL, True) # Entity Linking - エンティティリンキング
42
+ FA = Task("scores", "FA", "AVG (FA)", TaskType.FA, True) # Fundamental Analysis - 基礎解析
43
+ MR = Task("scores", "MR", "AVG (MR)", TaskType.MR, True) # Mathematical Reasoning - 数学的推論
44
+ MT = Task("scores", "MT", "AVG (MT)", TaskType.MT, True) # Machine Translation - 機械翻訳
45
+ HE_EN = Task("scores", "HE-EN", "AVG (HE-EN)", TaskType.HE_EN, True) # Human Examination - English
46
+ HE_JA = Task("scores", "HE-JA", "AVG (HE-JA)", TaskType.HE_JA, True) # Human Examination - Japanese
47
+ CG = Task("scores", "CG", "AVG (CG)", TaskType.CG, True) # Code Generation - コード生成
48
+ SUM = Task("scores", "SUM", "AVG (SUM)", TaskType.SUM, True) # Summarization - 要約
49
+ BBH = Task("scores", "BBH", "AVG (BBH)", TaskType.BBH, True) # Big-Bench Hard
50
+ CR = Task("scores", "CR", "AVG (CR)", TaskType.CR, True) # Commonsense Reasoning
51
+ IF = Task("scores", "IF", "AVG (IF)", TaskType.IF, True) # Instruction Following
52
+ alt_e_to_j_bert_score_ja_f1 = Task("scores", "alt-e-to-j_bert_score_ja_f1", "ALT E to J BERT Score", TaskType.MT)
53
+ alt_e_to_j_bleu_ja = Task("scores", "alt-e-to-j_bleu_ja", "ALT E to J BLEU", TaskType.MT)
54
+ alt_e_to_j_comet_wmt22 = Task("scores", "alt-e-to-j_comet_wmt22", "ALT E to J COMET WMT22 ⭐", TaskType.MT)
55
+ alt_j_to_e_bert_score_en_f1 = Task("scores", "alt-j-to-e_bert_score_en_f1", "ALT J to E BERT Score", TaskType.MT)
56
+ alt_j_to_e_bleu_en = Task("scores", "alt-j-to-e_bleu_en", "ALT J to E BLEU", TaskType.MT)
57
+ alt_j_to_e_comet_wmt22 = Task("scores", "alt-j-to-e_comet_wmt22", "ALT J to E COMET WMT22 ⭐", TaskType.MT)
58
+ chabsa_set_f1 = Task("scores", "chabsa_set_f1", "ChABSA ⭐", TaskType.EL)
59
+ commonsensemoralja_exact_match = Task(
60
+ "scores", "commonsensemoralja_exact_match", "CommonSenseMoralJA ⭐", TaskType.CR
61
+ )
62
+ jamp_exact_match = Task("scores", "jamp_exact_match", "JAMP ⭐", TaskType.NLI)
63
+ janli_exact_match = Task("scores", "janli_exact_match", "JANLI ⭐", TaskType.NLI)
64
+ jcommonsenseqa_exact_match = Task("scores", "jcommonsenseqa_exact_match", "JCommonSenseQA ⭐", TaskType.CR)
65
+ jemhopqa_char_f1 = Task("scores", "jemhopqa_char_f1", "JEMHopQA ⭐", TaskType.QA)
66
+ jmmlu_exact_match = Task("scores", "jmmlu_exact_match", "JMMLU ⭐", TaskType.HE_JA)
67
+ jnli_exact_match = Task("scores", "jnli_exact_match", "JNLI ⭐", TaskType.NLI)
68
+ jsem_exact_match = Task("scores", "jsem_exact_match", "JSEM ⭐", TaskType.NLI)
69
+ jsick_exact_match = Task("scores", "jsick_exact_match", "JSICK ⭐", TaskType.NLI)
70
+ jsquad_char_f1 = Task("scores", "jsquad_char_f1", "JSquad ⭐", TaskType.RC)
71
+ jsts_pearson = Task(
72
+ "scores", "jsts_pearson", "JSTS (Pearson)", TaskType.STS
73
+ ) # Semantic Textual Similarity - 意味的類似度
74
+ jsts_spearman = Task(
75
+ "scores", "jsts_spearman", "JSTS (Spearman)", TaskType.STS
76
+ ) # Semantic Textual Similarity - 意味的類似度
77
+ kuci_exact_match = Task("scores", "kuci_exact_match", "KUCI ⭐", TaskType.CR)
78
+ mawps_exact_match = Task("scores", "mawps_exact_match", "MAWPS ⭐", TaskType.MR)
79
+ mbpp_code_exec = Task("scores", "mbpp_code_exec", "MBPP (exec) (0 shots only) ⭐", TaskType.CG)
80
+ mbpp_pylint_check = Task("scores", "mbpp_pylint_check", "MBPP (pylint) (0 shots only)", TaskType.CG)
81
+ mmlu_en_exact_match = Task("scores", "mmlu_en_exact_match", "MMLU ⭐", TaskType.HE_EN)
82
+ niilc_char_f1 = Task("scores", "niilc_char_f1", "NIILC ⭐", TaskType.QA)
83
+ aio_char_f1 = Task("scores", "aio_char_f1", "JAQKET ⭐", TaskType.QA)
84
+ wiki_coreference_set_f1 = Task("scores", "wiki_coreference_set_f1", "Wiki Coreference ⭐", TaskType.FA)
85
+ wiki_dependency_set_f1 = Task("scores", "wiki_dependency_set_f1", "Wiki Dependency ⭐", TaskType.FA)
86
+ wiki_ner_set_f1 = Task("scores", "wiki_ner_set_f1", "Wiki NER ⭐", TaskType.FA)
87
+ wiki_pas_set_f1 = Task("scores", "wiki_pas_set_f1", "Wiki PAS ⭐", TaskType.FA)
88
+ wiki_reading_char_f1 = Task("scores", "wiki_reading_char_f1", "Wiki Reading ⭐", TaskType.FA)
89
+ wikicorpus_e_to_j_bert_score_ja_f1 = Task(
90
+ "scores", "wikicorpus-e-to-j_bert_score_ja_f1", "WikiCorpus E to J BERT Score", TaskType.MT
91
+ )
92
+ wikicorpus_e_to_j_bleu_ja = Task("scores", "wikicorpus-e-to-j_bleu_ja", "WikiCorpus E to J BLEU", TaskType.MT)
93
+ wikicorpus_e_to_j_comet_wmt22 = Task(
94
+ "scores", "wikicorpus-e-to-j_comet_wmt22", "WikiCorpus E to J COMET WMT22 ⭐", TaskType.MT
95
+ )
96
+ wikicorpus_j_to_e_bert_score_en_f1 = Task(
97
+ "scores", "wikicorpus-j-to-e_bert_score_en_f1", "WikiCorpus J to E BERT Score", TaskType.MT
98
+ )
99
+ wikicorpus_j_to_e_bleu_en = Task("scores", "wikicorpus-j-to-e_bleu_en", "WikiCorpus J to E BLEU", TaskType.MT)
100
+ wikicorpus_j_to_e_comet_wmt22 = Task(
101
+ "scores", "wikicorpus-j-to-e_comet_wmt22", "WikiCorpus J to E COMET WMT22 ⭐", TaskType.MT
102
+ )
103
+ xlsum_ja_bert_score_ja_f1 = Task(
104
+ "scores", "xlsum_ja_bert_score_ja_f1", "XL-Sum JA BERT Score (0 shots only)", TaskType.SUM
105
+ )
106
+ xlsum_ja_bleu_ja = Task("scores", "xlsum_ja_bleu_ja", "XL-Sum JA BLEU (0 shots only)", TaskType.SUM)
107
+ xlsum_ja_rouge1 = Task("scores", "xlsum_ja_rouge1", "XL-Sum ROUGE1 (0 shots only)", TaskType.SUM)
108
+ xlsum_ja_rouge2 = Task("scores", "xlsum_ja_rouge2", "XL-Sum ROUGE2 (0 shots only) ⭐", TaskType.SUM)
109
+ # xlsum_ja_rouge2_scaling = Task("scores", "xlsum_ja_rouge2_scaling", "XL-Sum JA ROUGE2 Scaling")
110
+ xlsum_ja_rougeLsum = Task("scores", "xlsum_ja_rougeLsum", "XL-Sum ROUGE-Lsum (0 shots only)", TaskType.SUM)
111
+ # New tasks for v2.0.0
112
+ aime2024_mathematical_equivalence = Task("scores", "aime2024_mathematical_equivalence", "AIME 2024 ⭐", TaskType.MR)
113
+ aime2025_mathematical_equivalence = Task("scores", "aime2025_mathematical_equivalence", "AIME 2025 ⭐", TaskType.MR)
114
+ bigbenchhard_direct_exact_match = Task("scores", "bigbenchhard_direct_exact_match", "BBH Direct ⭐", TaskType.BBH)
115
+ bigbenchhard_cot_exact_match = Task("scores", "bigbenchhard_cot_exact_match", "BBH CoT ⭐", TaskType.BBH)
116
+ bigbenchhard_ja_direct_exact_match = Task("scores", "bigbenchhard_ja_direct_exact_match", "BBH JA Direct ⭐", TaskType.BBH)
117
+ bigbenchhard_ja_cot_exact_match = Task("scores", "bigbenchhard_ja_cot_exact_match", "BBH JA CoT ⭐", TaskType.BBH)
118
+ drop_drop_f1 = Task("scores", "drop_drop_f1", "DROP ⭐", TaskType.QA)
119
+ gsm8k_mathematical_equivalence = Task("scores", "gsm8k_mathematical_equivalence", "GSM8K ⭐", TaskType.MR)
120
+ gpqa_diamond_en_exact_match = Task("scores", "gpqa_diamond_en_exact_match", "GPQA Diamond EN ⭐", TaskType.HE_EN)
121
+ gpqa_extended_en_exact_match = Task("scores", "gpqa_extended_en_exact_match", "GPQA Extended EN ⭐", TaskType.HE_EN)
122
+ gpqa_main_en_exact_match = Task("scores", "gpqa_main_en_exact_match", "GPQA Main EN ⭐", TaskType.HE_EN)
123
+ gpqa_diamond_ja_exact_match = Task("scores", "gpqa_diamond_ja_exact_match", "GPQA Diamond JA ⭐", TaskType.HE_JA)
124
+ gpqa_extended_ja_exact_match = Task("scores", "gpqa_extended_ja_exact_match", "GPQA Extended JA ⭐", TaskType.HE_JA)
125
+ gpqa_main_ja_exact_match = Task("scores", "gpqa_main_ja_exact_match", "GPQA Main JA ⭐", TaskType.HE_JA)
126
+ jamc_qa_exact_match = Task("scores", "jamc-qa_exact_match", "JAMC-QA ⭐", TaskType.QA)
127
+ jhumaneval_code_exec = Task("scores", "jhumaneval_code_exec", "JHumanEval ⭐", TaskType.CG)
128
+ mgsm_mathematical_equivalence = Task("scores", "mgsm_mathematical_equivalence", "MGSM ⭐", TaskType.MR)
129
+ mmlu_prox_ja_exact_match = Task("scores", "mmlu_prox_ja_exact_match", "MMLU Prox JA ⭐", TaskType.HE_JA)
130
+ mmlu_prox_en_exact_match = Task("scores", "mmlu_prox_en_exact_match", "MMLU Prox EN ⭐", TaskType.HE_EN)
131
+ mif_eval_ja_mifeval_strict = Task("scores", "mif_eval_ja_mifeval_strict", "MIF Eval JA ⭐", TaskType.IF)
132
+ mif_eval_en_mifeval_strict = Task("scores", "mif_eval_en_mifeval_strict", "MIF Eval EN ⭐", TaskType.IF)
133
+ mmmlu_exact_match = Task("scores", "mmmlu_exact_match", "MMMLU ⭐", TaskType.HE_JA)
134
+ openbookqa_exact_match = Task("scores", "openbookqa_exact_match", "OpenBookQA ⭐", TaskType.HE_EN)
135
+ polymath_en_polymath_weighted_accuracy = Task("scores", "polymath-en_polymath_weighted_accuracy", "Polymath EN ⭐", TaskType.MR)
136
+ polymath_ja_polymath_weighted_accuracy = Task("scores", "polymath-ja_polymath_weighted_accuracy", "Polymath JA ⭐", TaskType.MR)
137
+ triviaqa_triviaqa_f1 = Task("scores", "triviaqa_triviaqa_f1", "TriviaQA ⭐", TaskType.QA)
138
+ winogrande_xl_exact_match = Task("scores", "winogrande_xl_exact_match", "WinoGrande XL ⭐", TaskType.CR)
139
+
140
+
141
+ NUM_FEWSHOT = 0 # Change with your few shot
142
+ # ---------------------------------------------------
143
+
144
+ # Your leaderboard name
145
+ TITLE = """<h1 align="center" id="space-title">🇯🇵 Open Japanese LLM Leaderboard 🌸<br>オープン日本語LLMリーダーボード</h1>"""
146
+
147
+ # What does your leaderboard evaluate?
148
+ INTRODUCTION_TEXT = """
149
+ The __Open Japanese LLM Leaderboard__ by __[LLM-jp](https://llm-jp.nii.ac.jp/en/)__ evaluates
150
+ the performance of Japanese Large Language Models (LLMs) across 12 categories covering more than 50 tasks from
151
+ classical to modern NLP tasks. The __Open Japanese LLM Leaderboard__ was built by open-source
152
+ contributors of __[LLM-jp](https://llm-jp.nii.ac.jp/en/)__, a cross-organizational project
153
+ for the research and development of Japanese LLMs supported by the _National Institute of
154
+ Informatics_ in Tokyo, Japan.
155
+
156
+ On the __"LLM Benchmark"__ page, the question mark **"?"** refers to the parameters that
157
+ are unknown in the model card on Hugging Face. For more information about datasets,
158
+ please consult the __"About"__ page or refer to the website of
159
+ __[LLM-jp](https://llm-jp.nii.ac.jp/en/)__. And on the __"Submit here!"__ page, you can
160
+ evaluate the performance of your model, and be part of the leaderboard.
161
+ """
162
+ INTRODUCTION_TEXT_JA = """\
163
+ __[LLM-jp](https://llm-jp.nii.ac.jp/)__ による __オープン日本語LLMリーダーボード__ は、\
164
+ 古典的なものから最新のものまで12のカテゴリに渡る50種類以上のNLPタスクを用いて日本語大規模言語モデル(LLM)の\
165
+ 性能を評価します。__オープン日本語LLMリーダーボード__ は、日本の国立情報学研究所を中心に\
166
+ 日本語LLMの研究開発を行う組織横断プロジェクト __[LLM-jp](https://llm-jp.nii.ac.jp/)__ \
167
+ のオープンソース貢献者によって構築されました。
168
+
169
+ __「LLM Benchmark」__ ページでは、疑問符 **「?」** はHugging Faceのモデルカードで不明な\
170
+ パラメータを示しています。データセットに関する詳細情報については、__「About」__ ページを\
171
+ 参照するか、__[LLM-jp](https://llm-jp.nii.ac.jp/)__ のウェブサイトをご覧ください。\
172
+ また、__「Submit here!」__ ページでは、あなたのモデルの性能を評価し、リーダーボードに\
173
+ 参加することができます。
174
+ """
175
+
176
+ # Which evaluations are you running? how can people reproduce what you have?
177
+ LLM_BENCHMARKS_TEXT = """
178
+ ## How it works
179
+ 📈 We evaluate Japanese Large Language Models across 12 categories covering more than 50 tasks leveraging our evaluation tool [llm-jp-eval](https://github.com/llm-jp/llm-jp-eval), a unified framework to evaluate Japanese LLMs on various evaluation tasks.
180
+
181
+ **NLI (Natural Language Inference)**
182
+
183
+ * `Jamp`, a Japanese NLI benchmark focused on temporal inference [Source](https://github.com/tomo-ut/temporalNLI_dataset) (License CC BY-SA 4.0)
184
+
185
+ * `JaNLI`, Japanese Adversarial Natural Language Inference [Source](https://github.com/verypluming/JaNLI) (License CC BY-SA 4.0)
186
+
187
+ * `JNLI`, Japanese Natural Language Inference (part of JGLUE) [Source](https://github.com/yahoojapan/JGLUE) (License CC BY-SA 4.0)
188
+
189
+ * `JSeM`, Japanese semantic test suite [Source](https://github.com/DaisukeBekki/JSeM) (License BSD 3-Clause)
190
+
191
+ * `JSICK`, Japanese Sentences Involving Compositional Knowledge [Source](https://github.com/verypluming/JSICK) (License CC BY-SA 4.0)
192
+
193
+ **QA (Question Answering)**
194
+
195
+ * `JEMHopQA`, Japanese Explainable Multi-hop Question Answering [Source](https://github.com/aiishii/JEMHopQA) (License CC BY-SA 4.0)
196
+
197
+ * `NIILC`, NIILC Question Answering Dataset [Source](https://github.com/mynlp/niilc-qa) (License CC BY-SA 4.0)
198
+
199
+ * `JAQKET`, Japanese QA dataset on the subject of quizzes [Source](https://www.nlp.ecei.tohoku.ac.jp/projects/jaqket/) (License CC BY-SA 4.0 - Other licenses are required for corporate usage)
200
+
201
+ * `TriviaQA`, Reading Comprehension Challenge Dataset [Source](https://nlp.cs.washington.edu/triviaqa/) (License Apache-2.0)
202
+
203
+ * `DROP`, Discrete Reasoning Over Paragraphs [Source](https://allennlp.org/drop) (License CC BY-SA 4.0)
204
+
205
+ * `JAMC-QA`, Japanese Advanced Medical Comprehension Question Answering [Source](https://huggingface.co/datasets/llm-jp/jamc-qa) (License CC BY-SA 4.0)
206
+
207
+ **RC (Reading Comprehension)**
208
+
209
+ * `JSQuAD`, Japanese version of SQuAD (part of JGLUE) [Source](https://github.com/yahoojapan/JGLUE) (License CC BY-SA 4.0)
210
+
211
+ **CR (Commonsense Reasoning)**
212
+
213
+ * `JCommonsenseMorality`, Japanese dataset for evaluating commonsense morality understanding [Source](https://github.com/Language-Media-Lab/commonsense-moral-ja) (License MIT License)
214
+
215
+ * `JCommonsenseQA`, Japanese version of CommonsenseQA [Source](https://github.com/yahoojapan/JGLUE) (License CC BY-SA 4.0)
216
+
217
+ * `KUCI`, Kyoto University Commonsense Inference dataset [Source](https://github.com/ku-nlp/KUCI) (License CC BY-SA 4.0)
218
+
219
+ * `WinoGrande`, Winogrande Pronoun Disambiguation [Source](https://huggingface.co/datasets/winogrande) (License Apache-2.0)
220
+
221
+ **EL (Entity Linking)**
222
+
223
+ * `chABSA`, Aspect-Based Sentiment Analysis dataset [Source](https://github.com/chakki-works/chABSA-dataset) (License CC BY-SA 4.0)
224
+
225
+ **FA (Fundamental Analysis)**
226
+
227
+ * `Wikipedia Annotated Corpus`, [Source](https://github.com/ku-nlp/WikipediaAnnotatedCorpus) (License CC BY-SA 4.0)
228
+
229
+ List of tasks: (Reading Prediction, Named-entity recognition (NER), Dependency Parsing, Predicate-argument structure analysis (PAS), Coreference Resolution)
230
+
231
+ **MR (Mathematical Reasoning)**
232
+
233
+ * `MAWPS`, Japanese version of MAWPS (A Math Word Problem Repository) [Source](https://github.com/nlp-waseda/chain-of-thought-ja-dataset) (License Apache-2.0)
234
+
235
+ * `MGSM`, Japanese part of MGSM (Multilingual Grade School Math Benchmark) [Source](https://huggingface.co/datasets/juletxara/mgsm) (License MIT License)
236
+
237
+ * `GSM8K`, Grade School Math 8K [Source](https://github.com/openai/grade-school-math) (License MIT License)
238
+
239
+ * `AIME`, American Invitational Mathematics Examination [Source](https://artofproblemsolving.com/wiki/index.php/AIME_Problems_and_Solutions) (License Public Domain)
240
+
241
+ * `Polymath`, Multilevel Multimodal Mathematical Reasoning [Source](https://arxiv.org/abs/2407.21046) (License MIT License)
242
+
243
+ **MT (Machine Translation)**
244
+
245
+ * `ALT`, Asian Language Treebank (ALT) - Parallel Corpus [Source](https://www2.nict.go.jp/astrec-att/member/mutiyama/ALT/index.html) (License CC BY-SA 4.0)
246
+
247
+ * `WikiCorpus`, Japanese-English Bilingual Corpus of Wikipedia's articles about the city of Kyoto [Source](https://alaginrc.nict.go.jp/WikiCorpus/) (License CC BY-SA 3.0)
248
+
249
+ **STS (Semantic Textual Similarity)**
250
+
251
+ This task is supported by llm-jp-eval, but it is not included in the evaluation score average.
252
+
253
+ * `JSTS`, Japanese version of the STS (Semantic Textual Similarity) (part of JGLUE) [Source](https://github.com/yahoojapan/JGLUE) (License CC BY-SA 4.0)
254
+
255
+ **HE-EN (Human Examination - English)**
256
+
257
+ * `MMLU`, Measuring Massive Multitask Language Understanding [Source](https://github.com/hendrycks/test) (License MIT License)
258
+
259
+ * `GPQA`, Graduate-Level Google-Proof Q&A Benchmark [Source](https://github.com/idavidrein/gpqa) (License MIT License)
260
+
261
+ * `OpenBookQA`, Open Book Question Answering [Source](https://allenai.org/data/open-book-qa) (License Apache-2.0)
262
+
263
+ **HE-JA (Human Examination - Japanese)**
264
+
265
+ * `JMMLU`, Japanese Massive Multitask Language Understanding Benchmark [Source](https://github.com/nlp-waseda/JMMLU) (License CC BY-SA 4.0; 3 tasks under the CC BY-NC-ND 4.0 license)
266
+
267
+ * `MMMLU`, Japanese version of MMLU [Source](https://huggingface.co/datasets/pfnet/mmmlu) (License MIT License)
268
+
269
+ * `GPQA (JA)`, Japanese translation of GPQA [Source](https://github.com/idavidrein/gpqa) (License MIT License)
270
+
271
+ **CG (Code Generation)**
272
+
273
+ * `MBPP`, Japanese version of Mostly Basic Python Problems (MBPP) [Source](https://huggingface.co/datasets/llm-jp/mbpp-ja) (License CC BY-SA 4.0)
274
+
275
+ * `JHumanEval`, Japanese version of HumanEval [Source](https://huggingface.co/datasets/kogi-jwu/jhumaneval) (License MIT License)
276
+
277
+ **BBH (BIG-Bench Hard)**
278
+
279
+ * `BigBenchHard`, Challenging BIG-Bench tasks with chain-of-thought evaluation [Source](https://github.com/suzgunmirac/BIG-Bench-Hard) (License MIT License)
280
+
281
+ **IF (Instruction Following)**
282
+
283
+ * `MIF-Eval`, Multilingual Instruction Following Evaluation [Source](https://huggingface.co/datasets/google/MIF-Eval) (License Apache-2.0)
284
+
285
+ **SUM (Summarization)**
286
+
287
+ * `XL-Sum`, XL-Sum: Large-Scale Multilingual Abstractive Summarization for 44 Languages [Source](https://github.com/csebuetnlp/xl-sum) (License CC BY-NC-SA 4.0, due to the non-commercial license, this dataset will not be used, unless you specifically agree to the license and terms of use)
288
+
289
+
290
+ ## Reproducibility
291
+ To reproduce our results, please follow the instructions of the evaluation tool **llm-jp-eval**, available in [Japanese](https://github.com/llm-jp/llm-jp-eval/blob/main/README.md) and in [English](https://github.com/llm-jp/llm-jp-eval/blob/main/README_en.md).
292
+
293
+ ## Average Score Calculation
294
+ The calculation of the average score (AVG) includes only the scores of datasets marked with a ⭐.
295
+
296
+ """
297
+
298
+ LLM_BENCHMARKS_TEXT_JA = """
299
+ ## 仕組み
300
+ 📈 評価ツール [llm-jp-eval](https://github.com/llm-jp/llm-jp-eval) を活用し、16種類のタスクで日本語の大規模言語モデルを評価します。このツールは、様々な評価タスクで日本語LLMを評価するための統一的なフレームワークです。
301
+
302
+ **NLI(自然言語推論)**
303
+
304
+ * `Jamp`、時間推論に焦点を当てた日本語NLIベンチマーク [ソース](https://github.com/tomo-ut/temporalNLI_dataset)(ライセンス CC BY-SA 4.0)
305
+
306
+ * `JaNLI`、日本語の敵対的推論データセット [ソース](https://github.com/verypluming/JaNLI)(ライセンス CC BY-SA 4.0)
307
+
308
+ * `JNLI`、日本語自然言語推論(JGLUEの一部)[ソース](https://github.com/yahoojapan/JGLUE)(ライセンス CC BY-SA 4.0)
309
+
310
+ * `JSeM`、日本語意味論テストセット [ソース](https://github.com/DaisukeBekki/JSeM)(ライセンス BSD 3-Clause)
311
+
312
+ * `JSICK`、構成的知識を含む日本語文データセット [ソース](https://github.com/verypluming/JSICK)(ライセンス CC BY-SA 4.0)
313
+
314
+ **QA(質問応答)**
315
+
316
+ * `JEMHopQA`、日本語の説明可能なマルチホップ質問応答 [ソース](https://github.com/aiishii/JEMHopQA)(ライセンス CC BY-SA 4.0)
317
+
318
+ * `NIILC`、NIILC質問応答データセット [ソース](https://github.com/mynlp/niilc-qa)(ライセンス CC BY-SA 4.0)
319
+
320
+ * `JAQKET`、クイズを題材とした日本語QAデータセット [ソース](https://www.nlp.ecei.tohoku.ac.jp/projects/jaqket/)(ライセンス CC BY-SA 4.0 - 企業利用には別途ライセンスが必要)
321
+
322
+ **RC(読解)**
323
+
324
+ * `JSQuAD`、SQuADの日本語版(JGLUEの一部)[ソース](https://github.com/yahoojapan/JGLUE)(ライセンス CC BY-SA 4.0)
325
+
326
+ **MC(選択式質問応答)**
327
+
328
+ * `JCommonsenseMorality`、常識的な道徳理解を評価する日本語データセット [ソース](https://github.com/Language-Media-Lab/commonsense-moral-ja)(ライセンス MIT License)
329
+
330
+ * `JCommonsenseQA`、CommonsenseQAの日本語版 [ソース](https://github.com/yahoojapan/JGLUE)(ライセンス CC BY-SA 4.0)
331
+
332
+ * `KUCI`、京都大学常識推論データセット [ソース](https://github.com/ku-nlp/KUCI)(ライセンス CC BY-SA 4.0)
333
+
334
+ **EL(エンティティリンキング)**
335
+
336
+ * `chABSA`、アスペクトベースの感情分析データセット [ソース](https://github.com/chakki-works/chABSA-dataset)(ライセンス CC BY-SA 4.0)
337
+
338
+ **FA(基礎解析)**
339
+
340
+ * `Wikipedia Annotated Corpus`、[ソース](https://github.com/ku-nlp/WikipediaAnnotatedCorpus)(ライセンス CC BY-SA 4.0)
341
+
342
+ タスク一覧:(読解予測、固有表現認識(NER)、依存構造解析、述語項構造解析(PAS)、共参照解析)
343
+
344
+ **MR(数学的推論)**
345
+
346
+ * `MAWPS`、MAWPS(A Math Word Problem Repository)の日本語版 [ソース](https://github.com/nlp-waseda/chain-of-thought-ja-dataset)(ライセンス Apache-2.0)
347
+
348
+ * `MGSM`、MGSM(Multilingual Grade School Math Benchmark)の日本語部分 [ソース](https://huggingface.co/datasets/juletxara/mgsm)(ライセンス MIT License)
349
+
350
+ **MT(機械翻訳)**
351
+
352
+ * `ALT`、アジア言語ツリーバンク(ALT) - 並行コーパス [ソース](https://www2.nict.go.jp/astrec-att/member/mutiyama/ALT/index.html)(ライセンス CC BY-SA 4.0)
353
+
354
+ * `WikiCorpus`、京都市に関するWikipedia記事の日本語-英語対訳コーパス [ソース](https://alaginrc.nict.go.jp/WikiCorpus/)(ライセンス CC BY-SA 3.0)
355
+
356
+ **STS(意味的テキスト類似度)**
357
+
358
+ このタスクはllm-jp-evalでサポートされていますが、平均スコア (AVG) の計算には含まれていません。
359
+
360
+ * `JSTS`、STS(Semantic Textual Similarity)の日本語版(JGLUEの一部)[ソース](https://github.com/yahoojapan/JGLUE)(ライセンス CC BY-SA 4.0)
361
+
362
+ **HE(試験問題)**
363
+
364
+ * `MMLU`、大規模マルチタスク言語理解ベンチマーク(英語) [ソース](https://github.com/hendrycks/test)(ライセンス MIT License)
365
+
366
+ * `JMMLU`、日本語大規模マルチタスク言語理解ベンチマーク [ソース](https://github.com/nlp-waseda/JMMLU)(ライセンス CC BY-SA 4.0(3つのタスクはCC BY-NC-ND 4.0ライセンス))
367
+
368
+ **CG(コード生成)**
369
+
370
+ * `MBPP`、Mostly Basic Python Problems(MBPP)の日本語版 [ソース](https://huggingface.co/datasets/llm-jp/mbpp-ja)(ライセンス CC BY-SA 4.0)
371
+
372
+ **SUM(要約)**
373
+
374
+ * `XL-Sum`、44言語の大規模多言語抽象型要約データセットの日本語部分 [ソース](https://github.com/csebuetnlp/xl-sum)(ライセンス CC BY-NC-SA 4.0、非商用ライセンスのため、このデータセットは使用しません。ライセンスと利用規約に明確に同意した場合を除きます)
375
+
376
+ ## 再現性
377
+ 結果を再現するには、評価ツール **llm-jp-eval** の指示に従ってください。詳細は [日本語](https://github.com/llm-jp/llm-jp-eval/blob/main/README.md) と [英語](https://github.com/llm-jp/llm-jp-eval/blob/main/README_en.md) でご覧いただけます。
378
+
379
+ ## 平均スコアの計算について
380
+ 平均スコア (AVG) の計算には、⭐マークのついたスコアのみが含まれます。
381
+
382
+ """
383
+
384
+
385
+ EVALUATION_QUEUE_TEXT = """
386
+ ## First Steps Before Submitting a Model
387
+ ### 1. Ensure Your Model Loads with AutoClasses
388
+ Verify that you can load your model and tokenizer using AutoClasses:
389
+ ```python
390
+ from transformers import AutoConfig, AutoModel, AutoTokenizer
391
+ config = AutoConfig.from_pretrained("your model name", revision=revision)
392
+ model = AutoModel.from_pretrained("your model name", revision=revision)
393
+ tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
394
+ ```
395
+ Note:
396
+ - If this step fails, debug your model before submitting.
397
+ - Ensure your model is public.
398
+ - Models requiring `trust_remote_code=True` are not currently supported.
399
+ ### 2. Convert Weights to Safetensors
400
+ [Safetensors](https://huggingface.co/docs/safetensors/index) is a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
401
+ ### 3. Verify Your Model Open License
402
+ This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
403
+ ### 4. Complete Your Model Card
404
+ When we add extra information about models to the leaderboard, it is taken automatically from the model card.
405
+ ### 5. Select Appropriate Precision
406
+ The "auto" option supports fp16, fp32, and bf16 precisions. If your model uses any other precision format, please select the appropriate option.
407
+ If "auto" is specified, the precision is selected automatically from the model's config.json.
408
+ ### Note about large models
409
+ Currently, we support models up to 70B parameters. However, we are working on infrastructure improvements to accommodate larger models (70B+) in the near future. Stay tuned for updates!
410
+
411
+ """
412
+ EVALUATION_QUEUE_TEXT_JA = """
413
+ ## モデル提出前の最初のステップ
414
+ ### 1. AutoClasses でモデルが読み込めることを確認
415
+ AutoClasses を使用してモデルとトークナイザーを読み込めることを確認してください:
416
+ ```python
417
+ from transformers import AutoConfig, AutoModel, AutoTokenizer
418
+ config = AutoConfig.from_pretrained("your model name", revision=revision)
419
+ model = AutoModel.from_pretrained("your model name", revision=revision)
420
+ tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
421
+ ```
422
+ 注意:
423
+ - この手順が失敗する場合は、提出前にモデルをデバッグしてください。
424
+ - モデルが公開されていることを確認してください。
425
+ - `trust_remote_code=True` を必要とするモデルは現時点ではサポートされていません。
426
+
427
+ ### 2. 重みを Safetensors に変換
428
+ [Safetensors](https://huggingface.co/docs/safetensors/index) は、より安全で高速に読み込めるウェイトの新しい保存形式です。これにより、`Extended Viewer` にモデルのパラメータ数を追加することも可能になります!
429
+
430
+ ### 3. モデルのオープンライセンスを確認
431
+ これはオープン LLM のリーダーボードです。できるだけ多くの人があなたのモデルを使用できることを知ってもらえると嬉しいです🤗
432
+
433
+ ### 4. モデルカードを完成させる
434
+ リーダーボードにモデルの追加情報を掲載する際は、モデルカードから自動的に情報が取得されます
435
+
436
+ ### 5. 適切なPrecisionの選択
437
+ "auto"オプションはfp16、fp32、bf16のprecisionに対応しています。これら以外のprecisionを使用している場合は、適切なオプションを選択してください。
438
+ また、autoを指定した場合、config.jsonのprecisionが自動的に選択されます。
439
+
440
+ ### 大規模モデルに関する注意
441
+ 現在、70Bパラメータまでのモデルをサポートしています。より大規模なモデル(70Bよりも大きいもの)については、インフラストラクチャの改善を進めており、近い将来対応予定です。続報をお待ちください!
442
+
443
+ """
444
+
445
+ BOTTOM_LOGO = """
446
+ <div style="display: flex; flex-direction: row; justify-content: center; align-items: center;">
447
+ <a href="https://llm-jp.nii.ac.jp/en/" style="margin: 0 10px;">
448
+ <img src="https://raw.githubusercontent.com/AkimfromParis/akimfromparis/refs/heads/main/images/LLM-jp-Logo-Oct-2024.png" alt="LLM-jp" style="max-height: 100px;">
449
+ </a>
450
+ <a href="https://mdx.jp/" style="margin: 0 10px;">
451
+ <img src="https://raw.githubusercontent.com/AkimfromParis/akimfromparis/refs/heads/main/images/MDX-Logo-Oct-2024.jpg" alt="MDX" style="max-height: 100px;">
452
+ </a>
453
+ <a href="https://huggingface.co/" style="margin: 0 10px;">
454
+ <img src="https://raw.githubusercontent.com/AkimfromParis/akimfromparis/refs/heads/main/images/HuggingFace-Logo-Oct-2024.png" alt="HuggingFace" style="max-height: 100px;">
455
+ </a>
456
+ </div>
457
+ """
458
+
459
+ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
460
+ CITATION_BUTTON_LABEL_JA = "引用の際は、次のスニペットをコピーしてご利用ください"
461
+
462
+ CITATION_BUTTON_TEXT = r"""@misc{OJLL,
463
+ author = {Miyao, Yusuke and Ishida, Shigeki and Okamoto, Takumi and Han, Namgi and Mousterou, Akim and Fourrier, Clémentine and Hayashi, Toshihiro and Tachibana, Yuichiro},
464
+ title = {Open Japanese LLM Leaderboard},
465
+ year = {2024},
466
+ publisher = {OJLL},
467
+ howpublished = "\url{https://huggingface.co/spaces/llm-jp/open-japanese-llm-leaderboard}"
468
+ }
469
+ @misc{llmjp2024llmjpcrossorganizationalprojectresearch,
470
+ title={LLM-jp: A Cross-organizational Project for the Research and Development of Fully Open Japanese LLMs},
471
+ author={LLM-jp and : and Akiko Aizawa and Eiji Aramaki and Bowen Chen and Fei Cheng and Hiroyuki Deguchi and Rintaro Enomoto and Kazuki Fujii and Kensuke Fukumoto and Takuya Fukushima and Namgi Han and Yuto Harada and Chikara Hashimoto and Tatsuya Hiraoka and Shohei Hisada and Sosuke Hosokawa and Lu Jie and Keisuke Kamata and Teruhito Kanazawa and Hiroki Kanezashi and Hiroshi Kataoka and Satoru Katsumata and Daisuke Kawahara and Seiya Kawano and Atsushi Keyaki and Keisuke Kiryu and Hirokazu Kiyomaru and Takashi Kodama and Takahiro Kubo and Yohei Kuga and Ryoma Kumon and Shuhei Kurita and Sadao Kurohashi and Conglong Li and Taiki Maekawa and Hiroshi Matsuda and Yusuke Miyao and Kentaro Mizuki and Sakae Mizuki and Yugo Murawaki and Ryo Nakamura and Taishi Nakamura and Kouta Nakayama and Tomoka Nakazato and Takuro Niitsuma and Jiro Nishitoba and Yusuke Oda and Hayato Ogawa and Takumi Okamoto and Naoaki Okazaki and Yohei Oseki and Shintaro Ozaki and Koki Ryu and Rafal Rzepka and Keisuke Sakaguchi and Shota Sasaki and Satoshi Sekine and Kohei Suda and Saku Sugawara and Issa Sugiura and Hiroaki Sugiyama and Hisami Suzuki and Jun Suzuki and Toyotaro Suzumura and Kensuke Tachibana and Yu Takagi and Kyosuke Takami and Koichi Takeda and Masashi Takeshita and Masahiro Tanaka and Kenjiro Taura and Arseny Tolmachev and Nobuhiro Ueda and Zhen Wan and Shuntaro Yada and Sakiko Yahata and Yuya Yamamoto and Yusuke Yamauchi and Hitomi Yanaka and Rio Yokota and Koichiro Yoshino},
472
+ year={2024},
473
+ eprint={2407.03963},
474
+ archivePrefix={arXiv},
475
+ primaryClass={cs.CL},
476
+ url={https://arxiv.org/abs/2407.03963},
477
+ }
478
+ """
src/display/formatting.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ def model_hyperlink(link, model_name):
2
+ return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
3
+
4
+
5
+ def model_hyperlink_with_shot(link, model_name, num_few_shot):
6
+ display_name = f"{model_name} ({num_few_shot}-shot)"
7
+ return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{display_name}</a>'
8
+
9
+
10
+ def make_clickable_model(model_name):
11
+ link = f"https://huggingface.co/{model_name}"
12
+ return model_hyperlink(link, model_name)
13
+
14
+
15
+ def make_clickable_model_with_shot(model_name, num_few_shot):
16
+ link = f"https://huggingface.co/{model_name}"
17
+ return model_hyperlink_with_shot(link, model_name, num_few_shot)
18
+
19
+
20
+ def styled_error(error):
21
+ return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
22
+
23
+
24
+ def styled_warning(warn):
25
+ return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
26
+
27
+
28
+ def styled_message(message):
29
+ return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
30
+
31
+
32
+ def has_no_nan_values(df, columns):
33
+ return df[columns].notna().all(axis=1)
34
+
35
+
36
+ def has_nan_values(df, columns):
37
+ return df[columns].isna().any(axis=1)
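A short usage sketch of the helpers above, assuming the package layout in this diff; the model id and few-shot count are only examples.

```python
from src.display.formatting import make_clickable_model_with_shot, styled_message

# Renders as an HTML anchor pointing at https://huggingface.co/llm-jp/llm-jp-3-13b
# with the visible label "llm-jp/llm-jp-3-13b (4-shot)".
cell = make_clickable_model_with_shot("llm-jp/llm-jp-3-13b", 4)

# Green, centered status text used by the submission form.
html = styled_message("Your request has been submitted!")
```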
src/display/utils.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass, make_dataclass
2
+ from enum import Enum
3
+
4
+ import pandas as pd
5
+
6
+ from src.about import Tasks, TaskType
7
+
8
+
9
+ def fields(raw_class):
10
+ return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
11
+
12
+
13
+ # These classes are for user facing column names,
14
+ # to avoid having to change them all around the code
15
+ # when a modification is needed
16
+ @dataclass
17
+ class ColumnContent:
18
+ name: str
19
+ type: str
20
+ displayed_by_default: bool
21
+ hidden: bool = False
22
+ never_hidden: bool = False
23
+ dummy: bool = False
24
+ task_type: TaskType = TaskType.NotTask
25
+ average: bool = False
26
+
27
+
28
+ ## Leaderboard columns
29
+ auto_eval_column_dict = []
30
+ # Init
31
+ auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
32
+ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
33
+ # Scores
34
+ # auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
35
+ for task in Tasks:
36
+ auto_eval_column_dict.append(
37
+ [
38
+ task.name,
39
+ ColumnContent,
40
+ ColumnContent(
41
+ task.value.col_name,
42
+ "number",
43
+ displayed_by_default=(task.value.task_type == TaskType.AVG or task.value.average),
44
+ task_type=task.value.task_type,
45
+ average=task.value.average,
46
+ ),
47
+ ]
48
+ )
49
+ # Model information
50
+ auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
51
+ auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
52
+ auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
53
+ auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
54
+ auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
55
+ auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
56
+ auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
57
+ auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Revision", "str", False, False)])
58
+ auto_eval_column_dict.append(["num_few_shots", ColumnContent, ColumnContent("Few-shot", "number", False)])
59
+ auto_eval_column_dict.append(["add_special_tokens", ColumnContent, ColumnContent("Add Special Tokens", "bool", False)])
60
+ auto_eval_column_dict.append(
61
+ ["llm_jp_eval_version", ColumnContent, ColumnContent("llm-jp-eval version", "str", False)]
62
+ )
63
+ auto_eval_column_dict.append(["vllm_version", ColumnContent, ColumnContent("vllm version", "str", False)])
64
+ auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
65
+ auto_eval_column_dict.append(["row_id", ColumnContent, ColumnContent("ID", "number", False, dummy=True)])
66
+
67
+ # We use make dataclass to dynamically fill the scores from Tasks
68
+ AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
69
+
70
+
71
+ ## For the queue columns in the submission tab
72
+ @dataclass(frozen=True)
73
+ class EvalQueueColumn: # Queue column
74
+ model = ColumnContent("model", "markdown", True)
75
+ revision = ColumnContent("revision", "str", True)
76
+ model_type = ColumnContent("model_type", "str", True)
77
+ precision = ColumnContent("precision", "str", True)
78
+ add_special_tokens = ColumnContent("add_special_tokens", "str", True)
79
+ llm_jp_eval_version = ColumnContent("llm_jp_eval_version", "str", True)
80
+ vllm_version = ColumnContent("vllm_version", "str", True)
81
+ status = ColumnContent("status", "str", True)
82
+
83
+
84
+ # This class is used to store the model data in the queue
85
+ @dataclass(frozen=True)
86
+ class EvalQueuedModel:
87
+ model: str
88
+ revision: str
89
+ precision: str
90
+ add_special_tokens: str
91
+ llm_jp_eval_version: str
92
+ vllm_version: str
93
+
94
+
95
+ ## All the model information that we might need
96
+ @dataclass
97
+ class ModelDetails:
98
+ name: str
99
+ display_name: str = ""
100
+ symbol: str = "" # emoji
101
+
102
+
103
+ class ModelType(Enum):
104
+ PT = ModelDetails(name="pretrained", symbol="🟢")
105
+ FT = ModelDetails(name="fine-tuned", symbol="🔶")
106
+ IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
107
+ RL = ModelDetails(name="RL-tuned (Preference optimization)", symbol="🟦")
108
+ MM = ModelDetails(name="multimodal", symbol="🌸")
109
+ BM = ModelDetails(name="base merges and moerges", symbol="🤝")
110
+
111
+ def to_str(self, separator=" "):
112
+ return f"{self.value.symbol}{separator}{self.value.name}"
113
+
114
+ @staticmethod
115
+ def from_str(type):
116
+ if "fine-tuned" in type or "🔶" in type:
117
+ return ModelType.FT
118
+ if "pretrained" in type or "🟢" in type:
119
+ return ModelType.PT
120
+ if "RL-tuned" in type or "🟦" in type:
121
+ return ModelType.RL
122
+ if "instruction-tuned" in type or "⭕" in type:
123
+ return ModelType.IFT
124
+ if "multimodal" in type or "🌸" in type:
125
+ return ModelType.MM
126
+ if "base merges and moerges" in type or "🤝" in type:
127
+ return ModelType.BM
128
+ raise ValueError(f"Unsupported model type: {type}")
129
+
130
+
131
+ class WeightType(Enum):
132
+ Adapter = ModelDetails("Adapter")
133
+ Original = ModelDetails("Original")
134
+ Delta = ModelDetails("Delta")
135
+
136
+
137
+ class Precision(Enum):
138
+ float16 = ModelDetails("float16")
139
+ bfloat16 = ModelDetails("bfloat16")
140
+ float32 = ModelDetails("float32")
141
+
142
+ @staticmethod
143
+ def from_str(precision: str) -> "Precision":
144
+ if precision == "float16":
145
+ return Precision.float16
146
+ if precision == "bfloat16":
147
+ return Precision.bfloat16
148
+ if precision == "float32":
149
+ return Precision.float32
150
+ raise ValueError(
151
+ f"Unsupported precision type: {precision}. Please use 'auto' (recommended), 'float32', 'float16', or 'bfloat16'"
152
+ )
153
+
154
+
155
+ class AddSpecialTokens(Enum):
156
+ true = ModelDetails("True")
157
+ false = ModelDetails("False")
158
+
159
+
160
+ class NumFewShots(Enum):
161
+ shots_0 = 0
162
+ shots_4 = 4
163
+
164
+
165
+ class LLMJpEvalVersion(Enum):
166
+ current = ModelDetails("v1.4.1")
167
+
168
+ @staticmethod
169
+ def from_str(version: str) -> "LLMJpEvalVersion":
170
+ if version == "1.4.1":
171
+ return LLMJpEvalVersion.current
172
+ raise ValueError(f"Unsupported LLMJpEval version: {version}")
173
+
174
+
175
+ class VllmVersion(Enum):
176
+ current = ModelDetails("v0.6.3.post1")
177
+
178
+ @staticmethod
179
+ def from_str(version: str) -> "VllmVersion":
180
+ if version == "v0.6.3.post1":
181
+ return VllmVersion.current
182
+ raise ValueError(f"Unsupported VLLM version: {version}")
183
+
184
+
185
+ # Column selection
186
+ COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
187
+ TYPES = [c.type for c in fields(AutoEvalColumn)]
188
+
189
+ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
190
+ EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
191
+
192
+ BENCHMARK_COLS = [t.value.col_name for t in Tasks]
193
+
194
+ NUMERIC_INTERVALS = {
195
+ "0~3B": pd.Interval(0, 3, closed="right"),
196
+ "3~7B": pd.Interval(3, 7.3, closed="right"),
197
+ "7~13B": pd.Interval(7.3, 13, closed="right"),
198
+ "13~35B": pd.Interval(13, 35, closed="right"),
199
+ "35~60B": pd.Interval(35, 60, closed="right"),
200
+ "60B+": pd.Interval(60, 10000, closed="right"),
201
+ "?": pd.Interval(-1, 0, closed="right"),
202
+ }
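A small sketch of how `NUMERIC_INTERVALS` can be used for the parameter-size filter. The actual filtering code lives in `app.py`, which is not part of this excerpt, so the helper below is hypothetical; note that unknown sizes are stored as 0 and therefore fall into the "?" bucket.

```python
from src.display.utils import NUMERIC_INTERVALS


def size_bucket(params_in_billions: float) -> str:
    """Hypothetical helper: label of the first interval containing the parameter count."""
    for label, interval in NUMERIC_INTERVALS.items():
        if params_in_billions in interval:
            return label
    return "?"


print(size_bucket(6.7))  # "3~7B"
print(size_bucket(0))    # "?" (unknown model size)
```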
src/envs.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pathlib
3
+
4
+ from huggingface_hub import HfApi
5
+
6
+ # Info to change for your repository
7
+ # ----------------------------------
8
+ HF_TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
9
+
10
+ OWNER = "e-mon" # Change to your org - don't forget to create a results and request dataset, with the correct format!
11
+ # ----------------------------------
12
+
13
+ REPO_ID = f"{OWNER}/open-japanese-llm-leaderboard-v2"
14
+ QUEUE_REPO = f"{OWNER}/leaderboard-requests-v2"
15
+ CONTENTS_REPO = f"{OWNER}/leaderboard-contents-v2"
16
+
17
+ # If you setup a cache later, just change HF_HOME
18
+ CACHE_PATH = pathlib.Path(os.getenv("HF_HOME", "."))
19
+
20
+ # Local caches
21
+ EVAL_REQUESTS_PATH = CACHE_PATH / "eval-queue"
22
+
23
+ API = HfApi(token=HF_TOKEN)
src/i18n.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Column selection
2
+ SELECT_ALL_BUTTON_LABEL = "Select All"
3
+ SELECT_ALL_BUTTON_LABEL_JA = "全選択"
4
+ SELECT_NONE_BUTTON_LABEL = "Select None"
5
+ SELECT_NONE_BUTTON_LABEL_JA = "全解除"
6
+ SELECT_AVG_ONLY_BUTTON_LABEL = "AVG Only"
7
+ SELECT_AVG_ONLY_BUTTON_LABEL_JA = "AVGのみ"
8
+
9
+ # Citation
10
+ CITATION_ACCORDION_LABEL = "📙 Citation"
11
+ CITATION_ACCORDION_LABEL_JA = "📙 引用"
src/populate.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+
4
+ import datasets
5
+ import pandas as pd
6
+
7
+ from src.about import Tasks
8
+ from src.display.formatting import has_no_nan_values, make_clickable_model, make_clickable_model_with_shot
9
+ from src.display.utils import AutoEvalColumn, EvalQueueColumn
10
+
11
+ # The values of these columns are in the range of 0-100
12
+ # We normalize them to 0-1
13
+ COLUMNS_TO_NORMALIZE = [
14
+ "ALT E to J BLEU",
15
+ "ALT J to E BLEU",
16
+ "WikiCorpus E to J BLEU",
17
+ "WikiCorpus J to E BLEU",
18
+ "XL-Sum JA BLEU",
19
+ "XL-Sum ROUGE1",
20
+ "XL-Sum ROUGE2",
21
+ "XL-Sum ROUGE-Lsum",
22
+ ]
23
+
24
+
25
+ def get_leaderboard_df(contents_repo: str, cols: list[str], benchmark_cols: list[str]) -> pd.DataFrame:
26
+ df = datasets.load_dataset(contents_repo, split="train").to_pandas()
27
+ # df["Model"] = df["model"].map(make_clickable_model)
28
+ df["Model"] = df.apply(lambda x: make_clickable_model_with_shot(x["model"], x["num_few_shot"]), axis=1)
29
+ df["T"] = df["model_type"].map(lambda x: x.split(":")[0].strip())
30
+ df = df.rename(columns={task.value.metric: task.value.col_name for task in Tasks})
31
+ df = df.rename(
32
+ columns={
33
+ "architecture": "Architecture",
34
+ "weight_type": "Weight type",
35
+ "precision": "Precision",
36
+ "license": "Hub License",
37
+ "params": "#Params (B)",
38
+ "likes": "Hub ❤️",
39
+ "revision": "Revision",
40
+ "num_few_shot": "Few-shot",
41
+ "add_special_tokens": "Add Special Tokens",
42
+ "llm_jp_eval_version": "llm-jp-eval version",
43
+ "vllm_version": "vllm version",
44
+ "model_type": "Type",
45
+ "model": "model_name_for_query",
46
+ }
47
+ )
48
+
49
+ # Add a row ID column
50
+ df[AutoEvalColumn.row_id.name] = range(len(df))
51
+
52
+ # Normalize the columns
53
+ available_columns_to_normalize = [col for col in COLUMNS_TO_NORMALIZE if col in df.columns]
54
+ df[available_columns_to_normalize] = df[available_columns_to_normalize] / 100
55
+
56
+ df = df.sort_values(by=[AutoEvalColumn.AVG.name], ascending=False)
57
+ df = df[cols].round(decimals=4)
58
+
59
+ # filter out if any of the benchmarks have not been produced
60
+ df = df[has_no_nan_values(df, benchmark_cols)]
61
+
62
+ return df
63
+
64
+
65
+ def get_evaluation_queue_df(save_path: str, cols: list[str]) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
66
+ """Creates the dataframes for the finished, running, pending, and failed evaluation queue requests"""
67
+ entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
68
+ all_evals = []
69
+
70
+ for entry in entries:
71
+ if ".json" in entry:
72
+ file_path = os.path.join(save_path, entry)
73
+ with open(file_path) as fp:
74
+ data = json.load(fp)
75
+
76
+ # data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
77
+ data[EvalQueueColumn.model.name] = make_clickable_model_with_shot(
78
+ data["model"],
79
+ data["num_few_shot"], # num_few_shotは必ず存在するため、直接参照
80
+ )
81
+ data[EvalQueueColumn.revision.name] = data.get("revision", "main")
82
+
83
+ all_evals.append(data)
84
+ elif ".md" not in entry:
85
+ # this is a folder
86
+ sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
87
+ for sub_entry in sub_entries:
88
+ file_path = os.path.join(save_path, entry, sub_entry)
89
+ with open(file_path) as fp:
90
+ data = json.load(fp)
91
+
92
+ data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
93
+ data[EvalQueueColumn.revision.name] = data.get("revision", "main")
94
+ all_evals.append(data)
95
+
96
+ pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
97
+ running_list = [e for e in all_evals if e["status"] == "RUNNING"]
98
+ finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
99
+ failed_list = [e for e in all_evals if e["status"] == "FAILED"]
100
+ df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
101
+ df_running = pd.DataFrame.from_records(running_list, columns=cols)
102
+ df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
103
+ df_failed = pd.DataFrame.from_records(failed_list, columns=cols)
104
+ return df_finished[cols], df_running[cols], df_pending[cols], df_failed[cols]
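For context, a sketch of how these loaders are presumably wired together by the app (the `app.py` entry point is not included in this excerpt, so the wiring below is an assumption; the call signatures come from the code above).

```python
from src.display.utils import BENCHMARK_COLS, COLS, EVAL_COLS
from src.envs import CONTENTS_REPO, EVAL_REQUESTS_PATH
from src.populate import get_evaluation_queue_df, get_leaderboard_df

# Main results table: clickable model names, normalized scores, rows with missing benchmarks dropped.
leaderboard_df = get_leaderboard_df(CONTENTS_REPO, COLS, BENCHMARK_COLS)

# Queue tables for the submission tab, split by request status.
finished_df, running_df, pending_df, failed_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
```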
src/submission/check_validity.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import pathlib
4
+
5
+ import huggingface_hub
6
+ import requests
7
+ from huggingface_hub import ModelCard
8
+ from huggingface_hub.hf_api import ModelInfo
9
+ from transformers import AutoConfig, PretrainedConfig
10
+ from transformers.models.auto.tokenization_auto import AutoTokenizer
11
+
12
+ from src.display.utils import EvalQueuedModel
13
+
14
+
15
+ def check_model_card(repo_id: str) -> tuple[bool, str]:
16
+ """Checks if the model card and license exist and have been filled"""
17
+ try:
18
+ card = ModelCard.load(repo_id)
19
+ except huggingface_hub.utils.EntryNotFoundError:
20
+ return False, "Please add a model card to your model to explain how you trained/fine-tuned it."
21
+
22
+ # Enforce license metadata
23
+ if card.data.license is None:
24
+ if not ("license_name" in card.data and "license_link" in card.data):
25
+ return False, (
26
+ "License not found. Please add a license to your model card using the `license` metadata or a"
27
+ " `license_name`/`license_link` pair."
28
+ )
29
+
30
+ # Enforce card content
31
+ if len(card.text) < 200:
32
+ return False, "Please add a description to your model card, it is too short."
33
+
34
+ return True, ""
35
+
36
+
37
+ def is_model_on_hub(
38
+ model_name: str, revision: str, token: str | None = None, trust_remote_code=False, test_tokenizer=False
39
+ ) -> tuple[bool, str | None, PretrainedConfig | None]:
40
+ """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
41
+ try:
42
+ config = AutoConfig.from_pretrained(
43
+ model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
44
+ )
45
+ if test_tokenizer:
46
+ try:
47
+ AutoTokenizer.from_pretrained(
48
+ model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
49
+ )
50
+ except ValueError as e:
51
+ return (False, f"uses a tokenizer which is not in a transformers release: {e}", None)
52
+ except Exception:
53
+ return (
54
+ False,
55
+ "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?",
56
+ None,
57
+ )
58
+ return True, None, config
59
+
60
+ except ValueError:
61
+ return (
62
+ False,
63
+ "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
64
+ None,
65
+ )
66
+
67
+ except OSError as e:
68
+ if "gated repo" in str(e):
69
+ slack_webhook_url = os.environ["SLACK_WEBHOOK_URL"]
70
+ text = f"<!channel>\n{model_name} is gated model! Please submit this model."
71
+ requests.post(slack_webhook_url, data=json.dumps({"text": text}))
72
+ return False, "is gated model! Please wait.", None
73
+ return False, "was not found on hub!", None
74
+ except Exception:
75
+ return False, "was not found on hub!", None
76
+
77
+
78
+ def get_model_size(model_info: ModelInfo, precision: str):
79
+ """Gets the model size from the configuration, or the model name if the configuration does not contain the information."""
80
+ try:
81
+ model_size = round(model_info.safetensors["total"] / 1e9, 3)
82
+ except (AttributeError, TypeError):
83
+ return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
84
+
85
+ size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
86
+ model_size = size_factor * model_size
87
+ return model_size
88
+
89
+
90
+ def get_model_arch(model_info: ModelInfo):
91
+ """Gets the model architecture from the configuration"""
92
+ return model_info.config.get("architectures", "Unknown")
93
+
94
+
95
+ def already_submitted_models(requested_models_dir: pathlib.Path) -> set[EvalQueuedModel]:
96
+ """Gather a list of already submitted models to avoid duplicates"""
97
+ queued_models = set()
98
+ for json_path in requested_models_dir.glob("*/*.json"):
99
+ with json_path.open() as f:
100
+ info = json.load(f)
101
+ queued_models.add(
102
+ EvalQueuedModel(
103
+ model=info["model"],
104
+ revision=info["revision"],
105
+ precision=info["precision"],
106
+ add_special_tokens=info["add_special_tokens"],
107
+ llm_jp_eval_version=info["llm_jp_eval_version"],
108
+ vllm_version=info["vllm_version"],
109
+ )
110
+ )
111
+ return queued_models
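A brief sketch of how these checks are typically chained before a model is queued; the real call sites are in `src/submission/submit.py` just below, and the model id here is a placeholder.

```python
from src.envs import HF_TOKEN
from src.submission.check_validity import check_model_card, is_model_on_hub

model_id = "your-org/your-model"  # placeholder

on_hub, error, config = is_model_on_hub(model_id, revision="main", token=HF_TOKEN, test_tokenizer=True)
if not on_hub:
    raise SystemExit(f'Model "{model_id}" {error}')

card_ok, card_error = check_model_card(model_id)
if not card_ok:
    raise SystemExit(card_error)
```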
src/submission/submit.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from datetime import datetime, timezone
3
+
4
+ import torch
5
+
6
+ from src.display.formatting import styled_error, styled_message, styled_warning
7
+ from src.display.utils import EvalQueuedModel, LLMJpEvalVersion, VllmVersion
8
+ from src.envs import API, EVAL_REQUESTS_PATH, HF_TOKEN, QUEUE_REPO
9
+ from src.submission.check_validity import already_submitted_models, check_model_card, is_model_on_hub
10
+
11
+ REQUESTED_MODELS: set[EvalQueuedModel] = set()
12
+
13
+ LLM_JP_EVAL_VERSION = LLMJpEvalVersion.current.value.name
14
+ VLLM_VERSION = VllmVersion.current.value.name
15
+
16
+
17
+ def add_new_eval(
18
+ model_id: str,
19
+ revision: str,
20
+ precision: str,
21
+ model_type: str,
22
+ add_special_tokens: str,
23
+ ):
24
+ global REQUESTED_MODELS
25
+ if not REQUESTED_MODELS:
26
+ REQUESTED_MODELS = already_submitted_models(EVAL_REQUESTS_PATH)
27
+
28
+ revision = revision or "main"
29
+
30
+ # Is the model on the hub?
31
+ model_on_hub, error, config = is_model_on_hub(
32
+ model_name=model_id, revision=revision, token=HF_TOKEN, test_tokenizer=True
33
+ )
34
+ if not model_on_hub:
35
+ return styled_error(f'Model "{model_id}" {error}')
36
+ if precision == "auto":
37
+ dtype = ""
38
+ if hasattr(config, "torch_dtype"):
39
+ dtype = config.torch_dtype
40
+ if dtype == torch.float16:
41
+ precision = "float16"
42
+ elif dtype == torch.bfloat16:
43
+ precision = "bfloat16"
44
+ elif dtype == torch.float32:
45
+ precision = "float32"
46
+ else:
47
+ return styled_error(
48
+ "Unable to retrieve a valid dtype from config.json. Please select an appropriate one from fp16/fp32/bf16 and resubmit."
49
+ )
50
+
51
+ model_data = EvalQueuedModel(
52
+ model=model_id,
53
+ revision=revision,
54
+ precision=precision,
55
+ add_special_tokens=add_special_tokens,
56
+ llm_jp_eval_version=LLM_JP_EVAL_VERSION,
57
+ vllm_version=VLLM_VERSION,
58
+ )
59
+
60
+ if model_data in REQUESTED_MODELS:
61
+ return styled_warning("This model has already been submitted with the same configuration.")
62
+
63
+ if "/" in model_id:
64
+ user_or_org, model_name = model_id.split("/")
65
+ else:
66
+ user_or_org, model_name = "", model_id
67
+
68
+ current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
69
+
70
+ if model_type is None or model_type == "":
71
+ return styled_error("Please select a model type.")
72
+
73
+ # Is the model info correctly filled?
74
+ try:
75
+ model_info = API.model_info(repo_id=model_id, revision=revision)
76
+ except Exception:
77
+ return styled_error("Could not get your model information. Please fill it up properly.")
78
+
79
+ # Were the model card and license filled?
80
+ try:
81
+ _ = model_info.cardData["license"]
82
+ except Exception:
83
+ return styled_error("Please select a license for your model")
84
+
85
+ modelcard_OK, error_msg = check_model_card(model_id)
86
+ if not modelcard_OK:
87
+ return styled_error(error_msg)
88
+
89
+ # Seems good, creating the eval
90
+ print("Adding new eval")
91
+
92
+ eval_entry = {
93
+ "model_type": model_type,
94
+ "model": model_id,
95
+ "precision": precision,
96
+ "revision": revision,
97
+ "add_special_tokens": add_special_tokens,
98
+ "llm_jp_eval_version": LLM_JP_EVAL_VERSION,
99
+ "vllm_version": VLLM_VERSION,
100
+ "status": "PENDING",
101
+ "submitted_time": current_time,
102
+ }
103
+
104
+ print("Creating eval file")
105
+ OUT_DIR = EVAL_REQUESTS_PATH / user_or_org
106
+ OUT_DIR.mkdir(parents=True, exist_ok=True)
107
+ out_file_name = f"{model_name}_{current_time.replace(':','-')}.json"
108
+ out_path = OUT_DIR / out_file_name
109
+
110
+ with out_path.open("w") as f:
111
+ f.write(json.dumps(eval_entry))
112
+
113
+ print("Uploading eval file")
114
+ API.upload_file(
115
+ path_or_fileobj=out_path,
116
+ path_in_repo=out_path.relative_to(EVAL_REQUESTS_PATH).as_posix(),
117
+ repo_id=QUEUE_REPO,
118
+ repo_type="dataset",
119
+ commit_message=f"Add {model_id} to eval queue",
120
+ )
121
+ REQUESTED_MODELS.add(model_data)
122
+
123
+ # Remove the local file
124
+ out_path.unlink()
125
+
126
+ return styled_message(
127
+ "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
128
+ )
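One design detail worth calling out: `EvalQueuedModel` is a frozen dataclass, so instances are hashable and compare by value. That is what allows `REQUESTED_MODELS` to be a plain `set` and makes the duplicate-submission guard in `add_new_eval` a simple membership test. A minimal sketch (the field values are made up):

```python
import dataclasses

from src.display.utils import EvalQueuedModel

a = EvalQueuedModel(
    model="your-org/your-model",
    revision="main",
    precision="bfloat16",
    add_special_tokens="False",
    llm_jp_eval_version="v1.4.1",
    vllm_version="v0.6.3.post1",
)
b = dataclasses.replace(a)  # an equal but distinct instance

assert a == b            # value equality from the generated __eq__
assert len({a, b}) == 1  # frozen=True makes instances hashable, so the set deduplicates
```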
style.css ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .markdown-text {
2
+ font-size: 16px !important;
3
+ }
4
+
5
+ #models-to-add-text {
6
+ font-size: 18px !important;
7
+ }
8
+
9
+ #citation-button span {
10
+ font-size: 16px !important;
11
+ }
12
+
13
+ #citation-button textarea {
14
+ font-size: 16px !important;
15
+ }
16
+
17
+ #citation-button > label > button {
18
+ margin: 6px;
19
+ transform: scale(1.3);
20
+ }
21
+
22
+ #leaderboard-table {
23
+ margin-top: 15px;
24
+ }
25
+
26
+ #search-bar-table-box > div:first-child {
27
+ background: none;
28
+ border: none;
29
+ }
30
+
31
+ #search-bar {
32
+ padding: 0px;
33
+ }
34
+
35
+ /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
36
+ #leaderboard-table td:nth-child(2),
37
+ #leaderboard-table th:nth-child(2) {
38
+ max-width: 400px;
39
+ overflow: auto;
40
+ white-space: nowrap;
41
+ }
42
+
43
+ @media (min-width: 700px) {
44
+ #leaderboard-table td:nth-child(2) {
45
+ left: 0;
46
+ z-index: 1;
47
+ position: sticky;
48
+ border-right: solid rgba(0, 0, 0, 0.1) !important;
49
+ }
50
+ }
51
+ @media (min-width: 700px) and (prefers-color-scheme: light) {
52
+ #leaderboard-table td:nth-child(2) {
53
+ background-color: rgba(255, 255, 255, 0.9) !important;
54
+ }
55
+ }
56
+
57
+ @media (min-width: 700px) and (prefers-color-scheme: dark) {
58
+ #leaderboard-table td:nth-child(2) {
59
+ background-color: rgba(52, 65, 86, 0.9) !important;
60
+ }
61
+ #leaderboard-table td a {
62
+ color: white !important;
63
+ }
64
+ }
65
+
66
+ #llm-benchmark-tab-table-button, #llm-benchmark-tab-about-button, #llm-benchmark-tab-submit-button {
67
+ font-size: 20px;
68
+ }
69
+
70
+ #scale-logo {
71
+ border-style: none !important;
72
+ box-shadow: none;
73
+ display: block;
74
+ margin-left: auto;
75
+ margin-right: auto;
76
+ max-width: 600px;
77
+ }
78
+
79
+ #scale-logo .download {
80
+ display: none;
81
+ }
82
+ #filter_type {
83
+ border: 0;
84
+ padding-left: 0;
85
+ padding-top: 0;
86
+ }
87
+ #filter_type label {
88
+ display: flex;
89
+ }
90
+ #filter_type label > span {
91
+ margin-top: var(--spacing-lg);
92
+ margin-right: 0.5em;
93
+ }
94
+ #filter_type label > .wrap {
95
+ width: 103px;
96
+ }
97
+ #filter_type label > .wrap .wrap-inner {
98
+ padding: 2px;
99
+ }
100
+ #filter_type label > .wrap .wrap-inner input {
101
+ width: 1px;
102
+ }
103
+ #filter-columns-type {
104
+ border: 0;
105
+ padding: 0.5;
106
+ }
107
+ #filter-columns-size {
108
+ border: 0;
109
+ padding: 0.5;
110
+ }
111
+ #box-filter > .form {
112
+ border: 0;
113
+ }
114
+
115
+ .language-selector {
116
+ width: auto;
117
+ display: flex;
118
+ justify-content: center;
119
+ margin: 20px 0;
120
+ }
121
+
122
+ /* Full width space */
123
+ .gradio-container {
124
+ max-width: 95% !important;
125
+ }
126
+
127
+ .accordion {
128
+ min-width: 200px !important;
129
+ border: solid rgba(175, 175, 175, 0.1) !important;
130
+ }
131
+
132
+ /* make the plotly modebar horizontal */
133
+ .modebar-group {
134
+ display: flex;
135
+ flex-direction: row;
136
+ align-items: center;
137
+ }
138
+
139
+ /* Hides the final AutoEvalColumn */
140
+ #llm-benchmark-tab-table table td:last-child,
141
+ #llm-benchmark-tab-table table th:last-child {
142
+ display: none;
143
+ }
uv.lock ADDED
The diff for this file is too large to render. See raw diff