Commit: Upload folder using huggingface_hub

Files changed:
- .gitattributes +5 -1
- .gitignore +14 -0
- .pre-commit-config.yaml +19 -0
- .python-version +1 -0
- .vscode/settings.json +15 -0
- Makefile +13 -0
- README.md +18 -8
- app.py +706 -0
- pyproject.toml +41 -0
- requirements.txt +277 -0
- src/Logos-HQ/B-Test-1-D-Top-Logo.png +3 -0
- src/Logos-HQ/B-Test-2-Bottom-Logo-B.png +3 -0
- src/Logos-HQ/HuggingFace-Logo-Oct-2024.png +3 -0
- src/Logos-HQ/LLM-jp-Logo-Oct-2024.png +0 -0
- src/Logos-HQ/MDX-Logo-Oct-2024.jpg +3 -0
- src/about.py +478 -0
- src/display/formatting.py +37 -0
- src/display/utils.py +202 -0
- src/envs.py +23 -0
- src/i18n.py +11 -0
- src/populate.py +104 -0
- src/submission/check_validity.py +111 -0
- src/submission/submit.py +128 -0
- style.css +143 -0
- uv.lock +0 -0
.gitattributes
CHANGED
@@ -25,7 +25,6 @@
 *.safetensors filter=lfs diff=lfs merge=lfs -text
 saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
 *.wasm filter=lfs diff=lfs merge=lfs -text
@@ -33,3 +32,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+scale-hf-logo.png filter=lfs diff=lfs merge=lfs -text
+src/Logos-HQ/B-Test-1-D-Top-Logo.png filter=lfs diff=lfs merge=lfs -text
+src/Logos-HQ/B-Test-2-Bottom-Logo-B.png filter=lfs diff=lfs merge=lfs -text
+src/Logos-HQ/HuggingFace-Logo-Oct-2024.png filter=lfs diff=lfs merge=lfs -text
+src/Logos-HQ/MDX-Logo-Oct-2024.jpg filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
auto_evals/
venv/
__pycache__/
.env
.ipynb_checkpoints
*ipynb
.venv
.ruff_cache

eval-queue/
eval-results/
eval-queue-bk/
eval-results-bk/
logs/
.pre-commit-config.yaml
ADDED
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v5.0.0
    hooks:
      - id: check-yaml
      - id: check-case-conflict
      - id: detect-private-key
      - id: check-added-large-files
        args: ["--maxkb=1000"]
      - id: requirements-txt-fixer
      - id: end-of-file-fixer
      - id: trailing-whitespace
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.8.4
    hooks:
      - id: ruff
        args: ["--select", "E,F,I,UP,W", "--ignore", "E501", "--fix"]
      - id: ruff-format
        args: ["--line-length", "119"]
.python-version
ADDED
3.10.15
.vscode/settings.json
ADDED
{
    "editor.formatOnSave": true,
    "files.insertFinalNewline": false,
    "[python]": {
        "editor.defaultFormatter": "charliermarsh.ruff",
        "editor.formatOnType": true,
        "editor.codeActionsOnSave": {
            "source.fixAll.ruff": "explicit",
            "source.organizeImports": "explicit"
        }
    },
    "flake8.args": [
        "--max-line-length=119"
    ],
}
Makefile
ADDED
.PHONY: style quality


style:
	python -m black --line-length 119 .
	python -m isort .
	ruff check --fix .


quality:
	python -m black --check --line-length 119 .
	python -m isort --check-only .
	ruff check .
README.md
CHANGED
@@ -1,12 +1,22 @@
 ---
-title: Open Japanese
-emoji:
-colorFrom:
-colorTo:
+title: Open Japanese LLM Leaderboard
+emoji: 🌸
+colorFrom: gray
+colorTo: gray
 sdk: gradio
-sdk_version: 5.49.1
 app_file: app.py
-pinned:
+pinned: true
+license: apache-2.0
+sdk_version: 5.9.1
+fullWidth: true
+datasets:
+- llm-jp/leaderboard-requests
+- llm-jp/leaderboard-results
+- llm-jp/leaderboard-contents
+tags:
+- 日本語
+- Japanese
+- leaderboard
+- language:日本語
+- language:Japanese
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
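The new front matter registers three dataset repos that back the leaderboard. As a minimal sketch of what that wiring implies (assuming `llm-jp/leaderboard-contents` is public and exposes a `train` split; the app's actual loading path is `src/populate.get_leaderboard_df`, whose implementation is not shown in this commit):

```python
# Minimal sketch, not the app's code path: peek at one of the dataset
# repos declared under `datasets:` in the README front matter.
from datasets import load_dataset

# Assumption: the repo is public and has a "train" split.
contents = load_dataset("llm-jp/leaderboard-contents", split="train")
print(contents.column_names)
```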
app.py
ADDED
import os

import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download

from src.about import (
    BOTTOM_LOGO,
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_LABEL_JA,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    EVALUATION_QUEUE_TEXT_JA,
    INTRODUCTION_TEXT,
    INTRODUCTION_TEXT_JA,
    LLM_BENCHMARKS_TEXT,
    LLM_BENCHMARKS_TEXT_JA,
    TITLE,
    TaskType,
)
from src.display.utils import (
    BENCHMARK_COLS,
    COLS,
    EVAL_COLS,
    EVAL_TYPES,
    NUMERIC_INTERVALS,
    TYPES,
    AddSpecialTokens,
    AutoEvalColumn,
    LLMJpEvalVersion,
    ModelType,
    NumFewShots,
    Precision,
    VllmVersion,
    fields,
)
from src.envs import API, CONTENTS_REPO, EVAL_REQUESTS_PATH, QUEUE_REPO, REPO_ID
from src.i18n import (
    CITATION_ACCORDION_LABEL,
    CITATION_ACCORDION_LABEL_JA,
    SELECT_ALL_BUTTON_LABEL,
    SELECT_ALL_BUTTON_LABEL_JA,
    SELECT_AVG_ONLY_BUTTON_LABEL,
    SELECT_AVG_ONLY_BUTTON_LABEL_JA,
    SELECT_NONE_BUTTON_LABEL,
    SELECT_NONE_BUTTON_LABEL_JA,
)
from src.populate import get_evaluation_queue_df, get_leaderboard_df
from src.submission.submit import add_new_eval


def restart_space() -> None:
    API.restart_space(repo_id=REPO_ID)


# Space initialization
try:
    snapshot_download(
        repo_id=QUEUE_REPO,
        local_dir=EVAL_REQUESTS_PATH,
        repo_type="dataset",
        tqdm_class=None,
        etag_timeout=30,
    )
except Exception:
    restart_space()


# Get dataframes

(
    FINISHED_EVAL_QUEUE_DF,
    RUNNING_EVAL_QUEUE_DF,
    PENDING_EVAL_QUEUE_DF,
    FAILED_EVAL_QUEUE_DF,
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)

try:
    ORIGINAL_DF = get_leaderboard_df(CONTENTS_REPO, COLS, BENCHMARK_COLS)
except Exception as e:
    print(f"Error getting leaderboard df: {e}")
    ORIGINAL_DF = pd.DataFrame()


# Searching and filtering


def filter_models(
    df: pd.DataFrame,
    type_query: list[str],
    size_query: list[str],
    precision_query: list[str],
    add_special_tokens_query: list[str],
    num_few_shots_query: list[int],
    version_query: list[str],
    vllm_query: list[str],
) -> pd.DataFrame:
    # Filter by model type
    type_emoji = [t.split()[0] for t in type_query]
    df = df[df["T"].isin(type_emoji)]

    # Filter by precision
    df = df[df["Precision"].isin(precision_query)]

    # Filter by model size
    # Note: When `df` is empty, `size_mask` is empty, and the shape of `df[size_mask]` becomes (0, 0),
    # so we need to check the length of `df` before applying the filter.
    if len(df) > 0:
        size_mask = df["#Params (B)"].apply(
            lambda x: any(x in NUMERIC_INTERVALS[s] for s in size_query if s != "Unknown")
        )
        if "Unknown" in size_query:
            size_mask |= df["#Params (B)"].isna() | (df["#Params (B)"] == 0)
        df = df[size_mask]

    # Filter by special tokens setting
    df = df[df["Add Special Tokens"].isin(add_special_tokens_query)]

    # Filter by number of few-shot examples
    df = df[df["Few-shot"].isin(num_few_shots_query)]

    # Filter by evaluator version
    df = df[df["llm-jp-eval version"].isin(version_query)]

    # Filter by vLLM version
    df = df[df["vllm version"].isin(vllm_query)]

    return df


def search_model_by_name(df: pd.DataFrame, model_name: str) -> pd.DataFrame:
    return df[df[AutoEvalColumn.dummy.name].str.contains(model_name, case=False)]


def search_models_by_multiple_names(df: pd.DataFrame, search_text: str) -> pd.DataFrame:
    if not search_text:
        return df
    model_names = [name.strip() for name in search_text.split(";")]
    dfs = [search_model_by_name(df, name) for name in model_names if name]
    return pd.concat(dfs).drop_duplicates(subset=AutoEvalColumn.row_id.name)


def select_columns(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
    always_here_cols = [
        AutoEvalColumn.model_type_symbol.name,  # 'T'
        AutoEvalColumn.model.name,  # 'Model'
    ]

    # Remove 'always_here_cols' from 'columns' to avoid duplicates
    columns = [c for c in columns if c not in always_here_cols]
    new_columns = (
        always_here_cols + [c for c in COLS if c in df.columns and c in columns] + [AutoEvalColumn.row_id.name]
    )

    # Maintain order while removing duplicates
    seen = set()
    unique_columns = []
    for c in new_columns:
        if c not in seen:
            unique_columns.append(c)
            seen.add(c)

    # Create DataFrame with filtered columns
    filtered_df = df[unique_columns]
    return filtered_df


def update_table(
    type_query: list[str],
    precision_query: list[str],
    size_query: list[str],
    add_special_tokens_query: list[str],
    num_few_shots_query: list[int],
    version_query: list[str],
    vllm_query: list[str],
    query: str,
    *columns,
) -> pd.DataFrame:
    columns = [item for column in columns for item in column]
    df = filter_models(
        ORIGINAL_DF,
        type_query,
        size_query,
        precision_query,
        add_special_tokens_query,
        num_few_shots_query,
        version_query,
        vllm_query,
    )
    df = search_models_by_multiple_names(df, query)
    df = select_columns(df, columns)
    return df


# Prepare the dataframes


INITIAL_COLUMNS = ["T"] + [
    c.name for c in fields(AutoEvalColumn) if (c.never_hidden or c.displayed_by_default) and c.name != "T"
]
leaderboard_df = ORIGINAL_DF.copy()
if len(leaderboard_df) > 0:
    leaderboard_df = filter_models(
        leaderboard_df,
        [t.to_str(" : ") for t in ModelType],
        list(NUMERIC_INTERVALS.keys()),
        [i.value.name for i in Precision],
        [i.value.name for i in AddSpecialTokens],
        [i.value for i in NumFewShots],
        [i.value.name for i in LLMJpEvalVersion],
        [i.value.name for i in VllmVersion],
    )
    leaderboard_df = select_columns(leaderboard_df, INITIAL_COLUMNS)
else:
    leaderboard_df = pd.DataFrame(columns=INITIAL_COLUMNS)

# Leaderboard demo


def toggle_all_categories(action: str) -> list[gr.CheckboxGroup]:
    """Function to control all category checkboxes at once"""
    results = []
    for task_type in TaskType:
        if task_type == TaskType.NotTask:
            # Maintain existing selection for Model details
            results.append(gr.CheckboxGroup())
        elif action == "all":
            # Select all
            results.append(
                gr.CheckboxGroup(
                    value=[
                        c.name
                        for c in fields(AutoEvalColumn)
                        if not c.hidden and not c.never_hidden and not c.dummy and c.task_type == task_type
                    ]
                )
            )
        elif action == "none":
            # Deselect all
            results.append(gr.CheckboxGroup(value=[]))
        elif action == "avg_only":
            # Select only AVG metrics
            results.append(
                gr.CheckboxGroup(
                    value=[
                        c.name
                        for c in fields(AutoEvalColumn)
                        if not c.hidden
                        and not c.never_hidden
                        and c.task_type == task_type
                        and ((task_type == TaskType.AVG) or (task_type != TaskType.AVG and c.average))
                    ]
                )
            )
    return results


TASK_AVG_NAME_MAP = {
    c.name: c.task_type.name for c in fields(AutoEvalColumn) if c.average and c.task_type != TaskType.AVG
}
AVG_COLUMNS = ["AVG"] + list(TASK_AVG_NAME_MAP.keys())


def plot_size_vs_score(df_filtered: pd.DataFrame) -> go.Figure:
    df = ORIGINAL_DF[ORIGINAL_DF[AutoEvalColumn.row_id.name].isin(df_filtered[AutoEvalColumn.row_id.name])]
    df = df[df["#Params (B)"] > 0]
    df = df[["model_name_for_query", "#Params (B)", "Few-shot"] + AVG_COLUMNS]
    df = df.rename(columns={"model_name_for_query": "Model", "Few-shot": "n-shot"})
    df["model_name_without_org_name"] = df["Model"].str.split("/").str[-1] + " (" + df["n-shot"].astype(str) + "-shot)"
    df = pd.melt(
        df,
        id_vars=["Model", "model_name_without_org_name", "#Params (B)", "n-shot"],
        value_vars=AVG_COLUMNS,
        var_name="Category",
        value_name="Score",
    )
    max_model_size = df["#Params (B)"].max()
    fig = px.scatter(
        df,
        x="#Params (B)",
        y="Score",
        text="model_name_without_org_name",
        color="Category",
        hover_data=["Model", "n-shot", "Category"],
    )
    fig.update_traces(
        hovertemplate="<b>%{customdata[0]}</b><br>#Params: %{x:.2f}B<br>n-shot: %{customdata[1]}<br>%{customdata[2]}: %{y:.4f}<extra></extra>",
        textposition="top right",
        mode="markers",
    )
    for trace in fig.data:
        if trace.name != "AVG":
            trace.visible = "legendonly"
    fig.update_layout(xaxis_range=[0, max_model_size * 1.2], yaxis_range=[0, 1])
    fig.update_layout(
        updatemenus=[
            dict(
                type="buttons",
                direction="left",
                showactive=True,
                buttons=[
                    dict(label="Hide Labels", method="update", args=[{"mode": ["markers"]}]),
                    dict(label="Show Labels", method="update", args=[{"mode": ["markers+text"]}]),
                ],
                x=0.5,
                y=-0.2,
                xanchor="center",
                yanchor="top",
            )
        ]
    )
    return fig


def plot_average_scores(df_filtered: pd.DataFrame) -> go.Figure:
    df = ORIGINAL_DF[ORIGINAL_DF[AutoEvalColumn.row_id.name].isin(df_filtered[AutoEvalColumn.row_id.name])]
    df = df[["model_name_for_query", "Few-shot"] + list(TASK_AVG_NAME_MAP.keys())]
    df = df.rename(columns={"model_name_for_query": "Model", "Few-shot": "n-shot"})
    df = df.rename(columns=TASK_AVG_NAME_MAP)
    df = df.set_index(["Model", "n-shot"])

    fig = go.Figure()
    for i, ((name, n_shot), row) in enumerate(df.iterrows()):
        visible = True if i < 2 else "legendonly"  # Display only the first 2 models
        fig.add_trace(
            go.Scatterpolar(
                r=row.values,
                theta=row.index,
                fill="toself",
                name=f"{name} ({n_shot}-shot)",
                hovertemplate="%{theta}: %{r}",
                visible=visible,
            )
        )
    fig.update_layout(
        polar={
            "radialaxis": {"range": [0, 1]},
        },
        showlegend=True,
    )
    return fig


shown_columns_dict: dict[str, gr.CheckboxGroup] = {}
checkboxes: list[gr.CheckboxGroup] = []

with gr.Blocks() as demo_leaderboard:
    with gr.Row():
        search_bar = gr.Textbox(
            placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
            show_label=False,
            elem_id="search-bar",
        )
    with gr.Accordion("Column Filter", open=True):
        with gr.Row():
            with gr.Row():
                select_all_button = gr.Button(SELECT_ALL_BUTTON_LABEL_JA, size="sm")
                select_none_button = gr.Button(SELECT_NONE_BUTTON_LABEL_JA, size="sm")
                select_avg_only_button = gr.Button(SELECT_AVG_ONLY_BUTTON_LABEL_JA, size="sm")

        for task_type in TaskType:
            label = "Model details" if task_type == TaskType.NotTask else task_type.value
            with gr.Accordion(label, open=True, elem_classes="accordion"):
                with gr.Row(height=110):
                    shown_column = gr.CheckboxGroup(
                        show_label=False,
                        choices=[
                            c.name
                            for c in fields(AutoEvalColumn)
                            if not c.hidden and not c.never_hidden and not c.dummy and c.task_type == task_type
                        ],
                        value=[
                            c.name
                            for c in fields(AutoEvalColumn)
                            if c.displayed_by_default
                            and not c.hidden
                            and not c.never_hidden
                            and c.task_type == task_type
                        ],
                        elem_id="column-select",
                        container=False,
                    )
                    shown_columns_dict[task_type.name] = shown_column
                    checkboxes.append(shown_column)

    with gr.Accordion("Model Filter", open=True):
        with gr.Row():
            filter_columns_type = gr.CheckboxGroup(
                label="Model types",
                choices=[t.to_str() for t in ModelType],
                value=[t.to_str() for t in ModelType],
                elem_id="filter-columns-type",
            )
            filter_columns_precision = gr.CheckboxGroup(
                label="Precision",
                choices=[i.value.name for i in Precision],
                value=[i.value.name for i in Precision],
                elem_id="filter-columns-precision",
            )
            filter_columns_size = gr.CheckboxGroup(
                label="Model sizes (in billions of parameters)",
                choices=list(NUMERIC_INTERVALS.keys()),
                value=list(NUMERIC_INTERVALS.keys()),
                elem_id="filter-columns-size",
            )
            filter_columns_add_special_tokens = gr.CheckboxGroup(
                label="Add Special Tokens",
                choices=[i.value.name for i in AddSpecialTokens],
                value=[i.value.name for i in AddSpecialTokens],
                elem_id="filter-columns-add-special-tokens",
            )
            filter_columns_num_few_shots = gr.CheckboxGroup(
                label="Num Few Shots",
                choices=[i.value for i in NumFewShots],
                value=[i.value for i in NumFewShots],
                elem_id="filter-columns-num-few-shots",
            )
            filter_columns_version = gr.CheckboxGroup(
                label="llm-jp-eval version",
                choices=[i.value.name for i in LLMJpEvalVersion],
                value=[i.value.name for i in LLMJpEvalVersion],
                elem_id="filter-columns-version",
            )
            filter_columns_vllm = gr.CheckboxGroup(
                label="vllm version",
                choices=[i.value.name for i in VllmVersion],
                value=[i.value.name for i in VllmVersion],
                elem_id="filter-columns-vllm",
            )

    leaderboard_table = gr.Dataframe(
        value=leaderboard_df,
        headers=INITIAL_COLUMNS,
        datatype=TYPES,
        elem_id="leaderboard-table",
        interactive=False,
        visible=True,
    )

    graph_size_vs_score = gr.Plot(label="Size vs. Score")
    graph_average_scores = gr.Plot(label="Performance across Task Categories")

    select_all_button.click(
        fn=lambda: toggle_all_categories("all"),
        outputs=checkboxes,
        api_name=False,
        queue=False,
    )
    select_none_button.click(
        fn=lambda: toggle_all_categories("none"),
        outputs=checkboxes,
        api_name=False,
        queue=False,
    )
    select_avg_only_button.click(
        fn=lambda: toggle_all_categories("avg_only"),
        outputs=checkboxes,
        api_name=False,
        queue=False,
    )

    gr.on(
        triggers=[
            filter_columns_type.change,
            filter_columns_precision.change,
            filter_columns_size.change,
            filter_columns_add_special_tokens.change,
            filter_columns_num_few_shots.change,
            filter_columns_version.change,
            filter_columns_vllm.change,
            search_bar.submit,
        ]
        + [shown_columns.change for shown_columns in shown_columns_dict.values()],
        fn=update_table,
        inputs=[
            filter_columns_type,
            filter_columns_precision,
            filter_columns_size,
            filter_columns_add_special_tokens,
            filter_columns_num_few_shots,
            filter_columns_version,
            filter_columns_vllm,
            search_bar,
        ]
        + list(shown_columns_dict.values()),
        outputs=leaderboard_table,
    )

    leaderboard_table.change(
        fn=plot_size_vs_score,
        inputs=leaderboard_table,
        outputs=graph_size_vs_score,
        api_name=False,
        queue=False,
    )

    leaderboard_table.change(
        fn=plot_average_scores,
        inputs=leaderboard_table,
        outputs=graph_average_scores,
        api_name=False,
        queue=False,
    )


# Submission demo

with gr.Blocks() as demo_submission:
    with gr.Column():
        with gr.Row():
            evaluation_queue_text = gr.Markdown(EVALUATION_QUEUE_TEXT_JA, elem_classes="markdown-text")

        with gr.Column():
            with gr.Accordion(
                f"✅ Finished Evaluations ({len(FINISHED_EVAL_QUEUE_DF)})",
                open=False,
            ):
                with gr.Row():
                    finished_eval_table = gr.Dataframe(
                        value=FINISHED_EVAL_QUEUE_DF,
                        headers=EVAL_COLS,
                        datatype=EVAL_TYPES,
                        row_count=5,
                    )
            with gr.Accordion(
                f"🔄 Running Evaluation Queue ({len(RUNNING_EVAL_QUEUE_DF)})",
                open=False,
            ):
                with gr.Row():
                    running_eval_table = gr.Dataframe(
                        value=RUNNING_EVAL_QUEUE_DF,
                        headers=EVAL_COLS,
                        datatype=EVAL_TYPES,
                        row_count=5,
                    )

            with gr.Accordion(
                f"⏳ Pending Evaluation Queue ({len(PENDING_EVAL_QUEUE_DF)})",
                open=False,
            ):
                with gr.Row():
                    pending_eval_table = gr.Dataframe(
                        value=PENDING_EVAL_QUEUE_DF,
                        headers=EVAL_COLS,
                        datatype=EVAL_TYPES,
                        row_count=5,
                    )
            with gr.Accordion(
                f"❎ Failed Evaluation Queue ({len(FAILED_EVAL_QUEUE_DF)})",
                open=False,
            ):
                with gr.Row():
                    failed_eval_table = gr.Dataframe(
                        value=FAILED_EVAL_QUEUE_DF,
                        headers=EVAL_COLS,
                        datatype=EVAL_TYPES,
                        row_count=5,
                    )
    with gr.Row():
        gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")

    with gr.Row():
        with gr.Column():
            model_name_textbox = gr.Textbox(label="Model name")
            revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
            model_type = gr.Dropdown(
                label="Model type",
                choices=[t.to_str(" : ") for t in ModelType],
                multiselect=False,
                value=None,
            )

        with gr.Column():
            precision = gr.Dropdown(
                label="Precision",
                choices=[i.value.name for i in Precision] + ["auto"],
                multiselect=False,
                value="auto",
            )
            add_special_tokens = gr.Dropdown(
                label="AddSpecialTokens",
                choices=[i.value.name for i in AddSpecialTokens],
                multiselect=False,
                value="False",
            )

    submit_button = gr.Button("Submit Eval")
    submission_result = gr.Markdown()
    submit_button.click(
        fn=add_new_eval,
        inputs=[
            model_name_textbox,
            revision_name_textbox,
            precision,
            model_type,
            add_special_tokens,
        ],
        outputs=submission_result,
    )


# Main demo


def set_default_language(request: gr.Request) -> gr.Radio:
    if request.headers["Accept-Language"].split(",")[0].lower().startswith("ja"):
        return gr.Radio(value="🇯🇵 JA")
    else:
        return gr.Radio(value="🇺🇸 EN")


def update_language(
    language: str,
) -> tuple[
    gr.Markdown,  # introduction_text
    gr.Markdown,  # llm_benchmarks_text
    gr.Markdown,  # evaluation_queue_text
    gr.Textbox,  # citation_button
    gr.Button,  # select_all_button
    gr.Button,  # select_none_button
    gr.Button,  # select_avg_only_button
    gr.Accordion,  # citation_accordion
]:
    if language == "🇯🇵 JA":
        return (
            gr.Markdown(value=INTRODUCTION_TEXT_JA),
            gr.Markdown(value=LLM_BENCHMARKS_TEXT_JA),
            gr.Markdown(value=EVALUATION_QUEUE_TEXT_JA),
            gr.Textbox(label=CITATION_BUTTON_LABEL_JA),
            gr.Button(value=SELECT_ALL_BUTTON_LABEL_JA),
            gr.Button(value=SELECT_NONE_BUTTON_LABEL_JA),
            gr.Button(value=SELECT_AVG_ONLY_BUTTON_LABEL_JA),
            gr.Accordion(label=CITATION_ACCORDION_LABEL_JA),
        )
    else:
        return (
            gr.Markdown(value=INTRODUCTION_TEXT),
            gr.Markdown(value=LLM_BENCHMARKS_TEXT),
            gr.Markdown(value=EVALUATION_QUEUE_TEXT),
            gr.Textbox(label=CITATION_BUTTON_LABEL),
            gr.Button(value=SELECT_ALL_BUTTON_LABEL),
            gr.Button(value=SELECT_NONE_BUTTON_LABEL),
            gr.Button(value=SELECT_AVG_ONLY_BUTTON_LABEL),
            gr.Accordion(label=CITATION_ACCORDION_LABEL),
        )


with gr.Blocks(css_paths="style.css", theme=gr.themes.Glass()) as demo:
    gr.HTML(TITLE)
    introduction_text = gr.Markdown(INTRODUCTION_TEXT_JA, elem_classes="markdown-text")

    with gr.Tabs() as tabs:
        with gr.Tab("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table"):
            demo_leaderboard.render()

        with gr.Tab("📝 About", elem_id="llm-benchmark-tab-about"):
            llm_benchmarks_text = gr.Markdown(LLM_BENCHMARKS_TEXT_JA, elem_classes="markdown-text")

        with gr.Tab("🚀 Submit here! ", elem_id="llm-benchmark-tab-submit"):
            demo_submission.render()

    with gr.Row():
        with gr.Accordion(CITATION_ACCORDION_LABEL_JA, open=False) as citation_accordion:
            citation_button = gr.Textbox(
                label=CITATION_BUTTON_LABEL_JA,
                value=CITATION_BUTTON_TEXT,
                lines=20,
                elem_id="citation-button",
                show_copy_button=True,
            )
    gr.HTML(BOTTOM_LOGO)

    language = gr.Radio(
        choices=["🇯🇵 JA", "🇺🇸 EN"],
        value="🇯🇵 JA",
        elem_classes="language-selector",
        show_label=False,
        container=False,
    )

    demo.load(fn=set_default_language, outputs=language)
    language.change(
        fn=update_language,
        inputs=language,
        outputs=[
            introduction_text,
            llm_benchmarks_text,
            evaluation_queue_text,
            citation_button,
            select_all_button,
            select_none_button,
            select_avg_only_button,
            citation_accordion,
        ],
        api_name=False,
    )

if __name__ == "__main__":
    if os.getenv("SPACE_ID"):
        scheduler = BackgroundScheduler()
        scheduler.add_job(restart_space, "interval", seconds=1800)
        scheduler.start()
    demo.queue(default_concurrency_limit=40).launch()
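The search bar in app.py unions per-term matches and then dedupes on the row id. A standalone sketch of those semantics on a toy frame (here `name` and `row_id` are placeholders for `AutoEvalColumn.dummy.name` and `AutoEvalColumn.row_id.name`, which live in src/display/utils.py):

```python
import pandas as pd

# Toy stand-in for ORIGINAL_DF; "name"/"row_id" are placeholders for
# AutoEvalColumn.dummy.name / AutoEvalColumn.row_id.name.
df = pd.DataFrame(
    {
        "row_id": [0, 1, 2],
        "name": ["llm-jp/llm-jp-3-13b", "meta-llama/Llama-3.1-8B", "Qwen/Qwen2.5-7B"],
    }
)


def search(df: pd.DataFrame, search_text: str) -> pd.DataFrame:
    """Split on ';', match each term case-insensitively, union, dedupe."""
    if not search_text:
        return df
    names = [n.strip() for n in search_text.split(";")]
    hits = [df[df["name"].str.contains(n, case=False)] for n in names if n]
    return pd.concat(hits).drop_duplicates(subset="row_id")


print(search(df, "llama; qwen"))  # rows 1 and 2, each exactly once
```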
pyproject.toml
ADDED
[project]
name = "open-japanese-llm-leaderboard"
version = "0.1.0"
description = ""
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
    "apscheduler>=3.10.4",
    "datasets>=3.2.0",
    "gradio>=5.9.1",
    "hf-transfer>=0.1.8",
    "plotly>=5.24.1",
    "torch>=2.5.1",
    "transformers>=4.47.1",
]

[tool.ruff]
line-length = 119

[tool.ruff.lint]
select = [
    "ARG",  # Check function argument usage
    "B",    # Common bugs and design problems (from flake8-bugbear)
    "C",    # Complexity checks (from mccabe)
    "E",    # PEP 8 errors (from pycodestyle)
    "F",    # Pyflakes errors (basic Python errors)
    "I",    # Import sorting and formatting
    "N",    # Naming conventions (from pep8-naming)
    "PL",   # Pylint rules
    "S101", # Use of assert statements (from flake8-bandit)
    "SIM",  # Code simplification suggestions
    "UP",   # Python upgrade suggestions
    "W",    # PEP 8 warnings (from pycodestyle)
]
ignore = [
    "E501",   # Line too long (> 79 characters)
    "SIM117", # Use a single 'with' statement with multiple contexts instead of nested 'with' statements
]

[tool.ruff.format]
docstring-code-format = true
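requirements.txt (next) locks these floor pins exactly via uv. A quick way to read back the declared dependencies, sketched here assuming Python 3.11+ for the stdlib `tomllib` (the Space itself pins 3.10.15, where the `tomli` backport would be needed):

```python
# Sketch: print the dependency floors declared in [project].dependencies.
# Assumes Python 3.11+ (tomllib); on 3.10 use `import tomli as tomllib`.
import tomllib

with open("pyproject.toml", "rb") as f:
    pyproject = tomllib.load(f)

for dep in pyproject["project"]["dependencies"]:
    print(dep)
```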
requirements.txt
ADDED
# This file was autogenerated by uv via the following command:
#    uv pip compile pyproject.toml -o requirements.txt
aiofiles==23.2.1
    # via gradio
aiohappyeyeballs==2.4.4
    # via aiohttp
aiohttp==3.11.11
    # via
    #   datasets
    #   fsspec
aiosignal==1.3.2
    # via aiohttp
annotated-types==0.7.0
    # via pydantic
anyio==4.7.0
    # via
    #   gradio
    #   httpx
    #   starlette
apscheduler==3.11.0
    # via open-japanese-llm-leaderboard (pyproject.toml)
async-timeout==5.0.1
    # via aiohttp
attrs==24.3.0
    # via aiohttp
certifi==2024.12.14
    # via
    #   httpcore
    #   httpx
    #   requests
charset-normalizer==3.4.0
    # via requests
click==8.1.8
    # via
    #   typer
    #   uvicorn
datasets==3.2.0
    # via open-japanese-llm-leaderboard (pyproject.toml)
dill==0.3.8
    # via
    #   datasets
    #   multiprocess
exceptiongroup==1.2.2
    # via anyio
fastapi==0.115.6
    # via gradio
ffmpy==0.5.0
    # via gradio
filelock==3.16.1
    # via
    #   datasets
    #   huggingface-hub
    #   torch
    #   transformers
    #   triton
frozenlist==1.5.0
    # via
    #   aiohttp
    #   aiosignal
fsspec==2024.9.0
    # via
    #   datasets
    #   gradio-client
    #   huggingface-hub
    #   torch
gradio==5.9.1
    # via open-japanese-llm-leaderboard (pyproject.toml)
gradio-client==1.5.2
    # via gradio
h11==0.14.0
    # via
    #   httpcore
    #   uvicorn
hf-transfer==0.1.8
    # via open-japanese-llm-leaderboard (pyproject.toml)
httpcore==1.0.7
    # via httpx
httpx==0.28.1
    # via
    #   gradio
    #   gradio-client
    #   safehttpx
huggingface-hub==0.27.0
    # via
    #   datasets
    #   gradio
    #   gradio-client
    #   tokenizers
    #   transformers
idna==3.10
    # via
    #   anyio
    #   httpx
    #   requests
    #   yarl
jinja2==3.1.5
    # via
    #   gradio
    #   torch
markdown-it-py==3.0.0
    # via rich
markupsafe==2.1.5
    # via
    #   gradio
    #   jinja2
mdurl==0.1.2
    # via markdown-it-py
mpmath==1.3.0
    # via sympy
multidict==6.1.0
    # via
    #   aiohttp
    #   yarl
multiprocess==0.70.16
    # via datasets
networkx==3.4.2
    # via torch
numpy==2.2.1
    # via
    #   datasets
    #   gradio
    #   pandas
    #   transformers
nvidia-cublas-cu12==12.4.5.8
    # via
    #   nvidia-cudnn-cu12
    #   nvidia-cusolver-cu12
    #   torch
nvidia-cuda-cupti-cu12==12.4.127
    # via torch
nvidia-cuda-nvrtc-cu12==12.4.127
    # via torch
nvidia-cuda-runtime-cu12==12.4.127
    # via torch
nvidia-cudnn-cu12==9.1.0.70
    # via torch
nvidia-cufft-cu12==11.2.1.3
    # via torch
nvidia-curand-cu12==10.3.5.147
    # via torch
nvidia-cusolver-cu12==11.6.1.9
    # via torch
nvidia-cusparse-cu12==12.3.1.170
    # via
    #   nvidia-cusolver-cu12
    #   torch
nvidia-nccl-cu12==2.21.5
    # via torch
nvidia-nvjitlink-cu12==12.4.127
    # via
    #   nvidia-cusolver-cu12
    #   nvidia-cusparse-cu12
    #   torch
nvidia-nvtx-cu12==12.4.127
    # via torch
orjson==3.10.12
    # via gradio
packaging==24.2
    # via
    #   datasets
    #   gradio
    #   gradio-client
    #   huggingface-hub
    #   plotly
    #   transformers
pandas==2.2.3
    # via
    #   datasets
    #   gradio
pillow==11.0.0
    # via gradio
plotly==5.24.1
    # via open-japanese-llm-leaderboard (pyproject.toml)
propcache==0.2.1
    # via
    #   aiohttp
    #   yarl
pyarrow==18.1.0
    # via datasets
pydantic==2.10.4
    # via
    #   fastapi
    #   gradio
pydantic-core==2.27.2
    # via pydantic
pydub==0.25.1
    # via gradio
pygments==2.18.0
    # via rich
python-dateutil==2.9.0.post0
    # via pandas
python-multipart==0.0.20
    # via gradio
pytz==2024.2
    # via pandas
pyyaml==6.0.2
    # via
    #   datasets
    #   gradio
    #   huggingface-hub
    #   transformers
regex==2024.11.6
    # via transformers
requests==2.32.3
    # via
    #   datasets
    #   huggingface-hub
    #   transformers
rich==13.9.4
    # via typer
ruff==0.8.4
    # via gradio
safehttpx==0.1.6
    # via gradio
safetensors==0.4.5
    # via transformers
semantic-version==2.10.0
    # via gradio
shellingham==1.5.4
    # via typer
six==1.17.0
    # via python-dateutil
sniffio==1.3.1
    # via anyio
starlette==0.41.3
    # via
    #   fastapi
    #   gradio
sympy==1.13.1
    # via torch
tenacity==9.0.0
    # via plotly
tokenizers==0.21.0
    # via transformers
tomlkit==0.13.2
    # via gradio
torch==2.5.1
    # via open-japanese-llm-leaderboard (pyproject.toml)
tqdm==4.67.1
    # via
    #   datasets
    #   huggingface-hub
    #   transformers
transformers==4.47.1
    # via open-japanese-llm-leaderboard (pyproject.toml)
triton==3.1.0
    # via torch
typer==0.15.1
    # via gradio
typing-extensions==4.12.2
    # via
    #   anyio
    #   fastapi
    #   gradio
    #   gradio-client
    #   huggingface-hub
    #   multidict
    #   pydantic
    #   pydantic-core
    #   rich
    #   torch
    #   typer
    #   uvicorn
tzdata==2024.2
    # via pandas
tzlocal==5.2
    # via apscheduler
urllib3==2.3.0
    # via requests
uvicorn==0.34.0
    # via gradio
websockets==14.1
    # via gradio-client
xxhash==3.5.0
    # via datasets
yarl==1.18.3
    # via aiohttp
src/Logos-HQ/B-Test-1-D-Top-Logo.png
ADDED (binary, stored with Git LFS)
src/Logos-HQ/B-Test-2-Bottom-Logo-B.png
ADDED (binary, stored with Git LFS)
src/Logos-HQ/HuggingFace-Logo-Oct-2024.png
ADDED (binary, stored with Git LFS)
src/Logos-HQ/LLM-jp-Logo-Oct-2024.png
ADDED
src/Logos-HQ/MDX-Logo-Oct-2024.jpg
ADDED (binary, stored with Git LFS)
src/about.py
ADDED
from dataclasses import dataclass
from enum import Enum


class TaskType(Enum):
    AVG = "Average - 平均"
    NLI = "NLI - 自然言語推論"
    QA = "QA - 質問応答"
    RC = "RC - 読解力"
    CR = "CR - コモンセンス推論"
    EL = "EL - エンティティリンキング"
    FA = "FA - 基礎分析"
    MR = "MR - 数学的推論"
    MT = "MT - 機械翻訳"
    STS = "STS - 意味的類似度"
    HE_EN = "HE-EN - 英語試験問題"
    HE_JA = "HE-JA - 日本語試験問題"
    CG = "CG - コード生成"
    SUM = "SUM - 要約"
    BBH = "BBH - Big-Bench Hard"
    IF = "IF - 指示追従"
    NotTask = "?"


@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str
    task_type: TaskType
    average: bool = False


# Select your tasks here
# ---------------------------------------------------
class Tasks(Enum):
    AVG = Task("scores", "AVG", "AVG", TaskType.AVG, True)
    NLI = Task("scores", "NLI", "AVG (NLI)", TaskType.NLI, True)  # Natural Language Inference - 自然言語推論
    QA = Task("scores", "QA", "AVG (QA)", TaskType.QA, True)  # Question Answering - 質問応答
    RC = Task("scores", "RC", "AVG (RC)", TaskType.RC, True)  # Reading Comprehension - 文章読解
    EL = Task("scores", "EL", "AVG (EL)", TaskType.EL, True)  # Entity Linking - エンティティリンキング
    FA = Task("scores", "FA", "AVG (FA)", TaskType.FA, True)  # Fundamental Analysis - 基礎解析
    MR = Task("scores", "MR", "AVG (MR)", TaskType.MR, True)  # Mathematical Reasoning - 数学的推論
    MT = Task("scores", "MT", "AVG (MT)", TaskType.MT, True)  # Machine Translation - 機械翻訳
    HE_EN = Task("scores", "HE-EN", "AVG (HE-EN)", TaskType.HE_EN, True)  # Human Examination - English
    HE_JA = Task("scores", "HE-JA", "AVG (HE-JA)", TaskType.HE_JA, True)  # Human Examination - Japanese
    CG = Task("scores", "CG", "AVG (CG)", TaskType.CG, True)  # Code Generation - コード生成
    SUM = Task("scores", "SUM", "AVG (SUM)", TaskType.SUM, True)  # Summarization - 要約
    BBH = Task("scores", "BBH", "AVG (BBH)", TaskType.BBH, True)  # Big-Bench Hard
    CR = Task("scores", "CR", "AVG (CR)", TaskType.CR, True)  # Commonsense Reasoning
    IF = Task("scores", "IF", "AVG (IF)", TaskType.IF, True)  # Instruction Following
    alt_e_to_j_bert_score_ja_f1 = Task("scores", "alt-e-to-j_bert_score_ja_f1", "ALT E to J BERT Score", TaskType.MT)
    alt_e_to_j_bleu_ja = Task("scores", "alt-e-to-j_bleu_ja", "ALT E to J BLEU", TaskType.MT)
    alt_e_to_j_comet_wmt22 = Task("scores", "alt-e-to-j_comet_wmt22", "ALT E to J COMET WMT22 ⭐", TaskType.MT)
    alt_j_to_e_bert_score_en_f1 = Task("scores", "alt-j-to-e_bert_score_en_f1", "ALT J to E BERT Score", TaskType.MT)
    alt_j_to_e_bleu_en = Task("scores", "alt-j-to-e_bleu_en", "ALT J to E BLEU", TaskType.MT)
    alt_j_to_e_comet_wmt22 = Task("scores", "alt-j-to-e_comet_wmt22", "ALT J to E COMET WMT22 ⭐", TaskType.MT)
    chabsa_set_f1 = Task("scores", "chabsa_set_f1", "ChABSA ⭐", TaskType.EL)
    commonsensemoralja_exact_match = Task(
        "scores", "commonsensemoralja_exact_match", "CommonSenseMoralJA ⭐", TaskType.CR
    )
    jamp_exact_match = Task("scores", "jamp_exact_match", "JAMP ⭐", TaskType.NLI)
    janli_exact_match = Task("scores", "janli_exact_match", "JANLI ⭐", TaskType.NLI)
    jcommonsenseqa_exact_match = Task("scores", "jcommonsenseqa_exact_match", "JCommonSenseQA ⭐", TaskType.CR)
    jemhopqa_char_f1 = Task("scores", "jemhopqa_char_f1", "JEMHopQA ⭐", TaskType.QA)
    jmmlu_exact_match = Task("scores", "jmmlu_exact_match", "JMMLU ⭐", TaskType.HE_JA)
    jnli_exact_match = Task("scores", "jnli_exact_match", "JNLI ⭐", TaskType.NLI)
    jsem_exact_match = Task("scores", "jsem_exact_match", "JSEM ⭐", TaskType.NLI)
    jsick_exact_match = Task("scores", "jsick_exact_match", "JSICK ⭐", TaskType.NLI)
    jsquad_char_f1 = Task("scores", "jsquad_char_f1", "JSquad ⭐", TaskType.RC)
    jsts_pearson = Task(
        "scores", "jsts_pearson", "JSTS (Pearson)", TaskType.STS
    )  # Semantic Textual Similarity - 意味的類似度
    jsts_spearman = Task(
        "scores", "jsts_spearman", "JSTS (Spearman)", TaskType.STS
    )  # Semantic Textual Similarity - 意味的類似度
    kuci_exact_match = Task("scores", "kuci_exact_match", "KUCI ⭐", TaskType.CR)
    mawps_exact_match = Task("scores", "mawps_exact_match", "MAWPS ⭐", TaskType.MR)
    mbpp_code_exec = Task("scores", "mbpp_code_exec", "MBPP (exec) (0 shots only) ⭐", TaskType.CG)
    mbpp_pylint_check = Task("scores", "mbpp_pylint_check", "MBPP (pylint) (0 shots only)", TaskType.CG)
    mmlu_en_exact_match = Task("scores", "mmlu_en_exact_match", "MMLU ⭐", TaskType.HE_EN)
    niilc_char_f1 = Task("scores", "niilc_char_f1", "NIILC ⭐", TaskType.QA)
    aio_char_f1 = Task("scores", "aio_char_f1", "JAQKET ⭐", TaskType.QA)
    wiki_coreference_set_f1 = Task("scores", "wiki_coreference_set_f1", "Wiki Coreference ⭐", TaskType.FA)
    wiki_dependency_set_f1 = Task("scores", "wiki_dependency_set_f1", "Wiki Dependency ⭐", TaskType.FA)
    wiki_ner_set_f1 = Task("scores", "wiki_ner_set_f1", "Wiki NER ⭐", TaskType.FA)
    wiki_pas_set_f1 = Task("scores", "wiki_pas_set_f1", "Wiki PAS ⭐", TaskType.FA)
    wiki_reading_char_f1 = Task("scores", "wiki_reading_char_f1", "Wiki Reading ⭐", TaskType.FA)
    wikicorpus_e_to_j_bert_score_ja_f1 = Task(
        "scores", "wikicorpus-e-to-j_bert_score_ja_f1", "WikiCorpus E to J BERT Score", TaskType.MT
    )
    wikicorpus_e_to_j_bleu_ja = Task("scores", "wikicorpus-e-to-j_bleu_ja", "WikiCorpus E to J BLEU", TaskType.MT)
    wikicorpus_e_to_j_comet_wmt22 = Task(
        "scores", "wikicorpus-e-to-j_comet_wmt22", "WikiCorpus E to J COMET WMT22 ⭐", TaskType.MT
    )
    wikicorpus_j_to_e_bert_score_en_f1 = Task(
        "scores", "wikicorpus-j-to-e_bert_score_en_f1", "WikiCorpus J to E BERT Score", TaskType.MT
    )
    wikicorpus_j_to_e_bleu_en = Task("scores", "wikicorpus-j-to-e_bleu_en", "WikiCorpus J to E BLEU", TaskType.MT)
    wikicorpus_j_to_e_comet_wmt22 = Task(
        "scores", "wikicorpus-j-to-e_comet_wmt22", "WikiCorpus J to E COMET WMT22 ⭐", TaskType.MT
    )
    xlsum_ja_bert_score_ja_f1 = Task(
        "scores", "xlsum_ja_bert_score_ja_f1", "XL-Sum JA BERT Score (0 shots only)", TaskType.SUM
    )
    xlsum_ja_bleu_ja = Task("scores", "xlsum_ja_bleu_ja", "XL-Sum JA BLEU (0 shots only)", TaskType.SUM)
    xlsum_ja_rouge1 = Task("scores", "xlsum_ja_rouge1", "XL-Sum ROUGE1 (0 shots only)", TaskType.SUM)
    xlsum_ja_rouge2 = Task("scores", "xlsum_ja_rouge2", "XL-Sum ROUGE2 (0 shots only) ⭐", TaskType.SUM)
    # xlsum_ja_rouge2_scaling = Task("scores", "xlsum_ja_rouge2_scaling", "XL-Sum JA ROUGE2 Scaling")
    xlsum_ja_rougeLsum = Task("scores", "xlsum_ja_rougeLsum", "XL-Sum ROUGE-Lsum (0 shots only)", TaskType.SUM)
    # New tasks for v2.0.0
    aime2024_mathematical_equivalence = Task("scores", "aime2024_mathematical_equivalence", "AIME 2024 ⭐", TaskType.MR)
    aime2025_mathematical_equivalence = Task("scores", "aime2025_mathematical_equivalence", "AIME 2025 ⭐", TaskType.MR)
    bigbenchhard_direct_exact_match = Task("scores", "bigbenchhard_direct_exact_match", "BBH Direct ⭐", TaskType.BBH)
    bigbenchhard_cot_exact_match = Task("scores", "bigbenchhard_cot_exact_match", "BBH CoT ⭐", TaskType.BBH)
    bigbenchhard_ja_direct_exact_match = Task("scores", "bigbenchhard_ja_direct_exact_match", "BBH JA Direct ⭐", TaskType.BBH)
    bigbenchhard_ja_cot_exact_match = Task("scores", "bigbenchhard_ja_cot_exact_match", "BBH JA CoT ⭐", TaskType.BBH)
    drop_drop_f1 = Task("scores", "drop_drop_f1", "DROP ⭐", TaskType.QA)
    gsm8k_mathematical_equivalence = Task("scores", "gsm8k_mathematical_equivalence", "GSM8K ⭐", TaskType.MR)
    gpqa_diamond_en_exact_match = Task("scores", "gpqa_diamond_en_exact_match", "GPQA Diamond EN ⭐", TaskType.HE_EN)
    gpqa_extended_en_exact_match = Task("scores", "gpqa_extended_en_exact_match", "GPQA Extended EN ⭐", TaskType.HE_EN)
    gpqa_main_en_exact_match = Task("scores", "gpqa_main_en_exact_match", "GPQA Main EN ⭐", TaskType.HE_EN)
    gpqa_diamond_ja_exact_match = Task("scores", "gpqa_diamond_ja_exact_match", "GPQA Diamond JA ⭐", TaskType.HE_JA)
    gpqa_extended_ja_exact_match = Task("scores", "gpqa_extended_ja_exact_match", "GPQA Extended JA ⭐", TaskType.HE_JA)
    gpqa_main_ja_exact_match = Task("scores", "gpqa_main_ja_exact_match", "GPQA Main JA ⭐", TaskType.HE_JA)
    jamc_qa_exact_match = Task("scores", "jamc-qa_exact_match", "JAMC-QA ⭐", TaskType.QA)
    jhumaneval_code_exec = Task("scores", "jhumaneval_code_exec", "JHumanEval ⭐", TaskType.CG)
    mgsm_mathematical_equivalence = Task("scores", "mgsm_mathematical_equivalence", "MGSM ⭐", TaskType.MR)
    mmlu_prox_ja_exact_match = Task("scores", "mmlu_prox_ja_exact_match", "MMLU Prox JA ⭐", TaskType.HE_JA)
    mmlu_prox_en_exact_match = Task("scores", "mmlu_prox_en_exact_match", "MMLU Prox EN ⭐", TaskType.HE_EN)
    mif_eval_ja_mifeval_strict = Task("scores", "mif_eval_ja_mifeval_strict", "MIF Eval JA ⭐", TaskType.IF)
    mif_eval_en_mifeval_strict = Task("scores", "mif_eval_en_mifeval_strict", "MIF Eval EN ⭐", TaskType.IF)
    mmmlu_exact_match = Task("scores", "mmmlu_exact_match", "MMMLU ⭐", TaskType.HE_JA)
    openbookqa_exact_match = Task("scores", "openbookqa_exact_match", "OpenBookQA ⭐", TaskType.HE_EN)
|
| 135 |
+
polymath_en_polymath_weighted_accuracy = Task("scores", "polymath-en_polymath_weighted_accuracy", "Polymath EN ⭐", TaskType.MR)
|
| 136 |
+
polymath_ja_polymath_weighted_accuracy = Task("scores", "polymath-ja_polymath_weighted_accuracy", "Polymath JA ⭐", TaskType.MR)
|
| 137 |
+
triviaqa_triviaqa_f1 = Task("scores", "triviaqa_triviaqa_f1", "TriviaQA ⭐", TaskType.QA)
|
| 138 |
+
winogrande_xl_exact_match = Task("scores", "winogrande_xl_exact_match", "WinoGrande XL ⭐", TaskType.CR)
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
NUM_FEWSHOT = 0 # Change with your few shot
|
| 142 |
+
# ---------------------------------------------------
|
| 143 |
+
|
| 144 |
+
# Your leaderboard name
|
| 145 |
+
TITLE = """<h1 align="center" id="space-title">🇯🇵 Open Japanese LLM Leaderboard 🌸<br>オープン日本語LLMリーダーボード</h1>"""
|
| 146 |
+
|
| 147 |
+
# What does your leaderboard evaluate?
|
| 148 |
+
INTRODUCTION_TEXT = """
|
| 149 |
+
The __Open Japanese LLM Leaderboard__ by __[LLM-jp](https://llm-jp.nii.ac.jp/en/)__ evaluates
|
| 150 |
+
the performance of Japanese Large Language Models (LLMs) across 12 categories covering more than 50 tasks from
|
| 151 |
+
classical to modern NLP tasks. The __Open Japanese LLM Leaderboard__ was built by open-source
|
| 152 |
+
contributors of __[LLM-jp](https://llm-jp.nii.ac.jp/en/)__, a cross-organizational project
|
| 153 |
+
for the research and development of Japanese LLMs supported by the _National Institute of
|
| 154 |
+
Informatics_ in Tokyo, Japan.
|
| 155 |
+
|
| 156 |
+
On the __"LLM Benchmark"__ page, the question mark **"?"** refers to the parameters that
|
| 157 |
+
are unknown in the model card on Hugging Face. For more information about datasets,
|
| 158 |
+
please consult the __"About"__ page or refer to the website of
|
| 159 |
+
__[LLM-jp](https://llm-jp.nii.ac.jp/en/)__. And on the __"Submit here!"__ page, you can
|
| 160 |
+
evaluate the performance of your model, and be part of the leaderboard.
|
| 161 |
+
"""
|
| 162 |
+
INTRODUCTION_TEXT_JA = """\
|
| 163 |
+
__[LLM-jp](https://llm-jp.nii.ac.jp/)__ による __オープン日本語LLMリーダーボード__ は、\
|
| 164 |
+
古典的なものから最新のものまで12のカテゴリに渡る50種類以上のNLPタスクを用いて日本語大規模言語モデル(LLM)の\
|
| 165 |
+
性能を評価します。__オープン日本語LLMリーダーボード__ は、日本の国立情報学研究所を中心に\
|
| 166 |
+
日本語LLMの研究開発を行う組織横断プロジェクト __[LLM-jp](https://llm-jp.nii.ac.jp/)__ \
|
| 167 |
+
のオープンソース貢献者によって構築されました。
|
| 168 |
+
|
| 169 |
+
__「LLM Benchmark」__ ページでは、疑問符 **「?」** はHugging Faceのモデルカードで不明な\
|
| 170 |
+
パラメータを示しています。データセットに関する詳細情報については、__「About」__ ページを\
|
| 171 |
+
参照するか、__[LLM-jp](https://llm-jp.nii.ac.jp/)__ のウェブサイトをご覧ください。\
|
| 172 |
+
また、__「Submit here!」__ ページでは、あなたのモデルの性能を評価し、リーダーボードに\
|
| 173 |
+
参加することができます。
|
| 174 |
+
"""
|
| 175 |
+
|
| 176 |
+
# Which evaluations are you running? how can people reproduce what you have?
|
| 177 |
+
LLM_BENCHMARKS_TEXT = """
|
| 178 |
+
## How it works
|
| 179 |
+
📈 We evaluate Japanese Large Language Models across 12 categories covering more than 50 tasks leveraging our evaluation tool [llm-jp-eval](https://github.com/llm-jp/llm-jp-eval), a unified framework to evaluate Japanese LLMs on various evaluation tasks.
|
| 180 |
+
|
| 181 |
+
**NLI (Natural Language Inference)**
|
| 182 |
+
|
| 183 |
+
* `Jamp`, a Japanese NLI benchmark focused on temporal inference [Source](https://github.com/tomo-ut/temporalNLI_dataset) (License CC BY-SA 4.0)
|
| 184 |
+
|
| 185 |
+
* `JaNLI`, Japanese Adversarial Natural Language Inference [Source](https://github.com/verypluming/JaNLI) (License CC BY-SA 4.0)
|
| 186 |
+
|
| 187 |
+
* `JNLI`, Japanese Natural Language Inference (part of JGLUE) [Source](https://github.com/yahoojapan/JGLUE) (License CC BY-SA 4.0)
|
| 188 |
+
|
| 189 |
+
* `JSeM`, Japanese semantic test suite [Source](https://github.com/DaisukeBekki/JSeM) (License BSD 3-Clause)
|
| 190 |
+
|
| 191 |
+
* `JSICK`, Japanese Sentences Involving Compositional Knowledge [Source](https://github.com/verypluming/JSICK) (License CC BY-SA 4.0)
|
| 192 |
+
|
| 193 |
+
**QA (Question Answering)**
|
| 194 |
+
|
| 195 |
+
* `JEMHopQA`, Japanese Explainable Multi-hop Question Answering [Source](https://github.com/aiishii/JEMHopQA) (License CC BY-SA 4.0)
|
| 196 |
+
|
| 197 |
+
* `NIILC`, NIILC Question Answering Dataset [Source](https://github.com/mynlp/niilc-qa) (License CC BY-SA 4.0)
|
| 198 |
+
|
| 199 |
+
* `JAQKET`, Japanese QA dataset on the subject of quizzes [Source](https://www.nlp.ecei.tohoku.ac.jp/projects/jaqket/) (License CC BY-SA 4.0 - Other licenses are required for corporate usage)
|
| 200 |
+
|
| 201 |
+
* `TriviaQA`, Reading Comprehension Challenge Dataset [Source](https://nlp.cs.washington.edu/triviaqa/) (License Apache-2.0)
|
| 202 |
+
|
| 203 |
+
* `DROP`, Discrete Reasoning Over Paragraphs [Source](https://allennlp.org/drop) (License CC BY-SA 4.0)
|
| 204 |
+
|
| 205 |
+
* `JAMC-QA`, Japanese Advanced Medical Comprehension Question Answering [Source](https://huggingface.co/datasets/llm-jp/jamc-qa) (License CC BY-SA 4.0)
|
| 206 |
+
|
| 207 |
+
**RC (Reading Comprehension)**
|
| 208 |
+
|
| 209 |
+
* `JSQuAD`, Japanese version of SQuAD (part of JGLUE) [Source](https://github.com/yahoojapan/JGLUE) (License CC BY-SA 4.0)
|
| 210 |
+
|
| 211 |
+
**CR (Commonsense Reasoning)**
|
| 212 |
+
|
| 213 |
+
* `JCommonsenseMorality`, Japanese dataset for evaluating commonsense morality understanding [Source](https://github.com/Language-Media-Lab/commonsense-moral-ja) (License MIT License)
|
| 214 |
+
|
| 215 |
+
* `JCommonsenseQA`, Japanese version of CommonsenseQA [Source](https://github.com/yahoojapan/JGLUE) (License CC BY-SA 4.0)
|
| 216 |
+
|
| 217 |
+
* `KUCI`, Kyoto University Commonsense Inference dataset [Source](https://github.com/ku-nlp/KUCI (License CC BY-SA 4.0)
|
| 218 |
+
|
| 219 |
+
* `WinoGrande`, Winogrande Pronoun Disambiguation [Source](https://huggingface.co/datasets/winogrande) (License Apache-2.0)
|
| 220 |
+
|
| 221 |
+
**EL (Entity Linking)**
|
| 222 |
+
|
| 223 |
+
* `chABSA`, Aspect-Based Sentiment Analysis dataset [Source](https://github.com/chakki-works/chABSA-dataset) (License CC BY-SA 4.0)
|
| 224 |
+
|
| 225 |
+
**FA (Fundamental Analysis)**
|
| 226 |
+
|
| 227 |
+
* `Wikipedia Annotated Corpus`, [Source](https://github.com/ku-nlp/WikipediaAnnotatedCorpus) (License CC BY-SA 4.0)
|
| 228 |
+
|
| 229 |
+
List of tasks: (Reading Prediction, Named-entity recognition (NER), Dependency Parsing, Predicate-argument structure analysis (PAS), Coreference Resolution)
|
| 230 |
+
|
| 231 |
+
**MR (Mathematical Reasoning)**
|
| 232 |
+
|
| 233 |
+
* `MAWPS`, Japanese version of MAWPS (A Math Word Problem Repository) [Source](https://github.com/nlp-waseda/chain-of-thought-ja-dataset) (License Apache-2.0)
|
| 234 |
+
|
| 235 |
+
* `MGSM`, Japanese part of MGSM (Multilingual Grade School Math Benchmark) [Source](https://huggingface.co/datasets/juletxara/mgsm) (License MIT License)
|
| 236 |
+
|
| 237 |
+
* `GSM8K`, Grade School Math 8K [Source](https://github.com/openai/grade-school-math) (License MIT License)
|
| 238 |
+
|
| 239 |
+
* `AIME`, American Invitational Mathematics Examination [Source](https://artofproblemsolving.com/wiki/index.php/AIME_Problems_and_Solutions) (License Public Domain)
|
| 240 |
+
|
| 241 |
+
* `Polymath`, Multilevel Multimodal Mathematical Reasoning [Source](https://arxiv.org/abs/2407.21046) (License MIT License)
|
| 242 |
+
|
| 243 |
+
**MT (Machine Translation)**
|
| 244 |
+
|
| 245 |
+
* `ALT`, Asian Language Treebank (ALT) - Parallel Corpus [Source](https://www2.nict.go.jp/astrec-att/member/mutiyama/ALT/index.html) (License CC BY-SA 4.0)
|
| 246 |
+
|
| 247 |
+
* `WikiCorpus`, Japanese-English Bilingual Corpus of Wikipedia's articles about the city of Kyoto [Source](https://alaginrc.nict.go.jp/WikiCorpus/) (License CC BY-SA 3.0)
|
| 248 |
+
|
| 249 |
+
**STS (Semantic Textual Similarity)**
|
| 250 |
+
|
| 251 |
+
This task is supported by llm-jp-eval, but it is not included in the evaluation score average.
|
| 252 |
+
|
| 253 |
+
* `JSTS`, Japanese version of the STS (Semantic Textual Similarity) (part of JGLUE) [Source](https://github.com/yahoojapan/JGLUE) (License CC BY-SA 4.0)
|
| 254 |
+
|
| 255 |
+
**HE-EN (Human Examination - English)**
|
| 256 |
+
|
| 257 |
+
* `MMLU`, Measuring Massive Multitask Language Understanding [Source](https://github.com/hendrycks/test) (License MIT License)
|
| 258 |
+
|
| 259 |
+
* `GPQA`, Graduate-Level Google-Proof Q&A Benchmark [Source](https://github.com/idavidrein/gpqa) (License MIT License)
|
| 260 |
+
|
| 261 |
+
* `OpenBookQA`, Open Book Question Answering [Source](https://allenai.org/data/open-book-qa) (License Apache-2.0)
|
| 262 |
+
|
| 263 |
+
**HE-JA (Human Examination - Japanese)**
|
| 264 |
+
|
| 265 |
+
* `JMMLU`, Japanese Massive Multitask Language Understanding Benchmark [Source](https://github.com/nlp-waseda/JMMLU) (License CC BY-SA 4.0 (3 tasks under the CC BY-NC-ND 4.0 license)
|
| 266 |
+
|
| 267 |
+
* `MMMLU`, Japanese version of MMLU [Source](https://huggingface.co/datasets/pfnet/mmmlu) (License MIT License)
|
| 268 |
+
|
| 269 |
+
* `GPQA (JA)`, Japanese translation of GPQA [Source](https://github.com/idavidrein/gpqa) (License MIT License)
|
| 270 |
+
|
| 271 |
+
**CG (Code Generation)**
|
| 272 |
+
|
| 273 |
+
* `MBPP`, Japanese version of Mostly Basic Python Problems (MBPP) [Source](https://huggingface.co/datasets/llm-jp/mbpp-ja) (License CC BY-SA 4.0)
|
| 274 |
+
|
| 275 |
+
* `JHumanEval`, Japanese version of HumanEval [Source](https://huggingface.co/datasets/kogi-jwu/jhumaneval) (License MIT License)
|
| 276 |
+
|
| 277 |
+
**BBH (BIG-Bench Hard)**
|
| 278 |
+
|
| 279 |
+
* `BigBenchHard`, Challenging BIG-Bench tasks with chain-of-thought evaluation [Source](https://github.com/suzgunmirac/BIG-Bench-Hard) (License MIT License)
|
| 280 |
+
|
| 281 |
+
**IF (Instruction Following)**
|
| 282 |
+
|
| 283 |
+
* `MIF-Eval`, Multilingual Instruction Following Evaluation [Source](https://huggingface.co/datasets/google/MIF-Eval) (License Apache-2.0)
|
| 284 |
+
|
| 285 |
+
**SUM (Summarization)**
|
| 286 |
+
|
| 287 |
+
* `XL-Sum`, XL-Sum: Large-Scale Multilingual Abstractive Summarization for 44 Languages [Source](https://github.com/csebuetnlp/xl-sum) (License CC BY-NC-SA 4.0, due to the non-commercial license, this dataset will not be used, unless you specifically agree to the license and terms of use)
|
| 288 |
+
|
| 289 |
+
|
| 290 |
+
## Reproducibility
|
| 291 |
+
To reproduce our results, please follow the instructions of the evalution tool, **llm-jp-eval** available in [Japanese](https://github.com/llm-jp/llm-jp-eval/blob/main/README.md) and in [English](https://github.com/llm-jp/llm-jp-eval/blob/main/README_en.md).
|
| 292 |
+
|
| 293 |
+
## Average Score Calculation
|
| 294 |
+
The calculation of the average score (AVG) includes only the scores of datasets marked with a ⭐.
|
| 295 |
+
|
| 296 |
+
"""
|
| 297 |
+
|
| 298 |
+
LLM_BENCHMARKS_TEXT_JA = """
|
| 299 |
+
## 仕組み
|
| 300 |
+
📈 評価ツール [llm-jp-eval](https://github.com/llm-jp/llm-jp-eval) を活用し、16種類のタスクで日本語の大規模言語モデルを評価します。このツールは、様々な評価タスクで日本語LLMを評価するための統一的なフレームワークです。
|
| 301 |
+
|
| 302 |
+
**NLI(自然言語推論)**
|
| 303 |
+
|
| 304 |
+
* `Jamp`、時間推論に焦点を当てた日本語NLIベンチマーク [ソース](https://github.com/tomo-ut/temporalNLI_dataset)(ライセンス CC BY-SA 4.0)
|
| 305 |
+
|
| 306 |
+
* `JaNLI`、日本語の敵対的推論データセット [ソース](https://github.com/verypluming/JaNLI)(ライセンス CC BY-SA 4.0)
|
| 307 |
+
|
| 308 |
+
* `JNLI`、日本語自然言語推論(JGLUEの一部)[ソース](https://github.com/yahoojapan/JGLUE)(ライセンス CC BY-SA 4.0)
|
| 309 |
+
|
| 310 |
+
* `JSeM`、日本語意味論テストセット [ソース](https://github.com/DaisukeBekki/JSeM)(ライセンス BSD 3-Clause)
|
| 311 |
+
|
| 312 |
+
* `JSICK`、構成的知識を含む日本語文データセット [ソース](https://github.com/verypluming/JSICK)(ライセン��� CC BY-SA 4.0)
|
| 313 |
+
|
| 314 |
+
**QA(質問応答)**
|
| 315 |
+
|
| 316 |
+
* `JEMHopQA`、日本語の説明可能なマルチホップ質問応答 [ソース](https://github.com/aiishii/JEMHopQA)(ライセンス CC BY-SA 4.0)
|
| 317 |
+
|
| 318 |
+
* `NIILC`、NIILC質問応答データセット [ソース](https://github.com/mynlp/niilc-qa)(ライセンス CC BY-SA 4.0)
|
| 319 |
+
|
| 320 |
+
* `JAQKET`、クイズを題材とした日本語QAデータセット [ソース](https://www.nlp.ecei.tohoku.ac.jp/projects/jaqket/)(ライセンス CC BY-SA 4.0 - 企業利用には別途ライセンスが必要)
|
| 321 |
+
|
| 322 |
+
**RC(読解)**
|
| 323 |
+
|
| 324 |
+
* `JSQuAD`、SQuADの日本語版(JGLUEの一部)[ソース](https://github.com/yahoojapan/JGLUE)(ライセンス CC BY-SA 4.0)
|
| 325 |
+
|
| 326 |
+
**MC(選択式質問応答)**
|
| 327 |
+
|
| 328 |
+
* `JCommonsenseMorality`、常識的な道徳理解を評価する日本語データセット [ソース](https://github.com/Language-Media-Lab/commonsense-moral-ja)(ライセンス MIT License)
|
| 329 |
+
|
| 330 |
+
* `JCommonsenseQA`、CommonsenseQAの日本語版 [ソース](https://github.com/yahoojapan/JGLUE)(ライセンス CC BY-SA 4.0)
|
| 331 |
+
|
| 332 |
+
* `KUCI`、京都大学常識推論データセット [ソース](https://github.com/ku-nlp/KUCI)(ライセンス CC BY-SA 4.0)
|
| 333 |
+
|
| 334 |
+
**EL(エンティティリンキング)**
|
| 335 |
+
|
| 336 |
+
* `chABSA`、アスペクトベースの感情分析データセット [ソース](https://github.com/chakki-works/chABSA-dataset)(ライセンス CC BY-SA 4.0)
|
| 337 |
+
|
| 338 |
+
**FA(基礎解析)**
|
| 339 |
+
|
| 340 |
+
* `Wikipedia Annotated Corpus`、[ソース](https://github.com/ku-nlp/WikipediaAnnotatedCorpus)(ライセンス CC BY-SA 4.0)
|
| 341 |
+
|
| 342 |
+
タスク一覧:(読解予測、固有表現認識(NER)、依存構造解析、述語項構造解析(PAS)、共参照解析)
|
| 343 |
+
|
| 344 |
+
**MR(数学的推論)**
|
| 345 |
+
|
| 346 |
+
* `MAWPS`、MAWPS(A Math Word Problem Repository)の日本語版 [ソース](https://github.com/nlp-waseda/chain-of-thought-ja-dataset)(ライセンス Apache-2.0)
|
| 347 |
+
|
| 348 |
+
* `MGSM`、MGSM(Multilingual Grade School Math Benchmark)の日本語部分 [ソース](https://huggingface.co/datasets/juletxara/mgsm)(ライセンス MIT License)
|
| 349 |
+
|
| 350 |
+
**MT(機械翻訳)**
|
| 351 |
+
|
| 352 |
+
* `ALT`、アジア言語ツリーバンク(ALT) - 並行コーパス [ソース](https://www2.nict.go.jp/astrec-att/member/mutiyama/ALT/index.html)(ライセンス CC BY-SA 4.0)
|
| 353 |
+
|
| 354 |
+
* `WikiCorpus`、京都市に関するWikipedia記事の日本語-英語対訳コーパス [ソース](https://alaginrc.nict.go.jp/WikiCorpus/)(ライセンス CC BY-SA 3.0)
|
| 355 |
+
|
| 356 |
+
**STS(意味的テキスト類似度)**
|
| 357 |
+
|
| 358 |
+
このタスクはllm-jp-evalでサポートされていますが、平均スコア (AVG) の計算には含まれていません。
|
| 359 |
+
|
| 360 |
+
* `JSTS`、STS(Semantic Textual Similarity)の日本語版(JGLUEの一部)[ソース](https://github.com/yahoojapan/JGLUE)(ライセンス CC BY-SA 4.0)
|
| 361 |
+
|
| 362 |
+
**HE(試験問題)**
|
| 363 |
+
|
| 364 |
+
* `MMLU`、大規模マルチタスク言語理解ベンチマーク(英語) [ソース](https://github.com/hendrycks/test)(ライセンス MIT License)
|
| 365 |
+
|
| 366 |
+
* `JMMLU`、日本語大規模マルチタスク言語理解ベンチマーク [ソース](https://github.com/nlp-waseda/JMMLU)(ライセンス CC BY-SA 4.0(3つのタスクはCC BY-NC-ND 4.0ライセンス)
|
| 367 |
+
|
| 368 |
+
**CG(コード生成)**
|
| 369 |
+
|
| 370 |
+
* `MBPP`、Mostly Basic Python Problems(MBPP)の日本語版 [ソース](https://huggingface.co/datasets/llm-jp/mbpp-ja)(ライセンス CC BY-SA 4.0)
|
| 371 |
+
|
| 372 |
+
**SUM(要約)**
|
| 373 |
+
|
| 374 |
+
* `XL-Sum`、44言語の大規模多言語抽象型要約データセットの日本語部分 [ソース](https://github.com/csebuetnlp/xl-sum)(ライセンス CC BY-NC-SA 4.0、非商用ライセンスのため、このデータセットは使用しません。ライセンスと利用規約に明確に同意した場合を除きます)
|
| 375 |
+
|
| 376 |
+
## 再現性
|
| 377 |
+
結果を再現するには、評価ツール **llm-jp-eval** の指示に従ってください。詳細は [日本語](https://github.com/llm-jp/llm-jp-eval/blob/main/README.md) と [英語](https://github.com/llm-jp/llm-jp-eval/blob/main/README_en.md) でご覧いただけます。
|
| 378 |
+
|
| 379 |
+
## 平均スコアの計算について
|
| 380 |
+
平均スコア (AVG) の計算には、⭐マークのついたスコアのみが含まれます
|
| 381 |
+
|
| 382 |
+
"""
|
| 383 |
+
|
| 384 |
+
|
| 385 |
+
EVALUATION_QUEUE_TEXT = """
|
| 386 |
+
## First Steps Before Submitting a Model
|
| 387 |
+
### 1. Ensure Your Model Loads with AutoClasses
|
| 388 |
+
Verify that you can load your model and tokenizer using AutoClasses:
|
| 389 |
+
```python
|
| 390 |
+
from transformers import AutoConfig, AutoModel, AutoTokenizer
|
| 391 |
+
config = AutoConfig.from_pretrained("your model name", revision=revision)
|
| 392 |
+
model = AutoModel.from_pretrained("your model name", revision=revision)
|
| 393 |
+
tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
|
| 394 |
+
```
|
| 395 |
+
Note:
|
| 396 |
+
- If this step fails, debug your model before submitting.
|
| 397 |
+
- Ensure your model is public.
|
| 398 |
+
- Models requiring `use_remote_code=True` are not currently supported.
|
| 399 |
+
### 2. Convert Weights to Safetensors
|
| 400 |
+
[Safetensors](https://huggingface.co/docs/safetensors/index) is a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
|
| 401 |
+
### 3. Verify Your Model Open License
|
| 402 |
+
This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
|
| 403 |
+
### 4. Complete Your Model Card
|
| 404 |
+
When we add extra information about models to the leaderboard, it will be automatically taken from the model card
|
| 405 |
+
### 5. Select Appropriate Precision
|
| 406 |
+
The "auto" option supports fp16, fp32, and bf16 precisions. If your model uses any other precision format, please select the appropriate option.
|
| 407 |
+
If auto is specified, precision in config.json is automatically selected.
|
| 408 |
+
### Note about large models
|
| 409 |
+
Currently, we support models up to 70B parameters. However, we are working on infrastructure improvements to accommodate larger models (70B+) in the near future. Stay tuned for updates!
|
| 410 |
+
|
| 411 |
+
"""
|
| 412 |
+
EVALUATION_QUEUE_TEXT_JA = """
|
| 413 |
+
## モデル提出前の最初のステップ
|
| 414 |
+
### 1. AutoClasses でモデルが読み込めることを確認
|
| 415 |
+
AutoClasses を使用してモデルとトークナイザーを読み込めることを確認してください:
|
| 416 |
+
```python
|
| 417 |
+
from transformers import AutoConfig, AutoModel, AutoTokenizer
|
| 418 |
+
config = AutoConfig.from_pretrained("your model name", revision=revision)
|
| 419 |
+
model = AutoModel.from_pretrained("your model name", revision=revision)
|
| 420 |
+
tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
|
| 421 |
+
```
|
| 422 |
+
注意:
|
| 423 |
+
- この手順が失敗する場合は、提出前にモデルをデバッグしてください。
|
| 424 |
+
- モデルが公開されていることを確認してください。
|
| 425 |
+
- `use_remote_code=True` を必要とするモデルは現時点ではサポートされていません。
|
| 426 |
+
|
| 427 |
+
### 2. 重みを Safetensors に変換
|
| 428 |
+
[Safetensors](https://huggingface.co/docs/safetensors/index) は、より安全で高速に読み込めるウェイトの新しい保存形式です。これにより、`Extended Viewer` にモデルのパラメータ数を追加することも可能になります!
|
| 429 |
+
|
| 430 |
+
### 3. モデルのオープンライセンスを確認
|
| 431 |
+
これはオープン LLM のリーダーボードです。できるだけ多くの人があなたのモデルを使用できることを知ってもらえると嬉しいです🤗
|
| 432 |
+
|
| 433 |
+
### 4. モデルカードを完成させる
|
| 434 |
+
リーダーボードにモデルの追加情報を掲載する際は、モデルカードから自動的に情報が取得されます
|
| 435 |
+
|
| 436 |
+
### 5. 適切なPrecisionの選択
|
| 437 |
+
"auto"オプションはfp16、fp32、bf16のprecisionに対応しています。これら以外のprecisionを使用している場合は、適切なオプションを選択してください。
|
| 438 |
+
また、autoを指定した場合、config.jsonのprecisionが自動的に選択されます。
|
| 439 |
+
|
| 440 |
+
### 大規模モデルに関する注意
|
| 441 |
+
現在、70Bパラメータまでのモデルをサポートしています。より大規模なモデル(70Bよりも大きいもの)については、インフラストラクチャの改善を進めており、近い将来対応予定です。続報をお待ちください!
|
| 442 |
+
|
| 443 |
+
"""
|
| 444 |
+
|
| 445 |
+
BOTTOM_LOGO = """
|
| 446 |
+
<div style="display: flex; flex-direction: row; justify-content: center; align-items: center;">
|
| 447 |
+
<a href="https://llm-jp.nii.ac.jp/en/" style="margin: 0 10px;">
|
| 448 |
+
<img src="https://raw.githubusercontent.com/AkimfromParis/akimfromparis/refs/heads/main/images/LLM-jp-Logo-Oct-2024.png" alt="LLM-jp" style="max-height: 100px;">
|
| 449 |
+
</a>
|
| 450 |
+
<a href="https://mdx.jp/" style="margin: 0 10px;">
|
| 451 |
+
<img src="https://raw.githubusercontent.com/AkimfromParis/akimfromparis/refs/heads/main/images/MDX-Logo-Oct-2024.jpg" alt="MDX" style="max-height: 100px;">
|
| 452 |
+
</a>
|
| 453 |
+
<a href="https://huggingface.co/" style="margin: 0 10px;">
|
| 454 |
+
<img src="https://raw.githubusercontent.com/AkimfromParis/akimfromparis/refs/heads/main/images/HuggingFace-Logo-Oct-2024.png" alt="HuggingFace" style="max-height: 100px;">
|
| 455 |
+
</a>
|
| 456 |
+
</div>
|
| 457 |
+
"""
|
| 458 |
+
|
| 459 |
+
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|
| 460 |
+
CITATION_BUTTON_LABEL_JA = "引用の際は、次のスニペットをコピーしてご利用ください"
|
| 461 |
+
|
| 462 |
+
CITATION_BUTTON_TEXT = r"""@misc{OJLL,
|
| 463 |
+
author = {Miyao, Yusuke and Ishida, Shigeki and Okamoto, Takumi and Han, Namgi and Mousterou, Akim and Fourrier, Clémentine and Hayashi, Toshihiro and Tachibana, Yuichiro},
|
| 464 |
+
title = {Open Japanese LLM Leaderboard},
|
| 465 |
+
year = {2024},
|
| 466 |
+
publisher = {OJLL},
|
| 467 |
+
howpublished = "\url{https://huggingface.co/spaces/llm-jp/open-japanese-llm-leaderboard}"
|
| 468 |
+
}
|
| 469 |
+
@misc{llmjp2024llmjpcrossorganizationalprojectresearch,
|
| 470 |
+
title={LLM-jp: A Cross-organizational Project for the Research and Development of Fully Open Japanese LLMs},
|
| 471 |
+
author={LLM-jp and : and Akiko Aizawa and Eiji Aramaki and Bowen Chen and Fei Cheng and Hiroyuki Deguchi and Rintaro Enomoto and Kazuki Fujii and Kensuke Fukumoto and Takuya Fukushima and Namgi Han and Yuto Harada and Chikara Hashimoto and Tatsuya Hiraoka and Shohei Hisada and Sosuke Hosokawa and Lu Jie and Keisuke Kamata and Teruhito Kanazawa and Hiroki Kanezashi and Hiroshi Kataoka and Satoru Katsumata and Daisuke Kawahara and Seiya Kawano and Atsushi Keyaki and Keisuke Kiryu and Hirokazu Kiyomaru and Takashi Kodama and Takahiro Kubo and Yohei Kuga and Ryoma Kumon and Shuhei Kurita and Sadao Kurohashi and Conglong Li and Taiki Maekawa and Hiroshi Matsuda and Yusuke Miyao and Kentaro Mizuki and Sakae Mizuki and Yugo Murawaki and Ryo Nakamura and Taishi Nakamura and Kouta Nakayama and Tomoka Nakazato and Takuro Niitsuma and Jiro Nishitoba and Yusuke Oda and Hayato Ogawa and Takumi Okamoto and Naoaki Okazaki and Yohei Oseki and Shintaro Ozaki and Koki Ryu and Rafal Rzepka and Keisuke Sakaguchi and Shota Sasaki and Satoshi Sekine and Kohei Suda and Saku Sugawara and Issa Sugiura and Hiroaki Sugiyama and Hisami Suzuki and Jun Suzuki and Toyotaro Suzumura and Kensuke Tachibana and Yu Takagi and Kyosuke Takami and Koichi Takeda and Masashi Takeshita and Masahiro Tanaka and Kenjiro Taura and Arseny Tolmachev and Nobuhiro Ueda and Zhen Wan and Shuntaro Yada and Sakiko Yahata and Yuya Yamamoto and Yusuke Yamauchi and Hitomi Yanaka and Rio Yokota and Koichiro Yoshino},
|
| 472 |
+
year={2024},
|
| 473 |
+
eprint={2407.03963},
|
| 474 |
+
archivePrefix={arXiv},
|
| 475 |
+
primaryClass={cs.CL},
|
| 476 |
+
url={https://arxiv.org/abs/2407.03963},
|
| 477 |
+
}
|
| 478 |
+
"""
|
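The "Average Score Calculation" note above is the entire scoring rule, so it can be sanity-checked directly against the Task metadata. A minimal sketch, assuming a result row keyed by each task's col_name with scores in 0-1, and relying only on the ⭐ marker that the starred display names carry; the average_score helper is hypothetical, not part of the repo:

```python
from src.about import Tasks


def average_score(row: dict) -> float:
    # Hypothetical helper: average only the datasets whose display name
    # carries the ⭐ marker, per the rule stated in LLM_BENCHMARKS_TEXT.
    starred = [t.value.col_name for t in Tasks if "⭐" in t.value.col_name]
    return sum(row[name] for name in starred) / len(starred)
```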
src/display/formatting.py
ADDED
@@ -0,0 +1,37 @@
def model_hyperlink(link, model_name):
    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'


def model_hyperlink_with_shot(link, model_name, num_few_shot):
    display_name = f"{model_name} ({num_few_shot}-shot)"
    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{display_name}</a>'


def make_clickable_model(model_name):
    link = f"https://huggingface.co/{model_name}"
    return model_hyperlink(link, model_name)


def make_clickable_model_with_shot(model_name, num_few_shot):
    link = f"https://huggingface.co/{model_name}"
    return model_hyperlink_with_shot(link, model_name, num_few_shot)


def styled_error(error):
    return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"


def styled_warning(warn):
    return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"


def styled_message(message):
    return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"


def has_no_nan_values(df, columns):
    return df[columns].notna().all(axis=1)


def has_nan_values(df, columns):
    return df[columns].isna().any(axis=1)
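For reference, a quick sketch of how these helpers compose; the model id and shot count are made-up examples:

```python
from src.display.formatting import make_clickable_model_with_shot

# Hypothetical model id and shot count, purely for illustration.
html = make_clickable_model_with_shot("llm-jp/llm-jp-3-13b", 4)
print(html)
# -> '<a target="_blank" href="https://huggingface.co/llm-jp/llm-jp-3-13b" ...>llm-jp/llm-jp-3-13b (4-shot)</a>'
```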
src/display/utils.py
ADDED
@@ -0,0 +1,202 @@
from dataclasses import dataclass, make_dataclass
from enum import Enum

import pandas as pd

from src.about import Tasks, TaskType


def fields(raw_class):
    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]


# These classes are for user facing column names,
# to avoid having to change them all around the code
# when a modif is needed
@dataclass
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False
    dummy: bool = False
    task_type: TaskType = TaskType.NotTask
    average: bool = False


## Leaderboard columns
auto_eval_column_dict = []
# Init
auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
# Scores
# auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
for task in Tasks:
    auto_eval_column_dict.append(
        [
            task.name,
            ColumnContent,
            ColumnContent(
                task.value.col_name,
                "number",
                displayed_by_default=(task.value.task_type == TaskType.AVG or task.value.average),
                task_type=task.value.task_type,
                average=task.value.average,
            ),
        ]
    )
# Model information
auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Revision", "str", False, False)])
auto_eval_column_dict.append(["num_few_shots", ColumnContent, ColumnContent("Few-shot", "number", False)])
auto_eval_column_dict.append(["add_special_tokens", ColumnContent, ColumnContent("Add Special Tokens", "bool", False)])
auto_eval_column_dict.append(
    ["llm_jp_eval_version", ColumnContent, ColumnContent("llm-jp-eval version", "str", False)]
)
auto_eval_column_dict.append(["vllm_version", ColumnContent, ColumnContent("vllm version", "str", False)])
auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
auto_eval_column_dict.append(["row_id", ColumnContent, ColumnContent("ID", "number", False, dummy=True)])

# We use make dataclass to dynamically fill the scores from Tasks
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)


## For the queue columns in the submission tab
@dataclass(frozen=True)
class EvalQueueColumn:  # Queue column
    model = ColumnContent("model", "markdown", True)
    revision = ColumnContent("revision", "str", True)
    model_type = ColumnContent("model_type", "str", True)
    precision = ColumnContent("precision", "str", True)
    add_special_tokens = ColumnContent("add_special_tokens", "str", True)
    llm_jp_eval_version = ColumnContent("llm_jp_eval_version", "str", True)
    vllm_version = ColumnContent("vllm_version", "str", True)
    status = ColumnContent("status", "str", True)


# This class is used to store the model data in the queue
@dataclass(frozen=True)
class EvalQueuedModel:
    model: str
    revision: str
    precision: str
    add_special_tokens: str
    llm_jp_eval_version: str
    vllm_version: str


## All the model information that we might need
@dataclass
class ModelDetails:
    name: str
    display_name: str = ""
    symbol: str = ""  # emoji


class ModelType(Enum):
    PT = ModelDetails(name="pretrained", symbol="🟢")
    FT = ModelDetails(name="fine-tuned", symbol="🔶")
    IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
    RL = ModelDetails(name="RL-tuned (Preference optimization)", symbol="🟦")
    MM = ModelDetails(name="multimodal", symbol="🌸")
    BM = ModelDetails(name="base merges and moerges", symbol="🤝")

    def to_str(self, separator=" "):
        return f"{self.value.symbol}{separator}{self.value.name}"

    @staticmethod
    def from_str(type):
        if "fine-tuned" in type or "🔶" in type:
            return ModelType.FT
        if "pretrained" in type or "🟢" in type:
            return ModelType.PT
        if "RL-tuned" in type or "🟦" in type:
            return ModelType.RL
        if "instruction-tuned" in type or "⭕" in type:
            return ModelType.IFT
        if "multimodal" in type or "🌸" in type:
            return ModelType.MM
        if "base merges and moerges" in type or "🤝" in type:
            return ModelType.BM
        raise ValueError(f"Unsupported model type: {type}")


class WeightType(Enum):
    Adapter = ModelDetails("Adapter")
    Original = ModelDetails("Original")
    Delta = ModelDetails("Delta")


class Precision(Enum):
    float16 = ModelDetails("float16")
    bfloat16 = ModelDetails("bfloat16")
    float32 = ModelDetails("float32")

    @staticmethod
    def from_str(precision: str) -> "Precision":
        if precision == "float16":
            return Precision.float16
        if precision == "bfloat16":
            return Precision.bfloat16
        if precision == "float32":
            return Precision.float32
        raise ValueError(
            f"Unsupported precision type: {precision}. Please use 'auto' (recommended), 'float32', 'float16', or 'bfloat16'"
        )


class AddSpecialTokens(Enum):
    true = ModelDetails("True")
    false = ModelDetails("False")


class NumFewShots(Enum):
    shots_0 = 0
    shots_4 = 4


class LLMJpEvalVersion(Enum):
    current = ModelDetails("v1.4.1")

    @staticmethod
    def from_str(version: str) -> "LLMJpEvalVersion":
        if version == "1.4.1":
            return LLMJpEvalVersion.current
        raise ValueError(f"Unsupported LLMJpEval version: {version}")


class VllmVersion(Enum):
    current = ModelDetails("v0.6.3.post1")

    @staticmethod
    def from_str(version: str) -> "VllmVersion":
        if version == "v0.6.3.post1":
            return VllmVersion.current
        raise ValueError(f"Unsupported VLLM version: {version}")


# Column selection
COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
TYPES = [c.type for c in fields(AutoEvalColumn)]

EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]

BENCHMARK_COLS = [t.value.col_name for t in Tasks]

NUMERIC_INTERVALS = {
    "0~3B": pd.Interval(0, 3, closed="right"),
    "3~7B": pd.Interval(3, 7.3, closed="right"),
    "7~13B": pd.Interval(7.3, 13, closed="right"),
    "13~35B": pd.Interval(13, 35, closed="right"),
    "35~60B": pd.Interval(35, 60, closed="right"),
    "60B+": pd.Interval(60, 10000, closed="right"),
    "?": pd.Interval(-1, 0, closed="right"),
}
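The NUMERIC_INTERVALS buckets drive the parameter-size filter, with unknown sizes stored as 0 so they land in the "?" bucket. A small sketch; size_bucket is a hypothetical helper written only for illustration:

```python
from src.display.utils import NUMERIC_INTERVALS, ModelType


def size_bucket(params_b: float) -> str:
    # Return the first size filter whose pd.Interval contains the
    # parameter count (in billions); 0 means "unknown" and falls in "?".
    return next(name for name, interval in NUMERIC_INTERVALS.items() if params_b in interval)


print(size_bucket(1.5))   # 0~3B
print(size_bucket(13.0))  # 7~13B
print(size_bucket(0))     # ?
print(ModelType.from_str("🔶 fine-tuned").to_str())  # 🔶 fine-tuned
```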
src/envs.py
ADDED
@@ -0,0 +1,23 @@
import os
import pathlib

from huggingface_hub import HfApi

# Info to change for your repository
# ----------------------------------
HF_TOKEN = os.environ.get("HF_TOKEN")  # A read/write token for your org

OWNER = "e-mon"  # Change to your org - don't forget to create a results and request dataset, with the correct format!
# ----------------------------------

REPO_ID = f"{OWNER}/open-japanese-llm-leaderboard-v2"
QUEUE_REPO = f"{OWNER}/leaderboard-requests-v2"
CONTENTS_REPO = f"{OWNER}/leaderboard-contents-v2"

# If you setup a cache later, just change HF_HOME
CACHE_PATH = pathlib.Path(os.getenv("HF_HOME", "."))

# Local caches
EVAL_REQUESTS_PATH = CACHE_PATH / "eval-queue"

API = HfApi(token=HF_TOKEN)
src/i18n.py
ADDED
@@ -0,0 +1,11 @@
# Column selection
SELECT_ALL_BUTTON_LABEL = "Select All"
SELECT_ALL_BUTTON_LABEL_JA = "全選択"
SELECT_NONE_BUTTON_LABEL = "Select None"
SELECT_NONE_BUTTON_LABEL_JA = "全解除"
SELECT_AVG_ONLY_BUTTON_LABEL = "AVG Only"
SELECT_AVG_ONLY_BUTTON_LABEL_JA = "AVGのみ"

# Citation
CITATION_ACCORDION_LABEL = "📙 Citation"
CITATION_ACCORDION_LABEL_JA = "📙 引用"
src/populate.py
ADDED
@@ -0,0 +1,104 @@
import json
import os

import datasets
import pandas as pd

from src.about import Tasks
from src.display.formatting import has_no_nan_values, make_clickable_model, make_clickable_model_with_shot
from src.display.utils import AutoEvalColumn, EvalQueueColumn

# The values of these columns are in the range of 0-100
# We normalize them to 0-1
COLUMNS_TO_NORMALIZE = [
    "ALT E to J BLEU",
    "ALT J to E BLEU",
    "WikiCorpus E to J BLEU",
    "WikiCorpus J to E BLEU",
    "XL-Sum JA BLEU",
    "XL-Sum ROUGE1",
    "XL-Sum ROUGE2",
    "XL-Sum ROUGE-Lsum",
]


def get_leaderboard_df(contents_repo: str, cols: list[str], benchmark_cols: list[str]) -> pd.DataFrame:
    df = datasets.load_dataset(contents_repo, split="train").to_pandas()
    # df["Model"] = df["model"].map(make_clickable_model)
    df["Model"] = df.apply(lambda x: make_clickable_model_with_shot(x["model"], x["num_few_shot"]), axis=1)
    df["T"] = df["model_type"].map(lambda x: x.split(":")[0].strip())
    df = df.rename(columns={task.value.metric: task.value.col_name for task in Tasks})
    df = df.rename(
        columns={
            "architecture": "Architecture",
            "weight_type": "Weight type",
            "precision": "Precision",
            "license": "Hub License",
            "params": "#Params (B)",
            "likes": "Hub ❤️",
            "revision": "Revision",
            "num_few_shot": "Few-shot",
            "add_special_tokens": "Add Special Tokens",
            "llm_jp_eval_version": "llm-jp-eval version",
            "vllm_version": "vllm version",
            "model_type": "Type",
            "model": "model_name_for_query",
        }
    )

    # Add a row ID column
    df[AutoEvalColumn.row_id.name] = range(len(df))

    # Normalize the columns
    available_columns_to_normalize = [col for col in COLUMNS_TO_NORMALIZE if col in df.columns]
    df[available_columns_to_normalize] = df[available_columns_to_normalize] / 100

    df = df.sort_values(by=[AutoEvalColumn.AVG.name], ascending=False)
    df = df[cols].round(decimals=4)

    # filter out if any of the benchmarks have not been produced
    df = df[has_no_nan_values(df, benchmark_cols)]

    return df


def get_evaluation_queue_df(save_path: str, cols: list[str]) -> list[pd.DataFrame]:
    """Creates the different dataframes for the evaluation queue requests"""
    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
    all_evals = []

    for entry in entries:
        if ".json" in entry:
            file_path = os.path.join(save_path, entry)
            with open(file_path) as fp:
                data = json.load(fp)

            # data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
            data[EvalQueueColumn.model.name] = make_clickable_model_with_shot(
                data["model"],
                data["num_few_shot"],  # num_few_shot is always present, so access it directly
            )
            data[EvalQueueColumn.revision.name] = data.get("revision", "main")

            all_evals.append(data)
        elif ".md" not in entry:
            # this is a folder
            sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
            for sub_entry in sub_entries:
                file_path = os.path.join(save_path, entry, sub_entry)
                with open(file_path) as fp:
                    data = json.load(fp)

                data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
                all_evals.append(data)

    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
    failed_list = [e for e in all_evals if e["status"] == "FAILED"]
    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
    df_running = pd.DataFrame.from_records(running_list, columns=cols)
    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
    df_failed = pd.DataFrame.from_records(failed_list, columns=cols)
    return df_finished[cols], df_running[cols], df_pending[cols], df_failed[cols]
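The normalization step in get_leaderboard_df rescales metrics that arrive on a 0-100 scale (BLEU, ROUGE) so they sit on the same 0-1 scale as the rest of the table. A toy illustration with made-up scores:

```python
import pandas as pd

# Only the columns listed in COLUMNS_TO_NORMALIZE (and present in the
# frame) are divided by 100; already 0-1 metrics are left untouched.
df = pd.DataFrame({"ALT J to E BLEU": [23.4, 7.9], "JNLI": [0.81, 0.64]})
cols = [c for c in ["ALT J to E BLEU", "XL-Sum ROUGE2"] if c in df.columns]
df[cols] = df[cols] / 100
print(df)
#    ALT J to E BLEU  JNLI
# 0            0.234  0.81
# 1            0.079  0.64
```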
src/submission/check_validity.py
ADDED
@@ -0,0 +1,111 @@
import json
import os
import pathlib

import huggingface_hub
import requests
from huggingface_hub import ModelCard
from huggingface_hub.hf_api import ModelInfo
from transformers import AutoConfig
from transformers.models.auto.tokenization_auto import AutoTokenizer

from src.display.utils import EvalQueuedModel


def check_model_card(repo_id: str) -> tuple[bool, str]:
    """Checks if the model card and license exist and have been filled"""
    try:
        card = ModelCard.load(repo_id)
    except huggingface_hub.utils.EntryNotFoundError:
        return False, "Please add a model card to your model to explain how you trained/fine-tuned it."

    # Enforce license metadata
    if card.data.license is None:
        if not ("license_name" in card.data and "license_link" in card.data):
            return False, (
                "License not found. Please add a license to your model card using the `license` metadata or a"
                " `license_name`/`license_link` pair."
            )

    # Enforce card content
    if len(card.text) < 200:
        return False, "Please add a description to your model card, it is too short."

    return True, ""


def is_model_on_hub(
    model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False
) -> tuple[bool, str]:
    """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
    try:
        config = AutoConfig.from_pretrained(
            model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
        )
        if test_tokenizer:
            try:
                AutoTokenizer.from_pretrained(
                    model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
                )
            except ValueError as e:
                return (False, f"uses a tokenizer which is not in a transformers release: {e}", None)
            except Exception:
                return (
                    False,
                    "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?",
                    None,
                )
        return True, None, config

    except ValueError:
        return (
            False,
            "needs to be launched with `trust_remote_code=True`. For safety reasons, we do not allow these models to be automatically submitted to the leaderboard.",
            None,
        )

    except OSError as e:
        if "gated repo" in str(e):
            slack_webhook_url = os.environ["SLACK_WEBHOOK_URL"]
            text = f"<!channel>\n{model_name} is a gated model! Please submit this model."
            requests.post(slack_webhook_url, data=json.dumps({"text": text}))
            return False, "is a gated model! Please wait.", None
        return False, "was not found on hub!", None
    except Exception:
        return False, "was not found on hub!", None


def get_model_size(model_info: ModelInfo, precision: str):
    """Gets the model size from the configuration, or the model name if the configuration does not contain the information."""
    try:
        model_size = round(model_info.safetensors["total"] / 1e9, 3)
    except (AttributeError, TypeError):
        return 0  # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in src/display/utils.py

    size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
    model_size = size_factor * model_size
    return model_size


def get_model_arch(model_info: ModelInfo):
    """Gets the model architecture from the configuration"""
    return model_info.config.get("architectures", "Unknown")


def already_submitted_models(requested_models_dir: pathlib.Path) -> set[EvalQueuedModel]:
    """Gather a list of already submitted models to avoid duplicates"""
    queued_models = set()
    for json_path in requested_models_dir.glob("*/*.json"):
        with json_path.open() as f:
            info = json.load(f)
        queued_models.add(
            EvalQueuedModel(
                model=info["model"],
                revision=info["revision"],
                precision=info["precision"],
                add_special_tokens=info["add_special_tokens"],
                llm_jp_eval_version=info["llm_jp_eval_version"],
                vllm_version=info["vllm_version"],
            )
        )
    return queued_models
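A sketch of how these validators chain together, mirroring the order used in add_new_eval below; the model id is only an example:

```python
from src.submission.check_validity import check_model_card, is_model_on_hub

# Example public model id; any Hub repo would do here.
ok, error, config = is_model_on_hub(
    model_name="llm-jp/llm-jp-3-13b", revision="main", test_tokenizer=True
)
if not ok:
    print(f'Model "llm-jp/llm-jp-3-13b" {error}')
else:
    card_ok, card_error = check_model_card("llm-jp/llm-jp-3-13b")
    print("model card OK" if card_ok else card_error)
```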
src/submission/submit.py
ADDED
@@ -0,0 +1,128 @@
import json
from datetime import datetime, timezone

import torch

from src.display.formatting import styled_error, styled_message, styled_warning
from src.display.utils import EvalQueuedModel, LLMJpEvalVersion, VllmVersion
from src.envs import API, EVAL_REQUESTS_PATH, HF_TOKEN, QUEUE_REPO
from src.submission.check_validity import already_submitted_models, check_model_card, is_model_on_hub

REQUESTED_MODELS: set[EvalQueuedModel] = set()

LLM_JP_EVAL_VERSION = LLMJpEvalVersion.current.value.name
VLLM_VERSION = VllmVersion.current.value.name


def add_new_eval(
    model_id: str,
    revision: str,
    precision: str,
    model_type: str,
    add_special_tokens: str,
):
    global REQUESTED_MODELS
    if not REQUESTED_MODELS:
        REQUESTED_MODELS = already_submitted_models(EVAL_REQUESTS_PATH)

    revision = revision or "main"

    # Is the model on the hub?
    model_on_hub, error, config = is_model_on_hub(
        model_name=model_id, revision=revision, token=HF_TOKEN, test_tokenizer=True
    )
    if not model_on_hub:
        return styled_error(f'Model "{model_id}" {error}')
    if precision == "auto":
        dtype = ""
        if hasattr(config, "torch_dtype"):
            dtype = config.torch_dtype
        if dtype == torch.float16:
            precision = "float16"
        elif dtype == torch.bfloat16:
            precision = "bfloat16"
        elif dtype == torch.float32:
            precision = "float32"
        else:
            return styled_error(
                "Unable to retrieve a valid dtype from config.json. Please select an appropriate one from fp16/fp32/bf16 and resubmit."
            )

    model_data = EvalQueuedModel(
        model=model_id,
        revision=revision,
        precision=precision,
        add_special_tokens=add_special_tokens,
        llm_jp_eval_version=LLM_JP_EVAL_VERSION,
        vllm_version=VLLM_VERSION,
    )

    if model_data in REQUESTED_MODELS:
        return styled_warning("This model has already been submitted with the same configuration.")

    if "/" in model_id:
        user_or_org, model_name = model_id.split("/")
    else:
        user_or_org, model_name = "", model_id

    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    if model_type is None or model_type == "":
        return styled_error("Please select a model type.")

    # Is the model info correctly filled?
    try:
        model_info = API.model_info(repo_id=model_id, revision=revision)
    except Exception:
        return styled_error("Could not get your model information. Please fill it up properly.")

    # Were the model card and license filled?
    try:
        _ = model_info.cardData["license"]
    except Exception:
        return styled_error("Please select a license for your model")

    modelcard_OK, error_msg = check_model_card(model_id)
    if not modelcard_OK:
        return styled_error(error_msg)

    # Seems good, creating the eval
    print("Adding new eval")

    eval_entry = {
        "model_type": model_type,
        "model": model_id,
        "precision": precision,
        "revision": revision,
        "add_special_tokens": add_special_tokens,
        "llm_jp_eval_version": LLM_JP_EVAL_VERSION,
        "vllm_version": VLLM_VERSION,
        "status": "PENDING",
        "submitted_time": current_time,
    }

    print("Creating eval file")
    OUT_DIR = EVAL_REQUESTS_PATH / user_or_org
    OUT_DIR.mkdir(parents=True, exist_ok=True)
    out_file_name = f"{model_name}_{current_time.replace(':', '-')}.json"
    out_path = OUT_DIR / out_file_name

    with out_path.open("w") as f:
        f.write(json.dumps(eval_entry))

    print("Uploading eval file")
    API.upload_file(
        path_or_fileobj=out_path,
        path_in_repo=out_path.relative_to(EVAL_REQUESTS_PATH).as_posix(),
        repo_id=QUEUE_REPO,
        repo_type="dataset",
        commit_message=f"Add {model_id} to eval queue",
    )
    REQUESTED_MODELS.add(model_data)

    # Remove the local file
    out_path.unlink()

    return styled_message(
        "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
    )
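For illustration, a hypothetical direct call to add_new_eval; in the Space the function is wired to the "Submit here!" Gradio form, and every argument shown here is a placeholder:

```python
from src.submission.submit import add_new_eval

html = add_new_eval(
    model_id="your-org/your-model",      # placeholder repo id
    revision="main",
    precision="auto",                    # resolved from config.json's torch_dtype
    model_type="⭕ : instruction-tuned",  # assumed label format, cf. ModelType.to_str
    add_special_tokens="False",
)
print(html)  # styled success/warning/error message rendered by the UI
```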
style.css
ADDED
|
@@ -0,0 +1,143 @@
+.markdown-text {
+  font-size: 16px !important;
+}
+
+#models-to-add-text {
+  font-size: 18px !important;
+}
+
+#citation-button span {
+  font-size: 16px !important;
+}
+
+#citation-button textarea {
+  font-size: 16px !important;
+}
+
+#citation-button > label > button {
+  margin: 6px;
+  transform: scale(1.3);
+}
+
+#leaderboard-table {
+  margin-top: 15px;
+}
+
+#search-bar-table-box > div:first-child {
+  background: none;
+  border: none;
+}
+
+#search-bar {
+  padding: 0px;
+}
+
+/* Limit the width of the first AutoEvalColumn so that names don't expand too much */
+#leaderboard-table td:nth-child(2),
+#leaderboard-table th:nth-child(2) {
+  max-width: 400px;
+  overflow: auto;
+  white-space: nowrap;
+}
+
+@media (min-width: 700px) {
+  #leaderboard-table td:nth-child(2) {
+    left: 0;
+    z-index: 1;
+    position: sticky;
+    border-right: solid rgba(0, 0, 0, 0.1) !important;
+  }
+}
+@media (min-width: 700px) and (prefers-color-scheme: light) {
+  #leaderboard-table td:nth-child(2) {
+    background-color: rgba(255, 255, 255, 0.9) !important;
+  }
+}
+
+@media (min-width: 700px) and (prefers-color-scheme: dark) {
+  #leaderboard-table td:nth-child(2) {
+    background-color: rgba(52, 65, 86, 0.9) !important;
+  }
+  #leaderboard-table td a {
+    color: white !important;
+  }
+}
+
+#llm-benchmark-tab-table-button, #llm-benchmark-tab-about-button, #llm-benchmark-tab-submit-button {
+  font-size: 20px;
+}
+
+#scale-logo {
+  border-style: none !important;
+  box-shadow: none;
+  display: block;
+  margin-left: auto;
+  margin-right: auto;
+  max-width: 600px;
+}
+
+#scale-logo .download {
+  display: none;
+}
+#filter_type {
+  border: 0;
+  padding-left: 0;
+  padding-top: 0;
+}
+#filter_type label {
+  display: flex;
+}
+#filter_type label > span {
+  margin-top: var(--spacing-lg);
+  margin-right: 0.5em;
+}
+#filter_type label > .wrap {
+  width: 103px;
+}
+#filter_type label > .wrap .wrap-inner {
+  padding: 2px;
+}
+#filter_type label > .wrap .wrap-inner input {
+  width: 1px;
+}
+#filter-columns-type {
+  border: 0;
+  padding: 0.5;
+}
+#filter-columns-size {
+  border: 0;
+  padding: 0.5;
+}
+#box-filter > .form {
+  border: 0;
+}
+
+.language-selector {
+  width: auto;
+  display: flex;
+  justify-content: center;
+  margin: 20px 0;
+}
+
+/* Full width space */
+.gradio-container {
+  max-width: 95% !important;
+}
+
+.accordion {
+  min-width: 200px !important;
+  border: solid rgba(175, 175, 175, 0.1) !important;
+}
+
+/* make the plotly modebar horizontal */
+.modebar-group {
+  display: flex;
+  flex-direction: row;
+  align-items: center;
+}
+
+/* Hides the final AutoEvalColumn */
+#llm-benchmark-tab-table table td:last-child,
+#llm-benchmark-tab-table table th:last-child {
+  display: none;
+}
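The selectors above (e.g. .markdown-text, #leaderboard-table, .language-selector) only take effect once the stylesheet is handed to Gradio. A minimal sketch of that wiring — the component layout below is an assumption for illustration, not this repo's actual app.py:

from pathlib import Path

import gradio as gr

# Read the stylesheet added in this commit and pass it to gr.Blocks;
# Gradio injects it into the rendered page.
custom_css = Path("style.css").read_text()

with gr.Blocks(css=custom_css) as demo:
    # elem_classes / elem_id attach components to the CSS rules above.
    gr.Markdown("Leaderboard", elem_classes="markdown-text")

if __name__ == "__main__":
    demo.launch()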
uv.lock
ADDED
The diff for this file is too large to render. See raw diff.