Spaces:
Runtime error
Runtime error
update layout
Browse files
- .gitignore +2 -0
- app.py +4 -0
- data/models.yaml +49 -8
- src/about.py +6 -4
- src/populate.py +7 -2
.gitignore
CHANGED
|
@@ -14,3 +14,5 @@ logs/
|
|
| 14 |
envs/
|
| 15 |
|
| 16 |
tmp.py
|
|
|
|
|
|
|
|
|
| 14 |
envs/
|
| 15 |
|
| 16 |
tmp.py
|
| 17 |
+
print.py
|
| 18 |
+
leaderboard.tex
|
app.py
CHANGED
|
@@ -45,6 +45,10 @@ with demo:
|
|
| 45 |
)
|
| 46 |
|
| 47 |
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
| 49 |
|
| 50 |
with gr.Row():
|
|
|
|
| 45 |
)
|
| 46 |
|
| 47 |
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
|
| 48 |
+
# with gr.Column(scale=2):
|
| 49 |
+
# gr.Markdown("""
|
| 50 |
+
# 
|
| 51 |
+
# """)
|
| 52 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
| 53 |
|
| 54 |
with gr.Row():
|
data/models.yaml
CHANGED
|
@@ -1,82 +1,123 @@
|
|
| 1 |
- name: 'tulu-2-dpo-70b'
|
| 2 |
fdir: 'tulu-2-dpo-70b'
|
|
|
|
| 3 |
- name: 'tulu-2-70b'
|
| 4 |
fdir: 'tulu-2-70b'
|
|
|
|
| 5 |
- name: 'llama-2-70b'
|
| 6 |
fdir: 'llama-2-70b'
|
|
|
|
| 7 |
- name: 'tulu-2-dpo-13b'
|
| 8 |
fdir: 'tulu-2-dpo-13b'
|
|
|
|
| 9 |
- name: 'tulu-2-13b'
|
| 10 |
fdir: 'tulu-2-13b'
|
|
|
|
| 11 |
- name: 'llama-2-13b'
|
| 12 |
fdir: 'llama-2-13b'
|
|
|
|
| 13 |
- name: 'tulu-2-dpo-7b'
|
| 14 |
fdir: 'tulu-2-dpo-7b'
|
|
|
|
| 15 |
- name: 'tulu-2-7b'
|
| 16 |
fdir: 'tulu-2-7b'
|
|
|
|
| 17 |
- name: 'llama-2-7b'
|
| 18 |
fdir: 'llama-2-7b'
|
|
|
|
| 19 |
- name: 'gemini-1.0-pro'
|
| 20 |
fdir: 'gemini-1.0-pro'
|
|
|
|
| 21 |
- name: 'gemini-1.5-pro'
|
| 22 |
fdir: 'gemini-1.5-pro'
|
|
|
|
| 23 |
- name: 'gemini-1.5-flash'
|
| 24 |
fdir: 'gemini-1.5-flash'
|
|
|
|
| 25 |
- name: 'llama-3-8b'
|
| 26 |
fdir: 'llama-3-8b'
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
- name: 'gpt-3.5-turbo-0125'
|
| 28 |
fdir: 'gpt-3.5-turbo-0125'
|
|
|
|
| 29 |
- name: 'gpt-4-0314'
|
| 30 |
fdir: 'gpt-4-0314'
|
|
|
|
| 31 |
- name: 'gpt-4-0613'
|
| 32 |
fdir: 'gpt-4-0613'
|
|
|
|
| 33 |
- name: 'gpt-4-1106-preview'
|
| 34 |
fdir: 'gpt-4-1106-preview'
|
|
|
|
| 35 |
- name: 'gpt-4-0125-preview'
|
| 36 |
fdir: 'gpt-4-0125-preview'
|
|
|
|
| 37 |
- name: 'gpt-4-turbo-2024-04-09'
|
| 38 |
fdir: 'gpt-4-turbo-2024-04-09'
|
|
|
|
| 39 |
- name: 'gpt-4o'
|
| 40 |
fdir: 'gpt-4o'
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
- name: 'claude-3-opus'
|
| 42 |
fdir: 'claude-3-opus-20240229'
|
|
|
|
| 43 |
- name: 'claude-3-haiku'
|
| 44 |
fdir: 'claude-3-haiku-20240307'
|
|
|
|
| 45 |
- name: 'claude-3-sonnet'
|
| 46 |
fdir: 'claude-3-sonnet-20240229'
|
|
|
|
| 47 |
- name: 'claude-2.1'
|
| 48 |
fdir: 'claude-2.1'
|
|
|
|
| 49 |
- name: 'claude-instant-1.2'
|
| 50 |
fdir: 'claude-instant-1.2'
|
|
|
|
| 51 |
- name: 'command-r-plus'
|
| 52 |
fdir: 'command-r-plus'
|
| 53 |
-
|
| 54 |
-
|
|
|
|
|
|
|
| 55 |
- name: 'mistral-7b-v0.2'
|
| 56 |
fdir: 'mistral-7b-v0.2'
|
|
|
|
| 57 |
- name: 'mistral-7b-v0.1'
|
| 58 |
fdir: 'mistral-7b-v0.1'
|
|
|
|
| 59 |
- name: 'mixtral-8x7b'
|
| 60 |
fdir: 'mixtral-8x7b'
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
- name: 'yi-1.5-34b'
|
| 62 |
fdir: 'yi-1.5-34b'
|
|
|
|
| 63 |
- name: 'yi-1.5-9b'
|
| 64 |
fdir: 'yi-1.5-9b'
|
|
|
|
| 65 |
- name: 'qwen-1.5-72b'
|
| 66 |
fdir: 'qwen-1.5-72b'
|
|
|
|
| 67 |
- name: 'qwen-1.5-32b'
|
| 68 |
fdir: 'qwen-1.5-32b'
|
|
|
|
| 69 |
- name: 'qwen-2-72b'
|
| 70 |
fdir: 'qwen-2-72b'
|
|
|
|
| 71 |
- name: 'gemma-7b'
|
| 72 |
fdir: 'gemma-7b'
|
|
|
|
| 73 |
- name: 'gemma-2b'
|
| 74 |
fdir: 'gemma-2b'
|
| 75 |
-
- name: 'mistral-7b-v0.3'
|
| 76 |
-
fdir: 'mistral-7b-v0.3'
|
| 77 |
- name: 'glm-4-9b'
|
| 78 |
fdir: 'glm-4-9b'
|
| 79 |
-
- name: 'mistral-large'
|
| 80 |
-
fdir: 'mistral-large'
|
| 81 |
-
- name: 'claude-3.5-sonnet'
|
| 82 |
-
fdir: 'claude3.5-sonnet'
|
|
|
|
| 1 |
- name: 'tulu-2-dpo-70b'
|
| 2 |
fdir: 'tulu-2-dpo-70b'
|
| 3 |
+
url: 'https://huggingface.co/allenai/tulu-2-dpo-70b'
|
| 4 |
- name: 'tulu-2-70b'
|
| 5 |
fdir: 'tulu-2-70b'
|
| 6 |
+
url: 'https://huggingface.co/allenai/tulu-2-70b'
|
| 7 |
- name: 'llama-2-70b'
|
| 8 |
fdir: 'llama-2-70b'
|
| 9 |
+
url: 'https://huggingface.co/meta-llama/Llama-2-70b-chat-hf'
|
| 10 |
- name: 'tulu-2-dpo-13b'
|
| 11 |
fdir: 'tulu-2-dpo-13b'
|
| 12 |
+
url: 'https://huggingface.co/allenai/tulu-2-dpo-13b'
|
| 13 |
- name: 'tulu-2-13b'
|
| 14 |
fdir: 'tulu-2-13b'
|
| 15 |
+
url: 'https://huggingface.co/allenai/tulu-2-13b'
|
| 16 |
- name: 'llama-2-13b'
|
| 17 |
fdir: 'llama-2-13b'
|
| 18 |
+
url: 'https://huggingface.co/meta-llama/Llama-2-13b-chat-hf'
|
| 19 |
- name: 'tulu-2-dpo-7b'
|
| 20 |
fdir: 'tulu-2-dpo-7b'
|
| 21 |
+
url: 'https://huggingface.co/allenai/tulu-2-dpo-7b'
|
| 22 |
- name: 'tulu-2-7b'
|
| 23 |
fdir: 'tulu-2-7b'
|
| 24 |
+
url: 'https://huggingface.co/allenai/tulu-2-7b'
|
| 25 |
- name: 'llama-2-7b'
|
| 26 |
fdir: 'llama-2-7b'
|
| 27 |
+
url: 'https://huggingface.co/meta-llama/Llama-2-7b-chat-hf'
|
| 28 |
- name: 'gemini-1.0-pro'
|
| 29 |
fdir: 'gemini-1.0-pro'
|
| 30 |
+
url: 'https://deepmind.google/technologies/gemini/pro/'
|
| 31 |
- name: 'gemini-1.5-pro'
|
| 32 |
fdir: 'gemini-1.5-pro'
|
| 33 |
+
url: 'https://deepmind.google/technologies/gemini/pro/'
|
| 34 |
- name: 'gemini-1.5-flash'
|
| 35 |
fdir: 'gemini-1.5-flash'
|
| 36 |
+
url: 'https://deepmind.google/technologies/gemini/flash/'
|
| 37 |
- name: 'llama-3-8b'
|
| 38 |
fdir: 'llama-3-8b'
|
| 39 |
+
url: 'https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct'
|
| 40 |
+
- name: 'llama-3-70b'
|
| 41 |
+
fdir: 'llama-3-70b'
|
| 42 |
+
url: 'https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct'
|
| 43 |
- name: 'gpt-3.5-turbo-0125'
|
| 44 |
fdir: 'gpt-3.5-turbo-0125'
|
| 45 |
+
url: 'https://platform.openai.com/docs/models/gpt-3-5-turbo'
|
| 46 |
- name: 'gpt-4-0314'
|
| 47 |
fdir: 'gpt-4-0314'
|
| 48 |
+
url: 'https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4'
|
| 49 |
- name: 'gpt-4-0613'
|
| 50 |
fdir: 'gpt-4-0613'
|
| 51 |
+
url: 'https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4'
|
| 52 |
- name: 'gpt-4-1106-preview'
|
| 53 |
fdir: 'gpt-4-1106-preview'
|
| 54 |
+
url: 'https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4'
|
| 55 |
- name: 'gpt-4-0125-preview'
|
| 56 |
fdir: 'gpt-4-0125-preview'
|
| 57 |
+
url: 'https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4'
|
| 58 |
- name: 'gpt-4-turbo-2024-04-09'
|
| 59 |
fdir: 'gpt-4-turbo-2024-04-09'
|
| 60 |
+
url: 'https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4'
|
| 61 |
- name: 'gpt-4o'
|
| 62 |
fdir: 'gpt-4o'
|
| 63 |
+
url: 'https://platform.openai.com/docs/models/gpt-4o'
|
| 64 |
+
- name: 'claude-3.5-sonnet'
|
| 65 |
+
fdir: 'claude3.5-sonnet'
|
| 66 |
+
url: 'https://docs.anthropic.com/en/docs/about-claude/models'
|
| 67 |
- name: 'claude-3-opus'
|
| 68 |
fdir: 'claude-3-opus-20240229'
|
| 69 |
+
url: 'https://docs.anthropic.com/en/docs/about-claude/models'
|
| 70 |
- name: 'claude-3-haiku'
|
| 71 |
fdir: 'claude-3-haiku-20240307'
|
| 72 |
+
url: 'https://docs.anthropic.com/en/docs/about-claude/models'
|
| 73 |
- name: 'claude-3-sonnet'
|
| 74 |
fdir: 'claude-3-sonnet-20240229'
|
| 75 |
+
url: 'https://docs.anthropic.com/en/docs/about-claude/models'
|
| 76 |
- name: 'claude-2.1'
|
| 77 |
fdir: 'claude-2.1'
|
| 78 |
+
url: 'https://docs.anthropic.com/en/docs/about-claude/models'
|
| 79 |
- name: 'claude-instant-1.2'
|
| 80 |
fdir: 'claude-instant-1.2'
|
| 81 |
+
url: 'https://docs.anthropic.com/en/docs/about-claude/models'
|
| 82 |
- name: 'command-r-plus'
|
| 83 |
fdir: 'command-r-plus'
|
| 84 |
+
url: 'https://huggingface.co/CohereForAI/c4ai-command-r-plus'
|
| 85 |
+
- name: 'mistral-7b-v0.3'
|
| 86 |
+
fdir: 'mistral-7b-v0.3'
|
| 87 |
+
url: 'https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3'
|
| 88 |
- name: 'mistral-7b-v0.2'
|
| 89 |
fdir: 'mistral-7b-v0.2'
|
| 90 |
+
url: 'https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2'
|
| 91 |
- name: 'mistral-7b-v0.1'
|
| 92 |
fdir: 'mistral-7b-v0.1'
|
| 93 |
+
url: 'https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1'
|
| 94 |
- name: 'mixtral-8x7b'
|
| 95 |
fdir: 'mixtral-8x7b'
|
| 96 |
+
url: 'https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1'
|
| 97 |
+
- name: 'mistral-large'
|
| 98 |
+
fdir: 'mistral-large'
|
| 99 |
+
url: 'https://mistral.ai/news/mistral-large/'
|
| 100 |
- name: 'yi-1.5-34b'
|
| 101 |
fdir: 'yi-1.5-34b'
|
| 102 |
+
url: 'https://huggingface.co/01-ai/Yi-1.5-34B-Chat'
|
| 103 |
- name: 'yi-1.5-9b'
|
| 104 |
fdir: 'yi-1.5-9b'
|
| 105 |
+
url: 'https://huggingface.co/01-ai/Yi-1.5-9B-Chat'
|
| 106 |
- name: 'qwen-1.5-72b'
|
| 107 |
fdir: 'qwen-1.5-72b'
|
| 108 |
+
url: 'https://huggingface.co/Qwen/Qwen1.5-72B-Chat'
|
| 109 |
- name: 'qwen-1.5-32b'
|
| 110 |
fdir: 'qwen-1.5-32b'
|
| 111 |
+
url: 'https://huggingface.co/Qwen/Qwen1.5-32B-Chat'
|
| 112 |
- name: 'qwen-2-72b'
|
| 113 |
fdir: 'qwen-2-72b'
|
| 114 |
+
url: 'https://huggingface.co/Qwen/Qwen2-72B-Instruct'
|
| 115 |
- name: 'gemma-7b'
|
| 116 |
fdir: 'gemma-7b'
|
| 117 |
+
url: 'https://huggingface.co/google/gemma-7b-it'
|
| 118 |
- name: 'gemma-2b'
|
| 119 |
fdir: 'gemma-2b'
|
| 120 |
+
url: 'https://huggingface.co/google/gemma-2b-it'
|
|
|
|
| 121 |
- name: 'glm-4-9b'
|
| 122 |
fdir: 'glm-4-9b'
|
| 123 |
+
url: 'https://huggingface.co/THUDM/glm-4-9b-chat'
|
|
|
|
|
|
|
|
|
src/about.py
CHANGED
|
@@ -20,6 +20,8 @@ INTRODUCTION_TEXT = """
|
|
| 20 |
LLM_BENCHMARKS_TEXT = f"""
|
| 21 |
## How it works
|
| 22 |
|
|
|
|
|
|
|
| 23 |
### Task
|
| 24 |
The LLMs are evaluated as judges in a pairwise comparison task.
|
| 25 |
Each judge is presented with two **instruction-controllable** summaries and asked to select the better one.
|
|
@@ -47,9 +49,9 @@ The [prompt](https://github.com/princeton-nlp/LLMBar/blob/main/LLMEvaluator/eval
|
|
| 47 |
"""
|
| 48 |
|
| 49 |
CITATION_BUTTON_LABEL = "Please cite our paper if you use InstruSum in your work."
|
| 50 |
-
CITATION_BUTTON_TEXT = r"""@
|
| 51 |
-
title={Benchmarking
|
| 52 |
author={Liu, Yixin and Fabbri, Alexander R and Chen, Jiawen and Zhao, Yilun and Han, Simeng and Joty, Shafiq and Liu, Pengfei and Radev, Dragomir and Wu, Chien-Sheng and Cohan, Arman},
|
| 53 |
-
|
| 54 |
-
|
| 55 |
}"""
|
|
|
|
| 20 |
LLM_BENCHMARKS_TEXT = f"""
|
| 21 |
## How it works
|
| 22 |
|
| 23 |
+

|
| 24 |
+
|
| 25 |
### Task
|
| 26 |
The LLMs are evaluated as judges in a pairwise comparison task.
|
| 27 |
Each judge is presented with two **instruction-controllable** summaries and asked to select the better one.
|
|
|
|
| 49 |
"""
|
| 50 |
|
| 51 |
CITATION_BUTTON_LABEL = "Please cite our paper if you use InstruSum in your work."
|
| 52 |
+
CITATION_BUTTON_TEXT = r"""@inproceedings{liu2024benchmarking,
|
| 53 |
+
title={Benchmarking Generation and Evaluation Capabilities of Large Language Models for Instruction Controllable Summarization},
|
| 54 |
author={Liu, Yixin and Fabbri, Alexander R and Chen, Jiawen and Zhao, Yilun and Han, Simeng and Joty, Shafiq and Liu, Pengfei and Radev, Dragomir and Wu, Chien-Sheng and Cohan, Arman},
|
| 55 |
+
booktitle = "Findings of the Association for Computational Linguistics: NAACL 2024",
|
| 56 |
+
year = "2024",
|
| 57 |
}"""
|
src/populate.py
CHANGED
|
@@ -8,7 +8,7 @@ import numpy as np
|
|
| 8 |
from datasets import load_dataset
|
| 9 |
from .envs import TOKEN
|
| 10 |
|
| 11 |
-
TYPES = ["
|
| 12 |
|
| 13 |
|
| 14 |
def read_json(file_path: str) -> list[dict]:
|
|
@@ -95,7 +95,12 @@ def load_leaderboard() -> pd.DataFrame:
|
|
| 95 |
acc, agr, models_acc, models_agr = pairwise_meta_eval(
|
| 96 |
human_responses, f"./predictions/{fdir}.jsonl", f"./predictions/{fdir}_swap.jsonl"
|
| 97 |
)
|
| 98 |
-
predictions["Model"].append(model["name"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
predictions["Accuracy"].append(acc)
|
| 100 |
predictions["Agreement"].append(agr)
|
| 101 |
predictions["Self-Accuracy"].append(models_acc)
|
|
|
|
| 8 |
from datasets import load_dataset
|
| 9 |
from .envs import TOKEN
|
| 10 |
|
| 11 |
+
TYPES = ["number", "html", "number", "number", "number", "number"]
|
| 12 |
|
| 13 |
|
| 14 |
def read_json(file_path: str) -> list[dict]:
|
|
|
|
| 95 |
acc, agr, models_acc, models_agr = pairwise_meta_eval(
|
| 96 |
human_responses, f"./predictions/{fdir}.jsonl", f"./predictions/{fdir}_swap.jsonl"
|
| 97 |
)
|
| 98 |
+
# predictions["Model"].append(model["name"])
|
| 99 |
+
# predictions["Model"].append(f"[{model['name']}]({model['url']})")
|
| 100 |
+
link = model['url']
|
| 101 |
+
model_name = model['name']
|
| 102 |
+
output = f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
|
| 103 |
+
predictions["Model"].append(output)
|
| 104 |
predictions["Accuracy"].append(acc)
|
| 105 |
predictions["Agreement"].append(agr)
|
| 106 |
predictions["Self-Accuracy"].append(models_acc)
|