import argparse
import ast
import glob
import pickle
import traceback
from datetime import datetime
import pandas as pd
import gradio as gr
import numpy as np
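# Module-level caches for rendered Gradio component values
# (leader_component_values is filled in build_leaderboard_tab;
# basic_component_values appears unused in this file).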
basic_component_values = [None] * 6
leader_component_values = [None] * 5
promo_banner = """
<div style="background-color: #ffcc00; color: black; padding: 10px; text-align: center; font-weight: bold; font-size: 18px; border: 2px solid #000;">
llmarena.ru - USE THE LATEST VERSIONS OF THE BEST CHATBOTS IN RUSSIAN FOR FREE
</div>
"""
def make_default_md_1():
    leaderboard_md = f"""
# 🏆 Russian LLM Arena: Leaderboard
{promo_banner}
"""
    return leaderboard_md

def make_default_md_2():
    leaderboard_md = f"""
LLM Arena is an open crowdsourced platform for evaluating large language models (LLMs) in Russian. We collect pairwise comparisons from people in order to rank LLMs with the Bradley-Terry model and display model ratings on the Elo scale.
The Russian Chatbot Arena depends on community participation, so please contribute by casting your vote!

- To **add your model** to the comparison, message us on Telegram: [Group](https://t.me/+bFEOl-Bdmok4NGUy)
- If you **found a bug** or **have a suggestion**, contact us: [Roman](https://t.me/roman_kucev)
"""
    return leaderboard_md

def make_arena_leaderboard_md(arena_df, last_updated_time):
    total_votes = sum(arena_df["num_battles"])
    total_models = len(arena_df)
    space = " "
    leaderboard_md = f"""
Total #models: **{total_models}**.{space} Total #votes: **{"{:,}".format(total_votes)}**.{space} Last updated: {last_updated_time}.

***Rank (UB)**: the model's rank (upper bound), defined as one plus the number of models that are statistically better than the target model.
Model A is statistically better than model B when A's lower-bound score is greater than B's upper-bound score (with a 95% confidence interval).
See Figure 1 below for a visualization of the confidence intervals of model scores.
"""
    return leaderboard_md

def make_category_arena_leaderboard_md(arena_df, arena_subset_df, name="site_visitors/medium_prompts:style control"):
    total_votes = sum(arena_df["num_battles"])
    total_models = len(arena_df)
    space = " "
    total_subset_votes = sum(arena_subset_df["num_battles"])
    total_subset_models = len(arena_subset_df)
    leaderboard_md = f"""### {cat_name_to_explanation[name]}
#### {space} #models: **{total_subset_models} ({round(total_subset_models/total_models * 100)}%)** {space} #votes: **{"{:,}".format(total_subset_votes)} ({round(total_subset_votes/total_votes * 100)}%)**{space}
"""
    return leaderboard_md

def model_hyperlink(model_name, link):
    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'

def load_leaderboard_table_csv(filename, add_hyperlink=True):
    with open(filename) as f:
        lines = f.readlines()
    heads = [v.strip() for v in lines[0].split(",")]
    rows = []
    for i in range(1, len(lines)):
        row = [v.strip() for v in lines[i].split(",")]
        # parse one CSV row into a dict keyed by column name
        item = {}
        for h, v in zip(heads, row):
            if h == "Arena Elo rating":
                if v != "-":
                    v = int(ast.literal_eval(v))
                else:
                    v = np.nan
            elif h == "MMLU":
                if v != "-":
                    v = round(ast.literal_eval(v) * 100, 1)
                else:
                    v = np.nan
            elif h == "MT-bench (win rate %)":
                if v != "-":
                    v = round(ast.literal_eval(v[:-1]), 1)
                else:
                    v = np.nan
            elif h == "MT-bench (score)":
                if v != "-":
                    v = round(ast.literal_eval(v), 2)
                else:
                    v = np.nan
            item[h] = v
        if add_hyperlink:
            item["Model"] = model_hyperlink(item["Model"], item["Link"])
        rows.append(item)
    return rows

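# Note: load_leaderboard_table_csv assumes (inferred from usage in this file,
# not a documented schema) that the CSV provides at least the columns "key",
# "Model", "Link", "Organization", "License" and "Knowledge cutoff date",
# with "-" marking missing values in the benchmark columns handled above.
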
def create_ranking_str(ranking, ranking_difference):
    if ranking_difference > 0:
        return f"{int(ranking)} \u2191"
    elif ranking_difference < 0:
        return f"{int(ranking)} \u2193"
    else:
        return f"{int(ranking)}"

def recompute_final_ranking(arena_df):
    # compute ranking based on CI
    ranking = {}
    for i, model_a in enumerate(arena_df.index):
        ranking[model_a] = 1
        for j, model_b in enumerate(arena_df.index):
            if i == j:
                continue
            if (
                arena_df.loc[model_b]["rating_q025"]
                > arena_df.loc[model_a]["rating_q975"]
            ):
                ranking[model_a] += 1
    return list(ranking.values())

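# Worked example with hypothetical ratings: given X (q025=1210, q975=1240),
# Y (q025=1195, q975=1225) and Z (q025=1150, q975=1180), both X's and Y's
# lower bounds exceed Z's upper bound, so X and Y get rank 1 and Z gets rank 3.
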
def get_arena_table(arena_df, model_table_df, arena_subset_df=None):
    arena_df = arena_df.sort_values(
        by=["final_ranking", "rating"], ascending=[True, False]
    )
    arena_df["final_ranking"] = recompute_final_ranking(arena_df)
    # sort by rank, then by rating
    arena_df = arena_df.sort_values(
        by=["final_ranking", "rating"], ascending=[True, False]
    )
    if arena_subset_df is not None:
        # filter out models not in the arena_df
        arena_subset_df = arena_subset_df[arena_subset_df.index.isin(arena_df.index)]
        arena_subset_df = arena_subset_df.sort_values(by=["rating"], ascending=False)
        arena_subset_df["final_ranking"] = recompute_final_ranking(arena_subset_df)
        # keep only the models in the subset in arena_df and recompute final_ranking
        arena_df = arena_df[arena_df.index.isin(arena_subset_df.index)]
        arena_df["final_ranking"] = recompute_final_ranking(arena_df)
        # assign no-tie rankings by row order
        arena_subset_df["final_ranking_no_tie"] = range(1, len(arena_subset_df) + 1)
        arena_df["final_ranking_no_tie"] = range(1, len(arena_df) + 1)
        # join arena_df and arena_subset_df on index
        arena_df = arena_subset_df.join(
            arena_df["final_ranking"], rsuffix="_global", how="inner"
        )
        arena_df["ranking_difference"] = (
            arena_df["final_ranking_global"] - arena_df["final_ranking"]
        )
        arena_df = arena_df.sort_values(
            by=["final_ranking", "rating"], ascending=[True, False]
        )
        arena_df["final_ranking"] = arena_df.apply(
            lambda x: create_ranking_str(x["final_ranking"], x["ranking_difference"]),
            axis=1,
        )
    arena_df["final_ranking"] = arena_df["final_ranking"].astype(str)
    values = []
    for i in range(len(arena_df)):
        row = []
        model_key = arena_df.index[i]
        try:
            model_name = model_table_df[model_table_df["key"] == model_key][
                "Model"
            ].values[0]
            ranking = arena_df.iloc[i].get("final_ranking") or i + 1
            row.append(ranking)
            if arena_subset_df is not None:
                row.append(arena_df.iloc[i].get("ranking_difference") or 0)
            row.append(model_name)
            row.append(round(arena_df.iloc[i]["rating"]))
            upper_diff = round(
                arena_df.iloc[i]["rating_q975"] - arena_df.iloc[i]["rating"]
            )
            lower_diff = round(
                arena_df.iloc[i]["rating"] - arena_df.iloc[i]["rating_q025"]
            )
            row.append(f"+{upper_diff}/-{lower_diff}")
            row.append(round(arena_df.iloc[i]["num_battles"]))
            row.append(
                model_table_df[model_table_df["key"] == model_key][
                    "Organization"
                ].values[0]
            )
            row.append(
                model_table_df[model_table_df["key"] == model_key]["License"].values[0]
            )
            cutoff_date = model_table_df[model_table_df["key"] == model_key][
                "Knowledge cutoff date"
            ].values[0]
            if cutoff_date == "-":
                row.append("Unknown")
            else:
                row.append(cutoff_date)
            values.append(row)
        except Exception as e:
            traceback.print_exc()
            print(f"{model_key} - {e}")
    return values

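# Each row produced above follows the column order of the Dataframe components
# below: rank (UB), an optional rank delta when a category subset is shown,
# model name, Arena Elo, 95% CI, vote count, organization, license and
# knowledge cutoff.
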
key_to_category_name = {
    "full": "Overall",
    "crowdsourcing/simple_prompts": "crowdsourcing/simple_prompts",
    "site_visitors/medium_prompts": "site_visitors/medium_prompts",
    "site_visitors/medium_prompts:style control": "site_visitors/medium_prompts:style control",
}
cat_name_to_explanation = {
    "Overall": "All prompts",
    "crowdsourcing/simple_prompts": "Prompts collected via crowdsourcing. Mostly simple ones.",
    "site_visitors/medium_prompts": "Prompts from site visitors. These include more complex prompts.",
    "site_visitors/medium_prompts:style control": "Prompts from site visitors. These include more complex prompts. The [influence of response style](https://lmsys.org/blog/2024-08-28-style-control/) on the score is reduced.",
}
cat_name_to_baseline = {
    "Hard Prompts (English)": "English",
}
actual_categories = [
    # "Overall",
    # "crowdsourcing/simple_prompts",
    "site_visitors/medium_prompts",
    "site_visitors/medium_prompts:style control",
]
req_cat = "site_visitors/medium_prompts:style control"
# selected_category = req_cat if req_cat in actual_categories else "Overall"
selected_category = req_cat if req_cat in actual_categories else "site_visitors/medium_prompts:style control"

def read_elo_file(elo_results_file, leaderboard_table_file):
    arena_dfs = {}
    category_elo_results = {}
    with open(elo_results_file, "rb") as fin:
        elo_results = pickle.load(fin)
    last_updated_time = None
    if selected_category in elo_results:
        last_updated_time = elo_results[selected_category][
            "last_updated_datetime"
        ].split(" ")[0]
    for k in key_to_category_name.keys():
        if k not in elo_results:
            continue
        arena_dfs[key_to_category_name[k]] = elo_results[k]["leaderboard_table_df"]
        category_elo_results[key_to_category_name[k]] = elo_results[k]
    data = load_leaderboard_table_csv(leaderboard_table_file)
    model_table_df = pd.DataFrame(data)
    return last_updated_time, arena_dfs, category_elo_results, elo_results, model_table_df

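# Expected layout of the elo_results pickle, inferred from the lookups above and
# in build_leaderboard_tab (not a documented schema): a dict keyed by category,
# each entry holding "leaderboard_table_df", "last_updated_datetime" and the
# plots "win_fraction_heatmap", "battle_count_heatmap", "bootstrap_elo_rating"
# and "average_win_rate_bar".
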
def build_leaderboard_tab(
    elo_results_file, leaderboard_table_file, show_plot=False, mirror=False
):
    arena_dfs = {}
    arena_df = pd.DataFrame()
    category_elo_results = {}
    last_updated_time, arena_dfs, category_elo_results, elo_results, model_table_df = read_elo_file(
        elo_results_file, leaderboard_table_file
    )
    arena_df = arena_dfs[selected_category]
    p1 = category_elo_results[selected_category]["win_fraction_heatmap"]
    p2 = category_elo_results[selected_category]["battle_count_heatmap"]
    p3 = category_elo_results[selected_category]["bootstrap_elo_rating"]
    p4 = category_elo_results[selected_category]["average_win_rate_bar"]
    # arena_df = arena_dfs["Overall"]
    default_md = make_default_md_1()
    default_md_2 = make_default_md_2()

    with gr.Row():
        with gr.Column(scale=4):
            md_1 = gr.Markdown(default_md, elem_id="leaderboard_markdown")
        with gr.Column(scale=1):
            vote_button = gr.Button("Vote!", link="https://llmarena.ru")
    md_2 = gr.Markdown(default_md_2, elem_id="leaderboard_markdown")

    if leaderboard_table_file:
        data = load_leaderboard_table_csv(leaderboard_table_file)
        model_table_df = pd.DataFrame(data)
        with gr.Tabs() as tabs:
            arena_table_vals = get_arena_table(arena_df, model_table_df)
            with gr.Tab("Arena", id=0):
                md = make_arena_leaderboard_md(arena_dfs[selected_category], last_updated_time)
                lb_description = gr.Markdown(md, elem_id="leaderboard_markdown")
                with gr.Row():
                    with gr.Column(scale=2):
                        category_dropdown = gr.Dropdown(
                            choices=actual_categories,  # Updated categories
                            value=selected_category,  # Default to selected_category
                            label="Category",
                        )
                    default_category_details = make_category_arena_leaderboard_md(
                        arena_df, arena_df, name=selected_category
                    )
                    with gr.Column(scale=4, variant="panel"):
                        category_deets = gr.Markdown(
                            default_category_details, elem_id="category_deets"
                        )
                arena_vals = pd.DataFrame(
                    arena_table_vals,
                    columns=[
                        "Rank* (UB)",
                        "Model",
                        "Arena Elo",
                        "95% CI",
                        "Votes",
                        "Organization",
                        "License",
                        "Knowledge Cutoff",
                    ],
                )
                elo_display_df = gr.Dataframe(
                    headers=[
                        "Rank* (UB)",
                        "Model",
                        "Arena Elo",
                        "95% CI",
                        "Votes",
                        "Organization",
                        "License",
                        "Knowledge Cutoff",
                    ],
                    datatype=[
                        "str",
                        "markdown",
                        "number",
                        "str",
                        "number",
                        "str",
                        "str",
                        "str",
                    ],
                    value=arena_vals.style,
                    elem_id="arena_leaderboard_dataframe",
                    height=700,
                    column_widths=[70, 190, 100, 100, 90, 130, 150, 100],
                    wrap=True,
                )
                gr.Markdown(
                    elem_id="leaderboard_markdown",
                )

    leader_component_values[:] = [default_md, p1, p2, p3, p4]
    if show_plot:
        more_stats_md = gr.Markdown(
            """## More Chatbot Arena Statistics""",
            elem_id="leaderboard_header_markdown",
        )
        with gr.Row():
            with gr.Column():
                gr.Markdown(
                    "#### Figure 1: Confidence Intervals on Model Strength (via Bootstrapping)",
                    elem_id="plot-title",
                )
                plot_3 = gr.Plot(p3, show_label=False)
            with gr.Column():
                gr.Markdown(
                    "#### Figure 2: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)",
                    elem_id="plot-title",
                )
                plot_4 = gr.Plot(p4, show_label=False)
        with gr.Row():
            with gr.Column():
                gr.Markdown(
                    "#### Figure 3: Fraction of Model A Wins for All Non-tied A vs. B Battles",
                    elem_id="plot-title",
                )
                plot_1 = gr.Plot(p1, show_label=False, elem_id="plot-container")
            with gr.Column():
                gr.Markdown(
                    "#### Figure 4: Battle Count for Each Combination of Models (without Ties)",
                    elem_id="plot-title",
                )
                plot_2 = gr.Plot(p2, show_label=False)
    else:
        gr.Markdown(
            """
            """,
            elem_id="leaderboard_markdown",
        )
    def update_leaderboard_df(arena_table_vals):
        elo_dataframe = pd.DataFrame(
            arena_table_vals,
            columns=[
                "Rank* (UB)",
                "Delta",
                "Model",
                "Arena Elo",
                "95% CI",
                "Votes",
                "Organization",
                "License",
                "Knowledge Cutoff",
            ],
        )

        # color rank cells green when a model moved up vs. its global rank,
        # red when it moved down
        def highlight_max(s):
            return [
                "color: green; font-weight: bold"
                if "\u2191" in v
                else "color: red; font-weight: bold"
                if "\u2193" in v
                else ""
                for v in s
            ]

        def highlight_rank_max(s):
            return [
                "color: green; font-weight: bold"
                if v > 0
                else "color: red; font-weight: bold"
                if v < 0
                else ""
                for v in s
            ]

        return elo_dataframe.style.apply(highlight_max, subset=["Rank* (UB)"]).apply(
            highlight_rank_max, subset=["Delta"]
        )
    def update_leaderboard_and_plots(category):
        _, arena_dfs, category_elo_results, _, model_table_df = read_elo_file(
            elo_results_file, leaderboard_table_file
        )
        arena_subset_df = arena_dfs[category]
        arena_subset_df = arena_subset_df[arena_subset_df["num_battles"] > 200]
        elo_subset_results = category_elo_results[category]
        baseline_category = cat_name_to_baseline.get(category, selected_category)
        arena_df = arena_dfs[baseline_category]
        arena_values = get_arena_table(
            arena_df,
            model_table_df,
            arena_subset_df=arena_subset_df if category != "Overall" else None,
        )
        if category != "Overall":
            arena_values = update_leaderboard_df(arena_values)
            arena_values = gr.Dataframe(
                headers=[
                    "Rank* (UB)",
                    "Delta",
                    "Model",
                    "Arena Elo",
                    "95% CI",
                    "Votes",
                    "Organization",
                    "License",
                    "Knowledge Cutoff",
                ],
                datatype=[
                    "str",
                    "number",
                    "markdown",
                    "number",
                    "str",
                    "number",
                    "str",
                    "str",
                    "str",
                ],
                value=arena_values,
                elem_id="arena_leaderboard_dataframe",
                height=700,
                column_widths=[70, 70, 200, 90, 100, 90, 120, 150, 100],
                wrap=True,
            )
        else:
            arena_values = gr.Dataframe(
                headers=[
                    "Rank* (UB)",
                    "Model",
                    "Arena Elo",
                    "95% CI",
                    "Votes",
                    "Organization",
                    "License",
                    "Knowledge Cutoff",
                ],
                datatype=[
                    "str",
                    "markdown",
                    "number",
                    "str",
                    "number",
                    "str",
                    "str",
                    "str",
                ],
                value=arena_values,
                elem_id="arena_leaderboard_dataframe",
                height=700,
                column_widths=[70, 190, 100, 100, 90, 140, 150, 100],
                wrap=True,
            )
        p1 = elo_subset_results["win_fraction_heatmap"]
        p2 = elo_subset_results["battle_count_heatmap"]
        p3 = elo_subset_results["bootstrap_elo_rating"]
        p4 = elo_subset_results["average_win_rate_bar"]
        more_stats_md = f"""## More Statistics for Chatbot Arena - {category}
"""
        leaderboard_md = make_category_arena_leaderboard_md(
            arena_df, arena_subset_df, name=category
        )
        return arena_values, p1, p2, p3, p4, more_stats_md, leaderboard_md
    if leaderboard_table_file:
        category_dropdown.change(
            fn=update_leaderboard_and_plots,
            inputs=[category_dropdown],
            outputs=[
                elo_display_df,
                plot_1,
                plot_2,
                plot_3,
                plot_4,
                more_stats_md,
                category_deets,
            ],
        )

    if show_plot and leaderboard_table_file:
        return [md_1, md_2, lb_description, category_deets, elo_display_df, plot_1, plot_2, plot_3, plot_4]
    return [md_1]

def build_demo(elo_results_file, leaderboard_table_file):
    text_size = gr.themes.sizes.text_lg
    theme = gr.themes.Default.load("theme.json")
    theme.text_size = text_size
    theme.set(
        button_large_text_size="40px",
        button_small_text_size="40px",
        button_large_text_weight="1000",
        button_small_text_weight="1000",
        button_shadow="*shadow_drop_lg",
        button_shadow_hover="*shadow_drop_lg",
        checkbox_label_shadow="*shadow_drop_lg",
        button_shadow_active="*shadow_inset",
        button_secondary_background_fill="*primary_300",
        button_secondary_background_fill_dark="*primary_700",
        button_secondary_background_fill_hover="*primary_200",
        button_secondary_background_fill_hover_dark="*primary_500",
        button_secondary_text_color="*primary_800",
        button_secondary_text_color_dark="white",
    )
    with gr.Blocks(
        title="LLM Arena: Leaderboard",
        theme=theme,
        css=block_css,
    ) as demo:
        build_leaderboard_tab(
            elo_results_file, leaderboard_table_file, show_plot=True, mirror=True
        )
    return demo

block_css = """
#notice_markdown .prose {
    font-size: 110% !important;
}
#notice_markdown th {
    display: none;
}
#notice_markdown td {
    padding-top: 6px;
    padding-bottom: 6px;
}
#arena_leaderboard_dataframe table {
    font-size: 110%;
}
#full_leaderboard_dataframe table {
    font-size: 110%;
}
#model_description_markdown {
    font-size: 110% !important;
}
#leaderboard_markdown .prose {
    font-size: 110% !important;
}
#leaderboard_markdown td {
    padding-top: 6px;
    padding-bottom: 6px;
}
#leaderboard_dataframe td {
    line-height: 0.1em;
}
#about_markdown .prose {
    font-size: 110% !important;
}
#ack_markdown .prose {
    font-size: 110% !important;
}
#chatbot .prose {
    font-size: 105% !important;
}
.sponsor-image-about img {
    margin: 0 20px;
    margin-top: 20px;
    height: 40px;
    max-height: 100%;
    width: auto;
    float: left;
}
.chatbot h1, .chatbot h2, .chatbot h3 {
    margin-top: 8px; /* Adjust the value as needed */
    margin-bottom: 0px; /* Adjust the value as needed */
    padding-bottom: 0px;
}
.chatbot h1 {
    font-size: 130%;
}
.chatbot h2 {
    font-size: 120%;
}
.chatbot h3 {
    font-size: 110%;
}
.chatbot p:not(:first-child) {
    margin-top: 8px;
}
.typing {
    display: inline-block;
}
.cursor {
    display: inline-block;
    width: 7px;
    height: 1em;
    background-color: black;
    vertical-align: middle;
    animation: blink 1s infinite;
}
.dark .cursor {
    display: inline-block;
    width: 7px;
    height: 1em;
    background-color: white;
    vertical-align: middle;
    animation: blink 1s infinite;
}
@keyframes blink {
    0%, 50% { opacity: 1; }
    50.1%, 100% { opacity: 0; }
}
.app {
    max-width: 100% !important;
    padding: 20px !important;
}
a {
    color: #1976D2; /* Your current link color, a shade of blue */
    text-decoration: none; /* Removes underline from links */
}
a:hover {
    color: #63A4FF; /* This can be any color you choose for hover */
    text-decoration: underline; /* Adds underline on hover */
}
"""

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--share", action="store_true")
    parser.add_argument("--host", default="0.0.0.0")
    parser.add_argument("--port", type=int, default=7860)
    args = parser.parse_args()

    # Pick the most recent results files by their numeric suffix
    # (the int(...) slices strip the "elo_results_" / "leaderboard_table_"
    # prefixes and the file extensions).
    elo_result_files = glob.glob("elo_results_*.pkl")
    elo_result_files.sort(key=lambda x: int(x[12:-4]))
    elo_result_file = elo_result_files[-1]
    leaderboard_table_files = glob.glob("leaderboard_table_*.csv")
    leaderboard_table_files.sort(key=lambda x: int(x[18:-4]))
    leaderboard_table_file = leaderboard_table_files[-1]

    demo = build_demo(elo_result_file, leaderboard_table_file)
    demo.launch(
        server_name=args.host, server_port=args.port, share=args.share, show_api=False
    )
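# Example invocation (assumes elo_results_*.pkl and leaderboard_table_*.csv
# artifacts with numeric suffixes are present in the working directory):
#   python app.py --host 0.0.0.0 --port 7860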