add deprecated
Browse files
app.py
CHANGED
|
@@ -3,6 +3,7 @@ import ast
|
|
| 3 |
import glob
|
| 4 |
import pickle
|
| 5 |
import traceback
|
|
|
|
| 6 |
from datetime import datetime
|
| 7 |
|
| 8 |
import pandas as pd
|
|
@@ -22,13 +23,13 @@ promo_banner = """
|
|
| 22 |
|
| 23 |
deprecated_model_name = [
|
| 24 |
"GigaChat 3.1.25.3",
|
| 25 |
-
"GigaChat-Pro 2.2.25.3",
|
| 26 |
"saiga_llama3_8b_v6",
|
| 27 |
"saiga_phi3_medium",
|
| 28 |
"GigaChat-Plus 3.1.25.3",
|
| 29 |
"GigaChat-Pro 4.0.26.8",
|
| 30 |
"GigaChat 4.0.26.8",
|
| 31 |
-
"xAI: Grok 2",
|
| 32 |
"GigaChat-Pro 4.0.26.15",
|
| 33 |
"GigaChat 4.0.26.15",
|
| 34 |
"YandexGPT Experimental", "yandex-gpt-arena"
|
|
@@ -88,6 +89,43 @@ def model_hyperlink(model_name, link):
|
|
| 88 |
return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
|
| 89 |
|
| 90 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
def load_leaderboard_table_csv(filename, add_hyperlink=True):
|
| 92 |
lines = open(filename).readlines()
|
| 93 |
heads = [v.strip() for v in lines[0].split(",")]
|
|
@@ -151,14 +189,18 @@ def recompute_final_ranking(arena_df):
|
|
| 151 |
|
| 152 |
|
| 153 |
def get_arena_table(arena_df, model_table_df, arena_subset_df=None, hidden_models=None):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
arena_df = arena_df.sort_values(
|
| 155 |
by=["final_ranking", "rating"], ascending=[True, False]
|
| 156 |
)
|
| 157 |
|
| 158 |
-
if hidden_models:
|
| 159 |
-
arena_df = arena_df[~arena_df.index.isin(hidden_models)].copy() # Filter deprecated models
|
| 160 |
-
|
| 161 |
arena_df["final_ranking"] = recompute_final_ranking(arena_df)
|
|
|
|
|
|
|
|
|
|
| 162 |
|
| 163 |
# sort by rating
|
| 164 |
if arena_subset_df is not None:
|
|
@@ -321,7 +363,7 @@ def build_leaderboard_tab(
|
|
| 321 |
model_table_df = pd.DataFrame(data)
|
| 322 |
|
| 323 |
with gr.Tabs() as tabs:
|
| 324 |
-
arena_table_vals = get_arena_table(arena_df, model_table_df)
|
| 325 |
|
| 326 |
with gr.Tab("Arena", id=0):
|
| 327 |
md = make_arena_leaderboard_md(arena_dfs[selected_category], last_updated_time)
|
|
@@ -330,17 +372,18 @@ def build_leaderboard_tab(
|
|
| 330 |
with gr.Row():
|
| 331 |
with gr.Column(scale=2):
|
| 332 |
category_dropdown = gr.Dropdown(
|
| 333 |
-
choices=actual_categories,
|
| 334 |
-
value=selected_category,
|
| 335 |
label="Category",
|
| 336 |
)
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
|
|
|
| 344 |
default_category_details = make_category_arena_leaderboard_md(
|
| 345 |
arena_df, arena_df, name=selected_category
|
| 346 |
)
|
|
@@ -481,7 +524,7 @@ def build_leaderboard_tab(
|
|
| 481 |
)
|
| 482 |
|
| 483 |
def update_leaderboard_and_plots(category, filters):
|
| 484 |
-
_, arena_dfs, category_elo_results, _
|
| 485 |
|
| 486 |
arena_subset_df = arena_dfs[category]
|
| 487 |
arena_subset_df = arena_subset_df[arena_subset_df["num_battles"] > 200]
|
|
@@ -493,7 +536,25 @@ def build_leaderboard_tab(
|
|
| 493 |
arena_df,
|
| 494 |
model_table_df,
|
| 495 |
arena_subset_df=arena_subset_df if category != "Overall" else None,
|
| 496 |
-
hidden_models=None if
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 497 |
)
|
| 498 |
if category != "Overall":
|
| 499 |
arena_values = update_leaderboard_df(arena_values)
|
|
@@ -569,7 +630,7 @@ def build_leaderboard_tab(
|
|
| 569 |
if leaderboard_table_file:
|
| 570 |
category_dropdown.change(
|
| 571 |
fn=update_leaderboard_and_plots,
|
| 572 |
-
inputs=[category_dropdown, category_checkbox],
|
| 573 |
outputs=[
|
| 574 |
elo_display_df,
|
| 575 |
plot_1,
|
|
@@ -580,8 +641,8 @@ def build_leaderboard_tab(
|
|
| 580 |
category_deets,
|
| 581 |
],
|
| 582 |
)
|
| 583 |
-
category_checkbox.change(
|
| 584 |
-
|
| 585 |
inputs=[category_dropdown, category_checkbox],
|
| 586 |
outputs=[
|
| 587 |
elo_display_df,
|
|
|
|
| 3 |
import glob
|
| 4 |
import pickle
|
| 5 |
import traceback
|
| 6 |
+
import numpy as np
|
| 7 |
from datetime import datetime
|
| 8 |
|
| 9 |
import pandas as pd
|
|
|
|
| 23 |
|
| 24 |
deprecated_model_name = [
|
| 25 |
"GigaChat 3.1.25.3",
|
| 26 |
+
"GigaChat-Pro 2.2.25.3",
|
| 27 |
"saiga_llama3_8b_v6",
|
| 28 |
"saiga_phi3_medium",
|
| 29 |
"GigaChat-Plus 3.1.25.3",
|
| 30 |
"GigaChat-Pro 4.0.26.8",
|
| 31 |
"GigaChat 4.0.26.8",
|
| 32 |
+
"xAI: Grok 2",
|
| 33 |
"GigaChat-Pro 4.0.26.15",
|
| 34 |
"GigaChat 4.0.26.15",
|
| 35 |
"YandexGPT Experimental", "yandex-gpt-arena"
|
|
|
|
| 89 |
return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
|
| 90 |
|
| 91 |
|
| 92 |
+
def filter_deprecated_models_plots(fig, hidden_models=None):
    """
    Removes deprecated models from the first trace of a Plotly figure.

    Only the first trace (``fig.data[0]``) is inspected and filtered, and the
    figure is modified in place. Supported trace types are ``heatmap`` (both
    rows and columns are filtered), ``scatter`` (points, hover text and the
    ``error_y`` arrays are filtered) and ``bar``. Any other trace type is
    returned untouched.

    Args:
        fig: The Plotly figure object, or None.
        hidden_models: A list of model names to remove. None or an empty
            list means nothing is removed.

    Returns:
        The same figure object with the hidden models removed, or None when
        ``fig`` is None.
    """
    if fig is None:
        return None

    # Nothing to hide, or no trace to filter.
    if hidden_models is None or len(hidden_models) == 0 or len(fig.data) == 0:
        return fig

    hidden = list(hidden_models)

    def keep_mask(labels):
        # Boolean mask that is True for every label that is NOT hidden.
        return ~np.isin(np.asarray(labels), hidden)

    def select(values, mask):
        # Trace arrays may be tuples or lists, which do not accept a boolean
        # mask, so go through an ndarray. Unset (None) or scalar properties
        # are returned unchanged because there is nothing to filter.
        arr = np.asarray(values)
        if arr.ndim == 0:
            return values
        return arr[mask]

    trace = fig.data[0]

    if trace.type == 'heatmap':
        # Without axis labels (or values) there are no model names to match.
        if trace.x is None or trace.y is None or trace.z is None:
            return fig
        mask_x = keep_mask(trace.x)
        mask_y = keep_mask(trace.y)
        # Compute every new value before assigning so that the masks always
        # refer to the original, unfiltered axes.
        new_x = select(trace.x, mask_x)
        new_y = select(trace.y, mask_y)
        new_z = np.asarray(trace.z)[np.ix_(mask_y, mask_x)]
        trace.x, trace.y, trace.z = new_x, new_y, new_z
    elif trace.type == 'scatter':
        if trace.x is None:
            return fig
        mask = keep_mask(trace.x)
        trace.x, trace.y, trace.text = (
            select(trace.x, mask),
            select(trace.y, mask),
            select(trace.text, mask),
        )
        # Keep the error bars aligned with the remaining points.
        error_y = trace.error_y
        if error_y is not None:
            for key in ['array', 'arrayminus']:
                if key in error_y and error_y[key] is not None:
                    error_y[key] = select(error_y[key], mask)
    elif trace.type == 'bar':
        if trace.x is None:
            return fig
        mask = keep_mask(trace.x)
        trace.x, trace.y = select(trace.x, mask), select(trace.y, mask)

    return fig
|
| 128 |
+
|
| 129 |
def load_leaderboard_table_csv(filename, add_hyperlink=True):
|
| 130 |
lines = open(filename).readlines()
|
| 131 |
heads = [v.strip() for v in lines[0].split(",")]
|
|
|
|
| 189 |
|
| 190 |
|
| 191 |
def get_arena_table(arena_df, model_table_df, arena_subset_df=None, hidden_models=None):
|
| 192 |
+
# Apply hidden_models filter first
|
| 193 |
+
if hidden_models:
|
| 194 |
+
arena_df = arena_df[~arena_df.index.isin(hidden_models)].copy()
|
| 195 |
+
|
| 196 |
arena_df = arena_df.sort_values(
|
| 197 |
by=["final_ranking", "rating"], ascending=[True, False]
|
| 198 |
)
|
| 199 |
|
|
|
|
|
|
|
|
|
|
| 200 |
arena_df["final_ranking"] = recompute_final_ranking(arena_df)
|
| 201 |
+
arena_df = arena_df.sort_values(
|
| 202 |
+
by=["final_ranking", "rating"], ascending=[True, False]
|
| 203 |
+
)
|
| 204 |
|
| 205 |
# sort by rating
|
| 206 |
if arena_subset_df is not None:
|
|
|
|
| 363 |
model_table_df = pd.DataFrame(data)
|
| 364 |
|
| 365 |
with gr.Tabs() as tabs:
|
| 366 |
+
arena_table_vals = get_arena_table(arena_df, model_table_df, hidden_models=deprecated_model_name)
|
| 367 |
|
| 368 |
with gr.Tab("Arena", id=0):
|
| 369 |
md = make_arena_leaderboard_md(arena_dfs[selected_category], last_updated_time)
|
|
|
|
| 372 |
with gr.Row():
|
| 373 |
with gr.Column(scale=2):
|
| 374 |
category_dropdown = gr.Dropdown(
|
| 375 |
+
choices=actual_categories,
|
| 376 |
+
value=selected_category,
|
| 377 |
label="Category",
|
| 378 |
)
|
| 379 |
+
|
| 380 |
+
with gr.Column(scale=2):
|
| 381 |
+
category_checkbox = gr.CheckboxGroup(
|
| 382 |
+
["Deprecated"],
|
| 383 |
+
label="Filter",
|
| 384 |
+
value=[],
|
| 385 |
+
info="",
|
| 386 |
+
)
|
| 387 |
default_category_details = make_category_arena_leaderboard_md(
|
| 388 |
arena_df, arena_df, name=selected_category
|
| 389 |
)
|
|
|
|
| 524 |
)
|
| 525 |
|
| 526 |
def update_leaderboard_and_plots(category, filters):
|
| 527 |
+
_, arena_dfs, category_elo_results, _, model_table_df = read_elo_file(elo_results_file, leaderboard_table_file)
|
| 528 |
|
| 529 |
arena_subset_df = arena_dfs[category]
|
| 530 |
arena_subset_df = arena_subset_df[arena_subset_df["num_battles"] > 200]
|
|
|
|
| 536 |
arena_df,
|
| 537 |
model_table_df,
|
| 538 |
arena_subset_df=arena_subset_df if category != "Overall" else None,
|
| 539 |
+
hidden_models=(None if len(filters) > 0 and "Deprecated" in filters else deprecated_model_name)
|
| 540 |
+
)
|
| 541 |
+
|
| 542 |
+
# Filter plots based on deprecated models
|
| 543 |
+
p1 = filter_deprecated_models_plots(
|
| 544 |
+
elo_subset_results["win_fraction_heatmap"],
|
| 545 |
+
hidden_models=(None if len(filters) > 0 and "Deprecated" in filters else deprecated_model_name)
|
| 546 |
+
)
|
| 547 |
+
p2 = filter_deprecated_models_plots(
|
| 548 |
+
elo_subset_results["battle_count_heatmap"],
|
| 549 |
+
hidden_models=(None if len(filters) > 0 and "Deprecated" in filters else deprecated_model_name)
|
| 550 |
+
)
|
| 551 |
+
p3 = filter_deprecated_models_plots(
|
| 552 |
+
elo_subset_results["bootstrap_elo_rating"],
|
| 553 |
+
hidden_models=(None if len(filters) > 0 and "Deprecated" in filters else deprecated_model_name)
|
| 554 |
+
)
|
| 555 |
+
p4 = filter_deprecated_models_plots(
|
| 556 |
+
elo_subset_results["average_win_rate_bar"],
|
| 557 |
+
hidden_models=(None if len(filters) > 0 and "Deprecated" in filters else deprecated_model_name)
|
| 558 |
)
|
| 559 |
if category != "Overall":
|
| 560 |
arena_values = update_leaderboard_df(arena_values)
|
|
|
|
| 630 |
if leaderboard_table_file:
|
| 631 |
category_dropdown.change(
|
| 632 |
fn=update_leaderboard_and_plots,
|
| 633 |
+
inputs=[category_dropdown, category_checkbox],
|
| 634 |
outputs=[
|
| 635 |
elo_display_df,
|
| 636 |
plot_1,
|
|
|
|
| 641 |
category_deets,
|
| 642 |
],
|
| 643 |
)
|
| 644 |
+
category_checkbox.change(
|
| 645 |
+
update_leaderboard_and_plots,
|
| 646 |
inputs=[category_dropdown, category_checkbox],
|
| 647 |
outputs=[
|
| 648 |
elo_display_df,
|