LLMArena committed on
Commit
15dd4eb
·
verified ·
1 Parent(s): 38cea8b

add deprecated

Browse files
Files changed (1) hide show
  1. app.py +81 -20
app.py CHANGED
@@ -3,6 +3,7 @@ import ast
3
  import glob
4
  import pickle
5
  import traceback
 
6
  from datetime import datetime
7
 
8
  import pandas as pd
@@ -22,13 +23,13 @@ promo_banner = """
22
 
23
  deprecated_model_name = [
24
  "GigaChat 3.1.25.3",
25
- "GigaChat-Pro 2.2.25.3",
26
  "saiga_llama3_8b_v6",
27
  "saiga_phi3_medium",
28
  "GigaChat-Plus 3.1.25.3",
29
  "GigaChat-Pro 4.0.26.8",
30
  "GigaChat 4.0.26.8",
31
- "xAI: Grok 2",
32
  "GigaChat-Pro 4.0.26.15",
33
  "GigaChat 4.0.26.15",
34
  "YandexGPT Experimental", "yandex-gpt-arena"
@@ -88,6 +89,43 @@ def model_hyperlink(model_name, link):
88
  return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
89
 
90
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  def load_leaderboard_table_csv(filename, add_hyperlink=True):
92
  lines = open(filename).readlines()
93
  heads = [v.strip() for v in lines[0].split(",")]
@@ -151,14 +189,18 @@ def recompute_final_ranking(arena_df):
151
 
152
 
153
  def get_arena_table(arena_df, model_table_df, arena_subset_df=None, hidden_models=None):
 
 
 
 
154
  arena_df = arena_df.sort_values(
155
  by=["final_ranking", "rating"], ascending=[True, False]
156
  )
157
 
158
- if hidden_models:
159
- arena_df = arena_df[~arena_df.index.isin(hidden_models)].copy() # Filter deprecated models
160
-
161
  arena_df["final_ranking"] = recompute_final_ranking(arena_df)
 
 
 
162
 
163
  # sort by rating
164
  if arena_subset_df is not None:
@@ -321,7 +363,7 @@ def build_leaderboard_tab(
321
  model_table_df = pd.DataFrame(data)
322
 
323
  with gr.Tabs() as tabs:
324
- arena_table_vals = get_arena_table(arena_df, model_table_df)
325
 
326
  with gr.Tab("Arena", id=0):
327
  md = make_arena_leaderboard_md(arena_dfs[selected_category], last_updated_time)
@@ -330,17 +372,18 @@ def build_leaderboard_tab(
330
  with gr.Row():
331
  with gr.Column(scale=2):
332
  category_dropdown = gr.Dropdown(
333
- choices=actual_categories, # Updated categories
334
- value=selected_category, # Default to selected_category
335
  label="Category",
336
  )
337
- with gr.Column(scale=2): # New CheckboxGroup for deprecated models
338
- category_checkbox = gr.CheckboxGroup(
339
- ["Show Deprecated Models"],
340
- label="Filter",
341
- info="",
342
- )
343
-
 
344
  default_category_details = make_category_arena_leaderboard_md(
345
  arena_df, arena_df, name=selected_category
346
  )
@@ -481,7 +524,7 @@ def build_leaderboard_tab(
481
  )
482
 
483
  def update_leaderboard_and_plots(category, filters):
484
- _, arena_dfs, category_elo_results, _ , model_table_df = read_elo_file(elo_results_file, leaderboard_table_file)
485
 
486
  arena_subset_df = arena_dfs[category]
487
  arena_subset_df = arena_subset_df[arena_subset_df["num_battles"] > 200]
@@ -493,7 +536,25 @@ def build_leaderboard_tab(
493
  arena_df,
494
  model_table_df,
495
  arena_subset_df=arena_subset_df if category != "Overall" else None,
496
- hidden_models=None if "Show Deprecated Models" in filters else deprecated_model_name # Pass filter value
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
497
  )
498
  if category != "Overall":
499
  arena_values = update_leaderboard_df(arena_values)
@@ -569,7 +630,7 @@ def build_leaderboard_tab(
569
  if leaderboard_table_file:
570
  category_dropdown.change(
571
  fn=update_leaderboard_and_plots,
572
- inputs=[category_dropdown, category_checkbox], # Pass checkbox value
573
  outputs=[
574
  elo_display_df,
575
  plot_1,
@@ -580,8 +641,8 @@ def build_leaderboard_tab(
580
  category_deets,
581
  ],
582
  )
583
- category_checkbox.change( # Add a separate change handler for the checkbox
584
- fn=update_leaderboard_and_plots,
585
  inputs=[category_dropdown, category_checkbox],
586
  outputs=[
587
  elo_display_df,
 
3
  import glob
4
  import pickle
5
  import traceback
6
+ import numpy as np
7
  from datetime import datetime
8
 
9
  import pandas as pd
 
23
 
24
  deprecated_model_name = [
25
  "GigaChat 3.1.25.3",
26
+ "GigaChat-Pro 2.2.25.3",
27
  "saiga_llama3_8b_v6",
28
  "saiga_phi3_medium",
29
  "GigaChat-Plus 3.1.25.3",
30
  "GigaChat-Pro 4.0.26.8",
31
  "GigaChat 4.0.26.8",
32
+ "xAI: Grok 2",
33
  "GigaChat-Pro 4.0.26.15",
34
  "GigaChat 4.0.26.15",
35
  "YandexGPT Experimental", "yandex-gpt-arena"
 
89
  return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
90
 
91
 
92
def filter_deprecated_models_plots(fig, hidden_models=None):
    """Remove hidden (deprecated) models from the first trace of a figure.

    Only ``fig.data[0]`` is inspected, and only traces whose ``type`` is
    ``"heatmap"``, ``"scatter"`` or ``"bar"`` are filtered; any other trace
    type is left untouched. The figure is modified in place.

    Args:
        fig: The Plotly figure object, or ``None``.
        hidden_models: Collection of model names to remove. ``None`` or an
            empty collection disables filtering.

    Returns:
        ``None`` when ``fig`` is ``None``; otherwise the same ``fig`` object.
    """
    if fig is None:
        return None

    # Nothing to hide, or no trace to filter: hand the figure back untouched.
    if hidden_models is None or len(hidden_models) == 0 or not fig.data:
        return fig

    trace = fig.data[0]
    if trace.x is None:
        return fig

    # np.isin does not treat a set as a collection of values, so normalise
    # whatever collection was passed into a list first.
    hidden = list(hidden_models)
    keep_x = ~np.isin(np.asarray(trace.x), hidden)

    if trace.type == "heatmap":
        if trace.y is None or trace.z is None:
            return fig
        keep_y = ~np.isin(np.asarray(trace.y), hidden)
        # Compute every filtered value before assigning so that the masks
        # always refer to the original, unfiltered axes.
        new_z = np.asarray(trace.z)[np.ix_(keep_y, keep_x)]
        new_x = np.asarray(trace.x)[keep_x]
        new_y = np.asarray(trace.y)[keep_y]
        trace.x, trace.y, trace.z = new_x, new_y, new_z
    elif trace.type == "scatter":
        new_x = _apply_mask(trace.x, keep_x)
        new_y = _apply_mask(trace.y, keep_x)
        new_text = _apply_mask(trace.text, keep_x)
        trace.x, trace.y, trace.text = new_x, new_y, new_text
        error_y = getattr(trace, "error_y", None)
        if error_y is not None:
            for key in ("array", "arrayminus"):
                # NOTE(review): an error-bar array that was never set is
                # presumably reported as None - skip it instead of indexing.
                values = getattr(error_y, key, None)
                if values is not None:
                    setattr(error_y, key, _apply_mask(values, keep_x))
    elif trace.type == "bar":
        new_x = _apply_mask(trace.x, keep_x)
        new_y = _apply_mask(trace.y, keep_x)
        trace.x, trace.y = new_x, new_y

    return fig


def _apply_mask(values, mask):
    """Return ``values`` filtered by the boolean ``mask`` as a NumPy array.

    ``None`` and scalar values (e.g. a single text string) are returned
    unchanged because there is nothing per-model to filter. Sequences such as
    tuples or lists are converted to arrays first, since they cannot be
    indexed with a boolean mask directly.
    """
    if values is None or np.ndim(values) == 0:
        return values
    return np.asarray(values)[mask]
128
+
129
  def load_leaderboard_table_csv(filename, add_hyperlink=True):
130
  lines = open(filename).readlines()
131
  heads = [v.strip() for v in lines[0].split(",")]
 
189
 
190
 
191
  def get_arena_table(arena_df, model_table_df, arena_subset_df=None, hidden_models=None):
192
+ # Apply hidden_models filter first
193
+ if hidden_models:
194
+ arena_df = arena_df[~arena_df.index.isin(hidden_models)].copy()
195
+
196
  arena_df = arena_df.sort_values(
197
  by=["final_ranking", "rating"], ascending=[True, False]
198
  )
199
 
 
 
 
200
  arena_df["final_ranking"] = recompute_final_ranking(arena_df)
201
+ arena_df = arena_df.sort_values(
202
+ by=["final_ranking", "rating"], ascending=[True, False]
203
+ )
204
 
205
  # sort by rating
206
  if arena_subset_df is not None:
 
363
  model_table_df = pd.DataFrame(data)
364
 
365
  with gr.Tabs() as tabs:
366
+ arena_table_vals = get_arena_table(arena_df, model_table_df, hidden_models=deprecated_model_name)
367
 
368
  with gr.Tab("Arena", id=0):
369
  md = make_arena_leaderboard_md(arena_dfs[selected_category], last_updated_time)
 
372
  with gr.Row():
373
  with gr.Column(scale=2):
374
  category_dropdown = gr.Dropdown(
375
+ choices=actual_categories,
376
+ value=selected_category,
377
  label="Category",
378
  )
379
+
380
+ with gr.Column(scale=2):
381
+ category_checkbox = gr.CheckboxGroup(
382
+ ["Deprecated"],
383
+ label="Filter",
384
+ value=[],
385
+ info="",
386
+ )
387
  default_category_details = make_category_arena_leaderboard_md(
388
  arena_df, arena_df, name=selected_category
389
  )
 
524
  )
525
 
526
  def update_leaderboard_and_plots(category, filters):
527
+ _, arena_dfs, category_elo_results, _, model_table_df = read_elo_file(elo_results_file, leaderboard_table_file)
528
 
529
  arena_subset_df = arena_dfs[category]
530
  arena_subset_df = arena_subset_df[arena_subset_df["num_battles"] > 200]
 
536
  arena_df,
537
  model_table_df,
538
  arena_subset_df=arena_subset_df if category != "Overall" else None,
539
+ hidden_models=(None if len(filters) > 0 and "Deprecated" in filters else deprecated_model_name)
540
+ )
541
+
542
+ # Filter plots based on deprecated models
543
+ p1 = filter_deprecated_models_plots(
544
+ elo_subset_results["win_fraction_heatmap"],
545
+ hidden_models=(None if len(filters) > 0 and "Deprecated" in filters else deprecated_model_name)
546
+ )
547
+ p2 = filter_deprecated_models_plots(
548
+ elo_subset_results["battle_count_heatmap"],
549
+ hidden_models=(None if len(filters) > 0 and "Deprecated" in filters else deprecated_model_name)
550
+ )
551
+ p3 = filter_deprecated_models_plots(
552
+ elo_subset_results["bootstrap_elo_rating"],
553
+ hidden_models=(None if len(filters) > 0 and "Deprecated" in filters else deprecated_model_name)
554
+ )
555
+ p4 = filter_deprecated_models_plots(
556
+ elo_subset_results["average_win_rate_bar"],
557
+ hidden_models=(None if len(filters) > 0 and "Deprecated" in filters else deprecated_model_name)
558
  )
559
  if category != "Overall":
560
  arena_values = update_leaderboard_df(arena_values)
 
630
  if leaderboard_table_file:
631
  category_dropdown.change(
632
  fn=update_leaderboard_and_plots,
633
+ inputs=[category_dropdown, category_checkbox],
634
  outputs=[
635
  elo_display_df,
636
  plot_1,
 
641
  category_deets,
642
  ],
643
  )
644
+ category_checkbox.change(
645
+ update_leaderboard_and_plots,
646
  inputs=[category_dropdown, category_checkbox],
647
  outputs=[
648
  elo_display_df,