Spaces:
Running
Running
Amber Tanaka
committed on
copy changes around graph (#47)
Browse files
- category_page_builder.py +2 -0
- content.py +13 -2
- leaderboard_transformer.py +2 -2
- main_page.py +2 -0
- ui_components.py +3 -2
category_page_builder.py
CHANGED
|
@@ -20,6 +20,7 @@ def build_category_page(CATEGORY_NAME, PAGE_DESCRIPTION):
|
|
| 20 |
with gr.Tab("Results: Test Set") as test_tab:
|
| 21 |
# Repeat the process for the "test" split
|
| 22 |
if not test_df.empty:
|
|
|
|
| 23 |
create_leaderboard_display(
|
| 24 |
full_df=test_df,
|
| 25 |
tag_map=test_tag_map,
|
|
@@ -36,6 +37,7 @@ def build_category_page(CATEGORY_NAME, PAGE_DESCRIPTION):
|
|
| 36 |
with gr.Tab("Results: Validation Set") as validation_tab:
|
| 37 |
# 1. Load all necessary data for the "validation" split ONCE.
|
| 38 |
if not validation_df.empty:
|
|
|
|
| 39 |
# 2. Render the main category display using the loaded data.
|
| 40 |
create_leaderboard_display(
|
| 41 |
full_df=validation_df,
|
|
|
|
| 20 |
with gr.Tab("Results: Test Set") as test_tab:
|
| 21 |
# Repeat the process for the "test" split
|
| 22 |
if not test_df.empty:
|
| 23 |
+
gr.Markdown("**Test Set** results are reserved for final assessment. This helps ensure that the agent generalizes well to unseen problems.")
|
| 24 |
create_leaderboard_display(
|
| 25 |
full_df=test_df,
|
| 26 |
tag_map=test_tag_map,
|
|
|
|
| 37 |
with gr.Tab("Results: Validation Set") as validation_tab:
|
| 38 |
# 1. Load all necessary data for the "validation" split ONCE.
|
| 39 |
if not validation_df.empty:
|
| 40 |
+
gr.Markdown("**Validation Set** results are used during development to tune and compare agents before final testing.")
|
| 41 |
# 2. Render the main category display using the loaded data.
|
| 42 |
create_leaderboard_display(
|
| 43 |
full_df=validation_df,
|
content.py
CHANGED
|
@@ -31,7 +31,18 @@ Each category page includes a summary table (average score and cost per problem
|
|
| 31 |
🔍 Learn more in the AstaBench technical blog post
|
| 32 |
"""
|
| 33 |
SCATTER_DISCLAIMER = """
|
| 34 |
-
Note
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
"""
|
| 36 |
PARETO_DISCLAIMER = """
|
| 37 |
Agents names that are green are Pareto optimal, meaning they achieve the best performance for their cost.
|
|
@@ -164,7 +175,7 @@ table.gr-table {
|
|
| 164 |
padding-top: 0 !important;
|
| 165 |
}
|
| 166 |
#scatter-disclaimer {
|
| 167 |
-
|
| 168 |
}
|
| 169 |
#pareto-disclaimer {
|
| 170 |
color: #f0529c !important;
|
|
|
|
| 31 |
🔍 Learn more in the AstaBench technical blog post
|
| 32 |
"""
|
| 33 |
SCATTER_DISCLAIMER = """
|
| 34 |
+
**Note:** Agents without cost data are displayed to the right of the vertical divider line. <span class="tooltip-icon" data-tooltip="Missing Cost Dashed Line: Max Cost + (MaxCost/10) Missing Cost Datapoints/No Cost Data = Max Cost + (MaxCost/5)">ⓘ</span>
|
| 35 |
+
"""
|
| 36 |
+
scatter_disclaimer_html = """
|
| 37 |
+
<div class="disclaimer-text">
|
| 38 |
+
<b>Note:</b> Agents without cost data are displayed to the right of the vertical divider line.
|
| 39 |
+
<span class="tooltip-icon" data-tooltip="Missing Cost Dashed Line:
|
| 40 |
+
Max Cost + (MaxCost/10)
|
| 41 |
+
Missing Cost Datapoints / No Cost Data:
|
| 42 |
+
Max Cost + (MaxCost/5)">
|
| 43 |
+
ⓘ
|
| 44 |
+
</span>
|
| 45 |
+
</div>
|
| 46 |
"""
|
| 47 |
PARETO_DISCLAIMER = """
|
| 48 |
Agents names that are green are Pareto optimal, meaning they achieve the best performance for their cost.
|
|
|
|
| 175 |
padding-top: 0 !important;
|
| 176 |
}
|
| 177 |
#scatter-disclaimer {
|
| 178 |
+
overflow: visible !important;
|
| 179 |
}
|
| 180 |
#pareto-disclaimer {
|
| 181 |
color: #f0529c !important;
|
leaderboard_transformer.py
CHANGED
|
@@ -371,7 +371,7 @@ def _plot_scatter_plotly(
|
|
| 371 |
data_plot = data.copy()
|
| 372 |
data_plot[y_col_to_use] = pd.to_numeric(data_plot[y_col_to_use], errors='coerce')
|
| 373 |
|
| 374 |
-
x_axis_label = f"
|
| 375 |
max_reported_cost = 0
|
| 376 |
divider_line_x = 0
|
| 377 |
|
|
@@ -552,7 +552,7 @@ def _plot_scatter_plotly(
|
|
| 552 |
template="plotly_white",
|
| 553 |
title=f"Astabench {name} Leaderboard",
|
| 554 |
xaxis=xaxis_config, # Use the updated config
|
| 555 |
-
yaxis=dict(title="
|
| 556 |
legend=dict(
|
| 557 |
bgcolor='#FAF2E9',
|
| 558 |
),
|
|
|
|
| 371 |
data_plot = data.copy()
|
| 372 |
data_plot[y_col_to_use] = pd.to_numeric(data_plot[y_col_to_use], errors='coerce')
|
| 373 |
|
| 374 |
+
x_axis_label = f"Average (mean) cost per problem (USD)" if x else "Cost (Data N/A)"
|
| 375 |
max_reported_cost = 0
|
| 376 |
divider_line_x = 0
|
| 377 |
|
|
|
|
| 552 |
template="plotly_white",
|
| 553 |
title=f"Astabench {name} Leaderboard",
|
| 554 |
xaxis=xaxis_config, # Use the updated config
|
| 555 |
+
yaxis=dict(title="Average (mean) score", rangemode="tozero"),
|
| 556 |
legend=dict(
|
| 557 |
bgcolor='#FAF2E9',
|
| 558 |
),
|
main_page.py
CHANGED
|
@@ -26,6 +26,7 @@ def build_page():
|
|
| 26 |
with gr.Tab("Results: Test Set") as test_tab:
|
| 27 |
test_df, test_tag_map = get_full_leaderboard_data("test")
|
| 28 |
if not test_df.empty:
|
|
|
|
| 29 |
create_leaderboard_display(
|
| 30 |
full_df=test_df,
|
| 31 |
tag_map=test_tag_map,
|
|
@@ -39,6 +40,7 @@ def build_page():
|
|
| 39 |
validation_df, validation_tag_map = get_full_leaderboard_data("validation")
|
| 40 |
# Check if data was loaded successfully before trying to display it
|
| 41 |
if not validation_df.empty:
|
|
|
|
| 42 |
# 2. Render the display by calling the factory with the loaded data.
|
| 43 |
create_leaderboard_display(
|
| 44 |
full_df=validation_df,
|
|
|
|
| 26 |
with gr.Tab("Results: Test Set") as test_tab:
|
| 27 |
test_df, test_tag_map = get_full_leaderboard_data("test")
|
| 28 |
if not test_df.empty:
|
| 29 |
+
gr.Markdown("**Test Set** results are reserved for final assessment. This helps ensure that the agent generalizes well to unseen problems.")
|
| 30 |
create_leaderboard_display(
|
| 31 |
full_df=test_df,
|
| 32 |
tag_map=test_tag_map,
|
|
|
|
| 40 |
validation_df, validation_tag_map = get_full_leaderboard_data("validation")
|
| 41 |
# Check if data was loaded successfully before trying to display it
|
| 42 |
if not validation_df.empty:
|
| 43 |
+
gr.Markdown("**Validation Set** results are used during development to tune and compare agents before final testing.")
|
| 44 |
# 2. Render the display by calling the factory with the loaded data.
|
| 45 |
create_leaderboard_display(
|
| 46 |
full_df=validation_df,
|
ui_components.py
CHANGED
|
@@ -20,7 +20,7 @@ from leaderboard_transformer import (
|
|
| 20 |
clean_llm_base_list,
|
| 21 |
)
|
| 22 |
from content import (
|
| 23 |
-
|
| 24 |
format_error,
|
| 25 |
format_log,
|
| 26 |
format_warning,
|
|
@@ -340,7 +340,7 @@ def create_leaderboard_display(
|
|
| 340 |
value=scatter_plot,
|
| 341 |
show_label=False
|
| 342 |
)
|
| 343 |
-
|
| 344 |
# Put table and key into an accordion
|
| 345 |
with gr.Accordion("Show / Hide Table View", open=True, elem_id="leaderboard-accordion"):
|
| 346 |
gr.HTML(value=legend_markdown, elem_id="legend-markdown")
|
|
@@ -497,6 +497,7 @@ def create_benchmark_details_display(
|
|
| 497 |
name=benchmark_name
|
| 498 |
)
|
| 499 |
gr.Plot(value=benchmark_plot, show_label=False)
|
|
|
|
| 500 |
# Put table and key into an accordion
|
| 501 |
with gr.Accordion("Show / Hide Table View", open=True, elem_id="leaderboard-accordion"):
|
| 502 |
gr.HTML(value=legend_markdown, elem_id="legend-markdown")
|
|
|
|
| 20 |
clean_llm_base_list,
|
| 21 |
)
|
| 22 |
from content import (
|
| 23 |
+
scatter_disclaimer_html,
|
| 24 |
format_error,
|
| 25 |
format_log,
|
| 26 |
format_warning,
|
|
|
|
| 340 |
value=scatter_plot,
|
| 341 |
show_label=False
|
| 342 |
)
|
| 343 |
+
gr.HTML(value=scatter_disclaimer_html, elem_id="scatter-disclaimer")
|
| 344 |
# Put table and key into an accordion
|
| 345 |
with gr.Accordion("Show / Hide Table View", open=True, elem_id="leaderboard-accordion"):
|
| 346 |
gr.HTML(value=legend_markdown, elem_id="legend-markdown")
|
|
|
|
| 497 |
name=benchmark_name
|
| 498 |
)
|
| 499 |
gr.Plot(value=benchmark_plot, show_label=False)
|
| 500 |
+
gr.HTML(value=scatter_disclaimer_html, elem_id="scatter-disclaimer")
|
| 501 |
# Put table and key into an accordion
|
| 502 |
with gr.Accordion("Show / Hide Table View", open=True, elem_id="leaderboard-accordion"):
|
| 503 |
gr.HTML(value=legend_markdown, elem_id="legend-markdown")
|