Amber Tanaka
committed
Reorder benchmarks (#29)
Files changed:
- category_page_builder.py +1 -5
- leaderboard_transformer.py +52 -38
- requirements.txt +1 -1
- submission.py +0 -1
- ui_components.py +5 -3
category_page_builder.py
CHANGED
@@ -9,7 +9,7 @@ def build_category_page(CATEGORY_NAME, PAGE_DESCRIPTION):
     validation_df, validation_tag_map = get_full_leaderboard_data("validation")
     test_df, test_tag_map = get_full_leaderboard_data("test")
     with gr.Column(elem_id="validation_nav_container", visible=False) as validation_nav_container:
-        create_sub_navigation_bar(validation_tag_map, CATEGORY_NAME)
+        create_sub_navigation_bar(validation_tag_map, CATEGORY_NAME, validation=True)
 
     with gr.Column(elem_id="test_nav_container", visible=True) as test_nav_container:
         create_sub_navigation_bar(test_tag_map, CATEGORY_NAME)
@@ -19,8 +19,6 @@ def build_category_page(CATEGORY_NAME, PAGE_DESCRIPTION):
     with gr.Tabs():
         with gr.Tab("Results: Test Set") as test_tab:
             # Repeat the process for the "test" split
-            test_df, test_tag_map = get_full_leaderboard_data("test")
-
             if not test_df.empty:
                 create_leaderboard_display(
                     full_df=test_df,
@@ -37,8 +35,6 @@ def build_category_page(CATEGORY_NAME, PAGE_DESCRIPTION):
             gr.Markdown("No data available for test split.")
         with gr.Tab("Results: Validation Set") as validation_tab:
             # 1. Load all necessary data for the "validation" split ONCE.
-            validation_df, validation_tag_map = get_full_leaderboard_data("validation")
-
             if not validation_df.empty:
                 # 2. Render the main category display using the loaded data.
                 create_leaderboard_display(
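Taken together, these hunks hoist the get_full_leaderboard_data calls out of the tabs: each split is now fetched once at the top of build_category_page and reused by both the nav containers and the results tabs. A condensed sketch of the resulting flow, assuming the helpers from ui_components.py are in scope and with the full argument lists trimmed for brevity:

```python
import gradio as gr

def build_category_page(CATEGORY_NAME, PAGE_DESCRIPTION):
    # Fetch each split once; the nav bars and both tabs reuse the same frames.
    validation_df, validation_tag_map = get_full_leaderboard_data("validation")
    test_df, test_tag_map = get_full_leaderboard_data("test")

    with gr.Column(elem_id="validation_nav_container", visible=False):
        create_sub_navigation_bar(validation_tag_map, CATEGORY_NAME, validation=True)
    with gr.Column(elem_id="test_nav_container", visible=True):
        create_sub_navigation_bar(test_tag_map, CATEGORY_NAME)

    with gr.Tabs():
        with gr.Tab("Results: Test Set"):
            if not test_df.empty:
                create_leaderboard_display(full_df=test_df)
            else:
                gr.Markdown("No data available for test split.")
        with gr.Tab("Results: Validation Set"):
            if not validation_df.empty:
                create_leaderboard_display(full_df=validation_df)
```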
leaderboard_transformer.py
CHANGED
@@ -16,34 +16,47 @@ INFORMAL_TO_FORMAL_NAME_MAP = {
     "discovery": "Discovery",
 
     # Validation Names
-    "arxivdigestables_validation": "
-    "sqa_dev": "
-    "litqa2_validation": "
-    "paper_finder_validation": "
-    "paper_finder_litqa2_validation": "
-    "discoverybench_validation": "
-    "core_bench_validation": "
-    "ds1000_validation": "
-    "e2e_discovery_validation": "E2E
-    "e2e_discovery_hard_validation": "E2E
-    "super_validation": "
+    "arxivdigestables_validation": "ArxivDIGES Tables",
+    "sqa_dev": "ScholarQA-CS2",
+    "litqa2_validation": "LitQA2-FullText",
+    "paper_finder_validation": "PaperFindingBench",
+    "paper_finder_litqa2_validation": "LitQA2-FullText-Search",
+    "discoverybench_validation": "DiscoveryBench",
+    "core_bench_validation": "CORE-Bench-Hard",
+    "ds1000_validation": "DS-1000",
+    "e2e_discovery_validation": "E2E-Bench",
+    "e2e_discovery_hard_validation": "E2E-Bench-Hard",
+    "super_validation": "SUPER",
     # Test Names
-    "paper_finder_test": "
-    "paper_finder_litqa2_test": "
-    "sqa_test": "
-    "arxivdigestables_test": "
-    "litqa2_test": "
-    "discoverybench_test": "
-    "core_bench_test": "
-    "ds1000_test": "
-    "e2e_discovery_test": "E2E
-    "e2e_discovery_hard_test": "E2E
-    "super_test": "
+    "paper_finder_test": "PaperFindingBench",
+    "paper_finder_litqa2_test": "LitQA2-FullText-Search",
+    "sqa_test": "ScholarQA-CS2",
+    "arxivdigestables_test": "ArxivDIGES Tables",
+    "litqa2_test": "LitQA2-FullText",
+    "discoverybench_test": "DiscoveryBench",
+    "core_bench_test": "CORE-Bench-Hard",
+    "ds1000_test": "DS-1000",
+    "e2e_discovery_test": "E2E-Bench",
+    "e2e_discovery_hard_test": "E2E-Bench-Hard",
+    "super_test": "SUPER",
 }
+ORDER_MAP = {
+    'Literature Understanding': [
+        'PaperFindingBench',
+        'LitQA2-FullText-Search',
+        'ScholarQA-CS2',
+        'LitQA2-FullText',
+        'ArxivDIGES Tables'
+    ],
+    'Code Execution': [
+        'CORE-Bench-Hard',
+        'SUPER',
+        'DS-1000'
+    ],
+    # Add other keys for 'Data Analysis' and 'Discovery' when/if we add more benchmarks in those categories
+}
 
 
-### 2. The Updated Helper Functions ###
-
 def _safe_round(value, digits=2):
     """Rounds a number if it's a valid float/int, otherwise returns it as is."""
     return round(value, digits) if isinstance(value, (float, int)) and pd.notna(value) else value
@@ -88,7 +101,6 @@ def _pretty_column_name(raw_col: str) -> str:
 
     # Capitalize the metric part correctly (e.g., 'score' -> 'Score')
     pretty_metric = metric_part.capitalize()
-
     return f"{formal_name} {pretty_metric}"
 
     # Case 3: If no specific rule applies, just make it title case.
@@ -97,28 +109,30 @@ def _pretty_column_name(raw_col: str) -> str:
 
 def create_pretty_tag_map(raw_tag_map: dict, name_map: dict) -> dict:
     """
-    Converts a tag map with raw names into a tag map with pretty, formal names
-
-    Args:
-        raw_tag_map: The map with raw keys and values (e.g., {'lit': ['litqa2_validation']}).
-        name_map: The INFORMAL_TO_FORMAL_NAME_MAP used for translation.
-
-    Returns:
-        A new dictionary with pretty names (e.g., {'Literature Understanding': ['Litqa2 Validation']}).
+    Converts a tag map with raw names into a tag map with pretty, formal names,
+    applying a specific, non-alphabetic sort order to the values.
     """
     pretty_map = {}
-    # A reverse map to find raw keys from formal names if needed, though not used here
-    # This is just for understanding; the main logic uses the forward map.
-
     # Helper to get pretty name with a fallback
     def get_pretty(raw_name):
-        return name_map.get(raw_name, raw_name.replace("_", " ")
+        return name_map.get(raw_name, raw_name.replace("_", " "))
 
     for raw_key, raw_value_list in raw_tag_map.items():
         pretty_key = get_pretty(raw_key)
         pretty_value_list = [get_pretty(raw_val) for raw_val in raw_value_list]
-        pretty_map[pretty_key] = sorted(list(set(pretty_value_list)))
 
+        # Get the unique values first
+        unique_values = list(set(pretty_value_list))
+        # Get the custom order for the current key. Fall back to an empty list.
+        custom_order = ORDER_MAP.get(pretty_key, [])
+        def sort_key(value):
+            if value in custom_order:
+                return 0, custom_order.index(value)
+            else:
+                return 1, value
+        pretty_map[pretty_key] = sorted(unique_values, key=sort_key)
+
+    print(f"Created pretty tag map: {pretty_map}")
    return pretty_map
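The two-tier sort key is what actually reorders the benchmarks: values listed in ORDER_MAP for a category sort first, in their listed positions, and anything unlisted falls back to alphabetical order after them, so a newly added benchmark still appears, just at the end. A self-contained sketch of the same technique; ORDER_MAP here is a subset copied from the diff, and sort_benchmarks is a hypothetical wrapper, not a function from this repo:

```python
ORDER_MAP = {
    'Code Execution': ['CORE-Bench-Hard', 'SUPER', 'DS-1000'],
}

def sort_benchmarks(category: str, names: list[str]) -> list[str]:
    custom_order = ORDER_MAP.get(category, [])

    def sort_key(value):
        # Tuples compare element-wise, so tier 0 (custom order)
        # always sorts before tier 1 (alphabetical fallback).
        if value in custom_order:
            return (0, custom_order.index(value))
        return (1, value)

    return sorted(set(names), key=sort_key)

# Listed names keep their ORDER_MAP position; unknown names sort last, alphabetically.
print(sort_benchmarks('Code Execution',
                      ['DS-1000', 'SUPER', 'NewBench', 'CORE-Bench-Hard']))
# -> ['CORE-Bench-Hard', 'SUPER', 'DS-1000', 'NewBench']
```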
requirements.txt
CHANGED
@@ -1,4 +1,4 @@
-agent-eval==0.1.
+agent-eval==0.1.30
 aiobotocore==2.22.0
 aiofiles==24.1.0
 aiohappyeyeballs==2.6.1
submission.py
CHANGED
@@ -14,7 +14,6 @@ import requests
 from agenteval import (
     process_eval_logs,
     upload_folder_to_hf,
-    upload_summary_to_hf,
 )
 from agenteval.leaderboard.models import LeaderboardSubmission
 from agenteval.leaderboard.upload import sanitize_path_component
ui_components.py
CHANGED
@@ -530,7 +530,7 @@ def get_full_leaderboard_data(split: str) -> tuple[pd.DataFrame, dict]:
     # Fallback for unexpected types
     return pd.DataFrame(), {}
 # Create sub-nav bar for benchmarks
-def create_gradio_anchor_id(text: str) -> str:
+def create_gradio_anchor_id(text: str, validation) -> str:
     """
     Replicates the ID format created by gr.Markdown(header_links=True).
     Example: "Paper Finder Validation" -> "h-paper-finder-validation"
@@ -538,8 +538,10 @@ def create_gradio_anchor_id(text: str) -> str:
     text = text.lower()
     text = re.sub(r'\s+', '-', text) # Replace spaces with hyphens
     text = re.sub(r'[^\w-]', '', text) # Remove non-word characters
+    if validation:
+        return f"h-{text}-leaderboard-1"
     return f"h-{text}-leaderboard"
-def create_sub_navigation_bar(tag_map: dict, category_name: str):
+def create_sub_navigation_bar(tag_map: dict, category_name: str, validation: bool = False) -> gr.HTML:
     """
     Builds the entire sub-navigation bar as a single, self-contained HTML component.
     This bypasses Gradio's layout components, giving us full control.
@@ -552,7 +554,7 @@ def create_sub_navigation_bar(tag_map: dict, category_name: str):
     # Start building the list of HTML button elements as strings
     html_buttons = []
     for name in benchmark_names:
-        target_id = create_gradio_anchor_id(name)
+        target_id = create_gradio_anchor_id(name, validation)
 
         # Create a standard HTML button.
         # The onclick attribute calls our global JS function directly.
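The new validation flag exists because both splits now render a heading for every benchmark, and header anchors must be unique per page; the diff assumes the duplicate (validation) set of Gradio header links gets a "-1" suffix. A minimal sketch of the anchor logic in isolation; anchor_id is an illustrative stand-in for create_gradio_anchor_id, and the "-1" suffix convention is taken from the diff rather than verified against Gradio itself:

```python
import re

def anchor_id(text: str, validation: bool = False) -> str:
    # Mirrors create_gradio_anchor_id from the diff.
    text = text.lower()
    text = re.sub(r'\s+', '-', text)    # replace spaces with hyphens
    text = re.sub(r'[^\w-]', '', text)  # remove non-word characters
    if validation:
        return f"h-{text}-leaderboard-1"  # duplicate (validation) anchors get "-1"
    return f"h-{text}-leaderboard"

print(anchor_id("Paper Finder Validation"))   # h-paper-finder-validation-leaderboard
print(anchor_id("DS-1000", validation=True))  # h-ds-1000-leaderboard-1
```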