Amber Tanaka
committed
Reorder benchmarks (#29)
Files changed:
- category_page_builder.py +1 -5
- leaderboard_transformer.py +52 -38
- requirements.txt +1 -1
- submission.py +0 -1
- ui_components.py +5 -3
category_page_builder.py
CHANGED
@@ -9,7 +9,7 @@ def build_category_page(CATEGORY_NAME, PAGE_DESCRIPTION):
     validation_df, validation_tag_map = get_full_leaderboard_data("validation")
     test_df, test_tag_map = get_full_leaderboard_data("test")
     with gr.Column(elem_id="validation_nav_container", visible=False) as validation_nav_container:
-        create_sub_navigation_bar(validation_tag_map, CATEGORY_NAME)
+        create_sub_navigation_bar(validation_tag_map, CATEGORY_NAME, validation=True)
 
     with gr.Column(elem_id="test_nav_container", visible=True) as test_nav_container:
         create_sub_navigation_bar(test_tag_map, CATEGORY_NAME)
@@ -19,8 +19,6 @@ def build_category_page(CATEGORY_NAME, PAGE_DESCRIPTION):
     with gr.Tabs():
         with gr.Tab("Results: Test Set") as test_tab:
             # Repeat the process for the "test" split
-            test_df, test_tag_map = get_full_leaderboard_data("test")
-
             if not test_df.empty:
                 create_leaderboard_display(
                     full_df=test_df,
@@ -37,8 +35,6 @@ def build_category_page(CATEGORY_NAME, PAGE_DESCRIPTION):
             gr.Markdown("No data available for test split.")
         with gr.Tab("Results: Validation Set") as validation_tab:
             # 1. Load all necessary data for the "validation" split ONCE.
-            validation_df, validation_tag_map = get_full_leaderboard_data("validation")
-
             if not validation_df.empty:
                 # 2. Render the main category display using the loaded data.
                 create_leaderboard_display(
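Taken together, these hunks hoist the get_full_leaderboard_data calls out of the tabs: each split is now fetched once at the top of build_category_page and reused by both the nav containers and the results tabs. A condensed sketch of the resulting flow, assuming the helpers from ui_components.py are in scope and with the full argument lists trimmed for brevity:

```python
import gradio as gr

def build_category_page(CATEGORY_NAME, PAGE_DESCRIPTION):
    # Fetch each split once; the nav bars and both tabs reuse the same frames.
    validation_df, validation_tag_map = get_full_leaderboard_data("validation")
    test_df, test_tag_map = get_full_leaderboard_data("test")

    with gr.Column(elem_id="validation_nav_container", visible=False):
        create_sub_navigation_bar(validation_tag_map, CATEGORY_NAME, validation=True)
    with gr.Column(elem_id="test_nav_container", visible=True):
        create_sub_navigation_bar(test_tag_map, CATEGORY_NAME)

    with gr.Tabs():
        with gr.Tab("Results: Test Set"):
            if not test_df.empty:
                create_leaderboard_display(full_df=test_df)
            else:
                gr.Markdown("No data available for test split.")
        with gr.Tab("Results: Validation Set"):
            if not validation_df.empty:
                create_leaderboard_display(full_df=validation_df)
```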
leaderboard_transformer.py
CHANGED
@@ -16,34 +16,47 @@ INFORMAL_TO_FORMAL_NAME_MAP = {
     "discovery": "Discovery",
 
     # Validation Names
-    "arxivdigestables_validation": "
-    "sqa_dev": "
-    "litqa2_validation": "
-    "paper_finder_validation": "
-    "paper_finder_litqa2_validation": "
-    "discoverybench_validation": "
-    "core_bench_validation": "
-    "ds1000_validation": "
-    "e2e_discovery_validation": "E2E
-    "e2e_discovery_hard_validation": "E2E
-    "super_validation": "
+    "arxivdigestables_validation": "ArxivDIGES Tables",
+    "sqa_dev": "ScholarQA-CS2",
+    "litqa2_validation": "LitQA2-FullText",
+    "paper_finder_validation": "PaperFindingBench",
+    "paper_finder_litqa2_validation": "LitQA2-FullText-Search",
+    "discoverybench_validation": "DiscoveryBench",
+    "core_bench_validation": "CORE-Bench-Hard",
+    "ds1000_validation": "DS-1000",
+    "e2e_discovery_validation": "E2E-Bench",
+    "e2e_discovery_hard_validation": "E2E-Bench-Hard",
+    "super_validation": "SUPER",
     # Test Names
-    "paper_finder_test": "
-    "paper_finder_litqa2_test": "
-    "sqa_test": "
-    "arxivdigestables_test": "
-    "litqa2_test": "
-    "discoverybench_test": "
-    "core_bench_test": "
-    "ds1000_test": "
-    "e2e_discovery_test": "E2E
-    "e2e_discovery_hard_test": "E2E
-    "super_test": "
+    "paper_finder_test": "PaperFindingBench",
+    "paper_finder_litqa2_test": "LitQA2-FullText-Search",
+    "sqa_test": "ScholarQA-CS2",
+    "arxivdigestables_test": "ArxivDIGES Tables",
+    "litqa2_test": "LitQA2-FullText",
+    "discoverybench_test": "DiscoveryBench",
+    "core_bench_test": "CORE-Bench-Hard",
+    "ds1000_test": "DS-1000",
+    "e2e_discovery_test": "E2E-Bench",
+    "e2e_discovery_hard_test": "E2E-Bench-Hard",
+    "super_test": "SUPER",
 }
+ORDER_MAP = {
+    'Literature Understanding': [
+        'PaperFindingBench',
+        'LitQA2-FullText-Search',
+        'ScholarQA-CS2',
+        'LitQA2-FullText',
+        'ArxivDIGES Tables'
+    ],
+    'Code Execution': [
+        'CORE-Bench-Hard',
+        'SUPER',
+        'DS-1000'
+    ],
+    # Add other keys for 'Data Analysis' and 'Discovery' when/if we add more benchmarks in those categories
+}
 
 
-### 2. The Updated Helper Functions ###
-
 def _safe_round(value, digits=2):
     """Rounds a number if it's a valid float/int, otherwise returns it as is."""
     return round(value, digits) if isinstance(value, (float, int)) and pd.notna(value) else value
@@ -88,7 +101,6 @@ def _pretty_column_name(raw_col: str) -> str:
 
     # Capitalize the metric part correctly (e.g., 'score' -> 'Score')
     pretty_metric = metric_part.capitalize()
-
     return f"{formal_name} {pretty_metric}"
 
     # Case 3: If no specific rule applies, just make it title case.
@@ -97,28 +109,30 @@ def _pretty_column_name(raw_col: str) -> str:
 
 def create_pretty_tag_map(raw_tag_map: dict, name_map: dict) -> dict:
     """
-    Converts a tag map with raw names into a tag map with pretty, formal names
-
-    Args:
-        raw_tag_map: The map with raw keys and values (e.g., {'lit': ['litqa2_validation']}).
-        name_map: The INFORMAL_TO_FORMAL_NAME_MAP used for translation.
-
-    Returns:
-        A new dictionary with pretty names (e.g., {'Literature Understanding': ['Litqa2 Validation']}).
+    Converts a tag map with raw names into a tag map with pretty, formal names,
+    applying a specific, non-alphabetic sort order to the values.
     """
     pretty_map = {}
-    # A reverse map to find raw keys from formal names if needed, though not used here
-    # This is just for understanding; the main logic uses the forward map.
-
     # Helper to get pretty name with a fallback
     def get_pretty(raw_name):
-        return name_map.get(raw_name, raw_name.replace("_", " ")
+        return name_map.get(raw_name, raw_name.replace("_", " "))
 
     for raw_key, raw_value_list in raw_tag_map.items():
         pretty_key = get_pretty(raw_key)
         pretty_value_list = [get_pretty(raw_val) for raw_val in raw_value_list]
-        pretty_map[pretty_key] = sorted(list(set(pretty_value_list)))
 
+        # Get the unique values first
+        unique_values = list(set(pretty_value_list))
+        # Get the custom order for the current key. Fall back to an empty list.
+        custom_order = ORDER_MAP.get(pretty_key, [])
+        def sort_key(value):
+            if value in custom_order:
+                return 0, custom_order.index(value)
+            else:
+                return 1, value
+        pretty_map[pretty_key] = sorted(unique_values, key=sort_key)
+
+    print(f"Created pretty tag map: {pretty_map}")
    return pretty_map
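The two-tier sort key is what actually reorders the benchmarks: values listed in ORDER_MAP for a category sort first, in their listed positions, and anything unlisted falls back to alphabetical order after them, so a newly added benchmark still appears, just at the end. A self-contained sketch of the same technique; ORDER_MAP here is a subset copied from the diff, and sort_benchmarks is a hypothetical wrapper, not a function from this repo:

```python
ORDER_MAP = {
    'Code Execution': ['CORE-Bench-Hard', 'SUPER', 'DS-1000'],
}

def sort_benchmarks(category: str, names: list[str]) -> list[str]:
    custom_order = ORDER_MAP.get(category, [])

    def sort_key(value):
        # Tuples compare element-wise, so tier 0 (custom order)
        # always sorts before tier 1 (alphabetical fallback).
        if value in custom_order:
            return (0, custom_order.index(value))
        return (1, value)

    return sorted(set(names), key=sort_key)

# Listed names keep their ORDER_MAP position; unknown names sort last, alphabetically.
print(sort_benchmarks('Code Execution',
                      ['DS-1000', 'SUPER', 'NewBench', 'CORE-Bench-Hard']))
# -> ['CORE-Bench-Hard', 'SUPER', 'DS-1000', 'NewBench']
```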
requirements.txt
CHANGED
@@ -1,4 +1,4 @@
-agent-eval==0.1.
+agent-eval==0.1.30
 aiobotocore==2.22.0
 aiofiles==24.1.0
 aiohappyeyeballs==2.6.1
submission.py
CHANGED
@@ -14,7 +14,6 @@ import requests
 from agenteval import (
     process_eval_logs,
     upload_folder_to_hf,
-    upload_summary_to_hf,
 )
 from agenteval.leaderboard.models import LeaderboardSubmission
 from agenteval.leaderboard.upload import sanitize_path_component
ui_components.py
CHANGED
@@ -530,7 +530,7 @@ def get_full_leaderboard_data(split: str) -> tuple[pd.DataFrame, dict]:
     # Fallback for unexpected types
     return pd.DataFrame(), {}
 # Create sub-nav bar for benchmarks
-def create_gradio_anchor_id(text: str) -> str:
+def create_gradio_anchor_id(text: str, validation) -> str:
     """
     Replicates the ID format created by gr.Markdown(header_links=True).
     Example: "Paper Finder Validation" -> "h-paper-finder-validation"
@@ -538,8 +538,10 @@ def create_gradio_anchor_id(text: str) -> str:
     text = text.lower()
     text = re.sub(r'\s+', '-', text) # Replace spaces with hyphens
     text = re.sub(r'[^\w-]', '', text) # Remove non-word characters
+    if validation:
+        return f"h-{text}-leaderboard-1"
     return f"h-{text}-leaderboard"
-def create_sub_navigation_bar(tag_map: dict, category_name: str):
+def create_sub_navigation_bar(tag_map: dict, category_name: str, validation: bool = False) -> gr.HTML:
     """
     Builds the entire sub-navigation bar as a single, self-contained HTML component.
     This bypasses Gradio's layout components, giving us full control.
@@ -552,7 +554,7 @@ def create_sub_navigation_bar(tag_map: dict, category_name: str):
     # Start building the list of HTML button elements as strings
     html_buttons = []
     for name in benchmark_names:
-        target_id = create_gradio_anchor_id(name)
+        target_id = create_gradio_anchor_id(name, validation)
 
         # Create a standard HTML button.
         # The onclick attribute calls our global JS function directly.
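The new validation flag exists because both splits now render a heading for every benchmark, and header anchors must be unique per page; the diff assumes the duplicate (validation) set of Gradio header links gets a "-1" suffix. A minimal sketch of the anchor logic in isolation; anchor_id is an illustrative stand-in for create_gradio_anchor_id, and the "-1" suffix convention is taken from the diff rather than verified against Gradio itself:

```python
import re

def anchor_id(text: str, validation: bool = False) -> str:
    # Mirrors create_gradio_anchor_id from the diff.
    text = text.lower()
    text = re.sub(r'\s+', '-', text)    # replace spaces with hyphens
    text = re.sub(r'[^\w-]', '', text)  # remove non-word characters
    if validation:
        return f"h-{text}-leaderboard-1"  # duplicate (validation) anchors get "-1"
    return f"h-{text}-leaderboard"

print(anchor_id("Paper Finder Validation"))   # h-paper-finder-validation-leaderboard
print(anchor_id("DS-1000", validation=True))  # h-ds-1000-leaderboard-1
```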