Amber Tanaka committed
Commit 64716c3 · unverified · 1 Parent(s): ad338fc

Reorder benchmarks (#29)

category_page_builder.py CHANGED
@@ -9,7 +9,7 @@ def build_category_page(CATEGORY_NAME, PAGE_DESCRIPTION):
     validation_df, validation_tag_map = get_full_leaderboard_data("validation")
     test_df, test_tag_map = get_full_leaderboard_data("test")
     with gr.Column(elem_id="validation_nav_container", visible=False) as validation_nav_container:
-        create_sub_navigation_bar(validation_tag_map, CATEGORY_NAME)
+        create_sub_navigation_bar(validation_tag_map, CATEGORY_NAME, validation=True)
 
     with gr.Column(elem_id="test_nav_container", visible=True) as test_nav_container:
         create_sub_navigation_bar(test_tag_map, CATEGORY_NAME)
@@ -19,8 +19,6 @@ def build_category_page(CATEGORY_NAME, PAGE_DESCRIPTION):
     with gr.Tabs():
         with gr.Tab("Results: Test Set") as test_tab:
             # Repeat the process for the "test" split
-            test_df, test_tag_map = get_full_leaderboard_data("test")
-
             if not test_df.empty:
                 create_leaderboard_display(
                     full_df=test_df,
@@ -37,8 +35,6 @@ def build_category_page(CATEGORY_NAME, PAGE_DESCRIPTION):
             gr.Markdown("No data available for test split.")
         with gr.Tab("Results: Validation Set") as validation_tab:
            # 1. Load all necessary data for the "validation" split ONCE.
-            validation_df, validation_tag_map = get_full_leaderboard_data("validation")
-
             if not validation_df.empty:
                 # 2. Render the main category display using the loaded data.
                 create_leaderboard_display(
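
Note on the hunks above: the per-tab get_full_leaderboard_data calls are removed because each split is now loaded once at the top of build_category_page, and the same (DataFrame, tag map) pair is shared by the nav container and its results tab. A minimal sketch of that load-once pattern (build_page and fake_load are hypothetical stand-ins, not code from this repo):

def build_page(load_split):
    # Load each split exactly once, up front...
    data = {split: load_split(split) for split in ("validation", "test")}
    # ...then every consumer reuses the same objects instead of re-fetching.
    for split, (df, tag_map) in data.items():
        print(f"nav for {split}: {sorted(tag_map)}")
        print(f"table for {split}: {len(df)} rows")

# Hypothetical stand-in for get_full_leaderboard_data.
def fake_load(split):
    rows = [{"agent": "baseline", "score": 0.5}]
    return rows, {"Literature Understanding": ["LitQA2-FullText"]}

build_page(fake_load)
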
leaderboard_transformer.py CHANGED
@@ -16,34 +16,47 @@ INFORMAL_TO_FORMAL_NAME_MAP = {
     "discovery": "Discovery",
 
     # Validation Names
-    "arxivdigestables_validation": "Arxivdigestables Validation",
-    "sqa_dev": "Sqa Dev",
-    "litqa2_validation": "Litqa2 Validation",
-    "paper_finder_validation": "Paper Finder Validation",
-    "paper_finder_litqa2_validation": "Paper Finder Litqa2 Validation",
-    "discoverybench_validation": "Discoverybench Validation",
-    "core_bench_validation": "Core Bench Validation",
-    "ds1000_validation": "DS1000 Validation",
-    "e2e_discovery_validation": "E2E Discovery Validation",
-    "e2e_discovery_hard_validation": "E2E Discovery Hard Validation",
-    "super_validation": "Super Validation",
+    "arxivdigestables_validation": "ArxivDIGES Tables",
+    "sqa_dev": "ScholarQA-CS2",
+    "litqa2_validation": "LitQA2-FullText",
+    "paper_finder_validation": "PaperFindingBench",
+    "paper_finder_litqa2_validation": "LitQA2-FullText-Search",
+    "discoverybench_validation": "DiscoveryBench",
+    "core_bench_validation": "CORE-Bench-Hard",
+    "ds1000_validation": "DS-1000",
+    "e2e_discovery_validation": "E2E-Bench",
+    "e2e_discovery_hard_validation": "E2E-Bench-Hard",
+    "super_validation": "SUPER",
     # Test Names
-    "paper_finder_test": "Paper Finder Test",
-    "paper_finder_litqa2_test": "Paper Finder Litqa2 Test",
-    "sqa_test": "Sqa Test",
-    "arxivdigestables_test": "Arxivdigestables Test",
-    "litqa2_test": "Litqa2 Test",
-    "discoverybench_test": "Discoverybench Test",
-    "core_bench_test": "Core Bench Test",
-    "ds1000_test": "DS1000 Test",
-    "e2e_discovery_test": "E2E Discovery Test",
-    "e2e_discovery_hard_test": "E2E Discovery Hard Test",
-    "super_test": "Super Test",
+    "paper_finder_test": "PaperFindingBench",
+    "paper_finder_litqa2_test": "LitQA2-FullText-Search",
+    "sqa_test": "ScholarQA-CS2",
+    "arxivdigestables_test": "ArxivDIGES Tables",
+    "litqa2_test": "LitQA2-FullText",
+    "discoverybench_test": "DiscoveryBench",
+    "core_bench_test": "CORE-Bench-Hard",
+    "ds1000_test": "DS-1000",
+    "e2e_discovery_test": "E2E-Bench",
+    "e2e_discovery_hard_test": "E2E-Bench-Hard",
+    "super_test": "SUPER",
+}
+ORDER_MAP = {
+    'Literature Understanding': [
+        'PaperFindingBench',
+        'LitQA2-FullText-Search',
+        'ScholarQA-CS2',
+        'LitQA2-FullText',
+        'ArxivDIGES Tables'
+    ],
+    'Code Execution': [
+        'CORE-Bench-Hard',
+        'SUPER',
+        'DS-1000'
+    ],
+    # Add other keys for 'Data Analysis' and 'Discovery' when/if we add more benchmarks in those categories
 }
 
 
-### 2. The Updated Helper Functions ###
-
 def _safe_round(value, digits=2):
     """Rounds a number if it's a valid float/int, otherwise returns it as is."""
     return round(value, digits) if isinstance(value, (float, int)) and pd.notna(value) else value
@@ -88,7 +101,6 @@ def _pretty_column_name(raw_col: str) -> str:
 
     # Capitalize the metric part correctly (e.g., 'score' -> 'Score')
     pretty_metric = metric_part.capitalize()
-
     return f"{formal_name} {pretty_metric}"
 
     # Case 3: If no specific rule applies, just make it title case.
@@ -97,28 +109,30 @@ def _pretty_column_name(raw_col: str) -> str:
 
 def create_pretty_tag_map(raw_tag_map: dict, name_map: dict) -> dict:
     """
-    Converts a tag map with raw names into a tag map with pretty, formal names.
-
-    Args:
-        raw_tag_map: The map with raw keys and values (e.g., {'lit': ['litqa2_validation']}).
-        name_map: The INFORMAL_TO_FORMAL_NAME_MAP used for translation.
-
-    Returns:
-        A new dictionary with pretty names (e.g., {'Literature Understanding': ['Litqa2 Validation']}).
+    Converts a tag map with raw names into a tag map with pretty, formal names,
+    applying a specific, non-alphabetic sort order to the values.
     """
     pretty_map = {}
-    # A reverse map to find raw keys from formal names if needed, though not used here
-    # This is just for understanding; the main logic uses the forward map.
-
     # Helper to get pretty name with a fallback
     def get_pretty(raw_name):
-        return name_map.get(raw_name, raw_name.replace("_", " ").title())
+        return name_map.get(raw_name, raw_name.replace("_", " "))
 
     for raw_key, raw_value_list in raw_tag_map.items():
         pretty_key = get_pretty(raw_key)
         pretty_value_list = [get_pretty(raw_val) for raw_val in raw_value_list]
-        pretty_map[pretty_key] = sorted(list(set(pretty_value_list)))
+
+        # Get the unique values first
+        unique_values = list(set(pretty_value_list))
+        # Get the custom order for the current key. Fall back to an empty list.
+        custom_order = ORDER_MAP.get(pretty_key, [])
+        def sort_key(value):
+            if value in custom_order:
+                return 0, custom_order.index(value)
+            else:
+                return 1, value
+        pretty_map[pretty_key] = sorted(unique_values, key=sort_key)
+
+    print(f"Created pretty tag map: {pretty_map}")
     return pretty_map
 
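The two-part key in sort_key is what drives the reordering: values listed in ORDER_MAP sort first, in their listed positions (group 0), and anything not listed falls back to alphabetical order after them (group 1). A standalone sketch of the same technique (benchmark names mirror the map above; 'Some-New-Benchmark' is made up):

custom_order = ['PaperFindingBench', 'LitQA2-FullText-Search', 'ScholarQA-CS2',
                'LitQA2-FullText', 'ArxivDIGES Tables']

def sort_key(value):
    if value in custom_order:
        return 0, custom_order.index(value)  # pinned: sort by ORDER_MAP position
    return 1, value                          # unpinned: alphabetical, after pinned

names = ['ArxivDIGES Tables', 'Some-New-Benchmark', 'ScholarQA-CS2', 'PaperFindingBench']
print(sorted(names, key=sort_key))
# -> ['PaperFindingBench', 'ScholarQA-CS2', 'ArxivDIGES Tables', 'Some-New-Benchmark']
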
requirements.txt CHANGED
@@ -1,4 +1,4 @@
-agent-eval==0.1.29
+agent-eval==0.1.30
 aiobotocore==2.22.0
 aiofiles==24.1.0
 aiohappyeyeballs==2.6.1
submission.py CHANGED
@@ -14,7 +14,6 @@ import requests
 from agenteval import (
     process_eval_logs,
     upload_folder_to_hf,
-    upload_summary_to_hf,
 )
 from agenteval.leaderboard.models import LeaderboardSubmission
 from agenteval.leaderboard.upload import sanitize_path_component
ui_components.py CHANGED
@@ -530,7 +530,7 @@ def get_full_leaderboard_data(split: str) -> tuple[pd.DataFrame, dict]:
     # Fallback for unexpected types
     return pd.DataFrame(), {}
 # Create sub-nav bar for benchmarks
-def create_gradio_anchor_id(text: str) -> str:
+def create_gradio_anchor_id(text: str, validation) -> str:
     """
     Replicates the ID format created by gr.Markdown(header_links=True).
     Example: "Paper Finder Validation" -> "h-paper-finder-validation"
@@ -538,8 +538,10 @@ def create_gradio_anchor_id(text: str) -> str:
     text = text.lower()
     text = re.sub(r'\s+', '-', text) # Replace spaces with hyphens
     text = re.sub(r'[^\w-]', '', text) # Remove non-word characters
+    if validation:
+        return f"h-{text}-leaderboard-1"
     return f"h-{text}-leaderboard"
-def create_sub_navigation_bar(tag_map: dict, category_name: str):
+def create_sub_navigation_bar(tag_map: dict, category_name: str, validation: bool = False) -> gr.HTML:
     """
     Builds the entire sub-navigation bar as a single, self-contained HTML component.
     This bypasses Gradio's layout components, giving us full control.
@@ -552,7 +554,7 @@ def create_sub_navigation_bar(tag_map: dict, category_name: str):
     # Start building the list of HTML button elements as strings
     html_buttons = []
     for name in benchmark_names:
-        target_id = create_gradio_anchor_id(name)
+        target_id = create_gradio_anchor_id(name, validation)
 
         # Create a standard HTML button.
         # The onclick attribute calls our global JS function directly.
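
On the -1 suffix above: after the renames in leaderboard_transformer.py, a benchmark carries the same display name on both splits, so the same header text is rendered twice on the page; the second occurrence (the validation copy) then appears to get a -1 de-duplication suffix on its anchor id, which the validation flag reproduces. A standalone sketch of the anchor logic (anchor_id is an illustrative name, not the repo's function):

import re

def anchor_id(text, validation=False):
    # Slugify the header text the way gr.Markdown(header_links=True) does:
    # lowercase, spaces -> hyphens, strip non-word characters.
    slug = re.sub(r'[^\w-]', '', re.sub(r'\s+', '-', text.lower()))
    # Assumed: the duplicated validation header gets a "-1" suffix.
    suffix = '-1' if validation else ''
    return f"h-{slug}-leaderboard{suffix}"

print(anchor_id('PaperFindingBench'))                   # h-paperfindingbench-leaderboard
print(anchor_id('PaperFindingBench', validation=True))  # h-paperfindingbench-leaderboard-1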