# asta-bench-leaderboard / ui_components.py
import gradio as gr
import pandas as pd
import plotly.graph_objects as go
import os
import re
import base64
from agenteval.leaderboard.view import LeaderboardViewer
from huggingface_hub import HfApi
from leaderboard_transformer import (
DataTransformer,
transform_raw_dataframe,
create_pretty_tag_map,
INFORMAL_TO_FORMAL_NAME_MAP,
_plot_scatter_plotly,
format_cost_column,
format_score_column,
get_pareto_df,
clean_llm_base_list,
)
from content import (
scatter_disclaimer_html,
format_error,
format_log,
format_warning,
hf_uri_to_web_url,
hyperlink,
)
# --- Constants and Configuration ---
LOCAL_DEBUG = not (os.environ.get("system") == "spaces")
CONFIG_NAME = "1.0.0-dev1" # This corresponds to 'config' in LeaderboardViewer
IS_INTERNAL = os.environ.get("IS_INTERNAL", "false").lower() == "true"
OWNER = "allenai"
PROJECT_NAME = "asta-bench" + ("-internal" if IS_INTERNAL else "")
SUBMISSION_DATASET = f"{OWNER}/{PROJECT_NAME}-submissions"
SUBMISSION_DATASET_PUBLIC = f"{OWNER}/{PROJECT_NAME}-submissions-public"
CONTACT_DATASET = f"{OWNER}/{PROJECT_NAME}-contact-info"
RESULTS_DATASET = f"{OWNER}/{PROJECT_NAME}-results" # This is the repo_id for LeaderboardViewer
LEADERBOARD_PATH = f"{OWNER}/{PROJECT_NAME}-leaderboard"
if LOCAL_DEBUG:
DATA_DIR = os.path.join(os.path.dirname(__file__), "data", CONFIG_NAME)
else:
DATA_DIR = "/home/user/data/" + CONFIG_NAME
EXTRACTED_DATA_DIR = os.path.join(DATA_DIR, "extracted")
api = HfApi()
MAX_UPLOAD_BYTES = 100 * 1024**2
AGENTEVAL_MANIFEST_NAME = "agenteval.json"
os.makedirs(EXTRACTED_DATA_DIR, exist_ok=True)
# Global variables
COMBINED_ICON_MAP = {
"Open Source + Open Weights": {
"Standard": "assets/os-ow-standard.svg", # Bright pink star
"Custom with Standard Search": "assets/os-ow-equivalent.svg", # Bright pink diamond
"Custom": "assets/os-ow-custom.svg", # Bright pink triangle
},
"Open Source": {
"Standard": "assets/os-standard.svg", # Orange/pink star
"Custom with Standard Search": "assets/os-equivalent.svg", # Orange/pink diamond
"Fully Custom": "assets/os-custom.svg", # Orange/pink triangle
},
"API Available": {
"Standard": "assets/api-standard.svg", # Yellow/pink star
"Custom with Standard Search": "assets/api-equivalent.svg", # Yellow/pink diamond
"Fully Custom": "assets/api-custom.svg", # Yellow/pink triangle
},
"Closed": {
"Standard": "assets/c-standard.svg", # Hollow pink star
"Equivalent": "assets/c-equivalent.svg", # Hollow pink diamond
"Fully Custom": "assets/c-custom.svg", # Hollow pink triangle
}
}
OPENNESS_SVG_MAP = {
"Open Source + Open Weights": "assets/os-ow-standard.svg",
"Open Source": "assets/os-standard.svg",
"API Available": "assets/api-standard.svg",
"Closed": "assets/c-standard.svg",
}
TOOLING_SVG_MAP = {
"Standard": "assets/os-ow-standard.svg",
"Custom with Standard Search": "assets/os-ow-equivalent.svg",
"Fully Custom": "assets/os-ow-custom.svg",
}
def get_svg_as_data_uri(path: str) -> str:
"""Reads an SVG file and returns it as a base64-encoded data URI."""
try:
with open(path, "rb") as svg_file:
encoded_svg = base64.b64encode(svg_file.read()).decode("utf-8")
return f"data:image/svg+xml;base64,{encoded_svg}"
except FileNotFoundError:
print(f"Warning: SVG file not found at {path}")
return ""
# Pre-load the icon map as data URIs. This runs once, at app startup.
PRELOADED_URI_MAP = {
openness: {
tooling: get_svg_as_data_uri(path)
for tooling, path in tooling_map.items()
}
for openness, tooling_map in COMBINED_ICON_MAP.items()
}
def get_combined_icon_html(row, uri_map):
"""
Looks up the correct icon URI from the pre-loaded map based on the row's
'Openness' and 'Agent Tooling' values and returns an HTML <img> tag.
"""
openness_val = row['Openness']
tooling_val = row['Agent Tooling']
uri = uri_map.get(openness_val, {}).get(tooling_val, "")
# The tooltip will show the exact combination for clarity.
tooltip = f"Openness: {openness_val}, Tooling: {tooling_val}"
# Return the HTML string that Gradio will render in the DataFrame.
return f'<img src="{uri}" alt="{tooltip}" title="{tooltip}" style="width:24px; height:24px;">'
def create_svg_html(value, svg_map):
"""
Generates the absolute simplest HTML for an icon, without any extra text.
This version is compatible with gr.DataFrame.
"""
if pd.isna(value) or value not in svg_map:
return ""
path_info = svg_map[value]
src = get_svg_as_data_uri(path_info)
# Generate the HTML for the single icon, with NO text.
if src:
return f'<img src="{src}" style="width: 16px; height: 16px; vertical-align: middle;" alt="{value}" title="{value}">'
return ""
# Create HTML for the "Openness" legend items
openness_html_items = []
for name, path in OPENNESS_SVG_MAP.items():
uri = get_svg_as_data_uri(path)
# Each item is now its own flexbox container to guarantee alignment
openness_html_items.append(
f'<div style="display: flex; align-items: center; white-space: nowrap;">'
f'<img src="{uri}" alt="{name}" title="{name}" style="width:16px; height:16px; margin-right: 4px; flex-shrink: 0;">'
f'<span>{name}</span>'
f'</div>'
)
openness_html = " ".join(openness_html_items)
# Create HTML for the "Tooling" legend items
tooling_html_items = []
for name, path in TOOLING_SVG_MAP.items():
uri = get_svg_as_data_uri(path)
tooling_html_items.append(
f'<div style="display: flex; align-items: center; white-space: nowrap;">'
f'<img src="{uri}" alt="{name}" title="{name}" style="width:16px; height:16px; margin-right: 4px; flex-shrink: 0;">'
f'<span>{name}</span>'
f'</div>'
)
tooling_html = " ".join(tooling_html_items)
# Legend markdown rendered above each leaderboard table
legend_markdown = f"""
<div style="display: flex; flex-wrap: wrap; align-items: flex-start; gap: 24px; font-size: 14px; padding-bottom: 8px;">
<div> <!-- Container for the Pareto section -->
<b>Pareto</b><span class="tooltip-icon" data-tooltip="Indicates if agent is on the Pareto frontier
">ⓘ</span>
<div style="padding-top: 4px;"><span>🏆 On frontier</span></div>
</div>
<div> <!-- Container for the Openness section -->
        <b>Agent Openness</b><span class="tooltip-icon" data-tooltip="• Closed: No API or code available
• API Available: API available, but no code
• Open Source: Code available, but no weights
• Open Source + Open Weights: Code and weights available
">ⓘ</span>
<div style="display: flex; flex-wrap: wrap; align-items: center; gap: 16px; margin-top: 4px;">{openness_html}</div>
</div>
<div> <!-- Container for the Tooling section -->
        <b>Agent Tooling</b><span class="tooltip-icon" data-tooltip="• Standard: Standard tools used by the agent
• Custom with Standard Search: Custom tools, with standard search, used by the agent
• Fully Custom: Fully custom tools used by the agent
">ⓘ</span>
<div style="display: flex; flex-wrap: wrap; align-items: center; gap: 16px; margin-top: 4px;">{tooling_html}</div>
</div>
<div><b>Column Descriptions</b><span class="tooltip-icon" data-tooltip="• Overall Score: Performance across all benchmarks
• Overall Cost: Cost per task in USD
• Literature Understanding Score: Performance on scientific literature tasks
• Literature Understanding Cost: Cost per literature understanding task in USD
• Data Analysis Score: Performance on data analysis tasks
• Code Execution Score: Performance on coding tasks
• Code Execution Cost: Cost per code execution task in USD
• Discovery Score: Performance on information discovery tasks
• Discovery Cost: Cost per discovery task in USD
• Categories Attempted: Number of benchmark categories the agent participated in
• Logs: Link to detailed evaluation logs">ⓘ</span></div>
</div>
"""
# --- Global State for Viewers (simple caching) ---
CACHED_VIEWERS = {}
CACHED_TAG_MAPS = {}
class DummyViewer:
"""A mock viewer to be cached on error. It has a ._load() method
to ensure it behaves like the real LeaderboardViewer."""
def __init__(self, error_df):
self._error_df = error_df
def _load(self):
# The _load method returns the error DataFrame and an empty tag map
return self._error_df, {}
def get_leaderboard_viewer_instance(split: str):
"""
Fetches the LeaderboardViewer for a split, using a cache to avoid
re-downloading data. On error, returns a stable DummyViewer object.
"""
global CACHED_VIEWERS, CACHED_TAG_MAPS
if split in CACHED_VIEWERS:
# Cache hit: return the cached viewer and tag map
return CACHED_VIEWERS[split], CACHED_TAG_MAPS.get(split, {"Overall": []})
# --- Cache miss: try to load data from the source ---
try:
print(f"Using Hugging Face dataset for split '{split}': {RESULTS_DATASET}/{CONFIG_NAME}")
viewer = LeaderboardViewer(
repo_id=RESULTS_DATASET,
config=CONFIG_NAME,
split=split,
is_internal=IS_INTERNAL
)
        # Convert the raw tag map to display-friendly names
pretty_tag_map = create_pretty_tag_map(viewer.tag_map, INFORMAL_TO_FORMAL_NAME_MAP)
# Cache the results for next time
CACHED_VIEWERS[split] = viewer
CACHED_TAG_MAPS[split] = pretty_tag_map # Cache the pretty map directly
return viewer, pretty_tag_map
except Exception as e:
# On ANY error, create a consistent error message and cache a DummyViewer
error_message = f"Error loading data for split '{split}': {e}"
print(format_error(error_message))
dummy_df = pd.DataFrame({"Message": [error_message]})
dummy_viewer = DummyViewer(dummy_df)
dummy_tag_map = {"Overall": []}
# Cache the dummy objects so we don't try to fetch again on this run
CACHED_VIEWERS[split] = dummy_viewer
CACHED_TAG_MAPS[split] = dummy_tag_map
return dummy_viewer, dummy_tag_map
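# Illustrative usage of get_leaderboard_viewer_instance (a sketch; the split name is an assumption):
#   viewer, tag_map = get_leaderboard_viewer_instance("validation")  # first call downloads the data
#   viewer, tag_map = get_leaderboard_viewer_instance("validation")  # second call hits CACHED_VIEWERS
#   raw_df, _ = viewer._load()  # works for both LeaderboardViewer and DummyViewer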
def create_leaderboard_display(
full_df: pd.DataFrame,
tag_map: dict,
category_name: str,
split_name: str
):
"""
This UI factory takes pre-loaded data and renders the main DataFrame and Plot
for a given category (e.g., "Overall" or "Literature Understanding").
"""
    # Instantiate the transformer and get the specific view for this category.
    # This function does not load data itself; it only filters the data it receives.
transformer = DataTransformer(full_df, tag_map)
df_view, plots_dict = transformer.view(tag=category_name, use_plotly=True)
pareto_df = get_pareto_df(df_view)
# Get the list of agents on the frontier. We'll use this list later.
if not pareto_df.empty and 'id' in pareto_df.columns:
pareto_agent_names = pareto_df['id'].tolist()
else:
pareto_agent_names = []
df_view['Pareto'] = df_view.apply(
lambda row: '🏆' if row['id'] in pareto_agent_names else '',
axis=1
)
# Create mapping for Openness / tooling
df_view['Icon'] = df_view.apply(
lambda row: get_combined_icon_html(row, PRELOADED_URI_MAP),
axis=1 # IMPORTANT: axis=1 tells pandas to process row-by-row
)
# Format cost columns
for col in df_view.columns:
if "Cost" in col:
df_view = format_cost_column(df_view, col)
    # Format score columns
for col in df_view.columns:
if "Score" in col:
df_view = format_score_column(df_view, col)
scatter_plot = plots_dict.get('scatter_plot', go.Figure())
    # Clean and format the LLM Base column
df_view['LLM Base'] = df_view['LLM Base'].apply(clean_llm_base_list)
df_view['LLM Base'] = df_view['LLM Base'].apply(format_llm_base_with_html)
all_cols = df_view.columns.tolist()
    # Move the Pareto and Icon columns to the front
all_cols.insert(0, all_cols.pop(all_cols.index('Icon')))
all_cols.insert(0, all_cols.pop(all_cols.index('Pareto')))
df_view = df_view[all_cols]
# Drop internally used columns that are not needed in the display
columns_to_drop = ['id', 'agent_for_hover', 'Openness', 'Agent Tooling']
df_view = df_view.drop(columns=columns_to_drop, errors='ignore')
df_headers = df_view.columns.tolist()
df_datatypes = []
for col in df_headers:
if col in ["Logs", "Agent"] or "Cost" in col or "Score" in col:
df_datatypes.append("markdown")
elif col in ["Icon","LLM Base"]:
df_datatypes.append("html")
else:
df_datatypes.append("str")
    # Blank out the Pareto and Icon header labels for display.
    header_rename_map = {
        "Pareto": "",
        "Icon": "",
    }
    df_view = df_view.rename(columns=header_rename_map)
plot_component = gr.Plot(
value=scatter_plot,
show_label=False
)
gr.HTML(value=scatter_disclaimer_html, elem_id="scatter-disclaimer")
# Put table and key into an accordion
with gr.Accordion("Show / Hide Table View", open=True, elem_id="leaderboard-accordion"):
gr.HTML(value=legend_markdown, elem_id="legend-markdown")
dataframe_component = gr.DataFrame(
headers=df_headers,
value=df_view,
datatype=df_datatypes,
interactive=False,
wrap=True,
column_widths=[40, 40, 200, 200],
elem_classes=["wrap-header-df"]
)
# Return the components so they can be referenced elsewhere.
return plot_component, dataframe_component
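# Illustrative usage of create_leaderboard_display (a sketch; must run inside a Gradio Blocks
# context, and "validation" is an assumed split name):
#   with gr.Blocks() as demo:
#       full_df, tag_map = get_full_leaderboard_data("validation")
#       plot, table = create_leaderboard_display(full_df, tag_map, "Overall", "validation")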
# --- Detailed Benchmark Display ---
def create_benchmark_details_display(
full_df: pd.DataFrame,
tag_map: dict,
category_name: str
):
"""
Generates a detailed breakdown for each benchmark within a given category.
For each benchmark, it creates a title, a filtered table, and a scatter plot.
Args:
full_df (pd.DataFrame): The complete, "pretty" dataframe for the entire split.
tag_map (dict): The "pretty" tag map to find the list of benchmarks.
category_name (str): The main category to display details for (e.g., "Literature Understanding").
"""
# 1. Get the list of benchmarks for the selected category
benchmark_names = tag_map.get(category_name, [])
if not benchmark_names:
gr.Markdown(f"No detailed benchmarks found for the category: {category_name}")
return
gr.Markdown("---")
gr.Markdown("## Detailed Benchmark Results")
# 2. Loop through each benchmark and create its UI components
for benchmark_name in benchmark_names:
with gr.Row(elem_classes=["benchmark-header"]):
gr.Markdown(f"### {benchmark_name} Leaderboard", header_links=True)
button_str = f"""
<button
class="scroll-up-button"
onclick="scroll_to_element('page-content-wrapper')"
>
{"⬆"}
</button>
"""
gr.HTML(button_str,elem_classes="scroll-up-container")
# 3. Prepare the data for this specific benchmark's table and plot
benchmark_score_col = f"{benchmark_name} Score"
benchmark_cost_col = f"{benchmark_name} Cost"
# Define the columns needed for the detailed table
table_cols = ['Agent','Openness','Agent Tooling', 'Submitter', 'Date', benchmark_score_col, benchmark_cost_col,'Logs','id', 'LLM Base']
# Filter to only columns that actually exist in the full dataframe
existing_table_cols = [col for col in table_cols if col in full_df.columns]
if benchmark_score_col not in existing_table_cols:
gr.Markdown(f"Score data for {benchmark_name} not available.")
continue # Skip to the next benchmark if score is missing
# Create a specific DataFrame for the table view
benchmark_table_df = full_df[existing_table_cols].copy()
pareto_df = get_pareto_df(benchmark_table_df)
# Get the list of agents on the frontier. We'll use this list later.
if not pareto_df.empty and 'id' in pareto_df.columns:
pareto_agent_names = pareto_df['id'].tolist()
else:
pareto_agent_names = []
benchmark_table_df['Pareto'] = benchmark_table_df.apply(
lambda row: ' 🏆' if row['id'] in pareto_agent_names else '',
axis=1
)
benchmark_table_df['Icon'] = benchmark_table_df.apply(
lambda row: get_combined_icon_html(row, PRELOADED_URI_MAP),
axis=1 # IMPORTANT: axis=1 tells pandas to process row-by-row
)
        # Clean and format the LLM Base column
benchmark_table_df['LLM Base'] = benchmark_table_df['LLM Base'].apply(clean_llm_base_list)
benchmark_table_df['LLM Base'] = benchmark_table_df['LLM Base'].apply(format_llm_base_with_html)
# Calculated and add "Benchmark Attempted" column
def check_benchmark_status(row):
has_score = pd.notna(row.get(benchmark_score_col))
has_cost = pd.notna(row.get(benchmark_cost_col))
if has_score and has_cost:
return "✅"
if has_score or has_cost:
return "⚠️"
return "🚫 "
# Apply the function to create the new column
benchmark_table_df['Attempted Benchmark'] = benchmark_table_df.apply(check_benchmark_status, axis=1)
# Sort the DataFrame
if benchmark_score_col in benchmark_table_df.columns:
benchmark_table_df = benchmark_table_df.sort_values(
by=benchmark_score_col, ascending=False, na_position='last'
)
        # Format the cost and score columns
benchmark_table_df = format_cost_column(benchmark_table_df, benchmark_cost_col)
benchmark_table_df = format_score_column(benchmark_table_df, benchmark_score_col)
desired_cols_in_order = [
'Pareto',
'Icon',
'Agent',
'Submitter',
'LLM Base',
'Attempted Benchmark',
benchmark_score_col,
benchmark_cost_col,
'Logs'
]
for col in desired_cols_in_order:
if col not in benchmark_table_df.columns:
benchmark_table_df[col] = pd.NA # Add as an empty column
benchmark_table_df = benchmark_table_df[desired_cols_in_order]
        # Rename the benchmark-specific columns to generic 'Score' / 'Cost' labels for display
        benchmark_table_df.rename(columns={
            benchmark_score_col: 'Score',
            benchmark_cost_col: 'Cost',
        }, inplace=True)
        # Build the header list and per-column datatypes (markdown for links/scores, HTML for icons)
df_headers = benchmark_table_df.columns.tolist()
df_datatypes = []
for col in df_headers:
if "Logs" in col or "Cost" in col or "Score" in col:
df_datatypes.append("markdown")
elif col in ["Icon", "LLM Base"]:
df_datatypes.append("html")
else:
df_datatypes.append("str")
        # Blank out the Pareto and Icon header labels for display.
        header_rename_map = {
            "Pareto": "",
            "Icon": "",
        }
        benchmark_table_df = benchmark_table_df.rename(columns=header_rename_map)
# Create the scatter plot using the full data for context, but plotting benchmark metrics
# This shows all agents on the same axis for better comparison.
benchmark_plot = _plot_scatter_plotly(
data=full_df,
x=benchmark_cost_col,
y=benchmark_score_col,
agent_col="Agent",
name=benchmark_name
)
gr.Plot(value=benchmark_plot, show_label=False)
gr.HTML(value=scatter_disclaimer_html, elem_id="scatter-disclaimer")
# Put table and key into an accordion
with gr.Accordion("Show / Hide Table View", open=True, elem_id="leaderboard-accordion"):
gr.HTML(value=legend_markdown, elem_id="legend-markdown")
gr.DataFrame(
headers=df_headers,
value=benchmark_table_df,
datatype=df_datatypes,
interactive=False,
wrap=True,
column_widths=[40, 40, 200, 150, 175, 85],
elem_classes=["wrap-header-df"]
)
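# Illustrative usage of create_benchmark_details_display (a sketch; rendered inside the same Blocks
# context as the main display, with an assumed split name):
#   with gr.Blocks() as demo:
#       full_df, tag_map = get_full_leaderboard_data("validation")
#       create_benchmark_details_display(full_df, tag_map, "Literature Understanding")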
def get_full_leaderboard_data(split: str) -> tuple[pd.DataFrame, dict]:
"""
Loads and transforms the complete dataset for a given split.
This function handles caching and returns the final "pretty" DataFrame and tag map.
"""
viewer_or_data, raw_tag_map = get_leaderboard_viewer_instance(split)
if isinstance(viewer_or_data, (LeaderboardViewer, DummyViewer)):
raw_df, _ = viewer_or_data._load()
if raw_df.empty:
return pd.DataFrame(), {}
pretty_df = transform_raw_dataframe(raw_df)
pretty_tag_map = create_pretty_tag_map(raw_tag_map, INFORMAL_TO_FORMAL_NAME_MAP)
if "Logs" in pretty_df.columns:
def format_log_entry_to_html(raw_uri):
if pd.isna(raw_uri) or raw_uri == "": return ""
web_url = hf_uri_to_web_url(str(raw_uri))
return hyperlink(web_url, "🔗") if web_url else ""
# Apply the function to the "Logs" column
pretty_df["Logs"] = pretty_df["Logs"].apply(format_log_entry_to_html)
return pretty_df, pretty_tag_map
# Fallback for unexpected types
return pd.DataFrame(), {}
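# Illustrative usage of get_full_leaderboard_data (a sketch; "validation" is an assumed split name):
#   pretty_df, pretty_tag_map = get_full_leaderboard_data("validation")
#   # pretty_df holds display-ready columns (e.g. "Logs" rendered as HTML links);
#   # pretty_tag_map maps each pretty category name to its list of benchmark names.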
# Create sub-nav bar for benchmarks
def create_gradio_anchor_id(text: str, validation: bool) -> str:
    """
    Replicates the anchor ID format created by gr.Markdown(header_links=True)
    for the "### <name> Leaderboard" headings, e.g.
    "Paper Finder" -> "h-paper-finder-leaderboard" ("-1" is appended when validation=True).
    """
text = text.lower()
text = re.sub(r'\s+', '-', text) # Replace spaces with hyphens
text = re.sub(r'[^\w-]', '', text) # Remove non-word characters
if validation:
return f"h-{text}-leaderboard-1"
return f"h-{text}-leaderboard"
def create_sub_navigation_bar(tag_map: dict, category_name: str, validation: bool = False) -> gr.HTML:
"""
Builds the entire sub-navigation bar as a single, self-contained HTML component.
This bypasses Gradio's layout components, giving us full control.
"""
benchmark_names = tag_map.get(category_name, [])
if not benchmark_names:
# Return an empty HTML component to prevent errors
return gr.HTML()
# Start building the list of HTML button elements as strings
html_buttons = []
for name in benchmark_names:
target_id = create_gradio_anchor_id(name, validation)
# Create a standard HTML button.
# The onclick attribute calls our global JS function directly.
# Note the mix of double and single quotes.
button_str = f"""
<button
class="sub-nav-link-button"
onclick="scroll_to_element('{target_id}')"
>
{name}
</button>
"""
html_buttons.append(button_str)
# Join the button strings and wrap them in a single div container
# This container will be our flexbox row.
full_html = f"""
<div class="sub-nav-bar-container">
<span class="sub-nav-label">Benchmarks:</span>
{''.join(html_buttons)}
</div>
"""
# Return the entire navigation bar as one single Gradio HTML component
return gr.HTML(full_html)
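# Illustrative usage of create_sub_navigation_bar (a sketch; relies on the page-level
# scroll_to_element JavaScript helper being defined elsewhere in the app):
#   create_sub_navigation_bar(tag_map, "Literature Understanding")
#   # -> a gr.HTML component with one button per benchmark, each scrolling to its heading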
def format_llm_base_with_html(value):
"""
Formats the 'LLM Base' cell value.
If the value is a list with more than 1 element, it returns an
HTML <span> with the full list in a hover-over tooltip.
If it's a single-element list, it returns just that element.
Otherwise, it returns the original value.
"""
if isinstance(value, list):
if len(value) > 1:
# Join the list items with a newline character for a clean tooltip
tooltip_text = "\n".join(map(str, value))
# Return an HTML span with the title attribute for the tooltip
return f'<span class="tooltip-icon cell-tooltip-icon" style="cursor: help;" data-tooltip="{tooltip_text}">{value[0]} (+ {len(value) - 1}) ⓘ</span>'
if len(value) == 1:
# If only one item, just return that item
return value[0]
# Return the value as-is if it's not a list or is an empty list
return value
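# Illustrative behaviour of format_llm_base_with_html (a sketch; model names are made up):
#   format_llm_base_with_html(["gpt-4o"])                 # -> "gpt-4o"
#   format_llm_base_with_html(["gpt-4o", "claude-3.7"])   # -> HTML span showing "gpt-4o (+ 1) ⓘ"
#                                                         #    with the full list in a hover tooltip
#   format_llm_base_with_html("gpt-4o")                   # -> "gpt-4o" (non-list values pass through)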