Spaces:
Running
Running
Amber Tanaka
committed on
Add Repro links to Agent column (#63)
Browse files
- leaderboard_transformer.py +2 -2
- ui_components.py +26 -9
leaderboard_transformer.py
CHANGED
|
@@ -113,6 +113,7 @@ def _pretty_column_name(raw_col: str) -> str:
|
|
| 113 |
'Openness': 'Openness',
|
| 114 |
'Agent tooling': 'Agent Tooling',
|
| 115 |
'LLM base': 'LLM Base',
|
|
|
|
| 116 |
}
|
| 117 |
|
| 118 |
if raw_col in fixed_mappings:
|
|
@@ -186,7 +187,6 @@ def transform_raw_dataframe(raw_df: pd.DataFrame) -> pd.DataFrame:
|
|
| 186 |
raise TypeError("Input 'raw_df' must be a pandas DataFrame.")
|
| 187 |
|
| 188 |
df = raw_df.copy()
|
| 189 |
-
|
| 190 |
# Create the mapping for pretty column names
|
| 191 |
pretty_cols_map = {col: _pretty_column_name(col) for col in df.columns}
|
| 192 |
|
|
@@ -255,7 +255,7 @@ class DataTransformer:
|
|
| 255 |
df_view = df_sorted.copy()
|
| 256 |
|
| 257 |
# --- 3. Add Columns for Agent Openness and Tooling ---
|
| 258 |
-
base_cols = ["id","Agent","Submitter","LLM Base"]
|
| 259 |
new_cols = ["Openness", "Agent Tooling"]
|
| 260 |
ending_cols = ["Logs"]
|
| 261 |
|
|
|
|
| 113 |
'Openness': 'Openness',
|
| 114 |
'Agent tooling': 'Agent Tooling',
|
| 115 |
'LLM base': 'LLM Base',
|
| 116 |
+
'Source': 'Source',
|
| 117 |
}
|
| 118 |
|
| 119 |
if raw_col in fixed_mappings:
|
|
|
|
| 187 |
raise TypeError("Input 'raw_df' must be a pandas DataFrame.")
|
| 188 |
|
| 189 |
df = raw_df.copy()
|
|
|
|
| 190 |
# Create the mapping for pretty column names
|
| 191 |
pretty_cols_map = {col: _pretty_column_name(col) for col in df.columns}
|
| 192 |
|
|
|
|
| 255 |
df_view = df_sorted.copy()
|
| 256 |
|
| 257 |
# --- 3. Add Columns for Agent Openness and Tooling ---
|
| 258 |
+
base_cols = ["id","Agent","Submitter","LLM Base","Source"]
|
| 259 |
new_cols = ["Openness", "Agent Tooling"]
|
| 260 |
ending_cols = ["Logs"]
|
| 261 |
|
ui_components.py
CHANGED
|
@@ -451,6 +451,12 @@ def create_leaderboard_display(
|
|
| 451 |
#Make pretty and format the LLM Base column
|
| 452 |
df_view['LLM Base'] = df_view['LLM Base'].apply(clean_llm_base_list)
|
| 453 |
df_view['LLM Base'] = df_view['LLM Base'].apply(format_llm_base_with_html)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 454 |
|
| 455 |
all_cols = df_view.columns.tolist()
|
| 456 |
# Remove pareto and Icon columns and insert it at the beginning
|
|
@@ -458,7 +464,7 @@ def create_leaderboard_display(
|
|
| 458 |
all_cols.insert(0, all_cols.pop(all_cols.index('Pareto')))
|
| 459 |
df_view = df_view[all_cols]
|
| 460 |
# Drop internally used columns that are not needed in the display
|
| 461 |
-
columns_to_drop = ['id', 'Openness', 'Agent Tooling']
|
| 462 |
df_view = df_view.drop(columns=columns_to_drop, errors='ignore')
|
| 463 |
|
| 464 |
df_headers = df_view.columns.tolist()
|
|
@@ -466,7 +472,7 @@ def create_leaderboard_display(
|
|
| 466 |
for col in df_headers:
|
| 467 |
if col == "Logs" or "Cost" in col or "Score" in col:
|
| 468 |
df_datatypes.append("markdown")
|
| 469 |
-
elif col in ["Icon","LLM Base"]:
|
| 470 |
df_datatypes.append("html")
|
| 471 |
else:
|
| 472 |
df_datatypes.append("str")
|
|
@@ -484,8 +490,8 @@ def create_leaderboard_display(
|
|
| 484 |
for col in remaining_headers:
|
| 485 |
if "Score" in col or "Cost" in col:
|
| 486 |
num_score_cost_cols += 1
|
| 487 |
-
dynamic_widths = [
|
| 488 |
-
fixed_end_widths = [
|
| 489 |
# 5. Combine all the lists to create the final, fully dynamic list.
|
| 490 |
final_column_widths = fixed_start_widths + dynamic_widths + fixed_end_widths
|
| 491 |
|
|
@@ -553,7 +559,7 @@ def create_benchmark_details_display(
|
|
| 553 |
benchmark_cost_col = f"{benchmark_name} Cost"
|
| 554 |
|
| 555 |
# Define the columns needed for the detailed table
|
| 556 |
-
table_cols = ['Agent','Openness','Agent Tooling', 'Submitter', 'Date', benchmark_score_col, benchmark_cost_col,'Logs','id', 'LLM Base']
|
| 557 |
|
| 558 |
# Filter to only columns that actually exist in the full dataframe
|
| 559 |
existing_table_cols = [col for col in table_cols if col in full_df.columns]
|
|
@@ -583,6 +589,12 @@ def create_benchmark_details_display(
|
|
| 583 |
#Make pretty and format the LLM Base column
|
| 584 |
benchmark_table_df['LLM Base'] = benchmark_table_df['LLM Base'].apply(clean_llm_base_list)
|
| 585 |
benchmark_table_df['LLM Base'] = benchmark_table_df['LLM Base'].apply(format_llm_base_with_html)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 586 |
|
| 587 |
# Calculate and add "Benchmark Attempted" column
|
| 588 |
def check_benchmark_status(row):
|
|
@@ -630,7 +642,7 @@ def create_benchmark_details_display(
|
|
| 630 |
for col in df_headers:
|
| 631 |
if "Logs" in col or "Cost" in col or "Score" in col:
|
| 632 |
df_datatypes.append("markdown")
|
| 633 |
-
elif col in ["Icon", "LLM Base"]:
|
| 634 |
df_datatypes.append("html")
|
| 635 |
else:
|
| 636 |
df_datatypes.append("str")
|
|
@@ -641,8 +653,6 @@ def create_benchmark_details_display(
|
|
| 641 |
}
|
| 642 |
# 2. Create the final list of headers for display.
|
| 643 |
benchmark_table_df = benchmark_table_df.rename(columns=header_rename_map)
|
| 644 |
-
# Create the scatter plot using the full data for context, but plotting benchmark metrics
|
| 645 |
-
# This shows all agents on the same axis for better comparison.
|
| 646 |
benchmark_plot = _plot_scatter_plotly(
|
| 647 |
data=full_df,
|
| 648 |
x=benchmark_cost_col,
|
|
@@ -685,10 +695,17 @@ def get_full_leaderboard_data(split: str) -> tuple[pd.DataFrame, dict]:
|
|
| 685 |
if pd.isna(raw_uri) or raw_uri == "": return ""
|
| 686 |
web_url = hf_uri_to_web_url(str(raw_uri))
|
| 687 |
return hyperlink(web_url, "🔗") if web_url else ""
|
| 688 |
-
|
| 689 |
# Apply the function to the "Logs" column
|
| 690 |
pretty_df["Logs"] = pretty_df["Logs"].apply(format_log_entry_to_html)
|
| 691 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 692 |
return pretty_df, pretty_tag_map
|
| 693 |
|
| 694 |
# Fallback for unexpected types
|
|
|
|
| 451 |
#Make pretty and format the LLM Base column
|
| 452 |
df_view['LLM Base'] = df_view['LLM Base'].apply(clean_llm_base_list)
|
| 453 |
df_view['LLM Base'] = df_view['LLM Base'].apply(format_llm_base_with_html)
|
| 454 |
+
# append the repro url to the end of the agent name
|
| 455 |
+
if 'Source' in df_view.columns:
|
| 456 |
+
df_view['Agent'] = df_view.apply(
|
| 457 |
+
lambda row: f"{row['Agent']} {row['Source']}" if row['Source'] else row['Agent'],
|
| 458 |
+
axis=1
|
| 459 |
+
)
|
| 460 |
|
| 461 |
all_cols = df_view.columns.tolist()
|
| 462 |
# Remove pareto and Icon columns and insert it at the beginning
|
|
|
|
| 464 |
all_cols.insert(0, all_cols.pop(all_cols.index('Pareto')))
|
| 465 |
df_view = df_view[all_cols]
|
| 466 |
# Drop internally used columns that are not needed in the display
|
| 467 |
+
columns_to_drop = ['id', 'Openness', 'Agent Tooling', 'Source']
|
| 468 |
df_view = df_view.drop(columns=columns_to_drop, errors='ignore')
|
| 469 |
|
| 470 |
df_headers = df_view.columns.tolist()
|
|
|
|
| 472 |
for col in df_headers:
|
| 473 |
if col == "Logs" or "Cost" in col or "Score" in col:
|
| 474 |
df_datatypes.append("markdown")
|
| 475 |
+
elif col in ["Agent","Icon","LLM Base"]:
|
| 476 |
df_datatypes.append("html")
|
| 477 |
else:
|
| 478 |
df_datatypes.append("str")
|
|
|
|
| 490 |
for col in remaining_headers:
|
| 491 |
if "Score" in col or "Cost" in col:
|
| 492 |
num_score_cost_cols += 1
|
| 493 |
+
dynamic_widths = [90] * num_score_cost_cols
|
| 494 |
+
fixed_end_widths = [90, 50]
|
| 495 |
# 5. Combine all the lists to create the final, fully dynamic list.
|
| 496 |
final_column_widths = fixed_start_widths + dynamic_widths + fixed_end_widths
|
| 497 |
|
|
|
|
| 559 |
benchmark_cost_col = f"{benchmark_name} Cost"
|
| 560 |
|
| 561 |
# Define the columns needed for the detailed table
|
| 562 |
+
table_cols = ['Agent','Source','Openness','Agent Tooling', 'Submitter', 'Date', benchmark_score_col, benchmark_cost_col,'Logs','id', 'LLM Base']
|
| 563 |
|
| 564 |
# Filter to only columns that actually exist in the full dataframe
|
| 565 |
existing_table_cols = [col for col in table_cols if col in full_df.columns]
|
|
|
|
| 589 |
#Make pretty and format the LLM Base column
|
| 590 |
benchmark_table_df['LLM Base'] = benchmark_table_df['LLM Base'].apply(clean_llm_base_list)
|
| 591 |
benchmark_table_df['LLM Base'] = benchmark_table_df['LLM Base'].apply(format_llm_base_with_html)
|
| 592 |
+
# append the repro url to the end of the agent name
|
| 593 |
+
if 'Source' in benchmark_table_df.columns:
|
| 594 |
+
benchmark_table_df['Agent'] = benchmark_table_df.apply(
|
| 595 |
+
lambda row: f"{row['Agent']} {row['Source']}" if row['Source'] else row['Agent'],
|
| 596 |
+
axis=1
|
| 597 |
+
)
|
| 598 |
|
| 599 |
# Calculate and add "Benchmark Attempted" column
|
| 600 |
def check_benchmark_status(row):
|
|
|
|
| 642 |
for col in df_headers:
|
| 643 |
if "Logs" in col or "Cost" in col or "Score" in col:
|
| 644 |
df_datatypes.append("markdown")
|
| 645 |
+
elif col in ["Agent","Icon", "LLM Base"]:
|
| 646 |
df_datatypes.append("html")
|
| 647 |
else:
|
| 648 |
df_datatypes.append("str")
|
|
|
|
| 653 |
}
|
| 654 |
# 2. Create the final list of headers for display.
|
| 655 |
benchmark_table_df = benchmark_table_df.rename(columns=header_rename_map)
|
|
|
|
|
|
|
| 656 |
benchmark_plot = _plot_scatter_plotly(
|
| 657 |
data=full_df,
|
| 658 |
x=benchmark_cost_col,
|
|
|
|
| 695 |
if pd.isna(raw_uri) or raw_uri == "": return ""
|
| 696 |
web_url = hf_uri_to_web_url(str(raw_uri))
|
| 697 |
return hyperlink(web_url, "🔗") if web_url else ""
|
|
|
|
| 698 |
# Apply the function to the "Logs" column
|
| 699 |
pretty_df["Logs"] = pretty_df["Logs"].apply(format_log_entry_to_html)
|
| 700 |
|
| 701 |
+
if "Source" in pretty_df.columns:
|
| 702 |
+
def format_source_url_to_html(raw_url):
|
| 703 |
+
# Handle empty or NaN values, returning a blank string.
|
| 704 |
+
if pd.isna(raw_url) or raw_url == "": return ""
|
| 705 |
+
# Assume the "Source" value is already a valid web URL and doesn't need conversion.
|
| 706 |
+
return hyperlink(str(raw_url), "🔗")
|
| 707 |
+
# Apply the function to the "Source" column.
|
| 708 |
+
pretty_df["Source"] = pretty_df["Source"].apply(format_source_url_to_html)
|
| 709 |
return pretty_df, pretty_tag_map
|
| 710 |
|
| 711 |
# Fallback for unexpected types
|