Amber Tanaka committed on

Current Updated! State (#4)
Files changed:

- app.py +28 -10
- assets/just-icon.svg +3 -0
- assets/logo.svg +12 -0
- c_and_e.py +9 -3
- content.py +57 -24
- data_analysis.py +8 -3
- e2e.py +8 -3
- json_leaderboard.py +0 -485
- leaderboard_transformer.py +192 -56
- leaderboard_viewer.py +0 -319
- literature_understanding.py +9 -3
- main_page.py +5 -321
- requirements.txt +131 -5
- submission.py +324 -0
- ui_components.py +143 -40
app.py CHANGED

```diff
@@ -4,9 +4,9 @@ import os
 
 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import HfApi
-import literature_understanding, main_page, c_and_e, data_analysis, e2e
+import literature_understanding, main_page, c_and_e, data_analysis, e2e, submission
 
-from content import 
+from content import css
 
 # --- Constants and Configuration  ---
 LOCAL_DEBUG = not (os.environ.get("system") == "spaces")
@@ -15,7 +15,7 @@ OWNER = "allenai"
 PROJECT_NAME = "asta-bench" + ("-internal" if IS_INTERNAL else "")
 LEADERBOARD_PATH = f"{OWNER}/{PROJECT_NAME}-leaderboard"
 api = HfApi()
-LOGO_PATH = "
+LOGO_PATH = "assets/logo.svg"
 
 
 
@@ -50,13 +50,22 @@ theme = gr.themes.Base(
     button_primary_background_fill_dark='*primary_900',
     button_primary_background_fill_hover='*secondary_600',
     button_primary_background_fill_hover_dark='*primary_600',
+    button_secondary_background_fill="#9FEAD1",
+    button_secondary_background_fill_dark="#9FEAD1",
+    button_secondary_text_color="*neutral_900",
+    button_secondary_text_color_dark="*neutral_900",
+    block_title_text_color="*neutral_900",
     button_primary_text_color='*neutral_900',
-    
+    block_title_text_color_dark="#ffffff",
+    checkbox_label_text_color_dark="#000",
+    button_primary_text_color_dark='*neutral_900',
+    block_border_color="#032629",
+    block_border_color_dark="#9fead1",
+    block_background_fill_dark="#032629",
+    block_background_fill="#FAF2E9",
 )
-
-
-with demo:
-    gr.Image(
+def render_logo():
+    return gr.Image(
         value=LOGO_PATH,
         show_label=False,
         interactive=False,
@@ -65,17 +74,26 @@ with demo:
         show_fullscreen_button=False,
         elem_id="logo-image"
     )
-
-
+# --- Gradio App Definition ---
+demo = gr.Blocks(theme=theme, css=css)
+with demo:
+    render_logo()
     main_page.demo.render()
 with demo.route("Literature Understanding"):
+    render_logo()
     literature_understanding.demo.render()
 with demo.route("Code & Execution"):
+    render_logo()
     c_and_e.demo.render()
 with demo.route("Data Analysis"):
+    render_logo()
     data_analysis.demo.render()
 with demo.route("Discovery"):
+    render_logo()
     e2e.demo.render()
+with demo.route(" 🚀 Submit an Agent"):
+    render_logo()
+    submission.demo.render()
 
 # --- Scheduler and Launch
 def restart_space_job():
```
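For context on the routing above: each page module that app.py imports (literature_understanding, c_and_e, submission, and so on) is expected to expose a module-level `demo` Blocks object, which the top-level app mounts via `.render()` inside a `demo.route(...)` block. A minimal sketch of such a sub-page module; only the module-level `demo` contract comes from the diff, the widgets are hypothetical:

```python
# Hypothetical sketch of a sub-page module (e.g. submission.py).
# Only the module-level `demo` object is implied by app.py's
# `submission.demo.render()` call; the widgets below are illustrative.
import gradio as gr

with gr.Blocks() as demo:
    gr.Markdown("## Submit an Agent")           # placeholder page content
    agent_name = gr.Textbox(label="Agent name")
    gr.Button("Submit", variant="primary")
```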
    	
assets/just-icon.svg ADDED

assets/logo.svg ADDED
    	
c_and_e.py CHANGED

```diff
@@ -2,13 +2,19 @@ import gradio as gr
 import pandas as pd
 
 # Import our UI factories and the data loader
-from ui_components import create_leaderboard_display, create_benchmark_details_display, get_full_leaderboard_data
-
+from ui_components import create_leaderboard_display, create_benchmark_details_display, get_full_leaderboard_data,create_sub_navigation_bar
+from content import PLACEHOLDER_DESCRIPTION
 # Define the category for this page
 CATEGORY_NAME = "Code Execution"
 
 with gr.Blocks() as demo:
-    gr.Markdown(f"## {CATEGORY_NAME} 
+    gr.Markdown(f"## {CATEGORY_NAME} Aggregated")
+    validation_df, validation_tag_map = get_full_leaderboard_data("validation")
+    test_df, test_tag_map = get_full_leaderboard_data("test")
+    gr.Markdown(PLACEHOLDER_DESCRIPTION, elem_id="category-intro")
+    if validation_tag_map:
+        create_sub_navigation_bar(validation_tag_map, CATEGORY_NAME)
+
 
     # --- This page now has two main sections: Validation and Test ---
     with gr.Tabs():
```
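All three category pages in this commit (this file, data_analysis.py, and e2e.py below) share the same scaffold: fetch the full leaderboard for both splits, show a placeholder intro, and build a sub-navigation bar from the validation tag map. The real `create_sub_navigation_bar` lives in ui_components.py, whose diff is not shown here; a plausible sketch, assuming it renders a row of in-page anchor links styled by the `.sub-nav-bar` CSS rules added in content.py below:

```python
# Assumed shape of create_sub_navigation_bar (hypothetical; the real
# implementation is in ui_components.py, which this commit does not show).
import gradio as gr

def create_sub_navigation_bar(tag_map: dict[str, list[str]], category_name: str) -> gr.HTML:
    # One in-page anchor per tag; the .sub-nav-bar rules in content.py
    # control spacing, font size, and the hover transition.
    links = "".join(
        f'<a href="#{tag.lower().replace(" ", "-")}">{tag}</a>' for tag in tag_map
    )
    return gr.HTML(f'<nav class="sub-nav-bar">{links}</nav>')
```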
    	
content.py CHANGED

```diff
@@ -1,8 +1,5 @@
-TITLE = """<h1 align="
+TITLE = """<h1 align="left" id="space-title">AstaBench Leaderboard</h1>"""
 
-INTRODUCTION_TEXT = """
-## Introduction
-"""
 INTRO_PARAGRAPH = """
 AI agents are on the rise, promising everything from travel planning to scientific discovery. But evaluating them—especially for real-world research tasks—remains a messy, inconsistent process. Metrics vary, cost is often ignored, and scientific use cases are rarely the focus. <br>
 <br>
@@ -11,9 +8,14 @@ Enter AstaBench, a grand challenge benchmark developed by Ai2 to test how well a
 SCATTER_DISCLAIMER = """
 Only agents that have cost data available will be shown in the scatter plot. If you don't see your agent, please ensure that you have provided cost data in your submission.
 """
-
-
-
+PARETO_DISCLAIMER = """
+Agents names that are green are Pareto optimal, meaning they achieve the best performance for their cost.
+"""
+LIT_DESCRIPTION = """
+Several of the evaluations in AstaBench probe an AI model's literature understanding skills — that is, its ability to find research papers based on a description, review questions on citation quality, retrieve information from the literature, and so on.
+"""
+PLACEHOLDER_DESCRIPTION = """
+THIS IS PLACEHOLDER TEXT. AstaBench is a benchmark suite designed to evaluate AI agents on their ability to perform complex tasks that require reasoning, planning, and execution. It includes a variety of benchmarks that test different aspects of agent performance, such as literature understanding, data analysis, and code execution.
 """
 
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
@@ -66,17 +68,18 @@ def hf_uri_to_web_url(uri: str) -> str:
     return f"https://huggingface.co/datasets/{namespace}/{repo}/tree/main/{path}"
 
 css = """
-
-    
-    
-    border-color: #ec4899;
+#intro-paragraph {
+    font-size: 18px;
+    max-width: 60%;
 }
-
-    font-
-    
+#category-intro {
+    font-size: 18px;
+    max-width: 60%;
 }
 #logo-image { 
-    margin: 
+    margin: 0;
+    margin-bottom: 30px; 
+    justify-content: flex-start;
     max-width: 250px;       
     height: auto;           
 }
@@ -84,7 +87,6 @@ css = """
     height: auto !important;
     max-height: none !important;
 }
-
 .table-wrap {
     max-height: none !important;
     height: auto !important;
@@ -96,24 +98,55 @@ table.gr-table th, table.gr-table td {
     width: 1%;
     white-space: nowrap;
 }
-
+table.svelte-1e98i6s td {
+    vertical-align: top !important;
+}
 table.gr-table {
     font-size: 14px !important;
 }
-
-/* Example of making the "Agent" column (the 1st column) a bit wider if needed */
-table.gr-table th:nth-child(1),
-table.gr-table td:nth-child(1) {
-    min-width: 150px !important;
-    white-space: normal !important; /* Allow agent names to wrap if long */
-}
 .html-container {
     padding-top: 0 !important;
 }
 #scatter-disclaimer {
     color: #f0529c !important;
 }
+#pareto-disclaimer {
+    color: #f0529c !important;
+}
 thead.svelte-1e98i6s th {
     background: white !important;
 }
+.dark thead.svelte-1e98i6s th {
+    background: #091a1a !important;
+}
+.cell-wrap.svelte-v1pjjd {
+    font-family: 'Manrope';
+    }
+nav.svelte-ti537g.svelte-ti537g {
+    justify-content: flex-start;
+}
+#legend-markdown span {
+    margin-right: 15px !important; 
+}
+#leaderboard-accordion .label-wrap {
+    font-size: 1.4rem !important; 
+}
+.dark #leaderboard-accordion .label-wrap {
+    color: #0FCB8C !important; 
+}
+.dark block.svelte-1svsvh2 {
+    background: #032629 !important;
+}
+.sub-nav-bar {
+    margin-bottom: 20px; /* The space below the nav bar */
+}
+.sub-nav-bar a {
+    font-size: 16px;
+    border-radius: 5px;
+    transition: background-color 0.2s;
+    padding-right: 15px;
+}
+.padding.svelte-phx28p {
+    padding: 0 !important;
+}
 """
```
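The new `PARETO_DISCLAIMER` refers to Pareto optimality on the cost/score plane: an agent is Pareto optimal when no other agent achieves at least its score for strictly less cost, or a strictly better score for no more cost. A minimal sketch of that check over a leaderboard DataFrame; the column names are assumptions, not taken from this commit:

```python
import pandas as pd

def pareto_optimal_mask(df: pd.DataFrame, cost_col: str = "Overall Cost",
                        score_col: str = "Overall Score") -> pd.Series:
    """True for rows not dominated by any other row. Column names are
    illustrative defaults, not from this commit."""
    optimal = []
    for _, row in df.iterrows():
        # Dominated: someone costs no more and scores strictly higher,
        # or costs strictly less and scores at least as high.
        dominated = (
            ((df[cost_col] <= row[cost_col]) & (df[score_col] > row[score_col]))
            | ((df[cost_col] < row[cost_col]) & (df[score_col] >= row[score_col]))
        ).any()
        optimal.append(not dominated)
    return pd.Series(optimal, index=df.index)
```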
    	
data_analysis.py CHANGED

```diff
@@ -2,13 +2,18 @@ import gradio as gr
 import pandas as pd
 
 # Import our UI factories and the data loader
-from ui_components import create_leaderboard_display, create_benchmark_details_display, get_full_leaderboard_data
-
+from ui_components import create_leaderboard_display, create_benchmark_details_display, get_full_leaderboard_data, create_sub_navigation_bar
+from content import PLACEHOLDER_DESCRIPTION
 # Define the category for this page
 CATEGORY_NAME = "Data Analysis"
 
 with gr.Blocks() as demo:
-    gr.Markdown(f"## {CATEGORY_NAME} 
+    gr.Markdown(f"## {CATEGORY_NAME} Aggregated")
+    validation_df, validation_tag_map = get_full_leaderboard_data("validation")
+    test_df, test_tag_map = get_full_leaderboard_data("test")
+    gr.Markdown(PLACEHOLDER_DESCRIPTION, elem_id="category-intro")
+    if validation_tag_map:
+        create_sub_navigation_bar(validation_tag_map, CATEGORY_NAME)
 
     # --- This page now has two main sections: Validation and Test ---
     with gr.Tabs():
```
    	
e2e.py CHANGED

```diff
@@ -2,13 +2,18 @@ import gradio as gr
 import pandas as pd
 
 # Import our UI factories and the data loader
-from ui_components import create_leaderboard_display, create_benchmark_details_display, get_full_leaderboard_data
-
+from ui_components import create_leaderboard_display, create_benchmark_details_display, get_full_leaderboard_data, create_sub_navigation_bar
+from content import PLACEHOLDER_DESCRIPTION
 # Define the category for this page
 CATEGORY_NAME = "Discovery"
 
 with gr.Blocks() as demo:
-    gr.Markdown(f"## {CATEGORY_NAME} 
+    gr.Markdown(f"## {CATEGORY_NAME} Aggregated")
+    validation_df, validation_tag_map = get_full_leaderboard_data("validation")
+    test_df, test_tag_map = get_full_leaderboard_data("test")
+    gr.Markdown(PLACEHOLDER_DESCRIPTION, elem_id="category-intro")
+    if validation_tag_map:
+        create_sub_navigation_bar(validation_tag_map, CATEGORY_NAME)
 
     # --- This page now has two main sections: Validation and Test ---
     with gr.Tabs():
```
    	
json_leaderboard.py DELETED

```diff
@@ -1,485 +0,0 @@
-import logging
-from typing import Optional, Any, Dict # Added Dict
-from zoneinfo import ZoneInfo
-
-# datasets import might not be strictly needed by LeaderboardViewer itself anymore,
-# but _get_dataframe might still use types from it if EvalResult refers to them.
-# For now, let's keep it if your EvalResult or SuiteConfig models have dependencies.
-# If not, it can be removed from here.
-import datasets # Potentially removable from this file
-import matplotlib.pyplot as plt
-import plotly.express as px
-import plotly.graph_objects as go
-import numpy as np
-import pandas as pd
-import seaborn as sns
-import json # For loading the local JSON file
-import os # For checking file existence
-
-from agenteval import compute_summary_statistics
-from agenteval.config import SuiteConfig
-from agenteval.models import EvalResult
-
-logger = logging.getLogger(__name__)
-
-import logging
-from typing import Optional, Any, Dict, List # Added List
-from zoneinfo import ZoneInfo # Assuming this might be used by SuiteConfig/EvalResult or _get_dataframe
-import json
-import os
-
-# Assuming these are correctly imported from your project
-from agenteval.config import SuiteConfig
-from agenteval.models import EvalResult
-# from agenteval import compute_summary_statistics # Used by _get_dataframe
-
-
-class DataTransformer:
-    """
-    Load and visualize leaderboard from a single, local JSON result file.
-    """
-    _INFORMAL_TO_FORMAL_NAME_MAP = {
-        "lit": "Literature Understanding",
-        "data": "Data Analysis",
-        "code": "Code Execution",
-        "discovery": "Discovery",
-        "arxivdigestables_validation": "Arxivdigestables Validation",
-        "sqa_dev": "Sqa Dev",
-        "litqa2_validation": "Litqa2 Validation",
-        "paper_finder_validation": "Paper Finder Validation",
-        "discoverybench_validation": "Discoverybench Validation",
-        "core_bench_validation": "Core Bench Validation",
-        "ds1000_validation": "DS1000 Validation",
-        "e2e_discovery_validation": "E2E Discovery Validation",
-        "super_validation": "Super Validation",
-        # Add any other raw names that can appear in task.name or task.tags
-    }
-
-    def __init__(
-            self,
-            json_file_path: str, # Mandatory: path to the local JSON file
-            split: str,          # Still needed for context within the JSON's suite_config
-            is_internal: bool = False
-    ):
-        self._json_file_path = json_file_path
-        self._split = split
-        self._internal = is_internal
-        self._loaded_json_data: Optional[Dict[str, Any]] = None
-        self._cfg: Optional[SuiteConfig] = None
-
-        logger.info(f"Initializing LeaderboardViewer with local JSON file: {self._json_file_path}")
-
-        # --- Load and Validate JSON data ---
-        if not os.path.exists(self._json_file_path):
-            raise FileNotFoundError(f"JSON file not found at path: {self._json_file_path}")
-        try:
-            with open(self._json_file_path, 'r', encoding='utf-8') as f:
-                self._loaded_json_data = json.load(f)
-        except json.JSONDecodeError as e:
-            raise ValueError(f"Failed to parse JSON from local file {self._json_file_path}: {e}")
-        except Exception as e:
-            raise ValueError(f"Error reading local file {self._json_file_path}: {e}")
-
-        if not self._loaded_json_data:
-            raise ValueError(f"No data loaded from JSON file {self._json_file_path}.")
-
-        try:
-            eval_result = EvalResult.model_validate(self._loaded_json_data)
-        except Exception as e:
-            raise ValueError(f"Failed to validate JSON data from file '{self._json_file_path}' against EvalResult model: {e}")
-
-        self._cfg = eval_result.suite_config
-        if not isinstance(self._cfg, SuiteConfig):
-            raise TypeError(f"self._cfg is not a SuiteConfig object after loading from '{self._json_file_path}', got {type(self._cfg)}.")
-
-        # --- Populate Tag Map (Corrected Placement and Helper Function Access) ---
-        self.tag_map: dict[str, list[str]] = {}
-
-        # Access tasks from the loaded config
-        tasks_for_split: List[Any] = self._cfg.get_tasks(self._split) # Assuming get_tasks returns a list of task-like objects
-
-        for task in tasks_for_split:
-            # Ensure task object has 'name' and 'tags' attributes
-            if not hasattr(task, 'name') or not hasattr(task, 'tags'):
-                logger.warning(f"Task object {task} is missing 'name' or 'tags' attribute. Skipping.")
-                continue
-
-            formal_task_display_name = self._get_formal_display_name_static(task.name) # Use the helper method
-
-            if not (task.tags or []):
-                continue
-
-            for raw_tag_name in task.tags:
-                formal_tag_display_name_key = self._get_formal_display_name_static(raw_tag_name)
-
-                self.tag_map.setdefault(formal_tag_display_name_key, []).append(formal_task_display_name)
-
-        for key in self.tag_map:
-            self.tag_map[key] = sorted(list(set(self.tag_map[key])))
-
-    # --- Helper function defined as a static method or regular method ---
-    # Option 1: Static method (doesn't need 'self', uses the class attribute)
-    @staticmethod
-    def _get_formal_display_name_static(raw_name: str) -> str:
-        """
-        Helper function to get the formal display name for a raw tag or task name.
-        Uses the class's map and provides a fallback.
-        """
-        return DataTransformer._INFORMAL_TO_FORMAL_NAME_MAP.get(raw_name, raw_name.replace("_", " ").title())
-
-    def _load(self) -> tuple[pd.DataFrame, dict[str, list[str]]]:
-        """
-        Prepares the DataFrame from the loaded JSON data.
-        The JSON data is already loaded and validated in __init__.
-        """
-        if self._loaded_json_data is None or self._cfg is None:
-            # This should not happen if __init__ completed successfully
-            raise RuntimeError("LeaderboardViewer2 not properly initialized. JSON data or SuiteConfig is missing.")
-
-        # The _get_dataframe function expects a list of records.
-        # Since we have a single JSON file representing one result, wrap it in a list.
-        records_list: list[dict] = [self._loaded_json_data]
-
-        overview_df = _get_dataframe(
-            records_list=records_list,
-            split=self._split,
-            is_internal=self._internal,
-            suite_config=self._cfg, # Pass the SuiteConfig loaded in __init__
-        )
-        return overview_df, self.tag_map
-
-    # --- view method remains the same as your last version ---
-    def view(
-            self,
-            tag: Optional[str] = None,
-            with_plots: bool = False,
-            use_plotly: bool = False,
-    ) -> tuple[pd.DataFrame, dict[str, Any]]:
-        data, tag_map = self._load() # tag_map is also returned by _load now
-        print(f"AHAHASHJDBFGASJHDBJAHSDB,AHDB {tag_map}")
-        print(f"THIS IS THE DATA DATA DTAA {data.columns}")
-        if data.empty or (len(data) == 1 and data.iloc[0].get("Agent") == "No data"):
-            logger.warning("No data available to view. Returning empty DataFrame and plots.")
-            return data, {}
-
-        base_cols = ["Agent", "Submitter", "Date", "Logs"]
-        existing_cols = [col for col in base_cols if col in data.columns]
-
-        primary_score_col: str
-        group_metric_names: list[str]
-
-        if tag is None:
-            primary = "Overall"
-            group = list(tag_map.keys())
-        else:
-            primary = tag
-            group = tag_map.get(tag, [])
-
-        if f"{primary} Score" in data.columns:
-            data = data.sort_values(f"{primary} Score", ascending=False)
-        else:
-            logger.warning(f"Primary metric '{primary}' for sorting not found. Data will not be sorted by it.")
-
-        metrics_to_display = []
-        if f"{primary} Cost" in data.columns:
-            metrics_to_display.append(f"{primary} Cost")
-        if f"{primary} Score" in data.columns:
-            metrics_to_display.append(f"{primary} Score")
-
-        for g_item in group:
-            if g_item in data.columns:
-                metrics_to_display.append(g_item)
-            if f"{g_item} Cost" in data.columns:
-                metrics_to_display.append(f"{g_item} Cost")
-            if f"{g_item} Score" in data.columns:
-                metrics_to_display.append(f"{g_item} Score")
-
-
-        final_cols_to_display = existing_cols + [m for m in metrics_to_display if m in data.columns]
-        final_cols_to_display = sorted(list(set(final_cols_to_display)), key=final_cols_to_display.index)
-
-        df_view = data.loc[:, final_cols_to_display].reset_index(drop=True)
-
-        plots: dict[str, Any] = {}
-        if with_plots:
-            plot_metric_names = [primary] + [g_item for g_item in group if g_item in data.columns]
-            for metric_name in plot_metric_names:
-                score_col = f"{metric_name} Score"
-                cost_col = f"{metric_name} Cost"
-                if score_col in df_view.columns and cost_col in df_view.columns:
-                    if use_plotly:
-                        fig = _plot_scatter_plotly(df_view, x=cost_col, y=score_col, agent_col="Agent")
-                    plots[f"scatter_{metric_name}"] = fig
-                else:
-                    logger.warning(
-                        f"Skipping plot for '{metric_name}': score column '{score_col}' or cost column '{cost_col}' not found."
-                    )
-        return df_view, plots
-
-
-def _safe_round(value, digits=2):
-    return round(value, digits) if isinstance(value, (float, int)) and pd.notna(value) else value
-
-def _get_dataframe(
-        records_list: list[dict],
-        split: str,
-        is_internal: bool,
-        suite_config: SuiteConfig,
-        timezone: str = "US/Pacific",
-) -> pd.DataFrame:
-    # This function remains the same as in the previous version you provided.
-    # It takes a list of records (which will be a list containing one item
-    # from the loaded JSON file) and processes it.
-    if not records_list:
-        logger.warning(f"No records provided to _get_dataframe for split '{split}'. Returning empty DataFrame with
```
     | 
| 235 | 
         
            -
                    expected_pretty_cols = ["Agent Name", "Submitter", "Date", "Overall Score", "Logs"]
         
     | 
| 236 | 
         
            -
                    empty_df = pd.DataFrame({p_col: ["No data"] for p_col in expected_pretty_cols})
         
     | 
| 237 | 
         
            -
                    return empty_df
         
     | 
| 238 | 
         
            -
             
     | 
| 239 | 
         
            -
                cfg = suite_config
         
     | 
| 240 | 
         
            -
             
     | 
| 241 | 
         
            -
                rows = []
         
     | 
| 242 | 
         
            -
                for itm_idx, itm in enumerate(records_list):
         
     | 
| 243 | 
         
            -
                    if not isinstance(itm, dict):
         
     | 
| 244 | 
         
            -
                        logger.warning(f"Item {itm_idx} in records_list is not a dict, skipping.")
         
     | 
| 245 | 
         
            -
                        continue
         
     | 
| 246 | 
         
            -
                    try:
         
     | 
| 247 | 
         
            -
                        ev = EvalResult.model_validate(itm)
         
     | 
| 248 | 
         
            -
                    except Exception as e:
         
     | 
| 249 | 
         
            -
                        logger.error(f"Failed to validate item {itm_idx} with EvalResult: {itm}. Error: {e}")
         
     | 
| 250 | 
         
            -
                        continue
         
     | 
| 251 | 
         
            -
             
     | 
| 252 | 
         
            -
                    sub = ev.submission
         
     | 
| 253 | 
         
            -
                    date_str = None
         
     | 
| 254 | 
         
            -
                    if sub.submit_time is not None:
         
     | 
| 255 | 
         
            -
                        submit_dt = sub.submit_time
         
     | 
| 256 | 
         
            -
                        if not isinstance(submit_dt, pd.Timestamp):
         
     | 
| 257 | 
         
            -
                            if submit_dt.tzinfo is None:
         
     | 
| 258 | 
         
            -
                                logger.debug(f"Submission time for {sub.agent_name} is timezone-naive, assuming UTC.")
         
     | 
| 259 | 
         
            -
                                submit_dt = submit_dt.replace(tzinfo=ZoneInfo("UTC"))
         
     | 
| 260 | 
         
            -
                        date_str = pd.Timestamp(submit_dt).tz_convert(ZoneInfo(timezone)).strftime("%Y-%m-%d")
         
     | 
| 261 | 
         
            -
                    else:
         
     | 
| 262 | 
         
            -
                        date_str = None
         
     | 
| 263 | 
         
            -
             
     | 
| 264 | 
         
            -
                    if not ev.results:
         
     | 
| 265 | 
         
            -
                        logger.warning(
         
     | 
| 266 | 
         
            -
                            f"Skipping submission {sub.agent_name} ({sub.username or 'N/A'}) "
         
     | 
| 267 | 
         
            -
                            f"({sub.submit_time or 'N/A'}) due to no results."
         
     | 
| 268 | 
         
            -
                        )
         
     | 
| 269 | 
         
            -
                        continue
         
     | 
| 270 | 
         
            -
                    stats = compute_summary_statistics(
         
     | 
| 271 | 
         
            -
                        suite_config=cfg, split=split, results=ev.results
         
     | 
| 272 | 
         
            -
                    )
         
     | 
| 273 | 
         
            -
                    flat = {}
         
     | 
| 274 | 
         
            -
                    print(f"STATS STATS ASTATAS SD T S T A A {stats}")
         
     | 
| 275 | 
         
            -
                    for key, s_obj in stats.items():
         
     | 
| 276 | 
         
            -
                        parts = key.split("/")
         
     | 
| 277 | 
         
            -
                        if parts[0] == "overall":
         
     | 
| 278 | 
         
            -
                            flat["overall/score"] = _safe_round(getattr(s_obj, 'score', np.nan))
         
     | 
| 279 | 
         
            -
                            flat["overall/cost"] = _safe_round(getattr(s_obj, 'cost', np.nan))
         
     | 
| 280 | 
         
            -
                        elif parts[0] == "tag" and len(parts) > 1:
         
     | 
| 281 | 
         
            -
                            tag_name = parts[1]
         
     | 
| 282 | 
         
            -
                            flat[f"tag/{tag_name}/score"] = _safe_round(getattr(s_obj, 'score', np.nan))
         
     | 
| 283 | 
         
            -
                            flat[f"tag/{tag_name}/cost"] = _safe_round(getattr(s_obj, 'cost', np.nan))
         
     | 
| 284 | 
         
            -
                        elif parts[0] == "task" and len(parts) > 1:
         
     | 
| 285 | 
         
            -
                            task_name = parts[1]
         
     | 
| 286 | 
         
            -
                            score = getattr(s_obj, 'score', np.nan)
         
     | 
| 287 | 
         
            -
                            cost = getattr(s_obj, 'cost', np.nan)
         
     | 
| 288 | 
         
            -
                            score_stderr = getattr(s_obj, 'score_stderr', np.nan)
         
     | 
| 289 | 
         
            -
                            cost_stderr = getattr(s_obj, 'cost_stderr', np.nan)
         
     | 
| 290 | 
         
            -
             
     | 
| 291 | 
         
            -
                            flat[f"task/{task_name}/score"] = _safe_round(score)
         
     | 
| 292 | 
         
            -
                            flat[f"task/{task_name}/score_ci"] = _safe_round(score_stderr * 1.96 if pd.notna(score_stderr) else np.nan)
         
     | 
| 293 | 
         
            -
                            flat[f"task/{task_name}/cost"] = _safe_round(cost)
         
     | 
| 294 | 
         
            -
                            flat[f"task/{task_name}/cost_ci"] = _safe_round(cost_stderr * 1.96 if pd.notna(cost_stderr) else np.nan)
         
     | 
| 295 | 
         
            -
                        else:
         
     | 
| 296 | 
         
            -
                            logger.debug(f"Uncommon key structure from compute_summary_statistics: '{key}'. Attempting generic add.")
         
     | 
| 297 | 
         
            -
                            if hasattr(s_obj, 'score'):
         
     | 
| 298 | 
         
            -
                                flat[f"{key}/score"] = _safe_round(s_obj.score)
         
     | 
| 299 | 
         
            -
                            if hasattr(s_obj, 'cost'):
         
     | 
| 300 | 
         
            -
                                flat[f"{key}/cost"] = _safe_round(s_obj.cost)
         
     | 
| 301 | 
         
            -
             
     | 
| 302 | 
         
            -
                    current_logs_url = None
         
     | 
| 303 | 
         
            -
                    if is_internal and sub.logs_url:
         
     | 
| 304 | 
         
            -
                        current_logs_url = str(sub.logs_url)
         
     | 
| 305 | 
         
            -
                    elif not is_internal and sub.logs_url_public:
         
     | 
| 306 | 
         
            -
                        current_logs_url = str(sub.logs_url_public)
         
     | 
| 307 | 
         
            -
             
     | 
| 308 | 
         
            -
                    rows.append(
         
     | 
| 309 | 
         
            -
                        {
         
     | 
| 310 | 
         
            -
                            "agent_name": sub.agent_name or "N/A",
         
     | 
| 311 | 
         
            -
                            "username": sub.username or "N/A",
         
     | 
| 312 | 
         
            -
                            "submit_time": date_str,
         
     | 
| 313 | 
         
            -
                            **flat,
         
     | 
| 314 | 
         
            -
                            "logs_url": current_logs_url,
         
     | 
| 315 | 
         
            -
                        }
         
     | 
| 316 | 
         
            -
                    )
         
     | 
| 317 | 
         
            -
             
     | 
| 318 | 
         
            -
                if not rows:
         
     | 
| 319 | 
         
            -
                    logger.warning(f"No valid rows generated from records_list for split '{split}'. Returning empty DataFrame with placeholder.")
         
     | 
| 320 | 
         
            -
                    expected_pretty_cols = ["Agent", "Submitter", "Date", "Overall Score", "Overall Cost", "Logs"]
         
     | 
| 321 | 
         
            -
                    empty_df = pd.DataFrame({p_col: ["No data"] for p_col in expected_pretty_cols})
         
     | 
| 322 | 
         
            -
                    return empty_df
         
     | 
| 323 | 
         
            -
             
     | 
| 324 | 
         
            -
                df = pd.DataFrame(rows)
         
     | 
| 325 | 
         
            -
                pretty_cols = {c: _pretty_column_name(c) for c in df.columns if c in df.columns}
         
     | 
| 326 | 
         
            -
                overview = df.rename(columns=pretty_cols)
         
     | 
| 327 | 
         
            -
                return overview
         
     | 
| 328 | 
         
            -
             
     | 
| 329 | 
         
            -
            def _pretty_column_name(col: str) -> str:
         
     | 
| 330 | 
         
            -
                """Map raw column name to display name."""
         
     | 
| 331 | 
         
            -
                # --- Step 1: Fixed, direct mappings ---
         
     | 
| 332 | 
         
            -
                fixed_mappings = {
         
     | 
| 333 | 
         
            -
                    "submit_time": "Date",
         
     | 
| 334 | 
         
            -
                    "agent_name": "Agent",
         
     | 
| 335 | 
         
            -
                    "username": "Submitter",
         
     | 
| 336 | 
         
            -
                    "logs_url": "Logs",
         
     | 
| 337 | 
         
            -
                    "overall/score": "Overall Score",
         
     | 
| 338 | 
         
            -
                    "overall/cost": "Overall Cost",
         
     | 
| 339 | 
         
            -
                }
         
     | 
| 340 | 
         
            -
                if col in fixed_mappings:
         
     | 
| 341 | 
         
            -
                    return fixed_mappings[col]
         
     | 
| 342 | 
         
            -
             
     | 
| 343 | 
         
            -
                # --- Step 2: Define your mapping for informal names to descriptive names ---
         
     | 
| 344 | 
         
            -
                informal_map = DataTransformer._INFORMAL_TO_FORMAL_NAME_MAP
         
     | 
| 345 | 
         
            -
             
     | 
| 346 | 
         
            -
                # --- Step 3: Dynamic mappings for task or tag columns using the informal_to_formal_name_map ---
         
     | 
| 347 | 
         
            -
                parts = col.split("/")
         
     | 
| 348 | 
         
            -
                if len(parts) == 3:
         
     | 
| 349 | 
         
            -
                    item_type, informal_name, metric_suffix = parts #
         
     | 
| 350 | 
         
            -
             
     | 
| 351 | 
         
            -
                    formal_name = informal_map.get(informal_name)
         
     | 
| 352 | 
         
            -
                    if formal_name is None:
         
     | 
| 353 | 
         
            -
                        formal_name = informal_name.replace("_", " ").title()
         
     | 
| 354 | 
         
            -
                        print(f"[DEBUG _pretty_column_name] Informal name '{informal_name}' not in map, using fallback: '{formal_name}'")
         
     | 
| 355 | 
         
            -
             
     | 
| 356 | 
         
            -
                    if metric_suffix == "score":
         
     | 
| 357 | 
         
            -
                        return f"{formal_name} Score"
         
     | 
| 358 | 
         
            -
                    if metric_suffix == "cost":
         
     | 
| 359 | 
         
            -
                        return f"{formal_name} Cost"
         
     | 
| 360 | 
         
            -
                    if metric_suffix == "score_ci":
         
     | 
| 361 | 
         
            -
                        return f"{formal_name} Score 95% CI"
         
     | 
| 362 | 
         
            -
                    if metric_suffix == "cost_ci":
         
     | 
| 363 | 
         
            -
                        return f"{formal_name} Cost 95% CI"
         
     | 
| 364 | 
         
            -
             
     | 
| 365 | 
         
            -
                # --- Step 4: Fallback for columns that don't match the "type/name/metric" pattern ---
         
     | 
| 366 | 
         
            -
                if "/" not in col:
         
     | 
| 367 | 
         
            -
                    return col.replace("_", " ").title()
         
     | 
| 368 | 
         
            -
                else:
         
     | 
| 369 | 
         
            -
                    return parts[-1].replace("_", " ").title()
         
     | 
| 370 | 
         
            -
             
     | 
| 371 | 
         
            -
            DEFAULT_Y_COLUMN = "Overall Score"
         
     | 
| 372 | 
         
            -
            DUMMY_X_VALUE_FOR_MISSING_COSTS = 0 # Value to use if x-axis data (costs) is missing
         
     | 
| 373 | 
         
            -
             
     | 
| 374 | 
         
            -
            def _plot_scatter_plotly(
         
     | 
| 375 | 
         
            -
                    data: pd.DataFrame,
         
     | 
| 376 | 
         
            -
                    x: Optional[str],
         
     | 
| 377 | 
         
            -
                    y: str,
         
     | 
| 378 | 
         
            -
                    agent_col: str = "Agent"
         
     | 
| 379 | 
         
            -
            ) -> go.Figure:
         
     | 
| 380 | 
         
            -
             
     | 
| 381 | 
         
            -
                x_col_to_use = x
         
     | 
| 382 | 
         
            -
                y_col_to_use = y
         
     | 
| 383 | 
         
            -
             
     | 
| 384 | 
         
            -
                # 1. Check if y-column exists
         
     | 
| 385 | 
         
            -
                if y_col_to_use not in data.columns:
         
     | 
| 386 | 
         
            -
                    logger.error(
         
     | 
| 387 | 
         
            -
                        f"y-axis column '{y_col_to_use}' MUST exist in DataFrame. "
         
     | 
| 388 | 
         
            -
                        f"Cannot generate plot. Available columns: {data.columns.tolist()}"
         
     | 
| 389 | 
         
            -
                    )
         
     | 
| 390 | 
         
            -
                    return go.Figure()
         
     | 
| 391 | 
         
            -
             
     | 
| 392 | 
         
            -
                # 2. Check if agent_col exists
         
     | 
| 393 | 
         
            -
                if agent_col not in data.columns:
         
     | 
| 394 | 
         
            -
                    logger.warning(
         
     | 
| 395 | 
         
            -
                        f"Agent column '{agent_col}' not found in DataFrame. "
         
     | 
| 396 | 
         
            -
                        f"Available columns: {data.columns.tolist()}. Returning empty figure."
         
     | 
| 397 | 
         
            -
                    )
         
     | 
| 398 | 
         
            -
                    return go.Figure()
         
     | 
| 399 | 
         
            -
             
     | 
| 400 | 
         
            -
                # 3. Prepare data (make a copy, handle numeric conversion for y)
         
     | 
| 401 | 
         
            -
                data_plot = data.copy()
         
     | 
| 402 | 
         
            -
                try:
         
     | 
| 403 | 
         
            -
                    data_plot[y_col_to_use] = pd.to_numeric(data_plot[y_col_to_use], errors='coerce')
         
     | 
| 404 | 
         
            -
                except Exception as e:
         
     | 
| 405 | 
         
            -
                    logger.error(f"Error converting y-column '{y_col_to_use}' to numeric: {e}. Returning empty figure.")
         
     | 
| 406 | 
         
            -
                    return go.Figure()
         
     | 
| 407 | 
         
            -
             
     | 
| 408 | 
         
            -
                # 4. Handle x-column (costs)
         
     | 
| 409 | 
         
            -
                x_axis_label = x_col_to_use if x_col_to_use else "Cost (Data N/A)" # Label for the x-axis
         
     | 
| 410 | 
         
            -
                x_data_is_valid = False
         
     | 
| 411 | 
         
            -
             
     | 
| 412 | 
         
            -
                if x_col_to_use and x_col_to_use in data_plot.columns:
         
     | 
| 413 | 
         
            -
                    try:
         
     | 
| 414 | 
         
            -
                        data_plot[x_col_to_use] = pd.to_numeric(data_plot[x_col_to_use], errors='coerce')
         
     | 
| 415 | 
         
            -
                        # Check if there's any non-NaN data after coercion for x
         
     | 
| 416 | 
         
            -
                        if data_plot[x_col_to_use].notna().any():
         
     | 
| 417 | 
         
            -
                            x_data_is_valid = True
         
     | 
| 418 | 
         
            -
                        else:
         
     | 
| 419 | 
         
            -
                            logger.info(f"x-axis column '{x_col_to_use}' exists but contains all NaN/None values after numeric conversion.")
         
     | 
| 420 | 
         
            -
                    except Exception as e:
         
     | 
| 421 | 
         
            -
                        logger.warning(f"Error converting x-column '{x_col_to_use}' to numeric: {e}. Will use dummy x-values.")
         
     | 
| 422 | 
         
            -
                        # x_data_is_valid remains False
         
     | 
| 423 | 
         
            -
                else:
         
     | 
| 424 | 
         
            -
                    if x_col_to_use: # Name was provided but column doesn't exist
         
     | 
| 425 | 
         
            -
                        logger.warning(f"x-axis column '{x_col_to_use}' not found in DataFrame.")
         
     | 
| 426 | 
         
            -
                    else: # x (column name) was None
         
     | 
| 427 | 
         
            -
                        logger.info("x-axis column name was not provided (is None).")
         
     | 
| 428 | 
         
            -
             
     | 
| 429 | 
         
            -
                if not x_data_is_valid:
         
     | 
| 430 | 
         
            -
                    logger.info(f"Using dummy x-value '{DUMMY_X_VALUE_FOR_MISSING_COSTS}' for all data points as x-data is missing or invalid.")
         
     | 
| 431 | 
         
            -
                    # Create a new column with the dummy x-value for all rows
         
     | 
| 432 | 
         
            -
                    # Use a unique name for this dummy column to avoid potential clashes
         
     | 
| 433 | 
         
            -
                    dummy_x_col_name = "__dummy_x_for_plotting__"
         
     | 
| 434 | 
         
            -
                    data_plot[dummy_x_col_name] = DUMMY_X_VALUE_FOR_MISSING_COSTS
         
     | 
| 435 | 
         
            -
                    x_col_to_use = dummy_x_col_name # Update x_col_to_use to point to our dummy data
         
     | 
| 436 | 
         
            -
                    x_axis_label = x if x else "Cost (Data N/A)" # Use original x name for label if provided
         
     | 
| 437 | 
         
            -
                    # or a generic label if x was None.
         
     | 
| 438 | 
         
            -
                    # Could also be f"Cost (Fixed at {DUMMY_X_VALUE_FOR_MISSING_COSTS})"
         
     | 
| 439 | 
         
            -
             
     | 
| 440 | 
         
            -
             
     | 
| 441 | 
         
            -
                # 5. Drop rows where y is NaN (x is now guaranteed to have values, either real or dummy)
         
     | 
| 442 | 
         
            -
                data_plot.dropna(subset=[y_col_to_use], inplace=True)
         
     | 
| 443 | 
         
            -
             
     | 
| 444 | 
         
            -
                fig = go.Figure()
         
     | 
| 445 | 
         
            -
             
     | 
| 446 | 
         
            -
                if data_plot.empty:
         
     | 
| 447 | 
         
            -
                    logger.warning(f"No valid data to plot for y='{y_col_to_use}' (and x='{x_col_to_use}') after cleaning NaNs from y.")
         
     | 
| 448 | 
         
            -
                    # Still return a figure object, but it will be empty. Update layout for clarity.
         
     | 
| 449 | 
         
            -
                    fig.update_layout(
         
     | 
| 450 | 
         
            -
                        title=f"{y_col_to_use} vs. {x_axis_label} (No Data)",
         
     | 
| 451 | 
         
            -
                        xaxis=dict(title=x_axis_label, range=[DUMMY_X_VALUE_FOR_MISSING_COSTS - 1, DUMMY_X_VALUE_FOR_MISSING_COSTS + 1] if not x_data_is_valid else None),
         
     | 
| 452 | 
         
            -
                        yaxis=dict(title=y_col_to_use)
         
     | 
| 453 | 
         
            -
                    )
         
     | 
| 454 | 
         
            -
                    return fig
         
     | 
| 455 | 
         
            -
             
     | 
| 456 | 
         
            -
             
     | 
| 457 | 
         
            -
                for agent, group in data_plot.groupby(agent_col):
         
     | 
| 458 | 
         
            -
                    hover_x_display = "%{x:.2f}" if x_data_is_valid else str(DUMMY_X_VALUE_FOR_MISSING_COSTS) + " (fixed)"
         
     | 
| 459 | 
         
            -
                    fig.add_trace(go.Scatter(
         
     | 
| 460 | 
         
            -
                        x=group[x_col_to_use],
         
     | 
| 461 | 
         
            -
                        y=group[y_col_to_use],
         
     | 
| 462 | 
         
            -
                        mode='markers',
         
     | 
| 463 | 
         
            -
                        name=str(agent),
         
     | 
| 464 | 
         
            -
                        hovertemplate=f"{x_axis_label}: {hover_x_display}<br>{y_col_to_use}: %{{y:.2f}}<extra>{str(agent)}</extra>",
         
     | 
| 465 | 
         
            -
                        marker=dict(size=10)
         
     | 
| 466 | 
         
            -
                    ))
         
     | 
| 467 | 
         
            -
             
     | 
| 468 | 
         
            -
                # Configure layout
         
     | 
| 469 | 
         
            -
                xaxis_config = dict(title=x_axis_label)
         
     | 
| 470 | 
         
            -
                if not x_data_is_valid: # If using dummy x, set a tighter, fixed range for x-axis
         
     | 
| 471 | 
         
            -
                    xaxis_config['range'] = [DUMMY_X_VALUE_FOR_MISSING_COSTS - 1, DUMMY_X_VALUE_FOR_MISSING_COSTS + 1]
         
     | 
| 472 | 
         
            -
                    xaxis_config['tickvals'] = [DUMMY_X_VALUE_FOR_MISSING_COSTS] # Show only one tick at the dummy value
         
     | 
| 473 | 
         
            -
                    xaxis_config['ticktext'] = [str(DUMMY_X_VALUE_FOR_MISSING_COSTS)]
         
     | 
| 474 | 
         
            -
                else: # Real x-data
         
     | 
| 475 | 
         
            -
                    xaxis_config['rangemode'] = "tozero"
         
     | 
| 476 | 
         
            -
             
     | 
| 477 | 
         
            -
             
     | 
| 478 | 
         
            -
                fig.update_layout(
         
     | 
| 479 | 
         
            -
                    title=f"{y_col_to_use} vs. {x_axis_label}",
         
     | 
| 480 | 
         
            -
                    xaxis=xaxis_config,
         
     | 
| 481 | 
         
            -
                    yaxis=dict(title=y_col_to_use, rangemode="tozero"),
         
     | 
| 482 | 
         
            -
                    legend_title_text=agent_col
         
     | 
| 483 | 
         
            -
                )
         
     | 
| 484 | 
         
            -
             
     | 
| 485 | 
         
            -
                return fig
         
     | 
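The removed `_get_dataframe` above flattens per-task statistics into `task/<name>/score`-style keys and widens each standard error into a 95% confidence interval by multiplying by 1.96 (the two-sided z-value under a normal approximation). A minimal, self-contained sketch of that flattening, with made-up stat keys and plain dicts standing in for the real summary-statistics objects:

    import pandas as pd

    def flatten_stats(stats: dict) -> dict:
        # Mirrors the removed flattening loop; values are illustrative only.
        flat = {}
        for key, s in stats.items():
            parts = key.split("/")
            if parts[0] == "overall":
                flat["overall/score"] = round(s["score"], 2)
            elif parts[0] == "task" and len(parts) > 1:
                stderr = s.get("score_stderr")
                flat[f"task/{parts[1]}/score"] = round(s["score"], 2)
                # 95% CI half-width = 1.96 standard errors (normal approximation).
                flat[f"task/{parts[1]}/score_ci"] = round(stderr * 1.96, 2) if stderr is not None else None
        return flat

    row = flatten_stats({
        "overall": {"score": 0.4321},
        "task/lit_qa": {"score": 0.512, "score_stderr": 0.03},  # hypothetical task key
    })
    print(pd.DataFrame([row]))
    # One row: overall/score 0.43, task/lit_qa/score 0.51,
    # task/lit_qa/score_ci 0.06 (i.e. 1.96 * 0.03, rounded)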
        leaderboard_transformer.py
    CHANGED
    
@@ -2,10 +2,8 @@ import plotly.graph_objects as go
 import numpy as np
 import pandas as pd
 import logging
-from typing import Optional 
-
-import json
-import os
+from typing import Optional
+import base64
 
 logger = logging.getLogger(__name__)
 
@@ -46,14 +44,18 @@ def _pretty_column_name(raw_col: str) -> str:
     """
     # Case 1: Handle fixed, special-case mappings first.
     fixed_mappings = {
+        'id': 'id',
         'Agent': 'Agent',
         'Agent description': 'Agent Description',
         'User/organization': 'Submitter',
         'Submission date': 'Date',
         'Overall': 'Overall Score',
         'Overall cost': 'Overall Cost',
-        'Logs': 'Logs'
    [… 4 more added lines not captured in this snapshot …]
     }
     if raw_col in fixed_mappings:
         return fixed_mappings[raw_col]
 
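The hunk above shows the two-stage lookup in `_pretty_column_name`: an exact match against `fixed_mappings` wins first, and only unmatched names fall through to the dynamic branch. The fallback itself is outside this hunk, so the title-casing shown below is an assumption modeled on the deleted module's version of the same function:

    # Standalone sketch of the lookup order (not the module itself).
    FIXED = {
        'id': 'id',
        'Overall': 'Overall Score',
        'Overall cost': 'Overall Cost',
    }

    def pretty(raw: str) -> str:
        if raw in FIXED:                       # Case 1: fixed, special-case names
            return FIXED[raw]
        return raw.replace('_', ' ').title()   # assumed fallback: generic title-casing

    print(pretty('Overall cost'))  # -> Overall Cost
    print(pretty('lit_search'))    # -> Lit Search  (hypothetical raw column name)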
@@ -146,7 +148,6 @@ class DataTransformer:
     def __init__(self, dataframe: pd.DataFrame, tag_map: dict[str, list[str]]):
         """
         Initializes the viewer.
-
         Args:
             dataframe (pd.DataFrame): The presentation-ready leaderboard data.
             tag_map (dict): A map of formal tag names to formal task names.
@@ -188,29 +189,53 @@ class DataTransformer:
         if primary_score_col in self.data.columns:
             df_sorted = self.data.sort_values(primary_score_col, ascending=False, na_position='last')
 
-
-
-
-
    [… added lines not captured in this snapshot …]
 
-
-
    [… added lines not captured …]
 
-        # 
    [… added lines not captured …]
         for item in group_metrics:
             metrics_to_display.append(f"{item} Score")
             metrics_to_display.append(f"{item} Cost")
 
-
-        final_cols_ordered = base_cols + list(dict.fromkeys(metrics_to_display))+ new_cols + ending_cols
 
-        # Filter to only include columns that actually exist in our DataFrame
-        df_view = df_sorted.copy()
         for col in final_cols_ordered:
             if col not in df_view.columns:
                 df_view[col] = pd.NA
 
    [… added line not captured …]
         df_view = df_view[final_cols_ordered].reset_index(drop=True)
    [… added line not captured …]
 
         # Calculated and add "Categories Attempted" column
         if primary_metric == "Overall":
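The removed `final_cols_ordered` assignment relies on `list(dict.fromkeys(...))`, an order-preserving de-duplication: unlike a `set`, it keeps the first occurrence of each column name in its original position, which matters because the result defines the displayed column order. A quick illustration with hypothetical column names:

    metrics = ["Overall Score", "Overall Cost", "Lit QA Score", "Overall Score"]

    print(list(dict.fromkeys(metrics)))
    # ['Overall Score', 'Overall Cost', 'Lit QA Score']  -- first-seen order kept

    print(sorted(set(metrics)))
    # ['Lit QA Score', 'Overall Cost', 'Overall Score'] -- a plain set loses the ordering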
@@ -220,29 +245,28 @@ class DataTransformer:
 
                 # Return the formatted string with the correct emoji
                 if count == 4:
-                    return f"4/4 
                 if count == 0:
-                    return f"0/4 
-                return f"{count}/4 
 
             # Apply the function row-wise to create the new column
             attempted_column = df_view.apply(calculate_attempted, axis=1)
             # Insert the new column at a nice position (e.g., after "Date")
-            df_view.insert(
         else:
             total_benchmarks = len(group_metrics)
             def calculate_benchmarks_attempted(row):
                 # Count how many benchmarks in this category have COST data reported
                 count = sum(1 for benchmark in group_metrics if pd.notna(row.get(f"{benchmark} Cost")))
                 if count == total_benchmarks:
-                    return f"{count}/{total_benchmarks}  
                 elif count == 0:
-                    return f"{count}/{total_benchmarks}  
                 else:
-                    return f"{count}/{total_benchmarks} 
             # Insert the new column, for example, after "Date"
-            df_view.insert(
-
    [… added replacement lines not captured …]
 
         # --- 4. Generate the Scatter Plot for the Primary Metric ---
         plots: dict[str, go.Figure] = {}
     | 
|
| 254 | 
         
             
                                data=df_view,
         
     | 
| 255 | 
         
             
                                x=primary_cost_col,
         
     | 
| 256 | 
         
             
                                y=primary_score_col,
         
     | 
| 257 | 
         
            -
                                agent_col=" 
     | 
| 258 | 
         
             
                            )
         
     | 
| 259 | 
         
             
                            # Use a consistent key for easy retrieval later
         
     | 
| 260 | 
         
             
                            plots['scatter_plot'] = fig
         
     | 
| 
         @@ -274,24 +298,37 @@ def _plot_scatter_plotly( 
     | 
|
| 274 | 
         
             
                    data: pd.DataFrame,
         
     | 
| 275 | 
         
             
                    x: Optional[str],
         
     | 
| 276 | 
         
             
                    y: str,
         
     | 
| 277 | 
         
            -
                    agent_col: str =  
     | 
| 278 | 
         
             
            ) -> go.Figure:
         
     | 
| 279 | 
         | 
| 280 | 
         
            -
                # ---  
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 281 | 
         
             
                x_col_to_use = x
         
     | 
| 282 | 
         
             
                y_col_to_use = y
         
     | 
| 283 | 
         | 
| 284 | 
         
            -
                 
     | 
| 285 | 
         
            -
             
     | 
| 286 | 
         
            -
                     
     | 
| 287 | 
         
            -
                if agent_col not in data.columns:
         
     | 
| 288 | 
         
            -
                    logger.warning(f"Agent column '{agent_col}' not found.")
         
     | 
| 289 | 
         
             
                    return go.Figure()
         
     | 
| 290 | 
         | 
| 291 | 
         
             
                data_plot = data.copy()
         
     | 
| 292 | 
         
             
                data_plot[y_col_to_use] = pd.to_numeric(data_plot[y_col_to_use], errors='coerce')
         
     | 
| 293 | 
         | 
| 294 | 
         
            -
                x_axis_label = x if x else "Cost (Data N/A)"
         
     | 
| 295 | 
         
             
                x_data_is_valid = False
         
     | 
| 296 | 
         
             
                if x and x in data_plot.columns:
         
     | 
| 297 | 
         
             
                    try:
         
     | 
| 
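The validity check kept here follows a common pandas pattern: coerce the column with `pd.to_numeric(..., errors='coerce')` so anything non-numeric becomes NaN, then treat the axis as usable only if at least one value survives. A short demonstration:

    import pandas as pd

    costs = pd.Series(["1.50", "oops", None])        # mixed, partly unusable input
    numeric = pd.to_numeric(costs, errors="coerce")  # "oops" and None become NaN
    print(numeric.tolist())                          # [1.5, nan, nan]
    print(bool(numeric.notna().any()))               # True: at least one real cost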
@@ -307,30 +344,27 @@ def _plot_scatter_plotly(
         x_col_to_use = dummy_x_col_name
         logger.info("Using dummy x-values for plotting.")
 
-    # 
-    data_plot.dropna(subset=[y_col_to_use, x_col_to_use], inplace=True)
    [… added lines not captured …]
     fig = go.Figure()
     if data_plot.empty:
-        logger.warning(f"No valid data to plot 
         return fig
 
-    # 
     if x_data_is_valid:
-        # Sort by cost (ascending), then by score (descending) to break ties
         sorted_data = data_plot.sort_values(by=[x_col_to_use, y_col_to_use], ascending=[True, False])
-
         frontier_points = []
         max_score_so_far = float('-inf')
 
-        for 
             score = row[y_col_to_use]
-
-            # it's part of the frontier.
-            if score > max_score_so_far:
                 frontier_points.append({'x': row[x_col_to_use], 'y': score})
                 max_score_so_far = score
 
-        # Add the frontier line trace to the plot if we found any points
         if frontier_points:
             frontier_df = pd.DataFrame(frontier_points)
             fig.add_trace(go.Scatter(
@@ -339,22 +373,67 @@ def _plot_scatter_plotly(
             mode='lines',
             name='Efficiency Frontier',
             line=dict(color='firebrick', width=2, dash='dash'),
-            hoverinfo='skip'
         ))
 
-    # --- 
-
-
    [… added lines not captured in this snapshot …]
         fig.add_trace(go.Scatter(
             x=group[x_col_to_use],
             y=group[y_col_to_use],
             mode='markers',
-            name=
-
-
    [… added lines not captured …]
         ))
 
-    # --- 
     xaxis_config = dict(title=x_axis_label)
     if not x_data_is_valid:
         xaxis_config['range'] = [DUMMY_X_VALUE_FOR_MISSING_COSTS - 1, DUMMY_X_VALUE_FOR_MISSING_COSTS + 1]
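The frontier scan kept in the two hunks above is a single-pass Pareto computation: after sorting by cost ascending (with score descending to break ties), a point joins the efficiency frontier only if its score beats every cheaper point seen so far. A self-contained sketch with made-up cost/score pairs:

    import pandas as pd

    points = pd.DataFrame({
        "cost": [1.0, 1.0, 2.0, 3.0, 4.0],
        "score": [0.3, 0.5, 0.4, 0.7, 0.6],
    })
    # Sort by cost ascending, score descending to break ties at equal cost.
    ordered = points.sort_values(by=["cost", "score"], ascending=[True, False])

    frontier, best = [], float("-inf")
    for _, row in ordered.iterrows():
        if row["score"] > best:  # strictly better than anything cheaper
            frontier.append((float(row["cost"]), float(row["score"])))
            best = row["score"]

    print(frontier)  # [(1.0, 0.5), (3.0, 0.7)]

The point at cost 4.0 is dominated (score 0.6 < 0.7 at cost 3.0), so it never enters the frontier, which is exactly why the dashed line in the plot only ever moves up and to the right.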
@@ -362,15 +441,32 @@ def _plot_scatter_plotly(
     else:
         xaxis_config['rangemode'] = "tozero"
 
    [… added lines not captured …]
     fig.update_layout(
    [… added line not captured …]
         title=f"{y_col_to_use} vs. {x_axis_label}",
         xaxis=xaxis_config,
         yaxis=dict(title=y_col_to_use, rangemode="tozero"),
-        
    [… added lines not captured …]
     )
 
     return fig
 
    [… added line not captured …]
 def format_cost_column(df: pd.DataFrame, cost_col_name: str) -> pd.DataFrame:
     """
     Applies custom formatting to a cost column based on its corresponding score column.
         @@ -398,7 +494,7 @@ def format_cost_column(df: pd.DataFrame, cost_col_name: str) -> pd.DataFrame: 
     | 
|
| 398 | 
         
             
                    if pd.notna(cost_value) and isinstance(cost_value, (int, float)):
         
     | 
| 399 | 
         
             
                        return f"${cost_value:.2f}"
         
     | 
| 400 | 
         
             
                    elif pd.notna(score_value):
         
     | 
| 401 | 
         
            -
                        return f'<span style="color: {status_color};">Missing 
     | 
| 402 | 
         
             
                    else:
         
     | 
| 403 | 
         
             
                        return f'<span style="color: {status_color};">Not Attempted</span>'  # Neither score nor cost exists
         
     | 
| 404 | 
         | 
| 
         @@ -434,3 +530,43 @@ def format_score_column(df: pd.DataFrame, score_col_name: str) -> pd.DataFrame: 
     | 
|
| 434 | 
         
             
                # Apply the formatting and return the updated DataFrame
         
     | 
| 435 | 
         
             
                return df.assign(**{score_col_name: df[score_col_name].apply(apply_formatting)})
         
     | 
| 436 | 
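The two hunks above pin down the three states a cost cell can render to. A standalone sketch of that behavior (toy values; `status_color` is assumed here, while in the real function it comes from the enclosing scope):

    import pandas as pd

    status_color = "#9a3412"  # assumed for the sketch

    def fmt(cost_value, score_value):
        if pd.notna(cost_value) and isinstance(cost_value, (int, float)):
            return f"${cost_value:.2f}"
        elif pd.notna(score_value):
            return f'<span style="color: {status_color};">Missing</span>'
        return f'<span style="color: {status_color};">Not Attempted</span>'

    print(fmt(1.234, 0.62))  # $1.23
    print(fmt(None, 0.62))   # styled "Missing"
    print(fmt(None, None))   # styled "Not Attempted"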

import numpy as np
import pandas as pd
import logging
+from typing import Optional
+import base64

logger = logging.getLogger(__name__)

    """
    # Case 1: Handle fixed, special-case mappings first.
    fixed_mappings = {
+       'id': 'id',
        'Agent': 'Agent',
        'Agent description': 'Agent Description',
        'User/organization': 'Submitter',
        'Submission date': 'Date',
        'Overall': 'Overall Score',
        'Overall cost': 'Overall Cost',
+       'Logs': 'Logs',
+       'Openness': 'Openness',
+       'Agent tooling': 'Agent Tooling',
    }
+
    if raw_col in fixed_mappings:
        return fixed_mappings[raw_col]

    def __init__(self, dataframe: pd.DataFrame, tag_map: dict[str, list[str]]):
        """
        Initializes the viewer.

        Args:
            dataframe (pd.DataFrame): The presentation-ready leaderboard data.
            tag_map (dict): A map of formal tag names to formal task names.

        if primary_score_col in self.data.columns:
            df_sorted = self.data.sort_values(primary_score_col, ascending=False, na_position='last')

+       df_view = df_sorted.copy()
+       # Preserve the plain agent name for the scatter-plot hover text
+       df_view['agent_for_hover'] = df_view['Agent']
+       # 3. Combine "Agent" and "Submitter" into a single HTML-formatted column.
+       #    We do this *before* defining the final column list.
+       if 'Agent' in df_view.columns and 'Submitter' in df_view.columns:
+
+           def combine_agent_submitter(row):
+               agent = row['Agent']
+               submitter = row['Submitter']
+
+               # Check if submitter exists and is not empty
+               if pd.notna(submitter) and submitter.strip() != '':
+                   # Create a two-line HTML string with styled submitter text
+                   return (
+                       f"<div>{agent}<br>"
+                       f"<span style='font-size: 0.9em; color: #667876;'>{submitter}</span>"
+                       f"</div>"
+                   )
+               else:
+                   # If no submitter, just return the agent name
+                   return agent

+           # Apply the function to create the new combined 'Agent' column
+           df_view['Agent'] = df_view.apply(combine_agent_submitter, axis=1)
+           # The 'Submitter' column is no longer needed
+           df_view = df_view.drop(columns=['Submitter'])
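To see what actually lands in the combined table cell, the same transformation can be run on a toy frame (standalone sketch; the nested helper is inlined, and the agent/submitter values are made up):

    import pandas as pd

    toy = pd.DataFrame({"Agent": ["agent-x"], "Submitter": ["Example Org"]})

    def combine(row):
        if pd.notna(row["Submitter"]) and row["Submitter"].strip():
            return (f"<div>{row['Agent']}<br>"
                    f"<span style='font-size: 0.9em; color: #667876;'>{row['Submitter']}</span></div>")
        return row["Agent"]

    print(toy.apply(combine, axis=1)[0])
    # <div>agent-x<br><span style='font-size: 0.9em; color: #667876;'>Example Org</span></div>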
         
            +
                    # 4. Build the List of Columns to Display (now simplified)
         
     | 
| 221 | 
         
            +
                    base_cols = ["id","Agent","agent_for_hover"]
         
     | 
| 222 | 
         
            +
                    new_cols = ["Openness", "Agent Tooling"]
         
     | 
| 223 | 
         
            +
                    ending_cols = ["Logs"]
         
     | 
| 224 | 
         
            +
             
     | 
| 225 | 
         
            +
                    metrics_to_display = [primary_score_col, f"{primary_metric} Cost"]
         
     | 
| 226 | 
         
             
                    for item in group_metrics:
         
     | 
| 227 | 
         
             
                        metrics_to_display.append(f"{item} Score")
         
     | 
| 228 | 
         
             
                        metrics_to_display.append(f"{item} Cost")
         
     | 
| 229 | 
         | 
| 230 | 
         
            +
                    final_cols_ordered = new_cols + base_cols +  list(dict.fromkeys(metrics_to_display)) + ending_cols
         
     | 
| 
         | 
|
| 231 | 
         | 
| 
         | 
|
| 
         | 
|
| 232 | 
         
             
                    for col in final_cols_ordered:
         
     | 
| 233 | 
         
             
                        if col not in df_view.columns:
         
     | 
| 234 | 
         
             
                            df_view[col] = pd.NA
         
     | 
| 235 | 
         | 
| 236 | 
         
            +
                    # The final selection will now use the new column structure
         
     | 
| 237 | 
         
             
                    df_view = df_view[final_cols_ordered].reset_index(drop=True)
         
     | 
| 238 | 
         
            +
                    cols = len(final_cols_ordered)
         
     | 
| 239 | 
         | 
| 240 | 
         
             
                    # Calculated and add "Categories Attempted" column
         
     | 
| 241 | 
         
             
                    if primary_metric == "Overall":
         
     | 
| 
         | 
|
| 245 | 
         | 
| 246 | 
         
             
                            # Return the formatted string with the correct emoji
         
     | 
| 247 | 
         
             
                            if count == 4:
         
     | 
| 248 | 
         
            +
                                return f"4/4"
         
     | 
| 249 | 
         
             
                            if count == 0:
         
     | 
| 250 | 
         
            +
                                return f"0/4"
         
     | 
| 251 | 
         
            +
                            return f"{count}/4"
         
     | 
| 252 | 
         | 
| 253 | 
         
             
                        # Apply the function row-wise to create the new column
         
     | 
| 254 | 
         
             
                        attempted_column = df_view.apply(calculate_attempted, axis=1)
         
     | 
| 255 | 
         
             
                        # Insert the new column at a nice position (e.g., after "Date")
         
     | 
| 256 | 
         
            +
                        df_view.insert((cols - 1), "Categories Attempted", attempted_column)
         
     | 
| 257 | 
         
             
                    else:
         
     | 
| 258 | 
         
             
                        total_benchmarks = len(group_metrics)
         
     | 
| 259 | 
         
             
                        def calculate_benchmarks_attempted(row):
         
     | 
| 260 | 
         
             
                            # Count how many benchmarks in this category have COST data reported
         
     | 
| 261 | 
         
             
                            count = sum(1 for benchmark in group_metrics if pd.notna(row.get(f"{benchmark} Cost")))
         
     | 
| 262 | 
         
             
                            if count == total_benchmarks:
         
     | 
| 263 | 
         
            +
                                return f"{count}/{total_benchmarks} "
         
     | 
| 264 | 
         
             
                            elif count == 0:
         
     | 
| 265 | 
         
            +
                                return f"{count}/{total_benchmarks} "
         
     | 
| 266 | 
         
             
                            else:
         
     | 
| 267 | 
         
            +
                                return f"{count}/{total_benchmarks}"
         
     | 
| 268 | 
         
             
                        # Insert the new column, for example, after "Date"
         
     | 
| 269 | 
         
            +
                        df_view.insert((cols - 1), "Benchmarks Attempted", df_view.apply(calculate_benchmarks_attempted, axis=1))
         
     | 
| 
         | 
|
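The counting convention in both branches above is the same: a category or benchmark counts as attempted when its cost cell is non-null. That test in isolation (toy row; the benchmark names are hypothetical):

    import pandas as pd

    row = pd.Series({"BenchA Cost": 0.42, "BenchB Cost": None})
    group_metrics = ["BenchA", "BenchB"]
    count = sum(1 for b in group_metrics if pd.notna(row.get(f"{b} Cost")))
    print(f"{count}/{len(group_metrics)}")  # -> 1/2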

        # --- 4. Generate the Scatter Plot for the Primary Metric ---
        plots: dict[str, go.Figure] = {}
        ...
            fig = _plot_scatter_plotly(
                data=df_view,
                x=primary_cost_col,
                y=primary_score_col,
+               agent_col="agent_for_hover"
            )
            # Use a consistent key for easy retrieval later
            plots['scatter_plot'] = fig

def _plot_scatter_plotly(
        data: pd.DataFrame,
        x: Optional[str],
        y: str,
+       agent_col: str = 'agent_for_hover'
) -> go.Figure:

+   # --- Section 1: Define Mappings ---
+   color_map = {
+       "Closed": "red",
+       "API Available": "orange",
+       "Open Source": "green",
+       "Open Source + Open Weights": "blue"
+   }
+   category_order = list(color_map.keys())
+
+   shape_map = {
+       "Standard": "star",
+       "Custom with Standard Search": "diamond",
+       "Fully Custom": "circle"
+   }
+   default_shape = 'square'
+
    x_col_to_use = x
    y_col_to_use = y

+   required_cols = [y_col_to_use, agent_col, "Openness", "Agent Tooling"]
+   if not all(col in data.columns for col in required_cols):
+       logger.error(f"Missing one or more required columns for plotting: {required_cols}")
        return go.Figure()

    data_plot = data.copy()
    data_plot[y_col_to_use] = pd.to_numeric(data_plot[y_col_to_use], errors='coerce')

+   x_axis_label = f"{x} (USD)" if x else "Cost (Data N/A)"
    x_data_is_valid = False
    if x and x in data_plot.columns:
        try:
            ...
        x_col_to_use = dummy_x_col_name
        logger.info("Using dummy x-values for plotting.")

+   # Clean data based on all necessary columns
+   data_plot.dropna(subset=[y_col_to_use, x_col_to_use, "Openness", "Agent Tooling"], inplace=True)
+
+   # --- Section 3: Initialize Figure ---
    fig = go.Figure()
    if data_plot.empty:
+       logger.warning("No valid data to plot after cleaning.")
        return fig

+   # --- Section 4: Calculate and Draw the Pareto Frontier ---
    if x_data_is_valid:
        sorted_data = data_plot.sort_values(by=[x_col_to_use, y_col_to_use], ascending=[True, False])
        frontier_points = []
        max_score_so_far = float('-inf')

+       for _, row in sorted_data.iterrows():
            score = row[y_col_to_use]
+           if score >= max_score_so_far:
                frontier_points.append({'x': row[x_col_to_use], 'y': score})
                max_score_so_far = score

        if frontier_points:
            frontier_df = pd.DataFrame(frontier_points)
            fig.add_trace(go.Scatter(
                ...
                mode='lines',
                name='Efficiency Frontier',
                line=dict(color='firebrick', width=2, dash='dash'),
+               hoverinfo='skip'
            ))
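The frontier above is a single running-max scan over points sorted by ascending cost, with ties broken by descending score: a point joins the frontier only if it matches or beats the best score seen at any cheaper cost. A self-contained sketch of the same idea on toy data:

    import pandas as pd

    pts = pd.DataFrame({"cost": [1.0, 1.0, 2.0, 3.0], "score": [0.4, 0.3, 0.6, 0.5]})
    pts = pts.sort_values(["cost", "score"], ascending=[True, False])

    frontier, best = [], float("-inf")
    for _, p in pts.iterrows():
        if p["score"] >= best:  # keep only points that raise (or tie) the running max
            frontier.append((p["cost"], p["score"]))
            best = p["score"]

    print(frontier)  # [(1.0, 0.4), (2.0, 0.6)] -- the $3.00 point is dominated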

+   # --- Section 5: Prepare for Marker Plotting ---
+   # Pre-generate hover text and shapes for each point
+   data_plot['hover_text'] = data_plot.apply(
+       lambda row: f"<b>{row[agent_col]}</b><br>{x_axis_label}: ${row[x_col_to_use]:.2f}<br>{y_col_to_use}: {row[y_col_to_use]:.2f}",
+       axis=1
+   )
+   data_plot['shape_symbol'] = data_plot['Agent Tooling'].map(shape_map).fillna(default_shape)
+
+   # --- Section 6: Plot Markers by "Openness" Category ---
+   for category in category_order:
+       group = data_plot[data_plot['Openness'] == category]
+       if group.empty:
+           continue
+
        fig.add_trace(go.Scatter(
            x=group[x_col_to_use],
            y=group[y_col_to_use],
            mode='markers',
+           name=category,
+           showlegend=False,
+           text=group['hover_text'],
+           hoverinfo='text',
+           marker=dict(
+               color=color_map.get(category, 'grey'),
+               symbol=group['shape_symbol'],
+               size=10,
+               opacity=0.8,
+               line=dict(width=1, color='DarkSlateGrey')
+           )
+       ))

+   # --- Section 7: Build the Legend from Dummy Traces ---
+   # Part A: Dummy traces for the COLORS ("Openness")
+   for i, category in enumerate(category_order):
+       fig.add_trace(go.Scatter(
+           x=[None], y=[None],
+           mode='markers',
+           name=category,
+           legendgroup="openness_group",
+           legendgrouptitle_text="Agent Openness" if i == 0 else None,
+           marker=dict(
+               color=color_map.get(category, 'grey'),
+               symbol='circle',
+               size=12
+           )
+       ))
+
+   # Part B: Dummy traces for the SHAPES ("Agent Tooling")
+   shape_items = list(shape_map.items())
+   for i, (shape_name, shape_symbol) in enumerate(shape_items):
+       fig.add_trace(go.Scatter(
+           x=[None], y=[None],
+           mode='markers',
+           name=shape_name,
+           legendgroup="tooling_group",
+           legendgrouptitle_text="Agent Tooling" if i == 0 else None,
+           marker=dict(color='grey', symbol=shape_symbol, size=12)
        ))
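The two legend loops rely on a standard Plotly idiom: the real data traces are added with `showlegend=False`, and the legend is then built from invisible traces whose coordinates are `[None]`, giving one clean entry per color and one per shape rather than one per plotted combination. A trimmed, generic illustration of the idiom (names here are not the leaderboard's):

    import plotly.graph_objects as go

    fig = go.Figure()
    # Real data, hidden from the legend
    fig.add_trace(go.Scatter(x=[1, 2], y=[3, 1], mode='markers', showlegend=False))
    # Legend-only entry: no coordinates, so nothing is drawn on the axes
    fig.add_trace(go.Scatter(x=[None], y=[None], mode='markers',
                             name='Open Source', marker=dict(color='green', size=12)))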

+   # --- Section 8: Configure the Layout ---
    xaxis_config = dict(title=x_axis_label)
    if not x_data_is_valid:
        xaxis_config['range'] = [DUMMY_X_VALUE_FOR_MISSING_COSTS - 1, DUMMY_X_VALUE_FOR_MISSING_COSTS + 1]
        ...
    else:
        xaxis_config['rangemode'] = "tozero"

+   logo_data_uri = svg_to_data_uri("assets/just-icon.svg")
+
    fig.update_layout(
+       template="plotly_white",
        title=f"{y_col_to_use} vs. {x_axis_label}",
        xaxis=xaxis_config,
        yaxis=dict(title=y_col_to_use, rangemode="tozero"),
+       legend=dict(
+           bgcolor='#FAF2E9',
+       )
+   )
+   fig.add_layout_image(
+       dict(
+           source=logo_data_uri,
+           xref="x domain", yref="y domain",
+           x=1.1, y=1.1,
+           sizex=0.2, sizey=0.2,
+           xanchor="left",
+           yanchor="bottom",
+           layer="above",
+       ),
    )

    return fig

+
def format_cost_column(df: pd.DataFrame, cost_col_name: str) -> pd.DataFrame:
    """
    Applies custom formatting to a cost column based on its corresponding score column.
    ...
        if pd.notna(cost_value) and isinstance(cost_value, (int, float)):
            return f"${cost_value:.2f}"
        elif pd.notna(score_value):
+           return f'<span style="color: {status_color};">Missing</span>'  # Score exists, but cost is missing
        else:
            return f'<span style="color: {status_color};">Not Attempted</span>'  # Neither score nor cost exists
    ...
    # Apply the formatting and return the updated DataFrame
    return df.assign(**{score_col_name: df[score_col_name].apply(apply_formatting)})

+
+def get_pareto_df(data):
+    # Autodetect the plotting columns: take the first column containing "Cost"
+    # and the first containing "Score".
+    cost_cols = [c for c in data.columns if 'Cost' in c]
+    score_cols = [c for c in data.columns if 'Score' in c]
+    if not cost_cols or not score_cols:
+        return pd.DataFrame()
+
+    x_col, y_col = cost_cols[0], score_cols[0]
+
+    frontier_data = data.dropna(subset=[x_col, y_col]).copy()
+    frontier_data[y_col] = pd.to_numeric(frontier_data[y_col], errors='coerce')
+    frontier_data[x_col] = pd.to_numeric(frontier_data[x_col], errors='coerce')
+    frontier_data.dropna(subset=[x_col, y_col], inplace=True)
+    if frontier_data.empty:
+        return pd.DataFrame()
+
+    frontier_data = frontier_data.sort_values(by=[x_col, y_col], ascending=[True, False])
+
+    pareto_points = []
+    max_score_at_cost = -np.inf
+
+    for _, row in frontier_data.iterrows():
+        if row[y_col] >= max_score_at_cost:
+            pareto_points.append(row)
+            max_score_at_cost = row[y_col]
+
+    return pd.DataFrame(pareto_points)
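Because `get_pareto_df` keys off the first column whose name contains "Cost" and the first containing "Score", it can be exercised directly on a toy frame (illustrative only; assumes the function above is in scope):

    import pandas as pd

    toy = pd.DataFrame({
        "Overall Score": [0.4, 0.6, 0.5],
        "Overall Cost": [1.0, 2.0, 3.0],
    })
    pareto = get_pareto_df(toy)
    print(pareto[["Overall Cost", "Overall Score"]].values.tolist())
    # [[1.0, 0.4], [2.0, 0.6]] -- the $3.00 row is dominated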

+
+def svg_to_data_uri(path: str) -> Optional[str]:
+    """Reads an SVG file and encodes it as a Data URI for Plotly."""
+    try:
+        with open(path, "rb") as f:
+            encoded_string = base64.b64encode(f.read()).decode()
+        return f"data:image/svg+xml;base64,{encoded_string}"
+    except FileNotFoundError:
+        logger.warning(f"SVG file not found at: {path}")
+        return None
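Plotly layout images generally want a URL, a data URI, or a PIL image rather than a bare local file path, which is presumably why the logo is inlined as base64 above. A minimal usage sketch (hypothetical figure; same asset path as used in `_plot_scatter_plotly`):

    import plotly.graph_objects as go

    fig = go.Figure(go.Scatter(x=[0, 1], y=[0, 1], mode='lines'))
    uri = svg_to_data_uri("assets/just-icon.svg")
    if uri:  # the helper returns None when the file is missing
        fig.add_layout_image(dict(source=uri, xref="paper", yref="paper",
                                  x=1, y=1, sizex=0.2, sizey=0.2))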
    	

leaderboard_viewer.py
DELETED

@@ -1,319 +0,0 @@
-"""
-View and plot leaderboard results.
-"""
-
-import logging
-from typing import Optional
-from zoneinfo import ZoneInfo
-
-import datasets
-import matplotlib.pyplot as plt
-import numpy as np
-import pandas as pd
-import seaborn as sns
-
-from agenteval import compute_summary_statistics
-from agenteval.config import SuiteConfig
-from agenteval.models import EvalResult
-
-logger = logging.getLogger(__name__)
-
-
-class LeaderboardViewer:
-    """
-    Load and visualize leaderboard for a given HF dataset split.
-    """
-
-    def __init__(
-            self, repo_id: str, config: str, split: str, is_internal: bool = False
-    ):
-        self._repo_id = repo_id
-        self._config = config
-        self._split = split
-        self._internal = is_internal
-
-        # build suite_config and mapping from tags to tasks from the first result
-        # TODO: Verify the sort order
-        ds = datasets.load_dataset(repo_id, name=config).get(split)
-        if not ds:
-            raise ValueError(f"Split '{split}' not found in dataset results")
-        suite = EvalResult.model_validate(ds[0]).suite_config
-        self._cfg = suite
-        self.tag_map: dict[str, list[str]] = {}
-        for task in suite.get_tasks(split):
-            for t in task.tags or []:
-                self.tag_map.setdefault(t, []).append(task.name)
-
-    def _load(self):
-        results = datasets.load_dataset(self._repo_id, name=self._config)
-        overview = _get_dataframe(
-            eval_results=results,
-            split=self._split,
-            is_internal=self._internal,
-            suite_config=self._cfg,
-        )
-        return overview, self.tag_map
-
-    def view(
-            self, tag: Optional[str] = None, with_plots: bool = False
-    ) -> tuple[pd.DataFrame, dict[str, plt.Figure]]:
-        """
-        If tag is None, primary="Overall" and group=all tags.
-        Otherwise primary=tag and group=tasks under that tag.
-        """
-        data, tag_map = self._load()
-        cols = [
-            "Agent",
-            "Submitter",
-            "Completeness",
-            "LLM Base",
-            "Openness",
-            "Date",
-            "Logs",
-        ]
-
-        # choose primary metric and its sub-group
-        if tag is None:
-            primary = "Overall"
-            group = list(tag_map.keys())
-        else:
-            primary = tag
-            group = tag_map.get(tag, [])
-        data = data.sort_values(primary, ascending=False)
-
-        # build full metric list: primary + its cost + each member and its cost
-        metrics = [primary, f"{primary} cost"] + [
-            m for t in group for m in (t, f"{t} cost")
-        ]
-
-        # filter to relevant columns
-        ci_cols = [f"{m} 95% CI" for m in metrics if f"{m} 95% CI" in data.columns]
-        df = data.loc[
-             :,
-             cols + [c for c in metrics if c in data.columns] + ci_cols,
-             ].reset_index(drop=True)
-
-        plots: dict[str, plt.Figure] = {}
-        if with_plots:
-            avail = [c for c in metrics if c in df.columns]
-            for m in [primary] + group:
-                x, y = f"{m} cost", m
-                if x in df.columns and y in df.columns:
-                    plots[f"scatter_{m}"] = _plot_scatter(
-                        df, x=x, y=y, agent_col="Agent"
-                    )
-
-        return df, plots
-
-
-def _get_dataframe(
-        eval_results: datasets.DatasetDict,
-        split: str,
-        is_internal: bool,
-        suite_config: SuiteConfig,
-        timezone: str = "US/Pacific",
-) -> pd.DataFrame:
-    """
-    Load leaderboard results from the given dataset split and return a DataFrame.
-    """
-    ds = eval_results.get(split)
-    if not ds:
-        cols = ["agent_name", "agent_description", "username", "submit_time"]
-        pretty = [_pretty_column_name(c) for c in cols]
-        empty = pd.DataFrame({c: ["No data"] for c in pretty})
-        return empty
-
-    cfg = suite_config
-
-    rows = []
-    for itm in ds:
-        ev = EvalResult.model_validate(itm)
-        sub = ev.submission
-        # only format if submit_time present, else leave as None
-        ts = sub.submit_time
-        if ts is not None:
-            date = ts.astimezone(ZoneInfo(timezone)).strftime("%Y-%m-%d")
-        else:
-            date = None
-
-        if not ev.results:
-            logger.warning(
-                f"Skipping submission {sub.agent_name} ({sub.username}) "
-                f"({sub.submit_time}) with no results"
-            )
-            continue
-        stats = compute_summary_statistics(
-            suite_config=cfg, split=split, results=ev.results
-        )
-        flat = {}
-        for key, s in stats.items():
-            parts = key.split("/")
-            if parts[0] == "overall":
-                flat["overall/score"], flat["overall/cost"] = s.score, s.cost
-            elif parts[0] == "tag":
-                flat[f"tag/{parts[1]}/score"], flat[f"tag/{parts[1]}/cost"] = (
-                    s.score,
-                    s.cost,
-                )
-            else:  # task
-                t0 = parts[1]
-                # compute 95% CI half-width from stderr
-                flat.update(
-                    {
-                        f"task/{t0}/score": s.score,
-                        f"task/{t0}/score_ci": (
-                            (s.score_stderr * 1.96)
-                            if s.score_stderr is not None
-                            else np.nan
-                        ),
-                        f"task/{t0}/cost": s.cost,
-                        f"task/{t0}/cost_ci": (
-                            (s.cost_stderr * 1.96)
-                            if s.cost_stderr is not None
-                            else np.nan
-                        ),
-                    }
-                )
-
-        rows.append(
-            {
-                "agent_name": sub.agent_name,
-                "username": sub.username or "",
-                "submit_time": date,
-                **flat,
-                "logs_url": sub.logs_url if is_internal else sub.logs_url_public,
-            }
-        )
-
-    df = pd.DataFrame(rows)
-
-    # prepare pretty column mapping
-    pretty_cols = {c: _pretty_column_name(c) for c in df.columns}
-
-    # construct overview table with human-friendly names
-    overview = df.rename(columns=pretty_cols)
-
-    return overview
-
-
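One detail worth noting in the flattening above: the `*_ci` fields store the half-width of a normal-approximation 95% confidence interval, i.e. the reported interval is score ± 1.96 × stderr. For example:

    score, stderr = 0.62, 0.02
    half_width = 1.96 * stderr
    print(f"{score:.2f} ± {half_width:.3f}")  # 0.62 ± 0.039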
         
            -
            def _pretty_column_name(col: str) -> str:
         
     | 
| 200 | 
         
            -
                """Map raw column name to display name."""
         
     | 
| 201 | 
         
            -
                # fixed mappings
         
     | 
| 202 | 
         
            -
                mapping = {
         
     | 
| 203 | 
         
            -
                    "submit_time": "Date",
         
     | 
| 204 | 
         
            -
                    "agent_name": "Agent",
         
     | 
| 205 | 
         
            -
                    "username": "User/organization",
         
     | 
| 206 | 
         
            -
                    "logs_url": "Logs",
         
     | 
| 207 | 
         
            -
                    "overall/score": "Score",
         
     | 
| 208 | 
         
            -
                    "overall/cost": "Cost (USD)",
         
     | 
| 209 | 
         
            -
                }
         
     | 
| 210 | 
         
            -
                if col in mapping:
         
     | 
| 211 | 
         
            -
                    return mapping[col]
         
     | 
| 212 | 
         
            -
                # dynamic: task/{name}/{metric} or tag/{name}/{metric}
         
     | 
| 213 | 
         
            -
                parts = col.split("/")
         
     | 
| 214 | 
         
            -
                if len(parts) == 3:
         
     | 
| 215 | 
         
            -
                    _, name, metric = parts
         
     | 
| 216 | 
         
            -
                    if metric == "score":
         
     | 
| 217 | 
         
            -
                        return name
         
     | 
| 218 | 
         
            -
                    if metric == "cost":
         
     | 
| 219 | 
         
            -
                        return f"{name} cost"
         
     | 
| 220 | 
         
            -
                    if metric == "score_ci":
         
     | 
| 221 | 
         
            -
                        return f"{name} 95% CI"
         
     | 
| 222 | 
         
            -
                    if metric == "cost_ci":
         
     | 
| 223 | 
         
            -
                        return f"{name} cost 95% CI"
         
     | 
| 224 | 
         
            -
                # fallback to last segment
         
     | 
| 225 | 
         
            -
                return parts[-1]
         
     | 
| 226 | 
         
            -
             
     | 
| 227 | 
         
            -
             
     | 
| 228 | 
         
            -
             
     | 
-def _plot_scatter(
-        data: pd.DataFrame,
-        x: str,  # Cost column name (e.g., "Overall cost")
-        y: str,  # Score column name (e.g., "Overall score")
-        agent_col: str,
-) -> plt.Figure:
-    """Scatter plot of agent results, showing score vs cost with Pareto frontier."""
-    fig, ax = plt.subplots(figsize=(20, 7))
-
-    # Make a copy for manipulation to find the frontier without affecting the original data
-    plot_data = data.copy()
-
-    # Ensure score (y) and cost (x) are numeric and drop NaNs for the frontier calculation
-    plot_data[y] = pd.to_numeric(plot_data[y], errors='coerce')
-    plot_data[x] = pd.to_numeric(plot_data[x], errors='coerce')
-    frontier_data = plot_data.dropna(subset=[y, x])
-
-    if not frontier_data.empty:
-        # Sort by cost (x) ascending, then by score (y) descending for tie-breaking
-        frontier_data = frontier_data.sort_values(by=[x, y], ascending=[True, False])
-
-        pareto_points = []
-        max_score_at_cost = -np.inf  # Initialize with negative infinity
-
-        for index, row in frontier_data.iterrows():
-            current_score = row[y]
-            current_cost = row[x]
-            # A point joins the frontier only if it strictly improves on the best
-            # score seen so far; the cost ordering is implicit in the sort above.
-            if current_score > max_score_at_cost:
-                pareto_points.append(row)
-                max_score_at_cost = current_score
-
-        if pareto_points:
-            pareto_df = pd.DataFrame(pareto_points)
-            # Sort pareto_df by cost again just to be sure for plotting line
         
     | 
| 269 | 
         
            -
                        pareto_df = pareto_df.sort_values(by=x)
         
     | 
| 270 | 
         
            -
                        # Plot the Pareto frontier line
         
     | 
| 271 | 
         
            -
                        ax.plot(pareto_df[x], pareto_df[y], marker='o', linestyle='-', color='red', alpha=0.7, linewidth=2, markersize=5, label='Pareto Frontier')
         
     | 
| 272 | 
         
            -
             
     | 
| 273 | 
         
            -
                # Plot all data points
         
     | 
| 274 | 
         
            -
                sns.scatterplot(data=data, x=x, y=y, hue=agent_col, s=100, ax=ax, legend="auto")
         
     | 
| 275 | 
         
            -
             
     | 
| 276 | 
         
            -
                # Error bars (if they exist)
         
     | 
| 277 | 
         
            -
                x_ci_col = f"{x} 95% CI"
         
     | 
| 278 | 
         
            -
                y_ci_col = f"{y} 95% CI"
         
     | 
| 279 | 
         
            -
                if x_ci_col in data.columns or y_ci_col in data.columns:
         
     | 
| 280 | 
         
            -
                    # Filter data for error bars to only include rows present in the original 'data'
         
     | 
| 281 | 
         
            -
                    # This is important if 'frontier_data' subset was used for some logic but error bars are for all.
         
     | 
| 282 | 
         
            -
                    error_bar_data = data.copy() # Use original data for error bars
         
     | 
| 283 | 
         
            -
                    error_bar_data[x_ci_col] = pd.to_numeric(error_bar_data.get(x_ci_col), errors='coerce')
         
     | 
| 284 | 
         
            -
                    error_bar_data[y_ci_col] = pd.to_numeric(error_bar_data.get(y_ci_col), errors='coerce')
         
     | 
| 285 | 
         
            -
             
     | 
| 286 | 
         
            -
                    ax.errorbar(
         
     | 
| 287 | 
         
            -
                        x=error_bar_data[x], # Use original data's x
         
     | 
| 288 | 
         
            -
                        y=error_bar_data[y], # Use original data's y
         
     | 
| 289 | 
         
            -
                        xerr=error_bar_data.get(x_ci_col),
         
     | 
| 290 | 
         
            -
                        yerr=error_bar_data.get(y_ci_col),
         
     | 
| 291 | 
         
            -
                        fmt="none",
         
     | 
| 292 | 
         
            -
                        ecolor="gray",
         
     | 
| 293 | 
         
            -
                        alpha=0.5,
         
     | 
| 294 | 
         
            -
                        capsize=3,
         
     | 
| 295 | 
         
            -
                        zorder=0 # Draw error bars behind scatter points
         
     | 
| 296 | 
         
            -
                    )
         
     | 
| 297 | 
         
            -
             
     | 
| 298 | 
         
            -
                ax.set_xlim(left=0)
         
     | 
| 299 | 
         
            -
                ax.set_ylim(bottom=0) # Scores and costs are typically non-negative
         
     | 
| 300 | 
         
            -
                ax.set_xlabel(x) # x is cost
         
     | 
| 301 | 
         
            -
                ax.set_ylabel(y) # y is score
         
     | 
| 302 | 
         
            -
             
     | 
| 303 | 
         
            -
                # Adjust legend: Get handles and labels from seaborn plot, then add frontier's
         
     | 
| 304 | 
         
            -
                handles, labels = ax.get_legend_handles_labels()
         
     | 
| 305 | 
         
            -
                # Check if "Pareto Frontier" was actually plotted and add its handle/label if so
         
     | 
| 306 | 
         
            -
                if pareto_points and "Pareto Frontier" not in labels: # Avoid duplicate legend items
         
     | 
| 307 | 
         
            -
                    # Find the frontier line object to get its handle
         
     | 
| 308 | 
         
            -
                    frontier_line = next((line for line in ax.get_lines() if line.get_label() == 'Pareto Frontier'), None)
         
     | 
| 309 | 
         
            -
                    if frontier_line:
         
     | 
| 310 | 
         
            -
                        handles.append(frontier_line)
         
     | 
| 311 | 
         
            -
                        labels.append('Pareto Frontier')
         
     | 
| 312 | 
         
            -
             
     | 
| 313 | 
         
            -
                ax.legend(handles=handles, labels=labels, title=agent_col, bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0.)
         
     | 
| 314 | 
         
            -
             
     | 
| 315 | 
         
            -
                plt.tight_layout(rect=[0, 0, 0.85, 1])
         
     | 
| 316 | 
         
            -
                return fig
         
     | 
| 317 | 
         
            -
             
     | 
| 318 | 
         
            -
             
     | 
| 319 | 
         
            -
            __all__ = ["LeaderboardViewer"]
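The frontier rule in the deleted `_plot_scatter` is simple to check in isolation: after sorting by cost ascending (score descending on ties), a point joins the frontier exactly when it strictly improves on the best score seen so far. A minimal sketch of that selection on a toy frame (the column names and numbers here are illustrative, not taken from the repo):

```python
import numpy as np
import pandas as pd

def pareto_frontier(df: pd.DataFrame, cost_col: str, score_col: str) -> pd.DataFrame:
    """Keep the cost-sorted points that strictly improve the running best score."""
    pts = df.dropna(subset=[cost_col, score_col])
    pts = pts.sort_values(by=[cost_col, score_col], ascending=[True, False])
    best = -np.inf
    keep = []
    for _, row in pts.iterrows():
        if row[score_col] > best:  # strict improvement only
            keep.append(row)
            best = row[score_col]
    return pd.DataFrame(keep)

toy = pd.DataFrame({
    "cost":  [0.10, 0.20, 0.20, 0.50, 0.80],
    "score": [0.30, 0.60, 0.40, 0.55, 0.70],
})
print(pareto_frontier(toy, "cost", "score"))
# Keeps (0.10, 0.30), (0.20, 0.60), (0.80, 0.70); the other two points are
# dominated (higher cost without a higher score).
```

Note the strict inequality: an agent that merely matches the best score at a higher cost stays off the frontier, which is also why the deleted code re-sorts `pareto_df` by cost before drawing the line.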
         
literature_understanding.py CHANGED

@@ -2,13 +2,19 @@ import gradio as gr
 import pandas as pd
 
 # Import our UI factories and the data loader
-from ui_components import create_leaderboard_display, create_benchmark_details_display, get_full_leaderboard_data
-
+from ui_components import create_leaderboard_display, create_benchmark_details_display, get_full_leaderboard_data, create_sub_navigation_bar
+from content import LIT_DESCRIPTION
 # Define the category for this page
 CATEGORY_NAME = "Literature Understanding"
 
 with gr.Blocks() as demo:
-    gr.Markdown(f"## {CATEGORY_NAME}
+    gr.Markdown(f"## {CATEGORY_NAME} Aggregated")
+
+    validation_df, validation_tag_map = get_full_leaderboard_data("validation")
+    test_df, test_tag_map = get_full_leaderboard_data("test")
+    gr.Markdown(LIT_DESCRIPTION, elem_id="category-intro")
+    if validation_tag_map:
+        create_sub_navigation_bar(validation_tag_map, CATEGORY_NAME)
 
     # --- This page now has two main sections: Validation and Test ---
     with gr.Tabs():
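The new page layout loads both splits eagerly, shows a category description, and builds a sub-navigation bar from the validation tag map. The other category pages touched by this commit (`c_and_e.py`, `data_analysis.py`, `e2e.py`, each changed by a similar +8/-3 or +9/-3) presumably follow the same pattern. A sketch of that wiring for a hypothetical category page; the `DA_DESCRIPTION` constant and the "Data Analysis" title are illustrative assumptions, not confirmed by this diff:

```python
import gradio as gr

from ui_components import get_full_leaderboard_data, create_sub_navigation_bar
from content import DA_DESCRIPTION  # hypothetical constant, by analogy with LIT_DESCRIPTION

CATEGORY_NAME = "Data Analysis"  # illustrative category title

with gr.Blocks() as demo:
    gr.Markdown(f"## {CATEGORY_NAME} Aggregated")

    # Load both splits up front; each call returns (DataFrame, tag_map).
    validation_df, validation_tag_map = get_full_leaderboard_data("validation")
    test_df, test_tag_map = get_full_leaderboard_data("test")

    gr.Markdown(DA_DESCRIPTION, elem_id="category-intro")
    # Guard on the tag map so the page still renders when the split has no data.
    if validation_tag_map:
        create_sub_navigation_bar(validation_tag_map, CATEGORY_NAME)
```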
    	
main_page.py CHANGED

@@ -1,341 +1,26 @@
 import matplotlib
 matplotlib.use('Agg')
-
-import os
-import shutil
-import tarfile
-import tempfile
-from datetime import datetime, timedelta, timezone
-from email.utils import parseaddr
-from pathlib import Path
-# from zoneinfo import ZoneInfo # LeaderboardViewer uses this, ensure it's available
-
 import gradio as gr
-
-from agenteval import (
-    # compute_summary_statistics, # This will now be used by LeaderboardViewer
-    process_eval_logs,
-    upload_folder_to_hf,
-    upload_summary_to_hf,
-)
-from agenteval.models import EvalResult # Used by submission and LeaderboardViewer (implicitly)
-from agenteval.leaderboard.upload import sanitize_path_component
-from datasets import Dataset, DatasetDict, VerificationMode, load_dataset # load_dataset used by LV
-from datasets.data_files import EmptyDatasetError
-from huggingface_hub import HfApi
+
 
 from ui_components import create_leaderboard_display, get_full_leaderboard_data
 
 from content import (
     CITATION_BUTTON_LABEL,
     CITATION_BUTTON_TEXT,
-
-    SUBMISSION_TEXT,
-    INTRO_PARAGRAPH,
-    SCATTER_DISCLAIMER,
-    format_error,
-    format_log,
-    format_warning,
+    INTRO_PARAGRAPH
 )
 
-# --- Constants and Configuration  ---
-LOCAL_DEBUG = not (os.environ.get("system") == "spaces")
-CONFIG_NAME = "1.0.0-dev1" # This corresponds to 'config' in LeaderboardViewer
-IS_INTERNAL = os.environ.get("IS_INTERNAL", "false").lower() == "true"
-
-OWNER = "allenai"
-PROJECT_NAME = "asta-bench" + ("-internal" if IS_INTERNAL else "")
-SUBMISSION_DATASET = f"{OWNER}/{PROJECT_NAME}-submissions"
-SUBMISSION_DATASET_PUBLIC = f"{OWNER}/{PROJECT_NAME}-submissions-public"
-CONTACT_DATASET = f"{OWNER}/{PROJECT_NAME}-contact-info"
-RESULTS_DATASET = f"{OWNER}/{PROJECT_NAME}-results" # This is the repo_id for LeaderboardViewer
-LEADERBOARD_PATH = f"{OWNER}/{PROJECT_NAME}-leaderboard"
-
-if LOCAL_DEBUG:
-    DATA_DIR = os.path.join(os.path.dirname(__file__), "data", CONFIG_NAME)
-else:
-    DATA_DIR = "/home/user/data/" + CONFIG_NAME
-EXTRACTED_DATA_DIR = os.path.join(DATA_DIR, "extracted")
-
-api = HfApi()
-MAX_UPLOAD_BYTES = 100 * 1024**2
-AGENTEVAL_MANIFEST_NAME = "agenteval.json"
-os.makedirs(EXTRACTED_DATA_DIR, exist_ok=True)
-
 # --- Global State for Viewers (simple caching) ---
 CACHED_VIEWERS = {}
 CACHED_TAG_MAPS = {}
 
-
-def try_load_dataset_submission(*args, **kwargs):
-    try:
-        return load_dataset(*args, **kwargs)
-    except EmptyDatasetError:
-        return DatasetDict()
-    except ValueError: # Handles cases where dataset is empty or ill-formed
-        return DatasetDict()
-
-def checked_upload_folder(
-    api_hf: HfApi, # Renamed to avoid conflict with global api
-    folder_path: str,
-    repo_id: str,
-    config_name_ul: str, # Renamed
-    split_ul: str, # Renamed
-    submission_name_ul: str, # Renamed
-) -> str:
-    total = 0
-    for root, _, files in os.walk(folder_path):
-        for f_ul in files: # Renamed
-            total += os.path.getsize(os.path.join(root, f_ul))
-            if total > MAX_UPLOAD_BYTES:
-                raise ValueError(
-                    f"Upload too large: exceeds {MAX_UPLOAD_BYTES // (1024**2)} MB limit."
-                )
-    return upload_folder_to_hf(
-        api=api_hf, # Use renamed parameter
-        folder_path=folder_path,
-        repo_id=repo_id,
-        config_name=config_name_ul,
-        split=split_ul,
-        submission_name=submission_name_ul,
-    )
-
-def add_new_eval(
-    val_or_test: str,
-    agent_name: str | None,
-    agent_description: str,
-    agent_url: str,
-    openness: str | None,
-    degree_of_control: str | None,
-    path_to_file: tempfile._TemporaryFileWrapper | None,
-    username: str,
-    mail: str,
-    profile: gr.OAuthProfile,
-    # We need global eval_results for checks; this might need rethinking if it's purely display driven now
-    # For now, let's assume we still load it for submission checks
-):
-    # Load current eval_results for submission checks
-    # This is a bit redundant if the display part reloads it, but submission needs its own consistent view
-    current_eval_results_for_submission = try_load_dataset_submission(
-        RESULTS_DATASET,
-        CONFIG_NAME,
-        download_mode="force_redownload", # Or a less aggressive mode
-        verification_mode=VerificationMode.NO_CHECKS,
-        trust_remote_code=True,
-    )
-    if not agent_name:
-        return format_warning("Please provide an agent name.")
-
-    submission_time = datetime.now(timezone.utc)
-    if not username or username.strip() == "":
-        username = profile.username # Default to HF username
-
-    # User account age check
-    try:
-        user_data_resp = requests.get(f"https://huggingface.co/api/users/{profile.username}/overview")
-        user_data_resp.raise_for_status()
-        creation_date_str = user_data_resp.json()["createdAt"]
-        created_at = datetime.strptime(creation_date_str, "%Y-%m-%dT%H:%M:%S.%fZ").replace(tzinfo=timezone.utc)
-        if submission_time - created_at < timedelta(days=60):
-            return format_error("This account is not authorized to submit here (account too new).")
-    except Exception as e:
-        print(f"Error checking user account age: {e}")
-        return format_error("Could not verify account age. Please try again later.")
-
-    # Submission frequency check
-    contact_infos = try_load_dataset_submission(
-        CONTACT_DATASET, CONFIG_NAME, download_mode="force_redownload",
-        verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True
-    )
-    user_submission_dates = sorted(
-        datetime.fromisoformat(row["submit_time"])
-        for row in contact_infos.get(val_or_test, []) if row["username_auth"] == profile.username
-    )
-    if user_submission_dates and (submission_time - user_submission_dates[-1] < timedelta(days=1)):
-        return format_error("You already submitted once in the last 24h for this split; please try again later.")
-
-    # Email validation
-    _, parsed_mail = parseaddr(mail)
-    if "@" not in parsed_mail:
-        return format_warning("Please provide a valid email address.")
-
-    # Duplicate submission check
-    if val_or_test in current_eval_results_for_submission and len(current_eval_results_for_submission[val_or_test]) > 0:
-        existing_submissions = current_eval_results_for_submission[val_or_test].to_dict().get("submission", [])
-        for sub_item in existing_submissions:
-            if (sub_item.get("agent_name", "").lower() == agent_name.lower() and
-                    sub_item.get("username", "").lower() == username.lower()):
-                return format_warning("This agent name by this user has already been submitted to this split.")
-
-    if path_to_file is None:
-        return format_warning("Please attach a .tar.gz file.")
-
-    safe_username = sanitize_path_component(username)
-    safe_agent_name = sanitize_path_component(agent_name)
-    extracted_dir = os.path.join(EXTRACTED_DATA_DIR, f"{safe_username}_{safe_agent_name}")
-
-    # File extraction
-    if not LOCAL_DEBUG:
-        try:
-            if os.path.exists(extracted_dir): shutil.rmtree(extracted_dir)
-            os.makedirs(extracted_dir, exist_ok=True)
-            with tarfile.open(path_to_file.name, "r:gz") as tar:
-                members_extracted = 0
-                for member in tar.getmembers():
-                    if not member.isreg(): continue
-                    fname = os.path.basename(member.name)
-                    if not fname or fname.startswith("."): continue
-                    fobj = tar.extractfile(member)
-                    if not fobj: continue
-                    with open(os.path.join(extracted_dir, fname), "wb") as out:
-                        out.write(fobj.read())
-                    members_extracted += 1
-                if members_extracted == 0:
-                    return format_error("Submission tarball is empty or contains no valid files.")
-        except Exception as e:
-            return format_error(f"Error extracting file: {e}. Ensure it's a valid .tar.gz.")
-    else: print("mock extracted file", flush=True)
-
-
-    submission_name = f"{safe_username}_{safe_agent_name}_{submission_time.strftime('%Y-%m-%d_%H-%M-%S')}"
-
-    # 1. Upload raw (unscored) submission files
-    if not LOCAL_DEBUG:
-        try:
-            checked_upload_folder(api, extracted_dir, SUBMISSION_DATASET, CONFIG_NAME, val_or_test, submission_name)
-        except ValueError as e: return format_error(str(e))
-        except Exception as e: return format_error(f"Failed to upload raw submission: {e}")
-    else: print("mock uploaded raw submission", flush=True)
-
-    # 2. Save contact information
-    contact_info = {
-        "agent_name": agent_name, "agent_description": agent_description, "url": agent_url,
-        "username": username, "username_auth": profile.username, "mail": mail,
-        "submit_time": submission_time.isoformat(),
-    }
-    if val_or_test in contact_infos:
-        contact_infos[val_or_test] = contact_infos[val_or_test].add_item(contact_info)
-    else:
-        contact_infos[val_or_test] = Dataset.from_list([contact_info])
-
-    if not LOCAL_DEBUG:
-        try:
-            contact_infos.push_to_hub(CONTACT_DATASET, config_name=CONFIG_NAME)
-        except Exception as e: return format_warning(f"Submission recorded, but contact info failed to save: {e}")
-    else: print("mock uploaded contact info", flush=True)
-
-
-    # 3. Process and score the submission
-    eval_result_obj = None # Define to avoid NameError
-    try:
-        json_path = Path(extracted_dir) / AGENTEVAL_MANIFEST_NAME
-        if not json_path.exists():
-            return format_error(f"Missing manifest {AGENTEVAL_MANIFEST_NAME} in submission.")
-
-        eval_result_obj = EvalResult.model_validate_json(json_path.read_text(encoding="utf-8"))
-        if eval_result_obj.suite_config.version != CONFIG_NAME:
-            return format_error(f"Suite version mismatch: expected {CONFIG_NAME}, got {eval_result_obj.suite_config.version}.")
-        if eval_result_obj.split != val_or_test:
-            return format_error(f"Split mismatch: expected {val_or_test}, got {eval_result_obj.split}.")
-
-        # Re-compute results from logs for integrity
-        eval_result_obj.results = process_eval_logs(extracted_dir)[0] # Assuming process_eval_logs returns a tuple/list
-        eval_result_obj.save_json(str(json_path)) # Save the re-processed manifest
-
-    except Exception as e:
-        return format_error(f"Error scoring submission: {e}. Check manifest and log files.")
-
-    # 4. Upload scored submission files
-    logs_url_private_val, logs_url_public_val = None, None
-    scored_submission_name = f"{submission_name}_scored"
-    if not LOCAL_DEBUG:
-        try:
-            logs_url_private_val = checked_upload_folder(api, extracted_dir, SUBMISSION_DATASET, CONFIG_NAME, val_or_test, scored_submission_name)
-            if val_or_test == "validation" and not IS_INTERNAL: # Public copy for validation
-                logs_url_public_val = checked_upload_folder(api, extracted_dir, SUBMISSION_DATASET_PUBLIC, CONFIG_NAME, val_or_test, scored_submission_name)
-        except ValueError as e: return format_error(str(e))
-        except Exception as e: return format_error(f"Failed to upload scored submission: {e}")
-    else: print("mock uploaded scored submission", flush=True)
-
-
-    # Update EvalResult with submission details
-    eval_result_obj.submission.agent_name = agent_name
-    eval_result_obj.submission.agent_description = agent_description
-    eval_result_obj.submission.agent_url = agent_url
-    eval_result_obj.submission.openness = openness
-    eval_result_obj.submission.degree_of_control = degree_of_control
-    eval_result_obj.submission.username = username
-    eval_result_obj.submission.submit_time = submission_time
-    eval_result_obj.submission.logs_url = logs_url_private_val
-    eval_result_obj.submission.logs_url_public = logs_url_public_val
-
-    # 5. Upload summary statistics to RESULTS_DATASET (for the leaderboard)
-    if not LOCAL_DEBUG:
-        try:
-            upload_summary_to_hf(api, eval_result_obj, RESULTS_DATASET, CONFIG_NAME, val_or_test, scored_submission_name)
-        except Exception as e:
-            return format_error(f"Failed to upload summary results to leaderboard: {e}")
-    else: print("mock uploaded results to lb", flush=True)
-
-    # Invalidate viewer cache for the split that was updated
-    if val_or_test in CACHED_VIEWERS:
-        del CACHED_VIEWERS[val_or_test]
-    if val_or_test in CACHED_TAG_MAPS:
-        del CACHED_TAG_MAPS[val_or_test]
-
-
-    return format_log(
-        f"Agent '{agent_name}' submitted successfully by '{username}' to '{val_or_test}' split. "
-        "Please refresh the leaderboard in a few moments. It may take some time for changes to propagate."
-    )
-
-with gr.Blocks() as demo:
-    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
-    gr.HTML(INTRO_PARAGRAPH, elem_id="intro-paragraph")
+with gr.Blocks(fill_width=True) as demo:
+    gr.Markdown(INTRO_PARAGRAPH, elem_id="intro-paragraph")
 
-    # --- Submission Accordion ---
-    with gr.Accordion("🚀 Submit a new agent for evaluation", open=False, elem_classes="submission-accordion"):
-        gr.Markdown(SUBMISSION_TEXT, elem_id="markdown-text")
-        with gr.Row():
-            with gr.Column():
-                level_of_test_radio = gr.Radio(["validation", "test"], value="validation", label="Split")
-                agent_name_tb = gr.Textbox(label="Agent Name")
-                agent_desc_tb = gr.Textbox(label="Agent Description")
-                agent_url_tb = gr.Textbox(label="URL to Agent Information")
-                openness_radio = gr.Radio(["Open Source", "API", "UI"], value=None, label="Openness of Agent")
-                degree_of_control_radio = gr.Radio(["Standard", "Custom"], value=None, label="Degree of Control")
-            with gr.Column():
-                username_tb = gr.Textbox(label="Organization or User Name (Defaults to HF username)")
-                mail_tb = gr.Textbox(label="Contact Email (Private, for submission issues)")
-                file_upload_comp = gr.File(
-                    label="Submission File (.tar.gz ...)", # Shortened for brevity
-                    file_types=[".gz", ".tar.gz"]
-                )
-        with gr.Row():
-            gr.LoginButton()
-            submit_eval_button = gr.Button("Submit Evaluation")
-        submission_result = gr.Markdown()
-
-        submit_eval_button.click(
-            add_new_eval,
-            [
-                level_of_test_radio,
-                agent_name_tb,
-                agent_desc_tb,
-                agent_url_tb,
-                openness_radio,
-                degree_of_control_radio,
-                file_upload_comp,
-                username_tb,
-                mail_tb
-            ],
-            submission_result,
-        )
-
     # --- Leaderboard Display Section ---
     gr.Markdown("---")
     CATEGORY_NAME = "Overall"
-    gr.Markdown(f"## {CATEGORY_NAME}
 
     with gr.Tabs() as tabs:
         with gr.Tab("Results: Validation"):

@@ -352,7 +37,6 @@ with gr.Blocks() as demo:
                 split_name="validation"
             )
         else:
-            # Display a message if no data is available
            gr.Markdown("No data available for validation split.")
 
     with gr.Tab("Results: Test"):
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 20 | 
         
             
                # --- Leaderboard Display Section ---
         
     | 
| 21 | 
         
             
                gr.Markdown("---")
         
     | 
| 22 | 
         
             
                CATEGORY_NAME = "Overall"
         
     | 
| 23 | 
         
            +
                gr.Markdown(f"## {CATEGORY_NAME} Categories Aggregated")
         
     | 
| 24 | 
         | 
| 25 | 
         
             
                with gr.Tabs() as tabs:
         
     | 
| 26 | 
         
             
                    with gr.Tab("Results: Validation"):
         
     | 
| 
         | 
|
| 37 | 
         
             
                                split_name="validation"
         
     | 
| 38 | 
         
             
                            )
         
     | 
| 39 | 
         
             
                        else:
         
     | 
| 
         | 
|
| 40 | 
         
             
                            gr.Markdown("No data available for validation split.")
         
     | 
| 41 | 
         | 
| 42 | 
         
             
                    with gr.Tab("Results: Test"):
         
     | 
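The diff collapses new lines 27 to 36, which is where each tab's body is built. Given the helpers imported on line 6 and the `split_name="validation"` keyword that survives in the hunk above, each tab plausibly loads the split's data with `get_full_leaderboard_data` and renders it with `create_leaderboard_display`. A minimal sketch of that assumed wiring follows; neither helper's signature is visible in this commit, so the return shape and argument order below are hypothetical:

```python
import gradio as gr

from ui_components import create_leaderboard_display, get_full_leaderboard_data

with gr.Blocks() as sketch:
    with gr.Tabs():
        with gr.Tab("Results: Validation"):
            # Assumed return shape: a DataFrame plus a tag map, suggested by the
            # CACHED_VIEWERS / CACHED_TAG_MAPS pair used elsewhere in this commit.
            df, tag_map = get_full_leaderboard_data("validation")
            if df is not None and not df.empty:
                create_leaderboard_display(
                    df,
                    tag_map,
                    split_name="validation"  # the one argument the hunk confirms
                )
            else:
                gr.Markdown("No data available for validation split.")
```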
    	
requirements.txt
CHANGED

@@ -1,5 +1,131 @@
+agent-eval==0.1.13
+aiobotocore==2.22.0
+aiofiles==24.1.0
+aiohappyeyeballs==2.6.1
+aiohttp==3.11.18
+aioitertools==0.12.0
+aiosignal==1.3.2
+annotated-types==0.7.0
+anyio==4.9.0
+APScheduler==3.11.0
+async-timeout==5.0.1
+attrs==25.3.0
+Authlib==1.5.2
+beautifulsoup4==4.13.4
+black==25.1.0
+botocore==1.37.3
+certifi==2025.4.26
+cffi==1.17.1
+charset-normalizer==3.4.2
+click==8.1.8
+contourpy==1.3.2
+cryptography==44.0.3
+cycler==0.12.1
+datasets==3.6.0
+debugpy==1.8.14
+dill==0.3.8
+distro==1.9.0
+docstring_parser==0.16
+exceptiongroup==1.2.2
+fastapi==0.115.12
+ffmpy==0.5.0
+filelock==3.18.0
+fonttools==4.58.1
+frozenlist==1.6.0
+fsspec==2025.3.0
+gradio==5.30.0
+gradio_client==1.10.1
+groovy==0.1.2
+h11==0.16.0
+httpcore==1.0.9
+httpx==0.28.1
+huggingface-hub==0.30.2
+idna==3.10
+ijson==3.3.0
+importlib_metadata==8.7.0
+inspect_ai==0.3.94
+isort==6.0.1
+itsdangerous==2.2.0
+Jinja2==3.1.6
+jiter==0.9.0
+jmespath==1.0.1
+jsonlines==4.0.0
+jsonpatch==1.33
+jsonpointer==3.0.0
+jsonschema==4.23.0
+jsonschema-specifications==2025.4.1
+kiwisolver==1.4.8
+linkify-it-py==2.0.3
+litellm==1.68.1
+markdown-it-py==3.0.0
+MarkupSafe==3.0.2
+matplotlib==3.10.3
+mdit-py-plugins==0.4.2
+mdurl==0.1.2
+mmh3==5.1.0
+mplcursors==0.6
+multidict==6.4.3
+multiprocess==0.70.16
+mypy_extensions==1.1.0
+narwhals==1.38.2
+nest-asyncio==1.6.0
+numpy==2.2.5
+openai==1.75.0
+orjson==3.10.18
+packaging==25.0
+pandas==2.2.3
+pathspec==0.12.1
+pillow==11.2.1
+platformdirs==4.3.7
+plotly==6.0.1
+propcache==0.3.1
+psutil==7.0.0
+pyarrow==20.0.0
+pycparser==2.22
+pydantic==2.11.4
+pydantic_core==2.33.2
+pydub==0.25.1
+Pygments==2.19.1
+pyparsing==3.2.3
+python-dateutil==2.9.0.post0
+python-dotenv==1.1.0
+python-multipart==0.0.20
+pytz==2025.2
+PyYAML==6.0.2
+referencing==0.36.2
+regex==2024.11.6
+requests==2.32.3
+rich==13.9.4
+rpds-py==0.24.0
+ruff==0.11.8
+s3fs==2025.3.0
+safehttpx==0.1.6
+seaborn==0.13.2
+semantic-version==2.10.0
+semver==3.0.4
+shellingham==1.5.4
+shortuuid==1.0.13
+six==1.17.0
+sniffio==1.3.1
+soupsieve==2.7
+starlette==0.46.2
+tenacity==9.1.2
+textual==3.2.0
+tiktoken==0.9.0
+tokenizers==0.21.1
+tomli==2.2.1
+tomlkit==0.13.2
+tqdm==4.67.1
+typer==0.15.3
+typing-inspection==0.4.0
+typing_extensions==4.13.2
+tzdata==2025.2
+tzlocal==5.3.1
+uc-micro-py==1.0.3
+urllib3==2.4.0
+uvicorn==0.34.2
+websockets==15.0.1
+wrapt==1.17.2
+xxhash==3.5.0
+yarl==1.20.0
+zipp==3.21.0
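One pin worth flagging: the distribution `agent-eval==0.1.13` appears to be what provides the `agenteval` package imported throughout this commit, so the pip name and the import name differ. A quick sanity check after installing, using only the standard library (the version string is the one pinned above):

```python
from importlib.metadata import version

# Distribution name (what pip sees) vs. import name (what the code uses).
assert version("agent-eval") == "0.1.13"
import agenteval  # noqa: F401  -- assumed to be provided by the agent-eval distribution
```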
    	
submission.py
ADDED

@@ -0,0 +1,324 @@
+import matplotlib
+matplotlib.use('Agg')
+
+import os
+import shutil
+import tarfile
+import tempfile
+from datetime import datetime, timedelta, timezone
+from email.utils import parseaddr
+from pathlib import Path
+
+import gradio as gr
+import requests
+from agenteval import (
+    process_eval_logs,
+    upload_folder_to_hf,
+    upload_summary_to_hf,
+)
+from agenteval.models import EvalResult
+from agenteval.leaderboard.upload import sanitize_path_component
+from datasets import Dataset, DatasetDict, VerificationMode, load_dataset
+from datasets.data_files import EmptyDatasetError
+from huggingface_hub import HfApi
+
+from content import (
+    CITATION_BUTTON_LABEL,
+    CITATION_BUTTON_TEXT,
+    format_error,
+    format_log,
+    format_warning,
+)
+
+# --- Constants and Configuration ---
+LOCAL_DEBUG = not (os.environ.get("system") == "spaces")
+CONFIG_NAME = "1.0.0-dev1"  # This corresponds to 'config' in LeaderboardViewer
+IS_INTERNAL = os.environ.get("IS_INTERNAL", "false").lower() == "true"
+
+OWNER = "allenai"
+PROJECT_NAME = "asta-bench" + ("-internal" if IS_INTERNAL else "")
+SUBMISSION_DATASET = f"{OWNER}/{PROJECT_NAME}-submissions"
+SUBMISSION_DATASET_PUBLIC = f"{OWNER}/{PROJECT_NAME}-submissions-public"
+CONTACT_DATASET = f"{OWNER}/{PROJECT_NAME}-contact-info"
+RESULTS_DATASET = f"{OWNER}/{PROJECT_NAME}-results"  # This is the repo_id for LeaderboardViewer
+LEADERBOARD_PATH = f"{OWNER}/{PROJECT_NAME}-leaderboard"
+
+if LOCAL_DEBUG:
+    DATA_DIR = os.path.join(os.path.dirname(__file__), "data", CONFIG_NAME)
+else:
+    DATA_DIR = "/home/user/data/" + CONFIG_NAME
+EXTRACTED_DATA_DIR = os.path.join(DATA_DIR, "extracted")
+
+api = HfApi()
+MAX_UPLOAD_BYTES = 100 * 1024**2
+AGENTEVAL_MANIFEST_NAME = "agenteval.json"
+os.makedirs(EXTRACTED_DATA_DIR, exist_ok=True)
+
+# --- Global State for Viewers (simple caching) ---
+CACHED_VIEWERS = {}
+CACHED_TAG_MAPS = {}
+
+# --- Submission Logic (largely unchanged from the original; ensure EvalResult and other deps are fine) ---
+def try_load_dataset_submission(*args, **kwargs) -> DatasetDict:  # Renamed to avoid a conflict if LeaderboardViewer has one
+    try:
+        return load_dataset(*args, **kwargs)
+    except EmptyDatasetError:
+        return DatasetDict()
+    except ValueError:  # Handles cases where the dataset is empty or ill-formed
+        return DatasetDict()
+
+def checked_upload_folder(
+        api_hf: HfApi,  # Renamed to avoid a conflict with the global `api`
+        folder_path: str,
+        repo_id: str,
+        config_name_ul: str,  # Renamed
+        split_ul: str,  # Renamed
+        submission_name_ul: str,  # Renamed
+) -> str:
+    total = 0
+    for root, _, files in os.walk(folder_path):
+        for f_ul in files:  # Renamed
+            total += os.path.getsize(os.path.join(root, f_ul))
+            if total > MAX_UPLOAD_BYTES:
+                raise ValueError(
+                    f"Upload too large: exceeds {MAX_UPLOAD_BYTES // (1024**2)} MB limit."
+                )
+    return upload_folder_to_hf(
+        api=api_hf,  # Use the renamed parameter
+        folder_path=folder_path,
+        repo_id=repo_id,
+        config_name=config_name_ul,
+        split=split_ul,
+        submission_name=submission_name_ul,
+    )
+
+def add_new_eval(
+        val_or_test: str,
+        agent_name: str | None,
+        agent_description: str,
+        agent_url: str,
+        openness: str | None,
+        degree_of_control: str | None,
+        path_to_file: tempfile._TemporaryFileWrapper | None,
+        username: str,
+        mail: str,
+        profile: gr.OAuthProfile,
+        # We need the global eval results for checks; this might need rethinking if it's purely display-driven now.
+        # For now, assume we still load it for submission checks.
+):
+    # Load the current eval results for submission checks.
+    # This is a bit redundant if the display part reloads it, but submission needs its own consistent view.
+    current_eval_results_for_submission = try_load_dataset_submission(
+        RESULTS_DATASET,
+        CONFIG_NAME,
+        download_mode="force_redownload",  # Or a less aggressive mode
+        verification_mode=VerificationMode.NO_CHECKS,
+        trust_remote_code=True,
+    )
+    if not agent_name:
+        return format_warning("Please provide an agent name.")
+
+    submission_time = datetime.now(timezone.utc)
+    if not username or username.strip() == "":
+        username = profile.username  # Default to the HF username
+
+    # User account age check
+    try:
+        user_data_resp = requests.get(f"https://huggingface.co/api/users/{profile.username}/overview")
+        user_data_resp.raise_for_status()
+        creation_date_str = user_data_resp.json()["createdAt"]
+        created_at = datetime.strptime(creation_date_str, "%Y-%m-%dT%H:%M:%S.%fZ").replace(tzinfo=timezone.utc)
+        if submission_time - created_at < timedelta(days=60):
+            return format_error("This account is not authorized to submit here (account too new).")
+    except Exception as e:
+        print(f"Error checking user account age: {e}")
+        return format_error("Could not verify account age. Please try again later.")
+
+    # Submission frequency check
+    contact_infos = try_load_dataset_submission(
+        CONTACT_DATASET, CONFIG_NAME, download_mode="force_redownload",
+        verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True
+    )
+    user_submission_dates = sorted(
+        datetime.fromisoformat(row["submit_time"])
+        for row in contact_infos.get(val_or_test, []) if row["username_auth"] == profile.username
+    )
+    if user_submission_dates and (submission_time - user_submission_dates[-1] < timedelta(days=1)):
+        return format_error("You already submitted once in the last 24h for this split; please try again later.")
+
+    # Email validation
+    _, parsed_mail = parseaddr(mail)
+    if "@" not in parsed_mail:
+        return format_warning("Please provide a valid email address.")
+
+    # Duplicate submission check
+    if val_or_test in current_eval_results_for_submission and len(current_eval_results_for_submission[val_or_test]) > 0:
+        existing_submissions = current_eval_results_for_submission[val_or_test].to_dict().get("submission", [])
+        for sub_item in existing_submissions:
+            if (sub_item.get("agent_name", "").lower() == agent_name.lower() and
+                    sub_item.get("username", "").lower() == username.lower()):
+                return format_warning("This agent name by this user has already been submitted to this split.")
+
+    if path_to_file is None:
+        return format_warning("Please attach a .tar.gz file.")
+
+    safe_username = sanitize_path_component(username)
+    safe_agent_name = sanitize_path_component(agent_name)
+    extracted_dir = os.path.join(EXTRACTED_DATA_DIR, f"{safe_username}_{safe_agent_name}")
+
+    # File extraction
+    if not LOCAL_DEBUG:
+        try:
+            if os.path.exists(extracted_dir): shutil.rmtree(extracted_dir)
+            os.makedirs(extracted_dir, exist_ok=True)
+            with tarfile.open(path_to_file.name, "r:gz") as tar:
+                members_extracted = 0
+                for member in tar.getmembers():
+                    if not member.isreg(): continue
+                    fname = os.path.basename(member.name)
+                    if not fname or fname.startswith("."): continue
+                    fobj = tar.extractfile(member)
+                    if not fobj: continue
+                    with open(os.path.join(extracted_dir, fname), "wb") as out:
+                        out.write(fobj.read())
+                    members_extracted += 1
+                if members_extracted == 0:
+                    return format_error("Submission tarball is empty or contains no valid files.")
+        except Exception as e:
+            return format_error(f"Error extracting file: {e}. Ensure it's a valid .tar.gz.")
+    else: print("mock extracted file", flush=True)
+
+
+    submission_name = f"{safe_username}_{safe_agent_name}_{submission_time.strftime('%Y-%m-%d_%H-%M-%S')}"
+
+    # 1. Upload raw (unscored) submission files
+    if not LOCAL_DEBUG:
+        try:
+            checked_upload_folder(api, extracted_dir, SUBMISSION_DATASET, CONFIG_NAME, val_or_test, submission_name)
+        except ValueError as e: return format_error(str(e))
+        except Exception as e: return format_error(f"Failed to upload raw submission: {e}")
+    else: print("mock uploaded raw submission", flush=True)
+
+    # 2. Save contact information
+    contact_info = {
+        "agent_name": agent_name, "agent_description": agent_description, "url": agent_url,
+        "username": username, "username_auth": profile.username, "mail": mail,
+        "submit_time": submission_time.isoformat(),
+    }
+    if val_or_test in contact_infos:
+        contact_infos[val_or_test] = contact_infos[val_or_test].add_item(contact_info)
+    else:
+        contact_infos[val_or_test] = Dataset.from_list([contact_info])
+
+    if not LOCAL_DEBUG:
+        try:
+            contact_infos.push_to_hub(CONTACT_DATASET, config_name=CONFIG_NAME)
+        except Exception as e: return format_warning(f"Submission recorded, but contact info failed to save: {e}")
+    else: print("mock uploaded contact info", flush=True)
+
+
+    # 3. Process and score the submission
+    eval_result_obj = None  # Define to avoid a NameError
+    try:
+        json_path = Path(extracted_dir) / AGENTEVAL_MANIFEST_NAME
+        if not json_path.exists():
+            return format_error(f"Missing manifest {AGENTEVAL_MANIFEST_NAME} in submission.")
+
+        eval_result_obj = EvalResult.model_validate_json(json_path.read_text(encoding="utf-8"))
+        if eval_result_obj.suite_config.version != CONFIG_NAME:
+            return format_error(f"Suite version mismatch: expected {CONFIG_NAME}, got {eval_result_obj.suite_config.version}.")
+        if eval_result_obj.split != val_or_test:
+            return format_error(f"Split mismatch: expected {val_or_test}, got {eval_result_obj.split}.")
+
+        # Re-compute results from logs for integrity
+        eval_result_obj.results = process_eval_logs(extracted_dir)[0]  # Assuming process_eval_logs returns a tuple/list
+        eval_result_obj.save_json(str(json_path))  # Save the re-processed manifest
+
+    except Exception as e:
+        return format_error(f"Error scoring submission: {e}. Check the manifest and log files.")
+
+    # 4. Upload scored submission files
+    logs_url_private_val, logs_url_public_val = None, None
+    scored_submission_name = f"{submission_name}_scored"
+    if not LOCAL_DEBUG:
+        try:
+            logs_url_private_val = checked_upload_folder(api, extracted_dir, SUBMISSION_DATASET, CONFIG_NAME, val_or_test, scored_submission_name)
+            if val_or_test == "validation" and not IS_INTERNAL:  # Public copy for validation
+                logs_url_public_val = checked_upload_folder(api, extracted_dir, SUBMISSION_DATASET_PUBLIC, CONFIG_NAME, val_or_test, scored_submission_name)
+        except ValueError as e: return format_error(str(e))
+        except Exception as e: return format_error(f"Failed to upload scored submission: {e}")
+    else: print("mock uploaded scored submission", flush=True)
+
+
+    # Update the EvalResult with submission details
+    eval_result_obj.submission.agent_name = agent_name
+    eval_result_obj.submission.agent_description = agent_description
+    eval_result_obj.submission.agent_url = agent_url
+    eval_result_obj.submission.openness = openness
+    eval_result_obj.submission.degree_of_control = degree_of_control
+    eval_result_obj.submission.username = username
+    eval_result_obj.submission.submit_time = submission_time
+    eval_result_obj.submission.logs_url = logs_url_private_val
+    eval_result_obj.submission.logs_url_public = logs_url_public_val
+
+    # 5. Upload summary statistics to RESULTS_DATASET (for the leaderboard)
+    if not LOCAL_DEBUG:
+        try:
+            upload_summary_to_hf(api, eval_result_obj, RESULTS_DATASET, CONFIG_NAME, val_or_test, scored_submission_name)
+        except Exception as e:
+            return format_error(f"Failed to upload summary results to the leaderboard: {e}")
+    else: print("mock uploaded results to lb", flush=True)
+
+    # Invalidate the viewer cache for the split that was updated
+    if val_or_test in CACHED_VIEWERS:
+        del CACHED_VIEWERS[val_or_test]
+    if val_or_test in CACHED_TAG_MAPS:
+        del CACHED_TAG_MAPS[val_or_test]
+
+    return format_log(
+        f"Agent '{agent_name}' submitted successfully by '{username}' to the '{val_or_test}' split. "
+        "Please refresh the leaderboard in a few moments. It may take some time for changes to propagate."
+    )
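The extraction loop in add_new_eval keeps only regular members, flattens every path to its basename, and skips dotfiles, which both simplifies the on-disk layout and neutralizes path-traversal names in a hostile archive. The practical consequence for submitters: directory structure inside the tarball is discarded, and agenteval.json must end up at the top level alongside the logs. A minimal packing sketch; the helper name and source directory are illustrative, not part of this Space:

```python
import tarfile
from pathlib import Path

def pack_submission(src_dir: str, out_path: str = "submission.tar.gz") -> None:
    """Pack result files into a flat .tar.gz for the submission form.

    add_new_eval() reduces member paths to basenames on extraction, so any
    nesting here would be thrown away; keep the archive flat from the start.
    """
    with tarfile.open(out_path, "w:gz") as tar:
        for f in sorted(Path(src_dir).iterdir()):
            if f.is_file() and not f.name.startswith("."):
                tar.add(f, arcname=f.name)  # arcname pins each member to the archive root

# pack_submission("my_agent_results")  # hypothetical dir holding agenteval.json + eval logs
```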
         
     | 
| 283 | 
         
            +
             
     | 
| 284 | 
         
            +
             
     | 
| 285 | 
         
            +
            # --- Submission Accordion ---
         
     | 
| 286 | 
         
            +
            with gr.Blocks() as demo:
         
     | 
| 287 | 
         
            +
                gr.Markdown(f"## 🚀 Submit a new agent for evaluation", elem_id="markdown-text")
         
     | 
| 288 | 
         
            +
                with gr.Row():
         
     | 
| 289 | 
         
            +
                    with gr.Column():
         
     | 
| 290 | 
         
            +
                        level_of_test_radio = gr.Radio(["validation", "test"], value="validation", label="Split")
         
     | 
| 291 | 
         
            +
                        agent_name_tb = gr.Textbox(label="Agent Name")
         
     | 
| 292 | 
         
            +
                        agent_desc_tb = gr.Textbox(label="Agent Description")
         
     | 
| 293 | 
         
            +
                        agent_url_tb = gr.Textbox(label="URL to Agent Information")
         
     | 
| 294 | 
         
            +
                        openness_radio = gr.Radio(["Open Source","Open Source Open Weights", "API Available", "Closed"], value=None, label="Openness of Agent")
         
     | 
| 295 | 
         
            +
                        degree_of_control_radio = gr.Radio(["Standard","Custom with Standard Search", "Fully Custom"], value=None, label="Agent Tooling")
         
     | 
| 296 | 
         
            +
                    with gr.Column():
         
     | 
| 297 | 
         
            +
                        username_tb = gr.Textbox(label="Organization or User Name (Defaults to HF username)")
         
     | 
| 298 | 
         
            +
                        mail_tb = gr.Textbox(label="Contact Email (Private, for submission issues)")
         
     | 
| 299 | 
         
            +
                        file_upload_comp = gr.File(
         
     | 
| 300 | 
         
            +
                            label="Submission File (.tar.gz ...)", # Shortened for brevity
         
     | 
| 301 | 
         
            +
                            file_types=[".gz", ".tar.gz"]
         
     | 
| 302 | 
         
            +
                        )
         
     | 
| 303 | 
         
            +
                with gr.Row():
         
     | 
| 304 | 
         
            +
                    gr.LoginButton()
         
     | 
| 305 | 
         
            +
                    submit_eval_button = gr.Button("Submit Evaluation")
         
     | 
| 306 | 
         
            +
                submission_result = gr.Markdown()
         
     | 
| 307 | 
         
            +
             
     | 
| 308 | 
         
            +
                submit_eval_button.click(
         
     | 
| 309 | 
         
            +
                    add_new_eval,
         
     | 
| 310 | 
         
            +
                    [
         
     | 
| 311 | 
         
            +
                        level_of_test_radio,
         
     | 
| 312 | 
         
            +
                        agent_name_tb,
         
     | 
| 313 | 
         
            +
                        agent_desc_tb,
         
     | 
| 314 | 
         
            +
                        agent_url_tb,
         
     | 
| 315 | 
         
            +
                        openness_radio,
         
     | 
| 316 | 
         
            +
                        degree_of_control_radio,
         
     | 
| 317 | 
         
            +
                        file_upload_comp,
         
     | 
| 318 | 
         
            +
                        username_tb,
         
     | 
| 319 | 
         
            +
                        mail_tb
         
     | 
| 320 | 
         
            +
                    ],
         
     | 
| 321 | 
         
            +
                    submission_result,
         
     | 
| 322 | 
         
            +
                )
         
     | 
| 323 | 
         
            +
                with gr.Accordion("📙 Citation", open=False):
         
     | 
| 324 | 
         
            +
                    gr.Textbox(value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, elem_id="citation-button-main", interactive=False)
         
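For orientation, the nine inputs wired into submit_eval_button.click map positionally onto the handler's parameters. The commit does not show add_new_eval itself (it lives earlier in submission.py), so the sketch below is an assumed signature, not the committed one; the parameter names are illustrative, and the trailing OAuth argument reflects Gradio's convention of injecting login state when a gr.LoginButton is present.

# Assumed signature for add_new_eval; a sketch, not the committed code.
def add_new_eval(
    val_or_test,           # level_of_test_radio: "validation" or "test"
    agent_name,            # agent_name_tb
    agent_description,     # agent_desc_tb
    agent_url,             # agent_url_tb
    openness,              # openness_radio (may be None)
    degree_of_control,     # degree_of_control_radio (may be None)
    upload,                # file_upload_comp: path to the uploaded .tar.gz
    username,              # username_tb (defaults to the HF username)
    mail,                  # mail_tb
    profile: gr.OAuthProfile | None = None,  # injected by Gradio at call time
):
    ...  # validate the archive, push it to the submissions repo, return a status string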
        ui_components.py
    CHANGED
    
@@ -1,13 +1,22 @@
 import gradio as gr
-from gradio.events import SelectData
 import pandas as pd
 import plotly.graph_objects as go
 import os
+import re

 from agenteval.leaderboard.view import LeaderboardViewer
 from huggingface_hub import HfApi

-from leaderboard_transformer import …
+from leaderboard_transformer import (
+    DataTransformer,
+    transform_raw_dataframe,
+    create_pretty_tag_map,
+    INFORMAL_TO_FORMAL_NAME_MAP,
+    _plot_scatter_plotly,
+    format_cost_column,
+    format_score_column,
+    get_pareto_df,
+)
 from content import (
     SCATTER_DISCLAIMER,
     format_error,
@@ -19,7 +28,7 @@ from content import (

 # --- Constants and Configuration  ---
 LOCAL_DEBUG = not (os.environ.get("system") == "spaces")
-CONFIG_NAME = "1.0.0-…
+CONFIG_NAME = "1.0.0-dev2" # This corresponds to 'config' in LeaderboardViewer
 IS_INTERNAL = os.environ.get("IS_INTERNAL", "false").lower() == "true"

 OWNER = "allenai"
@@ -41,6 +50,24 @@ MAX_UPLOAD_BYTES = 100 * 1024**2
 AGENTEVAL_MANIFEST_NAME = "agenteval.json"
 os.makedirs(EXTRACTED_DATA_DIR, exist_ok=True)

+# Global variables
+openness_emoji_map = {
+    "Closed": '🔴',
+    "API Available": '🟠',
+    "Open Source": '🟢',
+    "Open Source + Open Weights": '🔵'
+}
+control_emoji_map = {
+    "Standard": "⭐",
+    "Custom with Standard Search": "🔶",
+    "Fully Custom": "⚪️",
+}
+legend_markdown = """
+    <span>On pareto curve:📈</span>
+    <span>**Agent Openness**:</span>   <span>🔴 Closed</span>    <span>🟠 API Available</span>    <span>🟢 Open Source</span>    <span>🔵 Open Source + Open Weights</span>
+    <span>**Agent Tooling**:</span>   <span>⭐ Standard</span>    <span>🔶 Custom with Standard Search</span>    <span>⚪️ Fully Custom</span>
+    <span>**COMING SOON:** COLUMN DESCRIPTIONS</span>
+    """

 # --- Global State for Viewers (simple caching) ---
 CACHED_VIEWERS = {}
@@ -117,29 +144,48 @@ def create_leaderboard_display(
     # The function no longer loads data itself; it filters the data it receives.
     transformer = DataTransformer(full_df, tag_map)
     df_view, plots_dict = transformer.view(tag=category_name, use_plotly=True)
-    …
+    pareto_df = get_pareto_df(df_view)
+    # Get the list of agents on the frontier. We'll use this list later.
+    if not pareto_df.empty and 'id' in pareto_df.columns:
+        pareto_agent_names = pareto_df['id'].tolist()
+    else:
+        pareto_agent_names = []
+    df_view['Pareto'] = df_view.apply(
+        lambda row: '📈' if row['id'] in pareto_agent_names else '',
+        axis=1
+    )
+    # Create mapping for Openness
+    original_openness = df_view['Openness']
+    df_view['Openness'] = df_view['Openness'].map(openness_emoji_map).fillna(original_openness)
+
+    # For this column, we'll use .apply() to handle the "Other" case cleanly.
+    df_view['Agent Tooling'] = df_view['Agent Tooling'].apply(
+        lambda ctrl: control_emoji_map.get(ctrl, f"{ctrl}" if pd.notna(ctrl) else "")
+    )
+
+
+    # Format cost columns
     for col in df_view.columns:
         if "Cost" in col:
             df_view = format_cost_column(df_view, col)

-    # …
+    # Fill NaN scores with 0
     for col in df_view.columns:
         if "Score" in col:
             df_view = format_score_column(df_view, col)
     scatter_plot = plots_dict.get('scatter_plot', go.Figure())

-
+
+    all_cols = df_view.columns.tolist()
+    # Remove 'Pareto' from the list and insert it at the beginning
+    all_cols.insert(0, all_cols.pop(all_cols.index('Pareto')))
+    df_view = df_view[all_cols]
+    # Drop internally used columns that are not needed in the display
+    columns_to_drop = ['id', 'agent_for_hover']
+    df_view = df_view.drop(columns=columns_to_drop, errors='ignore')
+
     df_headers = df_view.columns.tolist()
-    df_datatypes = ["markdown" if col == "Logs" or "Cost" in col or "Score" in col else "str" for col in df_headers]
-
-    dataframe_component = gr.DataFrame(
-        headers=df_headers,
-        value=df_view,
-        datatype=df_datatypes,
-        interactive=False,
-        wrap=True,
-        column_widths=[100, 100, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 75, 75, 50, 50]
-    )
+    df_datatypes = ["markdown" if col == "Logs" or col == "Agent" or "Cost" in col or "Score" in col else "str" for col in df_headers]

     plot_component = gr.Plot(
         value=scatter_plot,
@@ -147,8 +193,20 @@ def create_leaderboard_display(
     )
     gr.HTML(SCATTER_DISCLAIMER, elem_id="scatter-disclaimer")

+    # Put table and key into an accordion
+    with gr.Accordion("See Table", open=False, elem_id="leaderboard-accordion"):
+        dataframe_component = gr.DataFrame(
+            headers=df_headers,
+            value=df_view,
+            datatype=df_datatypes,
+            interactive=False,
+            wrap=True,
+            column_widths=[30, 30, 30, 100, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 50, 30]
+        )
+        gr.Markdown(value=legend_markdown, elem_id="legend-markdown")
+
     # Return the components so they can be referenced elsewhere.
-    return dataframe_component, …
+    return plot_component, dataframe_component,

 def get_full_leaderboard_data(split: str) -> tuple[pd.DataFrame, dict]:
     """
@@ -178,8 +236,36 @@ def get_full_leaderboard_data(split: str) -> tuple[pd.DataFrame, dict]:

     # Fallback for unexpected types
     return pd.DataFrame(), {}
+# Create sub-nav bar for benchmarks
+def create_gradio_anchor_id(text: str) -> str:
+    """
+    Replicates the ID format created by gr.Markdown(header_links=True).
+    Example: "Paper Finder Validation" -> "h-paper-finder-validation"
+    """
+    text = text.lower()
+    text = re.sub(r'\s+', '-', text) # Replace spaces with hyphens
+    text = re.sub(r'[^\w-]', '', text) # Remove non-word characters
+    return f"h-{text}"
+def create_sub_navigation_bar(tag_map: dict, category_name: str):
+    """
+    Generates and renders the HTML for the anchor-link sub-navigation bar.
+    """
+    benchmark_names = tag_map.get(category_name, [])
+    if not benchmark_names:
+        return # Do nothing if there are no benchmarks
+
+    anchor_links = []
+    for name in benchmark_names:
+        # Use the helper function to create the correct ID format
+        target_id = create_gradio_anchor_id(name)
+        anchor_links.append(f"<a href='#{target_id}'>{name}</a>")
+
+    nav_bar_html = f"<div class='sub-nav-bar'>{'   '.join(anchor_links)}</div>"

+    # Use gr.HTML to render the links correctly
+    gr.HTML(nav_bar_html)
+
-# …
+# # --- Detailed Benchmark Display ---
 def create_benchmark_details_display(
         full_df: pd.DataFrame,
         tag_map: dict,
@@ -206,14 +292,14 @@ def create_benchmark_details_display(
     # 2. Loop through each benchmark and create its UI components
     for benchmark_name in benchmark_names:
         with gr.Blocks():
-            gr.Markdown(f"### {benchmark_name}")
+            gr.Markdown(f"### {benchmark_name}", header_links=True)

             # 3. Prepare the data for this specific benchmark's table and plot
             benchmark_score_col = f"{benchmark_name} Score"
             benchmark_cost_col = f"{benchmark_name} Cost"

             # Define the columns needed for the detailed table
-            table_cols = ['Agent', 'Submitter', 'Date', benchmark_score_col, benchmark_cost_col,'Logs']
+            table_cols = ['Agent','Openness','Agent Tooling', 'Submitter', 'Date', benchmark_score_col, benchmark_cost_col,'Logs','id']

             # Filter to only columns that actually exist in the full dataframe
             existing_table_cols = [col for col in table_cols if col in full_df.columns]
@@ -224,11 +310,29 @@ def create_benchmark_details_display(

             # Create a specific DataFrame for the table view
             benchmark_table_df = full_df[existing_table_cols].copy()
+            pareto_df = get_pareto_df(benchmark_table_df)
+            # Get the list of agents on the frontier. We'll use this list later.
+            if not pareto_df.empty and 'id' in pareto_df.columns:
+                pareto_agent_names = pareto_df['id'].tolist()
+            else:
+                pareto_agent_names = []
+            benchmark_table_df['Pareto'] = benchmark_table_df.apply(
+                lambda row: '📈' if row['id'] in pareto_agent_names else '',
+                axis=1
+            )
+
+            original_openness = benchmark_table_df['Openness']
+            benchmark_table_df['Openness'] = benchmark_table_df['Openness'].map(openness_emoji_map).fillna(original_openness)
+
+            # For this column, we'll use .apply() to handle the "Other" case cleanly.
+            benchmark_table_df['Agent Tooling'] = benchmark_table_df['Agent Tooling'].apply(
+                lambda ctrl: control_emoji_map.get(ctrl, f"{ctrl}" if pd.notna(ctrl) else "")
+            )
+
             # Calculated and add "Benchmark Attempted" column
             def check_benchmark_status(row):
                 has_score = pd.notna(row.get(benchmark_score_col))
                 has_cost = pd.notna(row.get(benchmark_cost_col))
-
                 if has_score and has_cost:
                     return "✅"
                 if has_score or has_cost:
@@ -246,14 +350,14 @@ def create_benchmark_details_display(
             benchmark_table_df = format_cost_column(benchmark_table_df, benchmark_cost_col)
             benchmark_table_df = format_score_column(benchmark_table_df, benchmark_score_col)
             desired_cols_in_order = [
+                'Pareto',
+                'Openness',
+                'Agent Tooling',
                 'Agent',
                 'Submitter',
                 'Attempted Benchmark',
                 benchmark_score_col,
                 benchmark_cost_col,
-                'Openness',
-                'Degree of Control',
-                'Date',
                 'Logs'
             ]
             for col in desired_cols_in_order:
@@ -261,25 +365,13 @@ def create_benchmark_details_display(
                 benchmark_table_df[col] = pd.NA # Add as an empty column
             benchmark_table_df = benchmark_table_df[desired_cols_in_order]
             # Rename columns for a cleaner table display, as requested
-            benchmark_table_df.rename(…
+            benchmark_table_df.rename({
                 benchmark_score_col: 'Score',
-                benchmark_cost_col: 'Cost'
+                benchmark_cost_col: 'Cost',
             }, inplace=True)
             # Ensure the 'Logs' column is formatted correctly
-            …
-            …
-            df_datatypes = [
-                "markdown" if col in ["Logs", "Cost", "Score"] else "str"
-                for col in table_headers
-            ]
-
-            # Create the Gradio component, now with the correct datatypes
-            gr.DataFrame(
-                value=benchmark_table_df,
-                datatype=df_datatypes,
-                interactive=False,
-                wrap=True,
-            )
+            df_headers = benchmark_table_df.columns.tolist()
+            df_datatypes = ["markdown" if col == "Logs" or col == "Agent" or "Cost" in col or "Score" in col else "str" for col in df_headers]

             # Create the scatter plot using the full data for context, but plotting benchmark metrics
             # This shows all agents on the same axis for better comparison.
@@ -291,3 +383,14 @@ def create_benchmark_details_display(
             )
             gr.Plot(value=benchmark_plot)
             gr.HTML(SCATTER_DISCLAIMER, elem_id="scatter-disclaimer")
+            # Put table and key into an accordion
+            with gr.Accordion("See Table", open=False, elem_id="leaderboard-accordion"):
+                gr.DataFrame(
+                    headers=df_headers,
+                    value=benchmark_table_df,
+                    datatype=df_datatypes,
+                    interactive=False,
+                    wrap=True,
+                )
+                gr.Markdown(value=legend_markdown, elem_id="legend-markdown")
+
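A few implementation notes on the ui_components.py changes above. The openness and tooling columns rely on the `.map(...).fillna(...)` idiom: Series.map returns NaN for any value missing from the dict, so unrecognized labels fall back to their original text rather than disappearing. In miniature:

import pandas as pd

s = pd.Series(["Open Source", "Closed", "Something Else"])
mapped = s.map({"Open Source": "🟢", "Closed": "🔴"}).fillna(s)
print(mapped.tolist())  # ['🟢', '🔴', 'Something Else']

One mismatch worth checking: the submission form in submission.py offers the label "Open Source Open Weights", while openness_emoji_map keys "Open Source + Open Weights". If both spellings are live, the form's variant will miss the map and fall through to the fillna branch as plain text.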
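The new Pareto marker depends on get_pareto_df, imported from leaderboard_transformer but not shown in this commit. From its call sites it evidently returns the subset of rows on the cost/score Pareto frontier, keyed by the internal `id` column. A minimal sketch of that idea (the function name and column arguments here are placeholders, not taken from the diff):

import pandas as pd

def pareto_frontier(df: pd.DataFrame, score_col: str, cost_col: str) -> pd.DataFrame:
    """Keep rows that no other row beats on both axes (higher score, lower or equal cost)."""
    d = df.dropna(subset=[score_col, cost_col]).sort_values(
        [cost_col, score_col], ascending=[True, False]
    )
    keep, best_score = [], float("-inf")
    for idx, row in d.iterrows():
        if row[score_col] > best_score:  # strictly better than every cheaper agent seen so far
            keep.append(idx)
            best_score = row[score_col]
    return d.loc[keep]

Separately, note that create_leaderboard_display now returns (plot_component, dataframe_component) instead of leading with the dataframe, so any caller unpacking the old order needs the matching swap.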
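create_gradio_anchor_id only does its job if it reproduces the ids that gr.Markdown(header_links=True) generates for the "### {benchmark_name}" headers; otherwise the sub-nav links scroll nowhere. A quick self-check of the committed transformation (the second input is an invented example for illustration):

import re

def create_gradio_anchor_id(text: str) -> str:
    text = text.lower()
    text = re.sub(r'\s+', '-', text)   # replace runs of whitespace with hyphens
    text = re.sub(r'[^\w-]', '', text) # strip everything but word chars and hyphens
    return f"h-{text}"

assert create_gradio_anchor_id("Paper Finder Validation") == "h-paper-finder-validation"
assert create_gradio_anchor_id("C&E (test)") == "h-ce-test"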
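One thing worth flagging in the rename hunk: pandas applies DataFrame.rename(mapper, inplace=True) to the index by default, not to the columns, so as committed the per-benchmark score and cost columns would keep their long headers. The old code appears to have routed the mapping through columns=; if column renaming is still the intent, the call would need to be:

benchmark_table_df.rename(columns={
    benchmark_score_col: 'Score',
    benchmark_cost_col: 'Cost',
}, inplace=True)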