# Streamlit page for the "Chronos Benchmark II" leaderboard.
#
# The page downloads per-model result CSVs, aggregates them with `fev.analysis`,
# and renders a leaderboard table, two pairwise-comparison charts, a per-task
# pivot table and citation blocks.

import sys
from pathlib import Path

# Extend the import path with the parent of this file's directory; this has to
# run before the `src.*` imports below.
sys.path.append(str(Path(__file__).parent.parent))

import fev
import pandas as pd
import streamlit as st
from streamlit.elements.lib.column_types import ColumnConfig

from src.strings import (
    CHRONOS_BENCHMARK_BASIC_INFO,
    CHRONOS_BENCHMARK_DETAILS,
    CITATION_CHRONOS,
    CITATION_FEV,
    CITATION_HEADER,
    PAIRWISE_BENCHMARK_DETAILS,
    get_pivot_legend,
)
from src.utils import (
    construct_bar_chart,
    construct_pairwise_chart,
    construct_pivot_table,
    format_leaderboard,
    format_metric_name,
    get_metric_description,
)

st.set_page_config(layout="wide", page_title="FEV Benchmark Leaderboard", page_icon=":material/trophy:")

# NOTE(review): TITLE is rendered with unsafe_allow_html=True, yet the literal
# holds no HTML markup -- confirm that no tags were lost from this string.
TITLE = """

Chronos Benchmark II

"""

# Forwarded as `baseline_model` to the fev analysis helpers and the pivot table.
BASELINE_MODEL = "seasonal_naive"
# Forwarded as `leakage_imputation_model` to the same helpers.
LEAKAGE_IMPUTATION_MODEL = "chronos_bolt_base"
# Leaderboard column used for the descending sort of the main table.
SORT_COL = "win_rate"
# Forwarded as `n_resamples` to the pairwise comparison.
N_RESAMPLES_FOR_CI = 1000
# Number of leading leaderboard rows whose models enter the pairwise charts.
TOP_K_MODELS_TO_PLOT = 15
# Options offered by the sidebar metric selector.
AVAILABLE_METRICS = ["WQL", "MASE"]

# Every result file lives under the same directory; only the file stem differs.
_RESULTS_ROOT = "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results"
_RESULT_STEMS = (
    "auto_arima",
    "auto_ets",
    "auto_theta",
    "chronos_base",
    "chronos_large",
    "chronos_mini",
    "chronos_small",
    "chronos_tiny",
    "chronos_bolt_base",
    "chronos_bolt_mini",
    "chronos_bolt_small",
    "chronos_bolt_tiny",
    "moirai_base",
    "moirai_large",
    "moirai_small",
    "seasonal_naive",
    "timesfm",
    "timesfm-2.0",
    "ttm-r2",
    "tirex",
)
SUMMARY_URLS = [f"{_RESULTS_ROOT}/{stem}.csv" for stem in _RESULT_STEMS]


@st.cache_data()
def load_summaries():
    """Read every CSV listed in ``SUMMARY_URLS`` and stack them into one frame.

    Returns:
        A single DataFrame holding the rows of all files, re-indexed from zero.
    """
    frames = [pd.read_csv(url) for url in SUMMARY_URLS]
    return pd.concat(frames, ignore_index=True)


@st.cache_data()
def get_leaderboard(metric_name: str) -> pd.DataFrame:
    """Build the leaderboard table for one metric, with values as percentages.

    Args:
        metric_name: Passed to ``fev.analysis.leaderboard`` as ``metric_column``.

    Returns:
        The leaderboard cast to float64 with its index reset. ``skill_score``
        and ``win_rate`` are multiplied by 100, and ``num_failures`` is
        expressed as a percentage of the distinct ``task_name`` values.
    """
    all_results = load_summaries()
    board = fev.analysis.leaderboard(
        summaries=all_results,
        metric_column=metric_name,
        missing_strategy="impute",
        baseline_model=BASELINE_MODEL,
        leakage_imputation_model=LEAKAGE_IMPUTATION_MODEL,
    )
    board = board.astype("float64").reset_index()
    for fraction_col in ("skill_score", "win_rate"):
        board[fraction_col] = board[fraction_col] * 100
    num_tasks = all_results["task_name"].nunique()
    board["num_failures"] = board["num_failures"] / num_tasks * 100
    return board


@st.cache_data()
def get_pairwise(metric_name: str, included_models: list[str]) -> pd.DataFrame:
    """Run the pairwise model comparison for one metric.

    Args:
        metric_name: Passed to ``fev.analysis.pairwise_comparison`` as
            ``metric_column``.
        included_models: Models to compare. ``BASELINE_MODEL`` is appended to a
            copy when it is missing; the caller's list is left untouched.

    Returns:
        The comparison result rounded to 3 decimals, with its index reset.
    """
    if BASELINE_MODEL in included_models:
        models_to_compare = included_models
    else:
        models_to_compare = [*included_models, BASELINE_MODEL]
    comparison = fev.analysis.pairwise_comparison(
        load_summaries(),
        included_models=models_to_compare,
        metric_column=metric_name,
        baseline_model=BASELINE_MODEL,
        missing_strategy="impute",
        n_resamples=N_RESAMPLES_FOR_CI,
        leakage_imputation_model=LEAKAGE_IMPUTATION_MODEL,
    )
    return comparison.round(3).reset_index()


# --- Sidebar: metric selection -------------------------------------------------
with st.sidebar:
    selected_metric = st.selectbox("Evaluation Metric", options=AVAILABLE_METRICS, format_func=format_metric_name)
    st.caption(get_metric_description(selected_metric))

# --- Main area: narrow margins on both sides, content in the middle column -----
cols = st.columns(spec=[0.025, 0.95, 0.025])

with cols[1] as main_container:
    st.markdown(TITLE, unsafe_allow_html=True)

    # Sorted leaderboard first; its leading rows decide which models are compared.
    metric_df = get_leaderboard(selected_metric).sort_values(by=SORT_COL, ascending=False)
    top_k_models = metric_df.head(TOP_K_MODELS_TO_PLOT)["model_name"].tolist()
    pairwise_df = get_pairwise(selected_metric, included_models=top_k_models)

    # Leaderboard table
    st.markdown("## :material/trophy: Leaderboard", unsafe_allow_html=True)
    st.markdown(CHRONOS_BENCHMARK_BASIC_INFO, unsafe_allow_html=True)
    df_styled = format_leaderboard(metric_df)
    leaderboard_column_config = {
        "model_name": ColumnConfig(label="Model Name", alignment="left"),
        "win_rate": st.column_config.NumberColumn(label="Avg. win rate (%)", format="%.1f"),
        "skill_score": st.column_config.NumberColumn(label="Skill score (%)", format="%.1f"),
        "median_inference_time_s": st.column_config.NumberColumn(label="Median runtime (s)", format="%.1f"),
        "training_corpus_overlap": st.column_config.NumberColumn(label="Leakage (%)", format="%d"),
        "num_failures": st.column_config.NumberColumn(label="Failed tasks (%)", format="%.0f"),
        "zero_shot": ColumnConfig(label="Zero-shot", alignment="center"),
        "org": ColumnConfig(label="Organization", alignment="left"),
        "link": st.column_config.LinkColumn(label="Link", display_text=":material/open_in_new:"),
    }
    st.dataframe(
        df_styled,
        use_container_width=True,
        hide_index=True,
        column_config=leaderboard_column_config,
    )
    with st.expander("See details"):
        st.markdown(CHRONOS_BENCHMARK_DETAILS, unsafe_allow_html=True)

    # Pairwise charts: win rate on the left, skill score on the right,
    # separated by an empty spacer column.
    st.markdown("## :material/bar_chart: Pairwise comparison", unsafe_allow_html=True)
    chart_col_1, _, chart_col_2 = st.columns(spec=[0.45, 0.1, 0.45])
    for chart_column, pairwise_stat in ((chart_col_1, "win_rate"), (chart_col_2, "skill_score")):
        with chart_column:
            st.altair_chart(
                construct_pairwise_chart(pairwise_df, col=pairwise_stat, metric_name=selected_metric),
                use_container_width=True,
            )
    with st.expander("See details"):
        st.markdown(PAIRWISE_BENCHMARK_DETAILS, unsafe_allow_html=True)

    # Per-task pivot table, collapsed by default behind an expander.
    st.markdown("## :material/table_chart: Results for individual tasks", unsafe_allow_html=True)
    with st.expander("Show detailed results"):
        st.markdown(get_pivot_legend(BASELINE_MODEL, LEAKAGE_IMPUTATION_MODEL), unsafe_allow_html=True)
        pivot_table = construct_pivot_table(
            summaries=load_summaries(),
            metric_name=selected_metric,
            baseline_model=BASELINE_MODEL,
            leakage_imputation_model=LEAKAGE_IMPUTATION_MODEL,
        )
        st.dataframe(pivot_table)

    # Citation section
    st.divider()
    st.markdown("### :material/format_quote: Citation", unsafe_allow_html=True)
    for citation_block in (CITATION_HEADER, CITATION_FEV, CITATION_CHRONOS):
        st.markdown(citation_block)