feat: add metric selector
- app.py +35 -5
- src/populate.py +8 -9
- tests/src/test_populate.py +3 -1
- utils.py +24 -1
app.py CHANGED

@@ -17,10 +17,12 @@ from src.display.utils import (
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_leaderboard_df
-from utils import update_table
+from utils import update_table, update_metric
 from src.benchmarks import DOMAIN_COLS_QA, LANG_COLS_QA, metric_list
 
 
+from functools import partial
+
 def restart_space():
     API.restart_space(repo_id=REPO_ID)
 
@@ -41,11 +43,21 @@ def restart_space():
 # except Exception:
 #     restart_space()
 
-
-
+from src.leaderboard.read_evals import get_raw_eval_results
+raw_data_qa = get_raw_eval_results(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH)
+original_df_qa = get_leaderboard_df(raw_data_qa, COLS, QA_BENCHMARK_COLS, task='qa', metric='ndcg_at_3')
 print(f'data loaded: {len(raw_data_qa)}, {original_df_qa.shape}')
 leaderboard_df = original_df_qa.copy()
 
+
+def update_metric_qa(
+    metric: str,
+    domains: list,
+    langs: list,
+    reranking_model: list,
+    query: str,
+):
+    return update_metric(raw_data_qa, metric, domains, langs, reranking_model, query)
 # (
 #     finished_eval_queue_df,
 #     running_eval_queue_df,
@@ -99,7 +111,7 @@ with demo:
                 with gr.Column(min_width=320):
                     selected_metric = gr.Dropdown(
                         choices=metric_list,
-                        value=metric_list[
+                        value=metric_list[1],
                         label="Select the metric",
                         interactive=True,
                         elem_id="metric-select",
@@ -117,11 +129,13 @@ with demo:
 
            # Dummy leaderboard for handling the case when the user uses backspace key
            hidden_leaderboard_table_for_search = gr.components.Dataframe(
-                value=
+                value=leaderboard_df,
                # headers=COLS,
                # datatype=TYPES,
                visible=False,
            )
+
+            # Set search_bar listener
            search_bar.submit(
                update_table,
                [
@@ -133,6 +147,8 @@ with demo:
                ],
                leaderboard_table,
            )
+
+            # Set column-wise listener
            for selector in [
                selected_domains, selected_langs, selected_rerankings
            ]:
@@ -149,6 +165,20 @@ with demo:
                queue=True,
            )
 
+            # set metric listener
+            selected_metric.change(
+                update_metric_qa,
+                [
+                    selected_metric,
+                    selected_domains,
+                    selected_langs,
+                    selected_rerankings,
+                    search_bar,
+                ],
+                leaderboard_table,
+                queue=True
+            )
+
        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
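Taken together, the app.py changes follow the usual Gradio pattern for this kind of control: the new metric Dropdown's change event feeds the selected metric (together with the current domain, language, reranker, and search-bar values) into a callback that returns a freshly built DataFrame for the leaderboard table. Below is a minimal, self-contained sketch of that pattern; the toy SCORES data and the build_table helper are invented for illustration and are not part of this repo.

```python
import gradio as gr
import pandas as pd

# Toy scores keyed by metric name; stands in for the real evaluation results.
SCORES = {
    "ndcg_at_1": {"model-a": 0.61, "model-b": 0.55},
    "ndcg_at_3": {"model-a": 0.68, "model-b": 0.59},
}

def build_table(metric: str) -> pd.DataFrame:
    # Rebuild the leaderboard for the chosen metric, best score first.
    rows = [{"Model": name, metric: score} for name, score in SCORES[metric].items()]
    return pd.DataFrame(rows).sort_values(by=metric, ascending=False)

with gr.Blocks() as demo:
    metric_dd = gr.Dropdown(
        choices=list(SCORES),
        value="ndcg_at_3",
        label="Select the metric",
        interactive=True,
    )
    table = gr.Dataframe(value=build_table("ndcg_at_3"))
    # When the dropdown value changes, recompute the table and replace it.
    metric_dd.change(build_table, inputs=[metric_dd], outputs=table)

if __name__ == "__main__":
    demo.launch()
```

In the real app the extra filter components are passed as additional inputs, so changing the metric re-applies the active domain/language/reranker filters and the current search query instead of resetting them.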
src/populate.py CHANGED

@@ -5,31 +5,30 @@ import pandas as pd
 
 from src.display.formatting import has_no_nan_values, make_clickable_model
 from src.display.utils import AutoEvalColumnQA, EvalQueueColumn
-from src.leaderboard.read_evals import get_raw_eval_results, EvalResult
-from typing import Tuple
+from src.leaderboard.read_evals import get_raw_eval_results, EvalResult, FullEvalResult
+from typing import Tuple, List
 
 
-def get_leaderboard_df(
+def get_leaderboard_df(raw_data: List[FullEvalResult], cols: list, benchmark_cols: list, task: str, metric: str) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
-    raw_data = get_raw_eval_results(results_path, requests_path)
-    print(f"raw_data loaded: {len(raw_data)}")
     all_data_json = []
     for v in raw_data:
         all_data_json += v.to_dict(task=task, metric=metric)
-
-    print(f'records loaded: {len(all_data_json)}')
     df = pd.DataFrame.from_records(all_data_json)
     print(f'dataframe created: {df.shape}')
+
+    # calculate the average score for selected benchmarks
     _benchmark_cols = frozenset(benchmark_cols).intersection(frozenset(df.columns.to_list()))
-    df[AutoEvalColumnQA.average.name] = df[list(_benchmark_cols)].mean(axis=1)
+    df[AutoEvalColumnQA.average.name] = df[list(_benchmark_cols)].mean(axis=1).round(decimals=2)
     df = df.sort_values(by=[AutoEvalColumnQA.average.name], ascending=False)
     df.reset_index(inplace=True)
+
     _cols = frozenset(cols).intersection(frozenset(df.columns.to_list()))
     df = df[_cols].round(decimals=2)
 
     # filter out if any of the benchmarks have not been produced
     df = df[has_no_nan_values(df, _benchmark_cols)]
-    return
+    return df
 
 
 def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
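After this change, get_leaderboard_df is a pure records-to-table step: flatten each result into per-benchmark records, average whichever benchmark columns are actually present, sort by that average, and drop rows missing any selected benchmark. A rough pandas-only sketch of that flow follows, using invented column names; it does not use the repo's AutoEvalColumnQA or formatting helpers.

```python
import pandas as pd

def build_leaderboard(records: list[dict], benchmark_cols: list[str]) -> pd.DataFrame:
    df = pd.DataFrame.from_records(records)

    # Average only the benchmark columns that actually appear in the data.
    present = [c for c in benchmark_cols if c in df.columns]
    df["Average"] = df[present].mean(axis=1).round(2)

    # Best models first; drop any row missing a selected benchmark score.
    df = df.sort_values(by="Average", ascending=False)
    df = df.dropna(subset=present)
    return df.round(2)

records = [
    {"Model": "emb-a", "wiki_en": 0.71, "wiki_zh": 0.65},
    {"Model": "emb-b", "wiki_en": 0.64, "wiki_zh": None},  # missing wiki_zh -> filtered out
]
print(build_leaderboard(records, ["wiki_en", "wiki_zh"]))
```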
tests/src/test_populate.py CHANGED

@@ -1,4 +1,5 @@
 from src.populate import get_leaderboard_df
+from src.leaderboard.read_evals import get_raw_eval_results
 from pathlib import Path
 
 cur_fp = Path(__file__)
@@ -9,7 +10,8 @@ def test_get_leaderboard_df():
     results_path = cur_fp.parents[1] / "toydata" / "test_results"
     cols = ['Retrieval Model', 'Reranking Model', 'Average ⬆️', 'wiki_en', 'wiki_zh',]
     benchmark_cols = ['wiki_en', 'wiki_zh',]
-    raw_data
+    raw_data = get_raw_eval_results(results_path, requests_path)
+    df = get_leaderboard_df(raw_data, cols, benchmark_cols, 'qa', 'ndcg_at_1')
     assert df.shape[0] == 2
     # the results contain only one embedding model
     for i in range(2):
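Because get_leaderboard_df now takes already-loaded results rather than filesystem paths, the aggregation logic could also be exercised without toy result files: a stub only has to honour the to_dict(task=..., metric=...) contract. The sketch below is hypothetical (FakeResult and its fields are invented, and the final aggregation is inlined with plain pandas so the snippet stays self-contained).

```python
import pandas as pd

class FakeResult:
    """Minimal stand-in for one model's evaluation result."""

    def __init__(self, model: str, scores: dict):
        self.model = model
        self.scores = scores

    def to_dict(self, task: str, metric: str) -> list[dict]:
        # One flat record per result, mirroring the to_dict contract used above.
        return [{"Retrieval Model": self.model, **self.scores}]

def test_one_row_per_model():
    raw_data = [
        FakeResult("emb-a", {"wiki_en": 0.7, "wiki_zh": 0.6}),
        FakeResult("emb-b", {"wiki_en": 0.5, "wiki_zh": 0.4}),
    ]
    records = []
    for result in raw_data:
        records += result.to_dict(task="qa", metric="ndcg_at_1")
    df = pd.DataFrame.from_records(records)
    assert df.shape[0] == 2
```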
utils.py CHANGED

@@ -2,6 +2,10 @@ import pandas as pd
 
 from src.display.utils import AutoEvalColumnQA, COLS
 from src.benchmarks import BENCHMARK_COLS_QA, BenchmarksQA
+from src.leaderboard.read_evals import FullEvalResult
+from typing import List
+from src.populate import get_leaderboard_df
+from src.display.utils import COLS, QA_BENCHMARK_COLS
 
 
 def filter_models(df: pd.DataFrame, reranking_query: list) -> pd.DataFrame:
@@ -68,4 +72,23 @@ def update_table(
     filtered_df = filter_models(hidden_df, reranking_query)
     filtered_df = filter_queries(query, filtered_df)
     df = select_columns(filtered_df, domains, langs)
-    return df
+    return df
+
+
+def update_metric(
+    raw_data: List[FullEvalResult],
+    metric: str,
+    domains: list,
+    langs: list,
+    reranking_model: list,
+    query: str,
+) -> pd.DataFrame:
+    leaderboard_df = get_leaderboard_df(raw_data, COLS, QA_BENCHMARK_COLS, task='qa', metric=metric)
+    hidden_df = leaderboard_df
+    return update_table(
+        hidden_df,
+        domains,
+        langs,
+        reranking_model,
+        query
+    )
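update_metric needs the raw evaluation results to rebuild the table for the chosen metric, but Gradio event handlers only receive component values, so app.py binds raw_data_qa with a small wrapper (update_metric_qa). functools.partial, which the commit also imports, is the other common way to pin that first argument. A minimal sketch of both forms, with a stand-in callback:

```python
from functools import partial

def update_metric(raw_data, metric: str, query: str) -> str:
    # Stand-in for the real rebuild-and-filter logic.
    return f"{len(raw_data)} results, metric={metric}, query={query!r}"

raw_data_qa = ["result-1", "result-2"]  # stands in for the loaded eval results

# Option 1: an explicit wrapper, as app.py does with update_metric_qa.
def update_metric_qa(metric: str, query: str) -> str:
    return update_metric(raw_data_qa, metric, query)

# Option 2: functools.partial pins the first positional argument.
update_metric_qa_partial = partial(update_metric, raw_data_qa)

assert update_metric_qa("ndcg_at_3", "") == update_metric_qa_partial("ndcg_at_3", "")
```

Either form gives Gradio a callable whose parameters line up with the listener's input components while keeping the heavyweight raw data out of the event payload.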