refactor: refactor the data loading part
app.py CHANGED
@@ -82,8 +82,8 @@ from typing import Optional
 @dataclass
 class LeaderboardDataStore:
     raw_data: Optional[list]
-    raw_qa_df: Optional[pd.DataFrame]
-    original_df_long_doc: Optional[pd.DataFrame]
+    raw_df_qa: Optional[pd.DataFrame]
+    raw_df_long_doc: Optional[pd.DataFrame]
     leaderboard_df_qa: Optional[pd.DataFrame]
     leaderboard_df_long_doc: Optional[pd.DataFrame]
     reranking_models: Optional[list]
@@ -91,41 +91,52 @@ class LeaderboardDataStore:
     types_long_doc: Optional[list]
 
 
+def load_leaderboard_data(file_path) -> LeaderboardDataStore:
+    lb_data_store = LeaderboardDataStore(None, None, None, None, None, None, None, None)
+    lb_data_store.raw_data = get_raw_eval_results(file_path)
+    print(f'raw data: {len(lb_data_store.raw_data)}')
+
+    lb_data_store.raw_df_qa = get_leaderboard_df(
+        lb_data_store.raw_data, task='qa', metric=DEFAULT_METRIC_QA)
+    lb_data_store.leaderboard_df_qa = lb_data_store.raw_df_qa.copy()
+    # leaderboard_df_qa = leaderboard_df_qa[has_no_nan_values(df, _benchmark_cols)]
+    print(f'QA data loaded: {lb_data_store.raw_df_qa.shape}')
+    shown_columns_qa, types_qa = get_default_cols(
+        'qa', lb_data_store.leaderboard_df_qa.columns, add_fix_cols=True)
+    lb_data_store.types_qa = types_qa
+    lb_data_store.leaderboard_df_qa = \
+        lb_data_store.leaderboard_df_qa[~lb_data_store.leaderboard_df_qa[COL_NAME_IS_ANONYMOUS]][shown_columns_qa]
+    lb_data_store.leaderboard_df_qa.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
+
+    lb_data_store.raw_df_long_doc = get_leaderboard_df(
+        lb_data_store.raw_data, task='long-doc', metric=DEFAULT_METRIC_LONG_DOC)
+    print(f'Long-Doc data loaded: {len(lb_data_store.raw_df_long_doc)}')
+    lb_data_store.leaderboard_df_long_doc = lb_data_store.raw_df_long_doc.copy()
+    shown_columns_long_doc, types_long_doc = get_default_cols(
+        'long-doc', lb_data_store.leaderboard_df_long_doc.columns, add_fix_cols=True)
+    lb_data_store.types_long_doc = types_long_doc
+    lb_data_store.leaderboard_df_long_doc = \
+        lb_data_store.leaderboard_df_long_doc[~lb_data_store.leaderboard_df_long_doc[COL_NAME_IS_ANONYMOUS]][
+            shown_columns_long_doc]
+    lb_data_store.leaderboard_df_long_doc.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
+
+    lb_data_store.reranking_models = sorted(
+        list(frozenset([eval_result.reranking_model for eval_result in lb_data_store.raw_data])))
+    return lb_data_store
+
+
 def load_eval_results(file_path: str):
     output = {}
     versions = ("AIR-Bench_24.04",)
     for version in versions:
-        output[version] = LeaderboardDataStore(None, None, None, None, None, None, None, None)
-        output[version].raw_data = get_raw_eval_results(f"{file_path}/{version}")
-        output[version].raw_qa_df = get_leaderboard_df(
-            output[version].raw_data, task='qa', metric=DEFAULT_METRIC_QA)
-        output[version].original_df_long_doc = get_leaderboard_df(
-            output[version].raw_data, task='long-doc', metric=DEFAULT_METRIC_LONG_DOC)
-        print(f'raw data: {len(output[version].raw_data)}')
-        print(f'QA data loaded: {output[version].raw_qa_df.shape}')
-        print(f'Long-Doc data loaded: {len(output[version].original_df_long_doc)}')
-
-        output[version].leaderboard_df_qa = output[version].raw_qa_df.copy()
-        # leaderboard_df_qa = leaderboard_df_qa[has_no_nan_values(df, _benchmark_cols)]
-        shown_columns_qa, types_qa = get_default_cols(
-            'qa', output[version].leaderboard_df_qa.columns, add_fix_cols=True)
-        output[version].types_qa = types_qa
-        output[version].leaderboard_df_qa = output[version].leaderboard_df_qa[~output[version].leaderboard_df_qa[COL_NAME_IS_ANONYMOUS]][shown_columns_qa]
-        output[version].leaderboard_df_qa.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
-
-        output[version].leaderboard_df_long_doc = output[version].original_df_long_doc.copy()
-        shown_columns_long_doc, types_long_doc = get_default_cols(
-            'long-doc', output[version].leaderboard_df_long_doc.columns, add_fix_cols=True)
-        output[version].types_long_doc = types_long_doc
-        output[version].leaderboard_df_long_doc = output[version].leaderboard_df_long_doc[~output[version].leaderboard_df_long_doc[COL_NAME_IS_ANONYMOUS]][shown_columns_long_doc]
-        output[version].leaderboard_df_long_doc.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
-
-        output[version].reranking_models = sorted(list(frozenset([eval_result.reranking_model for eval_result in output[version].raw_data])))
+        fn = f"{file_path}/{version}"
+        output[version] = load_leaderboard_data(fn)
     return output
 
 
 data = load_eval_results(EVAL_RESULTS_PATH)
 
+
 def update_metric_qa(
     metric: str,
     domains: list,
@@ -133,9 +144,11 @@ def update_metric_qa(
     reranking_model: list,
     query: str,
     show_anonymous: bool,
-    show_revision_and_timestamp,
+    show_revision_and_timestamp: bool,
+    selected_version: str,
 ):
-    return update_metric(data["AIR-Bench_24.04"].raw_data, 'qa', metric, domains, langs, reranking_model, query, show_anonymous, show_revision_and_timestamp)
+    return update_metric(data[selected_version].raw_data, 'qa', metric, domains, langs, reranking_model, query, show_anonymous, show_revision_and_timestamp)
+
 
 def update_metric_long_doc(
     metric: str,
@@ -188,7 +201,7 @@ with demo:
         selected_rerankings = get_reranking_dropdown(data["AIR-Bench_24.04"].reranking_models)
         leaderboard_table = get_leaderboard_table(data["AIR-Bench_24.04"].leaderboard_df_qa, data["AIR-Bench_24.04"].types_qa)
         # Dummy leaderboard for handling the case when the user uses backspace key
-        hidden_leaderboard_table_for_search = get_leaderboard_table(data["AIR-Bench_24.04"].raw_qa_df, data["AIR-Bench_24.04"].types_qa, visible=False)
+        hidden_leaderboard_table_for_search = get_leaderboard_table(data["AIR-Bench_24.04"].raw_df_qa, data["AIR-Bench_24.04"].types_qa, visible=False)
 
         set_listeners(
             "qa",
@@ -213,10 +226,13 @@ with demo:
                 search_bar,
                 show_anonymous,
                 show_revision_and_timestamp,
+                selected_version,
             ],
             leaderboard_table,
             queue=True
         )
+
+        """
         with gr.TabItem("Retrieval Only", id=11):
             with gr.Row():
                 with gr.Column(scale=1):
@@ -227,7 +243,7 @@ with demo:
             lb_df_retriever = reset_rank(lb_df_retriever)
             lb_table_retriever = get_leaderboard_table(lb_df_retriever, data["AIR-Bench_24.04"].types_qa)
             # Dummy leaderboard for handling the case when the user uses backspace key
-            hidden_lb_df_retriever = data["AIR-Bench_24.04"].raw_qa_df[data["AIR-Bench_24.04"].raw_qa_df[COL_NAME_RERANKING_MODEL] == "NoReranker"]
+            hidden_lb_df_retriever = data["AIR-Bench_24.04"].raw_df_qa[data["AIR-Bench_24.04"].raw_df_qa[COL_NAME_RERANKING_MODEL] == "NoReranker"]
             hidden_lb_df_retriever = reset_rank(hidden_lb_df_retriever)
             hidden_lb_table_retriever = get_leaderboard_table(hidden_lb_df_retriever, data["AIR-Bench_24.04"].types_qa, visible=False)
 
@@ -254,6 +270,7 @@ with demo:
                 search_bar_retriever,
                 show_anonymous,
                 show_revision_and_timestamp,
+                selected_version,
             ],
             lb_table_retriever,
             queue=True
@@ -268,7 +285,7 @@ with demo:
             with gr.Column(scale=1):
                 search_bar_reranker = gr.Textbox(show_label=False, visible=False)
             lb_table_reranker = get_leaderboard_table(lb_df_reranker, data["AIR-Bench_24.04"].types_qa)
-            hidden_lb_df_reranker = data["AIR-Bench_24.04"].raw_qa_df[data["AIR-Bench_24.04"].raw_qa_df[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
+            hidden_lb_df_reranker = data["AIR-Bench_24.04"].raw_df_qa[data["AIR-Bench_24.04"].raw_df_qa[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
             hidden_lb_df_reranker = reset_rank(hidden_lb_df_reranker)
             hidden_lb_table_reranker = get_leaderboard_table(
                 hidden_lb_df_reranker, data["AIR-Bench_24.04"].types_qa, visible=False
@@ -296,6 +313,7 @@ with demo:
                 search_bar_reranker,
                 show_anonymous,
                 show_revision_and_timestamp,
+                selected_version,
             ],
             lb_table_reranker,
             queue=True
@@ -334,7 +352,7 @@ with demo:
 
             # Dummy leaderboard for handling the case when the user uses backspace key
             hidden_lb_table_for_search = get_leaderboard_table(
-                data["AIR-Bench_24.04"].original_df_long_doc, data["AIR-Bench_24.04"].types_long_doc, visible=False
+                data["AIR-Bench_24.04"].raw_df_long_doc, data["AIR-Bench_24.04"].types_long_doc, visible=False
             )
 
             set_listeners(
@@ -374,8 +392,8 @@ with demo:
                 data["AIR-Bench_24.04"].leaderboard_df_long_doc[COL_NAME_RERANKING_MODEL] == "NoReranker"
             ]
             lb_df_retriever_long_doc = reset_rank(lb_df_retriever_long_doc)
-            hidden_lb_db_retriever_long_doc = data["AIR-Bench_24.04"].original_df_long_doc[
-                data["AIR-Bench_24.04"].original_df_long_doc[COL_NAME_RERANKING_MODEL] == "NoReranker"
+            hidden_lb_db_retriever_long_doc = data["AIR-Bench_24.04"].raw_df_long_doc[
+                data["AIR-Bench_24.04"].raw_df_long_doc[COL_NAME_RERANKING_MODEL] == "NoReranker"
             ]
             hidden_lb_db_retriever_long_doc = reset_rank(hidden_lb_db_retriever_long_doc)
             lb_table_retriever_long_doc = get_leaderboard_table(
@@ -422,7 +440,7 @@ with demo:
             with gr.Column(scale=1):
                 search_bar_reranker_ldoc = gr.Textbox(show_label=False, visible=False)
             lb_table_reranker_ldoc = get_leaderboard_table(lb_df_reranker_ldoc, data["AIR-Bench_24.04"].types_long_doc)
-            hidden_lb_df_reranker_ldoc = data["AIR-Bench_24.04"].original_df_long_doc[data["AIR-Bench_24.04"].original_df_long_doc[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
+            hidden_lb_df_reranker_ldoc = data["AIR-Bench_24.04"].raw_df_long_doc[data["AIR-Bench_24.04"].raw_df_long_doc[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
             hidden_lb_df_reranker_ldoc = reset_rank(hidden_lb_df_reranker_ldoc)
             hidden_lb_table_reranker_ldoc = get_leaderboard_table(
                 hidden_lb_df_reranker_ldoc, data["AIR-Bench_24.04"].types_long_doc, visible=False
@@ -521,6 +539,7 @@ with demo:
 
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
            gr.Markdown(BENCHMARKS_TEXT, elem_classes="markdown-text")
+        """
 
 if __name__ == "__main__":
     scheduler = BackgroundScheduler()
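
Note on the refactor: the per-version loading logic moves out of the loop in load_eval_results into load_leaderboard_data, so load_eval_results reduces to mapping each version string to its own LeaderboardDataStore. The following is a minimal, self-contained sketch of that same pattern; the file layout, file names, and column names here are hypothetical stand-ins, not the repo's actual ones.

# Sketch of the refactored loading pattern (illustrative only).
from dataclasses import dataclass
from typing import Optional

import pandas as pd


@dataclass
class VersionStore:  # stands in for LeaderboardDataStore
    raw_df: Optional[pd.DataFrame] = None
    leaderboard_df: Optional[pd.DataFrame] = None


def load_version(path: str) -> VersionStore:
    # One store per benchmark version, mirroring load_leaderboard_data.
    store = VersionStore()
    store.raw_df = pd.read_csv(f"{path}/results.csv")  # hypothetical file name
    store.leaderboard_df = store.raw_df.copy()
    if "is_anonymous" in store.leaderboard_df.columns:  # hypothetical column
        # Hide anonymous submissions from the public view, as app.py does
        # with COL_NAME_IS_ANONYMOUS.
        store.leaderboard_df = store.leaderboard_df[~store.leaderboard_df["is_anonymous"]]
    return store


def load_all(root: str, versions=("AIR-Bench_24.04",)) -> dict:
    # Keyed by version string, like load_eval_results.
    return {v: load_version(f"{root}/{v}") for v in versions}

Keeping the loader version-agnostic means adding a benchmark release is a one-line change to the versions tuple.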
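The selected_version input threaded into each listener suggests the UI gains a version selector whose current value picks the LeaderboardDataStore at callback time (data[selected_version] in update_metric_qa). A hedged Gradio sketch of that wiring follows; the component names and the placeholder callback are illustrative, not the app's own.

import gradio as gr

data = {"AIR-Bench_24.04": object()}  # placeholder for the per-version stores


def describe_version(selected_version: str) -> str:
    # The real callbacks index into data[selected_version], as in update_metric_qa.
    return f"Showing results for {selected_version}"


with gr.Blocks() as demo:
    version_dd = gr.Dropdown(
        choices=list(data.keys()), value="AIR-Bench_24.04", label="Benchmark version"
    )
    out = gr.Markdown()
    # Re-render whenever the user switches versions.
    version_dd.change(describe_version, inputs=[version_dd], outputs=out)

if __name__ == "__main__":
    demo.launch()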