test: add unit tests for utils
Files changed:
- src/loaders.py (+3 -1)
- src/utils.py (+28 -21)
- tests/src/test_utils.py (+57 -19)
src/loaders.py
CHANGED

@@ -6,7 +6,7 @@ import pandas as pd
 from src.columns import COL_NAME_IS_ANONYMOUS, COL_NAME_REVISION, COL_NAME_TIMESTAMP
 from src.envs import BENCHMARK_VERSION_LIST, DEFAULT_METRIC_LONG_DOC, DEFAULT_METRIC_QA
 from src.models import FullEvalResult, LeaderboardDataStore, TaskType, get_safe_name
-from src.utils import get_default_cols, get_leaderboard_df
+from src.utils import get_default_cols, get_leaderboard_df, reset_rank

 pd.options.mode.copy_on_write = True

@@ -60,6 +60,7 @@ def load_leaderboard_datastore(file_path, version) -> LeaderboardDataStore:
     ds.qa_fmt_df = ds.qa_raw_df.copy()
     qa_cols, ds.qa_types = get_default_cols(TaskType.qa, ds.slug, add_fix_cols=True)
     ds.qa_fmt_df = ds.qa_fmt_df[~ds.qa_fmt_df[COL_NAME_IS_ANONYMOUS]][qa_cols]
+    ds.qa_fmt_df = reset_rank(ds.qa_fmt_df)
     ds.qa_fmt_df.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)

     ds.doc_raw_df = get_leaderboard_df(ds, TaskType.long_doc, DEFAULT_METRIC_LONG_DOC)
@@ -67,6 +68,7 @@ def load_leaderboard_datastore(file_path, version) -> LeaderboardDataStore:
     ds.doc_fmt_df = ds.doc_raw_df.copy()
     doc_cols, ds.doc_types = get_default_cols(TaskType.long_doc, ds.slug, add_fix_cols=True)
     ds.doc_fmt_df = ds.doc_fmt_df[~ds.doc_fmt_df[COL_NAME_IS_ANONYMOUS]][doc_cols]
+    ds.doc_fmt_df = reset_rank(ds.doc_fmt_df)
     ds.doc_fmt_df.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)

     ds.reranking_models = sorted(list(frozenset([eval_result.reranking_model for eval_result in ds.raw_data])))
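Note: both formatted frames now pass through reset_rank right after anonymous submissions and extra columns are filtered out, so the displayed rank reflects the filtered rows rather than the original ordering. The helper lives in src/utils.py and is not shown in this commit; below is a minimal sketch of what such a helper could look like, assuming it simply re-sorts by the "Average ⬆️" column and re-numbers "Rank 🏆" (the column names come from the toy_df fixture in tests/src/test_utils.py further down; the exact behavior is an assumption, not the shipped implementation).

    # Hypothetical sketch only -- not the implementation shipped in src/utils.py.
    import pandas as pd

    def reset_rank(df: pd.DataFrame) -> pd.DataFrame:
        # re-sort by the average score and re-number ranks after rows were filtered out
        out = df.sort_values(by="Average ⬆️", ascending=False).reset_index(drop=True)
        out["Rank 🏆"] = range(1, len(out) + 1)
        return out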
src/utils.py
CHANGED

@@ -334,42 +334,49 @@ def get_leaderboard_df(datastore, task: TaskType, metric: str) -> pd.DataFrame:
     """
     Creates a dataframe from all the individual experiment results
     """
+    # load the selected metrics into a DataFrame from the raw json
+    all_data_json = []
+    for v in datastore.raw_data:
+        all_data_json += v.to_dict(task=task.value, metric=metric)
+    df = pd.DataFrame.from_records(all_data_json)
+
+    # calculate the average scores for selected task
     if task == TaskType.qa:
         benchmarks = QABenchmarks[datastore.slug]
     elif task == TaskType.long_doc:
         benchmarks = LongDocBenchmarks[datastore.slug]
     else:
         raise NotImplementedError
-    df = pd.DataFrame.from_records(all_data_json)
-    _benchmark_cols = frozenset(benchmark_cols).intersection(frozenset(df.columns.to_list()))
+    valid_cols = frozenset(df.columns.to_list())
+    benchmark_cols = []
+    for t in list(benchmarks.value):
+        if t.value.col_name not in valid_cols:
+            continue
+        benchmark_cols.append(t.value.col_name)

-    df[COL_NAME_AVG] =
+    ## filter out the columns that are not in the data
+    df[COL_NAME_AVG] = (
+        df[list(benchmark_cols)]
+        .apply(calculate_mean, axis=1)
+        .round(decimals=2)
+    )
     df.sort_values(by=[COL_NAME_AVG], ascending=False, inplace=True)
     df.reset_index(inplace=True, drop=True)

+    # filter out columns that are not in the data
+    display_cols = [COL_NAME_IS_ANONYMOUS, COL_NAME_AVG]
+    default_cols, _ = get_default_col_names_and_types(benchmarks)
+    for col in default_cols:
+        if col in valid_cols:
+            display_cols.append(col)
+    df = df[display_cols].round(decimals=2)

+    # rank the scores
+    df = reset_rank(df)

     # shorten the revision
     df[COL_NAME_REVISION] = df[COL_NAME_REVISION].str[:6]

-    # # replace "0" with "-" for average score
-    # df[COL_NAME_AVG] = df[COL_NAME_AVG].replace(0, "-")
     return df
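The rewritten average step applies calculate_mean row-wise over only those benchmark columns that actually appear in the loaded data. calculate_mean itself is imported from src/utils.py but its body is not part of this diff; the self-contained snippet below demonstrates the same .apply(..., axis=1) pattern with a stand-in mean that skips missing scores (the NaN handling is an assumption, not taken from the source).

    # Stand-in demo of the row-wise averaging pattern; the real calculate_mean may differ.
    import numpy as np
    import pandas as pd

    def calculate_mean(row: pd.Series) -> float:
        # assumption: ignore benchmarks a model was not evaluated on
        return float(np.nanmean(row.to_numpy(dtype=float)))

    df = pd.DataFrame({"wiki_en": [0.8, 0.7], "news_zh": [0.4, np.nan]})
    df["Average ⬆️"] = df[["wiki_en", "news_zh"]].apply(calculate_mean, axis=1).round(decimals=2)
    print(df)  # the second row averages over wiki_en only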
tests/src/test_utils.py
CHANGED

@@ -1,7 +1,7 @@
 import pytest
 import pandas as pd

-from src.utils import remove_html, calculate_mean, filter_models, filter_queries, get_default_cols, select_columns, get_selected_cols
+from src.utils import remove_html, calculate_mean, filter_models, filter_queries, get_default_cols, select_columns, get_selected_cols, _update_df_elem
 from src.models import model_hyperlink, TaskType
 from src.columns import COL_NAME_RERANKING_MODEL, COL_NAME_RETRIEVAL_MODEL

@@ -29,13 +29,14 @@ def toy_df():
             "NoReranker"
         ],
         "Rank 🏆": [1, 2, 3, 4],
-        "Revision": ["", "", "", ""],
+        "Revision": ["123", "234", "345", "456"],
         "Submission Date": ["", "", "", ""],
         "Average ⬆️": [0.6, 0.4, 0.3, 0.2],
         "wiki_en": [0.8, 0.7, 0.2, 0.1],
         "wiki_zh": [0.4, 0.1, 0.4, 0.3],
         "news_en": [0.8, 0.7, 0.2, 0.1],
-        "news_zh": [0.4, 0.1, 0.
+        "news_zh": [0.4, 0.1, 0.2, 0.3],
+        "Anonymous Submission": [False, False, False, True],
     }
 )

@@ -94,18 +95,22 @@ def test_filter_queries(query, expected):


 @pytest.mark.parametrize(
-    "task_type, slug, expected",
+    "task_type, slug, add_fix_cols, expected",
     [
-        (TaskType.qa, "air_bench_2404", NUM_QA_BENCHMARKS_24_04),
-        (TaskType.long_doc, "air_bench_2404", NUM_DOC_BENCHMARKS_24_04),
-        (TaskType.qa, "air_bench_2405", NUM_QA_BENCHMARKS_24_05),
-        (TaskType.long_doc, "air_bench_2405", NUM_DOC_BENCHMARKS_24_05),
+        (TaskType.qa, "air_bench_2404", True, NUM_QA_BENCHMARKS_24_04),
+        (TaskType.long_doc, "air_bench_2404", True, NUM_DOC_BENCHMARKS_24_04),
+        (TaskType.qa, "air_bench_2405", False, NUM_QA_BENCHMARKS_24_05),
+        (TaskType.long_doc, "air_bench_2405", False, NUM_DOC_BENCHMARKS_24_05),
     ]
 )
-def test_get_default_cols(task_type, slug, expected):
+def test_get_default_cols(task_type, slug, add_fix_cols, expected):
     attr_cols = ['Rank 🏆', 'Retrieval Method', 'Reranking Model', 'Revision', 'Submission Date', 'Average ⬆️']
     cols, types = get_default_cols(task_type, slug)
+    cols_set = frozenset(cols)
+    attrs_set = frozenset(attr_cols)
+    if add_fix_cols:
+        assert attrs_set.issubset(cols_set)
+    benchmark_cols = list(cols_set.difference(attrs_set))
     assert len(benchmark_cols) == expected


@@ -133,8 +138,8 @@ def test_get_selected_cols(task_type, domains, languages, expected):
     cols = get_selected_cols(task_type, slug, domains, languages)
     assert sorted(cols) == sorted(expected)

-
-def test_select_columns(toy_df):
+@pytest.mark.parametrize("reset_rank", [False])
+def test_select_columns(toy_df, reset_rank):
     expected = [
         'Rank 🏆',
         'Retrieval Method',
@@ -145,13 +150,46 @@ def test_select_columns(toy_df):
         'news_zh']
     df_result = select_columns(
         toy_df,
-        [
-            "news",
-        ],
-        [
-            "zh",
-        ],
+        ["news"],
+        ["zh"],
         version_slug="air_bench_2404",
+        reset_ranking=reset_rank
     )
     assert len(df_result.columns) == len(expected)
+    if reset_rank:
+        assert df_result["Average ⬆️"].equals(df_result["news_zh"])
+    else:
+        assert df_result["Average ⬆️"].equals(toy_df["Average ⬆️"])
+
+
+@pytest.mark.parametrize(
+    "reset_rank, show_anony",
+    [
+        (False, True),
+        (True, True),
+        (True, False),
+    ]
+)
+def test__update_df_elem(toy_df, reset_rank, show_anony):
+    df = _update_df_elem(
+        TaskType.qa,
+        "AIR-Bench_24.04",
+        toy_df,
+        ["news"],
+        ["zh"],
+        [],
+        "",
+        show_anony,
+        reset_rank
+    )
+    if show_anony:
+        assert df.shape[0] == 4
+    else:
+        assert df.shape[0] == 3
+    if show_anony:
+        if reset_rank:
+            assert df["Average ⬆️"].equals(df["news_zh"])
+        else:
+            assert df["Average ⬆️"].equals(toy_df["Average ⬆️"])
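For the toy fixture, selecting only the "news" domain and the "zh" language leaves news_zh as the only benchmark column, so once the average is recomputed over the selected columns it equals that column exactly; the equals assertions above check exactly that, and the show_anony cases check that the anonymous fourth row is dropped when anonymous submissions are hidden. The new parametrized cases can be run in isolation from the repository root (assuming the project root is on PYTHONPATH so the src imports resolve) with, for example, pytest tests/src/test_utils.py -k "select_columns or _update_df_elem" -v.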