test: add unit tests for utils
Files changed:
- src/loaders.py +3 -1
- src/models.py +8 -8
- src/utils.py +1 -1
- tests/src/test_utils.py +25 -1
src/loaders.py CHANGED

@@ -1,4 +1,6 @@
 import os.path
+from pathlib import Path
+from typing import Union
 from typing import Dict, List
 
 import pandas as pd
@@ -11,7 +13,7 @@ from src.utils import get_default_cols, get_leaderboard_df, reset_rank
 pd.options.mode.copy_on_write = True
 
 
-def load_raw_eval_results(results_path: str) -> List[FullEvalResult]:
+def load_raw_eval_results(results_path: Union[Path, str]) -> List[FullEvalResult]:
     """
     Load the evaluation results from a json file
     """
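The widened signature means load_raw_eval_results now accepts either a str or a pathlib.Path, which is what the new test relies on when it builds the fixture path with the / operator. A minimal sketch of the two call styles (the results directory below is illustrative, not part of this commit):

from pathlib import Path

from src.loaders import load_raw_eval_results

# Both calls are accepted after this change; the Path form avoids manual string joins.
results_from_str = load_raw_eval_results("tests/toydata/eval_results/AIR-Bench_24.05")
results_from_path = load_raw_eval_results(Path("tests") / "toydata" / "eval_results" / "AIR-Bench_24.05")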
src/models.py CHANGED

@@ -141,14 +141,14 @@ class FullEvalResult:
 class LeaderboardDataStore:
     version: str
     slug: str
-    raw_data:
-    qa_raw_df:
-    doc_raw_df:
-    qa_fmt_df:
-    doc_fmt_df:
-    reranking_models:
-    qa_types:
-    doc_types:
+    raw_data: list = None
+    qa_raw_df: pd.DataFrame = pd.DataFrame()
+    doc_raw_df: pd.DataFrame = pd.DataFrame()
+    qa_fmt_df: pd.DataFrame = pd.DataFrame()
+    doc_fmt_df: pd.DataFrame = pd.DataFrame()
+    reranking_models: list = None
+    qa_types: list = None
+    doc_types: list = None
 
 
 # Define an enum class with the name `TaskType`. There are two types of tasks, `qa` and `long-doc`.
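With defaults on every optional field, a LeaderboardDataStore can now be constructed from just the version, slug, and raw data, which is how the new test builds it. A rough sketch, assuming LeaderboardDataStore is a dataclass (its decorator sits above this hunk) and that an empty raw_data list is acceptable for illustration:

from src.models import LeaderboardDataStore, get_safe_name

version = "AIR-Bench_24.05"
# Only the first three values are supplied; the DataFrame fields fall back to
# empty DataFrames and the list fields to None until the loaders populate them.
ds = LeaderboardDataStore(version, get_safe_name(version), raw_data=[])
print(ds.qa_raw_df.empty)   # True
print(ds.reranking_models)  # None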
src/utils.py CHANGED

@@ -354,7 +354,7 @@ def get_leaderboard_df(datastore, task: TaskType, metric: str) -> pd.DataFrame:
             continue
         benchmark_cols.append(t.value.col_name)
 
-
+    # filter out the columns that are not in the data
    df[COL_NAME_AVG] = (
         df[list(benchmark_cols)]
         .apply(calculate_mean, axis=1)
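The new comment documents the averaging step: only the benchmark columns collected above are selected, and the average column is filled by a row-wise aggregation. A toy pandas sketch of that pattern, using pd.Series.mean as a stand-in for the repository's calculate_mean helper (whose exact NaN handling is not shown in this diff) and invented benchmark column names:

import pandas as pd

# Toy frame: two benchmark score columns plus one unrelated column.
df = pd.DataFrame({
    "benchmark_a": [0.61, 0.55],
    "benchmark_b": [0.47, None],
    "model": ["model_x", "model_y"],
})
benchmark_cols = ["benchmark_a", "benchmark_b"]

# Same shape as the snippet above: select only the benchmark columns,
# then aggregate each row (pandas skips NaN by default).
df["Average ⬆️"] = df[benchmark_cols].apply(lambda row: row.mean(), axis=1)
print(df["Average ⬆️"])  # 0.54 and 0.55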
tests/src/test_utils.py CHANGED

@@ -1,10 +1,12 @@
 import pytest
 import pandas as pd
+from pathlib import Path
 
-from src.utils import remove_html, calculate_mean, filter_models, filter_queries, get_default_cols, select_columns, get_selected_cols, _update_df_elem
+from src.utils import remove_html, calculate_mean, filter_models, filter_queries, get_default_cols, select_columns, get_selected_cols, _update_df_elem, get_leaderboard_df
 from src.models import model_hyperlink, TaskType
 from src.columns import COL_NAME_RERANKING_MODEL, COL_NAME_RETRIEVAL_MODEL
 
+cur_fp = Path(__file__)
 
 NUM_QA_BENCHMARKS_24_05 = 53
 NUM_DOC_BENCHMARKS_24_05 = 11
@@ -193,3 +195,25 @@ def test__update_df_elem(toy_df, reset_rank, show_anony):
     assert df["Average ⬆️"].equals(toy_df["Average ⬆️"])
 
 
+@pytest.mark.parametrize(
+    "version, task_type",
+    [
+        ("AIR-Bench_24.04", TaskType.qa),
+        ("AIR-Bench_24.04", TaskType.long_doc),
+        ("AIR-Bench_24.05", TaskType.qa),
+        ("AIR-Bench_24.05", TaskType.long_doc)
+    ]
+)
+def test_get_leaderboard_df(version, task_type):
+    from src.loaders import load_raw_eval_results
+    from src.models import LeaderboardDataStore, get_safe_name
+    raw_data = load_raw_eval_results(
+        cur_fp.parents[1] / f"toydata/eval_results/{version}"
+    )
+    ds = LeaderboardDataStore(version, get_safe_name(version), raw_data=raw_data)
+    df = get_leaderboard_df(
+        ds,
+        task_type,
+        "ndcg_at_10"
+    )
+    assert df.shape[0] == 1