test: add unit tests for utils
Files changed:
- src/loaders.py +3 -1
- src/models.py +8 -8
- src/utils.py +1 -1
- tests/src/test_utils.py +25 -1
src/loaders.py CHANGED

@@ -1,4 +1,6 @@
 import os.path
+from pathlib import Path
+from typing import Union
 from typing import Dict, List
 
 import pandas as pd
@@ -11,7 +13,7 @@ from src.utils import get_default_cols, get_leaderboard_df, reset_rank
 pd.options.mode.copy_on_write = True
 
 
-def load_raw_eval_results(results_path: str) -> List[FullEvalResult]:
+def load_raw_eval_results(results_path: Union[Path, str]) -> List[FullEvalResult]:
     """
     Load the evaluation results from a json file
     """
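The widened signature means load_raw_eval_results now accepts either a str or a pathlib.Path, which is what the new test relies on when it builds the fixture path with the / operator. A minimal sketch of the two call styles (the results directory below is illustrative, not part of this commit):

from pathlib import Path

from src.loaders import load_raw_eval_results

# Both calls are accepted after this change; the Path form avoids manual string joins.
results_from_str = load_raw_eval_results("tests/toydata/eval_results/AIR-Bench_24.05")
results_from_path = load_raw_eval_results(Path("tests") / "toydata" / "eval_results" / "AIR-Bench_24.05")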
src/models.py CHANGED

@@ -141,14 +141,14 @@ class FullEvalResult:
 class LeaderboardDataStore:
     version: str
     slug: str
-    raw_data:
-    qa_raw_df:
-    doc_raw_df:
-    qa_fmt_df:
-    doc_fmt_df:
-    reranking_models:
-    qa_types:
-    doc_types:
+    raw_data: list = None
+    qa_raw_df: pd.DataFrame = pd.DataFrame()
+    doc_raw_df: pd.DataFrame = pd.DataFrame()
+    qa_fmt_df: pd.DataFrame = pd.DataFrame()
+    doc_fmt_df: pd.DataFrame = pd.DataFrame()
+    reranking_models: list = None
+    qa_types: list = None
+    doc_types: list = None
 
 
 # Define an enum class with the name `TaskType`. There are two types of tasks, `qa` and `long-doc`.
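With defaults on every optional field, a LeaderboardDataStore can now be constructed from just the version, slug, and raw data, which is how the new test builds it. A rough sketch, assuming LeaderboardDataStore is a dataclass (its decorator sits above this hunk) and that an empty raw_data list is acceptable for illustration:

from src.models import LeaderboardDataStore, get_safe_name

version = "AIR-Bench_24.05"
# Only the first three values are supplied; the DataFrame fields fall back to
# empty DataFrames and the list fields to None until the loaders populate them.
ds = LeaderboardDataStore(version, get_safe_name(version), raw_data=[])
print(ds.qa_raw_df.empty)   # True
print(ds.reranking_models)  # None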
src/utils.py CHANGED

@@ -354,7 +354,7 @@ def get_leaderboard_df(datastore, task: TaskType, metric: str) -> pd.DataFrame:
             continue
         benchmark_cols.append(t.value.col_name)
 
-
+    # filter out the columns that are not in the data
    df[COL_NAME_AVG] = (
         df[list(benchmark_cols)]
         .apply(calculate_mean, axis=1)
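The new comment documents the averaging step: only the benchmark columns collected above are selected, and the average column is filled by a row-wise aggregation. A toy pandas sketch of that pattern, using pd.Series.mean as a stand-in for the repository's calculate_mean helper (whose exact NaN handling is not shown in this diff) and invented benchmark column names:

import pandas as pd

# Toy frame: two benchmark score columns plus one unrelated column.
df = pd.DataFrame({
    "benchmark_a": [0.61, 0.55],
    "benchmark_b": [0.47, None],
    "model": ["model_x", "model_y"],
})
benchmark_cols = ["benchmark_a", "benchmark_b"]

# Same shape as the snippet above: select only the benchmark columns,
# then aggregate each row (pandas skips NaN by default).
df["Average ⬆️"] = df[benchmark_cols].apply(lambda row: row.mean(), axis=1)
print(df["Average ⬆️"])  # 0.54 and 0.55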
tests/src/test_utils.py CHANGED

@@ -1,10 +1,12 @@
 import pytest
 import pandas as pd
+from pathlib import Path
 
-from src.utils import remove_html, calculate_mean, filter_models, filter_queries, get_default_cols, select_columns, get_selected_cols, _update_df_elem
+from src.utils import remove_html, calculate_mean, filter_models, filter_queries, get_default_cols, select_columns, get_selected_cols, _update_df_elem, get_leaderboard_df
 from src.models import model_hyperlink, TaskType
 from src.columns import COL_NAME_RERANKING_MODEL, COL_NAME_RETRIEVAL_MODEL
 
+cur_fp = Path(__file__)
 
 NUM_QA_BENCHMARKS_24_05 = 53
 NUM_DOC_BENCHMARKS_24_05 = 11
@@ -193,3 +195,25 @@ def test__update_df_elem(toy_df, reset_rank, show_anony):
     assert df["Average ⬆️"].equals(toy_df["Average ⬆️"])
 
 
+@pytest.mark.parametrize(
+    "version, task_type",
+    [
+        ("AIR-Bench_24.04", TaskType.qa),
+        ("AIR-Bench_24.04", TaskType.long_doc),
+        ("AIR-Bench_24.05", TaskType.qa),
+        ("AIR-Bench_24.05", TaskType.long_doc)
+    ]
+)
+def test_get_leaderboard_df(version, task_type):
+    from src.loaders import load_raw_eval_results
+    from src.models import LeaderboardDataStore, get_safe_name
+    raw_data = load_raw_eval_results(
+        cur_fp.parents[1] / f"toydata/eval_results/{version}"
+    )
+    ds = LeaderboardDataStore(version, get_safe_name(version), raw_data=raw_data)
+    df = get_leaderboard_df(
+        ds,
+        task_type,
+        "ndcg_at_10"
+    )
+    assert df.shape[0] == 1