test: add unit tests for utils
Files changed:
- src/loaders.py (+3 -1)
- src/utils.py (+28 -21)
- tests/src/test_utils.py (+57 -19)
src/loaders.py
CHANGED

@@ -6,7 +6,7 @@ import pandas as pd
 from src.columns import COL_NAME_IS_ANONYMOUS, COL_NAME_REVISION, COL_NAME_TIMESTAMP
 from src.envs import BENCHMARK_VERSION_LIST, DEFAULT_METRIC_LONG_DOC, DEFAULT_METRIC_QA
 from src.models import FullEvalResult, LeaderboardDataStore, TaskType, get_safe_name
-from src.utils import get_default_cols, get_leaderboard_df
+from src.utils import get_default_cols, get_leaderboard_df, reset_rank

 pd.options.mode.copy_on_write = True

@@ -60,6 +60,7 @@ def load_leaderboard_datastore(file_path, version) -> LeaderboardDataStore:
     ds.qa_fmt_df = ds.qa_raw_df.copy()
     qa_cols, ds.qa_types = get_default_cols(TaskType.qa, ds.slug, add_fix_cols=True)
     ds.qa_fmt_df = ds.qa_fmt_df[~ds.qa_fmt_df[COL_NAME_IS_ANONYMOUS]][qa_cols]
+    ds.qa_fmt_df = reset_rank(ds.qa_fmt_df)
     ds.qa_fmt_df.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)

     ds.doc_raw_df = get_leaderboard_df(ds, TaskType.long_doc, DEFAULT_METRIC_LONG_DOC)
@@ -67,6 +68,7 @@ def load_leaderboard_datastore(file_path, version) -> LeaderboardDataStore:
     ds.doc_fmt_df = ds.doc_raw_df.copy()
     doc_cols, ds.doc_types = get_default_cols(TaskType.long_doc, ds.slug, add_fix_cols=True)
     ds.doc_fmt_df = ds.doc_fmt_df[~ds.doc_fmt_df[COL_NAME_IS_ANONYMOUS]][doc_cols]
+    ds.doc_fmt_df = reset_rank(ds.doc_fmt_df)
     ds.doc_fmt_df.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)

     ds.reranking_models = sorted(list(frozenset([eval_result.reranking_model for eval_result in ds.raw_data])))
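Note: both formatted frames now pass through reset_rank right after anonymous submissions and extra columns are filtered out, so the displayed rank reflects the filtered rows rather than the original ordering. The helper lives in src/utils.py and is not shown in this commit; below is a minimal sketch of what such a helper could look like, assuming it simply re-sorts by the "Average ⬆️" column and re-numbers "Rank 🏆" (the column names come from the toy_df fixture in tests/src/test_utils.py further down; the exact behavior is an assumption, not the shipped implementation).

    # Hypothetical sketch only -- not the implementation shipped in src/utils.py.
    import pandas as pd

    def reset_rank(df: pd.DataFrame) -> pd.DataFrame:
        # re-sort by the average score and re-number ranks after rows were filtered out
        out = df.sort_values(by="Average ⬆️", ascending=False).reset_index(drop=True)
        out["Rank 🏆"] = range(1, len(out) + 1)
        return out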
src/utils.py
CHANGED

@@ -334,42 +334,49 @@ def get_leaderboard_df(datastore, task: TaskType, metric: str) -> pd.DataFrame:
     """
     Creates a dataframe from all the individual experiment results
     """
+    # load the selected metrics into a DataFrame from the raw json
+    all_data_json = []
+    for v in datastore.raw_data:
+        all_data_json += v.to_dict(task=task.value, metric=metric)
+    df = pd.DataFrame.from_records(all_data_json)
+
+    # calculate the average scores for selected task
     if task == TaskType.qa:
         benchmarks = QABenchmarks[datastore.slug]
     elif task == TaskType.long_doc:
         benchmarks = LongDocBenchmarks[datastore.slug]
     else:
         raise NotImplementedError
-    df = pd.DataFrame.from_records(all_data_json)
-    _benchmark_cols = frozenset(benchmark_cols).intersection(frozenset(df.columns.to_list()))
+    valid_cols = frozenset(df.columns.to_list())
+    benchmark_cols = []
+    for t in list(benchmarks.value):
+        if t.value.col_name not in valid_cols:
+            continue
+        benchmark_cols.append(t.value.col_name)

-    df[COL_NAME_AVG] =
+    ## filter out the columns that are not in the data
+    df[COL_NAME_AVG] = (
+        df[list(benchmark_cols)]
+        .apply(calculate_mean, axis=1)
+        .round(decimals=2)
+    )
     df.sort_values(by=[COL_NAME_AVG], ascending=False, inplace=True)
     df.reset_index(inplace=True, drop=True)

+    # filter out columns that are not in the data
+    display_cols = [COL_NAME_IS_ANONYMOUS, COL_NAME_AVG]
+    default_cols, _ = get_default_col_names_and_types(benchmarks)
+    for col in default_cols:
+        if col in valid_cols:
+            display_cols.append(col)
+    df = df[display_cols].round(decimals=2)

+    # rank the scores
+    df = reset_rank(df)

     # shorten the revision
     df[COL_NAME_REVISION] = df[COL_NAME_REVISION].str[:6]

-    # # replace "0" with "-" for average score
-    # df[COL_NAME_AVG] = df[COL_NAME_AVG].replace(0, "-")
     return df
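The rewritten average step applies calculate_mean row-wise over only those benchmark columns that actually appear in the loaded data. calculate_mean itself is imported from src/utils.py but its body is not part of this diff; the self-contained snippet below demonstrates the same .apply(..., axis=1) pattern with a stand-in mean that skips missing scores (the NaN handling is an assumption, not taken from the source).

    # Stand-in demo of the row-wise averaging pattern; the real calculate_mean may differ.
    import numpy as np
    import pandas as pd

    def calculate_mean(row: pd.Series) -> float:
        # assumption: ignore benchmarks a model was not evaluated on
        return float(np.nanmean(row.to_numpy(dtype=float)))

    df = pd.DataFrame({"wiki_en": [0.8, 0.7], "news_zh": [0.4, np.nan]})
    df["Average ⬆️"] = df[["wiki_en", "news_zh"]].apply(calculate_mean, axis=1).round(decimals=2)
    print(df)  # the second row averages over wiki_en only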
tests/src/test_utils.py
CHANGED

@@ -1,7 +1,7 @@
 import pytest
 import pandas as pd

-from src.utils import remove_html, calculate_mean, filter_models, filter_queries, get_default_cols, select_columns, get_selected_cols
+from src.utils import remove_html, calculate_mean, filter_models, filter_queries, get_default_cols, select_columns, get_selected_cols, _update_df_elem
 from src.models import model_hyperlink, TaskType
 from src.columns import COL_NAME_RERANKING_MODEL, COL_NAME_RETRIEVAL_MODEL

@@ -29,13 +29,14 @@ def toy_df():
             "NoReranker"
         ],
         "Rank 🏆": [1, 2, 3, 4],
-        "Revision": ["", "", "", ""],
+        "Revision": ["123", "234", "345", "456"],
         "Submission Date": ["", "", "", ""],
         "Average ⬆️": [0.6, 0.4, 0.3, 0.2],
         "wiki_en": [0.8, 0.7, 0.2, 0.1],
         "wiki_zh": [0.4, 0.1, 0.4, 0.3],
         "news_en": [0.8, 0.7, 0.2, 0.1],
-        "news_zh": [0.4, 0.1, 0.
+        "news_zh": [0.4, 0.1, 0.2, 0.3],
+        "Anonymous Submission": [False, False, False, True],
     }
 )

@@ -94,18 +95,22 @@ def test_filter_queries(query, expected):


 @pytest.mark.parametrize(
-    "task_type, slug, expected",
+    "task_type, slug, add_fix_cols, expected",
     [
-        (TaskType.qa, "air_bench_2404", NUM_QA_BENCHMARKS_24_04),
-        (TaskType.long_doc, "air_bench_2404", NUM_DOC_BENCHMARKS_24_04),
-        (TaskType.qa, "air_bench_2405", NUM_QA_BENCHMARKS_24_05),
-        (TaskType.long_doc, "air_bench_2405", NUM_DOC_BENCHMARKS_24_05),
+        (TaskType.qa, "air_bench_2404", True, NUM_QA_BENCHMARKS_24_04),
+        (TaskType.long_doc, "air_bench_2404", True, NUM_DOC_BENCHMARKS_24_04),
+        (TaskType.qa, "air_bench_2405", False, NUM_QA_BENCHMARKS_24_05),
+        (TaskType.long_doc, "air_bench_2405", False, NUM_DOC_BENCHMARKS_24_05),
     ]
 )
-def test_get_default_cols(task_type, slug, expected):
+def test_get_default_cols(task_type, slug, add_fix_cols, expected):
     attr_cols = ['Rank 🏆', 'Retrieval Method', 'Reranking Model', 'Revision', 'Submission Date', 'Average ⬆️']
     cols, types = get_default_cols(task_type, slug)
+    cols_set = frozenset(cols)
+    attrs_set = frozenset(attr_cols)
+    if add_fix_cols:
+        assert attrs_set.issubset(cols_set)
+    benchmark_cols = list(cols_set.difference(attrs_set))
     assert len(benchmark_cols) == expected


@@ -133,8 +138,8 @@ def test_get_selected_cols(task_type, domains, languages, expected):
     cols = get_selected_cols(task_type, slug, domains, languages)
     assert sorted(cols) == sorted(expected)

-
-def test_select_columns(toy_df):
+@pytest.mark.parametrize("reset_rank", [False])
+def test_select_columns(toy_df, reset_rank):
     expected = [
         'Rank 🏆',
         'Retrieval Method',
@@ -145,13 +150,46 @@ def test_select_columns(toy_df):
         'news_zh']
     df_result = select_columns(
         toy_df,
-        [
-            "news",
-        ],
-        [
-            "zh",
-        ],
+        ["news"],
+        ["zh"],
         version_slug="air_bench_2404",
+        reset_ranking=reset_rank
     )
     assert len(df_result.columns) == len(expected)
+    if reset_rank:
+        assert df_result["Average ⬆️"].equals(df_result["news_zh"])
+    else:
+        assert df_result["Average ⬆️"].equals(toy_df["Average ⬆️"])
+
+
+@pytest.mark.parametrize(
+    "reset_rank, show_anony",
+    [
+        (False, True),
+        (True, True),
+        (True, False),
+    ]
+)
+def test__update_df_elem(toy_df, reset_rank, show_anony):
+    df = _update_df_elem(
+        TaskType.qa,
+        "AIR-Bench_24.04",
+        toy_df,
+        ["news"],
+        ["zh"],
+        [],
+        "",
+        show_anony,
+        reset_rank
+    )
+    if show_anony:
+        assert df.shape[0] == 4
+    else:
+        assert df.shape[0] == 3
+    if show_anony:
+        if reset_rank:
+            assert df["Average ⬆️"].equals(df["news_zh"])
+        else:
+            assert df["Average ⬆️"].equals(toy_df["Average ⬆️"])
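For the toy fixture, selecting only the "news" domain and the "zh" language leaves news_zh as the only benchmark column, so once the average is recomputed over the selected columns it equals that column exactly; the equals assertions above check exactly that, and the show_anony cases check that the anonymous fourth row is dropped when anonymous submissions are hidden. The new parametrized cases can be run in isolation from the repository root (assuming the project root is on PYTHONPATH so the src imports resolve) with, for example, pytest tests/src/test_utils.py -k "select_columns or _update_df_elem" -v.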