Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
test: add unit tests for models
Browse files
src/models.py
CHANGED
|
@@ -85,16 +85,17 @@ class FullEvalResult:
|
|
| 85 |
is_anonymous=config.get("is_anonymous", False),
|
| 86 |
)
|
| 87 |
result_list.append(eval_result)
|
|
|
|
| 88 |
return cls(
|
| 89 |
-
eval_name=f"{
|
| 90 |
-
retrieval_model=
|
| 91 |
-
reranking_model=
|
| 92 |
retrieval_model_link=retrieval_model_link,
|
| 93 |
reranking_model_link=reranking_model_link,
|
| 94 |
results=result_list,
|
| 95 |
-
timestamp=
|
| 96 |
-
revision=
|
| 97 |
-
is_anonymous=
|
| 98 |
)
|
| 99 |
|
| 100 |
def to_dict(self, task="qa", metric="ndcg_at_3") -> List:
|
|
|
|
| 85 |
is_anonymous=config.get("is_anonymous", False),
|
| 86 |
)
|
| 87 |
result_list.append(eval_result)
|
| 88 |
+
eval_result = result_list[0]
|
| 89 |
return cls(
|
| 90 |
+
eval_name=f"{eval_result.retrieval_model}_{eval_result.reranking_model}",
|
| 91 |
+
retrieval_model=eval_result.retrieval_model,
|
| 92 |
+
reranking_model=eval_result.reranking_model,
|
| 93 |
retrieval_model_link=retrieval_model_link,
|
| 94 |
reranking_model_link=reranking_model_link,
|
| 95 |
results=result_list,
|
| 96 |
+
timestamp=eval_result.timestamp,
|
| 97 |
+
revision=eval_result.revision,
|
| 98 |
+
is_anonymous=eval_result.is_anonymous,
|
| 99 |
)
|
| 100 |
|
| 101 |
def to_dict(self, task="qa", metric="ndcg_at_3") -> List:
|
tests/src/test_models.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
|
| 4 |
+
from src.models import EvalResult, FullEvalResult
|
| 5 |
+
|
| 6 |
+
cur_fp = Path(__file__)
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def test_eval_result():
|
| 10 |
+
eval_result = EvalResult(
|
| 11 |
+
eval_name="eval_name",
|
| 12 |
+
retrieval_model="bge-m3",
|
| 13 |
+
reranking_model="NoReranking",
|
| 14 |
+
results=[
|
| 15 |
+
{
|
| 16 |
+
"domain": "law",
|
| 17 |
+
"lang": "en",
|
| 18 |
+
"dataset": "lex_files_500K-600K",
|
| 19 |
+
"value": 0.45723
|
| 20 |
+
}
|
| 21 |
+
],
|
| 22 |
+
task="qa",
|
| 23 |
+
metric="ndcg_at_3",
|
| 24 |
+
timestamp="2024-05-14T03:09:08Z",
|
| 25 |
+
revision="1e243f14bd295ccdea7a118fe847399d",
|
| 26 |
+
is_anonymous=True,
|
| 27 |
+
)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
@pytest.mark.parametrize(
|
| 31 |
+
'file_path',
|
| 32 |
+
[
|
| 33 |
+
"AIR-Bench_24.04/bge-m3/jina-reranker-v2-base-multilingual/results.json",
|
| 34 |
+
"AIR-Bench_24.05/bge-m3/NoReranker/results.json"
|
| 35 |
+
])
|
| 36 |
+
def test_full_eval_result_init_from_json_file(file_path):
|
| 37 |
+
json_fp = cur_fp.parents[1] / "toydata/eval_results/" / file_path
|
| 38 |
+
full_eval_result = FullEvalResult.init_from_json_file(json_fp)
|
| 39 |
+
assert json_fp.parents[0].stem == full_eval_result.reranking_model
|
| 40 |
+
assert json_fp.parents[1].stem == full_eval_result.retrieval_model
|
| 41 |
+
assert len(full_eval_result.results) == 70
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def test_full_eval_result_to_dict():
|
| 45 |
+
json_fp = cur_fp.parents[1] / "toydata/eval_results/" / "AIR-Bench_24.05/bge-m3/NoReranker/results.json"
|
| 46 |
+
full_eval_result = FullEvalResult.init_from_json_file(json_fp)
|
| 47 |
+
result_dict_list = full_eval_result.to_dict()
|
| 48 |
+
assert len(result_dict_list) == 1
|
| 49 |
+
print(len(result_dict_list[0]))
|
tests/toydata/eval_results/AIR-Bench_24.04/bge-m3/jina-reranker-v2-base-multilingual/results.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tests/toydata/eval_results/AIR-Bench_24.05/bge-m3/NoReranker/results.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|