"""Based on https://huggingface.co/spaces/demo-leaderboard-backend/leaderboard/blob/main/src/leaderboard/read_evals.py
Enhanced with Pydantic models.
"""
import glob
import json
import os
import warnings
from pathlib import Path
from typing import Any

from loguru import logger
from pydantic_core import from_json

from src.schemas.eval_result import EvalResult


def get_request_file_for_model(requests_path: str, model_name: str, precision: str) -> str:
    """Select the correct request file for a given model. Only keeps runs tagged as FINISHED."""
    request_pattern = os.path.join(
        requests_path,
        f"{model_name}_eval_request_*.json",
    )
    request_files = glob.glob(request_pattern)

    # Select the request file matching the requested precision. Files are
    # visited in reverse-sorted order and each match overwrites the previous
    # one, so the lexicographically smallest (i.e., oldest) match is returned.
    request_file = ""
    for tmp_request_file in sorted(request_files, reverse=True):
        with open(tmp_request_file) as f:
            req_content = json.load(f)
        if req_content["status"] == "FINISHED" and req_content["precision"] == precision.split(".")[-1]:
            request_file = tmp_request_file
    return request_file


def get_raw_eval_results(results_versions_dir: Path, requests_path: Path, *, results_version: str) -> list[EvalResult]:
    """Extract all needed result info from the versioned results file under the results folder root."""
    versioned_result_file = results_versions_dir / f"bench_{results_version}.json"
    if not versioned_result_file.exists():
        raise FileNotFoundError(
            f"version={results_version!r} results file not found: {versioned_result_file.as_posix()!r}"
        )
    logger.info(f"Loading results from: {versioned_result_file.as_posix()!r}")
    raw_results_model: dict[str, dict[str, Any]] = from_json(versioned_result_file.read_bytes())
    logger.info(f"Loaded {len(raw_results_model)} results")

    eval_results: dict[str, EvalResult] = {}
    for _model_key, model_data in raw_results_model.items():
        # Build the result and enrich it with metadata from its request file
        eval_result = EvalResult.init_from_dict(model_data)
        eval_result.update_with_request_file(requests_path)

        # Store results of the same eval together, merging non-None values
        eval_name = eval_result.eval_name
        if eval_name in eval_results:
            results_loaded = {k: v for k, v in eval_result.results.items() if v is not None}
            eval_results[eval_name].results.update(results_loaded)
        else:
            eval_results[eval_name] = eval_result

    results: list[EvalResult] = []
    for v in eval_results.values():
        try:
            v.to_dict()  # check that the dict version is complete
            results.append(v)
        except KeyError as e:  # not all eval values present
            warnings.warn(f"Not all eval values present for {v.eval_name}: {e}", stacklevel=2)
            continue
    return results
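

# Minimal usage sketch. The paths and the "v1" version tag below are
# hypothetical placeholders, not confirmed by this repo; with
# results_version="v1" the loader expects a file named "bench_v1.json"
# under the versions directory.
if __name__ == "__main__":
    loaded = get_raw_eval_results(
        Path("eval-results/versions"),  # hypothetical versioned-results dir
        Path("eval-queue"),             # hypothetical requests dir
        results_version="v1",           # hypothetical version tag
    )
    logger.info(f"Loaded {len(loaded)} complete eval results")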