| """Based on https://huggingface.co/spaces/demo-leaderboard-backend/leaderboard/blob/main/src/leaderboard/read_evals.py | |
| Enhanced with Pydantic models. | |
| """ | |
| import glob | |
| import json | |
| import os | |
| import warnings | |
| from pathlib import Path | |
| from typing import Any | |
| from loguru import logger | |
| from pydantic_core import from_json | |
| from src.schemas.eval_result import EvalResult | |


def get_request_file_for_model(requests_path: str, model_name: str, precision: str) -> str:
    """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
    request_files = os.path.join(
        requests_path,
        f"{model_name}_eval_request_*.json",
    )
    request_files = glob.glob(request_files)

    # Select correct request file (precision)
    request_file = ""
    request_files = sorted(request_files, reverse=True)
    for tmp_request_file in request_files:
        with open(tmp_request_file) as f:
            req_content = json.load(f)
            if req_content["status"] in ["FINISHED"] and req_content["precision"] == precision.split(".")[-1]:
                request_file = tmp_request_file
    return request_file
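

# Illustrative usage of get_request_file_for_model (a sketch; the directory and model
# name below are hypothetical, not paths used by this Space):
#
#     request_file = get_request_file_for_model(
#         requests_path="eval-queue",
#         model_name="org__model",
#         precision="Precision.bfloat16",
#     )
#     # Returns the path of a matching request file whose status is FINISHED and whose
#     # precision field equals "bfloat16", or "" if no such file exists.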


def get_raw_eval_results(results_versions_dir: Path, requests_path: Path, *, results_version: str) -> list[EvalResult]:
    """From the path of the results folder root, extract all needed info for results"""
    versioned_result_file = results_versions_dir / f"bench_{results_version}.json"
    if not versioned_result_file.exists():
        raise FileNotFoundError(
            f"version={results_version!r} results file not found: {versioned_result_file.as_posix()!r}"
        )
    logger.info(f"Loading results from: {versioned_result_file.as_posix()!r}")
    raw_results_model: dict[str, dict[str, Any]] = from_json(versioned_result_file.read_bytes())
    logger.info(f"Loaded {len(raw_results_model)} results")

    eval_results: dict[str, EvalResult] = {}
    for _model_key, model_data in raw_results_model.items():
        # Creation of result
        eval_result = EvalResult.init_from_dict(model_data)
        eval_result.update_with_request_file(requests_path)

        # Store results of same eval together
        eval_name = eval_result.eval_name
        if eval_name in eval_results:
            results_loaded = {k: v for k, v in eval_result.results.items() if v is not None}
            eval_results[eval_name].results.update(results_loaded)
        else:
            eval_results[eval_name] = eval_result

    results: list[EvalResult] = []
    for v in eval_results.values():
        try:
            v.to_dict()  # we test if the dict version is complete
            results.append(v)
        except KeyError as e:  # not all eval values present
            warnings.warn(f"Not all eval values present for {v.eval_name}: {e}", stacklevel=2)
            continue
    return results
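

# Illustrative usage of get_raw_eval_results (a sketch; directories and version string
# are hypothetical, not paths used by this Space):
#
#     results = get_raw_eval_results(
#         Path("eval-results/versions"),
#         Path("eval-queue"),
#         results_version="v1",
#     )
#     # Only EvalResult entries whose to_dict() call succeeds (all benchmark values
#     # present) are returned; incomplete entries trigger a warning and are skipped.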