"""Based on https://huggingface.co/spaces/demo-leaderboard-backend/leaderboard/blob/main/src/leaderboard/read_evals.py Enhanced with Pydantic models. """ import glob import json import os import warnings from pathlib import Path from typing import Any from loguru import logger from pydantic_core import from_json from src.schemas.eval_result import EvalResult def get_request_file_for_model(requests_path: str, model_name: str, precision: str) -> str: """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED""" request_files = os.path.join( requests_path, f"{model_name}_eval_request_*.json", ) request_files = glob.glob(request_files) # Select correct request file (precision) request_file = "" request_files = sorted(request_files, reverse=True) for tmp_request_file in request_files: with open(tmp_request_file) as f: req_content = json.load(f) if req_content["status"] in ["FINISHED"] and req_content["precision"] == precision.split(".")[-1]: request_file = tmp_request_file return request_file def get_raw_eval_results(results_versions_dir: Path, requests_path: Path, *, results_version: str) -> list[EvalResult]: """From the path of the results folder root, extract all needed info for results""" versioned_result_file = results_versions_dir / f"bench_{results_version}.json" if not versioned_result_file.exists(): raise FileNotFoundError( f"version={results_version!r} results file not found: {versioned_result_file.as_posix()!r}" ) logger.info(f"Loading results from: {versioned_result_file.as_posix()!r}") raw_results_model: dict[str, dict[str, Any]] = from_json(versioned_result_file.read_bytes()) logger.info(f"Loaded {len(raw_results_model)} results") eval_results: dict[str, EvalResult] = {} for _model_key, model_data in raw_results_model.items(): # Creation of result eval_result = EvalResult.init_from_dict(model_data) eval_result.update_with_request_file(requests_path) # Store results of same eval together eval_name = eval_result.eval_name if eval_name in eval_results.keys(): results_loaded = {k: v for k, v in eval_result.results.items() if v is not None} eval_results[eval_name].results.update(results_loaded) else: eval_results[eval_name] = eval_result results: list[EvalResult] = [] for v in eval_results.values(): try: v.to_dict() # we test if the dict version is complete results.append(v) except KeyError as e: # not all eval values present warnings.warn(f"Not all eval values present for {v.eval_name}: {e}", stacklevel=2) continue return results