"""Based on https://huggingface.co/spaces/demo-leaderboard-backend/leaderboard/blob/main/src/leaderboard/read_evals.py

Enhanced with Pydantic models.
"""

import glob
import json
import os
import warnings
from pathlib import Path
from typing import Any

from loguru import logger
from pydantic_core import from_json

from src.schemas.eval_result import EvalResult


def get_request_file_for_model(requests_path: str, model_name: str, precision: str) -> str:
    """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
    pattern = os.path.join(
        requests_path,
        f"{model_name}_eval_request_*.json",
    )
    request_files = glob.glob(pattern)

    # Select correct request file (precision)
    request_file = ""
    request_files = sorted(request_files, reverse=True)
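    # Iterate in descending filename order; if several files match, the last match
    # (the lexicographically smallest name, typically the oldest request) is the one returned.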
    for tmp_request_file in request_files:
        with open(tmp_request_file) as f:
            req_content = json.load(f)
            if req_content["status"] in ["FINISHED"] and req_content["precision"] == precision.split(".")[-1]:
                request_file = tmp_request_file
    return request_file
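
# Illustrative call (assumed file name and precision value, not taken from this repo):
# with requests_path containing "org/model_eval_request_float16.json" whose JSON holds
# {"status": "FINISHED", "precision": "float16", ...}, the following would return that path:
#   get_request_file_for_model("eval-requests", "org/model", "Precision.float16")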


def get_raw_eval_results(results_versions_dir: Path, requests_path: Path, *, results_version: str) -> list[EvalResult]:
    """From the path of the results folder root, extract all needed info for results"""
    versioned_result_file = results_versions_dir / f"bench_{results_version}.json"
    if not versioned_result_file.exists():
        raise FileNotFoundError(
            f"version={results_version!r} results file not found: {versioned_result_file.as_posix()!r}"
        )
    logger.info(f"Loading results from: {versioned_result_file.as_posix()!r}")
    raw_results_model: dict[str, dict[str, Any]] = from_json(versioned_result_file.read_bytes())
    logger.info(f"Loaded {len(raw_results_model)} results")

    eval_results: dict[str, EvalResult] = {}
    for _model_key, model_data in raw_results_model.items():
        # Build the EvalResult and enrich it with its request file metadata
        eval_result = EvalResult.init_from_dict(model_data)
        eval_result.update_with_request_file(requests_path)

        # Store results of same eval together
        eval_name = eval_result.eval_name
        if eval_name in eval_results:
            results_loaded = {k: v for k, v in eval_result.results.items() if v is not None}
            eval_results[eval_name].results.update(results_loaded)
        else:
            eval_results[eval_name] = eval_result

    results: list[EvalResult] = []
    for v in eval_results.values():
        try:
            v.to_dict()  # check that the dict representation is complete
            results.append(v)
        except KeyError as e:  # not all eval values present
            warnings.warn(f"Not all eval values present for {v.eval_name}: {e}", stacklevel=2)
            continue

    return results
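

# Minimal usage sketch (illustrative only): the directory names and version tag below are
# assumptions about the surrounding project layout, not values defined in this module.
if __name__ == "__main__":
    demo_results = get_raw_eval_results(
        results_versions_dir=Path("eval-results/versions"),  # assumed location of bench_<version>.json files
        requests_path=Path("eval-requests"),  # assumed location of <model>_eval_request_*.json files
        results_version="v1.0",  # assumed version tag
    )
    logger.info(f"Loaded {len(demo_results)} complete eval results")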