Spaces:

lmms-lab-si
/

EASI-Leaderboard

Running

File size: 8,666 Bytes

3f84332

from pathlib import Path
from typing import Annotated, Any

import numpy as np
from pydantic import BaseModel, ConfigDict, Field, computed_field
from typing_extensions import Self

from src.display.formatting import make_clickable_model
from src.display.utils import AutoEvalColumn, ModelType, Precision, WeightType
from src.prepare import get_benchmarks, load_meta_toml
from src.submission.check_validity import is_model_on_hub


class EvalResultJson(BaseModel):
    """Model of the eval result json file."""

    # Unknown top-level keys are accepted (extra="allow") and validated
    # instances are immutable (frozen=True).
    model_config: ConfigDict = ConfigDict(extra="allow", frozen=True)

    # String forward reference: `EvalResultJson_Config` is declared further
    # down in this module.
    config: "EvalResultJson_Config"
    # Outer key: compared against a benchmark's `key` by
    # `EvalResult.init_from_model`; inner key: metric name (that method reads
    # "caa" and "acc"). A score may be None per the annotation.
    results: dict[str, dict[str, float | None]]


class EvalResultJson_Config(BaseModel):
    """`config` in the eval result json file."""

    # Unknown keys are accepted (extra="allow") and validated instances are
    # immutable (frozen=True).
    model_config: ConfigDict = ConfigDict(extra="allow", frozen=True)

    # Required fields (no default value).
    model_name: Annotated[str, Field(..., description="The model name. e.g. Qwen/Qwen2.5-3B")]
    model_key: Annotated[str, Field(..., description="The model key. e.g. 'qwen2.5_3b'")]
    # Optional fields; the defaults below apply when the key is absent.
    model_dtype: Annotated[str | None, Field(description="The model precision. e.g. torch.bfloat16")] = None
    # An empty sha is replaced by "main" when `EvalResult.init_from_model`
    # checks the hub.
    model_sha: Annotated[str, Field(description="The model sha. e.g. 3aab1f1954e9cc14eb9509a215f9e5ca08227a9b")] = ""
    # Used by `EvalResult.init_from_model` as a fallback when `model_key` is empty.
    model_args: Annotated[str | None, Field(description="The model args.")] = None


class EvalResult(BaseModel):
    """Represents one full evaluation. Built from a combination of the result and request file for a given run.

    Instances are created through one of the ``init_from_*`` class methods and
    converted into a display row with :meth:`to_dict`.
    """

    eval_name: Annotated[
        str,
        Field(
            ...,
            description="The evaluation name. e.g. '{model_key}_{precision}', '{org}_{model_key}_{precision}' (unique identifier)",
        ),
    ]
    full_model: Annotated[
        str,
        Field(
            ...,
            description="The full model name. e.g. '{org}/{model_title}' (path on hub)",
        ),
    ]
    org: str | None
    model: str
    link_url: str | None = None
    revision: str  # commit hash, "" if main
    results: dict[str, float]  # benchmark title -> mean score
    precision: Precision = Precision.Unknown
    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
    weight_type: WeightType = WeightType.Original  # Original or Adapter
    architecture: str = "Unknown"
    license: str = "?"
    likes: int = 0
    num_params: int = 0
    date: str = ""  # submission date of request file
    still_on_hub: bool = False

    @computed_field
    @property
    def link(self) -> str | None:
        """Link to the model on the hub or other platform.

        Returns:
            ``link_url`` when it is set; otherwise a HuggingFace URL built from
            ``org`` and ``model`` when both are non-empty; otherwise ``None``.
        """
        if self.link_url:
            # Use explicitly provided link
            return self.link_url
        if self.org and self.model:
            # Use inferred link on HuggingFace
            return f"https://huggingface.co/{self.org}/{self.model}"
        return None

    @classmethod
    def init_from_json_file(cls, json_path: Path) -> Self:
        """Inits the result from the specific model result file.

        Args:
            json_path: Path of the eval result JSON file; read as raw bytes.
        """
        json_data = json_path.read_bytes()
        return cls.init_from_json(json_data)

    @classmethod
    def init_from_json(cls, json_data: str | bytes | bytearray) -> Self:
        """Inits the result from the specific json data.

        Args:
            json_data: Serialized JSON, validated against ``EvalResultJson``.
        """
        data = EvalResultJson.model_validate_json(json_data)
        return cls.init_from_model(data)

    @classmethod
    def init_from_dict(cls, raw_model: dict[str, Any]) -> Self:
        """Inits the result from the specific json content.

        Args:
            raw_model: Already-parsed JSON content, validated against
                ``EvalResultJson``.
        """
        data = EvalResultJson.model_validate(raw_model)
        return cls.init_from_model(data)

    @staticmethod
    def _extract_results(data: EvalResultJson, benchmarks: Any) -> dict[str, float]:
        """Return the mean score per benchmark title for tasks present in ``data``.

        For each benchmark, the "caa" and "acc" metrics found under the
        benchmark's key are averaged. A benchmark is skipped when it has none
        of these metrics, or when any of them is ``None`` or NaN.
        """
        # TODO: support multiple metrics
        metric_keys = ("caa", "acc")
        results: dict[str, float] = {}
        for task in benchmarks:
            task_metrics = data.results.get(task.key)
            if not task_metrics:
                continue
            scores = [task_metrics[metric] for metric in metric_keys if metric in task_metrics]
            # The schema allows None scores; np.isnan(None) would raise
            # TypeError, so None is checked first and treated like NaN.
            if not scores or any(score is None or np.isnan(score) for score in scores):
                continue
            results[task.title] = float(np.mean(scores))
        return results

    @classmethod
    def init_from_model(cls, data: EvalResultJson) -> Self:
        """Inits the result from a validated ``EvalResultJson``.

        Resolves the org / model name / link through the meta TOML, checks the
        hub (only when an org is known) and collects per-benchmark scores.

        Args:
            data: The validated eval result.

        Returns:
            A validated instance of this class.
        """
        BENCHMARKS = get_benchmarks()
        config = data.config

        # Precision
        precision = Precision.from_str(config.model_dtype)

        meta_toml = load_meta_toml()

        # Get model and org
        model_key: str = config.model_key or config.model_args or ""
        org: str | None = None
        link_url: str | None = None
        m_repo = meta_toml.model_key_to_repo.get(model_key)
        if m_repo is not None:
            if m_repo.repo_id:
                # "org/name" -> ("org", "name"); without a "/" the org part is
                # "" and is normalised to None.
                org, _, model_key = m_repo.repo_id.rpartition("/")
                org = org or None
            if m_repo.link:
                link_url = m_repo.link

        if org:
            result_key = f"{org}_{model_key}_{precision.value.name}"
        else:
            result_key = f"{model_key}_{precision.value.name}"

        # NOTE(review): `model_key` may have been rebound to the repo name
        # above, so this lookup uses the repo name rather than the original
        # config key — confirm that is the intended key for model_key_to_model.
        model_title = model_key
        m_meta = meta_toml.model_key_to_model.get(model_key)
        if m_meta is not None and m_meta.title:
            model_title = m_meta.title

        if org:
            still_on_hub, _, model_config = is_model_on_hub(
                f"{org}/{model_key}",
                config.model_sha or "main",
                trust_remote_code=True,
                test_tokenizer=False,
            )
        else:
            # Without an org there is no hub repo id to check.
            still_on_hub = False
            model_config = None

        architecture: str = "?"
        if model_config is not None:
            architectures: list[str] | None = getattr(model_config, "architectures", None)
            if architectures:
                architecture = ";".join(architectures)

        # Extract results available in this file (some results are split in several files)
        results = cls._extract_results(data, BENCHMARKS)

        return cls.model_validate({
            "eval_name": result_key,
            "full_model": model_title,
            "org": org or None,
            "model": model_key,
            "link_url": link_url or None,
            "results": results,
            "precision": precision,
            "revision": config.model_sha or "",
            "still_on_hub": still_on_hub,
            "architecture": architecture,
        })

    def update_with_request_file(self, requests_path: Path | str) -> None:
        """Finds the relevant request file for the current model and updates info with it.

        Currently a no-op: ``requests_path`` is ignored and nothing is changed.

        Args:
            requests_path: Location of the request files (unused for now).
        """
        # TODO: do nothing for now.
        # The intended behaviour is to load the matching request JSON and fill
        # model_type, weight_type, license, likes, num_params and date from its
        # "model_type", "weight_type", "license", "likes", "params" and
        # "submitted_time" keys, reporting (not raising) when no file is found.
        return

    def to_dict(self) -> dict[str, Any]:
        """Converts the Eval Result to a dict compatible with our dataframe display.

        Returns:
            A mapping with ``"eval_name"``, one entry per ``AutoEvalColumn``
            used below, and one entry per benchmark title (``None`` when this
            result has no score for that benchmark).
        """
        BENCHMARKS = get_benchmarks()

        # The average divides by the number of configured benchmarks, so a
        # benchmark without a score contributes 0. With no benchmarks at all
        # the average is 0.0 instead of raising ZeroDivisionError.
        num_benchmarks = len(BENCHMARKS)
        total = sum(v for v in self.results.values() if v is not None)
        average = total / num_benchmarks if num_benchmarks else 0.0

        data_dict = {
            "eval_name": self.eval_name,  # not a column, just a save name,
            AutoEvalColumn.precision.name: self.precision.value.name,
            AutoEvalColumn.model_type.name: self.model_type.value.name,
            AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
            AutoEvalColumn.architecture.name: self.architecture,
            AutoEvalColumn.model.name: make_clickable_model(self.full_model, self.link),
            AutoEvalColumn.revision.name: self.revision,
            AutoEvalColumn.average.name: average,
            AutoEvalColumn.license.name: self.license,
            AutoEvalColumn.likes.name: self.likes,
            AutoEvalColumn.params.name: self.num_params,
            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
        }

        for task in BENCHMARKS:
            data_dict[task.title] = self.results.get(task.title)

        return data_dict