import json
from pathlib import Path
from typing import Annotated, Any

import numpy as np
from pydantic import BaseModel, ConfigDict, Field, computed_field
from typing_extensions import Self

from src.display.formatting import make_clickable_model
from src.display.utils import AutoEvalColumn, ModelType, Precision, WeightType
from src.prepare import get_benchmarks, load_meta_toml
from src.submission.check_validity import is_model_on_hub


class EvalResultJson(BaseModel):
    """Model of the eval result json file."""

    model_config: ConfigDict = ConfigDict(extra="allow", frozen=True)

    config: "EvalResultJson_Config"
    results: dict[str, dict[str, float | None]]


class EvalResultJson_Config(BaseModel):
    """`config` in the eval result json file."""

    model_config: ConfigDict = ConfigDict(extra="allow", frozen=True)

    model_name: Annotated[str, Field(..., description="The model name, e.g. 'Qwen/Qwen2.5-3B'.")]
    model_key: Annotated[str, Field(..., description="The model key, e.g. 'qwen2.5_3b'.")]
    model_dtype: Annotated[str | None, Field(description="The model precision, e.g. 'torch.bfloat16'.")] = None
    model_sha: Annotated[str, Field(description="The model sha, e.g. '3aab1f1954e9cc14eb9509a215f9e5ca08227a9b'.")] = ""
    model_args: Annotated[str | None, Field(description="The model args.")] = None


class EvalResult(BaseModel):
    """Represents one full evaluation, built from a combination of the result and request files for a given run."""

    eval_name: Annotated[
        str,
        Field(
            ...,
            description="The evaluation name (unique identifier), e.g. '{model_key}_{precision}' or '{org}_{model_key}_{precision}'.",
        ),
    ]
    full_model: Annotated[
        str,
        Field(
            ...,
            description="The full model name, e.g. '{org}/{model_title}' (path on the hub).",
        ),
    ]
    org: str | None
    model: str
    link_url: str | None = None
    revision: str  # commit hash, "" if main
    results: dict[str, float]
    precision: Precision = Precision.Unknown
    model_type: ModelType = ModelType.Unknown  # Pretrained, fine-tuned, ...
    weight_type: WeightType = WeightType.Original  # Original or Adapter
    architecture: str = "Unknown"
    license: str = "?"
    likes: int = 0
    num_params: int = 0
    date: str = ""  # submission date of the request file
    still_on_hub: bool = False

    @computed_field
    @property
    def link(self) -> str | None:
        """Link to the model on the hub or another platform."""
        if self.link_url:
            # Use the explicitly provided link
            return self.link_url
        if self.org and self.model:
            # Use the inferred link on the Hugging Face hub
            return f"https://huggingface.co/{self.org}/{self.model}"
        return None

    @classmethod
    def init_from_json_file(cls, json_path: Path) -> Self:
        """Inits the result from the given model result file."""
        json_data = json_path.read_bytes()
        return cls.init_from_json(json_data)

    @classmethod
    def init_from_json(cls, json_data: str | bytes | bytearray) -> Self:
        """Inits the result from the given json data."""
        data = EvalResultJson.model_validate_json(json_data)
        return cls.init_from_model(data)

    @classmethod
    def init_from_dict(cls, raw_model: dict[str, Any]) -> Self:
        """Inits the result from the given json content."""
        data = EvalResultJson.model_validate(raw_model)
        return cls.init_from_model(data)

    @classmethod
    def init_from_model(cls, data: EvalResultJson) -> Self:
        BENCHMARKS = get_benchmarks()
        config = data.config

        # Precision
        precision = Precision.from_str(config.model_dtype)

        meta_toml = load_meta_toml()

        # Get model and org
        model_key: str = config.model_key or config.model_args or ""
        org = None
        link_url = None
        m_repo = meta_toml.model_key_to_repo.get(model_key)
        if m_repo is not None:
            if m_repo.repo_id:
                org, _, model_key = m_repo.repo_id.rpartition("/")
                org = org or None
            if m_repo.link:
                link_url = m_repo.link

        if not org:
            result_key = f"{model_key}_{precision.value.name}"
        else:
            result_key = f"{org}_{model_key}_{precision.value.name}"

        model_title = model_key
        m_meta = meta_toml.model_key_to_model.get(model_key)
        if m_meta is not None and m_meta.title:
            model_title = m_meta.title

        if org:
            still_on_hub, _, model_config = is_model_on_hub(
                f"{org}/{model_key}",
                config.model_sha or "main",
                trust_remote_code=True,
                test_tokenizer=False,
            )
        else:
            still_on_hub = False
            model_config = None

        architecture: str = "?"
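        # If the hub returned a config object, its `architectures` list (when present)
        # replaces the "?" placeholder; multiple architectures are joined with ";".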
        if model_config is not None:
            architectures: list[str] | None = getattr(model_config, "architectures", None)
            if architectures:
                architecture = ";".join(architectures)

        # Extract results available in this file (some results are split across several files)
        results: dict[str, float] = {}
        for task in BENCHMARKS:
            # We average all scores of a given metric (not all metrics are present in all files)
            # TODO: support multiple metrics
            metric_keys = ["caa", "acc"]
            accs = np.array([
                # Treat missing (None) scores as NaN so the task is skipped below
                v[metric] if v[metric] is not None else np.nan
                for k, v in data.results.items()
                if task.key == k
                for metric in metric_keys
                if metric in v
            ])
            if accs.size == 0 or any(np.isnan(acc) for acc in accs):
                continue

            # mean_acc = np.mean(accs) * 100.0
            mean_acc = np.mean(accs)
            results[task.title] = float(mean_acc)

        return cls.model_validate({
            "eval_name": result_key,
            "full_model": model_title,
            "org": org or None,
            "model": model_key,
            "link_url": link_url or None,
            "results": results,
            "precision": precision,
            "revision": config.model_sha or "",
            "still_on_hub": still_on_hub,
            "architecture": architecture,
        })

    def update_with_request_file(self, requests_path: Path | str) -> None:
        """Finds the relevant request file for the current model and updates info with it."""
        # TODO: do nothing for now
        return

        # NOTE: the code below is unreachable until the TODO above is resolved; it relies on a
        # `get_request_file_for_model` helper that is not defined or imported in this module.
        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)

        try:
            with open(request_file) as f:
                request: dict[str, Any] = json.load(f)
            self.model_type = ModelType.from_str(request.get("model_type", ""))
            self.weight_type = WeightType[request.get("weight_type", "Original")]
            self.license = request.get("license", "?")
            self.likes = request.get("likes", 0)
            self.num_params = request.get("params", 0)
            self.date = request.get("submitted_time", "")
        except Exception as e:
            print(
                f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}. Error: {e}"
            )

    def to_dict(self) -> dict:
        """Converts the EvalResult to a dict compatible with our dataframe display."""
        BENCHMARKS = get_benchmarks()
        average = sum(v for v in self.results.values() if v is not None) / len(BENCHMARKS)

        data_dict = {
            "eval_name": self.eval_name,  # not a column, just a save name
            AutoEvalColumn.precision.name: self.precision.value.name,
            AutoEvalColumn.model_type.name: self.model_type.value.name,
            AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
            AutoEvalColumn.architecture.name: self.architecture,
            AutoEvalColumn.model.name: make_clickable_model(self.full_model, self.link),
            AutoEvalColumn.revision.name: self.revision,
            AutoEvalColumn.average.name: average,
            AutoEvalColumn.license.name: self.license,
            AutoEvalColumn.likes.name: self.likes,
            AutoEvalColumn.params.name: self.num_params,
            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
        }

        for task in BENCHMARKS:
            data_dict[task.title] = self.results.get(task.title, None)

        return data_dict
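

if __name__ == "__main__":
    # Minimal usage sketch: build an EvalResult from a result file and print the
    # dataframe row. The path below is a hypothetical placeholder, not a real file.
    example_path = Path("eval-results/demo/results.json")
    if example_path.exists():
        result = EvalResult.init_from_json_file(example_path)
        print(result.to_dict())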