# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Accuracy metric for the Test of Time benchmark by Fatemi et al. (2024)."""

import ast
import json
from typing import Literal

import datasets
import evaluate


_CITATION = """\
@InProceedings{huggingface:module,
title = {Test of Time Accuracy},
authors={Auss Abbood},
year={2025}
}
"""

_DESCRIPTION = """\
The Test of Time (ToT) benchmark expects models to format their answers as a JSON object with an
explanation field and an answer field that follows a predefined format. The metric extracts JSON
objects from the model's output, keeps only the first one, drops the explanation field, and
compares the remainder with the reference answer.
"""

_KWARGS_DESCRIPTION = """
Compares the answer extracted from the model's output with the reference answer.
Args:
    predictions: list of predictions to score. Each prediction should be a string that contains
        a JSON object (e.g., generated by an LLM).
    references: list of reference answers.
    subset: The subset of the benchmark being evaluated. Must be one of "arithmetic" or "semantic".
    return_average: If True, returns the average accuracy. If False, returns a list of boolean
        scores (correct/incorrect) for each sample. Defaults to True.
Returns:
    accuracy: The accuracy score (0.0 to 1.0) if return_average=True, or a list of booleans
        indicating correctness per sample if return_average=False.
Examples:
    >>> import evaluate
    >>> metric = evaluate.load("aauss/test_of_time_accuracy")
    >>> predictions = [
    ...     '{"explanation": "Some explanation...", "unordered_list": ["London"]}',
    ...     ' "Response without opening curly brackets...", "answer": "2005-04-07"}',
    ... ]
    >>> references = [
    ...     '{"unordered_list": ["London"]}',
    ...     "{'answer': '2005-04-07'}",
    ... ]
    >>> results = metric.compute(predictions=predictions, references=references, subset="arithmetic")
    >>> print(results)
    {'accuracy': 0.5}
"""


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class TestOfTimeAccuracy(evaluate.Metric):
    """Accuracy metric for the Test of Time benchmark by Fatemi et al. (2024)."""

    # Prevent pytest from collecting this class as a test case.
    __test__ = False

    def _info(self):
        return evaluate.MetricInfo(
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # This defines the format of each prediction and reference
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string"),
                    "references": datasets.Value("string"),
                }
            ),
            # Homepage of the module for documentation
            # homepage="http://module.homepage",
            # Additional links to the codebase or references
            # codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
            # reference_urls=["http://path.to.reference.url/new_module"],
        )

    @staticmethod
    def _extract_first_json_object(s: str) -> dict | None:
        """Return the first JSON object found anywhere in ``s``, or None if there is none."""
        decoder = json.JSONDecoder()
        idx, end = 0, len(s)
        while idx < end:
            try:
                obj, next_idx = decoder.raw_decode(s, idx)
                idx = next_idx
                if isinstance(obj, dict):
                    return obj
            except ValueError:
                idx += 1
        return None

    @staticmethod
    def _pop_explanation(d):
        """Drop the "explanation" field so only the answer fields are compared."""
        if isinstance(d, dict):
            d.pop("explanation", None)
        return d

    @staticmethod
    def _get_answer(d):
        """Return the "answer" field of a parsed prediction, if there is one."""
        if isinstance(d, dict):
            return d.get("answer", None)
        return d

    @staticmethod
    def _parse_label(s):
        """Parse a reference string that may be a Python dict literal."""
        try:
            # Safe: only parses literals, does not execute code
            return ast.literal_eval(s)
        except (ValueError, SyntaxError):
            return None

    def _compute(
        self,
        predictions,
        references,
        subset: Literal["arithmetic", "semantic"],
        return_average: bool = True,
    ):
        """Returns the accuracy scores."""
        predictions = [self._extract_first_json_object(p) for p in predictions]
        if subset == "semantic":
            predictions = [self._get_answer(p) for p in predictions]
        elif subset == "arithmetic":
            predictions = [self._pop_explanation(p) for p in predictions]
            references = [self._parse_label(r) for r in references]
        else:
            raise ValueError(f"Invalid subset: {subset}")
        accuracy = []
        for pred, ref in zip(predictions, references):
            if subset == "arithmetic" and isinstance(ref, dict) and "unordered_list" in ref:
                # List answers are order-insensitive, so compare them sorted. Guard against
                # predictions that could not be parsed into a dict with an "unordered_list" key.
                if isinstance(pred, dict) and "unordered_list" in pred:
                    pred = sorted(pred["unordered_list"])
                ref = sorted(ref["unordered_list"])
            # Compare as strings: the semantic subset sometimes has an int as the answer value
            # while the label is a string.
            accuracy.append(str(pred) == str(ref))
        if return_average:
            return {"accuracy": sum(accuracy) / len(accuracy)}
        return {"accuracy": accuracy}