aauss committed
Commit b807233 · 1 Parent(s): ddf1ba7

Add kwargs description.

Files changed (1)
  1. test_of_time_accuracy.py +18 -13
test_of_time_accuracy.py CHANGED
@@ -33,25 +33,29 @@ The Test of Time (ToT) benchmarks expects models format their answers as a JSON
 """
 
 
-# TODO: Add description of the arguments of the module here
 _KWARGS_DESCRIPTION = """
 Compares the extracted answer from the model's output with the reference answer.
 Args:
-    predictions: list of predictions to score. Each predictions
-        should be a string with tokens separated by spaces.
-    references: list of reference for each prediction. Each
-        reference should be a string with tokens separated by spaces.
+    predictions: list of predictions to score. Each prediction should be a string that contains a JSON object (e.g., generated by an LLM).
+    references: list of reference answers.
+    subset: The subset of the benchmark being evaluated. Must be one of "arithmetic" or "semantic".
+    return_average: If True, returns the average accuracy. If False, returns a list of boolean scores (correct/incorrect) for each sample. Defaults to True.
 Returns:
-    accuracy: description of the first score,
-    another_score: description of the second score,
+    accuracy: The accuracy score (0.0 to 1.0) if return_average=True, or a list of booleans indicating correctness per sample if return_average=False.
 Examples:
-    Examples should be written in doctest format, and should illustrate how
-    to use the function.
-
-    >>> my_new_module = evaluate.load("my_new_module")
-    >>> results = my_new_module.compute(references=[0, 1], predictions=[0, 1])
+    >>> import evaluate
+    >>> metric = evaluate.load("aauss/test_of_time_accuracy")
+    >>> predictions = [
+    ...     '{"explanation": "Some explanation...", "unordered_list": ["London"]}',
+    ...     ' "Response without opening curly brackets...", "answer": "2005-04-07"}',
+    ... ]
+    >>> references = [
+    ...     '{"unordered_list": ["London"]}',
+    ...     "{'answer': '2005-04-07'}",
+    ... ]
+    >>> results = metric.compute(predictions=predictions, references=references, subset="arithmetic")
     >>> print(results)
-    {'accuracy': 1.0}
+    {'accuracy': 0.5}
 """
 
 
@@ -139,3 +143,4 @@ class TestOfTimeAccuracy(evaluate.Metric):
         if return_average:
             return {"accuracy": sum(accuracy) / len(accuracy)}
         return {"accuracy": accuracy}
+
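
The new docstring also documents return_average=False, which the doctest above does not exercise. The snippet below is a minimal usage sketch, reusing the metric id, inputs, and subset value from the doctest; the per-sample output shown in the trailing comment is an assumption based on the docstring's "list of boolean scores" description, not an output recorded in this commit.

# Minimal sketch: per-sample scoring with return_average=False.
import evaluate

metric = evaluate.load("aauss/test_of_time_accuracy")

predictions = [
    '{"explanation": "Some explanation...", "unordered_list": ["London"]}',
    ' "Response without opening curly brackets...", "answer": "2005-04-07"}',
]
references = [
    '{"unordered_list": ["London"]}',
    "{'answer': '2005-04-07'}",
]

# return_average=False requests one correct/incorrect flag per sample
# instead of a single averaged accuracy.
per_sample = metric.compute(
    predictions=predictions,
    references=references,
    subset="arithmetic",
    return_average=False,
)
print(per_sample)  # e.g. {'accuracy': [True, False]} (assumed output format)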