Add kwargs description.
test_of_time_accuracy.py  CHANGED  (+18 -13)
@@ -33,25 +33,29 @@ The Test of Time (ToT) benchmarks expects models format their answers as a JSON
 """
 
 
-# TODO: Add description of the arguments of the module here
 _KWARGS_DESCRIPTION = """
 Compares the extracted answer from the model's output with the reference answer.
 Args:
-    predictions: list of predictions to score. Each
-
-
-
+    predictions: list of predictions to score. Each prediction should be a string that contains a JSON object (e.g., generated by an LLM).
+    references: list of reference answers.
+    subset: The subset of the benchmark being evaluated. Must be one of "arithmetic" or "semantic".
+    return_average: If True, returns the average accuracy. If False, returns a list of boolean scores (correct/incorrect) for each sample. Defaults to True.
 Returns:
-    accuracy:
-    another_score: description of the second score,
+    accuracy: The accuracy score (0.0 to 1.0) if return_average=True, or a list of booleans indicating correctness per sample if return_average=False.
 Examples:
-
-
-
-
-
+    >>> import evaluate
+    >>> metric = evaluate.load("aauss/test_of_time_accuracy")
+    >>> predictions = [
+    ...     '{"explanation": "Some explanation...", "unordered_list": ["London"]}',
+    ...     ' "Response without opening curly brackets...", "answer": "2005-04-07"}',
+    ... ]
+    >>> references = [
+    ...     '{"unordered_list": ["London"]}',
+    ...     "{'answer': '2005-04-07'}",
+    ... ]
+    >>> results = metric.compute(predictions=predictions, references=references, subset="arithmetic")
     >>> print(results)
-    {'accuracy':
+    {'accuracy': 0.5}
 """
 
 

@@ -139,3 +143,4 @@ class TestOfTimeAccuracy(evaluate.Metric):
         if return_average:
             return {"accuracy": sum(accuracy) / len(accuracy)}
         return {"accuracy": accuracy}
+
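For readers who want to try the new docstring's behaviour outside the Space, the snippet below is a minimal, self-contained sketch of what it describes: pull a JSON object out of each model output, compare its fields to the reference, and either average the per-sample booleans or return them as a list, mirroring the return_average branch in the second hunk. It is not the Space's actual _compute implementation; the helpers _extract_json and toy_accuracy, and the field-by-field comparison rule, are illustrative assumptions.

import json
import re


def _extract_json(text):
    """Return the first {...} block in the model output as a dict, or None if it cannot be parsed."""
    match = re.search(r"\{.*\}", text, flags=re.DOTALL)
    if match is None:
        return None
    try:
        return json.loads(match.group(0))
    except json.JSONDecodeError:
        return None


def toy_accuracy(predictions, references, return_average=True):
    """Hypothetical scorer: a prediction counts as correct if every field of the reference matches."""
    scores = []
    for pred, ref in zip(predictions, references):
        pred_obj = _extract_json(pred)
        ref_obj = json.loads(ref.replace("'", '"'))  # the doctest references mix quote styles
        correct = pred_obj is not None and all(
            pred_obj.get(key) == value for key, value in ref_obj.items()
        )
        scores.append(correct)
    if return_average:
        return {"accuracy": sum(scores) / len(scores)}
    return {"accuracy": scores}


predictions = [
    '{"explanation": "Some explanation...", "unordered_list": ["London"]}',
    ' "Response without opening curly brackets...", "answer": "2005-04-07"}',
]
references = ['{"unordered_list": ["London"]}', "{'answer': '2005-04-07'}"]
print(toy_accuracy(predictions, references))  # {'accuracy': 0.5}

Run on the doctest's two samples, this sketch reproduces {'accuracy': 0.5}: the first prediction parses and matches its reference, while the second has no opening brace, fails to parse, and is counted as wrong.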