aauss committed
Commit b807233 · 1 Parent(s): ddf1ba7

Add kwargs description.

Files changed (1)
  1. test_of_time_accuracy.py +18 -13
test_of_time_accuracy.py CHANGED
@@ -33,25 +33,29 @@ The Test of Time (ToT) benchmarks expects models format their answers as a JSON
 """
 
 
-# TODO: Add description of the arguments of the module here
 _KWARGS_DESCRIPTION = """
 Compares the extracted answer from the model's output with the reference answer.
 Args:
-    predictions: list of predictions to score. Each predictions
-        should be a string with tokens separated by spaces.
-    references: list of reference for each prediction. Each
-        reference should be a string with tokens separated by spaces.
+    predictions: list of predictions to score. Each prediction should be a string that contains a JSON object (e.g., generated by an LLM).
+    references: list of reference answers.
+    subset: The subset of the benchmark being evaluated. Must be one of "arithmetic" or "semantic".
+    return_average: If True, returns the average accuracy. If False, returns a list of boolean scores (correct/incorrect) for each sample. Defaults to True.
 Returns:
-    accuracy: description of the first score,
-    another_score: description of the second score,
+    accuracy: The accuracy score (0.0 to 1.0) if return_average=True, or a list of booleans indicating correctness per sample if return_average=False.
 Examples:
-    Examples should be written in doctest format, and should illustrate how
-    to use the function.
-
-    >>> my_new_module = evaluate.load("my_new_module")
-    >>> results = my_new_module.compute(references=[0, 1], predictions=[0, 1])
+    >>> import evaluate
+    >>> metric = evaluate.load("aauss/test_of_time_accuracy")
+    >>> predictions = [
+    ...     '{"explanation": "Some explanation...", "unordered_list": ["London"]}',
+    ...     ' "Response without opening curly brackets...", "answer": "2005-04-07"}',
+    ... ]
+    >>> references = [
+    ...     '{"unordered_list": ["London"]}',
+    ...     "{'answer': '2005-04-07'}",
+    ... ]
+    >>> results = metric.compute(predictions=predictions, references=references, subset="arithmetic")
     >>> print(results)
-    {'accuracy': 1.0}
+    {'accuracy': 0.5}
 """
 
 
@@ -139,3 +143,4 @@ class TestOfTimeAccuracy(evaluate.Metric):
         if return_average:
             return {"accuracy": sum(accuracy) / len(accuracy)}
         return {"accuracy": accuracy}
+
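
The new docstring also documents return_average=False, which the doctest above does not exercise. The snippet below is a minimal usage sketch, reusing the metric id, inputs, and subset value from the doctest; the per-sample output shown in the trailing comment is an assumption based on the docstring's "list of boolean scores" description, not an output recorded in this commit.

# Minimal sketch: per-sample scoring with return_average=False.
import evaluate

metric = evaluate.load("aauss/test_of_time_accuracy")

predictions = [
    '{"explanation": "Some explanation...", "unordered_list": ["London"]}',
    ' "Response without opening curly brackets...", "answer": "2005-04-07"}',
]
references = [
    '{"unordered_list": ["London"]}',
    "{'answer': '2005-04-07'}",
]

# return_average=False requests one correct/incorrect flag per sample
# instead of a single averaged accuracy.
per_sample = metric.compute(
    predictions=predictions,
    references=references,
    subset="arithmetic",
    return_average=False,
)
print(per_sample)  # e.g. {'accuracy': [True, False]} (assumed output format)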