Spaces:
Running
Running
Regan Huff
committed on
Bump agenteval version in leaderboard code (#20)
Browse files
- requirements.txt +3 -3
- submission.py +4 -4
requirements.txt
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
agent-eval==0.1.
|
| 2 |
aiobotocore==2.22.0
|
| 3 |
aiofiles==24.1.0
|
| 4 |
aiohappyeyeballs==2.6.1
|
|
@@ -43,7 +43,7 @@ huggingface-hub==0.30.2
|
|
| 43 |
idna==3.10
|
| 44 |
ijson==3.3.0
|
| 45 |
importlib_metadata==8.7.0
|
| 46 |
-
inspect_ai==0.3.
|
| 47 |
isort==6.0.1
|
| 48 |
itsdangerous==2.2.0
|
| 49 |
Jinja2==3.1.6
|
|
@@ -110,7 +110,7 @@ sniffio==1.3.1
|
|
| 110 |
soupsieve==2.7
|
| 111 |
starlette==0.46.2
|
| 112 |
tenacity==9.1.2
|
| 113 |
-
textual
|
| 114 |
tiktoken==0.9.0
|
| 115 |
tokenizers==0.21.1
|
| 116 |
tomli==2.2.1
|
|
|
|
| 1 |
+
agent-eval==0.1.24
|
| 2 |
aiobotocore==2.22.0
|
| 3 |
aiofiles==24.1.0
|
| 4 |
aiohappyeyeballs==2.6.1
|
|
|
|
| 43 |
idna==3.10
|
| 44 |
ijson==3.3.0
|
| 45 |
importlib_metadata==8.7.0
|
| 46 |
+
inspect_ai==0.3.104
|
| 47 |
isort==6.0.1
|
| 48 |
itsdangerous==2.2.0
|
| 49 |
Jinja2==3.1.6
|
|
|
|
| 110 |
soupsieve==2.7
|
| 111 |
starlette==0.46.2
|
| 112 |
tenacity==9.1.2
|
| 113 |
+
textual<3.0.0
|
| 114 |
tiktoken==0.9.0
|
| 115 |
tokenizers==0.21.1
|
| 116 |
tomli==2.2.1
|
submission.py
CHANGED
|
@@ -16,7 +16,7 @@ from agenteval import (
|
|
| 16 |
upload_folder_to_hf,
|
| 17 |
upload_summary_to_hf,
|
| 18 |
)
|
| 19 |
-
from agenteval.models import
|
| 20 |
from agenteval.leaderboard.upload import sanitize_path_component
|
| 21 |
from datasets import Dataset, DatasetDict, VerificationMode, load_dataset
|
| 22 |
from datasets.data_files import EmptyDatasetError
|
|
@@ -58,7 +58,7 @@ os.makedirs(EXTRACTED_DATA_DIR, exist_ok=True)
|
|
| 58 |
CACHED_VIEWERS = {}
|
| 59 |
CACHED_TAG_MAPS = {}
|
| 60 |
|
| 61 |
-
# --- Submission Logic (largely unchanged from original, ensure
|
| 62 |
def try_load_dataset_submission(*args, **kwargs) -> DatasetDict: # Renamed to avoid conflict if LV has one
|
| 63 |
try:
|
| 64 |
return load_dataset(*args, **kwargs)
|
|
@@ -224,7 +224,7 @@ def add_new_eval(
|
|
| 224 |
if not json_path.exists():
|
| 225 |
return format_error(f"Missing manifest {AGENTEVAL_MANIFEST_NAME} in submission.")
|
| 226 |
|
| 227 |
-
eval_result_obj =
|
| 228 |
if eval_result_obj.suite_config.version != CONFIG_NAME:
|
| 229 |
return format_error(f"Suite version mismatch: expected {CONFIG_NAME}, got {eval_result_obj.suite_config.version}.")
|
| 230 |
if eval_result_obj.split != val_or_test:
|
|
@@ -250,7 +250,7 @@ def add_new_eval(
|
|
| 250 |
else: print("mock uploaded scored submission", flush=True)
|
| 251 |
|
| 252 |
|
| 253 |
-
# Update
|
| 254 |
eval_result_obj.submission.agent_name = agent_name
|
| 255 |
eval_result_obj.submission.agent_description = agent_description
|
| 256 |
eval_result_obj.submission.agent_url = agent_url
|
|
|
|
| 16 |
upload_folder_to_hf,
|
| 17 |
upload_summary_to_hf,
|
| 18 |
)
|
| 19 |
+
from agenteval.leaderboard.models import LeaderboardSubmission
|
| 20 |
from agenteval.leaderboard.upload import sanitize_path_component
|
| 21 |
from datasets import Dataset, DatasetDict, VerificationMode, load_dataset
|
| 22 |
from datasets.data_files import EmptyDatasetError
|
|
|
|
| 58 |
CACHED_VIEWERS = {}
|
| 59 |
CACHED_TAG_MAPS = {}
|
| 60 |
|
| 61 |
+
# --- Submission Logic (largely unchanged from original, ensure LeaderboardSubmission and other deps are fine) ---
|
| 62 |
def try_load_dataset_submission(*args, **kwargs) -> DatasetDict: # Renamed to avoid conflict if LV has one
|
| 63 |
try:
|
| 64 |
return load_dataset(*args, **kwargs)
|
|
|
|
| 224 |
if not json_path.exists():
|
| 225 |
return format_error(f"Missing manifest {AGENTEVAL_MANIFEST_NAME} in submission.")
|
| 226 |
|
| 227 |
+
eval_result_obj = LeaderboardSubmission.model_validate_json(json_path.read_text(encoding="utf-8"))
|
| 228 |
if eval_result_obj.suite_config.version != CONFIG_NAME:
|
| 229 |
return format_error(f"Suite version mismatch: expected {CONFIG_NAME}, got {eval_result_obj.suite_config.version}.")
|
| 230 |
if eval_result_obj.split != val_or_test:
|
|
|
|
| 250 |
else: print("mock uploaded scored submission", flush=True)
|
| 251 |
|
| 252 |
|
| 253 |
+
# Update LeaderboardSubmission with submission details
|
| 254 |
eval_result_obj.submission.agent_name = agent_name
|
| 255 |
eval_result_obj.submission.agent_description = agent_description
|
| 256 |
eval_result_obj.submission.agent_url = agent_url
|