switch to dual eval system
- about.py +2 -1
- app.py +12 -8
- evaluate.py +16 -6
- requirements.txt +2 -1
- utils.py +4 -3
about.py CHANGED

@@ -35,5 +35,6 @@ THROTTLE_MINUTES = 480 # minutes between submissions
 API = HfApi(token=TOKEN)
 organization="OpenADMET"
 submissions_repo = f'{organization}/openadmet-expansionrx-challenge-submissions' # private
-
+results_repo_test = f'{organization}/openadmet-expansionrx-challenge-results' # public
+results_repo_validation = f'{organization}/openadmet-expansionrx-challenge-results-validation' # public
 test_repo = f'{organization}/openadmet-expansionrx-challenge-test-data' # private
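With about.py now defining two results repos, downstream code has to pick which one to read. Below is a minimal sketch of that choice, assuming the repo IDs defined above; the helper name `results_repo_for` is hypothetical and not part of this commit.

```python
# Hypothetical helper (not in this commit): map a split name to its results repo.
from about import results_repo_test, results_repo_validation

def results_repo_for(split: str) -> str:
    repos = {"test": results_repo_test, "validation": results_repo_validation}
    if split not in repos:
        raise ValueError(f"Unknown split: {split!r}")
    return repos[split]
```

As the utils.py diff further down shows, the committed code instead hard-codes `results_repo_validation` with a comment to switch by hand.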
app.py CHANGED

@@ -12,13 +12,15 @@ from utils import (
 )
 from datasets import load_dataset
 import tempfile
-
+from loguru import logger
 from about import ENDPOINTS, LB_COLS, LB_AVG, LB_DTYPES


+
 ALL_EPS = ['Average'] + ENDPOINTS

 def build_leaderboard(df_results):
+    logger.info("Rebuilding leaderboard data...")
     per_ep = {}
     for ep in ALL_EPS:
         df = df_results[df_results["Endpoint"] == ep].copy()

@@ -45,7 +47,7 @@ def build_leaderboard(df_results):
         sorted_df = df.sort_values(by="mean_MAE", ascending=True, kind="stable")
         sorted_df = map_metric_to_stats(sorted_df)
         per_ep[ep] = sorted_df[LB_COLS]
-
+    logger.info("Finished rebuilding leaderboard data.")
     return per_ep

 # Initialize global dataframe

@@ -55,8 +57,8 @@ def gradio_interface():

     with gr.Blocks(title="OpenADMET ADMET Challenge", fill_height=False,
                    theme=gr.themes.Default(text_size=sizes.text_lg)) as demo:
-        timer = gr.Timer(
-        data_version = gr.State(0)  # Track data changes
+        timer = gr.Timer(30)  # Run every 30 seconds when page is focused
+        data_version = gr.State(0)  # Track data changes
         def update_current_dataframe(v):
             global current_df
             new_df = fetch_dataset_df()

@@ -174,7 +176,7 @@ def gradio_interface():

     **Timeline**:
     - **September 16:** Challenge announcement
-    - **October
+    - **October 10:** Second announcement and sample data release
     - **October 27:** Challenge starts
     - **October-November:** Online Q&A sessions and support via the Discord channel
     - **January 19, 2026:** Submission closes

@@ -244,7 +246,7 @@ def gradio_interface():
            select_columns=LB_AVG,
            search_columns=["user"],
            render=True,
-           every=
+           every=30,
        )
        # per-endpoint leaderboard
        for endpoint in ENDPOINTS:

@@ -255,7 +257,7 @@ def gradio_interface():
            select_columns=LB_COLS,
            search_columns=["user"],
            render=True,
-           every=
+           every=30,
        )
        # Auto-refresh
        def refresh_if_changed():

@@ -395,4 +397,6 @@ def gradio_interface():
     return demo

 if __name__ == "__main__":
-
+    logger.info("Starting Gradio app...")
+    gradio_interface().launch(ssr_mode=False)
+    logger.info("Gradio app closed.")
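The `gr.Timer(30)` plus `every=30` on the leaderboard components is a plain polling pattern. Here is a self-contained sketch of the same idea, assuming a recent Gradio version that provides `gr.Timer` and its `tick` event; the data source is a stand-in, not the challenge's `fetch_dataset_df`.

```python
# Minimal sketch of the polling pattern above, independent of the challenge code.
import datetime
import gradio as gr

def fetch_latest():
    # Stand-in for fetch_dataset_df(): returns something that changes over time.
    return f"Last refreshed at {datetime.datetime.now():%H:%M:%S}"

with gr.Blocks() as demo:
    timer = gr.Timer(30)           # fires every 30 seconds while the page is focused
    status = gr.Markdown(fetch_latest())
    timer.tick(fetch_latest, outputs=status)

if __name__ == "__main__":
    demo.launch()
```

In app.py the timer appears to work together with `refresh_if_changed` and the `data_version` state so the tables only re-render when the results dataset actually changes.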
evaluate.py CHANGED

@@ -5,9 +5,9 @@ from typing import Optional
 from about import (
     ENDPOINTS, API,
     submissions_repo,
-
+    results_repo_test,
+    results_repo_validation,
     test_repo,
-    multiplier_dict,
     THROTTLE_MINUTES
 )
 from utils import bootstrap_metrics, clip_and_log_transform, fetch_dataset_df

@@ -23,6 +23,7 @@ from pydantic import (
     field_validator,
     ValidationError
 )
+from loguru import logger

 HF_USERNAME_RE = re.compile(r"^[A-Za-z0-9](?:[A-Za-z0-9-_]{1,38})$")
 def _safeify_username(username: str) -> str:

@@ -189,6 +190,15 @@ def submit_data(predictions_file: str,
     return "✅ Your submission has been received! Your scores will appear on the leaderboard shortly.", destination_csv

 def evaluate_data(filename: str) -> None:
+    # do test set first as a more stringent check of the submission w.r.t matching molecules
+    logger.info(f"Evaluating submission file {filename}")
+    # evaluate on the test set
+    _evaluate_data(filename, test_repo=test_repo, split_filename="data/expansion_data_test.csv", results_repo=results_repo_test)
+    # evaluate on the validation set
+    _evaluate_data(filename, test_repo=test_repo, split_filename="data/expansion_data_test_validation.csv", results_repo=results_repo_validation)
+    logger.info(f"Finished evaluating submission file {filename}")
+
+def _evaluate_data(filename: str, test_repo: str, split_filename: str, results_repo: str) -> None:

     # Load the submission csv
     try:

@@ -205,7 +215,7 @@ def evaluate_data(filename: str) -> None:
         test_path = hf_hub_download(
             repo_id=test_repo,
             repo_type="dataset",
-            filename=
+            filename=split_filename
         )
     except Exception as e:
         raise gr.Error(f"Failed to download test file: {e}")

@@ -277,9 +287,9 @@ def calculate_metrics(
     _check_required_columns(test_dataframe, "Test file", ["Molecule Name"] + ENDPOINTS)


-
-    if not (
-        raise gr.Error("
+    # 2) Check all Molecules in the test set are present in the predictions
+    if not (test_dataframe['Molecule Name']).isin(results_dataframe['Molecule Name']).all():
+        raise gr.Error("Some molecules in the test set are missing from the predictions file. Please ensure all molecules are included.")


     # 3) check no duplicated molecules in the predictions file
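The new step-2 coverage check is easy to exercise in isolation. Below is a small sketch with toy dataframes that mirrors the `isin` logic added to `calculate_metrics`; the frames are made up for illustration, and the app raises `gr.Error` rather than printing.

```python
# Standalone sketch of the coverage check added above, using toy dataframes.
import pandas as pd

test_dataframe = pd.DataFrame({"Molecule Name": ["mol-1", "mol-2", "mol-3"]})
results_dataframe = pd.DataFrame({"Molecule Name": ["mol-1", "mol-3"]})  # mol-2 missing

covered = test_dataframe["Molecule Name"].isin(results_dataframe["Molecule Name"])
if not covered.all():
    missing = test_dataframe.loc[~covered, "Molecule Name"].tolist()
    print(f"Missing predictions for: {missing}")  # evaluate.py raises gr.Error here instead
```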
requirements.txt CHANGED

@@ -4,4 +4,5 @@ huggingface_hub
 gradio-leaderboard
 plotly
 scipy
-scikit-learn
+scikit-learn
+loguru
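loguru is the only genuinely new dependency here (scikit-learn shows as removed and re-added, which usually just reflects a newline change at the end of the file). A minimal usage sketch matching the logger calls introduced in app.py, evaluate.py, and utils.py; the file sink is an assumption, the commit itself relies on loguru's default stderr sink.

```python
# Minimal loguru usage consistent with the calls added in this commit.
from loguru import logger

logger.info("Evaluating submission file {}", "predictions.csv")

# Optional extra sink (assumption, not configured anywhere in this repo):
logger.add("space.log", rotation="10 MB", level="INFO")
```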
utils.py CHANGED

@@ -3,8 +3,9 @@ import pandas as pd
 import numpy as np
 from typing import Tuple
 from datasets import load_dataset, Features, Value
-from about import
+from about import results_repo_validation, results_repo_test
 from about import METRICS, STANDARD_COLS
+from loguru import logger

 def make_user_clickable(name: str):
     link =f'https://huggingface.co/{name}'

@@ -13,7 +14,7 @@ def make_tag_clickable(tag: str):
     return f'<a target="_blank" href="{tag}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">link</a>'

 def fetch_dataset_df():
-
+    logger.info("Fetching latest results dataset from Hugging Face Hub...")
     # Specify feature types to load results dataset
     metric_features = {
         f'mean_{m}': Value('float64') for m in METRICS

@@ -30,7 +31,7 @@ def fetch_dataset_df():
     }
     feature_schema = Features(metric_features | other_features)

-    dset = load_dataset(
+    dset = load_dataset(results_repo_validation, # change to results_repo_test for test set
         split='train',
         features=feature_schema,
         download_mode="force_redownload")
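`fetch_dataset_df` now hard-codes `results_repo_validation`, with a comment to flip it to `results_repo_test` by hand. A hedged sketch of a parameterized alternative follows; this is not code from the commit, and the feature schema is omitted for brevity.

```python
# Possible variant (not in this commit): let the caller choose the results repo
# instead of editing the load_dataset line by hand.
from datasets import load_dataset
from about import results_repo_validation, results_repo_test

def fetch_dataset_df_for(repo_id: str = results_repo_validation):
    # The real fetch_dataset_df also passes a Features schema built from METRICS.
    dset = load_dataset(repo_id, split="train", download_mode="force_redownload")
    return dset.to_pandas()

# e.g. fetch_dataset_df_for(results_repo_test) to load the test-split results
```

Whichever repo this function loads is what the leaderboards built in app.py end up displaying.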