Maria Castellanos
committed on
Commit
·
b2070e0
1
Parent(s):
bcf1817
Add links
Browse files- about.py +0 -1
- app.py +10 -11
- evaluate.py +12 -7
about.py
CHANGED
|
@@ -9,7 +9,6 @@ ENDPOINTS = ["LogD",
|
|
| 9 |
"Caco-2 Permeability Papp A>B",
|
| 10 |
"MPPB",
|
| 11 |
"MBPB",
|
| 12 |
-
"RLM CLint",
|
| 13 |
"MGMB"]
|
| 14 |
|
| 15 |
LB_COLS0 = ["endpoint",
|
|
|
|
| 9 |
"Caco-2 Permeability Papp A>B",
|
| 10 |
"MPPB",
|
| 11 |
"MBPB",
|
|
|
|
| 12 |
"MGMB"]
|
| 13 |
|
| 14 |
LB_COLS0 = ["endpoint",
|
app.py
CHANGED
|
@@ -107,7 +107,7 @@ def gradio_interface():
|
|
| 107 |
|
| 108 |
Participants will be tasked with solving real-world ADMET prediction problems ExpansionRx faced during lead optimization.
|
| 109 |
Specifically, you will be asked to predict the ADMET properties of late-stage molecules based on earlier-stage data from the same campaigns.
|
| 110 |
-
For this challenge we selected
|
| 111 |
|
| 112 |
- LogD
|
| 113 |
- Kinetic Solubility **KSOL**: uM
|
|
@@ -117,14 +117,13 @@ def gradio_interface():
|
|
| 117 |
- Caco-2 Papp A>B (10^-6 cm/s)
|
| 118 |
- Mouse Plasma Protein Binding (**MPPB**): % Unbound
|
| 119 |
- Mouse Brain Protein Binding (**MBPB**): % Unbound
|
| 120 |
-
- Rat Liver Microsomal (**RLM**) *Clint*: mL/min/kg
|
| 121 |
- Mouse Gastrocnemius Muscle Binding (**MGMB**): % Unbound
|
| 122 |
|
| 123 |
Find more information about these endpoints on our [blog](https://openadmet.org/community/blogs/challenge_announcement2/).
|
| 124 |
|
| 125 |
## ✅ How to Participate
|
| 126 |
1. **Register**: Create an account with Hugging Face.
|
| 127 |
-
2. **Download the Public Dataset**:
|
| 128 |
3. **Train Your Model**: Use the provided training data for each ADMET property of your choice.
|
| 129 |
4. **Submit Predictions**: Follow the instructions in the *Submit* tab to upload your predictions.
|
| 130 |
5. Join the discussion on the [Challenge Discord](https://discord.gg/MY5cEFHH3D)!
|
|
@@ -145,10 +144,9 @@ def gradio_interface():
|
|
| 145 |
| Caco-2 Permeability Papp A>B | 10^-6 cm/s | float | Caco-2 Permeability Papp A>B |
|
| 146 |
| MPPB | % Unbound | float | Mouse Plasma Protein Binding |
|
| 147 |
| MBPB | % Unbound | float | Mouse Brain Protein Binding |
|
| 148 |
-
| RLM CLint | mL/min/kg | float | Rat Liver Microsomal Stability |
|
| 149 |
| MGMB | % Unbound | float | Mouse Gastrocnemius Muscle Binding |
|
| 150 |
|
| 151 |
-
You can download the training data from the [Hugging Face dataset](https://huggingface.co/datasets/
|
| 152 |
The test set will remain blinded until the challenge submission deadline. You will be tasked with predicting the same set of ADMET endpoints for the test set molecules.
|
| 153 |
|
| 154 |
## π Evaluation
|
|
@@ -156,7 +154,7 @@ def gradio_interface():
|
|
| 156 |
- We welcome submissions of any kind, including machine learning and physics-based approaches. You can also employ pre-training approaches as you see fit,
|
| 157 |
as well as incorporate data from external sources into your models and submissions.
|
| 158 |
- In the spirit of open science and open source we would love to see code showing how you created your submission if possible, in the form of a Github Repository.
|
| 159 |
-
If not possible due to IP or other constraints you must at a minimum provide a short report written methodology based on the template [here](
|
| 160 |
**Make sure your last submission before the deadline includes a link to a report or to a Github repository.**
|
| 161 |
- Each participant can submit as many times as they like, up to a limit of 5 times/day. **Only your latest submission will be considered for the final leaderboard.**
|
| 162 |
- The endpoints will be judged individually by mean absolute error (**MAE**), while an overall leaderboard will be judged by the macro-averaged relative absolute error (**MA-RAE**).
|
|
@@ -165,7 +163,7 @@ def gradio_interface():
|
|
| 165 |
|
| 166 |
📅 **Timeline**:
|
| 167 |
- **September 16:** Challenge announcement
|
| 168 |
-
- **
|
| 169 |
- **October 27:** Challenge starts
|
| 170 |
- **October-November:** Online Q&A sessions and support via the Discord channel
|
| 171 |
- **January 19, 2026:** Submission closes
|
|
@@ -334,15 +332,16 @@ def gradio_interface():
|
|
| 334 |
gr.Markdown(
|
| 335 |
"""
|
| 336 |
## Submission Instructions
|
| 337 |
-
|
|
|
|
| 338 |
Only your latest submission will be considered.
|
| 339 |
|
| 340 |
-
|
| 341 |
"""
|
| 342 |
)
|
| 343 |
download_btn = gr.DownloadButton(
|
| 344 |
-
label="π₯ Download Test Set
|
| 345 |
-
value="data/
|
| 346 |
variant="secondary",
|
| 347 |
)
|
| 348 |
with gr.Column():
|
|
|
|
| 107 |
|
| 108 |
Participants will be tasked with solving real-world ADMET prediction problems ExpansionRx faced during lead optimization.
|
| 109 |
Specifically, you will be asked to predict the ADMET properties of late-stage molecules based on earlier-stage data from the same campaigns.
|
| 110 |
+
For this challenge we selected nine (9) crucial endpoints for the community to predict:
|
| 111 |
|
| 112 |
- LogD
|
| 113 |
- Kinetic Solubility **KSOL**: uM
|
|
|
|
| 117 |
- Caco-2 Papp A>B (10^-6 cm/s)
|
| 118 |
- Mouse Plasma Protein Binding (**MPPB**): % Unbound
|
| 119 |
- Mouse Brain Protein Binding (**MBPB**): % Unbound
|
|
|
|
| 120 |
- Mouse Gastrocnemius Muscle Binding (**MGMB**): % Unbound
|
| 121 |
|
| 122 |
Find more information about these endpoints on our [blog](https://openadmet.org/community/blogs/challenge_announcement2/).
|
| 123 |
|
| 124 |
## ✅ How to Participate
|
| 125 |
1. **Register**: Create an account with Hugging Face.
|
| 126 |
+
2. **Download the Public Dataset**: Download the ExpansionRx dataset.
|
| 127 |
3. **Train Your Model**: Use the provided training data for each ADMET property of your choice.
|
| 128 |
4. **Submit Predictions**: Follow the instructions in the *Submit* tab to upload your predictions.
|
| 129 |
5. Join the discussion on the [Challenge Discord](https://discord.gg/MY5cEFHH3D)!
|
|
|
|
| 144 |
| Caco-2 Permeability Papp A>B | 10^-6 cm/s | float | Caco-2 Permeability Papp A>B |
|
| 145 |
| MPPB | % Unbound | float | Mouse Plasma Protein Binding |
|
| 146 |
| MBPB | % Unbound | float | Mouse Brain Protein Binding |
|
|
|
|
| 147 |
| MGMB | % Unbound | float | Mouse Gastrocnemius Muscle Binding |
|
| 148 |
|
| 149 |
+
You can download the training data from the [Hugging Face dataset](https://huggingface.co/datasets/openadmet/openadmet-challenge-train-data).
|
| 150 |
The test set will remain blinded until the challenge submission deadline. You will be tasked with predicting the same set of ADMET endpoints for the test set molecules.
|
| 151 |
|
| 152 |
## π Evaluation
|
|
|
|
| 154 |
- We welcome submissions of any kind, including machine learning and physics-based approaches. You can also employ pre-training approaches as you see fit,
|
| 155 |
as well as incorporate data from external sources into your models and submissions.
|
| 156 |
- In the spirit of open science and open source we would love to see code showing how you created your submission if possible, in the form of a Github Repository.
|
| 157 |
+
If not possible due to IP or other constraints you must at a minimum provide a short written report describing your methodology, based on the template [here](https://docs.google.com/document/d/1bttGiBQcLiSXFngmzUdEqVchzPhj-hcYLtYMszaOqP8/edit?usp=sharing).
|
| 158 |
**Make sure your last submission before the deadline includes a link to a report or to a Github repository.**
|
| 159 |
- Each participant can submit as many times as they like, up to a limit of 5 times/day. **Only your latest submission will be considered for the final leaderboard.**
|
| 160 |
- The endpoints will be judged individually by mean absolute error (**MAE**), while an overall leaderboard will be judged by the macro-averaged relative absolute error (**MA-RAE**).
|
|
|
|
| 163 |
|
| 164 |
📅 **Timeline**:
|
| 165 |
- **September 16:** Challenge announcement
|
| 166 |
+
- **October XX:** Second announcement and sample data release
|
| 167 |
- **October 27:** Challenge starts
|
| 168 |
- **October-November:** Online Q&A sessions and support via the Discord channel
|
| 169 |
- **January 19, 2026:** Submission closes
|
|
|
|
| 332 |
gr.Markdown(
|
| 333 |
"""
|
| 334 |
## Submission Instructions
|
| 335 |
+
After training your model with the [ExpansionRx training set](https://huggingface.co/datasets/openadmet/openadmet-challenge-train-data),
|
| 336 |
+
please upload a single CSV file containing your predictions for all compounds in the test set.
|
| 337 |
Only your latest submission will be considered.
|
| 338 |
|
| 339 |
+
Download a CSV file with the compounds in the test set here:
|
| 340 |
"""
|
| 341 |
)
|
| 342 |
download_btn = gr.DownloadButton(
|
| 343 |
+
label="π₯ Download Test Set Compounds",
|
| 344 |
+
value="data/expansion_data_test_blinded.csv",
|
| 345 |
variant="secondary",
|
| 346 |
)
|
| 347 |
with gr.Column():
|
evaluate.py
CHANGED
|
@@ -255,18 +255,23 @@ def calculate_metrics(
|
|
| 255 |
# Do some checks
|
| 256 |
|
| 257 |
# 1) Check all columns are present
|
| 258 |
-
|
| 259 |
-
|
|
|
|
|
|
|
| 260 |
# 2) Check all Molecules in the test set are present in the predictions
|
| 261 |
-
merged_df = pd.merge(test_dataframe, results_dataframe, on=['
|
| 262 |
if not (merged_df['_merge'] == 'both').all():
|
| 263 |
raise gr.Error("The predictions file is missing some molecules present in the test set. Please ensure all molecules are included.")
|
| 264 |
# TODO: What to do when a molecule is duplicated in the Predictions file?
|
| 265 |
|
| 266 |
df_results = pd.DataFrame(columns=["endpoint", "MAE", "RAE", "R2", "Spearman R", "Kendall's Tau"])
|
| 267 |
for i, measurement in enumerate(ENDPOINTS):
|
| 268 |
-
df_pred = results_dataframe[['
|
| 269 |
-
|
|
|
|
|
|
|
|
|
|
| 270 |
# coerce numeric columns
|
| 271 |
df_pred[measurement] = pd.to_numeric(df_pred[measurement], errors="coerce")
|
| 272 |
df_true[measurement] = pd.to_numeric(df_true[measurement], errors="coerce")
|
|
@@ -280,7 +285,7 @@ def calculate_metrics(
|
|
| 280 |
df_pred.rename(columns={measurement: f"{measurement}_pred"})
|
| 281 |
.merge(
|
| 282 |
df_true.rename(columns={measurement: f"{measurement}_true"}),
|
| 283 |
-
on="
|
| 284 |
how="inner",
|
| 285 |
)
|
| 286 |
.dropna(subset=[f"{measurement}_pred", f"{measurement}_true"])
|
|
@@ -288,7 +293,7 @@ def calculate_metrics(
|
|
| 288 |
n_total = merged[f"{measurement}_true"].notna().sum() # Valid test set points
|
| 289 |
n_pairs = len(merged) # actual pairs with predictions
|
| 290 |
coverage = (n_pairs / n_total * 100.0) if n_total else 0.0
|
| 291 |
-
merged = merged.sort_values("
|
| 292 |
|
| 293 |
# validate pairs
|
| 294 |
if n_pairs < 10:
|
|
|
|
| 255 |
# Do some checks
|
| 256 |
|
| 257 |
# 1) Check all columns are present
|
| 258 |
+
if "Molecule Name" in results_dataframe.columns: # Temporary check so old version of results doesn't fail
|
| 259 |
+
results_dataframe.rename({"Molecule Name": "Name"}, inplace=True)
|
| 260 |
+
_check_required_columns(results_dataframe, "Results file", ["Name"] + ENDPOINTS)
|
| 261 |
+
_check_required_columns(test_dataframe, "Test file", ["Name"] + ENDPOINTS)
|
| 262 |
# 2) Check all Molecules in the test set are present in the predictions
|
| 263 |
+
merged_df = pd.merge(test_dataframe, results_dataframe, on=['Name'], how='left', indicator=True)
|
| 264 |
if not (merged_df['_merge'] == 'both').all():
|
| 265 |
raise gr.Error("The predictions file is missing some molecules present in the test set. Please ensure all molecules are included.")
|
| 266 |
# TODO: What to do when a molecule is duplicated in the Predictions file?
|
| 267 |
|
| 268 |
df_results = pd.DataFrame(columns=["endpoint", "MAE", "RAE", "R2", "Spearman R", "Kendall's Tau"])
|
| 269 |
for i, measurement in enumerate(ENDPOINTS):
|
| 270 |
+
df_pred = results_dataframe[['Name', measurement]].copy()
|
| 271 |
+
# Only use data with operator "="
|
| 272 |
+
mask = test_dataframe[f"op_{measurement}"] != '='
|
| 273 |
+
test_dataframe.loc[mask, measurement] = np.nan
|
| 274 |
+
df_true = test_dataframe[['Name', measurement]].copy()
|
| 275 |
# coerce numeric columns
|
| 276 |
df_pred[measurement] = pd.to_numeric(df_pred[measurement], errors="coerce")
|
| 277 |
df_true[measurement] = pd.to_numeric(df_true[measurement], errors="coerce")
|
|
|
|
| 285 |
df_pred.rename(columns={measurement: f"{measurement}_pred"})
|
| 286 |
.merge(
|
| 287 |
df_true.rename(columns={measurement: f"{measurement}_true"}),
|
| 288 |
+
on="Name",
|
| 289 |
how="inner",
|
| 290 |
)
|
| 291 |
.dropna(subset=[f"{measurement}_pred", f"{measurement}_true"])
|
|
|
|
| 293 |
n_total = merged[f"{measurement}_true"].notna().sum() # Valid test set points
|
| 294 |
n_pairs = len(merged) # actual pairs with predictions
|
| 295 |
coverage = (n_pairs / n_total * 100.0) if n_total else 0.0
|
| 296 |
+
merged = merged.sort_values("Name", kind="stable")
|
| 297 |
|
| 298 |
# validate pairs
|
| 299 |
if n_pairs < 10:
|