antoniaebner committed
Commit 35189e2 · 1 Parent(s): aae42ec

upload code

Files changed (15)
  1. .gitignore +4 -0
  2. Dockerfile +16 -0
  3. LICENSE +407 -0
  4. MODEL_CARD.md +26 -0
  5. README.md +97 -4
  6. app.py +78 -0
  7. config/config.json +36 -0
  8. data/tox_smarts.json +0 -0
  9. predict.py +101 -0
  10. preprocess.py +68 -0
  11. requirements.txt +12 -0
  12. src/__init__.py +0 -0
  13. src/model.py +126 -0
  14. src/preprocess.py +670 -0
  15. src/utils.py +525 -0
.gitignore ADDED
@@ -0,0 +1,4 @@
+ __pycache__/
+ hiddens/
+ logs/
+ checkpoints_/
Dockerfile ADDED
@@ -0,0 +1,16 @@
+ # Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
+ # you will also find guides on how best to write your Dockerfile
+
+ FROM python:3.11
+
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV PATH="/home/user/.local/bin:$PATH"
+
+ WORKDIR /app
+
+ COPY --chown=user ./requirements.txt requirements.txt
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
+
+ COPY --chown=user . /app
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
LICENSE ADDED
@@ -0,0 +1,407 @@
+ Attribution-NonCommercial 4.0 International
+
+ =======================================================================
+
+ Creative Commons Corporation ("Creative Commons") is not a law firm and
+ does not provide legal services or legal advice. Distribution of
+ Creative Commons public licenses does not create a lawyer-client or
+ other relationship. Creative Commons makes its licenses and related
+ information available on an "as-is" basis. Creative Commons gives no
+ warranties regarding its licenses, any material licensed under their
+ terms and conditions, or any related information. Creative Commons
+ disclaims all liability for damages resulting from their use to the
+ fullest extent possible.
+
+ Using Creative Commons Public Licenses
+
+ Creative Commons public licenses provide a standard set of terms and
+ conditions that creators and other rights holders may use to share
+ original works of authorship and other material subject to copyright
+ and certain other rights specified in the public license below. The
+ following considerations are for informational purposes only, are not
+ exhaustive, and do not form part of our licenses.
+
+      Considerations for licensors: Our public licenses are
+      intended for use by those authorized to give the public
+      permission to use material in ways otherwise restricted by
+      copyright and certain other rights. Our licenses are
+      irrevocable. Licensors should read and understand the terms
+      and conditions of the license they choose before applying it.
+      Licensors should also secure all rights necessary before
+      applying our licenses so that the public can reuse the
+      material as expected. Licensors should clearly mark any
+      material not subject to the license. This includes other CC-
+      licensed material, or material used under an exception or
+      limitation to copyright. More considerations for licensors:
+      wiki.creativecommons.org/Considerations_for_licensors
+
+      Considerations for the public: By using one of our public
+      licenses, a licensor grants the public permission to use the
+      licensed material under specified terms and conditions. If
+      the licensor's permission is not necessary for any reason--for
+      example, because of any applicable exception or limitation to
+      copyright--then that use is not regulated by the license. Our
+      licenses grant only permissions under copyright and certain
+      other rights that a licensor has authority to grant. Use of
+      the licensed material may still be restricted for other
+      reasons, including because others have copyright or other
+      rights in the material. A licensor may make special requests,
+      such as asking that all changes be marked or described.
+      Although not required by our licenses, you are encouraged to
+      respect those requests where reasonable. More considerations
+      for the public:
+      wiki.creativecommons.org/Considerations_for_licensees
+
+ =======================================================================
+
+ Creative Commons Attribution-NonCommercial 4.0 International Public
+ License
+
+ By exercising the Licensed Rights (defined below), You accept and agree
+ to be bound by the terms and conditions of this Creative Commons
+ Attribution-NonCommercial 4.0 International Public License ("Public
+ License"). To the extent this Public License may be interpreted as a
+ contract, You are granted the Licensed Rights in consideration of Your
+ acceptance of these terms and conditions, and the Licensor grants You
+ such rights in consideration of benefits the Licensor receives from
+ making the Licensed Material available under these terms and
+ conditions.
+
+
+ Section 1 -- Definitions.
+
+   a. Adapted Material means material subject to Copyright and Similar
+      Rights that is derived from or based upon the Licensed Material
+      and in which the Licensed Material is translated, altered,
+      arranged, transformed, or otherwise modified in a manner requiring
+      permission under the Copyright and Similar Rights held by the
+      Licensor. For purposes of this Public License, where the Licensed
+      Material is a musical work, performance, or sound recording,
+      Adapted Material is always produced where the Licensed Material is
+      synched in timed relation with a moving image.
+
+   b. Adapter's License means the license You apply to Your Copyright
+      and Similar Rights in Your contributions to Adapted Material in
+      accordance with the terms and conditions of this Public License.
+
+   c. Copyright and Similar Rights means copyright and/or similar rights
+      closely related to copyright including, without limitation,
+      performance, broadcast, sound recording, and Sui Generis Database
+      Rights, without regard to how the rights are labeled or
+      categorized. For purposes of this Public License, the rights
+      specified in Section 2(b)(1)-(2) are not Copyright and Similar
+      Rights.
+   d. Effective Technological Measures means those measures that, in the
+      absence of proper authority, may not be circumvented under laws
+      fulfilling obligations under Article 11 of the WIPO Copyright
+      Treaty adopted on December 20, 1996, and/or similar international
+      agreements.
+
+   e. Exceptions and Limitations means fair use, fair dealing, and/or
+      any other exception or limitation to Copyright and Similar Rights
+      that applies to Your use of the Licensed Material.
+
+   f. Licensed Material means the artistic or literary work, database,
+      or other material to which the Licensor applied this Public
+      License.
+
+   g. Licensed Rights means the rights granted to You subject to the
+      terms and conditions of this Public License, which are limited to
+      all Copyright and Similar Rights that apply to Your use of the
+      Licensed Material and that the Licensor has authority to license.
+
+   h. Licensor means the individual(s) or entity(ies) granting rights
+      under this Public License.
+
+   i. NonCommercial means not primarily intended for or directed towards
+      commercial advantage or monetary compensation. For purposes of
+      this Public License, the exchange of the Licensed Material for
+      other material subject to Copyright and Similar Rights by digital
+      file-sharing or similar means is NonCommercial provided there is
+      no payment of monetary compensation in connection with the
+      exchange.
+
+   j. Share means to provide material to the public by any means or
+      process that requires permission under the Licensed Rights, such
+      as reproduction, public display, public performance, distribution,
+      dissemination, communication, or importation, and to make material
+      available to the public including in ways that members of the
+      public may access the material from a place and at a time
+      individually chosen by them.
+
+   k. Sui Generis Database Rights means rights other than copyright
+      resulting from Directive 96/9/EC of the European Parliament and of
+      the Council of 11 March 1996 on the legal protection of databases,
+      as amended and/or succeeded, as well as other essentially
+      equivalent rights anywhere in the world.
+
+   l. You means the individual or entity exercising the Licensed Rights
+      under this Public License. Your has a corresponding meaning.
+
+
+ Section 2 -- Scope.
+
+   a. License grant.
+
+        1. Subject to the terms and conditions of this Public License,
+           the Licensor hereby grants You a worldwide, royalty-free,
+           non-sublicensable, non-exclusive, irrevocable license to
+           exercise the Licensed Rights in the Licensed Material to:
+
+             a. reproduce and Share the Licensed Material, in whole or
+                in part, for NonCommercial purposes only; and
+
+             b. produce, reproduce, and Share Adapted Material for
+                NonCommercial purposes only.
+
+        2. Exceptions and Limitations. For the avoidance of doubt, where
+           Exceptions and Limitations apply to Your use, this Public
+           License does not apply, and You do not need to comply with
+           its terms and conditions.
+
+        3. Term. The term of this Public License is specified in Section
+           6(a).
+
+        4. Media and formats; technical modifications allowed. The
+           Licensor authorizes You to exercise the Licensed Rights in
+           all media and formats whether now known or hereafter created,
+           and to make technical modifications necessary to do so. The
+           Licensor waives and/or agrees not to assert any right or
+           authority to forbid You from making technical modifications
+           necessary to exercise the Licensed Rights, including
+           technical modifications necessary to circumvent Effective
+           Technological Measures. For purposes of this Public License,
+           simply making modifications authorized by this Section 2(a)
+           (4) never produces Adapted Material.
+
+        5. Downstream recipients.
+
+             a. Offer from the Licensor -- Licensed Material. Every
+                recipient of the Licensed Material automatically
+                receives an offer from the Licensor to exercise the
+                Licensed Rights under the terms and conditions of this
+                Public License.
+
+             b. No downstream restrictions. You may not offer or impose
+                any additional or different terms or conditions on, or
+                apply any Effective Technological Measures to, the
+                Licensed Material if doing so restricts exercise of the
+                Licensed Rights by any recipient of the Licensed
+                Material.
+
+        6. No endorsement. Nothing in this Public License constitutes or
+           may be construed as permission to assert or imply that You
+           are, or that Your use of the Licensed Material is, connected
+           with, or sponsored, endorsed, or granted official status by,
+           the Licensor or others designated to receive attribution as
+           provided in Section 3(a)(1)(A)(i).
+
+   b. Other rights.
+
+        1. Moral rights, such as the right of integrity, are not
+           licensed under this Public License, nor are publicity,
+           privacy, and/or other similar personality rights; however, to
+           the extent possible, the Licensor waives and/or agrees not to
+           assert any such rights held by the Licensor to the limited
+           extent necessary to allow You to exercise the Licensed
+           Rights, but not otherwise.
+
+        2. Patent and trademark rights are not licensed under this
+           Public License.
+
+        3. To the extent possible, the Licensor waives any right to
+           collect royalties from You for the exercise of the Licensed
+           Rights, whether directly or through a collecting society
+           under any voluntary or waivable statutory or compulsory
+           licensing scheme. In all other cases the Licensor expressly
+           reserves any right to collect such royalties, including when
+           the Licensed Material is used other than for NonCommercial
+           purposes.
+
+
+ Section 3 -- License Conditions.
+
+ Your exercise of the Licensed Rights is expressly made subject to the
+ following conditions.
+
+   a. Attribution.
+
+        1. If You Share the Licensed Material (including in modified
+           form), You must:
+
+             a. retain the following if it is supplied by the Licensor
+                with the Licensed Material:
+
+                  i. identification of the creator(s) of the Licensed
+                     Material and any others designated to receive
+                     attribution, in any reasonable manner requested by
+                     the Licensor (including by pseudonym if
+                     designated);
+
+                 ii. a copyright notice;
+
+                iii. a notice that refers to this Public License;
+
+                 iv. a notice that refers to the disclaimer of
+                     warranties;
+
+                  v. a URI or hyperlink to the Licensed Material to the
+                     extent reasonably practicable;
+
+             b. indicate if You modified the Licensed Material and
+                retain an indication of any previous modifications; and
+
+             c. indicate the Licensed Material is licensed under this
+                Public License, and include the text of, or the URI or
+                hyperlink to, this Public License.
+
+        2. You may satisfy the conditions in Section 3(a)(1) in any
+           reasonable manner based on the medium, means, and context in
+           which You Share the Licensed Material. For example, it may be
+           reasonable to satisfy the conditions by providing a URI or
+           hyperlink to a resource that includes the required
+           information.
+
+        3. If requested by the Licensor, You must remove any of the
+           information required by Section 3(a)(1)(A) to the extent
+           reasonably practicable.
+
+        4. If You Share Adapted Material You produce, the Adapter's
+           License You apply must not prevent recipients of the Adapted
+           Material from complying with this Public License.
+
+
+ Section 4 -- Sui Generis Database Rights.
+
+ Where the Licensed Rights include Sui Generis Database Rights that
+ apply to Your use of the Licensed Material:
+
+   a. for the avoidance of doubt, Section 2(a)(1) grants You the right
+      to extract, reuse, reproduce, and Share all or a substantial
+      portion of the contents of the database for NonCommercial purposes
+      only;
+
+   b. if You include all or a substantial portion of the database
+      contents in a database in which You have Sui Generis Database
+      Rights, then the database in which You have Sui Generis Database
+      Rights (but not its individual contents) is Adapted Material; and
+
+   c. You must comply with the conditions in Section 3(a) if You Share
+      all or a substantial portion of the contents of the database.
+
+ For the avoidance of doubt, this Section 4 supplements and does not
+ replace Your obligations under this Public License where the Licensed
+ Rights include other Copyright and Similar Rights.
+
+
+ Section 5 -- Disclaimer of Warranties and Limitation of Liability.
+
+   a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
+      EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
+      AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
+      ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
+      IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
+      WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
+      PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
+      ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
+      KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
+      ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
+
+   b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
+      TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
+      NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
+      INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
+      COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
+      USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
+      ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
+      DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
+      IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
+
+   c. The disclaimer of warranties and limitation of liability provided
+      above shall be interpreted in a manner that, to the extent
+      possible, most closely approximates an absolute disclaimer and
+      waiver of all liability.
+
+
+ Section 6 -- Term and Termination.
+
+   a. This Public License applies for the term of the Copyright and
+      Similar Rights licensed here. However, if You fail to comply with
+      this Public License, then Your rights under this Public License
+      terminate automatically.
+
+   b. Where Your right to use the Licensed Material has terminated under
+      Section 6(a), it reinstates:
+
+        1. automatically as of the date the violation is cured, provided
+           it is cured within 30 days of Your discovery of the
+           violation; or
+
+        2. upon express reinstatement by the Licensor.
+
+      For the avoidance of doubt, this Section 6(b) does not affect any
+      right the Licensor may have to seek remedies for Your violations
+      of this Public License.
+
+   c. For the avoidance of doubt, the Licensor may also offer the
+      Licensed Material under separate terms or conditions or stop
+      distributing the Licensed Material at any time; however, doing so
+      will not terminate this Public License.
+
+   d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
+      License.
+
+
+ Section 7 -- Other Terms and Conditions.
+
+   a. The Licensor shall not be bound by any additional or different
+      terms or conditions communicated by You unless expressly agreed.
+
+   b. Any arrangements, understandings, or agreements regarding the
+      Licensed Material not stated herein are separate from and
+      independent of the terms and conditions of this Public License.
+
+
+ Section 8 -- Interpretation.
+
+   a. For the avoidance of doubt, this Public License does not, and
+      shall not be interpreted to, reduce, limit, restrict, or impose
+      conditions on any use of the Licensed Material that could lawfully
+      be made without permission under this Public License.
+
+   b. To the extent possible, if any provision of this Public License is
+      deemed unenforceable, it shall be automatically reformed to the
+      minimum extent necessary to make it enforceable. If the provision
+      cannot be reformed, it shall be severed from this Public License
+      without affecting the enforceability of the remaining terms and
+      conditions.
+
+   c. No term or condition of this Public License will be waived and no
+      failure to comply consented to unless expressly agreed to by the
+      Licensor.
+
+   d. Nothing in this Public License constitutes or may be interpreted
+      as a limitation upon, or waiver of, any privileges and immunities
+      that apply to the Licensor or You, including from the legal
+      processes of any jurisdiction or authority.
+
+ =======================================================================
+
+ Creative Commons is not a party to its public
+ licenses. Notwithstanding, Creative Commons may elect to apply one of
+ its public licenses to material it publishes and in those instances
+ will be considered the "Licensor." The text of the Creative Commons
+ public licenses is dedicated to the public domain under the CC0 Public
+ Domain Dedication. Except for the limited purpose of indicating that
+ material is shared under a Creative Commons public license or as
+ otherwise permitted by the Creative Commons policies published at
+ creativecommons.org/policies, Creative Commons does not authorize the
+ use of the trademark "Creative Commons" or any other trademark or logo
+ of Creative Commons without its prior written consent including,
+ without limitation, in connection with any unauthorized modifications
+ to any of its public licenses or any other arrangements,
+ understandings, or agreements concerning use of licensed material. For
+ the avoidance of doubt, this paragraph does not form part of the
+ public licenses.
+
+ Creative Commons may be contacted at creativecommons.org.
MODEL_CARD.md ADDED
@@ -0,0 +1,26 @@
+ # Model card - tox21_snn_classifier
+ ### Model details
+ - Model name: Self-Normalizing Neural Network Tox21 Baseline
+ - Developer: JKU (Linz)
+ - Paper URL: https://proceedings.neurips.cc/paper_files/paper/2017/hash/5d44ee6f2c3f71b73125876103c8f6c4-Abstract.html
+ - Model type / architecture:
+   - Self-Normalizing Neural Network implemented using PyTorch.
+   - Hyperparameters: https://huggingface.co/spaces/ml-jku/tox21_snn_classifier/blob/main/config/config.json
+   - A multitask network is trained for all Tox21 targets.
+   - Inference: Access via FastAPI endpoint. Upon receiving a Tox21 prediction request, the model generates and returns predictions for all Tox21 targets simultaneously.
+ - Model version: v0
+ - Model date: 14.10.2025
+ - Reproducibility: Code for full training is available and enables retraining from
+   scratch.
+
+ ### Intended use
+ This model serves as a baseline benchmark for evaluating and comparing toxicity prediction methods across the 12 pathway assays of the Tox21 dataset. It is not intended for clinical decision-making without experimental validation.
+
+ ### Metric
+ Each Tox21 task is evaluated using the area under the receiver operating characteristic curve (AUC). Overall performance is reported as the mean AUC across all individual tasks.
+
+ ### Training data
+ Tox21 training and validation sets.
+
+ ### Evaluation data
+ Tox21 test set.
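As a worked example of this metric, a minimal sketch using scikit-learn's `roc_auc_score` (scikit-learn is already in `requirements.txt`). The NaN masking of unmeasured assay labels is an assumption about how the sparse Tox21 labels are handled:

```python
import numpy as np
from sklearn.metrics import roc_auc_score

def mean_auc(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """y_true: (n_samples, 12) 0/1 labels with NaN for unmeasured assays;
    y_pred: (n_samples, 12) predicted probabilities."""
    aucs = []
    for task in range(y_true.shape[1]):
        mask = ~np.isnan(y_true[:, task])  # evaluate each task only on labeled samples
        aucs.append(roc_auc_score(y_true[mask, task], y_pred[mask, task]))
    return float(np.mean(aucs))  # overall score: mean AUC over the 12 tasks
```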
README.md CHANGED
@@ -1,7 +1,7 @@
  ---
- title: Tox21 Snn Classifier
- emoji: 🏢
- colorFrom: yellow
+ title: Tox21 SNN Classifier
+ emoji: 🌖
+ colorFrom: green
  colorTo: pink
  sdk: docker
  pinned: false
@@ -9,4 +9,97 @@ license: cc-by-nc-4.0
  short_description: Self-Normalizing Neural Network Baseline for Tox21
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Tox21 SNN Classifier
+
+ This repository hosts a Hugging Face Space that provides an API for submitting models to the [Tox21 Leaderboard](https://huggingface.co/spaces/ml-jku/tox21_leaderboard).
+
+ Here, a [self-normalizing network (SNN)](https://arxiv.org/abs/1706.02515) is trained on the Tox21 dataset, and the trained models are provided for
+ inference. The model input is a SMILES string of a small molecule, and the output is 12 numeric values, one for
+ each of the toxic effects of the Tox21 dataset.
+
+
+ **Important:** For leaderboard submission, your Space needs to include training code. The file `train.py` should train the model using the config specified inside the `config/` folder and save the final model parameters into a file inside the `checkpoints/` folder. The model should be trained using the [Tox21_dataset](https://huggingface.co/datasets/tschouis/tox21) provided on Hugging Face. The datasets can be loaded like this:
+ ```python
+ from datasets import load_dataset
+ ds = load_dataset("ml-jku/tox21", token=token)
+ train_df = ds["train"].to_pandas()
+ val_df = ds["validation"].to_pandas()
+ ```
+
+ Additionally, the Space needs to implement inference in the `predict()` function inside `predict.py`. The `predict()` function must keep the provided skeleton: it takes a list of SMILES strings as input and returns a nested prediction dictionary as output, with SMILES as keys and dictionaries of target-name/prediction pairs as values (a minimal sketch of this skeleton is provided right after this file's diff). Consequently, any preprocessing of SMILES strings must be executed on the fly during inference.
+
+ # Repository Structure
+ - `predict.py` - Defines the `predict()` function required by the leaderboard (entry point for inference).
+ - `app.py` - FastAPI application wrapper (can be used as-is).
+ - `preprocess.py` - Preprocesses SMILES strings to generate feature descriptors and saves the results as NPZ files in `data/`.
+ - `train.py` - Trains and saves a model using the config in the `config/` folder.
+ - `config/` - The config file used by `train.py`.
+ - `logs/` - All logs of `train.py`, the saved model, and predictions on the validation set.
+ - `data/` - The SNN uses numerical data. During preprocessing in `preprocess.py`, two NPZ files containing molecule features are created and saved here.
+ - `checkpoints/` - The saved model used by `predict.py` lives here.
+
+ - `src/` - Core model & preprocessing logic:
+   - `preprocess.py` - SMILES preprocessing logic
+   - `model.py` - SNN model class with processing, saving, and loading logic
+   - `utils.py` - utility functions
+
+ # Quickstart with Spaces
+
+ You can easily adapt this project in your own Hugging Face account:
+
+ - Open this Space on Hugging Face.
+
+ - Click "Duplicate this Space" (top-right corner).
+
+ - Modify `src/` for your preprocessing pipeline and model class.
+
+ - Modify `predict()` inside `predict.py` to perform model inference while keeping the function skeleton unchanged to remain compatible with the leaderboard.
+
+ - Modify `train.py` and/or `preprocess.py` according to your model and preprocessing pipeline.
+
+ - Modify the file inside `config/` to contain all hyperparameters that are set in `train.py`.
+
+ That's it: your model will be available as an API endpoint for the Tox21 Leaderboard.
+
+ # Installation
+ To run (and train) the SNN, clone the repository and install the dependencies:
+
+ ```bash
+ git clone https://huggingface.co/spaces/ml-jku/tox21_snn_classifier
+ cd tox21_snn_classifier
+
+ conda create -n tox21_snn_cls python=3.11
+ conda activate tox21_snn_cls
+ pip install -r requirements.txt
+ ```
+
+ # Inference
+
+ For inference, you only need `predict.py`.
+
+ Example usage inside Python:
+
+ ```python
+ from predict import predict
+
+ smiles_list = ["CCO", "c1ccccc1", "CC(=O)O"]
+ results = predict(smiles_list)
+
+ print(results)
+ ```
+
+ The output will be a nested dictionary of per-target scores in the format:
+
+ ```python
+ {
+     "CCO": {"target1": 0.02, "target2": 0.91, ..., "target12": 0.13},
+     "c1ccccc1": {"target1": 0.87, "target2": 0.04, ..., "target12": 0.65},
+     "CC(=O)O": {"target1": 0.01, "target2": 0.08, ..., "target12": 0.02}
+ }
+ ```
+
+ # Notes
+
+ - Adapting `predict.py`, `train.py`, `config/`, and `checkpoints/` is required for leaderboard submission.
+
+ - Preprocessing must be done inside `predict.py`, not just `train.py`.
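For reference, here is a minimal sketch of the skeleton that `predict()` must keep. The constant 0.5 score and the generic `target1`..`target12` names are placeholders, not the SNN's actual inference logic (see `predict.py` in this commit for the real implementation):

```python
def predict(smiles_list: list[str]) -> dict[str, dict[str, float]]:
    """Leaderboard entry point: list of SMILES in, nested score dict out."""
    targets = [f"target{i}" for i in range(1, 13)]  # placeholders; the real code uses the 12 Tox21 assay names
    predictions: dict[str, dict[str, float]] = {}
    for smiles in smiles_list:
        # a real implementation preprocesses `smiles` on the fly and scores it with the model here
        predictions[smiles] = {t: 0.5 for t in targets}
    return predictions
```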
app.py ADDED
@@ -0,0 +1,78 @@
+ """
+ This is the main entry point for the FastAPI application.
+ The app handles requests to predict toxicity for a list of SMILES strings.
+ """
+
+ # ---------------------------------------------------------------------------------------
+ # Dependencies and global variable definition
+ import os
+ from typing import List, Dict, Optional
+ from fastapi import FastAPI, Header, HTTPException
+ from pydantic import BaseModel, Field
+
+ from predict import predict as predict_func
+
+ API_KEY = os.getenv("API_KEY")  # set via Space Secrets
+
+
+ # ---------------------------------------------------------------------------------------
+ class Request(BaseModel):
+     smiles: List[str] = Field(min_items=1, max_items=1000)
+
+
+ class Response(BaseModel):
+     predictions: dict
+     model_info: Dict[str, str] = {}
+
+
+ app = FastAPI(title="toxicity-api")
+
+
+ @app.get("/")
+ def root():
+     return {
+         "message": "Toxicity Prediction API",
+         "endpoints": {
+             "/metadata": "GET - API metadata and capabilities",
+             "/healthz": "GET - Health check",
+             "/predict": "POST - Predict toxicity for SMILES",
+         },
+         "usage": "Send POST to /predict with {'smiles': ['your_smiles_here']} and Authorization header",
+     }
+
+
+ @app.get("/metadata")
+ def metadata():
+     return {
+         "name": "SNN",
+         "version": "1.0.0",
+         "max_batch_size": 256,
+         "tox_endpoints": [
+             "NR-AR",
+             "NR-AR-LBD",
+             "NR-AhR",
+             "NR-Aromatase",
+             "NR-ER",
+             "NR-ER-LBD",
+             "NR-PPAR-gamma",
+             "SR-ARE",
+             "SR-ATAD5",
+             "SR-HSE",
+             "SR-MMP",
+             "SR-p53",
+         ],
+     }
+
+
+ @app.get("/healthz")
+ def healthz():
+     return {"ok": True}
+
+
+ @app.post("/predict", response_model=Response)
+ def predict(request: Request):
+     predictions = predict_func(request.smiles)
+     return {
+         "predictions": predictions,
+         "model_info": {"name": "SNN", "version": "1.0.0"},
+     }
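Once the Space (or a local `uvicorn app:app` server) is running, the endpoint can be exercised as sketched below. The base URL, the Bearer scheme, and the use of `requests` (a client-side dependency, not part of this Space's `requirements.txt`) are assumptions; note that this baseline app reads `API_KEY` but never actually enforces it:

```python
import requests

url = "http://localhost:7860/predict"  # a deployed Space has its own URL
payload = {"smiles": ["CCO", "c1ccccc1"]}
headers = {"Authorization": "Bearer <API_KEY>"}  # mirrors the usage hint in root(); not validated by this app

resp = requests.post(url, json=payload, headers=headers)
resp.raise_for_status()
print(resp.json()["predictions"]["CCO"])  # dict of 12 per-target scores for ethanol
```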
config/config.json ADDED
@@ -0,0 +1,36 @@
+ {
+     "seed": 0,
+     "debug": "false",
+     "device": "cpu",
+
+     "log_folder": "logs/",
+
+     "data_folder": "data/",
+     "cvfold": 4,
+     "ecfp": {
+         "radius": 3,
+         "fpsize": 8192
+     },
+     "merge_train_val": "false",
+     "descriptors": ["ecfps", "rdkit_descrs", "maccs", "tox"],
+     "feature_selection": {
+         "use": "true",
+         "min_var": 0.05,
+         "max_corr": 1,
+         "max_features": -1,
+         "min_var__feature_keys": ["ecfps", "tox"],
+         "max_corr__feature_keys": ["ecfps", "tox"],
+         "min_var__independent_keys": "true",
+         "max_corr__independent_keys": "true"
+     },
+     "feature_quantilization": {
+         "use": "true",
+         "feature_keys": ["rdkit_descrs"]
+     },
+     "max_samples": -1,
+     "scaler": "squash",
+     "preprocessor_path": "checkpoints/preprocessor.joblib",
+
+     "ckpt_path": "checkpoints/snn_ckpt.pth",
+     "model_config": "none"
+ }
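Note that the boolean-like values are stored as the strings "true"/"false"/"none". Both `predict.py` and `preprocess.py` pass the loaded dict through `normalize_config` before use, which evidently maps these strings to Python values; the helper below is a hedged sketch of that assumed behavior, not the actual `src/utils.py` implementation (which is not shown in this commit):

```python
import json

def normalize_config(config: dict) -> dict:
    """Recursively map "true"/"false"/"none" strings to True/False/None (assumed behavior)."""
    mapping = {"true": True, "false": False, "none": None}
    normalized = {}
    for key, value in config.items():
        if isinstance(value, dict):
            normalized[key] = normalize_config(value)
        elif isinstance(value, str) and value.lower() in mapping:
            normalized[key] = mapping[value.lower()]
        else:
            normalized[key] = value
    return normalized

with open("config/config.json") as f:
    config = normalize_config(json.load(f))
assert config["feature_selection"]["use"] is True
```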
data/tox_smarts.json ADDED
The diff for this file is too large to render. See raw diff
 
predict.py ADDED
@@ -0,0 +1,101 @@
+ """
+ This file includes the predict function for Tox21.
+ As input it takes a list of SMILES, and it outputs a nested dictionary with
+ SMILES and target names as keys.
+ """
+
+ # ---------------------------------------------------------------------------------------
+ # Dependencies
+ from collections import defaultdict
+
+ import numpy as np
+
+ import json
+ import joblib
+ import torch
+
+ from src.model import Tox21SNNClassifier, SNNConfig
+ from src.preprocess import create_descriptors, FeaturePreprocessor
+ from src.utils import TASKS, normalize_config
+
+ # ---------------------------------------------------------------------------------------
+ CONFIG_FILE = "./config/config.json"
+
+
+ def predict(
+     smiles_list: list[str], default_prediction=0.5
+ ) -> dict[str, dict[str, float]]:
+     """Applies the classifier to a list of SMILES strings. Returns `default_prediction`
+     (0.5 by default) for any molecule that could not be cleaned.
+
+     Args:
+         smiles_list (list[str]): list of SMILES strings
+
+     Returns:
+         dict: nested prediction dictionary, following {'<smiles>': {'<target>': <pred>}}
+     """
+     print(f"Received {len(smiles_list)} SMILES strings")
+     # preprocessing pipeline
+     with open(CONFIG_FILE, "r") as f:
+         config = json.load(f)
+     config = normalize_config(config)
+
+     features, is_clean = create_descriptors(
+         smiles_list, config["descriptors"], **config["ecfp"]
+     )
+     print(f"Created descriptors for {sum(is_clean)} molecules.")
+     print(f"{len(is_clean) - sum(is_clean)} molecules removed during cleaning")
+
+     # setup preprocessor
+     preprocessor = FeaturePreprocessor(
+         feature_selection_config=config["feature_selection"],
+         feature_quantilization_config=config["feature_quantilization"],
+         descriptors=config["descriptors"],
+         max_samples=config["max_samples"],
+         scaler=config["scaler"],
+     )
+
+     preprocessor_ckpt = joblib.load(config["preprocessor_path"])
+     preprocessor.set_state(preprocessor_ckpt["preprocessor"])
+     print(f"Loaded preprocessor from {config['preprocessor_path']}")
+
+     features = {descr: array[is_clean] for descr, array in features.items()}
+     features = preprocessor.transform(features)
+
+     dataset = torch.utils.data.TensorDataset(torch.FloatTensor(features))
+     loader = torch.utils.data.DataLoader(
+         dataset, batch_size=256, shuffle=False, num_workers=0
+     )
+
+     # setup model
+     cfg = SNNConfig(
+         hidden_dim=512,
+         n_layers=8,
+         dropout=0.05,
+         layer_form="rect",
+         in_features=features.shape[1],
+         out_features=12,
+     )
+
+     model = Tox21SNNClassifier(cfg)
+     model.load_model(config["ckpt_path"])
+     model.eval()
+     print(f"Loaded model from {config['ckpt_path']}")
+
+     predictions = defaultdict(dict)
+
+     print("Creating predictions:")
+     with torch.no_grad():
+         preds = np.concatenate([model.predict(batch[0]) for batch in loader], axis=0)
+
+     for i, target in enumerate(model.tasks):
+         target_preds = np.empty_like(is_clean, dtype=float)
+
+         # cleaned molecules get the model score, uncleanable ones the default
+         target_preds[~is_clean] = default_prediction
+         target_preds[is_clean] = preds[:, i]
+
+         for smiles, pred in zip(smiles_list, target_preds):
+             predictions[smiles][target] = float(pred)
+
+     return predictions
preprocess.py ADDED
@@ -0,0 +1,68 @@
+ """
+ This file includes the data preprocessing for Tox21.
+ It creates molecule descriptors and labels for the train and validation splits
+ and saves them as NPZ files inside the data folder.
+ """
+
+ import os
+ import json
+ import argparse
+
+ import numpy as np
+
+ from src.preprocess import create_descriptors, get_tox21_split
+ from src.utils import TASKS, HF_TOKEN, create_dir, normalize_config
+
+ parser = argparse.ArgumentParser(
+     description="Data preprocessing script for the Tox21 dataset"
+ )
+
+ parser.add_argument(
+     "--config",
+     type=str,
+     default="config/config.json",
+ )
+
+
+ def main(config):
+     """Create molecule descriptors for the HF Tox21 dataset"""
+     ds = get_tox21_split(HF_TOKEN, cvfold=config["cvfold"])
+
+     splits = ["train", "validation"]
+     for split in splits:
+
+         print(f"Preprocess {split} molecules")
+
+         ds_split = ds[split]
+         smiles = list(ds_split["smiles"])
+
+         features, clean_mol_mask = create_descriptors(
+             smiles, config["descriptors"], **config["ecfp"]
+         )
+
+         labels = []
+         for task in TASKS:
+             labels.append(ds_split[task].to_numpy())
+         labels = np.stack(labels, axis=1)
+
+         # note: the filename hardcodes cv4 even though cvfold is configurable
+         save_path = os.path.join(config["data_folder"], f"tox21_{split}_cv4.npz")
+         with open(save_path, "wb") as f:
+             np.savez(
+                 f,
+                 clean_mol_mask=clean_mol_mask,
+                 labels=labels,
+                 **features,
+             )
+         print(f"Saved preprocessed {split} split under {save_path}")
+     print("Preprocessing finished successfully")
+
+
+ if __name__ == "__main__":
+     args = parser.parse_args()
+
+     with open(args.config, "r") as f:
+         config = json.load(f)
+     config = normalize_config(config)
+
+     create_dir(config["data_folder"])
+     main(config)
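Run the script from the repository root; `--config` defaults to `config/config.json`:

```bash
python preprocess.py --config config/config.json
# writes data/tox21_train_cv4.npz and data/tox21_validation_cv4.npz
```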
requirements.txt ADDED
@@ -0,0 +1,12 @@
+ fastapi
+ uvicorn[standard]
+ statsmodels==0.14.5
+ rdkit==2025.03.5
+ numpy==2.2.6
+ scikit-learn==1.6.1
+ joblib
+ tabulate
+ datasets==4.0.0
+ scipy==1.16.1
+ pandas==2.3.2
+ torch==2.8.0
src/__init__.py ADDED
File without changes
src/model.py ADDED
@@ -0,0 +1,126 @@
+ """
+ This file includes the SNN model for Tox21.
+ It maps preprocessed molecule feature vectors to toxicity predictions for the
+ 12 Tox21 targets.
+ """
+
+ # ---------------------------------------------------------------------------------------
+ # Dependencies
+ from typing import Literal
+
+ from dataclasses import dataclass
+
+ import numpy as np
+
+ import torch
+ import torch.nn as nn
+
+ from .utils import TASKS
+
+
+ # ---------------------------------------------------------------------------------------
+ @dataclass
+ class SNNConfig:
+     hidden_dim: int
+     n_layers: int
+     dropout: float
+     layer_form: Literal["conic", "rect"]
+     in_features: int
+     out_features: int
+
+
+ class Tox21SNNClassifier(nn.Module):
+     """An SNN classifier that assigns a toxicity score to a given SMILES string."""
+
+     def __init__(self, config: SNNConfig):
+         """Initialize a multitask SNN classifier for the 12 Tox21 tasks.
+
+         Args:
+             config (SNNConfig): architecture hyperparameters.
+         """
+         super(Tox21SNNClassifier, self).__init__()
+
+         self.tasks = TASKS
+         self.num_tasks = len(TASKS)
+
+         activation = nn.SELU()
+         dropout = nn.AlphaDropout(p=config.dropout)
+
+         # "conic": layer widths shrink geometrically from hidden_dim towards
+         # out_features; "rect": all hidden layers share the same width.
+         n_hidden = (
+             (
+                 config.hidden_dim
+                 * np.power(
+                     np.power(
+                         config.out_features / config.hidden_dim, 1 / (config.n_layers)
+                     ),
+                     range(-1, config.n_layers),
+                 )
+             ).astype(int)
+             if config.layer_form == "conic"
+             else [config.hidden_dim] * (config.n_layers + 1)
+         )
+
+         n_hidden[0] = config.in_features
+         n_hidden[config.n_layers] = config.out_features
+
+         layers = []
+         for l in range(config.n_layers + 1):
+             fc = nn.Linear(
+                 in_features=n_hidden[l],
+                 out_features=(
+                     n_hidden[config.n_layers]
+                     if l == config.n_layers
+                     else n_hidden[l + 1]
+                 ),
+             )
+             if l < config.n_layers:
+                 block = [
+                     fc,
+                     activation,
+                     dropout,
+                 ]
+             else:  # last layer
+                 block = [fc]
+             layers.extend(block)
+
+         self.model = nn.Sequential(*layers)
+         self.config = config
+
+         self.reset_parameters()
+
+     def reset_parameters(self):
+         for param in self.model.parameters():
+             # biases zero
+             if len(param.shape) == 1:
+                 nn.init.constant_(param, 0)
+             # weights use lecun-normal initialization (fan-in, linear gain)
+             else:
+                 nn.init.kaiming_normal_(param, mode="fan_in", nonlinearity="linear")
+
+     def forward(self, x) -> torch.Tensor:
+         x = self.model(x)
+         return x  # x.view(x.size(0), self.num_tasks)
+
+     def load_model(self, path: str):
+         state_dict = torch.load(
+             path, weights_only=False, map_location=torch.device("cpu")
+         )["model"]
+         self.load_state_dict(state_dict)
+         self.eval()
+
+     @torch.no_grad()
+     def predict(self, features: torch.Tensor) -> np.ndarray:
+         """Predicts labels for all Tox21 targets from molecule features.
+
+         Args:
+             features (torch.Tensor): 2D batch of molecule features
+
+         Returns:
+             np.ndarray: predicted probability of the positive class per target
+         """
+         assert (
+             len(features.shape) == 2
+         ), f"Function expects 2D torch.Tensor. Current shape: {features.shape}"
+
+         return torch.nn.functional.sigmoid(self.model(features)).detach().cpu().numpy()
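To make the "conic" width schedule concrete, here is a small sketch of the geometric progression the constructor computes. The values (`hidden_dim=512`, `n_layers=4`, `in_features=1024`) are hypothetical, and note that this Space's `predict.py` actually uses `layer_form="rect"`:

```python
import numpy as np

hidden_dim, n_layers, in_features, out_features = 512, 4, 1024, 12  # hypothetical values
ratio = (out_features / hidden_dim) ** (1 / n_layers)  # per-layer geometric shrink factor
n_hidden = (hidden_dim * ratio ** np.arange(-1, n_layers)).astype(int)
print(n_hidden)  # [1308  512  200   78   30]
n_hidden[0] = in_features          # first width is overwritten by the input dimension
n_hidden[n_layers] = out_features  # last width is overwritten by the output dimension
print(n_hidden)  # [1024  512  200   78   12]
```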
src/preprocess.py ADDED
@@ -0,0 +1,670 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import json
3
+ from typing import Any
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+ from datasets import load_dataset
9
+ from sklearn.base import BaseEstimator, TransformerMixin
10
+ from sklearn.feature_selection import VarianceThreshold
11
+ from sklearn.preprocessing import StandardScaler, FunctionTransformer
12
+ from statsmodels.distributions.empirical_distribution import ECDF
13
+
14
+ from rdkit import Chem, DataStructs
15
+ from rdkit.Chem import Descriptors, rdFingerprintGenerator, MACCSkeys
16
+ from rdkit.Chem.rdchem import Mol
17
+
18
+ from .utils import USED_200_DESCR, TOX_SMARTS_PATH, Standardizer, FeatureDictMixin
19
+
20
+
21
+ class SquashScaler(TransformerMixin, BaseEstimator):
22
+ """
23
+ Scaler that performs sequential standardization, nonlinearity (tanh), and
24
+ re-standardization. Inspired by DeepTox (Mayr et al., 2016)
25
+ """
26
+
27
+ def __init__(self):
28
+ self.scaler1 = StandardScaler()
29
+ self.scaler2 = StandardScaler()
30
+
31
+ def fit(self, X):
32
+ _X = X.copy()
33
+ _X = self.scaler1.fit_transform(_X)
34
+ _X = np.tanh(_X)
35
+ _X = self.scaler2.fit(_X)
36
+ self.is_fitted_ = True
37
+ return self
38
+
39
+ def transform(self, X):
40
+ _X = X.copy()
41
+ _X = self.scaler1.transform(_X)
42
+ _X = np.tanh(_X)
43
+ return self.scaler2.transform(_X)
44
+
45
+
46
+ SCALER_REGISTRY = {
47
+ None: FunctionTransformer,
48
+ "standard": StandardScaler,
49
+ "squash": SquashScaler,
50
+ }
51
+
52
+
53
+ class SubSampler(TransformerMixin, BaseEstimator):
54
+ """
55
+ Preprocessor that randomly samples `max_samples` from data.
56
+
57
+ Args:
58
+ max_samples (int): Maximum allowed samples. If -1, all samples are retained.
59
+
60
+ Input:
61
+ np.ndarray: A 2D NumPy array of shape (n_samples, n_features).
62
+
63
+ Output:
64
+ np.ndarray: Subsampled array of shape (min(n_samples, max_samples), n_features).
65
+ """
66
+
67
+ def __init__(self, *, max_samples=-1):
68
+ self.max_samples = max_samples
69
+ self.is_fitted_ = True
70
+
71
+ def fit(self, X: np.ndarray, y: np.ndarray | None = None):
72
+ return self
73
+
74
+ def transform(
75
+ self, X: np.ndarray, y: np.ndarray | None = None
76
+ ) -> np.ndarray | tuple[np.ndarray]:
77
+
78
+ _X = X.copy()
79
+ _y = y.copy() if y is not None else None
80
+
81
+ if self.max_samples > 0 and _X.shape[0] > self.max_samples:
82
+ resample_idxs = np.random.choice(
83
+ np.arange(_X.shape[0]), size=(self.max_samples,), replace=True
84
+ )
85
+ _X = _X[resample_idxs]
86
+ _y = _y[resample_idxs] if _y is not None else None
87
+
88
+ if _y is None:
89
+ return _X
90
+ return _X, _y
91
+
92
+
93
+ class FeatureSelector(FeatureDictMixin, TransformerMixin, BaseEstimator):
94
+ """
95
+ Preprocessor that performs feature selection based on variance and correlation.
96
+
97
+ This transformer selects features that:
98
+ 1. Have variance above a specified threshold.
99
+ 2. Are below a given pairwise correlation threshold.
100
+ 3. Among the remaining features, keeps only the top `max_features` with the highest variance.
101
+
102
+ The input and output are both dictionaries mapping feature types to their corresponding
103
+ feature matrices.
104
+
105
+ Args:
106
+ min_var (float): Minimum variance required for a feature to be retained.
107
+ max_corr (float): Maximum allowed correlation between features.
108
+ Features exceeding this threshold with others are removed.
109
+ max_features (int): Maximum number of features to keep after filtering.
110
+ If -1, all remaining features are retained.
111
+ feature_keys (list[str]): Features to apply feature selection to.
112
+ independent_keys (bool): Apply filtering only within features types.
113
+
114
+ Input:
115
+ dict[str, np.ndarray]: A dictionary where each key corresponds to a feature type
116
+ and each value is a 2D NumPy array of shape (n_samples, n_features).
117
+
118
+ Output:
119
+ dict[str, np.ndarray]: A dictionary with the same keys as the input,
120
+ containing only the selected features for each feature type.
121
+ """
122
+
123
+ def __init__(
124
+ self,
125
+ *,
126
+ min_var=0.0,
127
+ max_corr=1.0,
128
+ max_features=-1,
129
+ feature_keys=None,
130
+ min_var__feature_keys=None,
131
+ max_corr__feature_keys=None,
132
+ max_features__feature_keys=None,
133
+ min_var__independent_keys=False,
134
+ max_corr__independent_keys=False,
135
+ max_features__independent_keys=False,
136
+ ):
137
+ self.min_var = min_var
138
+ self.max_corr = max_corr
139
+ self.max_features = max_features
140
+
141
+ self.min_var__feature_keys = min_var__feature_keys
142
+ self.max_corr__feature_keys = max_corr__feature_keys
143
+ self.max_features__feature_keys = max_features__feature_keys
144
+
145
+ self.min_var__independent_keys = min_var__independent_keys
146
+ self.max_corr__independent_keys = max_corr__independent_keys
147
+ self.max_features__independent_keys = max_features__independent_keys
148
+
149
+ super().__init__(feature_keys=feature_keys)
150
+
151
+ def _get_min_var_mask(self, X: np.ndarray, *args) -> np.ndarray:
152
+ var_thresh = VarianceThreshold(threshold=self.min_var)
153
+ return var_thresh.fit(X).get_support() # mask
154
+
155
+ def _get_max_corr_mask(
156
+ self, X: np.ndarray, prev_feature_mask: np.ndarray
157
+ ) -> np.ndarray:
158
+ _prev_feature_mask = prev_feature_mask.copy()
159
+ corr_matrix = np.corrcoef(X[:, _prev_feature_mask], rowvar=False)
160
+ upper_tri = np.triu(corr_matrix, k=1)
161
+ to_keep = np.ones((sum(_prev_feature_mask),), dtype=bool)
162
+ for i in range(upper_tri.shape[0]):
163
+ for j in range(upper_tri.shape[1]):
164
+ if upper_tri[i, j] > self.max_corr:
165
+ to_keep[j] = False
166
+
167
+ _prev_feature_mask[_prev_feature_mask] = to_keep
168
+ return _prev_feature_mask
169
+
170
+ def _get_max_features_mask(
171
+ self, X: np.ndarray, prev_feature_mask: np.ndarray
172
+ ) -> np.ndarray:
173
+ _prev_feature_mask = prev_feature_mask.copy()
174
+ # select features with at least max_var variation
175
+ feature_vars = np.nanvar(X[:, _prev_feature_mask], axis=0)
176
+ order = np.argsort(feature_vars)[: -(self.max_features + 1) : -1]
177
+ keep_feat_idx = np.arange(len(_prev_feature_mask))[order]
178
+ _prev_feature_mask = np.isin(
179
+ np.arange(len(_prev_feature_mask)), keep_feat_idx, assume_unique=True
180
+ )
181
+ return _prev_feature_mask
182
+
183
+ def apply_filter(self, filter, X, prev_feature_mask):
184
+ mask = prev_feature_mask.copy()
185
+ func = self.__getattribute__(f"_get_{filter}_mask")
186
+ feature_keys = self.__getattribute__(f"{filter}__feature_keys")
187
+
188
+ if self.__getattribute__(f"{filter}__independent_keys"):
189
+ for key in feature_keys:
190
+ key_mask = self._curr_keys == key
191
+ mask[key_mask] = func(X[:, key_mask], mask[key_mask])
192
+
193
+ else:
194
+ feature_key_mask = np.isin(self._curr_keys, feature_keys)
195
+ mask[feature_key_mask] = func(
196
+ X[:, feature_key_mask], mask[feature_key_mask]
197
+ )
198
+ return mask
199
+
200
+ def fit(self, X: dict[str, np.ndarray]):
201
+ _X = self.dict_to_array(X)
202
+ feature_mask = np.ones((_X.shape[1]), dtype=bool)
203
+
204
+ # select features with at least min_var variation
205
+ if self.min_var > 0.0:
206
+ if self.min_var__independent_keys:
207
+ for key in self.min_var__feature_keys:
208
+ key_mask = self._curr_keys == key
209
+ feature_mask[key_mask] = self._get_min_var_mask(_X[:, key_mask])
210
+
211
+ else:
212
+ feature_key_mask = np.isin(self._curr_keys, self.min_var__feature_keys)
213
+ feature_mask[feature_key_mask] = self._get_min_var_mask(
214
+ _X[:, feature_key_mask]
215
+ )
216
+
217
+ # select features with at least max_var variation
218
+ if self.max_corr < 1.0:
219
+ if self.max_corr__independent_keys:
220
+ for key in self.max_corr__feature_keys:
221
+ key_mask = self._curr_keys == key
222
+ subset = _X[:, key_mask]
223
+ feature_mask[key_mask] = self._get_max_corr_mask(
224
+ subset, feature_mask[key_mask]
225
+ )
226
+ else:
227
+ feature_key_mask = np.isin(self._curr_keys, self.max_corr__feature_keys)
228
+ feature_mask[feature_key_mask] = self._get_max_corr_mask(
229
+ _X[:, feature_key_mask], feature_mask[feature_key_mask]
230
+ )
231
+
232
+ if self.max_features == 0:
233
+ raise ValueError(
234
+ f"max_features (={self.max_features}) must be -1 or larger 0."
235
+ )
236
+ elif self.max_features > 0:
237
+ if self.max_features__independent_keys:
238
+ for key in self.max_features__feature_keys:
239
+ key_mask = self._curr_keys == key
240
+ feature_mask[key_mask] = self._get_max_features_mask(
241
+ _X[:, key_mask], feature_mask[key_mask]
242
+ )
243
+ else:
244
+ feature_key_mask = np.isin(
245
+ self._curr_keys, self.max_features__feature_keys
246
+ )
247
+ feature_mask[feature_key_mask] = self._get_max_features_mask(
248
+ _X[:, feature_key_mask], feature_mask[feature_key_mask]
249
+ )
250
+
251
+ self._feature_mask = feature_mask
252
+ self.is_fitted_ = True
253
+ return self
254
+
255
+ def transform(self, X: dict[str, np.ndarray]) -> dict[str, np.ndarray]:
256
+ _X = self.dict_to_array(X)
257
+ _X = _X[:, self._feature_mask]
258
+ self._curr_keys = self._curr_keys[self._feature_mask]
259
+ return self.array_to_dict(_X)
260
+
261
+
262
+ class QuantileCreator(FeatureDictMixin, TransformerMixin, BaseEstimator):
263
+ """
264
+ Preprocessor that transforms features into empirical quantiles using ECDFs.
265
+
266
+ This transformer applies an Empirical Cumulative Distribution Function (ECDF)
267
+ to each feature and replaces feature values with their corresponding quantile
268
+ ranks. The transformation is applied independently to each feature type.
269
+
270
+ Both input and output are dictionaries mapping feature types to their
271
+ corresponding feature matrices.
272
+
273
+ Args:
274
+ feature_keys (list[str]): Features to apply quantile creation to.
275
+
276
+ Input:
277
+ dict[str, np.ndarray]: A dictionary where each key corresponds to a feature type
278
+ and each value is a 2D NumPy array of shape (n_samples, n_features).
279
+
280
+ Output:
281
+ dict[str, np.ndarray]: A dictionary with the same keys as the input,
282
+ where each feature value is replaced by its corresponding ECDF quantile rank.
283
+ """
284
+
285
+ def __init__(self, *, feature_keys=None):
286
+ self._ecdfs = None
287
+ super().__init__(feature_keys=feature_keys)
288
+
289
+ def fit(self, X: dict[str, np.ndarray]):
290
+ _X = self.dict_to_array(X)
291
+ ecdfs = []
292
+ for column in range(_X.shape[1]):
293
+ raw_values = _X[:, column].reshape(-1)
294
+ ecdfs.append(ECDF(raw_values))
295
+ self._ecdfs = ecdfs
296
+ self.is_fitted_ = True
297
+ return self
298
+
299
+ def transform(self, X: dict[str, np.ndarray]) -> np.ndarray:
300
+ _X = self.dict_to_array(X)
301
+
302
+ quantiles = np.zeros_like(_X)
303
+ for column in range(_X.shape[1]):
304
+ raw_values = _X[:, column].reshape(-1)
305
+ ecdf = self._ecdfs[column]
306
+ q = ecdf(raw_values)
307
+ quantiles[:, column] = q
308
+
309
+ return self.array_to_dict(quantiles)
310
+
311
+
312
+ class FeaturePreprocessor(TransformerMixin, BaseEstimator):
313
+ """This class implements the feature preprocessing from a dictionary of molecule features."""
314
+
315
+ def __init__(
316
+ self,
317
+ feature_selection_config: dict[str, Any],
318
+ feature_quantilization_config: dict[str, Any],
319
+ descriptors: list[str],
320
+ max_samples: int = -1,
321
+ scaler: str = "standard",
322
+ ):
323
+ self.descriptors = descriptors
324
+
325
+ self.feature_quantilization_config = copy.deepcopy(
326
+ feature_quantilization_config
327
+ )
328
+ self.use_feat_quant = self.feature_quantilization_config.pop("use")
329
+ self.quantile_creator = QuantileCreator(**self.feature_quantilization_config)
330
+
331
+ self.feature_selection_config = copy.deepcopy(feature_selection_config)
332
+ self.use_feat_selec = self.feature_selection_config.pop("use")
333
+ self.feature_selection_config["feature_keys"] = descriptors
334
+ self.feature_selector = FeatureSelector(**self.feature_selection_config)
335
+
336
+ self.max_samples = max_samples
337
+ self.sub_sampler = SubSampler(max_samples=max_samples)
338
+
339
+ self.scaler = SCALER_REGISTRY[scaler]()
340
+
341
+ def __getstate__(self):
342
+ state = super().__getstate__()
343
+ state["quantile_creator"] = self.quantile_creator.__getstate__()
344
+ state["feature_selector"] = self.feature_selector.__getstate__()
345
+ state["sub_sampler"] = self.sub_sampler.__getstate__()
346
+ state["scaler"] = self.scaler.__getstate__()
347
+ return state
348
+
349
+ def __setstate__(self, state):
350
+ _state = copy.deepcopy(state)
351
+ self.quantile_creator.__setstate__(_state.pop("quantile_creator"))
352
+ self.feature_selector.__setstate__(_state.pop("feature_selector"))
353
+ self.sub_sampler.__setstate__(_state.pop("sub_sampler"))
354
+ self.scaler.__setstate__(_state.pop("scaler"))
355
+ super().__setstate__(_state)
356
+
357
+ def get_state(self):
358
+ return self.__getstate__()
359
+
360
+ def set_state(self, state):
361
+ return self.__setstate__(state)
362
+
363
+ def fit(self, X: dict[str, np.ndarray]):
364
+ """Fit the processor transformers"""
365
+ _X = copy.deepcopy(X)
366
+
367
+ if self.use_feat_quant:
368
+ _X = self.quantile_creator.fit_transform(_X)
369
+
370
+ if self.use_feat_selec:
371
+ _X = self.feature_selector.fit_transform(_X)
372
+
373
+ _X = np.concatenate([_X[descr] for descr in self.descriptors], axis=1)
374
+ self.scaler.fit(_X)
375
+ return self
376
+
377
+ def transform(
378
+ self, X: np.ndarray, y: np.ndarray | None = None
379
+ ) -> np.ndarray | tuple[np.ndarray]:
380
+
381
+ _X = X.copy()
382
+ _y = y.copy() if y is not None else None
383
+
384
+ if self.use_feat_quant:
385
+ _X = self.quantile_creator.transform(_X)
386
+ if self.use_feat_selec:
387
+ _X = self.feature_selector.transform(_X)
388
+ _X = np.concatenate([_X[descr] for descr in self.descriptors], axis=1)
389
+ _X = self.scaler.transform(_X)
390
+
391
+ if _y is None:
392
+ _X = self.sub_sampler.transform(_X)
393
+ return _X
394
+
395
+ _X, _y = self.sub_sampler.transform(_X, _y)
396
+ return _X, _y
397
+
398
+
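For orientation, here is a minimal usage sketch of the pipeline above; it is not part of the commit. The config keys mirror the constructor, the feature arrays are random stand-ins, and FeatureSelector, SubSampler, and SCALER_REGISTRY are assumed to be the objects defined earlier in this file.

import numpy as np

# hypothetical feature dict: 100 molecules, two descriptor blocks
features = {
    "ecfps": np.random.rand(100, 2048),
    "rdkit_descrs": np.random.rand(100, 200),
}

preprocessor = FeaturePreprocessor(
    feature_selection_config={"use": False},
    feature_quantilization_config={"use": True, "feature_keys": ["rdkit_descrs"]},
    descriptors=["ecfps", "rdkit_descrs"],
    max_samples=-1,
    scaler="standard",
)
preprocessor.fit(features)
X = preprocessor.transform(features)  # (100, 2248) if nothing is selected out or subsampled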
399
+ def create_cleaned_mol_objects(smiles: list[str]) -> tuple[list[Mol], np.ndarray]:
400
+ """This function creates cleaned RDKit mol objects from a list of SMILES.
401
+ Taken from https://huggingface.co/spaces/ml-jku/mhnfs/blob/main/src/data_preprocessing/create_descriptors.py
402
+ Modification by Antonia Ebner:
403
+ - skip uncleanable molecules
404
+ - return clean molecule mask
405
+
406
+ Args:
407
+ smiles (list[str]): list of SMILES
408
+
409
+ Returns:
410
+ list[Mol]: list of cleaned molecules
411
+ np.ndarray[bool]: mask that contains False at index `i` if the molecule in `smiles` at
412
+ index `i` could not be cleaned and was removed.
413
+ """
414
+ sm = Standardizer(canon_taut=True)
415
+
416
+ clean_mol_mask = list()
417
+ mols = list()
418
+ for i, smile in enumerate(smiles):
419
+ mol = Chem.MolFromSmiles(smile)
420
+ standardized_mol, _ = sm.standardize_mol(mol)
421
+ is_cleaned = standardized_mol is not None
422
+ clean_mol_mask.append(is_cleaned)
423
+ if not is_cleaned:
424
+ continue
425
+ can_mol = Chem.MolFromSmiles(Chem.MolToSmiles(standardized_mol))
426
+ mols.append(can_mol)
427
+
428
+ return mols, np.array(clean_mol_mask)
429
+
430
+
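A small illustrative sketch of the mask semantics (not part of the commit):

smiles = ["CCO", "not-a-smiles", "c1ccccc1"]
mols, mask = create_cleaned_mol_objects(smiles)
# mask -> array([ True, False,  True]); mols holds the two molecules
# whose entries in mask are True, in the original order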
431
+ def create_ecfp_fps(mols: list[Mol], radius=3, fpsize=2048, **kwargs) -> np.ndarray:
432
+ """This function ECFP fingerprints for a list of molecules.
433
+ Inspired by from https://huggingface.co/spaces/ml-jku/mhnfs/blob/main/src/data_preprocessing/create_descriptors.py
434
+
435
+ Args:
436
+ mols (list[Mol]): list of molecules
437
+
438
+ Returns:
439
+ np.ndarray: ECFP fingerprints of molecules
440
+ """
441
+ # build the Morgan generator once and reuse it for all molecules
442
+ gen = rdFingerprintGenerator.GetMorganGenerator(
443
+ countSimulation=True, fpSize=fpsize, radius=radius
444
+ )
445
+ ecfps = list()
446
+ for mol in mols:
447
+ fp_sparse_vec = gen.GetCountFingerprint(mol)
448
+
449
+ fp = np.zeros((0,), np.int8)
450
+ DataStructs.ConvertToNumpyArray(fp_sparse_vec, fp)
451
+
452
+ ecfps.append(fp)
453
+
454
+ return np.array(ecfps)
455
+
456
+
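Illustrative call (not part of the commit); countSimulation=True encodes substructure counts rather than plain presence bits:

from rdkit import Chem

mols = [Chem.MolFromSmiles(s) for s in ("CCO", "c1ccccc1O")]
fps = create_ecfp_fps(mols, radius=3, fpsize=2048)
print(fps.shape)  # (2, 2048)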
457
+ def create_maccs_keys(mols: list[Mol]) -> np.ndarray:
458
+ """This function creates MACCS keys for a list of molecules.
459
+
460
+ Args:
461
+ mols (list[Mol]): list of molecules
462
+
463
+ Returns:
464
+ np.ndarray: MACCS keys of molecules
465
+ """
466
+ maccs = [MACCSkeys.GenMACCSKeys(x) for x in mols]
467
+ return np.array(maccs)
468
+
469
+
470
+ def get_tox_patterns(filepath: str):
471
+ """This retrieves the tox features defined in filepath.
472
+ Args:
473
+ filepath (str): A list of tox features
474
+ """
475
+ # load patterns
476
+ with open(filepath) as f:
477
+ smarts_list = [s[1] for s in json.load(f)]
478
+
479
+ # The parsing below cannot handle smarts that combine AND and OR
480
+ assert len([s for s in smarts_list if ("AND" in s) and ("OR" in s)]) == 0
481
+
482
+ # Chem.MolFromSmarts takes a long time, so it pays off to parse all the smarts first
483
+ # and then reuse them for all molecules. This gives a huge speedup over the existing code.
484
+ # Each entry holds a list of patterns, whether to negate each match result, and how to join the results into one boolean value.
485
+ all_patterns = []
486
+ for smarts in smarts_list:
487
+ patterns = [] # list of smarts-patterns
488
+ # value for each of the patterns above. Negates the values of the above later.
489
+ negations = []
490
+
491
+ if " AND " in smarts:
492
+ smarts = smarts.split(" AND ")
493
+ merge_any = False  # If an ' AND ' is found, all 'subsmarts' have to match
494
+ else:
495
+ # If an ' OR ' is present, it's enough if any of the 'subsmarts' match.
496
+ # This branch also accumulates smarts where neither ' OR ' nor ' AND ' occurs.
497
+ smarts = smarts.split(" OR ")
498
+ merge_any = True
499
+
500
+ # for all subsmarts check if they are preceded by 'NOT '
501
+ for s in smarts:
502
+ neg = s.startswith("NOT ")
503
+ if neg:
504
+ s = s[4:]
505
+ patterns.append(Chem.MolFromSmarts(s))
506
+ negations.append(neg)
507
+
508
+ all_patterns.append((patterns, negations, merge_any))
509
+ return all_patterns
510
+
511
+
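As a worked example (the logic string is made up, not taken from data/tox_smarts.json), "NOT [OH] AND [#6]" parses into:

# patterns  -> [Chem.MolFromSmarts("[OH]"), Chem.MolFromSmarts("[#6]")]
# negations -> [True, False]     # the first match result is inverted
# merge_any -> False             # ' AND ': all sub-patterns must match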
512
+ def create_tox_features(mols: list[Mol], patterns: list) -> np.ndarray:
513
+ """Matches the tox patterns against a molecule. Returns a boolean array"""
514
+ tox_data = []
515
+ for mol in mols:
516
+ mol_features = []
517
+ for patts, negations, merge_any in patterns:
518
+ matches = [mol.HasSubstructMatch(p) for p in patts]
519
+ matches = [m != n for m, n in zip(matches, negations)]
520
+ if merge_any:
521
+ pres = any(matches)
522
+ else:
523
+ pres = all(matches)
524
+ mol_features.append(pres)
525
+
526
+ tox_data.append(np.array(mol_features))
527
+
528
+ return np.array(tox_data)
529
+
530
+
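Illustrative sketch of putting the two functions together (not part of the commit):

mols = [Chem.MolFromSmiles("CCO")]
patterns = get_tox_patterns(TOX_SMARTS_PATH)
tox = create_tox_features(mols, patterns)  # bool array of shape (1, len(patterns))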
531
+ def create_rdkit_descriptors(mols: list[Mol]) -> np.ndarray:
532
+ """This function creates RDKit descriptors for a list of molecules.
533
+ Taken from https://huggingface.co/spaces/ml-jku/mhnfs/blob/main/src/data_preprocessing/create_descriptors.py
534
+
535
+ Args:
536
+ mols (list[Mol]): list of molecules
537
+
538
+ Returns:
539
+ np.ndarray: RDKit descriptors of molecules
540
+ """
541
+ rdkit_descriptors = list()
542
+
543
+ for mol in mols:
544
+ descrs = []
545
+ for _, descr_calc_fn in Descriptors._descList:
546
+ descrs.append(descr_calc_fn(mol))
547
+
548
+ descrs = np.array(descrs)
549
+ descrs = descrs[USED_200_DESCR]
550
+ rdkit_descriptors.append(descrs)
551
+
552
+ return np.array(rdkit_descriptors)
553
+
554
+
555
+ def create_quantiles(raw_features: np.ndarray, ecdfs: list) -> np.ndarray:
556
+ """Create quantile values for given features using the columns
557
+ Taken from https://huggingface.co/spaces/ml-jku/mhnfs/blob/main/src/data_preprocessing/create_descriptors.py
558
+
559
+ Args:
560
+ raw_features (np.ndarray): values to put into quantiles
561
+ ecdfs (list): ECDFs to use
562
+
563
+ Returns:
564
+ np.ndarray: computed quantiles
565
+ """
566
+ quantiles = np.zeros_like(raw_features)
567
+
568
+ for column in range(raw_features.shape[1]):
569
+ raw_values = raw_features[:, column].reshape(-1)
570
+ ecdf = ecdfs[column]
571
+ q = ecdf(raw_values)
572
+ quantiles[:, column] = q
573
+
574
+ return quantiles
575
+
576
+
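A minimal numeric sketch, assuming the ECDF used above is statsmodels' empirical distribution function: an ECDF maps a value to the fraction of fitted samples less than or equal to it.

import numpy as np
from statsmodels.distributions.empirical_distribution import ECDF

train_col = np.array([1.0, 2.0, 3.0, 4.0])
ecdfs = [ECDF(train_col)]
print(create_quantiles(np.array([[2.5]]), ecdfs))  # [[0.5]]: 2 of 4 fitted values are <= 2.5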
577
+ def fill(features, mask, value=np.nan):
578
+ n_mols = len(mask)
579
+ n_features = features.shape[1]
580
+
581
+ data = np.zeros(shape=(n_mols, n_features))
582
+ data.fill(value)
583
+ data[~mask] = features
584
+ return data
585
+
586
+
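A worked example of the fill semantics (not part of the commit): `mask` marks the rows that failed cleaning; those rows are filled with `value`, and `features` lands on the remaining rows.

import numpy as np

feats = np.array([[1.0, 2.0]])   # features of the single cleaned molecule
mask = np.array([False, True])   # the second molecule could not be cleaned
print(fill(feats, mask))
# [[ 1.  2.]
#  [nan nan]]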
587
+ def create_descriptors(
588
+ smiles,
589
+ descriptors,
590
+ **ecfp_kwargs,
591
+ ):
592
+ """Generate molecular descriptors for multiple SMILES strings.
593
+ Inspired by https://huggingface.co/spaces/ml-jku/mhnfs/blob/main/src/data_preprocessing/create_descriptors.py
594
+
595
+ Each SMILES is processed and sanitized using RDKit.
596
+ SMILES that cannot be sanitized are encoded with NaNs, and a corresponding boolean mask
597
+ is returned to indicate which inputs were successfully processed.
598
+
599
+ Args:
600
+ smiles (list[str]): List of SMILES strings for which to generate descriptors.
601
+ descriptors (list[str]): List of descriptor types to compute.
602
+ Supported values include:
603
+ ['ecfps', 'tox', 'maccs', 'rdkit_descrs'].
604
+
605
+ Returns:
606
+ tuple[dict[str, np.ndarray], np.ndarray]:
607
+ - A dictionary mapping descriptor names to their computed arrays.
608
+ - A boolean mask of shape (len(smiles),) indicating which SMILES
609
+ were successfully sanitized and processed.
610
+ """
611
+ # Create cleaned RDKit mol objects
612
+ mols, clean_mol_mask = create_cleaned_mol_objects(smiles)
613
+ print(f"Cleaned molecules, {(~clean_mol_mask).sum()} could not be sanitized")
614
+
615
+ # Create fingerprints and descriptors
616
+ if "ecfps" in descriptors:
617
+ ecfps = create_ecfp_fps(mols, **ecfp_kwargs)
618
+ ecfps = fill(ecfps, ~clean_mol_mask)
619
+ print("Created ECFP fingerprints")
620
+
621
+ if "tox" in descriptors:
622
+ tox_patterns = get_tox_patterns(TOX_SMARTS_PATH)
623
+ tox = create_tox_features(mols, tox_patterns)
624
+ tox = fill(tox, ~clean_mol_mask)
625
+ print("Created Tox features")
626
+
627
+ if "maccs" in descriptors:
628
+ maccs = create_maccs_keys(mols)
629
+ maccs = fill(maccs, ~clean_mol_mask)
630
+ print("Created MACCS keys")
631
+
632
+ if "rdkit_descrs" in descriptors:
633
+ rdkit_descrs = create_rdkit_descriptors(mols)
634
+ rdkit_descrs = fill(rdkit_descrs, ~clean_mol_mask)
635
+ print("Created RDKit descriptors")
636
+
637
+ # collect the computed feature arrays under their descriptor names
638
+ features = {}
639
+ for descr in descriptors:
640
+ features[descr] = vars()[descr]  # look up the local variable computed above
641
+
642
+ return features, clean_mol_mask
643
+
644
+
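End-to-end sketch (not part of the commit); `radius` and `fpsize` are forwarded to create_ecfp_fps:

features, mask = create_descriptors(
    ["CCO", "c1ccccc1"],
    descriptors=["ecfps", "maccs"],
    radius=3,
    fpsize=2048,
)
# features["ecfps"].shape == (2, 2048); features["maccs"].shape == (2, 167)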
645
+ def get_tox21_split(token, cvfold=None):
646
+ """Retrieve Tox21 splits from HuggingFace with respect to given cvfold."""
647
+ ds = load_dataset("ml-jku/tox21", token=token)
648
+
649
+ train_df = ds["train"].to_pandas()
650
+ val_df = ds["validation"].to_pandas()
651
+
652
+ if cvfold is None:
653
+ return {"train": train_df, "validation": val_df}
654
+
655
+ combined_df = pd.concat([train_df, val_df], ignore_index=True)
656
+
657
+ # create new splits: hold out the rows whose CVfold equals `cvfold`
658
+ # (fold ids are stored as floats in the CVfold column)
659
+ cvfold = float(cvfold)
660
+ train_df = combined_df[combined_df.CVfold != cvfold]
661
+ val_df = combined_df[combined_df.CVfold == cvfold]
662
+
663
+ # exclude train mols that occur in the validation split
664
+ val_inchikeys = set(val_df["inchikey"])
665
+ train_df = train_df[~train_df["inchikey"].isin(val_inchikeys)]
666
+
667
+ return {
668
+ "train": train_df.reset_index(drop=True),
669
+ "validation": val_df.reset_index(drop=True),
670
+ }
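Illustrative call (not part of the commit), assuming a valid HuggingFace token such as the HF_TOKEN read in src/utils.py below:

splits = get_tox21_split(token=HF_TOKEN, cvfold=0)
train_df, val_df = splits["train"], splits["validation"]
# fold 0 becomes the validation split; train rows sharing an inchikey
# with any validation row are dropped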
src/utils.py ADDED
@@ -0,0 +1,525 @@
 
1
+ ## This Standardizer class is due to Paolo Tosco.
2
+ ## It was taken from the FS-Mol GitHub repository
3
+ ## (https://github.com/microsoft/FS-Mol/blob/main/fs_mol/preprocessing/utils/
4
+ ## standardizer.py)
5
+ ## It ensures that a sequence of standardization operations is applied:
6
+ ## https://gist.github.com/ptosco/7e6b9ab9cc3e44ba0919060beaed198e
7
+
8
+ import os
9
+ import pickle
10
+ from typing import Any
11
+
12
+ import numpy as np
13
+
14
+ from rdkit import Chem
15
+ from rdkit.Chem.MolStandardize import rdMolStandardize
16
+
17
+ HF_TOKEN = os.environ.get("HF_TOKEN")
18
+ PAD_VALUE = -100
19
+ TOX_SMARTS_PATH = "data/tox_smarts.json"
20
+
21
+ TASKS = [
22
+ "NR-AR",
23
+ "NR-AR-LBD",
24
+ "NR-AhR",
25
+ "NR-Aromatase",
26
+ "NR-ER",
27
+ "NR-ER-LBD",
28
+ "NR-PPAR-gamma",
29
+ "SR-ARE",
30
+ "SR-ATAD5",
31
+ "SR-HSE",
32
+ "SR-MMP",
33
+ "SR-p53",
34
+ ]
35
+
36
+ USED_200_DESCR = [
37
+ 0,
38
+ 1,
39
+ 2,
40
+ 3,
41
+ 4,
42
+ 5,
43
+ 6,
44
+ 7,
45
+ 8,
46
+ 9,
47
+ 10,
48
+ 11,
49
+ 12,
50
+ 13,
51
+ 14,
52
+ 15,
53
+ 16,
54
+ 25,
55
+ 26,
56
+ 27,
57
+ 28,
58
+ 29,
59
+ 30,
60
+ 31,
61
+ 32,
62
+ 33,
63
+ 34,
64
+ 35,
65
+ 36,
66
+ 37,
67
+ 38,
68
+ 39,
69
+ 40,
70
+ 41,
71
+ 42,
72
+ 43,
73
+ 44,
74
+ 45,
75
+ 46,
76
+ 47,
77
+ 48,
78
+ 49,
79
+ 50,
80
+ 51,
81
+ 52,
82
+ 53,
83
+ 54,
84
+ 55,
85
+ 56,
86
+ 57,
87
+ 58,
88
+ 59,
89
+ 60,
90
+ 61,
91
+ 62,
92
+ 63,
93
+ 64,
94
+ 65,
95
+ 66,
96
+ 67,
97
+ 68,
98
+ 69,
99
+ 70,
100
+ 71,
101
+ 72,
102
+ 73,
103
+ 74,
104
+ 75,
105
+ 76,
106
+ 77,
107
+ 78,
108
+ 79,
109
+ 80,
110
+ 81,
111
+ 82,
112
+ 83,
113
+ 84,
114
+ 85,
115
+ 86,
116
+ 87,
117
+ 88,
118
+ 89,
119
+ 90,
120
+ 91,
121
+ 92,
122
+ 93,
123
+ 94,
124
+ 95,
125
+ 96,
126
+ 97,
127
+ 98,
128
+ 99,
129
+ 100,
130
+ 101,
131
+ 102,
132
+ 103,
133
+ 104,
134
+ 105,
135
+ 106,
136
+ 107,
137
+ 108,
138
+ 109,
139
+ 110,
140
+ 111,
141
+ 112,
142
+ 113,
143
+ 114,
144
+ 115,
145
+ 116,
146
+ 117,
147
+ 118,
148
+ 119,
149
+ 120,
150
+ 121,
151
+ 122,
152
+ 123,
153
+ 124,
154
+ 125,
155
+ 126,
156
+ 127,
157
+ 128,
158
+ 129,
159
+ 130,
160
+ 131,
161
+ 132,
162
+ 133,
163
+ 134,
164
+ 135,
165
+ 136,
166
+ 137,
167
+ 138,
168
+ 139,
169
+ 140,
170
+ 141,
171
+ 142,
172
+ 143,
173
+ 144,
174
+ 145,
175
+ 146,
176
+ 147,
177
+ 148,
178
+ 149,
179
+ 150,
180
+ 151,
181
+ 152,
182
+ 153,
183
+ 154,
184
+ 155,
185
+ 156,
186
+ 157,
187
+ 158,
188
+ 159,
189
+ 160,
190
+ 161,
191
+ 162,
192
+ 163,
193
+ 164,
194
+ 165,
195
+ 166,
196
+ 167,
197
+ 168,
198
+ 169,
199
+ 170,
200
+ 171,
201
+ 172,
202
+ 173,
203
+ 174,
204
+ 175,
205
+ 176,
206
+ 177,
207
+ 178,
208
+ 179,
209
+ 180,
210
+ 181,
211
+ 182,
212
+ 183,
213
+ 184,
214
+ 185,
215
+ 186,
216
+ 187,
217
+ 188,
218
+ 189,
219
+ 190,
220
+ 191,
221
+ 192,
222
+ 193,
223
+ 194,
224
+ 195,
225
+ 196,
226
+ 197,
227
+ 198,
228
+ 199,
229
+ 200,
230
+ 201,
231
+ 202,
232
+ 203,
233
+ 204,
234
+ 205,
235
+ 206,
236
+ 207,
237
+ ]
238
+
239
+
240
+ class Standardizer:
241
+ """
242
+ Simple wrapper class around rdkit Standardizer.
243
+ """
244
+
245
+ DEFAULT_CANON_TAUT = False
246
+ DEFAULT_METAL_DISCONNECT = False
247
+ MAX_TAUTOMERS = 100
248
+ MAX_TRANSFORMS = 100
249
+ MAX_RESTARTS = 200
250
+ PREFER_ORGANIC = True
251
+
252
+ def __init__(
253
+ self,
254
+ metal_disconnect=None,
255
+ canon_taut=None,
256
+ ):
257
+ """
258
+ Constructor.
259
+ All parameters are optional.
260
+ :param metal_disconnect: if True, metallorganic complexes are
261
+ disconnected
262
+ :param canon_taut: if True, molecules are converted to their
263
+ canonical tautomer
264
+ """
265
+ super().__init__()
266
+ if metal_disconnect is None:
267
+ metal_disconnect = self.DEFAULT_METAL_DISCONNECT
268
+ if canon_taut is None:
269
+ canon_taut = self.DEFAULT_CANON_TAUT
270
+ self._canon_taut = canon_taut
271
+ self._metal_disconnect = metal_disconnect
272
+ self._taut_enumerator = None
273
+ self._uncharger = None
274
+ self._lfrag_chooser = None
275
+ self._metal_disconnector = None
276
+ self._normalizer = None
277
+ self._reionizer = None
278
+ self._params = None
279
+
280
+ @property
281
+ def params(self):
282
+ """Return the MolStandardize CleanupParameters."""
283
+ if self._params is None:
284
+ self._params = rdMolStandardize.CleanupParameters()
285
+ self._params.maxTautomers = self.MAX_TAUTOMERS
286
+ self._params.maxTransforms = self.MAX_TRANSFORMS
287
+ self._params.maxRestarts = self.MAX_RESTARTS
288
+ self._params.preferOrganic = self.PREFER_ORGANIC
289
+ self._params.tautomerRemoveSp3Stereo = False
290
+ return self._params
291
+
292
+ @property
293
+ def canon_taut(self):
294
+ """Return whether tautomer canonicalization will be done."""
295
+ return self._canon_taut
296
+
297
+ @property
298
+ def metal_disconnect(self):
299
+ """Return whether metallorganic complexes will be disconnected."""
300
+ return self._metal_disconnect
301
+
302
+ @property
303
+ def taut_enumerator(self):
304
+ """Return the TautomerEnumerator object."""
305
+ if self._taut_enumerator is None:
306
+ self._taut_enumerator = rdMolStandardize.TautomerEnumerator(self.params)
307
+ return self._taut_enumerator
308
+
309
+ @property
310
+ def uncharger(self):
311
+ """Return the Uncharger object."""
312
+ if self._uncharger is None:
313
+ self._uncharger = rdMolStandardize.Uncharger()
314
+ return self._uncharger
315
+
316
+ @property
317
+ def lfrag_chooser(self):
318
+ """Return the LargestFragmentChooser object."""
319
+ if self._lfrag_chooser is None:
320
+ self._lfrag_chooser = rdMolStandardize.LargestFragmentChooser(
321
+ self.params.preferOrganic
322
+ )
323
+ return self._lfrag_chooser
324
+
325
+ @property
326
+ def metal_disconnector(self):
327
+ """Return the MetalDisconnector object."""
328
+ if self._metal_disconnector is None:
329
+ self._metal_disconnector = rdMolStandardize.MetalDisconnector()
330
+ return self._metal_disconnector
331
+
332
+ @property
333
+ def normalizer(self):
334
+ """Return the Normalizer object."""
335
+ if self._normalizer is None:
336
+ self._normalizer = rdMolStandardize.Normalizer(
337
+ self.params.normalizationsFile, self.params.maxRestarts
338
+ )
339
+ return self._normalizer
340
+
341
+ @property
342
+ def reionizer(self):
343
+ """Return the Reionizer object."""
344
+ if self._reionizer is None:
345
+ self._reionizer = rdMolStandardize.Reionizer(self.params.acidbaseFile)
346
+ return self._reionizer
347
+
348
+ def charge_parent(self, mol_in):
349
+ """Sequentially apply a series of MolStandardize operations:
350
+ * MetalDisconnector
351
+ * Normalizer
352
+ * Reionizer
353
+ * LargestFragmentChooser
354
+ * Uncharger
355
+ The net result is that a desalted, normalized, neutral
356
+ molecule with implicit Hs is returned.
357
+ """
358
+ params = Chem.RemoveHsParameters()
359
+ params.removeAndTrackIsotopes = True
360
+ mol_in = Chem.RemoveHs(mol_in, params, sanitize=False)
361
+ if self._metal_disconnect:
362
+ mol_in = self.metal_disconnector.Disconnect(mol_in)
363
+ normalized = self.normalizer.normalize(mol_in)
364
+ Chem.SanitizeMol(normalized)
365
+ normalized = self.reionizer.reionize(normalized)
366
+ Chem.AssignStereochemistry(normalized)
367
+ normalized = self.lfrag_chooser.choose(normalized)
368
+ normalized = self.uncharger.uncharge(normalized)
369
+ # need this to reassess aromaticity on things like
370
+ # cyclopentadienyl, tropylium, azolium, etc.
371
+ Chem.SanitizeMol(normalized)
372
+ return Chem.RemoveHs(Chem.AddHs(normalized))
373
+
374
+ def standardize_mol(self, mol_in):
375
+ """
376
+ Standardize a single molecule.
377
+ :param mol_in: a Chem.Mol
378
+ :return: * (standardized Chem.Mol, n_taut) tuple
379
+ if success. n_taut will be negative if
380
+ tautomer enumeration was aborted due
381
+ to reaching a limit
382
+ * (None, error_msg) if failure
383
+ This calls self.charge_parent() and, if self._canon_taut
384
+ is True, runs tautomer canonicalization.
385
+ """
386
+ n_tautomers = 0
387
+ if isinstance(mol_in, Chem.Mol):
388
+ name = None
389
+ try:
390
+ name = mol_in.GetProp("_Name")
391
+ except KeyError:
392
+ pass
393
+ if not name:
394
+ name = "NONAME"
395
+ else:
396
+ error = f"Expected SMILES or Chem.Mol as input, got {str(type(mol_in))}"
397
+ return None, error
398
+ try:
399
+ mol_out = self.charge_parent(mol_in)
400
+ except Exception as e:
401
+ error = f"charge_parent FAILED: {str(e).strip()}"
402
+ return None, error
403
+ if self._canon_taut:
404
+ try:
405
+ res = self.taut_enumerator.Enumerate(mol_out, False)
406
+ except TypeError:
407
+ # we are still on the pre-2021 RDKit API
408
+ res = self.taut_enumerator.Enumerate(mol_out)
409
+ except Exception as e:
410
+ # something else went wrong
411
+ error = f"canon_taut FAILED: {str(e).strip()}"
412
+ return None, error
413
+ n_tautomers = len(res)
414
+ if hasattr(res, "status"):
415
+ completed = (
416
+ res.status == rdMolStandardize.TautomerEnumeratorStatus.Completed
417
+ )
418
+ else:
419
+ # we are still on the pre-2021 RDKit API
420
+ completed = len(res) < 1000
421
+ if not completed:
422
+ n_tautomers = -n_tautomers
423
+ try:
424
+ mol_out = self.taut_enumerator.PickCanonical(res)
425
+ except AttributeError:
426
+ # we are still on the pre-2021 RDKit API
427
+ mol_out = max(
428
+ [(self.taut_enumerator.ScoreTautomer(m), m) for m in res]
429
+ )[1]
430
+ except Exception as e:
431
+ # something else went wrong
432
+ error = f"canon_taut FAILED: {str(e).strip()}"
433
+ return None, error
434
+ mol_out.SetProp("_Name", name)
435
+ return mol_out, n_tautomers
436
+
437
+
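Illustrative sketch (not part of the commit): standardizing a sodium acetate salt should yield the desalted, neutralized parent.

mol = Chem.MolFromSmiles("CC(=O)[O-].[Na+]")
std_mol, n_taut = Standardizer(canon_taut=True).standardize_mol(mol)
if std_mol is not None:               # on failure, the second value is an error message
    print(Chem.MolToSmiles(std_mol))  # CC(=O)O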
438
+ class FeatureDictMixin:
439
+ """
440
+ Mixin that enables bidirectional handling of dict-based multi-feature inputs.
441
+ Allows selective removal of columns directly from the combined array.
442
+
443
+ Example input:
444
+ {
445
+ "ecfps": np.ndarray,
446
+ "tox": np.ndarray,
447
+ }
448
+ """
449
+
450
+ def __init__(self, feature_keys=None):
451
+ self.feature_keys = feature_keys
452
+ self._curr_keys = None
453
+ self._unused_data = None
454
+
455
+ def dict_to_array(self, input: dict[Any, np.ndarray]) -> np.ndarray:
456
+ """Parse dict input and concatenate into a single array."""
457
+ if not isinstance(input, dict):
458
+ raise TypeError("Input must be a dict {feature_type: np.ndarray, ...}")
459
+
460
+ self._unused_data = {}
461
+ remaining_input = {}
462
+ for key in list(input.keys()):
463
+ if key not in self.feature_keys:
464
+ self._unused_data[key] = input[key]
465
+ else:
466
+ remaining_input[key] = input[key]
467
+
468
+ curr_keys = []
469
+ output = []
470
+ for key in self.feature_keys:
471
+ array = remaining_input.pop(key)
472
+ if array.ndim != 2:
473
+ raise ValueError(f"Feature '{key}' must be 2D, got shape {array.shape}")
474
+
475
+ curr_keys.extend([key] * array.shape[1])
476
+ output.append(array)
477
+
478
+ self._curr_keys = np.array(curr_keys)
479
+
480
+ return np.concatenate(output, axis=1)
481
+
482
+ def array_to_dict(self, input: np.ndarray) -> dict[Any, np.ndarray]:
483
+ """Reconstruct dict from a concatenated array."""
484
+ if self._curr_keys is None:
485
+ raise ValueError("No feature mapping stored. Did you call parse_input()?")
486
+
487
+ output = {key: input[:, self._curr_keys == key] for key in self.feature_keys}
488
+ output.update(self._unused_data)
489
+
490
+ self._curr_keys = None
491
+ self._unused_data = None
492
+ return output
493
+
494
+
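A round-trip sketch of the mixin (not part of the commit); `Concat` is a hypothetical subclass, and the "labels" entry shows how non-feature data is parked and restored:

import numpy as np

class Concat(FeatureDictMixin):
    pass

c = Concat(feature_keys=["ecfps", "tox"])
X = {"ecfps": np.ones((5, 4)), "tox": np.zeros((5, 2)), "labels": np.arange(5)}
arr = c.dict_to_array(X)    # shape (5, 6); "labels" is stored as unused data
out = c.array_to_dict(arr)  # dict with "ecfps", "tox", and "labels" restored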
495
+ def load_pickle(path: str):
496
+ with open(path, "rb") as file:
497
+ content = pickle.load(file)
498
+ return content
499
+
500
+
501
+ def write_pickle(path: str, obj: object):
502
+ with open(path, "wb") as file:
503
+ pickle.dump(obj, file)
504
+
505
+
506
+ def create_dir(path, is_file=False):
507
+ """Creates the parent directories if a path to a file is given, else create the given directory"""
508
+
509
+ to_create = os.path.dirname(path) if is_file else path
510
+ if not os.path.exists(to_create):
511
+ os.makedirs(to_create)
512
+
513
+
514
+ def normalize_config(config: dict):
515
+ """Normalizes a json config recursively by applying a mapping"""
516
+ mapping = {"none": None, "true": True, "false": False}
517
+ new_config = {}
518
+ for key, val in config.items():
519
+ if isinstance(val, dict):
520
+ new_config[key] = normalize_config(val)
521
+ elif isinstance(val, (int, float, str)) and val in mapping:
522
+ new_config[key] = mapping[val]
523
+ else:
524
+ new_config[key] = val
525
+ return new_config
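For example (not part of the commit):

cfg = {"scaler": "standard", "quantiles": {"use": "true", "feature_keys": "none"}}
print(normalize_config(cfg))
# {'scaler': 'standard', 'quantiles': {'use': True, 'feature_keys': None}}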