Spaces:
Sleeping
Sleeping
| """ | |
| This files includes a the data processing for Tox21. | |
| As an input it takes a list of SMILES and it outputs a nested dictionary with | |
| SMILES and target names as keys. | |
| """ | |
| import os | |
| import json | |
| import argparse | |
| import numpy as np | |
| from src.preprocess import create_descriptors, get_tox21_split | |
| from src.utils import TASKS, HF_TOKEN, create_dir, normalize_config | |
# Command-line interface: a single --config option pointing at the JSON
# configuration file consumed by main().
parser = argparse.ArgumentParser(
    description="Data preprocessing script for the Tox21 dataset"
)
parser.add_argument(
    "--config",
    type=str,
    default="config/config.json",
    # Help text was missing; %(default)s keeps the message in sync with the default.
    help="Path to the JSON configuration file (default: %(default)s)",
)
def main(config):
    """Create molecule descriptors for the HF Tox21 dataset.

    For each of the train and validation splits this computes molecular
    descriptors from the SMILES strings, stacks the per-task labels into a
    single array, and saves everything as an ``.npz`` file under
    ``config["data_folder"]``.

    Args:
        config: Normalized configuration dict; must provide the keys
            "cvfold", "descriptors", "ecfp", and "data_folder".
    """
    ds = get_tox21_split(HF_TOKEN, cvfold=config["cvfold"])
    for split in ("train", "validation"):
        print(f"Preprocess {split} molecules")
        ds_split = ds[split]
        smiles = list(ds_split["smiles"])
        features, clean_mol_mask = create_descriptors(
            smiles, config["descriptors"], **config["ecfp"]
        )
        # Stack the per-task label columns into a (n_molecules, n_tasks) array.
        labels = np.stack([ds_split[task].to_numpy() for task in TASKS], axis=1)
        # Fix: the filename previously hard-coded "cv4" regardless of the
        # configured fold, so different folds overwrote the same file.
        # Derive the suffix from config["cvfold"] instead.
        save_path = os.path.join(
            config["data_folder"], f"tox21_{split}_cv{config['cvfold']}.npz"
        )
        with open(save_path, "wb") as f:
            np.savez(
                f,
                clean_mol_mask=clean_mol_mask,
                labels=labels,
                **features,
            )
        print(f"Saved preprocessed {split} split under {save_path}")
    print("Preprocessing finished successfully")
| if __name__ == "__main__": | |
| args = parser.parse_args() | |
| with open(args.config, "r") as f: | |
| config = json.load(f) | |
| config = normalize_config(config) | |
| create_dir(config["data_folder"]) | |
| main(config) | |