Upload model
Browse files- dataset.py +41 -13
- lmdb_jpg.py +69 -0
- modelling_cxrmate_ed.py +42 -17
dataset.py
CHANGED
|
@@ -1,9 +1,10 @@
|
|
| 1 |
import os
|
| 2 |
|
|
|
|
| 3 |
import pandas as pd
|
| 4 |
import torch
|
| 5 |
from torch.utils.data import Dataset
|
| 6 |
-
from torchvision.io import read_image
|
| 7 |
|
| 8 |
# Ordered by oblique, lateral, AP, and then PA views so that PA views are closest in position to the generated tokens (and oblique views are furthest).
|
| 9 |
VIEW_ORDER = ['LPO', 'RAO', 'LAO', 'SWIMMERS', 'XTABLE LATERAL', 'LL', 'LATERAL', 'AP AXIAL', 'AP RLD', 'AP LLD', 'AP', 'PA RLD', 'PA LLD', 'PA']
|
|
@@ -25,7 +26,8 @@ class StudyIDEDStayIDSubset(Dataset):
|
|
| 25 |
self,
|
| 26 |
split,
|
| 27 |
records,
|
| 28 |
-
|
|
|
|
| 29 |
max_images_per_study=None,
|
| 30 |
transforms=None,
|
| 31 |
images=True,
|
|
@@ -39,8 +41,9 @@ class StudyIDEDStayIDSubset(Dataset):
|
|
| 39 |
"""
|
| 40 |
Argument/s:
|
| 41 |
split - 'train', 'validate', or 'test'.
|
| 42 |
-
dataset_dir - Dataset directory.
|
| 43 |
records - MIMIC-CXR & MIMIC-IV-ED records class instance.
|
|
|
|
|
|
|
| 44 |
max_images_per_study - the maximum number of images per study.
|
| 45 |
transforms - torchvision transformations.
|
| 46 |
colour_space - PIL target colour space.
|
|
@@ -54,7 +57,8 @@ class StudyIDEDStayIDSubset(Dataset):
|
|
| 54 |
"""
|
| 55 |
super(StudyIDEDStayIDSubset, self).__init__()
|
| 56 |
self.split = split
|
| 57 |
-
self.
|
|
|
|
| 58 |
self.records = records
|
| 59 |
self.max_images_per_study = max_images_per_study
|
| 60 |
self.transforms = transforms
|
|
@@ -68,15 +72,16 @@ class StudyIDEDStayIDSubset(Dataset):
|
|
| 68 |
# If max images per study is not set:
|
| 69 |
self.max_images_per_study = float('inf') if self.max_images_per_study is None else self.max_images_per_study
|
| 70 |
|
| 71 |
-
assert self.extension == 'jpg' or self.extension == 'dcm'
|
|
|
|
| 72 |
|
| 73 |
-
if self.
|
| 74 |
if self.extension == 'jpg':
|
| 75 |
-
if 'physionet.org/files/mimic-cxr-jpg/2.0.0/files' not in self.
|
| 76 |
-
self.
|
| 77 |
elif self.extension == 'dcm':
|
| 78 |
-
if 'physionet.org/files/mimic-cxr/2.0.0/files' not in self.
|
| 79 |
-
self.
|
| 80 |
|
| 81 |
query = f"""
|
| 82 |
SELECT {columns}
|
|
@@ -108,6 +113,18 @@ class StudyIDEDStayIDSubset(Dataset):
|
|
| 108 |
self.num_dicom_ids = len(df['dicom_id'].unique().tolist())
|
| 109 |
self.num_subject_ids = len(df['subject_id'].unique().tolist())
|
| 110 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
def __len__(self):
|
| 112 |
return self.num_study_ids
|
| 113 |
|
|
@@ -212,9 +229,20 @@ class StudyIDEDStayIDSubset(Dataset):
|
|
| 212 |
"""
|
| 213 |
|
| 214 |
if self.extension == 'jpg':
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 218 |
|
| 219 |
elif self.extension == 'dcm':
|
| 220 |
raise NotImplementedError
|
|
|
|
| 1 |
import os
|
| 2 |
|
| 3 |
+
import lmdb
|
| 4 |
import pandas as pd
|
| 5 |
import torch
|
| 6 |
from torch.utils.data import Dataset
|
| 7 |
+
from torchvision.io import decode_image, read_image
|
| 8 |
|
| 9 |
# Ordered by oblique, lateral, AP, and then PA views so that PA views are closest in position to the generated tokens (and oblique views are furthest).
|
| 10 |
VIEW_ORDER = ['LPO', 'RAO', 'LAO', 'SWIMMERS', 'XTABLE LATERAL', 'LL', 'LATERAL', 'AP AXIAL', 'AP RLD', 'AP LLD', 'AP', 'PA RLD', 'PA LLD', 'PA']
|
|
|
|
| 26 |
self,
|
| 27 |
split,
|
| 28 |
records,
|
| 29 |
+
mimic_cxr_jpg_lmdb_path=None,
|
| 30 |
+
mimic_cxr_dir=None,
|
| 31 |
max_images_per_study=None,
|
| 32 |
transforms=None,
|
| 33 |
images=True,
|
|
|
|
| 41 |
"""
|
| 42 |
Argument/s:
|
| 43 |
split - 'train', 'validate', or 'test'.
|
|
|
|
| 44 |
records - MIMIC-CXR & MIMIC-IV-ED records class instance.
|
| 45 |
+
mimic_cxr_jpg_lmdb_path - JPG database for MIMIC-CXR-JPG.
|
| 46 |
+
mimic_cxr_dir - Path to the MIMIC-CXR directory containing the patient study subdirectories with the JPG or DCM images.
|
| 47 |
max_images_per_study - the maximum number of images per study.
|
| 48 |
transforms - torchvision transformations.
|
| 49 |
colour_space - PIL target colour space.
|
|
|
|
| 57 |
"""
|
| 58 |
super(StudyIDEDStayIDSubset, self).__init__()
|
| 59 |
self.split = split
|
| 60 |
+
self.mimic_cxr_jpg_lmdb_path = mimic_cxr_jpg_lmdb_path
|
| 61 |
+
self.mimic_cxr_dir = mimic_cxr_dir
|
| 62 |
self.records = records
|
| 63 |
self.max_images_per_study = max_images_per_study
|
| 64 |
self.transforms = transforms
|
|
|
|
| 72 |
# If max images per study is not set:
|
| 73 |
self.max_images_per_study = float('inf') if self.max_images_per_study is None else self.max_images_per_study
|
| 74 |
|
| 75 |
+
assert self.extension == 'jpg' or self.extension == 'dcm', '"extension" can only be either "jpg" or "dcm".'
|
| 76 |
+
assert (mimic_cxr_jpg_lmdb_path is None) != (mimic_cxr_dir is None), 'Either "mimic_cxr_jpg_lmdb_path" or "mimic_cxr_dir" can be set.'
|
| 77 |
|
| 78 |
+
if self.mimic_cxr_dir is not None and self.mimic_cxr_jpg_lmdb_path is None:
|
| 79 |
if self.extension == 'jpg':
|
| 80 |
+
if 'physionet.org/files/mimic-cxr-jpg/2.0.0/files' not in self.mimic_cxr_dir:
|
| 81 |
+
self.mimic_cxr_dir = os.path.join(self.mimic_cxr_dir, 'physionet.org/files/mimic-cxr-jpg/2.0.0/files')
|
| 82 |
elif self.extension == 'dcm':
|
| 83 |
+
if 'physionet.org/files/mimic-cxr/2.0.0/files' not in self.mimic_cxr_dir:
|
| 84 |
+
self.mimic_cxr_dir = os.path.join(self.mimic_cxr_dir, 'physionet.org/files/mimic-cxr/2.0.0/files')
|
| 85 |
|
| 86 |
query = f"""
|
| 87 |
SELECT {columns}
|
|
|
|
| 113 |
self.num_dicom_ids = len(df['dicom_id'].unique().tolist())
|
| 114 |
self.num_subject_ids = len(df['subject_id'].unique().tolist())
|
| 115 |
|
| 116 |
+
# Prepare the LMDB .jpg database:
|
| 117 |
+
if self.mimic_cxr_jpg_lmdb_path is not None:
|
| 118 |
+
|
| 119 |
+
print('Loading images using LMDB.')
|
| 120 |
+
|
| 121 |
+
# Map size:
|
| 122 |
+
map_size = int(0.65 * (1024 ** 4))
|
| 123 |
+
assert isinstance(map_size, int)
|
| 124 |
+
|
| 125 |
+
self.env = lmdb.open(self.mimic_cxr_jpg_lmdb_path, map_size=map_size, lock=False, readonly=True)
|
| 126 |
+
self.txn = self.env.begin(write=False)
|
| 127 |
+
|
| 128 |
def __len__(self):
|
| 129 |
return self.num_study_ids
|
| 130 |
|
|
|
|
| 229 |
"""
|
| 230 |
|
| 231 |
if self.extension == 'jpg':
|
| 232 |
+
|
| 233 |
+
if self.mimic_cxr_jpg_lmdb_path is not None:
|
| 234 |
+
|
| 235 |
+
# Convert to bytes:
|
| 236 |
+
key = bytes(dicom_id, 'utf-8')
|
| 237 |
+
|
| 238 |
+
# Retrieve image:
|
| 239 |
+
image = bytearray(self.txn.get(key))
|
| 240 |
+
image = torch.frombuffer(image, dtype=torch.uint8)
|
| 241 |
+
image = decode_image(image)
|
| 242 |
+
|
| 243 |
+
else:
|
| 244 |
+
image_file_path = mimic_cxr_image_path(self.mimic_cxr_dir, subject_id, study_id, dicom_id, self.extension)
|
| 245 |
+
image = read_image(image_file_path)
|
| 246 |
|
| 247 |
elif self.extension == 'dcm':
|
| 248 |
raise NotImplementedError
|
lmdb_jpg.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import multiprocessing
|
| 2 |
+
|
| 3 |
+
import duckdb
|
| 4 |
+
import lmdb
|
| 5 |
+
from torch.utils.data import DataLoader, Dataset
|
| 6 |
+
from tqdm import tqdm
|
| 7 |
+
|
| 8 |
+
from .dataset import mimic_cxr_image_path
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class JPGDataset(Dataset):
    """Yield the raw, still-encoded bytes of each JPG together with its key.

    Each item is a dict with:
        'keys'   - the row's ``dicom_id`` encoded as UTF-8 bytes.
        'images' - the unmodified contents of the corresponding .jpg file.

    Argument/s:
        df - table indexed positionally via ``.iloc``; every row must provide
            ``subject_id``, ``study_id``, and ``dicom_id``.
        jpg_path - directory handed to ``mimic_cxr_image_path`` as the root
            under which the .jpg files are located.
    """

    def __init__(self, df, jpg_path):
        self.jpg_path = jpg_path
        self.df = df

    def __len__(self):
        # One item per row of the table.
        return len(self.df)

    def __getitem__(self, idx):
        record = self.df.iloc[idx]
        dicom_id = record['dicom_id']

        # Resolve where the image lives on disk:
        file_path = mimic_cxr_image_path(
            self.jpg_path,
            record['subject_id'],
            record['study_id'],
            dicom_id,
            'jpg',
        )

        # The DICOM identifier doubles as the database key (as bytes):
        encoded_key = bytes(dicom_id, 'utf-8')

        # Keep the file contents encoded; no image decoding happens here:
        with open(file_path, 'rb') as handle:
            payload = handle.read()

        return {'keys': encoded_key, 'images': payload}
|
| 36 |
+
|
| 37 |
+
def _identity_collate(batch):
    """Return the list of samples untouched.

    A module-level function is used instead of a lambda so that the
    DataLoader workers can pickle it under the 'spawn' start method.
    """
    return batch


def prepare_mimic_cxr_jpg_lmdb(
    mimic_iv_duckdb_path,
    mimic_cxr_jpg_path=None,
    mimic_cxr_jpg_lmdb_path=None,
    map_size_tb=None,
    num_workers=None,
    *,
    mimic_cxr_jpg_dir=None,
):
    """
    Copy every distinct MIMIC-CXR .jpg (raw encoded bytes) into an LMDB
    database keyed by ``dicom_id`` (UTF-8 bytes).

    Argument/s:
        mimic_iv_duckdb_path - DuckDB database containing a ``mimic_cxr`` table
            with ``subject_id``, ``study_id``, and ``dicom_id`` columns.
        mimic_cxr_jpg_path - root directory passed to ``JPGDataset`` for
            locating the .jpg files. Required (unless ``mimic_cxr_jpg_dir``
            is given).
        mimic_cxr_jpg_lmdb_path - where the LMDB environment is created.
            Required.
        map_size_tb - LMDB map size in tebibytes (multiplied by 1024 ** 4).
            Required.
        num_workers - number of DataLoader workers; defaults to the CPU count.
            A value of 0 reads the files in the main process.
        mimic_cxr_jpg_dir - keyword-only alias for ``mimic_cxr_jpg_path``.

    Raises:
        TypeError - if a required argument is missing, or if both
            ``mimic_cxr_jpg_path`` and ``mimic_cxr_jpg_dir`` are given with
            different values.
        ValueError - if the resulting map size is not positive.
    """

    # Resolve the alias before anything touches the disk:
    if mimic_cxr_jpg_path is None:
        mimic_cxr_jpg_path = mimic_cxr_jpg_dir
    elif mimic_cxr_jpg_dir is not None and mimic_cxr_jpg_dir != mimic_cxr_jpg_path:
        raise TypeError(
            'prepare_mimic_cxr_jpg_lmdb() got conflicting values for '
            '"mimic_cxr_jpg_path" and its alias "mimic_cxr_jpg_dir".'
        )

    # These parameters only have defaults so that the alias can be supported;
    # they are still mandatory:
    required = (
        ('mimic_cxr_jpg_path', mimic_cxr_jpg_path),
        ('mimic_cxr_jpg_lmdb_path', mimic_cxr_jpg_lmdb_path),
        ('map_size_tb', map_size_tb),
    )
    missing = [name for name, value in required if value is None]
    if missing:
        raise TypeError(
            f'prepare_mimic_cxr_jpg_lmdb() missing required argument(s): {", ".join(missing)}'
        )

    # Map size:
    map_size = int(map_size_tb * (1024 ** 4))
    if map_size <= 0:
        raise ValueError(f'"map_size_tb" must give a positive map size, got {map_size_tb}.')

    if num_workers is None:
        num_workers = multiprocessing.cpu_count()

    # Close the connection even if the query fails:
    connect = duckdb.connect(mimic_iv_duckdb_path, read_only=True)
    try:
        df = connect.sql("SELECT DISTINCT ON(dicom_id) subject_id, study_id, dicom_id FROM mimic_cxr").df()
    finally:
        connect.close()

    print(f'Map size: {map_size}')

    # prefetch_factor may only be specified when worker processes are used:
    loader_kwargs = {'prefetch_factor': 1} if num_workers > 0 else {}

    dataloader = DataLoader(
        JPGDataset(df, mimic_cxr_jpg_path),
        batch_size=max(1, num_workers),  # A batch size of 0 is rejected by DataLoader.
        shuffle=False,
        num_workers=num_workers,
        collate_fn=_identity_collate,
        **loader_kwargs,
    )

    env = lmdb.open(mimic_cxr_jpg_lmdb_path, map_size=map_size, readonly=False)
    try:
        for batch in tqdm(dataloader):
            # One write transaction per batch rather than per image:
            with env.begin(write=True) as txn:
                for item in batch:
                    # overwrite=False leaves an already-stored image untouched:
                    txn.put(item['keys'], item['images'], overwrite=False)
            env.sync()
    finally:
        env.close()
|
modelling_cxrmate_ed.py
CHANGED
|
@@ -21,6 +21,7 @@ from transformers.utils import logging
|
|
| 21 |
|
| 22 |
from .create_section_files import create_section_files
|
| 23 |
from .dataset import StudyIDEDStayIDSubset
|
|
|
|
| 24 |
from .modelling_uniformer import MultiUniFormerWithProjectionHead
|
| 25 |
from .records import EDCXRSubjectRecords
|
| 26 |
from .tables import ed_module_tables, mimic_cxr_tables
|
|
@@ -917,11 +918,14 @@ class MIMICIVEDCXRMultimodalModel(VisionEncoderDecoderModel):
|
|
| 917 |
return position_ids
|
| 918 |
|
| 919 |
@staticmethod
|
| 920 |
-
def prepare_data(physionet_dir,
|
| 921 |
|
| 922 |
-
|
| 923 |
-
|
| 924 |
-
|
|
|
|
|
|
|
|
|
|
| 925 |
|
| 926 |
mimic_cxr_sectioned_path = os.path.join(sectioned_dir, 'mimic_cxr_sectioned.csv')
|
| 927 |
if not os.path.exists(mimic_cxr_sectioned_path):
|
|
@@ -947,9 +951,9 @@ class MIMICIVEDCXRMultimodalModel(VisionEncoderDecoderModel):
|
|
| 947 |
no_split=True,
|
| 948 |
)
|
| 949 |
|
| 950 |
-
if not os.path.exists(
|
| 951 |
|
| 952 |
-
connect = duckdb.connect(
|
| 953 |
|
| 954 |
csv_paths = []
|
| 955 |
csv_paths.append(glob(os.path.join(physionet_dir, 'mimic-iv-ed', '*', 'ed', 'edstays.csv.gz'))[0])
|
|
@@ -982,14 +986,16 @@ class MIMICIVEDCXRMultimodalModel(VisionEncoderDecoderModel):
|
|
| 982 |
# MIMIC-CXR report sections:
|
| 983 |
print(f'Copying mimic_cxr_sectioned into database...')
|
| 984 |
connect.sql(f"CREATE OR REPLACE TABLE mimic_cxr_sectioned AS FROM '{mimic_cxr_sectioned_path}';")
|
| 985 |
-
connect.sql(
|
| 986 |
-
|
| 987 |
-
|
| 988 |
-
|
| 989 |
-
|
| 990 |
-
|
| 991 |
-
|
| 992 |
-
|
|
|
|
|
|
|
| 993 |
|
| 994 |
splits = connect.sql("FROM mimic_cxr_2_0_0_split").df()
|
| 995 |
reports = connect.sql("FROM mimic_cxr_sectioned").df()
|
|
@@ -1065,6 +1071,7 @@ class MIMICIVEDCXRMultimodalModel(VisionEncoderDecoderModel):
|
|
| 1065 |
df = df.sort_values(by='study_datetime', ascending=False)
|
| 1066 |
df = df.groupby('study_id').first().reset_index()
|
| 1067 |
|
|
|
|
| 1068 |
for _, row in tqdm(df.iterrows(), total=df.shape[0]):
|
| 1069 |
edstays = connect.sql(
|
| 1070 |
f"""
|
|
@@ -1109,21 +1116,39 @@ class MIMICIVEDCXRMultimodalModel(VisionEncoderDecoderModel):
|
|
| 1109 |
df = pd.DataFrame(v)
|
| 1110 |
df = df.drop_duplicates(subset=['study_id', 'stay_id'])
|
| 1111 |
connect.sql(f"CREATE TABLE {k}_study_ids AS SELECT * FROM df")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1112 |
|
| 1113 |
@staticmethod
|
| 1114 |
-
def get_dataset(split, transforms,
|
|
|
|
|
|
|
|
|
|
| 1115 |
|
| 1116 |
if records is None:
|
| 1117 |
|
| 1118 |
# This is the setup for CXRs + all effective inputs - medicine reconciliation:
|
| 1119 |
-
records = EDCXRSubjectRecords(database_path=
|
| 1120 |
|
| 1121 |
records.ed_module_tables = {k: records.ed_module_tables[k] for k in ['edstays', 'triage', 'vitalsign']}
|
| 1122 |
records.mimic_cxr_tables = {k: records.mimic_cxr_tables[k] for k in ['mimic_cxr_sectioned']}
|
| 1123 |
records.mimic_cxr_tables['mimic_cxr_sectioned'].text_columns = ['indication', 'history']
|
| 1124 |
|
| 1125 |
dataset = StudyIDEDStayIDSubset(
|
| 1126 |
-
|
|
|
|
| 1127 |
transforms=transforms,
|
| 1128 |
split=split,
|
| 1129 |
max_images_per_study=max_images_per_study,
|
|
|
|
| 21 |
|
| 22 |
from .create_section_files import create_section_files
|
| 23 |
from .dataset import StudyIDEDStayIDSubset
|
| 24 |
+
from .lmdb_jpg import prepare_mimic_cxr_jpg_lmdb
|
| 25 |
from .modelling_uniformer import MultiUniFormerWithProjectionHead
|
| 26 |
from .records import EDCXRSubjectRecords
|
| 27 |
from .tables import ed_module_tables, mimic_cxr_tables
|
|
|
|
| 918 |
return position_ids
|
| 919 |
|
| 920 |
@staticmethod
|
| 921 |
+
def prepare_data(physionet_dir, database_dir):
|
| 922 |
|
| 923 |
+
Path(database_dir).mkdir(parents=True, exist_ok=True)
|
| 924 |
+
|
| 925 |
+
mimic_iv_duckdb_path = os.path.join(database_dir, 'mimic_iv_duckdb.db')
|
| 926 |
+
mimic_cxr_jpg_lmdb_path = os.path.join(database_dir, 'mimic_cxr_jpg_lmdb.db')
|
| 927 |
+
|
| 928 |
+
sectioned_dir = os.path.join(database_dir, 'mimic_cxr_sectioned')
|
| 929 |
|
| 930 |
mimic_cxr_sectioned_path = os.path.join(sectioned_dir, 'mimic_cxr_sectioned.csv')
|
| 931 |
if not os.path.exists(mimic_cxr_sectioned_path):
|
|
|
|
| 951 |
no_split=True,
|
| 952 |
)
|
| 953 |
|
| 954 |
+
if not os.path.exists(mimic_iv_duckdb_path):
|
| 955 |
|
| 956 |
+
connect = duckdb.connect(mimic_iv_duckdb_path)
|
| 957 |
|
| 958 |
csv_paths = []
|
| 959 |
csv_paths.append(glob(os.path.join(physionet_dir, 'mimic-iv-ed', '*', 'ed', 'edstays.csv.gz'))[0])
|
|
|
|
| 986 |
# MIMIC-CXR report sections:
|
| 987 |
print(f'Copying mimic_cxr_sectioned into database...')
|
| 988 |
connect.sql(f"CREATE OR REPLACE TABLE mimic_cxr_sectioned AS FROM '{mimic_cxr_sectioned_path}';")
|
| 989 |
+
columns = list(connect.sql('FROM mimic_cxr_sectioned LIMIT 1').df().columns)
|
| 990 |
+
if 'column0' in columns: # If the column headers are not read correctly:
|
| 991 |
+
connect.sql("ALTER TABLE mimic_cxr_sectioned RENAME COLUMN column0 TO study;")
|
| 992 |
+
connect.sql("ALTER TABLE mimic_cxr_sectioned RENAME COLUMN column1 TO impression;")
|
| 993 |
+
connect.sql("ALTER TABLE mimic_cxr_sectioned RENAME COLUMN column2 TO findings;")
|
| 994 |
+
connect.sql("ALTER TABLE mimic_cxr_sectioned RENAME COLUMN column3 TO indication;")
|
| 995 |
+
connect.sql("ALTER TABLE mimic_cxr_sectioned RENAME COLUMN column4 TO history;")
|
| 996 |
+
connect.sql("ALTER TABLE mimic_cxr_sectioned RENAME COLUMN column5 TO last_paragraph;")
|
| 997 |
+
connect.sql("ALTER TABLE mimic_cxr_sectioned RENAME COLUMN column6 TO comparison;")
|
| 998 |
+
connect.sql("DELETE FROM mimic_cxr_sectioned WHERE study='study';")
|
| 999 |
|
| 1000 |
splits = connect.sql("FROM mimic_cxr_2_0_0_split").df()
|
| 1001 |
reports = connect.sql("FROM mimic_cxr_sectioned").df()
|
|
|
|
| 1071 |
df = df.sort_values(by='study_datetime', ascending=False)
|
| 1072 |
df = df.groupby('study_id').first().reset_index()
|
| 1073 |
|
| 1074 |
+
print('Searching for studies associated with an ED stay...')
|
| 1075 |
for _, row in tqdm(df.iterrows(), total=df.shape[0]):
|
| 1076 |
edstays = connect.sql(
|
| 1077 |
f"""
|
|
|
|
| 1116 |
df = pd.DataFrame(v)
|
| 1117 |
df = df.drop_duplicates(subset=['study_id', 'stay_id'])
|
| 1118 |
connect.sql(f"CREATE TABLE {k}_study_ids AS SELECT * FROM df")
|
| 1119 |
+
|
| 1120 |
+
connect.close()
|
| 1121 |
+
|
| 1122 |
+
if not os.path.exists(mimic_cxr_jpg_lmdb_path):
|
| 1123 |
+
print('Preparing MIMIC-CXR-JPG LMDB database...')
|
| 1124 |
+
pattern = os.path.join(physionet_dir, 'mimic-cxr-jpg', '*', 'files')
|
| 1125 |
+
mimic_cxr_jpg_dir = glob(pattern)
|
| 1126 |
+
assert len(mimic_cxr_jpg_dir), f'Multiple directories matched the pattern {pattern}: {mimic_cxr_jpg_dir}. Only one is required.'
|
| 1127 |
+
prepare_mimic_cxr_jpg_lmdb(
|
| 1128 |
+
mimic_iv_duckdb_path=mimic_iv_duckdb_path,
|
| 1129 |
+
mimic_cxr_jpg_dir=mimic_cxr_jpg_dir[0],
|
| 1130 |
+
mimic_cxr_jpg_lmdb_path=mimic_cxr_jpg_lmdb_path,
|
| 1131 |
+
map_size_tb=0.65
|
| 1132 |
+
)
|
| 1133 |
|
| 1134 |
@staticmethod
|
| 1135 |
+
def get_dataset(split, transforms, database_dir, max_images_per_study=5, mimic_cxr_jpg_dir=None, records=None):
|
| 1136 |
+
|
| 1137 |
+
mimic_iv_duckdb_path = os.path.join(database_dir, 'mimic_iv_duckdb.db')
|
| 1138 |
+
mimic_cxr_jpg_lmdb_path = os.path.join(database_dir, 'mimic_cxr_jpg_lmdb.db') if mimic_cxr_jpg_dir is None else None
|
| 1139 |
|
| 1140 |
if records is None:
|
| 1141 |
|
| 1142 |
# This is the setup for CXRs + all effective inputs - medicine reconciliation:
|
| 1143 |
+
records = EDCXRSubjectRecords(database_path=mimic_iv_duckdb_path, time_delta_map=lambda x: 1 / math.sqrt(x + 1))
|
| 1144 |
|
| 1145 |
records.ed_module_tables = {k: records.ed_module_tables[k] for k in ['edstays', 'triage', 'vitalsign']}
|
| 1146 |
records.mimic_cxr_tables = {k: records.mimic_cxr_tables[k] for k in ['mimic_cxr_sectioned']}
|
| 1147 |
records.mimic_cxr_tables['mimic_cxr_sectioned'].text_columns = ['indication', 'history']
|
| 1148 |
|
| 1149 |
dataset = StudyIDEDStayIDSubset(
|
| 1150 |
+
mimic_cxr_jpg_lmdb_path=mimic_cxr_jpg_lmdb_path,
|
| 1151 |
+
mimic_cxr_dir=mimic_cxr_jpg_dir,
|
| 1152 |
transforms=transforms,
|
| 1153 |
split=split,
|
| 1154 |
max_images_per_study=max_images_per_study,
|