AshmithaIRRI committed on
Commit 5b1fe9d · verified · 1 Parent(s): 1dc7c25

Create app_recovery.py

Files changed (1)
  1. app_recovery.py +594 -0
app_recovery.py ADDED
@@ -0,0 +1,594 @@
+ # -*- coding: utf-8 -*-
+ """
+ Created on Sun Nov 24 12:47:37 2024
+ 
+ @author: Ashmitha
+ """
+ 
+ import io
+ 
+ import numpy as np
+ import pandas as pd
+ import gradio as gr
+ 
+ from scipy.stats import pearsonr
+ from sklearn.ensemble import RandomForestRegressor
+ from sklearn.feature_selection import SelectFromModel
+ from sklearn.metrics import mean_squared_error, r2_score
+ from sklearn.model_selection import KFold
+ from sklearn.preprocessing import MinMaxScaler, StandardScaler
+ from xgboost import XGBRegressor
+ 
+ import tensorflow as tf
+ from tensorflow.keras import regularizers
+ from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
+ from tensorflow.keras.layers import (GRU, Conv1D, MaxPooling1D, Dense, Flatten,
+                                      Dropout, BatchNormalization, LeakyReLU)
+ from tensorflow.keras.models import Sequential
+ from tensorflow.keras.optimizers import Adam
+ 
+ # --------------------------------------------- Feature selection ---------------------------------------------
+ 
+ def RandomForestFeatureSelection(trainX, trainy, num_features=60):
+     rf = RandomForestRegressor(n_estimators=1000, random_state=50)
+     rf.fit(trainX, trainy)
+ 
+     # Get feature importances
+     importances = rf.feature_importances_
+ 
+     # Select the indices of the top N most important features
+     indices = np.argsort(importances)[-num_features:]
+     return indices
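+ # NOTE: this helper is not called by the pipeline below (the models apply
+ # SelectFromModel inline). A minimal usage sketch, assuming trainX/trainy are
+ # NumPy arrays:
+ #     top_idx = RandomForestFeatureSelection(trainX, trainy, num_features=60)
+ #     trainX_top = trainX[:, top_idx]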
+ 
+ # --------------------------------------------- GRU model ---------------------------------------------
+ def GRUModel(trainX, trainy, testX, testy, epochs=1000, batch_size=64, learning_rate=0.0001, l1_reg=0.001, l2_reg=0.001, dropout_rate=0.2, feature_selection=True):
+ 
+     # Apply feature selection using a Random Forest regressor
+     if feature_selection:
+         # Use RandomForestRegressor to rank features by importance
+         rf = RandomForestRegressor(n_estimators=100, random_state=42)
+         rf.fit(trainX, trainy)
+ 
+         # Keep features whose importance exceeds the mean importance
+         selector = SelectFromModel(rf, threshold="mean", prefit=True)
+         trainX = selector.transform(trainX)
+         if testX is not None:
+             testX = selector.transform(testX)
+         print(f"Selected {trainX.shape[1]} features based on feature importance.")
+ 
+     # Scale the input data with MinMaxScaler to normalize the feature range
+     scaler = MinMaxScaler()
+     trainX_scaled = scaler.fit_transform(trainX)
+     if testX is not None:
+         testX_scaled = scaler.transform(testX)
+ 
+     # Scale the target variable with MinMaxScaler
+     target_scaler = MinMaxScaler()
+     trainy_scaled = target_scaler.fit_transform(trainy.reshape(-1, 1))  # reshape to 2D for the scaler
+ 
+     # Reshape trainX and testX to 3D: (samples, timesteps, features)
+     trainX = trainX_scaled.reshape((trainX.shape[0], 1, trainX.shape[1]))
+     if testX is not None:
+         testX = testX_scaled.reshape((testX.shape[0], 1, testX.shape[1]))
+ 
+     model = Sequential()
+ 
+     # GRU layer
+     model.add(GRU(512, input_shape=(trainX.shape[1], trainX.shape[2]), return_sequences=False, kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg)))
+ 
+     # Dense layers with batch normalization, dropout and LeakyReLU
+     model.add(Dense(256, kernel_initializer='he_normal', kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg)))
+     model.add(BatchNormalization())
+     model.add(Dropout(dropout_rate))
+     model.add(LeakyReLU(alpha=0.1))
+ 
+     model.add(Dense(128, kernel_initializer='he_normal', kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg)))
+     model.add(BatchNormalization())
+     model.add(Dropout(dropout_rate))
+     model.add(LeakyReLU(alpha=0.1))
+ 
+     model.add(Dense(64, kernel_initializer='he_normal', kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg)))
+     model.add(BatchNormalization())
+     model.add(Dropout(dropout_rate))
+     model.add(LeakyReLU(alpha=0.1))
+ 
+     model.add(Dense(32, kernel_initializer='he_normal', kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg)))
+     model.add(BatchNormalization())
+     model.add(Dropout(dropout_rate))
+     model.add(LeakyReLU(alpha=0.1))
+ 
+     # Output layer with ReLU activation to prevent negative predictions
+     model.add(Dense(1, activation="relu"))
+ 
+     # Compile the model
+     model.compile(loss='mse', optimizer=Adam(learning_rate=learning_rate), metrics=['mse'])
+ 
+     # Callbacks for learning-rate reduction and early stopping
+     learning_rate_reduction = ReduceLROnPlateau(monitor='val_loss', patience=10, verbose=1, factor=0.5, min_lr=1e-6)
+     early_stopping = EarlyStopping(monitor='val_loss', verbose=1, restore_best_weights=True, patience=10)
+ 
+     # Train the model
+     history = model.fit(trainX, trainy_scaled, epochs=epochs, batch_size=batch_size, validation_split=0.1, verbose=1,
+                         callbacks=[learning_rate_reduction, early_stopping])
+ 
+     # Predict on the training and test sets
+     predicted_train = model.predict(trainX)
+     predicted_test = model.predict(testX) if testX is not None else None
+ 
+     # Flatten predictions; fall back to zeros when there is no test set
+     predicted_train = predicted_train.flatten()
+     if predicted_test is not None:
+         predicted_test = predicted_test.flatten()
+     else:
+         predicted_test = np.zeros_like(predicted_train)
+ 
+     # Inverse-scale the predictions back to the original phenotype range
+     predicted_train = target_scaler.inverse_transform(predicted_train.reshape(-1, 1)).flatten()
+     predicted_test = target_scaler.inverse_transform(predicted_test.reshape(-1, 1)).flatten()
+ 
+     return predicted_train, predicted_test, history
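+ # Usage sketch (assumption: trainX/testX are 2-D arrays of marker scores and
+ # trainy/testy are 1-D phenotype vectors):
+ #     pred_train, pred_test, hist = GRUModel(trainX, trainy, testX, testy, epochs=100)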
+ 
+ # --------------------------------------------- DeepMap CNN model ---------------------------------------------
+ def CNNModel(trainX, trainy, testX, testy, epochs=1000, batch_size=64, learning_rate=0.0001, l1_reg=0.0001, l2_reg=0.0001, dropout_rate=0.3, feature_selection=True):
+     # Optional feature selection with a Random Forest importance threshold
+     if feature_selection:
+         rf = RandomForestRegressor(n_estimators=100, random_state=42)
+         rf.fit(trainX, trainy)
+ 
+         selector = SelectFromModel(rf, threshold="mean", prefit=True)
+         trainX = selector.transform(trainX)
+         if testX is not None:
+             testX = selector.transform(testX)
+         print(f"Selected {trainX.shape[1]} features based on feature importance")
+ 
+     # Scale the inputs
+     scaler = MinMaxScaler()
+     trainX_scaled = scaler.fit_transform(trainX)
+     if testX is not None:
+         testX_scaled = scaler.transform(testX)
+ 
+     # Reshape for CNN input (samples, features, channels)
+     trainX = trainX_scaled.reshape((trainX.shape[0], trainX.shape[1], 1))
+     if testX is not None:
+         testX = testX_scaled.reshape((testX.shape[0], testX.shape[1], 1))
+ 
+     model = Sequential()
+ 
+     # Convolutional layers
+     model.add(Conv1D(256, kernel_size=3, activation='relu', input_shape=(trainX.shape[1], 1), kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg)))
+     model.add(MaxPooling1D(pool_size=2))
+     model.add(Dropout(dropout_rate))
+ 
+     model.add(Conv1D(128, kernel_size=3, activation='relu', kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg)))
+     model.add(MaxPooling1D(pool_size=2))
+     model.add(Dropout(dropout_rate))
+ 
+     # Flatten and dense layers
+     model.add(Flatten())
+     model.add(Dense(64, kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg)))
+     model.add(LeakyReLU(alpha=0.1))
+     model.add(Dropout(dropout_rate))
+ 
+     model.add(Dense(1, activation='linear'))
+ 
+     # Compile the model
+     model.compile(loss='mse', optimizer=Adam(learning_rate=learning_rate), metrics=['mse'])
+ 
+     # Callbacks
+     learning_rate_reduction = ReduceLROnPlateau(monitor='val_loss', patience=5, verbose=1, factor=0.5, min_lr=1e-6)
+     early_stopping = EarlyStopping(monitor='val_loss', verbose=1, restore_best_weights=True, patience=10)
+ 
+     # Train the model
+     history = model.fit(trainX, trainy, epochs=epochs, batch_size=batch_size, validation_split=0.1, verbose=1,
+                         callbacks=[learning_rate_reduction, early_stopping])
+ 
+     predicted_train = model.predict(trainX).flatten()
+     predicted_test = model.predict(testX).flatten() if testX is not None else None
+ 
+     return predicted_train, predicted_test, history
+ 
+ # --------------------------------------------- Random Forest ---------------------------------------------
+ def RFModel(trainX, trainy, testX, testy, n_estimators=100, max_depth=None, feature_selection=True):
+     if feature_selection:
+         rf = RandomForestRegressor(n_estimators=100, random_state=42)
+         rf.fit(trainX, trainy)
+         selector = SelectFromModel(rf, threshold="mean", prefit=True)
+         trainX = selector.transform(trainX)
+         if testX is not None:
+             testX = selector.transform(testX)
+         print(f"Selected {trainX.shape[1]} features based on feature importance")
+ 
+     # Scale the feature data
+     scaler = MinMaxScaler()
+     trainX_scaled = scaler.fit_transform(trainX)
+     if testX is not None:
+         testX_scaled = scaler.transform(testX)
+ 
+     # Define and train the RandomForest model
+     # (fit returns the fitted estimator itself, which is passed back as "history")
+     rf_model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
+     history = rf_model.fit(trainX_scaled, trainy)
+ 
+     # Predictions
+     predicted_train = rf_model.predict(trainX_scaled)
+     predicted_test = rf_model.predict(testX_scaled) if testX is not None else None
+ 
+     return predicted_train, predicted_test, history
+ # --------------------------------------------- XGBoost ---------------------------------------------
+ def XGBoostModel(trainX, trainy, testX, testy, learning_rate, min_child_weight, feature_selection=True, n_estimators=100, max_depth=None):
+     if feature_selection:
+         rf = RandomForestRegressor(n_estimators=100, random_state=42)
+         rf.fit(trainX, trainy)
+         selector = SelectFromModel(rf, threshold="mean", prefit=True)
+         trainX = selector.transform(trainX)
+         if testX is not None:
+             testX = selector.transform(testX)
+         print(f"Selected {trainX.shape[1]} features based on feature importance")
+ 
+     # Scale the features
+     scaler = MinMaxScaler()
+     trainX_scaled = scaler.fit_transform(trainX)
+     if testX is not None:
+         testX_scaled = scaler.transform(testX)
+ 
+     # Define and train the XGBoost model on the scaled features, using the passed
+     # hyperparameters (fit returns the fitted estimator, passed back as "history")
+     xgb_model = XGBRegressor(objective="reg:squarederror", n_estimators=n_estimators,
+                              max_depth=max_depth, learning_rate=learning_rate,
+                              min_child_weight=min_child_weight, random_state=42)
+     history = xgb_model.fit(trainX_scaled, trainy)
+ 
+     # Predictions
+     predicted_train = xgb_model.predict(trainX_scaled)
+     predicted_test = xgb_model.predict(testX_scaled) if testX is not None else None
+ 
+     return predicted_train, predicted_test, history
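+ # (In XGBoost, min_child_weight is the minimum sum of instance weight required in a
+ # child node; larger values make the boosted trees more conservative.)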
+ 
+ 
+ # --------------------------------------------- Reading uploaded files ---------------------------------------------
+ 
+ # Helper function to read an uploaded CSV file
+ def read_csv_file(uploaded_file):
+     if uploaded_file is not None:
+         if hasattr(uploaded_file, 'data'):    # raw bytes (NamedBytes-style object)
+             return pd.read_csv(io.BytesIO(uploaded_file.data))
+         elif hasattr(uploaded_file, 'name'):  # temp-file path (NamedString-style object)
+             return pd.read_csv(uploaded_file.name)
+     return None
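+ # NOTE: depending on the Gradio version, gr.File may hand the callback either an object
+ # carrying raw bytes (.data) or a temp-file wrapper exposing a path (.name); the helper
+ # above covers both cases, while run_cross_validation below assumes .name is available.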
+ 
+ # --------------------------------------------- TOPSIS score ---------------------------------------------
+ 
+ def calculate_topsis_score(df):
+     # Normalize the metrics (vector normalization), ignoring rows with NaN values
+     metrics = df[['Train_MSE', 'Train_RMSE', 'Train_R2', 'Train_Corr']].dropna()
+     norm_metrics = metrics / np.sqrt((metrics ** 2).sum(axis=0))
+ 
+     # Ideal best and worst values for each metric
+     ideal_best = pd.Series(index=norm_metrics.columns, dtype=float)
+     ideal_worst = pd.Series(index=norm_metrics.columns, dtype=float)
+ 
+     # MSE and RMSE are minimization criteria: the minimum is best, the maximum is worst
+     for col in ['Train_MSE', 'Train_RMSE']:
+         ideal_best[col] = norm_metrics[col].min()
+         ideal_worst[col] = norm_metrics[col].max()
+ 
+     # R2 and correlation are maximization criteria: the maximum is best, the minimum is worst
+     for col in ['Train_R2', 'Train_Corr']:
+         ideal_best[col] = norm_metrics[col].max()
+         ideal_worst[col] = norm_metrics[col].min()
+ 
+     # Euclidean distances to the ideal best and worst solutions
+     dist_to_best = np.sqrt(((norm_metrics - ideal_best) ** 2).sum(axis=1))
+     dist_to_worst = np.sqrt(((norm_metrics - ideal_worst) ** 2).sum(axis=1))
+ 
+     # TOPSIS score: farther from the worst (and closer to the best) gives a higher score
+     topsis_score = dist_to_worst / (dist_to_best + dist_to_worst)
+     df['TOPSIS_Score'] = np.nan  # initialize with NaN
+     df.loc[metrics.index, 'TOPSIS_Score'] = topsis_score
+     return df
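+ # The TOPSIS score lies in [0, 1]: a model whose normalized metrics coincide with the
+ # ideal best solution scores 1 and one coinciding with the ideal worst scores 0, so a
+ # higher score means better overall training performance across the four criteria.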
+ 
+ # --------------------------------------------- Nested cross-validation ---------------------------------------------
+ 
+ def NestedKFoldCrossValidation(
+     training_data, training_additive, testing_data, testing_additive,
+     training_dominance, testing_dominance, epochs, learning_rate, min_child_weight,
+     batch_size=64, outer_n_splits=2, output_file='cross_validation_results.csv',
+     predicted_phenotype_file='predicted_phenotype.csv', feature_selection=True
+ ):
+ 
+     if 'phenotypes' not in training_data.columns:
+         raise ValueError("Training data does not contain the 'phenotypes' column.")
+ 
+     # Remove the sample-ID column from the additive and dominance data
+     training_additive = training_additive.iloc[:, 1:]
+     testing_additive = testing_additive.iloc[:, 1:]
+     training_dominance = training_dominance.iloc[:, 1:]
+     testing_dominance = testing_dominance.iloc[:, 1:]
+ 
+     # Merge training and testing data with the additive and dominance components
+     training_data_merged = pd.concat([training_data, training_additive, training_dominance], axis=1)
+     testing_data_merged = pd.concat([testing_data, testing_additive, testing_dominance], axis=1)
+ 
+     phenotypic_info = training_data['phenotypes'].values
+     phenotypic_test_info = testing_data['phenotypes'].values if 'phenotypes' in testing_data.columns else None
+     sample_ids = testing_data.iloc[:, 0].values
+ 
+     # Columns 0 and 1 hold the sample ID and phenotype; the rest are genotypic features
+     training_genotypic_data_merged = training_data_merged.iloc[:, 2:].values
+     testing_genotypic_data_merged = testing_data_merged.iloc[:, 2:].values
+ 
+     # Feature selection on the merged genotypic data (each model may also apply its own selection)
+     if feature_selection:
+         rf = RandomForestRegressor(n_estimators=100, random_state=65)
+         rf.fit(training_genotypic_data_merged, phenotypic_info)
+         selector = SelectFromModel(rf, threshold="mean", prefit=True)
+         training_genotypic_data_merged = selector.transform(training_genotypic_data_merged)
+         testing_genotypic_data_merged = selector.transform(testing_genotypic_data_merged)
+         print(f"Selected {training_genotypic_data_merged.shape[1]} features based on importance.")
+ 
+     # Standardize the genotypic data
+     scaler = StandardScaler()
+     training_genotypic_data_merged = scaler.fit_transform(training_genotypic_data_merged)
+     testing_genotypic_data_merged = scaler.transform(testing_genotypic_data_merged)
+ 
+     outer_kf = KFold(n_splits=outer_n_splits)
+ 
+     results = []
+     all_predicted_phenotypes = []
+ 
+     def calculate_metrics(true_values, predicted_values):
+         mse = mean_squared_error(true_values, predicted_values)
+         rmse = np.sqrt(mse)
+         r2 = r2_score(true_values, predicted_values)
+         corr = pearsonr(true_values, predicted_values)[0]
+         return mse, rmse, r2, corr
+ 
+     models = [
+         ('GRUModel', GRUModel),
+         ('CNNModel', CNNModel),
+         ('RFModel', RFModel),
+         ('XGBoostModel', XGBoostModel)
+     ]
+ 
+     # Each outer fold subsamples the training data; the held-out testing set is evaluated in every fold
+     for outer_fold, (outer_train_index, outer_test_index) in enumerate(outer_kf.split(phenotypic_info), 1):
+         outer_trainX = training_genotypic_data_merged[outer_train_index]
+         outer_trainy = phenotypic_info[outer_train_index]
+ 
+         outer_testX = testing_genotypic_data_merged
+         outer_testy = phenotypic_test_info
+ 
+         for model_name, model_func in models:
+             print(f"Running model: {model_name} for fold {outer_fold}")
+             if model_name in ['GRUModel', 'CNNModel']:
+                 predicted_train, predicted_test, history = model_func(outer_trainX, outer_trainy, outer_testX, outer_testy, epochs=epochs, batch_size=batch_size)
+             elif model_name == 'RFModel':
+                 predicted_train, predicted_test, history = model_func(outer_trainX, outer_trainy, outer_testX, outer_testy)
+             else:
+                 predicted_train, predicted_test, history = model_func(outer_trainX, outer_trainy, outer_testX, outer_testy, learning_rate, min_child_weight)
+ 
+             # Calculate metrics
+             mse_train, rmse_train, r2_train, corr_train = calculate_metrics(outer_trainy, predicted_train)
+             mse_test, rmse_test, r2_test, corr_test = calculate_metrics(outer_testy, predicted_test) if outer_testy is not None else (None, None, None, None)
+ 
+             results.append({
+                 'Model': model_name,
+                 'Fold': outer_fold,
+                 'Train_MSE': mse_train,
+                 'Train_RMSE': rmse_train,
+                 'Train_R2': r2_train,
+                 'Train_Corr': corr_train,
+                 'Test_MSE': mse_test,
+                 'Test_RMSE': rmse_test,
+                 'Test_R2': r2_test,
+                 'Test_Corr': corr_test
+             })
+ 
+             if predicted_test is not None:
+                 predicted_test_df = pd.DataFrame({
+                     'Sample_ID': sample_ids,
+                     'Predicted_Phenotype': predicted_test,
+                     'Model': model_name
+                 })
+                 all_predicted_phenotypes.append(predicted_test_df)
+ 
+     # Compile per-fold results and average them per model
+     results_df = pd.DataFrame(results)
+     avg_results_df = results_df.groupby('Model').agg({
+         'Train_MSE': 'mean',
+         'Train_RMSE': 'mean',
+         'Train_R2': 'mean',
+         'Train_Corr': 'mean',
+         'Test_MSE': 'mean',
+         'Test_RMSE': 'mean',
+         'Test_R2': 'mean',
+         'Test_Corr': 'mean'
+     }).reset_index()
+ 
+     # Rank the models with the module-level TOPSIS function, which treats MSE/RMSE as
+     # minimization criteria and R2/correlation as maximization criteria
+     avg_results_df = calculate_topsis_score(avg_results_df)
+ 
+     # Save the averaged results with TOPSIS scores
+     avg_results_df.to_csv(output_file, index=False)
+ 
+     # Save predicted phenotypes
+     predicted_all_df = None
+     if all_predicted_phenotypes:
+         predicted_all_df = pd.concat(all_predicted_phenotypes, axis=0, ignore_index=True)
+         predicted_all_df.to_csv(predicted_phenotype_file, index=False)
+ 
+     return avg_results_df, predicted_all_df
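+ # Minimal direct-call sketch (assumes the CSVs follow the layout expected above:
+ # column 0 = sample ID and a 'phenotypes' column in the training/testing data):
+ #     results, preds = NestedKFoldCrossValidation(
+ #         training_data, training_additive, testing_data, testing_additive,
+ #         training_dominance, testing_dominance,
+ #         epochs=100, learning_rate=0.001, min_child_weight=5)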
+ 
+ # --------------------------------------------- Gradio interface ---------------------------------------------
+ 
+ def run_cross_validation(training_file, training_additive_file, testing_file, testing_additive_file,
+                          training_dominance_file, testing_dominance_file, feature_selection,
+                          learning_rate=0.001, min_child_weight=5):
+     # Default parameters (the interface below does not expose learning_rate or
+     # min_child_weight, so the defaults above are used)
+     epochs = 1000
+     batch_size = 64
+ 
+     # Load datasets (Gradio file uploads expose a temp-file path via .name)
+     training_data = pd.read_csv(training_file.name)
+     training_additive = pd.read_csv(training_additive_file.name)
+     testing_data = pd.read_csv(testing_file.name)
+     testing_additive = pd.read_csv(testing_additive_file.name)
+     training_dominance = pd.read_csv(training_dominance_file.name)
+     testing_dominance = pd.read_csv(testing_dominance_file.name)
+ 
+     # Call the cross-validation function
+     results, predicted_phenotypes = NestedKFoldCrossValidation(
+         training_data=training_data,
+         training_additive=training_additive,
+         testing_data=testing_data,
+         testing_additive=testing_additive,
+         training_dominance=training_dominance,
+         testing_dominance=testing_dominance,
+         epochs=epochs,
+         batch_size=batch_size,
+         learning_rate=learning_rate,
+         min_child_weight=min_child_weight,
+         feature_selection=feature_selection
+     )
+ 
+     # Save outputs
+     results_file = "cross_validation_results.csv"
+     predicted_file = "predicted_phenotype.csv"
+     results.to_csv(results_file, index=False)
+     if predicted_phenotypes is not None:
+         predicted_phenotypes.to_csv(predicted_file, index=False)
+ 
+     return results_file, predicted_file
+ 
+ # Build the Gradio interface
+ with gr.Blocks() as interface:
+     gr.Markdown("# DeepMap - An Integrated GUI for Genotype to Phenotype Prediction")
+ 
+     with gr.Row():
+         training_file = gr.File(label="Upload Training Data (CSV)")
+         training_additive_file = gr.File(label="Upload Training Additive Data (CSV)")
+         training_dominance_file = gr.File(label="Upload Training Dominance Data (CSV)")
+ 
+     with gr.Row():
+         testing_file = gr.File(label="Upload Testing Data (CSV)")
+         testing_additive_file = gr.File(label="Upload Testing Additive Data (CSV)")
+         testing_dominance_file = gr.File(label="Upload Testing Dominance Data (CSV)")
+ 
+     with gr.Row():
+         feature_selection = gr.Checkbox(label="Enable Feature Selection", value=True)
+ 
+     output1 = gr.File(label="Cross-Validation Results (CSV)")
+     output2 = gr.File(label="Predicted Phenotypes (CSV)")
+ 
+     submit_btn = gr.Button("Run DeepMap")
+     submit_btn.click(
+         run_cross_validation,
+         inputs=[
+             training_file, training_additive_file, testing_file,
+             testing_additive_file, training_dominance_file, testing_dominance_file,
+             feature_selection
+         ],
+         outputs=[output1, output2]
+     )
+ 
+ # Launch the interface
+ interface.launch()
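+ # When hosted as a Hugging Face Space this call is sufficient; if a public share link is
+ # needed when running locally, launch() also accepts share=True (interface.launch(share=True)).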