diff --git a/DNN_Training.py b/DNN_Training.py
index 4909158957359c4d48ffc4197f7f52746ec6be5c..d95c03c80b214798bd699092d859826fea9cd518 100644
--- a/DNN_Training.py
+++ b/DNN_Training.py
@@ -1,56 +1,28 @@
 # import required libraries for data analysis to train DNN model
-import numpy as np
-import pandas as pd
 import matplotlib.pyplot as plt
 import seaborn as sns
-from imblearn.over_sampling import RandomOverSampler
-from collections import Counter
-from sklearn.model_selection import train_test_split
-from sklearn.preprocessing import StandardScaler
 #%matplotlib inline

 # Define keras and tensorflow libraries for DNN
 from tensorflow.keras.models import Sequential
 from tensorflow.keras.layers import Dense, Conv2D, MaxPool2D, Flatten, Dropout, BatchNormalization
 from tensorflow.keras.callbacks import EarlyStopping
 from sklearn.metrics import classification_report, confusion_matrix
-from sklearn.model_selection import train_test_split
 import visualkeras
 from tensorflow.keras.utils import plot_model
 from tensorflow.keras.utils import to_categorical
 from tensorflow.keras.optimizers import Adam
-#import pydot
-#import graphviz
 from keras.models import Sequential
 from keras.layers import Dense
 from keras.layers import Dropout
 from keras import regularizers
+from preprocessing import load_data, preprocess_data

 # Loading imputed data
-def load():
-    data = pd.read_csv("OneHotEncoder_2.csv")
-    return data
-df = load()
+df = load_data()

-# Define the predicted target
-Target = ['Mortality']
-X=df.drop("Mortality_All",axis=1)
-y=df["Mortality_All"]
-
-# define the oversampling strategy for balancing between the two classes
-oversample = RandomOverSampler(sampling_strategy='minority')
-#oversample = RandomOverSampler(sampling_strategy=0.5)
-X_ROS, y_ROS = oversample.fit_resample(X, y)
-
-# Check the shape of X_ROS:
-#print("Shape of X_ROS:", X_ROS.shape)
-# Split our data into 70% for training and 30% for test
-X_train , X_test, y_train, y_test = train_test_split(X_ROS, y_ROS, test_size=0.30, random_state=0)
-# Use standard Scaler for our data
-scaler= StandardScaler()
-scaler.fit(X_train)
-X_train_scaled = scaler.transform(X_train)
-X_test_scaled = scaler.transform(X_test)
+# Balance dataset and scale
+X_train_scaled, X_test_scaled, y_train, y_test = preprocess_data(df)

 # Define DNN architecture and regularisation techniques
 model = Sequential([
diff --git a/ExtraTrees_Training.py b/ExtraTrees_Training.py
index 7e5bda58fe1447e4278b287fc7f97b7f2462321d..0100e3c915c6e2dc6f7bff63bd0cdefbde522a68 100644
--- a/ExtraTrees_Training.py
+++ b/ExtraTrees_Training.py
@@ -6,39 +6,16 @@ import numpy as np
 import pandas as pd
 import matplotlib.pyplot as plt
 import seaborn as sns
-from imblearn.over_sampling import RandomOverSampler
-from collections import Counter
-from sklearn.model_selection import train_test_split
-from sklearn.preprocessing import StandardScaler
 from sklearn.metrics import accuracy_score
 from sklearn.model_selection import cross_val_score
 from sklearn.metrics import classification_report
+from preprocessing import load_data, preprocess_data

 # Loading imputed data
-def load():
-    data = pd.read_csv("OneHotEncoder_2.csv")
-    return data
-df = load()
+df = load_data()

-# Define the predicted target
-Target = ['Mortality']
-X=df.drop("Mortality_All",axis=1)
-y=df["Mortality_All"]
-
-# define the oversampling strategy for balancing between the two classes
-oversample = RandomOverSampler(sampling_strategy='minority')
-#oversample = RandomOverSampler(sampling_strategy=0.5)
-X_ROS, y_ROS = oversample.fit_resample(X, y)
-
-# Check the shape of X_ROS:
-#print("Shape of X_ROS:", X_ROS.shape)
-# Split our data into 70% for training and 30% for test
-X_train , X_test, y_train, y_test = train_test_split(X_ROS, y_ROS, test_size=0.30, random_state=0)
-# Use standard Scaler for our data
-scaler= StandardScaler()
-scaler.fit(X_train)
-X_train_scaled = scaler.transform(X_train)
-X_test_scaled = scaler.transform(X_test)
+# Balance dataset and scale
+X_train_scaled, X_test_scaled, y_train, y_test = preprocess_data(df)

 # Set up the hyperparameter grid with valid parameters for ExtraTreesClassifier
 param_grid = {
diff --git a/LR_Test.py b/LR_Test.py
index 26d1aa745b21516516b22eed1b181b51fed88627..7833a5f271a17d53df3d568da68bed6151d70252 100644
--- a/LR_Test.py
+++ b/LR_Test.py
@@ -12,43 +12,15 @@ from sklearn.metrics import accuracy_score
 from sklearn.model_selection import cross_val_score
 from sklearn.metrics import classification_report
 from sklearn.metrics import roc_auc_score
-from sklearn.metrics import ConfusionMatrixDisplay
 from sklearn.metrics import roc_curve
+from sklearn.metrics import ConfusionMatrixDisplay
+from preprocessing import load_data, preprocess_data

 # Loading imputed data
-def load():
-    data = pd.read_csv("OneHotEncoder_2.csv")
-    return data
-df = load()
-
-# Define the predicted target
-Target = ['Mortality']
-X=df.drop("Mortality_All",axis=1)
-y=df["Mortality_All"]
-
-# define the oversampling strategy for balancing between the two classes
-oversample = RandomOverSampler(sampling_strategy='minority')
-#oversample = RandomOverSampler(sampling_strategy=0.5)
-X_ROS, y_ROS = oversample.fit_resample(X, y)
-
-# Check the shape of X_ROS:
-#print("Shape of X_ROS:", X_ROS.shape)
-# Split our data into 70% for training and 30% for test
-X_train , X_test, y_train, y_test = train_test_split(X_ROS, y_ROS, test_size=0.30, random_state=0)
-
-
-# Check the shape of X_ROS:
-#print("Shape of X_ROS:", X_ROS.shape)
-# Split our data into 70% for training and 30% for test
-###X_train , X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)
-###X_train, y_train = oversample.fit_resample(X_train, y_train)
-
+df = load_data()

-# Use standard Scaler for our data
-scaler= StandardScaler()
-scaler.fit(X_train)
-X_train_scaled = scaler.transform(X_train)
-X_test_scaled = scaler.transform(X_test)
+# Balance dataset and scale
+X_train_scaled, X_test_scaled, y_train, y_test = preprocess_data(df)

 # set to the best hyperparameters
 lr = LogisticRegression(penalty = 'l2', C = 0.001,random_state = 0)
diff --git a/MLP_Training.py b/MLP_Training.py
index 8a1ba1a17c6f4aa669db50e702ac410c16e6233c..c7e44306ace43a8cf774556b07a64d6c8689f3da 100644
--- a/MLP_Training.py
+++ b/MLP_Training.py
@@ -13,32 +13,13 @@ from sklearn.metrics import classification_report
 import matplotlib.pyplot as plt
 from sklearn.metrics import roc_curve, roc_auc_score
 from sklearn.metrics import ConfusionMatrixDisplay
+from preprocessing import load_data, preprocess_data

 # Loading imputed data
-def load():
-    data = pd.read_csv("OneHotEncoder_2.csv")
-    return data
-df = load()
+df = load_data()

-# Define the predicted target
-Target = ['Mortality']
-X=df.drop("Mortality_All",axis=1)
-y=df["Mortality_All"]
-
-# define the oversampling strategy for balancing between the two classes
-oversample = RandomOverSampler(sampling_strategy='minority')
-#oversample = RandomOverSampler(sampling_strategy=0.5)
-X_ROS, y_ROS = oversample.fit_resample(X, y)
-
-# Check the shape of X_ROS:
-#print("Shape of X_ROS:", X_ROS.shape)
-# Split our data into 70% for training and 30% for test
-X_train , X_test, y_train, y_test = train_test_split(X_ROS, y_ROS, test_size=0.30, random_state=0)
-# Use standard Scaler for our data
-scaler= StandardScaler()
-scaler.fit(X_train)
-X_train_scaled = scaler.transform(X_train)
-X_test_scaled = scaler.transform(X_test)
+# Balance dataset and scale
+X_train_scaled, X_test_scaled, y_train, y_test = preprocess_data(df)

 # Set up the hyperparameter grid with valid parameters for MLP
 mlp_gs = MLPClassifier(max_iter=100)
diff --git a/RF_Test.py b/RF_Test.py
index 59148ced21a2a7d0e6fb0000481302e393401ee3..2672e65c90d8ef0516082f00a2f84d107ac7ed6c 100644
--- a/RF_Test.py
+++ b/RF_Test.py
@@ -13,32 +13,13 @@ from sklearn.metrics import ConfusionMatrixDisplay
 from sklearn.metrics import accuracy_score
 from sklearn.model_selection import cross_val_score
 from sklearn.metrics import classification_report
+from preprocessing import load_data, preprocess_data

 # Loading imputed data
-def load():
-    data = pd.read_csv("OneHotEncoder_2.csv")
-    return data
-df = load()
-
-# Define the predicted target
-Target = ['Mortality']
-X=df.drop("Mortality_All",axis=1)
-y=df["Mortality_All"]
-
-# define the oversampling strategy for balancing between the two classes
-oversample = RandomOverSampler(sampling_strategy='minority')
-#oversample = RandomOverSampler(sampling_strategy=0.5)
-X_ROS, y_ROS = oversample.fit_resample(X, y)
-
-# Check the shape of X_ROS:
-#print("Shape of X_ROS:", X_ROS.shape)
-# Split our data into 70% for training and 30% for test
-X_train , X_test, y_train, y_test = train_test_split(X_ROS, y_ROS, test_size=0.30, random_state=0)
-# Use standard Scaler for our data
-scaler= StandardScaler()
-scaler.fit(X_train)
-X_train_scaled = scaler.transform(X_train)
-X_test_scaled = scaler.transform(X_test)
+df = load_data()
+
+# Balance dataset and scale
+X_train_scaled, X_test_scaled, y_train, y_test = preprocess_data(df)

 # set to the best hyperparameters
 randmf = RandomForestClassifier(n_estimators=180, random_state=42, min_samples_leaf= 1,min_samples_split = 6, max_features = 'sqrt', max_depth= 130, bootstrap=False)
diff --git a/SVM_Test.py b/SVM_Test.py
index 5d980760f723853cf480e4e461c22d65be035893..2b2f59c0b4b1ad9637cba0a1e55b714aaf998e18 100644
--- a/SVM_Test.py
+++ b/SVM_Test.py
@@ -11,35 +11,16 @@ from sklearn.svm import SVC
 from sklearn.metrics import accuracy_score
 from sklearn.metrics import classification_report
 from sklearn.metrics import ConfusionMatrixDisplay
+from preprocessing import load_data, preprocess_data

 # Loading imputed data
-def load():
-    data = pd.read_csv("../OneHotEncoder_2.csv")
-    return data
-df = load()
+df = load_data()

-# Define the predicted target
-Target = ['Mortality']
-X=df.drop("Mortality_All",axis=1)
-y=df["Mortality_All"]
-
-# define the oversampling strategy for balancing between the two classes
-oversample = RandomOverSampler(sampling_strategy='minority')
-#oversample = RandomOverSampler(sampling_strategy=0.5)
-X_ROS, y_ROS = oversample.fit_resample(X, y)
-
-# Check the shape of X_ROS:
-#print("Shape of X_ROS:", X_ROS.shape)
-# Split our data into 70% for training and 30% for test
-X_train , X_test, y_train, y_test = train_test_split(X_ROS, y_ROS, test_size=0.30, random_state=0)
-# Use standard Scaler for our data
-scaler= StandardScaler()
-scaler.fit(X_train)
-X_train_scaled = scaler.transform(X_train)
-X_test_scaled = scaler.transform(X_test)
+# Balance dataset and scale
+X_train_scaled, X_test_scaled, y_train, y_test = preprocess_data(df)

 # setting the hyperparameters
-svc= SVC( C=1.0,gamma=o.2,kernel='rbf',
+svc= SVC( C=1.0,gamma=0.2,kernel='rbf')

 #Fit the SVM model according to the given training data
 svc.fit(X_train_scaled, y_train)
@@ -47,11 +28,11 @@ svc.fit(X_train_scaled, y_train)
 y_pred=svc.predict(X_test_scaled)

 # Model accuracy
-print('Model accuracy : {0:0.3f}'. format(accuracy_score(y_test, y_pred))
+print('Model accuracy : {0:0.3f}'.format(accuracy_score(y_test, y_pred)))

 # Classification report
 print(classification_report(y_test, y_pred))

 # Display Confusion matrix
 ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
-plt.show())
+plt.show()
diff --git a/XGBoost_Inference.py b/XGBoost_Inference.py
index 23dd6966fc0a1345f998b9b05ad95019c61f34d2..a4b2cb14a3423330db31ea4c66c84a3f5045d934 100644
--- a/XGBoost_Inference.py
+++ b/XGBoost_Inference.py
@@ -23,35 +23,13 @@ from xgboost import XGBClassifier
 import xgboost as xgb # Make sure xgboost is imported
 import optuna
 import shap
-
-import warnings
-warnings.filterwarnings('ignore')
+from preprocessing import load_data, preprocess_data

 # Loading imputed data
-def load():
-    data = pd.read_csv("OneHotEncoder_2.csv")
-    return data
-df = load()
-
-# Define the predicted target
-Target = ['Mortality']
-X=df.drop("Mortality_All",axis=1)
-y=df["Mortality_All"]
-
-# define the oversampling strategy for balancing between the two classes
-oversample = RandomOverSampler(sampling_strategy='minority')
-#oversample = RandomOverSampler(sampling_strategy=0.5)
-X_ROS, y_ROS = oversample.fit_resample(X, y)
+df = load_data()

-# Check the shape of X_ROS:
-#print("Shape of X_ROS:", X_ROS.shape)
-# Split our data into 70% for training and 30% for test
-X_train , X_test, y_train, y_test = train_test_split(X_ROS, y_ROS, test_size=0.30, random_state=0)
-# Use standard Scaler for our data
-scaler= StandardScaler()
-scaler.fit(X_train)
-X_train_scaled = scaler.transform(X_train)
-X_test_scaled = scaler.transform(X_test)
+# Balance dataset and scale
+X_train_scaled, X_test_scaled, y_train, y_test = preprocess_data(df)

 # set to the best hyperparameters
 #147, 139, 155(12), 159(12), 166(12), 177(11), 188(12),86.5%,
@@ -82,9 +60,9 @@ print(confusion_matrix(y_test, y_test_hat))

 # Plot the important features and Shapley values
 shap_values = shap.TreeExplainer(model).shap_values(X_train_scaled)
-shap.summary_plot(shap_values, X_train, plot_type="bar", color_bar='red')
+shap.summary_plot(shap_values, X_train_scaled, plot_type="bar", color_bar='red')

-shap.summary_plot(shap_values, X_train, plot_type="violin")
+shap.summary_plot(shap_values, X_train_scaled, plot_type="violin")

 # perform the model test on the test set
 y_pred = model.predict(X_test_scaled)
@@ -96,7 +74,7 @@ plt.show()


 # ROC
-y_probs = model.predict_proba(X_test)[:, 1] # Get probability for the positive class
+y_probs = model.predict_proba(X_test_scaled)[:, 1] # Get probability for the positive class
 fpr, tpr, thresholds = roc_curve(y_test, y_probs)
 plt.figure(figsize=(8, 6))
 plt.plot(fpr, tpr, label='ROC Curve')
diff --git a/XGBoost_Training.py b/XGBoost_Training.py
index 9328eb1e435e3110867c5ad3a77ad874ce73e26c..84a88745076b702cda796c3fd2e8ecea4451b064 100644
--- a/XGBoost_Training.py
+++ b/XGBoost_Training.py
@@ -10,34 +10,13 @@ from sklearn.preprocessing import StandardScaler
 from sklearn.model_selection import cross_val_score
 import optuna
 import xgboost as xgb # Make sure xgboost is imported
-import warnings
-warnings.filterwarnings('ignore')
+from preprocessing import load_data, preprocess_data

 # Loading imputed data
-def load():
-    data = pd.read_csv("OneHotEncoder_2.csv")
-    return data
-df = load()
+df = load_data()

-# Define the predicted target
-Target = ['Mortality']
-X=df.drop("Mortality_All",axis=1)
-y=df["Mortality_All"]
-
-# define the oversampling strategy for balancing between the two classes
-oversample = RandomOverSampler(sampling_strategy='minority')
-#oversample = RandomOverSampler(sampling_strategy=0.5)
-X_ROS, y_ROS = oversample.fit_resample(X, y)
-
-# Check the shape of X_ROS:
-#print("Shape of X_ROS:", X_ROS.shape)
-# Split our data into 70% for training and 30% for test
-X_train , X_test, y_train, y_test = train_test_split(X_ROS, y_ROS, test_size=0.30, random_state=0)
-# Use standard Scaler for our data
-scaler= StandardScaler()
-scaler.fit(X_train)
-X_train_scaled = scaler.transform(X_train)
-X_test_scaled = scaler.transform(X_test)
+# Balance dataset and scale
+X_train_scaled, X_test_scaled, y_train, y_test = preprocess_data(df)

 # Defining the objective function
 def objective(trial):
diff --git a/preprocessing.py b/preprocessing.py
new file mode 100644
index 0000000000000000000000000000000000000000..d73c607f9b16f78f639b25c726ba41e894dab78c
--- /dev/null
+++ b/preprocessing.py
@@ -0,0 +1,59 @@
+"""
+preprocessing.py
+
+This module provides shared functions for loading and preprocessing
+data. It includes:
+- load_data: Loads the input dataset from a CSV file.
+- preprocess_data: Performs oversampling, splits the data into
+  training and testing sets, and applies standard scaling.
+
+Functions:
+    load_data(filepath: str) -> pd.DataFrame:
+        Loads the dataset from the specified CSV file.
+
+    preprocess_data(df: pd.DataFrame, target_column: str,
+                    sampling_strategy: str, test_size: float,
+                    random_state: int)
+        -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+
+        Preprocesses the data by balancing classes, splitting into
+        train/test sets, and scaling features.
+
+Example usage:
+    from preprocessing import load_data, preprocess_data
+
+    df = load_data()
+    X_train_scaled, X_test_scaled, y_train, y_test = preprocess_data(df)
+"""
+
+import pandas as pd
+from imblearn.over_sampling import RandomOverSampler
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler
+
+
+# Function to load data
+def load_data(filepath="OneHotEncoder_2.csv"):
+    return pd.read_csv(filepath)
+
+
+# Function to preprocess data
+def preprocess_data(df, target_column='Mortality_All', sampling_strategy='minority', test_size=0.30, random_state=0):
+    # Define the predicted target
+    X = df.drop(target_column, axis=1)
+    y = df[target_column]
+
+    # Define the oversampling strategy for balancing between the two classes
+    oversample = RandomOverSampler(sampling_strategy=sampling_strategy)
+    X_ROS, y_ROS = oversample.fit_resample(X, y)
+
+    # Split our data into training and testing sets
+    X_train, X_test, y_train, y_test = train_test_split(X_ROS, y_ROS, test_size=test_size, random_state=random_state)
+
+    # Use StandardScaler for our data
+    scaler = StandardScaler()
+    scaler.fit(X_train)
+    X_train_scaled = scaler.transform(X_train)
+    X_test_scaled = scaler.transform(X_test)
+
+    return X_train_scaled, X_test_scaled, y_train, y_test
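For reference, a minimal sketch of how any of the model scripts consume the shared module after this refactor. It is illustrative only and not part of the patch: the file name demo_pipeline.py is hypothetical, the LogisticRegression hyperparameters mirror LR_Test.py, and OneHotEncoder_2.csv is assumed to be present in the working directory.

    # demo_pipeline.py -- hypothetical usage example, not part of this change
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import classification_report

    from preprocessing import load_data, preprocess_data

    # Load the imputed dataset (reads OneHotEncoder_2.csv by default)
    df = load_data()

    # Oversample the minority class, split 70/30, and standard-scale the features
    X_train_scaled, X_test_scaled, y_train, y_test = preprocess_data(df)

    # Any estimator slots in here; these settings mirror LR_Test.py
    lr = LogisticRegression(penalty='l2', C=0.001, random_state=0)
    lr.fit(X_train_scaled, y_train)
    print(classification_report(y_test, lr.predict(X_test_scaled)))

Note that preprocess_data, like the scripts it replaces, oversamples before the train/test split; sampling_strategy, test_size, and random_state are exposed as keyword arguments if a different setup is needed.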