diff --git a/DNN_Training.py b/DNN_Training.py
index 4909158957359c4d48ffc4197f7f52746ec6be5c..d95c03c80b214798bd699092d859826fea9cd518 100644
--- a/DNN_Training.py
+++ b/DNN_Training.py
@@ -1,56 +1,28 @@
 # Import the libraries required for data analysis and DNN training
-import numpy as np
-import pandas as pd
 import matplotlib.pyplot as plt
 import seaborn as sns
-from imblearn.over_sampling import RandomOverSampler
-from collections import Counter
-from sklearn.model_selection import train_test_split
-from sklearn.preprocessing import StandardScaler
 #%matplotlib inline
 # Import TensorFlow/Keras libraries for the DNN
 from tensorflow.keras.models import Sequential
 from tensorflow.keras.layers import Dense, Conv2D, MaxPool2D, Flatten, Dropout, BatchNormalization
 from tensorflow.keras.callbacks import EarlyStopping
 from sklearn.metrics import classification_report, confusion_matrix
-from sklearn.model_selection import train_test_split
 import visualkeras
 from tensorflow.keras.utils import plot_model
 from tensorflow.keras.utils import to_categorical
 from tensorflow.keras.optimizers import Adam
-#import pydot
-#import graphviz
 from keras.models import Sequential
 from keras.layers import Dense
 from keras.layers import Dropout
 from keras import regularizers
+from preprocessing import load_data, preprocess_data
 
 
 # Loading imputed data
-def load():
-    data = pd.read_csv("OneHotEncoder_2.csv")
-    return data
-df = load()
+df = load_data()
 
-# Define the predicted target 
-Target = ['Mortality']
-X=df.drop("Mortality_All",axis=1)
-y=df["Mortality_All"]
-
-# define the oversampling strategy for balancing between the two classes 
-oversample = RandomOverSampler(sampling_strategy='minority')
-#oversample = RandomOverSampler(sampling_strategy=0.5)
-X_ROS, y_ROS = oversample.fit_resample(X, y)
-
-# Check the shape of X_ROS:
-#print("Shape of X_ROS:", X_ROS.shape)
-# Split our data into 70% for training and 30% for test
-X_train , X_test, y_train,  y_test = train_test_split(X_ROS, y_ROS, test_size=0.30, random_state=0)
-# Use standard Scaler for our data
-scaler= StandardScaler()
-scaler.fit(X_train)
-X_train_scaled = scaler.transform(X_train)
-X_test_scaled = scaler.transform(X_test)
+# Balance classes by oversampling, split into train/test sets, and scale
+X_train_scaled, X_test_scaled, y_train, y_test = preprocess_data(df)
 
 # Define DNN architecture and regularisation techniques 
 model = Sequential([
diff --git a/ExtraTrees_Training.py b/ExtraTrees_Training.py
index 7e5bda58fe1447e4278b287fc7f97b7f2462321d..0100e3c915c6e2dc6f7bff63bd0cdefbde522a68 100644
--- a/ExtraTrees_Training.py
+++ b/ExtraTrees_Training.py
@@ -6,39 +6,16 @@ import numpy as np
 import pandas as pd
 import matplotlib.pyplot as plt
 import seaborn as sns
-from imblearn.over_sampling import RandomOverSampler
-from collections import Counter
-from sklearn.model_selection import train_test_split
-from sklearn.preprocessing import StandardScaler
 from sklearn.metrics import accuracy_score
 from sklearn.model_selection import cross_val_score
 from sklearn.metrics import classification_report
+from preprocessing import load_data, preprocess_data
 
 # Loading imputed data
-def load():
-    data = pd.read_csv("OneHotEncoder_2.csv")
-    return data
-df = load()
+df = load_data()
 
-# Define the predicted target 
-Target = ['Mortality']
-X=df.drop("Mortality_All",axis=1)
-y=df["Mortality_All"]
-
-#define the oversampling strategy for balancing between the two classes 
-oversample = RandomOverSampler(sampling_strategy='minority')
-#oversample = RandomOverSampler(sampling_strategy=0.5)
-X_ROS, y_ROS = oversample.fit_resample(X, y)
-
-# Check the shape of X_ROS:
-#print("Shape of X_ROS:", X_ROS.shape)
-# Split our data into 70% for training and 30% for test
-X_train , X_test, y_train,  y_test = train_test_split(X_ROS, y_ROS, test_size=0.30, random_state=0)
-# Use standard Scaler for our data
-scaler= StandardScaler()
-scaler.fit(X_train)
-X_train_scaled = scaler.transform(X_train)
-X_test_scaled = scaler.transform(X_test)
+# Balance classes by oversampling, split into train/test sets, and scale
+X_train_scaled, X_test_scaled, y_train, y_test = preprocess_data(df)
 
 # Set up the hyperparameter grid with valid parameters for ExtraTreesClassifier
 param_grid = {
diff --git a/LR_Test.py b/LR_Test.py
index 26d1aa745b21516516b22eed1b181b51fed88627..7833a5f271a17d53df3d568da68bed6151d70252 100644
--- a/LR_Test.py
+++ b/LR_Test.py
@@ -12,43 +12,15 @@ from sklearn.metrics import accuracy_score
 from sklearn.model_selection import cross_val_score
 from sklearn.metrics import classification_report
 from sklearn.metrics import roc_auc_score
-from sklearn.metrics import ConfusionMatrixDisplay
 from sklearn.metrics import roc_curve
+from sklearn.metrics import ConfusionMatrixDisplay
+from preprocessing import load_data, preprocess_data
 
 # Loading imputed data
-def load():
-    data = pd.read_csv("OneHotEncoder_2.csv")
-    return data
-df = load()
-
-# Define the predicted target 
-Target = ['Mortality']
-X=df.drop("Mortality_All",axis=1)
-y=df["Mortality_All"]
-
-# define the oversampling strategy for balancing between the two classes 
-oversample = RandomOverSampler(sampling_strategy='minority')
-#oversample = RandomOverSampler(sampling_strategy=0.5)
-X_ROS, y_ROS = oversample.fit_resample(X, y)
-
-# Check the shape of X_ROS:
-#print("Shape of X_ROS:", X_ROS.shape)
-# Split our data into 70% for training and 30% for test
-X_train , X_test, y_train,  y_test = train_test_split(X_ROS, y_ROS, test_size=0.30, random_state=0)
-
-
-# Check the shape of X_ROS:
-#print("Shape of X_ROS:", X_ROS.shape)
-# Split our data into 70% for training and 30% for test
-###X_train , X_test, y_train,  y_test = train_test_split(X, y, test_size=0.30, random_state=0)
-###X_train, y_train = oversample.fit_resample(X_train, y_train)
-
+df = load_data()
 
-# Use standard Scaler for our data
-scaler= StandardScaler()
-scaler.fit(X_train)
-X_train_scaled = scaler.transform(X_train)
-X_test_scaled = scaler.transform(X_test)
+# Balance classes by oversampling, split into train/test sets, and scale
+X_train_scaled, X_test_scaled, y_train, y_test = preprocess_data(df)
 
 # set to the best hyperparameters 
 lr = LogisticRegression(penalty = 'l2', C = 0.001,random_state = 0)
diff --git a/MLP_Training.py b/MLP_Training.py
index 8a1ba1a17c6f4aa669db50e702ac410c16e6233c..c7e44306ace43a8cf774556b07a64d6c8689f3da 100644
--- a/MLP_Training.py
+++ b/MLP_Training.py
@@ -13,32 +13,13 @@ from sklearn.metrics import classification_report
 import matplotlib.pyplot as plt
 from sklearn.metrics import roc_curve, roc_auc_score
 from sklearn.metrics import ConfusionMatrixDisplay
+from preprocessing import load_data, preprocess_data
 
 # Loading imputed data
-def load():
-    data = pd.read_csv("OneHotEncoder_2.csv")
-    return data
-df = load()
+df = load_data()
 
-# Define the predicted target 
-Target = ['Mortality']
-X=df.drop("Mortality_All",axis=1)
-y=df["Mortality_All"]
-
-# define the oversampling strategy for balancing between the two classes 
-oversample = RandomOverSampler(sampling_strategy='minority')
-#oversample = RandomOverSampler(sampling_strategy=0.5)
-X_ROS, y_ROS = oversample.fit_resample(X, y)
-
-# Check the shape of X_ROS:
-#print("Shape of X_ROS:", X_ROS.shape)
-# Split our data into 70% for training and 30% for test
-X_train , X_test, y_train,  y_test = train_test_split(X_ROS, y_ROS, test_size=0.30, random_state=0)
-# Use standard Scaler for our data
-scaler= StandardScaler()
-scaler.fit(X_train)
-X_train_scaled = scaler.transform(X_train)
-X_test_scaled = scaler.transform(X_test)
+# Balance classes by oversampling, split into train/test sets, and scale
+X_train_scaled, X_test_scaled, y_train, y_test = preprocess_data(df)
 
 # Set up the hyperparameter grid with valid parameters for MLP
 mlp_gs = MLPClassifier(max_iter=100)
diff --git a/RF_Test.py b/RF_Test.py
index 59148ced21a2a7d0e6fb0000481302e393401ee3..2672e65c90d8ef0516082f00a2f84d107ac7ed6c 100644
--- a/RF_Test.py
+++ b/RF_Test.py
@@ -13,32 +13,13 @@ from sklearn.metrics import ConfusionMatrixDisplay
 from sklearn.metrics import accuracy_score
 from sklearn.model_selection import cross_val_score
 from sklearn.metrics import classification_report
+from preprocessing import load_data, preprocess_data
 
 # Loading imputed data
-def load():
-    data = pd.read_csv("OneHotEncoder_2.csv")
-    return data
-df = load()
-
-# Define the predicted target 
-Target = ['Mortality']
-X=df.drop("Mortality_All",axis=1)
-y=df["Mortality_All"]
-
-# define the oversampling strategy for balancing between the two classes 
-oversample = RandomOverSampler(sampling_strategy='minority')
-#oversample = RandomOverSampler(sampling_strategy=0.5)
-X_ROS, y_ROS = oversample.fit_resample(X, y)
-
-# Check the shape of X_ROS:
-#print("Shape of X_ROS:", X_ROS.shape)
-# Split our data into 70% for training and 30% for test
-X_train , X_test, y_train,  y_test = train_test_split(X_ROS, y_ROS, test_size=0.30, random_state=0)
-# Use standard Scaler for our data
-scaler= StandardScaler()
-scaler.fit(X_train)
-X_train_scaled = scaler.transform(X_train)
-X_test_scaled = scaler.transform(X_test)
+df = load_data()
+
+# Balance classes by oversampling, split into train/test sets, and scale
+X_train_scaled, X_test_scaled, y_train, y_test = preprocess_data(df)
 
 # set to the best hyperparameters 
 randmf = RandomForestClassifier(n_estimators=180, random_state=42, min_samples_leaf= 1,min_samples_split = 6, max_features = 'sqrt', max_depth= 130, bootstrap=False)
diff --git a/SVM_Test.py b/SVM_Test.py
index 5d980760f723853cf480e4e461c22d65be035893..2b2f59c0b4b1ad9637cba0a1e55b714aaf998e18 100644
--- a/SVM_Test.py
+++ b/SVM_Test.py
@@ -11,35 +11,16 @@ from sklearn.svm import SVC
 from sklearn.metrics import accuracy_score
 from sklearn.metrics import classification_report
 from sklearn.metrics import ConfusionMatrixDisplay
+from preprocessing import load_data, preprocess_data
 
 # Loading imputed data
-def load():
-    data = pd.read_csv("../OneHotEncoder_2.csv")
-    return data
-df = load()
+df = load_data()
 
-# Define the predicted target 
-Target = ['Mortality']
-X=df.drop("Mortality_All",axis=1)
-y=df["Mortality_All"]
-
-# define the oversampling strategy for balancing between the two classes 
-oversample = RandomOverSampler(sampling_strategy='minority')
-#oversample = RandomOverSampler(sampling_strategy=0.5)
-X_ROS, y_ROS = oversample.fit_resample(X, y)
-
-# Check the shape of X_ROS:
-#print("Shape of X_ROS:", X_ROS.shape)
-# Split our data into 70% for training and 30% for test
-X_train , X_test, y_train,  y_test = train_test_split(X_ROS, y_ROS, test_size=0.30, random_state=0)
-# Use standard Scaler for our data
-scaler= StandardScaler()
-scaler.fit(X_train)
-X_train_scaled = scaler.transform(X_train)
-X_test_scaled = scaler.transform(X_test)
+# Balance classes by oversampling, split into train/test sets, and scale
+X_train_scaled, X_test_scaled, y_train, y_test = preprocess_data(df)
 
 # Setting the hyperparameters
-svc= SVC( C=1.0,gamma=o.2,kernel='rbf',
+svc = SVC(C=1.0, gamma=0.2, kernel='rbf')
 #Fit the SVM model according to the given training data              
 svc.fit(X_train_scaled, y_train)
 
@@ -47,11 +28,11 @@ svc.fit(X_train_scaled, y_train)
 y_pred=svc.predict(X_test_scaled)
 
 # Model accuracy
-print('Model accuracy : {0:0.3f}'. format(accuracy_score(y_test, y_pred))
+print('Model accuracy : {0:0.3f}'.format(accuracy_score(y_test, y_pred)))
       
 # Classification report
 print(classification_report(y_test, y_pred))
 
 # Display Confusion matrix
 ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
-plt.show())
+plt.show()
diff --git a/XGBoost_Inference.py b/XGBoost_Inference.py
index 23dd6966fc0a1345f998b9b05ad95019c61f34d2..a4b2cb14a3423330db31ea4c66c84a3f5045d934 100644
--- a/XGBoost_Inference.py
+++ b/XGBoost_Inference.py
@@ -23,35 +23,13 @@ from xgboost import XGBClassifier
 import xgboost as xgb  # Make sure xgboost is imported
 import optuna
 import shap
-
-import warnings
-warnings.filterwarnings('ignore')
+from preprocessing import load_data, preprocess_data
 
 # Loading imputed data
-def load():
-    data = pd.read_csv("OneHotEncoder_2.csv")
-    return data
-df = load()
-
-# Define the predicted target 
-Target = ['Mortality']
-X=df.drop("Mortality_All",axis=1)
-y=df["Mortality_All"]
-
-# define the oversampling strategy for balancing between the two classes 
-oversample = RandomOverSampler(sampling_strategy='minority')
-#oversample = RandomOverSampler(sampling_strategy=0.5)
-X_ROS, y_ROS = oversample.fit_resample(X, y)
+df = load_data()
 
-# Check the shape of X_ROS:
-#print("Shape of X_ROS:", X_ROS.shape)
-# Split our data into 70% for training and 30% for test
-X_train , X_test, y_train,  y_test = train_test_split(X_ROS, y_ROS, test_size=0.30, random_state=0)
-# Use standard Scaler for our data
-scaler= StandardScaler()
-scaler.fit(X_train)
-X_train_scaled = scaler.transform(X_train)
-X_test_scaled = scaler.transform(X_test)
+# Balance classes by oversampling, split into train/test sets, and scale
+X_train_scaled, X_test_scaled, y_train, y_test = preprocess_data(df)
 
 # set to the best hyperparameters 
 #147, 139, 155(12), 159(12), 166(12), 177(11), 188(12),86.5%, 
@@ -82,9 +60,9 @@ print(confusion_matrix(y_test, y_test_hat))
 
 # Plot the important features and Shapley values
 shap_values = shap.TreeExplainer(model).shap_values(X_train_scaled)
-shap.summary_plot(shap_values, X_train, plot_type="bar", color_bar='red')
+shap.summary_plot(shap_values, X_train_scaled, plot_type="bar", color_bar='red')
 
-shap.summary_plot(shap_values, X_train, plot_type="violin")
+shap.summary_plot(shap_values, X_train_scaled, plot_type="violin")
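+# Note: preprocess_data returns NumPy arrays, so these SHAP plots label
+# features by index; pass a DataFrame carrying the original column names to
+# restore readable feature labels.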
 
 # perform the model test on the test set
 y_pred = model.predict(X_test_scaled)  
@@ -96,7 +74,7 @@ plt.show()
 
 
 # ROC 
-y_probs = model.predict_proba(X_test)[:, 1]  # Get probability for the positive class
+y_probs = model.predict_proba(X_test_scaled)[:, 1]  # Get probability for the positive class
 fpr, tpr, thresholds = roc_curve(y_test, y_probs)
 plt.figure(figsize=(8, 6))
 plt.plot(fpr, tpr, label='ROC Curve')
diff --git a/XGBoost_Training.py b/XGBoost_Training.py
index 9328eb1e435e3110867c5ad3a77ad874ce73e26c..84a88745076b702cda796c3fd2e8ecea4451b064 100644
--- a/XGBoost_Training.py
+++ b/XGBoost_Training.py
@@ -10,34 +10,13 @@ from sklearn.preprocessing import StandardScaler
 from sklearn.model_selection import cross_val_score
 import optuna
 import xgboost as xgb  # Make sure xgboost is imported
-import warnings
-warnings.filterwarnings('ignore')
+from preprocessing import load_data, preprocess_data
 
 # Loading imputed data
-def load():
-    data = pd.read_csv("OneHotEncoder_2.csv")
-    return data
-df = load()
+df = load_data()
 
-# Define the predicted target 
-Target = ['Mortality']
-X=df.drop("Mortality_All",axis=1)
-y=df["Mortality_All"]
-
-# define the oversampling strategy for balancing between the two classes 
-oversample = RandomOverSampler(sampling_strategy='minority')
-#oversample = RandomOverSampler(sampling_strategy=0.5)
-X_ROS, y_ROS = oversample.fit_resample(X, y)
-
-# Check the shape of X_ROS:
-#print("Shape of X_ROS:", X_ROS.shape)
-# Split our data into 70% for training and 30% for test
-X_train , X_test, y_train,  y_test = train_test_split(X_ROS, y_ROS, test_size=0.30, random_state=0)
-# Use standard Scaler for our data
-scaler= StandardScaler()
-scaler.fit(X_train)
-X_train_scaled = scaler.transform(X_train)
-X_test_scaled = scaler.transform(X_test)
+# Balance classes by oversampling, split into train/test sets, and scale
+X_train_scaled, X_test_scaled, y_train, y_test = preprocess_data(df)
 
 # Defining the objective function
 def objective(trial):
diff --git a/preprocessing.py b/preprocessing.py
new file mode 100644
index 0000000000000000000000000000000000000000..d73c607f9b16f78f639b25c726ba41e894dab78c
--- /dev/null
+++ b/preprocessing.py
@@ -0,0 +1,59 @@
+"""
+preprocessing.py
+
+This module provides shared functions for loading and preprocessing
+data. It includes:
+- load_data: Loads the input dataset from a CSV file.
+- preprocess_data: Performs oversampling, splits the data into
+  training and testing sets, and applies standard scaling.
+
+Functions:
+    load_data(filepath: str) -> pd.DataFrame:
+        Loads the dataset from the specified CSV file.
+
+    preprocess_data(df: pd.DataFrame, target_column: str,
+        sampling_strategy: str, test_size: float,
+        random_state: int)
+        -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+        Preprocesses the data by balancing classes, splitting into
+        train/test sets, and scaling features.
+
+Example usage:
+    from preprocessing import load_data, preprocess_data
+    
+    df = load_data()
+    X_train_scaled, X_test_scaled, y_train, y_test = preprocess_data(df)
+"""
+
+import pandas as pd
+from imblearn.over_sampling import RandomOverSampler
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler
+
+# Function to load data
+def load_data(filepath="OneHotEncoder_2.csv"):
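+    """Load the imputed, one-hot-encoded dataset from a CSV file."""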
+    return pd.read_csv(filepath)
+
+# Function to preprocess data
+def preprocess_data(df, target_column='Mortality_All', sampling_strategy='minority', test_size=0.30, random_state=0):
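+    """Oversample the minority class, split into train/test sets, and scale.
+
+    Returns (X_train_scaled, X_test_scaled, y_train, y_test); the scaler is
+    fit on the training split only, so test-set statistics never leak into it.
+    """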
+    # Define the predicted target
+    X = df.drop(target_column, axis=1)
+    y = df[target_column]
+
+    # Define the oversampling strategy for balancing between the two classes
+    oversample = RandomOverSampler(sampling_strategy=sampling_strategy)
+    X_ROS, y_ROS = oversample.fit_resample(X, y)
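+    # Caveat: oversampling before the split (as in the original scripts) can
+    # place duplicated minority rows in both train and test sets; oversample
+    # only the training split if a leak-free test set is required.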
+
+    # Split our data into training and testing sets
+    X_train, X_test, y_train, y_test = train_test_split(X_ROS, y_ROS, test_size=test_size, random_state=random_state)
+
+    # Use StandardScaler for our data
+    scaler = StandardScaler()
+    scaler.fit(X_train)
+    X_train_scaled = scaler.transform(X_train)
+    X_test_scaled = scaler.transform(X_test)
+
+    return X_train_scaled, X_test_scaled, y_train, y_test
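+
+# Minimal usage sketch (assumes OneHotEncoder_2.csv sits beside this module):
+if __name__ == "__main__":
+    _df = load_data()
+    _X_train, _X_test, _y_train, _y_test = preprocess_data(_df)
+    print("Train:", _X_train.shape, "Test:", _X_test.shape)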
+