#importing the required packages
import warnings

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import chi2_contingency
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from xgboost import XGBClassifier, plot_importance

# Read the data set from a CSV file expected in the current working directory
csv_path = 'conversion_predictors_of_clinically_isolated_syndrome_to_multiple_sclerosis.csv'
dt = pd.read_csv(csv_path)

# Print a summary of the frame: column names, non-null counts and dtypes
dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 273 entries, 0 to 272
Data columns (total 20 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Unnamed: 0               273 non-null    int64  
 1   Gender                   273 non-null    int64  
 2   Age                      273 non-null    int64  
 3   Schooling                272 non-null    float64
 4   Breastfeeding            273 non-null    int64  
 5   Varicella                273 non-null    int64  
 6   Initial_Symptom          272 non-null    float64
 7   Mono_or_Polysymptomatic  273 non-null    int64  
 8   Oligoclonal_Bands        273 non-null    int64  
 9   LLSSEP                   273 non-null    int64  
 10  ULSSEP                   273 non-null    int64  
 11  VEP                      273 non-null    int64  
 12  BAEP                     273 non-null    int64  
 13  Periventricular_MRI      273 non-null    int64  
 14  Cortical_MRI             273 non-null    int64  
 15  Infratentorial_MRI       273 non-null    int64  
 16  Spinal_Cord_MRI          273 non-null    int64  
 17  Initial_EDSS             125 non-null    float64
 18  Final_EDSS               125 non-null    float64
 19  group                    273 non-null    int64  
dtypes: float64(4), int64(16)
memory usage: 42.8 KB

# Columns removed before modelling: 'Unnamed: 0' and the two EDSS columns
# (the info summary lists the EDSS columns as non-null in only 125 of 273 rows)
unused_columns = ['Unnamed: 0', 'Initial_EDSS', 'Final_EDSS']
dt = dt.drop(columns=unused_columns)

# Keep only the rows that have no missing value in any remaining column
dt = dt.dropna()

# Cast every remaining column to pandas' nullable integer dtype
dt = dt.astype('Int64')

# Lookup tables translating the numeric codes of each column into text labels
gender_map = {1: 'Male', 2: 'Female'}
breastfeeding_map = {1: 'yes', 2: 'no', 3: 'unknown'}
varicella_map = {1: 'positive', 2: 'negative', 3: 'unknown'}
group_map = {1: 'CDMS', 2: 'Non-CDMS'}
symptom_type_map = {1: 'monosymptomatic', 2: 'polysymptomatic', 3: 'unknown'}
oligoclonal_bands_map = {0: 'negative', 1: 'positive', 2: 'unknown'}
llssep_map = {0: 'negative', 1: 'positive'}
ulssep_map = {0: 'negative', 1: 'positive'}
vep_map = {0: 'negative', 1: 'positive'}
baep_map = {0: 'negative', 1: 'positive'}
periventricular_mri_map = {0: 'negative', 1: 'positive'}
cortical_mri_map = {0: 'negative', 1: 'positive'}
infratentorial_mri_map = {0: 'negative', 1: 'positive'}
spinal_cord_mri_map = {0: 'negative', 1: 'positive'}

# Which lookup table belongs to which DataFrame column
column_to_mapping = {
    'Gender': gender_map,
    'Breastfeeding': breastfeeding_map,
    'Varicella': varicella_map,
    'group': group_map,
    'Mono_or_Polysymptomatic': symptom_type_map,
    'Oligoclonal_Bands': oligoclonal_bands_map,
    'LLSSEP': llssep_map,
    'ULSSEP': ulssep_map,
    'VEP': vep_map,
    'BAEP': baep_map,
    'Periventricular_MRI': periventricular_mri_map,
    'Cortical_MRI': cortical_mri_map,
    'Infratentorial_MRI': infratentorial_mri_map,
    'Spinal_Cord_MRI': spinal_cord_mri_map,
}

# Replace the codes of every listed column with their text labels
for mapped_column, code_labels in column_to_mapping.items():
    dt[mapped_column] = dt[mapped_column].map(code_labels)

# Show the first rows to check the result of the mappings
print(dt.head())

   Gender  Age  Schooling Breastfeeding Varicella  Initial_Symptom  \
0    Male   34         20           yes  positive                2   
1    Male   61         25       unknown  negative               10   
2    Male   22         20       unknown  positive                3   
3  Female   41         15           yes  positive                7   
4  Female   34         20            no  positive                6   

  Mono_or_Polysymptomatic Oligoclonal_Bands    LLSSEP    ULSSEP       VEP  \
0         monosymptomatic          negative  positive  positive  negative   
1         polysymptomatic          positive  positive  negative  positive   
2         monosymptomatic          positive  negative  negative  negative   
3         polysymptomatic          positive  negative  positive  positive   
4         polysymptomatic          negative  positive  negative  negative   

       BAEP Periventricular_MRI Cortical_MRI Infratentorial_MRI  \
0  negative            negative     positive           negative   
1  negative            negative     negative           negative   
2  negative            negative     positive           negative   
3  negative            positive     positive           negative   
4  negative            positive     negative           negative   

  Spinal_Cord_MRI group  
0        positive  CDMS  
1        positive  CDMS  
2        negative  CDMS  
3        negative  CDMS  
4        negative  CDMS

# Visualizing the distribution of the target (dependent) variable.
# A single count plot with per-bar counts and axis labels; drawing the same
# chart a second time without labels added no information.
plt.figure(figsize=(12, 8))
ax = sns.countplot(x='group', data=dt)
ax.bar_label(ax.containers[0])  # annotate each bar with its count
plt.title('Distribution of the Target Variable')
plt.xlabel('Group')
plt.ylabel('Count')
plt.show()

# Bar charts showing, for every level of a categorical predictor, the
# proportion of rows that fall into each outcome group.
# Each entry is (column in dt, plot title, x-axis label); the charts are drawn
# in the order listed.
group_distribution_plots = [
    ('Gender', 'Distribution of Gender Across Groups', 'Gender'),
    ('Breastfeeding', 'Distribution of Breastfeeding Across Groups', 'Breastfeeding Status'),
    ('Mono_or_Polysymptomatic', 'Distribution of Mono or Polysymptomatic Across Groups', 'Symptom Type'),
    ('Varicella', 'Distribution of Varicella Status Across Groups', 'Varicella Status'),
    ('Oligoclonal_Bands', 'Distribution of Oligoclonal Bands Across Groups', 'Oligoclonal Bands Status'),
    ('LLSSEP', 'Distribution of LLSSEP Across Groups', 'LLSSEP Status'),
    ('ULSSEP', 'Distribution of ULSSEP Status Across Groups', 'ULSSEP Status'),
    ('VEP', 'Distribution of VEP Status Across Groups', 'VEP Status'),
    ('BAEP', 'Distribution of BAEP Status Across Groups', 'BAEP Status'),
    ('Periventricular_MRI', 'Distribution of Periventricular MRI Status Across Groups', 'Periventricular MRI Status'),
    ('Cortical_MRI', 'Distribution of Cortical MRI Status Across Groups', 'Cortical MRI Status'),
    ('Infratentorial_MRI', 'Distribution of Infratentorial MRI Status Across Groups', 'Infratentorial MRI Status'),
    ('Spinal_Cord_MRI', 'Distribution of Spinal Cord MRI Status Across Groups', 'Spinal Cord MRI Status'),
]

for column, title, xlabel in group_distribution_plots:
    # normalize='index' divides each row by its total, so the bars of one
    # predictor level sum to 1 and levels of different sizes stay comparable.
    proportions = pd.crosstab(dt[column], dt['group'], normalize='index')
    proportions.plot.bar(stacked=False)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('Proportion')
    plt.show()

# A single LabelEncoder is reused for all columns. It is re-fitted on every
# iteration, so after the loop it only holds the classes of the last column.
le = preprocessing.LabelEncoder()

# Text-valued columns that are converted to integer codes
categorical_columns = [
    'Gender', 'Breastfeeding', 'LLSSEP', 'ULSSEP', 'VEP', 'BAEP',
    'Periventricular_MRI', 'Cortical_MRI', 'Infratentorial_MRI',
    'Spinal_Cord_MRI', 'group', 'Oligoclonal_Bands', 'Mono_or_Polysymptomatic', 'Varicella'
]

# Fit the encoder on one column at a time and overwrite that column with its codes
for column_name in categorical_columns:
    encoded_values = le.fit_transform(dt[column_name])
    dt[column_name] = encoded_values

dt.head()

# Predictors are every column except the target; the target is 'group'
X = dt.drop(columns=["group"])
y = dt["group"]

# Hold out 30% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=101
)

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

# Tune the Random Forest with an exhaustive grid search on the training split.
# Warnings raised while fitting are captured (record=True) rather than printed.
with warnings.catch_warnings(record=True):
    rf = RandomForestClassifier(random_state=101)  # Initialize Random Forest model

    # Hyperparameters explored for every bootstrap setting
    shared_grid = {
        'n_estimators': [64, 100, 128],
        'max_features': [4, 8, 12],
    }

    # scikit-learn only supports oob_score=True together with bootstrap=True
    # and raises a ValueError otherwise. Splitting the grid in two keeps that
    # invalid combination out of the search, so no candidate fails to fit.
    param_grid = [
        {**shared_grid, 'bootstrap': [True], 'oob_score': [True, False]},
        {**shared_grid, 'bootstrap': [False], 'oob_score': [False]},
    ]

    # Grid search for hyperparameter tuning
    rf_grid_search = GridSearchCV(estimator=rf, param_grid=param_grid)
    rf_grid_search.fit(X_train, y_train)

# Best hyperparameter combination found by the search
rf_grid_search.best_params_

{'bootstrap': True, 'max_features': 8, 'n_estimators': 64, 'oob_score': True}

# Tune the LightGBM classifier with an exhaustive grid search on the training
# split. Warnings raised while fitting are captured (record=True) rather than
# printed.
with warnings.catch_warnings(record=True):
    # Initialize LightGBM classifier
    lgbm = lgb.LGBMClassifier(random_state=101, verbose=-1)

    # Define the hyperparameter grid for tuning
    param_grid = {
        'n_estimators': [64, 100, 128],
        'max_depth': [4, 6, 8, 16, 32],
        'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3],
        'num_leaves': [31, 50, 100, 200]
    }

    # Grid search for hyperparameter tuning
    lgbm_grid_search = GridSearchCV(estimator=lgbm, param_grid=param_grid)
    lgbm_grid_search.fit(X_train, y_train)  # Fit model to training data

# Best hyperparameter combination found by the search.
# This statement must start at column 0: leading whitespace on a top-level
# statement is an IndentationError.
lgbm_grid_search.best_params_

{'learning_rate': 0.2, 'max_depth': 4, 'n_estimators': 64, 'num_leaves': 31}

# Train a Random Forest on the training split with the hyperparameters the
# grid search reported as best, then report its metrics on the test split
RanF = RandomForestClassifier(
    n_estimators=64,
    max_features=8,
    bootstrap=True,
    oob_score=True,
    random_state=101,
).fit(X_train, y_train)
RanF_pred = RanF.predict(X_test)
print(classification_report(y_test, RanF_pred))

              precision    recall  f1-score   support

           0       0.84      0.89      0.86        35
           1       0.91      0.87      0.89        47

    accuracy                           0.88        82
   macro avg       0.87      0.88      0.88        82
weighted avg       0.88      0.88      0.88        82

# Fitting LightGBM with the best parameters reported by the grid search
# (learning_rate=0.2, max_depth=4, n_estimators=64, num_leaves=31), then
# reporting its metrics on the test split. The earlier max_depth=16 and
# n_estimators=128 did not match the tuned result.
LGBMM = lgb.LGBMClassifier(
    learning_rate=0.2,
    max_depth=4,
    n_estimators=64,
    num_leaves=31,
    random_state=101,
)
LGBMM.fit(X_train, y_train)
LGBMM_pred = LGBMM.predict(X_test)
print(classification_report(y_test, LGBMM_pred))

              precision    recall  f1-score   support

           0       0.80      0.80      0.80        35
           1       0.85      0.85      0.85        47

    accuracy                           0.83        82
   macro avg       0.83      0.83      0.83        82
weighted avg       0.83      0.83      0.83        82

# Most important features according to the fitted Random Forest.
# X_train is a NumPy array after scaling and carries no column names, so the
# names are taken from X, the predictor frame the split was made from. This
# avoids assuming that 'group' is the last column of dt.
feature_names = X.columns
feat_importances = pd.Series(RanF.feature_importances_, index=feature_names)

# Horizontal bar chart of the ten largest importances
feat_importances.nlargest(10).plot(kind='barh')

<Axes: >

	Gender	Age	Schooling	Breastfeeding	Varicella	Initial_Symptom	Mono_or_Polysymptomatic	Oligoclonal_Bands	LLSSEP	ULSSEP	VEP	Periventricular_MRI	Cortical_MRI	Spinal_Cord_MRI
0	1	34	20	2	1	2	0	0	1	1	0	0	1	1
1	1	61	25	1	0	10	1	1	1	0	1	0	0	1
2	1	22	20	1	1	3	0	1	0	0	0	0	1	0
3	0	41	15	2	1	7	1	1	0	1	1	1	1	0
4	0	34	20	0	1	6	1	0	1	0	0	1	0	0

	Gender	Age	Schooling	Breastfeeding	Varicella	Initial_Symptom	Mono_or_Polysymptomatic	Oligoclonal_Bands	LLSSEP	ULSSEP	VEP	Periventricular_MRI	Cortical_MRI	Spinal_Cord_MRI
0	1	34	20	2	1	2	0	0	1	1	0	0	1	1
1	1	61	25	1	0	10	1	1	1	0	1	0	0	1
2	1	22	20	1	1	3	0	1	0	0	0	0	1	0
3	0	41	15	2	1	7	1	1	0	1	1	1	1	0
4	0	34	20	0	1	6	1	0	1	0	0	1	0	0

Introduction¶

	Gender	Age	Schooling	Breastfeeding	Varicella	Initial_Symptom	Mono_or_Polysymptomatic	Oligoclonal_Bands	LLSSEP	ULSSEP	VEP	Periventricular_MRI	Cortical_MRI	Spinal_Cord_MRI
0	1	34	20	2	1	2	0	0	1	1	0	0	1	1
1	1	61	25	1	0	10	1	1	1	0	1	0	0	1
2	1	22	20	1	1	3	0	1	0	0	0	0	1	0
3	0	41	15	2	1	7	1	1	0	1	1	1	1	0
4	0	34	20	0	1	6	1	0	1	0	0	1	0	0