Beginners' Guide to Classification Using the Pima Diabetes Dataset

Updated: Feb 2, 2023


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier


def drop_columns(data_frame, column_name):
    """Return a copy of ``data_frame`` with ``column_name`` removed.

    ``column_name`` is passed straight to ``DataFrame.drop`` along axis 1,
    so it may be a single label or a list of labels. The caller's frame is
    never modified: the drop is applied to a fresh copy.
    """
    return data_frame.copy().drop(column_name, axis=1)
def best_correlated_features(data_frame, output_variable, max_corr = 0.25, min_corr = -0.25):
    """Return the columns whose correlation with the output is strong enough.

    Args:
        data_frame: Frame holding the candidate columns and the output column.
        output_variable: Label of the output column; it is formatted to a
            string before being used to index the correlation matrix.
        max_corr: Columns with a correlation greater than or equal to this
            value are kept.
        min_corr: Columns with a correlation less than or equal to this
            value are kept.

    Returns:
        A list of column labels in the order they appear in the correlation
        matrix. The output column correlates perfectly with itself, so it is
        part of the result whenever ``max_corr <= 1``.
    """
    correlation_with_output = data_frame.corr()[f'{output_variable}']
    # NaN correlations (e.g. constant columns) fail both comparisons and are
    # therefore left out.
    return [
        column
        for column, value in correlation_with_output.items()
        if value >= max_corr or value <= min_corr
    ]
def display_correlation(data_frame):
    """Draw an annotated heatmap of the pairwise correlations in ``data_frame``.

    A new 10x7 matplotlib figure is opened and the correlation matrix is
    rendered on it with the reversed ``CMRmap`` colour map.
    """
    plt.figure(figsize=(10, 7))
    correlation_matrix = data_frame.corr()
    sns.heatmap(correlation_matrix, annot=True, cmap='CMRmap_r')
def dataset_info(data_frame):
    """Print a per-column summary of ``data_frame`` followed by its shape.

    The summary table has one row per column with three fields: the column
    name (``features``), the number of null entries (``null_values``) and the
    column dtype (``data_type``). Nothing is returned and the input frame is
    not modified.
    """
    null_counts = data_frame.isnull().sum()

    dataset_info_dict = {
        'features': null_counts.index,
        'null_values': null_counts.values,
        'data_type': data_frame.dtypes.values,
    }

    # The table was previously assembled but never shown.
    print(pd.DataFrame(dataset_info_dict))
    print(f'Shape of the data frame {data_frame.shape}')


# Load the Pima diabetes data; the path is relative to the working directory.
df = pd.read_csv('diabetes.csv')

# Quick look at the data: per-column null counts and dtypes, the first rows,
# and how many samples fall into each Outcome class.
dataset_info(df)
print(df.head())
print(df['Outcome'].value_counts())

                   features  null_values data_type
0               Pregnancies            0     int64
1                   Glucose            0     int64
2             BloodPressure            0     int64
3             SkinThickness            0     int64
4                   Insulin            0     int64
5                       BMI            0   float64
6  DiabetesPedigreeFunction            0   float64
7                       Age            0     int64
8                   Outcome            0     int64
Shape of the data frame (768, 9)

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  

0    500
1    268
Name: Outcome, dtype: int64

# Keep the columns whose correlation with Outcome is >= 0.21 or <= -0.20.
best_correlated_columns = best_correlated_features(df, 'Outcome', 0.21, -0.20)
print(best_correlated_columns)
['Pregnancies', 'Glucose', 'BMI', 'Age', 'Outcome'] 

from imblearn.over_sampling import SMOTE

# Predictors are every column except the last one; the target is kept as a
# one-column DataFrame.
X = df.iloc[:, :-1]
y = df[['Outcome']]

# Resample with SMOTE using a fixed seed for reproducibility.
# NOTE(review): the resampling runs on the full dataset, before any train/test
# split, so resampled rows may end up on both sides of the later split --
# confirm this is intended.
sm = SMOTE(random_state=2)
X, y = sm.fit_resample(X, y)

# Join the resampled predictors and target once, then reuse the frame for the
# heatmap and for the feature selection.
aug_df = pd.concat([pd.DataFrame(X), y], axis = 1)

display_correlation(aug_df)

# Re-run the correlation filter on the resampled data and show the result.
best_correlated_columns = best_correlated_features(aug_df, 'Outcome', 0.21, -0.20)
print(best_correlated_columns)
['Glucose', 'BMI', 'Age', 'Outcome'] 

# best_correlated_columns also lists 'Outcome' itself (it correlates perfectly
# with itself). Using it as a predictor would hand the label to the model, so
# it is filtered out of the feature set here.
feature_columns = [column for column in best_correlated_columns if column != 'Outcome']
X = aug_df.loc[:, feature_columns]
y = aug_df.loc[:, ['Outcome']]

# Rescale every predictor with a MinMaxScaler fitted on X.
# NOTE(review): the scaler is fitted before the train/test split, so the test
# rows influence the scaling range -- consider fitting on the training part only.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# The target goes through its own scaler; this turns the one-column DataFrame
# into a NumPy array, which the later `y_train.ravel()` call relies on.
scaler = MinMaxScaler()
y = scaler.fit_transform(y)

# Hold out 20% of the rows for testing, keeping the class ratio in both parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

0.0    400
1.0    400
dtype: int64

# Candidate hyperparameter values for the randomized search.
params = {
    'n_estimators': [100,200,500,750,1000],
    'max_depth': [3,5,7,9],
    'min_child_weight': [1,3,5],
    'gamma':[i/10.0 for i in range(0,5)],              # 0.0 .. 0.4
    'subsample':[i/10.0 for i in range(6,10)],         # 0.6 .. 0.9
    'colsample_bytree':[i/10.0 for i in range(6,10)],  # 0.6 .. 0.9
    'reg_alpha':[0, 0.001, 0.005, 0.01, 0.05, 0.1, 1],
    'learning_rate': [0.01, 0.02, 0.05, 0.1]
}

# Base estimator for the search. `learning_rate` and `n_estimators` are only
# starting values: both also appear in `params`, so the search overrides them.
# `verbosity=0` and `n_jobs=1` replace the deprecated `silent=True` and
# `nthread=1` arguments.
xgb = XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic',
                    verbosity=0, n_jobs=1)

folds = 5        # number of stratified cross-validation folds
param_comb = 5   # number of parameter combinations sampled by the search

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 42)

# The splitter itself is passed as `cv` rather than the one-shot generator
# returned by `skf.split(...)`, so the search can be fitted more than once.
random_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=param_comb, scoring='roc_auc', n_jobs=4, cv=skf, verbose=3, random_state=1001)

# `ravel()` flattens the (n, 1) label array into the 1-D shape expected by fit.
random_search.fit(X_train, y_train.ravel())

Fitting 5 folds for each of 5 candidates, totalling 25 fits
RandomizedSearchCV(cv=<generator object _BaseKFold.split at 0x7f9e54a51820>,
                   estimator=XGBClassifier(learning_rate=0.02, n_estimators=600,
                                           nthread=1, silent=True),
                   n_iter=5, n_jobs=4,
                   param_distributions={'colsample_bytree': [0.6, 0.7, 0.8,
                                                             0.9],
                                        'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],
                                        'learning_rate': [0.01, 0.02, 0.05,
                                                          0.1],
                                        'max_depth': [3, 5, 7, 9],
                                        'min_child_weight': [1, 3, 5],
                                        'n_estimators': [100, 200, 500, 750,
                                                         1000],
                                        'reg_alpha': [0, 0.001, 0.005, 0.01,
                                                      0.05, 0.1, 1],
                                        'subsample': [0.6, 0.7, 0.8, 0.9]},
                   random_state=1001, scoring='roc_auc', verbose=3)

# Report the outcome of the search: each heading is followed by its value.
print('\n All results:')
print(random_search.cv_results_)
print('\n Best estimator:')
print(random_search.best_estimator_)
print('\n Best normalized gini score for %d-fold search with %d parameter combinations:' % (folds, param_comb))
# Normalized Gini is derived from the best mean ROC AUC as 2 * AUC - 1.
print(random_search.best_score_ * 2 - 1)
print('\n Best hyperparameters:')
print(random_search.best_params_)

# Keep the full cross-validation table on disk for later inspection.
results = pd.DataFrame(random_search.cv_results_)
results.to_csv('xgb-random-grid-search-results-01.csv', index=False)

 All results: {'mean_fit_time': array([0.85436773, 0.12571445, 0.29614768, 0.40871468, 0.56435437]), 'std_fit_time': array([0.05036582, 0.03244128, 0.06253441, 0.10844844, 0.06243001]), 'mean_score_time': array([0.01597557, 0.01393166, 0.01356397, 0.01151047, 0.00780344]), 'std_score_time': array([0.00593373, 0.00377552, 0.00399773, 0.00429262, 0.0047163 ]), 'param_subsample': masked_array(data=[0.7, 0.7, 0.8, 0.8, 0.9],              mask=[False, False, False, False, False],        fill_value='?',             dtype=object), 'param_reg_alpha': masked_array(data=[1, 1, 0.001, 0.1, 1],              mask=[False, False, False, False, False],        fill_value='?',             dtype=object), 'param_n_estimators': masked_array(data=[750, 100, 200, 500, 750],              mask=[False, False, False, False, False],        fill_value='?',             dtype=object), 'param_min_child_weight': masked_array(data=[1, 1, 5, 5, 3],              mask=[False, False, False, False, False],        fill_value='?',             dtype=object), 'param_max_depth': masked_array(data=[3, 3, 5, 7, 7],              mask=[False, False, False, False, False],        fill_value='?',             dtype=object), 'param_learning_rate': masked_array(data=[0.01, 0.1, 0.01, 0.05, 0.05],              mask=[False, False, False, False, False],        fill_value='?',             dtype=object), 'param_gamma': masked_array(data=[0.2, 0.2, 0.2, 0.4, 0.4],              mask=[False, False, False, False, False],        fill_value='?',             dtype=object), 'param_colsample_bytree': masked_array(data=[0.8, 0.7, 0.9, 0.7, 0.8],              mask=[False, False, False, False, False],        fill_value='?',             dtype=object), 'params': [{'subsample': 0.7, 'reg_alpha': 1, 'n_estimators': 750, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.01, 'gamma': 0.2, 'colsample_bytree': 0.8}, {'subsample': 0.7, 'reg_alpha': 1, 'n_estimators': 100, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.1, 
'gamma': 0.2, 'colsample_bytree': 0.7}, {'subsample': 0.8, 'reg_alpha': 0.001, 'n_estimators': 200, 'min_child_weight': 5, 'max_depth': 5, 'learning_rate': 0.01, 'gamma': 0.2, 'colsample_bytree': 0.9}, {'subsample': 0.8, 'reg_alpha': 0.1, 'n_estimators': 500, 'min_child_weight': 5, 'max_depth': 7, 'learning_rate': 0.05, 'gamma': 0.4, 'colsample_bytree': 0.7}, {'subsample': 0.9, 'reg_alpha': 1, 'n_estimators': 750, 'min_child_weight': 3, 'max_depth': 7, 'learning_rate': 0.05, 'gamma': 0.4, 'colsample_bytree': 0.8}], 'split0_test_score': array([1., 1., 1., 1., 1.]), 'split1_test_score': array([1., 1., 1., 1., 1.]), 'split2_test_score': array([1., 1., 1., 1., 1.]), 'split3_test_score': array([1., 1., 1., 1., 1.]), 'split4_test_score': array([1., 1., 1., 1., 1.]), 'mean_test_score': array([1., 1., 1., 1., 1.]), 'std_test_score': array([0.00000000e+00, 4.96506831e-17, 4.96506831e-17, 4.96506831e-17,        0.00000000e+00]), 'rank_test_score': array([1, 1, 1, 1, 1], dtype=int32)}   Best estimator: XGBClassifier(colsample_bytree=0.8, gamma=0.2, learning_rate=0.01,               n_estimators=750, nthread=1, reg_alpha=1, silent=True,               subsample=0.7)   Best normalized gini score for 5-fold search with 5 parameter combinations: 1.0   Best hyperparameters: {'subsample': 0.7, 'reg_alpha': 1, 'n_estimators': 750, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.01, 'gamma': 0.2, 'colsample_bytree': 0.8}

# Predict the held-out rows with the fitted search object and print
# per-class precision, recall and F1.
y_pred = random_search.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       100
         1.0       1.00      1.00      1.00       100

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200

# Pairwise scatter plots of the selected columns from the original
# (non-resampled) data, coloured by Outcome.
selected_columns_frame = df[best_correlated_columns]
sns.pairplot(selected_columns_frame, hue="Outcome")
