import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
def drop_columns(data_frame, column_name):
data_frame = data_frame.copy()
data_frame.drop(column_name, axis=1, inplace = True)
return data_frame
def best_correlated_features(data_frame, output_variable, max_corr = 0.25, min_corr = -0.25):
data_frame = data_frame.copy()
correlation_with_output = pd.DataFrame(data_frame.corr()[f'{output_variable}'])
column_names = correlation_with_output.index
column_values = correlation_with_output.values
best_columns = list()
for i, j in zip(column_names, column_values):
if j >= max_corr:
elif j <= min_corr:
return best_columns
def display_correlation(data_frame):
plt.figure(figsize=(10, 7))
cmap_value = 'CMRmap_r'
sns.heatmap(data_frame.corr(), annot = True, cmap = cmap_value)
def dataset_info(data_frame):
data_frame = data_frame.copy()
null_dataset = data_frame.isnull().sum()
data_type_dataset = data_frame.dtypes
indices = null_dataset.index
null_values = null_dataset.values
data_type_values = data_type_dataset.values
dataset_info_dict = {'features': indices, 'null_values': null_values, 'data_type': data_type_values}
print(f'Shape of the data frame {data_frame.shape}')
df = pd.read_csv('diabetes.csv')
features null_values data_type
0 Pregnancies 0 int64
1 Glucose 0 int64
2 BloodPressure 0 int64
3 SkinThickness 0 int64
4 Insulin 0 int64
5 BMI 0 float64
6 DiabetesPedigreeFunction 0 float64
7 Age 0 int64
8 Outcome 0 int64
Shape of the data frame (768, 9)
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \
0 6 148 72 35 0 33.6
1 1 85 66 29 0 26.6
2 8 183 64 0 0 23.3
3 1 89 66 23 94 28.1
4 0 137 40 35 168 43.1
DiabetesPedigreeFunction Age Outcome
0 0.627 50 1
1 0.351 31 0
2 0.672 32 1
3 0.167 21 0
4 2.288 33 1
0 500
1 268
Name: Outcome, dtype: int64
best_correlated_columns = best_correlated_features(df, 'Outcome', 0.21, -0.20)
['Pregnancies', 'Glucose', 'BMI', 'Age', 'Outcome']
X = df.iloc[:, :-1]
y = df.loc[:, ['Outcome']]
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state = 2)
X, y = sm.fit_resample(X, y)
display_correlation(pd.concat([pd.DataFrame(X), y], axis = 1))
aug_df = pd.concat([pd.DataFrame(X), y], axis = 1)
best_correlated_columns = best_correlated_features(aug_df, 'Outcome', 0.21, -0.20)
['Glucose', 'BMI', 'Age', 'Outcome']
X = aug_df.loc[:, best_correlated_columns]
y = aug_df.loc[:, ['Outcome']]
scaler = MinMaxScaler()
X = model.transform(X)
scaler = MinMaxScaler()
y = model.transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
0.0 400
1.0 400
dtype: int64
params = {
'n_estimators': [100,200,500,750,1000],
'max_depth': [3,5,7,9],
'min_child_weight': [1,3,5],
'gamma':[i/10.0 for i in range(0,5)],
'subsample':[i/10.0 for i in range(6,10)],
'colsample_bytree':[i/10.0 for i in range(6,10)],
'reg_alpha':[0, 0.001, 0.005, 0.01, 0.05, 0.1, 1],
'learning_rate': [0.01, 0.02, 0.05, 0.1]
xgb = XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic',
silent=True, nthread=1)
folds = 5
param_comb = 5
skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 42)
random_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=param_comb, scoring='roc_auc', n_jobs=4, cv=skf.split(X_train,y_train), verbose=3, random_state=1001 )
random_search.fit(X_train, y_train.ravel())
Fitting 5 folds for each of 5 candidates, totalling 25 fits
RandomizedSearchCV(cv=<generator object _BaseKFold.split at 0x7f9e54a51820>,
estimator=XGBClassifier(learning_rate=0.02, n_estimators=600,
nthread=1, silent=True),
n_iter=5, n_jobs=4,
param_distributions={'colsample_bytree': [0.6, 0.7, 0.8,
'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],
'learning_rate': [0.01, 0.02, 0.05,
'max_depth': [3, 5, 7, 9],
'min_child_weight': [1, 3, 5],
'n_estimators': [100, 200, 500, 750,
'reg_alpha': [0, 0.001, 0.005, 0.01,
0.05, 0.1, 1],
'subsample': [0.6, 0.7, 0.8, 0.9]},
random_state=1001, scoring='roc_auc', verbose=3)
print('\n All results:')
print('\n Best estimator:')
print('\n Best normalized gini score for %d-fold search with %d parameter combinations:' % (folds, param_comb))
print(random_search.best_score_ * 2 - 1)
print('\n Best hyperparameters:')
results = pd.DataFrame(random_search.cv_results_)
results.to_csv('xgb-random-grid-search-results-01.csv', index=False)
All results: {'mean_fit_time': array([0.85436773, 0.12571445, 0.29614768, 0.40871468, 0.56435437]), 'std_fit_time': array([0.05036582, 0.03244128, 0.06253441, 0.10844844, 0.06243001]), 'mean_score_time': array([0.01597557, 0.01393166, 0.01356397, 0.01151047, 0.00780344]), 'std_score_time': array([0.00593373, 0.00377552, 0.00399773, 0.00429262, 0.0047163 ]), 'param_subsample': masked_array(data=[0.7, 0.7, 0.8, 0.8, 0.9], mask=[False, False, False, False, False], fill_value='?', dtype=object), 'param_reg_alpha': masked_array(data=[1, 1, 0.001, 0.1, 1], mask=[False, False, False, False, False], fill_value='?', dtype=object), 'param_n_estimators': masked_array(data=[750, 100, 200, 500, 750], mask=[False, False, False, False, False], fill_value='?', dtype=object), 'param_min_child_weight': masked_array(data=[1, 1, 5, 5, 3], mask=[False, False, False, False, False], fill_value='?', dtype=object), 'param_max_depth': masked_array(data=[3, 3, 5, 7, 7], mask=[False, False, False, False, False], fill_value='?', dtype=object), 'param_learning_rate': masked_array(data=[0.01, 0.1, 0.01, 0.05, 0.05], mask=[False, False, False, False, False], fill_value='?', dtype=object), 'param_gamma': masked_array(data=[0.2, 0.2, 0.2, 0.4, 0.4], mask=[False, False, False, False, False], fill_value='?', dtype=object), 'param_colsample_bytree': masked_array(data=[0.8, 0.7, 0.9, 0.7, 0.8], mask=[False, False, False, False, False], fill_value='?', dtype=object), 'params': [{'subsample': 0.7, 'reg_alpha': 1, 'n_estimators': 750, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.01, 'gamma': 0.2, 'colsample_bytree': 0.8}, {'subsample': 0.7, 'reg_alpha': 1, 'n_estimators': 100, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 0.2, 'colsample_bytree': 0.7}, {'subsample': 0.8, 'reg_alpha': 0.001, 'n_estimators': 200, 'min_child_weight': 5, 'max_depth': 5, 'learning_rate': 0.01, 'gamma': 0.2, 'colsample_bytree': 0.9}, {'subsample': 0.8, 'reg_alpha': 0.1, 'n_estimators': 500, 'min_child_weight': 5, 'max_depth': 7, 'learning_rate': 0.05, 'gamma': 0.4, 'colsample_bytree': 0.7}, {'subsample': 0.9, 'reg_alpha': 1, 'n_estimators': 750, 'min_child_weight': 3, 'max_depth': 7, 'learning_rate': 0.05, 'gamma': 0.4, 'colsample_bytree': 0.8}], 'split0_test_score': array([1., 1., 1., 1., 1.]), 'split1_test_score': array([1., 1., 1., 1., 1.]), 'split2_test_score': array([1., 1., 1., 1., 1.]), 'split3_test_score': array([1., 1., 1., 1., 1.]), 'split4_test_score': array([1., 1., 1., 1., 1.]), 'mean_test_score': array([1., 1., 1., 1., 1.]), 'std_test_score': array([0.00000000e+00, 4.96506831e-17, 4.96506831e-17, 4.96506831e-17, 0.00000000e+00]), 'rank_test_score': array([1, 1, 1, 1, 1], dtype=int32)} Best estimator: XGBClassifier(colsample_bytree=0.8, gamma=0.2, learning_rate=0.01, n_estimators=750, nthread=1, reg_alpha=1, silent=True, subsample=0.7) Best normalized gini score for 5-fold search with 5 parameter combinations: 1.0 Best hyperparameters: {'subsample': 0.7, 'reg_alpha': 1, 'n_estimators': 750, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.01, 'gamma': 0.2, 'colsample_bytree': 0.8}
y_pred = random_search.predict(X_test)
print(classification_report(y_test, y_pred))
precision recall f1-score support
0.0 1.00 1.00 1.00 100
1.0 1.00 1.00 1.00 100
accuracy 1.00 200
macro avg 1.00 1.00 1.00 200
weighted avg 1.00 1.00 1.00 200
sns.pairplot(df.loc[:, best_correlated_columns], hue="Outcome")
