INTRODUCTION
The Titanic was a British passenger liner that sank in the early hours of 15 April 1912, after striking an iceberg on the night of 14 April. It is one of the most famous tragedies in the history of mankind, and many films have been made about it.
Many studies have tried to determine the causes of the disaster. A great deal of data was collected about the passengers and the ship, and part of it is available to us in the form of a dataset, popularly known as the Titanic dataset.
IMAGE CREDIT: david_do-1229468 @ Pixabay
IMPORT THE ESSENTIAL LIBRARIES
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
DEFINE THE ESSENTIAL FUNCTIONS
This function fills the null values in a column with that column's median.
def fill_null_values_with_median(data_frame, column_name):
    data_frame = data_frame.copy()
    data_frame[column_name].fillna(data_frame[column_name].median(), inplace=True)
    return data_frame
This function is used to drop a column.
def drop_columns(data_frame, column_name):
    data_frame = data_frame.copy()
    data_frame.drop(column_name, axis=1, inplace=True)
    return data_frame
To get the names of numeric columns.
def numerical_columns(data_frame):
    column_names = data_frame.columns
    numeric_columns = list()
    for i in column_names:
        try:
            # Columns whose first value can be cast to float are treated as numeric.
            float(data_frame[i].iloc[0])
            numeric_columns.append(i)
        except (ValueError, TypeError):
            continue
    return numeric_columns
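As an aside, pandas can identify numeric columns directly with select_dtypes; the sketch below is an optional alternative to the loop above (the function name numerical_columns_builtin is just an illustrative choice, not part of the original code).

# Alternative sketch: let pandas pick the numeric columns itself.
def numerical_columns_builtin(data_frame):
    return data_frame.select_dtypes(include='number').columns.tolist()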
To perform label encoding.
def label_encoding(data_frame, column_name):
    data_frame = data_frame.copy()
    column_unique_values = data_frame[column_name].unique()
    for i in range(len(column_unique_values)):
        data_frame[column_name].replace(column_unique_values[i], i, inplace=True)
    return data_frame
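Scikit-learn's LabelEncoder, already imported above, can do the same job. A hedged sketch follows; note that LabelEncoder assigns codes in sorted order, whereas the function above assigns them in order of appearance, so the integer labels may differ.

# Alternative sketch using scikit-learn's LabelEncoder.
# astype(str) turns missing values into the string 'nan' so the encoder does not fail on them.
def label_encoding_sklearn(data_frame, column_name):
    data_frame = data_frame.copy()
    encoder = LabelEncoder()
    data_frame[column_name] = encoder.fit_transform(data_frame[column_name].astype(str))
    return data_frame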
To perform one-hot encoding.
def one_hot_encoding(dataframe, column_name):
    dataframe = dataframe.copy()
    column_unique_values = dataframe[column_name].unique()
    # Create an indicator column for every unique value except the last one,
    # which is left out as the baseline category.
    for i in range(len(column_unique_values) - 1):
        encoded_column_name = f'{column_name}_{column_unique_values[i]}'
        dataframe[encoded_column_name] = 0
        dataframe.loc[dataframe[column_name] == column_unique_values[i], encoded_column_name] = 1
    dataframe = dataframe.drop([column_name], axis=1)
    return dataframe
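pandas also offers get_dummies for this. The sketch below is a rough equivalent; with drop_first=True it drops the first category as the baseline, whereas the function above drops the last one, so the surviving dummy columns differ.

# Alternative sketch using pandas' built-in get_dummies (drops the first category as the baseline).
def one_hot_encoding_builtin(dataframe, column_name):
    return pd.get_dummies(dataframe, columns=[column_name], drop_first=True)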
To get the features based on correlation.
def best_correlated_features(data_frame, output_variable, max_corr=0.25, min_corr=-0.25):
    # Keep the features whose correlation with the output variable is at least max_corr
    # (strong positive) or at most min_corr (strong negative).
    data_frame = data_frame.copy()
    correlation_with_output = pd.DataFrame(data_frame.corr()[output_variable])
    column_names = correlation_with_output.index
    column_values = correlation_with_output.values
    best_columns = list()
    for i, j in zip(column_names, column_values):
        if j >= max_corr:
            best_columns.append(i)
        elif j <= min_corr:
            best_columns.append(i)
    return best_columns
To get the information about the dataset. It provides the feature names, number of null values, and the data type associated with each feature.
def dataset_info(data_frame):
    data_frame = data_frame.copy()
    null_dataset = data_frame.isnull().sum()
    data_type_dataset = data_frame.dtypes
    indices = null_dataset.index
    null_values = null_dataset.values
    data_type_values = data_type_dataset.values
    dataset_info_dict = {'features': indices, 'null_values': null_values, 'data_type': data_type_values}
    print(pd.DataFrame(dataset_info_dict).head(len(indices)))
IMPORT THE DATASET
First, we load the train and test sets.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
dataset_info(train)
features null_values data_type
0 PassengerId 0 int64
1 Survived 0 int64
2 Pclass 0 int64
3 Name 0 object
4 Sex 0 object
5 Age 177 float64
6 SibSp 0 int64
7 Parch 0 int64
8 Ticket 0 object
9 Fare 0 float64
10 Cabin 687 object
11 Embarked 2 object
dataset_info(test)
features null_values data_type
0 PassengerId 0 int64
1 Pclass 0 int64
2 Name 0 object
3 Sex 0 object
4 Age 86 float64
5 SibSp 0 int64
6 Parch 0 int64
7 Ticket 0 object
8 Fare 1 float64
9 Cabin 327 object
10 Embarked 0 object
Fill the missing values in the Age and Fare features with their medians.
for column_name in ['Age', 'Fare']:
    train = fill_null_values_with_median(train, column_name)
    test = fill_null_values_with_median(test, column_name)
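As an optional sanity check, we can confirm that no null values remain in these two columns.

# Both prints should report zero missing values for Age and Fare.
print(train[['Age', 'Fare']].isnull().sum())
print(test[['Age', 'Fare']].isnull().sum())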
Drop the unnecessary columns.
columns_to_drop = ['PassengerId', 'Cabin', 'Ticket']
for column_name in columns_to_drop:
    train = drop_columns(train, column_name)
    test = drop_columns(test, column_name)
Creating new features from the existing ones.
data_frames = [train, test]
for data_frame in data_frames:
    data_frame['FamilySize'] = data_frame['SibSp'] + data_frame['Parch'] + 1
    data_frame['IsAlone'] = 1
    data_frame.loc[data_frame['FamilySize'] > 1, 'IsAlone'] = 0
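An optional peek at the engineered columns confirms the logic: FamilySize counts the passenger plus siblings/spouses and parents/children, and IsAlone is 1 only when FamilySize is 1.

# Inspect the engineered features for the first few passengers.
print(train[['SibSp', 'Parch', 'FamilySize', 'IsAlone']].head())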
Performing label encoding.
train = label_encoding(train, 'Sex')
test = label_encoding(test, 'Sex')
train = label_encoding(train, 'Embarked')
test = label_encoding(test, 'Embarked')
numerical_cols = numerical_columns(train)
print(numerical_cols)
['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'FamilySize', 'IsAlone']
Keep only the numerical columns for training.
train_numerical = train.loc[:, numerical_cols]
test_numerical = test.loc[:, [i for i in numerical_cols if i != 'Survived']]
Plot the correlation heatmap.
plt.figure(figsize=(10, 7))
cmap_value = 'CMRmap_r'
sns.heatmap(train_numerical.corr(), annot = True, cmap = cmap_value)
plt.show()
best_correlated_columns = best_correlated_features(train_numerical, 'Survived', 0.20, -0.20)
print(best_correlated_columns)
['Survived', 'Pclass', 'Sex', 'Fare', 'IsAlone']
Use the features selected through correlation.
X_train = train_numerical.loc[:, best_correlated_columns]
X_test = test_numerical.loc[:, [i for i in best_correlated_columns if i != 'Survived']]
Perform one-hot encoding.
X_train = one_hot_encoding(X_train, 'Pclass')
X_test = one_hot_encoding(X_test, 'Pclass')
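One caveat: because the train and test sets are encoded independently, the dropped baseline category can differ between the two splits, leaving them with different Pclass_* columns. A quick defensive check (an illustrative addition, not part of the original pipeline; feature_columns is a hypothetical helper name):

# Report any dummy columns that exist in one split but not the other.
feature_columns = [c for c in X_train.columns if c != 'Survived']
print('Columns only in train:', set(feature_columns) - set(X_test.columns))
print('Columns only in test:', set(X_test.columns) - set(feature_columns))

If a mismatch is reported, a common remedy is to encode both splits against a shared category list (for example with pd.get_dummies after casting the column to a fixed categorical dtype) so that the columns line up.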
Checking whether the data is balanced.
print(X_train.Survived.value_counts())
0 549
1 340
Name: Survived, dtype: int64
The data is imbalanced, so we need to balance it.
y_train = X_train.loc[:, ['Survived']]
X_train = X_train.drop('Survived', axis = 1)
Performing SMOTE oversampling.
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state = 2)
X_train, y_train = sm.fit_resample(X_train, y_train)
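After resampling, both classes should contain the same number of rows. A quick check (np.ravel is used so the snippet works whether SMOTE returns a Series, a one-column DataFrame, or an array):

# Verify that SMOTE oversampled the minority class up to the size of the majority class.
print(pd.Series(np.ravel(y_train)).value_counts())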
Perform min-max scaling.
# scale features
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
model = scaler.fit(X_train)
X_train = model.transform(X_train)
# Note: the scaler is refit on the test set here; the more common practice is to reuse
# the scaler fitted on the training data for the test data as well.
model = scaler.fit(X_test)
X_test = model.transform(X_test)
Defining parameters for the randomized search.
params = {
    'n_estimators': [100, 200, 500, 750, 1000],
    'max_depth': [3, 5, 7, 9],
    'min_child_weight': [1, 3, 5],
    'gamma': [i/10.0 for i in range(0, 5)],
    'subsample': [i/10.0 for i in range(6, 10)],
    'colsample_bytree': [i/10.0 for i in range(6, 10)],
    'reg_alpha': [0, 0.001, 0.005, 0.01, 0.05, 0.1, 1],
    'learning_rate': [0.01, 0.02, 0.05, 0.1]
}
Using XGBoost for classification.
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV

xgb = XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic',
                    silent=True, nthread=1)
folds = 3
param_comb = 5
skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 42)
random_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=param_comb, scoring='roc_auc', n_jobs=4, cv=skf.split(X_train,y_train), verbose=3, random_state=1001 )
random_search.fit(X_train, y_train.values.ravel())
Fitting 3 folds for each of 5 candidates, totalling 15 fits
RandomizedSearchCV(cv=<generator object _BaseKFold.split at 0x7f7d0ea81c10>, estimator=XGBClassifier(learning_rate=0.02, n_estimators=600, nthread=1, silent=True), n_iter=5, n_jobs=4, param_distributions={'colsample_bytree': [0.6, 0.7, 0.8, 0.9], 'gamma': [0.0, 0.1, 0.2, 0.3, 0.4], 'learning_rate': [0.01, 0.02, 0.05, 0.1], 'max_depth': [3, 5, 7, 9], 'min_child_weight': [1, 3, 5], 'n_estimators': [100, 200, 500, 750, 1000], 'reg_alpha': [0, 0.001, 0.005, 0.01, 0.05, 0.1, 1], 'subsample': [0.6, 0.7, 0.8, 0.9]}, random_state=1001, scoring='roc_auc', verbose=3)
The final results after the randomized search. The normalized Gini score printed below is derived from the ROC AUC of the best model as Gini = 2 * AUC - 1.
print('\n All results:')
print(random_search.cv_results_)
print('\n Best estimator:')
print(random_search.best_estimator_)
print('\n Best normalized gini score for %d-fold search with %d parameter combinations:' % (folds, param_comb))
print(random_search.best_score_ * 2 - 1)
print('\n Best hyperparameters:')
print(random_search.best_params_)
results = pd.DataFrame(random_search.cv_results_)
results.to_csv('xgb-random-grid-search-results-01.csv', index=False)
All results:
{'mean_fit_time': array([1.29787787, 0.15219148, 0.50298834, 1.13052392, 1.52190614]), 'std_fit_time': array([0.03484256, 0.03479486, 0.0074278 , 0.02992335, 0.22400016]), 'mean_score_time': array([0.02945463, 0.00742928, 0.0113128 , 0.0265882 , 0.02289049]), 'std_score_time': array([0.00512708, 0.00290971, 0.00325435, 0.00048981, 0.00964618]), 'param_subsample': masked_array(data=[0.7, 0.7, 0.8, 0.8, 0.9], mask=[False, False, False, False, False], fill_value='?', dtype=object), 'param_reg_alpha': masked_array(data=[1, 1, 0.001, 0.1, 1], mask=[False, False, False, False, False], fill_value='?', dtype=object), 'param_n_estimators': masked_array(data=[750, 100, 200, 500, 750], mask=[False, False, False, False, False], fill_value='?', dtype=object), 'param_min_child_weight': masked_array(data=[1, 1, 5, 5, 3], mask=[False, False, False, False, False], fill_value='?', dtype=object), 'param_max_depth': masked_array(data=[3, 3, 5, 7, 7], mask=[False, False, False, False, False], fill_value='?', dtype=object), 'param_learning_rate': masked_array(data=[0.01, 0.1, 0.01, 0.05, 0.05], mask=[False, False, False, False, False], fill_value='?', dtype=object), 'param_gamma': masked_array(data=[0.2, 0.2, 0.2, 0.4, 0.4], mask=[False, False, False, False, False], fill_value='?', dtype=object), 'param_colsample_bytree': masked_array(data=[0.8, 0.7, 0.9, 0.7, 0.8], mask=[False, False, False, False, False], fill_value='?', dtype=object), 'params': [{'subsample': 0.7, 'reg_alpha': 1, 'n_estimators': 750, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.01, 'gamma': 0.2, 'colsample_bytree': 0.8}, {'subsample': 0.7, 'reg_alpha': 1, 'n_estimators': 100, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 0.2, 'colsample_bytree': 0.7}, {'subsample': 0.8, 'reg_alpha': 0.001, 'n_estimators': 200, 'min_child_weight': 5, 'max_depth': 5, 'learning_rate': 0.01, 'gamma': 0.2, 'colsample_bytree': 0.9}, {'subsample': 0.8, 'reg_alpha': 0.1, 'n_estimators': 500, 'min_child_weight': 5, 'max_depth': 7, 'learning_rate': 0.05, 'gamma': 0.4, 'colsample_bytree': 0.7}, {'subsample': 0.9, 'reg_alpha': 1, 'n_estimators': 750, 'min_child_weight': 3, 'max_depth': 7, 'learning_rate': 0.05, 'gamma': 0.4, 'colsample_bytree': 0.8}], 'split0_test_score': array([0.86044671, 0.85973006, 0.84509839, 0.87727313, 0.87819881]), 'split1_test_score': array([0.86803129, 0.87425722, 0.85993908, 0.878587 , 0.88733614]), 'split2_test_score': array([0.88873959, 0.88744065, 0.88081161, 0.88497716, 0.89151662]), 'mean_test_score': array([0.87240586, 0.87380931, 0.86194969, 0.8802791 , 0.88568386]), 'std_test_score': array([0.01195754, 0.01131723, 0.01464901, 0.00336505, 0.00556109]), 'rank_test_score': array([4, 3, 5, 2, 1], dtype=int32)}

Best estimator:
XGBClassifier(colsample_bytree=0.8, gamma=0.4, learning_rate=0.05, max_depth=7, min_child_weight=3, n_estimators=750, nthread=1, reg_alpha=1, silent=True, subsample=0.9)

Best normalized gini score for 3-fold search with 5 parameter combinations:
0.7713677127813117

Best hyperparameters:
{'subsample': 0.9, 'reg_alpha': 1, 'n_estimators': 750, 'min_child_weight': 3, 'max_depth': 7, 'learning_rate': 0.05, 'gamma': 0.4, 'colsample_bytree': 0.8}
random_search.best_score_
0.8856838563906558
y_pred = random_search.predict(X_test)
final_information = {
    'Passenger': [i for i in range(892, 892 + len(y_pred))],
    'Survived': y_pred
}

pd.DataFrame(final_information).to_csv('submission.csv', index=False)
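If the file is intended for the Kaggle Titanic competition, note that the expected submission header is PassengerId rather than Passenger. A small optional rename (an assumption about the target platform, not part of the original code):

# Rename the id column to match Kaggle's expected header: PassengerId, Survived.
submission = pd.DataFrame(final_information).rename(columns={'Passenger': 'PassengerId'})
submission.to_csv('submission.csv', index=False)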
If you are looking for help with any project, contact us at contact@codersarts.com.