Import Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

The Data

# Load the Titanic training set from a CSV file given by relative path,
# then preview the first rows.
train = pd.read_csv('titanic_train.csv')
train.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S

Missing Data

 train.isnull() # Boolean frame: True marks a null (missing) value in that cell
    
# Reading this table by eye does not scale to a large data set
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 False False False False False False False False False False True False
1 False False False False False False False False False False False False
2 False False False False False False False False False False True False
3 False False False False False False False False False False False False
4 False False False False False False False False False False True False
... ... ... ... ... ... ... ... ... ... ... ... ...
886 False False False False False False False False False False True False
887 False False False False False False False False False False False False
888 False False False False False True False False False False True False
889 False False False False False False False False False False False False
890 False False False False False False False False False False True False

891 rows × 12 columns

# Draw the null mask as a heatmap so missing values stand out per column.
# Most of the nulls are in the Age and Cabin columns.
sns.heatmap(train.isnull(), cmap='viridis', cbar=False, yticklabels=False)
<matplotlib.axes._subplots.AxesSubplot at 0x19a91cd84c8>
# Count of passengers for each value of Survived.
sns.set_style('whitegrid')
sns.countplot(data=train, x='Survived')
<matplotlib.axes._subplots.AxesSubplot at 0x19a91d397c8>
# Same survival counts, split by Sex.
sns.set_style('whitegrid')
sns.countplot(data=train, x='Survived', hue='Sex', palette='RdBu_r')
<matplotlib.axes._subplots.AxesSubplot at 0x19a92b67d88>
# Same survival counts, split by passenger class.
sns.set_style('whitegrid')
sns.countplot(data=train, x='Survived', hue='Pclass', palette='rainbow')
<matplotlib.axes._subplots.AxesSubplot at 0x19a92c2ee88>
# Histogram of the known ages (nulls dropped first), 40 bins, no KDE curve.
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11 in favour of
# histplot/displot - switch when the environment's seaborn is upgraded.
sns.distplot(train['Age'].dropna(), kde=False, color='darkred', bins=40)
<matplotlib.axes._subplots.AxesSubplot at 0x19a92ca32c8>
# The same Age column drawn with the pandas histogram helper, 30 bins.
train['Age'].hist(alpha=0.3, bins=30, color='blue')
<matplotlib.axes._subplots.AxesSubplot at 0x19a92daeb88>
# Count of passengers for each SibSp value.
sns.countplot(data=train, x='SibSp')
<matplotlib.axes._subplots.AxesSubplot at 0x19a92e6f2c8>
# Distribution of Fare, 30 bins.
train['Fare'].hist(alpha=0.4, bins=30, color='green')
<matplotlib.axes._subplots.AxesSubplot at 0x19a92ed4e48>

Data Cleaning

# Compare the spread of Age across the passenger classes on a wide figure.
plt.figure(figsize=(12, 7))
sns.boxplot(data=train, x='Pclass', y='Age', palette='winter')
<matplotlib.axes._subplots.AxesSubplot at 0x19a92c2e5c8>

We can see that the wealthier passengers in the higher classes tend to be older, which makes sense. We'll use these average age values to impute missing ages based on Pclass.

def impute_age(cols):
    """Return the age for one passenger, filling a missing age by class.

    Parameters
    ----------
    cols : sequence
        The passenger's age followed by the passenger class, in that order,
        e.g. one row of ``train[['Age', 'Pclass']]`` handed over by
        ``DataFrame.apply(..., axis=1)``. Any further items are ignored.

    Returns
    -------
    The age unchanged when it is not null; otherwise 37 for class 1,
    29 for class 2 and 24 for every other class.
    """
    # Unpack by iteration instead of ``cols[0]`` / ``cols[1]``: integer keys
    # on a label-indexed Series only work through a deprecated positional
    # fallback, while iteration works for Series, lists and tuples alike.
    age, pclass, *_ = cols

    if not pd.isnull(age):
        return age

    if pclass == 1:
        return 37
    if pclass == 2:
        return 29
    return 24
# Run impute_age over every (Age, Pclass) row and write the result back to Age.
age_class_pairs = train[['Age', 'Pclass']]
train['Age'] = age_class_pairs.apply(impute_age, axis=1)

# Re-draw the null heatmap after imputing Age.
# Age should no longer show any missing values; Cabin still has many.
sns.heatmap(train.isnull(),yticklabels = False, cbar =False, cmap='viridis')
<matplotlib.axes._subplots.AxesSubplot at 0x19a931ec508>

Cabin has so many NaN values that handling it would take a lot of feature engineering, so we'll drop it for now.

# Remove the Cabin column in place, then confirm on the null heatmap.
train.drop(columns='Cabin', inplace=True)
sns.heatmap(train.isnull(), cmap='viridis', cbar=False, yticklabels=False)
<matplotlib.axes._subplots.AxesSubplot at 0x19a93031ac8>
# Discard, in place, every row that still contains a null, then re-check
# the heatmap.
train.dropna(axis=0, how='any', inplace=True)
sns.heatmap(train.isnull(), cmap='viridis', cbar=False, yticklabels=False)
<matplotlib.axes._subplots.AxesSubplot at 0x19a930b6148>
# Preview the frame after Cabin and the remaining null rows were dropped.
train.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 S

Converting Categorical Features

We'll need to convert categorical features to dummy variables using pandas; otherwise our machine learning algorithm won't be able to take those features directly as inputs.

# Print each column's dtype and non-null count to spot the object columns.
train.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  889 non-null    int64  
 1   Survived     889 non-null    int64  
 2   Pclass       889 non-null    int64  
 3   Name         889 non-null    object 
 4   Sex          889 non-null    object 
 5   Age          889 non-null    float64
 6   SibSp        889 non-null    int64  
 7   Parch        889 non-null    int64  
 8   Ticket       889 non-null    object 
 9   Fare         889 non-null    float64
 10  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 83.3+ KB
# Preview the dummy encoding of Embarked; drop_first=True leaves out the
# column for the first category.
pd.get_dummies(train['Embarked'],drop_first=True).head()
Q S
0 0 1
1 0 0
2 0 1
3 0 1
4 0 1
# Build indicator columns for Sex and Embarked, leaving out the first
# category of each.
sex, embark = (
    pd.get_dummies(train[column], drop_first=True)
    for column in ('Sex', 'Embarked')
)

# Drop the raw text columns in place; Name and Ticket are removed without
# being encoded.
train.drop(columns=['Sex', 'Embarked', 'Name', 'Ticket'], inplace=True)
train.head()
PassengerId Survived Pclass Age SibSp Parch Fare
0 1 0 3 22.0 1 0 7.2500
1 2 1 1 38.0 1 0 71.2833
2 3 1 3 26.0 0 0 7.9250
3 4 1 1 35.0 1 0 53.1000
4 5 0 3 35.0 0 0 8.0500
# Attach the indicator columns next to the remaining features.
train = pd.concat((train, sex, embark), axis='columns')
train.head()
PassengerId Survived Pclass Age SibSp Parch Fare male Q S
0 1 0 3 22.0 1 0 7.2500 1 0 1
1 2 1 1 38.0 1 0 71.2833 0 0 0
2 3 1 3 26.0 0 0 7.9250 0 0 1
3 4 1 1 35.0 1 0 53.1000 0 0 1
4 5 0 3 35.0 0 0 8.0500 1 0 1

Building a Logistic Regression Model

Train Test Split

# Preview the feature columns: everything except the Survived target.
# drop() returns a new frame here, so train itself is left unchanged.
train.drop('Survived',axis=1).head()
PassengerId Pclass Age SibSp Parch Fare male Q S
0 1 3 22.0 1 0 7.2500 1 0 1
1 2 1 38.0 1 0 71.2833 0 0 0
2 3 3 26.0 0 0 7.9250 0 0 1
3 4 1 35.0 1 0 53.1000 0 0 1
4 5 3 35.0 0 0 8.0500 1 0 1
# Preview the target column.
train['Survived'].head()
0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split reproducible.
features = train.drop('Survived', axis=1)
target = train['Survived']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.30, random_state=101
)

Training and Predicting

from sklearn.linear_model import LogisticRegression

# With the default max_iter=100 the lbfgs solver stops before converging on
# these unscaled features and emits a ConvergenceWarning, so raise the
# iteration limit. Scaling the features first is the other remedy.
logmodel = LogisticRegression(max_iter=1000)
logmodel.fit(X_train, y_train)
C:\Users\mrsid\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:940: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
# Predict a class label for every held-out row.
predictions = logmodel.predict(X_test)
from sklearn.metrics import confusion_matrix
# NOTE: despite its name, `accuracy` holds the confusion matrix at this
# point; the same name is rebound to the accuracy score further down.
accuracy = confusion_matrix(y_test,predictions)
accuracy
array([[149,  14],
       [ 39,  65]], dtype=int64)
from sklearn.metrics import accuracy_score

# Fraction of held-out rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
accuracy
0.8014981273408239
# Display the predicted label for each row of X_test.
predictions
array([0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 1], dtype=int64)