Import Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

The Data

train = pd.read_csv('titanic_train.csv')

train.head()

Missing Data

 train.isnull() # if True indicates a null value
    
# Reading this boolean table by eye does not scale, since the data set can be large; a heatmap is easier to read

sns.heatmap(train.isnull(),yticklabels = False, cbar =False, cmap='viridis') # so most of the null values are present in age and cabin

<matplotlib.axes._subplots.AxesSubplot at 0x19a91cd84c8>

sns.set_style('whitegrid')
sns.countplot(x='Survived',data=train)

<matplotlib.axes._subplots.AxesSubplot at 0x19a91d397c8>

sns.set_style('whitegrid')
sns.countplot(x='Survived',hue='Sex',data=train, palette='RdBu_r')

<matplotlib.axes._subplots.AxesSubplot at 0x19a92b67d88>

sns.set_style('whitegrid')
sns.countplot(x='Survived',hue='Pclass',data=train, palette='rainbow')

<matplotlib.axes._subplots.AxesSubplot at 0x19a92c2ee88>

# Histogram of the known ages (missing values dropped first): 40 bins, no KDE curve.
# NOTE(review): newer seaborn releases deprecate distplot in favour of histplot/displot;
# confirm the installed seaborn version before switching.
sns.distplot(train['Age'].dropna(), kde=False, color='darkred', bins=40)

<matplotlib.axes._subplots.AxesSubplot at 0x19a92ca32c8>

train['Age'].hist(bins=30, color='blue',alpha=0.3)

<matplotlib.axes._subplots.AxesSubplot at 0x19a92daeb88>

sns.countplot(x='SibSp', data=train)

<matplotlib.axes._subplots.AxesSubplot at 0x19a92e6f2c8>

train['Fare'].hist(bins=30,color='green',alpha=0.4)

<matplotlib.axes._subplots.AxesSubplot at 0x19a92ed4e48>

Data Cleaning

plt.figure(figsize=(12,7))
sns.boxplot(x='Pclass',y='Age',data=train,palette='winter')

<matplotlib.axes._subplots.AxesSubplot at 0x19a92c2e5c8>

We can see that the wealthier passengers in the higher classes tend to be older, which makes sense. We'll use these average age values to impute the missing ages based on Pclass.

def impute_age(cols):
    """Return a passenger's age, filling a missing value from the ticket class.

    Parameters
    ----------
    cols : pandas.Series or sequence
        Two values in positional order: the age first, the passenger
        class (Pclass) second.

    Returns
    -------
    The age unchanged when it is present; otherwise 37 for class 1,
    29 for class 2 and 24 for any other class.
    """
    # Read both fields strictly by position. Plain ``cols[0]`` on a row Series
    # labelled 'Age'/'Pclass' relies on pandas' deprecated positional fallback,
    # so use .iloc for a Series and ordinary indexing for lists/tuples.
    if isinstance(cols, pd.Series):
        age, pclass = cols.iloc[0], cols.iloc[1]
    else:
        age, pclass = cols[0], cols[1]

    # A known age is returned untouched.
    if not pd.isnull(age):
        return age

    # Hard-coded per-class replacement ages for missing values.
    if pclass == 1:
        return 37
    if pclass == 2:
        return 29
    return 24

# Replace every missing Age with its class-based default, one row at a time.
age_and_class = train[['Age', 'Pclass']]
train['Age'] = age_and_class.apply(impute_age, axis=1)

# Re-draw the missing-value heatmap to confirm that the Age imputation worked.

sns.heatmap(train.isnull(),yticklabels = False, cbar =False, cmap='viridis') # Age should now be fully populated; the remaining nulls should be mainly in Cabin

<matplotlib.axes._subplots.AxesSubplot at 0x19a931ec508>

Cabin has so many NaN values that handling it properly would take a lot of feature engineering, so we'll drop it for now.

train.drop('Cabin',axis=1,inplace=True)

sns.heatmap(train.isnull(),yticklabels = False, cbar =False, cmap='viridis')

<matplotlib.axes._subplots.AxesSubplot at 0x19a93031ac8>

train.dropna(inplace=True)

sns.heatmap(train.isnull(),yticklabels = False, cbar =False, cmap='viridis')

<matplotlib.axes._subplots.AxesSubplot at 0x19a930b6148>

train.head()

Converting Categorical Features

We'll need to convert the categorical features to dummy variables using pandas; otherwise our machine learning algorithm won't be able to take those features directly as inputs.

train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  889 non-null    int64  
 1   Survived     889 non-null    int64  
 2   Pclass       889 non-null    int64  
 3   Name         889 non-null    object 
 4   Sex          889 non-null    object 
 5   Age          889 non-null    float64
 6   SibSp        889 non-null    int64  
 7   Parch        889 non-null    int64  
 8   Ticket       889 non-null    object 
 9   Fare         889 non-null    float64
 10  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 83.3+ KB

pd.get_dummies(train['Embarked'],drop_first=True).head()

sex = pd.get_dummies(train['Sex'],drop_first=True)
embark = pd.get_dummies(train['Embarked'],drop_first=True)

train.drop(['Sex','Embarked','Name','Ticket'],axis=1,inplace=True)

train.head()

train = pd.concat([train,sex,embark],axis=1)

train.head()

Building a Logistic Regression Model

Train Test Split

train.drop('Survived',axis=1).head()

train['Survived'].head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

from sklearn.model_selection import train_test_split

# Separate the predictors from the label, then hold out 30% of the rows for
# testing; the fixed random_state keeps the split reproducible.
features = train.drop('Survived', axis=1)
target = train['Survived']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.30,
    random_state=101,
)

Training and Predicting

from sklearn.linear_model import LogisticRegression

# The default max_iter=100 leaves the lbfgs solver unconverged on these
# unscaled features (it raises a ConvergenceWarning), so give the solver a
# larger iteration budget.
logmodel = LogisticRegression(max_iter=1000)
logmodel.fit(X_train,y_train)

C:\Users\mrsid\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:940: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

predictions = logmodel.predict(X_test)

from sklearn.metrics import confusion_matrix

# confusion_matrix returns per-class counts (rows: true label, columns:
# predicted label), not an accuracy score, so name the result accordingly.
conf_matrix = confusion_matrix(y_test, predictions)

conf_matrix

array([[149,  14],
       [ 39,  65]], dtype=int64)

from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test,predictions)
accuracy

0.8014981273408239

predictions

array([0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 1], dtype=int64)

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	PC 17599	71.2833	C85	C
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	STON/O2. 3101282	7.9250	NaN	S
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	113803	53.1000	C123	S
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	373450	8.0500	NaN	S

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	A/5 21171	7.2500	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	PC 17599	71.2833	C
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	STON/O2. 3101282	7.9250	S
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	113803	53.1000	S
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	373450	8.0500	S

	PassengerId	Survived	Pclass	Age	SibSp	Fare
0	1	0	3	22.0	1	7.2500
1	2	1	1	38.0	1	71.2833
2	3	1	3	26.0	0	7.9250
3	4	1	1	35.0	1	53.1000
4	5	0	3	35.0	0	8.0500

	PassengerId	Survived	Pclass	Age	SibSp	Fare	male	S
0	1	0	3	22.0	1	7.2500	1	1
1	2	1	1	38.0	1	71.2833	0	0
2	3	1	3	26.0	0	7.9250	0	1
3	4	1	1	35.0	1	53.1000	0	1
4	5	0	3	35.0	0	8.0500	1	1

	PassengerId	Pclass	Age	SibSp	Fare	male	S
0	1	3	22.0	1	7.2500	1	1
1	2	1	38.0	1	71.2833	0	0
2	3	3	26.0	0	7.9250	0	1
3	4	1	35.0	1	53.1000	0	1
4	5	3	35.0	0	8.0500	1	1

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
0	False	False	False	False	False	False	False	False	False	False	True	False
1	False	False	False	False	False	False	False	False	False	False	False	False
2	False	False	False	False	False	False	False	False	False	False	True	False
3	False	False	False	False	False	False	False	False	False	False	False	False
4	False	False	False	False	False	False	False	False	False	False	True	False
...	...	...	...	...	...	...	...	...	...	...	...	...
886	False	False	False	False	False	False	False	False	False	False	True	False
887	False	False	False	False	False	False	False	False	False	False	False	False
888	False	False	False	False	False	True	False	False	False	False	True	False
889	False	False	False	False	False	False	False	False	False	False	False	False
890	False	False	False	False	False	False	False	False	False	False	True	False