EDA with Python and Applying Logistic Regression
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
train = pd.read_csv('titanic_train.csv')
train.head()
train.isnull() # True marks a missing value
# but checking cell by cell is impractical, as the dataset can be vast
sns.heatmap(train.isnull(), yticklabels=False, cbar=False, cmap='viridis') # so most of the missing values are in Age and Cabin
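the heatmap gives a quick visual; for exact figures we can also tally the missing values per column (a small sketch on the same train DataFrame):
train.isnull().sum().sort_values(ascending=False) # count of NaNs per column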
sns.set_style('whitegrid')
sns.countplot(x='Survived',data=train)
sns.set_style('whitegrid')
sns.countplot(x='Survived',hue='Sex',data=train, palette='RdBu_r')
sns.set_style('whitegrid')
sns.countplot(x='Survived',hue='Pclass',data=train, palette='rainbow')
sns.histplot(train['Age'].dropna(), color='darkred', bins=40) # histplot replaces the deprecated distplot
train['Age'].hist(bins=30, color='blue',alpha=0.3)
sns.countplot(x='SibSp', data=train)
train['Fare'].hist(bins=30,color='green',alpha=0.4)
plt.figure(figsize=(12,7))
sns.boxplot(x='Pclass',y='Age',data=train,palette='winter')
we can see that the wealthier passengers in the higher classes tend to be older, which makes sense; we'll use these average age values to impute Age based on Pclass
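rather than reading the averages off the boxplot, we can compute them directly; a quick check (the 37/29/24 hard-coded below are roughly these means, rounded):
train.groupby('Pclass')['Age'].mean() # mean age per passenger class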
def impute_age(cols):
    # fill a missing Age with the average age of that passenger's class
    Age = cols['Age']
    Pclass = cols['Pclass']
    if pd.isnull(Age):
        if Pclass == 1:
            return 37
        elif Pclass == 2:
            return 29
        else:
            return 24
    else:
        return Age
train['Age'] = train[['Age','Pclass']].apply(impute_age, axis=1)
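as an aside, pandas can do the same imputation in one line by filling each NaN with its class median, if you'd rather not hard-code the values (an alternative sketch, not used in the rest of this walkthrough):
train['Age'] = train['Age'].fillna(train.groupby('Pclass')['Age'].transform('median')) # per-class median fill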
# now check the heatmap again
sns.heatmap(train.isnull(), yticklabels=False, cbar=False, cmap='viridis') # Age is filled in; only Cabin still shows missing values
Cabin has so many NaN values that it would take a lot of feature engineering to salvage, so we'll drop it for now
train.drop('Cabin',axis=1,inplace=True)
sns.heatmap(train.isnull(), yticklabels=False, cbar=False, cmap='viridis')
train.dropna(inplace=True) # drop the few remaining rows with missing values (e.g. Embarked)
sns.heatmap(train.isnull(), yticklabels=False, cbar=False, cmap='viridis')
train.head()
we'll need to convert categorical features to dummy variables using pandas; otherwise our machine learning algorithm won't be able to take those features in directly as inputs
train.info()
pd.get_dummies(train['Embarked'], drop_first=True).head() # drop_first drops one level to avoid perfectly collinear dummy columns
sex = pd.get_dummies(train['Sex'], drop_first=True)
embark = pd.get_dummies(train['Embarked'], drop_first=True)
train.drop(['Sex','Embarked','Name','Ticket'],axis=1,inplace=True)
train.head()
train = pd.concat([train,sex,embark],axis=1)
train.head()
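as a quick sanity check before modelling, every remaining column should now be numeric (or a boolean dummy):
train.select_dtypes(include='object').columns # expected to be empty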
Train Test Split
train.drop('Survived',axis=1).head()
train['Survived'].head()
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(train.drop('Survived',axis=1),train['Survived'],test_size=0.30,random_state=101)
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression(max_iter=1000) # raise max_iter so the solver converges on the unscaled features
logmodel.fit(X_train,y_train)
predictions = logmodel.predict(X_test)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test,predictions) # rows are actual classes, columns are predicted
cm
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test,predictions)
accuracy
predictions
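accuracy alone can hide class imbalance; scikit-learn's classification report adds per-class precision, recall, and f1 (a minimal follow-up sketch):
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))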