# Multicollinearity in Linear Regression
import pandas as pd
import statsmodels.api as sm
# Load the advertising data; index_col=0 makes the CSV's first column the row index.
df_adv = pd.read_csv('Advertising.csv', index_col=0)
df_adv.head()

# Response is 'sales'; the predictors are the three advertising channel columns.
y = df_adv['sales']
X = df_adv[['TV', 'radio', 'newspaper']]
print(X, y)

# Add the 'const' column so the fitted model includes an intercept term.
X = sm.add_constant(X)
X

# Ordinary least squares fit of sales on the constant plus the three channels.
model = sm.OLS(endog=y, exog=X).fit()
model.summary()  # the 'const' row of the summary is the intercept (B0)
import matplotlib.pyplot as plt
import seaborn as sns
# Pairwise correlation between the predictors only; column 0 is the 'const'
# column added by sm.add_constant, so it is skipped.
corr_adv = X.iloc[:, 1:].corr()
print(corr_adv)

# Plot the correlation matrix rather than the raw design matrix X: collinear
# predictors show up as off-diagonal cells with values close to +1 or -1.
plt.imshow(corr_adv, cmap='autumn')
plt.colorbar()
tick_positions = range(len(corr_adv.columns))
plt.xticks(tick_positions, corr_adv.columns)
plt.yticks(tick_positions, corr_adv.columns)
plt.show()

# Same matrix as an annotated seaborn heatmap; `linewidths` is the documented
# seaborn parameter for the width of the lines separating the cells.
sns.heatmap(corr_adv, annot=True, linewidths=0.5, cmap='coolwarm')
plt.show()
# Using the Salary dataset, available at:
# https://github.com/mr-siddy/Machine-Learning/blob/master/Linear%20Regression/Salary_Data.csv
# Load the salary data (no index column is specified for this file).
df_salary = pd.read_csv('Salary_Data.csv')
df_salary.head()

# Response is 'Salary'; the predictors are 'YearsExperience' and 'Age'.
y = df_salary['Salary']
X = df_salary[['YearsExperience', 'Age']]

# Add the 'const' column so the fitted model includes an intercept term.
X = sm.add_constant(X)

# Ordinary least squares fit of salary on the constant plus both predictors.
model = sm.OLS(endog=y, exog=X).fit()
# In the summary, look at R2, const, the std err column and P>|t|:
# the original note reads these as a sign of high correlation between predictors.
model.summary()
# Pairwise correlation between the predictors only; column 0 is the 'const'
# column added by sm.add_constant, so it is skipped.
corr_salary = X.iloc[:, 1:].corr()
print(corr_salary)

# Draw the heatmap from the correlation matrix rather than the raw design
# matrix X, so the YearsExperience/Age relationship is what gets visualised.
sns.heatmap(corr_salary, annot=True, cmap='summer')
plt.show()
# Design matrix without the 'Age' column; 'const' and 'YearsExperience' remain.
drop_age = X.drop(columns='Age')

# Refit on the reduced design matrix so its summary can be compared with the
# two-predictor fit.
model = sm.OLS(endog=y, exog=drop_age).fit()
model.summary()