Multiple Linear Regression using the 50_Startups dataset
Dataset link: https://www.kaggle.com/farhanmd29/50-startups
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Load the startups data. The separator is passed by keyword because newer
# pandas releases accept only the file path as a positional argument to
# read_csv; a positional "," raises TypeError there.
dataset = pd.read_csv("50_Startups.csv", sep=",")
dataset.head()
# Features are every column except the last one; the target is the last
# column. Both slices are expressed relative to the end of the frame so they
# cannot drift apart (the target was previously addressed as column 4).
X = dataset.iloc[:, :-1]
y = dataset.iloc[:, -1]
print(X.head(), "\n")
print(y.head())
# One-hot encode the categorical 'State' column. drop_first=True leaves out
# the indicator for the first category, which AVOIDS the dummy variable trap
# (a full set of indicator columns is perfectly collinear with the intercept).
states = pd.get_dummies(X['State'], drop_first=True)
# The raw text column is no longer needed once it has been encoded.
X = X.drop(columns='State')
X
print(states.head())
# Put the indicator columns alongside the remaining feature columns.
X = pd.concat([X, states], axis="columns")
# The model fitted on these features has the form y = b0 + b1*x1 + b2*x2 + ...
print(X.head())
from sklearn.model_selection import train_test_split

# Reserve 20% of the samples for testing; the fixed random_state makes the
# shuffled split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
y_test
from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so creating and training the model can
# be written as a single chained expression.
regressor = LinearRegression().fit(X_train, y_train)
# Predictions for the rows in X_test.
y_pred = regressor.predict(X_test)
y_pred
from sklearn.metrics import r2_score

# R^2 = 1 - (residual sum of squares / total sum of squares).
# The closer the value is to 1, the better the model explains the target.
score = r2_score(y_true=y_test, y_pred=y_pred)
score