Importing Libraries and Creating the DataFrame

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 
# Load the startup data. The delimiter is passed by keyword because pandas
# deprecated positional arguments after the file path in 1.3 and made them
# keyword-only in 2.0, where read_csv(path, ",") raises a TypeError.
dataset = pd.read_csv("50_Startups.csv", sep=",")

# Preview the first rows of the loaded frame.
dataset.head()
R&D Spend Administration Marketing Spend State Profit
0 165349.20 136897.80 471784.10 New York 192261.83
1 162597.70 151377.59 443898.53 California 191792.06
2 153441.51 101145.55 407934.54 Florida 191050.39
3 144372.41 118671.85 383199.62 New York 182901.99
4 142107.34 91391.77 366168.42 Florida 166187.94
# Features: every column except the last one.
X = dataset.iloc[:, :-1]
# Target: the last column (Profit). Indexing with -1 rather than a hard-coded
# 4 keeps the target in step with the feature slice above, so the two can
# never overlap if the column count changes.
y = dataset.iloc[:, -1]
print(X.head(), "\n")
print(y.head())
   R&D Spend  Administration  Marketing Spend       State
0  165349.20       136897.80        471784.10    New York
1  162597.70       151377.59        443898.53  California
2  153441.51       101145.55        407934.54     Florida
3  144372.41       118671.85        383199.62    New York
4  142107.34        91391.77        366168.42     Florida 

0    192261.83
1    191792.06
2    191050.39
3    182901.99
4    166187.94
Name: Profit, dtype: float64

Data Preprocessing

# One-hot encode the categorical State column. drop_first=True drops the first
# category so the remaining dummies are not perfectly collinear with the
# intercept, which avoids the dummy variable trap. dtype=int keeps the
# indicators as 0/1 integers; pandas 2.0+ would otherwise return booleans.
states = pd.get_dummies(X['State'], drop_first=True, dtype=int)

# The dummy columns stand in for the raw text column, so remove it from X.
X = X.drop('State', axis=1)
X
R&D Spend Administration Marketing Spend
0 165349.20 136897.80 471784.10
1 162597.70 151377.59 443898.53
2 153441.51 101145.55 407934.54
3 144372.41 118671.85 383199.62
4 142107.34 91391.77 366168.42
5 131876.90 99814.71 362861.36
6 134615.46 147198.87 127716.82
7 130298.13 145530.06 323876.68
8 120542.52 148718.95 311613.29
9 123334.88 108679.17 304981.62
10 101913.08 110594.11 229160.95
11 100671.96 91790.61 249744.55
12 93863.75 127320.38 249839.44
13 91992.39 135495.07 252664.93
14 119943.24 156547.42 256512.92
15 114523.61 122616.84 261776.23
16 78013.11 121597.55 264346.06
17 94657.16 145077.58 282574.31
18 91749.16 114175.79 294919.57
19 86419.70 153514.11 0.00
20 76253.86 113867.30 298664.47
21 78389.47 153773.43 299737.29
22 73994.56 122782.75 303319.26
23 67532.53 105751.03 304768.73
24 77044.01 99281.34 140574.81
25 64664.71 139553.16 137962.62
26 75328.87 144135.98 134050.07
27 72107.60 127864.55 353183.81
28 66051.52 182645.56 118148.20
29 65605.48 153032.06 107138.38
30 61994.48 115641.28 91131.24
31 61136.38 152701.92 88218.23
32 63408.86 129219.61 46085.25
33 55493.95 103057.49 214634.81
34 46426.07 157693.92 210797.67
35 46014.02 85047.44 205517.64
36 28663.76 127056.21 201126.82
37 44069.95 51283.14 197029.42
38 20229.59 65947.93 185265.10
39 38558.51 82982.09 174999.30
40 28754.33 118546.05 172795.67
41 27892.92 84710.77 164470.71
42 23640.93 96189.63 148001.11
43 15505.73 127382.30 35534.17
44 22177.74 154806.14 28334.72
45 1000.23 124153.04 1903.93
46 1315.46 115816.21 297114.46
47 0.00 135426.92 0.00
48 542.05 51743.15 0.00
49 0.00 116983.80 45173.06
# Show the first five rows of the State dummy columns.
print(states.head(n=5))
   Florida  New York
0        0         1
1        0         0
2        1         0
3        0         1
4        1         0
# Append the dummy columns to the numeric features, column-wise, giving the
# design matrix for the linear model y = b0 + b1*x1 + b2*x2 + ...
X = pd.concat((X, states), axis="columns")
print(X.head())
   R&D Spend  Administration  Marketing Spend  Florida  New York
0  165349.20       136897.80        471784.10        0         1
1  162597.70       151377.59        443898.53        0         0
2  153441.51       101145.55        407934.54        1         0
3  144372.41       118671.85        383199.62        0         1
4  142107.34        91391.77        366168.42        1         0

Train_test_split

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
y_test
28    103282.38
11    144259.40
10    146121.95
41     77798.83
2     191050.39
27    105008.31
38     81229.06
31     97483.56
22    110352.25
4     166187.94
Name: Profit, dtype: float64

Applying Linear Regression

from sklearn.linear_model import LinearRegression

# Fit a multiple linear regression on the training split only.
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
# Predict profit for the held-out rows with the fitted model.
y_pred = regressor.predict(X=X_test)
y_pred
array([103015.20159796, 132582.27760816, 132447.73845174,  71976.09851258,
       178537.48221055, 116161.24230165,  67851.69209676,  98791.73374687,
       113969.43533012, 167921.0656955 ])
from sklearn.metrics import r2_score

# Coefficient of determination: R^2 = 1 - SS_res / SS_tot, where SS_res is the
# sum of squared residuals and SS_tot is the total sum of squares of y_test
# about its mean. Values closer to 1 indicate a better fit.
score = r2_score(y_true=y_test, y_pred=y_pred)
score
0.9347068473282423