import opendatasets as od
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 150)
sns.set_style('darkgrid')
matplotlib.rcParams['font.size'] = 14
matplotlib.rcParams['figure.figsize'] = (10, 6)
matplotlib.rcParams['figure.facecolor'] = '#00000000'
os.listdir('weather-dataset-rattle-package')
['weatherAUS.csv']
raw_df = pd.read_csv('weather-dataset-rattle-package/weatherAUS.csv')
raw_df.head(10)
Date Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am WindDir3pm WindSpeed9am WindSpeed3pm Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RainTomorrow
0 2008-12-01 Albury 13.4 22.9 0.6 NaN NaN W 44.0 W WNW 20.0 24.0 71.0 22.0 1007.7 1007.1 8.0 NaN 16.9 21.8 No No
1 2008-12-02 Albury 7.4 25.1 0.0 NaN NaN WNW 44.0 NNW WSW 4.0 22.0 44.0 25.0 1010.6 1007.8 NaN NaN 17.2 24.3 No No
2 2008-12-03 Albury 12.9 25.7 0.0 NaN NaN WSW 46.0 W WSW 19.0 26.0 38.0 30.0 1007.6 1008.7 NaN 2.0 21.0 23.2 No No
3 2008-12-04 Albury 9.2 28.0 0.0 NaN NaN NE 24.0 SE E 11.0 9.0 45.0 16.0 1017.6 1012.8 NaN NaN 18.1 26.5 No No
4 2008-12-05 Albury 17.5 32.3 1.0 NaN NaN W 41.0 ENE NW 7.0 20.0 82.0 33.0 1010.8 1006.0 7.0 8.0 17.8 29.7 No No
5 2008-12-06 Albury 14.6 29.7 0.2 NaN NaN WNW 56.0 W W 19.0 24.0 55.0 23.0 1009.2 1005.4 NaN NaN 20.6 28.9 No No
6 2008-12-07 Albury 14.3 25.0 0.0 NaN NaN W 50.0 SW W 20.0 24.0 49.0 19.0 1009.6 1008.2 1.0 NaN 18.1 24.6 No No
7 2008-12-08 Albury 7.7 26.7 0.0 NaN NaN W 35.0 SSE W 6.0 17.0 48.0 19.0 1013.4 1010.1 NaN NaN 16.3 25.5 No No
8 2008-12-09 Albury 9.7 31.9 0.0 NaN NaN NNW 80.0 SE NW 7.0 28.0 42.0 9.0 1008.9 1003.6 NaN NaN 18.3 30.2 No Yes
9 2008-12-10 Albury 13.1 30.1 1.4 NaN NaN W 28.0 S SSE 15.0 11.0 58.0 27.0 1007.0 1005.7 NaN NaN 20.1 28.2 Yes No
raw_df.shape
(145460, 23)
raw_df.info() # to check column types of dataset
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           145460 non-null  object 
 1   Location       145460 non-null  object 
 2   MinTemp        143975 non-null  float64
 3   MaxTemp        144199 non-null  float64
 4   Rainfall       142199 non-null  float64
 5   Evaporation    82670 non-null   float64
 6   Sunshine       75625 non-null   float64
 7   WindGustDir    135134 non-null  object 
 8   WindGustSpeed  135197 non-null  float64
 9   WindDir9am     134894 non-null  object 
 10  WindDir3pm     141232 non-null  object 
 11  WindSpeed9am   143693 non-null  float64
 12  WindSpeed3pm   142398 non-null  float64
 13  Humidity9am    142806 non-null  float64
 14  Humidity3pm    140953 non-null  float64
 15  Pressure9am    130395 non-null  float64
 16  Pressure3pm    130432 non-null  float64
 17  Cloud9am       89572 non-null   float64
 18  Cloud3pm       86102 non-null   float64
 19  Temp9am        143693 non-null  float64
 20  Temp3pm        141851 non-null  float64
 21  RainToday      142199 non-null  object 
 22  RainTomorrow   142193 non-null  object 
dtypes: float64(16), object(7)
memory usage: 25.5+ MB
raw_df.dropna(subset=['RainTomorrow'], inplace=True)
raw_df.head(2)
Date Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am WindDir3pm WindSpeed9am WindSpeed3pm Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RainTomorrow
0 2008-12-01 Albury 13.4 22.9 0.6 NaN NaN W 44.0 W WNW 20.0 24.0 71.0 22.0 1007.7 1007.1 8.0 NaN 16.9 21.8 No No
1 2008-12-02 Albury 7.4 25.1 0.0 NaN NaN WNW 44.0 NNW WSW 4.0 22.0 44.0 25.0 1010.6 1007.8 NaN NaN 17.2 24.3 No No
raw_df.shape # shape has become 142193
(142193, 23)

Training, Validation and Test Sets

plt.title("no.of Rows per Year")
sns.countplot(x=pd.to_datetime(raw_df.Date).dt.year);
year = pd.to_datetime(raw_df.Date).dt.year

train_df = raw_df[year<2015]
val_df = raw_df[year==2015]
test_df = raw_df[year>2015]

print(train_df.shape, val_df.shape, test_df.shape)
(98988, 23) (17231, 23) (25974, 23)

Input and Target Columns

input_cols = list(train_df.columns)[1:-1]
target_cols = 'RainTomorrow'
target_cols
'RainTomorrow'
input_cols
['Location',
 'MinTemp',
 'MaxTemp',
 'Rainfall',
 'Evaporation',
 'Sunshine',
 'WindGustDir',
 'WindGustSpeed',
 'WindDir9am',
 'WindDir3pm',
 'WindSpeed9am',
 'WindSpeed3pm',
 'Humidity9am',
 'Humidity3pm',
 'Pressure9am',
 'Pressure3pm',
 'Cloud9am',
 'Cloud3pm',
 'Temp9am',
 'Temp3pm',
 'RainToday']
train_inputs = train_df[input_cols].copy()
train_targets = train_df[target_cols].copy()

val_inputs = val_df[input_cols].copy()
val_targets = val_df[target_cols].copy()

test_inputs = test_df[input_cols].copy()
test_targets = test_df[target_cols].copy()
numeric_cols = train_inputs.select_dtypes(include=np.number).columns.tolist()
categorical_cols = train_inputs.select_dtypes('object').columns.tolist()
print(numeric_cols)
['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm']
print(categorical_cols)
['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']

Imputing Missing Numeric Values

train_inputs[numeric_cols].isna().sum().sort_values(ascending=False)
Sunshine         40696
Evaporation      37110
Cloud3pm         36766
Cloud9am         35764
Pressure9am       9345
Pressure3pm       9309
WindGustSpeed     6902
Humidity9am       1265
Humidity3pm       1186
WindSpeed3pm      1140
WindSpeed9am      1133
Rainfall          1000
Temp9am            783
Temp3pm            663
MinTemp            434
MaxTemp            198
dtype: int64
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean').fit(raw_df[numeric_cols]) # the imputer learns the mean of each numeric column
train_inputs[numeric_cols] = imputer.transform(train_inputs[numeric_cols]) # fill missing values with the learned means
val_inputs[numeric_cols] = imputer.transform(val_inputs[numeric_cols])
test_inputs[numeric_cols] = imputer.transform(test_inputs[numeric_cols])
train_inputs[numeric_cols].isna().sum()
MinTemp          0
MaxTemp          0
Rainfall         0
Evaporation      0
Sunshine         0
WindGustSpeed    0
WindSpeed9am     0
WindSpeed3pm     0
Humidity9am      0
Humidity3pm      0
Pressure9am      0
Pressure3pm      0
Cloud9am         0
Cloud3pm         0
Temp9am          0
Temp3pm          0
dtype: int64

Scaling Numeric Features

from sklearn.preprocessing import MinMaxScaler
val_inputs.describe().loc[['min', 'max']]
MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustSpeed WindSpeed9am WindSpeed3pm Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm
min -8.2 -3.2 0.0 0.0 0.0 7.0 0.0 0.0 4.0 0.0 988.1 982.2 0.0 0.0 -6.2 -4.0
max 31.9 45.4 247.2 70.4 14.5 135.0 87.0 74.0 100.0 100.0 1039.3 1037.3 8.0 8.0 37.5 42.8
scaler = MinMaxScaler().fit(raw_df[numeric_cols])
train_inputs[numeric_cols] = scaler.transform(train_inputs[numeric_cols])
val_inputs[numeric_cols] = scaler.transform(val_inputs[numeric_cols])
test_inputs[numeric_cols] = scaler.transform(test_inputs[numeric_cols])
val_inputs.describe().loc[['min', 'max']]
MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustSpeed WindSpeed9am WindSpeed3pm Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm
min 0.007075 0.030246 0.000000 0.000000 0.0 0.007752 0.000000 0.000000 0.04 0.0 0.125620 0.0816 0.000000 0.000000 0.021097 0.026871
max 0.952830 0.948960 0.666307 0.485517 1.0 1.000000 0.669231 0.850575 1.00 1.0 0.971901 0.9632 0.888889 0.888889 0.943038 0.925144

Encoding Categorical Data

from sklearn.preprocessing import OneHotEncoder
train_df[categorical_cols].fillna('Unknown')
val_df[categorical_cols].fillna('Unknown')
test_df[categorical_cols].fillna('Unknown') # note: fillna returns a new DataFrame; these results are only displayed here, not assigned back
Location WindGustDir WindDir9am WindDir3pm RainToday
2498 Albury ENE Unknown ESE No
2499 Albury SSE SSE SE No
2500 Albury ENE ESE ENE Yes
2501 Albury SSE SE SSE Yes
2502 Albury ENE SE SSE Yes
... ... ... ... ... ...
145454 Uluru E ESE E No
145455 Uluru E SE ENE No
145456 Uluru NNW SE N No
145457 Uluru N SE WNW No
145458 Uluru SE SSE N No

25974 rows × 5 columns

encoder = OneHotEncoder(sparse=False, handle_unknown='ignore').fit(raw_df[categorical_cols])
encoded_cols = list(encoder.get_feature_names(categorical_cols))  
train_inputs[encoded_cols] = encoder.transform(train_inputs[categorical_cols])
val_inputs[encoded_cols] = encoder.transform(val_inputs[categorical_cols])
test_inputs[encoded_cols] = encoder.transform(test_inputs[categorical_cols])
print(encoded_cols)
['Location_Adelaide', 'Location_Albany', 'Location_Albury', 'Location_AliceSprings', 'Location_BadgerysCreek', 'Location_Ballarat', 'Location_Bendigo', 'Location_Brisbane', 'Location_Cairns', 'Location_Canberra', 'Location_Cobar', 'Location_CoffsHarbour', 'Location_Dartmoor', 'Location_Darwin', 'Location_GoldCoast', 'Location_Hobart', 'Location_Katherine', 'Location_Launceston', 'Location_Melbourne', 'Location_MelbourneAirport', 'Location_Mildura', 'Location_Moree', 'Location_MountGambier', 'Location_MountGinini', 'Location_Newcastle', 'Location_Nhil', 'Location_NorahHead', 'Location_NorfolkIsland', 'Location_Nuriootpa', 'Location_PearceRAAF', 'Location_Penrith', 'Location_Perth', 'Location_PerthAirport', 'Location_Portland', 'Location_Richmond', 'Location_Sale', 'Location_SalmonGums', 'Location_Sydney', 'Location_SydneyAirport', 'Location_Townsville', 'Location_Tuggeranong', 'Location_Uluru', 'Location_WaggaWagga', 'Location_Walpole', 'Location_Watsonia', 'Location_Williamtown', 'Location_Witchcliffe', 'Location_Wollongong', 'Location_Woomera', 'WindGustDir_E', 'WindGustDir_ENE', 'WindGustDir_ESE', 'WindGustDir_N', 'WindGustDir_NE', 'WindGustDir_NNE', 'WindGustDir_NNW', 'WindGustDir_NW', 'WindGustDir_S', 'WindGustDir_SE', 'WindGustDir_SSE', 'WindGustDir_SSW', 'WindGustDir_SW', 'WindGustDir_W', 'WindGustDir_WNW', 'WindGustDir_WSW', 'WindGustDir_nan', 'WindDir9am_E', 'WindDir9am_ENE', 'WindDir9am_ESE', 'WindDir9am_N', 'WindDir9am_NE', 'WindDir9am_NNE', 'WindDir9am_NNW', 'WindDir9am_NW', 'WindDir9am_S', 'WindDir9am_SE', 'WindDir9am_SSE', 'WindDir9am_SSW', 'WindDir9am_SW', 'WindDir9am_W', 'WindDir9am_WNW', 'WindDir9am_WSW', 'WindDir9am_nan', 'WindDir3pm_E', 'WindDir3pm_ENE', 'WindDir3pm_ESE', 'WindDir3pm_N', 'WindDir3pm_NE', 'WindDir3pm_NNE', 'WindDir3pm_NNW', 'WindDir3pm_NW', 'WindDir3pm_S', 'WindDir3pm_SE', 'WindDir3pm_SSE', 'WindDir3pm_SSW', 'WindDir3pm_SW', 'WindDir3pm_W', 'WindDir3pm_WNW', 'WindDir3pm_WSW', 'WindDir3pm_nan', 'RainToday_No', 'RainToday_Yes', 'RainToday_nan']
train_inputs.head(10)
Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am WindDir3pm WindSpeed9am WindSpeed3pm Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday Location_Adelaide Location_Albany Location_Albury Location_AliceSprings Location_BadgerysCreek Location_Ballarat Location_Bendigo Location_Brisbane Location_Cairns Location_Canberra Location_Cobar Location_CoffsHarbour Location_Dartmoor Location_Darwin Location_GoldCoast Location_Hobart Location_Katherine Location_Launceston Location_Melbourne Location_MelbourneAirport Location_Mildura Location_Moree Location_MountGambier Location_MountGinini Location_Newcastle Location_Nhil Location_NorahHead Location_NorfolkIsland Location_Nuriootpa Location_PearceRAAF Location_Penrith Location_Perth Location_PerthAirport Location_Portland Location_Richmond Location_Sale Location_SalmonGums Location_Sydney Location_SydneyAirport Location_Townsville Location_Tuggeranong Location_Uluru Location_WaggaWagga Location_Walpole Location_Watsonia Location_Williamtown Location_Witchcliffe Location_Wollongong Location_Woomera WindGustDir_E WindGustDir_ENE WindGustDir_ESE WindGustDir_N WindGustDir_NE WindGustDir_NNE WindGustDir_NNW WindGustDir_NW WindGustDir_S WindGustDir_SE WindGustDir_SSE WindGustDir_SSW WindGustDir_SW WindGustDir_W WindGustDir_WNW WindGustDir_WSW WindGustDir_nan WindDir9am_E WindDir9am_ENE WindDir9am_ESE WindDir9am_N WindDir9am_NE WindDir9am_NNE WindDir9am_NNW WindDir9am_NW WindDir9am_S WindDir9am_SE WindDir9am_SSE WindDir9am_SSW WindDir9am_SW WindDir9am_W WindDir9am_WNW WindDir9am_WSW WindDir9am_nan WindDir3pm_E WindDir3pm_ENE WindDir3pm_ESE WindDir3pm_N WindDir3pm_NE WindDir3pm_NNE WindDir3pm_NNW WindDir3pm_NW WindDir3pm_S WindDir3pm_SE WindDir3pm_SSE WindDir3pm_SSW WindDir3pm_SW WindDir3pm_W WindDir3pm_WNW WindDir3pm_WSW WindDir3pm_nan RainToday_No RainToday_Yes RainToday_nan
0 Albury 0.516509 0.523629 0.001617 0.037723 0.525852 W 0.294574 W WNW 0.153846 0.275862 0.71 0.22 0.449587 0.4800 0.888889 0.500352 0.508439 0.522073 No 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0
1 Albury 0.375000 0.565217 0.000000 0.037723 0.525852 WNW 0.294574 NNW WSW 0.030769 0.252874 0.44 0.25 0.497521 0.4912 0.493021 0.500352 0.514768 0.570058 No 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0
2 Albury 0.504717 0.576560 0.000000 0.037723 0.525852 WSW 0.310078 W WSW 0.146154 0.298851 0.38 0.30 0.447934 0.5056 0.493021 0.222222 0.594937 0.548944 No 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0
3 Albury 0.417453 0.620038 0.000000 0.037723 0.525852 NE 0.139535 SE E 0.084615 0.103448 0.45 0.16 0.613223 0.5712 0.493021 0.500352 0.533755 0.612284 No 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
4 Albury 0.613208 0.701323 0.002695 0.037723 0.525852 W 0.271318 ENE NW 0.053846 0.229885 0.82 0.33 0.500826 0.4624 0.777778 0.888889 0.527426 0.673704 No 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
5 Albury 0.544811 0.652174 0.000539 0.037723 0.525852 WNW 0.387597 W W 0.146154 0.275862 0.55 0.23 0.474380 0.4528 0.493021 0.500352 0.586498 0.658349 No 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0
6 Albury 0.537736 0.563327 0.000000 0.037723 0.525852 W 0.341085 SW W 0.153846 0.275862 0.49 0.19 0.480992 0.4976 0.111111 0.500352 0.533755 0.575816 No 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0
7 Albury 0.382075 0.595463 0.000000 0.037723 0.525852 W 0.224806 SSE W 0.046154 0.195402 0.48 0.19 0.543802 0.5280 0.493021 0.500352 0.495781 0.593090 No 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0
8 Albury 0.429245 0.693762 0.000000 0.037723 0.525852 NNW 0.573643 SE NW 0.053846 0.321839 0.42 0.09 0.469421 0.4240 0.493021 0.500352 0.537975 0.683301 No 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
9 Albury 0.509434 0.659735 0.003774 0.037723 0.525852 W 0.170543 S SSE 0.115385 0.126437 0.58 0.27 0.438017 0.4576 0.493021 0.500352 0.575949 0.644914 Yes 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
X_train = train_inputs[numeric_cols + encoded_cols]
X_val = val_inputs[numeric_cols + encoded_cols]
X_test =  test_inputs[numeric_cols + encoded_cols]
X_test.head(10)
MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustSpeed WindSpeed9am WindSpeed3pm Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm Location_Adelaide Location_Albany Location_Albury Location_AliceSprings Location_BadgerysCreek Location_Ballarat Location_Bendigo Location_Brisbane Location_Cairns Location_Canberra Location_Cobar Location_CoffsHarbour Location_Dartmoor Location_Darwin Location_GoldCoast Location_Hobart Location_Katherine Location_Launceston Location_Melbourne Location_MelbourneAirport Location_Mildura Location_Moree Location_MountGambier Location_MountGinini Location_Newcastle Location_Nhil Location_NorahHead Location_NorfolkIsland Location_Nuriootpa Location_PearceRAAF Location_Penrith Location_Perth Location_PerthAirport Location_Portland Location_Richmond Location_Sale Location_SalmonGums Location_Sydney Location_SydneyAirport Location_Townsville Location_Tuggeranong Location_Uluru Location_WaggaWagga Location_Walpole Location_Watsonia Location_Williamtown Location_Witchcliffe Location_Wollongong Location_Woomera WindGustDir_E WindGustDir_ENE WindGustDir_ESE WindGustDir_N WindGustDir_NE WindGustDir_NNE WindGustDir_NNW WindGustDir_NW WindGustDir_S WindGustDir_SE WindGustDir_SSE WindGustDir_SSW WindGustDir_SW WindGustDir_W WindGustDir_WNW WindGustDir_WSW WindGustDir_nan WindDir9am_E WindDir9am_ENE WindDir9am_ESE WindDir9am_N WindDir9am_NE WindDir9am_NNE WindDir9am_NNW WindDir9am_NW WindDir9am_S WindDir9am_SE WindDir9am_SSE WindDir9am_SSW WindDir9am_SW WindDir9am_W WindDir9am_WNW WindDir9am_WSW WindDir9am_nan WindDir3pm_E WindDir3pm_ENE WindDir3pm_ESE WindDir3pm_N WindDir3pm_NE WindDir3pm_NNE WindDir3pm_NNW WindDir3pm_NW WindDir3pm_S WindDir3pm_SE WindDir3pm_SSE WindDir3pm_SSW WindDir3pm_SW WindDir3pm_W WindDir3pm_WNW WindDir3pm_WSW WindDir3pm_nan RainToday_No RainToday_Yes RainToday_nan
2498 0.681604 0.801512 0.000000 0.037723 0.525852 0.372093 0.000000 0.080460 0.46 0.17 0.543802 0.5136 0.777778 0.333333 0.702532 0.808061 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
2499 0.693396 0.725898 0.001078 0.037723 0.525852 0.341085 0.069231 0.195402 0.54 0.30 0.505785 0.5008 0.888889 0.888889 0.675105 0.712092 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
2500 0.634434 0.527410 0.005930 0.037723 0.525852 0.325581 0.084615 0.448276 0.62 0.67 0.553719 0.6032 0.888889 0.888889 0.611814 0.477927 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
2501 0.608491 0.538752 0.042049 0.037723 0.525852 0.255814 0.069231 0.195402 0.74 0.65 0.618182 0.6304 0.888889 0.888889 0.556962 0.518234 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
2502 0.566038 0.523629 0.018329 0.037723 0.525852 0.193798 0.046154 0.103448 0.92 0.63 0.591736 0.5888 0.888889 0.888889 0.514768 0.529750 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
2503 0.601415 0.621928 0.000539 0.037723 0.525852 0.255814 0.069231 0.126437 0.76 0.52 0.563636 0.5680 0.888889 0.888889 0.580169 0.596929 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
2504 0.587264 0.620038 0.000000 0.037723 0.525852 0.224806 0.153846 0.229885 0.46 0.31 0.609917 0.6176 0.493021 0.222222 0.592827 0.614203 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
2505 0.537736 0.689981 0.000000 0.037723 0.525852 0.139535 0.084615 0.068966 0.63 0.24 0.646281 0.6416 0.493021 0.888889 0.561181 0.654511 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
2506 0.594340 0.752363 0.000000 0.037723 0.525852 0.170543 0.084615 0.103448 0.52 0.24 0.629752 0.6144 0.493021 0.333333 0.662447 0.738964 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
2507 0.620283 0.790170 0.000000 0.037723 0.525852 0.271318 0.069231 0.195402 0.54 0.17 0.596694 0.5680 0.493021 0.500352 0.704641 0.798464 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0

Training and Visualizing Decision Trees

Training

from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(random_state=42) # fixing random_state makes the results reproducible across runs
%%time
model.fit(X_train, train_targets)
CPU times: user 2.93 s, sys: 12.6 ms, total: 2.94 s
Wall time: 2.95 s
DecisionTreeClassifier(random_state=42)

Evaluation

from sklearn.metrics import accuracy_score, confusion_matrix
train_preds = model.predict(X_train)
train_preds
array(['No', 'No', 'No', ..., 'No', 'No', 'No'], dtype=object)
pd.value_counts(train_preds)
No     76707
Yes    22281
dtype: int64

The decision tree also returns probabilities for each prediction

train_probs = model.predict_proba(X_train)
train_probs
array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]])
train_targets
0         No
1         No
2         No
3         No
4         No
          ..
144548    No
144549    No
144550    No
144551    No
144552    No
Name: RainTomorrow, Length: 98988, dtype: object
accuracy_score(train_targets, train_preds)
0.9999797955307714
model.score(X_val, val_targets) # accuracy on the validation set - only ~79%
0.7921188555510418
val_targets.value_counts() / len(val_targets)
No     0.788289
Yes    0.211711
Name: RainTomorrow, dtype: float64

It appears that the model has learned the training examples perfectly and doesn't generalize well to previously unseen examples. This phenomenon is called "overfitting", and reducing overfitting is one of the most important parts of any machine learning project.

Visualizing Tree

from sklearn.tree import plot_tree, export_text
plt.figure(figsize=(80, 40))
plot_tree(model, feature_names=X_train.columns, max_depth=2, filled=True)
[Text(2232.0, 1902.6000000000001, 'Humidity3pm <= 0.715\ngini = 0.349\nsamples = 98988\nvalue = [76705, 22283]'),
 Text(1116.0, 1359.0, 'Rainfall <= 0.004\ngini = 0.248\nsamples = 82418\nvalue = [70439, 11979]'),
 Text(558.0, 815.4000000000001, 'Sunshine <= 0.525\ngini = 0.198\nsamples = 69252\nvalue = [61538, 7714]'),
 Text(279.0, 271.79999999999995, '\n  (...)  \n'),
 Text(837.0, 271.79999999999995, '\n  (...)  \n'),
 Text(1674.0, 815.4000000000001, 'Humidity3pm <= 0.512\ngini = 0.438\nsamples = 13166\nvalue = [8901, 4265]'),
 Text(1395.0, 271.79999999999995, '\n  (...)  \n'),
 Text(1953.0, 271.79999999999995, '\n  (...)  \n'),
 Text(3348.0, 1359.0, 'Humidity3pm <= 0.825\ngini = 0.47\nsamples = 16570\nvalue = [6266, 10304]'),
 Text(2790.0, 815.4000000000001, 'WindGustSpeed <= 0.279\ngini = 0.499\nsamples = 9136\nvalue = [4804, 4332]'),
 Text(2511.0, 271.79999999999995, '\n  (...)  \n'),
 Text(3069.0, 271.79999999999995, '\n  (...)  \n'),
 Text(3906.0, 815.4000000000001, 'Rainfall <= 0.01\ngini = 0.316\nsamples = 7434\nvalue = [1462, 5972]'),
 Text(3627.0, 271.79999999999995, '\n  (...)  \n'),
 Text(4185.0, 271.79999999999995, '\n  (...)  \n')]

How a Decision Tree is Created

Note the gini value in each box. This is the loss function used by the decision tree to decide which column should be used for splitting the data, and at what point the column should be split. A lower Gini index indicates a better split. A perfect split (only one class on each side) has a Gini index of 0.

Mathematically, if $p_i$ is the fraction of samples in a node that belong to class $i$ (out of $k$ classes), the Gini index of the node is:

$$\text{Gini} = 1 - \sum_{i=1}^{k} p_i^2$$

Conceptually speaking, while training, the model evaluates all possible splits across all columns and picks the best one. Then, it recursively performs an optimal split for each of the two resulting portions. In practice, however, it's very inefficient to check all possible splits, so the model uses a heuristic (a predefined strategy) combined with some randomization.
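
To make this concrete, here's a minimal sketch (an illustration, not scikit-learn's actual implementation) that computes the Gini impurity of the training labels and the weighted impurity after the root-node split Humidity3pm <= 0.715 shown above:

def gini_impurity(labels):
    # Gini impurity: 1 minus the sum of squared class probabilities
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return 1 - np.sum(p ** 2)

print(gini_impurity(train_targets)) # impurity of the full training set (~0.349, matching the root node)
mask = X_train['Humidity3pm'] <= 0.715
left, right = train_targets[mask], train_targets[~mask]
weighted = (len(left) * gini_impurity(left) + len(right) * gini_impurity(right)) / len(train_targets)
print(weighted) # the weighted impurity of the children is lower than the parent's, so the split is useful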

Let's check the depth of the tree that was created.

model.tree_.max_depth
48
tree_text = export_text(model, max_depth=10, feature_names=list(X_train.columns))
print(tree_text[:5000])
|--- Humidity3pm <= 0.72
|   |--- Rainfall <= 0.00
|   |   |--- Sunshine <= 0.52
|   |   |   |--- Pressure3pm <= 0.58
|   |   |   |   |--- WindGustSpeed <= 0.36
|   |   |   |   |   |--- Humidity3pm <= 0.28
|   |   |   |   |   |   |--- WindDir9am_NE <= 0.50
|   |   |   |   |   |   |   |--- Location_Watsonia <= 0.50
|   |   |   |   |   |   |   |   |--- Cloud9am <= 0.83
|   |   |   |   |   |   |   |   |   |--- WindSpeed3pm <= 0.07
|   |   |   |   |   |   |   |   |   |   |--- Pressure3pm <= 0.46
|   |   |   |   |   |   |   |   |   |   |   |--- class: Yes
|   |   |   |   |   |   |   |   |   |   |--- Pressure3pm >  0.46
|   |   |   |   |   |   |   |   |   |   |   |--- class: No
|   |   |   |   |   |   |   |   |   |--- WindSpeed3pm >  0.07
|   |   |   |   |   |   |   |   |   |   |--- MinTemp <= 0.32
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 2
|   |   |   |   |   |   |   |   |   |   |--- MinTemp >  0.32
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 7
|   |   |   |   |   |   |   |   |--- Cloud9am >  0.83
|   |   |   |   |   |   |   |   |   |--- Cloud3pm <= 0.42
|   |   |   |   |   |   |   |   |   |   |--- class: Yes
|   |   |   |   |   |   |   |   |   |--- Cloud3pm >  0.42
|   |   |   |   |   |   |   |   |   |   |--- Rainfall <= 0.00
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 2
|   |   |   |   |   |   |   |   |   |   |--- Rainfall >  0.00
|   |   |   |   |   |   |   |   |   |   |   |--- class: Yes
|   |   |   |   |   |   |   |--- Location_Watsonia >  0.50
|   |   |   |   |   |   |   |   |--- class: Yes
|   |   |   |   |   |   |--- WindDir9am_NE >  0.50
|   |   |   |   |   |   |   |--- WindGustSpeed <= 0.25
|   |   |   |   |   |   |   |   |--- class: No
|   |   |   |   |   |   |   |--- WindGustSpeed >  0.25
|   |   |   |   |   |   |   |   |--- Pressure9am <= 0.54
|   |   |   |   |   |   |   |   |   |--- Evaporation <= 0.09
|   |   |   |   |   |   |   |   |   |   |--- Location_AliceSprings <= 0.50
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 4
|   |   |   |   |   |   |   |   |   |   |--- Location_AliceSprings >  0.50
|   |   |   |   |   |   |   |   |   |   |   |--- class: Yes
|   |   |   |   |   |   |   |   |   |--- Evaporation >  0.09
|   |   |   |   |   |   |   |   |   |   |--- WindGustDir_ENE <= 0.50
|   |   |   |   |   |   |   |   |   |   |   |--- class: Yes
|   |   |   |   |   |   |   |   |   |   |--- WindGustDir_ENE >  0.50
|   |   |   |   |   |   |   |   |   |   |   |--- class: No
|   |   |   |   |   |   |   |   |--- Pressure9am >  0.54
|   |   |   |   |   |   |   |   |   |--- Humidity3pm <= 0.20
|   |   |   |   |   |   |   |   |   |   |--- class: Yes
|   |   |   |   |   |   |   |   |   |--- Humidity3pm >  0.20
|   |   |   |   |   |   |   |   |   |   |--- Evaporation <= 0.02
|   |   |   |   |   |   |   |   |   |   |   |--- class: Yes
|   |   |   |   |   |   |   |   |   |   |--- Evaporation >  0.02
|   |   |   |   |   |   |   |   |   |   |   |--- class: No
|   |   |   |   |   |--- Humidity3pm >  0.28
|   |   |   |   |   |   |--- Sunshine <= 0.05
|   |   |   |   |   |   |   |--- WindGustSpeed <= 0.25
|   |   |   |   |   |   |   |   |--- Evaporation <= 0.01
|   |   |   |   |   |   |   |   |   |--- WindGustSpeed <= 0.23
|   |   |   |   |   |   |   |   |   |   |--- class: Yes
|   |   |   |   |   |   |   |   |   |--- WindGustSpeed >  0.23
|   |   |   |   |   |   |   |   |   |   |--- class: No
|   |   |   |   |   |   |   |   |--- Evaporation >  0.01
|   |   |   |   |   |   |   |   |   |--- Evaporation <= 0.07
|   |   |   |   |   |   |   |   |   |   |--- Temp3pm <= 0.34
|   |   |   |   |   |   |   |   |   |   |   |--- class: Yes
|   |   |   |   |   |   |   |   |   |   |--- Temp3pm >  0.34
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 11
|   |   |   |   |   |   |   |   |   |--- Evaporation >  0.07
|   |   |   |   |   |   |   |   |   |   |--- WindSpeed9am <= 0.12
|   |   |   |   |   |   |   |   |   |   |   |--- class: Yes
|   |   |   |   |   |   |   |   |   |   |--- WindSpeed9am >  0.12
|   |   |   |   |   |   |   |   |   |   |   |--- class: No
|   |   |   |   |   |   |   |--- WindGustSpeed >  0.25
|   |   |   |   |   |   |   |   |--- Pressure9am <= 0.56
|   |   |   |   |   |   |   |   |   |--- MinTemp <= 0.40
|   |   |   |   |   |   |   |   |   |   |--- WindDir9am_WNW <= 0.50
|   |   |   |   |   |   |   |   |   |   |   |--- class: Yes
|   |   |   |   |   |   |   |   |   |   |--- WindDir9am_WNW >  0.50
|   |   |   |   |   |   |   |   |   |   |   |--- class: No
|   |   |   |   |   |   |   |   |   |--- MinTemp >  0.40
|   |   |   |   |   |   |   |   |   |   |--- Humidity3pm <= 0.66
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 7
|   |   |   |   |   |   |   |   |   |   |--- Humidity3pm >  0.66
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 4
|   |   |   |   |   |   |   |   |--- Pressure9am >  0.56
|   |   |   |   |   

Feature Importance

X_train.columns
Index(['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
       'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am',
       'Humidity3pm',
       ...
       'WindDir3pm_SSE', 'WindDir3pm_SSW', 'WindDir3pm_SW', 'WindDir3pm_W',
       'WindDir3pm_WNW', 'WindDir3pm_WSW', 'WindDir3pm_nan', 'RainToday_No',
       'RainToday_Yes', 'RainToday_nan'],
      dtype='object', length=119)
model.feature_importances_
array([3.48942086e-02, 3.23605486e-02, 5.91385668e-02, 2.49619907e-02,
       4.94652143e-02, 5.63334673e-02, 2.80205998e-02, 2.98128801e-02,
       4.02182908e-02, 2.61441297e-01, 3.44145027e-02, 6.20573699e-02,
       1.36406176e-02, 1.69229866e-02, 3.50001550e-02, 3.04064076e-02,
       2.24086587e-03, 2.08018104e-03, 1.27475954e-03, 7.26936324e-04,
       1.39779517e-03, 1.15264873e-03, 6.92808159e-04, 1.80675598e-03,
       1.08370901e-03, 1.19773895e-03, 8.87119103e-04, 2.15764220e-03,
       1.67094731e-03, 7.98919493e-05, 1.10558668e-03, 1.42008656e-03,
       4.10087635e-04, 1.09028115e-03, 1.44164766e-03, 9.08284767e-04,
       1.05770304e-03, 6.18133455e-04, 1.80387272e-03, 2.10403527e-03,
       2.74413333e-04, 7.31599405e-04, 1.35408990e-03, 1.54759332e-03,
       1.30917564e-03, 1.07134670e-03, 8.36408023e-04, 1.62662229e-03,
       1.00326116e-03, 2.16053455e-03, 8.46802258e-04, 1.88919081e-03,
       9.29325203e-04, 1.29545157e-03, 1.27604831e-03, 5.12736888e-04,
       1.38458902e-03, 3.97103931e-04, 1.03734689e-03, 1.44437047e-03,
       1.75870184e-03, 1.42487857e-03, 2.78109569e-03, 2.00782698e-03,
       2.80617652e-04, 1.61509734e-03, 1.64361598e-03, 2.36124112e-03,
       3.05457932e-03, 2.33239534e-03, 2.78643875e-03, 2.16695261e-03,
       3.41491352e-03, 2.30573542e-03, 2.28270604e-03, 2.34408118e-03,
       2.26557332e-03, 2.54592702e-03, 2.75264499e-03, 2.83905192e-03,
       2.49480561e-03, 1.54840338e-03, 2.50305095e-03, 2.53945388e-03,
       2.28130055e-03, 3.80572180e-03, 2.58535069e-03, 3.10172224e-03,
       2.54236791e-03, 2.50297796e-03, 2.06400988e-03, 2.52931192e-03,
       2.07840517e-03, 1.77387278e-03, 1.78920555e-03, 2.77709687e-03,
       2.42564566e-03, 2.26471887e-03, 1.73346117e-03, 2.23926957e-03,
       2.47865244e-03, 2.31917387e-03, 3.21211861e-03, 2.92382975e-03,
       2.24399274e-03, 3.68774754e-03, 3.87595982e-03, 3.20326068e-03,
       2.53323550e-03, 2.40444844e-03, 2.26790411e-03, 2.19744009e-03,
       2.28064147e-03, 2.88545323e-03, 2.05278867e-03, 1.12604304e-03,
       2.86325849e-04, 1.32322128e-03, 1.72690480e-03])
importance_df = pd.DataFrame({
    'feature': X_train.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)
importance_df.head(10)
feature importance
9 Humidity3pm 0.261441
11 Pressure3pm 0.062057
2 Rainfall 0.059139
5 WindGustSpeed 0.056333
4 Sunshine 0.049465
8 Humidity9am 0.040218
14 Temp9am 0.035000
0 MinTemp 0.034894
10 Pressure9am 0.034415
1 MaxTemp 0.032361
plt.title('Feature Importance')
sns.barplot(data=importance_df.head(10), x='importance', y='feature');

Hyperparameter Tuning and Overfitting

?DecisionTreeClassifier

As we saw in the previous section, our decision tree classifier memorized all training examples, leading to a 100% training accuracy, while the validation accuracy was only marginally better than a dumb baseline model. This phenomenon is called overfitting, and in this section, we'll look at some strategies for reducing overfitting. The process of reducing overfitting is known as regularization.

The DecisionTreeClassifier accepts several arguments, some of which can be modified to reduce overfitting.

These arguments are called hyperparameters because they must be configured manually (as opposed to the parameters within the model, which are learned from the data). We'll explore a couple of hyperparameters:

  • max_depth
  • max_leaf_nodes

max_depth

By reducing the maximum depth of the decision tree, we can prevent the tree from memorizing all training examples, which may lead to better generalization.

model = DecisionTreeClassifier(max_depth=3, random_state=42)
model.fit(X_train, train_targets)
DecisionTreeClassifier(max_depth=3, random_state=42)
model.score(X_train, train_targets)
0.8291308037337859
model.score(X_val, val_targets)
0.8334397307178921
model.classes_
array(['No', 'Yes'], dtype=object)

Great: while the training accuracy of the model has gone down, the validation accuracy has increased significantly.

plt.figure(figsize=(80, 40))
plot_tree(model, feature_names=X_train.columns, filled=True, rounded=True, class_names=model.classes_)
[Text(2232.0, 1902.6000000000001, 'Humidity3pm <= 0.715\ngini = 0.349\nsamples = 98988\nvalue = [76705, 22283]\nclass = No'),
 Text(1116.0, 1359.0, 'Rainfall <= 0.004\ngini = 0.248\nsamples = 82418\nvalue = [70439, 11979]\nclass = No'),
 Text(558.0, 815.4000000000001, 'Sunshine <= 0.525\ngini = 0.198\nsamples = 69252\nvalue = [61538, 7714]\nclass = No'),
 Text(279.0, 271.79999999999995, 'gini = 0.363\nsamples = 12620\nvalue = [9618, 3002]\nclass = No'),
 Text(837.0, 271.79999999999995, 'gini = 0.153\nsamples = 56632\nvalue = [51920, 4712]\nclass = No'),
 Text(1674.0, 815.4000000000001, 'Humidity3pm <= 0.512\ngini = 0.438\nsamples = 13166\nvalue = [8901, 4265]\nclass = No'),
 Text(1395.0, 271.79999999999995, 'gini = 0.293\nsamples = 4299\nvalue = [3531, 768]\nclass = No'),
 Text(1953.0, 271.79999999999995, 'gini = 0.478\nsamples = 8867\nvalue = [5370, 3497]\nclass = No'),
 Text(3348.0, 1359.0, 'Humidity3pm <= 0.825\ngini = 0.47\nsamples = 16570\nvalue = [6266, 10304]\nclass = Yes'),
 Text(2790.0, 815.4000000000001, 'WindGustSpeed <= 0.279\ngini = 0.499\nsamples = 9136\nvalue = [4804, 4332]\nclass = No'),
 Text(2511.0, 271.79999999999995, 'gini = 0.472\nsamples = 5583\nvalue = [3457, 2126]\nclass = No'),
 Text(3069.0, 271.79999999999995, 'gini = 0.471\nsamples = 3553\nvalue = [1347, 2206]\nclass = Yes'),
 Text(3906.0, 815.4000000000001, 'Rainfall <= 0.01\ngini = 0.316\nsamples = 7434\nvalue = [1462, 5972]\nclass = Yes'),
 Text(3627.0, 271.79999999999995, 'gini = 0.391\nsamples = 4360\nvalue = [1161, 3199]\nclass = Yes'),
 Text(4185.0, 271.79999999999995, 'gini = 0.177\nsamples = 3074\nvalue = [301, 2773]\nclass = Yes')]
print(export_text(model, feature_names=list(X_train.columns)))
|--- Humidity3pm <= 0.72
|   |--- Rainfall <= 0.00
|   |   |--- Sunshine <= 0.52
|   |   |   |--- class: No
|   |   |--- Sunshine >  0.52
|   |   |   |--- class: No
|   |--- Rainfall >  0.00
|   |   |--- Humidity3pm <= 0.51
|   |   |   |--- class: No
|   |   |--- Humidity3pm >  0.51
|   |   |   |--- class: No
|--- Humidity3pm >  0.72
|   |--- Humidity3pm <= 0.82
|   |   |--- WindGustSpeed <= 0.28
|   |   |   |--- class: No
|   |   |--- WindGustSpeed >  0.28
|   |   |   |--- class: Yes
|   |--- Humidity3pm >  0.82
|   |   |--- Rainfall <= 0.01
|   |   |   |--- class: Yes
|   |   |--- Rainfall >  0.01
|   |   |   |--- class: Yes

def max_depth_error(md):
    model = DecisionTreeClassifier(max_depth=md, random_state=42)
    model.fit(X_train, train_targets)
    train_error = 1 - model.score(X_train, train_targets)
    val_error = 1 - model.score(X_val, val_targets)
    return {'Max Depth': md, 'Training Error': train_error, 'Validation Error': val_error}
%%time
errors_df = pd.DataFrame([max_depth_error(md) for md in range(1, 21)])
CPU times: user 26.4 s, sys: 225 ms, total: 26.6 s
Wall time: 26.7 s
errors_df
Max Depth Training Error Validation Error
0 1 0.184315 0.177935
1 2 0.179547 0.172712
2 3 0.170869 0.166560
3 4 0.165707 0.164355
4 5 0.160676 0.159074
5 6 0.156271 0.157275
6 7 0.153312 0.154605
7 8 0.147806 0.158029
8 9 0.140906 0.156578
9 10 0.132945 0.157333
10 11 0.123227 0.159248
11 12 0.113489 0.160815
12 13 0.101750 0.163833
13 14 0.089981 0.167373
14 15 0.078999 0.171261
15 16 0.068180 0.174279
16 17 0.058138 0.176890
17 18 0.048733 0.181243
18 19 0.040025 0.187569
19 20 0.032539 0.190297
plt.figure()
plt.plot(errors_df['Max Depth'], errors_df['Training Error'])
plt.plot(errors_df['Max Depth'], errors_df['Validation Error'])
plt.title("Training vs Validation Error")
plt.xticks(range(0,21,2))
plt.xlabel('Max. Depth')
plt.ylabel('Prediction Error (1 - Accuracy)')
plt.legend(['Training', 'Validation'])
<matplotlib.legend.Legend at 0x7f1fcb8fd3a0>

[Plot: training error vs. validation error across max depth values]

So, for this dataset, a max depth of 7 results in the lowest validation error.

model = DecisionTreeClassifier(max_depth=7, random_state=42).fit(X_train, train_targets)
model.score(X_val, val_targets), model.score(X_train, train_targets)
(0.8453949277465034, 0.8466884874934335)

max_leaf_nodes

Another way to control the complexity of a decision tree is to limit the number of leaf nodes. This allows branches of the tree to have varying depths.

model = DecisionTreeClassifier(max_leaf_nodes = 128, random_state = 42)
model.fit(X_train, train_targets)
DecisionTreeClassifier(max_leaf_nodes=128, random_state=42)
model.score(X_train, train_targets)
0.8480421869317493
model.score(X_val, val_targets)
0.8442342290058615
model.tree_.max_depth
12

Notice that the model was able to grow some paths to a depth of 12 while keeping other paths shorter.

model_text = export_text(model, feature_names = list(X_train.columns))
print(model_text[:3000])
|--- Humidity3pm <= 0.72
|   |--- Rainfall <= 0.00
|   |   |--- Sunshine <= 0.52
|   |   |   |--- Pressure3pm <= 0.58
|   |   |   |   |--- WindGustSpeed <= 0.36
|   |   |   |   |   |--- Humidity3pm <= 0.28
|   |   |   |   |   |   |--- class: No
|   |   |   |   |   |--- Humidity3pm >  0.28
|   |   |   |   |   |   |--- Sunshine <= 0.05
|   |   |   |   |   |   |   |--- class: Yes
|   |   |   |   |   |   |--- Sunshine >  0.05
|   |   |   |   |   |   |   |--- Pressure3pm <= 0.43
|   |   |   |   |   |   |   |   |--- class: Yes
|   |   |   |   |   |   |   |--- Pressure3pm >  0.43
|   |   |   |   |   |   |   |   |--- Humidity3pm <= 0.57
|   |   |   |   |   |   |   |   |   |--- WindDir9am_NE <= 0.50
|   |   |   |   |   |   |   |   |   |   |--- WindDir9am_NNE <= 0.50
|   |   |   |   |   |   |   |   |   |   |   |--- class: No
|   |   |   |   |   |   |   |   |   |   |--- WindDir9am_NNE >  0.50
|   |   |   |   |   |   |   |   |   |   |   |--- class: No
|   |   |   |   |   |   |   |   |   |--- WindDir9am_NE >  0.50
|   |   |   |   |   |   |   |   |   |   |--- class: Yes
|   |   |   |   |   |   |   |   |--- Humidity3pm >  0.57
|   |   |   |   |   |   |   |   |   |--- MaxTemp <= 0.53
|   |   |   |   |   |   |   |   |   |   |--- class: No
|   |   |   |   |   |   |   |   |   |--- MaxTemp >  0.53
|   |   |   |   |   |   |   |   |   |   |--- Temp3pm <= 0.67
|   |   |   |   |   |   |   |   |   |   |   |--- class: No
|   |   |   |   |   |   |   |   |   |   |--- Temp3pm >  0.67
|   |   |   |   |   |   |   |   |   |   |   |--- class: No
|   |   |   |   |--- WindGustSpeed >  0.36
|   |   |   |   |   |--- Humidity3pm <= 0.45
|   |   |   |   |   |   |--- Sunshine <= 0.39
|   |   |   |   |   |   |   |--- class: No
|   |   |   |   |   |   |--- Sunshine >  0.39
|   |   |   |   |   |   |   |--- class: No
|   |   |   |   |   |--- Humidity3pm >  0.45
|   |   |   |   |   |   |--- Pressure3pm <= 0.49
|   |   |   |   |   |   |   |--- class: Yes
|   |   |   |   |   |   |--- Pressure3pm >  0.49
|   |   |   |   |   |   |   |--- class: Yes
|   |   |   |--- Pressure3pm >  0.58
|   |   |   |   |--- Pressure3pm <= 0.70
|   |   |   |   |   |--- Sunshine <= 0.32
|   |   |   |   |   |   |--- WindDir9am_N <= 0.50
|   |   |   |   |   |   |   |--- Humidity3pm <= 0.67
|   |   |   |   |   |   |   |   |--- class: No
|   |   |   |   |   |   |   |--- Humidity3pm >  0.67
|   |   |   |   |   |   |   |   |--- class: No
|   |   |   |   |   |   |--- WindDir9am_N >  0.50
|   |   |   |   |   |   |   |--- class: No
|   |   |   |   |   |--- Sunshine >  0.32
|   |   |   |   |   |   |--- WindGustSpeed <= 0.33
|   |   |   |   |   |   |   |--- class: No
|   |   |   |   |   |   |--- WindGustSpeed >  0.33
|   |   |   |   |   |   |   |--- class: No
|   |   |   |   |--- Pressure3pm >  0.70
|   |   |   |   |   |--- Location_CoffsHarbour <= 0.50
|   |   |   |   |   |   |--- class: No
|   |   |   |   |   |--- Location_CoffsHarbour >  0.50
|   |   |   |   |   |   |--- class: No
|   |   |--- Sunshine >  0.52
|   |   |   

Random Forest with the Australia Rain Dataset

While tuning the hyperparameters of a single decision tree may lead to some improvements, a much more effective strategy is to combine the results of several decision trees, each trained with slightly different randomization. This is called a random forest model.

The key idea here is that each decision tree in the forest will make different kinds of errors, and upon averaging, many of their errors will cancel out.

A random forest works by averaging/combining the results of several decision trees:

[Diagram: predictions from several decision trees being combined into a single random forest prediction]

We'll use the RandomForestClassifier class from sklearn.ensemble.

from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_jobs = -1, random_state=42) # n_jobs= -1 to train in parallel 
%%time
model.fit(X_train, train_targets)
CPU times: user 38.8 s, sys: 156 ms, total: 39 s
Wall time: 4.13 s
RandomForestClassifier(n_jobs=-1, random_state=42)
model.score(X_train, train_targets)
0.9999494888269285
model.score(X_val, val_targets)
0.8566537055307295

Once again, the training accuracy is almost 100%, but this time the validation accuracy is much better. In fact, it is better than the best single decision tree we had trained so far. Do you see the power of random forests?

This general technique of combining the results of many models is called "ensembling"; it works because most errors of individual models cancel out on averaging. Here's what it looks like visually:

[Diagram: ensembling - the errors of individual models cancel out when their predictions are combined]

We can also look at the probabilities for the predictions. The probability of a class is simply the fraction of trees that predicted the given class.

train_probs = model.predict_proba(X_train)
train_preds
array(['No', 'No', 'No', ..., 'No', 'No', 'No'], dtype=object)
train_probs
array([[0.93, 0.07],
       [1.  , 0.  ],
       [0.99, 0.01],
       ...,
       [0.99, 0.01],
       [1.  , 0.  ],
       [0.96, 0.04]])
model.estimators_[0]
DecisionTreeClassifier(max_features='auto', random_state=1608637542)
len(model.estimators_)
100
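
As a quick sanity check on the averaging, we can reproduce the forest's predicted probabilities by averaging the per-tree probabilities ourselves (passing a plain array to the individual estimators, since they were fitted internally on arrays):

tree_probs = np.stack([est.predict_proba(X_val.values) for est in model.estimators_])
manual_probs = tree_probs.mean(axis=0) # average over the 100 trees
np.allclose(manual_probs, model.predict_proba(X_val)) # should be True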
    
plt.figure(figsize=(80,40))
plot_tree(model.estimators_[0], max_depth=2, feature_names=X_train.columns, filled=True, class_names=model.classes_)
[Text(2232.0, 1902.6000000000001, 'Sunshine <= 0.403\ngini = 0.347\nsamples = 62607\nvalue = [76887, 22101]\nclass = No'),
 Text(1116.0, 1359.0, 'Pressure9am <= 0.609\ngini = 0.499\nsamples = 11288\nvalue = [9272, 8542]\nclass = No'),
 Text(558.0, 815.4000000000001, 'Cloud9am <= 0.833\ngini = 0.475\nsamples = 6067\nvalue = [3702, 5808]\nclass = Yes'),
 Text(279.0, 271.79999999999995, '\n  (...)  \n'),
 Text(837.0, 271.79999999999995, '\n  (...)  \n'),
 Text(1674.0, 815.4000000000001, 'WindGustDir_NNE <= 0.5\ngini = 0.442\nsamples = 5221\nvalue = [5570, 2734]\nclass = No'),
 Text(1395.0, 271.79999999999995, '\n  (...)  \n'),
 Text(1953.0, 271.79999999999995, '\n  (...)  \n'),
 Text(3348.0, 1359.0, 'RainToday_Yes <= 0.5\ngini = 0.278\nsamples = 51319\nvalue = [67615, 13559]\nclass = No'),
 Text(2790.0, 815.4000000000001, 'Pressure9am <= 0.521\ngini = 0.207\nsamples = 41960\nvalue = [58514, 7796]\nclass = No'),
 Text(2511.0, 271.79999999999995, '\n  (...)  \n'),
 Text(3069.0, 271.79999999999995, '\n  (...)  \n'),
 Text(3906.0, 815.4000000000001, 'Pressure9am <= 0.614\ngini = 0.475\nsamples = 9359\nvalue = [9101, 5763]\nclass = No'),
 Text(3627.0, 271.79999999999995, '\n  (...)  \n'),
 Text(4185.0, 271.79999999999995, '\n  (...)  \n')]
    
plt.figure(figsize=(80,40))
plot_tree(model.estimators_[40], max_depth=2, feature_names=X_train.columns, filled=True, class_names=model.classes_)
[Text(2232.0, 1902.6000000000001, 'Cloud9am <= 0.722\ngini = 0.35\nsamples = 62556\nvalue = [76638, 22350]\nclass = No'),
 Text(1116.0, 1359.0, 'RainToday_No <= 0.5\ngini = 0.291\nsamples = 48000\nvalue = [62502, 13407]\nclass = No'),
 Text(558.0, 815.4000000000001, 'Evaporation <= 0.038\ngini = 0.483\nsamples = 9005\nvalue = [8394, 5804]\nclass = No'),
 Text(279.0, 271.79999999999995, '\n  (...)  \n'),
 Text(837.0, 271.79999999999995, '\n  (...)  \n'),
 Text(1674.0, 815.4000000000001, 'Pressure9am <= 0.517\ngini = 0.216\nsamples = 38995\nvalue = [54108, 7603]\nclass = No'),
 Text(1395.0, 271.79999999999995, '\n  (...)  \n'),
 Text(1953.0, 271.79999999999995, '\n  (...)  \n'),
 Text(3348.0, 1359.0, 'Humidity3pm <= 0.755\ngini = 0.475\nsamples = 14556\nvalue = [14136, 8943]\nclass = No'),
 Text(2790.0, 815.4000000000001, 'Rainfall <= 0.004\ngini = 0.388\nsamples = 10894\nvalue = [12707, 4538]\nclass = No'),
 Text(2511.0, 271.79999999999995, '\n  (...)  \n'),
 Text(3069.0, 271.79999999999995, '\n  (...)  \n'),
 Text(3906.0, 815.4000000000001, 'Temp9am <= 0.37\ngini = 0.37\nsamples = 3662\nvalue = [1429, 4405]\nclass = Yes'),
 Text(3627.0, 271.79999999999995, '\n  (...)  \n'),
 Text(4185.0, 271.79999999999995, '\n  (...)  \n')]

A random forest also assigns an importance to each feature, by combining the importance values from the individual trees.

importance_df = pd.DataFrame({
    'feature': X_train.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)
importance_df.head(10)
feature importance
9 Humidity3pm 0.261441
11 Pressure3pm 0.062057
2 Rainfall 0.059139
5 WindGustSpeed 0.056333
4 Sunshine 0.049465
8 Humidity9am 0.040218
14 Temp9am 0.035000
0 MinTemp 0.034894
10 Pressure9am 0.034415
1 MaxTemp 0.032361
plt.title('importance of features')
sns.barplot(data=importance_df.head(10), x='importance', y='feature')
<AxesSubplot:title={'center':'importance of features'}, xlabel='importance', ylabel='feature'>

Hyperparameter Tuning with Random Forests

?RandomForestClassifier
base_model = RandomForestClassifier(random_state=42, n_jobs=-1).fit(X_train, train_targets)
base_train_acc = base_model.score(X_train, train_targets)
base_train_acc
0.9999494888269285
base_val_acc = base_model.score(X_val, val_targets)
base_val_acc
0.8566537055307295
base_accs = base_train_acc, base_val_acc

n_estimators

This argument controls the number of decision trees in the random forest. The default value is 100. For larger datasets, it helps to have a greater number of estimators, but as a general rule, use the smallest number of estimators that gives good results, since training and prediction time grows with the number of trees.

10 estimators

model = RandomForestClassifier(random_state=42, n_jobs=-1, n_estimators=10)
model.fit(X_train, train_targets)
RandomForestClassifier(n_estimators=10, n_jobs=-1, random_state=42)
model.score(X_train, train_targets), model.score(X_val, val_targets)
(0.986958015112943, 0.8485868492832686)
base_accs
(0.9999494888269285, 0.8566537055307295)

500 estimators

model = RandomForestClassifier(random_state=42, n_jobs=-1, n_estimators=500)
model.fit(X_train, train_targets)
RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=42)
model.score(X_train, train_targets), model.score(X_val, val_targets)
(0.9999797955307714, 0.8577563693343393)
base_accs
(0.9999494888269285, 0.8566537055307295)
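
To compare a few more values systematically, one option is a small sweep over n_estimators, along the lines of the max_depth_error helper defined earlier (the values tried below are arbitrary, and refitting the forest several times takes a while):

def n_estimators_accuracy(n):
    model = RandomForestClassifier(random_state=42, n_jobs=-1, n_estimators=n)
    model.fit(X_train, train_targets)
    return {'n_estimators': n,
            'train_acc': model.score(X_train, train_targets),
            'val_acc': model.score(X_val, val_targets)}
estimators_df = pd.DataFrame([n_estimators_accuracy(n) for n in [10, 50, 100, 200]])
estimators_df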

max_depth and max_leaf_nodes

These arguments are passed directly to each decision tree, and control the maximum depth and the maximum number of leaf nodes of each tree respectively. By default, no maximum depth is specified, which is why each tree has a training accuracy of 100%. You can specify a max_depth to reduce overfitting.


Let's define a helper function test_params to make it easy to test hyperparameters.

def test_params(**params):
    model = RandomForestClassifier(random_state=42, n_jobs=-1, **params).fit(X_train, train_targets)
    return model.score(X_train, train_targets), model.score(X_val, val_targets)
test_params(max_depth=5, max_leaf_nodes=1024, n_estimators=1000)
(0.8231704853113508, 0.8279264116998433)
test_params(max_depth=26)
(0.9814826039519942, 0.8572340549010504)
test_params(max_leaf_nodes=2**5)
(0.8314341132258456, 0.833904010214149)
test_params(max_leaf_nodes=2**20)
(0.9999494888269285, 0.8556671116011839)
base_accs
(0.9999494888269285, 0.8566537055307295)

max_features

Instead of picking all features (columns) for every split, we can specify that only a fraction of features be chosen randomly to figure out a split.


Notice that the default value auto causes only $\sqrt{n}$ out of the total $n$ features to be chosen randomly at each split. This is the reason each decision tree in the forest is different. While it may seem counterintuitive, choosing all features for every split of every tree would lead to identical trees, so the random forest would not generalize well.
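
For this dataset, that works out to roughly 10 of the 119 encoded columns being considered at each split (a quick check):

import math
n_features = X_train.shape[1]
n_features, int(math.sqrt(n_features)) # 119 columns -> about 10 features examined per split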

test_params(max_features='log2')
(0.9999595910615429, 0.8558992513493123)
test_params(max_features=3)
(0.9999494888269285, 0.8543323080494458)
test_params(max_features=20)
(0.9999595910615429, 0.8565956705936975)
base_accs
(0.9999494888269285, 0.8566537055307295)

min_samples_split and min_samples_leaf

By default, the decision tree classifier tries to split every node that has 2 or more rows. You can increase the values of these arguments to change this behavior and reduce overfitting, especially for very large datasets.

test_params(min_samples_split=3, min_samples_leaf=2)
(0.9625005051117307, 0.8565956705936975)
test_params(min_samples_split=100, min_samples_leaf=60)
(0.8495676243585081, 0.8451047530613429)
base_accs
(0.9999494888269285, 0.8566537055307295)

min_impurity_decrease

This argument is used to control the threshold for splitting nodes. A node will be split only if the split induces a decrease in impurity (Gini index) greater than or equal to this value. Its default value is 0, and you can increase it to reduce overfitting.

test_params(min_impurity_decrease=1e-7)
(0.9996060128500425, 0.8561313910974406)
test_params(min_impurity_decrease=1e-2)
(0.774891906089627, 0.7882885497069235)
base_accs
(0.9999494888269285, 0.8566537055307295)

bootstrap, max_samples

By default, a random forest doesn't use the entire dataset for training each decision tree. Instead, it applies a technique called bootstrapping: for each tree, rows from the dataset are picked one by one at random, with replacement, i.e. some rows may not show up at all, while others may show up multiple times.


Bootstrapping helps the random forest generalize better, because each decision tree only sees a fraction of the training set, and some rows randomly get a higher weight than others.
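
Here's a tiny toy illustration of bootstrap sampling with NumPy (10 rows; this just shows the effect, it is not scikit-learn's internal code):

rng = np.random.default_rng(42)
sample_idx = rng.integers(0, 10, size=10) # draw 10 row indices with replacement
print(sorted(sample_idx)) # some indices repeat...
print(set(range(10)) - set(sample_idx)) # ...and some rows never get picked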

test_params(bootstrap=False)
(0.9999797955307714, 0.8567697754047937)
base_accs
(0.9999494888269285, 0.8566537055307295)

When bootstrapping is enabled, you can also control the number or fraction of rows to be considered for each bootstrap sample using max_samples. This can further help the model generalize.


test_params(max_samples=0.9)
(0.9997676486038711, 0.8565376356566653)
base_accs
(0.9999494888269285, 0.8566537055307295)

class_weight

model.classes_
array(['No', 'Yes'], dtype=object)
test_params(class_weight='balanced')
(0.9999494888269285, 0.8543903429864779)
test_params(class_weight={'No': 1, 'Yes': 2})
(0.9999595910615429, 0.8558412164122802)
base_accs
(0.9999494888269285, 0.8566537055307295)

Putting it together

model = RandomForestClassifier(n_jobs=-1, 
                               random_state=42, 
                               n_estimators=500,
                               max_features=7,
                               max_depth=30, 
                               class_weight={'No': 1, 'Yes': 1.5})
model.fit(X_train, train_targets)
RandomForestClassifier(class_weight={'No': 1, 'Yes': 1.5}, max_depth=30,
                       max_features=7, n_estimators=500, n_jobs=-1,
                       random_state=42)
model.score(X_train, train_targets), model.score(X_val, val_targets)
(0.9920192346547057, 0.8563054959085369)
base_accs
(0.9999494888269285, 0.8566537055307295)
model.score(X_test, test_targets)
0.8451913451913452

Making Predictions on New Inputs

def predict_input(model, single_input):
    input_df = pd.DataFrame([single_input])
    input_df[numeric_cols] = imputer.transform(input_df[numeric_cols])
    input_df[numeric_cols] = scaler.transform(input_df[numeric_cols])
    input_df[encoded_cols] = encoder.transform(input_df[categorical_cols])
    X_input = input_df[numeric_cols + encoded_cols]
    pred = model.predict(X_input)[0]
    prob = model.predict_proba(X_input)[0][list(model.classes_).index(pred)]
    return pred, prob
new_input = {'Date': '2021-06-19',
             'Location': 'Launceston',
             'MinTemp': 23.2,
             'MaxTemp': 33.2,
             'Rainfall': 10.2,
             'Evaporation': 4.2,
             'Sunshine': np.nan,
             'WindGustDir': 'NNW',
             'WindGustSpeed': 52.0,
             'WindDir9am': 'NW',
             'WindDir3pm': 'NNE',
             'WindSpeed9am': 13.0,
             'WindSpeed3pm': 20.0,
             'Humidity9am': 89.0,
             'Humidity3pm': 58.0,
             'Pressure9am': 1004.8,
             'Pressure3pm': 1001.5,
             'Cloud9am': 8.0,
             'Cloud3pm': 5.0,
             'Temp9am': 25.7,
             'Temp3pm': 33.0,
             'RainToday': 'Yes'}
predict_input(model, new_input)
('Yes', 0.7608595348304202)
raw_df.Location.unique()
array(['Albury', 'BadgerysCreek', 'Cobar', 'CoffsHarbour', 'Moree',
       'Newcastle', 'NorahHead', 'NorfolkIsland', 'Penrith', 'Richmond',
       'Sydney', 'SydneyAirport', 'WaggaWagga', 'Williamtown',
       'Wollongong', 'Canberra', 'Tuggeranong', 'MountGinini', 'Ballarat',
       'Bendigo', 'Sale', 'MelbourneAirport', 'Melbourne', 'Mildura',
       'Nhil', 'Portland', 'Watsonia', 'Dartmoor', 'Brisbane', 'Cairns',
       'GoldCoast', 'Townsville', 'Adelaide', 'MountGambier', 'Nuriootpa',
       'Woomera', 'Albany', 'Witchcliffe', 'PearceRAAF', 'PerthAirport',
       'Perth', 'SalmonGums', 'Walpole', 'Hobart', 'Launceston',
       'AliceSprings', 'Darwin', 'Katherine', 'Uluru'], dtype=object)

Saving and Loading Trained Models

import joblib
aussie_rain = {
    'model': model,
    'imputer': imputer,
    'scaler': scaler,
    'encoder': encoder,
    'input_cols': input_cols,
    'target_col': target_cols,
    'numeric_cols': numeric_cols,
    'categorical_cols': categorical_cols,
    'encoded_cols': encoded_cols
}
joblib.dump(aussie_rain, 'aussie_rain.joblib')
['aussie_rain.joblib']
aussie_rain2 = joblib.load('aussie_rain.joblib')
test_preds2 = aussie_rain2['model'].predict(X_test)
accuracy_score(test_targets, test_preds2)
0.8451913451913452