# k-Means Clustering on the Mall Customer Segmentation Dataset
# - Importing dependencies
# - EDA on the Mall Customer Segmentation Dataset
# - WCSS - Within-Cluster Sum of Squares
# - k-Means clustering
# - Visualizing clusters
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
import os
# Load the Mall Customers CSV and pull out the two columns used for clustering.
data_dir = '/media/siddy/D Drive/Datasets/'
print(os.listdir(data_dir))

# Pass each path component separately so the result does not depend on
# data_dir ending with a path separator.
file_path = os.path.join(data_dir, 'Clustering', "Mall_Customers.csv")
print(file_path)

df = pd.read_csv(file_path)

# Quick look at the data: first rows, dtypes/non-null counts, missing values.
# (The bare expressions only display a value when run as notebook cells.)
df.head(10)
df.info()
df.isnull().sum()

# Columns at positions 3 and 4 as a NumPy array.
# NOTE(review): presumably annual income and spending score, going by the
# axis labels used when plotting - confirm against the CSV header.
features = df.iloc[:, [3, 4]].values
features
# Within-Cluster Sum of Squares (WCSS) for k = 1..10, drawn as an elbow plot.
cluster_counts = range(1, 11)
wcss = []
for i in cluster_counts:
    # Fixed seed so repeated runs of the sweep give the same curve.
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42)
    kmeans.fit(features)
    # inertia_ is the fitted model's WCSS value.
    wcss.append(kmeans.inertia_)

sns.set()
plt.plot(cluster_counts, wcss)
plt.title('ELBOW point graph')
plt.xlabel('No of clusters')
plt.ylabel('WCSS')
plt.show()
# Optimum number of clusters = 5 (chosen from the elbow plot)
# Final model: five clusters, k-means++ seeding, fixed random seed.
kmeans = KMeans(
    n_clusters=5,
    init='k-means++',
    random_state=0,
)

# Fit on the feature matrix and get one cluster label per data point.
y = kmeans.fit_predict(features)
print(y)
len(y)
# Scatter plot of the clustered points, one colour per cluster, plus centroids.
plt.figure(figsize=(8, 8))

# Position in this tuple is the cluster label in y (0-4); the legend shows
# them as "Cluster 1" .. "Cluster 5".
cluster_colors = ('green', 'blue', 'red', 'orange', 'magenta')
for cluster_idx, color in enumerate(cluster_colors):
    # Boolean mask selecting the rows assigned to this cluster.
    members = y == cluster_idx
    # Column 0 of features on the x axis, column 1 on the y axis; s = dot size.
    plt.scatter(
        features[members, 0],
        features[members, 1],
        s=50,
        c=color,
        label=f'Cluster {cluster_idx + 1}',
    )

# plot the centroids
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1], s=100, c='black', label='centroids')

plt.title('Customer Groups')
plt.xlabel('Annual Income')
plt.ylabel('Spending Score')
# Without this call the label= values set above are never drawn.
plt.legend()
plt.show()