-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathkmeans.py
52 lines (46 loc) · 2.25 KB
/
kmeans.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# Importing the libraries
import matplotlib.pyplot as plt
import pandas as pd
# Load the mall-customers dataset shipped with the course material
data_path = 'annotated-code/Part 4 - Clustering/Section 24 - K-Means Clustering/Mall_Customers.csv'
dataset = pd.read_csv(data_path)
# Cluster on only two variables so that the result can be visualized in 2D:
# column 3 (annual income) and column 4 (spending score).
feature_columns = [3, 4]
X = dataset.iloc[:, feature_columns].values  # all rows, selected feature columns, as an array
# Use the elbow method to find the optimal number of clusters
from sklearn.cluster import KMeans
# Candidate cluster counts to evaluate: 1 through 10 (inclusive)
cluster_counts = range(1, 11)
wcss = []
for i in cluster_counts:
    # random_state is fixed in this example to make sure the results match with the course.
    # n_init is pinned to 10 for the same reason: newer scikit-learn releases default to
    # n_init='auto' (a single initialization for k-means++), which can change the fitted
    # clusters and therefore the WCSS curve.
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)  # WCSS is called inertia_ in sklearn
# Plot WCSS against the number of clusters; the "elbow" of the curve suggests the best count
plt.plot(cluster_counts, wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.show()
n_clusters_opt = 5  # By eyeballing the graph, we conclude that the optimal number of clusters is 5
# Apply k-means to the dataset using the optimal number of clusters.
# n_init is set explicitly to 10 so the result does not depend on the installed
# scikit-learn version (newer releases default to n_init='auto').
kmeans = KMeans(n_clusters=n_clusters_opt, init='k-means++', max_iter=300, n_init=10, random_state=0)
# kmeans.fit_predict returns a 1-D array of shape (n_samples,) holding the index of the cluster
# each observation belongs to, e.g. [0, 2, ...] means the first observation is in cluster 0 and
# the second in cluster 2. Note the cluster numbering is 0-based.
y_kmeans = kmeans.fit_predict(X)
# Visualizing the Clusters using a scatter plot (only works for 2D)
# Plotting high-dimensional problems can be done using dimensionality reduction techniques like PCA or LDA
colors = ['red', 'blue', 'green', 'cyan', 'magenta']
for cluster_num in range(n_clusters_opt):
    # Boolean mask selecting the rows of X that were assigned to this cluster
    in_cluster = y_kmeans == cluster_num
    plt.scatter(
        X[in_cluster, 0],  # first feature column of the rows in this cluster
        X[in_cluster, 1],  # second feature column of the rows in this cluster
        s=100,
        # Wrap around the palette so that choosing more clusters than there are
        # colors reuses colors instead of raising IndexError
        c=colors[cluster_num % len(colors)],
        label='Cluster ' + str(cluster_num + 1)  # legend labels are 1-based
    )
# Draw the fitted centroids on top, larger and in a distinct color
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=300, c='yellow', label='Centroids')
plt.title('Clusters of Clients')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (0-100)')
plt.legend()
plt.show()