-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathKMeans_Sales_Transactions_sklearn.py
111 lines (90 loc) · 3.36 KB
/
KMeans_Sales_Transactions_sklearn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
## k-Means Sales Transactions Scikit Learn Version
#
# Author: David Lee
# Create Date: 2018/10/22
#
# Detail:
# Total Data = 811
import numpy as np
import pandas as pd # Read csv
import matplotlib.pyplot as plt # Plot elbow
from sklearn.cluster import KMeans
from sklearn import metrics # Evaluate model
def loadData(path, data='normalized'):
    """Load the weekly Sales Transactions dataset and select feature columns.

    Args:
        path: Path to the Sales_Transactions_Dataset_Weekly CSV file.
        data: Which feature columns to keep —
            'normalized': the last 52 columns (normalized weekly sales,
                the representation used in the reference paper),
            'original':   the first 52 columns (raw weekly sales),
            anything else: all columns.

    Returns:
        A 2-D numpy array of the selected columns (one row per product).
    """
    inputData = pd.read_csv(path)
    # Drop the product code — it is an identifier, not a feature.
    # (keyword `columns=` instead of positional axis: the positional
    # form was deprecated and removed in pandas 2.0)
    inputData = inputData.drop(columns=['Product_Code'])
    if data == 'normalized':
        inputData = inputData.iloc[:, -52:]  # normalized weeks only (per paper)
    elif data == 'original':
        inputData = inputData.iloc[:, :52]   # raw (non-normalized) weeks only
    # else: keep every column
    return np.array(inputData)
def trainKMeans(data_train, k):
    """Fit a k-means model with *k* clusters on *data_train* and return it.

    KMeans.fit returns the fitted estimator itself, so the fit-and-return
    can be expressed as a single chained call.
    """
    return KMeans(n_clusters=k).fit(data_train)
def testScore(data_train, kmeans):
return kmeans.score(data_train)
def evaluateModel(data_train, kmeans):
    """Evaluate a fitted k-means model with two internal clustering metrics.

    Args:
        data_train: The data the model was fitted on.
        kmeans: A fitted estimator exposing ``labels_``.

    Returns:
        A tuple ``(silhouette_score, calinski_harabasz_score)``.
    """
    labels = kmeans.labels_
    silhouette_score = metrics.silhouette_score(data_train, labels, metric="euclidean")
    # scikit-learn renamed the misspelled `calinski_harabaz_score` to
    # `calinski_harabasz_score` in 0.20 and removed the old name in 0.23;
    # prefer the current name and fall back for very old installations.
    try:
        calinski_harabasz_score = metrics.calinski_harabasz_score(data_train, labels)
    except AttributeError:  # sklearn < 0.20
        calinski_harabasz_score = metrics.calinski_harabaz_score(data_train, labels)
    print("the mean Silhouette Coefficient of all samples:", silhouette_score)
    print("the Calinski and Harabasz score:", calinski_harabasz_score)
    return silhouette_score, calinski_harabasz_score
def process(dataType='normalized'):
    """Run k-means for k = 2..MAX_TRY on the sales dataset and plot three
    evaluation metrics (model score, silhouette, Calinski-Harabasz) as a
    function of k in a single three-panel figure.

    Args:
        dataType: Column selection forwarded to loadData ('normalized',
            'original', or anything else for all columns).

    Note: builds the figure but does not call plt.show(); the caller
    decides when to display.
    """
    MAX_TRY = 20  # largest cluster count to try
    # Load Data
    data_train = loadData('Datasets/Sales_Transactions_Dataset_Weekly.csv', data=dataType)
    print("==== %s ====" % (dataType))
    # Train Model and Test Score and Evaluate Model
    # try many different k
    scores = []   # KMeans score (negative objective) per k
    scores2 = []  # silhouette coefficient per k
    scores3 = []  # Calinski-Harabasz score per k
    for k in range(2, MAX_TRY+1):
        # Train
        kMeans_model = trainKMeans(data_train, k)
        # Score
        score = float(testScore(data_train, kMeans_model))
        scores.append(score)
        print('Score of k = %d:' % (k), score)
        # Evaluate
        score2, score3 = evaluateModel(data_train, kMeans_model)
        scores2.append(score2)
        scores3.append(score3)
    # Plot the k - loss diagram
    fig = plt.figure(figsize=(15, 5))
    fig.suptitle('Comparison of three metrics score (%s)' % (dataType))
    plt.subplot(131)
    plt.ylabel("Opposite of the value of X on the K-means objective")
    plt.grid(True)
    plt.xticks(range(2, MAX_TRY+1))
    # negate so the curve shows the (positive) objective value
    plt.plot(np.arange(2, MAX_TRY+1, 1), np.array(scores)*-1)
    plt.subplot(132)
    plt.ylabel("Mean Silhouette Coefficient of all samples")
    plt.grid(True)
    plt.xticks(range(2, MAX_TRY+1))
    plt.plot(np.arange(2, MAX_TRY+1, 1), np.array(scores2))
    plt.subplot(133)
    plt.ylabel("Calinski and Harabaz score")
    plt.grid(True)
    plt.xticks(range(2, MAX_TRY+1))
    plt.plot(np.arange(2, MAX_TRY+1, 1), np.array(scores3))
    # Annotate the k that maximizes the Calinski-Harabasz score in panel 3.
    k_max = np.arange(2, MAX_TRY+1, 1)[scores3.index(max(scores3))]
    arrow_xy = [k_max, max(scores3)]
    text_xy = arrow_xy[:]  # copy so the arrow target is not mutated
    text_xy[0] += MAX_TRY//2   # shift label right of the arrow tip
    text_xy[1] = text_xy[1] * 0.8  # and slightly below the maximum
    plt.annotate('k of max\n(k=%d)' % (k_max), xy=arrow_xy, xytext=text_xy,
                 arrowprops=dict(facecolor='black', shrink=0.05))
    plt.subplots_adjust(top=0.88, bottom=0.11, left=0.08, right=0.98, hspace=0.20,
                        wspace=0.20)
def main():
    """Run the clustering experiment on both column selections, then show
    all figures at once."""
    for dataType in ('normalized', 'original'):
        process(dataType)
    # process('all data')
    plt.show()

if __name__ == '__main__':
    main()