-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathanalyze_kmeans-1-8-8-4-21.py
36 lines (29 loc) · 1.37 KB
/
analyze_kmeans-1-8-8-4-21.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
# Number of clusters K-Means is asked to find; the per-cluster summary loop
# later in this script iterates over the same range.
CLUSTER_NUM = 5

# Load the input CSV relative to the current working directory.
# Forward slashes are accepted by Windows as well as POSIX systems, whereas a
# backslash-separated literal only resolves on Windows.
df_kddcup = pd.read_csv('./dataset/kddcup.data_small.csv')
# Keep only the eight columns at these positions; the rest of the script
# refers to them by name (duration, wrong_fragment, num_failed_logins,
# logged_in, root_shell, dst_host_same_src_port_rate, dst_host_serror_rate,
# dst_host_rerror_rate).
df_kddcup = df_kddcup.iloc[:, [0, 7, 10, 11, 13, 35, 37, 39]]

# Standardise every feature to zero mean and unit standard deviation so that
# no single column dominates the Euclidean distances used by K-Means.
# Dividing by the mean (instead of the standard deviation) would blow up
# columns whose mean is close to zero and flip the sign for negative means.
feature_std = df_kddcup.std()
# A constant column has zero spread; dividing by 1 keeps it at 0 after
# centring instead of producing NaN, which K-Means cannot accept.
feature_std = feature_std.replace(0, 1)
df_kddcup = (df_kddcup - df_kddcup.mean()) / feature_std
# Feature columns, in the order they should appear in the matrix handed to
# the clustering step.
feature_columns = [
    'duration',
    'wrong_fragment',
    'num_failed_logins',
    'logged_in',
    'root_shell',
    'dst_host_same_src_port_rate',
    'dst_host_serror_rate',
    'dst_host_rerror_rate',
]

# Build the (n_rows, n_features) float matrix directly from the frame.
# The builtin ``float`` replaces the ``np.float`` alias, which was removed in
# NumPy 1.24 and raises AttributeError there. ``copy=True`` keeps the array
# independent of the DataFrame, as the original list-based construction was.
kddcup_array = df_kddcup[feature_columns].to_numpy(dtype=float, copy=True)
# Fit K-Means on the feature matrix and obtain the cluster label assigned to
# each row of the matrix.
kmeans_model = KMeans(n_clusters=CLUSTER_NUM)
pred = kmeans_model.fit_predict(kddcup_array)

# Store the labels next to the features and dump the labelled frame.
df_kddcup['cluster_id'] = pred
print(df_kddcup)

# Show how many rows ended up in each cluster.
cluster_sizes = df_kddcup['cluster_id'].value_counts()
print(cluster_sizes)
# One column per cluster ("cluster0", "cluster1", ...), each holding the
# column-wise mean of the rows labelled with that cluster id.
cluster_means = {
    'cluster' + str(cluster_idx):
        df_kddcup[df_kddcup['cluster_id'] == cluster_idx].mean()
    for cluster_idx in range(CLUSTER_NUM)
}
cluster_info = pd.DataFrame(cluster_means)

# The mean of the label column itself carries no information; remove that row.
cluster_info = cluster_info.drop('cluster_id')
# Draw one stacked bar per cluster, with one segment per feature mean.
# ``rot=0`` keeps the x tick labels horizontal.
kdd_plot = cluster_info.T.plot(
    kind='bar',
    stacked=True,
    rot=0,
    title="Mean Value of Clusters",
)
print('finish!!')

# Without an explicit show() the figure is built but never displayed when the
# script runs outside an interactive session. Called last because it blocks
# until the window is closed on interactive backends.
plt.show()