import pandas as pd
import numpy as np
from sklearn.cluster import MiniBatchKMeans

# string used to separate category titles from values, e.g. 'Type: A'
super_string = ': '

def main(net, df=None, ds_type='kmeans', axis='row', num_samples=100, random_state=1000):
    # ds_type is accepted for API compatibility, but only 'kmeans'
    # (mini-batch k-means) is currently implemented
    if df is None:
        df = net.export_df()

    # run downsampling and load the downsampled DataFrame back into the network
    ds_df, ds_data = run_kmeans_mini_batch(df, num_samples, axis, random_state)
    net.load_df(ds_df)

    return ds_data

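# A minimal usage sketch (assumes a clustergrammer-style Network object,
# which supplies the export_df/load_df methods used above; the filename
# is hypothetical):
#
#   from clustergrammer import Network
#   net = Network()
#   net.load_file('example_matrix.txt')
#   cluster_data = main(net, axis='row', num_samples=50)
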
def run_kmeans_mini_batch(df, num_samples=100, axis='row', random_state=1000):

    # gather downsampled-axis information; k-means always runs on the rows,
    # so transpose first when downsampling columns
    if axis == 'row':
        X = df
        orig_labels = df.index.tolist()
        non_ds_labels = df.columns.tolist()
    else:
        X = df.transpose()
        orig_labels = df.columns.tolist()
        non_ds_labels = df.index.tolist()

    # run until the number of returned clusters with data-points is equal to
    # the number of requested clusters
    num_returned_clusters = 0
    while num_samples != num_returned_clusters:

        clusters, num_returned_clusters, cluster_data, cluster_pop = \
            calc_mbk_clusters(X, num_samples, random_state)

        # double the random state so that any retry uses a different seed
        random_state = random_state + random_state

    clust_numbers = range(num_returned_clusters)
    clust_labels = ['cluster-' + str(i) for i in clust_numbers]

    # tuple labels indicate that categories are attached to the names
    found_cats = isinstance(orig_labels[0], tuple)

    # gather categories if necessary
    if found_cats:
        all_cats = generate_cat_data(cluster_data, orig_labels, num_samples)

    # generate cluster labels, e.g. add the number of points in each cluster
    # and the majority category, if necessary
    cluster_labels = []
    for i in range(num_returned_clusters):

        inst_name = 'Cluster: ' + clust_labels[i]
        num_in_clust_string = 'number in clust: ' + str(cluster_pop[i])

        inst_tuple = (inst_name,)
        if found_cats:
            for cat_data in all_cats:
                cat_values = cat_data['counts'][i]
                max_cat_fraction = cat_values.max()
                max_index = np.where(cat_values == max_cat_fraction)[0][0]
                max_cat_name = cat_data['types'][max_index]

                # add category title if available
                cat_name_string = 'Majority-' + cat_data['title'] + ': ' + max_cat_name
                inst_tuple = inst_tuple + (cat_name_string,)

        inst_tuple = inst_tuple + (num_in_clust_string,)
        cluster_labels.append(inst_tuple)

    # ds_df always holds downsampled rows; if the user asked to downsample
    # the columns, the DataFrame is transposed back below
    ds_df = pd.DataFrame(data=clusters, index=cluster_labels, columns=non_ds_labels)

    # swap back for downsampled columns
    if axis == 'col':
        ds_df = ds_df.transpose()

    return ds_df, cluster_data

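# For reference, a downsampled row label produced above is a tuple like
# (values illustrative):
#   ('Cluster: cluster-0', 'Majority-Type: A', 'number in clust: 27')
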
def generate_cat_data(cluster_data, orig_labels, num_samples):

    # store orig_labels as an array so that label subsets can be gathered
    # using index arrays
    orig_array = np.asarray(orig_labels)

    example_label = orig_labels[0]

    # find out how many string categories are available; index zero of each
    # label tuple is reserved for the name itself (this assumes the string
    # categories occupy the leading positions after the name)
    num_string_cats = 0
    for i in range(1, len(example_label)):
        inst_cat = example_label[i]
        if super_string in inst_cat:
            inst_cat = inst_cat.split(super_string)[1]

        # a category counts as a string category if its value cannot be
        # parsed as a number
        try:
            float(inst_cat)
            string_cat = False
        except ValueError:
            string_cat = True

        if string_cat:
            num_string_cats = num_string_cats + 1

    all_cats = []
    for inst_cat_num in range(num_string_cats):

        # index zero is for the names
        cat_index = inst_cat_num + 1

        cat_data = {}
        if super_string in example_label[cat_index]:
            cat_data['title'] = example_label[cat_index].split(super_string)[0]
        else:
            cat_data['title'] = 'Category'

        # keep track of how many members of each category are found in each
        # of the downsampled clusters; first gather the possible categories
        cat_data['types'] = []
        for inst_label in orig_labels:
            inst_cat = inst_label[cat_index]
            if super_string in inst_cat:
                inst_cat = inst_cat.split(super_string)[1]
            cat_data['types'].append(inst_cat)
        cat_data['types'] = sorted(set(cat_data['types']))
        num_types = len(cat_data['types'])

        # initialize cat_data['counts']: one count vector per cluster
        cat_data['counts'] = {}
        for inst_clust in range(num_samples):
            cat_data['counts'][inst_clust] = np.zeros([num_types])

        # populate cat_data['counts']
        for inst_clust in range(num_samples):

            # get the indices of all original labels that fall in the cluster
            found = np.where(cluster_data == inst_clust)
            found_indices = found[0]
            clust_names = orig_array[found_indices]

            for inst_name in clust_names:
                # get the category value for this label
                inst_name = inst_name[cat_index]
                if super_string in inst_name:
                    inst_name = inst_name.split(super_string)[1]
                tmp_index = cat_data['types'].index(inst_name)
                cat_data['counts'][inst_clust][tmp_index] += 1

        # convert counts to fractions
        for inst_clust in range(num_samples):
            counts = cat_data['counts'][inst_clust]
            inst_total = np.sum(counts)
            cat_data['counts'][inst_clust] = counts / inst_total

        all_cats.append(cat_data)

    return all_cats

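# For reference, each entry of the returned all_cats list is a dict with:
#   'title'  - category title parsed from the label, e.g. 'Type'
#   'types'  - sorted unique category values
#   'counts' - dict mapping cluster index to an array holding the fraction
#              of the cluster's members that fall in each category
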
def calc_mbk_clusters(X, n_clusters, random_state=1000):

    # k-means is run with rows as data-points and columns as dimensions
    mbk = MiniBatchKMeans(init='k-means++', n_clusters=n_clusters,
                          max_no_improvement=100, verbose=0,
                          random_state=random_state)
    mbk.fit(X)

    cluster_data = mbk.labels_
    clusters = mbk.cluster_centers_

    # count how many points received each label; this gives the population
    # of each returned cluster (clusters that received no points are absent)
    mbk_cluster_names, cluster_pop = np.unique(cluster_data, return_counts=True)
    num_returned_clusters = len(cluster_pop)

    return clusters, num_returned_clusters, cluster_data, cluster_pop
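

if __name__ == '__main__':
    # minimal self-contained sketch on synthetic data (no Network object
    # required); the row names, category values, and sizes below are made
    # up for illustration
    rows = [('row-' + str(i), 'Type: ' + ('A' if i % 2 == 0 else 'B'))
            for i in range(500)]
    cols = ['col-' + str(j) for j in range(10)]
    df = pd.DataFrame(np.random.rand(500, 10), index=rows, columns=cols)

    # downsample 500 rows into 20 k-means clusters
    ds_df, cluster_data = run_kmeans_mini_batch(df, num_samples=20, axis='row')
    print(ds_df.shape)  # expect (20, 10)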