"""Quality measures for (subspace) clusterings: spatial coherence, F1,
entropy, RNIA, CE, and a measure-based ranking helper."""
import math

import numpy as np

from clustering import *
from hungarian import *
####################################################### SPATIAL COHERENCE #######################################################
def cluster_label_change(cluster, max_size):
    """Count the label changes induced by one cluster: one change per gap in
    its sorted object ids, plus one if it does not end at the last object."""
    objects = sorted(cluster.objects)
    if not objects:
        return 0
    nbr = 0
    # a change occurs wherever two consecutive object ids are not adjacent
    for cur, nxt in zip(objects, objects[1:]):
        if cur + 1 != nxt:
            nbr += 1
    # one more change if the cluster does not reach the last possible object
    if objects[-1] != max_size - 1:
        nbr += 1
    return nbr
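# A minimal worked sketch (the _Cluster stand-in below is hypothetical; the
# project's real cluster classes live in clustering.py): the sorted objects
# {0, 1, 2, 5, 6} in a dataset of 8 objects have one internal gap (2 -> 5)
# and do not end at object 7, so two label changes are counted.
def _example_cluster_label_change():
    from collections import namedtuple
    _Cluster = namedtuple('_Cluster', ['objects'])
    assert cluster_label_change(_Cluster(objects={0, 1, 2, 5, 6}), 8) == 2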
def clustering_label_change(clustering, max_size):
    """Count the total number of label changes over all clusters of a clustering."""
    if not clustering.clusters:
        return 0
    return sum(cluster_label_change(clt, max_size) for clt in clustering.clusters)
def expected_number_of_changes(clustering, max_size):
    """Expected number of label changes: max_size - sum_i |C_i|^2 / max_size."""
    if not clustering.clusters:
        return 0
    sum_of_squares = sum(len(cluster.objects) ** 2 for cluster in clustering.clusters)
    return max_size - sum_of_squares / float(max_size)
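# A quick sanity check of the formula above (hypothetical stand-ins, not the
# project's real classes): two clusters of size 4 in a dataset of 8 objects
# give 8 - (4**2 + 4**2) / 8 = 4 expected changes.
def _example_expected_number_of_changes():
    from collections import namedtuple
    _Cluster = namedtuple('_Cluster', ['objects'])
    _Clustering = namedtuple('_Clustering', ['clusters'])
    clt = _Clustering(clusters=[_Cluster(objects=set(range(0, 4))),
                                _Cluster(objects=set(range(4, 8)))])
    assert expected_number_of_changes(clt, 8) == 4.0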
def spatial_coherence(clustering):
    """Spatial coherence of a clustering, normalized by the expected number of
    label changes. The tricky part: a clustering may not cover every object
    (noise), so we need the cardinality of the whole dataset."""
    cardinality = clustering.max_cardinality
    if clustering.contains_noise:
        # add the noisy (unclustered) objects as an additional cluster
        all_objects = set(range(cardinality))  # all objects in the data space
        other_objects = all_objects.difference(clustering.get_objects())
        other_cluster = SubspaceCluster(clustering_id=clustering.clustering_id,
                                        objects=other_objects, dimensions=[])
        clustering.clusters.append(other_cluster)
    nbr_changes = clustering_label_change(clustering, cardinality)
    expected = expected_number_of_changes(clustering, cardinality)
    if not expected:
        return 0.0, 1
    return float(nbr_changes - expected) / (cardinality - expected), 1
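# End-to-end sketch for spatial_coherence. The classes below are hypothetical
# stand-ins exposing only the attributes the function reads (clusters,
# contains_noise, max_cardinality); the real API is in clustering.py.
def _example_spatial_coherence():
    class _Cluster(object):
        def __init__(self, objects):
            self.objects = objects
    class _Clustering(object):
        def __init__(self, clusters, max_cardinality):
            self.clusters = clusters
            self.contains_noise = False
            self.max_cardinality = max_cardinality
    clt = _Clustering([_Cluster({0, 1, 2, 3}), _Cluster({4, 5, 6, 7})], 8)
    # 1 observed change, 4 expected: score = (1 - 4) / (8 - 4) = -0.75
    assert spatial_coherence(clt) == (-0.75, 1)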
####################################################### f1 #######################################################
# two helpers for computing the precision and the recall between two sets
def set_convert(ls):
    """Coerce any iterable to a set."""
    if not isinstance(ls, set):
        ls = set(ls)
    return ls
def precision_sets(set1, set2):
    """Precision of set1 with respect to set2: |set1 & set2| / |set1|."""
    set1, set2 = set_convert(set1), set_convert(set2)
    if not set1:
        return 0.0
    return float(len(set1.intersection(set2))) / len(set1)
def recall_sets(set1, set2):
    """Recall of set1 with respect to set2 (= precision of set2 w.r.t. set1)."""
    return precision_sets(set2, set1)
# F-beta score
def f1_score(set1, set2, beta=1):
    """F-beta score of two sets. If beta > 1, completeness (recall) is
    weighted more than homogeneity (precision)."""
    precision = precision_sets(set1, set2)
    recall = recall_sets(set1, set2)
    if precision + recall == 0:
        return 0.0
    # standard F-beta: beta enters squared; for beta = 1 this is the usual F1
    return (1 + beta**2) * (precision * recall) / (beta**2 * precision + recall)
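# Worked example with plain sets: precision = 2/3, recall = 2/3, so their
# harmonic mean (F1) is 2/3 as well.
def _example_f1_score():
    assert abs(precision_sets({1, 2, 3}, {2, 3, 4}) - 2.0 / 3) < 1e-12
    assert abs(recall_sets({1, 2, 3}, {2, 3, 4}) - 2.0 / 3) < 1e-12
    assert abs(f1_score({1, 2, 3}, {2, 3, 4}) - 2.0 / 3) < 1e-12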
####################################################### F1-Score #####################################################
def f1_hidden_cluster_map(ref_clustering, target_clustering, beta=1):
    """Map each found cluster to the hidden cluster(s) it covers best, i.e.
    the hidden clusters maximizing precision_sets(hidden, found), and collect
    the found clusters' objects per hidden cluster."""
    mapped_hidden = {}
    hidden_clusters = ref_clustering.clusters
    for clust in target_clustering.clusters:
        # best coverage of any hidden cluster by this found cluster
        mapped_precision = max((precision_sets(hclust.objects, clust.objects)
                                for hclust in hidden_clusters), default=-1)
        if mapped_precision > 0:
            # assign this found cluster's objects to every best-covered hidden cluster
            for hclust in hidden_clusters:
                if mapped_precision == precision_sets(hclust.objects, clust.objects):
                    mapped_hidden.setdefault(hclust, set()).update(clust.objects)
    return mapped_hidden
def f1_score_clustering(ref_clustering, target_clustering, beta=1):
    """Average F1 score over the hidden (reference) clusters, each compared
    against the union of the found clusters mapped to it."""
    if not ref_clustering.clusters or not target_clustering.clusters:
        return 0.0
    nbr_of_hidden = len(ref_clustering.clusters)  # number of hidden clusters
    mapped_clusters = f1_hidden_cluster_map(ref_clustering, target_clustering, beta)
    total_f1 = sum(f1_score(hidden_cluster.objects, mapped_objects)
                   for hidden_cluster, mapped_objects in mapped_clusters.items())
    return total_f1 / float(nbr_of_hidden)
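# Sketch of the mapping behaviour (hypothetical stand-ins): the two found
# clusters {4, 5} and {6, 7} both map to hidden cluster B, so their objects
# are merged before the per-hidden-cluster F1 scores are averaged.
def _example_f1_score_clustering():
    from collections import namedtuple
    _Cluster = namedtuple('_Cluster', ['objects'])
    _Clustering = namedtuple('_Clustering', ['clusters'])
    ref = _Clustering([_Cluster(frozenset({0, 1, 2, 3})),   # hidden A
                       _Cluster(frozenset({4, 5, 6, 7}))])  # hidden B
    found = _Clustering([_Cluster(frozenset({0, 1, 2})),
                         _Cluster(frozenset({4, 5})),
                         _Cluster(frozenset({6, 7}))])
    # (f1(A, {0,1,2}) + f1(B, {4,5,6,7})) / 2 = (6/7 + 1) / 2
    assert abs(f1_score_clustering(ref, found) - (6.0 / 7 + 1) / 2) < 1e-12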
####################################################### Entropy #######################################################
def entropy_score_cluster(ref_clustering, target_cluster):
    """Entropy of a found cluster over the hidden clusters it intersects."""
    etp = 0.0
    for clust in ref_clustering.clusters:  # for each cluster of the hidden clustering
        # the precision can be read as the shared fraction / probability
        proba = precision_sets(target_cluster.objects, clust.objects)
        if proba != 0:
            etp += -proba * math.log(proba)
    return etp
def entropy_score_clustering(ref_clustering, target_clustering):
    """Entropy-based measure of a clustering. Entropy gauges homogeneity: a
    found cluster should contain objects from mainly one reference cluster,
    so a split across reference clusters is deemed bad quality. Returns
    1 - the weighted, normalized entropy, so higher is better."""
    if not ref_clustering.clusters or not target_clustering.clusters:
        return 0.0
    m = len(ref_clustering.clusters)
    entropy_max = math.log(m)
    entropy_max_sum = entropy_max * sum(len(p.objects) for p in target_clustering.clusters)
    if entropy_max_sum == 0:
        return 1.0
    entropy_weighted_sum = sum(len(p.objects) * entropy_score_cluster(ref_clustering, p)
                               for p in target_clustering.clusters)
    return 1.0 - float(entropy_weighted_sum) / entropy_max_sum
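# Sanity check (hypothetical stand-ins): a target identical to the reference
# has zero entropy in every cluster, so the score is the maximal 1.0.
def _example_entropy_score_clustering():
    from collections import namedtuple
    _Cluster = namedtuple('_Cluster', ['objects'])
    _Clustering = namedtuple('_Clustering', ['clusters'])
    ref = _Clustering([_Cluster({0, 1}), _Cluster({2, 3})])
    assert entropy_score_clustering(ref, ref) == 1.0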
####################################################### RNIA ######################################################
def rnia_clustering_coordinates(clustering):
    """Return the set of (row, column) pairs, i.e. the micro-objects, covered
    by a clustering."""
    rs = set()
    for cluster in clustering.clusters:
        for row in cluster.objects:
            for col in cluster.dimensions:
                rs.add((row, col))
    return rs
def rnia_score(ref_clustering, target_clustering):
    """Relative non-intersecting area: |intersection| / |union| of the two
    sets of micro-objects (1 means identical coverage)."""
    ref_coord = rnia_clustering_coordinates(ref_clustering)
    target_coord = rnia_clustering_coordinates(target_clustering)
    union = float(len(ref_coord.union(target_coord)))
    if union == 0:
        return 0.0
    intersection = len(ref_coord.intersection(target_coord))
    return 1 - (union - intersection) / union
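# Worked example (hypothetical stand-ins): the reference covers the 2x2 block
# of micro-objects {(0,0), (0,1), (1,0), (1,1)}, the target only its first
# column, so the intersection is 2, the union is 4, and RNIA = 0.5.
def _example_rnia_score():
    from collections import namedtuple
    _Cluster = namedtuple('_Cluster', ['objects', 'dimensions'])
    _Clustering = namedtuple('_Clustering', ['clusters'])
    ref = _Clustering([_Cluster(objects=[0, 1], dimensions=[0, 1])])
    target = _Clustering([_Cluster(objects=[0, 1], dimensions=[0])])
    assert rnia_score(ref, target) == 0.5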
####################################################### CE #######################################################
def cluster_intersection(clustA, clustB):
    """Number of micro-objects (row, column pairs) shared by two subspace clusters."""
    rsA = set((row, col) for row in clustA.objects for col in clustA.dimensions)
    rsB = set((row, col) for row in clustB.objects for col in clustB.dimensions)
    return len(rsA.intersection(rsB))
def make_ce_matrix(ref_clustering, target_clustering):
    """Matrix of pairwise micro-object intersections between reference and
    target clusters (rows: reference, columns: target)."""
    ref_clusters = ref_clustering.clusters
    target_clusters = target_clustering.clusters
    nbr_row = len(ref_clusters)
    nbr_col = len(target_clusters)
    mat = np.zeros([nbr_row, nbr_col], dtype=int)
    for i in range(nbr_row):
        for j in range(nbr_col):
            mat[i, j] = cluster_intersection(ref_clusters[i], target_clusters[j])
    return mat
def clustering_error(ref_clustering, target_clustering):
    """Clustering error (CE): 1 - (U - D_max) / U, where U is the number of
    micro-objects in the union of both clusterings and D_max is the total
    intersection under the best one-to-one matching of reference and target
    clusters, found by the Hungarian solver."""
    mat = make_ce_matrix(ref_clustering, target_clustering)
    ref_coord = rnia_clustering_coordinates(ref_clustering)
    target_coord = rnia_clustering_coordinates(target_clustering)
    U = float(len(ref_coord.union(target_coord)))
    if U == 0:
        return 0.0
    hung_solver = Hungarian()
    rs, D_max = hung_solver.compute(mat, True)
    return 1 - (U - D_max) / U
def ce_score(ref_clustering, target_clustering):
    return clustering_error(ref_clustering, target_clustering)
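# A hand-worked CE example: with one reference cluster covering the 2x2 block
# {(0,0), (0,1), (1,0), (1,1)} and one target cluster covering {(0,0), (1,0)},
# the only one-to-one matching has D_max = 2 while the union has U = 4
# micro-objects, so CE = 1 - (4 - 2) / 4 = 0.5. Unlike RNIA, CE only credits
# overlap through a one-to-one cluster matching, so splitting one reference
# cluster across several found clusters is penalized.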
###########################################################################################################
########################################## RANKING ########################################################
###########################################################################################################
def rank_measure(clusterings, measures, measures_reverse=None):
    """Rank clusterings by each measure. measures maps a measure name to a
    scoring function; measures_reverse says, per measure, whether smaller is
    better. The two dicts must cover the same measure names. Each clustering
    gets the attributes <measure> (its score) and rk_<measure> (its rank,
    where 1 is best)."""
    import scipy.stats as ss
    from collections import defaultdict
    if not measures_reverse:
        measures_reverse = defaultdict(int)
    for measure in measures.keys():
        measure_func, smaller_is_better = measures[measure], measures_reverse[measure]
        data = []
        for clustering in clusterings:
            clustering.__dict__[measure] = measure_func(clustering)
            if smaller_is_better:
                data.append(clustering.__dict__[measure])
            else:
                data.append(-clustering.__dict__[measure])  # reverse the sign so rank 1 is best
        ranking = ss.rankdata(data)
        rk_measure = 'rk_%s' % measure
        for i in range(len(clusterings)):
            clusterings[i].__dict__[rk_measure] = ranking[i]
    return clusterings
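# Usage sketch (hypothetical: any objects with a __dict__ work, and the
# measure function here is a plain lambda rather than one of the measures
# defined above):
def _example_rank_measure():
    class _C(object):
        def __init__(self, quality):
            self.quality = quality
    cs = [_C(0.2), _C(0.9), _C(0.5)]
    rank_measure(cs, {'quality': lambda c: c.quality})
    # larger is better by default, so the 0.9 clustering ranks first
    assert [c.rk_quality for c in cs] == [3.0, 1.0, 2.0]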
############################### EVALUATOR ############################