# ranking.py
import math
from collections import defaultdict, OrderedDict
from threading import Thread, Lock

mutex = Lock()
# first element: cosine + positional; second: strong terms + anchor text; third: PageRank
total_scores = defaultdict(lambda: [0, 0, 0])


# keep only the high-idf (rare) query terms; remove other low-idf terms
def get_high_idf_terms(config, num_documents, unique_query_terms, query_postings):
    high_idf_terms = list()
    threshold = config.threshold_high_idf_terms
    # keep increasing the threshold until at least one high-idf term is found,
    # or stop once the threshold exceeds 1
    while len(high_idf_terms) == 0 and threshold <= 1:
        for term in unique_query_terms:
            if len(query_postings[term]) / num_documents <= threshold:
                high_idf_terms.append(term)
        # increase the threshold
        threshold *= (1 + config.threshold_increase_percent)
    return high_idf_terms
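

# Worked example (hypothetical numbers, not from the original code): with
# 1,000 indexed documents and threshold_high_idf_terms = 0.01, a term whose
# postings list covers 8 documents (8/1000 = 0.008 <= 0.01) is kept as
# high-idf, while a term covering 300 documents (0.3) is dropped as too
# common. If nothing qualifies, the threshold grows by
# threshold_increase_percent per pass until a term qualifies or the
# threshold passes 1.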
# get all docs that contain a majority of the query terms, combining boolean OR and boolean AND
def get_docs_to_consider(config, doc_ids, unique_query_terms, query_postings):
    num_documents = len(doc_ids)
    # keep only the unique, more important (high-idf) terms
    unique_query_terms = get_high_idf_terms(config, num_documents, unique_query_terms, query_postings)
    docs_to_consider = defaultdict(bool)
    docs_has_all_terms = defaultdict(bool)
    # safety net if all terms are too common; this should not happen while the threshold keeps increasing
    if len(unique_query_terms) == 0:
        return unique_query_terms, docs_to_consider, docs_has_all_terms
    # if there is only one term, return all of its doc_ids
    if len(unique_query_terms) == 1:
        docs_to_consider = defaultdict(bool, {i: True for i in query_postings[unique_query_terms[0]].keys()})
        return unique_query_terms, docs_to_consider, docs_has_all_terms
    # determine how many terms a document must contain to count as holding a majority
    min_query_terms_length = math.floor(config.threshold_percent_of_terms_in_docs * len(unique_query_terms))
    # collect every doc_id that contains at least one high-idf term
    all_doc_ids = set()
    for term in unique_query_terms:
        all_doc_ids.update(set(query_postings[term].keys()))
    docs_to_terms = defaultdict(int)
    # count how many of the terms each doc contains
    for term in unique_query_terms:
        for doc_id in all_doc_ids:
            if query_postings[term][doc_id] != False:
                docs_to_terms[doc_id] += 1
    docs_to_terms = OrderedDict(sorted(docs_to_terms.items(), key=lambda kv: doc_ids[kv[0]][2], reverse=True))
    num_doc_ids = 0
    for doc_id, num_terms in docs_to_terms.items():
        if num_doc_ids >= config.max_num_urls_per_query:
            break
        if num_terms >= min_query_terms_length:
            # boolean OR: every doc_id that meets the majority threshold
            docs_to_consider[doc_id] = True
            if num_terms >= len(unique_query_terms):
                # boolean AND: only the doc_ids that contain all the terms
                docs_has_all_terms[doc_id] = True
            num_doc_ids += 1
    return unique_query_terms, docs_to_consider, docs_has_all_terms
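

# Worked example (hypothetical numbers): for a three-term query {a, b, c}
# with threshold_percent_of_terms_in_docs = 0.5, min_query_terms_length =
# floor(0.5 * 3) = 1, so any document containing at least one high-idf term
# enters docs_to_consider (boolean OR), while only documents containing all
# three terms enter docs_has_all_terms (boolean AND).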
############################################################################################################
# get w(term, query)
def get_wtq(total_query_terms):
    wtq = defaultdict(int)
    for term in total_query_terms:
        wtq[term] += 1
    for term, freq in wtq.items():
        if freq != 0:
            wtq[term] = 1 + math.log10(freq)
    normalized_wtq = 0
    for term, value in wtq.items():
        normalized_wtq += value ** 2
    normalized_wtq = math.sqrt(normalized_wtq)
    return wtq, normalized_wtq
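

# Worked example: for the query ["apple", "apple", "pie"], the raw counts are
# {"apple": 2, "pie": 1}; log weighting gives {"apple": 1 + log10(2) ~ 1.301,
# "pie": 1.0}, and the query norm is sqrt(1.301**2 + 1.0**2) ~ 1.641.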
# get all w(term, document)
def get_wtds(high_idf_terms, query_postings, docs_to_consider):
    wtds = defaultdict(bool)
    docs = list(docs_to_consider.keys())
    for doc_id in docs:
        wtds[doc_id] = defaultdict(int)
    for doc_id in docs:
        for term in high_idf_terms:
            if query_postings[term][doc_id] != False:
                wtds[doc_id][term] = query_postings[term][doc_id].get_tf_idf()
            else:
                wtds[doc_id][term] = 0
    return wtds
# calculate cosine scores
def calculate_cosine_scores(config, docs_to_consider, doc_ids, total_query_terms, high_idf_terms, query_postings):
    cosine_scores = defaultdict(int)
    wtq, normalized_wtq = get_wtq(total_query_terms)
    wtds = get_wtds(high_idf_terms, query_postings, docs_to_consider)
    for doc_id, wtd in wtds.items():
        # the document norm is precomputed and stored with the doc_id entry
        normalized_wtd = doc_ids[doc_id][2]
        dot_product = 0
        cosine_value = 0
        for term in high_idf_terms:
            dot_product += wtq[term] * wtd[term]
        if normalized_wtd != 0 and normalized_wtq != 0:
            cosine_value = dot_product / (normalized_wtd * normalized_wtq)
        cosine_scores[doc_id] = cosine_value
    update_total_scores(cosine_scores, 0)
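

# Worked example (continuing the wtq example above, with hypothetical tf-idf
# weights): if a document stores w(t, d) = 2.0 for "apple" and 1.5 for "pie",
# and its precomputed norm doc_ids[doc_id][2] is 2.5, then
# cosine = (1.301 * 2.0 + 1.0 * 1.5) / (2.5 * 1.641) ~ 1.0.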
############################################################################################################
# match position values for boolean AND (phrase proximity)
def get_intersections_from_positions(config, high_idf_terms, query_postings, docs_has_all_terms):
    positional_scores = defaultdict(int)
    docs = list(docs_has_all_terms.keys())
    for doc_id in docs:
        gap = 0
        intersection_positions = set()
        for j in range(len(high_idf_terms)):
            posting = query_postings[high_idf_terms[j]]
            if j == 0:
                intersection_positions = set(posting[doc_id].get_positions())
            else:
                # shift each later term's positions back by its offset in the
                # query, so consecutive terms intersect at the phrase start
                intersection_positions = intersection_positions.intersection(
                    set(p - gap for p in posting[doc_id].get_positions()))
            if len(intersection_positions) == 0:
                break
            gap += 1
        if len(intersection_positions) != 0:
            positional_scores[doc_id] = len(intersection_positions)
    for doc_id in docs:
        positional_scores[doc_id] *= config.default_score_boolean_and_position
        positional_scores[doc_id] += config.default_score_boolean_and
    return positional_scores
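

# Worked example: for the phrase query "machine learning", if "machine"
# occurs at positions {3, 10} and "learning" at {4, 20}, the second list is
# shifted back by gap = 1 to {3, 19}; the intersection {3, 10} & {3, 19} =
# {3} records one consecutive occurrence, so the document earns a positional
# count of 1 before the boolean-AND base score is applied.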
# calculate positional scores
def calculate_positional_scores(config, docs_has_all_terms, high_idf_terms, query_postings):
    if docs_has_all_terms is None:
        return None
    positional_scores = get_intersections_from_positions(config, high_idf_terms, query_postings, docs_has_all_terms)
    update_total_scores(positional_scores, 0)
############################################################################################################
# calculate strong-term scores (title, bold, etc.)
def calculate_strong_terms_scores(config, strong_terms, docs_has_all_terms, high_idf_terms):
    strong_terms_scores = defaultdict(int)
    for term in high_idf_terms:
        if strong_terms[term] != False:
            for doc_id in strong_terms[term]:
                if docs_has_all_terms[doc_id] != False:
                    strong_terms_scores[doc_id] += config.default_score_strong_terms
    update_total_scores(strong_terms_scores, 1)
############################################################################################################
# calculate anchor-text scores
def calculate_anchor_terms_scores(config, anchor_terms, docs_has_all_terms, high_idf_terms):
    anchor_text_scores = defaultdict(int)
    for term in high_idf_terms:
        if anchor_terms[term] != False:
            for doc_id in anchor_terms[term]:
                if docs_has_all_terms[doc_id] != False:
                    anchor_text_scores[doc_id] += config.default_score_anchor_text
    update_total_scores(anchor_text_scores, 1)
############################################################################################################
# calculate PageRank scores
def calculate_page_ranking_scores(config, doc_ids, docs_has_all_terms):
    page_rank_scores = defaultdict(int)
    for doc_id in docs_has_all_terms:
        page_rank_scores[doc_id] = doc_ids[doc_id][3]
    update_total_scores(page_rank_scores, 2)
############################################################################################################
# update total scores for ranking; index selects which score component is updated
def update_total_scores(scores, index):
    global total_scores
    for doc_id, score in scores.items():
        with mutex:
            if index == 0:
                # cosine / positional scores feed only the combined score
                total_scores[doc_id][0] += score
            elif index == 1:
                # strong-term / anchor-text bonuses feed the combined score
                # and are also tracked separately for the top-page re-sort
                total_scores[doc_id][0] += score
                total_scores[doc_id][1] += score
            else:
                # PageRank is tracked separately as a tie-breaker in the sort
                total_scores[doc_id][2] += score
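

# Worked example (hypothetical values): a document with cosine 0.8 (index 0),
# a strong-term bonus of 0.5 (index 1, which also feeds the combined score at
# index 0), and PageRank 0.3 (index 2) ends up with
# total_scores[doc_id] == [1.3, 0.5, 0.3].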
# calculate all the scores
def calculate_scores(config, doc_ids, total_query_terms, strong_terms, anchor_terms, query_postings):
    global total_scores
    unique_query_terms = list(query_postings.keys())
    total_scores = defaultdict(lambda: [0, 0, 0])
    if len(query_postings) == 1:
        # single-term query: score directly by term frequency
        posting = query_postings[unique_query_terms[0]]
        for doc_id in posting.keys():
            total_scores[doc_id][0] = posting[doc_id].get_freq()
        calculate_page_ranking_scores(config, doc_ids, list(posting.keys()))
        calculate_anchor_terms_scores(config, anchor_terms, posting, unique_query_terms)
        calculate_strong_terms_scores(config, strong_terms, posting, unique_query_terms)
    else:
        high_idf_terms, docs_to_consider, docs_has_all_terms = get_docs_to_consider(config, doc_ids, unique_query_terms, query_postings)
        if len(high_idf_terms) == 0:
            high_idf_terms = list(set(unique_query_terms))
        # run the independent scorers concurrently; they synchronize on mutex
        threads = []
        if docs_to_consider is not None:
            threads.append(Thread(target=calculate_cosine_scores, args=(config, docs_to_consider, doc_ids, total_query_terms, high_idf_terms, query_postings)))
        if docs_has_all_terms is not None:
            threads.append(Thread(target=calculate_positional_scores, args=(config, docs_has_all_terms, high_idf_terms, query_postings)))
            threads.append(Thread(target=calculate_anchor_terms_scores, args=(config, anchor_terms, docs_has_all_terms, high_idf_terms)))
            threads.append(Thread(target=calculate_strong_terms_scores, args=(config, strong_terms, docs_has_all_terms, high_idf_terms)))
            threads.append(Thread(target=calculate_page_ranking_scores, args=(config, doc_ids, docs_has_all_terms)))
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()
    # after sorting, the more important documents will be shown first
    sorted_total_scores = OrderedDict(sorted(total_scores.items(), key=lambda kv: (kv[1][0], kv[1][2]), reverse=True))
    # re-sort the first two pages of results, giving extra priority to pages that matched strong terms or anchor text
    if len(sorted_total_scores) > config.max_num_urls_per_page * 2:
        sorted_total_scores_1 = OrderedDict(list(sorted_total_scores.items())[:config.max_num_urls_per_page * 2])
        sorted_total_scores_2 = OrderedDict(list(sorted_total_scores.items())[config.max_num_urls_per_page * 2:])
        # for the top results, weight the combined score by the strong-term/anchor score
        sorted_total_scores_1 = OrderedDict(sorted(sorted_total_scores_1.items(), key=lambda kv: kv[1][1] * kv[1][0], reverse=True))
        # combine the first two pages with the rest
        sorted_total_scores = OrderedDict(list(sorted_total_scores_1.items()) + list(sorted_total_scores_2.items()))
    else:
        # for the top results, weight the combined score by the strong-term/anchor score
        sorted_total_scores = OrderedDict(sorted(total_scores.items(), key=lambda kv: kv[1][1] * kv[1][0], reverse=True))
    return sorted_total_scores
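

# Worked example (hypothetical config): with max_num_urls_per_page = 10, the
# top 20 results are re-ordered by combined_score * strong/anchor score, so
# pages matched in titles, bold text, or anchor text surface first within the
# first two result pages; everything after position 20 keeps the
# (combined score, PageRank) order.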
# main ranking function
def ranking(config, doc_ids, total_query_terms, strong_terms, anchor_terms, query_postings):
    # no information found in the inverted index
    if len(query_postings) == 0:
        return None
    # get scores
    scores = calculate_scores(config, doc_ids, total_query_terms, strong_terms, anchor_terms, query_postings)
    # convert to a ranked list of doc_ids
    return list(scores.keys())
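

# ----------------------------------------------------------------------------
# Minimal smoke-test sketch (not part of the original module). The Posting
# stub and every config attribute value below are hypothetical stand-ins for
# the project's real posting and config objects; they only mimic the interface
# this module relies on: get_tf_idf(), get_positions(), get_freq(),
# doc_ids[doc][2] = precomputed document norm, doc_ids[doc][3] = PageRank.
if __name__ == "__main__":
    from types import SimpleNamespace

    class Posting:
        """Hypothetical posting stub exposing the accessors used above."""
        def __init__(self, tf_idf, positions):
            self._tf_idf = tf_idf
            self._positions = positions

        def get_tf_idf(self):
            return self._tf_idf

        def get_positions(self):
            return self._positions

        def get_freq(self):
            return len(self._positions)

    config = SimpleNamespace(
        threshold_high_idf_terms=0.8,
        threshold_increase_percent=0.1,
        threshold_percent_of_terms_in_docs=0.5,
        max_num_urls_per_query=100,
        max_num_urls_per_page=10,
        default_score_boolean_and=1.0,
        default_score_boolean_and_position=0.5,
        default_score_strong_terms=0.5,
        default_score_anchor_text=0.5,
    )
    # doc_id -> (url, length, document norm, PageRank) -- shape assumed
    doc_ids = {1: ("url1", 0, 1.0, 0.3), 2: ("url2", 0, 1.2, 0.1)}
    # term -> {doc_id: Posting}; defaultdict(bool) matches the module's
    # `!= False` membership checks for missing doc_ids
    query_postings = {
        "machine": defaultdict(bool, {1: Posting(1.5, [3, 10]), 2: Posting(1.1, [7])}),
        "learning": defaultdict(bool, {1: Posting(1.4, [4, 20])}),
    }
    strong_terms = defaultdict(bool, {"machine": [1]})
    anchor_terms = defaultdict(bool)
    # expected: [1], since only doc 1 contains the rarer term "learning"
    print(ranking(config, doc_ids, ["machine", "learning"],
                  strong_terms, anchor_terms, query_postings))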