import math
from collections import defaultdict, Counter
import pre_process as pp
'''
BM25 scoring for VSM-based IR.
Parameters k1, b, and k3 need to be tuned; defaults are
k1 = 1.5, b = 0.5, k3 = 0.
BM25 gives better results than the basic VSM in basic_ir.py.
'''
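
# For reference, the weight computed in bm25() below for term t in sentence d
# is the standard BM25 form:
#   idf(t)      = log((M - df(t) + 0.5) / (df(t) + 0.5))
#   score(t, d) = idf(t) * (k1 + 1) * tf(t, d)
#                 / (k1 * (1 - b + b * len(d) / avg_len) + tf(t, d))
# where M is the number of sentences, df(t) the number of sentences containing
# t, and avg_len the average sentence length. Note that idf(t) goes negative
# for terms appearing in more than half of the sentences.
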
bm25_index = defaultdict(list)


# Collect term frequencies for each sentence in the document
def extract_term_freqs(sen):
    # bag-of-words representation
    tfs = Counter()
    sentence = pp.process_sen(sen, True)
    for token in sentence:
        # lemmatize each word; compare this to the result of stemming
        # with the PorterStemmer:
        #   tfs[stemmer.stem(token.lower())] += 1
        tfs[pp.lemmatize(token.lower())] += 1
    return tfs


# Compute document frequencies for each term
def compute_doc_freqs(doc_term_freqs):
    dfs = Counter()
    for tfs in doc_term_freqs.values():
        for term in tfs.keys():
            dfs[term] += 1
    return dfs
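
# e.g. if doc_term_freqs = {0: Counter({'cat': 2}), 1: Counter({'cat': 1, 'dog': 1})}
# then compute_doc_freqs(doc_term_freqs) == Counter({'cat': 2, 'dog': 1})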


def bm25(doc, k1, b):
    bm25_index.clear()
    # Use the two functions above (extract_term_freqs & compute_doc_freqs)
    # to process the training data into term and document frequencies.
    doc_term_freqs = {}
    sen_id = 0
    sen_len_sum = 0
    for sentence in doc:
        term_freqs = extract_term_freqs(sentence)
        doc_term_freqs[sen_id] = term_freqs
        sen_id += 1
        sen_len_sum += sum(term_freqs.values())
    M = len(doc_term_freqs)
    avg_sen_length = float(sen_len_sum) / M
    doc_freqs = compute_doc_freqs(doc_term_freqs)
    # create the BM25 VSM
    for senid, term_freqs in doc_term_freqs.items():
        N = sum(term_freqs.values())
        term_weighting_values = []
        for term, count in term_freqs.items():
            # 1. find the idf value
            idf = math.log((M - float(doc_freqs[term]) + 0.5) / (float(doc_freqs[term]) + 0.5))
            # 2. find the tf and sentence-length normalisation
            K = k1 * (1 - b + b * (float(N) / avg_sen_length)) + count
            tf_doc_len = float(k1 + 1) * count / K
            score = idf * tf_doc_len
            term_weighting_values.append((term, score))
        for term, score in term_weighting_values:
            # note the inversion of the indexing, to be term -> (doc_id, score)
            bm25_index[term].append([senid, score])
    # ensure posting lists are in sorted order
    for postings in bm25_index.values():
        postings.sort()
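
# After bm25() runs, the module-level bm25_index maps each term to its
# postings, e.g. (hypothetical values):
#   bm25_index['cat'] -> [[0, 1.73], [4, 0.91]]
# i.e. term -> sorted list of [sentence_id, bm25_weight] pairs.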


# Query the BM25 index; k is only used by the alternative
# most_common return at the bottom.
def query_bm25(query, k3, k=1):
    # process the query into term frequencies
    qfs = Counter()
    for token in query:
        qfs[pp.lemmatize(token.lower())] += 1
    accumulator = Counter()
    # iterate over unique terms so repeated query words are not double-counted
    for term, qf in qfs.items():
        # 3. find the query tf value
        query_tf = float((k3 + 1) * qf) / (k3 + qf)
        # .get avoids inserting empty postings for unseen terms
        # into the defaultdict
        for senid, weight in bm25_index.get(term, []):
            accumulator[senid] += weight * query_tf
    accumulator_list = sorted(accumulator.items(), key=lambda item: item[1], reverse=True)
    return accumulator_list
    # return accumulator.most_common(k)
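

# Minimal usage sketch (not part of the original module). It assumes
# pre_process.process_sen accepts a raw sentence string and
# pre_process.lemmatize accepts a single token; the sample sentences
# are hypothetical.
if __name__ == '__main__':
    doc = [
        'The cat sat on the mat.',
        'A dog chased the cat.',
        'Dogs and cats can be good pets.',
    ]
    bm25(doc, k1=1.5, b=0.5)                    # build the index with the default parameters
    results = query_bm25(['cat', 'dog'], k3=0)  # the query is a list of tokens
    print(results)                              # [(sentence_id, score), ...], best match first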