-
Notifications
You must be signed in to change notification settings - Fork 0
/
fbtrie.py
executable file
·224 lines (177 loc) · 6.66 KB
/
fbtrie.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
#!/usr/bin/env python3
"""
Implementation of FB-Trie data structure
for fast levenshtein dictionary search.
Trie represents classic Trie. FB-Trie consists of two tries. F stands for
Forward trie and is the same thing as 'Trie'. B means Backward trie, because it
is a trie which just stores strings in reverse (eg. F trie would contain 'abc',
while B trie would have 'cba').
The search procedure consists of two phases - first in F trie and then the same
thing in B trie. Suppose you have limit of edit distance 'k'. Instead of
dispatching the trie search recursion with limit 'k', you dispatch recursion
with limit 'k / 2' instead. Once you hit a node which represents roughly half
of the search pattern, you increase the limit back to 'k'.
FB-Trie is fast for fuzzy search, because it reduces excessive branching in the
initial levels of classic trie. The pruned branches are then picked up in
second pass using backward trie, which has nodes representing initial levels of
forward trie at the end levels.
"""
class Trie:
def __init__(self):
self.root = {}
def insert(self, word):
word = word + "\0"
node = self.root
for c in word:
if c not in node:
node[c] = {}
node = node[c]
def print(self):
self.print_(self.root, "", "")
def print_(self, node, sofar, before):
num_ch = len(node)
if len(node) > 1:
print(before, sofar, "/", sep="")
before += " " if num_ch > 1 else ""
for c, tree in node.items():
if c == "\0":
print(before, sofar, sep="")
else:
self.print_(tree, sofar + c, before)
def traverse(self, d, node, sofar):
for c, child in node.items():
if c == "\0":
yield sofar, d
else:
yield from self.traverse(d, child, sofar + c)
def fuzzy(self, query, k, k_fn=None, prefix=False):
n = len(query)
tab = [[0] * (n + 1) for _ in range(n + k + 1)]
for j in range(n + 1):
tab[0][j] = j
if not k_fn:
k_fn = lambda i: k
yield from self.fuzzy_(k_fn, tab, self.root, 1, "", query, prefix)
def fuzzy_(self, k_fn, tab, node, i, sofar, query, prefix=False):
k = k_fn(i)
if prefix and i >= len(query) + 1:
d = tab[i - 1][len(query)]
if d <= k:
# yield sofar, d
yield from self.traverse(d, node, sofar)
return
# Can't be more than length of query + k insertions. Don't allow
# children
if i >= len(tab):
return
for c, child in node.items():
d = tab[i - 1][len(query)]
if c == "\0":
if d <= k:
yield sofar, d
continue
new = sofar + c
tab[i][0] = i
for j in range(1, len(tab[i])):
sub_cost = 1 if c != query[j - 1] else 0
sub = tab[i - 1][j - 1] + sub_cost
insert = tab[i - 1][j] + 1
delete = tab[i][j - 1] + 1
tab[i][j] = min(sub, insert, delete)
smallest = min(tab[i])
if smallest <= k:
yield from self.fuzzy_(k_fn, tab, child,
i + 1, new, query, prefix)
####################
class FBTrie:
def __init__(self):
self.f = Trie()
self.b = Trie()
def insert(self, word):
self.f.insert(word)
self.b.insert(word[::-1])
def print(self):
self.f.print()
self.b.print()
def fuzzy(self, query, k):
n = len(query)
q1, q2 = query[:n // 2], query[n // 2:]
# Boytsov 2011:
# Note that there are ⌈k/2⌉ queries of the first type
# and ⌊k/2⌋+ 1 queries of the second type
#
# NOTE: why that many times?? only the last one seems to suffice
# k1 = (k - 1) // 2 + 1
k1 = (k - 1) // 2
print("F", k1)
yield from self.trie_fuzzy(self.f, k1, k, q1, query)
# k2 = k // 2 + 1
k2 = k // 2
print("B", k2)
yield from ((word[::-1], d) for word, d
in self.trie_fuzzy(self.b, k2, k, q2[::-1], query[::-1]))
def trie_fuzzy(self, trie, subk, k, subq, query):
n = len(query)
tab = [[0] * (n + 1) for _ in range(n + k + 1)]
for j in range(n + 1):
tab[0][j] = j
yield from self.trie_fuzzy_(subk, k, tab, trie.root, 1,
"", subq, query, 1)
def trie_fuzzy_(self, subk, k, tab, node, i, sofar, subq, query, phase):
if i >= len(tab):
return
for c, child in sorted(node.items(), key=lambda x: x[0]):
d = tab[i - 1][len(query)]
if c == "\0":
if d <= k:
yield sofar, d
continue
new = sofar + c
tab[i][0] = i
for j in range(1, len(tab[i])):
sub_cost = 1 if c != query[j - 1] else 0
sub = tab[i - 1][j - 1] + sub_cost
insert = tab[i - 1][j] + 1
delete = tab[i][j - 1] + 1
tab[i][j] = min(sub, insert, delete)
if phase == 1 and tab[i][len(subq)] <= subk:
yield from self.trie_fuzzy_(k, k, tab, child, i + 1,
new, subq, query, 2)
continue
smallest = min(tab[i])
if smallest <= subk:
yield from self.trie_fuzzy_(subk, k, tab, child, i + 1,
new, subq, query, phase)
if __name__ == "__main__":
import sys
import time
if len(sys.argv) > 4:
usage = "python trie.py query K [trie|fbtrie]"
print(usage, file=sys.stderr)
sys.exit(1)
query = sys.argv[1]
k = int(sys.argv[2])
typ = "trie"
if len(sys.argv) == 4:
typ = sys.argv[3].lower()
if typ == "trie":
t = Trie()
elif typ == "fbtrie":
t = FBTrie()
else:
print("Unknown trie type. Using default 'trie'", file=sys.stderr)
t = Trie()
print("Reading from stdin...", file=sys.stderr)
for line in sys.stdin:
line = line.strip()
t.insert(line)
print("Processing query", query, k, file=sys.stderr)
start = time.time()
result = sorted(set((d, word) for word, d in t.fuzzy(query, k)))
end = time.time()
for word, d in result:
print(d, word)
print("RESULT: {}~{}: [{} found] in {:.4f}ms using [{}]".format(
query, k, len(result), (end - start) * 1000, t.__class__.__name__),
file=sys.stderr
)