forked from usc-isi-i2/hybrid-jaccard
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathjaro.py
executable file
·325 lines (255 loc) · 12.1 KB
/
jaro.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
#! /usr/bin/env python
# coding: utf8
from __future__ import division
import os
import sys
from typo_tables import adjwt
def fn_jaro(len1, len2, num_matches, half_transposes, typo_score, typo_scale):
"""Calculate the classic Jaro metric between two strings.
The strings have lengths 'len1' and 'len2'. 'num_matches' and
'half_transposes' are the output of the count_matches() and
count_half_tranpositions() functions.
'typo_score' is an optional argument, produced by the count_typos()
function if you decide to check the strings for typos with a given
typographical mapping. The typo score will be scaled by 'typo_scale'
before being used."""
if not len1:
if not len2: return 1.0
return 0.0
if not num_matches: return 0.0
similar = (typo_score / typo_scale) + num_matches
weight = ( similar / len1
+ similar / len2
+ (num_matches - half_transposes//2) / num_matches)
return weight / 3
def fn_winkler(weight_jaro, pre_matches, pre_scale):
"""
Scale the standard Jaro metric by 'pre_scale' units per 'pre_matches'.
Note the warning in the docstring of jaro_winkler() regarding the scale.
"""
weight_jaro += pre_matches * pre_scale * (1.0 - weight_jaro)
assert weight_jaro <= 1.0
return weight_jaro
def fn_longer(weight, len1, len2, num_matches, pre_matches):
num = num_matches - pre_matches - 1
den = len1 + len2 - 2*pre_matches + 2
num = (1.0 - weight) * num
return weight + (num / den)
def count_matches(s1, s2, len1, len2):
"""
For every character in string s1, count the number of characters in
string s2 which match, within a given range.
len1 and len2 are the pre-calculated lengths of string s1 and s2
respectively. s1 must be the shorter string.
The function returns the count of matched characters, and two bit arrays
showing which characters in each string this function managed to match
up."""
# If you want to know which characters matched where, un-comment the lines
# involving the 'where_matched' variable below.
assert len1 and len1 <= len2
search_range = max(len2//2-1, 0)
num_matches = 0
# Bit arrays to mark which chars in the strings have already matched
flags1 = [0]*len1
flags2 = [0]*len2
# where_matched = [-1]*len1
# Looking only within the search range, count and flag the matched pairs.
for i, char in enumerate(s1):
lolim = max(i - search_range, 0)
hilim = min(i + search_range, len2 - 1)
for j in range(lolim, hilim + 1):
if not flags2[j] and char == s2[j]:
flags1[i] = flags2[j] = 1
# where_matched[i] = j
num_matches += 1
break
return num_matches, flags1, flags2#, where_matched
def count_half_transpositions(s1, s2, flags1, flags2):
"""Count the number of half transpositions between two strings.
A half transposition, roughly defined, is two characters, taken from the
sequence of paired matched chars, which are unequal to each other.
The arguments 'flags1' and 'flags2' are the bit arrays produced by the
count_matches() function."""
half_transposes = 0
k = 0
for i, flag in enumerate(flags1):
if not flag: continue
# Iterate through every matched char in the first string.
# For every char in the first string, we look for the next matched
# char in second string (starting from where we left off last, k).
while not flags2[k]: k += 1
# Once we've found two matched chars, we compare them. If they're not
# equal, it counts as a transposition.
if s1[i] != s2[k]:
half_transposes += 1
k += 1
return half_transposes
def count_typos(s1, s2, flags1, flags2, typo_table):
"""
Check unmatched characters in strings 's1' and 's2' for typos.
The typos (or known phonetic or character recognition errors) are
defined in the 'typo_table' argument, which defines the mapping between
similar characters, and a score to assign to each. [See the
typo_tables.py module].
The function returns the total typo score, and updates the 'field2' bit
array to mark which characters were adjudged similar.
"""
assert 0 in flags1
# Only call this routine if the smallest string still has some unmatched
# characters.
typo_score = 0
for i, flag1 in enumerate(flags1):
if flag1: continue # Iterate through unmatched chars
row = s1[i]
if row not in typo_table:
# If we don't have a similarity mapping for the char, continue
continue
typo_row = typo_table[row]
# Now look through all the unmatched chars in the second string to
# see if we can find a char similar to the first.
for j, flag2 in enumerate(flags2):
if flag2: continue
col = s2[j]
if col not in typo_row: continue
# print 'Similarity!', row, col
typo_score += typo_row[col]
flags2[j] = 2
break
return typo_score, flags2
def string_metrics(s1, s2, typo_table=None, typo_scale=1, boost_threshold=None,
pre_len=0, pre_scale=0, longer_prob=False):
"""
Calculate the string parameters and flags required by the various Jaro Winkler routines.
For more detail of what the various arguments to this function mean and
do, see the metric_custom() function.
"""
# Defaults are chosen to do least work necessary to get the valuesfor the
# Jaro metric.
assert isinstance(s1, (str, unicode))
assert isinstance(s2, (str, unicode))
assert typo_scale > 0
assert boost_threshold is None or boost_threshold > 0
assert pre_len >= 0
assert 1 >= pre_scale >= 0
len1 = len(s1)
len2 = len(s2)
if len2 < len1:
s1, s2 = s2, s1
len1, len2 = len2, len1
assert len1 <= len2
if not (len1 and len2): return len1, len2, 0, 0, 0, 0, False
num_matches, flags1, flags2 = count_matches(s1, s2, len1, len2)
# If no characters in common - return
if not num_matches: return len1, len2, 0, 0, 0, 0, False
half_transposes = count_half_transpositions(s1, s2, flags1, flags2)
# adjust for similarities in non-matched characters
typo_score = 0
if typo_table and len1 > num_matches:
typo_score, flags2 = count_typos(s1, s2, flags1, flags2, typo_table)
if not boost_threshold:
return len1, len2, num_matches, half_transposes, typo_score, 0, 0
pre_matches = 0
adjust_long = False
weight_typo = fn_jaro(len1, len2, num_matches, half_transposes,
typo_score, typo_scale)
# Continue to boost the weight if the strings are similar
if weight_typo > boost_threshold:
# Adjust for having up to first 'pre_len' chars (not digits) in common
limit = min(len1, pre_len)
while pre_matches < limit:
char1 = s1[pre_matches]
if not( char1.isalpha() and char1 == s2[pre_matches] ):
break
pre_matches += 1
# Optionally adjust for long strings.
# After agreeing beginning chars, at least two more must agree and the
# agreeing characters must be more than half of remaining characters.
if longer_prob:
cond = len1 > pre_len
cond = cond and num_matches > pre_matches + 1
cond = cond and 2 * num_matches >= len1 + pre_matches
cond = cond and s1[0].isalpha()
if cond:
adjust_long = True
return (len1, len2, num_matches, half_transposes,
typo_score, pre_matches, adjust_long)
def metric_jaro(string1, string2):
"The standard, basic Jaro string metric."
ans = string_metrics(string1, string2)
len1, len2, num_matches, half_transposes = ans[:4]
assert ans[4:] == (0, 0, False)
return fn_jaro(len1, len2, num_matches, half_transposes, 0, 1)
def metric_jaro_winkler(string1, string2):
"""The Jaro metric adjusted with Winkler's modification, which boosts
the metric for strings whose prefixes match."""
pre_scale = 0.1
ans = string_metrics(string1, string2,
boost_threshold=0.7, pre_len=4,
# typo_table=adjwt, typo_scale=type_scale,
pre_scale=pre_scale, longer_prob=False)
(len1, len2, num_matches, half_transposes,
typo_score, pre_matches, adjust_long) = ans
assert typo_score == int(adjust_long) == 0
weight_jaro = fn_jaro(len1, len2, num_matches, half_transposes, 0, 1)
return fn_winkler(weight_jaro, pre_matches, pre_scale)
def metric_original(string1, string2):
"""The same metric that would be returned from the reference Jaro-Winkler
C code, taking as it does into account a typo table and adjustments for
longer strings.
This function uses the original table from the reference C code
('adjwt'), which contained only ASCII capital letters and numbers. If
you want to adjust for lower case letters and different character sets,
you need to define your own table. See the typo_tables.py module for
more detail."""
pre_scale = 0.1
typo_scale = 10
ans = string_metrics(string1, string2,
boost_threshold=0.7, pre_len=4,
typo_table=adjwt, typo_scale=typo_scale,
pre_scale=pre_scale, longer_prob=True)
(len1, len2, num_matches, half_transposes,
typo_score, pre_matches, adjust_long) = ans
weight_typo = fn_jaro(len1, len2, num_matches, half_transposes,
typo_score, typo_scale)
weight_longer = fn_winkler(weight_typo, pre_matches, pre_scale)
if adjust_long:
weight_longer = fn_longer(weight_longer, len1, len2,
num_matches, pre_matches)
return weight_longer
def metric_custom(string1, string2, typo_table, typo_scale,
boost_threshold, pre_len, pre_scale, longer_prob):
"""
Calculate the Jaro-Winkler metric with parameters of your own choosing.
If you'd like to check your strings for typos, pass in a typo_table
arg. Any similar, but unmatched, chars from your strings in this table
will be used to proportionally (divided by typo_scale) increase the
count of matched chars between your strings.
If 'pre_len' is non-zero, the metric calculated will be boosted when the
first alpha chars, out of the first 'pre_len' chars of the strings,
match. The metric is adjusted up by 'pre_scale' for every matching char
[see fn_winkler()].
Take care that 'pre_scale' is no larger than 1 / 'pre_len', otherwise
the distance can become larger than 1.
The 'longer' flag tells the functions to make a further adjustment to
the distance if the strings have a longer prefix in common."""
ans = string_metrics(string1, string2,
typo_table, typo_scale,
boost_threshold, pre_len,
pre_scale, longer_prob)
(len1, len2, num_matches, half_transposes,
typo_score, pre_matches, adjust_long) = ans
weight_typo = fn_jaro(len1, len2, num_matches, half_transposes,
typo_score, typo_scale)
weight_winkler_typo = fn_winkler(weight_typo, pre_matches, pre_scale)
weight_longer = weight_winkler_typo
if adjust_long:
weight_longer = fn_longer(weight_longer, len1, len2,
num_matches, pre_matches)
return weight_longer
if __name__ == '__main__':
# print metric_custom('abc', 'cba', adjwt, 10, 0.7, 4, 0.1, True)
if len(sys.argv) < 3:
sys.exit()
s1, s2 = [s.decode('utf8') for s in sys.argv[1:3]]
print('Jaro: %7.5f, Jaro-Winkler: %7.5f, Original: %7.5f.' % (
metric_jaro(s1, s2), metric_jaro_winkler(s1, s2), metric_original(s1, s2)))