Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

gh-122288: Improve performances of fnmatch.translate #122289

Merged
merged 18 commits into from
Nov 27, 2024
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 36 additions & 46 deletions Lib/fnmatch.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,23 +77,32 @@ def translate(pat):
There is no way to quote meta-characters.
"""

STAR = object()
parts = _translate(pat, STAR, '.')
return _join_translated_parts(parts, STAR)
parts, indices = _translate(pat, '*', '.')
return _join_translated_parts(parts, indices)
picnixz marked this conversation as resolved.
Show resolved Hide resolved

_re_setops_sub = re.compile(r'([&~|])').sub
_re_escape = functools.lru_cache(maxsize=32768)(re.escape)
picnixz marked this conversation as resolved.
Show resolved Hide resolved

def _translate(pat, STAR, QUESTION_MARK):
picnixz marked this conversation as resolved.
Show resolved Hide resolved
res = []
add = res.append
indices = []
picnixz marked this conversation as resolved.
Show resolved Hide resolved

i, n = 0, len(pat)
while i < n:
c = pat[i]
i = i+1
if c == '*':
# store the position of the wildcard
picnixz marked this conversation as resolved.
Show resolved Hide resolved
indices.append(len(res))
picnixz marked this conversation as resolved.
Show resolved Hide resolved
add(STAR)
picnixz marked this conversation as resolved.
Show resolved Hide resolved
# compress consecutive `*` into one
if (not res) or res[-1] is not STAR:
add(STAR)
while i < n and pat[i] == '*':
i += 1
elif c == '?':
# Handling '?' one at a time seems to be more efficient
# even if there are consecutive '?' that could have
# been written directly.
picnixz marked this conversation as resolved.
Show resolved Hide resolved
add(QUESTION_MARK)
picnixz marked this conversation as resolved.
Show resolved Hide resolved
elif c == '[':
j = i
Expand Down Expand Up @@ -133,8 +142,6 @@ def _translate(pat, STAR, QUESTION_MARK):
# Hyphens that create ranges shouldn't be escaped.
stuff = '-'.join(s.replace('\\', r'\\').replace('-', r'\-')
for s in chunks)
# Escape set operations (&&, ~~ and ||).
stuff = re.sub(r'([&~|])', r'\\\1', stuff)
i = j+1
if not stuff:
# Empty range: never match.
Expand All @@ -143,50 +150,33 @@ def _translate(pat, STAR, QUESTION_MARK):
# Negated empty range: match any character.
add('.')
else:
# Escape set operations (&&, ~~ and ||).
stuff = _re_setops_sub(r'\\\1', stuff)
if stuff[0] == '!':
stuff = '^' + stuff[1:]
elif stuff[0] in ('^', '['):
stuff = '\\' + stuff
add(f'[{stuff}]')
else:
add(re.escape(c))
assert i == n
return res


def _join_translated_parts(inp, STAR):
# Deal with STARs.
res = []
add = res.append
i, n = 0, len(inp)
# Fixed pieces at the start?
while i < n and inp[i] is not STAR:
add(inp[i])
i += 1
# Now deal with STAR fixed STAR fixed ...
# For an interior `STAR fixed` pairing, we want to do a minimal
# .*? match followed by `fixed`, with no possibility of backtracking.
# Atomic groups ("(?>...)") allow us to spell that directly.
# Note: people rely on the undocumented ability to join multiple
# translate() results together via "|" to build large regexps matching
# "one of many" shell patterns.
picnixz marked this conversation as resolved.
Show resolved Hide resolved
while i < n:
assert inp[i] is STAR
i += 1
if i == n:
add(".*")
break
assert inp[i] is not STAR
fixed = []
while i < n and inp[i] is not STAR:
fixed.append(inp[i])
i += 1
fixed = "".join(fixed)
if i == n:
add(".*")
add(fixed)
else:
add(f"(?>.*?{fixed})")
add(_re_escape(c))
assert i == n
res = "".join(res)
return res, indices


def _join_translated_parts(parts, indices):
if not indices:
return fr'(?s:{"".join(parts)})\Z'
iter_indices = iter(indices)
i, j = 0, next(iter_indices)
buffer = parts[i:j]
picnixz marked this conversation as resolved.
Show resolved Hide resolved
append, extend = buffer.append, buffer.extend
i = j + 1
for j in iter_indices:
append('(?>.*?')
extend(parts[i:j])
append(')')
i = j + 1
append('.*')
extend(parts[i:])
res = ''.join(buffer)
return fr'(?s:{res})\Z'
2 changes: 1 addition & 1 deletion Lib/glob.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,7 @@ def translate(pat, *, recursive=False, include_hidden=False, seps=None):
if part:
if not include_hidden and part[0] in '*?':
results.append(r'(?!\.)')
results.extend(fnmatch._translate(part, f'{not_sep}*', not_sep))
results.extend(fnmatch._translate(part, f'{not_sep}*', not_sep)[0])
if idx < last_part_idx:
results.append(any_sep)
res = ''.join(results)
Expand Down
66 changes: 66 additions & 0 deletions Lib/test/test_fnmatch.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,72 @@ def test_translate(self):
self.assertTrue(re.match(fatre, 'cbabcaxc'))
self.assertFalse(re.match(fatre, 'dabccbad'))

def test_translate_wildcards(self):
    """Check the regex that translate() produces for `*` wildcards."""
    # Patterns that begin with a literal prefix.
    literal_first = (
        ('ab*', r'(?s:ab.*)\Z'),
        ('ab*cd', r'(?s:ab.*cd)\Z'),
        ('ab*cd*', r'(?s:ab(?>.*?cd).*)\Z'),
        ('ab*cd*12', r'(?s:ab(?>.*?cd).*12)\Z'),
        ('ab*cd*12*', r'(?s:ab(?>.*?cd)(?>.*?12).*)\Z'),
        ('ab*cd*12*34', r'(?s:ab(?>.*?cd)(?>.*?12).*34)\Z'),
        ('ab*cd*12*34*', r'(?s:ab(?>.*?cd)(?>.*?12)(?>.*?34).*)\Z'),
    )
    # Patterns that begin with a wildcard.
    wildcard_first = (
        ('*ab', r'(?s:.*ab)\Z'),
        ('*ab*', r'(?s:(?>.*?ab).*)\Z'),
        ('*ab*cd', r'(?s:(?>.*?ab).*cd)\Z'),
        ('*ab*cd*', r'(?s:(?>.*?ab)(?>.*?cd).*)\Z'),
        ('*ab*cd*12', r'(?s:(?>.*?ab)(?>.*?cd).*12)\Z'),
        ('*ab*cd*12*', r'(?s:(?>.*?ab)(?>.*?cd)(?>.*?12).*)\Z'),
        ('*ab*cd*12*34', r'(?s:(?>.*?ab)(?>.*?cd)(?>.*?12).*34)\Z'),
        ('*ab*cd*12*34*', r'(?s:(?>.*?ab)(?>.*?cd)(?>.*?12)(?>.*?34).*)\Z'),
    )
    for cases in (literal_first, wildcard_first):
        for glob_pattern, regex in cases:
            # The pattern is passed as the failure message.
            self.assertEqual(translate(glob_pattern), regex, glob_pattern)

def test_translate_expressions(self):
    """Check the regex that translate() produces for `[...]` expressions."""
    cases = (
        ('[', r'(?s:\[)\Z'),
        ('[!', r'(?s:\[!)\Z'),
        ('[]', r'(?s:\[\])\Z'),
        ('[abc', r'(?s:\[abc)\Z'),
        ('[!abc', r'(?s:\[!abc)\Z'),
        ('[abc]', r'(?s:[abc])\Z'),
        ('[!abc]', r'(?s:[^abc])\Z'),
        ('[!abc][!def]', r'(?s:[^abc][^def])\Z'),
        # with [[
        ('[[', r'(?s:\[\[)\Z'),
        ('[[a', r'(?s:\[\[a)\Z'),
        ('[[]', r'(?s:[\[])\Z'),
        ('[[]a', r'(?s:[\[]a)\Z'),
        ('[[]]', r'(?s:[\[]\])\Z'),
        ('[[]a]', r'(?s:[\[]a\])\Z'),
        ('[[a]', r'(?s:[\[a])\Z'),
        ('[[a]]', r'(?s:[\[a]\])\Z'),
        ('[[a]b', r'(?s:[\[a]b)\Z'),
        # backslashes
        ('[\\', r'(?s:\[\\)\Z'),
        (r'[\]', r'(?s:[\\])\Z'),
        (r'[\\]', r'(?s:[\\\\])\Z'),
    )
    for glob_pattern, regex in cases:
        # The pattern is passed as the failure message.
        self.assertEqual(translate(glob_pattern), regex, glob_pattern)

def test_indices_locations(self):
    """Check the parts and wildcard indices returned by _translate()."""
    from fnmatch import _translate

    pattern = ''.join(
        ['a^b', '***', '?', '?', '[a-z]', '[1-9]', '*', '++', '[[a']
    )
    parts, indices = _translate(pattern, '*', '.')

    # The '***' run is expected to yield a single '*' part.
    self.assertListEqual(
        parts,
        [
            'a', r'\^', 'b', '*',
            '.', '.', '[a-z]', '[1-9]', '*',
            r'\+', r'\+', r'\[', r'\[', 'a',
        ],
    )
    # Positions of the two '*' entries in the expected parts list.
    self.assertListEqual(indices, [3, 8])
picnixz marked this conversation as resolved.
Show resolved Hide resolved


class FilterTestCase(unittest.TestCase):

def test_filter(self):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Improve the performance of :func:`fnmatch.translate` by a factor of 1.3. Patch
picnixz marked this conversation as resolved.
Show resolved Hide resolved
by Bénédikt Tran.
Loading