Skip to content

Commit

Permalink
pythonGH-72904: Add glob.translate() function (python#106703)
Browse files Browse the repository at this point in the history
Add `glob.translate()` function that converts a pathname with shell wildcards to a regular expression. The regular expression is used by pathlib to implement `match()` and `glob()`.

This function differs from `fnmatch.translate()` in that wildcards do not match path separators by default, and that a `*` pattern segment matches precisely one path segment. When *recursive* is set to true, `**` pattern segments match any number of path segments, and `**` cannot appear outside its own segment.

In pathlib, this change speeds up directory walking (because `_make_child_relpath()` does less work), makes path objects smaller (they don't need a `_lines` slot), and removes the need for some gnarly code.

Co-authored-by: Jason R. Coombs <[email protected]>
Co-authored-by: Adam Turner <[email protected]>
  • Loading branch information
3 people authored Nov 13, 2023
1 parent babb787 commit cf67ebf
Show file tree
Hide file tree
Showing 7 changed files with 229 additions and 106 deletions.
39 changes: 39 additions & 0 deletions Doc/library/glob.rst
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,45 @@ default. For example, consider a directory containing :file:`card.gif` and
>>> glob.glob('.c*')
['.card.gif']


.. function:: translate(pathname, *, recursive=False, include_hidden=False, seps=None)

Convert the given path specification to a regular expression for use with
:func:`re.match`. The path specification can contain shell-style wildcards.

For example:

>>> import glob, re
>>>
>>> regex = glob.translate('**/*.txt', recursive=True, include_hidden=True)
>>> regex
'(?s:(?:.+/)?[^/]*\\.txt)\\Z'
>>> reobj = re.compile(regex)
>>> reobj.match('foo/bar/baz.txt')
<re.Match object; span=(0, 15), match='foo/bar/baz.txt'>

Path separators and segments are meaningful to this function, unlike
:func:`fnmatch.translate`. By default wildcards do not match path
separators, and ``*`` pattern segments match precisely one path segment.

If *recursive* is true, the pattern segment "``**``" will match any number
of path segments. If "``**``" occurs in any position other than a full
pattern segment, :exc:`ValueError` is raised.

If *include_hidden* is true, wildcards can match path segments that start
with a dot (``.``).

A sequence of path separators may be supplied to the *seps* argument. If
not given, :data:`os.sep` and :data:`~os.altsep` (if available) are used.

.. seealso::

:meth:`pathlib.PurePath.match` and :meth:`pathlib.Path.glob` methods,
which call this function to implement pattern matching and globbing.

.. versionadded:: 3.13


.. seealso::

Module :mod:`fnmatch`
Expand Down
7 changes: 7 additions & 0 deletions Doc/whatsnew/3.13.rst
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,13 @@ doctest
:attr:`doctest.TestResults.skipped` attributes.
(Contributed by Victor Stinner in :gh:`108794`.)

glob
----

* Add :func:`glob.translate` function that converts a path specification with
shell-style wildcards to a regular expression.
(Contributed by Barney Gale in :gh:`72904`.)

io
--

Expand Down
11 changes: 9 additions & 2 deletions Lib/fnmatch.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,11 @@ def translate(pat):
"""

STAR = object()
parts = _translate(pat, STAR, '.')
return _join_translated_parts(parts, STAR)


def _translate(pat, STAR, QUESTION_MARK):
res = []
add = res.append
i, n = 0, len(pat)
Expand All @@ -89,7 +94,7 @@ def translate(pat):
if (not res) or res[-1] is not STAR:
add(STAR)
elif c == '?':
add('.')
add(QUESTION_MARK)
elif c == '[':
j = i
if j < n and pat[j] == '!':
Expand Down Expand Up @@ -146,9 +151,11 @@ def translate(pat):
else:
add(re.escape(c))
assert i == n
return res


def _join_translated_parts(inp, STAR):
# Deal with STARs.
inp = res
res = []
add = res.append
i, n = 0, len(inp)
Expand Down
60 changes: 60 additions & 0 deletions Lib/glob.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,3 +249,63 @@ def escape(pathname):


_dir_open_flags = os.O_RDONLY | getattr(os, 'O_DIRECTORY', 0)


def translate(pat, *, recursive=False, include_hidden=False, seps=None):
"""Translate a pathname with shell wildcards to a regular expression.
If `recursive` is true, the pattern segment '**' will match any number of
path segments; if '**' appears outside its own segment, ValueError will be
raised.
If `include_hidden` is true, wildcards can match path segments beginning
with a dot ('.').
If a sequence of separator characters is given to `seps`, they will be
used to split the pattern into segments and match path separators. If not
given, os.path.sep and os.path.altsep (where available) are used.
"""
if not seps:
if os.path.altsep:
seps = (os.path.sep, os.path.altsep)
else:
seps = os.path.sep
escaped_seps = ''.join(map(re.escape, seps))
any_sep = f'[{escaped_seps}]' if len(seps) > 1 else escaped_seps
not_sep = f'[^{escaped_seps}]'
if include_hidden:
one_last_segment = f'{not_sep}+'
one_segment = f'{one_last_segment}{any_sep}'
any_segments = f'(?:.+{any_sep})?'
any_last_segments = '.*'
else:
one_last_segment = f'[^{escaped_seps}.]{not_sep}*'
one_segment = f'{one_last_segment}{any_sep}'
any_segments = f'(?:{one_segment})*'
any_last_segments = f'{any_segments}(?:{one_last_segment})?'

results = []
parts = re.split(any_sep, pat)
last_part_idx = len(parts) - 1
for idx, part in enumerate(parts):
if part == '*':
results.append(one_segment if idx < last_part_idx else one_last_segment)
continue
if recursive:
if part == '**':
if idx < last_part_idx:
if parts[idx + 1] != '**':
results.append(any_segments)
else:
results.append(any_last_segments)
continue
elif '**' in part:
raise ValueError("Invalid pattern: '**' can only be an entire path component")
if part:
if not include_hidden and part[0] in '*?':
results.append(r'(?!\.)')
results.extend(fnmatch._translate(part, f'{not_sep}*', not_sep))
if idx < last_part_idx:
results.append(any_sep)
res = ''.join(results)
return fr'(?s:{res})\Z'
125 changes: 21 additions & 104 deletions Lib/pathlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
"""

import contextlib
import fnmatch
import functools
import glob
import io
import ntpath
import os
Expand Down Expand Up @@ -76,78 +76,16 @@ def _is_case_sensitive(pathmod):
#


# fnmatch.translate() returns a regular expression that includes a prefix and
# a suffix, which enable matching newlines and ensure the end of the string is
# matched, respectively. These features are undesirable for our implementation
# of PurePatch.match(), which represents path separators as newlines and joins
# pattern segments together. As a workaround, we define a slice object that
# can remove the prefix and suffix from any translate() result. See the
# _compile_pattern_lines() function for more details.
_FNMATCH_PREFIX, _FNMATCH_SUFFIX = fnmatch.translate('_').split('_')
_FNMATCH_SLICE = slice(len(_FNMATCH_PREFIX), -len(_FNMATCH_SUFFIX))
_SWAP_SEP_AND_NEWLINE = {
'/': str.maketrans({'/': '\n', '\n': '/'}),
'\\': str.maketrans({'\\': '\n', '\n': '\\'}),
}


@functools.lru_cache(maxsize=256)
def _compile_pattern(pat, case_sensitive):
def _compile_pattern(pat, sep, case_sensitive):
"""Compile given glob pattern to a re.Pattern object (observing case
sensitivity), or None if the pattern should match everything."""
if pat == '*':
return None
sensitivity)."""
flags = re.NOFLAG if case_sensitive else re.IGNORECASE
return re.compile(fnmatch.translate(pat), flags).match


@functools.lru_cache()
def _compile_pattern_lines(pattern_lines, case_sensitive):
"""Compile the given pattern lines to an `re.Pattern` object.
The *pattern_lines* argument is a glob-style pattern (e.g. '**/*.py') with
its path separators and newlines swapped (e.g. '**\n*.py`). By using
newlines to separate path components, and not setting `re.DOTALL`, we
ensure that the `*` wildcard cannot match path separators.
The returned `re.Pattern` object may have its `match()` method called to
match a complete pattern, or `search()` to match from the right. The
argument supplied to these methods must also have its path separators and
newlines swapped.
"""

# Match the start of the path, or just after a path separator
parts = ['^']
for part in pattern_lines.splitlines(keepends=True):
if part == '*\n':
part = r'.+\n'
elif part == '*':
part = r'.+'
elif part == '**\n':
# '**/' component: we use '(?s:.)' rather than '.' so that path
# separators (i.e. newlines) are matched. The trailing '^' ensures
# we terminate after a path separator (i.e. on a new line).
part = r'(?s:.)*^'
elif part == '**':
# '**' component.
part = r'(?s:.)*'
elif '**' in part:
raise ValueError("Invalid pattern: '**' can only be an entire path component")
else:
# Any other component: pass to fnmatch.translate(). We slice off
# the common prefix and suffix added by translate() to ensure that
# re.DOTALL is not set, and the end of the string not matched,
# respectively. With DOTALL not set, '*' wildcards will not match
# path separators, because the '.' characters in the pattern will
# not match newlines.
part = fnmatch.translate(part)[_FNMATCH_SLICE]
parts.append(part)
# Match the end of the path, always.
parts.append(r'\Z')
flags = re.MULTILINE
if not case_sensitive:
flags |= re.IGNORECASE
return re.compile(''.join(parts), flags=flags)
regex = glob.translate(pat, recursive=True, include_hidden=True, seps=sep)
# The string representation of an empty path is a single dot ('.'). Empty
# paths shouldn't match wildcards, so we consume it with an atomic group.
regex = r'(\.\Z)?+' + regex
return re.compile(regex, flags).match


def _select_children(parent_paths, dir_only, follow_symlinks, match):
Expand All @@ -171,7 +109,7 @@ def _select_children(parent_paths, dir_only, follow_symlinks, match):
except OSError:
continue
name = entry.name
if match is None or match(name):
if match(name):
yield parent_path._make_child_relpath(name)


Expand Down Expand Up @@ -297,10 +235,6 @@ class PurePath:
# to implement comparison methods like `__lt__()`.
'_parts_normcase_cached',

# The `_lines_cached` slot stores the string path with path separators
# and newlines swapped. This is used to implement `match()`.
'_lines_cached',

# The `_hash` slot stores the hash of the case-normalized string
# path. It's set when `__hash__()` is called for the first time.
'_hash',
Expand Down Expand Up @@ -475,20 +409,6 @@ def _parts_normcase(self):
self._parts_normcase_cached = self._str_normcase.split(self.pathmod.sep)
return self._parts_normcase_cached

@property
def _lines(self):
# Path with separators and newlines swapped, for pattern matching.
try:
return self._lines_cached
except AttributeError:
path_str = str(self)
if path_str == '.':
self._lines_cached = ''
else:
trans = _SWAP_SEP_AND_NEWLINE[self.pathmod.sep]
self._lines_cached = path_str.translate(trans)
return self._lines_cached

def __eq__(self, other):
if not isinstance(other, PurePath):
return NotImplemented
Expand Down Expand Up @@ -763,13 +683,16 @@ def match(self, path_pattern, *, case_sensitive=None):
path_pattern = self.with_segments(path_pattern)
if case_sensitive is None:
case_sensitive = _is_case_sensitive(self.pathmod)
pattern = _compile_pattern_lines(path_pattern._lines, case_sensitive)
sep = path_pattern.pathmod.sep
pattern_str = str(path_pattern)
if path_pattern.drive or path_pattern.root:
return pattern.match(self._lines) is not None
pass
elif path_pattern._tail:
return pattern.search(self._lines) is not None
pattern_str = f'**{sep}{pattern_str}'
else:
raise ValueError("empty pattern")
match = _compile_pattern(pattern_str, sep, case_sensitive)
return match(str(self)) is not None


# Subclassing os.PathLike makes isinstance() checks slower,
Expand Down Expand Up @@ -1069,26 +992,19 @@ def _scandir(self):
return contextlib.nullcontext(self.iterdir())

def _make_child_relpath(self, name):
sep = self.pathmod.sep
lines_name = name.replace('\n', sep)
lines_str = self._lines
path_str = str(self)
tail = self._tail
if tail:
path_str = f'{path_str}{sep}{name}'
lines_str = f'{lines_str}\n{lines_name}'
path_str = f'{path_str}{self.pathmod.sep}{name}'
elif path_str != '.':
path_str = f'{path_str}{name}'
lines_str = f'{lines_str}{lines_name}'
else:
path_str = name
lines_str = lines_name
path = self.with_segments(path_str)
path._str = path_str
path._drv = self.drive
path._root = self.root
path._tail_cached = tail + [name]
path._lines_cached = lines_str
return path

def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
Expand Down Expand Up @@ -1139,6 +1055,7 @@ def _glob(self, pattern, case_sensitive, follow_symlinks):
# do not perform any filesystem access, which can be much faster!
filter_paths = follow_symlinks is not None and '..' not in pattern_parts
deduplicate_paths = False
sep = self.pathmod.sep
paths = iter([self] if self.is_dir() else [])
part_idx = 0
while part_idx < len(pattern_parts):
Expand All @@ -1159,9 +1076,9 @@ def _glob(self, pattern, case_sensitive, follow_symlinks):
paths = _select_recursive(paths, dir_only, follow_symlinks)

# Filter out paths that don't match pattern.
prefix_len = len(self._make_child_relpath('_')._lines) - 1
match = _compile_pattern_lines(path_pattern._lines, case_sensitive).match
paths = (path for path in paths if match(path._lines[prefix_len:]))
prefix_len = len(str(self._make_child_relpath('_'))) - 1
match = _compile_pattern(str(path_pattern), sep, case_sensitive)
paths = (path for path in paths if match(str(path), prefix_len))
return paths

dir_only = part_idx < len(pattern_parts)
Expand All @@ -1174,7 +1091,7 @@ def _glob(self, pattern, case_sensitive, follow_symlinks):
raise ValueError("Invalid pattern: '**' can only be an entire path component")
else:
dir_only = part_idx < len(pattern_parts)
match = _compile_pattern(part, case_sensitive)
match = _compile_pattern(part, sep, case_sensitive)
paths = _select_children(paths, dir_only, follow_symlinks, match)
return paths

Expand Down
Loading

0 comments on commit cf67ebf

Please sign in to comment.