Skip to content

Commit

Permalink
pythonGH-77609: Support following symlinks in pathlib.Path.glob()
Browse files Browse the repository at this point in the history
Add a keyword-only *follow_symlinks* parameter to `pathlib.Path.glob()` and
`rglob()`, defaulting to false. When set to true, symlinks to directories
are followed as if they were directories.

Previously these methods followed symlinks except when evaluating "`**`"
wildcards; on Windows they returned paths in filesystem casing except when
evaluating non-wildcard tokens. Both these problems are solved here. This
will allow us to address pythonGH-102613 and pythonGH-81079 in future commits.
  • Loading branch information
barneygale committed Mar 12, 2023
1 parent bb396ee commit 59dcdb1
Show file tree
Hide file tree
Showing 4 changed files with 85 additions and 79 deletions.
18 changes: 16 additions & 2 deletions Doc/library/pathlib.rst
Original file line number Diff line number Diff line change
Expand Up @@ -852,7 +852,7 @@ call fails (for example because the path doesn't exist).
.. versionadded:: 3.5


.. method:: Path.glob(pattern)
.. method:: Path.glob(pattern, *, follow_symlinks=False)

Glob the given relative *pattern* in the directory represented by this path,
yielding all matching files (of any kind)::
Expand All @@ -873,6 +873,9 @@ call fails (for example because the path doesn't exist).
PosixPath('setup.py'),
PosixPath('test_pathlib.py')]

By default, :meth:`Path.glob` does not follow symlinks. Set
*follow_symlinks* to true to visit symlinks to directories.

.. note::
Using the "``**``" pattern in large directory trees may consume
an inordinate amount of time.
Expand All @@ -883,6 +886,10 @@ call fails (for example because the path doesn't exist).
Return only directories if *pattern* ends with a pathname component
separator (:data:`~os.sep` or :data:`~os.altsep`).

.. versionchanged:: 3.12
The *follow_symlinks* parameter was added. In previous versions,
symlinks were followed except when expanding "``**``" wildcards.

.. method:: Path.group()

Return the name of the group owning the file. :exc:`KeyError` is raised
Expand Down Expand Up @@ -1268,7 +1275,7 @@ call fails (for example because the path doesn't exist).
.. versionadded:: 3.6
The *strict* argument (pre-3.6 behavior is strict).

.. method:: Path.rglob(pattern)
.. method:: Path.rglob(pattern, *, follow_symlinks=False)

Glob the given relative *pattern* recursively. This is like calling
:func:`Path.glob` with "``**/``" added in front of the *pattern*, where
Expand All @@ -1281,12 +1288,19 @@ call fails (for example because the path doesn't exist).
PosixPath('setup.py'),
PosixPath('test_pathlib.py')]

By default, :meth:`Path.rglob` does not follow symlinks. Set
*follow_symlinks* to true to visit symlinks to directories.

.. audit-event:: pathlib.Path.rglob self,pattern pathlib.Path.rglob

.. versionchanged:: 3.11
Return only directories if *pattern* ends with a pathname component
separator (:data:`~os.sep` or :data:`~os.altsep`).

.. versionchanged:: 3.12
The *follow_symlinks* parameter was added. In previous versions,
symlinks were followed except when expanding "``**``" wildcards.

.. method:: Path.rmdir()

Remove this directory. The directory must be empty.
Expand Down
64 changes: 18 additions & 46 deletions Lib/pathlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,12 +54,6 @@ def _ignore_error(exception):
return (getattr(exception, 'errno', None) in _IGNORED_ERRNOS or
getattr(exception, 'winerror', None) in _IGNORED_WINERRORS)


def _is_wildcard_pattern(pat):
# Whether this pattern needs actual matching using fnmatch, or can
# be looked up directly as a file.
return "*" in pat or "?" in pat or "[" in pat

#
# Globbing helpers
#
Expand All @@ -74,10 +68,8 @@ def _make_selector(pattern_parts, flavour):
cls = _RecursiveWildcardSelector
elif '**' in pat:
raise ValueError("Invalid pattern: '**' can only be an entire path component")
elif _is_wildcard_pattern(pat):
cls = _WildcardSelector
else:
cls = _PreciseSelector
cls = _WildcardSelector
return cls(pat, child_parts, flavour)


Expand All @@ -94,48 +86,28 @@ def __init__(self, child_parts, flavour):
self.successor = _TerminatingSelector()
self.dironly = False

def select_from(self, parent_path):
def select_from(self, parent_path, follow_symlinks):
"""Iterate over all child paths of `parent_path` matched by this
selector. This can contain parent_path itself."""
path_cls = type(parent_path)
is_dir = path_cls.is_dir
exists = path_cls.exists
scandir = path_cls._scandir
normcase = path_cls._flavour.normcase
if not is_dir(parent_path):
return iter([])
return self._select_from(parent_path, is_dir, exists, scandir, normcase)
return self._select_from(parent_path, follow_symlinks, scandir, normcase)


class _TerminatingSelector:

def _select_from(self, parent_path, is_dir, exists, scandir, normcase):
def _select_from(self, parent_path, follow_symlinks, scandir, normcase):
yield parent_path


class _PreciseSelector(_Selector):

def __init__(self, name, child_parts, flavour):
self.name = name
_Selector.__init__(self, child_parts, flavour)

def _select_from(self, parent_path, is_dir, exists, scandir, normcase):
try:
path = parent_path._make_child_relpath(self.name)
if (is_dir if self.dironly else exists)(path):
for p in self.successor._select_from(path, is_dir, exists, scandir, normcase):
yield p
except PermissionError:
return


class _WildcardSelector(_Selector):

def __init__(self, pat, child_parts, flavour):
self.match = re.compile(fnmatch.translate(flavour.normcase(pat))).fullmatch
_Selector.__init__(self, child_parts, flavour)

def _select_from(self, parent_path, is_dir, exists, scandir, normcase):
def _select_from(self, parent_path, follow_symlinks, scandir, normcase):
try:
# We must close the scandir() object before proceeding to
# avoid exhausting file descriptors when globbing deep trees.
Expand All @@ -147,7 +119,7 @@ def _select_from(self, parent_path, is_dir, exists, scandir, normcase):
# "entry.is_dir()" can raise PermissionError
# in some cases (see bpo-38894), which is not
# among the errors ignored by _ignore_error()
if not entry.is_dir():
if not entry.is_dir(follow_symlinks=follow_symlinks):
continue
except OSError as e:
if not _ignore_error(e):
Expand All @@ -156,7 +128,7 @@ def _select_from(self, parent_path, is_dir, exists, scandir, normcase):
name = entry.name
if self.match(normcase(name)):
path = parent_path._make_child_relpath(name)
for p in self.successor._select_from(path, is_dir, exists, scandir, normcase):
for p in self.successor._select_from(path, follow_symlinks, scandir, normcase):
yield p
except PermissionError:
return
Expand All @@ -167,7 +139,7 @@ class _RecursiveWildcardSelector(_Selector):
def __init__(self, pat, child_parts, flavour):
_Selector.__init__(self, child_parts, flavour)

def _iterate_directories(self, parent_path, is_dir, scandir):
def _iterate_directories(self, parent_path, follow_symlinks, scandir):
yield parent_path
try:
# We must close the scandir() object before proceeding to
Expand All @@ -177,24 +149,24 @@ def _iterate_directories(self, parent_path, is_dir, scandir):
for entry in entries:
entry_is_dir = False
try:
entry_is_dir = entry.is_dir()
entry_is_dir = entry.is_dir(follow_symlinks=follow_symlinks)
except OSError as e:
if not _ignore_error(e):
raise
if entry_is_dir and not entry.is_symlink():
if entry_is_dir:
path = parent_path._make_child_relpath(entry.name)
for p in self._iterate_directories(path, is_dir, scandir):
for p in self._iterate_directories(path, follow_symlinks, scandir):
yield p
except PermissionError:
return

def _select_from(self, parent_path, is_dir, exists, scandir, normcase):
def _select_from(self, parent_path, follow_symlinks, scandir, normcase):
try:
yielded = set()
try:
successor_select = self.successor._select_from
for starting_point in self._iterate_directories(parent_path, is_dir, scandir):
for p in successor_select(starting_point, is_dir, exists, scandir, normcase):
for starting_point in self._iterate_directories(parent_path, follow_symlinks, scandir):
for p in successor_select(starting_point, follow_symlinks, scandir, normcase):
if p not in yielded:
yield p
yielded.add(p)
Expand Down Expand Up @@ -763,7 +735,7 @@ def _scandir(self):
# includes scandir(), which is used to implement glob().
return os.scandir(self)

def glob(self, pattern):
def glob(self, pattern, *, follow_symlinks=False):
"""Iterate over this subtree and yield all existing files (of any
kind, including directories) matching the given relative pattern.
"""
Expand All @@ -776,10 +748,10 @@ def glob(self, pattern):
if pattern[-1] in (self._flavour.sep, self._flavour.altsep):
pattern_parts.append('')
selector = _make_selector(tuple(pattern_parts), self._flavour)
for p in selector.select_from(self):
for p in selector.select_from(self, follow_symlinks):
yield p

def rglob(self, pattern):
def rglob(self, pattern, *, follow_symlinks=False):
"""Recursively yield all existing files (of any kind, including
directories) matching the given relative pattern, anywhere in
this subtree.
Expand All @@ -791,7 +763,7 @@ def rglob(self, pattern):
if pattern and pattern[-1] in (self._flavour.sep, self._flavour.altsep):
pattern_parts.append('')
selector = _make_selector(("**",) + tuple(pattern_parts), self._flavour)
for p in selector.select_from(self):
for p in selector.select_from(self, follow_symlinks):
yield p

def absolute(self):
Expand Down
80 changes: 49 additions & 31 deletions Lib/test/test_pathlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -1760,22 +1760,25 @@ def _check(glob, expected):
_check(p.glob("dir*/file*"), ["dirB/fileB", "dirC/fileC"])
if not os_helper.can_symlink():
_check(p.glob("*A"), ['dirA', 'fileA'])
else:
_check(p.glob("*A"), ['dirA', 'fileA', 'linkA'])
if not os_helper.can_symlink():
_check(p.glob("*B/*"), ['dirB/fileB'])
else:
_check(p.glob("*B/*"), ['dirB/fileB', 'dirB/linkD',
'linkB/fileB', 'linkB/linkD'])
if not os_helper.can_symlink():
_check(p.glob("*/fileB"), ['dirB/fileB'])
else:
_check(p.glob("*/fileB"), ['dirB/fileB', 'linkB/fileB'])
_check(p.glob("*A"), ['dirA', 'fileA', 'linkA'])
_check(p.glob("*B/*"), ['dirB/fileB', 'dirB/linkD'])
_check(p.glob("*/fileB"), ['dirB/fileB'])
_check(p.glob("*/"), ["dirA", "dirB", "dirC", "dirE"])

if not os_helper.can_symlink():
_check(p.glob("*/"), ["dirA", "dirB", "dirC", "dirE"])
else:
_check(p.glob("*/"), ["dirA", "dirB", "dirC", "dirE", "linkB"])
@os_helper.skip_unless_symlink
def test_glob_follow_symlinks_common(self):
def _check(path, glob, expected):
self.assertEqual(set(path.glob(glob, follow_symlinks=True)), { P(BASE, q) for q in expected })
P = self.cls
p = P(BASE)
_check(p, "fileB", [])
_check(p, "dir*/file*", ["dirB/fileB", "dirC/fileC"])
_check(p, "*A", ['dirA', 'fileA', 'linkA'])
_check(p, "*B/*", ['dirB/fileB', 'dirB/linkD', 'linkB/fileB', 'linkB/linkD'])
_check(p, "*/fileB", ['dirB/fileB', 'linkB/fileB'])
_check(p, "*/", ["dirA", "dirB", "dirC", "dirE", "linkB"])

def test_rglob_common(self):
def _check(glob, expected):
Expand All @@ -1787,22 +1790,10 @@ def _check(glob, expected):
_check(it, ["fileA"])
_check(p.rglob("fileB"), ["dirB/fileB"])
_check(p.rglob("*/fileA"), [])
if not os_helper.can_symlink():
_check(p.rglob("*/fileB"), ["dirB/fileB"])
else:
_check(p.rglob("*/fileB"), ["dirB/fileB", "dirB/linkD/fileB",
"linkB/fileB", "dirA/linkC/fileB"])
_check(p.rglob("*/fileB"), ["dirB/fileB"])
_check(p.rglob("file*"), ["fileA", "dirB/fileB",
"dirC/fileC", "dirC/dirD/fileD"])
if not os_helper.can_symlink():
_check(p.rglob("*/"), [
"dirA", "dirB", "dirC", "dirC/dirD", "dirE",
])
else:
_check(p.rglob("*/"), [
"dirA", "dirA/linkC", "dirB", "dirB/linkD", "dirC",
"dirC/dirD", "dirE", "linkB",
])
_check(p.rglob("*/"), ["dirA", "dirB", "dirC", "dirC/dirD", "dirE"])
_check(p.rglob(""), ["", "dirA", "dirB", "dirC", "dirE", "dirC/dirD"])

p = P(BASE, "dirC")
Expand All @@ -1816,6 +1807,33 @@ def _check(glob, expected):
_check(p.rglob("*.txt"), ["dirC/novel.txt"])
_check(p.rglob("*.*"), ["dirC/novel.txt"])

@os_helper.skip_unless_symlink
def test_rglob_follow_symlinks_common(self):
def _check(path, glob, expected):
actual = {path for path in path.rglob(glob, follow_symlinks=True)
if 'linkD' not in path.parts} # exclude symlink loop.
self.assertEqual(actual, { P(BASE, q) for q in expected })
P = self.cls
p = P(BASE)
_check(p, "fileB", ["dirB/fileB", "dirA/linkC/fileB", "linkB/fileB"])
_check(p, "*/fileA", [])
_check(p, "*/fileB", ["dirB/fileB", "dirA/linkC/fileB", "linkB/fileB"])
_check(p, "file*", ["fileA", "dirA/linkC/fileB", "dirB/fileB",
"dirC/fileC", "dirC/dirD/fileD", "linkB/fileB"])
_check(p, "*/", ["dirA", "dirA/linkC", "dirB", "dirC", "dirC/dirD", "dirE", "linkB"])
_check(p, "", ["", "dirA", "dirA/linkC", "dirB", "dirC", "dirE", "dirC/dirD", "linkB"])

p = P(BASE, "dirC")
_check(p, "*", ["dirC/fileC", "dirC/novel.txt",
"dirC/dirD", "dirC/dirD/fileD"])
_check(p, "file*", ["dirC/fileC", "dirC/dirD/fileD"])
_check(p, "*/*", ["dirC/dirD/fileD"])
_check(p, "*/", ["dirC/dirD"])
_check(p, "", ["dirC", "dirC/dirD"])
# gh-91616, a re module regression
_check(p, "*.txt", ["dirC/novel.txt"])
_check(p, "*.*", ["dirC/novel.txt"])

@os_helper.skip_unless_symlink
def test_rglob_symlink_loop(self):
# Don't get fooled by symlink loops (Issue #26012).
Expand Down Expand Up @@ -1856,8 +1874,8 @@ def test_glob_dotdot(self):
# ".." is not special in globs.
P = self.cls
p = P(BASE)
self.assertEqual(set(p.glob("..")), { P(BASE, "..") })
self.assertEqual(set(p.glob("dirA/../file*")), { P(BASE, "dirA/../fileA") })
self.assertEqual(set(p.glob("..")), set())
self.assertEqual(set(p.glob("dirA/../file*")), set())
self.assertEqual(set(p.glob("../xyzzy")), set())

@os_helper.skip_unless_symlink
Expand Down Expand Up @@ -3053,15 +3071,15 @@ def test_glob(self):
self.assertEqual(set(p.glob("FILEa")), { P(BASE, "fileA") })
self.assertEqual(set(p.glob("*a\\")), { P(BASE, "dirA") })
self.assertEqual(set(p.glob("F*a")), { P(BASE, "fileA") })
self.assertEqual(set(map(str, p.glob("FILEa"))), {f"{p}\\FILEa"})
self.assertEqual(set(map(str, p.glob("FILEa"))), {f"{p}\\fileA"})
self.assertEqual(set(map(str, p.glob("F*a"))), {f"{p}\\fileA"})

def test_rglob(self):
P = self.cls
p = P(BASE, "dirC")
self.assertEqual(set(p.rglob("FILEd")), { P(BASE, "dirC/dirD/fileD") })
self.assertEqual(set(p.rglob("*\\")), { P(BASE, "dirC/dirD") })
self.assertEqual(set(map(str, p.rglob("FILEd"))), {f"{p}\\dirD\\FILEd"})
self.assertEqual(set(map(str, p.rglob("FILEd"))), {f"{p}\\dirD\\fileD"})

def test_expanduser(self):
P = self.cls
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Add *follow_symlinks* argument to :meth:`pathlib.Path.glob` and
:meth:`~pathlib.Path.rglob`, defaulting to false.

0 comments on commit 59dcdb1

Please sign in to comment.