diff --git a/Doc/library/glob.rst b/Doc/library/glob.rst index 684466d354aef8..849316553e408a 100644 --- a/Doc/library/glob.rst +++ b/Doc/library/glob.rst @@ -75,10 +75,6 @@ The :mod:`glob` module defines the following functions: Using the "``**``" pattern in large directory trees may consume an inordinate amount of time. - .. note:: - This function may return duplicate path names if *pathname* - contains multiple "``**``" patterns and *recursive* is true. - .. versionchanged:: 3.5 Support for recursive globs using "``**``". @@ -88,6 +84,11 @@ The :mod:`glob` module defines the following functions: .. versionchanged:: 3.11 Added the *include_hidden* parameter. + .. versionchanged:: 3.14 + Matching path names are returned only once. In previous versions, this + function could return duplicate path names if *pathname* contains multiple + "``**``" patterns and *recursive* is true. + .. function:: iglob(pathname, *, root_dir=None, dir_fd=None, recursive=False, \ include_hidden=False) @@ -98,10 +99,6 @@ The :mod:`glob` module defines the following functions: .. audit-event:: glob.glob pathname,recursive glob.iglob .. audit-event:: glob.glob/2 pathname,recursive,root_dir,dir_fd glob.iglob - .. note:: - This function may return duplicate path names if *pathname* - contains multiple "``**``" patterns and *recursive* is true. - .. versionchanged:: 3.5 Support for recursive globs using "``**``". @@ -111,6 +108,11 @@ The :mod:`glob` module defines the following functions: .. versionchanged:: 3.11 Added the *include_hidden* parameter. + .. versionchanged:: 3.14 + Matching path names are yielded only once. In previous versions, this + function could yield duplicate path names if *pathname* contains multiple + "``**``" patterns and *recursive* is true. + ..
function:: escape(pathname) diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst index 72abfebd46f2b9..38e135b06eed16 100644 --- a/Doc/whatsnew/3.14.rst +++ b/Doc/whatsnew/3.14.rst @@ -708,6 +708,13 @@ asyncio reduces memory usage. (Contributed by Kumar Aditya in :gh:`107803`.) +glob +---- + +* Reduce the number of system calls in :func:`glob.glob` and :func:`~glob.iglob`, + thereby improving the speed of globbing operations by 20-80%. + (Contributed by Barney Gale in :gh:`116380`.) + io --- * :mod:`io` which provides the built-in :func:`open` makes less system calls diff --git a/Lib/glob.py b/Lib/glob.py index 690ab1b8b9fb1d..007b3d5b0e4870 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -1,13 +1,10 @@ """Filename globbing utility.""" -import contextlib import os import re import fnmatch import functools -import itertools import operator -import stat import sys @@ -45,82 +42,37 @@ def iglob(pathname, *, root_dir=None, dir_fd=None, recursive=False, """ sys.audit("glob.glob", pathname, recursive) sys.audit("glob.glob/2", pathname, recursive, root_dir, dir_fd) - if root_dir is not None: - root_dir = os.fspath(root_dir) - else: - root_dir = pathname[:0] - it = _iglob(pathname, root_dir, dir_fd, recursive, False, - include_hidden=include_hidden) - if not pathname or recursive and _isrecursive(pathname[:2]): - try: - s = next(it) # skip empty string - if s: - it = itertools.chain((s,), it) - except StopIteration: - pass - return it - -def _iglob(pathname, root_dir, dir_fd, recursive, dironly, - include_hidden=False): - dirname, basename = os.path.split(pathname) - if not has_magic(pathname): - assert not dironly - if basename: - if _lexists(_join(root_dir, pathname), dir_fd): - yield pathname - else: - # Patterns ending with a slash should match only directories - if _isdir(_join(root_dir, dirname), dir_fd): - yield pathname - return - if not dirname: - if recursive and _isrecursive(basename): - yield from _glob2(root_dir, basename, dir_fd, dironly, - 
include_hidden=include_hidden) - else: - yield from _glob1(root_dir, basename, dir_fd, dironly, - include_hidden=include_hidden) - return - # `os.path.split()` returns the argument itself as a dirname if it is a - # drive or UNC path. Prevent an infinite recursion if a drive or UNC path - # contains magic characters (i.e. r'\\?\C:'). - if dirname != pathname and has_magic(dirname): - dirs = _iglob(dirname, root_dir, dir_fd, recursive, True, - include_hidden=include_hidden) - else: - dirs = [dirname] - if has_magic(basename): - if recursive and _isrecursive(basename): - glob_in_dir = _glob2 - else: - glob_in_dir = _glob1 + pathname = os.fspath(pathname) + if isinstance(pathname, bytes): + pathname = os.fsdecode(pathname) + if root_dir is not None: + root_dir = os.fsdecode(root_dir) + paths = _iglob(pathname, root_dir, dir_fd, recursive, include_hidden) + return map(os.fsencode, paths) else: - glob_in_dir = _glob0 - for dirname in dirs: - for name in glob_in_dir(_join(root_dir, dirname), basename, dir_fd, dironly, - include_hidden=include_hidden): - yield os.path.join(dirname, name) - -# These 2 helper functions non-recursively glob inside a literal directory. -# They return a list of basenames. _glob1 accepts a pattern while _glob0 -# takes a literal basename (so it only has to check for its existence). 
- -def _glob1(dirname, pattern, dir_fd, dironly, include_hidden=False): - names = _listdir(dirname, dir_fd, dironly) - if not (include_hidden or _ishidden(pattern)): - names = (x for x in names if not _ishidden(x)) - return fnmatch.filter(names, pattern) - -def _glob0(dirname, basename, dir_fd, dironly, include_hidden=False): - if basename: - if _lexists(_join(dirname, basename), dir_fd): - return [basename] + return _iglob(pathname, root_dir, dir_fd, recursive, include_hidden) + +def _iglob(pathname, root_dir, dir_fd, recursive, include_hidden): + if os.path.altsep: + pathname = pathname.replace(os.path.altsep, os.path.sep) + drive, root, tail = os.path.splitroot(pathname) + anchor = drive + root + parts = tail.split(os.path.sep)[::-1] if tail else [] + globber = _StringGlobber(recursive=recursive, include_hidden=include_hidden) + select = globber.selector(parts) + if anchor: + # Non-relative pattern. The anchor is guaranteed to exist unless it + # has a Windows drive component. + paths = select(anchor, dir_fd, anchor, not drive) else: - # `os.path.split()` returns an empty basename for paths ending with a - # directory separator. 'q*x/' should match only directories. - if _isdir(dirname, dir_fd): - return [basename] - return [] + # Relative pattern. + if root_dir is None: + root_dir = os.path.curdir + paths = _relative_glob(select, root_dir, dir_fd) + # Skip empty string. + if path := next(paths, None): + yield path + yield from paths _deprecated_function_message = ( "{name} is deprecated and will be removed in Python {remove}. 
Use " @@ -130,102 +82,21 @@ def _glob0(dirname, basename, dir_fd, dironly, include_hidden=False): def glob0(dirname, pattern): import warnings warnings._deprecated("glob.glob0", _deprecated_function_message, remove=(3, 15)) - return _glob0(dirname, pattern, None, False) + return list(_relative_glob(_StringGlobber().literal_selector(pattern, []), dirname)) def glob1(dirname, pattern): import warnings warnings._deprecated("glob.glob1", _deprecated_function_message, remove=(3, 15)) - return _glob1(dirname, pattern, None, False) - -# This helper function recursively yields relative pathnames inside a literal -# directory. - -def _glob2(dirname, pattern, dir_fd, dironly, include_hidden=False): - assert _isrecursive(pattern) - if not dirname or _isdir(dirname, dir_fd): - yield pattern[:0] - yield from _rlistdir(dirname, dir_fd, dironly, - include_hidden=include_hidden) - -# If dironly is false, yields all file names inside a directory. -# If dironly is true, yields only directory names. -def _iterdir(dirname, dir_fd, dironly): - try: - fd = None - fsencode = None - if dir_fd is not None: - if dirname: - fd = arg = os.open(dirname, _dir_open_flags, dir_fd=dir_fd) - else: - arg = dir_fd - if isinstance(dirname, bytes): - fsencode = os.fsencode - elif dirname: - arg = dirname - elif isinstance(dirname, bytes): - arg = bytes(os.curdir, 'ASCII') - else: - arg = os.curdir - try: - with os.scandir(arg) as it: - for entry in it: - try: - if not dironly or entry.is_dir(): - if fsencode is not None: - yield fsencode(entry.name) - else: - yield entry.name - except OSError: - pass - finally: - if fd is not None: - os.close(fd) - except OSError: - return - -def _listdir(dirname, dir_fd, dironly): - with contextlib.closing(_iterdir(dirname, dir_fd, dironly)) as it: - return list(it) - -# Recursively yields relative pathnames inside a literal directory. 
-def _rlistdir(dirname, dir_fd, dironly, include_hidden=False): - names = _listdir(dirname, dir_fd, dironly) - for x in names: - if include_hidden or not _ishidden(x): - yield x - path = _join(dirname, x) if dirname else x - for y in _rlistdir(path, dir_fd, dironly, - include_hidden=include_hidden): - yield _join(x, y) - - -def _lexists(pathname, dir_fd): - # Same as os.path.lexists(), but with dir_fd - if dir_fd is None: - return os.path.lexists(pathname) - try: - os.lstat(pathname, dir_fd=dir_fd) - except (OSError, ValueError): - return False - else: - return True - -def _isdir(pathname, dir_fd): - # Same as os.path.isdir(), but with dir_fd - if dir_fd is None: - return os.path.isdir(pathname) - try: - st = os.stat(pathname, dir_fd=dir_fd) - except (OSError, ValueError): - return False - else: - return stat.S_ISDIR(st.st_mode) + return list(_relative_glob(_StringGlobber().wildcard_selector(pattern, []), dirname)) -def _join(dirname, basename): - # It is common if dirname or basename is empty - if not dirname or not basename: - return dirname or basename - return os.path.join(dirname, basename) +def _relative_glob(select, dirname, dir_fd=None): + """Globs using a *select* function from the given dirname. The dirname + prefix is removed from results. If dir_fd is supplied, then dirname is + opened relative to the given file descriptor. + """ + dirname = _StringGlobber.add_slash(dirname) + slicer = operator.itemgetter(slice(len(dirname), None)) + return map(slicer, select(dirname, dir_fd, dirname)) magic_check = re.compile('([*?[])') magic_check_bytes = re.compile(b'([*?[])') @@ -237,15 +108,6 @@ def has_magic(s): match = magic_check.search(s) return match is not None -def _ishidden(path): - return path[0] in ('.', b'.'[0]) - -def _isrecursive(pattern): - if isinstance(pattern, bytes): - return pattern == b'**' - else: - return pattern == '**' - def escape(pathname): """Escape all special characters. 
""" @@ -319,12 +181,13 @@ def translate(pat, *, recursive=False, include_hidden=False, seps=None): return fr'(?s:{res})\Z' -@functools.lru_cache(maxsize=512) -def _compile_pattern(pat, sep, case_sensitive, recursive=True): +@functools.lru_cache(maxsize=1024) +def _compile_pattern(pat, sep, case_sensitive, recursive, include_hidden): """Compile given glob pattern to a re.Pattern object (observing case sensitivity).""" flags = re.NOFLAG if case_sensitive else re.IGNORECASE - regex = translate(pat, recursive=recursive, include_hidden=True, seps=sep) + regex = translate(pat, recursive=recursive, + include_hidden=include_hidden, seps=sep) return re.compile(regex, flags=flags).match @@ -332,11 +195,13 @@ class _GlobberBase: """Abstract class providing shell-style pattern matching and globbing. """ - def __init__(self, sep, case_sensitive, case_pedantic=False, recursive=False): + def __init__(self, sep=os.path.sep, case_sensitive=os.name != 'nt', + case_pedantic=False, recursive=False, include_hidden=False): self.sep = sep self.case_sensitive = case_sensitive self.case_pedantic = case_pedantic self.recursive = recursive + self.include_hidden = include_hidden # Abstract methods @@ -346,12 +211,30 @@ def lexists(path): """ raise NotImplementedError + @staticmethod + def lstat(path, dir_fd=None): + """Implements os.lstat() + """ + raise NotImplementedError + + @staticmethod + def open(path, flags, dir_fd=None): + """Implements os.open() + """ + raise NotImplementedError + @staticmethod def scandir(path): """Implements os.scandir(). """ raise NotImplementedError + @staticmethod + def close(fd): + """Implements os.close(). + """ + raise NotImplementedError + @staticmethod def add_slash(path): """Returns a path with a trailing slash added. 
@@ -367,7 +250,8 @@ def concat_path(path, text): # High-level methods def compile(self, pat): - return _compile_pattern(pat, self.sep, self.case_sensitive, self.recursive) + return _compile_pattern(pat, self.sep, self.case_sensitive, + self.recursive, self.include_hidden) def selector(self, parts): """Returns a function that selects from a given path, walking and @@ -391,9 +275,11 @@ def special_selector(self, part, parts): """ select_next = self.selector(parts) - def select_special(path, exists=False): + def select_special(path, dir_fd=None, rel_path=None, exists=False): path = self.concat_path(self.add_slash(path), part) - return select_next(path, exists) + if dir_fd is not None: + rel_path = self.concat_path(self.add_slash(rel_path), part) + return select_next(path, dir_fd, rel_path, exists) return select_special def literal_selector(self, part, parts): @@ -408,9 +294,11 @@ def literal_selector(self, part, parts): select_next = self.selector(parts) - def select_literal(path, exists=False): + def select_literal(path, dir_fd=None, rel_path=None, exists=False): path = self.concat_path(self.add_slash(path), part) - return select_next(path, exists=False) + if dir_fd is not None: + rel_path = self.concat_path(self.add_slash(rel_path), part) + return select_next(path, dir_fd, rel_path, exists=False) return select_literal def wildcard_selector(self, part, parts): @@ -418,17 +306,21 @@ def wildcard_selector(self, part, parts): filtering by pattern. """ - match = None if part == '*' else self.compile(part) + match = None if self.include_hidden and part == '*' else self.compile(part) dir_only = bool(parts) if dir_only: select_next = self.selector(parts) - def select_wildcard(path, exists=False): + def select_wildcard(path, dir_fd=None, rel_path=None, exists=False): + fd = None try: - # We must close the scandir() object before proceeding to - # avoid exhausting file descriptors when globbing deep trees. 
- with self.scandir(path) as scandir_it: - entries = list(scandir_it) + if dir_fd is None: + with self.scandir(path) as scandir_it: + entries = list(scandir_it) + else: + fd = self.open(rel_path, _dir_open_flags, dir_fd=dir_fd) + with self.scandir(fd) as scandir_it: + entries = list(scandir_it) except OSError: pass else: @@ -443,9 +335,13 @@ def select_wildcard(path, exists=False): continue entry_path = self.concat_path(prefix, entry.name) if dir_only: - yield from select_next(entry_path, exists=True) + yield from select_next( + entry_path, fd, entry.name, exists=True) else: yield entry_path + finally: + if fd is not None: + self.close(fd) return select_wildcard def recursive_selector(self, part, parts): @@ -467,26 +363,47 @@ def recursive_selector(self, part, parts): while parts and parts[-1] not in _special_parts: part += self.sep + parts.pop() - match = None if part == '**' else self.compile(part) + match = None if self.include_hidden and part == '**' else self.compile(part) dir_only = bool(parts) select_next = self.selector(parts) - def select_recursive(path, exists=False): + def select_recursive(path, dir_fd=None, rel_path=None, exists=False): path = self.add_slash(path) + if dir_fd is not None: + rel_path = self.add_slash(rel_path) match_pos = len(str(path)) if match is None or match(str(path), match_pos): - yield from select_next(path, exists) - stack = [path] - while stack: - yield from select_recursive_step(stack, match_pos) + yield from select_next(path, dir_fd, rel_path, exists) + stack = [(path, dir_fd, rel_path)] + try: + while stack: + yield from select_recursive_step(stack, match_pos) + finally: + # Close any file descriptors still on the stack. 
+ while stack: + path, dir_fd, _rel_path = stack.pop() + if path is None: + try: + self.close(dir_fd) + except OSError: + pass def select_recursive_step(stack, match_pos): - path = stack.pop() + path, dir_fd, rel_path = stack.pop() try: - # We must close the scandir() object before proceeding to - # avoid exhausting file descriptors when globbing deep trees. - with self.scandir(path) as scandir_it: - entries = list(scandir_it) + if path is None: + self.close(dir_fd) + return + elif dir_fd is None: + fd = None + with self.scandir(path) as scandir_it: + entries = list(scandir_it) + else: + fd = self.open(rel_path, _dir_open_flags, dir_fd=dir_fd) + # Schedule the file descriptor to be closed next step. + stack.append((None, fd, None)) + with self.scandir(fd) as scandir_it: + entries = list(scandir_it) except OSError: pass else: @@ -503,23 +420,31 @@ def select_recursive_step(stack, match_pos): entry_path = self.concat_path(prefix, entry.name) if match is None or match(str(entry_path), match_pos): if dir_only: - yield from select_next(entry_path, exists=True) + yield from select_next( + entry_path, fd, entry.name, exists=True) else: # Optimization: directly yield the path if this is # last pattern part. yield entry_path if is_dir: - stack.append(entry_path) + stack.append((entry_path, fd, entry.name)) return select_recursive - def select_exists(self, path, exists=False): - """Yields the given path, if it exists. + def select_exists(self, path, dir_fd=None, rel_path=None, exists=False): + """Yields the given path, if it exists. If *dir_fd* is given, we check + whether *rel_path* exists relative to the fd. """ if exists: # Optimization: this path is already known to exist, e.g. because # it was returned from os.scandir(), so we skip calling lstat(). 
yield path + elif dir_fd is not None: + try: + self.lstat(rel_path, dir_fd=dir_fd) + yield path + except OSError: + pass elif self.lexists(path): yield path @@ -528,7 +453,10 @@ class _StringGlobber(_GlobberBase): """Provides shell-style pattern matching and globbing for string paths. """ lexists = staticmethod(os.path.lexists) + lstat = staticmethod(os.lstat) + open = staticmethod(os.open) scandir = staticmethod(os.scandir) + close = staticmethod(os.close) concat_path = operator.add if os.name == 'nt': diff --git a/Lib/pathlib/_abc.py b/Lib/pathlib/_abc.py index 38bc660e0aeb30..26ccb67eec15a0 100644 --- a/Lib/pathlib/_abc.py +++ b/Lib/pathlib/_abc.py @@ -355,7 +355,7 @@ def match(self, path_pattern, *, case_sensitive=None): return False if len(path_parts) > len(pattern_parts) and path_pattern.anchor: return False - globber = PathGlobber(sep, case_sensitive) + globber = PathGlobber(sep, case_sensitive, include_hidden=True) for path_part, pattern_part in zip(path_parts, pattern_parts): match = globber.compile(pattern_part) if match(path_part) is None: @@ -371,12 +371,12 @@ def full_match(self, pattern, *, case_sensitive=None): pattern = self.with_segments(pattern) if case_sensitive is None: case_sensitive = _is_case_sensitive(self.parser) - globber = PathGlobber(pattern.parser.sep, case_sensitive, recursive=True) + globber = PathGlobber(pattern.parser.sep, case_sensitive, + recursive=True, include_hidden=True) match = globber.compile(str(pattern)) return match(str(self)) is not None - class ReadablePath(JoinablePath): """Base class for concrete path objects. diff --git a/Lib/pathlib/_local.py b/Lib/pathlib/_local.py index d6afb31424265c..a4356ceb47d627 100644 --- a/Lib/pathlib/_local.py +++ b/Lib/pathlib/_local.py @@ -655,7 +655,8 @@ def full_match(self, pattern, *, case_sensitive=None): # paths shouldn't match wildcards, so we change it to the empty string. 
path = str(self) if self.parts else '' pattern = str(pattern) if pattern.parts else '' - globber = _StringGlobber(self.parser.sep, case_sensitive, recursive=True) + globber = _StringGlobber(self.parser.sep, case_sensitive, + recursive=True, include_hidden=True) return globber.compile(pattern)(path) is not None # Subclassing os.PathLike makes isinstance() checks slower, @@ -888,7 +889,8 @@ def glob(self, pattern, *, case_sensitive=None, recurse_symlinks=False): case_pedantic = True parts = self._parse_pattern(pattern) recursive = True if recurse_symlinks else _no_recurse_symlinks - globber = _StringGlobber(self.parser.sep, case_sensitive, case_pedantic, recursive) + globber = _StringGlobber(self.parser.sep, case_sensitive, case_pedantic, + recursive, include_hidden=True) select = globber.selector(parts[::-1]) root = str(self) paths = select(root) diff --git a/Lib/test/test_glob.py b/Lib/test/test_glob.py index 00187a3fb3537d..1beee2f2785ff4 100644 --- a/Lib/test/test_glob.py +++ b/Lib/test/test_glob.py @@ -4,14 +4,18 @@ import shutil import sys import unittest +import unittest.mock import warnings from test import support -from test.support import is_wasi, Py_DEBUG +from test.support import is_wasi, Py_DEBUG, infinite_recursion from test.support.os_helper import (TESTFN, skip_unless_symlink, can_symlink, create_empty_file, change_cwd) +_supports_dir_fd = {os.open, os.stat} <= os.supports_dir_fd and os.scandir in os.supports_fd + + class GlobTests(unittest.TestCase): dir_fd = None @@ -49,7 +53,7 @@ def setUp(self): def open_dirfd(self): if self.dir_fd is not None: os.close(self.dir_fd) - if {os.open, os.stat} <= os.supports_dir_fd and os.scandir in os.supports_fd: + if _supports_dir_fd: self.dir_fd = os.open(self.tempdir, os.O_RDONLY | os.O_DIRECTORY) else: self.dir_fd = None @@ -319,8 +323,12 @@ def test_recursive_glob(self): with change_cwd(self.tempdir): join = os.path.join eq(glob.glob('**', recursive=True), [join(*i) for i in full]) + eq(glob.glob(join('**', 
'**'), recursive=True), + [join(*i) for i in full]) eq(glob.glob(join('**', ''), recursive=True), [join(*i) for i in dirs]) + eq(glob.glob(join('**', '**', ''), recursive=True), + [join(*i) for i in dirs]) eq(glob.glob(join('**', '*'), recursive=True), [join(*i) for i in full]) eq(glob.glob(join(os.curdir, '**'), recursive=True), @@ -387,6 +395,33 @@ def test_glob_many_open_files(self): for it in iters: self.assertEqual(next(it), p) + def test_glob_above_recursion_limit(self): + depth = 30 + base = os.path.join(self.tempdir, 'deep') + p = os.path.join(base, *(['d']*depth)) + os.makedirs(p) + pattern = os.path.join(base, '**', 'd') + with infinite_recursion(depth - 5): + glob.glob(pattern, recursive=True) + + @unittest.skipUnless(_supports_dir_fd, "Needs support for iglob(dir_fd=...)") + def test_iglob_iter_close(self): + base = os.path.join(self.tempdir, 'deep') + p = os.path.join(base, *(['d'] * 10)) + os.makedirs(p) + with ( + unittest.mock.patch("glob._StringGlobber.open", wraps=os.open) as os_open, + unittest.mock.patch("glob._StringGlobber.close", wraps=os.close) as os_close + ): + self.assertEqual(os_open.call_count, os_close.call_count) + iter = glob.iglob('**/*/d', dir_fd=self.dir_fd, recursive=True) + self.assertEqual(os_open.call_count, os_close.call_count) + self.assertEqual(next(iter), 'deep/d') + self.assertEqual(next(iter), 'deep/d/d') + self.assertGreater(os_open.call_count, os_close.call_count) + iter.close() + self.assertEqual(os_open.call_count, os_close.call_count) + def test_glob0(self): with self.assertWarns(DeprecationWarning): glob.glob0(self.tempdir, 'a') diff --git a/Misc/NEWS.d/next/Library/2024-03-05-23-08-11.gh-issue-116380.56HU7I.rst b/Misc/NEWS.d/next/Library/2024-03-05-23-08-11.gh-issue-116380.56HU7I.rst new file mode 100644 index 00000000000000..b7f27ab7191a96 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-03-05-23-08-11.gh-issue-116380.56HU7I.rst @@ -0,0 +1,2 @@ +Speed up :func:`glob.glob` and :func:`glob.iglob` by making use of 
+:func:`glob.translate` and tracking path existence more precisely.