diff --git a/CHANGES.rst b/CHANGES.rst index 3b6c260..594a9e4 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,3 +1,17 @@ +v2.1.0 +====== + +#32: Merge with v1.1.0. + +v1.1.0 +====== + +#32: For read-only zip files, complexity of ``.exists`` and +``joinpath`` is now constant time instead of ``O(n)``, preventing +quadratic time in common use-cases and rendering large +zip files unusable for Path. Big thanks to Benjy Weinberger +for the bug report and contributed fix (#33). + v2.0.1 ====== diff --git a/setup.cfg b/setup.cfg index 38e6207..09414bc 100644 --- a/setup.cfg +++ b/setup.cfg @@ -26,6 +26,7 @@ testing = # upstream # local + jaraco.itertools docs = # upstream diff --git a/test_zipp.py b/test_zipp.py index 37198b7..d4f8fa9 100644 --- a/test_zipp.py +++ b/test_zipp.py @@ -6,6 +6,8 @@ import tempfile import shutil +import jaraco.itertools + import zipp consume = tuple @@ -16,7 +18,7 @@ def add_dirs(zf): Given a writable zip file zf, inject directory entries for any directories implied by the presence of children. """ - for name in zipp.Path._implied_dirs(zf.namelist()): + for name in zipp.CompleteDirs._implied_dirs(zf.namelist()): zf.writestr(name, b"") return zf @@ -201,3 +203,25 @@ def test_mutability(self): assert (root / 'foo.txt').read_text() == 'foo' baz, = (root / 'bar').iterdir() assert baz.read_text() == 'baz' + + HUGE_ZIPFILE_NUM_ENTRIES = 2 ** 13 + + def huge_zipfile(self): + """Create a read-only zipfile with a huge number of entries entries.""" + strm = io.BytesIO() + zf = zipfile.ZipFile(strm, "w") + for entry in map(str, range(self.HUGE_ZIPFILE_NUM_ENTRIES)): + zf.writestr(entry, entry) + zf.mode = 'r' + return zf + + def test_joinpath_constant_time(self): + """ + Ensure joinpath on items in zipfile is linear time. + """ + root = zipp.Path(self.huge_zipfile()) + entries = jaraco.itertools.Counter(root.iterdir()) + for entry in entries: + entry.joinpath('suffix') + # Check the file iterated all items + assert entries.count == self.HUGE_ZIPFILE_NUM_ENTRIES diff --git a/zipp.py b/zipp.py index 2b02c5b..46165da 100644 --- a/zipp.py +++ b/zipp.py @@ -3,6 +3,7 @@ import zipfile import functools import itertools +import contextlib from collections import OrderedDict @@ -47,6 +48,90 @@ def _ancestry(path): path, tail = posixpath.split(path) +class CompleteDirs(zipfile.ZipFile): + """ + A ZipFile subclass that ensures that implied directories + are always included in the namelist. + """ + + @staticmethod + def _implied_dirs(names): + parents = itertools.chain.from_iterable(map(_parents, names)) + # Deduplicate entries in original order + implied_dirs = OrderedDict.fromkeys( + p + posixpath.sep for p in parents + # Cast names to a set for O(1) lookups + if p + posixpath.sep not in set(names) + ) + return implied_dirs + + def namelist(self): + names = super(CompleteDirs, self).namelist() + return names + list(self._implied_dirs(names)) + + def _name_set(self): + return set(self.namelist()) + + def resolve_dir(self, name): + """ + If the name represents a directory, return that name + as a directory (with the trailing slash). + """ + names = self._name_set() + dirname = name + '/' + dir_match = name not in names and dirname in names + return dirname if dir_match else name + + @classmethod + def make(cls, source): + """ + Given a source (filename or zipfile), return an + appropriate CompleteDirs subclass. + """ + if isinstance(source, CompleteDirs): + return source + + if not isinstance(source, zipfile.ZipFile): + return cls(_pathlib_compat(source)) + + # Only allow for FastPath when supplied zipfile is read-only + if 'r' not in source.mode: + cls = CompleteDirs + + res = cls.__new__(cls) + vars(res).update(vars(source)) + return res + + +class FastLookup(CompleteDirs): + """ + ZipFile subclass to ensure implicit + dirs exist and are resolved rapidly. + """ + def namelist(self): + with contextlib.suppress(AttributeError): + return self.__names + self.__names = super(FastLookup, self).namelist() + return self.__names + + def _name_set(self): + with contextlib.suppress(AttributeError): + return self.__lookup + self.__lookup = super(FastLookup, self)._name_set() + return self.__lookup + + +def _pathlib_compat(path): + """ + For path-like objects, convert to a filename for compatibility + on Python 3.6.1 and earlier. + """ + try: + return path.__fspath__() + except AttributeError: + return str(path) + + class Path: """ A pathlib-compatible interface for zip files. @@ -115,24 +200,9 @@ class Path: __repr = "{self.__class__.__name__}({self.root.filename!r}, {self.at!r})" def __init__(self, root, at=""): - self.root = ( - root - if isinstance(root, zipfile.ZipFile) - else zipfile.ZipFile(self._pathlib_compat(root)) - ) + self.root = FastLookup.make(root) self.at = at - @staticmethod - def _pathlib_compat(path): - """ - For path-like objects, convert to a filename for compatibility - on Python 3.6.1 and earlier. - """ - try: - return path.__fspath__() - except AttributeError: - return str(path) - @property def open(self): return functools.partial(self.root.open, self.at) @@ -162,12 +232,12 @@ def is_file(self): return not self.is_dir() def exists(self): - return self.at in self._names() + return self.at in self.root._name_set() def iterdir(self): if not self.is_dir(): raise ValueError("Can't listdir a file") - subs = map(self._next, self._names()) + subs = map(self._next, self.root.namelist()) return filter(self._is_child, subs) def __str__(self): @@ -177,35 +247,14 @@ def __repr__(self): return self.__repr.format(self=self) def joinpath(self, add): - add = self._pathlib_compat(add) - next = posixpath.join(self.at, add) - next_dir = posixpath.join(self.at, add, "") - names = self._names() - return self._next(next_dir if next not in names and next_dir in names else next) + next = posixpath.join(self.at, _pathlib_compat(add)) + return self._next(self.root.resolve_dir(next)) __truediv__ = joinpath - @staticmethod - def _implied_dirs(names): - parents = itertools.chain.from_iterable(map(_parents, names)) - # Deduplicate entries in original order - implied_dirs = OrderedDict.fromkeys( - p + posixpath.sep for p in parents - # Cast names to a set for O(1) lookups - if p + posixpath.sep not in set(names) - ) - return iter(implied_dirs) - - @classmethod - def _add_implied_dirs(cls, names): - return names + list(cls._implied_dirs(names)) - @property def parent(self): parent_at = posixpath.dirname(self.at.rstrip('/')) if parent_at: parent_at += '/' return self._next(parent_at) - - def _names(self): - return self._add_implied_dirs(self.root.namelist())