From 941b6c7eff72de618bb34eeb5983ed2795988a32 Mon Sep 17 00:00:00 2001 From: Kostis Anagnostopoulos Date: Mon, 24 Oct 2016 13:52:45 +0200 Subject: [PATCH 01/21] feat(src): subst `reduce` with `sum` for size calcs + fix(loose-db): fix bad-attr in ex-message --- gitdb/db/base.py | 3 +-- gitdb/db/loose.py | 4 ++-- gitdb/db/pack.py | 2 -- gitdb/fun.py | 3 +-- gitdb/pack.py | 2 +- 5 files changed, 5 insertions(+), 9 deletions(-) diff --git a/gitdb/db/base.py b/gitdb/db/base.py index 2d7b9fa..6dbee21 100644 --- a/gitdb/db/base.py +++ b/gitdb/db/base.py @@ -16,7 +16,6 @@ ) from itertools import chain -from functools import reduce __all__ = ('ObjectDBR', 'ObjectDBW', 'FileDBBase', 'CompoundDB', 'CachingDB') @@ -209,7 +208,7 @@ def stream(self, sha): def size(self): """:return: total size of all contained databases""" - return reduce(lambda x, y: x + y, (db.size() for db in self._dbs), 0) + return sum(db.size() for db in self._dbs) def sha_iter(self): return chain(*(db.sha_iter() for db in self._dbs)) diff --git a/gitdb/db/loose.py b/gitdb/db/loose.py index 192c524..374cdc7 100644 --- a/gitdb/db/loose.py +++ b/gitdb/db/loose.py @@ -57,7 +57,7 @@ import os -__all__ = ('LooseObjectDB', ) +__all__ = ('LooseObjectDB',) class LooseObjectDB(FileDBBase, ObjectDBR, ObjectDBW): @@ -150,7 +150,7 @@ def _map_loose_object(self, sha): def set_ostream(self, stream): """:raise TypeError: if the stream does not support the Sha1Writer interface""" if stream is not None and not isinstance(stream, Sha1Writer): - raise TypeError("Output stream musst support the %s interface" % Sha1Writer.__name__) + raise TypeError("Output stream must support the %s interface" % Sha1Writer) return super(LooseObjectDB, self).set_ostream(stream) def info(self, sha): diff --git a/gitdb/db/pack.py b/gitdb/db/pack.py index 6b03d83..12ec43a 100644 --- a/gitdb/db/pack.py +++ b/gitdb/db/pack.py @@ -20,8 +20,6 @@ from gitdb.pack import PackEntity from gitdb.utils.compat import xrange -from functools import reduce - import os import glob diff --git a/gitdb/fun.py b/gitdb/fun.py index 8ca38c8..cf3ad02 100644 --- a/gitdb/fun.py +++ b/gitdb/fun.py @@ -12,7 +12,6 @@ import mmap from itertools import islice -from functools import reduce from gitdb.const import NULL_BYTE, BYTE_SPACE from gitdb.utils.encoding import force_text @@ -296,7 +295,7 @@ def check_integrity(self, target_size=-1): :raise AssertionError: if the size doen't match""" if target_size > -1: assert self[-1].rbound() == target_size - assert reduce(lambda x, y: x + y, (d.ts for d in self), 0) == target_size + assert sum(d.ts for d in self) == target_size # END target size verification if len(self) < 2: diff --git a/gitdb/pack.py b/gitdb/pack.py index 20a4515..cc358c9 100644 --- a/gitdb/pack.py +++ b/gitdb/pack.py @@ -40,7 +40,7 @@ pass # END try c module -from gitdb.base import ( # Amazing ! +from gitdb.base import ( # Amazing ! OInfo, OStream, OPackInfo, From 2ec8c99e52290b88a6816df7cef0eff8df81c0b7 Mon Sep 17 00:00:00 2001 From: Kostis Anagnostopoulos Date: Sun, 2 Oct 2016 00:49:57 +0200 Subject: [PATCH 02/21] feat(io): Retrofit streams as context-managers. + feat(util): add logger. + feat(util): add suppress-ex context-handler (from PY3 sources). 
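
The retrofit is easiest to see from the caller's side. A minimal usage
sketch of the new scheme (the paths here are made up, the bytes type-tag
assumes Python 3, and `suppress` is the helper vendored below from Python
3's contextlib, so a failing `close()` in `__exit__` cannot mask the real
error):

    import os
    from io import BytesIO

    from gitdb import IStream, LooseObjectDB
    from gitdb.util import suppress

    ldb = LooseObjectDB("/path/to/.git/objects")    # hypothetical path

    data = b"my data"
    with IStream(b"blob", len(data), BytesIO(data)) as istream:
        ldb.store(istream)               # computes and assigns istream.binsha
        sha = istream.binsha

    with ldb.stream(sha) as ostream:     # the stream now closes itself on exit
        assert ostream.read() == data

    with suppress(OSError):              # no-op if the file is already gone
        os.remove("/path/to/stale.lock")     # made-up path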
---
 doc/source/tutorial.rst   | 59 +++++++++++++++++++--------------------
 gitdb/base.py             | 44 +++++++++++++++++++++++++----
 gitdb/db/mem.py           | 13 ++++-----
 gitdb/pack.py             | 12 ++++----
 gitdb/stream.py           | 55 ++++++++++++++++++++++++++++++++----
 gitdb/test/lib.py         |  7 +++--
 gitdb/test/test_stream.py |  8 ++++--
 gitdb/util.py             | 37 ++++++++++++++++++++++++
 8 files changed, 176 insertions(+), 59 deletions(-)

diff --git a/doc/source/tutorial.rst b/doc/source/tutorial.rst
index 55a737f..ddc0f19 100644
--- a/doc/source/tutorial.rst
+++ b/doc/source/tutorial.rst
@@ -35,29 +35,28 @@ Databases support query and/or addition of objects using simple interfaces. They
 Both have two sets of methods, one of which allows interacting with single objects, the other one allowing to handle a stream of objects simultaneously and asynchronously.
 
 Acquiring information about an object from a database is easy if you have a SHA1 to refer to the object::
-    
-    
+
+
     ldb = LooseObjectDB(fixture_path("../../../.git/objects"))
-    
+
     for sha1 in ldb.sha_iter():
         oinfo = ldb.info(sha1)
-        ostream = ldb.stream(sha1)
-        assert oinfo[:3] == ostream[:3]
-    
-        assert len(ostream.read()) == ostream.size
-    # END for each sha in database
-    
+        with ldb.stream(sha1) as ostream:
+            assert oinfo[:3] == ostream[:3]
+
+            assert len(ostream.read()) == ostream.size
+
 To store information, you prepare an *IStream* object with the required information.
 The provided stream will be read and converted into an object, and the respective 20 byte SHA1 identifier is stored in the IStream object::
-    
+
     data = "my data"
-    istream = IStream("blob", len(data), StringIO(data))
-    
-    # the object does not yet have a sha
-    assert istream.binsha is None
-    ldb.store(istream)
-    # now the sha is set
-    assert len(istream.binsha) == 20
-    assert ldb.has_object(istream.binsha)
+    with IStream("blob", len(data), StringIO(data)) as istream:
+
+        # the object does not yet have a sha
+        assert istream.binsha is None
+        ldb.store(istream)
+        # now the sha is set
+        assert len(istream.binsha) == 20
+        assert ldb.has_object(istream.binsha)
 
 **********************
 Asynchronous Operation
 **********************
@@ -67,33 +66,33 @@ For each read or write method that allows a single-object to be handled, an *_as
 Using asynchronous operations is easy, but chaining multiple operations together to form a complex one would require you to read the docs of the *async* package.
 At the current time, due to the *GIL*, the *GitDB* can only achieve true concurrency during zlib compression and decompression of big objects, if the respective c modules were compiled in *async*.
 
 Asynchronous operations are scheduled by a *ThreadPool* which resides in the *gitdb.util* module::
-    
+
     from gitdb.util import pool
-    
+
     # set the pool to use two threads
     pool.set_size(2)
-    
+
     # synchronize the mode of operation
     pool.set_size(0)
-    
-    
+
+
 Use async methods with readers, which supply items to be processed.
The result is given through readers as well:: - + from async import IteratorReader - + # Create a reader from an iterator reader = IteratorReader(ldb.sha_iter()) - + # get reader for object streams info_reader = ldb.stream_async(reader) - + # read one info = info_reader.read(1)[0] - + # read all the rest until depletion ostreams = info_reader.read() - - + + ********* Databases diff --git a/gitdb/base.py b/gitdb/base.py index 42e71d0..03ac1df 100644 --- a/gitdb/base.py +++ b/gitdb/base.py @@ -3,7 +3,7 @@ # This module is part of GitDB and is released under # the New BSD License: http://www.opensource.org/licenses/bsd-license.php """Module with basic data structures - they are designed to be lightweight and fast""" -from gitdb.util import bin_to_hex +from gitdb.util import bin_to_hex, suppress from gitdb.fun import ( type_id_to_type_map, @@ -134,8 +134,15 @@ def __init__(self, *args, **kwargs): #{ Stream Reader Interface + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + with suppress(): + self.stream.close() + def read(self, size=-1): - return self[3].read(size) + return self.stream.read(size) @property def stream(self): @@ -171,9 +178,16 @@ def __new__(cls, packoffset, type, size, stream, *args): """Helps with the initialization of subclasses""" return tuple.__new__(cls, (packoffset, type, size, stream)) + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + with suppress(): + self.stream.close() + #{ Stream Reader Interface def read(self, size=-1): - return self[3].read(size) + return self.stream.read(size) @property def stream(self): @@ -189,9 +203,16 @@ class ODeltaPackStream(ODeltaPackInfo): def __new__(cls, packoffset, type, size, delta_info, stream): return tuple.__new__(cls, (packoffset, type, size, delta_info, stream)) + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + with suppress(): + self.stream.close() + #{ Stream Reader Interface def read(self, size=-1): - return self[4].read(size) + return self.stream.read(size) @property def stream(self): @@ -216,6 +237,13 @@ def __new__(cls, type, size, stream, sha=None): def __init__(self, type, size, stream, sha=None): list.__init__(self, (sha, type, size, stream, None)) + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + with suppress(): + self._stream().close() + #{ Interface @property def hexsha(self): @@ -239,7 +267,7 @@ def _set_error(self, exc): def read(self, size=-1): """Implements a simple stream reader interface, passing the read call on to our internal stream""" - return self[3].read(size) + return self._stream().read(size) #} END stream reader interface @@ -312,4 +340,10 @@ class InvalidOStream(InvalidOInfo): """Carries information about an invalid ODB stream""" __slots__ = tuple() + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + pass + #} END ODB Bases diff --git a/gitdb/db/mem.py b/gitdb/db/mem.py index 8711334..9821896 100644 --- a/gitdb/db/mem.py +++ b/gitdb/db/mem.py @@ -100,13 +100,12 @@ def stream_copy(self, sha_iter, odb): continue # END check object existence - ostream = self.stream(sha) - # compressed data including header - sio = BytesIO(ostream.stream.data()) - istream = IStream(ostream.type, ostream.size, sio, sha) - - odb.store(istream) - count += 1 + with self.stream(sha) as ostream: + # compressed data including header + sio = BytesIO(ostream.stream.data()) + with IStream(ostream.type, 
ostream.size, sio, sha) as istream: + odb.store(istream) + count += 1 # END for each sha return count #} END interface diff --git a/gitdb/pack.py b/gitdb/pack.py index cc358c9..a6b3a09 100644 --- a/gitdb/pack.py +++ b/gitdb/pack.py @@ -63,8 +63,8 @@ from gitdb.const import NULL_BYTE from gitdb.utils.compat import ( - izip, - buffer, + izip, + buffer, xrange, to_bytes ) @@ -184,7 +184,7 @@ class IndexWriter(object): __slots__ = '_objs' def __init__(self): - self._objs = list() + self._objs = [] def append(self, binsha, crc, offset): """Append one piece of object information""" @@ -223,7 +223,7 @@ def write(self, pack_sha, write): sha_write(pack('>L', t[1] & 0xffffffff)) # END for each crc - tmplist = list() + tmplist = [] # offset 32 for t in self._objs: ofs = t[2] @@ -370,7 +370,7 @@ def _initialize(self): def _read_fanout(self, byte_offset): """Generate a fanout table from our data""" d = self._cursor.map() - out = list() + out = [] append = out.append for i in xrange(256): append(unpack_from('>L', d, byte_offset + i * 4)[0]) @@ -601,7 +601,7 @@ def collect_streams(self, offset): delta chain. If the object at offset is no delta, the size of the list is 1. :param offset: specifies the first byte of the object within this pack""" - out = list() + out = [] c = self._cursor while True: ostream = pack_object_at(c, offset, True)[1] diff --git a/gitdb/stream.py b/gitdb/stream.py index 2f4c12d..3343a86 100644 --- a/gitdb/stream.py +++ b/gitdb/stream.py @@ -24,6 +24,7 @@ make_sha, write, close, + suppress, ) from gitdb.const import NULL_BYTE, BYTE_SPACE @@ -142,7 +143,7 @@ def data(self): def close(self): """Close our underlying stream of compressed bytes if this was allowed during initialization :return: True if we closed the underlying stream - :note: can be called safely + :note: can be called safely """ if self._close: if hasattr(self._m, 'close'): @@ -289,11 +290,11 @@ def read(self, size=-1): # if we hit the end of the stream # NOTE: Behavior changed in PY2.7 onward, which requires special handling to make the tests work properly. # They are thorough, and I assume it is truly working. - # Why is this logic as convoluted as it is ? Please look at the table in + # Why is this logic as convoluted as it is ? Please look at the table in # https://github.com/gitpython-developers/gitdb/issues/19 to learn about the test-results. # Bascially, on py2.6, you want to use branch 1, whereas on all other python version, the second branch - # will be the one that works. - # However, the zlib VERSIONs as well as the platform check is used to further match the entries in the + # will be the one that works. + # However, the zlib VERSIONs as well as the platform check is used to further match the entries in the # table in the github issue. This is it ... it was the only way I could make this work everywhere. # IT's CERTAINLY GOING TO BITE US IN THE FUTURE ... . 
if PY26 or ((zlib.ZLIB_VERSION == '1.2.7' or zlib.ZLIB_VERSION == '1.2.5') and not sys.platform == 'darwin'): @@ -566,6 +567,12 @@ def __init__(self): #{ Stream Interface + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + pass + def write(self, data): """:raise IOError: If not all bytes could be written :param data: byte object @@ -593,11 +600,20 @@ class FlexibleSha1Writer(Sha1Writer): """Writer producing a sha1 while passing on the written bytes to the given write function""" - __slots__ = 'writer' + __slots__ = ('writer', '_no_close_writer') - def __init__(self, writer): + def __init__(self, writer, no_close_writer=False): Sha1Writer.__init__(self) self.writer = writer + self._no_close_writer = no_close_writer + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + if not self._no_close_writer: + with suppress(): + self.writer.close() def write(self, data): Sha1Writer.write(self, data) @@ -614,6 +630,13 @@ def __init__(self): self.buf = BytesIO() self.zip = zlib.compressobj(zlib.Z_BEST_SPEED) + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + with suppress(): + self.close() + def __getattr__(self, attr): return getattr(self.buf, attr) @@ -658,6 +681,13 @@ def __init__(self, fd): #{ Stream Interface + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + with suppress(): + self.close() + def write(self, data): """:raise IOError: If not all bytes could be written :return: length of incoming data""" @@ -690,6 +720,13 @@ def __init__(self, fd): self._fd = fd self._pos = 0 + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + with suppress(): + self.close() + def write(self, data): self._pos += len(data) os.write(self._fd, data) @@ -719,6 +756,12 @@ class NullStream(object): Use it like /dev/null""" __slots__ = tuple() + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + pass + def read(self, size=0): return '' diff --git a/gitdb/test/lib.py b/gitdb/test/lib.py index bbdd241..235d032 100644 --- a/gitdb/test/lib.py +++ b/gitdb/test/lib.py @@ -24,6 +24,9 @@ #{ Bases +log = logging.getLogger(__name__) + + class TestBase(unittest.TestCase): """Base class for all tests @@ -47,8 +50,8 @@ def setUpClass(cls): cls.gitrepopath = os.environ.get(cls.k_env_git_repo) if not cls.gitrepopath: - logging.info( - "You can set the %s environment variable to a .git repository of your choice - defaulting to the gitdb repository", cls.k_env_git_repo) + log.info("You can set the %s environment variable to a .git repository of your choice" + " - defaulting to the gitdb repository", cls.k_env_git_repo) ospd = os.path.dirname cls.gitrepopath = os.path.join(ospd(ospd(ospd(__file__))), '.git') # end assure gitrepo is set diff --git a/gitdb/test/test_stream.py b/gitdb/test/test_stream.py index 9626825..7d3eeae 100644 --- a/gitdb/test/test_stream.py +++ b/gitdb/test/test_stream.py @@ -139,9 +139,11 @@ def test_compressed_writer(self): # read everything back, compare to data we zip fd = os.open(path, os.O_RDONLY | getattr(os, 'O_BINARY', 0)) - written_data = os.read(fd, os.path.getsize(path)) - assert len(written_data) == os.path.getsize(path) - os.close(fd) + try: + written_data = os.read(fd, os.path.getsize(path)) + assert len(written_data) == os.path.getsize(path) + finally: + os.close(fd) assert written_data == zlib.compress(data, 1) # best speed os.remove(path) diff 
--git a/gitdb/util.py b/gitdb/util.py
index 242be44..821fc9b 100644
--- a/gitdb/util.py
+++ b/gitdb/util.py
@@ -15,6 +15,7 @@
     SlidingWindowMapManager,
     SlidingWindowMapBuffer
 )
+import logging
 
 # initialize our global memory manager instance
 # Use it to free cached (and unused) resources.
@@ -75,6 +76,8 @@ def unpack_from(fmt, data, offset=0):
 
 #} END Aliases
 
+log = logging.getLogger(__name__)
+
 #{ compatibility stuff ...
 
 
@@ -218,6 +221,40 @@ def to_bin_sha(sha):
 
 #{ Utilities
 
+## Copied from python std-lib.
+class suppress:
+
+    """Context manager to suppress specified exceptions
+
+    After the exception is suppressed, execution proceeds with the next
+    statement following the with statement.
+
+        with suppress(FileNotFoundError):
+            os.remove(somefile)
+        # Execution still resumes here if the file was already removed
+    """
+
+    def __init__(self, *exceptions):
+        self._exceptions = exceptions
+
+    def __enter__(self):
+        pass
+
+    def __exit__(self, exctype, excinst, exctb):
+        # Unlike isinstance and issubclass, CPython exception handling
+        # currently only looks at the concrete type hierarchy (ignoring
+        # the instance and subclass checking hooks). While Guido considers
+        # that a bug rather than a feature, it's a fairly hard one to fix
+        # due to various internal implementation details. suppress provides
+        # the simpler issubclass based semantics, rather than trying to
+        # exactly reproduce the limitations of the CPython interpreter.
+        #
+        # See http://bugs.python.org/issue12029 for more details
+        supp = exctype is not None and issubclass(exctype, self._exceptions)
+        if supp:
+            log.debug("Suppressed exception: %s(%s)", exctype, excinst, exc_info=1)
+        return supp
+
+
 class LazyMixin(object):
 
     """

From 758c29306f509008d80ee84443e04cf6655215ea Mon Sep 17 00:00:00 2001
From: Kostis Anagnostopoulos
Date: Sun, 2 Oct 2016 02:03:13 +0200
Subject: [PATCH 03/21] feat(io): breaking API: retrofit Packers as
 context-managers!

+ Packers MUST be invoked inside `with ...` blocks, or `_cursor` won't exist!
+ Had to drop NotLazy for their hierarchy :-(
+ Count entrances/exits.
+ feat(util): add `rmtree()` for READ_ONLY files on Windows.

3-->2 Windows TCs now fail.
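
For illustration, a rough sketch of driving the reworked packers under the
new contract (the pack path is made up; `PackEntity` itself refuses
re-entry, while the underlying index/pack files merely count nested
entrances and exits):

    from gitdb.pack import PackEntity

    with PackEntity("/objects/pack/pack-deadbeef.pack") as entity:
        with entity.index() as index:   # nested entry only bumps the counter
            for i in range(index.size()):
                binsha = index.sha(i)
                assert entity.is_valid_stream(binsha, use_crc=False)
    # past this point the mmap cursors are destroyed - lookups would fail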
--- gitdb/base.py | 31 ++- gitdb/db/pack.py | 46 ++-- gitdb/pack.py | 308 +++++++++++++++----------- gitdb/stream.py | 127 ++++++++--- gitdb/test/db/test_pack.py | 3 +- gitdb/test/lib.py | 30 ++- gitdb/test/performance/test_pack.py | 43 ++-- gitdb/test/performance/test_stream.py | 18 +- gitdb/test/test_pack.py | 127 ++++++----- gitdb/util.py | 20 ++ 10 files changed, 458 insertions(+), 295 deletions(-) diff --git a/gitdb/base.py b/gitdb/base.py index 03ac1df..60e169d 100644 --- a/gitdb/base.py +++ b/gitdb/base.py @@ -138,8 +138,11 @@ def __enter__(self): return self def __exit__(self, exc_type, exc_value, traceback): - with suppress(): - self.stream.close() + with suppress(Exception): + self.close() + + def close(self): + self.stream.close() def read(self, size=-1): return self.stream.read(size) @@ -182,8 +185,11 @@ def __enter__(self): return self def __exit__(self, exc_type, exc_value, traceback): - with suppress(): - self.stream.close() + with suppress(Exception): + self.close() + + def close(self): + self.stream.close() #{ Stream Reader Interface def read(self, size=-1): @@ -207,8 +213,11 @@ def __enter__(self): return self def __exit__(self, exc_type, exc_value, traceback): - with suppress(): - self.stream.close() + with suppress(Exception): + self.close() + + def close(self): + self.stream.close() #{ Stream Reader Interface def read(self, size=-1): @@ -241,8 +250,11 @@ def __enter__(self): return self def __exit__(self, exc_type, exc_value, traceback): - with suppress(): - self._stream().close() + with suppress(Exception): + self.close() + + def close(self): + self._stream().close() #{ Interface @property @@ -346,4 +358,7 @@ def __enter__(self): def __exit__(self, exc_type, exc_value, traceback): pass + def close(self): + pass + #} END ODB Bases diff --git a/gitdb/db/pack.py b/gitdb/db/pack.py index 12ec43a..4834caf 100644 --- a/gitdb/db/pack.py +++ b/gitdb/db/pack.py @@ -23,7 +23,7 @@ import os import glob -__all__ = ('PackedDB', ) +__all__ = ('PackedDB',) #{ Utilities @@ -70,7 +70,9 @@ def _pack_info(self, sha): # END update sorting for item in self._entities: - index = item[2](sha) + ent = item[1] + with ent.index() as index: + index = index.sha_to_index(sha) if index is not None: item[0] += 1 # one hit for you self._hit_count += 1 # general hit count @@ -95,24 +97,27 @@ def has_object(self, sha): def info(self, sha): entity, index = self._pack_info(sha) - return entity.info_at_index(index) + with entity: + return entity.info_at_index(index) def stream(self, sha): entity, index = self._pack_info(sha) - return entity.stream_at_index(index) + with entity: + return entity.stream_at_index(index) def sha_iter(self): for entity in self.entities(): - index = entity.index() - sha_by_index = index.sha - for index in xrange(index.size()): - yield sha_by_index(index) - # END for each index - # END for each entity + with entity.index() as index: + sha_by_index = index.sha + for index in xrange(index.size()): + yield sha_by_index(index) def size(self): - sizes = [item[1].index().size() for item in self._entities] - return reduce(lambda x, y: x + y, sizes, 0) + sz = 0 + for entity in self.entities(): + with entity.index() as idx: + sz += idx.size() + return sz #} END object db read @@ -152,8 +157,8 @@ def update_cache(self, force=False): for pack_file in (pack_files - our_pack_files): # init the hit-counter/priority with the size, a good measure for hit- # probability. 
Its implemented so that only 12 bytes will be read - entity = PackEntity(pack_file) - self._entities.append([entity.pack().size(), entity, entity.index().sha_to_index]) + with PackEntity(pack_file) as entity: + self._entities.append([entity.pack().size(), entity, entity.index().sha_to_index]) # END for each new packfile # removed packs @@ -187,12 +192,13 @@ def partial_to_complete_sha(self, partial_binsha, canonical_length): :raise BadObject: """ candidate = None for item in self._entities: - item_index = item[1].index().partial_sha_to_index(partial_binsha, canonical_length) - if item_index is not None: - sha = item[1].index().sha(item_index) - if candidate and candidate != sha: - raise AmbiguousObjectName(partial_binsha) - candidate = sha + with item[1] as entity: + item_index = entity.index().partial_sha_to_index(partial_binsha, canonical_length) + if item_index is not None: + sha = entity.index().sha(item_index) + if candidate and candidate != sha: + raise AmbiguousObjectName(partial_binsha) + candidate = sha # END handle full sha could be found # END for each entity diff --git a/gitdb/pack.py b/gitdb/pack.py index a6b3a09..28d973c 100644 --- a/gitdb/pack.py +++ b/gitdb/pack.py @@ -247,7 +247,7 @@ def write(self, pack_sha, write): return sha -class PackIndexFile(LazyMixin): +class PackIndexFile(object): """A pack index provides offsets into the corresponding pack, allowing to find locations for offsets faster.""" @@ -261,54 +261,65 @@ class PackIndexFile(LazyMixin): _sha_list_offset = 8 + 1024 index_v2_signature = b'\xfftOc' index_version_default = 2 + _entered = 0 def __init__(self, indexpath): - super(PackIndexFile, self).__init__() self._indexpath = indexpath - def _set_cache_(self, attr): - if attr == "_packfile_checksum": - self._packfile_checksum = self._cursor.map()[-40:-20] - elif attr == "_packfile_checksum": - self._packfile_checksum = self._cursor.map()[-20:] - elif attr == "_cursor": - # Note: We don't lock the file when reading as we cannot be sure - # that we can actually write to the location - it could be a read-only - # alternate for instance - self._cursor = mman.make_cursor(self._indexpath).use_region() - # We will assume that the index will always fully fit into memory ! - if mman.window_size() > 0 and self._cursor.file_size() > mman.window_size(): - raise AssertionError("The index file at %s is too large to fit into a mapped window (%i > %i). 
This is a limitation of the implementation" % ( - self._indexpath, self._cursor.file_size(), mman.window_size())) - # END assert window size - else: - # now its time to initialize everything - if we are here, someone wants - # to access the fanout table or related properties - - # CHECK VERSION - mmap = self._cursor.map() - self._version = (mmap[:4] == self.index_v2_signature and 2) or 1 - if self._version == 2: - version_id = unpack_from(">L", mmap, 4)[0] - assert version_id == self._version, "Unsupported index version: %i" % version_id - # END assert version - - # SETUP FUNCTIONS - # setup our functions according to the actual version - for fname in ('entry', 'offset', 'sha', 'crc'): - setattr(self, fname, getattr(self, "_%s_v%i" % (fname, self._version))) - # END for each function to initialize - - # INITIALIZE DATA - # byte offset is 8 if version is 2, 0 otherwise - self._initialize() + def __enter__(self): + if not hasattr(self, '_cursor'): + assert self._entered == 0, (self, self._indexpath) + self._prepare_enter() + self._entered += 1 + + return self + + def __exit__(self, exc_type, exc_value, traceback): + self._entered -= 1 + assert self._entered >= 0, (self, self._indexpath) + if self._entered == 0: + self._cursor._destroy() + del self._cursor + del self._fanout_table + + def _prepare_enter(self): + # Note: We don't lock the file when reading as we cannot be sure + # that we can actually write to the location - it could be a read-only + # alternate for instance + self._cursor = mman.make_cursor(self._indexpath).use_region() + # We will assume that the index will always fully fit into memory ! + if mman.window_size() > 0 and self._cursor.file_size() > mman.window_size(): + raise AssertionError("The index file at %s is too large to fit into a mapped window (%i > %i). 
This is a limitation of the implementation" % (
+                self._indexpath, self._cursor.file_size(), mman.window_size()))
+        # END assert window size
+
+        # now its time to initialize everything - if we are here, someone wants
+        # to access the fanout table or related properties
+
+        # CHECK VERSION
+        mmap = self._cursor.map()
+        self._version = (mmap[:4] == self.index_v2_signature and 2) or 1
+        if self._version == 2:
+            version_id = unpack_from(">L", mmap, 4)[0]
+            assert version_id == self._version, "Unsupported index version: %i" % version_id
+        # END assert version
+
+        # SETUP FUNCTIONS
+        # setup our functions according to the actual version
+        for fname in ('entry', 'offset', 'sha', 'crc'):
+            setattr(self, fname, getattr(self, "_%s_v%i" % (fname, self._version)))
+        # END for each function to initialize
+
+        # INITIALIZE DATA
+        # byte offset is 8 if version is 2, 0 otherwise
+        self._initialize()
         # END handle attributes
 
     #{ Access V1
 
     def _entry_v1(self, i):
         """:return: tuple(offset, binsha, 0)"""
-        return unpack_from(">L20s", self._cursor.map(), 1024 + i * 24) + (0, )
+        return unpack_from(">L20s", self._cursor.map(), 1024 + i * 24) + (0,)
 
     def _offset_v1(self, i):
         """see ``_offset_v2``"""
@@ -423,7 +434,7 @@ def sha_to_index(self, sha):
         :param sha: 20 byte sha to lookup"""
         first_byte = byte_ord(sha[0])
         get_sha = self.sha
-        lo = 0                  # lower index, the left bound of the bisection
+        lo = 0
         if first_byte != 0:
             lo = self._fanout_table[first_byte - 1]
         hi = self._fanout_table[first_byte]         # the upper, right bound of the bisection
@@ -503,7 +514,7 @@ def sha_to_index(self, sha):
 
     #} END properties
 
-class PackFile(LazyMixin):
+class PackFile(object):
 
     """A pack is a file written according to the Version 2 for git packs
@@ -516,7 +527,12 @@ class PackFile(LazyMixin):
     for some reason - one clearly doesn't want to read 10GB at once in that case"""
 
-    __slots__ = ('_packpath', '_cursor', '_size', '_version')
+    __slots__ = ('_packpath',
+                 '_cursor',
+                 '_size',
+                 '_version',
+                 '_entered',
+                 )
 
     pack_signature = 0x5041434b         # 'PACK'
     pack_version_default = 2
@@ -526,8 +542,24 @@ class PackFile(LazyMixin):
 
     def __init__(self, packpath):
         self._packpath = packpath
+        self._entered = 0
+
+    def __enter__(self):
+        if not hasattr(self, '_cursor'):
+            assert self._entered == 0, (self, self._packpath)
+            self._prepare_enter()
+        self._entered += 1
 
-    def _set_cache_(self, attr):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self._entered -= 1
+        assert self._entered >= 0, (self, self._packpath)
+        if self._entered == 0:
+            self._cursor._destroy()
+            del self._cursor
+
+    def _prepare_enter(self):
         # we fill the whole cache, whichever attribute gets queried first
         self._cursor = mman.make_cursor(self._packpath).use_region()
@@ -649,14 +681,16 @@ def stream_iter(self, start_offset=0):
 
     #} END Read-Database like Interface
 
-class PackEntity(LazyMixin):
+class PackEntity(object):
 
     """Combines the PackIndexFile and the PackFile into one, allowing the
     actual objects to be resolved and iterated"""
 
-    __slots__ = ('_index',          # our index file
+    __slots__ = ('_basename',
+                 '_index',          # our index file
                  '_pack',           # our pack file
-                 '_offset_map'      # on demand dict mapping one offset to the next consecutive one
+                 '_offset_map',     # on demand dict mapping one offset to the next consecutive one
+                 '_entered',
                  )
 
     IndexFileCls = PackIndexFile
@@ -664,34 +698,56 @@ class PackEntity(LazyMixin):
 
     def __init__(self, pack_or_index_path):
         """Initialize ourselves with the path to the respective pack or index file"""
-        basename, ext = 
os.path.splitext(pack_or_index_path) - self._index = self.IndexFileCls("%s.idx" % basename) # PackIndexFile instance - self._pack = self.PackFileCls("%s.pack" % basename) # corresponding PackFile instance - - def _set_cache_(self, attr): - # currently this can only be _offset_map - # TODO: make this a simple sorted offset array which can be bisected - # to find the respective entry, from which we can take a +1 easily - # This might be slower, but should also be much lighter in memory ! - offsets_sorted = sorted(self._index.offsets()) - last_offset = len(self._pack.data()) - self._pack.footer_size - assert offsets_sorted, "Cannot handle empty indices" - - offset_map = None - if len(offsets_sorted) == 1: - offset_map = {offsets_sorted[0]: last_offset} - else: - iter_offsets = iter(offsets_sorted) - iter_offsets_plus_one = iter(offsets_sorted) - next(iter_offsets_plus_one) - consecutive = izip(iter_offsets, iter_offsets_plus_one) + basename, ext = os.path.splitext(pack_or_index_path) # @UnusedVariable + self._index = self.IndexFileCls("%s.idx" % basename) + self._pack = self.PackFileCls("%s.pack" % basename) + self._entered = False + + def __enter__(self): + if self._entered: + raise ValueError('Re-entered!') + self._index.__enter__() + self._pack.__enter__() + self._entered = True + + return self + + def __exit__(self, exc_type, exc_value, traceback): + if self._index: + self._index.__exit__(exc_type, exc_value, traceback) + if self._pack: + self._pack.__exit__(exc_type, exc_value, traceback) + self._entered = False + + @property + def offset_map(self): + if not hasattr(self, '_offset_map'): + # currently this can only be _offset_map + # TODO: make this a simple sorted offset array which can be bisected + # to find the respective entry, from which we can take a +1 easily + # This might be slower, but should also be much lighter in memory ! 
+ offsets_sorted = sorted(self._index.offsets()) + last_offset = len(self._pack.data()) - self._pack.footer_size + assert offsets_sorted, "Cannot handle empty indices" + + offset_map = None + if len(offsets_sorted) == 1: + offset_map = {offsets_sorted[0]: last_offset} + else: + iter_offsets = iter(offsets_sorted) + iter_offsets_plus_one = iter(offsets_sorted) + next(iter_offsets_plus_one) + consecutive = izip(iter_offsets, iter_offsets_plus_one) - offset_map = dict(consecutive) + offset_map = dict(consecutive) - # the last offset is not yet set - offset_map[offsets_sorted[-1]] = last_offset - # END handle offset amount - self._offset_map = offset_map + # the last offset is not yet set + offset_map[offsets_sorted[-1]] = last_offset + # END handle offset amount + + self._offset_map = offset_map + + return self._offset_map def _sha_to_index(self, sha): """:return: index for the given sha, or raise""" @@ -814,7 +870,7 @@ def is_valid_stream(self, sha, use_crc=False): index = self._sha_to_index(sha) offset = self._index.offset(index) - next_offset = self._offset_map[offset] + next_offset = self.offset_map[offset] crc_value = self._index.crc(index) # create the current crc value, on the compressed object data @@ -835,9 +891,9 @@ def is_valid_stream(self, sha, use_crc=False): return (this_crc_value & 0xffffffff) == crc_value else: shawriter = Sha1Writer() - stream = self._object(sha, as_stream=True) - # write a loose object, which is the basis for the sha - write_object(stream.type, stream.size, stream.read, shawriter.write) + with self._object(sha, as_stream=True) as stream: + # write a loose object, which is the basis for the sha + write_object(stream.type, stream.size, stream.read, shawriter.write) assert shawriter.sha(as_hex=False) == sha return shawriter.sha(as_hex=False) == sha @@ -932,59 +988,59 @@ def write_pack(cls, object_iter, pack_write, index_write=None, object_count = len(objs) # END handle object - pack_writer = FlexibleSha1Writer(pack_write) - pwrite = pack_writer.write - ofs = 0 # current offset into the pack file - index = None - wants_index = index_write is not None + with FlexibleSha1Writer(pack_write) as pack_writer: + pwrite = pack_writer.write + ofs = 0 # current offset into the pack file + index = None + wants_index = index_write is not None - # write header - pwrite(pack('>LLL', PackFile.pack_signature, PackFile.pack_version_default, object_count)) - ofs += 12 + # write header + pwrite(pack('>LLL', PackFile.pack_signature, PackFile.pack_version_default, object_count)) + ofs += 12 - if wants_index: - index = IndexWriter() - # END handle index header - - actual_count = 0 - for obj in objs: - actual_count += 1 - crc = 0 - - # object header - hdr = create_pack_object_header(obj.type_id, obj.size) - if index_write: - crc = crc32(hdr) - else: - crc = None - # END handle crc - pwrite(hdr) - - # data stream - zstream = zlib.compressobj(zlib_compression) - ostream = obj.stream - br, bw, crc = write_stream_to_pack(ostream.read, pwrite, zstream, base_crc=crc) - assert(br == obj.size) if wants_index: - index.append(obj.binsha, crc, ofs) - # END handle index - - ofs += len(hdr) + bw - if actual_count == object_count: - break - # END abort once we are done - # END for each object - - if actual_count != object_count: - raise ValueError( - "Expected to write %i objects into pack, but received only %i from iterators" % (object_count, actual_count)) - # END count assertion - - # write footer - pack_sha = pack_writer.sha(as_hex=False) - assert len(pack_sha) == 20 - pack_write(pack_sha) - 
ofs += len(pack_sha) # just for completeness ;) + index = IndexWriter() + # END handle index header + + actual_count = 0 + for obj in objs: + actual_count += 1 + crc = 0 + + # object header + hdr = create_pack_object_header(obj.type_id, obj.size) + if index_write: + crc = crc32(hdr) + else: + crc = None + # END handle crc + pwrite(hdr) + + # data stream + zstream = zlib.compressobj(zlib_compression) + ostream = obj.stream + br, bw, crc = write_stream_to_pack(ostream.read, pwrite, zstream, base_crc=crc) + assert(br == obj.size) + if wants_index: + index.append(obj.binsha, crc, ofs) + # END handle index + + ofs += len(hdr) + bw + if actual_count == object_count: + break + # END abort once we are done + # END for each object + + if actual_count != object_count: + raise ValueError( + "Expected to write %i objects into pack, but received only %i from iterators" % (object_count, actual_count)) + # END count assertion + + # write footer + pack_sha = pack_writer.sha(as_hex=False) + assert len(pack_sha) == 20 + pack_write(pack_sha) + ofs += len(pack_sha) # just for completeness ;) index_sha = None if wants_index: diff --git a/gitdb/stream.py b/gitdb/stream.py index 3343a86..330f749 100644 --- a/gitdb/stream.py +++ b/gitdb/stream.py @@ -29,7 +29,6 @@ from gitdb.const import NULL_BYTE, BYTE_SPACE from gitdb.utils.compat import buffer -from gitdb.utils.encoding import force_bytes has_perf_mod = False PY26 = sys.version_info[:2] < (2, 7) @@ -65,7 +64,7 @@ class DecompressMemMapReader(LazyMixin): to better support streamed reading - it would only need to keep the mmap and decompress it into chunks, that's all ... """ __slots__ = ('_m', '_zip', '_buf', '_buflen', '_br', '_cws', '_cwe', '_s', '_close', - '_cbr', '_phi') + '_cbr', '_phi', '_entered') max_read_size = 512 * 1024 # currently unused @@ -94,6 +93,18 @@ def _set_cache_(self, attr): def __del__(self): self.close() + def __enter__(self): + if getattr(self, '_entered', None): + raise ValueError('Re-entered!') + self._entered = True + + return self + + def __exit__(self, exc_type, exc_value, traceback): + #with suppress(): + self.close() + del self._entered + def _parse_header_info(self): """If this stream contains object data, parse the header info and skip the stream to a point where each read will yield object content @@ -138,6 +149,8 @@ def new(self, m, close_on_deletion=False): def data(self): """:return: random access compatible data we are working on""" + # if not getattr(self, '_entered', None): + # raise ValueError('Not entered!') return self._m def close(self): @@ -172,6 +185,8 @@ def compressed_bytes_read(self): # We are using a custom zlib module for this, if its not present, # we try to put in additional bytes up for decompression if feasible # and check for the unused_data. + # if not getattr(self, '_entered', None): + # raise ValueError('Not entered!') # Only scrub the stream forward if we are officially done with the # bytes we were to have. 
@@ -203,6 +218,9 @@ def compressed_bytes_read(self): def seek(self, offset, whence=getattr(os, 'SEEK_SET', 0)): """Allows to reset the stream to restart reading :raise ValueError: If offset and whence are not 0""" + # if not getattr(self, '_entered', None): + # raise ValueError('Not entered!') + if offset != 0 or whence != getattr(os, 'SEEK_SET', 0): raise ValueError("Can only seek to position 0") # END handle offset @@ -215,6 +233,9 @@ def seek(self, offset, whence=getattr(os, 'SEEK_SET', 0)): # END skip header def read(self, size=-1): + # if not getattr(self, '_entered', None): + # raise ValueError('Not entered!') + if size < 1: size = self._s - self._br else: @@ -353,7 +374,8 @@ class DeltaApplyReader(LazyMixin): "_dstreams", # tuple of delta stream readers "_mm_target", # memory map of the delta-applied data "_size", # actual number of bytes in _mm_target - "_br" # number of bytes read + "_br", # number of bytes read + "_entered", ) #{ Configuration @@ -370,6 +392,22 @@ def __init__(self, stream_list): self._dstreams = tuple(stream_list[:-1]) self._br = 0 + def __enter__(self): + if getattr(self, '_entered', None): + raise ValueError('Re-entered!') + self._entered = True + + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.close() + del self._entered + + def close(self): + self._bstream.close() + for s in self._dstreams: + s.close() + def _set_cache_too_slow_without_c(self, attr): # the direct algorithm is fastest and most direct if there is only one # delta. Also, the extra overhead might not be worth it for items smaller @@ -487,6 +525,9 @@ def _set_cache_brute_(self, attr): #} END configuration def read(self, count=0): + # if not getattr(self, '_entered', None): + # raise ValueError('Not entered!') + bl = self._size - self._br # bytes left if count < 1 or count > bl: count = bl @@ -500,6 +541,9 @@ def seek(self, offset, whence=getattr(os, 'SEEK_SET', 0)): """Allows to reset the stream to restart reading :raise ValueError: If offset and whence are not 0""" + # if not getattr(self, '_entered', None): + # raise ValueError('Not entered!') + if offset != 0 or whence != getattr(os, 'SEEK_SET', 0): raise ValueError("Can only seek to position 0") # END handle offset @@ -560,7 +604,10 @@ class Sha1Writer(object): """Simple stream writer which produces a sha whenever you like as it degests everything it is supposed to write""" - __slots__ = "sha1" + __slots__ = ( + 'sha1', + '_entered', + ) def __init__(self): self.sha1 = make_sha() @@ -568,15 +615,25 @@ def __init__(self): #{ Stream Interface def __enter__(self): + if getattr(self, '_entered', None): + raise ValueError('Re-entered!') + self._entered = True + return self def __exit__(self, exc_type, exc_value, traceback): + self.close() + del self._entered + + def close(self): pass def write(self, data): """:raise IOError: If not all bytes could be written :param data: byte object :return: length of incoming data""" + # if not getattr(self, '_entered', None): + # raise ValueError('Not entered!') self.sha1.update(data) @@ -600,22 +657,20 @@ class FlexibleSha1Writer(Sha1Writer): """Writer producing a sha1 while passing on the written bytes to the given write function""" - __slots__ = ('writer', '_no_close_writer') + __slots__ = ('writer') - def __init__(self, writer, no_close_writer=False): + def __init__(self, writer): Sha1Writer.__init__(self) self.writer = writer - self._no_close_writer = no_close_writer - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_value, traceback): - if not 
self._no_close_writer: - with suppress(): - self.writer.close() + def close(self): + with suppress(Exception): + self.writer.close() def write(self, data): + # if not getattr(self, '_entered', None): + # raise ValueError('Not entered!') + Sha1Writer.write(self, data) self.writer(data) @@ -630,17 +685,13 @@ def __init__(self): self.buf = BytesIO() self.zip = zlib.compressobj(zlib.Z_BEST_SPEED) - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_value, traceback): - with suppress(): - self.close() - def __getattr__(self, attr): return getattr(self.buf, attr) def write(self, data): + # if not getattr(self, '_entered', None): + # raise ValueError('Not entered!') + alen = Sha1Writer.write(self, data) self.buf.write(self.zip.compress(data)) @@ -681,13 +732,6 @@ def __init__(self, fd): #{ Stream Interface - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_value, traceback): - with suppress(): - self.close() - def write(self, data): """:raise IOError: If not all bytes could be written :return: length of incoming data""" @@ -714,24 +758,37 @@ class FDStream(object): """A simple wrapper providing the most basic functions on a file descriptor with the fileobject interface. Cannot use os.fdopen as the resulting stream takes ownership""" - __slots__ = ("_fd", '_pos') + __slots__ = ('_fd', + '_pos', + '_entered', + ) def __init__(self, fd): self._fd = fd self._pos = 0 def __enter__(self): + if getattr(self, '_entered', None): + raise ValueError('Re-entered!') + self._entered = True return self def __exit__(self, exc_type, exc_value, traceback): with suppress(): self.close() + del self._entered def write(self, data): + # if not getattr(self, '_entered', None): + # raise ValueError('Not entered!') + self._pos += len(data) os.write(self._fd, data) def read(self, count=0): + # if not getattr(self, '_entered', None): + # raise ValueError('Not entered!') + if count == 0: count = os.path.getsize(self._filepath) # END handle read everything @@ -754,21 +811,29 @@ class NullStream(object): """A stream that does nothing but providing a stream interface. 
Use it like /dev/null"""
 
-    __slots__ = tuple()
+    __slots__ = ('_entered',)
 
     def __enter__(self):
+        if getattr(self, '_entered', None):
+            raise ValueError('Re-entered!')
+        self._entered = True
+
         return self
 
     def __exit__(self, exc_type, exc_value, traceback):
-        pass
+        del self._entered
 
     def read(self, size=0):
+        # if not getattr(self, '_entered', None):
+        #     raise ValueError('Not entered!')
         return ''
 
     def close(self):
         pass
 
     def write(self, data):
+        # if not getattr(self, '_entered', None):
+        #     raise ValueError('Not entered!')
         return len(data)
 
diff --git a/gitdb/test/db/test_pack.py b/gitdb/test/db/test_pack.py
index a901581..b361a59 100644
--- a/gitdb/test/db/test_pack.py
+++ b/gitdb/test/db/test_pack.py
@@ -51,7 +51,8 @@ def test_writing(self, path):
 
         for sha in sha_list:
             pdb.info(sha)
-            pdb.stream(sha)
+            with pdb.stream(sha):
+                pass
         # END for each sha to query
 
         # test short finding - be a bit more brutal here
diff --git a/gitdb/test/lib.py b/gitdb/test/lib.py
index 235d032..8b5cb02 100644
--- a/gitdb/test/lib.py
+++ b/gitdb/test/lib.py
@@ -3,27 +3,25 @@
 # This module is part of GitDB and is released under
 # the New BSD License: http://www.opensource.org/licenses/bsd-license.php
 """Utilities used in ODB testing"""
-from gitdb import OStream
-from gitdb.utils.compat import xrange
-
-import sys
-import random
 from array import array
-
-from io import BytesIO
-
-import glob
-import unittest
-import tempfile
-import shutil
-import os
+from functools import wraps
 import gc
+import glob
+from io import BytesIO
 import logging
-from functools import wraps
+import os
+import random
+import shutil
+import sys
+import tempfile
+import unittest
+
+from gitdb import OStream
+from gitdb.util import rmtree
+from gitdb.utils.compat import xrange
 
 #{ Bases
-
 log = logging.getLogger(__name__)
@@ -99,7 +97,7 @@ def wrapper(self):
             # though this is not the case here unless we collect explicitly.
             if not keep:
                 gc.collect()
-            shutil.rmtree(path)
+            rmtree(path)
         # END handle exception
     # END wrapper
diff --git a/gitdb/test/performance/test_pack.py b/gitdb/test/performance/test_pack.py
index fc8d9d5..b99b91e 100644
--- a/gitdb/test/performance/test_pack.py
+++ b/gitdb/test/performance/test_pack.py
@@ -92,19 +92,20 @@ def test_loose_correctness(self):
         pdb = GitDB(os.path.join(self.gitrepopath, 'objects'))
         mdb = MemoryDB()
         for c, sha in enumerate(pdb.sha_iter()):
-            ostream = pdb.stream(sha)
-            # the issue only showed on larger files which are hardly compressible ...
-            if ostream.type != str_blob_type:
-                continue
-            istream = IStream(ostream.type, ostream.size, ostream.stream)
-            mdb.store(istream)
-            assert istream.binsha == sha, "Failed on object %s" % bin_to_hex(sha).decode('ascii')
-            # this can fail ... sometimes, so the packs dataset should be huge
-            assert len(mdb.stream(sha).read()) == ostream.size
+            with pdb.stream(sha) as ostream:
+                # the issue only showed on larger files which are hardly compressible ...
+                if ostream.type != str_blob_type:
+                    continue
+                istream = IStream(ostream.type, ostream.size, ostream.stream)
+                mdb.store(istream)
+                assert istream.binsha == sha, "Failed on object %s" % bin_to_hex(sha).decode('ascii')
+                # this can fail ... 
sometimes, so the packs dataset should be huge + with mdb.stream(sha) as ost2: + assert len(ost2.read()) == ostream.size - if c and c % 1000 == 0: - print("Verified %i loose object compression/decompression cycles" % c, file=sys.stderr) - mdb._cache.clear() + if c and c % 1000 == 0: + print("Verified %i loose object compression/decompression cycles" % c, file=sys.stderr) + mdb._cache.clear() # end for each sha to copy @skip_on_travis_ci @@ -116,14 +117,16 @@ def test_correctness(self): count = 0 st = time() for entity in pdb.entities(): - pack_verify = entity.is_valid_stream - sha_by_index = entity.index().sha - for index in xrange(entity.index().size()): - try: - assert pack_verify(sha_by_index(index), use_crc=crc) - count += 1 - except UnsupportedOperation: - pass + with entity: + pack_verify = entity.is_valid_stream + idx = entity.index() + sha_by_index = idx.sha + for index in xrange(idx.size()): + try: + assert pack_verify(sha_by_index(index), use_crc=crc) + count += 1 + except UnsupportedOperation: + pass # END ignore old indices # END for each index # END for each entity diff --git a/gitdb/test/performance/test_stream.py b/gitdb/test/performance/test_stream.py index 704f4d0..c9b0f15 100644 --- a/gitdb/test/performance/test_stream.py +++ b/gitdb/test/performance/test_stream.py @@ -74,9 +74,9 @@ def test_large_data_streaming(self, path): # reading all at once st = time() - ostream = ldb.stream(sha) - shadata = ostream.read() - elapsed_readall = time() - st + with ldb.stream(sha) as ostream: + shadata = ostream.read() + elapsed_readall = time() - st stream.seek(0) assert shadata == stream.getvalue() @@ -87,12 +87,12 @@ def test_large_data_streaming(self, path): cs = 512 * 1000 chunks = list() st = time() - ostream = ldb.stream(sha) - while True: - data = ostream.read(cs) - chunks.append(data) - if len(data) < cs: - break + with ldb.stream(sha) as ostream: + while True: + data = ostream.read(cs) + chunks.append(data) + if len(data) < cs: + break # END read in chunks elapsed_readchunks = time() - st diff --git a/gitdb/test/test_pack.py b/gitdb/test/test_pack.py index 6e31363..7484193 100644 --- a/gitdb/test/test_pack.py +++ b/gitdb/test/test_pack.py @@ -112,13 +112,14 @@ def _assert_pack_file(self, pack, version, size): continue # END get deltastream - # read all - data = dstream.read() - assert len(data) == dstream.size + with dstream: + # read all + data = dstream.read() + assert len(data) == dstream.size - # test seek - dstream.seek(0) - assert dstream.read() == data + # test seek + dstream.seek(0) + assert dstream.read() == data # read chunks # NOTE: the current implementation is safe, it basically transfers @@ -130,15 +131,15 @@ def _assert_pack_file(self, pack, version, size): def test_pack_index(self): # check version 1 and 2 for indexfile, version, size in (self.packindexfile_v1, self.packindexfile_v2): - index = PackIndexFile(indexfile) - self._assert_index_file(index, version, size) + with PackIndexFile(indexfile) as index: + self._assert_index_file(index, version, size) # END run tests def test_pack(self): # there is this special version 3, but apparently its like 2 ... 
for packfile, version, size in (self.packfile_v2_3_ascii, self.packfile_v2_1, self.packfile_v2_2): - pack = PackFile(packfile) - self._assert_pack_file(pack, version, size) + with PackFile(packfile) as pack: + self._assert_pack_file(pack, version, size) # END for each pack to test @with_rw_directory @@ -147,42 +148,43 @@ def test_pack_entity(self, rw_dir): for packinfo, indexinfo in ((self.packfile_v2_1, self.packindexfile_v1), (self.packfile_v2_2, self.packindexfile_v2), (self.packfile_v2_3_ascii, self.packindexfile_v2_3_ascii)): - packfile, version, size = packinfo - indexfile, version, size = indexinfo - entity = PackEntity(packfile) - assert entity.pack().path() == packfile - assert entity.index().path() == indexfile - pack_objs.extend(entity.stream_iter()) - - count = 0 - for info, stream in izip(entity.info_iter(), entity.stream_iter()): - count += 1 - assert info.binsha == stream.binsha - assert len(info.binsha) == 20 - assert info.type_id == stream.type_id - assert info.size == stream.size - - # we return fully resolved items, which is implied by the sha centric access - assert not info.type_id in delta_types - - # try all calls - assert len(entity.collect_streams(info.binsha)) - oinfo = entity.info(info.binsha) - assert isinstance(oinfo, OInfo) - assert oinfo.binsha is not None - ostream = entity.stream(info.binsha) - assert isinstance(ostream, OStream) - assert ostream.binsha is not None - - # verify the stream - try: - assert entity.is_valid_stream(info.binsha, use_crc=True) - except UnsupportedOperation: - pass - # END ignore version issues - assert entity.is_valid_stream(info.binsha, use_crc=False) - # END for each info, stream tuple - assert count == size + packfile, version, size = packinfo # @UnusedVariable + indexfile, version, size = indexinfo # @UnusedVariable + with PackEntity(packfile) as entity: + assert entity.pack().path() == packfile + assert entity.index().path() == indexfile + pack_objs.extend(entity.stream_iter()) + + count = 0 + for info, stream in izip(entity.info_iter(), entity.stream_iter()): + with stream: + count += 1 + assert info.binsha == stream.binsha + assert len(info.binsha) == 20 + assert info.type_id == stream.type_id + assert info.size == stream.size + + # we return fully resolved items, which is implied by the sha centric access + assert info.type_id not in delta_types + + # try all calls + assert len(entity.collect_streams(info.binsha)) + oinfo = entity.info(info.binsha) + assert isinstance(oinfo, OInfo) + assert oinfo.binsha is not None + with entity.stream(info.binsha) as ostream: + assert isinstance(ostream, OStream) + assert ostream.binsha is not None + + # verify the stream + try: + assert entity.is_valid_stream(info.binsha, use_crc=True) + except UnsupportedOperation: + pass + # END ignore version issues + assert entity.is_valid_stream(info.binsha, use_crc=False) + # END for each info, stream tuple + assert count == size # END for each entity @@ -217,33 +219,30 @@ def rewind_streams(): assert os.path.getsize(ppath) > 100 # verify pack - pf = PackFile(ppath) # FIXME: Leaks file-pointer(s)! - assert pf.size() == len(pack_objs) - assert pf.version() == PackFile.pack_version_default - assert pf.checksum() == pack_sha - + with PackFile(ppath) as pf: # FIXME: Leaks file-pointer(s)! 
+            assert pf.size() == len(pack_objs)
+            assert pf.version() == PackFile.pack_version_default
+            assert pf.checksum() == pack_sha
 
         # verify index
         if ipath is not None:
             ifile.close()
             assert os.path.getsize(ipath) > 100
-            idx = PackIndexFile(ipath)
-            assert idx.version() == PackIndexFile.index_version_default
-            assert idx.packfile_checksum() == pack_sha
-            assert idx.indexfile_checksum() == index_sha
-            assert idx.size() == len(pack_objs)
+            with PackIndexFile(ipath) as idx:
+                assert idx.version() == PackIndexFile.index_version_default
+                assert idx.packfile_checksum() == pack_sha
+                assert idx.indexfile_checksum() == index_sha
+                assert idx.size() == len(pack_objs)
         # END verify files exist
         # END for each packpath, indexpath pair
 
         # verify the packs thoroughly
         rewind_streams()
-        entity = PackEntity.create(pack_objs, rw_dir)
-        count = 0
-        for info in entity.info_iter():
-            count += 1
-            for use_crc in range(2):
-                assert entity.is_valid_stream(info.binsha, use_crc)
-            # END for each crc mode
-        # END for each info
+        with PackEntity.create(pack_objs, rw_dir) as entity:
+            count = 0
+            for info in entity.info_iter():
+                count += 1
+                for use_crc in range(2):
+                    assert entity.is_valid_stream(info.binsha, use_crc)
         assert count == len(pack_objs)
 
     def test_pack_64(self):
diff --git a/gitdb/util.py b/gitdb/util.py
index 821fc9b..e6ed8a3 100644
--- a/gitdb/util.py
+++ b/gitdb/util.py
@@ -16,6 +16,8 @@
     SlidingWindowMapBuffer
 )
 import logging
+import stat
+import shutil
 
 # initialize our global memory manager instance
 # Use it to free cached (and unused) resources.
@@ -118,6 +120,24 @@ def byte_ord(b):
 
 #{ Routines
 
+def rmtree(path):
+    """Remove the given path recursively.
+
+    :note: we use shutil rmtree but adjust its behaviour to see whether files that
+        couldn't be deleted are read-only. Windows will not remove them in that case"""
+
+    def onerror(func, path, exc_info):
+        # Is the error an access error ?
+        os.chmod(path, stat.S_IWUSR)
+
+        try:
+            func(path)  # Will scream if still not possible to delete.
+        except Exception:
+            raise
+
+    return shutil.rmtree(path, False, onerror)
+
+
 def make_sha(source=''.encode("ascii")):
     """A python2.4 workaround for the sha/hashlib module fiasco

From ba10cf1089b6dfc43a2dea698994dcae26acf291 Mon Sep 17 00:00:00 2001
From: Kostis Anagnostopoulos
Date: Mon, 24 Oct 2016 14:43:57 +0200
Subject: [PATCH 04/21] chore(ci): depend on "leaks" smmap branch

---
 .appveyor.yml | 1 +
 .travis.yml   | 1 +
 2 files changed, 2 insertions(+)

diff --git a/.appveyor.yml b/.appveyor.yml
index 2daadaa..734bf72 100644
--- a/.appveyor.yml
+++ b/.appveyor.yml
@@ -38,6 +38,7 @@ install:
       git config --global user.name "Travis Runner"
   - pip install -e .
+ - pip install -I git+https://github.com/ankostis/smmap.git@leaks build: false diff --git a/.travis.yml b/.travis.yml index ba0beaa..b7fc9e0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -12,6 +12,7 @@ git: depth: 1000 install: - pip install coveralls + - pip install -I git+https://github.com/ankostis/smmap.git@leaks script: - ulimit -n 48 - ulimit -n From b199bd5932e2bb838d75d94b05fd2dba2fc954f4 Mon Sep 17 00:00:00 2001 From: Kostis Anagnostopoulos Date: Mon, 24 Oct 2016 15:03:32 +0200 Subject: [PATCH 05/21] chore(ci): depend on "leaks" smmap branch + stop importing git-submodules for gitdb & smmap --- gitdb/__init__.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/gitdb/__init__.py b/gitdb/__init__.py index bfa083b..c718c33 100644 --- a/gitdb/__init__.py +++ b/gitdb/__init__.py @@ -7,25 +7,6 @@ import sys import os -#{ Initialization - - -def _init_externals(): - """Initialize external projects by putting them into the path""" - for module in ('smmap',): - sys.path.append(os.path.join(os.path.dirname(__file__), 'ext', module)) - - try: - __import__(module) - except ImportError: - raise ImportError("'%s' could not be imported, assure it is located in your PYTHONPATH" % module) - # END verify import - # END handel imports - -#} END initialization - -_init_externals() - __author__ = "Sebastian Thiel" __contact__ = "byronimo@gmail.com" __homepage__ = "https://github.com/gitpython-developers/gitdb" From f0988ccd137dd4208161a9684116a18c829c3b74 Mon Sep 17 00:00:00 2001 From: Kostis Anagnostopoulos Date: Mon, 24 Oct 2016 15:11:00 +0200 Subject: [PATCH 06/21] chore(gitdb): actually delete submodule from sources --- .gitmodules | 3 --- gitdb/ext/smmap | 1 - 2 files changed, 4 deletions(-) delete mode 160000 gitdb/ext/smmap diff --git a/.gitmodules b/.gitmodules index d85b15c..e69de29 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +0,0 @@ -[submodule "smmap"] - path = gitdb/ext/smmap - url = https://github.com/Byron/smmap.git diff --git a/gitdb/ext/smmap b/gitdb/ext/smmap deleted file mode 160000 index ac5df70..0000000 --- a/gitdb/ext/smmap +++ /dev/null @@ -1 +0,0 @@ -Subproject commit ac5df7061ee11232346b3d0eb3aa5b43eebc847d From 08b1f5f4fdc95d4ce24aa33ec82ac0d9723b8a02 Mon Sep 17 00:00:00 2001 From: Kostis Anagnostopoulos Date: Mon, 24 Oct 2016 15:12:33 +0200 Subject: [PATCH 07/21] chore(ver): bump 2.0.0-->2.1.0.dev0 --- .appveyor.yml | 2 +- .travis.yml | 2 +- gitdb/__init__.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index 734bf72..3d8a678 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -38,7 +38,7 @@ install: git config --global user.name "Travis Runner" - pip install -e . 
-  - pip install -I git+https://github.com/ankostis/smmap.git@leaks
+  - pip install -I git+https://github.com/ankostis/smmap.git@v2.1.0.dev0
 
 build: false
 
diff --git a/.travis.yml b/.travis.yml
index b7fc9e0..a29b64d 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -12,7 +12,7 @@ git:
   depth: 1000
 install:
   - pip install coveralls
-  - pip install -I git+https://github.com/ankostis/smmap.git@leaks
+  - pip install -I git+https://github.com/ankostis/smmap.git@v2.1.0.dev0
 script:
   - ulimit -n 48
   - ulimit -n
diff --git a/gitdb/__init__.py b/gitdb/__init__.py
index c718c33..68c882b 100644
--- a/gitdb/__init__.py
+++ b/gitdb/__init__.py
@@ -10,7 +10,7 @@
 __author__ = "Sebastian Thiel"
 __contact__ = "byronimo@gmail.com"
 __homepage__ = "https://github.com/gitpython-developers/gitdb"
-version_info = (2, 0, 0)
+version_info = (2, 1, 0, 'dev0')
 __version__ = '.'.join(str(i) for i in version_info)
 

From 534c9bbe320f638153f9ffd79b79fa124b544d0f Mon Sep 17 00:00:00 2001
From: Kostis Anagnostopoulos
Date: Mon, 24 Oct 2016 17:32:06 +0200
Subject: [PATCH 08/21] fix(win): FIX and HIDE 2 win-errors remaining

+ File-in-use errors were fixed with `gitdb.util.mman.collect()`!
+ This call is disabled when `gitdb.util.HIDE_WINDOWS_KNOWN_ERRORS == False`.
+ Depend on latest smmap `v2.1.0.dev1` tag.
---
 .appveyor.yml                                 |  2 +-
 .travis.yml                                   |  2 +-
 gitdb/db/loose.py                             |  7 +-
 gitdb/db/pack.py                              |  4 +-
 gitdb/pack.py                                 |  2 +-
 gitdb/stream.py                               |  5 +-
 gitdb/test/db/lib.py                          | 10 +--
 gitdb/test/db/test_git.py                     |  3 +-
 gitdb/test/db/test_pack.py                    | 10 +++
 gitdb/test/lib.py                             |  9 ++-
 gitdb/test/performance/test_pack_streaming.py |  5 +-
 gitdb/test/test_example.py                    | 21 ++---
 gitdb/test/test_pack.py                       | 77 ++++++++++---------
 gitdb/test/test_stream.py                     | 17 ++--
 gitdb/util.py                                 | 35 +++++----
 15 files changed, 119 insertions(+), 90 deletions(-)

diff --git a/.appveyor.yml b/.appveyor.yml
index 3d8a678..d7b2550 100644
--- a/.appveyor.yml
+++ b/.appveyor.yml
@@ -38,7 +38,7 @@ install:
      git config --global user.name "Travis Runner"
 
  - pip install -e .
- - pip install -I git+https://github.com/ankostis/smmap.git@v2.1.0.dev0 + - pip install -I git+https://github.com/ankostis/smmap.git@v2.1.0.dev1 build: false diff --git a/.travis.yml b/.travis.yml index a29b64d..66400ae 100644 --- a/.travis.yml +++ b/.travis.yml @@ -12,7 +12,7 @@ git: depth: 1000 install: - pip install coveralls - - pip install -I git+https://github.com/ankostis/smmap.git@v2.1.0.dev0 + - pip install -I git+https://github.com/ankostis/smmap.git@v2.1.0.dev1 script: - ulimit -n 48 - ulimit -n diff --git a/gitdb/db/loose.py b/gitdb/db/loose.py index 374cdc7..1338e83 100644 --- a/gitdb/db/loose.py +++ b/gitdb/db/loose.py @@ -40,7 +40,8 @@ rename, dirname, basename, - join + join, + is_win, ) from gitdb.fun import ( @@ -71,7 +72,7 @@ class LooseObjectDB(FileDBBase, ObjectDBR, ObjectDBW): # On windows we need to keep it writable, otherwise it cannot be removed # either new_objects_mode = int("444", 8) - if os.name == 'nt': + if is_win: new_objects_mode = int("644", 8) def __init__(self, root_path): @@ -226,7 +227,7 @@ def store(self, istream): mkdir(obj_dir) # END handle destination directory # rename onto existing doesn't work on windows - if os.name == 'nt': + if is_win: if isfile(obj_path): remove(tmp_path) else: diff --git a/gitdb/db/pack.py b/gitdb/db/pack.py index 4834caf..95eb564 100644 --- a/gitdb/db/pack.py +++ b/gitdb/db/pack.py @@ -43,13 +43,13 @@ def __init__(self, root_path): # * hits - number of times the pack was hit with a request # * entity - Pack entity instance # * sha_to_index - PackIndexFile.sha_to_index method for direct cache query - # self._entities = list() # lazy loaded list + # self._entities = [] # lazy loaded list self._hit_count = 0 # amount of hits self._st_mtime = 0 # last modification data of our root path def _set_cache_(self, attr): if attr == '_entities': - self._entities = list() + self._entities = [] self.update_cache(force=True) # END handle entities initialization diff --git a/gitdb/pack.py b/gitdb/pack.py index 28d973c..fc67ffc 100644 --- a/gitdb/pack.py +++ b/gitdb/pack.py @@ -40,7 +40,7 @@ pass # END try c module -from gitdb.base import ( # Amazing ! +from gitdb.base import ( OInfo, OStream, OPackInfo, diff --git a/gitdb/stream.py b/gitdb/stream.py index 330f749..1c1df23 100644 --- a/gitdb/stream.py +++ b/gitdb/stream.py @@ -25,6 +25,7 @@ write, close, suppress, + is_darwin, ) from gitdb.const import NULL_BYTE, BYTE_SPACE @@ -318,7 +319,7 @@ def read(self, size=-1): # However, the zlib VERSIONs as well as the platform check is used to further match the entries in the # table in the github issue. This is it ... it was the only way I could make this work everywhere. # IT's CERTAINLY GOING TO BITE US IN THE FUTURE ... . - if PY26 or ((zlib.ZLIB_VERSION == '1.2.7' or zlib.ZLIB_VERSION == '1.2.5') and not sys.platform == 'darwin'): + if PY26 or ((zlib.ZLIB_VERSION == '1.2.7' or zlib.ZLIB_VERSION == '1.2.5') and not is_darwin): unused_datalen = len(self._zip.unconsumed_tail) else: unused_datalen = len(self._zip.unconsumed_tail) + len(self._zip.unused_data) @@ -447,7 +448,7 @@ def _set_cache_brute_(self, attr): # TODO: There should be a special case if there is only one stream # Then the default-git algorithm should perform a tad faster, as the # delta is not peaked into, causing less overhead. 
- buffer_info_list = list() + buffer_info_list = [] max_target_size = 0 for dstream in self._dstreams: buf = dstream.read(512) # read the header information + X diff --git a/gitdb/test/db/lib.py b/gitdb/test/db/lib.py index 528bcc1..d7365ea 100644 --- a/gitdb/test/db/lib.py +++ b/gitdb/test/db/lib.py @@ -39,7 +39,7 @@ class TestDBBase(TestBase): # data two_lines = b'1234\nhello world' - all_data = (two_lines, ) + all_data = (two_lines,) def _assert_object_writing_simple(self, db): # write a bunch of objects and query their streams and info @@ -56,10 +56,10 @@ def _assert_object_writing_simple(self, db): assert isinstance(info, OInfo) assert info.type == istream.type and info.size == istream.size - stream = db.stream(istream.binsha) - assert isinstance(stream, OStream) - assert stream.binsha == info.binsha and stream.type == info.type - assert stream.read() == data + with db.stream(istream.binsha) as stream: + assert isinstance(stream, OStream) + assert stream.binsha == info.binsha and stream.type == info.type + assert stream.read() == data # END for each item assert db.size() == null_objs + ni diff --git a/gitdb/test/db/test_git.py b/gitdb/test/db/test_git.py index acc0f15..b637c13 100644 --- a/gitdb/test/db/test_git.py +++ b/gitdb/test/db/test_git.py @@ -24,7 +24,8 @@ def test_reading(self): # access should be possible gitdb_sha = next(gdb.sha_iter()) assert isinstance(gdb.info(gitdb_sha), OInfo) - assert isinstance(gdb.stream(gitdb_sha), OStream) + with gdb.stream(gitdb_sha) as stream: + assert isinstance(gdb.stream(gitdb_sha), OStream) ni = 50 assert gdb.size() >= ni sha_list = list(gdb.sha_iter()) diff --git a/gitdb/test/db/test_pack.py b/gitdb/test/db/test_pack.py index b361a59..f7e631e 100644 --- a/gitdb/test/db/test_pack.py +++ b/gitdb/test/db/test_pack.py @@ -13,10 +13,16 @@ import os import random +from gitdb.util import mman, HIDE_WINDOWS_KNOWN_ERRORS class TestPackDB(TestDBBase): + ## Unless HIDE_WINDOWS_KNOWN_ERRORS, on Windows fails with: + # File "D:\Work\gitdb.git\gitdb\test\db\test_pack.py", line 41, in test_writing + # os.rename(pack_path, new_pack_path) + # PermissionError: [WinError 32] The process cannot access the file + # because it is being used by another process: 'pack-c0438c19fb16422b6bbcce24387b3264416d485b.packrenamed' @with_rw_directory @with_packs_rw def test_writing(self, path): @@ -30,6 +36,10 @@ def test_writing(self, path): # packs removed - rename a file, should affect the glob pack_path = pdb.entities()[0].pack().path() new_pack_path = pack_path + "renamed" + ## FIXME: Had to manually collect leaked files!! + if HIDE_WINDOWS_KNOWN_ERRORS: + leaked_mmaps = mman.collect() + self.assertEqual(leaked_mmaps, 6) os.rename(pack_path, new_pack_path) pdb.update_cache(force=True) diff --git a/gitdb/test/lib.py b/gitdb/test/lib.py index 8b5cb02..0017031 100644 --- a/gitdb/test/lib.py +++ b/gitdb/test/lib.py @@ -17,7 +17,7 @@ import unittest from gitdb import OStream -from gitdb.util import rmtree +from gitdb.util import rmtree, mman, HIDE_WINDOWS_KNOWN_ERRORS from gitdb.utils.compat import xrange @@ -96,6 +96,13 @@ def wrapper(self): # memory maps closed, once objects go out of scope. For some reason # though this is not the case here unless we collect explicitly. if not keep: + if HIDE_WINDOWS_KNOWN_ERRORS: + ## Or else 2 Windows TCs fail with: + # File "D:\Work\gitdb.git\gitdb\util.py", line 141, in onerror + # func(path) # Will scream if still not possible to delete. 
+ # PermissionError: [WinError 32] The process cannot access the file + # because it is being used by another process: 'sss\\index_cc_wll5' + mman.collect() gc.collect() rmtree(path) # END handle exception diff --git a/gitdb/test/performance/test_pack_streaming.py b/gitdb/test/performance/test_pack_streaming.py index 76f0f4a..21a7532 100644 --- a/gitdb/test/performance/test_pack_streaming.py +++ b/gitdb/test/performance/test_pack_streaming.py @@ -46,7 +46,8 @@ def test_pack_writing(self): st = time() for sha in pdb.sha_iter(): count += 1 - pdb.stream(sha) + with pdb.stream(sha): + pass if count == ni: break # END gather objects for pack-writing @@ -55,6 +56,8 @@ def test_pack_writing(self): (ni, elapsed, ni / (elapsed or 1)), file=sys.stderr) st = time() + ## We are leaking files here, but we don't care... + # and we need a `contextlib.ExitStack` to safely close them. PackEntity.write_pack((pdb.stream(sha) for sha in pdb.sha_iter()), ostream.write, object_count=ni) elapsed = time() - st total_kb = ostream.bytes_written() / 1000 diff --git a/gitdb/test/test_example.py b/gitdb/test/test_example.py index 6e80bf5..0bf6d1a 100644 --- a/gitdb/test/test_example.py +++ b/gitdb/test/test_example.py @@ -18,26 +18,19 @@ def test_base(self): for sha1 in ldb.sha_iter(): oinfo = ldb.info(sha1) - ostream = ldb.stream(sha1) - assert oinfo[:3] == ostream[:3] + with ldb.stream(sha1) as ostream: + assert oinfo[:3] == ostream[:3] - assert len(ostream.read()) == ostream.size + assert len(ostream.read()) == ostream.size assert ldb.has_object(oinfo.binsha) # END for each sha in database - # assure we close all files - try: - del(ostream) - del(oinfo) - except UnboundLocalError: - pass - # END ignore exception if there are no loose objects data = "my data".encode("ascii") istream = IStream("blob", len(data), BytesIO(data)) # the object does not yet have a sha assert istream.binsha is None - ldb.store(istream) - # now the sha is set - assert len(istream.binsha) == 20 - assert ldb.has_object(istream.binsha) + with ldb.store(istream): + # now the sha is set + assert len(istream.binsha) == 20 + assert ldb.has_object(istream.binsha) diff --git a/gitdb/test/test_pack.py b/gitdb/test/test_pack.py index 7484193..4f259cb 100644 --- a/gitdb/test/test_pack.py +++ b/gitdb/test/test_pack.py @@ -88,42 +88,42 @@ def _assert_pack_file(self, pack, version, size): num_obj = 0 for obj in pack.stream_iter(): - num_obj += 1 - info = pack.info(obj.pack_offset) - stream = pack.stream(obj.pack_offset) - - assert info.pack_offset == stream.pack_offset - assert info.type_id == stream.type_id - assert hasattr(stream, 'read') - - # it should be possible to read from both streams - assert obj.read() == stream.read() - - streams = pack.collect_streams(obj.pack_offset) - assert streams - - # read the stream - try: - dstream = DeltaApplyReader.new(streams) - except ValueError: - # ignore these, old git versions use only ref deltas, - # which we havent resolved ( as we are without an index ) - # Also ignore non-delta streams - continue - # END get deltastream - - with dstream: - # read all - data = dstream.read() - assert len(data) == dstream.size - - # test seek - dstream.seek(0) - assert dstream.read() == data - - # read chunks - # NOTE: the current implementation is safe, it basically transfers - # all calls to the underlying memory map + with obj: + num_obj += 1 + info = pack.info(obj.pack_offset) + with pack.stream(obj.pack_offset) as stream: + assert info.pack_offset == stream.pack_offset + assert info.type_id == stream.type_id + assert 
hasattr(stream, 'read') + + # it should be possible to read from both streams + assert obj.read() == stream.read() + + streams = pack.collect_streams(obj.pack_offset) + assert streams + + # read the stream + try: + dstream = DeltaApplyReader.new(streams) + except ValueError: + # ignore these, old git versions use only ref deltas, + # which we havent resolved ( as we are without an index ) + # Also ignore non-delta streams + continue + # END get deltastream + + with dstream: + # read all + data = dstream.read() + assert len(data) == dstream.size + + # test seek + dstream.seek(0) + assert dstream.read() == data + + # read chunks + # NOTE: the current implementation is safe, it basically transfers + # all calls to the underlying memory map # END for each object assert num_obj == size @@ -142,6 +142,11 @@ def test_pack(self): self._assert_pack_file(pack, version, size) # END for each pack to test + ## Unless HIDE_WINDOWS_KNOWN_ERRORS, on Windows fails with: + # File "D:\Work\gitdb.git\gitdb\util.py", line 141, in onerror + # func(path) # Will scream if still not possible to delete. + # PermissionError: [WinError 32] The process cannot access the file + # because it is being used by another process: 'sss\\index_cc_wll5' @with_rw_directory def test_pack_entity(self, rw_dir): pack_objs = list() diff --git a/gitdb/test/test_stream.py b/gitdb/test/test_stream.py index 7d3eeae..9bc3ca5 100644 --- a/gitdb/test/test_stream.py +++ b/gitdb/test/test_stream.py @@ -154,13 +154,12 @@ def test_decompress_reader_special_case(self): mdb = MemoryDB() for sha in (b'888401851f15db0eed60eb1bc29dec5ddcace911', b'7bb839852ed5e3a069966281bb08d50012fb309b',): - ostream = odb.stream(hex_to_bin(sha)) - - # if there is a bug, we will be missing one byte exactly ! - data = ostream.read() - assert len(data) == ostream.size - - # Putting it back in should yield nothing new - after all, we have - dump = mdb.store(IStream(ostream.type, ostream.size, BytesIO(data))) - assert dump.hexsha == sha + with odb.stream(hex_to_bin(sha)) as ostream: + # if there is a bug, we will be missing one byte exactly ! + data = ostream.read() + assert len(data) == ostream.size + + # Putting it back in should yield nothing new - after all, we have + dump = mdb.store(IStream(ostream.type, ostream.size, BytesIO(data))) + assert dump.hexsha == sha # end for each loose object sha to test diff --git a/gitdb/util.py b/gitdb/util.py index e6ed8a3..8a20605 100644 --- a/gitdb/util.py +++ b/gitdb/util.py @@ -3,21 +3,27 @@ # This module is part of GitDB and is released under # the New BSD License: http://www.opensource.org/licenses/bsd-license.php import binascii -import os -import mmap -import sys import errno - +import hashlib from io import BytesIO +import logging +import mmap +import os +import shutil +import stat +import sys from smmap import ( StaticWindowMapManager, SlidingWindowMapManager, SlidingWindowMapBuffer ) -import logging -import stat -import shutil + +from gitdb.const import ( + NULL_BIN_SHA, + NULL_HEX_SHA +) + # initialize our global memory manager instance # Use it to free cached (and unused) resources. 
@@ -27,7 +33,6 @@
 mman = SlidingWindowMapManager()
 # END handle mman
 
-import hashlib
 
 try:
     from struct import unpack_from
@@ -70,16 +75,20 @@ def unpack_from(fmt, data, offset=0):
 close = os.close
 fsync = os.fsync
 
+is_win = (os.name == 'nt')
+is_darwin = (sys.platform == 'darwin')
+
 # Backwards compatibility imports
-from gitdb.const import (
-    NULL_BIN_SHA,
-    NULL_HEX_SHA
-)
 
 #} END Aliases
 
 log = logging.getLogger(__name__)
 
+#: We need an easy way to see if Appveyor TCs start failing,
+#: so the errors marked with this var are considered "acknowledged" ones, awaiting remedy,
+#: till then, we wish to hide them.
+HIDE_WINDOWS_KNOWN_ERRORS = is_win and os.environ.get('HIDE_WINDOWS_KNOWN_ERRORS', True)
+
 #{ compatibility stuff ...
 
 
@@ -420,7 +429,7 @@ def _end_writing(self, successful=True):
         lockfile = self._lockfilepath()
         if self._write and successful:
             # on windows, rename does not silently overwrite the existing one
-            if sys.platform == "win32":
+            if is_win:
                 if isfile(self._filepath):
                     os.remove(self._filepath)
                 # END remove if exists

From 47e88843942c445ca6672c53fb53ad3c718e7e1f Mon Sep 17 00:00:00 2001
From: Kostis Anagnostopoulos
Date: Mon, 24 Oct 2016 20:25:23 +0200
Subject: [PATCH 09/21] fix(pack): restore packers as LazyMixins

Speed-up for non-error commits:

+ no-leak contextlibing(v.2.0.0, 941b6c7e): 5.20
+ Prev commit, no LazyMixin(v.2.1.0.dev1):  7.70
+ This commit, LazyMixin:                   5.50
---
 gitdb/pack.py | 163 +++++++++++++++++++++++---------------------------
 1 file changed, 75 insertions(+), 88 deletions(-)

diff --git a/gitdb/pack.py b/gitdb/pack.py
index fc67ffc..d76b8ea 100644
--- a/gitdb/pack.py
+++ b/gitdb/pack.py
@@ -247,7 +247,7 @@ def write(self, pack_sha, write):
         return sha
 
 
-class PackIndexFile(object):
+class PackIndexFile(LazyMixin):
 
     """A pack index provides offsets into the corresponding pack, allowing to find
     locations for offsets faster."""
@@ -261,15 +261,19 @@ class PackIndexFile(object):
     _sha_list_offset = 8 + 1024
     index_v2_signature = b'\xfftOc'
     index_version_default = 2
-    _entered = 0
 
     def __init__(self, indexpath):
         self._indexpath = indexpath
+        self._entered = 0
+        self._cursor = None
 
     def __enter__(self):
-        if not hasattr(self, '_cursor'):
-            assert self._entered == 0, (self, self._indexpath)
-            self._prepare_enter()
+        if self._entered == 0:
+            # Note: We don't lock the file when reading as we cannot be sure
+            # that we can actually write to the location - it could be a read-only
+            # alternate for instance
+            assert self._cursor is None, self._cursor
+            self._cursor = self._make_cursor()
         self._entered += 1
 
         return self
@@ -279,20 +283,19 @@ def __exit__(self, exc_type, exc_value, traceback):
         assert self._entered >= 0, (self, self._indexpath)
         if self._entered == 0:
             self._cursor._destroy()
-            del self._cursor
-            del self._fanout_table
-
-    def _prepare_enter(self):
-        # Note: We don't lock the file when reading as we cannot be sure
-        # that we can actually write to the location - it could be a read-only
-        # alternate for instance
-        self._cursor = mman.make_cursor(self._indexpath).use_region()
+            self._cursor = None
+
+    def _make_cursor(self):
+        cursor = mman.make_cursor(self._indexpath).use_region()
+
         # We will assume that the index will always fully fit into memory !
-        if mman.window_size() > 0 and self._cursor.file_size() > mman.window_size():
+        if mman.window_size() > 0 and cursor.file_size() > mman.window_size():
             raise AssertionError("The index file at %s is too large to fit into a mapped window (%i > %i). 
This is a limitation of the implementation" % ( - self._indexpath, self._cursor.file_size(), mman.window_size())) - # END assert window size + self._indexpath, cursor.file_size(), mman.window_size())) + return cursor + + def _set_cache_(self, attr): # now its time to initialize everything - if we are here, someone wants # to access the fanout table or related properties @@ -312,7 +315,24 @@ def _prepare_enter(self): # INITIALIZE DATA # byte offset is 8 if version is 2, 0 otherwise - self._initialize() + self._fanout_table = self._read_fanout((self._version == 2) * 8) + + if self._version == 2: + self._crc_list_offset = self._sha_list_offset + self.size() * 20 + self._pack_offset = self._crc_list_offset + self.size() * 4 + self._pack_64_offset = self._pack_offset + self.size() * 4 + # END setup base + + def _read_fanout(self, byte_offset): + """Generate a fanout table from our data""" + d = self._cursor.map() + out = [] + append = out.append + for i in xrange(256): + append(unpack_from('>L', d, byte_offset + i * 4)[0]) + # END for each entry + return out + # END handle attributes #{ Access V1 @@ -366,30 +386,6 @@ def _crc_v2(self, i): #} END access V2 - #{ Initialization - - def _initialize(self): - """initialize base data""" - self._fanout_table = self._read_fanout((self._version == 2) * 8) - - if self._version == 2: - self._crc_list_offset = self._sha_list_offset + self.size() * 20 - self._pack_offset = self._crc_list_offset + self.size() * 4 - self._pack_64_offset = self._pack_offset + self.size() * 4 - # END setup base - - def _read_fanout(self, byte_offset): - """Generate a fanout table from our data""" - d = self._cursor.map() - out = [] - append = out.append - for i in xrange(256): - append(unpack_from('>L', d, byte_offset + i * 4)[0]) - # END for each entry - return out - - #} END initialization - #{ Properties def version(self): return self._version @@ -434,7 +430,7 @@ def sha_to_index(self, sha): :param sha: 20 byte sha to lookup""" first_byte = byte_ord(sha[0]) get_sha = self.sha - lo = 0 + lo = 0 # lower index, the left bound of the bisection if first_byte != 0: lo = self._fanout_table[first_byte - 1] hi = self._fanout_table[first_byte] # the upper, right bound of the bisection @@ -514,7 +510,7 @@ def sha_to_index(self, sha): #} END properties -class PackFile(object): +class PackFile(LazyMixin): """A pack is a file written according to the Version 2 for git packs @@ -543,32 +539,30 @@ class PackFile(object): def __init__(self, packpath): self._packpath = packpath self._entered = 0 + self._cursor = None def __enter__(self): - if not hasattr(self, '_cursor'): - assert self._entered == 0, (self, self._packpath) - self._prepare_enter() + if self._entered == 0: + assert self._cursor is None, self._cursor + self._cursor = mman.make_cursor(self._packpath).use_region() self._entered += 1 return self def __exit__(self, exc_type, exc_value, traceback): self._entered -= 1 - assert self._entered >= 0, (self, self._indexpath) + assert self._entered >= 0, (self, self._packpath) if self._entered == 0: self._cursor._destroy() - del self._cursor - - def _prepare_enter(self): - # we fill the whole cache, whichever attribute gets queried first - self._cursor = mman.make_cursor(self._packpath).use_region() + self._cursor = None - # read the header information + def _set_cache_(self, attr): + # Fill cache by reading the header information. 
type_id, self._version, self._size = unpack_from(">LLL", self._cursor.map(), 0)
 
         # TODO: figure out whether we should better keep the lock, or maybe
         # add a .keep file instead ?
-        if type_id != self.pack_signature:
+        if type_id != PackFile.pack_signature:
             raise ParseError("Invalid pack signature: %i" % type_id)
 
     def _iter_objects(self, start_offset, as_stream=True):
@@ -681,12 +675,12 @@ def stream_iter(self, start_offset=0):
 
     #} END Read-Database like Interface
 
 
-class PackEntity(object):
+class PackEntity(LazyMixin):
 
     """Combines the PackIndexFile and the PackFile into one, allowing the
     actual objects to be resolved and iterated"""
 
-    __slots__ = ('_basename',
+    __slots__ = ('_basename',  # Could have been int, but better limit scrupulous nesting.
                  '_index',           # our index file
                  '_pack',            # our pack file
                  '_offset_map',      # on demand dict mapping one offset to the next consecutive one
@@ -713,41 +707,34 @@ def __enter__(self):
         return self
 
     def __exit__(self, exc_type, exc_value, traceback):
-        if self._index:
-            self._index.__exit__(exc_type, exc_value, traceback)
-        if self._pack:
-            self._pack.__exit__(exc_type, exc_value, traceback)
+        self._index.__exit__(exc_type, exc_value, traceback)
+        self._pack.__exit__(exc_type, exc_value, traceback)
         self._entered = False
 
-    @property
-    def offset_map(self):
-        if not hasattr(self, '_offset_map'):
-            # currently this can only be _offset_map
-            # TODO: make this a simple sorted offset array which can be bisected
-            # to find the respective entry, from which we can take a +1 easily
-            # This might be slower, but should also be much lighter in memory !
-            offsets_sorted = sorted(self._index.offsets())
-            last_offset = len(self._pack.data()) - self._pack.footer_size
-            assert offsets_sorted, "Cannot handle empty indices"
-
-            offset_map = None
-            if len(offsets_sorted) == 1:
-                offset_map = {offsets_sorted[0]: last_offset}
-            else:
-                iter_offsets = iter(offsets_sorted)
-                iter_offsets_plus_one = iter(offsets_sorted)
-                next(iter_offsets_plus_one)
-                consecutive = izip(iter_offsets, iter_offsets_plus_one)
-
-                offset_map = dict(consecutive)
-
-                # the last offset is not yet set
-                offset_map[offsets_sorted[-1]] = last_offset
-            # END handle offset amount
+    def _set_cache_(self, attr):
+        # currently this can only be _offset_map
+        # TODO: make this a simple sorted offset array which can be bisected
+        # to find the respective entry, from which we can take a +1 easily
+        # This might be slower, but should also be much lighter in memory !
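+        # The map built below pairs each object's start offset with the start
+        # offset of the object that follows it; the last object maps to the end
+        # of the pack data, just before the trailing SHA1 footer.  This lets
+        # is_valid_stream() bound an entry's compressed bytes as
+        # next_offset - offset without parsing the pack again.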
+        offsets_sorted = sorted(self._index.offsets())
+        last_offset = len(self._pack.data()) - self._pack.footer_size
+        assert offsets_sorted, "Cannot handle empty indices"
+
+        offset_map = None
+        if len(offsets_sorted) == 1:
+            offset_map = {offsets_sorted[0]: last_offset}
+        else:
+            iter_offsets = iter(offsets_sorted)
+            iter_offsets_plus_one = iter(offsets_sorted)
+            next(iter_offsets_plus_one)
+            consecutive = izip(iter_offsets, iter_offsets_plus_one)
 
-            offset_map = dict(consecutive)
+            offset_map = dict(consecutive)
 
-            # the last offset is not yet set
-            offset_map[offsets_sorted[-1]] = last_offset
-            # END handle offset amount
+        # the last offset is not yet set
+        offset_map[offsets_sorted[-1]] = last_offset
+        # END handle offset amount
+        self._offset_map = offset_map
 
-            self._offset_map = offset_map
-
-        return self._offset_map
 
     def _sha_to_index(self, sha):
         """:return: index for the given sha, or raise"""
@@ -870,7 +857,7 @@ def is_valid_stream(self, sha, use_crc=False):
 
         index = self._sha_to_index(sha)
         offset = self._index.offset(index)
-        next_offset = self.offset_map[offset]
+        next_offset = self._offset_map[offset]
         crc_value = self._index.crc(index)
 
         # create the current crc value, on the compressed object data

From 79a754a34ddc68e12d1d0d28fd99b528f71a5044 Mon Sep 17 00:00:00 2001
From: Kostis Anagnostopoulos
Date: Mon, 24 Oct 2016 20:38:49 +0200
Subject: [PATCH 10/21] chore(ver): bump 2.1.0.dev0-->2.1.0.dev1

---
 gitdb/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gitdb/__init__.py b/gitdb/__init__.py
index 68c882b..3c939a4 100644
--- a/gitdb/__init__.py
+++ b/gitdb/__init__.py
@@ -10,7 +10,7 @@
 __author__ = "Sebastian Thiel"
 __contact__ = "byronimo@gmail.com"
 __homepage__ = "https://github.com/gitpython-developers/gitdb"
-version_info = (2, 1, 0, 'dev0')
+version_info = (2, 1, 0, 'dev1')
 __version__ = '.'.join(str(i) for i in version_info)
 

From a2e49d98d4a4447b1b1ff31a05ee58a653ac70e7 Mon Sep 17 00:00:00 2001
From: Kostis Anagnostopoulos
Date: Mon, 24 Oct 2016 23:25:59 +0200
Subject: [PATCH 11/21] style(listuple): use literals for empty lists/tuples

---
 gitdb/db/base.py                      | 4 ++--
 gitdb/db/git.py                       | 2 +-
 gitdb/db/ref.py                       | 4 ++--
 gitdb/fun.py                          | 4 ++--
 gitdb/test/performance/test_stream.py | 4 ++--
 gitdb/test/test_pack.py               | 2 +-
 gitdb/util.py                         | 2 +-
 7 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/gitdb/db/base.py b/gitdb/db/base.py
index 6dbee21..b71a42c 100644
--- a/gitdb/db/base.py
+++ b/gitdb/db/base.py
@@ -166,7 +166,7 @@ class CompoundDB(ObjectDBR, LazyMixin, CachingDB):
 
     def _set_cache_(self, attr):
         if attr == '_dbs':
-            self._dbs = list()
+            self._dbs = []
         elif attr == '_db_cache':
             self._db_cache = dict()
         else:
@@ -237,7 +237,7 @@ def partial_to_complete_sha_hex(self, partial_hexsha):
         :return: 20 byte binary sha1 from the given less-than-40 byte hexsha (bytes or str)
         :param partial_hexsha: hexsha with less than 40 byte
         :raise AmbiguousObjectName: """
-        databases = list()
+        databases = []
         _databases_recursive(self, databases)
         partial_hexsha = force_text(partial_hexsha)
         len_partial_hexsha = len(partial_hexsha)
diff --git a/gitdb/db/git.py b/gitdb/db/git.py
index 7a43d72..7d8c61b 100644
--- a/gitdb/db/git.py
+++ b/gitdb/db/git.py
@@ -43,7 +43,7 @@ def __init__(self, root_path):
 
     def _set_cache_(self, attr):
         if attr == '_dbs' or attr == '_loose_db':
-            self._dbs = list()
+            self._dbs = []
             loose_db = None
             for subpath, dbcls in ((self.packs_dir, self.PackDBCls),
                                    (self.loose_dir, self.LooseDBCls),
diff --git a/gitdb/db/ref.py b/gitdb/db/ref.py
index 2e3db86..ff44f23 100644
--- a/gitdb/db/ref.py
+++ b/gitdb/db/ref.py
@@ -24,7 +24,7 @@ def __init__(self, ref_file):
 
     def 
_set_cache_(self, attr): if attr == '_dbs': - self._dbs = list() + self._dbs = [] self._update_dbs_from_ref_file() else: super(ReferenceDB, self)._set_cache_(attr) @@ -39,7 +39,7 @@ def _update_dbs_from_ref_file(self): # END get db type # try to get as many as possible, don't fail if some are unavailable - ref_paths = list() + ref_paths = [] try: with open(self._ref_file, 'r') as f: ref_paths = [l.strip() for l in f] diff --git a/gitdb/fun.py b/gitdb/fun.py index cf3ad02..af783b6 100644 --- a/gitdb/fun.py +++ b/gitdb/fun.py @@ -222,7 +222,7 @@ class DeltaChunkList(list): latest delta down to the earliest ancestor. This attribute is queryable after all processing with is_reversed.""" - __slots__ = tuple() + __slots__ = () def rbound(self): """:return: rightmost extend in bytes, absolute""" @@ -323,7 +323,7 @@ class TopdownDeltaChunkList(DeltaChunkList): """Represents a list which is generated by feeding its ancestor streams one by one""" - __slots__ = tuple() + __slots__ = () def connect_with_next_base(self, bdcl): """Connect this chain with the next level of our base delta chunklist. diff --git a/gitdb/test/performance/test_stream.py b/gitdb/test/performance/test_stream.py index c9b0f15..e95c705 100644 --- a/gitdb/test/performance/test_stream.py +++ b/gitdb/test/performance/test_stream.py @@ -48,7 +48,7 @@ class TestObjDBPerformance(TestBigRepoR): @with_rw_directory def test_large_data_streaming(self, path): ldb = LooseObjectDB(path) - string_ios = list() # list of streams we previously created + string_ios = [] # list of streams we previously created # serial mode for randomize in range(2): @@ -85,7 +85,7 @@ def test_large_data_streaming(self, path): # reading in chunks of 1 MiB cs = 512 * 1000 - chunks = list() + chunks = [] st = time() with ldb.stream(sha) as ostream: while True: diff --git a/gitdb/test/test_pack.py b/gitdb/test/test_pack.py index 4f259cb..f1e8a35 100644 --- a/gitdb/test/test_pack.py +++ b/gitdb/test/test_pack.py @@ -149,7 +149,7 @@ def test_pack(self): # because it is being used by another process: 'sss\\index_cc_wll5' @with_rw_directory def test_pack_entity(self, rw_dir): - pack_objs = list() + pack_objs = [] for packinfo, indexinfo in ((self.packfile_v2_1, self.packindexfile_v1), (self.packfile_v2_2, self.packindexfile_v2), (self.packfile_v2_3_ascii, self.packindexfile_v2_3_ascii)): diff --git a/gitdb/util.py b/gitdb/util.py index 8a20605..b0e8147 100644 --- a/gitdb/util.py +++ b/gitdb/util.py @@ -293,7 +293,7 @@ class LazyMixin(object): return the cached value as stored in the Instance's dict or slot. 
""" - __slots__ = tuple() + __slots__ = () def __getattr__(self, attr): """ From 61ea9bbe488e14af8d185f3873f2d33192fd8297 Mon Sep 17 00:00:00 2001 From: Kostis Anagnostopoulos Date: Tue, 25 Oct 2016 10:20:35 +0200 Subject: [PATCH 12/21] test(travis): enable all tests (inc perf) on TravisCI --- gitdb/test/performance/test_pack.py | 4 ---- gitdb/test/performance/test_pack_streaming.py | 3 --- gitdb/test/performance/test_stream.py | 2 -- 3 files changed, 9 deletions(-) diff --git a/gitdb/test/performance/test_pack.py b/gitdb/test/performance/test_pack.py index b99b91e..54a9bf1 100644 --- a/gitdb/test/performance/test_pack.py +++ b/gitdb/test/performance/test_pack.py @@ -18,7 +18,6 @@ from gitdb.exc import UnsupportedOperation from gitdb.db.pack import PackedDB from gitdb.utils.compat import xrange -from gitdb.test.lib import skip_on_travis_ci import sys import os @@ -27,7 +26,6 @@ class TestPackedDBPerformance(TestBigRepoR): - @skip_on_travis_ci def test_pack_random_access(self): pdb = PackedDB(os.path.join(self.gitrepopath, "objects/pack")) @@ -80,7 +78,6 @@ def test_pack_random_access(self): print("PDB: Obtained %i streams by sha and read all bytes totallying %i KiB ( %f KiB / s ) in %f s ( %f streams/s )" % (max_items, total_kib, total_kib / (elapsed or 1), elapsed, max_items / (elapsed or 1)), file=sys.stderr) - @skip_on_travis_ci def test_loose_correctness(self): """based on the pack(s) of our packed object DB, we will just copy and verify all objects in the back into the loose object db (memory). @@ -108,7 +105,6 @@ def test_loose_correctness(self): mdb._cache.clear() # end for each sha to copy - @skip_on_travis_ci def test_correctness(self): pdb = PackedDB(os.path.join(self.gitrepopath, "objects/pack")) # disabled for now as it used to work perfectly, checking big repositories takes a long time diff --git a/gitdb/test/performance/test_pack_streaming.py b/gitdb/test/performance/test_pack_streaming.py index 21a7532..1a61eb6 100644 --- a/gitdb/test/performance/test_pack_streaming.py +++ b/gitdb/test/performance/test_pack_streaming.py @@ -12,7 +12,6 @@ from gitdb.db.pack import PackedDB from gitdb.stream import NullStream from gitdb.pack import PackEntity -from gitdb.test.lib import skip_on_travis_ci import os import sys @@ -34,7 +33,6 @@ def write(self, d): class TestPackStreamingPerformance(TestBigRepoR): - @skip_on_travis_ci def test_pack_writing(self): # see how fast we can write a pack from object streams. 
# This will not be fast, as we take time for decompressing the streams as well @@ -64,7 +62,6 @@ def test_pack_writing(self): print(sys.stderr, "PDB Streaming: Wrote pack of size %i kb in %f s (%f kb/s)" % (total_kb, elapsed, total_kb / (elapsed or 1)), sys.stderr) - @skip_on_travis_ci def test_stream_reading(self): # raise SkipTest() pdb = PackedDB(os.path.join(self.gitrepopath, "objects/pack")) diff --git a/gitdb/test/performance/test_stream.py b/gitdb/test/performance/test_stream.py index e95c705..bbb308b 100644 --- a/gitdb/test/performance/test_stream.py +++ b/gitdb/test/performance/test_stream.py @@ -20,7 +20,6 @@ from gitdb.test.lib import ( make_memory_file, with_rw_directory, - skip_on_travis_ci ) @@ -44,7 +43,6 @@ class TestObjDBPerformance(TestBigRepoR): large_data_size_bytes = 1000 * 1000 * 50 # some MiB should do it moderate_data_size_bytes = 1000 * 1000 * 1 # just 1 MiB - @skip_on_travis_ci @with_rw_directory def test_large_data_streaming(self, path): ldb = LooseObjectDB(path) From a566e1189f2aa118728e7b1dc49bcd4dbb3e3c6f Mon Sep 17 00:00:00 2001 From: Kostis Anagnostopoulos Date: Tue, 25 Oct 2016 11:48:29 +0200 Subject: [PATCH 13/21] refact(win_errs): move HIDE_WINDOWS_KNOWN_ERRORS from main-code to test --- gitdb/test/__init__.py | 10 ++++++++++ gitdb/test/db/test_pack.py | 3 ++- gitdb/test/lib.py | 10 ++++++---- gitdb/util.py | 5 ----- 4 files changed, 18 insertions(+), 10 deletions(-) diff --git a/gitdb/test/__init__.py b/gitdb/test/__init__.py index 8a681e4..13c8411 100644 --- a/gitdb/test/__init__.py +++ b/gitdb/test/__init__.py @@ -2,3 +2,13 @@ # # This module is part of GitDB and is released under # the New BSD License: http://www.opensource.org/licenses/bsd-license.php +import os + +from gitdb.util import is_win + + +#: We need an easy way to see if Appveyor TCs start failing, +#: so the errors marked with this var are considered "acknowledged" ones, awaiting remedy, +#: till then, we wish to hide them. 
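+#:
+#: Since `os.environ.get()` returns a *string* whenever the variable is set,
+#: and any non-empty string is truthy, export ``HIDE_WINDOWS_KNOWN_ERRORS=``
+#: (the empty string) to switch these checks back on, or leave the variable
+#: unset to keep hiding them.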
+HIDE_WINDOWS_KNOWN_ERRORS = is_win and os.environ.get('HIDE_WINDOWS_KNOWN_ERRORS', True)
+
diff --git a/gitdb/test/db/test_pack.py b/gitdb/test/db/test_pack.py
index f7e631e..ac30a5d 100644
--- a/gitdb/test/db/test_pack.py
+++ b/gitdb/test/db/test_pack.py
@@ -13,7 +13,8 @@
 import os
 import random
 
-from gitdb.util import mman, HIDE_WINDOWS_KNOWN_ERRORS
+from gitdb.util import mman
+from gitdb.test import HIDE_WINDOWS_KNOWN_ERRORS
 
 
 class TestPackDB(TestDBBase):
diff --git a/gitdb/test/lib.py b/gitdb/test/lib.py
index 0017031..cc369cd 100644
--- a/gitdb/test/lib.py
+++ b/gitdb/test/lib.py
@@ -3,6 +3,8 @@
 # This module is part of GitDB and is released under
 # the New BSD License: http://www.opensource.org/licenses/bsd-license.php
 """Utilities used in ODB testing"""
+#{ Bases
+
 from array import array
 from functools import wraps
 import gc
@@ -16,12 +18,12 @@
 import tempfile
 import unittest
 
-from gitdb import OStream
-from gitdb.util import rmtree, mman, HIDE_WINDOWS_KNOWN_ERRORS
+from gitdb.base import OStream
+from gitdb.test import HIDE_WINDOWS_KNOWN_ERRORS
+from gitdb.util import rmtree, mman
 from gitdb.utils.compat import xrange
 
 
-#{ Bases
 
 log = logging.getLogger(__name__)
 
@@ -168,7 +170,7 @@ def make_bytes(size_in_bytes, randomize=False):
     return a.tostring()
 
 
-def make_object(type, data):
+def make_object(otype, data):
     """:return: bytes resembling an uncompressed object"""
     odata = "blob %i\0" % len(data)
     return odata.encode("ascii") + data
diff --git a/gitdb/util.py b/gitdb/util.py
index b0e8147..0a77ea2 100644
--- a/gitdb/util.py
+++ b/gitdb/util.py
@@ -84,11 +84,6 @@
 
 log = logging.getLogger(__name__)
 
-#: We need an easy way to see if Appveyor TCs start failing,
-#: so the errors marked with this var are considered "acknowledged" ones, awaiting remedy,
-#: till then, we wish to hide them.
-HIDE_WINDOWS_KNOWN_ERRORS = is_win and os.environ.get('HIDE_WINDOWS_KNOWN_ERRORS', True)
-
 #{ compatibility stuff ...
 

From fda6bc1262cd661eb6c4b66e7c54126fab664f05 Mon Sep 17 00:00:00 2001
From: Kostis Anagnostopoulos
Date: Tue, 25 Oct 2016 15:03:00 +0200
Subject: [PATCH 14/21] fix(io): BREAKING, wrap more out-stream usages

+ chore(deps): depend on *contextlib2* for `ExitStack` in PY2.
+ refact(util): BREAKING API move consts out of utils.
+ style(pep8): fix all sources.
---
 .appveyor.yml                                 |  3 +-
 .travis.yml                                   |  3 +
 gitdb/__init__.py                             |  6 +-
 gitdb/db/loose.py                             |  6 +-
 gitdb/pack.py                                 | 93 ++++++++++---------
 gitdb/stream.py                               | 19 ++--
 gitdb/test/db/lib.py                          | 45 ++++-----
 gitdb/test/db/test_git.py                     |  9 +-
 gitdb/test/db/test_mem.py                     | 12 ++-
 gitdb/test/db/test_ref.py                     | 12 +--
 gitdb/test/performance/test_pack.py           | 16 ++--
 gitdb/test/performance/test_pack_streaming.py | 24 +++--
 gitdb/test/test_base.py                       | 26 +++---
 gitdb/test/test_pack.py                       | 43 +++++----
 gitdb/test/test_util.py                       |  4 +-
 gitdb/util.py                                 | 29 +-----
 gitdb/utils/compat.py                         | 36 +++++--
 requirements.txt                              |  1 +
 setup.py                                      |  5 +-
 19 files changed, 199 insertions(+), 193 deletions(-)

diff --git a/.appveyor.yml b/.appveyor.yml
index d7b2550..bfc5eff 100644
--- a/.appveyor.yml
+++ b/.appveyor.yml
@@ -37,8 +37,9 @@ install:
      git config --global user.email "travis@ci.com"
      git config --global user.name "Travis Runner"
 
  - pip install -e .
+ ## Install any dev-requirements (FIXME: remove on release) - pip install -I git+https://github.com/ankostis/smmap.git@v2.1.0.dev1 + - pip install -r requirements.txt build: false diff --git a/.travis.yml b/.travis.yml index 66400ae..2f8e407 100644 --- a/.travis.yml +++ b/.travis.yml @@ -12,7 +12,10 @@ git: depth: 1000 install: - pip install coveralls + + ## Install any dev-requirements (FIXME: remove on release) - pip install -I git+https://github.com/ankostis/smmap.git@v2.1.0.dev1 + - pip install -r requirements.txt script: - ulimit -n 48 - ulimit -n diff --git a/gitdb/__init__.py b/gitdb/__init__.py index 3c939a4..33229ce 100644 --- a/gitdb/__init__.py +++ b/gitdb/__init__.py @@ -15,6 +15,6 @@ # default imports -from gitdb.base import * -from gitdb.db import * -from gitdb.stream import * +from gitdb.base import * # @IgnorePep8 +from gitdb.db import * # @IgnorePep8 +from gitdb.stream import * # @IgnorePep8 diff --git a/gitdb/db/loose.py b/gitdb/db/loose.py index 1338e83..7cee7f8 100644 --- a/gitdb/db/loose.py +++ b/gitdb/db/loose.py @@ -166,8 +166,8 @@ def info(self, sha): def stream(self, sha): m = self._map_loose_object(sha) - type, size, stream = DecompressMemMapReader.new(m, close_on_deletion=True) - return OStream(sha, type, size, stream) + typ, size, stream = DecompressMemMapReader.new(m, close_on_deletion=True) + return OStream(sha, typ, size, stream) def has_object(self, sha): try: @@ -247,7 +247,7 @@ def store(self, istream): def sha_iter(self): # find all files which look like an object, extract sha from there - for root, dirs, files in os.walk(self.root_path()): + for root, dirs, files in os.walk(self.root_path()): # @UnusedVariable root_base = basename(root) if len(root_base) != 2: continue diff --git a/gitdb/pack.py b/gitdb/pack.py index d76b8ea..9575147 100644 --- a/gitdb/pack.py +++ b/gitdb/pack.py @@ -3,23 +3,30 @@ # This module is part of GitDB and is released under # the New BSD License: http://www.opensource.org/licenses/bsd-license.php """Contains PackIndexFile and PackFile implementations""" +import array +from binascii import crc32 +import os +from struct import pack +import sys +import tempfile import zlib +from gitdb.base import ( + OInfo, + OStream, + OPackInfo, + OPackStream, + ODeltaStream, + ODeltaPackInfo, + ODeltaPackStream, +) +from gitdb.const import NULL_BYTE from gitdb.exc import ( BadObject, AmbiguousObjectName, UnsupportedOperation, ParseError ) - -from gitdb.util import ( - mman, - LazyMixin, - unpack_from, - bin_to_hex, - byte_ord, -) - from gitdb.fun import ( create_pack_object_header, pack_object_header_info, @@ -33,23 +40,6 @@ REF_DELTA, msb_size ) - -try: - from gitdb_speedups._perf import PackIndexFile_sha_to_index -except ImportError: - pass -# END try c module - -from gitdb.base import ( - OInfo, - OStream, - OPackInfo, - OPackStream, - ODeltaStream, - ODeltaPackInfo, - ODeltaPackStream, -) - from gitdb.stream import ( DecompressMemMapReader, DeltaApplyReader, @@ -57,22 +47,27 @@ NullStream, FlexibleSha1Writer ) - -from struct import pack -from binascii import crc32 - -from gitdb.const import NULL_BYTE +from gitdb.util import ( + mman, + LazyMixin, + bin_to_hex, + byte_ord, +) from gitdb.utils.compat import ( izip, buffer, xrange, - to_bytes + to_bytes, + unpack_from, ) -import tempfile -import array -import os -import sys + +try: + from gitdb_speedups._perf import PackIndexFile_sha_to_index +except ImportError: + pass +# END try c module + __all__ = ('PackIndexFile', 'PackFile', 'PackEntity') @@ -290,8 +285,9 @@ def _make_cursor(self): 
# We will assume that the index will always fully fit into memory ! if mman.window_size() > 0 and cursor.file_size() > mman.window_size(): - raise AssertionError("The index file at %s is too large to fit into a mapped window (%i > %i). This is a limitation of the implementation" % ( - self._indexpath, cursor.file_size(), mman.window_size())) + raise AssertionError("The index file at %s is too large to fit into a mapped window (%i > %i). " + "This is a limitation of the hardware." % ( + self._indexpath, cursor.file_size(), mman.window_size())) return cursor @@ -528,7 +524,7 @@ class PackFile(LazyMixin): '_size', '_version', '_entered', - ) + ) pack_signature = 0x5041434b # 'PACK' pack_version_default = 2 @@ -603,7 +599,11 @@ def data(self): """ :return: read-only data of this pack. It provides random access and usually is a memory map. - :note: This method is unsafe as it returns a window into a file which might be larger than than the actual window size""" + + .. note:: + This method is unsafe as it returns a window into a file which might be larger + than than the actual window size + """ # can use map as we are starting at offset 0. Otherwise we would have to use buffer() return self._cursor.use_region().map() @@ -761,7 +761,8 @@ def _object(self, sha, as_stream, index=-1): sha = self._index.sha(index) # END assure sha is present ( in output ) offset = self._index.offset(index) - type_id, uncomp_size, data_rela_offset = pack_object_header_info(self._pack._cursor.use_region(offset).buffer()) + type_id, uncomp_size, _ = pack_object_header_info( + self._pack._cursor.use_region(offset).buffer()) if as_stream: if type_id not in delta_types: packstream = self._pack.stream(offset) @@ -784,7 +785,7 @@ def _object(self, sha, as_stream, index=-1): # the actual target size, as opposed to the size of the delta data streams = self.collect_streams_at_offset(offset) buf = streams[0].read(512) - offset, src_size = msb_size(buf) + offset, src_size = msb_size(buf) # @UnusedVariable offset, target_size = msb_size(buf, offset) # collect the streams to obtain the actual object type @@ -1019,8 +1020,9 @@ def write_pack(cls, object_iter, pack_write, index_write=None, # END for each object if actual_count != object_count: - raise ValueError( - "Expected to write %i objects into pack, but received only %i from iterators" % (object_count, actual_count)) + raise ValueError("Expected to write %i objects into pack, " + "but received only %i from iterators" % + (object_count, actual_count)) # END count assertion # write footer @@ -1049,7 +1051,8 @@ def create(cls, object_iter, base_dir, object_count=None, zlib_compression=zlib. 
pack_write = lambda d: os.write(pack_fd, d) index_write = lambda d: os.write(index_fd, d) - pack_binsha, index_binsha = cls.write_pack(object_iter, pack_write, index_write, object_count, zlib_compression) + pack_binsha, _ = cls.write_pack( + object_iter, pack_write, index_write, object_count, zlib_compression) os.close(pack_fd) os.close(index_fd) diff --git a/gitdb/stream.py b/gitdb/stream.py index 1c1df23..bde6c1c 100644 --- a/gitdb/stream.py +++ b/gitdb/stream.py @@ -4,12 +4,12 @@ # the New BSD License: http://www.opensource.org/licenses/bsd-license.php from io import BytesIO - import mmap import os import sys import zlib +from gitdb.const import NULL_BYTE, BYTE_SPACE from gitdb.fun import ( msb_size, stream_copy, @@ -17,7 +17,6 @@ connect_deltas, delta_types ) - from gitdb.util import ( allocate_memory, LazyMixin, @@ -27,10 +26,9 @@ suppress, is_darwin, ) - -from gitdb.const import NULL_BYTE, BYTE_SPACE from gitdb.utils.compat import buffer + has_perf_mod = False PY26 = sys.version_info[:2] < (2, 7) try: @@ -157,7 +155,8 @@ def data(self): def close(self): """Close our underlying stream of compressed bytes if this was allowed during initialization :return: True if we closed the underlying stream - :note: can be called safely + + .. note:: can be called safely """ if self._close: if hasattr(self._m, 'close'): @@ -196,7 +195,7 @@ def compressed_bytes_read(self): # but keep the window at its current position self._br = 0 if hasattr(self._zip, 'status'): - while self._zip.status == zlib.Z_OK: + while self._zip.status == zlib.Z_OK: # @UndefinedVariable self.read(mmap.PAGESIZE) # END scrub-loop custom zlib else: @@ -762,7 +761,7 @@ class FDStream(object): __slots__ = ('_fd', '_pos', '_entered', - ) + ) def __init__(self, fd): self._fd = fd @@ -794,9 +793,9 @@ def read(self, count=0): count = os.path.getsize(self._filepath) # END handle read everything - bytes = os.read(self._fd, count) - self._pos += len(bytes) - return bytes + bs = os.read(self._fd, count) + self._pos += len(bs) + return bs def fileno(self): return self._fd diff --git a/gitdb/test/db/lib.py b/gitdb/test/db/lib.py index d7365ea..574bd9b 100644 --- a/gitdb/test/db/lib.py +++ b/gitdb/test/db/lib.py @@ -3,32 +3,28 @@ # This module is part of GitDB and is released under # the New BSD License: http://www.opensource.org/licenses/bsd-license.php """Base classes for object db testing""" -from gitdb.test.lib import ( - with_rw_directory, - with_packs_rw, - fixture_path, - TestBase -) - -from gitdb.stream import ( - Sha1Writer, - ZippedStoreShaWriter -) +from io import BytesIO +from struct import pack from gitdb.base import ( IStream, OStream, OInfo ) - from gitdb.exc import BadObject +from gitdb.stream import ( + Sha1Writer, + ZippedStoreShaWriter +) +from gitdb.test.lib import ( + with_rw_directory, # @UnusedImport + with_packs_rw, # @UnusedImport + fixture_path, # @UnusedImport + TestBase +) from gitdb.typ import str_blob_type from gitdb.utils.compat import xrange -from io import BytesIO - -from struct import pack - __all__ = ('TestDBBase', 'with_rw_directory', 'with_packs_rw', 'fixture_path') @@ -98,10 +94,10 @@ def _assert_object_writing(self, db): assert str_blob_type == info.type assert info.size == len(data) - ostream = db.stream(sha) - assert ostream.read() == data - assert ostream.type == str_blob_type - assert ostream.size == len(data) + with db.stream(sha) as ostream: + assert ostream.read() == data + assert ostream.type == str_blob_type + assert ostream.size == len(data) else: self.failUnlessRaises(BadObject, db.info, sha) 
self.failUnlessRaises(BadObject, db.stream, sha) @@ -120,10 +116,9 @@ def _assert_object_writing(self, db): db.set_ostream(ZippedStoreShaWriter()) db.store(istream) assert istream.binsha == prev_sha - new_ostream = db.ostream() - - # note: only works as long our store write uses the same compression - # level, which is zip_best - assert ostream.getvalue() == new_ostream.getvalue() + with db.ostream() as new_ostream: + # note: only works as long our store write uses the same compression + # level, which is zip_best + assert ostream.getvalue() == new_ostream.getvalue() # END for each data set # END for each dry_run mode diff --git a/gitdb/test/db/test_git.py b/gitdb/test/db/test_git.py index b637c13..6b6550d 100644 --- a/gitdb/test/db/test_git.py +++ b/gitdb/test/db/test_git.py @@ -3,13 +3,14 @@ # This module is part of GitDB and is released under # the New BSD License: http://www.opensource.org/licenses/bsd-license.php import os + +from gitdb.base import OStream, OInfo +from gitdb.db import GitDB +from gitdb.exc import BadObject from gitdb.test.db.lib import ( TestDBBase, with_rw_directory ) -from gitdb.exc import BadObject -from gitdb.db import GitDB -from gitdb.base import OStream, OInfo from gitdb.util import bin_to_hex @@ -25,7 +26,7 @@ def test_reading(self): gitdb_sha = next(gdb.sha_iter()) assert isinstance(gdb.info(gitdb_sha), OInfo) with gdb.stream(gitdb_sha) as stream: - assert isinstance(gdb.stream(gitdb_sha), OStream) + assert isinstance(stream, OStream) ni = 50 assert gdb.size() >= ni sha_list = list(gdb.sha_iter()) diff --git a/gitdb/test/db/test_mem.py b/gitdb/test/db/test_mem.py index eb563c0..4042ec8 100644 --- a/gitdb/test/db/test_mem.py +++ b/gitdb/test/db/test_mem.py @@ -2,14 +2,14 @@ # # This module is part of GitDB and is released under # the New BSD License: http://www.opensource.org/licenses/bsd-license.php -from gitdb.test.db.lib import ( - TestDBBase, - with_rw_directory -) from gitdb.db import ( MemoryDB, LooseObjectDB ) +from gitdb.test.db.lib import ( + TestDBBase, + with_rw_directory +) class TestMemoryDB(TestDBBase): @@ -30,5 +30,7 @@ def test_writing(self, path): assert ldb.size() == mdb.size() for sha in mdb.sha_iter(): assert ldb.has_object(sha) - assert ldb.stream(sha).read() == mdb.stream(sha).read() + with ldb.stream(sha) as st1: + with mdb.stream(sha) as st2: + self.assertEqual(st1.read(), st2.read()) # END verify objects where copied and are equal diff --git a/gitdb/test/db/test_ref.py b/gitdb/test/db/test_ref.py index 2049698..54b39cb 100644 --- a/gitdb/test/db/test_ref.py +++ b/gitdb/test/db/test_ref.py @@ -2,18 +2,14 @@ # # This module is part of GitDB and is released under # the New BSD License: http://www.opensource.org/licenses/bsd-license.php +import os + +from gitdb.const import NULL_BIN_SHA +from gitdb.db import ReferenceDB from gitdb.test.db.lib import ( TestDBBase, with_rw_directory, ) -from gitdb.db import ReferenceDB - -from gitdb.util import ( - NULL_BIN_SHA, - hex_to_bin -) - -import os class TestReferenceDB(TestDBBase): diff --git a/gitdb/test/performance/test_pack.py b/gitdb/test/performance/test_pack.py index 54a9bf1..a2bd99f 100644 --- a/gitdb/test/performance/test_pack.py +++ b/gitdb/test/performance/test_pack.py @@ -34,7 +34,8 @@ def test_pack_random_access(self): sha_list = list(pdb.sha_iter()) elapsed = time() - st ns = len(sha_list) - print("PDB: looked up %i shas by index in %f s ( %f shas/s )" % (ns, elapsed, ns / (elapsed or 1)), file=sys.stderr) + print("PDB: looked up %i shas by index in %f s ( %f shas/s )" % ( + ns, 
elapsed, ns / (elapsed or 1)), file=sys.stderr)
 
         # sha lookup: best-case and worst case access
         pdb_pack_info = pdb._pack_info
@@ -75,16 +76,19 @@ def test_pack_random_access(self):
             total_size += stream.size
         elapsed = time() - st
         total_kib = total_size / 1000
-        print("PDB: Obtained %i streams by sha and read all bytes totallying %i KiB ( %f KiB / s ) in %f s ( %f streams/s )" %
+        print("PDB: Obtained %i streams by sha and read all bytes "
+              "totalling %i KiB ( %f KiB / s ) in %f s ( %f streams/s )" %
               (max_items, total_kib, total_kib / (elapsed or 1), elapsed, max_items / (elapsed or 1)), file=sys.stderr)
 
     def test_loose_correctness(self):
         """based on the pack(s) of our packed object DB, we will just copy and verify all objects in the back
         into the loose object db (memory).
-        This should help finding dormant issues like this one https://github.com/gitpython-developers/GitPython/issues/220
-        faster
-        :note: It doesn't seem this test can find the issue unless the given pack contains highly compressed
-        data files, like archives."""
+        This should help find dormant issues like this one faster:
+        https://github.com/gitpython-developers/GitPython/issues/220
+
+        .. note::
+            It doesn't seem this test can find the issue unless the given pack contains highly compressed
+            data files, like archives."""
         from gitdb.util import bin_to_hex
         pdb = GitDB(os.path.join(self.gitrepopath, 'objects'))
         mdb = MemoryDB()
diff --git a/gitdb/test/performance/test_pack_streaming.py b/gitdb/test/performance/test_pack_streaming.py
index 1a61eb6..5ed57dc 100644
--- a/gitdb/test/performance/test_pack_streaming.py
+++ b/gitdb/test/performance/test_pack_streaming.py
@@ -5,18 +5,18 @@
 """Specific test for pack streams only"""
 from __future__ import print_function
 
-from gitdb.test.performance.lib import (
-    TestBigRepoR
-)
-
-from gitdb.db.pack import PackedDB
-from gitdb.stream import NullStream
-from gitdb.pack import PackEntity
-
 import os
 import sys
 from time import time
 
+from gitdb.db.pack import PackedDB
+from gitdb.pack import PackEntity
+from gitdb.stream import NullStream
+from gitdb.test.performance.lib import (
+    TestBigRepoR
+)
+from gitdb.utils.compat import ExitStack
+
 
 class CountedNullStream(NullStream):
     __slots__ = '_bw'
@@ -56,7 +56,10 @@ def test_pack_writing(self):
         st = time()
         ## We are leaking files here, but we don't care...
         #  and we need a `contextlib.ExitStack` to safely close them.
-        PackEntity.write_pack((pdb.stream(sha) for sha in pdb.sha_iter()), ostream.write, object_count=ni)
+        with ExitStack() as exs:
+            all_streams = [exs.enter_context(pdb.stream(sha))
+                           for sha in pdb.sha_iter()]
+            PackEntity.write_pack(all_streams, ostream.write, object_count=ni)
         elapsed = time() - st
         total_kb = ostream.bytes_written() / 1000
         print(sys.stderr, "PDB Streaming: Wrote pack of size %i kb in %f s (%f kb/s)" %
@@ -81,5 +84,6 @@ def test_stream_reading(self):
             count += 1
         elapsed = time() - st
         total_kib = total_size / 1000
-    print(sys.stderr, "PDB Streaming: Got %i streams by sha and read all bytes totallying %i KiB ( %f KiB / s ) in %f s ( %f streams/s )" %
+    print(sys.stderr, "PDB Streaming: Got %i streams by sha and read all bytes "
+          "totalling %i KiB ( %f KiB / s ) in %f s ( %f streams/s )" %
           (ni, total_kib, total_kib / (elapsed or 1), elapsed, ni / (elapsed or 1)), sys.stderr)
diff --git a/gitdb/test/test_base.py b/gitdb/test/test_base.py
index 519cdfd..d8e70d8 100644
--- a/gitdb/test/test_base.py
+++ b/gitdb/test/test_base.py
@@ -3,12 +3,6 @@
 # This module is part of GitDB and is released under
 # the New BSD License: http://www.opensource.org/licenses/bsd-license.php
 """Test for object db"""
-from gitdb.test.lib import (
-    TestBase,
-    DummyStream,
-    DeriveTest,
-)
-
 from gitdb import (
     OInfo,
     OPackInfo,
@@ -18,10 +12,12 @@
     ODeltaPackStream,
     IStream
 )
-from gitdb.util import (
-    NULL_BIN_SHA
-)
+from gitdb.const import NULL_BIN_SHA
+from gitdb.test.lib import (
+    TestBase,
+    DummyStream,
+    DeriveTest,
+)
 from gitdb.typ import (
     str_blob_type
 )
@@ -56,7 +52,7 @@ def test_streams(self):
 
         # test ostream
         stream = DummyStream()
-        ostream = OStream(*(info + (stream, )))
+        ostream = OStream(*(info + (stream,)))
         assert ostream.stream is stream
         ostream.read(15)
         stream._assert()
@@ -65,15 +61,15 @@
         assert stream.bytes == 20
 
         # test packstream
-        postream = OPackStream(*(pinfo + (stream, )))
+        postream = OPackStream(*(pinfo + (stream,)))
         assert postream.stream is stream
         postream.read(10)
         stream._assert()
         assert stream.bytes == 10
 
         # test deltapackstream
-        dpostream = ODeltaPackStream(*(dpinfo + (stream, )))
-        dpostream.stream is stream
+        dpostream = ODeltaPackStream(*(dpinfo + (stream,)))
+        assert dpostream.stream is stream
         dpostream.read(5)
         stream._assert()
         assert stream.bytes == 5
@@ -83,7 +79,7 @@
 
         # test istream
         istream = IStream(str_blob_type, s, stream)
-        assert istream.binsha == None
+        assert istream.binsha is None
         istream.binsha = sha
         assert istream.binsha == sha
 
@@ -92,7 +88,7 @@
         assert istream.size == s
 
         istream.size = s * 2
-        istream.size == s * 2
+        assert istream.size == s * 2
         assert istream.type == str_blob_type
         istream.type = "something"
         assert istream.type == "something"
diff --git a/gitdb/test/test_pack.py b/gitdb/test/test_pack.py
index f1e8a35..5efe80c 100644
--- a/gitdb/test/test_pack.py
+++ b/gitdb/test/test_pack.py
@@ -3,40 +3,37 @@
 # This module is part of GitDB and is released under
 # the New BSD License: http://www.opensource.org/licenses/bsd-license.php
 """Test everything about packs reading and writing"""
-from gitdb.test.lib import (
-    TestBase,
-    with_rw_directory,
-    fixture_path
-)
+import os
+import tempfile
 
-from gitdb.stream import DeltaApplyReader
+from nose import SkipTest
 
+from gitdb.base import (
+    OInfo,
+    OStream,
+)
+from gitdb.exc import UnsupportedOperation
+from gitdb.fun import delta_types
 from gitdb.pack import (
     PackEntity,
     PackIndexFile,
     PackFile
 )
-
-from gitdb.base import (
-    
-    OInfo,
-    OStream,
+from gitdb.stream import DeltaApplyReader
+from gitdb.test.lib import (
+    TestBase,
+    with_rw_directory,
+    fixture_path
 )
-
-from gitdb.fun import delta_types
-from gitdb.exc import UnsupportedOperation
 from gitdb.util import to_bin_sha
-from gitdb.utils.compat import xrange
+from gitdb.utils.compat import xrange, ExitStack
+
 try:
     from itertools import izip
 except ImportError:
     izip = zip
 
-from nose import SkipTest
-
-import os
-import tempfile
-
 #{ Utilities
 
 def bin_sha_from_filename(filename):
@@ -158,7 +155,7 @@ def test_pack_entity(self, rw_dir):
         with PackEntity(packfile) as entity:
             assert entity.pack().path() == packfile
             assert entity.index().path() == indexfile
-            pack_objs.extend(entity.stream_iter())
+            pack_objs.extend(entity.stream_iter())  # FIXME: How to context-manage these?
 
         count = 0
         for info, stream in izip(entity.info_iter(), entity.stream_iter()):
@@ -220,11 +217,13 @@ def rewind_streams():
             iteration += 1
             with open(ppath, 'wb') as pfile:
-                pack_sha, index_sha = PackEntity.write_pack(pack_objs, pfile.write, iwrite, object_count=num_obj)
+                with ExitStack() as exs:
+                    pack_objs = [exs.enter_context(s) for s in pack_objs]
+                    pack_sha, index_sha = PackEntity.write_pack(pack_objs, pfile.write, iwrite, object_count=num_obj)
             assert os.path.getsize(ppath) > 100
 
             # verify pack
-            with PackFile(ppath) as pf:  # FIXME: Leaks file-pointer(s)!
+            with PackFile(ppath) as pf:
                 assert pf.size() == len(pack_objs)
                 assert pf.version() == PackFile.pack_version_default
                 assert pf.checksum() == pack_sha
diff --git a/gitdb/test/test_util.py b/gitdb/test/test_util.py
index 847bdab..2fdbf35 100644
--- a/gitdb/test/test_util.py
+++ b/gitdb/test/test_util.py
@@ -3,14 +3,14 @@
 # This module is part of GitDB and is released under
 # the New BSD License: http://www.opensource.org/licenses/bsd-license.php
 """Test for object db"""
-import tempfile
 import os
+import tempfile
 
+from gitdb.const import NULL_HEX_SHA
 from gitdb.test.lib import TestBase
 from gitdb.util import (
     to_hex_sha,
     to_bin_sha,
-    NULL_HEX_SHA,
     LockedFD
 )
diff --git a/gitdb/util.py b/gitdb/util.py
index 0a77ea2..d994790 100644
--- a/gitdb/util.py
+++ b/gitdb/util.py
@@ -19,11 +19,6 @@
     SlidingWindowMapBuffer
 )
 
-from gitdb.const import (
-    NULL_BIN_SHA,
-    NULL_HEX_SHA
-)
-
 
 # initialize our global memory manager instance
 # Use it to free cached (and unused) resources.
@@ -33,24 +28,6 @@
 mman = SlidingWindowMapManager()
 # END handle mman
 
-
-try:
-    from struct import unpack_from
-except ImportError:
-    from struct import unpack, calcsize
-    __calcsize_cache = dict()
-
-    def unpack_from(fmt, data, offset=0):
-        try:
-            size = __calcsize_cache[fmt]
-        except KeyError:
-            size = calcsize(fmt)
-            __calcsize_cache[fmt] = size
-        # END exception handling
-        return unpack(fmt, data[offset: offset + size])
-    # END own unpack_from implementation
-
-
 #{ Aliases
 
 hex_to_bin = binascii.a2b_hex
@@ -78,8 +55,6 @@
 is_win = (os.name == 'nt')
 is_darwin = (os.name == 'darwin')
 
-# Backwards compatibility imports
-
 #} END Aliases
 
 log = logging.getLogger(__name__)
@@ -136,7 +111,7 @@ def onerror(func, path, exc_info):
     try:
         func(path)  # Will scream if still not possible to delete.
-        except Exception as ex:
+        except Exception:
             raise
     return shutil.rmtree(path, False, onerror)
@@ -149,7 +124,7 @@ def make_sha(source=''.encode("ascii")):
     try:
         return hashlib.sha1(source)
     except NameError:
-        import sha
+        import sha  # @UnresolvedImport
         sha1 = sha.sha(source)
         return sha1
diff --git a/gitdb/utils/compat.py b/gitdb/utils/compat.py
index a7899cb..501b356 100644
--- a/gitdb/utils/compat.py
+++ b/gitdb/utils/compat.py
@@ -1,10 +1,11 @@
 import sys
+
 PY3 = sys.version_info[0] == 3
 
 try:
     from itertools import izip
-    xrange = xrange
+    xrange = xrange  # @UndefinedVariable
 except ImportError:
     # py3
     izip = zip
@@ -13,8 +14,9 @@
 
 try:
     # Python 2
-    buffer = buffer
-    memoryview = buffer
+    buffer = buffer  # @UndefinedVariable
+    memoryview = buffer  # @ReservedAssignment
+    # Assume no memory view ...
 
     def to_bytes(i):
         return i
@@ -29,15 +31,37 @@ def buffer(obj, offset, size=None):
         # return memoryview(obj)[offset:offset+size]
         return obj[offset:offset + size]
     # end buffer reimplementation
-    # smmap can return memory view objects, which can't be compared as buffers/bytes can ...
+    # smmap can return memory view objects, which can't be compared as buffers/bytes can ...
+
     def to_bytes(i):
         if isinstance(i, memoryview):
             return i.tobytes()
         return i
 
-    memoryview = memoryview
+    memoryview = memoryview  # @ReservedAssignment
 
 try:
-    MAXSIZE = sys.maxint
+    MAXSIZE = sys.maxint  # @UndefinedVariable
 except AttributeError:
     MAXSIZE = sys.maxsize
+
+try:
+    from contextlib import ExitStack
+except ImportError:
+    from contextlib2 import ExitStack  # @UnusedImport
+
+try:
+    from struct import unpack_from  # @UnusedImport
+except ImportError:
+    from struct import unpack, calcsize
+    __calcsize_cache = dict()
+
+    def unpack_from(fmt, data, offset=0):
+        try:
+            size = __calcsize_cache[fmt]
+        except KeyError:
+            size = calcsize(fmt)
+            __calcsize_cache[fmt] = size
+        # END exception handling
+        return unpack(fmt, data[offset: offset + size])
+    # END own unpack_from implementation
diff --git a/requirements.txt b/requirements.txt
index ed4898e..835629f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,2 @@
 smmap>=0.8.3
+contextlib2; python_version <= "2.7"
diff --git a/setup.py b/setup.py
index f7e3761..8f35112 100755
--- a/setup.py
+++ b/setup.py
@@ -20,7 +20,10 @@
     packages=('gitdb', 'gitdb.db', 'gitdb.utils', 'gitdb.test'),
     license="BSD License",
     zip_safe=False,
-    install_requires=['smmap2 >= 2.0.0'],
+    install_requires=['smmap2 >= 2.1.0'],
+    extras_require={
+        ':python_version <= "2.7"': ['contextlib2'],
+    },
     long_description="""GitDB is a pure-Python git object database""",
     # See https://pypi.python.org/pypi?%3Aaction=list_classifiers
     classifiers=[

From 7aa959077cc401b4b1a79150e4f9f83405d002ec Mon Sep 17 00:00:00 2001
From: Kostis Anagnostopoulos
Date: Tue, 25 Oct 2016 16:10:54 +0200
Subject: [PATCH 15/21] doc(changes): describe v2.1.0 API changes

---
 doc/source/changes.rst | 69 +++++++++++++++++++++++++++++++++---------
 1 file changed, 55 insertions(+), 14 deletions(-)

diff --git a/doc/source/changes.rst b/doc/source/changes.rst
index 22deb6d..3f88481 100644
--- a/doc/source/changes.rst
+++ b/doc/source/changes.rst
@@ -2,42 +2,83 @@
 Changelog
 #########
 
-*****
+2.1.0
+=====
+
+* **BREAKING API:** retrofit streams and (internal) packers as context-managers.
+
+  Specifically, if you are using the packers directly
+  (``git.pack.PackIndexFile``, ``git.pack.PackFile`` & ``git.pack.PackEntity``),
+  they must always be used from within a ``with ..`` block, or else
+  you will get missing *mmap-cursor* errors.
+
+  .. Tip::
+
+     You can "enter" ``PackIndexFile`` & ``PackFile`` multiple times, but ``PackEntity``
+     only once, to detect and avoid sloppy deep-nesting.
+     Since the ``git.pack.PackEntity`` class just coalesces ``PackIndexFile`` & ``PackFile``,
+     you may "enter" either each internal packer separately, or the entity only once.
+
+* **BREAKING API:** some utilities moved between ``git.util``, ``git.const`` & ``git.utils.compat``.
+* Fix (probably) all leaks on Windows.
+
+  .. Note::
+
+     The problem is that on Linux, any open files go unnoticed, or are collected by the GC.
+     But on *Windows* (and specifically on PY3, where GC is not deterministic),
+     the underlying files cannot be deleted due to *access violations*.
+
+     That's a Good-thing\ |copy|, because it is dangerous to *leak* memory-mapped handles.
+     Actually *Windows* may leak them even after the processes which created them have died,
+     needing a full restart(!) to clean them up (signing out is not enough).
+
+
+* Stop importing the *smmap* submodule *at runtime* - the submodule has been deleted
+  completely from the sources.
+
+  .. Tip::
+
+     Developers now have to specify the dependency on *smmap* in the ``requirements.txt`` file,
+     and remember to update it before a final release.
+
+* Run TCs also on Appveyor.
+
+
 0.6.1
-*****
+=====
 * Fixed possibly critical error, see https://github.com/gitpython-developers/GitPython/issues/220
 
   - However, it only seems to occur on high-entropy data and didn't reoccur after the fix
-*****
+
 0.6.0
-*****
+=====
 * Added support for python 3.X
 * Removed all `async` dependencies and all `*_async` versions of methods with it.
-*****
+
 0.5.4
-*****
+=====
 * Adjusted implementation to use the SlidingMemoryManager by default in python 2.6 for efficiency
   reasons. In Python 2.4, the StaticMemoryManager will be used instead.
-*****
+
 0.5.3
-*****
+=====
 * Added support for smmap. SmartMMap allows resources to be managed and controlled.
   This brings the implementation closer to the way git handles memory maps, such that unused
   cached memory maps will automatically be freed once a resource limit is hit.
   The memory limit on 32 bit systems remains though as a sliding mmap implementation
   is not used for performance reasons.
-*****
+
 0.5.2
-*****
+=====
 * Improved performance of the c implementation, which now uses reverse-delta-aggregation to
   make a memory bound operation CPU bound.
-*****
+
 0.5.1
-*****
+=====
 * Restored most basic python 2.4 compatibility, such that gitdb can be imported within python 2.4,
   pack access cannot work though. This at least allows Super-Projects to provide their own
   workarounds, or use everything but pack support.
-*****
+
 0.5.0
-*****
+=====
 Initial Release
+
+
+.. |copy| unicode:: U+000A9 .. COPYRIGHT SIGN
\ No newline at end of file
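To make the new contract concrete, here is a minimal sketch of the pattern the changelog entry above mandates; the pack path is hypothetical, while ``PackEntity`` and ``info_iter()`` are the calls exercised by the tests later in this series:

    from gitdb.pack import PackEntity

    # Entering the entity acquires the mmap cursors for the pack and its
    # index; leaving the block releases them again.
    with PackEntity('path/to/pack-deadbeef.pack') as entity:
        for info in entity.info_iter():
            print(info.type, info.size)
    # Using `entity` outside the block would raise the missing
    # mmap-cursor errors described above.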

From efaa6a1ab2273ff1f2547de23ee95f4c8cfcad38 Mon Sep 17 00:00:00 2001
From: Kostis Anagnostopoulos
Date: Tue, 25 Oct 2016 16:14:15 +0200
Subject: [PATCH 16/21] chore(deps): pin dev dependencies in requirements.txt

[travisci skip]
---
 .appveyor.yml    | 2 --
 .travis.yml      | 3 ---
 requirements.txt | 5 ++++-
 3 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/.appveyor.yml b/.appveyor.yml
index bfc5eff..5faf521 100644
--- a/.appveyor.yml
+++ b/.appveyor.yml
@@ -37,8 +37,6 @@ install:
       git config --global user.email "travis@ci.com"
       git config --global user.name "Travis Runner"
 
-  ## Install any dev-requirements (FIXME: remove on release)
-  - pip install -I git+https://github.com/ankostis/smmap.git@v2.1.0.dev1
   - pip install -r requirements.txt
 
 build: false
diff --git a/.travis.yml b/.travis.yml
index 2f8e407..4f0c129 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -12,9 +12,6 @@ git:
   depth: 1000
 install:
 - pip install coveralls
-
-## Install any dev-requirements (FIXME: remove on release)
-- pip install -I git+https://github.com/ankostis/smmap.git@v2.1.0.dev1
 - pip install -r requirements.txt
 script:
 - ulimit -n 48
diff --git a/requirements.txt b/requirements.txt
index 835629f..39cb6a3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,5 @@
-smmap>=0.8.3
 contextlib2; python_version <= "2.7"
+#smmap>=2.1.0
+
+## DEV-requirements (FIXME: remove on release)
+-e git+https://github.com/ankostis/smmap.git@v2.1.0.dev1#egg=smmap

From fa24dce28f4237da9c63b2f51b60bad83fa48ba9 Mon Sep 17 00:00:00 2001
From: Kostis Anagnostopoulos
Date: Mon, 24 Oct 2016 22:39:39 +0200
Subject: [PATCH 17/21] feat(streams): use named-tuples

speedups:
+ 3.5: ~5% faster
+ 3.4: ~15% faster
+ 3.3/2.7/2.6: the same

BREAKING API 1: `XXOStream` is not an instance of `XXOInfo` anymore.
BREAKING API 2: `XXOInfo`/`XXOStream` are not inheritable
  (i.e. no `DeriveTest` class).
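The retrofit replaces the hand-rolled ``tuple`` subclasses with ``namedtuple`` bases. A minimal, self-contained sketch of the idea - the type table here is an illustrative stand-in for gitdb's real ``type_to_type_id_map``:

    from collections import namedtuple

    type_to_type_id_map = {b'blob': 3}  # stand-in table, for illustration only

    class OInfoSketch(namedtuple('OInfoSketch', 'binsha, type, size')):
        __slots__ = ()  # no per-instance __dict__; instances stay tuple-sized

        @property
        def type_id(self):
            return type_to_type_id_map[self.type]

    info = OInfoSketch(b'\x00' * 20, b'blob', 42)
    assert info.size == 42 and info[2] == 42  # named and indexed access agree
    assert info.type_id == 3

Field access on a ``namedtuple`` is essentially plain tuple indexing implemented in C, which is where the ~5-15% speedups quoted above come from on newer interpreters.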
---
 gitdb/base.py           | 143 +++++++++++-----------------------------
 gitdb/test/db/lib.py    |   2 +-
 gitdb/test/lib.py       |  12 ----
 gitdb/test/test_base.py |   4 ----
 4 files changed, 38 insertions(+), 123 deletions(-)

diff --git a/gitdb/base.py b/gitdb/base.py
index 60e169d..b9f65de 100644
--- a/gitdb/base.py
+++ b/gitdb/base.py
@@ -4,6 +4,8 @@
 # the New BSD License: http://www.opensource.org/licenses/bsd-license.php
 """Module with basic data structures - they are designed to be lightweight and fast"""
 from gitdb.util import bin_to_hex, suppress
+from collections import namedtuple
+
 
 from gitdb.fun import (
     type_id_to_type_map,
@@ -17,7 +19,7 @@
 
 #{ ODB Bases
 
-class OInfo(tuple):
+class OInfo(namedtuple('OInfo', 'binsha, type, size')):
 
     """Carries information about an object in an ODB, providing information
     about the binary sha of the object, the type_string as well as the uncompressed size
@@ -30,40 +32,19 @@ class OInfo(tuple):
         assert dbi[2] == dbi.size
 
     The type is designed to be as lightweight as possible."""
-    __slots__ = tuple()
-
-    def __new__(cls, sha, type, size):
-        return tuple.__new__(cls, (sha, type, size))
-
-    def __init__(self, *args):
-        tuple.__init__(self)
-
-    #{ Interface
-
-    @property
-    def binsha(self):
-        """:return: our sha as binary, 20 bytes"""
-        return self[0]
+    __slots__ = ()
 
     @property
     def hexsha(self):
         """:return: our sha, hex encoded, 40 bytes"""
-        return bin_to_hex(self[0])
-
-    @property
-    def type(self):
-        return self[1]
+        return bin_to_hex(self.binsha)
 
     @property
     def type_id(self):
-        return type_to_type_id_map[self[1]]
+        return type_to_type_id_map[self.type]
 
-    @property
-    def size(self):
-        return self[2]
-
-    #} END interface
 
-class OPackInfo(tuple):
+class OPackInfo(namedtuple('OPackInfo', 'pack_offset, type_id, size')):
 
     """As OInfo, but provides a type_id property to retrieve the numerical type id, and
     does not include a sha.
@@ -71,68 +52,37 @@ class OPackInfo(tuple):
     Additionally, the pack_offset is the absolute offset into the packfile at which all object
     information is located. The data_offset property points to the absolute location in the pack at
     which that actual data stream can be found."""
-    __slots__ = tuple()
-
-    def __new__(cls, packoffset, type, size):
-        return tuple.__new__(cls, (packoffset, type, size))
-
-    def __init__(self, *args):
-        tuple.__init__(self)
-
-    #{ Interface
-
-    @property
-    def pack_offset(self):
-        return self[0]
+    __slots__ = ()
 
     @property
     def type(self):
-        return type_id_to_type_map[self[1]]
-
-    @property
-    def type_id(self):
-        return self[1]
-
-    @property
-    def size(self):
-        return self[2]
-
-    #} END interface
+        return type_id_to_type_map[self.type_id]
 
 
-class ODeltaPackInfo(OPackInfo):
+class ODeltaPackInfo(namedtuple('ODeltaPackInfo', 'pack_offset, type_id, size, delta_info')):
 
     """Adds delta specific information,
     either the 20 byte sha which points to some object in the database,
     or the negative offset from the pack_offset, so that pack_offset - delta_info yields
     the pack offset of the base object"""
-    __slots__ = tuple()
-
-    def __new__(cls, packoffset, type, size, delta_info):
-        return tuple.__new__(cls, (packoffset, type, size, delta_info))
+    __slots__ = ()
 
-    #{ Interface
     @property
-    def delta_info(self):
-        return self[3]
-    #} END interface
+    def type(self):
+        return type_id_to_type_map[self.type_id]
 
 
-class OStream(OInfo):
+class OStream(namedtuple('OStream', 'binsha, type, size, stream')):
 
     """Base for object streams retrieved from the database, providing additional information
     about the stream.
-    Generally, ODB streams are read-only as objects are immutable"""
-    __slots__ = tuple()
+    Generally, ODB streams are read-only as objects are immutable
 
-    def __new__(cls, sha, type, size, stream, *args, **kwargs):
-        """Helps with the initialization of subclasses"""
-        return tuple.__new__(cls, (sha, type, size, stream))
+    .. Note::
+        This is NOT an :class:`OInfo` instance; for the effort required to make it one,
+        see http://stackoverflow.com/questions/20794182/how-to-make-a-file-like-class-work-with-isinstancecls-io-iobase
 
-    def __init__(self, *args, **kwargs):
-        tuple.__init__(self)
-
-    #{ Stream Reader Interface
+    """
+    __slots__ = ()
 
     def __enter__(self):
         return self
@@ -148,38 +98,26 @@ def read(self, size=-1):
         return self.stream.read(size)
 
     @property
-    def stream(self):
-        return self[3]
+    def hexsha(self):
+        """:return: our sha, hex encoded, 40 bytes"""
+        return bin_to_hex(self.binsha)
 
-    #} END stream reader interface
+    @property
+    def type_id(self):
+        return type_to_type_id_map[self.type]
 
 
 class ODeltaStream(OStream):
-
-    """Uses size info of its stream, delaying reads"""
-
-    def __new__(cls, sha, type, size, stream, *args, **kwargs):
-        """Helps with the initialization of subclasses"""
-        return tuple.__new__(cls, (sha, type, size, stream))
-
-    #{ Stream Reader Interface
-
     @property
     def size(self):
         return self[3].size
 
-    #} END stream reader interface
-
 
-class OPackStream(OPackInfo):
+class OPackStream(namedtuple('OPackStream', 'pack_offset, type_id, size, stream')):
 
     """Next to pack object information, a stream outputting an undeltified base object
     is provided"""
-    __slots__ = tuple()
-
-    def __new__(cls, packoffset, type, size, stream, *args):
-        """Helps with the initialization of subclasses"""
-        return tuple.__new__(cls, (packoffset, type, size, stream))
+    __slots__ = ()
 
     def __enter__(self):
         return self
@@ -191,23 +129,18 @@ def __exit__(self, exc_type, exc_value, traceback):
     def close(self):
         self.stream.close()
 
-    #{ Stream Reader Interface
     def read(self, size=-1):
         return self.stream.read(size)
 
     @property
-    def stream(self):
-        return self[3]
-    #} END stream reader interface
+    def type(self):
+        return type_id_to_type_map[self.type_id]
 
 
-class ODeltaPackStream(ODeltaPackInfo):
+class ODeltaPackStream(namedtuple('ODeltaPackStream', 'pack_offset, type_id, size, delta_info, stream')):
 
     """Provides a stream outputting the uncompressed offset delta information"""
-    __slots__ = tuple()
-
-    def __new__(cls, packoffset, type, size, delta_info, stream):
-        return tuple.__new__(cls, (packoffset, type, size, delta_info, stream))
+    __slots__ = ()
 
     def __enter__(self):
         return self
@@ -219,14 +152,12 @@ def __exit__(self, exc_type, exc_value, traceback):
     def close(self):
         self.stream.close()
 
-    #{ Stream Reader Interface
     def read(self, size=-1):
         return self.stream.read(size)
 
     @property
-    def stream(self):
-        return self[4]
-    #} END stream reader interface
+    def type(self):
+        return type_id_to_type_map[self.type_id]
 
 
 class IStream(list):
@@ -238,7 +169,7 @@ class IStream(list):
     to blend in without prior conversion.
 
     The only method your content stream must support is 'read'"""
-    __slots__ = tuple()
+    __slots__ = ()
 
     def __new__(cls, type, size, stream, sha=None):
         return list.__new__(cls, (sha, type, size, stream, None))
@@ -325,7 +256,7 @@ class InvalidOInfo(tuple):
 
     """Carries information about a sha identifying an object which is invalid in
     the queried database. The exception attribute provides more information about the cause of the issue"""
-    __slots__ = tuple()
+    __slots__ = ()
 
     def __new__(cls, sha, exc):
         return tuple.__new__(cls, (sha, exc))
@@ -350,7 +281,7 @@ def error(self):
 
 class InvalidOStream(InvalidOInfo):
 
     """Carries information about an invalid ODB stream"""
-    __slots__ = tuple()
+    __slots__ = ()
 
     def __enter__(self):
         return self
diff --git a/gitdb/test/db/lib.py b/gitdb/test/db/lib.py
index 574bd9b..b4cccf6 100644
--- a/gitdb/test/db/lib.py
+++ b/gitdb/test/db/lib.py
@@ -49,7 +49,7 @@ def _assert_object_writing_simple(self, db):
             assert db.has_object(istream.binsha)
 
             info = db.info(istream.binsha)
-            assert isinstance(info, OInfo)
+            assert isinstance(info, (OInfo, OStream))
             assert info.type == istream.type and info.size == istream.size
 
             with db.stream(istream.binsha) as stream:
diff --git a/gitdb/test/lib.py b/gitdb/test/lib.py
index cc369cd..8185ca4 100644
--- a/gitdb/test/lib.py
+++ b/gitdb/test/lib.py
@@ -18,7 +18,6 @@
 import tempfile
 import unittest
 
-from gitdb.base import OStream
 from gitdb.test import HIDE_WINDOWS_KNOWN_ERRORS
 from gitdb.util import rmtree, mman
 from gitdb.utils.compat import xrange
@@ -204,15 +203,4 @@ def close(self):
 
     def _assert(self):
         assert self.was_read
-
-
-class DeriveTest(OStream):
-
-    def __init__(self, sha, type, size, stream, *args, **kwargs):
-        self.myarg = kwargs.pop('myarg')
-        self.args = args
-
-    def _assert(self):
-        assert self.args
-        assert self.myarg
-
 #} END stream utilities
diff --git a/gitdb/test/test_base.py b/gitdb/test/test_base.py
index d8e70d8..0a3d03b 100644
--- a/gitdb/test/test_base.py
+++ b/gitdb/test/test_base.py
@@ -16,7 +16,6 @@
 from gitdb.test.lib import (
     TestBase,
     DummyStream,
-    DeriveTest,
 )
 from gitdb.typ import (
     str_blob_type
 )
@@ -74,9 +73,6 @@ def test_streams(self):
         stream._assert()
         assert stream.bytes == 5
 
-        # derive with own args
-        DeriveTest(sha, str_blob_type, s, stream, 'mine', myarg=3)._assert()
-
         # test istream
         istream = IStream(str_blob_type, s, stream)
         assert istream.binsha is None

From 7524f69aacc159e1e32d6b1aeb8f89e612abe199 Mon Sep 17 00:00:00 2001
From: Kostis Anagnostopoulos
Date: Tue, 25 Oct 2016 17:28:03 +0200
Subject: [PATCH 18/21] fix(compat): PY3-check must hold even for PY4

---
 gitdb/utils/compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gitdb/utils/compat.py b/gitdb/utils/compat.py
index 501b356..93e9a7d 100644
--- a/gitdb/utils/compat.py
+++ b/gitdb/utils/compat.py
@@ -1,7 +1,7 @@
 import sys
 
-PY3 = sys.version_info[0] == 3
+PY3 = sys.version_info[0] >= 3
 
 try:
     from itertools import izip
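The one-character change above is easy to gloss over; a tiny sketch of why the comparison direction matters (the major versions are illustrative):

    import sys

    PY3 = sys.version_info[0] >= 3  # the fixed check

    # `== 3` would flip back to False under a hypothetical Python 4 and
    # silently re-enable the Python-2 fallbacks; `>= 3` stays True for
    # every later major version.
    for major in (2, 3, 4):
        print(major, major == 3, major >= 3)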
From c63db6907f8a4c5723604c4b4fa317f950a6eb2c Mon Sep 17 00:00:00 2001
From: Kostis Anagnostopoulos
Date: Tue, 25 Oct 2016 19:35:00 +0200
Subject: [PATCH 19/21] refact(elapsed): avoid div0 when time-elapsed is too
 small

---
 gitdb/test/performance/test_pack.py           | 20 +++++++++----------
 gitdb/test/performance/test_pack_streaming.py | 12 +++++------
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/gitdb/test/performance/test_pack.py b/gitdb/test/performance/test_pack.py
index a2bd99f..b3dfd40 100644
--- a/gitdb/test/performance/test_pack.py
+++ b/gitdb/test/performance/test_pack.py
@@ -32,10 +32,10 @@ def test_pack_random_access(self):
         # sha lookup
         st = time()
         sha_list = list(pdb.sha_iter())
-        elapsed = time() - st
+        elapsed = max(time() - st, 0.001)  # prevent zero division errors on Windows
         ns = len(sha_list)
         print("PDB: looked up %i shas by index in %f s ( %f shas/s )" % (
-            ns, elapsed, ns / (elapsed or 1)), file=sys.stderr)
+            ns, elapsed, ns / elapsed), file=sys.stderr)
 
         # sha lookup: best-case and worst case access
         pdb_pack_info = pdb._pack_info
@@ -44,13 +44,13 @@ def test_pack_random_access(self):
         for sha in sha_list:
             pdb_pack_info(sha)
         # END for each sha to look up
-        elapsed = time() - st
+        elapsed = max(time() - st, 0.001)  # prevent zero division errors on Windows
 
         # discard cache
         del(pdb._entities)
         pdb.entities()
         print("PDB: looked up %i sha in %i packs in %f s ( %f shas/s )" %
-              (ns, len(pdb.entities()), elapsed, ns / (elapsed or 1)), file=sys.stderr)
+              (ns, len(pdb.entities()), elapsed, ns / elapsed), file=sys.stderr)
         # END for each random mode
 
         # query info and streams only
@@ -59,9 +59,9 @@
             st = time()
             for sha in sha_list[:max_items]:
                 pdb_fun(sha)
-            elapsed = time() - st
+            elapsed = max(time() - st, 0.001)  # prevent zero division errors on Windows
             print("PDB: Obtained %i object %s by sha in %f s ( %f items/s )" %
-                  (max_items, pdb_fun.__name__.upper(), elapsed, max_items / (elapsed or 1)), file=sys.stderr)
+                  (max_items, pdb_fun.__name__.upper(), elapsed, max_items / elapsed), file=sys.stderr)
         # END for each function
 
         # retrieve stream and read all
@@ -74,11 +74,11 @@
             read_len = len(stream.read())
             assert read_len == stream.size
             total_size += stream.size
-        elapsed = time() - st
+        elapsed = max(time() - st, 0.001)  # prevent zero division errors on Windows
         total_kib = total_size / 1000
         print("PDB: Obtained %i streams by sha and read all bytes "
               "totalling %i KiB ( %f KiB / s ) in %f s ( %f streams/s )" %
-              (max_items, total_kib, total_kib / (elapsed or 1), elapsed, max_items / (elapsed or 1)), file=sys.stderr)
+              (max_items, total_kib, total_kib / elapsed, elapsed, max_items / elapsed), file=sys.stderr)
 
     def test_loose_correctness(self):
         """based on the pack(s) of our packed object DB, we will just copy and verify all objects in the back
@@ -130,7 +130,7 @@
             # END ignore old indices
             # END for each index
         # END for each entity
-        elapsed = time() - st
+        elapsed = max(time() - st, 0.001)  # prevent zero division errors on Windows
         print("PDB: verified %i objects (crc=%i) in %f s ( %f objects/s )" %
-              (count, crc, elapsed, count / (elapsed or 1)), file=sys.stderr)
+              (count, crc, elapsed, count / elapsed), file=sys.stderr)
         # END for each verify mode
diff --git a/gitdb/test/performance/test_pack_streaming.py b/gitdb/test/performance/test_pack_streaming.py
index 5ed57dc..ab07873 100644
--- a/gitdb/test/performance/test_pack_streaming.py
+++ b/gitdb/test/performance/test_pack_streaming.py
@@ -49,9 +49,9 @@ def test_pack_writing(self):
             if count == ni:
                 break
         # END gather objects for pack-writing
-        elapsed = time() - st
+        elapsed = max(time() - st, 0.001)  # prevent zero division errors on Windows
         print("PDB Streaming: Got %i streams by sha in %f s ( %f streams/s )" %
-              (ni, elapsed, ni / (elapsed or 1)), file=sys.stderr)
+              (ni, elapsed, ni / elapsed), file=sys.stderr)
 
         st = time()
         ## We are leaking files here, but we don't care...
@@ -60,10 +60,10 @@ def test_pack_writing(self):
             all_streams = [exs.enter_context(pdb.stream(sha))
                            for sha in pdb.sha_iter()]
             PackEntity.write_pack(all_streams, ostream.write, object_count=ni)
-        elapsed = time() - st
+        elapsed = max(time() - st, 0.001)  # prevent zero division errors on Windows
         total_kb = ostream.bytes_written() / 1000
         print(sys.stderr, "PDB Streaming: Wrote pack of size %i kb in %f s (%f kb/s)" %
-              (total_kb, elapsed, total_kb / (elapsed or 1)), sys.stderr)
+              (total_kb, elapsed, total_kb / elapsed), sys.stderr)
 
     def test_stream_reading(self):
         # raise SkipTest()
@@ -82,8 +82,8 @@ def test_stream_reading(self):
             stream.read()
             total_size += stream.size
             count += 1
-        elapsed = time() - st
+        elapsed = max(time() - st, 0.001)  # prevent zero division errors on Windows
         total_kib = total_size / 1000
         print(sys.stderr, "PDB Streaming: Got %i streams by sha and read all bytes "
               "totalling %i KiB ( %f KiB / s ) in %f s ( %f streams/s )" %
-              (ni, total_kib, total_kib / (elapsed or 1), elapsed, ni / (elapsed or 1)), sys.stderr)
+              (ni, total_kib, total_kib / elapsed, elapsed, ni / elapsed), sys.stderr)
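Every hunk in this patch applies the same idiom: clamp the measured wall-time before dividing by it, because on Windows ``time.time()`` is coarse enough that two consecutive calls can return the very same value. A standalone sketch of the idiom, with ``work`` as a hypothetical placeholder:

    from time import time

    def items_per_second(work, items=1000):
        st = time()
        for _ in range(items):
            work()
        # never divide by a zero elapsed-time on coarse clocks
        elapsed = max(time() - st, 0.001)
        return items / elapsed

    print(items_per_second(lambda: None))

Compared with the earlier ``(elapsed or 1)`` fallback, the 1 ms floor distorts the reported rates far less when the loop really finishes between two clock ticks.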
From 40199bad6862f21527c322f41653179c700edea3 Mon Sep 17 00:00:00 2001
From: Kostis Anagnostopoulos
Date: Thu, 27 Oct 2016 12:36:24 +0200
Subject: [PATCH 20/21] refact(util): obtain global `util.mman` from the mman
 module

---
 gitdb/util.py | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/gitdb/util.py b/gitdb/util.py
index d994790..6ac8fc0 100644
--- a/gitdb/util.py
+++ b/gitdb/util.py
@@ -14,20 +14,11 @@
 import sys
 
 from smmap import (
-    StaticWindowMapManager,
-    SlidingWindowMapManager,
+    managed_mmaps,
     SlidingWindowMapBuffer
 )
 
-# initialize our global memory manager instance
-# Use it to free cached (and unused) resources.
-if sys.version_info < (2, 6):
-    mman = StaticWindowMapManager()
-else:
-    mman = SlidingWindowMapManager()
-# END handle mman
-
 
 #{ Aliases
 
 hex_to_bin = binascii.a2b_hex
@@ -59,6 +50,9 @@
 
 log = logging.getLogger(__name__)
 
+#: Global MemoryManager, remember to ``mman.collect()`` it.
+mman = managed_mmaps(check_entered=False)
+
 
 #{ compatibility stuff ...
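With the manager now obtained from ``managed_mmaps()``, clients are expected to release cached resources explicitly; a minimal sketch, based only on the ``mman.collect()`` hint in the comment above:

    from gitdb.util import mman

    # ... open databases, map packs, read objects ...
    mman.collect()  # drop cached, unused mmap windows - e.g. before
                    # deleting the mapped files on Windows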
From fa89615868888fe656ba7bdda2a2fb4baa701349 Mon Sep 17 00:00:00 2001
From: Kostis Anagnostopoulos
Date: Thu, 27 Oct 2016 11:45:51 +0200
Subject: [PATCH 21/21] chore(ver): bump 2.0.0.dev1-->2.1.0.dev3, and more

+ chore(deps):
  + smmap2-v2.1.0.dev4 (FIXes memoryview leak).
  + FIX quoted environment marker in requirements.
  + actually import smmap2!
---
 gitdb/__init__.py       | 2 +-
 gitdb/test/test_pack.py | 5 -----
 requirements.txt        | 8 ++++----
 setup.py                | 6 ++----
 4 files changed, 7 insertions(+), 14 deletions(-)

diff --git a/gitdb/__init__.py b/gitdb/__init__.py
index 33229ce..9244074 100644
--- a/gitdb/__init__.py
+++ b/gitdb/__init__.py
@@ -10,7 +10,7 @@
 __author__ = "Sebastian Thiel"
 __contact__ = "byronimo@gmail.com"
 __homepage__ = "https://github.com/gitpython-developers/gitdb"
-version_info = (2, 1, 0, 'dev1')
+version_info = (2, 1, 0, 'dev3')
 __version__ = '.'.join(str(i) for i in version_info)
diff --git a/gitdb/test/test_pack.py b/gitdb/test/test_pack.py
index 5efe80c..f1e885b 100644
--- a/gitdb/test/test_pack.py
+++ b/gitdb/test/test_pack.py
@@ -139,11 +139,6 @@ def test_pack(self):
             self._assert_pack_file(pack, version, size)
         # END for each pack to test
 
-        ## Unless HIDE_WINDOWS_KNOWN_ERRORS, on Windows fails with:
-        #      File "D:\Work\gitdb.git\gitdb\util.py", line 141, in onerror
-        #        func(path)  # Will scream if still not possible to delete.
-        #    PermissionError: [WinError 32] The process cannot access the file
-        #    because it is being used by another process: 'sss\\index_cc_wll5'
     @with_rw_directory
     def test_pack_entity(self, rw_dir):
         pack_objs = []
diff --git a/requirements.txt b/requirements.txt
index 39cb6a3..b74364a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
-contextlib2; python_version <= "2.7"
-#smmap>=2.1.0
+contextlib2; python_version<='2.7'
+#smmap2>=2.1.0
 
-## DEV-requirements (FIXME: remove on release)
--e git+https://github.com/ankostis/smmap.git@v2.1.0.dev1#egg=smmap
+## DEV-requirement: FIX memleak (FIXME: remove on release)
+-e git+https://github.com/ankostis/smmap.git@v2.1.0.dev4#egg=smmap2
diff --git a/setup.py b/setup.py
index 8f35112..84128fe 100755
--- a/setup.py
+++ b/setup.py
@@ -20,10 +20,8 @@
     packages=('gitdb', 'gitdb.db', 'gitdb.utils', 'gitdb.test'),
     license="BSD License",
     zip_safe=False,
-    install_requires=['smmap2 >= 2.1.0'],
-    extras_require={
-        ':python_version <= "2.7"': ['contextlib2'],
-    },
+    install_requires=['smmap2 >= 2.0.0'],
+    extras_require={':python_version=="2.7"': ['contextlib2']},
     long_description="""GitDB is a pure-Python git object database""",
     # See https://pypi.python.org/pypi?%3Aaction=list_classifiers
     classifiers=[