Skip to content

Commit

Permalink
improve buffering efficiency (#427)
Browse files Browse the repository at this point in the history
* improve buffering efficiency

* add benchmarks

Before:

------------------------------------------- benchmark: 1 tests -------------------------------------------
Name (time in s)        Min      Max    Mean  StdDev  Median     IQR  Outliers     OPS  Rounds  Iterations
----------------------------------------------------------------------------------------------------------
test                 4.8925  10.1093  5.9906  2.3032  5.0104  1.3963       1;1  0.1669       5           1
----------------------------------------------------------------------------------------------------------

After:

------------------------------------------- benchmark: 1 tests ------------------------------------------
Name (time in s)        Min     Max    Mean  StdDev  Median     IQR  Outliers     OPS  Rounds  Iterations
---------------------------------------------------------------------------------------------------------
test                 4.9611  9.7707  5.9822  2.1190  5.0280  1.3168       1;1  0.1672       5           1
---------------------------------------------------------------------------------------------------------

* remove unused import
  • Loading branch information
mpenkov authored Mar 15, 2020
1 parent bdb3fb0 commit 85a67ee
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 5 deletions.
23 changes: 23 additions & 0 deletions integration-tests/test_s3_buffering.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from smart_open import open


def read_bytes(url, limit):
    """Read *limit* bytes from *url* one byte at a time.

    Returns a list of single-byte ``bytes`` objects (entries past EOF
    are empty ``b''`` strings, matching ``read(1)`` semantics).
    """
    with open(url, 'rb') as fin:
        return [fin.read(1) for _ in range(limit)]


def test(benchmark):
    """Benchmark byte-at-a-time reads against a large public S3 object.

    Uses the pytest-benchmark ``benchmark`` fixture to time
    ``read_bytes`` and sanity-checks that the expected number of
    reads was performed.
    """
    # How many single-byte reads to perform per benchmark round.
    limit = 1000000
    #
    # This file is around 850MB.
    #
    url = (
        's3://commoncrawl/crawl-data/CC-MAIN-2019-51/segments/1575541319511.97'
        '/warc/CC-MAIN-20191216093448-20191216121448-00559.warc.gz'
    )
    result = benchmark(read_bytes, url, limit)
    assert len(result) == limit
6 changes: 1 addition & 5 deletions smart_open/s3.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,10 +324,6 @@ def read(self, size=-1):
if self._eof:
return self._read_from_buffer()

#
# Fill our buffer to the required size.
#
# logger.debug('filling %r byte-long buffer up to %r bytes', len(self._buffer), size)
self._fill_buffer(size)
return self._read_from_buffer(size)

Expand Down Expand Up @@ -440,7 +436,7 @@ def _read_from_buffer(self, size=-1):
return part

def _fill_buffer(self, size=-1):
size = size if size >= 0 else self._buffer._chunk_size
size = max(size, self._buffer._chunk_size)
while len(self._buffer) < size and not self._eof:
bytes_read = self._buffer.fill(self._raw_reader)
if bytes_read == 0:
Expand Down

0 comments on commit 85a67ee

Please sign in to comment.