diff --git a/howto.md b/howto.md
index 830067dd..9235200b 100644
--- a/howto.md
+++ b/howto.md
@@ -66,3 +66,36 @@ Writing example:
 >>> os.unlink(tmp.name) # comment this line to keep the file for later
 
 ```
+
+## How to Access S3 Object Properties
+
+When working with AWS S3, you may want to look beyond the abstraction
+provided by `smart_open` and communicate with `boto3` directly in order to
+satisfy your use case.
+
+For example:
+
+- Access the object's properties, such as the content type, timestamp of the last change, etc.
+- Access version information for the object (versioned buckets only)
+- Copy the object to another location
+- Apply an ACL to the object
+- and anything else specified in the [boto3 S3 Object API](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#object).
+
+To enable such use cases, the file-like objects returned by `smart_open` have a special `to_boto3` method.
+This returns a `boto3.s3.Object` that you can work with directly.
+For example, let's get the content type of a publicly available file:
+
+```python
+>>> from smart_open import open
+>>> with open('s3://commoncrawl/robots.txt') as fin:
+...     print(fin.readline().rstrip())
+...     boto3_s3_object = fin.to_boto3()
+...     print(repr(boto3_s3_object))
+...     print(boto3_s3_object.content_type)  # Using the boto3 API here
+User-Agent: *
+s3.Object(bucket_name='commoncrawl', key='robots.txt')
+text/plain
+
+```
+
+This works only when reading and writing via S3.
diff --git a/smart_open/s3.py b/smart_open/s3.py
index db77e8b6..80948ad2 100644
--- a/smart_open/s3.py
+++ b/smart_open/s3.py
@@ -101,7 +101,8 @@ def open(
         Additional parameters to pass to boto3's initiate_multipart_upload
         function. For writing only.
     version_id: str, optional
-        Version of the object, used when reading object. If None, will fetch the most recent version.
+        Version of the object, used when reading object.
+        If None, will fetch the most recent version.
 
     """
     logger.debug('%r', locals())
@@ -237,6 +238,9 @@ def __init__(self, bucket, key, version_id=None, buffer_size=DEFAULT_BUFFER_SIZE
         if resource_kwargs is None:
             resource_kwargs = {}
 
+        self._session = session
+        self._resource_kwargs = resource_kwargs
+
         s3 = session.resource('s3', **resource_kwargs)
         self._object = s3.Object(bucket, key)
         self._version_id = version_id
@@ -343,6 +347,18 @@ def terminate(self):
         """Do nothing."""
         pass
 
+    def to_boto3(self):
+        """Create an **independent** `boto3.s3.Object` instance that points to
+        the same resource as this instance.
+
+        The created instance will re-use the session and resource parameters of
+        the current instance, but it will be independent: changes to the
+        `boto3.s3.Object` may not necessarily affect the current instance.
+
+        """
+        s3 = self._session.resource('s3', **self._resource_kwargs)
+        return s3.Object(self._object.bucket_name, self._object.key)
+
     #
     # Internal methods.
     #
@@ -373,13 +389,14 @@ def __init__(self, bucket, key, version_id=None, buffer_size=DEFAULT_BUFFER_SIZE
                  line_terminator=BINARY_NEWLINE, session=None, resource_kwargs=None):
         self._buffer_size = buffer_size
-        self._session = session
-        self._resource_kwargs = resource_kwargs
 
         if session is None:
             session = boto3.Session()
         if resource_kwargs is None:
             resource_kwargs = {}
+
+        self._session = session
+        self._resource_kwargs = resource_kwargs
 
         s3 = session.resource('s3', **resource_kwargs)
         self._object = s3.Object(bucket, key)
         self._version_id = version_id
@@ -477,8 +494,6 @@ def __init__(
             multipart_upload_kwargs=None,
     ):
 
-        self._session = session
-        self._resource_kwargs = resource_kwargs
         self._multipart_upload_kwargs = multipart_upload_kwargs
 
         if min_part_size < MIN_MIN_PART_SIZE:
@@ -492,6 +507,9 @@ def __init__(
         if multipart_upload_kwargs is None:
             multipart_upload_kwargs = {}
 
+        self._session = session
+        self._resource_kwargs = resource_kwargs
+
         s3 = session.resource('s3', **resource_kwargs)
         try:
             self._object = s3.Object(bucket, key)
@@ -581,6 +599,18 @@ def terminate(self):
         self._mp.abort()
         self._mp = None
 
+    def to_boto3(self):
+        """Create an **independent** `boto3.s3.Object` instance that points to
+        the same resource as this instance.
+
+        The created instance will re-use the session and resource parameters of
+        the current instance, but it will be independent: changes to the
+        `boto3.s3.Object` may not necessarily affect the current instance.
+
+        """
+        s3 = self._session.resource('s3', **self._resource_kwargs)
+        return s3.Object(self._object.bucket_name, self._object.key)
+
     #
     # Internal methods.
     #
diff --git a/smart_open/tests/test_s3.py b/smart_open/tests/test_s3.py
index a493af60..840b95a6 100644
--- a/smart_open/tests/test_s3.py
+++ b/smart_open/tests/test_s3.py
@@ -109,9 +109,10 @@ def ignore_resource_warnings():
 class SeekableRawReaderTest(unittest.TestCase):
 
     def setUp(self):
+        self._body = b'123456'
         self._local_resource = boto3.resource('s3', endpoint_url='http://localhost:5000')
         self._local_resource.Bucket(BUCKET_NAME).create()
-        self._local_resource.Object(BUCKET_NAME, KEY_NAME).put(Body=b'123456')
+        self._local_resource.Object(BUCKET_NAME, KEY_NAME).put(Body=self._body)
 
     def tearDown(self):
         self._local_resource.Object(BUCKET_NAME, KEY_NAME).delete()
@@ -289,6 +290,16 @@ def test_read0_does_not_return_data(self):
 
         self.assertEqual(data, b'')
 
+    def test_to_boto3(self):
+        contents = b'the spice melange\n'
+        put_to_bucket(contents=contents)
+
+        with smart_open.s3.BufferedInputBase(BUCKET_NAME, KEY_NAME) as fin:
+            returned_obj = fin.to_boto3()
+
+        boto3_body = returned_obj.get()['Body'].read()
+        self.assertEqual(contents, boto3_body)
+
 
 @maybe_mock_s3
 class BufferedOutputBaseTest(unittest.TestCase):
@@ -428,6 +439,16 @@ def test_flush_close(self):
         fout.flush()
         fout.close()
 
+    def test_to_boto3(self):
+        contents = b'the spice melange\n'
+
+        with smart_open.s3.open(BUCKET_NAME, KEY_NAME, 'wb') as fout:
+            fout.write(contents)
+            returned_obj = fout.to_boto3()
+
+        boto3_body = returned_obj.get()['Body'].read()
+        self.assertEqual(contents, boto3_body)
+
 
 class ClampTest(unittest.TestCase):
     def test(self):
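The content-type example in the howto only exercises the first bullet of the use-case list. As a rough usage sketch (not part of the diff above; the bucket and key names are placeholders), the remaining use cases, such as reading object metadata or copying the object, go through the ordinary boto3 object API once `to_boto3` has handed back the `boto3.s3.Object`:

```python
from smart_open import open

# 'my-bucket' and 'my-key.txt' are placeholder names for illustration only.
with open('s3://my-bucket/my-key.txt') as fin:
    fin.readline()
    obj = fin.to_boto3()  # boto3.s3.Object pointing at the same bucket/key

# Object properties via the standard boto3 API.
print(obj.content_type)   # e.g. 'text/plain'
print(obj.last_modified)  # timestamp of the last change

# Copy the object to another location, again via boto3.
obj.Bucket().Object('my-key-copy.txt').copy_from(
    CopySource={'Bucket': obj.bucket_name, 'Key': obj.key},
)
```

As the docstrings added in the diff note, the returned object re-uses the session and resource parameters of the stream but is otherwise independent: changes made through it do not necessarily affect the open `smart_open` stream.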