diff --git a/google/cloud/storage/transfer_manager.py b/google/cloud/storage/transfer_manager.py index fec5965cf..25abfacae 100644 --- a/google/cloud/storage/transfer_manager.py +++ b/google/cloud/storage/transfer_manager.py @@ -273,6 +273,8 @@ def download_many( raise_exception=False, worker_type=PROCESS, max_workers=DEFAULT_MAX_WORKERS, + *, + skip_if_exists=False, ): """Download many blobs concurrently via a worker pool. @@ -348,6 +350,11 @@ def download_many( and the default is a conservative number that should work okay in most cases without consuming excessive resources. + :type skip_if_exists: bool + :param skip_if_exists: + Before downloading each blob, check if the file for the filename exists; + if it does, skip that blob. This only applies to filenames, not file objects. + :raises: :exc:`concurrent.futures.TimeoutError` if deadline is exceeded. :rtype: list @@ -374,6 +381,10 @@ def download_many( "Passing in a file object is only supported by the THREAD worker type. Please either select THREAD workers, or pass in filenames only." ) + if skip_if_exists and isinstance(path_or_file, str): + if os.path.isfile(path_or_file): + continue + futures.append( executor.submit( _call_method_on_maybe_pickled_blob, @@ -589,6 +600,8 @@ def download_many_to_path( raise_exception=False, worker_type=PROCESS, max_workers=DEFAULT_MAX_WORKERS, + *, + skip_if_exists=False, ): """Download many files concurrently by their blob names. @@ -715,6 +728,11 @@ def download_many_to_path( and the default is a conservative number that should work okay in most cases without consuming excessive resources. + :type skip_if_exists: bool + :param skip_if_exists: + Before downloading each blob, check if the file for the filename exists; + if it does, skip that blob. This only works for filenames. + :raises: :exc:`concurrent.futures.TimeoutError` if deadline is exceeded. 
:rtype: list @@ -740,6 +758,7 @@ def download_many_to_path( raise_exception=raise_exception, worker_type=worker_type, max_workers=max_workers, + skip_if_exists=skip_if_exists, ) diff --git a/tests/unit/test_transfer_manager.py b/tests/unit/test_transfer_manager.py index 503b8fd2e..732f09a75 100644 --- a/tests/unit/test_transfer_manager.py +++ b/tests/unit/test_transfer_manager.py @@ -273,6 +273,32 @@ def test_download_many_with_filenames(): assert result == FAKE_RESULT +def test_download_many_with_skip_if_exists(): + with tempfile.NamedTemporaryFile() as tf: + BLOB_FILE_PAIRS = [ + (mock.Mock(spec=Blob), "file_a.txt"), + (mock.Mock(spec=Blob), tf.name), + ] + + for blob_mock, _ in BLOB_FILE_PAIRS: + blob_mock._handle_filename_and_download.return_value = FAKE_RESULT + + results = transfer_manager.download_many( + BLOB_FILE_PAIRS, + download_kwargs=DOWNLOAD_KWARGS, + worker_type=transfer_manager.THREAD, + skip_if_exists=True, + ) + mock_blob, file = BLOB_FILE_PAIRS[0] + mock_blob._handle_filename_and_download.assert_any_call( + file, **EXPECTED_DOWNLOAD_KWARGS + ) + mock_blob, _ = BLOB_FILE_PAIRS[1] + mock_blob._handle_filename_and_download.assert_not_called() + for result in results: + assert result == FAKE_RESULT + + def test_download_many_with_file_objs(): BLOB_FILE_PAIRS = [ (mock.Mock(spec=Blob), tempfile.TemporaryFile()), @@ -485,6 +511,7 @@ def test_download_many_to_path(): raise_exception=True, max_workers=MAX_WORKERS, worker_type=WORKER_TYPE, + skip_if_exists=True, ) mock_download_many.assert_called_once_with( @@ -494,6 +521,7 @@ def test_download_many_to_path(): raise_exception=True, max_workers=MAX_WORKERS, worker_type=WORKER_TYPE, + skip_if_exists=True, ) for blobname in BLOBNAMES: bucket.blob.assert_any_call(BLOB_NAME_PREFIX + blobname) @@ -532,6 +560,7 @@ def test_download_many_to_path_creates_directories(): raise_exception=True, worker_type=transfer_manager.PROCESS, max_workers=8, + skip_if_exists=False, ) for blobname in BLOBNAMES: 
bucket.blob.assert_any_call(blobname)