Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat: Add "auto" checksum option and make default #1383

Merged
merged 5 commits into from
Nov 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,17 @@ setup.py file. Applications which do not import directly from
`google-resumable-media` can safely disregard this dependency. This backwards
compatibility feature will be removed in a future major version update.

Checksum Defaults
~~~~~~~~~~~~~~~~~

In Python Storage 3.0, uploads and downloads now default to the "auto" checksum
setting where applicable. "Auto" uses crc32c checksums, except in the unusual
case where the fast (C extension) crc32c implementation is not available, in
which case it uses md5 instead. Before Python Storage 3.0, the default was md5
for most downloads and None for most uploads. Note that ranged downloads
("start" or "end" set) still do not support any checksumming, and some features
in `transfer_manager.py` still support crc32c only.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for adding this! This will make 3.0 release notes really clear

Miscellaneous
~~~~~~~~~~~~~

Expand Down
16 changes: 14 additions & 2 deletions google/cloud/storage/_media/_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,16 +130,28 @@ class Download(DownloadBase):
appropriate checksum (for instance in the case of transcoded or
ranged downloads where the remote service does not know the
correct checksum) an INFO-level log will be emitted. Supported
values are "md5", "crc32c" and None.
values are "md5", "crc32c", "auto" and None. The default is "auto",
which will try to detect if the C extension for crc32c is installed
and fall back to md5 otherwise.
"""

def __init__(
self, media_url, stream=None, start=None, end=None, headers=None, checksum="md5"
self,
media_url,
stream=None,
start=None,
end=None,
headers=None,
checksum="auto",
):
super(Download, self).__init__(
media_url, stream=stream, start=start, end=end, headers=headers
)
self.checksum = checksum
if self.checksum == "auto":
self.checksum = (
"crc32c" if _helpers._is_crc32c_available_and_fast() else "md5"
)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like how this is handled in the constructor 🎉

self._bytes_downloaded = 0
self._expected_checksum = None
self._checksum_object = None
Expand Down
65 changes: 22 additions & 43 deletions google/cloud/storage/_media/_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
import hashlib
import logging
import random
import warnings

from urllib.parse import parse_qs
from urllib.parse import urlencode
Expand Down Expand Up @@ -142,43 +141,6 @@ def calculate_retry_wait(base_wait, max_sleep, multiplier=2.0):
return new_base_wait, new_base_wait + 0.001 * jitter_ms


def _get_crc32c_object():
    """Build a CRC32C checksum object from whichever backend is importable.

    ``google_crc32c`` is tried first. If importing it fails, ``crcmod`` is
    used instead, and ``_is_fast_crcmod()`` is invoked so that a warning is
    emitted when crcmod is not using its extension.

    Returns:
        A ``google_crc32c.Checksum`` instance, or a
        ``crcmod.predefined.Crc("crc-32c")`` instance when falling back.

    Raises:
        ImportError: If neither ``google_crc32c`` nor ``crcmod`` can be
            imported.
    """
    try:
        import google_crc32c  # type: ignore

        return google_crc32c.Checksum()
    except ImportError:
        try:
            import crcmod  # type: ignore

            checksum = crcmod.predefined.Crc("crc-32c")
            # Called only for its side effect: it warns when crcmod is
            # running without its extension. The return value is ignored.
            _is_fast_crcmod()
            return checksum
        except ImportError:
            raise ImportError("Failed to import either `google-crc32c` or `crcmod`")


def _is_fast_crcmod():
    """Report whether ``crcmod`` is using its extension, warning if it is not.

    Imports ``crcmod.crcmod`` and reads its ``_usingExtension`` attribute,
    treating a missing attribute as ``False``.

    Returns:
        The value of ``crcmod.crcmod._usingExtension`` (``False`` when the
        attribute does not exist). When that value is falsy, a
        ``RuntimeWarning`` carrying ``_SLOW_CRC32C_WARNING`` is emitted
        before returning.
    """
    # A non-empty fromlist makes __import__ return the ``crcmod.crcmod``
    # submodule itself rather than the top-level ``crcmod`` package.
    crcmod_impl = __import__(
        "crcmod.crcmod",
        globals(),
        locals(),
        fromlist=["_usingExtension"],
        level=0,
    )
    uses_extension = getattr(crcmod_impl, "_usingExtension", False)
    if uses_extension:
        return uses_extension

    # stacklevel=2 attributes the warning to the caller of this helper.
    warnings.warn(_SLOW_CRC32C_WARNING, RuntimeWarning, stacklevel=2)
    return uses_extension


def _get_metadata_key(checksum_type):
if checksum_type == "md5":
return "md5Hash"
Expand Down Expand Up @@ -231,10 +193,7 @@ def _get_expected_checksum(response, get_headers, media_url, checksum_type):
_LOGGER.info(msg)
checksum_object = _DoNothingHash()
else:
if checksum_type == "md5":
checksum_object = hashlib.md5()
else:
checksum_object = _get_crc32c_object()
checksum_object = _get_checksum_object(checksum_type)
else:
expected_checksum = None
checksum_object = _DoNothingHash()
Expand Down Expand Up @@ -331,13 +290,33 @@ def _get_checksum_object(checksum_type):
if checksum_type == "md5":
return hashlib.md5()
elif checksum_type == "crc32c":
return _get_crc32c_object()
# In order to support platforms that don't have google_crc32c
# support, only perform the import on demand.
import google_crc32c

return google_crc32c.Checksum()
elif checksum_type is None:
return None
else:
raise ValueError("checksum must be ``'md5'``, ``'crc32c'`` or ``None``")


def _is_crc32c_available_and_fast():
    """Tell whether the C-accelerated ``google_crc32c`` build can be used.

    Returns:
        bool: True only when ``google_crc32c`` imports successfully and its
        ``implementation`` attribute equals ``"c"``. False when the import
        fails, when another implementation is reported, or when any other
        exception is raised while probing.
    """
    try:
        import google_crc32c

        has_c_extension = bool(google_crc32c.implementation == "c")
    except Exception:
        # Deliberately broad: this is a best-effort probe, so any failure
        # while importing or inspecting the package means "not available".
        return False
    return has_c_extension


def _parse_generation_header(response, get_headers):
"""Parses the generation header from an ``X-Goog-Generation`` value.

Expand Down
43 changes: 30 additions & 13 deletions google/cloud/storage/_media/_upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,19 +249,25 @@ class MultipartUpload(UploadBase):
upload_url (str): The URL where the content will be uploaded.
headers (Optional[Mapping[str, str]]): Extra headers that should
be sent with the request, e.g. headers for encrypted data.
checksum (Optional([str])): The type of checksum to compute to verify
checksum (Optional([str])): The type of checksum to compute to verify
the integrity of the object. The request metadata will be amended
to include the computed value. Using this option will override a
manually-set checksum value. Supported values are "md5", "crc32c"
and None. The default is None.
manually-set checksum value. Supported values are "md5",
"crc32c", "auto", and None. The default is "auto", which will try
to detect if the C extension for crc32c is installed and fall back
to md5 otherwise.

Attributes:
upload_url (str): The URL where the content will be uploaded.
"""

def __init__(self, upload_url, headers=None, checksum=None):
def __init__(self, upload_url, headers=None, checksum="auto"):
super(MultipartUpload, self).__init__(upload_url, headers=headers)
self._checksum_type = checksum
if self._checksum_type == "auto":
self._checksum_type = (
"crc32c" if _helpers._is_crc32c_available_and_fast() else "md5"
)

def _prepare_request(self, data, metadata, content_type):
"""Prepare the contents of an HTTP request.
Expand Down Expand Up @@ -355,13 +361,15 @@ class ResumableUpload(UploadBase):
chunk_size (int): The size of each chunk used to upload the resource.
headers (Optional[Mapping[str, str]]): Extra headers that should
be sent with every request.
checksum (Optional([str])): The type of checksum to compute to verify
checksum (Optional([str])): The type of checksum to compute to verify
the integrity of the object. After the upload is complete, the
server-computed checksum of the resulting object will be read
server-computed checksum of the resulting object will be checked
and google.cloud.storage.exceptions.DataCorruption will be raised on
a mismatch. The corrupted file will not be deleted from the remote
host automatically. Supported values are "md5", "crc32c" and None.
The default is None.
host automatically. Supported values are "md5", "crc32c", "auto",
and None. The default is "auto", which will try to detect if the C
extension for crc32c is installed and fall back to md5 otherwise.


Attributes:
upload_url (str): The URL where the content will be uploaded.
Expand All @@ -371,7 +379,7 @@ class ResumableUpload(UploadBase):
:data:`.UPLOAD_CHUNK_SIZE`.
"""

def __init__(self, upload_url, chunk_size, checksum=None, headers=None):
def __init__(self, upload_url, chunk_size, checksum="auto", headers=None):
super(ResumableUpload, self).__init__(upload_url, headers=headers)
if chunk_size % UPLOAD_CHUNK_SIZE != 0:
raise ValueError(
Expand All @@ -383,6 +391,10 @@ def __init__(self, upload_url, chunk_size, checksum=None, headers=None):
self._bytes_uploaded = 0
self._bytes_checksummed = 0
self._checksum_type = checksum
if self._checksum_type == "auto":
self._checksum_type = (
"crc32c" if _helpers._is_crc32c_available_and_fast() else "md5"
)
self._checksum_object = None
self._total_bytes = None
self._resumable_url = None
Expand Down Expand Up @@ -1185,9 +1197,10 @@ class XMLMPUPart(UploadBase):
be sent with every request.
checksum (Optional([str])): The type of checksum to compute to verify
the integrity of the object. The request headers will be amended
to include the computed value. Supported values are "md5", "crc32c"
and None. The default is None.

to include the computed value. Supported values are "md5", "crc32c",
"auto" and None. The default is "auto", which will try to detect if
the C extension for crc32c is installed and fall back to md5
otherwise.
Attributes:
upload_url (str): The URL of the object (without query parameters).
upload_id (str): The ID of the upload from the initialization response.
Expand All @@ -1208,7 +1221,7 @@ def __init__(
end,
part_number,
headers=None,
checksum=None,
checksum="auto",
):
super().__init__(upload_url, headers=headers)
self._filename = filename
Expand All @@ -1218,6 +1231,10 @@ def __init__(
self._part_number = part_number
self._etag = None
self._checksum_type = checksum
if self._checksum_type == "auto":
self._checksum_type = (
"crc32c" if _helpers._is_crc32c_available_and_fast() else "md5"
)
self._checksum_object = None

@property
Expand Down
8 changes: 6 additions & 2 deletions google/cloud/storage/_media/requests/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,9 @@ class Download(_request_helpers.RequestsMixin, _download.Download):
appropriate checksum (for instance in the case of transcoded or
ranged downloads where the remote service does not know the
correct checksum) an INFO-level log will be emitted. Supported
values are "md5", "crc32c" and None. The default is "md5".
values are "md5", "crc32c", "auto" and None. The default is "auto",
which will try to detect if the C extension for crc32c is installed
and fall back to md5 otherwise.

Attributes:
media_url (str): The URL containing the media to be downloaded.
Expand Down Expand Up @@ -263,7 +265,9 @@ class RawDownload(_request_helpers.RawRequestsMixin, _download.Download):
appropriate checksum (for instance in the case of transcoded or
ranged downloads where the remote service does not know the
correct checksum) an INFO-level log will be emitted. Supported
values are "md5", "crc32c" and None. The default is "md5".
values are "md5", "crc32c", "auto" and None. The default is "auto",
which will try to detect if the C extension for crc32c is installed
and fall back to md5 otherwise.
Attributes:
media_url (str): The URL containing the media to be downloaded.
start (Optional[int]): The first byte in a range to be downloaded.
Expand Down
9 changes: 6 additions & 3 deletions google/cloud/storage/_media/requests/upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,9 @@ class MultipartUpload(_request_helpers.RequestsMixin, _upload.MultipartUpload):
the integrity of the object. The request metadata will be amended
to include the computed value. Using this option will override a
manually-set checksum value. Supported values are "md5",
"crc32c" and None. The default is None.
"crc32c", "auto", and None. The default is "auto", which will try
to detect if the C extension for crc32c is installed and fall back
to md5 otherwise.

Attributes:
upload_url (str): The URL where the content will be uploaded.
Expand Down Expand Up @@ -334,8 +336,9 @@ class ResumableUpload(_request_helpers.RequestsMixin, _upload.ResumableUpload):
server-computed checksum of the resulting object will be checked
and google.cloud.storage.exceptions.DataCorruption will be raised on
a mismatch. The corrupted file will not be deleted from the remote
host automatically. Supported values are "md5", "crc32c" and None.
The default is None.
host automatically. Supported values are "md5", "crc32c", "auto",
and None. The default is "auto", which will try to detect if the C
extension for crc32c is installed and fall back to md5 otherwise.

Attributes:
upload_url (str): The URL where the content will be uploaded.
Expand Down
Loading