Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ALMA: add option to just validate data #2263

Merged
merged 7 commits into from
Mar 18, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,11 @@ esa.hubble

Service fixes and enhancements
------------------------------
alma
^^^^

- Added ``verify_only`` option to check whether previously downloaded data files have the expected file size, without downloading them again. [#2263]

esa.hubble
^^^^^^^^^^

Expand Down
43 changes: 34 additions & 9 deletions astroquery/alma/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
_gen_science_sql, _gen_spec_res_sql, ALMA_DATE_FORMAT
from . import conf, auth_urls
from astroquery.utils.commons import ASTROPY_LT_4_1
from astroquery.exceptions import CorruptDataWarning

__all__ = {'AlmaClass', 'ALMA_BANDS'}

Expand Down Expand Up @@ -685,7 +686,8 @@ def _HEADER_data_size(self, files):
return data_sizes, totalsize.to(u.GB)

def download_files(self, files, savedir=None, cache=True,
continuation=True, skip_unauthorized=True,):
continuation=True, skip_unauthorized=True,
verify_only=False):
"""
Given a list of file URLs, download them

Expand All @@ -706,6 +708,10 @@ def download_files(self, files, savedir=None, cache=True,
If you receive "unauthorized" responses for some of the download
requests, skip over them. If this is False, an exception will be
raised.
verify_only : bool
Option to go through the process of checking the files to see if
they're the right size, but not actually download them. This
option may be useful if a previous download run failed partway.
"""

if self.USERNAME:
Expand Down Expand Up @@ -743,15 +749,34 @@ def download_files(self, files, savedir=None, cache=True,
filename = os.path.join(savedir,
filename)

if verify_only:
existing_file_length = os.stat(filename).st_size
if 'content-length' in check_filename.headers:
length = int(check_filename.headers['content-length'])
if length == 0:
warnings.warn('URL {0} has length=0'.format(url))
elif existing_file_length == length:
log.info(f"Found cached file {filename} with expected size {existing_file_length}.")
elif existing_file_length < length:
log.info(f"Found cached file {filename} with size {existing_file_length} < expected "
f"size {length}. The download should be continued.")
elif existing_file_length > length:
warnings.warn(f"Found cached file {filename} with size {existing_file_length} > expected "
f"size {length}. The download is likely corrupted.",
CorruptDataWarning)
else:
warnings.warn(f"Could not verify {url} because it has no 'content-length'")

try:
self._download_file(file_link,
filename,
timeout=self.TIMEOUT,
auth=auth,
cache=cache,
method='GET',
head_safe=False,
continuation=continuation)
if not verify_only:
self._download_file(file_link,
filename,
timeout=self.TIMEOUT,
auth=auth,
cache=cache,
method='GET',
head_safe=False,
continuation=continuation)

downloaded_files.append(filename)
except requests.HTTPError as ex:
Expand Down
54 changes: 54 additions & 0 deletions astroquery/alma/tests/test_alma_remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import shutil
import numpy as np
import pytest
import warnings
from datetime import datetime
import os
from urllib.parse import urlparse
Expand All @@ -12,6 +13,7 @@
from astropy import coordinates
from astropy import units as u

from astroquery.exceptions import CorruptDataWarning
from astroquery.utils.commons import ASTROPY_LT_4_1
from .. import Alma

Expand Down Expand Up @@ -655,3 +657,55 @@ def test_big_download_regression(alma):
def test_download_html_file(alma):
    """Downloading a small HTML QA2 report must yield a non-empty result."""
    html_url = 'https://almascience.nao.ac.jp/dataPortal/member.uid___A001_X1284_X1353.qa2_report.html'
    downloaded = alma.download_files([html_url])
    assert downloaded


@pytest.mark.remote_data
def test_verify_html_file(alma, caplog):
    """
    Check the ``verify_only`` mode of ``download_files`` on a small HTML file.

    The file is downloaded once and then verified three times: untouched
    (the "expected size" log message is required), after appending bytes
    (a ``CorruptDataWarning`` is required) and after overwriting it with a
    short payload (the "should be continued" log message is required).
    """
    # Local import: only needed to escape the warning text for ``pytest.warns``.
    import re

    url = 'https://almascience.nao.ac.jp/dataPortal/member.uid___A001_X1284_X1353.qa2_report.html'
    basename = 'member.uid___A001_X1284_X1353.qa2_report.html'
    # size in bytes this test expects for the remote file
    length = 66336

    # first, make sure the file is not cached (in case this test gets called repeatedly)
    # (we are hacking the file later in this test to trigger different failure modes so
    # we need it fresh)
    try:
        result = alma.download_files([url], verify_only=True)
        os.remove(result[0])
    except FileNotFoundError:
        pass

    caplog.clear()

    # download the file
    result = alma.download_files([url])
    assert basename in result[0]
    local_filepath = result[0]

    try:
        # untouched file: size must match
        result = alma.download_files([url], verify_only=True)
        assert basename in result[0]
        local_filepath = result[0]
        existing_file_length = length
        assert f"Found cached file {local_filepath} with expected size {existing_file_length}." in caplog.text

        # manipulate the file: make it too large
        with open(local_filepath, 'ab') as fh:
            fh.write(b"Extra Text")

        caplog.clear()
        existing_file_length = length + 10
        # ``match`` is interpreted as a regular expression, so the literal
        # message (which embeds a file path) has to be escaped.
        expected_message = re.escape(
            f"Found cached file {local_filepath} with size {existing_file_length} > expected "
            f"size {length}. The download is likely corrupted.")
        with pytest.warns(expected_warning=CorruptDataWarning, match=expected_message):
            result = alma.download_files([url], verify_only=True)
        assert basename in result[0]

        # manipulate the file: make it small
        with open(local_filepath, 'wb') as fh:
            fh.write(b"Empty Text")

        caplog.clear()
        result = alma.download_files([url], verify_only=True)
        assert basename in result[0]
        existing_file_length = 10
        assert (f"Found cached file {local_filepath} with size {existing_file_length} < expected "
                f"size {length}. The download should be continued.") in caplog.text
    finally:
        # cleanup: we don't want `test_download_html_file` to fail if this test is re-run,
        # and the tampered file must not survive a failed assertion above
        if os.path.exists(local_filepath):
            os.remove(local_filepath)
10 changes: 9 additions & 1 deletion astroquery/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
__all__ = ['TimeoutError', 'InvalidQueryError', 'RemoteServiceError',
'TableParseError', 'LoginError', 'ResolverError',
'NoResultsWarning', 'LargeQueryWarning', 'InputWarning',
'AuthenticationWarning', 'MaxResultsWarning']
'AuthenticationWarning', 'MaxResultsWarning', 'CorruptDataWarning']


class TimeoutError(Exception):
Expand Down Expand Up @@ -98,6 +98,14 @@ class MaxResultsWarning(AstropyWarning):
pass


class CorruptDataWarning(AstropyWarning):
    """
    Astroquery warning category used to signal that (partially) downloaded
    data show signs of being corrupt.
    """


class EmptyResponseError(ValueError):
"""
Astroquery error class to be raised when the query returns an empty result
Expand Down
11 changes: 11 additions & 0 deletions docs/alma/alma.rst
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,17 @@ You can also do the downloading all in one step:

>>> myAlma.retrieve_data_from_uid(uids[0])

If you have huge files, the transfer sometimes fails and you will need to
restart the download. By default, the module will resume downloading where the
failure occurred. Before triggering a new download, you can check whether the
previous downloads all succeeded by using the ``verify_only`` keyword. It will
not download anything; instead it compares the size of each file already on
disk with the size reported by the server and reports the state of your
downloads through log messages and warnings:

.. code-block:: python

>>> myAlma.download_files(link_list, cache=True, verify_only=True)


Downloading FITS data
=====================

Expand Down