diff --git a/CHANGES.rst b/CHANGES.rst index 148ff8fe01..9e7b133ee8 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -169,6 +169,10 @@ mast - Fix bug in ``Catalogs.query_criteria()`` to use ``page`` and ``pagesize`` parameters correctly. [#3065] +- Modify ``mast.Observations.get_cloud_uris`` to also accept query criteria and data product filters. [#3064] + +- Increased the speed of ``mast.Observations.get_cloud_uris`` by obtaining multiple + URIs from MAST at once. [#3064] 0.4.7 (2024-03-08) diff --git a/astroquery/mast/cloud.py b/astroquery/mast/cloud.py index 02dd4d6576..0f5cf63770 100644 --- a/astroquery/mast/cloud.py +++ b/astroquery/mast/cloud.py @@ -14,7 +14,7 @@ from astropy.utils.console import ProgressBarOrSpinner from astropy.utils.exceptions import AstropyDeprecationWarning -from ..exceptions import NoResultsWarning, InvalidQueryError +from ..exceptions import NoResultsWarning from . import utils @@ -109,32 +109,14 @@ def get_cloud_uri(self, data_product, include_bucket=True, full_url=False): found in the cloud, None is returned. """ - s3_client = self.boto3.client('s3', config=self.config) - - path = utils.mast_relative_path(data_product["dataURI"]) - if path is None: - raise InvalidQueryError("Malformed data uri {}".format(data_product['dataURI'])) + uri_list = self.get_cloud_uri_list(data_product, include_bucket=include_bucket, full_url=full_url) - if 'galex' in path: - path = path.lstrip("/mast/") - elif '/ps1/' in path: - path = path.replace("/ps1/", "panstarrs/ps1/public/") + # Making sure we got at least 1 URI from the query above. 
+ if not uri_list or uri_list[0] is None: + warnings.warn("Unable to locate file {}.".format(data_product), NoResultsWarning) else: - path = path.lstrip("/") - - try: - s3_client.head_object(Bucket=self.pubdata_bucket, Key=path) - if include_bucket: - path = "s3://{}/{}".format(self.pubdata_bucket, path) - elif full_url: - path = "http://s3.amazonaws.com/{}/{}".format(self.pubdata_bucket, path) - return path - except self.botocore.exceptions.ClientError as e: - if e.response['Error']['Code'] != "404": - raise - - warnings.warn("Unable to locate file {}.".format(data_product['productFilename']), NoResultsWarning) - return None + # Output from ``get_cloud_uri_list`` is always a list even when it's only 1 URI + return uri_list[0] def get_cloud_uri_list(self, data_products, include_bucket=True, full_url=False): """ @@ -158,8 +140,33 @@ def get_cloud_uri_list(self, data_products, include_bucket=True, full_url=False) List of URIs generated from the data products, list way contain entries that are None if data_products includes products not found in the cloud. 
""" + s3_client = self.boto3.client('s3', config=self.config) - return [self.get_cloud_uri(product, include_bucket, full_url) for product in data_products] + paths = utils.mast_relative_path(data_products["dataURI"]) + if isinstance(paths, str): # Handle the case where only one product was requested + paths = [paths] + + uri_list = [] + for path in paths: + if path is None: + uri_list.append(None) + else: + try: + # Use `head_object` to verify that the product is available on S3 (not all products are) + s3_client.head_object(Bucket=self.pubdata_bucket, Key=path) + if include_bucket: + s3_path = "s3://{}/{}".format(self.pubdata_bucket, path) + uri_list.append(s3_path) + elif full_url: + path = "http://s3.amazonaws.com/{}/{}".format(self.pubdata_bucket, path) + uri_list.append(path) + except self.botocore.exceptions.ClientError as e: + if e.response['Error']['Code'] != "404": + raise + warnings.warn("Unable to locate file {}.".format(path), NoResultsWarning) + uri_list.append(None) + + return uri_list def download_file(self, data_product, local_path, cache=True, verbose=True): """ diff --git a/astroquery/mast/observations.py b/astroquery/mast/observations.py index 80bb18e84f..963ff6095d 100644 --- a/astroquery/mast/observations.py +++ b/astroquery/mast/observations.py @@ -774,26 +774,56 @@ def download_products(self, products, *, download_dir=None, flat=False, return manifest - def get_cloud_uris(self, data_products, *, include_bucket=True, full_url=False): + def get_cloud_uris(self, data_products=None, *, include_bucket=True, full_url=False, pagesize=None, page=None, + mrp_only=False, extension=None, filter_products={}, **criteria): """ - Takes an `~astropy.table.Table` of data products and returns the associated cloud data uris. + Given an `~astropy.table.Table` of data products or query criteria and filter parameters, + returns the associated cloud data URIs. 
Parameters ---------- data_products : `~astropy.table.Table` - Table containing products to be converted into cloud data uris. + Table containing products to be converted into cloud data uris. If provided, this will supersede + pagesize, page, or any keyword arguments passed in as criteria. include_bucket : bool - Default True. When false returns the path of the file relative to the + Default True. When False, returns the path of the file relative to the top level cloud storage location. Must be set to False when using the full_url argument. full_url : bool Default False. Return an HTTP fetchable url instead of a cloud uri. Must set include_bucket to False to use this option. + pagesize : int, optional + Default None. Can be used to override the default pagesize when making a query. + E.g. when using a slow internet connection. Query criteria must also be provided. + page : int, optional + Default None. Can be used to override the default behavior of all results being returned for a query + to obtain one specific page of results. Query criteria must also be provided. + mrp_only : bool, optional + Default False. When set to True, only "Minimum Recommended Products" will be returned. + extension : string or array, optional + Default None. Option to filter by file extension. + filter_products : dict, optional + Filters to be applied to data products. Valid filters are all products fields listed + `here `__. + The column name as a string is the key. The corresponding value is one + or more acceptable values for that parameter. + Filter behavior is AND between the filters and OR within a filter set. + For example: {"productType": "SCIENCE", "extension": ["fits","jpg"]} + **criteria + Criteria to apply. At least one non-positional criterion must be supplied. + Valid criteria are coordinates, objectname, radius (as in `query_region` and `query_object`), + and all observation fields returned by the ``get_metadata("observations")``. 
+ The Column Name is the keyword, with the argument being one or more acceptable values for that parameter, + except for fields with a float datatype where the argument should be in the form [minVal, maxVal]. + For non-float type criteria, wildcards may be used (both * and % are considered wildcards), however + only one wildcarded value can be processed per criterion. + RA and Dec must be given in decimal degrees, and datetimes in MJD. + For example: filters=["FUV","NUV"],proposal_pi="Ost*",t_max=[52264.4586,54452.8914] Returns ------- response : list - List of URIs generated from the data products, list way contain entries that are None + List of URIs generated from the data products. May contain entries that are None if data_products includes products not found in the cloud. """ @@ -802,6 +832,29 @@ def get_cloud_uris(self, data_products, *, include_bucket=True, full_url=False): 'Please enable anonymous cloud access by calling `enable_cloud_dataset` method. ' 'Refer to `~astroquery.mast.ObservationsClass.enable_cloud_dataset` documentation for more info.') + if data_products is None: + if not criteria: + raise InvalidQueryError( + 'Please provide either a `~astropy.table.Table` of data products or query criteria.' 
+ ) + else: + # Get table of observations based on query criteria + obs = self.query_criteria(pagesize=pagesize, page=page, **criteria) + + if not len(obs): + # Warning raised by ~astroquery.mast.ObservationsClass.query_criteria + return + + # Return list of associated data products + data_products = self.get_product_list(obs) + + # Filter product list + data_products = self.filter_products(data_products, mrp_only=mrp_only, extension=extension, **filter_products) + + if not len(data_products): + warnings.warn("No matching products to fetch associated cloud URIs.", NoResultsWarning) + return + # Remove duplicate products data_products = self._remove_duplicate_products(data_products) diff --git a/astroquery/mast/tests/test_mast_remote.py b/astroquery/mast/tests/test_mast_remote.py index 3a7a993d24..23c5224358 100644 --- a/astroquery/mast/tests/test_mast_remote.py +++ b/astroquery/mast/tests/test_mast_remote.py @@ -526,12 +526,13 @@ def test_get_cloud_uri(self, test_data_uri, expected_cloud_uri): assert len(uri) > 0, f'Product for dataURI {test_data_uri} was not found in the cloud.' assert uri == expected_cloud_uri, f'Cloud URI does not match expected. ({uri} != {expected_cloud_uri})' - def test_get_cloud_uris(self): + @pytest.mark.parametrize("test_obs_id", ["25568122", "31411"]) + def test_get_cloud_uris(self, test_obs_id): pytest.importorskip("boto3") - test_obs_id = '25568122' # get a product list - products = Observations.get_product_list(test_obs_id)[24:] + index = 24 if test_obs_id == '25568122' else 0 + products = Observations.get_product_list(test_obs_id)[index:] assert len(products) > 0, (f'No products found for OBSID {test_obs_id}. ' 'Unable to move forward with getting URIs from the cloud.') @@ -544,6 +545,36 @@ def test_get_cloud_uris(self): assert len(uris) > 0, f'Products for OBSID {test_obs_id} were not found in the cloud.' 
+ # check for warning if no data products match filters + with pytest.warns(NoResultsWarning): + Observations.get_cloud_uris(products, + extension='png') + + def test_get_cloud_uris_query(self): + pytest.importorskip("boto3") + + # enable access to public AWS S3 bucket + Observations.enable_cloud_dataset() + + # get uris with other functions + obs = Observations.query_criteria(target_name=234295610) + prod = Observations.get_product_list(obs) + filt = Observations.filter_products(prod, calib_level=[2]) + s3_uris = Observations.get_cloud_uris(filt) + + # get uris with streamlined function + uris = Observations.get_cloud_uris(target_name=234295610, + filter_products={'calib_level': [2]}) + assert s3_uris == uris + + # check that InvalidQueryError is thrown if neither data_products or **criteria are defined + with pytest.raises(InvalidQueryError): + Observations.get_cloud_uris(filter_products={'calib_level': [2]}) + + # check for warning if query returns no observations + with pytest.warns(NoResultsWarning): + Observations.get_cloud_uris(target_name=234295611) + ###################### # CatalogClass tests # ###################### diff --git a/astroquery/mast/utils.py b/astroquery/mast/utils.py index c8a76ea1b6..c8bb384ffe 100644 --- a/astroquery/mast/utils.py +++ b/astroquery/mast/utils.py @@ -158,22 +158,54 @@ def parse_input_location(coordinates=None, objectname=None): def mast_relative_path(mast_uri): """ - Given a MAST dataURI, return the associated relative path. + Given one or more MAST dataURI(s), return the associated relative path(s). Parameters ---------- - mast_uri : str - The MAST uri. + mast_uri : str, list of str + The MAST uri(s). Returns ------- - response : str - The associated relative path. + response : str, list of str + The associated relative path(s). 
""" - - response = _simple_request("https://mast.stsci.edu/api/v0.1/path_lookup/", - {"uri": mast_uri}) - result = response.json() - uri_result = result.get(mast_uri) - - return uri_result["path"] + if isinstance(mast_uri, str): + uri_list = [("uri", mast_uri)] + else: # mast_uri parameter is a list + uri_list = [("uri", uri) for uri in mast_uri] + + # Split the list into chunks of 50 URIs; this is necessary + # to avoid "414 Client Error: Request-URI Too Large". + uri_list_chunks = list(_split_list_into_chunks(uri_list, chunk_size=50)) + + result = [] + for chunk in uri_list_chunks: + response = _simple_request("https://mast.stsci.edu/api/v0.1/path_lookup/", + {"uri": chunk}) + json_response = response.json() + + for uri in chunk: + # Chunk is a list of tuples where the tuple is + # ("uri", "/path/to/product") + # so we index for path (index=1) + path = json_response.get(uri[1])["path"] + if 'galex' in path: + path = path.lstrip("/mast/") + elif '/ps1/' in path: + path = path.replace("/ps1/", "panstarrs/ps1/public/") + else: + path = path.lstrip("/") + result.append(path) + + # If the input was a single URI string, we return a single string + if isinstance(mast_uri, str): + return result[0] + # Else, return a list of paths + return result + + +def _split_list_into_chunks(input_list, chunk_size): + """Helper function for `mast_relative_path`.""" + for idx in range(0, len(input_list), chunk_size): + yield input_list[idx:idx + chunk_size] diff --git a/docs/mast/mast_catalog.rst b/docs/mast/mast_catalog.rst index 9f947fefab..c8c7177804 100644 --- a/docs/mast/mast_catalog.rst +++ b/docs/mast/mast_catalog.rst @@ -24,12 +24,13 @@ The returned fields vary by catalog, find the field documentation for specific c `here `__. If no catalog is specified, the Hubble Source Catalog will be queried. + .. doctest-remote-data:: >>> from astroquery.mast import Catalogs ... 
>>> catalog_data = Catalogs.query_object("158.47924 -7.30962", catalog="Galex") - >>> print(catalog_data[:10]) + >>> print(catalog_data[:10]) # doctest: +IGNORE_OUTPUT distance_arcmin objID survey ... fuv_flux_aper_7 fuv_artifact ------------------ ------------------- ------ ... --------------- ------------ 0.3493802506329695 6382034098673685038 AIS ... 0.047751952 0 @@ -261,19 +262,17 @@ Given an HSC Match ID, return all catalog results. >>> catalog_data = Catalogs.query_object("M10", radius=.02, catalog="HSC") >>> matchid = catalog_data[0]["MatchID"] >>> print(matchid) - 63980492 + 7542452 >>> matches = Catalogs.query_hsc_matchid(matchid) >>> print(matches) - CatID MatchID ... cd_matrix - --------- -------- ... ------------------------------------------------------ - 257195287 63980492 ... -1.38889e-005 -5.26157e-010 -5.26157e-010 1.38889e-005 - 257440119 63980492 ... -1.38889e-005 -5.26157e-010 -5.26157e-010 1.38889e-005 - 428373428 63980492 ... -1.10056e-005 5.65193e-010 5.65193e-010 1.10056e-005 - 428373427 63980492 ... -1.10056e-005 5.65193e-010 5.65193e-010 1.10056e-005 - 428373429 63980492 ... -1.10056e-005 5.65193e-010 5.65193e-010 1.10056e-005 - 410574499 63980492 ... -1.10056e-005 1.56577e-009 1.56577e-009 1.10056e-005 - 410574498 63980492 ... -1.10056e-005 1.56577e-009 1.56577e-009 1.10056e-005 - 410574497 63980492 ... -1.10056e-005 1.56577e-009 1.56577e-009 1.10056e-005 + CatID MatchID ... cd_matrix + --------- ------- ... ------------------------------------------------------ + 419094794 7542452 ... -1.10056e-005 5.65193e-010 5.65193e-010 1.10056e-005 + 419094795 7542452 ... -1.10056e-005 5.65193e-010 5.65193e-010 1.10056e-005 + 401289578 7542452 ... -1.10056e-005 1.56577e-009 1.56577e-009 1.10056e-005 + 401289577 7542452 ... -1.10056e-005 1.56577e-009 1.56577e-009 1.10056e-005 + 257194049 7542452 ... -1.38889e-005 -5.26157e-010 -5.26157e-010 1.38889e-005 + 257438887 7542452 ... 
-1.38889e-005 -5.26157e-010 -5.26157e-010 1.38889e-005 HSC spectra accessed through this class as well. `~astroquery.mast.CatalogsClass.get_hsc_spectra` diff --git a/docs/mast/mast_obsquery.rst b/docs/mast/mast_obsquery.rst index 55511ff66c..c3f3f34e12 100644 --- a/docs/mast/mast_obsquery.rst +++ b/docs/mast/mast_obsquery.rst @@ -286,7 +286,7 @@ Using "obs_id" instead of "obsid" from the previous example will result in the f .. doctest-remote-data:: >>> obs_ids = obs_table[0:2]['obs_id'] - >>> data_products_by_id = Observations.get_product_list(obs_ids) + >>> data_products_by_id = Observations.get_product_list(obs_ids) # doctest: +IGNORE_OUTPUT Traceback (most recent call last): ... RemoteServiceError: Error converting data type varchar to bigint. @@ -325,12 +325,12 @@ Product filtering can also be applied directly to a table of products without pr ... >>> data_products = Observations.get_product_list('25588063') >>> print(len(data_products)) - 27 + 30 >>> products = Observations.filter_products(data_products, ... productType=["SCIENCE", "PREVIEW"], ... extension="fits") >>> print(len(products)) - 8 + 10 Downloading Data Products @@ -427,6 +427,9 @@ MAST until it is disabled with `~astroquery.mast.ObservationsClass.disable_cloud To directly access a list of cloud URIs for a given dataset, use the `~astroquery.mast.ObservationsClass.get_cloud_uris` function (Python will prompt you to enable cloud access if you haven't already). +With this function, users may specify a `~astropy.table.Table` of data products or +query criteria. Query criteria are supplied as keyword arguments, and product filters +may be supplied through the ``mrp_only``, ``extension``, and ``filter_products`` parameters. When cloud access is enabled, the standard download function `~astroquery.mast.ObservationsClass.download_products` preferentially pulls files from AWS when they @@ -434,7 +437,7 @@ are available. 
When set to `True`, the ``cloud_only`` parameter in `~astroquery.mast.ObservationsClass.download_products` skips all data products not available in the cloud. -Getting a list of S3 URIs: +To get a list of S3 URIs, use the following workflow: .. doctest-skip:: @@ -456,10 +459,32 @@ Getting a list of S3 URIs: ... productSubGroupDescription='DRZ') >>> s3_uris = Observations.get_cloud_uris(filtered) >>> print(s3_uris) - ['s3://stpubdata/hst/public/jbev/jbeveo010/jbeveo010_drz.fits', 's3://stpubdata/hst/public/jbev/jbeveo010/jbeveo010_drz.fits', 's3://stpubdata/hst/public/jbev/jbevet010/jbevet010_drz.fits', 's3://stpubdata/hst/public/jbev/jbevet010/jbevet010_drz.fits'] + ['s3://stpubdata/hst/public/jbev/jbeveo010/jbeveo010_drz.fits', 's3://stpubdata/hst/public/jbev/jbevet010/jbevet010_drz.fits'] ... >>> Observations.disable_cloud_dataset() +Alternatively, this workflow can be streamlined by providing the query criteria directly to `~astroquery.mast.ObservationsClass.get_cloud_uris`. +This approach is recommended for code brevity. Query criteria are supplied as keyword arguments, and filters are supplied through the +``filter_products`` parameter. If both ``data_products`` and query criteria are provided, ``data_products`` takes precedence. + +.. doctest-remote-data:: + + >>> import os + >>> from astroquery.mast import Observations + ... + >>> Observations.enable_cloud_dataset(provider='AWS') + INFO: Using the S3 STScI public dataset [astroquery.mast.cloud] + >>> # Getting the cloud URIs + >>> s3_uris = Observations.get_cloud_uris(obs_collection='HST', + ... filters='F606W', + ... instrument_name='ACS/WFC', + ... proposal_id=['12062'], + ... dataRights='PUBLIC', + ... filter_products={'productSubGroupDescription': 'DRZ'}) + INFO: 2 of 4 products were duplicates. Only downloading 2 unique product(s). 
[astroquery.mast.observations] + >>> print(s3_uris) + ['s3://stpubdata/hst/public/jbev/jbeveo010/jbeveo010_drz.fits', 's3://stpubdata/hst/public/jbev/jbevet010/jbevet010_drz.fits'] + >>> Observations.disable_cloud_dataset() Downloading data products from S3: