diff --git a/CHANGELOG.md b/CHANGELOG.md index 37d54f7..e14e45d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,13 @@ # Changelog +## [1.2.8] - 2023/10/04 + +### Added +- Downloads API outputs a missing_files.[datetime].json file that details the specific files that didn't successfully download [JEPooley] [Amber Thorne] + +### Fixed +- Downloads API now shows correct number of downloaded files e.g. when some have failed or are missing [JEPooley] [Amber Thorne] + ## [1.2.7] - 2023/06/28 ### Fixed @@ -7,10 +15,9 @@ ## [1.2.6] - 2023/06/28 -### Features +### Added - Added check for chunk size when streaming data. Program should error if file download is incomplete [JEPooley] - ### Changed - Upgrade requests version in dependencies to 2.31.0 [gwionap] @@ -20,7 +27,7 @@ - Import error for osdatahub.post in PlacesAPI [FHunt-OS] [JEPooley] -### Features +### Added - Added support for the dataset parameter within the PlacesAPI wrapper [FHunt-OS] [JEPooley] @@ -45,7 +52,7 @@ - Extents `from_radius` method no longer errors if the coordinate is not a tuple - now accepts any Iterable (issue 66) [JEPooley] - Updated setup.cfg and tox.ini to reflect python3.11 compatability (issue 68) [JEPooley] [dchirst] -### Features +### Added - Added proxy support using the `osdatahub.set_proxies` method (issue 55) [JEPooley] [FHunt-OS] [abiddiscombe] @@ -57,7 +64,7 @@ ## [1.2.0] - 2022/11/07 -### Features +### Added - Added NGD API [dchirst] [BenDickens] [JEPooley] - Fixed typos in Features and Places APIs [dchirst] @@ -65,7 +72,7 @@ ## [1.1.0] - 2022/08/22 -### Features +### Added - Support the new Data Hub API v2 [dchirst] - Allow filters to be joined using bitwise operators [E-Paine] @@ -73,7 +80,7 @@ - Allow any type of collection to be used to construct a bounding box (issue 22) [E-Paine] - Warn when using EPSG:4329 with the features API (issue 29) [E-Paine] -### Bugs +### Fixed - Error when `nearest` returned an empty feature set (issue 24) [E-Paine] diff --git a/setup.cfg 
b/setup.cfg index 6002da8..8193335 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = osdatahub -version = 1.2.7 +version = 1.2.8 author = OS Rapid Prototyping author_email = rapidprototyping@os.uk classifiers = diff --git a/src/osdatahub/DownloadsAPI/downloads_api.py b/src/osdatahub/DownloadsAPI/downloads_api.py index e203d16..613175d 100644 --- a/src/osdatahub/DownloadsAPI/downloads_api.py +++ b/src/osdatahub/DownloadsAPI/downloads_api.py @@ -1,17 +1,23 @@ import functools +import json import logging import os +import time from abc import ABC, abstractmethod from concurrent.futures import ThreadPoolExecutor, as_completed +from datetime import datetime +from http import HTTPStatus from multiprocessing import cpu_count from pathlib import Path -from typing import Union +from typing import List, Union -import osdatahub import requests +from requests.exceptions import HTTPError from tqdm import tqdm +import osdatahub +retries = 3 class _DownloadObj: @@ -43,30 +49,73 @@ def download(self, output_dir: Union[str, Path], overwrite: bool = False, pbar: f"Skipping download...") return output_path - response = requests.get(self.url, stream=True, proxies=osdatahub.get_proxies()) - response.raise_for_status() - expected_size = int(response.headers.get('content-length')) - current_size = 0 - chunk_size = 1048576 # 1024 ** 2 -> 1MB - if response.status_code == 200: - with open(output_path, 'wb') as f: - if not pbar: - pbar = tqdm(total=expected_size, desc=self.file_name, unit="B", unit_scale=True, leave=True) - for chunk in response.iter_content(chunk_size=chunk_size): - current_size += len(chunk) - f.write(chunk) - f.flush() - pbar.update(chunk_size) - - if expected_size != current_size: - deficit = expected_size - current_size - raise IOError( - f'incomplete read ({current_size} bytes read, {deficit} more expected)' - ) - pbar.write(f"Finished downloading {self.file_name} to {output_path}") + for _ in range(retries): + try: + response = requests.get( + 
self.url, stream=True, proxies=osdatahub.get_proxies()) + response.raise_for_status() + expected_size = int(response.headers.get('content-length')) + current_size = 0 + chunk_size = 1048576 # 1024 ** 2 -> 1MB + if response.status_code == 200: + with open(output_path, 'wb') as f: + if not pbar: + pbar = tqdm( + total=expected_size, desc=self.file_name, unit="B", unit_scale=True, leave=True) + for chunk in response.iter_content(chunk_size=chunk_size): + current_size += len(chunk) + f.write(chunk) + f.flush() + pbar.update(chunk_size) + if expected_size != current_size: + deficit = expected_size - current_size + raise IOError( + f'incomplete read ({current_size} bytes read, {deficit} more expected)' + ) + pbar.write( + f"Finished downloading {self.file_name} to {output_path}") + break + + except HTTPError as exc: + if int(exc.response.status_code) == 429: + time.sleep(1) + continue + raise + return output_path +def remove_key(url: str): + """Remove the 'key' query parameter from url, preserving the other sections + """ + return "&".join([section for section in url.split("&") if "key" not in section]) + + +def format_missing_files(missing_files: List[_DownloadObj]) -> dict: + """Convert download objects to dictionaries and sanitise + """ + file_info = [] + for _download_obj in missing_files: + info = dict(_download_obj.__dict__) + info['url'] = remove_key(info['url']) + file_info.append(info) + return { + "missing_file_count": len(missing_files), + "missing_file_info": file_info + } + + +def save_missing_files(missing_files: List[_DownloadObj], output_dir: Union[str, Path]) -> None: + """Format and save missing files + """ + if len(missing_files) == 0: + return + data = format_missing_files(missing_files) + path = os.path.join( + output_dir, f"missing_files.{datetime.now().strftime('%Y%m%d%H%M%S')}.json") + Path(path).write_text(json.dumps(data)) + + class _DownloadsAPIBase(ABC): """Parent class for Product and DataPackage classes as part of the DownloadsAPI (https://osdatahub.os.uk/docs/downloads/overview) @@ -102,7 +151,8 @@
def details(self) -> dict: """ Calls endpoint to return details about the product or data package """ - response = osdatahub.get(self._endpoint(self._id), proxies=osdatahub.get_proxies()) + response = osdatahub.get(self._endpoint( + self._id), proxies=osdatahub.get_proxies()) response.raise_for_status() return response.json() @@ -114,7 +164,8 @@ def all_products(cls, **kwargs) -> list: Returns: list of dictionaries containing all products available to download """ - response = osdatahub.get(cls._ENDPOINT, proxies=osdatahub.get_proxies()) + response = osdatahub.get( + cls._ENDPOINT, proxies=osdatahub.get_proxies()) response.raise_for_status() return response.json() @@ -146,7 +197,8 @@ def _download(download_list: Union[list, _DownloadObj], output_dir: Union[str, P defaults to the machine's CPU count """ if isinstance(download_list, list) and len(download_list) == 0: - raise Exception("Argument \"download_list\" is empty. Please provide at least one DownloadObj to download") + raise Exception( + "Argument \"download_list\" is empty. Please provide at least one DownloadObj to download") elif isinstance(download_list, list) and len(download_list) > 1 and not download_multiple: raise Exception("Argument \"download_list\" contains more than 1 object to download, but argument " "\"download_multiple\" is set to False. 
Please pass only 1 download or set " @@ -162,16 +214,32 @@ def _download(download_list: Union[list, _DownloadObj], output_dir: Union[str, P with ThreadPoolExecutor(max_workers=processes) as executor: pbar = tqdm(total=sum([d.size for d in download_list]), unit="B", unit_scale=True, leave=True, desc=f"Downloaded 0/{len(download_list)} files from osdatahub") - results = list([executor.submit(p.download, output_dir, overwrite, pbar) for p in download_list]) + processed_downloads = {} num_downloads_completed = 0 - for _ in as_completed(results): - num_downloads_completed += 1 - pbar.set_description( - f"Downloaded {num_downloads_completed}/{len(download_list)} files from osdatahub") + results = [] + missing_files = [] + + for p in download_list: + future = executor.submit( + p.download, output_dir, overwrite, pbar) + processed_downloads[future] = p + + for future in as_completed(processed_downloads): + info = processed_downloads[future] + try: + results.append(future.result()) + num_downloads_completed += 1 + pbar.set_description( + f"Downloaded {num_downloads_completed}/{len(download_list)} files from osdatahub") + except Exception: + missing_files.append(info) + + save_missing_files(missing_files, output_dir) else: # download single file - d = download_list[0] if isinstance(download_list, list) else download_list + d = download_list[0] if isinstance( + download_list, list) else download_list results = [d.download(output_dir, overwrite)] return results diff --git a/src/osdatahub/__init__.py b/src/osdatahub/__init__.py index 31dee8e..0062a48 100644 --- a/src/osdatahub/__init__.py +++ b/src/osdatahub/__init__.py @@ -9,7 +9,7 @@ def set_proxies(proxies): def get_proxies(): return json.loads(os.environ["_OSDATAHUB_PROXIES"]) -__version__ = "1.2.7" +__version__ = "1.2.8" from osdatahub.extent import Extent from osdatahub.FeaturesAPI import FeaturesAPI