From 100443477552698c24bb2ca552dd899f04db2b30 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Fri, 12 Apr 2024 15:11:59 +0200 Subject: [PATCH 01/40] still an early draft --- Makefile | 2 +- src/huggingface_hub/_local_folder.py | 233 ++++++++++++++++++++ src/huggingface_hub/file_download.py | 313 +++++++++++++++++++-------- src/huggingface_hub/utils/_fixes.py | 1 + 4 files changed, 461 insertions(+), 88 deletions(-) create mode 100644 src/huggingface_hub/_local_folder.py diff --git a/Makefile b/Makefile index ad2a0520c1..f8a9216653 100644 --- a/Makefile +++ b/Makefile @@ -13,8 +13,8 @@ quality: python utils/generate_async_inference_client.py style: - ruff check --fix $(check_dirs) # linter ruff format $(check_dirs) # formatter + ruff check --fix $(check_dirs) # linter python utils/check_contrib_list.py --update python utils/check_static_imports.py --update python utils/generate_async_inference_client.py --update diff --git a/src/huggingface_hub/_local_folder.py b/src/huggingface_hub/_local_folder.py new file mode 100644 index 0000000000..d03bc95ef7 --- /dev/null +++ b/src/huggingface_hub/_local_folder.py @@ -0,0 +1,233 @@ +# coding=utf-8 +# Copyright 2024-present, the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains utilities to handle the `../.huggingface` folder in local directories. + +First discussed in https://github.com/huggingface/huggingface_hub/issues/1738 to store +download metadata when downloading files from the hub to a local directory (without +using the cache). + +./.huggingface folder structure: +[4.0K] data +├── [4.0K] .huggingface +│ └── [4.0K] download +│ ├── [ 16] file.parquet.metadata +│ ├── [ 16] file.txt.metadata +│ └── [4.0K] folder +│ └── [ 16] file.parquet.metadata +│ +├── [6.5G] file.parquet +├── [1.5K] file.txt +└── [4.0K] folder + └── [ 16] file.parquet + + +Metadata file structure: +``` +# file.txt.metadata +{ + "commit_hash": "11c5a3d5811f50298f278a704980280950aedb10", + "etag": "a16a55fda99d2f2e7b69cce5cf93ff4ad3049930", + "timestamp": 1712656091 +} + + +# file.parquet.metadata +{ + "commit_hash": "11c5a3d5811f50298f278a704980280950aedb10", + "etag": "7c5d3f4b8b76583b422fcb9189ad6c89d5d97a094541ce8932dce3ecabde1421", + "timestamp": 1712656091 +} +``` +""" + +import json +import logging +import os +import time +from dataclasses import dataclass +from pathlib import Path +from typing import Optional, Tuple + +from .utils import WeakFileLock + + +logger = logging.getLogger(__name__) + + +@dataclass +class LocalDownloadFileMetadata: + """ + Metadata about a file in the local directory related to a download process. + + Attributes: + filename (`str`): + Path of the file in the repo. + commit_hash (`str`): + Commit hash of the file in the repo. + etag (`str`): + ETag of the file in the repo. Used to check if the file has changed. + For LFS files, this is the sha256 of the file. For regular, it correspond to the git hash. + timestamp (`int`): + Unix timestamp of when the metadata was saved i.e. when the metadata was accurate. + """ + + filename: str + commit_hash: str + etag: str + timestamp: int + + +def local_file_path(local_dir: Path, filename: str) -> Path: + """Compute path to a file in the local directory. + + Args: + local_dir (`Path`): + Path to the local directory in which files are downloaded. + filename (`str`): + Path of the file in the repo. + + Return: + `Path`: the path to the file. + """ + return local_dir / _normalize_filename(filename) + + +def local_lock_path(local_dir: Path, filename: str) -> Path: + """Compute path to the lock file for a file in the local directory. + + It is the same lock as for the metadata file. + Folder containing the lock file is guaranteed to exist. + + Args: + local_dir (`Path`): + Path to the local directory in which files are downloaded. + filename (`str`): + Path of the file in the repo. + + Return: + `Path`: the path to the lock file. + """ + return _download_metadata_file_path(local_dir, filename)[0] + + +def local_tmp_path(local_dir: Path, filename: str) -> Path: + """Compute path where the file will be downloaded to before being moved to correct destination. + + Args: + local_dir (`Path`): + Path to the local directory in which files are downloaded. + filename (`str`): + Path of the file in the repo. + + Return: + `Path`: the path to the temporary file. + """ + return _download_metadata_file_path(local_dir, filename)[1].with_suffix(".incomplete") + + +def read_download_metadata(local_dir: Path, filename: str) -> Optional[LocalDownloadFileMetadata]: + """Read metadata about a file in the local directory related to a download process. + + Args: + local_dir (`Path`): + Path to the local directory in which files are downloaded. + filename (`str`): + Path of the file in the repo. + + Return: + `[LocalDownloadFileMetadata]` or `None`: the metadata if it exists, `None` otherwise. + """ + # file_path => path where file is downloaded + # metadata_path => path where metadata is stored + # lock_path => path to lock file to ensure atomic read/write of metadata + file_path = local_file_path(local_dir, filename) + lock_path, metadata_path = _download_metadata_file_path(local_dir, filename) + with WeakFileLock(lock_path): + if metadata_path.exists(): + try: + with metadata_path.open() as f: + metadata = json.load(f) + metadata = LocalDownloadFileMetadata( + filename=filename, + commit_hash=metadata["commit_hash"], + etag=metadata["etag"], + timestamp=metadata["timestamp"], + ) + except Exception as e: + # remove the metadata file if it is corrupted / not a json / not the right format + logger.warning(f"Invalid metadata file {metadata_path}: {e}. Removing it from disk and continue.") + try: + metadata_path.unlink() + except Exception as e: + logger.warning(f"Could not remove corrupted metadata file {metadata_path}: {e}") + + try: + # check if the file exists and hasn't been modified since the metadata was saved + stat = file_path.stat() + if stat.st_mtime <= metadata.timestamp: + return metadata + logger.info(f"Ignoring metadata as file has been modified since metadata was saved ({filename}).") + except FileNotFoundError: + # file does not exist => metadata is outdated + return None + return None + + +def write_download_metadata(local_dir: Path, filename: str, commit_hash: str, etag: str) -> None: + """Write metadata about a file in the local directory related to a download process. + + Args: + local_dir (`Path`): + Path to the local directory in which files are downloaded. + """ + lock_path, metadata_path = _download_metadata_file_path(local_dir, filename) + with WeakFileLock(lock_path): + with metadata_path.open("w") as f: + json.dump({"commit_hash": commit_hash, "etag": etag, "timestamp": int(time.time())}, f, indent=4) + + +def _download_metadata_file_path(local_dir: Path, filename: str) -> Tuple[Path, Path]: + """Compute path to the metadata file for a given file in the local directory. + + Args: + local_dir (`Path`): + Path to the local directory in which files are downloaded. + filename (`str`): + Path of the file in the repo. + + Return: + `Tuple[Path, Path]`: a tuple (`lock_path`, `metadata_path`). You must use the lock_path to read or write in the + metadata file. You are guaranteed the folder that should contain the files exists but + the files themselves might not exist. + """ + path = local_dir / ".huggingface" / "download" / f"{_normalize_filename(filename)}.metadata" + lock_path = path.with_suffix(".lock") + path.parent.mkdir(parents=True, exist_ok=True) + return lock_path, path + + +def _normalize_filename(filename: str) -> str: + """Normalize a filename to be cross-platform. + + Args: + filename (`str`): + Path of the file in the repo. + + Return: + `str`: the normalized filename. + """ + # filename is the path in the Hub repository (separated by '/') + # make sure to have a cross platform transcription + return os.path.join(*filename.split("/")) diff --git a/src/huggingface_hub/file_download.py b/src/huggingface_hub/file_download.py index 566ef3246a..a6b14d01ac 100644 --- a/src/huggingface_hub/file_download.py +++ b/src/huggingface_hub/file_download.py @@ -16,7 +16,7 @@ from dataclasses import dataclass from functools import partial from pathlib import Path -from typing import Any, BinaryIO, Dict, Generator, Literal, Optional, Tuple, Union +from typing import Any, BinaryIO, Dict, Generator, Literal, Optional, Tuple, Type, Union from urllib.parse import quote, urlparse import requests @@ -24,6 +24,7 @@ from huggingface_hub import constants from . import __version__ # noqa: F401 # for backward compatibility +from ._local_folder import local_file_path, local_lock_path, read_download_metadata, write_download_metadata from .constants import ( DEFAULT_ETAG_TIMEOUT, DEFAULT_REQUEST_TIMEOUT, @@ -1255,97 +1256,36 @@ def hf_hub_download( commit_hash = None expected_size = None head_call_error: Optional[Exception] = None - if not local_files_only: - try: - try: - metadata = get_hf_file_metadata( - url=url, - token=token, - proxies=proxies, - timeout=etag_timeout, - library_name=library_name, - library_version=library_version, - user_agent=user_agent, - ) - except EntryNotFoundError as http_error: - # Cache the non-existence of the file and raise - commit_hash = http_error.response.headers.get(HUGGINGFACE_HEADER_X_REPO_COMMIT) - if commit_hash is not None and not legacy_cache_layout: - no_exist_file_path = Path(storage_folder) / ".no_exist" / commit_hash / relative_filename - no_exist_file_path.parent.mkdir(parents=True, exist_ok=True) - no_exist_file_path.touch() - _cache_commit_hash_for_specific_revision(storage_folder, revision, commit_hash) - raise - # Commit hash must exist - commit_hash = metadata.commit_hash - if commit_hash is None: - raise FileMetadataError( - "Distant resource does not seem to be on huggingface.co. It is possible that a configuration issue" - " prevents you from downloading resources from https://huggingface.co. Please check your firewall" - " and proxy settings and make sure your SSL certificates are updated." - ) + # Try to get metadata from the server. + # Do not raise yet if the file is not found or not accessible. + if not local_files_only: + metadata_or_error = _get_metadata_or_catch_error( + url=url, + token=token, + proxies=proxies, + etag_timeout=etag_timeout, + library_name=library_name, + library_version=library_version, + user_agent=user_agent, + headers=headers, + storage_folder=storage_folder, + revision=revision, + relative_filename=relative_filename, + ) - # Etag must exist + if isinstance(metadata_or_error, Exception): + head_call_error = metadata_or_error + else: + assert isinstance(metadata_or_error, HfFileMetadata) + metadata = metadata_or_error etag = metadata.etag - # We favor a custom header indicating the etag of the linked resource, and - # we fallback to the regular etag header. - # If we don't have any of those, raise an error. - if etag is None: - raise FileMetadataError( - "Distant resource does not have an ETag, we won't be able to reliably ensure reproducibility." - ) - - # Expected (uncompressed) size + commit_hash = metadata.commit_hash expected_size = metadata.size + url_to_download = metadata.location - # In case of a redirect, save an extra redirect on the request.get call, - # and ensure we download the exact atomic version even if it changed - # between the HEAD and the GET (unlikely, but hey). - # - # If url domain is different => we are downloading from a CDN => url is signed => don't send auth - # If url domain is the same => redirect due to repo rename AND downloading a regular file => keep auth - if metadata.location != url: - url_to_download = metadata.location - if urlparse(url).netloc != urlparse(url_to_download).netloc: - # Remove authorization header when downloading a LFS blob - headers.pop("authorization", None) - except (requests.exceptions.SSLError, requests.exceptions.ProxyError): - # Actually raise for those subclasses of ConnectionError - raise - except ( - requests.exceptions.ConnectionError, - requests.exceptions.Timeout, - OfflineModeIsEnabled, - ) as error: - # Otherwise, our Internet connection is down. - # etag is None - head_call_error = error - pass - except (RevisionNotFoundError, EntryNotFoundError): - # The repo was found but the revision or entry doesn't exist on the Hub (never existed or got deleted) - raise - except requests.HTTPError as error: - # Multiple reasons for an http error: - # - Repository is private and invalid/missing token sent - # - Repository is gated and invalid/missing token sent - # - Hub is down (error 500 or 504) - # => let's switch to 'local_files_only=True' to check if the files are already cached. - # (if it's not the case, the error will be re-raised) - head_call_error = error - pass - except FileMetadataError as error: - # Multiple reasons for a FileMetadataError: - # - Wrong network configuration (proxy, firewall, SSL certificates) - # - Inconsistency on the Hub - # => let's switch to 'local_files_only=True' to check if the files are already cached. - # (if it's not the case, the error will be re-raised) - head_call_error = error - pass - - assert ( - local_files_only or etag is not None or head_call_error is not None - ), "etag is empty due to uncovered problems" + if not (local_files_only or etag is not None or head_call_error is not None): + raise RuntimeError("etag is empty due to uncovered problems") # etag can be None for several reasons: # 1. we passed local_files_only. @@ -1528,6 +1468,101 @@ def _resumable_file_manager() -> Generator[io.BufferedWriter, None, None]: return pointer_path +def hf_hub_download_local_dir( + repo_id: str, + filename: str, + commit_hash: str, + etag: Optional[str], + local_dir: Union[str, Path], + *, + subfolder: Optional[str] = None, + repo_type: Optional[str] = None, + revision: Optional[str] = None, + library_name: Optional[str] = None, + library_version: Optional[str] = None, + cache_dir: Union[str, Path, None] = None, + local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto", + user_agent: Union[Dict, str, None] = None, + force_download: bool = False, + force_filename: Optional[str] = None, + proxies: Optional[Dict] = None, + etag_timeout: float = DEFAULT_ETAG_TIMEOUT, + resume_download: bool = False, + token: Union[bool, str, None] = None, + local_files_only: bool = False, + headers: Optional[Dict[str, str]] = None, + endpoint: Optional[str] = None, +) -> str: + local_dir = Path(local_dir) + local_metadata = read_download_metadata(local_dir=local_dir, filename=filename) + file_path = local_file_path(local_dir, filename) + lock_path = local_lock_path(local_dir, filename) + + if local_files_only: + if file_path.is_file(): + return str(file_path) + raise LocalEntryNotFoundError( + "Cannot find the requested file in local folder and outgoing traffic has been disabled. To enable" + " hf.co look-ups and downloads online, set 'local_files_only=False'." + ) + + # Local file exists + metadata exists + commit_hash matches => return file + if file_path.is_file() and local_metadata is not None and local_metadata.commit_hash == commit_hash: + return str(file_path) + + # Local file doesn't exist or commit_hash doesn't match => we need the etag + if etag is None: + metadata_or_error = _get_metadata_or_catch_error( + url=hf_hub_url( + repo_id, + filename, + subfolder=subfolder, + repo_type=repo_type, + revision=revision, + endpoint=endpoint, + ), + token=token, + proxies=proxies, + etag_timeout=etag_timeout, + library_name=library_name, + library_version=library_version, + user_agent=user_agent, + headers=headers, + storage_folder=cache_dir, + revision=revision, + relative_filename=filename, + ) + etag = metadata_or_error.etag + + # Local file exists + etag matches => update metadata and return file + if file_path.is_file() and local_metadata is not None and local_metadata.etag == etag: + write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag) + return str(file_path) + + # Local file doesn't exist or etag doesn't match => retrieve file from remote (or cache) + + # If we are lucky enough, the file is already in the cache => copy it + cached_path = try_to_load_from_cache( + repo_id=repo_id, + filename=filename, + cache_dir=cache_dir, + revision=revision, + repo_type=repo_type, + ) + if isinstance(cached_path, str): + with WeakFileLock(lock_path): + file_path.parent.mkdir(parents=True, exist_ok=True) + shutil.copyfile(cached_path, file_path) + write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag) + return str(file_path) + + # Let's download the file + # http_get() + resume_download + # copy + write metadata + # return + return str(file_path) + + @validate_hf_hub_args def try_to_load_from_cache( repo_id: str, @@ -1696,6 +1731,110 @@ def get_hf_file_metadata( ) +def _get_metadata_or_catch_error( + *, + url: str, + token: Union[bool, str, None], + proxies: Optional[Dict], + etag_timeout: Optional[float], + library_name: Optional[str], + library_version: Optional[str], + user_agent: Union[Dict, str, None], + headers: Optional[Dict[str, str]], + storage_folder: str, + revision: str, + relative_filename: str, +) -> Union[HfFileMetadata, Type[Exception]]: + """Get metadata for a file on the Hub, safely handling network issues. + + Returns either the etag, commit_hash and expected size of the file, or the error + raised while fetching the metadata. + + NOTE: This function mutates `headers` inplace! It removes the `authorization` header + if the file is a LFS blob and the domain of the url is different from the + domain of the location (typically an S3 bucket). + """ + try: + try: + metadata = get_hf_file_metadata( + url=url, + token=token, + proxies=proxies, + timeout=etag_timeout, + library_name=library_name, + library_version=library_version, + user_agent=user_agent, + ) + except EntryNotFoundError as http_error: + # Cache the non-existence of the file and raise + commit_hash = http_error.response.headers.get(HUGGINGFACE_HEADER_X_REPO_COMMIT) + if commit_hash is not None: + no_exist_file_path = Path(storage_folder) / ".no_exist" / commit_hash / relative_filename + no_exist_file_path.parent.mkdir(parents=True, exist_ok=True) + no_exist_file_path.touch() + _cache_commit_hash_for_specific_revision(storage_folder, revision, commit_hash) + raise + + # Commit hash must exist + commit_hash = metadata.commit_hash + if commit_hash is None: + raise FileMetadataError( + "Distant resource does not seem to be on huggingface.co. It is possible that a configuration issue" + " prevents you from downloading resources from https://huggingface.co. Please check your firewall" + " and proxy settings and make sure your SSL certificates are updated." + ) + + # Etag must exist + # If we don't have any of those, raise an error. + if metadata.etag is None: + raise FileMetadataError( + "Distant resource does not have an ETag, we won't be able to reliably ensure reproducibility." + ) + + # In case of a redirect, save an extra redirect on the request.get call, + # and ensure we download the exact atomic version even if it changed + # between the HEAD and the GET (unlikely, but hey). + # + # If url domain is different => we are downloading from a CDN => url is signed => don't send auth + # If url domain is the same => redirect due to repo rename AND downloading a regular file => keep auth + if url != metadata.location: + if urlparse(url).netloc != urlparse(metadata.location).netloc: + # Remove authorization header when downloading a LFS blob + headers.pop("authorization", None) + + return metadata + except (requests.exceptions.SSLError, requests.exceptions.ProxyError): + # Actually raise for those subclasses of ConnectionError + raise + except ( + requests.exceptions.ConnectionError, + requests.exceptions.Timeout, + OfflineModeIsEnabled, + ) as error: + # Otherwise, our Internet connection is down. + # etag is None + return error + except (RevisionNotFoundError, EntryNotFoundError): + # The repo was found but the revision or entry doesn't exist on the Hub (never existed or got deleted) + raise + except requests.HTTPError as error: + # Multiple reasons for an http error: + # - Repository is private and invalid/missing token sent + # - Repository is gated and invalid/missing token sent + # - Hub is down (error 500 or 504) + # => let's switch to 'local_files_only=True' to check if the files are already cached. + # (if it's not the case, the error will be re-raised) + return error + pass + except FileMetadataError as error: + # Multiple reasons for a FileMetadataError: + # - Wrong network configuration (proxy, firewall, SSL certificates) + # - Inconsistency on the Hub + # => let's switch to 'local_files_only=True' to check if the files are already cached. + # (if it's not the case, the error will be re-raised) + return error + + def _int_or_none(value: Optional[str]) -> Optional[int]: try: return int(value) # type: ignore diff --git a/src/huggingface_hub/utils/_fixes.py b/src/huggingface_hub/utils/_fixes.py index 1edcbc1eee..28810dd705 100644 --- a/src/huggingface_hub/utils/_fixes.py +++ b/src/huggingface_hub/utils/_fixes.py @@ -79,6 +79,7 @@ def _set_write_permission_and_retry(func, path, excinfo): @contextlib.contextmanager def WeakFileLock(lock_file: Union[str, Path]) -> Generator[BaseFileLock, None, None]: + """A filelock that won't raise an exception if release fails.""" lock = FileLock(lock_file) lock.acquire() From 5a8605e359e43bd10834f0946ac60a3213ec5eba Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Fri, 12 Apr 2024 17:28:43 +0200 Subject: [PATCH 02/40] this is better --- src/huggingface_hub/_local_folder.py | 142 ++++---- src/huggingface_hub/file_download.py | 468 ++++++++++++++------------- 2 files changed, 306 insertions(+), 304 deletions(-) diff --git a/src/huggingface_hub/_local_folder.py b/src/huggingface_hub/_local_folder.py index d03bc95ef7..d387990d6c 100644 --- a/src/huggingface_hub/_local_folder.py +++ b/src/huggingface_hub/_local_folder.py @@ -57,8 +57,9 @@ import os import time from dataclasses import dataclass +from functools import lru_cache from pathlib import Path -from typing import Optional, Tuple +from typing import Optional from .utils import WeakFileLock @@ -66,6 +67,31 @@ logger = logging.getLogger(__name__) +@dataclass +class LocalDownloadFilePaths: + """ + Paths to the files related to a download process in a local dir. + + Returned by `get_local_download_paths`. + + Attributes: + file_path (`Path`): + Path where the file will be saved. + lock_path (`Path`): + Path to the lock file used to ensure atomicity when reading/writing metadata. + metadata_path (`Path`): + Path to the metadata file. + """ + + file_path: Path + lock_path: Path + metadata_path: Path + + def incomplete_path(self, etag: str) -> Path: + """Return the path where a file will be temporarily downloaded before being moved to `file_path`.""" + return self.metadata_path.with_suffix(f".{etag}.incomplete") + + @dataclass class LocalDownloadFileMetadata: """ @@ -89,26 +115,11 @@ class LocalDownloadFileMetadata: timestamp: int -def local_file_path(local_dir: Path, filename: str) -> Path: - """Compute path to a file in the local directory. - - Args: - local_dir (`Path`): - Path to the local directory in which files are downloaded. - filename (`str`): - Path of the file in the repo. - - Return: - `Path`: the path to the file. - """ - return local_dir / _normalize_filename(filename) - - -def local_lock_path(local_dir: Path, filename: str) -> Path: - """Compute path to the lock file for a file in the local directory. +@lru_cache(maxsize=128) # ensure singleton +def get_local_download_paths(local_dir: Path, filename: str) -> LocalDownloadFilePaths: + """Compute paths to the files related to a download process. - It is the same lock as for the metadata file. - Folder containing the lock file is guaranteed to exist. + Folders containing the paths are all guaranteed to exist. Args: local_dir (`Path`): @@ -117,24 +128,18 @@ def local_lock_path(local_dir: Path, filename: str) -> Path: Path of the file in the repo. Return: - `Path`: the path to the lock file. + [`LocalDownloadFilePaths`]: the paths to the files (file_path, lock_path, metadata_path, incomplete_path). """ - return _download_metadata_file_path(local_dir, filename)[0] - - -def local_tmp_path(local_dir: Path, filename: str) -> Path: - """Compute path where the file will be downloaded to before being moved to correct destination. - - Args: - local_dir (`Path`): - Path to the local directory in which files are downloaded. - filename (`str`): - Path of the file in the repo. + # filename is the path in the Hub repository (separated by '/') + # make sure to have a cross platform transcription + sanitized_filename = os.path.join(*filename.split("/")) + file_path = local_dir / sanitized_filename + metadata_path = local_dir / ".huggingface" / "download" / f"{sanitized_filename}.metadata" + lock_path = metadata_path.with_suffix(".lock") - Return: - `Path`: the path to the temporary file. - """ - return _download_metadata_file_path(local_dir, filename)[1].with_suffix(".incomplete") + file_path.parent.mkdir(parents=True, exist_ok=True) + metadata_path.parent.mkdir(parents=True, exist_ok=True) + return LocalDownloadFilePaths(file_path=file_path, lock_path=lock_path, metadata_path=metadata_path) def read_download_metadata(local_dir: Path, filename: str) -> Optional[LocalDownloadFileMetadata]: @@ -149,15 +154,13 @@ def read_download_metadata(local_dir: Path, filename: str) -> Optional[LocalDown Return: `[LocalDownloadFileMetadata]` or `None`: the metadata if it exists, `None` otherwise. """ - # file_path => path where file is downloaded - # metadata_path => path where metadata is stored - # lock_path => path to lock file to ensure atomic read/write of metadata - file_path = local_file_path(local_dir, filename) - lock_path, metadata_path = _download_metadata_file_path(local_dir, filename) - with WeakFileLock(lock_path): - if metadata_path.exists(): + paths = get_local_download_paths(local_dir, filename) + # file_path = local_file_path(local_dir, filename) + # lock_path, metadata_path = _download_metadata_file_path(local_dir, filename) + with WeakFileLock(paths.lock_path): + if paths.metadata_path.exists(): try: - with metadata_path.open() as f: + with paths.metadata_path.open() as f: metadata = json.load(f) metadata = LocalDownloadFileMetadata( filename=filename, @@ -167,15 +170,17 @@ def read_download_metadata(local_dir: Path, filename: str) -> Optional[LocalDown ) except Exception as e: # remove the metadata file if it is corrupted / not a json / not the right format - logger.warning(f"Invalid metadata file {metadata_path}: {e}. Removing it from disk and continue.") + logger.warning( + f"Invalid metadata file {paths.metadata_path}: {e}. Removing it from disk and continue." + ) try: - metadata_path.unlink() + paths.metadata_path.unlink() except Exception as e: - logger.warning(f"Could not remove corrupted metadata file {metadata_path}: {e}") + logger.warning(f"Could not remove corrupted metadata file {paths.metadata_path}: {e}") try: # check if the file exists and hasn't been modified since the metadata was saved - stat = file_path.stat() + stat = paths.file_path.stat() if stat.st_mtime <= metadata.timestamp: return metadata logger.info(f"Ignoring metadata as file has been modified since metadata was saved ({filename}).") @@ -192,42 +197,7 @@ def write_download_metadata(local_dir: Path, filename: str, commit_hash: str, et local_dir (`Path`): Path to the local directory in which files are downloaded. """ - lock_path, metadata_path = _download_metadata_file_path(local_dir, filename) - with WeakFileLock(lock_path): - with metadata_path.open("w") as f: + paths = get_local_download_paths(local_dir, filename) + with WeakFileLock(paths.lock_path): + with paths.metadata_path.open("w") as f: json.dump({"commit_hash": commit_hash, "etag": etag, "timestamp": int(time.time())}, f, indent=4) - - -def _download_metadata_file_path(local_dir: Path, filename: str) -> Tuple[Path, Path]: - """Compute path to the metadata file for a given file in the local directory. - - Args: - local_dir (`Path`): - Path to the local directory in which files are downloaded. - filename (`str`): - Path of the file in the repo. - - Return: - `Tuple[Path, Path]`: a tuple (`lock_path`, `metadata_path`). You must use the lock_path to read or write in the - metadata file. You are guaranteed the folder that should contain the files exists but - the files themselves might not exist. - """ - path = local_dir / ".huggingface" / "download" / f"{_normalize_filename(filename)}.metadata" - lock_path = path.with_suffix(".lock") - path.parent.mkdir(parents=True, exist_ok=True) - return lock_path, path - - -def _normalize_filename(filename: str) -> str: - """Normalize a filename to be cross-platform. - - Args: - filename (`str`): - Path of the file in the repo. - - Return: - `str`: the normalized filename. - """ - # filename is the path in the Hub repository (separated by '/') - # make sure to have a cross platform transcription - return os.path.join(*filename.split("/")) diff --git a/src/huggingface_hub/file_download.py b/src/huggingface_hub/file_download.py index a6b14d01ac..b976e6a841 100644 --- a/src/huggingface_hub/file_download.py +++ b/src/huggingface_hub/file_download.py @@ -16,7 +16,7 @@ from dataclasses import dataclass from functools import partial from pathlib import Path -from typing import Any, BinaryIO, Dict, Generator, Literal, Optional, Tuple, Type, Union +from typing import Any, BinaryIO, Dict, Generator, Literal, NoReturn, Optional, Tuple, Union from urllib.parse import quote, urlparse import requests @@ -24,7 +24,11 @@ from huggingface_hub import constants from . import __version__ # noqa: F401 # for backward compatibility -from ._local_folder import local_file_path, local_lock_path, read_download_metadata, write_download_metadata +from ._local_folder import ( + get_local_download_paths, + read_download_metadata, + write_download_metadata, +) from .constants import ( DEFAULT_ETAG_TIMEOUT, DEFAULT_REQUEST_TIMEOUT, @@ -1251,41 +1255,16 @@ def hf_hub_download( headers=headers, ) - url_to_download = url - etag = None - commit_hash = None - expected_size = None - head_call_error: Optional[Exception] = None - - # Try to get metadata from the server. - # Do not raise yet if the file is not found or not accessible. - if not local_files_only: - metadata_or_error = _get_metadata_or_catch_error( - url=url, - token=token, - proxies=proxies, - etag_timeout=etag_timeout, - library_name=library_name, - library_version=library_version, - user_agent=user_agent, - headers=headers, - storage_folder=storage_folder, - revision=revision, - relative_filename=relative_filename, - ) - - if isinstance(metadata_or_error, Exception): - head_call_error = metadata_or_error - else: - assert isinstance(metadata_or_error, HfFileMetadata) - metadata = metadata_or_error - etag = metadata.etag - commit_hash = metadata.commit_hash - expected_size = metadata.size - url_to_download = metadata.location - - if not (local_files_only or etag is not None or head_call_error is not None): - raise RuntimeError("etag is empty due to uncovered problems") + (url_to_download, etag, commit_hash, expected_size, head_call_error) = _get_metadata_or_catch_error( + url=url, + proxies=proxies, + etag_timeout=etag_timeout, + headers=headers, + revision=revision, + local_files_only=local_files_only, + storage_folder=storage_folder, + relative_filename=relative_filename, + ) # etag can be None for several reasons: # 1. we passed local_files_only. @@ -1297,61 +1276,36 @@ def hf_hub_download( # # If the specified revision is a commit hash, look inside "snapshots". # If the specified revision is a branch or tag, look inside "refs". - if etag is None: - # In those cases, we cannot force download. - if force_download: - if local_files_only: - raise ValueError("Cannot pass 'force_download=True' and 'local_files_only=True' at the same time.") - elif isinstance(head_call_error, OfflineModeIsEnabled): - raise ValueError( - "Cannot pass 'force_download=True' when offline mode is enabled." - ) from head_call_error + if head_call_error is not None: + # Couldn't make a HEAD call => let's try to find a local file + if not force_download: + commit_hash = None + if REGEX_COMMIT_HASH.match(revision): + commit_hash = revision else: - raise ValueError("Force download failed due to the above error.") from head_call_error + ref_path = os.path.join(storage_folder, "refs", revision) + if os.path.isfile(ref_path): + with open(ref_path) as f: + commit_hash = f.read() - # Try to get "commit_hash" from "revision" - commit_hash = None - if REGEX_COMMIT_HASH.match(revision): - commit_hash = revision - else: - ref_path = os.path.join(storage_folder, "refs", revision) - if os.path.isfile(ref_path): - with open(ref_path) as f: - commit_hash = f.read() - - # Return pointer file if exists - if commit_hash is not None: - pointer_path = _get_pointer_path(storage_folder, commit_hash, relative_filename) - if os.path.exists(pointer_path): - if local_dir is not None: - return _to_local_dir( - pointer_path, local_dir, relative_filename, use_symlinks=local_dir_use_symlinks - ) - return pointer_path + # Return pointer file if exists + if commit_hash is not None: + pointer_path = _get_pointer_path(storage_folder, commit_hash, relative_filename) + if os.path.exists(pointer_path): + if local_dir is not None: + return _to_local_dir( + pointer_path, local_dir, relative_filename, use_symlinks=local_dir_use_symlinks + ) + return pointer_path - # If we couldn't find an appropriate file on disk, raise an error. - # If files cannot be found and local_files_only=True, - # the models might've been found if local_files_only=False - # Notify the user about that - if local_files_only: - raise LocalEntryNotFoundError( - "Cannot find the requested files in the disk cache and outgoing traffic has been disabled. To enable" - " hf.co look-ups and downloads online, set 'local_files_only' to False." - ) - elif isinstance(head_call_error, RepositoryNotFoundError) or isinstance(head_call_error, GatedRepoError): - # Repo not found or gated => let's raise the actual error - raise head_call_error - else: - # Otherwise: most likely a connection issue or Hub downtime => let's warn the user - raise LocalEntryNotFoundError( - "An error happened while trying to locate the file on the Hub and we cannot find the requested files" - " in the local cache. Please check your connection and try again or make sure your Internet connection" - " is on." - ) from head_call_error - - # From now on, etag and commit_hash are not None. + # Otherwise, raise appropriate error + _raise_on_head_call_error(head_call_error, force_download, local_files_only) + + # From now on, etag, commit_hash, url and size are not None. assert etag is not None, "etag must have been retrieved from server" assert commit_hash is not None, "commit_hash must have been retrieved from server" + assert url_to_download is not None, "file location must have been retrieved from server" + assert expected_size is not None, "expected_size must have been retrieved from server" blob_path = os.path.join(storage_folder, "blobs", etag) pointer_path = _get_pointer_path(storage_folder, commit_hash, relative_filename) @@ -1469,75 +1423,81 @@ def _resumable_file_manager() -> Generator[io.BufferedWriter, None, None]: def hf_hub_download_local_dir( + *, repo_id: str, filename: str, commit_hash: str, - etag: Optional[str], local_dir: Union[str, Path], - *, subfolder: Optional[str] = None, repo_type: Optional[str] = None, revision: Optional[str] = None, library_name: Optional[str] = None, library_version: Optional[str] = None, cache_dir: Union[str, Path, None] = None, - local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto", user_agent: Union[Dict, str, None] = None, force_download: bool = False, - force_filename: Optional[str] = None, proxies: Optional[Dict] = None, etag_timeout: float = DEFAULT_ETAG_TIMEOUT, - resume_download: bool = False, token: Union[bool, str, None] = None, local_files_only: bool = False, headers: Optional[Dict[str, str]] = None, endpoint: Optional[str] = None, ) -> str: local_dir = Path(local_dir) + paths = get_local_download_paths() local_metadata = read_download_metadata(local_dir=local_dir, filename=filename) - file_path = local_file_path(local_dir, filename) - lock_path = local_lock_path(local_dir, filename) - - if local_files_only: - if file_path.is_file(): - return str(file_path) - raise LocalEntryNotFoundError( - "Cannot find the requested file in local folder and outgoing traffic has been disabled. To enable" - " hf.co look-ups and downloads online, set 'local_files_only=False'." - ) # Local file exists + metadata exists + commit_hash matches => return file - if file_path.is_file() and local_metadata is not None and local_metadata.commit_hash == commit_hash: - return str(file_path) + if paths.file_path.is_file() and local_metadata is not None and local_metadata.commit_hash == commit_hash: + return str(paths.file_path) + + # Need to make http requests + url = hf_hub_url( + repo_id, + filename, + subfolder=subfolder, + repo_type=repo_type, + revision=commit_hash, + endpoint=endpoint, + ) + headers = build_hf_headers( + token=token, + library_name=library_name, + library_version=library_version, + user_agent=user_agent, + headers=headers, + ) # Local file doesn't exist or commit_hash doesn't match => we need the etag - if etag is None: - metadata_or_error = _get_metadata_or_catch_error( - url=hf_hub_url( - repo_id, - filename, - subfolder=subfolder, - repo_type=repo_type, - revision=revision, - endpoint=endpoint, - ), - token=token, - proxies=proxies, - etag_timeout=etag_timeout, - library_name=library_name, - library_version=library_version, - user_agent=user_agent, - headers=headers, - storage_folder=cache_dir, - revision=revision, - relative_filename=filename, - ) - etag = metadata_or_error.etag + (url_to_download, etag, _, expected_size, head_call_error) = _get_metadata_or_catch_error( + url=url, + proxies=proxies, + etag_timeout=etag_timeout, + headers=headers, + revision=commit_hash, + local_files_only=local_files_only, + ) + + if head_call_error is not None: + # No HEAD call but local file exists => default to local file + if not force_download and paths.file_path.is_file(): + logger.warning( + f"Couldn't access the Hub to check for update but local file already exists. Defaulting to existing file. (error: {head_call_error})" + ) + return str(paths.file_path) + # Otherwise => raise + _raise_on_head_call_error(head_call_error, force_download, local_files_only) + + # From now on, etag, commit_hash, url and size are not None. + assert etag is not None, "etag must have been retrieved from server" + assert commit_hash is not None, "commit_hash must have been retrieved from server" + assert url_to_download is not None, "file location must have been retrieved from server" + assert expected_size is not None, "expected_size must have been retrieved from server" # Local file exists + etag matches => update metadata and return file - if file_path.is_file() and local_metadata is not None and local_metadata.etag == etag: + if paths.file_path.is_file() and local_metadata is not None and local_metadata.etag == etag: write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag) - return str(file_path) + return str(paths.file_path) # Local file doesn't exist or etag doesn't match => retrieve file from remote (or cache) @@ -1550,17 +1510,37 @@ def hf_hub_download_local_dir( repo_type=repo_type, ) if isinstance(cached_path, str): - with WeakFileLock(lock_path): - file_path.parent.mkdir(parents=True, exist_ok=True) - shutil.copyfile(cached_path, file_path) + with WeakFileLock(paths.lock_path): + paths.file_path.parent.mkdir(parents=True, exist_ok=True) + shutil.copyfile(cached_path, paths.file_path) write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag) - return str(file_path) + return str(paths.file_path) + + # Otherwise, let's download the file! + tmp_path = paths.incomplete_path(etag) + with WeakFileLock(paths.lock_path): + if force_download: + tmp_path.unlink(missing_ok=True) + + with tmp_path.open("ab") as f: + resume_size = f.tell() + logger.info(f"downloading '{url_to_download}' to '{tmp_path}' (resume from {resume_size}/{expected_size})") + http_get( + url_to_download, + f, + proxies=proxies, + resume_size=f.tell(), + headers=headers, + expected_size=expected_size, + ) + + # Move the file to its final location + logger.info(f"moving '{tmp_path}' to '{paths.file_path}'") + shutil.move(tmp_path, paths.file_path) + logger.info(f"successfully downloaded '{filename}' to '{paths.file_path}'") - # Let's download the file - # http_get() + resume_download - # copy + write metadata - # return - return str(file_path) + write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag) + return str(paths.file_path) @validate_hf_hub_args @@ -1734,17 +1714,19 @@ def get_hf_file_metadata( def _get_metadata_or_catch_error( *, url: str, - token: Union[bool, str, None], proxies: Optional[Dict], etag_timeout: Optional[float], - library_name: Optional[str], - library_version: Optional[str], - user_agent: Union[Dict, str, None], - headers: Optional[Dict[str, str]], - storage_folder: str, + headers: Dict[str, str], revision: str, - relative_filename: str, -) -> Union[HfFileMetadata, Type[Exception]]: + local_files_only: bool, + relative_filename: Optional[str] = None, # only used to store `.no_exists` in cache + storage_folder: Optional[str] = None, # only used to store `.no_exists` in cache +) -> Union[ + # Either an exception is caught and returned + Tuple[None, None, None, None, Exception], + # Or the metadata is returned + Tuple[str, str, str, int, None], +]: """Get metadata for a file on the Hub, safely handling network issues. Returns either the etag, commit_hash and expected size of the file, or the error @@ -1754,85 +1736,135 @@ def _get_metadata_or_catch_error( if the file is a LFS blob and the domain of the url is different from the domain of the location (typically an S3 bucket). """ - try: + if local_files_only: + return ( + None, + None, + None, + None, + OfflineModeIsEnabled(f"Cannot access file since 'local_files_only=True' as been set. (url: {url})"), + ) + + url_to_download: str = url + etag: Optional[str] = None + commit_hash: Optional[str] = None + expected_size: Optional[int] = None + head_error_call: Optional[Exception] = None + + # Try to get metadata from the server. + # Do not raise yet if the file is not found or not accessible. + if not local_files_only: try: - metadata = get_hf_file_metadata( - url=url, - token=token, - proxies=proxies, - timeout=etag_timeout, - library_name=library_name, - library_version=library_version, - user_agent=user_agent, - ) - except EntryNotFoundError as http_error: - # Cache the non-existence of the file and raise - commit_hash = http_error.response.headers.get(HUGGINGFACE_HEADER_X_REPO_COMMIT) - if commit_hash is not None: - no_exist_file_path = Path(storage_folder) / ".no_exist" / commit_hash / relative_filename - no_exist_file_path.parent.mkdir(parents=True, exist_ok=True) - no_exist_file_path.touch() - _cache_commit_hash_for_specific_revision(storage_folder, revision, commit_hash) + try: + metadata = get_hf_file_metadata(url=url, proxies=proxies, timeout=etag_timeout, headers=headers) + except EntryNotFoundError as http_error: + if storage_folder is not None and relative_filename is not None: + # Cache the non-existence of the file + commit_hash = http_error.response.headers.get(HUGGINGFACE_HEADER_X_REPO_COMMIT) + if commit_hash is not None: + no_exist_file_path = Path(storage_folder) / ".no_exist" / commit_hash / relative_filename + no_exist_file_path.parent.mkdir(parents=True, exist_ok=True) + no_exist_file_path.touch() + _cache_commit_hash_for_specific_revision(storage_folder, revision, commit_hash) + raise + + # Commit hash must exist + commit_hash = metadata.commit_hash + if commit_hash is None: + raise FileMetadataError( + "Distant resource does not seem to be on huggingface.co. It is possible that a configuration issue" + " prevents you from downloading resources from https://huggingface.co. Please check your firewall" + " and proxy settings and make sure your SSL certificates are updated." + ) + + # Etag must exist + # If we don't have any of those, raise an error. + etag = metadata.etag + if etag is None: + raise FileMetadataError( + "Distant resource does not have an ETag, we won't be able to reliably ensure reproducibility." + ) + + # Size must exist + expected_size = metadata.size + if expected_size is None: + raise FileMetadataError("Distant resource does not have a Content-Length.") + + # In case of a redirect, save an extra redirect on the request.get call, + # and ensure we download the exact atomic version even if it changed + # between the HEAD and the GET (unlikely, but hey). + # + # If url domain is different => we are downloading from a CDN => url is signed => don't send auth + # If url domain is the same => redirect due to repo rename AND downloading a regular file => keep auth + if url != metadata.location: + url_to_download = metadata.location + if urlparse(url).netloc != urlparse(metadata.location).netloc: + # Remove authorization header when downloading a LFS blob + headers.pop("authorization", None) + except (requests.exceptions.SSLError, requests.exceptions.ProxyError): + # Actually raise for those subclasses of ConnectionError raise + except ( + requests.exceptions.ConnectionError, + requests.exceptions.Timeout, + OfflineModeIsEnabled, + ) as error: + # Otherwise, our Internet connection is down. + # etag is None + head_error_call = error + except (RevisionNotFoundError, EntryNotFoundError): + # The repo was found but the revision or entry doesn't exist on the Hub (never existed or got deleted) + raise + except requests.HTTPError as error: + # Multiple reasons for an http error: + # - Repository is private and invalid/missing token sent + # - Repository is gated and invalid/missing token sent + # - Hub is down (error 500 or 504) + # => let's switch to 'local_files_only=True' to check if the files are already cached. + # (if it's not the case, the error will be re-raised) + head_error_call = error + except FileMetadataError as error: + # Multiple reasons for a FileMetadataError: + # - Wrong network configuration (proxy, firewall, SSL certificates) + # - Inconsistency on the Hub + # => let's switch to 'local_files_only=True' to check if the files are already cached. + # (if it's not the case, the error will be re-raised) + head_error_call = error + + if not (local_files_only or etag is not None or head_error_call is not None): + raise RuntimeError("etag is empty due to uncovered problems") - # Commit hash must exist - commit_hash = metadata.commit_hash - if commit_hash is None: - raise FileMetadataError( - "Distant resource does not seem to be on huggingface.co. It is possible that a configuration issue" - " prevents you from downloading resources from https://huggingface.co. Please check your firewall" - " and proxy settings and make sure your SSL certificates are updated." - ) + return (url_to_download, etag, commit_hash, expected_size, head_error_call) # type: ignore [return-value] - # Etag must exist - # If we don't have any of those, raise an error. - if metadata.etag is None: - raise FileMetadataError( - "Distant resource does not have an ETag, we won't be able to reliably ensure reproducibility." - ) - # In case of a redirect, save an extra redirect on the request.get call, - # and ensure we download the exact atomic version even if it changed - # between the HEAD and the GET (unlikely, but hey). - # - # If url domain is different => we are downloading from a CDN => url is signed => don't send auth - # If url domain is the same => redirect due to repo rename AND downloading a regular file => keep auth - if url != metadata.location: - if urlparse(url).netloc != urlparse(metadata.location).netloc: - # Remove authorization header when downloading a LFS blob - headers.pop("authorization", None) +def _raise_on_head_call_error(head_call_error: Exception, force_download: bool, local_files_only: bool) -> NoReturn: + """Raise an appropriate error when the HEAD call failed and we cannot locate a local file.""" - return metadata - except (requests.exceptions.SSLError, requests.exceptions.ProxyError): - # Actually raise for those subclasses of ConnectionError - raise - except ( - requests.exceptions.ConnectionError, - requests.exceptions.Timeout, - OfflineModeIsEnabled, - ) as error: - # Otherwise, our Internet connection is down. - # etag is None - return error - except (RevisionNotFoundError, EntryNotFoundError): - # The repo was found but the revision or entry doesn't exist on the Hub (never existed or got deleted) - raise - except requests.HTTPError as error: - # Multiple reasons for an http error: - # - Repository is private and invalid/missing token sent - # - Repository is gated and invalid/missing token sent - # - Hub is down (error 500 or 504) - # => let's switch to 'local_files_only=True' to check if the files are already cached. - # (if it's not the case, the error will be re-raised) - return error - pass - except FileMetadataError as error: - # Multiple reasons for a FileMetadataError: - # - Wrong network configuration (proxy, firewall, SSL certificates) - # - Inconsistency on the Hub - # => let's switch to 'local_files_only=True' to check if the files are already cached. - # (if it's not the case, the error will be re-raised) - return error + # No head call => we cannot force download. + if force_download: + if local_files_only: + raise ValueError("Cannot pass 'force_download=True' and 'local_files_only=True' at the same time.") + elif isinstance(head_call_error, OfflineModeIsEnabled): + raise ValueError("Cannot pass 'force_download=True' when offline mode is enabled.") from head_call_error + else: + raise ValueError("Force download failed due to the above error.") from head_call_error + + # No head call + couldn't find an appropriate file on disk => raise an error. + if local_files_only: + raise LocalEntryNotFoundError( + "Cannot find the requested files in the disk cache and outgoing traffic has been disabled. To enable" + " hf.co look-ups and downloads online, set 'local_files_only' to False." + ) + elif isinstance(head_call_error, RepositoryNotFoundError) or isinstance(head_call_error, GatedRepoError): + # Repo not found or gated => let's raise the actual error + raise head_call_error + else: + # Otherwise: most likely a connection issue or Hub downtime => let's warn the user + raise LocalEntryNotFoundError( + "An error happened while trying to locate the file on the Hub and we cannot find the requested files" + " in the local cache. Please check your connection and try again or make sure your Internet connection" + " is on." + ) from head_call_error def _int_or_none(value: Optional[str]) -> Optional[int]: From 68a6cf13acb0d9826c7ab914080a1c9621541eef Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Fri, 12 Apr 2024 17:39:12 +0200 Subject: [PATCH 03/40] fix --- src/huggingface_hub/file_download.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/huggingface_hub/file_download.py b/src/huggingface_hub/file_download.py index b976e6a841..08e9f8defd 100644 --- a/src/huggingface_hub/file_download.py +++ b/src/huggingface_hub/file_download.py @@ -1430,7 +1430,6 @@ def hf_hub_download_local_dir( local_dir: Union[str, Path], subfolder: Optional[str] = None, repo_type: Optional[str] = None, - revision: Optional[str] = None, library_name: Optional[str] = None, library_version: Optional[str] = None, cache_dir: Union[str, Path, None] = None, @@ -1501,12 +1500,14 @@ def hf_hub_download_local_dir( # Local file doesn't exist or etag doesn't match => retrieve file from remote (or cache) + # TODO: if file exists + etag is a sha256 => compute local hash and compare? + # If we are lucky enough, the file is already in the cache => copy it cached_path = try_to_load_from_cache( repo_id=repo_id, filename=filename, cache_dir=cache_dir, - revision=revision, + revision=commit_hash, repo_type=repo_type, ) if isinstance(cached_path, str): From 8e903f8931de29162aa532537b31b22981ec9fbc Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Wed, 24 Apr 2024 11:39:19 +0200 Subject: [PATCH 04/40] revampt/refactor download process --- src/huggingface_hub/file_download.py | 490 +++++++++++++-------------- 1 file changed, 239 insertions(+), 251 deletions(-) diff --git a/src/huggingface_hub/file_download.py b/src/huggingface_hub/file_download.py index 08e9f8defd..521304bfe9 100644 --- a/src/huggingface_hub/file_download.py +++ b/src/huggingface_hub/file_download.py @@ -2,27 +2,21 @@ import errno import fnmatch import inspect -import io import json import os import re import shutil import stat -import tempfile import time import uuid import warnings -from contextlib import contextmanager from dataclasses import dataclass -from functools import partial from pathlib import Path -from typing import Any, BinaryIO, Dict, Generator, Literal, NoReturn, Optional, Tuple, Union +from typing import Any, BinaryIO, Dict, Literal, NoReturn, Optional, Tuple, Union from urllib.parse import quote, urlparse import requests -from huggingface_hub import constants - from . import __version__ # noqa: F401 # for backward compatibility from ._local_folder import ( get_local_download_paths, @@ -791,46 +785,16 @@ def cached_download( cache_path = "\\\\?\\" + os.path.abspath(cache_path) with WeakFileLock(lock_path): - # If the download just completed while the lock was activated. - if os.path.exists(cache_path) and not force_download: - # Even if returning early like here, the lock will be released. - return cache_path - - if resume_download: - incomplete_path = cache_path + ".incomplete" - - @contextmanager - def _resumable_file_manager() -> Generator[io.BufferedWriter, None, None]: - with open(incomplete_path, "ab") as f: - yield f - - temp_file_manager = _resumable_file_manager - if os.path.exists(incomplete_path): - resume_size = os.stat(incomplete_path).st_size - else: - resume_size = 0 - else: - temp_file_manager = partial( # type: ignore - tempfile.NamedTemporaryFile, mode="wb", dir=cache_dir, delete=False - ) - resume_size = 0 - - # Download to temporary file, then copy to cache dir once finished. - # Otherwise you get corrupt cache entries if the download gets interrupted. - with temp_file_manager() as temp_file: - logger.info("downloading %s to %s", url, temp_file.name) - - http_get( - url_to_download, - temp_file, - proxies=proxies, - resume_size=resume_size, - headers=headers, - expected_size=expected_size, - ) - - logger.info("storing %s in cache at %s", url, cache_path) - _chmod_and_replace(temp_file.name, cache_path) + _download_to_tmp_and_move( + incomplete_path=Path(cache_path + ".incomplete"), + destination_path=Path(cache_path), + url_to_download=url_to_download, + proxies=proxies, + headers=headers, + expected_size=expected_size, + filename=filename, + force_download=force_download, + ) if force_filename is None: logger.info("creating metadata file for %s", cache_path) @@ -1212,7 +1176,6 @@ def hf_hub_download( cache_dir = str(cache_dir) if isinstance(local_dir, Path): local_dir = str(local_dir) - locks_dir = os.path.join(cache_dir, ".locks") if subfolder == "": subfolder = None @@ -1225,6 +1188,87 @@ def hf_hub_download( if repo_type not in REPO_TYPES: raise ValueError(f"Invalid repo type: {repo_type}. Accepted repo types are: {str(REPO_TYPES)}") + headers = build_hf_headers( + token=token, + library_name=library_name, + library_version=library_version, + user_agent=user_agent, + headers=headers, + ) + + if local_dir is not None: + if local_dir_use_symlinks != "auto": + warnings.warn( + "`local_dir_use_symlinks` parameter is deprecated and will be ignored. " + "The process to download files to a local folder has been updated and do " + "not rely on symlinks anymore. You only need to pass a destination folder " + "as`local_dir`.\n" + "For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder." + ) + + return _hf_hub_download_to_local_dir( + # Destination + local_dir=local_dir, + # File info + repo_id=repo_id, + repo_type=repo_type, + filename=filename, + revision=revision, + # HTTP info + proxies=proxies, + etag_timeout=etag_timeout, + headers=headers, + endpoint=endpoint, + # Additional options + cache_dir=cache_dir, + force_download=force_download, + local_files_only=local_files_only, + ) + else: + return _hf_hub_download_to_cache_dir( + # Destination + cache_dir=cache_dir, + # File info + repo_id=repo_id, + filename=filename, + repo_type=repo_type, + revision=revision, + # HTTP info + headers=headers, + proxies=proxies, + etag_timeout=etag_timeout, + endpoint=endpoint, + # Additional options + local_files_only=local_files_only, + force_download=force_download, + resume_download=resume_download, + ) + + +def _hf_hub_download_to_cache_dir( + *, + # Destination + cache_dir: str, + # File info + repo_id: str, + filename: str, + repo_type: str, + revision: str, + # HTTP info + headers: Dict[str, str], + proxies: Optional[Dict], + etag_timeout: float, + endpoint: Optional[str], + # Additional options + local_files_only: bool, + force_download: bool, + resume_download: bool, +) -> str: + """Download a given file to a cache folder, if not already present. + + Method should not be called directly. Please use `hf_hub_download` instead. + """ + locks_dir = os.path.join(cache_dir, ".locks") storage_folder = os.path.join(cache_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type)) # cross platform transcription of filename, to be used as a local file path. @@ -1236,31 +1280,23 @@ def hf_hub_download( " owner to rename this file." ) - # if user provides a commit_hash and they already have the file on disk, - # shortcut everything. + # if user provides a commit_hash and they already have the file on disk, shortcut everything. if REGEX_COMMIT_HASH.match(revision): pointer_path = _get_pointer_path(storage_folder, revision, relative_filename) - if os.path.exists(pointer_path): - if local_dir is not None: - return _to_local_dir(pointer_path, local_dir, relative_filename, use_symlinks=local_dir_use_symlinks) + if os.path.exists(pointer_path) and not force_download: return pointer_path - url = hf_hub_url(repo_id, filename, repo_type=repo_type, revision=revision, endpoint=endpoint) - - headers = build_hf_headers( - token=token, - library_name=library_name, - library_version=library_version, - user_agent=user_agent, - headers=headers, - ) - + # Try to get metadata (etag, commit_hash, url, size) from the server. + # If we can't, a HEAD request error is returned. (url_to_download, etag, commit_hash, expected_size, head_call_error) = _get_metadata_or_catch_error( - url=url, + repo_id=repo_id, + filename=filename, + repo_type=repo_type, + revision=revision, + endpoint=endpoint, proxies=proxies, etag_timeout=etag_timeout, headers=headers, - revision=revision, local_files_only=local_files_only, storage_folder=storage_folder, relative_filename=relative_filename, @@ -1269,7 +1305,7 @@ def hf_hub_download( # etag can be None for several reasons: # 1. we passed local_files_only. # 2. we don't have a connection - # 3. Hub is down (HTTP 500 or 504) + # 3. Hub is down (HTTP 500, 503, 504) # 4. repo is not found -for example private or gated- and invalid/missing token sent # 5. Hub is blocked by a firewall or proxy is not set correctly. # => Try to get the last downloaded one from the specified revision. @@ -1291,11 +1327,7 @@ def hf_hub_download( # Return pointer file if exists if commit_hash is not None: pointer_path = _get_pointer_path(storage_folder, commit_hash, relative_filename) - if os.path.exists(pointer_path): - if local_dir is not None: - return _to_local_dir( - pointer_path, local_dir, relative_filename, use_symlinks=local_dir_use_symlinks - ) + if os.path.exists(pointer_path) and not force_download: return pointer_path # Otherwise, raise appropriate error @@ -1311,21 +1343,19 @@ def hf_hub_download( os.makedirs(os.path.dirname(blob_path), exist_ok=True) os.makedirs(os.path.dirname(pointer_path), exist_ok=True) + # if passed revision is not identical to commit_hash # then revision has to be a branch name or tag name. # In that case store a ref. _cache_commit_hash_for_specific_revision(storage_folder, revision, commit_hash) - if os.path.exists(pointer_path) and not force_download: - if local_dir is not None: - return _to_local_dir(pointer_path, local_dir, relative_filename, use_symlinks=local_dir_use_symlinks) - return pointer_path + # If file already exists, return it (except if force_download=True) + if not force_download: + if os.path.exists(pointer_path): + return pointer_path - if os.path.exists(blob_path) and not force_download: - # we have the blob already, but not the pointer - if local_dir is not None: # to local dir - return _to_local_dir(blob_path, local_dir, relative_filename, use_symlinks=local_dir_use_symlinks) - else: # or in snapshot cache + if os.path.exists(blob_path): + # we have the blob already, but not the pointer _create_symlink(blob_path, pointer_path, new_blob=False) return pointer_path @@ -1343,137 +1373,63 @@ def hf_hub_download( Path(lock_path).parent.mkdir(parents=True, exist_ok=True) with WeakFileLock(lock_path): - # If the download just completed while the lock was activated. - if os.path.exists(pointer_path) and not force_download: - # Even if returning early like here, the lock will be released. - if local_dir is not None: - return _to_local_dir(pointer_path, local_dir, relative_filename, use_symlinks=local_dir_use_symlinks) - return pointer_path - - if resume_download: - incomplete_path = blob_path + ".incomplete" - - @contextmanager - def _resumable_file_manager() -> Generator[io.BufferedWriter, None, None]: - with open(incomplete_path, "ab") as f: - yield f - - temp_file_manager = _resumable_file_manager - if os.path.exists(incomplete_path): - resume_size = os.stat(incomplete_path).st_size - else: - resume_size = 0 - else: - temp_file_manager = partial( # type: ignore - tempfile.NamedTemporaryFile, mode="wb", dir=cache_dir, delete=False - ) - resume_size = 0 - - # Download to temporary file, then copy to cache dir once finished. - # Otherwise you get corrupt cache entries if the download gets interrupted. - with temp_file_manager() as temp_file: - logger.info("downloading %s to %s", url, temp_file.name) - - if expected_size is not None: # might be None if HTTP header not set correctly - # Check tmp path - _check_disk_space(expected_size, os.path.dirname(temp_file.name)) - - # Check destination - _check_disk_space(expected_size, os.path.dirname(blob_path)) - if local_dir is not None: - _check_disk_space(expected_size, local_dir) - - http_get( - url_to_download, - temp_file, - proxies=proxies, - resume_size=resume_size, - headers=headers, - expected_size=expected_size, - displayed_filename=filename, - ) - - if local_dir is None: - logger.debug(f"Storing {url} in cache at {blob_path}") - _chmod_and_replace(temp_file.name, blob_path) - _create_symlink(blob_path, pointer_path, new_blob=True) - else: - local_dir_filepath = os.path.join(local_dir, relative_filename) - os.makedirs(os.path.dirname(local_dir_filepath), exist_ok=True) - - # If "auto" (default) copy-paste small files to ease manual editing but symlink big files to save disk - # In both cases, blob file is cached. - is_big_file = os.stat(temp_file.name).st_size > constants.HF_HUB_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD - if local_dir_use_symlinks is True or (local_dir_use_symlinks == "auto" and is_big_file): - logger.debug(f"Storing {url} in cache at {blob_path}") - _chmod_and_replace(temp_file.name, blob_path) - logger.debug("Create symlink to local dir") - _create_symlink(blob_path, local_dir_filepath, new_blob=False) - elif local_dir_use_symlinks == "auto" and not is_big_file: - logger.debug(f"Storing {url} in cache at {blob_path}") - _chmod_and_replace(temp_file.name, blob_path) - logger.debug("Duplicate in local dir (small file and use_symlink set to 'auto')") - shutil.copyfile(blob_path, local_dir_filepath) - else: - logger.debug(f"Storing {url} in local_dir at {local_dir_filepath} (not cached).") - _chmod_and_replace(temp_file.name, local_dir_filepath) - pointer_path = local_dir_filepath # for return value + _download_to_tmp_and_move( + incomplete_path=Path(blob_path + ".incomplete"), + destination_path=Path(blob_path), + url_to_download=url_to_download, + proxies=proxies, + headers=headers, + expected_size=expected_size, + filename=filename, + force_download=force_download, + ) + _create_symlink(blob_path, pointer_path, new_blob=True) return pointer_path -def hf_hub_download_local_dir( +def _hf_hub_download_to_local_dir( *, + # Destination + local_dir: Union[str, Path], + # File info repo_id: str, + repo_type: str, filename: str, - commit_hash: str, - local_dir: Union[str, Path], - subfolder: Optional[str] = None, - repo_type: Optional[str] = None, - library_name: Optional[str] = None, - library_version: Optional[str] = None, - cache_dir: Union[str, Path, None] = None, - user_agent: Union[Dict, str, None] = None, - force_download: bool = False, - proxies: Optional[Dict] = None, - etag_timeout: float = DEFAULT_ETAG_TIMEOUT, - token: Union[bool, str, None] = None, - local_files_only: bool = False, - headers: Optional[Dict[str, str]] = None, - endpoint: Optional[str] = None, + revision: str, + # HTTP info + proxies: Optional[Dict], + etag_timeout: float, + headers: Dict[str, str], + endpoint: Optional[str], + # Additional options + cache_dir: str, + force_download: bool, + local_files_only: bool, ) -> str: + """Download a given file to a local folder, if not already present. + + Method should not be called directly. Please use `hf_hub_download` instead. + """ local_dir = Path(local_dir) - paths = get_local_download_paths() + paths = get_local_download_paths(local_dir=local_dir, filename=filename) local_metadata = read_download_metadata(local_dir=local_dir, filename=filename) # Local file exists + metadata exists + commit_hash matches => return file - if paths.file_path.is_file() and local_metadata is not None and local_metadata.commit_hash == commit_hash: - return str(paths.file_path) + if REGEX_COMMIT_HASH.match(revision): + if paths.file_path.is_file() and local_metadata is not None and local_metadata.commit_hash == revision: + return str(paths.file_path) - # Need to make http requests - url = hf_hub_url( - repo_id, - filename, - subfolder=subfolder, + # Local file doesn't exist or commit_hash doesn't match => we need the etag + (url_to_download, etag, commit_hash, expected_size, head_call_error) = _get_metadata_or_catch_error( + repo_id=repo_id, + filename=filename, repo_type=repo_type, - revision=commit_hash, + revision=revision, endpoint=endpoint, - ) - headers = build_hf_headers( - token=token, - library_name=library_name, - library_version=library_version, - user_agent=user_agent, - headers=headers, - ) - - # Local file doesn't exist or commit_hash doesn't match => we need the etag - (url_to_download, etag, _, expected_size, head_call_error) = _get_metadata_or_catch_error( - url=url, proxies=proxies, etag_timeout=etag_timeout, headers=headers, - revision=commit_hash, local_files_only=local_files_only, ) @@ -1518,27 +1474,17 @@ def hf_hub_download_local_dir( return str(paths.file_path) # Otherwise, let's download the file! - tmp_path = paths.incomplete_path(etag) with WeakFileLock(paths.lock_path): - if force_download: - tmp_path.unlink(missing_ok=True) - - with tmp_path.open("ab") as f: - resume_size = f.tell() - logger.info(f"downloading '{url_to_download}' to '{tmp_path}' (resume from {resume_size}/{expected_size})") - http_get( - url_to_download, - f, - proxies=proxies, - resume_size=f.tell(), - headers=headers, - expected_size=expected_size, - ) - - # Move the file to its final location - logger.info(f"moving '{tmp_path}' to '{paths.file_path}'") - shutil.move(tmp_path, paths.file_path) - logger.info(f"successfully downloaded '{filename}' to '{paths.file_path}'") + _download_to_tmp_and_move( + incomplete_path=paths.incomplete_path(etag), + destination_path=paths.file_path, + url_to_download=url_to_download, + proxies=proxies, + headers=headers, + expected_size=expected_size, + filename=filename, + force_download=force_download, + ) write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag) return str(paths.file_path) @@ -1714,18 +1660,22 @@ def get_hf_file_metadata( def _get_metadata_or_catch_error( *, - url: str, + repo_id: str, + filename: str, + repo_type: str, + revision: str, + endpoint: Optional[str], proxies: Optional[Dict], etag_timeout: Optional[float], - headers: Dict[str, str], - revision: str, + headers: Dict[str, str], # mutated inplace! local_files_only: bool, relative_filename: Optional[str] = None, # only used to store `.no_exists` in cache storage_folder: Optional[str] = None, # only used to store `.no_exists` in cache ) -> Union[ # Either an exception is caught and returned Tuple[None, None, None, None, Exception], - # Or the metadata is returned + # Or the metadata is returned as + # `(url_to_download, etag, commit_hash, expected_size, None)` Tuple[str, str, str, int, None], ]: """Get metadata for a file on the Hub, safely handling network issues. @@ -1743,9 +1693,12 @@ def _get_metadata_or_catch_error( None, None, None, - OfflineModeIsEnabled(f"Cannot access file since 'local_files_only=True' as been set. (url: {url})"), + OfflineModeIsEnabled( + f"Cannot access file since 'local_files_only=True' as been set. (repo_id: {repo_id}, repo_type: {repo_type}, revision: {revision}, filename: {filename})" + ), ) + url = url = hf_hub_url(repo_id, filename, repo_type=repo_type, revision=revision, endpoint=endpoint) url_to_download: str = url etag: Optional[str] = None commit_hash: Optional[str] = None @@ -1868,6 +1821,70 @@ def _raise_on_head_call_error(head_call_error: Exception, force_download: bool, ) from head_call_error +def _download_to_tmp_and_move( + incomplete_path: Path, + destination_path: Path, + url_to_download: str, + proxies: Optional[Dict], + headers: Dict[str, str], + expected_size: Optional[int], + filename: str, + force_download: bool, +) -> None: + """Download content from a URL to a destination path. + + Internal logic: + - return early if file is already downloaded + - resume download if possible (from incomplete file) + - do not resume download if `force_download=True` or `HF_HUB_ENABLE_HF_TRANSFER=True` + - check disk space before downloading + - download content to a temporary file + - set correct permissions on temporary file + - move the temporary file to the destination path + + Both `incomplete_path` and `destination_path` must be on the same volume to avoid a local copy. + """ + if destination_path.exists() and not force_download: + # Do nothing if already exists (except if force_download=True) + return + + if incomplete_path.exists() and (force_download or (HF_HUB_ENABLE_HF_TRANSFER and not proxies)): + # By default, we will try to resume the download if possible. + # However, if the user has set `force_download=True` or if `hf_transfer` is enabled, then we should + # not resume the download => delete the incomplete file. + message = f"Removing incomplete file '{incomplete_path}'" + if force_download: + message += " (force_download=True)" + elif HF_HUB_ENABLE_HF_TRANSFER and not proxies: + message += " (hf_transfer=True)" + logger.info(message) + incomplete_path.unlink(missing_ok=True) + + with incomplete_path.open("ab") as f: + resume_size = f.tell() + message = f"Downloading '{filename}' to '{incomplete_path}'" + if resume_size > 0 and expected_size is not None: + message += f" (resume from {resume_size}/{expected_size})" + logger.info(message) + + if expected_size is not None: # might be None if HTTP header not set correctly + # Check disk space in both tmp and destination path + _check_disk_space(expected_size, incomplete_path.parent) + _check_disk_space(expected_size, destination_path.parent) + + http_get( + url_to_download, + f, + proxies=proxies, + resume_size=resume_size, + headers=headers, + expected_size=expected_size, + ) + + logger.info(f"Download complete. Moving file to {destination_path}") + _chmod_and_move(incomplete_path, destination_path) + + def _int_or_none(value: Optional[str]) -> Optional[int]: try: return int(value) # type: ignore @@ -1875,7 +1892,7 @@ def _int_or_none(value: Optional[str]) -> Optional[int]: return None -def _chmod_and_replace(src: str, dst: str) -> None: +def _chmod_and_move(src: Path, dst: Path) -> None: """Set correct permission before moving a blob from tmp directory to cache dir. Do not take into account the `umask` from the process as there is no convenient way @@ -1889,15 +1906,15 @@ def _chmod_and_replace(src: str, dst: str) -> None: - Fix issue: https://github.com/huggingface/huggingface_hub/issues/1215 """ # Get umask by creating a temporary file in the cached repo folder. - tmp_file = Path(dst).parent.parent / f"tmp_{uuid.uuid4()}" + tmp_file = dst.parent.parent / f"tmp_{uuid.uuid4()}" try: tmp_file.touch() cache_dir_mode = Path(tmp_file).stat().st_mode - os.chmod(src, stat.S_IMODE(cache_dir_mode)) + os.chmod(str(src), stat.S_IMODE(cache_dir_mode)) finally: tmp_file.unlink() - shutil.move(src, dst) + shutil.move(str(src), str(dst)) def _get_pointer_path(storage_folder: str, revision: str, relative_filename: str) -> str: @@ -1911,32 +1928,3 @@ def _get_pointer_path(storage_folder: str, revision: str, relative_filename: str f" `relative_filename='{relative_filename}'`." ) return pointer_path - - -def _to_local_dir( - path: str, local_dir: str, relative_filename: str, use_symlinks: Union[bool, Literal["auto"]] -) -> str: - """Place a file in a local dir (different than cache_dir). - - Either symlink to blob file in cache or duplicate file depending on `use_symlinks` and file size. - """ - # Using `os.path.abspath` instead of `Path.resolve()` to avoid resolving symlinks - local_dir_filepath = os.path.join(local_dir, relative_filename) - if Path(os.path.abspath(local_dir)) not in Path(os.path.abspath(local_dir_filepath)).parents: - raise ValueError( - f"Cannot copy file '{relative_filename}' to local dir '{local_dir}': file would not be in the local" - " directory." - ) - - os.makedirs(os.path.dirname(local_dir_filepath), exist_ok=True) - real_blob_path = os.path.realpath(path) - - # If "auto" (default) copy-paste small files to ease manual editing but symlink big files to save disk - if use_symlinks == "auto": - use_symlinks = os.stat(real_blob_path).st_size > constants.HF_HUB_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD - - if use_symlinks: - _create_symlink(real_blob_path, local_dir_filepath, new_blob=False) - else: - shutil.copyfile(real_blob_path, local_dir_filepath) - return local_dir_filepath From 5f610ee6bd0d485e7a9e74b1f966dc0ed5820d45 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Wed, 24 Apr 2024 12:34:10 +0200 Subject: [PATCH 05/40] resume download by default + do not upload .huggingface folder --- src/huggingface_hub/_commit_scheduler.py | 4 ++-- src/huggingface_hub/_local_folder.py | 22 ++++++++++++++++-- src/huggingface_hub/_snapshot_download.py | 2 -- src/huggingface_hub/file_download.py | 28 +++++++++++++++-------- src/huggingface_hub/hf_api.py | 8 ++----- src/huggingface_hub/hub_mixin.py | 4 ---- src/huggingface_hub/keras_mixin.py | 3 --- src/huggingface_hub/utils/__init__.py | 2 +- src/huggingface_hub/utils/_paths.py | 12 +++++++++- tests/test_commit_api.py | 3 +++ tests/test_hf_api.py | 2 ++ tests/test_utils_paths.py | 21 ++++++++++++----- 12 files changed, 74 insertions(+), 37 deletions(-) diff --git a/src/huggingface_hub/_commit_scheduler.py b/src/huggingface_hub/_commit_scheduler.py index 80d8dac786..62d7bf1d0d 100644 --- a/src/huggingface_hub/_commit_scheduler.py +++ b/src/huggingface_hub/_commit_scheduler.py @@ -9,7 +9,7 @@ from threading import Lock, Thread from typing import Dict, List, Optional, Union -from .hf_api import IGNORE_GIT_FOLDER_PATTERNS, CommitInfo, CommitOperationAdd, HfApi +from .hf_api import DEFAULT_IGNORE_PATTERNS, CommitInfo, CommitOperationAdd, HfApi from .utils import filter_repo_objects @@ -107,7 +107,7 @@ def __init__( ignore_patterns = [] elif isinstance(ignore_patterns, str): ignore_patterns = [ignore_patterns] - self.ignore_patterns = ignore_patterns + IGNORE_GIT_FOLDER_PATTERNS + self.ignore_patterns = ignore_patterns + DEFAULT_IGNORE_PATTERNS if self.folder_path.is_file(): raise ValueError(f"'folder_path' must be a directory, not a file: '{self.folder_path}'.") diff --git a/src/huggingface_hub/_local_folder.py b/src/huggingface_hub/_local_folder.py index d387990d6c..e53f6337e2 100644 --- a/src/huggingface_hub/_local_folder.py +++ b/src/huggingface_hub/_local_folder.py @@ -134,7 +134,7 @@ def get_local_download_paths(local_dir: Path, filename: str) -> LocalDownloadFil # make sure to have a cross platform transcription sanitized_filename = os.path.join(*filename.split("/")) file_path = local_dir / sanitized_filename - metadata_path = local_dir / ".huggingface" / "download" / f"{sanitized_filename}.metadata" + metadata_path = _huggingface_dir(local_dir) / "download" / f"{sanitized_filename}.metadata" lock_path = metadata_path.with_suffix(".lock") file_path.parent.mkdir(parents=True, exist_ok=True) @@ -183,7 +183,7 @@ def read_download_metadata(local_dir: Path, filename: str) -> Optional[LocalDown stat = paths.file_path.stat() if stat.st_mtime <= metadata.timestamp: return metadata - logger.info(f"Ignoring metadata as file has been modified since metadata was saved ({filename}).") + logger.info(f"Ignored metadata for '{filename}' (outdated). Will re-compute hash.") except FileNotFoundError: # file does not exist => metadata is outdated return None @@ -201,3 +201,21 @@ def write_download_metadata(local_dir: Path, filename: str, commit_hash: str, et with WeakFileLock(paths.lock_path): with paths.metadata_path.open("w") as f: json.dump({"commit_hash": commit_hash, "etag": etag, "timestamp": int(time.time())}, f, indent=4) + + +@lru_cache() +def _huggingface_dir(local_dir: Path) -> Path: + """Return the path to the `.huggingface` directory in a local directory.""" + # Wrap in lru_cache to avoid overwriting the .gitignore file if called multiple times + path = local_dir / ".huggingface" + path.mkdir(exist_ok=True, parents=True) + + # Create a .gitignore file in the .huggingface directory if it doesn't exist + # Should be thread-safe enough like this. + gitignore = path / ".gitignore" + gitignore_lock = path / ".gitignore.lock" + if not gitignore.exists(): + with WeakFileLock(gitignore_lock): + gitignore.write_text("*") + gitignore_lock.unlink(missing_ok=True) + return path diff --git a/src/huggingface_hub/_snapshot_download.py b/src/huggingface_hub/_snapshot_download.py index cbf57ea233..f50fb13960 100644 --- a/src/huggingface_hub/_snapshot_download.py +++ b/src/huggingface_hub/_snapshot_download.py @@ -112,8 +112,6 @@ def snapshot_download( etag_timeout (`float`, *optional*, defaults to `10`): When fetching ETag, how many seconds to wait for the server to send data before giving up which is passed to `requests.request`. - resume_download (`bool`, *optional*, defaults to `False): - If `True`, resume a previously interrupted download. force_download (`bool`, *optional*, defaults to `False`): Whether the file should be downloaded even if it already exists in the local cache. token (`str`, `bool`, *optional*): diff --git a/src/huggingface_hub/file_download.py b/src/huggingface_hub/file_download.py index 521304bfe9..14d0b2cfd9 100644 --- a/src/huggingface_hub/file_download.py +++ b/src/huggingface_hub/file_download.py @@ -477,9 +477,9 @@ def http_get( consistency_error_message = ( f"Consistency check failed: file should be of size {expected_size} but has size" - f" {{actual_size}} ({displayed_filename}).\nWe are sorry for the inconvenience. Please retry download and" - " pass `force_download=True, resume_download=False` as argument.\nIf the issue persists, please let us" - " know by opening an issue on https://github.com/huggingface/huggingface_hub." + f" {{actual_size}} ({displayed_filename}).\nWe are sorry for the inconvenience. Please retry" + " with `force_download=True`.\nIf the issue persists, please let us know by opening an issue " + "on https://github.com/huggingface/huggingface_hub." ) # Stream file to buffer @@ -618,8 +618,6 @@ def cached_download( etag_timeout (`float`, *optional* defaults to `10`): When fetching ETag, how many seconds to wait for the server to send data before giving up which is passed to `requests.request`. - resume_download (`bool`, *optional*, defaults to `False`): - If `True`, resume a previously interrupted download. token (`bool`, `str`, *optional*): A token to be used for the download. - If `True`, the token is read from the HuggingFace config @@ -670,6 +668,13 @@ def cached_download( " 'hf_hub_download'", FutureWarning, ) + if resume_download: + warnings.warn( + "`resume_download` is deprecated and will be removed in version 1.0.0. " + "Download always resume when possible. " + "If you want to force a new download, use `force_download=True`.", + FutureWarning, + ) if cache_dir is None: cache_dir = HF_HUB_CACHE @@ -1087,8 +1092,6 @@ def hf_hub_download( etag_timeout (`float`, *optional*, defaults to `10`): When fetching ETag, how many seconds to wait for the server to send data before giving up which is passed to `requests.request`. - resume_download (`bool`, *optional*, defaults to `False`): - If `True`, resume a previously interrupted download. token (`str`, `bool`, *optional*): A token to be used for the download. - If `True`, the token is read from the HuggingFace config @@ -1141,6 +1144,13 @@ def hf_hub_download( FutureWarning, ) legacy_cache_layout = True + if resume_download: + warnings.warn( + "`resume_download` is deprecated and will be removed in version 1.0.0. " + "Download always resume when possible. " + "If you want to force a new download, use `force_download=True`.", + FutureWarning, + ) if legacy_cache_layout: url = hf_hub_url( @@ -1162,7 +1172,6 @@ def hf_hub_download( force_filename=force_filename, proxies=proxies, etag_timeout=etag_timeout, - resume_download=resume_download, token=token, local_files_only=local_files_only, legacy_cache_layout=legacy_cache_layout, @@ -1241,7 +1250,6 @@ def hf_hub_download( # Additional options local_files_only=local_files_only, force_download=force_download, - resume_download=resume_download, ) @@ -1262,7 +1270,6 @@ def _hf_hub_download_to_cache_dir( # Additional options local_files_only: bool, force_download: bool, - resume_download: bool, ) -> str: """Download a given file to a cache folder, if not already present. @@ -1475,6 +1482,7 @@ def _hf_hub_download_to_local_dir( # Otherwise, let's download the file! with WeakFileLock(paths.lock_path): + paths.file_path.unlink(missing_ok=True) # delete outdated file first _download_to_tmp_and_move( incomplete_path=paths.incomplete_path(etag), destination_path=paths.file_path, diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py index 6b1c947c8e..69e40140db 100644 --- a/src/huggingface_hub/hf_api.py +++ b/src/huggingface_hub/hf_api.py @@ -104,7 +104,7 @@ from .file_download import HfFileMetadata, get_hf_file_metadata, hf_hub_url from .repocard_data import DatasetCardData, ModelCardData, SpaceCardData from .utils import ( # noqa: F401 # imported for backward compatibility - IGNORE_GIT_FOLDER_PATTERNS, + DEFAULT_IGNORE_PATTERNS, BadRequestError, EntryNotFoundError, GatedRepoError, @@ -4633,7 +4633,7 @@ def upload_folder( ignore_patterns = [] elif isinstance(ignore_patterns, str): ignore_patterns = [ignore_patterns] - ignore_patterns += IGNORE_GIT_FOLDER_PATTERNS + ignore_patterns += DEFAULT_IGNORE_PATTERNS delete_operations = self._prepare_upload_folder_deletions( repo_id=repo_id, @@ -4999,8 +4999,6 @@ def hf_hub_download( etag_timeout (`float`, *optional*, defaults to `10`): When fetching ETag, how many seconds to wait for the server to send data before giving up which is passed to `requests.request`. - resume_download (`bool`, *optional*, defaults to `False`): - If `True`, resume a previously interrupted download. token (`bool` or `str`, *optional*): A valid authentication token (see https://huggingface.co/settings/token). If `None` or `True` and machine is logged in (through `huggingface-cli login` @@ -5141,8 +5139,6 @@ def snapshot_download( etag_timeout (`float`, *optional*, defaults to `10`): When fetching ETag, how many seconds to wait for the server to send data before giving up which is passed to `requests.request`. - resume_download (`bool`, *optional*, defaults to `False): - If `True`, resume a previously interrupted download. force_download (`bool`, *optional*, defaults to `False`): Whether the file should be downloaded even if it already exists in the local cache. token (`bool` or `str`, *optional*): diff --git a/src/huggingface_hub/hub_mixin.py b/src/huggingface_hub/hub_mixin.py index 2dbbe79be3..690de0cece 100644 --- a/src/huggingface_hub/hub_mixin.py +++ b/src/huggingface_hub/hub_mixin.py @@ -342,8 +342,6 @@ def from_pretrained( force_download (`bool`, *optional*, defaults to `False`): Whether to force (re-)downloading the model weights and configuration files from the Hub, overriding the existing cache. - resume_download (`bool`, *optional*, defaults to `False`): - Whether to delete incompletely received files. Will attempt to resume the download if such a file exists. proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on every request. @@ -474,8 +472,6 @@ def _from_pretrained( force_download (`bool`, *optional*, defaults to `False`): Whether to force (re-)downloading the model weights and configuration files from the Hub, overriding the existing cache. - resume_download (`bool`, *optional*, defaults to `False`): - Whether to delete incompletely received files. Will attempt to resume the download if such a file exists. proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint (e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`). diff --git a/src/huggingface_hub/keras_mixin.py b/src/huggingface_hub/keras_mixin.py index 5584ce84a2..e1c9e09fac 100644 --- a/src/huggingface_hub/keras_mixin.py +++ b/src/huggingface_hub/keras_mixin.py @@ -265,9 +265,6 @@ def from_pretrained_keras(*args, **kwargs) -> "KerasModelHubMixin": force_download (`bool`, *optional*, defaults to `False`): Whether to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether to delete incompletely received files. Will attempt to - resume the download if such a file exists. proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The diff --git a/src/huggingface_hub/utils/__init__.py b/src/huggingface_hub/utils/__init__.py index e51b274a81..60454bb99f 100644 --- a/src/huggingface_hub/utils/__init__.py +++ b/src/huggingface_hub/utils/__init__.py @@ -63,7 +63,7 @@ reset_sessions, ) from ._pagination import paginate -from ._paths import IGNORE_GIT_FOLDER_PATTERNS, filter_repo_objects +from ._paths import DEFAULT_IGNORE_PATTERNS, filter_repo_objects from ._runtime import ( dump_environment_info, get_aiohttp_version, diff --git a/src/huggingface_hub/utils/_paths.py b/src/huggingface_hub/utils/_paths.py index 411d7d52bb..d30bdbf1e0 100644 --- a/src/huggingface_hub/utils/_paths.py +++ b/src/huggingface_hub/utils/_paths.py @@ -21,7 +21,17 @@ T = TypeVar("T") -IGNORE_GIT_FOLDER_PATTERNS = [".git", ".git/*", "*/.git", "**/.git/**"] +# Always ignore `.git` and `.huggingface` folders in commits +DEFAULT_IGNORE_PATTERNS = [ + ".git", + ".git/*", + "*/.git", + "**/.git/**", + ".huggingface", + ".huggingface/*", + "*/.huggingface", + "**/.huggingface/**", +] def filter_repo_objects( diff --git a/tests/test_commit_api.py b/tests/test_commit_api.py index 037c70e51f..935641d53f 100644 --- a/tests/test_commit_api.py +++ b/tests/test_commit_api.py @@ -68,6 +68,9 @@ class TestCommitOperationForbiddenPathInRepo(unittest.TestCase): "./.git/path/to/file", "subfolder/path/.git/to/file", "./subfolder/path/.git/to/file", + ".huggingface", + "./.huggingface/path/to/file", + "./subfolder/path/.huggingface/to/file", } VALID_PATHS_IN_REPO = { diff --git a/tests/test_hf_api.py b/tests/test_hf_api.py index 5aed0f7ac7..a8c79b9dad 100644 --- a/tests/test_hf_api.py +++ b/tests/test_hf_api.py @@ -530,8 +530,10 @@ def _create_file(*parts) -> None: path.write_text("content") _create_file(".git", "file.txt") + _create_file(".huggingface", "file.txt") _create_file(".git", "folder", "file.txt") _create_file("folder", ".git", "file.txt") + _create_file("folder", ".huggingface", "file.txt") _create_file("folder", ".git", "folder", "file.txt") _create_file(".git_something", "file.txt") _create_file("file.git") diff --git a/tests/test_utils_paths.py b/tests/test_utils_paths.py index a521888cad..6d593acdee 100644 --- a/tests/test_utils_paths.py +++ b/tests/test_utils_paths.py @@ -3,7 +3,7 @@ from pathlib import Path from typing import Any, Callable, List, Optional, Union -from huggingface_hub.utils import IGNORE_GIT_FOLDER_PATTERNS, filter_repo_objects +from huggingface_hub.utils import DEFAULT_IGNORE_PATTERNS, filter_repo_objects @dataclass @@ -103,25 +103,34 @@ def _check( ) -class TestGitFolderExclusion(unittest.TestCase): - GIT_FOLDER_PATHS = [ +class TestDefaultIgnorePatterns(unittest.TestCase): + PATHS_TO_IGNORE = [ ".git", ".git/file.txt", ".git/folder/file.txt", "path/to/folder/.git", "path/to/folder/.git/file.txt", "path/to/.git/folder/file.txt", + ".huggingface", + ".huggingface/file.txt", + ".huggingface/folder/file.txt", + "path/to/.huggingface", + "path/to/.huggingface/file.txt", ] - NOT_GIT_FOLDER_PATHS = [ + VALID_PATHS = [ ".gitignore", "path/foo.git/file.txt", "path/.git_bar/file.txt", "path/to/file.git", + "file.huggingface", + "path/file.huggingface", + ".huggingface_folder", + ".huggingface_folder/file.txt", ] def test_exclude_git_folder(self): filtered_paths = filter_repo_objects( - items=self.GIT_FOLDER_PATHS + self.NOT_GIT_FOLDER_PATHS, ignore_patterns=IGNORE_GIT_FOLDER_PATTERNS + items=self.PATHS_TO_IGNORE + self.VALID_PATHS, ignore_patterns=DEFAULT_IGNORE_PATTERNS ) - self.assertListEqual(list(filtered_paths), self.NOT_GIT_FOLDER_PATHS) + self.assertListEqual(list(filtered_paths), self.VALID_PATHS) From 5a9762fe0701fd092fbe9745f41cf407e075e971 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Wed, 24 Apr 2024 14:13:26 +0200 Subject: [PATCH 06/40] compute sha256 if necessary --- src/huggingface_hub/_local_folder.py | 4 +-- src/huggingface_hub/file_download.py | 41 +++++++++++++++++++--------- 2 files changed, 30 insertions(+), 15 deletions(-) diff --git a/src/huggingface_hub/_local_folder.py b/src/huggingface_hub/_local_folder.py index e53f6337e2..a422f2653e 100644 --- a/src/huggingface_hub/_local_folder.py +++ b/src/huggingface_hub/_local_folder.py @@ -112,7 +112,7 @@ class LocalDownloadFileMetadata: filename: str commit_hash: str etag: str - timestamp: int + timestamp: float @lru_cache(maxsize=128) # ensure singleton @@ -200,7 +200,7 @@ def write_download_metadata(local_dir: Path, filename: str, commit_hash: str, et paths = get_local_download_paths(local_dir, filename) with WeakFileLock(paths.lock_path): with paths.metadata_path.open("w") as f: - json.dump({"commit_hash": commit_hash, "etag": etag, "timestamp": int(time.time())}, f, indent=4) + json.dump({"commit_hash": commit_hash, "etag": etag, "timestamp": time.time()}, f, indent=4) @lru_cache() diff --git a/src/huggingface_hub/file_download.py b/src/huggingface_hub/file_download.py index 14d0b2cfd9..48c452f762 100644 --- a/src/huggingface_hub/file_download.py +++ b/src/huggingface_hub/file_download.py @@ -79,13 +79,23 @@ from .utils._runtime import _PY_VERSION # noqa: F401 # for backward compatibility from .utils._typing import HTTP_METHOD_T from .utils.insecure_hashlib import sha256 +from .utils.sha import sha_fileobj logger = logging.get_logger(__name__) +# Return value when trying to load a file from cache but the file does not exist in the distant repo. +_CACHED_NO_EXIST = object() +_CACHED_NO_EXIST_T = Any + # Regex to get filename from a "Content-Disposition" header for CDN-served files HEADER_FILENAME_PATTERN = re.compile(r'filename="(?P.*?)";') +# Regex to check if the revision IS directly a commit_hash +REGEX_COMMIT_HASH = re.compile(r"^[0-9a-f]{40}$") + +# Regex to check if the file etag IS a valid sha256 +REGEX_SHA256 = re.compile(r"^[0-9a-f]{64}$") _are_symlinks_supported_in_dir: Dict[str, bool] = {} @@ -149,12 +159,6 @@ def are_symlinks_supported(cache_dir: Union[str, Path, None] = None) -> bool: return _are_symlinks_supported_in_dir[cache_dir] -# Return value when trying to load a file from cache but the file does not exist in the distant repo. -_CACHED_NO_EXIST = object() -_CACHED_NO_EXIST_T = Any -REGEX_COMMIT_HASH = re.compile(r"^[0-9a-f]{40}$") - - @dataclass(frozen=True) class HfFileMetadata: """Data structure containing information about a file versioned on the Hub. @@ -1456,14 +1460,25 @@ def _hf_hub_download_to_local_dir( assert url_to_download is not None, "file location must have been retrieved from server" assert expected_size is not None, "expected_size must have been retrieved from server" - # Local file exists + etag matches => update metadata and return file - if paths.file_path.is_file() and local_metadata is not None and local_metadata.etag == etag: - write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag) - return str(paths.file_path) - - # Local file doesn't exist or etag doesn't match => retrieve file from remote (or cache) + # Local file exists => check if it's up-to-date + if paths.file_path.is_file(): + # etag matches => update metadata and return file + if local_metadata is not None and local_metadata.etag == etag: + write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag) + return str(paths.file_path) - # TODO: if file exists + etag is a sha256 => compute local hash and compare? + # metadata is outdated + etag is a sha256 + # => means it's an LFS file (large) + # => let's compute local hash and compare + # => if match, update metadata and return file + if local_metadata is None and REGEX_SHA256.match(etag) is not None: + with open(paths.file_path, "rb") as f: + file_hash = sha_fileobj(f).hex() + ")" + if file_hash == etag: + write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag) + return str(paths.file_path) + + # Local file doesn't exist or etag isn't a match => retrieve file from remote (or cache) # If we are lucky enough, the file is already in the cache => copy it cached_path = try_to_load_from_cache( From 283977db2dd76cea8890231b93cf7e21d434f80f Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Wed, 24 Apr 2024 14:18:02 +0200 Subject: [PATCH 07/40] fix hash --- src/huggingface_hub/file_download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/huggingface_hub/file_download.py b/src/huggingface_hub/file_download.py index 48c452f762..be166c90cb 100644 --- a/src/huggingface_hub/file_download.py +++ b/src/huggingface_hub/file_download.py @@ -1473,7 +1473,7 @@ def _hf_hub_download_to_local_dir( # => if match, update metadata and return file if local_metadata is None and REGEX_SHA256.match(etag) is not None: with open(paths.file_path, "rb") as f: - file_hash = sha_fileobj(f).hex() + ")" + file_hash = sha_fileobj(f).hex() if file_hash == etag: write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag) return str(paths.file_path) From e909022dd30ae4344a80c192de9adfb1fc67d086 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Wed, 24 Apr 2024 15:35:03 +0200 Subject: [PATCH 08/40] add tests + fix some stuff --- src/huggingface_hub/file_download.py | 40 ++- tests/test_file_download.py | 496 +++++++++++++++++++-------- 2 files changed, 376 insertions(+), 160 deletions(-) diff --git a/src/huggingface_hub/file_download.py b/src/huggingface_hub/file_download.py index be166c90cb..4d81c216b8 100644 --- a/src/huggingface_hub/file_download.py +++ b/src/huggingface_hub/file_download.py @@ -1427,9 +1427,14 @@ def _hf_hub_download_to_local_dir( local_metadata = read_download_metadata(local_dir=local_dir, filename=filename) # Local file exists + metadata exists + commit_hash matches => return file - if REGEX_COMMIT_HASH.match(revision): - if paths.file_path.is_file() and local_metadata is not None and local_metadata.commit_hash == revision: - return str(paths.file_path) + if ( + not force_download + and REGEX_COMMIT_HASH.match(revision) + and paths.file_path.is_file() + and local_metadata is not None + and local_metadata.commit_hash == revision + ): + return str(paths.file_path) # Local file doesn't exist or commit_hash doesn't match => we need the etag (url_to_download, etag, commit_hash, expected_size, head_call_error) = _get_metadata_or_catch_error( @@ -1461,7 +1466,7 @@ def _hf_hub_download_to_local_dir( assert expected_size is not None, "expected_size must have been retrieved from server" # Local file exists => check if it's up-to-date - if paths.file_path.is_file(): + if not force_download and paths.file_path.is_file(): # etag matches => update metadata and return file if local_metadata is not None and local_metadata.etag == etag: write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag) @@ -1481,19 +1486,20 @@ def _hf_hub_download_to_local_dir( # Local file doesn't exist or etag isn't a match => retrieve file from remote (or cache) # If we are lucky enough, the file is already in the cache => copy it - cached_path = try_to_load_from_cache( - repo_id=repo_id, - filename=filename, - cache_dir=cache_dir, - revision=commit_hash, - repo_type=repo_type, - ) - if isinstance(cached_path, str): - with WeakFileLock(paths.lock_path): - paths.file_path.parent.mkdir(parents=True, exist_ok=True) - shutil.copyfile(cached_path, paths.file_path) - write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag) - return str(paths.file_path) + if not force_download: + cached_path = try_to_load_from_cache( + repo_id=repo_id, + filename=filename, + cache_dir=cache_dir, + revision=commit_hash, + repo_type=repo_type, + ) + if isinstance(cached_path, str): + with WeakFileLock(paths.lock_path): + paths.file_path.parent.mkdir(parents=True, exist_ok=True) + shutil.copyfile(cached_path, paths.file_path) + write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag) + return str(paths.file_path) # Otherwise, let's download the file! with WeakFileLock(paths.lock_path): diff --git a/tests/test_file_download.py b/tests/test_file_download.py index 9cfbd22f0f..2683946c96 100644 --- a/tests/test_file_download.py +++ b/tests/test_file_download.py @@ -18,6 +18,7 @@ import stat import unittest import warnings +from contextlib import contextmanager from pathlib import Path from typing import Iterable from unittest.mock import Mock, patch @@ -28,6 +29,7 @@ import huggingface_hub.file_download from huggingface_hub import HfApi, RepoUrl +from huggingface_hub._local_folder import write_download_metadata from huggingface_hub.constants import ( CONFIG_NAME, HUGGINGFACE_HEADER_X_LINKED_ETAG, @@ -42,7 +44,6 @@ _get_pointer_path, _normalize_etag, _request_wrapper, - _to_local_dir, cached_download, filename_to_url, get_hf_file_metadata, @@ -746,162 +747,379 @@ def test_keep_lock_file(self): self.assertTrue(lock_file_exist, "no lock file can be found") -@with_production_testing +# @with_production_testing +# @pytest.mark.usefixtures("fx_cache_dir") +# class HfHubDownloadToLocalDir(unittest.TestCase): +# cache_dir: Path + +# def test_with_local_dir_and_symlinks_and_file_cached(self) -> None: +# # File already cached +# hf_hub_download(DUMMY_MODEL_ID, filename=CONFIG_NAME, cache_dir=self.cache_dir) + +# # Download to local dir +# with SoftTemporaryDirectory() as local_dir: +# returned_path = hf_hub_download( +# DUMMY_MODEL_ID, +# filename=CONFIG_NAME, +# cache_dir=self.cache_dir, +# local_dir=local_dir, +# local_dir_use_symlinks=True, +# ) +# config_file = Path(local_dir) / CONFIG_NAME +# self.assertEqual(returned_path, str(config_file)) +# self.assertTrue(config_file.is_file()) +# self.assertTrue( # File is symlink (except in Windows CI) +# config_file.is_symlink() if os.name != "nt" else not config_file.is_symlink() +# ) + +# def test_with_local_dir_and_symlinks_and_file_not_cached(self) -> None: +# # Download to local dir +# with SoftTemporaryDirectory() as local_dir: +# returned_path = hf_hub_download( +# DUMMY_MODEL_ID, +# filename=CONFIG_NAME, +# cache_dir=self.cache_dir, +# local_dir=local_dir, +# local_dir_use_symlinks=True, +# ) +# config_file = Path(local_dir) / CONFIG_NAME +# self.assertEqual(returned_path, str(config_file)) +# self.assertTrue(config_file.is_file()) +# if os.name != "nt": # File is symlink (except in Windows CI) +# self.assertTrue(config_file.is_symlink()) +# blob_path = config_file.resolve() +# self.assertTrue(self.cache_dir in blob_path.parents) # blob is cached! +# else: +# self.assertFalse(config_file.is_symlink()) + +# def test_with_local_dir_and_no_symlink_and_file_cached(self) -> None: +# # File already cached +# hf_hub_download(DUMMY_MODEL_ID, filename=CONFIG_NAME, cache_dir=self.cache_dir) + +# # Download to local dir +# with SoftTemporaryDirectory() as local_dir: +# with patch.object( +# huggingface_hub.file_download, "http_get", wraps=huggingface_hub.file_download.http_get +# ) as mock: +# returned_path = hf_hub_download( +# DUMMY_MODEL_ID, +# filename=CONFIG_NAME, +# cache_dir=self.cache_dir, +# local_dir=local_dir, +# local_dir_use_symlinks=False, # no symlinks +# ) +# mock.assert_not_called() # reused file from cache + +# config_file = Path(local_dir) / CONFIG_NAME +# self.assertEqual(returned_path, str(config_file)) +# self.assertTrue(config_file.is_file()) +# self.assertFalse(config_file.is_symlink()) + +# def test_with_local_dir_and_no_symlink_and_file_not_cached(self) -> None: +# # Download to local dir +# with SoftTemporaryDirectory() as local_dir: +# with patch.object( +# huggingface_hub.file_download, "http_get", wraps=huggingface_hub.file_download.http_get +# ) as mock: +# returned_path = hf_hub_download( +# DUMMY_MODEL_ID, +# filename=CONFIG_NAME, +# cache_dir=self.cache_dir, +# local_dir=local_dir, +# local_dir_use_symlinks=False, # no symlinks +# ) +# mock.assert_called() # no file cached => had to download it + +# config_file = Path(local_dir) / CONFIG_NAME +# self.assertEqual(returned_path, str(config_file)) +# self.assertTrue(config_file.is_file()) +# self.assertFalse(config_file.is_symlink()) + +# # Cache directory not used (no blobs, no symlinks in it) +# for path in self.cache_dir.glob("**/blobs/**"): +# self.assertFalse(path.is_file()) +# for path in self.cache_dir.glob("**/snapshots/**"): +# self.assertFalse(path.is_file()) + +# @patch("huggingface_hub.constants.HF_HUB_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD", 1024) +# def test_with_local_dir_and_auto_symlinks_and_file_cached(self) -> None: +# # File already cached +# hf_hub_download(DUMMY_MODEL_ID, filename=CONFIG_NAME, cache_dir=self.cache_dir) # 496 bytes -> small +# hf_hub_download(DUMMY_MODEL_ID, filename="README.md", cache_dir=self.cache_dir) # 1.11kB -> "big" + +# # Download to local dir +# with SoftTemporaryDirectory() as local_dir: +# config = hf_hub_download( +# DUMMY_MODEL_ID, filename=CONFIG_NAME, cache_dir=self.cache_dir, local_dir=local_dir +# ) +# readme = hf_hub_download( +# DUMMY_MODEL_ID, filename="README.md", cache_dir=self.cache_dir, local_dir=local_dir +# ) +# self.assertFalse(Path(config).is_symlink()) # 496b => small => duplicated +# if os.name != "nt": +# self.assertTrue(Path(readme).is_symlink()) # 1.11kB => big => symlink + +# @patch("huggingface_hub.constants.HF_HUB_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD", 1024) +# def test_with_local_dir_and_auto_symlinks_and_file_not_cached(self) -> None: +# # Download to local dir +# with SoftTemporaryDirectory() as local_dir: +# config = hf_hub_download( +# DUMMY_MODEL_ID, filename=CONFIG_NAME, cache_dir=self.cache_dir, local_dir=local_dir +# ) +# readme = hf_hub_download( +# DUMMY_MODEL_ID, filename="README.md", cache_dir=self.cache_dir, local_dir=local_dir +# ) +# self.assertFalse(Path(config).is_symlink()) # 496b => small => duplicated +# if os.name != "nt": +# self.assertTrue(Path(readme).is_symlink()) # 1.11kB => big => symlink + +# def test_with_local_dir_and_symlinks_and_overwrite(self) -> None: +# # Download to local dir +# with SoftTemporaryDirectory() as local_dir: +# config_path = Path(local_dir) / CONFIG_NAME +# config_path.write_text("this will be overwritten") +# hf_hub_download( +# DUMMY_MODEL_ID, +# filename=CONFIG_NAME, +# cache_dir=self.cache_dir, +# local_dir=local_dir, +# local_dir_use_symlinks=True, +# ) +# if os.name != "nt": +# self.assertTrue(config_path.is_symlink()) +# self.assertNotEqual(config_path.read_text(), "this will be overwritten") + +# def test_with_local_dir_and_no_symlinks_and_overwrite(self) -> None: +# # Download to local dir +# with SoftTemporaryDirectory() as local_dir: +# config_path = Path(local_dir) / CONFIG_NAME +# config_path.write_text("this will be overwritten") +# hf_hub_download( +# DUMMY_MODEL_ID, +# filename=CONFIG_NAME, +# cache_dir=self.cache_dir, +# local_dir=local_dir, +# local_dir_use_symlinks=False, +# ) +# self.assertFalse(config_path.is_symlink()) +# self.assertNotEqual(config_path.read_text(), "this will be overwritten") + + @pytest.mark.usefixtures("fx_cache_dir") class HfHubDownloadToLocalDir(unittest.TestCase): + # `cache_dir` is a temporary directory + # `local_dir` is a subdirectory in which files will be downloaded + # `hub_cache_dir` is a subdirectory in which files will be cached ("HF cache") cache_dir: Path + file_name: str = "file.txt" + lfs_name: str = "lfs.bin" - def test_with_local_dir_and_symlinks_and_file_cached(self) -> None: - # File already cached - hf_hub_download(DUMMY_MODEL_ID, filename=CONFIG_NAME, cache_dir=self.cache_dir) + @property + def local_dir(self) -> Path: + path = Path(self.cache_dir) / "local" + path.mkdir(exist_ok=True, parents=True) + return path - # Download to local dir - with SoftTemporaryDirectory() as local_dir: - returned_path = hf_hub_download( - DUMMY_MODEL_ID, - filename=CONFIG_NAME, - cache_dir=self.cache_dir, - local_dir=local_dir, - local_dir_use_symlinks=True, - ) - config_file = Path(local_dir) / CONFIG_NAME - self.assertEqual(returned_path, str(config_file)) - self.assertTrue(config_file.is_file()) - self.assertTrue( # File is symlink (except in Windows CI) - config_file.is_symlink() if os.name != "nt" else not config_file.is_symlink() - ) + @property + def hub_cache_dir(self) -> Path: + path = Path(self.cache_dir) / "cache" + path.mkdir(exist_ok=True, parents=True) + return path - def test_with_local_dir_and_symlinks_and_file_not_cached(self) -> None: - # Download to local dir - with SoftTemporaryDirectory() as local_dir: - returned_path = hf_hub_download( - DUMMY_MODEL_ID, - filename=CONFIG_NAME, - cache_dir=self.cache_dir, - local_dir=local_dir, - local_dir_use_symlinks=True, - ) - config_file = Path(local_dir) / CONFIG_NAME - self.assertEqual(returned_path, str(config_file)) - self.assertTrue(config_file.is_file()) - if os.name != "nt": # File is symlink (except in Windows CI) - self.assertTrue(config_file.is_symlink()) - blob_path = config_file.resolve() - self.assertTrue(self.cache_dir in blob_path.parents) # blob is cached! - else: - self.assertFalse(config_file.is_symlink()) - - def test_with_local_dir_and_no_symlink_and_file_cached(self) -> None: - # File already cached - hf_hub_download(DUMMY_MODEL_ID, filename=CONFIG_NAME, cache_dir=self.cache_dir) + @property + def file_path(self) -> Path: + return self.local_dir / self.file_name - # Download to local dir - with SoftTemporaryDirectory() as local_dir: - with patch.object( - huggingface_hub.file_download, "http_get", wraps=huggingface_hub.file_download.http_get - ) as mock: - returned_path = hf_hub_download( - DUMMY_MODEL_ID, - filename=CONFIG_NAME, - cache_dir=self.cache_dir, - local_dir=local_dir, - local_dir_use_symlinks=False, # no symlinks - ) - mock.assert_not_called() # reused file from cache + @property + def lfs_path(self) -> Path: + return self.local_dir / self.lfs_name - config_file = Path(local_dir) / CONFIG_NAME - self.assertEqual(returned_path, str(config_file)) - self.assertTrue(config_file.is_file()) - self.assertFalse(config_file.is_symlink()) + @classmethod + def setUpClass(cls): + cls.api = HfApi(endpoint=ENDPOINT_STAGING, token=TOKEN) + cls.repo_id = cls.api.create_repo(repo_id=repo_name()).repo_id + commit_1 = cls.api.upload_file(path_or_fileobj=b"content", path_in_repo=cls.file_name, repo_id=cls.repo_id) + commit_2 = cls.api.upload_file(path_or_fileobj=b"content", path_in_repo=cls.lfs_name, repo_id=cls.repo_id) - def test_with_local_dir_and_no_symlink_and_file_not_cached(self) -> None: - # Download to local dir - with SoftTemporaryDirectory() as local_dir: - with patch.object( - huggingface_hub.file_download, "http_get", wraps=huggingface_hub.file_download.http_get - ) as mock: - returned_path = hf_hub_download( - DUMMY_MODEL_ID, - filename=CONFIG_NAME, - cache_dir=self.cache_dir, - local_dir=local_dir, - local_dir_use_symlinks=False, # no symlinks - ) - mock.assert_called() # no file cached => had to download it + info = cls.api.get_paths_info(repo_id=cls.repo_id, paths=[cls.file_name, cls.lfs_name]) + info = {item.path: item for item in info} + cls.commit_hash_1 = commit_1.oid + cls.commit_hash_2 = commit_2.oid + cls.file_etag = info[cls.file_name].blob_id + cls.lfs_etag = info[cls.lfs_name].lfs.sha256 - config_file = Path(local_dir) / CONFIG_NAME - self.assertEqual(returned_path, str(config_file)) - self.assertTrue(config_file.is_file()) - self.assertFalse(config_file.is_symlink()) + @classmethod + def tearDownClass(cls) -> None: + cls.api.delete_repo(repo_id=cls.repo_id) - # Cache directory not used (no blobs, no symlinks in it) - for path in self.cache_dir.glob("**/blobs/**"): - self.assertFalse(path.is_file()) - for path in self.cache_dir.glob("**/snapshots/**"): - self.assertFalse(path.is_file()) + @contextmanager + def with_patch_head(self): + with patch("huggingface_hub.file_download._get_metadata_or_catch_error") as mock: + yield mock - @patch("huggingface_hub.constants.HF_HUB_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD", 1024) - def test_with_local_dir_and_auto_symlinks_and_file_cached(self) -> None: - # File already cached - hf_hub_download(DUMMY_MODEL_ID, filename=CONFIG_NAME, cache_dir=self.cache_dir) # 496 bytes -> small - hf_hub_download(DUMMY_MODEL_ID, filename="README.md", cache_dir=self.cache_dir) # 1.11kB -> "big" + @contextmanager + def with_patch_download(self): + with patch("huggingface_hub.file_download._download_to_tmp_and_move") as mock: + yield mock + def test_empty_local_dir(self): # Download to local dir - with SoftTemporaryDirectory() as local_dir: - config = hf_hub_download( - DUMMY_MODEL_ID, filename=CONFIG_NAME, cache_dir=self.cache_dir, local_dir=local_dir + returned_path = self.api.hf_hub_download( + self.repo_id, filename=self.file_name, cache_dir=self.hub_cache_dir, local_dir=self.local_dir + ) + assert self.local_dir in Path(returned_path).parents + + # Cache directory not used (no blobs, no symlinks in it) + for path in self.hub_cache_dir.glob("**/blobs/**"): + assert not path.is_file() + for path in self.hub_cache_dir.glob("**/snapshots/**"): + assert not path.is_file() + + def test_metadata_ok_and_revision_is_a_commit_hash_and_match(self): + # File already exists + commit_hash matches (and etag not even required) + self.file_path.write_text("content") + write_download_metadata(self.local_dir, self.file_name, self.commit_hash_1, etag="...") + + # Download to local dir => no HEAD call needed + with self.with_patch_head() as mock: + self.api.hf_hub_download( + self.repo_id, filename=self.file_name, revision=self.commit_hash_1, local_dir=self.local_dir ) - readme = hf_hub_download( - DUMMY_MODEL_ID, filename="README.md", cache_dir=self.cache_dir, local_dir=local_dir + mock.assert_not_called() + + def test_metadata_ok_and_revision_is_a_commit_hash_and_mismatch(self): + # 1 HEAD call + 1 download + # File already exists + commit_hash mismatch + self.file_path.write_text("content") + write_download_metadata(self.local_dir, self.file_name, self.commit_hash_1, etag="...") + + # Mismatch => download + with self.with_patch_download() as mock: + self.api.hf_hub_download( + self.repo_id, filename=self.file_name, revision=self.commit_hash_2, local_dir=self.local_dir ) - self.assertFalse(Path(config).is_symlink()) # 496b => small => duplicated - if os.name != "nt": - self.assertTrue(Path(readme).is_symlink()) # 1.11kB => big => symlink + mock.assert_called_once() - @patch("huggingface_hub.constants.HF_HUB_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD", 1024) - def test_with_local_dir_and_auto_symlinks_and_file_not_cached(self) -> None: - # Download to local dir - with SoftTemporaryDirectory() as local_dir: - config = hf_hub_download( - DUMMY_MODEL_ID, filename=CONFIG_NAME, cache_dir=self.cache_dir, local_dir=local_dir - ) - readme = hf_hub_download( - DUMMY_MODEL_ID, filename="README.md", cache_dir=self.cache_dir, local_dir=local_dir + def test_metadata_not_ok_and_revision_is_a_commit_hash(self): + # 1 HEAD call + 1 download + # File already exists but no metadata + self.file_path.write_text("content") + + # Mismatch => download + with self.with_patch_download() as mock: + self.api.hf_hub_download( + self.repo_id, filename=self.file_name, revision=self.commit_hash_1, local_dir=self.local_dir ) - self.assertFalse(Path(config).is_symlink()) # 496b => small => duplicated - if os.name != "nt": - self.assertTrue(Path(readme).is_symlink()) # 1.11kB => big => symlink + mock.assert_called_once() - def test_with_local_dir_and_symlinks_and_overwrite(self) -> None: - # Download to local dir - with SoftTemporaryDirectory() as local_dir: - config_path = Path(local_dir) / CONFIG_NAME - config_path.write_text("this will be overwritten") - hf_hub_download( - DUMMY_MODEL_ID, - filename=CONFIG_NAME, - cache_dir=self.cache_dir, - local_dir=local_dir, - local_dir_use_symlinks=True, + def test_local_files_only_and_file_exists(self): + # must return without error + self.file_path.write_text("content2") + + path = self.api.hf_hub_download( + self.repo_id, filename=self.file_name, local_dir=self.local_dir, local_files_only=True + ) + assert Path(path) == self.file_path + assert self.file_path.read_text() == "content2" # not overwritten even if wrong content + + def test_local_files_only_and_file_missing(self): + # must raise + with self.assertRaises(LocalEntryNotFoundError): + self.api.hf_hub_download( + self.repo_id, filename=self.file_name, local_dir=self.local_dir, local_files_only=True ) - if os.name != "nt": - self.assertTrue(config_path.is_symlink()) - self.assertNotEqual(config_path.read_text(), "this will be overwritten") - def test_with_local_dir_and_no_symlinks_and_overwrite(self) -> None: - # Download to local dir - with SoftTemporaryDirectory() as local_dir: - config_path = Path(local_dir) / CONFIG_NAME - config_path.write_text("this will be overwritten") - hf_hub_download( - DUMMY_MODEL_ID, - filename=CONFIG_NAME, - cache_dir=self.cache_dir, - local_dir=local_dir, - local_dir_use_symlinks=False, + def test_metadata_ok_and_etag_match(self): + # 1 HEAD call + return early + self.file_path.write_text("something") + write_download_metadata(self.local_dir, self.file_name, self.commit_hash_1, etag=self.file_etag) + + with self.with_patch_download() as mock: + # Download from main => commit_hash mismatch but etag match => return early + self.api.hf_hub_download(self.repo_id, filename=self.file_name, local_dir=self.local_dir) + mock.assert_not_called() + + def test_metadata_ok_and_etag_mismatch(self): + # 1 HEAD call + 1 download + self.file_path.write_text("something") + write_download_metadata(self.local_dir, self.file_name, self.commit_hash_1, etag="some_other_etag") + + with self.with_patch_download() as mock: + # Download from main => commit_hash mismatch but etag match => return early + self.api.hf_hub_download(self.repo_id, filename=self.file_name, local_dir=self.local_dir) + mock.assert_called_once() + + def test_metadata_ok_and_etag_match_and_force_download(self): + # force_download=True takes precedence on any other rule + self.file_path.write_text("something") + write_download_metadata(self.local_dir, self.file_name, self.commit_hash_1, etag=self.file_etag) + + with self.with_patch_download() as mock: + self.api.hf_hub_download( + self.repo_id, filename=self.file_name, local_dir=self.local_dir, force_download=True + ) + mock.assert_called_once() + + def test_metadata_not_ok_and_lfs_file_and_sha256_match(self): + # 1 HEAD call + 1 hash compute + return early + self.lfs_path.write_text("content") + + with self.with_patch_download() as mock: + # Download from main + # => no metadata but it's an LFS file + # => compute local hash => matches => return early + self.api.hf_hub_download(self.repo_id, filename=self.lfs_name, local_dir=self.local_dir) + mock.assert_not_called() + + def test_metadata_not_ok_and_lfs_file_and_sha256_mismatch(self): + # 1 HEAD call + 1 file hash + 1 download + self.lfs_path.write_text("wrong_content") + + # Download from main + # => no metadata but it's an LFS file + # => compute local hash => mismatches => download + path = self.api.hf_hub_download(self.repo_id, filename=self.lfs_name, local_dir=self.local_dir) + + # existing file overwritten + assert Path(path).read_text() == "content" + + def test_file_exists_in_cache(self): + # 1 HEAD call + return early + self.api.hf_hub_download(self.repo_id, filename=self.file_name, cache_dir=self.hub_cache_dir) + + with self.with_patch_download() as mock: + # Download to local dir + # => file is already in Hub cache + # => we assume it's faster to make a local copy rather than re-downloading + # => duplicate file locally + path = self.api.hf_hub_download( + self.repo_id, filename=self.file_name, cache_dir=self.hub_cache_dir, local_dir=self.local_dir ) - self.assertFalse(config_path.is_symlink()) - self.assertNotEqual(config_path.read_text(), "this will be overwritten") + mock.assert_not_called() + + assert Path(path) == self.file_path + + def test_file_exists_and_overwrites(self): + # 1 HEAD call + 1 download + self.file_path.write_text("another content") + self.api.hf_hub_download(self.repo_id, filename=self.file_name, local_dir=self.local_dir) + assert self.file_path.read_text() == "content" + + def test_resume_from_incomplete(self): + # An incomplete file already exists => use it + incomplete_path = self.local_dir / ".huggingface" / "download" / (self.file_name + ".incomplete") + incomplete_path.parent.mkdir(parents=True, exist_ok=True) + incomplete_path.write_text("XXXX") # Here we put fake data to test the resume + self.api.hf_hub_download(self.repo_id, filename=self.file_name, local_dir=self.local_dir) + self.file_path.read_text() == "XXXXent" + + def test_do_not_resume_on_force_download(self): + # An incomplete file already exists but force_download=True + incomplete_path = self.local_dir / ".huggingface" / "download" / (self.file_name + ".incomplete") + incomplete_path.parent.mkdir(parents=True, exist_ok=True) + incomplete_path.write_text("XXXX") + self.api.hf_hub_download(self.repo_id, filename=self.file_name, local_dir=self.local_dir, force_download=True) + self.file_path.read_text() == "content" @pytest.mark.usefixtures("fx_cache_dir") @@ -1016,14 +1234,6 @@ def test_get_pointer_path_but_invalid_relative_filename(self) -> None: with self.assertRaises(ValueError): _get_pointer_path("path/to/storage", "abcdef", relative_filename) - def test_to_local_dir_but_invalid_relative_filename(self) -> None: - # Cannot happen because of other protections, but just in case. - relative_filename = "folder\\..\\..\\..\\file.txt" if os.name == "nt" else "folder/../../../file.txt" - with self.assertRaises(ValueError): - _to_local_dir( - "path/to/file_to_copy", "path/to/local/dir", relative_filename=relative_filename, use_symlinks=False - ) - class TestHttpGet(unittest.TestCase): def test_http_get_with_ssl_and_timeout_error(self): From 39cfef48d775de3197c0e180c5e708d9e8a8e4bc Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Wed, 24 Apr 2024 15:42:18 +0200 Subject: [PATCH 09/40] fix snapshot download tests --- tests/test_snapshot_download.py | 36 ++++++++++++++++----------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/tests/test_snapshot_download.py b/tests/test_snapshot_download.py index 865c464b59..e3a3698d5b 100644 --- a/tests/test_snapshot_download.py +++ b/tests/test_snapshot_download.py @@ -199,32 +199,32 @@ def test_download_model_with_ignore_pattern(self): def test_download_model_with_ignore_pattern_list(self): self.check_download_model_with_pattern(["*.git*", "*.pt"], allow=False) - @patch("huggingface_hub.constants.HF_HUB_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD", 10) # >10b => "big file" def test_download_to_local_dir(self) -> None: """Download a repository to local dir. - Cache dir is used and symlinks are adding to local dir. This test is here to check once the normal behavior - with snapshot_download. More combinations of cache_dir/local_dir/use_symlinks are tested separately in - `test_file_download.py`. + Cache dir is not used. + Symlinks are not used. + + This test is here to check once the normal behavior with snapshot_download. + More individual tests exists in `test_file_download.py`. """ with SoftTemporaryDirectory() as cache_dir: with SoftTemporaryDirectory() as local_dir: returned_path = snapshot_download(self.repo_id, cache_dir=cache_dir, local_dir=local_dir) - # Files have been downloaded - self.assertTrue((Path(local_dir) / "dummy_file.txt").is_file()) - self.assertTrue((Path(local_dir) / "dummy_file_2.txt").is_file()) - - # Files are small so duplicated from cache (no symlinks) - self.assertFalse((Path(local_dir) / "dummy_file.txt").is_symlink()) # smaller than 10b => duplicated - self.assertFalse((Path(local_dir) / "dummy_file_2.txt").is_symlink()) # smaller than 10b => duplicated + # Files have been downloaded in correct structure + assert (Path(local_dir) / "dummy_file.txt").is_file() + assert (Path(local_dir) / "dummy_file_2.txt").is_file() + assert (Path(local_dir) / "subpath" / "file.txt").is_file() - # File structure is preserved (+check content) - subpath_file = Path(local_dir) / "subpath" / "file.txt" - self.assertTrue(subpath_file.is_file()) - self.assertEqual(subpath_file.read_text(), "content in subpath") - if os.name != "nt": - self.assertTrue(subpath_file.is_symlink()) # bigger than 10b => symlinked + # Symlinks are not used anymore + assert not (Path(local_dir) / "dummy_file.txt").is_symlink() + assert not (Path(local_dir) / "dummy_file_2.txt").is_symlink() + assert not (Path(local_dir) / "subpath" / "file.txt").is_symlink() # Check returns local dir and not cache dir - self.assertEqual(Path(returned_path).resolve(), Path(local_dir).resolve()) + assert Path(returned_path).resolve() == Path(local_dir).resolve() + + # Nothing has been added to cache dir (except some subfolders created) + for path in cache_dir.glob("*"): + assert path.is_dir() From dbece972b18d163566967f11317eee1e14ee48cc Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Wed, 24 Apr 2024 16:04:41 +0200 Subject: [PATCH 10/40] fix test --- src/huggingface_hub/_commit_api.py | 11 ++++++----- tests/test_commit_api.py | 3 ++- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/huggingface_hub/_commit_api.py b/src/huggingface_hub/_commit_api.py index b25a8e95fd..64f8ca165a 100644 --- a/src/huggingface_hub/_commit_api.py +++ b/src/huggingface_hub/_commit_api.py @@ -254,11 +254,12 @@ def _validate_path_in_repo(path_in_repo: str) -> str: raise ValueError(f"Invalid `path_in_repo` in CommitOperation: '{path_in_repo}'") if path_in_repo.startswith("./"): path_in_repo = path_in_repo[2:] - if any(part == ".git" for part in path_in_repo.split("/")): - raise ValueError( - "Invalid `path_in_repo` in CommitOperation: cannot update files under a '.git/' folder (path:" - f" '{path_in_repo}')." - ) + for forbidden in [".git", ".huggingface"]: + if any(part == forbidden for part in path_in_repo.split("/")): + raise ValueError( + f"Invalid `path_in_repo` in CommitOperation: cannot update files under a '{forbidden}/' folder (path:" + f" '{path_in_repo}')." + ) return path_in_repo diff --git a/tests/test_commit_api.py b/tests/test_commit_api.py index 935641d53f..85e1f719bb 100644 --- a/tests/test_commit_api.py +++ b/tests/test_commit_api.py @@ -57,7 +57,7 @@ def test_path_in_repo_invalid(self) -> None: class TestCommitOperationForbiddenPathInRepo(unittest.TestCase): - """Commit operations must throw an error on files in the .git/ folder. + """Commit operations must throw an error on files in the .git/ or .huggingface/ folders. Server would error anyway so it's best to prevent early. """ @@ -78,6 +78,7 @@ class TestCommitOperationForbiddenPathInRepo(unittest.TestCase): "path/to/.gitignore", "path/to/something.git", "path/to/something.git/more", + "path/to/something.huggingface/more", } def test_cannot_update_file_in_git_folder(self): From 0206964d431eb1c5e686efcb92a5a20bf1c623e3 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Wed, 24 Apr 2024 17:00:30 +0200 Subject: [PATCH 11/40] lots of docs --- docs/source/en/guides/cli.md | 12 ++- docs/source/en/guides/download.md | 55 ++++------ .../environment_variables.md | 5 +- src/huggingface_hub/_snapshot_download.py | 52 +++------ src/huggingface_hub/commands/download.py | 29 ++--- src/huggingface_hub/constants.py | 5 +- src/huggingface_hub/file_download.py | 63 ++++------- src/huggingface_hub/hf_api.py | 101 +++++------------- tests/test_cli.py | 12 --- 9 files changed, 100 insertions(+), 234 deletions(-) diff --git a/docs/source/en/guides/cli.md b/docs/source/en/guides/cli.md index b4b9a52d0e..aa7a4751a3 100644 --- a/docs/source/en/guides/cli.md +++ b/docs/source/en/guides/cli.md @@ -224,18 +224,20 @@ The examples above show how to download from the latest commit on the main branc ### Download to a local folder -The recommended (and default) way to download files from the Hub is to use the cache-system. However, in some cases you want to download files and move them to a specific folder. This is useful to get a workflow closer to what git commands offer. You can do that using the `--local_dir` option. The file is downloaded to a tmp file and then moved to the local dir to avoid having partially downloaded files in the local folder. +The recommended (and default) way to download files from the Hub is to use the cache-system. However, in some cases you want to download files and move them to a specific folder. This is useful to get a workflow closer to what git commands offer. You can do that using the `--local_dir` option. - +Note that a `.huggingface/` folder will be created at the root of your local directory, containing metadata about the downloaded files. This prevents re-downloading files if you re-run the command. While this mechanism is not as robust as the main cache-system, it's optimized for regularly pulling the latest version of a repository. -Downloading to a local directory comes with some downsides. Please check out the limitations in the [Download](./download#download-files-to-local-folder) guide before using `--local-dir`. + + +For more details on how downloading to a local file works, check out the [download](./download.md#download-files-to-a-local-folder) guide. ```bash ->>> huggingface-cli download adept/fuyu-8b model-00001-of-00002.safetensors --local-dir . +>>> huggingface-cli download adept/fuyu-8b model-00001-of-00002.safetensors --local-dir fuyu ... -./model-00001-of-00002.safetensors +fuyu/model-00001-of-00002.safetensors ``` ### Specify cache directory diff --git a/docs/source/en/guides/download.md b/docs/source/en/guides/download.md index db6eb3b16c..1bf4ff0635 100644 --- a/docs/source/en/guides/download.md +++ b/docs/source/en/guides/download.md @@ -126,42 +126,25 @@ files except `vocab.json`. >>> snapshot_download(repo_id="gpt2", allow_patterns=["*.md", "*.json"], ignore_patterns="vocab.json") ``` -## Download file(s) to local folder - -The recommended (and default) way to download files from the Hub is to use the [cache-system](./manage-cache). -You can define your cache location by setting `cache_dir` parameter (both in [`hf_hub_download`] and [`snapshot_download`]). - -However, in some cases you want to download files and move them to a specific folder. This is useful to get a workflow -closer to what `git` commands offer. You can do that using the `local_dir` and `local_dir_use_symlinks` parameters: -- `local_dir` must be a path to a folder on your system. The downloaded files will keep the same file structure as in the -repo. For example if `filename="data/train.csv"` and `local_dir="path/to/folder"`, then the returned filepath will be -`"path/to/folder/data/train.csv"`. -- `local_dir_use_symlinks` defines how the file must be saved in your local folder. - - The default behavior (`"auto"`) is to duplicate small files (<5MB) and use symlinks for bigger files. Symlinks allow - to optimize both bandwidth and disk usage. However manually editing a symlinked file might corrupt the cache, hence - the duplication for small files. The 5MB threshold can be configured with the `HF_HUB_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD` - environment variable. - - If `local_dir_use_symlinks=True` is set, all files are symlinked for an optimal disk space optimization. This is - for example useful when downloading a huge dataset with thousands of small files. - - Finally, if you don't want symlinks at all you can disable them (`local_dir_use_symlinks=False`). The cache directory - will still be used to check whether the file is already cached or not. If already cached, the file is **duplicated** - from the cache (i.e. saves bandwidth but increases disk usage). If the file is not already cached, it will be - downloaded and moved directly to the local dir. This means that if you need to reuse it somewhere else later, it - will be **re-downloaded**. - -Here is a table that summarizes the different options to help you choose the parameters that best suit your use case. - - -| Parameters | File already cached | Returned path | Can read path? | Can save to path? | Optimized bandwidth | Optimized disk usage | -|---|:---:|:---:|:---:|:---:|:---:|:---:| -| `local_dir=None` | | symlink in cache | ✅ | ❌
_(save would corrupt the cache)_ | ✅ | ✅ | -| `local_dir="path/to/folder"`
`local_dir_use_symlinks="auto"` | | file or symlink in folder | ✅ | ✅ _(for small files)_
⚠️ _(for big files do not resolve path before saving)_ | ✅ | ✅ | -| `local_dir="path/to/folder"`
`local_dir_use_symlinks=True` | | symlink in folder | ✅ | ⚠️
_(do not resolve path before saving)_ | ✅ | ✅ | -| `local_dir="path/to/folder"`
`local_dir_use_symlinks=False` | No | file in folder | ✅ | ✅ | ❌
_(if re-run, file is re-downloaded)_ | ⚠️
(multiple copies if ran in multiple folders) | -| `local_dir="path/to/folder"`
`local_dir_use_symlinks=False` | Yes | file in folder | ✅ | ✅ | ⚠️
_(file has to be cached first)_ | ❌
_(file is duplicated)_ | - -**Note:** if you are on a Windows machine, you need to enable developer mode or run `huggingface_hub` as admin to enable -symlinks. Check out the [cache limitations](../guides/manage-cache#limitations) section for more details. +## Download file(s) to a local folder + +By default, we recommend using the [cache system](./manage-cache) to download files from the Hub. You can specify a custom cache location using the `cache_dir` parameter in [`hf_hub_download`] and [`snapshot_download`], or by setting the [`HF_HOME`](../package_reference/environment_variables#hf_home) environment variable. + +However, if you need to download files to a specific folder, you can pass a `local_dir` parameter to the download function. This is useful to get a workflow closer to what `git` commands offer. The downloaded files will maintain their original file structure within the specified folder. For example, if `filename="data/train.csv"` and `local_dir="path/to/folder"`, the resulting filepath will be `"path/to/folder/data/train.csv"`. + +Note that a `.huggingface/` folder will be created at the root of your local directory, containing metadata about the downloaded files. This prevents re-downloading files if you re-run your script. While this mechanism is not as robust as the main cache-system, it's optimized for regularly pulling the latest version of a repository. + + + +After completing the download, you can safely remove the `.huggingface/` folder if you no longer need it. However, be aware that re-running your script without this folder may result in longer recovery times, as metadata will be lost. Rest assured that your local data will remain intact and unaffected. + + + + + +Don't worry about the `.huggingface/` folder when committing changes to the Hub! This folder is automatically ignored by both `git` and [`upload_folder`]. + + ## Download from the CLI diff --git a/docs/source/en/package_reference/environment_variables.md b/docs/source/en/package_reference/environment_variables.md index f01cc48f8b..e2752d8ea4 100644 --- a/docs/source/en/package_reference/environment_variables.md +++ b/docs/source/en/package_reference/environment_variables.md @@ -67,10 +67,7 @@ For more details, see [logging reference](../package_reference/utilities#hugging ### HF_HUB_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD -Integer value to define under which size a file is considered as "small". When downloading files to a local directory, -small files will be duplicated to ease user experience while bigger files are symlinked to save disk usage. - -For more details, see the [download guide](../guides/download#download-files-to-local-folder). +This environment variable has been deprecated and is now ignored by `huggingface_hub`. Downloading files to the local dir do not rely on symlinks anymore. ### HF_HUB_ETAG_TIMEOUT diff --git a/src/huggingface_hub/_snapshot_download.py b/src/huggingface_hub/_snapshot_download.py index f50fb13960..739ffa9ef8 100644 --- a/src/huggingface_hub/_snapshot_download.py +++ b/src/huggingface_hub/_snapshot_download.py @@ -39,13 +39,11 @@ def snapshot_download( revision: Optional[str] = None, cache_dir: Union[str, Path, None] = None, local_dir: Union[str, Path, None] = None, - local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto", library_name: Optional[str] = None, library_version: Optional[str] = None, user_agent: Optional[Union[Dict, str]] = None, proxies: Optional[Dict] = None, etag_timeout: float = DEFAULT_ETAG_TIMEOUT, - resume_download: bool = False, force_download: bool = False, token: Optional[Union[bool, str]] = None, local_files_only: bool = False, @@ -55,6 +53,9 @@ def snapshot_download( tqdm_class: Optional[base_tqdm] = None, headers: Optional[Dict[str, str]] = None, endpoint: Optional[str] = None, + # Deprecated args + local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto", + resume_download: bool = False, ) -> str: """Download repo files. @@ -63,20 +64,10 @@ def snapshot_download( to keep their actual filename relative to that folder. You can also filter which files to download using `allow_patterns` and `ignore_patterns`. - If `local_dir` is provided, the file structure from the repo will be replicated in this location. You can configure - how you want to move those files: - - If `local_dir_use_symlinks="auto"` (default), files are downloaded and stored in the cache directory as blob - files. Small files (<5MB) are duplicated in `local_dir` while a symlink is created for bigger files. The goal - is to be able to manually edit and save small files without corrupting the cache while saving disk space for - binary files. The 5MB threshold can be configured with the `HF_HUB_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD` - environment variable. - - If `local_dir_use_symlinks=True`, files are downloaded, stored in the cache directory and symlinked in `local_dir`. - This is optimal in term of disk usage but files must not be manually edited. - - If `local_dir_use_symlinks=False` and the blob files exist in the cache directory, they are duplicated in the - local dir. This means disk usage is not optimized. - - Finally, if `local_dir_use_symlinks=False` and the blob files do not exist in the cache directory, then the - files are downloaded and directly placed under `local_dir`. This means if you need to download them again later, - they will be re-downloaded entirely. + If `local_dir` is provided, the file structure from the repo will be replicated in this location. When using this + option, the `cache_dir` will not be used and a `.huggingface/` folder will be created at the root of `local_dir` + to store some metadata related to the downloaded files.While this mechanism is not as robust as the main + cache-system, it's optimized for regularly pulling the latest version of a repository. An alternative would be to clone the repo but this requires git and git-lfs to be installed and properly configured. It is also not possible to filter which files to download when cloning a repository using git. @@ -93,13 +84,7 @@ def snapshot_download( cache_dir (`str`, `Path`, *optional*): Path to the folder where cached files are stored. local_dir (`str` or `Path`, *optional*): - If provided, the downloaded files will be placed under this directory, either as symlinks (default) or - regular files (see description for more details). - local_dir_use_symlinks (`"auto"` or `bool`, defaults to `"auto"`): - To be used with `local_dir`. If set to "auto", the cache directory will be used and the file will be either - duplicated or symlinked to the local directory depending on its size. It set to `True`, a symlink will be - created, no matter the file size. If set to `False`, the file will either be duplicated from cache (if - already exists) or downloaded from the Hub and not cached. See description for more details. + If provided, the downloaded files will be placed under this directory. library_name (`str`, *optional*): The name of the library to which the object corresponds. library_version (`str`, *optional*): @@ -139,20 +124,15 @@ def snapshot_download( `HF_HUB_DISABLE_PROGRESS_BARS` environment variable. Returns: - Local folder path (string) of repo snapshot - - - - Raises the following errors: - - - [`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError) - if `token=True` and the token cannot be found. - - [`OSError`](https://docs.python.org/3/library/exceptions.html#OSError) if - ETag cannot be determined. - - [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError) - if some parameter value is invalid + `str`: folder path of the repo snapshot. - + Raises: + - [`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError) + if `token=True` and the token cannot be found. + - [`OSError`](https://docs.python.org/3/library/exceptions.html#OSError) if + ETag cannot be determined. + - [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError) + if some parameter value is invalid """ if cache_dir is None: cache_dir = HF_HUB_CACHE diff --git a/src/huggingface_hub/commands/download.py b/src/huggingface_hub/commands/download.py index 1de7738a14..b8f4b2f51e 100644 --- a/src/huggingface_hub/commands/download.py +++ b/src/huggingface_hub/commands/download.py @@ -38,7 +38,7 @@ import warnings from argparse import Namespace, _SubParsersAction -from typing import List, Literal, Optional, Union +from typing import List, Optional from huggingface_hub import logging from huggingface_hub._snapshot_download import snapshot_download @@ -94,13 +94,7 @@ def register_subcommand(parser: _SubParsersAction): download_parser.add_argument( "--local-dir-use-symlinks", choices=["auto", "True", "False"], - default="auto", - help=( - "To be used with `local_dir`. If set to 'auto', the cache directory will be used and the file will be" - " either duplicated or symlinked to the local directory depending on its size. It set to `True`, a" - " symlink will be created, no matter the file size. If set to `False`, the file will either be" - " duplicated from cache (if already exists) or downloaded from the Hub and not cached." - ), + help=("Deprecated and ignored. Downloading to a local directory do not use symlinks anymore."), ) download_parser.add_argument( "--force-download", @@ -134,19 +128,10 @@ def __init__(self, args: Namespace) -> None: self.resume_download: bool = args.resume_download self.quiet: bool = args.quiet - # Raise if local_dir_use_symlinks is invalid - self.local_dir_use_symlinks: Union[Literal["auto"], bool] - use_symlinks_lowercase = args.local_dir_use_symlinks.lower() - if use_symlinks_lowercase == "true": - self.local_dir_use_symlinks = True - elif use_symlinks_lowercase == "false": - self.local_dir_use_symlinks = False - elif use_symlinks_lowercase == "auto": - self.local_dir_use_symlinks = "auto" - else: - raise ValueError( - f"'{args.local_dir_use_symlinks}' is not a valid value for `local_dir_use_symlinks`. It must be either" - " 'auto', 'True' or 'False'." + if args.local_dir_use_symlinks is not None: + warnings.warn( + "Ignoring --local-dir-use-symlinks. Downloading to a local directory do not use symlinks anymore.", + FutureWarning, ) def run(self) -> None: @@ -187,7 +172,6 @@ def _download(self) -> str: force_download=self.force_download, token=self.token, local_dir=self.local_dir, - local_dir_use_symlinks=self.local_dir_use_symlinks, library_name="huggingface-cli", ) @@ -210,6 +194,5 @@ def _download(self) -> str: cache_dir=self.cache_dir, token=self.token, local_dir=self.local_dir, - local_dir_use_symlinks=self.local_dir_use_symlinks, library_name="huggingface-cli", ) diff --git a/src/huggingface_hub/constants.py b/src/huggingface_hub/constants.py index f62d2f311a..064aa60bda 100644 --- a/src/huggingface_hub/constants.py +++ b/src/huggingface_hub/constants.py @@ -163,9 +163,8 @@ def _as_int(value: Optional[str]) -> Optional[int]: HF_HUB_ENABLE_HF_TRANSFER: bool = _is_true(os.environ.get("HF_HUB_ENABLE_HF_TRANSFER")) -# Used if download to `local_dir` and `local_dir_use_symlinks="auto"` -# Files smaller than 5MB are copy-pasted while bigger files are symlinked. The idea is to save disk-usage by symlinking -# huge files (i.e. LFS files most of the time) while allowing small files to be manually edited in local folder. +# UNUSED +# We don't use symlinks in local dir anymore. HF_HUB_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD: int = ( _as_int(os.environ.get("HF_HUB_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD")) or 5 * 1024 * 1024 ) diff --git a/src/huggingface_hub/file_download.py b/src/huggingface_hub/file_download.py index 4d81c216b8..4fc4a68691 100644 --- a/src/huggingface_hub/file_download.py +++ b/src/huggingface_hub/file_download.py @@ -1000,18 +1000,19 @@ def hf_hub_download( library_version: Optional[str] = None, cache_dir: Union[str, Path, None] = None, local_dir: Union[str, Path, None] = None, - local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto", user_agent: Union[Dict, str, None] = None, force_download: bool = False, - force_filename: Optional[str] = None, proxies: Optional[Dict] = None, etag_timeout: float = DEFAULT_ETAG_TIMEOUT, - resume_download: bool = False, token: Union[bool, str, None] = None, local_files_only: bool = False, headers: Optional[Dict[str, str]] = None, - legacy_cache_layout: bool = False, endpoint: Optional[str] = None, + # Deprecated args + legacy_cache_layout: bool = False, + resume_download: bool = False, + force_filename: Optional[str] = None, + local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto", ) -> str: """Download a given file if it's not already present in the local cache. @@ -1025,21 +1026,6 @@ def hf_hub_download( that have been resolved at that particular commit. Each filename is a symlink to the blob at that particular commit. - If `local_dir` is provided, the file structure from the repo will be replicated in this location. You can configure - how you want to move those files: - - If `local_dir_use_symlinks="auto"` (default), files are downloaded and stored in the cache directory as blob - files. Small files (<5MB) are duplicated in `local_dir` while a symlink is created for bigger files. The goal - is to be able to manually edit and save small files without corrupting the cache while saving disk space for - binary files. The 5MB threshold can be configured with the `HF_HUB_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD` - environment variable. - - If `local_dir_use_symlinks=True`, files are downloaded, stored in the cache directory and symlinked in `local_dir`. - This is optimal in term of disk usage but files must not be manually edited. - - If `local_dir_use_symlinks=False` and the blob files exist in the cache directory, they are duplicated in the - local dir. This means disk usage is not optimized. - - Finally, if `local_dir_use_symlinks=False` and the blob files do not exist in the cache directory, then the - files are downloaded and directly placed under `local_dir`. This means if you need to download them again later, - they will be re-downloaded entirely. - ``` [ 96] . └── [ 160] models--julien-c--EsperBERTo-small @@ -1058,6 +1044,11 @@ def hf_hub_download( └── [ 76] pytorch_model.bin -> ../../blobs/403450e234d65943a7dcf7e05a771ce3c92faa84dd07db4ac20f592037a1e4bd ``` + If `local_dir` is provided, the file structure from the repo will be replicated in this location. When using this + option, the `cache_dir` will not be used and a `.huggingface/` folder will be created at the root of `local_dir` + to store some metadata related to the downloaded files. While this mechanism is not as robust as the main + cache-system, it's optimized for regularly pulling the latest version of a repository. + Args: repo_id (`str`): A user or an organization name and a repo name separated by a `/`. @@ -1078,13 +1069,7 @@ def hf_hub_download( cache_dir (`str`, `Path`, *optional*): Path to the folder where cached files are stored. local_dir (`str` or `Path`, *optional*): - If provided, the downloaded file will be placed under this directory, either as a symlink (default) or - a regular file (see description for more details). - local_dir_use_symlinks (`"auto"` or `bool`, defaults to `"auto"`): - To be used with `local_dir`. If set to "auto", the cache directory will be used and the file will be either - duplicated or symlinked to the local directory depending on its size. It set to `True`, a symlink will be - created, no matter the file size. If set to `False`, the file will either be duplicated from cache (if - already exists) or downloaded from the Hub and not cached. See description for more details. + If provided, the downloaded file will be placed under this directory. user_agent (`dict`, `str`, *optional*): The user-agent info in the form of a dictionary or a string. force_download (`bool`, *optional*, defaults to `False`): @@ -1112,30 +1097,24 @@ def hf_hub_download( more powerful. Returns: - Local path (string) of file or if networking is off, last version of - file cached on disk. - - - - Raises the following errors: + `str`: Local path of file or if networking is off, last version of file cached on disk. + Raises: - [`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError) - if `token=True` and the token cannot be found. + if `token=True` and the token cannot be found. - [`OSError`](https://docs.python.org/3/library/exceptions.html#OSError) - if ETag cannot be determined. + if ETag cannot be determined. - [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError) - if some parameter value is invalid + if some parameter value is invalid - [`~utils.RepositoryNotFoundError`] - If the repository to download from cannot be found. This may be because it doesn't exist, - or because it is set to `private` and you do not have access. + If the repository to download from cannot be found. This may be because it doesn't exist, + or because it is set to `private` and you do not have access. - [`~utils.RevisionNotFoundError`] - If the revision to download from cannot be found. + If the revision to download from cannot be found. - [`~utils.EntryNotFoundError`] - If the file to download cannot be found. + If the file to download cannot be found. - [`~utils.LocalEntryNotFoundError`] - If network is disabled or unavailable and file is not found in cache. - - + If network is disabled or unavailable and file is not found in cache. """ if HF_HUB_ETAG_TIMEOUT != DEFAULT_ETAG_TIMEOUT: # Respect environment variable above user value diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py index 69e40140db..b13905e8cd 100644 --- a/src/huggingface_hub/hf_api.py +++ b/src/huggingface_hub/hf_api.py @@ -4912,15 +4912,16 @@ def hf_hub_download( revision: Optional[str] = None, cache_dir: Union[str, Path, None] = None, local_dir: Union[str, Path, None] = None, - local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto", force_download: bool = False, - force_filename: Optional[str] = None, proxies: Optional[Dict] = None, etag_timeout: float = DEFAULT_ETAG_TIMEOUT, - resume_download: bool = False, token: Optional[Union[str, bool]] = None, local_files_only: bool = False, + # Deprecated args + resume_download: bool = False, legacy_cache_layout: bool = False, + force_filename: Optional[str] = None, + local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto", ) -> str: """Download a given file if it's not already present in the local cache. @@ -4934,21 +4935,6 @@ def hf_hub_download( that have been resolved at that particular commit. Each filename is a symlink to the blob at that particular commit. - If `local_dir` is provided, the file structure from the repo will be replicated in this location. You can configure - how you want to move those files: - - If `local_dir_use_symlinks="auto"` (default), files are downloaded and stored in the cache directory as blob - files. Small files (<5MB) are duplicated in `local_dir` while a symlink is created for bigger files. The goal - is to be able to manually edit and save small files without corrupting the cache while saving disk space for - binary files. The 5MB threshold can be configured with the `HF_HUB_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD` - environment variable. - - If `local_dir_use_symlinks=True`, files are downloaded, stored in the cache directory and symlinked in `local_dir`. - This is optimal in term of disk usage but files must not be manually edited. - - If `local_dir_use_symlinks=False` and the blob files exist in the cache directory, they are duplicated in the - local dir. This means disk usage is not optimized. - - Finally, if `local_dir_use_symlinks=False` and the blob files do not exist in the cache directory, then the - files are downloaded and directly placed under `local_dir`. This means if you need to download them again later, - they will be re-downloaded entirely. - ``` [ 96] . └── [ 160] models--julien-c--EsperBERTo-small @@ -4967,6 +4953,11 @@ def hf_hub_download( └── [ 76] pytorch_model.bin -> ../../blobs/403450e234d65943a7dcf7e05a771ce3c92faa84dd07db4ac20f592037a1e4bd ``` + If `local_dir` is provided, the file structure from the repo will be replicated in this location. When using this + option, the `cache_dir` will not be used and a `.huggingface/` folder will be created at the root of `local_dir` + to store some metadata related to the downloaded files. While this mechanism is not as robust as the main + cache-system, it's optimized for regularly pulling the latest version of a repository. + Args: repo_id (`str`): A user or an organization name and a repo name separated by a `/`. @@ -4983,13 +4974,7 @@ def hf_hub_download( cache_dir (`str`, `Path`, *optional*): Path to the folder where cached files are stored. local_dir (`str` or `Path`, *optional*): - If provided, the downloaded file will be placed under this directory, either as a symlink (default) or - a regular file (see description for more details). - local_dir_use_symlinks (`"auto"` or `bool`, defaults to `"auto"`): - To be used with `local_dir`. If set to "auto", the cache directory will be used and the file will be either - duplicated or symlinked to the local directory depending on its size. It set to `True`, a symlink will be - created, no matter the file size. If set to `False`, the file will either be duplicated from cache (if - already exists) or downloaded from the Hub and not cached. See description for more details. + If provided, the downloaded file will be placed under this directory. force_download (`bool`, *optional*, defaults to `False`): Whether the file should be downloaded even if it already exists in the local cache. @@ -5007,19 +4992,11 @@ def hf_hub_download( local_files_only (`bool`, *optional*, defaults to `False`): If `True`, avoid downloading the file and return the path to the local cached file if it exists. - legacy_cache_layout (`bool`, *optional*, defaults to `False`): - If `True`, uses the legacy file cache layout i.e. just call [`hf_hub_url`] - then `cached_download`. This is deprecated as the new cache layout is - more powerful. Returns: - Local path (string) of file or if networking is off, last version of - file cached on disk. - - - - Raises the following errors: + `str`: Local path of file or if networking is off, last version of file cached on disk. + Raises: - [`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError) if `token=True` and the token cannot be found. - [`OSError`](https://docs.python.org/3/library/exceptions.html#OSError) @@ -5035,8 +5012,6 @@ def hf_hub_download( If the file to download cannot be found. - [`~utils.LocalEntryNotFoundError`] If network is disabled or unavailable and file is not found in cache. - - """ from .file_download import hf_hub_download @@ -5077,10 +5052,8 @@ def snapshot_download( revision: Optional[str] = None, cache_dir: Union[str, Path, None] = None, local_dir: Union[str, Path, None] = None, - local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto", proxies: Optional[Dict] = None, etag_timeout: float = DEFAULT_ETAG_TIMEOUT, - resume_download: bool = False, force_download: bool = False, token: Optional[Union[str, bool]] = None, local_files_only: bool = False, @@ -5088,6 +5061,9 @@ def snapshot_download( ignore_patterns: Optional[Union[List[str], str]] = None, max_workers: int = 8, tqdm_class: Optional[base_tqdm] = None, + # Deprecated args + local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto", + resume_download: bool = False, ) -> str: """Download repo files. @@ -5096,20 +5072,10 @@ def snapshot_download( to keep their actual filename relative to that folder. You can also filter which files to download using `allow_patterns` and `ignore_patterns`. - If `local_dir` is provided, the file structure from the repo will be replicated in this location. You can configure - how you want to move those files: - - If `local_dir_use_symlinks="auto"` (default), files are downloaded and stored in the cache directory as blob - files. Small files (<5MB) are duplicated in `local_dir` while a symlink is created for bigger files. The goal - is to be able to manually edit and save small files without corrupting the cache while saving disk space for - binary files. The 5MB threshold can be configured with the `HF_HUB_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD` - environment variable. - - If `local_dir_use_symlinks=True`, files are downloaded, stored in the cache directory and symlinked in `local_dir`. - This is optimal in term of disk usage but files must not be manually edited. - - If `local_dir_use_symlinks=False` and the blob files exist in the cache directory, they are duplicated in the - local dir. This means disk usage is not optimized. - - Finally, if `local_dir_use_symlinks=False` and the blob files do not exist in the cache directory, then the - files are downloaded and directly placed under `local_dir`. This means if you need to download them again later, - they will be re-downloaded entirely. + If `local_dir` is provided, the file structure from the repo will be replicated in this location. When using this + option, the `cache_dir` will not be used and a `.huggingface/` folder will be created at the root of `local_dir` + to store some metadata related to the downloaded files.While this mechanism is not as robust as the main + cache-system, it's optimized for regularly pulling the latest version of a repository. An alternative would be to clone the repo but this requires git and git-lfs to be installed and properly configured. It is also not possible to filter which files to download when cloning a repository using git. @@ -5126,13 +5092,7 @@ def snapshot_download( cache_dir (`str`, `Path`, *optional*): Path to the folder where cached files are stored. local_dir (`str` or `Path`, *optional*): - If provided, the downloaded files will be placed under this directory, either as symlinks (default) or - regular files (see description for more details). - local_dir_use_symlinks (`"auto"` or `bool`, defaults to `"auto"`): - To be used with `local_dir`. If set to "auto", the cache directory will be used and the file will be either - duplicated or symlinked to the local directory depending on its size. It set to `True`, a symlink will be - created, no matter the file size. If set to `False`, the file will either be duplicated from cache (if - already exists) or downloaded from the Hub and not cached. See description for more details. + If provided, the downloaded files will be placed under this directory. proxies (`dict`, *optional*): Dictionary mapping protocol to the URL of the proxy passed to `requests.request`. @@ -5164,20 +5124,15 @@ def snapshot_download( `HF_HUB_DISABLE_PROGRESS_BARS` environment variable. Returns: - Local folder path (string) of repo snapshot - - - - Raises the following errors: + `str`: folder path of the repo snapshot. - - [`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError) - if `token=True` and the token cannot be found. - - [`OSError`](https://docs.python.org/3/library/exceptions.html#OSError) if - ETag cannot be determined. - - [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError) - if some parameter value is invalid - - + Raises: + - [`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError) + if `token=True` and the token cannot be found. + - [`OSError`](https://docs.python.org/3/library/exceptions.html#OSError) if + ETag cannot be determined. + - [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError) + if some parameter value is invalid """ from ._snapshot_download import snapshot_download diff --git a/tests/test_cli.py b/tests/test_cli.py index 0394350644..55fb475fe4 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -359,7 +359,6 @@ def test_download_basic(self) -> None: self.assertIsNone(args.exclude) self.assertIsNone(args.cache_dir) self.assertIsNone(args.local_dir) - self.assertEqual(args.local_dir_use_symlinks, "auto") self.assertFalse(args.force_download) self.assertFalse(args.resume_download) self.assertIsNone(args.token) @@ -391,8 +390,6 @@ def test_download_with_all_options(self) -> None: "--quiet", "--local-dir", ".", - "--local-dir-use-symlinks", - "True", ] ) self.assertEqual(args.repo_id, DUMMY_MODEL_ID) @@ -403,7 +400,6 @@ def test_download_with_all_options(self) -> None: self.assertTrue(args.force_download) self.assertEqual(args.cache_dir, "/tmp") self.assertEqual(args.local_dir, ".") - self.assertTrue(args.local_dir_use_symlinks) self.assertTrue(args.resume_download) self.assertEqual(args.token, "my-token") self.assertTrue(args.quiet) @@ -423,7 +419,6 @@ def test_download_file_from_revision(self, mock: Mock) -> None: resume_download=False, cache_dir=None, local_dir=".", - local_dir_use_symlinks="auto", quiet=False, ) @@ -442,7 +437,6 @@ def test_download_file_from_revision(self, mock: Mock) -> None: force_download=False, token="hf_****", local_dir=".", - local_dir_use_symlinks="auto", library_name="huggingface-cli", ) @@ -460,7 +454,6 @@ def test_download_multiple_files(self, mock: Mock) -> None: resume_download=True, cache_dir=None, local_dir="/path/to/dir", - local_dir_use_symlinks="False", quiet=False, ) DownloadCommand(args).run() @@ -477,7 +470,6 @@ def test_download_multiple_files(self, mock: Mock) -> None: cache_dir=None, token="hf_****", local_dir="/path/to/dir", - local_dir_use_symlinks=False, library_name="huggingface-cli", ) @@ -496,7 +488,6 @@ def test_download_with_patterns(self, mock: Mock) -> None: cache_dir=None, quiet=False, local_dir=None, - local_dir_use_symlinks="auto", ) DownloadCommand(args).run() @@ -511,7 +502,6 @@ def test_download_with_patterns(self, mock: Mock) -> None: force_download=True, cache_dir=None, local_dir=None, - local_dir_use_symlinks="auto", token=None, library_name="huggingface-cli", ) @@ -531,7 +521,6 @@ def test_download_with_ignored_patterns(self, mock: Mock) -> None: cache_dir=None, quiet=False, local_dir=None, - local_dir_use_symlinks="auto", ) with self.assertWarns(UserWarning): @@ -549,7 +538,6 @@ def test_download_with_ignored_patterns(self, mock: Mock) -> None: cache_dir=None, token=None, local_dir=None, - local_dir_use_symlinks="auto", library_name="huggingface-cli", ) From 82b46b338e4824cb25b613f1fda1cd581d23fd27 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Wed, 24 Apr 2024 17:02:56 +0200 Subject: [PATCH 12/40] add secu --- src/huggingface_hub/_local_folder.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/huggingface_hub/_local_folder.py b/src/huggingface_hub/_local_folder.py index a422f2653e..6c792502cd 100644 --- a/src/huggingface_hub/_local_folder.py +++ b/src/huggingface_hub/_local_folder.py @@ -133,6 +133,11 @@ def get_local_download_paths(local_dir: Path, filename: str) -> LocalDownloadFil # filename is the path in the Hub repository (separated by '/') # make sure to have a cross platform transcription sanitized_filename = os.path.join(*filename.split("/")) + if sanitized_filename.startswith("..\\") or "\\..\\" in sanitized_filename: + raise ValueError( + f"Invalid filename: cannot handle filename '{sanitized_filename}' on Windows. Please ask the repository" + " owner to rename this file." + ) file_path = local_dir / sanitized_filename metadata_path = _huggingface_dir(local_dir) / "download" / f"{sanitized_filename}.metadata" lock_path = metadata_path.with_suffix(".lock") From 3300b286355caaa1e37277bd0a549a4424838d91 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Wed, 24 Apr 2024 17:11:51 +0200 Subject: [PATCH 13/40] as constant --- src/huggingface_hub/_commit_api.py | 3 ++- src/huggingface_hub/utils/__init__.py | 2 +- src/huggingface_hub/utils/_paths.py | 2 ++ 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/huggingface_hub/_commit_api.py b/src/huggingface_hub/_commit_api.py index 64f8ca165a..d4368bab25 100644 --- a/src/huggingface_hub/_commit_api.py +++ b/src/huggingface_hub/_commit_api.py @@ -19,6 +19,7 @@ from .file_download import hf_hub_url from .lfs import UploadInfo, lfs_upload, post_lfs_batch_info from .utils import ( + FORBIDDEN_FOLDERS, EntryNotFoundError, chunk_iterable, get_session, @@ -254,7 +255,7 @@ def _validate_path_in_repo(path_in_repo: str) -> str: raise ValueError(f"Invalid `path_in_repo` in CommitOperation: '{path_in_repo}'") if path_in_repo.startswith("./"): path_in_repo = path_in_repo[2:] - for forbidden in [".git", ".huggingface"]: + for forbidden in FORBIDDEN_FOLDERS: if any(part == forbidden for part in path_in_repo.split("/")): raise ValueError( f"Invalid `path_in_repo` in CommitOperation: cannot update files under a '{forbidden}/' folder (path:" diff --git a/src/huggingface_hub/utils/__init__.py b/src/huggingface_hub/utils/__init__.py index 60454bb99f..b7bd497342 100644 --- a/src/huggingface_hub/utils/__init__.py +++ b/src/huggingface_hub/utils/__init__.py @@ -63,7 +63,7 @@ reset_sessions, ) from ._pagination import paginate -from ._paths import DEFAULT_IGNORE_PATTERNS, filter_repo_objects +from ._paths import DEFAULT_IGNORE_PATTERNS, FORBIDDEN_FOLDERS, filter_repo_objects from ._runtime import ( dump_environment_info, get_aiohttp_version, diff --git a/src/huggingface_hub/utils/_paths.py b/src/huggingface_hub/utils/_paths.py index d30bdbf1e0..2361db6d0e 100644 --- a/src/huggingface_hub/utils/_paths.py +++ b/src/huggingface_hub/utils/_paths.py @@ -32,6 +32,8 @@ "*/.huggingface", "**/.huggingface/**", ] +# Forbidden to commit these folders +FORBIDDEN_FOLDERS = [".git", ".huggingface"] def filter_repo_objects( From c606a94cc901594508c453addd5d24b54d04786d Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Wed, 24 Apr 2024 17:12:58 +0200 Subject: [PATCH 14/40] dix --- src/huggingface_hub/_local_folder.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/huggingface_hub/_local_folder.py b/src/huggingface_hub/_local_folder.py index 6c792502cd..7f0a9c2c5c 100644 --- a/src/huggingface_hub/_local_folder.py +++ b/src/huggingface_hub/_local_folder.py @@ -133,11 +133,12 @@ def get_local_download_paths(local_dir: Path, filename: str) -> LocalDownloadFil # filename is the path in the Hub repository (separated by '/') # make sure to have a cross platform transcription sanitized_filename = os.path.join(*filename.split("/")) - if sanitized_filename.startswith("..\\") or "\\..\\" in sanitized_filename: - raise ValueError( - f"Invalid filename: cannot handle filename '{sanitized_filename}' on Windows. Please ask the repository" - " owner to rename this file." - ) + if os.name == "nt": + if sanitized_filename.startswith("..\\") or "\\..\\" in sanitized_filename: + raise ValueError( + f"Invalid filename: cannot handle filename '{sanitized_filename}' on Windows. Please ask the repository" + " owner to rename this file." + ) file_path = local_dir / sanitized_filename metadata_path = _huggingface_dir(local_dir) / "download" / f"{sanitized_filename}.metadata" lock_path = metadata_path.with_suffix(".lock") From 95171efe0f634240cd6e8bf55e630fa2590d11c0 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Wed, 24 Apr 2024 17:21:12 +0200 Subject: [PATCH 15/40] fix tests --- tests/test_cli.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_cli.py b/tests/test_cli.py index 55fb475fe4..e7fc9ff14b 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -419,6 +419,7 @@ def test_download_file_from_revision(self, mock: Mock) -> None: resume_download=False, cache_dir=None, local_dir=".", + local_dir_use_symlinks=None, quiet=False, ) @@ -454,6 +455,7 @@ def test_download_multiple_files(self, mock: Mock) -> None: resume_download=True, cache_dir=None, local_dir="/path/to/dir", + local_dir_use_symlinks=None, quiet=False, ) DownloadCommand(args).run() @@ -488,6 +490,7 @@ def test_download_with_patterns(self, mock: Mock) -> None: cache_dir=None, quiet=False, local_dir=None, + local_dir_use_symlinks=None, ) DownloadCommand(args).run() @@ -521,6 +524,7 @@ def test_download_with_ignored_patterns(self, mock: Mock) -> None: cache_dir=None, quiet=False, local_dir=None, + local_dir_use_symlinks=None, ) with self.assertWarns(UserWarning): From 7180746d8553cda4703ee421c4e41652f995e3b2 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Wed, 24 Apr 2024 17:48:34 +0200 Subject: [PATCH 16/40] remove unused code --- tests/test_file_download.py | 158 ------------------------------------ 1 file changed, 158 deletions(-) diff --git a/tests/test_file_download.py b/tests/test_file_download.py index 2683946c96..fd95bae4dc 100644 --- a/tests/test_file_download.py +++ b/tests/test_file_download.py @@ -747,164 +747,6 @@ def test_keep_lock_file(self): self.assertTrue(lock_file_exist, "no lock file can be found") -# @with_production_testing -# @pytest.mark.usefixtures("fx_cache_dir") -# class HfHubDownloadToLocalDir(unittest.TestCase): -# cache_dir: Path - -# def test_with_local_dir_and_symlinks_and_file_cached(self) -> None: -# # File already cached -# hf_hub_download(DUMMY_MODEL_ID, filename=CONFIG_NAME, cache_dir=self.cache_dir) - -# # Download to local dir -# with SoftTemporaryDirectory() as local_dir: -# returned_path = hf_hub_download( -# DUMMY_MODEL_ID, -# filename=CONFIG_NAME, -# cache_dir=self.cache_dir, -# local_dir=local_dir, -# local_dir_use_symlinks=True, -# ) -# config_file = Path(local_dir) / CONFIG_NAME -# self.assertEqual(returned_path, str(config_file)) -# self.assertTrue(config_file.is_file()) -# self.assertTrue( # File is symlink (except in Windows CI) -# config_file.is_symlink() if os.name != "nt" else not config_file.is_symlink() -# ) - -# def test_with_local_dir_and_symlinks_and_file_not_cached(self) -> None: -# # Download to local dir -# with SoftTemporaryDirectory() as local_dir: -# returned_path = hf_hub_download( -# DUMMY_MODEL_ID, -# filename=CONFIG_NAME, -# cache_dir=self.cache_dir, -# local_dir=local_dir, -# local_dir_use_symlinks=True, -# ) -# config_file = Path(local_dir) / CONFIG_NAME -# self.assertEqual(returned_path, str(config_file)) -# self.assertTrue(config_file.is_file()) -# if os.name != "nt": # File is symlink (except in Windows CI) -# self.assertTrue(config_file.is_symlink()) -# blob_path = config_file.resolve() -# self.assertTrue(self.cache_dir in blob_path.parents) # blob is cached! -# else: -# self.assertFalse(config_file.is_symlink()) - -# def test_with_local_dir_and_no_symlink_and_file_cached(self) -> None: -# # File already cached -# hf_hub_download(DUMMY_MODEL_ID, filename=CONFIG_NAME, cache_dir=self.cache_dir) - -# # Download to local dir -# with SoftTemporaryDirectory() as local_dir: -# with patch.object( -# huggingface_hub.file_download, "http_get", wraps=huggingface_hub.file_download.http_get -# ) as mock: -# returned_path = hf_hub_download( -# DUMMY_MODEL_ID, -# filename=CONFIG_NAME, -# cache_dir=self.cache_dir, -# local_dir=local_dir, -# local_dir_use_symlinks=False, # no symlinks -# ) -# mock.assert_not_called() # reused file from cache - -# config_file = Path(local_dir) / CONFIG_NAME -# self.assertEqual(returned_path, str(config_file)) -# self.assertTrue(config_file.is_file()) -# self.assertFalse(config_file.is_symlink()) - -# def test_with_local_dir_and_no_symlink_and_file_not_cached(self) -> None: -# # Download to local dir -# with SoftTemporaryDirectory() as local_dir: -# with patch.object( -# huggingface_hub.file_download, "http_get", wraps=huggingface_hub.file_download.http_get -# ) as mock: -# returned_path = hf_hub_download( -# DUMMY_MODEL_ID, -# filename=CONFIG_NAME, -# cache_dir=self.cache_dir, -# local_dir=local_dir, -# local_dir_use_symlinks=False, # no symlinks -# ) -# mock.assert_called() # no file cached => had to download it - -# config_file = Path(local_dir) / CONFIG_NAME -# self.assertEqual(returned_path, str(config_file)) -# self.assertTrue(config_file.is_file()) -# self.assertFalse(config_file.is_symlink()) - -# # Cache directory not used (no blobs, no symlinks in it) -# for path in self.cache_dir.glob("**/blobs/**"): -# self.assertFalse(path.is_file()) -# for path in self.cache_dir.glob("**/snapshots/**"): -# self.assertFalse(path.is_file()) - -# @patch("huggingface_hub.constants.HF_HUB_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD", 1024) -# def test_with_local_dir_and_auto_symlinks_and_file_cached(self) -> None: -# # File already cached -# hf_hub_download(DUMMY_MODEL_ID, filename=CONFIG_NAME, cache_dir=self.cache_dir) # 496 bytes -> small -# hf_hub_download(DUMMY_MODEL_ID, filename="README.md", cache_dir=self.cache_dir) # 1.11kB -> "big" - -# # Download to local dir -# with SoftTemporaryDirectory() as local_dir: -# config = hf_hub_download( -# DUMMY_MODEL_ID, filename=CONFIG_NAME, cache_dir=self.cache_dir, local_dir=local_dir -# ) -# readme = hf_hub_download( -# DUMMY_MODEL_ID, filename="README.md", cache_dir=self.cache_dir, local_dir=local_dir -# ) -# self.assertFalse(Path(config).is_symlink()) # 496b => small => duplicated -# if os.name != "nt": -# self.assertTrue(Path(readme).is_symlink()) # 1.11kB => big => symlink - -# @patch("huggingface_hub.constants.HF_HUB_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD", 1024) -# def test_with_local_dir_and_auto_symlinks_and_file_not_cached(self) -> None: -# # Download to local dir -# with SoftTemporaryDirectory() as local_dir: -# config = hf_hub_download( -# DUMMY_MODEL_ID, filename=CONFIG_NAME, cache_dir=self.cache_dir, local_dir=local_dir -# ) -# readme = hf_hub_download( -# DUMMY_MODEL_ID, filename="README.md", cache_dir=self.cache_dir, local_dir=local_dir -# ) -# self.assertFalse(Path(config).is_symlink()) # 496b => small => duplicated -# if os.name != "nt": -# self.assertTrue(Path(readme).is_symlink()) # 1.11kB => big => symlink - -# def test_with_local_dir_and_symlinks_and_overwrite(self) -> None: -# # Download to local dir -# with SoftTemporaryDirectory() as local_dir: -# config_path = Path(local_dir) / CONFIG_NAME -# config_path.write_text("this will be overwritten") -# hf_hub_download( -# DUMMY_MODEL_ID, -# filename=CONFIG_NAME, -# cache_dir=self.cache_dir, -# local_dir=local_dir, -# local_dir_use_symlinks=True, -# ) -# if os.name != "nt": -# self.assertTrue(config_path.is_symlink()) -# self.assertNotEqual(config_path.read_text(), "this will be overwritten") - -# def test_with_local_dir_and_no_symlinks_and_overwrite(self) -> None: -# # Download to local dir -# with SoftTemporaryDirectory() as local_dir: -# config_path = Path(local_dir) / CONFIG_NAME -# config_path.write_text("this will be overwritten") -# hf_hub_download( -# DUMMY_MODEL_ID, -# filename=CONFIG_NAME, -# cache_dir=self.cache_dir, -# local_dir=local_dir, -# local_dir_use_symlinks=False, -# ) -# self.assertFalse(config_path.is_symlink()) -# self.assertNotEqual(config_path.read_text(), "this will be overwritten") - - @pytest.mark.usefixtures("fx_cache_dir") class HfHubDownloadToLocalDir(unittest.TestCase): # `cache_dir` is a temporary directory From 4e664d4c2a1050cadc60cb4845ff1277a3f23b04 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Wed, 24 Apr 2024 18:18:33 +0200 Subject: [PATCH 17/40] don't use jsons --- src/huggingface_hub/_local_folder.py | 32 ++++++++++++---------------- 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/src/huggingface_hub/_local_folder.py b/src/huggingface_hub/_local_folder.py index 7f0a9c2c5c..d6029eb99b 100644 --- a/src/huggingface_hub/_local_folder.py +++ b/src/huggingface_hub/_local_folder.py @@ -36,23 +36,17 @@ Metadata file structure: ``` # file.txt.metadata -{ - "commit_hash": "11c5a3d5811f50298f278a704980280950aedb10", - "etag": "a16a55fda99d2f2e7b69cce5cf93ff4ad3049930", - "timestamp": 1712656091 -} - +11c5a3d5811f50298f278a704980280950aedb10 +a16a55fda99d2f2e7b69cce5cf93ff4ad3049930 +1712656091.123 # file.parquet.metadata -{ - "commit_hash": "11c5a3d5811f50298f278a704980280950aedb10", - "etag": "7c5d3f4b8b76583b422fcb9189ad6c89d5d97a094541ce8932dce3ecabde1421", - "timestamp": 1712656091 +11c5a3d5811f50298f278a704980280950aedb10 +7c5d3f4b8b76583b422fcb9189ad6c89d5d97a094541ce8932dce3ecabde1421 +1712656091.123 } ``` """ - -import json import logging import os import time @@ -167,15 +161,17 @@ def read_download_metadata(local_dir: Path, filename: str) -> Optional[LocalDown if paths.metadata_path.exists(): try: with paths.metadata_path.open() as f: - metadata = json.load(f) + commit_hash = f.readline().strip() + etag = f.readline().strip() + timestamp = float(f.readline().strip()) metadata = LocalDownloadFileMetadata( filename=filename, - commit_hash=metadata["commit_hash"], - etag=metadata["etag"], - timestamp=metadata["timestamp"], + commit_hash=commit_hash, + etag=etag, + timestamp=timestamp, ) except Exception as e: - # remove the metadata file if it is corrupted / not a json / not the right format + # remove the metadata file if it is corrupted / not the right format logger.warning( f"Invalid metadata file {paths.metadata_path}: {e}. Removing it from disk and continue." ) @@ -206,7 +202,7 @@ def write_download_metadata(local_dir: Path, filename: str, commit_hash: str, et paths = get_local_download_paths(local_dir, filename) with WeakFileLock(paths.lock_path): with paths.metadata_path.open("w") as f: - json.dump({"commit_hash": commit_hash, "etag": etag, "timestamp": time.time()}, f, indent=4) + f.write(f"{commit_hash}\n{etag}\n{time.time()}\n") @lru_cache() From 7bb263ef3b4a64c0ea77c39263e4466415df0731 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Wed, 24 Apr 2024 18:19:21 +0200 Subject: [PATCH 18/40] style --- src/huggingface_hub/_local_folder.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/huggingface_hub/_local_folder.py b/src/huggingface_hub/_local_folder.py index d6029eb99b..087eb9dff3 100644 --- a/src/huggingface_hub/_local_folder.py +++ b/src/huggingface_hub/_local_folder.py @@ -47,6 +47,7 @@ } ``` """ + import logging import os import time From 35950422f37743d2955e84fe0f9d2393c9c3e3a0 Mon Sep 17 00:00:00 2001 From: Lucain Date: Thu, 25 Apr 2024 14:29:09 +0200 Subject: [PATCH 19/40] Apply suggestions from code review Co-authored-by: Lysandre Debut --- docs/source/en/package_reference/environment_variables.md | 2 +- src/huggingface_hub/_snapshot_download.py | 2 +- src/huggingface_hub/commands/download.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/en/package_reference/environment_variables.md b/docs/source/en/package_reference/environment_variables.md index e2752d8ea4..d22671cf21 100644 --- a/docs/source/en/package_reference/environment_variables.md +++ b/docs/source/en/package_reference/environment_variables.md @@ -67,7 +67,7 @@ For more details, see [logging reference](../package_reference/utilities#hugging ### HF_HUB_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD -This environment variable has been deprecated and is now ignored by `huggingface_hub`. Downloading files to the local dir do not rely on symlinks anymore. +This environment variable has been deprecated and is now ignored by `huggingface_hub`. Downloading files to the local dir does not rely on symlinks anymore. ### HF_HUB_ETAG_TIMEOUT diff --git a/src/huggingface_hub/_snapshot_download.py b/src/huggingface_hub/_snapshot_download.py index 739ffa9ef8..5af37415c4 100644 --- a/src/huggingface_hub/_snapshot_download.py +++ b/src/huggingface_hub/_snapshot_download.py @@ -66,7 +66,7 @@ def snapshot_download( If `local_dir` is provided, the file structure from the repo will be replicated in this location. When using this option, the `cache_dir` will not be used and a `.huggingface/` folder will be created at the root of `local_dir` - to store some metadata related to the downloaded files.While this mechanism is not as robust as the main + to store some metadata related to the downloaded files. While this mechanism is not as robust as the main cache-system, it's optimized for regularly pulling the latest version of a repository. An alternative would be to clone the repo but this requires git and git-lfs to be installed and properly diff --git a/src/huggingface_hub/commands/download.py b/src/huggingface_hub/commands/download.py index b8f4b2f51e..500d90b924 100644 --- a/src/huggingface_hub/commands/download.py +++ b/src/huggingface_hub/commands/download.py @@ -130,7 +130,7 @@ def __init__(self, args: Namespace) -> None: if args.local_dir_use_symlinks is not None: warnings.warn( - "Ignoring --local-dir-use-symlinks. Downloading to a local directory do not use symlinks anymore.", + "Ignoring --local-dir-use-symlinks. Downloading to a local directory does not use symlinks anymore.", FutureWarning, ) From 3401880bf3299f5d90cb91194580d1d9549c4b96 Mon Sep 17 00:00:00 2001 From: Lucain Date: Thu, 25 Apr 2024 14:29:30 +0200 Subject: [PATCH 20/40] Apply suggestions from code review Co-authored-by: Lysandre Debut --- src/huggingface_hub/_local_folder.py | 2 +- src/huggingface_hub/commands/download.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/huggingface_hub/_local_folder.py b/src/huggingface_hub/_local_folder.py index 087eb9dff3..6e93db228e 100644 --- a/src/huggingface_hub/_local_folder.py +++ b/src/huggingface_hub/_local_folder.py @@ -99,7 +99,7 @@ class LocalDownloadFileMetadata: Commit hash of the file in the repo. etag (`str`): ETag of the file in the repo. Used to check if the file has changed. - For LFS files, this is the sha256 of the file. For regular, it correspond to the git hash. + For LFS files, this is the sha256 of the file. For regular files, it corresponds to the git hash. timestamp (`int`): Unix timestamp of when the metadata was saved i.e. when the metadata was accurate. """ diff --git a/src/huggingface_hub/commands/download.py b/src/huggingface_hub/commands/download.py index 500d90b924..53c24c09d2 100644 --- a/src/huggingface_hub/commands/download.py +++ b/src/huggingface_hub/commands/download.py @@ -94,7 +94,7 @@ def register_subcommand(parser: _SubParsersAction): download_parser.add_argument( "--local-dir-use-symlinks", choices=["auto", "True", "False"], - help=("Deprecated and ignored. Downloading to a local directory do not use symlinks anymore."), + help=("Deprecated and ignored. Downloading to a local directory does not use symlinks anymore."), ) download_parser.add_argument( "--force-download", From 92106486701ef8175920b0bade7bf84f5e3c010e Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Thu, 25 Apr 2024 14:37:29 +0200 Subject: [PATCH 21/40] Warn more about resume_download --- docs/source/en/guides/integrations.md | 2 +- src/huggingface_hub/_snapshot_download.py | 2 +- src/huggingface_hub/commands/download.py | 6 ++++-- src/huggingface_hub/file_download.py | 12 ++++++------ src/huggingface_hub/hf_api.py | 4 ++-- src/huggingface_hub/hub_mixin.py | 8 ++++---- 6 files changed, 18 insertions(+), 16 deletions(-) diff --git a/docs/source/en/guides/integrations.md b/docs/source/en/guides/integrations.md index d3f6c26658..0898ab0445 100644 --- a/docs/source/en/guides/integrations.md +++ b/docs/source/en/guides/integrations.md @@ -98,7 +98,7 @@ common to offer parameters like: - `token`: to download from a private repo - `revision`: to download from a specific branch - `cache_dir`: to cache files in a specific directory -- `force_download`/`resume_download`/`local_files_only`: to reuse the cache or not +- `force_download`/`local_files_only`: to reuse the cache or not - `proxies`: configure HTTP session When pushing models, similar parameters are supported: diff --git a/src/huggingface_hub/_snapshot_download.py b/src/huggingface_hub/_snapshot_download.py index 5af37415c4..2a29da6838 100644 --- a/src/huggingface_hub/_snapshot_download.py +++ b/src/huggingface_hub/_snapshot_download.py @@ -55,7 +55,7 @@ def snapshot_download( endpoint: Optional[str] = None, # Deprecated args local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto", - resume_download: bool = False, + resume_download: Optional[bool] = None, ) -> str: """Download repo files. diff --git a/src/huggingface_hub/commands/download.py b/src/huggingface_hub/commands/download.py index 53c24c09d2..48c4c55d28 100644 --- a/src/huggingface_hub/commands/download.py +++ b/src/huggingface_hub/commands/download.py @@ -102,7 +102,9 @@ def register_subcommand(parser: _SubParsersAction): help="If True, the files will be downloaded even if they are already cached.", ) download_parser.add_argument( - "--resume-download", action="store_true", help="If True, resume a previously interrupted download." + "--resume-download", + action="store_true", + help="Deprecated and ignored. Downloading a file to local dir always attempt to resume previously interrupted download.", ) download_parser.add_argument( "--token", type=str, help="A User Access Token generated from https://huggingface.co/settings/tokens" @@ -125,7 +127,7 @@ def __init__(self, args: Namespace) -> None: self.cache_dir: Optional[str] = args.cache_dir self.local_dir: Optional[str] = args.local_dir self.force_download: bool = args.force_download - self.resume_download: bool = args.resume_download + self.resume_download: Optional[bool] = args.resume_download or None self.quiet: bool = args.quiet if args.local_dir_use_symlinks is not None: diff --git a/src/huggingface_hub/file_download.py b/src/huggingface_hub/file_download.py index 4fc4a68691..05503c27a8 100644 --- a/src/huggingface_hub/file_download.py +++ b/src/huggingface_hub/file_download.py @@ -585,7 +585,7 @@ def cached_download( force_filename: Optional[str] = None, proxies: Optional[Dict] = None, etag_timeout: float = DEFAULT_ETAG_TIMEOUT, - resume_download: bool = False, + resume_download: Optional[bool] = None, token: Union[bool, str, None] = None, local_files_only: bool = False, legacy_cache_layout: bool = False, @@ -672,10 +672,10 @@ def cached_download( " 'hf_hub_download'", FutureWarning, ) - if resume_download: + if resume_download is not None: warnings.warn( "`resume_download` is deprecated and will be removed in version 1.0.0. " - "Download always resume when possible. " + "Downloads always resume when possible. " "If you want to force a new download, use `force_download=True`.", FutureWarning, ) @@ -1010,7 +1010,7 @@ def hf_hub_download( endpoint: Optional[str] = None, # Deprecated args legacy_cache_layout: bool = False, - resume_download: bool = False, + resume_download: Optional[bool] = None, force_filename: Optional[str] = None, local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto", ) -> str: @@ -1127,10 +1127,10 @@ def hf_hub_download( FutureWarning, ) legacy_cache_layout = True - if resume_download: + if resume_download is not None: warnings.warn( "`resume_download` is deprecated and will be removed in version 1.0.0. " - "Download always resume when possible. " + "Downloads always resume when possible. " "If you want to force a new download, use `force_download=True`.", FutureWarning, ) diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py index b13905e8cd..c6caeb4543 100644 --- a/src/huggingface_hub/hf_api.py +++ b/src/huggingface_hub/hf_api.py @@ -4918,7 +4918,7 @@ def hf_hub_download( token: Optional[Union[str, bool]] = None, local_files_only: bool = False, # Deprecated args - resume_download: bool = False, + resume_download: Optional[bool] = None, legacy_cache_layout: bool = False, force_filename: Optional[str] = None, local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto", @@ -5063,7 +5063,7 @@ def snapshot_download( tqdm_class: Optional[base_tqdm] = None, # Deprecated args local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto", - resume_download: bool = False, + resume_download: Optional[bool] = None, ) -> str: """Download repo files. diff --git a/src/huggingface_hub/hub_mixin.py b/src/huggingface_hub/hub_mixin.py index 690de0cece..ee4a8f27b8 100644 --- a/src/huggingface_hub/hub_mixin.py +++ b/src/huggingface_hub/hub_mixin.py @@ -112,7 +112,7 @@ class ModelHubMixin: ... pretrained_model_name_or_path: Union[str, Path], ... *, ... force_download: bool = False, - ... resume_download: bool = False, + ... resume_download: Optional[bool] = None, ... proxies: Optional[Dict] = None, ... token: Optional[Union[str, bool]] = None, ... cache_dir: Optional[Union[str, Path]] = None, @@ -320,7 +320,7 @@ def from_pretrained( pretrained_model_name_or_path: Union[str, Path], *, force_download: bool = False, - resume_download: bool = False, + resume_download: Optional[bool] = None, proxies: Optional[Dict] = None, token: Optional[Union[str, bool]] = None, cache_dir: Optional[Union[str, Path]] = None, @@ -449,7 +449,7 @@ def _from_pretrained( cache_dir: Optional[Union[str, Path]], force_download: bool, proxies: Optional[Dict], - resume_download: bool, + resume_download: Optional[bool], local_files_only: bool, token: Optional[Union[str, bool]], **model_kwargs, @@ -625,7 +625,7 @@ def _from_pretrained( cache_dir: Optional[Union[str, Path]], force_download: bool, proxies: Optional[Dict], - resume_download: bool, + resume_download: Optional[bool], local_files_only: bool, token: Union[str, bool, None], map_location: str = "cpu", From fb477e594ddf22f4595416db42c5dd1a187fd48d Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Thu, 25 Apr 2024 14:39:41 +0200 Subject: [PATCH 22/40] fix test --- tests/test_hub_mixin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_hub_mixin.py b/tests/test_hub_mixin.py index 0047bdd45d..dcf5cebd34 100644 --- a/tests/test_hub_mixin.py +++ b/tests/test_hub_mixin.py @@ -266,7 +266,7 @@ def test_from_pretrained_model_id_and_revision(self, from_pretrained_mock: Mock) cache_dir=None, force_download=False, proxies=None, - resume_download=False, + resume_download=None, local_files_only=False, token=None, ) From 0eacbc9f76f568f28bfba5b5307d9d64b673641b Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Thu, 25 Apr 2024 15:15:56 +0200 Subject: [PATCH 23/40] Add tests specific to .huggingface folder --- tests/test_local_folder.py | 158 +++++++++++++++++++++++++++++++++++++ 1 file changed, 158 insertions(+) create mode 100644 tests/test_local_folder.py diff --git a/tests/test_local_folder.py b/tests/test_local_folder.py new file mode 100644 index 0000000000..286ca09c1b --- /dev/null +++ b/tests/test_local_folder.py @@ -0,0 +1,158 @@ +# coding=utf-8 +# Copyright 2024-present, the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains tests for the `.huggingface` folder in local directories. + +See `huggingface_hub/src/_local_folder.py` for the implementation. +""" + +import logging +import time +from pathlib import Path + +import pytest + +from huggingface_hub._local_folder import ( + LocalDownloadFileMetadata, + LocalDownloadFilePaths, + _huggingface_dir, + get_local_download_paths, + read_download_metadata, + write_download_metadata, +) + + +def test_creates_huggingface_dir_with_gitignore(tmp_path: Path): + """Test `.huggingface/` dir is ignored by git.""" + local_dir = tmp_path / "path" / "to" / "local" + huggingface_dir = _huggingface_dir(local_dir) + assert huggingface_dir == local_dir / ".huggingface" + assert huggingface_dir.exists() # all subdirectories have been created + assert huggingface_dir.is_dir() + + # Whole folder must be ignored + assert (huggingface_dir / ".gitignore").exists() + assert (huggingface_dir / ".gitignore").read_text() == "*" + + +def test_local_download_paths(tmp_path: Path): + """Test local download paths are valid + usable.""" + paths = get_local_download_paths(tmp_path, "path/in/repo.txt") + + # Correct paths (also sanitized on windows) + assert isinstance(paths, LocalDownloadFilePaths) + assert paths.file_path == tmp_path / "path" / "in" / "repo.txt" + assert paths.metadata_path == tmp_path / ".huggingface" / "download" / "path" / "in" / "repo.txt.metadata" + assert paths.lock_path == tmp_path / ".huggingface" / "download" / "path" / "in" / "repo.txt.lock" + + # Paths are usable (parent directories have been created) + assert paths.file_path.parent.is_dir() + assert paths.metadata_path.parent.is_dir() + assert paths.lock_path.parent.is_dir() + + # Incomplete path are etag-based + assert ( + paths.incomplete_path("etag123") + == tmp_path / ".huggingface" / "download" / "path" / "in" / "repo.txt.etag123.incomplete" + ) + assert paths.incomplete_path("etag123").parent.is_dir() + + +def test_local_download_paths_are_cached(tmp_path: Path): + """Test local download paths are cached.""" + # No need for an exact singleton here. + # We just want to avoid recreating the dataclass on consecutive calls (happens often + # in the process). + paths1 = get_local_download_paths(tmp_path, "path/in/repo.txt") + paths2 = get_local_download_paths(tmp_path, "path/in/repo.txt") + assert paths1 is paths2 + + +def test_write_download_metadata(tmp_path: Path): + """Test download metadata content is valid.""" + # Write metadata + write_download_metadata(tmp_path, filename="file.txt", commit_hash="commit_hash", etag="123456789") + metadata_path = tmp_path / ".huggingface" / "download" / "file.txt.metadata" + assert metadata_path.exists() + + # Metadata is valid + with metadata_path.open() as f: + assert f.readline() == "commit_hash\n" + assert f.readline() == "123456789\n" + timestamp = float(f.readline().strip()) + assert timestamp < time.time() # in the past + assert timestamp > time.time() - 1 # but less than 1 seconds ago (we're not that slow) + + # Overwriting works as expected + write_download_metadata(tmp_path, filename="file.txt", commit_hash="commit_hash2", etag="987654321") + with metadata_path.open() as f: + assert f.readline() == "commit_hash2\n" + assert f.readline() == "987654321\n" + timestamp2 = float(f.readline().strip()) + assert timestamp < timestamp2 # updated timestamp + + +def test_read_download_metadata_valid_metadata(tmp_path: Path): + """Test reading download metadata when metadata is valid.""" + # Create file + write correct metadata + (tmp_path / "file.txt").write_text("content") + write_download_metadata(tmp_path, filename="file.txt", commit_hash="commit_hash", etag="123456789") + + # Read metadata + metadata = read_download_metadata(tmp_path, filename="file.txt") + assert isinstance(metadata, LocalDownloadFileMetadata) + assert metadata.filename == "file.txt" + assert metadata.commit_hash == "commit_hash" + assert metadata.etag == "123456789" + assert isinstance(metadata.timestamp, float) + + +def test_read_download_metadata_no_metadata(tmp_path: Path): + """Test reading download metadata when there is no metadata.""" + # No metadata file => return None + assert read_download_metadata(tmp_path, filename="file.txt") is None + + +def test_read_download_metadata_corrupted_metadata(tmp_path: Path, caplog: pytest.LogCaptureFixture): + """Test reading download metadata when metadata is corrupted.""" + # Write corrupted metadata + metadata_path = tmp_path / ".huggingface" / "download" / "file.txt.metadata" + metadata_path.parent.mkdir(parents=True, exist_ok=True) + metadata_path.write_text("invalid content") + + # Corrupted metadata file => delete it + warn + return None + with caplog.at_level(logging.WARNING): + assert read_download_metadata(tmp_path, filename="file.txt") is None + assert not metadata_path.exists() + assert "Invalid metadata file" in caplog.text + + +def test_read_download_metadata_correct_metadata_missing_file(tmp_path: Path): + """Test reading download metadata when metadata is correct but file is missing.""" + # Write correct metadata + write_download_metadata(tmp_path, filename="file.txt", commit_hash="commit_hash", etag="123456789") + + # File missing => return None + assert read_download_metadata(tmp_path, filename="file.txt") is None + + +def test_read_download_metadata_correct_metadata_but_outdated(tmp_path: Path): + """Test reading download metadata when metadata is correct but outdated.""" + # Write correct metadata + write_download_metadata(tmp_path, filename="file.txt", commit_hash="commit_hash", etag="123456789") + time.sleep(0.05) # for deterministic tests + + # File is outdated => return None + (tmp_path / "file.txt").write_text("content") + assert read_download_metadata(tmp_path, filename="file.txt") is None From 1a4320a4e3b82e3e940656743704beafd04ea9e5 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Thu, 25 Apr 2024 15:17:01 +0200 Subject: [PATCH 24/40] remove advice to use hf_transfer when downloading from cli --- src/huggingface_hub/commands/download.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/huggingface_hub/commands/download.py b/src/huggingface_hub/commands/download.py index 48c4c55d28..d5bf0d6e41 100644 --- a/src/huggingface_hub/commands/download.py +++ b/src/huggingface_hub/commands/download.py @@ -43,7 +43,6 @@ from huggingface_hub import logging from huggingface_hub._snapshot_download import snapshot_download from huggingface_hub.commands import BaseHuggingfaceCLICommand -from huggingface_hub.constants import HF_HUB_ENABLE_HF_TRANSFER from huggingface_hub.file_download import hf_hub_download from huggingface_hub.utils import disable_progress_bars, enable_progress_bars @@ -156,12 +155,6 @@ def _download(self) -> str: if self.exclude is not None and len(self.exclude) > 0: warnings.warn("Ignoring `--exclude` since filenames have being explicitly set.") - if not HF_HUB_ENABLE_HF_TRANSFER: - logger.info( - "Consider using `hf_transfer` for faster downloads. This solution comes with some limitations. See" - " https://huggingface.co/docs/huggingface_hub/hf_transfer for more details." - ) - # Single file to download: use `hf_hub_download` if len(self.filenames) == 1: return hf_hub_download( From 8c9dc8bbda3f9d1ac1dc31f9a6ed6e97c040334f Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Thu, 25 Apr 2024 15:23:01 +0200 Subject: [PATCH 25/40] fix torhc test --- tests/test_hub_mixin_pytorch.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_hub_mixin_pytorch.py b/tests/test_hub_mixin_pytorch.py index 3d1afdf557..98925b8290 100644 --- a/tests/test_hub_mixin_pytorch.py +++ b/tests/test_hub_mixin_pytorch.py @@ -147,7 +147,7 @@ def test_from_pretrained_model_from_hub_prefer_safetensor(self, hf_hub_download_ cache_dir=None, force_download=False, proxies=None, - resume_download=False, + resume_download=None, token=None, local_files_only=False, ) @@ -176,7 +176,7 @@ def test_from_pretrained_model_from_hub_fallback_pickle(self, hf_hub_download_mo cache_dir=None, force_download=False, proxies=None, - resume_download=False, + resume_download=None, token=None, local_files_only=False, ) @@ -187,7 +187,7 @@ def test_from_pretrained_model_from_hub_fallback_pickle(self, hf_hub_download_mo cache_dir=None, force_download=False, proxies=None, - resume_download=False, + resume_download=None, token=None, local_files_only=False, ) @@ -204,7 +204,7 @@ def test_from_pretrained_model_id_and_revision(self, from_pretrained_mock: Mock) cache_dir=None, force_download=False, proxies=None, - resume_download=False, + resume_download=None, local_files_only=False, token=None, ) From 6260a17fdee529be3720d2cd5d0dd5621ad4f1f2 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Thu, 25 Apr 2024 15:33:36 +0200 Subject: [PATCH 26/40] more test fix --- tests/test_cli.py | 2 +- tests/test_hf_api.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index e7fc9ff14b..0485e6cd47 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -434,7 +434,7 @@ def test_download_file_from_revision(self, mock: Mock) -> None: revision="refs/pr/1", filename="README.md", cache_dir=None, - resume_download=False, + resume_download=None, force_download=False, token="hf_****", local_dir=".", diff --git a/tests/test_hf_api.py b/tests/test_hf_api.py index a8c79b9dad..ad4529a0d9 100644 --- a/tests/test_hf_api.py +++ b/tests/test_hf_api.py @@ -2875,7 +2875,7 @@ def test_hf_hub_download_alias(self, mock: Mock) -> None: force_filename=None, proxies=None, etag_timeout=10, - resume_download=False, + resume_download=None, local_files_only=False, legacy_cache_layout=False, headers=None, @@ -2901,7 +2901,7 @@ def test_snapshot_download_alias(self, mock: Mock) -> None: local_dir_use_symlinks="auto", proxies=None, etag_timeout=10, - resume_download=False, + resume_download=None, force_download=False, local_files_only=False, allow_patterns=None, From 3a45f4b2f940a8491c03f3b282a01dfd40aa218e Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Thu, 25 Apr 2024 20:42:03 +0200 Subject: [PATCH 27/40] feedback --- docs/source/en/guides/download.md | 4 ---- 1 file changed, 4 deletions(-) diff --git a/docs/source/en/guides/download.md b/docs/source/en/guides/download.md index 1bf4ff0635..3242e0c0b0 100644 --- a/docs/source/en/guides/download.md +++ b/docs/source/en/guides/download.md @@ -134,12 +134,8 @@ However, if you need to download files to a specific folder, you can pass a `loc Note that a `.huggingface/` folder will be created at the root of your local directory, containing metadata about the downloaded files. This prevents re-downloading files if you re-run your script. While this mechanism is not as robust as the main cache-system, it's optimized for regularly pulling the latest version of a repository. - - After completing the download, you can safely remove the `.huggingface/` folder if you no longer need it. However, be aware that re-running your script without this folder may result in longer recovery times, as metadata will be lost. Rest assured that your local data will remain intact and unaffected. - - Don't worry about the `.huggingface/` folder when committing changes to the Hub! This folder is automatically ignored by both `git` and [`upload_folder`]. From 4f6f531581c591a100ee54aad2340d75bcb45e6e Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Fri, 26 Apr 2024 17:44:41 +0200 Subject: [PATCH 28/40] suggested changes --- docs/source/en/guides/cli.md | 2 +- docs/source/en/guides/download.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/en/guides/cli.md b/docs/source/en/guides/cli.md index aa7a4751a3..e515ce464a 100644 --- a/docs/source/en/guides/cli.md +++ b/docs/source/en/guides/cli.md @@ -226,7 +226,7 @@ The examples above show how to download from the latest commit on the main branc The recommended (and default) way to download files from the Hub is to use the cache-system. However, in some cases you want to download files and move them to a specific folder. This is useful to get a workflow closer to what git commands offer. You can do that using the `--local_dir` option. -Note that a `.huggingface/` folder will be created at the root of your local directory, containing metadata about the downloaded files. This prevents re-downloading files if you re-run the command. While this mechanism is not as robust as the main cache-system, it's optimized for regularly pulling the latest version of a repository. +A `./huggingface/` folder is created at the root of your local directory containing metadata about the downloaded files. This prevents re-downloading files if they're already up-to-date. If the metadata has changed, then the new file version is downloaded. This makes the `local_dir` optimized for pulling only the latest changes. diff --git a/docs/source/en/guides/download.md b/docs/source/en/guides/download.md index 3242e0c0b0..52bfa35bc9 100644 --- a/docs/source/en/guides/download.md +++ b/docs/source/en/guides/download.md @@ -132,7 +132,7 @@ By default, we recommend using the [cache system](./manage-cache) to download fi However, if you need to download files to a specific folder, you can pass a `local_dir` parameter to the download function. This is useful to get a workflow closer to what `git` commands offer. The downloaded files will maintain their original file structure within the specified folder. For example, if `filename="data/train.csv"` and `local_dir="path/to/folder"`, the resulting filepath will be `"path/to/folder/data/train.csv"`. -Note that a `.huggingface/` folder will be created at the root of your local directory, containing metadata about the downloaded files. This prevents re-downloading files if you re-run your script. While this mechanism is not as robust as the main cache-system, it's optimized for regularly pulling the latest version of a repository. +A `./huggingface/` folder is created at the root of your local directory containing metadata about the downloaded files. This prevents re-downloading files if they're already up-to-date. If the metadata has changed, then the new file version is downloaded. This makes the `local_dir` optimized for pulling only the latest changes. After completing the download, you can safely remove the `.huggingface/` folder if you no longer need it. However, be aware that re-running your script without this folder may result in longer recovery times, as metadata will be lost. Rest assured that your local data will remain intact and unaffected. From 41c8ae39f967f26a5c0a972f684dcc892ca5a729 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Fri, 26 Apr 2024 17:45:43 +0200 Subject: [PATCH 29/40] more robust --- tests/test_utils_http.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_utils_http.py b/tests/test_utils_http.py index be5ceaea14..71262aa1e8 100644 --- a/tests/test_utils_http.py +++ b/tests/test_utils_http.py @@ -202,7 +202,7 @@ def test_get_session_multiple_threads(self): sessions = [None] * N def _get_session_in_thread(index: int) -> None: - time.sleep(0.01) + time.sleep(0.1) sessions[index] = get_session() # Get main thread session From 84f55ffbc16634f6cd00329ae7763799fb967953 Mon Sep 17 00:00:00 2001 From: Lucain Date: Mon, 29 Apr 2024 08:04:35 +0200 Subject: [PATCH 30/40] Apply suggestions from code review Co-authored-by: Pedro Cuenca --- docs/source/en/guides/download.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/guides/download.md b/docs/source/en/guides/download.md index 52bfa35bc9..4d26a90577 100644 --- a/docs/source/en/guides/download.md +++ b/docs/source/en/guides/download.md @@ -130,7 +130,7 @@ files except `vocab.json`. By default, we recommend using the [cache system](./manage-cache) to download files from the Hub. You can specify a custom cache location using the `cache_dir` parameter in [`hf_hub_download`] and [`snapshot_download`], or by setting the [`HF_HOME`](../package_reference/environment_variables#hf_home) environment variable. -However, if you need to download files to a specific folder, you can pass a `local_dir` parameter to the download function. This is useful to get a workflow closer to what `git` commands offer. The downloaded files will maintain their original file structure within the specified folder. For example, if `filename="data/train.csv"` and `local_dir="path/to/folder"`, the resulting filepath will be `"path/to/folder/data/train.csv"`. +However, if you need to download files to a specific folder, you can pass a `local_dir` parameter to the download function. This is useful to get a workflow closer to what the `git` command offers. The downloaded files will maintain their original file structure within the specified folder. For example, if `filename="data/train.csv"` and `local_dir="path/to/folder"`, the resulting filepath will be `"path/to/folder/data/train.csv"`. A `./huggingface/` folder is created at the root of your local directory containing metadata about the downloaded files. This prevents re-downloading files if they're already up-to-date. If the metadata has changed, then the new file version is downloaded. This makes the `local_dir` optimized for pulling only the latest changes. From f5d1faa818e317036aeafc326bf39d9c461fabb0 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Mon, 29 Apr 2024 08:06:32 +0200 Subject: [PATCH 31/40] comment --- src/huggingface_hub/commands/download.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/huggingface_hub/commands/download.py b/src/huggingface_hub/commands/download.py index d5bf0d6e41..a34b9db20e 100644 --- a/src/huggingface_hub/commands/download.py +++ b/src/huggingface_hub/commands/download.py @@ -84,8 +84,7 @@ def register_subcommand(parser: _SubParsersAction): "--local-dir", type=str, help=( - "If set, the downloaded file will be placed under this directory either as a symlink (default) or a" - " regular file. Check out" + "If set, the downloaded file will be placed under this directory. Check out" " https://huggingface.co/docs/huggingface_hub/guides/download#download-files-to-local-folder for more" " details." ), From edc97904cd17c91bca0e850c4e10ef9c7978d067 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Mon, 29 Apr 2024 08:08:47 +0200 Subject: [PATCH 32/40] commen --- src/huggingface_hub/commands/download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/huggingface_hub/commands/download.py b/src/huggingface_hub/commands/download.py index a34b9db20e..d7b8760a83 100644 --- a/src/huggingface_hub/commands/download.py +++ b/src/huggingface_hub/commands/download.py @@ -102,7 +102,7 @@ def register_subcommand(parser: _SubParsersAction): download_parser.add_argument( "--resume-download", action="store_true", - help="Deprecated and ignored. Downloading a file to local dir always attempt to resume previously interrupted download.", + help="Deprecated and ignored. Downloading a file to local dir always attempts to resume previously interrupted downloads (unless hf-transfer is enabled).", ) download_parser.add_argument( "--token", type=str, help="A User Access Token generated from https://huggingface.co/settings/tokens" From d0ea3eae43ff1f85514a067408783203eec6c682 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Mon, 29 Apr 2024 08:29:16 +0200 Subject: [PATCH 33/40] robust tests --- tests/test_local_folder.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_local_folder.py b/tests/test_local_folder.py index 286ca09c1b..f17f42767a 100644 --- a/tests/test_local_folder.py +++ b/tests/test_local_folder.py @@ -94,6 +94,8 @@ def test_write_download_metadata(tmp_path: Path): assert timestamp < time.time() # in the past assert timestamp > time.time() - 1 # but less than 1 seconds ago (we're not that slow) + time.sleep(0.2) # for deterministic tests + # Overwriting works as expected write_download_metadata(tmp_path, filename="file.txt", commit_hash="commit_hash2", etag="987654321") with metadata_path.open() as f: @@ -151,7 +153,7 @@ def test_read_download_metadata_correct_metadata_but_outdated(tmp_path: Path): """Test reading download metadata when metadata is correct but outdated.""" # Write correct metadata write_download_metadata(tmp_path, filename="file.txt", commit_hash="commit_hash", etag="123456789") - time.sleep(0.05) # for deterministic tests + time.sleep(0.2) # for deterministic tests # File is outdated => return None (tmp_path / "file.txt").write_text("content") From dffa539fb0883c8de03421b02c051f504f1feba1 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Mon, 29 Apr 2024 08:44:09 +0200 Subject: [PATCH 34/40] fix CI --- tests/test_hf_api.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/test_hf_api.py b/tests/test_hf_api.py index ad4529a0d9..334b68d9ea 100644 --- a/tests/test_hf_api.py +++ b/tests/test_hf_api.py @@ -1783,11 +1783,13 @@ def test_filter_models_with_library(self): @expect_deprecation("ModelFilter") def test_filter_models_with_task(self): models = list(self._api.list_models(filter=ModelFilter(task="fill-mask", model_name="albert-base-v2"))) - self.assertTrue("fill-mask" == models[0].pipeline_tag) - self.assertTrue("albert-base-v2" in models[0].modelId) + assert models[0].pipeline_tag == "fill-mask" + assert "albert" in models[0].modelId + assert "base" in models[0].modelId + assert "v2" in models[0].modelId models = list(self._api.list_models(filter=ModelFilter(task="dummytask"))) - self.assertEqual(len(models), 0) + assert len(models) == 0 @expect_deprecation("ModelFilter") def test_filter_models_by_language(self): From d414825ea6e38b5c24eaea640d1f1879a3e8f44c Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Mon, 29 Apr 2024 09:25:47 +0200 Subject: [PATCH 35/40] ez --- tests/test_local_folder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_local_folder.py b/tests/test_local_folder.py index f17f42767a..86ba3819a0 100644 --- a/tests/test_local_folder.py +++ b/tests/test_local_folder.py @@ -102,7 +102,7 @@ def test_write_download_metadata(tmp_path: Path): assert f.readline() == "commit_hash2\n" assert f.readline() == "987654321\n" timestamp2 = float(f.readline().strip()) - assert timestamp < timestamp2 # updated timestamp + assert timestamp <= timestamp2 # updated timestamp def test_read_download_metadata_valid_metadata(tmp_path: Path): From 9e6d569142365528d4a9a24c1935fc11c03e97af Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Mon, 29 Apr 2024 09:45:49 +0200 Subject: [PATCH 36/40] more ribust? --- tests/test_local_folder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_local_folder.py b/tests/test_local_folder.py index 86ba3819a0..bc525fb65c 100644 --- a/tests/test_local_folder.py +++ b/tests/test_local_folder.py @@ -91,8 +91,8 @@ def test_write_download_metadata(tmp_path: Path): assert f.readline() == "commit_hash\n" assert f.readline() == "123456789\n" timestamp = float(f.readline().strip()) - assert timestamp < time.time() # in the past - assert timestamp > time.time() - 1 # but less than 1 seconds ago (we're not that slow) + assert timestamp <= time.time() # in the past + assert timestamp >= time.time() - 1 # but less than 1 seconds ago (we're not that slow) time.sleep(0.2) # for deterministic tests From e6fe766ddeb1f101544a2023bb7e033965314d8c Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Mon, 29 Apr 2024 10:10:09 +0200 Subject: [PATCH 37/40] allow for 1s diff --- src/huggingface_hub/_local_folder.py | 2 +- tests/test_local_folder.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/huggingface_hub/_local_folder.py b/src/huggingface_hub/_local_folder.py index 6e93db228e..2251563898 100644 --- a/src/huggingface_hub/_local_folder.py +++ b/src/huggingface_hub/_local_folder.py @@ -184,7 +184,7 @@ def read_download_metadata(local_dir: Path, filename: str) -> Optional[LocalDown try: # check if the file exists and hasn't been modified since the metadata was saved stat = paths.file_path.stat() - if stat.st_mtime <= metadata.timestamp: + if stat.st_mtime - 1 <= metadata.timestamp: # allow 1s difference as stat.st_mtime might not be precise return metadata logger.info(f"Ignored metadata for '{filename}' (outdated). Will re-compute hash.") except FileNotFoundError: diff --git a/tests/test_local_folder.py b/tests/test_local_folder.py index bc525fb65c..96feb3bcb3 100644 --- a/tests/test_local_folder.py +++ b/tests/test_local_folder.py @@ -153,7 +153,7 @@ def test_read_download_metadata_correct_metadata_but_outdated(tmp_path: Path): """Test reading download metadata when metadata is correct but outdated.""" # Write correct metadata write_download_metadata(tmp_path, filename="file.txt", commit_hash="commit_hash", etag="123456789") - time.sleep(0.2) # for deterministic tests + time.sleep(2) # We allow for a 1s difference in practice, so let's wait a bit # File is outdated => return None (tmp_path / "file.txt").write_text("content") From 28991c92f8d5b60a57c3e684a76fa804d0762f16 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Mon, 29 Apr 2024 10:26:16 +0200 Subject: [PATCH 38/40] don't raise on unlink --- src/huggingface_hub/_local_folder.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/huggingface_hub/_local_folder.py b/src/huggingface_hub/_local_folder.py index 2251563898..7651402d62 100644 --- a/src/huggingface_hub/_local_folder.py +++ b/src/huggingface_hub/_local_folder.py @@ -184,7 +184,9 @@ def read_download_metadata(local_dir: Path, filename: str) -> Optional[LocalDown try: # check if the file exists and hasn't been modified since the metadata was saved stat = paths.file_path.stat() - if stat.st_mtime - 1 <= metadata.timestamp: # allow 1s difference as stat.st_mtime might not be precise + if ( + stat.st_mtime - 1 <= metadata.timestamp + ): # allow 1s difference as stat.st_mtime might not be precise return metadata logger.info(f"Ignored metadata for '{filename}' (outdated). Will re-compute hash.") except FileNotFoundError: @@ -220,5 +222,8 @@ def _huggingface_dir(local_dir: Path) -> Path: if not gitignore.exists(): with WeakFileLock(gitignore_lock): gitignore.write_text("*") - gitignore_lock.unlink(missing_ok=True) + try: + gitignore_lock.unlink() + except OSError: # FileNotFoundError, PermissionError, etc. + pass return path From a0b61a1c2fa3d587755ef8d002bb24ae551ef912 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Mon, 29 Apr 2024 10:26:28 +0200 Subject: [PATCH 39/40] style --- src/huggingface_hub/_local_folder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/huggingface_hub/_local_folder.py b/src/huggingface_hub/_local_folder.py index 7651402d62..40b66e1c11 100644 --- a/src/huggingface_hub/_local_folder.py +++ b/src/huggingface_hub/_local_folder.py @@ -224,6 +224,6 @@ def _huggingface_dir(local_dir: Path) -> Path: gitignore.write_text("*") try: gitignore_lock.unlink() - except OSError: # FileNotFoundError, PermissionError, etc. + except OSError: # FileNotFoundError, PermissionError, etc. pass return path From fccabe02b56027007afa1eef9e5a29c5c3d0a07f Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Mon, 29 Apr 2024 10:40:23 +0200 Subject: [PATCH 40/40] robustenss --- tests/test_webhooks_server.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_webhooks_server.py b/tests/test_webhooks_server.py index f35efa278d..8284b14bea 100644 --- a/tests/test_webhooks_server.py +++ b/tests/test_webhooks_server.py @@ -232,8 +232,9 @@ def test_run_print_instructions(self): self.mocked_run_app() instructions = output.getvalue() - self.assertIn("Webhooks are correctly setup and ready to use:", instructions) - self.assertIn("- POST http://127.0.0.1:7860/webhooks/test_webhook", instructions) + assert "Webhooks are correctly setup and ready to use:" in instructions + assert "- POST http://127.0.0.1:" in instructions # port is usually 7860 but can be dynamic + assert "/webhooks/test_webhook" in instructions def test_run_parse_payload(self): """Test that the payload is correctly parsed when running the app."""