From c468bc3cf4085fa5e9eb59e6a796a16b09b51635 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Sun, 3 Sep 2023 05:37:23 -0400 Subject: [PATCH] cache Link parsing and interpreter compatibility checking --- src/pip/_internal/cache.py | 26 +++- src/pip/_internal/index/package_finder.py | 160 +++++++++++++++++++++- src/pip/_internal/models/link.py | 58 ++++++++ 3 files changed, 236 insertions(+), 8 deletions(-) diff --git a/src/pip/_internal/cache.py b/src/pip/_internal/cache.py index 4c66a728925..35d159a1bc8 100644 --- a/src/pip/_internal/cache.py +++ b/src/pip/_internal/cache.py @@ -7,7 +7,7 @@ import logging import os from pathlib import Path -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Type from pip._vendor.packaging.tags import Tag, interpreter_name, interpreter_version from pip._vendor.packaging.utils import canonicalize_name @@ -95,6 +95,17 @@ def get_path_for_link(self, link: Link) -> str: return os.path.join(self.cache_dir, "link-metadata", *parts) +class SerializableEntry(abc.ABC): + @classmethod + @abc.abstractmethod + def suffix(cls) -> str: + ... + + @abc.abstractmethod + def serialize(self) -> Dict[str, Any]: + ... + + class FetchResolveCache(Cache): def get_path_for_link(self, link: Link) -> str: # We are reading index links to extract other links from, not executing any @@ -103,6 +114,19 @@ def get_path_for_link(self, link: Link) -> str: assert self.cache_dir return os.path.join(self.cache_dir, "fetch-resolve", *parts) + def hashed_entry_path(self, link: Link, entry: SerializableEntry) -> Path: + hashed = _hash_dict(entry.serialize()) + return self.cache_path(link) / f"{hashed}{entry.suffix()}" + + def clear_hashed_entries( + self, link: Link, entry_type: Type[SerializableEntry] + ) -> None: + for hashed_entry in self.cache_path(link).glob(f"*{entry_type.suffix()}"): + logger.debug( + "unlinking invalidated hashed link eval cache entry %s", hashed_entry + ) + hashed_entry.unlink() + class WheelCacheBase(Cache): """Specializations to the cache concept for wheels.""" diff --git a/src/pip/_internal/index/package_finder.py b/src/pip/_internal/index/package_finder.py index d84210307f2..14588dca411 100644 --- a/src/pip/_internal/index/package_finder.py +++ b/src/pip/_internal/index/package_finder.py @@ -3,6 +3,7 @@ import enum import functools import itertools +import json import logging import os import re @@ -10,6 +11,7 @@ from pathlib import Path from typing import ( TYPE_CHECKING, + Any, Dict, FrozenSet, Iterable, @@ -26,7 +28,7 @@ from pip._vendor.packaging.version import _BaseVersion from pip._vendor.packaging.version import parse as parse_version -from pip._internal.cache import FetchResolveCache +from pip._internal.cache import FetchResolveCache, SerializableEntry from pip._internal.exceptions import ( BestVersionAlreadyInstalled, DistributionNotFound, @@ -36,7 +38,7 @@ from pip._internal.index.collector import IndexContent, LinkCollector, parse_links from pip._internal.models.candidate import InstallationCandidate from pip._internal.models.format_control import FormatControl -from pip._internal.models.link import Link +from pip._internal.models.link import Link, PersistentLinkCacheArgs from pip._internal.models.search_scope import SearchScope from pip._internal.models.selection_prefs import SelectionPreferences from pip._internal.models.target_python import TargetPython @@ -119,14 +121,41 @@ class LinkType(enum.Enum): requires_python_mismatch = enum.auto() -class 
LinkEvaluator: +class LinkEvaluator(SerializableEntry): """ Responsible for evaluating links for a particular project. """ + @classmethod + def suffix(cls) -> str: + return ".evaluation" + _py_version_re = re.compile(r"-py([123]\.?[0-9]?)$") + def serialize(self) -> Dict[str, Any]: + return dict( + project_name=self.project_name, + canonical_name=self._canonical_name, + # Sort these for determinism. + formats=sorted(self._formats), + target_python=self._target_python.format_given(), + allow_yanked=self._allow_yanked, + ignore_requires_python=self._ignore_requires_python, + ) + + def to_json(self) -> str: + return json.dumps(self.serialize(), sort_keys=True) + + def __eq__(self, other: Any) -> bool: + return isinstance(other, type(self)) and self.to_json() == other.to_json() + + def __ne__(self, other: Any) -> bool: + return not self == other + + def __hash__(self) -> int: + return hash(self.to_json()) + # Don't include an allow_yanked default value to make sure each call # site considers whether yanked releases are allowed. This also causes # that decision to be made explicit in the calling code, which helps @@ -900,6 +929,91 @@ def _write_http_cache_info( return (new_etag, new_date, new_checksum, page_unmodified) + @staticmethod + def _try_load_parsed_links_cache(parsed_links_path: Path) -> Optional[List[Link]]: + page_links: Optional[List[Link]] = None + try: + with parsed_links_path.open("r") as f: + logger.debug("reading page links from cache %s", parsed_links_path) + cached_links = json.load(f) + page_links = [] + for cache_info in cached_links: + link = Link.from_cache_args( + PersistentLinkCacheArgs.from_json(cache_info) + ) + assert link is not None + page_links.append(link) + except (OSError, json.decoder.JSONDecodeError, KeyError) as e: + logger.debug( + "could not read page links from cache file %s %s(%s)", + parsed_links_path, + e.__class__.__name__, + str(e), + ) + return page_links + + @staticmethod + def _write_parsed_links_cache( + parsed_links_path: Path, links: Iterable[Link] + ) -> List[Link]: + cacheable_links: List[Dict[str, Any]] = [] + page_links: List[Link] = [] + for link in links: + cache_info = link.cache_args() + assert cache_info is not None + cacheable_links.append(cache_info.to_json()) + page_links.append(link) + + logger.debug("writing page links to %s", parsed_links_path) + with parsed_links_path.open("w") as f: + json.dump(cacheable_links, f) + + return page_links + + @staticmethod + def _try_load_installation_candidate_cache( + cached_candidates_path: Path, + ) -> Optional[List[InstallationCandidate]]: + try: + with cached_candidates_path.open("r") as f: + serialized_candidates = json.load(f) + logger.debug("read serialized candidates from %s", cached_candidates_path) + package_links: List[InstallationCandidate] = [] + for cand in serialized_candidates: + link_cache_args = PersistentLinkCacheArgs.from_json(cand["link"]) + link = Link.from_cache_args(link_cache_args) + package_links.append( + InstallationCandidate(cand["name"], cand["version"], link) + ) + return package_links + except (OSError, json.decoder.JSONDecodeError, KeyError) as e: + logger.debug( + "could not read cached candidates at %s %s(%s)", + cached_candidates_path, + e.__class__.__name__, + str(e), + ) + return None + + @staticmethod + def _write_installation_candidate_cache( + cached_candidates_path: Path, + candidates: Iterable[InstallationCandidate], + ) -> List[InstallationCandidate]: + candidates = list(candidates) + serialized_candidates = [ + dict( + name=candidate.name, + 
version=str(candidate.version), + link=candidate.link.cache_args().to_json(), + ) + for candidate in candidates + ] + with cached_candidates_path.open("w") as f: + logger.debug("writing serialized candidates to %s", f.name) + json.dump(serialized_candidates, f) + return candidates + def _process_project_url_uncached( self, project_url: Link, link_evaluator: LinkEvaluator ) -> List[InstallationCandidate]: @@ -926,6 +1040,10 @@ def process_project_url( etag_path = cached_path / "etag" date_path = cached_path / "modified-since-date" checksum_path = cached_path / "checksum" + parsed_links_path = cached_path / "parsed-links" + cached_candidates_path = self._fetch_resolve_cache.hashed_entry_path( + project_url, link_evaluator + ) headers: Dict[str, str] = {} # NB: mutates headers! @@ -962,12 +1080,40 @@ def process_project_url( prev_checksum=prev_checksum, ) - page_links = parse_links(index_response) + page_links: Optional[List[Link]] = None + # Only try our persistent link parsing and evaluation caches if we know the page + # was unmodified via checksum. + if page_unmodified: + cached_candidates = self._try_load_installation_candidate_cache( + cached_candidates_path + ) + if cached_candidates is not None: + return cached_candidates + + page_links = self._try_load_parsed_links_cache(parsed_links_path) + else: + try: + parsed_links_path.unlink() + except OSError: + pass + self._fetch_resolve_cache.clear_hashed_entries(project_url, LinkEvaluator) + + if page_links is None: + logger.debug( + "extracting new parsed links from index response %s", index_response + ) + page_links = self._write_parsed_links_cache( + parsed_links_path, + parse_links(index_response), + ) with indent_log(): - package_links = self.evaluate_links( - link_evaluator, - links=page_links, + package_links = self._write_installation_candidate_cache( + cached_candidates_path, + self.evaluate_links( + link_evaluator, + links=page_links, + ), ) return package_links diff --git a/src/pip/_internal/models/link.py b/src/pip/_internal/models/link.py index 80b7ad67557..0a54a527075 100644 --- a/src/pip/_internal/models/link.py +++ b/src/pip/_internal/models/link.py @@ -179,6 +179,43 @@ def _ensure_quoted_url(url: str) -> str: return urllib.parse.urlunparse(result._replace(path=path)) +@dataclass(frozen=True) +class PersistentLinkCacheArgs: + url: str + comes_from: Optional[str] = None + requires_python: Optional[str] = None + yanked_reason: Optional[str] = None + metadata_file_data: Optional[MetadataFile] = None + hashes: Optional[Mapping[str, str]] = None + + def to_json(self) -> Dict[str, Any]: + return dict( + url=self.url, + comes_from=self.comes_from, + requires_python=self.requires_python, + yanked_reason=self.yanked_reason, + metadata_file_data=( + self.metadata_file_data.hashes if self.metadata_file_data else None + ), + hashes=self.hashes, + ) + + @classmethod + def from_json(cls, cache_info: Dict[str, Any]) -> "PersistentLinkCacheArgs": + return cls( + url=cache_info["url"], + comes_from=cache_info["comes_from"], + requires_python=cache_info["requires_python"], + yanked_reason=cache_info["yanked_reason"], + metadata_file_data=( + MetadataFile(hashes=cache_info["metadata_file_data"]) + if cache_info["metadata_file_data"] + else None + ), + hashes=cache_info["hashes"], + ) + + class Link(KeyBasedCompareMixin): """Represents a parsed link from a Package Index's simple URL""" @@ -305,6 +342,27 @@ def from_json( metadata_file_data=metadata_file_data, ) + def cache_args(self) -> PersistentLinkCacheArgs: + return PersistentLinkCacheArgs( 
+ url=self.url, + comes_from=(str(self.comes_from) if self.comes_from else None), + requires_python=self.requires_python, + yanked_reason=self.yanked_reason, + metadata_file_data=self.metadata_file_data, + hashes=self._hashes, + ) + + @classmethod + def from_cache_args(cls, args: PersistentLinkCacheArgs) -> "Link": + return cls( + args.url, + comes_from=args.comes_from, + requires_python=args.requires_python, + yanked_reason=args.yanked_reason, + metadata_file_data=args.metadata_file_data, + hashes=args.hashes, + ) + @classmethod def from_element( cls,
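
A minimal standalone sketch of the hashed-entry scheme the patch introduces (one cache file per distinct evaluator configuration, keyed by a stable hash of its serialized form). `hash_dict`, `FakeEvaluator`, `hashed_entry_path`, and `cache_root` below are hypothetical stand-ins for illustration only, not the pip internals added above; pip's own `_hash_dict`, `SerializableEntry`, and `FetchResolveCache.hashed_entry_path` differ in detail.

# Illustrative sketch only; simplified stand-ins, not pip internals.
import hashlib
import json
from pathlib import Path
from typing import Any, Dict, List


def hash_dict(d: Dict[str, Any]) -> str:
    # Stable digest over a JSON-serializable dict (keys sorted for determinism).
    return hashlib.sha256(json.dumps(d, sort_keys=True).encode()).hexdigest()


class FakeEvaluator:
    """Stand-in for a SerializableEntry-style evaluator configuration."""

    def __init__(self, project_name: str, formats: List[str]) -> None:
        self.project_name = project_name
        self.formats = formats

    @classmethod
    def suffix(cls) -> str:
        return ".evaluation"

    def serialize(self) -> Dict[str, Any]:
        # Sort list-valued fields so equal configurations hash identically.
        return {"project_name": self.project_name, "formats": sorted(self.formats)}


def hashed_entry_path(cache_root: Path, entry: FakeEvaluator) -> Path:
    # Same shape of idea as the patch: each evaluator configuration gets its own
    # cache file, so changing e.g. allowed formats invalidates only the matching
    # candidate cache while the shared parsed-links cache stays usable.
    return cache_root / f"{hash_dict(entry.serialize())}{entry.suffix()}"


if __name__ == "__main__":
    root = Path("/tmp/fetch-resolve-demo")
    root.mkdir(parents=True, exist_ok=True)
    entry = FakeEvaluator("sampleproject", ["binary", "source"])
    path = hashed_entry_path(root, entry)
    path.write_text(json.dumps([{"name": "sampleproject", "version": "3.0.0"}]))
    print(path, json.loads(path.read_text()))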