diff --git a/src/pip/_internal/cache.py b/src/pip/_internal/cache.py
index bed8b1b68e4..f9630375d4b 100644
--- a/src/pip/_internal/cache.py
+++ b/src/pip/_internal/cache.py
@@ -8,7 +8,7 @@
 import os
 import re
 from pathlib import Path
-from typing import Dict, Iterator, List, Optional, Tuple
+from typing import Any, Dict, Iterator, List, Optional, Tuple, Type
 
 from pip._vendor.packaging.tags import Tag, interpreter_name, interpreter_version
 from pip._vendor.packaging.utils import canonicalize_name
@@ -146,6 +146,15 @@ def get_path_for_link(self, link: Link) -> str:
         return os.path.join(self.cache_dir, "link-metadata", *parts)
 
 
+class SerializableEntry(abc.ABC):
+    @classmethod
+    @abc.abstractmethod
+    def suffix(cls) -> str: ...
+
+    @abc.abstractmethod
+    def serialize(self) -> Dict[str, Any]: ...
+
+
 class FetchResolveCache(Cache):
     def get_path_for_link(self, link: Link) -> str:
         # We are reading index links to extract other links from, not executing any
@@ -154,6 +163,19 @@ def get_path_for_link(self, link: Link) -> str:
         assert self.cache_dir
         return os.path.join(self.cache_dir, "fetch-resolve", *parts)
 
+    def hashed_entry_path(self, link: Link, entry: SerializableEntry) -> Path:
+        hashed = _hash_dict(entry.serialize())
+        return self.cache_path(link) / f"{hashed}{entry.suffix()}"
+
+    def clear_hashed_entries(
+        self, link: Link, entry_type: Type[SerializableEntry]
+    ) -> None:
+        for hashed_entry in self.cache_path(link).glob(f"*{entry_type.suffix()}"):
+            logger.debug(
+                "unlinking invalidated hashed link eval cache entry %s", hashed_entry
+            )
+            hashed_entry.unlink()
+
 
 class WheelCacheBase(Cache):
     """Specializations to the cache concept for wheels."""
diff --git a/src/pip/_internal/index/package_finder.py b/src/pip/_internal/index/package_finder.py
index 439dacf3e92..092191197c7 100644
--- a/src/pip/_internal/index/package_finder.py
+++ b/src/pip/_internal/index/package_finder.py
@@ -1,10 +1,12 @@
 """Routines related to PyPI, indexes"""
 
 import binascii
+import bz2
 import datetime
 import enum
 import functools
 import itertools
+import json
 import logging
 import os
 import re
@@ -14,6 +16,8 @@
 from pathlib import Path
 from typing import (
     TYPE_CHECKING,
+    Any,
+    Callable,
     Dict,
     FrozenSet,
     Iterable,
@@ -30,7 +34,7 @@
 from pip._vendor.packaging.version import InvalidVersion, _BaseVersion
 from pip._vendor.packaging.version import parse as parse_version
 
-from pip._internal.cache import FetchResolveCache
+from pip._internal.cache import FetchResolveCache, SerializableEntry
 from pip._internal.exceptions import (
     BestVersionAlreadyInstalled,
     DistributionNotFound,
@@ -40,7 +44,7 @@
 from pip._internal.index.collector import IndexContent, LinkCollector, parse_links
 from pip._internal.models.candidate import InstallationCandidate
 from pip._internal.models.format_control import FormatControl
-from pip._internal.models.link import Link
+from pip._internal.models.link import Link, PersistentLinkCacheArgs
 from pip._internal.models.search_scope import SearchScope
 from pip._internal.models.selection_prefs import SelectionPreferences
 from pip._internal.models.target_python import TargetPython
@@ -123,13 +127,28 @@ class LinkType(enum.Enum):
     requires_python_mismatch = enum.auto()
 
 
-class LinkEvaluator:
+class LinkEvaluator(SerializableEntry):
     """
     Responsible for evaluating links for a particular project.
     """
 
+    @classmethod
+    def suffix(cls) -> str:
+        return ".evaluation"
+
     _py_version_re = re.compile(r"-py([123]\.?[0-9]?)$")
 
+    def serialize(self) -> Dict[str, Any]:
+        return {
+            "project_name": self.project_name,
+            "canonical_name": self._canonical_name,
+            # Sort these for determinism.
+            "formats": sorted(self._formats),
+            "target_python": self._target_python.format_given(),
+            "allow_yanked": self._allow_yanked,
+            "ignore_requires_python": self._ignore_requires_python,
+        }
+
     # Don't include an allow_yanked default value to make sure each call
     # site considers whether yanked releases are allowed. This also causes
     # that decision to be made explicit in the calling code, which helps
@@ -594,6 +613,19 @@ def compute_best_candidate(
     )
 
 
+_FindCandidates = Callable[["PackageFinder", str], List[InstallationCandidate]]
+
+
+def _canonicalize_arg(func: _FindCandidates) -> _FindCandidates:
+    @functools.wraps(func)
+    def wrapper(
+        self: "PackageFinder", project_name: str
+    ) -> List[InstallationCandidate]:
+        return func(self, canonicalize_name(project_name))
+
+    return wrapper
+
+
 class PackageFinder:
     """This finds packages.
 
@@ -954,6 +986,91 @@ def _write_http_cache_info(
 
         return (new_etag, new_date, new_checksum, page_unmodified)
 
+    @staticmethod
+    def _try_load_parsed_links_cache(parsed_links_path: Path) -> Optional[List[Link]]:
+        page_links: Optional[List[Link]] = None
+        try:
+            with bz2.open(parsed_links_path, mode="rt", encoding="utf-8") as f:
+                logger.debug("reading page links from cache %s", parsed_links_path)
+                cached_links = json.load(f)
+            page_links = []
+            for cache_info in cached_links:
+                link = Link.from_cache_args(
+                    PersistentLinkCacheArgs.from_json(cache_info)
+                )
+                assert link is not None
+                page_links.append(link)
+        except (OSError, json.decoder.JSONDecodeError, KeyError) as e:
+            logger.debug(
+                "could not read page links from cache file %s %s(%s)",
+                parsed_links_path,
+                e.__class__.__name__,
+                str(e),
+            )
+        return page_links
+
+    @staticmethod
+    def _write_parsed_links_cache(
+        parsed_links_path: Path, links: Iterable[Link]
+    ) -> List[Link]:
+        cacheable_links: List[Dict[str, Any]] = []
+        page_links: List[Link] = []
+        for link in links:
+            cache_info = link.cache_args()
+            assert cache_info is not None
+            cacheable_links.append(cache_info.to_json())
+            page_links.append(link)
+
+        logger.debug("writing page links to %s", parsed_links_path)
+        with bz2.open(parsed_links_path, mode="wt", encoding="utf-8") as f:
+            json.dump(cacheable_links, f)
+
+        return page_links
+
+    @staticmethod
+    def _try_load_installation_candidate_cache(
+        cached_candidates_path: Path,
+    ) -> Optional[List[InstallationCandidate]]:
+        try:
+            with bz2.open(cached_candidates_path, mode="rt", encoding="utf-8") as f:
+                serialized_candidates = json.load(f)
+            logger.debug("read serialized candidates from %s", cached_candidates_path)
+            package_links: List[InstallationCandidate] = []
+            for cand in serialized_candidates:
+                link_cache_args = PersistentLinkCacheArgs.from_json(cand["link"])
+                link = Link.from_cache_args(link_cache_args)
+                package_links.append(
+                    InstallationCandidate(cand["name"], cand["version"], link)
+                )
+            return package_links
+        except (OSError, json.decoder.JSONDecodeError, KeyError) as e:
+            logger.debug(
+                "could not read cached candidates at %s %s(%s)",
+                cached_candidates_path,
+                e.__class__.__name__,
+                str(e),
+            )
+            return None
+
+    @staticmethod
+    def _write_installation_candidate_cache(
+        cached_candidates_path: Path,
+        candidates: Iterable[InstallationCandidate],
+    ) -> List[InstallationCandidate]:
+        candidates = list(candidates)
+        serialized_candidates = [
+            {
+                "name": candidate.name,
+                "version": str(candidate.version),
+                "link": candidate.link.cache_args().to_json(),
+            }
+            for candidate in candidates
+        ]
+        with bz2.open(cached_candidates_path, mode="wt", encoding="utf-8") as f:
+            logger.debug("writing serialized candidates to %s", cached_candidates_path)
+            json.dump(serialized_candidates, f)
+        return candidates
+
     def _process_project_url_uncached(
         self, project_url: Link, link_evaluator: LinkEvaluator
     ) -> List[InstallationCandidate]:
@@ -972,7 +1089,6 @@ def _process_project_url_uncached(
         package_links = self.evaluate_links(link_evaluator, links=page_links)
         return package_links
 
-    @functools.lru_cache(maxsize=None)
     def process_project_url(
         self, project_url: Link, link_evaluator: LinkEvaluator
     ) -> List[InstallationCandidate]:
@@ -985,6 +1101,10 @@ def process_project_url(
         etag_path = cached_path / "etag"
         date_path = cached_path / "modified-since-date"
         checksum_path = cached_path / "checksum"
+        parsed_links_path = cached_path / "parsed-links"
+        cached_candidates_path = self._fetch_resolve_cache.hashed_entry_path(
+            project_url, link_evaluator
+        )
 
         headers: Dict[str, str] = {}
         # NB: mutates headers!
@@ -1021,16 +1141,45 @@ def process_project_url(
             prev_checksum=prev_checksum,
         )
 
-        page_links = parse_links(index_response)
+        page_links: Optional[List[Link]] = None
+        # Only try our persistent link parsing and evaluation caches if we know the page
+        # was unmodified via checksum.
+        if page_unmodified:
+            cached_candidates = self._try_load_installation_candidate_cache(
+                cached_candidates_path
+            )
+            if cached_candidates is not None:
+                return cached_candidates
+
+            page_links = self._try_load_parsed_links_cache(parsed_links_path)
+        else:
+            try:
+                parsed_links_path.unlink()
+            except OSError:
+                pass
+            self._fetch_resolve_cache.clear_hashed_entries(project_url, LinkEvaluator)
+
+        if page_links is None:
+            logger.debug(
+                "extracting new parsed links from index response %s", index_response
+            )
+            page_links = self._write_parsed_links_cache(
+                parsed_links_path,
+                parse_links(index_response),
+            )
 
         with indent_log():
-            package_links = self.evaluate_links(
-                link_evaluator,
-                links=page_links,
+            package_links = self._write_installation_candidate_cache(
+                cached_candidates_path,
+                self.evaluate_links(
+                    link_evaluator,
+                    links=page_links,
+                ),
             )
 
         return package_links
 
+    @_canonicalize_arg
     @functools.lru_cache(maxsize=None)
     def find_all_candidates(self, project_name: str) -> List[InstallationCandidate]:
         """Find all available InstallationCandidate for project_name
diff --git a/src/pip/_internal/models/link.py b/src/pip/_internal/models/link.py
index d60dfb96403..3a0b563bd64 100644
--- a/src/pip/_internal/models/link.py
+++ b/src/pip/_internal/models/link.py
@@ -178,6 +178,43 @@ def _ensure_quoted_url(url: str) -> str:
     return urllib.parse.urlunparse(result._replace(path=path))
 
 
+@dataclass(frozen=True)
+class PersistentLinkCacheArgs:
+    url: str
+    comes_from: Optional[str] = None
+    requires_python: Optional[str] = None
+    yanked_reason: Optional[str] = None
+    metadata_file_data: Optional[MetadataFile] = None
+    hashes: Optional[Mapping[str, str]] = None
+
+    def to_json(self) -> Dict[str, Any]:
+        return {
+            "url": self.url,
+            "comes_from": self.comes_from,
+            "requires_python": self.requires_python,
+            "yanked_reason": self.yanked_reason,
+            "metadata_file_data": (
+                self.metadata_file_data.hashes if self.metadata_file_data else None
+            ),
+            "hashes": self.hashes,
+        }
+
+    @classmethod
+    def from_json(cls, cache_info: Dict[str, Any]) -> "PersistentLinkCacheArgs":
+        return cls(
+            url=cache_info["url"],
+            comes_from=cache_info["comes_from"],
+            requires_python=cache_info["requires_python"],
+            yanked_reason=cache_info["yanked_reason"],
+            metadata_file_data=(
+                MetadataFile(hashes=cache_info["metadata_file_data"])
+                if cache_info["metadata_file_data"]
+                else None
+            ),
+            hashes=cache_info["hashes"],
+        )
+
+
 @functools.total_ordering
 class Link:
     """Represents a parsed link from a Package Index's simple URL"""
@@ -303,6 +340,27 @@ def from_json(
             metadata_file_data=metadata_file_data,
         )
 
+    def cache_args(self) -> PersistentLinkCacheArgs:
+        return PersistentLinkCacheArgs(
+            url=self.url,
+            comes_from=(str(self.comes_from) if self.comes_from else None),
+            requires_python=self.requires_python,
+            yanked_reason=self.yanked_reason,
+            metadata_file_data=self.metadata_file_data,
+            hashes=self._hashes,
+        )
+
+    @classmethod
+    def from_cache_args(cls, args: PersistentLinkCacheArgs) -> "Link":
+        return cls(
+            args.url,
+            comes_from=args.comes_from,
+            requires_python=args.requires_python,
+            yanked_reason=args.yanked_reason,
+            metadata_file_data=args.metadata_file_data,
+            hashes=args.hashes,
+        )
+
     @classmethod
     def from_element(
         cls,
diff --git a/tests/unit/test_collector.py b/tests/unit/test_collector.py
index cd68f9f536c..36a9d9762ef 100644
--- a/tests/unit/test_collector.py
+++ b/tests/unit/test_collector.py
@@ -2,7 +2,6 @@
 import json
 import logging
 import os
-import re
 import uuid
 from pathlib import Path
 from textwrap import dedent
@@ -676,55 +675,6 @@ def test_parse_links__metadata_file_data(
     assert link._hashes == hashes
 
 
-def test_parse_links_caches_same_page_by_url() -> None:
-    raise Exception("todo!")
-    html = (
-        ""
-        ''
-        ''
-    )
-    html_bytes = html.encode("utf-8")
-
-    url = "https://example.com/simple/"
-
-    page_1 = IndexContent(
-        html_bytes,
-        "text/html",
-        encoding=None,
-        url=url,
-    )
-    # Make a second page with zero content, to ensure that it's not accessed,
-    # because the page was cached by url.
-    page_2 = IndexContent(
-        b"",
-        "text/html",
-        encoding=None,
-        url=url,
-    )
-    # Make a third page which represents an index url, which should not be
-    # cached, even for the same url. We modify the page content slightly to
-    # verify that the result is not cached.
-    page_3 = IndexContent(
-        re.sub(b"pkg1", b"pkg2", html_bytes),
-        "text/html",
-        encoding=None,
-        url=url,
-        # cache_link_parsing=False,
-    )
-
-    parsed_links_1 = list(parse_links(page_1))
-    assert len(parsed_links_1) == 1
-    assert "pkg1" in parsed_links_1[0].url
-
-    parsed_links_2 = list(parse_links(page_2))
-    assert parsed_links_2 == parsed_links_1
-
-    parsed_links_3 = list(parse_links(page_3))
-    assert len(parsed_links_3) == 1
-    assert parsed_links_3 != parsed_links_1
-    assert "pkg2" in parsed_links_3[0].url
-
-
 @mock.patch("pip._internal.index.collector.raise_for_status")
 def test_request_http_error(
     mock_raise_for_status: mock.Mock, caplog: pytest.LogCaptureFixture
diff --git a/tests/unit/test_finder.py b/tests/unit/test_finder.py
index 35c7e89b765..888dc974753 100644
--- a/tests/unit/test_finder.py
+++ b/tests/unit/test_finder.py
@@ -317,6 +317,7 @@ def test_finder_priority_file_over_page(data: TestData) -> None:
         find_links=[data.find_links],
         index_urls=["http://pypi.org/simple/"],
     )
+    assert req.name
     all_versions = finder.find_all_candidates(req.name)
     # 1 file InstallationCandidate followed by all https ones
     assert all_versions[0].link.scheme == "file"
@@ -335,6 +336,7 @@ def test_finder_priority_nonegg_over_eggfragments() -> None:
     links = ["http://foo/bar.py#egg=bar-1.0", "http://foo/bar-1.0.tar.gz"]
 
     finder = make_test_finder(links)
+    assert req.name
    all_versions = finder.find_all_candidates(req.name)
     assert all_versions[0].link.url.endswith("tar.gz")
     assert all_versions[1].link.url.endswith("#egg=bar-1.0")
@@ -347,6 +349,7 @@ def test_finder_priority_nonegg_over_eggfragments() -> None:
     links.reverse()
 
     finder = make_test_finder(links)
+    assert req.name
     all_versions = finder.find_all_candidates(req.name)
     assert all_versions[0].link.url.endswith("tar.gz")
     assert all_versions[1].link.url.endswith("#egg=bar-1.0")
@@ -549,6 +552,17 @@ def test_find_all_candidates_nothing() -> None:
     assert not finder.find_all_candidates("pip")
 
 
+def test_find_all_candidates_cached(data: TestData) -> None:
+    """Ensure the exact same list of candidates is returned when called twice for the
+    same project name."""
+    finder = make_test_finder(find_links=[data.find_links])
+    versions = finder.find_all_candidates("simple")
+    # Check that the exact same list is reused for a second call.
+    assert versions is finder.find_all_candidates("simple")
+    # Check that the project name is canonicalized before caching.
+    assert versions is finder.find_all_candidates("Simple")
+
+
 def test_find_all_candidates_find_links(data: TestData) -> None:
     finder = make_test_finder(find_links=[data.find_links])
     versions = finder.find_all_candidates("simple")
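
As a reading aid rather than part of the patch, here is a minimal sketch of the round trip the new persistent caches rely on once the changes above are applied: a parsed Link is reduced to PersistentLinkCacheArgs, serialized to JSON (stored bz2-compressed by _write_parsed_links_cache), and rebuilt later by _try_load_parsed_links_cache. The URL and requires-python values below are illustrative only.

    import json

    from pip._internal.models.link import Link, PersistentLinkCacheArgs

    # Hypothetical link values, chosen only to exercise the round trip.
    link = Link(
        "https://example.com/simple/pkg/pkg-1.0.tar.gz",
        comes_from="https://example.com/simple/pkg/",
        requires_python=">=3.8",
    )
    serialized = json.dumps(link.cache_args().to_json())

    # Later, the cached JSON is turned back into an equivalent Link.
    restored = Link.from_cache_args(
        PersistentLinkCacheArgs.from_json(json.loads(serialized))
    )
    assert restored.url == link.url
    assert restored.requires_python == link.requires_python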
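
Similarly, a sketch of the SerializableEntry contract behind FetchResolveCache.hashed_entry_path() and clear_hashed_entries(): the serialized dict determines the hashed file name, and the suffix groups entries of one type so they can be invalidated together by glob. The subclass below is hypothetical, and sha256 over sorted JSON stands in for pip's _hash_dict() helper, which the patch calls but does not show.

    import hashlib
    import json
    from typing import Any, Dict

    from pip._internal.cache import SerializableEntry


    class ExampleEntry(SerializableEntry):
        """Hypothetical entry type, purely for illustration."""

        @classmethod
        def suffix(cls) -> str:
            return ".example"

        def serialize(self) -> Dict[str, Any]:
            return {"formats": ["binary", "source"], "allow_yanked": False}


    entry = ExampleEntry()
    # Stand-in for _hash_dict(): a stable digest of the serialized fields, so any
    # change to them lands in a different cache file, while the suffix keeps all
    # entries of this type matchable by glob("*.example").
    digest = hashlib.sha256(
        json.dumps(entry.serialize(), sort_keys=True).encode("utf-8")
    ).hexdigest()
    print(f"{digest}{entry.suffix()}")  # e.g. "<64 hex chars>.example"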