cache Link parsing and interpreter compatibility checking
cosmicexplorer committed Sep 3, 2023
1 parent 202b6d6 commit c468bc3
Showing 3 changed files with 236 additions and 8 deletions.
26 changes: 25 additions & 1 deletion src/pip/_internal/cache.py
@@ -7,7 +7,7 @@
import logging
import os
from pathlib import Path
from typing import Any, Dict, List, Optional
from typing import Any, Dict, List, Optional, Type

from pip._vendor.packaging.tags import Tag, interpreter_name, interpreter_version
from pip._vendor.packaging.utils import canonicalize_name
@@ -95,6 +95,17 @@ def get_path_for_link(self, link: Link) -> str:
return os.path.join(self.cache_dir, "link-metadata", *parts)


class SerializableEntry(abc.ABC):
@classmethod
@abc.abstractmethod
def suffix(cls) -> str:
...

@abc.abstractmethod
def serialize(self) -> Dict[str, Any]:
...


class FetchResolveCache(Cache):
def get_path_for_link(self, link: Link) -> str:
# We are reading index links to extract other links from, not executing any
@@ -103,6 +114,19 @@ def get_path_for_link(self, link: Link) -> str:
assert self.cache_dir
return os.path.join(self.cache_dir, "fetch-resolve", *parts)

def hashed_entry_path(self, link: Link, entry: SerializableEntry) -> Path:
hashed = _hash_dict(entry.serialize())
return self.cache_path(link) / f"{hashed}{entry.suffix()}"

def clear_hashed_entries(
self, link: Link, entry_type: Type[SerializableEntry]
) -> None:
for hashed_entry in self.cache_path(link).glob(f"*{entry_type.suffix()}"):
logger.debug(
"unlinking invalidated hashed link eval cache entry %s", hashed_entry
)
hashed_entry.unlink()


class WheelCacheBase(Cache):
"""Specializations to the cache concept for wheels."""
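
A minimal usage sketch of the new FetchResolveCache helpers. It assumes the Cache base class is constructed with just a cache directory, as elsewhere in pip at this point; the ExampleEntry class and the paths below are hypothetical and only illustrate the hashing/invalidation contract added above.

from typing import Any, Dict

from pip._internal.cache import FetchResolveCache, SerializableEntry
from pip._internal.models.link import Link


class ExampleEntry(SerializableEntry):
    """Hypothetical entry keyed by a couple of resolver settings."""

    @classmethod
    def suffix(cls) -> str:
        return ".example"

    def serialize(self) -> Dict[str, Any]:
        return {"allow_yanked": False, "formats": ["binary", "source"]}


cache = FetchResolveCache("/tmp/pip-cache")
link = Link("https://pypi.org/simple/pip/")
entry = ExampleEntry()

# The serialized dict is hashed into the file name, so entries with identical
# settings map to the same path under the per-link cache directory.
path = cache.hashed_entry_path(link, entry)

# Removing every "*.example" file under the link's directory invalidates all
# cached results produced by ExampleEntry instances.
cache.clear_hashed_entries(link, ExampleEntry)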
160 changes: 153 additions & 7 deletions src/pip/_internal/index/package_finder.py
@@ -3,13 +3,15 @@
import enum
import functools
import itertools
import json
import logging
import os
import re
from hashlib import sha256
from pathlib import Path
from typing import (
TYPE_CHECKING,
Any,
Dict,
FrozenSet,
Iterable,
@@ -26,7 +28,7 @@
from pip._vendor.packaging.version import _BaseVersion
from pip._vendor.packaging.version import parse as parse_version

from pip._internal.cache import FetchResolveCache
from pip._internal.cache import FetchResolveCache, SerializableEntry
from pip._internal.exceptions import (
BestVersionAlreadyInstalled,
DistributionNotFound,
@@ -36,7 +38,7 @@
from pip._internal.index.collector import IndexContent, LinkCollector, parse_links
from pip._internal.models.candidate import InstallationCandidate
from pip._internal.models.format_control import FormatControl
from pip._internal.models.link import Link
from pip._internal.models.link import Link, PersistentLinkCacheArgs
from pip._internal.models.search_scope import SearchScope
from pip._internal.models.selection_prefs import SelectionPreferences
from pip._internal.models.target_python import TargetPython
@@ -119,14 +121,41 @@ class LinkType(enum.Enum):
requires_python_mismatch = enum.auto()


class LinkEvaluator:
class LinkEvaluator(SerializableEntry):

"""
Responsible for evaluating links for a particular project.
"""

@classmethod
def suffix(cls) -> str:
return ".evaluation"

_py_version_re = re.compile(r"-py([123]\.?[0-9]?)$")

def serialize(self) -> Dict[str, Any]:
return dict(
project_name=self.project_name,
canonical_name=self._canonical_name,
# Sort these for determinism.
formats=sorted(self._formats),
target_python=self._target_python.format_given(),
allow_yanked=self._allow_yanked,
ignore_requires_python=self._ignore_requires_python,
)

def to_json(self) -> str:
return json.dumps(self.serialize(), sort_keys=True)

def __eq__(self, other: Any) -> bool:
return isinstance(other, type(self)) and self.to_json() == other.to_json()

def __ne__(self, other: Any) -> bool:
return not self == other

def __hash__(self) -> int:
return hash(self.to_json())

# Don't include an allow_yanked default value to make sure each call
# site considers whether yanked releases are allowed. This also causes
# that decision to be made explicit in the calling code, which helps
@@ -900,6 +929,91 @@ def _write_http_cache_info(

return (new_etag, new_date, new_checksum, page_unmodified)

@staticmethod
def _try_load_parsed_links_cache(parsed_links_path: Path) -> Optional[List[Link]]:
page_links: Optional[List[Link]] = None
try:
with parsed_links_path.open("r") as f:
logger.debug("reading page links from cache %s", parsed_links_path)
cached_links = json.load(f)
page_links = []
for cache_info in cached_links:
link = Link.from_cache_args(
PersistentLinkCacheArgs.from_json(cache_info)
)
assert link is not None
page_links.append(link)
except (OSError, json.decoder.JSONDecodeError, KeyError) as e:
logger.debug(
"could not read page links from cache file %s %s(%s)",
parsed_links_path,
e.__class__.__name__,
str(e),
)
return page_links

@staticmethod
def _write_parsed_links_cache(
parsed_links_path: Path, links: Iterable[Link]
) -> List[Link]:
cacheable_links: List[Dict[str, Any]] = []
page_links: List[Link] = []
for link in links:
cache_info = link.cache_args()
assert cache_info is not None
cacheable_links.append(cache_info.to_json())
page_links.append(link)

logger.debug("writing page links to %s", parsed_links_path)
with parsed_links_path.open("w") as f:
json.dump(cacheable_links, f)

return page_links

@staticmethod
def _try_load_installation_candidate_cache(
cached_candidates_path: Path,
) -> Optional[List[InstallationCandidate]]:
try:
with cached_candidates_path.open("r") as f:
serialized_candidates = json.load(f)
logger.debug("read serialized candidates from %s", cached_candidates_path)
package_links: List[InstallationCandidate] = []
for cand in serialized_candidates:
link_cache_args = PersistentLinkCacheArgs.from_json(cand["link"])
link = Link.from_cache_args(link_cache_args)
package_links.append(
InstallationCandidate(cand["name"], cand["version"], link)
)
return package_links
except (OSError, json.decoder.JSONDecodeError, KeyError) as e:
logger.debug(
"could not read cached candidates at %s %s(%s)",
cached_candidates_path,
e.__class__.__name__,
str(e),
)
return None

@staticmethod
def _write_installation_candidate_cache(
cached_candidates_path: Path,
candidates: Iterable[InstallationCandidate],
) -> List[InstallationCandidate]:
candidates = list(candidates)
serialized_candidates = [
dict(
name=candidate.name,
version=str(candidate.version),
link=candidate.link.cache_args().to_json(),
)
for candidate in candidates
]
with cached_candidates_path.open("w") as f:
logger.debug("writing serialized candidates to %s", f.name)
json.dump(serialized_candidates, f)
return candidates

def _process_project_url_uncached(
self, project_url: Link, link_evaluator: LinkEvaluator
) -> List[InstallationCandidate]:
@@ -926,6 +1040,10 @@ def process_project_url(
etag_path = cached_path / "etag"
date_path = cached_path / "modified-since-date"
checksum_path = cached_path / "checksum"
parsed_links_path = cached_path / "parsed-links"
cached_candidates_path = self._fetch_resolve_cache.hashed_entry_path(
project_url, link_evaluator
)

headers: Dict[str, str] = {}
# NB: mutates headers!
@@ -962,12 +1080,40 @@ def process_project_url(
prev_checksum=prev_checksum,
)

page_links = parse_links(index_response)
page_links: Optional[List[Link]] = None
# Only try our persistent link parsing and evaluation caches if we know the page
# was unmodified via checksum.
if page_unmodified:
cached_candidates = self._try_load_installation_candidate_cache(
cached_candidates_path
)
if cached_candidates is not None:
return cached_candidates

page_links = self._try_load_parsed_links_cache(parsed_links_path)
else:
try:
parsed_links_path.unlink()
except OSError:
pass
self._fetch_resolve_cache.clear_hashed_entries(project_url, LinkEvaluator)

if page_links is None:
logger.debug(
"extracting new parsed links from index response %s", index_response
)
page_links = self._write_parsed_links_cache(
parsed_links_path,
parse_links(index_response),
)

with indent_log():
package_links = self.evaluate_links(
link_evaluator,
links=page_links,
package_links = self._write_installation_candidate_cache(
cached_candidates_path,
self.evaluate_links(
link_evaluator,
links=page_links,
),
)

return package_links
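
A rough sketch of how the new LinkEvaluator serialization keys the candidate cache. The constructor arguments below mirror the attributes used in serialize() and are an assumption (the constructor itself is untouched by this commit); the point is that two evaluators configured identically serialize to the same JSON, so they hash to the same "<hash>.evaluation" file under a page's cache directory.

from pip._internal.index.package_finder import LinkEvaluator
from pip._internal.models.target_python import TargetPython

target_python = TargetPython()


def make_evaluator() -> LinkEvaluator:
    # Hypothetical settings; any identical pair of configurations behaves the same.
    return LinkEvaluator(
        project_name="pip",
        canonical_name="pip",
        formats=frozenset(["binary", "source"]),
        target_python=target_python,
        allow_yanked=False,
        ignore_requires_python=False,
    )


# Equal configurations produce equal JSON, equal hashes, and therefore the same
# cached-candidates path from FetchResolveCache.hashed_entry_path().
assert make_evaluator() == make_evaluator()
assert make_evaluator().to_json() == make_evaluator().to_json()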
58 changes: 58 additions & 0 deletions src/pip/_internal/models/link.py
@@ -179,6 +179,43 @@ def _ensure_quoted_url(url: str) -> str:
return urllib.parse.urlunparse(result._replace(path=path))


@dataclass(frozen=True)
class PersistentLinkCacheArgs:
url: str
comes_from: Optional[str] = None
requires_python: Optional[str] = None
yanked_reason: Optional[str] = None
metadata_file_data: Optional[MetadataFile] = None
hashes: Optional[Mapping[str, str]] = None

def to_json(self) -> Dict[str, Any]:
return dict(
url=self.url,
comes_from=self.comes_from,
requires_python=self.requires_python,
yanked_reason=self.yanked_reason,
metadata_file_data=(
self.metadata_file_data.hashes if self.metadata_file_data else None
),
hashes=self.hashes,
)

@classmethod
def from_json(cls, cache_info: Dict[str, Any]) -> "PersistentLinkCacheArgs":
return cls(
url=cache_info["url"],
comes_from=cache_info["comes_from"],
requires_python=cache_info["requires_python"],
yanked_reason=cache_info["yanked_reason"],
metadata_file_data=(
MetadataFile(hashes=cache_info["metadata_file_data"])
if cache_info["metadata_file_data"]
else None
),
hashes=cache_info["hashes"],
)


class Link(KeyBasedCompareMixin):
"""Represents a parsed link from a Package Index's simple URL"""

@@ -305,6 +342,27 @@ def from_json(
metadata_file_data=metadata_file_data,
)

def cache_args(self) -> PersistentLinkCacheArgs:
return PersistentLinkCacheArgs(
url=self.url,
comes_from=(str(self.comes_from) if self.comes_from else None),
requires_python=self.requires_python,
yanked_reason=self.yanked_reason,
metadata_file_data=self.metadata_file_data,
hashes=self._hashes,
)

@classmethod
def from_cache_args(cls, args: PersistentLinkCacheArgs) -> "Link":
return cls(
args.url,
comes_from=args.comes_from,
requires_python=args.requires_python,
yanked_reason=args.yanked_reason,
metadata_file_data=args.metadata_file_data,
hashes=args.hashes,
)

@classmethod
def from_element(
cls,
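
A minimal round-trip sketch of the new Link serialization hooks, mirroring what _write_parsed_links_cache and _try_load_parsed_links_cache do with each entry (the wheel URL below is hypothetical):

from pip._internal.models.link import Link, PersistentLinkCacheArgs

link = Link(
    "https://example.com/packages/pip-23.2.1-py3-none-any.whl",
    requires_python=">=3.7",
)

# Flatten the link into JSON-compatible values...
cache_info = link.cache_args().to_json()

# ...and rebuild an equivalent Link from the parsed JSON, as the cache readers do.
restored = Link.from_cache_args(PersistentLinkCacheArgs.from_json(cache_info))

assert restored.url == link.url
assert restored.requires_python == link.requires_python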
