cache Link parsing and interpreter compatibility checking
cosmicexplorer committed Sep 3, 2023
1 parent 202b6d6 commit c468bc3
Showing 3 changed files with 236 additions and 8 deletions.
26 changes: 25 additions & 1 deletion src/pip/_internal/cache.py
@@ -7,7 +7,7 @@
import logging
import os
from pathlib import Path
from typing import Any, Dict, List, Optional
from typing import Any, Dict, List, Optional, Type

from pip._vendor.packaging.tags import Tag, interpreter_name, interpreter_version
from pip._vendor.packaging.utils import canonicalize_name
@@ -95,6 +95,17 @@ def get_path_for_link(self, link: Link) -> str:
return os.path.join(self.cache_dir, "link-metadata", *parts)


class SerializableEntry(abc.ABC):
@classmethod
@abc.abstractmethod
def suffix(cls) -> str:
...

@abc.abstractmethod
def serialize(self) -> Dict[str, Any]:
...


class FetchResolveCache(Cache):
def get_path_for_link(self, link: Link) -> str:
# We are reading index links to extract other links from, not executing any
@@ -103,6 +114,19 @@ def get_path_for_link(self, link: Link) -> str:
assert self.cache_dir
return os.path.join(self.cache_dir, "fetch-resolve", *parts)

def hashed_entry_path(self, link: Link, entry: SerializableEntry) -> Path:
hashed = _hash_dict(entry.serialize())
return self.cache_path(link) / f"{hashed}{entry.suffix()}"

def clear_hashed_entries(
self, link: Link, entry_type: Type[SerializableEntry]
) -> None:
for hashed_entry in self.cache_path(link).glob(f"*{entry_type.suffix()}"):
logger.debug(
"unlinking invalidated hashed link eval cache entry %s", hashed_entry
)
hashed_entry.unlink()


class WheelCacheBase(Cache):
"""Specializations to the cache concept for wheels."""
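
A minimal usage sketch of the new FetchResolveCache helpers. It assumes the Cache base class is constructed with just a cache directory, as elsewhere in pip at this point; the ExampleEntry class and the paths below are hypothetical and only illustrate the hashing/invalidation contract added above.

from typing import Any, Dict

from pip._internal.cache import FetchResolveCache, SerializableEntry
from pip._internal.models.link import Link


class ExampleEntry(SerializableEntry):
    """Hypothetical entry keyed by a couple of resolver settings."""

    @classmethod
    def suffix(cls) -> str:
        return ".example"

    def serialize(self) -> Dict[str, Any]:
        return {"allow_yanked": False, "formats": ["binary", "source"]}


cache = FetchResolveCache("/tmp/pip-cache")
link = Link("https://pypi.org/simple/pip/")
entry = ExampleEntry()

# The serialized dict is hashed into the file name, so entries with identical
# settings map to the same path under the per-link cache directory.
path = cache.hashed_entry_path(link, entry)

# Removing every "*.example" file under the link's directory invalidates all
# cached results produced by ExampleEntry instances.
cache.clear_hashed_entries(link, ExampleEntry)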
160 changes: 153 additions & 7 deletions src/pip/_internal/index/package_finder.py
@@ -3,13 +3,15 @@
import enum
import functools
import itertools
import json
import logging
import os
import re
from hashlib import sha256
from pathlib import Path
from typing import (
TYPE_CHECKING,
Any,
Dict,
FrozenSet,
Iterable,
@@ -26,7 +28,7 @@
from pip._vendor.packaging.version import _BaseVersion
from pip._vendor.packaging.version import parse as parse_version

from pip._internal.cache import FetchResolveCache
from pip._internal.cache import FetchResolveCache, SerializableEntry
from pip._internal.exceptions import (
BestVersionAlreadyInstalled,
DistributionNotFound,
@@ -36,7 +38,7 @@
from pip._internal.index.collector import IndexContent, LinkCollector, parse_links
from pip._internal.models.candidate import InstallationCandidate
from pip._internal.models.format_control import FormatControl
from pip._internal.models.link import Link
from pip._internal.models.link import Link, PersistentLinkCacheArgs
from pip._internal.models.search_scope import SearchScope
from pip._internal.models.selection_prefs import SelectionPreferences
from pip._internal.models.target_python import TargetPython
@@ -119,14 +121,41 @@ class LinkType(enum.Enum):
requires_python_mismatch = enum.auto()


class LinkEvaluator:
class LinkEvaluator(SerializableEntry):

"""
Responsible for evaluating links for a particular project.
"""

@classmethod
def suffix(cls) -> str:
return ".evaluation"

_py_version_re = re.compile(r"-py([123]\.?[0-9]?)$")

def serialize(self) -> Dict[str, Any]:
return dict(
project_name=self.project_name,
canonical_name=self._canonical_name,
# Sort these for determinism.
formats=sorted(self._formats),
target_python=self._target_python.format_given(),
allow_yanked=self._allow_yanked,
ignore_requires_python=self._ignore_requires_python,
)

def to_json(self) -> str:
return json.dumps(self.serialize(), sort_keys=True)

def __eq__(self, other: Any) -> bool:
return isinstance(other, type(self)) and self.to_json() == other.to_json()

def __ne__(self, other: Any) -> bool:
return not self == other

def __hash__(self) -> int:
return hash(self.to_json())

# Don't include an allow_yanked default value to make sure each call
# site considers whether yanked releases are allowed. This also causes
# that decision to be made explicit in the calling code, which helps
@@ -900,6 +929,91 @@ def _write_http_cache_info(

return (new_etag, new_date, new_checksum, page_unmodified)

@staticmethod
def _try_load_parsed_links_cache(parsed_links_path: Path) -> Optional[List[Link]]:
page_links: Optional[List[Link]] = None
try:
with parsed_links_path.open("r") as f:
logger.debug("reading page links from cache %s", parsed_links_path)
cached_links = json.load(f)
page_links = []
for cache_info in cached_links:
link = Link.from_cache_args(
PersistentLinkCacheArgs.from_json(cache_info)
)
assert link is not None
page_links.append(link)
except (OSError, json.decoder.JSONDecodeError, KeyError) as e:
logger.debug(
"could not read page links from cache file %s %s(%s)",
parsed_links_path,
e.__class__.__name__,
str(e),
)
return page_links

@staticmethod
def _write_parsed_links_cache(
parsed_links_path: Path, links: Iterable[Link]
) -> List[Link]:
cacheable_links: List[Dict[str, Any]] = []
page_links: List[Link] = []
for link in links:
cache_info = link.cache_args()
assert cache_info is not None
cacheable_links.append(cache_info.to_json())
page_links.append(link)

logger.debug("writing page links to %s", parsed_links_path)
with parsed_links_path.open("w") as f:
json.dump(cacheable_links, f)

return page_links

@staticmethod
def _try_load_installation_candidate_cache(
cached_candidates_path: Path,
) -> Optional[List[InstallationCandidate]]:
try:
with cached_candidates_path.open("r") as f:
serialized_candidates = json.load(f)
logger.debug("read serialized candidates from %s", cached_candidates_path)
package_links: List[InstallationCandidate] = []
for cand in serialized_candidates:
link_cache_args = PersistentLinkCacheArgs.from_json(cand["link"])
link = Link.from_cache_args(link_cache_args)
package_links.append(
InstallationCandidate(cand["name"], cand["version"], link)
)
return package_links
except (OSError, json.decoder.JSONDecodeError, KeyError) as e:
logger.debug(
"could not read cached candidates at %s %s(%s)",
cached_candidates_path,
e.__class__.__name__,
str(e),
)
return None

@staticmethod
def _write_installation_candidate_cache(
cached_candidates_path: Path,
candidates: Iterable[InstallationCandidate],
) -> List[InstallationCandidate]:
candidates = list(candidates)
serialized_candidates = [
dict(
name=candidate.name,
version=str(candidate.version),
link=candidate.link.cache_args().to_json(),
)
for candidate in candidates
]
with cached_candidates_path.open("w") as f:
logger.debug("writing serialized candidates to %s", f.name)
json.dump(serialized_candidates, f)
return candidates

def _process_project_url_uncached(
self, project_url: Link, link_evaluator: LinkEvaluator
) -> List[InstallationCandidate]:
@@ -926,6 +1040,10 @@ def process_project_url(
etag_path = cached_path / "etag"
date_path = cached_path / "modified-since-date"
checksum_path = cached_path / "checksum"
parsed_links_path = cached_path / "parsed-links"
cached_candidates_path = self._fetch_resolve_cache.hashed_entry_path(
project_url, link_evaluator
)

headers: Dict[str, str] = {}
# NB: mutates headers!
@@ -962,12 +1080,40 @@ def process_project_url(
prev_checksum=prev_checksum,
)

page_links = parse_links(index_response)
page_links: Optional[List[Link]] = None
# Only try our persistent link parsing and evaluation caches if we know the page
# was unmodified via checksum.
if page_unmodified:
cached_candidates = self._try_load_installation_candidate_cache(
cached_candidates_path
)
if cached_candidates is not None:
return cached_candidates

page_links = self._try_load_parsed_links_cache(parsed_links_path)
else:
try:
parsed_links_path.unlink()
except OSError:
pass
self._fetch_resolve_cache.clear_hashed_entries(project_url, LinkEvaluator)

if page_links is None:
logger.debug(
"extracting new parsed links from index response %s", index_response
)
page_links = self._write_parsed_links_cache(
parsed_links_path,
parse_links(index_response),
)

with indent_log():
package_links = self.evaluate_links(
link_evaluator,
links=page_links,
package_links = self._write_installation_candidate_cache(
cached_candidates_path,
self.evaluate_links(
link_evaluator,
links=page_links,
),
)

return package_links
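
A rough sketch of how the new LinkEvaluator serialization keys the candidate cache. The constructor arguments below mirror the attributes used in serialize() and are an assumption (the constructor itself is untouched by this commit); the point is that two evaluators configured identically serialize to the same JSON, so they hash to the same "<hash>.evaluation" file under a page's cache directory.

from pip._internal.index.package_finder import LinkEvaluator
from pip._internal.models.target_python import TargetPython

target_python = TargetPython()


def make_evaluator() -> LinkEvaluator:
    # Hypothetical settings; any identical pair of configurations behaves the same.
    return LinkEvaluator(
        project_name="pip",
        canonical_name="pip",
        formats=frozenset(["binary", "source"]),
        target_python=target_python,
        allow_yanked=False,
        ignore_requires_python=False,
    )


# Equal configurations produce equal JSON, equal hashes, and therefore the same
# cached-candidates path from FetchResolveCache.hashed_entry_path().
assert make_evaluator() == make_evaluator()
assert make_evaluator().to_json() == make_evaluator().to_json()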
58 changes: 58 additions & 0 deletions src/pip/_internal/models/link.py
@@ -179,6 +179,43 @@ def _ensure_quoted_url(url: str) -> str:
return urllib.parse.urlunparse(result._replace(path=path))


@dataclass(frozen=True)
class PersistentLinkCacheArgs:
url: str
comes_from: Optional[str] = None
requires_python: Optional[str] = None
yanked_reason: Optional[str] = None
metadata_file_data: Optional[MetadataFile] = None
hashes: Optional[Mapping[str, str]] = None

def to_json(self) -> Dict[str, Any]:
return dict(
url=self.url,
comes_from=self.comes_from,
requires_python=self.requires_python,
yanked_reason=self.yanked_reason,
metadata_file_data=(
self.metadata_file_data.hashes if self.metadata_file_data else None
),
hashes=self.hashes,
)

@classmethod
def from_json(cls, cache_info: Dict[str, Any]) -> "PersistentLinkCacheArgs":
return cls(
url=cache_info["url"],
comes_from=cache_info["comes_from"],
requires_python=cache_info["requires_python"],
yanked_reason=cache_info["yanked_reason"],
metadata_file_data=(
MetadataFile(hashes=cache_info["metadata_file_data"])
if cache_info["metadata_file_data"]
else None
),
hashes=cache_info["hashes"],
)


class Link(KeyBasedCompareMixin):
"""Represents a parsed link from a Package Index's simple URL"""

@@ -305,6 +342,27 @@ def from_json(
metadata_file_data=metadata_file_data,
)

def cache_args(self) -> PersistentLinkCacheArgs:
return PersistentLinkCacheArgs(
url=self.url,
comes_from=(str(self.comes_from) if self.comes_from else None),
requires_python=self.requires_python,
yanked_reason=self.yanked_reason,
metadata_file_data=self.metadata_file_data,
hashes=self._hashes,
)

@classmethod
def from_cache_args(cls, args: PersistentLinkCacheArgs) -> "Link":
return cls(
args.url,
comes_from=args.comes_from,
requires_python=args.requires_python,
yanked_reason=args.yanked_reason,
metadata_file_data=args.metadata_file_data,
hashes=args.hashes,
)

@classmethod
def from_element(
cls,
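
A minimal round-trip sketch of the new Link serialization hooks, mirroring what _write_parsed_links_cache and _try_load_parsed_links_cache do with each entry (the wheel URL below is hypothetical):

from pip._internal.models.link import Link, PersistentLinkCacheArgs

link = Link(
    "https://example.com/packages/pip-23.2.1-py3-none-any.whl",
    requires_python=">=3.7",
)

# Flatten the link into JSON-compatible values...
cache_info = link.cache_args().to_json()

# ...and rebuild an equivalent Link from the parsed JSON, as the cache readers do.
restored = Link.from_cache_args(PersistentLinkCacheArgs.from_json(cache_info))

assert restored.url == link.url
assert restored.requires_python == link.requires_python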
