Skip to content

Commit

Permalink
cache Link parsing and interpreter compatibility checking
Browse files Browse the repository at this point in the history
- also compress the link parsing
  • Loading branch information
cosmicexplorer committed Aug 13, 2024
1 parent dbacf99 commit f007a62
Show file tree
Hide file tree
Showing 5 changed files with 252 additions and 59 deletions.
24 changes: 23 additions & 1 deletion src/pip/_internal/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import os
import re
from pathlib import Path
from typing import Dict, Iterator, List, Optional, Tuple
from typing import Any, Dict, Iterator, List, Optional, Tuple, Type

from pip._vendor.packaging.tags import Tag, interpreter_name, interpreter_version
from pip._vendor.packaging.utils import canonicalize_name
Expand Down Expand Up @@ -146,6 +146,15 @@ def get_path_for_link(self, link: Link) -> str:
return os.path.join(self.cache_dir, "link-metadata", *parts)


class SerializableEntry(abc.ABC):
    """Interface for objects that can key a hashed cache entry.

    Implementations provide a filename suffix identifying the kind of entry
    and a dict describing their state.
    """

    @classmethod
    @abc.abstractmethod
    def suffix(cls) -> str:
        """Return the filename suffix shared by every entry of this type."""

    @abc.abstractmethod
    def serialize(self) -> Dict[str, Any]:
        """Return a dict describing the state of this instance."""


class FetchResolveCache(Cache):
def get_path_for_link(self, link: Link) -> str:
# We are reading index links to extract other links from, not executing any
Expand All @@ -154,6 +163,19 @@ def get_path_for_link(self, link: Link) -> str:
assert self.cache_dir
return os.path.join(self.cache_dir, "fetch-resolve", *parts)

def hashed_entry_path(self, link: Link, entry: SerializableEntry) -> Path:
    """Return the cache file path for ``entry`` under the directory of ``link``.

    The filename is the hash of the entry's serialized state followed by
    the suffix of the entry's type.
    """
    digest = _hash_dict(entry.serialize())
    entry_dir = self.cache_path(link)
    return entry_dir / f"{digest}{entry.suffix()}"

def clear_hashed_entries(
    self, link: Link, entry_type: Type[SerializableEntry]
) -> None:
    """Delete every hashed cache entry of ``entry_type`` stored for ``link``.

    Entries are found by globbing the cache directory of ``link`` for files
    ending in ``entry_type.suffix()``. An entry that has already disappeared
    by the time it is unlinked is ignored, since the end state is the same.
    Any other ``OSError`` still propagates so that a stale entry is never
    silently left behind.
    """
    pattern = f"*{entry_type.suffix()}"
    for hashed_entry in self.cache_path(link).glob(pattern):
        logger.debug(
            "unlinking invalidated hashed link eval cache entry %s", hashed_entry
        )
        try:
            hashed_entry.unlink()
        except FileNotFoundError:
            # Already removed between the directory scan and the unlink.
            pass


class WheelCacheBase(Cache):
"""Specializations to the cache concept for wheels."""
Expand Down
165 changes: 157 additions & 8 deletions src/pip/_internal/index/package_finder.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
"""Routines related to PyPI, indexes"""

import binascii
import bz2
import datetime
import enum
import functools
import itertools
import json
import logging
import os
import re
Expand All @@ -14,6 +16,8 @@
from pathlib import Path
from typing import (
TYPE_CHECKING,
Any,
Callable,
Dict,
FrozenSet,
Iterable,
Expand All @@ -30,7 +34,7 @@
from pip._vendor.packaging.version import InvalidVersion, _BaseVersion
from pip._vendor.packaging.version import parse as parse_version

from pip._internal.cache import FetchResolveCache
from pip._internal.cache import FetchResolveCache, SerializableEntry
from pip._internal.exceptions import (
BestVersionAlreadyInstalled,
DistributionNotFound,
Expand All @@ -40,7 +44,7 @@
from pip._internal.index.collector import IndexContent, LinkCollector, parse_links
from pip._internal.models.candidate import InstallationCandidate
from pip._internal.models.format_control import FormatControl
from pip._internal.models.link import Link
from pip._internal.models.link import Link, PersistentLinkCacheArgs
from pip._internal.models.search_scope import SearchScope
from pip._internal.models.selection_prefs import SelectionPreferences
from pip._internal.models.target_python import TargetPython
Expand Down Expand Up @@ -123,13 +127,28 @@ class LinkType(enum.Enum):
requires_python_mismatch = enum.auto()


class LinkEvaluator:
class LinkEvaluator(SerializableEntry):
"""
Responsible for evaluating links for a particular project.
"""

@classmethod
def suffix(cls) -> str:
return ".evaluation"

_py_version_re = re.compile(r"-py([123]\.?[0-9]?)$")

def serialize(self) -> Dict[str, Any]:
return {
"project_name": self.project_name,
"canonical_name": self._canonical_name,
# Sort these for determinism.
"formats": sorted(self._formats),
"target_python": self._target_python.format_given(),
"allow_yanked": self._allow_yanked,
"ignore_requires_python": self._ignore_requires_python,
}

# Don't include an allow_yanked default value to make sure each call
# site considers whether yanked releases are allowed. This also causes
# that decision to be made explicit in the calling code, which helps
Expand Down Expand Up @@ -594,6 +613,19 @@ def compute_best_candidate(
)


# Signature shared by the candidate-finding method and its decorated form.
_FindCandidates = Callable[["PackageFinder", str], List[InstallationCandidate]]


def _canonicalize_arg(func: _FindCandidates) -> _FindCandidates:
    """Decorate ``func`` so it always receives a canonicalized project name.

    The returned callable passes ``project_name`` through
    ``canonicalize_name()`` before delegating to ``func``.
    """

    @functools.wraps(func)
    def canonicalized_call(
        self: "PackageFinder", project_name: str
    ) -> List[InstallationCandidate]:
        normalized = canonicalize_name(project_name)
        return func(self, normalized)

    return canonicalized_call


class PackageFinder:
"""This finds packages.
Expand Down Expand Up @@ -954,6 +986,91 @@ def _write_http_cache_info(

return (new_etag, new_date, new_checksum, page_unmodified)

@staticmethod
def _try_load_parsed_links_cache(parsed_links_path: Path) -> Optional[List[Link]]:
    """Load the links cached at ``parsed_links_path``, if possible.

    The file is expected to be bz2-compressed UTF-8 JSON holding a list of
    dicts in the format produced by ``PersistentLinkCacheArgs.to_json()``.

    :param parsed_links_path: Location of the cached parsed links file.
    :return: The complete list of reconstructed links, or None if the file
        is missing, unreadable, truncated, or not in the expected format.
        A partially decoded list is never returned.
    """
    try:
        with bz2.open(parsed_links_path, mode="rt", encoding="utf-8") as f:
            logger.debug("reading page links from cache %s", parsed_links_path)
            cached_links = json.load(f)
        # Only bind the result once every entry has been decoded, so a bad
        # entry part-way through cannot leak a truncated list to the caller.
        page_links = [
            Link.from_cache_args(PersistentLinkCacheArgs.from_json(cache_info))
            for cache_info in cached_links
        ]
    except (
        OSError,  # missing/unreadable file or invalid bz2 data
        EOFError,  # bz2 stream ended before its end-of-stream marker
        ValueError,  # JSONDecodeError and UnicodeDecodeError
        KeyError,  # entry lacks an expected field
        TypeError,  # JSON is valid but not a list of dicts
    ) as e:
        logger.debug(
            "could not read page links from cache file %s %s(%s)",
            parsed_links_path,
            e.__class__.__name__,
            str(e),
        )
        return None
    return page_links

@staticmethod
def _write_parsed_links_cache(
    parsed_links_path: Path, links: Iterable[Link]
) -> List[Link]:
    """Persist ``links`` to ``parsed_links_path`` and return them as a list.

    Each link is converted with ``cache_args().to_json()`` and the resulting
    list is written as bz2-compressed UTF-8 JSON.
    """
    page_links: List[Link] = list(links)

    payload: List[Dict[str, Any]] = []
    for link in page_links:
        link_args = link.cache_args()
        assert link_args is not None
        payload.append(link_args.to_json())

    logger.debug("writing page links to %s", parsed_links_path)
    with bz2.open(parsed_links_path, mode="wt", encoding="utf-8") as out:
        json.dump(payload, out)

    return page_links

@staticmethod
def _try_load_installation_candidate_cache(
    cached_candidates_path: Path,
) -> Optional[List[InstallationCandidate]]:
    """Load the candidates cached at ``cached_candidates_path``, if possible.

    The file is expected to be bz2-compressed UTF-8 JSON holding a list of
    dicts with ``name``, ``version`` and ``link`` keys, where ``link`` is in
    the format produced by ``PersistentLinkCacheArgs.to_json()``.

    :param cached_candidates_path: Location of the cached candidates file.
    :return: The reconstructed candidates, or None if the file is missing,
        unreadable, truncated, or not in the expected format.
    """
    try:
        with bz2.open(cached_candidates_path, mode="rt", encoding="utf-8") as f:
            serialized_candidates = json.load(f)
        logger.debug("read serialized candidates from %s", cached_candidates_path)
        package_links: List[InstallationCandidate] = []
        for cand in serialized_candidates:
            link_cache_args = PersistentLinkCacheArgs.from_json(cand["link"])
            link = Link.from_cache_args(link_cache_args)
            package_links.append(
                InstallationCandidate(cand["name"], cand["version"], link)
            )
        return package_links
    except (
        OSError,  # missing/unreadable file or invalid bz2 data
        EOFError,  # bz2 stream ended before its end-of-stream marker
        ValueError,  # JSONDecodeError and UnicodeDecodeError
        KeyError,  # entry lacks an expected field
        TypeError,  # JSON is valid but not a list of dicts
    ) as e:
        # Any unusable cache file is treated as a cache miss, not a failure.
        logger.debug(
            "could not read cached candidates at %s %s(%s)",
            cached_candidates_path,
            e.__class__.__name__,
            str(e),
        )
        return None

@staticmethod
def _write_installation_candidate_cache(
    cached_candidates_path: Path,
    candidates: Iterable[InstallationCandidate],
) -> List[InstallationCandidate]:
    """Persist ``candidates`` to ``cached_candidates_path`` and return them.

    Each candidate is stored as its name, its version rendered with
    ``str()``, and its link converted with ``cache_args().to_json()``. The
    list is written as bz2-compressed UTF-8 JSON.
    """
    candidates = list(candidates)

    serialized_candidates = []
    for candidate in candidates:
        record = {
            "name": candidate.name,
            "version": str(candidate.version),
            "link": candidate.link.cache_args().to_json(),
        }
        serialized_candidates.append(record)

    with bz2.open(cached_candidates_path, mode="wt", encoding="utf-8") as out:
        logger.debug("writing serialized candidates to %s", cached_candidates_path)
        json.dump(serialized_candidates, out)

    return candidates

def _process_project_url_uncached(
self, project_url: Link, link_evaluator: LinkEvaluator
) -> List[InstallationCandidate]:
Expand All @@ -972,7 +1089,6 @@ def _process_project_url_uncached(
package_links = self.evaluate_links(link_evaluator, links=page_links)
return package_links

@functools.lru_cache(maxsize=None)
def process_project_url(
self, project_url: Link, link_evaluator: LinkEvaluator
) -> List[InstallationCandidate]:
Expand All @@ -985,6 +1101,10 @@ def process_project_url(
etag_path = cached_path / "etag"
date_path = cached_path / "modified-since-date"
checksum_path = cached_path / "checksum"
parsed_links_path = cached_path / "parsed-links"
cached_candidates_path = self._fetch_resolve_cache.hashed_entry_path(
project_url, link_evaluator
)

headers: Dict[str, str] = {}
# NB: mutates headers!
Expand Down Expand Up @@ -1021,16 +1141,45 @@ def process_project_url(
prev_checksum=prev_checksum,
)

page_links = parse_links(index_response)
page_links: Optional[List[Link]] = None
# Only try our persistent link parsing and evaluation caches if we know the page
# was unmodified via checksum.
if page_unmodified:
cached_candidates = self._try_load_installation_candidate_cache(
cached_candidates_path
)
if cached_candidates is not None:
return cached_candidates

page_links = self._try_load_parsed_links_cache(parsed_links_path)
else:
try:
parsed_links_path.unlink()
except OSError:
pass
self._fetch_resolve_cache.clear_hashed_entries(project_url, LinkEvaluator)

if page_links is None:
logger.debug(
"extracting new parsed links from index response %s", index_response
)
page_links = self._write_parsed_links_cache(
parsed_links_path,
parse_links(index_response),
)

with indent_log():
package_links = self.evaluate_links(
link_evaluator,
links=page_links,
package_links = self._write_installation_candidate_cache(
cached_candidates_path,
self.evaluate_links(
link_evaluator,
links=page_links,
),
)

return package_links

@_canonicalize_arg
@functools.lru_cache(maxsize=None)
def find_all_candidates(self, project_name: str) -> List[InstallationCandidate]:
"""Find all available InstallationCandidate for project_name
Expand Down
58 changes: 58 additions & 0 deletions src/pip/_internal/models/link.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,43 @@ def _ensure_quoted_url(url: str) -> str:
return urllib.parse.urlunparse(result._replace(path=path))


@dataclass(frozen=True)
class PersistentLinkCacheArgs:
    """Link constructor arguments in a form that can round-trip through JSON."""

    url: str
    comes_from: Optional[str] = None
    requires_python: Optional[str] = None
    yanked_reason: Optional[str] = None
    metadata_file_data: Optional[MetadataFile] = None
    hashes: Optional[Mapping[str, str]] = None

    def to_json(self) -> Dict[str, Any]:
        """Return a dict representation of these arguments.

        ``metadata_file_data`` is reduced to its ``hashes`` value.
        """
        # NOTE(review): a MetadataFile whose hashes are None or empty is
        # stored as a falsy value and is read back by from_json() as "no
        # metadata file" -- confirm this loss is acceptable.
        metadata_hashes = None
        if self.metadata_file_data:
            metadata_hashes = self.metadata_file_data.hashes
        return dict(
            url=self.url,
            comes_from=self.comes_from,
            requires_python=self.requires_python,
            yanked_reason=self.yanked_reason,
            metadata_file_data=metadata_hashes,
            hashes=self.hashes,
        )

    @classmethod
    def from_json(cls, cache_info: Dict[str, Any]) -> "PersistentLinkCacheArgs":
        """Rebuild an instance from a dict produced by ``to_json()``.

        Raises ``KeyError`` if any expected field is absent.
        """
        url = cache_info["url"]
        comes_from = cache_info["comes_from"]
        requires_python = cache_info["requires_python"]
        yanked_reason = cache_info["yanked_reason"]
        metadata_hashes = cache_info["metadata_file_data"]
        if metadata_hashes:
            metadata_file_data = MetadataFile(hashes=metadata_hashes)
        else:
            metadata_file_data = None
        hashes = cache_info["hashes"]
        return cls(
            url=url,
            comes_from=comes_from,
            requires_python=requires_python,
            yanked_reason=yanked_reason,
            metadata_file_data=metadata_file_data,
            hashes=hashes,
        )


@functools.total_ordering
class Link:
"""Represents a parsed link from a Package Index's simple URL"""
Expand Down Expand Up @@ -303,6 +340,27 @@ def from_json(
metadata_file_data=metadata_file_data,
)

def cache_args(self) -> PersistentLinkCacheArgs:
    """Return this link's constructor arguments in a cacheable form.

    ``comes_from`` is converted with ``str()`` when truthy and stored as
    None otherwise.
    """
    url = self.url
    origin = str(self.comes_from) if self.comes_from else None
    return PersistentLinkCacheArgs(
        url=url,
        comes_from=origin,
        requires_python=self.requires_python,
        yanked_reason=self.yanked_reason,
        metadata_file_data=self.metadata_file_data,
        hashes=self._hashes,
    )

@classmethod
def from_cache_args(cls, args: PersistentLinkCacheArgs) -> "Link":
    """Build a link from arguments previously captured for caching.

    The url is passed positionally and every other field by keyword.
    """
    keyword_args = dict(
        comes_from=args.comes_from,
        requires_python=args.requires_python,
        yanked_reason=args.yanked_reason,
        metadata_file_data=args.metadata_file_data,
        hashes=args.hashes,
    )
    return cls(args.url, **keyword_args)

@classmethod
def from_element(
cls,
Expand Down
Loading

0 comments on commit f007a62

Please sign in to comment.