Skip to content

Commit

Permalink
Fix --venv cached Python interpreter info. (pex-tool#2579)
Browse files Browse the repository at this point in the history
Previously, the work dir path of the `atomic_directory` used to create
the venv would leak into various cached paths in the `PythonInterpreter`
`INTERP-INFO` file. Now, these paths are corrected at creation time.

This is work towards `pex3 cache prune --last-access`, which will need to
iterate cached interpreters to find any associated with venvs so that
the interpreter can be pruned when the venv is pruned.
  • Loading branch information
jsirois authored Oct 27, 2024
1 parent 7dda519 commit d1d541e
Show file tree
Hide file tree
Showing 4 changed files with 219 additions and 104 deletions.
2 changes: 1 addition & 1 deletion pex/cache/dirs.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ def iter_transitive_dependents(self):

INTERPRETERS = Value(
"interpreters",
version=0,
version=1,
name="Interpreters",
description="Information about interpreters found on the system.",
)
Expand Down
103 changes: 83 additions & 20 deletions pex/interpreter.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,53 @@ class Platlib(SitePackagesDir):
pass


# Module-level registry of "current path" -> "final path" prefixes. Entries are added and removed
# by the `path_mapping` / `path_mappings` context managers and consulted by the `_adjust_to_*`
# helpers when interpreter identity paths are encoded and decoded.
_PATH_MAPPINGS = {}


@contextmanager
def path_mapping(
    current_path,  # type: str
    final_path,  # type: str
):
    # type: (...) -> Iterator[None]
    """Register a single `current_path` -> `final_path` mapping for the duration of the context.

    While the context is active, the module's path adjustment helpers translate paths that start
    with `current_path` to start with `final_path` (and vice versa).

    :param current_path: The path prefix as it exists right now.
    :param final_path: The path prefix that `current_path` should be recorded as.
    """
    # A unique sentinel lets us distinguish "no prior entry" from any real mapped value.
    missing = object()
    previous = _PATH_MAPPINGS.get(current_path, missing)

    _PATH_MAPPINGS[current_path] = final_path
    try:
        yield
    finally:
        # Restore whatever was registered before us so nested use of the same `current_path`
        # neither drops the outer mapping nor raises KeyError when the outer context exits.
        if previous is missing:
            _PATH_MAPPINGS.pop(current_path, None)
        else:
            _PATH_MAPPINGS[current_path] = previous


@contextmanager
def path_mappings(mappings):
    # type: (Mapping[str, str]) -> Iterator[None]
    """Register several current path -> final path mappings for the duration of the context.

    :param mappings: Current path prefixes mapped to the final path prefixes they should be
                     recorded as.
    """
    # A unique sentinel lets us distinguish "no prior entry" from any real mapped value.
    missing = object()

    # Snapshot the input so that cleanup is unaffected if the caller mutates `mappings` while the
    # context is active.
    registered = dict(mappings)
    previous = {
        current_path: _PATH_MAPPINGS.get(current_path, missing) for current_path in registered
    }

    _PATH_MAPPINGS.update(registered)
    try:
        yield
    finally:
        # Put back any entries we shadowed and drop the ones we introduced; tolerate entries that
        # were already removed so that cleanup never raises KeyError.
        for current_path, prior in previous.items():
            if prior is missing:
                _PATH_MAPPINGS.pop(current_path, None)
            else:
                _PATH_MAPPINGS[current_path] = prior


def _adjust_to_final_path(path):
    # type: (str) -> str
    """Rewrite `path` from its current location to its final location.

    The first registered mapping whose current path is a string prefix of `path` is applied; if no
    mapping applies, `path` is returned unchanged.

    :param path: The path to translate.
    :return: `path` with its leading current path prefix swapped for the mapped final path.
    """
    # NOTE(review): the match is a plain string prefix, not a path-component-aware one.
    for current_path, final_path in _PATH_MAPPINGS.items():
        if path.startswith(current_path):
            # Splice instead of using `re.sub`: only the leading prefix should change (an
            # unanchored substitution rewrites every occurrence) and `final_path` must be taken
            # literally (as a regex replacement template, backslashes in it are interpreted as
            # escapes and can raise `re.error`).
            return final_path + path[len(current_path) :]
    return path


def _adjust_to_current_path(path):
    # type: (str) -> str
    """Rewrite `path` from its final location back to its current location.

    The first registered mapping whose final path is a string prefix of `path` is applied; if no
    mapping applies, `path` is returned unchanged.

    :param path: The path to translate.
    :return: `path` with its leading final path prefix swapped for the mapped current path.
    """
    # NOTE(review): the match is a plain string prefix, not a path-component-aware one.
    for current_path, final_path in _PATH_MAPPINGS.items():
        if path.startswith(final_path):
            # Splice instead of using `re.sub`: only the leading prefix should change (an
            # unanchored substitution rewrites every occurrence) and `current_path` must be taken
            # literally (as a regex replacement template, backslashes in it are interpreted as
            # escapes and can raise `re.error`).
            return current_path + path[len(final_path) :]
    return path


class PythonIdentity(object):
class Error(Exception):
pass
Expand Down Expand Up @@ -333,14 +380,24 @@ def iter_tags():
site_packages = [] # type: List[SitePackagesDir]
for path in site_packages_paths:
if path == purelib:
site_packages.append(Purelib(path))
site_packages.append(Purelib(_adjust_to_current_path(path)))
elif path == platlib:
site_packages.append(Platlib(path))
site_packages.append(Platlib(_adjust_to_current_path(path)))
else:
site_packages.append(SitePackagesDir(path))
site_packages.append(SitePackagesDir(_adjust_to_current_path(path)))

return cls(
binary=_adjust_to_current_path(values.pop("binary")),
prefix=_adjust_to_current_path(values.pop("prefix")),
base_prefix=_adjust_to_current_path(values.pop("base_prefix")),
sys_path=[_adjust_to_current_path(entry) for entry in values.pop("sys_path")],
site_packages=site_packages,
extras_paths=[
_adjust_to_current_path(extras_path) for extras_path in values.pop("extras_paths")
],
paths={
name: _adjust_to_current_path(path) for name, path in values.pop("paths").items()
},
version=cast("Tuple[int, int, int]", version),
pypy_version=cast("Optional[Tuple[int, int, int]]", pypy_version),
supported_tags=iter_tags(),
Expand Down Expand Up @@ -402,26 +459,27 @@ def encode(self):
purelib = None # type: Optional[str]
platlib = None # type: Optional[str]
for entry in self._site_packages:
site_packages.append(entry.path)
entry_path = _adjust_to_final_path(entry.path)
site_packages.append(entry_path)
if isinstance(entry, Purelib):
purelib = entry.path
purelib = entry_path
elif isinstance(entry, Platlib):
platlib = entry.path
platlib = entry_path

values = dict(
__format_version__=self._FORMAT_VERSION,
binary=self._binary,
prefix=self._prefix,
base_prefix=self._base_prefix,
sys_path=self._sys_path,
binary=_adjust_to_final_path(self._binary),
prefix=_adjust_to_final_path(self._prefix),
base_prefix=_adjust_to_final_path(self._base_prefix),
sys_path=[_adjust_to_final_path(entry) for entry in self._sys_path],
site_packages=site_packages,
# N.B.: We encode purelib and platlib site-packages entries on the side like this to
# ensure older Pex versions that did not know the distinction can still use the
# interpreter cache.
purelib=purelib,
platlib=platlib,
extras_paths=self._extras_paths,
paths=self._paths,
extras_paths=[_adjust_to_final_path(extras_path) for extras_path in self._extras_paths],
paths={name: _adjust_to_final_path(path) for name, path in self._paths.items()},
packaging_version=self._packaging_version,
python_tag=self._python_tag,
abi_tag=self._abi_tag,
Expand Down Expand Up @@ -1068,20 +1126,25 @@ def create_interpreter(
import os
import sys
from pex import interpreter
from pex.atomic_directory import atomic_directory
from pex.common import safe_open
from pex.interpreter import PythonIdentity
encoded_identity = PythonIdentity.get(binary={binary!r}).encode()
with atomic_directory({cache_dir!r}) as cache_dir:
if not cache_dir.is_finalized():
with safe_open(
os.path.join(cache_dir.work_dir, {info_file!r}), 'w'
) as fp:
fp.write(encoded_identity)
with interpreter.path_mappings({path_mappings!r}):
encoded_identity = PythonIdentity.get(binary={binary!r}).encode()
with atomic_directory({cache_dir!r}) as cache_dir:
if not cache_dir.is_finalized():
with safe_open(
os.path.join(cache_dir.work_dir, {info_file!r}), 'w'
) as fp:
fp.write(encoded_identity)
""".format(
binary=binary, cache_dir=cache_dir, info_file=cls.INTERP_INFO_FILE
path_mappings=_PATH_MAPPINGS,
binary=binary,
cache_dir=cache_dir,
info_file=cls.INTERP_INFO_FILE,
)
),
],
Expand Down
170 changes: 90 additions & 80 deletions pex/pex_bootstrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import os
import sys

from pex import pex_warnings
from pex import interpreter, pex_warnings
from pex.atomic_directory import atomic_directory
from pex.cache import access as cache_access
from pex.cache.dirs import CacheDir
Expand Down Expand Up @@ -527,92 +527,102 @@ def ensure_venv(
if not venv.is_finalized():
from pex.venv.virtualenv import Virtualenv

virtualenv = Virtualenv.create_atomic(
venv_dir=venv,
interpreter=pex.interpreter,
copies=pex_info.venv_copies,
system_site_packages=pex_info.venv_system_site_packages,
prompt=os.path.basename(ENV.PEX) if ENV.PEX else None,
)
with interpreter.path_mapping(venv.work_dir, venv_dir):
virtualenv = Virtualenv.create_atomic(
venv_dir=venv,
interpreter=pex.interpreter,
copies=pex_info.venv_copies,
system_site_packages=pex_info.venv_system_site_packages,
prompt=os.path.basename(ENV.PEX) if ENV.PEX else None,
)

pex_path = os.path.abspath(pex.path())

# A sha1 hash is 160 bits -> 20 bytes -> 40 hex characters. We start with 8 characters
# (32 bits) of entropy since that is short and _very_ unlikely to collide with another
# PEX venv on this machine. If we still collide after using the whole sha1 (for a total
# of 33 collisions), then the universe is broken and we raise. It's the least we can do.
venv_hash = hashlib.sha1(venv_dir.encode("utf-8")).hexdigest()
collisions = []
for chars in range(8, len(venv_hash) + 1):
entropy = venv_hash[:chars]
short_venv_dir = CacheDir.VENVS.path("s", entropy, pex_root=pex_info.pex_root)
with atomic_directory(short_venv_dir) as short_venv:
if short_venv.is_finalized():
collisions.append(short_venv_dir)
if entropy == venv_hash:
raise RuntimeError(
"The venv for {pex} at {venv} has hash collisions with {count} "
"other {venvs}!\n{collisions}".format(
pex=pex_path,
venv=venv_dir,
count=len(collisions),
venvs=pluralize(collisions, "venv"),
collisions="\n".join(
"{index}.) {venv_path}".format(
index=index, venv_path=os.path.realpath(path)
)
for index, path in enumerate(collisions, start=1)
),
pex_path = os.path.abspath(pex.path())

# A sha1 hash is 160 bits -> 20 bytes -> 40 hex characters. We start with 8
# characters (32 bits) of entropy since that is short and _very_ unlikely to collide
# with another PEX venv on this machine. If we still collide after using the whole
# sha1 (for a total of 33 collisions), then the universe is broken and we raise.
# It's the least we can do.
venv_hash = hashlib.sha1(venv_dir.encode("utf-8")).hexdigest()
collisions = []
for chars in range(8, len(venv_hash) + 1):
entropy = venv_hash[:chars]
short_venv_dir = CacheDir.VENVS.path("s", entropy, pex_root=pex_info.pex_root)
with atomic_directory(short_venv_dir) as short_venv:
if short_venv.is_finalized():
collisions.append(short_venv_dir)
if entropy == venv_hash:
raise RuntimeError(
"The venv for {pex} at {venv} has hash collisions with {count} "
"other {venvs}!\n{collisions}".format(
pex=pex_path,
venv=venv_dir,
count=len(collisions),
venvs=pluralize(collisions, "venv"),
collisions="\n".join(
"{index}.) {venv_path}".format(
index=index, venv_path=os.path.realpath(path)
)
for index, path in enumerate(collisions, start=1)
),
)
)
continue

with interpreter.path_mapping(short_venv.work_dir, short_venv_dir):
os.symlink(
os.path.relpath(venv_dir, short_venv_dir),
os.path.join(short_venv.work_dir, "venv"),
)
continue

os.symlink(venv_dir, os.path.join(short_venv.work_dir, "venv"))

# Loose PEXes don't need to unpack themselves to the PEX_ROOT before running;
# so we'll not have a stable base there to symlink from. As such, always copy
# for loose PEXes to ensure the PEX_ROOT venv is stable in the face of
# modification of the source loose PEX.
copy_mode = (
CopyMode.SYMLINK
if (pex.layout != Layout.LOOSE and not pex_info.venv_site_packages_copies)
else CopyMode.LINK
)

shebang = installer.populate_venv_from_pex(
virtualenv,
pex,
bin_path=pex_info.venv_bin_path,
python=os.path.join(
short_venv_dir,
"venv",
"bin",
os.path.basename(virtualenv.interpreter.binary),
),
collisions_ok=collisions_ok,
copy_mode=copy_mode,
hermetic_scripts=pex_info.venv_hermetic_scripts,
)
# Loose PEXes don't need to unpack themselves to the PEX_ROOT before
# running; so we'll not have a stable base there to symlink from. As
# such, always copy for loose PEXes to ensure the PEX_ROOT venv is
# stable in the face of modification of the source loose PEX.
copy_mode = (
CopyMode.SYMLINK
if (
pex.layout != Layout.LOOSE
and not pex_info.venv_site_packages_copies
)
else CopyMode.LINK
)

# There are popular Linux distributions with shebang length limits
# (BINPRM_BUF_SIZE in /usr/include/linux/binfmts.h) set at 128 characters, so
# we warn in the _very_ unlikely case that our shortened shebang is longer than
# this.
if len(shebang) > 128:
pex_warnings.warn(
"The venv for {pex} at {venv} has script shebangs of {shebang!r} with "
"{count} characters. On some systems this may be too long and cause "
"problems running the venv scripts. You may be able adjust PEX_ROOT "
"from {pex_root} to a shorter path as a work-around.".format(
pex=pex_path,
venv=venv_dir,
shebang=shebang,
count=len(shebang),
pex_root=pex_info.pex_root,
shebang = installer.populate_venv_from_pex(
virtualenv,
pex,
bin_path=pex_info.venv_bin_path,
python=os.path.join(
short_venv_dir,
"venv",
"bin",
os.path.basename(virtualenv.interpreter.binary),
),
collisions_ok=collisions_ok,
copy_mode=copy_mode,
hermetic_scripts=pex_info.venv_hermetic_scripts,
)
)

break
# There are popular Linux distributions with shebang length limits
# (BINPRM_BUF_SIZE in /usr/include/linux/binfmts.h) set at 128
# characters, so we warn in the _very_ unlikely case that our shortened
# shebang is longer than this.
if len(shebang) > 128:
pex_warnings.warn(
"The venv for {pex} at {venv} has script shebangs of "
"{shebang!r} with {count} characters. On some systems this may "
"be too long and cause problems running the venv scripts. You "
"may be able adjust PEX_ROOT from {pex_root} to a shorter path "
"as a work-around.".format(
pex=pex_path,
venv=venv_dir,
shebang=shebang,
count=len(shebang),
pex_root=pex_info.pex_root,
)
)

break

return VenvPex(venv_dir, hermetic_scripts=pex_info.venv_hermetic_scripts)

Expand Down
Loading

0 comments on commit d1d541e

Please sign in to comment.