Skip to content

Commit

Permalink
store complete size in transmute zstd compressor (#60)
Browse files Browse the repository at this point in the history
* store complete size in transmute zstd compressor
* always create testing tar.bz2 packages from .conda
* add metadata.json to transmuted .conda's
* return Path instead of None in transmute API
  • Loading branch information
dholth authored May 8, 2023
1 parent 1711ed5 commit b6a8e12
Show file tree
Hide file tree
Showing 13 changed files with 308 additions and 79 deletions.
100 changes: 100 additions & 0 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
name: Tests

on:
# NOTE: github.event context is push payload:
# https://docs.github.com/en/developers/webhooks-and-events/webhooks/webhook-events-and-payloads#push
push:
branches:
- main
- feature/**

# NOTE: github.event context is pull_request payload:
# https://docs.github.com/en/developers/webhooks-and-events/webhooks/webhook-events-and-payloads#pull_request
pull_request:

concurrency:
# Concurrency group that uses the workflow name and PR number if available
# or commit SHA as a fallback. If a new build is triggered under that
# concurrency group while a previous build is running, the previous build will be canceled.
# Repeated pushes to a PR will cancel all previous builds, while multiple
# merges to main will not cancel.
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}
cancel-in-progress: true

jobs:
linux:
runs-on: ubuntu-latest
defaults:
run:
shell: bash -l {0}
strategy:
fail-fast: false
matrix:
python-version: ['3.10', '3.11']

steps:
- name: Checkout repository
uses: actions/checkout@v2
with:
fetch-depth: 0

- uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: "x64"
cache: "pip"

- name: Setup Miniconda
uses: conda-incubator/setup-miniconda@v2
with:
python-version: ${{ matrix.python-version }}
channels: defaults
activate-environment: test_env
auto-update-conda: false
auto-activate-base: false
show-channel-urls: true

- name: Source Scripts
run: |
set -x
# conda is our test dependency but can't be pip installed
conda install --quiet conda pip
pip install -e .[test]
conda info --json
echo "condarc"
cat ~/.condarc
echo "conda_pkgs_dir"
ls /home/runner/conda_pkgs_dir
echo "miniconda/pkgs"
ls /usr/share/miniconda/pkgs
echo "test_env"
ls /usr/share/miniconda/envs/test_env
pytest
analyze:
name: Analyze test results
needs: [linux]
if: always()
runs-on: ubuntu-latest
steps:
- name: Download test results
uses: actions/download-artifact@v3

- name: Upload combined test results
# provides one downloadable archive of all .coverage/test-report.xml files
# of all matrix runs for further analysis.
uses: actions/upload-artifact@v3
with:
name: test-results-${{ github.sha }}-all
path: test-results-${{ github.sha }}-*
retention-days: 90 # default: 90

- name: Test Summary
uses: test-summary/action@v2
with:
paths: ./test-results-${{ github.sha }}-**/test-report*.xml

- name: Decide whether the needed jobs succeeded or failed
uses: re-actors/alls-green@release/v1
with:
jobs: ${{ toJSON(needs) }}
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
[//]: # (current developments)

## 0.8.0 (2023-05)

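* Update transmute to use SpooledTemporaryFile instead of streaming directly to
zip [(#57)](https://github.com/conda/conda-package-streaming/issues/57). Each
component's uncompressed size is now known before compression and is passed to
the zstd compressor, which can reduce zstd memory usage during decompression.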
* `transmute` returns Path to transmuted package instead of `None`.
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,10 @@ with closing(conda):
break
```

If you need the entire package, download it first and use the file-based APIs.
The URL-based APIs are more efficient if you only need to access package
metadata.

# Package goals

* Extract conda packages (both formats)
Expand Down
2 changes: 1 addition & 1 deletion conda_package_streaming/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.7.0"
__version__ = "0.8.0"
92 changes: 58 additions & 34 deletions conda_package_streaming/transmute.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,13 @@

from __future__ import annotations

import io
import json
import os
import shutil
import tarfile
import tempfile
import zipfile
from pathlib import Path
from typing import Callable

import zstandard
Expand All @@ -31,6 +33,8 @@
# increase to reduce compression and increase speed
ZSTD_COMPRESS_THREADS = 1

CONDA_PACKAGE_FORMAT_VERSION = 2


def transmute(
package,
Expand All @@ -42,7 +46,7 @@ def transmute(
level=ZSTD_COMPRESS_LEVEL, threads=ZSTD_COMPRESS_THREADS
),
is_info: Callable[[str], bool] = lambda filename: filename.startswith("info/"),
):
) -> Path:
"""
Convert .tar.bz2 conda :package to .conda-format under path.
Expand All @@ -55,32 +59,21 @@ def transmute(
(not this package ``conda-package-streaming``) uses a set of regular
expressions to keep expected items in the info- component, while other
items starting with ``info/`` wind up in the pkg- component.
:return: Path to transmuted package.
"""
assert package.endswith(".tar.bz2"), "can only convert .tar.bz2 to .conda"
assert os.path.isdir(path)
file_id = os.path.basename(package)[: -len(".tar.bz2")]

# x to not append to existing
with zipfile.ZipFile(
os.path.join(path, f"{file_id}.conda"), "x", compresslevel=zipfile.ZIP_STORED
) as conda_file:
info_compress = compressor()
data_compress = compressor()

# in theory, info_tar could grow uncomfortably big, in which case we would
# rather swap it to disk
info_io = io.BytesIO()
info_stream = info_compress.stream_writer(info_io, closefd=False)
info_tar = tarfile.TarFile(fileobj=info_stream, mode="w")

conda_file.writestr(
"metadata.json", json.dumps({"conda_pkg_format_version": 2})
)

with conda_file.open(f"pkg-{file_id}.tar.zst", "w") as pkg_file:
pkg_stream = data_compress.stream_writer(pkg_file, closefd=False)
pkg_tar = tarfile.TarFile(fileobj=pkg_stream, mode="w")

output_path = Path(path, f"{file_id}.conda")

with tempfile.SpooledTemporaryFile() as info_file, tempfile.SpooledTemporaryFile() as pkg_file:
with tarfile.TarFile(fileobj=info_file, mode="w") as info_tar, tarfile.TarFile(
fileobj=pkg_file, mode="w"
) as pkg_tar:
# If we wanted to compress these at a low setting to save temporary
# space, we could insert a file object that counts bytes written in
# front of a zstd (level between 1..3) compressor.
stream = iter(stream_conda_component(package))
for tar, member in stream:
tar_get = info_tar if is_info(member.name) else pkg_tar
Expand All @@ -89,27 +82,56 @@ def transmute(
else:
tar_get.addfile(member)

info_tar.close()
pkg_tar.close()
pkg_stream.close()

info_tar.close()
info_stream.close()
info_size = info_file.tell()
pkg_size = pkg_file.tell()

with conda_file.open(f"info-{file_id}.tar.zst", "w") as info_file:
info_file.write(info_io.getvalue())
info_file.seek(0)
pkg_file.seek(0)

with zipfile.ZipFile(
output_path,
"x", # x to not append to existing
compresslevel=zipfile.ZIP_STORED,
) as conda_file:
# Use at most one Zstd compressor and one stream_writer at a time to save memory.
data_compress = compressor()

pkg_metadata = {"conda_pkg_format_version": CONDA_PACKAGE_FORMAT_VERSION}
conda_file.writestr("metadata.json", json.dumps(pkg_metadata))

with conda_file.open(
f"pkg-{file_id}.tar.zst", "w"
) as pkg_file_zip, data_compress.stream_writer(
pkg_file_zip, size=pkg_size, closefd=False
) as pkg_stream:
shutil.copyfileobj(pkg_file._file, pkg_stream)

with conda_file.open(
f"info-{file_id}.tar.zst", "w"
) as info_file_zip, data_compress.stream_writer(
info_file_zip, size=info_size, closefd=False
) as info_stream:
shutil.copyfileobj(info_file._file, info_stream)

return output_path


def transmute_tar_bz2(
package,
package: str,
path,
):
) -> Path:
"""
Convert .conda :package to .tar.bz2 format under path.
Can recompress .tar.bz2 packages.
:param package: path to `.conda` or `.tar.bz2` package.
:param path: destination path for transmuted package.
:return: Path to transmuted package.
"""
assert package.endswith((".tar.bz2", ".conda")), "Unknown extension"
assert os.path.isdir(path)
Expand All @@ -125,13 +147,15 @@ def transmute_tar_bz2(
# .tar.bz2 doesn't filter by component
components = [CondaComponent.pkg]

with open(package, "rb") as fileobj, tarfile.open(
os.path.join(path, f"{file_id}.tar.bz2"), "x:bz2"
) as pkg_tar:
output_path = Path(path, f"{file_id}.tar.bz2")

with open(package, "rb") as fileobj, tarfile.open(output_path, "x:bz2") as pkg_tar:
for component in components:
stream = iter(stream_conda_component(package, fileobj, component=component))
for tar, member in stream:
if member.isfile():
pkg_tar.addfile(member, tar.extractfile(member))
else:
pkg_tar.addfile(member)

return output_path
5 changes: 5 additions & 0 deletions conda_package_streaming/url.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@
Fetch metadata from remote .conda or .tar.bz2 package.
Try to fetch less than the whole file if possible.
This module should only be used to make *partial* reads against a remote
package, typically just the ``info`` portion. If a full ``.conda`` format
package is needed, it is more efficient to download locally first and then use
the file-based API.
"""

import logging
Expand Down
3 changes: 3 additions & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Changelog
```{include} ../CHANGELOG.md
```
1 change: 1 addition & 0 deletions docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ zstd-compressed streams.
:caption: 'Contents:'
:maxdepth: 2
modules
changelog
```

# Indices and tables
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ test = [
"boto3-stubs[essential]",
"bottle",
"conda",
"conda-package-handling >=2",
]
docs = ["furo", "sphinx", "myst-parser", "mdit-py-plugins>=0.3.0"]

Expand Down
Loading

0 comments on commit b6a8e12

Please sign in to comment.