Skip to content

Commit

Permalink
gh-113257: Automatically generate pip SBOM metadata from wheel (#113295)
Browse files Browse the repository at this point in the history
Co-authored-by: Hugo van Kemenade <[email protected]>
  • Loading branch information
sethmlarson and hugovk authored Dec 20, 2023
1 parent 11ee912 commit b221e03
Show file tree
Hide file tree
Showing 2 changed files with 107 additions and 11 deletions.
2 changes: 1 addition & 1 deletion Misc/sbom.spdx.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

116 changes: 106 additions & 10 deletions Tools/build/generate_sbom.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
"""Tool for generating Software Bill of Materials (SBOM) for Python's dependencies"""

import os
import re
import hashlib
import json
import glob
import pathlib
import subprocess
import sys
import typing
from urllib.request import urlopen

CPYTHON_ROOT_DIR = pathlib.Path(__file__).parent.parent.parent

# Before adding a new entry to this list, double check that
# the license expression is a valid SPDX license expression:
Expand Down Expand Up @@ -43,15 +47,14 @@ class PackageFiles(typing.NamedTuple):
# values to 'exclude' if we create new files within tracked
# directories that aren't sourced from third-party packages.
PACKAGE_TO_FILES = {
# NOTE: pip's entry in this structure is automatically generated in
# the 'discover_pip_sbom_package()' function below.
"mpdecimal": PackageFiles(
include=["Modules/_decimal/libmpdec/**"]
),
"expat": PackageFiles(
include=["Modules/expat/**"]
),
"pip": PackageFiles(
include=["Lib/ensurepip/_bundled/pip-23.3.2-py3-none-any.whl"]
),
"macholib": PackageFiles(
include=["Lib/ctypes/macholib/**"],
exclude=[
Expand Down Expand Up @@ -106,13 +109,106 @@ def filter_gitignored_paths(paths: list[str]) -> list[str]:
return sorted([line.split()[-1] for line in git_check_ignore_lines if line.startswith("::")])


def discover_pip_sbom_package(sbom_data: dict[str, typing.Any]) -> None:
    """Regenerate pip's SBOM package entry from the bundled pip wheel.

    pip is a part of a packaging ecosystem (Python, surprise!) so it's actually
    automatable to discover the metadata we need like the version and checksums
    so let's do that on behalf of our friends at the PyPA.

    The function:

    * locates the single ``pip-*`` file in ``Lib/ensurepip/_bundled``,
    * registers that file under ``PACKAGE_TO_FILES["pip"]``,
    * computes the file's SHA-256 and compares it against the matching
      artifact listed by PyPI's JSON API for that version,
    * replaces any existing ``"pip"`` entry in ``sbom_data["packages"]``
      with a freshly generated one.

    ``sbom_data`` is modified in place. The process exits with status 1 when
    zero or several pip wheels are found, or when the PyPI lookup or checksum
    comparison fails.
    """
    ensurepip_bundled_dir = CPYTHON_ROOT_DIR / "Lib/ensurepip/_bundled"

    # Find the hopefully one pip wheel in the bundled directory.
    pip_wheels = [
        wheel_filename
        for wheel_filename in os.listdir(ensurepip_bundled_dir)
        if wheel_filename.startswith("pip-")
    ]
    if len(pip_wheels) != 1:
        print("Zero or multiple pip wheels detected in 'Lib/ensurepip/_bundled'")
        sys.exit(1)
    pip_wheel_filename = pip_wheels[0]

    # Add the wheel filename to the list of files so the SBOM file
    # and relationship generator can work its magic on the wheel too.
    # (Item assignment mutates the module-level dict; no 'global' needed.)
    PACKAGE_TO_FILES["pip"] = PackageFiles(
        include=[f"Lib/ensurepip/_bundled/{pip_wheel_filename}"]
    )

    # Wheel filename format puts the version right after the project name.
    pip_version = pip_wheel_filename.split("-")[1]
    pip_checksum_sha256 = hashlib.sha256(
        (ensurepip_bundled_dir / pip_wheel_filename).read_bytes()
    ).hexdigest()

    # Get pip's download location from PyPI. Check that the checksum is correct too.
    try:
        # Use the response as a context manager so the underlying
        # connection is closed even if reading fails part-way through.
        with urlopen(f"https://pypi.org/pypi/pip/{pip_version}/json") as response:
            raw_text = response.read()
        # json.JSONDecodeError is a ValueError subclass, so malformed
        # responses are reported by the handler below.
        pip_release_metadata = json.loads(raw_text)
        url: dict[str, typing.Any]

        # Look for a matching artifact filename and then check
        # its remote checksum to the local one.
        for url in pip_release_metadata["urls"]:
            if url["filename"] == pip_wheel_filename:
                break
        else:
            raise ValueError(f"No matching filename on PyPI for '{pip_wheel_filename}'")
        if url["digests"]["sha256"] != pip_checksum_sha256:
            raise ValueError("Local pip checksum doesn't match artifact on PyPI")

        # Successfully found the download URL for the matching artifact.
        pip_download_url = url["url"]

    except (OSError, ValueError) as e:
        print(f"Couldn't fetch pip's metadata from PyPI: {e}")
        sys.exit(1)

    # Remove pip from the existing SBOM packages if it's there
    # and then overwrite its entry with our own generated one.
    sbom_data["packages"] = [
        sbom_package
        for sbom_package in sbom_data["packages"]
        if sbom_package["name"] != "pip"
    ]
    sbom_data["packages"].append(
        {
            "SPDXID": spdx_id("SPDXRef-PACKAGE-pip"),
            "name": "pip",
            "versionInfo": pip_version,
            "originator": "Organization: Python Packaging Authority",
            "licenseConcluded": "MIT",
            "downloadLocation": pip_download_url,
            "checksums": [
                {"algorithm": "SHA256", "checksumValue": pip_checksum_sha256}
            ],
            "externalRefs": [
                {
                    "referenceCategory": "SECURITY",
                    "referenceLocator": f"cpe:2.3:a:pypa:pip:{pip_version}:*:*:*:*:*:*:*",
                    "referenceType": "cpe23Type",
                },
                {
                    "referenceCategory": "PACKAGE_MANAGER",
                    "referenceLocator": f"pkg:pypi/pip@{pip_version}",
                    "referenceType": "purl",
                },
            ],
            "primaryPackagePurpose": "SOURCE",
        }
    )


def main() -> None:
root_dir = pathlib.Path(__file__).parent.parent.parent
sbom_path = root_dir / "Misc/sbom.spdx.json"
sbom_path = CPYTHON_ROOT_DIR / "Misc/sbom.spdx.json"
sbom_data = json.loads(sbom_path.read_bytes())

# Make a bunch of assertions about the SBOM data to ensure it's consistent.
# Insert pip's SBOM metadata from the wheel.
discover_pip_sbom_package(sbom_data)

# Ensure all packages in this tool are represented also in the SBOM file.
assert {package["name"] for package in sbom_data["packages"]} == set(PACKAGE_TO_FILES)

# Make a bunch of assertions about the SBOM data to ensure it's consistent.
for package in sbom_data["packages"]:

# Properties and ID must be properly formed.
Expand All @@ -138,17 +234,17 @@ def main() -> None:
for include in sorted(files.include):

# Find all the paths and then filter them through .gitignore.
paths = glob.glob(include, root_dir=root_dir, recursive=True)
paths = glob.glob(include, root_dir=CPYTHON_ROOT_DIR, recursive=True)
paths = filter_gitignored_paths(paths)
assert paths, include # Make sure that every value returns something!

for path in paths:
# Skip directories and excluded files
if not (root_dir / path).is_file() or path in exclude:
if not (CPYTHON_ROOT_DIR / path).is_file() or path in exclude:
continue

# SPDX requires SHA1 to be used for files, but we provide SHA256 too.
data = (root_dir / path).read_bytes()
data = (CPYTHON_ROOT_DIR / path).read_bytes()
checksum_sha1 = hashlib.sha1(data).hexdigest()
checksum_sha256 = hashlib.sha256(data).hexdigest()

Expand Down

0 comments on commit b221e03

Please sign in to comment.