Skip to content

Commit

Permalink
Merge branch 'main' into CYT-470-add-hook-for-reading-sbom-files
Browse files Browse the repository at this point in the history
  • Loading branch information
mws180000 authored Oct 16, 2023
2 parents b2150e0 + e94ced8 commit b33c3b0
Show file tree
Hide file tree
Showing 4 changed files with 96 additions and 12 deletions.
6 changes: 3 additions & 3 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@ repos:
- id: flake8
additional_dependencies: [flake8-bugbear]
- repo: https://github.com/pycqa/pylint
rev: v3.0.0a7
rev: v3.0.1
hooks:
- id: pylint
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.4.0
rev: v4.5.0
hooks:
- id: mixed-line-ending
- id: end-of-file-fixer
Expand All @@ -31,7 +31,7 @@ repos:
- id: check-json
#- id: pretty-format-json
- repo: https://github.com/shellcheck-py/shellcheck-py
rev: v0.9.0.5
rev: v0.9.0.6
hooks:
- id: shellcheck
args: [-x]
Expand Down
15 changes: 11 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ A configuration file contains the information about the sample to gather informa

**extractPaths**: (required) the absolute path or relative path from location of current working directory that `surfactant` is being run from to the sample folders, cannot be a file (Note that even on Windows, Unix style `/` directory separators should be used in paths)\
**archive**: (optional) the full path, including file name, of the zip, exe installer, or other archive file that the folders in **extractPaths** were extracted from. This is used to collect metadata about the overall sample and will be added as a "Contains" relationship to all software entries found in the various **extractPaths**\
**installPrefix**: (optional) where the files in **extractPaths** would be if installed correctly on an actual system i.e. "C:/", "C:/Program Files/", etc (Note that even on Windows, Unix style `/` directory separators should be used in the path)
**installPrefix**: (optional) where the files in **extractPaths** would be if installed correctly on an actual system, e.g. "C:/", "C:/Program Files/", etc. (Note that even on Windows, Unix style `/` directory separators should be used in the path). If not given, the **extractPaths** will be used as the install paths.

#### Example configuration file
Let's say you have a .tar.gz file that you want to run surfactant on. For this example, we will be using the HELICS release .tar.gz example. In this scenario, the absolute path for this file is `/home/samples/helics.tar.gz`. Upon extracting this file, we get a helics folder with 4 sub-folders: bin, include, lib64, and share.
Expand All @@ -83,18 +83,24 @@ The resulting SBOM would be structured like this:
{
"UUID": "abc1",
"fileName": ["helics_binary"],
"installPath": null,
"installPath": ["/home/samples/helics/bin/helics_binary"],
"containerPath": null
},
{
"UUID": "abc2",
"fileName": ["lib1.so"],
"installPath": null,
"installPath": ["/home/samples/helics/lib64/lib1.so"],
"containerPath": null
}

],
"relationships": []
"relationships": [
{
"xUUID": "abc1",
"yUUID": "abc2",
"relationship": "Uses"
}
]
}
```
##### Example 2: Detailed Configuration File
Expand Down Expand Up @@ -264,6 +270,7 @@ $ surfactant generate [OPTIONS] CONFIG_FILE SBOM_OUTFILE [INPUT_SBOM]
**INPUT_SBOM**: (optional) a base sbom, should be used with care as relationships could be messed up when files are installed on different systems\
**--skip_gather**: (optional) skips the gathering of information on files and adding software entries\
**--skip_relationships**: (optional) skips the adding of relationships based on metadata\
**--skip_install_path**: (optional) skips including an install path for the files discovered. This may cause "Uses" relationships to also not be generated\
**--recorded_institution**: (optional) the name of the institution collecting the SBOM data (default: LLNL)\
**--output_format**: (optional) changes the output format for the SBOM (given as full module name of a surfactant plugin implementing the `write_sbom` hook)\
**--input_format**: (optional) specifies the format of the input SBOM if one is being used (default: cytrics) (given as full module name of a surfactant plugin implementing the `read_sbom` hook)\
Expand Down
44 changes: 41 additions & 3 deletions surfactant/cmd/generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import pathlib
import re
import sys
from typing import Dict, List, Tuple, Union
from typing import Dict, List, Optional, Tuple, Union

import click
from loguru import logger
Expand Down Expand Up @@ -101,7 +101,6 @@ def print_output_formats(ctx, _, value):
print(pm.get_canonical_name(plugin))
ctx.exit()


def print_input_formats(ctx, _, value):
if not value or ctx.resilient_parsing:
return
Expand All @@ -113,6 +112,27 @@ def print_input_formats(ctx, _, value):
else:
print(pm.get_canonical_name(plugin))
ctx.exit()

def warn_if_hash_collision(soft1: Optional[Software], soft2: Optional[Software]):
    """Log a warning when two software entries look like a hash collision.

    A collision is reported when at least one digest (sha256, sha1 or md5)
    recorded on *both* entries is identical while either another digest
    recorded on both entries differs, or the file sizes differ.

    Args:
        soft1: First software entry, or None.
        soft2: Second software entry, or None.

    Returns:
        None. If either entry is missing nothing is checked.
    """
    if not soft1 or not soft2:
        return

    # Digests can be None; only digests present on both entries are comparable.
    # Treating two missing digests as "matching" would flag entries that do not
    # actually share any hash.
    comparable = [
        (first, second)
        for first, second in (
            (soft1.sha256, soft2.sha256),
            (soft1.sha1, soft2.sha1),
            (soft1.md5, soft2.md5),
        )
        if first and second
    ]

    if not any(first == second for first, second in comparable):
        return

    digests_disagree = any(first != second for first, second in comparable)
    if digests_disagree or soft1.size != soft2.size:
        # loguru's logger provides `warning`; it has no `warn` method
        logger.warning(
            f"Hash collision between {soft1.name} and {soft2.name}; unexpected results may occur"
        )


@click.command("generate")
Expand All @@ -133,6 +153,13 @@ def print_input_formats(ctx, _, value):
required=False,
help="Skip adding relationships based on Linux/Windows/etc metadata",
)
@click.option(
"--skip_install_path",
is_flag=True,
default=False,
required=False,
help="Skip including install path information if not given by configuration",
)
@click.option(
"--recorded_institution", is_flag=False, default="LLNL", help="Name of user's institution"
)
Expand Down Expand Up @@ -170,6 +197,7 @@ def sbom(
input_sbom,
skip_gather,
skip_relationships,
skip_install_path,
recorded_institution,
output_format,
input_format,
Expand Down Expand Up @@ -215,6 +243,7 @@ def sbom(
pm, new_sbom, entry["archive"], user_institution_name=recorded_institution
)
archive_entry = new_sbom.find_software(parent_entry.sha256)
warn_if_hash_collision(archive_entry, parent_entry)
if archive_entry:
parent_entry = archive_entry
else:
Expand Down Expand Up @@ -285,6 +314,14 @@ def sbom(
# We need get_software_entry to look at the true filepath
filepath = true_filepath

if install_prefix is not None:
install_path = install_prefix
elif not skip_install_path:
# epath is guaranteed to not have an ending slash due to formatting above
install_path = epath + "/"
else:
install_path = None

if ftype := pm.hook.identify_file_type(filepath=filepath):
try:
entries.append(
Expand All @@ -295,7 +332,7 @@ def sbom(
filetype=ftype,
root_path=epath,
container_uuid=parent_uuid,
install_path=install_prefix,
install_path=install_path,
user_institution_name=recorded_institution,
)
)
Expand All @@ -313,6 +350,7 @@ def sbom(
# if a software entry already exists with a matching file hash, augment the info in the existing entry
for e in entries:
existing_sw = new_sbom.find_software(e.sha256)
warn_if_hash_collision(existing_sw, e)
if not existing_sw:
new_sbom.add_software(e)
# if the config file specified a parent/container for the file, add the new entry as a "Contains" relationship
Expand Down
43 changes: 41 additions & 2 deletions tests/cmd/test_generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,20 @@ def test_generate_no_install_prefix(tmp_path):
actual_software_names = {software["fileName"][0] for software in generated_sbom["software"]}
assert expected_software_names == actual_software_names

expected_install_paths = {
"hello_world.exe": extract_path + "/hello_world.exe",
"testlib.dll": extract_path + "/testlib.dll",
}
for software in generated_sbom["software"]:
assert software["installPath"] == []
assert software["installPath"][0] == expected_install_paths[software["fileName"][0]]

assert len(generated_sbom["relationships"]) == 0
uuids = {software["fileName"][0]: software["UUID"] for software in generated_sbom["software"]}
assert len(generated_sbom["relationships"]) == 1
assert generated_sbom["relationships"][0] == {
"xUUID": uuids["hello_world.exe"],
"yUUID": uuids["testlib.dll"],
"relationship": "Uses",
}


def test_generate_with_install_prefix(tmp_path):
Expand Down Expand Up @@ -72,3 +82,32 @@ def test_generate_with_install_prefix(tmp_path):
"yUUID": uuids["testlib.dll"],
"relationship": "Uses",
}


def test_generate_with_skip_install_path(tmp_path):
    """Run `generate` with --skip_install_path on a config that has no install prefix.

    Checks that both sample files are reported, that every entry has an empty
    installPath list, and that no relationships are produced.
    """
    extract_path = Path(testing_data, "Windows_dll_test_no1").as_posix()
    config_file = Path(tmp_path, "config.json")
    sbom_file = Path(tmp_path, "out.json")

    # Minimal configuration: a single extract path and nothing else
    with open(str(config_file), "w") as config_handle:
        config_handle.write(f'[{{"extractPaths": ["{extract_path}"]}}]')

    # click.testing would be the nicer option, but it does not allow output files to be generated
    # pylint: disable=no-value-for-parameter
    sbom(["--skip_install_path", str(config_file), str(sbom_file)], standalone_mode=False)
    # pylint: enable

    with open(str(sbom_file)) as sbom_handle:
        generated_sbom = json.load(sbom_handle)

    software_entries = generated_sbom["software"]
    assert len(software_entries) == 2

    found_names = {entry["fileName"][0] for entry in software_entries}
    assert {"hello_world.exe", "testlib.dll"} == found_names

    # With the flag set and no installPrefix configured, no install path is recorded
    for entry in software_entries:
        assert entry["installPath"] == []

    assert len(generated_sbom["relationships"]) == 0

0 comments on commit b33c3b0

Please sign in to comment.