Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

perf: report unused inputs for the tar rule #951

Merged
merged 13 commits into from
Oct 13, 2024
3 changes: 2 additions & 1 deletion docs/tar.md

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions lib/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,12 @@ bool_flag(
build_setting_default = True if is_bzlmod_enabled() else False,
)

# TODO(3.0): change default to True.
bool_flag(
name = "tar_compute_unused_inputs",
build_setting_default = False,
plobsing marked this conversation as resolved.
Show resolved Hide resolved
)

config_setting(
name = "enable_runfiles",
values = {"enable_runfiles": "true"},
Expand Down
5 changes: 4 additions & 1 deletion lib/private/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,10 @@ bzl_library(
name = "tar",
srcs = ["tar.bzl"],
visibility = ["//lib:__subpackages__"],
deps = ["@aspect_bazel_lib//lib:paths"],
deps = [
"@aspect_bazel_lib//lib:paths",
"@bazel_skylib//rules:common_settings",
],
)

bzl_library(
Expand Down
192 changes: 185 additions & 7 deletions lib/private/tar.bzl
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"Implementation of tar rule"

load("@bazel_skylib//rules:common_settings.bzl", "BuildSettingInfo")
load("//lib:paths.bzl", "to_repository_relative_path")

TAR_TOOLCHAIN_TYPE = "@aspect_bazel_lib//lib:tar_toolchain_type"
Expand Down Expand Up @@ -84,6 +85,40 @@ _tar_attrs = {
doc = "Compress the archive file with a supported algorithm.",
values = _ACCEPTED_COMPRESSION_TYPES,
),
"compute_unused_inputs": attr.int(
doc = """
Whether to discover and prune input files that will not contribute to the archive.

Unused inputs are discovered by comparing the set of input files in `srcs` to the set
of files referenced by `mtree`. Files not used for content by the mtree specification
will not be read by the `tar` tool when creating the archive and can be pruned from the
input set using the `unused_inputs_list`
[mechanism](https://bazel.build/contribute/codebase#input-discovery).

Benefits: pruning unused input files can reduce the amount of work the build system must
perform. Pruned files are not included in the action cache key; changes to them do not
invalidate the cache entry, which can lead to higher cache hit rates. Actions do not need
to block on the availability of pruned inputs, which can increase the available
parallelism of builds. Pruned files do not need to be transferred to remote-execution
workers, which can reduce network costs.

Risks: pruning an actually-used input file can lead to unexpected, incorrect results. The
comparison performed between `srcs` and `mtree` is currently inexact and may fail to
handle handwritten or externally-derived mtree specifications. However, it is safe to use
this feature when the lines found in `mtree` are derived from one or more `mtree_spec`
rules, filtered and/or merged on whole-line basis only.

Possible values:

- `compute_unused_inputs = 1`: Always perform unused input discovery and pruning.
- `compute_unused_inputs = 0`: Never discover or prune unused inputs.
- `compute_unused_inputs = -1`: Discovery and pruning of unused inputs is controlled by the
--[no]@aspect_bazel_lib//lib:tar_compute_unused_inputs flag.
""",
default = -1,
values = [-1, 0, 1],
),
"_compute_unused_inputs_flag": attr.label(default = Label("//lib:tar_compute_unused_inputs")),
}

_mtree_attrs = {
Expand Down Expand Up @@ -125,6 +160,123 @@ def _calculate_runfiles_dir(default_info):
return manifest.short_path[:-9]
fail("manifest path {} seems malformed".format(manifest.short_path))

def _is_unused_inputs_enabled(attr):
"""Determine whether or not to compute unused inputs.

Args:
attr: `tar` rule ctx.attr struct. Must provide `_tar_attrs`.

Returns: bool. Whether the unused_inputs_list file should be computed.
"""
if attr.compute_unused_inputs == 1:
return True
elif attr.compute_unused_inputs == 0:
return False
elif attr.compute_unused_inputs == -1:
return attr._compute_unused_inputs_flag[BuildSettingInfo].value
else:
fail("Unexpected `compute_unused_inputs` value: {}".format(attr.compute_unused_inputs))

def _is_unprunable(file):
# Some input files cannot be pruned because their name will be misinterpreted by Bazel when reading the unused_inputs_list.
# * Filenames containing newlines will be mangled in the line-oriented format of the file.
# * Filenames with leading or trailing whitespace will be mangled by the call to String.trim().
# ref https://github.com/bazelbuild/bazel/blob/678b01a512c0b28c87067cdf5a4e0224a82716c0/src/main/java/com/google/devtools/build/lib/analysis/actions/StarlarkAction.java#L357
p = file.path
return p[0].isspace() or p[-1].isspace() or "\n" in p or "\r" in p

def _fmt_pruanble_inputs_line(file):
if _is_unprunable(file):
return None

# The tar.prunable_inputs.txt file has a two columns:
# 1. vis-encoded paths of the files, used in comparison
# 2. un-vis-encoded paths of the files, used for reporting back to Bazel after filtering
path = file.path
return _vis_encode(path) + " " + path

def _fmt_keep_inputs_line(file):
# The tar.keep_inputs.txt file has a single column of vis-encoded paths of the files to keep.
return _vis_encode(file.path)

def _configured_unused_inputs_file(ctx, srcs, keep):
"""
Compute the unused_inputs_list, if configured.
thesayyn marked this conversation as resolved.
Show resolved Hide resolved

Args:
ctx: `tar` rule context. Must provide `_tar_attrs` and a `coreutils_toolchain_type` toolchain.
srcs: sequence or depset. The set of all input sources being provided to the `tar` rule.
keep: sequence or depset. A hardcoded set of sources to consider "used" regardless of whether or not they appear in the mtree.

Returns: file or None. List of inputs unused by the `Tar` action.
"""
if not _is_unused_inputs_enabled(ctx.attr):
return None

coreutils = ctx.toolchains["@aspect_bazel_lib//lib:coreutils_toolchain_type"].coreutils_info.bin

prunable_inputs = ctx.actions.declare_file(ctx.attr.name + ".prunable_inputs.txt")
keep_inputs = ctx.actions.declare_file(ctx.attr.name + ".keep_inputs.txt")
unused_inputs = ctx.actions.declare_file(ctx.attr.name + ".unused_inputs.txt")

ctx.actions.write(
output = prunable_inputs,
content = ctx.actions.args()
.set_param_file_format("multiline")
.add_all(
srcs,
map_each = _fmt_pruanble_inputs_line,
),
)
ctx.actions.write(
output = keep_inputs,
content = ctx.actions.args()
.set_param_file_format("multiline")
.add_all(
keep,
map_each = _fmt_keep_inputs_line,
),
)

# Unused inputs are inputs that:
# * are in the set of PRUNABLE_INPUTS
# * are not found in any content= or contents= keyword in the MTREE
# * are not in the hardcoded KEEP_INPUTS set
#
# Comparison and filtering of PRUNABLE_INPUTS is performed in the vis-encoded representation, stored in field 1,
# before being written out in the un-vis-encoded form Bazel understands, from field 2.
#
# Note: bsdtar (libarchive) accepts both content= and contents= to identify source file:
# ref https://github.com/libarchive/libarchive/blob/a90e9d84ec147be2ef6a720955f3b315cb54bca3/libarchive/archive_read_support_format_mtree.c#L1640
#
# TODO: Make comparison exact by converting all inputs to a canonical vis-encoded form before comparing.
# See also: https://github.com/bazel-contrib/bazel-lib/issues/794
ctx.actions.run_shell(
outputs = [unused_inputs],
inputs = [prunable_inputs, keep_inputs, ctx.file.mtree],
tools = [coreutils],
command = '''
"$COREUTILS" join -v 1 \\
<("$COREUTILS" sort -u "$PRUNABLE_INPUTS") \\
<("$COREUTILS" sort -u \\
<(grep -o '\\bcontents\\?=\\S*' "$MTREE" | "$COREUTILS" cut -d'=' -f 2-) \\
"$KEEP_INPUTS" \\
) \\
| "$COREUTILS" cut -d' ' -f 2- \\
> "$UNUSED_INPUTS"
''',
env = {
"COREUTILS": coreutils.path,
"PRUNABLE_INPUTS": prunable_inputs.path,
"KEEP_INPUTS": keep_inputs.path,
"MTREE": ctx.file.mtree.path,
"UNUSED_INPUTS": unused_inputs.path,
},
mnemonic = "UnusedTarInputs",
)

return unused_inputs

def _tar_impl(ctx):
bsdtar = ctx.toolchains[TAR_TOOLCHAIN_TYPE]
inputs = ctx.files.srcs[:]
Expand All @@ -151,20 +303,43 @@ def _tar_impl(ctx):
src[DefaultInfo].files_to_run.repo_mapping_manifest
for src in ctx.attr.srcs
]
inputs.extend([m for m in repo_mappings if m != None])
repo_mappings = [m for m in repo_mappings if m != None]
inputs.extend(repo_mappings)

srcs_runfiles = [
src[DefaultInfo].default_runfiles.files
for src in ctx.attr.srcs
]

unused_inputs_file = _configured_unused_inputs_file(
ctx,
srcs = depset(direct = ctx.files.srcs + repo_mappings, transitive = srcs_runfiles),
keep = depset(direct = [ctx.file.mtree, bsdtar.tarinfo.binary], transitive = [bsdtar.default.files]),
)
if unused_inputs_file:
inputs.append(unused_inputs_file)

ctx.actions.run(
executable = bsdtar.tarinfo.binary,
inputs = depset(direct = inputs, transitive = [bsdtar.default.files] + [
src[DefaultInfo].default_runfiles.files
for src in ctx.attr.srcs
]),
inputs = depset(direct = inputs, transitive = [bsdtar.default.files] + srcs_runfiles),
outputs = [out],
arguments = [args],
mnemonic = "Tar",
unused_inputs_list = unused_inputs_file,
)

return DefaultInfo(files = depset([out]), runfiles = ctx.runfiles([out]))
# TODO(3.0): Always return a list of providers.
default_info = DefaultInfo(files = depset([out]), runfiles = ctx.runfiles([out]))
if unused_inputs_file:
return [
default_info,
OutputGroupInfo(
# exposed for testing
_unused_inputs_file = depset([unused_inputs_file]),
),
]
else:
return default_info

def _mtree_line(file, type, content = None, uid = "0", gid = "0", time = "1672560000", mode = "0755"):
spec = [
Expand Down Expand Up @@ -284,5 +459,8 @@ tar = rule(
doc = "Rule that executes BSD `tar`. Most users should use the [`tar`](#tar) macro, rather than load this directly.",
implementation = tar_lib.implementation,
attrs = tar_lib.attrs,
toolchains = [tar_lib.toolchain_type],
toolchains = [
tar_lib.toolchain_type,
"@aspect_bazel_lib//lib:coreutils_toolchain_type",
],
)
54 changes: 53 additions & 1 deletion lib/tests/tar/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ load("@aspect_bazel_lib//lib:diff_test.bzl", "diff_test")
load("@aspect_bazel_lib//lib:tar.bzl", "mtree_mutate", "mtree_spec", "tar")
load("@aspect_bazel_lib//lib:testing.bzl", "assert_archive_contains")
load("@bazel_skylib//rules:write_file.bzl", "write_file")
load(":asserts.bzl", "assert_tar_listing")
load(":asserts.bzl", "assert_tar_listing", "assert_unused_listing")

# The examples below work with both source files and generated files.
# Here we generate a file to use in the examples.
Expand Down Expand Up @@ -413,3 +413,55 @@ diff_test(
file1 = ":strip_prefix14",
file2 = "expected14.mtree",
)

#############
# Example 15: mtree subsetting and unused-input pruning.

copy_directory(
name = "unused_srcdir",
src = "srcdir",
out = "unused",
)

mtree_spec(
name = "mtree15",
srcs = [
":treeartifact",
],
)

tar(
name = "tar15",
srcs = [
"treeartifact",
":mtree15", # Not in output archive, but cannot be pruned.
":unused_srcdir",
],
out = "15.tar",
compute_unused_inputs = 1,
mtree = ":mtree15",
)

assert_tar_listing(
name = "test_unused_inputs_ignored",
actual = ":tar15",
expected = [
"drwxr-xr-x 0 0 0 0 Jan 1 2023 lib/",
"drwxr-xr-x 0 0 0 0 Jan 1 2023 lib/tests/",
"drwxr-xr-x 0 0 0 0 Jan 1 2023 lib/tests/tar/",
"drwxr-xr-x 0 0 0 0 Jan 1 2023 lib/tests/tar/treeartifact/",
"-rwxr-xr-x 0 0 0 0 Jan 1 2023 lib/tests/tar/treeartifact/info",
"-rwxr-xr-x 0 0 0 0 Jan 1 2023 lib/tests/tar/treeartifact/pkg",
"-rwxr-xr-x 0 0 0 1 Jan 1 2023 lib/tests/tar/treeartifact/space in name.txt",
],
)

assert_unused_listing(
name = "test_unused_inputs_listed",
actual = ":tar15",
expected = [
"lib/tests/tar/unused/info",
"lib/tests/tar/unused/pkg",
"lib/tests/tar/unused/space in name.txt",
],
)
42 changes: 42 additions & 0 deletions lib/tests/tar/asserts.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,45 @@ def assert_tar_listing(name, actual, expected):
file2 = expected_listing,
timeout = "short",
)

# buildifier: disable=function-docstring
def assert_unused_listing(name, actual, expected):
actual_listing = native.package_relative_label("_{}_actual_listing".format(name))
actual_shortnames = native.package_relative_label("_{}_actual_shortnames".format(name))
actual_shortnames_file = native.package_relative_label("_{}.actual_shortnames".format(name))
expected_listing = native.package_relative_label("_{}_expected".format(name))
expected_listing_file = native.package_relative_label("_{}.expected".format(name))

native.filegroup(
name = actual_listing.name,
output_group = "_unused_inputs_file",
srcs = [actual],
testonly = True,
)

# Trim platform-specific bindir prefix from unused inputs listing. E.g.
# bazel-out/darwin_arm64-fastbuild/bin/lib/tests/tar/unused/info
# ->
# lib/tests/tar/unused/info
native.genrule(
name = actual_shortnames.name,
srcs = [actual_listing],
cmd = "sed 's!^bazel-out/[^/]*/bin/!!' $< >$@",
testonly = True,
outs = [actual_shortnames_file],
)

write_file(
name = expected_listing.name,
testonly = True,
out = expected_listing_file,
content = expected + [""],
newline = "unix",
)

diff_test(
name = name,
file1 = actual_shortnames,
file2 = expected_listing,
timeout = "short",
)
Loading