Skip to content

Commit

Permalink
Don't rewind the build if invocation id stays the same
Browse files Browse the repository at this point in the history
Invocation id (`BUILD_ID` in code) is expected to be different for each command. When retrying the build, we rely on Bazel generating a new invocation id for a new attempt. However, if flag `--invocation_id` is set, Bazel just uses the provided value instead of generating a new one. In this case, invocation id stays the same among multiple attempts which could cause issues like #18547.

This PR fixes that by not retrying the build if the invocation id is same to previous attempt. Also updated the doc to point this requirement out.

Closes #18591.

PiperOrigin-RevId: 539946840
Change-Id: I6ae85ea923b0fdbff97fe2e44e36995f0205f8a1
  • Loading branch information
coeuvre authored and copybara-github committed Jun 13, 2023
1 parent 31ce017 commit d9b94cb
Show file tree
Hide file tree
Showing 3 changed files with 110 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -515,7 +515,11 @@ public boolean usingLocalTestJobs() {
help =
"The maximum number of attempts to retry if the build encountered remote cache eviction"
+ " error. A non-zero value will implicitly set"
+ " --incompatible_remote_use_new_exit_code_for_lost_inputs to true.")
+ " --incompatible_remote_use_new_exit_code_for_lost_inputs to true. A new invocation"
+ " id will be generated for each attempt. If you generate invocation id and provide"
+ " it to Bazel with --invocation_id, you should not use this flag. Instead, set flag"
+ " --incompatible_remote_use_new_exit_code_for_lost_inputs and check for the exit"
+ " code 39.")
public int remoteRetryOnCacheEviction;

/** An enum for specifying different formats of test output. */
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,10 +74,14 @@
import java.io.OutputStream;
import java.time.Duration;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.atomic.AtomicReference;
import java.util.logging.Level;
import javax.annotation.Nullable;
import net.starlark.java.eval.Starlark;

/**
Expand Down Expand Up @@ -233,7 +237,8 @@ public BlazeCommandResult exec(
retrievedShutdownReason, FailureDetails.Command.Code.PREVIOUSLY_SHUTDOWN);
}
BlazeCommandResult result;
int attempt = 0;
Set<UUID> attemptedCommandIds = new HashSet<>();
BlazeCommandResult lastResult = null;
while (true) {
try {
result =
Expand All @@ -248,11 +253,12 @@ public BlazeCommandResult exec(
waitTimeInMs,
startupOptionsTaggedWithBazelRc,
commandExtensions,
attempt);
attemptedCommandIds,
lastResult);
break;
} catch (RemoteCacheEvictedException e) {
outErr.printErrLn("Found remote cache eviction error, retrying the build...");
attempt += 1;
attemptedCommandIds.add(e.getCommandId());
lastResult = e.getResult();
}
}
if (result.shutdown()) {
Expand Down Expand Up @@ -303,7 +309,8 @@ private BlazeCommandResult execExclusively(
long waitTimeInMs,
Optional<List<Pair<String, String>>> startupOptionsTaggedWithBazelRc,
List<Any> commandExtensions,
int attempt)
Set<UUID> attemptedCommandIds,
@Nullable BlazeCommandResult lastResult)
throws RemoteCacheEvictedException {
// Record the start time for the profiler. Do not put anything before this!
long execStartTimeNanos = runtime.getClock().nanoTime();
Expand Down Expand Up @@ -331,6 +338,19 @@ private BlazeCommandResult execExclusively(
firstContactTime,
commandExtensions,
this::setShutdownReason);

if (!attemptedCommandIds.isEmpty()) {
if (attemptedCommandIds.contains(env.getCommandId())) {
outErr.printErrLn(
String.format(
"Failed to retry the build: invocation id `%s` has already been used.",
env.getCommandId()));
return Preconditions.checkNotNull(lastResult);
} else {
outErr.printErrLn("Found remote cache eviction error, retrying the build...");
}
}

CommonCommandOptions commonOptions = options.getOptions(CommonCommandOptions.class);
boolean tracerEnabled = false;
if (commonOptions.enableTracer == TriState.YES) {
Expand Down Expand Up @@ -655,8 +675,8 @@ private BlazeCommandResult execExclusively(
if (newResult.getExitCode().equals(ExitCode.REMOTE_CACHE_EVICTED)) {
var executionOptions =
Preconditions.checkNotNull(options.getOptions(ExecutionOptions.class));
if (attempt < executionOptions.remoteRetryOnCacheEviction) {
throw new RemoteCacheEvictedException();
if (attemptedCommandIds.size() < executionOptions.remoteRetryOnCacheEviction) {
throw new RemoteCacheEvictedException(env.getCommandId(), newResult);
}
}

Expand Down Expand Up @@ -696,7 +716,23 @@ private BlazeCommandResult execExclusively(
}
}

private static class RemoteCacheEvictedException extends IOException {}
private static class RemoteCacheEvictedException extends IOException {
private final UUID commandId;
private final BlazeCommandResult result;

private RemoteCacheEvictedException(UUID commandId, BlazeCommandResult result) {
this.commandId = commandId;
this.result = result;
}

public UUID getCommandId() {
return commandId;
}

public BlazeCommandResult getResult() {
return result;
}
}

private static void replayEarlyExitEvents(
OutErr outErr,
Expand Down
61 changes: 61 additions & 0 deletions src/test/shell/bazel/remote/build_without_the_bytes_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -1785,9 +1785,70 @@ EOF
--experimental_remote_cache_eviction_retries=5 \
//a:bar >& $TEST_log || fail "Failed to build"

expect_log 'Failed to fetch blobs because they do not exist remotely.'
expect_log "Found remote cache eviction error, retrying the build..."
}

function test_remote_cache_eviction_retries_with_fixed_invocation_id() {
mkdir -p a

cat > a/BUILD <<'EOF'
genrule(
name = 'foo',
srcs = ['foo.in'],
outs = ['foo.out'],
cmd = 'cat $(SRCS) > $@',
)
genrule(
name = 'bar',
srcs = ['foo.out', 'bar.in'],
outs = ['bar.out'],
cmd = 'cat $(SRCS) > $@',
tags = ['no-remote-exec'],
)
EOF

echo foo > a/foo.in
echo bar > a/bar.in

# Populate remote cache
bazel build \
--remote_executor=grpc://localhost:${worker_port} \
--remote_download_minimal \
//a:bar >& $TEST_log || fail "Failed to build"

bazel clean

# Clean build, foo.out isn't downloaded
bazel build \
--remote_executor=grpc://localhost:${worker_port} \
--remote_download_minimal \
//a:bar >& $TEST_log || fail "Failed to build"

if [[ -f bazel-bin/a/foo.out ]]; then
fail "Expected intermediate output bazel-bin/a/foo.out to not be downloaded"
fi

# Evict blobs from remote cache
stop_worker
start_worker

echo "updated bar" > a/bar.in

# Incremental build triggers remote cache eviction error and Bazel tries to
# retry the build but failed because the invocation id is the same.
bazel build \
--invocation_id=91648f28-6081-4af7-9374-cdfd3cd36ef2 \
--remote_executor=grpc://localhost:${worker_port} \
--remote_download_minimal \
--experimental_remote_cache_eviction_retries=5 \
//a:bar >& $TEST_log && fail "Expected build to fail"

expect_log 'Failed to fetch blobs because they do not exist remotely.'
expect_log 'Failed to retry the build: invocation id `91648f28-6081-4af7-9374-cdfd3cd36ef2` has already been used.'
}

function test_download_toplevel_symlinks_runfiles() {
cat > rules.bzl <<EOF
def _symlink_rule_impl(ctx):
Expand Down

0 comments on commit d9b94cb

Please sign in to comment.