diff --git a/src/main/java/com/google/devtools/build/lib/exec/ExecutionOptions.java b/src/main/java/com/google/devtools/build/lib/exec/ExecutionOptions.java index d2164b292d457c..fe25394139fc49 100644 --- a/src/main/java/com/google/devtools/build/lib/exec/ExecutionOptions.java +++ b/src/main/java/com/google/devtools/build/lib/exec/ExecutionOptions.java @@ -515,7 +515,11 @@ public boolean usingLocalTestJobs() { help = "The maximum number of attempts to retry if the build encountered remote cache eviction" + " error. A non-zero value will implicitly set" - + " --incompatible_remote_use_new_exit_code_for_lost_inputs to true.") + + " --incompatible_remote_use_new_exit_code_for_lost_inputs to true. A new invocation" + + " id will be generated for each attempt. If you generate invocation id and provide" + + " it to Bazel with --invocation_id, you should not use this flag. Instead, set flag" + + " --incompatible_remote_use_new_exit_code_for_lost_inputs and check for the exit" + + " code 39.") public int remoteRetryOnCacheEviction; /** An enum for specifying different formats of test output. */ diff --git a/src/main/java/com/google/devtools/build/lib/runtime/BlazeCommandDispatcher.java b/src/main/java/com/google/devtools/build/lib/runtime/BlazeCommandDispatcher.java index b6fd01752d96c4..624d170d8e8f10 100644 --- a/src/main/java/com/google/devtools/build/lib/runtime/BlazeCommandDispatcher.java +++ b/src/main/java/com/google/devtools/build/lib/runtime/BlazeCommandDispatcher.java @@ -74,10 +74,14 @@ import java.io.OutputStream; import java.time.Duration; import java.util.ArrayList; +import java.util.HashSet; import java.util.List; import java.util.Optional; +import java.util.Set; +import java.util.UUID; import java.util.concurrent.atomic.AtomicReference; import java.util.logging.Level; +import javax.annotation.Nullable; import net.starlark.java.eval.Starlark; /** @@ -233,7 +237,8 @@ public BlazeCommandResult exec( retrievedShutdownReason, FailureDetails.Command.Code.PREVIOUSLY_SHUTDOWN); } BlazeCommandResult result; - int attempt = 0; + Set attemptedCommandIds = new HashSet<>(); + BlazeCommandResult lastResult = null; while (true) { try { result = @@ -248,11 +253,12 @@ public BlazeCommandResult exec( waitTimeInMs, startupOptionsTaggedWithBazelRc, commandExtensions, - attempt); + attemptedCommandIds, + lastResult); break; } catch (RemoteCacheEvictedException e) { - outErr.printErrLn("Found remote cache eviction error, retrying the build..."); - attempt += 1; + attemptedCommandIds.add(e.getCommandId()); + lastResult = e.getResult(); } } if (result.shutdown()) { @@ -303,7 +309,8 @@ private BlazeCommandResult execExclusively( long waitTimeInMs, Optional>> startupOptionsTaggedWithBazelRc, List commandExtensions, - int attempt) + Set attemptedCommandIds, + @Nullable BlazeCommandResult lastResult) throws RemoteCacheEvictedException { // Record the start time for the profiler. Do not put anything before this! long execStartTimeNanos = runtime.getClock().nanoTime(); @@ -331,6 +338,19 @@ private BlazeCommandResult execExclusively( firstContactTime, commandExtensions, this::setShutdownReason); + + if (!attemptedCommandIds.isEmpty()) { + if (attemptedCommandIds.contains(env.getCommandId())) { + outErr.printErrLn( + String.format( + "Failed to retry the build: invocation id `%s` has already been used.", + env.getCommandId())); + return Preconditions.checkNotNull(lastResult); + } else { + outErr.printErrLn("Found remote cache eviction error, retrying the build..."); + } + } + CommonCommandOptions commonOptions = options.getOptions(CommonCommandOptions.class); boolean tracerEnabled = false; if (commonOptions.enableTracer == TriState.YES) { @@ -655,8 +675,8 @@ private BlazeCommandResult execExclusively( if (newResult.getExitCode().equals(ExitCode.REMOTE_CACHE_EVICTED)) { var executionOptions = Preconditions.checkNotNull(options.getOptions(ExecutionOptions.class)); - if (attempt < executionOptions.remoteRetryOnCacheEviction) { - throw new RemoteCacheEvictedException(); + if (attemptedCommandIds.size() < executionOptions.remoteRetryOnCacheEviction) { + throw new RemoteCacheEvictedException(env.getCommandId(), newResult); } } @@ -696,7 +716,23 @@ private BlazeCommandResult execExclusively( } } - private static class RemoteCacheEvictedException extends IOException {} + private static class RemoteCacheEvictedException extends IOException { + private final UUID commandId; + private final BlazeCommandResult result; + + private RemoteCacheEvictedException(UUID commandId, BlazeCommandResult result) { + this.commandId = commandId; + this.result = result; + } + + public UUID getCommandId() { + return commandId; + } + + public BlazeCommandResult getResult() { + return result; + } + } private static void replayEarlyExitEvents( OutErr outErr, diff --git a/src/test/shell/bazel/remote/build_without_the_bytes_test.sh b/src/test/shell/bazel/remote/build_without_the_bytes_test.sh index 7d3ecbf7c98930..ab186eca0707c9 100755 --- a/src/test/shell/bazel/remote/build_without_the_bytes_test.sh +++ b/src/test/shell/bazel/remote/build_without_the_bytes_test.sh @@ -1785,9 +1785,70 @@ EOF --experimental_remote_cache_eviction_retries=5 \ //a:bar >& $TEST_log || fail "Failed to build" + expect_log 'Failed to fetch blobs because they do not exist remotely.' expect_log "Found remote cache eviction error, retrying the build..." } +function test_remote_cache_eviction_retries_with_fixed_invocation_id() { + mkdir -p a + + cat > a/BUILD <<'EOF' +genrule( + name = 'foo', + srcs = ['foo.in'], + outs = ['foo.out'], + cmd = 'cat $(SRCS) > $@', +) + +genrule( + name = 'bar', + srcs = ['foo.out', 'bar.in'], + outs = ['bar.out'], + cmd = 'cat $(SRCS) > $@', + tags = ['no-remote-exec'], +) +EOF + + echo foo > a/foo.in + echo bar > a/bar.in + + # Populate remote cache + bazel build \ + --remote_executor=grpc://localhost:${worker_port} \ + --remote_download_minimal \ + //a:bar >& $TEST_log || fail "Failed to build" + + bazel clean + + # Clean build, foo.out isn't downloaded + bazel build \ + --remote_executor=grpc://localhost:${worker_port} \ + --remote_download_minimal \ + //a:bar >& $TEST_log || fail "Failed to build" + + if [[ -f bazel-bin/a/foo.out ]]; then + fail "Expected intermediate output bazel-bin/a/foo.out to not be downloaded" + fi + + # Evict blobs from remote cache + stop_worker + start_worker + + echo "updated bar" > a/bar.in + + # Incremental build triggers remote cache eviction error and Bazel tries to + # retry the build but failed because the invocation id is the same. + bazel build \ + --invocation_id=91648f28-6081-4af7-9374-cdfd3cd36ef2 \ + --remote_executor=grpc://localhost:${worker_port} \ + --remote_download_minimal \ + --experimental_remote_cache_eviction_retries=5 \ + //a:bar >& $TEST_log && fail "Expected build to fail" + + expect_log 'Failed to fetch blobs because they do not exist remotely.' + expect_log 'Failed to retry the build: invocation id `91648f28-6081-4af7-9374-cdfd3cd36ef2` has already been used.' +} + function test_download_toplevel_symlinks_runfiles() { cat > rules.bzl <