From 1940dfb07424ff7ac6d1d8cf7b776b4dc04b21e8 Mon Sep 17 00:00:00 2001 From: Chi Wang Date: Fri, 21 Apr 2023 12:33:57 +0000 Subject: [PATCH] Automatically retry the build if encountered remote cache eviction error (#18171) With https://github.com/bazelbuild/bazel/pull/17358, Bazel will exit with code 39 if remote cache evicts blobs during the build. With https://github.com/bazelbuild/bazel/pull/17462 and #17747, Bazel is able to continue the build without bazel clean or bazel shutdown. However, even with https://github.com/bazelbuild/bazel/pull/17639 and following changes to extend the lease, remote cache can still evict blobs in some rare cases. Based on above changes, this PR makes bazel retry the invocation if it encountered the remote cache eviction error during previous invocation if `--experimental_remote_cache_eviction_retries` is set, or **build rewinding**. ``` $ bazel build --experimental_remote_cache_eviction_retries=5 ... INFO: Invocation ID: b7348bfa-9446-4c72-a888-0a0ad012f225 Loading: Loading: Loading: 0 packages loaded Analyzing: target //a:bar (0 packages loaded, 0 targets configured) INFO: Analyzed target //a:bar (0 packages loaded, 0 targets configured). INFO: Found 1 target... [0 / 2] [Prepa] BazelWorkspaceStatusAction stable-status.txt ERROR: .../workspace/a/BUILD:8:8: Executing genrule //a:bar failed: Failed to fetch blobs because they do not exist remotely: Missing digest: b5bb9d8014a0f9b1d61e21e796d78dccdf1352f23cd32812f4850b878ae4944c/4 Target //a:bar failed to build Use --verbose_failures to see the command lines of failed build steps. INFO: Elapsed time: 0.447s, Critical Path: 0.05s INFO: 2 processes: 2 internal. ERROR: Build did NOT complete successfully Found remote cache eviction error, retrying the build... INFO: Invocation ID: 983f60dc-8bb9-4b82-aa33-a378469ce140 Loading: Loading: Loading: 0 packages loaded Analyzing: target //a:bar (0 packages loaded, 0 targets configured) INFO: Analyzed target //a:bar (0 packages loaded, 0 targets configured). INFO: Found 1 target... [0 / 2] [Prepa] BazelWorkspaceStatusAction stable-status.txt Target //a:bar up-to-date: bazel-bin/a/bar.out INFO: Elapsed time: 0.866s, Critical Path: 0.35s INFO: 3 processes: 1 internal, 1 processwrapper-sandbox, 1 remote. INFO: Build completed successfully, 3 total actions $ ``` Part of https://github.com/bazelbuild/bazel/pull/16660. Closes #17711. PiperOrigin-RevId: 520610524 Change-Id: I20d43d1968767a03250b9c8f8a6dda4e056d4f52 --- .../build/lib/exec/ExecutionOptions.java | 11 ++++ .../lib/remote/RemoteActionInputFetcher.java | 5 +- .../build/lib/remote/RemoteModule.java | 4 +- .../build/lib/remote/RemoteSpawnRunner.java | 3 +- .../lib/runtime/BlazeCommandDispatcher.java | 56 +++++++++++++----- .../BuildWithoutTheBytesIntegrationTest.java | 4 +- .../remote/build_without_the_bytes_test.sh | 59 +++++++++++++++++++ 7 files changed, 119 insertions(+), 23 deletions(-) diff --git a/src/main/java/com/google/devtools/build/lib/exec/ExecutionOptions.java b/src/main/java/com/google/devtools/build/lib/exec/ExecutionOptions.java index f964d876a7e0c6..a32b9efc7e59d8 100644 --- a/src/main/java/com/google/devtools/build/lib/exec/ExecutionOptions.java +++ b/src/main/java/com/google/devtools/build/lib/exec/ExecutionOptions.java @@ -494,6 +494,17 @@ public boolean usingLocalTestJobs() { + "test log. Otherwise, Bazel generates a test.xml as part of the test action.") public boolean splitXmlGeneration; + @Option( + name = "experimental_remote_cache_eviction_retries", + defaultValue = "0", + documentationCategory = OptionDocumentationCategory.REMOTE, + effectTags = {OptionEffectTag.EXECUTION}, + help = + "The maximum number of attempts to retry if the build encountered remote cache eviction" + + " error. A non-zero value will implicitly set" + + " --incompatible_remote_use_new_exit_code_for_lost_inputs to true.") + public int remoteRetryOnCacheEviction; + /** An enum for specifying different formats of test output. */ public enum TestOutputFormat { SUMMARY, // Provide summary output only. diff --git a/src/main/java/com/google/devtools/build/lib/remote/RemoteActionInputFetcher.java b/src/main/java/com/google/devtools/build/lib/remote/RemoteActionInputFetcher.java index ed9818a51a2904..e13606c05cd3e1 100644 --- a/src/main/java/com/google/devtools/build/lib/remote/RemoteActionInputFetcher.java +++ b/src/main/java/com/google/devtools/build/lib/remote/RemoteActionInputFetcher.java @@ -158,10 +158,7 @@ protected Completable onErrorResumeNext(Throwable error) { new EnvironmentalExecException( (BulkTransferException) error, FailureDetail.newBuilder() - .setMessage( - "Failed to fetch blobs because they do not exist remotely." - + " Build without the Bytes does not work if your remote" - + " cache evicts blobs during builds") + .setMessage("Failed to fetch blobs because they do not exist remotely.") .setSpawn(FailureDetails.Spawn.newBuilder().setCode(code)) .build()); } diff --git a/src/main/java/com/google/devtools/build/lib/remote/RemoteModule.java b/src/main/java/com/google/devtools/build/lib/remote/RemoteModule.java index 979df0fab81636..bd0f63c03ee44a 100644 --- a/src/main/java/com/google/devtools/build/lib/remote/RemoteModule.java +++ b/src/main/java/com/google/devtools/build/lib/remote/RemoteModule.java @@ -907,6 +907,7 @@ public void executorInit(CommandEnvironment env, BuildRequest request, ExecutorB actionContextProvider.setTempPathGenerator(tempPathGenerator); + ExecutionOptions executionOptions = env.getOptions().getOptions(ExecutionOptions.class); RemoteOptions remoteOptions = Preconditions.checkNotNull( env.getOptions().getOptions(RemoteOptions.class), "RemoteOptions"); @@ -929,7 +930,8 @@ public void executorInit(CommandEnvironment env, BuildRequest request, ExecutorB tempPathGenerator, patternsToDownload, outputPermissions, - remoteOptions.useNewExitCodeForLostInputs); + remoteOptions.useNewExitCodeForLostInputs + || (executionOptions != null && executionOptions.remoteRetryOnCacheEviction > 0)); env.getEventBus().register(actionInputFetcher); builder.setActionInputPrefetcher(actionInputFetcher); actionContextProvider.setActionInputFetcher(actionInputFetcher); diff --git a/src/main/java/com/google/devtools/build/lib/remote/RemoteSpawnRunner.java b/src/main/java/com/google/devtools/build/lib/remote/RemoteSpawnRunner.java index 0f7bb077577efa..1eba36e9f1d723 100644 --- a/src/main/java/com/google/devtools/build/lib/remote/RemoteSpawnRunner.java +++ b/src/main/java/com/google/devtools/build/lib/remote/RemoteSpawnRunner.java @@ -557,7 +557,8 @@ private SpawnResult handleError( catastrophe = true; } else if (remoteCacheFailed) { status = Status.REMOTE_CACHE_FAILED; - if (remoteOptions.useNewExitCodeForLostInputs) { + if (remoteOptions.useNewExitCodeForLostInputs + || executionOptions.remoteRetryOnCacheEviction > 0) { detailedCode = FailureDetails.Spawn.Code.REMOTE_CACHE_EVICTED; } else { detailedCode = FailureDetails.Spawn.Code.REMOTE_CACHE_FAILED; diff --git a/src/main/java/com/google/devtools/build/lib/runtime/BlazeCommandDispatcher.java b/src/main/java/com/google/devtools/build/lib/runtime/BlazeCommandDispatcher.java index 0fbd5a2d3f0327..b91d69e7384ac3 100644 --- a/src/main/java/com/google/devtools/build/lib/runtime/BlazeCommandDispatcher.java +++ b/src/main/java/com/google/devtools/build/lib/runtime/BlazeCommandDispatcher.java @@ -43,6 +43,7 @@ import com.google.devtools.build.lib.events.PrintingEventHandler; import com.google.devtools.build.lib.events.Reporter; import com.google.devtools.build.lib.events.StoredEventHandler; +import com.google.devtools.build.lib.exec.ExecutionOptions; import com.google.devtools.build.lib.profiler.MemoryProfiler; import com.google.devtools.build.lib.profiler.Profiler; import com.google.devtools.build.lib.profiler.SilentCloseable; @@ -54,6 +55,7 @@ import com.google.devtools.build.lib.util.AnsiStrippingOutputStream; import com.google.devtools.build.lib.util.DebugLoggerConfigurator; import com.google.devtools.build.lib.util.DetailedExitCode; +import com.google.devtools.build.lib.util.ExitCode; import com.google.devtools.build.lib.util.InterruptedFailureDetails; import com.google.devtools.build.lib.util.LoggingUtil; import com.google.devtools.build.lib.util.Pair; @@ -230,18 +232,29 @@ public BlazeCommandResult exec( return createDetailedCommandResult( retrievedShutdownReason, FailureDetails.Command.Code.PREVIOUSLY_SHUTDOWN); } - BlazeCommandResult result = - execExclusively( - originalCommandLine, - invocationPolicy, - args, - outErr, - firstContactTimeMillis, - commandName, - command, - waitTimeInMs, - startupOptionsTaggedWithBazelRc, - commandExtensions); + BlazeCommandResult result; + int attempt = 0; + while (true) { + try { + result = + execExclusively( + originalCommandLine, + invocationPolicy, + args, + outErr, + firstContactTimeMillis, + commandName, + command, + waitTimeInMs, + startupOptionsTaggedWithBazelRc, + commandExtensions, + attempt); + break; + } catch (RemoteCacheEvictedException e) { + outErr.printErrLn("Found remote cache eviction error, retrying the build..."); + attempt += 1; + } + } if (result.shutdown()) { setShutdownReason( "Server shut down " @@ -289,7 +302,9 @@ private BlazeCommandResult execExclusively( BlazeCommand command, long waitTimeInMs, Optional>> startupOptionsTaggedWithBazelRc, - List commandExtensions) { + List commandExtensions, + int attempt) + throws RemoteCacheEvictedException { // Record the start time for the profiler. Do not put anything before this! long execStartTimeNanos = runtime.getClock().nanoTime(); @@ -631,7 +646,18 @@ private BlazeCommandResult execExclusively( } needToCallAfterCommand = false; - return runtime.afterCommand(env, result); + var newResult = runtime.afterCommand(env, result); + if (newResult.getExitCode().equals(ExitCode.REMOTE_CACHE_EVICTED)) { + var executionOptions = + Preconditions.checkNotNull(options.getOptions(ExecutionOptions.class)); + if (attempt < executionOptions.remoteRetryOnCacheEviction) { + throw new RemoteCacheEvictedException(); + } + } + + return newResult; + } catch (RemoteCacheEvictedException e) { + throw e; } catch (Throwable e) { logger.atSevere().withCause(e).log("Shutting down due to exception"); Crash crash = Crash.from(e); @@ -665,6 +691,8 @@ private BlazeCommandResult execExclusively( } } + private static class RemoteCacheEvictedException extends IOException {} + private static void replayEarlyExitEvents( OutErr outErr, BlazeOptionHandler optionHandler, diff --git a/src/test/java/com/google/devtools/build/lib/remote/BuildWithoutTheBytesIntegrationTest.java b/src/test/java/com/google/devtools/build/lib/remote/BuildWithoutTheBytesIntegrationTest.java index 6828c9abc75b09..c6ea8418c1a46d 100644 --- a/src/test/java/com/google/devtools/build/lib/remote/BuildWithoutTheBytesIntegrationTest.java +++ b/src/test/java/com/google/devtools/build/lib/remote/BuildWithoutTheBytesIntegrationTest.java @@ -528,9 +528,7 @@ public void remoteCacheEvictBlobs_whenPrefetchingInput_exitWithCode39() throws E // Assert: Exit code is 39 assertThat(error) .hasMessageThat() - .contains( - "Build without the Bytes does not work if your remote cache evicts blobs" - + " during builds"); + .contains("Failed to fetch blobs because they do not exist remotely"); assertThat(error).hasMessageThat().contains(String.format("%s/%s", hashCode, bytes.length)); assertThat(error.getDetailedExitCode().getExitCode().getNumericExitCode()).isEqualTo(39); } diff --git a/src/test/shell/bazel/remote/build_without_the_bytes_test.sh b/src/test/shell/bazel/remote/build_without_the_bytes_test.sh index 33ab3bd475e9d4..7768f25b5e98e0 100755 --- a/src/test/shell/bazel/remote/build_without_the_bytes_test.sh +++ b/src/test/shell/bazel/remote/build_without_the_bytes_test.sh @@ -1627,4 +1627,63 @@ end_of_record" expect_log "$expected_result" } +function test_remote_cache_eviction_retries() { + mkdir -p a + + cat > a/BUILD <<'EOF' +genrule( + name = 'foo', + srcs = ['foo.in'], + outs = ['foo.out'], + cmd = 'cat $(SRCS) > $@', +) + +genrule( + name = 'bar', + srcs = ['foo.out', 'bar.in'], + outs = ['bar.out'], + cmd = 'cat $(SRCS) > $@', + tags = ['no-remote-exec'], +) +EOF + + echo foo > a/foo.in + echo bar > a/bar.in + + # Populate remote cache + bazel build \ + --remote_executor=grpc://localhost:${worker_port} \ + --remote_download_minimal \ + //a:bar >& $TEST_log || fail "Failed to build" + + bazel clean + + # Clean build, foo.out isn't downloaded + bazel build \ + --remote_executor=grpc://localhost:${worker_port} \ + --remote_download_minimal \ + //a:bar >& $TEST_log || fail "Failed to build" + + if [[ -f bazel-bin/a/foo.out ]]; then + fail "Expected intermediate output bazel-bin/a/foo.out to not be downloaded" + fi + + # Evict blobs from remote cache + stop_worker + start_worker + + echo "updated bar" > a/bar.in + + # Incremental build triggers remote cache eviction error but Bazel + # automatically retries the build and reruns the generating actions for + # missing blobs + bazel build \ + --remote_executor=grpc://localhost:${worker_port} \ + --remote_download_minimal \ + --experimental_remote_cache_eviction_retries=5 \ + //a:bar >& $TEST_log || fail "Failed to build" + + expect_log "Found remote cache eviction error, retrying the build..." +} + run_suite "Build without the Bytes tests"