Skip to content

Commit

Permalink
Add option to start locally scheduling actions above a certain age in…
Browse files Browse the repository at this point in the history
… dynamic scheduling. This allows handling when remote systems get stuck for whatever reason.

Currently, local actions are scheduled LIFO, so we have the best chance of actually doing something faster locally, rather than getting cancelled when the remote branch finishes shortly after. With this change, we switch to FIFO for actions that are above a certain age, where we guess the remote is not going to answer in a reasonable time.

We can't tell what timeouts to expect, since they could be caused by many different parts of the remote system. Instead, this allows adjusting based on experience. If only very few remote calls end up timing out, setting this to ~half the seen timeout should be fine. The more often remote calls time out, the lower you want to set this.

Setting this flag can hide problems in your remote system. Make sure to have proper monitoring and SLOs in place for your remote system before papering it over with this flag.

PiperOrigin-RevId: 422434741
  • Loading branch information
larsrc-google authored and copybara-github committed Jan 18, 2022
1 parent 5de9888 commit 88f605c
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -146,4 +146,14 @@ public class DynamicExecutionOptions extends OptionsBase {
"If set, dynamic execution is turned off until there has been at least one successful"
+ " build.")
public boolean skipFirstBuild;

@Option(
name = "experimental_dynamic_slow_remote_time",
documentationCategory = OptionDocumentationCategory.UNDOCUMENTED,
effectTags = {OptionEffectTag.UNKNOWN},
defaultValue = "0",
help =
"If >0, the number of seconds a dynamically run action must run remote-only before we"
+ " prioritize its local execution to avoid remote timeouts.")
public int slowRemoteTime;
}
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
import com.google.devtools.build.lib.server.FailureDetails.FailureDetail;
import com.google.errorprone.annotations.FormatMethod;
import com.google.errorprone.annotations.FormatString;
import java.time.Duration;
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.List;
Expand Down Expand Up @@ -238,7 +239,6 @@ public ImmutableList<SpawnResult> exec(
waitingLocalJobs.add(localBranch);
tryScheduleLocalJob();
}

remoteBranch.execute(executorService);

try {
Expand Down Expand Up @@ -273,7 +273,18 @@ public ImmutableList<SpawnResult> exec(
private void tryScheduleLocalJob() {
synchronized (waitingLocalJobs) {
while (!waitingLocalJobs.isEmpty() && threadLimiter.tryAcquire()) {
LocalBranch job = waitingLocalJobs.pollLast();
LocalBranch job;
// TODO(b/120910324): Prioritize jobs where the remote branch has already failed.
if (options.slowRemoteTime > 0
&& waitingLocalJobs
.peekFirst()
.getAge()
.compareTo(Duration.ofSeconds(options.slowRemoteTime))
> 0) {
job = waitingLocalJobs.pollFirst();
} else {
job = waitingLocalJobs.pollLast();
}
job.execute(executorService);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@
import com.google.devtools.build.lib.actions.SpawnStrategy;
import com.google.devtools.build.lib.dynamic.DynamicExecutionModule.IgnoreFailureCheck;
import com.google.devtools.build.lib.util.io.FileOutErr;
import java.time.Duration;
import java.time.Instant;
import java.util.Optional;
import java.util.concurrent.Future;
import java.util.concurrent.atomic.AtomicBoolean;
Expand All @@ -51,6 +53,7 @@ class LocalBranch extends Branch {
private final IgnoreFailureCheck ignoreFailureCheck;
private final Function<Spawn, Optional<Spawn>> getExtraSpawnForLocalExecution;
private final AtomicBoolean delayLocalExecution;
private final Instant creationTime = Instant.now();

public LocalBranch(
ActionExecutionContext actionExecutionContext,
Expand All @@ -71,6 +74,10 @@ public DynamicMode getMode() {
return LOCAL;
}

public Duration getAge() {
return Duration.between(creationTime, Instant.now());
}

/**
* Try to run the given spawn locally.
*
Expand Down

0 comments on commit 88f605c

Please sign in to comment.