Skip to content

Commit

Permalink
Stop auto-followers on shutdown (elastic#40124)
Browse files Browse the repository at this point in the history
When shutting down a node, auto-followers will keep trying to run. This
is happening even as transport services and other components are being
closed. In some cases, this can lead to a stack overflow as we rapidly
try to check the license state of the remote cluster, cannot because
the transport service is shut down, and then immediately retry
again. This can happen faster than the shutdown, and we die with stack
overflow. This commit adds a stop command to auto-followers so that this
retry loop occurs at most once on shutdown.
  • Loading branch information
jasontedor authored Mar 18, 2019
1 parent 8fb5c32 commit ad90055
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -179,12 +179,16 @@ public Collection<Object> createComponents(
CcrRestoreSourceService restoreSourceService = new CcrRestoreSourceService(threadPool, ccrSettings);
this.restoreSourceService.set(restoreSourceService);
return Arrays.asList(
ccrLicenseChecker,
restoreSourceService,
new CcrRepositoryManager(settings, clusterService, client),
new AutoFollowCoordinator(settings, client, clusterService, ccrLicenseChecker,
threadPool::relativeTimeInMillis, threadPool::absoluteTimeInMillis)
);
ccrLicenseChecker,
restoreSourceService,
new CcrRepositoryManager(settings, clusterService, client),
new AutoFollowCoordinator(
settings,
client,
clusterService,
ccrLicenseChecker,
threadPool::relativeTimeInMillis,
threadPool::absoluteTimeInMillis));
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.common.collect.CopyOnWriteHashMap;
import org.elasticsearch.common.collect.Tuple;
import org.elasticsearch.common.component.AbstractLifecycleComponent;
import org.elasticsearch.common.component.Lifecycle;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.common.util.concurrent.AtomicArray;
Expand Down Expand Up @@ -66,7 +68,7 @@
* A component that runs only on the elected master node and follows leader indices automatically
 * if they match with an auto-follow pattern that is defined in {@link AutoFollowMetadata}.
*/
public class AutoFollowCoordinator implements ClusterStateListener {
public class AutoFollowCoordinator extends AbstractLifecycleComponent implements ClusterStateListener {

private static final Logger LOGGER = LogManager.getLogger(AutoFollowCoordinator.class);
private static final int MAX_AUTO_FOLLOW_ERRORS = 256;
Expand Down Expand Up @@ -117,6 +119,26 @@ protected boolean removeEldestEntry(final Map.Entry<String, Tuple<Long, Elastics
waitForMetadataTimeOut = CcrSettings.CCR_WAIT_FOR_METADATA_TIMEOUT.get(settings);
}

@Override
protected void doStart() {
    // Intentionally a no-op: auto-followers are not created here but from
    // cluster-state updates (see the code that populates newAutoFollowers and
    // calls autoFollower.start() when the lifecycle state is STARTED).
}

@Override
protected void doStop() {
    LOGGER.trace("stopping all auto-followers");
    /*
     * No synchronization is required here: the field is volatile and the map is a copy-on-write map. Any auto-follower created
     * after this point will not be started, because the coordinator's lifecycle state is checked before starting it.
     */
    for (final AutoFollower autoFollower : autoFollowers.values()) {
        autoFollower.stop();
    }
}

@Override
protected void doClose() {
    // Intentionally a no-op: stopping the auto-followers happens in doStop,
    // and this component holds no other resources that need closing.
}

public synchronized AutoFollowStats getStats() {
final Map<String, AutoFollower> autoFollowers = this.autoFollowers;
final TreeMap<String, AutoFollowedCluster> timesSinceLastAutoFollowPerRemoteCluster = new TreeMap<>();
Expand Down Expand Up @@ -246,8 +268,10 @@ public void clusterStateProcessed(String source, ClusterState oldState, ClusterS

};
newAutoFollowers.put(remoteCluster, autoFollower);
LOGGER.info("starting auto follower for remote cluster [{}]", remoteCluster);
autoFollower.start();
LOGGER.info("starting auto-follower for remote cluster [{}]", remoteCluster);
if (lifecycleState() == Lifecycle.State.STARTED) {
autoFollower.start();
}
}

List<String> removedRemoteClusters = new ArrayList<>();
Expand All @@ -257,13 +281,15 @@ public void clusterStateProcessed(String source, ClusterState oldState, ClusterS
boolean exist = autoFollowMetadata.getPatterns().values().stream()
.anyMatch(pattern -> pattern.getRemoteCluster().equals(remoteCluster));
if (exist == false) {
LOGGER.info("removing auto follower for remote cluster [{}]", remoteCluster);
LOGGER.info("removing auto-follower for remote cluster [{}]", remoteCluster);
autoFollower.removed = true;
removedRemoteClusters.add(remoteCluster);
} else if (autoFollower.remoteClusterConnectionMissing) {
LOGGER.info("retrying auto follower [{}] after remote cluster connection was missing", remoteCluster);
LOGGER.info("retrying auto-follower for remote cluster [{}] after remote cluster connection was missing", remoteCluster);
autoFollower.remoteClusterConnectionMissing = false;
autoFollower.start();
if (lifecycleState() == Lifecycle.State.STARTED) {
autoFollower.start();
}
}
}
assert assertNoOtherActiveAutoFollower(newAutoFollowers);
Expand Down Expand Up @@ -313,6 +339,7 @@ abstract static class AutoFollower {
volatile boolean removed = false;
private volatile CountDown autoFollowPatternsCountDown;
private volatile AtomicArray<AutoFollowResult> autoFollowResults;
private volatile boolean stop;

AutoFollower(final String remoteCluster,
final Consumer<List<AutoFollowResult>> statsUpdater,
Expand All @@ -325,6 +352,10 @@ abstract static class AutoFollower {
}

void start() {
if (stop) {
LOGGER.trace("auto-follower is stopped for remote cluster [{}]", remoteCluster);
return;
}
if (removed) {
// This check exists to avoid two AutoFollower instances for a single remote cluster.
// (If an auto follow pattern is deleted and then added back quickly enough then
Expand Down Expand Up @@ -389,6 +420,11 @@ void start() {
});
}

void stop() {
    // Flag this auto-follower as stopped; start() checks the flag and bails out,
    // which breaks the retry loop during node shutdown.
    LOGGER.trace("stopping auto-follower for remote cluster [{}]", remoteCluster);
    this.stop = true;
}

private void autoFollowIndices(final AutoFollowMetadata autoFollowMetadata,
final ClusterState clusterState,
final ClusterState remoteClusterState,
Expand Down

0 comments on commit ad90055

Please sign in to comment.