Skip to content

Commit

Permalink
[Java] Introduce SnapshotDurationTracker (#1532)
Browse files Browse the repository at this point in the history
  • Loading branch information
eliquinox authored Nov 20, 2023
1 parent be772c6 commit af560d6
Show file tree
Hide file tree
Showing 7 changed files with 289 additions and 15 deletions.
11 changes: 11 additions & 0 deletions aeron-client/src/main/java/io/aeron/AeronCounters.java
Original file line number Diff line number Diff line change
Expand Up @@ -383,6 +383,17 @@ public final class AeronCounters
*/
public static final int NODE_CONTROL_TOGGLE_TYPE_ID = 233;

/**
 * The type id of the {@link Counter} used for keeping track of the maximum total snapshot duration.
 */
public static final int CLUSTER_TOTAL_MAX_SNAPSHOT_DURATION_TYPE_ID = 234;

/**
 * The type id of the {@link Counter} used for keeping track of the number of times the total snapshot
 * duration has exceeded the threshold.
 */
public static final int CLUSTER_TOTAL_SNAPSHOT_DURATION_THRESHOLD_EXCEEDED_TYPE_ID = 235;

// Private constructor: no instances can be created from outside this class.
private AeronCounters()
{
}
Expand Down
92 changes: 92 additions & 0 deletions aeron-cluster/src/main/java/io/aeron/cluster/ConsensusModule.java
Original file line number Diff line number Diff line change
Expand Up @@ -751,6 +751,18 @@ public static final class Configuration
*/
public static final long CYCLE_THRESHOLD_DEFAULT_NS = TimeUnit.MILLISECONDS.toNanos(1000);

/**
 * Property name for the threshold value against which the total snapshot duration is compared when
 * tracking breaches.
 */
public static final String TOTAL_SNAPSHOT_DURATION_THRESHOLD_PROP_NAME =
"aeron.cluster.total.snapshot.threshold";

/**
 * Default threshold value, in nanoseconds (1000 milliseconds), against which the total snapshot duration
 * is compared when tracking breaches.
 */
public static final long TOTAL_SNAPSHOT_DURATION_THRESHOLD_DEFAULT_NS =
TimeUnit.MILLISECONDS.toNanos(1000);

/**
* Default timeout a leader will wait on getting termination ACKs from followers.
*/
Expand Down Expand Up @@ -1100,6 +1112,18 @@ public static long cycleThresholdNs()
return getDurationInNanos(CYCLE_THRESHOLD_PROP_NAME, CYCLE_THRESHOLD_DEFAULT_NS);
}

/**
 * Resolve the threshold against which the total snapshot duration is compared when counting breaches.
 * <p>
 * Delegates to {@code getDurationInNanos} using
 * {@link ConsensusModule.Configuration#TOTAL_SNAPSHOT_DURATION_THRESHOLD_PROP_NAME} as the property name and
 * {@link ConsensusModule.Configuration#TOTAL_SNAPSHOT_DURATION_THRESHOLD_DEFAULT_NS} as the default.
 *
 * @return threshold value in nanoseconds.
 */
public static long totalSnapshotDurationThresholdNs()
{
    final long thresholdNs = getDurationInNanos(
        TOTAL_SNAPSHOT_DURATION_THRESHOLD_PROP_NAME, TOTAL_SNAPSHOT_DURATION_THRESHOLD_DEFAULT_NS);

    return thresholdNs;
}

/**
* Size in bytes of the error buffer in the mark file.
*
Expand Down Expand Up @@ -1392,6 +1416,7 @@ public static final class Context implements Cloneable
private long electionStatusIntervalNs = Configuration.electionStatusIntervalNs();
private long terminationTimeoutNs = Configuration.terminationTimeoutNs();
private long cycleThresholdNs = Configuration.cycleThresholdNs();
private long totalSnapshotDurationThresholdNs = Configuration.totalSnapshotDurationThresholdNs();

private String agentRoleName = Configuration.agentRoleName();
private ThreadFactory threadFactory;
Expand Down Expand Up @@ -1425,6 +1450,7 @@ public static final class Context implements Cloneable
private LogPublisher logPublisher;
private EgressPublisher egressPublisher;
private DutyCycleTracker dutyCycleTracker;
private SnapshotDurationTracker totalSnapshotDurationTracker;
private AppVersionValidator appVersionValidator;
private boolean isLogMdc;
private boolean useAgentInvoker = false;
Expand Down Expand Up @@ -1726,6 +1752,26 @@ public void conclude()
cycleThresholdNs);
}

if (null == totalSnapshotDurationTracker)
{
totalSnapshotDurationTracker = new SnapshotDurationTracker(
ClusterCounters.allocate(
aeron,
buffer,
"Total max snapshot duration in ns",
AeronCounters.CLUSTER_TOTAL_MAX_SNAPSHOT_DURATION_TYPE_ID,
clusterId),
ClusterCounters.allocate(
aeron,
buffer,
"Total max snapshot duration exceeded count: threshold=" +
totalSnapshotDurationThresholdNs,
AeronCounters.CLUSTER_TOTAL_SNAPSHOT_DURATION_THRESHOLD_EXCEEDED_TYPE_ID,
clusterId),
totalSnapshotDurationThresholdNs);
}


if (null == threadFactory)
{
threadFactory = Thread::new;
Expand Down Expand Up @@ -3029,6 +3075,52 @@ public DutyCycleTracker dutyCycleTracker()
return dutyCycleTracker;
}

/**
 * Configure the total snapshot duration threshold. A snapshot that takes longer than this value causes the
 * threshold exceeded counter to be incremented.
 *
 * @param thresholdNs value in nanoseconds
 * @return this for fluent API.
 * @see ConsensusModule.Configuration#TOTAL_SNAPSHOT_DURATION_THRESHOLD_PROP_NAME
 * @see ConsensusModule.Configuration#TOTAL_SNAPSHOT_DURATION_THRESHOLD_DEFAULT_NS
 */
public Context totalSnapshotDurationThresholdNs(final long thresholdNs)
{
    totalSnapshotDurationThresholdNs = thresholdNs;

    return this;
}

/**
 * The configured total snapshot duration threshold. Snapshots taking longer than this cause the threshold
 * exceeded counter to be incremented.
 *
 * @return threshold value in nanoseconds.
 */
public long totalSnapshotDurationThresholdNs()
{
    return this.totalSnapshotDurationThresholdNs;
}

/**
 * Supply the snapshot duration tracker to be used for monitoring the total snapshot duration.
 *
 * @param snapshotDurationTracker snapshot duration tracker.
 * @return this for fluent API.
 */
public Context totalSnapshotDurationTracker(final SnapshotDurationTracker snapshotDurationTracker)
{
    totalSnapshotDurationTracker = snapshotDurationTracker;

    return this;
}

/**
 * The snapshot duration tracker used for monitoring the total snapshot duration.
 *
 * @return snapshot duration tracker
 */
public SnapshotDurationTracker totalSnapshotDurationTracker()
{
    return this.totalSnapshotDurationTracker;
}

/**
* Get the {@link Agent#roleName()} to be used for the consensus module agent. If {@code null} then one will
* be generated.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,8 @@
import static io.aeron.archive.client.ReplayMerge.LIVE_ADD_MAX_WINDOW;
import static io.aeron.archive.codecs.SourceLocation.LOCAL;
import static io.aeron.cluster.ClusterSession.State.*;
import static io.aeron.cluster.ConsensusModule.CLUSTER_ACTION_FLAGS_STANDBY_SNAPSHOT;
import static io.aeron.cluster.ConsensusModule.CLUSTER_ACTION_FLAGS_DEFAULT;
import static io.aeron.cluster.ConsensusModule.CLUSTER_ACTION_FLAGS_STANDBY_SNAPSHOT;
import static io.aeron.cluster.ConsensusModule.Configuration.*;
import static io.aeron.cluster.client.AeronCluster.Configuration.PROTOCOL_SEMANTIC_VERSION;
import static io.aeron.cluster.service.ClusteredServiceContainer.Configuration.MARK_FILE_UPDATE_INTERVAL_NS;
Expand Down Expand Up @@ -147,6 +147,7 @@ final class ConsensusModuleAgent implements Agent, TimerService.TimerHandler, Co
private final IdleStrategy idleStrategy;
private final RecordingLog recordingLog;
private final DutyCycleTracker dutyCycleTracker;
private final SnapshotDurationTracker totalSnapshotDurationTracker;
private RecordingLog.RecoveryPlan recoveryPlan;
private AeronArchive archive;
private RecordingSignalPoller recordingSignalPoller;
Expand Down Expand Up @@ -188,6 +189,7 @@ final class ConsensusModuleAgent implements Agent, TimerService.TimerHandler, Co
Arrays.fill(serviceClientIds, NULL_VALUE);
this.serviceAckQueues = ServiceAck.newArrayOfQueues(ctx.serviceCount());
this.dutyCycleTracker = ctx.dutyCycleTracker();
this.totalSnapshotDurationTracker = ctx.totalSnapshotDurationTracker();

aeronClientInvoker = aeron.conductorAgentInvoker();
aeronClientInvoker.invoke();
Expand Down Expand Up @@ -1261,6 +1263,7 @@ void onServiceAck(
final ServiceAck[] serviceAcks = pollServiceAcks(logPosition, serviceId);
++serviceAckId;
takeSnapshot(timestamp, logPosition, serviceAcks);
totalSnapshotDurationTracker.onSnapshotEnd(clusterClock.timeNanos());

if (null != clusterTermination)
{
Expand Down Expand Up @@ -2247,6 +2250,7 @@ private int checkClusterControlToggle(final long nowNs)
if (ConsensusModule.State.ACTIVE == state && appendAction(ClusterAction.SNAPSHOT))
{
state(ConsensusModule.State.SNAPSHOT);
totalSnapshotDurationTracker.onSnapshotBegin(nowNs);
}
break;

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
/*
* Copyright 2014-2023 Real Logic Limited.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.aeron.cluster.service;

import org.agrona.concurrent.status.AtomicCounter;


/**
 * Snapshot duration tracker that tracks maximum snapshot duration and also keeps count of how many times a predefined
 * duration threshold is breached.
 * <p>
 * A duration is only recorded for a matched pair of {@link #onSnapshotBegin(long)} and {@link #onSnapshotEnd(long)}
 * calls. A call to {@link #onSnapshotEnd(long)} with no preceding, unconsumed {@link #onSnapshotBegin(long)} is
 * ignored.
 */
public class SnapshotDurationTracker
{
    private final AtomicCounter maxSnapshotDuration;
    private final AtomicCounter snapshotDurationThresholdExceededCount;
    private final long durationThresholdNs;
    // Long.MIN_VALUE is the sentinel for "no snapshot currently in progress".
    private long snapshotStartTimeNs = Long.MIN_VALUE;

    /**
     * Create a tracker to track max snapshot duration and breaches of a threshold.
     *
     * @param maxSnapshotDuration                    counter for tracking the maximum observed snapshot duration.
     * @param snapshotDurationThresholdExceededCount counter for tracking how many snapshots exceeded the threshold.
     * @param durationThresholdNs                    threshold in nanoseconds to use for tracking breaches.
     */
    public SnapshotDurationTracker(
        final AtomicCounter maxSnapshotDuration,
        final AtomicCounter snapshotDurationThresholdExceededCount,
        final long durationThresholdNs)
    {
        this.maxSnapshotDuration = maxSnapshotDuration;
        this.snapshotDurationThresholdExceededCount = snapshotDurationThresholdExceededCount;
        this.durationThresholdNs = durationThresholdNs;
    }

    /**
     * Get max snapshot duration counter.
     *
     * @return max snapshot duration counter.
     */
    public AtomicCounter maxSnapshotDuration()
    {
        return maxSnapshotDuration;
    }

    /**
     * Get counter tracking number of times the duration threshold supplied at construction was exceeded.
     *
     * @return duration threshold exceeded counter.
     */
    public AtomicCounter snapshotDurationThresholdExceededCount()
    {
        return snapshotDurationThresholdExceededCount;
    }

    /**
     * Called when snapshotting has started.
     *
     * @param timeNanos snapshot start time in nanoseconds.
     */
    public void onSnapshotBegin(final long timeNanos)
    {
        snapshotStartTimeNs = timeNanos;
    }

    /**
     * Called when snapshot has been taken. If a start time was recorded by {@link #onSnapshotBegin(long)} then the
     * elapsed duration is proposed as a new maximum and, when it is strictly greater than the threshold, the
     * threshold exceeded counter is incremented. The start time is then cleared so it cannot be reused.
     *
     * @param timeNanos snapshot end time in nanoseconds.
     */
    public void onSnapshotEnd(final long timeNanos)
    {
        if (snapshotStartTimeNs != Long.MIN_VALUE)
        {
            final long snapshotDurationNs = timeNanos - snapshotStartTimeNs;

            // Consume the start time: a later end notification that has no matching begin must not be
            // measured against this, by then stale, start time.
            snapshotStartTimeNs = Long.MIN_VALUE;

            if (snapshotDurationNs > durationThresholdNs)
            {
                snapshotDurationThresholdExceededCount.increment();
            }

            maxSnapshotDuration.proposeMax(snapshotDurationNs);
        }
    }
}
Loading

0 comments on commit af560d6

Please sign in to comment.