chore: Reduce flakiness of state sync system tests (#3672)
This commit fixes a race condition that can lead to flakiness in the
state sync system tests.

The tests rely on a metric counting the number of successful state
syncs. However, there appears to be a race between a state sync
completing and the updated metric becoming observable. At the same
time, only a very short interval passes between the end of the state
sync and the subnet no longer being stalled, so the test could read a
stale metric value and fail spuriously.

Now, the test waits for the metric to increase instead of asserting on
the first read.
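
As an illustration of the approach, here is a minimal, self-contained sketch of the polling pattern the commit introduces, assuming a tokio runtime. The `fetch_counter` helper and `wait_for_counter_increase` function are hypothetical stand-ins for the repo's `fetch_metrics` and the new `assert_state_sync_has_happened`; the retry constants mirror the diff below.

```rust
use std::sync::atomic::{AtomicU64, Ordering};
use std::time::Duration;

// Hypothetical stand-in for the repo's `fetch_metrics`: the real test scrapes
// the node's metrics endpoint; here we just count calls so the example
// terminates on its own.
static CALLS: AtomicU64 = AtomicU64::new(0);

async fn fetch_counter(_metric: &str) -> u64 {
    CALLS.fetch_add(1, Ordering::SeqCst)
}

/// Poll a counter metric until it exceeds `base_count` instead of asserting
/// on the first read; this mirrors the structure of the new
/// `assert_state_sync_has_happened` helper in the diff below.
async fn wait_for_counter_increase(metric: &str, base_count: u64) {
    const NUM_RETRIES: u32 = 200;
    const BACKOFF_TIME_MILLIS: u64 = 500;

    for attempt in 0..NUM_RETRIES {
        if fetch_counter(metric).await > base_count {
            // The metric moved past the baseline: the event was observed.
            return;
        }
        println!("no increase yet, attempt {attempt}");
        tokio::time::sleep(Duration::from_millis(BACKOFF_TIME_MILLIS)).await;
    }
    panic!("counter {metric} did not increase after {NUM_RETRIES} attempts");
}

#[tokio::main]
async fn main() {
    wait_for_counter_increase("successful_state_sync_count", 2).await;
    println!("observed the counter increase");
}
```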
schneiderstefan authored Jan 30, 2025
1 parent d128e24 commit cf78072
84 changes: 43 additions & 41 deletions rs/tests/message_routing/rejoin_test_lib/rejoin_test_lib.rs
@@ -87,29 +87,7 @@ pub async fn rejoin_test(
     let message = b"This beautiful prose should be persisted for future generations";
     store_and_read_stable(&logger, message, &universal_canister).await;
 
-    info!(
-        logger,
-        "Checking for the state sync count metrics indicating that a successful state sync has happened"
-    );
-    let res = fetch_metrics::<u64>(
-        &logger,
-        rejoin_node.clone(),
-        vec![SUCCESSFUL_STATE_SYNC_DURATION_SECONDS_COUNT],
-    )
-    .await;
-    assert!(res[SUCCESSFUL_STATE_SYNC_DURATION_SECONDS_COUNT][0] > base_count);
-
-    let res = fetch_metrics::<f64>(
-        &logger,
-        rejoin_node.clone(),
-        vec![SUCCESSFUL_STATE_SYNC_DURATION_SECONDS_SUM],
-    )
-    .await;
-    info!(
-        logger,
-        "State sync finishes successfully in {} seconds",
-        res[SUCCESSFUL_STATE_SYNC_DURATION_SECONDS_SUM][0],
-    );
+    assert_state_sync_has_happened(&logger, rejoin_node, base_count).await;
 }
 
 pub async fn rejoin_test_large_state(
@@ -223,29 +201,53 @@ pub async fn rejoin_test_large_state(
     let message = b"This beautiful prose should be persisted for future generations";
     store_and_read_stable(&logger, message, &universal_canister).await;
 
+    assert_state_sync_has_happened(&logger, rejoin_node, base_count).await;
+}
+
+async fn assert_state_sync_has_happened(
+    logger: &slog::Logger,
+    rejoin_node: IcNodeSnapshot,
+    base_count: u64,
+) {
+    const NUM_RETRIES: u32 = 200;
+    const BACKOFF_TIME_MILLIS: u64 = 500;
+
     info!(
         logger,
         "Checking for the state sync count metrics indicating that a successful state sync has happened"
     );
-    let res = fetch_metrics::<u64>(
-        &logger,
-        rejoin_node.clone(),
-        vec![SUCCESSFUL_STATE_SYNC_DURATION_SECONDS_COUNT],
-    )
-    .await;
-    assert!(res[SUCCESSFUL_STATE_SYNC_DURATION_SECONDS_COUNT][0] > base_count);
 
-    let res = fetch_metrics::<f64>(
-        &logger,
-        rejoin_node.clone(),
-        vec![SUCCESSFUL_STATE_SYNC_DURATION_SECONDS_SUM],
-    )
-    .await;
-    info!(
-        logger,
-        "State sync finishes successfully in {} seconds",
-        res[SUCCESSFUL_STATE_SYNC_DURATION_SECONDS_SUM][0],
-    );
+    // We retry a few times because we observed a potential race condition
+    // where the metrics endpoint still reports a slightly older value even
+    // though the state sync has already happened. This is a workaround to
+    // make the test more robust.
+    for i in 0..NUM_RETRIES {
+        let res = fetch_metrics::<u64>(
+            logger,
+            rejoin_node.clone(),
+            vec![SUCCESSFUL_STATE_SYNC_DURATION_SECONDS_COUNT],
+        )
+        .await;
+        if res[SUCCESSFUL_STATE_SYNC_DURATION_SECONDS_COUNT][0] > base_count {
+            let res = fetch_metrics::<f64>(
+                logger,
+                rejoin_node.clone(),
+                vec![SUCCESSFUL_STATE_SYNC_DURATION_SECONDS_SUM],
+            )
+            .await;
+            info!(
+                logger,
+                "State sync finishes successfully in {} seconds",
+                res[SUCCESSFUL_STATE_SYNC_DURATION_SECONDS_SUM][0],
+            );
+
+            return;
+        }
+
+        info!(logger, "No state sync detected yet, attempt {i}.");
+        tokio::time::sleep(Duration::from_millis(BACKOFF_TIME_MILLIS)).await;
+    }
+    panic!("Couldn't verify that a state sync has happened after {NUM_RETRIES} attempts.");
 }
 
 pub async fn fetch_metrics<T>(
