chore: Reduce flakiness of state sync system tests (#3672)
This commit fixes a race condition that can lead to flakiness in the
state sync system tests.

The tests rely on a metric counting the number of successful state
syncs. However, there appears to be a race between a state sync
completing and the updated metric becoming observable. At the same
time, only a very short interval passes between the end of the state
sync and the subnet no longer being stalled, so the test could read a
stale metric value and fail spuriously.

Now, the test waits for the metric to increase instead of asserting on
the first read.
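
As an illustration of the approach, here is a minimal, self-contained sketch of the polling pattern the commit introduces, assuming a tokio runtime. The `fetch_counter` helper and `wait_for_counter_increase` function are hypothetical stand-ins for the repo's `fetch_metrics` and the new `assert_state_sync_has_happened`; the retry constants mirror the diff below.

```rust
use std::sync::atomic::{AtomicU64, Ordering};
use std::time::Duration;

// Hypothetical stand-in for the repo's `fetch_metrics`: the real test scrapes
// the node's metrics endpoint; here we just count calls so the example
// terminates on its own.
static CALLS: AtomicU64 = AtomicU64::new(0);

async fn fetch_counter(_metric: &str) -> u64 {
    CALLS.fetch_add(1, Ordering::SeqCst)
}

/// Poll a counter metric until it exceeds `base_count` instead of asserting
/// on the first read; this mirrors the structure of the new
/// `assert_state_sync_has_happened` helper in the diff below.
async fn wait_for_counter_increase(metric: &str, base_count: u64) {
    const NUM_RETRIES: u32 = 200;
    const BACKOFF_TIME_MILLIS: u64 = 500;

    for attempt in 0..NUM_RETRIES {
        if fetch_counter(metric).await > base_count {
            // The metric moved past the baseline: the event was observed.
            return;
        }
        println!("no increase yet, attempt {attempt}");
        tokio::time::sleep(Duration::from_millis(BACKOFF_TIME_MILLIS)).await;
    }
    panic!("counter {metric} did not increase after {NUM_RETRIES} attempts");
}

#[tokio::main]
async fn main() {
    wait_for_counter_increase("successful_state_sync_count", 2).await;
    println!("observed the counter increase");
}
```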
schneiderstefan authored Jan 30, 2025
1 parent d128e24 commit cf78072
84 changes: 43 additions & 41 deletions rs/tests/message_routing/rejoin_test_lib/rejoin_test_lib.rs
@@ -87,29 +87,7 @@ pub async fn rejoin_test(
     let message = b"This beautiful prose should be persisted for future generations";
     store_and_read_stable(&logger, message, &universal_canister).await;
 
-    info!(
-        logger,
-        "Checking for the state sync count metrics indicating that a successful state sync has happened"
-    );
-    let res = fetch_metrics::<u64>(
-        &logger,
-        rejoin_node.clone(),
-        vec![SUCCESSFUL_STATE_SYNC_DURATION_SECONDS_COUNT],
-    )
-    .await;
-    assert!(res[SUCCESSFUL_STATE_SYNC_DURATION_SECONDS_COUNT][0] > base_count);
-
-    let res = fetch_metrics::<f64>(
-        &logger,
-        rejoin_node.clone(),
-        vec![SUCCESSFUL_STATE_SYNC_DURATION_SECONDS_SUM],
-    )
-    .await;
-    info!(
-        logger,
-        "State sync finishes successfully in {} seconds",
-        res[SUCCESSFUL_STATE_SYNC_DURATION_SECONDS_SUM][0],
-    );
+    assert_state_sync_has_happened(&logger, rejoin_node, base_count).await;
 }
 
 pub async fn rejoin_test_large_state(
@@ -223,29 +201,53 @@ pub async fn rejoin_test_large_state(
     let message = b"This beautiful prose should be persisted for future generations";
     store_and_read_stable(&logger, message, &universal_canister).await;
 
+    assert_state_sync_has_happened(&logger, rejoin_node, base_count).await;
+}
+
+async fn assert_state_sync_has_happened(
+    logger: &slog::Logger,
+    rejoin_node: IcNodeSnapshot,
+    base_count: u64,
+) {
+    const NUM_RETRIES: u32 = 200;
+    const BACKOFF_TIME_MILLIS: u64 = 500;
+
     info!(
         logger,
         "Checking for the state sync count metrics indicating that a successful state sync has happened"
     );
-    let res = fetch_metrics::<u64>(
-        &logger,
-        rejoin_node.clone(),
-        vec![SUCCESSFUL_STATE_SYNC_DURATION_SECONDS_COUNT],
-    )
-    .await;
-    assert!(res[SUCCESSFUL_STATE_SYNC_DURATION_SECONDS_COUNT][0] > base_count);
 
-    let res = fetch_metrics::<f64>(
-        &logger,
-        rejoin_node.clone(),
-        vec![SUCCESSFUL_STATE_SYNC_DURATION_SECONDS_SUM],
-    )
-    .await;
-    info!(
-        logger,
-        "State sync finishes successfully in {} seconds",
-        res[SUCCESSFUL_STATE_SYNC_DURATION_SECONDS_SUM][0],
-    );
+    // We retry a few times because we observed a potential race condition
+    // where the metrics endpoint still reports a slightly older value even
+    // though the state sync has already happened. This is a workaround to
+    // make the test more robust.
+    for i in 0..NUM_RETRIES {
+        let res = fetch_metrics::<u64>(
+            logger,
+            rejoin_node.clone(),
+            vec![SUCCESSFUL_STATE_SYNC_DURATION_SECONDS_COUNT],
+        )
+        .await;
+        if res[SUCCESSFUL_STATE_SYNC_DURATION_SECONDS_COUNT][0] > base_count {
+            let res = fetch_metrics::<f64>(
+                logger,
+                rejoin_node.clone(),
+                vec![SUCCESSFUL_STATE_SYNC_DURATION_SECONDS_SUM],
+            )
+            .await;
+            info!(
+                logger,
+                "State sync finishes successfully in {} seconds",
+                res[SUCCESSFUL_STATE_SYNC_DURATION_SECONDS_SUM][0],
+            );
+
+            return;
+        }
+
+        info!(logger, "No state sync detected yet, attempt {i}.");
+        tokio::time::sleep(Duration::from_millis(BACKOFF_TIME_MILLIS)).await;
+    }
+    panic!("Couldn't verify that a state sync has happened after {NUM_RETRIES} attempts.");
 }
 
 pub async fn fetch_metrics<T>(
