From 2828131f605b6008351fc57d1eaf2b40635a87f8 Mon Sep 17 00:00:00 2001 From: mraszyk <31483726+mraszyk@users.noreply.github.com> Date: Wed, 15 Jan 2025 14:20:13 +0100 Subject: [PATCH] fix(PocketIC): safely drop StateMachine (#3450) This PR safely drops every StateMachine in PocketIC to prevent its state directory from being deleted before every Arc to its state manager is dropped. --- rs/pocket_ic_server/src/pocket_ic.rs | 34 ++++++++++++++++++++++++++-- rs/state_machine_tests/src/lib.rs | 14 ++++++++---- 2 files changed, 42 insertions(+), 6 deletions(-) diff --git a/rs/pocket_ic_server/src/pocket_ic.rs b/rs/pocket_ic_server/src/pocket_ic.rs index 220099396b0..a74a6011c70 100644 --- a/rs/pocket_ic_server/src/pocket_ic.rs +++ b/rs/pocket_ic_server/src/pocket_ic.rs @@ -384,6 +384,9 @@ impl SubnetsImpl { pub(crate) fn get_all(&self) -> Vec> { self.subnets.read().unwrap().values().cloned().collect() } + fn clear(&self) { + self.subnets.write().unwrap().clear(); + } } impl Subnets for SubnetsImpl { @@ -421,8 +424,8 @@ pub struct PocketIc { impl Drop for PocketIc { fn drop(&mut self) { - let subnets = self.subnets.get_all(); if let Some(ref state_dir) = self.state_dir { + let subnets = self.subnets.get_all(); for subnet in &subnets { subnet.state_machine.checkpointed_tick(); } @@ -452,9 +455,36 @@ impl Drop for PocketIc { let topology_json = serde_json::to_string(&raw_topology).unwrap(); topology_file.write_all(topology_json.as_bytes()).unwrap(); } - for subnet in subnets { + for subnet in self.subnets.get_all() { subnet.state_machine.drop_payload_builder(); } + let state_machines: Vec<_> = self + .subnets + .get_all() + .into_iter() + .map(|subnet| subnet.state_machine.clone()) + .collect(); + self.subnets.clear(); + // for every StateMachine, wait until nobody else has an Arc to that StateMachine + // and then drop that StateMachine + let start = std::time::Instant::now(); + for state_machine in state_machines { + let mut state_machine = Some(state_machine); + while state_machine.is_some() { + match Arc::try_unwrap(state_machine.take().unwrap()) { + Ok(sm) => { + sm.drop(); + break; + } + Err(sm) => { + state_machine = Some(sm); + } + } + if start.elapsed() > std::time::Duration::from_secs(5 * 60) { + panic!("Timed out while dropping PocketIC."); + } + } + } } } diff --git a/rs/state_machine_tests/src/lib.rs b/rs/state_machine_tests/src/lib.rs index 5031290b05a..c096015a556 100644 --- a/rs/state_machine_tests/src/lib.rs +++ b/rs/state_machine_tests/src/lib.rs @@ -1826,20 +1826,26 @@ impl StateMachine { fn into_components(self) -> (Box, u64, Time, u64) { let state_manager = Arc::downgrade(&self.state_manager); let result = self.into_components_inner(); - let mut i = 0i32; // StateManager is owned by an Arc, that is cloned into multiple components and different // threads. If we return before all the asynchronous components release the Arc, we may // end up with to StateManagers writing to the same directory, resulting in a crash. + let start = std::time::Instant::now(); while state_manager.upgrade().is_some() { std::thread::sleep(std::time::Duration::from_millis(50)); - i += 1; - if i >= 100 { - panic!("Failed to wait for StateManager drop"); + if start.elapsed() > std::time::Duration::from_secs(5 * 60) { + panic!("Timed out while dropping StateMachine."); } } result } + /// Safely drops this `StateMachine`. We cannot achieve this functionality by implementing `Drop` + /// since we have to wait until there are no more `Arc`s for the state manager and + /// this is infeasible in a `Drop` implementation. + pub fn drop(self) { + let _ = self.into_components(); + } + /// Emulates a node restart, including checkpoint recovery. pub fn restart_node(self) -> Self { // We must drop self before setup_form_dir so that we don't have two StateManagers pointing