-
Notifications
You must be signed in to change notification settings - Fork 11.3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[QD Reconfig] 3. add reconfig observer (#7024)
1. This PR adds the `ReconfigObserver` trait. A reconfig observer (RO) detects reconfigs and updates the quorum driver with the new committee. 2. `OnsiteReconfigObserver` is the RO that lives in `Fullnode`/`TransactionOrchestrator` and subscribes to the checkpoint executor reconfig channel. Note that the integration of `OnsiteReconfigObserver` with `TransactionOrchestrator` happens in the follow-up PR. Co-authored-by: Mark Logan <[email protected]>
- Loading branch information
1 parent
d87a8c3
commit fa4b7fc
Showing
11 changed files
with
296 additions
and
20 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
// Copyright (c) Mysten Labs, Inc. | ||
// SPDX-License-Identifier: Apache-2.0 | ||
|
||
use async_trait::async_trait; | ||
use std::sync::Arc; | ||
use sui_types::committee::Committee; | ||
use tokio::sync::broadcast::error::RecvError; | ||
use tracing::{info, warn}; | ||
|
||
use crate::{ | ||
authority::AuthorityStore, | ||
authority_aggregator::{AuthAggMetrics, AuthorityAggregator}, | ||
authority_client::NetworkAuthorityClient, | ||
epoch::committee_store::CommitteeStore, | ||
safe_client::SafeClientMetricsBase, | ||
}; | ||
|
||
use super::QuorumDriver; | ||
|
||
/// Observer that is handed a shared `QuorumDriver<A>` and is driven through | ||
/// its single async entry point, `run`. | ||
/// | ||
/// Per the accompanying commit description, implementations detect | ||
/// reconfigurations and update the quorum driver with the new committee. | ||
#[async_trait] | ||
pub trait ReconfigObserver<A> { | ||
/// Runs the observer against `quorum_driver`. Takes `&mut self`, so an | ||
/// implementation may mutate its own state while running. | ||
async fn run(&mut self, quorum_driver: Arc<QuorumDriver<A>>); | ||
} | ||
|
||
/// A [`ReconfigObserver`] for `NetworkAuthorityClient` quorum drivers that | ||
/// reacts to `Committee` values received on a tokio broadcast channel. | ||
pub struct OnsiteReconfigObserver { | ||
// Broadcast receiver polled in `run` for newly announced committees. | ||
reconfig_rx: tokio::sync::broadcast::Receiver<Committee>, | ||
// Passed by reference to `AuthorityAggregator::new_from_system_state`. | ||
authority_store: Arc<AuthorityStore>, | ||
// Passed by reference to `AuthorityAggregator::new_from_system_state`. | ||
committee_store: Arc<CommitteeStore>, | ||
// Cloned into every `AuthorityAggregator` this observer builds. | ||
safe_client_metrics_base: SafeClientMetricsBase, | ||
// Cloned into every `AuthorityAggregator` this observer builds. | ||
auth_agg_metrics: AuthAggMetrics, | ||
} | ||
|
||
impl OnsiteReconfigObserver { | ||
    /// Creates an observer from the reconfig broadcast receiver plus the | ||
    /// stores and metrics handles needed to build an `AuthorityAggregator`. | ||
    pub fn new( | ||
        reconfig_rx: tokio::sync::broadcast::Receiver<Committee>, | ||
        authority_store: Arc<AuthorityStore>, | ||
        committee_store: Arc<CommitteeStore>, | ||
        safe_client_metrics_base: SafeClientMetricsBase, | ||
        auth_agg_metrics: AuthAggMetrics, | ||
    ) -> Self { | ||
        Self { | ||
            reconfig_rx, | ||
            authority_store, | ||
            committee_store, | ||
            safe_client_metrics_base, | ||
            auth_agg_metrics, | ||
        } | ||
    } | ||
|
||
    /// Builds a fresh `AuthorityAggregator` via | ||
    /// `AuthorityAggregator::new_from_system_state`, using this observer's | ||
    /// stores and clones of its metrics handles. | ||
    /// | ||
    /// # Panics | ||
    /// | ||
    /// Panics if the aggregator cannot be constructed. | ||
    async fn create_authority_aggregator_from_system_state( | ||
        &self, | ||
    ) -> AuthorityAggregator<NetworkAuthorityClient> { | ||
        let build_result = AuthorityAggregator::new_from_system_state( | ||
            &self.authority_store, | ||
            &self.committee_store, | ||
            self.safe_client_metrics_base.clone(), | ||
            self.auth_agg_metrics.clone(), | ||
        ); | ||
        // TODO: we should tolerate when <= f validators give invalid addresses | ||
        // GH issue: https://github.com/MystenLabs/sui/issues/7019 | ||
        match build_result { | ||
            Ok(aggregator) => aggregator, | ||
            Err(e) => panic!( | ||
                "Failed to create AuthorityAggregator from System State: {:?}", | ||
                e | ||
            ), | ||
        } | ||
    } | ||
} | ||
|
||
#[async_trait] | ||
impl ReconfigObserver<NetworkAuthorityClient> for OnsiteReconfigObserver { | ||
    /// Keeps `quorum_driver` on the newest committee: first a one-off | ||
    /// catch-up, then an endless loop over the reconfig broadcast channel. | ||
    /// | ||
    /// # Panics | ||
    /// | ||
    /// Panics if the broadcast channel is closed, or if an | ||
    /// `AuthorityAggregator` cannot be built. | ||
    async fn run(&mut self, quorum_driver: Arc<QuorumDriver<NetworkAuthorityClient>>) { | ||
        // A tiny optimization: when a very stale node just starts, the | ||
        // channel may fill up with committees quickly. Skip straight to | ||
        // the last known committee by looking at SuiSystemState. | ||
        let startup_agg = self.create_authority_aggregator_from_system_state().await; | ||
        if startup_agg.committee.epoch > quorum_driver.current_epoch() { | ||
            quorum_driver | ||
                .update_validators(Arc::new(startup_agg)) | ||
                .await; | ||
        } | ||
|
||
        loop { | ||
            let committee = match self.reconfig_rx.recv().await { | ||
                Ok(received) => received, | ||
                // It's ok to miss messages due to overflow here | ||
                Err(RecvError::Lagged(_)) => continue, | ||
                Err(RecvError::Closed) => panic!("Do not expect the channel to be closed"), | ||
            }; | ||
            info!("Got reconfig message: {}", committee); | ||
|
||
            // Only a strictly newer epoch triggers an update. | ||
            if committee.epoch <= quorum_driver.current_epoch() { | ||
                // This should only happen when the node just starts | ||
                warn!("Epoch number decreased - ignoring committee: {}", committee); | ||
                continue; | ||
            } | ||
|
||
            // The aggregator is rebuilt from system state, not from the | ||
            // received committee value. | ||
            let refreshed_agg = self.create_authority_aggregator_from_system_state().await; | ||
            quorum_driver | ||
                .update_validators(Arc::new(refreshed_agg)) | ||
                .await; | ||
        } | ||
    } | ||
} | ||
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
// Copyright (c) Mysten Labs, Inc. | ||
// SPDX-License-Identifier: Apache-2.0 | ||
|
||
#![allow(clippy::async_yields_async)] | ||
use prometheus::Registry; | ||
use sui_core::authority_aggregator::AuthAggMetrics; | ||
use sui_core::quorum_driver::reconfig_observer::OnsiteReconfigObserver; | ||
use sui_core::quorum_driver::reconfig_observer::ReconfigObserver; | ||
use sui_core::safe_client::SafeClientMetricsBase; | ||
use test_utils::authority::{spawn_fullnodes, spawn_test_authorities, test_authority_configs}; | ||
use test_utils::network::wait_for_nodes_transition_to_epoch; | ||
use tracing::info; | ||
|
||
use sui_macros::sim_test; | ||
|
||
/// Spawns test authorities and one fullnode, attaches an | ||
/// `OnsiteReconfigObserver` to the fullnode's quorum driver, closes epoch 0 | ||
/// on every authority, and asserts that the quorum driver and the node's | ||
/// authority aggregator both report epoch 1 afterwards. | ||
#[sim_test] | ||
async fn test_onsite_reconfig_observer_basic() { | ||
    telemetry_subscribers::init_for_testing(); | ||
    let config = test_authority_configs(); | ||
    let authorities = spawn_test_authorities([].into_iter(), &config).await; | ||
    let fullnodes = spawn_fullnodes(&config, 1).await; | ||
    let fullnode = &fullnodes[0]; | ||
|
||
    // Spawn the observer as a background task on the fullnode. | ||
    let _observer_handle = fullnode | ||
        .with_async(|node| async { | ||
            let qd = node | ||
                .transaction_orchestrator() | ||
                .unwrap() | ||
                .clone_quorum_driver(); | ||
            assert_eq!(qd.current_epoch(), 0); | ||
            let rx = node.subscribe_to_epoch_change().await; | ||
            let registry = Registry::new(); | ||
            let mut observer = OnsiteReconfigObserver::new( | ||
                rx, | ||
                node.clone_authority_store(), | ||
                node.clone_committee_store(), | ||
                SafeClientMetricsBase::new(&registry), | ||
                AuthAggMetrics::new(&registry), | ||
            ); | ||
            let qd_clone = qd.clone_quorum_driver(); | ||
            tokio::task::spawn(async move { observer.run(qd_clone).await }) | ||
        }) | ||
        .await; | ||
    info!("Shutting down epoch 0"); | ||
    for handle in &authorities { | ||
        handle | ||
            .with_async(|node| async { node.close_epoch().await.unwrap() }) | ||
            .await; | ||
    } | ||
    // Wait for all nodes to reach the next epoch. | ||
    info!("Waiting for nodes to advance to epoch 1"); | ||
    wait_for_nodes_transition_to_epoch(authorities.iter().chain(fullnodes.iter()), 1).await; | ||
|
||
    // Give it some time for the update to happen | ||
    tokio::time::sleep(tokio::time::Duration::from_secs(3)).await; | ||
    fullnode.with(|node| { | ||
        let qd = node | ||
            .transaction_orchestrator() | ||
            .unwrap() | ||
            .clone_quorum_driver(); | ||
        assert_eq!(qd.current_epoch(), 1); | ||
        assert_eq!( | ||
            node.clone_authority_aggregator().unwrap().committee.epoch, | ||
            1 | ||
        ); | ||
    }); | ||
} | ||
Oops, something went wrong.