This repository has been archived by the owner on Nov 15, 2023. It is now read-only.

Collator connection management #5388

Closed
6 changes: 4 additions & 2 deletions node/collation-generation/src/lib.rs
@@ -256,10 +256,12 @@ async fn handle_new_activations<Context: SubsystemContext>(
},
};

-if config.collator.is_collating(relay_parent, &validation_data).await {
+if let Some(forecast) =
+config.collator.is_collating(relay_parent, &validation_data).await
+{
let _ = ctx
.send_message(AllMessages::CollatorProtocol(
-CollatorProtocolMessage::PreConnectAsCollator(relay_parent),
+CollatorProtocolMessage::ForecastCollation(relay_parent, forecast),
))
.await;
gum::debug!(
10 changes: 7 additions & 3 deletions node/collation-generation/src/tests.rs
@@ -19,7 +19,7 @@ mod handle_new_activations {
use ::test_helpers::{dummy_hash, dummy_head_data, dummy_validator};
use futures::lock::Mutex;
use polkadot_node_primitives::{
-BlockData, Collation, CollationResult, Collator, MaybeCompressedPoV, PoV,
+BlockData, Collation, CollationForecast, CollationResult, Collator, MaybeCompressedPoV, PoV,
};
use polkadot_node_subsystem::{
errors::RuntimeApiError,
@@ -69,8 +69,12 @@ mod handle_new_activations {
Some(CollationResult { collation: test_collation(), result_sender: None })
}

-async fn is_collating(&self, _: Hash, _: &PersistedValidationData) -> bool {
-false
+async fn is_collating(
+&self,
+_: Hash,
+_: &PersistedValidationData,
+) -> Option<CollationForecast> {
+None
}
}

197 changes: 197 additions & 0 deletions node/network/collator-protocol/src/collator_side/advertising.rs
@@ -0,0 +1,197 @@
// Copyright 2022 Parity Technologies (UK) Ltd.
// This file is part of Polkadot.

// Polkadot is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.

// Polkadot is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.

// You should have received a copy of the GNU General Public License
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.

use std::collections::{HashMap, HashSet};

use polkadot_node_network_protocol::v1::CollatorProtocolMessage;
use polkadot_node_network_protocol::{OurView, PeerId, View};
use polkadot_primitives::v2::{
	AuthorityDiscoveryId, BlockNumber, CollatorPair, CoreIndex, GroupIndex, GroupRotationInfo,
	Hash, Id as ParaId, SessionIndex,
};
use polkadot_subsystem::{ActiveLeavesUpdate, SubsystemContext, SubsystemSender};

use crate::error::FatalResult;

use super::Metrics;

/// State for taking care of validator connections and advertisements.
struct Advertiser {
/// Our network peer id.
local_peer_id: PeerId,

/// Our keys.
collator_pair: CollatorPair,

/// The para this collator is collating on.
/// Starts as `None` and is updated with every `CollateOn` message.
collating_on: Option<ParaId>,

/// Track all active peers and their views
/// to determine what is relevant to them.
peer_views: HashMap<PeerId, View>,

/// Our own view.
view: OurView,

/// Information about connections we want to have established.
///
/// For connection management we basically need two blocks of information:
///
/// 1. Time: When and for how long do we want the connection.
/// 2. To what validators/which backing group to connect to.
///
/// ## Connection time management
///
/// For simplicity we chose to base connection lifetime management on relay chain blocks,
/// which act as a natural pace maker. So the lifespan of a connection can be adjusted in
/// multiples of roughly 6 seconds (assuming normal operation).
///
/// To ensure uninterrupted connectivity for several blocks, all you have to do is ensure
/// that this map contains an entry for each of those consecutive block numbers. We chose
/// block numbers as keys for this map, as opposed to `Hash`es, so you can guarantee
/// uninterrupted connectivity from a current block to a future block, e.g. for pre-connect.
///
/// Concretely: for pre-connect (establishing a connection one block earlier than you need
/// it), you would make sure that this map contains two entries with the same value, one for
/// the current block height of the fork you are interested in and one with that block number
/// incremented. This way the connection will survive the next block in all cases (see the
/// sketch after this struct).
///
/// ## Target: what validators to connect to
///
/// The values of this map are the session/group combinations we want to be connected to for
/// the given block height.
required_connections: HashMap<BlockNumber, HashSet<(SessionIndex, GroupIndex)>>,

Contributor: indentation

Contributor: could this be a BTreeMap? might make pruning easier (prune all up to X)

Member (Author): I was thinking about that, but for the time being it looks like it is not needed.

/// Information about established connections to validators in a group.
connections: HashMap<(SessionIndex, GroupIndex), GroupConnection>,

/// Look up the session/group indexes for a peer that has connected.
reverse_peers: HashMap<PeerId, HashSet<(SessionIndex, GroupIndex)>>,

/// Look up groups in sessions for an authority (in requested connections).
authority_group: HashMap<AuthorityDiscoveryId, HashSet<(SessionIndex, GroupIndex)>>,

// Send task (given `relay_parent` and para id), see the sketch at the end of this file:
// - send for the para id,
// - which resolves to a core index (on a per block basis) - can be found once we have the block,
// - which resolves to a group index - can be found once we know the session (and block height),
// - which resolves to `AuthorityDiscoveryId`s - can be found if we have the session,
// - which resolve to `PeerId`s - can be resolved once connected.
requested_connections: HashMap<AuthorityDiscoveryId, GroupIndex>,
/// `PeerId`s for already established connections for some group.
group_peers: HashMap<GroupIndex, HashSet<PeerId>>,

established_connections: HashMap<Hash, GroupIndex>,

/// Report metrics.
metrics: Metrics,
}
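
// A minimal sketch of the pre-connect pattern described on `required_connections`
// above. The session, group and height values are illustrative; in the real
// subsystem they come from runtime lookups.
fn preconnect_sketch(advertiser: &mut Advertiser) {
let session: SessionIndex = 7;
let group = GroupIndex(3);
let current_height: BlockNumber = 100;

// One entry for the current block height of the fork we care about ...
advertiser.required_connections.entry(current_height).or_default().insert((session, group));
// ... and one with the block number incremented, so the connection survives
// the next block in all cases.
advertiser.required_connections.entry(current_height + 1).or_default().insert((session, group));
}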
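
// Re the review suggestion above: a `BTreeMap` keyed by block number would make
// pruning straightforward. A minimal sketch of that alternative (not what this
// PR currently does):
fn prune_below_sketch(
map: &mut std::collections::BTreeMap<BlockNumber, HashSet<(SessionIndex, GroupIndex)>>,
keep_from: BlockNumber,
) {
// `split_off` returns all entries with keys >= `keep_from`; keeping only those
// prunes everything below `keep_from`.
*map = map.split_off(&keep_from);
}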

/// Needed information for establishing connections.
struct ConnectionInfo {
/// The relay parent to use for determining the correct core.
relay_parent: Hash,
/// Block number to determine the desired backing group.
///
/// Note: this seemingly redundant info is not redundant in the case of pre-connect. In that
/// case the above `relay_parent` will be used for determining the core in advance, but the
/// responsible backing group will be determined based on the given `block_number`.
block_number: BlockNumber,
/// What session we are operating in.
///
/// For `pre-connect`, this will just be a guess, which might be wrong (we assume to stay in
/// the same session).
session_index: SessionIndex,
}

/// Information about connections to a validator group.
///
/// We keep track of connected peers in a validator group and the messages that should be sent to
/// them.
struct GroupConnection {
/// Connected peers.
peers: HashSet<PeerId>,
/// Messages that should be sent to connected peers in this group.
///
/// When messages are sent, peers might not yet be connected, so we keep all messages that
/// should be sent to peers here. If a new peer connects, all messages that are still relevant
/// to its view are sent (see the sketch after this struct).
messages: Vec<CollatorProtocolMessage>,
}
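
// A minimal sketch of the replay behaviour described on `GroupConnection`: a
// newly connected peer receives the buffered messages that are still relevant
// to its view. The `is_relevant_to_view` filter is a hypothetical stand-in for
// the actual relevance check.
fn replay_to_new_peer(
conn: &GroupConnection,
peer_view: &View,
is_relevant_to_view: impl Fn(&CollatorProtocolMessage, &View) -> bool,
mut send_to_peer: impl FnMut(&CollatorProtocolMessage),
) {
for msg in conn.messages.iter().filter(|m| is_relevant_to_view(m, peer_view)) {
send_to_peer(msg);
}
}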

impl Advertiser {
pub fn new(local_peer_id: PeerId, collator_pair: CollatorPair, metrics: Metrics) -> Self {
Self {
local_peer_id,
collator_pair,
collating_on: None,
peer_views: HashMap::new(),
view: OurView::default(),
required_connections: HashMap::new(),
connections: HashMap::new(),
reverse_peers: HashMap::new(),
authority_group: HashMap::new(),
requested_connections: HashMap::new(),
group_peers: HashMap::new(),
established_connections: HashMap::new(),
metrics,
}
}

/// Ask for additional connections.
///
/// They will automatically be established once a block with block height `when` comes into
/// view and will be closed once it goes out of view, assuming no other entry is present
/// which preserves them.
///
// - Look up `SessionIndex` & `GroupIndex`.
// - Add to `required_connections`.
// - Update `Connect` message if necessary (connection affects already present block heights).
pub fn add_required_connections<Sender: SubsystemSender>(
&mut self,
sender: &mut Sender,
when: BlockNumber,
info: ConnectionInfo,
) {
// WIP: the responsible backing group still needs to be resolved from `info` (via the
// occupied core and the group rotation) before it can be recorded here.
let group: GroupIndex = todo!("resolve the backing group from `info`");
self.required_connections.entry(when).or_default().insert((info.session_index, group));
}

/// Send a message to a validator group.
//
// - Insert the message.
// - Send it to all already connected peers, if in their view.
// -> Is the view still relevant with async backing? Likely only when it comes to height.

Contributor: View is still relevant - view determines their 'implicit view', which is the view-head + some ancestry (described in more detail here: #5054 (comment))

pub fn send_message(
&mut self,
session: SessionIndex,
group: GroupIndex,
msg: CollatorProtocolMessage,
) -> FatalResult<()> {
panic!("WIP");
}

// - Add to peers in groups.
// - Send any messages it has not received yet.
// - Clean out any obsolete messages.
//
// NOTE: Still WIP; taking just the `PeerId` here is an assumption, the final
// signature will likely need more context (e.g. the peer's groups).
pub fn on_peer_connected(&mut self, _peer: PeerId) {
panic!("WIP");
}


/// Process an active leaves update.
///
/// - Make sure needed connections are established
/// - Make sure obsolete connections are dropped
pub fn process_active_leaves_update<Context: SubsystemContext>(
&mut self,
ctx: &mut Context,
update: &ActiveLeavesUpdate,
) -> FatalResult<()> {
panic!("WIP");
}

/// Get all peers which have the given relay parent in their view.
fn peers_interested_in_leaf(&self, relay_parent: &Hash) -> Vec<PeerId> {
self.peer_views
.iter()
.filter(|(_, v)| v.contains(relay_parent))
.map(|(peer, _)| *peer)
.collect()
}

}
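
// A sketch of the resolution chain described at `requested_connections` above.
// Every `resolve`-style parameter is a hypothetical stand-in for runtime API and
// authority-discovery queries; each step becomes answerable at a different point
// in time, which is why the intermediate maps in `Advertiser` exist.
fn resolution_chain_sketch(
core_of_para: impl Fn(ParaId) -> Option<CoreIndex>, // known on a per block basis
group_for_core: impl Fn(CoreIndex) -> Option<GroupIndex>, // known with session and block height
authorities_of_group: impl Fn(GroupIndex) -> Vec<AuthorityDiscoveryId>, // known with session
peer_of_authority: impl Fn(&AuthorityDiscoveryId) -> Option<PeerId>, // known once connected
para: ParaId,
) -> Vec<PeerId> {
core_of_para(para)
.and_then(group_for_core)
.map(authorities_of_group)
.unwrap_or_default()
.into_iter()
.filter_map(|authority| peer_of_authority(&authority))
.collect()
}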
123 changes: 123 additions & 0 deletions node/network/collator-protocol/src/collator_side/metrics.rs
@@ -0,0 +1,123 @@
// Copyright 2022 Parity Technologies (UK) Ltd.
// This file is part of Polkadot.

// Polkadot is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.

// Polkadot is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.

// You should have received a copy of the GNU General Public License
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.

use polkadot_node_subsystem_util::metrics::{self, prometheus};

#[derive(Clone, Default)]
pub struct Metrics(Option<MetricsInner>);

impl Metrics {
pub fn on_advertisement_made(&self) {
if let Some(metrics) = &self.0 {
metrics.advertisements_made.inc();
}
}

pub fn on_collation_sent_requested(&self) {
if let Some(metrics) = &self.0 {
metrics.collations_send_requested.inc();
}
}

pub fn on_collation_sent(&self) {
if let Some(metrics) = &self.0 {
metrics.collations_sent.inc();
}
}

/// Provide a timer for `process_msg` which observes on drop.
pub fn time_process_msg(&self) -> Option<prometheus::prometheus::HistogramTimer> {
self.0.as_ref().map(|metrics| metrics.process_msg.start_timer())
}

/// Provide a timer for `distribute_collation` which observes on drop.
pub fn time_collation_distribution(
&self,
label: &'static str,
) -> Option<prometheus::prometheus::HistogramTimer> {
self.0.as_ref().map(|metrics| {
metrics.collation_distribution_time.with_label_values(&[label]).start_timer()
})
}
}

#[derive(Clone)]
struct MetricsInner {
advertisements_made: prometheus::Counter<prometheus::U64>,
collations_sent: prometheus::Counter<prometheus::U64>,
collations_send_requested: prometheus::Counter<prometheus::U64>,
process_msg: prometheus::Histogram,
collation_distribution_time: prometheus::HistogramVec,
}

impl metrics::Metrics for Metrics {
fn try_register(
registry: &prometheus::Registry,
) -> std::result::Result<Self, prometheus::PrometheusError> {
let metrics = MetricsInner {
advertisements_made: prometheus::register(
prometheus::Counter::new(
"polkadot_parachain_collation_advertisements_made_total",
"A number of collation advertisements sent to validators.",
)?,
registry,
)?,
collations_send_requested: prometheus::register(
prometheus::Counter::new(
"polkadot_parachain_collations_sent_requested_total",
"A number of collations requested to be sent to validators.",
)?,
registry,
)?,
collations_sent: prometheus::register(
prometheus::Counter::new(
"polkadot_parachain_collations_sent_total",
"A number of collations sent to validators.",
)?,
registry,
)?,
process_msg: prometheus::register(
prometheus::Histogram::with_opts(
prometheus::HistogramOpts::new(
"polkadot_parachain_collator_protocol_collator_process_msg",
"Time spent within `collator_protocol_collator::process_msg`",
)
.buckets(vec![
0.001, 0.002, 0.005, 0.01, 0.025, 0.05, 0.1, 0.15, 0.25, 0.35, 0.5, 0.75,
1.0,
]),
)?,
registry,
)?,
collation_distribution_time: prometheus::register(
prometheus::HistogramVec::new(
prometheus::HistogramOpts::new(
"polkadot_parachain_collator_protocol_collator_distribution_time",
"Time spent within `collator_protocol_collator::distribute_collation`",
)
.buckets(vec![
0.001, 0.002, 0.005, 0.01, 0.025, 0.05, 0.1, 0.15, 0.25, 0.35, 0.5, 0.75,
1.0,
]),
&["state"],
)?,
registry,
)?,
};

Ok(Metrics(Some(metrics)))
}
}
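
// A minimal usage sketch of the `Metrics` type above; the registry is created
// here only for illustration, in practice the subsystem receives one at startup.
fn metrics_usage_sketch() -> Result<(), prometheus::PrometheusError> {
let registry = prometheus::Registry::new();
let metrics = <Metrics as metrics::Metrics>::try_register(&registry)?;

// The timer observes the `process_msg` histogram when dropped.
let _timer = metrics.time_process_msg();
metrics.on_advertisement_made();
Ok(())
}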