From 0bcb6f9d43917434725e1e06064b50f3e65ebb6a Mon Sep 17 00:00:00 2001 From: Robert Habermeier Date: Fri, 31 Jul 2020 11:07:31 -0400 Subject: [PATCH] guide: collator networking & subsystems (#1452) * Do a small write-up on collation-generation * preamble to collator protocol * notes on protocol * collation-generation: point to collator protocol * fix missing bracket * expand on collator protocol wire protocol * add a couple more sentences * expand on requests some more * go higher level * network bridge: note peerset * note peer-set = validation for protocols * add `ConnectToValidators` message * use ConnectToValidators in collator protocol * typo * remove references to sentry nodes --- roadmap/implementers-guide/src/SUMMARY.md | 2 +- .../availability/availability-distribution.md | 2 +- .../availability/bitfield-distribution.md | 2 +- .../src/node/backing/pov-distribution.md | 4 +- .../node/backing/statement-distribution.md | 2 +- .../node/collators/collation-distribution.md | 9 -- .../node/collators/collation-generation.md | 31 +++- .../src/node/collators/collator-protocol.md | 133 ++++++++++++++++++ .../src/node/utility/network-bridge.md | 18 ++- .../src/parachains-overview.md | 2 +- .../src/types/overseer-protocol.md | 46 +++++- 11 files changed, 228 insertions(+), 23 deletions(-) delete mode 100644 roadmap/implementers-guide/src/node/collators/collation-distribution.md create mode 100644 roadmap/implementers-guide/src/node/collators/collator-protocol.md diff --git a/roadmap/implementers-guide/src/SUMMARY.md b/roadmap/implementers-guide/src/SUMMARY.md index 0a7fa21d2a31..86ddabe45ba7 100644 --- a/roadmap/implementers-guide/src/SUMMARY.md +++ b/roadmap/implementers-guide/src/SUMMARY.md @@ -29,7 +29,7 @@ - [Bitfield Signing](node/availability/bitfield-signing.md) - [Collators](node/collators/README.md) - [Collation Generation](node/collators/collation-generation.md) - - [Collation Distribution](node/collators/collation-distribution.md) + - [Collator 
Protocol](node/collators/collator-protocol.md) - [Validity](node/validity/README.md) - [Utility Subsystems](node/utility/README.md) - [Availability Store](node/utility/availability-store.md) diff --git a/roadmap/implementers-guide/src/node/availability/availability-distribution.md b/roadmap/implementers-guide/src/node/availability/availability-distribution.md index 008f3e91fbe7..3c530bde8450 100644 --- a/roadmap/implementers-guide/src/node/availability/availability-distribution.md +++ b/roadmap/implementers-guide/src/node/availability/availability-distribution.md @@ -6,7 +6,7 @@ After a candidate is backed, the availability of the PoV block must be confirmed ## Protocol -`ProtocolId`:`b"avad"` +`ProtocolId`: `b"avad"`, `PeerSet`: `Validation` Input: diff --git a/roadmap/implementers-guide/src/node/availability/bitfield-distribution.md b/roadmap/implementers-guide/src/node/availability/bitfield-distribution.md index 528b5f9d1d74..faf1985c9269 100644 --- a/roadmap/implementers-guide/src/node/availability/bitfield-distribution.md +++ b/roadmap/implementers-guide/src/node/availability/bitfield-distribution.md @@ -4,7 +4,7 @@ Validators vote on the availability of a backed candidate by issuing signed bitf ## Protocol -`ProtocolId`: `b"bitd"` +`ProtocolId`: `b"bitd"`, `PeerSet`: `Validation` Input: [`BitfieldDistributionMessage`](../../types/overseer-protocol.md#bitfield-distribution-message) which are gossiped to all peers, no matter if validator or not. diff --git a/roadmap/implementers-guide/src/node/backing/pov-distribution.md b/roadmap/implementers-guide/src/node/backing/pov-distribution.md index 4215386d2644..486ba96fccb0 100644 --- a/roadmap/implementers-guide/src/node/backing/pov-distribution.md +++ b/roadmap/implementers-guide/src/node/backing/pov-distribution.md @@ -4,7 +4,7 @@ This subsystem is responsible for distributing PoV blocks. 
For now, unified with ## Protocol -`ProtocolId`: `b"povd"` +`ProtocolId`: `b"povd"`, `PeerSet`: `Validation` Input: [`PoVDistributionMessage`](../../types/overseer-protocol.md#pov-distribution-message) @@ -18,7 +18,7 @@ Output: ## Functionality -This network protocol is responsible for distributing [`PoV`s](../../types/availability.md#proof-of-validity) by gossip. Since PoVs are heavy in practice, gossip is far from the most efficient way to distribute them. In the future, this should be replaced by a better network protocol that finds validators who have validated the block and connects to them directly. This protocol is descrbied +This network protocol is responsible for distributing [`PoV`s](../../types/availability.md#proof-of-validity) by gossip. Since PoVs are heavy in practice, gossip is far from the most efficient way to distribute them. In the future, this should be replaced by a better network protocol that finds validators who have validated the block and connects to them directly. This protocol is described. This protocol is described in terms of "us" and our peers, with the understanding that this is the procedure that any honest node will run. 
It has the following goals: - We never have to buffer an unbounded amount of data diff --git a/roadmap/implementers-guide/src/node/backing/statement-distribution.md b/roadmap/implementers-guide/src/node/backing/statement-distribution.md index d05c68f7af70..b9a8914a9963 100644 --- a/roadmap/implementers-guide/src/node/backing/statement-distribution.md +++ b/roadmap/implementers-guide/src/node/backing/statement-distribution.md @@ -4,7 +4,7 @@ The Statement Distribution Subsystem is responsible for distributing statements ## Protocol -`ProtocolId`: `b"stmd"` +`ProtocolId`: `b"stmd"`, `PeerSet`: `Validation` Input: diff --git a/roadmap/implementers-guide/src/node/collators/collation-distribution.md b/roadmap/implementers-guide/src/node/collators/collation-distribution.md deleted file mode 100644 index 0b24ce47ca56..000000000000 --- a/roadmap/implementers-guide/src/node/collators/collation-distribution.md +++ /dev/null @@ -1,9 +0,0 @@ -# Collation Distribution - -> TODO - -## Protocol - -## Functionality - -## Jobs, if any diff --git a/roadmap/implementers-guide/src/node/collators/collation-generation.md b/roadmap/implementers-guide/src/node/collators/collation-generation.md index 1c828b3b0f9c..a182fbadd307 100644 --- a/roadmap/implementers-guide/src/node/collators/collation-generation.md +++ b/roadmap/implementers-guide/src/node/collators/collation-generation.md @@ -1,9 +1,36 @@ # Collation Generation -> TODO +The collation generation subsystem is executed on collator nodes and produces candidates to be distributed to validators. If configured to produce collations for a para, it produces collations and then feeds them to the [Collator Protocol][CP] subsystem, which handles the networking. ## Protocol +Input: None + +Output: CollatorProtocolMessage + ## Functionality -## Jobs, if any +The process of generating a collation for a parachain is very parachain-specific. As such, the details of how to do so are left beyond the scope of this description. 
The subsystem should be implemented as an abstract wrapper, which is aware of this configuration: + +```rust +struct CollationGenerationConfig { + key: CollatorPair, + collation_producer: Fn(params) -> async (HeadData, Vec<UpwardMessage>, PoV), +} +``` + +The configuration should be optional, to allow for the case where the node is not run with the capability to collate. + +On `ActiveLeavesUpdate`: + * If there is no collation generation config, ignore. + * Otherwise, for each `activated` head in the update: + * Determine if the para is scheduled or is next up on any occupied core by fetching the `availability_cores` Runtime API. + * Determine an occupied core assumption to make about the para. The simplest thing to do is to always assume that, if the para occupies a core, the candidate will become available. Further on, this might be determined based on bitfields seen or validator requests. + * Use the Runtime API subsystem to fetch the global validation data and local validation data. + * Construct validation function params based on validation data. + * Invoke the `collation_producer`. + * Construct a `CommittedCandidateReceipt` using the outputs of the `collation_producer` and signing with the `key`. + * Dispatch a [`CollatorProtocolMessage`][CPM]`::DistributeCollation(receipt, pov)`. + +[CP]: collator-protocol.md +[CPM]: ../../types/overseer-protocol.md#collator-protocol-message diff --git a/roadmap/implementers-guide/src/node/collators/collator-protocol.md b/roadmap/implementers-guide/src/node/collators/collator-protocol.md new file mode 100644 index 000000000000..526aab31aec8 --- /dev/null +++ b/roadmap/implementers-guide/src/node/collators/collator-protocol.md @@ -0,0 +1,133 @@ +# Collator Protocol + +The Collator Protocol implements the network protocol by which collators and validators communicate. It is used by collators to distribute collations to validators and used by validators to accept collations from collators. 
+ +Collator-to-Validator networking is more difficult than Validator-to-Validator networking because the set of possible collators for any given para is unbounded, unlike the validator set. Validator-to-Validator networking protocols can easily be implemented as gossip because the data can be bounded, and validators can authenticate each other by their `PeerId`s for the purposes of instantiating and accepting connections. + +Since, at least at the level of the para abstraction, the collator-set for any given para is unbounded, validators need to make sure that they are receiving connections from capable and honest collators and that their bandwidth and time are not being wasted by attackers. Communicating across this trust-boundary is the most difficult part of this subsystem. + +Validation of candidates is a heavy task, and furthermore, the [`PoV`][PoV] itself is a large piece of data. Empirically, `PoV`s are on the order of 10MB. + +> TODO: note the incremental validation function Ximin proposes at https://github.com/paritytech/polkadot/issues/1348 + +As this network protocol serves as a bridge between collators and validators, it communicates primarily with one subsystem on behalf of each. As a collator, this will receive messages from the [`CollationGeneration`][CG] subsystem. As a validator, this will communicate with the [`CandidateBacking`][CB] subsystem. + +## Protocol + +Input: [`CollatorProtocolMessage`][CPM] + +Output: + - [`RuntimeApiMessage`][RAM] + - [`NetworkBridgeMessage`][NBM] + +## Functionality + +This network protocol uses the `Collation` peer-set of the [`NetworkBridge`][NB]. + +```rust +type RequestId = u64; + +enum WireMessage { + /// Declare the intent to advertise collations under a collator ID. + Declare(CollatorId), + /// Advertise a collation to a validator. Can only be sent once the peer has declared + /// that they are a collator with given ID. 
+ AdvertiseCollation(Hash, ParaId), + /// Request the advertised collation at that relay-parent. + RequestCollation(RequestId, Hash, ParaId), + /// A requested collation. + Collation(RequestId, CandidateReceipt, PoV), +} +``` + +Since this protocol functions both for validators and collators, it is easiest to go through the protocol actions for each of them separately. + +Validators and collators. +```dot process +digraph { + c1 [shape=MSquare, label="Collator 1"]; + c2 [shape=MSquare, label="Collator 2"]; + + v1 [shape=MSquare, label="Validator 1"]; + v2 [shape=MSquare, label="Validator 2"]; + + c1 -> v1; + c1 -> v2; + c2 -> v2; +} +``` + +### Collators + +It is assumed that collators are only collating on a single parachain. Collations are generated by the [Collation Generation][CG] subsystem. We will keep up to one local collation per relay-parent, based on `DistributeCollation` messages. If the para is not scheduled or next up on any core, at the relay-parent, or the relay-parent isn't in the active-leaves set, we ignore the message as it must be invalid in that case - although this indicates a logic error elsewhere in the node. + +We keep track of the Para ID we are collating on as a collator. This starts as `None`, and is updated with each `CollateOn` message received. If the `ParaId` of a collation requested to be distributed does not match the one we expect, we ignore the message. + +As with most other subsystems, we track the active leaves set by following `ActiveLeavesUpdate` signals. + +For the purposes of actually distributing a collation, we need to be connected to the validators who are interested in collations on that `ParaId` at this point in time. We assume that there is a discovery API for connecting to a set of validators. + +> TODO: design & expose the discovery API not just for connecting to such peers but also to determine which of our current peers are validators. 
+ +As seen in the [Scheduler Module][SCH] of the runtime, validator groups are fixed for an entire session and their rotations across cores are predictable. Collators will want to do these things when attempting to distribute collations at a given relay-parent: + * Determine which core the para collated-on is assigned to. + * Determine the group on that core and the next group on that core. + * Issue a discovery request for the validators of the current group and the next group with [`NetworkBridgeMessage`][NBM]`::ConnectToValidators`. + +Once connected to the relevant peers for the current group assigned to the core (transitively, the para), advertise the collation to any of them which advertise the relay-parent in their view (as provided by the [Network Bridge][NB]). If any respond with a request for the full collation, provide it. Upon receiving a view update from any of these peers which includes a relay-parent for which we have a collation that they will find relevant, advertise the collation to them if we haven't already. + +### Validators + +On the validator side of the protocol, validators need to accept incoming connections from collators. They should keep some peer slots open for accepting new speculative connections from collators and should disconnect from collators who are not relevant. + +```dot process +digraph G { + label = "Declaring, advertising, and providing collations"; + labelloc = "t"; + rankdir = LR; + + subgraph cluster_collator { + rank = min; + label = "Collator"; + graph[style = border, rank = min]; + + c1, c2 [label = ""]; + } + + subgraph cluster_validator { + rank = same; + label = "Validator"; + graph[style = border]; + + v1, v2 [label = ""]; + } + + c1 -> v1 [label = "Declare and advertise"]; + + v1 -> c2 [label = "Request"]; + + c2 -> v2 [label = "Provide"]; + + v2 -> v2 [label = "Note Good/Bad"]; +} +``` + +When peers connect to us, they can `Declare` that they represent a collator with given public key. 
Once they've declared that, they can begin to send advertisements of collations. The peers should not send us any advertisements for collations that are on a relay-parent outside of our view. + +The protocol tracks advertisements received and the source of the advertisement. The advertisement source is the `PeerId` of the peer who sent the message. We accept one advertisement per collator per source per relay-parent. + +As a validator, we will handle requests from other subsystems to fetch a collation on a specific `ParaId` and relay-parent. These requests are made with the [`CollatorProtocolMessage`][CPM]`::FetchCollation`. To do so, we need to first check if we have already gathered a collation on that `ParaId` and relay-parent. If not, we need to select one of the advertisements and issue a request for it. If we've already issued a request, we shouldn't issue another one until the first has returned. + +When acting on an advertisement, we issue a `WireMessage::RequestCollation`. If the request times out, we need to note the collator as being unreliable and reduce its priority relative to other collators. We then make another request, repeating until we get a response or the chain has moved on. + +As a validator, once the collation has been fetched some other subsystem will inspect and do deeper validation of the collation. That subsystem will report back to this one with a [`CollatorProtocolMessage`][CPM]`::ReportCollator` or `NoteGoodCollation` message. In the case of `ReportCollator`, if we are connected directly to the collator, we apply a cost to the `PeerId` associated with the collator and potentially disconnect or blacklist it. 
+ +[PoV]: ../../types/availability.md#proof-of-validity +[CPM]: ../../types/overseer-protocol.md#collator-protocol-message +[CG]: collation-generation.md +[CB]: ../backing/candidate-backing.md +[NB]: ../utility/network-bridge.md +[CBM]: ../../types/overseer-protocol.md#candidate-backing-message +[RAM]: ../../types/overseer-protocol.md#runtime-api-message +[NBM]: ../../types/overseer-protocol.md#network-bridge-message +[SCH]: ../../runtime/scheduler.md diff --git a/roadmap/implementers-guide/src/node/utility/network-bridge.md b/roadmap/implementers-guide/src/node/utility/network-bridge.md index 09c7e081a6ab..bd4a6a3d75a0 100644 --- a/roadmap/implementers-guide/src/node/utility/network-bridge.md +++ b/roadmap/implementers-guide/src/node/utility/network-bridge.md @@ -8,19 +8,24 @@ One other piece of shared state to track is peer reputation. When peers are foun So in short, this Subsystem acts as a bridge between an actual network component and a subsystem's protocol. +The other component of the network bridge is which peer-set to use. Different peer-sets can be connected for different purposes. The network bridge is not generic over peer-set, but instead exposes two peer-sets that event producers can attach to: `Validation` and `Collation`. More information can be found in the documentation of the [`NetworkBridgeMessage`][NBM]. + ## Protocol -Input: [`NetworkBridgeMessage`](../../types/overseer-protocol.md#network-bridge-message) +Input: [`NetworkBridgeMessage`][NBM] Output: Varying, based on registered event producers. ## Functionality -Track a set of all Event Producers, each associated with a 4-byte protocol ID. +Track a set of all Event Producers, each associated with a 4-byte protocol ID and the `PeerSet` it is registered on. + There are two types of network messages this sends and receives: - ProtocolMessage(ProtocolId, Bytes) - ViewUpdate(View) +Each of these network messages is associated with a particular peer-set. 
If we are connected to the same peer on both peer-sets, we will receive two `ViewUpdate`s from them every time they change their view. + `ActiveLeavesUpdate`'s `activated` and `deactivated` lists determine the evolution of our local view over time. A `ViewUpdate` is issued to each connected peer after each update, and a `NetworkBridgeUpdate::OurViewChange` is issued for each registered event producer. On `RegisterEventProducer`: @@ -44,3 +49,12 @@ On `ReportPeer` message: On `SendMessage` message: - Issue a corresponding `ProtocolMessage` to each listed peer with given protocol ID and bytes. + +[NBM]: ../../types/overseer-protocol.md#network-bridge-message + +On `ConnectToValidators` message: + +- Determine the DHT keys to use for each validator based on the relay-chain state and Runtime API. +- Recover the Peer IDs of the validators from the DHT. There may be more than one peer ID per validator. +- Accumulate all `(ValidatorId, PeerId)` pairs and send on the response channel. +- Feed all Peer IDs to the discovery utility the underlying network provides. diff --git a/roadmap/implementers-guide/src/parachains-overview.md b/roadmap/implementers-guide/src/parachains-overview.md index 23a8e83cf2b6..b8b39eee4f0f 100644 --- a/roadmap/implementers-guide/src/parachains-overview.md +++ b/roadmap/implementers-guide/src/parachains-overview.md @@ -18,7 +18,7 @@ Here is a description of the Inclusion Pipeline: the path a parachain block (or 1. Validators are selected and assigned to parachains by the Validator Assignment routine. 1. A collator produces the parachain block, which is known as a parachain candidate or candidate, along with a PoV for the candidate. -1. The collator forwards the candidate and PoV to validators assigned to the same parachain via the [Collation Distribution subsystem](node/collators/collation-distribution.md). +1. 
The collator forwards the candidate and PoV to validators assigned to the same parachain via the [Collator Protocol](node/collators/collator-protocol.md). 1. The validators assigned to a parachain at a given point in time participate in the [Candidate Backing subsystem](node/backing/candidate-backing.md) to validate candidates that were put forward for validation. Candidates which gather enough signed validity statements from validators are considered "backable". Their backing is the set of signed validity statements. 1. A relay-chain block author, selected by BABE, can note up to one (1) backable candidate for each parachain to include in the relay-chain block alongside its backing. A backable candidate once included in the relay-chain is considered backed in that fork of the relay-chain. 1. Once backed in the relay-chain, the parachain candidate is considered to be "pending availability". It is not considered to be included as part of the parachain until it is proven available. diff --git a/roadmap/implementers-guide/src/types/overseer-protocol.md b/roadmap/implementers-guide/src/types/overseer-protocol.md index c3e0e9c2305c..412ac5a1d6c1 100644 --- a/roadmap/implementers-guide/src/types/overseer-protocol.md +++ b/roadmap/implementers-guide/src/types/overseer-protocol.md @@ -126,20 +126,60 @@ enum CandidateSelectionMessage { } ``` +## Collator Protocol Message + +Messages received by the [Collator Protocol subsystem](../node/collators/collator-protocol.md) + +```rust +enum CollatorProtocolMessage { + /// Signal to the collator protocol that it should connect to validators with the expectation + /// of collating on the given para. This is only expected to be called once, early on, if at all, + /// and only by the Collation Generation subsystem. As such, it will overwrite the value of + /// the previous signal. + /// + /// This should be sent before any `DistributeCollation` message. + CollateOn(ParaId), + /// Provide a collation to distribute to validators. 
+ DistributeCollation(CandidateReceipt, PoV), + /// Fetch a collation under the given relay-parent for the given ParaId. + FetchCollation(Hash, ParaId, ResponseChannel<(CandidateReceipt, PoV)>), + /// Report a collator as having provided an invalid collation. This should lead to disconnect + /// and blacklist of the collator. + ReportCollator(CollatorId), + /// Note a collator as having provided a good collation. + NoteGoodCollation(CollatorId), +} +``` + ## Network Bridge Message Messages received by the network bridge. This subsystem is invoked by others to manipulate access to the low-level networking code. ```rust +/// Peer-sets handled by the network bridge. +enum PeerSet { + /// The collation peer-set is used to distribute collations from collators to validators. + Collation, + /// The validation peer-set is used to distribute information relevant to parachain + /// validation among validators. This may include nodes which are not validators, + /// as some protocols on this peer-set are expected to be gossip. + Validation, +} + enum NetworkBridgeMessage { /// Register an event producer with the network bridge. This should be done early and cannot /// be de-registered. - RegisterEventProducer(ProtocolId, Fn(NetworkBridgeEvent) -> AllMessages), + RegisterEventProducer(PeerSet, ProtocolId, Fn(NetworkBridgeEvent) -> AllMessages), /// Report a cost or benefit of a peer. Negative values are costs, positive are benefits. - ReportPeer(PeerId, cost_benefit: i32), + ReportPeer(PeerSet, PeerId, cost_benefit: i32), /// Send a message to one or more peers on the given protocol ID. - SendMessage([PeerId], ProtocolId, Bytes), + SendMessage(PeerSet, [PeerId], ProtocolId, Bytes), + /// Connect to peers who represent the given `ValidatorId`s at the given relay-parent. + /// + /// Also accepts a response channel by which the issuer can learn the `PeerId`s of those + /// validators. + ConnectToValidators(PeerSet, [ValidatorId], ResponseChannel<[(ValidatorId, PeerId)]>), } ```