Skip to content
This repository has been archived by the owner on Nov 15, 2023. It is now read-only.

Check spawned worker version vs node version before PVF preparation #6861

Merged
merged 20 commits into from
Mar 29, 2023
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
df6c683
Check spawned worker version vs node version before PVF preparation
s0me0ne-unkn0wn Mar 12, 2023
335495b
Address discussions
s0me0ne-unkn0wn Mar 12, 2023
b96cc31
Propagate errors and shutdown preparation and execution pipelines pro…
s0me0ne-unkn0wn Mar 13, 2023
54b11c6
Add logs; Fix execution worker checks
s0me0ne-unkn0wn Mar 14, 2023
3cd790f
Revert "Propagate errors and shutdown preparation and execution pipel…
s0me0ne-unkn0wn Mar 14, 2023
acc94f7
Don't try to shut down; report the condition and exit worker
s0me0ne-unkn0wn Mar 15, 2023
0db3ed3
Merge branch 's0me0ne/worker-version-v2' into s0me0ne/worker-version
s0me0ne-unkn0wn Mar 15, 2023
89c7897
Get rid of `VersionMismatch` preparation error
s0me0ne-unkn0wn Mar 15, 2023
fb5ccc9
Merge master
s0me0ne-unkn0wn Mar 15, 2023
db26c64
Add docs; Fix tests
s0me0ne-unkn0wn Mar 15, 2023
5a344de
Merge branch 'master' into s0me0ne/worker-version
s0me0ne-unkn0wn Mar 15, 2023
b696baa
Update Cargo.lock
s0me0ne-unkn0wn Mar 15, 2023
501d4be
Kill again, but only the main node process
s0me0ne-unkn0wn Mar 18, 2023
a42d0a5
Merge branch 'master' into s0me0ne/worker-version
s0me0ne-unkn0wn Mar 20, 2023
1b4678b
Move unsafe code to a common safe function
s0me0ne-unkn0wn Mar 20, 2023
ec9c4ca
Fix libc dependency error on MacOS
mrcnski Mar 20, 2023
0bd760f
pvf spawning: Add some logging, add a small integration test
mrcnski Mar 22, 2023
a9562c3
Merge branch 'master' into s0me0ne/worker-version
s0me0ne-unkn0wn Mar 29, 2023
0ecce2e
Minor fixes
s0me0ne-unkn0wn Mar 29, 2023
456314c
Restart CI
s0me0ne-unkn0wn Mar 29, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions cli/src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,9 @@ pub enum Subcommand {
/// CLI arguments for the `validation-worker` subcommand used when the node
/// re-executes itself as a PVF preparation/execution worker process.
pub struct ValidationWorkerCommand {
/// The path to the validation host's socket.
pub socket_path: String,
/// The implementation version of the node that spawned this worker
/// (passed as `--node-impl-version`). The worker compares it against its
/// own compiled-in `SUBSTRATE_CLI_IMPL_VERSION` to detect an in-place
/// binary upgrade (node/worker version mismatch).
#[arg(long)]
pub node_impl_version: String,
}

#[allow(missing_docs)]
Expand Down
5 changes: 4 additions & 1 deletion cli/src/command.rs
Original file line number Diff line number Diff line change
Expand Up @@ -494,7 +494,10 @@ pub fn run() -> Result<()> {

#[cfg(not(target_os = "android"))]
{
polkadot_node_core_pvf::prepare_worker_entrypoint(&cmd.socket_path);
polkadot_node_core_pvf::prepare_worker_entrypoint(
&cmd.socket_path,
Some(&cmd.node_impl_version),
);
Ok(())
}
},
Expand Down
3 changes: 3 additions & 0 deletions node/core/pvf/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@ sp-wasm-interface = { git = "https://github.com/paritytech/substrate", branch =
sp-maybe-compressed-blob = { git = "https://github.com/paritytech/substrate", branch = "master" }
sp-tracing = { git = "https://github.com/paritytech/substrate", branch = "master" }

[build-dependencies]
substrate-build-script-utils = { git = "https://github.com/paritytech/substrate", branch = "master" }

[target.'cfg(target_os = "linux")'.dependencies]
libc = "0.2.139"
tikv-jemalloc-ctl = "0.5.0"
Expand Down
19 changes: 19 additions & 0 deletions node/core/pvf/build.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
// Copyright 2017-2023 Parity Technologies (UK) Ltd.
// This file is part of Polkadot.

// Polkadot is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.

// Polkadot is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.

// You should have received a copy of the GNU General Public License
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.

// Build-script entry point: `generate_cargo_keys()` emits `cargo:rustc-env=...`
// keys — including `SUBSTRATE_CLI_IMPL_VERSION` — so this crate can embed the
// node implementation version at compile time. The worker-spawn and
// worker-entrypoint code read it via `env!("SUBSTRATE_CLI_IMPL_VERSION")` to
// detect a node/worker version mismatch.
fn main() {
substrate_build_script_utils::generate_cargo_keys();
}
6 changes: 5 additions & 1 deletion node/core/pvf/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ pub enum PrepareError {
/// The response from the worker is received, but the file cannot be renamed (moved) to the final destination
/// location. This state is reported by the validation host (not by the worker).
RenameTmpFileErr(String),
/// Node and worker version mismatch. May happen if the node binary has been upgraded in-place.
VersionMismatch,
}

impl PrepareError {
Expand All @@ -55,7 +57,8 @@ impl PrepareError {
use PrepareError::*;
match self {
Prevalidation(_) | Preparation(_) | Panic(_) => true,
TimedOut | IoErr(_) | CreateTmpFileErr(_) | RenameTmpFileErr(_) => false,
TimedOut | IoErr(_) | CreateTmpFileErr(_) | RenameTmpFileErr(_) | VersionMismatch =>
false,
}
}
}
Expand All @@ -71,6 +74,7 @@ impl fmt::Display for PrepareError {
IoErr(err) => write!(f, "prepare: io error while receiving response: {}", err),
CreateTmpFileErr(err) => write!(f, "prepare: error creating tmp file: {}", err),
RenameTmpFileErr(err) => write!(f, "prepare: error renaming tmp file: {}", err),
VersionMismatch => write!(f, "prepare: node and worker version mismatch"),
}
}
}
Expand Down
36 changes: 33 additions & 3 deletions node/core/pvf/src/prepare/worker.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,13 @@ pub async fn spawn(
program_path: &Path,
spawn_timeout: Duration,
) -> Result<(IdleWorker, WorkerHandle), SpawnErr> {
spawn_with_program_path("prepare", program_path, &["prepare-worker"], spawn_timeout).await
spawn_with_program_path(
"prepare",
program_path,
&["prepare-worker", "--node-impl-version", env!("SUBSTRATE_CLI_IMPL_VERSION")],
spawn_timeout,
)
.await
s0me0ne-unkn0wn marked this conversation as resolved.
Show resolved Hide resolved
}

pub enum Outcome {
Expand Down Expand Up @@ -175,6 +181,15 @@ async fn handle_response(
Ok(result) => result,
// Timed out on the child. This should already be logged by the child.
Err(PrepareError::TimedOut) => return Outcome::TimedOut,
// Worker reported version mismatch
Err(PrepareError::VersionMismatch) => {
gum::error!(
target: LOG_TARGET,
%worker_pid,
"node and worker version mismatch",
);
std::process::exit(1);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you can send a signal to the overseer to let it shutdown?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's PVF host, it isn't a subsystem by itself, and it's not aware of overseer's existence 🙄
To handle it the right way, the error should be propagated all the way up to the candidate validation and PVF pre-check subsystems and handled by them separately. Do you think it's worth the effort?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You shouldn't just kill the process here, you should bubble up this error and let the host kill the worker properly. Look at how the host handles e.g. TimedOut for an example. Also, I'm pretty sure this code is running on the host so exit would kill the node, not the worker, which I think is not what we want? (Haven't followed the disputes issue that closely over the weekend.)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, I'm pretty sure this code is running on the host so exit would kill the node, not the worker

Yes, it's exactly what we want. At this point, we found out that the node owner screwed up the node upgrade process and we're still running an old node software, but the binary is already new, and we're trying to execute a new worker from the old node. In that case, we want to tear the whole node down and let the node owner handle its restart.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, got it. Is any clean-up of the workers necessary, or do they get killed automatically as child processes?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point. Unfortunately this means potentially waiting the full lenient timeout duration for a worker to exit. I think it would make sense to just kill them forcefully so we don't wait.

Yes for sure we should kill the workers, they don't do anything important that could get lost when we kill them.

It makes sense to implement @koute suggestion as well and include polkadot's version into the artifact path. This way old version can't override an artifact for the new node.

This should be a separate pr.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need to wait for the workers to shut down.

Good point! But I've reviewed the code and looks like @pepyakin thought this through before us.

start_work() on the host side provides the worker with a temporary path where it should store the artifact. If the worker reports a positive result, then in handle_result(), again on the host side, the artifact gets renamed to its real artifact_id path. If the host dies between start_work() and handle_result(), the only leftover is a stale tmp file which will be pruned on the next node restart.

It's still a good idea to kill preparation workers on version mismatch.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm.... thinking about this some more, since we delete the cache anyway at each startup couldn't we alternatively just hash a random number into the filename? I don't remember if we already do this, but assuming we write the cache atomically to disk (that is, save the cache to a file with e.g. .tmp extension or into another directory, and then rename it to where we want it; writing is not atomic, but renaming always is) we'd be essentially guaranteed that we won't load a stale file. This would also allow us to just simply forcibly kill the worker on exit without having to wait for it to shut down.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@koute you mean some random number we generate on node startup and use throughout the node instance lifetime? I think it's a good approach also, but your first idea about including the node version number to the artifact name is somewhat more... Deterministic, I'd say? :)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you mean some random number we generate on node startup and use throughout the node instance lifetime?

Yeah, just generate a random number and then just hash that in. Assuming the number is actually random and the hash is cryptographic (e.g. BLAKE2) we'll have a guarantee that no two separate runs will try to load the same files.

I think it's a good approach also, but your first idea about including the node version number to the artifact name is somewhat more... Deterministic, I'd say? :)

Well, sure, but considering that we don't want to reuse those cache files after the node restarts what's the point of them having stable filenames across restarts? Isn't that just unnecessary complexity? (:

(Unless we do want to reuse them after a restart, in which case carry on.)

},
Err(_) => return Outcome::Concluded { worker, result },
};

Expand Down Expand Up @@ -342,11 +357,26 @@ async fn recv_response(stream: &mut UnixStream, pid: u32) -> io::Result<PrepareR
///
/// 7. Send the result of preparation back to the host. If any error occurred in the above steps, we
/// send that in the `PrepareResult`.
pub fn worker_entrypoint(socket_path: &str) {
pub fn worker_entrypoint(socket_path: &str, node_version: Option<&str>) {
worker_event_loop("prepare", socket_path, |rt_handle, mut stream| async move {
let worker_pid = std::process::id();
s0me0ne-unkn0wn marked this conversation as resolved.
Show resolved Hide resolved
let version_mismatch = if let Some(version) = node_version {
version != env!("SUBSTRATE_CLI_IMPL_VERSION")
s0me0ne-unkn0wn marked this conversation as resolved.
Show resolved Hide resolved
} else {
false
};

loop {
let worker_pid = std::process::id();
let (pvf, dest) = recv_request(&mut stream).await?;
if version_mismatch {
gum::error!(
target: LOG_TARGET,
%worker_pid,
"worker: node and worker version mismatch",
);
send_response(&mut stream, Err(PrepareError::VersionMismatch)).await?;
return Err(io::Error::new(io::ErrorKind::Unsupported, "Version mismatch"))
}
gum::debug!(
target: LOG_TARGET,
%worker_pid,
Expand Down
2 changes: 1 addition & 1 deletion node/core/pvf/src/testing.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ macro_rules! decl_puppet_worker_main {
},
"prepare-worker" => {
let socket_path = &args[2];
$crate::prepare_worker_entrypoint(socket_path);
$crate::prepare_worker_entrypoint(socket_path, None);
},
"execute-worker" => {
let socket_path = &args[2];
Expand Down
2 changes: 1 addition & 1 deletion node/malus/src/malus.rs
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ impl MalusCli {

#[cfg(not(target_os = "android"))]
{
polkadot_node_core_pvf::prepare_worker_entrypoint(&cmd.socket_path);
polkadot_node_core_pvf::prepare_worker_entrypoint(&cmd.socket_path, None);
}
},
NemesisVariant::PvfExecuteWorker(cmd) => {
Expand Down