Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Merged by Bors] - CI fix: add retries to eth1 sim tests #4501

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions testing/node_test_rig/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@ validator_client = { path = "../../validator_client" }
validator_dir = { path = "../../common/validator_dir", features = ["insecure_keys"] }
sensitive_url = { path = "../../common/sensitive_url" }
execution_layer = { path = "../../beacon_node/execution_layer" }
tokio = { version = "1.14.0", features = ["time"] }
19 changes: 13 additions & 6 deletions testing/node_test_rig/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ use std::path::PathBuf;
use std::time::Duration;
use std::time::{SystemTime, UNIX_EPOCH};
use tempfile::{Builder as TempBuilder, TempDir};
use tokio::time::timeout;
use types::EthSpec;
use validator_client::ProductionValidatorClient;
use validator_dir::insecure_keys::build_deterministic_validator_dirs;
Expand All @@ -24,6 +25,8 @@ pub use validator_client::Config as ValidatorConfig;

/// The global timeout for HTTP requests to the beacon node.
const HTTP_TIMEOUT: Duration = Duration::from_secs(4);
/// The timeout for a beacon node to start up.
const STARTUP_TIMEOUT: Duration = Duration::from_secs(60);

/// Provides a beacon node that is running in the current process on a given tokio executor (it
/// is _local_ to this process).
Expand Down Expand Up @@ -51,12 +54,16 @@ impl<E: EthSpec> LocalBeaconNode<E> {
client_config.set_data_dir(datadir.path().into());
client_config.network.network_dir = PathBuf::from(datadir.path()).join("network");

ProductionBeaconNode::new(context, client_config)
.await
.map(move |client| Self {
client: client.into_inner(),
datadir,
})
timeout(
STARTUP_TIMEOUT,
ProductionBeaconNode::new(context, client_config),
)
.await
.map_err(|_| format!("Beacon node startup timed out after {:?}", STARTUP_TIMEOUT))?
.map(move |client| Self {
client: client.into_inner(),
datadir,
})
}
}

Expand Down
168 changes: 104 additions & 64 deletions testing/simulator/src/eth1_sim.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
use crate::local_network::{EXECUTION_PORT, TERMINAL_BLOCK, TERMINAL_DIFFICULTY};
use crate::{checks, LocalNetwork, E};
use crate::{checks, LocalNetwork};
use clap::ArgMatches;
use eth1::{Eth1Endpoint, DEFAULT_CHAIN_ID};
use eth1_test_rig::AnvilEth1Instance;

use crate::retry::with_retry;
use execution_layer::http::deposit_methods::Eth1Id;
use futures::prelude::*;
use node_test_rig::environment::RuntimeContext;
use node_test_rig::{
environment::{EnvironmentBuilder, LoggerConfig},
testing_client_config, testing_validator_config, ClientGenesis, ValidatorFiles,
testing_client_config, testing_validator_config, ClientConfig, ClientGenesis, ValidatorFiles,
};
use rayon::prelude::*;
use sensitive_url::SensitiveUrl;
Expand Down Expand Up @@ -107,71 +109,24 @@ pub fn run_eth1_sim(matches: &ArgMatches) -> Result<(), String> {
let context = env.core_context();

let main_future = async {
/*
* Deploy the deposit contract, spawn tasks to keep creating new blocks and deposit
* validators.
*/
let anvil_eth1_instance = AnvilEth1Instance::new(DEFAULT_CHAIN_ID.into()).await?;
let deposit_contract = anvil_eth1_instance.deposit_contract;
let chain_id = anvil_eth1_instance.anvil.chain_id();
let anvil = anvil_eth1_instance.anvil;
let eth1_endpoint = SensitiveUrl::parse(anvil.endpoint().as_str())
.expect("Unable to parse anvil endpoint.");
let deposit_contract_address = deposit_contract.address();

// Start a timer that produces eth1 blocks on an interval.
tokio::spawn(async move {
let mut interval = tokio::time::interval(eth1_block_time);
loop {
interval.tick().await;
let _ = anvil.evm_mine().await;
}
});

// Submit deposits to the deposit contract.
tokio::spawn(async move {
for i in 0..total_validator_count {
println!("Submitting deposit for validator {}...", i);
let _ = deposit_contract
.deposit_deterministic_async::<E>(i, deposit_amount)
.await;
}
});

let mut beacon_config = testing_client_config();

beacon_config.genesis = ClientGenesis::DepositContract;
beacon_config.eth1.endpoint = Eth1Endpoint::NoAuth(eth1_endpoint);
beacon_config.eth1.deposit_contract_address = deposit_contract_address;
beacon_config.eth1.deposit_contract_deploy_block = 0;
beacon_config.eth1.lowest_cached_block_number = 0;
beacon_config.eth1.follow_distance = 1;
beacon_config.eth1.node_far_behind_seconds = 20;
beacon_config.dummy_eth1_backend = false;
beacon_config.sync_eth1_chain = true;
beacon_config.eth1.auto_update_interval_millis = eth1_block_time.as_millis() as u64;
beacon_config.eth1.chain_id = Eth1Id::from(chain_id);
beacon_config.network.target_peers = node_count + proposer_nodes - 1;

beacon_config.network.enr_address = (Some(Ipv4Addr::LOCALHOST), None);

if post_merge_sim {
let el_config = execution_layer::Config {
execution_endpoints: vec![SensitiveUrl::parse(&format!(
"http://localhost:{}",
EXECUTION_PORT
))
.unwrap()],
..Default::default()
};

beacon_config.execution_layer = Some(el_config);
}

Comment on lines -110 to -170
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This section has been extracted into a create_local_network function below without any logic change.
https://github.com/sigp/lighthouse/pull/4501/files#diff-21c83610adf1cfb2330cabd0bec079ae7f7c0f3e3d525c7931181fd5592983c3R309

/*
* Create a new `LocalNetwork` with one beacon node.
*/
let network = LocalNetwork::new(context.clone(), beacon_config.clone()).await?;
let max_retries = 3;
Copy link
Member Author

@jimmygchen jimmygchen Jul 13, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Trying a small retry number to start with, the deposit logs issue seems to occur about 50/50 of times and usually passes on the 1st retry attempt, so i reckon maybe 3 would be enough, so we don't wait for too long for the job to fail if there's a geniune issue in the beacon node start up.

I initially had a different error type to signify a timeout error, but I don't feel it's worth it so I've removed them to keep this PR small.

let (network, beacon_config) = with_retry(max_retries, || {
Box::pin(create_local_network(
LocalNetworkParams {
eth1_block_time,
total_validator_count,
deposit_amount,
node_count,
proposer_nodes,
post_merge_sim,
},
context.clone(),
))
})
.await?;

/*
* One by one, add beacon nodes to the network.
Expand Down Expand Up @@ -341,3 +296,88 @@ pub fn run_eth1_sim(matches: &ArgMatches) -> Result<(), String> {

Ok(())
}

struct LocalNetworkParams {
eth1_block_time: Duration,
total_validator_count: usize,
deposit_amount: u64,
node_count: usize,
proposer_nodes: usize,
post_merge_sim: bool,
}

async fn create_local_network<E: EthSpec>(
LocalNetworkParams {
eth1_block_time,
total_validator_count,
deposit_amount,
node_count,
proposer_nodes,
post_merge_sim,
}: LocalNetworkParams,
context: RuntimeContext<E>,
) -> Result<(LocalNetwork<E>, ClientConfig), String> {
/*
* Deploy the deposit contract, spawn tasks to keep creating new blocks and deposit
* validators.
*/
let anvil_eth1_instance = AnvilEth1Instance::new(DEFAULT_CHAIN_ID.into()).await?;
let deposit_contract = anvil_eth1_instance.deposit_contract;
let chain_id = anvil_eth1_instance.anvil.chain_id();
let anvil = anvil_eth1_instance.anvil;
let eth1_endpoint =
SensitiveUrl::parse(anvil.endpoint().as_str()).expect("Unable to parse anvil endpoint.");
let deposit_contract_address = deposit_contract.address();

// Start a timer that produces eth1 blocks on an interval.
tokio::spawn(async move {
let mut interval = tokio::time::interval(eth1_block_time);
loop {
interval.tick().await;
let _ = anvil.evm_mine().await;
}
});

// Submit deposits to the deposit contract.
tokio::spawn(async move {
for i in 0..total_validator_count {
println!("Submitting deposit for validator {}...", i);
let _ = deposit_contract
.deposit_deterministic_async::<E>(i, deposit_amount)
.await;
}
});

let mut beacon_config = testing_client_config();

beacon_config.genesis = ClientGenesis::DepositContract;
beacon_config.eth1.endpoint = Eth1Endpoint::NoAuth(eth1_endpoint);
beacon_config.eth1.deposit_contract_address = deposit_contract_address;
beacon_config.eth1.deposit_contract_deploy_block = 0;
beacon_config.eth1.lowest_cached_block_number = 0;
beacon_config.eth1.follow_distance = 1;
beacon_config.eth1.node_far_behind_seconds = 20;
beacon_config.dummy_eth1_backend = false;
beacon_config.sync_eth1_chain = true;
beacon_config.eth1.auto_update_interval_millis = eth1_block_time.as_millis() as u64;
beacon_config.eth1.chain_id = Eth1Id::from(chain_id);
beacon_config.network.target_peers = node_count + proposer_nodes - 1;

beacon_config.network.enr_address = (Some(Ipv4Addr::LOCALHOST), None);

if post_merge_sim {
let el_config = execution_layer::Config {
execution_endpoints: vec![SensitiveUrl::parse(&format!(
"http://localhost:{}",
EXECUTION_PORT
))
.unwrap()],
..Default::default()
};

beacon_config.execution_layer = Some(el_config);
}

let network = LocalNetwork::new(context, beacon_config.clone()).await?;
Ok((network, beacon_config))
}
1 change: 1 addition & 0 deletions testing/simulator/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ mod cli;
mod eth1_sim;
mod local_network;
mod no_eth1_sim;
mod retry;
mod sync_sim;

use cli::cli_app;
Expand Down
63 changes: 63 additions & 0 deletions testing/simulator/src/retry.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
use std::fmt::Debug;
use std::future::Future;
use std::pin::Pin;

/// Executes the function with a specified number of retries if the function returns an error.
/// Once it exceeds `max_retries` and still fails, the error is returned.
pub async fn with_retry<T, E, F>(max_retries: usize, mut func: F) -> Result<T, E>
where
F: FnMut() -> Pin<Box<dyn Future<Output = Result<T, E>>>>,
E: Debug,
{
let mut retry_count = 0;
loop {
let result = Box::pin(func()).await;
if result.is_ok() || retry_count >= max_retries {
break result;
}
retry_count += 1;

if let Err(e) = result {
eprintln!(
"Operation failed with error {:?}, retrying {} of {}",
e, retry_count, max_retries
);
}
}
}

#[cfg(test)]
mod tests {
use super::*;
use std::collections::VecDeque;

async fn my_async_func(is_ok: bool) -> Result<(), ()> {
if is_ok {
Ok(())
} else {
Err(())
}
}

#[tokio::test]
async fn test_with_retry_ok() {
let res = with_retry(3, || Box::pin(my_async_func(true))).await;
assert!(res.is_ok());
}

#[tokio::test]
async fn test_with_retry_2nd_ok() {
let mut mock_results = VecDeque::from([false, true]);
let res = with_retry(3, || {
Box::pin(my_async_func(mock_results.pop_front().unwrap()))
})
.await;
assert!(res.is_ok());
}

#[tokio::test]
async fn test_with_retry_fail() {
let res = with_retry(3, || Box::pin(my_async_func(false))).await;
assert!(res.is_err());
}
}