Skip to content

Commit

Permalink
Drop stuck lookups (#5824)
Browse files Browse the repository at this point in the history
Squashed commit of the following:

commit f15131f
Author: dapplion <[email protected]>
Date:   Wed May 22 16:01:22 2024 +0200

    Drop stuck lookups
  • Loading branch information
michaelsproul committed May 23, 2024
1 parent 8c041c3 commit acc5eef
Show file tree
Hide file tree
Showing 5 changed files with 82 additions and 26 deletions.
6 changes: 3 additions & 3 deletions beacon_node/network/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -257,9 +257,9 @@ lazy_static! {
"sync_lookups_completed_total",
"Total count of sync lookups completed",
);
pub static ref SYNC_LOOKUPS_STUCK: Result<IntGauge> = try_create_int_gauge(
"sync_lookups_stuck",
"Current count of sync lookups that may be stuck",
pub static ref SYNC_LOOKUPS_STUCK: Result<IntCounter> = try_create_int_counter(
"sync_lookups_stuck_total",
"Total count of sync lookups that are stuck and dropped",
);

/*
Expand Down
7 changes: 1 addition & 6 deletions beacon_node/network/src/sync/block_lookups/common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use crate::sync::block_lookups::single_block_lookup::{
LookupRequestError, SingleBlockLookup, SingleLookupRequestState,
};
use crate::sync::block_lookups::{BlobRequestState, BlockRequestState, PeerId};
use crate::sync::manager::{Id, SLOT_IMPORT_TOLERANCE};
use crate::sync::manager::Id;
use crate::sync::network_context::{LookupRequestResult, SyncNetworkContext};
use beacon_chain::block_verification_types::RpcBlock;
use beacon_chain::BeaconChainTypes;
Expand All @@ -19,11 +19,6 @@ pub enum ResponseType {
Blob,
}

/// The maximum depth we will search for a parent block. In principle we should have sync'd any
/// canonical chain to its head once the peer connects. A chain should not appear where it's depth
/// is further back than the most recent head slot.
pub(crate) const PARENT_DEPTH_TOLERANCE: usize = SLOT_IMPORT_TOLERANCE * 2;

/// This trait unifies common single block lookup functionality across blocks and blobs. This
/// includes making requests, verifying responses, and handling processing results. A
/// `SingleBlockLookup` includes both a `BlockRequestState` and a `BlobRequestState`, this trait is
Expand Down
91 changes: 76 additions & 15 deletions beacon_node/network/src/sync/block_lookups/mod.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
use self::parent_chain::{compute_parent_chains, NodeChain};
pub use self::single_block_lookup::DownloadResult;
use self::single_block_lookup::{LookupRequestError, LookupResult, SingleBlockLookup};
use super::manager::{BlockProcessType, BlockProcessingResult};
use super::manager::{BlockProcessType, BlockProcessingResult, SLOT_IMPORT_TOLERANCE};
use super::network_context::{RpcResponseResult, SyncNetworkContext};
use crate::metrics;
use crate::sync::block_lookups::common::{ResponseType, PARENT_DEPTH_TOLERANCE};
use crate::sync::block_lookups::common::ResponseType;
use crate::sync::block_lookups::parent_chain::find_oldest_fork_ancestor;
use crate::sync::manager::{Id, SingleLookupReqId};
use beacon_chain::block_verification_types::AsBlock;
Expand All @@ -28,9 +28,18 @@ mod single_block_lookup;
#[cfg(test)]
mod tests;

/// The maximum depth we will search for a parent block. In principle we should have sync'd any
/// canonical chain to its head once the peer connects. A chain should not appear where it's depth
/// is further back than the most recent head slot.
pub(crate) const PARENT_DEPTH_TOLERANCE: usize = SLOT_IMPORT_TOLERANCE * 2;

const FAILED_CHAINS_CACHE_EXPIRY_SECONDS: u64 = 60;
pub const SINGLE_BLOCK_LOOKUP_MAX_ATTEMPTS: u8 = 4;
const LOOKUP_MAX_DURATION_SECS: u64 = 60;

/// Maximum time we allow a lookup to exist before assuming it is stuck and will never make
/// progress. Assume the worse case processing time per block component set * times max depth.
/// 15 * 2 * 32 = 16 minutes.
const LOOKUP_MAX_DURATION_SECS: usize = 15 * PARENT_DEPTH_TOLERANCE;

pub enum BlockComponent<E: EthSpec> {
Block(DownloadResult<Arc<SignedBeaconBlock<E>>>),
Expand Down Expand Up @@ -680,20 +689,72 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
);
}

pub fn log_stuck_lookups(&self) {
let mut stuck_count = 0;
for lookup in self.single_block_lookups.values() {
if lookup.elapsed_since_created() > Duration::from_secs(LOOKUP_MAX_DURATION_SECS) {
debug!(self.log, "Lookup maybe stuck";
// Fields id and block_root are also part of the summary. However, logging them
// here allows log parsers o index them and have better search
"id" => lookup.id,
"block_root" => ?lookup.block_root(),
"summary" => ?lookup,
/// Safety mechanism to unstuck lookup sync. Lookup sync if purely event driven and depends on
/// external components to feed it events to make progress. If there is a bug in network, in
/// beacon processor, or here internally: lookups can get stuck forever. A stuck lookup can
/// stall a node indefinitely as other lookup will be awaiting on a parent lookup to make
/// progress.
///
/// If a lookup lasts more than LOOKUP_MAX_DURATION_SECS this function will find its oldest
/// ancestor and then drop it and all its children. This action will allow the node to unstuck
/// itself. Bugs that cause lookups to get stuck may be triggered consistently. So this strategy
/// is useful for two reasons:
///
/// - One single clear warn level log per stuck incident
/// - If the original bug is sporadic, it reduces the time a node is stuck from forever to 15 min
pub fn drop_stuck_lookups(&mut self) {
// While loop to find and drop all disjoint trees of potentially stuck lookups.
while let Some(stuck_lookup) = self.single_block_lookups.values().find(|lookup| {
lookup.elapsed_since_created() > Duration::from_secs(LOOKUP_MAX_DURATION_SECS as u64)
}) {
let ancestor_stuck_lookup = match self.find_oldest_ancestor_lookup(stuck_lookup) {
Ok(lookup) => lookup,
Err(e) => {
warn!(self.log, "Error finding oldest ancestor lookup"; "error" => ?e);
// Default to dropping the lookup that exceeds the max duration so at least
// eventually sync should be unstuck
stuck_lookup
}
};

if stuck_lookup.id == ancestor_stuck_lookup.id {
warn!(self.log, "Notify the devs, a sync lookup is stuck";
"block_root" => ?stuck_lookup.block_root(),
"lookup" => ?stuck_lookup,
);
stuck_count += 1;
} else {
warn!(self.log, "Notify the devs, a sync lookup is stuck";
"block_root" => ?stuck_lookup.block_root(),
"lookup" => ?stuck_lookup,
"ancestor_block_root" => ?ancestor_stuck_lookup.block_root(),
"ancestor_lookup" => ?ancestor_stuck_lookup,
);
}

metrics::inc_counter(&metrics::SYNC_LOOKUPS_STUCK);
self.drop_lookup_and_children(ancestor_stuck_lookup.id);
}
}

/// Recursively find the oldest ancestor lookup of another lookup
fn find_oldest_ancestor_lookup<'a>(
&'a self,
stuck_lookup: &'a SingleBlockLookup<T>,
) -> Result<&'a SingleBlockLookup<T>, String> {
if let Some(awaiting_parent) = stuck_lookup.awaiting_parent() {
if let Some(lookup) = self
.single_block_lookups
.values()
.find(|l| l.block_root() == awaiting_parent)
{
self.find_oldest_ancestor_lookup(lookup)
} else {
Err(format!(
"Lookup references unknown parent {awaiting_parent:?}"
))
}
} else {
Ok(stuck_lookup)
}
metrics::set_gauge(&metrics::SYNC_LOOKUPS_STUCK, stuck_count);
}
}
2 changes: 1 addition & 1 deletion beacon_node/network/src/sync/block_lookups/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ use std::sync::Arc;

use super::*;

use crate::sync::block_lookups::common::{ResponseType, PARENT_DEPTH_TOLERANCE};
use crate::sync::block_lookups::common::ResponseType;
use beacon_chain::blob_verification::GossipVerifiedBlob;
use beacon_chain::block_verification_types::{BlockImportData, RpcBlock};
use beacon_chain::builder::Witness;
Expand Down
2 changes: 1 addition & 1 deletion beacon_node/network/src/sync/manager.rs
Original file line number Diff line number Diff line change
Expand Up @@ -574,7 +574,7 @@ impl<T: BeaconChainTypes> SyncManager<T> {
self.handle_new_execution_engine_state(engine_state);
}
_ = interval.tick() => {
self.block_lookups.log_stuck_lookups();
self.block_lookups.drop_stuck_lookups();
}
}
}
Expand Down

0 comments on commit acc5eef

Please sign in to comment.