diff --git a/stacks-signer/CHANGELOG.md b/stacks-signer/CHANGELOG.md index db6ccacf6c..df4b560b1e 100644 --- a/stacks-signer/CHANGELOG.md +++ b/stacks-signer/CHANGELOG.md @@ -17,6 +17,9 @@ and this project adheres to the versioning scheme outlined in the [README.md](RE - Introduced the `block_proposal_max_age_secs` configuration option for signers, enabling them to automatically ignore block proposals that exceed the specified age in seconds. - When a new block proposal is received while the signer is waiting for an existing proposal to be validated, the signer will wait until the existing block is done validating before submitting the new one for validating. ([#5453](https://github.com/stacks-network/stacks-core/pull/5453)) +- Introduced two new prometheus metrics: + - `stacks_signer_block_validation_latencies_histogram`: the validation_time_ms reported by the node when validating a block proposal + - `stacks_signer_block_response_latencies_histogram`: the "end-to-end" time it takes for the signer to issue a block response ## Changed - Improvements to the stale signer cleanup logic: deletes the prior signer if it has no remaining unprocessed blocks in its database diff --git a/stacks-signer/src/client/stacks_client.rs b/stacks-signer/src/client/stacks_client.rs index 4676738629..f6cb9c6d8b 100644 --- a/stacks-signer/src/client/stacks_client.rs +++ b/stacks-signer/src/client/stacks_client.rs @@ -323,8 +323,10 @@ impl StacksClient { block, chain_id: self.chain_id, }; - let timer = - crate::monitoring::new_rpc_call_timer(&self.block_proposal_path(), &self.http_origin); + let timer = crate::monitoring::actions::new_rpc_call_timer( + &self.block_proposal_path(), + &self.http_origin, + ); let send_request = || { self.stacks_node_client .post(self.block_proposal_path()) @@ -399,7 +401,8 @@ impl StacksClient { "{}{RPC_TENURE_FORKING_INFO_PATH}/:start/:stop", self.http_origin ); - let timer = crate::monitoring::new_rpc_call_timer(&metrics_path, &self.http_origin); + let timer = + crate::monitoring::actions::new_rpc_call_timer(&metrics_path, &self.http_origin); let send_request = || { self.stacks_node_client .get(&path) @@ -420,7 +423,7 @@ impl StacksClient { pub fn get_current_and_last_sortition(&self) -> Result { debug!("StacksClient: Getting current and prior sortition"); let path = format!("{}/latest_and_last", self.sortition_info_path()); - let timer = crate::monitoring::new_rpc_call_timer(&path, &self.http_origin); + let timer = crate::monitoring::actions::new_rpc_call_timer(&path, &self.http_origin); let send_request = || { self.stacks_node_client.get(&path).send().map_err(|e| { warn!("Signer failed to request latest sortition"; "err" => ?e); @@ -460,8 +463,10 @@ impl StacksClient { /// Get the current peer info data from the stacks node pub fn get_peer_info(&self) -> Result { debug!("StacksClient: Getting peer info"); - let timer = - crate::monitoring::new_rpc_call_timer(&self.core_info_path(), &self.http_origin); + let timer = crate::monitoring::actions::new_rpc_call_timer( + &self.core_info_path(), + &self.http_origin, + ); let send_request = || { self.stacks_node_client .get(self.core_info_path()) @@ -485,7 +490,7 @@ impl StacksClient { debug!("StacksClient: Getting reward set signers"; "reward_cycle" => reward_cycle, ); - let timer = crate::monitoring::new_rpc_call_timer( + let timer = crate::monitoring::actions::new_rpc_call_timer( &format!("{}/v3/stacker_set/:reward_cycle", self.http_origin), &self.http_origin, ); @@ -521,7 +526,8 @@ impl StacksClient { /// Retrieve the current pox data from the stacks node pub fn get_pox_data(&self) -> Result { debug!("StacksClient: Getting pox data"); - let timer = crate::monitoring::new_rpc_call_timer(&self.pox_path(), &self.http_origin); + let timer = + crate::monitoring::actions::new_rpc_call_timer(&self.pox_path(), &self.http_origin); let send_request = || { self.stacks_node_client .get(self.pox_path()) @@ -572,7 +578,7 @@ impl StacksClient { "address" => %address, ); let timer_label = format!("{}/v2/accounts/:principal", self.http_origin); - let timer = crate::monitoring::new_rpc_call_timer(&timer_label, &self.http_origin); + let timer = crate::monitoring::actions::new_rpc_call_timer(&timer_label, &self.http_origin); let send_request = || { self.stacks_node_client .get(self.accounts_path(address)) @@ -628,7 +634,7 @@ impl StacksClient { "block_height" => %block.header.chain_length, ); let path = format!("{}{}?broadcast=1", self.http_origin, postblock_v3::PATH); - let timer = crate::monitoring::new_rpc_call_timer(&path, &self.http_origin); + let timer = crate::monitoring::actions::new_rpc_call_timer(&path, &self.http_origin); let send_request = || { self.stacks_node_client .post(&path) @@ -678,7 +684,7 @@ impl StacksClient { "{}/v2/contracts/call-read/:principal/{contract_name}/{function_name}", self.http_origin ); - let timer = crate::monitoring::new_rpc_call_timer(&timer_label, &self.http_origin); + let timer = crate::monitoring::actions::new_rpc_call_timer(&timer_label, &self.http_origin); let response = self .stacks_node_client .post(path) diff --git a/stacks-signer/src/lib.rs b/stacks-signer/src/lib.rs index 244675c65c..9f2df12534 100644 --- a/stacks-signer/src/lib.rs +++ b/stacks-signer/src/lib.rs @@ -125,7 +125,7 @@ impl + Send + 'static, T: SignerEventTrait + 'static> SpawnedSigner ); let (res_send, res_recv) = channel(); let ev = SignerEventReceiver::new(config.network.is_mainnet()); - crate::monitoring::start_serving_monitoring_metrics(config.clone()).ok(); + crate::monitoring::actions::start_serving_monitoring_metrics(config.clone()).ok(); let runloop = RunLoop::new(config.clone()); let mut signer: RunLoopSigner = libsigner::Signer::new(runloop, ev, res_send); let running_signer = signer.spawn(endpoint).expect("Failed to spawn signer"); diff --git a/stacks-signer/src/monitoring/mod.rs b/stacks-signer/src/monitoring/mod.rs index 400541d0e7..60a530acab 100644 --- a/stacks-signer/src/monitoring/mod.rs +++ b/stacks-signer/src/monitoring/mod.rs @@ -14,139 +14,176 @@ // You should have received a copy of the GNU General Public License // along with this program. If not, see . -#[cfg(feature = "monitoring_prom")] -use ::prometheus::HistogramTimer; -#[cfg(feature = "monitoring_prom")] -use slog::slog_error; -#[cfg(not(feature = "monitoring_prom"))] -use slog::slog_info; -#[cfg(feature = "monitoring_prom")] -use stacks_common::error; -#[cfg(not(feature = "monitoring_prom"))] -use stacks_common::info; - -use crate::config::GlobalConfig; - #[cfg(feature = "monitoring_prom")] mod prometheus; #[cfg(feature = "monitoring_prom")] mod server; -/// Update stacks tip height gauge -#[allow(unused_variables)] -pub fn update_stacks_tip_height(height: i64) { - #[cfg(feature = "monitoring_prom")] - prometheus::STACKS_TIP_HEIGHT_GAUGE.set(height); -} +/// Actions for updating metrics +#[cfg(feature = "monitoring_prom")] +pub mod actions { + use ::prometheus::HistogramTimer; + use blockstack_lib::chainstate::nakamoto::NakamotoBlock; + use slog::slog_error; + use stacks_common::error; + + use crate::config::GlobalConfig; + use crate::monitoring::prometheus::*; + + /// Update stacks tip height gauge + pub fn update_stacks_tip_height(height: i64) { + STACKS_TIP_HEIGHT_GAUGE.set(height); + } -/// Update the current reward cycle -#[allow(unused_variables)] -pub fn update_reward_cycle(reward_cycle: i64) { - #[cfg(feature = "monitoring_prom")] - prometheus::CURRENT_REWARD_CYCLE.set(reward_cycle); -} + /// Update the current reward cycle + pub fn update_reward_cycle(reward_cycle: i64) { + CURRENT_REWARD_CYCLE.set(reward_cycle); + } -/// Increment the block validation responses counter -#[allow(unused_variables)] -pub fn increment_block_validation_responses(accepted: bool) { - #[cfg(feature = "monitoring_prom")] - { + /// Increment the block validation responses counter + pub fn increment_block_validation_responses(accepted: bool) { let label_value = if accepted { "accepted" } else { "rejected" }; - prometheus::BLOCK_VALIDATION_RESPONSES + BLOCK_VALIDATION_RESPONSES .with_label_values(&[label_value]) .inc(); } -} -/// Increment the block responses sent counter -#[allow(unused_variables)] -pub fn increment_block_responses_sent(accepted: bool) { - #[cfg(feature = "monitoring_prom")] - { + /// Increment the block responses sent counter + pub fn increment_block_responses_sent(accepted: bool) { let label_value = if accepted { "accepted" } else { "rejected" }; - prometheus::BLOCK_RESPONSES_SENT - .with_label_values(&[label_value]) - .inc(); + BLOCK_RESPONSES_SENT.with_label_values(&[label_value]).inc(); } -} -/// Increment the number of block proposals received -#[allow(unused_variables)] -pub fn increment_block_proposals_received() { - #[cfg(feature = "monitoring_prom")] - prometheus::BLOCK_PROPOSALS_RECEIVED.inc(); -} - -/// Update the stx balance of the signer -#[allow(unused_variables)] -pub fn update_signer_stx_balance(balance: i64) { - #[cfg(feature = "monitoring_prom")] - prometheus::SIGNER_STX_BALANCE.set(balance); -} + /// Increment the number of block proposals received + pub fn increment_block_proposals_received() { + BLOCK_PROPOSALS_RECEIVED.inc(); + } -/// Update the signer nonce metric -#[allow(unused_variables)] -pub fn update_signer_nonce(nonce: u64) { - #[cfg(feature = "monitoring_prom")] - prometheus::SIGNER_NONCE.set(nonce as i64); -} + /// Update the stx balance of the signer + pub fn update_signer_stx_balance(balance: i64) { + SIGNER_STX_BALANCE.set(balance); + } -// Allow dead code because this is only used in the `monitoring_prom` feature -// but we want to run it in a test -#[allow(dead_code)] -/// Remove the origin from the full path to avoid duplicate metrics for different origins -fn remove_origin_from_path(full_path: &str, origin: &str) -> String { - full_path.replace(origin, "") -} + /// Update the signer nonce metric + pub fn update_signer_nonce(nonce: u64) { + SIGNER_NONCE.set(nonce as i64); + } -/// Start a new RPC call timer. -/// The `origin` parameter is the base path of the RPC call, e.g. `http://node.com`. -/// The `origin` parameter is removed from `full_path` when storing in prometheus. -#[cfg(feature = "monitoring_prom")] -pub fn new_rpc_call_timer(full_path: &str, origin: &str) -> HistogramTimer { - let path = remove_origin_from_path(full_path, origin); - let histogram = prometheus::SIGNER_RPC_CALL_LATENCIES_HISTOGRAM.with_label_values(&[&path]); - histogram.start_timer() -} + /// Start a new RPC call timer. + /// The `origin` parameter is the base path of the RPC call, e.g. `http://node.com`. + /// The `origin` parameter is removed from `full_path` when storing in prometheus. + pub fn new_rpc_call_timer(full_path: &str, origin: &str) -> HistogramTimer { + let path = super::remove_origin_from_path(full_path, origin); + let histogram = SIGNER_RPC_CALL_LATENCIES_HISTOGRAM.with_label_values(&[&path]); + histogram.start_timer() + } -/// NoOp timer uses for monitoring when the monitoring feature is not enabled. -pub struct NoOpTimer; -impl NoOpTimer { - /// NoOp method to stop recording when the monitoring feature is not enabled. - pub fn stop_and_record(&self) {} -} + /// Record the time taken to issue a block response for + /// a given block. The block's timestamp is used to calculate the latency. + /// + /// Call this right after broadcasting a BlockResponse + pub fn record_block_response_latency(block: &NakamotoBlock) { + use clarity::util::get_epoch_time_ms; + + let diff = + get_epoch_time_ms().saturating_sub(block.header.timestamp.saturating_mul(1000).into()); + SIGNER_BLOCK_RESPONSE_LATENCIES_HISTOGRAM + .with_label_values(&[]) + .observe(diff as f64 / 1000.0); + } -/// Stop and record the no-op timer. -#[cfg(not(feature = "monitoring_prom"))] -pub fn new_rpc_call_timer(_full_path: &str, _origin: &str) -> NoOpTimer { - NoOpTimer -} + /// Record the time taken to validate a block, as reported by the Stacks node. + pub fn record_block_validation_latency(latency_ms: u64) { + SIGNER_BLOCK_VALIDATION_LATENCIES_HISTOGRAM + .with_label_values(&[]) + .observe(latency_ms as f64 / 1000.0); + } -/// Start serving monitoring metrics. -/// This will only serve the metrics if the `monitoring_prom` feature is enabled. -#[allow(unused_variables)] -pub fn start_serving_monitoring_metrics(config: GlobalConfig) -> Result<(), String> { - #[cfg(feature = "monitoring_prom")] - { + /// Start serving monitoring metrics. + /// This will only serve the metrics if the `monitoring_prom` feature is enabled. + pub fn start_serving_monitoring_metrics(config: GlobalConfig) -> Result<(), String> { if config.metrics_endpoint.is_none() { return Ok(()); } - let thread = std::thread::Builder::new() + let _ = std::thread::Builder::new() .name("signer_metrics".to_string()) .spawn(move || { - if let Err(monitoring_err) = server::MonitoringServer::start(&config) { + if let Err(monitoring_err) = super::server::MonitoringServer::start(&config) { error!("Monitoring: Error in metrics server: {:?}", monitoring_err); } }); + Ok(()) + } +} + +/// No-op actions for updating metrics +#[cfg(not(feature = "monitoring_prom"))] +pub mod actions { + use blockstack_lib::chainstate::nakamoto::NakamotoBlock; + use slog::slog_info; + use stacks_common::info; + + use crate::GlobalConfig; + + /// Update stacks tip height gauge + pub fn update_stacks_tip_height(_height: i64) {} + + /// Update the current reward cycle + pub fn update_reward_cycle(_reward_cycle: i64) {} + + /// Increment the block validation responses counter + pub fn increment_block_validation_responses(_accepted: bool) {} + + /// Increment the block responses sent counter + pub fn increment_block_responses_sent(_accepted: bool) {} + + /// Increment the number of block proposals received + pub fn increment_block_proposals_received() {} + + /// Update the stx balance of the signer + pub fn update_signer_stx_balance(_balance: i64) {} + + /// Update the signer nonce metric + pub fn update_signer_nonce(_nonce: u64) {} + + /// NoOp timer uses for monitoring when the monitoring feature is not enabled. + pub struct NoOpTimer; + impl NoOpTimer { + /// NoOp method to stop recording when the monitoring feature is not enabled. + pub fn stop_and_record(&self) {} + } + + /// Stop and record the no-op timer. + pub fn new_rpc_call_timer(_full_path: &str, _origin: &str) -> NoOpTimer { + NoOpTimer } - #[cfg(not(feature = "monitoring_prom"))] - { + + /// Record the time taken to issue a block response for + /// a given block. The block's timestamp is used to calculate the latency. + /// + /// Call this right after broadcasting a BlockResponse + pub fn record_block_response_latency(_block: &NakamotoBlock) {} + + /// Record the time taken to validate a block, as reported by the Stacks node. + pub fn record_block_validation_latency(_latency_ms: u64) {} + + /// Start serving monitoring metrics. + /// This will only serve the metrics if the `monitoring_prom` feature is enabled. + pub fn start_serving_monitoring_metrics(config: GlobalConfig) -> Result<(), String> { if config.metrics_endpoint.is_some() { info!("`metrics_endpoint` is configured for the signer, but the monitoring_prom feature is not enabled. Not starting monitoring metrics server."); } + Ok(()) } - Ok(()) +} + +// Allow dead code because this is only used in the `monitoring_prom` feature +// but we want to run it in a test +#[allow(dead_code)] +/// Remove the origin from the full path to avoid duplicate metrics for different origins +fn remove_origin_from_path(full_path: &str, origin: &str) -> String { + full_path.replace(origin, "") } #[test] diff --git a/stacks-signer/src/monitoring/prometheus.rs b/stacks-signer/src/monitoring/prometheus.rs index 247a9f00f5..49f74ba1e8 100644 --- a/stacks-signer/src/monitoring/prometheus.rs +++ b/stacks-signer/src/monitoring/prometheus.rs @@ -62,6 +62,18 @@ lazy_static! { "Time (seconds) measuring round-trip RPC call latency to the Stacks node" // Will use DEFAULT_BUCKETS = [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0] by default ), &["path"]).unwrap(); + + pub static ref SIGNER_BLOCK_VALIDATION_LATENCIES_HISTOGRAM: HistogramVec = register_histogram_vec!(histogram_opts!( + "stacks_signer_block_validation_latencies_histogram", + "Time (seconds) measuring block validation time reported by the Stacks node", + vec![0.005, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 20.0] + ), &[]).unwrap(); + + pub static ref SIGNER_BLOCK_RESPONSE_LATENCIES_HISTOGRAM: HistogramVec = register_histogram_vec!(histogram_opts!( + "stacks_signer_block_response_latencies_histogram", + "Time (seconds) measuring end-to-end time to respond to a block", + vec![0.005, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 20.0, 30.0, 60.0, 120.0] + ), &[]).unwrap(); } pub fn gather_metrics_string() -> String { diff --git a/stacks-signer/src/monitoring/server.rs b/stacks-signer/src/monitoring/server.rs index 15267c44ee..0e584eec58 100644 --- a/stacks-signer/src/monitoring/server.rs +++ b/stacks-signer/src/monitoring/server.rs @@ -24,11 +24,11 @@ use slog::{slog_debug, slog_error, slog_info, slog_warn}; use stacks_common::{debug, error, info, warn}; use tiny_http::{Response as HttpResponse, Server as HttpServer}; -use super::{update_reward_cycle, update_signer_stx_balance}; +use super::actions::{update_reward_cycle, update_signer_stx_balance}; use crate::client::{ClientError, StacksClient}; use crate::config::{GlobalConfig, Network}; +use crate::monitoring::actions::{update_signer_nonce, update_stacks_tip_height}; use crate::monitoring::prometheus::gather_metrics_string; -use crate::monitoring::{update_signer_nonce, update_stacks_tip_height}; #[derive(thiserror::Error, Debug)] /// Monitoring server errors diff --git a/stacks-signer/src/v0/signer.rs b/stacks-signer/src/v0/signer.rs index 5d427e6717..dfd61ee35d 100644 --- a/stacks-signer/src/v0/signer.rs +++ b/stacks-signer/src/v0/signer.rs @@ -472,7 +472,10 @@ impl Signer { .send_message_with_retry::(block_response.into()) { Ok(_) => { - crate::monitoring::increment_block_responses_sent(accepted); + crate::monitoring::actions::increment_block_responses_sent(accepted); + crate::monitoring::actions::record_block_response_latency( + &block_proposal.block, + ); } Err(e) => { warn!("{self}: Failed to send block response to stacker-db: {e:?}",); @@ -489,7 +492,7 @@ impl Signer { "burn_height" => block_proposal.burn_height, "consensus_hash" => %block_proposal.block.header.consensus_hash, ); - crate::monitoring::increment_block_proposals_received(); + crate::monitoring::actions::increment_block_proposals_received(); #[cfg(any(test, feature = "testing"))] let mut block_info = BlockInfo::from(block_proposal.clone()); #[cfg(not(any(test, feature = "testing")))] @@ -672,7 +675,7 @@ impl Signer { stacks_client: &StacksClient, block_validate_ok: &BlockValidateOk, ) -> Option { - crate::monitoring::increment_block_validation_responses(true); + crate::monitoring::actions::increment_block_validation_responses(true); let signer_signature_hash = block_validate_ok.signer_signature_hash; if self .submitted_block_proposal @@ -706,6 +709,8 @@ impl Signer { .stackerdb .send_message_with_retry::(block_response.into()); + crate::monitoring::actions::record_block_response_latency(&block_info.block); + match res { Err(e) => warn!("{self}: Failed to send block rejection to stacker-db: {e:?}"), Ok(ack) if !ack.accepted => warn!( @@ -748,7 +753,7 @@ impl Signer { &mut self, block_validate_reject: &BlockValidateReject, ) -> Option { - crate::monitoring::increment_block_validation_responses(false); + crate::monitoring::actions::increment_block_validation_responses(false); let signer_signature_hash = block_validate_reject.signer_signature_hash; if self .submitted_block_proposal @@ -798,6 +803,9 @@ impl Signer { info!("{self}: Received a block validate response: {block_validate_response:?}"); let block_response = match block_validate_response { BlockValidateResponse::Ok(block_validate_ok) => { + crate::monitoring::actions::record_block_validation_latency( + block_validate_ok.validation_time_ms, + ); self.handle_block_validate_ok(stacks_client, block_validate_ok) } BlockValidateResponse::Reject(block_validate_reject) => { @@ -823,7 +831,13 @@ impl Signer { .send_message_with_retry::(response.into()) { Ok(_) => { - crate::monitoring::increment_block_responses_sent(accepted); + crate::monitoring::actions::increment_block_responses_sent(accepted); + if let Ok(Some(block_info)) = self + .signer_db + .block_lookup(&block_validate_response.signer_signature_hash()) + { + crate::monitoring::actions::record_block_response_latency(&block_info.block); + } } Err(e) => { warn!("{self}: Failed to send block rejection to stacker-db: {e:?}",); @@ -906,6 +920,8 @@ impl Signer { .stackerdb .send_message_with_retry::(rejection.into()); + crate::monitoring::actions::record_block_response_latency(&block_info.block); + match res { Err(e) => warn!("{self}: Failed to send block rejection to stacker-db: {e:?}"), Ok(ack) if !ack.accepted => warn!(