Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: record validation and end-to-end response time in signer metrics #5664

Merged
merged 8 commits into from
Jan 20, 2025
3 changes: 3 additions & 0 deletions stacks-signer/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@ and this project adheres to the versioning scheme outlined in the [README.md](RE

- Introduced the `block_proposal_max_age_secs` configuration option for signers, enabling them to automatically ignore block proposals that exceed the specified age in seconds.
- When a new block proposal is received while the signer is waiting for an existing proposal to be validated, the signer will wait until the existing block is done validating before submitting the new one for validating. ([#5453](https://github.com/stacks-network/stacks-core/pull/5453))
- Introduced two new prometheus metrics:
- `stacks_signer_block_validation_latencies_histogram`: the validation_time_ms reported by the node when validating a block proposal
- `stacks_signer_block_response_latencies_histogram`: the "end-to-end" time it takes for the signer to issue a block response

## Changed
- Improvements to the stale signer cleanup logic: deletes the prior signer if it has no remaining unprocessed blocks in its database
Expand Down
28 changes: 17 additions & 11 deletions stacks-signer/src/client/stacks_client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -323,8 +323,10 @@ impl StacksClient {
block,
chain_id: self.chain_id,
};
let timer =
crate::monitoring::new_rpc_call_timer(&self.block_proposal_path(), &self.http_origin);
let timer = crate::monitoring::actions::new_rpc_call_timer(
&self.block_proposal_path(),
&self.http_origin,
);
let send_request = || {
self.stacks_node_client
.post(self.block_proposal_path())
Expand Down Expand Up @@ -399,7 +401,8 @@ impl StacksClient {
"{}{RPC_TENURE_FORKING_INFO_PATH}/:start/:stop",
self.http_origin
);
let timer = crate::monitoring::new_rpc_call_timer(&metrics_path, &self.http_origin);
let timer =
crate::monitoring::actions::new_rpc_call_timer(&metrics_path, &self.http_origin);
let send_request = || {
self.stacks_node_client
.get(&path)
Expand All @@ -420,7 +423,7 @@ impl StacksClient {
pub fn get_current_and_last_sortition(&self) -> Result<CurrentAndLastSortition, ClientError> {
debug!("StacksClient: Getting current and prior sortition");
let path = format!("{}/latest_and_last", self.sortition_info_path());
let timer = crate::monitoring::new_rpc_call_timer(&path, &self.http_origin);
let timer = crate::monitoring::actions::new_rpc_call_timer(&path, &self.http_origin);
let send_request = || {
self.stacks_node_client.get(&path).send().map_err(|e| {
warn!("Signer failed to request latest sortition"; "err" => ?e);
Expand Down Expand Up @@ -460,8 +463,10 @@ impl StacksClient {
/// Get the current peer info data from the stacks node
pub fn get_peer_info(&self) -> Result<PeerInfo, ClientError> {
debug!("StacksClient: Getting peer info");
let timer =
crate::monitoring::new_rpc_call_timer(&self.core_info_path(), &self.http_origin);
let timer = crate::monitoring::actions::new_rpc_call_timer(
&self.core_info_path(),
&self.http_origin,
);
let send_request = || {
self.stacks_node_client
.get(self.core_info_path())
Expand All @@ -485,7 +490,7 @@ impl StacksClient {
debug!("StacksClient: Getting reward set signers";
"reward_cycle" => reward_cycle,
);
let timer = crate::monitoring::new_rpc_call_timer(
let timer = crate::monitoring::actions::new_rpc_call_timer(
&format!("{}/v3/stacker_set/:reward_cycle", self.http_origin),
&self.http_origin,
);
Expand Down Expand Up @@ -521,7 +526,8 @@ impl StacksClient {
/// Retrieve the current pox data from the stacks node
pub fn get_pox_data(&self) -> Result<RPCPoxInfoData, ClientError> {
debug!("StacksClient: Getting pox data");
let timer = crate::monitoring::new_rpc_call_timer(&self.pox_path(), &self.http_origin);
let timer =
crate::monitoring::actions::new_rpc_call_timer(&self.pox_path(), &self.http_origin);
let send_request = || {
self.stacks_node_client
.get(self.pox_path())
Expand Down Expand Up @@ -572,7 +578,7 @@ impl StacksClient {
"address" => %address,
);
let timer_label = format!("{}/v2/accounts/:principal", self.http_origin);
let timer = crate::monitoring::new_rpc_call_timer(&timer_label, &self.http_origin);
let timer = crate::monitoring::actions::new_rpc_call_timer(&timer_label, &self.http_origin);
let send_request = || {
self.stacks_node_client
.get(self.accounts_path(address))
Expand Down Expand Up @@ -628,7 +634,7 @@ impl StacksClient {
"block_height" => %block.header.chain_length,
);
let path = format!("{}{}?broadcast=1", self.http_origin, postblock_v3::PATH);
let timer = crate::monitoring::new_rpc_call_timer(&path, &self.http_origin);
let timer = crate::monitoring::actions::new_rpc_call_timer(&path, &self.http_origin);
let send_request = || {
self.stacks_node_client
.post(&path)
Expand Down Expand Up @@ -678,7 +684,7 @@ impl StacksClient {
"{}/v2/contracts/call-read/:principal/{contract_name}/{function_name}",
self.http_origin
);
let timer = crate::monitoring::new_rpc_call_timer(&timer_label, &self.http_origin);
let timer = crate::monitoring::actions::new_rpc_call_timer(&timer_label, &self.http_origin);
let response = self
.stacks_node_client
.post(path)
Expand Down
2 changes: 1 addition & 1 deletion stacks-signer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ impl<S: Signer<T> + Send + 'static, T: SignerEventTrait + 'static> SpawnedSigner
);
let (res_send, res_recv) = channel();
let ev = SignerEventReceiver::new(config.network.is_mainnet());
crate::monitoring::start_serving_monitoring_metrics(config.clone()).ok();
crate::monitoring::actions::start_serving_monitoring_metrics(config.clone()).ok();
let runloop = RunLoop::new(config.clone());
let mut signer: RunLoopSigner<S, T> = libsigner::Signer::new(runloop, ev, res_send);
let running_signer = signer.spawn(endpoint).expect("Failed to spawn signer");
Expand Down
239 changes: 140 additions & 99 deletions stacks-signer/src/monitoring/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,139 +14,180 @@
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.

#[cfg(feature = "monitoring_prom")]
use ::prometheus::HistogramTimer;
#[cfg(feature = "monitoring_prom")]
use slog::slog_error;
#[cfg(not(feature = "monitoring_prom"))]
use slog::slog_info;
#[cfg(feature = "monitoring_prom")]
use stacks_common::error;
#[cfg(not(feature = "monitoring_prom"))]
use stacks_common::info;

use crate::config::GlobalConfig;

#[cfg(feature = "monitoring_prom")]
mod prometheus;

#[cfg(feature = "monitoring_prom")]
mod server;

/// Update stacks tip height gauge
#[allow(unused_variables)]
pub fn update_stacks_tip_height(height: i64) {
#[cfg(feature = "monitoring_prom")]
prometheus::STACKS_TIP_HEIGHT_GAUGE.set(height);
}

/// Update the current reward cycle
#[allow(unused_variables)]
pub fn update_reward_cycle(reward_cycle: i64) {
#[cfg(feature = "monitoring_prom")]
prometheus::CURRENT_REWARD_CYCLE.set(reward_cycle);
}
/// Actions for updating metrics
#[cfg(feature = "monitoring_prom")]
pub mod actions {
use ::prometheus::HistogramTimer;
use blockstack_lib::chainstate::nakamoto::NakamotoBlock;
use slog::slog_error;
use stacks_common::error;

use crate::config::GlobalConfig;
use crate::monitoring::prometheus::*;

/// Update stacks tip height gauge
pub fn update_stacks_tip_height(height: i64) {
STACKS_TIP_HEIGHT_GAUGE.set(height);
}

/// Increment the block validation responses counter
#[allow(unused_variables)]
pub fn increment_block_validation_responses(accepted: bool) {
#[cfg(feature = "monitoring_prom")]
{
let label_value = if accepted { "accepted" } else { "rejected" };
prometheus::BLOCK_VALIDATION_RESPONSES
.with_label_values(&[label_value])
.inc();
/// Update the current reward cycle
pub fn update_reward_cycle(reward_cycle: i64) {
CURRENT_REWARD_CYCLE.set(reward_cycle);
}
}

/// Increment the block responses sent counter
#[allow(unused_variables)]
pub fn increment_block_responses_sent(accepted: bool) {
#[cfg(feature = "monitoring_prom")]
{
/// Increment the block validation responses counter
pub fn increment_block_validation_responses(accepted: bool) {
let label_value = if accepted { "accepted" } else { "rejected" };
prometheus::BLOCK_RESPONSES_SENT
BLOCK_VALIDATION_RESPONSES
.with_label_values(&[label_value])
.inc();
}
}

/// Increment the number of block proposals received
#[allow(unused_variables)]
pub fn increment_block_proposals_received() {
#[cfg(feature = "monitoring_prom")]
prometheus::BLOCK_PROPOSALS_RECEIVED.inc();
}
/// Increment the block responses sent counter
#[allow(unused_variables)]
pub fn increment_block_responses_sent(accepted: bool) {
#[cfg(feature = "monitoring_prom")]
{
let label_value = if accepted { "accepted" } else { "rejected" };
BLOCK_RESPONSES_SENT.with_label_values(&[label_value]).inc();
}
}
hstove marked this conversation as resolved.
Show resolved Hide resolved

/// Update the stx balance of the signer
#[allow(unused_variables)]
pub fn update_signer_stx_balance(balance: i64) {
#[cfg(feature = "monitoring_prom")]
prometheus::SIGNER_STX_BALANCE.set(balance);
}
/// Increment the number of block proposals received
pub fn increment_block_proposals_received() {
BLOCK_PROPOSALS_RECEIVED.inc();
}

/// Update the signer nonce metric
#[allow(unused_variables)]
pub fn update_signer_nonce(nonce: u64) {
#[cfg(feature = "monitoring_prom")]
prometheus::SIGNER_NONCE.set(nonce as i64);
}
/// Update the stx balance of the signer
pub fn update_signer_stx_balance(balance: i64) {
SIGNER_STX_BALANCE.set(balance);
}

// Allow dead code because this is only used in the `monitoring_prom` feature
// but we want to run it in a test
#[allow(dead_code)]
/// Remove the origin from the full path to avoid duplicate metrics for different origins
fn remove_origin_from_path(full_path: &str, origin: &str) -> String {
full_path.replace(origin, "")
}
/// Update the signer nonce metric
pub fn update_signer_nonce(nonce: u64) {
SIGNER_NONCE.set(nonce as i64);
}

/// Start a new RPC call timer.
/// The `origin` parameter is the base path of the RPC call, e.g. `http://node.com`.
/// The `origin` parameter is removed from `full_path` when storing in prometheus.
#[cfg(feature = "monitoring_prom")]
pub fn new_rpc_call_timer(full_path: &str, origin: &str) -> HistogramTimer {
let path = remove_origin_from_path(full_path, origin);
let histogram = prometheus::SIGNER_RPC_CALL_LATENCIES_HISTOGRAM.with_label_values(&[&path]);
histogram.start_timer()
}
/// Start a new RPC call timer.
/// The `origin` parameter is the base path of the RPC call, e.g. `http://node.com`.
/// The `origin` parameter is removed from `full_path` when storing in prometheus.
pub fn new_rpc_call_timer(full_path: &str, origin: &str) -> HistogramTimer {
let path = super::remove_origin_from_path(full_path, origin);
let histogram = SIGNER_RPC_CALL_LATENCIES_HISTOGRAM.with_label_values(&[&path]);
histogram.start_timer()
}

/// NoOp timer uses for monitoring when the monitoring feature is not enabled.
pub struct NoOpTimer;
impl NoOpTimer {
/// NoOp method to stop recording when the monitoring feature is not enabled.
pub fn stop_and_record(&self) {}
}
/// Record the time taken to issue a block response for
/// a given block. The block's timestamp is used to calculate the latency.
///
/// Call this right after broadcasting a BlockResponse
pub fn record_block_response_latency(block: &NakamotoBlock) {
use clarity::util::get_epoch_time_ms;

let diff =
get_epoch_time_ms().saturating_sub(block.header.timestamp.saturating_mul(1000).into());
SIGNER_BLOCK_RESPONSE_LATENCIES_HISTOGRAM
.with_label_values(&[])
.observe(diff as f64 / 1000.0);
}

/// Stop and record the no-op timer.
#[cfg(not(feature = "monitoring_prom"))]
pub fn new_rpc_call_timer(_full_path: &str, _origin: &str) -> NoOpTimer {
NoOpTimer
}
/// Record the time taken to validate a block, as reported by the Stacks node.
pub fn record_block_validation_latency(latency_ms: u64) {
SIGNER_BLOCK_VALIDATION_LATENCIES_HISTOGRAM
.with_label_values(&[])
.observe(latency_ms as f64 / 1000.0);
}

/// Start serving monitoring metrics.
/// This will only serve the metrics if the `monitoring_prom` feature is enabled.
#[allow(unused_variables)]
pub fn start_serving_monitoring_metrics(config: GlobalConfig) -> Result<(), String> {
#[cfg(feature = "monitoring_prom")]
{
/// Start serving monitoring metrics.
/// This will only serve the metrics if the `monitoring_prom` feature is enabled.
pub fn start_serving_monitoring_metrics(config: GlobalConfig) -> Result<(), String> {
if config.metrics_endpoint.is_none() {
return Ok(());
}
let thread = std::thread::Builder::new()
let _ = std::thread::Builder::new()
.name("signer_metrics".to_string())
.spawn(move || {
if let Err(monitoring_err) = server::MonitoringServer::start(&config) {
if let Err(monitoring_err) = super::server::MonitoringServer::start(&config) {
error!("Monitoring: Error in metrics server: {:?}", monitoring_err);
}
});
Ok(())
}
#[cfg(not(feature = "monitoring_prom"))]
{
}

/// No-op actions for updating metrics
#[cfg(not(feature = "monitoring_prom"))]
pub mod actions {
use blockstack_lib::chainstate::nakamoto::NakamotoBlock;
use slog::slog_info;
use stacks_common::info;

use crate::GlobalConfig;

/// Update stacks tip height gauge
pub fn update_stacks_tip_height(_height: i64) {}

/// Update the current reward cycle
pub fn update_reward_cycle(_reward_cycle: i64) {}

/// Increment the block validation responses counter
pub fn increment_block_validation_responses(_accepted: bool) {}

/// Increment the block responses sent counter
pub fn increment_block_responses_sent(_accepted: bool) {}

/// Increment the number of block proposals received
pub fn increment_block_proposals_received() {}

/// Update the stx balance of the signer
pub fn update_signer_stx_balance(_balance: i64) {}

/// Update the signer nonce metric
pub fn update_signer_nonce(_nonce: u64) {}

/// NoOp timer uses for monitoring when the monitoring feature is not enabled.
pub struct NoOpTimer;
impl NoOpTimer {
/// NoOp method to stop recording when the monitoring feature is not enabled.
pub fn stop_and_record(&self) {}
}

/// Stop and record the no-op timer.
pub fn new_rpc_call_timer(_full_path: &str, _origin: &str) -> NoOpTimer {
NoOpTimer
}

/// Record the time taken to issue a block response for
/// a given block. The block's timestamp is used to calculate the latency.
///
/// Call this right after broadcasting a BlockResponse
pub fn record_block_response_latency(_block: &NakamotoBlock) {}

/// Record the time taken to validate a block, as reported by the Stacks node.
pub fn record_block_validation_latency(_latency_ms: u64) {}

/// Start serving monitoring metrics.
/// This will only serve the metrics if the `monitoring_prom` feature is enabled.
pub fn start_serving_monitoring_metrics(config: GlobalConfig) -> Result<(), String> {
if config.metrics_endpoint.is_some() {
info!("`metrics_endpoint` is configured for the signer, but the monitoring_prom feature is not enabled. Not starting monitoring metrics server.");
}
Ok(())
}
Ok(())
}

// Allow dead code because this is only used in the `monitoring_prom` feature
// but we want to run it in a test
#[allow(dead_code)]
/// Remove the origin from the full path to avoid duplicate metrics for different origins
fn remove_origin_from_path(full_path: &str, origin: &str) -> String {
full_path.replace(origin, "")
}

#[test]
Expand Down
Loading
Loading