Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

metrics: add failure counter for triple/presig/signature #804

Merged
merged 1 commit into from
Aug 12, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
metrics: add failures for triple/presig/signature
ppca committed Aug 7, 2024
commit b0251041cb350396cfdcc52e3766e9d079d6f3a9
45 changes: 45 additions & 0 deletions chain-signatures/node/src/metrics.rs
Original file line number Diff line number Diff line change
@@ -335,6 +335,51 @@ pub(crate) static FAILED_SEND_ENCRYPTED_LATENCY: Lazy<HistogramVec> = Lazy::new(
.unwrap()
});

pub(crate) static NUM_TOTAL_HISTORICAL_SIGNATURE_GENERATORS: Lazy<IntGaugeVec> = Lazy::new(|| {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we rename this to SIGNATURE_GENERATOR, NEW_SIGNATURE_GENERATOR, or something similar?
We should not expose how it is represented on Grafana in code. It can be reused, it can be per hour, not historical. It can be combined with other data and not be TOTAL.

I would rename all new and existing metrics so they represent what they are, and not how they are shown on in Grafana.

try_create_int_gauge_vec(
"multichain_num_total_historical_signature_generators",
"number of all signature generators historically on the node",
&["node_account_id"],
)
.unwrap()
});

pub(crate) static TRIPLE_GENERATOR_FAILURES: Lazy<IntGaugeVec> = Lazy::new(|| {
try_create_int_gauge_vec(
"multichain_triple_generator_failures",
"total triple generator failures",
&["node_account_id"],
)
.unwrap()
});

pub(crate) static SIGNATURE_GENERATOR_FAILURES: Lazy<IntGaugeVec> = Lazy::new(|| {
try_create_int_gauge_vec(
"multichain_signature_generator_failures",
"total signature generator failures",
&["node_account_id"],
)
.unwrap()
});

pub(crate) static PRESIGNATURE_GENERATOR_FAILURES: Lazy<IntGaugeVec> = Lazy::new(|| {
try_create_int_gauge_vec(
"multichain_presignature_generator_failures",
"total presignature generator failures",
&["node_account_id"],
)
.unwrap()
});

pub(crate) static SIGNATURE_FAILURES: Lazy<IntGaugeVec> = Lazy::new(|| {
try_create_int_gauge_vec(
"multichain_signature_failures",
"total signature failures",
&["node_account_id"],
)
.unwrap()
});

pub fn try_create_int_gauge_vec(name: &str, help: &str, labels: &[&str]) -> Result<IntGaugeVec> {
check_metric_multichain_prefix(name)?;
let opts = Opts::new(name, help);
2 changes: 2 additions & 0 deletions chain-signatures/node/src/protocol/consensus.rs
Original file line number Diff line number Diff line change
@@ -167,6 +167,7 @@ impl ConsensusProtocol for StartedState {
me,
contract_state.public_key,
epoch,
ctx.my_account_id(),
),
)),
messages: Default::default(),
@@ -398,6 +399,7 @@ impl ConsensusProtocol for WaitingForConsensusState {
me,
self.public_key,
self.epoch,
ctx.my_account_id(),
))),
messages: self.messages,
}))
7 changes: 1 addition & 6 deletions chain-signatures/node/src/protocol/cryptography.rs
Original file line number Diff line number Diff line change
@@ -456,12 +456,7 @@ impl CryptographicProtocol for RunningState {
messages.push(info.clone(), MpcMessage::Signature(msg));
}
signature_manager
.publish(
ctx.rpc_client(),
ctx.signer(),
ctx.mpc_contract_id(),
&my_account_id,
)
.publish(ctx.rpc_client(), ctx.signer(), ctx.mpc_contract_id())
.await;
drop(signature_manager);
let failures = messages
3 changes: 3 additions & 0 deletions chain-signatures/node/src/protocol/presignature.rs
Original file line number Diff line number Diff line change
@@ -435,6 +435,9 @@ impl PresignatureManager {
let action = match generator.poke() {
Ok(action) => action,
Err(e) => {
crate::metrics::PRESIGNATURE_GENERATOR_FAILURES
.with_label_values(&[self.my_account_id.as_str()])
.inc();
self.gc.insert(*id, Instant::now());
self.introduced.remove(id);
errors.push(e);
31 changes: 26 additions & 5 deletions chain-signatures/node/src/protocol/signature.rs
Original file line number Diff line number Diff line change
@@ -238,6 +238,7 @@ pub struct SignatureManager {
me: Participant,
public_key: PublicKey,
epoch: u64,
my_account_id: AccountId,
}

pub const MAX_RETRY: u8 = 10;
@@ -267,7 +268,12 @@ impl ToPublish {
}

impl SignatureManager {
pub fn new(me: Participant, public_key: PublicKey, epoch: u64) -> Self {
pub fn new(
me: Participant,
public_key: PublicKey,
epoch: u64,
my_account_id: &AccountId,
) -> Self {
Self {
generators: HashMap::new(),
failed: VecDeque::new(),
@@ -276,6 +282,7 @@ impl SignatureManager {
me,
public_key,
epoch,
my_account_id: my_account_id.clone(),
}
}

@@ -354,6 +361,9 @@ impl SignatureManager {
req,
cfg,
)?;
crate::metrics::NUM_TOTAL_HISTORICAL_SIGNATURE_GENERATORS
.with_label_values(&[self.my_account_id.as_str()])
.inc();
self.generators.insert(receipt_id, generator);
Ok(())
}
@@ -393,6 +403,9 @@ impl SignatureManager {
},
cfg,
)?;
crate::metrics::NUM_TOTAL_HISTORICAL_SIGNATURE_GENERATORS
.with_label_values(&[self.my_account_id.as_str()])
.inc();
self.generators.insert(receipt_id, generator);
Ok(())
}
@@ -462,6 +475,9 @@ impl SignatureManager {
}
};
let generator = entry.insert(generator);
crate::metrics::NUM_TOTAL_HISTORICAL_SIGNATURE_GENERATORS
.with_label_values(&[self.my_account_id.as_str()])
.inc();
Ok(&mut generator.protocol)
}
Entry::Occupied(entry) => Ok(&mut entry.into_mut().protocol),
@@ -482,6 +498,9 @@ impl SignatureManager {
if generator.proposer == self.me {
if generator.sign_request_timestamp.elapsed() < generator.timeout_total {
tracing::warn!(?err, "signature failed to be produced; pushing request back into failed queue");
crate::metrics::SIGNATURE_GENERATOR_FAILURES
.with_label_values(&[self.my_account_id.as_str()])
.inc();
// only retry the signature generation if it was initially proposed by us. We do not
// want any nodes to be proposing the same signature multiple times.
self.failed.push_back((
@@ -496,6 +515,9 @@ impl SignatureManager {
));
} else {
self.completed.insert(*receipt_id, Instant::now());
crate::metrics::SIGNATURE_FAILURES
.with_label_values(&[self.my_account_id.as_str()])
.inc();
tracing::warn!(?err, "signature failed to be produced; trashing request");
}
}
@@ -663,7 +685,6 @@ impl SignatureManager {
rpc_client: &near_fetch::Client,
signer: &T,
mpc_contract_id: &AccountId,
my_account_id: &AccountId,
) {
let mut to_retry: Vec<ToPublish> = Vec::new();

@@ -720,14 +741,14 @@ impl SignatureManager {
};

crate::metrics::NUM_SIGN_SUCCESS
.with_label_values(&[my_account_id.as_str()])
.with_label_values(&[self.my_account_id.as_str()])
.inc();
crate::metrics::SIGN_LATENCY
.with_label_values(&[my_account_id.as_str()])
.with_label_values(&[self.my_account_id.as_str()])
.observe(time_added.elapsed().as_secs_f64());
if time_added.elapsed().as_secs() <= 30 {
crate::metrics::NUM_SIGN_SUCCESS_30S
.with_label_values(&[my_account_id.as_str()])
.with_label_values(&[self.my_account_id.as_str()])
.inc();
}
}
3 changes: 3 additions & 0 deletions chain-signatures/node/src/protocol/triple.rs
Original file line number Diff line number Diff line change
@@ -459,6 +459,9 @@ impl TripleManager {
Ok(action) => action,
Err(e) => {
errors.push(e);
crate::metrics::TRIPLE_GENERATOR_FAILURES
.with_label_values(&[self.my_account_id.as_str()])
.inc();
self.gc.insert(*id, Instant::now());
self.ongoing.remove(id);
self.introduced.remove(id);