Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FEAT: hardware metrics #876

Merged
merged 19 commits into from
Oct 11, 2024
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 117 additions & 1 deletion chain-signatures/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions chain-signatures/node/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -60,3 +60,4 @@ itertools = "0.12.0"
http = "1.1.0"
prometheus = { version = "0.13.3" }
once_cell = "1.13.1"
sysinfo = "0.32.0"
60 changes: 60 additions & 0 deletions chain-signatures/node/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,66 @@ pub(crate) static SIGNATURE_PUBLISH_FAILURES: Lazy<CounterVec> = Lazy::new(|| {
.unwrap()
});

/// Integer gauge family `multichain_cpu_usage_percentage` ("CPU Usage Percentage").
///
/// Declared with two label names, `global` and `node_account_id`. The trailing
/// `.unwrap()` panics if `try_create_int_gauge_vec` returns an error.
pub(crate) static CPU_USAGE_PERCENTAGE: Lazy<IntGaugeVec> = Lazy::new(|| {
    let label_names = ["global", "node_account_id"];
    let gauge = try_create_int_gauge_vec(
        "multichain_cpu_usage_percentage",
        "CPU Usage Percentage",
        &label_names,
    );
    gauge.unwrap()
});

// Total Memory Metric
pub(crate) static TOTAL_MEMORY_BYTES: Lazy<IntGaugeVec> = Lazy::new(|| {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Either this one or AVAILABLE_MEMORY_BYTES is redundant.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Didn't we want a total to ensure partners have enough memory? I figured available would be useful to alert off of as well.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just saying that available + used = total

try_create_int_gauge_vec(
"multichain_total_memory_bytes",
"Total Memory in Bytes",
&[ "total", "node_account_id" ],
)
.unwrap()
});

/// Integer gauge family `multichain_available_memory_bytes` ("Available Memory in Bytes").
///
/// Declared with two label names, `available_mem` and `node_account_id`. The
/// trailing `.unwrap()` panics if `try_create_int_gauge_vec` returns an error.
pub(crate) static AVAILABLE_MEMORY_BYTES: Lazy<IntGaugeVec> = Lazy::new(|| {
    let label_names = ["available_mem", "node_account_id"];
    let gauge = try_create_int_gauge_vec(
        "multichain_available_memory_bytes",
        "Available Memory in Bytes",
        &label_names,
    );
    gauge.unwrap()
});

/// Integer gauge family `multichain_used_memory_bytes` ("Used Memory in Bytes").
///
/// Declared with two label names, `used` and `node_account_id`. The trailing
/// `.unwrap()` panics if `try_create_int_gauge_vec` returns an error.
pub(crate) static USED_MEMORY_BYTES: Lazy<IntGaugeVec> = Lazy::new(|| {
    let label_names = ["used", "node_account_id"];
    let gauge = try_create_int_gauge_vec(
        "multichain_used_memory_bytes",
        "Used Memory in Bytes",
        &label_names,
    );
    gauge.unwrap()
});

// Disk Space Metric
pub(crate) static DISK_SPACE_BYTES: Lazy<IntGaugeVec> = Lazy::new(|| {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we also have available and used for the DISK_SPACE?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This one is Available disk space, mostly for alerting purposes. We can also add a total disk space just so there's visibility into it.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's specify in the name if it is used or remaining.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed. I will do a calculation in Grafana for used disk space, since only total and available are part of that library.

try_create_int_gauge_vec(
"multichain_disk_space_bytes",
"Available Disk Space in Bytes",
&[ "available_disk", "node_account_id" ],
)
.unwrap()
});

/// Integer gauge family `multichain_total_disk_space_bytes` ("Total Disk Space in Bytes").
///
/// Declared with two label names, `total_disk` and `node_account_id`. The
/// trailing `.unwrap()` panics if `try_create_int_gauge_vec` returns an error.
pub(crate) static TOTAL_DISK_SPACE_BYTES: Lazy<IntGaugeVec> = Lazy::new(|| {
    let label_names = ["total_disk", "node_account_id"];
    let gauge = try_create_int_gauge_vec(
        "multichain_total_disk_space_bytes",
        "Total Disk Space in Bytes",
        &label_names,
    );
    gauge.unwrap()
});

pub(crate) static SIGNATURE_PUBLISH_RESPONSE_ERRORS: Lazy<CounterVec> = Lazy::new(|| {
try_create_counter_vec(
"multichain_signature_publish_response_errors",
Expand Down
71 changes: 71 additions & 0 deletions chain-signatures/node/src/protocol/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ pub use message::MpcMessage;
pub use signature::SignQueue;
pub use signature::SignRequest;
pub use state::NodeState;
pub use sysinfo::{System, Components, Disks, CpuRefreshKind, RefreshKind};

use self::consensus::ConsensusCtx;
use self::cryptography::CryptographicCtx;
Expand All @@ -36,6 +37,7 @@ use near_crypto::InMemorySigner;
use reqwest::IntoUrl;
use std::time::Instant;
use std::{sync::Arc, time::Duration};
use std::path::Path;
use tokio::sync::mpsc::{self, error::TryRecvError};
use tokio::sync::RwLock;
use url::Url;
Expand Down Expand Up @@ -212,6 +214,7 @@ impl MpcSignProtocol {
let mut queue = MpcMessageQueue::default();
let mut last_state_update = Instant::now();
let mut last_config_update = Instant::now();
let mut last_hardware_pull = Instant::now();
let mut last_pinged = Instant::now();

// Sets the latest configurations from the contract:
Expand All @@ -226,6 +229,12 @@ impl MpcSignProtocol {

loop {
let protocol_time = Instant::now();
tracing::debug!("trying to advance chain signatures protocol");
// Hardware metric refresh
if last_hardware_pull.elapsed() > Duration::from_secs(5) {
update_system_metrics(&my_account_id);
}

tracing::debug!("trying to advance chain signatures protocol");
Copy link
Contributor

@ppca ppca Oct 10, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe delete this one then? there are 2 tracing::debug!("trying to advance chain signatures protocol");

loop {
let msg_result = self.receiver.try_recv();
Expand Down Expand Up @@ -381,3 +390,65 @@ fn node_version() -> i64 {
};
(rc_num + version.patch * 1000 + version.minor * 1000000 + version.major * 1000000000) as i64
}


fn update_system_metrics(node_account_id: &str) {

let mut system = System::new_all();

// Refresh only the necessary components
system.refresh_all();

let mut s = System::new_with_specifics(
RefreshKind::new().with_cpu(CpuRefreshKind::everything()),
);
// Wait a bit because CPU usage is based on diff.
std::thread::sleep(sysinfo::MINIMUM_CPU_UPDATE_INTERVAL);
// Refresh CPUs again to get actual value.
s.refresh_cpu_specifics(CpuRefreshKind::everything());

// Update CPU usage metric
let cpu_usage = s.global_cpu_usage() as i64;
crate::metrics::CPU_USAGE_PERCENTAGE
.with_label_values(&["global", node_account_id])
.set(cpu_usage);

// Update total memory metric
let total_memory = system.total_memory() as i64;
crate::metrics::TOTAL_MEMORY_BYTES
.with_label_values(&["total", node_account_id])
.set(total_memory);

// Update available memory metric
let available_memory = system.available_memory() as i64;
crate::metrics::AVAILABLE_MEMORY_BYTES
.with_label_values(&["available_mem", node_account_id])
.set(available_memory);

// Update used memory metric
let used_memory = system.used_memory() as i64;
crate::metrics::USED_MEMORY_BYTES
.with_label_values(&["used", node_account_id])
.set(used_memory);

let root_mount_point = Path::new("/");
// Update available disk space metric
let available_disk_space = Disks::new_with_refreshed_list()
.iter()
.find(|d| d.mount_point() == root_mount_point)
.expect("No disk found mounted at '/'")
.available_space() as i64;
crate::metrics::DISK_SPACE_BYTES
.with_label_values(&["available_disk", node_account_id])
.set(available_disk_space);

// Update total disk space metric
let total_disk_space = Disks::new_with_refreshed_list()
.iter()
.find(|d| d.mount_point() == root_mount_point)
.expect("No disk found mounted at '/'")
.total_space() as i64;
crate::metrics::TOTAL_DISK_SPACE_BYTES
.with_label_values(&["total_disk", node_account_id])
.set(total_disk_space);
}
5 changes: 5 additions & 0 deletions infra/partner-mainnet/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,11 @@ provider "google" {
provider "google-beta" {
project = var.project_id
}

# Sets the project-level Compute Engine metadata item "google-logging-enabled" to "true".
# NOTE(review): presumably this turns on the logging agent for the node VMs — confirm.
resource "google_compute_project_metadata_item" "project_logging" {
key = "google-logging-enabled"
value = "true"
}
module "gce-container" {
count = length(var.node_configs)
source = "terraform-google-modules/container-vm/google"
Expand Down
5 changes: 5 additions & 0 deletions infra/partner-testnet/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,11 @@ provider "google" {
provider "google-beta" {
project = var.project_id
}

# Sets the project-level Compute Engine metadata item "google-logging-enabled" to "true".
# NOTE(review): presumably this turns on the logging agent for the node VMs — confirm.
resource "google_compute_project_metadata_item" "project_logging" {
key = "google-logging-enabled"
value = "true"
}
module "gce-container" {
count = length(var.node_configs)
source = "terraform-google-modules/container-vm/google"
Expand Down
Loading