Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FEAT: hardware metrics #876

Merged
merged 19 commits into from
Oct 11, 2024
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/multichain-update-prod-nodes.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ on:
jobs:
build-mpc-recovery:
runs-on: ubuntu-latest
environment: prod
steps:
- uses: actions/checkout@v3
name: "Checkout mpc-recovery"
Expand Down
122 changes: 119 additions & 3 deletions chain-signatures/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions chain-signatures/node/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -60,3 +60,4 @@ itertools = "0.12.0"
http = "1.1.0"
prometheus = { version = "0.13.3" }
once_cell = "1.13.1"
sysinfo = "0.32.0"
50 changes: 50 additions & 0 deletions chain-signatures/node/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,56 @@ pub(crate) static SIGNATURE_PUBLISH_FAILURES: Lazy<CounterVec> = Lazy::new(|| {
.unwrap()
});

/// Integer gauge vector for CPU usage, expressed as a percentage.
///
/// Created with the metric name `multichain_cpu_usage_percentage` and the
/// label names `global` and `node_account_id`. Initialization panics if
/// `try_create_int_gauge_vec` returns an error.
pub(crate) static CPU_USAGE_PERCENTAGE: Lazy<IntGaugeVec> = Lazy::new(|| {
    let label_names = ["global", "node_account_id"];
    let gauge = try_create_int_gauge_vec(
        "multichain_cpu_usage_percentage",
        "CPU Usage Percentage",
        &label_names,
    );
    gauge.unwrap()
});

/// Integer gauge vector for the amount of available memory, in bytes.
///
/// Created with the metric name `multichain_available_memory_bytes` and the
/// label names `available_mem` and `node_account_id`. Initialization panics
/// if `try_create_int_gauge_vec` returns an error.
pub(crate) static AVAILABLE_MEMORY_BYTES: Lazy<IntGaugeVec> = Lazy::new(|| {
    let label_names = ["available_mem", "node_account_id"];
    let gauge = try_create_int_gauge_vec(
        "multichain_available_memory_bytes",
        "Available Memory in Bytes",
        &label_names,
    );
    gauge.unwrap()
});

/// Integer gauge vector for the amount of used memory, in bytes.
///
/// Created with the metric name `multichain_used_memory_bytes` and the label
/// names `used` and `node_account_id`. Initialization panics if
/// `try_create_int_gauge_vec` returns an error.
pub(crate) static USED_MEMORY_BYTES: Lazy<IntGaugeVec> = Lazy::new(|| {
    let label_names = ["used", "node_account_id"];
    let gauge = try_create_int_gauge_vec(
        "multichain_used_memory_bytes",
        "Used Memory in Bytes",
        &label_names,
    );
    gauge.unwrap()
});

/// Integer gauge vector for the available disk space, in bytes.
///
/// Created with the metric name `multichain_available_disk_space_bytes` and
/// the label names `available_disk` and `node_account_id`. Initialization
/// panics if `try_create_int_gauge_vec` returns an error.
pub(crate) static AVAILABLE_DISK_SPACE_BYTES: Lazy<IntGaugeVec> = Lazy::new(|| {
    let label_names = ["available_disk", "node_account_id"];
    let gauge = try_create_int_gauge_vec(
        "multichain_available_disk_space_bytes",
        "Available Disk Space in Bytes",
        &label_names,
    );
    gauge.unwrap()
});

/// Integer gauge vector for the total disk space, in bytes.
///
/// Created with the metric name `multichain_total_disk_space_bytes` and the
/// label names `total_disk` and `node_account_id`. Initialization panics if
/// `try_create_int_gauge_vec` returns an error.
pub(crate) static TOTAL_DISK_SPACE_BYTES: Lazy<IntGaugeVec> = Lazy::new(|| {
    let label_names = ["total_disk", "node_account_id"];
    let gauge = try_create_int_gauge_vec(
        "multichain_total_disk_space_bytes",
        "Total Disk Space in Bytes",
        &label_names,
    );
    gauge.unwrap()
});

pub(crate) static SIGNATURE_PUBLISH_RESPONSE_ERRORS: Lazy<CounterVec> = Lazy::new(|| {
try_create_counter_vec(
"multichain_signature_publish_response_errors",
Expand Down
63 changes: 62 additions & 1 deletion chain-signatures/node/src/protocol/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ pub use message::MpcMessage;
pub use signature::SignQueue;
pub use signature::SignRequest;
pub use state::NodeState;
pub use sysinfo::{Components, CpuRefreshKind, Disks, RefreshKind, System};

use self::consensus::ConsensusCtx;
use self::cryptography::CryptographicCtx;
Expand All @@ -34,6 +35,7 @@ use cait_sith::protocol::Participant;
use near_account_id::AccountId;
use near_crypto::InMemorySigner;
use reqwest::IntoUrl;
use std::path::Path;
use std::time::Instant;
use std::{sync::Arc, time::Duration};
use tokio::sync::mpsc::{self, error::TryRecvError};
Expand Down Expand Up @@ -212,6 +214,7 @@ impl MpcSignProtocol {
let mut queue = MpcMessageQueue::default();
let mut last_state_update = Instant::now();
let mut last_config_update = Instant::now();
let last_hardware_pull = Instant::now();
let mut last_pinged = Instant::now();

// Sets the latest configurations from the contract:
Expand All @@ -226,11 +229,16 @@ impl MpcSignProtocol {

loop {
let protocol_time = Instant::now();
tracing::debug!("trying to advance chain signatures protocol");
// Hardware metric refresh
if last_hardware_pull.elapsed() > Duration::from_secs(5) {
update_system_metrics(&my_account_id);
}

tracing::debug!("trying to advance chain signatures protocol");
Copy link
Contributor

@ppca ppca Oct 10, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe delete this one then? there are 2 tracing::debug!("trying to advance chain signatures protocol");

crate::metrics::PROTOCOL_ITER_CNT
.with_label_values(&[my_account_id.as_str()])
.inc();

loop {
let msg_result = self.receiver.try_recv();
match msg_result {
Expand Down Expand Up @@ -385,3 +393,56 @@ fn node_version() -> i64 {
};
(rc_num + version.patch * 1000 + version.minor * 1000000 + version.major * 1000000000) as i64
}

fn update_system_metrics(node_account_id: &str) {
let mut system = System::new_all();

// Refresh only the necessary components
system.refresh_all();

let mut s =
System::new_with_specifics(RefreshKind::new().with_cpu(CpuRefreshKind::everything()));
// Wait a bit because CPU usage is based on diff.
std::thread::sleep(sysinfo::MINIMUM_CPU_UPDATE_INTERVAL);
// Refresh CPUs again to get actual value.
s.refresh_cpu_specifics(CpuRefreshKind::everything());

// Update CPU usage metric
let cpu_usage = s.global_cpu_usage() as i64;
crate::metrics::CPU_USAGE_PERCENTAGE
.with_label_values(&["global", node_account_id])
.set(cpu_usage);

// Update available memory metric
let available_memory = system.available_memory() as i64;
crate::metrics::AVAILABLE_MEMORY_BYTES
.with_label_values(&["available_mem", node_account_id])
.set(available_memory);

// Update used memory metric
let used_memory = system.used_memory() as i64;
crate::metrics::USED_MEMORY_BYTES
.with_label_values(&["used", node_account_id])
.set(used_memory);

let root_mount_point = Path::new("/");
// Update available disk space metric
let available_disk_space = Disks::new_with_refreshed_list()
.iter()
.find(|d| d.mount_point() == root_mount_point)
.expect("No disk found mounted at '/'")
.available_space() as i64;
crate::metrics::AVAILABLE_DISK_SPACE_BYTES
.with_label_values(&["available_disk", node_account_id])
.set(available_disk_space);

// Update total disk space metric
let total_disk_space = Disks::new_with_refreshed_list()
.iter()
.find(|d| d.mount_point() == root_mount_point)
.expect("No disk found mounted at '/'")
.total_space() as i64;
crate::metrics::TOTAL_DISK_SPACE_BYTES
.with_label_values(&["total_disk", node_account_id])
.set(total_disk_space);
}
5 changes: 5 additions & 0 deletions infra/partner-mainnet/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,11 @@ provider "google" {
provider "google-beta" {
project = var.project_id
}

# Sets the project-level compute metadata item "google-logging-enabled" to "true".
resource "google_compute_project_metadata_item" "project_logging" {
key = "google-logging-enabled"
value = "true"
}
module "gce-container" {
count = length(var.node_configs)
source = "terraform-google-modules/container-vm/google"
Expand Down
7 changes: 6 additions & 1 deletion infra/partner-testnet/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,11 @@ provider "google" {
provider "google-beta" {
project = var.project_id
}

# Sets the project-level compute metadata item "google-logging-enabled" to "true".
resource "google_compute_project_metadata_item" "project_logging" {
key = "google-logging-enabled"
value = "true"
}
module "gce-container" {
count = length(var.node_configs)
source = "terraform-google-modules/container-vm/google"
Expand Down Expand Up @@ -103,7 +108,7 @@ module "ig_template" {
source_image_project = "cos-cloud"
machine_type = "n2d-standard-2"

startup_script = "docker rm watchtower ; docker run -d --name watchtower -v /var/run/docker.sock:/var/run/docker.sock containrrr/watchtower --debug --interval 3600"
startup_script = "docker rm watchtower ; docker run -d --name watchtower -v /var/run/docker.sock:/var/run/docker.sock containrrr/watchtower --debug --interval 30"

source_image = reverse(split("/", module.gce-container[count.index].source_image))[0]
metadata = merge(var.additional_metadata, { "gce-container-declaration" = module.gce-container["${count.index}"].metadata_value })
Expand Down
Loading