-
Notifications
You must be signed in to change notification settings - Fork 18
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
FEAT: hardware metrics #876
Changes from 9 commits
bf19f65
dd888d0
b9cd984
80a770d
a93aeea
efb3e85
60dd369
b3804a9
b589024
b5689d1
1f7d401
9b23a41
5d5684f
76e8efc
79cb421
81ea543
6da492d
7bc43bc
451107f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -60,3 +60,4 @@ itertools = "0.12.0" | |
http = "1.1.0" | ||
prometheus = { version = "0.13.3" } | ||
once_cell = "1.13.1" | ||
sysinfo = "0.32.0" |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -389,6 +389,66 @@ pub(crate) static SIGNATURE_PUBLISH_FAILURES: Lazy<CounterVec> = Lazy::new(|| { | |
.unwrap() | ||
}); | ||
|
||
// CPU Usage Percentage Metric | ||
pub(crate) static CPU_USAGE_PERCENTAGE: Lazy<IntGaugeVec> = Lazy::new(|| { | ||
try_create_int_gauge_vec( | ||
"multichain_cpu_usage_percentage", | ||
"CPU Usage Percentage", | ||
&[ "global", "node_account_id" ], | ||
) | ||
.unwrap() | ||
}); | ||
|
||
// Total Memory Metric | ||
pub(crate) static TOTAL_MEMORY_BYTES: Lazy<IntGaugeVec> = Lazy::new(|| { | ||
try_create_int_gauge_vec( | ||
"multichain_total_memory_bytes", | ||
"Total Memory in Bytes", | ||
&[ "total", "node_account_id" ], | ||
) | ||
.unwrap() | ||
}); | ||
|
||
// Available Memory Metric | ||
pub(crate) static AVAILABLE_MEMORY_BYTES: Lazy<IntGaugeVec> = Lazy::new(|| { | ||
try_create_int_gauge_vec( | ||
"multichain_available_memory_bytes", | ||
"Available Memory in Bytes", | ||
&[ "available_mem", "node_account_id"], | ||
) | ||
.unwrap() | ||
}); | ||
|
||
// Used Memory Metric | ||
pub(crate) static USED_MEMORY_BYTES: Lazy<IntGaugeVec> = Lazy::new(|| { | ||
try_create_int_gauge_vec( | ||
"multichain_used_memory_bytes", | ||
"Used Memory in Bytes", | ||
&[ "used", "node_account_id" ], | ||
) | ||
.unwrap() | ||
}); | ||
|
||
// Disk Space Metric | ||
pub(crate) static DISK_SPACE_BYTES: Lazy<IntGaugeVec> = Lazy::new(|| { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Should we also have available and used for the DISK_SPACE? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This one is available disk space, mostly for alerting purposes. We can also add a total disk space just so there's visibility into it. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Let's specify in the name whether it is used or remaining. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Fixed. I will do a calculation in Grafana for used disk space, since only total and available are part of that library. |
||
try_create_int_gauge_vec( | ||
"multichain_disk_space_bytes", | ||
"Available Disk Space in Bytes", | ||
&[ "available_disk", "node_account_id" ], | ||
) | ||
.unwrap() | ||
}); | ||
|
||
// Total Disk Space Metric | ||
pub(crate) static TOTAL_DISK_SPACE_BYTES: Lazy<IntGaugeVec> = Lazy::new(|| { | ||
try_create_int_gauge_vec( | ||
"multichain_total_disk_space_bytes", | ||
"Total Disk Space in Bytes", | ||
&[ "total_disk", "node_account_id" ], | ||
) | ||
.unwrap() | ||
}); | ||
|
||
pub(crate) static SIGNATURE_PUBLISH_RESPONSE_ERRORS: Lazy<CounterVec> = Lazy::new(|| { | ||
try_create_counter_vec( | ||
"multichain_signature_publish_response_errors", | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,6 +17,7 @@ pub use message::MpcMessage; | |
pub use signature::SignQueue; | ||
pub use signature::SignRequest; | ||
pub use state::NodeState; | ||
pub use sysinfo::{System, Components, Disks, CpuRefreshKind, RefreshKind}; | ||
|
||
use self::consensus::ConsensusCtx; | ||
use self::cryptography::CryptographicCtx; | ||
|
@@ -36,6 +37,7 @@ use near_crypto::InMemorySigner; | |
use reqwest::IntoUrl; | ||
use std::time::Instant; | ||
use std::{sync::Arc, time::Duration}; | ||
use std::path::Path; | ||
use tokio::sync::mpsc::{self, error::TryRecvError}; | ||
use tokio::sync::RwLock; | ||
use url::Url; | ||
|
@@ -212,6 +214,7 @@ impl MpcSignProtocol { | |
let mut queue = MpcMessageQueue::default(); | ||
let mut last_state_update = Instant::now(); | ||
let mut last_config_update = Instant::now(); | ||
let mut last_hardware_pull = Instant::now(); | ||
let mut last_pinged = Instant::now(); | ||
|
||
// Sets the latest configurations from the contract: | ||
|
@@ -226,6 +229,12 @@ impl MpcSignProtocol { | |
|
||
loop { | ||
let protocol_time = Instant::now(); | ||
tracing::debug!("trying to advance chain signatures protocol"); | ||
// Hardware metric refresh | ||
if last_hardware_pull.elapsed() > Duration::from_secs(5) { | ||
update_system_metrics(&my_account_id); | ||
} | ||
|
||
tracing::debug!("trying to advance chain signatures protocol"); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Maybe delete this one, then? There are two of them. |
||
loop { | ||
let msg_result = self.receiver.try_recv(); | ||
|
@@ -381,3 +390,65 @@ fn node_version() -> i64 { | |
}; | ||
(rc_num + version.patch * 1000 + version.minor * 1000000 + version.major * 1000000000) as i64 | ||
} | ||
|
||
|
||
fn update_system_metrics(node_account_id: &str) { | ||
|
||
let mut system = System::new_all(); | ||
|
||
// Refresh only the necessary components | ||
system.refresh_all(); | ||
|
||
let mut s = System::new_with_specifics( | ||
RefreshKind::new().with_cpu(CpuRefreshKind::everything()), | ||
); | ||
// Wait a bit because CPU usage is based on diff. | ||
std::thread::sleep(sysinfo::MINIMUM_CPU_UPDATE_INTERVAL); | ||
// Refresh CPUs again to get actual value. | ||
s.refresh_cpu_specifics(CpuRefreshKind::everything()); | ||
|
||
// Update CPU usage metric | ||
let cpu_usage = s.global_cpu_usage() as i64; | ||
crate::metrics::CPU_USAGE_PERCENTAGE | ||
.with_label_values(&["global", node_account_id]) | ||
.set(cpu_usage); | ||
|
||
// Update total memory metric | ||
let total_memory = system.total_memory() as i64; | ||
crate::metrics::TOTAL_MEMORY_BYTES | ||
.with_label_values(&["total", node_account_id]) | ||
.set(total_memory); | ||
|
||
// Update available memory metric | ||
let available_memory = system.available_memory() as i64; | ||
crate::metrics::AVAILABLE_MEMORY_BYTES | ||
.with_label_values(&["available_mem", node_account_id]) | ||
.set(available_memory); | ||
|
||
// Update used memory metric | ||
let used_memory = system.used_memory() as i64; | ||
crate::metrics::USED_MEMORY_BYTES | ||
.with_label_values(&["used", node_account_id]) | ||
.set(used_memory); | ||
|
||
let root_mount_point = Path::new("/"); | ||
// Update available disk space metric | ||
let available_disk_space = Disks::new_with_refreshed_list() | ||
.iter() | ||
.find(|d| d.mount_point() == root_mount_point) | ||
.expect("No disk found mounted at '/'") | ||
.available_space() as i64; | ||
crate::metrics::DISK_SPACE_BYTES | ||
.with_label_values(&["available_disk", node_account_id]) | ||
.set(available_disk_space); | ||
|
||
// Update total disk space metric | ||
let total_disk_space = Disks::new_with_refreshed_list() | ||
.iter() | ||
.find(|d| d.mount_point() == root_mount_point) | ||
.expect("No disk found mounted at '/'") | ||
.total_space() as i64; | ||
crate::metrics::TOTAL_DISK_SPACE_BYTES | ||
.with_label_values(&["total_disk", node_account_id]) | ||
.set(total_disk_space); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Either this one or
AVAILABLE_MEMORY_BYTES
is redundant. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Didn't we want a total to ensure partners have enough memory? I figured available would be useful to alert off of as well.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just saying that available + used = total