Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Kubernetes collector #836

Open
wants to merge 14 commits into
base: main
Choose a base branch
from
Open
3 changes: 3 additions & 0 deletions .github/workflows/coverage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,9 @@ jobs:
run: |
./scripts/init_slurm_collector_sqlite.sh

- name: Migrate kubernetes collector sqlite database
run: ./scripts/init_kubernetes_collector_sqlite.sh

- name: Install cargo-llvm-cov
uses: taiki-e/install-action@cargo-llvm-cov

Expand Down
6 changes: 6 additions & 0 deletions .github/workflows/general.yml
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,9 @@ jobs:
run: |
./scripts/init_slurm_collector_sqlite.sh

- name: Migrate kubernetes collector sqlite database
run: ./scripts/init_kubernetes_collector_sqlite.sh

- name: Check sqlx-data.json for auditor is up-to-date
run: |
cd auditor
Expand Down Expand Up @@ -142,6 +145,9 @@ jobs:
- name: Migrate slurm collector sqlite database
run: ./scripts/init_slurm_collector_sqlite.sh

- name: Migrate kubernetes collector sqlite database
run: ./scripts/init_kubernetes_collector_sqlite.sh

- name: Clippy
run: cargo clippy --all-targets --all-features -- -D warnings

Expand Down
1 change: 1 addition & 0 deletions .github/workflows/rpm-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ jobs:
- auditor
- auditor-slurm-epilog-collector
- auditor-slurm-collector
- auditor-kubernetes-collector
- auditor-priority-plugin

env:
Expand Down
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Apel plugin: Add option to send individual job messages ([@dirksammel](https://github.com/dirksammel))
- CI: Add mypy workflow for type checking ([@dirksammel](https://github.com/dirksammel))
- Docs: Add contribution guidelines ([@QuantumDancer](https://github.com/QuantumDancer))
- Kubernetes collector: Add a Kubernetes collector ([@rkleinem](https://github.com/rkleinem))

### Changed
- Apel plugin: Use common logger in the code base ([@dirksammel](https://github.com/dirksammel))
Expand Down
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ members = [
"auditor",
"auditor-client",
"pyauditor",
"collectors/kubernetes",
"collectors/slurm",
"collectors/slurm-epilog",
"plugins/priority",
Expand Down
2 changes: 1 addition & 1 deletion auditor/src/domain/validname.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ impl<'de> serde::Deserialize<'de> for ValidName {

impl fmt::Display for ValidName {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{:?}", self.0)
write!(f, "{}", self.0)
}
}

Expand Down
1 change: 1 addition & 0 deletions collectors/kubernetes/.env
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
DATABASE_URL=sqlite://$PWD/collectors/kubernetes/testdb.db
40 changes: 40 additions & 0 deletions collectors/kubernetes/Cargo.toml
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Local dependencies can be transitioned to workspace dependencies

Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
[package]
name = "auditor-kubernetes-collector"
version = "0.5.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
anyhow = "1.0"
# NOTE(review): the local path dependencies below (auditor, auditor-client)
# could be transitioned to [workspace.dependencies] in the root Cargo.toml
# so versions are declared once for all members.
auditor = { path = "../../auditor", version = "0.5.0", default-features = false }
auditor-client = { path = "../../auditor-client", version = "0.5.0" }
chrono = { version = "0.4.38", default-features = false, features = ["serde"] }
# k8s-openapi feature pins the supported Kubernetes API version.
k8s-openapi = { version = "0.21", features = ["v1_28"] }
kube = { version = "0.88" }
prometheus-http-query = { version = "0.8.0" }
reqwest = { version = "0.12.4" }
serde = { version = "1.0.200", features = ["derive"] }
serde_yaml = "0.9"
tokio = { version = "1", features = ["time"] }
tracing = { version = "0.1" }
tracing-subscriber = { version = "0.3" }
uuid = { version = "1.8", features = ["v7"] }
# bincode is presumably used to serialize records into the sqlite BLOB
# column (see migrations/20240420084225_merge_queue.sql) — confirm in src.
bincode = "1.3.3"

# sqlite-backed local queue for records that could not be sent yet.
[dependencies.sqlx]
version = "0.7.4"
default-features = false
features = [
"runtime-tokio-rustls",
"macros",
"sqlite",
"uuid",
"chrono",
"migrate"
]

[dev-dependencies]
fake = { version = "2.9", features = ["chrono", "derive"] }
wiremock = "0.6"

12 changes: 12 additions & 0 deletions collectors/kubernetes/migrations/20240420084225_merge_queue.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
-- Add migration script here
-- Queue of records waiting to be (merged and) sent to AUDITOR.
-- NOTE(review): no primary key or index on `rid`; lookups scan the table,
-- which is fine while the queue stays small.
CREATE TABLE IF NOT EXISTS mergequeue (
rid VARCHAR(256) NOT NULL,   -- record identifier
record BLOB NOT NULL,        -- serialized record payload (presumably bincode — confirm against collector code)
retry INTEGER NOT NULL,      -- retry counter for failed sends (TODO confirm semantics)
updated INTEGER NOT NULL,    -- last-update timestamp (presumably unix seconds — confirm)
complete BOOLEAN NOT NULL    -- whether the record is finalized
);

-- Timestamp of the last check/collection run (presumably a single-row table — confirm).
CREATE TABLE IF NOT EXISTS lastcheck (
time DATETIME NOT NULL
);
182 changes: 182 additions & 0 deletions collectors/kubernetes/src/config.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
use std::fs;
use std::path::{Path, PathBuf};
use std::str::FromStr;
use std::time::Duration;

use chrono::{DateTime, Local, TimeDelta};
use serde::Deserialize;
use tracing_subscriber::filter::LevelFilter;

pub fn load_configuration(p: impl AsRef<Path>) -> Config {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

load_configuration can be modified to return Result<Config, config::ConfigError>

let yaml = fs::read_to_string(p.as_ref()).expect("Cannot open config file");
let config: DeConfig = serde_yaml::from_str(&yaml).expect("Config is not valid yaml");
config.into()
}

/// Deserialization wrapper around [`Config`].
///
/// `#[serde(from = "Config")]` makes serde first deserialize a plain
/// `Config` and then run it through `From<Config> for DeConfig`, which
/// normalizes the parsed values (lowercases the job-status filters).
#[derive(Deserialize)]
#[serde(from = "Config")]
struct DeConfig(Config);

impl From<Config> for DeConfig {
fn from(mut value: Config) -> Self {
for status in value.job_filter.status.iter_mut() {
status.make_ascii_lowercase()
}
Self(value)
}
}

impl From<DeConfig> for Config {
fn from(value: DeConfig) -> Self {
value.0
}
}

//#[serde_with::serde_as]
/// Collector configuration, deserialized from the YAML config file
/// (see `load_configuration`). Unknown keys are rejected.
#[derive(Deserialize, Debug, Clone)]
#[serde(deny_unknown_fields)]
pub struct Config {
    /// Address of the AUDITOR instance records are sent to.
    pub auditor_addr: String,
    /// Port of the AUDITOR instance (default: 8000).
    #[serde(default = "default_auditor_port")]
    pub auditor_port: u16,
    /// Address of the Prometheus server to query.
    pub prometheus_addr: String,
    /// Port of the Prometheus server.
    pub prometheus_port: u16,
    /// Prefix for record identifiers (default: empty string).
    #[serde(default = "default_record_prefix")]
    pub record_prefix: String,
    /// Earliest point in time the collector considers (default: collector
    /// start time). NOTE(review): exact semantics depend on the collector
    /// loop — confirm against its usage.
    #[serde(default = "default_earliest_datetime")]
    pub earliest_datetime: DateTime<Local>,
    /// Timeout for requests to AUDITOR; given in seconds in the YAML
    /// (default: 10).
    #[serde(default = "default_auditor_timeout")]
    #[serde(deserialize_with = "deserialize_timedelta")]
    pub auditor_timeout: TimeDelta,
    /// Timeout for Prometheus queries; seconds in the YAML (default: 60).
    #[serde(default = "default_prometheus_timeout")]
    #[serde(deserialize_with = "deserialize_timedelta")]
    pub prometheus_timeout: TimeDelta,
    /// Interval between collection runs; seconds in the YAML (default: 60).
    #[serde(default = "default_collect_interval")]
    #[serde(deserialize_with = "deserialize_timedelta")]
    pub collect_interval: TimeDelta,
    /// Interval between sends to AUDITOR; seconds in the YAML (default: 60).
    #[serde(default = "default_send_interval")]
    #[serde(deserialize_with = "deserialize_timedelta")]
    pub send_interval: TimeDelta,
    /// Path for the local sqlite database (default: "."); presumably a
    /// directory — confirm against database setup code.
    #[serde(default = "default_database_path")]
    pub database_path: PathBuf,
    /// Selects which Kubernetes jobs are collected.
    #[serde(default)]
    pub job_filter: JobFilterSettings,
    //#[serde(default)] // bool defaults to false
    //pub delete_jobs: bool,
    /// Interval between backlog retry attempts; seconds in the YAML
    /// (default: 300).
    #[serde(default = "default_backlog_interval")]
    #[serde(deserialize_with = "deserialize_duration")]
    pub backlog_interval: Duration,
    // NOTE(review): default fn is named "maxtries" while the field says
    // "maxretries" — consider aligning the names.
    /// Maximum number of retries for backlogged records (default: 2).
    #[serde(default = "default_backlog_maxtries")]
    pub backlog_maxretries: u16,
    /// Log verbosity filter (default: INFO).
    #[serde(default = "default_log_level")]
    #[serde(deserialize_with = "deserialize_log_level")]
    pub log_level: LevelFilter,
}

/// Default AUDITOR port.
fn default_auditor_port() -> u16 {
    8000
}
/// Default record prefix: the empty string.
fn default_record_prefix() -> String {
    //"KUBE".to_owned()
    String::new()
}
/// Default earliest datetime: the moment the collector starts.
fn default_earliest_datetime() -> DateTime<Local> {
    Local::now()
}
/// Default AUDITOR request timeout: 10 s.
fn default_auditor_timeout() -> TimeDelta {
    TimeDelta::try_seconds(10).unwrap()
}
/// Default Prometheus query timeout: 60 s.
fn default_prometheus_timeout() -> TimeDelta {
    TimeDelta::try_seconds(60).unwrap()
}
/// Default collection interval: 60 s.
fn default_collect_interval() -> TimeDelta {
    TimeDelta::try_seconds(60).unwrap()
}
/// Default send interval: 60 s.
fn default_send_interval() -> TimeDelta {
    TimeDelta::try_seconds(60).unwrap()
}
/// Default database path: the current directory.
fn default_database_path() -> PathBuf {
    ".".into()
}
/// Default backlog retry interval: 5 minutes.
fn default_backlog_interval() -> Duration {
    Duration::from_secs(300)
}
/// Default maximum number of backlog retries.
fn default_backlog_maxtries() -> u16 {
    2
}
/// Default log verbosity.
fn default_log_level() -> LevelFilter {
    LevelFilter::INFO
}

pub fn deserialize_log_level<'de, D>(deserializer: D) -> Result<LevelFilter, D::Error>
where
D: serde::Deserializer<'de>,
{
let s = String::deserialize(deserializer)?;
LevelFilter::from_str(&s.to_lowercase()).map_err(serde::de::Error::custom)
}

pub fn deserialize_duration<'de, D>(deserializer: D) -> Result<Duration, D::Error>
where
D: serde::Deserializer<'de>,
{
let seconds = i64::deserialize(deserializer)?;
if seconds < 1 {
Err(serde::de::Error::custom(
"durations should be greater than zero",
))
} else {
Ok(Duration::from_secs(seconds as u64))
}
}

pub fn deserialize_timedelta<'de, D>(deserializer: D) -> Result<TimeDelta, D::Error>
where
D: serde::Deserializer<'de>,
{
let seconds = i64::deserialize(deserializer)?;
if seconds < 1 {
Err(serde::de::Error::custom(
"durations should be greater than zero",
))
} else {
let dur = TimeDelta::try_seconds(seconds).ok_or(serde::de::Error::custom(format!(
"Cannot convert {} seconds to TimeDelta",
seconds
)))?;
if let Err(e) = dur.to_std() {
Err(serde::de::Error::custom(e))
} else {
Ok(dur)
}
}
}

/// Settings selecting which Kubernetes jobs the collector picks up.
#[derive(Deserialize, Debug, Clone)]
pub struct JobFilterSettings {
    /// Potentially interesting: complete, failed, suspended
    // Job status values to match (default: ["completed"]). Entries are
    // lowercased during config deserialization (see `DeConfig`).
    #[serde(default = "default_job_filter_status")]
    pub status: Vec<String>,
    /// Kubernetes namespaces to collect from (default: ["default"]).
    #[serde(default = "default_job_filter_namespace")]
    pub namespace: Vec<String>,
    /// Job labels to filter on; empty by default — presumably meaning
    /// "no label filtering", confirm against collector code.
    #[serde(default)]
    pub labels: Vec<String>,
}

impl Default for JobFilterSettings {
    /// Defaults mirror the per-field serde defaults: status `["completed"]`,
    /// namespace `["default"]`, and no label filters.
    fn default() -> Self {
        Self {
            status: default_job_filter_status(),
            namespace: default_job_filter_namespace(),
            // `Vec::new()` is the idiomatic non-allocating empty vector
            // (replaces `Vec::with_capacity(0)`, which also does not
            // allocate but obscures the intent).
            labels: Vec::new(),
        }
    }
}

/// Default job-status filter: only completed jobs.
fn default_job_filter_status() -> Vec<String> {
    vec![String::from("completed")]
}

/// Default namespace filter: the "default" namespace only.
fn default_job_filter_namespace() -> Vec<String> {
    vec![String::from("default")]
}
65 changes: 65 additions & 0 deletions collectors/kubernetes/src/constants.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
use std::cell::Cell;
use std::ops::Deref;
use std::sync::OnceLock;

use auditor::domain::ValidName;

// Pre-validated names used when building records. `ValidName::parse` is
// fallible, so these are constructed lazily; the `unwrap`s can only fire
// on first access (see `ensure_lazies`, which forces them at startup).
pub static KEY_PODNAME: Lazy<ValidName> =
    Lazy::new(|| ValidName::parse("podname".to_owned()).unwrap());
pub static KEY_NAMESPACE: Lazy<ValidName> =
    Lazy::new(|| ValidName::parse("namespace".to_owned()).unwrap());
pub static KEY_STATUS: Lazy<ValidName> =
    Lazy::new(|| ValidName::parse("status".to_owned()).unwrap());
pub static COMPONENT_CPU: Lazy<ValidName> =
    Lazy::new(|| ValidName::parse("cpu".to_owned()).unwrap());
pub static COMPONENT_MEM: Lazy<ValidName> =
    Lazy::new(|| ValidName::parse("memory".to_owned()).unwrap());

/// Eagerly initializes all lazily-constructed constants, so that any
/// failure in their initializers surfaces immediately rather than at
/// first use.
pub fn ensure_lazies() {
    for lazy in [
        &KEY_PODNAME,
        &KEY_NAMESPACE,
        &KEY_STATUS,
        &COMPONENT_CPU,
        &COMPONENT_MEM,
    ] {
        lazy.force();
    }
}

// Replace by `std::sync::LazyLock` once it is stable.
//
// Minimal re-implementation of `once_cell::sync::Lazy`: a value of type
// `T` computed by `init` on first access and cached thereafter.
pub struct Lazy<T, F = fn() -> T> {
    cell: OnceLock<T>,
    init: Cell<Option<F>>,
}

// SAFETY: We never hand out a `&F`, and `OnceLock::get_or_init` guarantees
// the initializer closure runs at most once even under concurrent access,
// so the interior `Cell` is only ever touched by the single thread that
// wins initialization.
unsafe impl<T, F: Send> Sync for Lazy<T, F> where OnceLock<T>: Sync {}

impl<T, F: FnOnce() -> T> Lazy<T, F> {
    /// Creates a lazy value that will be initialized by `f` on first access.
    const fn new(f: F) -> Self {
        Self {
            cell: OnceLock::new(),
            init: Cell::new(Some(f)),
        }
    }

    /// Forces initialization (if it has not happened yet) and returns the value.
    ///
    /// # Panics
    ///
    /// Panics if a previous initialization attempt panicked, leaving the
    /// initializer consumed while the cell is still unset ("poisoned").
    fn force(&self) -> &T {
        self.cell
            .get_or_init(|| (self.init.take().expect("Lazy instance was poisoned"))())
    }
}

impl<T, F: FnOnce() -> T> Deref for Lazy<T, F> {
    type Target = T;
    fn deref(&self) -> &Self::Target {
        self.force()
    }
}

impl<T, F, U> AsRef<U> for Lazy<T, F>
where
    T: AsRef<U>,
    F: FnOnce() -> T,
    U: ?Sized,
{
    fn as_ref(&self) -> &U {
        self.deref().as_ref()
    }
}
Loading
Loading