account cpu time in ebpf (#1185)
don't use cgroup id 1 if cgroup id is invalid
choose the right cgroup id path from /proc/pid/cgroup
use kernel thread group id (i.e. userspace pid) to aggregate processes to reduce search overhead
add finish_task_switch kprobe
switch to kprobe to trace context switch
change to co-re to avoid crash
review feedback
clean up
cpu time: use per cpu array to track process cpu time
review feedback: use task_clock instead of cpu_clock
review feedback
add to changelog
rebase

Signed-off-by: Huamin Chen <[email protected]>
rootfs authored Jan 23, 2024
1 parent bebe5e2 commit 36e7a87
Showing 14 changed files with 120,519 additions and 55 deletions.
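One recurring point in the commit message above, aggregating processes by the kernel thread group id, relies on how BPF packs both ids into a single value. The following is a minimal sketch of that split as used in the new kepler.bpf.c; the helper name is illustrative, not from this commit.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

/* bpf_get_current_pid_tgid() packs the thread group id (what userspace calls
 * the PID) into the upper 32 bits and the kernel thread id into the lower
 * 32 bits; keying metrics by tgid groups all threads of a process. */
static __always_inline void split_pid_tgid(u32 *pid, u32 *tgid)
{
	u64 pid_tgid = bpf_get_current_pid_tgid();

	*pid = (u32)pid_tgid;   /* kernel-space view: per-thread id */
	*tgid = pid_tgid >> 32; /* user-space view: process id */
}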
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,5 @@
in kepler 0.7 release
- switch to libbpf as default ebpf provider
- base image update: decouple the GPU driver from the kepler image itself
- use kprobe instead of tracepoint for ebpf to obtain context switch information
- add task clock event to ebpf and use it to calculate cpu usage for each process. The event is also exported to prometheus
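The task clock changelog entry corresponds to the new task_clock_event_reader perf array and task_clock map in kepler.bpf.c below. As a rough sketch of what the userspace side has to provide, the following C snippet opens a PERF_COUNT_SW_TASK_CLOCK event per CPU and stores its file descriptor in the reader map so that bpf_perf_event_read() can see it; Kepler's exporter does the equivalent from Go, and the function names here are assumptions, not code from this commit.

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <bpf/bpf.h>

/* Open a software task-clock counter on one CPU (all tasks, pid = -1). */
static int open_task_clock_event(int cpu)
{
	struct perf_event_attr attr = {
		.type = PERF_TYPE_SOFTWARE,
		.size = sizeof(struct perf_event_attr),
		.config = PERF_COUNT_SW_TASK_CLOCK,
	};

	return syscall(__NR_perf_event_open, &attr, -1, cpu, -1, 0);
}

/* Store each per-CPU perf fd at its CPU index in task_clock_event_reader. */
static int attach_task_clock(int reader_map_fd, int ncpus)
{
	for (int cpu = 0; cpu < ncpus; cpu++) {
		int fd = open_task_clock_event(cpu);

		if (fd < 0)
			return -1;
		if (bpf_map_update_elem(reader_map_fd, &cpu, &fd, BPF_ANY))
			return -1;
	}
	return 0;
}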
9 changes: 8 additions & 1 deletion bpfassets/libbpf/Makefile
@@ -4,9 +4,16 @@ GOARCH=$(shell go env GOARCH)
TARGET := kepler
TARGET_BPF := $(TARGET).bpf.o
BPF_SRC := bpfassets/libbpf/src/*.bpf.c
# in libbpf if $(ARCH) is x86_64, then set TARGET_ARCH to x86
ifeq ($(ARCH),x86_64)
TARGET_ARCH := x86
else
TARGET_ARCH := $(ARCH)
endif

$(TARGET_BPF): $(BPF_SRC)
clang \
-I /usr/include/$(ARCH)-linux-gnu \
-D __TARGET_ARCH_$(TARGET_ARCH) \
-O2 -g -c -target bpf \
-o bpfassets/libbpf/bpf.o/$(GOARCH)_$@ $<
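The TARGET_ARCH mapping and the new -D __TARGET_ARCH_$(TARGET_ARCH) flag exist because clang does not define the host architecture macros when building with -target bpf, and bpf/bpf_tracing.h needs an explicit __TARGET_ARCH_* define before it exposes the PT_REGS_PARM* accessors that the CO-RE kprobe in kepler.bpf.c relies on. A minimal sketch of what the flag enables (program and license names are illustrative, not part of this diff):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char LICENSE[] SEC("license") = "Dual BSD/GPL";

SEC("kprobe/finish_task_switch")
int sketch_finish_task_switch(struct pt_regs *ctx)
{
	/* PT_REGS_PARM1() resolves to the right register only when a
	 * __TARGET_ARCH_* macro (e.g. __TARGET_ARCH_x86) is defined. */
	struct task_struct *prev = (struct task_struct *)PT_REGS_PARM1(ctx);

	(void)prev;
	return 0;
}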
Binary file modified bpfassets/libbpf/bpf.o/amd64_kepler.bpf.o
Binary file not shown.
96 changes: 67 additions & 29 deletions bpfassets/libbpf/src/kepler.bpf.c
@@ -29,18 +29,21 @@ BPF_HASH(processes, u32, process_metrics_t);
BPF_HASH(pid_time, u32, u64);

// perf counters
BPF_PERF_ARRAY(cpu_cycles_hc_reader);
BPF_PERF_ARRAY(cpu_cycles_event_reader);
BPF_ARRAY(cpu_cycles, u64);

BPF_PERF_ARRAY(cpu_ref_cycles_hc_reader);
BPF_PERF_ARRAY(cpu_ref_cycles_event_reader);
BPF_ARRAY(cpu_ref_cycles, u64);

BPF_PERF_ARRAY(cpu_instructions_hc_reader);
BPF_PERF_ARRAY(cpu_instructions_event_reader);
BPF_ARRAY(cpu_instructions, u64);

BPF_PERF_ARRAY(cache_miss_hc_reader);
BPF_PERF_ARRAY(cache_miss_event_reader);
BPF_ARRAY(cache_miss, u64);

BPF_PERF_ARRAY(task_clock_event_reader);
BPF_ARRAY(task_clock, u64);

// cpu freq counters
BPF_ARRAY(cpu_freq_array, u32);

@@ -62,13 +65,12 @@ static inline u64 get_on_cpu_time(u32 cur_pid, u32 prev_pid, u64 cur_ts)
// timestamp later than the recorded off-CPU event, or vice versa.
if (cur_ts > *prev_ts)
{
cpu_time = (cur_ts - *prev_ts) / 1000000; /*milisecond*/
cpu_time = (cur_ts - *prev_ts) / 1000000 ; // convert to ms
bpf_map_delete_elem(&pid_time, &prev_pid_key);
}
}
pid_time_t new_pid_key = {.pid = cur_pid};
bpf_map_update_elem(&pid_time, &new_pid_key, &cur_ts, BPF_NOEXIST);

return cpu_time;
}

@@ -83,13 +85,40 @@ static inline u64 calc_delta(u64 *prev_val, u64 *val)
return delta;
}

static inline u64 get_on_cpu_task_clock_time(u32 *cpu_id, u64 cur_ts)
{
u64 delta = 0;
#ifdef BPF_PERF_EVENT_READ_VALUE_AVAILABLE
struct bpf_perf_event_value c = {};
int error = bpf_perf_event_read_value(&task_clock_event_reader, *cpu_id, &c, sizeof(struct bpf_perf_event_value));
if (error == 0)
{
u64 val = c.counter;
u64 *prev_val = bpf_map_lookup_elem(&task_clock, cpu_id);
delta = calc_delta(prev_val, &val);
bpf_map_update_elem(&task_clock, cpu_id, &val, BPF_ANY);
}
#else
int ret = bpf_perf_event_read(&task_clock_event_reader, *cpu_id);
if (ret < 0) {
return delta;
}
u64 val = ret;
u64 *prev_val = bpf_map_lookup_elem(&task_clock, cpu_id);
delta = calc_delta(prev_val, &val);
bpf_map_update_elem(&task_clock, cpu_id, &val, BPF_ANY);
#endif

return delta / 1000000; // convert to ms

}
// although the "get_on_cpu_*" counter readers have some code duplication, they are inline code and the compiler will optimize this
static inline u64 get_on_cpu_cycles(u32 *cpu_id)
{
u64 delta = 0;
#ifdef BPF_PERF_EVENT_READ_VALUE_AVAILABLE
struct bpf_perf_event_value c = {};
int error = bpf_perf_event_read_value(&cpu_cycles_hc_reader, *cpu_id, &c, sizeof(struct bpf_perf_event_value));
int error = bpf_perf_event_read_value(&cpu_cycles_event_reader, *cpu_id, &c, sizeof(struct bpf_perf_event_value));
if (error == 0)
{
u64 val = c.counter;
@@ -98,7 +127,7 @@ static inline u64 get_on_cpu_cycles(u32 *cpu_id)
bpf_map_update_elem(&cpu_cycles, cpu_id, &val, BPF_ANY);
}
#else
int ret = bpf_perf_event_read(&cpu_cycles_hc_reader, *cpu_id);
int ret = bpf_perf_event_read(&cpu_cycles_event_reader, *cpu_id);
if (ret < 0) {
return delta;
}
@@ -116,7 +145,7 @@ static inline u64 get_on_cpu_ref_cycles(u32 *cpu_id)
u64 delta = 0;
#ifdef BPF_PERF_EVENT_READ_VALUE_AVAILABLE
struct bpf_perf_event_value c = {};
int error = bpf_perf_event_read_value(&cpu_ref_cycles_hc_reader, *cpu_id, &c, sizeof(struct bpf_perf_event_value));
int error = bpf_perf_event_read_value(&cpu_ref_cycles_event_reader, *cpu_id, &c, sizeof(struct bpf_perf_event_value));
if (error == 0)
{
u64 val = c.counter;
@@ -125,7 +154,7 @@ static inline u64 get_on_cpu_ref_cycles(u32 *cpu_id)
bpf_map_update_elem(&cpu_ref_cycles, cpu_id, &val, BPF_ANY);
}
#else
int ret = bpf_perf_event_read(&cpu_ref_cycles_hc_reader, *cpu_id);
int ret = bpf_perf_event_read(&cpu_ref_cycles_event_reader, *cpu_id);
if (ret < 0) {
return delta;
}
@@ -142,7 +171,7 @@ static inline u64 get_on_cpu_instr(u32 *cpu_id)
u64 delta = 0;
#ifdef BPF_PERF_EVENT_READ_VALUE_AVAILABLE
struct bpf_perf_event_value c = {};
int error = bpf_perf_event_read_value(&cpu_instructions_hc_reader, *cpu_id, &c, sizeof(struct bpf_perf_event_value));
int error = bpf_perf_event_read_value(&cpu_instructions_event_reader, *cpu_id, &c, sizeof(struct bpf_perf_event_value));
if (error == 0)
{
u64 val = c.counter;
@@ -151,7 +180,7 @@ static inline u64 get_on_cpu_instr(u32 *cpu_id)
bpf_map_update_elem(&cpu_instructions, cpu_id, &val, BPF_ANY);
}
#else
int ret = bpf_perf_event_read(&cpu_instructions_hc_reader, *cpu_id);
int ret = bpf_perf_event_read(&cpu_instructions_event_reader, *cpu_id);
if (ret < 0) {
return delta;
}
@@ -168,7 +197,7 @@ static inline u64 get_on_cpu_cache_miss(u32 *cpu_id)
u64 delta = 0;
#ifdef BPF_PERF_EVENT_READ_VALUE_AVAILABLE
struct bpf_perf_event_value c = {};
int error = bpf_perf_event_read_value(&cache_miss_hc_reader, *cpu_id, &c, sizeof(struct bpf_perf_event_value));
int error = bpf_perf_event_read_value(&cache_miss_event_reader, *cpu_id, &c, sizeof(struct bpf_perf_event_value));
if (error == 0)
{
u64 val = c.counter;
@@ -177,7 +206,7 @@ static inline u64 get_on_cpu_cache_miss(u32 *cpu_id)
bpf_map_update_elem(&cache_miss, cpu_id, &val, BPF_ANY);
}
#else
int ret = bpf_perf_event_read(&cache_miss_hc_reader, *cpu_id);
int ret = bpf_perf_event_read(&cache_miss_event_reader, *cpu_id);
if (ret < 0) {
return delta;
}
@@ -207,11 +236,8 @@ static inline u64 get_on_cpu_avg_freq(u32 *cpu_id, u64 on_cpu_cycles_delta, u64
return avg_freq;
}

SEC("tracepoint/sched/sched_switch")
int kepler_trace(struct sched_switch_args *ctx)
{
u32 next_pid = ctx->next_pid; // the new pid that is to be scheduled

SEC("kprobe/finish_task_switch")
int kprobe__finish_task_switch(struct pt_regs *ctx) {
// only do sampling if sample rate is set
if (sample_rate != 0)
{
@@ -223,42 +249,51 @@ int kepler_trace(struct sched_switch_args *ctx)
counter_sched_switch = sample_rate;
}

u32 cur_pid = bpf_get_current_pid_tgid();
u64 cgroup_id = bpf_get_current_cgroup_id(); // the cgroup id is the cgroup id of the running process (this is not next_pid or prev_pid)

u64 cur_ts = bpf_ktime_get_ns();
u32 cpu_id = bpf_get_smp_processor_id();
u32 prev_pid = ctx->prev_pid;
// Getting the task_struct of the task that is scheduled out
struct task_struct *prev_task = (struct task_struct *)PT_REGS_PARM1(ctx);
// Getting the tgid (userspace pid) of the scheduled-out task
u64 prev_tgid = BPF_CORE_READ(prev_task, tgid);
u32 prev_pid = prev_tgid & 0xffffffff;

u64 pid_tgid = bpf_get_current_pid_tgid();
pid_t cur_pid = pid_tgid & 0xffffffff;
u32 tgid = pid_tgid >> 32;
u64 cgroup_id = bpf_get_current_cgroup_id();
u32 cpu_id = bpf_get_smp_processor_id();
u64 cur_ts = bpf_ktime_get_ns();
u64 on_cpu_cycles_delta = get_on_cpu_cycles(&cpu_id);
u64 on_cpu_ref_cycles_delta = get_on_cpu_ref_cycles(&cpu_id);
u64 on_cpu_instr_delta = get_on_cpu_instr(&cpu_id);
u64 on_cpu_cache_miss_delta = get_on_cpu_cache_miss(&cpu_id);
u64 on_cpu_avg_freq = get_on_cpu_avg_freq(&cpu_id, on_cpu_cycles_delta, on_cpu_ref_cycles_delta);
u64 on_cpu_time_delta = get_on_cpu_time(next_pid, prev_pid, cur_ts);
u64 on_cpu_time_delta = get_on_cpu_time(cur_pid, prev_pid, cur_ts);
u64 on_task_clock_time_delta = get_on_cpu_task_clock_time(&cpu_id, cur_ts);

// store process metrics
struct process_metrics_t *process_metrics;
process_metrics = bpf_map_lookup_elem(&processes, &prev_pid);
if (process_metrics)
{
// update process time
process_metrics->process_run_time += on_cpu_time_delta;
process_metrics->task_clock_time += on_task_clock_time_delta;
process_metrics->cpu_cycles += on_cpu_cycles_delta;
process_metrics->cpu_instr += on_cpu_instr_delta;
process_metrics->cache_miss += on_cpu_cache_miss_delta;
}

// create new process metrics
process_metrics = bpf_map_lookup_elem(&processes, &cur_pid);
if (process_metrics == 0)
{
process_metrics_t new_process = {};
new_process.pid = cur_pid;
new_process.tgid = tgid;
new_process.cgroup_id = cgroup_id;
// bpf_probe_read(&new_process.comm, sizeof(new_process.comm), (void *)ctx->next_comm);
bpf_get_current_comm(&new_process.comm, sizeof(new_process.comm));
bpf_map_update_elem(&processes, &cur_pid, &new_process, BPF_NOEXIST);
}

return 0;
}

@@ -267,11 +302,14 @@ int kepler_irq_trace(struct trace_event_raw_softirq *ctx)
{
u32 cur_pid = bpf_get_current_pid_tgid();
struct process_metrics_t *process_metrics;
unsigned int vec = ctx->vec;
process_metrics = bpf_map_lookup_elem(&processes, &cur_pid);
if (process_metrics != 0)
{
if (ctx->vec < 10) {
process_metrics->vec_nr[ctx->vec] ++;
if (vec < IRQ_MAX_LEN) {
u16 count = process_metrics->vec_nr[vec];
count++;
process_metrics->vec_nr[vec] = count;
}
}
return 0;
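Because the scheduler hook is now a kprobe instead of the sched_switch tracepoint, the attach side changes as well. A hedged libbpf sketch of how the new program could be attached from C (Kepler's exporter does the equivalent from Go; the helper function here is illustrative):

#include <stdio.h>
#include <bpf/libbpf.h>

/* Attach the SEC("kprobe/finish_task_switch") program from a loaded object. */
static struct bpf_link *attach_sched_probe(struct bpf_object *obj)
{
	struct bpf_program *prog =
		bpf_object__find_program_by_name(obj, "kprobe__finish_task_switch");

	if (!prog) {
		fprintf(stderr, "kprobe__finish_task_switch not found in object\n");
		return NULL;
	}
	/* retprobe = false: fire on entry to finish_task_switch() */
	return bpf_program__attach_kprobe(prog, false, "finish_task_switch");
}

On some kernels the symbol is exported with an optimization suffix (for example finish_task_switch.isra.0), so a robust attacher resolves the real name from /proc/kallsyms before attaching.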
16 changes: 10 additions & 6 deletions bpfassets/libbpf/src/kepler.bpf.h
@@ -19,8 +19,11 @@ limitations under the License.
* Redefine it to just asm to enable successful compilation.
* see https://github.com/iovisor/bcc/commit/2d1497cde1cc9835f759a707b42dea83bee378b8 for more details
*/
#include <linux/types.h>
#include <linux/sched.h>
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_core_read.h>
#include <bpf/bpf_tracing.h>

#ifdef asm_inline
#undef asm_inline
#define asm_inline asm
@@ -30,9 +33,6 @@ typedef __u64 u64;
typedef __u32 u32;
typedef __u16 u16;

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#ifndef NUM_CPUS
#define NUM_CPUS 128
#endif
@@ -109,16 +109,20 @@ struct sched_switch_args {
int next_prio;
};

/*
struct trace_event_raw_softirq {
unsigned long long pad;
unsigned int vec;
};
*/

typedef struct process_metrics_t
{
u64 cgroup_id;
u64 pid;
u64 pid; // pid is the kernel space view of the thread id
u64 tgid; // tgid is the user space view of the pid
u64 process_run_time;
u64 task_clock_time;
u64 cpu_cycles;
u64 cpu_instr;
u64 cache_miss;