account cpu time in ebpf (#1185)
don't use cgroup id 1 if cgroup id is invalid
choose the right cgroup id path from /proc/pid/cgroup
use kernel thread group id (i.e. userspace pid) to aggregate processes to reduce search overhead
add finish_task_switch kprobe
switch to kprobe to trace context switch
change to co-re to avoid crash
review feedback
clean up
cpu time: use per cpu array to track process cpu time
review feedback: use task_clock instead of cpu_clock
review feedback
add to changelog
rebase

Signed-off-by: Huamin Chen <[email protected]>
rootfs authored Jan 23, 2024
1 parent bebe5e2 commit 36e7a87
Showing 14 changed files with 120,519 additions and 55 deletions.
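One recurring point in the commit message above, aggregating processes by the kernel thread group id, relies on how BPF packs both ids into a single value. The following is a minimal sketch of that split as used in the new kepler.bpf.c; the helper name is illustrative, not from this commit.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

/* bpf_get_current_pid_tgid() packs the thread group id (what userspace calls
 * the PID) into the upper 32 bits and the kernel thread id into the lower
 * 32 bits; keying metrics by tgid groups all threads of a process. */
static __always_inline void split_pid_tgid(u32 *pid, u32 *tgid)
{
	u64 pid_tgid = bpf_get_current_pid_tgid();

	*pid = (u32)pid_tgid;   /* kernel-space view: per-thread id */
	*tgid = pid_tgid >> 32; /* user-space view: process id */
}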
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,5 @@
in kepler 0.7 release
- switch to libbpf as default ebpf provider
- base image update: decouple the GPU driver from the kepler image itself
- use kprobe instead of tracepoint for ebpf to obtain context switch information
- add task clock event to ebpf and use it to calculate cpu usage for each process. The event is also exported to prometheus
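The task clock changelog entry corresponds to the new task_clock_event_reader perf array and task_clock map in kepler.bpf.c below. As a rough sketch of what the userspace side has to provide, the following C snippet opens a PERF_COUNT_SW_TASK_CLOCK event per CPU and stores its file descriptor in the reader map so that bpf_perf_event_read() can see it; Kepler's exporter does the equivalent from Go, and the function names here are assumptions, not code from this commit.

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <bpf/bpf.h>

/* Open a software task-clock counter on one CPU (all tasks, pid = -1). */
static int open_task_clock_event(int cpu)
{
	struct perf_event_attr attr = {
		.type = PERF_TYPE_SOFTWARE,
		.size = sizeof(struct perf_event_attr),
		.config = PERF_COUNT_SW_TASK_CLOCK,
	};

	return syscall(__NR_perf_event_open, &attr, -1, cpu, -1, 0);
}

/* Store each per-CPU perf fd at its CPU index in task_clock_event_reader. */
static int attach_task_clock(int reader_map_fd, int ncpus)
{
	for (int cpu = 0; cpu < ncpus; cpu++) {
		int fd = open_task_clock_event(cpu);

		if (fd < 0)
			return -1;
		if (bpf_map_update_elem(reader_map_fd, &cpu, &fd, BPF_ANY))
			return -1;
	}
	return 0;
}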
9 changes: 8 additions & 1 deletion bpfassets/libbpf/Makefile
@@ -4,9 +4,16 @@ GOARCH=$(shell go env GOARCH)
TARGET := kepler
TARGET_BPF := $(TARGET).bpf.o
BPF_SRC := bpfassets/libbpf/src/*.bpf.c
# in libbpf if $(ARCH) is x86_64, then set TARGET_ARCH to x86
ifeq ($(ARCH),x86_64)
TARGET_ARCH := x86
else
TARGET_ARCH := $(ARCH)
endif

$(TARGET_BPF): $(BPF_SRC)
clang \
-I /usr/include/$(ARCH)-linux-gnu \
-D __TARGET_ARCH_$(TARGET_ARCH) \
-O2 -g -c -target bpf \
-o bpfassets/libbpf/bpf.o/$(GOARCH)_$@ $<
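The TARGET_ARCH mapping and the new -D __TARGET_ARCH_$(TARGET_ARCH) flag exist because clang does not define the host architecture macros when building with -target bpf, and bpf/bpf_tracing.h needs an explicit __TARGET_ARCH_* define before it exposes the PT_REGS_PARM* accessors that the CO-RE kprobe in kepler.bpf.c relies on. A minimal sketch of what the flag enables (program and license names are illustrative, not part of this diff):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char LICENSE[] SEC("license") = "Dual BSD/GPL";

SEC("kprobe/finish_task_switch")
int sketch_finish_task_switch(struct pt_regs *ctx)
{
	/* PT_REGS_PARM1() resolves to the right register only when a
	 * __TARGET_ARCH_* macro (e.g. __TARGET_ARCH_x86) is defined. */
	struct task_struct *prev = (struct task_struct *)PT_REGS_PARM1(ctx);

	(void)prev;
	return 0;
}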
Binary file modified bpfassets/libbpf/bpf.o/amd64_kepler.bpf.o
Binary file not shown.
96 changes: 67 additions & 29 deletions bpfassets/libbpf/src/kepler.bpf.c
@@ -29,18 +29,21 @@ BPF_HASH(processes, u32, process_metrics_t);
BPF_HASH(pid_time, u32, u64);

// perf counters
BPF_PERF_ARRAY(cpu_cycles_hc_reader);
BPF_PERF_ARRAY(cpu_cycles_event_reader);
BPF_ARRAY(cpu_cycles, u64);

BPF_PERF_ARRAY(cpu_ref_cycles_hc_reader);
BPF_PERF_ARRAY(cpu_ref_cycles_event_reader);
BPF_ARRAY(cpu_ref_cycles, u64);

BPF_PERF_ARRAY(cpu_instructions_hc_reader);
BPF_PERF_ARRAY(cpu_instructions_event_reader);
BPF_ARRAY(cpu_instructions, u64);

BPF_PERF_ARRAY(cache_miss_hc_reader);
BPF_PERF_ARRAY(cache_miss_event_reader);
BPF_ARRAY(cache_miss, u64);

BPF_PERF_ARRAY(task_clock_event_reader);
BPF_ARRAY(task_clock, u64);

// cpu freq counters
BPF_ARRAY(cpu_freq_array, u32);

@@ -62,13 +65,12 @@ static inline u64 get_on_cpu_time(u32 cur_pid, u32 prev_pid, u64 cur_ts)
// timestamp later than the recorded off-CPU event, or vice versa.
if (cur_ts > *prev_ts)
{
cpu_time = (cur_ts - *prev_ts) / 1000000; /*milisecond*/
cpu_time = (cur_ts - *prev_ts) / 1000000 ; // convert to ms
bpf_map_delete_elem(&pid_time, &prev_pid_key);
}
}
pid_time_t new_pid_key = {.pid = cur_pid};
bpf_map_update_elem(&pid_time, &new_pid_key, &cur_ts, BPF_NOEXIST);

return cpu_time;
}

@@ -83,13 +85,40 @@ static inline u64 calc_delta(u64 *prev_val, u64 *val)
return delta;
}

static inline u64 get_on_cpu_task_clock_time(u32 *cpu_id, u64 cur_ts)
{
u64 delta = 0;
#ifdef BPF_PERF_EVENT_READ_VALUE_AVAILABLE
struct bpf_perf_event_value c = {};
int error = bpf_perf_event_read_value(&task_clock_event_reader, *cpu_id, &c, sizeof(struct bpf_perf_event_value));
if (error == 0)
{
u64 val = c.counter;
u64 *prev_val = bpf_map_lookup_elem(&task_clock, cpu_id);
delta = calc_delta(prev_val, &val);
bpf_map_update_elem(&task_clock, cpu_id, &val, BPF_ANY);
}
#else
int ret = bpf_perf_event_read(&task_clock_event_reader, *cpu_id);
if (ret < 0) {
return delta;
}
u64 val = ret;
u64 *prev_val = bpf_map_lookup_elem(&task_clock, cpu_id);
delta = calc_delta(prev_val, &val);
bpf_map_update_elem(&task_clock, cpu_id, &val, BPF_ANY);
#endif

return delta / 1000000; // convert to ms

}
// although the "get_on_cpu_*" counter readers have some code duplication, they are inline code and the compiler will optimize this
static inline u64 get_on_cpu_cycles(u32 *cpu_id)
{
u64 delta = 0;
#ifdef BPF_PERF_EVENT_READ_VALUE_AVAILABLE
struct bpf_perf_event_value c = {};
int error = bpf_perf_event_read_value(&cpu_cycles_hc_reader, *cpu_id, &c, sizeof(struct bpf_perf_event_value));
int error = bpf_perf_event_read_value(&cpu_cycles_event_reader, *cpu_id, &c, sizeof(struct bpf_perf_event_value));
if (error == 0)
{
u64 val = c.counter;
@@ -98,7 +127,7 @@ static inline u64 get_on_cpu_cycles(u32 *cpu_id)
bpf_map_update_elem(&cpu_cycles, cpu_id, &val, BPF_ANY);
}
#else
int ret = bpf_perf_event_read(&cpu_cycles_hc_reader, *cpu_id);
int ret = bpf_perf_event_read(&cpu_cycles_event_reader, *cpu_id);
if (ret < 0) {
return delta;
}
@@ -116,7 +145,7 @@ static inline u64 get_on_cpu_ref_cycles(u32 *cpu_id)
u64 delta = 0;
#ifdef BPF_PERF_EVENT_READ_VALUE_AVAILABLE
struct bpf_perf_event_value c = {};
int error = bpf_perf_event_read_value(&cpu_ref_cycles_hc_reader, *cpu_id, &c, sizeof(struct bpf_perf_event_value));
int error = bpf_perf_event_read_value(&cpu_ref_cycles_event_reader, *cpu_id, &c, sizeof(struct bpf_perf_event_value));
if (error == 0)
{
u64 val = c.counter;
@@ -125,7 +154,7 @@ static inline u64 get_on_cpu_ref_cycles(u32 *cpu_id)
bpf_map_update_elem(&cpu_ref_cycles, cpu_id, &val, BPF_ANY);
}
#else
int ret = bpf_perf_event_read(&cpu_ref_cycles_hc_reader, *cpu_id);
int ret = bpf_perf_event_read(&cpu_ref_cycles_event_reader, *cpu_id);
if (ret < 0) {
return delta;
}
@@ -142,7 +171,7 @@ static inline u64 get_on_cpu_instr(u32 *cpu_id)
u64 delta = 0;
#ifdef BPF_PERF_EVENT_READ_VALUE_AVAILABLE
struct bpf_perf_event_value c = {};
int error = bpf_perf_event_read_value(&cpu_instructions_hc_reader, *cpu_id, &c, sizeof(struct bpf_perf_event_value));
int error = bpf_perf_event_read_value(&cpu_instructions_event_reader, *cpu_id, &c, sizeof(struct bpf_perf_event_value));
if (error == 0)
{
u64 val = c.counter;
@@ -151,7 +180,7 @@ static inline u64 get_on_cpu_instr(u32 *cpu_id)
bpf_map_update_elem(&cpu_instructions, cpu_id, &val, BPF_ANY);
}
#else
int ret = bpf_perf_event_read(&cpu_instructions_hc_reader, *cpu_id);
int ret = bpf_perf_event_read(&cpu_instructions_event_reader, *cpu_id);
if (ret < 0) {
return delta;
}
@@ -168,7 +197,7 @@ static inline u64 get_on_cpu_cache_miss(u32 *cpu_id)
u64 delta = 0;
#ifdef BPF_PERF_EVENT_READ_VALUE_AVAILABLE
struct bpf_perf_event_value c = {};
int error = bpf_perf_event_read_value(&cache_miss_hc_reader, *cpu_id, &c, sizeof(struct bpf_perf_event_value));
int error = bpf_perf_event_read_value(&cache_miss_event_reader, *cpu_id, &c, sizeof(struct bpf_perf_event_value));
if (error == 0)
{
u64 val = c.counter;
@@ -177,7 +206,7 @@ static inline u64 get_on_cpu_cache_miss(u32 *cpu_id)
bpf_map_update_elem(&cache_miss, cpu_id, &val, BPF_ANY);
}
#else
int ret = bpf_perf_event_read(&cache_miss_hc_reader, *cpu_id);
int ret = bpf_perf_event_read(&cache_miss_event_reader, *cpu_id);
if (ret < 0) {
return delta;
}
@@ -207,11 +236,8 @@ static inline u64 get_on_cpu_avg_freq(u32 *cpu_id, u64 on_cpu_cycles_delta, u64
return avg_freq;
}

SEC("tracepoint/sched/sched_switch")
int kepler_trace(struct sched_switch_args *ctx)
{
u32 next_pid = ctx->next_pid; // the new pid that is to be scheduled

SEC("kprobe/finish_task_switch")
int kprobe__finish_task_switch(struct pt_regs *ctx) {
// only do sampling if sample rate is set
if (sample_rate != 0)
{
@@ -223,42 +249,51 @@ int kepler_trace(struct sched_switch_args *ctx)
counter_sched_switch = sample_rate;
}

u32 cur_pid = bpf_get_current_pid_tgid();
u64 cgroup_id = bpf_get_current_cgroup_id(); // the cgroup id is the cgroup id of the running process (this is not next_pid or prev_pid)

u64 cur_ts = bpf_ktime_get_ns();
u32 cpu_id = bpf_get_smp_processor_id();
u32 prev_pid = ctx->prev_pid;
// Getting the task_struct of the task that is scheduled out
struct task_struct *prev_task = (struct task_struct *)PT_REGS_PARM1(ctx);
// Getting the tgid (userspace pid) of the scheduled-out task
u64 prev_tgid = BPF_CORE_READ(prev_task, tgid);
u32 prev_pid = prev_tgid & 0xffffffff;

u64 pid_tgid = bpf_get_current_pid_tgid();
pid_t cur_pid = pid_tgid & 0xffffffff;
u32 tgid = pid_tgid >> 32;
u64 cgroup_id = bpf_get_current_cgroup_id();
u32 cpu_id = bpf_get_smp_processor_id();
u64 cur_ts = bpf_ktime_get_ns();
u64 on_cpu_cycles_delta = get_on_cpu_cycles(&cpu_id);
u64 on_cpu_ref_cycles_delta = get_on_cpu_ref_cycles(&cpu_id);
u64 on_cpu_instr_delta = get_on_cpu_instr(&cpu_id);
u64 on_cpu_cache_miss_delta = get_on_cpu_cache_miss(&cpu_id);
u64 on_cpu_avg_freq = get_on_cpu_avg_freq(&cpu_id, on_cpu_cycles_delta, on_cpu_ref_cycles_delta);
u64 on_cpu_time_delta = get_on_cpu_time(next_pid, prev_pid, cur_ts);
u64 on_cpu_time_delta = get_on_cpu_time(cur_pid, prev_pid, cur_ts);
u64 on_task_clock_time_delta = get_on_cpu_task_clock_time(&cpu_id, cur_ts);

// store process metrics
struct process_metrics_t *process_metrics;
process_metrics = bpf_map_lookup_elem(&processes, &prev_pid);
if (process_metrics)
{
// update process time
process_metrics->process_run_time += on_cpu_time_delta;
process_metrics->task_clock_time += on_task_clock_time_delta;
process_metrics->cpu_cycles += on_cpu_cycles_delta;
process_metrics->cpu_instr += on_cpu_instr_delta;
process_metrics->cache_miss += on_cpu_cache_miss_delta;
}

// create new process metrics
process_metrics = bpf_map_lookup_elem(&processes, &cur_pid);
if (process_metrics == 0)
{
process_metrics_t new_process = {};
new_process.pid = cur_pid;
new_process.tgid = tgid;
new_process.cgroup_id = cgroup_id;
// bpf_probe_read(&new_process.comm, sizeof(new_process.comm), (void *)ctx->next_comm);
bpf_get_current_comm(&new_process.comm, sizeof(new_process.comm));
bpf_map_update_elem(&processes, &cur_pid, &new_process, BPF_NOEXIST);
}

return 0;
}

@@ -267,11 +302,14 @@ int kepler_irq_trace(struct trace_event_raw_softirq *ctx)
{
u32 cur_pid = bpf_get_current_pid_tgid();
struct process_metrics_t *process_metrics;
unsigned int vec = ctx->vec;
process_metrics = bpf_map_lookup_elem(&processes, &cur_pid);
if (process_metrics != 0)
{
if (ctx->vec < 10) {
process_metrics->vec_nr[ctx->vec] ++;
if (vec < IRQ_MAX_LEN) {
u16 count = process_metrics->vec_nr[vec];
count++;
process_metrics->vec_nr[vec] = count;
}
}
return 0;
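Because the scheduler hook is now a kprobe instead of the sched_switch tracepoint, the attach side changes as well. A hedged libbpf sketch of how the new program could be attached from C (Kepler's exporter does the equivalent from Go; the helper function here is illustrative):

#include <stdio.h>
#include <bpf/libbpf.h>

/* Attach the SEC("kprobe/finish_task_switch") program from a loaded object. */
static struct bpf_link *attach_sched_probe(struct bpf_object *obj)
{
	struct bpf_program *prog =
		bpf_object__find_program_by_name(obj, "kprobe__finish_task_switch");

	if (!prog) {
		fprintf(stderr, "kprobe__finish_task_switch not found in object\n");
		return NULL;
	}
	/* retprobe = false: fire on entry to finish_task_switch() */
	return bpf_program__attach_kprobe(prog, false, "finish_task_switch");
}

On some kernels the symbol is exported with an optimization suffix (for example finish_task_switch.isra.0), so a robust attacher resolves the real name from /proc/kallsyms before attaching.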
16 changes: 10 additions & 6 deletions bpfassets/libbpf/src/kepler.bpf.h
@@ -19,8 +19,11 @@ limitations under the License.
* Redefine it to just asm to enable successful compilation.
* see https://github.com/iovisor/bcc/commit/2d1497cde1cc9835f759a707b42dea83bee378b8 for more details
*/
#include <linux/types.h>
#include <linux/sched.h>
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_core_read.h>
#include <bpf/bpf_tracing.h>

#ifdef asm_inline
#undef asm_inline
#define asm_inline asm
@@ -30,9 +33,6 @@ typedef __u64 u64;
typedef __u32 u32;
typedef __u16 u16;

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#ifndef NUM_CPUS
#define NUM_CPUS 128
#endif
@@ -109,16 +109,20 @@ struct sched_switch_args {
int next_prio;
};

/*
struct trace_event_raw_softirq {
unsigned long long pad;
unsigned int vec;
};
*/

typedef struct process_metrics_t
{
u64 cgroup_id;
u64 pid;
u64 pid; // pid is the kernel space view of the thread id
u64 tgid; // tgid is the user space view of the pid
u64 process_run_time;
u64 task_clock_time;
u64 cpu_cycles;
u64 cpu_instr;
u64 cache_miss;