Skip to content

Commit

Permalink
libbpf: add perf buffer API
Browse files Browse the repository at this point in the history
BPF_MAP_TYPE_PERF_EVENT_ARRAY map is often used to send data from BPF program
to user space for additional processing. libbpf already has very low-level API
to read single CPU perf buffer, bpf_perf_event_read_simple(), but it's hard to
use and requires a lot of code to set everything up. This patch adds
perf_buffer abstraction on top of it, abstracting setting up and polling
per-CPU logic into simple and convenient API, similar to what BCC provides.

perf_buffer__new() sets up per-CPU ring buffers and updates corresponding BPF
map entries. It accepts two user-provided callbacks: one for handling raw
samples and one for get notifications of lost samples due to buffer overflow.

perf_buffer__new_raw() is similar, but provides more control over how
perf events are set up (by accepting user-provided perf_event_attr), how
they are handled (perf_event_header pointer is passed directly to
user-provided callback), and on which CPUs ring buffers are created
(it's possible to provide a list of CPUs and corresponding map keys to
update). This API allows advanced users fuller control.

perf_buffer__poll() is used to fetch ring buffer data across all CPUs,
utilizing epoll instance.

perf_buffer__free() does corresponding clean up and unsets FDs from BPF map.

All APIs are not thread-safe. User should ensure proper locking/coordination if
used in multi-threaded set up.

Signed-off-by: Andrii Nakryiko <[email protected]>
Acked-by: Yonghong Song <[email protected]>
Signed-off-by: Daniel Borkmann <[email protected]>
  • Loading branch information
anakryiko authored and borkmann committed Jul 8, 2019
1 parent c3ec002 commit fb84b82
Show file tree
Hide file tree
Showing 3 changed files with 419 additions and 0 deletions.
366 changes: 366 additions & 0 deletions tools/lib/bpf/libbpf.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,9 @@
#include <linux/limits.h>
#include <linux/perf_event.h>
#include <linux/ring_buffer.h>
#include <sys/epoll.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/vfs.h>
Expand Down Expand Up @@ -4354,6 +4356,370 @@ bpf_perf_event_read_simple(void *mmap_mem, size_t mmap_size, size_t page_size,
return ret;
}

struct perf_buffer;

struct perf_buffer_params {
struct perf_event_attr *attr;
/* if event_cb is specified, it takes precendence */
perf_buffer_event_fn event_cb;
/* sample_cb and lost_cb are higher-level common-case callbacks */
perf_buffer_sample_fn sample_cb;
perf_buffer_lost_fn lost_cb;
void *ctx;
int cpu_cnt;
int *cpus;
int *map_keys;
};

struct perf_cpu_buf {
struct perf_buffer *pb;
void *base; /* mmap()'ed memory */
void *buf; /* for reconstructing segmented data */
size_t buf_size;
int fd;
int cpu;
int map_key;
};

struct perf_buffer {
perf_buffer_event_fn event_cb;
perf_buffer_sample_fn sample_cb;
perf_buffer_lost_fn lost_cb;
void *ctx; /* passed into callbacks */

size_t page_size;
size_t mmap_size;
struct perf_cpu_buf **cpu_bufs;
struct epoll_event *events;
int cpu_cnt;
int epoll_fd; /* perf event FD */
int map_fd; /* BPF_MAP_TYPE_PERF_EVENT_ARRAY BPF map FD */
};

static void perf_buffer__free_cpu_buf(struct perf_buffer *pb,
struct perf_cpu_buf *cpu_buf)
{
if (!cpu_buf)
return;
if (cpu_buf->base &&
munmap(cpu_buf->base, pb->mmap_size + pb->page_size))
pr_warning("failed to munmap cpu_buf #%d\n", cpu_buf->cpu);
if (cpu_buf->fd >= 0) {
ioctl(cpu_buf->fd, PERF_EVENT_IOC_DISABLE, 0);
close(cpu_buf->fd);
}
free(cpu_buf->buf);
free(cpu_buf);
}

void perf_buffer__free(struct perf_buffer *pb)
{
int i;

if (!pb)
return;
if (pb->cpu_bufs) {
for (i = 0; i < pb->cpu_cnt && pb->cpu_bufs[i]; i++) {
struct perf_cpu_buf *cpu_buf = pb->cpu_bufs[i];

bpf_map_delete_elem(pb->map_fd, &cpu_buf->map_key);
perf_buffer__free_cpu_buf(pb, cpu_buf);
}
free(pb->cpu_bufs);
}
if (pb->epoll_fd >= 0)
close(pb->epoll_fd);
free(pb->events);
free(pb);
}

static struct perf_cpu_buf *
perf_buffer__open_cpu_buf(struct perf_buffer *pb, struct perf_event_attr *attr,
int cpu, int map_key)
{
struct perf_cpu_buf *cpu_buf;
char msg[STRERR_BUFSIZE];
int err;

cpu_buf = calloc(1, sizeof(*cpu_buf));
if (!cpu_buf)
return ERR_PTR(-ENOMEM);

cpu_buf->pb = pb;
cpu_buf->cpu = cpu;
cpu_buf->map_key = map_key;

cpu_buf->fd = syscall(__NR_perf_event_open, attr, -1 /* pid */, cpu,
-1, PERF_FLAG_FD_CLOEXEC);
if (cpu_buf->fd < 0) {
err = -errno;
pr_warning("failed to open perf buffer event on cpu #%d: %s\n",
cpu, libbpf_strerror_r(err, msg, sizeof(msg)));
goto error;
}

cpu_buf->base = mmap(NULL, pb->mmap_size + pb->page_size,
PROT_READ | PROT_WRITE, MAP_SHARED,
cpu_buf->fd, 0);
if (cpu_buf->base == MAP_FAILED) {
cpu_buf->base = NULL;
err = -errno;
pr_warning("failed to mmap perf buffer on cpu #%d: %s\n",
cpu, libbpf_strerror_r(err, msg, sizeof(msg)));
goto error;
}

if (ioctl(cpu_buf->fd, PERF_EVENT_IOC_ENABLE, 0) < 0) {
err = -errno;
pr_warning("failed to enable perf buffer event on cpu #%d: %s\n",
cpu, libbpf_strerror_r(err, msg, sizeof(msg)));
goto error;
}

return cpu_buf;

error:
perf_buffer__free_cpu_buf(pb, cpu_buf);
return (struct perf_cpu_buf *)ERR_PTR(err);
}

static struct perf_buffer *__perf_buffer__new(int map_fd, size_t page_cnt,
struct perf_buffer_params *p);

struct perf_buffer *perf_buffer__new(int map_fd, size_t page_cnt,
const struct perf_buffer_opts *opts)
{
struct perf_buffer_params p = {};
struct perf_event_attr attr = {
.config = PERF_COUNT_SW_BPF_OUTPUT,
.type = PERF_TYPE_SOFTWARE,
.sample_type = PERF_SAMPLE_RAW,
.sample_period = 1,
.wakeup_events = 1,
};

p.attr = &attr;
p.sample_cb = opts ? opts->sample_cb : NULL;
p.lost_cb = opts ? opts->lost_cb : NULL;
p.ctx = opts ? opts->ctx : NULL;

return __perf_buffer__new(map_fd, page_cnt, &p);
}

struct perf_buffer *
perf_buffer__new_raw(int map_fd, size_t page_cnt,
const struct perf_buffer_raw_opts *opts)
{
struct perf_buffer_params p = {};

p.attr = opts->attr;
p.event_cb = opts->event_cb;
p.ctx = opts->ctx;
p.cpu_cnt = opts->cpu_cnt;
p.cpus = opts->cpus;
p.map_keys = opts->map_keys;

return __perf_buffer__new(map_fd, page_cnt, &p);
}

static struct perf_buffer *__perf_buffer__new(int map_fd, size_t page_cnt,
struct perf_buffer_params *p)
{
struct bpf_map_info map = {};
char msg[STRERR_BUFSIZE];
struct perf_buffer *pb;
__u32 map_info_len;
int err, i;

if (page_cnt & (page_cnt - 1)) {
pr_warning("page count should be power of two, but is %zu\n",
page_cnt);
return ERR_PTR(-EINVAL);
}

map_info_len = sizeof(map);
err = bpf_obj_get_info_by_fd(map_fd, &map, &map_info_len);
if (err) {
err = -errno;
pr_warning("failed to get map info for map FD %d: %s\n",
map_fd, libbpf_strerror_r(err, msg, sizeof(msg)));
return ERR_PTR(err);
}

if (map.type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) {
pr_warning("map '%s' should be BPF_MAP_TYPE_PERF_EVENT_ARRAY\n",
map.name);
return ERR_PTR(-EINVAL);
}

pb = calloc(1, sizeof(*pb));
if (!pb)
return ERR_PTR(-ENOMEM);

pb->event_cb = p->event_cb;
pb->sample_cb = p->sample_cb;
pb->lost_cb = p->lost_cb;
pb->ctx = p->ctx;

pb->page_size = getpagesize();
pb->mmap_size = pb->page_size * page_cnt;
pb->map_fd = map_fd;

pb->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
if (pb->epoll_fd < 0) {
err = -errno;
pr_warning("failed to create epoll instance: %s\n",
libbpf_strerror_r(err, msg, sizeof(msg)));
goto error;
}

if (p->cpu_cnt > 0) {
pb->cpu_cnt = p->cpu_cnt;
} else {
pb->cpu_cnt = libbpf_num_possible_cpus();
if (pb->cpu_cnt < 0) {
err = pb->cpu_cnt;
goto error;
}
if (map.max_entries < pb->cpu_cnt)
pb->cpu_cnt = map.max_entries;
}

pb->events = calloc(pb->cpu_cnt, sizeof(*pb->events));
if (!pb->events) {
err = -ENOMEM;
pr_warning("failed to allocate events: out of memory\n");
goto error;
}
pb->cpu_bufs = calloc(pb->cpu_cnt, sizeof(*pb->cpu_bufs));
if (!pb->cpu_bufs) {
err = -ENOMEM;
pr_warning("failed to allocate buffers: out of memory\n");
goto error;
}

for (i = 0; i < pb->cpu_cnt; i++) {
struct perf_cpu_buf *cpu_buf;
int cpu, map_key;

cpu = p->cpu_cnt > 0 ? p->cpus[i] : i;
map_key = p->cpu_cnt > 0 ? p->map_keys[i] : i;

cpu_buf = perf_buffer__open_cpu_buf(pb, p->attr, cpu, map_key);
if (IS_ERR(cpu_buf)) {
err = PTR_ERR(cpu_buf);
goto error;
}

pb->cpu_bufs[i] = cpu_buf;

err = bpf_map_update_elem(pb->map_fd, &map_key,
&cpu_buf->fd, 0);
if (err) {
err = -errno;
pr_warning("failed to set cpu #%d, key %d -> perf FD %d: %s\n",
cpu, map_key, cpu_buf->fd,
libbpf_strerror_r(err, msg, sizeof(msg)));
goto error;
}

pb->events[i].events = EPOLLIN;
pb->events[i].data.ptr = cpu_buf;
if (epoll_ctl(pb->epoll_fd, EPOLL_CTL_ADD, cpu_buf->fd,
&pb->events[i]) < 0) {
err = -errno;
pr_warning("failed to epoll_ctl cpu #%d perf FD %d: %s\n",
cpu, cpu_buf->fd,
libbpf_strerror_r(err, msg, sizeof(msg)));
goto error;
}
}

return pb;

error:
if (pb)
perf_buffer__free(pb);
return ERR_PTR(err);
}

struct perf_sample_raw {
struct perf_event_header header;
uint32_t size;
char data[0];
};

struct perf_sample_lost {
struct perf_event_header header;
uint64_t id;
uint64_t lost;
uint64_t sample_id;
};

static enum bpf_perf_event_ret
perf_buffer__process_record(struct perf_event_header *e, void *ctx)
{
struct perf_cpu_buf *cpu_buf = ctx;
struct perf_buffer *pb = cpu_buf->pb;
void *data = e;

/* user wants full control over parsing perf event */
if (pb->event_cb)
return pb->event_cb(pb->ctx, cpu_buf->cpu, e);

switch (e->type) {
case PERF_RECORD_SAMPLE: {
struct perf_sample_raw *s = data;

if (pb->sample_cb)
pb->sample_cb(pb->ctx, cpu_buf->cpu, s->data, s->size);
break;
}
case PERF_RECORD_LOST: {
struct perf_sample_lost *s = data;

if (pb->lost_cb)
pb->lost_cb(pb->ctx, cpu_buf->cpu, s->lost);
break;
}
default:
pr_warning("unknown perf sample type %d\n", e->type);
return LIBBPF_PERF_EVENT_ERROR;
}
return LIBBPF_PERF_EVENT_CONT;
}

static int perf_buffer__process_records(struct perf_buffer *pb,
struct perf_cpu_buf *cpu_buf)
{
enum bpf_perf_event_ret ret;

ret = bpf_perf_event_read_simple(cpu_buf->base, pb->mmap_size,
pb->page_size, &cpu_buf->buf,
&cpu_buf->buf_size,
perf_buffer__process_record, cpu_buf);
if (ret != LIBBPF_PERF_EVENT_CONT)
return ret;
return 0;
}

int perf_buffer__poll(struct perf_buffer *pb, int timeout_ms)
{
int i, cnt, err;

cnt = epoll_wait(pb->epoll_fd, pb->events, pb->cpu_cnt, timeout_ms);
for (i = 0; i < cnt; i++) {
struct perf_cpu_buf *cpu_buf = pb->events[i].data.ptr;

err = perf_buffer__process_records(pb, cpu_buf);
if (err) {
pr_warning("error while processing records: %d\n", err);
return err;
}
}
return cnt < 0 ? -errno : cnt;
}

struct bpf_prog_info_array_desc {
int array_offset; /* e.g. offset of jited_prog_insns */
int count_offset; /* e.g. offset of jited_prog_len */
Expand Down
Loading

0 comments on commit fb84b82

Please sign in to comment.