From d0daf6a41f8574c8dfd8ac6931994d1106ecc4b8 Mon Sep 17 00:00:00 2001
From: Brenden Blanco
Date: Thu, 5 Nov 2015 23:31:22 -0800
Subject: [PATCH] Add perf_output support for high rate events

This adds support for the bpf_perf_event_output helper. It is intended
for sending per-process events from a bpf program to userspace at a
high rate. The events emitted by the bpf program can be completely
customized.

Signed-off-by: Brenden Blanco
---
 examples/tracing/trace_perf_output.py       | 43 ++++++++++++++++
 src/cc/export/helpers.h                     | 23 +++++++++
 src/cc/frontends/clang/b_frontend_action.cc | 43 +++++++++++-----
 src/cc/libbpf.c                             | 51 ++++++++++++++++---
 src/cc/perf_reader.c                        | 78 ++++++++++++++++++++++++-----
 src/cc/perf_reader.h                        |  6 ++-
 src/libbpf.h                                |  1 +
 src/python/bcc/__init__.py                  | 32 ++++++++++--
 8 files changed, 237 insertions(+), 40 deletions(-)
 create mode 100644 examples/tracing/trace_perf_output.py

diff --git a/examples/tracing/trace_perf_output.py b/examples/tracing/trace_perf_output.py
new file mode 100644
index 0000000..3e28d0f
--- /dev/null
+++ b/examples/tracing/trace_perf_output.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python
+# Copyright (c) PLUMgrid, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+# This is an example of tracing an event and printing custom fields.
+# run in project examples directory with:
+# sudo ./trace_perf_output.py
+
+import atexit
+from bcc import BPF
+import ctypes
+
+counter = 0
+def cb(foo, data, size):
+    global counter
+    counter += 1
+
+prog = """
+BPF_PERF_ARRAY(events, 2);
+BPF_TABLE("array", int, u64, counters, 10);
+int kprobe__sys_write(void *ctx) {
+  struct {
+    u64 ts;
+  } data = {bpf_ktime_get_ns()};
+  if (events.perf_output(ctx, 0, &data, sizeof(data)) < 0)
+    bpf_trace_printk("perf_output failed\\n");
+  int zero = 0;
+  u64 *val = counters.lookup(&zero);
+  if (val) lock_xadd(val, 1);
+  return 0;
+}
+"""
+b = BPF(text=prog)
+b["events"].open_perf_buffer(0, cb, None)
+
+@atexit.register
+def print_counter():
+    global counter
+    global b
+    print("counter = %d vs %d" % (counter, b["counters"][ctypes.c_int(0)].value))
+
+while 1:
+    b.kprobe_poll()
diff --git a/src/cc/export/helpers.h b/src/cc/export/helpers.h
index 5c916e6..5dcb61b 100644
--- a/src/cc/export/helpers.h
+++ b/src/cc/export/helpers.h
@@ -42,6 +42,19 @@ struct _name##_table_t { \
 __attribute__((section("maps/" _table_type))) \
 struct _name##_table_t _name
 
+#define BPF_PERF_ARRAY(_name, _max_entries) \
+struct _name##_table_t { \
+  int key; \
+  u32 leaf; \
+  /* counter = map.perf_read(index) */ \
+  u64 (*perf_read) (int); \
+  /* map.perf_output(ctx, index, data, data_size) */ \
+  int (*perf_output) (void *, int, void *, u32); \
+  u32 data[_max_entries]; \
+}; \
+__attribute__((section("maps/perf_array"))) \
+struct _name##_table_t _name
+
 #define BPF_HASH1(_name) \
   BPF_TABLE("hash", u64, u64, _name, 10240)
 #define BPF_HASH2(_name, _key_type) \
@@ -117,6 +130,16 @@ static int (*bpf_skb_get_tunnel_key)(void *ctx, void *to, u32 size, u64 flags) =
   (void *) BPF_FUNC_skb_get_tunnel_key;
 static int (*bpf_skb_set_tunnel_key)(void *ctx, void *from, u32 size, u64 flags) =
   (void *) BPF_FUNC_skb_set_tunnel_key;
+static int (*bpf_perf_event_read)(void *map, u32 index) =
+  (void *) BPF_FUNC_perf_event_read;
+#endif
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,0)
+static int (*bpf_redirect)(int ifindex, u32 flags) =
+  (void *) BPF_FUNC_redirect;
+static u32 (*bpf_get_route_realm)(void *ctx) =
+  (void *) BPF_FUNC_get_route_realm;
+static int (*bpf_perf_event_output)(void *ctx, void *map, u32 index, void *data, u32 size) =
+  (void *) BPF_FUNC_perf_event_output;
 #endif
 
 /* llvm builtin functions that eBPF C program may use to
diff --git a/src/cc/frontends/clang/b_frontend_action.cc b/src/cc/frontends/clang/b_frontend_action.cc
index 3ec4efb..1f5bff3 100644
--- a/src/cc/frontends/clang/b_frontend_action.cc
+++ b/src/cc/frontends/clang/b_frontend_action.cc
@@ -332,6 +332,13 @@ bool BTypeVisitor::VisitCallExpr(CallExpr *Call) {
         }
         txt += "typeof(" + name + ".leaf) *_leaf = " + lookup + ", &_key); ";
         txt += "if (_leaf) (*_leaf)++; })";
+      } else if (memb_name == "perf_output") {
+        string name = Ref->getDecl()->getName();
+        string arg0 = rewriter_.getRewrittenText(SourceRange(Call->getArg(0)->getLocStart(),
+                                                             Call->getArg(0)->getLocEnd()));
+        string args_other = rewriter_.getRewrittenText(SourceRange(Call->getArg(1)->getLocStart(),
+                                                                   Call->getArg(3)->getLocEnd()));
+        txt = "bpf_perf_event_output(" + arg0 + ", bpf_pseudo_fd(1, " + fd + "), " + args_other + ")";
       } else {
         if (memb_name == "lookup") {
           prefix = "bpf_map_lookup_elem";
@@ -345,6 +352,9 @@ bool BTypeVisitor::VisitCallExpr(CallExpr *Call) {
         } else if (memb_name == "call") {
           prefix = "bpf_tail_call_";
           suffix = ")";
+        } else if (memb_name == "perf_read") {
+          prefix = "bpf_perf_event_read";
+          suffix = ")";
         } else {
           C.getDiagnostics().Report(Call->getLocStart(), diag::err_expected)
               << "valid bpf_table operation";
@@ -482,6 +492,13 @@ bool BTypeVisitor::VisitVarDecl(VarDecl *Decl) {
     }
     const RecordDecl *RD = R->getDecl()->getDefinition();
 
+    int major = 0, minor = 0;
+    struct utsname un;
+    if (uname(&un) == 0) {
+      // release format: <major>.<minor>.<patch>[-<suffix>]
+      sscanf(un.release, "%d.%d.", &major, &minor);
+    }
+
     TableDesc table;
     table.name = Decl->getName();
 
@@ -519,20 +536,20 @@ bool BTypeVisitor::VisitVarDecl(VarDecl *Decl) {
         diag_.Report(Decl->getLocStart(), diag_id) << table.leaf_desc;
       }
     } else if (A->getName() == "maps/prog") {
-      struct utsname un;
-      if (uname(&un) == 0) {
-        int major = 0, minor = 0;
-        // release format: <major>.<minor>.<patch>[-<suffix>]
-        sscanf(un.release, "%d.%d.", &major, &minor);
-        if (KERNEL_VERSION(major,minor,0) >= KERNEL_VERSION(4,2,0))
-          map_type = BPF_MAP_TYPE_PROG_ARRAY;
-      }
-      if (map_type == BPF_MAP_TYPE_UNSPEC) {
-        C.getDiagnostics().Report(Decl->getLocStart(), diag::err_expected)
-            << "kernel supporting maps/prog";
-        return false;
-      }
+      if (KERNEL_VERSION(major,minor,0) >= KERNEL_VERSION(4,2,0))
+        map_type = BPF_MAP_TYPE_PROG_ARRAY;
+    } else if (A->getName() == "maps/perf_array") {
+      if (KERNEL_VERSION(major,minor,0) >= KERNEL_VERSION(4,3,0))
+        map_type = BPF_MAP_TYPE_PERF_EVENT_ARRAY;
     }
+
+    if (map_type == BPF_MAP_TYPE_UNSPEC) {
+      unsigned diag_id = C.getDiagnostics().getCustomDiagID(DiagnosticsEngine::Error,
+                                                            "unsupported map type: %0");
+      C.getDiagnostics().Report(Decl->getLocStart(), diag_id) << A->getName();
+      return false;
+    }
+
     table.type = map_type;
     table.fd = bpf_create_map(map_type, table.key_size, table.leaf_size, table.max_entries);
     if (table.fd < 0) {
diff --git a/src/cc/libbpf.c b/src/cc/libbpf.c
index bda83ff..c7da5fe 100644
--- a/src/cc/libbpf.c
+++ b/src/cc/libbpf.c
@@ -178,8 +178,8 @@ int bpf_attach_socket(int sock, int prog) {
 
 static int bpf_attach_tracing_event(int progfd, const char *event_path,
                                     struct perf_reader *reader, int pid, int cpu, int group_fd) {
-  int efd = -1, rc = -1, pfd = -1;
-  ssize_t bytes = -1;
+  int efd = -1, rc = -1, pfd;
+  ssize_t bytes;
   char buf[256];
   struct perf_event_attr attr = {};
 
@@ -206,8 +206,9 @@ static int bpf_attach_tracing_event(int progfd, const char *event_path,
     perror("perf_event_open");
     goto cleanup;
   }
+  perf_reader_set_fd(reader, pfd);
 
-  if (perf_reader_mmap(reader, pfd, attr.sample_type) < 0)
+  if (perf_reader_mmap(reader, attr.type, attr.sample_type) < 0)
     goto cleanup;
 
   if (ioctl(pfd, PERF_EVENT_IOC_SET_BPF, progfd) < 0) {
@@ -219,14 +220,11 @@ static int bpf_attach_tracing_event(int progfd, const char *event_path,
     goto cleanup;
   }
 
-  rc = pfd;
-  pfd = -1;
+  rc = 0;
 
 cleanup:
   if (efd >= 0)
     close(efd);
-  if (pfd >= 0)
-    close(pfd);
 
   return rc;
 }
@@ -239,7 +237,7 @@ void * bpf_attach_kprobe(int progfd, const char *event,
   char buf[256];
   struct perf_reader *reader = NULL;
 
-  reader = perf_reader_new(-1, 8, cb, cb_cookie);
+  reader = perf_reader_new(cb, NULL, cb_cookie);
   if (!reader)
     goto cleanup;
 
@@ -292,3 +290,40 @@ cleanup:
 
   return rc;
 }
+void * bpf_open_perf_buffer(perf_reader_raw_cb raw_cb, void *cb_cookie) {
+  int rc = -1, pfd;
+  struct perf_event_attr attr = {};
+
+  struct perf_reader *reader = perf_reader_new(NULL, raw_cb, cb_cookie);
+
+  if (!reader)
+    goto cleanup;
+
+  attr.config = PERF_COUNT_SW_BPF_OUTPUT;
+  attr.type = PERF_TYPE_SOFTWARE;
+  attr.sample_type = PERF_SAMPLE_RAW;
+  pfd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC);
+  if (pfd < 0) {
+    perror("perf_event_open");
+    goto cleanup;
+  }
+  perf_reader_set_fd(reader, pfd);
+
+  if (perf_reader_mmap(reader, attr.type, attr.sample_type) < 0)
+    goto cleanup;
+
+  if (ioctl(pfd, PERF_EVENT_IOC_ENABLE, 0) < 0) {
+    perror("ioctl(PERF_EVENT_IOC_ENABLE)");
+    goto cleanup;
+  }
+
+  rc = 0;
+
+cleanup:
+  if (reader && rc < 0) {
+    perf_reader_free(reader);
+    reader = NULL;
+  }
+
+  return reader;
+}
diff --git a/src/cc/perf_reader.c b/src/cc/perf_reader.c
index 467e704..938edff 100644
--- a/src/cc/perf_reader.c
+++ b/src/cc/perf_reader.c
@@ -26,8 +26,11 @@
 #include "libbpf.h"
 #include "perf_reader.h"
 
+int perf_reader_page_cnt = 8;
+
 struct perf_reader {
   perf_reader_cb cb;
+  perf_reader_raw_cb raw_cb;
   void *cb_cookie; // to be returned in the cb
   void *buf; // for keeping segmented data
   size_t buf_size;
@@ -35,18 +38,20 @@ struct perf_reader {
   int page_size;
   int page_cnt;
   int fd;
+  uint32_t type;
   uint64_t sample_type;
 };
 
-struct perf_reader * perf_reader_new(int fd, int page_cnt, perf_reader_cb cb, void *cb_cookie) {
+struct perf_reader * perf_reader_new(perf_reader_cb cb, perf_reader_raw_cb raw_cb, void *cb_cookie) {
   struct perf_reader *reader = calloc(1, sizeof(struct perf_reader));
   if (!reader)
     return NULL;
   reader->cb = cb;
+  reader->raw_cb = raw_cb;
   reader->cb_cookie = cb_cookie;
-  reader->fd = fd;
+  reader->fd = -1;
   reader->page_size = getpagesize();
-  reader->page_cnt = page_cnt;
+  reader->page_cnt = perf_reader_page_cnt;
   return reader;
 }
 
@@ -61,18 +66,20 @@ void perf_reader_free(void *ptr) {
   }
 }
 
-int perf_reader_mmap(struct perf_reader *reader, int fd, uint64_t sample_type) {
+int perf_reader_mmap(struct perf_reader *reader, unsigned type, unsigned long sample_type) {
   int mmap_size = reader->page_size * (reader->page_cnt + 1);
 
-  if (!reader->cb)
-    return 0;
+  if (reader->fd < 0) {
+    fprintf(stderr, "%s: reader fd is not set\n", __FUNCTION__);
+    return -1;
+  }
 
-  reader->base = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE , MAP_SHARED, fd, 0);
+  reader->base = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE , MAP_SHARED, reader->fd, 0);
   if (reader->base == MAP_FAILED) {
     perror("mmap");
     return -1;
   }
-  reader->fd = fd;
+  reader->type = type;
   reader->sample_type = sample_type;
 
   return 0;
@@ -90,7 +97,7 @@ struct perf_sample_trace_kprobe {
   uint64_t ip;
 };
 
-static void sample_parse(struct perf_reader *reader, void *data, int size) {
+static void parse_tracepoint(struct perf_reader *reader, void *data, int size) {
   uint8_t *ptr = data;
   struct perf_event_header *header = (void *)data;
 
@@ -153,6 +160,40 @@ static void sample_parse(struct perf_reader *reader, void *data, int size) {
 
   reader->cb(reader->cb_cookie, tk ? tk->common.pid : -1, num_callchain, callchain);
 }
+static void parse_sw(struct perf_reader *reader, void *data, int size) {
+  uint8_t *ptr = data;
+  struct perf_event_header *header = (void *)data;
+
+  struct {
+    uint32_t size;
+    char data[0];
+  } *raw = NULL;
+
+  ptr += sizeof(*header);
+  if (ptr > (uint8_t *)data + size) {
+    fprintf(stderr, "%s: corrupt sample header\n", __FUNCTION__);
+    return;
+  }
+
+  if (reader->sample_type & PERF_SAMPLE_RAW) {
+    raw = (void *)ptr;
+    ptr += sizeof(raw->size) + raw->size;
+    if (ptr > (uint8_t *)data + size) {
+      fprintf(stderr, "%s: corrupt raw sample\n", __FUNCTION__);
+      return;
+    }
+  }
+
+  // sanity check
+  if (ptr != (uint8_t *)data + size) {
+    fprintf(stderr, "%s: extra data at end of sample\n", __FUNCTION__);
+    return;
+  }
+
+  if (reader->raw_cb)
+    reader->raw_cb(reader->cb_cookie, raw->data, raw->size);
+}
+
 static uint64_t read_data_head(struct perf_event_mmap_page *perf_header) {
   uint64_t data_head = *((volatile uint64_t *)&perf_header->data_head);
   asm volatile("" ::: "memory");
@@ -194,12 +235,16 @@ static void event_read(struct perf_reader *reader) {
       ptr = reader->buf;
     }
 
-    if (e->type == PERF_RECORD_LOST)
+    if (e->type == PERF_RECORD_LOST) {
       fprintf(stderr, "Lost %lu samples\n", *(uint64_t *)(ptr + sizeof(*e)));
-    else if (e->type == PERF_RECORD_SAMPLE)
-      sample_parse(reader, ptr, e->size);
-    else
+    } else if (e->type == PERF_RECORD_SAMPLE) {
+      if (reader->type == PERF_TYPE_TRACEPOINT)
+        parse_tracepoint(reader, ptr, e->size);
+      else if (reader->type == PERF_TYPE_SOFTWARE)
+        parse_sw(reader, ptr, e->size);
+    } else {
       fprintf(stderr, "%s: unknown sample type %d\n", __FUNCTION__, e->type);
+    }
 
     write_data_tail(perf_header, perf_header->data_tail + e->size);
   }
@@ -223,3 +268,10 @@ int perf_reader_poll(int num_readers, struct perf_reader **readers, int timeout)
 
   return 0;
 }
+void perf_reader_set_fd(struct perf_reader *reader, int fd) {
+  reader->fd = fd;
+}
+
+int perf_reader_fd(struct perf_reader *reader) {
+  return reader->fd;
+}
diff --git a/src/cc/perf_reader.h b/src/cc/perf_reader.h
index 024c71f..fdf1c75 100644
--- a/src/cc/perf_reader.h
+++ b/src/cc/perf_reader.h
@@ -16,7 +16,9 @@
 
 struct perf_reader;
 
-struct perf_reader * perf_reader_new(int fd, int page_cnt, perf_reader_cb cb, void *cb_cookie);
+struct perf_reader * perf_reader_new(perf_reader_cb cb, perf_reader_raw_cb raw_cb, void *cb_cookie);
 void perf_reader_free(void *ptr);
-int perf_reader_mmap(struct perf_reader *reader, int fd, unsigned long sample_type);
+int perf_reader_mmap(struct perf_reader *reader, unsigned type, unsigned long sample_type);
 int perf_reader_poll(int num_readers, struct perf_reader **readers, int timeout);
+int perf_reader_fd(struct perf_reader *reader);
+void perf_reader_set_fd(struct perf_reader *reader, int fd);
diff --git a/src/libbpf.h b/src/libbpf.h
index 1484c31..0204799 100644
--- a/src/libbpf.h
+++ b/src/libbpf.h
@@ -42,6 +42,7 @@ int bpf_open_raw_sock(const char *name);
 
 typedef void (*perf_reader_cb)(void *cb_cookie, int pid, uint64_t callchain_num,
                                void *callchain);
+typedef void (*perf_reader_raw_cb)(void *cb_cookie, void *raw, int raw_size);
 
 void * bpf_attach_kprobe(int progfd, const char *event, const char *event_desc,
                          int pid, int cpu, int group_fd, perf_reader_cb cb,
diff --git a/src/python/bcc/__init__.py b/src/python/bcc/__init__.py
index f7a1973..a3c7597 100644
--- a/src/python/bcc/__init__.py
+++ b/src/python/bcc/__init__.py
@@ -89,14 +89,19 @@ lib.bpf_prog_load.argtypes = [ct.c_int, ct.c_void_p, ct.c_size_t,
 lib.bpf_attach_kprobe.restype = ct.c_void_p
 _CB_TYPE = ct.CFUNCTYPE(None, ct.py_object, ct.c_int,
         ct.c_ulonglong, ct.POINTER(ct.c_ulonglong))
+_RAW_CB_TYPE = ct.CFUNCTYPE(None, ct.py_object, ct.c_void_p, ct.c_int)
 lib.bpf_attach_kprobe.argtypes = [ct.c_int, ct.c_char_p, ct.c_char_p, ct.c_int,
         ct.c_int, ct.c_int, _CB_TYPE, ct.py_object]
 lib.bpf_detach_kprobe.restype = ct.c_int
 lib.bpf_detach_kprobe.argtypes = [ct.c_char_p]
+lib.bpf_open_perf_buffer.restype = ct.c_void_p
+lib.bpf_open_perf_buffer.argtypes = [_RAW_CB_TYPE, ct.py_object]
 lib.perf_reader_poll.restype = ct.c_int
 lib.perf_reader_poll.argtypes = [ct.c_int, ct.POINTER(ct.c_void_p), ct.c_int]
 lib.perf_reader_free.restype = None
 lib.perf_reader_free.argtypes = [ct.c_void_p]
+lib.perf_reader_fd.restype = int
+lib.perf_reader_fd.argtypes = [ct.c_void_p]
 
 open_kprobes = {}
 tracefile = None
@@ -111,8 +116,9 @@ stars_max = 40
 def cleanup_kprobes():
     for k, v in open_kprobes.items():
         lib.perf_reader_free(v)
-        desc = "-:kprobes/%s" % k
-        lib.bpf_detach_kprobe(desc.encode("ascii"))
+        if isinstance(k, str):
+            desc = "-:kprobes/%s" % k
+            lib.bpf_detach_kprobe(desc.encode("ascii"))
     open_kprobes.clear()
     if tracefile:
         tracefile.close()
@@ -126,6 +132,7 @@ class BPF(object):
     HASH = 1
     ARRAY = 2
     PROG_ARRAY = 3
+    PERF_EVENT_ARRAY = 4
 
     class Function(object):
         def __init__(self, bpf, name, fd):
@@ -178,6 +185,21 @@ class BPF(object):
                 raise Exception("Could not scanf leaf")
             return leaf
 
+        def open_perf_buffer(self, key, cb, cookie):
+            reader = lib.bpf_open_perf_buffer(_RAW_CB_TYPE(cb),
+                    ct.cast(id(cookie), ct.py_object))
+            if not reader:
+                raise Exception("Could not open perf buffer")
+            fd = lib.perf_reader_fd(reader)
+            self[self.Key(key)] = self.Leaf(fd)
+            open_kprobes[(id(self), key)] = reader
+
+        def close_perf_buffer(self, key):
+            reader = open_kprobes.get((id(self), key))
+            if reader:
+                lib.perf_reader_free(reader)
+                del(open_kprobes[(id(self), key)])
+
         def __getitem__(self, key):
             key_p = ct.pointer(key)
             leaf = self.Leaf()
@@ -208,7 +230,7 @@ class BPF(object):
             ttype = lib.bpf_table_type_id(self.bpf.module, self.map_id)
             # Deleting from array type maps does not have an effect, so
             # zero out the entry instead.
-            if ttype in (BPF.ARRAY, BPF.PROG_ARRAY):
+            if ttype in (BPF.ARRAY, BPF.PROG_ARRAY, BPF.PERF_EVENT_ARRAY):
                 leaf = self.Leaf()
                 leaf_p = ct.pointer(leaf)
                 res = lib.bpf_update_elem(self.map_fd,
@@ -216,6 +238,8 @@ class BPF(object):
                         ct.cast(leaf_p, ct.c_void_p), 0)
                 if res < 0:
                     raise Exception("Could not clear item")
+                if ttype == BPF.PERF_EVENT_ARRAY:
+                    self.close_perf_buffer(key)
             else:
                 res = lib.bpf_delete_elem(self.map_fd,
                         ct.cast(key_p, ct.c_void_p))
@@ -792,5 +816,5 @@ class BPF(object):
         try:
             lib.perf_reader_poll(len(open_kprobes), readers, timeout)
         except KeyboardInterrupt:
-            pass
+            exit()
-- 
2.7.4
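
Note (not part of this patch): the callback registered with open_perf_buffer() receives the raw bytes that the bpf program emitted with perf_output(), so a consumer can decode them with ctypes rather than only counting events. A minimal sketch, assuming the `struct { u64 ts; }` layout used in trace_perf_output.py above; the Data class name and the print format are illustrative only:

import ctypes as ct

# Must mirror the struct emitted by the bpf program, including any padding.
class Data(ct.Structure):
    _fields_ = [("ts", ct.c_ulonglong)]

def print_event(cookie, raw, size):
    # raw points at `size` bytes copied out of the perf ring buffer
    event = ct.cast(raw, ct.POINTER(Data)).contents
    print("timestamp: %d ns" % event.ts)

b["events"].open_perf_buffer(0, print_event, None)

The field list has to match the bpf-side struct exactly; if the struct grows, keep the two definitions in sync.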