libbpf-tools/ksnoop: kernel argument/return value tracing/display using BTF
author Alan Maguire <32452915+alan-maguire@users.noreply.github.com>
Mon, 6 Sep 2021 04:09:46 +0000 (05:09 +0100)
committer GitHub <noreply@github.com>
Mon, 6 Sep 2021 04:09:46 +0000 (21:09 -0700)
BPF Type Format (BTF) provides a description of kernel data structures.
Support was recently added to libbpf - btf_dump__dump_type_data() -
which uses the BTF id of the associated type to create a string
representation of the data provided.  For example, to create a string
representation of a "struct sk_buff", the pointer to the skb
data is provided along with the type id of "struct sk_buff".
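
To give a flavour of that API, here is a minimal userspace sketch;
the libbpf calls are the same ones ksnoop.c uses below, while
"type_id", "data" and "data_sz" are placeholders for the BTF id and
the bytes copied from the kernel:

    /* vprintf-style callback that receives the rendered data */
    static void printf_fn(void *ctx, const char *fmt, va_list args)
    {
            vprintf(fmt, args);
    }

    struct btf_dump_opts opts = { };
    struct btf *btf = btf__load_vmlinux_btf();
    struct btf_dump *d = btf_dump__new(btf, NULL, &opts, printf_fn);
    DECLARE_LIBBPF_OPTS(btf_dump_type_data_opts, dump_opts);

    /* renders "(struct sk_buff){ ... }" if type_id is sk_buff's id */
    btf_dump__dump_type_data(d, type_id, data, data_sz, &dump_opts);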

Here that functionality is utilized to support tracing kernel
function entry and return using k[ret]probes.  The "struct pt_regs"
context can be used to derive arguments and return values, and
when the user supplies a function name, we:

- look it up in /proc/kallsyms to find its address/module
- look it up in the BTF kernel/module data to get types of arguments
  and return value
- store a map representation of the trace information, keyed by
  function address
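
For reference, the /proc/kallsyms lines consulted in the first step
look like this (addresses here are illustrative; the trailing
[module] field is only present for module symbols):

  ffffffffb63a1c20 T ip_send_skb
  ffffffffc0a81150 t iwl_trans_send_cmd [iwlwifi]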

On function entry/return we look up info about the arguments (is
it a pointer? what size of data do we copy?) and call bpf_probe_read()
to copy the data into our trace buffers.  These are then sent via
perf event to userspace, and since we know the associated BTF id,
we can dump the typed data using btf_dump__dump_type_data().
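
As a flavour of the entry side, a stripped-down, hand-written sketch
for a single hard-coded function follows (map lookups and perf event
plumbing omitted; ksnoop's real, map-driven program is in
ksnoop.bpf.c below):

    SEC("kprobe/ip_send_skb")
    int entry(struct pt_regs *ctx)
    {
            /* the skb is the second argument of ip_send_skb() */
            void *skb = (void *)PT_REGS_PARM2_CORE(ctx);
            char buf[64];

            /* copy the start of the skb into a trace buffer;
             * userspace can render it using the BTF type id of
             * "struct sk_buff".
             */
            bpf_probe_read(buf, sizeof(buf), skb);
            return 0;
    }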

ksnoop can be used to show function signatures; for example:

$ ksnoop info ip_send_skb
int  ip_send_skb(struct net  * net, struct sk_buff  * skb);

Then we can trace the function, for example:

$ ksnoop trace ip_send_skb
            TIME  CPU      PID FUNCTION/ARGS
  78101668506811    1     2813 ip_send_skb(
                                   net = *(0xffffffffb5959840)
                                    (struct net){
                                     .passive = (refcount_t){
                                      .refs = (atomic_t){
                                       .counter = (int)0x2,
                                      },
                                     },
                                     .dev_base_seq = (unsigned int)0x18,
                                     .ifindex = (int)0xf,
                                     .list = (struct list_head){
                                      .next = (struct list_head *)0xffff9895
                                      .prev = (struct list_head *)0xffffffff
                                     },
[output truncated]

  78178228354796    1     2813 ip_send_skb(
                                   return =
                                    (int)0x0
                               );

We see the raw value of pointers along with the typed representation
of the data they point to.

Up to five arguments are supported.

The arguments are referred to via name (e.g. skb, net), and
the return value is referred to as "return" (using the keyword
ensures we can never clash with an argument name).
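
For example, to trace only the return value:

$ ksnoop "ip_send_skb(return)"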

ksnoop can select specific arguments/return value rather
than tracing everything; for example:

$ ksnoop "ip_send_skb(skb)"

...will only trace the skb argument.  A single level of
reference is also supported; for example:

$ ksnoop "ip_send_skb(skb->sk)"

Simple predicates (==, !=, <, <=, >, >=) can also be specified;
for example, to show skbs where the length is > 255:

$ ksnoop "ip_rcv(skb->len > 0xff,skb)"
            TIME  CPU      PID FUNCTION/ARGS
  32461869484376    1     2955 ip_rcv(
                                   skb->len =
                                    (unsigned int)0x127,
                                   skb = *(0xffff89c99623a000)
                                    (struct sk_buff){
                                     (union){
                                      .sk = (struct sock *)0xffff89c880b37000,
                                      .ip_defrag_offset = (int)0x80b37000,
                                     },

We can also specify a combination of entry/return predicates;
when such a combination is specified, data on entry (assuming
it matches the predicate) is "stashed" for retrieval on return.
This allows us to ask questions like "show entry arguments for
function foo when it returned a non-zero value indicating error":

$ ksnoop "sock_sendmsg(skb, return != 0)"

Multiple functions can also be specified; for example:
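
$ ksnoop trace ip_send_skb ip_rcv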

In addition, using "stack" (-s) mode, it is possible to specify that
a sequence of functions should be traced, but only if function
A calls function B (either directly or indirectly).  For example,
in specifying

$ ksnoop -s tcp_sendmsg __tcp_transmit_skb  ip_output

...we are saying we are only interested in tcp_sendmsg() function
calls that in turn issue calls to __tcp_transmit_skb(), and these
in turn eventually call ip_output(), and that we only want to
see their entry and return.  This mode is useful for investigating
behaviour with a specific stack signature, allowing us to see
function/argument information for specific call chains only.

Finally, module support is included too, provided module BTF is
present in /sys/kernel/btf:

$ ksnoop iwl_trans_send_cmd
            TIME  CPU      PID FUNCTION/ARGS
  80046971419383    3     1038 iwl_trans_send_cmd(
                                   trans = *(0xffff989564d20028)
                                    (struct iwl_trans){
                                     .ops = (struct iwl_trans_ops *)0xffffff
                                     .op_mode = (struct iwl_op_mode *)0xffff
                                     .trans_cfg = (struct iwl_cfg_trans_para

The goal pursued here is not to add another tracer to the world -
there are plenty of those - but rather to demonstrate feature usage
for deep data display in the hope that other tracing technologies
make use of this functionality.  In the meantime, having a simple
tracer like this plugs the gap and can be quite helpful for kernel
debugging.

Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
libbpf-tools/.gitignore
libbpf-tools/Makefile
libbpf-tools/ksnoop.bpf.c [new file with mode: 0644]
libbpf-tools/ksnoop.c [new file with mode: 0644]
libbpf-tools/ksnoop.h [new file with mode: 0644]
man/man8/ksnoop.8 [new file with mode: 0644]
src/cc/libbpf

index cd79bf29ccc90ddd5d4e1cdd181b3680cd97168e..8ea9bf8094fc929252e99e3ce703f90187330d4a 100644 (file)
@@ -22,6 +22,7 @@
 /funclatency
 /gethostlatency
 /hardirqs
+/ksnoop
 /llcstat
 /nfsdist
 /nfsslower
index 21fd5127bfeb358997f2adab2431491b44e07ea2..f2c4707cb5642a685df3d3b62dde21725572eec1 100644 (file)
@@ -35,6 +35,7 @@ APPS = \
        funclatency \
        gethostlatency \
        hardirqs \
+       ksnoop \
        llcstat \
        mountsnoop \
        numamove \
diff --git a/libbpf-tools/ksnoop.bpf.c b/libbpf-tools/ksnoop.bpf.c
new file mode 100644 (file)
index 0000000..13342e5
--- /dev/null
@@ -0,0 +1,457 @@
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+/* Copyright (c) 2021, Oracle and/or its affiliates. */
+
+#include "vmlinux.h"
+
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+
+#include "ksnoop.h"
+
+/* For kretprobes, the instruction pointer in the struct pt_regs context
+ * is the kretprobe_trampoline.  We derive the instruction pointer
+ * by pushing it onto a function stack on entry and popping it on return.
+ *
+ * We could use bpf_get_func_ip(), but "stack mode" - where we
+ * specify functions "a", "b" and "c" and only want to see a trace if "a"
+ * calls "b" and "b" calls "c" - utilizes this stack to determine if trace
+ * data should be collected.
+ */
+#define FUNC_MAX_STACK_DEPTH   16
+
+#ifndef ENOSPC
+#define ENOSPC                 28
+#endif
+
+struct func_stack {
+       __u64 task;
+       __u64 ips[FUNC_MAX_STACK_DEPTH];
+       __u8 stack_depth;
+};
+
+#define MAX_TASKS              2048
+
+/* function call stack hashed on a per-task key */
+struct {
+       __uint(type, BPF_MAP_TYPE_HASH);
+       /* function call stack for functions we are tracing */
+       __uint(max_entries, MAX_TASKS);
+       __type(key, __u64);
+       __type(value, struct func_stack);
+} ksnoop_func_stack SEC(".maps");
+
+/* per-cpu trace info hashed on function address */
+struct {
+       __uint(type, BPF_MAP_TYPE_PERCPU_HASH);
+       __uint(max_entries, MAX_FUNC_TRACES);
+       __type(key, __u64);
+       __type(value, struct trace);
+} ksnoop_func_map SEC(".maps");
+
+struct {
+       __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
+       __uint(value_size, sizeof(int));
+       __uint(key_size, sizeof(int));
+} ksnoop_perf_map SEC(".maps");
+
+static void clear_trace(struct trace *trace)
+{
+       __builtin_memset(&trace->trace_data, 0, sizeof(trace->trace_data));
+       trace->data_flags = 0;
+       trace->buf_len = 0;
+}
+
+static struct trace *get_trace(struct pt_regs *ctx, bool entry)
+{
+       __u8 stack_depth, last_stack_depth;
+       struct func_stack *func_stack;
+       __u64 ip, last_ip = 0, task;
+       struct trace *trace;
+
+       task = bpf_get_current_task();
+
+       func_stack = bpf_map_lookup_elem(&ksnoop_func_stack, &task);
+       if (!func_stack) {
+               struct func_stack new_stack = { .task = task };
+
+               bpf_map_update_elem(&ksnoop_func_stack, &task, &new_stack,
+                                   BPF_NOEXIST);
+               func_stack = bpf_map_lookup_elem(&ksnoop_func_stack, &task);
+               if (!func_stack)
+                       return NULL;
+       }
+
+       stack_depth = func_stack->stack_depth;
+       if (stack_depth > FUNC_MAX_STACK_DEPTH)
+               return NULL;
+
+       if (entry) {
+               ip = KSNOOP_IP_FIX(PT_REGS_IP_CORE(ctx));
+               if (stack_depth >= FUNC_MAX_STACK_DEPTH - 1)
+                       return NULL;
+               /* verifier doesn't like using "stack_depth - 1" as array index
+                * directly.
+                */
+               last_stack_depth = stack_depth - 1;
+               /* get address of last function we called */
+               if (last_stack_depth >= 0 &&
+                   last_stack_depth < FUNC_MAX_STACK_DEPTH)
+                       last_ip = func_stack->ips[last_stack_depth];
+               /* push ip onto stack. return will pop it. */
+               func_stack->ips[stack_depth++] = ip;
+               func_stack->stack_depth = stack_depth;
+               /* rather than zero stack entries on popping, we zero the
+                * (stack_depth + 1)'th entry when pushing the current
+                * entry.  The reason we take this approach is that
+                * when tracking the set of functions we returned from,
+                * we want the history of functions we returned from to
+                * be preserved.
+                */
+               if (stack_depth < FUNC_MAX_STACK_DEPTH)
+                       func_stack->ips[stack_depth] = 0;
+       } else {
+               if (stack_depth == 0 || stack_depth >= FUNC_MAX_STACK_DEPTH)
+                       return NULL;
+               last_stack_depth = stack_depth;
+               /* get address of last function we returned from */
+               if (last_stack_depth >= 0 &&
+                   last_stack_depth < FUNC_MAX_STACK_DEPTH)
+                       last_ip = func_stack->ips[last_stack_depth];
+               if (stack_depth > 0)
+                       stack_depth = stack_depth - 1;
+               /* retrieve ip from stack as IP in pt_regs is
+                * bpf kretprobe trampoline address.
+                */
+               if (stack_depth >= 0 && stack_depth < FUNC_MAX_STACK_DEPTH)
+                       ip = func_stack->ips[stack_depth];
+               if (stack_depth >= 0 && stack_depth < FUNC_MAX_STACK_DEPTH)
+                       func_stack->stack_depth = stack_depth;
+       }
+
+       trace = bpf_map_lookup_elem(&ksnoop_func_map, &ip);
+       if (!trace)
+               return NULL;
+
+       /* we may stash data on entry since predicates are a mix
+        * of entry/return; in such cases, trace->flags specifies
+        * KSNOOP_F_STASH, and we will output stashed data on return.
+        * If returning, make sure we don't clear our stashed data.
+        */
+       if (!entry && (trace->flags & KSNOOP_F_STASH)) {
+               if (!(trace->data_flags & KSNOOP_F_STASHED)) {
+                       /* predicate must have failed */
+                       return NULL;
+               }
+               /* skip clearing trace data */
+       } else {
+       } else {
+               /* clear trace data before starting. */
+               clear_trace(trace);
+       }
+
+       if (entry) {
+               /* if in stack mode, check if previous fn matches */
+               if (trace->prev_ip && trace->prev_ip != last_ip)
+                       return NULL;
+               /* if tracing intermediate fn in stack of fns, stash data. */
+               if (trace->next_ip)
+                       trace->data_flags |= KSNOOP_F_STASH;
+               /* we may stash data on entry since predicates are a mix
+                * of entry/return; in such cases, trace->flags specifies
+                * KSNOOP_F_STASH, and we will output stashed data on return.
+                */
+               if (trace->flags & KSNOOP_F_STASH)
+                       trace->data_flags |= KSNOOP_F_STASH;
+               /* otherwise the data is outputted (because we've reached
+                * the last fn in the set of fns specified).
+                */
+       } else {
+               /* In stack mode, check if next fn matches the last fn
+                * we returned from; i.e. "a" called "b", and now
+                * we're at "a", was the last fn we returned from "b"?
+                * If so, stash data for later display (when we reach the
+                * first fn in the set of stack fns).
+                */
+               if (trace->next_ip && trace->next_ip != last_ip)
+                       return NULL;
+               if (trace->prev_ip)
+                       trace->data_flags |= KSNOOP_F_STASH;
+               /* If there is no "prev" function, i.e. we are at the
+                * first function in a set of stack functions, the trace
+                * info is shown (along with any stashed info associated
+                * with callers).
+                */
+       }
+       trace->task = task;
+       return trace;
+}
+
+static void output_trace(struct pt_regs *ctx, struct trace *trace)
+{
+       __u16 trace_len;
+
+       if (trace->buf_len == 0)
+               goto skip;
+
+       /* we may be simply stashing values, and will report later */
+       if (trace->data_flags & KSNOOP_F_STASH) {
+               trace->data_flags &= ~KSNOOP_F_STASH;
+               trace->data_flags |= KSNOOP_F_STASHED;
+               return;
+       }
+       /* we may be outputting earlier stashed data */
+       if (trace->data_flags & KSNOOP_F_STASHED)
+               trace->data_flags &= ~KSNOOP_F_STASHED;
+
+       /* trim perf event size to only contain data we've recorded. */
+       trace_len = sizeof(*trace) + trace->buf_len - MAX_TRACE_BUF;
+
+       if (trace_len <= sizeof(*trace))
+               bpf_perf_event_output(ctx, &ksnoop_perf_map,
+                                     BPF_F_CURRENT_CPU,
+                                     trace, trace_len);
+skip:
+       clear_trace(trace);
+}
+
+static void output_stashed_traces(struct pt_regs *ctx,
+                                        struct trace *currtrace,
+                                        bool entry)
+{
+       struct func_stack *func_stack;
+       struct trace *trace = NULL;
+       __u8 stack_depth, i;
+       __u64 task = 0;
+
+       task = bpf_get_current_task();
+       func_stack = bpf_map_lookup_elem(&ksnoop_func_stack, &task);
+       if (!func_stack)
+               return;
+
+       stack_depth = func_stack->stack_depth;
+
+       if (entry) {
+               /* iterate from bottom to top of stack, outputting stashed
+                * data we find.  This corresponds to the set of functions
+                * we called before the current function.
+                */
+               for (i = 0;
+                    i < func_stack->stack_depth - 1 && i < FUNC_MAX_STACK_DEPTH;
+                    i++) {
+                       trace = bpf_map_lookup_elem(&ksnoop_func_map,
+                                                   &func_stack->ips[i]);
+                       if (!trace || !(trace->data_flags & KSNOOP_F_STASHED))
+                               break;
+                       if (trace->task != task)
+                               return;
+                       output_trace(ctx, trace);
+               }
+       } else {
+               /* iterate from top to bottom of stack, outputting stashed
+                * data we find.  This corresponds to the set of functions
+                * that returned prior to the current returning function.
+                */
+               for (i = FUNC_MAX_STACK_DEPTH - 1; i > 0; i--) {
+                       __u64 ip;
+
+                       ip = func_stack->ips[i];
+                       if (!ip)
+                               continue;
+                       trace = bpf_map_lookup_elem(&ksnoop_func_map, &ip);
+                       if (!trace || !(trace->data_flags & KSNOOP_F_STASHED))
+                               break;
+                       if (trace->task != task)
+                               return;
+                       output_trace(ctx, trace);
+               }
+       }
+       /* finally output the current trace info */
+       output_trace(ctx, currtrace);
+}
+
+static __u64 get_arg(struct pt_regs *ctx, enum arg argnum)
+{
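+       /* PT_REGS_PARM[1-5]_CORE and PT_REGS_RC_CORE are CO-RE-based
+        * reads of the registers used by the architecture calling
+        * convention (e.g. rdi/rsi/rdx/rcx/r8 and rax on x86_64).
+        */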
+       switch (argnum) {
+       case KSNOOP_ARG1:
+               return PT_REGS_PARM1_CORE(ctx);
+       case KSNOOP_ARG2:
+               return PT_REGS_PARM2_CORE(ctx);
+       case KSNOOP_ARG3:
+               return PT_REGS_PARM3_CORE(ctx);
+       case KSNOOP_ARG4:
+               return PT_REGS_PARM4_CORE(ctx);
+       case KSNOOP_ARG5:
+               return PT_REGS_PARM5_CORE(ctx);
+       case KSNOOP_RETURN:
+               return PT_REGS_RC_CORE(ctx);
+       default:
+               return 0;
+       }
+}
+
+static int ksnoop(struct pt_regs *ctx, bool entry)
+{
+       void *data_ptr = NULL;
+       struct trace *trace;
+       struct func *func;
+       __u16 trace_len;
+       __u64 data, pg;
+       __u32 currpid;
+       int ret;
+       __u8 i;
+
+       trace = get_trace(ctx, entry);
+       if (!trace)
+               return 0;
+
+       func = &trace->func;
+
+       /* make sure we want events from this pid */
+       currpid = bpf_get_current_pid_tgid();
+       if (trace->filter_pid && trace->filter_pid != currpid)
+               return 0;
+       trace->pid = currpid;
+
+       trace->cpu = bpf_get_smp_processor_id();
+       trace->time = bpf_ktime_get_ns();
+
+       trace->data_flags &= ~(KSNOOP_F_ENTRY | KSNOOP_F_RETURN);
+       if (entry)
+               trace->data_flags |= KSNOOP_F_ENTRY;
+       else
+               trace->data_flags |= KSNOOP_F_RETURN;
+
+
+       for (i = 0; i < MAX_TRACES; i++) {
+               struct trace_data *currdata;
+               struct value *currtrace;
+               char *buf_offset = NULL;
+               __u32 tracesize;
+
+               currdata = &trace->trace_data[i];
+               currtrace = &trace->traces[i];
+
+               if ((entry && !base_arg_is_entry(currtrace->base_arg)) ||
+                   (!entry && base_arg_is_entry(currtrace->base_arg)))
+                       continue;
+
+               /* skip void (unused) trace arguments, ensuring not to
+                * skip "void *".
+                */
+               if (currtrace->type_id == 0 &&
+                   !(currtrace->flags & KSNOOP_F_PTR))
+                       continue;
+
+               data = get_arg(ctx, currtrace->base_arg);
+
+               /* look up member value and read into data field. */
+               if (currtrace->flags & KSNOOP_F_MEMBER) {
+                       if (currtrace->offset)
+                               data += currtrace->offset;
+
+                       /* member is a pointer; read it in */
+                       if (currtrace->flags & KSNOOP_F_PTR) {
+                               void *dataptr = (void *)data;
+
+                               ret = bpf_probe_read(&data, sizeof(data),
+                                                    dataptr);
+                               if (ret) {
+                                       currdata->err_type_id =
+                                               currtrace->type_id;
+                                       currdata->err = ret;
+                                       continue;
+                               }
+                               currdata->raw_value = data;
+                       } else if (currtrace->size <=
+                                  sizeof(currdata->raw_value)) {
+                               /* read member value for predicate comparison */
+                               bpf_probe_read(&currdata->raw_value,
+                                              currtrace->size,
+                                              (void*)data);
+                       }
+               } else {
+                       currdata->raw_value = data;
+               }
+
+               /* simple predicate evaluation: if any predicate fails,
+                * skip all tracing for this function.
+                */
+               if (currtrace->flags & KSNOOP_F_PREDICATE_MASK) {
+                       bool ok = false;
+
+                       if (currtrace->flags & KSNOOP_F_PREDICATE_EQ &&
+                           currdata->raw_value == currtrace->predicate_value)
+                               ok = true;
+
+                       if (currtrace->flags & KSNOOP_F_PREDICATE_NOTEQ &&
+                           currdata->raw_value != currtrace->predicate_value)
+                               ok = true;
+
+                       if (currtrace->flags & KSNOOP_F_PREDICATE_GT &&
+                           currdata->raw_value > currtrace->predicate_value)
+                               ok = true;
+
+                       if (currtrace->flags & KSNOOP_F_PREDICATE_LT &&
+                           currdata->raw_value < currtrace->predicate_value)
+                               ok = true;
+
+                       if (!ok) {
+                               clear_trace(trace);
+                               return 0;
+                       }       
+               }
+
+               if (currtrace->flags & (KSNOOP_F_PTR | KSNOOP_F_MEMBER))
+                       data_ptr = (void *)data;
+               else
+                       data_ptr = &data;
+
+               if (trace->buf_len + MAX_TRACE_DATA >= MAX_TRACE_BUF)
+                       break;
+
+               buf_offset = &trace->buf[trace->buf_len];
+               if (buf_offset > &trace->buf[MAX_TRACE_BUF]) {
+                       currdata->err_type_id = currtrace->type_id;
+                       currdata->err = -ENOSPC;
+                       continue;
+               }
+               currdata->buf_offset = trace->buf_len;
+
+               tracesize = currtrace->size;
+               if (tracesize > MAX_TRACE_DATA)
+                       tracesize = MAX_TRACE_DATA;
+               ret = bpf_probe_read(buf_offset, tracesize, data_ptr);
+               if (ret < 0) {
+                       currdata->err_type_id = currtrace->type_id;
+                       currdata->err = ret;
+                       continue;
+               } else {
+                       currdata->buf_len = tracesize;
+                       trace->buf_len += tracesize;
+               }
+       }
+
+       /* show accumulated stashed traces (if any) */
+       if ((entry && trace->prev_ip && !trace->next_ip) ||
+           (!entry && trace->next_ip && !trace->prev_ip))
+               output_stashed_traces(ctx, trace, entry);
+       else
+               output_trace(ctx, trace);
+
+       return 0;
+}
+
+SEC("kprobe/foo")
+int kprobe_entry(struct pt_regs *ctx)
+{
+       return ksnoop(ctx, true);
+}
+
+SEC("kretprobe/foo")
+int kprobe_return(struct pt_regs *ctx)
+{
+       return ksnoop(ctx, false);
+}
+
+char _license[] SEC("license") = "Dual BSD/GPL";
diff --git a/libbpf-tools/ksnoop.c b/libbpf-tools/ksnoop.c
new file mode 100644 (file)
index 0000000..f6d4d8e
--- /dev/null
@@ -0,0 +1,980 @@
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+/* Copyright (c) 2021, Oracle and/or its affiliates. */
+
+#include <ctype.h>
+#include <errno.h>
+#include <getopt.h>
+#include <linux/bpf.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+#include <bpf/btf.h>
+
+#include "ksnoop.h"
+#include "ksnoop.skel.h"
+
+#ifndef KSNOOP_VERSION
+#define KSNOOP_VERSION "0.1"
+#endif
+
+static struct btf *vmlinux_btf;
+static const char *bin_name;
+static int pages = PAGES_DEFAULT;
+
+enum log_level {
+       DEBUG,
+       WARN,
+       ERROR,
+};
+
+static enum log_level log_level = WARN;
+
+static __u32 filter_pid;
+static bool stack_mode;
+
+#define libbpf_errstr(val)     strerror(-libbpf_get_error(val))
+
+static void __p(enum log_level level, char *level_str, char *fmt, ...)
+{
+       va_list ap;
+
+       if (level < log_level)
+               return;
+       va_start(ap, fmt);
+       fprintf(stderr, "%s: ", level_str);
+       vfprintf(stderr, fmt, ap);
+       fprintf(stderr, "\n");
+       va_end(ap);
+       fflush(stderr);
+}
+
+#define p_err(fmt, ...)                __p(ERROR, "Error", fmt, ##__VA_ARGS__)
+#define p_warn(fmt, ...)       __p(WARN, "Warn", fmt, ##__VA_ARGS__)
+#define p_debug(fmt, ...)      __p(DEBUG, "Debug", fmt, ##__VA_ARGS__)
+
+static int do_version(int argc, char **argv)
+{
+       printf("%s v%s\n", bin_name, KSNOOP_VERSION);
+       return 0;
+}
+
+static int cmd_help(int argc, char **argv)
+{
+       fprintf(stderr,
+               "Usage: %s [OPTIONS] [COMMAND | help] FUNC\n"
+               "       COMMAND := { trace | info }\n"
+               "       FUNC    := { name | name(ARG[,ARG]*) }\n"
+               "       ARG     := { arg | arg [PRED] | arg->member [PRED] }\n"
+               "       PRED    := { == | != | > | >= | < | <=  value }\n"
+               "       OPTIONS := { {-d|--debug} | {-V|--version} |\n"
+               "                    {-p|--pid filter_pid}|\n"
+               "                    {-P|--pages nr_pages} }\n"
+               "                    {-s|--stack}\n",
+               bin_name);
+       fprintf(stderr,
+               "Examples:\n"
+               "       %s info ip_send_skb\n"
+               "       %s trace ip_send_skb\n"
+               "       %s trace \"ip_send_skb(skb, return)\"\n"
+               "       %s trace \"ip_send_skb(skb->sk, return)\"\n"
+               "       %s trace \"ip_send_skb(skb->len > 128, skb)\"\n"
+               "       %s trace -s udp_sendmsg ip_send_skb\n",
+               bin_name, bin_name, bin_name, bin_name, bin_name, bin_name);
+       return 0;
+}
+
+static void usage(void)
+{
+       cmd_help(0, NULL);
+       exit(1);
+}
+
+static void type_to_value(struct btf *btf, char *name, __u32 type_id,
+                         struct value *val)
+{
+       const struct btf_type *type;
+       __s32 id = type_id;
+
+       if (strlen(val->name) == 0) {
+               if (name)
+                       strncpy(val->name, name,
+                               sizeof(val->name) - 1);
+               else
+                       val->name[0] = '\0';
+       }
+       do {
+               type = btf__type_by_id(btf, id);
+
+               switch (BTF_INFO_KIND(type->info)) {
+               case BTF_KIND_CONST:
+               case BTF_KIND_VOLATILE:
+               case BTF_KIND_RESTRICT:
+                       id = type->type;
+                       break;
+               case BTF_KIND_PTR:
+                       val->flags |= KSNOOP_F_PTR;
+                       id = type->type;
+                       break;
+               default:
+                       val->type_id = id;
+                       goto done;
+               }
+       } while (id >= 0);
+
+       val->type_id = KSNOOP_ID_UNKNOWN;
+       return;
+done:
+       val->size = btf__resolve_size(btf, val->type_id);
+}
+
+static int member_to_value(struct btf *btf, const char *name, __u32 type_id,
+                          struct value *val, int lvl)
+{
+       const struct btf_member *member;
+       const struct btf_type *type;
+       const char *pname;
+       __s32 id = type_id;
+       int i, nmembers;
+       __u8 kind;
+
+       /* type_to_value has already stripped qualifiers, so
+        * we either have a base type, a struct, union, etc.
+        * Only struct/unions have named members so anything
+        * else is invalid.
+        */
+       p_debug("Looking for member '%s' in type id %d", name, type_id);
+       type = btf__type_by_id(btf, id);
+       pname = btf__str_by_offset(btf, type->name_off);
+       if (strlen(pname) == 0)
+               pname = "<anon>";
+
+       kind = BTF_INFO_KIND(type->info);
+       switch (kind) {
+       case BTF_KIND_STRUCT:
+       case BTF_KIND_UNION:
+               nmembers = BTF_INFO_VLEN(type->info);
+               p_debug("Checking %d members...", nmembers);
+               for (member = (struct btf_member *)(type + 1), i = 0;
+                    i < nmembers;
+                    member++, i++) {
+                       const char *mname;
+                       __u16 offset;
+
+                       type = btf__type_by_id(btf, member->type);
+                       mname = btf__str_by_offset(btf, member->name_off);
+                       offset = member->offset / 8;
+
+                       p_debug("Checking member '%s' type %d offset %d",
+                               mname, member->type, offset);
+
+                       /* anonymous struct member? */
+                       kind = BTF_INFO_KIND(type->info);
+                       if (strlen(mname) == 0 &&
+                           (kind == BTF_KIND_STRUCT ||
+                            kind == BTF_KIND_UNION)) {
+                               p_debug("Checking anon struct/union %d",
+                                       member->type);
+                               val->offset += offset;
+                               if (!member_to_value(btf, name, member->type,
+                                                    val, lvl + 1))
+                                       return 0;
+                               val->offset -= offset;
+                               continue;
+                       }
+
+                       if (strcmp(mname, name) == 0) {
+                               val->offset += offset;
+                               val->flags |= KSNOOP_F_MEMBER;
+                               type_to_value(btf, NULL, member->type, val);
+                               p_debug("Member '%s', offset %d, flags %x size %d",
+                                       mname, val->offset, val->flags,
+                                       val->size);
+                               return 0;
+                       }
+               }
+               if (lvl > 0)
+                       break;
+               p_err("No member '%s' found in %s [%d], offset %d", name, pname,
+                     id, val->offset);
+               break;
+       default:
+               p_err("'%s' is not a struct/union", pname);
+               break;
+       }
+       return -ENOENT;
+}
+
+static int get_func_btf(struct btf *btf, struct func *func)
+{
+       const struct btf_param *param;
+       const struct btf_type *type;
+       __u8 i;
+
+       func->id = btf__find_by_name_kind(btf, func->name, BTF_KIND_FUNC);
+       if (func->id <= 0) {
+               p_err("Cannot find function '%s' in BTF: %s",
+                      func->name, strerror(-func->id));
+               return -ENOENT;
+       }
+       type = btf__type_by_id(btf, func->id);
+       if (libbpf_get_error(type) ||
+           BTF_INFO_KIND(type->info) != BTF_KIND_FUNC) {
+               p_err("Error looking up function type via id '%d'", func->id);
+               return -EINVAL;
+       }
+       type = btf__type_by_id(btf, type->type);
+       if (libbpf_get_error(type) ||
+           BTF_INFO_KIND(type->info) != BTF_KIND_FUNC_PROTO) {
+               p_err("Error looking up function proto type via id '%d'",
+                     func->id);
+               return -EINVAL;
+       }
+       for (param = (struct btf_param *)(type + 1), i = 0;
+            i < BTF_INFO_VLEN(type->info) && i < MAX_ARGS;
+            param++, i++) {
+               type_to_value(btf,
+                             (char *)btf__str_by_offset(btf, param->name_off),
+                             param->type, &func->args[i]);
+               p_debug("arg #%d: <name '%s', type id '%u'>",
+                       i + 1, func->args[i].name, func->args[i].type_id);
+       }
+
+       /* real number of args, even if it is > number we recorded. */
+       func->nr_args = BTF_INFO_VLEN(type->info);
+
+       type_to_value(btf, KSNOOP_RETURN_NAME, type->type,
+                     &func->args[KSNOOP_RETURN]);
+       p_debug("return value: type id '%u'>",
+               func->args[KSNOOP_RETURN].type_id);
+       return 0;
+}
+
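+/* Parse a predicate such as "==0", "!=0x1" or ">=0x100" into
+ * KSNOOP_F_PREDICATE_* flags plus a comparison value; ">=" is
+ * recorded as the combination GT|EQ, "<=" as LT|EQ.
+ */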
+int predicate_to_value(char *predicate, struct value *val)
+{
+       char pred[MAX_STR];
+       long v;
+
+       if (!predicate)
+               return 0;
+
+       p_debug("checking predicate '%s' for '%s'", predicate, val->name);
+
+       if (sscanf(predicate, "%[!=><]%li", pred, &v) != 2) {
+               p_err("Invalid specification; expected predicate, not '%s'",
+                     predicate);
+               return -EINVAL;
+       }
+       if (!(val->flags & KSNOOP_F_PTR) &&
+           (val->size == 0 || val->size > sizeof(__u64))) {
+               p_err("'%s' (size %d) does not support predicate comparison",
+                     val->name, val->size);
+               return -EINVAL;
+       }
+       val->predicate_value = (__u64)v;
+
+       if (strcmp(pred, "==") == 0) {
+               val->flags |= KSNOOP_F_PREDICATE_EQ;
+               goto out;
+       } else if (strcmp(pred, "!=") == 0) {
+               val->flags |= KSNOOP_F_PREDICATE_NOTEQ;
+               goto out;
+       }
+       if (pred[0] == '>')
+               val->flags |= KSNOOP_F_PREDICATE_GT;
+       else if (pred[0] == '<')
+               val->flags |= KSNOOP_F_PREDICATE_LT;
+
+       if (strlen(pred) == 1)
+               goto out;
+
+       if (pred[1] != '=') {
+               p_err("Invalid predicate specification '%s'", predicate);
+               return -EINVAL;
+       }
+       val->flags |= KSNOOP_F_PREDICATE_EQ;
+
+out:
+       p_debug("predicate '%s', flags 0x%x value %x",
+               pred, val->flags, val->predicate_value);
+
+       return 0;
+}
+
+static int trace_to_value(struct btf *btf, struct func *func, char *argname,
+                         char *membername, char *predicate, struct value *val)
+{
+       __u8 i;
+
+       if (strlen(membername) > 0)
+               snprintf(val->name, sizeof(val->name), "%s->%s",
+                        argname, membername);
+       else
+               strncpy(val->name, argname, sizeof(val->name) - 1);
+
+       for (i = 0; i < MAX_TRACES; i++) {
+               if (strlen(func->args[i].name) == 0)
+                       continue;
+               if (strcmp(argname, func->args[i].name) != 0)
+                       continue;
+               p_debug("setting base arg for val %s to %d", val->name, i);
+               val->base_arg = i;
+
+               if (strlen(membername) > 0) {
+                       if (member_to_value(btf, membername,
+                                           func->args[i].type_id, val, 0))
+                               return -ENOENT;
+               } else {
+                       val->type_id = func->args[i].type_id;
+                       val->flags |= func->args[i].flags;
+                       val->size = func->args[i].size;
+               }
+               return predicate_to_value(predicate, val);
+       }
+       p_err("Could not find '%s' in arguments/return value for '%s'",
+             argname, func->name);
+       return -ENOENT;
+}
+
+static struct btf *get_btf(const char *name)
+{
+       struct btf *mod_btf;
+
+       p_debug("getting BTF for %s",
+               name && strlen(name) > 0 ? name : "vmlinux");
+
+       if (!vmlinux_btf) {
+               vmlinux_btf = btf__load_vmlinux_btf();
+               if (libbpf_get_error(vmlinux_btf)) {
+                       p_err("No BTF, cannot determine type info: %s",
+                             libbpf_errstr(vmlinux_btf));
+                       return NULL;
+               }
+       }
+       if (!name || strlen(name) == 0)
+               return vmlinux_btf;
+
+       mod_btf = btf__load_module_btf(name, vmlinux_btf);
+       if (libbpf_get_error(mod_btf)) {
+               p_err("No BTF for module '%s': %s",
+                     name, libbpf_errstr(mod_btf));
+               return NULL;
+       }
+       return mod_btf;
+}
+
+static void copy_without_spaces(char *target, char *src)
+{
+       for (; *src != '\0'; src++)
+               if (!isspace(*src))
+                       *(target++) = *src;
+       *target = '\0';
+}
+
+static char *type_id_to_str(struct btf *btf, __s32 type_id, char *str)
+{
+       const struct btf_type *type;
+       const char *name = "";
+       char *prefix = "";
+       char *suffix = " ";
+       char *ptr = "";
+
+       str[0] = '\0';
+
+       switch (type_id) {
+       case 0:
+               name = "void";
+               break;
+       case KSNOOP_ID_UNKNOWN:
+               name = "?";
+               break;
+       default:
+               do {
+                       type = btf__type_by_id(btf, type_id);
+
+                       if (libbpf_get_error(type)) {
+                               name = "?";
+                               break;
+                       }
+                       switch (BTF_INFO_KIND(type->info)) {
+                       case BTF_KIND_CONST:
+                       case BTF_KIND_VOLATILE:
+                       case BTF_KIND_RESTRICT:
+                               type_id = type->type;
+                               break;
+                       case BTF_KIND_PTR:
+                               ptr = "* ";
+                               type_id = type->type;
+                               break;
+                       case BTF_KIND_ARRAY:
+                               suffix = "[]";
+                               type_id = type->type;
+                               break;
+                       case BTF_KIND_STRUCT:
+                               prefix = "struct ";
+                               name = btf__str_by_offset(btf, type->name_off);
+                               break;
+                       case BTF_KIND_UNION:
+                               prefix = "union ";
+                               name = btf__str_by_offset(btf, type->name_off);
+                               break;
+                       case BTF_KIND_ENUM:
+                               prefix = "enum ";
+                               name = btf__str_by_offset(btf, type->name_off);
+                               break;
+                       case BTF_KIND_TYPEDEF:
+                               name = btf__str_by_offset(btf, type->name_off);
+                               break;
+                       default:
+                               name = btf__str_by_offset(btf, type->name_off);
+                               break;
+                       }
+               } while (type_id >= 0 && strlen(name) == 0);
+               break;
+       }
+       snprintf(str, MAX_STR, "%s%s%s%s", prefix, name, suffix, ptr);
+
+       return str;
+}
+
+static char *value_to_str(struct btf *btf, struct value *val, char *str)
+{
+       str = type_id_to_str(btf, val->type_id, str);
+       if (val->flags & KSNOOP_F_PTR)
+               strncat(str, " * ", MAX_STR);
+       if (strlen(val->name) > 0 &&
+           strcmp(val->name, KSNOOP_RETURN_NAME) != 0)
+               strncat(str, val->name, MAX_STR);
+
+       return str;
+}
+
+/* based heavily on bpf_object__read_kallsyms_file() in libbpf.c */
+static int get_func_ip_mod(struct func *func)
+{
+       char sym_type, sym_name[MAX_STR], mod_info[MAX_STR];
+       unsigned long long sym_addr;
+       int ret, err = 0;
+       FILE *f;
+
+       f = fopen("/proc/kallsyms", "r");
+       if (!f) {
+               err = errno;
+               p_err("failed to open /proc/kallsyms: %d", strerror(err));
+               return err;
+       }
+
+       while (true) {
+               ret = fscanf(f, "%llx %c %128s%[^\n]\n",
+                            &sym_addr, &sym_type, sym_name, mod_info);
+               if (ret == EOF && feof(f))
+                       break;
+               if (ret < 3) {
+                       p_err("failed to read kallsyms entry: %d", ret);
+                       err = -EINVAL;
+                       goto out;
+               }
+               if (strcmp(func->name, sym_name) != 0)
+                       continue;
+               func->ip = sym_addr;
+               func->mod[0] = '\0';
+               /* get module name from [modname] */
+               if (ret == 4) {
+                       if (sscanf(mod_info, "%*[\t ][%[^]]", func->mod) < 1) {
+                               p_err("failed to read module name");
+                               err = -EINVAL;
+                               goto out;
+                       }
+               }
+               p_debug("%s =  <ip %llx, mod %s>", func->name, func->ip,
+                       strlen(func->mod) > 0 ? func->mod : "vmlinux");
+               break;
+       }
+out:
+       fclose(f);
+       return err;
+}
+
+static void trace_printf(void *ctx, const char *fmt, va_list args)
+{
+       vprintf(fmt, args);
+}
+
+#define VALID_NAME     "%[A-Za-z0-9\\-_]"
+#define ARGDATA                "%[^)]"
+
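+/* A trace specification looks like "func(arg,arg->member pred,...)";
+ * e.g. "ip_send_skb(skb->len>0xff,skb)" yields function name
+ * "ip_send_skb" and argdata "skb->len>0xff,skb", which is then
+ * split on ','.
+ */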
+static int parse_trace(char *str, struct trace *trace)
+{
+       __u8 i, nr_predicates = 0, nr_entry = 0, nr_return = 0;
+       char argname[MAX_NAME], membername[MAX_NAME];
+       char tracestr[MAX_STR], argdata[MAX_STR];
+       struct func *func = &trace->func;
+       struct btf_dump_opts opts = { };
+       char *arg, *saveptr;
+       int ret;
+
+       copy_without_spaces(tracestr, str);
+
+       p_debug("Parsing trace '%s'", tracestr);
+
+       trace->filter_pid = (__u32)filter_pid;
+       if (filter_pid)
+               p_debug("Using pid %lu as filter", trace->filter_pid);
+
+       trace->btf = vmlinux_btf;
+
+       ret = sscanf(tracestr, VALID_NAME "(" ARGDATA ")", func->name, argdata);
+       if (ret <= 0)
+               usage();
+       if (ret == 1) {
+               if (strlen(tracestr) > strlen(func->name)) {
+                       p_err("Invalid function specification '%s'", tracestr);
+                       usage();
+               }
+               argdata[0] = '\0';
+               p_debug("got func '%s'", func->name);
+       } else {
+               if (strlen(tracestr) >
+                   strlen(func->name) + strlen(argdata) + 2) {
+                       p_err("Invalid function specification '%s'", tracestr);
+                       usage();
+               }
+               p_debug("got func '%s', args '%s'", func->name, argdata);
+               trace->flags |= KSNOOP_F_CUSTOM;
+       }
+
+       ret = get_func_ip_mod(func);
+       if (ret) {
+               p_err("could not get address of '%s'", func->name);
+               return ret;
+       }
+       trace->btf = get_btf(func->mod);
+       if (libbpf_get_error(trace->btf)) {
+               p_err("could not get BTF for '%s': %s",
+                     strlen(func->mod) ? func->mod : "vmlinux",
+                     libbpf_errstr(trace->btf));
+               return -ENOENT;
+       }
+       trace->dump = btf_dump__new(trace->btf, NULL, &opts, trace_printf);
+       if (libbpf_get_error(trace->dump)) {
+               p_err("could not create BTF dump : %n",
+                     libbpf_errstr(trace->btf));
+               return -EINVAL;
+       }
+
+       ret = get_func_btf(trace->btf, func);
+       if (ret) {
+               p_debug("unexpected return value '%d' getting function", ret);
+               return ret;
+       }
+
+       for (arg = strtok_r(argdata, ",", &saveptr), i = 0;
+            arg;
+            arg = strtok_r(NULL, ",", &saveptr), i++) {
+               char *predicate = NULL;
+
+               ret = sscanf(arg, VALID_NAME "->" VALID_NAME,
+                            argname, membername);
+               if (ret == 2) {
+                       if (strlen(arg) >
+                           strlen(argname) + strlen(membername) + 2) {
+                               predicate = arg + strlen(argname) +
+                                           strlen(membername) + 2;
+                       }
+                       p_debug("'%s' dereferences '%s', predicate '%s'",
+                               argname, membername, predicate);
+               } else {
+                       if (strlen(arg) > strlen(argname))
+                               predicate = arg + strlen(argname);
+                       p_debug("'%s' arg, predcate '%s'", argname, predicate);
+                       membername[0] = '\0';
+               }
+
+               if (i >= MAX_TRACES) {
+                       p_err("Too many arguments; up to %d are supported",
+                             MAX_TRACES);
+                       return -EINVAL;
+               }
+               if (trace_to_value(trace->btf, func, argname, membername,
+                                  predicate, &trace->traces[i]))
+                       return -EINVAL;
+
+               if (predicate)
+                       nr_predicates++;
+               if (trace->traces[i].base_arg == KSNOOP_RETURN)
+                       nr_return++;
+               else
+                       nr_entry++;
+               trace->nr_traces++;
+       }
+
+       if (trace->nr_traces > 0) {
+               trace->flags |= KSNOOP_F_CUSTOM;
+               p_debug("custom trace with %d args", trace->nr_traces);
+
+               /* If we have one or more predicates _and_ references to
+                * entry and return values, we need to activate "stash"
+                * mode where arg traces are stored on entry and not
+                * sent until return to ensure predicates are satisfied.
+                */
+               if (nr_predicates > 0 && nr_entry > 0 && nr_return > 0) {
+                       trace->flags |= KSNOOP_F_STASH;
+                       p_debug("activating stash mode on entry");
+               }
+       } else {
+               p_debug("Standard trace, function with %d arguments",
+                       func->nr_args);
+               /* copy function arg/return value to trace specification. */
+               memcpy(trace->traces, func->args, sizeof(trace->traces));
+               for (i = 0; i < MAX_TRACES; i++)
+                       trace->traces[i].base_arg = i;
+               trace->nr_traces = MAX_TRACES;
+       }
+
+       return 0;
+}
+
+static int parse_traces(int argc, char **argv, struct trace **traces)
+{
+       __u8 i;
+
+       if (argc == 0)
+               usage();
+
+       if (argc > MAX_FUNC_TRACES) {
+               p_err("A maximum of %d traces are supported", MAX_FUNC_TRACES);
+               return -EINVAL;
+       }
+       *traces = calloc(argc, sizeof(struct trace));
+       if (!*traces) {
+               p_err("Could not allocate %d traces", argc);
+               return -ENOMEM;
+       }
+       for (i = 0; i < argc; i++) {
+               if (parse_trace(argv[i], &((*traces)[i])))
+                       return -EINVAL;
+               if (!stack_mode || i == 0)
+                       continue;
+               /* tell stack mode trace which function to expect next */
+               (*traces)[i].prev_ip = (*traces)[i-1].func.ip;
+               (*traces)[i-1].next_ip = (*traces)[i].func.ip;
+       }
+       return i;
+}
+
+static int cmd_info(int argc, char **argv)
+{
+       struct trace *traces;
+       char str[MAX_STR];
+       int nr_traces;
+       __u8 i, j;
+
+       nr_traces = parse_traces(argc, argv, &traces);
+       if (nr_traces < 0)
+               return nr_traces;
+
+       for (i = 0; i < nr_traces; i++) {
+               struct func *func = &traces[i].func;
+
+               printf("%s %s(",
+                      value_to_str(traces[i].btf, &func->args[KSNOOP_RETURN],
+                                   str),
+                      func->name);
+               for (j = 0; j < func->nr_args; j++) {
+                       if (j > 0)
+                               printf(", ");
+                       printf("%s", value_to_str(traces[i].btf, &func->args[j],
+                                                 str));
+               }
+               if (func->nr_args > MAX_ARGS)
+                       printf(" /* and %d more args that are not traceable */",
+                              func->nr_args - MAX_ARGS);
+               printf(");\n");
+       }
+       return 0;
+}
+
+static void trace_handler(void *ctx, int cpu, void *data, __u32 size)
+{
+       struct trace *trace = data;
+       int i, shown, ret;
+
+       p_debug("got trace, size %d", size);
+       if (size < (sizeof(*trace) - MAX_TRACE_BUF)) {
+               p_err("\t/* trace buffer size '%u' < min %ld */",
+                       size, sizeof(trace) - MAX_TRACE_BUF);
+               return;
+       }
+       printf("%16lld %4d %8u %s(\n", trace->time, trace->cpu, trace->pid,
+              trace->func.name);
+
+       for (i = 0, shown = 0; i < trace->nr_traces; i++) {
+               DECLARE_LIBBPF_OPTS(btf_dump_type_data_opts, opts);
+               bool entry = trace->data_flags & KSNOOP_F_ENTRY;
+               struct value *val = &trace->traces[i];
+               struct trace_data *data = &trace->trace_data[i];
+
+               opts.indent_level = 36;
+               opts.indent_str = " ";
+
+               /* skip if it's entry data and trace data is for return, or
+                * if it's return and trace data is entry; only exception in
+                * the latter case is if we stashed data; in such cases we
+                * want to see it as it's a mix of entry/return data with
+                * predicates.
+                */
+               if ((entry && !base_arg_is_entry(val->base_arg)) ||
+                   (!entry && base_arg_is_entry(val->base_arg) &&
+                    !(trace->flags & KSNOOP_F_STASH)))
+                       continue;
+
+               if (val->type_id == 0)
+                       continue;
+
+               if (shown > 0)
+                       printf(",\n");
+               printf("%34s %s = ", "", val->name);
+               if (val->flags & KSNOOP_F_PTR)
+                       printf("*(0x%llx)", data->raw_value);
+               printf("\n");
+
+               if (data->err_type_id != 0) {
+                       char typestr[MAX_STR];
+
+                       printf("%36s /* Cannot show '%s' as '%s%s'; invalid/userspace ptr? */\n",
+                              "",
+                              val->name,
+                              type_id_to_str(trace->btf,
+                                             val->type_id,
+                                             typestr),
+                              val->flags & KSNOOP_F_PTR ?
+                              " *" : "");
+               } else {
+                       ret = btf_dump__dump_type_data
+                               (trace->dump, val->type_id,
+                                trace->buf + data->buf_offset,
+                                data->buf_len, &opts);
+                       /* truncated? */
+                       if (ret == -E2BIG)
+                               printf("%36s... /* %d bytes of %d */", "",
+                                      data->buf_len,
+                                      val->size);
+               }
+               shown++;
+
+       }
+       printf("\n%31s);\n\n", "");
+       fflush(stdout);
+}
+
+static void lost_handler(void *ctx, int cpu, __u64 cnt)
+{
+       p_err("\t/* lost %llu events */", cnt);
+}
+
+static int add_traces(struct bpf_map *func_map, struct trace *traces,
+                     int nr_traces)
+{
+       int i, j, ret, nr_cpus = libbpf_num_possible_cpus();
+       struct trace *map_traces;
+
+       map_traces = calloc(nr_cpus, sizeof(struct trace));
+       if (!map_traces) {
+               p_err("Could not allocate memory for %d traces", nr_traces);
+               return -ENOMEM;
+       }
+       for (i = 0; i < nr_traces; i++) {
+               for (j = 0; j < nr_cpus; j++)
+                       memcpy(&map_traces[j], &traces[i],
+                              sizeof(map_traces[j]));
+
+               ret = bpf_map_update_elem(bpf_map__fd(func_map),
+                                         &traces[i].func.ip,
+                                         map_traces,
+                                         BPF_NOEXIST);
+               if (ret) {
+                       p_err("Could not add map entry for '%s': %s",
+                             traces[i].func.name, strerror(-ret));
+                       break;
+               }
+       }
+       free(map_traces);
+       return ret;
+}
+
+static int attach_traces(struct ksnoop_bpf *skel, struct trace *traces,
+                        int nr_traces)
+{
+       struct bpf_link *link;
+       int i, ret;
+
+       for (i = 0; i < nr_traces; i++) {
+               link = bpf_program__attach_kprobe(skel->progs.kprobe_entry,
+                                                 false,
+                                                 traces[i].func.name);
+               ret = libbpf_get_error(link);
+               if (ret) {
+                       p_err("Could not attach kprobe to '%s': %s",
+                             traces[i].func.name, strerror(-ret));
+                       return ret;
+               }
+               p_debug("Attached kprobe for '%s'", traces[i].func.name);
+
+               link = bpf_program__attach_kprobe(skel->progs.kprobe_return,
+                                                 true,
+                                                 traces[i].func.name);
+               ret = libbpf_get_error(link);
+               if (ret) {
+                       p_err("Could not attach kretprobe to '%s': %s",
+                             traces[i].func.name, strerror(-ret));
+                       return ret;
+               }
+               p_debug("Attached kretprobe for '%s'", traces[i].func.name);
+       }
+       return 0;
+}
+
+static int cmd_trace(int argc, char **argv)
+{
+       struct perf_buffer_opts pb_opts = {};
+       struct bpf_map *perf_map, *func_map;
+       struct perf_buffer *pb;
+       struct ksnoop_bpf *skel;
+       int nr_traces, ret = 0;
+       struct trace *traces;
+
+       nr_traces = parse_traces(argc, argv, &traces);
+       if (nr_traces < 0)
+               return nr_traces;
+
+       skel = ksnoop_bpf__open_and_load();
+       if (!skel) {
+               p_err("Could not load ksnoop BPF: %s", libbpf_errstr(skel));
+               return 1;
+       }
+
+       perf_map = skel->maps.ksnoop_perf_map;
+       if (!perf_map) {
+               p_err("Could not find '%s'", "ksnoop_perf_map");
+               return 1;
+       }
+       func_map = bpf_object__find_map_by_name(skel->obj, "ksnoop_func_map");
+       if (!func_map) {
+               p_err("Could not find '%s'", "ksnoop_func_map");
+               return 1;
+       }
+
+       if (add_traces(func_map, traces, nr_traces)) {
+               p_err("Could not add traces to '%s'", "ksnoop_func_map");
+               return 1;
+       }
+
+       if (attach_traces(skel, traces, nr_traces)) {
+               p_err("Could not attach %d traces", nr_traces);
+               return 1;
+       }
+
+       pb_opts.sample_cb = trace_handler;
+       pb_opts.lost_cb = lost_handler;
+       pb = perf_buffer__new(bpf_map__fd(perf_map), pages, &pb_opts);
+       if (libbpf_get_error(pb)) {
+               p_err("Could not create perf buffer: %s",
+                     libbpf_errstr(pb));
+               return 1;
+       }
+
+       printf("%16s %4s %8s %s\n", "TIME", "CPU", "PID", "FUNCTION/ARGS");
+
+       while (1) {
+               ret = perf_buffer__poll(pb, 1);
+               if (ret < 0 && ret != -EINTR) {
+                       p_err("Polling failed: %s", strerror(-ret));
+                       break;
+               }
+       }
+
+       perf_buffer__free(pb);
+       ksnoop_bpf__destroy(skel);
+
+       return ret;
+}
+
+struct cmd {
+       const char *cmd;
+       int (*func)(int argc, char **argv);
+};
+
+struct cmd cmds[] = {
+       { "info",       cmd_info },
+       { "trace",      cmd_trace },
+       { "help",       cmd_help },
+       { NULL,         NULL }
+};
+
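+/* match the first argument against the command table, allowing
+ * abbreviations; anything unrecognized is treated as a function name
+ * and handed to "trace".
+ */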
+static int cmd_select(int argc, char **argv)
+{
+       int i;
+
+       for (i = 0; cmds[i].cmd; i++) {
+               if (strncmp(*argv, cmds[i].cmd, strlen(*argv)) == 0)
+                       return cmds[i].func(argc - 1, argv + 1);
+       }
+       return cmd_trace(argc, argv);
+}
+
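+/* with -d/--debug, pass libbpf output of every level through to stderr */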
+static int print_all_levels(enum libbpf_print_level level,
+                           const char *format, va_list args)
+{
+       return vfprintf(stderr, format, args);
+}
+
+int main(int argc, char *argv[])
+{
+       static const struct option options[] = {
+               { "debug",      no_argument,            NULL,   'd' },
+               { "help",       no_argument,            NULL,   'h' },
+               { "version",    no_argument,            NULL,   'V' },
+               { "pages",      required_argument,      NULL,   'P' },
+               { "pid",        required_argument,      NULL,   'p' },
+               { "stack",      no_argument,            NULL,   's' },
+               { 0 }
+       };
+       int opt;
+
+       bin_name = argv[0];
+
+       while ((opt = getopt_long(argc, argv, "dhp:P:sV", options,
+                                 NULL)) >= 0) {
+               switch (opt) {
+               case 'd':
+                       libbpf_set_print(print_all_levels);
+                       log_level = DEBUG;
+                       break;
+               case 'h':
+                       return cmd_help(argc, argv);
+               case 'V':
+                       return do_version(argc, argv);
+               case 'p':
+                       filter_pid = atoi(optarg);
+                       break;
+               case 'P':
+                       pages = atoi(optarg);
+                       break;
+               case 's':
+                       stack_mode = true;
+                       break;
+               default:
+                       p_err("unrecognized option '%s'", argv[optind - 1]);
+                       usage();
+               }
+       }
+       if (argc == 1)
+               usage();
+       argc -= optind;
+       argv += optind;
+       if (argc <= 0)
+               usage();
+
+       return cmd_select(argc, argv);
+}
diff --git a/libbpf-tools/ksnoop.h b/libbpf-tools/ksnoop.h
new file mode 100644 (file)
index 0000000..6c55b0e
--- /dev/null
@@ -0,0 +1,122 @@
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+/* Copyright (c) 2021, Oracle and/or its affiliates. */
+
+/* maximum number of different functions we can trace at once */
+#define MAX_FUNC_TRACES                        64
+
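+/* base value a trace refers to: one of up to five function entry
+ * arguments, or the function return value.
+ */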
+enum arg {
+       KSNOOP_ARG1,
+       KSNOOP_ARG2,
+       KSNOOP_ARG3,
+       KSNOOP_ARG4,
+       KSNOOP_ARG5,
+       KSNOOP_RETURN
+};
+
+/* we choose "return" as the name for the returned value because as
+ * a C keyword it can't clash with a function entry parameter.
+ */
+#define KSNOOP_RETURN_NAME             "return"
+
+/* if we can't get a type id for a type (such as module-specific type)
+ * mark it as KSNOOP_ID_UNKNOWN since BTF lookup in bpf_snprintf_btf()
+ * will fail and the data will be simply displayed as a __u64.
+ */
+#define KSNOOP_ID_UNKNOWN              0xffffffff
+
+#define MAX_NAME                       96
+#define MAX_STR                                256
+#define MAX_PATH                       512
+#define MAX_VALUES                     6
+#define MAX_ARGS                       (MAX_VALUES - 1)
+#define KSNOOP_F_PTR                   0x1     /* value is a pointer */
+#define KSNOOP_F_MEMBER                        0x2     /* member reference */
+#define KSNOOP_F_ENTRY                 0x4
+#define KSNOOP_F_RETURN                        0x8
+#define KSNOOP_F_CUSTOM                        0x10    /* custom trace */
+#define KSNOOP_F_STASH                 0x20    /* store values on entry,
+                                                * no perf events.
+                                                */
+#define KSNOOP_F_STASHED               0x40    /* values stored on entry */
+
+#define KSNOOP_F_PREDICATE_EQ          0x100
+#define KSNOOP_F_PREDICATE_NOTEQ       0x200
+#define KSNOOP_F_PREDICATE_GT          0x400
+#define KSNOOP_F_PREDICATE_LT          0x800
+
+#define KSNOOP_F_PREDICATE_MASK                (KSNOOP_F_PREDICATE_EQ | \
+                                        KSNOOP_F_PREDICATE_NOTEQ | \
+                                        KSNOOP_F_PREDICATE_GT | \
+                                        KSNOOP_F_PREDICATE_LT)
+
+/* for kprobes, entry is function IP + sizeof(kprobe_opcode_t),
+ * subtract in BPF prog context to get fn address.
+ */
+#ifdef __TARGET_ARCH_x86
+#define KSNOOP_IP_FIX(ip)              (ip - sizeof(kprobe_opcode_t))
+#else
+#define KSNOOP_IP_FIX(ip)              ip
+#endif
+
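+/* a single traced value: a function argument, a member referenced
+ * from it, or the return value, along with flags and optional
+ * predicate information.
+ */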
+struct value {
+       char name[MAX_STR];
+       enum arg base_arg;
+       __u32 offset;
+       __u32 size;
+       __u64 type_id;
+       __u64 flags;
+       __u64 predicate_value;
+};
+
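+/* function information collected from /proc/kallsyms (address,
+ * module) and BTF (type id, number of arguments, argument types).
+ */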
+struct func {
+       char name[MAX_NAME];
+       char mod[MAX_NAME];
+       __s32 id;
+       __u8 nr_args;
+       __u64 ip;
+       struct value args[MAX_VALUES];
+};
+
+#define MAX_TRACES MAX_VALUES
+
+#define MAX_TRACE_DATA 2048
+
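+/* per-value results captured at trace time: the raw value plus the
+ * offset/length of the associated data in the trace buffer.
+ */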
+struct trace_data {
+       __u64 raw_value;
+       __u32 err_type_id;      /* type id we can't dereference */
+       int err;
+       __u32 buf_offset;
+       __u16 buf_len;
+};
+
+#define MAX_TRACE_BUF  (MAX_TRACES * MAX_TRACE_DATA)
+
+struct trace {
+       /* initial values are readonly in tracing context */
+       struct btf *btf;
+       struct btf_dump *dump;
+       struct func func;
+       __u8 nr_traces;
+       __u32 filter_pid;
+       __u64 prev_ip; /* these are used in stack-mode tracing */
+       __u64 next_ip;
+       struct value traces[MAX_TRACES];
+       __u64 flags;
+       /* values below this point are set or modified in tracing context */
+       __u64 task;
+       __u32 pid;
+       __u32 cpu;
+       __u64 time;
+       __u64 data_flags;
+       struct trace_data trace_data[MAX_TRACES];
+       __u16 buf_len;
+       char buf[MAX_TRACE_BUF];
+       char buf_end[0];
+};
+
+#define PAGES_DEFAULT  16
+
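+/* every base arg except the return value is captured at entry */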
+static inline int base_arg_is_entry(enum arg base_arg)
+{
+       return base_arg != KSNOOP_RETURN;
+}
diff --git a/man/man8/ksnoop.8 b/man/man8/ksnoop.8
new file mode 100644 (file)
index 0000000..8733cb7
--- /dev/null
@@ -0,0 +1,298 @@
+.\" Man page generated from reStructuredText.
+.
+.TH KSNOOP 8 "" "" ""
+.SH NAME
+KSNOOP \- tool for tracing kernel function entry/return showing arguments/return values
+.
+.nr rst2man-indent-level 0
+.
+.de1 rstReportMargin
+\\$1 \\n[an-margin]
+level \\n[rst2man-indent-level]
+level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
+-
+\\n[rst2man-indent0]
+\\n[rst2man-indent1]
+\\n[rst2man-indent2]
+..
+.de1 INDENT
+.\" .rstReportMargin pre:
+. RS \\$1
+. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
+. nr rst2man-indent-level +1
+.\" .rstReportMargin post:
+..
+.de UNINDENT
+. RE
+.\" indent \\n[an-margin]
+.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
+.nr rst2man-indent-level -1
+.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
+.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
+..
+.SH SYNOPSIS
+.INDENT 0.0
+.INDENT 3.5
+\fBksnoop\fP [\fIOPTIONS\fP] { \fICOMMAND\fP  \fIFUNC\fP | \fBhelp\fP }
+.sp
+\fIOPTIONS\fP := { { \fB\-V\fP | \fB\-\-version\fP } | { \fB\-h\fP | \fB\-\-help\fP }
+| { [\fB\-P\fP | \fB\-\-pages\fP] nr_pages} | { [\fB\-p\fP | \fB\-\-pid\fP] pid} |
+[{ \fB\-s\fP | \fB\-\-stack\fP }] | [{ \fB\-d\fP | \fB\-\-debug\fP }] }
+.sp
+\fICOMMAND\fP := { \fBtrace\fP | \fBinfo\fP }
+.sp
+\fIFUNC\fP := { \fBname\fP | \fBname\fP(\fBarg\fP[,\fBarg\fP]) }
+.UNINDENT
+.UNINDENT
+.SH DESCRIPTION
+.INDENT 0.0
+.INDENT 3.5
+\fIksnoop\fP allows inspection of the arguments and return values
+associated with kernel function entry and return.
+.INDENT 0.0
+.TP
+.B \fBksnoop info\fP \fIFUNC\fP
+Show function description, arguments and return value types.
+.TP
+.B \fBksnoop trace\fP \fIFUNC\fP [\fIFUNC\fP]
+Trace function entry and return, showing arguments and
+return values.  A function name can be specified alone,
+or together with named arguments and/or the return value;
+the keyword \fBreturn\fP selects the return value.
+.UNINDENT
+.sp
+\fIksnoop\fP requires the kernel to provide BTF for itself, and if
+tracing of module data is required, module BTF must be present also.
+Check /sys/kernel/btf to see if BTF is present.
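+For example:
+.sp
+.nf
+.ft C
+# ls /sys/kernel/btf/vmlinux
+/sys/kernel/btf/vmlinux
+.ft P
+.fi
+.sp
+Kernel BTF is present as /sys/kernel/btf/vmlinux; module BTF, where
+available, appears under the module name in the same directory.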
+.sp
+\fBksnoop\fP requires \fICAP_BPF\fP and \fICAP_PERFMON\fP capabilities.
+.UNINDENT
+.UNINDENT
+.SH OPTIONS
+.INDENT 0.0
+.INDENT 3.5
+.INDENT 0.0
+.TP
+.B \-h\fP,\fB  \-\-help
+Show help information
+.TP
+.B \-V\fP,\fB  \-\-version
+Show version.
+.TP
+.B \-d\fP,\fB  \-\-debug
+Show debug output.
+.TP
+.B \-p\fP,\fB  \-\-pid
+Filter events by pid.
+.TP
+.B \-P\fP,\fB  \-\-pages
+Specify number of pages used per\-CPU for perf event
+collection.  Default is 16.
+.TP
+.B \-s\fP,\fB  \-\-stack
+Trace the specified set of functions only if they are
+encountered in the order specified, i.e. when the call
+graph matches that order.
+.UNINDENT
+.UNINDENT
+.UNINDENT
+.SH EXAMPLES
+.sp
+\fB# ksnoop info ip_send_skb\fP
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+int  ip_send_skb(struct net  * net, struct sk_buff  * skb);
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Show the function prototype, i.e. its argument and return value types.
+.sp
+\fB# ksnoop trace ip_send_skb\fP
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+          TIME  CPU      PID FUNCTION/ARGS
+78101668506811    1     2813 ip_send_skb(
+                                 net = *(0xffffffffb5959840)
+                                  (struct net){
+                                   .passive = (refcount_t){
+                                    .refs = (atomic_t){
+                                     .counter = (int)0x2,
+                                    },
+                                   },
+                                   .dev_base_seq = (unsigned int)0x18,
+                                   .ifindex = (int)0xf,
+                                   .list = (struct list_head){
+                                    .next = (struct list_head *)0xffff9895440dc120,
+                                    .prev = (struct list_head *)0xffffffffb595a8d0,
+                                   },
+                                 ...
+
+79561322965250    1     2813 ip_send_skb(
+                                 return =
+                                  (int)0x0
+                             );
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Show entry/return for ip_send_skb() with arguments and return values.
+.sp
+\fB# ksnoop trace "ip_send_skb(skb)"\fP
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+         TIME  CPU      PID FUNCTION/ARGS
+78142420834537    1     2813 ip_send_skb(
+                                 skb = *(0xffff989750797c00)
+                                  (struct sk_buff){
+                                   (union){
+                                    .sk = (struct sock *)0xffff98966ce19200,
+                                    .ip_defrag_offset = (int)0x6ce19200,
+                                   },
+                                   (union){
+                                    (struct){
+                                     ._skb_refdst = (long unsigned int)0xffff98981dde2d80,
+                                     .destructor = (void (*)(struct sk_buff *))0xffffffffb3e1beb0,
+                                    },
+                                ...
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Show entry argument \fBskb\fP\&.
+.sp
+\fB# ksnoop trace "ip_send_skb(return)"\fP
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+         TIME  CPU      PID FUNCTION/ARGS
+78178228354796    1     2813 ip_send_skb(
+                                 return =
+                                  (int)0x0
+                             );
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Show return value from ip_send_skb().
+.sp
+\fB# ksnoop trace "ip_send_skb(skb\->sk)"\fP
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+          TIME  CPU      PID FUNCTION/ARGS
+78207649138829    2     2813 ip_send_skb(
+                                 skb\->sk = *(0xffff98966ce19200)
+                                  (struct sock){
+                                   .__sk_common = (struct sock_common){
+                                    (union){
+                                     .skc_addrpair = (__addrpair)0x1701a8c017d38f8d,
+                                     (struct){
+                                      .skc_daddr = (__be32)0x17d38f8d,
+                                      .skc_rcv_saddr = (__be32)0x1701a8c0,
+                                     },
+                                    },
+                                  ...
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Trace a member referenced from an argument.  Only a single level of
+member dereference is supported.
+.sp
+\fB# ksnoop \-p 2813 "ip_rcv(dev)"\fP
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+          TIME  CPU      PID FUNCTION/ARGS
+78254803164920    1     2813 ip_rcv(
+                                 dev = *(0xffff9895414cb000)
+                                  (struct net_device){
+                                   .name = (char[16])[
+                                    \(aql\(aq,
+                                    \(aqo\(aq,
+                                   ],
+                                   .name_node = (struct netdev_name_node *)0xffff989541515ec0,
+                                   .state = (long unsigned int)0x3,
+                                 ...
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Trace the \fBdev\fP argument of \fBip_rcv()\fP\&, showing events for
+process ID 2813 only.
+.sp
+\fB# ksnoop \-s tcp_sendmsg __tcp_transmit_skb  ip_output\fP
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+         TIME  CPU      PID FUNCTION/ARGS
+71827770952903    1     4777 __tcp_transmit_skb(
+                                 sk = *(0xffff9852460a2300)
+                                  (struct sock){
+                                   .__sk_common = (struct sock_common){
+                                    (union){
+                                     .skc_addrpair = (__addrpair)0x61b2af0a35cbfe0a,
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Trace entry/return of tcp_sendmsg, __tcp_transmit_skb and ip_output when
+tcp_sendmsg leads to a call to __tcp_transmit_skb and that in turn
+leads to a call to ip_output; i.e. with a call graph matching the order
+specified.  The calls do not have to be direct; function A may call
+an intermediate function that in turn calls function B.
+.sp
+\fB# ksnoop "ip_send_skb(skb\->len > 100, skb)"\fP
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+          TIME  CPU      PID FUNCTION/ARGS
+39267395709745    1     2955 ip_send_skb(
+                                 skb\->len =
+                                  (unsigned int)0x89,
+                                 skb = *(0xffff89c8be81e500)
+                                  (struct sk_buff){
+                                   (union){
+                                    .sk = (struct sock *)0xffff89c6c59e5580,
+                                    .ip_defrag_offset = (int)0xc59e5580,
+                                   },
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Trace ip_send_skb() skbs which have len > 100.
+.SH SEE ALSO
+.INDENT 0.0
+.INDENT 3.5
+\fBbpf\fP(2)
+.UNINDENT
+.UNINDENT
+.\" Generated by docutils manpage writer.
+.
index 21f90f61b0849ae654b7c78ba9ce34bfb74ce6f2..a3c0cc19d4b93cb0b7088c5604b0cec1c6863fde 160000 (submodule)
@@ -1 +1 @@
-Subproject commit 21f90f61b0849ae654b7c78ba9ce34bfb74ce6f2
+Subproject commit a3c0cc19d4b93cb0b7088c5604b0cec1c6863fde