libbpf-tools: add offcputime
authorWenbo Zhang <ethercflow@gmail.com>
Sat, 1 May 2021 04:22:42 +0000 (12:22 +0800)
committeryonghong-song <ys114321@gmail.com>
Sat, 1 May 2021 16:24:45 +0000 (09:24 -0700)
Signed-off-by: Wenbo Zhang <ethercflow@gmail.com>
libbpf-tools/.gitignore
libbpf-tools/Makefile
libbpf-tools/offcputime.bpf.c [new file with mode: 0644]
libbpf-tools/offcputime.c [new file with mode: 0644]
libbpf-tools/offcputime.h [new file with mode: 0644]
libbpf-tools/trace_helpers.c
libbpf-tools/trace_helpers.h
libbpf-tools/uprobe_helpers.c
libbpf-tools/uprobe_helpers.h

index 76bcc6e59981a427ad2b7ff6cef29d2d283fba9f..2b4999bd43b00481bb34035d093fa70a26f47d96 100644 (file)
@@ -15,6 +15,7 @@
 /hardirqs
 /llcstat
 /numamove
+/offcputime
 /opensnoop
 /readahead
 /runqlat
index 345bb7bc65d2a6178f6985565f80e3603a799b16..92dcf5a5ff164be6be3bb4a735fcb6cb84c7a940 100644 (file)
@@ -32,6 +32,7 @@ APPS = \
        hardirqs \
        llcstat \
        numamove \
+       offcputime \
        opensnoop \
        readahead \
        runqlat \
diff --git a/libbpf-tools/offcputime.bpf.c b/libbpf-tools/offcputime.bpf.c
new file mode 100644 (file)
index 0000000..3b0277a
--- /dev/null
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2021 Wenbo Zhang
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_core_read.h>
+#include <bpf/bpf_tracing.h>
+#include "offcputime.h"
+
+#define PF_KTHREAD             0x00200000      /* I am a kernel thread */
+#define MAX_ENTRIES            10240
+
+const volatile bool kernel_threads_only = false;
+const volatile bool user_threads_only = false;
+const volatile __u64 max_block_ns = -1;
+const volatile __u64 min_block_ns = 1;
+const volatile pid_t targ_tgid = -1;
+const volatile pid_t targ_pid = -1;
+const volatile long state = -1;
+
+struct internal_key {
+       u64 start_ts;
+       struct key_t key;
+};
+
+/* tid -> timestamp/key recorded when the task went off-CPU */
+struct {
+       __uint(type, BPF_MAP_TYPE_HASH);
+       __type(key, u32);
+       __type(value, struct internal_key);
+       __uint(max_entries, MAX_ENTRIES);
+} start SEC(".maps");
+
+/* stack storage; value size and max_entries are set from userspace */
+struct {
+       __uint(type, BPF_MAP_TYPE_STACK_TRACE);
+       __uint(key_size, sizeof(u32));
+} stackmap SEC(".maps");
+
+/* aggregated off-CPU time per (pid, tgid, stack ids) key */
+struct {
+       __uint(type, BPF_MAP_TYPE_HASH);
+       __type(key, struct key_t);
+       __type(value, struct val_t);
+       __uint(max_entries, MAX_ENTRIES);
+} info SEC(".maps");
+
+/* Decide whether the task switching out should be traced, per the
+ * userspace-configured filters (tgid/pid, user/kernel-only, state). */
+static bool allow_record(struct task_struct *t)
+{
+       if (targ_tgid != -1 && targ_tgid != t->tgid)
+               return false;
+       if (targ_pid != -1 && targ_pid != t->pid)
+               return false;
+       if (user_threads_only && t->flags & PF_KTHREAD)
+               return false;
+       else if (kernel_threads_only && !(t->flags & PF_KTHREAD))
+               return false;
+       /* NOTE(review): task_struct::state was renamed __state in newer
+        * kernels — confirm the supported kernel range */
+       if (state != -1 && t->state != state)
+               return false;
+       return true;
+}
+
+/*
+ * On every context switch: record when `prev` goes off-CPU (timestamp +
+ * stack ids), and when `next` comes back on-CPU, account the elapsed
+ * off-CPU time into the info map.
+ */
+SEC("tp_btf/sched_switch")
+int BPF_PROG(sched_switch, bool preempt, struct task_struct *prev,
+            struct task_struct *next)
+{
+       struct internal_key *i_keyp, i_key;
+       struct val_t *valp, val;
+       s64 delta;
+       u32 pid;
+
+       if (allow_record(prev)) {
+               pid = prev->pid;
+               /* To distinguish idle threads of different cores */
+               if (!pid)
+                       pid = bpf_get_smp_processor_id();
+               i_key.key.pid = pid;
+               i_key.key.tgid = prev->tgid;
+               i_key.start_ts = bpf_ktime_get_ns();
+
+               if (prev->flags & PF_KTHREAD)
+                       /* kernel threads have no user stack */
+                       i_key.key.user_stack_id = -1;
+               else
+                       i_key.key.user_stack_id =
+                               bpf_get_stackid(ctx, &stackmap,
+                                               BPF_F_USER_STACK);
+               i_key.key.kern_stack_id = bpf_get_stackid(ctx, &stackmap, 0);
+               bpf_map_update_elem(&start, &pid, &i_key, 0);
+               /* comm lives in kernel memory: use the explicit kernel-space
+                * helper, and bound the copy by the destination buffer */
+               bpf_probe_read_kernel_str(&val.comm, sizeof(val.comm),
+                                         prev->comm);
+               val.delta = 0;
+               bpf_map_update_elem(&info, &i_key.key, &val, BPF_NOEXIST);
+       }
+
+       pid = next->pid;
+       i_keyp = bpf_map_lookup_elem(&start, &pid);
+       if (!i_keyp)
+               return 0;
+       delta = (s64)(bpf_ktime_get_ns() - i_keyp->start_ts);
+       if (delta < 0)
+               goto cleanup;
+       /* ns -> us; despite the *_ns names, min/max_block_ns are filled by
+        * userspace with microsecond values and compared in microseconds */
+       delta /= 1000U;
+       if (delta < min_block_ns || delta > max_block_ns)
+               goto cleanup;
+       valp = bpf_map_lookup_elem(&info, &i_keyp->key);
+       if (!valp)
+               goto cleanup;
+       __sync_fetch_and_add(&valp->delta, delta);
+
+cleanup:
+       bpf_map_delete_elem(&start, &pid);
+       return 0;
+}
+
+char LICENSE[] SEC("license") = "GPL";
diff --git a/libbpf-tools/offcputime.c b/libbpf-tools/offcputime.c
new file mode 100644 (file)
index 0000000..341e271
--- /dev/null
@@ -0,0 +1,342 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+// Copyright (c) 2021 Wenbo Zhang
+//
+// Based on offcputime(8) from BCC by Brendan Gregg.
+// 19-Mar-2021   Wenbo Zhang   Created this.
+#include <argp.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <unistd.h>
+#include <bpf/libbpf.h>
+#include <bpf/bpf.h>
+#include "offcputime.h"
+#include "offcputime.skel.h"
+#include "trace_helpers.h"
+
+static struct env {
+       pid_t pid;
+       pid_t tid;
+       bool user_threads_only;
+       bool kernel_threads_only;
+       int stack_storage_size;
+       int perf_max_stack_depth;
+       __u64 min_block_time;
+       __u64 max_block_time;
+       long state;
+       int duration;
+       bool verbose;
+} env = {
+       .pid = -1,
+       .tid = -1,
+       .stack_storage_size = 1024,
+       .perf_max_stack_depth = 127,
+       .min_block_time = 1,
+       .max_block_time = -1,
+       .state = -1,
+       .duration = 99999999,
+};
+
+static volatile bool exiting;
+
+const char *argp_program_version = "offcputime 0.1";
+const char *argp_program_bug_address =
+       "https://github.com/iovisor/bcc/tree/master/libbpf-tools";
+const char argp_program_doc[] =
+"Summarize off-CPU time by stack trace.\n"
+"\n"
+"USAGE: offcputime [--help] [-p PID | -u | -k] [-m MIN-BLOCK-TIME] "
+"[-M MAX-BLOCK-TIME] [--state] [--perf-max-stack-depth] [--stack-storage-size] "
+"[duration]\n"
+"EXAMPLES:\n"
+"    offcputime             # trace off-CPU stack time until Ctrl-C\n"
+"    offcputime 5           # trace for 5 seconds only\n"
+"    offcputime -m 1000     # trace only events that last more than 1000 usec\n"
+"    offcputime -M 10000    # trace only events that last less than 10000 usec\n"
+"    offcputime -p 185      # only trace threads for PID 185\n"
+"    offcputime -t 188      # only trace thread 188\n"
+"    offcputime -u          # only trace user threads (no kernel)\n"
+"    offcputime -k          # only trace kernel threads (no user)\n";
+
+#define OPT_PERF_MAX_STACK_DEPTH       1 /* --perf-max-stack-depth */
+#define OPT_STACK_STORAGE_SIZE         2 /* --stack-storage-size */
+#define OPT_STATE                      3 /* --state */
+
+static const struct argp_option opts[] = {
+       { "pid", 'p', "PID", 0, "Trace this PID only" },
+       { "tid", 't', "TID", 0, "Trace this TID only" },
+       { "user-threads-only", 'u', NULL, 0,
+         "User threads only (no kernel threads)" },
+       { "kernel-threads-only", 'k', NULL, 0,
+         "Kernel threads only (no user threads)" },
+       { "perf-max-stack-depth", OPT_PERF_MAX_STACK_DEPTH,
+         "PERF-MAX-STACK-DEPTH", 0, "the limit for both kernel and user stack traces (default 127)" },
+       { "stack-storage-size", OPT_STACK_STORAGE_SIZE, "STACK-STORAGE-SIZE", 0,
+         "the number of unique stack traces that can be stored and displayed (default 1024)" },
+       { "min-block-time", 'm', "MIN-BLOCK-TIME", 0,
+         "the amount of time in microseconds over which we store traces (default 1)" },
+       { "max-block-time", 'M', "MAX-BLOCK-TIME", 0,
+         "the amount of time in microseconds under which we store traces (default U64_MAX)" },
+       { "state", OPT_STATE, "STATE", 0, "filter on this thread state bitmask (eg, 2 == TASK_UNINTERRUPTIBLE) see include/linux/sched.h" },
+       { "verbose", 'v', NULL, 0, "Verbose debug output" },
+       {},
+};
+
+/* argp callback: validate and store one option/positional argument into
+ * the global env. Calls argp_usage() (which exits) on invalid input. */
+static error_t parse_arg(int key, char *arg, struct argp_state *state)
+{
+       static int pos_args;
+
+       switch (key) {
+       case 'v':
+               env.verbose = true;
+               break;
+       case 'p':
+               errno = 0;
+               env.pid = strtol(arg, NULL, 10);
+               if (errno) {
+                       fprintf(stderr, "invalid PID: %s\n", arg);
+                       argp_usage(state);
+               }
+               break;
+       case 't':
+               errno = 0;
+               env.tid = strtol(arg, NULL, 10);
+               if (errno || env.tid <= 0) {
+                       fprintf(stderr, "Invalid TID: %s\n", arg);
+                       argp_usage(state);
+               }
+               break;
+       case 'u':
+               env.user_threads_only = true;
+               break;
+       case 'k':
+               env.kernel_threads_only = true;
+               break;
+       case OPT_PERF_MAX_STACK_DEPTH:
+               errno = 0;
+               env.perf_max_stack_depth = strtol(arg, NULL, 10);
+               if (errno) {
+                       fprintf(stderr, "invalid perf max stack depth: %s\n", arg);
+                       argp_usage(state);
+               }
+               break;
+       case OPT_STACK_STORAGE_SIZE:
+               errno = 0;
+               env.stack_storage_size = strtol(arg, NULL, 10);
+               if (errno) {
+                       fprintf(stderr, "invalid stack storage size: %s\n", arg);
+                       argp_usage(state);
+               }
+               break;
+       case 'm':
+               errno = 0;
+               env.min_block_time = strtoll(arg, NULL, 10);
+               if (errno) {
+                       fprintf(stderr, "Invalid min block time (in us): %s\n", arg);
+                       argp_usage(state);
+               }
+               break;
+       case 'M':
+               errno = 0;
+               env.max_block_time = strtoll(arg, NULL, 10);
+               if (errno) {
+                       /* was "min" — copy-paste error in the message */
+                       fprintf(stderr, "Invalid max block time (in us): %s\n", arg);
+                       argp_usage(state);
+               }
+               break;
+       case OPT_STATE:
+               errno = 0;
+               env.state = strtol(arg, NULL, 10);
+               if (errno || env.state < 0 || env.state > 2) {
+                       fprintf(stderr, "Invalid task state: %s\n", arg);
+                       argp_usage(state);
+               }
+               break;
+       case ARGP_KEY_ARG:
+               /* single optional positional argument: duration in seconds */
+               if (pos_args++) {
+                       fprintf(stderr,
+                               "Unrecognized positional argument: %s\n", arg);
+                       argp_usage(state);
+               }
+               errno = 0;
+               env.duration = strtol(arg, NULL, 10);
+               if (errno || env.duration <= 0) {
+                       fprintf(stderr, "Invalid duration (in s): %s\n", arg);
+                       argp_usage(state);
+               }
+               break;
+       default:
+               return ARGP_ERR_UNKNOWN;
+       }
+       return 0;
+}
+
+/* libbpf logging callback; suppress debug chatter unless -v was given.
+ * static for consistency with the other file-local helpers. */
+static int libbpf_print_fn(enum libbpf_print_level level,
+                          const char *format, va_list args)
+{
+       if (level == LIBBPF_DEBUG && !env.verbose)
+               return 0;
+       return vfprintf(stderr, format, args);
+}
+
+/* No-op handler: its only purpose is to make sleep() in main() return
+ * early when the user presses Ctrl-C. */
+static void sig_handler(int sig)
+{
+}
+
+/*
+ * Walk the info map and print, for each (pid, tgid, stacks) key with a
+ * non-zero accumulated delta: the kernel stack, the user stack (resolved
+ * via the per-tgid symbol cache), the comm, and the total off-CPU time.
+ */
+static void print_map(struct ksyms *ksyms, struct syms_cache *syms_cache,
+                     struct offcputime_bpf *obj)
+{
+       struct key_t lookup_key = {}, next_key;
+       const struct ksym *ksym;
+       const struct syms *syms;
+       const struct sym *sym;
+       int err, i, ifd, sfd;
+       unsigned long *ip;
+       struct val_t val;
+
+       /* scratch buffer for one stack trace read from the stackmap */
+       ip = calloc(env.perf_max_stack_depth, sizeof(*ip));
+       if (!ip) {
+               fprintf(stderr, "failed to alloc ip\n");
+               return;
+       }
+
+       ifd = bpf_map__fd(obj->maps.info);
+       sfd = bpf_map__fd(obj->maps.stackmap);
+       while (!bpf_map_get_next_key(ifd, &lookup_key, &next_key)) {
+               err = bpf_map_lookup_elem(ifd, &next_key, &val);
+               if (err < 0) {
+                       fprintf(stderr, "failed to lookup info: %d\n", err);
+                       goto cleanup;
+               }
+               /* advance the cursor before any continue below */
+               lookup_key = next_key;
+               if (val.delta == 0)
+                       continue;
+               if (bpf_map_lookup_elem(sfd, &next_key.kern_stack_id, ip) != 0) {
+                       fprintf(stderr, "    [Missed Kernel Stack]\n");
+                       goto print_ustack;
+               }
+               for (i = 0; i < env.perf_max_stack_depth && ip[i]; i++) {
+                       ksym = ksyms__map_addr(ksyms, ip[i]);
+                       printf("    %s\n", ksym ? ksym->name : "Unknown");
+               }
+
+print_ustack:
+               /* -1 marks kernel threads: no user stack was captured */
+               if (next_key.user_stack_id == -1)
+                       goto skip_ustack;
+
+               if (bpf_map_lookup_elem(sfd, &next_key.user_stack_id, ip) != 0) {
+                       fprintf(stderr, "    [Missed User Stack]\n");
+                       continue;
+               }
+
+               syms = syms_cache__get_syms(syms_cache, next_key.tgid);
+               if (!syms) {
+                       fprintf(stderr, "failed to get syms\n");
+                       goto skip_ustack;
+               }
+               for (i = 0; i < env.perf_max_stack_depth && ip[i]; i++) {
+                       sym = syms__map_addr(syms, ip[i]);
+                       if (sym)
+                               printf("    %s\n", sym->name);
+                       else
+                               printf("    [unknown]\n");
+               }
+
+skip_ustack:
+               printf("    %-16s %s (%d)\n", "-", val.comm, next_key.pid);
+               printf("        %lld\n\n", val.delta);
+       }
+
+cleanup:
+       free(ip);
+}
+
+int main(int argc, char **argv)
+{
+       static const struct argp argp = {
+               .options = opts,
+               .parser = parse_arg,
+               .doc = argp_program_doc,
+       };
+       struct syms_cache *syms_cache = NULL;
+       struct ksyms *ksyms = NULL;
+       struct offcputime_bpf *obj;
+       int err;
+
+       err = argp_parse(&argp, argc, argv, 0, NULL, NULL);
+       if (err)
+               return err;
+       /* sanity-check mutually exclusive / ordered options */
+       if (env.user_threads_only && env.kernel_threads_only) {
+               fprintf(stderr, "user_threads_only and kernel_threads_only can't be used together.\n");
+               return 1;
+       }
+       if (env.min_block_time >= env.max_block_time) {
+               fprintf(stderr, "min_block_time should be smaller than max_block_time\n");
+               return 1;
+       }
+
+       libbpf_set_print(libbpf_print_fn);
+
+       err = bump_memlock_rlimit();
+       if (err) {
+               fprintf(stderr, "failed to increase rlimit: %d\n", err);
+               return 1;
+       }
+
+       obj = offcputime_bpf__open();
+       if (!obj) {
+               fprintf(stderr, "failed to open BPF object\n");
+               return 1;
+       }
+
+       /* initialize global data (filtering options) */
+       obj->rodata->targ_tgid = env.pid;
+       obj->rodata->targ_pid = env.tid;
+       obj->rodata->user_threads_only = env.user_threads_only;
+       obj->rodata->kernel_threads_only = env.kernel_threads_only;
+       obj->rodata->state = env.state;
+       obj->rodata->min_block_ns = env.min_block_time;
+       obj->rodata->max_block_ns = env.max_block_time;
+
+       /* size the stackmap before load, per CLI options */
+       bpf_map__set_value_size(obj->maps.stackmap,
+                               env.perf_max_stack_depth * sizeof(unsigned long));
+       bpf_map__set_max_entries(obj->maps.stackmap, env.stack_storage_size);
+
+       err = offcputime_bpf__load(obj);
+       if (err) {
+               fprintf(stderr, "failed to load BPF programs\n");
+               goto cleanup;
+       }
+       ksyms = ksyms__load();
+       if (!ksyms) {
+               fprintf(stderr, "failed to load kallsyms\n");
+               goto cleanup;
+       }
+       syms_cache = syms_cache__new(0);
+       if (!syms_cache) {
+               fprintf(stderr, "failed to create syms_cache\n");
+               goto cleanup;
+       }
+       err = offcputime_bpf__attach(obj);
+       if (err) {
+               fprintf(stderr, "failed to attach BPF programs\n");
+               goto cleanup;
+       }
+
+       signal(SIGINT, sig_handler);
+
+       /*
+        * We'll get sleep interrupted when someone presses Ctrl-C (which will
+        * be "handled" with noop by sig_handler).
+        */
+       sleep(env.duration);
+
+       print_map(ksyms, syms_cache, obj);
+
+cleanup:
+       offcputime_bpf__destroy(obj);
+       syms_cache__free(syms_cache);
+       ksyms__free(ksyms);
+       return err != 0;
+}
diff --git a/libbpf-tools/offcputime.h b/libbpf-tools/offcputime.h
new file mode 100644 (file)
index 0000000..43ca364
--- /dev/null
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+#ifndef __OFFCPUTIME_H
+#define __OFFCPUTIME_H
+
+#define TASK_COMM_LEN          16
+
+/* Aggregation key shared between the BPF program and userspace. */
+struct key_t {
+       __u32 pid;
+       __u32 tgid;
+       int user_stack_id;      /* -1 for kernel threads (no user stack) */
+       int kern_stack_id;
+};
+
+struct val_t {
+       __u64 delta;            /* accumulated off-CPU time, microseconds */
+       char comm[TASK_COMM_LEN];
+};
+
+#endif /* __OFFCPUTIME_H */
index 21538af15136827ef534edc7fa194bf0f0232fba..9ea0bb8a3eb35c8a1a6265321784e439f355c270 100644 (file)
@@ -3,20 +3,26 @@
 //
 // Based on ksyms improvements from Andrii Nakryiko, add more helpers.
 // 28-Feb-2020   Wenbo Zhang   Created this.
+#define _GNU_SOURCE
+#include <ctype.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
 #include <sys/resource.h>
 #include <time.h>
 #include <bpf/btf.h>
 #include <bpf/libbpf.h>
+#include <limits.h>
 #include "trace_helpers.h"
+#include "uprobe_helpers.h"
 
-#define min(x, y) ({                            \
-       typeof(x) _min1 = (x);                   \
-       typeof(y) _min2 = (y);                   \
-       (void) (&_min1 == &_min2);               \
+#define min(x, y) ({                           \
+       typeof(x) _min1 = (x);                  \
+       typeof(y) _min2 = (y);                  \
+       (void) (&_min1 == &_min2);              \
        _min1 < _min2 ? _min1 : _min2; })
 
 #define DISK_NAME_LEN  32
@@ -172,6 +178,599 @@ const struct ksym *ksyms__get_symbol(const struct ksyms *ksyms,
        return NULL;
 }
 
+struct load_range {
+       uint64_t start;
+       uint64_t end;
+       uint64_t file_off;
+};
+
+enum elf_type {
+       EXEC,
+       DYN,
+       PERF_MAP,
+       VDSO,
+       UNKNOWN,
+};
+
+struct dso {
+       char *name;
+       struct load_range *ranges;
+       int range_sz;
+       /* Dyn's first text section virtual addr at execution */
+       uint64_t sh_addr;
+       /* Dyn's first text section file offset */
+       uint64_t sh_offset;
+       enum elf_type type;
+
+       struct sym *syms;
+       int syms_sz;
+       int syms_cap;
+
+       /*
+        * libbpf's struct btf is actually a pretty efficient
+        * "set of strings" data structure, so we create an
+        * empty one and use it to store symbol names.
+        */
+       struct btf *btf;
+};
+
+struct map {
+       uint64_t start_addr;
+       uint64_t end_addr;
+       uint64_t file_off;
+       uint64_t dev_major;
+       uint64_t dev_minor;
+       uint64_t inode;
+};
+
+struct syms {
+       struct dso *dsos;
+       int dso_sz;
+};
+
+/* True if a /proc/<pid>/maps pathname refers to an on-disk file rather
+ * than an anonymous/special mapping. */
+static bool is_file_backed(const char *mapname)
+{
+#define STARTS_WITH(mapname, prefix) \
+       (!strncmp(mapname, prefix, sizeof(prefix) - 1))
+
+       return mapname[0] && !(
+               STARTS_WITH(mapname, "//anon") ||
+               STARTS_WITH(mapname, "/dev/zero") ||
+               STARTS_WITH(mapname, "/anon_hugepage") ||
+               STARTS_WITH(mapname, "[stack") ||
+               STARTS_WITH(mapname, "/SYSV") ||
+               STARTS_WITH(mapname, "[heap]") ||
+               STARTS_WITH(mapname, "[vsyscall]"));
+}
+
+/* perf map files (e.g. /tmp/perf-<pid>.map) are not supported yet. */
+static bool is_perf_map(const char *path)
+{
+       return false;
+}
+
+/* True if the mapping name is the in-memory vDSO pseudo-file. */
+static bool is_vdso(const char *path)
+{
+       return !strcmp(path, "[vdso]");
+}
+
+/* Return the ELF e_type (ET_EXEC, ET_DYN, ...) of the file at path, or
+ * -1 on error. The vDSO is rejected early so the caller falls through to
+ * its is_vdso() classification instead. */
+static int get_elf_type(const char *path)
+{
+       GElf_Ehdr hdr;
+       void *res;
+       Elf *e;
+       int fd;
+
+       if (is_vdso(path))
+               return -1;
+       e = open_elf(path, &fd);
+       if (!e)
+               return -1;
+       res = gelf_getehdr(e, &hdr);
+       close_elf(e, fd);
+       if (!res)
+               return -1;
+       return hdr.e_type;
+}
+
+/* Find the .text section of the ELF at path and report its virtual
+ * address (sh_addr) and file offset (sh_offset), needed to translate
+ * runtime addresses of ET_DYN objects back to ELF symbol values.
+ * Returns 0 on success, -1 on error or if .text is not found. */
+static int get_elf_text_scn_info(const char *path, uint64_t *addr,
+                                uint64_t *offset)
+{
+       Elf_Scn *section = NULL;
+       int fd = -1, err = -1;
+       GElf_Shdr header;
+       size_t stridx;
+       Elf *e = NULL;
+       char *name;
+
+       e = open_elf(path, &fd);
+       if (!e)
+               goto err_out;
+       err = elf_getshdrstrndx(e, &stridx);
+       if (err < 0)
+               goto err_out;
+
+       err = -1;
+       while ((section = elf_nextscn(e, section)) != 0) {
+               if (!gelf_getshdr(section, &header))
+                       continue;
+
+               name = elf_strptr(e, stridx, header.sh_name);
+               if (name && !strcmp(name, ".text")) {
+                       *addr = (uint64_t)header.sh_addr;
+                       *offset = (uint64_t)header.sh_offset;
+                       err = 0;
+                       break;
+               }
+       }
+
+err_out:
+       close_elf(e, fd);
+       return err;
+}
+
+/* Register one /proc/<pid>/maps entry: find or create the DSO named
+ * `name`, append the mapping as a load range, and classify the DSO type.
+ * Returns 0 on success, -1 on allocation/ELF failure. */
+static int syms__add_dso(struct syms *syms, struct map *map, const char *name)
+{
+       struct dso *dso = NULL;
+       int i, type;
+       void *tmp;
+
+       for (i = 0; i < syms->dso_sz; i++) {
+               if (!strcmp(syms->dsos[i].name, name)) {
+                       dso = &syms->dsos[i];
+                       break;
+               }
+       }
+
+       if (!dso) {
+               tmp = realloc(syms->dsos, (syms->dso_sz + 1) *
+                             sizeof(*syms->dsos));
+               if (!tmp)
+                       return -1;
+               syms->dsos = tmp;
+               dso = &syms->dsos[syms->dso_sz++];
+               memset(dso, 0, sizeof(*dso));
+               dso->name = strdup(name);
+               dso->btf = btf__new_empty();
+               /* check allocations; syms__free() copes with the partially
+                * initialized slot on the caller's error path */
+               if (!dso->name || !dso->btf)
+                       return -1;
+       }
+
+       tmp = realloc(dso->ranges, (dso->range_sz + 1) * sizeof(*dso->ranges));
+       if (!tmp)
+               return -1;
+       dso->ranges = tmp;
+       dso->ranges[dso->range_sz].start = map->start_addr;
+       dso->ranges[dso->range_sz].end = map->end_addr;
+       dso->ranges[dso->range_sz].file_off = map->file_off;
+       dso->range_sz++;
+       type = get_elf_type(name);
+       if (type == ET_EXEC) {
+               dso->type = EXEC;
+       } else if (type == ET_DYN) {
+               dso->type = DYN;
+               /* DYN needs .text info to map runtime addrs to ELF addrs */
+               if (get_elf_text_scn_info(name, &dso->sh_addr, &dso->sh_offset) < 0)
+                       return -1;
+       } else if (is_perf_map(name)) {
+               dso->type = PERF_MAP;
+       } else if (is_vdso(name)) {
+               dso->type = VDSO;
+       } else {
+               dso->type = UNKNOWN;
+       }
+       return 0;
+}
+
+/* Find the DSO whose load range contains addr. On success, *offset is
+ * set to the lookup offset inside that DSO: for DYN/VDSO the runtime
+ * address is rebased via the mmap file offset and the .text section
+ * addr/offset delta; for EXEC it is the address itself. */
+static struct dso *syms__find_dso(const struct syms *syms, unsigned long addr,
+                                 uint64_t *offset)
+{
+       struct load_range *range;
+       struct dso *dso;
+       int i, j;
+
+       for (i = 0; i < syms->dso_sz; i++) {
+               dso = &syms->dsos[i];
+               for (j = 0; j < dso->range_sz; j++) {
+                       range = &dso->ranges[j];
+                       if (addr <= range->start || addr >= range->end)
+                               continue;
+                       if (dso->type == DYN || dso->type == VDSO) {
+                               /* Offset within the mmap */
+                               *offset = addr - range->start + range->file_off;
+                               /* Offset within the ELF for dyn symbol lookup */
+                               *offset += dso->sh_addr - dso->sh_offset;
+                       } else {
+                               *offset = addr;
+                       }
+
+                       return dso;
+               }
+       }
+
+       return NULL;
+}
+
+/* Stub: perf map parsing is not implemented (is_perf_map() is always
+ * false, so this is currently unreachable). */
+static int dso__load_sym_table_from_perf_map(struct dso *dso)
+{
+       return -1;
+}
+
+/* Append one symbol to the DSO table. The name is interned into the
+ * btf string set; grows the array geometrically (x4/3, min 1024). */
+static int dso__add_sym(struct dso *dso, const char *name, uint64_t start,
+                       uint64_t size)
+{
+       struct sym *sym;
+       size_t new_cap;
+       void *tmp;
+       int off;
+
+       off = btf__add_str(dso->btf, name);
+       if (off < 0)
+               return off;
+
+       if (dso->syms_sz + 1 > dso->syms_cap) {
+               new_cap = dso->syms_cap * 4 / 3;
+               if (new_cap < 1024)
+                       new_cap = 1024;
+               tmp = realloc(dso->syms, sizeof(*dso->syms) * new_cap);
+               if (!tmp)
+                       return -1;
+               dso->syms = tmp;
+               dso->syms_cap = new_cap;
+       }
+
+       sym = &dso->syms[dso->syms_sz++];
+       /* while constructing, re-use pointer as just a plain offset */
+       sym->name = (void*)(unsigned long)off;
+       sym->start = start;
+       sym->size = size;
+
+       return 0;
+}
+
+/* qsort comparator: by start address, then name. Only valid after the
+ * offset->pointer name fixup in dso__load_sym_table_from_elf(). */
+static int sym_cmp(const void *p1, const void *p2)
+{
+       const struct sym *s1 = p1, *s2 = p2;
+
+       if (s1->start == s2->start)
+               return strcmp(s1->name, s2->name);
+       return s1->start < s2->start ? -1 : 1;
+}
+
+/* Walk one SYMTAB/DYNSYM section and add every named, non-zero-valued
+ * symbol to the DSO. Returns 0 on success, -1 on error. */
+static int dso__add_syms(struct dso *dso, Elf *e, Elf_Scn *section,
+                        size_t stridx, size_t symsize)
+{
+       Elf_Data *data = NULL;
+
+       while ((data = elf_getdata(section, data)) != 0) {
+               size_t i, symcount = data->d_size / symsize;
+
+               if (data->d_size % symsize)
+                       return -1;
+
+               for (i = 0; i < symcount; ++i) {
+                       const char *name;
+                       GElf_Sym sym;
+
+                       if (!gelf_getsym(data, (int)i, &sym))
+                               continue;
+                       if (!(name = elf_strptr(e, stridx, sym.st_name)))
+                               continue;
+                       if (name[0] == '\0')
+                               continue;
+
+                       /* skip undefined/absolute-zero symbols */
+                       if (sym.st_value == 0)
+                               continue;
+
+                       if (dso__add_sym(dso, name, sym.st_value, sym.st_size))
+                               goto err_out;
+               }
+       }
+
+       return 0;
+
+err_out:
+       return -1;
+}
+
+/* Release everything a DSO owns (safe on partially initialized DSOs:
+ * free(NULL)/btf__free(NULL) are no-ops). Does not free dso itself. */
+static void dso__free_fields(struct dso *dso)
+{
+       if (!dso)
+               return;
+
+       free(dso->name);
+       free(dso->ranges);
+       free(dso->syms);
+       btf__free(dso->btf);
+}
+
+/* Populate the DSO symbol table from its ELF file (or an already-open
+ * fd when fd > 0, used for the vDSO image). Interned string offsets are
+ * converted to real pointers only after all names are added, then the
+ * table is sorted for binary search. Returns 0 or -1. */
+static int dso__load_sym_table_from_elf(struct dso *dso, int fd)
+{
+       Elf_Scn *section = NULL;
+       Elf *e;
+       int i;
+
+       e = fd > 0 ? open_elf_by_fd(fd) : open_elf(dso->name, &fd);
+       if (!e)
+               return -1;
+
+       while ((section = elf_nextscn(e, section)) != 0) {
+               GElf_Shdr header;
+
+               if (!gelf_getshdr(section, &header))
+                       continue;
+
+               if (header.sh_type != SHT_SYMTAB &&
+                   header.sh_type != SHT_DYNSYM)
+                       continue;
+
+               if (dso__add_syms(dso, e, section, header.sh_link,
+                                 header.sh_entsize))
+                       goto err_out;
+       }
+
+       /* now when strings are finalized, adjust pointers properly */
+       for (i = 0; i < dso->syms_sz; i++)
+               dso->syms[i].name =
+                       btf__name_by_offset(dso->btf,
+                                           (unsigned long)dso->syms[i].name);
+
+       qsort(dso->syms, dso->syms_sz, sizeof(*dso->syms), sym_cmp);
+
+       close_elf(e, fd);
+       return 0;
+
+err_out:
+       dso__free_fields(dso);
+       close_elf(e, fd);
+       return -1;
+}
+
+/*
+ * Copy this process's [vdso] mapping into an unlinked temp file so it can
+ * be parsed with libelf like a regular ELF. Returns an open fd on
+ * success, -1 on failure.
+ */
+static int create_tmp_vdso_image(struct dso *dso)
+{
+       uint64_t start_addr, end_addr;
+       long pid = getpid();
+       char buf[PATH_MAX];
+       void *image = NULL;
+       char tmpfile[128];
+       int ret, fd = -1;
+       int found = 0;
+       uint64_t sz;
+       char *name;
+       FILE *f;
+
+       snprintf(tmpfile, sizeof(tmpfile), "/proc/%ld/maps", pid);
+       f = fopen(tmpfile, "r");
+       if (!f)
+               return -1;
+
+       while (true) {
+               ret = fscanf(f, "%lx-%lx %*s %*x %*x:%*x %*u%[^\n]",
+                            &start_addr, &end_addr, buf);
+               if (ret == EOF && feof(f))
+                       break;
+               if (ret != 3)
+                       goto err_out;
+
+               name = buf;
+               while (isspace(*name))
+                       name++;
+               if (!is_file_backed(name))
+                       continue;
+               if (is_vdso(name)) {
+                       found = 1;
+                       break;
+               }
+       }
+
+       /* bail out instead of reading uninitialized start/end_addr when no
+        * [vdso] entry was seen before EOF */
+       if (!found)
+               goto err_out;
+
+       sz = end_addr - start_addr;
+       image = malloc(sz);
+       if (!image)
+               goto err_out;
+       memcpy(image, (void *)start_addr, sz);
+
+       snprintf(tmpfile, sizeof(tmpfile),
+                "/tmp/libbpf_%ld_vdso_image_XXXXXX", pid);
+       fd = mkostemp(tmpfile, O_CLOEXEC);
+       if (fd < 0) {
+               fprintf(stderr, "failed to create temp file: %s\n",
+                       strerror(errno));
+               goto err_out;
+       }
+       /* Unlink the file to avoid leaking */
+       if (unlink(tmpfile) == -1)
+               fprintf(stderr, "failed to unlink %s: %s\n", tmpfile,
+                       strerror(errno));
+       if (write(fd, image, sz) == -1) {
+               fprintf(stderr, "failed to write to vDSO image: %s\n",
+                       strerror(errno));
+               close(fd);
+               fd = -1;
+               goto err_out;
+       }
+
+err_out:
+       fclose(f);
+       free(image);
+       return fd;
+}
+
+/* Dump the live vDSO into a temp file, then parse it as a regular ELF.
+ * The fd is consumed (closed) by dso__load_sym_table_from_elf(). */
+static int dso__load_sym_table_from_vdso_image(struct dso *dso)
+{
+       int fd = create_tmp_vdso_image(dso);
+
+       if (fd < 0)
+               return -1;
+       return dso__load_sym_table_from_elf(dso, fd);
+}
+
+/* Dispatch to the symbol-table loader matching the DSO flavor;
+ * UNKNOWN (or any unexpected type) yields -1. */
+static int dso__load_sym_table(struct dso *dso)
+{
+       switch (dso->type) {
+       case PERF_MAP:
+               return dso__load_sym_table_from_perf_map(dso);
+       case EXEC:
+       case DYN:
+               return dso__load_sym_table_from_elf(dso, 0);
+       case VDSO:
+               return dso__load_sym_table_from_vdso_image(dso);
+       default:
+               return -1;
+       }
+}
+
+/* Binary-search the (lazily loaded, sorted) symbol table for the symbol
+ * with the largest start <= offset. Note there is no upper-bound check
+ * against sym->size, so addresses past a symbol's end resolve to the
+ * nearest preceding symbol. Returns NULL if none qualifies. */
+static struct sym *dso__find_sym(struct dso *dso, uint64_t offset)
+{
+       unsigned long sym_addr;
+       int start, end, mid;
+
+       /* lazy load on first lookup */
+       if (!dso->syms && dso__load_sym_table(dso))
+               return NULL;
+
+       start = 0;
+       end = dso->syms_sz - 1;
+
+       /* find largest sym_addr <= addr using binary search */
+       while (start < end) {
+               mid = start + (end - start + 1) / 2;
+               sym_addr = dso->syms[mid].start;
+
+               if (sym_addr <= offset)
+                       start = mid;
+               else
+                       end = mid - 1;
+       }
+
+       if (start == end && dso->syms[start].start <= offset)
+               return &dso->syms[start];
+       return NULL;
+}
+
+/* Build a struct syms from a /proc/<pid>/maps-format file: every
+ * executable, file-backed mapping becomes (part of) a DSO.
+ * Returns NULL on parse or allocation failure. */
+struct syms *syms__load_file(const char *fname)
+{
+       char buf[PATH_MAX], perm[5];
+       struct syms *syms;
+       struct map map;
+       char *name;
+       FILE *f;
+       int ret;
+
+       f = fopen(fname, "r");
+       if (!f)
+               return NULL;
+
+       syms = calloc(1, sizeof(*syms));
+       if (!syms)
+               goto err_out;
+
+       while (true) {
+               ret = fscanf(f, "%lx-%lx %4s %lx %lx:%lx %lu%[^\n]",
+                            &map.start_addr, &map.end_addr, perm,
+                            &map.file_off, &map.dev_major,
+                            &map.dev_minor, &map.inode, buf);
+               if (ret == EOF && feof(f))
+                       break;
+               /* NOTE(review): lines whose pathname field is absent (e.g.
+                * perf-<PID>.map-style input) make fscanf return != 8 and
+                * abort the whole parse — confirm this is intended */
+               if (ret != 8)   /* perf-<PID>.map */
+                       goto err_out;
+
+               /* only executable mappings can hold code addresses */
+               if (perm[2] != 'x')
+                       continue;
+
+               name = buf;
+               while (isspace(*name))
+                       name++;
+               if (!is_file_backed(name))
+                       continue;
+
+               if (syms__add_dso(syms, &map, name))
+                       goto err_out;
+       }
+
+       fclose(f);
+       return syms;
+
+err_out:
+       syms__free(syms);
+       fclose(f);
+       return NULL;
+}
+
+/* Convenience wrapper: load symbols from /proc/<tgid>/maps. */
+struct syms *syms__load_pid(pid_t tgid)
+{
+       char fname[128];
+
+       snprintf(fname, sizeof(fname), "/proc/%ld/maps", (long)tgid);
+       return syms__load_file(fname);
+}
+
+/* Free a struct syms and everything it owns; NULL-safe. */
+void syms__free(struct syms *syms)
+{
+       int i;
+
+       if (!syms)
+               return;
+
+       for (i = 0; i < syms->dso_sz; i++)
+               dso__free_fields(&syms->dsos[i]);
+       free(syms->dsos);
+       free(syms);
+}
+
+/* Resolve a user-space address to a symbol: find the containing DSO,
+ * translate to a DSO-relative offset, and look it up. NULL if unknown. */
+const struct sym *syms__map_addr(const struct syms *syms, unsigned long addr)
+{
+       struct dso *dso;
+       uint64_t offset;
+
+       dso = syms__find_dso(syms, addr, &offset);
+       if (!dso)
+               return NULL;
+       return dso__find_sym(dso, offset);
+}
+
+/* Lazily populated per-tgid cache of symbol tables (linear array). */
+struct syms_cache {
+       struct {
+               struct syms *syms;
+               int tgid;
+       } *data;
+       int nr;
+};
+
+/* Create an empty cache. nr is only a preallocation hint for the data
+ * array; the cache still starts with nr == 0 entries and grows via
+ * realloc in syms_cache__get_syms(). */
+struct syms_cache *syms_cache__new(int nr)
+{
+       struct syms_cache *syms_cache;
+
+       syms_cache = calloc(1, sizeof(*syms_cache));
+       if (!syms_cache)
+               return NULL;
+       if (nr > 0)
+               syms_cache->data = calloc(nr, sizeof(*syms_cache->data));
+       return syms_cache;
+}
+
+/* Free the cache and every cached syms object; NULL-safe. */
+void syms_cache__free(struct syms_cache *syms_cache)
+{
+       int i;
+
+       if (!syms_cache)
+               return;
+
+       for (i = 0; i < syms_cache->nr; i++)
+               syms__free(syms_cache->data[i].syms);
+       free(syms_cache->data);
+       free(syms_cache);
+}
+
+/* Return the cached syms for tgid, loading on first request. Note a
+ * failed load is cached as NULL too, so it is returned (not retried) on
+ * subsequent calls; lookup is linear in the number of cached tgids. */
+struct syms *syms_cache__get_syms(struct syms_cache *syms_cache, int tgid)
+{
+       void *tmp;
+       int i;
+
+       for (i = 0; i < syms_cache->nr; i++) {
+               if (syms_cache->data[i].tgid == tgid)
+                       return syms_cache->data[i].syms;
+       }
+
+       tmp = realloc(syms_cache->data, (syms_cache->nr + 1) *
+                     sizeof(*syms_cache->data));
+       if (!tmp)
+               return NULL;
+       syms_cache->data = tmp;
+       syms_cache->data[syms_cache->nr].syms = syms__load_pid(tgid);
+       syms_cache->data[syms_cache->nr].tgid = tgid;
+       return syms_cache->data[syms_cache->nr++].syms;
+}
+
 struct partitions {
        struct partition *items;
        int sz;
@@ -330,7 +929,7 @@ void print_log2_hist(unsigned int *vals, int vals_size, const char *val_type)
 }
 
 void print_linear_hist(unsigned int *vals, int vals_size, unsigned int base,
-               unsigned int step, const char *val_type)
+                      unsigned int step, const char *val_type)
 {
        int i, stars_max = 40, idx_min = -1, idx_max = -1;
        unsigned int val, val_max = 0;
index 8dc7c1c0134ad745406862e311f0043cbef0199a..90d07feba0671d8e16c40bbec70a1b6c0a72154c 100644 (file)
@@ -20,6 +20,25 @@ const struct ksym *ksyms__map_addr(const struct ksyms *ksyms,
 const struct ksym *ksyms__get_symbol(const struct ksyms *ksyms,
                                     const char *name);
 
/* One symbol as returned by syms__map_addr(). */
struct sym {
	const char *name;	/* symbol name */
	unsigned long start;	/* start address — presumably DSO-relative, see syms__map_addr(); confirm in trace_helpers.c */
	unsigned long size;	/* symbol size in bytes */
};
+
struct syms;	/* opaque: one process's loaded symbol tables */

/* Load symbol tables for a live process / from a file on disk.
 * Both return NULL on failure; release with syms__free(). */
struct syms *syms__load_pid(int tgid);
struct syms *syms__load_file(const char *fname);
void syms__free(struct syms *syms);
/* Map an address to its symbol; NULL when it cannot be resolved. */
const struct sym *syms__map_addr(const struct syms *syms, unsigned long addr);

struct syms_cache;	/* opaque: lazily populated tgid -> syms map */

/* @nr is a preallocation hint; entries are loaded on first lookup. */
struct syms_cache *syms_cache__new(int nr);
struct syms *syms_cache__get_syms(struct syms_cache *syms_cache, int tgid);
void syms_cache__free(struct syms_cache *syms_cache);
+
 struct partition {
        char *name;
        unsigned int dev;
index fbce9b89c80c3b50c3b6c9f1ee44c3f8a7becf9d..4a627a919944699d5fb0aa073f14384b667e7adb 100644 (file)
@@ -159,7 +159,7 @@ int resolve_binary_path(const char *binary, pid_t pid, char *path, size_t path_s
  * Opens an elf at `path` of kind ELF_K_ELF.  Returns NULL on failure.  On
  * success, close with close_elf(e, fd_close).
  */
-static Elf *open_elf(const char *path, int *fd_close)
+Elf *open_elf(const char *path, int *fd_close)
 {
        int fd;
        Elf *e;
@@ -189,7 +189,30 @@ static Elf *open_elf(const char *path, int *fd_close)
        return e;
 }
 
-static void close_elf(Elf *e, int fd_close)
+Elf *open_elf_by_fd(int fd)
+{
+       Elf *e;
+
+       if (elf_version(EV_CURRENT) == EV_NONE) {
+               warn("elf init failed\n");
+               return NULL;
+       }
+       e = elf_begin(fd, ELF_C_READ, NULL);
+       if (!e) {
+               warn("elf_begin failed: %s\n", elf_errmsg(-1));
+               close(fd);
+               return NULL;
+       }
+       if (elf_kind(e) != ELF_K_ELF) {
+               warn("elf kind %d is not ELF_K_ELF\n", elf_kind(e));
+               elf_end(e);
+               close(fd);
+               return NULL;
+       }
+       return e;
+}
+
/* Counterpart to open_elf()/open_elf_by_fd(): end the libelf session,
 * then close the underlying file descriptor. */
void close_elf(Elf *e, int fd_close)
{
	elf_end(e);
	close(fd_close);
}
index c8a758036efb9dee8fbf9f448043e765540eb16a..47f77bb20baac9d79322c64a48f70ccced4a42d9 100644 (file)
@@ -5,10 +5,14 @@
 
 #include <sys/types.h>
 #include <unistd.h>
+#include <gelf.h>
 
 int get_pid_binary_path(pid_t pid, char *path, size_t path_sz);
 int get_pid_lib_path(pid_t pid, const char *lib, char *path, size_t path_sz);
 int resolve_binary_path(const char *binary, pid_t pid, char *path, size_t path_sz);
 off_t get_elf_func_offset(const char *path, const char *func);
/* ELF accessors shared with trace_helpers: open an ELF_K_ELF handle by
 * path (fd to close is returned via *fd_close) or from an already-open
 * fd (closed on failure). NULL on failure; release with close_elf(). */
Elf *open_elf(const char *path, int *fd_close);
Elf *open_elf_by_fd(int fd);
void close_elf(Elf *e, int fd_close);
 
 #endif /* __UPROBE_HELPERS_H */