From 594a58d67467495761d2d0826555612785dfcb90 Mon Sep 17 00:00:00 2001
From: =?utf8?q?S=C3=B8ren=20Sandmann=20Pedersen?=
Date: Sat, 5 Sep 2009 17:15:19 -0400
Subject: [PATCH] Initial port to perf counters

---
 barrier.h      |  33 +++
 collector.c    | 621 +++++++++++++++++++++++++++++++-------------
 perf_counter.h | 804 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 process.c      |  12 +-
 process.h      |   1 +
 5 files changed, 1281 insertions(+), 190 deletions(-)
 create mode 100644 barrier.h
 create mode 100644 perf_counter.h

diff --git a/barrier.h b/barrier.h
new file mode 100644
index 0000000..8d468c2
--- /dev/null
+++ b/barrier.h
@@ -0,0 +1,33 @@
+#if defined(__i386__)
+#define rmb() asm volatile("lock; addl $0,0(%%esp)" ::: "memory")
+#define cpu_relax() asm volatile("rep; nop" ::: "memory");
+#endif
+
+#if defined(__x86_64__)
+#define rmb() asm volatile("lfence" ::: "memory")
+#define cpu_relax() asm volatile("rep; nop" ::: "memory");
+#endif
+
+#ifdef __powerpc__
+#define rmb() asm volatile ("sync" ::: "memory")
+#define cpu_relax() asm volatile ("" ::: "memory");
+#endif
+
+#ifdef __s390__
+#define rmb() asm volatile("bcr 15,0" ::: "memory")
+#define cpu_relax() asm volatile("" ::: "memory");
+#endif
+
+#ifdef __sh__
+#if defined(__SH4A__) || defined(__SH5__)
+# define rmb() asm volatile("synco" ::: "memory")
+#else
+# define rmb() asm volatile("" ::: "memory")
+#endif
+#define cpu_relax() asm volatile("" ::: "memory")
+#endif
+
+#ifdef __hppa__
+#define rmb() asm volatile("" ::: "memory")
+#define cpu_relax() asm volatile("" ::: "memory");
+#endif
diff --git a/collector.c b/collector.c
index ce18e47..524696c 100644
--- a/collector.c
+++ b/collector.c
@@ -17,6 +17,17 @@
  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  */
 
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
 #include "stackstash.h"
 #include "collector.h"
 #include "module/sysprof-module.h"
@@ -24,17 +35,66 @@
 #include "process.h"
 #include "elfparser.h"
 
-#include
-#include
-#include
-#include
-#include
-#include
+#include "perf_counter.h"
+#include "barrier.h"
 
-#define SYSPROF_FILE "/dev/sysprof-trace"
+#define N_PAGES 128 /* Number of pages in the ringbuffer */
 
-static void set_no_module_error (GError **err);
-static void set_cant_open_error (GError **err, int eno);
+typedef struct counter_t counter_t;
+typedef struct sample_event_t sample_event_t;
+typedef struct mmap_event_t mmap_event_t;
+typedef struct comm_event_t comm_event_t;
+typedef union counter_event_t counter_event_t;
+typedef void (* event_callback_t) (counter_event_t *event, gpointer data);
+
+struct counter_t
+{
+    int fd;
+    struct perf_counter_mmap_page * mmap_page;
+    uint8_t * data;
+
+    uint64_t tail;
+    int cpu;
+
+    event_callback_t callback;
+    gpointer user_data;
+
+    GString * partial;
+};
+
+struct sample_event_t
+{
+    struct perf_event_header header;
+    uint64_t ip;
+    uint32_t pid, tid;
+    uint64_t n_ips;
+    uint64_t ips[1];
+};
+
+struct comm_event_t
+{
+    struct perf_event_header header;
+    uint32_t pid, tid;
+    char comm[];
+};
+
+struct mmap_event_t
+{
+    struct perf_event_header header;
+
+    uint32_t pid, tid;
+    uint64_t addr;
+    uint64_t pgoff;
+    char filename[1];
+};
+
+union counter_event_t
+{
+    struct perf_event_header header;
+    mmap_event_t mmap;
+    comm_event_t comm;
+    sample_event_t sample;
+};
 
 struct Collector
 {
@@ -42,23 +102,256 @@ struct Collector
     gpointer data;
     StackStash * stash;
-    int fd;
     GTimeVal latest_reset;
+    int n_samples;
-    SysprofMmapArea * map_area;
-    unsigned int 
current; + + GList * counters; }; +static int +get_n_cpus (void) +{ + return sysconf (_SC_NPROCESSORS_ONLN); +} + +static int +sysprof_perf_counter_open (struct perf_counter_attr *attr, + pid_t pid, + int cpu, + int group_fd, + unsigned long flags) +{ + attr->size = sizeof(*attr); + + return syscall (__NR_perf_counter_open, attr, pid, cpu, group_fd, flags); +} + +static int +process_event (counter_t *counter, void *data) +{ + struct perf_event_header *header = data; + + counter->callback ((counter_event_t *)header, counter->user_data); + + return header->size; +} + +static int +n_missing_bytes (const uint8_t *data, int n_bytes) +{ + const struct perf_event_header *header; + + if (n_bytes < sizeof (*header)) + return sizeof (*header) - n_bytes; + + header = (const void *)data; + + if (n_bytes < header->size) + return header->size - n_bytes; + + return 0; +} + +static void +process_events (counter_t *counter, uint8_t *data, int n_bytes, + GString *partial) +{ + int n_events; + ssize_t n_missing; + + n_events = 0; + + while (n_bytes && (n_missing = n_missing_bytes ( + (uint8_t *)partial->str, partial->len))) + { + n_missing = MIN (n_bytes, n_missing); + + g_string_append_len (partial, (const char *)data, n_missing); + + data += n_missing; + n_bytes -= n_missing; + } + + if (partial->len) + { + int n_used; + + if (n_missing_bytes ((uint8_t *)partial->str, partial->len)) + return; + + n_used = process_event (counter, partial->str); + + g_assert (n_used == partial->len); + + g_string_truncate (partial, 0); + } + + while (n_bytes && !n_missing_bytes (data, n_bytes)) + { + int n_used = process_event (counter, data); + + data += n_used; + n_bytes -= n_used; + } + + if (n_missing_bytes (data, n_bytes)) + g_string_append_len (partial, (char *)data, n_bytes); +} + + +static void +on_read (gpointer data) +{ + uint64_t head, tail; + counter_t *counter = data; + int mask = (N_PAGES * process_get_page_size() - 1); + uint64_t size; + int diff; + + tail = counter->tail; + + head = counter->mmap_page->data_head; + rmb(); + + diff = head - tail; + + if (diff < 0) + { + g_warning ("sysprof fails at reading the buffer\n"); + + tail = head; + } + + size = head - tail; + + if ((tail & mask) + size != (head & mask)) + { + size = mask + 1 - (tail & mask); + + g_assert ((tail & mask) + size <= (N_PAGES * process_get_page_size())); + + process_events (counter, counter->data + (tail & mask), + size, counter->partial); + + tail += size; + } + + size = head - tail; + + g_assert ((tail & mask) + size <= (N_PAGES * process_get_page_size())); + + process_events (counter, + counter->data + (tail & mask), size, counter->partial); + + tail += size; + + counter->tail = tail; + counter->mmap_page->data_tail = tail; +} + +#define fail(x) + +static counter_t * +counter_new (int cpu, + event_callback_t callback, + gpointer data) +{ + struct perf_counter_attr attr; + counter_t *counter; + int fd; + + counter = g_new (counter_t, 1); + + memset (&attr, 0, sizeof (attr)); + + attr.type = PERF_TYPE_HARDWARE; + attr.config = PERF_COUNT_HW_CPU_CYCLES; + attr.sample_period = 1200000 ; /* In number of clock cycles - + * use frequency instead FIXME + */ + attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_CALLCHAIN; + attr.wakeup_events = 100000; + attr.disabled = TRUE; + attr.mmap = TRUE; + attr.comm = TRUE; + + fd = sysprof_perf_counter_open (&attr, -1, cpu, -1, 0); + + if (fd < 0) + { + fail ("perf_counter_open"); + return NULL; + } + + counter->fd = fd; + counter->mmap_page = mmap ( + NULL, (N_PAGES + 1) * 
process_get_page_size(), + PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + + if (counter->mmap_page == MAP_FAILED) + { + fail ("mmap"); + return NULL; + } + + counter->data = (uint8_t *)counter->mmap_page + process_get_page_size (); + counter->tail = 0; + counter->cpu = cpu; + counter->partial = g_string_new (NULL); + counter->callback = callback; + counter->user_data = data; + + fd_add_watch (fd, counter); + fd_set_read_callback (fd, on_read); + + return counter; +} + +static void +counter_enable (counter_t *counter) +{ + ioctl (counter->fd, PERF_COUNTER_IOC_ENABLE); +} + +static void +counter_free (counter_t *counter) +{ + munmap (counter->mmap_page, (N_PAGES + 1) * process_get_page_size()); + fd_remove_watch (counter->fd); + + close (counter->fd); + g_string_free (counter->partial, TRUE); + + g_free (counter); +} + +/* + * Collector + */ +void +collector_reset (Collector *collector) +{ + if (collector->stash) + stack_stash_unref (collector->stash); + + process_flush_caches(); + + collector->stash = stack_stash_new (NULL); + collector->n_samples = 0; + + g_get_current_time (&collector->latest_reset); +} + /* callback is called whenever a new sample arrives */ Collector * collector_new (CollectorFunc callback, - gpointer data) + gpointer data) { Collector *collector = g_new0 (Collector, 1); collector->callback = callback; collector->data = data; - collector->fd = -1; collector->stash = NULL; collector_reset (collector); @@ -88,9 +381,9 @@ static void add_trace_to_stash (const SysprofStackTrace *trace, StackStash *stash) { - int i; - gulong *addrs; Process *process = process_get_from_pid (trace->pid); + gulong *addrs; + int i; int n_addresses; int n_kernel_words; int a; @@ -165,200 +458,183 @@ in_dead_period (Collector *collector) return FALSE; } - - static void -collect_traces (Collector *collector) +process_mmap (Collector *collector, + mmap_event_t *mmap) { - gboolean first; - /* After a reset we ignore samples for a short period so that - * a reset will actually cause 'samples' to become 0 - */ - if (in_dead_period (collector)) - { - collector->current = collector->map_area->head; - return; - } - - first = collector->n_samples == 0; - - while (collector->current != collector->map_area->head) - { - const SysprofStackTrace *trace; - - trace = &(collector->map_area->traces[collector->current]); - -#if 0 - { - int i; - g_print ("pid: %d (%d)\n", trace->pid, trace->n_addresses); - for (i=0; i < trace->n_addresses; ++i) - g_print ("rd: %08x\n", trace->addresses[i]); - g_print ("-=-\n"); - } -#endif -#if 0 - { - int i; - g_print ("pid: %d (%d)\n", trace->pid, trace->n_addresses); - for (i=0; i < trace->n_kernel_words; ++i) - g_print ("rd: %08x\n", trace->kernel_stack[i]); - g_print ("-=-\n"); - } -#endif - - add_trace_to_stash (trace, collector->stash); - - collector->current++; - if (collector->current >= SYSPROF_N_TRACES) - collector->current = 0; - - collector->n_samples++; - } - - if (collector->callback) - collector->callback (first, collector->data); } static void -on_read (gpointer data) +process_comm (Collector *collector, + comm_event_t *comm) { - Collector *collector = data; - char c; - /* Make sure poll() doesn't fire immediately again */ - read (collector->fd, &c, 1); - - collect_traces (collector); } static gboolean -load_module (void) +is_context (uint64_t addr) { - int exit_status = -1; - char *dummy1, *dummy2; - - if (g_spawn_command_line_sync ("/sbin/modprobe sysprof-module", - &dummy1, &dummy2, - &exit_status, - NULL)) - { - if (WIFEXITED (exit_status)) - exit_status = 
WEXITSTATUS (exit_status); - - g_free (dummy1); - g_free (dummy2); - } - - return (exit_status == 0); + return + addr == PERF_CONTEXT_HV || + addr == PERF_CONTEXT_KERNEL || + addr == PERF_CONTEXT_USER || + addr == PERF_CONTEXT_GUEST || + addr == PERF_CONTEXT_GUEST_KERNEL || + addr == PERF_CONTEXT_GUEST_USER; } -static gboolean -open_fd (Collector *collector, - GError **err) +static void +process_sample (Collector *collector, + sample_event_t *sample) { - int fd; - void *map_area; - - fd = open (SYSPROF_FILE, O_RDONLY); - if (fd < 0) + Process *process = process_get_from_pid (sample->pid); + gboolean first = collector->n_samples == 0; + uint64_t context = 0; + gulong addrs_stack[2048]; + gulong *addrs; + int n_alloc; + int i; + gulong *a; + + n_alloc = sample->n_ips + 2; + if (n_alloc < 2048) + addrs = addrs_stack; + else + addrs = g_new (gulong, n_alloc); + + a = addrs; + for (i = 0; i < sample->n_ips; ++i) { - if (load_module()) + uint64_t addr = sample->ips[i]; + + if (is_context (addr)) { - GTimer *timer = g_timer_new (); - - while (fd < 0 && g_timer_elapsed (timer, NULL) < 0.5) + /* FIXME: think this through */ + if (context == PERF_CONTEXT_KERNEL) + *a++ = 0x01; /* kernel marker */ + + context = addr; + } + else + { + if (context == PERF_CONTEXT_KERNEL) { - /* Wait for udev to discover the new device. - */ - usleep (100000); - - errno = 0; - fd = open (SYSPROF_FILE, O_RDONLY); + if (process_is_kernel_address (addr)) + *a++ = addr; } - - g_timer_destroy (timer); - - if (fd < 0) + else { - set_cant_open_error (err, errno); - return FALSE; + if (!context) + g_print ("no context\n"); + + process_ensure_map (process, sample->pid, addr); + + *a++ = addr; } } - - if (fd < 0) - { - set_no_module_error (err); - - return FALSE; - } } + + *a++ = (gulong)process; + + stack_stash_add_trace (collector->stash, addrs, a - addrs, 1); - map_area = mmap (NULL, sizeof (SysprofMmapArea), - PROT_READ, MAP_SHARED, fd, 0); + collector->n_samples++; + + if (collector->callback) + collector->callback (first, collector->data); + + if (addrs != addrs_stack) + g_free (addrs); +} + +static void +on_event (counter_event_t * event, + gpointer data) + +{ + Collector *collector = data; - if (map_area == MAP_FAILED) + switch (event->header.type) { - close (fd); - set_cant_open_error (err, errno); + case PERF_EVENT_MMAP: + process_mmap (collector, &event->mmap); + break; + + case PERF_EVENT_LOST: + break; + + case PERF_EVENT_COMM: + process_comm (collector, &event->comm); + break; + + case PERF_EVENT_EXIT: + break; + + case PERF_EVENT_THROTTLE: + break; + + case PERF_EVENT_UNTHROTTLE: + break; + + case PERF_EVENT_FORK: + break; + + case PERF_EVENT_READ: + break; + + case PERF_EVENT_SAMPLE: + process_sample (collector, &event->sample); + break; - return FALSE; + default: + g_print ("unknown event: %d (%d)\n", + event->header.type, event->header.size); + break; } - - collector->map_area = map_area; - collector->current = 0; - collector->fd = fd; - fd_add_watch (collector->fd, collector); - - return TRUE; } gboolean collector_start (Collector *collector, GError **err) { - if (collector->fd < 0 && !open_fd (collector, err)) - return FALSE; + int n_cpus = get_n_cpus (); + GList *list; + int i; + + for (i = 0; i < n_cpus; ++i) + { + counter_t *counter = counter_new (i, on_event, collector); + + collector->counters = g_list_append (collector->counters, counter); + } /* Hack to make sure we parse the kernel symbols before * starting collection, so the parsing doesn't interfere * with the profiling. 
*/ process_is_kernel_address (0); + + for (list = collector->counters; list != NULL; list = list->next) + counter_enable (list->data); - fd_set_read_callback (collector->fd, on_read); return TRUE; } void collector_stop (Collector *collector) { - if (collector->fd >= 0) + GList *list; + + for (list = collector->counters; list != NULL; list = list->next) { - fd_remove_watch (collector->fd); + counter_t *counter = list->data; - munmap (collector->map_area, sizeof (SysprofMmapArea)); - collector->map_area = NULL; - collector->current = 0; - - close (collector->fd); - collector->fd = -1; + counter_free (counter); } -} -void -collector_reset (Collector *collector) -{ - if (collector->stash) - stack_stash_unref (collector->stash); - - process_flush_caches(); - - collector->stash = stack_stash_new (NULL); - collector->n_samples = 0; - - g_get_current_time (&collector->latest_reset); + g_list_free (collector->counters); + collector->counters = NULL; } int @@ -464,7 +740,7 @@ lookup_symbol (Process *process, gpointer address, static void resolve_symbols (GList *trace, gint size, gpointer data) { - static const char *const everything = "Everything"; + static const char *const everything = "[Everything]"; GList *list; ResolveInfo *info = data; Process *process = g_list_last (trace)->data; @@ -538,31 +814,6 @@ collector_create_profile (Collector *collector) return profile; } -static void -set_no_module_error (GError **err) -{ - g_set_error (err, - COLLECTOR_ERROR, - COLLECTOR_ERROR_CANT_OPEN_FILE, - "Can't open " SYSPROF_FILE ". You need to insert " - "the sysprof kernel module. Run\n" - "\n" - " modprobe sysprof-module\n" - "\n" - "as root"); -} - -static void -set_cant_open_error (GError **err, - int eno) -{ - g_set_error (err, - COLLECTOR_ERROR, - COLLECTOR_ERROR_CANT_OPEN_FILE, - "Can't open " SYSPROF_FILE ": %s", - g_strerror (eno)); -} - GQuark collector_error_quark (void) { diff --git a/perf_counter.h b/perf_counter.h new file mode 100644 index 0000000..4d3ad31 --- /dev/null +++ b/perf_counter.h @@ -0,0 +1,804 @@ +/* + * Performance counters: + * + * Copyright (C) 2008-2009, Thomas Gleixner + * Copyright (C) 2008-2009, Red Hat, Inc., Ingo Molnar + * Copyright (C) 2008-2009, Red Hat, Inc., Peter Zijlstra + * + * Data type definitions, declarations, prototypes. 
+ * + * Started by: Thomas Gleixner and Ingo Molnar + * + * For licencing details see kernel-base/COPYING + */ +#ifndef _LINUX_PERF_COUNTER_H +#define _LINUX_PERF_COUNTER_H + +#include +#include +#include + +/* + * User-space ABI bits: + */ + +/* + * attr.type + */ +enum perf_type_id { + PERF_TYPE_HARDWARE = 0, + PERF_TYPE_SOFTWARE = 1, + PERF_TYPE_TRACEPOINT = 2, + PERF_TYPE_HW_CACHE = 3, + PERF_TYPE_RAW = 4, + + PERF_TYPE_MAX, /* non-ABI */ +}; + +/* + * Generalized performance counter event types, used by the + * attr.event_id parameter of the sys_perf_counter_open() + * syscall: + */ +enum perf_hw_id { + /* + * Common hardware events, generalized by the kernel: + */ + PERF_COUNT_HW_CPU_CYCLES = 0, + PERF_COUNT_HW_INSTRUCTIONS = 1, + PERF_COUNT_HW_CACHE_REFERENCES = 2, + PERF_COUNT_HW_CACHE_MISSES = 3, + PERF_COUNT_HW_BRANCH_INSTRUCTIONS = 4, + PERF_COUNT_HW_BRANCH_MISSES = 5, + PERF_COUNT_HW_BUS_CYCLES = 6, + + PERF_COUNT_HW_MAX, /* non-ABI */ +}; + +/* + * Generalized hardware cache counters: + * + * { L1-D, L1-I, LLC, ITLB, DTLB, BPU } x + * { read, write, prefetch } x + * { accesses, misses } + */ +enum perf_hw_cache_id { + PERF_COUNT_HW_CACHE_L1D = 0, + PERF_COUNT_HW_CACHE_L1I = 1, + PERF_COUNT_HW_CACHE_LL = 2, + PERF_COUNT_HW_CACHE_DTLB = 3, + PERF_COUNT_HW_CACHE_ITLB = 4, + PERF_COUNT_HW_CACHE_BPU = 5, + + PERF_COUNT_HW_CACHE_MAX, /* non-ABI */ +}; + +enum perf_hw_cache_op_id { + PERF_COUNT_HW_CACHE_OP_READ = 0, + PERF_COUNT_HW_CACHE_OP_WRITE = 1, + PERF_COUNT_HW_CACHE_OP_PREFETCH = 2, + + PERF_COUNT_HW_CACHE_OP_MAX, /* non-ABI */ +}; + +enum perf_hw_cache_op_result_id { + PERF_COUNT_HW_CACHE_RESULT_ACCESS = 0, + PERF_COUNT_HW_CACHE_RESULT_MISS = 1, + + PERF_COUNT_HW_CACHE_RESULT_MAX, /* non-ABI */ +}; + +/* + * Special "software" counters provided by the kernel, even if the hardware + * does not support performance counters. These counters measure various + * physical and sw events of the kernel (and allow the profiling of them as + * well): + */ +enum perf_sw_ids { + PERF_COUNT_SW_CPU_CLOCK = 0, + PERF_COUNT_SW_TASK_CLOCK = 1, + PERF_COUNT_SW_PAGE_FAULTS = 2, + PERF_COUNT_SW_CONTEXT_SWITCHES = 3, + PERF_COUNT_SW_CPU_MIGRATIONS = 4, + PERF_COUNT_SW_PAGE_FAULTS_MIN = 5, + PERF_COUNT_SW_PAGE_FAULTS_MAJ = 6, + + PERF_COUNT_SW_MAX, /* non-ABI */ +}; + +/* + * Bits that can be set in attr.sample_type to request information + * in the overflow packets. 
+ */ +enum perf_counter_sample_format { + PERF_SAMPLE_IP = 1U << 0, + PERF_SAMPLE_TID = 1U << 1, + PERF_SAMPLE_TIME = 1U << 2, + PERF_SAMPLE_ADDR = 1U << 3, + PERF_SAMPLE_READ = 1U << 4, + PERF_SAMPLE_CALLCHAIN = 1U << 5, + PERF_SAMPLE_ID = 1U << 6, + PERF_SAMPLE_CPU = 1U << 7, + PERF_SAMPLE_PERIOD = 1U << 8, + PERF_SAMPLE_STREAM_ID = 1U << 9, + PERF_SAMPLE_RAW = 1U << 10, + + PERF_SAMPLE_MAX = 1U << 11, /* non-ABI */ +}; + +/* + * The format of the data returned by read() on a perf counter fd, + * as specified by attr.read_format: + * + * struct read_format { + * { u64 value; + * { u64 time_enabled; } && PERF_FORMAT_ENABLED + * { u64 time_running; } && PERF_FORMAT_RUNNING + * { u64 id; } && PERF_FORMAT_ID + * } && !PERF_FORMAT_GROUP + * + * { u64 nr; + * { u64 time_enabled; } && PERF_FORMAT_ENABLED + * { u64 time_running; } && PERF_FORMAT_RUNNING + * { u64 value; + * { u64 id; } && PERF_FORMAT_ID + * } cntr[nr]; + * } && PERF_FORMAT_GROUP + * }; + */ +enum perf_counter_read_format { + PERF_FORMAT_TOTAL_TIME_ENABLED = 1U << 0, + PERF_FORMAT_TOTAL_TIME_RUNNING = 1U << 1, + PERF_FORMAT_ID = 1U << 2, + PERF_FORMAT_GROUP = 1U << 3, + + PERF_FORMAT_MAX = 1U << 4, /* non-ABI */ +}; + +#define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */ + +/* + * Hardware event to monitor via a performance monitoring counter: + */ +struct perf_counter_attr { + + /* + * Major type: hardware/software/tracepoint/etc. + */ + __u32 type; + + /* + * Size of the attr structure, for fwd/bwd compat. + */ + __u32 size; + + /* + * Type specific configuration information. + */ + __u64 config; + + union { + __u64 sample_period; + __u64 sample_freq; + }; + + __u64 sample_type; + __u64 read_format; + + __u64 disabled : 1, /* off by default */ + inherit : 1, /* children inherit it */ + pinned : 1, /* must always be on PMU */ + exclusive : 1, /* only group on PMU */ + exclude_user : 1, /* don't count user */ + exclude_kernel : 1, /* ditto kernel */ + exclude_hv : 1, /* ditto hypervisor */ + exclude_idle : 1, /* don't count when idle */ + mmap : 1, /* include mmap data */ + comm : 1, /* include comm data */ + freq : 1, /* use freq, not period */ + inherit_stat : 1, /* per task counts */ + enable_on_exec : 1, /* next exec enables */ + task : 1, /* trace fork/exit */ + + __reserved_1 : 50; + + __u32 wakeup_events; /* wakeup every n events */ + __u32 __reserved_2; + + __u64 __reserved_3; +}; + +/* + * Ioctls that can be done on a perf counter fd: + */ +#define PERF_COUNTER_IOC_ENABLE _IO ('$', 0) +#define PERF_COUNTER_IOC_DISABLE _IO ('$', 1) +#define PERF_COUNTER_IOC_REFRESH _IO ('$', 2) +#define PERF_COUNTER_IOC_RESET _IO ('$', 3) +#define PERF_COUNTER_IOC_PERIOD _IOW('$', 4, u64) + +enum perf_counter_ioc_flags { + PERF_IOC_FLAG_GROUP = 1U << 0, +}; + +/* + * Structure of the page that can be mapped via mmap + */ +struct perf_counter_mmap_page { + __u32 version; /* version number of this structure */ + __u32 compat_version; /* lowest version this is compat with */ + + /* + * Bits needed to read the hw counters in user-space. + * + * u32 seq; + * s64 count; + * + * do { + * seq = pc->lock; + * + * barrier() + * if (pc->index) { + * count = pmc_read(pc->index - 1); + * count += pc->offset; + * } else + * goto regular_read; + * + * barrier(); + * } while (pc->lock != seq); + * + * NOTE: for obvious reason this only works on self-monitoring + * processes. 
+ */ + __u32 lock; /* seqlock for synchronization */ + __u32 index; /* hardware counter identifier */ + __s64 offset; /* add to hardware counter value */ + __u64 time_enabled; /* time counter active */ + __u64 time_running; /* time counter on cpu */ + + /* + * Hole for extension of the self monitor capabilities + */ + + __u64 __reserved[123]; /* align to 1k */ + + /* + * Control data for the mmap() data buffer. + * + * User-space reading the @data_head value should issue an rmb(), on + * SMP capable platforms, after reading this value -- see + * perf_counter_wakeup(). + * + * When the mapping is PROT_WRITE the @data_tail value should be + * written by userspace to reflect the last read data. In this case + * the kernel will not over-write unread data. + */ + __u64 data_head; /* head in the data section */ + __u64 data_tail; /* user-space written tail */ +}; + +#define PERF_EVENT_MISC_CPUMODE_MASK (3 << 0) +#define PERF_EVENT_MISC_CPUMODE_UNKNOWN (0 << 0) +#define PERF_EVENT_MISC_KERNEL (1 << 0) +#define PERF_EVENT_MISC_USER (2 << 0) +#define PERF_EVENT_MISC_HYPERVISOR (3 << 0) + +struct perf_event_header { + __u32 type; + __u16 misc; + __u16 size; +}; + +enum perf_event_type { + + /* + * The MMAP events record the PROT_EXEC mappings so that we can + * correlate userspace IPs to code. They have the following structure: + * + * struct { + * struct perf_event_header header; + * + * u32 pid, tid; + * u64 addr; + * u64 len; + * u64 pgoff; + * char filename[]; + * }; + */ + PERF_EVENT_MMAP = 1, + + /* + * struct { + * struct perf_event_header header; + * u64 id; + * u64 lost; + * }; + */ + PERF_EVENT_LOST = 2, + + /* + * struct { + * struct perf_event_header header; + * + * u32 pid, tid; + * char comm[]; + * }; + */ + PERF_EVENT_COMM = 3, + + /* + * struct { + * struct perf_event_header header; + * u32 pid, ppid; + * u32 tid, ptid; + * }; + */ + PERF_EVENT_EXIT = 4, + + /* + * struct { + * struct perf_event_header header; + * u64 time; + * u64 id; + * u64 stream_id; + * }; + */ + PERF_EVENT_THROTTLE = 5, + PERF_EVENT_UNTHROTTLE = 6, + + /* + * struct { + * struct perf_event_header header; + * u32 pid, ppid; + * u32 tid, ptid; + * }; + */ + PERF_EVENT_FORK = 7, + + /* + * struct { + * struct perf_event_header header; + * u32 pid, tid; + * + * struct read_format values; + * }; + */ + PERF_EVENT_READ = 8, + + /* + * struct { + * struct perf_event_header header; + * + * { u64 ip; } && PERF_SAMPLE_IP + * { u32 pid, tid; } && PERF_SAMPLE_TID + * { u64 time; } && PERF_SAMPLE_TIME + * { u64 addr; } && PERF_SAMPLE_ADDR + * { u64 id; } && PERF_SAMPLE_ID + * { u64 stream_id;} && PERF_SAMPLE_STREAM_ID + * { u32 cpu, res; } && PERF_SAMPLE_CPU + * { u64 period; } && PERF_SAMPLE_PERIOD + * + * { struct read_format values; } && PERF_SAMPLE_READ + * + * { u64 nr, + * u64 ips[nr]; } && PERF_SAMPLE_CALLCHAIN + * + * # + * # The RAW record below is opaque data wrt the ABI + * # + * # That is, the ABI doesn't make any promises wrt to + * # the stability of its content, it may vary depending + * # on event, hardware, kernel version and phase of + * # the moon. + * # + * # In other words, PERF_SAMPLE_RAW contents are not an ABI. 
+ * # + * + * { u32 size; + * char data[size];}&& PERF_SAMPLE_RAW + * }; + */ + PERF_EVENT_SAMPLE = 9, + + PERF_EVENT_MAX, /* non-ABI */ +}; + +enum perf_callchain_context { + PERF_CONTEXT_HV = (__u64)-32, + PERF_CONTEXT_KERNEL = (__u64)-128, + PERF_CONTEXT_USER = (__u64)-512, + + PERF_CONTEXT_GUEST = (__u64)-2048, + PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176, + PERF_CONTEXT_GUEST_USER = (__u64)-2560, + + PERF_CONTEXT_MAX = (__u64)-4095, +}; + + + + + +#ifdef __KERNEL__ +/* + * Kernel-internal data types and definitions: + */ + +#ifdef CONFIG_PERF_COUNTERS +# include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define PERF_MAX_STACK_DEPTH 255 + +struct perf_callchain_entry { + __u64 nr; + __u64 ip[PERF_MAX_STACK_DEPTH]; +}; + +struct perf_raw_record { + u32 size; + void *data; +}; + +struct task_struct; + +/** + * struct hw_perf_counter - performance counter hardware details: + */ +struct hw_perf_counter { +#ifdef CONFIG_PERF_COUNTERS + union { + struct { /* hardware */ + u64 config; + unsigned long config_base; + unsigned long counter_base; + int idx; + }; + union { /* software */ + atomic64_t count; + struct hrtimer hrtimer; + }; + }; + atomic64_t prev_count; + u64 sample_period; + u64 last_period; + atomic64_t period_left; + u64 interrupts; + + u64 freq_count; + u64 freq_interrupts; + u64 freq_stamp; +#endif +}; + +struct perf_counter; + +/** + * struct pmu - generic performance monitoring unit + */ +struct pmu { + int (*enable) (struct perf_counter *counter); + void (*disable) (struct perf_counter *counter); + void (*read) (struct perf_counter *counter); + void (*unthrottle) (struct perf_counter *counter); +}; + +/** + * enum perf_counter_active_state - the states of a counter + */ +enum perf_counter_active_state { + PERF_COUNTER_STATE_ERROR = -2, + PERF_COUNTER_STATE_OFF = -1, + PERF_COUNTER_STATE_INACTIVE = 0, + PERF_COUNTER_STATE_ACTIVE = 1, +}; + +struct file; + +struct perf_mmap_data { + struct rcu_head rcu_head; + int nr_pages; /* nr of data pages */ + int writable; /* are we writable */ + int nr_locked; /* nr pages mlocked */ + + atomic_t poll; /* POLL_ for wakeups */ + atomic_t events; /* event limit */ + + atomic_long_t head; /* write position */ + atomic_long_t done_head; /* completed head */ + + atomic_t lock; /* concurrent writes */ + atomic_t wakeup; /* needs a wakeup */ + atomic_t lost; /* nr records lost */ + + struct perf_counter_mmap_page *user_page; + void *data_pages[0]; +}; + +struct perf_pending_entry { + struct perf_pending_entry *next; + void (*func)(struct perf_pending_entry *); +}; + +/** + * struct perf_counter - performance counter kernel representation: + */ +struct perf_counter { +#ifdef CONFIG_PERF_COUNTERS + struct list_head list_entry; + struct list_head event_entry; + struct list_head sibling_list; + int nr_siblings; + struct perf_counter *group_leader; + const struct pmu *pmu; + + enum perf_counter_active_state state; + atomic64_t count; + + /* + * These are the total time in nanoseconds that the counter + * has been enabled (i.e. eligible to run, and the task has + * been scheduled in, if this is a per-task counter) + * and running (scheduled onto the CPU), respectively. + * + * They are computed from tstamp_enabled, tstamp_running and + * tstamp_stopped when the counter is in INACTIVE or ACTIVE state. 
+ */ + u64 total_time_enabled; + u64 total_time_running; + + /* + * These are timestamps used for computing total_time_enabled + * and total_time_running when the counter is in INACTIVE or + * ACTIVE state, measured in nanoseconds from an arbitrary point + * in time. + * tstamp_enabled: the notional time when the counter was enabled + * tstamp_running: the notional time when the counter was scheduled on + * tstamp_stopped: in INACTIVE state, the notional time when the + * counter was scheduled off. + */ + u64 tstamp_enabled; + u64 tstamp_running; + u64 tstamp_stopped; + + struct perf_counter_attr attr; + struct hw_perf_counter hw; + + struct perf_counter_context *ctx; + struct file *filp; + + /* + * These accumulate total time (in nanoseconds) that children + * counters have been enabled and running, respectively. + */ + atomic64_t child_total_time_enabled; + atomic64_t child_total_time_running; + + /* + * Protect attach/detach and child_list: + */ + struct mutex child_mutex; + struct list_head child_list; + struct perf_counter *parent; + + int oncpu; + int cpu; + + struct list_head owner_entry; + struct task_struct *owner; + + /* mmap bits */ + struct mutex mmap_mutex; + atomic_t mmap_count; + struct perf_mmap_data *data; + + /* poll related */ + wait_queue_head_t waitq; + struct fasync_struct *fasync; + + /* delayed work for NMIs and such */ + int pending_wakeup; + int pending_kill; + int pending_disable; + struct perf_pending_entry pending; + + atomic_t event_limit; + + void (*destroy)(struct perf_counter *); + struct rcu_head rcu_head; + + struct pid_namespace *ns; + u64 id; +#endif +}; + +/** + * struct perf_counter_context - counter context structure + * + * Used as a container for task counters and CPU counters as well: + */ +struct perf_counter_context { + /* + * Protect the states of the counters in the list, + * nr_active, and the list: + */ + spinlock_t lock; + /* + * Protect the list of counters. Locking either mutex or lock + * is sufficient to ensure the list doesn't change; to change + * the list you need to lock both the mutex and the spinlock. + */ + struct mutex mutex; + + struct list_head counter_list; + struct list_head event_list; + int nr_counters; + int nr_active; + int is_active; + int nr_stat; + atomic_t refcount; + struct task_struct *task; + + /* + * Context clock, runs when context enabled. + */ + u64 time; + u64 timestamp; + + /* + * These fields let us detect when two contexts have both + * been cloned (inherited) from a common ancestor. 
+ */ + struct perf_counter_context *parent_ctx; + u64 parent_gen; + u64 generation; + int pin_count; + struct rcu_head rcu_head; +}; + +/** + * struct perf_counter_cpu_context - per cpu counter context structure + */ +struct perf_cpu_context { + struct perf_counter_context ctx; + struct perf_counter_context *task_ctx; + int active_oncpu; + int max_pertask; + int exclusive; + + /* + * Recursion avoidance: + * + * task, softirq, irq, nmi context + */ + int recursion[4]; +}; + +#ifdef CONFIG_PERF_COUNTERS + +/* + * Set by architecture code: + */ +extern int perf_max_counters; + +extern const struct pmu *hw_perf_counter_init(struct perf_counter *counter); + +extern void perf_counter_task_sched_in(struct task_struct *task, int cpu); +extern void perf_counter_task_sched_out(struct task_struct *task, + struct task_struct *next, int cpu); +extern void perf_counter_task_tick(struct task_struct *task, int cpu); +extern int perf_counter_init_task(struct task_struct *child); +extern void perf_counter_exit_task(struct task_struct *child); +extern void perf_counter_free_task(struct task_struct *task); +extern void set_perf_counter_pending(void); +extern void perf_counter_do_pending(void); +extern void perf_counter_print_debug(void); +extern void __perf_disable(void); +extern bool __perf_enable(void); +extern void perf_disable(void); +extern void perf_enable(void); +extern int perf_counter_task_disable(void); +extern int perf_counter_task_enable(void); +extern int hw_perf_group_sched_in(struct perf_counter *group_leader, + struct perf_cpu_context *cpuctx, + struct perf_counter_context *ctx, int cpu); +extern void perf_counter_update_userpage(struct perf_counter *counter); + +struct perf_sample_data { + struct pt_regs *regs; + u64 addr; + u64 period; + struct perf_raw_record *raw; +}; + +extern int perf_counter_overflow(struct perf_counter *counter, int nmi, + struct perf_sample_data *data); +extern void perf_counter_output(struct perf_counter *counter, int nmi, + struct perf_sample_data *data); + +/* + * Return 1 for a software counter, 0 for a hardware counter + */ +static inline int is_software_counter(struct perf_counter *counter) +{ + return (counter->attr.type != PERF_TYPE_RAW) && + (counter->attr.type != PERF_TYPE_HARDWARE) && + (counter->attr.type != PERF_TYPE_HW_CACHE); +} + +extern atomic_t perf_swcounter_enabled[PERF_COUNT_SW_MAX]; + +extern void __perf_swcounter_event(u32, u64, int, struct pt_regs *, u64); + +static inline void +perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr) +{ + if (atomic_read(&perf_swcounter_enabled[event])) + __perf_swcounter_event(event, nr, nmi, regs, addr); +} + +extern void __perf_counter_mmap(struct vm_area_struct *vma); + +static inline void perf_counter_mmap(struct vm_area_struct *vma) +{ + if (vma->vm_flags & VM_EXEC) + __perf_counter_mmap(vma); +} + +extern void perf_counter_comm(struct task_struct *tsk); +extern void perf_counter_fork(struct task_struct *tsk); + +extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); + +extern int sysctl_perf_counter_paranoid; +extern int sysctl_perf_counter_mlock; +extern int sysctl_perf_counter_sample_rate; + +extern void perf_counter_init(void); + +#ifndef perf_misc_flags +#define perf_misc_flags(regs) (user_mode(regs) ? 
PERF_EVENT_MISC_USER : \
+		PERF_EVENT_MISC_KERNEL)
+#define perf_instruction_pointer(regs) instruction_pointer(regs)
+#endif
+
+#else
+static inline void
+perf_counter_task_sched_in(struct task_struct *task, int cpu) { }
+static inline void
+perf_counter_task_sched_out(struct task_struct *task,
+			    struct task_struct *next, int cpu) { }
+static inline void
+perf_counter_task_tick(struct task_struct *task, int cpu) { }
+static inline int perf_counter_init_task(struct task_struct *child) { return 0; }
+static inline void perf_counter_exit_task(struct task_struct *child) { }
+static inline void perf_counter_free_task(struct task_struct *task) { }
+static inline void perf_counter_do_pending(void) { }
+static inline void perf_counter_print_debug(void) { }
+static inline void perf_disable(void) { }
+static inline void perf_enable(void) { }
+static inline int perf_counter_task_disable(void) { return -EINVAL; }
+static inline int perf_counter_task_enable(void) { return -EINVAL; }
+
+static inline void
+perf_swcounter_event(u32 event, u64 nr, int nmi,
+		     struct pt_regs *regs, u64 addr) { }
+
+static inline void perf_counter_mmap(struct vm_area_struct *vma) { }
+static inline void perf_counter_comm(struct task_struct *tsk) { }
+static inline void perf_counter_fork(struct task_struct *tsk) { }
+static inline void perf_counter_init(void) { }
+#endif
+
+#endif /* __KERNEL__ */
+#endif /* _LINUX_PERF_COUNTER_H */
diff --git a/process.c b/process.c
index 45b45d9..c9c8776 100644
--- a/process.c
+++ b/process.c
@@ -296,8 +296,8 @@ process_has_page (Process *process, gulong addr)
     return FALSE;
 }
 
-static int
-page_size (void)
+int
+process_get_page_size (void)
 {
     static int page_size;
     static gboolean has_page_size = FALSE;
@@ -316,7 +316,7 @@ process_ensure_map (Process *process, int pid, gulong addr)
 {
     /* Round down to closest page */
-    addr = (addr - addr % page_size());
+    addr = (addr - addr % process_get_page_size());
 
     if (process_has_page (process, addr))
 	return;
 
@@ -678,7 +678,7 @@ process_lookup_kernel_symbol (gulong address,
 const char *
 process_lookup_symbol (Process *process, gulong address, gulong *offset)
 {
-    static const char *const kernel = "kernel";
+    static const char *const kernel = "[kernel]";
     const BinSymbol *result;
     Map *map = process_locate_map (process, address);
 
@@ -756,7 +756,9 @@ process_lookup_symbol (Process *process, gulong address, gulong *offset)
 	g_print (" ---> %s\n", result->name);
 #endif
 
-/* g_print ("(%x) %x %x name; %s\n", address, map->start, map->offset, result->name); */
+/* g_print ("(%x) %x %x name; %s\n", address, map->start,
+ * map->offset, result->name);
+ */
 #if 0
     g_print ("name: %s (in %s)\n",
 	     bin_symbol_get_name (map->bin_file, result), map->filename);
diff --git a/process.h b/process.h
index a72714c..21faf4d 100644
--- a/process.h
+++ b/process.h
@@ -60,5 +60,6 @@ const guint8 *process_get_vdso_bytes (gsize *length);
 gboolean process_is_kernel_address (gulong address);
 const char * process_lookup_kernel_symbol (gulong address,
 					   gulong *offset);
+int process_get_page_size (void);
 
 #endif
-- 
2.7.4
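
A note for readers on the ring-buffer protocol the collector relies on: each counter fd is mmapped as one control page (struct perf_counter_mmap_page) followed by N_PAGES of record data. The kernel advances data_head; the reader must issue rmb() between loading data_head and touching the records (hence barrier.h), walk the perf_event_header records, and then publish its progress by storing data_tail, which is why the area is mapped PROT_READ | PROT_WRITE. The stand-alone sketch below shows only that protocol, outside sysprof's fd_add_watch() event loop. It is an illustration rather than part of the patch: it assumes the perf_counter.h and barrier.h files added above are on the include path, that the kernel headers of this era define __NR_perf_counter_open for the build architecture, and that the process is privileged enough to profile CPU 0; the single-counter main(), wakeup_events = 1 and the bounce buffer are choices made for brevity.

/* Stand-alone sketch of the perf ring-buffer read protocol; not part of the patch. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <poll.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>

#include "perf_counter.h"               /* added by this patch */
#include "barrier.h"                    /* added by this patch */

#define N_PAGES 128                     /* data pages; must be a power of two */

int
main (void)
{
    static uint64_t copy[8192];         /* 64k bounce buffer (record size is a u16) */
    struct perf_counter_attr attr;
    struct perf_counter_mmap_page *mmap_page;
    size_t page_size = sysconf (_SC_PAGESIZE);
    size_t ring_size = N_PAGES * page_size;
    uint8_t *ring;
    uint64_t tail = 0;
    int fd;

    memset (&attr, 0, sizeof attr);
    attr.size = sizeof attr;
    attr.type = PERF_TYPE_HARDWARE;
    attr.config = PERF_COUNT_HW_CPU_CYCLES;
    attr.sample_period = 1200000;
    attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_CALLCHAIN;
    attr.wakeup_events = 1;             /* wake poll() up on every sample */
    attr.disabled = 1;
    attr.mmap = 1;
    attr.comm = 1;

    /* pid = -1, cpu = 0: every task, but only while it runs on CPU 0 */
    fd = syscall (__NR_perf_counter_open, &attr, -1, 0, -1, 0);
    if (fd < 0)
    {
        perror ("perf_counter_open");
        return 1;
    }

    /* One control page followed by N_PAGES of data; PROT_WRITE lets us
     * report our progress back through data_tail, as on_read() does. */
    mmap_page = mmap (NULL, (N_PAGES + 1) * page_size,
                      PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (mmap_page == MAP_FAILED)
    {
        perror ("mmap");
        return 1;
    }

    ring = (uint8_t *)mmap_page + page_size;
    ioctl (fd, PERF_COUNTER_IOC_ENABLE, 0);

    for (;;)
    {
        struct pollfd pfd = { fd, POLLIN, 0 };
        uint64_t head;

        poll (&pfd, 1, -1);

        head = mmap_page->data_head;
        rmb ();                         /* read the records only after data_head */

        while (tail < head)
        {
            size_t offset = tail & (ring_size - 1);
            size_t until_end = ring_size - offset;
            struct perf_event_header *header =
                (struct perf_event_header *)(ring + offset);

            /* A record may wrap around the end of the ring: reassemble it
             * in the bounce buffer (process_events() uses a GString). */
            if (until_end < sizeof (*header) || until_end < header->size)
            {
                memcpy (copy, ring + offset, until_end);
                memcpy ((uint8_t *)copy + until_end, ring, sizeof copy - until_end);
                header = (struct perf_event_header *)copy;
            }

            if (header->size == 0)
                break;                  /* corrupt record; give up */

            /* see process_sample()/process_mmap() above for real parsing */
            printf ("event type %u, %u bytes\n",
                    header->type, (unsigned) header->size);

            tail += header->size;
        }

        mmap_page->data_tail = tail;    /* hand the space back to the kernel */
    }
}

Unlike this sketch, the patch opens one counter per CPU (get_n_cpus()), carries partially read records across wakeups in a GString (process_events()), and turns PERF_EVENT_SAMPLE call chains into stack traces for the stash (process_sample()).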