Merge branch 'tip/perf/core' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt...
author Ingo Molnar <mingo@elte.hu>
Fri, 23 Jul 2010 07:10:29 +0000 (09:10 +0200)
committer Ingo Molnar <mingo@elte.hu>
Fri, 23 Jul 2010 07:10:29 +0000 (09:10 +0200)
12 files changed:
Documentation/trace/ftrace-design.txt
include/linux/ftrace.h
include/linux/ftrace_event.h
include/trace/ftrace.h
kernel/trace/ring_buffer.c
kernel/trace/trace.c
kernel/trace/trace.h
kernel/trace/trace_events.c
kernel/trace/trace_irqsoff.c
kernel/trace/trace_output.c
kernel/trace/trace_sched_wakeup.c
scripts/recordmcount.pl

index f1f81af..dc52bd4 100644 (file)
@@ -13,6 +13,9 @@ Note that this focuses on architecture implementation details only.  If you
 want more explanation of a feature in terms of common code, review the common
 ftrace.txt file.
 
+Ideally, everyone who wishes to retain performance while supporting tracing in
+their kernel should make it all the way to dynamic ftrace support.
+
 
 Prerequisites
 -------------
@@ -215,7 +218,7 @@ An arch may pass in a unique value (frame pointer) to both the entering and
 exiting of a function.  On exit, the value is compared and if it does not
 match, then it will panic the kernel.  This is largely a sanity check for bad
 code generation with gcc.  If gcc for your port sanely updates the frame
-pointer under different opitmization levels, then ignore this option.
+pointer under different optimization levels, then ignore this option.
 
 However, adding support for it isn't terribly difficult.  In your assembly code
 that calls prepare_ftrace_return(), pass the frame pointer as the 3rd argument.
@@ -234,7 +237,7 @@ If you can't trace NMI functions, then skip this option.
 
 
 HAVE_SYSCALL_TRACEPOINTS
----------------------
+------------------------
 
 You need very few things to get the syscalls tracing in an arch.
 
@@ -250,12 +253,152 @@ You need very few things to get the syscalls tracing in an arch.
 HAVE_FTRACE_MCOUNT_RECORD
 -------------------------
 
-See scripts/recordmcount.pl for more info.
+See scripts/recordmcount.pl for more info.  Just fill in the arch-specific
+details for how to locate the addresses of mcount call sites via objdump.
+This option doesn't make much sense without also implementing dynamic ftrace.
 
+
+HAVE_DYNAMIC_FTRACE
+-------------------
+
+You will first need HAVE_FTRACE_MCOUNT_RECORD and HAVE_FUNCTION_TRACER, so
+scroll your reader back up if you got overeager.
+
+Once those are out of the way, you will need to implement:
+       - asm/ftrace.h:
+               - MCOUNT_ADDR
+               - ftrace_call_adjust()
+               - struct dyn_arch_ftrace{}
+       - asm code:
+               - mcount() (new stub)
+               - ftrace_caller()
+               - ftrace_call()
+               - ftrace_stub()
+       - C code:
+               - ftrace_dyn_arch_init()
+               - ftrace_make_nop()
+               - ftrace_make_call()
+               - ftrace_update_ftrace_func()
+
+First you will need to fill out some arch details in your asm/ftrace.h.
+
+Define MCOUNT_ADDR as the address of your mcount symbol similar to:
+       #define MCOUNT_ADDR ((unsigned long)mcount)
+Since no one else will have a decl for that function, you will need to:
+       extern void mcount(void);
+
+You will also need the helper function ftrace_call_adjust().  Most people
+will be able to stub it out like so:
+       static inline unsigned long ftrace_call_adjust(unsigned long addr)
+       {
+               return addr;
+       }
 <details to be filled>
 
+Lastly you will need the custom dyn_arch_ftrace structure.  If you need
+some extra state when runtime patching arbitrary call sites, this is the
+place.  For now though, create an empty struct:
+       struct dyn_arch_ftrace {
+               /* No extra data needed */
+       };
+
+With the header out of the way, we can fill out the assembly code.  While we
+did already create a mcount() function earlier, dynamic ftrace only wants a
+stub function.  This is because the mcount() will only be used during boot
+and then all references to it will be patched out never to return.  Instead,
+the guts of the old mcount() will be used to create a new ftrace_caller()
+function.  Because the two are hard to merge, it will most likely be a lot
+easier to have two separate definitions split up by #ifdefs.  Same goes for
+the ftrace_stub() as that will now be inlined in ftrace_caller().
+
+Before we get any more confused, let's check out some pseudo code so you can
+implement your own stuff in assembly:
 
-HAVE_DYNAMIC_FTRACE
----------------------
+void mcount(void)
+{
+       return;
+}
+
+void ftrace_caller(void)
+{
+       /* implement HAVE_FUNCTION_TRACE_MCOUNT_TEST if you desire */
+
+       /* save all state needed by the ABI (see paragraph above) */
+
+       unsigned long frompc = ...;
+       unsigned long selfpc = <return address> - MCOUNT_INSN_SIZE;
+
+ftrace_call:
+       ftrace_stub(frompc, selfpc);
+
+       /* restore all state needed by the ABI */
+
+ftrace_stub:
+       return;
+}
+
+This might look a little odd at first, but keep in mind that we will be runtime
+patching multiple things.  First, only functions that we actually want to trace
+will be patched to call ftrace_caller().  Second, since we only have one tracer
+active at a time, we will patch the ftrace_caller() function itself to call the
+specific tracer in question.  That is the point of the ftrace_call label.
+
+With that in mind, let's move on to the C code that will actually be doing the
+runtime patching.  You'll need a little knowledge of your arch's opcodes in
+order to make it through the next section.
+
+Every arch has an init callback function.  If you need to do something early on
+to initialize some state, this is the time to do that.  Otherwise, this simple
+function below should be sufficient for most people:
+
+int __init ftrace_dyn_arch_init(void *data)
+{
+       /* return value is done indirectly via data */
+       *(unsigned long *)data = 0;
+
+       return 0;
+}
+
+There are two functions that are used to do runtime patching of arbitrary
+functions.  The first is used to turn the mcount call site into a nop (which
+is what helps us retain runtime performance when not tracing).  The second is
+used to turn the mcount call site into a call to an arbitrary location (but
+typically that is ftrace_caller()).  See the general function definition in
+linux/ftrace.h for the functions:
+       ftrace_make_nop()
+       ftrace_make_call()
+The rec->ip value is the address of the mcount call site that was collected
+by scripts/recordmcount.pl during build time.
+
+The last function is used to do runtime patching of the active tracer.  This
+will be modifying the assembly code at the location of the ftrace_call symbol
+inside of the ftrace_caller() function.  So you should have sufficient padding
+at that location to support the new function calls you'll be inserting.  Some
+people will be using a "call" type instruction while others will be using a
+"branch" type instruction.  Specifically, the function is:
+       ftrace_update_ftrace_func()
+
+
+HAVE_DYNAMIC_FTRACE + HAVE_FUNCTION_GRAPH_TRACER
+------------------------------------------------
+
+The function grapher needs a few tweaks in order to work with dynamic ftrace.
+Basically, you will need to:
+       - update:
+               - ftrace_caller()
+               - ftrace_graph_call()
+               - ftrace_graph_caller()
+       - implement:
+               - ftrace_enable_ftrace_graph_caller()
+               - ftrace_disable_ftrace_graph_caller()
 
 <details to be filled>
+Quick notes:
+       - add a nop stub after the ftrace_call location named ftrace_graph_call;
+         stub needs to be large enough to support a call to ftrace_graph_caller()
+       - update ftrace_graph_caller() to work with being called by the new
+         ftrace_caller() since some semantics may have changed
+       - ftrace_enable_ftrace_graph_caller() will runtime patch the
+         ftrace_graph_call location with a call to ftrace_graph_caller()
+       - ftrace_disable_ftrace_graph_caller() will runtime patch the
+         ftrace_graph_call location with nops
index 41e4633..dcd6a7c 100644 (file)
@@ -1,3 +1,8 @@
+/*
+ * Ftrace header.  For implementation details beyond the random comments
+ * scattered below, see: Documentation/trace/ftrace-design.txt
+ */
+
 #ifndef _LINUX_FTRACE_H
 #define _LINUX_FTRACE_H
 
index 01df7ca..02b8b24 100644 (file)
@@ -11,8 +11,6 @@ struct trace_array;
 struct tracer;
 struct dentry;
 
-DECLARE_PER_CPU(struct trace_seq, ftrace_event_seq);
-
 struct trace_print_flags {
        unsigned long           mask;
        const char              *name;
@@ -58,6 +56,9 @@ struct trace_iterator {
        struct ring_buffer_iter *buffer_iter[NR_CPUS];
        unsigned long           iter_flags;
 
+       /* trace_seq for __print_flags() and __print_symbolic() etc. */
+       struct trace_seq        tmp_seq;
+
        /* The below is zeroed out in pipe_read */
        struct trace_seq        seq;
        struct trace_entry      *ent;
@@ -152,11 +153,13 @@ extern int ftrace_event_reg(struct ftrace_event_call *event,
 enum {
        TRACE_EVENT_FL_ENABLED_BIT,
        TRACE_EVENT_FL_FILTERED_BIT,
+       TRACE_EVENT_FL_RECORDED_CMD_BIT,
 };
 
 enum {
-       TRACE_EVENT_FL_ENABLED  = (1 << TRACE_EVENT_FL_ENABLED_BIT),
-       TRACE_EVENT_FL_FILTERED = (1 << TRACE_EVENT_FL_FILTERED_BIT),
+       TRACE_EVENT_FL_ENABLED          = (1 << TRACE_EVENT_FL_ENABLED_BIT),
+       TRACE_EVENT_FL_FILTERED         = (1 << TRACE_EVENT_FL_FILTERED_BIT),
+       TRACE_EVENT_FL_RECORDED_CMD     = (1 << TRACE_EVENT_FL_RECORDED_CMD_BIT),
 };
 
 struct ftrace_event_call {
@@ -174,6 +177,7 @@ struct ftrace_event_call {
         * 32 bit flags:
         *   bit 1:             enabled
         *   bit 2:             filter_active
+        *   bit 3:             enabled cmd record
         *
         * Changes to flags must hold the event_mutex.
         *
index 55c1fd1..fb783d9 100644 (file)
  *     struct trace_seq *s = &iter->seq;
  *     struct ftrace_raw_<call> *field; <-- defined in stage 1
  *     struct trace_entry *entry;
- *     struct trace_seq *p;
+ *     struct trace_seq *p = &iter->tmp_seq;
  *     int ret;
  *
  *     entry = iter->ent;
  *
  *     field = (typeof(field))entry;
  *
- *     p = &get_cpu_var(ftrace_event_seq);
  *     trace_seq_init(p);
  *     ret = trace_seq_printf(s, "%s: ", <call>);
  *     if (ret)
  *             ret = trace_seq_printf(s, <TP_printk> "\n");
- *     put_cpu();
  *     if (!ret)
  *             return TRACE_TYPE_PARTIAL_LINE;
  *
@@ -216,7 +214,7 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags,    \
        struct trace_seq *s = &iter->seq;                               \
        struct ftrace_raw_##call *field;                                \
        struct trace_entry *entry;                                      \
-       struct trace_seq *p;                                            \
+       struct trace_seq *p = &iter->tmp_seq;                           \
        int ret;                                                        \
                                                                        \
        event = container_of(trace_event, struct ftrace_event_call,     \
@@ -231,12 +229,10 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags,  \
                                                                        \
        field = (typeof(field))entry;                                   \
                                                                        \
-       p = &get_cpu_var(ftrace_event_seq);                             \
        trace_seq_init(p);                                              \
        ret = trace_seq_printf(s, "%s: ", event->name);                 \
        if (ret)                                                        \
                ret = trace_seq_printf(s, print);                       \
-       put_cpu();                                                      \
        if (!ret)                                                       \
                return TRACE_TYPE_PARTIAL_LINE;                         \
                                                                        \
@@ -255,7 +251,7 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags,    \
        struct trace_seq *s = &iter->seq;                               \
        struct ftrace_raw_##template *field;                            \
        struct trace_entry *entry;                                      \
-       struct trace_seq *p;                                            \
+       struct trace_seq *p = &iter->tmp_seq;                           \
        int ret;                                                        \
                                                                        \
        entry = iter->ent;                                              \
@@ -267,12 +263,10 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags,  \
                                                                        \
        field = (typeof(field))entry;                                   \
                                                                        \
-       p = &get_cpu_var(ftrace_event_seq);                             \
        trace_seq_init(p);                                              \
        ret = trace_seq_printf(s, "%s: ", #call);                       \
        if (ret)                                                        \
                ret = trace_seq_printf(s, print);                       \
-       put_cpu();                                                      \
        if (!ret)                                                       \
                return TRACE_TYPE_PARTIAL_LINE;                         \
                                                                        \
index 28d0615..3632ce8 100644 (file)
@@ -443,6 +443,7 @@ int ring_buffer_print_page_header(struct trace_seq *s)
  */
 struct ring_buffer_per_cpu {
        int                             cpu;
+       atomic_t                        record_disabled;
        struct ring_buffer              *buffer;
        spinlock_t                      reader_lock;    /* serialize readers */
        arch_spinlock_t                 lock;
@@ -462,7 +463,6 @@ struct ring_buffer_per_cpu {
        unsigned long                   read;
        u64                             write_stamp;
        u64                             read_stamp;
-       atomic_t                        record_disabled;
 };
 
 struct ring_buffer {
index c1752da..4b1122d 100644 (file)
@@ -344,7 +344,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
 /* trace_flags holds trace_options default values */
 unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
        TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
-       TRACE_ITER_GRAPH_TIME;
+       TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD;
 
 static int trace_stop_count;
 static DEFINE_SPINLOCK(tracing_start_lock);
@@ -428,6 +428,7 @@ static const char *trace_options[] = {
        "latency-format",
        "sleep-time",
        "graph-time",
+       "record-cmd",
        NULL
 };
 
@@ -659,6 +660,10 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
                return;
 
        WARN_ON_ONCE(!irqs_disabled());
+       if (!current_trace->use_max_tr) {
+               WARN_ON_ONCE(1);
+               return;
+       }
        arch_spin_lock(&ftrace_max_lock);
 
        tr->buffer = max_tr.buffer;
@@ -685,6 +690,11 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
                return;
 
        WARN_ON_ONCE(!irqs_disabled());
+       if (!current_trace->use_max_tr) {
+               WARN_ON_ONCE(1);
+               return;
+       }
+
        arch_spin_lock(&ftrace_max_lock);
 
        ftrace_disable_cpu();
@@ -729,7 +739,7 @@ __acquires(kernel_lock)
                return -1;
        }
 
-       if (strlen(type->name) > MAX_TRACER_SIZE) {
+       if (strlen(type->name) >= MAX_TRACER_SIZE) {
                pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE);
                return -1;
        }
@@ -2508,6 +2518,9 @@ static void set_tracer_flags(unsigned int mask, int enabled)
                trace_flags |= mask;
        else
                trace_flags &= ~mask;
+
+       if (mask == TRACE_ITER_RECORD_CMD)
+               trace_event_enable_cmd_record(enabled);
 }
 
 static ssize_t
@@ -2746,6 +2759,9 @@ static int tracing_resize_ring_buffer(unsigned long size)
        if (ret < 0)
                return ret;
 
+       if (!current_trace->use_max_tr)
+               goto out;
+
        ret = ring_buffer_resize(max_tr.buffer, size);
        if (ret < 0) {
                int r;
@@ -2773,11 +2789,14 @@ static int tracing_resize_ring_buffer(unsigned long size)
                return ret;
        }
 
+       max_tr.entries = size;
+ out:
        global_trace.entries = size;
 
        return ret;
 }
 
+
 /**
  * tracing_update_buffers - used by tracing facility to expand ring buffers
  *
@@ -2838,12 +2857,26 @@ static int tracing_set_tracer(const char *buf)
        trace_branch_disable();
        if (current_trace && current_trace->reset)
                current_trace->reset(tr);
-
+       if (current_trace && current_trace->use_max_tr) {
+               /*
+                * We don't free the ring buffer.  Instead, resize it because
+                * the max_tr ring buffer has some state (e.g. ring->clock) and
+                * we want to preserve it.
+                */
+               ring_buffer_resize(max_tr.buffer, 1);
+               max_tr.entries = 1;
+       }
        destroy_trace_option_files(topts);
 
        current_trace = t;
 
        topts = create_trace_option_files(current_trace);
+       if (current_trace->use_max_tr) {
+               ret = ring_buffer_resize(max_tr.buffer, global_trace.entries);
+               if (ret < 0)
+                       goto out;
+               max_tr.entries = global_trace.entries;
+       }
 
        if (t->init) {
                ret = tracer_init(t, tr);
@@ -3426,7 +3459,6 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
        }
 
        tracing_start();
-       max_tr.entries = global_trace.entries;
        mutex_unlock(&trace_types_lock);
 
        return cnt;
@@ -4531,16 +4563,14 @@ __init static int tracer_alloc_buffers(void)
 
 
 #ifdef CONFIG_TRACER_MAX_TRACE
-       max_tr.buffer = ring_buffer_alloc(ring_buf_size,
-                                            TRACE_BUFFER_FLAGS);
+       max_tr.buffer = ring_buffer_alloc(1, TRACE_BUFFER_FLAGS);
        if (!max_tr.buffer) {
                printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
                WARN_ON(1);
                ring_buffer_free(global_trace.buffer);
                goto out_free_cpumask;
        }
-       max_tr.entries = ring_buffer_size(max_tr.buffer);
-       WARN_ON(max_tr.entries != global_trace.entries);
+       max_tr.entries = 1;
 #endif
 
        /* Allocate the first page for all buffers */
index 638a588..d05c873 100644 (file)
@@ -274,6 +274,7 @@ struct tracer {
        struct tracer           *next;
        int                     print_max;
        struct tracer_flags     *flags;
+       int                     use_max_tr;
 };
 
 
@@ -581,6 +582,7 @@ enum trace_iterator_flags {
        TRACE_ITER_LATENCY_FMT          = 0x20000,
        TRACE_ITER_SLEEP_TIME           = 0x40000,
        TRACE_ITER_GRAPH_TIME           = 0x80000,
+       TRACE_ITER_RECORD_CMD           = 0x100000,
 };
 
 /*
@@ -713,6 +715,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
        return 0;
 }
 
+extern void trace_event_enable_cmd_record(bool enable);
+
 extern struct mutex event_mutex;
 extern struct list_head ftrace_events;
 
index e8e6043..09b4fa6 100644 (file)
@@ -170,6 +170,26 @@ int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type)
 }
 EXPORT_SYMBOL_GPL(ftrace_event_reg);
 
+void trace_event_enable_cmd_record(bool enable)
+{
+       struct ftrace_event_call *call;
+
+       mutex_lock(&event_mutex);
+       list_for_each_entry(call, &ftrace_events, list) {
+               if (!(call->flags & TRACE_EVENT_FL_ENABLED))
+                       continue;
+
+               if (enable) {
+                       tracing_start_cmdline_record();
+                       call->flags |= TRACE_EVENT_FL_RECORDED_CMD;
+               } else {
+                       tracing_stop_cmdline_record();
+                       call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD;
+               }
+       }
+       mutex_unlock(&event_mutex);
+}
+
 static int ftrace_event_enable_disable(struct ftrace_event_call *call,
                                        int enable)
 {
@@ -179,13 +199,19 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call,
        case 0:
                if (call->flags & TRACE_EVENT_FL_ENABLED) {
                        call->flags &= ~TRACE_EVENT_FL_ENABLED;
-                       tracing_stop_cmdline_record();
+                       if (call->flags & TRACE_EVENT_FL_RECORDED_CMD) {
+                               tracing_stop_cmdline_record();
+                               call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD;
+                       }
                        call->class->reg(call, TRACE_REG_UNREGISTER);
                }
                break;
        case 1:
                if (!(call->flags & TRACE_EVENT_FL_ENABLED)) {
-                       tracing_start_cmdline_record();
+                       if (trace_flags & TRACE_ITER_RECORD_CMD) {
+                               tracing_start_cmdline_record();
+                               call->flags |= TRACE_EVENT_FL_RECORDED_CMD;
+                       }
                        ret = call->class->reg(call, TRACE_REG_REGISTER);
                        if (ret) {
                                tracing_stop_cmdline_record();
index 6fd486e..73a6b06 100644 (file)
@@ -649,6 +649,7 @@ static struct tracer irqsoff_tracer __read_mostly =
 #endif
        .open           = irqsoff_trace_open,
        .close          = irqsoff_trace_close,
+       .use_max_tr     = 1,
 };
 # define register_irqsoff(trace) register_tracer(&trace)
 #else
@@ -681,6 +682,7 @@ static struct tracer preemptoff_tracer __read_mostly =
 #endif
        .open           = irqsoff_trace_open,
        .close          = irqsoff_trace_close,
+       .use_max_tr     = 1,
 };
 # define register_preemptoff(trace) register_tracer(&trace)
 #else
@@ -715,6 +717,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
 #endif
        .open           = irqsoff_trace_open,
        .close          = irqsoff_trace_close,
+       .use_max_tr     = 1,
 };
 
 # define register_preemptirqsoff(trace) register_tracer(&trace)
index a46197b..02272ba 100644 (file)
@@ -16,9 +16,6 @@
 
 DECLARE_RWSEM(trace_event_mutex);
 
-DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq);
-EXPORT_PER_CPU_SYMBOL(ftrace_event_seq);
-
 static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
 
 static int next_event_type = __TRACE_LAST_TYPE + 1;
index c9fd5bd..4086eae 100644 (file)
@@ -382,6 +382,7 @@ static struct tracer wakeup_tracer __read_mostly =
 #ifdef CONFIG_FTRACE_SELFTEST
        .selftest    = trace_selftest_startup_wakeup,
 #endif
+       .use_max_tr     = 1,
 };
 
 static struct tracer wakeup_rt_tracer __read_mostly =
@@ -396,6 +397,7 @@ static struct tracer wakeup_rt_tracer __read_mostly =
 #ifdef CONFIG_FTRACE_SELFTEST
        .selftest    = trace_selftest_startup_wakeup,
 #endif
+       .use_max_tr     = 1,
 };
 
 __init static int init_wakeup_tracer(void)
index f3c9c0a..0171060 100755 (executable)
@@ -326,7 +326,7 @@ if ($arch eq "x86_64") {
     #                    14: R_MIPS_NONE *ABS*
     #   18:   00020021        nop
     if ($is_module eq "0") {
-           $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\s_mcount\$";
+           $mcount_regex = "^\\s*([0-9a-fA-F]+): R_MIPS_26\\s+_mcount\$";
     } else {
            $mcount_regex = "^\\s*([0-9a-fA-F]+): R_MIPS_HI16\\s+_mcount\$";
     }