Merge tag 'trace-v6.1' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux...
author	Linus Torvalds <torvalds@linux-foundation.org>
Mon, 10 Oct 2022 19:20:55 +0000 (12:20 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
Mon, 10 Oct 2022 19:20:55 +0000 (12:20 -0700)
Pull tracing updates from Steven Rostedt:
 "Major changes:

   - Changed location of tracing repo from personal git repo to:
     git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace.git

   - Added Masami Hiramatsu as co-maintainer

   - Updated MAINTAINERS file to separate out FTRACE as it is more than
     just TRACING.

  Minor changes:

   - Added Mark Rutland as FTRACE reviewer

   - Updated user_events to put it on its way to having the BROKEN tag
     removed. The changes should now be acceptable, but they will run
     through a release cycle first; hopefully the BROKEN tag can be
     removed next release.

   - Added filtering to eprobes

   - Added a delta time to the benchmark trace event

   - Have the histogram and filter callbacks called via a switch
     statement instead of indirect function pointers. This speeds them
     up by avoiding retpolines.

   - Add a way to wake up ring buffer waiters that are waiting for the
     ring buffer to fill up to its watermark.

   - New ioctl() on the trace_pipe_raw file to wake up ring buffer
     waiters (a user-space sketch follows the quoted message below).

   - Wake up waiters when the ring buffer is disabled. A reader may be
     blocked waiting on the ring buffer, and if the ring buffer is
     disabled while it waits, it should wake up instead of staying
     blocked.

  Fixes:

   - Allow splice to read partially read ring buffer pages. This fixes
     a case where splice would never move forward.

   - Fix inverted compare that made the "shortest" ring buffer wait
     queue actually the longest.

   - Fix a race in the ring buffer between the reader and a writer
     resetting a page as it moves on to another page.

   - Fix ftrace accounting bug when function hooks are added at boot up
     before the weak functions are set to "disabled".

   - Fix a bug that freed a user-allocated snapshot buffer when
     enabling a tracer.

   - Fix possible recursive locks in the osnoise tracer

   - Fix recursive locking of direct_mutex when modifying direct
     function callers

   - Other minor clean ups and fixes"
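
As referenced in the trace_pipe_raw bullet above, here is a hedged user-space sketch of the new wake-up ioctl(). The kernel side, tracing_buffers_ioctl() in the diff below, rejects any non-zero cmd with -ENOIOCTLCMD and wakes all ring buffer waiters on cmd 0; the tracefs path used here is the common per-CPU layout and may be mounted elsewhere on a given system:

  #include <fcntl.h>
  #include <sys/ioctl.h>
  #include <unistd.h>

  /* Wake every reader blocked on a per-CPU trace_pipe_raw file. */
  static int wake_trace_pipe_raw_waiters(const char *path)
  {
          int fd = open(path, O_RDONLY);
          int ret;

          if (fd < 0)
                  return -1;

          ret = ioctl(fd, 0);     /* cmd 0: force waiters awake */
          close(fd);
          return ret;
  }

  /* e.g. wake_trace_pipe_raw_waiters("/sys/kernel/tracing/per_cpu/cpu0/trace_pipe_raw") */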

* tag 'trace-v6.1' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace: (44 commits)
  ftrace: Create separate entry in MAINTAINERS for function hooks
  tracing: Update MAINTAINERS to reflect new tracing git repo
  tracing: Do not free snapshot if tracer is on cmdline
  ftrace: Still disable enabled records marked as disabled
  tracing/user_events: Move pages/locks into groups to prepare for namespaces
  tracing: Add Masami Hiramatsu as co-maintainer
  tracing: Remove unused variable 'dups'
  MAINTAINERS: add myself as a tracing reviewer
  ring-buffer: Fix race between reset page and reading page
  tracing/user_events: Update ABI documentation to align to bits vs bytes
  tracing/user_events: Use bits vs bytes for enabled status page data
  tracing/user_events: Use refcount instead of atomic for ref tracking
  tracing/user_events: Ensure user provided strings are safely formatted
  tracing/user_events: Use WRITE instead of READ for io vector import
  tracing/user_events: Use NULL for strstr checks
  tracing: Fix spelling mistake "preapre" -> "prepare"
  tracing: Wake up waiters when tracing is disabled
  tracing: Add ioctl() to force ring buffer waiters to wake up
  tracing: Wake up ring buffer waiters on closing of the file
  ring-buffer: Add ring_buffer_wake_waiters()
  ...

34 files changed:
Documentation/trace/user_events.rst
MAINTAINERS
arch/x86/include/asm/ftrace.h
arch/x86/include/asm/kprobes.h
arch/x86/kernel/kprobes/core.c
include/linux/ftrace.h
include/linux/ring_buffer.h
include/linux/sched.h
include/linux/trace_events.h
include/linux/user_events.h
kernel/trace/ftrace.c
kernel/trace/kprobe_event_gen_test.c
kernel/trace/ring_buffer.c
kernel/trace/rv/monitors/wip/wip.c
kernel/trace/rv/monitors/wwnr/wwnr.c
kernel/trace/trace.c
kernel/trace/trace.h
kernel/trace/trace_benchmark.c
kernel/trace/trace_benchmark.h
kernel/trace/trace_eprobe.c
kernel/trace/trace_events_filter.c
kernel/trace/trace_events_hist.c
kernel/trace/trace_events_user.c
kernel/trace/trace_osnoise.c
kernel/trace/trace_probe.h
kernel/trace/tracing_map.c
kernel/tracepoint.c
samples/user_events/example.c
tools/testing/selftests/ftrace/test.d/dynevent/eprobes_syntax_errors.tc [new file with mode: 0644]
tools/testing/selftests/user_events/ftrace_test.c
tools/testing/selftests/user_events/perf_test.c
tools/verification/dot2/dot2k_templates/main_global.c
tools/verification/dot2/dot2k_templates/main_per_cpu.c
tools/verification/dot2/dot2k_templates/main_per_task.c

index c180936..9f181f3 100644 (file)
@@ -20,14 +20,14 @@ dynamic_events is the same as the ioctl with the u: prefix applied.
 
 Typically programs will register a set of events that they wish to expose to
 tools that can read trace_events (such as ftrace and perf). The registration
-process gives back two ints to the program for each event. The first int is the
-status index. This index describes which byte in the
+process gives back two ints to the program for each event. The first int is
+the status bit. This describes which bit in little-endian format in the
 /sys/kernel/debug/tracing/user_events_status file represents this event. The
-second int is the write index. This index describes the data when a write() or
+second int is the write index which describes the data when a write() or
 writev() is called on the /sys/kernel/debug/tracing/user_events_data file.
 
-The structures referenced in this document are contained with the
-/include/uap/linux/user_events.h file in the source tree.
+The structures referenced in this document are contained within the
+/include/uapi/linux/user_events.h file in the source tree.
 
 **NOTE:** *Both user_events_status and user_events_data are under the tracefs
 filesystem and may be mounted at different paths than above.*
@@ -38,18 +38,18 @@ Registering within a user process is done via ioctl() out to the
 /sys/kernel/debug/tracing/user_events_data file. The command to issue is
 DIAG_IOCSREG.
 
-This command takes a struct user_reg as an argument::
+This command takes a packed struct user_reg as an argument::
 
   struct user_reg {
         u32 size;
         u64 name_args;
-        u32 status_index;
+        u32 status_bit;
         u32 write_index;
   };
 
 The struct user_reg requires two inputs, the first is the size of the structure
 to ensure forward and backward compatibility. The second is the command string
-to issue for registering. Upon success two outputs are set, the status index
+to issue for registering. Upon success two outputs are set, the status bit
 and the write index.
 
 User based events show up under tracefs like any other event under the
@@ -111,15 +111,56 @@ in realtime. This allows user programs to only incur the cost of the write() or
 writev() calls when something is actively attached to the event.
 
 User programs call mmap() on /sys/kernel/debug/tracing/user_events_status to
-check the status for each event that is registered. The byte to check in the
-file is given back after the register ioctl() via user_reg.status_index.
+check the status for each event that is registered. The bit to check in the
+file is given back after the register ioctl() via user_reg.status_bit. The bit
+is always in little-endian format. Programs can check if the bit is set either
+using a byte-wise index with a mask or a long-wise index with a little-endian
+mask.
+
 Currently the size of user_events_status is a single page, however, custom
 kernel configurations can change this size to allow more user based events. In
 all cases the size of the file is a multiple of a page size.
 
-For example, if the register ioctl() gives back a status_index of 3 you would
-check byte 3 of the returned mmap data to see if anything is attached to that
-event.
+For example, if the register ioctl() gives back a status_bit of 3 you would
+check byte 0 (3 / 8) of the returned mmap data and then AND the result with 8
+(1 << (3 % 8)) to see if anything is attached to that event.
+
+A byte-wise index check is performed as follows::
+
+  int index, mask;
+  char *status_page;
+
+  index = status_bit / 8;
+  mask = 1 << (status_bit % 8);
+
+  ...
+
+  if (status_page[index] & mask) {
+        /* Enabled */
+  }
+
+A long-wise index check is performed as follows::
+
+  #include <asm/bitsperlong.h>
+  #include <endian.h>
+
+  #if __BITS_PER_LONG == 64
+  #define endian_swap(x) htole64(x)
+  #else
+  #define endian_swap(x) htole32(x)
+  #endif
+
+  long index, mask, *status_page;
+
+  index = status_bit / __BITS_PER_LONG;
+  mask = 1L << (status_bit % __BITS_PER_LONG);
+  mask = endian_swap(mask);
+
+  ...
+
+  if (status_page[index] & mask) {
+        /* Enabled */
+  }
 
 Administrators can easily check the status of all registered events by reading
 the user_events_status file directly via a terminal. The output is as follows::
@@ -137,7 +178,7 @@ For example, on a system that has a single event the output looks like this::
 
   Active: 1
   Busy: 0
-  Max: 4096
+  Max: 32768
 
 If a user enables the user event via ftrace, the output would change to this::
 
@@ -145,21 +186,10 @@ If a user enables the user event via ftrace, the output would change to this::
 
   Active: 1
   Busy: 1
-  Max: 4096
-
-**NOTE:** *A status index of 0 will never be returned. This allows user
-programs to have an index that can be used on error cases.*
-
-Status Bits
-^^^^^^^^^^^
-The byte being checked will be non-zero if anything is attached. Programs can
-check specific bits in the byte to see what mechanism has been attached.
-
-The following values are defined to aid in checking what has been attached:
-
-**EVENT_STATUS_FTRACE** - Bit set if ftrace has been attached (Bit 0).
+  Max: 32768
 
-**EVENT_STATUS_PERF** - Bit set if perf has been attached (Bit 1).
+**NOTE:** *A status bit of 0 will never be returned. This allows user programs
+to have a bit that can be used on error cases.*
 
 Writing Data
 ------------
index 0dc4a76..bf93366 100644 (file)
@@ -8433,6 +8433,19 @@ L:       platform-driver-x86@vger.kernel.org
 S:     Maintained
 F:     drivers/platform/x86/fujitsu-tablet.c
 
+FUNCTION HOOKS (FTRACE)
+M:     Steven Rostedt <rostedt@goodmis.org>
+M:     Masami Hiramatsu <mhiramat@kernel.org>
+R:     Mark Rutland <mark.rutland@arm.com>
+S:     Maintained
+T:     git git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace.git
+F:     Documentation/trace/ftrace*
+F:     kernel/trace/ftrace*
+F:     kernel/trace/fgraph.c
+F:     arch/*/*/*/*ftrace*
+F:     arch/*/*/*ftrace*
+F:     include/*/ftrace.h
+
 FUNGIBLE ETHERNET DRIVERS
 M:     Dimitris Michailidis <dmichail@fungible.com>
 L:     netdev@vger.kernel.org
@@ -11422,7 +11435,7 @@ M:      Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
 M:     "David S. Miller" <davem@davemloft.net>
 M:     Masami Hiramatsu <mhiramat@kernel.org>
 S:     Maintained
-T:     git git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-trace.git
+T:     git git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace.git
 F:     Documentation/trace/kprobes.rst
 F:     include/asm-generic/kprobes.h
 F:     include/linux/kprobes.h
@@ -20771,14 +20784,11 @@ F:    drivers/hwmon/pmbus/tps546d24.c
 
 TRACING
 M:     Steven Rostedt <rostedt@goodmis.org>
-M:     Ingo Molnar <mingo@redhat.com>
+M:     Masami Hiramatsu <mhiramat@kernel.org>
 S:     Maintained
-T:     git git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-trace.git
-F:     Documentation/trace/ftrace.rst
-F:     arch/*/*/*/*ftrace*
-F:     arch/*/*/*ftrace*
+T:     git git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace.git
+F:     Documentation/trace/*
 F:     fs/tracefs/
-F:     include/*/ftrace.h
 F:     include/linux/trace*.h
 F:     include/trace/
 F:     kernel/trace/
@@ -20787,7 +20797,7 @@ F:      tools/testing/selftests/ftrace/
 
 TRACING MMIO ACCESSES (MMIOTRACE)
 M:     Steven Rostedt <rostedt@goodmis.org>
-M:     Ingo Molnar <mingo@kernel.org>
+M:     Masami Hiramatsu <mhiramat@kernel.org>
 R:     Karol Herbst <karolherbst@gmail.com>
 R:     Pekka Paalanen <ppaalanen@gmail.com>
 L:     linux-kernel@vger.kernel.org
index b5ef474..908d99b 100644 (file)
@@ -23,7 +23,6 @@
 #define HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
 
 #ifndef __ASSEMBLY__
-extern atomic_t modifying_ftrace_code;
 extern void __fentry__(void);
 
 static inline unsigned long ftrace_call_adjust(unsigned long addr)
index 71ea2ea..a2e9317 100644 (file)
@@ -50,8 +50,6 @@ extern const int kretprobe_blacklist_size;
 
 void arch_remove_kprobe(struct kprobe *p);
 
-extern void arch_kprobe_override_function(struct pt_regs *regs);
-
 /* Architecture specific copy of original instruction*/
 struct arch_specific_insn {
        /* copy of the original instruction */
index 4c3c27b..eb8bc82 100644 (file)
@@ -59,8 +59,6 @@
 DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
 DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
 
-#define stack_addr(regs) ((unsigned long *)regs->sp)
-
 #define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
        (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) |   \
          (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) |   \
index 0b61371..62557d4 100644 (file)
@@ -1122,47 +1122,6 @@ static inline void unpause_graph_tracing(void) { }
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 
 #ifdef CONFIG_TRACING
-
-/* flags for current->trace */
-enum {
-       TSK_TRACE_FL_TRACE_BIT  = 0,
-       TSK_TRACE_FL_GRAPH_BIT  = 1,
-};
-enum {
-       TSK_TRACE_FL_TRACE      = 1 << TSK_TRACE_FL_TRACE_BIT,
-       TSK_TRACE_FL_GRAPH      = 1 << TSK_TRACE_FL_GRAPH_BIT,
-};
-
-static inline void set_tsk_trace_trace(struct task_struct *tsk)
-{
-       set_bit(TSK_TRACE_FL_TRACE_BIT, &tsk->trace);
-}
-
-static inline void clear_tsk_trace_trace(struct task_struct *tsk)
-{
-       clear_bit(TSK_TRACE_FL_TRACE_BIT, &tsk->trace);
-}
-
-static inline int test_tsk_trace_trace(struct task_struct *tsk)
-{
-       return tsk->trace & TSK_TRACE_FL_TRACE;
-}
-
-static inline void set_tsk_trace_graph(struct task_struct *tsk)
-{
-       set_bit(TSK_TRACE_FL_GRAPH_BIT, &tsk->trace);
-}
-
-static inline void clear_tsk_trace_graph(struct task_struct *tsk)
-{
-       clear_bit(TSK_TRACE_FL_GRAPH_BIT, &tsk->trace);
-}
-
-static inline int test_tsk_trace_graph(struct task_struct *tsk)
-{
-       return tsk->trace & TSK_TRACE_FL_GRAPH;
-}
-
 enum ftrace_dump_mode;
 
 extern enum ftrace_dump_mode ftrace_dump_on_oops;
index dac53fd..2504df9 100644 (file)
@@ -101,7 +101,7 @@ __ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *k
 int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full);
 __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
                          struct file *filp, poll_table *poll_table);
-
+void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu);
 
 #define RING_BUFFER_ALL_CPUS -1
 
index 68ec44d..11a3178 100644 (file)
@@ -1390,9 +1390,6 @@ struct task_struct {
 #endif
 
 #ifdef CONFIG_TRACING
-       /* State flags for use by tracers: */
-       unsigned long                   trace;
-
        /* Bitmask and counter of trace recursion: */
        unsigned long                   trace_recursion;
 #endif /* CONFIG_TRACING */
index 8401dec..20749bd 100644 (file)
@@ -92,6 +92,7 @@ struct trace_iterator {
        unsigned int            temp_size;
        char                    *fmt;   /* modified format holder */
        unsigned int            fmt_size;
+       long                    wait_index;
 
        /* trace_seq for __print_flags() and __print_symbolic() etc. */
        struct trace_seq        tmp_seq;
index 736e056..592a3fb 100644 (file)
 #define USER_EVENTS_SYSTEM "user_events"
 #define USER_EVENTS_PREFIX "u:"
 
-/* Bits 0-6 are for known probe types, Bit 7 is for unknown probes */
-#define EVENT_BIT_FTRACE 0
-#define EVENT_BIT_PERF 1
-#define EVENT_BIT_OTHER 7
-
-#define EVENT_STATUS_FTRACE (1 << EVENT_BIT_FTRACE)
-#define EVENT_STATUS_PERF (1 << EVENT_BIT_PERF)
-#define EVENT_STATUS_OTHER (1 << EVENT_BIT_OTHER)
-
 /* Create dynamic location entry within a 32-bit value */
 #define DYN_LOC(offset, size) ((size) << 16 | (offset))
 
@@ -45,12 +36,12 @@ struct user_reg {
        /* Input: Pointer to string with event name, description and flags */
        __u64 name_args;
 
-       /* Output: Byte index of the event within the status page */
-       __u32 status_index;
+       /* Output: Bitwise index of the event within the status page */
+       __u32 status_bit;
 
        /* Output: Index of the event to use when writing data */
        __u32 write_index;
-};
+} __attribute__((__packed__));
 
 #define DIAG_IOC_MAGIC '*'
 
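To make the registration ABI above concrete, here is a minimal user-space sketch modeled on samples/user_events/example.c (updated in this merge); the event string is illustrative and tracefs may be mounted at a different path:

  #include <fcntl.h>
  #include <stdio.h>
  #include <sys/ioctl.h>
  #include <linux/user_events.h>

  int main(void)
  {
          struct user_reg reg = { 0 };
          int fd = open("/sys/kernel/debug/tracing/user_events_data", O_RDWR);

          if (fd < 0)
                  return 1;

          reg.size = sizeof(reg);                  /* forward/backward compat */
          reg.name_args = (__u64)"test u32 count"; /* illustrative event */

          if (ioctl(fd, DIAG_IOCSREG, &reg) == -1)
                  return 1;

          /* Outputs: which bit to test in the mmap()ed status page, and
             which index to use when write()ing event data. */
          printf("status_bit=%u write_index=%u\n",
                 reg.status_bit, reg.write_index);
          return 0;
  }
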
index 447d2e2..02e18c4 100644 (file)
@@ -1644,6 +1644,18 @@ ftrace_find_tramp_ops_any_other(struct dyn_ftrace *rec, struct ftrace_ops *op_ex
 static struct ftrace_ops *
 ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, struct ftrace_ops *ops);
 
+static bool skip_record(struct dyn_ftrace *rec)
+{
+       /*
+        * At boot up, weak functions are set to disable. Function tracing
+        * can be enabled before they are, and they still need to be disabled now.
+        * If the record is disabled, still continue if it is marked as already
+        * enabled (this is needed to keep the accounting working).
+        */
+       return rec->flags & FTRACE_FL_DISABLED &&
+               !(rec->flags & FTRACE_FL_ENABLED);
+}
+
 static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
                                     int filter_hash,
                                     bool inc)
@@ -1693,7 +1705,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
                int in_hash = 0;
                int match = 0;
 
-               if (rec->flags & FTRACE_FL_DISABLED)
+               if (skip_record(rec))
                        continue;
 
                if (all) {
@@ -2126,7 +2138,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
 
        ftrace_bug_type = FTRACE_BUG_UNKNOWN;
 
-       if (rec->flags & FTRACE_FL_DISABLED)
+       if (skip_record(rec))
                return FTRACE_UPDATE_IGNORE;
 
        /*
@@ -2241,7 +2253,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
        if (update) {
                /* If there's no more users, clear all flags */
                if (!ftrace_rec_count(rec))
-                       rec->flags = 0;
+                       rec->flags &= FTRACE_FL_DISABLED;
                else
                        /*
                         * Just disable the record, but keep the ops TRAMP
@@ -2634,7 +2646,7 @@ void __weak ftrace_replace_code(int mod_flags)
 
        do_for_each_ftrace_rec(pg, rec) {
 
-               if (rec->flags & FTRACE_FL_DISABLED)
+               if (skip_record(rec))
                        continue;
 
                failed = __ftrace_replace_code(rec, enable);
@@ -5427,6 +5439,8 @@ static struct ftrace_ops stub_ops = {
  * it is safe to modify the ftrace record, where it should be
  * currently calling @old_addr directly, to call @new_addr.
  *
+ * This is called with direct_mutex locked.
+ *
  * Safety checks should be made to make sure that the code at
  * @rec->ip is currently calling @old_addr. And this must
  * also update entry->direct to @new_addr.
@@ -5439,6 +5453,8 @@ int __weak ftrace_modify_direct_caller(struct ftrace_func_entry *entry,
        unsigned long ip = rec->ip;
        int ret;
 
+       lockdep_assert_held(&direct_mutex);
+
        /*
         * The ftrace_lock was used to determine if the record
         * had more than one registered user to it. If it did,
@@ -5461,7 +5477,7 @@ int __weak ftrace_modify_direct_caller(struct ftrace_func_entry *entry,
        if (ret)
                goto out_lock;
 
-       ret = register_ftrace_function(&stub_ops);
+       ret = register_ftrace_function_nolock(&stub_ops);
        if (ret) {
                ftrace_set_filter_ip(&stub_ops, ip, 1, 0);
                goto out_lock;
@@ -6081,8 +6097,12 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
 
                if (filter_hash) {
                        orig_hash = &iter->ops->func_hash->filter_hash;
-                       if (iter->tr && !list_empty(&iter->tr->mod_trace))
-                               iter->hash->flags |= FTRACE_HASH_FL_MOD;
+                       if (iter->tr) {
+                               if (list_empty(&iter->tr->mod_trace))
+                                       iter->hash->flags &= ~FTRACE_HASH_FL_MOD;
+                               else
+                                       iter->hash->flags |= FTRACE_HASH_FL_MOD;
+                       }
                } else
                        orig_hash = &iter->ops->func_hash->notrace_hash;
 
index 18b0f1c..80e04a1 100644 (file)
 static struct trace_event_file *gen_kprobe_test;
 static struct trace_event_file *gen_kretprobe_test;
 
+#define KPROBE_GEN_TEST_FUNC   "do_sys_open"
+
+/* X86 */
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_32)
+#define KPROBE_GEN_TEST_ARG0   "dfd=%ax"
+#define KPROBE_GEN_TEST_ARG1   "filename=%dx"
+#define KPROBE_GEN_TEST_ARG2   "flags=%cx"
+#define KPROBE_GEN_TEST_ARG3   "mode=+4($stack)"
+
+/* ARM64 */
+#elif defined(CONFIG_ARM64)
+#define KPROBE_GEN_TEST_ARG0   "dfd=%x0"
+#define KPROBE_GEN_TEST_ARG1   "filename=%x1"
+#define KPROBE_GEN_TEST_ARG2   "flags=%x2"
+#define KPROBE_GEN_TEST_ARG3   "mode=%x3"
+
+/* ARM */
+#elif defined(CONFIG_ARM)
+#define KPROBE_GEN_TEST_ARG0   "dfd=%r0"
+#define KPROBE_GEN_TEST_ARG1   "filename=%r1"
+#define KPROBE_GEN_TEST_ARG2   "flags=%r2"
+#define KPROBE_GEN_TEST_ARG3   "mode=%r3"
+
+/* RISCV */
+#elif defined(CONFIG_RISCV)
+#define KPROBE_GEN_TEST_ARG0   "dfd=%a0"
+#define KPROBE_GEN_TEST_ARG1   "filename=%a1"
+#define KPROBE_GEN_TEST_ARG2   "flags=%a2"
+#define KPROBE_GEN_TEST_ARG3   "mode=%a3"
+
+/* others */
+#else
+#define KPROBE_GEN_TEST_ARG0   NULL
+#define KPROBE_GEN_TEST_ARG1   NULL
+#define KPROBE_GEN_TEST_ARG2   NULL
+#define KPROBE_GEN_TEST_ARG3   NULL
+#endif
+
+
 /*
  * Test to make sure we can create a kprobe event, then add more
  * fields.
@@ -58,14 +97,14 @@ static int __init test_gen_kprobe_cmd(void)
         * fields.
         */
        ret = kprobe_event_gen_cmd_start(&cmd, "gen_kprobe_test",
-                                        "do_sys_open",
-                                        "dfd=%ax", "filename=%dx");
+                                        KPROBE_GEN_TEST_FUNC,
+                                        KPROBE_GEN_TEST_ARG0, KPROBE_GEN_TEST_ARG1);
        if (ret)
                goto free;
 
        /* Use kprobe_event_add_fields to add the rest of the fields */
 
-       ret = kprobe_event_add_fields(&cmd, "flags=%cx", "mode=+4($stack)");
+       ret = kprobe_event_add_fields(&cmd, KPROBE_GEN_TEST_ARG2, KPROBE_GEN_TEST_ARG3);
        if (ret)
                goto free;
 
@@ -128,7 +167,7 @@ static int __init test_gen_kretprobe_cmd(void)
         * Define the kretprobe event.
         */
        ret = kretprobe_event_gen_cmd_start(&cmd, "gen_kretprobe_test",
-                                           "do_sys_open",
+                                           KPROBE_GEN_TEST_FUNC,
                                            "$retval");
        if (ret)
                goto free;
@@ -206,7 +245,7 @@ static void __exit kprobe_event_gen_test_exit(void)
        WARN_ON(kprobe_event_delete("gen_kprobe_test"));
 
        /* Disable the event or you can't remove it */
-       WARN_ON(trace_array_set_clr_event(gen_kprobe_test->tr,
+       WARN_ON(trace_array_set_clr_event(gen_kretprobe_test->tr,
                                          "kprobes",
                                          "gen_kretprobe_test", false));
 
index d59b6a3..c3f354c 100644 (file)
@@ -413,6 +413,7 @@ struct rb_irq_work {
        struct irq_work                 work;
        wait_queue_head_t               waiters;
        wait_queue_head_t               full_waiters;
+       long                            wait_index;
        bool                            waiters_pending;
        bool                            full_waiters_pending;
        bool                            wakeup_full;
@@ -917,13 +918,45 @@ static void rb_wake_up_waiters(struct irq_work *work)
        struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work);
 
        wake_up_all(&rbwork->waiters);
-       if (rbwork->wakeup_full) {
+       if (rbwork->full_waiters_pending || rbwork->wakeup_full) {
                rbwork->wakeup_full = false;
+               rbwork->full_waiters_pending = false;
                wake_up_all(&rbwork->full_waiters);
        }
 }
 
 /**
+ * ring_buffer_wake_waiters - wake up any waiters on this ring buffer
+ * @buffer: The ring buffer to wake waiters on
+ *
+ * In the case of a file that represents a ring buffer is closing,
+ * it is prudent to wake up any waiters that are on this.
+ */
+void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu)
+{
+       struct ring_buffer_per_cpu *cpu_buffer;
+       struct rb_irq_work *rbwork;
+
+       if (cpu == RING_BUFFER_ALL_CPUS) {
+
+               /* Wake up individual ones too. One level recursion */
+               for_each_buffer_cpu(buffer, cpu)
+                       ring_buffer_wake_waiters(buffer, cpu);
+
+               rbwork = &buffer->irq_work;
+       } else {
+               cpu_buffer = buffer->buffers[cpu];
+               rbwork = &cpu_buffer->irq_work;
+       }
+
+       rbwork->wait_index++;
+       /* make sure the waiters see the new index */
+       smp_wmb();
+
+       rb_wake_up_waiters(&rbwork->work);
+}
+
+/**
  * ring_buffer_wait - wait for input to the ring buffer
  * @buffer: buffer to wait on
  * @cpu: the cpu buffer to wait on
@@ -938,6 +971,7 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
        struct ring_buffer_per_cpu *cpu_buffer;
        DEFINE_WAIT(wait);
        struct rb_irq_work *work;
+       long wait_index;
        int ret = 0;
 
        /*
@@ -956,6 +990,7 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
                work = &cpu_buffer->irq_work;
        }
 
+       wait_index = READ_ONCE(work->wait_index);
 
        while (true) {
                if (full)
@@ -1011,7 +1046,7 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
                        nr_pages = cpu_buffer->nr_pages;
                        dirty = ring_buffer_nr_dirty_pages(buffer, cpu);
                        if (!cpu_buffer->shortest_full ||
-                           cpu_buffer->shortest_full < full)
+                           cpu_buffer->shortest_full > full)
                                cpu_buffer->shortest_full = full;
                        raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
                        if (!pagebusy &&
@@ -1020,6 +1055,11 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
                }
 
                schedule();
+
+               /* Make sure to see the new wait index */
+               smp_rmb();
+               if (wait_index != work->wait_index)
+                       break;
        }
 
        if (full)
@@ -2608,6 +2648,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
                /* Mark the rest of the page with padding */
                rb_event_set_padding(event);
 
+               /* Make sure the padding is visible before the write update */
+               smp_wmb();
+
                /* Set the write back to the previous setting */
                local_sub(length, &tail_page->write);
                return;
@@ -2619,6 +2662,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
        /* time delta must be non zero */
        event->time_delta = 1;
 
+       /* Make sure the padding is visible before the tail_page->write update */
+       smp_wmb();
+
        /* Set write to end of buffer */
        length = (tail + length) - BUF_PAGE_SIZE;
        local_sub(length, &tail_page->write);
@@ -4587,6 +4633,33 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
        arch_spin_unlock(&cpu_buffer->lock);
        local_irq_restore(flags);
 
+       /*
+        * The writer has preempt disable, wait for it. But not forever
+        * Although, 1 second is pretty much "forever"
+        */
+#define USECS_WAIT     1000000
+        for (nr_loops = 0; nr_loops < USECS_WAIT; nr_loops++) {
+               /* If the write is past the end of page, a writer is still updating it */
+               if (likely(!reader || rb_page_write(reader) <= BUF_PAGE_SIZE))
+                       break;
+
+               udelay(1);
+
+               /* Get the latest version of the reader write value */
+               smp_rmb();
+       }
+
+       /* The writer is not moving forward? Something is wrong */
+       if (RB_WARN_ON(cpu_buffer, nr_loops == USECS_WAIT))
+               reader = NULL;
+
+       /*
+        * Make sure we see any padding after the write update
+        * (see rb_reset_tail())
+        */
+       smp_rmb();
+
+
        return reader;
 }
 
@@ -5616,7 +5689,15 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
                unsigned int pos = 0;
                unsigned int size;
 
-               if (full)
+               /*
+                * If a full page is expected, this can still be returned
+                * if there's been a previous partial read and the
+                * rest of the page can be read and the commit page is off
+                * the reader page.
+                */
+               if (full &&
+                   (!read || (len < (commit - read)) ||
+                    cpu_buffer->reader_page == cpu_buffer->commit_page))
                        goto out_unlock;
 
                if (len > (commit - read))
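
The wait_index handshake added above is easier to see in isolation. Below is a user-space analogue (a sketch under assumptions, not kernel code) that substitutes a mutex/condvar for the kernel's wait queues and memory barriers: the waker bumps the index and broadcasts, and a waiter that observes the index change gives up even if its data condition never became true:

  #include <pthread.h>
  #include <stdbool.h>

  static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
  static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
  static long wait_index;
  static bool data_ready;

  /* Waiter: returns true if woken for data, false if forcibly woken. */
  static bool wait_for_data(void)
  {
          bool ok;
          long seen;

          pthread_mutex_lock(&lock);
          seen = wait_index;              /* snapshot before sleeping */
          while (!data_ready && wait_index == seen)
                  pthread_cond_wait(&cond, &lock);
          ok = data_ready;
          pthread_mutex_unlock(&lock);
          return ok;
  }

  /* Waker: analogue of ring_buffer_wake_waiters(). */
  static void wake_all_waiters(void)
  {
          pthread_mutex_lock(&lock);
          wait_index++;                   /* invalidate every snapshot */
          pthread_cond_broadcast(&cond);
          pthread_mutex_unlock(&lock);
  }
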
index 83cace5..b2b49a2 100644 (file)
@@ -16,7 +16,7 @@
 
 #include "wip.h"
 
-struct rv_monitor rv_wip;
+static struct rv_monitor rv_wip;
 DECLARE_DA_MON_PER_CPU(wip, unsigned char);
 
 static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip)
@@ -60,7 +60,7 @@ static void disable_wip(void)
        da_monitor_destroy_wip();
 }
 
-struct rv_monitor rv_wip = {
+static struct rv_monitor rv_wip = {
        .name = "wip",
        .description = "wakeup in preemptive per-cpu testing monitor.",
        .enable = enable_wip,
@@ -69,13 +69,13 @@ struct rv_monitor rv_wip = {
        .enabled = 0,
 };
 
-static int register_wip(void)
+static int __init register_wip(void)
 {
        rv_register_monitor(&rv_wip);
        return 0;
 }
 
-static void unregister_wip(void)
+static void __exit unregister_wip(void)
 {
        rv_unregister_monitor(&rv_wip);
 }
index 599225d..0e43dd2 100644 (file)
@@ -15,7 +15,7 @@
 
 #include "wwnr.h"
 
-struct rv_monitor rv_wwnr;
+static struct rv_monitor rv_wwnr;
 DECLARE_DA_MON_PER_TASK(wwnr, unsigned char);
 
 static void handle_switch(void *data, bool preempt, struct task_struct *p,
@@ -59,7 +59,7 @@ static void disable_wwnr(void)
        da_monitor_destroy_wwnr();
 }
 
-struct rv_monitor rv_wwnr = {
+static struct rv_monitor rv_wwnr = {
        .name = "wwnr",
        .description = "wakeup while not running per-task testing model.",
        .enable = enable_wwnr,
@@ -68,13 +68,13 @@ struct rv_monitor rv_wwnr = {
        .enabled = 0,
 };
 
-static int register_wwnr(void)
+static int __init register_wwnr(void)
 {
        rv_register_monitor(&rv_wwnr);
        return 0;
 }
 
-static void unregister_wwnr(void)
+static void __exit unregister_wwnr(void)
 {
        rv_unregister_monitor(&rv_wwnr);
 }
index d300527..47a44b0 100644 (file)
@@ -1193,12 +1193,14 @@ void *tracing_cond_snapshot_data(struct trace_array *tr)
 {
        void *cond_data = NULL;
 
+       local_irq_disable();
        arch_spin_lock(&tr->max_lock);
 
        if (tr->cond_snapshot)
                cond_data = tr->cond_snapshot->cond_data;
 
        arch_spin_unlock(&tr->max_lock);
+       local_irq_enable();
 
        return cond_data;
 }
@@ -1334,9 +1336,11 @@ int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data,
                goto fail_unlock;
        }
 
+       local_irq_disable();
        arch_spin_lock(&tr->max_lock);
        tr->cond_snapshot = cond_snapshot;
        arch_spin_unlock(&tr->max_lock);
+       local_irq_enable();
 
        mutex_unlock(&trace_types_lock);
 
@@ -1363,6 +1367,7 @@ int tracing_snapshot_cond_disable(struct trace_array *tr)
 {
        int ret = 0;
 
+       local_irq_disable();
        arch_spin_lock(&tr->max_lock);
 
        if (!tr->cond_snapshot)
@@ -1373,6 +1378,7 @@ int tracing_snapshot_cond_disable(struct trace_array *tr)
        }
 
        arch_spin_unlock(&tr->max_lock);
+       local_irq_enable();
 
        return ret;
 }
@@ -2200,6 +2206,11 @@ static size_t tgid_map_max;
 
 #define SAVED_CMDLINES_DEFAULT 128
 #define NO_CMDLINE_MAP UINT_MAX
+/*
+ * Preemption must be disabled before acquiring trace_cmdline_lock.
+ * The various trace_arrays' max_lock must be acquired in a context
+ * where interrupt is disabled.
+ */
 static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
 struct saved_cmdlines_buffer {
        unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
@@ -2412,7 +2423,11 @@ static int trace_save_cmdline(struct task_struct *tsk)
         * the lock, but we also don't want to spin
         * nor do we want to disable interrupts,
         * so if we miss here, then better luck next time.
+        *
+        * This is called within the scheduler and wake up, so interrupts
+        * had better been disabled and run queue lock been held.
         */
+       lockdep_assert_preemption_disabled();
        if (!arch_spin_trylock(&trace_cmdline_lock))
                return 0;
 
@@ -5890,9 +5905,11 @@ tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf,
        char buf[64];
        int r;
 
+       preempt_disable();
        arch_spin_lock(&trace_cmdline_lock);
        r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num);
        arch_spin_unlock(&trace_cmdline_lock);
+       preempt_enable();
 
        return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
 }
@@ -5917,10 +5934,12 @@ static int tracing_resize_saved_cmdlines(unsigned int val)
                return -ENOMEM;
        }
 
+       preempt_disable();
        arch_spin_lock(&trace_cmdline_lock);
        savedcmd_temp = savedcmd;
        savedcmd = s;
        arch_spin_unlock(&trace_cmdline_lock);
+       preempt_enable();
        free_saved_cmdlines_buffer(savedcmd_temp);
 
        return 0;
@@ -6373,10 +6392,12 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
 
 #ifdef CONFIG_TRACER_SNAPSHOT
        if (t->use_max_tr) {
+               local_irq_disable();
                arch_spin_lock(&tr->max_lock);
                if (tr->cond_snapshot)
                        ret = -EBUSY;
                arch_spin_unlock(&tr->max_lock);
+               local_irq_enable();
                if (ret)
                        goto out;
        }
@@ -6407,12 +6428,12 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
        if (tr->current_trace->reset)
                tr->current_trace->reset(tr);
 
+#ifdef CONFIG_TRACER_MAX_TRACE
+       had_max_tr = tr->current_trace->use_max_tr;
+
        /* Current trace needs to be nop_trace before synchronize_rcu */
        tr->current_trace = &nop_trace;
 
-#ifdef CONFIG_TRACER_MAX_TRACE
-       had_max_tr = tr->allocated_snapshot;
-
        if (had_max_tr && !t->use_max_tr) {
                /*
                 * We need to make sure that the update_max_tr sees that
@@ -6425,11 +6446,13 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
                free_snapshot(tr);
        }
 
-       if (t->use_max_tr && !had_max_tr) {
+       if (t->use_max_tr && !tr->allocated_snapshot) {
                ret = tracing_alloc_snapshot_instance(tr);
                if (ret < 0)
                        goto out;
        }
+#else
+       tr->current_trace = &nop_trace;
 #endif
 
        if (t->init) {
@@ -7436,10 +7459,12 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
                goto out;
        }
 
+       local_irq_disable();
        arch_spin_lock(&tr->max_lock);
        if (tr->cond_snapshot)
                ret = -EBUSY;
        arch_spin_unlock(&tr->max_lock);
+       local_irq_enable();
        if (ret)
                goto out;
 
@@ -8137,6 +8162,12 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)
 
        __trace_array_put(iter->tr);
 
+       iter->wait_index++;
+       /* Make sure the waiters see the new wait_index */
+       smp_wmb();
+
+       ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file);
+
        if (info->spare)
                ring_buffer_free_read_page(iter->array_buffer->buffer,
                                           info->spare_cpu, info->spare);
@@ -8290,6 +8321,8 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 
        /* did we read anything? */
        if (!spd.nr_pages) {
+               long wait_index;
+
                if (ret)
                        goto out;
 
@@ -8297,10 +8330,21 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
                if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK))
                        goto out;
 
+               wait_index = READ_ONCE(iter->wait_index);
+
                ret = wait_on_pipe(iter, iter->tr->buffer_percent);
                if (ret)
                        goto out;
 
+               /* No need to wait after waking up when tracing is off */
+               if (!tracer_tracing_is_on(iter->tr))
+                       goto out;
+
+               /* Make sure we see the new wait_index */
+               smp_rmb();
+               if (wait_index != iter->wait_index)
+                       goto out;
+
                goto again;
        }
 
@@ -8311,12 +8355,34 @@ out:
        return ret;
 }
 
+/* An ioctl call with cmd 0 to the ring buffer file will wake up all waiters */
+static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+       struct ftrace_buffer_info *info = file->private_data;
+       struct trace_iterator *iter = &info->iter;
+
+       if (cmd)
+               return -ENOIOCTLCMD;
+
+       mutex_lock(&trace_types_lock);
+
+       iter->wait_index++;
+       /* Make sure the waiters see the new wait_index */
+       smp_wmb();
+
+       ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file);
+
+       mutex_unlock(&trace_types_lock);
+       return 0;
+}
+
 static const struct file_operations tracing_buffers_fops = {
        .open           = tracing_buffers_open,
        .read           = tracing_buffers_read,
        .poll           = tracing_buffers_poll,
        .release        = tracing_buffers_release,
        .splice_read    = tracing_buffers_splice_read,
+       .unlocked_ioctl = tracing_buffers_ioctl,
        .llseek         = no_llseek,
 };
 
@@ -9005,6 +9071,8 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
                        tracer_tracing_off(tr);
                        if (tr->current_trace->stop)
                                tr->current_trace->stop(tr);
+                       /* Wake up any waiters */
+                       ring_buffer_wake_waiters(buffer, RING_BUFFER_ALL_CPUS);
                }
                mutex_unlock(&trace_types_lock);
        }
@@ -10091,7 +10159,7 @@ __init static int tracer_alloc_buffers(void)
         * buffer. The memory will be removed once the "instance" is removed.
         */
        ret = cpuhp_setup_state_multi(CPUHP_TRACE_RB_PREPARE,
-                                     "trace/RB:preapre", trace_rb_cpu_prepare,
+                                     "trace/RB:prepare", trace_rb_cpu_prepare,
                                      NULL);
        if (ret < 0)
                goto out_free_cpumask;
index 900e75d..54ee571 100644 (file)
@@ -1435,8 +1435,6 @@ event_trigger_unlock_commit(struct trace_event_file *file,
 struct filter_pred;
 struct regex;
 
-typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event);
-
 typedef int (*regex_match_func)(char *str, struct regex *r, int len);
 
 enum regex_type {
@@ -1455,17 +1453,6 @@ struct regex {
        regex_match_func        match;
 };
 
-struct filter_pred {
-       filter_pred_fn_t        fn;
-       u64                     val;
-       struct regex            regex;
-       unsigned short          *ops;
-       struct ftrace_event_field *field;
-       int                     offset;
-       int                     not;
-       int                     op;
-};
-
 static inline bool is_string_field(struct ftrace_event_field *field)
 {
        return field->filter_type == FILTER_DYN_STRING ||
index 801c2a7..54d5fa3 100644 (file)
@@ -51,7 +51,7 @@ static void trace_do_benchmark(void)
 
        local_irq_disable();
        start = trace_clock_local();
-       trace_benchmark_event(bm_str);
+       trace_benchmark_event(bm_str, bm_last);
        stop = trace_clock_local();
        local_irq_enable();
 
index 79e6fbe..c3e9106 100644 (file)
@@ -14,19 +14,21 @@ extern void trace_benchmark_unreg(void);
 
 TRACE_EVENT_FN(benchmark_event,
 
-       TP_PROTO(const char *str),
+       TP_PROTO(const char *str, u64 delta),
 
-       TP_ARGS(str),
+       TP_ARGS(str, delta),
 
        TP_STRUCT__entry(
                __array(        char,   str,    BENCHMARK_EVENT_STRLEN  )
+               __field(        u64,    delta)
        ),
 
        TP_fast_assign(
                memcpy(__entry->str, str, BENCHMARK_EVENT_STRLEN);
+               __entry->delta = delta;
        ),
 
-       TP_printk("%s", __entry->str),
+       TP_printk("%s delta=%llu", __entry->str, __entry->delta),
 
        trace_benchmark_reg, trace_benchmark_unreg
 );
index 1783e34..c08bde9 100644 (file)
@@ -26,6 +26,9 @@ struct trace_eprobe {
        /* tracepoint event */
        const char *event_name;
 
+       /* filter string for the tracepoint */
+       char *filter_str;
+
        struct trace_event_call *event;
 
        struct dyn_event        devent;
@@ -664,14 +667,15 @@ static struct event_trigger_data *
 new_eprobe_trigger(struct trace_eprobe *ep, struct trace_event_file *file)
 {
        struct event_trigger_data *trigger;
+       struct event_filter *filter = NULL;
        struct eprobe_data *edata;
+       int ret;
 
        edata = kzalloc(sizeof(*edata), GFP_KERNEL);
        trigger = kzalloc(sizeof(*trigger), GFP_KERNEL);
        if (!trigger || !edata) {
-               kfree(edata);
-               kfree(trigger);
-               return ERR_PTR(-ENOMEM);
+               ret = -ENOMEM;
+               goto error;
        }
 
        trigger->flags = EVENT_TRIGGER_FL_PROBE;
@@ -686,13 +690,25 @@ new_eprobe_trigger(struct trace_eprobe *ep, struct trace_event_file *file)
        trigger->cmd_ops = &event_trigger_cmd;
 
        INIT_LIST_HEAD(&trigger->list);
-       RCU_INIT_POINTER(trigger->filter, NULL);
+
+       if (ep->filter_str) {
+               ret = create_event_filter(file->tr, file->event_call,
+                                       ep->filter_str, false, &filter);
+               if (ret)
+                       goto error;
+       }
+       RCU_INIT_POINTER(trigger->filter, filter);
 
        edata->file = file;
        edata->ep = ep;
        trigger->private_data = edata;
 
        return trigger;
+error:
+       free_event_filter(filter);
+       kfree(edata);
+       kfree(trigger);
+       return ERR_PTR(ret);
 }
 
 static int enable_eprobe(struct trace_eprobe *ep,
@@ -726,6 +742,7 @@ static int disable_eprobe(struct trace_eprobe *ep,
 {
        struct event_trigger_data *trigger = NULL, *iter;
        struct trace_event_file *file;
+       struct event_filter *filter;
        struct eprobe_data *edata;
 
        file = find_event_file(tr, ep->event_system, ep->event_name);
@@ -752,6 +769,10 @@ static int disable_eprobe(struct trace_eprobe *ep,
        /* Make sure nothing is using the edata or trigger */
        tracepoint_synchronize_unregister();
 
+       filter = rcu_access_pointer(trigger->filter);
+
+       if (filter)
+               free_event_filter(filter);
        kfree(edata);
        kfree(trigger);
 
@@ -927,12 +948,62 @@ static int trace_eprobe_tp_update_arg(struct trace_eprobe *ep, const char *argv[
        return ret;
 }
 
+static int trace_eprobe_parse_filter(struct trace_eprobe *ep, int argc, const char *argv[])
+{
+       struct event_filter *dummy;
+       int i, ret, len = 0;
+       char *p;
+
+       if (argc == 0) {
+               trace_probe_log_err(0, NO_EP_FILTER);
+               return -EINVAL;
+       }
+
+       /* Recover the filter string */
+       for (i = 0; i < argc; i++)
+               len += strlen(argv[i]) + 1;
+
+       ep->filter_str = kzalloc(len, GFP_KERNEL);
+       if (!ep->filter_str)
+               return -ENOMEM;
+
+       p = ep->filter_str;
+       for (i = 0; i < argc; i++) {
+               ret = snprintf(p, len, "%s ", argv[i]);
+               if (ret < 0)
+                       goto error;
+               if (ret > len) {
+                       ret = -E2BIG;
+                       goto error;
+               }
+               p += ret;
+               len -= ret;
+       }
+       p[-1] = '\0';
+
+       /*
+        * Ensure the filter string can be parsed correctly. Note, this
+        * filter string is for the original event, not for the eprobe.
+        */
+       ret = create_event_filter(top_trace_array(), ep->event, ep->filter_str,
+                                 true, &dummy);
+       free_event_filter(dummy);
+       if (ret)
+               goto error;
+
+       return 0;
+error:
+       kfree(ep->filter_str);
+       ep->filter_str = NULL;
+       return ret;
+}
+
 static int __trace_eprobe_create(int argc, const char *argv[])
 {
        /*
         * Argument syntax:
-        *      e[:[GRP/][ENAME]] SYSTEM.EVENT [FETCHARGS]
-        * Fetch args:
+        *      e[:[GRP/][ENAME]] SYSTEM.EVENT [FETCHARGS] [if FILTER]
+        * Fetch args (no space):
         *  <name>=$<field>[:TYPE]
         */
        const char *event = NULL, *group = EPROBE_EVENT_SYSTEM;
@@ -942,8 +1013,8 @@ static int __trace_eprobe_create(int argc, const char *argv[])
        char buf1[MAX_EVENT_NAME_LEN];
        char buf2[MAX_EVENT_NAME_LEN];
        char gbuf[MAX_EVENT_NAME_LEN];
-       int ret = 0;
-       int i;
+       int ret = 0, filter_idx = 0;
+       int i, filter_cnt;
 
        if (argc < 2 || argv[0][0] != 'e')
                return -ECANCELED;
@@ -968,11 +1039,19 @@ static int __trace_eprobe_create(int argc, const char *argv[])
        }
 
        if (!event) {
-               strscpy(buf1, argv[1], MAX_EVENT_NAME_LEN);
-               sanitize_event_name(buf1);
+               strscpy(buf1, sys_event, MAX_EVENT_NAME_LEN);
                event = buf1;
        }
 
+       for (i = 2; i < argc; i++) {
+               if (!strcmp(argv[i], "if")) {
+                       filter_idx = i + 1;
+                       filter_cnt = argc - filter_idx;
+                       argc = i;
+                       break;
+               }
+       }
+
        mutex_lock(&event_mutex);
        event_call = find_and_get_event(sys_name, sys_event);
        ep = alloc_event_probe(group, event, event_call, argc - 2);
@@ -988,6 +1067,14 @@ static int __trace_eprobe_create(int argc, const char *argv[])
                goto error;
        }
 
+       if (filter_idx) {
+               trace_probe_log_set_index(filter_idx);
+               ret = trace_eprobe_parse_filter(ep, filter_cnt, argv + filter_idx);
+               if (ret)
+                       goto parse_error;
+       } else
+               ep->filter_str = NULL;
+
        argc -= 2; argv += 2;
        /* parse arguments */
        for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
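
Given the extended syntax above, an eprobe with a filter can be created by appending a command to the tracefs dynamic_events file. The group, event, fetch argument, and filter values below are hypothetical and only illustrate the new "if FILTER" clause; note the filter applies to the original event's fields, as the parsing code above states:

  #include <fcntl.h>
  #include <string.h>
  #include <unistd.h>

  int main(void)
  {
          /* Hypothetical eprobe on sched/sched_stat_runtime: fetch its
             "runtime" field and fire only when runtime > 1000000. */
          const char *cmd =
                  "e:egroup/slow_runtime sched/sched_stat_runtime "
                  "rt=$runtime:u64 if runtime > 1000000";
          int fd = open("/sys/kernel/tracing/dynamic_events",
                        O_WRONLY | O_APPEND);

          if (fd < 0)
                  return 1;
          if (write(fd, cmd, strlen(cmd)) < 0)
                  return 1;
          return close(fd);
  }
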
index 4b1057a..96acc2b 100644 (file)
@@ -43,6 +43,42 @@ enum filter_op_ids { OPS };
 
 static const char * ops[] = { OPS };
 
+enum filter_pred_fn {
+       FILTER_PRED_FN_NOP,
+       FILTER_PRED_FN_64,
+       FILTER_PRED_FN_S64,
+       FILTER_PRED_FN_U64,
+       FILTER_PRED_FN_32,
+       FILTER_PRED_FN_S32,
+       FILTER_PRED_FN_U32,
+       FILTER_PRED_FN_16,
+       FILTER_PRED_FN_S16,
+       FILTER_PRED_FN_U16,
+       FILTER_PRED_FN_8,
+       FILTER_PRED_FN_S8,
+       FILTER_PRED_FN_U8,
+       FILTER_PRED_FN_COMM,
+       FILTER_PRED_FN_STRING,
+       FILTER_PRED_FN_STRLOC,
+       FILTER_PRED_FN_STRRELLOC,
+       FILTER_PRED_FN_PCHAR_USER,
+       FILTER_PRED_FN_PCHAR,
+       FILTER_PRED_FN_CPU,
+       FILTER_PRED_FN_,
+       FILTER_PRED_TEST_VISITED,
+};
+
+struct filter_pred {
+       enum filter_pred_fn     fn_num;
+       u64                     val;
+       struct regex            regex;
+       unsigned short          *ops;
+       struct ftrace_event_field *field;
+       int                     offset;
+       int                     not;
+       int                     op;
+};
+
 /*
  * pred functions are OP_LE, OP_LT, OP_GE, OP_GT, and OP_BAND
  * pred_funcs_##type below must match the order of them above.
@@ -590,44 +626,48 @@ out_free:
        return ERR_PTR(ret);
 }
 
+enum pred_cmp_types {
+       PRED_CMP_TYPE_NOP,
+       PRED_CMP_TYPE_LT,
+       PRED_CMP_TYPE_LE,
+       PRED_CMP_TYPE_GT,
+       PRED_CMP_TYPE_GE,
+       PRED_CMP_TYPE_BAND,
+};
+
 #define DEFINE_COMPARISON_PRED(type)                                   \
-static int filter_pred_LT_##type(struct filter_pred *pred, void *event)        \
-{                                                                      \
-       type *addr = (type *)(event + pred->offset);                    \
-       type val = (type)pred->val;                                     \
-       return *addr < val;                                             \
-}                                                                      \
-static int filter_pred_LE_##type(struct filter_pred *pred, void *event)        \
-{                                                                      \
-       type *addr = (type *)(event + pred->offset);                    \
-       type val = (type)pred->val;                                     \
-       return *addr <= val;                                            \
-}                                                                      \
-static int filter_pred_GT_##type(struct filter_pred *pred, void *event)        \
+static int filter_pred_##type(struct filter_pred *pred, void *event)   \
 {                                                                      \
-       type *addr = (type *)(event + pred->offset);                    \
-       type val = (type)pred->val;                                     \
-       return *addr > val;                                     \
-}                                                                      \
-static int filter_pred_GE_##type(struct filter_pred *pred, void *event)        \
-{                                                                      \
-       type *addr = (type *)(event + pred->offset);                    \
-       type val = (type)pred->val;                                     \
-       return *addr >= val;                                            \
-}                                                                      \
-static int filter_pred_BAND_##type(struct filter_pred *pred, void *event) \
-{                                                                      \
-       type *addr = (type *)(event + pred->offset);                    \
-       type val = (type)pred->val;                                     \
-       return !!(*addr & val);                                         \
-}                                                                      \
-static const filter_pred_fn_t pred_funcs_##type[] = {                  \
-       filter_pred_LE_##type,                                          \
-       filter_pred_LT_##type,                                          \
-       filter_pred_GE_##type,                                          \
-       filter_pred_GT_##type,                                          \
-       filter_pred_BAND_##type,                                        \
-};
+       switch (pred->op) {                                             \
+       case OP_LT: {                                                   \
+               type *addr = (type *)(event + pred->offset);            \
+               type val = (type)pred->val;                             \
+               return *addr < val;                                     \
+       }                                                               \
+       case OP_LE: {                                   \
+               type *addr = (type *)(event + pred->offset);            \
+               type val = (type)pred->val;                             \
+               return *addr <= val;                                    \
+       }                                                               \
+       case OP_GT: {                                   \
+               type *addr = (type *)(event + pred->offset);            \
+               type val = (type)pred->val;                             \
+               return *addr > val;                                     \
+       }                                                               \
+       case OP_GE: {                                   \
+               type *addr = (type *)(event + pred->offset);            \
+               type val = (type)pred->val;                             \
+               return *addr >= val;                                    \
+       }                                                               \
+       case OP_BAND: {                                 \
+               type *addr = (type *)(event + pred->offset);            \
+               type val = (type)pred->val;                             \
+               return !!(*addr & val);                                 \
+       }                                                               \
+       default:                                                        \
+               return 0;                                               \
+       }                                                               \
+}
 
 #define DEFINE_EQUALITY_PRED(size)                                     \
 static int filter_pred_##size(struct filter_pred *pred, void *event)   \
@@ -836,11 +876,6 @@ static int filter_pred_comm(struct filter_pred *pred, void *event)
        return cmp ^ pred->not;
 }
 
-static int filter_pred_none(struct filter_pred *pred, void *event)
-{
-       return 0;
-}
-
 /*
  * regex_match_foo - Basic regex callbacks
  *
@@ -986,6 +1021,19 @@ static void filter_build_regex(struct filter_pred *pred)
        }
 }
 
+
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+static int test_pred_visited_fn(struct filter_pred *pred, void *event);
+#else
+static int test_pred_visited_fn(struct filter_pred *pred, void *event)
+{
+       return 0;
+}
+#endif
+
+
+static int filter_pred_fn_call(struct filter_pred *pred, void *event);
+
 /* return 1 if event matches, 0 otherwise (discard) */
 int filter_match_preds(struct event_filter *filter, void *rec)
 {
@@ -1003,7 +1051,7 @@ int filter_match_preds(struct event_filter *filter, void *rec)
 
        for (i = 0; prog[i].pred; i++) {
                struct filter_pred *pred = prog[i].pred;
-               int match = pred->fn(pred, rec);
+               int match = filter_pred_fn_call(pred, rec);
                if (match == prog[i].when_to_branch)
                        i = prog[i].target;
        }
@@ -1189,10 +1237,10 @@ int filter_assign_type(const char *type)
        return FILTER_OTHER;
 }
 
-static filter_pred_fn_t select_comparison_fn(enum filter_op_ids op,
-                                           int field_size, int field_is_signed)
+static enum filter_pred_fn select_comparison_fn(enum filter_op_ids op,
+                                               int field_size, int field_is_signed)
 {
-       filter_pred_fn_t fn = NULL;
+       enum filter_pred_fn fn = FILTER_PRED_FN_NOP;
        int pred_func_index = -1;
 
        switch (op) {
@@ -1201,50 +1249,99 @@ static filter_pred_fn_t select_comparison_fn(enum filter_op_ids op,
                break;
        default:
                if (WARN_ON_ONCE(op < PRED_FUNC_START))
-                       return NULL;
+                       return fn;
                pred_func_index = op - PRED_FUNC_START;
                if (WARN_ON_ONCE(pred_func_index > PRED_FUNC_MAX))
-                       return NULL;
+                       return fn;
        }
 
        switch (field_size) {
        case 8:
                if (pred_func_index < 0)
-                       fn = filter_pred_64;
+                       fn = FILTER_PRED_FN_64;
                else if (field_is_signed)
-                       fn = pred_funcs_s64[pred_func_index];
+                       fn = FILTER_PRED_FN_S64;
                else
-                       fn = pred_funcs_u64[pred_func_index];
+                       fn = FILTER_PRED_FN_U64;
                break;
        case 4:
                if (pred_func_index < 0)
-                       fn = filter_pred_32;
+                       fn = FILTER_PRED_FN_32;
                else if (field_is_signed)
-                       fn = pred_funcs_s32[pred_func_index];
+                       fn = FILTER_PRED_FN_S32;
                else
-                       fn = pred_funcs_u32[pred_func_index];
+                       fn = FILTER_PRED_FN_U32;
                break;
        case 2:
                if (pred_func_index < 0)
-                       fn = filter_pred_16;
+                       fn = FILTER_PRED_FN_16;
                else if (field_is_signed)
-                       fn = pred_funcs_s16[pred_func_index];
+                       fn = FILTER_PRED_FN_S16;
                else
-                       fn = pred_funcs_u16[pred_func_index];
+                       fn = FILTER_PRED_FN_U16;
                break;
        case 1:
                if (pred_func_index < 0)
-                       fn = filter_pred_8;
+                       fn = FILTER_PRED_FN_8;
                else if (field_is_signed)
-                       fn = pred_funcs_s8[pred_func_index];
+                       fn = FILTER_PRED_FN_S8;
                else
-                       fn = pred_funcs_u8[pred_func_index];
+                       fn = FILTER_PRED_FN_U8;
                break;
        }
 
        return fn;
 }
 
+
+static int filter_pred_fn_call(struct filter_pred *pred, void *event)
+{
+       switch (pred->fn_num) {
+       case FILTER_PRED_FN_64:
+               return filter_pred_64(pred, event);
+       case FILTER_PRED_FN_S64:
+               return filter_pred_s64(pred, event);
+       case FILTER_PRED_FN_U64:
+               return filter_pred_u64(pred, event);
+       case FILTER_PRED_FN_32:
+               return filter_pred_32(pred, event);
+       case FILTER_PRED_FN_S32:
+               return filter_pred_s32(pred, event);
+       case FILTER_PRED_FN_U32:
+               return filter_pred_u32(pred, event);
+       case FILTER_PRED_FN_16:
+               return filter_pred_16(pred, event);
+       case FILTER_PRED_FN_S16:
+               return filter_pred_s16(pred, event);
+       case FILTER_PRED_FN_U16:
+               return filter_pred_u16(pred, event);
+       case FILTER_PRED_FN_8:
+               return filter_pred_8(pred, event);
+       case FILTER_PRED_FN_S8:
+               return filter_pred_s8(pred, event);
+       case FILTER_PRED_FN_U8:
+               return filter_pred_u8(pred, event);
+       case FILTER_PRED_FN_COMM:
+               return filter_pred_comm(pred, event);
+       case FILTER_PRED_FN_STRING:
+               return filter_pred_string(pred, event);
+       case FILTER_PRED_FN_STRLOC:
+               return filter_pred_strloc(pred, event);
+       case FILTER_PRED_FN_STRRELLOC:
+               return filter_pred_strrelloc(pred, event);
+       case FILTER_PRED_FN_PCHAR_USER:
+               return filter_pred_pchar_user(pred, event);
+       case FILTER_PRED_FN_PCHAR:
+               return filter_pred_pchar(pred, event);
+       case FILTER_PRED_FN_CPU:
+               return filter_pred_cpu(pred, event);
+       case FILTER_PRED_TEST_VISITED:
+               return test_pred_visited_fn(pred, event);
+       default:
+               return 0;
+       }
+}
+
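
The point of filter_pred_fn_call() is to replace the indirect call through pred->fn with direct calls selected by a switch. Under CONFIG_RETPOLINE every indirect call goes through a retpoline thunk, which is expensive in a path run for every filtered event; a switch over a small, dense enum compiles to direct branches or a jump table. A minimal model of the before/after (not kernel code):

    struct pred;
    typedef int (*pred_fn_t)(struct pred *, void *);

    static int pred_64(struct pred *p, void *ev)     { (void)p; (void)ev; return 1; }
    static int pred_string(struct pred *p, void *ev) { (void)p; (void)ev; return 1; }

    /* Before: one indirect call per predicate evaluation. */
    static int call_indirect(pred_fn_t fn, struct pred *p, void *ev)
    {
            return fn(p, ev);          /* retpoline under CONFIG_RETPOLINE */
    }

    /* After: an enum tag in the predicate, dispatched with direct calls. */
    enum pred_fn_num { PRED_FN_NOP, PRED_FN_64, PRED_FN_STRING };

    static int call_switch(enum pred_fn_num n, struct pred *p, void *ev)
    {
            switch (n) {
            case PRED_FN_64:     return pred_64(p, ev);
            case PRED_FN_STRING: return pred_string(p, ev);
            default:             return 0;     /* PRED_FN_NOP */
            }
    }
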
 /* Called when a predicate is encountered by predicate_parse() */
 static int parse_pred(const char *str, void *data,
                      int pos, struct filter_parse_error *pe,
@@ -1338,7 +1435,7 @@ static int parse_pred(const char *str, void *data,
                        parse_error(pe, FILT_ERR_IP_FIELD_ONLY, pos + i);
                        goto err_free;
                }
-               pred->fn = filter_pred_none;
+               pred->fn_num = FILTER_PRED_FN_NOP;
 
                /*
                 * Quotes are not required, but if they exist then we need
@@ -1416,16 +1513,16 @@ static int parse_pred(const char *str, void *data,
                filter_build_regex(pred);
 
                if (field->filter_type == FILTER_COMM) {
-                       pred->fn = filter_pred_comm;
+                       pred->fn_num = FILTER_PRED_FN_COMM;
 
                } else if (field->filter_type == FILTER_STATIC_STRING) {
-                       pred->fn = filter_pred_string;
+                       pred->fn_num = FILTER_PRED_FN_STRING;
                        pred->regex.field_len = field->size;
 
                } else if (field->filter_type == FILTER_DYN_STRING) {
-                       pred->fn = filter_pred_strloc;
+                       pred->fn_num = FILTER_PRED_FN_STRLOC;
                } else if (field->filter_type == FILTER_RDYN_STRING)
-                       pred->fn = filter_pred_strrelloc;
+                       pred->fn_num = FILTER_PRED_FN_STRRELLOC;
                else {
 
                        if (!ustring_per_cpu) {
@@ -1436,9 +1533,9 @@ static int parse_pred(const char *str, void *data,
                        }
 
                        if (ustring)
-                               pred->fn = filter_pred_pchar_user;
+                               pred->fn_num = FILTER_PRED_FN_PCHAR_USER;
                        else
-                               pred->fn = filter_pred_pchar;
+                               pred->fn_num = FILTER_PRED_FN_PCHAR;
                }
                /* go past the last quote */
                i++;
@@ -1486,10 +1583,10 @@ static int parse_pred(const char *str, void *data,
                pred->val = val;
 
                if (field->filter_type == FILTER_CPU)
-                       pred->fn = filter_pred_cpu;
+                       pred->fn_num = FILTER_PRED_FN_CPU;
                else {
-                       pred->fn = select_comparison_fn(pred->op, field->size,
-                                                       field->is_signed);
+                       pred->fn_num = select_comparison_fn(pred->op, field->size,
+                                                           field->is_signed);
                        if (pred->op == OP_NE)
                                pred->not = 1;
                }
@@ -2296,7 +2393,7 @@ static void update_pred_fn(struct event_filter *filter, char *fields)
                struct filter_pred *pred = prog[i].pred;
                struct ftrace_event_field *field = pred->field;
 
-               WARN_ON_ONCE(!pred->fn);
+               WARN_ON_ONCE(pred->fn_num == FILTER_PRED_FN_NOP);
 
                if (!field) {
                        WARN_ONCE(1, "all leafs should have field defined %d", i);
@@ -2306,7 +2403,7 @@ static void update_pred_fn(struct event_filter *filter, char *fields)
                if (!strchr(fields, *field->name))
                        continue;
 
-               pred->fn = test_pred_visited_fn;
+               pred->fn_num = FILTER_PRED_TEST_VISITED;
        }
 }
 
index fdf7846..48465f7 100644 (file)
@@ -104,6 +104,38 @@ enum field_op_id {
        FIELD_OP_MULT,
 };
 
+enum hist_field_fn {
+       HIST_FIELD_FN_NOP,
+       HIST_FIELD_FN_VAR_REF,
+       HIST_FIELD_FN_COUNTER,
+       HIST_FIELD_FN_CONST,
+       HIST_FIELD_FN_LOG2,
+       HIST_FIELD_FN_BUCKET,
+       HIST_FIELD_FN_TIMESTAMP,
+       HIST_FIELD_FN_CPU,
+       HIST_FIELD_FN_STRING,
+       HIST_FIELD_FN_DYNSTRING,
+       HIST_FIELD_FN_RELDYNSTRING,
+       HIST_FIELD_FN_PSTRING,
+       HIST_FIELD_FN_S64,
+       HIST_FIELD_FN_U64,
+       HIST_FIELD_FN_S32,
+       HIST_FIELD_FN_U32,
+       HIST_FIELD_FN_S16,
+       HIST_FIELD_FN_U16,
+       HIST_FIELD_FN_S8,
+       HIST_FIELD_FN_U8,
+       HIST_FIELD_FN_UMINUS,
+       HIST_FIELD_FN_MINUS,
+       HIST_FIELD_FN_PLUS,
+       HIST_FIELD_FN_DIV,
+       HIST_FIELD_FN_MULT,
+       HIST_FIELD_FN_DIV_POWER2,
+       HIST_FIELD_FN_DIV_NOT_POWER2,
+       HIST_FIELD_FN_DIV_MULT_SHIFT,
+       HIST_FIELD_FN_EXECNAME,
+};
+
 /*
  * A hist_var (histogram variable) contains variable information for
  * hist_fields having the HIST_FIELD_FL_VAR or HIST_FIELD_FL_VAR_REF
@@ -123,15 +155,15 @@ struct hist_var {
 struct hist_field {
        struct ftrace_event_field       *field;
        unsigned long                   flags;
-       hist_field_fn_t                 fn;
-       unsigned int                    ref;
-       unsigned int                    size;
-       unsigned int                    offset;
-       unsigned int                    is_signed;
        unsigned long                   buckets;
        const char                      *type;
        struct hist_field               *operands[HIST_FIELD_OPERANDS_MAX];
        struct hist_trigger_data        *hist_data;
+       enum hist_field_fn              fn_num;
+       unsigned int                    ref;
+       unsigned int                    size;
+       unsigned int                    offset;
+       unsigned int                    is_signed;
 
        /*
         * Variable fields contain variable-specific info in var.
@@ -166,14 +198,11 @@ struct hist_field {
        u64                             div_multiplier;
 };
 
-static u64 hist_field_none(struct hist_field *field,
-                          struct tracing_map_elt *elt,
-                          struct trace_buffer *buffer,
-                          struct ring_buffer_event *rbe,
-                          void *event)
-{
-       return 0;
-}
+static u64 hist_fn_call(struct hist_field *hist_field,
+                       struct tracing_map_elt *elt,
+                       struct trace_buffer *buffer,
+                       struct ring_buffer_event *rbe,
+                       void *event);
 
 static u64 hist_field_const(struct hist_field *field,
                           struct tracing_map_elt *elt,
@@ -250,7 +279,7 @@ static u64 hist_field_log2(struct hist_field *hist_field,
 {
        struct hist_field *operand = hist_field->operands[0];
 
-       u64 val = operand->fn(operand, elt, buffer, rbe, event);
+       u64 val = hist_fn_call(operand, elt, buffer, rbe, event);
 
        return (u64) ilog2(roundup_pow_of_two(val));
 }
@@ -264,7 +293,7 @@ static u64 hist_field_bucket(struct hist_field *hist_field,
        struct hist_field *operand = hist_field->operands[0];
        unsigned long buckets = hist_field->buckets;
 
-       u64 val = operand->fn(operand, elt, buffer, rbe, event);
+       u64 val = hist_fn_call(operand, elt, buffer, rbe, event);
 
        if (WARN_ON_ONCE(!buckets))
                return val;
@@ -285,8 +314,8 @@ static u64 hist_field_plus(struct hist_field *hist_field,
        struct hist_field *operand1 = hist_field->operands[0];
        struct hist_field *operand2 = hist_field->operands[1];
 
-       u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
-       u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event);
+       u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event);
+       u64 val2 = hist_fn_call(operand2, elt, buffer, rbe, event);
 
        return val1 + val2;
 }
@@ -300,8 +329,8 @@ static u64 hist_field_minus(struct hist_field *hist_field,
        struct hist_field *operand1 = hist_field->operands[0];
        struct hist_field *operand2 = hist_field->operands[1];
 
-       u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
-       u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event);
+       u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event);
+       u64 val2 = hist_fn_call(operand2, elt, buffer, rbe, event);
 
        return val1 - val2;
 }
@@ -315,8 +344,8 @@ static u64 hist_field_div(struct hist_field *hist_field,
        struct hist_field *operand1 = hist_field->operands[0];
        struct hist_field *operand2 = hist_field->operands[1];
 
-       u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
-       u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event);
+       u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event);
+       u64 val2 = hist_fn_call(operand2, elt, buffer, rbe, event);
 
        /* Return -1 for the undefined case */
        if (!val2)
@@ -338,7 +367,7 @@ static u64 div_by_power_of_two(struct hist_field *hist_field,
        struct hist_field *operand1 = hist_field->operands[0];
        struct hist_field *operand2 = hist_field->operands[1];
 
-       u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
+       u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event);
 
        return val1 >> __ffs64(operand2->constant);
 }
@@ -352,7 +381,7 @@ static u64 div_by_not_power_of_two(struct hist_field *hist_field,
        struct hist_field *operand1 = hist_field->operands[0];
        struct hist_field *operand2 = hist_field->operands[1];
 
-       u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
+       u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event);
 
        return div64_u64(val1, operand2->constant);
 }
@@ -366,7 +395,7 @@ static u64 div_by_mult_and_shift(struct hist_field *hist_field,
        struct hist_field *operand1 = hist_field->operands[0];
        struct hist_field *operand2 = hist_field->operands[1];
 
-       u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
+       u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event);
 
        /*
         * If the divisor is a constant, do a multiplication and shift instead.
@@ -400,8 +429,8 @@ static u64 hist_field_mult(struct hist_field *hist_field,
        struct hist_field *operand1 = hist_field->operands[0];
        struct hist_field *operand2 = hist_field->operands[1];
 
-       u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
-       u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event);
+       u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event);
+       u64 val2 = hist_fn_call(operand2, elt, buffer, rbe, event);
 
        return val1 * val2;
 }
@@ -414,7 +443,7 @@ static u64 hist_field_unary_minus(struct hist_field *hist_field,
 {
        struct hist_field *operand = hist_field->operands[0];
 
-       s64 sval = (s64)operand->fn(operand, elt, buffer, rbe, event);
+       s64 sval = (s64)hist_fn_call(operand, elt, buffer, rbe, event);
        u64 val = (u64)-sval;
 
        return val;
@@ -657,19 +686,19 @@ struct snapshot_context {
  * Returns the specific division function to use if the divisor
  * is constant. This avoids extra branches when the trigger is hit.
  */
-static hist_field_fn_t hist_field_get_div_fn(struct hist_field *divisor)
+static enum hist_field_fn hist_field_get_div_fn(struct hist_field *divisor)
 {
        u64 div = divisor->constant;
 
        if (!(div & (div - 1)))
-               return div_by_power_of_two;
+               return HIST_FIELD_FN_DIV_POWER2;
 
        /* If the divisor is too large, do a regular division */
        if (div > (1 << HIST_DIV_SHIFT))
-               return div_by_not_power_of_two;
+               return HIST_FIELD_FN_DIV_NOT_POWER2;
 
        divisor->div_multiplier = div64_u64((u64)(1 << HIST_DIV_SHIFT), div);
-       return div_by_mult_and_shift;
+       return HIST_FIELD_FN_DIV_MULT_SHIFT;
 }
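
hist_field_get_div_fn() picks one of three strategies for a constant divisor: a shift when it is a power of two, a true division when it is too large, and otherwise a precomputed multiply-and-shift using the identity X / Y ~= (X * (Z / Y)) >> shift with Z = 1 << HIST_DIV_SHIFT. A worked example of the multiply-and-shift path (shift of 20 assumed for illustration; because the multiplier is truncated the result can undershoot by one, e.g. 1000 / 10 comes out as 99 here, which is why the in-kernel helper bounds the dividend so the error is at most one and falls back to a true division for larger values):

    #include <stdint.h>
    #include <stdio.h>

    #define DIV_SHIFT 20

    int main(void)
    {
            uint64_t div  = 10;
            uint64_t mult = (1ULL << DIV_SHIFT) / div;   /* 104857 */
            uint64_t val  = 12345;

            /* (val * mult) >> DIV_SHIFT approximates val / div */
            uint64_t approx = (val * mult) >> DIV_SHIFT; /* 1234 */

            /* power-of-two divisors need only a shift */
            uint64_t pow2 = val >> 3;                    /* val / 8 = 1543 */

            printf("%llu %llu %llu\n",
                   (unsigned long long)approx,
                   (unsigned long long)(val / div),
                   (unsigned long long)pow2);
            return 0;
    }
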
 
 static void track_data_free(struct track_data *track_data)
@@ -1334,38 +1363,32 @@ static const char *hist_field_name(struct hist_field *field,
        return field_name;
 }
 
-static hist_field_fn_t select_value_fn(int field_size, int field_is_signed)
+static enum hist_field_fn select_value_fn(int field_size, int field_is_signed)
 {
-       hist_field_fn_t fn = NULL;
-
        switch (field_size) {
        case 8:
                if (field_is_signed)
-                       fn = hist_field_s64;
+                       return HIST_FIELD_FN_S64;
                else
-                       fn = hist_field_u64;
-               break;
+                       return HIST_FIELD_FN_U64;
        case 4:
                if (field_is_signed)
-                       fn = hist_field_s32;
+                       return HIST_FIELD_FN_S32;
                else
-                       fn = hist_field_u32;
-               break;
+                       return HIST_FIELD_FN_U32;
        case 2:
                if (field_is_signed)
-                       fn = hist_field_s16;
+                       return HIST_FIELD_FN_S16;
                else
-                       fn = hist_field_u16;
-               break;
+                       return HIST_FIELD_FN_U16;
        case 1:
                if (field_is_signed)
-                       fn = hist_field_s8;
+                       return HIST_FIELD_FN_S8;
                else
-                       fn = hist_field_u8;
-               break;
+                       return HIST_FIELD_FN_U8;
        }
 
-       return fn;
+       return HIST_FIELD_FN_NOP;
 }
 
 static int parse_map_size(char *str)
@@ -1922,19 +1945,19 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
                goto out; /* caller will populate */
 
        if (flags & HIST_FIELD_FL_VAR_REF) {
-               hist_field->fn = hist_field_var_ref;
+               hist_field->fn_num = HIST_FIELD_FN_VAR_REF;
                goto out;
        }
 
        if (flags & HIST_FIELD_FL_HITCOUNT) {
-               hist_field->fn = hist_field_counter;
+               hist_field->fn_num = HIST_FIELD_FN_COUNTER;
                hist_field->size = sizeof(u64);
                hist_field->type = "u64";
                goto out;
        }
 
        if (flags & HIST_FIELD_FL_CONST) {
-               hist_field->fn = hist_field_const;
+               hist_field->fn_num = HIST_FIELD_FN_CONST;
                hist_field->size = sizeof(u64);
                hist_field->type = kstrdup("u64", GFP_KERNEL);
                if (!hist_field->type)
@@ -1943,14 +1966,14 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
        }
 
        if (flags & HIST_FIELD_FL_STACKTRACE) {
-               hist_field->fn = hist_field_none;
+               hist_field->fn_num = HIST_FIELD_FN_NOP;
                goto out;
        }
 
        if (flags & (HIST_FIELD_FL_LOG2 | HIST_FIELD_FL_BUCKET)) {
                unsigned long fl = flags & ~(HIST_FIELD_FL_LOG2 | HIST_FIELD_FL_BUCKET);
-               hist_field->fn = flags & HIST_FIELD_FL_LOG2 ? hist_field_log2 :
-                       hist_field_bucket;
+               hist_field->fn_num = flags & HIST_FIELD_FL_LOG2 ? HIST_FIELD_FN_LOG2 :
+                       HIST_FIELD_FN_BUCKET;
                hist_field->operands[0] = create_hist_field(hist_data, field, fl, NULL);
                hist_field->size = hist_field->operands[0]->size;
                hist_field->type = kstrdup_const(hist_field->operands[0]->type, GFP_KERNEL);
@@ -1960,14 +1983,14 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
        }
 
        if (flags & HIST_FIELD_FL_TIMESTAMP) {
-               hist_field->fn = hist_field_timestamp;
+               hist_field->fn_num = HIST_FIELD_FN_TIMESTAMP;
                hist_field->size = sizeof(u64);
                hist_field->type = "u64";
                goto out;
        }
 
        if (flags & HIST_FIELD_FL_CPU) {
-               hist_field->fn = hist_field_cpu;
+               hist_field->fn_num = HIST_FIELD_FN_CPU;
                hist_field->size = sizeof(int);
                hist_field->type = "unsigned int";
                goto out;
@@ -1987,14 +2010,14 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
                        goto free;
 
                if (field->filter_type == FILTER_STATIC_STRING) {
-                       hist_field->fn = hist_field_string;
+                       hist_field->fn_num = HIST_FIELD_FN_STRING;
                        hist_field->size = field->size;
                } else if (field->filter_type == FILTER_DYN_STRING) {
-                       hist_field->fn = hist_field_dynstring;
+                       hist_field->fn_num = HIST_FIELD_FN_DYNSTRING;
                } else if (field->filter_type == FILTER_RDYN_STRING)
-                       hist_field->fn = hist_field_reldynstring;
+                       hist_field->fn_num = HIST_FIELD_FN_RELDYNSTRING;
                else
-                       hist_field->fn = hist_field_pstring;
+                       hist_field->fn_num = HIST_FIELD_FN_PSTRING;
        } else {
                hist_field->size = field->size;
                hist_field->is_signed = field->is_signed;
@@ -2002,9 +2025,9 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
                if (!hist_field->type)
                        goto free;
 
-               hist_field->fn = select_value_fn(field->size,
-                                                field->is_signed);
-               if (!hist_field->fn) {
+               hist_field->fn_num = select_value_fn(field->size,
+                                                    field->is_signed);
+               if (hist_field->fn_num == HIST_FIELD_FN_NOP) {
                        destroy_hist_field(hist_field, 0);
                        return NULL;
                }
@@ -2340,7 +2363,7 @@ static struct hist_field *create_alias(struct hist_trigger_data *hist_data,
        if (!alias)
                return NULL;
 
-       alias->fn = var_ref->fn;
+       alias->fn_num = var_ref->fn_num;
        alias->operands[0] = var_ref;
 
        if (init_var_ref(alias, var_ref, var_ref->system, var_ref->event_name)) {
@@ -2523,7 +2546,7 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data,
 
        expr->flags |= operand1->flags &
                (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS);
-       expr->fn = hist_field_unary_minus;
+       expr->fn_num = HIST_FIELD_FN_UMINUS;
        expr->operands[0] = operand1;
        expr->size = operand1->size;
        expr->is_signed = operand1->is_signed;
@@ -2595,7 +2618,7 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
        unsigned long operand_flags, operand2_flags;
        int field_op, ret = -EINVAL;
        char *sep, *operand1_str;
-       hist_field_fn_t op_fn;
+       enum hist_field_fn op_fn;
        bool combine_consts;
 
        if (*n_subexprs > 3) {
@@ -2654,16 +2677,16 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
 
        switch (field_op) {
        case FIELD_OP_MINUS:
-               op_fn = hist_field_minus;
+               op_fn = HIST_FIELD_FN_MINUS;
                break;
        case FIELD_OP_PLUS:
-               op_fn = hist_field_plus;
+               op_fn = HIST_FIELD_FN_PLUS;
                break;
        case FIELD_OP_DIV:
-               op_fn = hist_field_div;
+               op_fn = HIST_FIELD_FN_DIV;
                break;
        case FIELD_OP_MULT:
-               op_fn = hist_field_mult;
+               op_fn = HIST_FIELD_FN_MULT;
                break;
        default:
                ret = -EINVAL;
@@ -2719,13 +2742,16 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
                op_fn = hist_field_get_div_fn(operand2);
        }
 
+       expr->fn_num = op_fn;
+
        if (combine_consts) {
                if (var1)
                        expr->operands[0] = var1;
                if (var2)
                        expr->operands[1] = var2;
 
-               expr->constant = op_fn(expr, NULL, NULL, NULL, NULL);
+               expr->constant = hist_fn_call(expr, NULL, NULL, NULL, NULL);
+               expr->fn_num = HIST_FIELD_FN_CONST;
 
                expr->operands[0] = NULL;
                expr->operands[1] = NULL;
@@ -2739,8 +2765,6 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
 
                expr->name = expr_str(expr, 0);
        } else {
-               expr->fn = op_fn;
-
                /* The operand sizes should be the same, so just pick one */
                expr->size = operand1->size;
                expr->is_signed = operand1->is_signed;
@@ -3065,7 +3089,7 @@ static inline void __update_field_vars(struct tracing_map_elt *elt,
                struct hist_field *var = field_var->var;
                struct hist_field *val = field_var->val;
 
-               var_val = val->fn(val, elt, buffer, rbe, rec);
+               var_val = hist_fn_call(val, elt, buffer, rbe, rec);
                var_idx = var->var.idx;
 
                if (val->flags & HIST_FIELD_FL_STRING) {
@@ -4186,6 +4210,74 @@ static u64 hist_field_execname(struct hist_field *hist_field,
        return (u64)(unsigned long)(elt_data->comm);
 }
 
+static u64 hist_fn_call(struct hist_field *hist_field,
+                       struct tracing_map_elt *elt,
+                       struct trace_buffer *buffer,
+                       struct ring_buffer_event *rbe,
+                       void *event)
+{
+       switch (hist_field->fn_num) {
+       case HIST_FIELD_FN_VAR_REF:
+               return hist_field_var_ref(hist_field, elt, buffer, rbe, event);
+       case HIST_FIELD_FN_COUNTER:
+               return hist_field_counter(hist_field, elt, buffer, rbe, event);
+       case HIST_FIELD_FN_CONST:
+               return hist_field_const(hist_field, elt, buffer, rbe, event);
+       case HIST_FIELD_FN_LOG2:
+               return hist_field_log2(hist_field, elt, buffer, rbe, event);
+       case HIST_FIELD_FN_BUCKET:
+               return hist_field_bucket(hist_field, elt, buffer, rbe, event);
+       case HIST_FIELD_FN_TIMESTAMP:
+               return hist_field_timestamp(hist_field, elt, buffer, rbe, event);
+       case HIST_FIELD_FN_CPU:
+               return hist_field_cpu(hist_field, elt, buffer, rbe, event);
+       case HIST_FIELD_FN_STRING:
+               return hist_field_string(hist_field, elt, buffer, rbe, event);
+       case HIST_FIELD_FN_DYNSTRING:
+               return hist_field_dynstring(hist_field, elt, buffer, rbe, event);
+       case HIST_FIELD_FN_RELDYNSTRING:
+               return hist_field_reldynstring(hist_field, elt, buffer, rbe, event);
+       case HIST_FIELD_FN_PSTRING:
+               return hist_field_pstring(hist_field, elt, buffer, rbe, event);
+       case HIST_FIELD_FN_S64:
+               return hist_field_s64(hist_field, elt, buffer, rbe, event);
+       case HIST_FIELD_FN_U64:
+               return hist_field_u64(hist_field, elt, buffer, rbe, event);
+       case HIST_FIELD_FN_S32:
+               return hist_field_s32(hist_field, elt, buffer, rbe, event);
+       case HIST_FIELD_FN_U32:
+               return hist_field_u32(hist_field, elt, buffer, rbe, event);
+       case HIST_FIELD_FN_S16:
+               return hist_field_s16(hist_field, elt, buffer, rbe, event);
+       case HIST_FIELD_FN_U16:
+               return hist_field_u16(hist_field, elt, buffer, rbe, event);
+       case HIST_FIELD_FN_S8:
+               return hist_field_s8(hist_field, elt, buffer, rbe, event);
+       case HIST_FIELD_FN_U8:
+               return hist_field_u8(hist_field, elt, buffer, rbe, event);
+       case HIST_FIELD_FN_UMINUS:
+               return hist_field_unary_minus(hist_field, elt, buffer, rbe, event);
+       case HIST_FIELD_FN_MINUS:
+               return hist_field_minus(hist_field, elt, buffer, rbe, event);
+       case HIST_FIELD_FN_PLUS:
+               return hist_field_plus(hist_field, elt, buffer, rbe, event);
+       case HIST_FIELD_FN_DIV:
+               return hist_field_div(hist_field, elt, buffer, rbe, event);
+       case HIST_FIELD_FN_MULT:
+               return hist_field_mult(hist_field, elt, buffer, rbe, event);
+       case HIST_FIELD_FN_DIV_POWER2:
+               return div_by_power_of_two(hist_field, elt, buffer, rbe, event);
+       case HIST_FIELD_FN_DIV_NOT_POWER2:
+               return div_by_not_power_of_two(hist_field, elt, buffer, rbe, event);
+       case HIST_FIELD_FN_DIV_MULT_SHIFT:
+               return div_by_mult_and_shift(hist_field, elt, buffer, rbe, event);
+       case HIST_FIELD_FN_EXECNAME:
+               return hist_field_execname(hist_field, elt, buffer, rbe, event);
+       default:
+               return 0;
+       }
+}
+
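
hist_fn_call() is also what makes the constant-folding in parse_expr() work: when both operands are constants, the expression node is evaluated once through the dispatcher at parse time and demoted to HIST_FIELD_FN_CONST, so the trigger path never re-evaluates it. A standalone model of that flow (simplified types, not kernel code):

    #include <stdint.h>
    #include <stdio.h>

    enum fn_num { FN_CONST, FN_PLUS };

    struct field {
            enum fn_num   fn_num;
            uint64_t      constant;
            struct field *operand[2];
    };

    static uint64_t fn_call(struct field *f)
    {
            switch (f->fn_num) {
            case FN_CONST: return f->constant;
            case FN_PLUS:  return fn_call(f->operand[0]) +
                                  fn_call(f->operand[1]);
            default:       return 0;
            }
    }

    int main(void)
    {
            struct field a = { .fn_num = FN_CONST, .constant = 2 };
            struct field b = { .fn_num = FN_CONST, .constant = 40 };
            struct field e = { .fn_num = FN_PLUS, .operand = { &a, &b } };

            /* fold at "parse time": evaluate once, demote to a constant */
            e.constant = fn_call(&e);
            e.fn_num   = FN_CONST;

            printf("%llu\n", (unsigned long long)fn_call(&e));  /* 42 */
            return 0;
    }
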
 /* Convert a var that points to common_pid.execname to a string */
 static void update_var_execname(struct hist_field *hist_field)
 {
@@ -4197,7 +4289,7 @@ static void update_var_execname(struct hist_field *hist_field)
        kfree_const(hist_field->type);
        hist_field->type = "char[]";
 
-       hist_field->fn = hist_field_execname;
+       hist_field->fn_num = HIST_FIELD_FN_EXECNAME;
 }
 
 static int create_var_field(struct hist_trigger_data *hist_data,
@@ -4956,7 +5048,7 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
 
        for_each_hist_val_field(i, hist_data) {
                hist_field = hist_data->fields[i];
-               hist_val = hist_field->fn(hist_field, elt, buffer, rbe, rec);
+               hist_val = hist_fn_call(hist_field, elt, buffer, rbe, rec);
                if (hist_field->flags & HIST_FIELD_FL_VAR) {
                        var_idx = hist_field->var.idx;
 
@@ -4987,7 +5079,7 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
        for_each_hist_key_field(i, hist_data) {
                hist_field = hist_data->fields[i];
                if (hist_field->flags & HIST_FIELD_FL_VAR) {
-                       hist_val = hist_field->fn(hist_field, elt, buffer, rbe, rec);
+                       hist_val = hist_fn_call(hist_field, elt, buffer, rbe, rec);
                        var_idx = hist_field->var.idx;
                        tracing_map_set_var(elt, var_idx, hist_val);
                }
@@ -5062,7 +5154,7 @@ static void event_hist_trigger(struct event_trigger_data *data,
                                         HIST_STACKTRACE_SKIP);
                        key = entries;
                } else {
-                       field_contents = key_field->fn(key_field, elt, buffer, rbe, rec);
+                       field_contents = hist_fn_call(key_field, elt, buffer, rbe, rec);
                        if (key_field->flags & HIST_FIELD_FL_STRING) {
                                key = (void *)(unsigned long)field_contents;
                                use_compound_key = true;
index a6621c5..ae78c2d 100644 (file)
@@ -14,6 +14,7 @@
 #include <linux/uio.h>
 #include <linux/ioctl.h>
 #include <linux/jhash.h>
+#include <linux/refcount.h>
 #include <linux/trace_events.h>
 #include <linux/tracefs.h>
 #include <linux/types.h>
  */
 #define MAX_PAGE_ORDER 0
 #define MAX_PAGES (1 << MAX_PAGE_ORDER)
-#define MAX_EVENTS (MAX_PAGES * PAGE_SIZE)
+#define MAX_BYTES (MAX_PAGES * PAGE_SIZE)
+#define MAX_EVENTS (MAX_BYTES * 8)
 
 /* Limit on the length of an event name plus args within the subsystem. */
 #define MAX_EVENT_DESC 512
 #define EVENT_NAME(user_event) ((user_event)->tracepoint.name)
 #define MAX_FIELD_ARRAY_SIZE 1024
-#define MAX_FIELD_ARG_NAME 256
 
-static char *register_page_data;
+/*
+ * The MAP_STATUS_* macros are used for taking an index and determining the
+ * appropriate byte and the bit in the byte to set/reset for an event.
+ *
+ * The lower 3 bits of the index decide which bit to set.
+ * The remaining upper bits of the index decide which byte to use for the bit.
+ *
+ * This is used when a probe is attached to or removed from an event, so
+ * that user programs can see the live tracing status of the event via
+ * the shared memory maps.
+ */
+#define MAP_STATUS_BYTE(index) ((index) >> 3)
+#define MAP_STATUS_MASK(index) BIT((index) & 7)
+
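
For example, status bit 42 lands in byte 5 (42 >> 3) under mask 0x04 (BIT(42 & 7)), so a reader of the mapped page tests page[5] & 0x04. Quick illustration (not kernel code):

    #include <stdio.h>

    int main(void)
    {
            int index = 42;

            printf("byte %d, mask 0x%02x\n", index >> 3, 1 << (index & 7));
            /* -> byte 5, mask 0x04 */
            return 0;
    }
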
+/*
+ * Internal bits (kernel side only) to keep track of connected probes:
+ * These are used when status is requested in text form about an event. These
+ * bits are compared against an internal byte on the event to determine which
+ * probes to print out to the user.
+ *
+ * These do not reflect the mapped bytes between the user and kernel space.
+ */
+#define EVENT_STATUS_FTRACE BIT(0)
+#define EVENT_STATUS_PERF BIT(1)
+#define EVENT_STATUS_OTHER BIT(7)
+
+/*
+ * Stores the pages, tables, and locks for a group of events.
+ * Each logical grouping of events has its own group, with a
+ * matching page for status checks within user programs. This
+ * allows events to be isolated to groups of user programs by various
+ * means.
+ */
+struct user_event_group {
+       struct page *pages;
+       char *register_page_data;
+       char *system_name;
+       struct hlist_node node;
+       struct mutex reg_mutex;
+       DECLARE_HASHTABLE(register_table, 8);
+       DECLARE_BITMAP(page_bitmap, MAX_EVENTS);
+};
 
-static DEFINE_MUTEX(reg_mutex);
-static DEFINE_HASHTABLE(register_table, 4);
-static DECLARE_BITMAP(page_bitmap, MAX_EVENTS);
+/* Group for init_user_ns mapping, top-most group */
+static struct user_event_group *init_group;
 
 /*
  * Stores per-event properties. As users register events
  * within a file, a user_event might be created if it does not
  * already exist. These are globally used and their lifetime
  * is tied to the refcnt member. These cannot go away until the
- * refcnt reaches zero.
+ * refcnt reaches one.
  */
 struct user_event {
+       struct user_event_group *group;
        struct tracepoint tracepoint;
        struct trace_event_call call;
        struct trace_event_class class;
@@ -68,10 +110,11 @@ struct user_event {
        struct hlist_node node;
        struct list_head fields;
        struct list_head validators;
-       atomic_t refcnt;
+       refcount_t refcnt;
        int index;
        int flags;
        int min_size;
+       char status;
 };
 
 /*
@@ -86,6 +129,11 @@ struct user_event_refs {
        struct user_event *events[];
 };
 
+struct user_event_file_info {
+       struct user_event_group *group;
+       struct user_event_refs *refs;
+};
+
 #define VALIDATOR_ENSURE_NULL (1 << 0)
 #define VALIDATOR_REL (1 << 1)
 
@@ -98,7 +146,8 @@ struct user_event_validator {
 typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i,
                                   void *tpdata, bool *faulted);
 
-static int user_event_parse(char *name, char *args, char *flags,
+static int user_event_parse(struct user_event_group *group, char *name,
+                           char *args, char *flags,
                            struct user_event **newuser);
 
 static u32 user_event_key(char *name)
@@ -106,6 +155,144 @@ static u32 user_event_key(char *name)
        return jhash(name, strlen(name), 0);
 }
 
+static void set_page_reservations(char *pages, bool set)
+{
+       int page;
+
+       for (page = 0; page < MAX_PAGES; ++page) {
+               void *addr = pages + (PAGE_SIZE * page);
+
+               if (set)
+                       SetPageReserved(virt_to_page(addr));
+               else
+                       ClearPageReserved(virt_to_page(addr));
+       }
+}
+
+static void user_event_group_destroy(struct user_event_group *group)
+{
+       if (group->register_page_data)
+               set_page_reservations(group->register_page_data, false);
+
+       if (group->pages)
+               __free_pages(group->pages, MAX_PAGE_ORDER);
+
+       kfree(group->system_name);
+       kfree(group);
+}
+
+static char *user_event_group_system_name(struct user_namespace *user_ns)
+{
+       char *system_name;
+       int len = sizeof(USER_EVENTS_SYSTEM) + 1;
+
+       if (user_ns != &init_user_ns) {
+               /*
+                * Unexpected at this point:
+                * We currently support only init_user_ns.
+                * When more are supported, this will trigger a failure, so log it.
+                */
+               pr_warn("user_events: Namespace other than init_user_ns!\n");
+               return NULL;
+       }
+
+       system_name = kmalloc(len, GFP_KERNEL);
+
+       if (!system_name)
+               return NULL;
+
+       snprintf(system_name, len, "%s", USER_EVENTS_SYSTEM);
+
+       return system_name;
+}
+
+static inline struct user_event_group
+*user_event_group_from_user_ns(struct user_namespace *user_ns)
+{
+       if (user_ns == &init_user_ns)
+               return init_group;
+
+       return NULL;
+}
+
+static struct user_event_group *current_user_event_group(void)
+{
+       struct user_namespace *user_ns = current_user_ns();
+       struct user_event_group *group = NULL;
+
+       while (user_ns) {
+               group = user_event_group_from_user_ns(user_ns);
+
+               if (group)
+                       break;
+
+               user_ns = user_ns->parent;
+       }
+
+       return group;
+}
+
+static struct user_event_group
+*user_event_group_create(struct user_namespace *user_ns)
+{
+       struct user_event_group *group;
+
+       group = kzalloc(sizeof(*group), GFP_KERNEL);
+
+       if (!group)
+               return NULL;
+
+       group->system_name = user_event_group_system_name(user_ns);
+
+       if (!group->system_name)
+               goto error;
+
+       group->pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, MAX_PAGE_ORDER);
+
+       if (!group->pages)
+               goto error;
+
+       group->register_page_data = page_address(group->pages);
+
+       set_page_reservations(group->register_page_data, true);
+
+       /* Zero all bits besides bit 0 (which is reserved for failures) */
+       bitmap_zero(group->page_bitmap, MAX_EVENTS);
+       set_bit(0, group->page_bitmap);
+
+       mutex_init(&group->reg_mutex);
+       hash_init(group->register_table);
+
+       return group;
+error:
+       if (group)
+               user_event_group_destroy(group);
+
+       return NULL;
+}
+
+static __always_inline
+void user_event_register_set(struct user_event *user)
+{
+       int i = user->index;
+
+       user->group->register_page_data[MAP_STATUS_BYTE(i)] |= MAP_STATUS_MASK(i);
+}
+
+static __always_inline
+void user_event_register_clear(struct user_event *user)
+{
+       int i = user->index;
+
+       user->group->register_page_data[MAP_STATUS_BYTE(i)] &= ~MAP_STATUS_MASK(i);
+}
+
+static __always_inline __must_check
+bool user_event_last_ref(struct user_event *user)
+{
+       return refcount_read(&user->refcnt) == 1;
+}
+
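
The refcount rules this helper encodes: an event is created with a count of 2 (one self reference held until deletion plus one for the creating caller), each file reference takes another, and destruction is allowed only once the self reference is the last one. refcount_t additionally saturates on overflow and warns on underflow, unlike the raw atomic_t it replaces. A toy model of the lifetime (plain int standing in for refcount_t):

    struct obj { int refcnt; };

    static int obj_last_ref(struct obj *o)
    {
            return o->refcnt == 1;      /* only the self ref remains */
    }

    static void lifetime(struct obj *o)
    {
            o->refcnt = 2;              /* refcount_set(): self + caller */
            o->refcnt--;                /* caller drops its ref          */
            if (obj_last_ref(o)) {
                    /* safe to destroy: no outside users */
            }
    }
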
 static __always_inline __must_check
 size_t copy_nofault(void *addr, size_t bytes, struct iov_iter *i)
 {
@@ -141,7 +328,8 @@ static struct list_head *user_event_get_fields(struct trace_event_call *call)
  *
  * Upon success user_event has its ref count increased by 1.
  */
-static int user_event_parse_cmd(char *raw_command, struct user_event **newuser)
+static int user_event_parse_cmd(struct user_event_group *group,
+                               char *raw_command, struct user_event **newuser)
 {
        char *name = raw_command;
        char *args = strpbrk(name, " ");
@@ -155,7 +343,7 @@ static int user_event_parse_cmd(char *raw_command, struct user_event **newuser)
        if (flags)
                *flags++ = '\0';
 
-       return user_event_parse(name, args, flags, newuser);
+       return user_event_parse(group, name, args, flags, newuser);
 }
 
 static int user_field_array_size(const char *type)
@@ -277,7 +465,7 @@ static int user_event_add_field(struct user_event *user, const char *type,
        goto add_field;
 
 add_validator:
-       if (strstr(type, "char") != 0)
+       if (strstr(type, "char") != NULL)
                validator_flags |= VALIDATOR_ENSURE_NULL;
 
        validator = kmalloc(sizeof(*validator), GFP_KERNEL);
@@ -458,7 +646,7 @@ static const char *user_field_format(const char *type)
                return "%d";
        if (strcmp(type, "unsigned char") == 0)
                return "%u";
-       if (strstr(type, "char[") != 0)
+       if (strstr(type, "char[") != NULL)
                return "%s";
 
        /* Unknown, likely struct, allowed treat as 64-bit */
@@ -479,10 +667,52 @@ static bool user_field_is_dyn_string(const char *type, const char **str_func)
 
        return false;
 check:
-       return strstr(type, "char") != 0;
+       return strstr(type, "char") != NULL;
 }
 
 #define LEN_OR_ZERO (len ? len - pos : 0)
+static int user_dyn_field_set_string(int argc, const char **argv, int *iout,
+                                    char *buf, int len, bool *colon)
+{
+       int pos = 0, i = *iout;
+
+       *colon = false;
+
+       for (; i < argc; ++i) {
+               if (i != *iout)
+                       pos += snprintf(buf + pos, LEN_OR_ZERO, " ");
+
+               pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", argv[i]);
+
+               if (strchr(argv[i], ';')) {
+                       ++i;
+                       *colon = true;
+                       break;
+               }
+       }
+
+       /* An actual set (len != 0): advance the caller's index */
+       if (len != 0)
+               *iout = i;
+
+       return pos + 1;
+}
+
+static int user_field_set_string(struct ftrace_event_field *field,
+                                char *buf, int len, bool colon)
+{
+       int pos = 0;
+
+       pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", field->type);
+       pos += snprintf(buf + pos, LEN_OR_ZERO, " ");
+       pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", field->name);
+
+       if (colon)
+               pos += snprintf(buf + pos, LEN_OR_ZERO, ";");
+
+       return pos + 1;
+}
+
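
Both helpers use the two-pass snprintf idiom that LEN_OR_ZERO enables: call once with len == 0 so snprintf only reports the would-be length, allocate, then call again to fill. That is also why user_field_match() below can compare the two measured lengths and bail out before allocating anything. A freestanding sketch of the idiom:

    #include <stdio.h>
    #include <stdlib.h>

    static int format_field(char *buf, int len, const char *type,
                            const char *name)
    {
            int pos = 0;

            pos += snprintf(buf + pos, len ? len - pos : 0, "%s %s;",
                            type, name);
            return pos + 1;             /* include the terminating NUL */
    }

    int main(void)
    {
            int len = format_field(NULL, 0, "char[20]", "msg");  /* measure */
            char *buf = malloc(len);

            if (!buf)
                    return 1;
            format_field(buf, len, "char[20]", "msg");           /* fill */
            puts(buf);                  /* char[20] msg; */
            free(buf);
            return 0;
    }
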
 static int user_event_set_print_fmt(struct user_event *user, char *buf, int len)
 {
        struct ftrace_event_field *field, *next;
@@ -600,8 +830,8 @@ static int destroy_user_event(struct user_event *user)
 
        dyn_event_remove(&user->devent);
 
-       register_page_data[user->index] = 0;
-       clear_bit(user->index, page_bitmap);
+       user_event_register_clear(user);
+       clear_bit(user->index, user->group->page_bitmap);
        hash_del(&user->node);
 
        user_event_destroy_validators(user);
@@ -612,16 +842,17 @@ static int destroy_user_event(struct user_event *user)
        return ret;
 }
 
-static struct user_event *find_user_event(char *name, u32 *outkey)
+static struct user_event *find_user_event(struct user_event_group *group,
+                                         char *name, u32 *outkey)
 {
        struct user_event *user;
        u32 key = user_event_key(name);
 
        *outkey = key;
 
-       hash_for_each_possible(register_table, user, node, key)
+       hash_for_each_possible(group->register_table, user, node, key)
                if (!strcmp(EVENT_NAME(user), name)) {
-                       atomic_inc(&user->refcnt);
+                       refcount_inc(&user->refcnt);
                        return user;
                }
 
@@ -779,7 +1010,12 @@ static void update_reg_page_for(struct user_event *user)
                rcu_read_unlock_sched();
        }
 
-       register_page_data[user->index] = status;
+       if (status)
+               user_event_register_set(user);
+       else
+               user_event_register_clear(user);
+
+       user->status = status;
 }
 
 /*
@@ -835,17 +1071,18 @@ static int user_event_reg(struct trace_event_call *call,
 
        return ret;
 inc:
-       atomic_inc(&user->refcnt);
+       refcount_inc(&user->refcnt);
        update_reg_page_for(user);
        return 0;
 dec:
        update_reg_page_for(user);
-       atomic_dec(&user->refcnt);
+       refcount_dec(&user->refcnt);
        return 0;
 }
 
 static int user_event_create(const char *raw_command)
 {
+       struct user_event_group *group;
        struct user_event *user;
        char *name;
        int ret;
@@ -861,14 +1098,19 @@ static int user_event_create(const char *raw_command)
        if (!name)
                return -ENOMEM;
 
-       mutex_lock(&reg_mutex);
+       group = current_user_event_group();
 
-       ret = user_event_parse_cmd(name, &user);
+       if (!group)
+               return -ENOENT;
+
+       mutex_lock(&group->reg_mutex);
+
+       ret = user_event_parse_cmd(group, name, &user);
 
        if (!ret)
-               atomic_dec(&user->refcnt);
+               refcount_dec(&user->refcnt);
 
-       mutex_unlock(&reg_mutex);
+       mutex_unlock(&group->reg_mutex);
 
        if (ret)
                kfree(name);
@@ -910,14 +1152,14 @@ static bool user_event_is_busy(struct dyn_event *ev)
 {
        struct user_event *user = container_of(ev, struct user_event, devent);
 
-       return atomic_read(&user->refcnt) != 0;
+       return !user_event_last_ref(user);
 }
 
 static int user_event_free(struct dyn_event *ev)
 {
        struct user_event *user = container_of(ev, struct user_event, devent);
 
-       if (atomic_read(&user->refcnt) != 0)
+       if (!user_event_last_ref(user))
                return -EBUSY;
 
        return destroy_user_event(user);
@@ -926,49 +1168,35 @@ static int user_event_free(struct dyn_event *ev)
 static bool user_field_match(struct ftrace_event_field *field, int argc,
                             const char **argv, int *iout)
 {
-       char *field_name, *arg_name;
-       int len, pos, i = *iout;
+       char *field_name = NULL, *dyn_field_name = NULL;
        bool colon = false, match = false;
+       int dyn_len, len;
 
-       if (i >= argc)
+       if (*iout >= argc)
                return false;
 
-       len = MAX_FIELD_ARG_NAME;
-       field_name = kmalloc(len, GFP_KERNEL);
-       arg_name = kmalloc(len, GFP_KERNEL);
-
-       if (!arg_name || !field_name)
-               goto out;
-
-       pos = 0;
-
-       for (; i < argc; ++i) {
-               if (i != *iout)
-                       pos += snprintf(arg_name + pos, len - pos, " ");
+       dyn_len = user_dyn_field_set_string(argc, argv, iout, dyn_field_name,
+                                           0, &colon);
 
-               pos += snprintf(arg_name + pos, len - pos, argv[i]);
+       len = user_field_set_string(field, field_name, 0, colon);
 
-               if (strchr(argv[i], ';')) {
-                       ++i;
-                       colon = true;
-                       break;
-               }
-       }
+       if (dyn_len != len)
+               return false;
 
-       pos = 0;
+       dyn_field_name = kmalloc(dyn_len, GFP_KERNEL);
+       field_name = kmalloc(len, GFP_KERNEL);
 
-       pos += snprintf(field_name + pos, len - pos, field->type);
-       pos += snprintf(field_name + pos, len - pos, " ");
-       pos += snprintf(field_name + pos, len - pos, field->name);
+       if (!dyn_field_name || !field_name)
+               goto out;
 
-       if (colon)
-               pos += snprintf(field_name + pos, len - pos, ";");
+       user_dyn_field_set_string(argc, argv, iout, dyn_field_name,
+                                 dyn_len, &colon);
 
-       *iout = i;
+       user_field_set_string(field, field_name, len, colon);
 
-       match = strcmp(arg_name, field_name) == 0;
+       match = strcmp(dyn_field_name, field_name) == 0;
 out:
-       kfree(arg_name);
+       kfree(dyn_field_name);
        kfree(field_name);
 
        return match;
@@ -1036,7 +1264,8 @@ static int user_event_trace_register(struct user_event *user)
  * The name buffer lifetime is owned by this method for success cases only.
  * Upon success the returned user_event has its ref count increased by 1.
  */
-static int user_event_parse(char *name, char *args, char *flags,
+static int user_event_parse(struct user_event_group *group, char *name,
+                           char *args, char *flags,
                            struct user_event **newuser)
 {
        int ret;
@@ -1046,7 +1275,7 @@ static int user_event_parse(char *name, char *args, char *flags,
 
        /* Prevent dyn_event from racing */
        mutex_lock(&event_mutex);
-       user = find_user_event(name, &key);
+       user = find_user_event(group, name, &key);
        mutex_unlock(&event_mutex);
 
        if (user) {
@@ -1059,7 +1288,7 @@ static int user_event_parse(char *name, char *args, char *flags,
                return 0;
        }
 
-       index = find_first_zero_bit(page_bitmap, MAX_EVENTS);
+       index = find_first_zero_bit(group->page_bitmap, MAX_EVENTS);
 
        if (index == MAX_EVENTS)
                return -EMFILE;
@@ -1073,6 +1302,7 @@ static int user_event_parse(char *name, char *args, char *flags,
        INIT_LIST_HEAD(&user->fields);
        INIT_LIST_HEAD(&user->validators);
 
+       user->group = group;
        user->tracepoint.name = name;
 
        ret = user_event_parse_fields(user, args);
@@ -1091,8 +1321,8 @@ static int user_event_parse(char *name, char *args, char *flags,
        user->call.flags = TRACE_EVENT_FL_TRACEPOINT;
        user->call.tp = &user->tracepoint;
        user->call.event.funcs = &user_event_funcs;
+       user->class.system = group->system_name;
 
-       user->class.system = USER_EVENTS_SYSTEM;
        user->class.fields_array = user_event_fields_array;
        user->class.get_fields = user_event_get_fields;
        user->class.reg = user_event_reg;
@@ -1110,13 +1340,13 @@ static int user_event_parse(char *name, char *args, char *flags,
 
        user->index = index;
 
-       /* Ensure we track ref */
-       atomic_inc(&user->refcnt);
+       /* Ensure we track self ref and caller ref (2) */
+       refcount_set(&user->refcnt, 2);
 
        dyn_event_init(&user->devent, &user_event_dops);
        dyn_event_add(&user->devent, &user->call);
-       set_bit(user->index, page_bitmap);
-       hash_add(register_table, &user->node, key);
+       set_bit(user->index, group->page_bitmap);
+       hash_add(group->register_table, &user->node, key);
 
        mutex_unlock(&event_mutex);
 
@@ -1134,32 +1364,20 @@ put_user:
 /*
  * Deletes a previously created event if it is no longer being used.
  */
-static int delete_user_event(char *name)
+static int delete_user_event(struct user_event_group *group, char *name)
 {
        u32 key;
-       int ret;
-       struct user_event *user = find_user_event(name, &key);
+       struct user_event *user = find_user_event(group, name, &key);
 
        if (!user)
                return -ENOENT;
 
-       /* Ensure we are the last ref */
-       if (atomic_read(&user->refcnt) != 1) {
-               ret = -EBUSY;
-               goto put_ref;
-       }
-
-       ret = destroy_user_event(user);
-
-       if (ret)
-               goto put_ref;
+       refcount_dec(&user->refcnt);
 
-       return ret;
-put_ref:
-       /* No longer have this ref */
-       atomic_dec(&user->refcnt);
+       if (!user_event_last_ref(user))
+               return -EBUSY;
 
-       return ret;
+       return destroy_user_event(user);
 }
 
 /*
@@ -1167,6 +1385,7 @@ put_ref:
  */
 static ssize_t user_events_write_core(struct file *file, struct iov_iter *i)
 {
+       struct user_event_file_info *info = file->private_data;
        struct user_event_refs *refs;
        struct user_event *user = NULL;
        struct tracepoint *tp;
@@ -1178,7 +1397,7 @@ static ssize_t user_events_write_core(struct file *file, struct iov_iter *i)
 
        rcu_read_lock_sched();
 
-       refs = rcu_dereference_sched(file->private_data);
+       refs = rcu_dereference_sched(info->refs);
 
        /*
         * The refs->events array is protected by RCU, and new items may be
@@ -1236,6 +1455,28 @@ static ssize_t user_events_write_core(struct file *file, struct iov_iter *i)
        return ret;
 }
 
+static int user_events_open(struct inode *node, struct file *file)
+{
+       struct user_event_group *group;
+       struct user_event_file_info *info;
+
+       group = current_user_event_group();
+
+       if (!group)
+               return -ENOENT;
+
+       info = kzalloc(sizeof(*info), GFP_KERNEL);
+
+       if (!info)
+               return -ENOMEM;
+
+       info->group = group;
+
+       file->private_data = info;
+
+       return 0;
+}
+
 static ssize_t user_events_write(struct file *file, const char __user *ubuf,
                                 size_t count, loff_t *ppos)
 {
@@ -1245,7 +1486,8 @@ static ssize_t user_events_write(struct file *file, const char __user *ubuf,
        if (unlikely(*ppos != 0))
                return -EFAULT;
 
-       if (unlikely(import_single_range(READ, (char *)ubuf, count, &iov, &i)))
+       if (unlikely(import_single_range(WRITE, (char __user *)ubuf,
+                                        count, &iov, &i)))
                return -EFAULT;
 
        return user_events_write_core(file, &i);
@@ -1256,13 +1498,15 @@ static ssize_t user_events_write_iter(struct kiocb *kp, struct iov_iter *i)
        return user_events_write_core(kp->ki_filp, i);
 }
 
-static int user_events_ref_add(struct file *file, struct user_event *user)
+static int user_events_ref_add(struct user_event_file_info *info,
+                              struct user_event *user)
 {
+       struct user_event_group *group = info->group;
        struct user_event_refs *refs, *new_refs;
        int i, size, count = 0;
 
-       refs = rcu_dereference_protected(file->private_data,
-                                        lockdep_is_held(&reg_mutex));
+       refs = rcu_dereference_protected(info->refs,
+                                        lockdep_is_held(&group->reg_mutex));
 
        if (refs) {
                count = refs->count;
@@ -1286,9 +1530,9 @@ static int user_events_ref_add(struct file *file, struct user_event *user)
 
        new_refs->events[i] = user;
 
-       atomic_inc(&user->refcnt);
+       refcount_inc(&user->refcnt);
 
-       rcu_assign_pointer(file->private_data, new_refs);
+       rcu_assign_pointer(info->refs, new_refs);
 
        if (refs)
                kfree_rcu(refs, rcu);
@@ -1309,13 +1553,24 @@ static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg)
        if (size > PAGE_SIZE)
                return -E2BIG;
 
-       return copy_struct_from_user(kreg, sizeof(*kreg), ureg, size);
+       if (size < offsetofend(struct user_reg, write_index))
+               return -EINVAL;
+
+       ret = copy_struct_from_user(kreg, sizeof(*kreg), ureg, size);
+
+       if (ret)
+               return ret;
+
+       kreg->size = size;
+
+       return 0;
 }
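
user_reg_get() follows the usual size-versioned uapi pattern: the struct's first member is the size userspace believes it has, the kernel rejects anything smaller than the fields it requires (here, everything through write_index) or larger than a page, and copy_struct_from_user() does the rest, zero-filling a short struct and failing with -E2BIG if a longer one carries non-zero unknown tail bytes. A sketch of the pattern with a hypothetical struct:

    struct my_reg {
            __u32 size;         /* in: sizeof(struct) as userspace sees it */
            __u32 flags;        /* in */
            __u32 write_index;  /* out */
    };

    static long my_reg_get(struct my_reg __user *ureg, struct my_reg *kreg)
    {
            u32 size;

            if (get_user(size, &ureg->size))
                    return -EFAULT;
            if (size > PAGE_SIZE)
                    return -E2BIG;
            if (size < offsetofend(struct my_reg, flags))
                    return -EINVAL;   /* required fields missing */

            /* zero-fills a short struct; rejects non-zero unknown tail */
            return copy_struct_from_user(kreg, sizeof(*kreg), ureg, size);
    }
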
 
 /*
  * Registers a user_event on behalf of a user process.
  */
-static long user_events_ioctl_reg(struct file *file, unsigned long uarg)
+static long user_events_ioctl_reg(struct user_event_file_info *info,
+                                 unsigned long uarg)
 {
        struct user_reg __user *ureg = (struct user_reg __user *)uarg;
        struct user_reg reg;
@@ -1336,24 +1591,24 @@ static long user_events_ioctl_reg(struct file *file, unsigned long uarg)
                return ret;
        }
 
-       ret = user_event_parse_cmd(name, &user);
+       ret = user_event_parse_cmd(info->group, name, &user);
 
        if (ret) {
                kfree(name);
                return ret;
        }
 
-       ret = user_events_ref_add(file, user);
+       ret = user_events_ref_add(info, user);
 
        /* No longer need parse ref, ref_add either worked or not */
-       atomic_dec(&user->refcnt);
+       refcount_dec(&user->refcnt);
 
        /* Positive number is index and valid */
        if (ret < 0)
                return ret;
 
        put_user((u32)ret, &ureg->write_index);
-       put_user(user->index, &ureg->status_index);
+       put_user(user->index, &ureg->status_bit);
 
        return 0;
 }
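
The matching userspace call, roughly (struct user_reg fields per the ABI documentation updated in this series; paths and error handling trimmed to a sketch):

    #include <err.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/user_events.h>

    int main(void)
    {
            struct user_reg reg = {0};
            int fd = open("/sys/kernel/tracing/user_events_data", O_RDWR);

            if (fd == -1)
                    err(1, "open");

            reg.size = sizeof(reg);                   /* size-versioned */
            reg.name_args = (__u64)"test u32 count";  /* name + fields  */

            if (ioctl(fd, DIAG_IOCSREG, &reg) == -1)
                    err(1, "DIAG_IOCSREG");

            /* reg.write_index prefixes writes to this fd;
             * reg.status_bit is the bit to test in the status page. */
            printf("write_index=%u status_bit=%u\n",
                   reg.write_index, reg.status_bit);
            return 0;
    }
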
@@ -1361,7 +1616,8 @@ static long user_events_ioctl_reg(struct file *file, unsigned long uarg)
 /*
  * Deletes a user_event on behalf of a user process.
  */
-static long user_events_ioctl_del(struct file *file, unsigned long uarg)
+static long user_events_ioctl_del(struct user_event_file_info *info,
+                                 unsigned long uarg)
 {
        void __user *ubuf = (void __user *)uarg;
        char *name;
@@ -1374,7 +1630,7 @@ static long user_events_ioctl_del(struct file *file, unsigned long uarg)
 
        /* event_mutex prevents dyn_event from racing */
        mutex_lock(&event_mutex);
-       ret = delete_user_event(name);
+       ret = delete_user_event(info->group, name);
        mutex_unlock(&event_mutex);
 
        kfree(name);
@@ -1388,19 +1644,21 @@ static long user_events_ioctl_del(struct file *file, unsigned long uarg)
 static long user_events_ioctl(struct file *file, unsigned int cmd,
                              unsigned long uarg)
 {
+       struct user_event_file_info *info = file->private_data;
+       struct user_event_group *group = info->group;
        long ret = -ENOTTY;
 
        switch (cmd) {
        case DIAG_IOCSREG:
-               mutex_lock(&reg_mutex);
-               ret = user_events_ioctl_reg(file, uarg);
-               mutex_unlock(&reg_mutex);
+               mutex_lock(&group->reg_mutex);
+               ret = user_events_ioctl_reg(info, uarg);
+               mutex_unlock(&group->reg_mutex);
                break;
 
        case DIAG_IOCSDEL:
-               mutex_lock(&reg_mutex);
-               ret = user_events_ioctl_del(file, uarg);
-               mutex_unlock(&reg_mutex);
+               mutex_lock(&group->reg_mutex);
+               ret = user_events_ioctl_del(info, uarg);
+               mutex_unlock(&group->reg_mutex);
                break;
        }
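
Every path above now serializes on a per-group reg_mutex rather than the old global one, the groundwork for namespaced user_events: each group can register and delete its own events without contending with other groups. Reconstructed from the members this diff touches, the group looks roughly like the sketch below (the real struct carries more state, and the hashtable size is a guess):

	#include <linux/hashtable.h>
	#include <linux/mutex.h>

	/* Partial sketch only; fields inferred from this diff. */
	struct user_event_group {
		char		*register_page_data;	/* status bits mmap'd read-only */
		struct mutex	reg_mutex;		/* serializes register/delete */
		DECLARE_HASHTABLE(register_table, 8);	/* events by name (size assumed) */
	};
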
 
@@ -1412,17 +1670,24 @@ static long user_events_ioctl(struct file *file, unsigned int cmd,
  */
 static int user_events_release(struct inode *node, struct file *file)
 {
+       struct user_event_file_info *info = file->private_data;
+       struct user_event_group *group;
        struct user_event_refs *refs;
        struct user_event *user;
        int i;
 
+       if (!info)
+               return -EINVAL;
+
+       group = info->group;
+
        /*
         * Ensure refs cannot change under any situation by taking the
         * register mutex during the final freeing of the references.
         */
-       mutex_lock(&reg_mutex);
+       mutex_lock(&group->reg_mutex);
 
-       refs = file->private_data;
+       refs = info->refs;
 
        if (!refs)
                goto out;
@@ -1436,37 +1701,56 @@ static int user_events_release(struct inode *node, struct file *file)
                user = refs->events[i];
 
                if (user)
-                       atomic_dec(&user->refcnt);
+                       refcount_dec(&user->refcnt);
        }
 out:
        file->private_data = NULL;
 
-       mutex_unlock(&reg_mutex);
+       mutex_unlock(&group->reg_mutex);
 
        kfree(refs);
+       kfree(info);
 
        return 0;
 }
 
 static const struct file_operations user_data_fops = {
+       .open = user_events_open,
        .write = user_events_write,
        .write_iter = user_events_write_iter,
        .unlocked_ioctl = user_events_ioctl,
        .release = user_events_release,
 };
 
+static struct user_event_group *user_status_group(struct file *file)
+{
+       struct seq_file *m = file->private_data;
+
+       if (!m)
+               return NULL;
+
+       return m->private;
+}
+
 /*
  * Maps the shared page into the user process for checking if event is enabled.
  */
 static int user_status_mmap(struct file *file, struct vm_area_struct *vma)
 {
+       char *pages;
+       struct user_event_group *group = user_status_group(file);
        unsigned long size = vma->vm_end - vma->vm_start;
 
-       if (size != MAX_EVENTS)
+       if (size != MAX_BYTES)
+               return -EINVAL;
+
+       if (!group)
                return -EINVAL;
 
+       pages = group->register_page_data;
+
        return remap_pfn_range(vma, vma->vm_start,
-                              virt_to_phys(register_page_data) >> PAGE_SHIFT,
+                              virt_to_phys(pages) >> PAGE_SHIFT,
                               size, vm_get_page_prot(VM_READ));
 }
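
Userspace consumes this mapping with a plain mmap() of user_events_status; since the kernel installs the PTEs via remap_pfn_range() with vm_get_page_prot(VM_READ), the page is read-only to the process. A minimal sketch matching how the selftests below map it:

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(void)
	{
		long page_size = sysconf(_SC_PAGESIZE);
		int fd = open("/sys/kernel/debug/tracing/user_events_status",
			      O_RDONLY);
		char *status;

		if (fd == -1)
			return 1;

		/* Shared, read-only: the kernel flips bits as events enable. */
		status = mmap(NULL, page_size, PROT_READ, MAP_SHARED, fd, 0);

		if (status == MAP_FAILED)
			return 1;

		printf("status page mapped at %p\n", (void *)status);
		return 0;
	}
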
 
@@ -1490,14 +1774,18 @@ static void user_seq_stop(struct seq_file *m, void *p)
 
 static int user_seq_show(struct seq_file *m, void *p)
 {
+       struct user_event_group *group = m->private;
        struct user_event *user;
        char status;
        int i, active = 0, busy = 0, flags;
 
-       mutex_lock(&reg_mutex);
+       if (!group)
+               return -EINVAL;
+
+       mutex_lock(&group->reg_mutex);
 
-       hash_for_each(register_table, i, user, node) {
-               status = register_page_data[user->index];
+       hash_for_each(group->register_table, i, user, node) {
+               status = user->status;
                flags = user->flags;
 
                seq_printf(m, "%d:%s", user->index, EVENT_NAME(user));
@@ -1520,7 +1808,7 @@ static int user_seq_show(struct seq_file *m, void *p)
                active++;
        }
 
-       mutex_unlock(&reg_mutex);
+       mutex_unlock(&group->reg_mutex);
 
        seq_puts(m, "\n");
        seq_printf(m, "Active: %d\n", active);
@@ -1539,7 +1827,24 @@ static const struct seq_operations user_seq_ops = {
 
 static int user_status_open(struct inode *node, struct file *file)
 {
-       return seq_open(file, &user_seq_ops);
+       struct user_event_group *group;
+       int ret;
+
+       group = current_user_event_group();
+
+       if (!group)
+               return -ENOENT;
+
+       ret = seq_open(file, &user_seq_ops);
+
+       if (!ret) {
+               /* Chain group to seq_file */
+               struct seq_file *m = file->private_data;
+
+               m->private = group;
+       }
+
+       return ret;
 }
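
user_status_open() chains the group into m->private by hand because this file needs full seq_ops iteration. For a virtual file with a single show routine, single_open() performs the same chaining in one call; a sketch for comparison (my_show/my_group_lookup are hypothetical):

	#include <linux/seq_file.h>

	static struct user_event_group *my_group_lookup(void);	/* hypothetical */

	static int my_show(struct seq_file *m, void *p)
	{
		struct user_event_group *group = m->private;	/* set by single_open() */

		seq_printf(m, "group: %p\n", group);
		return 0;
	}

	static int my_open(struct inode *inode, struct file *file)
	{
		struct user_event_group *group = my_group_lookup();

		if (!group)
			return -ENOENT;

		return single_open(file, my_show, group);
	}
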
 
 static const struct file_operations user_status_fops = {
@@ -1580,42 +1885,21 @@ err:
        return -ENODEV;
 }
 
-static void set_page_reservations(bool set)
-{
-       int page;
-
-       for (page = 0; page < MAX_PAGES; ++page) {
-               void *addr = register_page_data + (PAGE_SIZE * page);
-
-               if (set)
-                       SetPageReserved(virt_to_page(addr));
-               else
-                       ClearPageReserved(virt_to_page(addr));
-       }
-}
-
 static int __init trace_events_user_init(void)
 {
-       struct page *pages;
        int ret;
 
-       /* Zero all bits beside 0 (which is reserved for failures) */
-       bitmap_zero(page_bitmap, MAX_EVENTS);
-       set_bit(0, page_bitmap);
+       init_group = user_event_group_create(&init_user_ns);
 
-       pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, MAX_PAGE_ORDER);
-       if (!pages)
+       if (!init_group)
                return -ENOMEM;
-       register_page_data = page_address(pages);
-
-       set_page_reservations(true);
 
        ret = create_user_tracefs();
 
        if (ret) {
                pr_warn("user_events could not register with tracefs\n");
-               set_page_reservations(false);
-               __free_pages(pages, MAX_PAGE_ORDER);
+               user_event_group_destroy(init_group);
+               init_group = NULL;
                return ret;
        }
 
index 3134399..78d536d 100644
@@ -1786,8 +1786,9 @@ static int start_per_cpu_kthreads(void)
        for_each_cpu(cpu, current_mask) {
                retval = start_kthread(cpu);
                if (retval) {
+                       cpus_read_unlock();
                        stop_per_cpu_kthreads();
-                       break;
+                       return retval;
                }
        }
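
The osnoise fix above repairs two problems at once: the old break leaked the CPU-hotplug read lock on the error path, and stop_per_cpu_kthreads() takes that lock itself, so calling it while still holding the lock risked recursive locking. The corrected shape, as a generic sketch (start_one()/stop_all_kthreads() are hypothetical stand-ins):

	#include <linux/cpu.h>

	static int start_one(int cpu);		/* hypothetical per-CPU start */
	static void stop_all_kthreads(void);	/* takes cpus_read_lock() itself */

	static int start_all(void)
	{
		int cpu, ret;

		cpus_read_lock();			/* hold off CPU hotplug */
		for_each_online_cpu(cpu) {
			ret = start_one(cpu);
			if (ret) {
				cpus_read_unlock();	/* drop before teardown... */
				stop_all_kthreads();	/* ...which relocks it */
				return ret;
			}
		}
		cpus_read_unlock();
		return 0;
	}
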
 
index 3b3869a..de38f1c 100644
@@ -445,7 +445,8 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
        C(SAME_PROBE,           "There is already the exact same probe event"),\
        C(NO_EVENT_INFO,        "This requires both group and event name to attach"),\
        C(BAD_ATTACH_EVENT,     "Attached event does not exist"),\
-       C(BAD_ATTACH_ARG,       "Attached event does not have this field"),
+       C(BAD_ATTACH_ARG,       "Attached event does not have this field"),\
+       C(NO_EP_FILTER,         "No filter rule after 'if'"),
 
 #undef C
 #define C(a, b)                TP_ERR_##a
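
The error table above is an X-macro: each C(name, message) row expands twice, once into an enum constant and once into a message string, so NO_EP_FILTER only has to be added in one place. A reduced sketch of the expansion (err_text is a hypothetical array name):

	/* One row per error; the trailing backslashes join the list. */
	#define ERRORS							\
		C(BAD_ATTACH_EVENT, "Attached event does not exist"),	\
		C(NO_EP_FILTER,     "No filter rule after 'if'"),

	#undef C
	#define C(a, b)	TP_ERR_##a
	enum { ERRORS };	/* TP_ERR_BAD_ATTACH_EVENT, TP_ERR_NO_EP_FILTER */

	#undef C
	#define C(a, b)	b
	static const char *err_text[] = { ERRORS };	/* matching messages */
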
index 9901708..c774e56 100644
@@ -961,7 +961,7 @@ create_sort_entry(void *key, struct tracing_map_elt *elt)
 static void detect_dups(struct tracing_map_sort_entry **sort_entries,
                      int n_entries, unsigned int key_size)
 {
-       unsigned int dups = 0, total_dups = 0;
+       unsigned int total_dups = 0;
        int i;
        void *key;
 
@@ -974,11 +974,10 @@ static void detect_dups(struct tracing_map_sort_entry **sort_entries,
        key = sort_entries[0]->key;
        for (i = 1; i < n_entries; i++) {
                if (!memcmp(sort_entries[i]->key, key, key_size)) {
-                       dups++; total_dups++;
+                       total_dups++;
                        continue;
                }
                key = sort_entries[i]->key;
-               dups = 0;
        }
 
        WARN_ONCE(total_dups > 0,
index ef42c1a..f23144a 100644
@@ -640,7 +640,6 @@ static void tp_module_going_check_quiescent(struct tracepoint *tp, void *priv)
 static int tracepoint_module_coming(struct module *mod)
 {
        struct tp_module *tp_mod;
-       int ret = 0;
 
        if (!mod->num_tracepoints)
                return 0;
@@ -652,19 +651,18 @@ static int tracepoint_module_coming(struct module *mod)
         */
        if (trace_module_has_bad_taint(mod))
                return 0;
-       mutex_lock(&tracepoint_module_list_mutex);
+
        tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL);
-       if (!tp_mod) {
-               ret = -ENOMEM;
-               goto end;
-       }
+       if (!tp_mod)
+               return -ENOMEM;
        tp_mod->mod = mod;
+
+       mutex_lock(&tracepoint_module_list_mutex);
        list_add_tail(&tp_mod->list, &tracepoint_module_list);
        blocking_notifier_call_chain(&tracepoint_notify_list,
                        MODULE_STATE_COMING, tp_mod);
-end:
        mutex_unlock(&tracepoint_module_list_mutex);
-       return ret;
+       return 0;
 }
 
 static void tracepoint_module_going(struct module *mod)
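
The tracepoint_module_coming() cleanup is the standard allocate-outside-the-lock refactor: a GFP_KERNEL allocation may sleep, so doing it before mutex_lock() shortens the critical section and lets the failure path return directly with nothing to unwind, removing the goto. The shape, generically:

	#include <linux/list.h>
	#include <linux/mutex.h>
	#include <linux/slab.h>

	struct item {
		struct list_head list;
	};

	static int add_item(struct list_head *head, struct mutex *lock)
	{
		struct item *it = kmalloc(sizeof(*it), GFP_KERNEL);

		if (!it)
			return -ENOMEM;	/* no lock held, nothing to undo */

		mutex_lock(lock);
		list_add_tail(&it->list, head);
		mutex_unlock(lock);
		return 0;
	}
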
index 4f5778e..d06dc24 100644
 #include <fcntl.h>
 #include <stdio.h>
 #include <unistd.h>
+#include <asm/bitsperlong.h>
+#include <endian.h>
 #include <linux/user_events.h>
 
+#if __BITS_PER_LONG == 64
+#define endian_swap(x) htole64(x)
+#else
+#define endian_swap(x) htole32(x)
+#endif
+
 /* Assumes debugfs is mounted */
 const char *data_file = "/sys/kernel/debug/tracing/user_events_data";
 const char *status_file = "/sys/kernel/debug/tracing/user_events_status";
 
-static int event_status(char **status)
+static int event_status(long **status)
 {
        int fd = open(status_file, O_RDONLY);
 
@@ -33,7 +41,8 @@ static int event_status(char **status)
        return 0;
 }
 
-static int event_reg(int fd, const char *command, int *status, int *write)
+static int event_reg(int fd, const char *command, long *index, long *mask,
+                    int *write)
 {
        struct user_reg reg = {0};
 
@@ -43,7 +52,8 @@ static int event_reg(int fd, const char *command, int *status, int *write)
        if (ioctl(fd, DIAG_IOCSREG, &reg) == -1)
                return -1;
 
-       *status = reg.status_index;
+       *index = reg.status_bit / __BITS_PER_LONG;
+       *mask = endian_swap(1L << (reg.status_bit % __BITS_PER_LONG));
        *write = reg.write_index;
 
        return 0;
@@ -51,8 +61,9 @@ static int event_reg(int fd, const char *command, int *status, int *write)
 
 int main(int argc, char **argv)
 {
-       int data_fd, status, write;
-       char *status_page;
+       int data_fd, write;
+       long index, mask;
+       long *status_page;
        struct iovec io[2];
        __u32 count = 0;
 
@@ -61,7 +72,7 @@ int main(int argc, char **argv)
 
        data_fd = open(data_file, O_RDWR);
 
-       if (event_reg(data_fd, "test u32 count", &status, &write) == -1)
+       if (event_reg(data_fd, "test u32 count", &index, &mask, &write) == -1)
                return errno;
 
        /* Setup iovec */
@@ -75,7 +86,7 @@ ask:
        getchar();
 
        /* Check if anyone is listening */
-       if (status_page[status]) {
+       if (status_page[index] & mask) {
                /* Yep, trace out our data */
                writev(data_fd, (const struct iovec *)io, 2);
 
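
With the sample reworked to read the status page as an array of longs, a returned status_bit of 67 on a 64-bit build gives index = 67 / 64 = 1 and mask = 1L << (67 % 64) = 1L << 3. The selftests below check the identical bit byte-wise: 67 >> 3 = byte 8, 67 & 7 = bit 3, which lands on the same memory on a little-endian host (the sample's endian_swap() exists for the big-endian case). A standalone check of that equivalence:

	#include <stdio.h>

	/* Both views of status_bit 67; assumes a 64-bit little-endian host. */
	int main(void)
	{
		unsigned long words[2] = { 0 };
		unsigned char *page = (unsigned char *)words;
		int bit = 67;

		words[bit / 64] |= 1UL << (bit % 64);	/* long view: word 1, bit 3 */

		/* Byte view, as the selftests' status_check() does it. */
		printf("byte %d, bit %d -> %s\n", bit >> 3, bit & 7,
		       (page[bit >> 3] & (1 << (bit & 7))) ? "set" : "clear");
		return 0;
	}
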
diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/eprobes_syntax_errors.tc b/tools/testing/selftests/ftrace/test.d/dynevent/eprobes_syntax_errors.tc
new file mode 100644
index 0000000..fc1daac
--- /dev/null
@@ -0,0 +1,27 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Event probe event parser error log check
+# requires: dynamic_events events/syscalls/sys_enter_openat "<attached-group>.<attached-event> [<args>]":README error_log
+
+check_error() { # command-with-error-pos-by-^
+    ftrace_errlog_check 'event_probe' "$1" 'dynamic_events'
+}
+
+check_error 'e ^a.'                    # NO_EVENT_INFO
+check_error 'e ^.b'                    # NO_EVENT_INFO
+check_error 'e ^a.b'                   # BAD_ATTACH_EVENT
+check_error 'e syscalls/sys_enter_openat ^foo' # BAD_ATTACH_ARG
+check_error 'e:^/bar syscalls/sys_enter_openat'        # NO_GROUP_NAME
+check_error 'e:^12345678901234567890123456789012345678901234567890123456789012345/bar syscalls/sys_enter_openat'       # GROUP_TOO_LONG
+
+check_error 'e:^foo.1/bar syscalls/sys_enter_openat'   # BAD_GROUP_NAME
+check_error 'e:^ syscalls/sys_enter_openat'            # NO_EVENT_NAME
+check_error 'e:foo/^12345678901234567890123456789012345678901234567890123456789012345 syscalls/sys_enter_openat'       # EVENT_TOO_LONG
+check_error 'e:foo/^bar.1 syscalls/sys_enter_openat'   # BAD_EVENT_NAME
+
+check_error 'e:foo/bar syscalls/sys_enter_openat arg=^dfd'     # BAD_FETCH_ARG
+check_error 'e:foo/bar syscalls/sys_enter_openat ^arg=$foo'    # BAD_ATTACH_ARG
+
+check_error 'e:foo/bar syscalls/sys_enter_openat if ^' # NO_EP_FILTER
+
+exit 0
index a80fb5e..404a271 100644
@@ -22,6 +22,11 @@ const char *enable_file = "/sys/kernel/debug/tracing/events/user_events/__test_e
 const char *trace_file = "/sys/kernel/debug/tracing/trace";
 const char *fmt_file = "/sys/kernel/debug/tracing/events/user_events/__test_event/format";
 
+static inline int status_check(char *status_page, int status_bit)
+{
+       return status_page[status_bit >> 3] & (1 << (status_bit & 7));
+}
+
 static int trace_bytes(void)
 {
        int fd = open(trace_file, O_RDONLY);
@@ -197,12 +202,12 @@ TEST_F(user, register_events) {
        /* Register should work */
        ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, &reg));
        ASSERT_EQ(0, reg.write_index);
-       ASSERT_NE(0, reg.status_index);
+       ASSERT_NE(0, reg.status_bit);
 
        /* Multiple registers should result in same index */
        ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, &reg));
        ASSERT_EQ(0, reg.write_index);
-       ASSERT_NE(0, reg.status_index);
+       ASSERT_NE(0, reg.status_bit);
 
        /* Ensure disabled */
        self->enable_fd = open(enable_file, O_RDWR);
@@ -212,15 +217,15 @@ TEST_F(user, register_events) {
        /* MMAP should work and be zero'd */
        ASSERT_NE(MAP_FAILED, status_page);
        ASSERT_NE(NULL, status_page);
-       ASSERT_EQ(0, status_page[reg.status_index]);
+       ASSERT_EQ(0, status_check(status_page, reg.status_bit));
 
        /* Enable event and ensure bits updated in status */
        ASSERT_NE(-1, write(self->enable_fd, "1", sizeof("1")))
-       ASSERT_EQ(EVENT_STATUS_FTRACE, status_page[reg.status_index]);
+       ASSERT_NE(0, status_check(status_page, reg.status_bit));
 
        /* Disable event and ensure bits updated in status */
        ASSERT_NE(-1, write(self->enable_fd, "0", sizeof("0")))
-       ASSERT_EQ(0, status_page[reg.status_index]);
+       ASSERT_EQ(0, status_check(status_page, reg.status_bit));
 
        /* File still open should return -EBUSY for delete */
        ASSERT_EQ(-1, ioctl(self->data_fd, DIAG_IOCSDEL, "__test_event"));
@@ -240,6 +245,8 @@ TEST_F(user, write_events) {
        struct iovec io[3];
        __u32 field1, field2;
        int before = 0, after = 0;
+       int page_size = sysconf(_SC_PAGESIZE);
+       char *status_page;
 
        reg.size = sizeof(reg);
        reg.name_args = (__u64)"__test_event u32 field1; u32 field2";
@@ -254,10 +261,18 @@ TEST_F(user, write_events) {
        io[2].iov_base = &field2;
        io[2].iov_len = sizeof(field2);
 
+       status_page = mmap(NULL, page_size, PROT_READ, MAP_SHARED,
+                          self->status_fd, 0);
+
        /* Register should work */
        ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, &reg));
        ASSERT_EQ(0, reg.write_index);
-       ASSERT_NE(0, reg.status_index);
+       ASSERT_NE(0, reg.status_bit);
+
+       /* MMAP should work and be zero'd */
+       ASSERT_NE(MAP_FAILED, status_page);
+       ASSERT_NE(NULL, status_page);
+       ASSERT_EQ(0, status_check(status_page, reg.status_bit));
 
        /* Write should fail on invalid slot with ENOENT */
        io[0].iov_base = &field2;
@@ -271,6 +286,9 @@ TEST_F(user, write_events) {
        self->enable_fd = open(enable_file, O_RDWR);
        ASSERT_NE(-1, write(self->enable_fd, "1", sizeof("1")))
 
+       /* Event should now be enabled */
+       ASSERT_NE(0, status_check(status_page, reg.status_bit));
+
        /* Write should make it out to ftrace buffers */
        before = trace_bytes();
        ASSERT_NE(-1, writev(self->data_fd, (const struct iovec *)io, 3));
@@ -298,7 +316,7 @@ TEST_F(user, write_fault) {
        /* Register should work */
        ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, &reg));
        ASSERT_EQ(0, reg.write_index);
-       ASSERT_NE(0, reg.status_index);
+       ASSERT_NE(0, reg.status_bit);
 
        /* Write should work normally */
        ASSERT_NE(-1, writev(self->data_fd, (const struct iovec *)io, 2));
@@ -315,6 +333,11 @@ TEST_F(user, write_validator) {
        int loc, bytes;
        char data[8];
        int before = 0, after = 0;
+       int page_size = sysconf(_SC_PAGESIZE);
+       char *status_page;
+
+       status_page = mmap(NULL, page_size, PROT_READ, MAP_SHARED,
+                          self->status_fd, 0);
 
        reg.size = sizeof(reg);
        reg.name_args = (__u64)"__test_event __rel_loc char[] data";
@@ -322,7 +345,12 @@ TEST_F(user, write_validator) {
        /* Register should work */
        ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, &reg));
        ASSERT_EQ(0, reg.write_index);
-       ASSERT_NE(0, reg.status_index);
+       ASSERT_NE(0, reg.status_bit);
+
+       /* MMAP should work and be zero'd */
+       ASSERT_NE(MAP_FAILED, status_page);
+       ASSERT_NE(NULL, status_page);
+       ASSERT_EQ(0, status_check(status_page, reg.status_bit));
 
        io[0].iov_base = &reg.write_index;
        io[0].iov_len = sizeof(reg.write_index);
@@ -340,6 +368,9 @@ TEST_F(user, write_validator) {
        self->enable_fd = open(enable_file, O_RDWR);
        ASSERT_NE(-1, write(self->enable_fd, "1", sizeof("1")))
 
+       /* Event should now be enabled */
+       ASSERT_NE(0, status_check(status_page, reg.status_bit));
+
        /* Full in-bounds write should work */
        before = trace_bytes();
        loc = DYN_LOC(0, bytes);
index 26851d5..8b4c787 100644
@@ -35,6 +35,11 @@ static long perf_event_open(struct perf_event_attr *pe, pid_t pid,
        return syscall(__NR_perf_event_open, pe, pid, cpu, group_fd, flags);
 }
 
+static inline int status_check(char *status_page, int status_bit)
+{
+       return status_page[status_bit >> 3] & (1 << (status_bit & 7));
+}
+
 static int get_id(void)
 {
        FILE *fp = fopen(id_file, "r");
@@ -120,8 +125,8 @@ TEST_F(user, perf_write) {
        /* Register should work */
        ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, &reg));
        ASSERT_EQ(0, reg.write_index);
-       ASSERT_NE(0, reg.status_index);
-       ASSERT_EQ(0, status_page[reg.status_index]);
+       ASSERT_NE(0, reg.status_bit);
+       ASSERT_EQ(0, status_check(status_page, reg.status_bit));
 
        /* Id should be there */
        id = get_id();
@@ -144,7 +149,7 @@ TEST_F(user, perf_write) {
        ASSERT_NE(MAP_FAILED, perf_page);
 
        /* Status should be updated */
-       ASSERT_EQ(EVENT_STATUS_PERF, status_page[reg.status_index]);
+       ASSERT_NE(0, status_check(status_page, reg.status_bit));
 
        event.index = reg.write_index;
        event.field1 = 0xc001;
index f4b712d..a5658bf 100644
@@ -27,7 +27,7 @@
  *
  * The rv monitor reference is needed for the monitor declaration.
  */
-struct rv_monitor rv_MODEL_NAME;
+static struct rv_monitor rv_MODEL_NAME;
 DECLARE_DA_MON_GLOBAL(MODEL_NAME, MIN_TYPE);
 
 /*
@@ -63,7 +63,7 @@ TRACEPOINT_DETACH
 /*
  * This is the monitor register section.
  */
-struct rv_monitor rv_MODEL_NAME = {
+static struct rv_monitor rv_MODEL_NAME = {
        .name = "MODEL_NAME",
        .description = "auto-generated MODEL_NAME",
        .enable = enable_MODEL_NAME,
@@ -72,13 +72,13 @@ struct rv_monitor rv_MODEL_NAME = {
        .enabled = 0,
 };
 
-static int register_MODEL_NAME(void)
+static int __init register_MODEL_NAME(void)
 {
        rv_register_monitor(&rv_MODEL_NAME);
        return 0;
 }
 
-static void unregister_MODEL_NAME(void)
+static void __exit unregister_MODEL_NAME(void)
 {
        rv_unregister_monitor(&rv_MODEL_NAME);
 }
index 4080d1c..03539a9 100644
@@ -27,7 +27,7 @@
  *
  * The rv monitor reference is needed for the monitor declaration.
  */
-struct rv_monitor rv_MODEL_NAME;
+static struct rv_monitor rv_MODEL_NAME;
 DECLARE_DA_MON_PER_CPU(MODEL_NAME, MIN_TYPE);
 
 /*
@@ -63,7 +63,7 @@ TRACEPOINT_DETACH
 /*
  * This is the monitor register section.
  */
-struct rv_monitor rv_MODEL_NAME = {
+static struct rv_monitor rv_MODEL_NAME = {
        .name = "MODEL_NAME",
        .description = "auto-generated MODEL_NAME",
        .enable = enable_MODEL_NAME,
@@ -72,13 +72,13 @@ struct rv_monitor rv_MODEL_NAME = {
        .enabled = 0,
 };
 
-static int register_MODEL_NAME(void)
+static int __init register_MODEL_NAME(void)
 {
        rv_register_monitor(&rv_MODEL_NAME);
        return 0;
 }
 
-static void unregister_MODEL_NAME(void)
+static void __exit unregister_MODEL_NAME(void)
 {
        rv_unregister_monitor(&rv_MODEL_NAME);
 }
index 8919717..ffd92af 100644
@@ -27,7 +27,7 @@
  *
  * The rv monitor reference is needed for the monitor declaration.
  */
-struct rv_monitor rv_MODEL_NAME;
+static struct rv_monitor rv_MODEL_NAME;
 DECLARE_DA_MON_PER_TASK(MODEL_NAME, MIN_TYPE);
 
 /*
@@ -63,7 +63,7 @@ TRACEPOINT_DETACH
 /*
  * This is the monitor register section.
  */
-struct rv_monitor rv_MODEL_NAME = {
+static struct rv_monitor rv_MODEL_NAME = {
        .name = "MODEL_NAME",
        .description = "auto-generated MODEL_NAME",
        .enable = enable_MODEL_NAME,
@@ -72,13 +72,13 @@ struct rv_monitor rv_MODEL_NAME = {
        .enabled = 0,
 };
 
-static int register_MODEL_NAME(void)
+static int __init register_MODEL_NAME(void)
 {
        rv_register_monitor(&rv_MODEL_NAME);
        return 0;
 }
 
-static void unregister_MODEL_NAME(void)
+static void __exit unregister_MODEL_NAME(void)
 {
        rv_unregister_monitor(&rv_MODEL_NAME);
 }
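
Beyond making the monitor definitions static, the generated templates now tag the register/unregister handlers __init/__exit, so the registration code of a built-in monitor is freed after boot and the unregister path is discarded when module unload is impossible. The standard pairing, in a minimal sketch (assuming the templates wire these up as module init/exit, as is conventional):

	#include <linux/init.h>
	#include <linux/module.h>

	static int __init my_monitor_init(void)		/* .init.text: freed after boot */
	{
		pr_info("my_monitor: registered\n");
		return 0;
	}

	static void __exit my_monitor_exit(void)	/* .exit.text: may be discarded */
	{
		pr_info("my_monitor: unregistered\n");
	}

	module_init(my_monitor_init);
	module_exit(my_monitor_exit);
	MODULE_LICENSE("GPL");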