Merge branch 'uprobes/core' of git://git.kernel.org/pub/scm/linux/kernel/git/oleg...
author: Ingo Molnar <mingo@kernel.org>
Mon, 11 Feb 2013 09:41:53 +0000 (10:41 +0100)
committer: Ingo Molnar <mingo@kernel.org>
Mon, 11 Feb 2013 09:41:53 +0000 (10:41 +0100)
Improve uprobes performance by adding 'pre-filtering' support,
by Oleg Nesterov:

# time perl -e 'syscall -1 for 1..100_000'
real    0m0.040s
user    0m0.027s
sys     0m0.010s

# perf probe -x /lib/libc.so.6 syscall
# perf record -e probe_libc:syscall sleep 100 &

Before this series:

# time perl -e 'syscall -1 for 1..100_000'
real    0m1.714s
user    0m0.103s
sys     0m1.607s

After:

# time perl -e 'syscall -1 for 1..100_000'
real    0m0.037s
user    0m0.013s
sys     0m0.023s

Signed-off-by: Ingo Molnar <mingo@kernel.org>
arch/x86/kernel/uprobes.c
include/linux/perf_event.h
include/linux/uprobes.h
kernel/events/core.c
kernel/events/uprobes.c
kernel/ptrace.c
kernel/trace/trace_probe.h
kernel/trace/trace_uprobe.c

index c71025b..0ba4cfb 100644 (file)
@@ -680,8 +680,10 @@ static bool __skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
                if (auprobe->insn[i] == 0x66)
                        continue;
 
-               if (auprobe->insn[i] == 0x90)
+               if (auprobe->insn[i] == 0x90) {
+                       regs->ip += i + 1;
                        return true;
+               }
 
                break;
        }
index 42adf01..e47ee46 100644 (file)
@@ -135,16 +135,21 @@ struct hw_perf_event {
                struct { /* software */
                        struct hrtimer  hrtimer;
                };
+               struct { /* tracepoint */
+                       struct task_struct      *tp_target;
+                       /* for tp_event->class */
+                       struct list_head        tp_list;
+               };
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
                struct { /* breakpoint */
-                       struct arch_hw_breakpoint       info;
-                       struct list_head                bp_list;
                        /*
                         * Crufty hack to avoid the chicken and egg
                         * problem hw_breakpoint has with context
                         * creation and event initalization.
                         */
                        struct task_struct              *bp_target;
+                       struct arch_hw_breakpoint       info;
+                       struct list_head                bp_list;
                };
 #endif
        };
index 4f628a6..02b83db 100644 (file)
@@ -35,13 +35,20 @@ struct inode;
 # include <asm/uprobes.h>
 #endif
 
+#define UPROBE_HANDLER_REMOVE          1
+#define UPROBE_HANDLER_MASK            1
+
+enum uprobe_filter_ctx {
+       UPROBE_FILTER_REGISTER,
+       UPROBE_FILTER_UNREGISTER,
+       UPROBE_FILTER_MMAP,
+};
+
 struct uprobe_consumer {
        int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs);
-       /*
-        * filter is optional; If a filter exists, handler is run
-        * if and only if filter returns true.
-        */
-       bool (*filter)(struct uprobe_consumer *self, struct task_struct *task);
+       bool (*filter)(struct uprobe_consumer *self,
+                               enum uprobe_filter_ctx ctx,
+                               struct mm_struct *mm);
 
        struct uprobe_consumer *next;
 };
@@ -94,6 +101,7 @@ extern int __weak set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, unsign
 extern int __weak set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr);
 extern bool __weak is_swbp_insn(uprobe_opcode_t *insn);
 extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc);
+extern int uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool);
 extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc);
 extern int uprobe_mmap(struct vm_area_struct *vma);
 extern void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end);
@@ -117,6 +125,11 @@ uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
 {
        return -ENOSYS;
 }
+static inline int
+uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool add)
+{
+       return -ENOSYS;
+}
 static inline void
 uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
 {
index 301079d..e2d4323 100644 (file)
@@ -6162,11 +6162,14 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 
        if (task) {
                event->attach_state = PERF_ATTACH_TASK;
+
+               if (attr->type == PERF_TYPE_TRACEPOINT)
+                       event->hw.tp_target = task;
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
                /*
                 * hw_breakpoint is a bit difficult here..
                 */
-               if (attr->type == PERF_TYPE_BREAKPOINT)
+               else if (attr->type == PERF_TYPE_BREAKPOINT)
                        event->hw.bp_target = task;
 #endif
        }
index 30ea9a4..a567c8c 100644 (file)
@@ -27,6 +27,7 @@
 #include <linux/pagemap.h>     /* read_mapping_page */
 #include <linux/slab.h>
 #include <linux/sched.h>
+#include <linux/export.h>
 #include <linux/rmap.h>                /* anon_vma_prepare */
 #include <linux/mmu_notifier.h>        /* set_pte_at_notify */
 #include <linux/swap.h>                /* try_to_free_swap */
 #define MAX_UPROBE_XOL_SLOTS           UINSNS_PER_PAGE
 
 static struct rb_root uprobes_tree = RB_ROOT;
-
-static DEFINE_SPINLOCK(uprobes_treelock);      /* serialize rbtree access */
-
-#define UPROBES_HASH_SZ        13
-
 /*
- * We need separate register/unregister and mmap/munmap lock hashes because
- * of mmap_sem nesting.
- *
- * uprobe_register() needs to install probes on (potentially) all processes
- * and thus needs to acquire multiple mmap_sems (consequtively, not
- * concurrently), whereas uprobe_mmap() is called while holding mmap_sem
- * for the particular process doing the mmap.
- *
- * uprobe_register()->register_for_each_vma() needs to drop/acquire mmap_sem
- * because of lock order against i_mmap_mutex. This means there's a hole in
- * the register vma iteration where a mmap() can happen.
- *
- * Thus uprobe_register() can race with uprobe_mmap() and we can try and
- * install a probe where one is already installed.
+ * allows us to skip the uprobe_mmap if there are no uprobe events active
+ * at this time.  Probably a fine grained per inode count is better?
  */
+#define no_uprobe_events()     RB_EMPTY_ROOT(&uprobes_tree)
 
-/* serialize (un)register */
-static struct mutex uprobes_mutex[UPROBES_HASH_SZ];
-
-#define uprobes_hash(v)                (&uprobes_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
+static DEFINE_SPINLOCK(uprobes_treelock);      /* serialize rbtree access */
 
+#define UPROBES_HASH_SZ        13
 /* serialize uprobe->pending_list */
 static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
 #define uprobes_mmap_hash(v)   (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
 
 static struct percpu_rw_semaphore dup_mmap_sem;
 
-/*
- * uprobe_events allows us to skip the uprobe_mmap if there are no uprobe
- * events active at this time.  Probably a fine grained per inode count is
- * better?
- */
-static atomic_t uprobe_events = ATOMIC_INIT(0);
-
 /* Have a copy of original instruction */
 #define UPROBE_COPY_INSN       0
-/* Dont run handlers when first register/ last unregister in progress*/
-#define UPROBE_RUN_HANDLER     1
 /* Can skip singlestep */
-#define UPROBE_SKIP_SSTEP      2
+#define UPROBE_SKIP_SSTEP      1
 
 struct uprobe {
        struct rb_node          rb_node;        /* node in the rb tree */
        atomic_t                ref;
+       struct rw_semaphore     register_rwsem;
        struct rw_semaphore     consumer_rwsem;
-       struct mutex            copy_mutex;     /* TODO: kill me and UPROBE_COPY_INSN */
        struct list_head        pending_list;
        struct uprobe_consumer  *consumers;
        struct inode            *inode;         /* Also hold a ref to inode */
@@ -430,9 +404,6 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe)
        u = __insert_uprobe(uprobe);
        spin_unlock(&uprobes_treelock);
 
-       /* For now assume that the instruction need not be single-stepped */
-       __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
-
        return u;
 }
 
@@ -452,8 +423,10 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
 
        uprobe->inode = igrab(inode);
        uprobe->offset = offset;
+       init_rwsem(&uprobe->register_rwsem);
        init_rwsem(&uprobe->consumer_rwsem);
-       mutex_init(&uprobe->copy_mutex);
+       /* For now assume that the instruction need not be single-stepped */
+       __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
 
        /* add to uprobes_tree, sorted on inode:offset */
        cur_uprobe = insert_uprobe(uprobe);
@@ -463,38 +436,17 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
                kfree(uprobe);
                uprobe = cur_uprobe;
                iput(inode);
-       } else {
-               atomic_inc(&uprobe_events);
        }
 
        return uprobe;
 }
 
-static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
-{
-       struct uprobe_consumer *uc;
-
-       if (!test_bit(UPROBE_RUN_HANDLER, &uprobe->flags))
-               return;
-
-       down_read(&uprobe->consumer_rwsem);
-       for (uc = uprobe->consumers; uc; uc = uc->next) {
-               if (!uc->filter || uc->filter(uc, current))
-                       uc->handler(uc, regs);
-       }
-       up_read(&uprobe->consumer_rwsem);
-}
-
-/* Returns the previous consumer */
-static struct uprobe_consumer *
-consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
+static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
 {
        down_write(&uprobe->consumer_rwsem);
        uc->next = uprobe->consumers;
        uprobe->consumers = uc;
        up_write(&uprobe->consumer_rwsem);
-
-       return uc->next;
 }
 
 /*
@@ -588,7 +540,8 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
        if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
                return ret;
 
-       mutex_lock(&uprobe->copy_mutex);
+       /* TODO: move this into _register, until then we abuse this sem. */
+       down_write(&uprobe->consumer_rwsem);
        if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
                goto out;
 
@@ -612,7 +565,30 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
        set_bit(UPROBE_COPY_INSN, &uprobe->flags);
 
  out:
-       mutex_unlock(&uprobe->copy_mutex);
+       up_write(&uprobe->consumer_rwsem);
+
+       return ret;
+}
+
+static inline bool consumer_filter(struct uprobe_consumer *uc,
+                                  enum uprobe_filter_ctx ctx, struct mm_struct *mm)
+{
+       return !uc->filter || uc->filter(uc, ctx, mm);
+}
+
+static bool filter_chain(struct uprobe *uprobe,
+                        enum uprobe_filter_ctx ctx, struct mm_struct *mm)
+{
+       struct uprobe_consumer *uc;
+       bool ret = false;
+
+       down_read(&uprobe->consumer_rwsem);
+       for (uc = uprobe->consumers; uc; uc = uc->next) {
+               ret = consumer_filter(uc, ctx, mm);
+               if (ret)
+                       break;
+       }
+       up_read(&uprobe->consumer_rwsem);
 
        return ret;
 }
@@ -624,16 +600,6 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
        bool first_uprobe;
        int ret;
 
-       /*
-        * If probe is being deleted, unregister thread could be done with
-        * the vma-rmap-walk through. Adding a probe now can be fatal since
-        * nobody will be able to cleanup. Also we could be from fork or
-        * mremap path, where the probe might have already been inserted.
-        * Hence behave as if probe already existed.
-        */
-       if (!uprobe->consumers)
-               return 0;
-
        ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr);
        if (ret)
                return ret;
@@ -658,14 +624,14 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
 static int
 remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)
 {
-       /* can happen if uprobe_register() fails */
-       if (!test_bit(MMF_HAS_UPROBES, &mm->flags))
-               return 0;
-
        set_bit(MMF_RECALC_UPROBES, &mm->flags);
        return set_orig_insn(&uprobe->arch, mm, vaddr);
 }
 
+static inline bool uprobe_is_active(struct uprobe *uprobe)
+{
+       return !RB_EMPTY_NODE(&uprobe->rb_node);
+}
 /*
  * There could be threads that have already hit the breakpoint. They
  * will recheck the current insn and restart if find_uprobe() fails.
@@ -673,12 +639,15 @@ remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vad
  */
 static void delete_uprobe(struct uprobe *uprobe)
 {
+       if (WARN_ON(!uprobe_is_active(uprobe)))
+               return;
+
        spin_lock(&uprobes_treelock);
        rb_erase(&uprobe->rb_node, &uprobes_tree);
        spin_unlock(&uprobes_treelock);
+       RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */
        iput(uprobe->inode);
        put_uprobe(uprobe);
-       atomic_dec(&uprobe_events);
 }
 
 struct map_info {
@@ -764,8 +733,10 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
        return curr;
 }
 
-static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
+static int
+register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
 {
+       bool is_register = !!new;
        struct map_info *info;
        int err = 0;
 
@@ -794,10 +765,16 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
                    vaddr_to_offset(vma, info->vaddr) != uprobe->offset)
                        goto unlock;
 
-               if (is_register)
-                       err = install_breakpoint(uprobe, mm, vma, info->vaddr);
-               else
-                       err |= remove_breakpoint(uprobe, mm, info->vaddr);
+               if (is_register) {
+                       /* consult only the "caller", new consumer. */
+                       if (consumer_filter(new,
+                                       UPROBE_FILTER_REGISTER, mm))
+                               err = install_breakpoint(uprobe, mm, vma, info->vaddr);
+               } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) {
+                       if (!filter_chain(uprobe,
+                                       UPROBE_FILTER_UNREGISTER, mm))
+                               err |= remove_breakpoint(uprobe, mm, info->vaddr);
+               }
 
  unlock:
                up_write(&mm->mmap_sem);
@@ -810,17 +787,23 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
        return err;
 }
 
-static int __uprobe_register(struct uprobe *uprobe)
+static int __uprobe_register(struct uprobe *uprobe, struct uprobe_consumer *uc)
 {
-       return register_for_each_vma(uprobe, true);
+       consumer_add(uprobe, uc);
+       return register_for_each_vma(uprobe, uc);
 }
 
-static void __uprobe_unregister(struct uprobe *uprobe)
+static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
 {
-       if (!register_for_each_vma(uprobe, false))
-               delete_uprobe(uprobe);
+       int err;
+
+       if (!consumer_del(uprobe, uc))  /* WARN? */
+               return;
 
+       err = register_for_each_vma(uprobe, NULL);
        /* TODO : cant unregister? schedule a worker thread */
+       if (!uprobe->consumers && !err)
+               delete_uprobe(uprobe);
 }
 
 /*
@@ -845,31 +828,59 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *
        struct uprobe *uprobe;
        int ret;
 
-       if (!inode || !uc || uc->next)
-               return -EINVAL;
-
+       /* Racy, just to catch the obvious mistakes */
        if (offset > i_size_read(inode))
                return -EINVAL;
 
-       ret = 0;
-       mutex_lock(uprobes_hash(inode));
+ retry:
        uprobe = alloc_uprobe(inode, offset);
-
-       if (!uprobe) {
-               ret = -ENOMEM;
-       } else if (!consumer_add(uprobe, uc)) {
-               ret = __uprobe_register(uprobe);
-               if (ret) {
-                       uprobe->consumers = NULL;
-                       __uprobe_unregister(uprobe);
-               } else {
-                       set_bit(UPROBE_RUN_HANDLER, &uprobe->flags);
-               }
+       if (!uprobe)
+               return -ENOMEM;
+       /*
+        * We can race with uprobe_unregister()->delete_uprobe().
+        * Check uprobe_is_active() and retry if it is false.
+        */
+       down_write(&uprobe->register_rwsem);
+       ret = -EAGAIN;
+       if (likely(uprobe_is_active(uprobe))) {
+               ret = __uprobe_register(uprobe, uc);
+               if (ret)
+                       __uprobe_unregister(uprobe, uc);
        }
+       up_write(&uprobe->register_rwsem);
+       put_uprobe(uprobe);
 
-       mutex_unlock(uprobes_hash(inode));
-       if (uprobe)
-               put_uprobe(uprobe);
+       if (unlikely(ret == -EAGAIN))
+               goto retry;
+       return ret;
+}
+EXPORT_SYMBOL_GPL(uprobe_register);
+
+/*
+ * uprobe_apply - unregister a already registered probe.
+ * @inode: the file in which the probe has to be removed.
+ * @offset: offset from the start of the file.
+ * @uc: consumer which wants to add more or remove some breakpoints
+ * @add: add or remove the breakpoints
+ */
+int uprobe_apply(struct inode *inode, loff_t offset,
+                       struct uprobe_consumer *uc, bool add)
+{
+       struct uprobe *uprobe;
+       struct uprobe_consumer *con;
+       int ret = -ENOENT;
+
+       uprobe = find_uprobe(inode, offset);
+       if (!uprobe)
+               return ret;
+
+       down_write(&uprobe->register_rwsem);
+       for (con = uprobe->consumers; con && con != uc ; con = con->next)
+               ;
+       if (con)
+               ret = register_for_each_vma(uprobe, add ? uc : NULL);
+       up_write(&uprobe->register_rwsem);
+       put_uprobe(uprobe);
 
        return ret;
 }
@@ -884,24 +895,42 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume
 {
        struct uprobe *uprobe;
 
-       if (!inode || !uc)
-               return;
-
        uprobe = find_uprobe(inode, offset);
        if (!uprobe)
                return;
 
-       mutex_lock(uprobes_hash(inode));
+       down_write(&uprobe->register_rwsem);
+       __uprobe_unregister(uprobe, uc);
+       up_write(&uprobe->register_rwsem);
+       put_uprobe(uprobe);
+}
+EXPORT_SYMBOL_GPL(uprobe_unregister);
 
-       if (consumer_del(uprobe, uc)) {
-               if (!uprobe->consumers) {
-                       __uprobe_unregister(uprobe);
-                       clear_bit(UPROBE_RUN_HANDLER, &uprobe->flags);
-               }
+static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
+{
+       struct vm_area_struct *vma;
+       int err = 0;
+
+       down_read(&mm->mmap_sem);
+       for (vma = mm->mmap; vma; vma = vma->vm_next) {
+               unsigned long vaddr;
+               loff_t offset;
+
+               if (!valid_vma(vma, false) ||
+                   vma->vm_file->f_mapping->host != uprobe->inode)
+                       continue;
+
+               offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
+               if (uprobe->offset <  offset ||
+                   uprobe->offset >= offset + vma->vm_end - vma->vm_start)
+                       continue;
+
+               vaddr = offset_to_vaddr(vma, uprobe->offset);
+               err |= remove_breakpoint(uprobe, mm, vaddr);
        }
+       up_read(&mm->mmap_sem);
 
-       mutex_unlock(uprobes_hash(inode));
-       put_uprobe(uprobe);
+       return err;
 }
 
 static struct rb_node *
@@ -978,7 +1007,7 @@ int uprobe_mmap(struct vm_area_struct *vma)
        struct uprobe *uprobe, *u;
        struct inode *inode;
 
-       if (!atomic_read(&uprobe_events) || !valid_vma(vma, true))
+       if (no_uprobe_events() || !valid_vma(vma, true))
                return 0;
 
        inode = vma->vm_file->f_mapping->host;
@@ -987,9 +1016,14 @@ int uprobe_mmap(struct vm_area_struct *vma)
 
        mutex_lock(uprobes_mmap_hash(inode));
        build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list);
-
+       /*
+        * We can race with uprobe_unregister(), this uprobe can be already
+        * removed. But in this case filter_chain() must return false, all
+        * consumers have gone away.
+        */
        list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
-               if (!fatal_signal_pending(current)) {
+               if (!fatal_signal_pending(current) &&
+                   filter_chain(uprobe, UPROBE_FILTER_MMAP, vma->vm_mm)) {
                        unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
                        install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
                }
@@ -1024,7 +1058,7 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e
  */
 void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
 {
-       if (!atomic_read(&uprobe_events) || !valid_vma(vma, false))
+       if (no_uprobe_events() || !valid_vma(vma, false))
                return;
 
        if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */
@@ -1041,22 +1075,14 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon
 /* Slot allocation for XOL */
 static int xol_add_vma(struct xol_area *area)
 {
-       struct mm_struct *mm;
-       int ret;
-
-       area->page = alloc_page(GFP_HIGHUSER);
-       if (!area->page)
-               return -ENOMEM;
-
-       ret = -EALREADY;
-       mm = current->mm;
+       struct mm_struct *mm = current->mm;
+       int ret = -EALREADY;
 
        down_write(&mm->mmap_sem);
        if (mm->uprobes_state.xol_area)
                goto fail;
 
        ret = -ENOMEM;
-
        /* Try to map as high as possible, this is only a hint. */
        area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0);
        if (area->vaddr & ~PAGE_MASK) {
@@ -1072,54 +1098,53 @@ static int xol_add_vma(struct xol_area *area)
        smp_wmb();      /* pairs with get_xol_area() */
        mm->uprobes_state.xol_area = area;
        ret = 0;
-
-fail:
+ fail:
        up_write(&mm->mmap_sem);
-       if (ret)
-               __free_page(area->page);
 
        return ret;
 }
 
-static struct xol_area *get_xol_area(struct mm_struct *mm)
-{
-       struct xol_area *area;
-
-       area = mm->uprobes_state.xol_area;
-       smp_read_barrier_depends();     /* pairs with wmb in xol_add_vma() */
-
-       return area;
-}
-
 /*
- * xol_alloc_area - Allocate process's xol_area.
- * This area will be used for storing instructions for execution out of
- * line.
+ * get_xol_area - Allocate process's xol_area if necessary.
+ * This area will be used for storing instructions for execution out of line.
  *
  * Returns the allocated area or NULL.
  */
-static struct xol_area *xol_alloc_area(void)
+static struct xol_area *get_xol_area(void)
 {
+       struct mm_struct *mm = current->mm;
        struct xol_area *area;
 
+       area = mm->uprobes_state.xol_area;
+       if (area)
+               goto ret;
+
        area = kzalloc(sizeof(*area), GFP_KERNEL);
        if (unlikely(!area))
-               return NULL;
+               goto out;
 
        area->bitmap = kzalloc(BITS_TO_LONGS(UINSNS_PER_PAGE) * sizeof(long), GFP_KERNEL);
-
        if (!area->bitmap)
-               goto fail;
+               goto free_area;
+
+       area->page = alloc_page(GFP_HIGHUSER);
+       if (!area->page)
+               goto free_bitmap;
 
        init_waitqueue_head(&area->wq);
        if (!xol_add_vma(area))
                return area;
 
-fail:
+       __free_page(area->page);
+ free_bitmap:
        kfree(area->bitmap);
+ free_area:
        kfree(area);
-
-       return get_xol_area(current->mm);
+ out:
+       area = mm->uprobes_state.xol_area;
+ ret:
+       smp_read_barrier_depends();     /* pairs with wmb in xol_add_vma() */
+       return area;
 }
 
 /*
@@ -1185,33 +1210,26 @@ static unsigned long xol_take_insn_slot(struct xol_area *area)
 }
 
 /*
- * xol_get_insn_slot - If was not allocated a slot, then
- * allocate a slot.
+ * xol_get_insn_slot - allocate a slot for xol.
  * Returns the allocated slot address or 0.
  */
-static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot_addr)
+static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
 {
        struct xol_area *area;
        unsigned long offset;
+       unsigned long xol_vaddr;
        void *vaddr;
 
-       area = get_xol_area(current->mm);
-       if (!area) {
-               area = xol_alloc_area();
-               if (!area)
-                       return 0;
-       }
-       current->utask->xol_vaddr = xol_take_insn_slot(area);
+       area = get_xol_area();
+       if (!area)
+               return 0;
 
-       /*
-        * Initialize the slot if xol_vaddr points to valid
-        * instruction slot.
-        */
-       if (unlikely(!current->utask->xol_vaddr))
+       xol_vaddr = xol_take_insn_slot(area);
+       if (unlikely(!xol_vaddr))
                return 0;
 
-       current->utask->vaddr = slot_addr;
-       offset = current->utask->xol_vaddr & ~PAGE_MASK;
+       /* Initialize the slot */
+       offset = xol_vaddr & ~PAGE_MASK;
        vaddr = kmap_atomic(area->page);
        memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES);
        kunmap_atomic(vaddr);
@@ -1221,7 +1239,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot
         */
        flush_dcache_page(area->page);
 
-       return current->utask->xol_vaddr;
+       return xol_vaddr;
 }
 
 /*
@@ -1239,8 +1257,7 @@ static void xol_free_insn_slot(struct task_struct *tsk)
                return;
 
        slot_addr = tsk->utask->xol_vaddr;
-
-       if (unlikely(!slot_addr || IS_ERR_VALUE(slot_addr)))
+       if (unlikely(!slot_addr))
                return;
 
        area = tsk->mm->uprobes_state.xol_area;
@@ -1302,33 +1319,48 @@ void uprobe_copy_process(struct task_struct *t)
 }
 
 /*
- * Allocate a uprobe_task object for the task.
- * Called when the thread hits a breakpoint for the first time.
+ * Allocate a uprobe_task object for the task if if necessary.
+ * Called when the thread hits a breakpoint.
  *
  * Returns:
  * - pointer to new uprobe_task on success
  * - NULL otherwise
  */
-static struct uprobe_task *add_utask(void)
+static struct uprobe_task *get_utask(void)
 {
-       struct uprobe_task *utask;
-
-       utask = kzalloc(sizeof *utask, GFP_KERNEL);
-       if (unlikely(!utask))
-               return NULL;
-
-       current->utask = utask;
-       return utask;
+       if (!current->utask)
+               current->utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
+       return current->utask;
 }
 
 /* Prepare to single-step probed instruction out of line. */
 static int
-pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long vaddr)
+pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
 {
-       if (xol_get_insn_slot(uprobe, vaddr) && !arch_uprobe_pre_xol(&uprobe->arch, regs))
-               return 0;
+       struct uprobe_task *utask;
+       unsigned long xol_vaddr;
+       int err;
+
+       utask = get_utask();
+       if (!utask)
+               return -ENOMEM;
+
+       xol_vaddr = xol_get_insn_slot(uprobe);
+       if (!xol_vaddr)
+               return -ENOMEM;
+
+       utask->xol_vaddr = xol_vaddr;
+       utask->vaddr = bp_vaddr;
+
+       err = arch_uprobe_pre_xol(&uprobe->arch, regs);
+       if (unlikely(err)) {
+               xol_free_insn_slot(current);
+               return err;
+       }
 
-       return -EFAULT;
+       utask->active_uprobe = uprobe;
+       utask->state = UTASK_SSTEP;
+       return 0;
 }
 
 /*
@@ -1390,6 +1422,7 @@ static void mmf_recalc_uprobes(struct mm_struct *mm)
                 * This is not strictly accurate, we can race with
                 * uprobe_unregister() and see the already removed
                 * uprobe if delete_uprobe() was not yet called.
+                * Or this uprobe can be filtered out.
                 */
                if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end))
                        return;
@@ -1451,13 +1484,33 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
        return uprobe;
 }
 
+static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
+{
+       struct uprobe_consumer *uc;
+       int remove = UPROBE_HANDLER_REMOVE;
+
+       down_read(&uprobe->register_rwsem);
+       for (uc = uprobe->consumers; uc; uc = uc->next) {
+               int rc = uc->handler(uc, regs);
+
+               WARN(rc & ~UPROBE_HANDLER_MASK,
+                       "bad rc=0x%x from %pf()\n", rc, uc->handler);
+               remove &= rc;
+       }
+
+       if (remove && uprobe->consumers) {
+               WARN_ON(!uprobe_is_active(uprobe));
+               unapply_uprobe(uprobe, current->mm);
+       }
+       up_read(&uprobe->register_rwsem);
+}
+
 /*
  * Run handler and ask thread to singlestep.
  * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
  */
 static void handle_swbp(struct pt_regs *regs)
 {
-       struct uprobe_task *utask;
        struct uprobe *uprobe;
        unsigned long bp_vaddr;
        int uninitialized_var(is_swbp);
@@ -1482,6 +1535,10 @@ static void handle_swbp(struct pt_regs *regs)
                }
                return;
        }
+
+       /* change it in advance for ->handler() and restart */
+       instruction_pointer_set(regs, bp_vaddr);
+
        /*
         * TODO: move copy_insn/etc into _register and remove this hack.
         * After we hit the bp, _unregister + _register can install the
@@ -1489,32 +1546,16 @@ static void handle_swbp(struct pt_regs *regs)
         */
        smp_rmb(); /* pairs with wmb() in install_breakpoint() */
        if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags)))
-               goto restart;
-
-       utask = current->utask;
-       if (!utask) {
-               utask = add_utask();
-               /* Cannot allocate; re-execute the instruction. */
-               if (!utask)
-                       goto restart;
-       }
+               goto out;
 
        handler_chain(uprobe, regs);
        if (can_skip_sstep(uprobe, regs))
                goto out;
 
-       if (!pre_ssout(uprobe, regs, bp_vaddr)) {
-               utask->active_uprobe = uprobe;
-               utask->state = UTASK_SSTEP;
+       if (!pre_ssout(uprobe, regs, bp_vaddr))
                return;
-       }
 
-restart:
-       /*
-        * cannot singlestep; cannot skip instruction;
-        * re-execute the instruction.
-        */
-       instruction_pointer_set(regs, bp_vaddr);
+       /* can_skip_sstep() succeeded, or restart if can't singlestep */
 out:
        put_uprobe(uprobe);
 }
@@ -1608,10 +1649,8 @@ static int __init init_uprobes(void)
 {
        int i;
 
-       for (i = 0; i < UPROBES_HASH_SZ; i++) {
-               mutex_init(&uprobes_mutex[i]);
+       for (i = 0; i < UPROBES_HASH_SZ; i++)
                mutex_init(&uprobes_mmap_mutex[i]);
-       }
 
        if (percpu_init_rwsem(&dup_mmap_sem))
                return -ENOMEM;
index 6cbeaae..acbd284 100644 (file)
@@ -712,6 +712,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
                                             kiov->iov_len, kiov->iov_base);
 }
 
+/*
+ * This is declared in linux/regset.h and defined in machine-dependent
+ * code.  We put the export here, near the primary machine-neutral use,
+ * to ensure no machine forgets it.
+ */
+EXPORT_SYMBOL_GPL(task_user_regset_view);
 #endif
 
 int ptrace_request(struct task_struct *child, long request,
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 9337086..5c7e09d 100644 (file)
@@ -66,7 +66,6 @@
 #define TP_FLAG_TRACE          1
 #define TP_FLAG_PROFILE                2
 #define TP_FLAG_REGISTERED     4
-#define TP_FLAG_UPROBE         8
 
 
 /* data_rloc: data relative location, compatible with u32 */
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 87b6db4..8dad2a9 100644 (file)
 
 #define UPROBE_EVENT_SYSTEM    "uprobes"
 
+struct trace_uprobe_filter {
+       rwlock_t                rwlock;
+       int                     nr_systemwide;
+       struct list_head        perf_events;
+};
+
 /*
  * uprobe event core functions
  */
-struct trace_uprobe;
-struct uprobe_trace_consumer {
-       struct uprobe_consumer          cons;
-       struct trace_uprobe             *tu;
-};
-
 struct trace_uprobe {
        struct list_head                list;
        struct ftrace_event_class       class;
        struct ftrace_event_call        call;
-       struct uprobe_trace_consumer    *consumer;
+       struct trace_uprobe_filter      filter;
+       struct uprobe_consumer          consumer;
        struct inode                    *inode;
        char                            *filename;
        unsigned long                   offset;
@@ -64,6 +65,18 @@ static LIST_HEAD(uprobe_list);
 
 static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs);
 
+static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter)
+{
+       rwlock_init(&filter->rwlock);
+       filter->nr_systemwide = 0;
+       INIT_LIST_HEAD(&filter->perf_events);
+}
+
+static inline bool uprobe_filter_is_empty(struct trace_uprobe_filter *filter)
+{
+       return !filter->nr_systemwide && list_empty(&filter->perf_events);
+}
+
 /*
  * Allocate new trace_uprobe and initialize it (including uprobes).
  */
@@ -92,6 +105,8 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs)
                goto error;
 
        INIT_LIST_HEAD(&tu->list);
+       tu->consumer.handler = uprobe_dispatcher;
+       init_trace_uprobe_filter(&tu->filter);
        return tu;
 
 error:
@@ -253,16 +268,18 @@ static int create_trace_uprobe(int argc, char **argv)
        if (ret)
                goto fail_address_parse;
 
-       ret = kstrtoul(arg, 0, &offset);
-       if (ret)
-               goto fail_address_parse;
-
        inode = igrab(path.dentry->d_inode);
-       if (!S_ISREG(inode->i_mode)) {
+       path_put(&path);
+
+       if (!inode || !S_ISREG(inode->i_mode)) {
                ret = -EINVAL;
                goto fail_address_parse;
        }
 
+       ret = kstrtoul(arg, 0, &offset);
+       if (ret)
+               goto fail_address_parse;
+
        argc -= 2;
        argv += 2;
 
@@ -469,7 +486,7 @@ static const struct file_operations uprobe_profile_ops = {
 };
 
 /* uprobe handler */
-static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
+static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
 {
        struct uprobe_trace_entry_head *entry;
        struct ring_buffer_event *event;
@@ -479,8 +496,6 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
        unsigned long irq_flags;
        struct ftrace_event_call *call = &tu->call;
 
-       tu->nhit++;
-
        local_save_flags(irq_flags);
        pc = preempt_count();
 
@@ -489,16 +504,18 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
        event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
                                                  size, irq_flags, pc);
        if (!event)
-               return;
+               return 0;
 
        entry = ring_buffer_event_data(event);
-       entry->ip = uprobe_get_swbp_addr(task_pt_regs(current));
+       entry->ip = instruction_pointer(task_pt_regs(current));
        data = (u8 *)&entry[1];
        for (i = 0; i < tu->nr_args; i++)
                call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
 
        if (!filter_current_check_discard(buffer, call, entry, event))
                trace_buffer_unlock_commit(buffer, event, irq_flags, pc);
+
+       return 0;
 }
 
 /* Event entry printers */
@@ -537,42 +554,43 @@ partial:
        return TRACE_TYPE_PARTIAL_LINE;
 }
 
-static int probe_event_enable(struct trace_uprobe *tu, int flag)
+static inline bool is_trace_uprobe_enabled(struct trace_uprobe *tu)
 {
-       struct uprobe_trace_consumer *utc;
-       int ret = 0;
+       return tu->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE);
+}
 
-       if (!tu->inode || tu->consumer)
-               return -EINTR;
+typedef bool (*filter_func_t)(struct uprobe_consumer *self,
+                               enum uprobe_filter_ctx ctx,
+                               struct mm_struct *mm);
+
+static int
+probe_event_enable(struct trace_uprobe *tu, int flag, filter_func_t filter)
+{
+       int ret = 0;
 
-       utc = kzalloc(sizeof(struct uprobe_trace_consumer), GFP_KERNEL);
-       if (!utc)
+       if (is_trace_uprobe_enabled(tu))
                return -EINTR;
 
-       utc->cons.handler = uprobe_dispatcher;
-       utc->cons.filter = NULL;
-       ret = uprobe_register(tu->inode, tu->offset, &utc->cons);
-       if (ret) {
-               kfree(utc);
-               return ret;
-       }
+       WARN_ON(!uprobe_filter_is_empty(&tu->filter));
 
        tu->flags |= flag;
-       utc->tu = tu;
-       tu->consumer = utc;
+       tu->consumer.filter = filter;
+       ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
+       if (ret)
+               tu->flags &= ~flag;
 
-       return 0;
+       return ret;
 }
 
 static void probe_event_disable(struct trace_uprobe *tu, int flag)
 {
-       if (!tu->inode || !tu->consumer)
+       if (!is_trace_uprobe_enabled(tu))
                return;
 
-       uprobe_unregister(tu->inode, tu->offset, &tu->consumer->cons);
+       WARN_ON(!uprobe_filter_is_empty(&tu->filter));
+
+       uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
        tu->flags &= ~flag;
-       kfree(tu->consumer);
-       tu->consumer = NULL;
 }
 
 static int uprobe_event_define_fields(struct ftrace_event_call *event_call)
@@ -646,8 +664,96 @@ static int set_print_fmt(struct trace_uprobe *tu)
 }
 
 #ifdef CONFIG_PERF_EVENTS
+static bool
+__uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm)
+{
+       struct perf_event *event;
+
+       if (filter->nr_systemwide)
+               return true;
+
+       list_for_each_entry(event, &filter->perf_events, hw.tp_list) {
+               if (event->hw.tp_target->mm == mm)
+                       return true;
+       }
+
+       return false;
+}
+
+static inline bool
+uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event)
+{
+       return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm);
+}
+
+static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event)
+{
+       bool done;
+
+       write_lock(&tu->filter.rwlock);
+       if (event->hw.tp_target) {
+               /*
+                * event->parent != NULL means copy_process(), we can avoid
+                * uprobe_apply(). current->mm must be probed and we can rely
+                * on dup_mmap() which preserves the already installed bp's.
+                *
+                * attr.enable_on_exec means that exec/mmap will install the
+                * breakpoints we need.
+                */
+               done = tu->filter.nr_systemwide ||
+                       event->parent || event->attr.enable_on_exec ||
+                       uprobe_filter_event(tu, event);
+               list_add(&event->hw.tp_list, &tu->filter.perf_events);
+       } else {
+               done = tu->filter.nr_systemwide;
+               tu->filter.nr_systemwide++;
+       }
+       write_unlock(&tu->filter.rwlock);
+
+       if (!done)
+               uprobe_apply(tu->inode, tu->offset, &tu->consumer, true);
+
+       return 0;
+}
+
+static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
+{
+       bool done;
+
+       write_lock(&tu->filter.rwlock);
+       if (event->hw.tp_target) {
+               list_del(&event->hw.tp_list);
+               done = tu->filter.nr_systemwide ||
+                       (event->hw.tp_target->flags & PF_EXITING) ||
+                       uprobe_filter_event(tu, event);
+       } else {
+               tu->filter.nr_systemwide--;
+               done = tu->filter.nr_systemwide;
+       }
+       write_unlock(&tu->filter.rwlock);
+
+       if (!done)
+               uprobe_apply(tu->inode, tu->offset, &tu->consumer, false);
+
+       return 0;
+}
+
+static bool uprobe_perf_filter(struct uprobe_consumer *uc,
+                               enum uprobe_filter_ctx ctx, struct mm_struct *mm)
+{
+       struct trace_uprobe *tu;
+       int ret;
+
+       tu = container_of(uc, struct trace_uprobe, consumer);
+       read_lock(&tu->filter.rwlock);
+       ret = __uprobe_perf_filter(&tu->filter, mm);
+       read_unlock(&tu->filter.rwlock);
+
+       return ret;
+}
+
 /* uprobe profile handler */
-static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
+static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
 {
        struct ftrace_event_call *call = &tu->call;
        struct uprobe_trace_entry_head *entry;
@@ -656,11 +762,14 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
        int size, __size, i;
        int rctx;
 
+       if (!uprobe_perf_filter(&tu->consumer, 0, current->mm))
+               return UPROBE_HANDLER_REMOVE;
+
        __size = sizeof(*entry) + tu->size;
        size = ALIGN(__size + sizeof(u32), sizeof(u64));
        size -= sizeof(u32);
        if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough"))
-               return;
+               return 0;
 
        preempt_disable();
 
@@ -668,7 +777,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
        if (!entry)
                goto out;
 
-       entry->ip = uprobe_get_swbp_addr(task_pt_regs(current));
+       entry->ip = instruction_pointer(task_pt_regs(current));
        data = (u8 *)&entry[1];
        for (i = 0; i < tu->nr_args; i++)
                call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
@@ -678,6 +787,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
 
  out:
        preempt_enable();
+       return 0;
 }
 #endif /* CONFIG_PERF_EVENTS */
 
@@ -688,7 +798,7 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,
 
        switch (type) {
        case TRACE_REG_REGISTER:
-               return probe_event_enable(tu, TP_FLAG_TRACE);
+               return probe_event_enable(tu, TP_FLAG_TRACE, NULL);
 
        case TRACE_REG_UNREGISTER:
                probe_event_disable(tu, TP_FLAG_TRACE);
@@ -696,11 +806,18 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,
 
 #ifdef CONFIG_PERF_EVENTS
        case TRACE_REG_PERF_REGISTER:
-               return probe_event_enable(tu, TP_FLAG_PROFILE);
+               return probe_event_enable(tu, TP_FLAG_PROFILE, uprobe_perf_filter);
 
        case TRACE_REG_PERF_UNREGISTER:
                probe_event_disable(tu, TP_FLAG_PROFILE);
                return 0;
+
+       case TRACE_REG_PERF_OPEN:
+               return uprobe_perf_open(tu, data);
+
+       case TRACE_REG_PERF_CLOSE:
+               return uprobe_perf_close(tu, data);
+
 #endif
        default:
                return 0;
@@ -710,22 +827,20 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,
 
 static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
 {
-       struct uprobe_trace_consumer *utc;
        struct trace_uprobe *tu;
+       int ret = 0;
 
-       utc = container_of(con, struct uprobe_trace_consumer, cons);
-       tu = utc->tu;
-       if (!tu || tu->consumer != utc)
-               return 0;
+       tu = container_of(con, struct trace_uprobe, consumer);
+       tu->nhit++;
 
        if (tu->flags & TP_FLAG_TRACE)
-               uprobe_trace_func(tu, regs);
+               ret |= uprobe_trace_func(tu, regs);
 
 #ifdef CONFIG_PERF_EVENTS
        if (tu->flags & TP_FLAG_PROFILE)
-               uprobe_perf_func(tu, regs);
+               ret |= uprobe_perf_func(tu, regs);
 #endif
-       return 0;
+       return ret;
 }
 
 static struct trace_event_functions uprobe_funcs = {