rtmutex: Turn the plist into an rb-tree
author Peter Zijlstra <peterz@infradead.org>
Thu, 7 Nov 2013 13:43:43 +0000 (14:43 +0100)
committer Ingo Molnar <mingo@kernel.org>
Mon, 13 Jan 2014 12:41:50 +0000 (13:41 +0100)
Turn the pi-chains from plist to rb-tree, in the rt_mutex code,
and provide a proper comparison function for -deadline and
-priority tasks.

This is done mainly because:
 - classical prio field of the plist is just an int, which might
   not be enough for representing a deadline;
 - manipulating such a list would become O(nr_deadline_tasks),
   which might be too much, as the number of -deadline tasks increases.

Therefore, an rb-tree is used, and tasks are queued in it according
to the following logic:
 - among two -priority (i.e., SCHED_BATCH/OTHER/RR/FIFO) tasks, the
   one with the higher (lower, actually!) prio wins;
 - among a -priority and a -deadline task, the latter always wins;
 - among two -deadline tasks, the one with the earliest deadline
   wins.

Queueing and dequeueing functions are changed accordingly, for both
the list of a task's pi-waiters and the list of tasks blocked on
a pi-lock.

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Dario Faggioli <raistlin@linux.it>
Signed-off-by: Juri Lelli <juri.lelli@gmail.com>
Signed-off-again-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1383831828-15501-10-git-send-email-juri.lelli@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
include/linux/init_task.h
include/linux/rtmutex.h
include/linux/sched.h
kernel/fork.c
kernel/futex.c
kernel/locking/rtmutex-debug.c
kernel/locking/rtmutex.c
kernel/locking/rtmutex_common.h
kernel/sched/core.c

index b0ed422..f0e5238 100644 (file)
@@ -11,6 +11,7 @@
 #include <linux/user_namespace.h>
 #include <linux/securebits.h>
 #include <linux/seqlock.h>
+#include <linux/rbtree.h>
 #include <net/net_namespace.h>
 #include <linux/sched/rt.h>
 
@@ -154,6 +155,14 @@ extern struct task_group root_task_group;
 
 #define INIT_TASK_COMM "swapper"
 
+#ifdef CONFIG_RT_MUTEXES
+# define INIT_RT_MUTEXES(tsk)                                          \
+       .pi_waiters = RB_ROOT,                                          \
+       .pi_waiters_leftmost = NULL,
+#else
+# define INIT_RT_MUTEXES(tsk)
+#endif
+
 /*
  *  INIT_TASK is used to set up the first task table, touch at
  * your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -221,6 +230,7 @@ extern struct task_group root_task_group;
        INIT_TRACE_RECURSION                                            \
        INIT_TASK_RCU_PREEMPT(tsk)                                      \
        INIT_CPUSET_SEQ(tsk)                                            \
+       INIT_RT_MUTEXES(tsk)                                            \
        INIT_VTIME(tsk)                                                 \
 }
 
index de17134..3aed8d7 100644 (file)
@@ -13,7 +13,7 @@
 #define __LINUX_RT_MUTEX_H
 
 #include <linux/linkage.h>
-#include <linux/plist.h>
+#include <linux/rbtree.h>
 #include <linux/spinlock_types.h>
 
 extern int max_lock_depth; /* for sysctl */
@@ -22,12 +22,14 @@ extern int max_lock_depth; /* for sysctl */
  * The rt_mutex structure
  *
  * @wait_lock: spinlock to protect the structure
- * @wait_list: pilist head to enqueue waiters in priority order
+ * @waiters:   rbtree root to enqueue waiters in priority order
+ * @waiters_leftmost: top waiter
  * @owner:     the mutex owner
  */
 struct rt_mutex {
        raw_spinlock_t          wait_lock;
-       struct plist_head       wait_list;
+       struct rb_root          waiters;
+       struct rb_node          *waiters_leftmost;
        struct task_struct      *owner;
 #ifdef CONFIG_DEBUG_RT_MUTEXES
        int                     save_state;
@@ -66,7 +68,7 @@ struct hrtimer_sleeper;
 
 #define __RT_MUTEX_INITIALIZER(mutexname) \
        { .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
-       , .wait_list = PLIST_HEAD_INIT(mutexname.wait_list) \
+       , .waiters = RB_ROOT \
        , .owner = NULL \
        __DEBUG_RT_MUTEX_INITIALIZER(mutexname)}
 
@@ -98,12 +100,4 @@ extern int rt_mutex_trylock(struct rt_mutex *lock);
 
 extern void rt_mutex_unlock(struct rt_mutex *lock);
 
-#ifdef CONFIG_RT_MUTEXES
-# define INIT_RT_MUTEXES(tsk)                                          \
-       .pi_waiters     = PLIST_HEAD_INIT(tsk.pi_waiters),      \
-       INIT_RT_MUTEX_DEBUG(tsk)
-#else
-# define INIT_RT_MUTEXES(tsk)
-#endif
-
 #endif
index 158f4c2..9ea1501 100644 (file)
@@ -16,6 +16,7 @@ struct sched_param {
 #include <linux/types.h>
 #include <linux/timex.h>
 #include <linux/jiffies.h>
+#include <linux/plist.h>
 #include <linux/rbtree.h>
 #include <linux/thread_info.h>
 #include <linux/cpumask.h>
@@ -1354,7 +1355,8 @@ struct task_struct {
 
 #ifdef CONFIG_RT_MUTEXES
        /* PI waiters blocked on a rt_mutex held by this task */
-       struct plist_head pi_waiters;
+       struct rb_root pi_waiters;
+       struct rb_node *pi_waiters_leftmost;
        /* Deadlock detection and priority inheritance handling */
        struct rt_mutex_waiter *pi_blocked_on;
 #endif
index e6c0f1a..7049ae5 100644 (file)
@@ -1087,7 +1087,8 @@ static void rt_mutex_init_task(struct task_struct *p)
 {
        raw_spin_lock_init(&p->pi_lock);
 #ifdef CONFIG_RT_MUTEXES
-       plist_head_init(&p->pi_waiters);
+       p->pi_waiters = RB_ROOT;
+       p->pi_waiters_leftmost = NULL;
        p->pi_blocked_on = NULL;
 #endif
 }
index f6ff019..679531c 100644 (file)
@@ -2316,6 +2316,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
         * code while we sleep on uaddr.
         */
        debug_rt_mutex_init_waiter(&rt_waiter);
+       RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
+       RB_CLEAR_NODE(&rt_waiter.tree_entry);
        rt_waiter.task = NULL;
 
        ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
index 13b243a..49b2ed3 100644 (file)
@@ -24,7 +24,7 @@
 #include <linux/kallsyms.h>
 #include <linux/syscalls.h>
 #include <linux/interrupt.h>
-#include <linux/plist.h>
+#include <linux/rbtree.h>
 #include <linux/fs.h>
 #include <linux/debug_locks.h>
 
@@ -57,7 +57,7 @@ static void printk_lock(struct rt_mutex *lock, int print_owner)
 
 void rt_mutex_debug_task_free(struct task_struct *task)
 {
-       DEBUG_LOCKS_WARN_ON(!plist_head_empty(&task->pi_waiters));
+       DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters));
        DEBUG_LOCKS_WARN_ON(task->pi_blocked_on);
 }
 
@@ -154,16 +154,12 @@ void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock)
 void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
 {
        memset(waiter, 0x11, sizeof(*waiter));
-       plist_node_init(&waiter->list_entry, MAX_PRIO);
-       plist_node_init(&waiter->pi_list_entry, MAX_PRIO);
        waiter->deadlock_task_pid = NULL;
 }
 
 void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
 {
        put_pid(waiter->deadlock_task_pid);
-       DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->list_entry));
-       DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
        memset(waiter, 0x22, sizeof(*waiter));
 }
 
index 0dd6aec..3bf0aa6 100644 (file)
@@ -14,6 +14,7 @@
 #include <linux/export.h>
 #include <linux/sched.h>
 #include <linux/sched/rt.h>
+#include <linux/sched/deadline.h>
 #include <linux/timer.h>
 
 #include "rtmutex_common.h"
@@ -91,10 +92,104 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
 }
 #endif
 
+static inline int
+rt_mutex_waiter_less(struct rt_mutex_waiter *left,
+                    struct rt_mutex_waiter *right)
+{
+       if (left->task->prio < right->task->prio)
+               return 1;
+
+       /*
+        * If both tasks are dl_task(), we check their deadlines.
+        */
+       if (dl_prio(left->task->prio) && dl_prio(right->task->prio))
+               return (left->task->dl.deadline < right->task->dl.deadline);
+
+       return 0;
+}
+
+static void
+rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
+{
+       struct rb_node **link = &lock->waiters.rb_node;
+       struct rb_node *parent = NULL;
+       struct rt_mutex_waiter *entry;
+       int leftmost = 1;
+
+       while (*link) {
+               parent = *link;
+               entry = rb_entry(parent, struct rt_mutex_waiter, tree_entry);
+               if (rt_mutex_waiter_less(waiter, entry)) {
+                       link = &parent->rb_left;
+               } else {
+                       link = &parent->rb_right;
+                       leftmost = 0;
+               }
+       }
+
+       if (leftmost)
+               lock->waiters_leftmost = &waiter->tree_entry;
+
+       rb_link_node(&waiter->tree_entry, parent, link);
+       rb_insert_color(&waiter->tree_entry, &lock->waiters);
+}
+
+static void
+rt_mutex_dequeue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
+{
+       if (RB_EMPTY_NODE(&waiter->tree_entry))
+               return;
+
+       if (lock->waiters_leftmost == &waiter->tree_entry)
+               lock->waiters_leftmost = rb_next(&waiter->tree_entry);
+
+       rb_erase(&waiter->tree_entry, &lock->waiters);
+       RB_CLEAR_NODE(&waiter->tree_entry);
+}
+
+static void
+rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
+{
+       struct rb_node **link = &task->pi_waiters.rb_node;
+       struct rb_node *parent = NULL;
+       struct rt_mutex_waiter *entry;
+       int leftmost = 1;
+
+       while (*link) {
+               parent = *link;
+               entry = rb_entry(parent, struct rt_mutex_waiter, pi_tree_entry);
+               if (rt_mutex_waiter_less(waiter, entry)) {
+                       link = &parent->rb_left;
+               } else {
+                       link = &parent->rb_right;
+                       leftmost = 0;
+               }
+       }
+
+       if (leftmost)
+               task->pi_waiters_leftmost = &waiter->pi_tree_entry;
+
+       rb_link_node(&waiter->pi_tree_entry, parent, link);
+       rb_insert_color(&waiter->pi_tree_entry, &task->pi_waiters);
+}
+
+static void
+rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
+{
+       if (RB_EMPTY_NODE(&waiter->pi_tree_entry))
+               return;
+
+       if (task->pi_waiters_leftmost == &waiter->pi_tree_entry)
+               task->pi_waiters_leftmost = rb_next(&waiter->pi_tree_entry);
+
+       rb_erase(&waiter->pi_tree_entry, &task->pi_waiters);
+       RB_CLEAR_NODE(&waiter->pi_tree_entry);
+}
+
 /*
- * Calculate task priority from the waiter list priority
+ * Calculate task priority from the waiter tree priority
  *
- * Return task->normal_prio when the waiter list is empty or when
+ * Return task->normal_prio when the waiter tree is empty or when
  * the waiter is not allowed to do priority boosting
  */
 int rt_mutex_getprio(struct task_struct *task)
@@ -102,7 +197,7 @@ int rt_mutex_getprio(struct task_struct *task)
        if (likely(!task_has_pi_waiters(task)))
                return task->normal_prio;
 
-       return min(task_top_pi_waiter(task)->pi_list_entry.prio,
+       return min(task_top_pi_waiter(task)->task->prio,
                   task->normal_prio);
 }
 
@@ -233,7 +328,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
         * When deadlock detection is off then we check, if further
         * priority adjustment is necessary.
         */
-       if (!detect_deadlock && waiter->list_entry.prio == task->prio)
+       if (!detect_deadlock && waiter->task->prio == task->prio)
                goto out_unlock_pi;
 
        lock = waiter->lock;
@@ -254,9 +349,9 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
        top_waiter = rt_mutex_top_waiter(lock);
 
        /* Requeue the waiter */
-       plist_del(&waiter->list_entry, &lock->wait_list);
-       waiter->list_entry.prio = task->prio;
-       plist_add(&waiter->list_entry, &lock->wait_list);
+       rt_mutex_dequeue(lock, waiter);
+       waiter->task->prio = task->prio;
+       rt_mutex_enqueue(lock, waiter);
 
        /* Release the task */
        raw_spin_unlock_irqrestore(&task->pi_lock, flags);
@@ -280,17 +375,15 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
 
        if (waiter == rt_mutex_top_waiter(lock)) {
                /* Boost the owner */
-               plist_del(&top_waiter->pi_list_entry, &task->pi_waiters);
-               waiter->pi_list_entry.prio = waiter->list_entry.prio;
-               plist_add(&waiter->pi_list_entry, &task->pi_waiters);
+               rt_mutex_dequeue_pi(task, top_waiter);
+               rt_mutex_enqueue_pi(task, waiter);
                __rt_mutex_adjust_prio(task);
 
        } else if (top_waiter == waiter) {
                /* Deboost the owner */
-               plist_del(&waiter->pi_list_entry, &task->pi_waiters);
+               rt_mutex_dequeue_pi(task, waiter);
                waiter = rt_mutex_top_waiter(lock);
-               waiter->pi_list_entry.prio = waiter->list_entry.prio;
-               plist_add(&waiter->pi_list_entry, &task->pi_waiters);
+               rt_mutex_enqueue_pi(task, waiter);
                __rt_mutex_adjust_prio(task);
        }
 
@@ -355,7 +448,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
         * 3) it is top waiter
         */
        if (rt_mutex_has_waiters(lock)) {
-               if (task->prio >= rt_mutex_top_waiter(lock)->list_entry.prio) {
+               if (task->prio >= rt_mutex_top_waiter(lock)->task->prio) {
                        if (!waiter || waiter != rt_mutex_top_waiter(lock))
                                return 0;
                }
@@ -369,7 +462,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
 
                /* remove the queued waiter. */
                if (waiter) {
-                       plist_del(&waiter->list_entry, &lock->wait_list);
+                       rt_mutex_dequeue(lock, waiter);
                        task->pi_blocked_on = NULL;
                }
 
@@ -379,8 +472,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
                 */
                if (rt_mutex_has_waiters(lock)) {
                        top = rt_mutex_top_waiter(lock);
-                       top->pi_list_entry.prio = top->list_entry.prio;
-                       plist_add(&top->pi_list_entry, &task->pi_waiters);
+                       rt_mutex_enqueue_pi(task, top);
                }
                raw_spin_unlock_irqrestore(&task->pi_lock, flags);
        }
@@ -416,13 +508,11 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
        __rt_mutex_adjust_prio(task);
        waiter->task = task;
        waiter->lock = lock;
-       plist_node_init(&waiter->list_entry, task->prio);
-       plist_node_init(&waiter->pi_list_entry, task->prio);
 
        /* Get the top priority waiter on the lock */
        if (rt_mutex_has_waiters(lock))
                top_waiter = rt_mutex_top_waiter(lock);
-       plist_add(&waiter->list_entry, &lock->wait_list);
+       rt_mutex_enqueue(lock, waiter);
 
        task->pi_blocked_on = waiter;
 
@@ -433,8 +523,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
 
        if (waiter == rt_mutex_top_waiter(lock)) {
                raw_spin_lock_irqsave(&owner->pi_lock, flags);
-               plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
-               plist_add(&waiter->pi_list_entry, &owner->pi_waiters);
+               rt_mutex_dequeue_pi(owner, top_waiter);
+               rt_mutex_enqueue_pi(owner, waiter);
 
                __rt_mutex_adjust_prio(owner);
                if (owner->pi_blocked_on)
@@ -486,7 +576,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
         * boosted mode and go back to normal after releasing
         * lock->wait_lock.
         */
-       plist_del(&waiter->pi_list_entry, &current->pi_waiters);
+       rt_mutex_dequeue_pi(current, waiter);
 
        rt_mutex_set_owner(lock, NULL);
 
@@ -510,7 +600,7 @@ static void remove_waiter(struct rt_mutex *lock,
        int chain_walk = 0;
 
        raw_spin_lock_irqsave(&current->pi_lock, flags);
-       plist_del(&waiter->list_entry, &lock->wait_list);
+       rt_mutex_dequeue(lock, waiter);
        current->pi_blocked_on = NULL;
        raw_spin_unlock_irqrestore(&current->pi_lock, flags);
 
@@ -521,13 +611,13 @@ static void remove_waiter(struct rt_mutex *lock,
 
                raw_spin_lock_irqsave(&owner->pi_lock, flags);
 
-               plist_del(&waiter->pi_list_entry, &owner->pi_waiters);
+               rt_mutex_dequeue_pi(owner, waiter);
 
                if (rt_mutex_has_waiters(lock)) {
                        struct rt_mutex_waiter *next;
 
                        next = rt_mutex_top_waiter(lock);
-                       plist_add(&next->pi_list_entry, &owner->pi_waiters);
+                       rt_mutex_enqueue_pi(owner, next);
                }
                __rt_mutex_adjust_prio(owner);
 
@@ -537,8 +627,6 @@ static void remove_waiter(struct rt_mutex *lock,
                raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
        }
 
-       WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
-
        if (!chain_walk)
                return;
 
@@ -565,7 +653,7 @@ void rt_mutex_adjust_pi(struct task_struct *task)
        raw_spin_lock_irqsave(&task->pi_lock, flags);
 
        waiter = task->pi_blocked_on;
-       if (!waiter || waiter->list_entry.prio == task->prio) {
+       if (!waiter || waiter->task->prio == task->prio) {
                raw_spin_unlock_irqrestore(&task->pi_lock, flags);
                return;
        }
@@ -638,6 +726,8 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
        int ret = 0;
 
        debug_rt_mutex_init_waiter(&waiter);
+       RB_CLEAR_NODE(&waiter.pi_tree_entry);
+       RB_CLEAR_NODE(&waiter.tree_entry);
 
        raw_spin_lock(&lock->wait_lock);
 
@@ -904,7 +994,8 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name)
 {
        lock->owner = NULL;
        raw_spin_lock_init(&lock->wait_lock);
-       plist_head_init(&lock->wait_list);
+       lock->waiters = RB_ROOT;
+       lock->waiters_leftmost = NULL;
 
        debug_rt_mutex_init(lock, name);
 }
index 53a66c8..b65442f 100644 (file)
@@ -40,13 +40,13 @@ extern void schedule_rt_mutex_test(struct rt_mutex *lock);
  * This is the control structure for tasks blocked on a rt_mutex,
  * which is allocated on the kernel stack on of the blocked task.
  *
- * @list_entry:                pi node to enqueue into the mutex waiters list
- * @pi_list_entry:     pi node to enqueue into the mutex owner waiters list
+ * @tree_entry:                pi node to enqueue into the mutex waiters tree
+ * @pi_tree_entry:     pi node to enqueue into the mutex owner waiters tree
  * @task:              task reference to the blocked task
  */
 struct rt_mutex_waiter {
-       struct plist_node       list_entry;
-       struct plist_node       pi_list_entry;
+       struct rb_node          tree_entry;
+       struct rb_node          pi_tree_entry;
        struct task_struct      *task;
        struct rt_mutex         *lock;
 #ifdef CONFIG_DEBUG_RT_MUTEXES
@@ -57,11 +57,11 @@ struct rt_mutex_waiter {
 };
 
 /*
- * Various helpers to access the waiters-plist:
+ * Various helpers to access the waiters-tree:
  */
 static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
 {
-       return !plist_head_empty(&lock->wait_list);
+       return !RB_EMPTY_ROOT(&lock->waiters);
 }
 
 static inline struct rt_mutex_waiter *
@@ -69,8 +69,8 @@ rt_mutex_top_waiter(struct rt_mutex *lock)
 {
        struct rt_mutex_waiter *w;
 
-       w = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter,
-                              list_entry);
+       w = rb_entry(lock->waiters_leftmost, struct rt_mutex_waiter,
+                    tree_entry);
        BUG_ON(w->lock != lock);
 
        return w;
@@ -78,14 +78,14 @@ rt_mutex_top_waiter(struct rt_mutex *lock)
 
 static inline int task_has_pi_waiters(struct task_struct *p)
 {
-       return !plist_head_empty(&p->pi_waiters);
+       return !RB_EMPTY_ROOT(&p->pi_waiters);
 }
 
 static inline struct rt_mutex_waiter *
 task_top_pi_waiter(struct task_struct *p)
 {
-       return plist_first_entry(&p->pi_waiters, struct rt_mutex_waiter,
-                                 pi_list_entry);
+       return rb_entry(p->pi_waiters_leftmost, struct rt_mutex_waiter,
+                       pi_tree_entry);
 }
 
 /*
index 069230b..aebcc70 100644 (file)
@@ -6635,10 +6635,6 @@ void __init sched_init(void)
        INIT_HLIST_HEAD(&init_task.preempt_notifiers);
 #endif
 
-#ifdef CONFIG_RT_MUTEXES
-       plist_head_init(&init_task.pi_waiters);
-#endif
-
        /*
         * The boot idle thread does lazy MMU switching as well:
         */