Merge branch 'sched/locking' into sched/core

author Ingo Molnar <mingo@elte.hu>

Mon, 18 Apr 2011 12:53:18 +0000 (14:53 +0200)

committer Ingo Molnar <mingo@elte.hu>

Mon, 18 Apr 2011 12:53:33 +0000 (14:53 +0200)
author Ingo Molnar <mingo@elte.hu>
Mon, 18 Apr 2011 12:53:18 +0000 (14:53 +0200)
committer Ingo Molnar <mingo@elte.hu>
Mon, 18 Apr 2011 12:53:33 +0000 (14:53 +0200)
diff --combined include/linux/sched.h

index e43e5b0,e09dafa..d9ca3aa
--- 1/include/linux/sched.h
--- 2/include/linux/sched.h
+++ b/include/linux/sched.h
@@@ -360,7 -360,7 +360,7 @@@ extern signed long schedule_timeout_int
   extern signed long schedule_timeout_killable(signed long timeout);
   extern signed long schedule_timeout_uninterruptible(signed long timeout);
   asmlinkage void schedule(void);
- extern int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner);
+ extern int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner);
   
   struct nsproxy;
   struct user_namespace;
@@@ -868,7 -868,6 +868,7 @@@ static inline int sd_power_saving_flags
   
   struct sched_group {
         struct sched_group *next;       /* Must be a circular list */
+ +      atomic_t ref;
   
         /*
          * CPU power of this group, SCHED_LOAD_SCALE being max power for a
@@@ -883,6 -882,9 +883,6 @@@
          * NOTE: this field is variable length. (Allocated dynamically
          * by attaching extra space to the end of the structure,
          * depending on how many CPUs the kernel has booted up with)
- -       *
- -       * It is also be embedded into static data structures at build
- -       * time. (See 'struct static_sched_group' in kernel/sched.c)
          */
         unsigned long cpumask[0];
   };
@@@ -892,6 -894,17 +892,6 @@@ static inline struct cpumask *sched_gro
         return to_cpumask(sg->cpumask);
   }
   
- -enum sched_domain_level {
- -      SD_LV_NONE = 0,
- -      SD_LV_SIBLING,
- -      SD_LV_MC,
- -      SD_LV_BOOK,
- -      SD_LV_CPU,
- -      SD_LV_NODE,
- -      SD_LV_ALLNODES,
- -      SD_LV_MAX
- -};
- -
   struct sched_domain_attr {
         int relax_domain_level;
   };
@@@ -900,8 -913,6 +900,8 @@@
         .relax_domain_level = -1,                       \
   }
   
+ +extern int sched_domain_level_max;
+ +
   struct sched_domain {
         /* These fields must be setup */
         struct sched_domain *parent;    /* top domain must be null terminated */
@@@ -919,7 -930,7 +919,7 @@@
         unsigned int forkexec_idx;
         unsigned int smt_gain;
         int flags;                      /* See SD_* */
- -      enum sched_domain_level level;
+ +      int level;
   
         /* Runtime fields. */
         unsigned long last_balance;     /* init to jiffies. units in jiffies */
@@@ -962,10 -973,6 +962,10 @@@
   #ifdef CONFIG_SCHED_DEBUG
         char *name;
   #endif
+ +      union {
+ +              void *private;          /* used during construction */
+ +              struct rcu_head rcu;    /* used during destruction */
+ +      };
   
         unsigned int span_weight;
         /*
@@@ -974,6 -981,9 +974,6 @@@
          * NOTE: this field is variable length. (Allocated dynamically
          * by attaching extra space to the end of the structure,
          * depending on how many CPUs the kernel has booted up with)
- -       *
- -       * It is also be embedded into static data structures at build
- -       * time. (See 'struct static_sched_domain' in kernel/sched.c)
          */
         unsigned long span[0];
   };
@@@ -1038,8 -1048,12 +1038,12 @@@ struct sched_domain
   #define WF_FORK               0x02            /* child wakeup after fork */
   
   #define ENQUEUE_WAKEUP                1
- #define ENQUEUE_WAKING                2
- #define ENQUEUE_HEAD          4
+ #define ENQUEUE_HEAD          2
+ #ifdef CONFIG_SMP
+ #define ENQUEUE_WAKING                4       /* sched_class::task_waking was called */
+ #else
+ #define ENQUEUE_WAKING                0
+ #endif
   
   #define DEQUEUE_SLEEP         1
   
@@@ -1057,12 -1071,11 +1061,11 @@@ struct sched_class 
         void (*put_prev_task) (struct rq *rq, struct task_struct *p);
   
   #ifdef CONFIG_SMP
-       int  (*select_task_rq)(struct rq *rq, struct task_struct *p,
-                              int sd_flag, int flags);
+       int  (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
   
         void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
         void (*post_schedule) (struct rq *this_rq);
-       void (*task_waking) (struct rq *this_rq, struct task_struct *task);
+       void (*task_waking) (struct task_struct *task);
         void (*task_woken) (struct rq *this_rq, struct task_struct *task);
   
         void (*set_cpus_allowed)(struct task_struct *p,
@@@ -1190,10 -1203,10 +1193,10 @@@ struct task_struct 
         int lock_depth;         /* BKL lock depth */
   
   #ifdef CONFIG_SMP
- #ifdef __ARCH_WANT_UNLOCKED_CTXSW
-       int oncpu;
- #endif
+       struct task_struct *wake_entry;
+       int on_cpu;
   #endif
+       int on_rq;
   
         int prio, static_prio, normal_prio;
         unsigned int rt_priority;
@@@ -1261,6 -1274,7 +1264,7 @@@
   
         /* Revert to default priority/policy when forking */
         unsigned sched_reset_on_fork:1;
+       unsigned sched_contributes_to_load:1;
   
         pid_t pid;
         pid_t tgid;
@@@ -2179,8 -2193,10 +2183,10 @@@ extern void set_task_comm(struct task_s
   extern char *get_task_comm(char *to, struct task_struct *tsk);
   
   #ifdef CONFIG_SMP
+ void scheduler_ipi(void);
   extern unsigned long wait_task_inactive(struct task_struct *, long match_state);
   #else
+ static inline void scheduler_ipi(void) { }
   static inline unsigned long wait_task_inactive(struct task_struct *p,
                                                long match_state)
   {
diff --combined kernel/sched.c

index 506cb81,cd597c7..0cfe031
--- 1/kernel/sched.c
--- 2/kernel/sched.c
+++ b/kernel/sched.c
@@@ -231,7 -231,7 +231,7 @@@ static void destroy_rt_bandwidth(struc
   #endif
   
   /*
- - * sched_domains_mutex serializes calls to arch_init_sched_domains,
+ + * sched_domains_mutex serializes calls to init_sched_domains,
    * detach_destroy_domains and partition_sched_domains.
    */
   static DEFINE_MUTEX(sched_domains_mutex);
@@@ -312,6 -312,9 +312,9 @@@ struct cfs_rq 
   
         u64 exec_clock;
         u64 min_vruntime;
+ #ifndef CONFIG_64BIT
+       u64 min_vruntime_copy;
+ #endif
   
         struct rb_root tasks_timeline;
         struct rb_node *rb_leftmost;
@@@ -417,7 -420,6 +420,7 @@@ struct rt_rq 
    */
   struct root_domain {
         atomic_t refcount;
+ +      struct rcu_head rcu;
         cpumask_var_t span;
         cpumask_var_t online;
   
@@@ -554,6 -556,10 +557,10 @@@ struct rq 
         unsigned int ttwu_count;
         unsigned int ttwu_local;
   #endif
+ 
+ #ifdef CONFIG_SMP
+       struct task_struct *wake_list;
+ #endif
   };
   
   static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@@ -572,7 -578,7 +579,7 @@@ static inline int cpu_of(struct rq *rq
   
   #define rcu_dereference_check_sched_domain(p) \
         rcu_dereference_check((p), \
- -                            rcu_read_lock_sched_held() || \
+ +                            rcu_read_lock_held() || \
                               lockdep_is_held(&sched_domains_mutex))
   
   /*
@@@ -597,7 -603,7 +604,7 @@@
    * Return the group to which this tasks belongs.
    *
    * We use task_subsys_state_check() and extend the RCU verification
-  * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach()
+  * with lockdep_is_held(&p->pi_lock) because cpu_cgroup_attach()
    * holds that lock for each task it moves into the cgroup. Therefore
    * by holding that lock, we pin the task to the current cgroup.
    */
@@@ -607,7 -613,7 +614,7 @@@ static inline struct task_group *task_g
         struct cgroup_subsys_state *css;
   
         css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
-                       lockdep_is_held(&task_rq(p)->lock));
+                       lockdep_is_held(&p->pi_lock));
         tg = container_of(css, struct task_group, css);
   
         return autogroup_task_group(p, tg);
@@@ -839,18 -845,39 +846,39 @@@ static inline int task_current(struct r
         return rq->curr == p;
   }
   
- #ifndef __ARCH_WANT_UNLOCKED_CTXSW
   static inline int task_running(struct rq *rq, struct task_struct *p)
   {
+ #ifdef CONFIG_SMP
+       return p->on_cpu;
+ #else
         return task_current(rq, p);
+ #endif
   }
   
+ #ifndef __ARCH_WANT_UNLOCKED_CTXSW
   static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
   {
+ #ifdef CONFIG_SMP
+       /*
+        * We can optimise this out completely for !SMP, because the
+        * SMP rebalancing from interrupt is the only thing that cares
+        * here.
+        */
+       next->on_cpu = 1;
+ #endif
   }
   
   static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
   {
+ #ifdef CONFIG_SMP
+       /*
+        * After ->on_cpu is cleared, the task can be moved to a different CPU.
+        * We must ensure this doesn't happen until the switch is completely
+        * finished.
+        */
+       smp_wmb();
+       prev->on_cpu = 0;
+ #endif
   #ifdef CONFIG_DEBUG_SPINLOCK
         /* this is a valid case when another task releases the spinlock */
         rq->lock.owner = current;
@@@ -866,15 -893,6 +894,6 @@@
   }
   
   #else /* __ARCH_WANT_UNLOCKED_CTXSW */
- static inline int task_running(struct rq *rq, struct task_struct *p)
- {
- #ifdef CONFIG_SMP
-       return p->oncpu;
- #else
-       return task_current(rq, p);
- #endif
- }
- 
   static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
   {
   #ifdef CONFIG_SMP
@@@ -883,7 -901,7 +902,7 @@@
          * SMP rebalancing from interrupt is the only thing that cares
          * here.
          */
-       next->oncpu = 1;
+       next->on_cpu = 1;
   #endif
   #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
         raw_spin_unlock_irq(&rq->lock);
@@@ -896,12 -914,12 +915,12 @@@ static inline void finish_lock_switch(s
   {
   #ifdef CONFIG_SMP
         /*
-        * After ->oncpu is cleared, the task can be moved to a different CPU.
+        * After ->on_cpu is cleared, the task can be moved to a different CPU.
          * We must ensure this doesn't happen until the switch is completely
          * finished.
          */
         smp_wmb();
-       prev->oncpu = 0;
+       prev->on_cpu = 0;
   #endif
   #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
         local_irq_enable();
@@@ -910,23 -928,15 +929,15 @@@
   #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
   
   /*
-  * Check whether the task is waking, we use this to synchronize ->cpus_allowed
-  * against ttwu().
-  */
- static inline int task_is_waking(struct task_struct *p)
- {
-       return unlikely(p->state == TASK_WAKING);
- }
- 
- /*
-  * __task_rq_lock - lock the runqueue a given task resides on.
-  * Must be called interrupts disabled.
+  * __task_rq_lock - lock the rq @p resides on.
    */
   static inline struct rq *__task_rq_lock(struct task_struct *p)
         __acquires(rq->lock)
   {
         struct rq *rq;
   
+       lockdep_assert_held(&p->pi_lock);
+ 
         for (;;) {
                 rq = task_rq(p);
                 raw_spin_lock(&rq->lock);
@@@ -937,22 -947,22 +948,22 @@@
   }
   
   /*
-  * task_rq_lock - lock the runqueue a given task resides on and disable
-  * interrupts. Note the ordering: we can safely lookup the task_rq without
-  * explicitly disabling preemption.
+  * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
    */
   static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
+       __acquires(p->pi_lock)
         __acquires(rq->lock)
   {
         struct rq *rq;
   
         for (;;) {
-               local_irq_save(*flags);
+               raw_spin_lock_irqsave(&p->pi_lock, *flags);
                 rq = task_rq(p);
                 raw_spin_lock(&rq->lock);
                 if (likely(rq == task_rq(p)))
                         return rq;
-               raw_spin_unlock_irqrestore(&rq->lock, *flags);
+               raw_spin_unlock(&rq->lock);
+               raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
         }
   }
   
@@@ -962,10 -972,13 +973,13 @@@ static void __task_rq_unlock(struct rq 
         raw_spin_unlock(&rq->lock);
   }
   
- static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
+ static inline void
+ task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
         __releases(rq->lock)
+       __releases(p->pi_lock)
   {
-       raw_spin_unlock_irqrestore(&rq->lock, *flags);
+       raw_spin_unlock(&rq->lock);
+       raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
   }
   
   /*
@@@ -1774,7 -1787,6 +1788,6 @@@ static void enqueue_task(struct rq *rq
         update_rq_clock(rq);
         sched_info_queued(p);
         p->sched_class->enqueue_task(rq, p, flags);
-       p->se.on_rq = 1;
   }
   
   static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
@@@ -1782,7 -1794,6 +1795,6 @@@
         update_rq_clock(rq);
         sched_info_dequeued(p);
         p->sched_class->dequeue_task(rq, p, flags);
-       p->se.on_rq = 0;
   }
   
   /*
@@@ -2117,7 -2128,7 +2129,7 @@@ static void check_preempt_curr(struct r
          * A queue event has occurred, and we're going to schedule.  In
          * this case, we can save a useless back to back clock update.
          */
-       if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr))
+       if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
                 rq->skip_clock_update = 1;
   }
   
@@@ -2163,6 -2174,11 +2175,11 @@@ void set_task_cpu(struct task_struct *p
          */
         WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
                         !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
+ 
+ #ifdef CONFIG_LOCKDEP
+       WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
+                                     lockdep_is_held(&task_rq(p)->lock)));
+ #endif
   #endif
   
         trace_sched_migrate_task(p, new_cpu);
@@@ -2183,19 -2199,6 +2200,6 @@@ struct migration_arg 
   static int migration_cpu_stop(void *data);
   
   /*
-  * The task's runqueue lock must be held.
-  * Returns true if you have to wait for migration thread.
-  */
- static bool migrate_task(struct task_struct *p, struct rq *rq)
- {
-       /*
-        * If the task is not on a runqueue (and not running), then
-        * the next wake-up will properly place the task.
-        */
-       return p->se.on_rq || task_running(rq, p);
- }
- 
- /*
    * wait_task_inactive - wait for a thread to unschedule.
    *
    * If @match_state is nonzero, it's the @p->state value just checked and
@@@ -2252,11 -2255,11 +2256,11 @@@ unsigned long wait_task_inactive(struc
                 rq = task_rq_lock(p, &flags);
                 trace_sched_wait_task(p);
                 running = task_running(rq, p);
-               on_rq = p->se.on_rq;
+               on_rq = p->on_rq;
                 ncsw = 0;
                 if (!match_state || p->state == match_state)
                         ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
-               task_rq_unlock(rq, &flags);
+               task_rq_unlock(rq, p, &flags);
   
                 /*
                  * If it changed from the expected state, bail out now.
@@@ -2331,7 -2334,7 +2335,7 @@@ EXPORT_SYMBOL_GPL(kick_process)
   
   #ifdef CONFIG_SMP
   /*
-  * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
+  * ->cpus_allowed is protected by both rq->lock and p->pi_lock
    */
   static int select_fallback_rq(int cpu, struct task_struct *p)
   {
@@@ -2364,12 -2367,12 +2368,12 @@@
   }
   
   /*
-  * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable.
+  * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
    */
   static inline
- int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags)
+ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
   {
-       int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags);
+       int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
   
         /*
          * In order not to call set_task_cpu() on a blocking task we need
@@@ -2395,27 -2398,60 +2399,60 @@@ static void update_avg(u64 *avg, u64 sa
   }
   #endif
   
- static inline void ttwu_activate(struct task_struct *p, struct rq *rq,
-                                bool is_sync, bool is_migrate, bool is_local,
-                                unsigned long en_flags)
+ static void
+ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
   {
+ #ifdef CONFIG_SCHEDSTATS
+       struct rq *rq = this_rq();
+ 
+ #ifdef CONFIG_SMP
+       int this_cpu = smp_processor_id();
+ 
+       if (cpu == this_cpu) {
+               schedstat_inc(rq, ttwu_local);
+               schedstat_inc(p, se.statistics.nr_wakeups_local);
+       } else {
+               struct sched_domain *sd;
+ 
+               schedstat_inc(p, se.statistics.nr_wakeups_remote);
+               for_each_domain(this_cpu, sd) {
+                       if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
+                               schedstat_inc(sd, ttwu_wake_remote);
+                               break;
+                       }
+               }
+       }
+ #endif /* CONFIG_SMP */
+ 
+       schedstat_inc(rq, ttwu_count);
         schedstat_inc(p, se.statistics.nr_wakeups);
-       if (is_sync)
+ 
+       if (wake_flags & WF_SYNC)
                 schedstat_inc(p, se.statistics.nr_wakeups_sync);
-       if (is_migrate)
+ 
+       if (cpu != task_cpu(p))
                 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
-       if (is_local)
-               schedstat_inc(p, se.statistics.nr_wakeups_local);
-       else
-               schedstat_inc(p, se.statistics.nr_wakeups_remote);
   
+ #endif /* CONFIG_SCHEDSTATS */
+ }
+ 
+ static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
+ {
         activate_task(rq, p, en_flags);
+       p->on_rq = 1;
+ 
+       /* if a worker is waking up, notify workqueue */
+       if (p->flags & PF_WQ_WORKER)
+               wq_worker_waking_up(p, cpu_of(rq));
   }
   
- static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
-                                       int wake_flags, bool success)
+ /*
+  * Mark the task runnable and perform wakeup-preemption.
+  */
+ static void
+ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
   {
-       trace_sched_wakeup(p, success);
+       trace_sched_wakeup(p, true);
         check_preempt_curr(rq, p, wake_flags);
   
         p->state = TASK_RUNNING;
@@@ -2434,9 -2470,99 +2471,99 @@@
                 rq->idle_stamp = 0;
         }
   #endif
-       /* if a worker is waking up, notify workqueue */
-       if ((p->flags & PF_WQ_WORKER) && success)
-               wq_worker_waking_up(p, cpu_of(rq));
+ }
+ 
+ static void
+ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
+ {
+ #ifdef CONFIG_SMP
+       if (p->sched_contributes_to_load)
+               rq->nr_uninterruptible--;
+ #endif
+ 
+       ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
+       ttwu_do_wakeup(rq, p, wake_flags);
+ }
+ 
+ /*
+  * Called in case the task @p isn't fully descheduled from its runqueue,
+  * in this case we must do a remote wakeup. It's a 'light' wakeup though,
+  * since all we need to do is flip p->state to TASK_RUNNING, since
+  * the task is still ->on_rq.
+  */
+ static int ttwu_remote(struct task_struct *p, int wake_flags)
+ {
+       struct rq *rq;
+       int ret = 0;
+ 
+       rq = __task_rq_lock(p);
+       if (p->on_rq) {
+               ttwu_do_wakeup(rq, p, wake_flags);
+               ret = 1;
+       }
+       __task_rq_unlock(rq);
+ 
+       return ret;
+ }
+ 
+ #ifdef CONFIG_SMP
+ static void sched_ttwu_pending(void)
+ {
+       struct rq *rq = this_rq();
+       struct task_struct *list = xchg(&rq->wake_list, NULL);
+ 
+       if (!list)
+               return;
+ 
+       raw_spin_lock(&rq->lock);
+ 
+       while (list) {
+               struct task_struct *p = list;
+               list = list->wake_entry;
+               ttwu_do_activate(rq, p, 0);
+       }
+ 
+       raw_spin_unlock(&rq->lock);
+ }
+ 
+ void scheduler_ipi(void)
+ {
+       sched_ttwu_pending();
+ }
+ 
+ static void ttwu_queue_remote(struct task_struct *p, int cpu)
+ {
+       struct rq *rq = cpu_rq(cpu);
+       struct task_struct *next = rq->wake_list;
+ 
+       for (;;) {
+               struct task_struct *old = next;
+ 
+               p->wake_entry = next;
+               next = cmpxchg(&rq->wake_list, old, p);
+               if (next == old)
+                       break;
+       }
+ 
+       if (!next)
+               smp_send_reschedule(cpu);
+ }
+ #endif
+ 
+ static void ttwu_queue(struct task_struct *p, int cpu)
+ {
+       struct rq *rq = cpu_rq(cpu);
+ 
+ #if defined(CONFIG_SMP) && defined(CONFIG_SCHED_TTWU_QUEUE)
+       if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
+               ttwu_queue_remote(p, cpu);
+               return;
+       }
+ #endif
+ 
+       raw_spin_lock(&rq->lock);
+       ttwu_do_activate(rq, p, 0);
+       raw_spin_unlock(&rq->lock);
   }
   
   /**
@@@ -2454,92 -2580,64 +2581,64 @@@
    * Returns %true if @p was woken up, %false if it was already running
    * or @state didn't match @p's state.
    */
- static int try_to_wake_up(struct task_struct *p, unsigned int state,
-                         int wake_flags)
+ static int
+ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
   {
-       int cpu, orig_cpu, this_cpu, success = 0;
         unsigned long flags;
-       unsigned long en_flags = ENQUEUE_WAKEUP;
-       struct rq *rq;
- 
-       this_cpu = get_cpu();
+       int cpu, success = 0;
   
         smp_wmb();
-       rq = task_rq_lock(p, &flags);
+       raw_spin_lock_irqsave(&p->pi_lock, flags);
         if (!(p->state & state))
                 goto out;
   
-       if (p->se.on_rq)
-               goto out_running;
- 
+       success = 1; /* we're going to change ->state */
         cpu = task_cpu(p);
-       orig_cpu = cpu;
   
- #ifdef CONFIG_SMP
-       if (unlikely(task_running(rq, p)))
-               goto out_activate;
+       if (p->on_rq && ttwu_remote(p, wake_flags))
+               goto stat;
   
+ #ifdef CONFIG_SMP
         /*
-        * In order to handle concurrent wakeups and release the rq->lock
-        * we put the task in TASK_WAKING state.
-        *
-        * First fix up the nr_uninterruptible count:
+        * If the owning (remote) cpu is still in the middle of schedule() with
+        * this task as prev, wait until it's done referencing the task.
          */
-       if (task_contributes_to_load(p)) {
-               if (likely(cpu_online(orig_cpu)))
-                       rq->nr_uninterruptible--;
-               else
-                       this_rq()->nr_uninterruptible--;
-       }
-       p->state = TASK_WAKING;
- 
-       if (p->sched_class->task_waking) {
-               p->sched_class->task_waking(rq, p);
-               en_flags |= ENQUEUE_WAKING;
+       while (p->on_cpu) {
+ #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+               /*
+                * If called from interrupt context we could have landed in the
+                * middle of schedule(), in this case we should take care not
+                * to spin on ->on_cpu if p is current, since that would
+                * deadlock.
+                */
+               if (p == current) {
+                       ttwu_queue(p, cpu);
+                       goto stat;
+               }
+ #endif
+               cpu_relax();
         }
- 
-       cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
-       if (cpu != orig_cpu)
-               set_task_cpu(p, cpu);
-       __task_rq_unlock(rq);
- 
-       rq = cpu_rq(cpu);
-       raw_spin_lock(&rq->lock);
- 
         /*
-        * We migrated the task without holding either rq->lock, however
-        * since the task is not on the task list itself, nobody else
-        * will try and migrate the task, hence the rq should match the
-        * cpu we just moved it to.
+        * Pairs with the smp_wmb() in finish_lock_switch().
          */
-       WARN_ON(task_cpu(p) != cpu);
-       WARN_ON(p->state != TASK_WAKING);
+       smp_rmb();
   
- #ifdef CONFIG_SCHEDSTATS
-       schedstat_inc(rq, ttwu_count);
-       if (cpu == this_cpu)
-               schedstat_inc(rq, ttwu_local);
-       else {
-               struct sched_domain *sd;
-               for_each_domain(this_cpu, sd) {
-                       if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
-                               schedstat_inc(sd, ttwu_wake_remote);
-                               break;
-                       }
-               }
-       }
- #endif /* CONFIG_SCHEDSTATS */
+       p->sched_contributes_to_load = !!task_contributes_to_load(p);
+       p->state = TASK_WAKING;
   
- out_activate:
+       if (p->sched_class->task_waking)
+               p->sched_class->task_waking(p);
+ 
+       cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
+       if (task_cpu(p) != cpu)
+               set_task_cpu(p, cpu);
   #endif /* CONFIG_SMP */
-       ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu,
-                     cpu == this_cpu, en_flags);
-       success = 1;
- out_running:
-       ttwu_post_activation(p, rq, wake_flags, success);
+ 
+       ttwu_queue(p, cpu);
+ stat:
+       ttwu_stat(p, cpu, wake_flags);
   out:
-       task_rq_unlock(rq, &flags);
-       put_cpu();
+       raw_spin_unlock_irqrestore(&p->pi_lock, flags);
   
         return success;
   }
@@@ -2548,31 -2646,34 +2647,34 @@@
    * try_to_wake_up_local - try to wake up a local task with rq lock held
    * @p: the thread to be awakened
    *
-  * Put @p on the run-queue if it's not already there.  The caller must
+  * Put @p on the run-queue if it's not already there. The caller must
    * ensure that this_rq() is locked, @p is bound to this_rq() and not
-  * the current task.  this_rq() stays locked over invocation.
+  * the current task.
    */
   static void try_to_wake_up_local(struct task_struct *p)
   {
         struct rq *rq = task_rq(p);
-       bool success = false;
   
         BUG_ON(rq != this_rq());
         BUG_ON(p == current);
         lockdep_assert_held(&rq->lock);
   
+       if (!raw_spin_trylock(&p->pi_lock)) {
+               raw_spin_unlock(&rq->lock);
+               raw_spin_lock(&p->pi_lock);
+               raw_spin_lock(&rq->lock);
+       }
+ 
         if (!(p->state & TASK_NORMAL))
-               return;
+               goto out;
   
-       if (!p->se.on_rq) {
-               if (likely(!task_running(rq, p))) {
-                       schedstat_inc(rq, ttwu_count);
-                       schedstat_inc(rq, ttwu_local);
-               }
-               ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP);
-               success = true;
-       }
-       ttwu_post_activation(p, rq, 0, success);
+       if (!p->on_rq)
+               ttwu_activate(rq, p, ENQUEUE_WAKEUP);
+ 
+       ttwu_do_wakeup(rq, p, 0);
+       ttwu_stat(p, smp_processor_id(), 0);
+ out:
+       raw_spin_unlock(&p->pi_lock);
   }
   
   /**
@@@ -2605,19 -2706,21 +2707,21 @@@ int wake_up_state(struct task_struct *p
    */
   static void __sched_fork(struct task_struct *p)
   {
+       p->on_rq                        = 0;
+ 
+       p->se.on_rq                     = 0;
         p->se.exec_start                = 0;
         p->se.sum_exec_runtime          = 0;
         p->se.prev_sum_exec_runtime     = 0;
         p->se.nr_migrations             = 0;
         p->se.vruntime                  = 0;
+       INIT_LIST_HEAD(&p->se.group_node);
   
   #ifdef CONFIG_SCHEDSTATS
         memset(&p->se.statistics, 0, sizeof(p->se.statistics));
   #endif
   
         INIT_LIST_HEAD(&p->rt.run_list);
-       p->se.on_rq = 0;
-       INIT_LIST_HEAD(&p->se.group_node);
   
   #ifdef CONFIG_PREEMPT_NOTIFIERS
         INIT_HLIST_HEAD(&p->preempt_notifiers);
@@@ -2629,6 -2732,7 +2733,7 @@@
    */
   void sched_fork(struct task_struct *p, int clone_flags)
   {
+       unsigned long flags;
         int cpu = get_cpu();
   
         __sched_fork(p);
@@@ -2679,16 -2783,16 +2784,16 @@@
          *
          * Silence PROVE_RCU.
          */
-       rcu_read_lock();
+       raw_spin_lock_irqsave(&p->pi_lock, flags);
         set_task_cpu(p, cpu);
-       rcu_read_unlock();
+       raw_spin_unlock_irqrestore(&p->pi_lock, flags);
   
   #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
         if (likely(sched_info_on()))
                 memset(&p->sched_info, 0, sizeof(p->sched_info));
   #endif
- #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
-       p->oncpu = 0;
+ #if defined(CONFIG_SMP)
+       p->on_cpu = 0;
   #endif
   #ifdef CONFIG_PREEMPT
         /* Want to start with kernel preemption disabled. */
@@@ -2712,37 -2816,27 +2817,27 @@@ void wake_up_new_task(struct task_struc
   {
         unsigned long flags;
         struct rq *rq;
-       int cpu __maybe_unused = get_cpu();
   
+       raw_spin_lock_irqsave(&p->pi_lock, flags);
   #ifdef CONFIG_SMP
-       rq = task_rq_lock(p, &flags);
-       p->state = TASK_WAKING;
- 
         /*
          * Fork balancing, do it here and not earlier because:
          *  - cpus_allowed can change in the fork path
          *  - any previously selected cpu might disappear through hotplug
-        *
-        * We set TASK_WAKING so that select_task_rq() can drop rq->lock
-        * without people poking at ->cpus_allowed.
          */
-       cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0);
-       set_task_cpu(p, cpu);
- 
-       p->state = TASK_RUNNING;
-       task_rq_unlock(rq, &flags);
+       set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
   #endif
   
-       rq = task_rq_lock(p, &flags);
+       rq = __task_rq_lock(p);
         activate_task(rq, p, 0);
-       trace_sched_wakeup_new(p, 1);
+       p->on_rq = 1;
+       trace_sched_wakeup_new(p, true);
         check_preempt_curr(rq, p, WF_FORK);
   #ifdef CONFIG_SMP
         if (p->sched_class->task_woken)
                 p->sched_class->task_woken(rq, p);
   #endif
-       task_rq_unlock(rq, &flags);
-       put_cpu();
+       task_rq_unlock(rq, p, &flags);
   }
   
   #ifdef CONFIG_PREEMPT_NOTIFIERS
@@@ -3451,27 -3545,22 +3546,22 @@@ void sched_exec(void
   {
         struct task_struct *p = current;
         unsigned long flags;
-       struct rq *rq;
         int dest_cpu;
   
-       rq = task_rq_lock(p, &flags);
-       dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0);
+       raw_spin_lock_irqsave(&p->pi_lock, flags);
+       dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
         if (dest_cpu == smp_processor_id())
                 goto unlock;
   
-       /*
-        * select_task_rq() can race against ->cpus_allowed
-        */
-       if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
-           likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) {
+       if (likely(cpu_active(dest_cpu))) {
                 struct migration_arg arg = { p, dest_cpu };
   
-               task_rq_unlock(rq, &flags);
-               stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
+               raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+               stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
                 return;
         }
   unlock:
-       task_rq_unlock(rq, &flags);
+       raw_spin_unlock_irqrestore(&p->pi_lock, flags);
   }
   
   #endif
@@@ -3508,7 -3597,7 +3598,7 @@@ unsigned long long task_delta_exec(stru
   
         rq = task_rq_lock(p, &flags);
         ns = do_task_delta_exec(p, rq);
-       task_rq_unlock(rq, &flags);
+       task_rq_unlock(rq, p, &flags);
   
         return ns;
   }
@@@ -3526,7 -3615,7 +3616,7 @@@ unsigned long long task_sched_runtime(s
   
         rq = task_rq_lock(p, &flags);
         ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
-       task_rq_unlock(rq, &flags);
+       task_rq_unlock(rq, p, &flags);
   
         return ns;
   }
@@@ -3550,7 -3639,7 +3640,7 @@@ unsigned long long thread_group_sched_r
         rq = task_rq_lock(p, &flags);
         thread_group_cputime(p, &totals);
         ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
-       task_rq_unlock(rq, &flags);
+       task_rq_unlock(rq, p, &flags);
   
         return ns;
   }
@@@ -4036,7 -4125,7 +4126,7 @@@ static inline void schedule_debug(struc
   
   static void put_prev_task(struct rq *rq, struct task_struct *prev)
   {
-       if (prev->se.on_rq)
+       if (prev->on_rq)
                 update_rq_clock(rq);
         prev->sched_class->put_prev_task(rq, prev);
   }
@@@ -4098,11 -4187,13 +4188,13 @@@ need_resched
                 if (unlikely(signal_pending_state(prev->state, prev))) {
                         prev->state = TASK_RUNNING;
                 } else {
+                       deactivate_task(rq, prev, DEQUEUE_SLEEP);
+                       prev->on_rq = 0;
+ 
                         /*
-                        * If a worker is going to sleep, notify and
-                        * ask workqueue whether it wants to wake up a
-                        * task to maintain concurrency.  If so, wake
-                        * up the task.
+                        * If a worker went to sleep, notify and ask workqueue
+                        * whether it wants to wake up a task to maintain
+                        * concurrency.
                          */
                         if (prev->flags & PF_WQ_WORKER) {
                                 struct task_struct *to_wakeup;
@@@ -4111,21 -4202,20 +4203,20 @@@
                                 if (to_wakeup)
                                         try_to_wake_up_local(to_wakeup);
                         }
-                       deactivate_task(rq, prev, DEQUEUE_SLEEP);
+ 
+                       /*
+                        * If we are going to sleep and we have plugged IO
+                        * queued, make sure to submit it to avoid deadlocks.
+                        */
+                       if (blk_needs_flush_plug(prev)) {
+                               raw_spin_unlock(&rq->lock);
+                               blk_flush_plug(prev);
+                               raw_spin_lock(&rq->lock);
+                       }
                 }
                 switch_count = &prev->nvcsw;
         }
   
-       /*
-        * If we are going to sleep and we have plugged IO queued, make
-        * sure to submit it to avoid deadlocks.
-        */
-       if (prev->state != TASK_RUNNING && blk_needs_flush_plug(prev)) {
-               raw_spin_unlock(&rq->lock);
-               blk_flush_plug(prev);
-               raw_spin_lock(&rq->lock);
-       }
- 
         pre_schedule(rq, prev);
   
         if (unlikely(!rq->nr_running))
@@@ -4162,70 -4252,53 +4253,53 @@@
   EXPORT_SYMBOL(schedule);
   
   #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
- /*
-  * Look out! "owner" is an entirely speculative pointer
-  * access and not reliable.
-  */
- int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
- {
-       unsigned int cpu;
-       struct rq *rq;
   
-       if (!sched_feat(OWNER_SPIN))
-               return 0;
+ static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
+ {
+       bool ret = false;
   
- #ifdef CONFIG_DEBUG_PAGEALLOC
-       /*
-        * Need to access the cpu field knowing that
-        * DEBUG_PAGEALLOC could have unmapped it if
-        * the mutex owner just released it and exited.
-        */
-       if (probe_kernel_address(&owner->cpu, cpu))
-               return 0;
- #else
-       cpu = owner->cpu;
- #endif
+       rcu_read_lock();
+       if (lock->owner != owner)
+               goto fail;
   
         /*
-        * Even if the access succeeded (likely case),
-        * the cpu field may no longer be valid.
+        * Ensure we emit the owner->on_cpu dereference _after_ checking
+        * that lock->owner still matches owner. If that check fails, owner
+        * might point to free()d memory; if it still matches, the
+        * rcu_read_lock() ensures the memory stays valid.
          */
-       if (cpu >= nr_cpumask_bits)
-               return 0;
+       barrier();
   
-       /*
-        * We need to validate that we can do a
-        * get_cpu() and that we have the percpu area.
-        */
-       if (!cpu_online(cpu))
-               return 0;
+       ret = owner->on_cpu;
+ fail:
+       rcu_read_unlock();
   
-       rq = cpu_rq(cpu);
+       return ret;
+ }
   
-       for (;;) {
-               /*
-                * Owner changed, break to re-assess state.
-                */
-               if (lock->owner != owner) {
-                       /*
-                        * If the lock has switched to a different owner,
-                        * we likely have heavy contention. Return 0 to quit
-                        * optimistic spinning and not contend further:
-                        */
-                       if (lock->owner)
-                               return 0;
-                       break;
-               }
+ /*
+  * Look out! "owner" is an entirely speculative pointer
+  * access and not reliable.
+  */
+ int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
+ {
+       if (!sched_feat(OWNER_SPIN))
+               return 0;
   
-               /*
-                * Is that owner really running on that cpu?
-                */
-               if (task_thread_info(rq->curr) != owner || need_resched())
+       while (owner_running(lock, owner)) {
+               if (need_resched())
                         return 0;
   
                 arch_mutex_cpu_relax();
         }
   
+       /*
+        * If the owner changed to another task there is likely
+        * heavy contention, stop spinning.
+        */
+       if (lock->owner)
+               return 0;
+ 
         return 1;
   }
   #endif
@@@ -4685,19 -4758,18 +4759,18 @@@ EXPORT_SYMBOL(sleep_on_timeout)
    */
   void rt_mutex_setprio(struct task_struct *p, int prio)
   {
-       unsigned long flags;
         int oldprio, on_rq, running;
         struct rq *rq;
         const struct sched_class *prev_class;
   
         BUG_ON(prio < 0 || prio > MAX_PRIO);
   
-       rq = task_rq_lock(p, &flags);
+       rq = __task_rq_lock(p);
   
         trace_sched_pi_setprio(p, prio);
         oldprio = p->prio;
         prev_class = p->sched_class;
-       on_rq = p->se.on_rq;
+       on_rq = p->on_rq;
         running = task_current(rq, p);
         if (on_rq)
                 dequeue_task(rq, p, 0);
@@@ -4717,7 -4789,7 +4790,7 @@@
                 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
   
         check_class_changed(rq, p, prev_class, oldprio);
-       task_rq_unlock(rq, &flags);
+       __task_rq_unlock(rq);
   }
   
   #endif
@@@ -4745,7 -4817,7 +4818,7 @@@ void set_user_nice(struct task_struct *
                 p->static_prio = NICE_TO_PRIO(nice);
                 goto out_unlock;
         }
-       on_rq = p->se.on_rq;
+       on_rq = p->on_rq;
         if (on_rq)
                 dequeue_task(rq, p, 0);
   
@@@ -4765,7 -4837,7 +4838,7 @@@
                         resched_task(rq->curr);
         }
   out_unlock:
-       task_rq_unlock(rq, &flags);
+       task_rq_unlock(rq, p, &flags);
   }
   EXPORT_SYMBOL(set_user_nice);
   
@@@ -4879,8 -4951,6 +4952,6 @@@ static struct task_struct *find_process
   static void
   __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
   {
-       BUG_ON(p->se.on_rq);
- 
         p->policy = policy;
         p->rt_priority = prio;
         p->normal_prio = normal_prio(p);
@@@ -4995,20 -5065,17 +5066,17 @@@ recheck
         /*
          * make sure no PI-waiters arrive (or leave) while we are
          * changing the priority of the task:
-        */
-       raw_spin_lock_irqsave(&p->pi_lock, flags);
-       /*
+        *
          * To be able to change p->policy safely, the appropriate
          * runqueue lock must be held.
          */
-       rq = __task_rq_lock(p);
+       rq = task_rq_lock(p, &flags);
   
         /*
          * Changing the policy of the stop threads its a very bad idea
          */
         if (p == rq->stop) {
-               __task_rq_unlock(rq);
-               raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+               task_rq_unlock(rq, p, &flags);
                 return -EINVAL;
         }
   
@@@ -5032,8 -5099,7 +5100,7 @@@
                 if (rt_bandwidth_enabled() && rt_policy(policy) &&
                                 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
                                 !task_group_is_autogroup(task_group(p))) {
-                       __task_rq_unlock(rq);
-                       raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+                       task_rq_unlock(rq, p, &flags);
                         return -EPERM;
                 }
         }
@@@ -5042,11 -5108,10 +5109,10 @@@
         /* recheck policy now with rq lock held */
         if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
                 policy = oldpolicy = -1;
-               __task_rq_unlock(rq);
-               raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+               task_rq_unlock(rq, p, &flags);
                 goto recheck;
         }
-       on_rq = p->se.on_rq;
+       on_rq = p->on_rq;
         running = task_current(rq, p);
         if (on_rq)
                 deactivate_task(rq, p, 0);
@@@ -5065,8 -5130,7 +5131,7 @@@
                 activate_task(rq, p, 0);
   
         check_class_changed(rq, p, prev_class, oldprio);
-       __task_rq_unlock(rq);
-       raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+       task_rq_unlock(rq, p, &flags);
   
         rt_mutex_adjust_pi(p);
   
@@@ -5317,7 -5381,6 +5382,6 @@@ long sched_getaffinity(pid_t pid, struc
   {
         struct task_struct *p;
         unsigned long flags;
-       struct rq *rq;
         int retval;
   
         get_online_cpus();
@@@ -5332,9 -5395,9 +5396,9 @@@
         if (retval)
                 goto out_unlock;
   
-       rq = task_rq_lock(p, &flags);
+       raw_spin_lock_irqsave(&p->pi_lock, flags);
         cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
-       task_rq_unlock(rq, &flags);
+       raw_spin_unlock_irqrestore(&p->pi_lock, flags);
   
   out_unlock:
         rcu_read_unlock();
@@@ -5659,7 -5722,7 +5723,7 @@@ SYSCALL_DEFINE2(sched_rr_get_interval, 
   
         rq = task_rq_lock(p, &flags);
         time_slice = p->sched_class->get_rr_interval(rq, p);
-       task_rq_unlock(rq, &flags);
+       task_rq_unlock(rq, p, &flags);
   
         rcu_read_unlock();
         jiffies_to_timespec(time_slice, &t);
@@@ -5777,8 -5840,8 +5841,8 @@@ void __cpuinit init_idle(struct task_st
         rcu_read_unlock();
   
         rq->curr = rq->idle = idle;
- #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
-       idle->oncpu = 1;
+ #if defined(CONFIG_SMP)
+       idle->on_cpu = 1;
   #endif
         raw_spin_unlock_irqrestore(&rq->lock, flags);
   
@@@ -5882,18 -5945,7 +5946,7 @@@ int set_cpus_allowed_ptr(struct task_st
         unsigned int dest_cpu;
         int ret = 0;
   
-       /*
-        * Serialize against TASK_WAKING so that ttwu() and wunt() can
-        * drop the rq->lock and still rely on ->cpus_allowed.
-        */
- again:
-       while (task_is_waking(p))
-               cpu_relax();
         rq = task_rq_lock(p, &flags);
-       if (task_is_waking(p)) {
-               task_rq_unlock(rq, &flags);
-               goto again;
-       }
   
         if (!cpumask_intersects(new_mask, cpu_active_mask)) {
                 ret = -EINVAL;
@@@ -5918,16 -5970,16 +5971,16 @@@
                 goto out;
   
         dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
-       if (migrate_task(p, rq)) {
+       if (p->on_rq) {
                 struct migration_arg arg = { p, dest_cpu };
                 /* Need help from migration thread: drop lock and wait. */
-               task_rq_unlock(rq, &flags);
+               task_rq_unlock(rq, p, &flags);
                 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
                 tlb_migrate_finish(p->mm);
                 return 0;
         }
   out:
-       task_rq_unlock(rq, &flags);
+       task_rq_unlock(rq, p, &flags);
   
         return ret;
   }
@@@ -5955,6 -6007,7 +6008,7 @@@ static int __migrate_task(struct task_s
         rq_src = cpu_rq(src_cpu);
         rq_dest = cpu_rq(dest_cpu);
   
+       raw_spin_lock(&p->pi_lock);
         double_rq_lock(rq_src, rq_dest);
         /* Already moved. */
         if (task_cpu(p) != src_cpu)
@@@ -5967,7 -6020,7 +6021,7 @@@
          * If we're not on a rq, the next wake-up will ensure we're
          * placed properly.
          */
-       if (p->se.on_rq) {
+       if (p->on_rq) {
                 deactivate_task(rq_src, p, 0);
                 set_task_cpu(p, dest_cpu);
                 activate_task(rq_dest, p, 0);
@@@ -5977,6 -6030,7 +6031,7 @@@ done
         ret = 1;
   fail:
         double_rq_unlock(rq_src, rq_dest);
+       raw_spin_unlock(&p->pi_lock);
         return ret;
   }
   
@@@ -6317,6 -6371,7 +6372,7 @@@ migration_call(struct notifier_block *n
   
   #ifdef CONFIG_HOTPLUG_CPU
         case CPU_DYING:
+               sched_ttwu_pending();
                 /* Update our root-domain */
                 raw_spin_lock_irqsave(&rq->lock, flags);
                 if (rq->rd) {
@@@ -6395,8 -6450,6 +6451,8 @@@ early_initcall(migration_init)
   
   #ifdef CONFIG_SMP
   
+ +static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
+ +
   #ifdef CONFIG_SCHED_DEBUG
   
   static __read_mostly int sched_domain_debug_enabled;
@@@ -6492,6 -6545,7 +6548,6 @@@ static int sched_domain_debug_one(struc
   
   static void sched_domain_debug(struct sched_domain *sd, int cpu)
   {
- -      cpumask_var_t groupmask;
         int level = 0;
   
         if (!sched_domain_debug_enabled)
@@@ -6504,14 -6558,20 +6560,14 @@@
   
         printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
   
- -      if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
- -              printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
- -              return;
- -      }
- -
         for (;;) {
- -              if (sched_domain_debug_one(sd, cpu, level, groupmask))
+ +              if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
                         break;
                 level++;
                 sd = sd->parent;
                 if (!sd)
                         break;
         }
- -      free_cpumask_var(groupmask);
   }
   #else /* !CONFIG_SCHED_DEBUG */
   # define sched_domain_debug(sd, cpu) do { } while (0)
@@@ -6568,11 -6628,12 +6624,11 @@@ sd_parent_degenerate(struct sched_domai
         return 1;
   }
   
- -static void free_rootdomain(struct root_domain *rd)
+ +static void free_rootdomain(struct rcu_head *rcu)
   {
- -      synchronize_sched();
+ +      struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
   
         cpupri_cleanup(&rd->cpupri);
- -
         free_cpumask_var(rd->rto_mask);
         free_cpumask_var(rd->online);
         free_cpumask_var(rd->span);
@@@ -6613,7 -6674,7 +6669,7 @@@ static void rq_attach_root(struct rq *r
         raw_spin_unlock_irqrestore(&rq->lock, flags);
   
         if (old_rd)
- -              free_rootdomain(old_rd);
+ +              call_rcu_sched(&old_rd->rcu, free_rootdomain);
   }
   
   static int init_rootdomain(struct root_domain *rd)
@@@ -6664,25 -6725,6 +6720,25 @@@ static struct root_domain *alloc_rootdo
         return rd;
   }
   
+ +static void free_sched_domain(struct rcu_head *rcu)
+ +{
+ +      struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
+ +      if (atomic_dec_and_test(&sd->groups->ref))
+ +              kfree(sd->groups);
+ +      kfree(sd);
+ +}
+ +
+ +static void destroy_sched_domain(struct sched_domain *sd, int cpu)
+ +{
+ +      call_rcu(&sd->rcu, free_sched_domain);
+ +}
+ +
+ +static void destroy_sched_domains(struct sched_domain *sd, int cpu)
+ +{
+ +      for (; sd; sd = sd->parent)
+ +              destroy_sched_domain(sd, cpu);
+ +}
+ +
   /*
    * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
    * hold the hotplug lock.
@@@ -6693,6 -6735,9 +6749,6 @@@ cpu_attach_domain(struct sched_domain *
         struct rq *rq = cpu_rq(cpu);
         struct sched_domain *tmp;
   
- -      for (tmp = sd; tmp; tmp = tmp->parent)
- -              tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
- -
         /* Remove the sched domains which do not contribute to scheduling. */
         for (tmp = sd; tmp; ) {
                 struct sched_domain *parent = tmp->parent;
@@@ -6703,15 -6748,12 +6759,15 @@@
                         tmp->parent = parent->parent;
                         if (parent->parent)
                                 parent->parent->child = tmp;
+ +                      destroy_sched_domain(parent, cpu);
                 } else
                         tmp = tmp->parent;
         }
   
         if (sd && sd_degenerate(sd)) {
+ +              tmp = sd;
                 sd = sd->parent;
+ +              destroy_sched_domain(tmp, cpu);
                 if (sd)
                         sd->child = NULL;
         }
@@@ -6719,9 -6761,7 +6775,9 @@@
         sched_domain_debug(sd, cpu);
   
         rq_attach_root(rq, rd);
+ +      tmp = rq->sd;
         rcu_assign_pointer(rq->sd, sd);
+ +      destroy_sched_domains(tmp, cpu);
   }
   
   /* cpus with isolated domains */
@@@ -6737,6 -6777,56 +6793,6 @@@ static int __init isolated_cpu_setup(ch
   
   __setup("isolcpus=", isolated_cpu_setup);
   
- -/*
- - * init_sched_build_groups takes the cpumask we wish to span, and a pointer
- - * to a function which identifies what group(along with sched group) a CPU
- - * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
- - * (due to the fact that we keep track of groups covered with a struct cpumask).
- - *
- - * init_sched_build_groups will build a circular linked list of the groups
- - * covered by the given span, and will set each group's ->cpumask correctly,
- - * and ->cpu_power to 0.
- - */
- -static void
- -init_sched_build_groups(const struct cpumask *span,
- -                      const struct cpumask *cpu_map,
- -                      int (*group_fn)(int cpu, const struct cpumask *cpu_map,
- -                                      struct sched_group **sg,
- -                                      struct cpumask *tmpmask),
- -                      struct cpumask *covered, struct cpumask *tmpmask)
- -{
- -      struct sched_group *first = NULL, *last = NULL;
- -      int i;
- -
- -      cpumask_clear(covered);
- -
- -      for_each_cpu(i, span) {
- -              struct sched_group *sg;
- -              int group = group_fn(i, cpu_map, &sg, tmpmask);
- -              int j;
- -
- -              if (cpumask_test_cpu(i, covered))
- -                      continue;
- -
- -              cpumask_clear(sched_group_cpus(sg));
- -              sg->cpu_power = 0;
- -
- -              for_each_cpu(j, span) {
- -                      if (group_fn(j, cpu_map, NULL, tmpmask) != group)
- -                              continue;
- -
- -                      cpumask_set_cpu(j, covered);
- -                      cpumask_set_cpu(j, sched_group_cpus(sg));
- -              }
- -              if (!first)
- -                      first = sg;
- -              if (last)
- -                      last->next = sg;
- -              last = sg;
- -      }
- -      last->next = first;
- -}
- -
   #define SD_NODES_PER_DOMAIN 16
   
   #ifdef CONFIG_NUMA
@@@ -6807,125 -6897,311 +6863,125 @@@ static void sched_domain_node_span(int 
                 cpumask_or(span, span, cpumask_of_node(next_node));
         }
   }
+ +
+ +static const struct cpumask *cpu_node_mask(int cpu)
+ +{
+ +      lockdep_assert_held(&sched_domains_mutex);
+ +
+ +      sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
+ +
+ +      return sched_domains_tmpmask;
+ +}
+ +
+ +static const struct cpumask *cpu_allnodes_mask(int cpu)
+ +{
+ +      return cpu_possible_mask;
+ +}
   #endif /* CONFIG_NUMA */
   
- -int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
+ +static const struct cpumask *cpu_cpu_mask(int cpu)
+ +{
+ +      return cpumask_of_node(cpu_to_node(cpu));
+ +}
   
- -/*
- - * The cpus mask in sched_group and sched_domain hangs off the end.
- - *
- - * ( See the the comments in include/linux/sched.h:struct sched_group
- - *   and struct sched_domain. )
- - */
- -struct static_sched_group {
- -      struct sched_group sg;
- -      DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
- -};
+ +int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
   
- -struct static_sched_domain {
- -      struct sched_domain sd;
- -      DECLARE_BITMAP(span, CONFIG_NR_CPUS);
+ +struct sd_data {
+ +      struct sched_domain **__percpu sd;
+ +      struct sched_group **__percpu sg;
   };
   
   struct s_data {
- -#ifdef CONFIG_NUMA
- -      int                     sd_allnodes;
- -      cpumask_var_t           domainspan;
- -      cpumask_var_t           covered;
- -      cpumask_var_t           notcovered;
- -#endif
- -      cpumask_var_t           nodemask;
- -      cpumask_var_t           this_sibling_map;
- -      cpumask_var_t           this_core_map;
- -      cpumask_var_t           this_book_map;
- -      cpumask_var_t           send_covered;
- -      cpumask_var_t           tmpmask;
- -      struct sched_group      **sched_group_nodes;
+ +      struct sched_domain ** __percpu sd;
         struct root_domain      *rd;
   };
   
   enum s_alloc {
- -      sa_sched_groups = 0,
         sa_rootdomain,
- -      sa_tmpmask,
- -      sa_send_covered,
- -      sa_this_book_map,
- -      sa_this_core_map,
- -      sa_this_sibling_map,
- -      sa_nodemask,
- -      sa_sched_group_nodes,
- -#ifdef CONFIG_NUMA
- -      sa_notcovered,
- -      sa_covered,
- -      sa_domainspan,
- -#endif
+ +      sa_sd,
+ +      sa_sd_storage,
         sa_none,
   };
   
- -/*
- - * SMT sched-domains:
- - */
- -#ifdef CONFIG_SCHED_SMT
- -static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
- -static DEFINE_PER_CPU(struct static_sched_group, sched_groups);
- -
- -static int
- -cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
- -               struct sched_group **sg, struct cpumask *unused)
- -{
- -      if (sg)
- -              *sg = &per_cpu(sched_groups, cpu).sg;
- -      return cpu;
- -}
- -#endif /* CONFIG_SCHED_SMT */
+ +struct sched_domain_topology_level;
   
- -/*
- - * multi-core sched-domains:
- - */
- -#ifdef CONFIG_SCHED_MC
- -static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
- -static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
+ +typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
+ +typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
   
- -static int
- -cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
- -                struct sched_group **sg, struct cpumask *mask)
- -{
- -      int group;
- -#ifdef CONFIG_SCHED_SMT
- -      cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
- -      group = cpumask_first(mask);
- -#else
- -      group = cpu;
- -#endif
- -      if (sg)
- -              *sg = &per_cpu(sched_group_core, group).sg;
- -      return group;
- -}
- -#endif /* CONFIG_SCHED_MC */
+ +struct sched_domain_topology_level {
+ +      sched_domain_init_f init;
+ +      sched_domain_mask_f mask;
+ +      struct sd_data      data;
+ +};
   
   /*
- - * book sched-domains:
+ + * Assumes the sched_domain tree is fully constructed
    */
- -#ifdef CONFIG_SCHED_BOOK
- -static DEFINE_PER_CPU(struct static_sched_domain, book_domains);
- -static DEFINE_PER_CPU(struct static_sched_group, sched_group_book);
- -
- -static int
- -cpu_to_book_group(int cpu, const struct cpumask *cpu_map,
- -                struct sched_group **sg, struct cpumask *mask)
+ +static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
   {
- -      int group = cpu;
- -#ifdef CONFIG_SCHED_MC
- -      cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
- -      group = cpumask_first(mask);
- -#elif defined(CONFIG_SCHED_SMT)
- -      cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
- -      group = cpumask_first(mask);
- -#endif
- -      if (sg)
- -              *sg = &per_cpu(sched_group_book, group).sg;
- -      return group;
- -}
- -#endif /* CONFIG_SCHED_BOOK */
+ +      struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
+ +      struct sched_domain *child = sd->child;
   
- -static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
- -static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
+ +      if (child)
+ +              cpu = cpumask_first(sched_domain_span(child));
   
- -static int
- -cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
- -                struct sched_group **sg, struct cpumask *mask)
- -{
- -      int group;
- -#ifdef CONFIG_SCHED_BOOK
- -      cpumask_and(mask, cpu_book_mask(cpu), cpu_map);
- -      group = cpumask_first(mask);
- -#elif defined(CONFIG_SCHED_MC)
- -      cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
- -      group = cpumask_first(mask);
- -#elif defined(CONFIG_SCHED_SMT)
- -      cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
- -      group = cpumask_first(mask);
- -#else
- -      group = cpu;
- -#endif
         if (sg)
- -              *sg = &per_cpu(sched_group_phys, group).sg;
- -      return group;
+ +              *sg = *per_cpu_ptr(sdd->sg, cpu);
+ +
+ +      return cpu;
   }
   
- -#ifdef CONFIG_NUMA
   /*
- - * The init_sched_build_groups can't handle what we want to do with node
- - * groups, so roll our own. Now each node has its own list of groups which
- - * gets dynamically allocated.
+ + * build_sched_groups takes the sched_domain whose span we wish to cover
+ + * and uses get_group() to identify what group (along with sched group) a
+ + * CPU belongs to. The return value of get_group() must be >= 0 and
+ + * < nr_cpu_ids (we keep track of the CPUs covered with a struct cpumask).
+ + *
+ + * build_sched_groups will build a circular linked list of the groups
+ + * covered by the given span, and will set each group's ->cpumask correctly,
+ + * and ->cpu_power to 0.
    */
- -static DEFINE_PER_CPU(struct static_sched_domain, node_domains);
- -static struct sched_group ***sched_group_nodes_bycpu;
- -
- -static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains);
- -static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
- -
- -static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
- -                               struct sched_group **sg,
- -                               struct cpumask *nodemask)
- -{
- -      int group;
- -
- -      cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map);
- -      group = cpumask_first(nodemask);
- -
- -      if (sg)
- -              *sg = &per_cpu(sched_group_allnodes, group).sg;
- -      return group;
- -}
- -
- -static void init_numa_sched_groups_power(struct sched_group *group_head)
- -{
- -      struct sched_group *sg = group_head;
- -      int j;
- -
- -      if (!sg)
- -              return;
- -      do {
- -              for_each_cpu(j, sched_group_cpus(sg)) {
- -                      struct sched_domain *sd;
- -
- -                      sd = &per_cpu(phys_domains, j).sd;
- -                      if (j != group_first_cpu(sd->groups)) {
- -                              /*
- -                               * Only add "power" once for each
- -                               * physical package.
- -                               */
- -                              continue;
- -                      }
- -
- -                      sg->cpu_power += sd->groups->cpu_power;
- -              }
- -              sg = sg->next;
- -      } while (sg != group_head);
- -}
- -
- -static int build_numa_sched_groups(struct s_data *d,
- -                                 const struct cpumask *cpu_map, int num)
+ +static void
+ +build_sched_groups(struct sched_domain *sd)
   {
- -      struct sched_domain *sd;
- -      struct sched_group *sg, *prev;
- -      int n, j;
- -
- -      cpumask_clear(d->covered);
- -      cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map);
- -      if (cpumask_empty(d->nodemask)) {
- -              d->sched_group_nodes[num] = NULL;
- -              goto out;
- -      }
- -
- -      sched_domain_node_span(num, d->domainspan);
- -      cpumask_and(d->domainspan, d->domainspan, cpu_map);
- -
- -      sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
- -                        GFP_KERNEL, num);
- -      if (!sg) {
- -              printk(KERN_WARNING "Can not alloc domain group for node %d\n",
- -                     num);
- -              return -ENOMEM;
- -      }
- -      d->sched_group_nodes[num] = sg;
- -
- -      for_each_cpu(j, d->nodemask) {
- -              sd = &per_cpu(node_domains, j).sd;
- -              sd->groups = sg;
- -      }
+ +      struct sched_group *first = NULL, *last = NULL;
+ +      struct sd_data *sdd = sd->private;
+ +      const struct cpumask *span = sched_domain_span(sd);
+ +      struct cpumask *covered;
+ +      int i;
   
- -      sg->cpu_power = 0;
- -      cpumask_copy(sched_group_cpus(sg), d->nodemask);
- -      sg->next = sg;
- -      cpumask_or(d->covered, d->covered, d->nodemask);
- -
- -      prev = sg;
- -      for (j = 0; j < nr_node_ids; j++) {
- -              n = (num + j) % nr_node_ids;
- -              cpumask_complement(d->notcovered, d->covered);
- -              cpumask_and(d->tmpmask, d->notcovered, cpu_map);
- -              cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
- -              if (cpumask_empty(d->tmpmask))
- -                      break;
- -              cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
- -              if (cpumask_empty(d->tmpmask))
- -                      continue;
- -              sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
- -                                GFP_KERNEL, num);
- -              if (!sg) {
- -                      printk(KERN_WARNING
- -                             "Can not alloc domain group for node %d\n", j);
- -                      return -ENOMEM;
- -              }
- -              sg->cpu_power = 0;
- -              cpumask_copy(sched_group_cpus(sg), d->tmpmask);
- -              sg->next = prev->next;
- -              cpumask_or(d->covered, d->covered, d->tmpmask);
- -              prev->next = sg;
- -              prev = sg;
- -      }
- -out:
- -      return 0;
- -}
- -#endif /* CONFIG_NUMA */
+ +      lockdep_assert_held(&sched_domains_mutex);
+ +      covered = sched_domains_tmpmask;
   
- -#ifdef CONFIG_NUMA
- -/* Free memory allocated for various sched_group structures */
- -static void free_sched_groups(const struct cpumask *cpu_map,
- -                            struct cpumask *nodemask)
- -{
- -      int cpu, i;
+ +      cpumask_clear(covered);
   
- -      for_each_cpu(cpu, cpu_map) {
- -              struct sched_group **sched_group_nodes
- -                      = sched_group_nodes_bycpu[cpu];
+ +      for_each_cpu(i, span) {
+ +              struct sched_group *sg;
+ +              int group = get_group(i, sdd, &sg);
+ +              int j;
   
- -              if (!sched_group_nodes)
+ +              if (cpumask_test_cpu(i, covered))
                         continue;
   
- -              for (i = 0; i < nr_node_ids; i++) {
- -                      struct sched_group *oldsg, *sg = sched_group_nodes[i];
+ +              cpumask_clear(sched_group_cpus(sg));
+ +              sg->cpu_power = 0;
   
- -                      cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
- -                      if (cpumask_empty(nodemask))
+ +              for_each_cpu(j, span) {
+ +                      if (get_group(j, sdd, NULL) != group)
                                 continue;
   
- -                      if (sg == NULL)
- -                              continue;
- -                      sg = sg->next;
- -next_sg:
- -                      oldsg = sg;
- -                      sg = sg->next;
- -                      kfree(oldsg);
- -                      if (oldsg != sched_group_nodes[i])
- -                              goto next_sg;
+ +                      cpumask_set_cpu(j, covered);
+ +                      cpumask_set_cpu(j, sched_group_cpus(sg));
                 }
- -              kfree(sched_group_nodes);
- -              sched_group_nodes_bycpu[cpu] = NULL;
+ +
+ +              if (!first)
+ +                      first = sg;
+ +              if (last)
+ +                      last->next = sg;
+ +              last = sg;
         }
+ +      last->next = first;
   }
- -#else /* !CONFIG_NUMA */
- -static void free_sched_groups(const struct cpumask *cpu_map,
- -                            struct cpumask *nodemask)
- -{
- -}
- -#endif /* CONFIG_NUMA */
   
   /*
    * Initialize sched groups cpu_power.
@@@ -6939,6 -7215,11 +6995,6 @@@
    */
   static void init_sched_groups_power(int cpu, struct sched_domain *sd)
   {
- -      struct sched_domain *child;
- -      struct sched_group *group;
- -      long power;
- -      int weight;
- -
         WARN_ON(!sd || !sd->groups);
   
         if (cpu != group_first_cpu(sd->groups))
@@@ -6946,7 -7227,36 +7002,7 @@@
   
         sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
   
- -      child = sd->child;
- -
- -      sd->groups->cpu_power = 0;
- -
- -      if (!child) {
- -              power = SCHED_LOAD_SCALE;
- -              weight = cpumask_weight(sched_domain_span(sd));
- -              /*
- -               * SMT siblings share the power of a single core.
- -               * Usually multiple threads get a better yield out of
- -               * that one core than a single thread would have,
- -               * reflect that in sd->smt_gain.
- -               */
- -              if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
- -                      power *= sd->smt_gain;
- -                      power /= weight;
- -                      power >>= SCHED_LOAD_SHIFT;
- -              }
- -              sd->groups->cpu_power += power;
- -              return;
- -      }
- -
- -      /*
- -       * Add cpu_power of each child group to this groups cpu_power.
- -       */
- -      group = child->groups;
- -      do {
- -              sd->groups->cpu_power += group->cpu_power;
- -              group = group->next;
- -      } while (group != child->groups);
+ +      update_group_power(sd, cpu);
   }
   
   /*
@@@ -6960,15 -7270,15 +7016,15 @@@
   # define SD_INIT_NAME(sd, type)               do { } while (0)
   #endif
   
- -#define       SD_INIT(sd, type)       sd_init_##type(sd)
- -
- -#define SD_INIT_FUNC(type)    \
- -static noinline void sd_init_##type(struct sched_domain *sd)  \
- -{                                                             \
- -      memset(sd, 0, sizeof(*sd));                             \
- -      *sd = SD_##type##_INIT;                                 \
- -      sd->level = SD_LV_##type;                               \
- -      SD_INIT_NAME(sd, type);                                 \
+ +#define SD_INIT_FUNC(type)                                            \
+ +static noinline struct sched_domain *                                 \
+ +sd_init_##type(struct sched_domain_topology_level *tl, int cpu)       \
+ +{                                                                     \
+ +      struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);       \
+ +      *sd = SD_##type##_INIT;                                         \
+ +      SD_INIT_NAME(sd, type);                                         \
+ +      sd->private = &tl->data;                                        \
+ +      return sd;                                                      \
   }
   
   SD_INIT_FUNC(CPU)
@@@ -6987,14 -7297,13 +7043,14 @@@
   #endif
   
   static int default_relax_domain_level = -1;
+ +int sched_domain_level_max;
   
   static int __init setup_relax_domain_level(char *str)
   {
         unsigned long val;
   
         val = simple_strtoul(str, NULL, 0);
- -      if (val < SD_LV_MAX)
+ +      if (val < sched_domain_level_max)
                 default_relax_domain_level = val;
   
         return 1;
@@@ -7022,20 -7331,37 +7078,20 @@@ static void set_domain_attribute(struc
         }
   }
   
+ +static void __sdt_free(const struct cpumask *cpu_map);
+ +static int __sdt_alloc(const struct cpumask *cpu_map);
+ +
   static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
                                  const struct cpumask *cpu_map)
   {
         switch (what) {
- -      case sa_sched_groups:
- -              free_sched_groups(cpu_map, d->tmpmask); /* fall through */
- -              d->sched_group_nodes = NULL;
         case sa_rootdomain:
- -              free_rootdomain(d->rd); /* fall through */
- -      case sa_tmpmask:
- -              free_cpumask_var(d->tmpmask); /* fall through */
- -      case sa_send_covered:
- -              free_cpumask_var(d->send_covered); /* fall through */
- -      case sa_this_book_map:
- -              free_cpumask_var(d->this_book_map); /* fall through */
- -      case sa_this_core_map:
- -              free_cpumask_var(d->this_core_map); /* fall through */
- -      case sa_this_sibling_map:
- -              free_cpumask_var(d->this_sibling_map); /* fall through */
- -      case sa_nodemask:
- -              free_cpumask_var(d->nodemask); /* fall through */
- -      case sa_sched_group_nodes:
- -#ifdef CONFIG_NUMA
- -              kfree(d->sched_group_nodes); /* fall through */
- -      case sa_notcovered:
- -              free_cpumask_var(d->notcovered); /* fall through */
- -      case sa_covered:
- -              free_cpumask_var(d->covered); /* fall through */
- -      case sa_domainspan:
- -              free_cpumask_var(d->domainspan); /* fall through */
- -#endif
+ +              if (!atomic_read(&d->rd->refcount))
+ +                      free_rootdomain(&d->rd->rcu); /* fall through */
+ +      case sa_sd:
+ +              free_percpu(d->sd); /* fall through */
+ +      case sa_sd_storage:
+ +              __sdt_free(cpu_map); /* fall through */
         case sa_none:
                 break;
         }
@@@ -7044,212 -7370,308 +7100,212 @@@
   static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
                                                    const struct cpumask *cpu_map)
   {
- -#ifdef CONFIG_NUMA
- -      if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL))
- -              return sa_none;
- -      if (!alloc_cpumask_var(&d->covered, GFP_KERNEL))
- -              return sa_domainspan;
- -      if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL))
- -              return sa_covered;
- -      /* Allocate the per-node list of sched groups */
- -      d->sched_group_nodes = kcalloc(nr_node_ids,
- -                                    sizeof(struct sched_group *), GFP_KERNEL);
- -      if (!d->sched_group_nodes) {
- -              printk(KERN_WARNING "Can not alloc sched group node list\n");
- -              return sa_notcovered;
- -      }
- -      sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
- -#endif
- -      if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
- -              return sa_sched_group_nodes;
- -      if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
- -              return sa_nodemask;
- -      if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
- -              return sa_this_sibling_map;
- -      if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL))
- -              return sa_this_core_map;
- -      if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
- -              return sa_this_book_map;
- -      if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
- -              return sa_send_covered;
+ +      memset(d, 0, sizeof(*d));
+ +
+ +      if (__sdt_alloc(cpu_map))
+ +              return sa_sd_storage;
+ +      d->sd = alloc_percpu(struct sched_domain *);
+ +      if (!d->sd)
+ +              return sa_sd_storage;
         d->rd = alloc_rootdomain();
- -      if (!d->rd) {
- -              printk(KERN_WARNING "Cannot alloc root domain\n");
- -              return sa_tmpmask;
- -      }
+ +      if (!d->rd)
+ +              return sa_sd;
         return sa_rootdomain;
   }
   
- -static struct sched_domain *__build_numa_sched_domains(struct s_data *d,
- -      const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i)
+ +/*
+ + * NULL the sd_data elements we've used to build the sched_domain and
+ + * sched_group structure so that the subsequent __free_domain_allocs()
+ + * will not free the data we're using.
+ + */
+ +static void claim_allocations(int cpu, struct sched_domain *sd)
   {
- -      struct sched_domain *sd = NULL;
- -#ifdef CONFIG_NUMA
- -      struct sched_domain *parent;
- -
- -      d->sd_allnodes = 0;
- -      if (cpumask_weight(cpu_map) >
- -          SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
- -              sd = &per_cpu(allnodes_domains, i).sd;
- -              SD_INIT(sd, ALLNODES);
- -              set_domain_attribute(sd, attr);
- -              cpumask_copy(sched_domain_span(sd), cpu_map);
- -              cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
- -              d->sd_allnodes = 1;
- -      }
- -      parent = sd;
+ +      struct sd_data *sdd = sd->private;
+ +      struct sched_group *sg = sd->groups;
   
- -      sd = &per_cpu(node_domains, i).sd;
- -      SD_INIT(sd, NODE);
- -      set_domain_attribute(sd, attr);
- -      sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
- -      sd->parent = parent;
- -      if (parent)
- -              parent->child = sd;
- -      cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
- -#endif
- -      return sd;
- -}
+ +      WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
+ +      *per_cpu_ptr(sdd->sd, cpu) = NULL;
   
- -static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
- -      const struct cpumask *cpu_map, struct sched_domain_attr *attr,
- -      struct sched_domain *parent, int i)
- -{
- -      struct sched_domain *sd;
- -      sd = &per_cpu(phys_domains, i).sd;
- -      SD_INIT(sd, CPU);
- -      set_domain_attribute(sd, attr);
- -      cpumask_copy(sched_domain_span(sd), d->nodemask);
- -      sd->parent = parent;
- -      if (parent)
- -              parent->child = sd;
- -      cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
- -      return sd;
- -}
- -
- -static struct sched_domain *__build_book_sched_domain(struct s_data *d,
- -      const struct cpumask *cpu_map, struct sched_domain_attr *attr,
- -      struct sched_domain *parent, int i)
- -{
- -      struct sched_domain *sd = parent;
- -#ifdef CONFIG_SCHED_BOOK
- -      sd = &per_cpu(book_domains, i).sd;
- -      SD_INIT(sd, BOOK);
- -      set_domain_attribute(sd, attr);
- -      cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i));
- -      sd->parent = parent;
- -      parent->child = sd;
- -      cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask);
- -#endif
- -      return sd;
+ +      if (cpu == cpumask_first(sched_group_cpus(sg))) {
+ +              WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg);
+ +              *per_cpu_ptr(sdd->sg, cpu) = NULL;
+ +      }
   }
   
- -static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
- -      const struct cpumask *cpu_map, struct sched_domain_attr *attr,
- -      struct sched_domain *parent, int i)
+ +#ifdef CONFIG_SCHED_SMT
+ +static const struct cpumask *cpu_smt_mask(int cpu)
   {
- -      struct sched_domain *sd = parent;
- -#ifdef CONFIG_SCHED_MC
- -      sd = &per_cpu(core_domains, i).sd;
- -      SD_INIT(sd, MC);
- -      set_domain_attribute(sd, attr);
- -      cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
- -      sd->parent = parent;
- -      parent->child = sd;
- -      cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
- -#endif
- -      return sd;
+ +      return topology_thread_cpumask(cpu);
   }
- -
- -static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
- -      const struct cpumask *cpu_map, struct sched_domain_attr *attr,
- -      struct sched_domain *parent, int i)
- -{
- -      struct sched_domain *sd = parent;
- -#ifdef CONFIG_SCHED_SMT
- -      sd = &per_cpu(cpu_domains, i).sd;
- -      SD_INIT(sd, SIBLING);
- -      set_domain_attribute(sd, attr);
- -      cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
- -      sd->parent = parent;
- -      parent->child = sd;
- -      cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
   #endif
- -      return sd;
- -}
   
- -static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
- -                             const struct cpumask *cpu_map, int cpu)
- -{
- -      switch (l) {
+ +/*
+ + * Topology list, bottom-up.
+ + */
+ +static struct sched_domain_topology_level default_topology[] = {
   #ifdef CONFIG_SCHED_SMT
- -      case SD_LV_SIBLING: /* set up CPU (sibling) groups */
- -              cpumask_and(d->this_sibling_map, cpu_map,
- -                          topology_thread_cpumask(cpu));
- -              if (cpu == cpumask_first(d->this_sibling_map))
- -                      init_sched_build_groups(d->this_sibling_map, cpu_map,
- -                                              &cpu_to_cpu_group,
- -                                              d->send_covered, d->tmpmask);
- -              break;
+ +      { sd_init_SIBLING, cpu_smt_mask, },
   #endif
   #ifdef CONFIG_SCHED_MC
- -      case SD_LV_MC: /* set up multi-core groups */
- -              cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
- -              if (cpu == cpumask_first(d->this_core_map))
- -                      init_sched_build_groups(d->this_core_map, cpu_map,
- -                                              &cpu_to_core_group,
- -                                              d->send_covered, d->tmpmask);
- -              break;
+ +      { sd_init_MC, cpu_coregroup_mask, },
   #endif
   #ifdef CONFIG_SCHED_BOOK
- -      case SD_LV_BOOK: /* set up book groups */
- -              cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu));
- -              if (cpu == cpumask_first(d->this_book_map))
- -                      init_sched_build_groups(d->this_book_map, cpu_map,
- -                                              &cpu_to_book_group,
- -                                              d->send_covered, d->tmpmask);
- -              break;
+ +      { sd_init_BOOK, cpu_book_mask, },
   #endif
- -      case SD_LV_CPU: /* set up physical groups */
- -              cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
- -              if (!cpumask_empty(d->nodemask))
- -                      init_sched_build_groups(d->nodemask, cpu_map,
- -                                              &cpu_to_phys_group,
- -                                              d->send_covered, d->tmpmask);
- -              break;
+ +      { sd_init_CPU, cpu_cpu_mask, },
   #ifdef CONFIG_NUMA
- -      case SD_LV_ALLNODES:
- -              init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group,
- -                                      d->send_covered, d->tmpmask);
- -              break;
+ +      { sd_init_NODE, cpu_node_mask, },
+ +      { sd_init_ALLNODES, cpu_allnodes_mask, },
   #endif
- -      default:
- -              break;
+ +      { NULL, },
+ +};
+ +
+ +static struct sched_domain_topology_level *sched_domain_topology = default_topology;
+ +
+ +static int __sdt_alloc(const struct cpumask *cpu_map)
+ +{
+ +      struct sched_domain_topology_level *tl;
+ +      int j;
+ +
+ +      for (tl = sched_domain_topology; tl->init; tl++) {
+ +              struct sd_data *sdd = &tl->data;
+ +
+ +              sdd->sd = alloc_percpu(struct sched_domain *);
+ +              if (!sdd->sd)
+ +                      return -ENOMEM;
+ +
+ +              sdd->sg = alloc_percpu(struct sched_group *);
+ +              if (!sdd->sg)
+ +                      return -ENOMEM;
+ +
+ +              for_each_cpu(j, cpu_map) {
+ +                      struct sched_domain *sd;
+ +                      struct sched_group *sg;
+ +
+ +                      sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
+ +                                      GFP_KERNEL, cpu_to_node(j));
+ +                      if (!sd)
+ +                              return -ENOMEM;
+ +
+ +                      *per_cpu_ptr(sdd->sd, j) = sd;
+ +
+ +                      sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
+ +                                      GFP_KERNEL, cpu_to_node(j));
+ +                      if (!sg)
+ +                              return -ENOMEM;
+ +
+ +                      *per_cpu_ptr(sdd->sg, j) = sg;
+ +              }
         }
+ +
+ +      return 0;
+ +}
+ +
+ +static void __sdt_free(const struct cpumask *cpu_map)
+ +{
+ +      struct sched_domain_topology_level *tl;
+ +      int j;
+ +
+ +      for (tl = sched_domain_topology; tl->init; tl++) {
+ +              struct sd_data *sdd = &tl->data;
+ +
+ +              for_each_cpu(j, cpu_map) {
+ +                      kfree(*per_cpu_ptr(sdd->sd, j));
+ +                      kfree(*per_cpu_ptr(sdd->sg, j));
+ +              }
+ +              free_percpu(sdd->sd);
+ +              free_percpu(sdd->sg);
+ +      }
+ +}
+ +
+ +struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
+ +              struct s_data *d, const struct cpumask *cpu_map,
+ +              struct sched_domain_attr *attr, struct sched_domain *child,
+ +              int cpu)
+ +{
+ +      struct sched_domain *sd = tl->init(tl, cpu);
+ +      if (!sd)
+ +              return child;
+ +
+ +      set_domain_attribute(sd, attr);
+ +      cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
+ +      if (child) {
+ +              sd->level = child->level + 1;
+ +              sched_domain_level_max = max(sched_domain_level_max, sd->level);
+ +              child->parent = sd;
+ +      }
+ +      sd->child = child;
+ +
+ +      return sd;
   }
   
   /*
    * Build sched domains for a given set of cpus and attach the sched domains
    * to the individual cpus
    */
- -static int __build_sched_domains(const struct cpumask *cpu_map,
- -                               struct sched_domain_attr *attr)
+ +static int build_sched_domains(const struct cpumask *cpu_map,
+ +                             struct sched_domain_attr *attr)
   {
         enum s_alloc alloc_state = sa_none;
- -      struct s_data d;
         struct sched_domain *sd;
- -      int i;
- -#ifdef CONFIG_NUMA
- -      d.sd_allnodes = 0;
- -#endif
+ +      struct s_data d;
+ +      int i, ret = -ENOMEM;
   
         alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
         if (alloc_state != sa_rootdomain)
                 goto error;
- -      alloc_state = sa_sched_groups;
- -
- -      /*
- -       * Set up domains for cpus specified by the cpu_map.
- -       */
- -      for_each_cpu(i, cpu_map) {
- -              cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)),
- -                          cpu_map);
- -
- -              sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
- -              sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
- -              sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i);
- -              sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
- -              sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
- -      }
   
+ +      /* Set up domains for cpus specified by the cpu_map. */
         for_each_cpu(i, cpu_map) {
- -              build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
- -              build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
- -              build_sched_groups(&d, SD_LV_MC, cpu_map, i);
- -      }
+ +              struct sched_domain_topology_level *tl;
   
- -      /* Set up physical groups */
- -      for (i = 0; i < nr_node_ids; i++)
- -              build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
+ +              sd = NULL;
+ +              for (tl = sched_domain_topology; tl->init; tl++)
+ +                      sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
   
- -#ifdef CONFIG_NUMA
- -      /* Set up node groups */
- -      if (d.sd_allnodes)
- -              build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
+ +              while (sd->child)
+ +                      sd = sd->child;
   
- -      for (i = 0; i < nr_node_ids; i++)
- -              if (build_numa_sched_groups(&d, cpu_map, i))
- -                      goto error;
- -#endif
- -
- -      /* Calculate CPU power for physical packages and nodes */
- -#ifdef CONFIG_SCHED_SMT
- -      for_each_cpu(i, cpu_map) {
- -              sd = &per_cpu(cpu_domains, i).sd;
- -              init_sched_groups_power(i, sd);
+ +              *per_cpu_ptr(d.sd, i) = sd;
         }
- -#endif
- -#ifdef CONFIG_SCHED_MC
- -      for_each_cpu(i, cpu_map) {
- -              sd = &per_cpu(core_domains, i).sd;
- -              init_sched_groups_power(i, sd);
- -      }
- -#endif
- -#ifdef CONFIG_SCHED_BOOK
- -      for_each_cpu(i, cpu_map) {
- -              sd = &per_cpu(book_domains, i).sd;
- -              init_sched_groups_power(i, sd);
- -      }
- -#endif
   
+ +      /* Build the groups for the domains */
         for_each_cpu(i, cpu_map) {
- -              sd = &per_cpu(phys_domains, i).sd;
- -              init_sched_groups_power(i, sd);
- -      }
+ +              for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+ +                      sd->span_weight = cpumask_weight(sched_domain_span(sd));
+ +                      get_group(i, sd->private, &sd->groups);
+ +                      atomic_inc(&sd->groups->ref);
   
- -#ifdef CONFIG_NUMA
- -      for (i = 0; i < nr_node_ids; i++)
- -              init_numa_sched_groups_power(d.sched_group_nodes[i]);
+ +                      if (i != cpumask_first(sched_domain_span(sd)))
+ +                              continue;
   
- -      if (d.sd_allnodes) {
- -              struct sched_group *sg;
+ +                      build_sched_groups(sd);
+ +              }
+ +      }
+ +
+ +      /* Calculate CPU power for physical packages and nodes */
+ +      for (i = nr_cpumask_bits-1; i >= 0; i--) {
+ +              if (!cpumask_test_cpu(i, cpu_map))
+ +                      continue;
   
- -              cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
- -                                                              d.tmpmask);
- -              init_numa_sched_groups_power(sg);
+ +              for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+ +                      claim_allocations(i, sd);
+ +                      init_sched_groups_power(i, sd);
+ +              }
         }
- -#endif
   
         /* Attach the domains */
+ +      rcu_read_lock();
         for_each_cpu(i, cpu_map) {
- -#ifdef CONFIG_SCHED_SMT
- -              sd = &per_cpu(cpu_domains, i).sd;
- -#elif defined(CONFIG_SCHED_MC)
- -              sd = &per_cpu(core_domains, i).sd;
- -#elif defined(CONFIG_SCHED_BOOK)
- -              sd = &per_cpu(book_domains, i).sd;
- -#else
- -              sd = &per_cpu(phys_domains, i).sd;
- -#endif
+ +              sd = *per_cpu_ptr(d.sd, i);
                 cpu_attach_domain(sd, d.rd, i);
         }
+ +      rcu_read_unlock();
   
- -      d.sched_group_nodes = NULL; /* don't free this we still need it */
- -      __free_domain_allocs(&d, sa_tmpmask, cpu_map);
- -      return 0;
- -
+ +      ret = 0;
   error:
         __free_domain_allocs(&d, alloc_state, cpu_map);
- -      return -ENOMEM;
- -}
- -
- -static int build_sched_domains(const struct cpumask *cpu_map)
- -{
- -      return __build_sched_domains(cpu_map, NULL);
+ +      return ret;
   }
   
   static cpumask_var_t *doms_cur;       /* current sched domains */
@@@ -7304,7 -7726,7 +7360,7 @@@ void free_sched_domains(cpumask_var_t d
    * For now this just excludes isolated cpus, but could be used to
    * exclude other special cases in the future.
    */
- -static int arch_init_sched_domains(const struct cpumask *cpu_map)
+ +static int init_sched_domains(const struct cpumask *cpu_map)
   {
         int err;
   
@@@ -7315,24 -7737,32 +7371,24 @@@
                 doms_cur = &fallback_doms;
         cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
         dattr_cur = NULL;
- -      err = build_sched_domains(doms_cur[0]);
+ +      err = build_sched_domains(doms_cur[0], NULL);
         register_sched_domain_sysctl();
   
         return err;
   }
   
- -static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
- -                                     struct cpumask *tmpmask)
- -{
- -      free_sched_groups(cpu_map, tmpmask);
- -}
- -
   /*
    * Detach sched domains from a group of cpus specified in cpu_map
    * These cpus will now be attached to the NULL domain
    */
   static void detach_destroy_domains(const struct cpumask *cpu_map)
   {
- -      /* Save because hotplug lock held. */
- -      static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
         int i;
   
+ +      rcu_read_lock();
         for_each_cpu(i, cpu_map)
                 cpu_attach_domain(NULL, &def_root_domain, i);
- -      synchronize_sched();
- -      arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
+ +      rcu_read_unlock();
   }
   
   /* handle null as "default" */
@@@ -7421,7 -7851,8 +7477,7 @@@ match1
                                 goto match2;
                 }
                 /* no match - add a new doms_new */
- -              __build_sched_domains(doms_new[i],
- -                                      dattr_new ? dattr_new + i : NULL);
+ +              build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
   match2:
                 ;
         }
@@@ -7440,7 -7871,7 +7496,7 @@@
   }
   
   #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
- -static void arch_reinit_sched_domains(void)
+ +static void reinit_sched_domains(void)
   {
         get_online_cpus();
   
@@@ -7473,7 -7904,7 +7529,7 @@@ static ssize_t sched_power_savings_stor
         else
                 sched_mc_power_savings = level;
   
- -      arch_reinit_sched_domains();
+ +      reinit_sched_domains();
   
         return count;
   }
@@@ -7592,9 -8023,14 +7648,9 @@@ void __init sched_init_smp(void
         alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
         alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
   
- -#if defined(CONFIG_NUMA)
- -      sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
- -                                                              GFP_KERNEL);
- -      BUG_ON(sched_group_nodes_bycpu == NULL);
- -#endif
         get_online_cpus();
         mutex_lock(&sched_domains_mutex);
- -      arch_init_sched_domains(cpu_active_mask);
+ +      init_sched_domains(cpu_active_mask);
         cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
         if (cpumask_empty(non_isolated_cpus))
                 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
@@@ -7901,7 -8337,6 +7957,7 @@@ void __init sched_init(void
         /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
         zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
   #ifdef CONFIG_SMP
+ +      zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
   #ifdef CONFIG_NO_HZ
         zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
         alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
@@@ -7961,7 -8396,7 +8017,7 @@@ static void normalize_task(struct rq *r
         int old_prio = p->prio;
         int on_rq;
   
-       on_rq = p->se.on_rq;
+       on_rq = p->on_rq;
         if (on_rq)
                 deactivate_task(rq, p, 0);
         __setscheduler(rq, p, SCHED_NORMAL, 0);
@@@ -8304,7 -8739,7 +8360,7 @@@ void sched_move_task(struct task_struc
         rq = task_rq_lock(tsk, &flags);
   
         running = task_current(rq, tsk);
-       on_rq = tsk->se.on_rq;
+       on_rq = tsk->on_rq;
   
         if (on_rq)
                 dequeue_task(rq, tsk, 0);
@@@ -8323,7 -8758,7 +8379,7 @@@
         if (on_rq)
                 enqueue_task(rq, tsk, 0);
   
-       task_rq_unlock(rq, &flags);
+       task_rq_unlock(rq, tsk, &flags);
   }
   #endif /* CONFIG_CGROUP_SCHED */
   
diff --combined kernel/sched_fair.c

index 9c5679c,054cebb..8744593
--- 1/kernel/sched_fair.c
--- 2/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@@ -358,6 -358,10 +358,10 @@@ static void update_min_vruntime(struct 
         }
   
         cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
+ #ifndef CONFIG_64BIT
+       smp_wmb();
+       cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
+ #endif
   }
   
   /*
@@@ -1372,12 -1376,25 +1376,25 @@@ static void dequeue_task_fair(struct r
   
   #ifdef CONFIG_SMP
   
- static void task_waking_fair(struct rq *rq, struct task_struct *p)
+ static void task_waking_fair(struct task_struct *p)
   {
         struct sched_entity *se = &p->se;
         struct cfs_rq *cfs_rq = cfs_rq_of(se);
+       u64 min_vruntime;
+ 
+ #ifndef CONFIG_64BIT
+       u64 min_vruntime_copy;
+ 
+       do {
+               min_vruntime_copy = cfs_rq->min_vruntime_copy;
+               smp_rmb();
+               min_vruntime = cfs_rq->min_vruntime;
+       } while (min_vruntime != min_vruntime_copy);
+ #else
+       min_vruntime = cfs_rq->min_vruntime;
+ #endif
   
-       se->vruntime -= cfs_rq->min_vruntime;
+       se->vruntime -= min_vruntime;
   }
   
   #ifdef CONFIG_FAIR_GROUP_SCHED
@@@ -1622,7 -1639,6 +1639,7 @@@ static int select_idle_sibling(struct t
         /*
          * Otherwise, iterate the domains and find an elegible idle cpu.
          */
+ +      rcu_read_lock();
         for_each_domain(target, sd) {
                 if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
                         break;
@@@ -1642,7 -1658,6 +1659,7 @@@
                     cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
                         break;
         }
+ +      rcu_read_unlock();
   
         return target;
   }
@@@ -1659,7 -1674,7 +1676,7 @@@
    * preempt must be disabled.
    */
   static int
- select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags)
+ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
   {
         struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
         int cpu = smp_processor_id();
@@@ -1675,7 -1690,6 +1692,7 @@@
                 new_cpu = prev_cpu;
         }
   
+ +      rcu_read_lock();
         for_each_domain(cpu, tmp) {
                 if (!(tmp->flags & SD_LOAD_BALANCE))
                         continue;
@@@ -1726,10 -1740,9 +1743,10 @@@
   
         if (affine_sd) {
                 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
- -                      return select_idle_sibling(p, cpu);
- -              else
- -                      return select_idle_sibling(p, prev_cpu);
+ +                      prev_cpu = cpu;
+ +
+ +              new_cpu = select_idle_sibling(p, prev_cpu);
+ +              goto unlock;
         }
   
         while (sd) {
@@@ -1770,8 -1783,6 +1787,8 @@@
                 }
                 /* while loop will break here if sd == NULL */
         }
+ +unlock:
+ +      rcu_read_unlock();
   
         return new_cpu;
   }
@@@ -2651,7 -2662,7 +2668,7 @@@ fix_small_capacity(struct sched_domain 
         /*
          * Only siblings can have significantly less than SCHED_LOAD_SCALE
          */
- -      if (sd->level != SD_LV_SIBLING)
+ +      if (!(sd->flags & SD_SHARE_CPUPOWER))
                 return 0;
   
         /*
@@@ -3468,7 -3479,6 +3485,7 @@@ static void idle_balance(int this_cpu, 
         raw_spin_unlock(&this_rq->lock);
   
         update_shares(this_cpu);
+ +      rcu_read_lock();
         for_each_domain(this_cpu, sd) {
                 unsigned long interval;
                 int balance = 1;
@@@ -3490,7 -3500,6 +3507,7 @@@
                         break;
                 }
         }
+ +      rcu_read_unlock();
   
         raw_spin_lock(&this_rq->lock);
   
@@@ -3539,7 -3548,6 +3556,7 @@@ static int active_load_balance_cpu_stop
         double_lock_balance(busiest_rq, target_rq);
   
         /* Search for an sd spanning us and the target CPU. */
+ +      rcu_read_lock();
         for_each_domain(target_cpu, sd) {
                 if ((sd->flags & SD_LOAD_BALANCE) &&
                     cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
@@@ -3555,7 -3563,6 +3572,7 @@@
                 else
                         schedstat_inc(sd, alb_failed);
         }
+ +      rcu_read_unlock();
         double_unlock_balance(busiest_rq, target_rq);
   out_unlock:
         busiest_rq->active_balance = 0;
@@@ -3682,7 -3689,6 +3699,7 @@@ static int find_new_ilb(int cpu
   {
         struct sched_domain *sd;
         struct sched_group *ilb_group;
+ +      int ilb = nr_cpu_ids;
   
         /*
          * Have idle load balancer selection from semi-idle packages only
@@@ -3698,25 -3704,20 +3715,25 @@@
         if (cpumask_weight(nohz.idle_cpus_mask) < 2)
                 goto out_done;
   
+ +      rcu_read_lock();
         for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
                 ilb_group = sd->groups;
   
                 do {
- -                      if (is_semi_idle_group(ilb_group))
- -                              return cpumask_first(nohz.grp_idle_mask);
+ +                      if (is_semi_idle_group(ilb_group)) {
+ +                              ilb = cpumask_first(nohz.grp_idle_mask);
+ +                              goto unlock;
+ +                      }
   
                         ilb_group = ilb_group->next;
   
                 } while (ilb_group != sd->groups);
         }
+ +unlock:
+ +      rcu_read_unlock();
   
   out_done:
- -      return nr_cpu_ids;
+ +      return ilb;
   }
   #else /*  (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
   static inline int find_new_ilb(int call_cpu)
@@@ -3861,7 -3862,6 +3878,7 @@@ static void rebalance_domains(int cpu, 
   
         update_shares(cpu);
   
+ +      rcu_read_lock();
         for_each_domain(cpu, sd) {
                 if (!(sd->flags & SD_LOAD_BALANCE))
                         continue;
@@@ -3907,7 -3907,6 +3924,7 @@@ out
                 if (!balance)
                         break;
         }
+ +      rcu_read_unlock();
   
         /*
          * next_balance will be updated only when there is a need.
author	Ingo Molnar <mingo@elte.hu>
	Mon, 18 Apr 2011 12:53:18 +0000 (14:53 +0200)
committer	Ingo Molnar <mingo@elte.hu>
	Mon, 18 Apr 2011 12:53:33 +0000 (14:53 +0200)
		1	2
include/linux/sched.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched_fair.c	patch \|	diff1 \|	diff2 \|	blob \| history