Merge branch 'timers-nohz-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...

author Linus Torvalds <torvalds@linux-foundation.org>

Tue, 23 Jun 2015 02:20:04 +0000 (19:20 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Tue, 23 Jun 2015 02:20:04 +0000 (19:20 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Tue, 23 Jun 2015 02:20:04 +0000 (19:20 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 23 Jun 2015 02:20:04 +0000 (19:20 -0700)
diff --combined include/linux/sched.h

index 30364cb,185a750..6633e83
--- 1/include/linux/sched.h
--- 2/include/linux/sched.h
+++ b/include/linux/sched.h
@@@ -25,7 -25,7 +25,7 @@@ struct sched_param 
   #include <linux/errno.h>
   #include <linux/nodemask.h>
   #include <linux/mm_types.h>
- -#include <linux/preempt_mask.h>
+ +#include <linux/preempt.h>
   
   #include <asm/page.h>
   #include <asm/ptrace.h>
@@@ -132,7 -132,6 +132,7 @@@ struct fs_struct
   struct perf_event_context;
   struct blk_plug;
   struct filename;
+ +struct nameidata;
   
   #define VMACACHE_BITS 2
   #define VMACACHE_SIZE (1U << VMACACHE_BITS)
@@@ -174,12 -173,7 +174,12 @@@ extern unsigned long nr_iowait_cpu(int 
   extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load);
   
   extern void calc_global_load(unsigned long ticks);
+ +
+ +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
   extern void update_cpu_load_nohz(void);
+ +#else
+ +static inline void update_cpu_load_nohz(void) { }
+ +#endif
   
   extern unsigned long get_parent_ip(unsigned long addr);
   
@@@ -219,10 -213,9 +219,10 @@@ print_cfs_rq(struct seq_file *m, int cp
   #define TASK_WAKEKILL         128
   #define TASK_WAKING           256
   #define TASK_PARKED           512
- -#define TASK_STATE_MAX                1024
+ +#define TASK_NOLOAD           1024
+ +#define TASK_STATE_MAX                2048
   
- -#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWP"
+ +#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPN"
   
   extern char ___assert_task_state[1 - 2*!!(
                 sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)];
@@@ -232,8 -225,6 +232,8 @@@
   #define TASK_STOPPED          (TASK_WAKEKILL | __TASK_STOPPED)
   #define TASK_TRACED           (TASK_WAKEKILL | __TASK_TRACED)
   
+ +#define TASK_IDLE             (TASK_UNINTERRUPTIBLE | TASK_NOLOAD)
+ +
   /* Convenience macros for the sake of wake_up */
   #define TASK_NORMAL           (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)
   #define TASK_ALL              (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED)
@@@ -249,8 -240,7 +249,8 @@@
                         ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
   #define task_contributes_to_load(task)        \
                                 ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
- -                               (task->flags & PF_FROZEN) == 0)
+ +                               (task->flags & PF_FROZEN) == 0 && \
+ +                               (task->state & TASK_NOLOAD) == 0)
   
   #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
   
@@@ -262,7 -252,7 +262,7 @@@
   #define set_task_state(tsk, state_value)                      \
         do {                                                    \
                 (tsk)->task_state_change = _THIS_IP_;           \
- -              set_mb((tsk)->state, (state_value));            \
+ +              smp_store_mb((tsk)->state, (state_value));              \
         } while (0)
   
   /*
@@@ -284,7 -274,7 +284,7 @@@
   #define set_current_state(state_value)                                \
         do {                                                    \
                 current->task_state_change = _THIS_IP_;         \
- -              set_mb(current->state, (state_value));          \
+ +              smp_store_mb(current->state, (state_value));            \
         } while (0)
   
   #else
@@@ -292,7 -282,7 +292,7 @@@
   #define __set_task_state(tsk, state_value)            \
         do { (tsk)->state = (state_value); } while (0)
   #define set_task_state(tsk, state_value)              \
- -      set_mb((tsk)->state, (state_value))
+ +      smp_store_mb((tsk)->state, (state_value))
   
   /*
    * set_current_state() includes a barrier so that the write of current->state
@@@ -308,7 -298,7 +308,7 @@@
   #define __set_current_state(state_value)              \
         do { current->state = (state_value); } while (0)
   #define set_current_state(state_value)                        \
- -      set_mb(current->state, (state_value))
+ +      smp_store_mb(current->state, (state_value))
   
   #endif
   
@@@ -345,10 -335,14 +345,10 @@@ extern int runqueue_is_locked(int cpu)
   #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
   extern void nohz_balance_enter_idle(int cpu);
   extern void set_cpu_sd_state_idle(void);
- -extern int get_nohz_timer_target(int pinned);
+ +extern int get_nohz_timer_target(void);
   #else
   static inline void nohz_balance_enter_idle(int cpu) { }
   static inline void set_cpu_sd_state_idle(void) { }
- -static inline int get_nohz_timer_target(int pinned)
- -{
- -      return smp_processor_id();
- -}
   #endif
   
   /*
@@@ -573,23 -567,6 +573,23 @@@ struct task_cputime 
                 .sum_exec_runtime = 0,                          \
         }
   
+ +/*
+ + * This is the atomic variant of task_cputime, which can be used for
+ + * storing and updating task_cputime statistics without locking.
+ + */
+ +struct task_cputime_atomic {
+ +      atomic64_t utime;
+ +      atomic64_t stime;
+ +      atomic64_t sum_exec_runtime;
+ +};
+ +
+ +#define INIT_CPUTIME_ATOMIC \
+ +      (struct task_cputime_atomic) {                          \
+ +              .utime = ATOMIC64_INIT(0),                      \
+ +              .stime = ATOMIC64_INIT(0),                      \
+ +              .sum_exec_runtime = ATOMIC64_INIT(0),           \
+ +      }
+ +
   #ifdef CONFIG_PREEMPT_COUNT
   #define PREEMPT_DISABLED      (1 + PREEMPT_ENABLED)
   #else
@@@ -607,16 -584,18 +607,16 @@@
   
   /**
    * struct thread_group_cputimer - thread group interval timer counts
- - * @cputime:          thread group interval timers.
+ + * @cputime_atomic:   atomic thread group interval timers.
    * @running:          non-zero when there are timers running and
    *                    @cputime receives updates.
- - * @lock:             lock for fields in this struct.
    *
    * This structure contains the version of task_cputime, above, that is
    * used for thread group CPU timer calculations.
    */
   struct thread_group_cputimer {
- -      struct task_cputime cputime;
+ +      struct task_cputime_atomic cputime_atomic;
         int running;
- -      raw_spinlock_t lock;
   };
   
   #include <linux/rwsem.h>
@@@ -921,50 -900,6 +921,50 @@@ enum cpu_idle_type 
   #define SCHED_CAPACITY_SCALE  (1L << SCHED_CAPACITY_SHIFT)
   
   /*
+ + * Wake-queues are lists of tasks with a pending wakeup, whose
+ + * callers have already marked the task as woken internally,
+ + * and can thus carry on. A common use case is being able to
+ + * do the wakeups once the corresponding user lock has been
+ + * released.
+ + *
+ + * We hold reference to each task in the list across the wakeup,
+ + * thus guaranteeing that the memory is still valid by the time
+ + * the actual wakeups are performed in wake_up_q().
+ + *
+ + * One per task suffices, because there's never a need for a task to be
+ + * in two wake queues simultaneously; it is forbidden to abandon a task
+ + * in a wake queue (a call to wake_up_q() _must_ follow), so if a task is
+ + * already in a wake queue, the wakeup will happen soon and the second
+ + * waker can just skip it.
+ + *
+ + * The WAKE_Q macro declares and initializes the list head.
+ + * wake_up_q() does NOT reinitialize the list; it's expected to be
+ + * called near the end of a function, where the fact that the queue is
+ + * not used again will be easy to see by inspection.
+ + *
+ + * Note that this can cause spurious wakeups. schedule() callers
+ + * must ensure the call is done inside a loop, confirming that the
+ + * wakeup condition has in fact occurred.
+ + */
+ +struct wake_q_node {
+ +      struct wake_q_node *next;
+ +};
+ +
+ +struct wake_q_head {
+ +      struct wake_q_node *first;
+ +      struct wake_q_node **lastp;
+ +};
+ +
+ +#define WAKE_Q_TAIL ((struct wake_q_node *) 0x01)
+ +
+ +#define WAKE_Q(name)                                  \
+ +      struct wake_q_head name = { WAKE_Q_TAIL, &name.first }
+ +
+ +extern void wake_q_add(struct wake_q_head *head,
+ +                     struct task_struct *task);
+ +extern void wake_up_q(struct wake_q_head *head);
+ +
+ +/*
    * sched-domains (multiprocessor balancing) declarations:
    */
   #ifdef CONFIG_SMP
@@@ -1399,6 -1334,8 +1399,6 @@@ struct task_struct 
         int rcu_read_lock_nesting;
         union rcu_special rcu_read_unlock_special;
         struct list_head rcu_node_entry;
- -#endif /* #ifdef CONFIG_PREEMPT_RCU */
- -#ifdef CONFIG_PREEMPT_RCU
         struct rcu_node *rcu_blocked_node;
   #endif /* #ifdef CONFIG_PREEMPT_RCU */
   #ifdef CONFIG_TASKS_RCU
@@@ -1419,6 -1356,9 +1419,6 @@@
   #endif
   
         struct mm_struct *mm, *active_mm;
- -#ifdef CONFIG_COMPAT_BRK
- -      unsigned brk_randomized:1;
- -#endif
         /* per-thread vma caching */
         u32 vmacache_seqnum;
         struct vm_area_struct *vmacache[VMACACHE_SIZE];
@@@ -1429,7 -1369,7 +1429,7 @@@
         int exit_state;
         int exit_code, exit_signal;
         int pdeath_signal;  /*  The signal sent when the parent dies  */
- -      unsigned int jobctl;    /* JOBCTL_*, siglock protected */
+ +      unsigned long jobctl;   /* JOBCTL_*, siglock protected */
   
         /* Used for emulating ABI behavior of previous Linux versions */
         unsigned int personality;
@@@ -1441,14 -1381,10 +1441,14 @@@
         /* Revert to default priority/policy when forking */
         unsigned sched_reset_on_fork:1;
         unsigned sched_contributes_to_load:1;
+ +      unsigned sched_migrated:1;
   
   #ifdef CONFIG_MEMCG_KMEM
         unsigned memcg_kmem_skip_account:1;
   #endif
+ +#ifdef CONFIG_COMPAT_BRK
+ +      unsigned brk_randomized:1;
+ +#endif
   
         unsigned long atomic_flags; /* Flags needing atomic access. */
   
@@@ -1525,7 -1461,7 +1525,7 @@@
                                        it with task_lock())
                                      - initialized normally by setup_new_exec */
   /* file system info */
- -      int link_count, total_link_count;
+ +      struct nameidata *nameidata;
   #ifdef CONFIG_SYSVIPC
   /* ipc stuff */
         struct sysv_sem sysvsem;
@@@ -1575,8 -1511,6 +1575,8 @@@
         /* Protection of the PI data structures: */
         raw_spinlock_t pi_lock;
   
+ +      struct wake_q_node wake_q;
+ +
   #ifdef CONFIG_RT_MUTEXES
         /* PI waiters blocked on a rt_mutex held by this task */
         struct rb_root pi_waiters;
@@@ -1790,7 -1724,6 +1790,7 @@@
   #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
         unsigned long   task_state_change;
   #endif
+ +      int pagefault_disabled;
   };
   
   /* Future-safe accessor for struct task_struct's cpus_allowed. */
@@@ -2144,22 -2077,22 +2144,22 @@@ TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab
   #define JOBCTL_TRAPPING_BIT   21      /* switching to TRACED */
   #define JOBCTL_LISTENING_BIT  22      /* ptracer is listening for events */
   
- -#define JOBCTL_STOP_DEQUEUED  (1 << JOBCTL_STOP_DEQUEUED_BIT)
- -#define JOBCTL_STOP_PENDING   (1 << JOBCTL_STOP_PENDING_BIT)
- -#define JOBCTL_STOP_CONSUME   (1 << JOBCTL_STOP_CONSUME_BIT)
- -#define JOBCTL_TRAP_STOP      (1 << JOBCTL_TRAP_STOP_BIT)
- -#define JOBCTL_TRAP_NOTIFY    (1 << JOBCTL_TRAP_NOTIFY_BIT)
- -#define JOBCTL_TRAPPING               (1 << JOBCTL_TRAPPING_BIT)
- -#define JOBCTL_LISTENING      (1 << JOBCTL_LISTENING_BIT)
+ +#define JOBCTL_STOP_DEQUEUED  (1UL << JOBCTL_STOP_DEQUEUED_BIT)
+ +#define JOBCTL_STOP_PENDING   (1UL << JOBCTL_STOP_PENDING_BIT)
+ +#define JOBCTL_STOP_CONSUME   (1UL << JOBCTL_STOP_CONSUME_BIT)
+ +#define JOBCTL_TRAP_STOP      (1UL << JOBCTL_TRAP_STOP_BIT)
+ +#define JOBCTL_TRAP_NOTIFY    (1UL << JOBCTL_TRAP_NOTIFY_BIT)
+ +#define JOBCTL_TRAPPING               (1UL << JOBCTL_TRAPPING_BIT)
+ +#define JOBCTL_LISTENING      (1UL << JOBCTL_LISTENING_BIT)
   
   #define JOBCTL_TRAP_MASK      (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY)
   #define JOBCTL_PENDING_MASK   (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK)
   
   extern bool task_set_jobctl_pending(struct task_struct *task,
- -                                  unsigned int mask);
+ +                                  unsigned long mask);
   extern void task_clear_jobctl_trapping(struct task_struct *task);
   extern void task_clear_jobctl_pending(struct task_struct *task,
- -                                    unsigned int mask);
+ +                                    unsigned long mask);
   
   static inline void rcu_copy_process(struct task_struct *p)
   {
@@@ -2599,6 -2532,9 +2599,9 @@@ static inline unsigned long wait_task_i
   }
   #endif
   
+ #define tasklist_empty() \
+       list_empty(&init_task.tasks)
+ 
   #define next_task(p) \
         list_entry_rcu((p)->tasks.next, struct task_struct, tasks)
   
@@@ -3029,6 -2965,11 +3032,6 @@@ static __always_inline bool need_resche
   void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
   void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times);
   
- -static inline void thread_group_cputime_init(struct signal_struct *sig)
- -{
- -      raw_spin_lock_init(&sig->cputimer.lock);
- -}
- -
   /*
    * Reevaluate whether the task has signals pending delivery.
    * Wake the task if so.
@@@ -3142,13 -3083,13 +3145,13 @@@ static inline void mm_update_next_owner
   static inline unsigned long task_rlimit(const struct task_struct *tsk,
                 unsigned int limit)
   {
- -      return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_cur);
+ +      return READ_ONCE(tsk->signal->rlim[limit].rlim_cur);
   }
   
   static inline unsigned long task_rlimit_max(const struct task_struct *tsk,
                 unsigned int limit)
   {
- -      return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_max);
+ +      return READ_ONCE(tsk->signal->rlim[limit].rlim_max);
   }
   
   static inline unsigned long rlimit(unsigned int limit)
diff --combined kernel/sched/core.c

index c9a707b,b8f4876..c86935a
--- 1/kernel/sched/core.c
--- 2/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@@ -90,6 -90,26 +90,6 @@@
   #define CREATE_TRACE_POINTS
   #include <trace/events/sched.h>
   
- -void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
- -{
- -      unsigned long delta;
- -      ktime_t soft, hard, now;
- -
- -      for (;;) {
- -              if (hrtimer_active(period_timer))
- -                      break;
- -
- -              now = hrtimer_cb_get_time(period_timer);
- -              hrtimer_forward(period_timer, now, period);
- -
- -              soft = hrtimer_get_softexpires(period_timer);
- -              hard = hrtimer_get_expires(period_timer);
- -              delta = ktime_to_ns(ktime_sub(hard, soft));
- -              __hrtimer_start_range_ns(period_timer, soft, delta,
- -                                       HRTIMER_MODE_ABS_PINNED, 0);
- -      }
- -}
- -
   DEFINE_MUTEX(sched_domains_mutex);
   DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
   
@@@ -335,11 -355,12 +335,11 @@@ static enum hrtimer_restart hrtick(stru
   
   #ifdef CONFIG_SMP
   
- -static int __hrtick_restart(struct rq *rq)
+ +static void __hrtick_restart(struct rq *rq)
   {
         struct hrtimer *timer = &rq->hrtick_timer;
- -      ktime_t time = hrtimer_get_softexpires(timer);
   
- -      return __hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS_PINNED, 0);
+ +      hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
   }
   
   /*
@@@ -419,8 -440,8 +419,8 @@@ void hrtick_start(struct rq *rq, u64 de
          * doesn't make sense. Rely on vruntime for fairness.
          */
         delay = max_t(u64, delay, 10000LL);
- -      __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
- -                      HRTIMER_MODE_REL_PINNED, 0);
+ +      hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
+ +                    HRTIMER_MODE_REL_PINNED);
   }
   
   static inline void init_hrtick(void)
@@@ -490,7 -511,7 +490,7 @@@ static bool set_nr_and_not_polling(stru
   static bool set_nr_if_polling(struct task_struct *p)
   {
         struct thread_info *ti = task_thread_info(p);
- -      typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags);
+ +      typeof(ti->flags) old, val = READ_ONCE(ti->flags);
   
         for (;;) {
                 if (!(val & _TIF_POLLING_NRFLAG))
@@@ -520,52 -541,6 +520,52 @@@ static bool set_nr_if_polling(struct ta
   #endif
   #endif
   
+ +void wake_q_add(struct wake_q_head *head, struct task_struct *task)
+ +{
+ +      struct wake_q_node *node = &task->wake_q;
+ +
+ +      /*
+ +       * Atomically grab the task, if ->wake_q is !nil already it means
+ +       * it's already queued (either by us or someone else) and will get the
+ +       * wakeup due to that.
+ +       *
+ +       * This cmpxchg() implies a full barrier, which pairs with the write
+ +       * barrier implied by the wakeup in wake_up_q().
+ +       */
+ +      if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
+ +              return;
+ +
+ +      get_task_struct(task);
+ +
+ +      /*
+ +       * The head is context local, there can be no concurrency.
+ +       */
+ +      *head->lastp = node;
+ +      head->lastp = &node->next;
+ +}
+ +
+ +void wake_up_q(struct wake_q_head *head)
+ +{
+ +      struct wake_q_node *node = head->first;
+ +
+ +      while (node != WAKE_Q_TAIL) {
+ +              struct task_struct *task;
+ +
+ +              task = container_of(node, struct task_struct, wake_q);
+ +              BUG_ON(!task);
+ +              /* task can safely be re-inserted now */
+ +              node = node->next;
+ +              task->wake_q.next = NULL;
+ +
+ +              /*
+ +               * wake_up_process() implies a wmb() to pair with the queueing
+ +               * in wake_q_add() so as not to miss wakeups.
+ +               */
+ +              wake_up_process(task);
+ +              put_task_struct(task);
+ +      }
+ +}
+ +
   /*
    * resched_curr - mark rq's current task 'to be rescheduled now'.
    *
@@@ -618,12 -593,13 +618,12 @@@ void resched_cpu(int cpu
    * selecting an idle cpu will add more delays to the timers than intended
    * (as that cpu's timer base may not be uptodate wrt jiffies etc).
    */
- -int get_nohz_timer_target(int pinned)
+ +int get_nohz_timer_target(void)
   {
- -      int cpu = smp_processor_id();
- -      int i;
+ +      int i, cpu = smp_processor_id();
         struct sched_domain *sd;
   
- -      if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu))
+ +      if (!idle_cpu(cpu))
                 return cpu;
   
         rcu_read_lock();
@@@ -1073,7 -1049,7 +1073,7 @@@ void set_task_cpu(struct task_struct *p
                 if (p->sched_class->migrate_task_rq)
                         p->sched_class->migrate_task_rq(p, new_cpu);
                 p->se.nr_migrations++;
- -              perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
+ +              perf_event_task_migrate(p);
         }
   
         __set_task_cpu(p, new_cpu);
@@@ -2129,15 -2105,12 +2129,15 @@@ void wake_up_new_task(struct task_struc
   
   #ifdef CONFIG_PREEMPT_NOTIFIERS
   
+ +static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE;
+ +
   /**
    * preempt_notifier_register - tell me when current is being preempted & rescheduled
    * @notifier: notifier struct to register
    */
   void preempt_notifier_register(struct preempt_notifier *notifier)
   {
+ +      static_key_slow_inc(&preempt_notifier_key);
         hlist_add_head(&notifier->link, &current->preempt_notifiers);
   }
   EXPORT_SYMBOL_GPL(preempt_notifier_register);
@@@ -2146,16 -2119,15 +2146,16 @@@
    * preempt_notifier_unregister - no longer interested in preemption notifications
    * @notifier: notifier struct to unregister
    *
- - * This is safe to call from within a preemption notifier.
+ + * This is *not* safe to call from within a preemption notifier.
    */
   void preempt_notifier_unregister(struct preempt_notifier *notifier)
   {
         hlist_del(&notifier->link);
+ +      static_key_slow_dec(&preempt_notifier_key);
   }
   EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
   
- -static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+ +static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
   {
         struct preempt_notifier *notifier;
   
@@@ -2163,15 -2135,9 +2163,15 @@@
                 notifier->ops->sched_in(notifier, raw_smp_processor_id());
   }
   
+ +static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+ +{
+ +      if (static_key_false(&preempt_notifier_key))
+ +              __fire_sched_in_preempt_notifiers(curr);
+ +}
+ +
   static void
- -fire_sched_out_preempt_notifiers(struct task_struct *curr,
- -                               struct task_struct *next)
+ +__fire_sched_out_preempt_notifiers(struct task_struct *curr,
+ +                                 struct task_struct *next)
   {
         struct preempt_notifier *notifier;
   
@@@ -2179,21 -2145,13 +2179,21 @@@
                 notifier->ops->sched_out(notifier, next);
   }
   
+ +static __always_inline void
+ +fire_sched_out_preempt_notifiers(struct task_struct *curr,
+ +                               struct task_struct *next)
+ +{
+ +      if (static_key_false(&preempt_notifier_key))
+ +              __fire_sched_out_preempt_notifiers(curr, next);
+ +}
+ +
   #else /* !CONFIG_PREEMPT_NOTIFIERS */
   
- -static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+ +static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
   {
   }
   
- -static void
+ +static inline void
   fire_sched_out_preempt_notifiers(struct task_struct *curr,
                                  struct task_struct *next)
   {
@@@ -2374,7 -2332,6 +2374,6 @@@ context_switch(struct rq *rq, struct ta
          */
         spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
   
-       context_tracking_task_switch(prev, next);
         /* Here we just switch the register state and the stack. */
         switch_to(prev, next, prev);
         barrier();
@@@ -2439,9 -2396,9 +2438,9 @@@ unsigned long nr_iowait_cpu(int cpu
   
   void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
   {
- -      struct rq *this = this_rq();
- -      *nr_waiters = atomic_read(&this->nr_iowait);
- -      *load = this->cpu_load[0];
+ +      struct rq *rq = this_rq();
+ +      *nr_waiters = atomic_read(&rq->nr_iowait);
+ +      *load = rq->load.weight;
   }
   
   #ifdef CONFIG_SMP
@@@ -2539,7 -2496,6 +2538,7 @@@ void scheduler_tick(void
         update_rq_clock(rq);
         curr->sched_class->task_tick(rq, curr, 0);
         update_cpu_load_active(rq);
+ +      calc_global_load_tick(rq);
         raw_spin_unlock(&rq->lock);
   
         perf_event_task_tick();
@@@ -2568,7 -2524,7 +2567,7 @@@
   u64 scheduler_tick_max_deferment(void)
   {
         struct rq *rq = this_rq();
- -      unsigned long next, now = ACCESS_ONCE(jiffies);
+ +      unsigned long next, now = READ_ONCE(jiffies);
   
         next = rq->last_sched_tick + HZ;
   
@@@ -2769,7 -2725,9 +2768,7 @@@ again
    *          - return from syscall or exception to user-space
    *          - return from interrupt-handler to user-space
    *
- - * WARNING: all callers must re-check need_resched() afterward and reschedule
- - * accordingly in case an event triggered the need for rescheduling (such as
- - * an interrupt waking up a task) while preemption was disabled in __schedule().
+ + * WARNING: must be called with preemption disabled!
    */
   static void __sched __schedule(void)
   {
@@@ -2778,6 -2736,7 +2777,6 @@@
         struct rq *rq;
         int cpu;
   
- -      preempt_disable();
         cpu = smp_processor_id();
         rq = cpu_rq(cpu);
         rcu_note_context_switch();
@@@ -2841,6 -2800,8 +2840,6 @@@
                 raw_spin_unlock_irq(&rq->lock);
   
         post_schedule(rq);
- -
- -      sched_preempt_enable_no_resched();
   }
   
   static inline void sched_submit_work(struct task_struct *tsk)
@@@ -2861,9 -2822,7 +2860,9 @@@ asmlinkage __visible void __sched sched
   
         sched_submit_work(tsk);
         do {
+ +              preempt_disable();
                 __schedule();
+ +              sched_preempt_enable_no_resched();
         } while (need_resched());
   }
   EXPORT_SYMBOL(schedule);
@@@ -2902,14 -2861,15 +2901,14 @@@ void __sched schedule_preempt_disabled(
   static void __sched notrace preempt_schedule_common(void)
   {
         do {
- -              __preempt_count_add(PREEMPT_ACTIVE);
+ +              preempt_active_enter();
                 __schedule();
- -              __preempt_count_sub(PREEMPT_ACTIVE);
+ +              preempt_active_exit();
   
                 /*
                  * Check again in case we missed a preemption opportunity
                  * between schedule and now.
                  */
- -              barrier();
         } while (need_resched());
   }
   
@@@ -2933,8 -2893,9 +2932,8 @@@ asmlinkage __visible void __sched notra
   NOKPROBE_SYMBOL(preempt_schedule);
   EXPORT_SYMBOL(preempt_schedule);
   
- -#ifdef CONFIG_CONTEXT_TRACKING
   /**
- - * preempt_schedule_context - preempt_schedule called by tracing
+ + * preempt_schedule_notrace - preempt_schedule called by tracing
    *
    * The tracing infrastructure uses preempt_enable_notrace to prevent
    * recursion and tracing preempt enabling caused by the tracing
@@@ -2947,7 -2908,7 +2946,7 @@@
    * instead of preempt_schedule() to exit user context if needed before
    * calling the scheduler.
    */
- -asmlinkage __visible void __sched notrace preempt_schedule_context(void)
+ +asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
   {
         enum ctx_state prev_ctx;
   
@@@ -2955,13 -2916,7 +2954,13 @@@
                 return;
   
         do {
- -              __preempt_count_add(PREEMPT_ACTIVE);
+ +              /*
+ +               * Use raw __preempt_count() ops that don't call functions.
+ +               * We can't call functions before disabling preemption which
+ +               * disarm preemption tracing recursions.
+ +               */
+ +              __preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
+ +              barrier();
                 /*
                  * Needs preempt disabled in case user_exit() is traced
                  * and the tracer calls preempt_enable_notrace() causing
@@@ -2971,11 -2926,12 +2970,11 @@@
                 __schedule();
                 exception_exit(prev_ctx);
   
- -              __preempt_count_sub(PREEMPT_ACTIVE);
                 barrier();
+ +              __preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
         } while (need_resched());
   }
- -EXPORT_SYMBOL_GPL(preempt_schedule_context);
- -#endif /* CONFIG_CONTEXT_TRACKING */
+ +EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
   
   #endif /* CONFIG_PREEMPT */
   
@@@ -2995,11 -2951,17 +2994,11 @@@ asmlinkage __visible void __sched preem
         prev_state = exception_enter();
   
         do {
- -              __preempt_count_add(PREEMPT_ACTIVE);
+ +              preempt_active_enter();
                 local_irq_enable();
                 __schedule();
                 local_irq_disable();
- -              __preempt_count_sub(PREEMPT_ACTIVE);
- -
- -              /*
- -               * Check again in case we missed a preemption opportunity
- -               * between schedule and now.
- -               */
- -              barrier();
+ +              preempt_active_exit();
         } while (need_resched());
   
         exception_exit(prev_state);
@@@ -3077,6 -3039,7 +3076,6 @@@ void rt_mutex_setprio(struct task_struc
                 if (!dl_prio(p->normal_prio) ||
                     (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
                         p->dl.dl_boosted = 1;
- -                      p->dl.dl_throttled = 0;
                         enqueue_flag = ENQUEUE_REPLENISH;
                 } else
                         p->dl.dl_boosted = 0;
@@@ -3336,18 -3299,15 +3335,18 @@@ static void __setscheduler_params(struc
   
   /* Actually do priority change: must hold pi & rq lock. */
   static void __setscheduler(struct rq *rq, struct task_struct *p,
- -                         const struct sched_attr *attr)
+ +                         const struct sched_attr *attr, bool keep_boost)
   {
         __setscheduler_params(p, attr);
   
         /*
- -       * If we get here, there was no pi waiters boosting the
- -       * task. It is safe to use the normal prio.
+ +       * Keep a potential priority boosting if called from
+ +       * sched_setscheduler().
          */
- -      p->prio = normal_prio(p);
+ +      if (keep_boost)
+ +              p->prio = rt_mutex_get_effective_prio(p, normal_prio(p));
+ +      else
+ +              p->prio = normal_prio(p);
   
         if (dl_prio(p->prio))
                 p->sched_class = &dl_sched_class;
@@@ -3447,7 -3407,7 +3446,7 @@@ static int __sched_setscheduler(struct 
         int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
                       MAX_RT_PRIO - 1 - attr->sched_priority;
         int retval, oldprio, oldpolicy = -1, queued, running;
- -      int policy = attr->sched_policy;
+ +      int new_effective_prio, policy = attr->sched_policy;
         unsigned long flags;
         const struct sched_class *prev_class;
         struct rq *rq;
@@@ -3629,14 -3589,15 +3628,14 @@@ change
         oldprio = p->prio;
   
         /*
- -       * Special case for priority boosted tasks.
- -       *
- -       * If the new priority is lower or equal (user space view)
- -       * than the current (boosted) priority, we just store the new
+ +       * Take priority boosted tasks into account. If the new
+ +       * effective priority is unchanged, we just store the new
          * normal parameters and do not touch the scheduler class and
          * the runqueue. This will be done when the task deboost
          * itself.
          */
- -      if (rt_mutex_check_prio(p, newprio)) {
+ +      new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
+ +      if (new_effective_prio == oldprio) {
                 __setscheduler_params(p, attr);
                 task_rq_unlock(rq, p, &flags);
                 return 0;
@@@ -3650,7 -3611,7 +3649,7 @@@
                 put_prev_task(rq, p);
   
         prev_class = p->sched_class;
- -      __setscheduler(rq, p, attr);
+ +      __setscheduler(rq, p, attr, true);
   
         if (running)
                 p->sched_class->set_curr_task(rq);
@@@ -4425,7 -4386,10 +4424,7 @@@ long __sched io_schedule_timeout(long t
         long ret;
   
         current->in_iowait = 1;
- -      if (old_iowait)
- -              blk_schedule_flush_plug(current);
- -      else
- -              blk_flush_plug(current);
+ +      blk_schedule_flush_plug(current);
   
         delayacct_blkio_start();
         rq = raw_rq();
@@@ -5350,7 -5314,7 +5349,7 @@@ static struct notifier_block migration_
         .priority = CPU_PRI_MIGRATION,
   };
   
- -static void __cpuinit set_cpu_rq_start_time(void)
+ +static void set_cpu_rq_start_time(void)
   {
         int cpu = smp_processor_id();
         struct rq *rq = cpu_rq(cpu);
@@@ -7032,23 -6996,27 +7031,23 @@@ static int cpuset_cpu_inactive(struct n
         unsigned long flags;
         long cpu = (long)hcpu;
         struct dl_bw *dl_b;
+ +      bool overflow;
+ +      int cpus;
   
- -      switch (action & ~CPU_TASKS_FROZEN) {
+ +      switch (action) {
         case CPU_DOWN_PREPARE:
- -              /* explicitly allow suspend */
- -              if (!(action & CPU_TASKS_FROZEN)) {
- -                      bool overflow;
- -                      int cpus;
- -
- -                      rcu_read_lock_sched();
- -                      dl_b = dl_bw_of(cpu);
+ +              rcu_read_lock_sched();
+ +              dl_b = dl_bw_of(cpu);
   
- -                      raw_spin_lock_irqsave(&dl_b->lock, flags);
- -                      cpus = dl_bw_cpus(cpu);
- -                      overflow = __dl_overflow(dl_b, cpus, 0, 0);
- -                      raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+ +              raw_spin_lock_irqsave(&dl_b->lock, flags);
+ +              cpus = dl_bw_cpus(cpu);
+ +              overflow = __dl_overflow(dl_b, cpus, 0, 0);
+ +              raw_spin_unlock_irqrestore(&dl_b->lock, flags);
   
- -                      rcu_read_unlock_sched();
+ +              rcu_read_unlock_sched();
   
- -                      if (overflow)
- -                              return notifier_from_errno(-EBUSY);
- -              }
+ +              if (overflow)
+ +                      return notifier_from_errno(-EBUSY);
                 cpuset_update_active_cpus(false);
                 break;
         case CPU_DOWN_PREPARE_FROZEN:
@@@ -7068,6 -7036,9 +7067,9 @@@ void __init sched_init_smp(void
         alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
         alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
   
+       /* nohz_full won't take effect without isolating the cpus. */
+       tick_nohz_full_add_cpus_to(cpu_isolated_map);
+ 
         sched_init_numa();
   
         /*
@@@ -7104,6 -7075,8 +7106,6 @@@ void __init sched_init_smp(void
   }
   #endif /* CONFIG_SMP */
   
- -const_debug unsigned int sysctl_timer_migration = 1;
- -
   int in_sched_functions(unsigned long addr)
   {
         return in_lock_functions(addr) ||
@@@ -7375,7 -7348,7 +7377,7 @@@ static void normalize_task(struct rq *r
         queued = task_on_rq_queued(p);
         if (queued)
                 dequeue_task(rq, p, 0);
- -      __setscheduler(rq, p, &attr);
+ +      __setscheduler(rq, p, &attr, false);
         if (queued) {
                 enqueue_task(rq, p, 0);
                 resched_curr(rq);
@@@ -7768,11 -7741,11 +7770,11 @@@ static long sched_group_rt_runtime(stru
         return rt_runtime_us;
   }
   
- -static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
+ +static int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
   {
         u64 rt_runtime, rt_period;
   
- -      rt_period = (u64)rt_period_us * NSEC_PER_USEC;
+ +      rt_period = rt_period_us * NSEC_PER_USEC;
         rt_runtime = tg->rt_bandwidth.rt_runtime;
   
         return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
@@@ -8139,8 -8112,10 +8141,8 @@@ static int tg_set_cfs_bandwidth(struct 
   
         __refill_cfs_bandwidth_runtime(cfs_b);
         /* restart the period timer (if active) to handle new period expiry */
- -      if (runtime_enabled && cfs_b->timer_active) {
- -              /* force a reprogram */
- -              __start_cfs_bandwidth(cfs_b, true);
- -      }
+ +      if (runtime_enabled)
+ +              start_cfs_bandwidth(cfs_b);
         raw_spin_unlock_irq(&cfs_b->lock);
   
         for_each_online_cpu(i) {
author	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 23 Jun 2015 02:20:04 +0000 (19:20 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 23 Jun 2015 02:20:04 +0000 (19:20 -0700)
		1	2
include/linux/sched.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/core.c	patch \|	diff1 \|	diff2 \|	blob \| history