Merge branches 'sched/domains' and 'sched/clock' into sched/core
[platform/adaptation/renesas_rcar/renesas_kernel.git] / kernel / sched.c
index cf4c953..da1edc8 100644 (file)
@@ -309,8 +309,8 @@ void set_tg_uid(struct user_struct *user)
 
 /*
  * Root task group.
- *     Every UID task group (including init_task_group aka UID-0) will
- *     be a child to this group.
+ *     Every UID task group (including init_task_group aka UID-0) will
+ *     be a child of this group.
  */
 struct task_group root_task_group;
 
@@ -318,7 +318,7 @@ struct task_group root_task_group;
 /* Default task group's sched entity on each cpu */
 static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
 /* Default task group's cfs_rq on each cpu */
-static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
+static DEFINE_PER_CPU(struct cfs_rq, init_tg_cfs_rq) ____cacheline_aligned_in_smp;
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -616,6 +616,7 @@ struct rq {
 
        unsigned char idle_at_tick;
        /* For active balancing */
+       int post_schedule;
        int active_balance;
        int push_cpu;
        /* cpu of this runqueue: */
@@ -693,6 +694,7 @@ static inline int cpu_of(struct rq *rq)
 #define this_rq()              (&__get_cpu_var(runqueues))
 #define task_rq(p)             cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)          (cpu_rq(cpu)->curr)
+#define raw_rq()               (&__raw_get_cpu_var(runqueues))
 
 inline void update_rq_clock(struct rq *rq)
 {
@@ -1513,28 +1515,35 @@ static unsigned long cpu_avg_load_per_task(int cpu)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
+struct update_shares_data {
+       unsigned long rq_weight[NR_CPUS];
+};
+
+static DEFINE_PER_CPU(struct update_shares_data, update_shares_data);
+
 static void __set_se_shares(struct sched_entity *se, unsigned long shares);
 
 /*
  * Calculate and set the cpu's group shares.
  */
-static void
-update_group_shares_cpu(struct task_group *tg, int cpu,
-                       unsigned long sd_shares, unsigned long sd_rq_weight)
+static void update_group_shares_cpu(struct task_group *tg, int cpu,
+                                   unsigned long sd_shares,
+                                   unsigned long sd_rq_weight,
+                                   struct update_shares_data *usd)
 {
-       unsigned long shares;
-       unsigned long rq_weight;
-
-       if (!tg->se[cpu])
-               return;
+       unsigned long shares, rq_weight;
+       int boost = 0;
 
-       rq_weight = tg->cfs_rq[cpu]->rq_weight;
+       rq_weight = usd->rq_weight[cpu];
+       if (!rq_weight) {
+               boost = 1;
+               rq_weight = NICE_0_LOAD;
+       }
 
        /*
-        *           \Sum shares * rq_weight
-        * shares =  -----------------------
-        *               \Sum rq_weight
-        *
+        *             \Sum_j shares_j * rq_weight_i
+        * shares_i =  -----------------------------
+        *                  \Sum_j rq_weight_j
         */
        shares = (sd_shares * rq_weight) / sd_rq_weight;
        shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
@@ -1545,8 +1554,8 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
                unsigned long flags;
 
                spin_lock_irqsave(&rq->lock, flags);
-               tg->cfs_rq[cpu]->shares = shares;
-
+               tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
+               tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
                __set_se_shares(tg->se[cpu], shares);
                spin_unlock_irqrestore(&rq->lock, flags);
        }
@@ -1559,22 +1568,30 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
  */
 static int tg_shares_up(struct task_group *tg, void *data)
 {
-       unsigned long weight, rq_weight = 0;
-       unsigned long shares = 0;
+       unsigned long weight, rq_weight = 0, shares = 0;
+       struct update_shares_data *usd;
        struct sched_domain *sd = data;
+       unsigned long flags;
        int i;
 
+       if (!tg->se[0])
+               return 0;
+
+       local_irq_save(flags);
+       usd = &__get_cpu_var(update_shares_data);
+
        for_each_cpu(i, sched_domain_span(sd)) {
+               weight = tg->cfs_rq[i]->load.weight;
+               usd->rq_weight[i] = weight;
+
                /*
                 * If there are currently no tasks on the cpu pretend there
                 * is one of average load so that when a new task gets to
                 * run here it will not get delayed by group starvation.
                 */
-               weight = tg->cfs_rq[i]->load.weight;
                if (!weight)
                        weight = NICE_0_LOAD;
 
-               tg->cfs_rq[i]->rq_weight = weight;
                rq_weight += weight;
                shares += tg->cfs_rq[i]->shares;
        }
@@ -1586,7 +1603,9 @@ static int tg_shares_up(struct task_group *tg, void *data)
                shares = tg->shares;
 
        for_each_cpu(i, sched_domain_span(sd))
-               update_group_shares_cpu(tg, i, shares, rq_weight);
+               update_group_shares_cpu(tg, i, shares, rq_weight, usd);
+
+       local_irq_restore(flags);
 
        return 0;
 }
@@ -1616,8 +1635,14 @@ static int tg_load_down(struct task_group *tg, void *data)
 
 static void update_shares(struct sched_domain *sd)
 {
-       u64 now = cpu_clock(raw_smp_processor_id());
-       s64 elapsed = now - sd->last_update;
+       s64 elapsed;
+       u64 now;
+
+       if (root_task_group_empty())
+               return;
+
+       now = cpu_clock(raw_smp_processor_id());
+       elapsed = now - sd->last_update;
 
        if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
                sd->last_update = now;
@@ -1627,6 +1652,9 @@ static void update_shares(struct sched_domain *sd)
 
 static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
 {
+       if (root_task_group_empty())
+               return;
+
        spin_unlock(&rq->lock);
        update_shares(sd);
        spin_lock(&rq->lock);
@@ -1634,6 +1662,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
 
 static void update_h_load(long cpu)
 {
+       if (root_task_group_empty())
+               return;
+
        walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
 }
 
@@ -2637,9 +2668,32 @@ void sched_fork(struct task_struct *p, int clone_flags)
        set_task_cpu(p, cpu);
 
        /*
-        * Make sure we do not leak PI boosting priority to the child:
+        * Make sure we do not leak PI boosting priority to the child.
         */
        p->prio = current->normal_prio;
+
+       /*
+        * Revert to default priority/policy on fork if requested.
+        */
+       if (unlikely(p->sched_reset_on_fork)) {
+               if (p->policy == SCHED_FIFO || p->policy == SCHED_RR)
+                       p->policy = SCHED_NORMAL;
+
+               if (p->normal_prio < DEFAULT_PRIO)
+                       p->prio = DEFAULT_PRIO;
+
+               if (PRIO_TO_NICE(p->static_prio) < 0) {
+                       p->static_prio = NICE_TO_PRIO(0);
+                       set_load_weight(p);
+               }
+
+               /*
+                * We don't need the reset flag anymore after the fork. It has
+                * fulfilled its duty:
+                */
+               p->sched_reset_on_fork = 0;
+       }
+
        if (!rt_prio(p->prio))
                p->sched_class = &fair_sched_class;
 
@@ -2796,12 +2850,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 {
        struct mm_struct *mm = rq->prev_mm;
        long prev_state;
-#ifdef CONFIG_SMP
-       int post_schedule = 0;
-
-       if (current->sched_class->needs_post_schedule)
-               post_schedule = current->sched_class->needs_post_schedule(rq);
-#endif
 
        rq->prev_mm = NULL;
 
@@ -2820,10 +2868,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
        finish_arch_switch(prev);
        perf_counter_task_sched_in(current, cpu_of(rq));
        finish_lock_switch(rq, prev);
-#ifdef CONFIG_SMP
-       if (post_schedule)
-               current->sched_class->post_schedule(rq);
-#endif
 
        fire_sched_in_preempt_notifiers(current);
        if (mm)
@@ -2838,6 +2882,42 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
        }
 }
 
+#ifdef CONFIG_SMP
+
+/* assumes rq->lock is held */
+static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
+{
+       if (prev->sched_class->pre_schedule)
+               prev->sched_class->pre_schedule(rq, prev);
+}
+
+/* rq->lock is NOT held, but preemption is disabled */
+static inline void post_schedule(struct rq *rq)
+{
+       if (rq->post_schedule) {
+               unsigned long flags;
+
+               spin_lock_irqsave(&rq->lock, flags);
+               if (rq->curr->sched_class->post_schedule)
+                       rq->curr->sched_class->post_schedule(rq);
+               spin_unlock_irqrestore(&rq->lock, flags);
+
+               rq->post_schedule = 0;
+       }
+}
+
+#else
+
+static inline void pre_schedule(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline void post_schedule(struct rq *rq)
+{
+}
+
+#endif
+
 /**
  * schedule_tail - first thing a freshly forked thread must call.
  * @prev: the thread we just switched away from.
@@ -2848,6 +2928,13 @@ asmlinkage void schedule_tail(struct task_struct *prev)
        struct rq *rq = this_rq();
 
        finish_task_switch(rq, prev);
+
+       /*
+        * FIXME: do we need to worry about rq being invalidated by the
+        * task_switch?
+        */
+       post_schedule(rq);
+
 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
        /* In this case, finish_task_switch does not reenable preemption */
        preempt_enable();
@@ -3379,9 +3466,10 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
 {
        const struct sched_class *class;
 
-       for (class = sched_class_highest; class; class = class->next)
+       for_each_class(class) {
                if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
                        return 1;
+       }
 
        return 0;
 }
@@ -5349,10 +5437,7 @@ need_resched_nonpreemptible:
                switch_count = &prev->nvcsw;
        }
 
-#ifdef CONFIG_SMP
-       if (prev->sched_class->pre_schedule)
-               prev->sched_class->pre_schedule(rq, prev);
-#endif
+       pre_schedule(rq, prev);
 
        if (unlikely(!rq->nr_running))
                idle_balance(cpu, rq);
@@ -5378,6 +5463,8 @@ need_resched_nonpreemptible:
        } else
                spin_unlock_irq(&rq->lock);
 
+       post_schedule(rq);
+
        if (unlikely(reacquire_kernel_lock(current) < 0))
                goto need_resched_nonpreemptible;
 
@@ -6123,17 +6210,25 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
        unsigned long flags;
        const struct sched_class *prev_class = p->sched_class;
        struct rq *rq;
+       int reset_on_fork;
 
        /* may grab non-irq protected spin_locks */
        BUG_ON(in_interrupt());
 recheck:
        /* double check policy once rq lock held */
-       if (policy < 0)
+       if (policy < 0) {
+               reset_on_fork = p->sched_reset_on_fork;
                policy = oldpolicy = p->policy;
-       else if (policy != SCHED_FIFO && policy != SCHED_RR &&
-                       policy != SCHED_NORMAL && policy != SCHED_BATCH &&
-                       policy != SCHED_IDLE)
-               return -EINVAL;
+       } else {
+               reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
+               policy &= ~SCHED_RESET_ON_FORK;
+
+               if (policy != SCHED_FIFO && policy != SCHED_RR &&
+                               policy != SCHED_NORMAL && policy != SCHED_BATCH &&
+                               policy != SCHED_IDLE)
+                       return -EINVAL;
+       }
+
        /*
         * Valid priorities for SCHED_FIFO and SCHED_RR are
         * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
@@ -6177,6 +6272,10 @@ recheck:
                /* can't change other user's priorities */
                if (!check_same_owner(p))
                        return -EPERM;
+
+               /* Normal users shall not clear the sched_reset_on_fork flag */
+               if (p->sched_reset_on_fork && !reset_on_fork)
+                       return -EPERM;
        }
 
        if (user) {
@@ -6220,6 +6319,8 @@ recheck:
        if (running)
                p->sched_class->put_prev_task(rq, p);
 
+       p->sched_reset_on_fork = reset_on_fork;
+
        oldprio = p->prio;
        __setscheduler(rq, p, policy, param->sched_priority);
 
@@ -6336,14 +6437,15 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
        if (p) {
                retval = security_task_getscheduler(p);
                if (!retval)
-                       retval = p->policy;
+                       retval = p->policy
+                               | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
        }
        read_unlock(&tasklist_lock);
        return retval;
 }
 
 /**
- * sys_sched_getscheduler - get the RT priority of a thread
+ * sys_sched_getparam - get the RT priority of a thread
  * @pid: the pid in question.
  * @param: structure containing the RT priority.
  */
@@ -6571,19 +6673,9 @@ static inline int should_resched(void)
 
 static void __cond_resched(void)
 {
-#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
-       __might_sleep(__FILE__, __LINE__);
-#endif
-       /*
-        * The BKS might be reacquired before we have dropped
-        * PREEMPT_ACTIVE, which could trigger a second
-        * cond_resched() call.
-        */
-       do {
-               add_preempt_count(PREEMPT_ACTIVE);
-               schedule();
-               sub_preempt_count(PREEMPT_ACTIVE);
-       } while (need_resched());
+       add_preempt_count(PREEMPT_ACTIVE);
+       schedule();
+       sub_preempt_count(PREEMPT_ACTIVE);
 }
 
 int __sched _cond_resched(void)
@@ -6597,14 +6689,14 @@ int __sched _cond_resched(void)
 EXPORT_SYMBOL(_cond_resched);
 
 /*
- * cond_resched_lock() - if a reschedule is pending, drop the given lock,
+ * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
  * call schedule, and on return reacquire the lock.
  *
  * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
  * operations here to prevent schedule() from being called twice (once via
  * spin_unlock(), once by hand).
  */
-int cond_resched_lock(spinlock_t *lock)
+int __cond_resched_lock(spinlock_t *lock)
 {
        int resched = should_resched();
        int ret = 0;
@@ -6620,9 +6712,9 @@ int cond_resched_lock(spinlock_t *lock)
        }
        return ret;
 }
-EXPORT_SYMBOL(cond_resched_lock);
+EXPORT_SYMBOL(__cond_resched_lock);
 
-int __sched cond_resched_softirq(void)
+int __sched __cond_resched_softirq(void)
 {
        BUG_ON(!in_softirq());
 
@@ -6634,7 +6726,7 @@ int __sched cond_resched_softirq(void)
        }
        return 0;
 }
-EXPORT_SYMBOL(cond_resched_softirq);
+EXPORT_SYMBOL(__cond_resched_softirq);
 
 /**
  * yield - yield the current processor to other threads.
@@ -6658,11 +6750,13 @@ EXPORT_SYMBOL(yield);
  */
 void __sched io_schedule(void)
 {
-       struct rq *rq = &__raw_get_cpu_var(runqueues);
+       struct rq *rq = raw_rq();
 
        delayacct_blkio_start();
        atomic_inc(&rq->nr_iowait);
+       current->in_iowait = 1;
        schedule();
+       current->in_iowait = 0;
        atomic_dec(&rq->nr_iowait);
        delayacct_blkio_end();
 }
@@ -6670,12 +6764,14 @@ EXPORT_SYMBOL(io_schedule);
 
 long __sched io_schedule_timeout(long timeout)
 {
-       struct rq *rq = &__raw_get_cpu_var(runqueues);
+       struct rq *rq = raw_rq();
        long ret;
 
        delayacct_blkio_start();
        atomic_inc(&rq->nr_iowait);
+       current->in_iowait = 1;
        ret = schedule_timeout(timeout);
+       current->in_iowait = 0;
        atomic_dec(&rq->nr_iowait);
        delayacct_blkio_end();
        return ret;
@@ -6992,8 +7088,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 
        if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
                /* Need help from migration thread: drop lock and wait. */
+               struct task_struct *mt = rq->migration_thread;
+
+               get_task_struct(mt);
                task_rq_unlock(rq, &flags);
-               wake_up_process(rq->migration_thread);
+               wake_up_process(mt);
+               put_task_struct(mt);
                wait_for_completion(&req.done);
                tlb_migrate_finish(p->mm);
                return 0;
@@ -7625,7 +7725,7 @@ static int __init migration_init(void)
        migration_call(&migration_notifier, CPU_ONLINE, cpu);
        register_cpu_notifier(&migration_notifier);
 
-       return err;
+       return 0;
 }
 early_initcall(migration_init);
 #endif
@@ -7841,7 +7941,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
        rq->rd = rd;
 
        cpumask_set_cpu(rq->cpu, rd->span);
-       if (cpumask_test_cpu(rq->cpu, cpu_online_mask))
+       if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
                set_rq_online(rq);
 
        spin_unlock_irqrestore(&rq->lock, flags);
@@ -9379,11 +9479,11 @@ void __init sched_init(void)
                 * system cpu resource, based on the weight assigned to root
                 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
                 * by letting tasks of init_task_group sit in a separate cfs_rq
-                * (init_cfs_rq) and having one entity represent this group of
+                * (init_tg_cfs_rq) and having one entity represent this group of
                 * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
                 */
                init_tg_cfs_entry(&init_task_group,
-                               &per_cpu(init_cfs_rq, i),
+                               &per_cpu(init_tg_cfs_rq, i),
                                &per_cpu(init_sched_entity, i), i, 1,
                                root_task_group.se[i]);
 
@@ -9409,6 +9509,7 @@ void __init sched_init(void)
 #ifdef CONFIG_SMP
                rq->sd = NULL;
                rq->rd = NULL;
+               rq->post_schedule = 0;
                rq->active_balance = 0;
                rq->next_balance = jiffies;
                rq->push_cpu = 0;
@@ -9473,13 +9574,20 @@ void __init sched_init(void)
 }
 
 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
-void __might_sleep(char *file, int line)
+static inline int preempt_count_equals(int preempt_offset)
+{
+       int nested = preempt_count() & ~PREEMPT_ACTIVE;
+
+       return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
+}
+
+void __might_sleep(char *file, int line, int preempt_offset)
 {
 #ifdef in_atomic
        static unsigned long prev_jiffy;        /* ratelimiting */
 
-       if ((!in_atomic() && !irqs_disabled()) ||
-                   system_state != SYSTEM_RUNNING || oops_in_progress)
+       if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
+           system_state != SYSTEM_RUNNING || oops_in_progress)
                return;
        if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
                return;