sched: Migrate throttled tasks on HOTPLUG

[platform/adaptation/renesas_rcar/renesas_kernel.git] / kernel / sched.c
diff --git a/kernel/sched.c b/kernel/sched.c

index ccacdbd..3973172 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -196,10 +196,28 @@ static inline int rt_bandwidth_enabled(void)
         return sysctl_sched_rt_runtime >= 0;
  }
  
-static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
+static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
  {
-       ktime_t now;
+       unsigned long delta;
+       ktime_t soft, hard, now;
+
+       for (;;) {
+               if (hrtimer_active(period_timer))
+                       break;
  
+               now = hrtimer_cb_get_time(period_timer);
+               hrtimer_forward(period_timer, now, period);
+
+               soft = hrtimer_get_softexpires(period_timer);
+               hard = hrtimer_get_expires(period_timer);
+               delta = ktime_to_ns(ktime_sub(hard, soft));
+               __hrtimer_start_range_ns(period_timer, soft, delta,
+                                        HRTIMER_MODE_ABS_PINNED, 0);
+       }
+}
+
+static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
+{
         if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
                 return;
  
@@ -207,22 +225,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
                 return;
  
         raw_spin_lock(&rt_b->rt_runtime_lock);
-       for (;;) {
-               unsigned long delta;
-               ktime_t soft, hard;
-
-               if (hrtimer_active(&rt_b->rt_period_timer))
-                       break;
-
-               now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
-               hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
-
-               soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
-               hard = hrtimer_get_expires(&rt_b->rt_period_timer);
-               delta = ktime_to_ns(ktime_sub(hard, soft));
-               __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
-                               HRTIMER_MODE_ABS_PINNED, 0);
-       }
+       start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
         raw_spin_unlock(&rt_b->rt_runtime_lock);
  }
  
@@ -247,6 +250,21 @@ struct cfs_rq;
  
  static LIST_HEAD(task_groups);
  
+struct cfs_bandwidth {
+#ifdef CONFIG_CFS_BANDWIDTH
+       raw_spinlock_t lock;
+       ktime_t period;
+       u64 quota, runtime;
+       s64 hierarchal_quota;
+       u64 runtime_expires;
+
+       int idle, timer_active;
+       struct hrtimer period_timer;
+       struct list_head throttled_cfs_rq;
+
+#endif
+};
+
  /* task group related information */
  struct task_group {
         struct cgroup_subsys_state css;
@@ -278,6 +296,8 @@ struct task_group {
  #ifdef CONFIG_SCHED_AUTOGROUP
         struct autogroup *autogroup;
  #endif
+
+       struct cfs_bandwidth cfs_bandwidth;
  };
  
  /* task_group_lock serializes the addition/removal of task groups */
@@ -311,7 +331,7 @@ struct task_group root_task_group;
  /* CFS-related fields in a runqueue */
  struct cfs_rq {
         struct load_weight load;
-       unsigned long nr_running;
+       unsigned long nr_running, h_nr_running;
  
         u64 exec_clock;
         u64 min_vruntime;
@@ -377,9 +397,106 @@ struct cfs_rq {
  
         unsigned long load_contribution;
  #endif
+#ifdef CONFIG_CFS_BANDWIDTH
+       int runtime_enabled;
+       u64 runtime_expires;
+       s64 runtime_remaining;
+
+       int throttled, throttle_count;
+       struct list_head throttled_list;
+#endif
  #endif
  };
  
+#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_CFS_BANDWIDTH
+static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
+{
+       return &tg->cfs_bandwidth;
+}
+
+static inline u64 default_cfs_period(void);
+static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
+
+static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
+{
+       struct cfs_bandwidth *cfs_b =
+               container_of(timer, struct cfs_bandwidth, period_timer);
+       ktime_t now;
+       int overrun;
+       int idle = 0;
+
+       for (;;) {
+               now = hrtimer_cb_get_time(timer);
+               overrun = hrtimer_forward(timer, now, cfs_b->period);
+
+               if (!overrun)
+                       break;
+
+               idle = do_sched_cfs_period_timer(cfs_b, overrun);
+       }
+
+       return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
+}
+
+static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+{
+       raw_spin_lock_init(&cfs_b->lock);
+       cfs_b->runtime = 0;
+       cfs_b->quota = RUNTIME_INF;
+       cfs_b->period = ns_to_ktime(default_cfs_period());
+
+       INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
+       hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+       cfs_b->period_timer.function = sched_cfs_period_timer;
+}
+
+static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+       cfs_rq->runtime_enabled = 0;
+       INIT_LIST_HEAD(&cfs_rq->throttled_list);
+}
+
+/* requires cfs_b->lock, may release to reprogram timer */
+static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+{
+       /*
+        * The timer may be active because we're trying to set a new bandwidth
+        * period or because we're racing with the tear-down path
+        * (timer_active==0 becomes visible before the hrtimer call-back
+        * terminates).  In either case we ensure that it's re-programmed
+        */
+       while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
+               raw_spin_unlock(&cfs_b->lock);
+               /* ensure cfs_b->lock is available while we wait */
+               hrtimer_cancel(&cfs_b->period_timer);
+
+               raw_spin_lock(&cfs_b->lock);
+               /* if someone else restarted the timer then we're done */
+               if (cfs_b->timer_active)
+                       return;
+       }
+
+       cfs_b->timer_active = 1;
+       start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
+}
+
+static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+{
+       hrtimer_cancel(&cfs_b->period_timer);
+}
+#else
+static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
+static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
+static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
+
+static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
+{
+       return NULL;
+}
+#endif /* CONFIG_CFS_BANDWIDTH */
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
  /* Real-Time classes' related field in a runqueue: */
  struct rt_rq {
         struct rt_prio_array active;
@@ -520,8 +637,6 @@ struct rq {
         int cpu;
         int online;
  
-       unsigned long avg_load_per_task;
-
         u64 rt_avg;
         u64 age_stamp;
         u64 idle_stamp;
@@ -1471,24 +1586,28 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
         update_load_sub(&rq->load, load);
  }
  
-#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
+#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
+                       (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
  typedef int (*tg_visitor)(struct task_group *, void *);
  
  /*
- * Iterate the full tree, calling @down when first entering a node and @up when
- * leaving it for the final time.
+ * Iterate task_group tree rooted at *from, calling @down when first entering a
+ * node and @up when leaving it for the final time.
+ *
+ * Caller must hold rcu_lock or sufficient equivalent.
   */
-static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
+static int walk_tg_tree_from(struct task_group *from,
+                            tg_visitor down, tg_visitor up, void *data)
  {
         struct task_group *parent, *child;
         int ret;
  
-       rcu_read_lock();
-       parent = &root_task_group;
+       parent = from;
+
  down:
         ret = (*down)(parent, data);
         if (ret)
-               goto out_unlock;
+               goto out;
         list_for_each_entry_rcu(child, &parent->children, siblings) {
                 parent = child;
                 goto down;
@@ -1497,19 +1616,29 @@ up:
                 continue;
         }
         ret = (*up)(parent, data);
-       if (ret)
-               goto out_unlock;
+       if (ret || parent == from)
+               goto out;
  
         child = parent;
         parent = parent->parent;
         if (parent)
                 goto up;
-out_unlock:
-       rcu_read_unlock();
-
+out:
         return ret;
  }
  
+/*
+ * Iterate the full tree, calling @down when first entering a node and @up when
+ * leaving it for the final time.
+ *
+ * Caller must hold rcu_lock or sufficient equivalent.
+ */
+
+static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
+{
+       return walk_tg_tree_from(&root_task_group, down, up, data);
+}
+
  static int tg_nop(struct task_group *tg, void *data)
  {
         return 0;
@@ -1569,11 +1698,9 @@ static unsigned long cpu_avg_load_per_task(int cpu)
         unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
  
         if (nr_running)
-               rq->avg_load_per_task = rq->load.weight / nr_running;
-       else
-               rq->avg_load_per_task = 0;
+               return rq->load.weight / nr_running;
  
-       return rq->avg_load_per_task;
+       return 0;
  }
  
  #ifdef CONFIG_PREEMPT
@@ -1806,7 +1933,6 @@ static void activate_task(struct rq *rq, struct task_struct *p, int flags)
                 rq->nr_uninterruptible--;
  
         enqueue_task(rq, p, flags);
-       inc_nr_running(rq);
  }
  
  /*
@@ -1818,7 +1944,6 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
                 rq->nr_uninterruptible++;
  
         dequeue_task(rq, p, flags);
-       dec_nr_running(rq);
  }
  
  #ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -2848,19 +2973,23 @@ void sched_fork(struct task_struct *p)
         p->state = TASK_RUNNING;
  
         /*
+        * Make sure we do not leak PI boosting priority to the child.
+        */
+       p->prio = current->normal_prio;
+
+       /*
          * Revert to default priority/policy on fork if requested.
          */
         if (unlikely(p->sched_reset_on_fork)) {
-               if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {
+               if (task_has_rt_policy(p)) {
                         p->policy = SCHED_NORMAL;
-                       p->normal_prio = p->static_prio;
-               }
-
-               if (PRIO_TO_NICE(p->static_prio) < 0) {
                         p->static_prio = NICE_TO_PRIO(0);
-                       p->normal_prio = p->static_prio;
-                       set_load_weight(p);
-               }
+                       p->rt_priority = 0;
+               } else if (PRIO_TO_NICE(p->static_prio) < 0)
+                       p->static_prio = NICE_TO_PRIO(0);
+
+               p->prio = p->normal_prio = __normal_prio(p);
+               set_load_weight(p);
  
                 /*
                  * We don't need the reset flag anymore after the fork. It has
@@ -2869,11 +2998,6 @@ void sched_fork(struct task_struct *p)
                 p->sched_reset_on_fork = 0;
         }
  
-       /*
-        * Make sure we do not leak PI boosting priority to the child.
-        */
-       p->prio = current->normal_prio;
-
         if (!rt_prio(p->prio))
                 p->sched_class = &fair_sched_class;
  
@@ -4263,7 +4387,7 @@ pick_next_task(struct rq *rq)
          * Optimization: we know that if all tasks are in
          * the fair class we can call that function directly:
          */
-       if (likely(rq->nr_running == rq->cfs.nr_running)) {
+       if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
                 p = fair_sched_class.pick_next_task(rq);
                 if (likely(p))
                         return p;
@@ -6211,6 +6335,30 @@ static void calc_global_load_remove(struct rq *rq)
         rq->calc_load_active = 0;
  }
  
+#ifdef CONFIG_CFS_BANDWIDTH
+static void unthrottle_offline_cfs_rqs(struct rq *rq)
+{
+       struct cfs_rq *cfs_rq;
+
+       for_each_leaf_cfs_rq(rq, cfs_rq) {
+               struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+
+               if (!cfs_rq->runtime_enabled)
+                       continue;
+
+               /*
+                * clock_task is not advancing so we just need to make sure
+                * there's some valid quota amount
+                */
+               cfs_rq->runtime_remaining = cfs_b->quota;
+               if (cfs_rq_throttled(cfs_rq))
+                       unthrottle_cfs_rq(cfs_rq);
+       }
+}
+#else
+static void unthrottle_offline_cfs_rqs(struct rq *rq) {}
+#endif
+
  /*
   * Migrate all tasks from the rq, sleeping tasks will be migrated by
   * try_to_wake_up()->select_task_rq().
@@ -6236,6 +6384,9 @@ static void migrate_tasks(unsigned int dead_cpu)
          */
         rq->stop = NULL;
  
+       /* Ensure any throttled groups are reachable by pick_next_task */
+       unthrottle_offline_cfs_rqs(rq);
+
         for ( ; ; ) {
                 /*
                  * There's this thread running, bail when that's the only
@@ -7978,6 +8129,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
         /* allow initial update_cfs_load() to truncate */
         cfs_rq->load_stamp = 1;
  #endif
+       init_cfs_rq_runtime(cfs_rq);
  
         tg->cfs_rq[cpu] = cfs_rq;
         tg->se[cpu] = se;
@@ -8117,6 +8269,7 @@ void __init sched_init(void)
                  * We achieve this by letting root_task_group's tasks sit
                  * directly in rq->cfs (i.e root_task_group->se[] = NULL).
                  */
+               init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
                 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
  #endif /* CONFIG_FAIR_GROUP_SCHED */
  
@@ -8358,6 +8511,8 @@ static void free_fair_sched_group(struct task_group *tg)
  {
         int i;
  
+       destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
+
         for_each_possible_cpu(i) {
                 if (tg->cfs_rq)
                         kfree(tg->cfs_rq[i]);
@@ -8385,6 +8540,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
  
         tg->shares = NICE_0_LOAD;
  
+       init_cfs_bandwidth(tg_cfs_bandwidth(tg));
+
         for_each_possible_cpu(i) {
                 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
                                       GFP_KERNEL, cpu_to_node(i));
@@ -8660,12 +8817,7 @@ unsigned long sched_group_shares(struct task_group *tg)
  }
  #endif
  
-#ifdef CONFIG_RT_GROUP_SCHED
-/*
- * Ensure that the real time constraints are schedulable.
- */
-static DEFINE_MUTEX(rt_constraints_mutex);
-
+#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
  static unsigned long to_ratio(u64 period, u64 runtime)
  {
         if (runtime == RUNTIME_INF)
@@ -8673,6 +8825,13 @@ static unsigned long to_ratio(u64 period, u64 runtime)
  
         return div64_u64(runtime << 20, period);
  }
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
+/*
+ * Ensure that the real time constraints are schedulable.
+ */
+static DEFINE_MUTEX(rt_constraints_mutex);
  
  /* Must be called with tasklist_lock held */
  static inline int tg_has_rt_tasks(struct task_group *tg)
@@ -8693,7 +8852,7 @@ struct rt_schedulable_data {
         u64 rt_runtime;
  };
  
-static int tg_schedulable(struct task_group *tg, void *data)
+static int tg_rt_schedulable(struct task_group *tg, void *data)
  {
         struct rt_schedulable_data *d = data;
         struct task_group *child;
@@ -8751,16 +8910,22 @@ static int tg_schedulable(struct task_group *tg, void *data)
  
  static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
  {
+       int ret;
+
         struct rt_schedulable_data data = {
                 .tg = tg,
                 .rt_period = period,
                 .rt_runtime = runtime,
         };
  
-       return walk_tg_tree(tg_schedulable, tg_nop, &data);
+       rcu_read_lock();
+       ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
+       rcu_read_unlock();
+
+       return ret;
  }
  
-static int tg_set_bandwidth(struct task_group *tg,
+static int tg_set_rt_bandwidth(struct task_group *tg,
                 u64 rt_period, u64 rt_runtime)
  {
         int i, err = 0;
@@ -8799,7 +8964,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
         if (rt_runtime_us < 0)
                 rt_runtime = RUNTIME_INF;
  
-       return tg_set_bandwidth(tg, rt_period, rt_runtime);
+       return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
  }
  
  long sched_group_rt_runtime(struct task_group *tg)
@@ -8824,7 +8989,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
         if (rt_period == 0)
                 return -EINVAL;
  
-       return tg_set_bandwidth(tg, rt_period, rt_runtime);
+       return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
  }
  
  long sched_group_rt_period(struct task_group *tg)
@@ -9014,6 +9179,225 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
  
         return (u64) scale_load_down(tg->shares);
  }
+
+#ifdef CONFIG_CFS_BANDWIDTH
+static DEFINE_MUTEX(cfs_constraints_mutex);
+
+const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
+const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
+
+static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
+
+static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
+{
+       int i, ret = 0, runtime_enabled;
+       struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
+
+       if (tg == &root_task_group)
+               return -EINVAL;
+
+       /*
+        * Ensure we have at least some amount of bandwidth every period.  This
+        * is to prevent reaching a state of large arrears when throttled via
+        * entity_tick() resulting in prolonged exit starvation.
+        */
+       if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
+               return -EINVAL;
+
+       /*
+        * Likewise, bound things on the other side by preventing insane quota
+        * periods.  This also allows us to normalize in computing quota
+        * feasibility.
+        */
+       if (period > max_cfs_quota_period)
+               return -EINVAL;
+
+       mutex_lock(&cfs_constraints_mutex);
+       ret = __cfs_schedulable(tg, period, quota);
+       if (ret)
+               goto out_unlock;
+
+       runtime_enabled = quota != RUNTIME_INF;
+       raw_spin_lock_irq(&cfs_b->lock);
+       cfs_b->period = ns_to_ktime(period);
+       cfs_b->quota = quota;
+
+       __refill_cfs_bandwidth_runtime(cfs_b);
+       /* restart the period timer (if active) to handle new period expiry */
+       if (runtime_enabled && cfs_b->timer_active) {
+               /* force a reprogram */
+               cfs_b->timer_active = 0;
+               __start_cfs_bandwidth(cfs_b);
+       }
+       raw_spin_unlock_irq(&cfs_b->lock);
+
+       for_each_possible_cpu(i) {
+               struct cfs_rq *cfs_rq = tg->cfs_rq[i];
+               struct rq *rq = rq_of(cfs_rq);
+
+               raw_spin_lock_irq(&rq->lock);
+               cfs_rq->runtime_enabled = runtime_enabled;
+               cfs_rq->runtime_remaining = 0;
+
+               if (cfs_rq_throttled(cfs_rq))
+                       unthrottle_cfs_rq(cfs_rq);
+               raw_spin_unlock_irq(&rq->lock);
+       }
+out_unlock:
+       mutex_unlock(&cfs_constraints_mutex);
+
+       return ret;
+}
+
+int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
+{
+       u64 quota = RUNTIME_INF, period;
+
+       period = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
+       /* usecs in; < 0 keeps RUNTIME_INF, values that would wrap in nsec fail */
+       if (cfs_quota_us > 0 && (u64)cfs_quota_us > ULLONG_MAX / NSEC_PER_USEC)
+               return -EINVAL;
+       if (cfs_quota_us >= 0)
+               quota = (u64)cfs_quota_us * NSEC_PER_USEC;
+       return tg_set_cfs_bandwidth(tg, period, quota);
+}
+
+long tg_get_cfs_quota(struct task_group *tg)
+{
+       u64 quota_us;
+
+       if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF)
+               return -1;
+
+       quota_us = tg_cfs_bandwidth(tg)->quota;
+       do_div(quota_us, NSEC_PER_USEC);
+
+       return quota_us;
+}
+
+int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
+{
+       u64 quota, period;
+
+       /* usecs in; -EINVAL if <= 0 or if the nsec conversion would wrap a u64 */
+       if (cfs_period_us <= 0 || (u64)cfs_period_us > ULLONG_MAX / NSEC_PER_USEC)
+               return -EINVAL;
+
+       period = (u64)cfs_period_us * NSEC_PER_USEC;
+       quota = tg_cfs_bandwidth(tg)->quota;
+       return tg_set_cfs_bandwidth(tg, period, quota);
+}
+
+long tg_get_cfs_period(struct task_group *tg)
+{
+       u64 cfs_period_us;
+
+       cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
+       do_div(cfs_period_us, NSEC_PER_USEC);
+
+       return cfs_period_us;
+}
+
+static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft)
+{
+       return tg_get_cfs_quota(cgroup_tg(cgrp));
+}
+
+static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype,
+                               s64 cfs_quota_us)
+{
+       return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us);
+}
+
+static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft)
+{
+       return tg_get_cfs_period(cgroup_tg(cgrp));
+}
+
+static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
+                               u64 cfs_period_us)
+{
+       return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
+}
+
+struct cfs_schedulable_data {
+       struct task_group *tg;
+       u64 period, quota;
+};
+
+/*
+ * normalize a group's quota/period pair into the ratio computed by to_ratio(),
+ * or RUNTIME_INF when no limit is set; note: units are usecs
+ */
+static u64 normalize_cfs_quota(struct task_group *tg,
+                              struct cfs_schedulable_data *d)
+{
+       u64 quota, period;
+
+       if (tg == d->tg) {
+               period = d->period;
+               quota = d->quota;
+       } else {
+               period = tg_get_cfs_period(tg);
+               quota = tg_get_cfs_quota(tg);
+       }
+
+       /* no limit: RUNTIME_INF in d->quota, -1 from tg_get_cfs_quota() */
+       if (quota == RUNTIME_INF || quota == -1)
+               return RUNTIME_INF;
+
+       return to_ratio(period, quota);
+}
+
+static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
+{
+       struct cfs_schedulable_data *d = data;
+       struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
+       s64 quota = 0, parent_quota = -1;
+
+       if (!tg->parent) {
+               quota = RUNTIME_INF;
+       } else {
+               struct cfs_bandwidth *parent_b = tg_cfs_bandwidth(tg->parent);
+
+               quota = normalize_cfs_quota(tg, d);
+               parent_quota = parent_b->hierarchal_quota;
+
+               /*
+                * ensure max(child_quota) <= parent_quota, inherit when no
+                * limit is set
+                */
+               if (quota == RUNTIME_INF)
+                       quota = parent_quota;
+               else if (parent_quota != RUNTIME_INF && quota > parent_quota)
+                       return -EINVAL;
+       }
+       cfs_b->hierarchal_quota = quota;
+
+       return 0;
+}
+
+static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
+{
+       int ret;
+       struct cfs_schedulable_data data = {
+               .tg = tg,
+               .period = period,
+               .quota = quota,
+       };
+
+       if (quota != RUNTIME_INF) {
+               do_div(data.period, NSEC_PER_USEC);
+               do_div(data.quota, NSEC_PER_USEC);
+       }
+
+       rcu_read_lock();
+       ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
+       rcu_read_unlock();
+
+       return ret;
+}
+#endif /* CONFIG_CFS_BANDWIDTH */
  #endif /* CONFIG_FAIR_GROUP_SCHED */
  
  #ifdef CONFIG_RT_GROUP_SCHED
@@ -9048,6 +9432,18 @@ static struct cftype cpu_files[] = {
                 .write_u64 = cpu_shares_write_u64,
         },
  #endif
+#ifdef CONFIG_CFS_BANDWIDTH
+       {
+               .name = "cfs_quota_us",
+               .read_s64 = cpu_cfs_quota_read_s64,
+               .write_s64 = cpu_cfs_quota_write_s64,
+       },
+       {
+               .name = "cfs_period_us",
+               .read_u64 = cpu_cfs_period_read_u64,
+               .write_u64 = cpu_cfs_period_write_u64,
+       },
+#endif
  #ifdef CONFIG_RT_GROUP_SCHED
         {
                 .name = "rt_runtime_us",
@@ -9357,4 +9753,3 @@ struct cgroup_subsys cpuacct_subsys = {
         .subsys_id = cpuacct_subsys_id,
  };
  #endif /* CONFIG_CGROUP_CPUACCT */
-