sched/fair: add util_est on top of PELT

author Patrick Bellasi <patrick.bellasi@arm.com>

Tue, 31 Oct 2017 18:25:03 +0000 (18:25 +0000)

committer Lukasz Luba <l.luba@partner.samsung.com>

Mon, 10 Sep 2018 08:24:19 +0000 (10:24 +0200)
author Patrick Bellasi <patrick.bellasi@arm.com>
Tue, 31 Oct 2017 18:25:03 +0000 (18:25 +0000)
committer Lukasz Luba <l.luba@partner.samsung.com>
Mon, 10 Sep 2018 08:24:19 +0000 (10:24 +0200)
diff --git a/include/linux/sched.h b/include/linux/sched.h

index b4add1f46cde10ec5c55249e9077fd1f1aa9dce0..5ab58ede33ada061be51acdf4f21c7b4f8bee049 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -338,6 +338,21 @@ struct sched_avg {
         unsigned long                   util_avg;
  };
  
+/**
+ * Utilization estimation for FAIR tasks.
+ *
+ * Support data structure to track an Exponential Weighted Moving Average
+ * (EWMA) of a FAIR task's utilization. New samples are added to the moving
+ * average each time a task completes an activation. The sample weight is
+ * chosen so that the EWMA is relatively insensitive to transient changes
+ * in the task's workload.
+ */
+struct util_est {
+       unsigned long                   last;
+       unsigned long                   ewma;
+#define UTIL_EST_WEIGHT_SHIFT          2
+};
+
  struct sched_statistics {
  #ifdef CONFIG_SCHEDSTATS
         u64                             wait_start;
@@ -562,6 +577,12 @@ struct task_struct {
  
         const struct sched_class        *sched_class;
         struct sched_entity             se;
+       /*
+        * Since se.avg.util_avg is used to update the util_est fields,
+        * util_est can benefit from being placed close to se, which
+        * also defines se.avg as cache aligned.
+        */
+       struct util_est                 util_est;
         struct sched_rt_entity          rt;
  #ifdef CONFIG_CGROUP_SCHED
         struct task_group               *sched_task_group;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c

index 1ca0130ed4f937d8b1abcc0e64fd14ebb3a3cfb7..5ffa8234524a209fdd115c4b5d108dea7a0bf7f4 100644 (file)
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -567,6 +567,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
                         cfs_rq->avg.runnable_load_avg);
         SEQ_printf(m, "  .%-30s: %lu\n", "util_avg",
                         cfs_rq->avg.util_avg);
+       SEQ_printf(m, "  .%-30s: %lu\n", "util_est_runnable",
+                       cfs_rq->util_est_runnable);
         SEQ_printf(m, "  .%-30s: %ld\n", "removed.load_avg",
                         cfs_rq->removed.load_avg);
         SEQ_printf(m, "  .%-30s: %ld\n", "removed.util_avg",
@@ -1018,6 +1020,8 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
         P(se.avg.runnable_load_avg);
         P(se.avg.util_avg);
         P(se.avg.last_update_time);
+       P(util_est.ewma);
+       P(util_est.last);
  #endif
         P(policy);
         P(prio);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index 47b56833a146d097c39522ab0e0b53768cb2d1da..295a393da2b98594893a6b66292f439cc2ef7e88 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -732,6 +732,12 @@ void init_entity_runnable_average(struct sched_entity *se)
         se->runnable_weight = se->load.weight;
  
         /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
+
+       /* Utilization estimation */
+       if (entity_is_task(se)) {
+               task_of(se)->util_est.ewma = 0;
+               task_of(se)->util_est.last = 0;
+       }
  }
  
  static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
@@ -5170,6 +5176,20 @@ static void clear_sd_overutilized(struct sched_domain *sd)
         sd->shared->overutilized = false;
  }
  
+static inline unsigned long task_util(struct task_struct *p);
+static inline unsigned long task_util_est(struct task_struct *p);
+
+static inline void util_est_enqueue(struct task_struct *p)
+{
+       struct cfs_rq *cfs_rq = &task_rq(p)->cfs;
+
+       if (!sched_feat(UTIL_EST))
+               return;
+
+       /* Update root cfs_rq's estimated utilization */
+       cfs_rq->util_est_runnable += task_util_est(p);
+}
+
  /*
   * The enqueue_task method is called before nr_running is
   * increased. Here we update the fair scheduling stats and
@@ -5230,9 +5250,85 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                         set_sd_overutilized(sd);
                 rcu_read_unlock();
         }
+
+       util_est_enqueue(p);
         hrtick_update(rq);
  }
  
+static inline void util_est_dequeue(struct task_struct *p, int flags)
+{
+       struct cfs_rq *cfs_rq = &task_rq(p)->cfs;
+       unsigned long util_last = task_util(p);
+       bool sleep = flags & DEQUEUE_SLEEP;
+       unsigned long ewma;
+       long util_est;
+
+       if (!sched_feat(UTIL_EST))
+               return;
+
+       /*
+        * Update root cfs_rq's estimated utilization
+        *
+        * If *p is the last task then the root cfs_rq's estimated utilization
+        * of a CPU is 0 by definition.
+        *
+        * Otherwise, when removing *p's util_est from its cfs_rq's
+        * util_est_runnable, we should account for cases where this last
+        * activation of *p was longer than the previous ones.
+        * In these cases too, we need to clamp the estimated utilization of
+        * the CPU to 0.
+        */
+       if (cfs_rq->nr_running > 0) {
+               util_est  = cfs_rq->util_est_runnable;
+               util_est -= task_util_est(p);
+               if (util_est < 0)
+                       util_est = 0;
+               cfs_rq->util_est_runnable = util_est;
+       } else {
+               cfs_rq->util_est_runnable = 0;
+       }
+
+       /*
+        * Skip update of task's estimated utilization when the task has not
+        * yet completed an activation, e.g. being migrated.
+        */
+       if (!sleep)
+               return;
+
+       /*
+        * Skip update of task's estimated utilization when its EWMA is already
+        * within ~1% of SCHED_CAPACITY_SCALE of its last activation value.
+        */
+       util_est = p->util_est.ewma;
+       if (abs(util_est - util_last) <= (SCHED_CAPACITY_SCALE / 100))
+               return;
+
+       /*
+        * Update Task's estimated utilization
+        *
+        * When *p completes an activation we can consolidate another sample
+        * about the task size. This is done by storing the last PELT value
+        * for this task and using this value to load another sample in the
+        * exponential weighted moving average:
+        *
+        *      ewma(t) = w *  task_util(p) + (1 - w) * ewma(t-1)
+        *              = w *  task_util(p) + ewma(t-1) - w * ewma(t-1)
+        *              = w * (task_util(p) + ewma(t-1) / w - ewma(t-1))
+        *
+        * Where 'w' is the weight of new samples, which is configured to be
+        * 0.25, i.e. w = 1/4 = 1 / (1 << UTIL_EST_WEIGHT_SHIFT)
+        */
+       p->util_est.last = util_last;
+       ewma = p->util_est.ewma;
+       if (likely(ewma != 0)) {
+               ewma   = util_last + (ewma << UTIL_EST_WEIGHT_SHIFT) - ewma;
+               ewma >>= UTIL_EST_WEIGHT_SHIFT;
+       } else {
+               ewma = util_last;
+       }
+       p->util_est.ewma = ewma;
+}
+
  static void set_next_buddy(struct sched_entity *se);
  
  /*
@@ -5289,6 +5385,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
         if (!se)
                 sub_nr_running(rq, 1);
  
+       util_est_dequeue(p, flags);
         hrtick_update(rq);
  }
  
@@ -6060,7 +6157,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
         return affine;
  }
  
-static inline unsigned long task_util(struct task_struct *p);
  static unsigned long cpu_util_wake(int cpu, struct task_struct *p);
  
  static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
@@ -6521,6 +6617,11 @@ static inline unsigned long task_util(struct task_struct *p)
         return p->se.avg.util_avg;
  }
  
+static inline unsigned long task_util_est(struct task_struct *p)
+{
+       return max(p->util_est.ewma, p->util_est.last);
+}
+
  /*
   * cpu_util_wake: Compute cpu utilization with any contributions from
   * the waking task p removed.
diff --git a/kernel/sched/features.h b/kernel/sched/features.h

index 8be01be159f6df67b5d036fccb90bdfd741fb11e..a57149a927de43759d77c2db5ebccd1fc92d21cf 100644 (file)
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -91,3 +91,8 @@ SCHED_FEAT(WA_BIAS, true)
   * decisions optimizing for energy efficiency.
   */
  SCHED_FEAT(ENERGY_AWARE, false)
+
+/*
+ * UtilEstimation. Use estimated CPU utilization.
+ */
+SCHED_FEAT(UTIL_EST, false)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h

index 0dab02acac26d58c75608556d1b4202b7325abfe..53684a22ae55e8c1898ed0838cbc63f1da126981 100644 (file)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -444,6 +444,7 @@ struct cfs_rq {
          * CFS load tracking
          */
         struct sched_avg avg;
+       unsigned long util_est_runnable;
  #ifndef CONFIG_64BIT
         u64 load_last_update_time_copy;
  #endif
author	Patrick Bellasi <patrick.bellasi@arm.com>
	Tue, 31 Oct 2017 18:25:03 +0000 (18:25 +0000)
committer	Lukasz Luba <l.luba@partner.samsung.com>
	Mon, 10 Sep 2018 08:24:19 +0000 (10:24 +0200)
include/linux/sched.h		patch \| blob \| history
kernel/sched/debug.c		patch \| blob \| history
kernel/sched/fair.c		patch \| blob \| history
kernel/sched/features.h		patch \| blob \| history
kernel/sched/sched.h		patch \| blob \| history