ARM: Experimental Frequency-Invariant Load Scaling Patch

author Olivier Cozette <olivier.cozette@arm.com>

Fri, 30 Jan 2015 06:47:57 +0000 (15:47 +0900)

committer Seung-Woo Kim <sw0312.kim@samsung.com>

Wed, 14 Dec 2016 04:41:33 +0000 (13:41 +0900)
author Olivier Cozette <olivier.cozette@arm.com>
Fri, 30 Jan 2015 06:47:57 +0000 (15:47 +0900)
committer Seung-Woo Kim <sw0312.kim@samsung.com>
Wed, 14 Dec 2016 04:41:33 +0000 (13:41 +0900)
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig

index 83f41ad..0ef7ba9 100644 (file)
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -586,6 +586,21 @@ config HMP_VARIABLE_SCALE
                 (1002/1024)^(LOAD_AVG_PERIOD/load_avg_period_ms)
           but it remove intermadiate overflows in computation.
  
+config HMP_FREQUENCY_INVARIANT_SCALE
+       bool "(EXPERIMENTAL) Frequency-Invariant Tracked Load for HMP"
+       depends on HMP_VARIABLE_SCALE && CPU_FREQ
+       help
+         Scales the current load contribution in line with the frequency
+         of the CPU that the task was executed on.
+         In this version, we use a simple linear scale derived from the
+         maximum frequency reported by CPUFreq.
+         Restricting tracked load to be scaled by the CPU's frequency
+         represents the consumption of possible compute capacity
+         (rather than consumption of actual instantaneous capacity as
+         normal) and allows the HMP migration's simple threshold
+         migration strategy to interact more predictably with CPUFreq's
+         asynchronous compute capacity changes.
+
  config NR_CPUS
         int "Maximum number of CPUs (2-4096)"
         range 2 4096
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index 092d164..cfb50c0 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -35,10 +35,17 @@
  #ifdef CONFIG_HMP_VARIABLE_SCALE
  #include <linux/sysfs.h>
  #include <linux/vmalloc.h>
-#endif
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+/* Include the cpufreq header so that we can register notifiers which
+ * let the load scaling track the current CPU frequency
+ */
+#include <linux/cpufreq.h>
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
+#endif /* CONFIG_HMP_VARIABLE_SCALE */
  
  #include "sched.h"
  
+
  /*
   * Targeted preemption latency for CPU-bound tasks:
   * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
@@ -2495,8 +2502,93 @@ static u32 __compute_runnable_contrib(u64 n)
  }
  
  #ifdef CONFIG_HMP_VARIABLE_SCALE
-static u64 hmp_variable_scale_convert(u64 delta);
+
+#define HMP_VARIABLE_SCALE_SHIFT 16ULL
+/* Describes one integer tunable together with its sysfs attribute, the
+ * show/store handlers and optional value converters.
+ */
+struct hmp_global_attr {
+       struct attribute attr;
+       /* sysfs read and write handlers for this attribute */
+       ssize_t (*show)(struct kobject *kobj,
+                       struct attribute *attr, char *buf);
+       ssize_t (*store)(struct kobject *a, struct attribute *b,
+                       const char *c, size_t count);
+       int *value;     /* integer backing the attribute */
+       /* Optional converters between the stored value and its sysfs form;
+        * users in this file pass NULL when no conversion is needed.
+        * NOTE(review): how a negative from_sysfs() result is handled is not
+        * visible here - confirm against the store handler.
+        */
+       int (*to_sysfs)(int);
+       int (*from_sysfs)(int);
+};
+
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+#define HMP_DATA_SYSFS_MAX 4
+#else
+#define HMP_DATA_SYSFS_MAX 3
  #endif
+
+/* Global HMP tunables plus the sysfs bookkeeping used to expose them. */
+struct hmp_data_struct {
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+       /* 0/1 switch for frequency-invariant load scaling; hmp_attr_init()
+        * sets it to 1 by default.
+        */
+       int freqinvar_load_scale_enabled;
+#endif
+       int multiplier; /* used to scale the time delta */
+       /* sysfs group (named "hmp" by hmp_attr_init()) and its attribute
+        * tables, sized by HMP_DATA_SYSFS_MAX. attributes[] has one extra
+        * slot - presumably for a NULL terminator; TODO confirm.
+        */
+       struct attribute_group attr_group;
+       struct attribute *attributes[HMP_DATA_SYSFS_MAX + 1];
+       struct hmp_global_attr attr[HMP_DATA_SYSFS_MAX];
+} hmp_data;
+
+static u64 hmp_variable_scale_convert(u64 delta);
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+/* Frequency-Invariant Load Modification:
+ * Loads are calculated as in PJT's patch; however, we also scale the
+ * current contribution in line with the frequency of the CPU that the
+ * task was executed on.
+ * In this version, we use a simple linear scale derived from the maximum
+ * frequency reported by CPUFreq. As an example:
+ *
+ * Consider that we ran a task for 100% of the previous interval.
+ *
+ * Our CPU was under asynchronous frequency control through one of the
+ * CPUFreq governors.
+ *
+ * The CPUFreq governor reports that it is able to scale the CPU between
+ * 500MHz and 1GHz.
+ *
+ * During the period, the CPU was running at 1GHz.
+ *
+ * In this case, our load contribution for that period is calculated as
+ * 1 * (number_of_active_microseconds)
+ *
+ * This results in our task being able to accumulate maximum load as normal.
+ *
+ *
+ * Consider now that our CPU was executing at 500MHz.
+ *
+ * We now scale the load contribution such that it is calculated as
+ * 0.5 * (number_of_active_microseconds)
+ *
+ * Our task can only record 50% maximum load during this period.
+ *
+ * This represents the task consuming 50% of the CPU's *possible* compute
+ * capacity. However the task did consume 100% of the CPU's *available*
+ * compute capacity which is the value seen by the CPUFreq governor and
+ * user-side CPU Utilization tools.
+ *
+ * Restricting tracked load to be scaled by the CPU's frequency accurately
+ * represents the consumption of possible compute capacity and allows the
+ * HMP migration's simple threshold migration strategy to interact more
+ * predictably with CPUFreq's asynchronous compute capacity changes.
+ */
+#define SCHED_FREQSCALE_SHIFT 10
+/* Per-CPU frequency bookkeeping from which the load scale is derived. */
+struct cpufreq_extents {
+       u32 curr_scale; /* current scale factor; 1024 represents 1.0 */
+       u32 min;        /* policy->min >> SCHED_FREQSCALE_SHIFT */
+       u32 max;        /* policy->max >> SCHED_FREQSCALE_SHIFT */
+       u32 flags;      /* SCHED_LOAD_FREQINVAR_* bits */
+};
+/* Flag set when the governor in use only allows one frequency.
+ * Disables scaling.
+ */
+#define SCHED_LOAD_FREQINVAR_SINGLEFREQ 0x01
+
+static struct cpufreq_extents freq_scale[CONFIG_NR_CPUS];
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
+#endif /* CONFIG_HMP_VARIABLE_SCALE */
+
  /* We can represent the historical contribution to runnable average as the
   * coefficients of a geometric series.  To do this we sub-divide our runnable
   * history into segments of approximately 1ms (1024us); label the segment that
@@ -2533,6 +2625,12 @@ static __always_inline int __update_entity_runnable_avg(u64 now, int cpu,
         u32 runnable_contrib;
         int delta_w, decayed = 0;
         unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu);
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+       u64 scaled_delta;
+       u32 scaled_runnable_contrib;
+       int scaled_delta_w;
+       u32 curr_scale = 1024;
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
  
         delta = now - sa->last_runnable_update;
  #ifdef CONFIG_HMP_VARIABLE_SCALE
@@ -2556,6 +2654,12 @@ static __always_inline int __update_entity_runnable_avg(u64 now, int cpu,
                 return 0;
         sa->last_runnable_update = now;
  
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+       /* retrieve scale factor for load */
+       if (hmp_data.freqinvar_load_scale_enabled)
+               curr_scale = freq_scale[cpu].curr_scale;
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
+
         /* delta_w is the amount already accumulated against our next period */
         delta_w = sa->avg_period % 1024;
         if (delta + delta_w >= 1024) {
@@ -2568,10 +2672,20 @@ static __always_inline int __update_entity_runnable_avg(u64 now, int cpu,
                  * period and accrue it.
                  */
                 delta_w = 1024 - delta_w;
+               /* scale runnable time if necessary */
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+               scaled_delta_w = (delta_w * curr_scale)
+                               >> SCHED_FREQSCALE_SHIFT;
+               if (runnable)
+                       sa->runnable_avg_sum += scaled_delta_w;
+               if (running)
+                       sa->usage_avg_sum += scaled_delta_w;
+#else
                 if (runnable)
                         sa->runnable_avg_sum += delta_w;
                 if (running)
                         sa->usage_avg_sum += delta_w;
+#endif /* #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
                 if (running)
                         sa->running_avg_sum += delta_w * scale_freq
                                 >> SCHED_CAPACITY_SHIFT;
@@ -2583,6 +2697,7 @@ static __always_inline int __update_entity_runnable_avg(u64 now, int cpu,
                 periods = delta / 1024;
                 delta %= 1024;
  
+               /* decay the load we have accumulated so far */
                 sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
                                                   periods + 1);
                 sa->running_avg_sum = decay_load(sa->running_avg_sum,
@@ -2591,12 +2706,26 @@ static __always_inline int __update_entity_runnable_avg(u64 now, int cpu,
                                                      periods + 1);
                 sa->usage_avg_sum = decay_load(sa->usage_avg_sum, periods + 1);
  
+               /* add the contribution from this period */
                 /* Efficiently calculate \sum (1..n_period) 1024*y^i */
                 runnable_contrib = __compute_runnable_contrib(periods);
+               /* Apply load scaling if necessary.
+                * Note that multiplying the whole series is same as
+                * multiplying all terms
+                */
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+               scaled_runnable_contrib = (runnable_contrib * curr_scale)
+                               >> SCHED_FREQSCALE_SHIFT;
+               if (runnable)
+                       sa->runnable_avg_sum += scaled_runnable_contrib;
+               if (running)
+                       sa->usage_avg_sum += scaled_runnable_contrib;
+#else
                 if (runnable)
                         sa->runnable_avg_sum += runnable_contrib;
                 if (running)
                         sa->usage_avg_sum += runnable_contrib;
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
                 if (running)
                         sa->running_avg_sum += runnable_contrib * scale_freq
                                 >> SCHED_CAPACITY_SHIFT;
@@ -2604,10 +2733,19 @@ static __always_inline int __update_entity_runnable_avg(u64 now, int cpu,
         }
  
         /* Remainder of delta accrued against u_0` */
+       /* scale if necessary */
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+       scaled_delta = ((delta * curr_scale) >> SCHED_FREQSCALE_SHIFT);
+       if (runnable)
+               sa->runnable_avg_sum += scaled_delta;
+       if (running)
+               sa->usage_avg_sum += scaled_delta;
+#else
         if (runnable)
                 sa->runnable_avg_sum += delta;
         if (running)
                 sa->usage_avg_sum += delta;
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
         if (running)
                 sa->running_avg_sum += delta * scale_freq
                         >> SCHED_CAPACITY_SHIFT;
@@ -2833,7 +2971,7 @@ static inline void update_entity_load_avg(struct sched_entity *se,
                 now = cfs_rq_clock_task(group_cfs_rq(se));
  
         if (!__update_entity_runnable_avg(now, cpu, &se->avg, se->on_rq,
-                                       cfs_rq->curr == se))
+                       cfs_rq->curr == se))
                 return;
  
         contrib_delta = __update_entity_load_avg_contrib(se);
@@ -5050,27 +5188,6 @@ static inline void hmp_next_down_delay(struct sched_entity *se, int cpu)
   * delta time by 1/22 and setting load_avg_period_ms = 706.
   */
  
-#define HMP_VARIABLE_SCALE_SHIFT 16ULL
-struct hmp_global_attr {
-       struct attribute attr;
-       ssize_t (*show)(struct kobject *kobj,
-                       struct attribute *attr, char *buf);
-       ssize_t (*store)(struct kobject *a, struct attribute *b,
-                       const char *c, size_t count);
-       int *value;
-       int (*to_sysfs)(int);
-       int (*from_sysfs)(int);
-};
-
-#define HMP_DATA_SYSFS_MAX 3
-
-struct hmp_data_struct {
-       int multiplier; /* used to scale the time delta */
-       struct attribute_group attr_group;
-       struct attribute *attributes[HMP_DATA_SYSFS_MAX + 1];
-       struct hmp_global_attr attr[HMP_DATA_SYSFS_MAX];
-} hmp_data;
-
  /*
   * By scaling the delta time it end-up increasing or decrease the
   * growing speed of the per entity load_avg_ratio
@@ -5139,6 +5256,15 @@ static int hmp_theshold_from_sysfs(int value)
         return value;
  }
  
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+/* Validate a value written to the freqinvar switch: 0 = off, 1 = on.
+ * Returns the value unchanged when it is 0 or 1, and -1 otherwise.
+ */
+static int hmp_freqinvar_from_sysfs(int value)
+{
+       return (value == 0 || value == 1) ? value : -1;
+}
+#endif
  static void hmp_attr_add(
         const char *name,
         int *value,
@@ -5184,6 +5310,14 @@ static int hmp_attr_init(void)
                 NULL,
                 hmp_theshold_from_sysfs);
  
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+       /* default frequency-invariant scaling ON */
+       hmp_data.freqinvar_load_scale_enabled = 1;
+       hmp_attr_add("frequency_invariant_load_scale",
+               &hmp_data.freqinvar_load_scale_enabled,
+               NULL,
+               hmp_freqinvar_from_sysfs);
+#endif
         hmp_data.attr_group.name = "hmp";
         hmp_data.attr_group.attrs = hmp_data.attributes;
         ret = sysfs_create_group(kernel_kobj,
@@ -8970,3 +9104,132 @@ __init void init_sched_fair_class(void)
  #endif /* SMP */
  
  }
+
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+/* Linear frequency scale factor in 1/1024 units.
+ *
+ * @min:  lower extent, pre-shifted by SCHED_FREQSCALE_SHIFT (unused here)
+ * @max:  upper extent, pre-shifted by SCHED_FREQSCALE_SHIFT
+ * @curr: current frequency, unshifted
+ *
+ * Because @max has already been divided by 1024, curr / max comes out at
+ * roughly 1024 when @curr equals the unshifted maximum. A policy maximum
+ * below 1024 shifts down to 0; report 1.0 (1024) in that case rather than
+ * dividing by zero.
+ */
+static u32 cpufreq_calc_scale(u32 min, u32 max, u32 curr)
+{
+       if (!max)
+               return 1024;
+
+       return curr / max;
+}
+
+/* CPUFreq transition notifier (per the original note: called once for
+ * each CPU whenever its frequency is changed).
+ *
+ * Notifications flagged CPUFREQ_CONST_LOOPS, and anything other than
+ * CPUFREQ_POSTCHANGE, are ignored. Otherwise the CPU's curr_scale is
+ * pinned to 1.0 (1024) when scaling is switched off or the governor was
+ * recognised as single-frequency, and recomputed from the new frequency
+ * in every other case. Always returns NOTIFY_OK.
+ */
+static int cpufreq_callback(struct notifier_block *nb,
+                                       unsigned long val, void *data)
+{
+       struct cpufreq_freqs *freq = data;
+       struct cpufreq_extents *extents = &freq_scale[freq->cpu];
+
+       if ((freq->flags & CPUFREQ_CONST_LOOPS) || val != CPUFREQ_POSTCHANGE)
+               return NOTIFY_OK;
+
+       if (!hmp_data.freqinvar_load_scale_enabled ||
+           (extents->flags & SCHED_LOAD_FREQINVAR_SINGLEFREQ))
+               extents->curr_scale = 1024;
+       else
+               extents->curr_scale = cpufreq_calc_scale(extents->min,
+                               extents->max, freq->new);
+
+       return NOTIFY_OK;
+}
+
+/* CPUFreq policy notifier: refresh the cached frequency extents.
+ *
+ * Per the original note this is called when the CPUFreq governor is
+ * changed, and only for the CPUs actually changed by userspace; hence
+ * every CPU in policy->cpus is updated here.
+ *
+ * Only CPUFREQ_INCOMPATIBLE events are acted on. For each affected CPU
+ * the shifted min/max extents, the single-frequency flag and the current
+ * scale factor are rewritten. Always returns 0.
+ */
+static int cpufreq_policy_callback(struct notifier_block *nb,
+                                      unsigned long event, void *data)
+{
+       struct cpufreq_policy *policy = data;
+       struct cpufreq_extents *extents;
+       int cpu, single_freq = 0;
+       static const char performance_governor[] = "performance";
+       static const char powersave_governor[] = "powersave";
+
+       if (event != CPUFREQ_INCOMPATIBLE)
+               return 0;
+
+       /* CPUFreq governors do not accurately report the range of
+        * CPU Frequencies they will choose from.
+        * We recognise performance and powersave governors as
+        * single-frequency only. A policy with no governor attached is
+        * treated as multi-frequency instead of being dereferenced.
+        * NOTE(review): confirm whether a NULL governor can reach this
+        * notifier with the cpufreq drivers in use.
+        */
+       if (policy->governor &&
+           (!strncmp(policy->governor->name, performance_governor,
+                       strlen(performance_governor)) ||
+            !strncmp(policy->governor->name, powersave_governor,
+                       strlen(powersave_governor))))
+               single_freq = 1;
+
+       /* Make sure that all CPUs impacted by this policy are
+        * updated since we will only get a notification when the
+        * user explicitly changes the policy on a CPU.
+        */
+       for_each_cpu(cpu, policy->cpus) {
+               extents = &freq_scale[cpu];
+               extents->max = policy->max >> SCHED_FREQSCALE_SHIFT;
+               extents->min = policy->min >> SCHED_FREQSCALE_SHIFT;
+
+               /* Record the governor type even while scaling is switched
+                * off; otherwise a stale flag would survive re-enabling
+                * until the next governor change.
+                */
+               if (single_freq)
+                       extents->flags |= SCHED_LOAD_FREQINVAR_SINGLEFREQ;
+               else
+                       extents->flags &= ~SCHED_LOAD_FREQINVAR_SINGLEFREQ;
+
+               if (!hmp_data.freqinvar_load_scale_enabled || single_freq)
+                       extents->curr_scale = 1024;
+               else
+                       extents->curr_scale = cpufreq_calc_scale(extents->min,
+                                       extents->max, policy->cur);
+       }
+
+       return 0;
+}
+
+/* Registered on CPUFREQ_TRANSITION_NOTIFIER by
+ * register_sched_cpufreq_notifier().
+ */
+static struct notifier_block cpufreq_notifier = {
+       .notifier_call  = cpufreq_callback,
+};
+/* Registered on CPUFREQ_POLICY_NOTIFIER by
+ * register_sched_cpufreq_notifier().
+ */
+static struct notifier_block cpufreq_policy_notifier = {
+       .notifier_call  = cpufreq_policy_callback,
+};
+
+/* Seed freq_scale[] with 1.0 defaults and hook into CPUFreq.
+ *
+ * The policy notifier is registered first and the transition notifier
+ * second. Returns 0 on success, or the first error reported by
+ * cpufreq_register_notifier(). If the second registration fails, the
+ * first is rolled back so that the scale data is not left maintained by
+ * only one of the two notifiers.
+ */
+static int __init register_sched_cpufreq_notifier(void)
+{
+       int cpu, ret;
+
+       /* init safe defaults since there are no policies at registration */
+       for (cpu = 0; cpu < CONFIG_NR_CPUS; cpu++) {
+               freq_scale[cpu].max = 1024;
+               freq_scale[cpu].min = 1024;
+               freq_scale[cpu].curr_scale = 1024;
+       }
+
+       pr_info("sched: registering cpufreq notifiers for scale-invariant loads\n");
+       ret = cpufreq_register_notifier(&cpufreq_policy_notifier,
+                       CPUFREQ_POLICY_NOTIFIER);
+       if (ret)
+               return ret;
+
+       ret = cpufreq_register_notifier(&cpufreq_notifier,
+                       CPUFREQ_TRANSITION_NOTIFIER);
+       if (ret)
+               cpufreq_unregister_notifier(&cpufreq_policy_notifier,
+                               CPUFREQ_POLICY_NOTIFIER);
+
+       return ret;
+}
+
+core_initcall(register_sched_cpufreq_notifier);
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
author	Olivier Cozette <olivier.cozette@arm.com>
	Fri, 30 Jan 2015 06:47:57 +0000 (15:47 +0900)
committer	Seung-Woo Kim <sw0312.kim@samsung.com>
	Wed, 14 Dec 2016 04:41:33 +0000 (13:41 +0900)
arch/arm64/Kconfig		patch \| blob \| history
kernel/sched/fair.c		patch \| blob \| history