}
#ifdef CONFIG_UCLAMP_TASK
+/**
+ * uclamp_mutex: serializes updates of utilization clamp values
+ *
+ * A utilization clamp value update is usually triggered from a user-space
+ * process (slow-path), but it requires synchronization with the scheduler's
+ * (fast-path) enqueue/dequeue operations.
+ * While the fast-path synchronization is protected by the RQ's spinlock,
+ * this mutex ensures that user-space requests are served sequentially.
+ */
+static DEFINE_MUTEX(uclamp_mutex);
+
+/**
+ * uclamp_map: reference counts a utilization "clamp value"
+ * @value: the utilization "clamp value" required
+ * @se_count: the number of scheduling entities requiring the "clamp value"
+ * @se_lock: serializes reference count updates by protecting se_count
+ */
+struct uclamp_map {
+ int value;
+ int se_count;
+ raw_spinlock_t se_lock;
+};
+
+/**
+ * uclamp_maps: maps each SE's "clamp value" to a CPU's "clamp group"
+ *
+ * Since only a limited number of different "clamp values" are supported, we
+ * need to map each different clamp value into a "clamp group" (group_id) to
+ * be used by the per-CPU accounting in the fast-path, when tasks are
+ * enqueued and dequeued.
+ * We also support different kinds of utilization clamping, for example min
+ * and max utilization, each representing what we call a "clamp index"
+ * (clamp_id).
+ *
+ * A matrix is thus required to map "clamp values" to "clamp groups"
+ * (group_id), for each "clamp index" (clamp_id), where:
+ * - rows are indexed by clamp_id and they collect the clamp groups for a
+ * given clamp index
+ * - columns are indexed by group_id and they collect the clamp values which
+ * map to that clamp group
+ *
+ * Thus, the column index of a given (clamp_id, value) pair represents the
+ * clamp group (group_id) used by the fast-path's per-CPU accounting.
+ *
+ * NOTE: the first clamp group (group_id=0) is reserved for the tracking of
+ * non-clamped tasks. Thus we allocate one more slot than the value of
+ * CONFIG_UCLAMP_GROUPS_COUNT.
+ *
+ * Here is the map layout and, right below, how entries are accessed by the
+ * following code.
+ *
+ * uclamp_maps is a matrix of
+ * +------- UCLAMP_CNT by CONFIG_UCLAMP_GROUPS_COUNT+1 entries
+ * | |
+ * | /---------------+---------------\
+ * | +------------+ +------------+
+ * | / UCLAMP_MIN | value | | value |
+ * | | | se_count |...... | se_count |
+ * | | +------------+ +------------+
+ * +--+ +------------+ +------------+
+ * | | value | | value |
+ * \ UCLAMP_MAX | se_count |...... | se_count |
+ * +-----^------+ +----^-------+
+ * | |
+ * uc_map = + |
+ * &uclamp_maps[clamp_id][0] +
+ * clamp_value =
+ * uc_map[group_id].value
+ */
+static struct uclamp_map uclamp_maps[UCLAMP_CNT]
+ [CONFIG_UCLAMP_GROUPS_COUNT + 1];
+
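+/*
+ * Example (illustrative sketch, with a made-up group index): looking up the
+ * clamp value currently tracked by group 2 of the minimum utilization clamp
+ * index boils down to:
+ *
+ *     struct uclamp_map *uc_map = &uclamp_maps[UCLAMP_MIN][0];
+ *     int clamp_value = uc_map[2].value;
+ *
+ * where a clamp_value of UCLAMP_NOT_VALID means that group 2 is not
+ * (currently) tracking any clamp value.
+ */
+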
+/**
+ * uclamp_group_available: checks if a clamp group is available
+ * @clamp_id: the utilization clamp index (i.e. min or max clamp)
+ * @group_id: the group index in the given clamp_id
+ *
+ * A clamp group is not free if there is at least one SE using the clamp
+ * value it maps for the specified clamp_id. These SEs are reference counted
+ * by the se_count of a uclamp_map entry.
+ *
+ * Return: true if there are no SEs mapped on the specified clamp
+ * index and group
+ */
+static inline bool uclamp_group_available(int clamp_id, int group_id)
+{
+ struct uclamp_map *uc_map = &uclamp_maps[clamp_id][0];
+
+ return (uc_map[group_id].value == UCLAMP_NOT_VALID);
+}
+
+/**
+ * uclamp_group_init: maps a clamp value on a specified clamp group
+ * @clamp_id: the utilization clamp index (i.e. min or max clamp)
+ * @group_id: the group index to map a given clamp_value
+ * @clamp_value: the utilization clamp value to map
+ *
+ * Initializes a clamp group to track tasks from the fast-path.
+ * Each different clamp value, for a given clamp index (i.e. min/max
+ * utilization clamp), is mapped by a clamp group whose index is used by the
+ * fast-path code to keep track of RUNNABLE tasks requiring a certain clamp
+ * value.
+ */
+static inline void uclamp_group_init(int clamp_id, int group_id,
+ unsigned int clamp_value)
+{
+ struct uclamp_map *uc_map = &uclamp_maps[clamp_id][0];
+
+ uc_map[group_id].value = clamp_value;
+ uc_map[group_id].se_count = 0;
+}
+
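+/*
+ * Example (illustrative sketch, with made-up values): after
+ *
+ *     uclamp_group_init(UCLAMP_MAX, 3, 768);
+ *
+ * group 3 of the maximum utilization clamp index maps the value 768 with
+ * zero referencing SEs, and uclamp_group_available(UCLAMP_MAX, 3) returns
+ * false until the group is reset.
+ */
+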
+/**
+ * uclamp_group_reset: resets a specified clamp group
+ * @clamp_id: the utilization clamp index (i.e. min or max clamping)
+ * @group_id: the group index to release
+ *
+ * A clamp group can be reset every time there are no more SEs using the
+ * clamp value it maps for a given clamp index.
+ */
+static inline void uclamp_group_reset(int clamp_id, int group_id)
+{
+ uclamp_group_init(clamp_id, group_id, UCLAMP_NOT_VALID);
+}
+
+/**
+ * uclamp_group_find: finds the group index of a utilization clamp group
+ * @clamp_id: the utilization clamp index (i.e. min or max clamping)
+ * @clamp_value: the utilization clamp value to look up
+ *
+ * Verifies whether a clamp group has already been assigned to the specified
+ * clamp value and returns its index, to be used for accounting.
+ *
+ * Since only a limited number of utilization clamp groups are supported, if
+ * no group has been assigned to the specified value yet, a free group is
+ * assigned, if available.
+ * Otherwise an error is returned, meaning that an additional clamp value is
+ * not (currently) supported.
+ *
+ * Return: a valid group index on success, -ENOSPC if no group is available
+ */
+static int
+uclamp_group_find(int clamp_id, unsigned int clamp_value)
+{
+ struct uclamp_map *uc_map = &uclamp_maps[clamp_id][0];
+ int free_group_id = UCLAMP_NOT_VALID;
+ unsigned int group_id = 0;
+
+ for ( ; group_id <= CONFIG_UCLAMP_GROUPS_COUNT; ++group_id) {
+ /* Keep track of first free clamp group */
+ if (uclamp_group_available(clamp_id, group_id)) {
+ if (free_group_id == UCLAMP_NOT_VALID)
+ free_group_id = group_id;
+ continue;
+ }
+ /* Return index of first group with same clamp value */
+ if (uc_map[group_id].value == clamp_value)
+ return group_id;
+ }
+
+ if (likely(free_group_id != UCLAMP_NOT_VALID))
+ return free_group_id;
+
+ return -ENOSPC;
+}
+
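+/*
+ * Example (illustrative sketch, with made-up values): if the UCLAMP_MIN row
+ * currently maps group 1 to the value 512 and every other group is free,
+ * then:
+ *
+ *     uclamp_group_find(UCLAMP_MIN, 512) returns 1, while
+ *     uclamp_group_find(UCLAMP_MIN, 256) returns the first free group_id.
+ *
+ * If instead all groups were busy tracking different values, the second
+ * call would return -ENOSPC.
+ */
+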
+/**
+ * uclamp_group_put: decrease the reference count for a clamp group
+ * @clamp_id: the clamp index which was affected by the SE
+ * @group_id: the clamp group index to release
+ *
+ * When the clamp value for a SE is changed we decrease the reference count
+ * for the clamp group mapping its current clamp value. A clamp group is
+ * released when there are no more SEs referencing its clamp value.
+ */
+static inline void uclamp_group_put(int clamp_id, int group_id)
+{
+ struct uclamp_map *uc_map = &uclamp_maps[clamp_id][0];
+ unsigned long flags;
+
+ /* Ignore SEs not yet attached */
+ if (group_id == UCLAMP_NOT_VALID)
+ return;
+
+ /* Remove SE from this clamp group */
+ raw_spin_lock_irqsave(&uc_map[group_id].se_lock, flags);
+#ifdef CONFIG_SCHED_DEBUG
+ if (unlikely(uc_map[group_id].se_count == 0)) {
+ WARN(1, "invalid SE clamp group [%d:%d] refcount\n",
+ clamp_id, group_id);
+ uc_map[group_id].se_count = 1;
+ }
+#endif
+ uc_map[group_id].se_count -= 1;
+ if (uc_map[group_id].se_count == 0)
+ uclamp_group_reset(clamp_id, group_id);
+ raw_spin_unlock_irqrestore(&uc_map[group_id].se_lock, flags);
+}
+
+/**
+ * uclamp_group_get: increase the reference count for a clamp group
+ * @clamp_id: the clamp index affected by the task
+ * @next_group_id: the clamp group to refcount
+ * @uc_se: the utilization clamp data for the task
+ * @clamp_value: the new clamp value for the task
+ *
+ * Each time a task changes its utilization clamp value, for a specified
+ * clamp index, we need to find an available clamp group which can be used
+ * to track this new clamp value. The corresponding clamp group index is
+ * then used by the task to reference count the clamp value on CPUs while
+ * the task is enqueued.
+ */
+static inline void uclamp_group_get(int clamp_id, int next_group_id,
+ struct uclamp_se *uc_se,
+ unsigned int clamp_value)
+{
+ struct uclamp_map *uc_map = &uclamp_maps[clamp_id][0];
+ int prev_group_id = uc_se->group_id;
+ unsigned long flags;
+
+ /* Allocate new clamp group for this clamp value */
+ if (uclamp_group_available(clamp_id, next_group_id))
+ uclamp_group_init(clamp_id, next_group_id, clamp_value);
+
+ /* Update SE's clamp values and attach it to new clamp group */
+ raw_spin_lock_irqsave(&uc_map[next_group_id].se_lock, flags);
+ uc_se->value = clamp_value;
+ uc_se->group_id = next_group_id;
+ uc_map[next_group_id].se_count += 1;
+ raw_spin_unlock_irqrestore(&uc_map[next_group_id].se_lock, flags);
+
+ /* Release the previous clamp group */
+ uclamp_group_put(clamp_id, prev_group_id);
+}
+
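+/*
+ * Example (illustrative sketch): the expected usage pattern, as in
+ * __setscheduler_uclamp() below, is to first map the new clamp value and
+ * then refcount it:
+ *
+ *     group_id = uclamp_group_find(UCLAMP_MIN, clamp_value);
+ *     if (group_id >= 0)
+ *             uclamp_group_get(UCLAMP_MIN, group_id,
+ *                              &p->uclamp[UCLAMP_MIN], clamp_value);
+ *
+ * where uclamp_group_get() releases the group previously referenced by
+ * p->uclamp[UCLAMP_MIN], possibly resetting it once its se_count drops
+ * to zero.
+ */
+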
static inline int __setscheduler_uclamp(struct task_struct *p,
const struct sched_attr *attr)
{
- if (attr->sched_util_min > attr->sched_util_max)
- return -EINVAL;
- if (attr->sched_util_max > SCHED_CAPACITY_SCALE)
- return -EINVAL;
+ int group_id[UCLAMP_CNT] = { [0 ... UCLAMP_CNT - 1] = UCLAMP_NOT_VALID };
+ struct uclamp_se *uc_se;
+ int result = 0;
- p->uclamp[UCLAMP_MIN] = attr->sched_util_min;
- p->uclamp[UCLAMP_MAX] = attr->sched_util_max;
+ mutex_lock(&uclamp_mutex);
- return 0;
+ /* Find a valid group_id for each required clamp value */
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
+ int upper_bound = (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX)
+ ? attr->sched_util_max
+ : p->uclamp[UCLAMP_MAX].value;
+
+ if (upper_bound == UCLAMP_NOT_VALID)
+ upper_bound = SCHED_CAPACITY_SCALE;
+ if (attr->sched_util_min > upper_bound) {
+ result = -EINVAL;
+ goto done;
+ }
+
+ result = uclamp_group_find(UCLAMP_MIN, attr->sched_util_min);
+ if (result == -ENOSPC) {
+ pr_err("Cannot allocate more than %d UTIL_MIN clamp groups\n",
+ CONFIG_UCLAMP_GROUPS_COUNT);
+ goto done;
+ }
+ group_id[UCLAMP_MIN] = result;
+ }
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
+ int lower_bound = (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN)
+ ? attr->sched_util_min
+ : p->uclamp[UCLAMP_MIN].value;
+
+ if (lower_bound == UCLAMP_NOT_VALID)
+ lower_bound = 0;
+ if (attr->sched_util_max < lower_bound ||
+ attr->sched_util_max > SCHED_CAPACITY_SCALE) {
+ result = -EINVAL;
+ goto done;
+ }
+
+ result = uclamp_group_find(UCLAMP_MAX, attr->sched_util_max);
+ if (result == -ENOSPC) {
+ pr_err("Cannot allocate more than %d UTIL_MAX clamp groups\n",
+ CONFIG_UCLAMP_GROUPS_COUNT);
+ goto done;
+ }
+ group_id[UCLAMP_MAX] = result;
+ }
+
+ /* Update each required clamp group */
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
+ uc_se = &p->uclamp[UCLAMP_MIN];
+ uclamp_group_get(UCLAMP_MIN, group_id[UCLAMP_MIN],
+ uc_se, attr->sched_util_min);
+ }
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
+ uc_se = &p->uclamp[UCLAMP_MAX];
+ uclamp_group_get(UCLAMP_MAX, group_id[UCLAMP_MAX],
+ uc_se, attr->sched_util_max);
+ }
+
+done:
+ mutex_unlock(&uclamp_mutex);
+
+ return result;
}
+
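+/*
+ * Example (illustrative sketch, user-space side): a task may request a
+ * minimum utilization clamp of 128 via the sched_setattr() syscall:
+ *
+ *     struct sched_attr attr = {
+ *             .size           = sizeof(attr),
+ *             .sched_flags    = SCHED_FLAG_UTIL_CLAMP_MIN,
+ *             .sched_util_min = 128,
+ *     };
+ *     sched_setattr(0, &attr, 0);
+ *
+ * which ends up in __setscheduler_uclamp() above to validate and reference
+ * count the requested clamp value.
+ */
+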
+/**
+ * init_uclamp: initialize data structures required for utilization clamping
+ */
+static void __init init_uclamp(void)
+{
+ int clamp_id;
+
+ /* Init SE's clamp map */
+ for (clamp_id = 0; clamp_id < UCLAMP_CNT; ++clamp_id) {
+ struct uclamp_map *uc_map = &uclamp_maps[clamp_id][0];
+ int group_id = 0;
+
+ for ( ; group_id <= CONFIG_UCLAMP_GROUPS_COUNT; ++group_id) {
+ uc_map[group_id].value = UCLAMP_NOT_VALID;
+ raw_spin_lock_init(&uc_map[group_id].se_lock);
+ }
+ }
+}
+
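+/*
+ * Example (illustrative sketch): right after init_uclamp() every map entry
+ * holds UCLAMP_NOT_VALID, thus:
+ *
+ *     uclamp_group_available(clamp_id, group_id) returns true
+ *
+ * for every (clamp_id, group_id) pair, and the first clamp value request
+ * is free to pick any group.
+ */
+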
#else /* CONFIG_UCLAMP_TASK */
static inline int __setscheduler_uclamp(struct task_struct *p,
const struct sched_attr *attr)
{
return -EINVAL;
}
+static inline void init_uclamp(void) { }
#endif /* CONFIG_UCLAMP_TASK */
static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
#endif
#ifdef CONFIG_UCLAMP_TASK
- p->uclamp[UCLAMP_MIN] = 0;
- p->uclamp[UCLAMP_MAX] = SCHED_CAPACITY_SCALE;
+ p->uclamp[UCLAMP_MIN].value = 0;
+ p->uclamp[UCLAMP_MIN].group_id = UCLAMP_NOT_VALID;
+ p->uclamp[UCLAMP_MAX].value = SCHED_CAPACITY_SCALE;
+ p->uclamp[UCLAMP_MAX].group_id = UCLAMP_NOT_VALID;
#endif
#ifdef CONFIG_SCHEDSTATS
attr.sched_nice = task_nice(p);
#ifdef CONFIG_UCLAMP_TASK
- attr.sched_util_min = p->uclamp[UCLAMP_MIN];
- attr.sched_util_max = p->uclamp[UCLAMP_MAX];
+ attr.sched_util_min = p->uclamp[UCLAMP_MIN].value;
+ attr.sched_util_max = p->uclamp[UCLAMP_MAX].value;
#endif
rcu_read_unlock();
init_schedstats();
+ init_uclamp();
+
scheduler_running = 1;
}