core: add cgroup memory controller support on the unified hierarchy (#3315)
authorTejun Heo <htejun@fb.com>
Fri, 27 May 2016 16:10:18 +0000 (09:10 -0700)
committerLennart Poettering <lennart@poettering.net>
Fri, 27 May 2016 16:10:18 +0000 (18:10 +0200)
On the unified hierarchy, the memory controller implements three control knobs -
low, high and max - which enable more usable and versatile control over memory
usage.  This patch implements support for the three control knobs.

* MemoryLow, MemoryHigh and MemoryMax are added for memory.low, memory.high and
  memory.max, respectively.

* As all absolute limits on the unified hierarchy use "max" for no limit, make
  memory limit parse functions accept "max" in addition to "infinity" and
  document "max" for the new knobs.

* Implement compatibility translation between MemoryMax and MemoryLimit.

v2:

- Fixed missing else's in config_parse_memory_limit().
- Fixed missing newline when writing out drop-ins.
- Coding style updates to use "val > 0" instead of "val".
- Minor updates to documentation.

man/systemd.resource-control.xml
src/core/cgroup.c
src/core/cgroup.h
src/core/dbus-cgroup.c
src/core/load-fragment-gperf.gperf.m4
src/core/load-fragment.c
src/shared/bus-unit-util.c
src/systemctl/systemctl.c

index 066f2cc..570619a 100644 (file)
             prefixed ones. On unified hierarchy, IO resource control also applies to buffered writes.</para>
           </listitem>
         </varlistentry>
+        <varlistentry>
+          <term><option>Memory</option></term>
+          <listitem>
+            <para><varname>MemoryMax</varname> replaces <varname>MemoryLimit</varname>. <varname>MemoryLow</varname>
+            and <varname>MemoryHigh</varname> are effective only on unified hierarchy.</para>
+          </listitem>
+        </varlistentry>
       </variablelist>
     </para>
 
       </varlistentry>
 
       <varlistentry>
+        <term><varname>MemoryLow=<replaceable>bytes</replaceable></varname></term>
+
+        <listitem>
+          <para>Specify the best-effort memory usage protection of the executed processes in this unit. If the memory
+          usages of this unit and all its ancestors are below their low boundaries, this unit's memory won't be
+          reclaimed as long as memory can be reclaimed from unprotected units.</para>
+
+          <para>Takes a memory size in bytes. If the value is suffixed with K, M, G or T, the specified memory size is
+          parsed as Kilobytes, Megabytes, Gigabytes, or Terabytes (with the base 1024), respectively. This controls the
+          <literal>memory.low</literal> control group attribute. For details about this control group attribute, see
+          <ulink url="https://www.kernel.org/doc/Documentation/cgroup-v2.txt">cgroup-v2.txt</ulink>.</para>
+
+          <para>Implies <literal>MemoryAccounting=true</literal>.</para>
+
+          <para>This setting is supported only if the unified control group hierarchy is used.</para>
+        </listitem>
+      </varlistentry>
+
+      <varlistentry>
+        <term><varname>MemoryHigh=<replaceable>bytes</replaceable></varname></term>
+
+        <listitem>
+          <para>Specify the high limit on memory usage of the executed processes in this unit. Memory usage may go
+          above the limit if unavoidable, but the processes are heavily slowed down and memory is taken away
+          aggressively in such cases. This is the main mechanism to control memory usage of a unit.</para>
+
+          <para>Takes a memory size in bytes. If the value is suffixed with K, M, G or T, the specified memory size is
+          parsed as Kilobytes, Megabytes, Gigabytes, or Terabytes (with the base 1024), respectively. If assigned the
+          special value <literal>max</literal>, no memory limit is applied. This controls the
+          <literal>memory.high</literal> control group attribute. For details about this control group attribute, see
+          <ulink url="https://www.kernel.org/doc/Documentation/cgroup-v2.txt">cgroup-v2.txt</ulink>.</para>
+
+          <para>Implies <literal>MemoryAccounting=true</literal>.</para>
+
+          <para>This setting is supported only if the unified control group hierarchy is used.</para>
+        </listitem>
+      </varlistentry>
+
+      <varlistentry>
+        <term><varname>MemoryMax=<replaceable>bytes</replaceable></varname></term>
+
+        <listitem>
+          <para>Specify the absolute limit on memory usage of the executed processes in this unit. If memory usage
+          cannot be contained under the limit, the out-of-memory killer is invoked inside the unit. It is recommended to
+          use <varname>MemoryHigh=</varname> as the main control mechanism and use <varname>MemoryMax=</varname> as the
+          last line of defense.</para>
+
+          <para>Takes a memory size in bytes. If the value is suffixed with K, M, G or T, the specified memory size is
+          parsed as Kilobytes, Megabytes, Gigabytes, or Terabytes (with the base 1024), respectively. If assigned the
+          special value <literal>max</literal>, no memory limit is applied. This controls the
+          <literal>memory.max</literal> control group attribute. For details about this control group attribute, see
+          <ulink url="https://www.kernel.org/doc/Documentation/cgroup-v2.txt">cgroup-v2.txt</ulink>.</para>
+
+          <para>Implies <literal>MemoryAccounting=true</literal>.</para>
+
+          <para>This setting is supported only if the unified control group hierarchy is used. Use
+          <varname>MemoryLimit=</varname> on systems using the legacy control group hierarchy.</para>
+        </listitem>
+      </varlistentry>
+
+      <varlistentry>
         <term><varname>MemoryLimit=<replaceable>bytes</replaceable></varname></term>
 
         <listitem>
           url="https://www.kernel.org/doc/Documentation/cgroup-v1/memory.txt">memory.txt</ulink>.</para>
 
           <para>Implies <literal>MemoryAccounting=true</literal>.</para>
+
+          <para>This setting is supported only if the legacy control group hierarchy is used. Use
+          <varname>MemoryMax=</varname> on systems using the unified control group hierarchy.</para>
         </listitem>
       </varlistentry>
 
index 0fb63b1..fbe69df 100644 (file)
@@ -46,7 +46,10 @@ void cgroup_context_init(CGroupContext *c) {
         c->startup_cpu_shares = CGROUP_CPU_SHARES_INVALID;
         c->cpu_quota_per_sec_usec = USEC_INFINITY;
 
-        c->memory_limit = (uint64_t) -1;
+        c->memory_high = CGROUP_LIMIT_MAX;
+        c->memory_max = CGROUP_LIMIT_MAX;
+
+        c->memory_limit = CGROUP_LIMIT_MAX;
 
         c->io_weight = CGROUP_WEIGHT_INVALID;
         c->startup_io_weight = CGROUP_WEIGHT_INVALID;
@@ -147,6 +150,9 @@ void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
                 "%sStartupIOWeight=%" PRIu64 "\n"
                 "%sBlockIOWeight=%" PRIu64 "\n"
                 "%sStartupBlockIOWeight=%" PRIu64 "\n"
+                "%sMemoryLow=%" PRIu64 "\n"
+                "%sMemoryHigh=%" PRIu64 "\n"
+                "%sMemoryMax=%" PRIu64 "\n"
                 "%sMemoryLimit=%" PRIu64 "\n"
                 "%sTasksMax=%" PRIu64 "\n"
                 "%sDevicePolicy=%s\n"
@@ -163,6 +169,9 @@ void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
                 prefix, c->startup_io_weight,
                 prefix, c->blockio_weight,
                 prefix, c->startup_blockio_weight,
+                prefix, c->memory_low,
+                prefix, c->memory_high,
+                prefix, c->memory_max,
                 prefix, c->memory_limit,
                 prefix, c->tasks_max,
                 prefix, cgroup_device_policy_to_string(c->device_policy),
@@ -496,6 +505,23 @@ static unsigned cgroup_apply_blkio_device_limit(const char *path, const char *de
         return n;
 }
 
+static bool cgroup_context_has_unified_memory_config(CGroupContext *c) { /* Reports whether any of the memory_low/memory_high/memory_max fields of c carries a non-default-looking value. */
+        return c->memory_low > 0 || c->memory_high != CGROUP_LIMIT_MAX || c->memory_max != CGROUP_LIMIT_MAX; /* low counts when > 0; high and max count when they differ from CGROUP_LIMIT_MAX */
+}
+
+static void cgroup_apply_unified_memory_limit(const char *path, const char *file, uint64_t v) { /* Writes v to attribute `file` of cgroup `path` via the "memory" controller; failures are logged here and not returned. */
+        char buf[DECIMAL_STR_MAX(uint64_t) + 1] = "max"; /* stays "max" (without a trailing newline) when v == CGROUP_LIMIT_MAX */
+        int r;
+
+        if (v != CGROUP_LIMIT_MAX)
+                xsprintf(buf, "%" PRIu64 "\n", v); /* any other value is written as decimal digits followed by a newline */
+
+        r = cg_set_attribute("memory", path, file, buf);
+        if (r < 0)
+                log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, /* -ENOENT/-EROFS/-EACCES are logged at LOG_DEBUG, every other error at LOG_WARNING */
+                               "Failed to set %s on %s: %m", file, path);
+}
+
 void cgroup_context_apply(CGroupContext *c, CGroupMask mask, const char *path, ManagerState state) {
         bool is_root;
         int r;
@@ -662,26 +688,30 @@ void cgroup_context_apply(CGroupContext *c, CGroupMask mask, const char *path, M
         }
 
         if ((mask & CGROUP_MASK_MEMORY) && !is_root) {
-                if (c->memory_limit != (uint64_t) -1) {
-                        char buf[DECIMAL_STR_MAX(uint64_t) + 1];
-
-                        sprintf(buf, "%" PRIu64 "\n", c->memory_limit);
+                if (cg_unified() > 0) {
+                        uint64_t max = c->memory_max;
 
-                        if (cg_unified() <= 0)
-                                r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
+                        if (cgroup_context_has_unified_memory_config(c))
+                                max = c->memory_max;
                         else
-                                r = cg_set_attribute("memory", path, "memory.max", buf);
+                                max = c->memory_limit;
 
+                        cgroup_apply_unified_memory_limit(path, "memory.low", c->memory_low);
+                        cgroup_apply_unified_memory_limit(path, "memory.high", c->memory_high);
+                        cgroup_apply_unified_memory_limit(path, "memory.max", max);
                 } else {
-                        if (cg_unified() <= 0)
-                                r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");
+                        char buf[DECIMAL_STR_MAX(uint64_t) + 1];
+
+                        if (c->memory_limit != CGROUP_LIMIT_MAX)
+                                xsprintf(buf, "%" PRIu64 "\n", c->memory_limit);
                         else
-                                r = cg_set_attribute("memory", path, "memory.max", "max");
-                }
+                                xsprintf(buf, "%" PRIu64 "\n", c->memory_max);
 
-                if (r < 0)
-                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
-                                       "Failed to set memory.limit_in_bytes/memory.max on %s: %m", path);
+                        r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
+                        if (r < 0)
+                                log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
+                                               "Failed to set memory.limit_in_bytes on %s: %m", path);
+                }
         }
 
         if ((mask & CGROUP_MASK_DEVICES) && !is_root) {
@@ -778,7 +808,8 @@ CGroupMask cgroup_context_get_mask(CGroupContext *c) {
                 mask |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO;
 
         if (c->memory_accounting ||
-            c->memory_limit != (uint64_t) -1)
+            c->memory_limit != CGROUP_LIMIT_MAX ||
+            cgroup_context_has_unified_memory_config(c))
                 mask |= CGROUP_MASK_MEMORY;
 
         if (c->device_allow ||
index 2b1edba..ff87adf 100644 (file)
@@ -94,6 +94,10 @@ struct CGroupContext {
         LIST_HEAD(CGroupIODeviceWeight, io_device_weights);
         LIST_HEAD(CGroupIODeviceLimit, io_device_limits);
 
+        uint64_t memory_low;
+        uint64_t memory_high;
+        uint64_t memory_max;
+
         /* For legacy hierarchies */
         uint64_t cpu_shares;
         uint64_t startup_cpu_shares;
index d605358..27050b4 100644 (file)
@@ -228,6 +228,9 @@ const sd_bus_vtable bus_cgroup_vtable[] = {
         SD_BUS_PROPERTY("BlockIOReadBandwidth", "a(st)", property_get_blockio_device_bandwidths, 0, 0),
         SD_BUS_PROPERTY("BlockIOWriteBandwidth", "a(st)", property_get_blockio_device_bandwidths, 0, 0),
         SD_BUS_PROPERTY("MemoryAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, memory_accounting), 0),
+        SD_BUS_PROPERTY("MemoryLow", "t", NULL, offsetof(CGroupContext, memory_low), 0),
+        SD_BUS_PROPERTY("MemoryHigh", "t", NULL, offsetof(CGroupContext, memory_high), 0),
+        SD_BUS_PROPERTY("MemoryMax", "t", NULL, offsetof(CGroupContext, memory_max), 0),
         SD_BUS_PROPERTY("MemoryLimit", "t", NULL, offsetof(CGroupContext, memory_limit), 0),
         SD_BUS_PROPERTY("DevicePolicy", "s", property_get_cgroup_device_policy, offsetof(CGroupContext, device_policy), 0),
         SD_BUS_PROPERTY("DeviceAllow", "a(ss)", property_get_device_allow, 0, 0),
@@ -826,6 +829,31 @@ int bus_cgroup_set_property(
 
                 return 1;
 
+        } else if (STR_IN_SET(name, "MemoryLow", "MemoryHigh", "MemoryMax")) {
+                uint64_t v;
+
+                r = sd_bus_message_read(message, "t", &v);
+                if (r < 0)
+                        return r;
+
+                if (mode != UNIT_CHECK) {
+                        if (streq(name, "MemoryLow"))
+                                c->memory_low = v;
+                        else if (streq(name, "MemoryHigh"))
+                                c->memory_high = v;
+                        else
+                                c->memory_max = v;
+
+                        unit_invalidate_cgroup(u, CGROUP_MASK_MEMORY);
+
+                        if (v == CGROUP_LIMIT_MAX)
+                                unit_write_drop_in_private_format(u, mode, name, "%s=max\n", name);
+                        else
+                                unit_write_drop_in_private_format(u, mode, name, "%s=%" PRIu64 "\n", name, v);
+                }
+
+                return 1;
+
         } else if (streq(name, "MemoryLimit")) {
                 uint64_t limit;
 
index 8193418..00bdc23 100644 (file)
@@ -117,6 +117,9 @@ $1.CPUShares,                    config_parse_cpu_shares,            0,
 $1.StartupCPUShares,             config_parse_cpu_shares,            0,                             offsetof($1, cgroup_context.startup_cpu_shares)
 $1.CPUQuota,                     config_parse_cpu_quota,             0,                             offsetof($1, cgroup_context)
 $1.MemoryAccounting,             config_parse_bool,                  0,                             offsetof($1, cgroup_context.memory_accounting)
+$1.MemoryLow,                    config_parse_memory_limit,          0,                             offsetof($1, cgroup_context)
+$1.MemoryHigh,                   config_parse_memory_limit,          0,                             offsetof($1, cgroup_context)
+$1.MemoryMax,                    config_parse_memory_limit,          0,                             offsetof($1, cgroup_context)
 $1.MemoryLimit,                  config_parse_memory_limit,          0,                             offsetof($1, cgroup_context)
 $1.DeviceAllow,                  config_parse_device_allow,          0,                             offsetof($1, cgroup_context)
 $1.DevicePolicy,                 config_parse_device_policy,         0,                             offsetof($1, cgroup_context.device_policy)
index 86b4fb0..09d3f65 100644 (file)
@@ -2793,21 +2793,26 @@ int config_parse_memory_limit(
                 void *userdata) {
 
         CGroupContext *c = data;
-        uint64_t bytes;
+        uint64_t bytes = CGROUP_LIMIT_MAX;
         int r;
 
-        if (isempty(rvalue) || streq(rvalue, "infinity")) {
-                c->memory_limit = (uint64_t) -1;
-                return 0;
+        if (!isempty(rvalue) && !streq(rvalue, "infinity") && !streq(rvalue, "max")) {
+                r = parse_size(rvalue, 1024, &bytes);
+                if (r < 0 || bytes < 1) {
+                        log_syntax(unit, LOG_ERR, filename, line, r, "Memory limit '%s' invalid. Ignoring.", rvalue);
+                        return 0;
+                }
         }
 
-        r = parse_size(rvalue, 1024, &bytes);
-        if (r < 0 || bytes < 1) {
-                log_syntax(unit, LOG_ERR, filename, line, r, "Memory limit '%s' invalid. Ignoring.", rvalue);
-                return 0;
-        }
+        if (streq(lvalue, "MemoryLow"))
+                c->memory_low = bytes;
+        else if (streq(lvalue, "MemoryHigh"))
+                c->memory_high = bytes;
+        else if (streq(lvalue, "MemoryMax"))
+                c->memory_max = bytes;
+        else
+                c->memory_limit = bytes;
 
-        c->memory_limit = bytes;
         return 0;
 }
 
index f68c4a4..502e98d 100644 (file)
@@ -166,11 +166,11 @@ int bus_append_unit_property_assignment(sd_bus_message *m, const char *assignmen
 
                 r = sd_bus_message_append(m, "v", "b", r);
 
-        } else if (streq(field, "MemoryLimit")) {
+        } else if (STR_IN_SET(field, "MemoryLow", "MemoryHigh", "MemoryMax", "MemoryLimit")) {
                 uint64_t bytes;
 
-                if (isempty(eq) || streq(eq, "infinity"))
-                        bytes = (uint64_t) -1;
+                if (isempty(eq) || streq(eq, "max") || streq(eq, "infinity"))
+                        bytes = CGROUP_LIMIT_MAX;
                 else {
                         r = parse_size(eq, 1024, &bytes);
                         if (r < 0) {
index 0500593..b2ee00f 100644 (file)
@@ -3493,6 +3493,9 @@ typedef struct UnitStatusInfo {
 
         /* CGroup */
         uint64_t memory_current;
+        uint64_t memory_low;
+        uint64_t memory_high;
+        uint64_t memory_max;
         uint64_t memory_limit;
         uint64_t cpu_usage_nsec;
         uint64_t tasks_current;
@@ -3775,10 +3778,30 @@ static void print_status_info(
 
                 printf("   Memory: %s", format_bytes(buf, sizeof(buf), i->memory_current));
 
-                if (i->memory_limit != (uint64_t) -1)
-                        printf(" (limit: %s)\n", format_bytes(buf, sizeof(buf), i->memory_limit));
-                else
-                        printf("\n");
+                if (i->memory_low > 0 || i->memory_high != CGROUP_LIMIT_MAX || i->memory_max != CGROUP_LIMIT_MAX ||
+                    i->memory_limit != CGROUP_LIMIT_MAX) {
+                        const char *prefix = "";
+
+                        printf(" (");
+                        if (i->memory_low > 0) {
+                                printf("%slow: %s", prefix, format_bytes(buf, sizeof(buf), i->memory_low));
+                                prefix = " ";
+                        }
+                        if (i->memory_high != CGROUP_LIMIT_MAX) {
+                                printf("%shigh: %s", prefix, format_bytes(buf, sizeof(buf), i->memory_high));
+                                prefix = " ";
+                        }
+                        if (i->memory_max != CGROUP_LIMIT_MAX) {
+                                printf("%smax: %s", prefix, format_bytes(buf, sizeof(buf), i->memory_max));
+                                prefix = " ";
+                        }
+                        if (i->memory_limit != CGROUP_LIMIT_MAX) {
+                                printf("%slimit: %s", prefix, format_bytes(buf, sizeof(buf), i->memory_limit));
+                                prefix = " ";
+                        }
+                        printf(")");
+                }
+                printf("\n");
         }
 
         if (i->cpu_usage_nsec != (uint64_t) -1) {
@@ -4007,6 +4030,12 @@ static int status_property(const char *name, sd_bus_message *m, UnitStatusInfo *
                         i->assert_timestamp = (usec_t) u;
                 else if (streq(name, "MemoryCurrent"))
                         i->memory_current = u;
+                else if (streq(name, "MemoryLow"))
+                        i->memory_low = u;
+                else if (streq(name, "MemoryHigh"))
+                        i->memory_high = u;
+                else if (streq(name, "MemoryMax"))
+                        i->memory_max = u;
                 else if (streq(name, "MemoryLimit"))
                         i->memory_limit = u;
                 else if (streq(name, "TasksCurrent"))
@@ -4500,6 +4529,8 @@ static int show_one(
         _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
         UnitStatusInfo info = {
                 .memory_current = (uint64_t) -1,
+                .memory_high = CGROUP_LIMIT_MAX,
+                .memory_max = CGROUP_LIMIT_MAX,
                 .memory_limit = (uint64_t) -1,
                 .cpu_usage_nsec = (uint64_t) -1,
                 .tasks_current = (uint64_t) -1,