cgroup: Allow checking systemd-internal limits against the kernel

author Chris Down <chris@chrisdown.name>

Mon, 30 Sep 2019 15:13:32 +0000 (16:13 +0100)

committer Chris Down <chris@chrisdown.name>

Thu, 3 Oct 2019 14:06:25 +0000 (15:06 +0100)
author Chris Down <chris@chrisdown.name>
Mon, 30 Sep 2019 15:13:32 +0000 (16:13 +0100)
committer Chris Down <chris@chrisdown.name>
Thu, 3 Oct 2019 14:06:25 +0000 (15:06 +0100)
diff --git a/src/core/cgroup.c b/src/core/cgroup.c

index edd10fc..25c7880 100644 (file)
--- a/src/core/cgroup.c
+++ b/src/core/cgroup.c
@@ -208,6 +208,126 @@ void cgroup_context_done(CGroupContext *c) {
          cpu_set_reset(&c->cpuset_mems);
  }
  
+static int unit_get_kernel_memory_limit(Unit *u, const char *file, uint64_t *ret) {
+        _cleanup_free_ char *raw_kval = NULL;
+        uint64_t kval;
+        int r;
+
+        assert(u);
+
+        if (!u->cgroup_realized)
+                return -EOWNERDEAD;
+
+        r = cg_get_attribute("memory", u->cgroup_path, file, &raw_kval);
+        if (r < 0)
+                return r;
+
+        if (streq(raw_kval, "max")) {
+                *ret = CGROUP_LIMIT_MAX;
+                return 0;
+        }
+
+        r = safe_atou64(raw_kval, &kval);
+        if (r < 0)
+                return r;
+
+        *ret = kval;
+
+        return 0;
+}
+
+static int unit_compare_memory_limit(Unit *u, const char *property_name, uint64_t *ret_unit_value, uint64_t *ret_kernel_value) {
+        CGroupContext *c;
+        CGroupMask m;
+        const char *file;
+        uint64_t unit_value;
+        int r;
+
+        /* Compare kernel memcg configuration against our internal systemd state. Unsupported (and will
+         * return -ENODATA) on cgroup v1.
+         *
+         * Returns:
+         *
+         * <0: On error.
+         *  0: If the kernel memory setting doesn't match our configuration.
+         * >0: If the kernel memory setting matches our configuration.
+         *
+         * The following values are only guaranteed to be populated on return >=0:
+         *
+         * - ret_unit_value will contain our internal expected value for the unit, page-aligned.
+         * - ret_kernel_value will contain the actual value presented by the kernel. */
+
+        assert(u);
+
+        r = cg_all_unified();
+        if (r < 0)
+                return log_debug_errno(r, "Failed to determine cgroup hierarchy version: %m");
+
+        /* Unsupported on v1.
+         *
+         * We don't return ENOENT, since that could actually mask a genuine problem where somebody else has
+         * silently masked the controller. */
+        if (r == 0)
+                return -ENODATA;
+
+        /* The root slice doesn't have any controller files, so we can't compare anything. */
+        if (unit_has_name(u, SPECIAL_ROOT_SLICE))
+                return -ENODATA;
+
+        /* It's possible to have MemoryFoo set without systemd wanting to have the memory controller enabled,
+         * for example, in the case of DisableControllers= or cgroup_disable on the kernel command line. To
+         * avoid specious errors in these scenarios, check that we even expect the memory controller to be
+         * enabled at all. */
+        m = unit_get_target_mask(u);
+        if (!FLAGS_SET(m, CGROUP_MASK_MEMORY))
+                return -ENODATA;
+
+        c = unit_get_cgroup_context(u);
+        assert(c);
+
+        if (streq(property_name, "MemoryLow")) {
+                unit_value = unit_get_ancestor_memory_low(u);
+                file = "memory.low";
+        } else if (streq(property_name, "MemoryMin")) {
+                unit_value = unit_get_ancestor_memory_min(u);
+                file = "memory.min";
+        } else if (streq(property_name, "MemoryHigh")) {
+                unit_value = c->memory_high;
+                file = "memory.high";
+        } else if (streq(property_name, "MemoryMax")) {
+                unit_value = c->memory_max;
+                file = "memory.max";
+        } else if (streq(property_name, "MemorySwapMax")) {
+                unit_value = c->memory_swap_max;
+                file = "memory.swap.max";
+        } else
+                return -EINVAL;
+
+        r = unit_get_kernel_memory_limit(u, file, ret_kernel_value);
+        if (r < 0)
+                return log_unit_debug_errno(u, r, "Failed to parse %s: %m", file);
+
+        /* It's intended (soon) in a future kernel to not expose cgroup memory limits rounded to page
+         * boundaries, but instead separate the user-exposed limit, which is whatever userspace told us, from
+         * our internal page-counting. To support those future kernels, just check the value itself first
+         * without any page-alignment. */
+        if (*ret_kernel_value == unit_value) {
+                *ret_unit_value = unit_value;
+                return 1;
+        }
+
+        /* The current kernel behaviour, by comparison, is that even if you write a particular number of
+         * bytes into a cgroup memory file, it always returns that number page-aligned down (since the kernel
+         * internally stores cgroup limits in pages). As such, so long as it aligns properly, everything is
+         * cricket. */
+        if (unit_value != CGROUP_LIMIT_MAX)
+                unit_value = PAGE_ALIGN_DOWN(unit_value);
+
+        *ret_unit_value = unit_value;
+
+        return *ret_kernel_value == *ret_unit_value;
+}
+
  void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
          _cleanup_free_ char *disable_controllers_str = NULL;
          _cleanup_free_ char *cpuset_cpus = NULL;
author	Chris Down <chris@chrisdown.name>
	Mon, 30 Sep 2019 15:13:32 +0000 (16:13 +0100)
committer	Chris Down <chris@chrisdown.name>
	Thu, 3 Oct 2019 14:06:25 +0000 (15:06 +0100)