From 74b5fb272f3c6e8c05f6fb63fa6f23db8204b3a6 Mon Sep 17 00:00:00 2001 From: Chris Down Date: Mon, 30 Sep 2019 16:13:32 +0100 Subject: [PATCH] cgroup: Allow checking systemd-internal limits against the kernel We currently don't have any mitigations against another privileged user on the system messing with the cgroup hierarchy, bringing the system out of line with what we've set in systemd. We also don't have any real way to surface this to the user (we do have logs, but you have to know to look in the first place). There are a few possible solutions: 1. Maintaining our own cgroup tree with the new fsopen API and having a read-only copy for everyone else. However, there are some complications on this front, and this may be infeasible in some environments. I'd rate this as a longer term effort that's tangential to this patch. 2. Actively checking for changes with {fa,i}notify and changing them back afterwards to match our configuration again. This is also possible, but it's also good to have a way to do passive monitoring of the situation without taking hard action. Also, currently daemons like senpai do actually need to modify the tree behind systemd's back (although hopefully this should be more integrated soon). This patch implements another option, where one can, on demand, monitor deviations in cgroup memory configuration from systemd's internal state. Currently the only consumer is `systemd-analyze dump`, but the interface is generic enough that it can also be exposed elsewhere later (for example, over D-Bus). Currently only memory limit style properties are supported, but later I also plan to expand this out to other properties that systemd should have ultimate control over. --- src/core/cgroup.c | 120 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 120 insertions(+) diff --git a/src/core/cgroup.c b/src/core/cgroup.c index edd10fc..25c7880 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -208,6 +208,126 @@ void cgroup_context_done(CGroupContext *c) { cpu_set_reset(&c->cpuset_mems); } +static int unit_get_kernel_memory_limit(Unit *u, const char *file, uint64_t *ret) { + _cleanup_free_ char *raw_kval = NULL; + uint64_t kval; + int r; + + assert(u); + + if (!u->cgroup_realized) + return -EOWNERDEAD; + + r = cg_get_attribute("memory", u->cgroup_path, file, &raw_kval); + if (r < 0) + return r; + + if (streq(raw_kval, "max")) { + *ret = CGROUP_LIMIT_MAX; + return 0; + } + + r = safe_atou64(raw_kval, &kval); + if (r < 0) + return r; + + *ret = kval; + + return 0; +} + +static int unit_compare_memory_limit(Unit *u, const char *property_name, uint64_t *ret_unit_value, uint64_t *ret_kernel_value) { + CGroupContext *c; + CGroupMask m; + const char *file; + uint64_t unit_value; + int r; + + /* Compare kernel memcg configuration against our internal systemd state. Unsupported (and will + * return -ENODATA) on cgroup v1. + * + * Returns: + * + * <0: On error. + * 0: If the kernel memory setting doesn't match our configuration. + * >0: If the kernel memory setting matches our configuration. + * + * The following values are only guaranteed to be populated on return >=0: + * + * - ret_unit_value will contain our internal expected value for the unit, page-aligned. + * - ret_kernel_value will contain the actual value presented by the kernel. */ + + assert(u); + + r = cg_all_unified(); + if (r < 0) + return log_debug_errno(r, "Failed to determine cgroup hierarchy version: %m"); + + /* Unsupported on v1. + * + * We don't return ENOENT, since that could actually mask a genuine problem where somebody else has + * silently masked the controller. */ + if (r == 0) + return -ENODATA; + + /* The root slice doesn't have any controller files, so we can't compare anything. */ + if (unit_has_name(u, SPECIAL_ROOT_SLICE)) + return -ENODATA; + + /* It's possible to have MemoryFoo set without systemd wanting to have the memory controller enabled, + * for example, in the case of DisableControllers= or cgroup_disable on the kernel command line. To + * avoid specious errors in these scenarios, check that we even expect the memory controller to be + * enabled at all. */ + m = unit_get_target_mask(u); + if (!FLAGS_SET(m, CGROUP_MASK_MEMORY)) + return -ENODATA; + + c = unit_get_cgroup_context(u); + assert(c); + + if (streq(property_name, "MemoryLow")) { + unit_value = unit_get_ancestor_memory_low(u); + file = "memory.low"; + } else if (streq(property_name, "MemoryMin")) { + unit_value = unit_get_ancestor_memory_min(u); + file = "memory.min"; + } else if (streq(property_name, "MemoryHigh")) { + unit_value = c->memory_high; + file = "memory.high"; + } else if (streq(property_name, "MemoryMax")) { + unit_value = c->memory_max; + file = "memory.max"; + } else if (streq(property_name, "MemorySwapMax")) { + unit_value = c->memory_swap_max; + file = "memory.swap.max"; + } else + return -EINVAL; + + r = unit_get_kernel_memory_limit(u, file, ret_kernel_value); + if (r < 0) + return log_unit_debug_errno(u, r, "Failed to parse %s: %m", file); + + /* It's intended (soon) in a future kernel to not expose cgroup memory limits rounded to page + * boundaries, but instead separate the user-exposed limit, which is whatever userspace told us, from + * our internal page-counting. To support those future kernels, just check the value itself first + * without any page-alignment. */ + if (*ret_kernel_value == unit_value) { + *ret_unit_value = unit_value; + return 1; + } + + /* The current kernel behaviour, by comparison, is that even if you write a particular number of + * bytes into a cgroup memory file, it always returns that number page-aligned down (since the kernel + * internally stores cgroup limits in pages). As such, so long as it aligns properly, everything is + * cricket. */ + if (unit_value != CGROUP_LIMIT_MAX) + unit_value = PAGE_ALIGN_DOWN(unit_value); + + *ret_unit_value = unit_value; + + return *ret_kernel_value == *ret_unit_value; +} + void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) { _cleanup_free_ char *disable_controllers_str = NULL; _cleanup_free_ char *cpuset_cpus = NULL; -- 2.7.4