From 00b5974f70f566d52400e20139448cec08a60c7c Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 17 Jan 2018 18:50:27 +0100 Subject: [PATCH] core: propagate TasksMax= on the root slice to sysctls The cgroup "pids" controller is not supported on the root cgroup. However we expose TasksMax= on it, but currently don't actually apply it to anything. Let's correct this: if set, let's propagate things to the right sysctls. This way we can expose TasksMax= on all units in a somewhat sensible way. --- src/core/cgroup.c | 47 +++++++++++++++++++++++++++++++++++++---------- src/core/manager.h | 3 +++ 2 files changed, 40 insertions(+), 10 deletions(-) diff --git a/src/core/cgroup.c b/src/core/cgroup.c index c779859..be03a5f 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -1028,19 +1028,46 @@ static void cgroup_context_apply( } } - if ((apply_mask & CGROUP_MASK_PIDS) && !is_root) { + if (apply_mask & CGROUP_MASK_PIDS) { + + if (is_root) { + /* So, the "pids" controller does not expose anything on the root cgroup, in order not to + * replicate knobs exposed elsewhere needlessly. We abstract this away here however, and when + * the knobs of the root cgroup are modified propagate this to the relevant sysctls. There's a + * non-obvious asymmetry however: unlike the cgroup properties we don't really want to take + * exclusive ownership of the sysctls, but we still want to honour things if the user sets + * limits. Hence we employ sort of a one-way strategy: when the user sets a bounded limit + * through us it counts. When the user afterwards unsets it again (i.e. sets it to unbounded) + * it also counts. But if the user never set a limit through us (i.e. we are the default of + * "unbounded") we leave things unmodified. For this we manage a global boolean that we turn on + * the first time we set a limit. Note that this boolean is flushed out on manager reload, + * which is desirable so that there's an offical way to release control of the sysctl from + * systemd: set the limit to unbounded and reload. */ + + if (c->tasks_max != CGROUP_LIMIT_MAX) { + u->manager->sysctl_pid_max_changed = true; + r = procfs_tasks_set_limit(c->tasks_max); + } else if (u->manager->sysctl_pid_max_changed) + r = procfs_tasks_set_limit(TASKS_MAX); + else + r = 0; - if (c->tasks_max != CGROUP_LIMIT_MAX) { - char buf[DECIMAL_STR_MAX(uint64_t) + 2]; + if (r < 0) + log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, + "Failed to write to tasks limit sysctls: %m"); - sprintf(buf, "%" PRIu64 "\n", c->tasks_max); - r = cg_set_attribute("pids", path, "pids.max", buf); - } else - r = cg_set_attribute("pids", path, "pids.max", "max"); + } else { + if (c->tasks_max != CGROUP_LIMIT_MAX) { + char buf[DECIMAL_STR_MAX(uint64_t) + 2]; - if (r < 0) - log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, - "Failed to set pids.max: %m"); + sprintf(buf, "%" PRIu64 "\n", c->tasks_max); + r = cg_set_attribute("pids", path, "pids.max", buf); + } else + r = cg_set_attribute("pids", path, "pids.max", "max"); + if (r < 0) + log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, + "Failed to set pids.max: %m"); + } } if (apply_bpf) diff --git a/src/core/manager.h b/src/core/manager.h index 1531374..eef08ef 100644 --- a/src/core/manager.h +++ b/src/core/manager.h @@ -267,6 +267,9 @@ struct Manager { /* Have we already printed the taint line if necessary? */ bool taint_logged:1; + /* Have we ever changed the "kernel.pid_max" sysctl? */ + bool sysctl_pid_max_changed:1; + unsigned test_run_flags:8; /* If non-zero, exit with the following value when the systemd -- 2.7.4