core: add two new service settings ProtectKernelTunables= and ProtectControlGroups=
author Lennart Poettering <lennart@poettering.net>
Mon, 22 Aug 2016 16:43:59 +0000 (18:43 +0200)
committer Djalal Harouni <tixxdz@opendz.org>
Sun, 25 Sep 2016 08:18:48 +0000 (10:18 +0200)
If enabled, these will block write access to /sys, /proc/sys and
/sys/fs/cgroup.

man/systemd.exec.xml
src/core/dbus-execute.c
src/core/execute.c
src/core/execute.h
src/core/load-fragment-gperf.gperf.m4
src/core/namespace.c
src/core/namespace.h
src/shared/bus-unit-util.c
src/test/test-ns.c

index bcedebd..07128b4 100644 (file)
       </varlistentry>
 
       <varlistentry>
+        <term><varname>ProtectKernelTunables=</varname></term>
+
+        <listitem><para>Takes a boolean argument. If true, kernel variables accessible through
+        <filename>/proc/sys</filename> and <filename>/sys</filename> will be made read-only to all processes of the
+        unit. Usually, tunable kernel variables should only be written at boot-time, with the
+        <citerefentry><refentrytitle>sysctl.d</refentrytitle><manvolnum>5</manvolnum></citerefentry> mechanism. Almost
+        no services need to write to these at runtime; it is hence recommended to turn this on for most
+        services. Defaults to off.</para></listitem>
+      </varlistentry>
+
+      <varlistentry>
+        <term><varname>ProtectControlGroups=</varname></term>
+
+        <listitem><para>Takes a boolean argument. If true, the Linux Control Groups ("cgroups") hierarchies accessible
+        through <filename>/sys/fs/cgroup</filename> will be made read-only to all processes of the unit. Except for
+        container managers no services should require write access to the control groups hierarchies; it is hence
+        recommended to turn this on for most services. Defaults to off.</para></listitem>
+      </varlistentry>
+
+      <varlistentry>
         <term><varname>MountFlags=</varname></term>
 
         <listitem><para>Takes a mount propagation flag:
index 7e33a2d..eec4500 100644 (file)
@@ -707,6 +707,8 @@ const sd_bus_vtable bus_exec_vtable[] = {
         SD_BUS_PROPERTY("MountFlags", "t", bus_property_get_ulong, offsetof(ExecContext, mount_flags), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("PrivateTmp", "b", bus_property_get_bool, offsetof(ExecContext, private_tmp), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("PrivateDevices", "b", bus_property_get_bool, offsetof(ExecContext, private_devices), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("ProtectKernelTunables", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_tunables), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("ProtectControlGroups", "b", bus_property_get_bool, offsetof(ExecContext, protect_control_groups), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("PrivateNetwork", "b", bus_property_get_bool, offsetof(ExecContext, private_network), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("PrivateUsers", "b", bus_property_get_bool, offsetof(ExecContext, private_users), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("ProtectHome", "s", bus_property_get_protect_home, offsetof(ExecContext, protect_home), SD_BUS_VTABLE_PROPERTY_CONST),
@@ -1072,7 +1074,8 @@ int bus_exec_context_set_transient_property(
                               "IgnoreSIGPIPE", "TTYVHangup", "TTYReset",
                               "PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers",
                               "NoNewPrivileges", "SyslogLevelPrefix", "MemoryDenyWriteExecute",
-                              "RestrictRealtime", "DynamicUser", "RemoveIPC")) {
+                              "RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables",
+                              "ProtectControlGroups")) {
                 int b;
 
                 r = sd_bus_message_read(message, "b", &b);
@@ -1106,6 +1109,10 @@ int bus_exec_context_set_transient_property(
                                 c->dynamic_user = b;
                         else if (streq(name, "RemoveIPC"))
                                 c->remove_ipc = b;
+                        else if (streq(name, "ProtectKernelTunables"))
+                                c->protect_kernel_tunables = b;
+                        else if (streq(name, "ProtectControlGroups"))
+                                c->protect_control_groups = b;
 
                         unit_write_drop_in_private_format(u, mode, name, "%s=%s", name, yes_no(b));
                 }
index ee734e8..609b69a 100644 (file)
@@ -1383,6 +1383,45 @@ finish:
         return r;
 }
 
+static int apply_protect_sysctl(Unit *u, const ExecContext *c) {
+        scmp_filter_ctx *seccomp;
+        int r;
+
+        assert(c);
+
+        /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
+         * let's protect even those systems where this is left on in the kernel.
+         *
+         * This builds an allow-by-default seccomp filter whose single rule makes _sysctl fail with EPERM, and
+         * loads it into the calling process.
+         *
+         * Returns 0 without installing anything when skip_seccomp_unavailable() tells us to skip, -ENOMEM when
+         * the filter context cannot be allocated, and otherwise the result of the last libseccomp call made
+         * (negative on failure). */
+
+        if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
+                return 0;
+
+        seccomp = seccomp_init(SCMP_ACT_ALLOW); /* system calls not matched by a rule remain allowed */
+        if (!seccomp)
+                return -ENOMEM;
+
+        r = seccomp_add_secondary_archs(seccomp);
+        if (r < 0)
+                goto finish;
+
+        r = seccomp_rule_add(
+                        seccomp,
+                        SCMP_ACT_ERRNO(EPERM),
+                        SCMP_SYS(_sysctl),
+                        0); /* zero argument comparators: the rule matches every _sysctl invocation */
+        if (r < 0)
+                goto finish;
+
+        r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); /* libseccomp: do not set no_new_privs on load */
+        if (r < 0)
+                goto finish;
+
+        r = seccomp_load(seccomp);
+
+finish:
+        seccomp_release(seccomp); /* reached on success and on every failure after seccomp_init() */
+        return r;
+}
+
 #endif
 
 static void do_idle_pipe_dance(int idle_pipe[4]) {
@@ -1589,7 +1628,9 @@ static bool exec_needs_mount_namespace(
 
         if (context->private_devices ||
             context->protect_system != PROTECT_SYSTEM_NO ||
-            context->protect_home != PROTECT_HOME_NO)
+            context->protect_home != PROTECT_HOME_NO ||
+            context->protect_kernel_tunables ||
+            context->protect_control_groups)
                 return true;
 
         return false;
@@ -1804,6 +1845,37 @@ static int close_remaining_fds(
         return close_all_fds(dont_close, n_dont_close);
 }
 
+static bool context_has_address_families(const ExecContext *c) {
+        assert(c); /* Reports whether the context configures address family filtering: true if the
+                    * address_families_whitelist flag is set or the address_families set is non-empty. */
+
+        return c->address_families_whitelist ||
+                !set_isempty(c->address_families);
+}
+
+static bool context_has_syscall_filters(const ExecContext *c) {
+        assert(c); /* Reports whether the context configures system call filtering: true if the
+                    * syscall_whitelist flag is set, or if either the syscall_filter or the syscall_archs
+                    * set is non-empty. */
+
+        return c->syscall_whitelist ||
+                !set_isempty(c->syscall_filter) ||
+                !set_isempty(c->syscall_archs);
+}
+
+static bool context_has_no_new_privileges(const ExecContext *c) {
+        assert(c); /* Decides whether the caller should set PR_SET_NO_NEW_PRIVS: always when
+                    * no_new_privileges is set explicitly; otherwise only when CAP_SYS_ADMIN is not
+                    * effective and at least one of the seccomp-based options below is configured. */
+
+        if (c->no_new_privileges)
+                return true;
+
+        if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
+                return false;
+
+        return context_has_address_families(c) || /* we need NNP if we have any form of seccomp and are unprivileged */
+                c->memory_deny_write_execute ||
+                c->restrict_realtime ||
+                c->protect_kernel_tunables ||
+                context_has_syscall_filters(c);
+}
+
 static int send_user_lookup(
                 Unit *unit,
                 int user_lookup_fd,
@@ -2255,6 +2327,8 @@ static int exec_child(
                                 tmp,
                                 var,
                                 context->private_devices,
+                                context->protect_kernel_tunables,
+                                context->protect_control_groups,
                                 context->protect_home,
                                 context->protect_system,
                                 context->mount_flags);
@@ -2343,11 +2417,6 @@ static int exec_child(
 
         if ((params->flags & EXEC_APPLY_PERMISSIONS) && !command->privileged) {
 
-                bool use_address_families = context->address_families_whitelist ||
-                        !set_isempty(context->address_families);
-                bool use_syscall_filter = context->syscall_whitelist ||
-                        !set_isempty(context->syscall_filter) ||
-                        !set_isempty(context->syscall_archs);
                 int secure_bits = context->secure_bits;
 
                 for (i = 0; i < _RLIMIT_MAX; i++) {
@@ -2424,15 +2493,14 @@ static int exec_child(
                                 return -errno;
                         }
 
-                if (context->no_new_privileges ||
-                    (!have_effective_cap(CAP_SYS_ADMIN) && (use_address_families || context->memory_deny_write_execute || context->restrict_realtime || use_syscall_filter)))
+                if (context_has_no_new_privileges(context))
                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
                                 return -errno;
                         }
 
 #ifdef HAVE_SECCOMP
-                if (use_address_families) {
+                if (context_has_address_families(context)) {
                         r = apply_address_families(unit, context);
                         if (r < 0) {
                                 *exit_status = EXIT_ADDRESS_FAMILIES;
@@ -2456,7 +2524,15 @@ static int exec_child(
                         }
                 }
 
-                if (use_syscall_filter) {
+                if (context->protect_kernel_tunables) {
+                        r = apply_protect_sysctl(unit, context);
+                        if (r < 0) {
+                                *exit_status = EXIT_SECCOMP;
+                                return r;
+                        }
+                }
+
+                if (context_has_syscall_filters(context)) {
                         r = apply_seccomp(unit, context);
                         if (r < 0) {
                                 *exit_status = EXIT_SECCOMP;
@@ -2888,6 +2964,8 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
                 "%sNonBlocking: %s\n"
                 "%sPrivateTmp: %s\n"
                 "%sPrivateDevices: %s\n"
+                "%sProtectKernelTunables: %s\n"
+                "%sProtectControlGroups: %s\n"
                 "%sPrivateNetwork: %s\n"
                 "%sPrivateUsers: %s\n"
                 "%sProtectHome: %s\n"
@@ -2901,6 +2979,8 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
                 prefix, yes_no(c->non_blocking),
                 prefix, yes_no(c->private_tmp),
                 prefix, yes_no(c->private_devices),
+                prefix, yes_no(c->protect_kernel_tunables),
+                prefix, yes_no(c->protect_control_groups),
                 prefix, yes_no(c->private_network),
                 prefix, yes_no(c->private_users),
                 prefix, protect_home_to_string(c->protect_home),
index 6082c42..449180c 100644 (file)
@@ -174,6 +174,8 @@ struct ExecContext {
         bool private_users;
         ProtectSystem protect_system;
         ProtectHome protect_home;
+        bool protect_kernel_tunables;
+        bool protect_control_groups;
 
         bool no_new_privileges;
 
index 2e6c965..c49c1d6 100644 (file)
@@ -89,6 +89,8 @@ $1.ReadOnlyPaths,                config_parse_namespace_path_strv,   0,
 $1.InaccessiblePaths,            config_parse_namespace_path_strv,   0,                             offsetof($1, exec_context.inaccessible_paths)
 $1.PrivateTmp,                   config_parse_bool,                  0,                             offsetof($1, exec_context.private_tmp)
 $1.PrivateDevices,               config_parse_bool,                  0,                             offsetof($1, exec_context.private_devices)
+$1.ProtectKernelTunables,        config_parse_bool,                  0,                             offsetof($1, exec_context.protect_kernel_tunables)
+$1.ProtectControlGroups,         config_parse_bool,                  0,                             offsetof($1, exec_context.protect_control_groups)
 $1.PrivateNetwork,               config_parse_bool,                  0,                             offsetof($1, exec_context.private_network)
 $1.PrivateUsers,                 config_parse_bool,                  0,                             offsetof($1, exec_context.private_users)
 $1.ProtectSystem,                config_parse_protect_system,        0,                             offsetof($1, exec_context)
index 52a2505..f2768ae 100644 (file)
@@ -53,7 +53,7 @@ typedef enum MountMode {
         PRIVATE_TMP,
         PRIVATE_VAR_TMP,
         PRIVATE_DEV,
-        READWRITE
+        READWRITE,
 } MountMode;
 
 typedef struct BindMount {
@@ -366,6 +366,8 @@ int setup_namespace(
                 const char* tmp_dir,
                 const char* var_tmp_dir,
                 bool private_dev,
+                bool protect_sysctl,
+                bool protect_cgroups,
                 ProtectHome protect_home,
                 ProtectSystem protect_system,
                 unsigned long mount_flags) {
@@ -385,6 +387,8 @@ int setup_namespace(
                 strv_length(read_only_paths) +
                 strv_length(inaccessible_paths) +
                 private_dev +
+                (protect_sysctl ? 3 : 0) +
+                (protect_cgroups != protect_sysctl) +
                 (protect_home != PROTECT_HOME_NO ? 3 : 0) +
                 (protect_system != PROTECT_SYSTEM_NO ? 2 : 0) +
                 (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0);
@@ -421,6 +425,27 @@ int setup_namespace(
                         m++;
                 }
 
+                if (protect_sysctl) {
+                        m->path = prefix_roota(root_directory, "/proc/sys");
+                        m->mode = READONLY;
+                        m++;
+
+                        m->path = prefix_roota(root_directory, "/proc/sysrq-trigger");
+                        m->mode = READONLY;
+                        m->ignore = true; /* Not always compiled into the kernel */
+                        m++;
+
+                        m->path = prefix_roota(root_directory, "/sys");
+                        m->mode = READONLY;
+                        m++;
+                }
+
+                if (protect_cgroups != protect_sysctl) {
+                        m->path = prefix_roota(root_directory, "/sys/fs/cgroup");
+                        m->mode = protect_cgroups ? READONLY : READWRITE;
+                        m++;
+                }
+
                 if (protect_home != PROTECT_HOME_NO) {
                         const char *home_dir, *run_user_dir, *root_dir;
 
@@ -505,9 +530,12 @@ int setup_namespace(
 
 fail:
         if (n > 0) {
-                for (m = mounts; m < mounts + n; ++m)
-                        if (m->done)
-                                (void) umount2(m->path, MNT_DETACH);
+                for (m = mounts; m < mounts + n; ++m) {
+                        if (!m->done)
+                                continue;
+
+                        (void) umount2(m->path, MNT_DETACH);
+                }
         }
 
         return r;
index 1aedf5f..3845336 100644 (file)
@@ -46,6 +46,8 @@ int setup_namespace(const char *chroot,
                     const char *tmp_dir,
                     const char *var_tmp_dir,
                     bool private_dev,
+                    bool protect_sysctl,
+                    bool protect_cgroups,
                     ProtectHome protect_home,
                     ProtectSystem protect_system,
                     unsigned long mount_flags);
index feb4a06..c6bd2f1 100644 (file)
@@ -204,7 +204,7 @@ int bus_append_unit_property_assignment(sd_bus_message *m, const char *assignmen
                               "IgnoreSIGPIPE", "TTYVHangup", "TTYReset", "RemainAfterExit",
                               "PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers", "NoNewPrivileges",
                               "SyslogLevelPrefix", "Delegate", "RemainAfterElapse", "MemoryDenyWriteExecute",
-                              "RestrictRealtime", "DynamicUser", "RemoveIPC")) {
+                              "RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables", "ProtectControlGroups")) {
 
                 r = parse_boolean(eq);
                 if (r < 0)
index 9248f29..05f243c 100644 (file)
@@ -69,6 +69,8 @@ int main(int argc, char *argv[]) {
                             tmp_dir,
                             var_tmp_dir,
                             true,
+                            true,
+                            true,
                             PROTECT_HOME_NO,
                             PROTECT_SYSTEM_NO,
                             0);