From 502d704e5ed2d288069471f4e3611115cde107d6 Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Wed, 12 Oct 2016 13:31:21 +0200 Subject: [PATCH] core:sandbox: Add ProtectKernelModules= option This is useful to turn off explicit module load and unload operations on modular kernels. This option removes CAP_SYS_MODULE from the capability bounding set for the unit, and installs a system call filter to block module system calls. This option will not prevent the kernel from loading modules using the module auto-load feature which is a system-wide operation. --- man/systemd.exec.xml | 17 ++++++++++++ src/core/dbus-execute.c | 5 +++- src/core/execute.c | 52 +++++++++++++++++++++++++++++++++++ src/core/execute.h | 1 + src/core/load-fragment-gperf.gperf.m4 | 1 + src/core/unit.c | 3 ++ src/shared/bus-unit-util.c | 3 +- 7 files changed, 80 insertions(+), 2 deletions(-) diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 986985a..3bea497 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -1405,6 +1405,23 @@ + ProtectKernelModules= + + Takes a boolean argument. If true, explicit module loading will + be denied. This allows turning off module load and unload operations on modular + kernels. It is recommended to turn this on for most services that do not need special + file systems or extra kernel modules to work. Defaults to off. Enabling this option + removes CAP_SYS_MODULE from the capability bounding set for + the unit, and installs a system call filter to block module system calls. + Note that limited automatic module loading due to user configuration or kernel + mapping tables might still happen as a side effect of requested user operations, + both privileged and unprivileged. To disable the module auto-load feature, please see + sysctl.d(5) + kernel.modules_disabled mechanism and + /proc/sys/kernel/modules_disabled documentation. 
+ + + Personality= Controls which kernel architecture remove_ipc = b; else if (streq(name, "ProtectKernelTunables")) c->protect_kernel_tunables = b; + else if (streq(name, "ProtectKernelModules")) + c->protect_kernel_modules = b; else if (streq(name, "ProtectControlGroups")) c->protect_control_groups = b; diff --git a/src/core/execute.c b/src/core/execute.c index 0c983f4..7a278b7 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1436,6 +1436,50 @@ finish: return r; } +static int apply_protect_kernel_modules(Unit *u, const ExecContext *c) { + static const int module_syscalls[] = { + SCMP_SYS(delete_module), + SCMP_SYS(finit_module), + SCMP_SYS(init_module), + }; + + scmp_filter_ctx *seccomp; + unsigned i; + int r; + + assert(c); + + /* Turn of module syscalls on ProtectKernelModules=yes */ + + if (skip_seccomp_unavailable(u, "ProtectKernelModules=")) + return 0; + + seccomp = seccomp_init(SCMP_ACT_ALLOW); + if (!seccomp) + return -ENOMEM; + + r = seccomp_add_secondary_archs(seccomp); + if (r < 0) + goto finish; + + for (i = 0; i < ELEMENTSOF(module_syscalls); i++) { + r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM), + module_syscalls[i], 0); + if (r < 0) + goto finish; + } + + r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); + if (r < 0) + goto finish; + + r = seccomp_load(seccomp); + +finish: + seccomp_release(seccomp); + return r; +} + static int apply_private_devices(Unit *u, const ExecContext *c) { const SystemCallFilterSet *set; scmp_filter_ctx *seccomp; @@ -2690,6 +2734,14 @@ static int exec_child( } } + if (context->protect_kernel_modules) { + r = apply_protect_kernel_modules(unit, context); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + return r; + } + } + if (context->private_devices) { r = apply_private_devices(unit, context); if (r < 0) { diff --git a/src/core/execute.h b/src/core/execute.h index 449180c..1de439c 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -175,6 +175,7 @@ struct ExecContext { ProtectSystem 
protect_system; ProtectHome protect_home; bool protect_kernel_tunables; + bool protect_kernel_modules; bool protect_control_groups; bool no_new_privileges; diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4 index c49c1d6..a700d85 100644 --- a/src/core/load-fragment-gperf.gperf.m4 +++ b/src/core/load-fragment-gperf.gperf.m4 @@ -90,6 +90,7 @@ $1.InaccessiblePaths, config_parse_namespace_path_strv, 0, $1.PrivateTmp, config_parse_bool, 0, offsetof($1, exec_context.private_tmp) $1.PrivateDevices, config_parse_bool, 0, offsetof($1, exec_context.private_devices) $1.ProtectKernelTunables, config_parse_bool, 0, offsetof($1, exec_context.protect_kernel_tunables) +$1.ProtectKernelModules, config_parse_bool, 0, offsetof($1, exec_context.protect_kernel_modules) $1.ProtectControlGroups, config_parse_bool, 0, offsetof($1, exec_context.protect_control_groups) $1.PrivateNetwork, config_parse_bool, 0, offsetof($1, exec_context.private_network) $1.PrivateUsers, config_parse_bool, 0, offsetof($1, exec_context.private_users) diff --git a/src/core/unit.c b/src/core/unit.c index 690f7f7..71f95c0 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -3401,6 +3401,9 @@ int unit_patch_contexts(Unit *u) { if (ec->private_devices) ec->capability_bounding_set &= ~(UINT64_C(1) << CAP_MKNOD); + if (ec->protect_kernel_modules) + ec->capability_bounding_set &= ~(UINT64_C(1) << CAP_SYS_MODULE); + if (ec->dynamic_user) { if (!ec->user) { r = user_from_unit_name(u, &ec->user); diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c index a550a37..f639e0e 100644 --- a/src/shared/bus-unit-util.c +++ b/src/shared/bus-unit-util.c @@ -204,7 +204,8 @@ int bus_append_unit_property_assignment(sd_bus_message *m, const char *assignmen "IgnoreSIGPIPE", "TTYVHangup", "TTYReset", "RemainAfterExit", "PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers", "NoNewPrivileges", "SyslogLevelPrefix", "Delegate", "RemainAfterElapse", "MemoryDenyWriteExecute", 
- "RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables", "ProtectControlGroups")) { + "RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables", + "ProtectKernelModules", "ProtectControlGroups")) { r = parse_boolean(eq); if (r < 0) -- 2.7.4