core: Restrict mmap and mprotect with PAGE_WRITE|PAGE_EXEC (#3319) (#3379)
authorTopi Miettinen <topimiettinen@users.noreply.github.com>
Fri, 3 Jun 2016 15:58:18 +0000 (15:58 +0000)
committerLennart Poettering <lennart@poettering.net>
Fri, 3 Jun 2016 15:58:18 +0000 (17:58 +0200)
New exec boolean MemoryDenyWriteExecute, when set, installs
a seccomp filter to reject mmap(2) with PAGE_WRITE|PAGE_EXEC
and mprotect(2) with PAGE_EXEC.

man/systemd.exec.xml
src/core/dbus-execute.c
src/core/execute.c
src/core/execute.h
src/core/load-fragment-gperf.gperf.m4
src/shared/bus-unit-util.c

index 58f18f3..4a3dd14 100644 (file)
         <citerefentry><refentrytitle>tmpfiles.d</refentrytitle><manvolnum>5</manvolnum></citerefentry>.</para></listitem>
       </varlistentry>
 
+      <varlistentry>
+        <term><varname>MemoryDenyWriteExecute=</varname></term>
+
+        <listitem><para>Takes a boolean argument. If set, attempts to create memory mappings that are writable and
+        executable at the same time, or to change existing memory mappings to become executable are prohibited.
+        Specifically, a system call filter is added that rejects
+        <citerefentry><refentrytitle>mmap</refentrytitle><manvolnum>2</manvolnum></citerefentry>
+        system calls with both <constant>PROT_EXEC</constant> and <constant>PROT_WRITE</constant> set
+        and <citerefentry><refentrytitle>mprotect</refentrytitle><manvolnum>2</manvolnum></citerefentry>
+        system calls with <constant>PROT_EXEC</constant> set. Note that this option is incompatible with programs
+        that generate program code dynamically at runtime, such as JIT execution engines, or programs compiled making
+        use of the code "trampoline" feature of various C compilers. This option improves service security, as it makes
+        harder for software exploits to change running code dynamically.
+        </para></listitem>
+      </varlistentry>
+
     </variablelist>
   </refsect1>
 
index de29d5d..4c88c41 100644 (file)
@@ -719,6 +719,7 @@ const sd_bus_vtable bus_exec_vtable[] = {
         SD_BUS_PROPERTY("RestrictAddressFamilies", "(bas)", property_get_address_families, 0, SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("RuntimeDirectoryMode", "u", bus_property_get_mode, offsetof(ExecContext, runtime_directory_mode), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("RuntimeDirectory", "as", NULL, offsetof(ExecContext, runtime_directory), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("MemoryDenyWriteExecute", "b", bus_property_get_bool, offsetof(ExecContext, memory_deny_write_execute), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_VTABLE_END
 };
 
@@ -1056,7 +1057,7 @@ int bus_exec_context_set_transient_property(
         } else if (STR_IN_SET(name,
                               "IgnoreSIGPIPE", "TTYVHangup", "TTYReset",
                               "PrivateTmp", "PrivateDevices", "PrivateNetwork",
-                              "NoNewPrivileges", "SyslogLevelPrefix")) {
+                              "NoNewPrivileges", "SyslogLevelPrefix", "MemoryDenyWriteExecute")) {
                 int b;
 
                 r = sd_bus_message_read(message, "b", &b);
@@ -1080,6 +1081,8 @@ int bus_exec_context_set_transient_property(
                                 c->no_new_privileges = b;
                         else if (streq(name, "SyslogLevelPrefix"))
                                 c->syslog_level_prefix = b;
+                        else if (streq(name, "MemoryDenyWriteExecute"))
+                                c->memory_deny_write_execute = b;
 
                         unit_write_drop_in_private_format(u, mode, name, "%s=%s", name, yes_no(b));
                 }
index 5eb3f13..2cef70e 100644 (file)
@@ -25,6 +25,7 @@
 #include <signal.h>
 #include <string.h>
 #include <sys/capability.h>
+#include <sys/mman.h>
 #include <sys/personality.h>
 #include <sys/prctl.h>
 #include <sys/socket.h>
@@ -1190,6 +1191,45 @@ finish:
         return r;
 }
 
+static int apply_memory_deny_write_execute(const ExecContext *c) {
+        scmp_filter_ctx *seccomp;
+        int r;
+
+        assert(c);
+
+        seccomp = seccomp_init(SCMP_ACT_ALLOW);
+        if (!seccomp)
+                return -ENOMEM;
+
+        r = seccomp_rule_add(
+                        seccomp,
+                        SCMP_ACT_KILL,
+                        SCMP_SYS(mmap),
+                        1,
+                        SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
+        if (r < 0)
+                goto finish;
+
+        r = seccomp_rule_add(
+                        seccomp,
+                        SCMP_ACT_KILL,
+                        SCMP_SYS(mprotect),
+                        1,
+                        SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
+        if (r < 0)
+                goto finish;
+
+        r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
+        if (r < 0)
+                goto finish;
+
+        r = seccomp_load(seccomp);
+
+finish:
+        seccomp_release(seccomp);
+        return r;
+}
+
 #endif
 
 static void do_idle_pipe_dance(int idle_pipe[4]) {
@@ -1912,6 +1952,13 @@ static int exec_child(
                         }
                 }
 
+                if (context->memory_deny_write_execute) {
+                        r = apply_memory_deny_write_execute(context);
+                        if (r < 0) {
+                                *exit_status = EXIT_SECCOMP;
+                                return r;
+                        }
+                }
                 if (use_syscall_filter) {
                         r = apply_seccomp(context);
                         if (r < 0) {
@@ -2371,7 +2418,8 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
                 "%sPrivateDevices: %s\n"
                 "%sProtectHome: %s\n"
                 "%sProtectSystem: %s\n"
-                "%sIgnoreSIGPIPE: %s\n",
+                "%sIgnoreSIGPIPE: %s\n"
+                "%sMemoryDenyWriteExecute: %s\n",
                 prefix, c->umask,
                 prefix, c->working_directory ? c->working_directory : "/",
                 prefix, c->root_directory ? c->root_directory : "/",
@@ -2381,7 +2429,8 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
                 prefix, yes_no(c->private_devices),
                 prefix, protect_home_to_string(c->protect_home),
                 prefix, protect_system_to_string(c->protect_system),
-                prefix, yes_no(c->ignore_sigpipe));
+                prefix, yes_no(c->ignore_sigpipe),
+                prefix, yes_no(c->memory_deny_write_execute));
 
         STRV_FOREACH(e, c->environment)
                 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
index 41148bc..464869d 100644 (file)
@@ -197,6 +197,7 @@ struct ExecContext {
         bool ioprio_set:1;
         bool cpu_sched_set:1;
         bool no_new_privileges_set:1;
+        bool memory_deny_write_execute;
 };
 
 #include "cgroup-util.h"
index 00bdc23..eb58586 100644 (file)
@@ -55,10 +55,12 @@ m4_ifdef(`HAVE_SECCOMP',
 `$1.SystemCallFilter,            config_parse_syscall_filter,        0,                             offsetof($1, exec_context)
 $1.SystemCallArchitectures,      config_parse_syscall_archs,         0,                             offsetof($1, exec_context.syscall_archs)
 $1.SystemCallErrorNumber,        config_parse_syscall_errno,         0,                             offsetof($1, exec_context)
+$1.MemoryDenyWriteExecute,       config_parse_bool,                  0,                             offsetof($1, exec_context.memory_deny_write_execute)
 $1.RestrictAddressFamilies,      config_parse_address_families,      0,                             offsetof($1, exec_context)',
 `$1.SystemCallFilter,            config_parse_warn_compat,           DISABLED_CONFIGURATION,        0
 $1.SystemCallArchitectures,      config_parse_warn_compat,           DISABLED_CONFIGURATION,        0
 $1.SystemCallErrorNumber,        config_parse_warn_compat,           DISABLED_CONFIGURATION,        0
+$1.MemoryDenyWriteExecute,       config_parse_warn_compat,           DISABLED_CONFIGURATION,        0
 $1.RestrictAddressFamilies,      config_parse_warn_compat,           DISABLED_CONFIGURATION,        0')
 $1.LimitCPU,                     config_parse_limit,                 RLIMIT_CPU,                    offsetof($1, exec_context.rlimit)
 $1.LimitFSIZE,                   config_parse_limit,                 RLIMIT_FSIZE,                  offsetof($1, exec_context.rlimit)
index bf0b2e8..8f4f93e 100644 (file)
@@ -158,7 +158,7 @@ int bus_append_unit_property_assignment(sd_bus_message *m, const char *assignmen
                        "SendSIGHUP", "SendSIGKILL", "WakeSystem", "DefaultDependencies",
                        "IgnoreSIGPIPE", "TTYVHangup", "TTYReset", "RemainAfterExit",
                        "PrivateTmp", "PrivateDevices", "PrivateNetwork", "NoNewPrivileges",
-                       "SyslogLevelPrefix", "Delegate", "RemainAfterElapse")) {
+                       "SyslogLevelPrefix", "Delegate", "RemainAfterElapse", "MemoryDenyWriteExecute")) {
 
                 r = parse_boolean(eq);
                 if (r < 0)