From b507808ebce23561d4ff8c2aa1fb949fe402bc61 Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 19 Jan 2023 16:03:43 +0000 Subject: [PATCH] mm: implement memory-deny-write-execute as a prctl MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Patch series "mm: In-kernel support for memory-deny-write-execute (MDWE)", v2. The background to this is that systemd has a configuration option called MemoryDenyWriteExecute [2], implemented as a SECCOMP BPF filter. Its aim is to prevent a user task from inadvertently creating an executable mapping that is (or was) writeable. Since such a BPF filter is stateless, it cannot detect mappings that were previously writeable but subsequently changed to read-only. Therefore the filter simply rejects any mprotect(PROT_EXEC). The side-effect is that on arm64 with BTI support (Branch Target Identification), the dynamic loader cannot change an ELF section from PROT_EXEC to PROT_EXEC|PROT_BTI using mprotect(). For libraries, it can resort to unmapping and re-mapping but for the main executable it does not have a file descriptor. See the original bug report in the Red Hat bugzilla [3] and the subsequent glibc workaround for libraries [4]. This series adds in-kernel support for this feature as a prctl PR_SET_MDWE, which is inherited on fork(). The prctl denies PROT_WRITE | PROT_EXEC mappings. Like the systemd BPF filter it also denies adding PROT_EXEC to mappings. However, unlike the BPF filter, it only denies it if the mapping didn't previously have PROT_EXEC. This allows the transition PROT_EXEC -> PROT_EXEC | PROT_BTI with mprotect(), which is not possible with the BPF filter. This patch (of 2): The aim of such a policy is to prevent a user task from creating an executable mapping that is also writeable. 
An example of mmap() returning -EACCES if the policy is enabled: mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, flags, 0, 0); Similarly, mprotect() would return -EACCES below: addr = mmap(0, size, PROT_READ | PROT_EXEC, flags, 0, 0); mprotect(addr, size, PROT_READ | PROT_WRITE | PROT_EXEC); The BPF filter that systemd MDWE uses is stateless, and disallows mprotect() with PROT_EXEC completely. This new prctl allows PROT_EXEC to be enabled if it was already PROT_EXEC, which allows the following case: addr = mmap(0, size, PROT_READ | PROT_EXEC, flags, 0, 0); mprotect(addr, size, PROT_READ | PROT_EXEC | PROT_BTI); where PROT_BTI enables Branch Target Identification on arm64. Link: https://lkml.kernel.org/r/20230119160344.54358-1-joey.gouly@arm.com Link: https://lkml.kernel.org/r/20230119160344.54358-2-joey.gouly@arm.com Signed-off-by: Joey Gouly Co-developed-by: Catalin Marinas Signed-off-by: Catalin Marinas Cc: Alexander Viro Cc: Jeremy Linton Cc: Kees Cook Cc: Lennart Poettering Cc: Mark Brown Cc: nd Cc: Shuah Khan Cc: Szabolcs Nagy Cc: Topi Miettinen Cc: Zbigniew Jędrzejewski-Szmek Cc: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/mman.h | 34 ++++++++++++++++++++++++++++++++++ include/linux/sched/coredump.h | 6 +++++- include/uapi/linux/prctl.h | 6 ++++++ kernel/sys.c | 33 +++++++++++++++++++++++++++++++++ mm/mmap.c | 10 ++++++++++ mm/mprotect.c | 5 +++++ 6 files changed, 93 insertions(+), 1 deletion(-) diff --git a/include/linux/mman.h b/include/linux/mman.h index 58b3abd..cee1e4b 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -156,4 +156,38 @@ calc_vm_flag_bits(unsigned long flags) } unsigned long vm_commit_limit(void); + +/* + * Denies creating a writable executable mapping or gaining executable permissions. 
+ * + * This denies the following: + * + * a) mmap(PROT_WRITE | PROT_EXEC) + * + * b) mmap(PROT_WRITE) + * mprotect(PROT_EXEC) + * + * c) mmap(PROT_WRITE) + * mprotect(PROT_READ) + * mprotect(PROT_EXEC) + * + * But allows the following: + * + * d) mmap(PROT_READ | PROT_EXEC) + * mmap(PROT_READ | PROT_EXEC | PROT_BTI) + */ +static inline bool map_deny_write_exec(struct vm_area_struct *vma, unsigned long vm_flags) +{ + if (!test_bit(MMF_HAS_MDWE, &current->mm->flags)) + return false; + + if ((vm_flags & VM_EXEC) && (vm_flags & VM_WRITE)) + return true; + + if (!(vma->vm_flags & VM_EXEC) && (vm_flags & VM_EXEC)) + return true; + + return false; +} + #endif /* _LINUX_MMAN_H */ diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index 8270ad7..0e17ae7 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -81,9 +81,13 @@ static inline int get_dumpable(struct mm_struct *mm) * lifecycle of this mm, just for simplicity. */ #define MMF_HAS_PINNED 27 /* FOLL_PIN has run, never cleared */ + +#define MMF_HAS_MDWE 28 +#define MMF_HAS_MDWE_MASK (1 << MMF_HAS_MDWE) + #define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ - MMF_DISABLE_THP_MASK) + MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK) #endif /* _LINUX_SCHED_COREDUMP_H */ diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index a5e06dc..1312a13 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -281,6 +281,12 @@ struct prctl_mm_map { # define PR_SME_VL_LEN_MASK 0xffff # define PR_SME_VL_INHERIT (1 << 17) /* inherit across exec */ +/* Memory deny write / execute */ +#define PR_SET_MDWE 65 +# define PR_MDWE_REFUSE_EXEC_GAIN 1 + +#define PR_GET_MDWE 66 + #define PR_SET_VMA 0x53564d41 # define PR_SET_VMA_ANON_NAME 0 diff --git a/kernel/sys.c b/kernel/sys.c index 5fd54bf..b3cab94 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2348,6 +2348,33 @@ static int prctl_set_vma(unsigned 
long opt, unsigned long start, } #endif /* CONFIG_ANON_VMA_NAME */ +static inline int prctl_set_mdwe(unsigned long bits, unsigned long arg3, + unsigned long arg4, unsigned long arg5) +{ + if (arg3 || arg4 || arg5) + return -EINVAL; + + if (bits & ~(PR_MDWE_REFUSE_EXEC_GAIN)) + return -EINVAL; + + if (bits & PR_MDWE_REFUSE_EXEC_GAIN) + set_bit(MMF_HAS_MDWE, &current->mm->flags); + else if (test_bit(MMF_HAS_MDWE, &current->mm->flags)) + return -EPERM; /* Cannot unset the flag */ + + return 0; +} + +static inline int prctl_get_mdwe(unsigned long arg2, unsigned long arg3, + unsigned long arg4, unsigned long arg5) +{ + if (arg2 || arg3 || arg4 || arg5) + return -EINVAL; + + return test_bit(MMF_HAS_MDWE, &current->mm->flags) ? + PR_MDWE_REFUSE_EXEC_GAIN : 0; +} + SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { @@ -2623,6 +2650,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = sched_core_share_pid(arg2, arg3, arg4, arg5); break; #endif + case PR_SET_MDWE: + error = prctl_set_mdwe(arg2, arg3, arg4, arg5); + break; + case PR_GET_MDWE: + error = prctl_get_mdwe(arg2, arg3, arg4, arg5); + break; case PR_SET_VMA: error = prctl_set_vma(arg2, arg3, arg4, arg5); break; diff --git a/mm/mmap.c b/mm/mmap.c index 335ba3d..ffc0815 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2669,6 +2669,16 @@ cannot_expand: vma_set_anonymous(vma); } + if (map_deny_write_exec(vma, vma->vm_flags)) { + error = -EACCES; + if (file) + goto close_and_free_vma; + else if (vma->vm_file) + goto unmap_and_free_vma; + else + goto free_vma; + } + /* Allow architectures to sanity-check the vm_flags */ if (!arch_validate_flags(vma->vm_flags)) { error = -EINVAL; diff --git a/mm/mprotect.c b/mm/mprotect.c index 6ecdf06..6a22f3a 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -799,6 +799,11 @@ static int do_mprotect_pkey(unsigned long start, size_t len, break; } + if (map_deny_write_exec(vma, newflags)) { + error = 
-EACCES; + goto out; + } + /* Allow architectures to sanity-check the new flags */ if (!arch_validate_flags(newflags)) { error = -EINVAL; -- 2.7.4