riscv/vdso: Add support for time namespaces
author Tong Tiangen <tongtiangen@huawei.com>
Wed, 1 Sep 2021 03:20:25 +0000 (03:20 +0000)
committer Palmer Dabbelt <palmerdabbelt@google.com>
Mon, 4 Oct 2021 21:16:43 +0000 (14:16 -0700)
Implement generic vdso time namespace support which also enables time
namespaces for riscv. This is quite similar to what arm64 does.

selftest/timens test result:
  1..10
  ok 1 Passed for CLOCK_BOOTTIME (syscall)
  ok 2 Passed for CLOCK_BOOTTIME (vdso)
  ok 3 # SKIP CLOCK_BOOTTIME_ALARM isn't supported
  ok 4 # SKIP CLOCK_BOOTTIME_ALARM isn't supported
  ok 5 Passed for CLOCK_MONOTONIC (syscall)
  ok 6 Passed for CLOCK_MONOTONIC (vdso)
  ok 7 Passed for CLOCK_MONOTONIC_COARSE (syscall)
  ok 8 Passed for CLOCK_MONOTONIC_COARSE (vdso)
  ok 9 Passed for CLOCK_MONOTONIC_RAW (syscall)
  ok 10 Passed for CLOCK_MONOTONIC_RAW (vdso)
  # Totals: pass:8 fail:0 xfail:0 xpass:0 skip:2 error:0

Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>
arch/riscv/Kconfig
arch/riscv/include/asm/page.h
arch/riscv/include/asm/vdso.h
arch/riscv/include/asm/vdso/gettimeofday.h
arch/riscv/kernel/vdso.c
arch/riscv/kernel/vdso/vdso.lds.S

index c3f3fd5..ea16fa2 100644 (file)
@@ -62,6 +62,7 @@ config RISCV
        select GENERIC_SCHED_CLOCK
        select GENERIC_SMP_IDLE_THREAD
        select GENERIC_TIME_VSYSCALL if MMU && 64BIT
+       select GENERIC_VDSO_TIME_NS if HAVE_GENERIC_VDSO
        select HANDLE_DOMAIN_IRQ
        select HAVE_ARCH_AUDITSYSCALL
        select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL
index 109c97e..b3e5ff0 100644 (file)
@@ -157,6 +157,8 @@ extern phys_addr_t __phys_addr_symbol(unsigned long x);
 #define page_to_bus(page)      (page_to_phys(page))
 #define phys_to_page(paddr)    (pfn_to_page(phys_to_pfn(paddr)))
 
+#define sym_to_pfn(x)           __phys_to_pfn(__pa_symbol(x))
+
 #ifdef CONFIG_FLATMEM
 #define pfn_valid(pfn) \
        (((pfn) >= ARCH_PFN_OFFSET) && (((pfn) - ARCH_PFN_OFFSET) < max_mapnr))
index 208e31b..e310b22 100644 (file)
@@ -22,7 +22,7 @@
  */
 #ifdef CONFIG_MMU
 
-#define __VVAR_PAGES    1
+#define __VVAR_PAGES    2
 
 #ifndef __ASSEMBLY__
 #include <generated/vdso-offsets.h>
index f839f16..77d9c2f 100644 (file)
@@ -76,6 +76,13 @@ static __always_inline const struct vdso_data *__arch_get_vdso_data(void)
        return _vdso_data;
 }
 
+#ifdef CONFIG_TIME_NS
+static __always_inline
+const struct vdso_data *__arch_get_timens_vdso_data(const struct vdso_data *vd)
+{
+       return _timens_data;
+}
+#endif
 #endif /* !__ASSEMBLY__ */
 
 #endif /* __ASM_VDSO_GETTIMEOFDAY_H */
index b70956d..a9436a6 100644 (file)
@@ -13,6 +13,7 @@
 #include <linux/err.h>
 #include <asm/page.h>
 #include <asm/vdso.h>
+#include <linux/time_namespace.h>
 
 #ifdef CONFIG_GENERIC_TIME_VSYSCALL
 #include <vdso/datapage.h>
@@ -25,14 +26,12 @@ extern char vdso_start[], vdso_end[];
 
 enum vvar_pages {
        VVAR_DATA_PAGE_OFFSET,
+       VVAR_TIMENS_PAGE_OFFSET,
        VVAR_NR_PAGES,
 };
 
 #define VVAR_SIZE  (VVAR_NR_PAGES << PAGE_SHIFT)
 
-static unsigned int vdso_pages __ro_after_init;
-static struct page **vdso_pagelist __ro_after_init;
-
 /*
  * The vDSO data page.
  */
@@ -42,83 +41,228 @@ static union {
 } vdso_data_store __page_aligned_data;
 struct vdso_data *vdso_data = &vdso_data_store.data;
 
-static int __init vdso_init(void)
+struct __vdso_info {
+       const char *name;
+       const char *vdso_code_start;
+       const char *vdso_code_end;
+       unsigned long vdso_pages;
+       /* Data Mapping */
+       struct vm_special_mapping *dm;
+       /* Code Mapping */
+       struct vm_special_mapping *cm;
+};
+
+static struct __vdso_info vdso_info __ro_after_init = {
+       .name = "vdso",
+       .vdso_code_start = vdso_start,
+       .vdso_code_end = vdso_end,
+};
+
+static int vdso_mremap(const struct vm_special_mapping *sm,
+                      struct vm_area_struct *new_vma)
+{
+       current->mm->context.vdso = (void *)new_vma->vm_start;
+
+       return 0;
+}
+
+static int __init __vdso_init(void)
 {
        unsigned int i;
+       struct page **vdso_pagelist;
+       unsigned long pfn;
 
-       vdso_pages = (vdso_end - vdso_start) >> PAGE_SHIFT;
-       vdso_pagelist =
-               kcalloc(vdso_pages + VVAR_NR_PAGES, sizeof(struct page *), GFP_KERNEL);
-       if (unlikely(vdso_pagelist == NULL)) {
-               pr_err("vdso: pagelist allocation failed\n");
-               return -ENOMEM;
+       if (memcmp(vdso_info.vdso_code_start, "\177ELF", 4)) {
+               pr_err("vDSO is not a valid ELF object!\n");
+               return -EINVAL;
        }
 
-       for (i = 0; i < vdso_pages; i++) {
-               struct page *pg;
+       vdso_info.vdso_pages = (
+               vdso_info.vdso_code_end -
+               vdso_info.vdso_code_start) >>
+               PAGE_SHIFT;
+
+       vdso_pagelist = kcalloc(vdso_info.vdso_pages,
+                               sizeof(struct page *),
+                               GFP_KERNEL);
+       if (vdso_pagelist == NULL)
+               return -ENOMEM;
+
+       /* Grab the vDSO code pages. */
+       pfn = sym_to_pfn(vdso_info.vdso_code_start);
+
+       for (i = 0; i < vdso_info.vdso_pages; i++)
+               vdso_pagelist[i] = pfn_to_page(pfn + i);
+
+       vdso_info.cm->pages = vdso_pagelist;
+
+       return 0;
+}
+
+#ifdef CONFIG_TIME_NS
+struct vdso_data *arch_get_vdso_data(void *vvar_page)
+{
+       return (struct vdso_data *)(vvar_page);
+}
+
+/*
+ * The vvar mapping contains data for a specific time namespace, so when a task
+ * changes namespace we must unmap its vvar data for the old namespace.
+ * Subsequent faults will map in data for the new namespace.
+ *
+ * For more details see timens_setup_vdso_data().
+ */
+int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
+{
+       struct mm_struct *mm = task->mm;
+       struct vm_area_struct *vma;
+
+       mmap_read_lock(mm);
 
-               pg = virt_to_page(vdso_start + (i << PAGE_SHIFT));
-               vdso_pagelist[i] = pg;
+       for (vma = mm->mmap; vma; vma = vma->vm_next) {
+               unsigned long size = vma->vm_end - vma->vm_start;
+
+               if (vma_is_special_mapping(vma, vdso_info.dm))
+                       zap_page_range(vma, vma->vm_start, size);
        }
-       vdso_pagelist[i] = virt_to_page(vdso_data);
 
+       mmap_read_unlock(mm);
        return 0;
 }
+
+static struct page *find_timens_vvar_page(struct vm_area_struct *vma)
+{
+       if (likely(vma->vm_mm == current->mm))
+               return current->nsproxy->time_ns->vvar_page;
+
+       /*
+        * VM_PFNMAP | VM_IO protect .fault() handler from being called
+        * through interfaces like /proc/$pid/mem or
+        * process_vm_{readv,writev}() as long as there's no .access()
+        * in special_mapping_vmops.
+        * For more details see check_vma_flags() and __access_remote_vm()
+        */
+       WARN(1, "vvar_page accessed remotely");
+
+       return NULL;
+}
+#else
+static struct page *find_timens_vvar_page(struct vm_area_struct *vma)
+{
+       return NULL;
+}
+#endif
+
+static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
+                            struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+       struct page *timens_page = find_timens_vvar_page(vma);
+       unsigned long pfn;
+
+       switch (vmf->pgoff) {
+       case VVAR_DATA_PAGE_OFFSET:
+               if (timens_page)
+                       pfn = page_to_pfn(timens_page);
+               else
+                       pfn = sym_to_pfn(vdso_data);
+               break;
+#ifdef CONFIG_TIME_NS
+       case VVAR_TIMENS_PAGE_OFFSET:
+               /*
+                * If a task belongs to a time namespace then a namespace
+                * specific VVAR is mapped with the VVAR_DATA_PAGE_OFFSET and
+                * the real VVAR page is mapped with the VVAR_TIMENS_PAGE_OFFSET
+                * offset.
+                * See also the comment near timens_setup_vdso_data().
+                */
+               if (!timens_page)
+                       return VM_FAULT_SIGBUS;
+               pfn = sym_to_pfn(vdso_data);
+               break;
+#endif /* CONFIG_TIME_NS */
+       default:
+               return VM_FAULT_SIGBUS;
+       }
+
+       return vmf_insert_pfn(vma, vmf->address, pfn);
+}
+
+enum rv_vdso_map {
+       RV_VDSO_MAP_VVAR,
+       RV_VDSO_MAP_VDSO,
+};
+
+static struct vm_special_mapping rv_vdso_maps[] __ro_after_init = {
+       [RV_VDSO_MAP_VVAR] = {
+               .name   = "[vvar]",
+               .fault = vvar_fault,
+       },
+       [RV_VDSO_MAP_VDSO] = {
+               .name   = "[vdso]",
+               .mremap = vdso_mremap,
+       },
+};
+
+static int __init vdso_init(void)
+{
+       vdso_info.dm = &rv_vdso_maps[RV_VDSO_MAP_VVAR];
+       vdso_info.cm = &rv_vdso_maps[RV_VDSO_MAP_VDSO];
+
+       return __vdso_init();
+}
 arch_initcall(vdso_init);
 
-int arch_setup_additional_pages(struct linux_binprm *bprm,
-       int uses_interp)
+static int __setup_additional_pages(struct mm_struct *mm,
+                                   struct linux_binprm *bprm,
+                                   int uses_interp)
 {
-       struct mm_struct *mm = current->mm;
-       unsigned long vdso_base, vdso_len;
-       int ret;
+       unsigned long vdso_base, vdso_text_len, vdso_mapping_len;
+       void *ret;
 
        BUILD_BUG_ON(VVAR_NR_PAGES != __VVAR_PAGES);
 
-       vdso_len = (vdso_pages + VVAR_NR_PAGES) << PAGE_SHIFT;
+       vdso_text_len = vdso_info.vdso_pages << PAGE_SHIFT;
+       /* Be sure to map the data page */
+       vdso_mapping_len = vdso_text_len + VVAR_SIZE;
 
-       if (mmap_write_lock_killable(mm))
-               return -EINTR;
-
-       vdso_base = get_unmapped_area(NULL, 0, vdso_len, 0, 0);
+       vdso_base = get_unmapped_area(NULL, 0, vdso_mapping_len, 0, 0);
        if (IS_ERR_VALUE(vdso_base)) {
-               ret = vdso_base;
-               goto end;
+               ret = ERR_PTR(vdso_base);
+               goto up_fail;
        }
 
-       mm->context.vdso = NULL;
-       ret = install_special_mapping(mm, vdso_base, VVAR_SIZE,
-               (VM_READ | VM_MAYREAD), &vdso_pagelist[vdso_pages]);
-       if (unlikely(ret))
-               goto end;
+       ret = _install_special_mapping(mm, vdso_base, VVAR_SIZE,
+               (VM_READ | VM_MAYREAD | VM_PFNMAP), vdso_info.dm);
+       if (IS_ERR(ret))
+               goto up_fail;
 
+       vdso_base += VVAR_SIZE;
+       mm->context.vdso = (void *)vdso_base;
        ret =
-          install_special_mapping(mm, vdso_base + VVAR_SIZE,
-               vdso_pages << PAGE_SHIFT,
+          _install_special_mapping(mm, vdso_base, vdso_text_len,
                (VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC),
-               vdso_pagelist);
+               vdso_info.cm);
 
-       if (unlikely(ret))
-               goto end;
+       if (IS_ERR(ret))
+               goto up_fail;
 
-       /*
-        * Put vDSO base into mm struct. We need to do this before calling
-        * install_special_mapping or the perf counter mmap tracking code
-        * will fail to recognise it as a vDSO (since arch_vma_name fails).
-        */
-       mm->context.vdso = (void *)vdso_base + VVAR_SIZE;
+       return 0;
 
-end:
-       mmap_write_unlock(mm);
-       return ret;
+up_fail:
+       mm->context.vdso = NULL;
+       return PTR_ERR(ret);
 }
 
-const char *arch_vma_name(struct vm_area_struct *vma)
+int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 {
-       if (vma->vm_mm && (vma->vm_start == (long)vma->vm_mm->context.vdso))
-               return "[vdso]";
-       if (vma->vm_mm && (vma->vm_start ==
-                          (long)vma->vm_mm->context.vdso - VVAR_SIZE))
-               return "[vdso_data]";
-       return NULL;
+       struct mm_struct *mm = current->mm;
+       int ret;
+
+       if (mmap_write_lock_killable(mm))
+               return -EINTR;
+
+       ret = __setup_additional_pages(mm, bprm, uses_interp);
+       mmap_write_unlock(mm);
+
+       return ret;
 }
index e9111f7..01d94aa 100644 (file)
@@ -10,6 +10,9 @@ OUTPUT_ARCH(riscv)
 SECTIONS
 {
        PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE);
+#ifdef CONFIG_TIME_NS
+       PROVIDE(_timens_data = _vdso_data + PAGE_SIZE);
+#endif
        . = SIZEOF_HEADERS;
 
        .hash           : { *(.hash) }                  :text