x86/pti: Put the LDT in its own PGD if PTI is on

author Andy Lutomirski <luto@kernel.org>

Tue, 12 Dec 2017 15:56:45 +0000 (07:56 -0800)

committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Tue, 2 Jan 2018 19:31:00 +0000 (20:31 +0100)
author Andy Lutomirski <luto@kernel.org>
Tue, 12 Dec 2017 15:56:45 +0000 (07:56 -0800)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Tue, 2 Jan 2018 19:31:00 +0000 (20:31 +0100)
diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt

index 496a1db..ad41b38 100644 (file)
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -12,6 +12,7 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
  ... unused hole ...
  ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
  ... unused hole ...
+fffffe0000000000 - fffffe7fffffffff (=39 bits) LDT remap for PTI
  fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping
  ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
  ... unused hole ...
@@ -29,7 +30,7 @@ Virtual memory map with 5 level page tables:
  hole caused by [56:63] sign extension
  ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor
  ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory
-ff90000000000000 - ff9fffffffffffff (=52 bits) hole
+ff90000000000000 - ff9fffffffffffff (=52 bits) LDT remap for PTI
  ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space (12800 TB)
  ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole
  ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h

index 5ede7ca..c931b88 100644 (file)
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -50,10 +50,33 @@ struct ldt_struct {
          * call gates.  On native, we could merge the ldt_struct and LDT
          * allocations, but it's not worth trying to optimize.
          */
-       struct desc_struct *entries;
-       unsigned int nr_entries;
+       struct desc_struct      *entries;
+       unsigned int            nr_entries;
+
+       /*
+        * If PTI is in use, then the entries array is not mapped while we're
+        * in user mode.  The whole array will be aliased at the address
+        * given by ldt_slot_va(slot).  We use two slots so that we can allocate
+        * and map, and enable a new LDT without invalidating the mapping
+        * of an older, still-in-use LDT.
+        *
+        * slot will be -1 if this LDT doesn't have an alias mapping.
+        */
+       int                     slot;
  };
  
+/* This is a multiple of PAGE_SIZE. */
+#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE)
+
+static inline void *ldt_slot_va(int slot)
+{
+#ifdef CONFIG_X86_64
+       return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot);
+#else
+       BUG();
+#endif
+}
+
  /*
   * Used for LDT copy/destruction.
   */
@@ -64,6 +87,7 @@ static inline void init_new_context_ldt(struct mm_struct *mm)
  }
  int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm);
  void destroy_context_ldt(struct mm_struct *mm);
+void ldt_arch_exit_mmap(struct mm_struct *mm);
  #else  /* CONFIG_MODIFY_LDT_SYSCALL */
  static inline void init_new_context_ldt(struct mm_struct *mm) { }
  static inline int ldt_dup_context(struct mm_struct *oldmm,
@@ -71,7 +95,8 @@ static inline int ldt_dup_context(struct mm_struct *oldmm,
  {
         return 0;
  }
-static inline void destroy_context_ldt(struct mm_struct *mm) {}
+static inline void destroy_context_ldt(struct mm_struct *mm) { }
+static inline void ldt_arch_exit_mmap(struct mm_struct *mm) { }
  #endif
  
  static inline void load_mm_ldt(struct mm_struct *mm)
@@ -96,10 +121,31 @@ static inline void load_mm_ldt(struct mm_struct *mm)
          * that we can see.
          */
  
-       if (unlikely(ldt))
-               set_ldt(ldt->entries, ldt->nr_entries);
-       else
+       if (unlikely(ldt)) {
+               if (static_cpu_has(X86_FEATURE_PTI)) {
+                       if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) {
+                               /*
+                                * Whoops -- either the new LDT isn't mapped
+                                * (if slot == -1) or is mapped into a bogus
+                                * slot (if slot > 1).
+                                */
+                               clear_LDT();
+                               return;
+                       }
+
+                       /*
+                        * If page table isolation is enabled, ldt->entries
+                        * will not be mapped in the userspace pagetables.
+                        * Tell the CPU to access the LDT through the alias
+                        * at ldt_slot_va(ldt->slot).
+                        */
+                       set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries);
+               } else {
+                       set_ldt(ldt->entries, ldt->nr_entries);
+               }
+       } else {
                 clear_LDT();
+       }
  #else
         clear_LDT();
  #endif
@@ -194,6 +240,7 @@ static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
  static inline void arch_exit_mmap(struct mm_struct *mm)
  {
         paravirt_arch_exit_mmap(mm);
+       ldt_arch_exit_mmap(mm);
  }
  
  #ifdef CONFIG_X86_64
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h

index 83e9489..b97a539 100644 (file)
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -82,10 +82,14 @@ typedef struct { pteval_t pte; } pte_t;
  # define VMALLOC_SIZE_TB       _AC(12800, UL)
  # define __VMALLOC_BASE                _AC(0xffa0000000000000, UL)
  # define __VMEMMAP_BASE                _AC(0xffd4000000000000, UL)
+# define LDT_PGD_ENTRY         _AC(-112, UL)
+# define LDT_BASE_ADDR         (LDT_PGD_ENTRY << PGDIR_SHIFT)
  #else
  # define VMALLOC_SIZE_TB       _AC(32, UL)
  # define __VMALLOC_BASE                _AC(0xffffc90000000000, UL)
  # define __VMEMMAP_BASE                _AC(0xffffea0000000000, UL)
+# define LDT_PGD_ENTRY         _AC(-4, UL)
+# define LDT_BASE_ADDR         (LDT_PGD_ENTRY << PGDIR_SHIFT)
  #endif
  
  #ifdef CONFIG_RANDOMIZE_MEMORY
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h

index 9e482d8..9c18da6 100644 (file)
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -851,13 +851,22 @@ static inline void spin_lock_prefetch(const void *x)
  
  #else
  /*
- * User space process size. 47bits minus one guard page.  The guard
- * page is necessary on Intel CPUs: if a SYSCALL instruction is at
- * the highest possible canonical userspace address, then that
- * syscall will enter the kernel with a non-canonical return
- * address, and SYSRET will explode dangerously.  We avoid this
- * particular problem by preventing anything from being mapped
- * at the maximum canonical address.
+ * User space process size.  This is the first address outside the user range.
+ * There are a few constraints that determine this:
+ *
+ * On Intel CPUs, if a SYSCALL instruction is at the highest canonical
+ * address, then that syscall will enter the kernel with a
+ * non-canonical return address, and SYSRET will explode dangerously.
+ * We avoid this particular problem by preventing anything executable
+ * from being mapped at the maximum canonical address.
+ *
+ * On AMD CPUs in the Ryzen family, there's a nasty bug in which the
+ * CPUs malfunction if they execute code from the highest canonical page.
+ * They'll speculate right off the end of the canonical space, and
+ * bad things happen.  This is worked around in the same way as the
+ * Intel problem.
+ *
+ * With page table isolation enabled, the LDT alias is mapped at LDT_BASE_ADDR.
   */
  #define TASK_SIZE_MAX  ((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE)
  
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c

index a6b5d62..9629c5d 100644 (file)
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -24,6 +24,7 @@
  #include <linux/uaccess.h>
  
  #include <asm/ldt.h>
+#include <asm/tlb.h>
  #include <asm/desc.h>
  #include <asm/mmu_context.h>
  #include <asm/syscalls.h>
@@ -51,13 +52,11 @@ static void refresh_ldt_segments(void)
  static void flush_ldt(void *__mm)
  {
         struct mm_struct *mm = __mm;
-       mm_context_t *pc;
  
         if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm)
                 return;
  
-       pc = &mm->context;
-       set_ldt(pc->ldt->entries, pc->ldt->nr_entries);
+       load_mm_ldt(mm);
  
         refresh_ldt_segments();
  }
@@ -94,10 +93,121 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
                 return NULL;
         }
  
+       /* The new LDT isn't aliased for PTI yet. */
+       new_ldt->slot = -1;
+
         new_ldt->nr_entries = num_entries;
         return new_ldt;
  }
  
+/*
+ * If PTI is enabled, this maps the LDT into the kernelmode and
+ * usermode tables for the given mm.
+ *
+ * There is no corresponding unmap function.  Even if the LDT is freed, we
+ * leave the PTEs around until the slot is reused or the mm is destroyed.
+ * This is harmless: the LDT is always in ordinary memory, and no one will
+ * access the freed slot.
+ *
+ * If we wanted to unmap freed LDTs, we'd also need to do a flush to make
+ * it useful, and the flush would slow down modify_ldt().
+ */
+static int
+map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
+{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+       bool is_vmalloc, had_top_level_entry;
+       unsigned long va;
+       spinlock_t *ptl;
+       pgd_t *pgd;
+       int i;
+
+       if (!static_cpu_has(X86_FEATURE_PTI))
+               return 0;
+
+       /*
+        * Any given ldt_struct should have map_ldt_struct() called at most
+        * once.
+        */
+       WARN_ON(ldt->slot != -1);
+
+       /*
+        * Did we already have the top level entry allocated?  We can't
+        * use pgd_none() for this because it doesn't do anything on
+        * 4-level page table kernels.
+        */
+       pgd = pgd_offset(mm, LDT_BASE_ADDR);
+       had_top_level_entry = (pgd->pgd != 0);
+
+       is_vmalloc = is_vmalloc_addr(ldt->entries);
+
+       for (i = 0; i * PAGE_SIZE < ldt->nr_entries * LDT_ENTRY_SIZE; i++) {
+               unsigned long offset = i << PAGE_SHIFT;
+               const void *src = (char *)ldt->entries + offset;
+               unsigned long pfn;
+               pte_t pte, *ptep;
+
+               va = (unsigned long)ldt_slot_va(slot) + offset;
+               pfn = is_vmalloc ? vmalloc_to_pfn(src) :
+                       page_to_pfn(virt_to_page(src));
+               /*
+                * Treat the PTI LDT range as a *userspace* range.
+                * get_locked_pte() will allocate all needed pagetables
+                * and account for them in this mm.
+                */
+               ptep = get_locked_pte(mm, va, &ptl);
+               if (!ptep)
+                       return -ENOMEM;
+               pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL & ~_PAGE_GLOBAL));
+               set_pte_at(mm, va, ptep, pte);
+               pte_unmap_unlock(ptep, ptl);
+       }
+
+       if (mm->context.ldt) {
+               /*
+                * We already had an LDT.  The top-level entry should already
+                * have been allocated and synchronized with the usermode
+                * tables.
+                */
+               WARN_ON(!had_top_level_entry);
+               if (static_cpu_has(X86_FEATURE_PTI))
+                       WARN_ON(!kernel_to_user_pgdp(pgd)->pgd);
+       } else {
+               /*
+                * This is the first time we're mapping an LDT for this process.
+                * Sync the pgd to the usermode tables.
+                */
+               WARN_ON(had_top_level_entry);
+               if (static_cpu_has(X86_FEATURE_PTI)) {
+                       WARN_ON(kernel_to_user_pgdp(pgd)->pgd);
+                       set_pgd(kernel_to_user_pgdp(pgd), *pgd);
+               }
+       }
+
+       va = (unsigned long)ldt_slot_va(slot);
+       flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0);
+
+       ldt->slot = slot;
+#endif
+       return 0;
+}
+
+static void free_ldt_pgtables(struct mm_struct *mm)
+{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+       struct mmu_gather tlb;
+       unsigned long start = LDT_BASE_ADDR;
+       unsigned long end = start + (1UL << PGDIR_SHIFT);
+
+       if (!static_cpu_has(X86_FEATURE_PTI))
+               return;
+
+       tlb_gather_mmu(&tlb, mm, start, end);
+       free_pgd_range(&tlb, start, end, start, end);
+       tlb_finish_mmu(&tlb, start, end);
+#endif
+}
+
  /* After calling this, the LDT is immutable. */
  static void finalize_ldt_struct(struct ldt_struct *ldt)
  {
@@ -156,6 +266,12 @@ int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm)
                new_ldt->nr_entries * LDT_ENTRY_SIZE);
         finalize_ldt_struct(new_ldt);
  
+       retval = map_ldt_struct(mm, new_ldt, 0);
+       if (retval) {
+               free_ldt_pgtables(mm);
+               free_ldt_struct(new_ldt);
+               goto out_unlock;
+       }
         mm->context.ldt = new_ldt;
  
  out_unlock:
@@ -174,6 +290,11 @@ void destroy_context_ldt(struct mm_struct *mm)
         mm->context.ldt = NULL;
  }
  
+void ldt_arch_exit_mmap(struct mm_struct *mm)
+{
+       free_ldt_pgtables(mm);
+}
+
  static int read_ldt(void __user *ptr, unsigned long bytecount)
  {
         struct mm_struct *mm = current->mm;
@@ -287,6 +408,26 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
         new_ldt->entries[ldt_info.entry_number] = ldt;
         finalize_ldt_struct(new_ldt);
  
+       /*
+        * If we are using PTI, map the new LDT into the userspace pagetables.
+        * If there is already an LDT, use the other slot so that other CPUs
+        * will continue to use the old LDT until install_ldt() switches
+        * them over to the new LDT.
+        */
+       error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0);
+       if (error) {
+               /*
+                * install_ldt() has not run, so old_ldt (if any) is still in
+                * use and must not be freed; discard new_ldt instead.  When
+                * there was no previous LDT, also drop any partially
+                * populated LDT page tables, as ldt_dup_context() does.
+                */
+               if (!old_ldt)
+                       free_ldt_pgtables(mm);
+               free_ldt_struct(new_ldt);
+               goto out_unlock;
+       }
+
         install_ldt(mm, new_ldt);
         free_ldt_struct(old_ldt);
         error = 0;
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c

index 43dedbf..690eaf3 100644 (file)
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -52,12 +52,18 @@ enum address_markers_idx {
         USER_SPACE_NR = 0,
         KERNEL_SPACE_NR,
         LOW_KERNEL_NR,
+#if defined(CONFIG_MODIFY_LDT_SYSCALL) && defined(CONFIG_X86_5LEVEL)
+       LDT_NR,
+#endif
         VMALLOC_START_NR,
         VMEMMAP_START_NR,
  #ifdef CONFIG_KASAN
         KASAN_SHADOW_START_NR,
         KASAN_SHADOW_END_NR,
  #endif
+#if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL)
+       LDT_NR,
+#endif
         CPU_ENTRY_AREA_NR,
  #ifdef CONFIG_X86_ESPFIX64
         ESPFIX_START_NR,
@@ -82,6 +88,9 @@ static struct addr_marker address_markers[] = {
         [KASAN_SHADOW_START_NR] = { KASAN_SHADOW_START, "KASAN shadow" },
         [KASAN_SHADOW_END_NR]   = { KASAN_SHADOW_END,   "KASAN shadow end" },
  #endif
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+       [LDT_NR]                = { LDT_BASE_ADDR,      "LDT remap" },
+#endif
         [CPU_ENTRY_AREA_NR]     = { CPU_ENTRY_AREA_BASE,"CPU entry Area" },
  #ifdef CONFIG_X86_ESPFIX64
         [ESPFIX_START_NR]       = { ESPFIX_BASE_ADDR,   "ESPfix Area", 16 },
author	Andy Lutomirski <luto@kernel.org>
	Tue, 12 Dec 2017 15:56:45 +0000 (07:56 -0800)
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Tue, 2 Jan 2018 19:31:00 +0000 (20:31 +0100)
Documentation/x86/x86_64/mm.txt		patch \| blob \| history
arch/x86/include/asm/mmu_context.h		patch \| blob \| history
arch/x86/include/asm/pgtable_64_types.h		patch \| blob \| history
arch/x86/include/asm/processor.h		patch \| blob \| history
arch/x86/kernel/ldt.c		patch \| blob \| history
arch/x86/mm/dump_pagetables.c		patch \| blob \| history