Merge branch 'x86-cleanups-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
authorLinus Torvalds <torvalds@linux-foundation.org>
Tue, 31 Mar 2020 18:04:05 +0000 (11:04 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Tue, 31 Mar 2020 18:04:05 +0000 (11:04 -0700)
Pull x86 cleanups from Ingo Molnar:
 "This topic tree contains more commits than usual:

   - most of it are uaccess cleanups/reorganization by Al

   - there's a bunch of prototype declaration (-Wmissing-prototypes)
     cleanups

   - misc other cleanups all around the map"

* 'x86-cleanups-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (36 commits)
  x86/mm/set_memory: Fix -Wmissing-prototypes warnings
  x86/efi: Add a prototype for efi_arch_mem_reserve()
  x86/mm: Mark setup_emu2phys_nid() static
  x86/jump_label: Move 'inline' keyword placement
  x86/platform/uv: Add a missing prototype for uv_bau_message_interrupt()
  kill uaccess_try()
  x86: unsafe_put-style macro for sigmask
  x86: x32_setup_rt_frame(): consolidate uaccess areas
  x86: __setup_rt_frame(): consolidate uaccess areas
  x86: __setup_frame(): consolidate uaccess areas
  x86: setup_sigcontext(): lift user_access_{begin,end}() into the callers
  x86: get rid of put_user_try in __setup_rt_frame() (both 32bit and 64bit)
  x86: ia32_setup_rt_frame(): consolidate uaccess areas
  x86: ia32_setup_frame(): consolidate uaccess areas
  x86: ia32_setup_sigcontext(): lift user_access_{begin,end}() into the callers
  x86/alternatives: Mark text_poke_loc_init() static
  x86/cpu: Fix a -Wmissing-prototypes warning for init_ia32_feat_ctl()
  x86/mm: Drop pud_mknotpresent()
  x86: Replace setup_irq() by request_irq()
  x86/configs: Slightly reduce defconfigs
  ...

15 files changed:
1  2 
Documentation/x86/exception-tables.rst
arch/x86/entry/common.c
arch/x86/include/asm/processor.h
arch/x86/include/asm/sighandling.h
arch/x86/include/asm/syscall.h
arch/x86/include/asm/uaccess.h
arch/x86/kernel/signal.c
arch/x86/kernel/smpboot.c
arch/x86/kernel/time.c
arch/x86/kernel/tsc.c
arch/x86/kvm/mmu/paging_tmpl.h
arch/x86/kvm/vmx/vmx.c
arch/x86/kvm/vmx/vmx.h
include/linux/compat.h
include/linux/efi.h

@@@ -257,9 -257,6 +257,9 @@@ the fault, in our case the actual valu
  the original assembly code: > 3:      movl $-14,%eax
  and linked in vmlinux     : > c0199ff5 <.fixup+10b5> movl   $0xfffffff2,%eax
  
 +If the fixup was able to handle the exception, control flow may be returned
 +to the instruction after the one that triggered the fault, ie. local label 2b.
 +
  The assembly code::
  
   > .section __ex_table,"a"
@@@ -340,21 -337,4 +340,15 @@@ pointer which points to one of
       entry->insn. It is used to distinguish page faults from machine
       check.
  
- 3) ``int ex_handler_ext(const struct exception_table_entry *fixup)``
-      This case is used for uaccess_err ... we need to set a flag
-      in the task structure. Before the handler functions existed this
-      case was handled by adding a large offset to the fixup to tag
-      it as special.
  More functions can easily be added.
 +
 +CONFIG_BUILDTIME_TABLE_SORT allows the __ex_table section to be sorted post
 +link of the kernel image, via a host utility scripts/sorttable. It will set the
 +symbol main_extable_sort_needed to 0, avoiding sorting the __ex_table section
 +at boot time. With the exception table sorted, at runtime when an exception
 +occurs we can quickly lookup the __ex_table entry via binary search.
 +
 +This is not just a boot time optimization, some architectures require this
 +table to be sorted in order to handle exceptions relatively early in the boot
 +process. For example, i386 makes use of this form of exception handling before
 +paging support is even enabled!
diff --combined arch/x86/entry/common.c
@@@ -34,6 -34,7 +34,7 @@@
  #include <asm/fpu/api.h>
  #include <asm/nospec-branch.h>
  #include <asm/io_bitmap.h>
+ #include <asm/syscall.h>
  
  #define CREATE_TRACE_POINTS
  #include <trace/events/syscalls.h>
@@@ -333,7 -334,20 +334,7 @@@ static __always_inline void do_syscall_
  
        if (likely(nr < IA32_NR_syscalls)) {
                nr = array_index_nospec(nr, IA32_NR_syscalls);
 -#ifdef CONFIG_IA32_EMULATION
                regs->ax = ia32_sys_call_table[nr](regs);
 -#else
 -              /*
 -               * It's possible that a 32-bit syscall implementation
 -               * takes a 64-bit parameter but nonetheless assumes that
 -               * the high bits are zero.  Make sure we zero-extend all
 -               * of the args.
 -               */
 -              regs->ax = ia32_sys_call_table[nr](
 -                      (unsigned int)regs->bx, (unsigned int)regs->cx,
 -                      (unsigned int)regs->dx, (unsigned int)regs->si,
 -                      (unsigned int)regs->di, (unsigned int)regs->bp);
 -#endif /* CONFIG_IA32_EMULATION */
        }
  
        syscall_return_slowpath(regs);
@@@ -425,8 -439,3 +426,8 @@@ __visible long do_fast_syscall_32(struc
  #endif
  }
  #endif
 +
 +SYSCALL_DEFINE0(ni_syscall)
 +{
 +      return -ENOSYS;
 +}
@@@ -26,7 -26,6 +26,7 @@@ struct vm86
  #include <asm/fpu/types.h>
  #include <asm/unwind_hints.h>
  #include <asm/vmxfeatures.h>
 +#include <asm/vdso/processor.h>
  
  #include <linux/personality.h>
  #include <linux/cache.h>
@@@ -542,7 -541,6 +542,6 @@@ struct thread_struct 
        mm_segment_t            addr_limit;
  
        unsigned int            sig_on_uaccess_err:1;
-       unsigned int            uaccess_err:1;  /* uaccess failed */
  
        /* Floating point and extended processor state */
        struct fpu              fpu;
@@@ -678,6 -676,17 +677,6 @@@ static inline unsigned int cpuid_edx(un
        return edx;
  }
  
 -/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
 -static __always_inline void rep_nop(void)
 -{
 -      asm volatile("rep; nop" ::: "memory");
 -}
 -
 -static __always_inline void cpu_relax(void)
 -{
 -      rep_nop();
 -}
 -
  /*
   * This function forces the icache and prefetched instruction stream to
   * catch up with reality in two very specific cases:
@@@ -14,7 -14,9 +14,5 @@@
                         X86_EFLAGS_CF | X86_EFLAGS_RF)
  
  void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
- int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
-                    struct pt_regs *regs, unsigned long mask);
  
 -#ifdef CONFIG_X86_X32_ABI
 -asmlinkage long sys32_x32_rt_sigreturn(void);
 -#endif
 -
  #endif /* _ASM_X86_SIGHANDLING_H */
  #include <uapi/linux/audit.h>
  #include <linux/sched.h>
  #include <linux/err.h>
 -#include <asm/asm-offsets.h>  /* For NR_syscalls */
  #include <asm/thread_info.h>  /* for TS_COMPAT */
  #include <asm/unistd.h>
  
 -#ifdef CONFIG_X86_64
 -typedef asmlinkage long (*sys_call_ptr_t)(const struct pt_regs *);
 -#else
 -typedef asmlinkage long (*sys_call_ptr_t)(unsigned long, unsigned long,
 -                                        unsigned long, unsigned long,
 -                                        unsigned long, unsigned long);
 -#endif /* CONFIG_X86_64 */
 +typedef long (*sys_call_ptr_t)(const struct pt_regs *);
  extern const sys_call_ptr_t sys_call_table[];
  
  #if defined(CONFIG_X86_32)
  #define ia32_sys_call_table sys_call_table
 -#define __NR_syscall_compat_max __NR_syscall_max
 -#define IA32_NR_syscalls NR_syscalls
  #endif
  
  #if defined(CONFIG_IA32_EMULATION)
@@@ -159,6 -168,11 +159,11 @@@ static inline int syscall_get_arch(stru
                task->thread_info.status & TS_COMPAT)
                ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
  }
+ void do_syscall_64(unsigned long nr, struct pt_regs *regs);
+ void do_int80_syscall_32(struct pt_regs *regs);
+ long do_fast_syscall_32(struct pt_regs *regs);
  #endif        /* CONFIG_X86_32 */
  
  #endif        /* _ASM_X86_SYSCALL_H */
@@@ -193,23 -193,12 +193,12 @@@ __typeof__(__builtin_choose_expr(sizeof
                     : : "A" (x), "r" (addr)                    \
                     : : label)
  
- #define __put_user_asm_ex_u64(x, addr)                                        \
-       asm volatile("\n"                                               \
-                    "1:        movl %%eax,0(%1)\n"                     \
-                    "2:        movl %%edx,4(%1)\n"                     \
-                    "3:"                                               \
-                    _ASM_EXTABLE_EX(1b, 2b)                            \
-                    _ASM_EXTABLE_EX(2b, 3b)                            \
-                    : : "A" (x), "r" (addr))
  #define __put_user_x8(x, ptr, __ret_pu)                               \
        asm volatile("call __put_user_8" : "=a" (__ret_pu)      \
                     : "A" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx")
  #else
  #define __put_user_goto_u64(x, ptr, label) \
        __put_user_goto(x, ptr, "q", "", "er", label)
- #define __put_user_asm_ex_u64(x, addr)        \
-       __put_user_asm_ex(x, addr, "q", "", "er")
  #define __put_user_x8(x, ptr, __ret_pu) __put_user_x(8, x, ptr, __ret_pu)
  #endif
  
@@@ -289,31 -278,6 +278,6 @@@ do {                                                                      
        }                                                               \
  } while (0)
  
- /*
-  * This doesn't do __uaccess_begin/end - the exception handling
-  * around it must do that.
-  */
- #define __put_user_size_ex(x, ptr, size)                              \
- do {                                                                  \
-       __chk_user_ptr(ptr);                                            \
-       switch (size) {                                                 \
-       case 1:                                                         \
-               __put_user_asm_ex(x, ptr, "b", "b", "iq");              \
-               break;                                                  \
-       case 2:                                                         \
-               __put_user_asm_ex(x, ptr, "w", "w", "ir");              \
-               break;                                                  \
-       case 4:                                                         \
-               __put_user_asm_ex(x, ptr, "l", "k", "ir");              \
-               break;                                                  \
-       case 8:                                                         \
-               __put_user_asm_ex_u64((__typeof__(*ptr))(x), ptr);      \
-               break;                                                  \
-       default:                                                        \
-               __put_user_bad();                                       \
-       }                                                               \
- } while (0)
  #ifdef CONFIG_X86_32
  #define __get_user_asm_u64(x, ptr, retval, errret)                    \
  ({                                                                    \
                       "i" (errret), "0" (retval));                     \
  })
  
- #define __get_user_asm_ex_u64(x, ptr)                 (x) = __get_user_bad()
  #else
  #define __get_user_asm_u64(x, ptr, retval, errret) \
         __get_user_asm(x, ptr, retval, "q", "", "=r", errret)
- #define __get_user_asm_ex_u64(x, ptr) \
-        __get_user_asm_ex(x, ptr, "q", "", "=r")
  #endif
  
  #define __get_user_size(x, ptr, size, retval, errret)                 \
@@@ -378,53 -339,6 +339,6 @@@ do {                                                                      
                     : "=r" (err), ltype(x)                             \
                     : "m" (__m(addr)), "i" (errret), "0" (err))
  
- #define __get_user_asm_nozero(x, addr, err, itype, rtype, ltype, errret)      \
-       asm volatile("\n"                                               \
-                    "1:        mov"itype" %2,%"rtype"1\n"              \
-                    "2:\n"                                             \
-                    ".section .fixup,\"ax\"\n"                         \
-                    "3:        mov %3,%0\n"                            \
-                    "  jmp 2b\n"                                       \
-                    ".previous\n"                                      \
-                    _ASM_EXTABLE_UA(1b, 3b)                            \
-                    : "=r" (err), ltype(x)                             \
-                    : "m" (__m(addr)), "i" (errret), "0" (err))
- /*
-  * This doesn't do __uaccess_begin/end - the exception handling
-  * around it must do that.
-  */
- #define __get_user_size_ex(x, ptr, size)                              \
- do {                                                                  \
-       __chk_user_ptr(ptr);                                            \
-       switch (size) {                                                 \
-       case 1:                                                         \
-               __get_user_asm_ex(x, ptr, "b", "b", "=q");              \
-               break;                                                  \
-       case 2:                                                         \
-               __get_user_asm_ex(x, ptr, "w", "w", "=r");              \
-               break;                                                  \
-       case 4:                                                         \
-               __get_user_asm_ex(x, ptr, "l", "k", "=r");              \
-               break;                                                  \
-       case 8:                                                         \
-               __get_user_asm_ex_u64(x, ptr);                          \
-               break;                                                  \
-       default:                                                        \
-               (x) = __get_user_bad();                                 \
-       }                                                               \
- } while (0)
- #define __get_user_asm_ex(x, addr, itype, rtype, ltype)                       \
-       asm volatile("1:        mov"itype" %1,%"rtype"0\n"              \
-                    "2:\n"                                             \
-                    ".section .fixup,\"ax\"\n"                         \
-                      "3:xor"itype" %"rtype"0,%"rtype"0\n"             \
-                    "  jmp 2b\n"                                       \
-                    ".previous\n"                                      \
-                    _ASM_EXTABLE_EX(1b, 3b)                            \
-                    : ltype(x) : "m" (__m(addr)))
  #define __put_user_nocheck(x, ptr, size)                      \
  ({                                                            \
        __label__ __pu_label;                                   \
@@@ -480,29 -394,6 +394,6 @@@ struct __large_struct { unsigned long b
        retval = __put_user_failed(x, addr, itype, rtype, ltype, errret);       \
  } while (0)
  
- #define __put_user_asm_ex(x, addr, itype, rtype, ltype)                       \
-       asm volatile("1:        mov"itype" %"rtype"0,%1\n"              \
-                    "2:\n"                                             \
-                    _ASM_EXTABLE_EX(1b, 2b)                            \
-                    : : ltype(x), "m" (__m(addr)))
- /*
-  * uaccess_try and catch
-  */
- #define uaccess_try   do {                                            \
-       current->thread.uaccess_err = 0;                                \
-       __uaccess_begin();                                              \
-       barrier();
- #define uaccess_try_nospec do {                                               \
-       current->thread.uaccess_err = 0;                                \
-       __uaccess_begin_nospec();                                       \
- #define uaccess_catch(err)                                            \
-       __uaccess_end();                                                \
-       (err) |= (current->thread.uaccess_err ? -EFAULT : 0);           \
- } while (0)
  /**
   * __get_user - Get a simple variable from user space, with less checking.
   * @x:   Variable to store result.
  #define __put_user(x, ptr)                                            \
        __put_user_nocheck((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)))
  
- /*
-  * {get|put}_user_try and catch
-  *
-  * get_user_try {
-  *    get_user_ex(...);
-  * } get_user_catch(err)
-  */
- #define get_user_try          uaccess_try_nospec
- #define get_user_catch(err)   uaccess_catch(err)
- #define get_user_ex(x, ptr)   do {                                    \
-       unsigned long __gue_val;                                        \
-       __get_user_size_ex((__gue_val), (ptr), (sizeof(*(ptr))));       \
-       (x) = (__force __typeof__(*(ptr)))__gue_val;                    \
- } while (0)
- #define put_user_try          uaccess_try
- #define put_user_catch(err)   uaccess_catch(err)
- #define put_user_ex(x, ptr)                                           \
-       __put_user_size_ex((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)))
  extern unsigned long
  copy_from_user_nmi(void *to, const void __user *from, unsigned long n);
  extern __must_check long
@@@ -584,6 -453,99 +453,6 @@@ extern __must_check long strnlen_user(c
  unsigned long __must_check clear_user(void __user *mem, unsigned long len);
  unsigned long __must_check __clear_user(void __user *mem, unsigned long len);
  
 -extern void __cmpxchg_wrong_size(void)
 -      __compiletime_error("Bad argument size for cmpxchg");
 -
 -#define __user_atomic_cmpxchg_inatomic(uval, ptr, old, new, size)     \
 -({                                                                    \
 -      int __ret = 0;                                                  \
 -      __typeof__(*(ptr)) __old = (old);                               \
 -      __typeof__(*(ptr)) __new = (new);                               \
 -      __uaccess_begin_nospec();                                       \
 -      switch (size) {                                                 \
 -      case 1:                                                         \
 -      {                                                               \
 -              asm volatile("\n"                                       \
 -                      "1:\t" LOCK_PREFIX "cmpxchgb %4, %2\n"          \
 -                      "2:\n"                                          \
 -                      "\t.section .fixup, \"ax\"\n"                   \
 -                      "3:\tmov     %3, %0\n"                          \
 -                      "\tjmp     2b\n"                                \
 -                      "\t.previous\n"                                 \
 -                      _ASM_EXTABLE_UA(1b, 3b)                         \
 -                      : "+r" (__ret), "=a" (__old), "+m" (*(ptr))     \
 -                      : "i" (-EFAULT), "q" (__new), "1" (__old)       \
 -                      : "memory"                                      \
 -              );                                                      \
 -              break;                                                  \
 -      }                                                               \
 -      case 2:                                                         \
 -      {                                                               \
 -              asm volatile("\n"                                       \
 -                      "1:\t" LOCK_PREFIX "cmpxchgw %4, %2\n"          \
 -                      "2:\n"                                          \
 -                      "\t.section .fixup, \"ax\"\n"                   \
 -                      "3:\tmov     %3, %0\n"                          \
 -                      "\tjmp     2b\n"                                \
 -                      "\t.previous\n"                                 \
 -                      _ASM_EXTABLE_UA(1b, 3b)                         \
 -                      : "+r" (__ret), "=a" (__old), "+m" (*(ptr))     \
 -                      : "i" (-EFAULT), "r" (__new), "1" (__old)       \
 -                      : "memory"                                      \
 -              );                                                      \
 -              break;                                                  \
 -      }                                                               \
 -      case 4:                                                         \
 -      {                                                               \
 -              asm volatile("\n"                                       \
 -                      "1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n"          \
 -                      "2:\n"                                          \
 -                      "\t.section .fixup, \"ax\"\n"                   \
 -                      "3:\tmov     %3, %0\n"                          \
 -                      "\tjmp     2b\n"                                \
 -                      "\t.previous\n"                                 \
 -                      _ASM_EXTABLE_UA(1b, 3b)                         \
 -                      : "+r" (__ret), "=a" (__old), "+m" (*(ptr))     \
 -                      : "i" (-EFAULT), "r" (__new), "1" (__old)       \
 -                      : "memory"                                      \
 -              );                                                      \
 -              break;                                                  \
 -      }                                                               \
 -      case 8:                                                         \
 -      {                                                               \
 -              if (!IS_ENABLED(CONFIG_X86_64))                         \
 -                      __cmpxchg_wrong_size();                         \
 -                                                                      \
 -              asm volatile("\n"                                       \
 -                      "1:\t" LOCK_PREFIX "cmpxchgq %4, %2\n"          \
 -                      "2:\n"                                          \
 -                      "\t.section .fixup, \"ax\"\n"                   \
 -                      "3:\tmov     %3, %0\n"                          \
 -                      "\tjmp     2b\n"                                \
 -                      "\t.previous\n"                                 \
 -                      _ASM_EXTABLE_UA(1b, 3b)                         \
 -                      : "+r" (__ret), "=a" (__old), "+m" (*(ptr))     \
 -                      : "i" (-EFAULT), "r" (__new), "1" (__old)       \
 -                      : "memory"                                      \
 -              );                                                      \
 -              break;                                                  \
 -      }                                                               \
 -      default:                                                        \
 -              __cmpxchg_wrong_size();                                 \
 -      }                                                               \
 -      __uaccess_end();                                                \
 -      *(uval) = __old;                                                \
 -      __ret;                                                          \
 -})
 -
 -#define user_atomic_cmpxchg_inatomic(uval, ptr, old, new)             \
 -({                                                                    \
 -      access_ok((ptr), sizeof(*(ptr))) ?              \
 -              __user_atomic_cmpxchg_inatomic((uval), (ptr),           \
 -                              (old), (new), sizeof(*(ptr))) :         \
 -              -EFAULT;                                                \
 -})
 -
  /*
   * movsl can be slow when source and dest are not both 8-byte aligned
   */
@@@ -602,15 -564,6 +471,6 @@@ extern struct movsl_mask 
  #endif
  
  /*
-  * We rely on the nested NMI work to allow atomic faults from the NMI path; the
-  * nested NMI paths are careful to preserve CR2.
-  *
-  * Caller must use pagefault_enable/disable, or run in interrupt context,
-  * and also do a uaccess_ok() check
-  */
- #define __copy_from_user_nmi __copy_from_user_inatomic
- /*
   * The "unsafe" user accesses aren't really "unsafe", but the naming
   * is a big fat warning: you have to not only do the access_ok()
   * checking before using them, but you have to surround them with the
diff --combined arch/x86/kernel/signal.c
  #endif /* CONFIG_X86_64 */
  
  #include <asm/syscall.h>
 -#include <asm/syscalls.h>
 -
  #include <asm/sigframe.h>
  #include <asm/signal.h>
  
- #define COPY(x)                       do {                    \
-       get_user_ex(regs->x, &sc->x);                   \
- } while (0)
- #define GET_SEG(seg)          ({                      \
-       unsigned short tmp;                             \
-       get_user_ex(tmp, &sc->seg);                     \
-       tmp;                                            \
- })
- #define COPY_SEG(seg)         do {                    \
-       regs->seg = GET_SEG(seg);                       \
- } while (0)
- #define COPY_SEG_CPL3(seg)    do {                    \
-       regs->seg = GET_SEG(seg) | 3;                   \
- } while (0)
  #ifdef CONFIG_X86_64
  /*
   * If regs->ss will cause an IRET fault, change it.  Otherwise leave it
@@@ -90,53 -74,58 +72,58 @@@ static void force_valid_ss(struct pt_re
            ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA_EXPDOWN))
                regs->ss = __USER_DS;
  }
+ # define CONTEXT_COPY_SIZE    offsetof(struct sigcontext, reserved1)
+ #else
+ # define CONTEXT_COPY_SIZE    sizeof(struct sigcontext)
  #endif
  
  static int restore_sigcontext(struct pt_regs *regs,
-                             struct sigcontext __user *sc,
+                             struct sigcontext __user *usc,
                              unsigned long uc_flags)
  {
-       unsigned long buf_val;
-       void __user *buf;
-       unsigned int tmpflags;
-       unsigned int err = 0;
+       struct sigcontext sc;
  
        /* Always make any pending restarted system calls return -EINTR */
        current->restart_block.fn = do_no_restart_syscall;
  
-       get_user_try {
+       if (copy_from_user(&sc, usc, CONTEXT_COPY_SIZE))
+               return -EFAULT;
  
  #ifdef CONFIG_X86_32
-               set_user_gs(regs, GET_SEG(gs));
-               COPY_SEG(fs);
-               COPY_SEG(es);
-               COPY_SEG(ds);
+       set_user_gs(regs, sc.gs);
+       regs->fs = sc.fs;
+       regs->es = sc.es;
+       regs->ds = sc.ds;
  #endif /* CONFIG_X86_32 */
  
-               COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
-               COPY(dx); COPY(cx); COPY(ip); COPY(ax);
+       regs->bx = sc.bx;
+       regs->cx = sc.cx;
+       regs->dx = sc.dx;
+       regs->si = sc.si;
+       regs->di = sc.di;
+       regs->bp = sc.bp;
+       regs->ax = sc.ax;
+       regs->sp = sc.sp;
+       regs->ip = sc.ip;
  
  #ifdef CONFIG_X86_64
-               COPY(r8);
-               COPY(r9);
-               COPY(r10);
-               COPY(r11);
-               COPY(r12);
-               COPY(r13);
-               COPY(r14);
-               COPY(r15);
+       regs->r8 = sc.r8;
+       regs->r9 = sc.r9;
+       regs->r10 = sc.r10;
+       regs->r11 = sc.r11;
+       regs->r12 = sc.r12;
+       regs->r13 = sc.r13;
+       regs->r14 = sc.r14;
+       regs->r15 = sc.r15;
  #endif /* CONFIG_X86_64 */
  
-               COPY_SEG_CPL3(cs);
-               COPY_SEG_CPL3(ss);
-               get_user_ex(tmpflags, &sc->flags);
-               regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
-               regs->orig_ax = -1;             /* disable syscall checks */
+       /* Get CS/SS and force CPL3 */
+       regs->cs = sc.cs | 0x03;
+       regs->ss = sc.ss | 0x03;
  
-               get_user_ex(buf_val, &sc->fpstate);
-               buf = (void __user *)buf_val;
-       } get_user_catch(err);
+       regs->flags = (regs->flags & ~FIX_EFLAGS) | (sc.flags & FIX_EFLAGS);
+       /* disable syscall checks */
+       regs->orig_ax = -1;
  
  #ifdef CONFIG_X86_64
        /*
                force_valid_ss(regs);
  #endif
  
-       err |= fpu__restore_sig(buf, IS_ENABLED(CONFIG_X86_32));
-       return err;
+       return fpu__restore_sig((void __user *)sc.fpstate,
+                              IS_ENABLED(CONFIG_X86_32));
  }
  
- int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
+ static __always_inline int
+ __unsafe_setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
                     struct pt_regs *regs, unsigned long mask)
  {
-       int err = 0;
-       put_user_try {
  #ifdef CONFIG_X86_32
-               put_user_ex(get_user_gs(regs), (unsigned int __user *)&sc->gs);
-               put_user_ex(regs->fs, (unsigned int __user *)&sc->fs);
-               put_user_ex(regs->es, (unsigned int __user *)&sc->es);
-               put_user_ex(regs->ds, (unsigned int __user *)&sc->ds);
+       unsafe_put_user(get_user_gs(regs),
+                                 (unsigned int __user *)&sc->gs, Efault);
+       unsafe_put_user(regs->fs, (unsigned int __user *)&sc->fs, Efault);
+       unsafe_put_user(regs->es, (unsigned int __user *)&sc->es, Efault);
+       unsafe_put_user(regs->ds, (unsigned int __user *)&sc->ds, Efault);
  #endif /* CONFIG_X86_32 */
  
-               put_user_ex(regs->di, &sc->di);
-               put_user_ex(regs->si, &sc->si);
-               put_user_ex(regs->bp, &sc->bp);
-               put_user_ex(regs->sp, &sc->sp);
-               put_user_ex(regs->bx, &sc->bx);
-               put_user_ex(regs->dx, &sc->dx);
-               put_user_ex(regs->cx, &sc->cx);
-               put_user_ex(regs->ax, &sc->ax);
+       unsafe_put_user(regs->di, &sc->di, Efault);
+       unsafe_put_user(regs->si, &sc->si, Efault);
+       unsafe_put_user(regs->bp, &sc->bp, Efault);
+       unsafe_put_user(regs->sp, &sc->sp, Efault);
+       unsafe_put_user(regs->bx, &sc->bx, Efault);
+       unsafe_put_user(regs->dx, &sc->dx, Efault);
+       unsafe_put_user(regs->cx, &sc->cx, Efault);
+       unsafe_put_user(regs->ax, &sc->ax, Efault);
  #ifdef CONFIG_X86_64
-               put_user_ex(regs->r8, &sc->r8);
-               put_user_ex(regs->r9, &sc->r9);
-               put_user_ex(regs->r10, &sc->r10);
-               put_user_ex(regs->r11, &sc->r11);
-               put_user_ex(regs->r12, &sc->r12);
-               put_user_ex(regs->r13, &sc->r13);
-               put_user_ex(regs->r14, &sc->r14);
-               put_user_ex(regs->r15, &sc->r15);
+       unsafe_put_user(regs->r8, &sc->r8, Efault);
+       unsafe_put_user(regs->r9, &sc->r9, Efault);
+       unsafe_put_user(regs->r10, &sc->r10, Efault);
+       unsafe_put_user(regs->r11, &sc->r11, Efault);
+       unsafe_put_user(regs->r12, &sc->r12, Efault);
+       unsafe_put_user(regs->r13, &sc->r13, Efault);
+       unsafe_put_user(regs->r14, &sc->r14, Efault);
+       unsafe_put_user(regs->r15, &sc->r15, Efault);
  #endif /* CONFIG_X86_64 */
  
-               put_user_ex(current->thread.trap_nr, &sc->trapno);
-               put_user_ex(current->thread.error_code, &sc->err);
-               put_user_ex(regs->ip, &sc->ip);
+       unsafe_put_user(current->thread.trap_nr, &sc->trapno, Efault);
+       unsafe_put_user(current->thread.error_code, &sc->err, Efault);
+       unsafe_put_user(regs->ip, &sc->ip, Efault);
  #ifdef CONFIG_X86_32
-               put_user_ex(regs->cs, (unsigned int __user *)&sc->cs);
-               put_user_ex(regs->flags, &sc->flags);
-               put_user_ex(regs->sp, &sc->sp_at_signal);
-               put_user_ex(regs->ss, (unsigned int __user *)&sc->ss);
+       unsafe_put_user(regs->cs, (unsigned int __user *)&sc->cs, Efault);
+       unsafe_put_user(regs->flags, &sc->flags, Efault);
+       unsafe_put_user(regs->sp, &sc->sp_at_signal, Efault);
+       unsafe_put_user(regs->ss, (unsigned int __user *)&sc->ss, Efault);
  #else /* !CONFIG_X86_32 */
-               put_user_ex(regs->flags, &sc->flags);
-               put_user_ex(regs->cs, &sc->cs);
-               put_user_ex(0, &sc->gs);
-               put_user_ex(0, &sc->fs);
-               put_user_ex(regs->ss, &sc->ss);
+       unsafe_put_user(regs->flags, &sc->flags, Efault);
+       unsafe_put_user(regs->cs, &sc->cs, Efault);
+       unsafe_put_user(0, &sc->gs, Efault);
+       unsafe_put_user(0, &sc->fs, Efault);
+       unsafe_put_user(regs->ss, &sc->ss, Efault);
  #endif /* CONFIG_X86_32 */
  
-               put_user_ex(fpstate, (unsigned long __user *)&sc->fpstate);
+       unsafe_put_user(fpstate, (unsigned long __user *)&sc->fpstate, Efault);
  
-               /* non-iBCS2 extensions.. */
-               put_user_ex(mask, &sc->oldmask);
-               put_user_ex(current->thread.cr2, &sc->cr2);
-       } put_user_catch(err);
-       return err;
+       /* non-iBCS2 extensions.. */
+       unsafe_put_user(mask, &sc->oldmask, Efault);
+       unsafe_put_user(current->thread.cr2, &sc->cr2, Efault);
+       return 0;
+ Efault:
+       return -EFAULT;
  }
  
+ #define unsafe_put_sigcontext(sc, fp, regs, set, label)                       \
+ do {                                                                  \
+       if (__unsafe_setup_sigcontext(sc, fp, regs, set->sig[0]))       \
+               goto label;                                             \
+ } while(0);
+ #define unsafe_put_sigmask(set, frame, label) \
+       unsafe_put_user(*(__u64 *)(set), \
+                       (__u64 __user *)&(frame)->uc.uc_sigmask, \
+                       label)
  /*
   * Set up a signal frame.
   */
@@@ -310,26 -307,16 +305,16 @@@ __setup_frame(int sig, struct ksignal *
  {
        struct sigframe __user *frame;
        void __user *restorer;
-       int err = 0;
-       void __user *fpstate = NULL;
-       frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fpstate);
+       void __user *fp = NULL;
  
-       if (!access_ok(frame, sizeof(*frame)))
-               return -EFAULT;
+       frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fp);
  
-       if (__put_user(sig, &frame->sig))
+       if (!user_access_begin(frame, sizeof(*frame)))
                return -EFAULT;
  
-       if (setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0]))
-               return -EFAULT;
-       if (_NSIG_WORDS > 1) {
-               if (__copy_to_user(&frame->extramask, &set->sig[1],
-                                  sizeof(frame->extramask)))
-                       return -EFAULT;
-       }
+       unsafe_put_user(sig, &frame->sig, Efault);
+       unsafe_put_sigcontext(&frame->sc, fp, regs, set, Efault);
+       unsafe_put_user(set->sig[1], &frame->extramask[0], Efault);
        if (current->mm->context.vdso)
                restorer = current->mm->context.vdso +
                        vdso_image_32.sym___kernel_sigreturn;
                restorer = ksig->ka.sa.sa_restorer;
  
        /* Set up to return from userspace.  */
-       err |= __put_user(restorer, &frame->pretcode);
+       unsafe_put_user(restorer, &frame->pretcode, Efault);
  
        /*
         * This is popl %eax ; movl $__NR_sigreturn, %eax ; int $0x80
         * reasons and because gdb uses it as a signature to notice
         * signal handler stack frames.
         */
-       err |= __put_user(*((u64 *)&retcode), (u64 *)frame->retcode);
-       if (err)
-               return -EFAULT;
+       unsafe_put_user(*((u64 *)&retcode), (u64 *)frame->retcode, Efault);
+       user_access_end();
  
        /* Set up registers for signal handler */
        regs->sp = (unsigned long)frame;
        regs->cs = __USER_CS;
  
        return 0;
+ Efault:
+       user_access_end();
+       return -EFAULT;
  }
  
  static int __setup_rt_frame(int sig, struct ksignal *ksig,
  {
        struct rt_sigframe __user *frame;
        void __user *restorer;
-       int err = 0;
-       void __user *fpstate = NULL;
+       void __user *fp = NULL;
  
-       frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fpstate);
+       frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fp);
  
-       if (!access_ok(frame, sizeof(*frame)))
+       if (!user_access_begin(frame, sizeof(*frame)))
                return -EFAULT;
  
-       put_user_try {
-               put_user_ex(sig, &frame->sig);
-               put_user_ex(&frame->info, &frame->pinfo);
-               put_user_ex(&frame->uc, &frame->puc);
+       unsafe_put_user(sig, &frame->sig, Efault);
+       unsafe_put_user(&frame->info, &frame->pinfo, Efault);
+       unsafe_put_user(&frame->uc, &frame->puc, Efault);
  
-               /* Create the ucontext.  */
-               if (static_cpu_has(X86_FEATURE_XSAVE))
-                       put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags);
-               else
-                       put_user_ex(0, &frame->uc.uc_flags);
-               put_user_ex(0, &frame->uc.uc_link);
-               save_altstack_ex(&frame->uc.uc_stack, regs->sp);
+       /* Create the ucontext.  */
+       if (static_cpu_has(X86_FEATURE_XSAVE))
+               unsafe_put_user(UC_FP_XSTATE, &frame->uc.uc_flags, Efault);
+       else
+               unsafe_put_user(0, &frame->uc.uc_flags, Efault);
+       unsafe_put_user(0, &frame->uc.uc_link, Efault);
+       unsafe_save_altstack(&frame->uc.uc_stack, regs->sp, Efault);
  
-               /* Set up to return from userspace.  */
-               restorer = current->mm->context.vdso +
-                       vdso_image_32.sym___kernel_rt_sigreturn;
-               if (ksig->ka.sa.sa_flags & SA_RESTORER)
-                       restorer = ksig->ka.sa.sa_restorer;
-               put_user_ex(restorer, &frame->pretcode);
+       /* Set up to return from userspace.  */
+       restorer = current->mm->context.vdso +
+               vdso_image_32.sym___kernel_rt_sigreturn;
+       if (ksig->ka.sa.sa_flags & SA_RESTORER)
+               restorer = ksig->ka.sa.sa_restorer;
+       unsafe_put_user(restorer, &frame->pretcode, Efault);
  
-               /*
-                * This is movl $__NR_rt_sigreturn, %ax ; int $0x80
-                *
-                * WE DO NOT USE IT ANY MORE! It's only left here for historical
-                * reasons and because gdb uses it as a signature to notice
-                * signal handler stack frames.
-                */
-               put_user_ex(*((u64 *)&rt_retcode), (u64 *)frame->retcode);
-       } put_user_catch(err);
+       /*
+        * This is movl $__NR_rt_sigreturn, %ax ; int $0x80
+        *
+        * WE DO NOT USE IT ANY MORE! It's only left here for historical
+        * reasons and because gdb uses it as a signature to notice
+        * signal handler stack frames.
+        */
+       unsafe_put_user(*((u64 *)&rt_retcode), (u64 *)frame->retcode, Efault);
+       unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault);
+       unsafe_put_sigmask(set, frame, Efault);
+       user_access_end();
        
-       err |= copy_siginfo_to_user(&frame->info, &ksig->info);
-       err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
-                               regs, set->sig[0]);
-       err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
-       if (err)
+       if (copy_siginfo_to_user(&frame->info, &ksig->info))
                return -EFAULT;
  
        /* Set up registers for signal handler */
        regs->cs = __USER_CS;
  
        return 0;
+ Efault:
+       user_access_end();
+       return -EFAULT;
  }
  #else /* !CONFIG_X86_32 */
  static unsigned long frame_uc_flags(struct pt_regs *regs)
@@@ -455,43 -442,34 +440,34 @@@ static int __setup_rt_frame(int sig, st
        struct rt_sigframe __user *frame;
        void __user *fp = NULL;
        unsigned long uc_flags;
-       int err = 0;
+       /* x86-64 should always use SA_RESTORER. */
+       if (!(ksig->ka.sa.sa_flags & SA_RESTORER))
+               return -EFAULT;
  
        frame = get_sigframe(&ksig->ka, regs, sizeof(struct rt_sigframe), &fp);
+       uc_flags = frame_uc_flags(regs);
  
-       if (!access_ok(frame, sizeof(*frame)))
+       if (!user_access_begin(frame, sizeof(*frame)))
                return -EFAULT;
  
+       /* Create the ucontext.  */
+       unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault);
+       unsafe_put_user(0, &frame->uc.uc_link, Efault);
+       unsafe_save_altstack(&frame->uc.uc_stack, regs->sp, Efault);
+       /* Set up to return from userspace.  If provided, use a stub
+          already in userspace.  */
+       unsafe_put_user(ksig->ka.sa.sa_restorer, &frame->pretcode, Efault);
+       unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault);
+       unsafe_put_sigmask(set, frame, Efault);
+       user_access_end();
        if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
                if (copy_siginfo_to_user(&frame->info, &ksig->info))
                        return -EFAULT;
        }
  
-       uc_flags = frame_uc_flags(regs);
-       put_user_try {
-               /* Create the ucontext.  */
-               put_user_ex(uc_flags, &frame->uc.uc_flags);
-               put_user_ex(0, &frame->uc.uc_link);
-               save_altstack_ex(&frame->uc.uc_stack, regs->sp);
-               /* Set up to return from userspace.  If provided, use a stub
-                  already in userspace.  */
-               /* x86-64 should always use SA_RESTORER. */
-               if (ksig->ka.sa.sa_flags & SA_RESTORER) {
-                       put_user_ex(ksig->ka.sa.sa_restorer, &frame->pretcode);
-               } else {
-                       /* could use a vstub here */
-                       err |= -EFAULT;
-               }
-       } put_user_catch(err);
-       err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]);
-       err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
-       if (err)
-               return -EFAULT;
        /* Set up registers for signal handler */
        regs->di = sig;
        /* In case the signal handler was declared without prototypes */
                force_valid_ss(regs);
  
        return 0;
+ Efault:
+       user_access_end();
+       return -EFAULT;
  }
  #endif /* CONFIG_X86_32 */
  
@@@ -539,44 -521,33 +519,33 @@@ static int x32_setup_rt_frame(struct ks
        struct rt_sigframe_x32 __user *frame;
        unsigned long uc_flags;
        void __user *restorer;
-       int err = 0;
-       void __user *fpstate = NULL;
-       frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fpstate);
+       void __user *fp = NULL;
  
-       if (!access_ok(frame, sizeof(*frame)))
+       if (!(ksig->ka.sa.sa_flags & SA_RESTORER))
                return -EFAULT;
  
-       if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
-               if (__copy_siginfo_to_user32(&frame->info, &ksig->info, true))
-                       return -EFAULT;
-       }
+       frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fp);
  
        uc_flags = frame_uc_flags(regs);
  
-       put_user_try {
-               /* Create the ucontext.  */
-               put_user_ex(uc_flags, &frame->uc.uc_flags);
-               put_user_ex(0, &frame->uc.uc_link);
-               compat_save_altstack_ex(&frame->uc.uc_stack, regs->sp);
-               put_user_ex(0, &frame->uc.uc__pad0);
-               if (ksig->ka.sa.sa_flags & SA_RESTORER) {
-                       restorer = ksig->ka.sa.sa_restorer;
-               } else {
-                       /* could use a vstub here */
-                       restorer = NULL;
-                       err |= -EFAULT;
-               }
-               put_user_ex(restorer, (unsigned long __user *)&frame->pretcode);
-       } put_user_catch(err);
+       if (!user_access_begin(frame, sizeof(*frame)))
+               return -EFAULT;
  
-       err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
-                               regs, set->sig[0]);
-       err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
+       /* Create the ucontext.  */
+       unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault);
+       unsafe_put_user(0, &frame->uc.uc_link, Efault);
+       unsafe_compat_save_altstack(&frame->uc.uc_stack, regs->sp, Efault);
+       unsafe_put_user(0, &frame->uc.uc__pad0, Efault);
+       restorer = ksig->ka.sa.sa_restorer;
+       unsafe_put_user(restorer, (unsigned long __user *)&frame->pretcode, Efault);
+       unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault);
+       unsafe_put_sigmask(set, frame, Efault);
+       user_access_end();
  
-       if (err)
-               return -EFAULT;
+       if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
+               if (__copy_siginfo_to_user32(&frame->info, &ksig->info, true))
+                       return -EFAULT;
+       }
  
        /* Set up registers for signal handler */
        regs->sp = (unsigned long) frame;
  #endif        /* CONFIG_X86_X32_ABI */
  
        return 0;
+ #ifdef CONFIG_X86_X32_ABI
+ Efault:
+       user_access_end();
+       return -EFAULT;
+ #endif
  }
  
  /*
@@@ -611,9 -587,8 +585,8 @@@ SYSCALL_DEFINE0(sigreturn
  
        if (!access_ok(frame, sizeof(*frame)))
                goto badframe;
-       if (__get_user(set.sig[0], &frame->sc.oldmask) || (_NSIG_WORDS > 1
-               && __copy_from_user(&set.sig[1], &frame->extramask,
-                                   sizeof(frame->extramask))))
+       if (__get_user(set.sig[0], &frame->sc.oldmask) ||
+           __get_user(set.sig[1], &frame->extramask[0]))
                goto badframe;
  
        set_current_blocked(&set);
@@@ -643,7 -618,7 +616,7 @@@ SYSCALL_DEFINE0(rt_sigreturn
        frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
        if (!access_ok(frame, sizeof(*frame)))
                goto badframe;
-       if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
+       if (__get_user(*(__u64 *)&set, (__u64 __user *)&frame->uc.uc_sigmask))
                goto badframe;
        if (__get_user(uc_flags, &frame->uc.uc_flags))
                goto badframe;
@@@ -857,7 -832,7 +830,7 @@@ void signal_fault(struct pt_regs *regs
  }
  
  #ifdef CONFIG_X86_X32_ABI
 -asmlinkage long sys32_x32_rt_sigreturn(void)
 +COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn)
  {
        struct pt_regs *regs = current_pt_regs();
        struct rt_sigframe_x32 __user *frame;
  
        if (!access_ok(frame, sizeof(*frame)))
                goto badframe;
-       if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
+       if (__get_user(set.sig[0], (__u64 __user *)&frame->uc.uc_sigmask))
                goto badframe;
        if (__get_user(uc_flags, &frame->uc.uc_flags))
                goto badframe;
@@@ -147,8 -147,6 +147,8 @@@ static inline void smpboot_restore_warm
        *((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0;
  }
  
 +static void init_freq_invariance(void);
 +
  /*
   * Report back to the Boot Processor during boot time or to the caller processor
   * during CPU online.
@@@ -185,8 -183,6 +185,8 @@@ static void smp_callin(void
         */
        set_cpu_sibling_map(raw_smp_processor_id());
  
 +      init_freq_invariance();
 +
        /*
         * Get our bogomips.
         * Update loops_per_jiffy in cpu_data. Previous call to
@@@ -470,7 -466,7 +470,7 @@@ static bool match_smt(struct cpuinfo_x8
   */
  
  static const struct x86_cpu_id snc_cpu[] = {
 -      { X86_VENDOR_INTEL, 6, INTEL_FAM6_SKYLAKE_X },
 +      X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, NULL),
        {}
  };
  
@@@ -1341,7 -1337,7 +1341,7 @@@ void __init native_smp_prepare_cpus(uns
        set_sched_topology(x86_topology);
  
        set_cpu_sibling_map(0);
 -
 +      init_freq_invariance();
        smp_sanity_check();
  
        switch (apic_intr_mode) {
@@@ -1438,7 -1434,7 +1438,7 @@@ early_param("possible_cpus", _setup_pos
  /*
   * cpu_possible_mask should be static, it cannot change as cpu's
   * are onlined, or offlined. The reason is per-cpu data-structures
-  * are allocated by some modules at init time, and dont expect to
+  * are allocated by some modules at init time, and don't expect to
   * do this dynamically on cpu arrival/departure.
   * cpu_present_mask on the other hand can change dynamically.
   * In case when cpu_hotplug is not compiled, then we resort to current
@@@ -1768,287 -1764,3 +1768,287 @@@ void native_play_dead(void
  }
  
  #endif
 +
 +/*
 + * APERF/MPERF frequency ratio computation.
 + *
 + * The scheduler wants to do frequency invariant accounting and needs a <1
 + * ratio to account for the 'current' frequency, corresponding to
 + * freq_curr / freq_max.
 + *
 + * Since the frequency freq_curr on x86 is controlled by micro-controller and
 + * our P-state setting is little more than a request/hint, we need to observe
 + * the effective frequency 'BusyMHz', i.e. the average frequency over a time
 + * interval after discarding idle time. This is given by:
 + *
 + *   BusyMHz = delta_APERF / delta_MPERF * freq_base
 + *
 + * where freq_base is the max non-turbo P-state.
 + *
 + * The freq_max term has to be set to a somewhat arbitrary value, because we
 + * can't know which turbo states will be available at a given point in time:
 + * it all depends on the thermal headroom of the entire package. We set it to
 + * the turbo level with 4 cores active.
 + *
 + * Benchmarks show that's a good compromise between the 1C turbo ratio
 + * (freq_curr/freq_max would rarely reach 1) and something close to freq_base,
 + * which would ignore the entire turbo range (a conspicuous part, making
 + * freq_curr/freq_max always maxed out).
 + *
 + * An exception to the heuristic above is the Atom uarch, where we choose the
 + * highest turbo level for freq_max since Atoms are generally oriented towards
 + * power efficiency.
 + *
 + * Setting freq_max to anything less than the 1C turbo ratio makes the ratio
 + * freq_curr / freq_max to eventually grow >1, in which case we clip it to 1.
 + */
 +
 +DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key);
 +
 +static DEFINE_PER_CPU(u64, arch_prev_aperf);
 +static DEFINE_PER_CPU(u64, arch_prev_mperf);
 +static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE;
 +static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE;
 +
 +void arch_set_max_freq_ratio(bool turbo_disabled)
 +{
 +      arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE :
 +                                      arch_turbo_freq_ratio;
 +}
 +
 +static bool turbo_disabled(void)
 +{
 +      u64 misc_en;
 +      int err;
 +
 +      err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en);
 +      if (err)
 +              return false;
 +
 +      return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE);
 +}
 +
 +static bool slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
 +{
 +      int err;
 +
 +      err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq);
 +      if (err)
 +              return false;
 +
 +      err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq);
 +      if (err)
 +              return false;
 +
 +      *base_freq = (*base_freq >> 16) & 0x3F;     /* max P state */
 +      *turbo_freq = *turbo_freq & 0x3F;           /* 1C turbo    */
 +
 +      return true;
 +}
 +
 +#include <asm/cpu_device_id.h>
 +#include <asm/intel-family.h>
 +
 +#define ICPU(model) \
 +      {X86_VENDOR_INTEL, 6, model, X86_FEATURE_APERFMPERF, 0}
 +
 +static const struct x86_cpu_id has_knl_turbo_ratio_limits[] = {
 +      ICPU(INTEL_FAM6_XEON_PHI_KNL),
 +      ICPU(INTEL_FAM6_XEON_PHI_KNM),
 +      {}
 +};
 +
 +static const struct x86_cpu_id has_skx_turbo_ratio_limits[] = {
 +      ICPU(INTEL_FAM6_SKYLAKE_X),
 +      {}
 +};
 +
 +static const struct x86_cpu_id has_glm_turbo_ratio_limits[] = {
 +      ICPU(INTEL_FAM6_ATOM_GOLDMONT),
 +      ICPU(INTEL_FAM6_ATOM_GOLDMONT_D),
 +      ICPU(INTEL_FAM6_ATOM_GOLDMONT_PLUS),
 +      {}
 +};
 +
 +static bool knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq,
 +                              int num_delta_fratio)
 +{
 +      int fratio, delta_fratio, found;
 +      int err, i;
 +      u64 msr;
 +
 +      if (!x86_match_cpu(has_knl_turbo_ratio_limits))
 +              return false;
 +
 +      err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
 +      if (err)
 +              return false;
 +
 +      *base_freq = (*base_freq >> 8) & 0xFF;      /* max P state */
 +
 +      err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
 +      if (err)
 +              return false;
 +
 +      fratio = (msr >> 8) & 0xFF;
 +      i = 16;
 +      found = 0;
 +      do {
 +              if (found >= num_delta_fratio) {
 +                      *turbo_freq = fratio;
 +                      return true;
 +              }
 +
 +              delta_fratio = (msr >> (i + 5)) & 0x7;
 +
 +              if (delta_fratio) {
 +                      found += 1;
 +                      fratio -= delta_fratio;
 +              }
 +
 +              i += 8;
 +      } while (i < 64);
 +
 +      return true;
 +}
 +
 +static bool skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size)
 +{
 +      u64 ratios, counts;
 +      u32 group_size;
 +      int err, i;
 +
 +      err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
 +      if (err)
 +              return false;
 +
 +      *base_freq = (*base_freq >> 8) & 0xFF;      /* max P state */
 +
 +      err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios);
 +      if (err)
 +              return false;
 +
 +      err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts);
 +      if (err)
 +              return false;
 +
 +      for (i = 0; i < 64; i += 8) {
 +              group_size = (counts >> i) & 0xFF;
 +              if (group_size >= size) {
 +                      *turbo_freq = (ratios >> i) & 0xFF;
 +                      return true;
 +              }
 +      }
 +
 +      return false;
 +}
 +
 +static bool core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
 +{
 +      int err;
 +
 +      err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
 +      if (err)
 +              return false;
 +
 +      err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, turbo_freq);
 +      if (err)
 +              return false;
 +
 +      *base_freq = (*base_freq >> 8) & 0xFF;      /* max P state */
 +      *turbo_freq = (*turbo_freq >> 24) & 0xFF;   /* 4C turbo    */
 +
 +      return true;
 +}
 +
 +static bool intel_set_max_freq_ratio(void)
 +{
 +      u64 base_freq, turbo_freq;
 +
 +      if (slv_set_max_freq_ratio(&base_freq, &turbo_freq))
 +              goto out;
 +
 +      if (x86_match_cpu(has_glm_turbo_ratio_limits) &&
 +          skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
 +              goto out;
 +
 +      if (knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
 +              goto out;
 +
 +      if (x86_match_cpu(has_skx_turbo_ratio_limits) &&
 +          skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4))
 +              goto out;
 +
 +      if (core_set_max_freq_ratio(&base_freq, &turbo_freq))
 +              goto out;
 +
 +      return false;
 +
 +out:
 +      arch_turbo_freq_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE,
 +                                      base_freq);
 +      arch_set_max_freq_ratio(turbo_disabled());
 +      return true;
 +}
 +
 +static void init_counter_refs(void *arg)
 +{
 +      u64 aperf, mperf;
 +
 +      rdmsrl(MSR_IA32_APERF, aperf);
 +      rdmsrl(MSR_IA32_MPERF, mperf);
 +
 +      this_cpu_write(arch_prev_aperf, aperf);
 +      this_cpu_write(arch_prev_mperf, mperf);
 +}
 +
 +static void init_freq_invariance(void)
 +{
 +      bool ret = false;
 +
 +      if (smp_processor_id() != 0 || !boot_cpu_has(X86_FEATURE_APERFMPERF))
 +              return;
 +
 +      if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
 +              ret = intel_set_max_freq_ratio();
 +
 +      if (ret) {
 +              on_each_cpu(init_counter_refs, NULL, 1);
 +              static_branch_enable(&arch_scale_freq_key);
 +      } else {
 +              pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n");
 +      }
 +}
 +
 +DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
 +
 +void arch_scale_freq_tick(void)
 +{
 +      u64 freq_scale;
 +      u64 aperf, mperf;
 +      u64 acnt, mcnt;
 +
 +      if (!arch_scale_freq_invariant())
 +              return;
 +
 +      rdmsrl(MSR_IA32_APERF, aperf);
 +      rdmsrl(MSR_IA32_MPERF, mperf);
 +
 +      acnt = aperf - this_cpu_read(arch_prev_aperf);
 +      mcnt = mperf - this_cpu_read(arch_prev_mperf);
 +      if (!mcnt)
 +              return;
 +
 +      this_cpu_write(arch_prev_aperf, aperf);
 +      this_cpu_write(arch_prev_mperf, mperf);
 +
 +      acnt <<= 2*SCHED_CAPACITY_SHIFT;
 +      mcnt *= arch_max_freq_ratio;
 +
 +      freq_scale = div64_u64(acnt, mcnt);
 +
 +      if (freq_scale > SCHED_CAPACITY_SCALE)
 +              freq_scale = SCHED_CAPACITY_SCALE;
 +
 +      this_cpu_write(arch_freq_scale, freq_scale);
 +}
diff --combined arch/x86/kernel/time.c
@@@ -62,19 -62,16 +62,16 @@@ static irqreturn_t timer_interrupt(int 
        return IRQ_HANDLED;
  }
  
- static struct irqaction irq0  = {
-       .handler = timer_interrupt,
-       .flags = IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER,
-       .name = "timer"
- };
  static void __init setup_default_timer_irq(void)
  {
+       unsigned long flags = IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER;
        /*
-        * Unconditionally register the legacy timer; even without legacy
-        * PIC/PIT we need this for the HPET0 in legacy replacement mode.
+        * Unconditionally register the legacy timer interrupt; even
+        * without legacy PIC/PIT we need this for the HPET0 in legacy
+        * replacement mode.
         */
-       if (setup_irq(0, &irq0))
+       if (request_irq(0, timer_interrupt, flags, "timer", NULL))
                pr_info("Failed to register legacy timer interrupt\n");
  }
  
@@@ -122,12 -119,18 +119,12 @@@ void __init time_init(void
   */
  void clocksource_arch_init(struct clocksource *cs)
  {
 -      if (cs->archdata.vclock_mode == VCLOCK_NONE)
 +      if (cs->vdso_clock_mode == VDSO_CLOCKMODE_NONE)
                return;
  
 -      if (cs->archdata.vclock_mode > VCLOCK_MAX) {
 -              pr_warn("clocksource %s registered with invalid vclock_mode %d. Disabling vclock.\n",
 -                      cs->name, cs->archdata.vclock_mode);
 -              cs->archdata.vclock_mode = VCLOCK_NONE;
 -      }
 -
        if (cs->mask != CLOCKSOURCE_MASK(64)) {
 -              pr_warn("clocksource %s registered with invalid mask %016llx. Disabling vclock.\n",
 +              pr_warn("clocksource %s registered with invalid mask %016llx for VDSO. Disabling VDSO support.\n",
                        cs->name, cs->mask);
 -              cs->archdata.vclock_mode = VCLOCK_NONE;
 +              cs->vdso_clock_mode = VDSO_CLOCKMODE_NONE;
        }
  }
diff --combined arch/x86/kernel/tsc.c
@@@ -477,7 -477,7 +477,7 @@@ static unsigned long pit_calibrate_tsc(
   * transition from one expected value to another with a fairly
   * high accuracy, and we didn't miss any events. We can thus
   * use the TSC value at the transitions to calculate a pretty
-  * good value for the TSC frequencty.
+  * good value for the TSC frequency.
   */
  static inline int pit_verify_msb(unsigned char val)
  {
@@@ -1108,24 -1108,17 +1108,24 @@@ static void tsc_cs_tick_stable(struct c
                sched_clock_tick_stable();
  }
  
 +static int tsc_cs_enable(struct clocksource *cs)
 +{
 +      vclocks_set_used(VDSO_CLOCKMODE_TSC);
 +      return 0;
 +}
 +
  /*
   * .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc()
   */
  static struct clocksource clocksource_tsc_early = {
 -      .name                   = "tsc-early",
 -      .rating                 = 299,
 -      .read                   = read_tsc,
 -      .mask                   = CLOCKSOURCE_MASK(64),
 -      .flags                  = CLOCK_SOURCE_IS_CONTINUOUS |
 +      .name                   = "tsc-early",
 +      .rating                 = 299,
 +      .read                   = read_tsc,
 +      .mask                   = CLOCKSOURCE_MASK(64),
 +      .flags                  = CLOCK_SOURCE_IS_CONTINUOUS |
                                  CLOCK_SOURCE_MUST_VERIFY,
 -      .archdata               = { .vclock_mode = VCLOCK_TSC },
 +      .vdso_clock_mode        = VDSO_CLOCKMODE_TSC,
 +      .enable                 = tsc_cs_enable,
        .resume                 = tsc_resume,
        .mark_unstable          = tsc_cs_mark_unstable,
        .tick_stable            = tsc_cs_tick_stable,
   * been found good.
   */
  static struct clocksource clocksource_tsc = {
 -      .name                   = "tsc",
 -      .rating                 = 300,
 -      .read                   = read_tsc,
 -      .mask                   = CLOCKSOURCE_MASK(64),
 -      .flags                  = CLOCK_SOURCE_IS_CONTINUOUS |
 +      .name                   = "tsc",
 +      .rating                 = 300,
 +      .read                   = read_tsc,
 +      .mask                   = CLOCKSOURCE_MASK(64),
 +      .flags                  = CLOCK_SOURCE_IS_CONTINUOUS |
                                  CLOCK_SOURCE_VALID_FOR_HRES |
                                  CLOCK_SOURCE_MUST_VERIFY,
 -      .archdata               = { .vclock_mode = VCLOCK_TSC },
 +      .vdso_clock_mode        = VDSO_CLOCKMODE_TSC,
 +      .enable                 = tsc_cs_enable,
        .resume                 = tsc_resume,
        .mark_unstable          = tsc_cs_mark_unstable,
        .tick_stable            = tsc_cs_tick_stable,
@@@ -33,7 -33,7 +33,7 @@@
        #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
        #define PT_HAVE_ACCESSED_DIRTY(mmu) true
        #ifdef CONFIG_X86_64
 -      #define PT_MAX_FULL_LEVELS 4
 +      #define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL
        #define CMPXCHG cmpxchg
        #else
        #define CMPXCHG cmpxchg64
@@@ -400,7 -400,7 +400,7 @@@ retry_walk
                        goto error;
  
                ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
-               if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte))))
+               if (unlikely(__get_user(pte, ptep_user)))
                        goto error;
                walker->ptep_user[walker->level - 1] = ptep_user;
  
diff --combined arch/x86/kvm/vmx/vmx.c
@@@ -31,7 -31,6 +31,7 @@@
  #include <asm/apic.h>
  #include <asm/asm.h>
  #include <asm/cpu.h>
 +#include <asm/cpu_device_id.h>
  #include <asm/debugreg.h>
  #include <asm/desc.h>
  #include <asm/fpu/internal.h>
@@@ -42,6 -41,7 +42,7 @@@
  #include <asm/mce.h>
  #include <asm/mmu_context.h>
  #include <asm/mshyperv.h>
+ #include <asm/mwait.h>
  #include <asm/spec-ctrl.h>
  #include <asm/virtext.h>
  #include <asm/vmx.h>
  MODULE_AUTHOR("Qumranet");
  MODULE_LICENSE("GPL");
  
 +#ifdef MODULE
  static const struct x86_cpu_id vmx_cpu_id[] = {
 -      X86_FEATURE_MATCH(X86_FEATURE_VMX),
 +      X86_MATCH_FEATURE(X86_FEATURE_VMX, NULL),
        {}
  };
  MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
 +#endif
  
  bool __read_mostly enable_vpid = 1;
  module_param_named(vpid, enable_vpid, bool, 0444);
@@@ -98,7 -96,7 +99,7 @@@ module_param(emulate_invalid_guest_stat
  static bool __read_mostly fasteoi = 1;
  module_param(fasteoi, bool, S_IRUGO);
  
 -static bool __read_mostly enable_apicv = 1;
 +bool __read_mostly enable_apicv = 1;
  module_param(enable_apicv, bool, S_IRUGO);
  
  /*
@@@ -1178,10 -1176,6 +1179,10 @@@ void vmx_prepare_switch_to_guest(struc
                                           vmx->guest_msrs[i].mask);
  
        }
 +
 +      if (vmx->nested.need_vmcs12_to_shadow_sync)
 +              nested_sync_vmcs12_to_shadow(vcpu);
 +
        if (vmx->guest_state_loaded)
                return;
  
@@@ -1606,40 -1600,6 +1607,40 @@@ static int skip_emulated_instruction(st
        return 1;
  }
  
 +
 +/*
 + * Recognizes a pending MTF VM-exit and records the nested state for later
 + * delivery.
 + */
 +static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
 +{
 +      struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 +      struct vcpu_vmx *vmx = to_vmx(vcpu);
 +
 +      if (!is_guest_mode(vcpu))
 +              return;
 +
 +      /*
 +       * Per the SDM, MTF takes priority over debug-trap exceptions besides
 +       * T-bit traps. As instruction emulation is completed (i.e. at the
 +       * instruction boundary), any #DB exception pending delivery must be a
 +       * debug-trap. Record the pending MTF state to be delivered in
 +       * vmx_check_nested_events().
 +       */
 +      if (nested_cpu_has_mtf(vmcs12) &&
 +          (!vcpu->arch.exception.pending ||
 +           vcpu->arch.exception.nr == DB_VECTOR))
 +              vmx->nested.mtf_pending = true;
 +      else
 +              vmx->nested.mtf_pending = false;
 +}
 +
 +static int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu)
 +{
 +      vmx_update_emulated_instruction(vcpu);
 +      return skip_emulated_instruction(vcpu);
 +}
 +
  static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
  {
        /*
@@@ -2339,17 -2299,6 +2340,17 @@@ static void hardware_disable(void
        kvm_cpu_vmxoff();
  }
  
 +/*
 + * There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID
 + * directly instead of going through cpu_has(), to ensure KVM is trapping
 + * ENCLS whenever it's supported in hardware.  It does not matter whether
 + * the host OS supports or has enabled SGX.
 + */
 +static bool cpu_has_sgx(void)
 +{
 +      return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0));
 +}
 +
  static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
                                      u32 msr, u32 *result)
  {
@@@ -2430,9 -2379,8 +2431,9 @@@ static __init int setup_vmcs_config(str
                        SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
                        SECONDARY_EXEC_PT_USE_GPA |
                        SECONDARY_EXEC_PT_CONCEAL_VMX |
 -                      SECONDARY_EXEC_ENABLE_VMFUNC |
 -                      SECONDARY_EXEC_ENCLS_EXITING;
 +                      SECONDARY_EXEC_ENABLE_VMFUNC;
 +              if (cpu_has_sgx())
 +                      opt2 |= SECONDARY_EXEC_ENCLS_EXITING;
                if (adjust_vmx_controls(min2, opt2,
                                        MSR_IA32_VMX_PROCBASED_CTLS2,
                                        &_cpu_based_2nd_exec_control) < 0)
@@@ -3000,9 -2948,6 +3001,9 @@@ void vmx_set_cr0(struct kvm_vcpu *vcpu
  
  static int get_ept_level(struct kvm_vcpu *vcpu)
  {
 +      /* Nested EPT currently only supports 4-level walks. */
 +      if (is_guest_mode(vcpu) && nested_cpu_has_ept(get_vmcs12(vcpu)))
 +              return 4;
        if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48))
                return 5;
        return 4;
@@@ -3871,29 -3816,24 +3872,29 @@@ static int vmx_deliver_nested_posted_in
   * 2. If target vcpu isn't running(root mode), kick it to pick up the
   * interrupt from PIR in next vmentry.
   */
 -static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
 +static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
  {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        int r;
  
        r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
        if (!r)
 -              return;
 +              return 0;
 +
 +      if (!vcpu->arch.apicv_active)
 +              return -1;
  
        if (pi_test_and_set_pir(vector, &vmx->pi_desc))
 -              return;
 +              return 0;
  
        /* If a previous notification has sent the IPI, nothing to do.  */
        if (pi_test_and_set_on(&vmx->pi_desc))
 -              return;
 +              return 0;
  
        if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false))
                kvm_vcpu_kick(vcpu);
 +
 +      return 0;
  }
  
  /*
@@@ -4299,6 -4239,7 +4300,6 @@@ static void vmx_vcpu_reset(struct kvm_v
  
        vmx->msr_ia32_umwait_control = 0;
  
 -      vcpu->arch.microcode_version = 0x100000000ULL;
        vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
        vmx->hv_deadline_tsc = -1;
        kvm_set_cr8(vcpu, 0);
@@@ -6288,7 -6229,7 +6289,7 @@@ static void handle_external_interrupt_i
  #endif
                ASM_CALL_CONSTRAINT
                :
 -              THUNK_TARGET(entry),
 +              [thunk_target]"r"(entry),
                [ss]"i"(__KERNEL_DS),
                [cs]"i"(__KERNEL_CS)
        );
@@@ -6540,11 -6481,8 +6541,11 @@@ static void vmx_vcpu_run(struct kvm_vcp
                vmcs_write32(PLE_WINDOW, vmx->ple_window);
        }
  
 -      if (vmx->nested.need_vmcs12_to_shadow_sync)
 -              nested_sync_vmcs12_to_shadow(vcpu);
 +      /*
 +       * We did this in prepare_switch_to_guest, because it needs to
 +       * be within srcu_read_lock.
 +       */
 +      WARN_ON_ONCE(vmx->nested.need_vmcs12_to_shadow_sync);
  
        if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP))
                vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
@@@ -6818,14 -6756,14 +6819,14 @@@ static int vmx_create_vcpu(struct kvm_v
  
        if (nested)
                nested_vmx_setup_ctls_msrs(&vmx->nested.msrs,
 -                                         vmx_capability.ept,
 -                                         kvm_vcpu_apicv_active(vcpu));
 +                                         vmx_capability.ept);
        else
                memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs));
  
        vmx->nested.posted_intr_nv = -1;
        vmx->nested.current_vmptr = -1ull;
  
 +      vcpu->arch.microcode_version = 0x100000000ULL;
        vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED;
  
        /*
@@@ -6899,7 -6837,8 +6900,7 @@@ static int __init vmx_check_processor_c
        if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0)
                return -EIO;
        if (nested)
 -              nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, vmx_cap.ept,
 -                                         enable_apicv);
 +              nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, vmx_cap.ept);
        if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
                printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
                                smp_processor_id());
@@@ -7160,40 -7099,6 +7161,40 @@@ static void vmx_request_immediate_exit(
        to_vmx(vcpu)->req_immediate_exit = true;
  }
  
 +static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
 +                                struct x86_instruction_info *info)
 +{
 +      struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 +      unsigned short port;
 +      bool intercept;
 +      int size;
 +
 +      if (info->intercept == x86_intercept_in ||
 +          info->intercept == x86_intercept_ins) {
 +              port = info->src_val;
 +              size = info->dst_bytes;
 +      } else {
 +              port = info->dst_val;
 +              size = info->src_bytes;
 +      }
 +
 +      /*
 +       * If the 'use IO bitmaps' VM-execution control is 0, IO instruction
 +       * VM-exits depend on the 'unconditional IO exiting' VM-execution
 +       * control.
 +       *
 +       * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps.
 +       */
 +      if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
 +              intercept = nested_cpu_has(vmcs12,
 +                                         CPU_BASED_UNCOND_IO_EXITING);
 +      else
 +              intercept = nested_vmx_check_io_bitmaps(vcpu, port, size);
 +
 +      /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED.  */
 +      return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
 +}
 +
  static int vmx_check_intercept(struct kvm_vcpu *vcpu,
                               struct x86_instruction_info *info,
                               enum x86_intercept_stage stage)
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
        struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
  
 +      switch (info->intercept) {
        /*
         * RDPID causes #UD if disabled through secondary execution controls.
         * Because it is marked as EmulateOnUD, we need to intercept it here.
         */
 -      if (info->intercept == x86_intercept_rdtscp &&
 -          !nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) {
 -              ctxt->exception.vector = UD_VECTOR;
 -              ctxt->exception.error_code_valid = false;
 -              return X86EMUL_PROPAGATE_FAULT;
 -      }
 +      case x86_intercept_rdtscp:
 +              if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) {
 +                      ctxt->exception.vector = UD_VECTOR;
 +                      ctxt->exception.error_code_valid = false;
 +                      return X86EMUL_PROPAGATE_FAULT;
 +              }
 +              break;
 +
 +      case x86_intercept_in:
 +      case x86_intercept_ins:
 +      case x86_intercept_out:
 +      case x86_intercept_outs:
 +              return vmx_check_intercept_io(vcpu, info);
 +
 +      case x86_intercept_lgdt:
 +      case x86_intercept_lidt:
 +      case x86_intercept_lldt:
 +      case x86_intercept_ltr:
 +      case x86_intercept_sgdt:
 +      case x86_intercept_sidt:
 +      case x86_intercept_sldt:
 +      case x86_intercept_str:
 +              if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC))
 +                      return X86EMUL_CONTINUE;
 +
 +              /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED.  */
 +              break;
  
        /* TODO: check more intercepts... */
 -      return X86EMUL_CONTINUE;
 +      default:
 +              break;
 +      }
 +
 +      return X86EMUL_UNHANDLEABLE;
  }
  
  #ifdef CONFIG_X86_64
@@@ -7821,7 -7700,7 +7822,7 @@@ static __init int hardware_setup(void
  
        if (nested) {
                nested_vmx_setup_ctls_msrs(&vmcs_config.nested,
 -                                         vmx_capability.ept, enable_apicv);
 +                                         vmx_capability.ept);
  
                r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers);
                if (r)
@@@ -7905,8 -7784,7 +7906,8 @@@ static struct kvm_x86_ops vmx_x86_ops _
  
        .run = vmx_vcpu_run,
        .handle_exit = vmx_handle_exit,
 -      .skip_emulated_instruction = skip_emulated_instruction,
 +      .skip_emulated_instruction = vmx_skip_emulated_instruction,
 +      .update_emulated_instruction = vmx_update_emulated_instruction,
        .set_interrupt_shadow = vmx_set_interrupt_shadow,
        .get_interrupt_shadow = vmx_get_interrupt_shadow,
        .patch_hypercall = vmx_patch_hypercall,
diff --combined arch/x86/kvm/vmx/vmx.h
@@@ -14,8 -14,6 +14,6 @@@
  extern const u32 vmx_msr_index[];
  extern u64 host_efer;
  
- extern u32 get_umwait_control_msr(void);
  #define MSR_TYPE_R    1
  #define MSR_TYPE_W    2
  #define MSR_TYPE_RW   3
@@@ -150,9 -148,6 +148,9 @@@ struct nested_vmx 
        /* L2 must run next, and mustn't decide to exit to L1. */
        bool nested_run_pending;
  
 +      /* Pending MTF VM-exit into L1.  */
 +      bool mtf_pending;
 +
        struct loaded_vmcs vmcs02;
  
        /*
diff --combined include/linux/compat.h
@@@ -248,6 -248,15 +248,6 @@@ typedef struct compat_siginfo 
        } _sifields;
  } compat_siginfo_t;
  
 -/*
 - * These functions operate on 32- or 64-bit specs depending on
 - * COMPAT_USE_64BIT_TIME, hence the void user pointer arguments.
 - */
 -extern int compat_get_timespec(struct timespec *, const void __user *);
 -extern int compat_put_timespec(const struct timespec *, void __user *);
 -extern int compat_get_timeval(struct timeval *, const void __user *);
 -extern int compat_put_timeval(const struct timeval *, void __user *);
 -
  struct compat_iovec {
        compat_uptr_t   iov_base;
        compat_size_t   iov_len;
@@@ -407,6 -416,26 +407,6 @@@ int copy_siginfo_to_user32(struct compa
  int get_compat_sigevent(struct sigevent *event,
                const struct compat_sigevent __user *u_event);
  
 -static inline int old_timeval32_compare(struct old_timeval32 *lhs,
 -                                      struct old_timeval32 *rhs)
 -{
 -      if (lhs->tv_sec < rhs->tv_sec)
 -              return -1;
 -      if (lhs->tv_sec > rhs->tv_sec)
 -              return 1;
 -      return lhs->tv_usec - rhs->tv_usec;
 -}
 -
 -static inline int old_timespec32_compare(struct old_timespec32 *lhs,
 -                                      struct old_timespec32 *rhs)
 -{
 -      if (lhs->tv_sec < rhs->tv_sec)
 -              return -1;
 -      if (lhs->tv_sec > rhs->tv_sec)
 -              return 1;
 -      return lhs->tv_nsec - rhs->tv_nsec;
 -}
 -
  extern int get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat);
  
  /*
@@@ -454,12 -483,13 +454,13 @@@ extern void __user *compat_alloc_user_s
  
  int compat_restore_altstack(const compat_stack_t __user *uss);
  int __compat_save_altstack(compat_stack_t __user *, unsigned long);
- #define compat_save_altstack_ex(uss, sp) do { \
+ #define unsafe_compat_save_altstack(uss, sp, label) do { \
        compat_stack_t __user *__uss = uss; \
        struct task_struct *t = current; \
-       put_user_ex(ptr_to_compat((void __user *)t->sas_ss_sp), &__uss->ss_sp); \
-       put_user_ex(t->sas_ss_flags, &__uss->ss_flags); \
-       put_user_ex(t->sas_ss_size, &__uss->ss_size); \
+       unsafe_put_user(ptr_to_compat((void __user *)t->sas_ss_sp), \
+                       &__uss->ss_sp, label); \
+       unsafe_put_user(t->sas_ss_flags, &__uss->ss_flags, label); \
+       unsafe_put_user(t->sas_ss_size, &__uss->ss_size, label); \
        if (t->sas_ss_flags & SS_AUTODISARM) \
                sas_ss_reset(t); \
  } while (0);
diff --combined include/linux/efi.h
@@@ -56,6 -56,19 +56,6 @@@ typedef void *efi_handle_t
  #define __efiapi
  #endif
  
 -#define efi_get_handle_at(array, idx)                                 \
 -      (efi_is_native() ? (array)[idx]                                 \
 -              : (efi_handle_t)(unsigned long)((u32 *)(array))[idx])
 -
 -#define efi_get_handle_num(size)                                      \
 -      ((size) / (efi_is_native() ? sizeof(efi_handle_t) : sizeof(u32)))
 -
 -#define for_each_efi_handle(handle, array, size, i)                   \
 -      for (i = 0;                                                     \
 -           i < efi_get_handle_num(size) &&                            \
 -              ((handle = efi_get_handle_at((array), i)) || true);     \
 -           i++)
 -
  /*
   * The UEFI spec and EDK2 reference implementation both define EFI_GUID as
   * struct { u32 a; u16; b; u16 c; u8 d[8]; }; and so the implied alignment
@@@ -144,6 -157,15 +144,6 @@@ typedef struct 
        u32 imagesize;
  } efi_capsule_header_t;
  
 -struct efi_boot_memmap {
 -      efi_memory_desc_t       **map;
 -      unsigned long           *map_size;
 -      unsigned long           *desc_size;
 -      u32                     *desc_ver;
 -      unsigned long           *key_ptr;
 -      unsigned long           *buff_size;
 -};
 -
  /*
   * EFI capsule flags
   */
@@@ -165,6 -187,14 +165,6 @@@ struct capsule_info 
  
  int __efi_capsule_setup_info(struct capsule_info *cap_info);
  
 -/*
 - * Allocation types for calls to boottime->allocate_pages.
 - */
 -#define EFI_ALLOCATE_ANY_PAGES                0
 -#define EFI_ALLOCATE_MAX_ADDRESS      1
 -#define EFI_ALLOCATE_ADDRESS          2
 -#define EFI_MAX_ALLOCATE_TYPE         3
 -
  typedef int (*efi_freemem_callback_t) (u64 start, u64 end, void *arg);
  
  /*
@@@ -194,7 -224,291 +194,7 @@@ typedef struct 
        u8 sets_to_zero;
  } efi_time_cap_t;
  
 -typedef struct {
 -      efi_table_hdr_t hdr;
 -      u32 raise_tpl;
 -      u32 restore_tpl;
 -      u32 allocate_pages;
 -      u32 free_pages;
 -      u32 get_memory_map;
 -      u32 allocate_pool;
 -      u32 free_pool;
 -      u32 create_event;
 -      u32 set_timer;
 -      u32 wait_for_event;
 -      u32 signal_event;
 -      u32 close_event;
 -      u32 check_event;
 -      u32 install_protocol_interface;
 -      u32 reinstall_protocol_interface;
 -      u32 uninstall_protocol_interface;
 -      u32 handle_protocol;
 -      u32 __reserved;
 -      u32 register_protocol_notify;
 -      u32 locate_handle;
 -      u32 locate_device_path;
 -      u32 install_configuration_table;
 -      u32 load_image;
 -      u32 start_image;
 -      u32 exit;
 -      u32 unload_image;
 -      u32 exit_boot_services;
 -      u32 get_next_monotonic_count;
 -      u32 stall;
 -      u32 set_watchdog_timer;
 -      u32 connect_controller;
 -      u32 disconnect_controller;
 -      u32 open_protocol;
 -      u32 close_protocol;
 -      u32 open_protocol_information;
 -      u32 protocols_per_handle;
 -      u32 locate_handle_buffer;
 -      u32 locate_protocol;
 -      u32 install_multiple_protocol_interfaces;
 -      u32 uninstall_multiple_protocol_interfaces;
 -      u32 calculate_crc32;
 -      u32 copy_mem;
 -      u32 set_mem;
 -      u32 create_event_ex;
 -} __packed efi_boot_services_32_t;
 -
 -/*
 - * EFI Boot Services table
 - */
 -typedef union {
 -      struct {
 -              efi_table_hdr_t hdr;
 -              void *raise_tpl;
 -              void *restore_tpl;
 -              efi_status_t (__efiapi *allocate_pages)(int, int, unsigned long,
 -                                                      efi_physical_addr_t *);
 -              efi_status_t (__efiapi *free_pages)(efi_physical_addr_t,
 -                                                  unsigned long);
 -              efi_status_t (__efiapi *get_memory_map)(unsigned long *, void *,
 -                                                      unsigned long *,
 -                                                      unsigned long *, u32 *);
 -              efi_status_t (__efiapi *allocate_pool)(int, unsigned long,
 -                                                     void **);
 -              efi_status_t (__efiapi *free_pool)(void *);
 -              void *create_event;
 -              void *set_timer;
 -              void *wait_for_event;
 -              void *signal_event;
 -              void *close_event;
 -              void *check_event;
 -              void *install_protocol_interface;
 -              void *reinstall_protocol_interface;
 -              void *uninstall_protocol_interface;
 -              efi_status_t (__efiapi *handle_protocol)(efi_handle_t,
 -                                                       efi_guid_t *, void **);
 -              void *__reserved;
 -              void *register_protocol_notify;
 -              efi_status_t (__efiapi *locate_handle)(int, efi_guid_t *,
 -                                                     void *, unsigned long *,
 -                                                     efi_handle_t *);
 -              void *locate_device_path;
 -              efi_status_t (__efiapi *install_configuration_table)(efi_guid_t *,
 -                                                                   void *);
 -              void *load_image;
 -              void *start_image;
 -              void *exit;
 -              void *unload_image;
 -              efi_status_t (__efiapi *exit_boot_services)(efi_handle_t,
 -                                                          unsigned long);
 -              void *get_next_monotonic_count;
 -              void *stall;
 -              void *set_watchdog_timer;
 -              void *connect_controller;
 -              efi_status_t (__efiapi *disconnect_controller)(efi_handle_t,
 -                                                             efi_handle_t,
 -                                                             efi_handle_t);
 -              void *open_protocol;
 -              void *close_protocol;
 -              void *open_protocol_information;
 -              void *protocols_per_handle;
 -              void *locate_handle_buffer;
 -              efi_status_t (__efiapi *locate_protocol)(efi_guid_t *, void *,
 -                                                       void **);
 -              void *install_multiple_protocol_interfaces;
 -              void *uninstall_multiple_protocol_interfaces;
 -              void *calculate_crc32;
 -              void *copy_mem;
 -              void *set_mem;
 -              void *create_event_ex;
 -      };
 -      efi_boot_services_32_t mixed_mode;
 -} efi_boot_services_t;
 -
 -typedef enum {
 -      EfiPciIoWidthUint8,
 -      EfiPciIoWidthUint16,
 -      EfiPciIoWidthUint32,
 -      EfiPciIoWidthUint64,
 -      EfiPciIoWidthFifoUint8,
 -      EfiPciIoWidthFifoUint16,
 -      EfiPciIoWidthFifoUint32,
 -      EfiPciIoWidthFifoUint64,
 -      EfiPciIoWidthFillUint8,
 -      EfiPciIoWidthFillUint16,
 -      EfiPciIoWidthFillUint32,
 -      EfiPciIoWidthFillUint64,
 -      EfiPciIoWidthMaximum
 -} EFI_PCI_IO_PROTOCOL_WIDTH;
 -
 -typedef enum {
 -      EfiPciIoAttributeOperationGet,
 -      EfiPciIoAttributeOperationSet,
 -      EfiPciIoAttributeOperationEnable,
 -      EfiPciIoAttributeOperationDisable,
 -      EfiPciIoAttributeOperationSupported,
 -    EfiPciIoAttributeOperationMaximum
 -} EFI_PCI_IO_PROTOCOL_ATTRIBUTE_OPERATION;
 -
 -typedef struct {
 -      u32 read;
 -      u32 write;
 -} efi_pci_io_protocol_access_32_t;
 -
 -typedef union efi_pci_io_protocol efi_pci_io_protocol_t;
 -
 -typedef
 -efi_status_t (__efiapi *efi_pci_io_protocol_cfg_t)(efi_pci_io_protocol_t *,
 -                                                 EFI_PCI_IO_PROTOCOL_WIDTH,
 -                                                 u32 offset,
 -                                                 unsigned long count,
 -                                                 void *buffer);
 -
 -typedef struct {
 -      void *read;
 -      void *write;
 -} efi_pci_io_protocol_access_t;
 -
 -typedef struct {
 -      efi_pci_io_protocol_cfg_t read;
 -      efi_pci_io_protocol_cfg_t write;
 -} efi_pci_io_protocol_config_access_t;
 -
 -union efi_pci_io_protocol {
 -      struct {
 -              void *poll_mem;
 -              void *poll_io;
 -              efi_pci_io_protocol_access_t mem;
 -              efi_pci_io_protocol_access_t io;
 -              efi_pci_io_protocol_config_access_t pci;
 -              void *copy_mem;
 -              void *map;
 -              void *unmap;
 -              void *allocate_buffer;
 -              void *free_buffer;
 -              void *flush;
 -              efi_status_t (__efiapi *get_location)(efi_pci_io_protocol_t *,
 -                                                    unsigned long *segment_nr,
 -                                                    unsigned long *bus_nr,
 -                                                    unsigned long *device_nr,
 -                                                    unsigned long *func_nr);
 -              void *attributes;
 -              void *get_bar_attributes;
 -              void *set_bar_attributes;
 -              uint64_t romsize;
 -              void *romimage;
 -      };
 -      struct {
 -              u32 poll_mem;
 -              u32 poll_io;
 -              efi_pci_io_protocol_access_32_t mem;
 -              efi_pci_io_protocol_access_32_t io;
 -              efi_pci_io_protocol_access_32_t pci;
 -              u32 copy_mem;
 -              u32 map;
 -              u32 unmap;
 -              u32 allocate_buffer;
 -              u32 free_buffer;
 -              u32 flush;
 -              u32 get_location;
 -              u32 attributes;
 -              u32 get_bar_attributes;
 -              u32 set_bar_attributes;
 -              u64 romsize;
 -              u32 romimage;
 -      } mixed_mode;
 -};
 -
 -#define EFI_PCI_IO_ATTRIBUTE_ISA_MOTHERBOARD_IO 0x0001
 -#define EFI_PCI_IO_ATTRIBUTE_ISA_IO 0x0002
 -#define EFI_PCI_IO_ATTRIBUTE_VGA_PALETTE_IO 0x0004
 -#define EFI_PCI_IO_ATTRIBUTE_VGA_MEMORY 0x0008
 -#define EFI_PCI_IO_ATTRIBUTE_VGA_IO 0x0010
 -#define EFI_PCI_IO_ATTRIBUTE_IDE_PRIMARY_IO 0x0020
 -#define EFI_PCI_IO_ATTRIBUTE_IDE_SECONDARY_IO 0x0040
 -#define EFI_PCI_IO_ATTRIBUTE_MEMORY_WRITE_COMBINE 0x0080
 -#define EFI_PCI_IO_ATTRIBUTE_IO 0x0100
 -#define EFI_PCI_IO_ATTRIBUTE_MEMORY 0x0200
 -#define EFI_PCI_IO_ATTRIBUTE_BUS_MASTER 0x0400
 -#define EFI_PCI_IO_ATTRIBUTE_MEMORY_CACHED 0x0800
 -#define EFI_PCI_IO_ATTRIBUTE_MEMORY_DISABLE 0x1000
 -#define EFI_PCI_IO_ATTRIBUTE_EMBEDDED_DEVICE 0x2000
 -#define EFI_PCI_IO_ATTRIBUTE_EMBEDDED_ROM 0x4000
 -#define EFI_PCI_IO_ATTRIBUTE_DUAL_ADDRESS_CYCLE 0x8000
 -#define EFI_PCI_IO_ATTRIBUTE_ISA_IO_16 0x10000
 -#define EFI_PCI_IO_ATTRIBUTE_VGA_PALETTE_IO_16 0x20000
 -#define EFI_PCI_IO_ATTRIBUTE_VGA_IO_16 0x40000
 -
 -struct efi_dev_path;
 -
 -typedef union apple_properties_protocol apple_properties_protocol_t;
 -
 -union apple_properties_protocol {
 -      struct {
 -              unsigned long version;
 -              efi_status_t (__efiapi *get)(apple_properties_protocol_t *,
 -                                           struct efi_dev_path *,
 -                                           efi_char16_t *, void *, u32 *);
 -              efi_status_t (__efiapi *set)(apple_properties_protocol_t *,
 -                                           struct efi_dev_path *,
 -                                           efi_char16_t *, void *, u32);
 -              efi_status_t (__efiapi *del)(apple_properties_protocol_t *,
 -                                           struct efi_dev_path *,
 -                                           efi_char16_t *);
 -              efi_status_t (__efiapi *get_all)(apple_properties_protocol_t *,
 -                                               void *buffer, u32 *);
 -      };
 -      struct {
 -              u32 version;
 -              u32 get;
 -              u32 set;
 -              u32 del;
 -              u32 get_all;
 -      } mixed_mode;
 -};
 -
 -typedef u32 efi_tcg2_event_log_format;
 -
 -typedef union efi_tcg2_protocol efi_tcg2_protocol_t;
 -
 -union efi_tcg2_protocol {
 -      struct {
 -              void *get_capability;
 -              efi_status_t (__efiapi *get_event_log)(efi_handle_t,
 -                                                     efi_tcg2_event_log_format,
 -                                                     efi_physical_addr_t *,
 -                                                     efi_physical_addr_t *,
 -                                                     efi_bool_t *);
 -              void *hash_log_extend_event;
 -              void *submit_command;
 -              void *get_active_pcr_banks;
 -              void *set_active_pcr_banks;
 -              void *get_result_of_set_active_pcr_banks;
 -      };
 -      struct {
 -              u32 get_capability;
 -              u32 get_event_log;
 -              u32 hash_log_extend_event;
 -              u32 submit_command;
 -              u32 get_active_pcr_banks;
 -              u32 set_active_pcr_banks;
 -              u32 get_result_of_set_active_pcr_banks;
 -      } mixed_mode;
 -};
 +typedef union efi_boot_services efi_boot_services_t;
  
  /*
   * Types and defines for EFI ResetSystem
@@@ -332,9 -646,6 +332,9 @@@ void efi_native_runtime_setup(void)
  #define EFI_CONSOLE_OUT_DEVICE_GUID           EFI_GUID(0xd3b36f2c, 0xd551, 0x11d4,  0x9a, 0x46, 0x00, 0x90, 0x27, 0x3f, 0xc1, 0x4d)
  #define APPLE_PROPERTIES_PROTOCOL_GUID                EFI_GUID(0x91bd12fe, 0xf6c3, 0x44fb,  0xa5, 0xb7, 0x51, 0x22, 0xab, 0x30, 0x3a, 0xe0)
  #define EFI_TCG2_PROTOCOL_GUID                        EFI_GUID(0x607f766c, 0x7455, 0x42be,  0x93, 0x0b, 0xe4, 0xd7, 0x6d, 0xb2, 0x72, 0x0f)
 +#define EFI_LOAD_FILE_PROTOCOL_GUID           EFI_GUID(0x56ec3091, 0x954c, 0x11d2,  0x8e, 0x3f, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b)
 +#define EFI_LOAD_FILE2_PROTOCOL_GUID          EFI_GUID(0x4006c0c1, 0xfcb3, 0x403e,  0x99, 0x6d, 0x4a, 0x6c, 0x87, 0x24, 0xe0, 0x6d)
 +#define EFI_RT_PROPERTIES_TABLE_GUID          EFI_GUID(0xeb66918a, 0x7eef, 0x402a,  0x84, 0x2e, 0x93, 0x1d, 0x21, 0xc3, 0x8a, 0xe9)
  
  #define EFI_IMAGE_SECURITY_DATABASE_GUID      EFI_GUID(0xd719b2cb, 0x3d3a, 0x4596,  0xa3, 0xbc, 0xda, 0xd0, 0x0e, 0x67, 0x65, 0x6f)
  #define EFI_SHIM_LOCK_GUID                    EFI_GUID(0x605dab50, 0xe046, 0x4300,  0xab, 0xb6, 0x3d, 0xd8, 0x10, 0xdd, 0x8b, 0x23)
  #define LINUX_EFI_TPM_EVENT_LOG_GUID          EFI_GUID(0xb7799cb0, 0xeca2, 0x4943,  0x96, 0x67, 0x1f, 0xae, 0x07, 0xb7, 0x47, 0xfa)
  #define LINUX_EFI_TPM_FINAL_LOG_GUID          EFI_GUID(0x1e2ed096, 0x30e2, 0x4254,  0xbd, 0x89, 0x86, 0x3b, 0xbe, 0xf8, 0x23, 0x25)
  #define LINUX_EFI_MEMRESERVE_TABLE_GUID               EFI_GUID(0x888eb0c6, 0x8ede, 0x4ff5,  0xa8, 0xf0, 0x9a, 0xee, 0x5c, 0xb9, 0x77, 0xc2)
 +#define LINUX_EFI_INITRD_MEDIA_GUID           EFI_GUID(0x5568e427, 0x68fc, 0x4f3d,  0xac, 0x74, 0xca, 0x55, 0x52, 0x31, 0xcc, 0x68)
  
  /* OEM GUIDs */
  #define DELLEMC_EFI_RCI2_TABLE_GUID           EFI_GUID(0x2d9f28a2, 0xa886, 0x456a,  0x97, 0xa8, 0xf1, 0x1e, 0xf2, 0x4f, 0xf4, 0x55)
@@@ -478,6 -788,74 +478,6 @@@ struct efi_mem_range 
        u64 attribute;
  };
  
 -struct efi_fdt_params {
 -      u64 system_table;
 -      u64 mmap;
 -      u32 mmap_size;
 -      u32 desc_size;
 -      u32 desc_ver;
 -};
 -
 -typedef struct {
 -      u32 revision;
 -      efi_handle_t parent_handle;
 -      efi_system_table_t *system_table;
 -      efi_handle_t device_handle;
 -      void *file_path;
 -      void *reserved;
 -      u32 load_options_size;
 -      void *load_options;
 -      void *image_base;
 -      __aligned_u64 image_size;
 -      unsigned int image_code_type;
 -      unsigned int image_data_type;
 -      efi_status_t ( __efiapi *unload)(efi_handle_t image_handle);
 -} efi_loaded_image_t;
 -
 -typedef struct {
 -      u64 size;
 -      u64 file_size;
 -      u64 phys_size;
 -      efi_time_t create_time;
 -      efi_time_t last_access_time;
 -      efi_time_t modification_time;
 -      __aligned_u64 attribute;
 -      efi_char16_t filename[1];
 -} efi_file_info_t;
 -
 -typedef struct efi_file_handle efi_file_handle_t;
 -
 -struct efi_file_handle {
 -      u64 revision;
 -      efi_status_t (__efiapi *open)(efi_file_handle_t *,
 -                                    efi_file_handle_t **,
 -                                    efi_char16_t *, u64, u64);
 -      efi_status_t (__efiapi *close)(efi_file_handle_t *);
 -      void *delete;
 -      efi_status_t (__efiapi *read)(efi_file_handle_t *,
 -                                    unsigned long *, void *);
 -      void *write;
 -      void *get_position;
 -      void *set_position;
 -      efi_status_t (__efiapi *get_info)(efi_file_handle_t *,
 -                                        efi_guid_t *, unsigned long *,
 -                                        void *);
 -      void *set_info;
 -      void *flush;
 -};
 -
 -typedef struct efi_file_io_interface efi_file_io_interface_t;
 -
 -struct efi_file_io_interface {
 -      u64 revision;
 -      int (__efiapi *open_volume)(efi_file_io_interface_t *,
 -                                  efi_file_handle_t **);
 -};
 -
 -#define EFI_FILE_MODE_READ    0x0000000000000001
 -#define EFI_FILE_MODE_WRITE   0x0000000000000002
 -#define EFI_FILE_MODE_CREATE  0x8000000000000000
 -
  typedef struct {
        u32 version;
        u32 length;
  #define EFI_PROPERTIES_TABLE_VERSION  0x00010000
  #define EFI_PROPERTIES_RUNTIME_MEMORY_PROTECTION_NON_EXECUTABLE_PE_DATA       0x1
  
 +typedef struct {
 +      u16 version;
 +      u16 length;
 +      u32 runtime_services_supported;
 +} efi_rt_properties_table_t;
 +
 +#define EFI_RT_PROPERTIES_TABLE_VERSION       0x1
 +
  #define EFI_INVALID_TABLE_ADDR                (~0UL)
  
  typedef struct {
@@@ -526,63 -896,48 +526,63 @@@ typedef struct 
        efi_time_t time_of_revocation;
  } efi_cert_x509_sha256_t;
  
 +extern unsigned long __ro_after_init efi_rng_seed;            /* RNG Seed table */
 +
  /*
   * All runtime access to EFI goes through this structure:
   */
  extern struct efi {
 -      efi_system_table_t *systab;     /* EFI system table */
 -      unsigned int runtime_version;   /* Runtime services version */
 -      unsigned long mps;              /* MPS table */
 -      unsigned long acpi;             /* ACPI table  (IA64 ext 0.71) */
 -      unsigned long acpi20;           /* ACPI table  (ACPI 2.0) */
 -      unsigned long smbios;           /* SMBIOS table (32 bit entry point) */
 -      unsigned long smbios3;          /* SMBIOS table (64 bit entry point) */
 -      unsigned long boot_info;        /* boot info table */
 -      unsigned long hcdp;             /* HCDP table */
 -      unsigned long uga;              /* UGA table */
 -      unsigned long fw_vendor;        /* fw_vendor */
 -      unsigned long runtime;          /* runtime table */
 -      unsigned long config_table;     /* config tables */
 -      unsigned long esrt;             /* ESRT table */
 -      unsigned long properties_table; /* properties table */
 -      unsigned long mem_attr_table;   /* memory attributes table */
 -      unsigned long rng_seed;         /* UEFI firmware random seed */
 -      unsigned long tpm_log;          /* TPM2 Event Log table */
 -      unsigned long tpm_final_log;    /* TPM2 Final Events Log table */
 -      unsigned long mem_reserve;      /* Linux EFI memreserve table */
 -      efi_get_time_t *get_time;
 -      efi_set_time_t *set_time;
 -      efi_get_wakeup_time_t *get_wakeup_time;
 -      efi_set_wakeup_time_t *set_wakeup_time;
 -      efi_get_variable_t *get_variable;
 -      efi_get_next_variable_t *get_next_variable;
 -      efi_set_variable_t *set_variable;
 -      efi_set_variable_t *set_variable_nonblocking;
 -      efi_query_variable_info_t *query_variable_info;
 -      efi_query_variable_info_t *query_variable_info_nonblocking;
 -      efi_update_capsule_t *update_capsule;
 -      efi_query_capsule_caps_t *query_capsule_caps;
 -      efi_get_next_high_mono_count_t *get_next_high_mono_count;
 -      efi_reset_system_t *reset_system;
 -      struct efi_memory_map memmap;
 -      unsigned long flags;
 +      const efi_runtime_services_t    *runtime;               /* EFI runtime services table */
 +      unsigned int                    runtime_version;        /* Runtime services version */
 +      unsigned int                    runtime_supported_mask;
 +
 +      unsigned long                   acpi;                   /* ACPI table  (IA64 ext 0.71) */
 +      unsigned long                   acpi20;                 /* ACPI table  (ACPI 2.0) */
 +      unsigned long                   smbios;                 /* SMBIOS table (32 bit entry point) */
 +      unsigned long                   smbios3;                /* SMBIOS table (64 bit entry point) */
 +      unsigned long                   esrt;                   /* ESRT table */
 +      unsigned long                   tpm_log;                /* TPM2 Event Log table */
 +      unsigned long                   tpm_final_log;          /* TPM2 Final Events Log table */
 +
 +      efi_get_time_t                  *get_time;
 +      efi_set_time_t                  *set_time;
 +      efi_get_wakeup_time_t           *get_wakeup_time;
 +      efi_set_wakeup_time_t           *set_wakeup_time;
 +      efi_get_variable_t              *get_variable;
 +      efi_get_next_variable_t         *get_next_variable;
 +      efi_set_variable_t              *set_variable;
 +      efi_set_variable_t              *set_variable_nonblocking;
 +      efi_query_variable_info_t       *query_variable_info;
 +      efi_query_variable_info_t       *query_variable_info_nonblocking;
 +      efi_update_capsule_t            *update_capsule;
 +      efi_query_capsule_caps_t        *query_capsule_caps;
 +      efi_get_next_high_mono_count_t  *get_next_high_mono_count;
 +      efi_reset_system_t              *reset_system;
 +
 +      struct efi_memory_map           memmap;
 +      unsigned long                   flags;
  } efi;
  
 +#define EFI_RT_SUPPORTED_GET_TIME                             0x0001
 +#define EFI_RT_SUPPORTED_SET_TIME                             0x0002
 +#define EFI_RT_SUPPORTED_GET_WAKEUP_TIME                      0x0004
 +#define EFI_RT_SUPPORTED_SET_WAKEUP_TIME                      0x0008
 +#define EFI_RT_SUPPORTED_GET_VARIABLE                         0x0010
 +#define EFI_RT_SUPPORTED_GET_NEXT_VARIABLE_NAME                       0x0020
 +#define EFI_RT_SUPPORTED_SET_VARIABLE                         0x0040
 +#define EFI_RT_SUPPORTED_SET_VIRTUAL_ADDRESS_MAP              0x0080
 +#define EFI_RT_SUPPORTED_CONVERT_POINTER                      0x0100
 +#define EFI_RT_SUPPORTED_GET_NEXT_HIGH_MONOTONIC_COUNT                0x0200
 +#define EFI_RT_SUPPORTED_RESET_SYSTEM                         0x0400
 +#define EFI_RT_SUPPORTED_UPDATE_CAPSULE                               0x0800
 +#define EFI_RT_SUPPORTED_QUERY_CAPSULE_CAPABILITIES           0x1000
 +#define EFI_RT_SUPPORTED_QUERY_VARIABLE_INFO                  0x2000
 +
 +#define EFI_RT_SUPPORTED_ALL                                  0x3fff
 +
 +#define EFI_RT_SUPPORTED_TIME_SERVICES                                0x000f
 +#define EFI_RT_SUPPORTED_VARIABLE_SERVICES                    0x0070
 +
  extern struct mm_struct efi_mm;
  
  static inline int
@@@ -632,18 -987,14 +632,18 @@@ extern int __init efi_memmap_split_coun
  extern void __init efi_memmap_insert(struct efi_memory_map *old_memmap,
                                     void *buf, struct efi_mem_range *mem);
  
 -extern int efi_config_init(efi_config_table_type_t *arch_tables);
  #ifdef CONFIG_EFI_ESRT
  extern void __init efi_esrt_init(void);
  #else
  static inline void efi_esrt_init(void) { }
  #endif
 -extern int efi_config_parse_tables(void *config_tables, int count, int sz,
 -                                 efi_config_table_type_t *arch_tables);
 +extern int efi_config_parse_tables(const efi_config_table_t *config_tables,
 +                                 int count,
 +                                 const efi_config_table_type_t *arch_tables);
 +extern int efi_systab_check_header(const efi_table_hdr_t *systab_hdr,
 +                                 int min_major_version);
 +extern void efi_systab_report_header(const efi_table_hdr_t *systab_hdr,
 +                                   unsigned long fw_vendor);
  extern u64 efi_get_iobase (void);
  extern int efi_mem_type(unsigned long phys_addr);
  extern u64 efi_mem_attributes (unsigned long phys_addr);
@@@ -655,7 -1006,7 +655,7 @@@ extern void efi_mem_reserve(phys_addr_
  extern int efi_mem_reserve_persistent(phys_addr_t addr, u64 size);
  extern void efi_initialize_iomem_resources(struct resource *code_resource,
                struct resource *data_resource, struct resource *bss_resource);
 -extern int efi_get_fdt_params(struct efi_fdt_params *params);
 +extern u64 efi_get_fdt_params(struct efi_memory_map_data *data);
  extern struct kobject *efi_kobj;
  
  extern int efi_reboot_quirk_mode;
@@@ -667,8 -1018,6 +667,8 @@@ extern void __init efi_fake_memmap(void
  static inline void efi_fake_memmap(void) { }
  #endif
  
 +extern unsigned long efi_mem_attr_table;
 +
  /*
   * efi_memattr_perm_setter - arch specific callback function passed into
   *                           efi_memattr_apply_permissions() that updates the
@@@ -775,7 -1124,6 +775,7 @@@ extern int __init efi_setup_pcdp_consol
  #define EFI_NX_PE_DATA                9       /* Can runtime data regions be mapped non-executable? */
  #define EFI_MEM_ATTR          10      /* Did firmware publish an EFI_MEMORY_ATTRIBUTES table? */
  #define EFI_MEM_NO_SOFT_RESERVE       11      /* Is the kernel configured to ignore soft reservations? */
 +#define EFI_PRESERVE_BS_REGIONS       12      /* Are EFI boot-services memory segments available? */
  
  #ifdef CONFIG_EFI
  /*
@@@ -794,11 -1142,6 +794,11 @@@ static inline bool __pure efi_soft_rese
        return IS_ENABLED(CONFIG_EFI_SOFT_RESERVE)
                && __efi_soft_reserve_enabled();
  }
 +
 +static inline bool efi_rt_services_supported(unsigned int mask)
 +{
 +      return (efi.runtime_supported_mask & mask) == mask;
 +}
  #else
  static inline bool efi_enabled(int feature)
  {
@@@ -817,11 -1160,6 +817,11 @@@ static inline bool efi_soft_reserve_ena
  {
        return false;
  }
 +
 +static inline bool efi_rt_services_supported(unsigned int mask)
 +{
 +      return false;
 +}
  #endif
  
  extern int efi_status_to_err(efi_status_t status);
  #define EFI_VARIABLE_GUID_LEN UUID_STRING_LEN
  
  /*
 - * The type of search to perform when calling boottime->locate_handle
 - */
 -#define EFI_LOCATE_ALL_HANDLES                        0
 -#define EFI_LOCATE_BY_REGISTER_NOTIFY         1
 -#define EFI_LOCATE_BY_PROTOCOL                        2
 -
 -/*
   * EFI Device Path information
   */
  #define EFI_DEV_HW                    0x01
  #define   EFI_DEV_END_ENTIRE                  0xFF
  
  struct efi_generic_dev_path {
 -      u8 type;
 -      u8 sub_type;
 -      u16 length;
 -} __attribute ((packed));
 +      u8                              type;
 +      u8                              sub_type;
 +      u16                             length;
 +} __packed;
 +
 +struct efi_acpi_dev_path {
 +      struct efi_generic_dev_path     header;
 +      u32                             hid;
 +      u32                             uid;
 +} __packed;
 +
 +struct efi_pci_dev_path {
 +      struct efi_generic_dev_path     header;
 +      u8                              fn;
 +      u8                              dev;
 +} __packed;
 +
 +struct efi_vendor_dev_path {
 +      struct efi_generic_dev_path     header;
 +      efi_guid_t                      vendorguid;
 +      u8                              vendordata[];
 +} __packed;
  
  struct efi_dev_path {
 -      u8 type;        /* can be replaced with unnamed */
 -      u8 sub_type;    /* struct efi_generic_dev_path; */
 -      u16 length;     /* once we've moved to -std=c11 */
        union {
 -              struct {
 -                      u32 hid;
 -                      u32 uid;
 -              } acpi;
 -              struct {
 -                      u8 fn;
 -                      u8 dev;
 -              } pci;
 +              struct efi_generic_dev_path     header;
 +              struct efi_acpi_dev_path        acpi;
 +              struct efi_pci_dev_path         pci;
 +              struct efi_vendor_dev_path      vendor;
        };
 -} __attribute ((packed));
 +} __packed;
  
 -#if IS_ENABLED(CONFIG_EFI_DEV_PATH_PARSER)
 -struct device *efi_get_device_by_path(struct efi_dev_path **node, size_t *len);
 -#endif
 +struct device *efi_get_device_by_path(const struct efi_dev_path **node,
 +                                    size_t *len);
  
  static inline void memrange_efi_to_native(u64 *addr, u64 *npages)
  {
@@@ -977,6 -1312,80 +977,6 @@@ struct efivar_entry 
        bool deleting;
  };
  
 -union efi_simple_text_output_protocol {
 -      struct {
 -              void *reset;
 -              efi_status_t (__efiapi *output_string)(efi_simple_text_output_protocol_t *,
 -                                                     efi_char16_t *);
 -              void *test_string;
 -      };
 -      struct {
 -              u32 reset;
 -              u32 output_string;
 -              u32 test_string;
 -      } mixed_mode;
 -};
 -
 -#define PIXEL_RGB_RESERVED_8BIT_PER_COLOR             0
 -#define PIXEL_BGR_RESERVED_8BIT_PER_COLOR             1
 -#define PIXEL_BIT_MASK                                        2
 -#define PIXEL_BLT_ONLY                                        3
 -#define PIXEL_FORMAT_MAX                              4
 -
 -typedef struct {
 -      u32 red_mask;
 -      u32 green_mask;
 -      u32 blue_mask;
 -      u32 reserved_mask;
 -} efi_pixel_bitmask_t;
 -
 -typedef struct {
 -      u32 version;
 -      u32 horizontal_resolution;
 -      u32 vertical_resolution;
 -      int pixel_format;
 -      efi_pixel_bitmask_t pixel_information;
 -      u32 pixels_per_scan_line;
 -} efi_graphics_output_mode_info_t;
 -
 -typedef union efi_graphics_output_protocol_mode efi_graphics_output_protocol_mode_t;
 -
 -union efi_graphics_output_protocol_mode {
 -      struct {
 -              u32 max_mode;
 -              u32 mode;
 -              efi_graphics_output_mode_info_t *info;
 -              unsigned long size_of_info;
 -              efi_physical_addr_t frame_buffer_base;
 -              unsigned long frame_buffer_size;
 -      };
 -      struct {
 -              u32 max_mode;
 -              u32 mode;
 -              u32 info;
 -              u32 size_of_info;
 -              u64 frame_buffer_base;
 -              u32 frame_buffer_size;
 -      } mixed_mode;
 -};
 -
 -typedef union efi_graphics_output_protocol efi_graphics_output_protocol_t;
 -
 -union efi_graphics_output_protocol {
 -      struct {
 -              void *query_mode;
 -              void *set_mode;
 -              void *blt;
 -              efi_graphics_output_protocol_mode_t *mode;
 -      };
 -      struct {
 -              u32 query_mode;
 -              u32 set_mode;
 -              u32 blt;
 -              u32 mode;
 -      } mixed_mode;
 -};
 -
  extern struct list_head efivar_sysfs_list;
  
  static inline void
@@@ -1074,6 -1483,52 +1074,6 @@@ static inline int efi_runtime_map_copy(
  
  #endif
  
 -/* prototypes shared between arch specific and generic stub code */
 -
 -void efi_printk(char *str);
 -
 -void efi_free(unsigned long size, unsigned long addr);
 -
 -char *efi_convert_cmdline(efi_loaded_image_t *image, int *cmd_line_len);
 -
 -efi_status_t efi_get_memory_map(struct efi_boot_memmap *map);
 -
 -efi_status_t efi_low_alloc_above(unsigned long size, unsigned long align,
 -                               unsigned long *addr, unsigned long min);
 -
 -static inline
 -efi_status_t efi_low_alloc(unsigned long size, unsigned long align,
 -                         unsigned long *addr)
 -{
 -      /*
 -       * Don't allocate at 0x0. It will confuse code that
 -       * checks pointers against NULL. Skip the first 8
 -       * bytes so we start at a nice even number.
 -       */
 -      return efi_low_alloc_above(size, align, addr, 0x8);
 -}
 -
 -efi_status_t efi_high_alloc(unsigned long size, unsigned long align,
 -                          unsigned long *addr, unsigned long max);
 -
 -efi_status_t efi_relocate_kernel(unsigned long *image_addr,
 -                               unsigned long image_size,
 -                               unsigned long alloc_size,
 -                               unsigned long preferred_addr,
 -                               unsigned long alignment,
 -                               unsigned long min_addr);
 -
 -efi_status_t handle_cmdline_files(efi_loaded_image_t *image,
 -                                char *cmd_line, char *option_string,
 -                                unsigned long max_addr,
 -                                unsigned long *load_addr,
 -                                unsigned long *load_size);
 -
 -efi_status_t efi_parse_options(char const *cmdline);
 -
 -efi_status_t efi_setup_gop(struct screen_info *si, efi_guid_t *proto,
 -                         unsigned long size);
 -
  #ifdef CONFIG_EFI
  extern bool efi_runtime_disabled(void);
  #else
@@@ -1098,12 -1553,6 +1098,12 @@@ static inline voi
  efi_enable_reset_attack_mitigation(void) { }
  #endif
  
 +#ifdef CONFIG_EFI_EMBEDDED_FIRMWARE
 +void efi_check_for_embedded_firmwares(void);
 +#else
 +static inline void efi_check_for_embedded_firmwares(void) { }
 +#endif
 +
  efi_status_t efi_random_get_seed(void);
  
  void efi_retrieve_tpm2_eventlog(void);
        arch_efi_call_virt_teardown();                                  \
  })
  
 -typedef efi_status_t (*efi_exit_boot_map_processing)(
 -      struct efi_boot_memmap *map,
 -      void *priv);
 -
 -efi_status_t efi_exit_boot_services(void *handle,
 -                                  struct efi_boot_memmap *map,
 -                                  void *priv,
 -                                  efi_exit_boot_map_processing priv_func);
 -
  #define EFI_RANDOM_SEED_SIZE          64U
  
  struct linux_efi_random_seed {
@@@ -1243,4 -1701,8 +1243,6 @@@ struct linux_efi_memreserve 
  #define EFI_MEMRESERVE_COUNT(size) (((size) - sizeof(struct linux_efi_memreserve)) \
        / sizeof(((struct linux_efi_memreserve *)0)->entry[0]))
  
 -void efi_pci_disable_bridge_busmaster(void);
 -
+ void __init efi_arch_mem_reserve(phys_addr_t addr, u64 size);
  #endif /* _LINUX_EFI_H */