Merge branch 'kvm-arm64/misc-5.9' into kvmarm-master/next
Author:     Marc Zyngier <maz@kernel.org>
AuthorDate: Thu, 30 Jul 2020 15:13:04 +0000 (16:13 +0100)
Commit:     Marc Zyngier <maz@kernel.org>
CommitDate: Thu, 30 Jul 2020 15:13:04 +0000 (16:13 +0100)
Signed-off-by: Marc Zyngier <maz@kernel.org>
arch/arm64/include/asm/kvm_emulate.h
arch/arm64/kvm/hyp/include/hyp/switch.h
arch/arm64/kvm/mmu.c

@@@ -124,12 -124,33 +124,12 @@@ static inline void vcpu_set_vsesr(struc
  
  static __always_inline unsigned long *vcpu_pc(const struct kvm_vcpu *vcpu)
  {
 -      return (unsigned long *)&vcpu_gp_regs(vcpu)->regs.pc;
 -}
 -
 -static inline unsigned long *__vcpu_elr_el1(const struct kvm_vcpu *vcpu)
 -{
 -      return (unsigned long *)&vcpu_gp_regs(vcpu)->elr_el1;
 -}
 -
 -static inline unsigned long vcpu_read_elr_el1(const struct kvm_vcpu *vcpu)
 -{
 -      if (vcpu->arch.sysregs_loaded_on_cpu)
 -              return read_sysreg_el1(SYS_ELR);
 -      else
 -              return *__vcpu_elr_el1(vcpu);
 -}
 -
 -static inline void vcpu_write_elr_el1(const struct kvm_vcpu *vcpu, unsigned long v)
 -{
 -      if (vcpu->arch.sysregs_loaded_on_cpu)
 -              write_sysreg_el1(v, SYS_ELR);
 -      else
 -              *__vcpu_elr_el1(vcpu) = v;
 +      return (unsigned long *)&vcpu_gp_regs(vcpu)->pc;
  }
  
  static __always_inline unsigned long *vcpu_cpsr(const struct kvm_vcpu *vcpu)
  {
 -      return (unsigned long *)&vcpu_gp_regs(vcpu)->regs.pstate;
 +      return (unsigned long *)&vcpu_gp_regs(vcpu)->pstate;
  }
  
  static __always_inline bool vcpu_mode_is_32bit(const struct kvm_vcpu *vcpu)
@@@ -158,14 -179,14 +158,14 @@@ static inline void vcpu_set_thumb(struc
  static __always_inline unsigned long vcpu_get_reg(const struct kvm_vcpu *vcpu,
                                         u8 reg_num)
  {
 -      return (reg_num == 31) ? 0 : vcpu_gp_regs(vcpu)->regs.regs[reg_num];
 +      return (reg_num == 31) ? 0 : vcpu_gp_regs(vcpu)->regs[reg_num];
  }
  
  static __always_inline void vcpu_set_reg(struct kvm_vcpu *vcpu, u8 reg_num,
                                unsigned long val)
  {
        if (reg_num != 31)
 -              vcpu_gp_regs(vcpu)->regs.regs[reg_num] = val;
 +              vcpu_gp_regs(vcpu)->regs[reg_num] = val;
  }
  
  static inline unsigned long vcpu_read_spsr(const struct kvm_vcpu *vcpu)
        if (vcpu->arch.sysregs_loaded_on_cpu)
                return read_sysreg_el1(SYS_SPSR);
        else
 -              return vcpu_gp_regs(vcpu)->spsr[KVM_SPSR_EL1];
 +              return __vcpu_sys_reg(vcpu, SPSR_EL1);
  }
  
  static inline void vcpu_write_spsr(struct kvm_vcpu *vcpu, unsigned long v)
        if (vcpu->arch.sysregs_loaded_on_cpu)
                write_sysreg_el1(v, SYS_SPSR);
        else
 -              vcpu_gp_regs(vcpu)->spsr[KVM_SPSR_EL1] = v;
 +              __vcpu_sys_reg(vcpu, SPSR_EL1) = v;
  }
  
  /*
@@@ -345,7 -366,7 +345,7 @@@ static __always_inline u8 kvm_vcpu_trap
        return kvm_vcpu_get_esr(vcpu) & ESR_ELx_FSC_TYPE;
  }
  
- static __always_inline bool kvm_vcpu_dabt_isextabt(const struct kvm_vcpu *vcpu)
+ static __always_inline bool kvm_vcpu_abt_issea(const struct kvm_vcpu *vcpu)
  {
        switch (kvm_vcpu_trap_get_fault(vcpu)) {
        case FSC_SEA:
@@@ -495,14 -516,14 +495,14 @@@ static __always_inline void kvm_skip_in
   * Skip an instruction which has been emulated at hyp while most guest sysregs
   * are live.
   */
 -static __always_inline void __hyp_text __kvm_skip_instr(struct kvm_vcpu *vcpu)
 +static __always_inline void __kvm_skip_instr(struct kvm_vcpu *vcpu)
  {
        *vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR);
 -      vcpu->arch.ctxt.gp_regs.regs.pstate = read_sysreg_el2(SYS_SPSR);
 +      vcpu_gp_regs(vcpu)->pstate = read_sysreg_el2(SYS_SPSR);
  
        kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
  
 -      write_sysreg_el2(vcpu->arch.ctxt.gp_regs.regs.pstate, SYS_SPSR);
 +      write_sysreg_el2(vcpu_gp_regs(vcpu)->pstate, SYS_SPSR);
        write_sysreg_el2(*vcpu_pc(vcpu), SYS_ELR);
  }
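
For illustration only, a minimal sketch (not part of this merge) of how callers use the updated accessors: guest general-purpose registers are now reached through vcpu_get_reg()/vcpu_set_reg() on vcpu_gp_regs(vcpu)->regs[], and SPSR_EL1 is read via __vcpu_sys_reg(). The helper name demo_swap_x0_x1() is hypothetical.

/* Hypothetical helper: swap the guest's x0 and x1 using the accessors
 * above. Register 31 reads as zero and writes to it are discarded,
 * matching XZR semantics. */
static void demo_swap_x0_x1(struct kvm_vcpu *vcpu)
{
	unsigned long x0 = vcpu_get_reg(vcpu, 0);
	unsigned long x1 = vcpu_get_reg(vcpu, 1);

	vcpu_set_reg(vcpu, 0, x1);
	vcpu_set_reg(vcpu, 1, x0);
}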
  
index 0511af1,0000000..426ef65
mode 100644,000000..100644
--- /dev/null
@@@ -1,511 -1,0 +1,511 @@@
-                       !kvm_vcpu_dabt_isextabt(vcpu) &&
 +// SPDX-License-Identifier: GPL-2.0-only
 +/*
 + * Copyright (C) 2015 - ARM Ltd
 + * Author: Marc Zyngier <marc.zyngier@arm.com>
 + */
 +
 +#ifndef __ARM64_KVM_HYP_SWITCH_H__
 +#define __ARM64_KVM_HYP_SWITCH_H__
 +
 +#include <linux/arm-smccc.h>
 +#include <linux/kvm_host.h>
 +#include <linux/types.h>
 +#include <linux/jump_label.h>
 +#include <uapi/linux/psci.h>
 +
 +#include <kvm/arm_psci.h>
 +
 +#include <asm/barrier.h>
 +#include <asm/cpufeature.h>
 +#include <asm/kprobes.h>
 +#include <asm/kvm_asm.h>
 +#include <asm/kvm_emulate.h>
 +#include <asm/kvm_hyp.h>
 +#include <asm/kvm_mmu.h>
 +#include <asm/fpsimd.h>
 +#include <asm/debug-monitors.h>
 +#include <asm/processor.h>
 +#include <asm/thread_info.h>
 +
 +extern const char __hyp_panic_string[];
 +
 +/* Check whether the FP regs were dirtied while in the host-side run loop: */
 +static inline bool update_fp_enabled(struct kvm_vcpu *vcpu)
 +{
 +      /*
 +       * When the system doesn't support FP/SIMD, we cannot rely on
 +       * the _TIF_FOREIGN_FPSTATE flag. However, we always inject an
 +       * abort on the very first access to FP and thus we should never
 +       * see KVM_ARM64_FP_ENABLED. For added safety, make sure we always
 +       * trap the accesses.
 +       */
 +      if (!system_supports_fpsimd() ||
 +          vcpu->arch.host_thread_info->flags & _TIF_FOREIGN_FPSTATE)
 +              vcpu->arch.flags &= ~(KVM_ARM64_FP_ENABLED |
 +                                    KVM_ARM64_FP_HOST);
 +
 +      return !!(vcpu->arch.flags & KVM_ARM64_FP_ENABLED);
 +}
 +
 +/* Save the 32-bit only FPSIMD system register state */
 +static inline void __fpsimd_save_fpexc32(struct kvm_vcpu *vcpu)
 +{
 +      if (!vcpu_el1_is_32bit(vcpu))
 +              return;
 +
 +      __vcpu_sys_reg(vcpu, FPEXC32_EL2) = read_sysreg(fpexc32_el2);
 +}
 +
 +static inline void __activate_traps_fpsimd32(struct kvm_vcpu *vcpu)
 +{
 +      /*
 +       * We are about to set CPTR_EL2.TFP to trap all floating point
 +       * register accesses to EL2, however, the ARM ARM clearly states that
 +       * traps are only taken to EL2 if the operation would not otherwise
 +       * trap to EL1.  Therefore, always make sure that for 32-bit guests,
 +       * we set FPEXC.EN to prevent traps to EL1, when setting the TFP bit.
 +       * If FP/ASIMD is not implemented, FPEXC is UNDEFINED and any access to
 +       * it will cause an exception.
 +       */
 +      if (vcpu_el1_is_32bit(vcpu) && system_supports_fpsimd()) {
 +              write_sysreg(1 << 30, fpexc32_el2);
 +              isb();
 +      }
 +}
 +
 +static inline void __activate_traps_common(struct kvm_vcpu *vcpu)
 +{
 +      /* Trap on AArch32 cp15 c15 (impdef sysregs) accesses (EL1 or EL0) */
 +      write_sysreg(1 << 15, hstr_el2);
 +
 +      /*
 +       * Make sure we trap PMU access from EL0 to EL2. Also sanitize
 +       * PMSELR_EL0 to make sure it never contains the cycle
 +       * counter, which could make a PMXEVCNTR_EL0 access UNDEF at
 +       * EL1 instead of being trapped to EL2.
 +       */
 +      write_sysreg(0, pmselr_el0);
 +      write_sysreg(ARMV8_PMU_USERENR_MASK, pmuserenr_el0);
 +      write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2);
 +}
 +
 +static inline void __deactivate_traps_common(void)
 +{
 +      write_sysreg(0, hstr_el2);
 +      write_sysreg(0, pmuserenr_el0);
 +}
 +
 +static inline void ___activate_traps(struct kvm_vcpu *vcpu)
 +{
 +      u64 hcr = vcpu->arch.hcr_el2;
 +
 +      if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_TX2_219_TVM))
 +              hcr |= HCR_TVM;
 +
 +      write_sysreg(hcr, hcr_el2);
 +
 +      if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN) && (hcr & HCR_VSE))
 +              write_sysreg_s(vcpu->arch.vsesr_el2, SYS_VSESR_EL2);
 +}
 +
 +static inline void ___deactivate_traps(struct kvm_vcpu *vcpu)
 +{
 +      /*
 +       * If we pended a virtual abort, preserve it until it gets
 +       * cleared. See D1.14.3 (Virtual Interrupts) for details, but
 +       * the crucial bit is "On taking a vSError interrupt,
 +       * HCR_EL2.VSE is cleared to 0."
 +       */
 +      if (vcpu->arch.hcr_el2 & HCR_VSE) {
 +              vcpu->arch.hcr_el2 &= ~HCR_VSE;
 +              vcpu->arch.hcr_el2 |= read_sysreg(hcr_el2) & HCR_VSE;
 +      }
 +}
 +
 +static inline void __activate_vm(struct kvm_s2_mmu *mmu)
 +{
 +      __load_guest_stage2(mmu);
 +}
 +
 +static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar)
 +{
 +      u64 par, tmp;
 +
 +      /*
 +       * Resolve the IPA the hard way using the guest VA.
 +       *
 +       * Stage-1 translation already validated the memory access
 +       * rights. As such, we can use the EL1 translation regime, and
 +       * don't have to distinguish between EL0 and EL1 access.
 +       *
 +       * We do need to save/restore PAR_EL1 though, as we haven't
 +       * saved the guest context yet, and we may return early...
 +       */
 +      par = read_sysreg(par_el1);
 +      asm volatile("at s1e1r, %0" : : "r" (far));
 +      isb();
 +
 +      tmp = read_sysreg(par_el1);
 +      write_sysreg(par, par_el1);
 +
 +      if (unlikely(tmp & SYS_PAR_EL1_F))
 +              return false; /* Translation failed, back to guest */
 +
 +      /* Convert PAR to HPFAR format */
 +      *hpfar = PAR_TO_HPFAR(tmp);
 +      return true;
 +}
 +
 +static inline bool __populate_fault_info(struct kvm_vcpu *vcpu)
 +{
 +      u8 ec;
 +      u64 esr;
 +      u64 hpfar, far;
 +
 +      esr = vcpu->arch.fault.esr_el2;
 +      ec = ESR_ELx_EC(esr);
 +
 +      if (ec != ESR_ELx_EC_DABT_LOW && ec != ESR_ELx_EC_IABT_LOW)
 +              return true;
 +
 +      far = read_sysreg_el2(SYS_FAR);
 +
 +      /*
 +       * The HPFAR can be invalid if the stage 2 fault did not
 +       * happen during a stage 1 page table walk (the ESR_EL2.S1PTW
 +       * bit is clear) and one of the two following cases is true:
 +       *   1. The fault was due to a permission fault
 +       *   2. The processor carries errata 834220
 +       *
 +       * Therefore, for all non S1PTW faults where we either have a
 +       * permission fault or the errata workaround is enabled, we
 +       * resolve the IPA using the AT instruction.
 +       */
 +      if (!(esr & ESR_ELx_S1PTW) &&
 +          (cpus_have_final_cap(ARM64_WORKAROUND_834220) ||
 +           (esr & ESR_ELx_FSC_TYPE) == FSC_PERM)) {
 +              if (!__translate_far_to_hpfar(far, &hpfar))
 +                      return false;
 +      } else {
 +              hpfar = read_sysreg(hpfar_el2);
 +      }
 +
 +      vcpu->arch.fault.far_el2 = far;
 +      vcpu->arch.fault.hpfar_el2 = hpfar;
 +      return true;
 +}
 +
 +/* Check for an FPSIMD/SVE trap and handle as appropriate */
 +static inline bool __hyp_handle_fpsimd(struct kvm_vcpu *vcpu)
 +{
 +      bool vhe, sve_guest, sve_host;
 +      u8 esr_ec;
 +
 +      if (!system_supports_fpsimd())
 +              return false;
 +
 +      /*
 +       * Currently, system_supports_sve() implies has_vhe(),
 +       * so the check is redundant. However, has_vhe() can be determined
 +       * statically and helps the compiler remove dead code.
 +       */
 +      if (has_vhe() && system_supports_sve()) {
 +              sve_guest = vcpu_has_sve(vcpu);
 +              sve_host = vcpu->arch.flags & KVM_ARM64_HOST_SVE_IN_USE;
 +              vhe = true;
 +      } else {
 +              sve_guest = false;
 +              sve_host = false;
 +              vhe = has_vhe();
 +      }
 +
 +      esr_ec = kvm_vcpu_trap_get_class(vcpu);
 +      if (esr_ec != ESR_ELx_EC_FP_ASIMD &&
 +          esr_ec != ESR_ELx_EC_SVE)
 +              return false;
 +
 +      /* Don't handle SVE traps for non-SVE vcpus here: */
 +      if (!sve_guest)
 +              if (esr_ec != ESR_ELx_EC_FP_ASIMD)
 +                      return false;
 +
 +      /* Valid trap.  Switch the context: */
 +
 +      if (vhe) {
 +              u64 reg = read_sysreg(cpacr_el1) | CPACR_EL1_FPEN;
 +
 +              if (sve_guest)
 +                      reg |= CPACR_EL1_ZEN;
 +
 +              write_sysreg(reg, cpacr_el1);
 +      } else {
 +              write_sysreg(read_sysreg(cptr_el2) & ~(u64)CPTR_EL2_TFP,
 +                           cptr_el2);
 +      }
 +
 +      isb();
 +
 +      if (vcpu->arch.flags & KVM_ARM64_FP_HOST) {
 +              /*
 +               * In the SVE case, VHE is assumed: it is enforced by
 +               * Kconfig and kvm_arch_init().
 +               */
 +              if (sve_host) {
 +                      struct thread_struct *thread = container_of(
 +                              vcpu->arch.host_fpsimd_state,
 +                              struct thread_struct, uw.fpsimd_state);
 +
 +                      sve_save_state(sve_pffr(thread),
 +                                     &vcpu->arch.host_fpsimd_state->fpsr);
 +              } else {
 +                      __fpsimd_save_state(vcpu->arch.host_fpsimd_state);
 +              }
 +
 +              vcpu->arch.flags &= ~KVM_ARM64_FP_HOST;
 +      }
 +
 +      if (sve_guest) {
 +              sve_load_state(vcpu_sve_pffr(vcpu),
 +                             &vcpu->arch.ctxt.fp_regs.fpsr,
 +                             sve_vq_from_vl(vcpu->arch.sve_max_vl) - 1);
 +              write_sysreg_s(__vcpu_sys_reg(vcpu, ZCR_EL1), SYS_ZCR_EL12);
 +      } else {
 +              __fpsimd_restore_state(&vcpu->arch.ctxt.fp_regs);
 +      }
 +
 +      /* Skip restoring fpexc32 for AArch64 guests */
 +      if (!(read_sysreg(hcr_el2) & HCR_RW))
 +              write_sysreg(__vcpu_sys_reg(vcpu, FPEXC32_EL2), fpexc32_el2);
 +
 +      vcpu->arch.flags |= KVM_ARM64_FP_ENABLED;
 +
 +      return true;
 +}
 +
 +static inline bool handle_tx2_tvm(struct kvm_vcpu *vcpu)
 +{
 +      u32 sysreg = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu));
 +      int rt = kvm_vcpu_sys_get_rt(vcpu);
 +      u64 val = vcpu_get_reg(vcpu, rt);
 +
 +      /*
 +       * The normal sysreg handling code expects to see the traps;
 +       * let's not do anything here.
 +       */
 +      if (vcpu->arch.hcr_el2 & HCR_TVM)
 +              return false;
 +
 +      switch (sysreg) {
 +      case SYS_SCTLR_EL1:
 +              write_sysreg_el1(val, SYS_SCTLR);
 +              break;
 +      case SYS_TTBR0_EL1:
 +              write_sysreg_el1(val, SYS_TTBR0);
 +              break;
 +      case SYS_TTBR1_EL1:
 +              write_sysreg_el1(val, SYS_TTBR1);
 +              break;
 +      case SYS_TCR_EL1:
 +              write_sysreg_el1(val, SYS_TCR);
 +              break;
 +      case SYS_ESR_EL1:
 +              write_sysreg_el1(val, SYS_ESR);
 +              break;
 +      case SYS_FAR_EL1:
 +              write_sysreg_el1(val, SYS_FAR);
 +              break;
 +      case SYS_AFSR0_EL1:
 +              write_sysreg_el1(val, SYS_AFSR0);
 +              break;
 +      case SYS_AFSR1_EL1:
 +              write_sysreg_el1(val, SYS_AFSR1);
 +              break;
 +      case SYS_MAIR_EL1:
 +              write_sysreg_el1(val, SYS_MAIR);
 +              break;
 +      case SYS_AMAIR_EL1:
 +              write_sysreg_el1(val, SYS_AMAIR);
 +              break;
 +      case SYS_CONTEXTIDR_EL1:
 +              write_sysreg_el1(val, SYS_CONTEXTIDR);
 +              break;
 +      default:
 +              return false;
 +      }
 +
 +      __kvm_skip_instr(vcpu);
 +      return true;
 +}
 +
 +static inline bool esr_is_ptrauth_trap(u32 esr)
 +{
 +      u32 ec = ESR_ELx_EC(esr);
 +
 +      if (ec == ESR_ELx_EC_PAC)
 +              return true;
 +
 +      if (ec != ESR_ELx_EC_SYS64)
 +              return false;
 +
 +      switch (esr_sys64_to_sysreg(esr)) {
 +      case SYS_APIAKEYLO_EL1:
 +      case SYS_APIAKEYHI_EL1:
 +      case SYS_APIBKEYLO_EL1:
 +      case SYS_APIBKEYHI_EL1:
 +      case SYS_APDAKEYLO_EL1:
 +      case SYS_APDAKEYHI_EL1:
 +      case SYS_APDBKEYLO_EL1:
 +      case SYS_APDBKEYHI_EL1:
 +      case SYS_APGAKEYLO_EL1:
 +      case SYS_APGAKEYHI_EL1:
 +              return true;
 +      }
 +
 +      return false;
 +}
 +
 +#define __ptrauth_save_key(ctxt, key)                                 \
 +      do {                                                            \
 +      u64 __val;                                                      \
 +      __val = read_sysreg_s(SYS_ ## key ## KEYLO_EL1);                \
 +      ctxt_sys_reg(ctxt, key ## KEYLO_EL1) = __val;                   \
 +      __val = read_sysreg_s(SYS_ ## key ## KEYHI_EL1);                \
 +      ctxt_sys_reg(ctxt, key ## KEYHI_EL1) = __val;                   \
 +} while(0)
 +
 +static inline bool __hyp_handle_ptrauth(struct kvm_vcpu *vcpu)
 +{
 +      struct kvm_cpu_context *ctxt;
 +      u64 val;
 +
 +      if (!vcpu_has_ptrauth(vcpu) ||
 +          !esr_is_ptrauth_trap(kvm_vcpu_get_esr(vcpu)))
 +              return false;
 +
 +      ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
 +      __ptrauth_save_key(ctxt, APIA);
 +      __ptrauth_save_key(ctxt, APIB);
 +      __ptrauth_save_key(ctxt, APDA);
 +      __ptrauth_save_key(ctxt, APDB);
 +      __ptrauth_save_key(ctxt, APGA);
 +
 +      vcpu_ptrauth_enable(vcpu);
 +
 +      val = read_sysreg(hcr_el2);
 +      val |= (HCR_API | HCR_APK);
 +      write_sysreg(val, hcr_el2);
 +
 +      return true;
 +}
 +
 +/*
 + * Return true when we were able to fixup the guest exit and should return to
 + * the guest, false when we should restore the host state and return to the
 + * main run loop.
 + */
 +static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
 +{
 +      if (ARM_EXCEPTION_CODE(*exit_code) != ARM_EXCEPTION_IRQ)
 +              vcpu->arch.fault.esr_el2 = read_sysreg_el2(SYS_ESR);
 +
 +      /*
 +       * We're using the raw exception code in order to only process
 +       * the trap if no SError is pending. We will come back to the
 +       * same PC once the SError has been injected, and replay the
 +       * trapping instruction.
 +       */
 +      if (*exit_code != ARM_EXCEPTION_TRAP)
 +              goto exit;
 +
 +      if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_TX2_219_TVM) &&
 +          kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_SYS64 &&
 +          handle_tx2_tvm(vcpu))
 +              return true;
 +
 +      /*
 +       * We trap the first access to the FP/SIMD to save the host context
 +       * and restore the guest context lazily.
 +       * If FP/SIMD is not implemented, handle the trap and inject an
 +       * undefined instruction exception to the guest.
 +       * Similarly for trapped SVE accesses.
 +       */
 +      if (__hyp_handle_fpsimd(vcpu))
 +              return true;
 +
 +      if (__hyp_handle_ptrauth(vcpu))
 +              return true;
 +
 +      if (!__populate_fault_info(vcpu))
 +              return true;
 +
 +      if (static_branch_unlikely(&vgic_v2_cpuif_trap)) {
 +              bool valid;
 +
 +              valid = kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_DABT_LOW &&
 +                      kvm_vcpu_trap_get_fault_type(vcpu) == FSC_FAULT &&
 +                      kvm_vcpu_dabt_isvalid(vcpu) &&
++                      !kvm_vcpu_abt_issea(vcpu) &&
 +                      !kvm_vcpu_dabt_iss1tw(vcpu);
 +
 +              if (valid) {
 +                      int ret = __vgic_v2_perform_cpuif_access(vcpu);
 +
 +                      if (ret == 1)
 +                              return true;
 +
 +                      /* Promote an illegal access to an SError. */
 +                      if (ret == -1)
 +                              *exit_code = ARM_EXCEPTION_EL1_SERROR;
 +
 +                      goto exit;
 +              }
 +      }
 +
 +      if (static_branch_unlikely(&vgic_v3_cpuif_trap) &&
 +          (kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_SYS64 ||
 +           kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_CP15_32)) {
 +              int ret = __vgic_v3_perform_cpuif_access(vcpu);
 +
 +              if (ret == 1)
 +                      return true;
 +      }
 +
 +exit:
 +      /* Return to the host kernel and handle the exit */
 +      return false;
 +}
 +
 +static inline bool __needs_ssbd_off(struct kvm_vcpu *vcpu)
 +{
 +      if (!cpus_have_final_cap(ARM64_SSBD))
 +              return false;
 +
 +      return !(vcpu->arch.workaround_flags & VCPU_WORKAROUND_2_FLAG);
 +}
 +
 +static inline void __set_guest_arch_workaround_state(struct kvm_vcpu *vcpu)
 +{
 +#ifdef CONFIG_ARM64_SSBD
 +      /*
 +       * The host runs with the workaround always present. If the
 +       * guest wants it disabled, so be it...
 +       */
 +      if (__needs_ssbd_off(vcpu) &&
 +          __hyp_this_cpu_read(arm64_ssbd_callback_required))
 +              arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_2, 0, NULL);
 +#endif
 +}
 +
 +static inline void __set_host_arch_workaround_state(struct kvm_vcpu *vcpu)
 +{
 +#ifdef CONFIG_ARM64_SSBD
 +      /*
 +       * If the guest has disabled the workaround, bring it back on.
 +       */
 +      if (__needs_ssbd_off(vcpu) &&
 +          __hyp_this_cpu_read(arm64_ssbd_callback_required))
 +              arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_2, 1, NULL);
 +#endif
 +}
 +
 +#endif /* __ARM64_KVM_HYP_SWITCH_H__ */
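
As a point of orientation, a minimal sketch of how a hyp run loop is expected to consume fixup_guest_exit(): keep re-entering the guest while the exit can be handled at EL2, otherwise hand the exit code back to the host. demo_enter_guest() and demo_run_loop() are hypothetical names standing in for the real guest-entry path, whose exact signature is not shown in this diff.

/* Sketch only: demo_enter_guest() is assumed to perform the low-level
 * world switch and return an ARM_EXCEPTION_* code. */
static int demo_run_loop(struct kvm_vcpu *vcpu)
{
	u64 exit_code;

	do {
		exit_code = demo_enter_guest(vcpu);
		/* FP/SIMD, ptrauth and vgic traps are handled at EL2 and
		 * the guest is re-entered directly when possible. */
	} while (fixup_guest_exit(vcpu, &exit_code));

	/* Could not be fixed up here: let the host handle the exit. */
	return ARM_EXCEPTION_CODE(exit_code);
}
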
diff --combined arch/arm64/kvm/mmu.c
@@@ -55,13 -55,12 +55,13 @@@ static bool memslot_is_logging(struct k
   */
  void kvm_flush_remote_tlbs(struct kvm *kvm)
  {
 -      kvm_call_hyp(__kvm_tlb_flush_vmid, kvm);
 +      kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu);
  }
  
 -static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
 +static void kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa,
 +                                 int level)
  {
 -      kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
 +      kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ipa, level);
  }
  
  /*
@@@ -91,39 -90,37 +91,39 @@@ static bool kvm_is_device_pfn(unsigned 
  
  /**
   * stage2_dissolve_pmd() - clear and flush huge PMD entry
 - * @kvm:      pointer to kvm structure.
 + * @mmu:      pointer to mmu structure to operate on
   * @addr:     IPA
   * @pmd:      pmd pointer for IPA
   *
   * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs.
   */
 -static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
 +static void stage2_dissolve_pmd(struct kvm_s2_mmu *mmu, phys_addr_t addr, pmd_t *pmd)
  {
        if (!pmd_thp_or_huge(*pmd))
                return;
  
        pmd_clear(pmd);
 -      kvm_tlb_flush_vmid_ipa(kvm, addr);
 +      kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL);
        put_page(virt_to_page(pmd));
  }
  
  /**
   * stage2_dissolve_pud() - clear and flush huge PUD entry
 - * @kvm:      pointer to kvm structure.
 + * @mmu:      pointer to mmu structure to operate on
   * @addr:     IPA
   * @pud:      pud pointer for IPA
   *
   * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs.
   */
 -static void stage2_dissolve_pud(struct kvm *kvm, phys_addr_t addr, pud_t *pudp)
 +static void stage2_dissolve_pud(struct kvm_s2_mmu *mmu, phys_addr_t addr, pud_t *pudp)
  {
 +      struct kvm *kvm = mmu->kvm;
 +
        if (!stage2_pud_huge(kvm, *pudp))
                return;
  
        stage2_pud_clear(kvm, pudp);
 -      kvm_tlb_flush_vmid_ipa(kvm, addr);
 +      kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL);
        put_page(virt_to_page(pudp));
  }
  
@@@ -159,44 -156,40 +159,44 @@@ static void *mmu_memory_cache_alloc(str
        return p;
  }
  
 -static void clear_stage2_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr)
 +static void clear_stage2_pgd_entry(struct kvm_s2_mmu *mmu, pgd_t *pgd, phys_addr_t addr)
  {
 +      struct kvm *kvm = mmu->kvm;
        p4d_t *p4d_table __maybe_unused = stage2_p4d_offset(kvm, pgd, 0UL);
        stage2_pgd_clear(kvm, pgd);
 -      kvm_tlb_flush_vmid_ipa(kvm, addr);
 +      kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
        stage2_p4d_free(kvm, p4d_table);
        put_page(virt_to_page(pgd));
  }
  
 -static void clear_stage2_p4d_entry(struct kvm *kvm, p4d_t *p4d, phys_addr_t addr)
 +static void clear_stage2_p4d_entry(struct kvm_s2_mmu *mmu, p4d_t *p4d, phys_addr_t addr)
  {
 +      struct kvm *kvm = mmu->kvm;
        pud_t *pud_table __maybe_unused = stage2_pud_offset(kvm, p4d, 0);
        stage2_p4d_clear(kvm, p4d);
 -      kvm_tlb_flush_vmid_ipa(kvm, addr);
 +      kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
        stage2_pud_free(kvm, pud_table);
        put_page(virt_to_page(p4d));
  }
  
 -static void clear_stage2_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr)
 +static void clear_stage2_pud_entry(struct kvm_s2_mmu *mmu, pud_t *pud, phys_addr_t addr)
  {
 +      struct kvm *kvm = mmu->kvm;
        pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(kvm, pud, 0);
 +
        VM_BUG_ON(stage2_pud_huge(kvm, *pud));
        stage2_pud_clear(kvm, pud);
 -      kvm_tlb_flush_vmid_ipa(kvm, addr);
 +      kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
        stage2_pmd_free(kvm, pmd_table);
        put_page(virt_to_page(pud));
  }
  
 -static void clear_stage2_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
 +static void clear_stage2_pmd_entry(struct kvm_s2_mmu *mmu, pmd_t *pmd, phys_addr_t addr)
  {
        pte_t *pte_table = pte_offset_kernel(pmd, 0);
        VM_BUG_ON(pmd_thp_or_huge(*pmd));
        pmd_clear(pmd);
 -      kvm_tlb_flush_vmid_ipa(kvm, addr);
 +      kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
        free_page((unsigned long)pte_table);
        put_page(virt_to_page(pmd));
  }
@@@ -262,7 -255,7 +262,7 @@@ static inline void kvm_pgd_populate(pgd
   * we then fully enforce cacheability of RAM, no matter what the guest
   * does.
   */
 -static void unmap_stage2_ptes(struct kvm *kvm, pmd_t *pmd,
 +static void unmap_stage2_ptes(struct kvm_s2_mmu *mmu, pmd_t *pmd,
                       phys_addr_t addr, phys_addr_t end)
  {
        phys_addr_t start_addr = addr;
                        pte_t old_pte = *pte;
  
                        kvm_set_pte(pte, __pte(0));
 -                      kvm_tlb_flush_vmid_ipa(kvm, addr);
 +                      kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PTE_LEVEL);
  
                        /* No need to invalidate the cache for device mappings */
                        if (!kvm_is_device_pfn(pte_pfn(old_pte)))
                }
        } while (pte++, addr += PAGE_SIZE, addr != end);
  
 -      if (stage2_pte_table_empty(kvm, start_pte))
 -              clear_stage2_pmd_entry(kvm, pmd, start_addr);
 +      if (stage2_pte_table_empty(mmu->kvm, start_pte))
 +              clear_stage2_pmd_entry(mmu, pmd, start_addr);
  }
  
 -static void unmap_stage2_pmds(struct kvm *kvm, pud_t *pud,
 +static void unmap_stage2_pmds(struct kvm_s2_mmu *mmu, pud_t *pud,
                       phys_addr_t addr, phys_addr_t end)
  {
 +      struct kvm *kvm = mmu->kvm;
        phys_addr_t next, start_addr = addr;
        pmd_t *pmd, *start_pmd;
  
                                pmd_t old_pmd = *pmd;
  
                                pmd_clear(pmd);
 -                              kvm_tlb_flush_vmid_ipa(kvm, addr);
 +                              kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL);
  
                                kvm_flush_dcache_pmd(old_pmd);
  
                                put_page(virt_to_page(pmd));
                        } else {
 -                              unmap_stage2_ptes(kvm, pmd, addr, next);
 +                              unmap_stage2_ptes(mmu, pmd, addr, next);
                        }
                }
        } while (pmd++, addr = next, addr != end);
  
        if (stage2_pmd_table_empty(kvm, start_pmd))
 -              clear_stage2_pud_entry(kvm, pud, start_addr);
 +              clear_stage2_pud_entry(mmu, pud, start_addr);
  }
  
 -static void unmap_stage2_puds(struct kvm *kvm, p4d_t *p4d,
 +static void unmap_stage2_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d,
                       phys_addr_t addr, phys_addr_t end)
  {
 +      struct kvm *kvm = mmu->kvm;
        phys_addr_t next, start_addr = addr;
        pud_t *pud, *start_pud;
  
                                pud_t old_pud = *pud;
  
                                stage2_pud_clear(kvm, pud);
 -                              kvm_tlb_flush_vmid_ipa(kvm, addr);
 +                              kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL);
                                kvm_flush_dcache_pud(old_pud);
                                put_page(virt_to_page(pud));
                        } else {
 -                              unmap_stage2_pmds(kvm, pud, addr, next);
 +                              unmap_stage2_pmds(mmu, pud, addr, next);
                        }
                }
        } while (pud++, addr = next, addr != end);
  
        if (stage2_pud_table_empty(kvm, start_pud))
 -              clear_stage2_p4d_entry(kvm, p4d, start_addr);
 +              clear_stage2_p4d_entry(mmu, p4d, start_addr);
  }
  
 -static void unmap_stage2_p4ds(struct kvm *kvm, pgd_t *pgd,
 +static void unmap_stage2_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
                       phys_addr_t addr, phys_addr_t end)
  {
 +      struct kvm *kvm = mmu->kvm;
        phys_addr_t next, start_addr = addr;
        p4d_t *p4d, *start_p4d;
  
        do {
                next = stage2_p4d_addr_end(kvm, addr, end);
                if (!stage2_p4d_none(kvm, *p4d))
 -                      unmap_stage2_puds(kvm, p4d, addr, next);
 +                      unmap_stage2_puds(mmu, p4d, addr, next);
        } while (p4d++, addr = next, addr != end);
  
        if (stage2_p4d_table_empty(kvm, start_p4d))
 -              clear_stage2_pgd_entry(kvm, pgd, start_addr);
 +              clear_stage2_pgd_entry(mmu, pgd, start_addr);
  }
  
  /**
   * destroying the VM), otherwise another faulting VCPU may come in and mess
   * with things behind our backs.
   */
 -static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size)
 +static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size)
  {
 +      struct kvm *kvm = mmu->kvm;
        pgd_t *pgd;
        phys_addr_t addr = start, end = start + size;
        phys_addr_t next;
        assert_spin_locked(&kvm->mmu_lock);
        WARN_ON(size & ~PAGE_MASK);
  
 -      pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
 +      pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
        do {
                /*
                 * Make sure the page table is still active, as another thread
                 * could have possibly freed the page table, while we released
                 * the lock.
                 */
 -              if (!READ_ONCE(kvm->arch.pgd))
 +              if (!READ_ONCE(mmu->pgd))
                        break;
                next = stage2_pgd_addr_end(kvm, addr, end);
                if (!stage2_pgd_none(kvm, *pgd))
 -                      unmap_stage2_p4ds(kvm, pgd, addr, next);
 +                      unmap_stage2_p4ds(mmu, pgd, addr, next);
                /*
                 * If the range is too large, release the kvm->mmu_lock
                 * to prevent starvation and lockup detector warnings.
        } while (pgd++, addr = next, addr != end);
  }
  
 -static void stage2_flush_ptes(struct kvm *kvm, pmd_t *pmd,
 +static void stage2_flush_ptes(struct kvm_s2_mmu *mmu, pmd_t *pmd,
                              phys_addr_t addr, phys_addr_t end)
  {
        pte_t *pte;
        } while (pte++, addr += PAGE_SIZE, addr != end);
  }
  
 -static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud,
 +static void stage2_flush_pmds(struct kvm_s2_mmu *mmu, pud_t *pud,
                              phys_addr_t addr, phys_addr_t end)
  {
 +      struct kvm *kvm = mmu->kvm;
        pmd_t *pmd;
        phys_addr_t next;
  
                        if (pmd_thp_or_huge(*pmd))
                                kvm_flush_dcache_pmd(*pmd);
                        else
 -                              stage2_flush_ptes(kvm, pmd, addr, next);
 +                              stage2_flush_ptes(mmu, pmd, addr, next);
                }
        } while (pmd++, addr = next, addr != end);
  }
  
 -static void stage2_flush_puds(struct kvm *kvm, p4d_t *p4d,
 +static void stage2_flush_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d,
                              phys_addr_t addr, phys_addr_t end)
  {
 +      struct kvm *kvm = mmu->kvm;
        pud_t *pud;
        phys_addr_t next;
  
                        if (stage2_pud_huge(kvm, *pud))
                                kvm_flush_dcache_pud(*pud);
                        else
 -                              stage2_flush_pmds(kvm, pud, addr, next);
 +                              stage2_flush_pmds(mmu, pud, addr, next);
                }
        } while (pud++, addr = next, addr != end);
  }
  
 -static void stage2_flush_p4ds(struct kvm *kvm, pgd_t *pgd,
 +static void stage2_flush_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
                              phys_addr_t addr, phys_addr_t end)
  {
 +      struct kvm *kvm = mmu->kvm;
        p4d_t *p4d;
        phys_addr_t next;
  
        do {
                next = stage2_p4d_addr_end(kvm, addr, end);
                if (!stage2_p4d_none(kvm, *p4d))
 -                      stage2_flush_puds(kvm, p4d, addr, next);
 +                      stage2_flush_puds(mmu, p4d, addr, next);
        } while (p4d++, addr = next, addr != end);
  }
  
  static void stage2_flush_memslot(struct kvm *kvm,
                                 struct kvm_memory_slot *memslot)
  {
 +      struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
        phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
        phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
        phys_addr_t next;
        pgd_t *pgd;
  
 -      pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
 +      pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
        do {
                next = stage2_pgd_addr_end(kvm, addr, end);
                if (!stage2_pgd_none(kvm, *pgd))
 -                      stage2_flush_p4ds(kvm, pgd, addr, next);
 +                      stage2_flush_p4ds(mmu, pgd, addr, next);
  
                if (next != end)
                        cond_resched_lock(&kvm->mmu_lock);
@@@ -1011,23 -996,21 +1011,23 @@@ int create_hyp_exec_mappings(phys_addr_
  }
  
  /**
 - * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation.
 - * @kvm:      The KVM struct pointer for the VM.
 + * kvm_init_stage2_mmu - Initialise an S2 MMU structure
 + * @kvm:      The pointer to the KVM structure
 + * @mmu:      The pointer to the s2 MMU structure
   *
   * Allocates only the stage-2 HW PGD level table(s) of size defined by
 - * stage2_pgd_size(kvm).
 + * stage2_pgd_size(mmu->kvm).
   *
   * Note we don't need locking here as this is only called when the VM is
   * created, which can only be done once.
   */
 -int kvm_alloc_stage2_pgd(struct kvm *kvm)
 +int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
  {
        phys_addr_t pgd_phys;
        pgd_t *pgd;
 +      int cpu;
  
 -      if (kvm->arch.pgd != NULL) {
 +      if (mmu->pgd != NULL) {
                kvm_err("kvm_arch already initialized?\n");
                return -EINVAL;
        }
        if (WARN_ON(pgd_phys & ~kvm_vttbr_baddr_mask(kvm)))
                return -EINVAL;
  
 -      kvm->arch.pgd = pgd;
 -      kvm->arch.pgd_phys = pgd_phys;
 +      mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran));
 +      if (!mmu->last_vcpu_ran) {
 +              free_pages_exact(pgd, stage2_pgd_size(kvm));
 +              return -ENOMEM;
 +      }
 +
 +      for_each_possible_cpu(cpu)
 +              *per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;
 +
 +      mmu->kvm = kvm;
 +      mmu->pgd = pgd;
 +      mmu->pgd_phys = pgd_phys;
 +      mmu->vmid.vmid_gen = 0;
 +
        return 0;
  }
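
As a usage note, a small hypothetical sketch of how the per-VM stage-2 MMU structure introduced here is expected to be set up and torn down; the real callers live in the arch VM init/teardown paths, and demo_vm_mmu_lifecycle() is illustrative only.

/* Illustrative pairing of the new init/free entry points. */
static int demo_vm_mmu_lifecycle(struct kvm *kvm)
{
	int ret = kvm_init_stage2_mmu(kvm, &kvm->arch.mmu);

	if (ret)
		return ret;

	/* ... the VM runs; faults populate the stage-2 tables ... */

	kvm_free_stage2_pgd(&kvm->arch.mmu);
	return 0;
}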
  
@@@ -1093,7 -1064,7 +1093,7 @@@ static void stage2_unmap_memslot(struc
  
                if (!(vma->vm_flags & VM_PFNMAP)) {
                        gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
 -                      unmap_stage2_range(kvm, gpa, vm_end - vm_start);
 +                      unmap_stage2_range(&kvm->arch.mmu, gpa, vm_end - vm_start);
                }
                hva = vm_end;
        } while (hva < reg_end);
@@@ -1125,34 -1096,39 +1125,34 @@@ void stage2_unmap_vm(struct kvm *kvm
        srcu_read_unlock(&kvm->srcu, idx);
  }
  
 -/**
 - * kvm_free_stage2_pgd - free all stage-2 tables
 - * @kvm:      The KVM struct pointer for the VM.
 - *
 - * Walks the level-1 page table pointed to by kvm->arch.pgd and frees all
 - * underlying level-2 and level-3 tables before freeing the actual level-1 table
 - * and setting the struct pointer to NULL.
 - */
 -void kvm_free_stage2_pgd(struct kvm *kvm)
 +void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
  {
 +      struct kvm *kvm = mmu->kvm;
        void *pgd = NULL;
  
        spin_lock(&kvm->mmu_lock);
 -      if (kvm->arch.pgd) {
 -              unmap_stage2_range(kvm, 0, kvm_phys_size(kvm));
 -              pgd = READ_ONCE(kvm->arch.pgd);
 -              kvm->arch.pgd = NULL;
 -              kvm->arch.pgd_phys = 0;
 +      if (mmu->pgd) {
 +              unmap_stage2_range(mmu, 0, kvm_phys_size(kvm));
 +              pgd = READ_ONCE(mmu->pgd);
 +              mmu->pgd = NULL;
        }
        spin_unlock(&kvm->mmu_lock);
  
        /* Free the HW pgd, one page at a time */
 -      if (pgd)
 +      if (pgd) {
                free_pages_exact(pgd, stage2_pgd_size(kvm));
 +              free_percpu(mmu->last_vcpu_ran);
 +      }
  }
  
 -static p4d_t *stage2_get_p4d(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
 +static p4d_t *stage2_get_p4d(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache,
                             phys_addr_t addr)
  {
 +      struct kvm *kvm = mmu->kvm;
        pgd_t *pgd;
        p4d_t *p4d;
  
 -      pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
 +      pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
        if (stage2_pgd_none(kvm, *pgd)) {
                if (!cache)
                        return NULL;
        return stage2_p4d_offset(kvm, pgd, addr);
  }
  
 -static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
 +static pud_t *stage2_get_pud(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache,
                             phys_addr_t addr)
  {
 +      struct kvm *kvm = mmu->kvm;
        p4d_t *p4d;
        pud_t *pud;
  
 -      p4d = stage2_get_p4d(kvm, cache, addr);
 +      p4d = stage2_get_p4d(mmu, cache, addr);
        if (stage2_p4d_none(kvm, *p4d)) {
                if (!cache)
                        return NULL;
        return stage2_pud_offset(kvm, p4d, addr);
  }
  
 -static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
 +static pmd_t *stage2_get_pmd(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache,
                             phys_addr_t addr)
  {
 +      struct kvm *kvm = mmu->kvm;
        pud_t *pud;
        pmd_t *pmd;
  
 -      pud = stage2_get_pud(kvm, cache, addr);
 +      pud = stage2_get_pud(mmu, cache, addr);
        if (!pud || stage2_pud_huge(kvm, *pud))
                return NULL;
  
        return stage2_pmd_offset(kvm, pud, addr);
  }
  
 -static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
 -                             *cache, phys_addr_t addr, const pmd_t *new_pmd)
 +static int stage2_set_pmd_huge(struct kvm_s2_mmu *mmu,
 +                             struct kvm_mmu_memory_cache *cache,
 +                             phys_addr_t addr, const pmd_t *new_pmd)
  {
        pmd_t *pmd, old_pmd;
  
  retry:
 -      pmd = stage2_get_pmd(kvm, cache, addr);
 +      pmd = stage2_get_pmd(mmu, cache, addr);
        VM_BUG_ON(!pmd);
  
        old_pmd = *pmd;
                 * get handled accordingly.
                 */
                if (!pmd_thp_or_huge(old_pmd)) {
 -                      unmap_stage2_range(kvm, addr & S2_PMD_MASK, S2_PMD_SIZE);
 +                      unmap_stage2_range(mmu, addr & S2_PMD_MASK, S2_PMD_SIZE);
                        goto retry;
                }
                /*
                 */
                WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
                pmd_clear(pmd);
 -              kvm_tlb_flush_vmid_ipa(kvm, addr);
 +              kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL);
        } else {
                get_page(virt_to_page(pmd));
        }
        return 0;
  }
  
 -static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
 +static int stage2_set_pud_huge(struct kvm_s2_mmu *mmu,
 +                             struct kvm_mmu_memory_cache *cache,
                               phys_addr_t addr, const pud_t *new_pudp)
  {
 +      struct kvm *kvm = mmu->kvm;
        pud_t *pudp, old_pud;
  
  retry:
 -      pudp = stage2_get_pud(kvm, cache, addr);
 +      pudp = stage2_get_pud(mmu, cache, addr);
        VM_BUG_ON(!pudp);
  
        old_pud = *pudp;
                 * the range for this block and retry.
                 */
                if (!stage2_pud_huge(kvm, old_pud)) {
 -                      unmap_stage2_range(kvm, addr & S2_PUD_MASK, S2_PUD_SIZE);
 +                      unmap_stage2_range(mmu, addr & S2_PUD_MASK, S2_PUD_SIZE);
                        goto retry;
                }
  
                WARN_ON_ONCE(kvm_pud_pfn(old_pud) != kvm_pud_pfn(*new_pudp));
                stage2_pud_clear(kvm, pudp);
 -              kvm_tlb_flush_vmid_ipa(kvm, addr);
 +              kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL);
        } else {
                get_page(virt_to_page(pudp));
        }
   * leaf-entry is returned in the appropriate level variable - pudpp,
   * pmdpp, ptepp.
   */
 -static bool stage2_get_leaf_entry(struct kvm *kvm, phys_addr_t addr,
 +static bool stage2_get_leaf_entry(struct kvm_s2_mmu *mmu, phys_addr_t addr,
                                  pud_t **pudpp, pmd_t **pmdpp, pte_t **ptepp)
  {
 +      struct kvm *kvm = mmu->kvm;
        pud_t *pudp;
        pmd_t *pmdp;
        pte_t *ptep;
        *pmdpp = NULL;
        *ptepp = NULL;
  
 -      pudp = stage2_get_pud(kvm, NULL, addr);
 +      pudp = stage2_get_pud(mmu, NULL, addr);
        if (!pudp || stage2_pud_none(kvm, *pudp) || !stage2_pud_present(kvm, *pudp))
                return false;
  
        return true;
  }
  
 -static bool stage2_is_exec(struct kvm *kvm, phys_addr_t addr)
 +static bool stage2_is_exec(struct kvm_s2_mmu *mmu, phys_addr_t addr)
  {
        pud_t *pudp;
        pmd_t *pmdp;
        pte_t *ptep;
        bool found;
  
 -      found = stage2_get_leaf_entry(kvm, addr, &pudp, &pmdp, &ptep);
 +      found = stage2_get_leaf_entry(mmu, addr, &pudp, &pmdp, &ptep);
        if (!found)
                return false;
  
                return kvm_s2pte_exec(ptep);
  }
  
 -static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
 +static int stage2_set_pte(struct kvm_s2_mmu *mmu,
 +                        struct kvm_mmu_memory_cache *cache,
                          phys_addr_t addr, const pte_t *new_pte,
                          unsigned long flags)
  {
 +      struct kvm *kvm = mmu->kvm;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte, old_pte;
        VM_BUG_ON(logging_active && !cache);
  
        /* Create stage-2 page table mapping - Levels 0 and 1 */
 -      pud = stage2_get_pud(kvm, cache, addr);
 +      pud = stage2_get_pud(mmu, cache, addr);
        if (!pud) {
                /*
                 * Ignore calls from kvm_set_spte_hva for unallocated
         * on to allocate page.
         */
        if (logging_active)
 -              stage2_dissolve_pud(kvm, addr, pud);
 +              stage2_dissolve_pud(mmu, addr, pud);
  
        if (stage2_pud_none(kvm, *pud)) {
                if (!cache)
         * allocate page.
         */
        if (logging_active)
 -              stage2_dissolve_pmd(kvm, addr, pmd);
 +              stage2_dissolve_pmd(mmu, addr, pmd);
  
        /* Create stage-2 page mappings - Level 2 */
        if (pmd_none(*pmd)) {
                        return 0;
  
                kvm_set_pte(pte, __pte(0));
 -              kvm_tlb_flush_vmid_ipa(kvm, addr);
 +              kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PTE_LEVEL);
        } else {
                get_page(virt_to_page(pte));
        }
@@@ -1518,8 -1486,8 +1518,8 @@@ int kvm_phys_addr_ioremap(struct kvm *k
                if (ret)
                        goto out;
                spin_lock(&kvm->mmu_lock);
 -              ret = stage2_set_pte(kvm, &cache, addr, &pte,
 -                                              KVM_S2PTE_FLAG_IS_IOMAP);
 +              ret = stage2_set_pte(&kvm->arch.mmu, &cache, addr, &pte,
 +                                   KVM_S2PTE_FLAG_IS_IOMAP);
                spin_unlock(&kvm->mmu_lock);
                if (ret)
                        goto out;
@@@ -1558,10 -1526,9 +1558,10 @@@ static void stage2_wp_ptes(pmd_t *pmd, 
   * @addr:     range start address
   * @end:      range end address
   */
 -static void stage2_wp_pmds(struct kvm *kvm, pud_t *pud,
 +static void stage2_wp_pmds(struct kvm_s2_mmu *mmu, pud_t *pud,
                           phys_addr_t addr, phys_addr_t end)
  {
 +      struct kvm *kvm = mmu->kvm;
        pmd_t *pmd;
        phys_addr_t next;
  
  
  /**
   * stage2_wp_puds - write protect P4D range
 - * @pgd:      pointer to pgd entry
 + * @p4d:      pointer to p4d entry
   * @addr:     range start address
   * @end:      range end address
   */
 -static void  stage2_wp_puds(struct kvm *kvm, p4d_t *p4d,
 +static void  stage2_wp_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d,
                            phys_addr_t addr, phys_addr_t end)
  {
 +      struct kvm *kvm = mmu->kvm;
        pud_t *pud;
        phys_addr_t next;
  
                                if (!kvm_s2pud_readonly(pud))
                                        kvm_set_s2pud_readonly(pud);
                        } else {
 -                              stage2_wp_pmds(kvm, pud, addr, next);
 +                              stage2_wp_pmds(mmu, pud, addr, next);
                        }
                }
        } while (pud++, addr = next, addr != end);
   * @addr:     range start address
   * @end:      range end address
   */
 -static void  stage2_wp_p4ds(struct kvm *kvm, pgd_t *pgd,
 +static void  stage2_wp_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
                            phys_addr_t addr, phys_addr_t end)
  {
 +      struct kvm *kvm = mmu->kvm;
        p4d_t *p4d;
        phys_addr_t next;
  
        do {
                next = stage2_p4d_addr_end(kvm, addr, end);
                if (!stage2_p4d_none(kvm, *p4d))
 -                      stage2_wp_puds(kvm, p4d, addr, next);
 +                      stage2_wp_puds(mmu, p4d, addr, next);
        } while (p4d++, addr = next, addr != end);
  }
  
   * @addr:     Start address of range
   * @end:      End address of range
   */
 -static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
 +static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
  {
 +      struct kvm *kvm = mmu->kvm;
        pgd_t *pgd;
        phys_addr_t next;
  
 -      pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
 +      pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
        do {
                /*
                 * Release kvm_mmu_lock periodically if the memory region is
                 * the lock.
                 */
                cond_resched_lock(&kvm->mmu_lock);
 -              if (!READ_ONCE(kvm->arch.pgd))
 +              if (!READ_ONCE(mmu->pgd))
                        break;
                next = stage2_pgd_addr_end(kvm, addr, end);
                if (stage2_pgd_present(kvm, *pgd))
 -                      stage2_wp_p4ds(kvm, pgd, addr, next);
 +                      stage2_wp_p4ds(mmu, pgd, addr, next);
        } while (pgd++, addr = next, addr != end);
  }
  
@@@ -1686,7 -1650,7 +1686,7 @@@ void kvm_mmu_wp_memory_region(struct kv
        end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
  
        spin_lock(&kvm->mmu_lock);
 -      stage2_wp_range(kvm, start, end);
 +      stage2_wp_range(&kvm->arch.mmu, start, end);
        spin_unlock(&kvm->mmu_lock);
        kvm_flush_remote_tlbs(kvm);
  }
@@@ -1710,7 -1674,7 +1710,7 @@@ static void kvm_mmu_write_protect_pt_ma
        phys_addr_t start = (base_gfn +  __ffs(mask)) << PAGE_SHIFT;
        phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
  
 -      stage2_wp_range(kvm, start, end);
 +      stage2_wp_range(&kvm->arch.mmu, start, end);
  }
  
  /*
@@@ -1873,7 -1837,6 +1873,7 @@@ static int user_mem_abort(struct kvm_vc
        pgprot_t mem_type = PAGE_S2;
        bool logging_active = memslot_is_logging(memslot);
        unsigned long vma_pagesize, flags = 0;
 +      struct kvm_s2_mmu *mmu = vcpu->arch.hw_mmu;
  
        write_fault = kvm_is_write_fault(vcpu);
        exec_fault = kvm_vcpu_trap_is_iabt(vcpu);
         * execute permissions, and we preserve whatever we have.
         */
        needs_exec = exec_fault ||
 -              (fault_status == FSC_PERM && stage2_is_exec(kvm, fault_ipa));
 +              (fault_status == FSC_PERM && stage2_is_exec(mmu, fault_ipa));
  
        if (vma_pagesize == PUD_SIZE) {
                pud_t new_pud = kvm_pfn_pud(pfn, mem_type);
                if (needs_exec)
                        new_pud = kvm_s2pud_mkexec(new_pud);
  
 -              ret = stage2_set_pud_huge(kvm, memcache, fault_ipa, &new_pud);
 +              ret = stage2_set_pud_huge(mmu, memcache, fault_ipa, &new_pud);
        } else if (vma_pagesize == PMD_SIZE) {
                pmd_t new_pmd = kvm_pfn_pmd(pfn, mem_type);
  
                if (needs_exec)
                        new_pmd = kvm_s2pmd_mkexec(new_pmd);
  
 -              ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
 +              ret = stage2_set_pmd_huge(mmu, memcache, fault_ipa, &new_pmd);
        } else {
                pte_t new_pte = kvm_pfn_pte(pfn, mem_type);
  
                if (needs_exec)
                        new_pte = kvm_s2pte_mkexec(new_pte);
  
 -              ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags);
 +              ret = stage2_set_pte(mmu, memcache, fault_ipa, &new_pte, flags);
        }
  
  out_unlock:
@@@ -2060,7 -2023,7 +2060,7 @@@ static void handle_access_fault(struct 
  
        spin_lock(&vcpu->kvm->mmu_lock);
  
 -      if (!stage2_get_leaf_entry(vcpu->kvm, fault_ipa, &pud, &pmd, &pte))
 +      if (!stage2_get_leaf_entry(vcpu->arch.hw_mmu, fault_ipa, &pud, &pmd, &pte))
                goto out;
  
        if (pud) {              /* HugeTLB */
@@@ -2111,18 -2074,15 +2111,15 @@@ int kvm_handle_guest_abort(struct kvm_v
        is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
  
        /* Synchronous External Abort? */
-       if (kvm_vcpu_dabt_isextabt(vcpu)) {
+       if (kvm_vcpu_abt_issea(vcpu)) {
                /*
                 * For RAS the host kernel may handle this abort.
                 * There is no need to pass the error into the guest.
                 */
-               if (!kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_esr(vcpu)))
-                       return 1;
-               if (unlikely(!is_iabt)) {
+               if (kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_esr(vcpu)))
                        kvm_inject_vabt(vcpu);
-                       return 1;
-               }
+               return 1;
        }
  
        trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu),
        hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
        write_fault = kvm_is_write_fault(vcpu);
        if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
+               /*
+                * The guest has put either its instructions or its page-tables
+                * somewhere it shouldn't have. Userspace won't be able to do
+                * anything about this (there's no syndrome for a start), so
+                * re-inject the abort back into the guest.
+                */
                if (is_iabt) {
-                       /* Prefetch Abort on I/O address */
                        ret = -ENOEXEC;
                        goto out;
                }
  
+               if (kvm_vcpu_dabt_iss1tw(vcpu)) {
+                       kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
+                       ret = 1;
+                       goto out_unlock;
+               }
                /*
                 * Check for a cache maintenance operation. Since we
                 * ended up here, we know it is outside of any memory
                 * So let's assume that the guest is just being
                 * cautious, and skip the instruction.
                 */
-               if (kvm_vcpu_dabt_is_cm(vcpu)) {
+               if (kvm_is_error_hva(hva) && kvm_vcpu_dabt_is_cm(vcpu)) {
                        kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
                        ret = 1;
                        goto out_unlock;
@@@ -2234,14 -2205,14 +2242,14 @@@ static int handle_hva_to_gpa(struct kv
  
  static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
  {
 -      unmap_stage2_range(kvm, gpa, size);
 +      unmap_stage2_range(&kvm->arch.mmu, gpa, size);
        return 0;
  }
  
  int kvm_unmap_hva_range(struct kvm *kvm,
                        unsigned long start, unsigned long end)
  {
 -      if (!kvm->arch.pgd)
 +      if (!kvm->arch.mmu.pgd)
                return 0;
  
        trace_kvm_unmap_hva_range(start, end);
@@@ -2261,7 -2232,7 +2269,7 @@@ static int kvm_set_spte_handler(struct 
         * therefore stage2_set_pte() never needs to clear out a huge PMD
         * through this calling path.
         */
 -      stage2_set_pte(kvm, NULL, gpa, pte, 0);
 +      stage2_set_pte(&kvm->arch.mmu, NULL, gpa, pte, 0);
        return 0;
  }
  
@@@ -2272,7 -2243,7 +2280,7 @@@ int kvm_set_spte_hva(struct kvm *kvm, u
        kvm_pfn_t pfn = pte_pfn(pte);
        pte_t stage2_pte;
  
 -      if (!kvm->arch.pgd)
 +      if (!kvm->arch.mmu.pgd)
                return 0;
  
        trace_kvm_set_spte_hva(hva);
@@@ -2295,7 -2266,7 +2303,7 @@@ static int kvm_age_hva_handler(struct k
        pte_t *pte;
  
        WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
 -      if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte))
 +      if (!stage2_get_leaf_entry(&kvm->arch.mmu, gpa, &pud, &pmd, &pte))
                return 0;
  
        if (pud)
@@@ -2313,7 -2284,7 +2321,7 @@@ static int kvm_test_age_hva_handler(str
        pte_t *pte;
  
        WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
 -      if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte))
 +      if (!stage2_get_leaf_entry(&kvm->arch.mmu, gpa, &pud, &pmd, &pte))
                return 0;
  
        if (pud)
  
  int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
  {
 -      if (!kvm->arch.pgd)
 +      if (!kvm->arch.mmu.pgd)
                return 0;
        trace_kvm_age_hva(start, end);
        return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
  
  int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
  {
 -      if (!kvm->arch.pgd)
 +      if (!kvm->arch.mmu.pgd)
                return 0;
        trace_kvm_test_age_hva(hva);
        return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE,
@@@ -2547,7 -2518,7 +2555,7 @@@ int kvm_arch_prepare_memory_region(stru
  
        spin_lock(&kvm->mmu_lock);
        if (ret)
 -              unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size);
 +              unmap_stage2_range(&kvm->arch.mmu, mem->guest_phys_addr, mem->memory_size);
        else
                stage2_flush_memslot(kvm, memslot);
        spin_unlock(&kvm->mmu_lock);
@@@ -2566,7 -2537,7 +2574,7 @@@ void kvm_arch_memslots_updated(struct k
  
  void kvm_arch_flush_shadow_all(struct kvm *kvm)
  {
 -      kvm_free_stage2_pgd(kvm);
 +      kvm_free_stage2_pgd(&kvm->arch.mmu);
  }
  
  void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
        phys_addr_t size = slot->npages << PAGE_SHIFT;
  
        spin_lock(&kvm->mmu_lock);
 -      unmap_stage2_range(kvm, gpa, size);
 +      unmap_stage2_range(&kvm->arch.mmu, gpa, size);
        spin_unlock(&kvm->mmu_lock);
  }
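
Finally, a brief sketch (not part of this series) of the pattern the mmu.c changes converge on: stage-2 helpers take a struct kvm_s2_mmu handle, and TLB invalidations carry a level hint so they can be scoped to the entry being zapped. demo_zap_pte() is a hypothetical wrapper mirroring what unmap_stage2_ptes() does per entry.

/* Hypothetical wrapper: clear one stage-2 PTE, invalidate it with a
 * level-scoped TLBI, then drop the reference on the page-table page. */
static void demo_zap_pte(struct kvm_s2_mmu *mmu, phys_addr_t ipa, pte_t *pte)
{
	kvm_set_pte(pte, __pte(0));
	kvm_tlb_flush_vmid_ipa(mmu, ipa, S2_PTE_LEVEL);
	put_page(virt_to_page(pte));
}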